Commit 1889c4cb, authored by Xiang Zhang and committed by GitHub

bpo-29456: Fix bugs in unicodedata.normalize: u1176, u11a7 and u11c3 (GH-1958) (GH-7704)

Hangul composition check boundaries were wrong for the second character
(the range must be [0x1161, 0x1176), not [0x1161, 0x1176]) and for the third
character (the range must be (0x11A7, 0x11C3), not [0x11A7, 0x11C3]).
(cherry picked from commit d134809c)
Co-authored-by: Wonsup Yoon <pusnow@me.com>
parent fc8ea20c
...@@ -204,6 +204,19 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest): ...@@ -204,6 +204,19 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
b = u'C\u0338' * 20 + u'\xC7' b = u'C\u0338' * 20 + u'\xC7'
self.assertEqual(self.db.normalize('NFC', a), b) self.assertEqual(self.db.normalize('NFC', a), b)
def test_issue29456(self):
# Fix #29456
u1176_str_a = u'\u1100\u1176\u11a8'
u1176_str_b = u'\u1100\u1176\u11a8'
u11a7_str_a = u'\u1100\u1175\u11a7'
u11a7_str_b = u'\uae30\u11a7'
u11c3_str_a = u'\u1100\u1175\u11c3'
u11c3_str_b = u'\uae30\u11c3'
self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)
self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
def test_east_asian_width(self): def test_east_asian_width(self):
eaw = self.db.east_asian_width eaw = self.db.east_asian_width
self.assertRaises(TypeError, eaw, 'a') self.assertRaises(TypeError, eaw, 'a')
......
...@@ -1578,6 +1578,7 @@ Jason Yeo ...@@ -1578,6 +1578,7 @@ Jason Yeo
EungJun Yi EungJun Yi
Bob Yodlowski Bob Yodlowski
Danny Yoo Danny Yoo
Wonsup Yoon
Rory Yorke Rory Yorke
George Yoshida George Yoshida
Kazuhiro Yoshida Kazuhiro Yoshida
......
Fix bugs in hangul normalization: u1176, u11a7 and u11c3
...@@ -664,14 +664,18 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) ...@@ -664,14 +664,18 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
pairs, since we always have decomposed data. */ pairs, since we always have decomposed data. */
if (LBase <= *i && *i < (LBase+LCount) && if (LBase <= *i && *i < (LBase+LCount) &&
i + 1 < end && i + 1 < end &&
VBase <= i[1] && i[1] <= (VBase+VCount)) { VBase <= i[1] && i[1] < (VBase+VCount)) {
/* check L character is a modern leading consonant (0x1100 ~ 0x1112)
and V character is a modern vowel (0x1161 ~ 0x1175). */
int LIndex, VIndex; int LIndex, VIndex;
LIndex = i[0] - LBase; LIndex = i[0] - LBase;
VIndex = i[1] - VBase; VIndex = i[1] - VBase;
code = SBase + (LIndex*VCount+VIndex)*TCount; code = SBase + (LIndex*VCount+VIndex)*TCount;
i+=2; i+=2;
if (i < end && if (i < end &&
TBase <= *i && *i <= (TBase+TCount)) { TBase < *i && *i < (TBase+TCount)) {
/* check T character is a modern trailing consonant
(0x11A8 ~ 0x11C2). */
code += *i-TBase; code += *i-TBase;
i++; i++;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment