Commit d5890c8d authored by Benjamin Peterson

add str.casefold() (closes #13752)

parent 94d5a717
......@@ -1002,6 +1002,14 @@ functions based on regular expressions.
rest lowercased.
.. method:: str.casefold()
Return a casefolded copy of the string. Casefolded strings may be used for
caseless matching. For example, ``"MASSE".casefold() == "maße".casefold()``.
.. versionadded:: 3.3
.. method:: str.center(width[, fillchar])
Return centered in a string of length *width*. Padding is done using the
......
......@@ -2023,6 +2023,11 @@ PyAPI_FUNC(int) _PyUnicode_ToUpperFull(
Py_UCS4 *res
);
/* Write the full case folding of ch into res and return the number of
   code points written. */
PyAPI_FUNC(int) _PyUnicode_ToFoldedFull(
Py_UCS4 ch, /* Unicode character */
Py_UCS4 *res /* Output buffer receiving the folded code points */
);
PyAPI_FUNC(int) _PyUnicode_IsCaseIgnorable(
Py_UCS4 ch /* Unicode character */
);
......
......@@ -565,6 +565,14 @@ class UnicodeTest(string_tests.CommonTest,
self.assertEqual('\U0008fffe'.lower(), '\U0008fffe')
self.assertEqual('\u2177'.lower(), '\u2177')
def test_casefold(self):
self.assertEqual('hello'.casefold(), 'hello')
self.assertEqual('hELlo'.casefold(), 'hello')
self.assertEqual('ß'.casefold(), 'ss')
self.assertEqual('fi'.casefold(), 'fi')
self.assertEqual('\u03a3'.casefold(), '\u03c3')
self.assertEqual('A\u0345\u03a3'.casefold(), 'a\u03b9\u03c3')
def test_upper(self):
string_tests.CommonTest.test_upper(self)
self.assertEqual('\U0001044F'.upper(), '\U00010427')
......
......@@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
Core and Builtins
-----------------
- Issue #13752: Add a casefold() method to str.
- Issue #13761: Add a "flush" keyword argument to the print() function,
used to ensure flushing the output stream.
......
......@@ -185,7 +185,7 @@ Py_UCS4 _PyUnicode_ToUppercase(Py_UCS4 ch)
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
if (ctype->flags & EXTENDED_CASE_MASK)
return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFFFF];
return _PyUnicode_ExtendedCase[ctype->upper & 0xFFFF];
return ctype->upper ? ctype->upper : ch;
}
......@@ -197,7 +197,7 @@ Py_UCS4 _PyUnicode_ToLowercase(Py_UCS4 ch)
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
if (ctype->flags & EXTENDED_CASE_MASK)
return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFFFF];
return _PyUnicode_ExtendedCase[ctype->lower & 0xFFFF];
return ctype->lower ? ctype->lower : ch;
}
......@@ -206,7 +206,7 @@ int _PyUnicode_ToLowerFull(Py_UCS4 ch, Py_UCS4 *res)
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
if (ctype->flags & EXTENDED_CASE_MASK) {
int index = ctype->lower & 0xFFFFFF;
int index = ctype->lower & 0xFFFF;
int n = ctype->lower >> 24;
int i;
for (i = 0; i < n; i++)
......@@ -222,7 +222,7 @@ int _PyUnicode_ToTitleFull(Py_UCS4 ch, Py_UCS4 *res)
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
if (ctype->flags & EXTENDED_CASE_MASK) {
int index = ctype->title & 0xFFFFFF;
int index = ctype->title & 0xFFFF;
int n = ctype->title >> 24;
int i;
for (i = 0; i < n; i++)
......@@ -238,7 +238,7 @@ int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
if (ctype->flags & EXTENDED_CASE_MASK) {
int index = ctype->upper & 0xFFFFFF;
int index = ctype->upper & 0xFFFF;
int n = ctype->upper >> 24;
int i;
for (i = 0; i < n; i++)
......@@ -249,6 +249,21 @@ int _PyUnicode_ToUpperFull(Py_UCS4 ch, Py_UCS4 *res)
return 1;
}
/* Store the full case folding of ch in res and return how many code
   points were written.

   For records flagged EXTENDED_CASE_MASK, bits 20-22 of the "lower" field
   hold the length of a dedicated folding sequence.  That sequence sits in
   _PyUnicode_ExtendedCase right after the lowercase sequence, i.e. at the
   offset in the low 16 bits plus the value in bits 24 and up.  When no
   dedicated folding is recorded, the full lowercase mapping is used. */
int _PyUnicode_ToFoldedFull(Py_UCS4 ch, Py_UCS4 *res)
{
    const _PyUnicode_TypeRecord *rec = gettyperecord(ch);
    int fold_len;
    int start;
    int pos;

    if (!(rec->flags & EXTENDED_CASE_MASK))
        return _PyUnicode_ToLowerFull(ch, res);

    fold_len = (rec->lower >> 20) & 7;
    if (fold_len == 0)
        return _PyUnicode_ToLowerFull(ch, res);

    /* Skip over the lowercase sequence to reach the folding sequence. */
    start = (rec->lower & 0xFFFF) + (rec->lower >> 24);
    for (pos = 0; pos < fold_len; pos++)
        res[pos] = _PyUnicode_ExtendedCase[start + pos];
    return fold_len;
}
int _PyUnicode_IsCased(Py_UCS4 ch)
{
const _PyUnicode_TypeRecord *ctype = gettyperecord(ch);
......
......@@ -9576,6 +9576,24 @@ do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar
return do_upper_or_lower(kind, data, length, res, maxchar, 1);
}
/* Case-fold `length` characters read from data (of the given kind) into
   res.  *maxchar is raised to the largest code point written, if that is
   larger than its incoming value.  Returns the number of code points
   stored in res, which may exceed `length` because one input character can
   fold to several output characters. */
static Py_ssize_t
do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
{
    Py_ssize_t src, out = 0;

    for (src = 0; src < length; src++) {
        Py_UCS4 folded[3];
        int count = _PyUnicode_ToFoldedFull(PyUnicode_READ(kind, data, src),
                                            folded);
        int pos = 0;

        while (pos < count) {
            Py_UCS4 ch = folded[pos++];

            if (*maxchar < ch)
                *maxchar = ch;
            res[out++] = ch;
        }
    }
    return out;
}
static Py_ssize_t
do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
{
......@@ -10501,6 +10519,22 @@ unicode_capitalize(PyObject *self)
return case_operation(self, do_capitalize);
}
PyDoc_STRVAR(casefold__doc__,
"S.casefold() -> str\n\
\n\
Return a version of S suitable for caseless comparisons.");

/* Implementation of str.casefold().  Returns NULL if the string cannot be
   made ready.  ASCII strings go through ascii_upper_or_lower() with its
   second argument set to 1; everything else goes through the generic case
   operation using do_casefold. */
static PyObject *
unicode_casefold(PyObject *self)
{
    if (PyUnicode_READY(self) == -1)
        return NULL;

    return PyUnicode_IS_ASCII(self)
        ? ascii_upper_or_lower(self, 1)
        : case_operation(self, do_casefold);
}
/* Argument converter. Coerces to a single unicode character */
static int
......@@ -12998,6 +13032,7 @@ static PyMethodDef unicode_methods[] = {
{"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
{"join", (PyCFunction) unicode_join, METH_O, join__doc__},
{"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
{"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
{"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
{"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
{"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
......
......@@ -76,7 +76,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{0, 0, 0, 0, 0, 4096},
{0, 0, 0, 0, 2, 3076},
{0, 0, 0, 0, 3, 3076},
{924, 181, 924, 0, 0, 9993},
{16777218, 17825792, 16777218, 0, 0, 26377},
{0, 0, 0, 0, 0, 5632},
{0, 0, 0, 0, 1, 3076},
{0, 0, 0, 0, 0, 3072},
......@@ -110,7 +110,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{220, 252, 220, 0, 0, 10113},
{221, 253, 221, 0, 0, 10113},
{222, 254, 222, 0, 0, 10113},
{33554433, 16777216, 33554435, 0, 0, 26377},
{33554438, 18874371, 33554440, 0, 0, 26377},
{192, 224, 192, 0, 0, 9993},
{193, 225, 193, 0, 0, 9993},
{194, 226, 194, 0, 0, 9993},
......@@ -190,7 +190,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{300, 301, 300, 0, 0, 9993},
{302, 303, 302, 0, 0, 10113},
{302, 303, 302, 0, 0, 9993},
{16777223, 33554437, 16777223, 0, 0, 26497},
{16777228, 33554442, 16777228, 0, 0, 26497},
{73, 305, 73, 0, 0, 9993},
{306, 307, 306, 0, 0, 10113},
{306, 307, 306, 0, 0, 9993},
......@@ -214,7 +214,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{325, 326, 325, 0, 0, 9993},
{327, 328, 327, 0, 0, 10113},
{327, 328, 327, 0, 0, 9993},
{33554441, 16777224, 33554441, 0, 0, 26377},
{33554448, 18874381, 33554448, 0, 0, 26377},
{330, 331, 330, 0, 0, 10113},
{330, 331, 330, 0, 0, 9993},
{332, 333, 332, 0, 0, 10113},
......@@ -268,7 +268,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{379, 380, 379, 0, 0, 9993},
{381, 382, 381, 0, 0, 10113},
{381, 382, 381, 0, 0, 9993},
{83, 383, 83, 0, 0, 9993},
{16777236, 17825810, 16777236, 0, 0, 26377},
{579, 384, 579, 0, 0, 9993},
{385, 595, 385, 0, 0, 10113},
{386, 387, 386, 0, 0, 10113},
......@@ -371,7 +371,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{492, 493, 492, 0, 0, 9993},
{494, 495, 494, 0, 0, 10113},
{494, 495, 494, 0, 0, 9993},
{33554444, 16777227, 33554444, 0, 0, 26377},
{33554456, 18874389, 33554456, 0, 0, 26377},
{497, 499, 498, 0, 0, 10113},
{497, 499, 498, 0, 0, 10049},
{497, 499, 498, 0, 0, 9993},
......@@ -490,7 +490,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{439, 658, 439, 0, 0, 9993},
{0, 0, 0, 0, 0, 14089},
{0, 0, 0, 0, 0, 5889},
{921, 837, 921, 0, 0, 13832},
{16777244, 17825818, 16777244, 0, 0, 30216},
{880, 881, 880, 0, 0, 10113},
{880, 881, 880, 0, 0, 9993},
{882, 883, 882, 0, 0, 10113},
......@@ -508,7 +508,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{908, 972, 908, 0, 0, 10113},
{910, 973, 910, 0, 0, 10113},
{911, 974, 911, 0, 0, 10113},
{50331663, 16777230, 50331663, 0, 0, 26377},
{50331681, 19922973, 50331681, 0, 0, 26377},
{913, 945, 913, 0, 0, 10113},
{914, 946, 914, 0, 0, 10113},
{915, 947, 915, 0, 0, 10113},
......@@ -539,7 +539,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{904, 941, 904, 0, 0, 9993},
{905, 942, 905, 0, 0, 9993},
{906, 943, 906, 0, 0, 9993},
{50331667, 16777234, 50331667, 0, 0, 26377},
{50331688, 19922980, 50331688, 0, 0, 26377},
{913, 945, 913, 0, 0, 9993},
{914, 946, 914, 0, 0, 9993},
{915, 947, 915, 0, 0, 9993},
......@@ -557,7 +557,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{927, 959, 927, 0, 0, 9993},
{928, 960, 928, 0, 0, 9993},
{929, 961, 929, 0, 0, 9993},
{931, 962, 931, 0, 0, 9993},
{16777261, 17825835, 16777261, 0, 0, 26377},
{931, 963, 931, 0, 0, 9993},
{932, 964, 932, 0, 0, 9993},
{933, 965, 933, 0, 0, 9993},
......@@ -571,11 +571,11 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{910, 973, 910, 0, 0, 9993},
{911, 974, 911, 0, 0, 9993},
{975, 983, 975, 0, 0, 10113},
{914, 976, 914, 0, 0, 9993},
{920, 977, 920, 0, 0, 9993},
{16777264, 17825838, 16777264, 0, 0, 26377},
{16777267, 17825841, 16777267, 0, 0, 26377},
{0, 0, 0, 0, 0, 10113},
{934, 981, 934, 0, 0, 9993},
{928, 982, 928, 0, 0, 9993},
{16777270, 17825844, 16777270, 0, 0, 26377},
{16777273, 17825847, 16777273, 0, 0, 26377},
{975, 983, 975, 0, 0, 9993},
{984, 985, 984, 0, 0, 10113},
{984, 985, 984, 0, 0, 9993},
......@@ -601,11 +601,11 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{1004, 1005, 1004, 0, 0, 9993},
{1006, 1007, 1006, 0, 0, 10113},
{1006, 1007, 1006, 0, 0, 9993},
{922, 1008, 922, 0, 0, 9993},
{929, 1009, 929, 0, 0, 9993},
{16777276, 17825850, 16777276, 0, 0, 26377},
{16777279, 17825853, 16777279, 0, 0, 26377},
{1017, 1010, 1017, 0, 0, 9993},
{1012, 952, 1012, 0, 0, 10113},
{917, 1013, 917, 0, 0, 9993},
{16777282, 17825856, 16777282, 0, 0, 26377},
{1015, 1016, 1015, 0, 0, 10113},
{1015, 1016, 1015, 0, 0, 9993},
{1017, 1010, 1017, 0, 0, 10113},
......@@ -978,7 +978,7 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{1364, 1412, 1364, 0, 0, 9993},
{1365, 1413, 1365, 0, 0, 9993},
{1366, 1414, 1366, 0, 0, 9993},
{33554455, 16777238, 33554457, 0, 0, 26377},
{33554502, 18874435, 33554504, 0, 0, 26377},
{0, 0, 0, 0, 0, 1537},
{4256, 11520, 4256, 0, 0, 10113},
{4257, 11521, 4257, 0, 0, 10113},
......@@ -1180,13 +1180,13 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{7826, 7827, 7826, 0, 0, 9993},
{7828, 7829, 7828, 0, 0, 10113},
{7828, 7829, 7828, 0, 0, 9993},
{33554460, 16777243, 33554460, 0, 0, 26377},
{33554463, 16777246, 33554463, 0, 0, 26377},
{33554466, 16777249, 33554466, 0, 0, 26377},
{33554469, 16777252, 33554469, 0, 0, 26377},
{33554472, 16777255, 33554472, 0, 0, 26377},
{7776, 7835, 7776, 0, 0, 9993},
{7838, 223, 7838, 0, 0, 10113},
{33554509, 18874442, 33554509, 0, 0, 26377},
{33554514, 18874447, 33554514, 0, 0, 26377},
{33554519, 18874452, 33554519, 0, 0, 26377},
{33554524, 18874457, 33554524, 0, 0, 26377},
{33554529, 18874462, 33554529, 0, 0, 26377},
{16777317, 17825891, 16777317, 0, 0, 26377},
{16777321, 18874470, 16777321, 0, 0, 26497},
{7840, 7841, 7840, 0, 0, 10113},
{7840, 7841, 7840, 0, 0, 9993},
{7842, 7843, 7842, 0, 0, 10113},
......@@ -1355,13 +1355,13 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{8011, 8003, 8011, 0, 0, 10113},
{8012, 8004, 8012, 0, 0, 10113},
{8013, 8005, 8013, 0, 0, 10113},
{33554475, 16777258, 33554475, 0, 0, 26377},
{33554541, 18874474, 33554541, 0, 0, 26377},
{8025, 8017, 8025, 0, 0, 9993},
{50331694, 16777261, 50331694, 0, 0, 26377},
{50331763, 19923055, 50331763, 0, 0, 26377},
{8027, 8019, 8027, 0, 0, 9993},
{50331698, 16777265, 50331698, 0, 0, 26377},
{50331770, 19923062, 50331770, 0, 0, 26377},
{8029, 8021, 8029, 0, 0, 9993},
{50331702, 16777269, 50331702, 0, 0, 26377},
{50331777, 19923069, 50331777, 0, 0, 26377},
{8031, 8023, 8031, 0, 0, 9993},
{8025, 8017, 8025, 0, 0, 10113},
{8027, 8019, 8027, 0, 0, 10113},
......@@ -1397,110 +1397,110 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{8171, 8059, 8171, 0, 0, 9993},
{8186, 8060, 8186, 0, 0, 9993},
{8187, 8061, 8187, 0, 0, 9993},
{33554490, 16777273, 16777276, 0, 0, 26377},
{33554494, 16777277, 16777280, 0, 0, 26377},
{33554498, 16777281, 16777284, 0, 0, 26377},
{33554502, 16777285, 16777288, 0, 0, 26377},
{33554506, 16777289, 16777292, 0, 0, 26377},
{33554510, 16777293, 16777296, 0, 0, 26377},
{33554514, 16777297, 16777300, 0, 0, 26377},
{33554518, 16777301, 16777304, 0, 0, 26377},
{33554522, 16777305, 16777308, 0, 0, 26433},
{33554526, 16777309, 16777312, 0, 0, 26433},
{33554530, 16777313, 16777316, 0, 0, 26433},
{33554534, 16777317, 16777320, 0, 0, 26433},
{33554538, 16777321, 16777324, 0, 0, 26433},
{33554542, 16777325, 16777328, 0, 0, 26433},
{33554546, 16777329, 16777332, 0, 0, 26433},
{33554550, 16777333, 16777336, 0, 0, 26433},
{33554554, 16777337, 16777340, 0, 0, 26377},
{33554558, 16777341, 16777344, 0, 0, 26377},
{33554562, 16777345, 16777348, 0, 0, 26377},
{33554566, 16777349, 16777352, 0, 0, 26377},
{33554570, 16777353, 16777356, 0, 0, 26377},
{33554574, 16777357, 16777360, 0, 0, 26377},
{33554578, 16777361, 16777364, 0, 0, 26377},
{33554582, 16777365, 16777368, 0, 0, 26377},
{33554586, 16777369, 16777372, 0, 0, 26433},
{33554590, 16777373, 16777376, 0, 0, 26433},
{33554594, 16777377, 16777380, 0, 0, 26433},
{33554598, 16777381, 16777384, 0, 0, 26433},
{33554602, 16777385, 16777388, 0, 0, 26433},
{33554606, 16777389, 16777392, 0, 0, 26433},
{33554610, 16777393, 16777396, 0, 0, 26433},
{33554614, 16777397, 16777400, 0, 0, 26433},
{33554618, 16777401, 16777404, 0, 0, 26377},
{33554622, 16777405, 16777408, 0, 0, 26377},
{33554626, 16777409, 16777412, 0, 0, 26377},
{33554630, 16777413, 16777416, 0, 0, 26377},
{33554634, 16777417, 16777420, 0, 0, 26377},
{33554638, 16777421, 16777424, 0, 0, 26377},
{33554642, 16777425, 16777428, 0, 0, 26377},
{33554646, 16777429, 16777432, 0, 0, 26377},
{33554650, 16777433, 16777436, 0, 0, 26433},
{33554654, 16777437, 16777440, 0, 0, 26433},
{33554658, 16777441, 16777444, 0, 0, 26433},
{33554662, 16777445, 16777448, 0, 0, 26433},
{33554666, 16777449, 16777452, 0, 0, 26433},
{33554670, 16777453, 16777456, 0, 0, 26433},
{33554674, 16777457, 16777460, 0, 0, 26433},
{33554678, 16777461, 16777464, 0, 0, 26433},
{33554567, 18874500, 16777353, 0, 0, 26377},
{33554573, 18874506, 16777359, 0, 0, 26377},
{33554579, 18874512, 16777365, 0, 0, 26377},
{33554585, 18874518, 16777371, 0, 0, 26377},
{33554591, 18874524, 16777377, 0, 0, 26377},
{33554597, 18874530, 16777383, 0, 0, 26377},
{33554603, 18874536, 16777389, 0, 0, 26377},
{33554609, 18874542, 16777395, 0, 0, 26377},
{33554615, 18874548, 16777401, 0, 0, 26433},
{33554621, 18874554, 16777407, 0, 0, 26433},
{33554627, 18874560, 16777413, 0, 0, 26433},
{33554633, 18874566, 16777419, 0, 0, 26433},
{33554639, 18874572, 16777425, 0, 0, 26433},
{33554645, 18874578, 16777431, 0, 0, 26433},
{33554651, 18874584, 16777437, 0, 0, 26433},
{33554657, 18874590, 16777443, 0, 0, 26433},
{33554663, 18874596, 16777449, 0, 0, 26377},
{33554669, 18874602, 16777455, 0, 0, 26377},
{33554675, 18874608, 16777461, 0, 0, 26377},
{33554681, 18874614, 16777467, 0, 0, 26377},
{33554687, 18874620, 16777473, 0, 0, 26377},
{33554693, 18874626, 16777479, 0, 0, 26377},
{33554699, 18874632, 16777485, 0, 0, 26377},
{33554705, 18874638, 16777491, 0, 0, 26377},
{33554711, 18874644, 16777497, 0, 0, 26433},
{33554717, 18874650, 16777503, 0, 0, 26433},
{33554723, 18874656, 16777509, 0, 0, 26433},
{33554729, 18874662, 16777515, 0, 0, 26433},
{33554735, 18874668, 16777521, 0, 0, 26433},
{33554741, 18874674, 16777527, 0, 0, 26433},
{33554747, 18874680, 16777533, 0, 0, 26433},
{33554753, 18874686, 16777539, 0, 0, 26433},
{33554759, 18874692, 16777545, 0, 0, 26377},
{33554765, 18874698, 16777551, 0, 0, 26377},
{33554771, 18874704, 16777557, 0, 0, 26377},
{33554777, 18874710, 16777563, 0, 0, 26377},
{33554783, 18874716, 16777569, 0, 0, 26377},
{33554789, 18874722, 16777575, 0, 0, 26377},
{33554795, 18874728, 16777581, 0, 0, 26377},
{33554801, 18874734, 16777587, 0, 0, 26377},
{33554807, 18874740, 16777593, 0, 0, 26433},
{33554813, 18874746, 16777599, 0, 0, 26433},
{33554819, 18874752, 16777605, 0, 0, 26433},
{33554825, 18874758, 16777611, 0, 0, 26433},
{33554831, 18874764, 16777617, 0, 0, 26433},
{33554837, 18874770, 16777623, 0, 0, 26433},
{33554843, 18874776, 16777629, 0, 0, 26433},
{33554849, 18874782, 16777635, 0, 0, 26433},
{8120, 8112, 8120, 0, 0, 9993},
{8121, 8113, 8121, 0, 0, 9993},
{33554682, 16777465, 33554684, 0, 0, 26377},
{33554687, 16777470, 16777473, 0, 0, 26377},
{33554691, 16777474, 33554693, 0, 0, 26377},
{33554696, 16777479, 33554696, 0, 0, 26377},
{50331915, 16777482, 50331918, 0, 0, 26377},
{33554855, 18874788, 33554857, 0, 0, 26377},
{33554862, 18874795, 16777648, 0, 0, 26377},
{33554868, 18874801, 33554870, 0, 0, 26377},
{33554875, 18874808, 33554875, 0, 0, 26377},
{50332097, 19923389, 50332100, 0, 0, 26377},
{8120, 8112, 8120, 0, 0, 10113},
{8121, 8113, 8121, 0, 0, 10113},
{8122, 8048, 8122, 0, 0, 10113},
{8123, 8049, 8123, 0, 0, 10113},
{33554706, 16777489, 16777492, 0, 0, 26433},
{921, 8126, 921, 0, 0, 9993},
{33554710, 16777493, 33554712, 0, 0, 26377},
{33554715, 16777498, 16777501, 0, 0, 26377},
{33554719, 16777502, 33554721, 0, 0, 26377},
{33554724, 16777507, 33554724, 0, 0, 26377},
{50331943, 16777510, 50331946, 0, 0, 26377},
{33554890, 18874823, 16777676, 0, 0, 26433},
{16777679, 17826253, 16777679, 0, 0, 26377},
{33554899, 18874832, 33554901, 0, 0, 26377},
{33554906, 18874839, 16777692, 0, 0, 26377},
{33554912, 18874845, 33554914, 0, 0, 26377},
{33554919, 18874852, 33554919, 0, 0, 26377},
{50332141, 19923433, 50332144, 0, 0, 26377},
{8136, 8050, 8136, 0, 0, 10113},
{8137, 8051, 8137, 0, 0, 10113},
{8138, 8052, 8138, 0, 0, 10113},
{8139, 8053, 8139, 0, 0, 10113},
{33554734, 16777517, 16777520, 0, 0, 26433},
{33554934, 18874867, 16777720, 0, 0, 26433},
{8152, 8144, 8152, 0, 0, 9993},
{8153, 8145, 8153, 0, 0, 9993},
{50331954, 16777521, 50331954, 0, 0, 26377},
{50331958, 16777525, 50331958, 0, 0, 26377},
{33554746, 16777529, 33554746, 0, 0, 26377},
{50331965, 16777532, 50331965, 0, 0, 26377},
{50332157, 19923449, 50332157, 0, 0, 26377},
{50332164, 19923456, 50332164, 0, 0, 26377},
{33554954, 18874887, 33554954, 0, 0, 26377},
{50332176, 19923468, 50332176, 0, 0, 26377},
{8152, 8144, 8152, 0, 0, 10113},
{8153, 8145, 8153, 0, 0, 10113},
{8154, 8054, 8154, 0, 0, 10113},
{8155, 8055, 8155, 0, 0, 10113},
{8168, 8160, 8168, 0, 0, 9993},
{8169, 8161, 8169, 0, 0, 9993},
{50331969, 16777536, 50331969, 0, 0, 26377},
{50331973, 16777540, 50331973, 0, 0, 26377},
{33554761, 16777544, 33554761, 0, 0, 26377},
{50332183, 19923475, 50332183, 0, 0, 26377},
{50332190, 19923482, 50332190, 0, 0, 26377},
{33554980, 18874913, 33554980, 0, 0, 26377},
{8172, 8165, 8172, 0, 0, 9993},
{33554764, 16777547, 33554764, 0, 0, 26377},
{50331983, 16777550, 50331983, 0, 0, 26377},
{33554985, 18874918, 33554985, 0, 0, 26377},
{50332207, 19923499, 50332207, 0, 0, 26377},
{8168, 8160, 8168, 0, 0, 10113},
{8169, 8161, 8169, 0, 0, 10113},
{8170, 8058, 8170, 0, 0, 10113},
{8171, 8059, 8171, 0, 0, 10113},
{8172, 8165, 8172, 0, 0, 10113},
{33554771, 16777554, 33554773, 0, 0, 26377},
{33554776, 16777559, 16777562, 0, 0, 26377},
{33554780, 16777563, 33554782, 0, 0, 26377},
{33554785, 16777568, 33554785, 0, 0, 26377},
{50332004, 16777571, 50332007, 0, 0, 26377},
{33554997, 18874930, 33554999, 0, 0, 26377},
{33555004, 18874937, 16777790, 0, 0, 26377},
{33555010, 18874943, 33555012, 0, 0, 26377},
{33555017, 18874950, 33555017, 0, 0, 26377},
{50332239, 19923531, 50332242, 0, 0, 26377},
{8184, 8056, 8184, 0, 0, 10113},
{8185, 8057, 8185, 0, 0, 10113},
{8186, 8060, 8186, 0, 0, 10113},
{8187, 8061, 8187, 0, 0, 10113},
{33554795, 16777578, 16777581, 0, 0, 26433},
{33555032, 18874965, 16777818, 0, 0, 26433},
{0, 0, 0, 0, 0, 3076},
{0, 0, 0, 0, 4, 3076},
{0, 0, 0, 0, 5, 3076},
......@@ -2037,18 +2037,18 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
{42918, 42919, 42918, 0, 0, 9993},
{42920, 42921, 42920, 0, 0, 10113},
{42920, 42921, 42920, 0, 0, 9993},
{33554799, 16777582, 33554801, 0, 0, 26377},
{33554804, 16777587, 33554806, 0, 0, 26377},
{33554809, 16777592, 33554811, 0, 0, 26377},
{50332030, 16777597, 50332033, 0, 0, 26377},
{50332037, 16777604, 50332040, 0, 0, 26377},
{33554828, 16777611, 33554830, 0, 0, 26377},
{33554833, 16777616, 33554835, 0, 0, 26377},
{33554838, 16777621, 33554840, 0, 0, 26377},
{33554843, 16777626, 33554845, 0, 0, 26377},
{33554848, 16777631, 33554850, 0, 0, 26377},
{33554853, 16777636, 33554855, 0, 0, 26377},
{33554858, 16777641, 33554860, 0, 0, 26377},
{33555038, 18874971, 33555040, 0, 0, 26377},
{33555045, 18874978, 33555047, 0, 0, 26377},
{33555052, 18874985, 33555054, 0, 0, 26377},
{50332276, 19923568, 50332279, 0, 0, 26377},
{50332286, 19923578, 50332289, 0, 0, 26377},
{33555079, 18875012, 33555081, 0, 0, 26377},
{33555086, 18875019, 33555088, 0, 0, 26377},
{33555093, 18875026, 33555095, 0, 0, 26377},
{33555100, 18875033, 33555102, 0, 0, 26377},
{33555107, 18875040, 33555109, 0, 0, 26377},
{33555114, 18875047, 33555116, 0, 0, 26377},
{33555121, 18875054, 33555123, 0, 0, 26377},
{0, 0, 0, 0, 0, 1025},
{65313, 65345, 65313, 0, 0, 10113},
{65314, 65346, 65314, 0, 0, 10113},
......@@ -2188,7 +2188,12 @@ const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
/* extended case mappings */
const Py_UCS4 _PyUnicode_ExtendedCase[] = {
181,
956,
924,
223,
115,
115,
83,
83,
83,
......@@ -2198,263 +2203,440 @@ const Py_UCS4 _PyUnicode_ExtendedCase[] = {
304,
329,
700,
110,
700,
78,
383,
115,
83,
496,
106,
780,
74,
780,
837,
953,
921,
912,
953,
776,
769,
921,
776,
769,
944,
965,
776,
769,
933,
776,
769,
962,
963,
931,
976,
946,
914,
977,
952,
920,
981,
966,
934,
982,
960,
928,
1008,
954,
922,
1009,
961,
929,
1013,
949,
917,
1415,
1381,
1410,
1333,
1362,
1333,
1410,
7830,
104,
817,
72,
817,
7831,
116,
776,
84,
776,
7832,
119,
778,
87,
778,
7833,
121,
778,
89,
778,
7834,
97,
702,
65,
702,
7835,
7777,
7776,
223,
115,
115,
7838,
8016,
965,
787,
933,
787,
8018,
965,
787,
768,
933,
787,
768,
8020,
965,
787,
769,
933,
787,
769,
8022,
965,
787,
834,
933,
787,
834,
8064,
7936,
953,
7944,
921,
8072,
8065,
7937,
953,
7945,
921,
8073,
8066,
7938,
953,
7946,
921,
8074,
8067,
7939,
953,
7947,
921,
8075,
8068,
7940,
953,
7948,
921,
8076,
8069,
7941,
953,
7949,
921,
8077,
8070,
7942,
953,
7950,
921,
8078,
8071,
7943,
953,
7951,
921,
8079,
8064,
7936,
953,
7944,
921,
8072,
8065,
7937,
953,
7945,
921,
8073,
8066,
7938,
953,
7946,
921,
8074,
8067,
7939,
953,
7947,
921,
8075,
8068,
7940,
953,
7948,
921,
8076,
8069,
7941,
953,
7949,
921,
8077,
8070,
7942,
953,
7950,
921,
8078,
8071,
7943,
953,
7951,
921,
8079,
8080,
7968,
953,
7976,
921,
8088,
8081,
7969,
953,
7977,
921,
8089,
8082,
7970,
953,
7978,
921,
8090,
8083,
7971,
953,
7979,
921,
8091,
8084,
7972,
953,
7980,
921,
8092,
8085,
7973,
953,
7981,
921,
8093,
8086,
7974,
953,
7982,
921,
8094,
8087,
7975,
953,
7983,
921,
8095,
8080,
7968,
953,
7976,
921,
8088,
8081,
7969,
953,
7977,
921,
8089,
8082,
7970,
953,
7978,
921,
8090,
8083,
7971,
953,
7979,
921,
8091,
8084,
7972,
953,
7980,
921,
8092,
8085,
7973,
953,
7981,
921,
8093,
8086,
7974,
953,
7982,
921,
8094,
8087,
7975,
953,
7983,
921,
8095,
8096,
8032,
953,
8040,
921,
8104,
8097,
8033,
953,
8041,
921,
8105,
8098,
8034,
953,
8042,
921,
8106,
8099,
8035,
953,
8043,
921,
8107,
8100,
8036,
953,
8044,
921,
8108,
8101,
8037,
953,
8045,
921,
8109,
8102,
8038,
953,
8046,
921,
8110,
8103,
8039,
953,
8047,
921,
8111,
8096,
8032,
953,
8040,
921,
8104,
8097,
8033,
953,
8041,
921,
8105,
8098,
8034,
953,
8042,
921,
8106,
8099,
8035,
953,
8043,
921,
8107,
8100,
8036,
953,
8044,
921,
8108,
8101,
8037,
953,
8045,
921,
8109,
8102,
8038,
953,
8046,
921,
8110,
8103,
8039,
953,
8047,
921,
8111,
8114,
8048,
953,
8122,
921,
8122,
837,
8115,
945,
953,
913,
921,
8124,
8116,
940,
953,
902,
921,
902,
837,
8118,
945,
834,
913,
834,
8119,
945,
834,
953,
913,
834,
921,
......@@ -2462,27 +2644,43 @@ const Py_UCS4 _PyUnicode_ExtendedCase[] = {
834,
837,
8115,
945,
953,
913,
921,
8124,
8126,
953,
921,
8130,
8052,
953,
8138,
921,
8138,
837,
8131,
951,
953,
919,
921,
8140,
8132,
942,
953,
905,
921,
905,
837,
8134,
951,
834,
919,
834,
8135,
951,
834,
953,
919,
834,
921,
......@@ -2490,60 +2688,97 @@ const Py_UCS4 _PyUnicode_ExtendedCase[] = {
834,
837,
8131,
951,
953,
919,
921,
8140,
8146,
953,
776,
768,
921,
776,
768,
8147,
953,
776,
769,
921,
776,
769,
8150,
953,
834,
921,
834,
8151,
953,
776,
834,
921,
776,
834,
8162,
965,
776,
768,
933,
776,
768,
8163,
965,
776,
769,
933,
776,
769,
8164,
961,
787,
929,
787,
8166,
965,
834,
933,
834,
8167,
965,
776,
834,
933,
776,
834,
8178,
8060,
953,
8186,
921,
8186,
837,
8179,
969,
953,
937,
921,
8188,
8180,
974,
953,
911,
921,
911,
837,
8182,
969,
834,
937,
834,
8183,
969,
834,
953,
937,
834,
921,
......@@ -2551,25 +2786,36 @@ const Py_UCS4 _PyUnicode_ExtendedCase[] = {
834,
837,
8179,
969,
953,
937,
921,
8188,
64256,
102,
102,
70,
70,
70,
102,
64257,
102,
105,
70,
73,
70,
105,
64258,
102,
108,
70,
76,
70,
108,
64259,
102,
102,
105,
70,
70,
73,
......@@ -2577,6 +2823,9 @@ const Py_UCS4 _PyUnicode_ExtendedCase[] = {
102,
105,
64260,
102,
102,
108,
70,
70,
76,
......@@ -2584,36 +2833,50 @@ const Py_UCS4 _PyUnicode_ExtendedCase[] = {
102,
108,
64261,
115,
116,
83,
84,
83,
116,
64262,
115,
116,
83,
84,
83,
116,
64275,
1396,
1398,
1348,
1350,
1348,
1398,
64276,
1396,
1381,
1348,
1333,
1348,
1381,
64277,
1396,
1387,
1348,
1339,
1348,
1387,
64278,
1406,
1398,
1358,
1350,
1358,
1398,
64279,
1396,
1389,
1348,
1341,
1348,
......
......@@ -49,6 +49,7 @@ LINE_BREAK = "LineBreak%s.txt"
NAME_ALIASES = "NameAliases%s.txt"
NAMED_SEQUENCES = "NamedSequences%s.txt"
SPECIAL_CASING = "SpecialCasing%s.txt"
CASE_FOLDING = "CaseFolding%s.txt"
# Private Use Areas -- in planes 1, 15, 16
PUA_1 = range(0xE000, 0xF900)
......@@ -424,7 +425,7 @@ def makeunicodetype(unicode, trace):
if "Case_Ignorable" in properties:
flags |= CASE_IGNORABLE_MASK
sc = unicode.special_casing.get(char)
if sc is None:
cf = unicode.case_folding.get(char, [char])
if record[12]:
upper = int(record[12], 16)
else:
......@@ -437,15 +438,23 @@ def makeunicodetype(unicode, trace):
title = int(record[14], 16)
else:
title = upper
if sc is None and cf != [lower]:
sc = ([lower], [title], [upper])
if sc is None:
if upper == lower == title:
upper = lower = title = 0
else:
# This happens when some character maps to more than one
# character in uppercase, lowercase, or titlecase. The extra
# characters are stored in a different array.
# This happens either when some character maps to more than one
# character in uppercase, lowercase, or titlecase or the
# casefolded version of the character is different from the
# lowercase. The extra characters are stored in a different
# array.
flags |= EXTENDED_CASE_MASK
lower = len(extra_casing) | (len(sc[0]) << 24)
extra_casing.extend(sc[0])
if cf != sc[0]:
lower |= len(cf) << 20
extra_casing.extend(cf)
upper = len(extra_casing) | (len(sc[2]) << 24)
extra_casing.extend(sc[2])
# Title is probably equal to upper.
......@@ -1107,6 +1116,17 @@ class UnicodeData:
title = [int(char, 16) for char in data[2].split()]
upper = [int(char, 16) for char in data[3].split()]
sc[c] = (lower, title, upper)
cf = self.case_folding = {}
if version != '3.2.0':
with open_data(CASE_FOLDING, version) as file:
for s in file:
s = s[:-1].split('#', 1)[0]
if not s:
continue
data = s.split("; ")
if data[1] in "CF":
c = int(data[0], 16)
cf[c] = [int(char, 16) for char in data[2].split()]
def uselatin1(self):
# restrict character range to ISO Latin 1
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment