Commit e988e286 authored by Antoine Pitrou

Issue #1734234: Massively speed up `unicodedata.normalize()` when the

string is already in normalized form, by performing a quick check beforehand.
Original patch by Rauli Ruohonen.
parent 8b8f8cc1
...@@ -612,6 +612,7 @@ Craig Rowland ...@@ -612,6 +612,7 @@ Craig Rowland
Paul Rubin Paul Rubin
Sam Ruby Sam Ruby
Audun S. Runde Audun S. Runde
Rauli Ruohonen
Jeff Rush Jeff Rush
Sam Rushing Sam Rushing
Mark Russell Mark Russell
......
...@@ -255,6 +255,10 @@ Core and Builtins ...@@ -255,6 +255,10 @@ Core and Builtins
Library Library
------- -------
- Issue #1734234: Massively speed up ``unicodedata.normalize()`` when the
string is already in normalized form, by performing a quick check beforehand.
Original patch by Rauli Ruohonen.
- Issue #5853: calling a function of the mimetypes module from several threads - Issue #5853: calling a function of the mimetypes module from several threads
at once could hit the recursion limit if the mimetypes database hadn't been at once could hit the recursion limit if the mimetypes database hadn't been
initialized before. initialized before.
......
...@@ -27,6 +27,7 @@ typedef struct { ...@@ -27,6 +27,7 @@ typedef struct {
const unsigned char mirrored; /* true if mirrored in bidir mode */ const unsigned char mirrored; /* true if mirrored in bidir mode */
const unsigned char east_asian_width; /* index into const unsigned char east_asian_width; /* index into
_PyUnicode_EastAsianWidth */ _PyUnicode_EastAsianWidth */
const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord; } _PyUnicode_DatabaseRecord;
typedef struct change_record { typedef struct change_record {
...@@ -720,7 +721,39 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) ...@@ -720,7 +721,39 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result)); PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
return result; return result;
} }
/* Quick pre-check used before running a full normalization pass.

   Arguments:
     self  - non-NULL when an older version of the database is requested;
             the quick-check data is not consulted in that case.
     input - the unicode string to inspect.
     nfc   - non-zero selects the quick-check bits 4 positions higher.
     k     - non-zero selects the quick-check bits 2 positions higher.

   Returns 1 if the input is certainly normalized, 0 if it might not be
   (the caller must then run the real normalization). */
static int
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
{
    Py_UNICODE *cur, *stop;
    unsigned char last_class = 0;
    unsigned char mask;
    int shift = 0;

    /* Quick checks must be disabled when an older version of the
       database is requested, so answer "might not be normalized". */
    if (self != NULL)
        return 0;

    /* Each form owns a two-bit field in normalization_quick_check:
       0=Yes, 1=Maybe, 2=No, as described in
       http://unicode.org/reports/tr15/#Annex8.  Any non-zero value in the
       selected field therefore means "not provably normalized". */
    if (nfc)
        shift += 4;
    if (k)
        shift += 2;
    mask = 3 << shift;

    cur = PyUnicode_AS_UNICODE(input);
    stop = cur + PyUnicode_GET_SIZE(input);
    for (; cur < stop; cur++) {
        const _PyUnicode_DatabaseRecord *rec = _getrecord_ex(*cur);
        unsigned char cls = rec->combining;

        /* Maybe or No for the requested form: give up on the shortcut. */
        if (rec->normalization_quick_check & mask)
            return 0;

        /* A non-zero combining class lower than the previous one means
           the marks are not in canonical sort order. */
        if (cls != 0 && cls < last_class)
            return 0;

        last_class = cls;
    }

    /* Every character passed both checks: certainly normalized. */
    return 1;
}
PyDoc_STRVAR(unicodedata_normalize__doc__, PyDoc_STRVAR(unicodedata_normalize__doc__,
"normalize(form, unistr)\n\ "normalize(form, unistr)\n\
\n\ \n\
...@@ -744,14 +777,34 @@ unicodedata_normalize(PyObject *self, PyObject *args) ...@@ -744,14 +777,34 @@ unicodedata_normalize(PyObject *self, PyObject *args)
return input; return input;
} }
if (strcmp(form, "NFC") == 0) if (strcmp(form, "NFC") == 0) {
if (is_normalized(self, input, 1, 0)) {
Py_INCREF(input);
return input;
}
return nfc_nfkc(self, input, 0); return nfc_nfkc(self, input, 0);
if (strcmp(form, "NFKC") == 0) }
if (strcmp(form, "NFKC") == 0) {
if (is_normalized(self, input, 1, 1)) {
Py_INCREF(input);
return input;
}
return nfc_nfkc(self, input, 1); return nfc_nfkc(self, input, 1);
if (strcmp(form, "NFD") == 0) }
if (strcmp(form, "NFD") == 0) {
if (is_normalized(self, input, 0, 0)) {
Py_INCREF(input);
return input;
}
return nfd_nfkd(self, input, 0); return nfd_nfkd(self, input, 0);
if (strcmp(form, "NFKD") == 0) }
if (strcmp(form, "NFKD") == 0) {
if (is_normalized(self, input, 0, 1)) {
Py_INCREF(input);
return input;
}
return nfd_nfkd(self, input, 1); return nfd_nfkd(self, input, 1);
}
PyErr_SetString(PyExc_ValueError, "invalid normalization form"); PyErr_SetString(PyExc_ValueError, "invalid normalization form");
return NULL; return NULL;
} }
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -34,6 +34,7 @@ UNIDATA_VERSION = "5.1.0" ...@@ -34,6 +34,7 @@ UNIDATA_VERSION = "5.1.0"
UNICODE_DATA = "UnicodeData%s.txt" UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt" COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt" EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
old_versions = ["3.2.0"] old_versions = ["3.2.0"]
...@@ -66,7 +67,8 @@ def maketables(trace=0): ...@@ -66,7 +67,8 @@ def maketables(trace=0):
version = "" version = ""
unicode = UnicodeData(UNICODE_DATA % version, unicode = UnicodeData(UNICODE_DATA % version,
COMPOSITION_EXCLUSIONS % version, COMPOSITION_EXCLUSIONS % version,
EASTASIAN_WIDTH % version) EASTASIAN_WIDTH % version,
DERIVEDNORMALIZATION_PROPS % version)
print len(filter(None, unicode.table)), "characters" print len(filter(None, unicode.table)), "characters"
...@@ -87,7 +89,7 @@ def maketables(trace=0): ...@@ -87,7 +89,7 @@ def maketables(trace=0):
def makeunicodedata(unicode, trace): def makeunicodedata(unicode, trace):
dummy = (0, 0, 0, 0, 0) dummy = (0, 0, 0, 0, 0, 0)
table = [dummy] table = [dummy]
cache = {0: dummy} cache = {0: dummy}
index = [0] * len(unicode.chars) index = [0] * len(unicode.chars)
...@@ -107,8 +109,10 @@ def makeunicodedata(unicode, trace): ...@@ -107,8 +109,10 @@ def makeunicodedata(unicode, trace):
bidirectional = BIDIRECTIONAL_NAMES.index(record[4]) bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
mirrored = record[9] == "Y" mirrored = record[9] == "Y"
eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15]) eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
normalizationquickcheck = record[16]
item = ( item = (
category, combining, bidirectional, mirrored, eastasianwidth category, combining, bidirectional, mirrored, eastasianwidth,
normalizationquickcheck
) )
# add entry to index and item tables # add entry to index and item tables
i = cache.get(item) i = cache.get(item)
...@@ -222,7 +226,7 @@ def makeunicodedata(unicode, trace): ...@@ -222,7 +226,7 @@ def makeunicodedata(unicode, trace):
print >>fp, \ print >>fp, \
"const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {" "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
for item in table: for item in table:
print >>fp, " {%d, %d, %d, %d, %d}," % item print >>fp, " {%d, %d, %d, %d, %d, %d}," % item
print >>fp, "};" print >>fp, "};"
print >>fp print >>fp
...@@ -698,7 +702,8 @@ import sys ...@@ -698,7 +702,8 @@ import sys
class UnicodeData: class UnicodeData:
def __init__(self, filename, exclusions, eastasianwidth, expand=1): def __init__(self, filename, exclusions, eastasianwidth,
derivednormalizationprops=None, expand=1):
self.changed = [] self.changed = []
file = open(filename) file = open(filename)
table = [None] * 0x110000 table = [None] * 0x110000
...@@ -761,6 +766,28 @@ class UnicodeData: ...@@ -761,6 +766,28 @@ class UnicodeData:
for i in range(0, 0x110000): for i in range(0, 0x110000):
if table[i] is not None: if table[i] is not None:
table[i].append(widths[i]) table[i].append(widths[i])
if derivednormalizationprops:
quickchecks = [0] * 0x110000 # default is Yes
qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
for s in open(derivednormalizationprops):
if '#' in s:
s = s[:s.index('#')]
s = [i.strip() for i in s.split(';')]
if len(s) < 2 or s[1] not in qc_order:
continue
quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
quickcheck_shift = qc_order.index(s[1])*2
quickcheck <<= quickcheck_shift
if '..' not in s[0]:
first = last = int(s[0], 16)
else:
first, last = [int(c, 16) for c in s[0].split('..')]
for char in range(first, last+1):
assert not (quickchecks[char]>>quickcheck_shift)&3
quickchecks[char] |= quickcheck
for i in range(0, 0x110000):
if table[i] is not None:
table[i].append(quickchecks[i])
def uselatin1(self): def uselatin1(self):
# restrict character range to ISO Latin 1 # restrict character range to ISO Latin 1
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment