Commit 2f094139 authored by Greg Price's avatar Greg Price Committed by Benjamin Peterson

closes bpo-37966: Fully implement the UAX #15 quick-check algorithm. (GH-15558)

The purpose of the `unicodedata.is_normalized` function is to answer
the question `str == unicodedata.normalized(form, str)` more
efficiently than writing just that, by using the "quick check"
optimization described in the Unicode standard in UAX #15.

However, it turns out the code doesn't implement the full algorithm
from the standard, and as a result we often miss the optimization and
end up having to compute the whole normalized string after all.

Implement the standard's algorithm.  This greatly speeds up
`unicodedata.is_normalized` in many cases where our partial variant
of quick-check had been returning MAYBE and the standard algorithm
returns NO.

At a quick test on my desktop, the existing code takes about 4.4 ms/MB
(so 4.4 ns per byte) when the partial quick-check returns MAYBE and it
has to do the slow normalize-and-compare:

  $ build.base/python -m timeit -s 'import unicodedata; s = "\uf900"*500000' \
      -- 'unicodedata.is_normalized("NFD", s)'
  50 loops, best of 5: 4.39 msec per loop

With this patch, it gets the answer instantly (58 ns) on the same 1 MB
string:

  $ build.dev/python -m timeit -s 'import unicodedata; s = "\uf900"*500000' \
      -- 'unicodedata.is_normalized("NFD", s)'
  5000000 loops, best of 5: 58.2 nsec per loop

This restores a small optimization that the original version of this
code had for the `unicodedata.normalize` use case.

With this, that case is actually faster than in master!

$ build.base/python -m timeit -s 'import unicodedata; s = "\u0338"*500000' \
    -- 'unicodedata.normalize("NFD", s)'
500 loops, best of 5: 561 usec per loop

$ build.dev/python -m timeit -s 'import unicodedata; s = "\u0338"*500000' \
    -- 'unicodedata.normalize("NFD", s)'
500 loops, best of 5: 512 usec per loop
parent 580bdb0e
...@@ -1090,8 +1090,9 @@ unicodedata ...@@ -1090,8 +1090,9 @@ unicodedata
<http://blog.unicode.org/2019/05/unicode-12-1-en.html>`_ release. <http://blog.unicode.org/2019/05/unicode-12-1-en.html>`_ release.
* New function :func:`~unicodedata.is_normalized` can be used to verify a string * New function :func:`~unicodedata.is_normalized` can be used to verify a string
is in a specific normal form. (Contributed by Max Belanger and David Euresti in is in a specific normal form, often much faster than by actually normalizing
:issue:`32285`). the string. (Contributed by Max Belanger, David Euresti, and Greg Price in
:issue:`32285` and :issue:`37966`).
unittest unittest
......
...@@ -208,6 +208,8 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest): ...@@ -208,6 +208,8 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b) self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b) self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)
# For tests of unicodedata.is_normalized / self.db.is_normalized ,
# see test_normalization.py .
def test_east_asian_width(self): def test_east_asian_width(self):
eaw = self.db.east_asian_width eaw = self.db.east_asian_width
......
The implementation of :func:`~unicodedata.is_normalized` has been greatly
sped up on strings that aren't normalized, by implementing the full
normalization-quick-check algorithm from the Unicode standard.
...@@ -19,6 +19,8 @@ ...@@ -19,6 +19,8 @@
#include "ucnhash.h" #include "ucnhash.h"
#include "structmember.h" #include "structmember.h"
#include <stdbool.h>
_Py_IDENTIFIER(NFC); _Py_IDENTIFIER(NFC);
_Py_IDENTIFIER(NFD); _Py_IDENTIFIER(NFD);
_Py_IDENTIFIER(NFKC); _Py_IDENTIFIER(NFKC);
...@@ -775,25 +777,40 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) ...@@ -775,25 +777,40 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
return result; return result;
} }
typedef enum {YES, NO, MAYBE} NormalMode; // This needs to match the logic in makeunicodedata.py
// which constructs the quickcheck data.
/* Return YES if the input is certainly normalized, NO or MAYBE if it might not be. */ typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
static NormalMode
is_normalized(PyObject *self, PyObject *input, int nfc, int k) /* Run the Unicode normalization "quickcheck" algorithm.
*
* Return YES or NO if quickcheck determines the input is certainly
* normalized or certainly not, and MAYBE if quickcheck is unable to
* tell.
*
* If `yes_only` is true, then return MAYBE as soon as we determine
* the answer is not YES.
*
* For background and details on the algorithm, see UAX #15:
* https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
*/
static QuickcheckResult
is_normalized_quickcheck(PyObject *self, PyObject *input,
int nfc, int k, bool yes_only)
{ {
Py_ssize_t i, len;
int kind;
void *data;
unsigned char prev_combining = 0, quickcheck_mask;
/* An older version of the database is requested, quickchecks must be /* An older version of the database is requested, quickchecks must be
disabled. */ disabled. */
if (self && UCD_Check(self)) if (self && UCD_Check(self))
return NO; return NO;
/* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No, Py_ssize_t i, len;
as described in http://unicode.org/reports/tr15/#Annex8. */ int kind;
quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0)); void *data;
unsigned char prev_combining = 0;
/* The two quickcheck bits at this shift have type QuickcheckResult. */
int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
QuickcheckResult result = YES; /* certainly normalized, unless we find something */
i = 0; i = 0;
kind = PyUnicode_KIND(input); kind = PyUnicode_KIND(input);
...@@ -802,16 +819,26 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k) ...@@ -802,16 +819,26 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
while (i < len) { while (i < len) {
Py_UCS4 ch = PyUnicode_READ(kind, data, i++); Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch); const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
unsigned char combining = record->combining;
unsigned char quickcheck = record->normalization_quick_check;
if (quickcheck & quickcheck_mask) unsigned char combining = record->combining;
return MAYBE; /* this string might need normalization */
if (combining && prev_combining > combining) if (combining && prev_combining > combining)
return NO; /* non-canonical sort order, not normalized */ return NO; /* non-canonical sort order, not normalized */
prev_combining = combining; prev_combining = combining;
unsigned char quickcheck_whole = record->normalization_quick_check;
if (yes_only) {
if (quickcheck_whole & (3 << quickcheck_shift))
return MAYBE;
} else {
switch ((quickcheck_whole >> quickcheck_shift) & 3) {
case NO:
return NO;
case MAYBE:
result = MAYBE; /* this string might need normalization */
}
}
} }
return YES; /* certainly normalized */ return result;
} }
/*[clinic input] /*[clinic input]
...@@ -844,7 +871,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, ...@@ -844,7 +871,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
PyObject *result; PyObject *result;
int nfc = 0; int nfc = 0;
int k = 0; int k = 0;
NormalMode m; QuickcheckResult m;
PyObject *cmp; PyObject *cmp;
int match = 0; int match = 0;
...@@ -867,7 +894,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, ...@@ -867,7 +894,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
return NULL; return NULL;
} }
m = is_normalized(self, input, nfc, k); m = is_normalized_quickcheck(self, input, nfc, k, false);
if (m == MAYBE) { if (m == MAYBE) {
cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k); cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
...@@ -913,28 +940,28 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form, ...@@ -913,28 +940,28 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
} }
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) { if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
if (is_normalized(self, input, 1, 0) == YES) { if (is_normalized_quickcheck(self, input, 1, 0, true) == YES) {
Py_INCREF(input); Py_INCREF(input);
return input; return input;
} }
return nfc_nfkc(self, input, 0); return nfc_nfkc(self, input, 0);
} }
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) { if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
if (is_normalized(self, input, 1, 1) == YES) { if (is_normalized_quickcheck(self, input, 1, 1, true) == YES) {
Py_INCREF(input); Py_INCREF(input);
return input; return input;
} }
return nfc_nfkc(self, input, 1); return nfc_nfkc(self, input, 1);
} }
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) { if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
if (is_normalized(self, input, 0, 0) == YES) { if (is_normalized_quickcheck(self, input, 0, 0, true) == YES) {
Py_INCREF(input); Py_INCREF(input);
return input; return input;
} }
return nfd_nfkd(self, input, 0); return nfd_nfkd(self, input, 0);
} }
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) { if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
if (is_normalized(self, input, 0, 1) == YES) { if (is_normalized_quickcheck(self, input, 0, 1, true) == YES) {
Py_INCREF(input); Py_INCREF(input);
return input; return input;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment