Commit f504c248
authored Nov 23, 2002 by Martin v. Löwis
Patch #626485: Support Unicode normalization.
parent 868799f0

Showing 6 changed files with 1053 additions and 23 deletions
Doc/lib/libunicodedata.tex          +37   -3
Lib/test/test_normalization.py      +68   -0
Misc/NEWS                            +2   -2
Modules/unicodedata.c              +279  -15
Modules/unicodedata_db.h           +577   -0
Tools/unicode/makeunicodedata.py    +90   -3
Doc/lib/libunicodedata.tex
@@ -5,7 +5,7 @@
 \modulesynopsis{Access the Unicode Database.}
 \moduleauthor{Marc-Andre Lemburg}{mal@lemburg.com}
 \sectionauthor{Marc-Andre Lemburg}{mal@lemburg.com}
-
+\sectionauthor{Martin v. L\"owis}{martin@v.loewis.de}
 \index{Unicode}
 \index{character}

@@ -14,10 +14,10 @@
 This module provides access to the Unicode Character Database which
 defines character properties for all Unicode characters. The data in
 this database is based on the \file{UnicodeData.txt} file version
-3.0.0 which is publically available from \url{ftp://ftp.unicode.org/}.
+3.2.0 which is publicly available from \url{ftp://ftp.unicode.org/}.
 The module uses the same names and symbols as defined by the
-UnicodeData File Format 3.0.0 (see
+UnicodeData File Format 3.2.0 (see
 \url{http://www.unicode.org/Public/UNIDATA/UnicodeData.html}).  It
 defines the following functions:
@@ -83,3 +83,37 @@ defines the following functions:
 character \var{unichr} as string. An empty string is returned in case
 no such mapping is defined.
 \end{funcdesc}
+
+\begin{funcdesc}{normalize}{form, unistr}
+
+Return the normal form \var{form} for the Unicode string \var{unistr}.
+Valid values for \var{form} are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
+
+The Unicode standard defines various normalization forms of a Unicode
+string, based on the definition of canonical equivalence and
+compatibility equivalence. In Unicode, several characters can be
+expressed in various ways. For example, the character U+00C7 (LATIN
+CAPITAL LETTER C WITH CEDILLA) can also be expressed as the sequence
+U+0043 (LATIN CAPITAL LETTER C) U+0327 (COMBINING CEDILLA).
+
+For each character, there are two normal forms: normal form C and
+normal form D. Normal form D (NFD) is also known as canonical
+decomposition, and translates each character into its decomposed form.
+Normal form C (NFC) first applies a canonical decomposition, then
+composes pre-combined characters again.
+
+In addition to these two forms, there are two additional normal forms
+based on compatibility equivalence. In Unicode, certain characters are
+supported which normally would be unified with other characters. For
+example, U+2160 (ROMAN NUMERAL ONE) is really the same thing as U+0049
+(LATIN CAPITAL LETTER I). However, it is supported in Unicode for
+compatibility with existing character sets (e.g. gb2312).
+
+The normal form KD (NFKD) will apply the compatibility decomposition,
+i.e. replace all compatibility characters with their equivalents. The
+normal form KC (NFKC) first applies the compatibility decomposition,
+followed by the canonical composition.
+
+\versionadded{2.3}
+\end{funcdesc}
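
A quick illustration of the new function (editor's sketch, not part of the patch; it assumes an interpreter built with this change):

    # Illustrative only -- not part of the patch.
    import unicodedata

    c_cedilla  = u"\u00C7"        # LATIN CAPITAL LETTER C WITH CEDILLA
    decomposed = u"\u0043\u0327"  # LATIN CAPITAL LETTER C + COMBINING CEDILLA

    # NFD splits the precomposed character; NFC recombines it.
    assert unicodedata.normalize("NFD", c_cedilla) == decomposed
    assert unicodedata.normalize("NFC", decomposed) == c_cedilla

    # NFKC additionally folds compatibility characters: ROMAN NUMERAL ONE
    # becomes a plain LATIN CAPITAL LETTER I.
    assert unicodedata.normalize("NFKC", u"\u2160") == u"\u0049"

    # Any other form name raises ValueError.
    try:
        unicodedata.normalize("NFX", c_cedilla)
    except ValueError:
        pass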
Lib/test/test_normalization.py (new file, mode 100644)
from test.test_support import verbose, TestFailed, TestSkipped, verify
import sys
from unicodedata import normalize

try:
    data = open("NormalizationTest.txt", "r").readlines()
except IOError:
    raise TestSkipped("NormalizationTest.txt not found, download from http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt")

class RangeError:
    pass

def NFC(str):
    return normalize("NFC", str)

def NFKC(str):
    return normalize("NFKC", str)

def NFD(str):
    return normalize("NFD", str)

def NFKD(str):
    return normalize("NFKD", str)

def unistr(data):
    data = [int(x, 16) for x in data.split(" ")]
    for x in data:
        if x > sys.maxunicode:
            raise RangeError
    return u"".join([unichr(x) for x in data])

part1_data = {}
for line in data:
    if '#' in line:
        line = line.split('#')[0]
    line = line.strip()
    if not line:
        continue
    if line.startswith("@Part"):
        part = line
        continue
    try:
        c1, c2, c3, c4, c5 = [unistr(x) for x in line.split(';')[:-1]]
    except RangeError:
        # Skip unsupported characters
        continue

    if verbose:
        print line

    # Perform tests
    verify(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
    verify(c4 == NFC(c4) == NFC(c5), line)
    verify(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
    verify(c5 == NFD(c4) == NFD(c5), line)
    verify(c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5), line)
    verify(c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5), line)

    # Record part 1 data
    if part == "@Part1":
        part1_data[c1] = 1

# Perform tests for all other data
for c in range(sys.maxunicode + 1):
    X = unichr(c)
    if X in part1_data:
        continue
    assert X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c
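
Editor's note: each data line of NormalizationTest.txt holds five semicolon-terminated columns c1-c5, each a space-separated run of hex code points; UAX #15 requires, for instance, that c2 be the NFC form of c1, c2 and c3, which is exactly what the verify() calls above check. A hand-made line (not copied from the real file) walked through the helpers above:

    # Illustrative sketch, reusing unistr/NFC/NFD/verify from the test.
    sample = "00C7;00C7;0043 0327;00C7;0043 0327; # hand-made example line"
    line = sample.split('#')[0].strip()
    c1, c2, c3, c4, c5 = [unistr(x) for x in line.split(';')[:-1]]
    verify(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
    verify(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)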
Misc/NEWS
@@ -317,8 +317,8 @@ Extension modules
   available in source code, but not built automatically anymore, and
   is now named bsddb185.

-- unicodedata was updated to Unicode 3.2. In now also supports names
-  for Hangul syllables and CJK unified ideographs.
+- unicodedata was updated to Unicode 3.2. It supports normalization
+  and names for Hangul syllables and CJK unified ideographs.

 - resource.getrlimit() now returns longs instead of ints.
Modules/unicodedata.c
@@ -30,13 +30,9 @@ typedef struct {
 #include "unicodedata_db.h"

 static const _PyUnicode_DatabaseRecord*
-_getrecord(PyUnicodeObject* v)
+_getrecord_ex(Py_UCS4 code)
 {
-    int code;
     int index;

-    code = (int) *PyUnicode_AS_UNICODE(v);
     if (code < 0 || code >= 0x110000)
         index = 0;
     else {

@@ -47,6 +43,12 @@ _getrecord(PyUnicodeObject* v)
     return &_PyUnicode_Database_Records[index];
 }

+static const _PyUnicode_DatabaseRecord*
+_getrecord(PyUnicodeObject* v)
+{
+    return _getrecord_ex(*PyUnicode_AS_UNICODE(v));
+}
+
 /* --- Module API --------------------------------------------------------- */

 static PyObject *
@@ -253,6 +255,276 @@ unicodedata_decomposition(PyObject *self, PyObject *args)
     return PyString_FromString(decomp);
 }

+void
+get_decomp_record(Py_UCS4 code, int *index, int *prefix, int *count)
+{
+    if (code < 0 || code >= 0x110000) {
+        *index = 0;
+    }
+    else {
+        *index = decomp_index1[(code>>DECOMP_SHIFT)];
+        *index = decomp_index2[(*index<<DECOMP_SHIFT)+
+                               (code&((1<<DECOMP_SHIFT)-1))];
+    }
+
+    /* high byte is number of hex bytes (usually one or two), low byte
+       is prefix code (from*/
+    *count = decomp_data[*index] >> 8;
+    *prefix = decomp_data[*index] & 255;
+
+    (*index)++;
+}
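
Editor's note: the decomp_data layout consumed here packs a 16-bit header word (length in the high byte, decomposition-type prefix in the low byte) ahead of the code points themselves; a hypothetical Python rendering, for illustration only:

    # Hypothetical Python equivalent of the layout read by get_decomp_record().
    def get_decomp_record_py(decomp_data, index):
        header = decomp_data[index]
        count  = header >> 8        # number of code points that follow
        prefix = header & 255       # index into the decomposition-type prefixes
        return prefix, decomp_data[index + 1 : index + 1 + count]

    # An entry [0x0200, 0x0043, 0x0327] encodes a canonical (prefix 0)
    # two-character decomposition:
    print get_decomp_record_py([0x0200, 0x0043, 0x0327], 0)  # (0, [67, 807]), i.e. <0043 0327>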
+
+#define SBase   0xAC00
+#define LBase   0x1100
+#define VBase   0x1161
+#define TBase   0x11A7
+#define LCount  19
+#define VCount  21
+#define TCount  28
+#define NCount  (VCount*TCount)
+#define SCount  (LCount*NCount)
+
+static PyObject*
+nfd_nfkd(PyObject *input, int k)
+{
+    PyObject *result;
+    Py_UNICODE *i, *end, *o;
+    /* Longest decomposition in Unicode 3.2: U+FDFA */
+    Py_UNICODE stack[20];
+    int space, stackptr, isize;
+    int index, prefix, count;
+    unsigned char prev, cur;
+
+    stackptr = 0;
+    isize = PyUnicode_GET_SIZE(input);
+    /* Overallocate at most 10 characters. */
+    space = (isize > 10 ? 10 : isize) + isize;
+    result = PyUnicode_FromUnicode(NULL, space);
+    if (!result)
+        return NULL;
+    i = PyUnicode_AS_UNICODE(input);
+    end = i + isize;
+    o = PyUnicode_AS_UNICODE(result);
+
+    while (i < end) {
+        stack[stackptr++] = *i++;
+        while(stackptr) {
+            Py_UNICODE code = stack[--stackptr];
+            if (!space) {
+                space = PyUnicode_GET_SIZE(result) + 10;
+                if (PyUnicode_Resize(&result, space) == -1)
+                    return NULL;
+                o = PyUnicode_AS_UNICODE(result) + space - 10;
+                space = 10;
+            }
+            /* Hangul Decomposition. */
+            if (SBase <= code && code < (SBase+SCount)) {
+                int SIndex = code - SBase;
+                int L = LBase + SIndex / NCount;
+                int V = VBase + (SIndex % NCount) / TCount;
+                int T = TBase + SIndex % TCount;
+                *o++ = L;
+                *o++ = V;
+                space -= 2;
+                if (T != TBase) {
+                    *o++ = T;
+                    space--;
+                }
+                continue;
+            }
+            /* Other decompositions. */
+            get_decomp_record(code, &index, &prefix, &count);
+
+            /* Copy character if it is not decomposable, or has a
+               compatibility decomposition, but we do NFD. */
+            if (!count || (prefix && !k)) {
+                *o++ = code;
+                space--;
+                continue;
+            }
+            /* Copy decomposition onto the stack, in reverse
+               order. */
+            while(count) {
+                code = decomp_data[index + (--count)];
+                stack[stackptr++] = code;
+            }
+        }
+    }
+
+    /* Drop overallocation. Cannot fail. */
+    PyUnicode_Resize(&result, PyUnicode_GET_SIZE(result) - space);
+
+    /* Sort canonically. */
+    i = PyUnicode_AS_UNICODE(result);
+    prev = _getrecord_ex(*i)->combining;
+    end = i + PyUnicode_GET_SIZE(result);
+    for (i++; i < end; i++) {
+        cur = _getrecord_ex(*i)->combining;
+        if (prev == 0 || cur == 0 || prev <= cur) {
+            prev = cur;
+            continue;
+        }
+        /* Non-canonical order. Need to switch *i with previous. */
+        o = i - 1;
+        while (1) {
+            Py_UNICODE tmp = o[1];
+            o[1] = o[0];
+            o[0] = tmp;
+            o--;
+            if (o < PyUnicode_AS_UNICODE(result))
+                break;
+            prev = _getrecord_ex(*o)->combining;
+            if (prev == 0 || prev <= cur)
+                break;
+        }
+        prev = _getrecord_ex(*i)->combining;
+    }
+    return result;
+}
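
Editor's note: the Hangul branch above is purely arithmetic; no tables are consulted. The same decomposition in Python, for illustration only (constants as defined in the patch):

    # Illustrative sketch -- not part of the patch.
    SBase, LBase, VBase, TBase = 0xAC00, 0x1100, 0x1161, 0x11A7
    LCount, VCount, TCount = 19, 21, 28
    NCount = VCount * TCount      # 588
    SCount = LCount * NCount      # 11172 precomposed syllables

    def decompose_hangul(code):
        # Caller must ensure SBase <= code < SBase + SCount.
        SIndex = code - SBase
        L = LBase + SIndex / NCount              # leading consonant
        V = VBase + (SIndex % NCount) / TCount   # vowel
        T = TBase + SIndex % TCount              # trailing consonant, if any
        if T == TBase:
            return (L, V)
        return (L, V, T)

    # U+AC01 (HANGUL SYLLABLE GAG) decomposes into three jamo:
    assert decompose_hangul(0xAC01) == (0x1100, 0x1161, 0x11A8)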
+
+static int
+find_nfc_index(struct reindex* nfc, Py_UNICODE code)
+{
+    int index;
+    for (index = 0; nfc[index].start; index++) {
+        int start = nfc[index].start;
+        if (code < start)
+            return -1;
+        if (code <= start + nfc[index].count) {
+            int delta = code - start;
+            return nfc[index].index + delta;
+        }
+    }
+    return -1;
+}
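
Editor's note: nfc_first and nfc_last are generated (start, count, index) ranges (see the makeunicodedata.py changes below); an equivalent lookup in Python, for illustration only:

    # Illustrative Python equivalent of find_nfc_index(); nfc is a list of
    # (start, count, index) tuples like the generated nfc_first/nfc_last rows.
    def find_nfc_index_py(nfc, code):
        for start, count, index in nfc:
            if code < start:
                return -1          # ranges are sorted; code falls in a gap
            if code <= start + count:
                return index + (code - start)
        return -1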
+
+static PyObject*
+nfc_nfkc(PyObject *input, int k)
+{
+    PyObject *result;
+    Py_UNICODE *i, *i1, *o, *end;
+    int f, l, index, index1, comb;
+    Py_UNICODE code;
+    Py_UNICODE *skipped[20];
+    int cskipped = 0;
+
+    result = nfd_nfkd(input, k);
+    if (!result)
+        return NULL;
+
+    /* We are going to modify result in-place.
+       If nfd_nfkd is changed to sometimes return the input,
+       this code needs to be reviewed. */
+    assert(result != input);
+
+    i = PyUnicode_AS_UNICODE(result);
+    end = i + PyUnicode_GET_SIZE(result);
+    o = PyUnicode_AS_UNICODE(result);
+
+  again:
+    while (i < end) {
+      for (index = 0; index < cskipped; index++) {
+          if (skipped[index] == i) {
+              /* *i character is skipped.
+                 Remove from list. */
+              skipped[index] = skipped[cskipped-1];
+              cskipped--;
+              i++;
+              goto again; // continue while
+          }
+      }
+      /* Hangul Composition. We don't need to check for <LV,T>
+         pairs, since we always have decomposed data. */
+      if (LBase <= *i && *i < (LBase+LCount) &&
+          i + 1 < end &&
+          VBase <= i[1] && i[1] <= (VBase+VCount)) {
+          int LIndex, VIndex;
+          LIndex = i[0] - LBase;
+          VIndex = i[1] - VBase;
+          code = SBase + (LIndex*VCount+VIndex)*TCount;
+          i += 2;
+          if (i < end &&
+              TBase <= *i && *i <= (TBase+TCount)) {
+              code += *i - TBase;
+              i++;
+          }
+          *o++ = code;
+          continue;
+      }
+
+      f = find_nfc_index(nfc_first, *i);
+      if (f == -1) {
+          *o++ = *i++;
+          continue;
+      }
+      /* Find next unblocked character. */
+      i1 = i + 1;
+      comb = 0;
+      while (i1 < end) {
+          int comb1 = _getrecord_ex(*i1)->combining;
+          if (comb1 && comb == comb1) {
+              /* Character is blocked. */
+              i1++;
+              continue;
+          }
+          l = find_nfc_index(nfc_last, *i1);
+          /* *i1 cannot be combined with *i. If *i1
+             is a starter, we don't need to look further.
+             Otherwise, record the combining class. */
+          if (l == -1) {
+            not_combinable:
+              if (comb1 == 0)
+                  break;
+              comb = comb1;
+              i1++;
+              continue;
+          }
+          index = f*TOTAL_LAST + l;
+          index1 = comp_index[index >> COMP_SHIFT];
+          code = comp_data[(index1<<COMP_SHIFT)+
+                           (index&((1<<COMP_SHIFT)-1))];
+          if (code == 0)
+              goto not_combinable;
+
+          /* Replace the original character. */
+          *i = code;
+          /* Mark the second character unused. */
+          skipped[cskipped++] = i1;
+          i1++;
+          f = find_nfc_index(nfc_first, *i);
+          if (f == -1)
+              break;
+      }
+      *o++ = *i++;
+    }
+    if (o != end)
+        PyUnicode_Resize(&result, o - PyUnicode_AS_UNICODE(result));
+    return result;
+}
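
Editor's note: the net effect of the two passes, shown on the d-with-two-dots example from UAX #15 (illustrative only; assumes the module is built with this change):

    # Illustrative only -- not part of the patch.
    from unicodedata import normalize

    s = u"\u1E0B\u0323"   # d with dot above, followed by combining dot below
    # nfd_nfkd decomposes and canonically orders the marks
    # (dot below, class 220, sorts before dot above, class 230):
    assert normalize("NFD", s) == u"\u0064\u0323\u0307"
    # nfc_nfkc then recombines the starter with the first mark; the dot
    # above stays combining, as no <d, dot below, dot above> composite exists:
    assert normalize("NFC", s) == u"\u1E0D\u0307"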
+
+static PyObject*
+unicodedata_normalize(PyObject *self, PyObject *args)
+{
+    char *form;
+    PyObject *input;
+
+    if(!PyArg_ParseTuple(args, "sO!:normalize",
+                         &form, &PyUnicode_Type, &input))
+        return NULL;
+
+    if (strcmp(form, "NFC") == 0)
+        return nfc_nfkc(input, 0);
+    if (strcmp(form, "NFKC") == 0)
+        return nfc_nfkc(input, 1);
+    if (strcmp(form, "NFD") == 0)
+        return nfd_nfkd(input, 0);
+    if (strcmp(form, "NFKD") == 0)
+        return nfd_nfkd(input, 1);
+    PyErr_SetString(PyExc_ValueError, "invalid normalization form");
+    return NULL;
+}
+
 /* -------------------------------------------------------------------- */
 /* unicode character name tables */
@@ -277,16 +549,6 @@ _gethash(const char *s, int len, int scale)
     return h;
 }

-#define SBase   0xAC00
-#define LBase   0x1100
-#define VBase   0x1161
-#define TBase   0x11A7
-#define LCount  19
-#define VCount  21
-#define TCount  28
-#define NCount  (VCount*TCount)
-#define SCount  (LCount*NCount)
-
 static char *hangul_syllables[][3] = {
     {"G", "A", ""},
     {"GG", "AE", "G"},

@@ -594,6 +856,7 @@ static PyMethodDef unicodedata_functions[] = {
     {"decomposition", unicodedata_decomposition, METH_VARARGS},
     {"name", unicodedata_name, METH_VARARGS},
     {"lookup", unicodedata_lookup, METH_VARARGS},
+    {"normalize", unicodedata_normalize, METH_VARARGS},
     {NULL, NULL}        /* sentinel */
 };

@@ -618,5 +881,6 @@ initunicodedata(void)
 /*
 Local variables:
 c-basic-offset: 4
+indent-tabs-mode: nil
 End:
 */
Modules/unicodedata_db.h

(This diff is collapsed: +577 lines of data tables generated by Tools/unicode/makeunicodedata.py.)
Tools/unicode/makeunicodedata.py
@@ -13,6 +13,9 @@
 # 2000-11-03 fl   expand first/last ranges
 # 2001-01-19 fl   added character name tables (2.1)
 # 2001-01-21 fl   added decomp compression; dynamic phrasebook threshold
+# 2002-09-11 wd   use string methods
+# 2002-10-18 mvl  update to Unicode 3.2
+# 2002-10-22 mvl  generate NFC tables
 #
 # written by Fredrik Lundh (fredrik@pythonware.com)
 #
@@ -22,7 +25,8 @@ import sys
 SCRIPT = sys.argv[0]
 VERSION = "2.1"

-UNICODE_DATA = "UnicodeData-Latest.txt"
+UNICODE_DATA = "UnicodeData.txt"
+COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt"

 CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
     "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
@@ -47,7 +51,7 @@ def maketables(trace=0):
     print "--- Reading", UNICODE_DATA, "..."

-    unicode = UnicodeData(UNICODE_DATA)
+    unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS)

     print len(filter(None, unicode.table)), "characters"
@@ -96,6 +100,10 @@ def makeunicodedata(unicode, trace):
     decomp_index = [0] * len(unicode.chars)
     decomp_size = 0

+    comp_pairs = []
+    comp_first = [None] * len(unicode.chars)
+    comp_last = [None] * len(unicode.chars)
+
     for char in unicode.chars:
         record = unicode.table[char]
         if record:
@@ -116,6 +124,14 @@ def makeunicodedata(unicode, trace):
                 # content
                 decomp = [prefix + (len(decomp)<<8)] +\
                          map(lambda s: int(s, 16), decomp)
+                # Collect NFC pairs
+                if not prefix and len(decomp) == 3 and \
+                   char not in unicode.exclusions and \
+                   unicode.table[decomp[1]][3] == "0":
+                    p, l, r = decomp
+                    comp_first[l] = 1
+                    comp_last[r] = 1
+                    comp_pairs.append((l, r, char))
                 try:
                     i = decomp_data.index(decomp)
                 except ValueError:
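
Editor's note: the unicode.table[decomp[1]][3] == "0" guard reads field 3 of a UnicodeData.txt record, the canonical combining class, so a pair is collected only when the first decomposed character is a starter. A hand-abridged record for illustration:

    # Hand-abridged UnicodeData.txt record (illustrative; trailing fields omitted).
    record = "00C7;LATIN CAPITAL LETTER C WITH CEDILLA;Lu;0;L;0043 0327;;;;N".split(';')
    print record[3]   # "0"         -> combining class: a starter
    print record[5]   # "0043 0327" -> canonical decomposition (no <prefix> tag)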
@@ -126,10 +142,49 @@ def makeunicodedata(unicode, trace):
                 i = 0
             decomp_index[char] = i

+    f = l = 0
+    comp_first_ranges = []
+    comp_last_ranges = []
+    prev_f = prev_l = None
+    for i in unicode.chars:
+        if comp_first[i] is not None:
+            comp_first[i] = f
+            f += 1
+            if prev_f is None:
+                prev_f = (i, i)
+            elif prev_f[1]+1 == i:
+                prev_f = prev_f[0], i
+            else:
+                comp_first_ranges.append(prev_f)
+                prev_f = (i, i)
+        if comp_last[i] is not None:
+            comp_last[i] = l
+            l += 1
+            if prev_l is None:
+                prev_l = (i, i)
+            elif prev_l[1]+1 == i:
+                prev_l = prev_l[0], i
+            else:
+                comp_last_ranges.append(prev_l)
+                prev_l = (i, i)
+    comp_first_ranges.append(prev_f)
+    comp_last_ranges.append(prev_l)
+    total_first = f
+    total_last = l
+
+    comp_data = [0]*(total_first*total_last)
+    for f, l, char in comp_pairs:
+        f = comp_first[f]
+        l = comp_last[l]
+        comp_data[f*total_last+l] = char
+
     print len(table), "unique properties"
     print len(decomp_prefix), "unique decomposition prefixes"
     print len(decomp_data), "unique decomposition entries:",
     print decomp_size, "bytes"
+    print total_first, "first characters in NFC"
+    print total_last, "last characters in NFC"
+    print len(comp_pairs), "NFC pairs"

     print "--- Writing", FILE, "..."
@@ -144,6 +199,21 @@ def makeunicodedata(unicode, trace):
     print >>fp, "};"
     print >>fp

+    print >>fp, "/* Reindexing of NFC first characters. */"
+    print >>fp, "#define TOTAL_FIRST", total_first
+    print >>fp, "#define TOTAL_LAST", total_last
+    print >>fp, "struct reindex{int start;short count,index;};"
+    print >>fp, "struct reindex nfc_first[] = {"
+    for start, end in comp_first_ranges:
+        print >>fp, "  { %d, %d, %d}," % (start, end-start, comp_first[start])
+    print >>fp, "  {0,0,0}"
+    print >>fp, "};\n"
+    print >>fp, "struct reindex nfc_last[] = {"
+    for start, end in comp_last_ranges:
+        print >>fp, "  { %d, %d, %d}," % (start, end-start, comp_last[start])
+    print >>fp, "  {0,0,0}"
+    print >>fp, "};\n"
+
     # FIXME: <fl> the following tables could be made static, and
     #        the support code moved into unicodedatabase.c
@@ -185,6 +255,12 @@ def makeunicodedata(unicode, trace):
     Array("decomp_index1", index1).dump(fp, trace)
     Array("decomp_index2", index2).dump(fp, trace)

+    index, index2, shift = splitbins(comp_data, trace)
+    print >>fp, "/* NFC pairs */"
+    print >>fp, "#define COMP_SHIFT", shift
+    Array("comp_index", index).dump(fp, trace)
+    Array("comp_data", index2).dump(fp, trace)
+
     fp.close()

 # --------------------------------------------------------------------
@@ -454,7 +530,7 @@ import sys
 class UnicodeData:

-    def __init__(self, filename, expand=1):
+    def __init__(self, filename, exclusions, expand=1):
         file = open(filename)
         table = [None] * 0x110000
         while 1:
@@ -486,6 +562,17 @@ class UnicodeData:
         self.table = table
         self.chars = range(0x110000) # unicode 3.2

+        file = open(exclusions)
+        self.exclusions = {}
+        for s in file:
+            s = s.strip()
+            if not s:
+                continue
+            if s[0] == '#':
+                continue
+            char = int(s.split()[0], 16)
+            self.exclusions[char] = 1
+
     def uselatin1(self):
         # restrict character range to ISO Latin 1
         self.chars = range(256)