Commit cfcea492 authored by Fredrik Lundh's avatar Fredrik Lundh

unicode database compression, step 2:

- fixed attributions
- moved decomposition data to a separate table, in preparation
  for step 3 (which won't happen before 2.0 final, promise!)
- use relative paths in the generator script

I have a lot more stuff in the works for 2.1, but let's leave
that for another day...
parent 21013488
...@@ -4,17 +4,16 @@ ...@@ -4,17 +4,16 @@
Data was extracted from the Unicode 3.0 UnicodeData.txt file. Data was extracted from the Unicode 3.0 UnicodeData.txt file.
Written by Marc-Andre Lemburg (mal@lemburg.com). Written by Marc-Andre Lemburg (mal@lemburg.com).
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Copyright (c) Corporation for National Research Initiatives. Copyright (c) Corporation for National Research Initiatives.
------------------------------------------------------------------------ */ ------------------------------------------------------------------------ */
#include "Python.h" #include "Python.h"
#include "unicodedatabase.h" #include "unicodedatabase.h"
#define unicode_db _PyUnicode_Database_GetRecord
/* --- Module API --------------------------------------------------------- */ /* --- Module API --------------------------------------------------------- */
static PyObject * static PyObject *
...@@ -134,15 +133,9 @@ unicodedata_category(PyObject *self, ...@@ -134,15 +133,9 @@ unicodedata_category(PyObject *self,
"need a single Unicode character as parameter"); "need a single Unicode character as parameter");
goto onError; goto onError;
} }
index = (int)unicode_db((int)*PyUnicode_AS_UNICODE(v))->category; index = (int) _PyUnicode_Database_GetRecord(
if (index < 0 || (int) *PyUnicode_AS_UNICODE(v)
index > sizeof(_PyUnicode_CategoryNames) / )->category;
sizeof(_PyUnicode_CategoryNames[0])) {
PyErr_Format(PyExc_SystemError,
"category index out of range: %i",
index);
goto onError;
}
return PyString_FromString(_PyUnicode_CategoryNames[index]); return PyString_FromString(_PyUnicode_CategoryNames[index]);
onError: onError:
...@@ -164,15 +157,9 @@ unicodedata_bidirectional(PyObject *self, ...@@ -164,15 +157,9 @@ unicodedata_bidirectional(PyObject *self,
"need a single Unicode character as parameter"); "need a single Unicode character as parameter");
goto onError; goto onError;
} }
index = (int)unicode_db((int)*PyUnicode_AS_UNICODE(v))->bidirectional; index = (int) _PyUnicode_Database_GetRecord(
if (index < 0 || (int) *PyUnicode_AS_UNICODE(v)
index > sizeof(_PyUnicode_CategoryNames) / )->bidirectional;
sizeof(_PyUnicode_CategoryNames[0])) {
PyErr_Format(PyExc_SystemError,
"bidirectional index out of range: %i",
index);
goto onError;
}
return PyString_FromString(_PyUnicode_BidirectionalNames[index]); return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
onError: onError:
...@@ -194,7 +181,9 @@ unicodedata_combining(PyObject *self, ...@@ -194,7 +181,9 @@ unicodedata_combining(PyObject *self,
"need a single Unicode character as parameter"); "need a single Unicode character as parameter");
goto onError; goto onError;
} }
value = (int)unicode_db((int)*PyUnicode_AS_UNICODE(v))->combining; value = (int) _PyUnicode_Database_GetRecord(
(int) *PyUnicode_AS_UNICODE(v)
)->combining;
return PyInt_FromLong(value); return PyInt_FromLong(value);
onError: onError:
...@@ -216,7 +205,9 @@ unicodedata_mirrored(PyObject *self, ...@@ -216,7 +205,9 @@ unicodedata_mirrored(PyObject *self,
"need a single Unicode character as parameter"); "need a single Unicode character as parameter");
goto onError; goto onError;
} }
value = (int)unicode_db((int)*PyUnicode_AS_UNICODE(v))->mirrored; value = (int) _PyUnicode_Database_GetRecord(
(int) *PyUnicode_AS_UNICODE(v)
)->mirrored;
return PyInt_FromLong(value); return PyInt_FromLong(value);
onError: onError:
...@@ -238,10 +229,9 @@ unicodedata_decomposition(PyObject *self, ...@@ -238,10 +229,9 @@ unicodedata_decomposition(PyObject *self,
"need a single Unicode character as parameter"); "need a single Unicode character as parameter");
goto onError; goto onError;
} }
value = unicode_db((int)*PyUnicode_AS_UNICODE(v))->decomposition; value = _PyUnicode_Database_GetDecomposition(
if (value == NULL) (int) *PyUnicode_AS_UNICODE(v)
return PyString_FromString(""); );
else
return PyString_FromString(value); return PyString_FromString(value);
onError: onError:
......
This diff is collapsed.
...@@ -4,9 +4,10 @@ ...@@ -4,9 +4,10 @@
Data was extracted from the Unicode 3.0 UnicodeData.txt file. Data was extracted from the Unicode 3.0 UnicodeData.txt file.
Written by Marc-Andre Lemburg (mal@lemburg.com). Written by Marc-Andre Lemburg (mal@lemburg.com).
Rewritten for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Copyright (c) Corporation for National Research Initiatives. Copyright (c) Corporation for National Research Initiatives.
------------------------------------------------------------------------ */ ------------------------------------------------------------------------ */
...@@ -29,3 +30,18 @@ _PyUnicode_Database_GetRecord(int code) ...@@ -29,3 +30,18 @@ _PyUnicode_Database_GetRecord(int code)
} }
return &_PyUnicode_Database_Records[index]; return &_PyUnicode_Database_Records[index];
} }
/* Return the decomposition string for the given code point.
   Performs a two-level (index1/index2) compressed-table lookup;
   out-of-range code points map to slot 0, which holds the empty
   string, so the result is never NULL. */
const char *
_PyUnicode_Database_GetDecomposition(int code)
{
    int i = 0; /* slot 0: empty decomposition */

    if (code >= 0 && code < 65536) {
        i = decomp_index1[code >> DECOMP_SHIFT];
        i = decomp_index2[(i << DECOMP_SHIFT) +
                          (code & ((1 << DECOMP_SHIFT) - 1))];
    }
    return decomp_data[i];
}
...@@ -4,9 +4,10 @@ ...@@ -4,9 +4,10 @@
Data was extracted from the Unicode 3.0 UnicodeData.txt file. Data was extracted from the Unicode 3.0 UnicodeData.txt file.
Written by Marc-Andre Lemburg (mal@lemburg.com). Written by Marc-Andre Lemburg (mal@lemburg.com).
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Copyright (c) Corporation for National Research Initiatives. Copyright (c) Corporation for National Research Initiatives.
------------------------------------------------------------------------ */ ------------------------------------------------------------------------ */
...@@ -19,15 +20,14 @@ typedef struct { ...@@ -19,15 +20,14 @@ typedef struct {
const unsigned char bidirectional; /* index into const unsigned char bidirectional; /* index into
_PyUnicode_BidirectionalNames */ _PyUnicode_BidirectionalNames */
const unsigned char mirrored; /* true if mirrored in bidir mode */ const unsigned char mirrored; /* true if mirrored in bidir mode */
const char *decomposition; /* pointer to the decomposition
string or NULL */
} _PyUnicode_DatabaseRecord; } _PyUnicode_DatabaseRecord;
/* --- Unicode category names --------------------------------------------- */ /* --- Unicode category names --------------------------------------------- */
extern const char *_PyUnicode_CategoryNames[32]; extern const char *_PyUnicode_CategoryNames[];
extern const char *_PyUnicode_BidirectionalNames[21]; extern const char *_PyUnicode_BidirectionalNames[];
/* --- Unicode Database --------------------------------------------------- */ /* --- Unicode Database --------------------------------------------------- */
extern const _PyUnicode_DatabaseRecord *_PyUnicode_Database_GetRecord(int ch); extern const _PyUnicode_DatabaseRecord *_PyUnicode_Database_GetRecord(int ch);
extern const char *_PyUnicode_Database_GetDecomposition(int ch);
# #
# makeunidb.py -- generate a compact version of the unicode property # generate a compact version of the unicode property database
# database (unicodedatabase.h) #
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
#
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
# #
import sys import sys
SCRIPT = sys.argv[0] SCRIPT = sys.argv[0]
VERSION = "1.0" VERSION = "1.1"
UNICODE_DATA = "c:/pythonware/modules/unidb/etc/UnicodeData-Latest.txt" UNICODE_DATA = "../UnicodeData-Latest.txt"
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd", CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm", "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
...@@ -24,13 +29,12 @@ def maketable(): ...@@ -24,13 +29,12 @@ def maketable():
unicode = UnicodeData(UNICODE_DATA) unicode = UnicodeData(UNICODE_DATA)
# extract unicode properties # extract unicode properties
dummy = (0, 0, 0, 0, "NULL") dummy = (0, 0, 0, 0)
table = [dummy] table = [dummy]
cache = {0: dummy} cache = {0: dummy}
index = [0] * len(unicode.chars) index = [0] * len(unicode.chars)
DECOMPOSITION = [""] # 1) database properties
for char in unicode.chars: for char in unicode.chars:
record = unicode.table[char] record = unicode.table[char]
if record: if record:
...@@ -39,12 +43,8 @@ def maketable(): ...@@ -39,12 +43,8 @@ def maketable():
combining = int(record[3]) combining = int(record[3])
bidirectional = BIDIRECTIONAL_NAMES.index(record[4]) bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
mirrored = record[9] == "Y" mirrored = record[9] == "Y"
if record[5]:
decomposition = '"%s"' % record[5]
else:
decomposition = "NULL"
item = ( item = (
category, combining, bidirectional, mirrored, decomposition category, combining, bidirectional, mirrored
) )
# add entry to index and item tables # add entry to index and item tables
i = cache.get(item) i = cache.get(item)
...@@ -53,8 +53,26 @@ def maketable(): ...@@ -53,8 +53,26 @@ def maketable():
table.append(item) table.append(item)
index[char] = i index[char] = i
# FIXME: we really should compress the decomposition stuff # 2) decomposition data
# (see the unidb utilities for one way to do this)
# FIXME: <fl> using the encoding stuff from unidb would save
# another 50k or so, but I'll leave that for 2.1...
decomp_data = [""]
decomp_index = [0] * len(unicode.chars)
for char in unicode.chars:
record = unicode.table[char]
if record:
if record[5]:
try:
i = decomp_data.index(record[5])
except ValueError:
i = len(decomp_data)
decomp_data.append(record[5])
else:
i = 0
decomp_index[char] = i
FILE = "unicodedata_db.h" FILE = "unicodedata_db.h"
...@@ -65,7 +83,7 @@ def maketable(): ...@@ -65,7 +83,7 @@ def maketable():
print "/* a list of unique database records */" print "/* a list of unique database records */"
print "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {" print "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
for item in table: for item in table:
print " {%d, %d, %d, %d, %s}," % item print " {%d, %d, %d, %d}," % item
print "};" print "};"
print print
...@@ -82,6 +100,12 @@ def maketable(): ...@@ -82,6 +100,12 @@ def maketable():
print " NULL" print " NULL"
print "};" print "};"
print "static const char *decomp_data[] = {"
for name in decomp_data:
print " \"%s\"," % name
print " NULL"
print "};"
# split index table # split index table
index1, index2, shift = splitbins(index) index1, index2, shift = splitbins(index)
...@@ -90,6 +114,14 @@ def maketable(): ...@@ -90,6 +114,14 @@ def maketable():
Array("index1", index1).dump(sys.stdout) Array("index1", index1).dump(sys.stdout)
Array("index2", index2).dump(sys.stdout) Array("index2", index2).dump(sys.stdout)
# split index table
index1, index2, shift = splitbins(decomp_index)
print "/* same, for the decomposition data */"
print "#define DECOMP_SHIFT", shift
Array("decomp_index1", index1).dump(sys.stdout)
Array("decomp_index2", index2).dump(sys.stdout)
sys.stdout = sys.__stdout__ sys.stdout = sys.__stdout__
# -------------------------------------------------------------------- # --------------------------------------------------------------------
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment