Commit 938ac765, authored Jan 21, 2001 by Fredrik Lundh

    forgot to check in the new makeunicodedata.py script

Parent: d3099536

Showing 5 changed files with 8294 additions and 8383 deletions:
    Modules/ucnhash.c                 (+8, -10)
    Modules/unicodedata_db.h          (+1, -1)
    Modules/unicodename_db.h          (+8013, -8354)
    Objects/unicodetype_db.h          (+1, -1)
    Tools/unicode/makeunicodedata.py  (+271, -17)
Modules/ucnhash.c
@@ -11,16 +11,13 @@
 /* database code (cut and pasted from the unidb package) */

 static unsigned long
-gethash(const char* s, int len)
+gethash(const char* s, int len, int scale)
 {
     int i;
     unsigned long h = 0;
     unsigned long ix;
     for (i = 0; i < len; i++) {
-        /* magic value 47 was chosen to minimize the number
-           of collisions for the uninames dataset.  see the
-           makeunicodedata script for more background */
-        h = (h * 47) + (unsigned char) toupper(s[i]);
+        h = (h * scale) + (unsigned char) toupper(s[i]);
         ix = h & 0xff000000;
         if (ix)
             h = (h ^ ((ix >> 24) & 0xff)) & 0x00ffffff;
@@ -40,8 +37,9 @@ getname(Py_UCS4 code, char* buffer, int buflen)
         return 0;

     /* get offset into phrasebook */
-    offset = phrasebook_offset1[(code>>SHIFT)];
-    offset = phrasebook_offset2[(offset<<SHIFT)+(code&((1<<SHIFT)-1))];
+    offset = phrasebook_offset1[(code>>phrasebook_shift)];
+    offset = phrasebook_offset2[(offset<<phrasebook_shift)+
+             (code&((1<<phrasebook_shift)-1))];
     if (!offset)
         return 0;
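The phrasebook_offset1/phrasebook_offset2 pair used above is a two-level index table produced by the script's splitbins() helper; phrasebook_shift is now emitted by the generator instead of being hard-coded as SHIFT. A minimal Python sketch of how such a split table can be built and queried (toy data and a simplified construction, an illustration of the idea rather than the actual splitbins() code):

def lookup(offset1, offset2, shift, code):
    # first-level table picks a block; second-level table holds the values
    block = offset1[code >> shift]
    return offset2[(block << shift) + (code & ((1 << shift) - 1))]

# toy example: split a flat table into blocks of 1<<shift entries,
# storing each distinct block only once
table = [0, 0, 0, 0, 5, 6, 7, 8, 0, 0, 0, 0]
shift = 2
blocks = [tuple(table[i:i + (1 << shift)])
          for i in range(0, len(table), 1 << shift)]
unique = sorted(set(blocks), key=blocks.index)   # duplicate blocks kept once
offset1 = [unique.index(b) for b in blocks]
offset2 = [v for b in unique for v in b]
assert all(lookup(offset1, offset2, shift, c) == table[c]
           for c in range(len(table)))

Runs of identical blocks (for example, long stretches of unassigned code points) collapse into a single second-level block, which is what keeps the generated tables compact.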
@@ -99,14 +97,14 @@ static int
 getcode(const char* name, int namelen, Py_UCS4* code)
 {
     unsigned int h, v;
-    unsigned int mask = CODE_SIZE-1;
+    unsigned int mask = code_size-1;
     unsigned int i, incr;

     /* the following is the same as python's dictionary lookup, with
        only minor changes.  see the makeunicodedata script for more
        details */

-    h = (unsigned int) gethash(name, namelen);
+    h = (unsigned int) gethash(name, namelen, code_magic);
     i = (~h) & mask;
     v = code_hash[i];
     if (!v)
@@ -129,7 +127,7 @@ getcode(const char* name, int namelen, Py_UCS4* code)
         }
         incr = incr << 1;
         if (incr > mask)
-            incr = incr ^ CODE_POLY;
+            incr = incr ^ code_poly;
     }
 }
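Taken together, these hunks replace the hard-coded CODE_SIZE/CODE_POLY constants with the generated code_size/code_magic/code_poly values, so the generator is free to pick the table size, hash multiplier, and rehash polynomial per table. A rough Python model of the lookup, for illustration only (not the actual CPython code; the matches() predicate stands in for the real code's check, elided from the diff, that a candidate code point actually carries the requested name):

def gethash(s, scale):
    h = 0
    for c in s.upper():
        h = (h * scale) + ord(c)
        ix = h & 0xff000000
        if ix:  # fold overflow above 24 bits back into the hash
            h = (h ^ ((ix >> 24) & 0xff)) & 0x00ffffff
    return h

def getcode(name, code_hash, code_magic, code_poly, matches):
    mask = len(code_hash) - 1            # code_size is a power of two
    h = gethash(name, code_magic)
    i = (~h) & mask
    v = code_hash[i]
    if not v:
        return None
    if matches(v, name):
        return v
    # collision: derive a probe increment from the hash, then walk the
    # table, mutating incr with the table's polynomial (same sequence
    # the generator uses when it builds the table)
    incr = (h ^ (h >> 3)) & mask
    if not incr:
        incr = mask
    while 1:
        i = (i + incr) & mask
        v = code_hash[i]
        if not v:
            return None
        if matches(v, name):
            return v
        incr = incr << 1
        if incr > mask:
            incr = incr ^ code_poly

Because insertion (the Hash class added to makeunicodedata.py below) and lookup follow the identical probe sequence, hitting an empty slot during lookup proves the name is absent.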
Modules/unicodedata_db.h

-/* this file was generated by tools\unicode\makeunicodedata.py 1.1 */
+/* this file was generated by tools\unicode\makeunicodedata.py 2.1 */

 /* a list of unique database records */
 const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {
Modules/unicodename_db.h

(source diff too large to display)
Objects/unicodetype_db.h

-/* this file was generated by tools\unicode\makeunicodedata.py 1.1 */
+/* this file was generated by tools\unicode\makeunicodedata.py 2.1 */

 /* a list of unique character type descriptors */
 const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {
Tools/unicode/makeunicodedata.py
@@ -2,14 +2,16 @@
 # (re)generate unicode property and type databases
 #
 # this script converts a unicode 3.0 database file to
-# Modules/unicodedata_db.h and Objects/unicodetype_db.h
+# Modules/unicodedata_db.h, Modules/unicodename_db.h,
+# and Objects/unicodetype_db.h
 #
 # history:
 # 2000-09-24 fl   created (based on bits and pieces from unidb)
 # 2000-09-25 fl   merged tim's splitbin fixes, separate decomposition table
 # 2000-09-25 fl   added character type table
-# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields
+# 2000-09-26 fl   added LINEBREAK, DECIMAL, and DIGIT flags/fields (2.0)
 # 2000-11-03 fl   expand first/last ranges
+# 2001-01-19 fl   added character name tables (2.1)
 #
 # written by Fredrik Lundh (fredrik@pythonware.com), September 2000
 #
@@ -17,7 +19,7 @@
 import sys

 SCRIPT = sys.argv[0]
-VERSION = "1.1"
+VERSION = "2.1"

 UNICODE_DATA = "UnicodeData-Latest.txt"
@@ -42,18 +44,32 @@ UPPER_MASK = 0x80

 def maketables(trace=0):

     print "--- Reading", UNICODE_DATA, "..."

     unicode = UnicodeData(UNICODE_DATA)

-    print "--- Processing", UNICODE_DATA, "..."
     print len(filter(None, unicode.table)), "characters"

     # extract unicode properties
+    makeunicodedata(unicode, trace)
+    makeunicodetype(unicode, trace)
+    makeunicodename(unicode, trace)
+
+# --------------------------------------------------------------------
+# unicode character properties
+
+def makeunicodedata(unicode, trace):
+
     dummy = (0, 0, 0, 0)
     table = [dummy]
     cache = {0: dummy}
     index = [0] * len(unicode.chars)

+    FILE = "Modules/unicodedata_db.h"
+
+    print "--- Preparing", FILE, "..."
+
     # 1) database properties
     for char in unicode.chars:
         record = unicode.table[char]
         if record:
@@ -93,13 +109,11 @@ def maketables(trace=0):
                 i = 0
             decomp_index[char] = i

-    FILE = "Modules/unicodedata_db.h"
-
-    print "--- Writing", FILE, "..."
-
     print len(table), "unique properties"
     print len(decomp_data), "unique decomposition entries"

+    print "--- Writing", FILE, "..."
+
     fp = open(FILE, "w")

     print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
     print >>fp
@@ -111,7 +125,7 @@ def maketables(trace=0):
     print >>fp, "};"
     print >>fp

-    # FIXME: the following tables should be made static, and
+    # FIXME: <fl> the following tables could be made static, and
     # the support code moved into unicodedatabase.c

     print >>fp, "/* string literals */"
@@ -149,8 +163,16 @@ def maketables(trace=0):
     Array("decomp_index1", index1).dump(fp)
     Array("decomp_index2", index2).dump(fp)

-    #
-    # 3) unicode type data
+    fp.close()
+
+# --------------------------------------------------------------------
+# unicode character type tables
+
+def makeunicodetype(unicode, trace):
+
+    FILE = "Objects/unicodetype_db.h"
+
+    print "--- Preparing", FILE, "..."

     # extract unicode types
     dummy = (0, 0, 0, 0, 0, 0)
@@ -209,14 +231,11 @@ def maketables(trace=0):
             table.append(item)
         index[char] = i

-    FILE = "Objects/unicodetype_db.h"
-
-    fp = open(FILE, "w")
-
-    print len(table), "unique character type entries"
-
     print "--- Writing", FILE, "..."

+    print len(table), "unique character type entries"
+
+    fp = open(FILE, "w")

     print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
     print >>fp
     print >>fp, "/* a list of unique character type descriptors */"
@@ -234,6 +253,155 @@ def maketables(trace=0):
     Array("index1", index1).dump(fp)
     Array("index2", index2).dump(fp)

     fp.close()

+# --------------------------------------------------------------------
+# unicode name database
+
+def makeunicodename(unicode, trace):
+
+    FILE = "Modules/unicodename_db.h"
+
+    print "--- Preparing", FILE, "..."
+
+    # collect names
+    names = [None] * len(unicode.chars)
+    for char in unicode.chars:
+        record = unicode.table[char]
+        if record:
+            name = record[1].strip()
+            if name and name[0] != "<":
+                names[char] = name + chr(0)
+
+    print len(filter(lambda n: n is not None, names)), "distinct names"
+
+    # collect unique words from names (note that we differ between
+    # words inside a sentence, and words ending a sentence.  the
+    # latter includes the trailing null byte.
+
+    words = {}
+    n = b = 0
+    for char in unicode.chars:
+        name = names[char]
+        if name:
+            w = name.split()
+            b = b + len(name)
+            n = n + len(w)
+            for w in w:
+                l = words.get(w)
+                if l:
+                    l.append(None)
+                else:
+                    words[w] = [len(words)]
+
+    print n, "words in text;", b, "bytes"
+
+    wordlist = words.items()
+
+    # sort on falling frequency
+    wordlist.sort(lambda a, b: len(b[1])-len(a[1]))
+
+    # statistics
+    n = 0
+    for i in range(128):
+        n = n + len(wordlist[i][1])
+    print n, "short words (7-bit indices)"
+
+    # pick the 128 most commonly used words, and sort the rest on
+    # falling length (to maximize overlap)
+
+    wordlist, wordtail = wordlist[:128], wordlist[128:]
+    wordtail.sort(lambda a, b: len(b[0])-len(a[0]))
+    wordlist.extend(wordtail)
+
+    # generate lexicon from words
+
+    lexicon_offset = [0]
+    lexicon = ""
+    words = {}
+
+    # build a lexicon string
+    offset = 0
+    for w, x in wordlist:
+        # encoding: bit 7 indicates last character in word (chr(128)
+        # indicates the last character in an entire string)
+        ww = w[:-1] + chr(ord(w[-1])+128)
+        # reuse string tails, when possible
+        o = string.find(lexicon, ww)
+        if o < 0:
+            o = offset
+            lexicon = lexicon + ww
+            offset = offset + len(w)
+        words[w] = len(lexicon_offset)
+        lexicon_offset.append(offset)
+
+    print len(words), "words in lexicon;", len(lexicon), "bytes"
+
+    assert len(words) < 32768 # 15-bit word indices
+
+    lexicon = map(ord, lexicon)
+
+    # generate phrasebook from names and lexicon
+    phrasebook = [0]
+    phrasebook_offset = [0] * len(unicode.chars)
+    for char in unicode.chars:
+        name = names[char]
+        if name:
+            w = name.split()
+            phrasebook_offset[char] = len(phrasebook)
+            for w in w:
+                i = words[w]
+                if i < 128:
+                    phrasebook.append(128+i)
+                else:
+                    phrasebook.append(i>>8)
+                    phrasebook.append(i&255)
+
+    #
+    # unicode name hash table
+
+    # extract names
+    data = []
+    for char in unicode.chars:
+        record = unicode.table[char]
+        if record:
+            name = record[1].strip()
+            if name and name[0] != "<":
+                data.append((name, char))
+
+    # the magic number 47 was chosen to minimize the number of
+    # collisions on the current data set.  if you like, change it
+    # and see what happens...
+
+    codehash = Hash("code", data, 47)
+
+    print "--- Writing", FILE, "..."
+
+    fp = open(FILE, "w")
+
+    print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
+    print >>fp
+    print >>fp, "#define NAME_MAXLEN", 256
+    print >>fp
+    print >>fp, "/* lexicon */"
+    Array("lexicon", lexicon).dump(fp)
+    Array("lexicon_offset", lexicon_offset).dump(fp)
+
+    # split decomposition index table
+    offset1, offset2, shift = splitbins(phrasebook_offset, trace)
+
+    print >>fp, "/* code->name phrasebook */"
+    print >>fp, "#define phrasebook_shift", shift
+
+    Array("phrasebook", phrasebook).dump(fp)
+    Array("phrasebook_offset1", offset1).dump(fp)
+    Array("phrasebook_offset2", offset2).dump(fp)
+
+    print >>fp, "/* name->code dictionary */"
+    codehash.dump(fp)
+
+    fp.close()

 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
 # Copyright (c) 1999-2000 by Secret Labs AB
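Reading makeunicodename() above together with getname() in ucnhash.c: a name is stored as a phrasebook sequence of word indices (one byte for the 128 most frequent words, two bytes for the 15-bit indices of the rest), and every lexicon word marks its last character by setting bit 7, with the name-final word carrying an extra null byte. A hedged Python sketch of the code->name decoding, for illustration only (simplified; the real C getname() also checks buffer bounds):

def getword(lexicon, lexicon_offset, i):
    # collect lexicon bytes until the end-of-word marker (bit 7)
    o = lexicon_offset[i]
    chars = []
    while 1:
        c = lexicon[o]
        chars.append(chr(c & 127))
        if c & 128:
            return "".join(chars)
        o = o + 1

def getname(phrasebook, lexicon, lexicon_offset, offset):
    words = []
    while 1:
        b = phrasebook[offset]
        offset = offset + 1
        if b & 128:                    # short form: one of the 128 top words
            i = b & 127
        else:                          # long form: 15-bit word index
            i = (b << 8) + phrasebook[offset]
            offset = offset + 1
        w = getword(lexicon, lexicon_offset, i)
        if w[-1] == chr(0):            # the name-final word ends in a null
            words.append(w[:-1])
            return " ".join(words)
        words.append(w)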
@@ -280,6 +448,92 @@ class UnicodeData:
         # restrict character range to ISO Latin 1
         self.chars = range(256)

+# hash table tools
+
+# this is a straight-forward reimplementation of Python's built-in
+# dictionary type, using a static data structure, and a custom string
+# hash algorithm.
+
+def myhash(s, magic):
+    h = 0
+    for c in map(ord, string.upper(s)):
+        h = (h * magic) + c
+        ix = h & 0xff000000
+        if ix:
+            h = (h ^ ((ix >> 24) & 0xff)) & 0x00ffffff
+    return h
+
+SIZES = [
+    (4,3), (8,3), (16,3), (32,5), (64,3), (128,3),
+    (256,29), (512,17), (1024,9), (2048,5), (4096,83),
+    (8192,27), (16384,43), (32768,3), (65536,45), (131072,9),
+    (262144,39), (524288,39), (1048576,9), (2097152,5),
+    (4194304,3), (8388608,33), (16777216,27)
+]
+
+class Hash:
+    def __init__(self, name, data, magic):
+        # turn a (key, value) list into a static hash table structure
+
+        # determine table size
+        for size, poly in SIZES:
+            if size > len(data):
+                poly = size + poly
+                break
+        else:
+            raise AssertionError, "ran out of polynominals"
+
+        print size, "slots in hash table"
+
+        table = [None] * size
+
+        mask = size-1
+
+        n = 0
+
+        hash = myhash
+
+        # initialize hash table
+        for key, value in data:
+            h = hash(key, magic)
+            i = (~h) & mask
+            v = table[i]
+            if v is None:
+                table[i] = value
+                continue
+            incr = (h ^ (h >> 3)) & mask;
+            if not incr:
+                incr = mask
+            while 1:
+                n = n + 1
+                i = (i + incr) & mask
+                v = table[i]
+                if v is None:
+                    table[i] = value
+                    break
+                incr = incr << 1
+                if incr > mask:
+                    incr = incr ^ poly
+
+        print n, "collisions"
+        self.collisions = n
+
+        for i in range(len(table)):
+            if table[i] is None:
+                table[i] = 0
+
+        self.data = Array(name + "_hash", table)
+        self.magic = magic
+        self.name = name
+        self.size = size
+        self.poly = poly
+
+    def dump(self, file):
+        # write data to file, as a C array
+        self.data.dump(file)
+        file.write("#define %s_magic %d\n" % (self.name, self.magic))
+        file.write("#define %s_size %d\n" % (self.name, self.size))
+        file.write("#define %s_poly %d\n" % (self.name, self.poly))

 # stuff to deal with arrays of unsigned integers

 class Array:
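For illustration, Hash can be exercised on toy data the same way makeunicodename() calls Hash("code", data, 47). This sketch assumes the surrounding script context (Python 2, the string import, and the Array class that follows), and the data below is made up for the example:

import sys

data = [("LATIN SMALL LETTER A", 0x61), ("LATIN SMALL LETTER B", 0x62)]
codehash = Hash("code", data, 47)   # picks 4 slots: first SIZES entry > len(data)
codehash.dump(sys.stdout)           # writes code_hash[] plus the
                                    # code_magic/code_size/code_poly defines

The emitted #defines are exactly the constants that the reworked getcode() in Modules/ucnhash.c now consumes in place of CODE_SIZE and CODE_POLY.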