Commit fad27aee authored by Fredrik Lundh

Added 38,642 missing characters to the Unicode database (by expanding
first-last ranges) -- but thanks to the 2.0 compression scheme, this
doesn't add a single byte to the resulting binaries (!)

Closes bug #117524
parent 063ee7bb
test_unicodedata
Testing Unicode Database...
Methods: 86793e1265f3cf5506e6ede8f69ab4deb973f3ea
Functions: 5abd7e976848725e58f5834a0e5e37615f40d3a2
Methods: 6c7a7c02657b69d0fdd7a7d174f573194bba2e18
Functions: 41e1d4792185d6474a43c83ce4f593b1bdb01f8a
API: ok
This diff is collapsed.
This diff is collapsed.
......@@ -9,6 +9,7 @@
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl added character type table
# 2000-09-26 fl added LINEBREAK, DECIMAL, and DIGIT flags/fields
# 2000-11-03 fl expand first/last ranges
#
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
#
......@@ -39,10 +40,13 @@ SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
def maketables():
def maketables(trace=0):
unicode = UnicodeData(UNICODE_DATA)
print "--- Processing", UNICODE_DATA, "..."
print len(filter(None, unicode.table)), "characters"
# extract unicode properties
dummy = (0, 0, 0, 0)
table = [dummy]
......@@ -91,6 +95,11 @@ def maketables():
FILE = "Modules/unicodedata_db.h"
print "--- Writing", FILE, "..."
print len(table), "unique properties"
print len(decomp_data), "unique decomposition entries"
fp = open(FILE, "w")
print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
print >>fp
......@@ -125,7 +134,7 @@ def maketables():
print >>fp, "};"
# split record index table
index1, index2, shift = splitbins(index)
index1, index2, shift = splitbins(index, trace)
print >>fp, "/* index tables for the database records */"
print >>fp, "#define SHIFT", shift
......@@ -133,7 +142,7 @@ def maketables():
Array("index2", index2).dump(fp)
# split decomposition index table
index1, index2, shift = splitbins(decomp_index)
index1, index2, shift = splitbins(decomp_index, trace)
print >>fp, "/* index tables for the decomposition data */"
print >>fp, "#define DECOMP_SHIFT", shift
......@@ -200,12 +209,14 @@ def maketables():
table.append(item)
index[char] = i
print len(table), "ctype entries"
FILE = "Objects/unicodetype_db.h"
fp = open(FILE, "w")
print "--- Writing", FILE, "..."
print len(table), "unique character type entries"
print >>fp, "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
print >>fp
print >>fp, "/* a list of unique character type descriptors */"
......@@ -216,7 +227,7 @@ def maketables():
print >>fp
# split character type index table
index1, index2, shift = splitbins(index)
index1, index2, shift = splitbins(index, trace)
print >>fp, "/* type indexes */"
print >>fp, "#define SHIFT", shift
......@@ -233,7 +244,7 @@ import string, sys
class UnicodeData:
def __init__(self, filename):
def __init__(self, filename, expand=1):
file = open(filename)
table = [None] * 65536
while 1:
......@@ -244,6 +255,22 @@ class UnicodeData:
char = string.atoi(s[0], 16)
table[char] = s
# expand first-last ranges (ignore surrogates and private use)
if expand:
field = None
for i in range(0, 0xD800):
s = table[i]
if s:
if s[1][-6:] == "First>":
s[1] = ""
field = s[:]
elif s[1][-5:] == "Last>":
s[1] = ""
field = None
elif field:
field[0] = hex(i)
table[i] = field
# public attributes
self.filename = filename
self.table = table
......@@ -306,8 +333,9 @@ def splitbins(t, trace=0):
t[i] == t2[(t1[i >> shift] << shift) + (i & mask)]
where mask is a bitmask isolating the last "shift" bits.
If optional arg trace is true (default false), progress info is
printed to sys.stderr.
If optional arg trace is non-zero (default zero), progress info
is printed to sys.stderr. The higher the value, the more info
you'll get.
"""
import sys
......@@ -341,7 +369,7 @@ def splitbins(t, trace=0):
t1.append(index >> shift)
# determine memory size
b = len(t1)*getsize(t1) + len(t2)*getsize(t2)
if trace:
if trace > 1:
dump(t1, t2, shift, b)
if b < bytes:
best = t1, t2, shift
......@@ -358,4 +386,4 @@ def splitbins(t, trace=0):
return best
if __name__ == "__main__":
maketables()
maketables(1)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment