Commit 1e8b59c1 authored by Fredrik Lundh

unicode database compression, step 3:

- use unidb compression for the unicodectype module.  smaller,
  faster, and slightly more portable...

- also mention the unicode directory in Tools/README
parent a9018d00
...@@ -21,6 +21,9 @@ scripts A number of useful single-file programs, e.g. tabnanny.py ...@@ -21,6 +21,9 @@ scripts A number of useful single-file programs, e.g. tabnanny.py
(by Tim Peters), which checks for inconsistent mixing (by Tim Peters), which checks for inconsistent mixing
of tabs and spaces. of tabs and spaces.
unicode Tools used to generate unicode database files for
Python 2.0 (by Fredrik Lundh).
versioncheck A tool to automate checking whether you have the latest versioncheck A tool to automate checking whether you have the latest
version of a package (by Jack Jansen). version of a package (by Jack Jansen).
......
# #
# generate a compact version of the unicode property database # (re)generate unicode property and type databases
#
# this script converts a unicode 3.0 database file to
# Modules/unicodedata_db.h and Objects/unicodetype_db.h
# #
# history: # history:
# 2000-09-24 fl created (based on bits and pieces from unidb) # 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table # 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
# 2000-09-25 fl added character type table
# #
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000 # written by Fredrik Lundh (fredrik@pythonware.com), September 2000
# #
...@@ -13,7 +17,7 @@ import sys ...@@ -13,7 +17,7 @@ import sys
SCRIPT = sys.argv[0] SCRIPT = sys.argv[0]
VERSION = "1.1" VERSION = "1.1"
UNICODE_DATA = "../UnicodeData-Latest.txt" UNICODE_DATA = "UnicodeData-Latest.txt"
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd", CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm", "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
...@@ -24,7 +28,16 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO", ...@@ -24,7 +28,16 @@ BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
"PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS", "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
"ON" ] "ON" ]
def maketable(): ALPHA_MASK = 0x01
DECIMAL_MASK = 0x02
DIGIT_MASK = 0x04
LOWER_MASK = 0x08
NUMERIC_MASK = 0x10
SPACE_MASK = 0x20
TITLE_MASK = 0x40
UPPER_MASK = 0x80
def maketables():
unicode = UnicodeData(UNICODE_DATA) unicode = UnicodeData(UNICODE_DATA)
...@@ -74,7 +87,7 @@ def maketable(): ...@@ -74,7 +87,7 @@ def maketable():
i = 0 i = 0
decomp_index[char] = i decomp_index[char] = i
FILE = "unicodedata_db.h" FILE = "Modules/unicodedata_db.h"
sys.stdout = open(FILE, "w") sys.stdout = open(FILE, "w")
...@@ -87,6 +100,9 @@ def maketable(): ...@@ -87,6 +100,9 @@ def maketable():
print "};" print "};"
print print
# FIXME: the following tables should be made static, and
# the support code moved into unicodedatabase.c
print "/* string literals */" print "/* string literals */"
print "const char *_PyUnicode_CategoryNames[] = {" print "const char *_PyUnicode_CategoryNames[] = {"
for name in CATEGORY_NAMES: for name in CATEGORY_NAMES:
...@@ -106,24 +122,96 @@ def maketable(): ...@@ -106,24 +122,96 @@ def maketable():
print " NULL" print " NULL"
print "};" print "};"
# split index table # split record index table
index1, index2, shift = splitbins(index) index1, index2, shift = splitbins(index)
print "/* index tables used to find the right database record */" print "/* index tables for the database records */"
print "#define SHIFT", shift print "#define SHIFT", shift
Array("index1", index1).dump(sys.stdout) Array("index1", index1).dump(sys.stdout)
Array("index2", index2).dump(sys.stdout) Array("index2", index2).dump(sys.stdout)
# split index table # split decomposition index table
index1, index2, shift = splitbins(decomp_index) index1, index2, shift = splitbins(decomp_index)
print "/* same, for the decomposition data */" print "/* index tables for the decomposition data */"
print "#define DECOMP_SHIFT", shift print "#define DECOMP_SHIFT", shift
Array("decomp_index1", index1).dump(sys.stdout) Array("decomp_index1", index1).dump(sys.stdout)
Array("decomp_index2", index2).dump(sys.stdout) Array("decomp_index2", index2).dump(sys.stdout)
sys.stdout = sys.__stdout__ sys.stdout = sys.__stdout__
#
# 3) unicode type data
# extract unicode types
dummy = (0, 0, 0, 0)
table = [dummy]
cache = {0: dummy}
index = [0] * len(unicode.chars)
for char in unicode.chars:
record = unicode.table[char]
if record:
# extract database properties
category = record[2]
bidirectional = record[4]
flags = 0
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
flags |= ALPHA_MASK
if category == "Ll":
flags |= LOWER_MASK
if category == "Zs" or bidirectional in ("WS", "B", "S"):
flags |= SPACE_MASK
if category in ["Lt", "Lu"]:
flags |= TITLE_MASK
if category == "Lu":
flags |= UPPER_MASK
# use delta predictor for upper/lower/title
if record[12]:
upper = (int(record[12], 16) - char) & 0xffff
else:
upper = 0
if record[13]:
lower = (int(record[13], 16) - char) & 0xffff
else:
lower = 0
if record[14]:
title = (int(record[14], 16) - char) & 0xffff
else:
title = 0
item = (
flags, upper, lower, title
)
# add entry to index and item tables
i = cache.get(item)
if i is None:
cache[item] = i = len(table)
table.append(item)
index[char] = i
FILE = "Objects/unicodetype_db.h"
sys.stdout = open(FILE, "w")
print "/* this file was generated by %s %s */" % (SCRIPT, VERSION)
print
print "/* a list of unique character type descriptors */"
print "const _PyUnicode_TypeRecord _PyUnicode_TypeRecords[] = {"
for item in table:
print " {%d, %d, %d, %d}," % item
print "};"
print
# split type index table
index1, index2, shift = splitbins(index)
print "/* type indexes */"
print "#define SHIFT", shift
Array("index1", index1).dump(sys.stdout)
Array("index2", index2).dump(sys.stdout)
sys.stdout = sys.__stdout__
# -------------------------------------------------------------------- # --------------------------------------------------------------------
# the following support code is taken from the unidb utilities # the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB # Copyright (c) 1999-2000 by Secret Labs AB
...@@ -259,4 +347,4 @@ def splitbins(t, trace=0): ...@@ -259,4 +347,4 @@ def splitbins(t, trace=0):
return best return best
if __name__ == "__main__": if __name__ == "__main__":
maketable() maketables()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment