Commit cfcea492 authored by Fredrik Lundh's avatar Fredrik Lundh

unicode database compression, step 2:

- fixed attributions
- moved decomposition data to a separate table, in preparation
  for step 3 (which won't happen before 2.0 final, promise!)
- use relative paths in the generator script

I have a lot more stuff in the works for 2.1, but let's leave
that for another day...
parent 21013488
...@@ -4,17 +4,16 @@ ...@@ -4,17 +4,16 @@
Data was extracted from the Unicode 3.0 UnicodeData.txt file. Data was extracted from the Unicode 3.0 UnicodeData.txt file.
Written by Marc-Andre Lemburg (mal@lemburg.com). Written by Marc-Andre Lemburg (mal@lemburg.com).
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Copyright (c) Corporation for National Research Initiatives. Copyright (c) Corporation for National Research Initiatives.
------------------------------------------------------------------------ */ ------------------------------------------------------------------------ */
#include "Python.h" #include "Python.h"
#include "unicodedatabase.h" #include "unicodedatabase.h"
#define unicode_db _PyUnicode_Database_GetRecord
/* --- Module API --------------------------------------------------------- */ /* --- Module API --------------------------------------------------------- */
static PyObject * static PyObject *
...@@ -134,15 +133,9 @@ unicodedata_category(PyObject *self, ...@@ -134,15 +133,9 @@ unicodedata_category(PyObject *self,
"need a single Unicode character as parameter"); "need a single Unicode character as parameter");
goto onError; goto onError;
} }
index = (int)unicode_db((int)*PyUnicode_AS_UNICODE(v))->category; index = (int) _PyUnicode_Database_GetRecord(
if (index < 0 || (int) *PyUnicode_AS_UNICODE(v)
index > sizeof(_PyUnicode_CategoryNames) / )->category;
sizeof(_PyUnicode_CategoryNames[0])) {
PyErr_Format(PyExc_SystemError,
"category index out of range: %i",
index);
goto onError;
}
return PyString_FromString(_PyUnicode_CategoryNames[index]); return PyString_FromString(_PyUnicode_CategoryNames[index]);
onError: onError:
...@@ -164,15 +157,9 @@ unicodedata_bidirectional(PyObject *self, ...@@ -164,15 +157,9 @@ unicodedata_bidirectional(PyObject *self,
"need a single Unicode character as parameter"); "need a single Unicode character as parameter");
goto onError; goto onError;
} }
index = (int)unicode_db((int)*PyUnicode_AS_UNICODE(v))->bidirectional; index = (int) _PyUnicode_Database_GetRecord(
if (index < 0 || (int) *PyUnicode_AS_UNICODE(v)
index > sizeof(_PyUnicode_CategoryNames) / )->bidirectional;
sizeof(_PyUnicode_CategoryNames[0])) {
PyErr_Format(PyExc_SystemError,
"bidirectional index out of range: %i",
index);
goto onError;
}
return PyString_FromString(_PyUnicode_BidirectionalNames[index]); return PyString_FromString(_PyUnicode_BidirectionalNames[index]);
onError: onError:
...@@ -194,7 +181,9 @@ unicodedata_combining(PyObject *self, ...@@ -194,7 +181,9 @@ unicodedata_combining(PyObject *self,
"need a single Unicode character as parameter"); "need a single Unicode character as parameter");
goto onError; goto onError;
} }
value = (int)unicode_db((int)*PyUnicode_AS_UNICODE(v))->combining; value = (int) _PyUnicode_Database_GetRecord(
(int) *PyUnicode_AS_UNICODE(v)
)->combining;
return PyInt_FromLong(value); return PyInt_FromLong(value);
onError: onError:
...@@ -216,7 +205,9 @@ unicodedata_mirrored(PyObject *self, ...@@ -216,7 +205,9 @@ unicodedata_mirrored(PyObject *self,
"need a single Unicode character as parameter"); "need a single Unicode character as parameter");
goto onError; goto onError;
} }
value = (int)unicode_db((int)*PyUnicode_AS_UNICODE(v))->mirrored; value = (int) _PyUnicode_Database_GetRecord(
(int) *PyUnicode_AS_UNICODE(v)
)->mirrored;
return PyInt_FromLong(value); return PyInt_FromLong(value);
onError: onError:
...@@ -238,10 +229,9 @@ unicodedata_decomposition(PyObject *self, ...@@ -238,10 +229,9 @@ unicodedata_decomposition(PyObject *self,
"need a single Unicode character as parameter"); "need a single Unicode character as parameter");
goto onError; goto onError;
} }
value = unicode_db((int)*PyUnicode_AS_UNICODE(v))->decomposition; value = _PyUnicode_Database_GetDecomposition(
if (value == NULL) (int) *PyUnicode_AS_UNICODE(v)
return PyString_FromString(""); );
else
return PyString_FromString(value); return PyString_FromString(value);
onError: onError:
......
This diff is collapsed.
...@@ -4,9 +4,10 @@ ...@@ -4,9 +4,10 @@
Data was extracted from the Unicode 3.0 UnicodeData.txt file. Data was extracted from the Unicode 3.0 UnicodeData.txt file.
Written by Marc-Andre Lemburg (mal@lemburg.com). Written by Marc-Andre Lemburg (mal@lemburg.com).
Rewritten for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Copyright (c) Corporation for National Research Initiatives. Copyright (c) Corporation for National Research Initiatives.
------------------------------------------------------------------------ */ ------------------------------------------------------------------------ */
...@@ -29,3 +30,18 @@ _PyUnicode_Database_GetRecord(int code) ...@@ -29,3 +30,18 @@ _PyUnicode_Database_GetRecord(int code)
} }
return &_PyUnicode_Database_Records[index]; return &_PyUnicode_Database_Records[index];
} }
/* Return the decomposition string for the given code point.
   Performs a two-level (index1/index2) compressed-table lookup;
   out-of-range code points map to slot 0, which holds the empty
   string, so the result is never NULL. */
const char *
_PyUnicode_Database_GetDecomposition(int code)
{
    int i = 0; /* slot 0: empty decomposition */

    if (code >= 0 && code < 65536) {
        i = decomp_index1[code >> DECOMP_SHIFT];
        i = decomp_index2[(i << DECOMP_SHIFT) +
                          (code & ((1 << DECOMP_SHIFT) - 1))];
    }
    return decomp_data[i];
}
...@@ -4,9 +4,10 @@ ...@@ -4,9 +4,10 @@
Data was extracted from the Unicode 3.0 UnicodeData.txt file. Data was extracted from the Unicode 3.0 UnicodeData.txt file.
Written by Marc-Andre Lemburg (mal@lemburg.com). Written by Marc-Andre Lemburg (mal@lemburg.com).
Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
Copyright (c) Corporation for National Research Initiatives. Copyright (c) Corporation for National Research Initiatives.
------------------------------------------------------------------------ */ ------------------------------------------------------------------------ */
...@@ -19,15 +20,14 @@ typedef struct { ...@@ -19,15 +20,14 @@ typedef struct {
const unsigned char bidirectional; /* index into const unsigned char bidirectional; /* index into
_PyUnicode_BidirectionalNames */ _PyUnicode_BidirectionalNames */
const unsigned char mirrored; /* true if mirrored in bidir mode */ const unsigned char mirrored; /* true if mirrored in bidir mode */
const char *decomposition; /* pointer to the decomposition
string or NULL */
} _PyUnicode_DatabaseRecord; } _PyUnicode_DatabaseRecord;
/* --- Unicode category names --------------------------------------------- */ /* --- Unicode category names --------------------------------------------- */
extern const char *_PyUnicode_CategoryNames[32]; extern const char *_PyUnicode_CategoryNames[];
extern const char *_PyUnicode_BidirectionalNames[21]; extern const char *_PyUnicode_BidirectionalNames[];
/* --- Unicode Database --------------------------------------------------- */ /* --- Unicode Database --------------------------------------------------- */
extern const _PyUnicode_DatabaseRecord *_PyUnicode_Database_GetRecord(int ch); extern const _PyUnicode_DatabaseRecord *_PyUnicode_Database_GetRecord(int ch);
extern const char *_PyUnicode_Database_GetDecomposition(int ch);
# #
# makeunidb.py -- generate a compact version of the unicode property # generate a compact version of the unicode property database
# database (unicodedatabase.h) #
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
# 2000-09-25 fl merged tim's splitbin fixes, separate decomposition table
#
# written by Fredrik Lundh (fredrik@pythonware.com), September 2000
# #
import sys import sys
SCRIPT = sys.argv[0] SCRIPT = sys.argv[0]
VERSION = "1.0" VERSION = "1.1"
UNICODE_DATA = "c:/pythonware/modules/unidb/etc/UnicodeData-Latest.txt" UNICODE_DATA = "../UnicodeData-Latest.txt"
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd", CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm", "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
...@@ -24,13 +29,12 @@ def maketable(): ...@@ -24,13 +29,12 @@ def maketable():
unicode = UnicodeData(UNICODE_DATA) unicode = UnicodeData(UNICODE_DATA)
# extract unicode properties # extract unicode properties
dummy = (0, 0, 0, 0, "NULL") dummy = (0, 0, 0, 0)
table = [dummy] table = [dummy]
cache = {0: dummy} cache = {0: dummy}
index = [0] * len(unicode.chars) index = [0] * len(unicode.chars)
DECOMPOSITION = [""] # 1) database properties
for char in unicode.chars: for char in unicode.chars:
record = unicode.table[char] record = unicode.table[char]
if record: if record:
...@@ -39,12 +43,8 @@ def maketable(): ...@@ -39,12 +43,8 @@ def maketable():
combining = int(record[3]) combining = int(record[3])
bidirectional = BIDIRECTIONAL_NAMES.index(record[4]) bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
mirrored = record[9] == "Y" mirrored = record[9] == "Y"
if record[5]:
decomposition = '"%s"' % record[5]
else:
decomposition = "NULL"
item = ( item = (
category, combining, bidirectional, mirrored, decomposition category, combining, bidirectional, mirrored
) )
# add entry to index and item tables # add entry to index and item tables
i = cache.get(item) i = cache.get(item)
...@@ -53,8 +53,26 @@ def maketable(): ...@@ -53,8 +53,26 @@ def maketable():
table.append(item) table.append(item)
index[char] = i index[char] = i
# FIXME: we really should compress the decomposition stuff # 2) decomposition data
# (see the unidb utilities for one way to do this)
# FIXME: <fl> using the encoding stuff from unidb would save
# another 50k or so, but I'll leave that for 2.1...
decomp_data = [""]
decomp_index = [0] * len(unicode.chars)
for char in unicode.chars:
record = unicode.table[char]
if record:
if record[5]:
try:
i = decomp_data.index(record[5])
except ValueError:
i = len(decomp_data)
decomp_data.append(record[5])
else:
i = 0
decomp_index[char] = i
FILE = "unicodedata_db.h" FILE = "unicodedata_db.h"
...@@ -65,7 +83,7 @@ def maketable(): ...@@ -65,7 +83,7 @@ def maketable():
print "/* a list of unique database records */" print "/* a list of unique database records */"
print "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {" print "const _PyUnicode_DatabaseRecord _PyUnicode_Database_Records[] = {"
for item in table: for item in table:
print " {%d, %d, %d, %d, %s}," % item print " {%d, %d, %d, %d}," % item
print "};" print "};"
print print
...@@ -82,6 +100,12 @@ def maketable(): ...@@ -82,6 +100,12 @@ def maketable():
print " NULL" print " NULL"
print "};" print "};"
print "static const char *decomp_data[] = {"
for name in decomp_data:
print " \"%s\"," % name
print " NULL"
print "};"
# split index table # split index table
index1, index2, shift = splitbins(index) index1, index2, shift = splitbins(index)
...@@ -90,6 +114,14 @@ def maketable(): ...@@ -90,6 +114,14 @@ def maketable():
Array("index1", index1).dump(sys.stdout) Array("index1", index1).dump(sys.stdout)
Array("index2", index2).dump(sys.stdout) Array("index2", index2).dump(sys.stdout)
# split index table
index1, index2, shift = splitbins(decomp_index)
print "/* same, for the decomposition data */"
print "#define DECOMP_SHIFT", shift
Array("decomp_index1", index1).dump(sys.stdout)
Array("decomp_index2", index2).dump(sys.stdout)
sys.stdout = sys.__stdout__ sys.stdout = sys.__stdout__
# -------------------------------------------------------------------- # --------------------------------------------------------------------
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment