Commit 480f1bb6 authored by Martin v. Löwis's avatar Martin v. Löwis

Update Unicode database to Unicode 4.1.

parent e2b46772
...@@ -14,11 +14,11 @@ ...@@ -14,11 +14,11 @@
This module provides access to the Unicode Character Database which This module provides access to the Unicode Character Database which
defines character properties for all Unicode characters. The data in defines character properties for all Unicode characters. The data in
this database is based on the \file{UnicodeData.txt} file version this database is based on the \file{UnicodeData.txt} file version
3.2.0 which is publicly available from \url{ftp://ftp.unicode.org/}. 4.1.0 which is publicly available from \url{ftp://ftp.unicode.org/}.
The module uses the same names and symbols as defined by the The module uses the same names and symbols as defined by the
UnicodeData File Format 3.2.0 (see UnicodeData File Format 4.1.0 (see
\url{http://www.unicode.org/Public/3.2-Update/UnicodeData-3.2.0.html}). It \url{http://www.unicode.org/Public/4.1-Update/UnicodeData-4.1.0.html}). It
defines the following functions: defines the following functions:
\begin{funcdesc}{lookup}{name} \begin{funcdesc}{lookup}{name}
...@@ -130,3 +130,12 @@ The version of the Unicode database used in this module. ...@@ -130,3 +130,12 @@ The version of the Unicode database used in this module.
\versionadded{2.3} \versionadded{2.3}
\end{datadesc} \end{datadesc}
\begin{datadesc}{db_3_2_0}
This is an object that has the same methods as the entire
module, but uses the Unicode database version 3.2 instead,
for applications that require this specific version of
the Unicode database (such as IDNA).
\versionadded{2.5}
\end{datadesc}
...@@ -14,12 +14,14 @@ typedef struct { ...@@ -14,12 +14,14 @@ typedef struct {
int size; int size;
/* Get name for a given character code. Returns non-zero if /* Get name for a given character code. Returns non-zero if
success, zero if not. Does not set Python exceptions. */ success, zero if not. Does not set Python exceptions.
int (*getname)(Py_UCS4 code, char* buffer, int buflen); If self is NULL, data come from the default version of the database.
If it is not NULL, it should be a unicodedata.db_X_Y_Z object */
int (*getname)(PyObject *self, Py_UCS4 code, char* buffer, int buflen);
/* Get character code for a given name. Same error handling /* Get character code for a given name. Same error handling
as for getname. */ as for getname. */
int (*getcode)(const char* name, int namelen, Py_UCS4* code); int (*getcode)(PyObject *self, const char* name, int namelen, Py_UCS4* code);
} _PyUnicode_Name_CAPI; } _PyUnicode_Name_CAPI;
......
# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep) # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)
import stringprep, unicodedata, re, codecs import stringprep, re, codecs
from unicodedata import db_3_2_0 as unicodedata
# IDNA section 3.1 # IDNA section 3.1
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]") dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")
......
...@@ -5,7 +5,7 @@ There are two kinds of tables: sets, for which a member test is provided, ...@@ -5,7 +5,7 @@ There are two kinds of tables: sets, for which a member test is provided,
and mappings, for which a mapping function is provided. and mappings, for which a mapping function is provided.
""" """
import unicodedata from unicodedata import db_3_2_0 as unicodedata
assert unicodedata.unidata_version == '3.2.0' assert unicodedata.unidata_version == '3.2.0'
......
...@@ -16,7 +16,7 @@ encoding = 'utf-8' ...@@ -16,7 +16,7 @@ encoding = 'utf-8'
class UnicodeMethodsTest(unittest.TestCase): class UnicodeMethodsTest(unittest.TestCase):
# update this, if the database changes # update this, if the database changes
expectedchecksum = 'a37276dc2c158bef6dfd908ad34525c97180fad9' expectedchecksum = 'a6555cd209d960dcfa17bfdce0c96d91cfa9a9ba'
def test_method_checksum(self): def test_method_checksum(self):
h = sha.sha() h = sha.sha()
...@@ -75,7 +75,7 @@ class UnicodeDatabaseTest(unittest.TestCase): ...@@ -75,7 +75,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
class UnicodeFunctionsTest(UnicodeDatabaseTest): class UnicodeFunctionsTest(UnicodeDatabaseTest):
# update this, if the database changes # update this, if the database changes
expectedchecksum = 'cfe20a967a450ebc82ca68c3e4eed344164e11af' expectedchecksum = 'b45b79f3203ee1a896d9b5655484adaff5d4964b'
def test_function_checksum(self): def test_function_checksum(self):
data = [] data = []
......
...@@ -279,6 +279,10 @@ Core and builtins ...@@ -279,6 +279,10 @@ Core and builtins
Extension Modules Extension Modules
----------------- -----------------
- The unicodedata module was updated to the 4.1 version of the Unicode
database. The 3.2 version is still available as unicodedata.db_3_2_0
for applications that require this specific version (such as IDNA).
- The timing module is no longer built by default. It was deprecated - The timing module is no longer built by default. It was deprecated
in PEP 4 in Python 2.0 or earlier. in PEP 4 in Python 2.0 or earlier.
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -1898,7 +1898,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -1898,7 +1898,7 @@ PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
/* found a name. look it up in the unicode database */ /* found a name. look it up in the unicode database */
message = "unknown Unicode character name"; message = "unknown Unicode character name";
s++; s++;
if (ucnhash_CAPI->getcode(start, (int)(s-start-1), &chr)) if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
goto store; goto store;
} }
} }
......
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -26,13 +26,15 @@ ...@@ -26,13 +26,15 @@
import sys import sys
SCRIPT = sys.argv[0] SCRIPT = sys.argv[0]
VERSION = "2.3" VERSION = "2.5"
# The Unicode Database # The Unicode Database
UNIDATA_VERSION = "3.2.0" UNIDATA_VERSION = "4.1.0"
UNICODE_DATA = "UnicodeData.txt" UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions.txt" COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth.txt" EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
old_versions = ["3.2.0"]
CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd", CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",
"Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm", "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", "Cs", "Co", "Cn", "Lm",
...@@ -57,13 +59,23 @@ UPPER_MASK = 0x80 ...@@ -57,13 +59,23 @@ UPPER_MASK = 0x80
def maketables(trace=0): def maketables(trace=0):
print "--- Reading", UNICODE_DATA, "..." print "--- Reading", UNICODE_DATA % "", "..."
unicode = UnicodeData(UNICODE_DATA, COMPOSITION_EXCLUSIONS, version = ""
EASTASIAN_WIDTH) unicode = UnicodeData(UNICODE_DATA % version,
COMPOSITION_EXCLUSIONS % version,
EASTASIAN_WIDTH % version)
print len(filter(None, unicode.table)), "characters" print len(filter(None, unicode.table)), "characters"
for version in old_versions:
print "--- Reading", UNICODE_DATA % ("-"+version), "..."
old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
COMPOSITION_EXCLUSIONS % ("-"+version),
EASTASIAN_WIDTH % ("-"+version))
print len(filter(None, old_unicode.table)), "characters"
merge_old_version(version, unicode, old_unicode)
makeunicodename(unicode, trace) makeunicodename(unicode, trace)
makeunicodedata(unicode, trace) makeunicodedata(unicode, trace)
makeunicodetype(unicode, trace) makeunicodetype(unicode, trace)
...@@ -119,6 +131,8 @@ def makeunicodedata(unicode, trace): ...@@ -119,6 +131,8 @@ def makeunicodedata(unicode, trace):
if record: if record:
if record[5]: if record[5]:
decomp = record[5].split() decomp = record[5].split()
if len(decomp) > 19:
raise Exception, "character %x has a decomposition too large for nfd_nfkd" % char
# prefix # prefix
if decomp[0][0] == "<": if decomp[0][0] == "<":
prefix = decomp.pop(0) prefix = decomp.pop(0)
...@@ -278,6 +292,44 @@ def makeunicodedata(unicode, trace): ...@@ -278,6 +292,44 @@ def makeunicodedata(unicode, trace):
Array("comp_index", index).dump(fp, trace) Array("comp_index", index).dump(fp, trace)
Array("comp_data", index2).dump(fp, trace) Array("comp_data", index2).dump(fp, trace)
# Generate delta tables for old versions
for version, table, normalization in unicode.changed:
cversion = version.replace(".","_")
records = [table[0]]
cache = {table[0]:0}
index = [0] * len(table)
for i, record in enumerate(table):
try:
index[i] = cache[record]
except KeyError:
index[i] = cache[record] = len(records)
records.append(record)
index1, index2, shift = splitbins(index, trace)
print >>fp, "static const change_record change_records_%s[] = {" % cversion
for record in records:
print >>fp, "\t{ %s }," % ", ".join(map(str,record))
print >>fp, "};"
Array("changes_%s_index" % cversion, index1).dump(fp, trace)
Array("changes_%s_data" % cversion, index2).dump(fp, trace)
print >>fp, "static const change_record* get_change_%s(Py_UCS4 n)" % cversion
print >>fp, "{"
print >>fp, "\tint index;"
print >>fp, "\tif (n >= 0x110000) index = 0;"
print >>fp, "\telse {"
print >>fp, "\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift)
print >>fp, "\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
(cversion, shift, ((1<<shift)-1))
print >>fp, "\t}"
print >>fp, "\treturn change_records_%s+index;" % cversion
print >>fp, "}\n"
print >>fp, "static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion
print >>fp, "{"
print >>fp, "\tswitch(n) {"
for k, v in normalization:
print >>fp, "\tcase %s: return 0x%s;" % (hex(k), v)
print >>fp, "\tdefault: return 0;"
print >>fp, "\t}\n}\n"
fp.close() fp.close()
# -------------------------------------------------------------------- # --------------------------------------------------------------------
...@@ -540,6 +592,82 @@ def makeunicodename(unicode, trace): ...@@ -540,6 +592,82 @@ def makeunicodename(unicode, trace):
fp.close() fp.close()
def merge_old_version(version, new, old):
# Changes to exclusion file not implemented yet
if old.exclusions != new.exclusions:
raise NotImplementedError, "exclusions differ"
# In these change records, 0xFF means "no change"
bidir_changes = [0xFF]*0x110000
category_changes = [0xFF]*0x110000
decimal_changes = [0xFF]*0x110000
# In numeric data, 0 means "no change",
# -1 means "did not have a numeric value
numeric_changes = [0] * 0x110000
# normalization_changes is a list of key-value pairs
normalization_changes = []
for i in range(0x110000):
if new.table[i] is None:
# Characters unassigned in the new version ought to
# be unassigned in the old one
assert old.table[i] is None
continue
# check characters unassigned in the old version
if old.table[i] is None:
# category 0 is "unassigned"
category_changes[i] = 0
continue
# check characters that differ
if old.table[i] != new.table[i]:
for k in range(len(old.table[i])):
if old.table[i][k] != new.table[i][k]:
value = old.table[i][k]
if k == 2:
#print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
category_changes[i] = CATEGORY_NAMES.index(value)
elif k == 4:
#print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
elif k == 5:
#print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
# We assume that all normalization changes are in 1:1 mappings
assert " " not in value
normalization_changes.append((i, value))
elif k == 6:
#print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
# we only support changes where the old value is a single digit
assert value in "0123456789"
decimal_changes[i] = int(value)
elif k == 8:
# print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
# Since 0 encodes "no change", the old value is better not 0
assert value != "0" and value != "-1"
if not value:
numeric_changes[i] = -1
else:
assert re.match("^[0-9]+$", value)
numeric_changes[i] = int(value)
elif k == 11:
# change to ISO comment, ignore
pass
elif k == 12:
# change to simple uppercase mapping; ignore
pass
elif k == 13:
# change to simple lowercase mapping; ignore
pass
elif k == 14:
# change to simple titlecase mapping; ignore
pass
else:
class Difference(Exception):pass
raise Difference, (hex(i), k, old.table[i], new.table[i])
new.changed.append((version, zip(bidir_changes, category_changes,
decimal_changes, numeric_changes),
normalization_changes))
# -------------------------------------------------------------------- # --------------------------------------------------------------------
# the following support code is taken from the unidb utilities # the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB # Copyright (c) 1999-2000 by Secret Labs AB
...@@ -551,6 +679,7 @@ import sys ...@@ -551,6 +679,7 @@ import sys
class UnicodeData: class UnicodeData:
def __init__(self, filename, exclusions, eastasianwidth, expand=1): def __init__(self, filename, exclusions, eastasianwidth, expand=1):
self.changed = []
file = open(filename) file = open(filename)
table = [None] * 0x110000 table = [None] * 0x110000
while 1: while 1:
...@@ -569,13 +698,14 @@ class UnicodeData: ...@@ -569,13 +698,14 @@ class UnicodeData:
if s: if s:
if s[1][-6:] == "First>": if s[1][-6:] == "First>":
s[1] = "" s[1] = ""
field = s[:] field = s
elif s[1][-5:] == "Last>": elif s[1][-5:] == "Last>":
s[1] = "" s[1] = ""
field = None field = None
elif field: elif field:
field[0] = hex(i) f2 = field[:]
table[i] = field f2[0] = "%X" % i
table[i] = f2
# public attributes # public attributes
self.filename = filename self.filename = filename
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment