Commit 7d520793 authored by Amaury Forgeot d'Arc's avatar Amaury Forgeot d'Arc

Merged revisions 75272-75273 via svnmerge from

svn+ssh://pythondev@svn.python.org/python/trunk

........
  r75272 | amaury.forgeotdarc | 2009-10-06 21:56:32 +0200 (mar., 06 oct. 2009) | 5 lines

  #1571184: makeunicodedata.py now generates the functions _PyUnicode_ToNumeric,
  _PyUnicode_IsLinebreak and _PyUnicode_IsWhitespace.

  It now also parses the Unihan.txt for numeric values.
........
  r75273 | amaury.forgeotdarc | 2009-10-06 22:02:09 +0200 (mar., 06 oct. 2009) | 2 lines

  Add Anders Chrigstrom to Misc/ACKS for his work on unicodedata.
........
parent e1b60d48
...@@ -21,7 +21,7 @@ errors = 'surrogatepass' ...@@ -21,7 +21,7 @@ errors = 'surrogatepass'
class UnicodeMethodsTest(unittest.TestCase): class UnicodeMethodsTest(unittest.TestCase):
# update this, if the database changes # update this, if the database changes
expectedchecksum = '6ec65b65835614ec00634c674bba0e50cd32c189' expectedchecksum = '0b915116051f3ed029a98542c2b7df63c9646272'
def test_method_checksum(self): def test_method_checksum(self):
h = hashlib.sha1() h = hashlib.sha1()
...@@ -80,7 +80,7 @@ class UnicodeDatabaseTest(unittest.TestCase): ...@@ -80,7 +80,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
class UnicodeFunctionsTest(UnicodeDatabaseTest): class UnicodeFunctionsTest(UnicodeDatabaseTest):
# update this, if the database changes # update this, if the database changes
expectedchecksum = '3136d5afd787dc2bcb1bdcac95e385349fbebbca' expectedchecksum = 'd4169ccff998ebbd1ec007a0b3fbd66e5ccf0229'
def test_function_checksum(self): def test_function_checksum(self):
data = [] data = []
...@@ -119,6 +119,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest): ...@@ -119,6 +119,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
self.assertEqual(self.db.numeric('9'), 9) self.assertEqual(self.db.numeric('9'), 9)
self.assertEqual(self.db.numeric('\u215b'), 0.125) self.assertEqual(self.db.numeric('\u215b'), 0.125)
self.assertEqual(self.db.numeric('\u2468'), 9.0) self.assertEqual(self.db.numeric('\u2468'), 9.0)
self.assertEqual(self.db.numeric('\ua627'), 7.0)
self.assertEqual(self.db.numeric('\U00020000', None), None) self.assertEqual(self.db.numeric('\U00020000', None), None)
self.assertRaises(TypeError, self.db.numeric) self.assertRaises(TypeError, self.db.numeric)
......
...@@ -131,6 +131,7 @@ Michael Chermside ...@@ -131,6 +131,7 @@ Michael Chermside
Albert Chin-A-Young Albert Chin-A-Young
Adal Chiriliuc Adal Chiriliuc
Matt Chisholm Matt Chisholm
Anders Chrigström
Tom Christiansen Tom Christiansen
Vadim Chugunov Vadim Chugunov
David Cinege David Cinege
......
...@@ -12,6 +12,11 @@ What's New in Python 3.2 Alpha 1? ...@@ -12,6 +12,11 @@ What's New in Python 3.2 Alpha 1?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #1571184: The Unicode database contains properties for more characters.
The tables for code points representing numeric values, white spaces or line
breaks are now generated from the official Unicode Character Database files,
and include information from the Unihan.txt file.
- Issue #7019: Raise ValueError when unmarshalling bad long data, instead - Issue #7019: Raise ValueError when unmarshalling bad long data, instead
of producing internally inconsistent Python longs. of producing internally inconsistent Python longs.
......
...@@ -36,7 +36,7 @@ typedef struct change_record { ...@@ -36,7 +36,7 @@ typedef struct change_record {
const unsigned char category_changed; const unsigned char category_changed;
const unsigned char decimal_changed; const unsigned char decimal_changed;
const unsigned char mirrored_changed; const unsigned char mirrored_changed;
const int numeric_changed; const double numeric_changed;
} change_record; } change_record;
/* data file generated by Tools/unicode/makeunicodedata.py */ /* data file generated by Tools/unicode/makeunicodedata.py */
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
...@@ -35,6 +35,7 @@ UNIDATA_VERSION = "5.1.0" ...@@ -35,6 +35,7 @@ UNIDATA_VERSION = "5.1.0"
UNICODE_DATA = "UnicodeData%s.txt" UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt" COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt" EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt" DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt" DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
...@@ -64,6 +65,7 @@ XID_START_MASK = 0x100 ...@@ -64,6 +65,7 @@ XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200 XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400 PRINTABLE_MASK = 0x400
NODELTA_MASK = 0x800 NODELTA_MASK = 0x800
NUMERIC_MASK = 0x1000
def maketables(trace=0): def maketables(trace=0):
...@@ -73,6 +75,7 @@ def maketables(trace=0): ...@@ -73,6 +75,7 @@ def maketables(trace=0):
unicode = UnicodeData(UNICODE_DATA % version, unicode = UnicodeData(UNICODE_DATA % version,
COMPOSITION_EXCLUSIONS % version, COMPOSITION_EXCLUSIONS % version,
EASTASIAN_WIDTH % version, EASTASIAN_WIDTH % version,
UNIHAN % version,
DERIVED_CORE_PROPERTIES % version, DERIVED_CORE_PROPERTIES % version,
DERIVEDNORMALIZATION_PROPS % version) DERIVEDNORMALIZATION_PROPS % version)
...@@ -83,6 +86,7 @@ def maketables(trace=0): ...@@ -83,6 +86,7 @@ def maketables(trace=0):
old_unicode = UnicodeData(UNICODE_DATA % ("-"+version), old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
COMPOSITION_EXCLUSIONS % ("-"+version), COMPOSITION_EXCLUSIONS % ("-"+version),
EASTASIAN_WIDTH % ("-"+version), EASTASIAN_WIDTH % ("-"+version),
UNIHAN % ("-"+version),
DERIVED_CORE_PROPERTIES % ("-"+version)) DERIVED_CORE_PROPERTIES % ("-"+version))
print(len(list(filter(None, old_unicode.table))), "characters") print(len(list(filter(None, old_unicode.table))), "characters")
merge_old_version(version, unicode, old_unicode) merge_old_version(version, unicode, old_unicode)
...@@ -357,6 +361,9 @@ def makeunicodetype(unicode, trace): ...@@ -357,6 +361,9 @@ def makeunicodetype(unicode, trace):
table = [dummy] table = [dummy]
cache = {0: dummy} cache = {0: dummy}
index = [0] * len(unicode.chars) index = [0] * len(unicode.chars)
numeric = {}
spaces = []
linebreaks = []
for char in unicode.chars: for char in unicode.chars:
record = unicode.table[char] record = unicode.table[char]
...@@ -373,8 +380,10 @@ def makeunicodetype(unicode, trace): ...@@ -373,8 +380,10 @@ def makeunicodetype(unicode, trace):
flags |= LOWER_MASK flags |= LOWER_MASK
if category == "Zl" or bidirectional == "B": if category == "Zl" or bidirectional == "B":
flags |= LINEBREAK_MASK flags |= LINEBREAK_MASK
linebreaks.append(char)
if category == "Zs" or bidirectional in ("WS", "B", "S"): if category == "Zs" or bidirectional in ("WS", "B", "S"):
flags |= SPACE_MASK flags |= SPACE_MASK
spaces.append(char)
if category == "Lt": if category == "Lt":
flags |= TITLE_MASK flags |= TITLE_MASK
if category == "Lu": if category == "Lu":
...@@ -423,6 +432,9 @@ def makeunicodetype(unicode, trace): ...@@ -423,6 +432,9 @@ def makeunicodetype(unicode, trace):
if record[7]: if record[7]:
flags |= DIGIT_MASK flags |= DIGIT_MASK
digit = int(record[7]) digit = int(record[7])
if record[8]:
flags |= NUMERIC_MASK
numeric.setdefault(record[8], []).append(char)
item = ( item = (
upper, lower, title, decimal, digit, flags upper, lower, title, decimal, digit, flags
) )
...@@ -434,6 +446,9 @@ def makeunicodetype(unicode, trace): ...@@ -434,6 +446,9 @@ def makeunicodetype(unicode, trace):
index[char] = i index[char] = i
print(len(table), "unique character type entries") print(len(table), "unique character type entries")
print(sum(map(len, numeric.values())), "numeric code points")
print(len(spaces), "whitespace code points")
print(len(linebreaks), "linebreak code points")
print("--- Writing", FILE, "...") print("--- Writing", FILE, "...")
...@@ -455,6 +470,96 @@ def makeunicodetype(unicode, trace): ...@@ -455,6 +470,96 @@ def makeunicodetype(unicode, trace):
Array("index1", index1).dump(fp, trace) Array("index1", index1).dump(fp, trace)
Array("index2", index2).dump(fp, trace) Array("index2", index2).dump(fp, trace)
# Generate code for _PyUnicode_ToNumeric()
numeric_items = sorted(numeric.items())
print('/* Returns the numeric value as double for Unicode characters', file=fp)
print(' * having this property, -1.0 otherwise.', file=fp)
print(' */', file=fp)
print('double _PyUnicode_ToNumeric(Py_UNICODE ch)', file=fp)
print('{', file=fp)
print(' switch (ch) {', file=fp)
for value, codepoints in numeric_items:
haswide = False
hasnonewide = False
codepoints.sort()
for codepoint in codepoints:
if codepoint < 0x10000:
hasnonewide = True
if codepoint >= 0x10000 and not haswide:
print('#ifdef Py_UNICODE_WIDE', file=fp)
haswide = True
print(' case 0x%04X:' % (codepoint,), file=fp)
if haswide and hasnonewide:
print('#endif', file=fp)
print(' return (double) %s;' % (value,), file=fp)
if haswide and not hasnonewide:
print('#endif', file=fp)
print(' }', file=fp)
print(' return -1.0;', file=fp)
print('}', file=fp)
print(file=fp)
# Generate code for _PyUnicode_IsWhitespace()
print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
print(" */", file=fp)
print('int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)', file=fp)
print('{', file=fp)
print('#ifdef WANT_WCTYPE_FUNCTIONS', file=fp)
print(' return iswspace(ch);', file=fp)
print('#else', file=fp)
print(' switch (ch) {', file=fp)
haswide = False
hasnonewide = False
spaces.sort()
for codepoint in spaces:
if codepoint < 0x10000:
hasnonewide = True
if codepoint >= 0x10000 and not haswide:
print('#ifdef Py_UNICODE_WIDE', file=fp)
haswide = True
print(' case 0x%04X:' % (codepoint,), file=fp)
if haswide and hasnonewide:
print('#endif', file=fp)
print(' return 1;', file=fp)
if haswide and not hasnonewide:
print('#endif', file=fp)
print(' }', file=fp)
print(' return 0;', file=fp)
print('#endif', file=fp)
print('}', file=fp)
print(file=fp)
# Generate code for _PyUnicode_IsLinebreak()
print("/* Returns 1 for Unicode characters having the category 'Zl',", file=fp)
print(" * 'Zp' or type 'B', 0 otherwise.", file=fp)
print(" */", file=fp)
print('int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)', file=fp)
print('{', file=fp)
print(' switch (ch) {', file=fp)
haswide = False
hasnonewide = False
linebreaks.sort()
for codepoint in linebreaks:
if codepoint < 0x10000:
hasnonewide = True
if codepoint >= 0x10000 and not haswide:
print('#ifdef Py_UNICODE_WIDE', file=fp)
haswide = True
print(' case 0x%04X:' % (codepoint,), file=fp)
if haswide and hasnonewide:
print('#endif', file=fp)
print(' return 1;', file=fp)
if haswide and not hasnonewide:
print('#endif', file=fp)
print(' }', file=fp)
print(' return 0;', file=fp)
print('}', file=fp)
print(file=fp)
fp.close() fp.close()
# -------------------------------------------------------------------- # --------------------------------------------------------------------
...@@ -670,12 +775,11 @@ def merge_old_version(version, new, old): ...@@ -670,12 +775,11 @@ def merge_old_version(version, new, old):
elif k == 8: elif k == 8:
# print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k] # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
# Since 0 encodes "no change", the old value is better not 0 # Since 0 encodes "no change", the old value is better not 0
assert value != "0" and value != "-1"
if not value: if not value:
numeric_changes[i] = -1 numeric_changes[i] = -1
else: else:
assert re.match("^[0-9]+$", value) numeric_changes[i] = float(value)
numeric_changes[i] = int(value) assert numeric_changes[i] not in (0, -1)
elif k == 9: elif k == 9:
if value == 'Y': if value == 'Y':
mirrored_changes[i] = '1' mirrored_changes[i] = '1'
...@@ -711,8 +815,6 @@ def merge_old_version(version, new, old): ...@@ -711,8 +815,6 @@ def merge_old_version(version, new, old):
# load a unicode-data file from disk # load a unicode-data file from disk
import sys
class UnicodeData: class UnicodeData:
# Record structure: # Record structure:
# [ID, name, category, combining, bidi, decomp, (6) # [ID, name, category, combining, bidi, decomp, (6)
...@@ -720,7 +822,7 @@ class UnicodeData: ...@@ -720,7 +822,7 @@ class UnicodeData:
# ISO-comment, uppercase, lowercase, titlecase, ea-width, (16) # ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
# derived-props] (17) # derived-props] (17)
def __init__(self, filename, exclusions, eastasianwidth, def __init__(self, filename, exclusions, eastasianwidth, unihan,
derivedprops, derivednormalizationprops=None, expand=1): derivedprops, derivednormalizationprops=None, expand=1):
self.changed = [] self.changed = []
file = open(filename) file = open(filename)
...@@ -830,6 +932,19 @@ class UnicodeData: ...@@ -830,6 +932,19 @@ class UnicodeData:
if table[i] is not None: if table[i] is not None:
table[i].append(quickchecks[i]) table[i].append(quickchecks[i])
for line in open(unihan, encoding='utf-8'):
if not line.startswith('U+'):
continue
code, tag, value = line.split(None, 3)[:3]
if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
'kOtherNumeric'):
continue
value = value.strip().replace(',', '')
i = int(code[2:], 16)
# Patch the numeric field
if table[i] is not None:
table[i][8] = value
def uselatin1(self): def uselatin1(self):
# restrict character range to ISO Latin 1 # restrict character range to ISO Latin 1
self.chars = list(range(256)) self.chars = list(range(256))
...@@ -979,7 +1094,6 @@ def splitbins(t, trace=0): ...@@ -979,7 +1094,6 @@ def splitbins(t, trace=0):
you'll get. you'll get.
""" """
import sys
if trace: if trace:
def dump(t1, t2, shift, bytes): def dump(t1, t2, shift, bytes):
print("%d+%d bins at shift %d; %d bytes" % ( print("%d+%d bins at shift %d; %d bytes" % (
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment