Commit 7d520793 authored by Amaury Forgeot d'Arc

Merged revisions 75272-75273 via svnmerge from

svn+ssh://pythondev@svn.python.org/python/trunk

........
  r75272 | amaury.forgeotdarc | 2009-10-06 21:56:32 +0200 (mar., 06 oct. 2009) | 5 lines

  #1571184: makeunicodedata.py now generates the functions _PyUnicode_ToNumeric,
  _PyUnicode_IsLinebreak and _PyUnicode_IsWhitespace.

  It now also parses the Unihan.txt for numeric values.
........
  r75273 | amaury.forgeotdarc | 2009-10-06 22:02:09 +0200 (mar., 06 oct. 2009) | 2 lines

  Add Anders Chrigstrom to Misc/ACKS for his work on unicodedata.
........
parent e1b60d48
......@@ -21,7 +21,7 @@ errors = 'surrogatepass'
class UnicodeMethodsTest(unittest.TestCase):
# update this, if the database changes
expectedchecksum = '6ec65b65835614ec00634c674bba0e50cd32c189'
expectedchecksum = '0b915116051f3ed029a98542c2b7df63c9646272'
def test_method_checksum(self):
h = hashlib.sha1()
......@@ -80,7 +80,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
class UnicodeFunctionsTest(UnicodeDatabaseTest):
# update this, if the database changes
expectedchecksum = '3136d5afd787dc2bcb1bdcac95e385349fbebbca'
expectedchecksum = 'd4169ccff998ebbd1ec007a0b3fbd66e5ccf0229'
def test_function_checksum(self):
data = []
......@@ -119,6 +119,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
self.assertEqual(self.db.numeric('9'), 9)
self.assertEqual(self.db.numeric('\u215b'), 0.125)
self.assertEqual(self.db.numeric('\u2468'), 9.0)
self.assertEqual(self.db.numeric('\ua627'), 7.0)
self.assertEqual(self.db.numeric('\U00020000', None), None)
self.assertRaises(TypeError, self.db.numeric)
......
......@@ -131,6 +131,7 @@ Michael Chermside
Albert Chin-A-Young
Adal Chiriliuc
Matt Chisholm
Anders Chrigström
Tom Christiansen
Vadim Chugunov
David Cinege
......
......@@ -12,6 +12,11 @@ What's New in Python 3.2 Alpha 1?
Core and Builtins
-----------------
- Issue #1571184: The Unicode database contains properties for more characters.
The tables for code points representing numeric values, white spaces or line
breaks are now generated from the official Unicode Character Database files,
and include information from the Unihan.txt file.
- Issue #7019: Raise ValueError when unmarshalling bad long data, instead
of producing internally inconsistent Python longs.
......
......@@ -36,7 +36,7 @@ typedef struct change_record {
const unsigned char category_changed;
const unsigned char decimal_changed;
const unsigned char mirrored_changed;
const int numeric_changed;
const double numeric_changed;
} change_record;
/* data file generated by Tools/unicode/makeunicodedata.py */
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -35,6 +35,7 @@ UNIDATA_VERSION = "5.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
UNIHAN = "Unihan%s.txt"
DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
......@@ -64,6 +65,7 @@ XID_START_MASK = 0x100
XID_CONTINUE_MASK = 0x200
PRINTABLE_MASK = 0x400
NODELTA_MASK = 0x800
NUMERIC_MASK = 0x1000
def maketables(trace=0):
......@@ -73,6 +75,7 @@ def maketables(trace=0):
unicode = UnicodeData(UNICODE_DATA % version,
COMPOSITION_EXCLUSIONS % version,
EASTASIAN_WIDTH % version,
UNIHAN % version,
DERIVED_CORE_PROPERTIES % version,
DERIVEDNORMALIZATION_PROPS % version)
......@@ -83,6 +86,7 @@ def maketables(trace=0):
old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
COMPOSITION_EXCLUSIONS % ("-"+version),
EASTASIAN_WIDTH % ("-"+version),
UNIHAN % ("-"+version),
DERIVED_CORE_PROPERTIES % ("-"+version))
print(len(list(filter(None, old_unicode.table))), "characters")
merge_old_version(version, unicode, old_unicode)
......@@ -357,6 +361,9 @@ def makeunicodetype(unicode, trace):
table = [dummy]
cache = {0: dummy}
index = [0] * len(unicode.chars)
numeric = {}
spaces = []
linebreaks = []
for char in unicode.chars:
record = unicode.table[char]
......@@ -373,8 +380,10 @@ def makeunicodetype(unicode, trace):
flags |= LOWER_MASK
if category == "Zl" or bidirectional == "B":
flags |= LINEBREAK_MASK
linebreaks.append(char)
if category == "Zs" or bidirectional in ("WS", "B", "S"):
flags |= SPACE_MASK
spaces.append(char)
if category == "Lt":
flags |= TITLE_MASK
if category == "Lu":
......@@ -423,6 +432,9 @@ def makeunicodetype(unicode, trace):
if record[7]:
flags |= DIGIT_MASK
digit = int(record[7])
if record[8]:
flags |= NUMERIC_MASK
numeric.setdefault(record[8], []).append(char)
item = (
upper, lower, title, decimal, digit, flags
)
......@@ -434,6 +446,9 @@ def makeunicodetype(unicode, trace):
index[char] = i
print(len(table), "unique character type entries")
print(sum(map(len, numeric.values())), "numeric code points")
print(len(spaces), "whitespace code points")
print(len(linebreaks), "linebreak code points")
print("--- Writing", FILE, "...")
......@@ -455,6 +470,96 @@ def makeunicodetype(unicode, trace):
Array("index1", index1).dump(fp, trace)
Array("index2", index2).dump(fp, trace)
# Generate code for _PyUnicode_ToNumeric()
numeric_items = sorted(numeric.items())
print('/* Returns the numeric value as double for Unicode characters', file=fp)
print(' * having this property, -1.0 otherwise.', file=fp)
print(' */', file=fp)
print('double _PyUnicode_ToNumeric(Py_UNICODE ch)', file=fp)
print('{', file=fp)
print(' switch (ch) {', file=fp)
for value, codepoints in numeric_items:
haswide = False
hasnonewide = False
codepoints.sort()
for codepoint in codepoints:
if codepoint < 0x10000:
hasnonewide = True
if codepoint >= 0x10000 and not haswide:
print('#ifdef Py_UNICODE_WIDE', file=fp)
haswide = True
print(' case 0x%04X:' % (codepoint,), file=fp)
if haswide and hasnonewide:
print('#endif', file=fp)
print(' return (double) %s;' % (value,), file=fp)
if haswide and not hasnonewide:
print('#endif', file=fp)
print(' }', file=fp)
print(' return -1.0;', file=fp)
print('}', file=fp)
print(file=fp)
# Generate code for _PyUnicode_IsWhitespace()
print("/* Returns 1 for Unicode characters having the bidirectional", file=fp)
print(" * type 'WS', 'B' or 'S' or the category 'Zs', 0 otherwise.", file=fp)
print(" */", file=fp)
print('int _PyUnicode_IsWhitespace(register const Py_UNICODE ch)', file=fp)
print('{', file=fp)
print('#ifdef WANT_WCTYPE_FUNCTIONS', file=fp)
print(' return iswspace(ch);', file=fp)
print('#else', file=fp)
print(' switch (ch) {', file=fp)
haswide = False
hasnonewide = False
spaces.sort()
for codepoint in spaces:
if codepoint < 0x10000:
hasnonewide = True
if codepoint >= 0x10000 and not haswide:
print('#ifdef Py_UNICODE_WIDE', file=fp)
haswide = True
print(' case 0x%04X:' % (codepoint,), file=fp)
if haswide and hasnonewide:
print('#endif', file=fp)
print(' return 1;', file=fp)
if haswide and not hasnonewide:
print('#endif', file=fp)
print(' }', file=fp)
print(' return 0;', file=fp)
print('#endif', file=fp)
print('}', file=fp)
print(file=fp)
# Generate code for _PyUnicode_IsLinebreak()
print("/* Returns 1 for Unicode characters having the category 'Zl',", file=fp)
print(" * 'Zp' or type 'B', 0 otherwise.", file=fp)
print(" */", file=fp)
print('int _PyUnicode_IsLinebreak(register const Py_UNICODE ch)', file=fp)
print('{', file=fp)
print(' switch (ch) {', file=fp)
haswide = False
hasnonewide = False
linebreaks.sort()
for codepoint in linebreaks:
if codepoint < 0x10000:
hasnonewide = True
if codepoint >= 0x10000 and not haswide:
print('#ifdef Py_UNICODE_WIDE', file=fp)
haswide = True
print(' case 0x%04X:' % (codepoint,), file=fp)
if haswide and hasnonewide:
print('#endif', file=fp)
print(' return 1;', file=fp)
if haswide and not hasnonewide:
print('#endif', file=fp)
print(' }', file=fp)
print(' return 0;', file=fp)
print('}', file=fp)
print(file=fp)
fp.close()
# --------------------------------------------------------------------
......@@ -670,12 +775,11 @@ def merge_old_version(version, new, old):
elif k == 8:
# print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
# Since 0 encodes "no change", the old value is better not 0
assert value != "0" and value != "-1"
if not value:
numeric_changes[i] = -1
else:
assert re.match("^[0-9]+$", value)
numeric_changes[i] = int(value)
numeric_changes[i] = float(value)
assert numeric_changes[i] not in (0, -1)
elif k == 9:
if value == 'Y':
mirrored_changes[i] = '1'
......@@ -711,8 +815,6 @@ def merge_old_version(version, new, old):
# load a unicode-data file from disk
import sys
class UnicodeData:
# Record structure:
# [ID, name, category, combining, bidi, decomp, (6)
......@@ -720,7 +822,7 @@ class UnicodeData:
# ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
# derived-props] (17)
def __init__(self, filename, exclusions, eastasianwidth,
def __init__(self, filename, exclusions, eastasianwidth, unihan,
derivedprops, derivednormalizationprops=None, expand=1):
self.changed = []
file = open(filename)
......@@ -830,6 +932,19 @@ class UnicodeData:
if table[i] is not None:
table[i].append(quickchecks[i])
for line in open(unihan, encoding='utf-8'):
if not line.startswith('U+'):
continue
code, tag, value = line.split(None, 3)[:3]
if tag not in ('kAccountingNumeric', 'kPrimaryNumeric',
'kOtherNumeric'):
continue
value = value.strip().replace(',', '')
i = int(code[2:], 16)
# Patch the numeric field
if table[i] is not None:
table[i][8] = value
def uselatin1(self):
# restrict character range to ISO Latin 1
self.chars = list(range(256))
......@@ -979,7 +1094,6 @@ def splitbins(t, trace=0):
you'll get.
"""
import sys
if trace:
def dump(t1, t2, shift, bytes):
print("%d+%d bins at shift %d; %d bytes" % (
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment