Commit baecd724 authored by Martin v. Löwis's avatar Martin v. Löwis

Upgrade to Unicode 6.0.0.

makeunicodedata.py: download all data files from unicode.org,
  switch to extracting Unihan data from zip file.
  Read linebreakprops and derivednormalizationprops even for
  old versions, even though they are not used in delta records.
test_unicode.py: U+11000 is now assigned, use U+14000 instead.
parent e8930228
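In outline, the download logic described above caches each UCD file next to the script and fetches it from unicode.org only when the local copy is missing. A minimal sketch of that pattern (the helper name fetch_ucd is illustrative only; the patch itself adds an open_data() helper, visible in the makeunicodedata.py hunks below):

import os
import urllib.request

def fetch_ucd(template, version):
    # Illustrative stand-in for the open_data() helper added below:
    # cache the file locally, downloading it on first use.
    local = template % ('-' + version,)        # e.g. 'UnicodeData-6.0.0.txt'
    if not os.path.exists(local):
        url = ('http://www.unicode.org/Public/%s/ucd/' + template) % (version, '')
        urllib.request.urlretrieve(url, filename=local)
    # Unihan now ships as a zip archive, so open it in binary mode.
    if local.endswith('.zip'):
        return open(local, 'rb')
    return open(local, encoding='utf-8')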
@@ -1349,7 +1349,7 @@ class UnicodeTest(string_tests.CommonTest,
 
     def test_printable_repr(self):
         self.assertEqual(repr('\U00010000'), "'%c'" % (0x10000,)) # printable
-        self.assertEqual(repr('\U00011000'), "'\\U00011000'") # nonprintable
+        self.assertEqual(repr('\U00014000'), "'\\U00014000'") # nonprintable
 
     def test_expandtabs_overflows_gracefully(self):
         # This test only affects 32-bit platforms because expandtabs can only take
...
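The test_unicode.py change above reflects that U+11000 is assigned in Unicode 6.0.0 (and hence printable), while U+14000 remains unassigned; roughly, on a build carrying this database:

>>> repr('\U00010000')        # assigned, printable: shown as the character
"'𐀀'"
>>> repr('\U00014000')        # unassigned: repr falls back to the escape
"'\\U00014000'"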
@@ -21,7 +21,7 @@ errors = 'surrogatepass'
 class UnicodeMethodsTest(unittest.TestCase):
 
     # update this, if the database changes
-    expectedchecksum = '4504dffd035baea02c5b9de82bebc3d65e0e0baf'
+    expectedchecksum = '21b90f1aed00081b81ca7942b22196af090015a0'
 
     def test_method_checksum(self):
         h = hashlib.sha1()
@@ -80,7 +80,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
 class UnicodeFunctionsTest(UnicodeDatabaseTest):
 
     # update this, if the database changes
-    expectedchecksum = 'e89a6380093a00a7685ac7b92e7367d737fcb79b'
+    expectedchecksum = 'c23dfc0b5eaf3ca2aad32d733de96bb182ccda50'
     def test_function_checksum(self):
         data = []
         h = hashlib.sha1()
...
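Both expectedchecksum updates above follow from the same mechanism: the tests feed the result of string methods (UnicodeMethodsTest) or unicodedata functions (UnicodeFunctionsTest) for every code point into a SHA-1 digest, so any database upgrade changes the expected value. Schematically (not the exact test loop):

import hashlib
import unicodedata

h = hashlib.sha1()
for i in range(0x110000):
    # any per-code-point property feeds the digest; the real tests hash
    # string-method results and unicodedata function results respectively
    h.update(unicodedata.category(chr(i)).encode('ascii'))
print(h.hexdigest())   # changes with every UCD upgrade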
@@ -25,17 +25,17 @@
 #   written by Fredrik Lundh (fredrik@pythonware.com)
 #
 
-import sys
+import sys, os, zipfile
 
 SCRIPT = sys.argv[0]
 VERSION = "3.2"
 
 # The Unicode Database
-UNIDATA_VERSION = "5.2.0"
+UNIDATA_VERSION = "6.0.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
-UNIHAN = "Unihan%s.txt"
+UNIHAN = "Unihan%s.zip"
 DERIVED_CORE_PROPERTIES = "DerivedCoreProperties%s.txt"
 DERIVEDNORMALIZATION_PROPS = "DerivedNormalizationProps%s.txt"
 LINE_BREAK = "LineBreak%s.txt"
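Each template above takes a version infix: it is empty for the current version (matching the names served under .../ucd/), and '-<version>' for locally cached copies of older releases. For instance:

>>> UNICODE_DATA % ''
'UnicodeData.txt'
>>> UNICODE_DATA % '-3.2.0'
'UnicodeData-3.2.0.txt'
>>> UNIHAN % '-6.0.0'
'Unihan-6.0.0.zip'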
@@ -75,23 +75,13 @@ def maketables(trace=0):
 
     print("--- Reading", UNICODE_DATA % "", "...")
 
     version = ""
-    unicode = UnicodeData(UNICODE_DATA % version,
-                          COMPOSITION_EXCLUSIONS % version,
-                          EASTASIAN_WIDTH % version,
-                          UNIHAN % version,
-                          DERIVED_CORE_PROPERTIES % version,
-                          DERIVEDNORMALIZATION_PROPS % version,
-                          LINE_BREAK % version)
+    unicode = UnicodeData(UNIDATA_VERSION)
 
     print(len(list(filter(None, unicode.table))), "characters")
 
     for version in old_versions:
         print("--- Reading", UNICODE_DATA % ("-"+version), "...")
-        old_unicode = UnicodeData(UNICODE_DATA % ("-"+version),
-                                  COMPOSITION_EXCLUSIONS % ("-"+version),
-                                  EASTASIAN_WIDTH % ("-"+version),
-                                  UNIHAN % ("-"+version),
-                                  DERIVED_CORE_PROPERTIES % ("-"+version))
+        old_unicode = UnicodeData(version)
         print(len(list(filter(None, old_unicode.table))), "characters")
         merge_old_version(version, unicode, old_unicode)
@@ -771,6 +761,10 @@ def merge_old_version(version, new, old):
                    elif k == 16:
                        # derived property changes; not yet
                        pass
+                    elif k == 17:
+                        # normalization quickchecks are not performed
+                        # for older versions
+                        pass
                    else:
                        class Difference(Exception):pass
                        raise Difference(hex(i), k, old.table[i], new.table[i])
@@ -779,6 +773,21 @@ def merge_old_version(version, new, old):
                                           numeric_changes)),
                         normalization_changes))
 
+def open_data(template, version):
+    local = template % ('-'+version,)
+    if not os.path.exists(local):
+        import urllib.request
+        if version == '3.2.0':
+            # irregular url structure
+            url = 'http://www.unicode.org/Public/3.2-Update/' + local
+        else:
+            url = ('http://www.unicode.org/Public/%s/ucd/'+template) % (version, '')
+        urllib.request.urlretrieve(url, filename=local)
+    if local.endswith('.txt'):
+        return open(local, encoding='utf-8')
+    else:
+        # Unihan.zip
+        return open(local, 'rb')
+
 # --------------------------------------------------------------------
 # the following support code is taken from the unidb utilities
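With this helper, a historic data file is downloaded once and reused from the working directory afterwards; a hypothetical session (note the 3.2.0 special case above, which predates the versioned .../Public/<version>/ucd/ layout):

f = open_data(EASTASIAN_WIDTH, '3.2.0')
# first call fetches http://www.unicode.org/Public/3.2-Update/EastAsianWidth-3.2.0.txt;
# later calls just reopen the cached copy
print(f.name)   # 'EastAsianWidth-3.2.0.txt'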
@@ -793,11 +802,11 @@ class UnicodeData:
     #   ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
     #   derived-props] (17)
 
-    def __init__(self, filename, exclusions, eastasianwidth, unihan,
-                 derivedprops, derivednormalizationprops=None, linebreakprops=None,
+    def __init__(self, version,
+                 linebreakprops=False,
                  expand=1):
         self.changed = []
-        file = open(filename)
+        file = open_data(UNICODE_DATA, version)
         table = [None] * 0x110000
         while 1:
             s = file.readline()
@@ -825,11 +834,11 @@ class UnicodeData:
                 table[i] = f2
 
         # public attributes
-        self.filename = filename
+        self.filename = UNICODE_DATA % ''
         self.table = table
         self.chars = list(range(0x110000)) # unicode 3.2
 
-        file = open(exclusions)
+        file = open_data(COMPOSITION_EXCLUSIONS, version)
         self.exclusions = {}
         for s in file:
             s = s.strip()
@@ -841,7 +850,7 @@ class UnicodeData:
                     self.exclusions[char] = 1
 
         widths = [None] * 0x110000
-        for s in open(eastasianwidth):
+        for s in open_data(EASTASIAN_WIDTH, version):
             s = s.strip()
             if not s:
                 continue
@@ -862,7 +871,7 @@ class UnicodeData:
         for i in range(0, 0x110000):
             if table[i] is not None:
                 table[i].append(set())
-        for s in open(derivedprops):
+        for s in open_data(DERIVED_CORE_PROPERTIES, version):
             s = s.split('#', 1)[0].strip()
             if not s:
                 continue
@@ -881,43 +890,53 @@ class UnicodeData:
                 # apply to unassigned code points; ignore them
                 table[char][-1].add(p)
 
-        if linebreakprops:
-            for s in open(linebreakprops):
-                s = s.partition('#')[0]
-                s = [i.strip() for i in s.split(';')]
-                if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
-                    continue
-                if '..' not in s[0]:
-                    first = last = int(s[0], 16)
-                else:
-                    first, last = [int(c, 16) for c in s[0].split('..')]
-                for char in range(first, last+1):
-                    table[char][-1].add('Line_Break')
+        for s in open_data(LINE_BREAK, version):
+            s = s.partition('#')[0]
+            s = [i.strip() for i in s.split(';')]
+            if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
+                continue
+            if '..' not in s[0]:
+                first = last = int(s[0], 16)
+            else:
+                first, last = [int(c, 16) for c in s[0].split('..')]
+            for char in range(first, last+1):
+                table[char][-1].add('Line_Break')
 
-        if derivednormalizationprops:
-            quickchecks = [0] * 0x110000 # default is Yes
-            qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
-            for s in open(derivednormalizationprops):
-                if '#' in s:
-                    s = s[:s.index('#')]
-                s = [i.strip() for i in s.split(';')]
-                if len(s) < 2 or s[1] not in qc_order:
-                    continue
-                quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
-                quickcheck_shift = qc_order.index(s[1])*2
-                quickcheck <<= quickcheck_shift
-                if '..' not in s[0]:
-                    first = last = int(s[0], 16)
-                else:
-                    first, last = [int(c, 16) for c in s[0].split('..')]
-                for char in range(first, last+1):
-                    assert not (quickchecks[char]>>quickcheck_shift)&3
-                    quickchecks[char] |= quickcheck
-            for i in range(0, 0x110000):
-                if table[i] is not None:
-                    table[i].append(quickchecks[i])
+        # We only want the quickcheck properties
+        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
+        # Yes is the default, hence only N and M occur
+        # In 3.2.0, the format was different (NF?_NO)
+        # The parsing will incorrectly determine these as
+        # "yes", however, unicodedata.c will not perform quickchecks
+        # for older versions, and no delta records will be created.
+        quickchecks = [0] * 0x110000
+        qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
+        for s in open_data(DERIVEDNORMALIZATION_PROPS, version):
+            if '#' in s:
+                s = s[:s.index('#')]
+            s = [i.strip() for i in s.split(';')]
+            if len(s) < 2 or s[1] not in qc_order:
+                continue
+            quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
+            quickcheck_shift = qc_order.index(s[1])*2
+            quickcheck <<= quickcheck_shift
+            if '..' not in s[0]:
+                first = last = int(s[0], 16)
+            else:
+                first, last = [int(c, 16) for c in s[0].split('..')]
+            for char in range(first, last+1):
+                assert not (quickchecks[char]>>quickcheck_shift)&3
+                quickchecks[char] |= quickcheck
+        for i in range(0, 0x110000):
+            if table[i] is not None:
+                table[i].append(quickchecks[i])
 
-        for line in open(unihan, encoding='utf-8'):
+        zip = zipfile.ZipFile(open_data(UNIHAN, version))
+        if version == '3.2.0':
+            data = zip.open('Unihan-3.2.0.txt').read()
+        else:
+            data = zip.open('Unihan_NumericValues.txt').read()
+        for line in data.decode("utf-8").splitlines():
             if not line.startswith('U+'):
                 continue
             code, tag, value = line.split(None, 3)[:3]
...
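For reference, the quickcheck packing in the last hunk stores all four normalization properties in one small integer, two bits per property, ordered NFD_QC, NFKD_QC, NFC_QC, NFKC_QC (0 = Yes, 1 = Maybe, 2 = No). A decoding sketch (hypothetical helper, not part of the patch):

QC_ORDER = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()

def decode_quickchecks(packed):
    # undo the two-bits-per-property packing built above
    return {name: ('Yes', 'Maybe', 'No')[(packed >> 2*i) & 3]
            for i, name in enumerate(QC_ORDER)}

# a character that is NFC_QC=Maybe and NFKC_QC=No:
assert decode_quickchecks((1 << 4) | (2 << 6)) == {
    'NFD_QC': 'Yes', 'NFKD_QC': 'Yes', 'NFC_QC': 'Maybe', 'NFKC_QC': 'No'}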