Commit 71f660e0 authored by Benjamin Peterson's avatar Benjamin Peterson

update to Unicode 6.1

parent 16fa2a10
......@@ -21,7 +21,7 @@ errors = 'surrogatepass'
class UnicodeMethodsTest(unittest.TestCase):
# update this, if the database changes
expectedchecksum = 'df0b3ca6785a070b21f837b227dbdbdff3c2e921'
expectedchecksum = 'bf7a78f1a532421b5033600102e23a92044dbba9'
def test_method_checksum(self):
h = hashlib.sha1()
......@@ -80,7 +80,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
class UnicodeFunctionsTest(UnicodeDatabaseTest):
# update this, if the database changes
expectedchecksum = 'c23dfc0b5eaf3ca2aad32d733de96bb182ccda50'
expectedchecksum = '17fe2f12b788e4fff5479b469c4404bb6ecf841f'
def test_function_checksum(self):
data = []
h = hashlib.sha1()
......
......@@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
Core and Builtins
-----------------
- Upgrade Unicode data to Unicode 6.1.
- Issue #14040: Remove rarely used file name suffixes for C extensions
(under POSIX mainly).
......
......@@ -921,7 +921,7 @@ is_unified_ideograph(Py_UCS4 code)
{
return
(0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
(0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph */
(0x4E00 <= code && code <= 0x9FCC) || /* CJK Ideograph */
(0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
(0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
(0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -38,7 +38,7 @@ SCRIPT = sys.argv[0]
VERSION = "3.2"
# The Unicode Database
UNIDATA_VERSION = "6.0.0"
UNIDATA_VERSION = "6.1.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
......@@ -58,7 +58,7 @@ PUA_16 = range(0x100000, 0x10FFFE)
# we use this ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0100
NAMED_SEQUENCES_START = 0xF0200
old_versions = ["3.2.0"]
......@@ -95,7 +95,7 @@ EXTENDED_CASE_MASK = 0x4000
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
('3400', '4DB5'),
('4E00', '9FCB'),
('4E00', '9FCC'),
('20000', '2A6D6'),
('2A700', '2B734'),
('2B740', '2B81D')
......@@ -958,7 +958,7 @@ class UnicodeData:
s = s.strip()
if not s or s.startswith('#'):
continue
char, name = s.split(';')
char, name, abbrev = s.split(';')
char = int(char, 16)
self.aliases.append((name, char))
# also store the name in the PUA 1
......@@ -971,6 +971,7 @@ class UnicodeData:
# in order to take advantage of the compression and lookup
# algorithms used for the other characters.
assert pua_index < NAMED_SEQUENCES_START
pua_index = NAMED_SEQUENCES_START
with open_data(NAMED_SEQUENCES, version) as file:
for s in file:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment