Commit 71f660e0 authored by Benjamin Peterson

update to Unicode 6.1

parent 16fa2a10
...@@ -21,7 +21,7 @@ errors = 'surrogatepass' ...@@ -21,7 +21,7 @@ errors = 'surrogatepass'
class UnicodeMethodsTest(unittest.TestCase): class UnicodeMethodsTest(unittest.TestCase):
# update this, if the database changes # update this, if the database changes
expectedchecksum = 'df0b3ca6785a070b21f837b227dbdbdff3c2e921' expectedchecksum = 'bf7a78f1a532421b5033600102e23a92044dbba9'
def test_method_checksum(self): def test_method_checksum(self):
h = hashlib.sha1() h = hashlib.sha1()
...@@ -80,7 +80,7 @@ class UnicodeDatabaseTest(unittest.TestCase): ...@@ -80,7 +80,7 @@ class UnicodeDatabaseTest(unittest.TestCase):
class UnicodeFunctionsTest(UnicodeDatabaseTest): class UnicodeFunctionsTest(UnicodeDatabaseTest):
# update this, if the database changes # update this, if the database changes
expectedchecksum = 'c23dfc0b5eaf3ca2aad32d733de96bb182ccda50' expectedchecksum = '17fe2f12b788e4fff5479b469c4404bb6ecf841f'
def test_function_checksum(self): def test_function_checksum(self):
data = [] data = []
h = hashlib.sha1() h = hashlib.sha1()
......
...@@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1? ...@@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
Core and Builtins Core and Builtins
----------------- -----------------
- Upgrade Unicode data to Unicode 6.1.
- Issue #14040: Remove rarely used file name suffixes for C extensions - Issue #14040: Remove rarely used file name suffixes for C extensions
(under POSIX mainly). (under POSIX mainly).
......
...@@ -921,7 +921,7 @@ is_unified_ideograph(Py_UCS4 code) ...@@ -921,7 +921,7 @@ is_unified_ideograph(Py_UCS4 code)
{ {
return return
(0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */ (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
(0x4E00 <= code && code <= 0x9FCB) || /* CJK Ideograph */ (0x4E00 <= code && code <= 0x9FCC) || /* CJK Ideograph */
(0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */ (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
(0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */ (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
(0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */ (0x2B740 <= code && code <= 0x2B81D); /* CJK Ideograph Extension D */
......
This diff is collapsed.
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -38,7 +38,7 @@ SCRIPT = sys.argv[0] ...@@ -38,7 +38,7 @@ SCRIPT = sys.argv[0]
VERSION = "3.2" VERSION = "3.2"
# The Unicode Database # The Unicode Database
UNIDATA_VERSION = "6.0.0" UNIDATA_VERSION = "6.1.0"
UNICODE_DATA = "UnicodeData%s.txt" UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt" COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt" EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
...@@ -58,7 +58,7 @@ PUA_16 = range(0x100000, 0x10FFFE) ...@@ -58,7 +58,7 @@ PUA_16 = range(0x100000, 0x10FFFE)
# we use this ranges of PUA_15 to store name aliases and named sequences # we use this ranges of PUA_15 to store name aliases and named sequences
NAME_ALIASES_START = 0xF0000 NAME_ALIASES_START = 0xF0000
NAMED_SEQUENCES_START = 0xF0100 NAMED_SEQUENCES_START = 0xF0200
old_versions = ["3.2.0"] old_versions = ["3.2.0"]
...@@ -95,7 +95,7 @@ EXTENDED_CASE_MASK = 0x4000 ...@@ -95,7 +95,7 @@ EXTENDED_CASE_MASK = 0x4000
# these ranges need to match unicodedata.c:is_unified_ideograph # these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [ cjk_ranges = [
('3400', '4DB5'), ('3400', '4DB5'),
('4E00', '9FCB'), ('4E00', '9FCC'),
('20000', '2A6D6'), ('20000', '2A6D6'),
('2A700', '2B734'), ('2A700', '2B734'),
('2B740', '2B81D') ('2B740', '2B81D')
...@@ -958,7 +958,7 @@ class UnicodeData: ...@@ -958,7 +958,7 @@ class UnicodeData:
s = s.strip() s = s.strip()
if not s or s.startswith('#'): if not s or s.startswith('#'):
continue continue
char, name = s.split(';') char, name, abbrev = s.split(';')
char = int(char, 16) char = int(char, 16)
self.aliases.append((name, char)) self.aliases.append((name, char))
# also store the name in the PUA 1 # also store the name in the PUA 1
...@@ -971,6 +971,7 @@ class UnicodeData: ...@@ -971,6 +971,7 @@ class UnicodeData:
# in order to take advantage of the compression and lookup # in order to take advantage of the compression and lookup
# algorithms used for the other characters. # algorithms used for the other characters.
assert pua_index < NAMED_SEQUENCES_START
pua_index = NAMED_SEQUENCES_START pua_index = NAMED_SEQUENCES_START
with open_data(NAMED_SEQUENCES, version) as file: with open_data(NAMED_SEQUENCES, version) as file:
for s in file: for s in file:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment