Commit 67752315 authored by Benjamin Peterson

Unicode 9.0.0

Not completely mechanical since support for East Asian Width changes—emoji
codepoints became Wide—had to be added to unicodedata.
parent 7ec64562
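The behaviour change called out in the commit message is easy to see from Python once the new database is in place. A minimal sketch, assuming a CPython build that includes this commit: U+231A (WATCH) is one of the emoji codepoints whose East Asian Width becomes Wide in 9.0.0, while the frozen 3.2.0 view keeps the old value (the same values the new test below checks).

    import unicodedata

    # U+231A (WATCH): its East Asian Width changed to Wide in Unicode 9.0.0.
    print(unicodedata.east_asian_width('\u231a'))            # 'W' with the 9.0.0 database
    print(unicodedata.ucd_3_2_0.east_asian_width('\u231a'))  # 'N' in the frozen 3.2.0 view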
@@ -17,8 +17,8 @@
 This module provides access to the Unicode Character Database (UCD) which
 defines character properties for all Unicode characters. The data contained in
-this database is compiled from the `UCD version 8.0.0
-<http://www.unicode.org/Public/8.0.0/ucd>`_.
+this database is compiled from the `UCD version 9.0.0
+<http://www.unicode.org/Public/9.0.0/ucd>`_.
 The module uses the same names and symbols as defined by Unicode
 Standard Annex #44, `"Unicode Character Database"
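One quick way to confirm which UCD release a given interpreter was built against is the module's version attributes; a minimal check, with the outputs expected once this change is in:

    import unicodedata

    # unidata_version names the UCD release the internal database was generated from.
    print(unicodedata.unidata_version)            # '9.0.0' after this upgrade
    print(unicodedata.ucd_3_2_0.unidata_version)  # '3.2.0', the frozen compatibility view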
@@ -168,6 +168,6 @@ Examples:
 .. rubric:: Footnotes
-.. [#] http://www.unicode.org/Public/8.0.0/ucd/NameAliases.txt
+.. [#] http://www.unicode.org/Public/9.0.0/ucd/NameAliases.txt
-.. [#] http://www.unicode.org/Public/8.0.0/ucd/NamedSequences.txt
+.. [#] http://www.unicode.org/Public/9.0.0/ucd/NamedSequences.txt
@@ -966,6 +966,13 @@ representing :class:`contextlib.AbstractContextManager`.
 (Contributed by Brett Cannon in :issue:`25609`.)
+unicodedata
+-----------
+The internal database has been upgraded to use Unicode 9.0.0. (Contributed by
+Benjamin Peterson.)
 unittest.mock
 -------------
@@ -20,7 +20,7 @@ errors = 'surrogatepass'
 class UnicodeMethodsTest(unittest.TestCase):
     # update this, if the database changes
-    expectedchecksum = '5971760872b2f98bb9c701e6c0db3273d756b3ec'
+    expectedchecksum = 'c1fa98674a683aa8a8d8dee0c84494f8d36346e6'
     def test_method_checksum(self):
         h = hashlib.sha1()
@@ -80,7 +80,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
     # Update this if the database changes. Make sure to do a full rebuild
     # (e.g. 'make distclean && make') to get the correct checksum.
-    expectedchecksum = '5e74827cd07f9e546a30f34b7bcf6cc2eac38c8c'
+    expectedchecksum = 'f891b1e6430c712531b9bc935a38e22d78ba1bf3'
     def test_function_checksum(self):
         data = []
         h = hashlib.sha1()
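For context on why these pinned digests change with every database upgrade: each test walks the full codepoint range, feeds the results of unicodedata lookups into a SHA-1, and compares the hex digest against the constant above, so any change in the underlying tables (such as 8.0.0 to 9.0.0) produces a new checksum. A simplified sketch of the idea; the real tests hash more properties than shown here:

    import hashlib
    import sys
    import unicodedata

    # Hash a few per-codepoint properties; any database change alters the digest.
    h = hashlib.sha1()
    for cp in range(sys.maxunicode + 1):
        char = chr(cp)
        record = [
            unicodedata.category(char),
            unicodedata.east_asian_width(char),
            str(unicodedata.combining(char)),
        ]
        h.update(''.join(record).encode('ascii'))
    print(h.hexdigest())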
@@ -222,6 +222,10 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
         self.assertEqual(eaw('\u2010'), 'A')
         self.assertEqual(eaw('\U00020000'), 'W')
+    def test_east_asian_width_9_0_changes(self):
+        self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
+        self.assertEqual(self.db.east_asian_width('\u231a'), 'W')
 class UnicodeMiscTest(UnicodeDatabaseTest):
     def test_failed_import_during_compiling(self):
@@ -10,6 +10,8 @@ What's New in Python 3.6.0 beta 2
 Core and Builtins
 -----------------
+- Upgrade internal unicode databases to Unicode version 9.0.0.
 - Issue #28131: Fix a regression in zipimport's compile_source(). zipimport
   should use the same optimization level as the interpreter.
@@ -45,6 +45,7 @@ typedef struct change_record {
     const unsigned char category_changed;
     const unsigned char decimal_changed;
     const unsigned char mirrored_changed;
+    const unsigned char east_asian_width_changed;
     const double numeric_changed;
 } change_record;
@@ -375,6 +376,8 @@ unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
         const change_record *old = get_old_record(self, c);
         if (old->category_changed == 0)
             index = 0; /* unassigned */
+        else if (old->east_asian_width_changed != 0xFF)
+            index = old->east_asian_width_changed;
     }
     return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
 }
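The C change above gives the frozen 3.2.0 view a per-codepoint delta for East Asian Width: 0xFF in east_asian_width_changed means "same as the current database", any other value is an index into the width-name table. A hypothetical Python rendering of that control flow, with made-up names purely for illustration:

    from collections import namedtuple

    # Illustrative stand-in for the C change_record; field names mirror the struct.
    ChangeRecord = namedtuple('ChangeRecord', 'category_changed east_asian_width_changed')
    UNCHANGED = 0xFF  # sentinel meaning "property did not change"

    def old_east_asian_width(current_index, old_record, width_names):
        """Resolve the width a codepoint had in the frozen 3.2.0 database."""
        index = current_index
        if old_record.category_changed == 0:
            index = 0  # the codepoint was unassigned in 3.2.0
        elif old_record.east_asian_width_changed != UNCHANGED:
            index = old_record.east_asian_width_changed
        return width_names[index]

    # Example: U+231A is 'W' now, but its 3.2.0 record points back at 'N'.
    names = ['F', 'H', 'W', 'Na', 'A', 'N']  # ordering is illustrative only
    record = ChangeRecord(category_changed=1, east_asian_width_changed=names.index('N'))
    print(old_east_asian_width(names.index('W'), record, names))  # -> 'N'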
(Two diffs at this point are too large to display and a third is collapsed in the original view.)
@@ -42,7 +42,7 @@ VERSION = "3.2"
 # * Doc/library/stdtypes.rst, and
 # * Doc/library/unicodedata.rst
 # * Doc/reference/lexical_analysis.rst (two occurrences)
-UNIDATA_VERSION = "8.0.0"
+UNIDATA_VERSION = "9.0.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
@@ -796,6 +796,7 @@ def merge_old_version(version, new, old):
     category_changes = [0xFF]*0x110000
     decimal_changes = [0xFF]*0x110000
     mirrored_changes = [0xFF]*0x110000
+    east_asian_width_changes = [0xFF]*0x110000
     # In numeric data, 0 means "no change",
     # -1 means "did not have a numeric value
     numeric_changes = [0] * 0x110000
@@ -862,6 +863,9 @@ def merge_old_version(version, new, old):
                 elif k == 14:
                     # change to simple titlecase mapping; ignore
                     pass
+                elif k == 15:
+                    # change to east asian width
+                    east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                 elif k == 16:
                     # derived property changes; not yet
                     pass
@@ -873,8 +877,9 @@ def merge_old_version(version, new, old):
                     class Difference(Exception):pass
                     raise Difference(hex(i), k, old.table[i], new.table[i])
     new.changed.append((version, list(zip(bidir_changes, category_changes,
-                                          decimal_changes, mirrored_changes,
-                                          numeric_changes)),
+                                          decimal_changes, mirrored_changes,
+                                          east_asian_width_changes,
+                                          numeric_changes)),
                        normalization_changes))
 def open_data(template, version):
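The generator change above mirrors the C side: merge_old_version records, for every codepoint, whether East Asian Width differed between the old 3.2.0 tables and the new ones, again using 0xFF for "no change", and threads that array into the zip() that builds the change records. A stripped-down sketch of that bookkeeping; the function name, arguments, and width-name ordering are illustrative, not the real makeunicodedata.py code:

    # 0xFF means "East Asian Width did not change between the old and new database".
    WIDTH_NAMES = ['F', 'H', 'W', 'Na', 'A', 'N']  # illustrative ordering

    def build_east_asian_width_deltas(old_widths, new_widths):
        """old_widths/new_widths map each of the 0x110000 codepoints to a width
        name; return the per-codepoint change array keyed by the old value."""
        deltas = [0xFF] * 0x110000
        for cp in range(0x110000):
            if old_widths[cp] != new_widths[cp]:
                deltas[cp] = WIDTH_NAMES.index(old_widths[cp])
        return deltas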
@@ -652,7 +652,8 @@ class PyBuildExt(build_ext):
         # profiler (_lsprof is for cProfile.py)
         exts.append( Extension('_lsprof', ['_lsprof.c', 'rotatingtree.c']) )
         # static Unicode character database
-        exts.append( Extension('unicodedata', ['unicodedata.c']) )
+        exts.append( Extension('unicodedata', ['unicodedata.c'],
+                               depends=['unicodedata_db.h', 'unicodename_db.h']) )
         # _opcode module
         exts.append( Extension('_opcode', ['_opcode.c']) )