Commit 67752315 authored by Benjamin Peterson

Unicode 9.0.0

Not completely mechanical since support for East Asian Width changes—emoji
codepoints became Wide—had to be added to unicodedata.
parent 7ec64562
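The behaviour change called out in the commit message is easy to see from Python once the new database is in place. A minimal sketch, assuming a CPython build that includes this commit: U+231A (WATCH) is one of the emoji codepoints whose East Asian Width becomes Wide in 9.0.0, while the frozen 3.2.0 view keeps the old value (the same values the new test below checks).

    import unicodedata

    # U+231A (WATCH): its East Asian Width changed to Wide in Unicode 9.0.0.
    print(unicodedata.east_asian_width('\u231a'))            # 'W' with the 9.0.0 database
    print(unicodedata.ucd_3_2_0.east_asian_width('\u231a'))  # 'N' in the frozen 3.2.0 view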
@@ -17,8 +17,8 @@
 This module provides access to the Unicode Character Database (UCD) which
 defines character properties for all Unicode characters. The data contained in
-this database is compiled from the `UCD version 8.0.0
-<http://www.unicode.org/Public/8.0.0/ucd>`_.
+this database is compiled from the `UCD version 9.0.0
+<http://www.unicode.org/Public/9.0.0/ucd>`_.
 The module uses the same names and symbols as defined by Unicode
 Standard Annex #44, `"Unicode Character Database"
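One quick way to confirm which UCD release a given interpreter was built against is the module's version attributes; a minimal check, with the outputs expected once this change is in:

    import unicodedata

    # unidata_version names the UCD release the internal database was generated from.
    print(unicodedata.unidata_version)            # '9.0.0' after this upgrade
    print(unicodedata.ucd_3_2_0.unidata_version)  # '3.2.0', the frozen compatibility view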
@@ -168,6 +168,6 @@ Examples:
 .. rubric:: Footnotes
-.. [#] http://www.unicode.org/Public/8.0.0/ucd/NameAliases.txt
+.. [#] http://www.unicode.org/Public/9.0.0/ucd/NameAliases.txt
-.. [#] http://www.unicode.org/Public/8.0.0/ucd/NamedSequences.txt
+.. [#] http://www.unicode.org/Public/9.0.0/ucd/NamedSequences.txt
@@ -966,6 +966,13 @@ representing :class:`contextlib.AbstractContextManager`.
 (Contributed by Brett Cannon in :issue:`25609`.)
+unicodedata
+-----------
+The internal database has been upgraded to use Unicode 9.0.0. (Contributed by
+Benjamin Peterson.)
 unittest.mock
 -------------
@@ -20,7 +20,7 @@ errors = 'surrogatepass'
 class UnicodeMethodsTest(unittest.TestCase):
     # update this, if the database changes
-    expectedchecksum = '5971760872b2f98bb9c701e6c0db3273d756b3ec'
+    expectedchecksum = 'c1fa98674a683aa8a8d8dee0c84494f8d36346e6'
     def test_method_checksum(self):
         h = hashlib.sha1()
@@ -80,7 +80,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
     # Update this if the database changes. Make sure to do a full rebuild
     # (e.g. 'make distclean && make') to get the correct checksum.
-    expectedchecksum = '5e74827cd07f9e546a30f34b7bcf6cc2eac38c8c'
+    expectedchecksum = 'f891b1e6430c712531b9bc935a38e22d78ba1bf3'
     def test_function_checksum(self):
         data = []
         h = hashlib.sha1()
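For context on why these pinned digests change with every database upgrade: each test walks the full codepoint range, feeds the results of unicodedata lookups into a SHA-1, and compares the hex digest against the constant above, so any change in the underlying tables (such as 8.0.0 to 9.0.0) produces a new checksum. A simplified sketch of the idea; the real tests hash more properties than shown here:

    import hashlib
    import sys
    import unicodedata

    # Hash a few per-codepoint properties; any database change alters the digest.
    h = hashlib.sha1()
    for cp in range(sys.maxunicode + 1):
        char = chr(cp)
        record = [
            unicodedata.category(char),
            unicodedata.east_asian_width(char),
            str(unicodedata.combining(char)),
        ]
        h.update(''.join(record).encode('ascii'))
    print(h.hexdigest())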
@@ -222,6 +222,10 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
         self.assertEqual(eaw('\u2010'), 'A')
         self.assertEqual(eaw('\U00020000'), 'W')
+    def test_east_asian_width_9_0_changes(self):
+        self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
+        self.assertEqual(self.db.east_asian_width('\u231a'), 'W')
 class UnicodeMiscTest(UnicodeDatabaseTest):
     def test_failed_import_during_compiling(self):
@@ -10,6 +10,8 @@ What's New in Python 3.6.0 beta 2
 Core and Builtins
 -----------------
+- Upgrade internal unicode databases to Unicode version 9.0.0.
 - Issue #28131: Fix a regression in zipimport's compile_source(). zipimport
   should use the same optimization level as the interpreter.
@@ -45,6 +45,7 @@ typedef struct change_record {
     const unsigned char category_changed;
     const unsigned char decimal_changed;
     const unsigned char mirrored_changed;
+    const unsigned char east_asian_width_changed;
     const double numeric_changed;
 } change_record;
@@ -375,6 +376,8 @@ unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
         const change_record *old = get_old_record(self, c);
         if (old->category_changed == 0)
             index = 0; /* unassigned */
+        else if (old->east_asian_width_changed != 0xFF)
+            index = old->east_asian_width_changed;
     }
     return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
 }
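The C change above gives the frozen 3.2.0 view a per-codepoint delta for East Asian Width: 0xFF in east_asian_width_changed means "same as the current database", any other value is an index into the width-name table. A hypothetical Python rendering of that control flow, with made-up names purely for illustration:

    from collections import namedtuple

    # Illustrative stand-in for the C change_record; field names mirror the struct.
    ChangeRecord = namedtuple('ChangeRecord', 'category_changed east_asian_width_changed')
    UNCHANGED = 0xFF  # sentinel meaning "property did not change"

    def old_east_asian_width(current_index, old_record, width_names):
        """Resolve the width a codepoint had in the frozen 3.2.0 database."""
        index = current_index
        if old_record.category_changed == 0:
            index = 0  # the codepoint was unassigned in 3.2.0
        elif old_record.east_asian_width_changed != UNCHANGED:
            index = old_record.east_asian_width_changed
        return width_names[index]

    # Example: U+231A is 'W' now, but its 3.2.0 record points back at 'N'.
    names = ['F', 'H', 'W', 'Na', 'A', 'N']  # ordering is illustrative only
    record = ChangeRecord(category_changed=1, east_asian_width_changed=names.index('N'))
    print(old_east_asian_width(names.index('W'), record, names))  # -> 'N'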
(Two diffs at this point are too large to display and a third is collapsed in the original view.)
@@ -42,7 +42,7 @@ VERSION = "3.2"
 # * Doc/library/stdtypes.rst, and
 # * Doc/library/unicodedata.rst
 # * Doc/reference/lexical_analysis.rst (two occurrences)
-UNIDATA_VERSION = "8.0.0"
+UNIDATA_VERSION = "9.0.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
@@ -796,6 +796,7 @@ def merge_old_version(version, new, old):
     category_changes = [0xFF]*0x110000
     decimal_changes = [0xFF]*0x110000
     mirrored_changes = [0xFF]*0x110000
+    east_asian_width_changes = [0xFF]*0x110000
     # In numeric data, 0 means "no change",
     # -1 means "did not have a numeric value
     numeric_changes = [0] * 0x110000
@@ -862,6 +863,9 @@ def merge_old_version(version, new, old):
                 elif k == 14:
                     # change to simple titlecase mapping; ignore
                     pass
+                elif k == 15:
+                    # change to east asian width
+                    east_asian_width_changes[i] = EASTASIANWIDTH_NAMES.index(value)
                 elif k == 16:
                     # derived property changes; not yet
                     pass
@@ -873,8 +877,9 @@ def merge_old_version(version, new, old):
                     class Difference(Exception):pass
                     raise Difference(hex(i), k, old.table[i], new.table[i])
     new.changed.append((version, list(zip(bidir_changes, category_changes,
-                                          decimal_changes, mirrored_changes,
-                                          numeric_changes)),
+                                          decimal_changes, mirrored_changes,
+                                          east_asian_width_changes,
+                                          numeric_changes)),
                        normalization_changes))
 def open_data(template, version):
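The generator change above mirrors the C side: merge_old_version records, for every codepoint, whether East Asian Width differed between the old 3.2.0 tables and the new ones, again using 0xFF for "no change", and threads that array into the zip() that builds the change records. A stripped-down sketch of that bookkeeping; the function name, arguments, and width-name ordering are illustrative, not the real makeunicodedata.py code:

    # 0xFF means "East Asian Width did not change between the old and new database".
    WIDTH_NAMES = ['F', 'H', 'W', 'Na', 'A', 'N']  # illustrative ordering

    def build_east_asian_width_deltas(old_widths, new_widths):
        """old_widths/new_widths map each of the 0x110000 codepoints to a width
        name; return the per-codepoint change array keyed by the old value."""
        deltas = [0xFF] * 0x110000
        for cp in range(0x110000):
            if old_widths[cp] != new_widths[cp]:
                deltas[cp] = WIDTH_NAMES.index(old_widths[cp])
        return deltas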
@@ -652,7 +652,8 @@ class PyBuildExt(build_ext):
         # profiler (_lsprof is for cProfile.py)
         exts.append( Extension('_lsprof', ['_lsprof.c', 'rotatingtree.c']) )
         # static Unicode character database
-        exts.append( Extension('unicodedata', ['unicodedata.c']) )
+        exts.append( Extension('unicodedata', ['unicodedata.c'],
+                               depends=['unicodedata_db.h', 'unicodename_db.h']) )
         # _opcode module
         exts.append( Extension('_opcode', ['_opcode.c']) )