upgrade unicode db to 6.3.0 (closes #19221)

7da80597 · Benjamin Peterson · f7102c19 · 7da80597 · 7da80597 · 7da80597
Commit 7da80597 authored Oct 10, 2013 by Benjamin Peterson
8 changed files
--- a/Doc/library/unicodedata.rst
+++ b/Doc/library/unicodedata.rst
@@ -15,8 +15,8 @@

 This module provides access to the Unicode Character Database (UCD) which
 defines character properties for all Unicode characters. The data contained in
-this database is compiled from the `UCD version 6.2.0
-<http://www.unicode.org/Public/6.2.0/ucd>`_.
+this database is compiled from the `UCD version 6.3.0
+<http://www.unicode.org/Public/6.3.0/ucd>`_.

 The module uses the same names and symbols as defined by Unicode
 Standard Annex #44, `"Unicode Character Database"
@@ -166,6 +166,6 @@ Examples:

 .. rubric:: Footnotes

-.. [#] http://www.unicode.org/Public/6.2.0/ucd/NameAliases.txt
+.. [#] http://www.unicode.org/Public/6.3.0/ucd/NameAliases.txt

-.. [#] http://www.unicode.org/Public/6.2.0/ucd/NamedSequences.txt
+.. [#] http://www.unicode.org/Public/6.3.0/ucd/NamedSequences.txt
--- a/Lib/test/test_unicodedata.py
+++ b/Lib/test/test_unicodedata.py
@@ -21,7 +21,7 @@ errors = 'surrogatepass'
 class UnicodeMethodsTest(unittest.TestCase):

    # update this, if the database changes
-    expectedchecksum = 'bf7a78f1a532421b5033600102e23a92044dbba9'
+    expectedchecksum = 'e74e878de71b6e780ffac271785c3cb58f6251f3'

    def test_method_checksum(self):
        h = hashlib.sha1()

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,8 @@ Projected release date: 2013-10-20
 Core and Builtins
 -----------------

+- Issue #19221: Upgrade Unicode database to version 6.3.0.
+
 - Issue #16742: The result of the C callback PyOS_ReadlineFunctionPointer must
  now be a string allocated by PyMem_RawMalloc() or PyMem_RawRealloc() (or NULL
  if an error occurred), instead of a string allocated by PyMem_Malloc() or

--- a/Modules/unicodedata.c
+++ b/Modules/unicodedata.c
@@ -1322,10 +1322,10 @@ PyDoc_STRVAR(unicodedata_docstring,
 "This module provides access to the Unicode Character Database which\n\
 defines character properties for all Unicode characters. The data in\n\
 this database is based on the UnicodeData.txt file version\n\
-6.0.0 which is publically available from ftp://ftp.unicode.org/.\n\
+6.3.0 which is publically available from ftp://ftp.unicode.org/.\n\
 \n\
 The module uses the same names and symbols as defined by the\n\
-UnicodeData File Format 6.0.0 (see\n\
+UnicodeData File Format 6.3.0 (see\n\
 http://www.unicode.org/reports/tr44/tr44-6.html).");



--- a/Modules/unicodedata_db.h
+++ b/Modules/unicodedata_db.h
--- a/Modules/unicodename_db.h
+++ b/Modules/unicodename_db.h
--- a/Objects/unicodetype_db.h
+++ b/Objects/unicodetype_db.h
@@ -1589,7 +1589,7 @@ static unsigned short index2[] = {
    55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 0, 0, 
    0, 0, 0, 55, 55, 55, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 21, 21, 
    21, 21, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 25, 25, 25, 25, 25, 25, 25, 25, 
-    25, 25, 25, 5, 0, 0, 5, 5, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
+    25, 25, 25, 5, 21, 0, 5, 5, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
    55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
    55, 55, 55, 96, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 25, 25, 25, 25, 
    25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 7, 8, 
@@ -1801,7 +1801,7 @@ static unsigned short index2[] = {
    25, 25, 25, 25, 25, 25, 25, 25, 5, 5, 5, 96, 5, 5, 5, 5, 55, 25, 0, 0, 7, 
    8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 27, 27, 27, 27, 27, 
    27, 27, 27, 27, 27, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
-    25, 25, 25, 2, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 
+    25, 25, 25, 21, 0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0, 
    55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
    55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 96, 
    55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
@@ -1828,7 +1828,7 @@ static unsigned short index2[] = {
    7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 132, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 
    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
    5, 5, 5, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
-    55, 55, 55, 55, 55, 55, 55, 25, 25, 18, 18, 18, 0, 0, 5, 5, 55, 55, 55, 
+    55, 55, 55, 55, 55, 55, 55, 25, 25, 18, 18, 25, 0, 0, 5, 5, 55, 55, 55, 
    55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
    55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 
    55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 55, 18, 25, 18, 25, 
@@ -1915,7 +1915,7 @@ static unsigned short index2[] = {
    5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 6, 3, 3, 21, 21, 21, 21, 21, 2, 5, 5, 
    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 18, 18, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
    5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 18, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 21, 
-    21, 21, 21, 21, 0, 0, 0, 0, 0, 21, 21, 21, 21, 21, 21, 245, 95, 0, 0, 
+    21, 21, 21, 21, 0, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 245, 95, 0, 0, 
    246, 247, 248, 249, 250, 251, 5, 5, 5, 5, 5, 95, 245, 26, 22, 23, 246, 
    247, 248, 249, 250, 251, 5, 5, 5, 5, 5, 0, 95, 95, 95, 95, 95, 95, 95, 
    95, 95, 95, 95, 95, 95, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 
@@ -2925,9 +2925,6 @@ static unsigned short index2[] = {
 double _PyUnicode_ToNumeric(Py_UCS4 ch)
 {
    switch (ch) {
-    case 0x12456:
-    case 0x12457:
-        return (double) -1.0;
    case 0x0F33:
        return (double) -1.0/2.0;
    case 0x0030:
@@ -3383,6 +3380,7 @@ double _PyUnicode_ToNumeric(Py_UCS4 ch)
    case 0x12435:
    case 0x1244A:
    case 0x12450:
+    case 0x12456:
    case 0x12459:
    case 0x1D361:
    case 0x1D7D0:
@@ -3539,6 +3537,7 @@ double _PyUnicode_ToNumeric(Py_UCS4 ch)
    case 0x1243B:
    case 0x1244B:
    case 0x12451:
+    case 0x12457:
    case 0x1D362:
    case 0x1D7D1:
    case 0x1D7DB:
@@ -4294,7 +4293,6 @@ int _PyUnicode_IsWhitespace(const Py_UCS4 ch)
    case 0x0085:
    case 0x00A0:
    case 0x1680:
-    case 0x180E:
    case 0x2000:
    case 0x2001:
    case 0x2002:

--- a/Tools/unicode/makeunicodedata.py
+++ b/Tools/unicode/makeunicodedata.py
@@ -37,7 +37,7 @@ SCRIPT = sys.argv[0]
 VERSION = "3.2"

 # The Unicode Database
-UNIDATA_VERSION = "6.2.0"
+UNIDATA_VERSION = "6.3.0"
 UNICODE_DATA = "UnicodeData%s.txt"
 COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
 EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
@@ -68,7 +68,7 @@ CATEGORY_NAMES = [ "Cn", "Lu", "Ll", "Lt", "Mn", "Mc", "Me", "Nd",

 BIDIRECTIONAL_NAMES = [ "", "L", "LRE", "LRO", "R", "AL", "RLE", "RLO",
    "PDF", "EN", "ES", "ET", "AN", "CS", "NSM", "BN", "B", "S", "WS",
-    "ON" ]
+    "ON", "LRI", "RLI", "FSI", "PDI" ]

 EASTASIANWIDTH_NAMES = [ "F", "H", "W", "Na", "A", "N" ]