Commit 7c69c1c0 authored by Benjamin Peterson, committed by GitHub

update to Unicode 11.0.0 (closes bpo-33778) (GH-7439)

Also, standardize indentation of generated tables.
parent 9f04f0df
...@@ -17,8 +17,8 @@ ...@@ -17,8 +17,8 @@
This module provides access to the Unicode Character Database (UCD) which This module provides access to the Unicode Character Database (UCD) which
defines character properties for all Unicode characters. The data contained in defines character properties for all Unicode characters. The data contained in
this database is compiled from the `UCD version 10.0.0 this database is compiled from the `UCD version 11.0.0
<http://www.unicode.org/Public/10.0.0/ucd>`_. <http://www.unicode.org/Public/11.0.0/ucd>`_.
The module uses the same names and symbols as defined by Unicode The module uses the same names and symbols as defined by Unicode
Standard Annex #44, `"Unicode Character Database" Standard Annex #44, `"Unicode Character Database"
...@@ -168,6 +168,6 @@ Examples: ...@@ -168,6 +168,6 @@ Examples:
.. rubric:: Footnotes .. rubric:: Footnotes
.. [#] http://www.unicode.org/Public/10.0.0/ucd/NameAliases.txt .. [#] http://www.unicode.org/Public/11.0.0/ucd/NameAliases.txt
.. [#] http://www.unicode.org/Public/10.0.0/ucd/NamedSequences.txt .. [#] http://www.unicode.org/Public/11.0.0/ucd/NamedSequences.txt
...@@ -313,7 +313,7 @@ The Unicode category codes mentioned above stand for: ...@@ -313,7 +313,7 @@ The Unicode category codes mentioned above stand for:
* *Nd* - decimal numbers * *Nd* - decimal numbers
* *Pc* - connector punctuations * *Pc* - connector punctuations
* *Other_ID_Start* - explicit list of characters in `PropList.txt * *Other_ID_Start* - explicit list of characters in `PropList.txt
<http://www.unicode.org/Public/10.0.0/ucd/PropList.txt>`_ to support backwards <http://www.unicode.org/Public/11.0.0/ucd/PropList.txt>`_ to support backwards
compatibility compatibility
* *Other_ID_Continue* - likewise * *Other_ID_Continue* - likewise
...@@ -876,4 +876,4 @@ occurrence outside string literals and comments is an unconditional error: ...@@ -876,4 +876,4 @@ occurrence outside string literals and comments is an unconditional error:
.. rubric:: Footnotes .. rubric:: Footnotes
.. [#] http://www.unicode.org/Public/10.0.0/ucd/NameAliases.txt .. [#] http://www.unicode.org/Public/11.0.0/ucd/NameAliases.txt
...@@ -20,7 +20,7 @@ errors = 'surrogatepass' ...@@ -20,7 +20,7 @@ errors = 'surrogatepass'
class UnicodeMethodsTest(unittest.TestCase): class UnicodeMethodsTest(unittest.TestCase):
# update this, if the database changes # update this, if the database changes
expectedchecksum = '727091e0fd5807eb41c72912ae95cdd74c795e27' expectedchecksum = '97a41f208c53d5e08c77c1175187e95386b82b6f'
def test_method_checksum(self): def test_method_checksum(self):
h = hashlib.sha1() h = hashlib.sha1()
...@@ -80,7 +80,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest): ...@@ -80,7 +80,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
# Update this if the database changes. Make sure to do a full rebuild # Update this if the database changes. Make sure to do a full rebuild
# (e.g. 'make distclean && make') to get the correct checksum. # (e.g. 'make distclean && make') to get the correct checksum.
expectedchecksum = 'db6f92bb5010f8e85000634b08e77233355ab37a' expectedchecksum = '4f73278b19c2ec3099724c132f0b90a1d25c19e4'
def test_function_checksum(self): def test_function_checksum(self):
data = [] data = []
h = hashlib.sha1() h = hashlib.sha1()
......
Update ``unicodedata``'s database to Unicode version 11.0.0.
...@@ -921,7 +921,7 @@ is_unified_ideograph(Py_UCS4 code) ...@@ -921,7 +921,7 @@ is_unified_ideograph(Py_UCS4 code)
{ {
return return
(0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */ (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
(0x4E00 <= code && code <= 0x9FEA) || /* CJK Ideograph */ (0x4E00 <= code && code <= 0x9FEF) || /* CJK Ideograph */
(0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */ (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
(0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */ (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
(0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */ (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
......
This diff is collapsed.
This diff is collapsed.
This source diff could not be displayed because it is too large. You can view the blob instead.
# #
# (re)generate unicode property and type databases # (re)generate unicode property and type databases
# #
# this script converts a unicode 3.2 database file to # This script converts Unicode database files to Modules/unicodedata_db.h,
# Modules/unicodedata_db.h, Modules/unicodename_db.h, # Modules/unicodename_db.h, and Objects/unicodetype_db.h
# and Objects/unicodetype_db.h
# #
# history: # history:
# 2000-09-24 fl created (based on bits and pieces from unidb) # 2000-09-24 fl created (based on bits and pieces from unidb)
...@@ -34,7 +33,7 @@ import zipfile ...@@ -34,7 +33,7 @@ import zipfile
from textwrap import dedent from textwrap import dedent
SCRIPT = sys.argv[0] SCRIPT = sys.argv[0]
VERSION = "3.2" VERSION = "3.3"
# The Unicode Database # The Unicode Database
# -------------------- # --------------------
...@@ -42,7 +41,7 @@ VERSION = "3.2" ...@@ -42,7 +41,7 @@ VERSION = "3.2"
# * Doc/library/stdtypes.rst, and # * Doc/library/stdtypes.rst, and
# * Doc/library/unicodedata.rst # * Doc/library/unicodedata.rst
# * Doc/reference/lexical_analysis.rst (two occurrences) # * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "10.0.0" UNIDATA_VERSION = "11.0.0"
UNICODE_DATA = "UnicodeData%s.txt" UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt" COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt" EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
...@@ -99,7 +98,7 @@ EXTENDED_CASE_MASK = 0x4000 ...@@ -99,7 +98,7 @@ EXTENDED_CASE_MASK = 0x4000
# these ranges need to match unicodedata.c:is_unified_ideograph # these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [ cjk_ranges = [
('3400', '4DB5'), ('3400', '4DB5'),
('4E00', '9FEA'), ('4E00', '9FEF'),
('20000', '2A6D6'), ('20000', '2A6D6'),
('2A700', '2B734'), ('2A700', '2B734'),
('2B740', '2B81D'), ('2B740', '2B81D'),
...@@ -276,8 +275,8 @@ def makeunicodedata(unicode, trace): ...@@ -276,8 +275,8 @@ def makeunicodedata(unicode, trace):
print("struct reindex{int start;short count,index;};", file=fp) print("struct reindex{int start;short count,index;};", file=fp)
print("static struct reindex nfc_first[] = {", file=fp) print("static struct reindex nfc_first[] = {", file=fp)
for start,end in comp_first_ranges: for start,end in comp_first_ranges:
print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp) print(" { %d, %d, %d}," % (start,end-start,comp_first[start]), file=fp)
print(" {0,0,0}", file=fp) print(" {0,0,0}", file=fp)
print("};\n", file=fp) print("};\n", file=fp)
print("static struct reindex nfc_last[] = {", file=fp) print("static struct reindex nfc_last[] = {", file=fp)
for start,end in comp_last_ranges: for start,end in comp_last_ranges:
...@@ -353,28 +352,28 @@ def makeunicodedata(unicode, trace): ...@@ -353,28 +352,28 @@ def makeunicodedata(unicode, trace):
index1, index2, shift = splitbins(index, trace) index1, index2, shift = splitbins(index, trace)
print("static const change_record change_records_%s[] = {" % cversion, file=fp) print("static const change_record change_records_%s[] = {" % cversion, file=fp)
for record in records: for record in records:
print("\t{ %s }," % ", ".join(map(str,record)), file=fp) print(" { %s }," % ", ".join(map(str,record)), file=fp)
print("};", file=fp) print("};", file=fp)
Array("changes_%s_index" % cversion, index1).dump(fp, trace) Array("changes_%s_index" % cversion, index1).dump(fp, trace)
Array("changes_%s_data" % cversion, index2).dump(fp, trace) Array("changes_%s_data" % cversion, index2).dump(fp, trace)
print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp) print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
print("{", file=fp) print("{", file=fp)
print("\tint index;", file=fp) print(" int index;", file=fp)
print("\tif (n >= 0x110000) index = 0;", file=fp) print(" if (n >= 0x110000) index = 0;", file=fp)
print("\telse {", file=fp) print(" else {", file=fp)
print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp) print(" index = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \ print(" index = changes_%s_data[(index<<%d)+(n & %d)];" % \
(cversion, shift, ((1<<shift)-1)), file=fp) (cversion, shift, ((1<<shift)-1)), file=fp)
print("\t}", file=fp) print(" }", file=fp)
print("\treturn change_records_%s+index;" % cversion, file=fp) print(" return change_records_%s+index;" % cversion, file=fp)
print("}\n", file=fp) print("}\n", file=fp)
print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp) print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
print("{", file=fp) print("{", file=fp)
print("\tswitch(n) {", file=fp) print(" switch(n) {", file=fp)
for k, v in normalization: for k, v in normalization:
print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp) print(" case %s: return 0x%s;" % (hex(k), v), file=fp)
print("\tdefault: return 0;", file=fp) print(" default: return 0;", file=fp)
print("\t}\n}\n", file=fp) print(" }\n}\n", file=fp)
fp.close() fp.close()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment