Commit 7c69c1c0 authored by Benjamin Peterson's avatar Benjamin Peterson Committed by GitHub

update to Unicode 11.0.0 (closes bpo-33778) (GH-7439)

Also, standardize indentation of generated tables.
parent 9f04f0df
......@@ -17,8 +17,8 @@
This module provides access to the Unicode Character Database (UCD) which
defines character properties for all Unicode characters. The data contained in
this database is compiled from the `UCD version 10.0.0
<http://www.unicode.org/Public/10.0.0/ucd>`_.
this database is compiled from the `UCD version 11.0.0
<http://www.unicode.org/Public/11.0.0/ucd>`_.
The module uses the same names and symbols as defined by Unicode
Standard Annex #44, `"Unicode Character Database"
......@@ -168,6 +168,6 @@ Examples:
.. rubric:: Footnotes
.. [#] http://www.unicode.org/Public/10.0.0/ucd/NameAliases.txt
.. [#] http://www.unicode.org/Public/11.0.0/ucd/NameAliases.txt
.. [#] http://www.unicode.org/Public/10.0.0/ucd/NamedSequences.txt
.. [#] http://www.unicode.org/Public/11.0.0/ucd/NamedSequences.txt
......@@ -313,7 +313,7 @@ The Unicode category codes mentioned above stand for:
* *Nd* - decimal numbers
* *Pc* - connector punctuations
* *Other_ID_Start* - explicit list of characters in `PropList.txt
<http://www.unicode.org/Public/10.0.0/ucd/PropList.txt>`_ to support backwards
<http://www.unicode.org/Public/11.0.0/ucd/PropList.txt>`_ to support backwards
compatibility
* *Other_ID_Continue* - likewise
......@@ -876,4 +876,4 @@ occurrence outside string literals and comments is an unconditional error:
.. rubric:: Footnotes
.. [#] http://www.unicode.org/Public/10.0.0/ucd/NameAliases.txt
.. [#] http://www.unicode.org/Public/11.0.0/ucd/NameAliases.txt
......@@ -20,7 +20,7 @@ errors = 'surrogatepass'
class UnicodeMethodsTest(unittest.TestCase):
# update this, if the database changes
expectedchecksum = '727091e0fd5807eb41c72912ae95cdd74c795e27'
expectedchecksum = '97a41f208c53d5e08c77c1175187e95386b82b6f'
def test_method_checksum(self):
h = hashlib.sha1()
......@@ -80,7 +80,7 @@ class UnicodeFunctionsTest(UnicodeDatabaseTest):
# Update this if the database changes. Make sure to do a full rebuild
# (e.g. 'make distclean && make') to get the correct checksum.
expectedchecksum = 'db6f92bb5010f8e85000634b08e77233355ab37a'
expectedchecksum = '4f73278b19c2ec3099724c132f0b90a1d25c19e4'
def test_function_checksum(self):
data = []
h = hashlib.sha1()
......
Update ``unicodedata``'s database to Unicode version 11.0.0.
......@@ -921,7 +921,7 @@ is_unified_ideograph(Py_UCS4 code)
{
return
(0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
(0x4E00 <= code && code <= 0x9FEA) || /* CJK Ideograph */
(0x4E00 <= code && code <= 0x9FEF) || /* CJK Ideograph */
(0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
(0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
(0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
......
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
#
# (re)generate unicode property and type databases
#
# this script converts a unicode 3.2 database file to
# Modules/unicodedata_db.h, Modules/unicodename_db.h,
# and Objects/unicodetype_db.h
# This script converts Unicode database files to Modules/unicodedata_db.h,
# Modules/unicodename_db.h, and Objects/unicodetype_db.h
#
# history:
# 2000-09-24 fl created (based on bits and pieces from unidb)
......@@ -34,7 +33,7 @@ import zipfile
from textwrap import dedent
SCRIPT = sys.argv[0]
VERSION = "3.2"
VERSION = "3.3"
# The Unicode Database
# --------------------
......@@ -42,7 +41,7 @@ VERSION = "3.2"
# * Doc/library/stdtypes.rst, and
# * Doc/library/unicodedata.rst
# * Doc/reference/lexical_analysis.rst (two occurrences)
UNIDATA_VERSION = "10.0.0"
UNIDATA_VERSION = "11.0.0"
UNICODE_DATA = "UnicodeData%s.txt"
COMPOSITION_EXCLUSIONS = "CompositionExclusions%s.txt"
EASTASIAN_WIDTH = "EastAsianWidth%s.txt"
......@@ -99,7 +98,7 @@ EXTENDED_CASE_MASK = 0x4000
# these ranges need to match unicodedata.c:is_unified_ideograph
cjk_ranges = [
('3400', '4DB5'),
('4E00', '9FEA'),
('4E00', '9FEF'),
('20000', '2A6D6'),
('2A700', '2B734'),
('2B740', '2B81D'),
......@@ -353,28 +352,28 @@ def makeunicodedata(unicode, trace):
index1, index2, shift = splitbins(index, trace)
print("static const change_record change_records_%s[] = {" % cversion, file=fp)
for record in records:
print("\t{ %s }," % ", ".join(map(str,record)), file=fp)
print(" { %s }," % ", ".join(map(str,record)), file=fp)
print("};", file=fp)
Array("changes_%s_index" % cversion, index1).dump(fp, trace)
Array("changes_%s_data" % cversion, index2).dump(fp, trace)
print("static const change_record* get_change_%s(Py_UCS4 n)" % cversion, file=fp)
print("{", file=fp)
print("\tint index;", file=fp)
print("\tif (n >= 0x110000) index = 0;", file=fp)
print("\telse {", file=fp)
print("\t\tindex = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
print("\t\tindex = changes_%s_data[(index<<%d)+(n & %d)];" % \
print(" int index;", file=fp)
print(" if (n >= 0x110000) index = 0;", file=fp)
print(" else {", file=fp)
print(" index = changes_%s_index[n>>%d];" % (cversion, shift), file=fp)
print(" index = changes_%s_data[(index<<%d)+(n & %d)];" % \
(cversion, shift, ((1<<shift)-1)), file=fp)
print("\t}", file=fp)
print("\treturn change_records_%s+index;" % cversion, file=fp)
print(" }", file=fp)
print(" return change_records_%s+index;" % cversion, file=fp)
print("}\n", file=fp)
print("static Py_UCS4 normalization_%s(Py_UCS4 n)" % cversion, file=fp)
print("{", file=fp)
print("\tswitch(n) {", file=fp)
print(" switch(n) {", file=fp)
for k, v in normalization:
print("\tcase %s: return 0x%s;" % (hex(k), v), file=fp)
print("\tdefault: return 0;", file=fp)
print("\t}\n}\n", file=fp)
print(" case %s: return 0x%s;" % (hex(k), v), file=fp)
print(" default: return 0;", file=fp)
print(" }\n}\n", file=fp)
fp.close()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment