Commit ef2af1ad authored by Greg Price's avatar Greg Price Committed by Benjamin Peterson

bpo-37760: Factor out the basic UCD parsing logic of makeunicodedata. (GH-15130)

There were 10 copies of this, and almost as many distinct versions of
exactly how it was written.  They're all implementing the same
standard.  Pull them out to the top, so the more interesting logic
that remains becomes easier to read.
parent 66a34d35
...@@ -30,8 +30,9 @@ import os ...@@ -30,8 +30,9 @@ import os
import sys import sys
import zipfile import zipfile
from textwrap import dedent
from functools import partial from functools import partial
from textwrap import dedent
from typing import *
SCRIPT = sys.argv[0] SCRIPT = sys.argv[0]
VERSION = "3.3" VERSION = "3.3"
...@@ -903,6 +904,32 @@ def open_data(template, version): ...@@ -903,6 +904,32 @@ def open_data(template, version):
return open(local, 'rb') return open(local, 'rb')
class UcdFile:
    """
    A file in the standard format of the UCD.

    See: https://www.unicode.org/reports/tr44/#Format_Conventions

    Note that, as described there, the Unihan data files have their
    own separate format.
    """

    def __init__(self, template: str, version: str) -> None:
        self.template = template
        self.version = version

    def records(self) -> Iterator[List[str]]:
        # Yield one record per data line: drop everything after a '#'
        # comment marker, skip lines that are blank (or comment-only),
        # and split the remainder on ';' with each field stripped.
        with open_data(self.template, self.version) as file:
            for raw in file:
                data = raw.partition('#')[0].strip()
                if data:
                    yield [field.strip() for field in data.split(';')]

    def __iter__(self) -> Iterator[List[str]]:
        # Iterating the file object is the same as iterating its records.
        return self.records()
# -------------------------------------------------------------------- # --------------------------------------------------------------------
# the following support code is taken from the unidb utilities # the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB # Copyright (c) 1999-2000 by Secret Labs AB
...@@ -922,14 +949,9 @@ class UnicodeData: ...@@ -922,14 +949,9 @@ class UnicodeData:
cjk_check=True): cjk_check=True):
self.changed = [] self.changed = []
table = [None] * 0x110000 table = [None] * 0x110000
with open_data(UNICODE_DATA, version) as file: for s in UcdFile(UNICODE_DATA, version):
while 1: char = int(s[0], 16)
s = file.readline() table[char] = s
if not s:
break
s = s.strip().split(";")
char = int(s[0], 16)
table[char] = s
cjk_ranges_found = [] cjk_ranges_found = []
...@@ -968,17 +990,12 @@ class UnicodeData: ...@@ -968,17 +990,12 @@ class UnicodeData:
# in order to take advantage of the compression and lookup # in order to take advantage of the compression and lookup
# algorithms used for the other characters # algorithms used for the other characters
pua_index = NAME_ALIASES_START pua_index = NAME_ALIASES_START
with open_data(NAME_ALIASES, version) as file: for char, name, abbrev in UcdFile(NAME_ALIASES, version):
for s in file: char = int(char, 16)
s = s.strip() self.aliases.append((name, char))
if not s or s.startswith('#'): # also store the name in the PUA 1
continue self.table[pua_index][1] = name
char, name, abbrev = s.split(';') pua_index += 1
char = int(char, 16)
self.aliases.append((name, char))
# also store the name in the PUA 1
self.table[pua_index][1] = name
pua_index += 1
assert pua_index - NAME_ALIASES_START == len(self.aliases) assert pua_index - NAME_ALIASES_START == len(self.aliases)
self.named_sequences = [] self.named_sequences = []
...@@ -988,50 +1005,32 @@ class UnicodeData: ...@@ -988,50 +1005,32 @@ class UnicodeData:
assert pua_index < NAMED_SEQUENCES_START assert pua_index < NAMED_SEQUENCES_START
pua_index = NAMED_SEQUENCES_START pua_index = NAMED_SEQUENCES_START
with open_data(NAMED_SEQUENCES, version) as file: for name, chars in UcdFile(NAMED_SEQUENCES, version):
for s in file: chars = tuple(int(char, 16) for char in chars.split())
s = s.strip() # check that the structure defined in makeunicodename is OK
if not s or s.startswith('#'): assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
continue assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
name, chars = s.split(';') "the NamedSequence struct and in unicodedata_lookup")
chars = tuple(int(char, 16) for char in chars.split()) self.named_sequences.append((name, chars))
# check that the structure defined in makeunicodename is OK # also store these in the PUA 1
assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size" self.table[pua_index][1] = name
assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in " pua_index += 1
"the NamedSequence struct and in unicodedata_lookup")
self.named_sequences.append((name, chars))
# also store these in the PUA 1
self.table[pua_index][1] = name
pua_index += 1
assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences) assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
self.exclusions = {} self.exclusions = {}
with open_data(COMPOSITION_EXCLUSIONS, version) as file: for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
for s in file: char = int(char, 16)
s = s.strip() self.exclusions[char] = 1
if not s:
continue
if s[0] == '#':
continue
char = int(s.split()[0],16)
self.exclusions[char] = 1
widths = [None] * 0x110000 widths = [None] * 0x110000
with open_data(EASTASIAN_WIDTH, version) as file: for s in UcdFile(EASTASIAN_WIDTH, version):
for s in file: if '..' in s[0]:
s = s.strip() first, last = [int(c, 16) for c in s[0].split('..')]
if not s: chars = list(range(first, last+1))
continue else:
if s[0] == '#': chars = [int(s[0], 16)]
continue for char in chars:
s = s.split()[0].split(';') widths[char] = s[1]
if '..' in s[0]:
first, last = [int(c, 16) for c in s[0].split('..')]
chars = list(range(first, last+1))
else:
chars = [int(s[0], 16)]
for char in chars:
widths[char] = s[1]
for i in range(0, 0x110000): for i in range(0, 0x110000):
if table[i] is not None: if table[i] is not None:
...@@ -1041,38 +1040,27 @@ class UnicodeData: ...@@ -1041,38 +1040,27 @@ class UnicodeData:
if table[i] is not None: if table[i] is not None:
table[i].append(set()) table[i].append(set())
with open_data(DERIVED_CORE_PROPERTIES, version) as file: for r, p in UcdFile(DERIVED_CORE_PROPERTIES, version):
for s in file: if ".." in r:
s = s.split('#', 1)[0].strip() first, last = [int(c, 16) for c in r.split('..')]
if not s: chars = list(range(first, last+1))
continue else:
chars = [int(r, 16)]
r, p = s.split(";") for char in chars:
r = r.strip() if table[char]:
p = p.strip() # Some properties (e.g. Default_Ignorable_Code_Point)
if ".." in r: # apply to unassigned code points; ignore them
first, last = [int(c, 16) for c in r.split('..')] table[char][-1].add(p)
chars = list(range(first, last+1))
else: for s in UcdFile(LINE_BREAK, version):
chars = [int(r, 16)] if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
for char in chars: continue
if table[char]: if '..' not in s[0]:
# Some properties (e.g. Default_Ignorable_Code_Point) first = last = int(s[0], 16)
# apply to unassigned code points; ignore them else:
table[char][-1].add(p) first, last = [int(c, 16) for c in s[0].split('..')]
for char in range(first, last+1):
with open_data(LINE_BREAK, version) as file: table[char][-1].add('Line_Break')
for s in file:
s = s.partition('#')[0]
s = [i.strip() for i in s.split(';')]
if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
continue
if '..' not in s[0]:
first = last = int(s[0], 16)
else:
first, last = [int(c, 16) for c in s[0].split('..')]
for char in range(first, last+1):
table[char][-1].add('Line_Break')
# We only want the quickcheck properties # We only want the quickcheck properties
# Format: NF?_QC; Y(es)/N(o)/M(aybe) # Format: NF?_QC; Y(es)/N(o)/M(aybe)
...@@ -1083,23 +1071,19 @@ class UnicodeData: ...@@ -1083,23 +1071,19 @@ class UnicodeData:
# for older versions, and no delta records will be created. # for older versions, and no delta records will be created.
quickchecks = [0] * 0x110000 quickchecks = [0] * 0x110000
qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split() qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
with open_data(DERIVEDNORMALIZATION_PROPS, version) as file: for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
for s in file: if len(s) < 2 or s[1] not in qc_order:
if '#' in s: continue
s = s[:s.index('#')] quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
s = [i.strip() for i in s.split(';')] quickcheck_shift = qc_order.index(s[1])*2
if len(s) < 2 or s[1] not in qc_order: quickcheck <<= quickcheck_shift
continue if '..' not in s[0]:
quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No first = last = int(s[0], 16)
quickcheck_shift = qc_order.index(s[1])*2 else:
quickcheck <<= quickcheck_shift first, last = [int(c, 16) for c in s[0].split('..')]
if '..' not in s[0]: for char in range(first, last+1):
first = last = int(s[0], 16) assert not (quickchecks[char]>>quickcheck_shift)&3
else: quickchecks[char] |= quickcheck
first, last = [int(c, 16) for c in s[0].split('..')]
for char in range(first, last+1):
assert not (quickchecks[char]>>quickcheck_shift)&3
quickchecks[char] |= quickcheck
for i in range(0, 0x110000): for i in range(0, 0x110000):
if table[i] is not None: if table[i] is not None:
table[i].append(quickchecks[i]) table[i].append(quickchecks[i])
...@@ -1122,34 +1106,26 @@ class UnicodeData: ...@@ -1122,34 +1106,26 @@ class UnicodeData:
# Patch the numeric field # Patch the numeric field
if table[i] is not None: if table[i] is not None:
table[i][8] = value table[i][8] = value
sc = self.special_casing = {} sc = self.special_casing = {}
with open_data(SPECIAL_CASING, version) as file: for data in UcdFile(SPECIAL_CASING, version):
for s in file: if data[4]:
s = s[:-1].split('#', 1)[0] # We ignore all conditionals (since they depend on
if not s: # languages) except for one, which is hardcoded. See
continue # handle_capital_sigma in unicodeobject.c.
data = s.split("; ") continue
if data[4]: c = int(data[0], 16)
# We ignore all conditionals (since they depend on lower = [int(char, 16) for char in data[1].split()]
# languages) except for one, which is hardcoded. See title = [int(char, 16) for char in data[2].split()]
# handle_capital_sigma in unicodeobject.c. upper = [int(char, 16) for char in data[3].split()]
continue sc[c] = (lower, title, upper)
c = int(data[0], 16)
lower = [int(char, 16) for char in data[1].split()]
title = [int(char, 16) for char in data[2].split()]
upper = [int(char, 16) for char in data[3].split()]
sc[c] = (lower, title, upper)
cf = self.case_folding = {} cf = self.case_folding = {}
if version != '3.2.0': if version != '3.2.0':
with open_data(CASE_FOLDING, version) as file: for data in UcdFile(CASE_FOLDING, version):
for s in file: if data[1] in "CF":
s = s[:-1].split('#', 1)[0] c = int(data[0], 16)
if not s: cf[c] = [int(char, 16) for char in data[2].split()]
continue
data = s.split("; ")
if data[1] in "CF":
c = int(data[0], 16)
cf[c] = [int(char, 16) for char in data[2].split()]
def uselatin1(self): def uselatin1(self):
# restrict character range to ISO Latin 1 # restrict character range to ISO Latin 1
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment