Commit c9dfe708 authored by Stefan Behnel

Clean up the Lexicon.py generation script and use f-strings to prevent...

Clean up the Lexicon.py generation script and use f-strings to prevent accidentally running it with older Python versions.
parent 270bf960
......@@ -153,14 +153,16 @@ def make_lexicon():
#debug_file = scanner_dump_file
)
# BEGIN GENERATED CODE
# generated with:
# cpython 3.7.3 (default, Apr 09 2019, 05:18:21) [GCC]
# cpython 3.7.3 (default, Apr 09 2019, 05:18:21) [GCC]
unicode_start_ch_any = (
u"_ªµºˬˮͿΆΌՙەۿܐޱߺࠚࠤࠨऽॐলঽৎৼਫ਼ઽૐૹଽୱஃஜௐఽಀಽೞഽൎලาຄຊຍລວາຽໆༀဿၡႎჇჍቘዀៗៜᢪ"
u"ᪧὙὛὝιⁱⁿℂℇℕℤΩℨⅎⴧⴭⵯꣻꧏꩺꪱꫀꫂיִמּﹱﹳﹷﹹﹻﹽ𐠈𐠼𐨀𐼧𑅄𑅶𑇚𑇜𑊈𑌽𑍐𑓇𑙄𑣿𑨀𑨺𑩐𑪝𑱀𑵆𑶘𖽐𝒢𝒻𝕆𞸤𞸧𞸹𞸻"
u"𞹂𞹇𞹉𞹋𞹔𞹗𞹙𞹛𞹝𞹟𞹤𞹾")
u"𞹂𞹇𞹉𞹋𞹔𞹗𞹙𞹛𞹝𞹟𞹤𞹾"
)
unicode_start_ch_range = (
u"AZazÀÖØöøˁˆˑˠˤͰʹͶͷͻͽΈΊΎΡΣϵϷҁҊԯԱՖՠֈאתׯײؠيٮٯٱۓۥۦۮۯۺۼܒܯݍޥߊߪߴߵࠀࠕ"
u"ࡀࡘࡠࡪࢠࢴࢶࢽऄहक़ॡॱঀঅঌএঐওনপরশহড়ঢ়য়ৡৰৱਅਊਏਐਓਨਪਰਲਲ਼ਵਸ਼ਸਹਖ਼ੜੲੴઅઍએઑઓનપરલળવહ"
......@@ -177,11 +179,13 @@ unicode_start_ch_range = (
u"𑈀𑈑𑈓𑈫𑊀𑊆𑊊𑊍𑊏𑊝𑊟𑊨𑊰𑋞𑌅𑌌𑌏𑌐𑌓𑌨𑌪𑌰𑌲𑌳𑌵𑌹𑍝𑍡𑐀𑐴𑑇𑑊𑒀𑒯𑓄𑓅𑖀𑖮𑗘𑗛𑘀𑘯𑚀𑚪𑜀𑜚𑠀𑠫𑢠𑣟𑨋𑨲𑩜𑪃𑪆𑪉𑫀𑫸𑰀𑰈"
u"𑰊𑰮𑱲𑲏𑴀𑴆𑴈𑴉𑴋𑴰𑵠𑵥𑵧𑵨𑵪𑶉𑻠𑻲𒀀𒎙𒐀𒑮𒒀𒕃𓀀𓐮𔐀𔙆𖠀𖨸𖩀𖩞𖫐𖫭𖬀𖬯𖭀𖭃𖭣𖭷𖭽𖮏𖹀𖹿𖼀𖽄𖾓𖾟𖿠𖿡𗀀𘟱𘠀𘫲𛀀𛄞𛅰𛋻𛰀𛱪"
u"𛱰𛱼𛲀𛲈𛲐𛲙𝐀𝑔𝑖𝒜𝒞𝒟𝒥𝒦𝒩𝒬𝒮𝒹𝒽𝓃𝓅𝔅𝔇𝔊𝔍𝔔𝔖𝔜𝔞𝔹𝔻𝔾𝕀𝕄𝕊𝕐𝕒𝚥𝚨𝛀𝛂𝛚𝛜𝛺𝛼𝜔𝜖𝜴𝜶𝝎𝝐𝝮𝝰𝞈𝞊𝞨𝞪𝟂𝟄𝟋"
u"𞠀𞣄𞤀𞥃𞸀𞸃𞸅𞸟𞸡𞸢𞸩𞸲𞸴𞸷𞹍𞹏𞹑𞹒𞹡𞹢𞹧𞹪𞹬𞹲𞹴𞹷𞹹𞹼𞺀𞺉𞺋𞺛𞺡𞺣𞺥𞺩𞺫𞺻𠀀𪛖𪜀𫜴𫝀𫠝𫠠𬺡𬺰𮯠")
u"𞠀𞣄𞤀𞥃𞸀𞸃𞸅𞸟𞸡𞸢𞸩𞸲𞸴𞸷𞹍𞹏𞹑𞹒𞹡𞹢𞹧𞹪𞹬𞹲𞹴𞹷𞹹𞹼𞺀𞺉𞺋𞺛𞺡𞺣𞺥𞺩𞺫𞺻𠀀𪛖𪜀𫜴𫝀𫠝𫠠𬺡𬺰𮯠"
)
unicode_continuation_ch_any = (
u"··়ׇֿٰܑ߽ৗ਼৾ੑੵ઼଼ஂௗ಼ൗ්ූัັ༹༵༷࿆᳭ᢩ៝⁔⵿⃡꙯ꠂ꠆ꠋꧥꩃﬞꪰ꫁_𑅳𐨿𐇽𐋠𑈾𑍗𑩇𑑞𑴺𑵇𝩵𝪄")
u"··়ׇֿٰܑ߽ৗ਼৾ੑੵ઼଼ஂௗ಼ൗ්ූัັ༹༵༷࿆᳭ᢩ៝⁔⵿⃡꙯ꠂ꠆ꠋꧥꩃﬞꪰ꫁_𑅳𐨿𐇽𐋠𑈾𑍗𑩇𑑞𑴺𑵇𝩵𝪄"
)
unicode_continuation_ch_range = (
u"09ֽׁׂًؚ֑ׅ̀ͯ҃҇ׄؐ٩۪ۭۖۜ۟ۤۧۨ۰۹ܰ݊ަް߀߉࡙࡛࣓ࣣ߫߳ࠖ࠙ࠛࠣࠥࠧࠩ࠭࣡ःऺ़ाॏ॑ॗॢॣ०९ঁঃ"
u"াৄেৈো্ৢৣ০৯ਁਃਾੂੇੈੋ੍੦ੱઁઃાૅેૉો્ૢૣ૦૯ૺ૿ଁଃାୄେୈୋ୍ୖୗୢୣ୦୯ாூெைொ்௦௯ఀఄాౄ"
......@@ -192,7 +196,8 @@ unicode_continuation_ch_range = (
u"︠︯︳︴﹍﹏09゙゚𐍶𐍺𐒠𐒩𐨁𐨃𐨅𐨆𐨌𐨺𐫦𐨏𐨸𐫥𐴤𐴧𐴰𐴹𐽆𐽐𑀀𑀂𑀸𑁆𑁦𑁯𑁿𑂂𑂰𑂺𑃰𑃹𑄀𑄂𑄧𑄴𑄶𑄿𑅅𑅆𑆀𑆂𑆳𑇀𑇉𑇌𑇐𑇙𑈬𑈷"
u"𑋟𑋪𑋰𑋹𑌀𑌃𑌻𑌼𑌾𑍄𑍇𑍈𑍋𑍍𑍢𑍣𑍦𑍬𑍰𑍴𑐵𑑆𑑐𑑙𑒰𑓃𑓐𑓙𑖯𑖵𑖸𑗀𑗜𑗝𑘰𑙀𑙐𑙙𑚫𑚷𑛀𑛉𑜝𑜫𑜰𑜹𑠬𑠺𑣠𑣩𑨁𑨊𑨳𑨹𑨻𑨾𑩑𑩛𑪊𑪙"
u"𑰯𑰶𑰸𑰿𑱐𑱙𑲒𑲧𑲩𑲶𑴱𑴶𑴼𑴽𑴿𑵅𑵐𑵙𑶊𑶎𑶐𑶑𑶓𑶗𑶠𑶩𑻳𑻶𖩠𖩩𖫰𖫴𖬰𖬶𖭐𖭙𖽑𖽾𖾏𖾒𛲝𛲞𝅩𝅥𝅲𝅻𝆂𝆋𝅭𝆅𝆪𝆭𝉂𝉄𝟎𝟿𝨀𝨶𝨻𝩬"
u"𝪛𝪟𝪡𝪯𞥊𞣐𞣖𞀀𞀆𞀈𞀘𞀛𞀡𞀣𞀤𞀦𞀪𞥄𞥐𞥙")
u"𝪛𝪟𝪡𝪯𞥊𞣐𞣖𞀀𞀆𞀈𞀘𞀛𞀡𞀣𞀤𞀦𞀪𞥄𞥐𞥙"
)
# END GENERATED CODE
......@@ -11,21 +11,23 @@
# --overwrite to update the existing Lexicon.py file
# --here to create a copy of Lexicon.py in the current directory
import functools
import re
import os
import sys
from io import StringIO
import os
import functools
# Make sure we import the right Cython: when this script lives in the "bin"
# directory of a source checkout, prefer the Cython package that sits next to
# that directory over any other one found on sys.path.
cythonpath, _ = os.path.split(os.path.realpath(__file__))  # bin directory
cythonpath, _ = os.path.split(cythonpath)  # parent of the bin directory
if os.path.exists(os.path.join(cythonpath, "Cython")):
    sys.path.insert(0, cythonpath)
    print("Found (and using) local cython directory")
# else we aren't in a development directory
from Cython.Compiler import Lexicon
def main():
arg = '--overwrite'
if len(sys.argv) == 2:
......@@ -37,35 +39,24 @@ def main():
""")
return
generated_code = StringIO()
print("# generated with:\n #", sys.implementation.name, sys.version, file=generated_code)
print(file=generated_code)
print(start_expression(), file=generated_code)
print(file=generated_code)
print(cont_expression(), file=generated_code)
print(file=generated_code)
generated_code = generated_code.getvalue()
output = StringIO()
mode = 0 # 1 when found generated section, 2 afterwards
generated_code = (
f"# generated with:\n"
f"# {sys.implementation.name} {sys.version.splitlines()[0].strip()}\n"
"\n"
f"{generate_character_sets()}\n"
)
print("Reading file", Lexicon.__file__)
with open(Lexicon.__file__,'r') as f:
for line in f:
if mode != 1:
output.write(line)
else:
if line.strip() == "# END GENERATED CODE":
mode = 2
output.write(line)
if mode == 0:
if line.strip() == "# BEGIN GENERATED CODE":
mode = 1
output.write(generated_code)
if mode != 2:
with open(Lexicon.__file__, 'r') as f:
parts = re.split(r"(# (?:BEGIN|END) GENERATED CODE\n?)", f.read())
if len(parts) not in (4,5) or ' GENERATED CODE' not in parts[1] or ' GENERATED CODE' not in parts[3]:
print("Warning: generated code section not found - code not inserted")
return
parts[2] = generated_code
output = "".join(parts)
if arg == "--here":
outfile = "Lexicon.py"
else:
......@@ -73,23 +64,26 @@ def main():
outfile = Lexicon.__file__
print("Writing to file", outfile)
with open(outfile,'w') as f:
f.write(output.getvalue())
with open(outfile, 'w') as f:
f.write(output)
# The easiest way to generate an appropriate character set is just to use the str.isidentifier method
# An alternative approach for getting character sets is at https://stackoverflow.com/a/49332214/4657412
@functools.lru_cache()
def get_start_characters_as_number():
    """Return the code points that can start a Python identifier.

    Every code point from 0 up to and including ``sys.maxunicode`` is probed
    with ``str.isidentifier``.  The matching code points are returned as a
    list of ints in ascending order.

    The scan visits more than a million code points, so the result is cached.
    Every call returns the same list object: callers must not mutate it.
    """
    # range() excludes its stop value, hence the "+ 1" so that the last valid
    # code point is probed as well.
    return [i for i in range(sys.maxunicode + 1) if chr(i).isidentifier()]
def get_continue_characters_as_number():
    """Return the code points that can appear inside a Python identifier.

    A code point qualifies when appending it to the valid identifier ``a``
    still yields a valid identifier according to ``str.isidentifier``.  The
    result is a list of ints in ascending order.  It also contains every
    code point that can start an identifier.
    """
    # range() excludes its stop value, hence the "+ 1" so that the last valid
    # code point is probed as well.
    return [i for i in range(sys.maxunicode + 1) if ("a" + chr(i)).isidentifier()]
def get_continue_not_start_as_number():
    """Return the code points valid inside an identifier but not at its start.

    Computed as the set difference between the continuation characters and
    the start characters, returned as a list of ints in ascending order.
    """
    start = get_start_characters_as_number()
    cont = get_continue_characters_as_number()
    return sorted(set(cont) - set(start))
def to_ranges(char_num_list):
# Convert the large lists of character digits to
......@@ -106,47 +100,32 @@ def to_ranges(char_num_list):
if first_good_val == char_num_list[n-1]:
single_chars.append(chr(char_num_list[n-1]))
else:
ranges.append(chr(first_good_val)+chr(char_num_list[n-1]))
ranges.append(chr(first_good_val) + chr(char_num_list[n-1]))
first_good_val = char_num_list[n]
return single_chars, ranges
def make_split_strings(chars, splitby=60):
out = []
for i in range(0, len(chars), splitby):
out.append('u"{}"'.format("".join(chars[i:i+splitby])))
return "\n ".join(out)
def start_expression():
output = StringIO()
print("unicode_start_ch_any = (\n ", end='', file=output)
single_chars, ranges = to_ranges(get_start_characters_as_number())
single_chars = "".join(single_chars)
ranges = "".join(ranges)
return ''.join(single_chars), ''.join(ranges)
print(make_split_strings(single_chars), end='', file=output)
print(")", file=output)
print("unicode_start_ch_range = (\n ", end='', file=output)
print(make_split_strings(ranges), end='', file=output)
print(")", file=output)
return output.getvalue()
def make_split_strings(chars, splitby=60, indent=" "):
    """Format *chars* as a sequence of ``u"..."`` literals, one per line.

    Parameters:
        chars: the string of characters to emit.
        splitby: maximum number of characters placed in each literal.
        indent: text put in front of every emitted line.

    Returns the lines joined by newlines, without a trailing newline.

    The generated declarations wrap this text in parentheses and rely on
    implicit concatenation of adjacent string literals.  For empty input a
    single empty literal is emitted, because an empty pair of parentheses
    would evaluate to a tuple instead of a string.
    """
    lines = [f'u"{chars[i:i+splitby]}"' for i in range(0, len(chars), splitby)]
    if not lines:
        # Keep the generated expression a string even when there is no data.
        lines = ['u""']
    return indent + f"\n{indent}".join(lines)
def cont_expression():
output = StringIO()
print("unicode_continuation_ch_any = (\n ", end='', file=output)
single_chars, ranges = to_ranges(get_continue_not_start_as_number())
single_chars = "".join(single_chars)
ranges = "".join(ranges)
def generate_character_sets():
declarations = []
for char_type, char_generator in [
("unicode_start_ch", get_start_characters_as_number),
("unicode_continuation_ch", get_continue_not_start_as_number),
]:
for set_type, chars in zip(("any", "range"), to_ranges(char_generator())):
declarations.append(
f"{char_type}_{set_type} = (\n"
f"{make_split_strings(chars)}\n"
f")\n"
)
print(make_split_strings(single_chars), end='', file=output)
print(")", file=output)
print("unicode_continuation_ch_range = (\n ", end='', file=output)
print(make_split_strings(ranges), end='', file=output)
print(")", file=output)
return "".join(declarations)
return output.getvalue()
if __name__ == "__main__":
main()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment