Clean up the Lexicon.py generation script and use f-strings to prevent accidentally running it with older Python versions.

Clean up the Lexicon.py generation script and use f-strings to prevent accidentally running it with older Python versions.

Clean up the Lexicon.py generation script and use f-strings to prevent accidentally running it with older Python versions.
Clean up the Lexicon.py generation script and use f-strings to prevent accidentally running it with older Python versions.
c9dfe708 · Stefan Behnel · 270bf960 · c9dfe708 · c9dfe708
Commit c9dfe708 authored Aug 24, 2019 by Stefan Behnel
Hide whitespace changes
Inline Side-by-side

Showing with 54 additions and 70 deletions

Cython/Compiler/Lexicon.py Cython/Compiler/Lexicon.py +10 -5

bin/cython-generate-lexicon.py bin/cython-generate-lexicon.py +44 -65

No files found.
--- a/Cython/Compiler/Lexicon.py
+++ b/Cython/Compiler/Lexicon.py
@@ -153,14 +153,16 @@ def make_lexicon():
        #debug_file = scanner_dump_file
        )
 # BEGIN GENERATED CODE
 # generated with:
- # cpython 3.7.3 (default, Apr 09 2019, 05:18:21) [GCC]
+# cpython 3.7.3 (default, Apr 09 2019, 05:18:21) [GCC]
 unicode_start_ch_any = (
    u"_ªµºˬˮͿΆΌՙەۿܐޱߺࠚࠤࠨऽॐলঽৎৼਫ਼ઽૐૹଽୱஃஜௐఽಀಽೞഽൎලาຄຊຍລວາຽໆༀဿၡႎჇჍቘዀៗៜᢪ"
    u"ᪧὙὛὝιⁱⁿℂℇℕℤΩℨⅎⴧⴭⵯꣻꧏꩺꪱꫀꫂיִמּﹱﹳﹷﹹﹻﹽ𐠈𐠼𐨀𐼧𑅄𑅶𑇚𑇜𑊈𑌽𑍐𑓇𑙄𑣿𑨀𑨺𑩐𑪝𑱀𑵆𑶘𖽐𝒢𝒻𝕆𞸤𞸧𞸹𞸻"
-    u"𞹂𞹇𞹉𞹋𞹔𞹗𞹙𞹛𞹝𞹟𞹤𞹾")
+    u"𞹂𞹇𞹉𞹋𞹔𞹗𞹙𞹛𞹝𞹟𞹤𞹾"
+)
 unicode_start_ch_range = (
    u"AZazÀÖØöøˁˆˑˠˤͰʹͶͷͻͽΈΊΎΡΣϵϷҁҊԯԱՖՠֈאתׯײؠيٮٯٱۓۥۦۮۯۺۼܒܯݍޥߊߪߴߵࠀࠕ"
    u"ࡀࡘࡠࡪࢠࢴࢶࢽऄहक़ॡॱঀঅঌএঐওনপরশহড়ঢ়য়ৡৰৱਅਊਏਐਓਨਪਰਲਲ਼ਵਸ਼ਸਹਖ਼ੜੲੴઅઍએઑઓનપરલળવહ"
@@ -177,11 +179,13 @@ unicode_start_ch_range = (
    u"𑈀𑈑𑈓𑈫𑊀𑊆𑊊𑊍𑊏𑊝𑊟𑊨𑊰𑋞𑌅𑌌𑌏𑌐𑌓𑌨𑌪𑌰𑌲𑌳𑌵𑌹𑍝𑍡𑐀𑐴𑑇𑑊𑒀𑒯𑓄𑓅𑖀𑖮𑗘𑗛𑘀𑘯𑚀𑚪𑜀𑜚𑠀𑠫𑢠𑣟𑨋𑨲𑩜𑪃𑪆𑪉𑫀𑫸𑰀𑰈"
    u"𑰊𑰮𑱲𑲏𑴀𑴆𑴈𑴉𑴋𑴰𑵠𑵥𑵧𑵨𑵪𑶉𑻠𑻲𒀀𒎙𒐀𒑮𒒀𒕃𓀀𓐮𔐀𔙆𖠀𖨸𖩀𖩞𖫐𖫭𖬀𖬯𖭀𖭃𖭣𖭷𖭽𖮏𖹀𖹿𖼀𖽄𖾓𖾟𖿠𖿡𗀀𘟱𘠀𘫲𛀀𛄞𛅰𛋻𛰀𛱪"
    u"𛱰𛱼𛲀𛲈𛲐𛲙𝐀𝑔𝑖𝒜𝒞𝒟𝒥𝒦𝒩𝒬𝒮𝒹𝒽𝓃𝓅𝔅𝔇𝔊𝔍𝔔𝔖𝔜𝔞𝔹𝔻𝔾𝕀𝕄𝕊𝕐𝕒𝚥𝚨𝛀𝛂𝛚𝛜𝛺𝛼𝜔𝜖𝜴𝜶𝝎𝝐𝝮𝝰𝞈𝞊𝞨𝞪𝟂𝟄𝟋"
-    u"𞠀𞣄𞤀𞥃𞸀𞸃𞸅𞸟𞸡𞸢𞸩𞸲𞸴𞸷𞹍𞹏𞹑𞹒𞹡𞹢𞹧𞹪𞹬𞹲𞹴𞹷𞹹𞹼𞺀𞺉𞺋𞺛𞺡𞺣𞺥𞺩𞺫𞺻𠀀𪛖𪜀𫜴𫝀𫠝𫠠𬺡𬺰𮯠")
+    u"𞠀𞣄𞤀𞥃𞸀𞸃𞸅𞸟𞸡𞸢𞸩𞸲𞸴𞸷𞹍𞹏𞹑𞹒𞹡𞹢𞹧𞹪𞹬𞹲𞹴𞹷𞹹𞹼𞺀𞺉𞺋𞺛𞺡𞺣𞺥𞺩𞺫𞺻𠀀𪛖𪜀𫜴𫝀𫠝𫠠𬺡𬺰𮯠"
+)
 unicode_continuation_ch_any = (
-    u"··়ׇֿٰܑ߽ৗ਼৾ੑੵ઼଼ஂௗ಼ൗ්ූัັ༹༵༷࿆᳭ᢩ៝⁔⵿⃡꙯ꠂ꠆ꠋꧥꩃﬞꪰ꫁＿𑅳𐨿𐇽𐋠𑈾𑍗𑩇𑑞𑴺𑵇𝩵𝪄")
+    u"··়ׇֿٰܑ߽ৗ਼৾ੑੵ઼଼ஂௗ಼ൗ්ූัັ༹༵༷࿆᳭ᢩ៝⁔⵿⃡꙯ꠂ꠆ꠋꧥꩃﬞꪰ꫁＿𑅳𐨿𐇽𐋠𑈾𑍗𑩇𑑞𑴺𑵇𝩵𝪄"
+)
 unicode_continuation_ch_range = (
    u"09ֽׁׂًؚ֑ׅ̀ͯ҃҇ׄؐ٩۪ۭۖۜ۟ۤۧۨ۰۹ܰ݊ަް߀߉࡙࡛࣓ࣣ߫߳ࠖ࠙ࠛࠣࠥࠧࠩ࠭࣡ःऺ़ाॏ॑ॗॢॣ०९ঁঃ"
    u"াৄেৈো্ৢৣ০৯ਁਃਾੂੇੈੋ੍੦ੱઁઃાૅેૉો્ૢૣ૦૯ૺ૿ଁଃାୄେୈୋ୍ୖୗୢୣ୦୯ாூெைொ்௦௯ఀఄాౄ"
@@ -192,7 +196,8 @@ unicode_continuation_ch_range = (
    u"︠︯︳︴﹍﹏０９ﾞﾟ𐍶𐍺𐒠𐒩𐨁𐨃𐨅𐨆𐨌𐨺𐫦𐨏𐨸𐫥𐴤𐴧𐴰𐴹𐽆𐽐𑀀𑀂𑀸𑁆𑁦𑁯𑁿𑂂𑂰𑂺𑃰𑃹𑄀𑄂𑄧𑄴𑄶𑄿𑅅𑅆𑆀𑆂𑆳𑇀𑇉𑇌𑇐𑇙𑈬𑈷"
    u"𑋟𑋪𑋰𑋹𑌀𑌃𑌻𑌼𑌾𑍄𑍇𑍈𑍋𑍍𑍢𑍣𑍦𑍬𑍰𑍴𑐵𑑆𑑐𑑙𑒰𑓃𑓐𑓙𑖯𑖵𑖸𑗀𑗜𑗝𑘰𑙀𑙐𑙙𑚫𑚷𑛀𑛉𑜝𑜫𑜰𑜹𑠬𑠺𑣠𑣩𑨁𑨊𑨳𑨹𑨻𑨾𑩑𑩛𑪊𑪙"
    u"𑰯𑰶𑰸𑰿𑱐𑱙𑲒𑲧𑲩𑲶𑴱𑴶𑴼𑴽𑴿𑵅𑵐𑵙𑶊𑶎𑶐𑶑𑶓𑶗𑶠𑶩𑻳𑻶𖩠𖩩𖫰𖫴𖬰𖬶𖭐𖭙𖽑𖽾𖾏𖾒𛲝𛲞𝅩𝅥𝅲𝅻𝆂𝆋𝅭𝆅𝆪𝆭𝉂𝉄𝟎𝟿𝨀𝨶𝨻𝩬"
-    u"𝪛𝪟𝪡𝪯𞥊𞣐𞣖𞀀𞀆𞀈𞀘𞀛𞀡𞀣𞀤𞀦𞀪𞥄𞥐𞥙")
+    u"𝪛𝪟𝪡𝪯𞥊𞣐𞣖𞀀𞀆𞀈𞀘𞀛𞀡𞀣𞀤𞀦𞀪𞥄𞥐𞥙"
+)
 # END GENERATED CODE
--- a/bin/cython-generate-lexicon.py
+++ b/bin/cython-generate-lexicon.py
@@ -11,21 +11,23 @@
 #    --overwrite    to update the existing Lexicon.py file
 #    --here         to create a copy of Lexicon.py in the current directory
+import functools
+import re
+import os
 import sys
 from io import StringIO
-import os
-import functools
 # Make sure we import the right Cython
 cythonpath, _ = os.path.split(os.path.realpath(__file__)) # bin directory
 cythonpath, _ = os.path.split(cythonpath)
-if os.path.exists(os.path.join(cythonpath,"Cython")):
+if os.path.exists(os.path.join(cythonpath, "Cython")):
    sys.path.insert(0, cythonpath)
    print("Found (and using) local cython directory")
 # else we aren't in a development directory
 from Cython.Compiler import Lexicon
 def main():
    arg = '--overwrite'
    if len(sys.argv) == 2:
@@ -37,35 +39,24 @@ def main():
 """)
        return
-    generated_code = StringIO()
+    generated_code = (
-    print("# generated with:\n #", sys.implementation.name, sys.version, file=generated_code)
+        f"# generated with:\n"
-    print(file=generated_code)
+        f"# {sys.implementation.name} {sys.version.splitlines()[0].strip()}\n"
-    print(start_expression(), file=generated_code)
+        "\n"
-    print(file=generated_code)
+        f"{generate_character_sets()}\n"
-    print(cont_expression(), file=generated_code)
+    )
-    print(file=generated_code)
-    generated_code = generated_code.getvalue()
-    output = StringIO()
-    mode = 0 # 1 when found generated section, 2 afterwards
    print("Reading file", Lexicon.__file__)
-    with open(Lexicon.__file__,'r') as f:
+    with open(Lexicon.__file__, 'r') as f:
-        for line in f:
+        parts = re.split(r"(# (?:BEGIN|END) GENERATED CODE\n?)", f.read())
-            if mode != 1:
-                output.write(line)
+    if len(parts) not in (4,5) or ' GENERATED CODE' not in parts[1] or ' GENERATED CODE' not in parts[3]:
-            else:
-                if line.strip() == "# END GENERATED CODE":
-                    mode = 2
-                    output.write(line)
-            if mode == 0:
-                if line.strip() == "# BEGIN GENERATED CODE":
-                    mode = 1
-                    output.write(generated_code)
-    if mode != 2:
        print("Warning: generated code section not found - code not inserted")
        return
+    parts[2] = generated_code
+    output = "".join(parts)
    if arg == "--here":
        outfile = "Lexicon.py"
    else:
@@ -73,23 +64,26 @@ def main():
        outfile = Lexicon.__file__
    print("Writing to file", outfile)
-    with open(outfile,'w') as f:
+    with open(outfile, 'w') as f:
-        f.write(output.getvalue())
+        f.write(output)
 # The easiest way to generate an appropriate character set is just to use the str.isidentifier method
 # An alternative approach for getting character sets is at https://stackoverflow.com/a/49332214/4657412
-@functools.lru_cache(None)
+@functools.lru_cache()
 def get_start_characters_as_number():
    return [ i for i in range(sys.maxunicode) if str.isidentifier(chr(i)) ]
 def get_continue_characters_as_number():
    return [ i for i in range(sys.maxunicode) if str.isidentifier('a'+chr(i)) ]
 def get_continue_not_start_as_number():
    start = get_start_characters_as_number()
    cont = get_continue_characters_as_number()
-    return sorted(set(cont)-set(start))
+    return sorted(set(cont) - set(start))
 def to_ranges(char_num_list):
    # Convert the large lists of character digits to
@@ -106,47 +100,32 @@ def to_ranges(char_num_list):
            if first_good_val == char_num_list[n-1]:
                single_chars.append(chr(char_num_list[n-1]))
            else:
-                ranges.append(chr(first_good_val)+chr(char_num_list[n-1]))
+                ranges.append(chr(first_good_val) + chr(char_num_list[n-1]))
            first_good_val = char_num_list[n]
-    return single_chars, ranges
-def make_split_strings(chars, splitby=60):
-    out = []
-    for i in range(0, len(chars), splitby):
-        out.append('u"{}"'.format("".join(chars[i:i+splitby])))
-    return "\n    ".join(out)
-def start_expression():
-    output = StringIO()
-    print("unicode_start_ch_any = (\n    ", end='', file=output)
-    single_chars, ranges = to_ranges(get_start_characters_as_number())
+    return ''.join(single_chars), ''.join(ranges)
-    single_chars = "".join(single_chars)
-    ranges = "".join(ranges)
-    print(make_split_strings(single_chars), end='', file=output)
-    print(")", file=output)
-    print("unicode_start_ch_range = (\n    ", end='', file=output)
-    print(make_split_strings(ranges), end='', file=output)
-    print(")", file=output)
-    return output.getvalue()
+def make_split_strings(chars, splitby=60, indent="    "):
+    lines = [f'u"{chars[i:i+splitby]}"' for i in range(0, len(chars), splitby)]
+    return indent + f"\n{indent}".join(lines)
-def cont_expression():
-    output = StringIO()
-    print("unicode_continuation_ch_any = (\n    ", end='', file=output)
-    single_chars, ranges = to_ranges(get_continue_not_start_as_number())
+def generate_character_sets():
-    single_chars = "".join(single_chars)
+    declarations = []
-    ranges = "".join(ranges)
+    for char_type, char_generator in [
+        ("unicode_start_ch", get_start_characters_as_number),
+        ("unicode_continuation_ch", get_continue_not_start_as_number),
+    ]:
+        for set_type, chars in zip(("any", "range"), to_ranges(char_generator())):
+            declarations.append(
+                f"{char_type}_{set_type} = (\n"
+                f"{make_split_strings(chars)}\n"
+                f")\n"
+            )
-    print(make_split_strings(single_chars), end='', file=output)
+    return "".join(declarations)
-    print(")", file=output)
-    print("unicode_continuation_ch_range = (\n    ", end='', file=output)
-    print(make_split_strings(ranges), end='', file=output)
-    print(")", file=output)
-    return output.getvalue()
 if __name__ == "__main__":
    main()