#!/usr/bin/env python3

#
#   Updates Cython's Lexicon.py with the unicode characters that are accepted as
#   identifiers. Should be run with the most recent version of Python possible
#   to ensure that Lexicon is as complete as possible.
#
#   Python3 only (it relies on str.isidentifier which is a Python 3 addition)
#
#   Run with either
#    --overwrite    to update the existing Lexicon.py file (the default)
#    --here         to create a copy of Lexicon.py in the current directory

14 15 16
import functools
import re
import os
17 18 19 20 21
import sys

# Prefer a Cython source tree that sits next to this script's directory over
# whatever Cython happens to be installed.
cythonpath = os.path.dirname(  # parent of the directory holding this script
    os.path.dirname(os.path.realpath(__file__))
)
if os.path.exists(os.path.join(cythonpath, "Cython")):
    # Put the local tree first so the import below resolves to it.
    sys.path.insert(0, cythonpath)
    print("Found (and using) local cython directory")
# Otherwise we are not in a development directory and the regular import
# path is used unchanged.

from Cython.Compiler import Lexicon

29

30 31 32 33 34 35 36 37 38 39 40
def main():
    """Regenerate the identifier character tables inside Cython's Lexicon.py.

    Accepts at most one command-line argument:
      --overwrite  rewrite the imported Lexicon.py in place (the default)
      --here       write the updated copy to "Lexicon.py" in the current
                   directory

    Any other invocation prints the usage text and returns without reading
    or writing anything. The same happens (with a warning) when the
    "# BEGIN GENERATED CODE" / "# END GENERATED CODE" markers cannot be
    located in the existing file.
    """
    arg = '--overwrite'
    if len(sys.argv) == 2:
        arg = sys.argv[1]
    if len(sys.argv) > 2 or arg not in ['--overwrite','--here']:
        print("""Call the script with either:
  --overwrite    to update the existing Lexicon.py file (default)
  --here         to create an version of Lexicon.py in the current directory
""")
        return

    generated_code = (
        f"# generated with:\n"
        f"# {sys.implementation.name} {sys.version.splitlines()[0].strip()}\n"
        "\n"
        f"{generate_character_sets()}\n"
    )

    print("Reading file", Lexicon.__file__)
    # The generated section holds raw non-ASCII characters, so the encoding
    # is pinned instead of depending on the platform's locale default.
    with open(Lexicon.__file__, 'r', encoding="utf-8") as f:
        parts = re.split(r"(# (?:BEGIN|END) GENERATED CODE\n?)", f.read())

    # With one capturing group, a file containing each marker exactly once
    # splits into: [before, BEGIN marker, old code, END marker, after].
    markers_ok = (
        len(parts) == 5
        and parts[1].startswith("# BEGIN")
        and parts[3].startswith("# END")
    )
    if not markers_ok:
        print("Warning: generated code section not found - code not inserted")
        return

    parts[2] = generated_code
    output = "".join(parts)

    if arg == "--here":
        outfile = "Lexicon.py"
    else:
        assert arg == "--overwrite"
        outfile = Lexicon.__file__

    print("Writing to file", outfile)
    with open(outfile, 'w', encoding="utf-8") as f:
        f.write(output)
68 69 70 71


# The easiest way to generate an appropriate character set is to use the str.isidentifier method.
# An alternative approach for getting character sets is at https://stackoverflow.com/a/49332214/4657412
@functools.lru_cache()
def get_start_characters_as_number():
    """Return the code points that are valid as a one-character identifier.

    Every code point from 0 up to and including sys.maxunicode is tested
    with str.isidentifier(), so the result reflects the Unicode data of the
    running interpreter. The list is in ascending order.

    The result is cached; every caller receives the same list object and
    must not modify it.
    """
    # sys.maxunicode is itself a code point, hence the "+ 1".
    return [i for i in range(sys.maxunicode + 1) if chr(i).isidentifier()]

76

77 78 79
def get_continue_characters_as_number():
    """Return the code points that may follow the first identifier character.

    A code point qualifies when appending it to 'a' still gives a string
    accepted by str.isidentifier(). Every code point from 0 up to and
    including sys.maxunicode is tested; the list is in ascending order.
    """
    # sys.maxunicode is itself a code point, hence the "+ 1".
    return [i for i in range(sys.maxunicode + 1) if ('a' + chr(i)).isidentifier()]

80

81 82 83
def get_continue_not_start_as_number():
    """Return, sorted, the code points allowed inside but not at the start of an identifier.

    Computed as the continuation characters minus the start characters.
    """
    start_set = set(get_start_characters_as_number())
    continue_set = set(get_continue_characters_as_number())
    # The subtraction below only makes sense if start is a subset of continue.
    assert start_set.issubset(continue_set), \
        "We assume that all identifier start characters are also continuation characters."
    return sorted(continue_set - start_set)
87

88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103

def to_ranges(char_num_list):
    """Compress code points into isolated characters and closed ranges.

    The code points are sorted and de-duplicated, then grouped into runs of
    consecutive values.

    Args:
        char_num_list: iterable of integer code points, in any order.

    Returns:
        A tuple ``(single_chars, ranges)`` of two strings:
          * ``single_chars`` holds every character whose run has length one;
          * ``ranges`` is a flat sequence of character pairs, each pair being
            the first and last character (both inclusive) of a run of two or
            more consecutive code points.
        Both strings are empty when the input is empty.
    """
    code_points = sorted(set(char_num_list))
    if not code_points:
        return '', ''

    single_chars = []
    ranges = []

    def close_run(first, last):
        # Record one finished run in the matching output list.
        if first == last:
            single_chars.append(chr(first))
        else:
            ranges.append(chr(first) + chr(last))

    run_start = previous = code_points[0]
    for current in code_points[1:]:
        if current - 1 != previous:
            # discontinuous: the run ending at `previous` is complete
            close_run(run_start, previous)
            run_start = current
        previous = current

    # The last run is not followed by a gap, so it must be closed explicitly;
    # otherwise the highest characters would be lost.
    close_run(run_start, previous)

    return ''.join(single_chars), ''.join(ranges)
108 109


110 111 112
def make_split_strings(chars, splitby=60, indent="    "):
    """Format *chars* as indented, adjacent ``u"..."`` string literals.

    Args:
        chars: the characters to embed. They are inserted verbatim, without
            escaping, so the text must not contain ``"`` or ``\\``.
        splitby: maximum number of characters per literal.
        indent: prefix placed in front of every literal.

    Returns:
        One literal per line, joined with newlines (no trailing newline).
        Empty input yields a single ``u""`` literal.
    """
    if not chars:
        # Without any literal, the "(\n ... \n)" wrapper that
        # generate_character_sets() puts around this text would evaluate to
        # an empty tuple instead of an empty string.
        return f'{indent}u""'
    lines = [f'u"{chars[i:i+splitby]}"' for i in range(0, len(chars), splitby)]
    return indent + f"\n{indent}".join(lines)
113 114


115 116 117 118 119 120 121 122 123 124 125 126
def generate_character_sets():
    """Build the source text of the four identifier character-set assignments.

    For each of the prefixes "unicode_start_ch" and "unicode_continuation_ch"
    an "_any" variable (isolated characters) and a "_range" variable
    (first/last pairs of closed ranges) is emitted, each as a parenthesised
    group of string literals.
    """
    producers = {
        "unicode_start_ch": get_start_characters_as_number,
        "unicode_continuation_ch": get_continue_not_start_as_number,
    }

    pieces = []
    for prefix, producer in producers.items():
        singles, spans = to_ranges(producer())
        for suffix, text in (("any", singles), ("range", spans)):
            literal_lines = make_split_strings(text)
            pieces.append(f"{prefix}_{suffix} = (\n{literal_lines}\n)\n")

    return "".join(pieces)
129 130 131 132


# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()