Handle normalization of unicode identifiers (GH-3096)

ca8763a2 · da-woods · Stefan Behnel · 00c1dc96 · ca8763a2 · ca8763a2
Commit ca8763a2 authored Sep 02, 2019 by da-woods Committed by Stefan Behnel Sep 02, 2019
10 changed files
--- a/Cython/Compiler/Lexicon.py
+++ b/Cython/Compiler/Lexicon.py
@@ -85,7 +85,7 @@ def make_lexicon():
    comment = Str("#") + Rep(AnyBut("\n"))

    return Lexicon([
-        (name, IDENT),
+        (name, Method('normalize_ident')),
        (intliteral, Method('strip_underscores', symbol='INT')),
        (fltconst, Method('strip_underscores', symbol='FLOAT')),
        (imagconst, Method('strip_underscores', symbol='IMAG')),

--- a/Cython/Compiler/Scanning.py
+++ b/Cython/Compiler/Scanning.py
@@ -12,6 +12,7 @@ cython.declare(make_lexicon=object, lexicon=object,

 import os
 import platform
+from unicodedata import normalize

 from .. import Utils
 from ..Plex.Scanners import Scanner
@@ -341,6 +342,13 @@ class PyrexScanner(Scanner):
        self.sy = ''
        self.next()

+    def normalize_ident(self, text):
+        try:
+            text.encode('ascii') # equivalent to text.isascii(), but works on both Python 2 and 3
+        except UnicodeEncodeError:
+            text = normalize('NFKC', text)  # non-ASCII identifier: fold it to its NFKC form
+        self.produce(IDENT, text)  # emit the (possibly normalized) text as an IDENT token
+
    def commentline(self, text):
        if self.parse_comments:
            self.produce('commentline', text)

--- a/Cython/Utils.py
+++ b/Cython/Utils.py
@@ -216,7 +216,7 @@ def decode_filename(filename):
 _match_file_encoding = re.compile(br"(\w*coding)[:=]\s*([-\w.]+)").search


-def detect_opened_file_encoding(f):
+def detect_opened_file_encoding(f, default='UTF-8'):
    # PEPs 263 and 3120
    # Most of the time the first two lines fall in the first couple of hundred chars,
    # and this bulk read/split is much faster.
@@ -236,7 +236,7 @@ def detect_opened_file_encoding(f):
        m = _match_file_encoding(lines[1])
        if m:
            return m.group(2).decode('iso8859-1')
-    return "UTF-8"
+    return default


 def skip_bom(f):

--- a/runtests.py
+++ b/runtests.py
@@ -545,9 +545,14 @@ class build_ext(_build_ext):
 class ErrorWriter(object):
    match_error = re.compile(r'(warning:)?(?:.*:)?\s*([-0-9]+)\s*:\s*([-0-9]+)\s*:\s*(.*)').match

-    def __init__(self):
+    def __init__(self, encoding=None):
        self.output = []
-        self.write = self.output.append
+        self.encoding = encoding  # encoding to re-decode written text with; None/empty leaves text untouched
+
+    def write(self, value):
+        if self.encoding:
+            value = value.encode('ISO-8859-1').decode(self.encoding)  # caller reads the source as ISO-8859-1: recover the bytes, then decode properly
+        self.output.append(value)

    def _collect(self):
        s = ''.join(self.output)
@@ -1002,6 +1007,13 @@ class CythonCompileTestCase(unittest.TestCase):

    def split_source_and_output(self, test_directory, module, workdir):
        source_file = self.find_module_source_file(os.path.join(test_directory, module) + '.pyx')
+
+        from Cython.Utils import detect_opened_file_encoding
+        with io_open(source_file, 'rb') as f:
+            # encoding is passed to ErrorWriter but not used on the source
+            # since it is sometimes deliberately wrong
+            encoding = detect_opened_file_encoding(f, default=None)
+
        with io_open(source_file, 'r', encoding='ISO-8859-1') as source_and_output:
            error_writer = warnings_writer = None
            out = io_open(os.path.join(workdir, module + os.path.splitext(source_file)[1]),
@@ -1010,10 +1022,10 @@ class CythonCompileTestCase(unittest.TestCase):
                for line in source_and_output:
                    if line.startswith("_ERRORS"):
                        out.close()
-                        out = error_writer = ErrorWriter()
+                        out = error_writer = ErrorWriter(encoding=encoding)
                    elif line.startswith("_WARNINGS"):
                        out.close()
-                        out = warnings_writer = ErrorWriter()
+                        out = warnings_writer = ErrorWriter(encoding=encoding)
                    else:
                        out.write(line)
            finally:

--- a/tests/errors/unicode_identifiers_e1.pyx
+++ b/tests/errors/unicode_identifiers_e1.pyx
+# -*- coding: utf-8 -*-
+# mode: error
+
+★1 = 5 # invalid start symbol
+
+_ERRORS = u"""
+4:0: Unrecognized character
+"""
--- a/tests/errors/unicode_identifiers_e2.pyx
+++ b/tests/errors/unicode_identifiers_e2.pyx
+# -*- coding: utf-8 -*-
+# mode: error
+
+class MyClass₡: # invalid continue symbol
+    pass
+
+_ERRORS = u"""
+4:13: Unrecognized character
+"""
--- a/tests/errors/unicode_identifiers_e3.pyx
+++ b/tests/errors/unicode_identifiers_e3.pyx
+# -*- coding: utf-8 -*-
+# mode: error
+
+def f():
+    a = 1
+    ́b = 2 # looks like an indentation error but is actually a combining accent as the first character in column 4
+    c = 3
+
+_ERRORS = u"""
+6:4: Unrecognized character
+"""
--- a/tests/errors/unicode_identifiers_e4.pyx
+++ b/tests/errors/unicode_identifiers_e4.pyx
+# -*- coding: utf-8 -*-
+# mode: error
+
+cdef class C:
+    # these two symbols "\u1e69" and "\u1e9b\u0323" normalize to the same thing
+    # so the two attributes can't coexist
+    cdef int ṩomething
+    cdef double ẛ̣omething
+
+_ERRORS = u"""
+7:13: Previous declaration is here
+8:16: 'ṩomething' redeclared
+"""
--- a/tests/run/unicode_identifiers.pyx
+++ b/tests/run/unicode_identifiers.pyx
@@ -49,6 +49,12 @@ if sys.version_info[0]>2:
    10
    >>> NormalClassΓΓ().εxciting_function(None).__qualname__
    'NormalClassΓΓ.εxciting_function.<locals>.nestεd'
+
+    Do kwargs work?
+    >>> unicode_kwarg(αrg=5)
+    5
+    >>> unicode_kwarg_from_cy()
+    1
    """
 else:
    __doc__ = ""
@@ -184,6 +190,28 @@ class NormalClassΓΓ(Γναμε2):
            pass
        return nestεd

+def unicode_kwarg(*,αrg):  # keyword-only argument with a non-ASCII name
+    return αrg
+
+def unicode_kwarg_from_cy():  # makes the same keyword call from within this Cython module
+    return unicode_kwarg(αrg=1)
+
+cdef class NormalizeAttrCdef:
+    """Python normalizes identifier names before they are used;
+    therefore ﬁ and fi should access the same attribute.
+    A more comprehensive version of this is in "unicode_identifiers_normalization.srctree"
+    comparing the behaviour to Python. The version here shows it
+    behaves the same in a cdef class and is tested with Python 2
+
+    >>> NormalizeAttrCdef().get()
+    5
+    """
+    cdef int ﬁ # note unicode ligature symbol; declared with the ligature spelling
+    def __init__(self):
+        self.fi = 5  # assigned through the plain two-letter spelling
+    def get(self):
+        return self.ﬁ  # read back through the ligature spelling
+
 if sys.version_info[0]<=2:
    # These symbols are causing problems for doctest
    del NormalClassΓΓ

--- a/tests/run/unicode_identifiers_normalization.srctree
+++ b/tests/run/unicode_identifiers_normalization.srctree
+# -*- coding: utf-8 -*-
+# mode: run
+# tag: pure3.0, pep3131
+
+PYTHON build_tests.py
+# show behaviour in Python mode
+PYTHON -m doctest test0.py
+PYTHON -m doctest test1.py
+PYTHON -m doctest test2.py
+
+PYTHON setup.py build_ext --inplace
+# test in Cython mode
+PYTHON -c "import doctest; import test0 as m; exit(doctest.testmod(m)[0])"
+PYTHON -c "import doctest; import test1 as m; exit(doctest.testmod(m)[0])"
+PYTHON -c "import doctest; import test2 as m; exit(doctest.testmod(m)[0])"
+
+########## setup.py #########
+
+from Cython.Build.Dependencies import cythonize
+from distutils.core import setup
+
+setup(
+  ext_modules = cythonize("test*.py"),
+)
+
+######### build_tests.py ########
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+import sys
+import unicodedata
+
+# a few pairs of unicode strings that should be equivalent after normalization
+string_pairs = [("ﬁ", "fi"), # ligature and two letters
+                ("a\u0301", '\u00e1'), # a with acute accent with combining character or as 1 character
+                ("α\u0334\u0362", "α\u0362\u0334") # alpha with a pair of combining characters
+                    # in a different order. No single character to normalize to
+                ]
+
+# Show that the pairs genuinely aren't equal before normalization
+for first, second in string_pairs:
+    assert first != second  # the raw spellings must differ ...
+    assert unicodedata.normalize('NFKC', first) == unicodedata.normalize('NFKC', second)  # ... yet agree once normalized
+    
+# Code templates that access an identifier through its two different spellings;
+# each template contains a doctest.
+example_code = [
+"""
+class C:
+    '''
+    >>> C().get()
+    True
+    '''
+    def __init__(self):
+        self.{0} = True
+    def get(self):
+        return self.{1}
+""", """
+def pass_through({0}):
+    '''
+    >>> pass_through(True)
+    True
+    '''
+    return {1}
+""", """
+import cython
+{0} = True
+def test():
+    '''
+    >>> test()
+    True
+    '''
+    return {1}
+"""]
+
+import io
+
+# Write one test module per (template, identifier pair).
+# The generated files declare themselves as UTF-8 and contain non-ASCII
+# identifiers, so they are encoded explicitly instead of relying on the
+# locale's default encoding, which may be unable to represent them.
+# io.open() accepts the 'encoding' argument on both Python 2 and 3; the
+# strings written are unicode on Python 2 thanks to unicode_literals.
+for idx, (code, pair) in enumerate(zip(example_code, string_pairs)):
+    with io.open("test{0}.py".format(idx), "w", encoding="utf-8") as f:
+        if sys.version_info[0] > 2:
+            f.write("# -*- coding: utf-8 -*-\n")
+            f.write(code.format(*pair))
+        else:
+            f.write("\n") # code isn't Python 2 compatible - write a dummy file