unicode imports (#3119)

* Handle normalization of unicode identifiers
* Support unicode characters in module names (only valid under Python 3)

unicode imports (#3119)
* Handle normalization of unicode identifiers
* Support unicode characters in module names (only valid under Python 3)
074362b4 · da-woods · Stefan Behnel · 7e233ab0 · 074362b4 · 074362b4
Commit 074362b4 authored Sep 30, 2019 by da-woods Committed by Stefan Behnel Sep 30, 2019
10 changed files
--- a/Cython/Build/Dependencies.py
+++ b/Cython/Build/Dependencies.py
@@ -783,6 +783,8 @@ def create_extension_list(patterns, exclude=None, ctx=None, aliases=None, quiet=
    create_extension = ctx.options.create_extension or default_create_extension
    for pattern in patterns:
+        if not isinstance(pattern, (Extension_distutils, Extension_setuptools)):
+            pattern = encode_filename_in_py2(pattern)
        if isinstance(pattern, str):
            filepattern = pattern
            template = Extension(pattern, [])  # Fake Extension without sources

--- a/Cython/Compiler/Main.py
+++ b/Cython/Compiler/Main.py
@@ -32,8 +32,23 @@ from .. import Utils
 from . import Options
 from .Options import CompilationOptions, default_options
 from .CmdLine import parse_command_line
+from .Lexicon import (unicode_start_ch_any, unicode_continuation_ch_any,
+                      unicode_start_ch_range, unicode_continuation_ch_range)
+def _make_range_re(chrs):
+    out = []
+    for i in range(0, len(chrs), 2):
+        out.append(u"{0}-{1}".format(chrs[i], chrs[i+1]))
+    return u"".join(out)
+# py2 version looked like r"[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)*$"
+module_name_pattern = u"[{0}{1}][{0}{2}{1}{3}]*".format(
+    unicode_start_ch_any, _make_range_re(unicode_start_ch_range),
+    unicode_continuation_ch_any,
+    _make_range_re(unicode_continuation_ch_range))
+module_name_pattern = re.compile(u"{0}(\\.{0})*$".format(module_name_pattern))
-module_name_pattern = re.compile(r"[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)*$")
 standard_include_path = os.path.abspath(
    os.path.join(os.path.dirname(os.path.dirname(__file__)), 'Includes'))
@@ -158,7 +173,7 @@ class Context(object):
        if not module_name_pattern.match(qualified_name):
            raise CompileError(pos or (module_name, 0, 0),
-                               "'%s' is not a valid module name" % module_name)
+                               u"'%s' is not a valid module name" % module_name)
        if relative_to:
            if debug_find_module:
@@ -433,6 +448,12 @@ def create_default_resultobj(compilation_source, options):
 def run_pipeline(source, options, full_module_name=None, context=None):
    from . import Pipeline
+    # ensure that the inputs are unicode (for Python 2)
+    if sys.version_info[0] == 2:
+        source = source.decode(sys.getfilesystemencoding())
+        if full_module_name:
+            full_module_name = full_module_name.decode("utf-8")
    source_ext = os.path.splitext(source)[1]
    options.configure_language_defaults(source_ext[1:]) # py/pyx
    if context is None:
@@ -442,6 +463,7 @@ def run_pipeline(source, options, full_module_name=None, context=None):
    cwd = os.getcwd()
    abs_path = os.path.abspath(source)
    full_module_name = full_module_name or context.extract_module_name(source, options)
+    full_module_name = EncodedString(full_module_name)
    Utils.raise_error_if_module_name_forbidden(full_module_name)
@@ -611,7 +633,6 @@ def search_include_directories(dirs, qualified_name, suffix, pos, include=False)
    The 'include' option will disable package dereferencing.
    """
    if pos:
        file_desc = pos[0]
        if not isinstance(file_desc, FileSourceDescriptor):
@@ -662,7 +683,6 @@ def search_include_directories(dirs, qualified_name, suffix, pos, include=False)
        # search for namespaces second - PEP420
        for package_dir in namespace_dirs:
            # matches modules of the form: <dir>/foo/bar.pxd
            path = os.path.join(package_dir, module_filename)
            if os.path.exists(path):

--- a/Cython/Compiler/ModuleNode.py
+++ b/Cython/Compiler/ModuleNode.py
--- a/Cython/Compiler/Naming.py
+++ b/Cython/Compiler/Naming.py
@@ -59,7 +59,7 @@ convert_func_prefix = pyrex_prefix + "convert_"
 closure_scope_prefix = pyrex_prefix + "scope_"
 closure_class_prefix = pyrex_prefix + "scope_struct_"
 lambda_func_prefix = pyrex_prefix + "lambda_"
-module_is_main   = pyrex_prefix + "module_is_main_"
+module_is_main   = pyrex_prefix + "module_is_main"
 defaults_struct_prefix = pyrex_prefix + "defaults"
 dynamic_args_cname = pyrex_prefix + "dynamic_args"
@@ -163,8 +163,11 @@ exc_vars = (exc_type_name, exc_value_name, exc_tb_name)
 api_name        = pyrex_prefix + "capi__"
-h_guard_prefix   = "__PYX_HAVE__"
+# the h and api guards get changed to:
-api_guard_prefix = "__PYX_HAVE_API__"
+#  __PYX_HAVE__FILENAME (for ascii filenames)
+#  __PYX_HAVE_U_PUNYCODEFILENAME (for non-ascii filenames)
+h_guard_prefix   = "__PYX_HAVE_"
+api_guard_prefix = "__PYX_HAVE_API_"
 api_func_guard   = "__PYX_HAVE_API_FUNC_"
 PYX_NAN          = "__PYX_NAN()"

--- a/Cython/Compiler/StringEncoding.py
+++ b/Cython/Compiler/StringEncoding.py
@@ -146,6 +146,16 @@ class EncodedString(_unicode):
            s = bytes_literal(self.byteencode(), self.encoding)
        return s.as_c_string_literal()
+    if not hasattr(_unicode, "isascii"):
+        def isascii(self):
+            # fallback only: skipped on Python 3.7+, where the base class already provides isascii()
+            try:
+                self.encode("ascii")
+            except UnicodeEncodeError:
+                return False
+            else:
+                return True
 def string_contains_surrogates(ustring):
    """
@@ -191,6 +201,11 @@ class BytesLiteral(_bytes):
        value = split_string_literal(escape_byte_string(self))
        return '"%s"' % value
+    if not hasattr(_bytes, "isascii"):
+        def isascii(self):
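+            # fallback only: skipped on Python 3.7+, where the base class already provides isascii(); NOTE(review): always returns True - confirm intended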
+            return True
 def bytes_literal(s, encoding):
    assert isinstance(s, bytes)
@@ -206,6 +221,12 @@ def encoded_string(s, encoding):
        s.encoding = encoding
    return s
+def encoded_string_or_bytes_literal(s, encoding):
+    if isinstance(s, bytes):
+        return bytes_literal(s, encoding)
+    else:
+        return encoded_string(s, encoding)
 char_from_escape_sequence = {
    r'\a' : u'\a',

--- a/Cython/StringIOTree.py
+++ b/Cython/StringIOTree.py
@@ -39,6 +39,7 @@ try:
    from cStringIO import StringIO
 except ImportError:
    from io import StringIO
+import sys
 class StringIOTree(object):

--- a/Cython/Utility/CConvert.pyx
+++ b/Cython/Utility/CConvert.pyx
@@ -18,7 +18,7 @@ cdef {{struct_type}} {{funcname}}(obj) except *:
        value = obj['{{member.name}}']
    except KeyError:
        raise ValueError("No value specified for struct attribute '{{member.name}}'")
-    result.{{member.cname}} = value
+    result.{{member.name}} = value
    {{endfor}}
    return result

--- a/tests/run/struct_conversion.pyx
+++ b/tests/run/struct_conversion.pyx
@@ -167,3 +167,19 @@ def test_nested_obj_to_struct(NestedStruct nested):
                                            nested.mystruct.s.decode('UTF-8'),
                                            nested.d)
+cdef struct OverriddenCname:
+    int x "not_x"
+def test_obj_to_struct_cnames(OverriddenCname s):
+    """
+    >>> test_obj_to_struct_cnames({ 'x': 1 })
+    1
+    """
+    print(s.x)
+def test_struct_to_obj_cnames():
+    """
+    >>> test_struct_to_obj_cnames()
+    {'x': 2}
+    """
+    return OverriddenCname(2)
--- a/tests/run/unicode_identifiers.pyx
+++ b/tests/run/unicode_identifiers.pyx
@@ -51,10 +51,15 @@ if sys.version_info[0]>2:
    'NormalClassΓΓ.εxciting_function.<locals>.nestεd'
    Do kwargs work?
-    >>> unicode_kwarg(αrg=5)
+    >>> unicode_kwarg(αrγ=5)
    5
    >>> unicode_kwarg_from_cy()
    1
+    Normalization of attributes
+    (The cdef class version is testable in Python 2 too)
+    >>> NormalizeAttrPy().get()
+    5
    """
 else:
    __doc__ = ""
@@ -169,8 +174,8 @@ cdef class Derived(Γναμε2):
 cdef Γναμε2 global_ναμε3 = Γναμε2()
-def function_taking_fancy_argument(Γναμε2 αrg):
+def function_taking_fancy_argument(Γναμε2 αrγ):
-    return αrg
+    return αrγ
 class NormalClassΓΓ(Γναμε2):
    """
@@ -190,19 +195,23 @@ class NormalClassΓΓ(Γναμε2):
            pass
        return nestεd
-def unicode_kwarg(*,αrg):
+def unicode_kwarg(*, αrγ):
-    return αrg
+    return αrγ
 def unicode_kwarg_from_cy():
-    return unicode_kwarg(αrg=1)
+    return unicode_kwarg(αrγ=1)
-cdef class NormalizeAttrCdef:
+class NormalizeAttrPy:
    """Python normalizes identifier names before they are used;
-    therefore ﬁ and fi should access the same attribute.
+    therefore ﬁ and fi should access the same attribute"""
-    A more comprehensive version of this is in "unicode_identifiers_normalize.py"
+    def __init__(self):
-    comparing the behaviour to Python. The version here shows it
+        self.ﬁ = 5 # note unicode ligature symbol
-    behaves the same in a cdef class and is tested with Python 2
+    def get(self):
+        return self.fi
+cdef class NormalizeAttrCdef:
+    """Python normalizes identifier names before they are used;
+    therefore ﬁ and fi should access the same attribute
    >>> NormalizeAttrCdef().get()
    5
    """

--- a/tests/run/unicode_imports.srctree
+++ b/tests/run/unicode_imports.srctree
+# -*- coding: utf-8 -*-
+# tag: py3, pep489
+PYTHON setup.py build_ext --inplace
+PYTHON -m mydoctest
+########### mydoctest.py #######
+import sys
+if (sys.version_info[0] < 3 or
+    (sys.version_info[0] == 3 and sys.version_info[1] < 5)):
+    # The module is only Cythonized and not built for these versions
+    # so don't run the tests
+    exit()
+import doctest
+import from_py
+val = doctest.testmod(from_py)[0]
+import from_cy
+val += doctest.testmod(from_cy)[0]
+exit(val)
+########### setup.py ########
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+import sys
+from Cython.Build import cythonize
+files = ["mymoδ.pyx", "from_cy.pyx"]
+# For Python 2 and Python <= 3.4 just run pyx->c; 
+# don't compile the C file
+modules = cythonize(files)
+if sys.version_info >= (3, 5):
+    from distutils.core import setup
+    setup(
+        ext_modules = modules
+    )
+############ mymoδ.pyx #########
+def f():
+    return True
+cdef public api void cdef_func():
+    pass
+############ pxd_moδ.pxd ##########
+cdef struct S:
+    int x
+cdef public api void cdef_func() # just to test generation of headers
+############ from_py.py #########
+# -*- coding: utf-8 -*-
+import mymoδ
+from mymoδ import f
+__doc__ = """
+>>> mymoδ.f()
+True
+>>> f()
+True
+"""
+######### from_cy.pyx ##########
+# -*- coding: utf-8 -*-
+import mymoδ
+from mymoδ import f
+cimport pxd_moδ
+from pxd_moδ cimport S
+def test_imported():
+    """
+    >>> test_imported()
+    True
+    """
+    return mymoδ.f() and f() # True and True
+def test_cimported():
+    """
+    >>> test_cimported()
+    3
+    """
+    cdef pxd_moδ.S v1
+    v1.x = 1
+    cdef S v2
+    v2.x = 2
+    return v1.x + v2.x