Commit 074362b4, authored by da-woods, committed by Stefan Behnel

unicode imports (#3119)

* Handle normalization of unicode identifiers
* Support unicode characters in module names
  (Only valid under Python 3)
parent 7e233ab0
...@@ -783,6 +783,8 @@ def create_extension_list(patterns, exclude=None, ctx=None, aliases=None, quiet= ...@@ -783,6 +783,8 @@ def create_extension_list(patterns, exclude=None, ctx=None, aliases=None, quiet=
create_extension = ctx.options.create_extension or default_create_extension create_extension = ctx.options.create_extension or default_create_extension
for pattern in patterns: for pattern in patterns:
if not isinstance(pattern, (Extension_distutils, Extension_setuptools)):
pattern = encode_filename_in_py2(pattern)
if isinstance(pattern, str): if isinstance(pattern, str):
filepattern = pattern filepattern = pattern
template = Extension(pattern, []) # Fake Extension without sources template = Extension(pattern, []) # Fake Extension without sources
......
...@@ -32,8 +32,23 @@ from .. import Utils ...@@ -32,8 +32,23 @@ from .. import Utils
from . import Options from . import Options
from .Options import CompilationOptions, default_options from .Options import CompilationOptions, default_options
from .CmdLine import parse_command_line from .CmdLine import parse_command_line
from .Lexicon import (unicode_start_ch_any, unicode_continuation_ch_any,
unicode_start_ch_range, unicode_continuation_ch_range)
def _make_range_re(chrs):
out = []
for i in range(0, len(chrs), 2):
out.append(u"{0}-{1}".format(chrs[i], chrs[i+1]))
return u"".join(out)
# Regular expression used to validate (dotted) module names.
#
# Step 1 builds the pattern text for a single name component:
#   - first character class:      {0} + {1}
#   - following character class:  {0} + {2} + {1} + {3}
# where {0}/{2} are the "..._any" strings, inserted into the class as-is, and
# {1}/{3} are the "..._range" strings expanded to "a-b" ranges by
# _make_range_re().
# py2 version looked like r"[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)*$"
module_name_pattern = u"[{0}{1}][{0}{2}{1}{3}]*".format(
    unicode_start_ch_any, _make_range_re(unicode_start_ch_range),
    unicode_continuation_ch_any,
    _make_range_re(unicode_continuation_ch_range))
# Step 2 rebinds the same name to the compiled regex: one component followed
# by any number of "." + component groups, anchored at the end with "$".
module_name_pattern = re.compile(u"{0}(\\.{0})*$".format(module_name_pattern))
module_name_pattern = re.compile(r"[A-Za-z_][A-Za-z0-9_]*(\.[A-Za-z_][A-Za-z0-9_]*)*$")
standard_include_path = os.path.abspath( standard_include_path = os.path.abspath(
os.path.join(os.path.dirname(os.path.dirname(__file__)), 'Includes')) os.path.join(os.path.dirname(os.path.dirname(__file__)), 'Includes'))
...@@ -158,7 +173,7 @@ class Context(object): ...@@ -158,7 +173,7 @@ class Context(object):
if not module_name_pattern.match(qualified_name): if not module_name_pattern.match(qualified_name):
raise CompileError(pos or (module_name, 0, 0), raise CompileError(pos or (module_name, 0, 0),
"'%s' is not a valid module name" % module_name) u"'%s' is not a valid module name" % module_name)
if relative_to: if relative_to:
if debug_find_module: if debug_find_module:
...@@ -433,6 +448,12 @@ def create_default_resultobj(compilation_source, options): ...@@ -433,6 +448,12 @@ def create_default_resultobj(compilation_source, options):
def run_pipeline(source, options, full_module_name=None, context=None): def run_pipeline(source, options, full_module_name=None, context=None):
from . import Pipeline from . import Pipeline
# ensure that the inputs are unicode (for Python 2)
if sys.version_info[0] == 2:
source = source.decode(sys.getfilesystemencoding())
if full_module_name:
full_module_name = full_module_name.decode("utf-8")
source_ext = os.path.splitext(source)[1] source_ext = os.path.splitext(source)[1]
options.configure_language_defaults(source_ext[1:]) # py/pyx options.configure_language_defaults(source_ext[1:]) # py/pyx
if context is None: if context is None:
...@@ -442,6 +463,7 @@ def run_pipeline(source, options, full_module_name=None, context=None): ...@@ -442,6 +463,7 @@ def run_pipeline(source, options, full_module_name=None, context=None):
cwd = os.getcwd() cwd = os.getcwd()
abs_path = os.path.abspath(source) abs_path = os.path.abspath(source)
full_module_name = full_module_name or context.extract_module_name(source, options) full_module_name = full_module_name or context.extract_module_name(source, options)
full_module_name = EncodedString(full_module_name)
Utils.raise_error_if_module_name_forbidden(full_module_name) Utils.raise_error_if_module_name_forbidden(full_module_name)
...@@ -611,7 +633,6 @@ def search_include_directories(dirs, qualified_name, suffix, pos, include=False) ...@@ -611,7 +633,6 @@ def search_include_directories(dirs, qualified_name, suffix, pos, include=False)
The 'include' option will disable package dereferencing. The 'include' option will disable package dereferencing.
""" """
if pos: if pos:
file_desc = pos[0] file_desc = pos[0]
if not isinstance(file_desc, FileSourceDescriptor): if not isinstance(file_desc, FileSourceDescriptor):
...@@ -662,7 +683,6 @@ def search_include_directories(dirs, qualified_name, suffix, pos, include=False) ...@@ -662,7 +683,6 @@ def search_include_directories(dirs, qualified_name, suffix, pos, include=False)
# search for namespaces second - PEP420 # search for namespaces second - PEP420
for package_dir in namespace_dirs: for package_dir in namespace_dirs:
# matches modules of the form: <dir>/foo/bar.pxd # matches modules of the form: <dir>/foo/bar.pxd
path = os.path.join(package_dir, module_filename) path = os.path.join(package_dir, module_filename)
if os.path.exists(path): if os.path.exists(path):
......
This diff is collapsed.
...@@ -59,7 +59,7 @@ convert_func_prefix = pyrex_prefix + "convert_" ...@@ -59,7 +59,7 @@ convert_func_prefix = pyrex_prefix + "convert_"
closure_scope_prefix = pyrex_prefix + "scope_" closure_scope_prefix = pyrex_prefix + "scope_"
closure_class_prefix = pyrex_prefix + "scope_struct_" closure_class_prefix = pyrex_prefix + "scope_struct_"
lambda_func_prefix = pyrex_prefix + "lambda_" lambda_func_prefix = pyrex_prefix + "lambda_"
module_is_main = pyrex_prefix + "module_is_main_" module_is_main = pyrex_prefix + "module_is_main"
defaults_struct_prefix = pyrex_prefix + "defaults" defaults_struct_prefix = pyrex_prefix + "defaults"
dynamic_args_cname = pyrex_prefix + "dynamic_args" dynamic_args_cname = pyrex_prefix + "dynamic_args"
...@@ -163,8 +163,11 @@ exc_vars = (exc_type_name, exc_value_name, exc_tb_name) ...@@ -163,8 +163,11 @@ exc_vars = (exc_type_name, exc_value_name, exc_tb_name)
api_name = pyrex_prefix + "capi__" api_name = pyrex_prefix + "capi__"
h_guard_prefix = "__PYX_HAVE__" # the h and api guards get changed to:
api_guard_prefix = "__PYX_HAVE_API__" # __PYX_HAVE__FILENAME (for ascii filenames)
# __PYX_HAVE_U_PUNYCODEFILENAME (for non-ascii filenames)
h_guard_prefix = "__PYX_HAVE_"
api_guard_prefix = "__PYX_HAVE_API_"
api_func_guard = "__PYX_HAVE_API_FUNC_" api_func_guard = "__PYX_HAVE_API_FUNC_"
PYX_NAN = "__PYX_NAN()" PYX_NAN = "__PYX_NAN()"
......
...@@ -146,6 +146,16 @@ class EncodedString(_unicode): ...@@ -146,6 +146,16 @@ class EncodedString(_unicode):
s = bytes_literal(self.byteencode(), self.encoding) s = bytes_literal(self.byteencode(), self.encoding)
return s.as_c_string_literal() return s.as_c_string_literal()
if not hasattr(_unicode, "isascii"):
    def isascii(self):
        """Tell whether this string can be encoded as ASCII.

        Only defined when the unicode base type has no ``isascii`` of its
        own (the original note says Python 3.7+ already provides one).
        """
        try:
            self.encode("ascii")
        except UnicodeEncodeError:
            # at least one character lies outside the ASCII range
            return False
        return True
def string_contains_surrogates(ustring): def string_contains_surrogates(ustring):
""" """
...@@ -191,6 +201,11 @@ class BytesLiteral(_bytes): ...@@ -191,6 +201,11 @@ class BytesLiteral(_bytes):
value = split_string_literal(escape_byte_string(self)) value = split_string_literal(escape_byte_string(self))
return '"%s"' % value return '"%s"' % value
if not hasattr(_bytes, "isascii"):
    def isascii(self):
        """Return True if every byte of this literal is in the ASCII range.

        Fallback for byte string base types that lack a built-in
        ``isascii`` (already defined for Python3.7+).  It follows the
        built-in's contract: an empty value gives True and any byte
        >= 0x80 gives False, instead of reporting True unconditionally.
        """
        # bytearray() iterates as integers on both Python 2 and Python 3.
        return all(byte < 0x80 for byte in bytearray(self))
def bytes_literal(s, encoding): def bytes_literal(s, encoding):
assert isinstance(s, bytes) assert isinstance(s, bytes)
...@@ -206,6 +221,12 @@ def encoded_string(s, encoding): ...@@ -206,6 +221,12 @@ def encoded_string(s, encoding):
s.encoding = encoding s.encoding = encoding
return s return s
def encoded_string_or_bytes_literal(s, encoding):
    """Wrap ``s`` with the helper matching its type.

    ``bytes`` input is passed to ``bytes_literal()``; anything else is
    passed to ``encoded_string()``.  ``encoding`` is forwarded unchanged
    and the helper's result is returned.
    """
    wrap = bytes_literal if isinstance(s, bytes) else encoded_string
    return wrap(s, encoding)
char_from_escape_sequence = { char_from_escape_sequence = {
r'\a' : u'\a', r'\a' : u'\a',
......
...@@ -39,6 +39,7 @@ try: ...@@ -39,6 +39,7 @@ try:
from cStringIO import StringIO from cStringIO import StringIO
except ImportError: except ImportError:
from io import StringIO from io import StringIO
import sys
class StringIOTree(object): class StringIOTree(object):
......
...@@ -18,7 +18,7 @@ cdef {{struct_type}} {{funcname}}(obj) except *: ...@@ -18,7 +18,7 @@ cdef {{struct_type}} {{funcname}}(obj) except *:
value = obj['{{member.name}}'] value = obj['{{member.name}}']
except KeyError: except KeyError:
raise ValueError("No value specified for struct attribute '{{member.name}}'") raise ValueError("No value specified for struct attribute '{{member.name}}'")
result.{{member.cname}} = value result.{{member.name}} = value
{{endfor}} {{endfor}}
return result return result
......
...@@ -167,3 +167,19 @@ def test_nested_obj_to_struct(NestedStruct nested): ...@@ -167,3 +167,19 @@ def test_nested_obj_to_struct(NestedStruct nested):
nested.mystruct.s.decode('UTF-8'), nested.mystruct.s.decode('UTF-8'),
nested.d) nested.d)
# Struct whose member is called "x" in Cython code but is declared with the
# explicit C name "not_x", so the two names differ.
cdef struct OverriddenCname:
    int x "not_x"
def test_obj_to_struct_cnames(OverriddenCname s):
    """
    >>> test_obj_to_struct_cnames({ 'x': 1 })
    1
    """
    # The doctest passes a dict keyed by the Cython-level member name 'x';
    # the value must arrive in the struct member declared as "not_x" in C.
    print(s.x)
def test_struct_to_obj_cnames():
    """
    >>> test_struct_to_obj_cnames()
    {'x': 2}
    """
    # Reverse direction: the returned struct is expected (per the doctest) to
    # come back as a dict keyed by the Cython-level name 'x', not by "not_x".
    return OverriddenCname(2)
...@@ -51,10 +51,15 @@ if sys.version_info[0]>2: ...@@ -51,10 +51,15 @@ if sys.version_info[0]>2:
'NormalClassΓΓ.εxciting_function.<locals>.nestεd' 'NormalClassΓΓ.εxciting_function.<locals>.nestεd'
Do kwargs work? Do kwargs work?
>>> unicode_kwarg(αrg=5) >>> unicode_kwarg(αrγ=5)
5 5
>>> unicode_kwarg_from_cy() >>> unicode_kwarg_from_cy()
1 1
Normalization of attributes
(The cdef class version is testable in Python 2 too)
>>> NormalizeAttrPy().get()
5
""" """
else: else:
__doc__ = "" __doc__ = ""
...@@ -169,8 +174,8 @@ cdef class Derived(Γναμε2): ...@@ -169,8 +174,8 @@ cdef class Derived(Γναμε2):
cdef Γναμε2 global_ναμε3 = Γναμε2() cdef Γναμε2 global_ναμε3 = Γναμε2()
def function_taking_fancy_argument(Γναμε2 αrg): def function_taking_fancy_argument(Γναμε2 αrγ):
return αrg return αrγ
class NormalClassΓΓ(Γναμε2): class NormalClassΓΓ(Γναμε2):
""" """
...@@ -190,19 +195,23 @@ class NormalClassΓΓ(Γναμε2): ...@@ -190,19 +195,23 @@ class NormalClassΓΓ(Γναμε2):
pass pass
return nestεd return nestεd
def unicode_kwarg(*,αrg): def unicode_kwarg(*, αrγ):
return αrg return αrγ
def unicode_kwarg_from_cy(): def unicode_kwarg_from_cy():
return unicode_kwarg(αrg=1) return unicode_kwarg(αrγ=1)
cdef class NormalizeAttrCdef: class NormalizeAttrPy:
"""Python normalizes identifier names before they are used; """Python normalizes identifier names before they are used;
therefore and fi should access the same attribute. therefore and fi should access the same attribute"""
A more comprehensive version of this is in "unicode_identifiers_normalize.py" def __init__(self):
comparing the behaviour to Python. The version here shows it self.fi = 5 # note unicode ligature symbol
behaves the same in a cdef class and is tested with Python 2 def get(self):
return self.fi
cdef class NormalizeAttrCdef:
"""Python normalizes identifier names before they are used;
therefore ﬁ and fi should access the same attribute
>>> NormalizeAttrCdef().get() >>> NormalizeAttrCdef().get()
5 5
""" """
......
# -*- coding: utf-8 -*-
# tag: py3, pep489
PYTHON setup.py build_ext --inplace
PYTHON -m mydoctest
########### mydoctest.py #######
import sys
# Driver script: runs the doctests of both importing modules and reports the
# number of failures through the process exit status.
if sys.version_info < (3, 5):
    # The modules are only Cythonized and not built for these versions,
    # so there is nothing to import: don't run the tests.
    # sys.exit() is used because the bare exit() helper is injected by the
    # site module and is not guaranteed to be available.
    sys.exit()

import doctest

import from_py
# testmod() returns (failed, attempted); index 0 is the failure count.
val = doctest.testmod(from_py)[0]

import from_cy
val += doctest.testmod(from_cy)[0]

# Exit status 0 means success; any other value is the number of failures.
sys.exit(val)
########### setup.py ########
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import sys
from Cython.Build import cythonize
# The pyx -> C translation is run on every Python version.
files = ["mymoδ.pyx", "from_cy.pyx"]
modules = cythonize(files)

# The generated C files are only compiled on Python >= 3.5; on Python 2 and
# Python <= 3.4 the script stops after the translation step above.
if sys.version_info[:2] >= (3, 5):
    from distutils.core import setup
    setup(ext_modules=modules)
############ mymoδ.pyx #########
# Python-visible function; the importing test modules expect it to return True.
def f():
    return True

# Empty function declared "public api".
# NOTE(review): presumably present so that header files get generated for a
# module with a non-ASCII name - confirm against the test's intent.
cdef public api void cdef_func():
    pass
############ pxd_moδ.pxd ##########
# Minimal struct with a single int member, declared in a .pxd whose file name
# contains a non-ASCII character.
cdef struct S:
    int x

cdef public api void cdef_func() # just to test generation of headers
############ from_py.py #########
# -*- coding: utf-8 -*-
import mymoδ
from mymoδ import f
# Module doctest: calls f() once through the module attribute and once through
# the name bound by the from-import; both are expected to return True.
__doc__ = """
>>> mymoδ.f()
True
>>> f()
True
"""
######### from_cy.pyx ##########
# -*- coding: utf-8 -*-
import mymoδ
from mymoδ import f
cimport pxd_moδ
from pxd_moδ cimport S
def test_imported():
    """
    >>> test_imported()
    True
    """
    # Calls f() through the module attribute and through the from-import;
    # the result is True only when both calls return a true value.
    return mymoδ.f() and f() # True and True
def test_cimported():
    """
    >>> test_cimported()
    3
    """
    # v1 names the struct type through the cimported module namespace ...
    cdef pxd_moδ.S v1
    v1.x = 1
    # ... while v2 uses the type name cimported directly.
    cdef S v2
    v2.x = 2
    # 1 + 2 == 3, matching the doctest above.
    return v1.x + v2.x
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment