Commit f180a00f authored by Stefan Behnel

fix "Py_UNICODE in ..." against wide unicode literals on narrow Unicode platforms

parent 015b5ef0
...@@ -969,6 +969,23 @@ class UnicodeNode(PyConstNode): ...@@ -969,6 +969,23 @@ class UnicodeNode(PyConstNode):
def can_coerce_to_char_literal(self):
    """Return True exactly when ``self.value`` has a length of one."""
    value_length = len(self.value)
    return value_length == 1
def contains_surrogates(self):
    """Tell whether ``self.value`` needs surrogate pairs on narrow builds.

    Returns True if the string holds a code point above 0xFFFF (as seen
    on a wide/UCS-4 CPython platform) or a lead surrogate code unit (as
    seen on a narrow/UTF-16 platform), i.e. a character that would be
    spelled as two separate code units on a narrow platform.  Returns
    False otherwise.
    """
    for char in self.value:
        code_point = ord(char)
        # Values beyond the BMP can only show up on wide platforms.
        if code_point > 0xFFFF:
            return True
        # Testing for the lead unit (D800-DBFF) of a pair is enough:
        # when it is present, the trail unit (DC00-DFFF) is likely
        # there as well, and a trail unit without a lead unit cannot
        # form a surrogate pair on its own.
        if 0xD800 <= code_point <= 0xDBFF:
            return True
    return False
def generate_evaluation_code(self, code): def generate_evaluation_code(self, code):
self.result_code = code.get_py_string_const(self.value) self.result_code = code.get_py_string_const(self.value)
......
...@@ -600,6 +600,12 @@ class SwitchTransform(Visitor.VisitorTransform): ...@@ -600,6 +600,12 @@ class SwitchTransform(Visitor.VisitorTransform):
not_in = cond.operator == 'not_in' not_in = cond.operator == 'not_in'
if not_in and not allow_not_in: if not_in and not allow_not_in:
return self.NO_MATCH return self.NO_MATCH
if isinstance(cond.operand2, ExprNodes.UnicodeNode) and \
cond.operand2.contains_surrogates():
# dealing with surrogates leads to different
# behaviour on wide and narrow Unicode
# platforms => refuse to optimise this case
return self.NO_MATCH
# this looks somewhat silly, but it does the right # this looks somewhat silly, but it does the right
# checks for NameNode and AttributeNode # checks for NameNode and AttributeNode
if is_common_value(cond.operand1, cond.operand1): if is_common_value(cond.operand1, cond.operand1):
......
...@@ -195,6 +195,34 @@ def m_unicode_literal(Py_UNICODE a): ...@@ -195,6 +195,34 @@ def m_unicode_literal(Py_UNICODE a):
cdef int result = a in u'abc\0defg\u1234\uF8D2' cdef int result = a in u'abc\0defg\u1234\uF8D2'
return result return result
# Character outside the BMP, plus a Python-visible alias for the doctests.
cdef unicode wide_unicode_character = u'\U0010FEDC'
py_wide_unicode_character = wide_unicode_character

# The two UTF-16 surrogate code units (lead, then trail) that spell
# U+10FEDC, each with a Python-visible alias for the doctests.
cdef unicode wide_unicode_character_surrogate1 = u'\uDBFF'
py_wide_unicode_character_surrogate1 = wide_unicode_character_surrogate1
cdef unicode wide_unicode_character_surrogate2 = u'\uDEDC'
py_wide_unicode_character_surrogate2 = wide_unicode_character_surrogate2
@cython.test_fail_if_path_exists("//SwitchStatNode")
@cython.test_assert_path_exists("//PrimaryCmpNode")
def m_wide_unicode_literal(Py_UNICODE a):
    """
    Test ``a in <literal>`` where the literal ends in a character
    outside the BMP.  The decorators assert that the compiled tree
    keeps a PrimaryCmpNode and contains no SwitchStatNode.

    The last example prints two lines on either kind of platform: two
    calls on narrow builds, one call plus a literal ``1`` on wide builds.

    >>> m_wide_unicode_literal(ord('f'))
    1
    >>> m_wide_unicode_literal(ord('X'))
    0
    >>> import sys
    >>> if sys.maxunicode == 65535:
    ...     m_wide_unicode_literal(ord(py_wide_unicode_character_surrogate1))
    ...     m_wide_unicode_literal(ord(py_wide_unicode_character_surrogate2))
    ... else:
    ...     m_wide_unicode_literal(ord(py_wide_unicode_character))
    ...     1
    1
    1
    """
    cdef int result = a in u'abc\0defg\u1234\uF8D2\U0010FEDC'
    return result
@cython.test_assert_path_exists("//SwitchStatNode") @cython.test_assert_path_exists("//SwitchStatNode")
@cython.test_fail_if_path_exists("//BoolBinopNode", "//PrimaryCmpNode") @cython.test_fail_if_path_exists("//BoolBinopNode", "//PrimaryCmpNode")
def conditional_int(int a): def conditional_int(int a):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment