fix "Py_UNICODE in ..." against wide unicode literals on narrow Unicode platforms

f180a00f · Stefan Behnel · 015b5ef0 · f180a00f · f180a00f · f180a00f
Commit f180a00f authored Jul 03, 2010 by Stefan Behnel
Hide whitespace changes
Inline Side-by-side

Showing with 51 additions and 0 deletions

Cython/Compiler/ExprNodes.py Cython/Compiler/ExprNodes.py +17 -0

Cython/Compiler/Optimize.py Cython/Compiler/Optimize.py +6 -0

tests/run/inop.pyx tests/run/inop.pyx +28 -0

No files found.
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -969,6 +969,23 @@ class UnicodeNode(PyConstNode):
    def can_coerce_to_char_literal(self):
        # Only a string of exactly one character qualifies.
        is_single_char = len(self.value) == 1
        return is_single_char
+    def contains_surrogates(self):
+        # Report whether self.value holds a code point above U+FFFF
+        # or a leading surrogate code unit, i.e. a character that
+        # is spelled as two separate code units on a narrow (UTF-16)
+        # CPython build but as one on a wide (UCS-4) build.
+        for char in self.value:
+            code_point = ord(char)
+            # Values above 0xFFFF can only be seen on wide platforms.
+            if code_point > 0xFFFF:
+                return True
+            # Testing for the leading code unit (D800-DBFF) is
+            # enough: when it is present, the trailing unit
+            # (DC00-DFFF) is likely present as well, and a trailing
+            # unit without it cannot form a surrogate pair by itself.
+            if 0xD800 <= code_point <= 0xDBFF:
+                return True
+        return False
    def generate_evaluation_code(self, code):
        self.result_code = code.get_py_string_const(self.value)

--- a/Cython/Compiler/Optimize.py
+++ b/Cython/Compiler/Optimize.py
@@ -600,6 +600,12 @@ class SwitchTransform(Visitor.VisitorTransform):
                    not_in = cond.operator == 'not_in'
                    if not_in and not allow_not_in:
                        return self.NO_MATCH
+                    if isinstance(cond.operand2, ExprNodes.UnicodeNode) and \
+                           cond.operand2.contains_surrogates():
+                        # dealing with surrogates leads to different
+                        # behaviour on wide and narrow Unicode
+                        # platforms => refuse to optimise this case
+                        return self.NO_MATCH
                    # this looks somewhat silly, but it does the right
                    # checks for NameNode and AttributeNode
                    if is_common_value(cond.operand1, cond.operand1):

--- a/tests/run/inop.pyx
+++ b/tests/run/inop.pyx
@@ -195,6 +195,34 @@ def m_unicode_literal(Py_UNICODE a):
    cdef int result = a in u'abc\0defg\u1234\uF8D2'
    return result
+# A character outside the BMP, plus a Python-visible alias for the doctests.
+cdef unicode wide_unicode_character = u'\U0010FEDC'
+py_wide_unicode_character = wide_unicode_character
+# Leading code unit of the UTF-16 surrogate pair for U+10FEDC.
+cdef unicode wide_unicode_character_surrogate1 = u'\uDBFF'
+py_wide_unicode_character_surrogate1 = wide_unicode_character_surrogate1
+# Trailing code unit of the UTF-16 surrogate pair for U+10FEDC.
+cdef unicode wide_unicode_character_surrogate2 = u'\uDEDC'
+py_wide_unicode_character_surrogate2 = wide_unicode_character_surrogate2
+# The literal below contains a character above U+FFFF, so the tree
+# assertions require that the 'in' test stays a PrimaryCmpNode and is
+# not turned into a SwitchStatNode.
+@cython.test_fail_if_path_exists("//SwitchStatNode")
+@cython.test_assert_path_exists("//PrimaryCmpNode")
+def m_wide_unicode_literal(Py_UNICODE a):
+    """
+    >>> m_wide_unicode_literal(ord('f'))
+    1
+    >>> m_wide_unicode_literal(ord('X'))
+    0
+    >>> import sys
+    >>> if sys.maxunicode == 65535:
+    ...     m_wide_unicode_literal(ord(py_wide_unicode_character_surrogate1))
+    ...     m_wide_unicode_literal(ord(py_wide_unicode_character_surrogate2))
+    ... else:
+    ...     m_wide_unicode_literal(ord(py_wide_unicode_character))
+    ...     1
+    1
+    1
+    """
+    cdef int result = a in u'abc\0defg\u1234\uF8D2\U0010FEDC'
+    return result
 @cython.test_assert_path_exists("//SwitchStatNode")
 @cython.test_fail_if_path_exists("//BoolBinopNode", "//PrimaryCmpNode")
 def conditional_int(int a):