Full support for Py_UNICODE[] literals with non-BMP characters.

85486ea2 · Nikita Nemkin · 3ce78016
Commit 85486ea2 authored Mar 03, 2013 by Nikita Nemkin
4 changed files
--- a/Cython/Compiler/Code.py
+++ b/Cython/Compiler/Code.py
@@ -778,15 +778,6 @@ class StringConst(object):
        self.py_strings[key] = py_string
        return py_string

-class UnicodeConst(object):
-    """Global info about a Py_UNICODE[] constant held by GlobalState.
-    """
-    # cname            string
-    # text             EncodedString (unicode)
-
-    def __init__(self, cname, text):
-        self.cname = cname
-        self.text = text

 class PyStringConst(object):
    """Global info about a Python string constant held by GlobalState.
@@ -1033,8 +1024,7 @@ class GlobalState(object):
        try:
            c = self.unicode_const_index[text]
        except KeyError:
-            c = UnicodeConst(self.new_const_cname(), text)
-            self.unicode_const_index[text] = c
+            c = self.unicode_const_index[text] = self.new_const_cname()
        return c

    def get_py_string_const(self, text, identifier=None,
@@ -1162,9 +1152,16 @@ class GlobalState(object):
                for py_string in c.py_strings.values():
                    py_strings.append((c.cname, len(py_string.cname), py_string))

-        for c in self.unicode_const_index.values():
-            decls_writer.putln('static Py_UNICODE %s[] = { %s };' % (
-                c.cname, StringEncoding.encode_py_unicode_string(c.text)))
+        for c, cname in self.unicode_const_index.items():
+            utf16_array, utf32_array = StringEncoding.encode_py_unicode_string(c)
+            if utf16_array:
+                # Narrow and wide representations differ
+                decls_writer.putln("#if Py_UNICODE_WIDE")
+            decls_writer.putln("static Py_UNICODE %s[] = { %s };" % (cname, utf32_array))
+            if utf16_array:
+                decls_writer.putln("#else")
+                decls_writer.putln("static Py_UNICODE %s[] = { %s };" % (cname, utf16_array))
+                decls_writer.putln("#endif")

        if py_strings:
            self.use_utility_code(UtilityCode.load_cached("InitStrings", "StringTools.c"))
@@ -1461,7 +1458,7 @@ class CCodeWriter(object):
        return self.globalstate.get_string_const(text).cname

    def get_unicode_const(self, text):
-        return self.globalstate.get_unicode_const(text).cname
+        return self.globalstate.get_unicode_const(text)

    def get_py_string_const(self, text, identifier=None,
                            is_str=False, unicode_value=None):

--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -1242,8 +1242,6 @@ class UnicodeNode(ConstNode):
        if self.type.is_pyobject:
            self.result_code = code.get_py_string_const(self.value)
        else:
-            if self.contains_surrogates():
-                warning(self.pos, "Py_UNICODE* literals with characters outside BMP are not portable.", level=1);
            self.result_code = code.get_unicode_const(self.value)

    def calculate_result_code(self):

--- a/Cython/Compiler/StringEncoding.py
+++ b/Cython/Compiler/StringEncoding.py
@@ -267,10 +267,18 @@ def split_string_literal(s, limit=2000):
 def encode_py_unicode_string(s):
    """Create Py_UNICODE[] representation of a given unicode string.
    """
-    # Non-BMP characters will appear as surrogates, which is not compatible with
-    # wide (UTF-32) Python builds. UnicodeNode will warn the user about this.
+    utf32_array = array.array('i', s.encode('UTF-32'))
+    assert utf32_array.itemsize == 4
+    utf32_array.pop(0)     # Remove BOM
+    utf32_array.append(0)  # Add NULL terminator
+
+    for c in utf32_array:
+        if c > 65535:
+            utf16_array = array.array('H', s.encode('UTF-16'))
+            utf16_array.pop(0)     # Remove BOM
+            utf16_array.append(0)  # Add NULL terminator
+            break
+    else:
+        utf16_array = []

-    a = array.array('H', s.encode('UTF-16'))
-    a.pop(0)     # Remove BOM
-    a.append(0)  # Add NULL terminator
-    return u",".join(map(unicode, a))
+    return ",".join(map(unicode, utf16_array)), ",".join(map(unicode, utf32_array))
--- a/tests/run/py_unicode_strings.pyx
+++ b/tests/run/py_unicode_strings.pyx
@@ -19,6 +19,8 @@ cdef Py_UNICODE c_pu_arr[42]
 cdef LPWSTR c_wstr = u"unicode\u1234"
 cdef Py_UNICODE* c_pu_empty = u""
 cdef char* c_empty = ""
+cdef unicode uwide_literal = u'\U00020000\U00020001'
+cdef Py_UNICODE* c_pu_wide_literal = u'\U00020000\U00020001'

 memcpy(c_pu_arr, c_pu_str, sizeof(Py_UNICODE) * (len(uobj) + 1))

@@ -55,6 +57,12 @@ def test_c_to_python():
    assert sizeof(c_pu_arr) == sizeof(Py_UNICODE) * 42
    assert sizeof(c_pu_str) == sizeof(void*)

+    assert c_pu_wide_literal == uwide_literal
+    if sizeof(Py_UNICODE) >= 4:
+        assert len(c_pu_wide_literal) == 2
+    else:
+        assert len(c_pu_wide_literal) == 4
+
    assert u'unicode'
    assert not u''
    assert c_pu_str
@@ -80,5 +88,7 @@ def test_python_to_c():
    u = uobj[1]
    assert Py_UNICODE_equal(<Py_UNICODE*>u"n", u)

+    assert Py_UNICODE_equal(uwide_literal, <Py_UNICODE*>c_pu_wide_literal)
+
    assert len(u"abc\0") == 4
    assert len(<Py_UNICODE*>u"abc\0") == 3