Merge pull request #191 from nnemkin/py_unicode_strings

Py_UNICODE* string support

Merge pull request #191 from nnemkin/py_unicode_strings
Py_UNICODE* string support
d61f929f · scoder · e6826689 · e351aa7d · d61f929f · d61f929f
Commit d61f929f authored Mar 05, 2013 by scoder
11 changed files
--- a/Cython/Compiler/Code.py
+++ b/Cython/Compiler/Code.py
@@ -873,6 +873,7 @@ class GlobalState(object):

        self.const_cname_counter = 1
        self.string_const_index = {}
+        self.pyunicode_ptr_const_index = {}
        self.int_const_index = {}
        self.py_constants = []

@@ -1016,6 +1017,15 @@ class GlobalState(object):
        c.add_py_version(py_version)
        return c

+    def get_pyunicode_ptr_const(self, text):
+        """Return the C name of the Py_UNICODE[] constant for 'text'.
+
+        The name is allocated on first request and cached in
+        'pyunicode_ptr_const_index', so equal strings map to the same name.
+        """
+        assert text.is_unicode
+        index = self.pyunicode_ptr_const_index
+        if text not in index:
+            index[text] = self.new_const_cname()
+        return index[text]
+
    def get_py_string_const(self, text, identifier=None,
                            is_str=False, unicode_value=None):
        # return a Python string constant, creating a new one if necessary
@@ -1141,6 +1151,17 @@ class GlobalState(object):
                for py_string in c.py_strings.values():
                    py_strings.append((c.cname, len(py_string.cname), py_string))

+        for c, cname in self.pyunicode_ptr_const_index.items():
+            utf16_array, utf32_array = StringEncoding.encode_pyunicode_string(c)
+            if utf16_array:
+                # Narrow and wide representations differ
+                decls_writer.putln("#ifdef Py_UNICODE_WIDE")
+            decls_writer.putln("static Py_UNICODE %s[] = { %s };" % (cname, utf32_array))
+            if utf16_array:
+                decls_writer.putln("#else")
+                decls_writer.putln("static Py_UNICODE %s[] = { %s };" % (cname, utf16_array))
+                decls_writer.putln("#endif")
+
        if py_strings:
            self.use_utility_code(UtilityCode.load_cached("InitStrings", "StringTools.c"))
            py_strings.sort()
@@ -1435,6 +1456,9 @@ class CCodeWriter(object):
    def get_string_const(self, text):
        return self.globalstate.get_string_const(text).cname

+    def get_pyunicode_ptr_const(self, text):
+        # Forward to the shared global state and hand back whatever it returns.
+        global_state = self.globalstate
+        return global_state.get_pyunicode_ptr_const(text)
+
    def get_py_string_const(self, text, identifier=None,
                            is_str=False, unicode_value=None):
        return self.globalstate.get_py_string_const(

--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -63,14 +63,16 @@ coercion_error_dict = {
    # string related errors
    (Builtin.unicode_type, Builtin.bytes_type) : "Cannot convert Unicode string to 'bytes' implicitly, encoding required.",
    (Builtin.unicode_type, Builtin.str_type)   : "Cannot convert Unicode string to 'str' implicitly. This is not portable and requires explicit encoding.",
-    (Builtin.unicode_type, PyrexTypes.c_char_ptr_type) : "Unicode objects do not support coercion to C types.",
-    (Builtin.unicode_type, PyrexTypes.c_uchar_ptr_type) : "Unicode objects do not support coercion to C types.",
+    (Builtin.unicode_type, PyrexTypes.c_char_ptr_type) : "Unicode objects only support coercion to Py_UNICODE*.",
+    (Builtin.unicode_type, PyrexTypes.c_uchar_ptr_type) : "Unicode objects only support coercion to Py_UNICODE*.",
    (Builtin.bytes_type, Builtin.unicode_type) : "Cannot convert 'bytes' object to unicode implicitly, decoding required",
    (Builtin.bytes_type, Builtin.str_type) : "Cannot convert 'bytes' object to str implicitly. This is not portable to Py3.",
+    (Builtin.bytes_type, PyrexTypes.c_py_unicode_ptr_type) : "Cannot convert 'bytes' object to Py_UNICODE*, use 'unicode'.",
    (Builtin.str_type, Builtin.unicode_type) : "str objects do not support coercion to unicode, use a unicode string literal instead (u'')",
    (Builtin.str_type, Builtin.bytes_type) : "Cannot convert 'str' to 'bytes' implicitly. This is not portable.",
    (Builtin.str_type, PyrexTypes.c_char_ptr_type) : "'str' objects do not support coercion to C types (use 'bytes'?).",
    (Builtin.str_type, PyrexTypes.c_uchar_ptr_type) : "'str' objects do not support coercion to C types (use 'bytes'?).",
+    (Builtin.str_type, PyrexTypes.c_py_unicode_ptr_type) : "'str' objects do not support coercion to C types (use 'unicode'?).",
    (PyrexTypes.c_char_ptr_type, Builtin.unicode_type) : "Cannot convert 'char*' to unicode implicitly, decoding required",
    (PyrexTypes.c_uchar_ptr_type, Builtin.unicode_type) : "Cannot convert 'char*' to unicode implicitly, decoding required",
 }
@@ -1171,8 +1173,8 @@ class BytesNode(ConstNode):
        return self.result_code


-class UnicodeNode(PyConstNode):
-    # A Python unicode object
+class UnicodeNode(ConstNode):
+    # A Py_UNICODE* or unicode literal
    #
    # value        EncodedString
    # bytes_value  BytesLiteral    the literal parsed as bytes string ('-3' unicode literals only)
@@ -1213,7 +1215,11 @@ class UnicodeNode(PyConstNode):
            if dst_type.is_string and self.bytes_value is not None:
                # special case: '-3' enforced unicode literal used in a C char* context
                return BytesNode(self.pos, value=self.bytes_value).coerce_to(dst_type, env)
-            error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE or Py_UCS4.")
+            if dst_type.is_pyunicode_ptr:
+                node = UnicodeNode(self.pos, value=self.value)
+                node.type = dst_type
+                return node
+            error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE/Py_UCS4 (for characters) or Py_UNICODE* (for strings).")
        elif dst_type is not py_object_type:
            if not self.check_for_coercion_error(dst_type, env):
                self.fail_assignment(dst_type)
@@ -1225,11 +1231,18 @@ class UnicodeNode(PyConstNode):
            ##     and (0xD800 <= self.value[0] <= 0xDBFF)
            ##     and (0xDC00 <= self.value[1] <= 0xDFFF))

+    def coerce_to_boolean(self, env):
+        # The truth value of the literal is known at compile time, so
+        # replace the node by a constant BoolNode at the same position.
+        is_true = bool(self.value)
+        return BoolNode(self.pos, value=is_true, constant_result=is_true)
+
    def contains_surrogates(self):
        return _string_contains_surrogates(self.value)

    def generate_evaluation_code(self, code):
-        self.result_code = code.get_py_string_const(self.value)
+        if self.type.is_pyobject:
+            self.result_code = code.get_py_string_const(self.value)
+        else:
+            self.result_code = code.get_pyunicode_ptr_const(self.value)

    def calculate_result_code(self):
        return self.result_code
@@ -2633,6 +2646,9 @@ class IndexNode(ExprNode):
            if base_type.is_string:
                # sliced C strings must coerce to Python
                return bytes_type
+            elif base_type.is_pyunicode_ptr:
+                # sliced Py_UNICODE* strings must coerce to Python
+                return unicode_type
            elif base_type in (unicode_type, bytes_type, str_type, list_type, tuple_type):
                # slicing these returns the same type
                return base_type
@@ -3446,6 +3462,8 @@ class SliceIndexNode(ExprNode):
        base_type = self.base.infer_type(env)
        if base_type.is_string or base_type.is_cpp_class:
            return bytes_type
+        elif base_type.is_pyunicode_ptr:
+            return unicode_type
        elif base_type in (bytes_type, str_type, unicode_type,
                           list_type, tuple_type):
            return base_type
@@ -3510,6 +3528,8 @@ class SliceIndexNode(ExprNode):
        base_type = self.base.type
        if base_type.is_string or base_type.is_cpp_string:
            self.type = default_str_type(env)
+        elif base_type.is_pyunicode_ptr:
+            self.type = unicode_type
        elif base_type.is_ptr:
            self.type = base_type
        elif base_type.is_array:
@@ -3578,6 +3598,27 @@ class SliceIndexNode(ExprNode):
                        stop_code,
                        start_code,
                        code.error_goto_if_null(result, self.pos)))
+        elif self.base.type.is_pyunicode_ptr:
+            base_result = self.base.result()
+            if self.base.type != PyrexTypes.c_py_unicode_ptr_type:
+                base_result = '((const Py_UNICODE*)%s)' % base_result
+            if self.stop is None:
+                code.putln(
+                    "%s = __Pyx_PyUnicode_FromUnicode(%s + %s); %s" % (
+                        result,
+                        base_result,
+                        start_code,
+                        code.error_goto_if_null(result, self.pos)))
+            else:
+                code.putln(
+                    "%s = __Pyx_PyUnicode_FromUnicodeAndLength(%s + %s, %s - %s); %s" % (
+                        result,
+                        base_result,
+                        start_code,
+                        stop_code,
+                        start_code,
+                        code.error_goto_if_null(result, self.pos)))
+
        elif self.base.type is unicode_type:
            code.globalstate.use_utility_code( 
                          UtilityCode.load_cached("PyUnicode_Substring", "StringTools.c")) 
@@ -4903,11 +4944,11 @@ class AttributeNode(ExprNode):
        self.is_py_attr = 0
        self.member = self.attribute
        if obj_type is None:
-            if self.obj.type.is_string:
+            if self.obj.type.is_string or self.obj.type.is_pyunicode_ptr:
                self.obj = self.obj.coerce_to_pyobject(env)
            obj_type = self.obj.type
        else:
-            if obj_type.is_string:
+            if obj_type.is_string or obj_type.is_pyunicode_ptr:
                obj_type = py_object_type
        if obj_type.is_ptr or obj_type.is_array:
            obj_type = obj_type.base_type
@@ -8334,8 +8375,12 @@ class BinopNode(ExprNode):
        if self.is_py_operation_types(type1, type2):
            if type2.is_string:
                type2 = Builtin.bytes_type
+            elif type2.is_pyunicode_ptr:
+                type2 = Builtin.unicode_type
            if type1.is_string:
                type1 = Builtin.bytes_type
+            elif type1.is_pyunicode_ptr:
+                type1 = Builtin.unicode_type
            elif self.operator == '%' \
                     and type1 in (Builtin.str_type, Builtin.unicode_type):
                # note that  b'%s' % b'abc'  doesn't work in Py3
@@ -8584,7 +8629,7 @@ class AddNode(NumBinopNode):
    #  '+' operator.

    def is_py_operation_types(self, type1, type2):
-        if type1.is_string and type2.is_string:
+        if type1.is_string and type2.is_string or type1.is_pyunicode_ptr and type2.is_pyunicode_ptr:
            return 1
        else:
            return NumBinopNode.is_py_operation_types(self, type1, type2)
@@ -9947,7 +9992,7 @@ class CoerceToPyTypeNode(CoercionNode):
            # be specific about some known types
            if arg.type.is_string or arg.type.is_cpp_string:
                self.type = default_str_type(env)
-            elif arg.type.is_unicode_char:
+            elif arg.type.is_pyunicode_ptr or arg.type.is_unicode_char:
                self.type = unicode_type
            elif arg.type.is_complex:
                self.type = Builtin.complex_type
@@ -10062,13 +10107,13 @@ class CoerceFromPyTypeNode(CoercionNode):
        if not result_type.create_from_py_utility_code(env):
            error(arg.pos,
                  "Cannot convert Python object to '%s'" % result_type)
-        if self.type.is_string:
+        if self.type.is_string or self.type.is_pyunicode_ptr:
            if self.arg.is_ephemeral():
                error(arg.pos,
-                      "Obtaining char* from temporary Python value")
+                      "Obtaining '%s' from temporary Python value" % result_type)
            elif self.arg.is_name and self.arg.entry and self.arg.entry.is_pyglobal:
                warning(arg.pos,
-                        "Obtaining char* from externally modifiable global Python value",
+                        "Obtaining '%s' from externally modifiable global Python value" % result_type,
                        level=1)

    def analyse_types(self, env):

--- a/Cython/Compiler/Optimize.py
+++ b/Cython/Compiler/Optimize.py
@@ -1977,6 +1977,11 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform):
            PyrexTypes.CFuncTypeArg("bytes", PyrexTypes.c_char_ptr_type, None)
            ])

+    # C function type "size_t f(Py_UNICODE* unicode)"; passed along with the
+    # name "__Pyx_Py_UNICODE_strlen" when len() of a Py_UNICODE* is optimised.
+    Pyx_Py_UNICODE_strlen_func_type = PyrexTypes.CFuncType(
+        PyrexTypes.c_size_t_type, [
+            PyrexTypes.CFuncTypeArg("unicode", PyrexTypes.c_py_unicode_ptr_type, None)
+            ])
+
    PyObject_Size_func_type = PyrexTypes.CFuncType(
        PyrexTypes.c_py_ssize_t_type, [
            PyrexTypes.CFuncTypeArg("obj", PyrexTypes.py_object_type, None)
@@ -1996,7 +2001,8 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform):
    _ext_types_with_pysize = set(["cpython.array.array"])

    def _handle_simple_function_len(self, node, pos_args):
-        """Replace len(char*) by the equivalent call to strlen() and
+        """Replace len(char*) by the equivalent call to strlen(),
+        len(Py_UNICODE*) by the equivalent Py_UNICODE_strlen() and
        len(known_builtin_type) by an equivalent C-API call.
        """
        if len(pos_args) != 1:
@@ -2011,6 +2017,11 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform):
                args = [arg],
                is_temp = node.is_temp,
                utility_code = UtilityCode.load_cached("IncludeStringH", "StringTools.c"))
+        elif arg.type.is_pyunicode_ptr:
+            new_node = ExprNodes.PythonCapiCallNode(
+                node.pos, "__Pyx_Py_UNICODE_strlen", self.Pyx_Py_UNICODE_strlen_func_type,
+                args = [arg],
+                is_temp = node.is_temp)
        elif arg.type.is_pyobject:
            cfunc_name = self._map_to_capi_len_function(arg.type)
            if cfunc_name is None:

--- a/Cython/Compiler/PyrexTypes.py
+++ b/Cython/Compiler/PyrexTypes.py
@@ -145,6 +145,7 @@ class PyrexType(BaseType):
    #  is_enum               boolean     Is a C enum type
    #  is_typedef            boolean     Is a typedef type
    #  is_string             boolean     Is a C char * type
+    #  is_pyunicode_ptr      boolean     Is a C Py_UNICODE * type
    #  is_cpp_string         boolean     Is a C++ std::string type
    #  is_unicode_char       boolean     Is either Py_UCS4 or Py_UNICODE
    #  is_returncode         boolean     Is used only to signal exceptions
@@ -202,6 +203,7 @@ class PyrexType(BaseType):
    is_enum = 0
    is_typedef = 0
    is_string = 0
+    is_pyunicode_ptr = 0
    is_unicode_char = 0
    is_returncode = 0
    is_error = 0
@@ -873,7 +875,7 @@ class PyObjectType(PyrexType):

    def assignable_from(self, src_type):
        # except for pointers, conversion will be attempted
-        return not src_type.is_ptr or src_type.is_string
+        return not src_type.is_ptr or src_type.is_string or src_type.is_pyunicode_ptr

    def declaration_code(self, entity_code,
            for_display = 0, dll_linkage = None, pyrex = 0):
@@ -1163,7 +1165,7 @@ class CType(PyrexType):

    def error_condition(self, result_code):
        conds = []
-        if self.is_string:
+        if self.is_string or self.is_pyunicode_ptr:
            conds.append("(!%s)" % result_code)
        elif self.exception_value is not None:
            conds.append("(%s == (%s)%s)" % (result_code, self.sign_and_name(), self.exception_value))
@@ -2180,6 +2182,9 @@ class CPointerBaseType(CType):
            if base_type.same_as(char_type):
                self.is_string = 1
                break
+        else:
+            if base_type.same_as(c_py_unicode_type):
+                self.is_pyunicode_ptr = 1

        if self.is_string and not base_type.is_error:
            if base_type.signed:
@@ -2191,10 +2196,17 @@ class CPointerBaseType(CType):
                if self.is_ptr:
                    self.from_py_function = "__Pyx_PyObject_AsUString"
            self.exception_value = "NULL"
+        elif self.is_pyunicode_ptr and not base_type.is_error:
+            self.to_py_function = "__Pyx_PyUnicode_FromUnicode"
+            if self.is_ptr:
+                self.from_py_function = "__Pyx_PyUnicode_AsUnicode"
+            self.exception_value = "NULL"

    def py_type_name(self):
        if self.is_string:
            return "bytes"
+        elif self.is_pyunicode_ptr:
+            return "unicode"
        else:
            return super(CPointerBaseType, self).py_type_name()


--- a/Cython/Compiler/StringEncoding.py
+++ b/Cython/Compiler/StringEncoding.py
@@ -4,6 +4,7 @@

 import re
 import sys
+import array

 if sys.version_info[0] >= 3:
    _unicode, _str, _bytes = str, str, bytes
@@ -262,3 +263,22 @@ def split_string_literal(s, limit=2000):
            chunks.append(s[start:end])
            start = end
        return '""'.join(chunks)
+
+def encode_pyunicode_string(s):
+    """Create Py_UNICODE[] representations of a given unicode string.
+
+    Returns a ``(utf16, utf32)`` pair of strings, each a comma-separated
+    list of decimal code units including the terminating 0.  ``utf16`` is
+    the empty string unless ``s`` contains a character above U+FFFF, i.e.
+    unless the narrow (UTF-16) and wide (UTF-32) arrays actually differ.
+    """
+    import struct
+
+    def code_units(encoding, fmt, width):
+        # An explicit little-endian codec emits no BOM, and '<' selects the
+        # standard struct sizes, so the result does not depend on the
+        # platform's C integer sizes or byte order.
+        data = s.encode(encoding)
+        units = list(struct.unpack('<%d%s' % (len(data) // width, fmt), data))
+        units.append(0)  # NULL terminator
+        return units
+
+    utf32_units = code_units('UTF-32LE', 'I', 4)
+    if max(utf32_units) > 65535:
+        # At least one character needs a surrogate pair in UTF-16.
+        utf16_units = code_units('UTF-16LE', 'H', 2)
+    else:
+        utf16_units = []
+
+    # str() rather than the Py2-only unicode() builtin: the items are plain
+    # ints, so the joined text is pure ASCII on both Python 2 and 3.
+    return ",".join(map(str, utf16_units)), ",".join(map(str, utf32_units))
--- a/Cython/Utility/TypeConversion.c
+++ b/Cython/Utility/TypeConversion.c
@@ -24,6 +24,21 @@ static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(char*);
 #define __Pyx_PyStr_FromUString(s)     __Pyx_PyStr_FromString((char*)s)
 #define __Pyx_PyUnicode_FromUString(s) __Pyx_PyUnicode_FromString((char*)s)

+#if PY_MAJOR_VERSION < 3
+/* Number of Py_UNICODE units before the terminating 0 of 'u'. */
+static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u)
+{
+    const Py_UNICODE *p = u;
+    while (*p != 0) {
+        ++p;
+    }
+    return (size_t)(p - u);
+}
+#else
+/* On Python 3 the name maps directly onto Py_UNICODE_strlen(). */
+#define __Pyx_Py_UNICODE_strlen Py_UNICODE_strlen
+#endif
+
+#define __Pyx_PyUnicode_FromUnicode(u)       PyUnicode_FromUnicode(u, __Pyx_Py_UNICODE_strlen(u))
+#define __Pyx_PyUnicode_FromUnicodeAndLength PyUnicode_FromUnicode
+#define __Pyx_PyUnicode_AsUnicode            PyUnicode_AsUnicode
+
 #define __Pyx_Owned_Py_None(b) (Py_INCREF(Py_None), Py_None)
 #define __Pyx_PyBool_FromLong(b) ((b) ? (Py_INCREF(Py_True), Py_True) : (Py_INCREF(Py_False), Py_False))
 static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*);

--- a/docs/src/tutorial/strings.rst
+++ b/docs/src/tutorial/strings.rst
@@ -546,3 +546,56 @@ code will run in plain C code, (actually using a switch statement)::

 Combined with the looping optimisation above, this can result in very
 efficient character switching code, e.g. in unicode parsers.
+
+Windows and wide character APIs
+-------------------------------
+
+Windows system APIs natively support Unicode in the form of
+zero-terminated UTF-16 encoded :c:type:`wchar_t*` strings, so called
+"wide strings".
+
+By default, Windows builds of CPython define :c:type:`Py_UNICODE` as
+a synonym for :c:type:`wchar_t`. This makes the internal ``unicode``
+representation compatible with UTF-16 and allows for efficient zero-copy
+conversions. This also means that Windows builds are always
+`Narrow Unicode builds`_ with all the caveats that entails.
+
+To aid interoperation with Windows APIs, Cython 0.19 supports wide
+strings (in the form of :c:type:`Py_UNICODE*`) and implicitly converts
+them to and from ``unicode`` string objects.  These conversions behave the
+same way as they do for :c:type:`char*` and ``bytes`` as described in
+`Passing byte strings`_.
+
+In addition to automatic conversion, unicode literals that appear
+in a C context become C-level wide string literals, and the built-in
+:py:func:`len` function is specialized to compute the length of a
+zero-terminated :c:type:`Py_UNICODE*` string or array.
+
+Here is an example of how one would call a Unicode API on Windows::
+
+    cdef extern from "Windows.h":
+
+        ctypedef Py_UNICODE WCHAR
+        ctypedef const WCHAR* LPCWSTR
+        ctypedef void* HWND
+
+        int MessageBoxW(HWND hWnd, LPCWSTR lpText, LPCWSTR lpCaption, int uType) 
+
+    title = u"Windows Interop Demo - Python %d.%d.%d" % sys.version_info[:3]
+    MessageBoxW(NULL, u"Hello Cython \u263a", title, 0)
+
+.. Warning::
+
+    The use of :c:type:`Py_UNICODE*` strings outside of Windows is
+    strongly discouraged. :c:type:`Py_UNICODE` is inherently not
+    portable between different platforms and Python versions.
+
+    CPython 3.3 has moved to a flexible internal representation of
+    unicode strings (:pep:`393`), making all :c:type:`Py_UNICODE` related
+    APIs deprecated and inefficient.
+
+One consequence of the CPython 3.3 changes is that :py:func:`len` of
+``unicode`` strings is always measured in *code points* ("characters"),
+while Windows APIs expect the number of UTF-16 *code units*
+(where each surrogate is counted individually). To always get the number
+of code units, call :c:func:`PyUnicode_GetSize` directly.
--- a/tests/errors/charptr_from_temp.pyx
+++ b/tests/errors/charptr_from_temp.pyx
 # mode: error
-# tag: werror, charptr, conversion, temp
+# tag: werror, charptr, conversion, temp, py_unicode_strings

 cdef bytes c_s = b"abc"
 s = b"abc"
@@ -18,7 +18,28 @@ cptr = s
 # temp => error
 cptr = s + b"cba"

+
+cdef unicode  c_u = u"abc"
+u = u"abc"
+
+cdef Py_UNICODE* cuptr
+
+# constant => ok
+cuptr = u"xyz"
+
+# global cdef variable => ok
+cuptr = c_u
+
+# pyglobal => warning
+cuptr = u
+
+# temp => error
+cuptr = u + u"cba"
+
+
 _ERRORS = """
-16:8: Obtaining char* from externally modifiable global Python value
-19:9: Obtaining char* from temporary Python value
+16:8: Obtaining 'char *' from externally modifiable global Python value
+19:9: Obtaining 'char *' from temporary Python value
+34:9: Obtaining 'Py_UNICODE *' from externally modifiable global Python value
+37:10: Obtaining 'Py_UNICODE *' from temporary Python value
 """
--- a/tests/errors/e_strcoerce.pyx
+++ b/tests/errors/e_strcoerce.pyx
@@ -15,5 +15,5 @@ _ERRORS = """
 4:14: Only single-character string literals can be coerced into ints.
 5:14: Only single-character string literals can be coerced into ints.
 8:15: Only single-character string literals can be coerced into ints.
-11:14: Unicode literals do not support coercion to C types other than Py_UNICODE or Py_UCS4.
+11:14: Unicode literals do not support coercion to C types other than Py_UNICODE/Py_UCS4 (for characters) or Py_UNICODE* (for strings).
 """
--- a/tests/errors/string_assignments.pyx
+++ b/tests/errors/string_assignments.pyx
 # mode: error
 # coding: ASCII
+# tag: py_unicode_strings

 # ok:
 cdef char* c1   =  "abc"
 cdef str s1     =  "abc"

 cdef unicode u1 = u"abc"
+cdef Py_UNICODE* cu1 = u1

 cdef bytes b1 = b"abc"
 cdef char* c2 = b"abc"
@@ -21,12 +23,18 @@ o4 = c1
 o5 = b1
 o6 = s1
 o7 = u1
+o8 = cu1

 # errors:
 cdef char* c_f1   = u"abc"
 cdef char* c_f2   = u1
 cdef char* c_f3   = s1

+cdef Py_UNICODE* cu_f1 = c1
+cdef Py_UNICODE* cu_f2 = b1
+cdef Py_UNICODE* cu_f3 = s1
+cdef Py_UNICODE* cu_f4 = b"abc"
+
 cdef bytes b_f1   = u"abc"
 cdef bytes b_f2   = u1
 cdef bytes b_f3   = s1
@@ -56,31 +64,36 @@ print <unicode>c1
 print <unicode>c1[1:2]

 _ERRORS = u"""
-26:20: Unicode literals do not support coercion to C types other than Py_UNICODE or Py_UCS4.
-27:22: Unicode objects do not support coercion to C types.
-28:22: 'str' objects do not support coercion to C types (use 'bytes'?).
-
-30:20: Cannot convert Unicode string to 'bytes' implicitly, encoding required.
-31:22: Cannot convert Unicode string to 'bytes' implicitly, encoding required.
-32:22: Cannot convert 'str' to 'bytes' implicitly. This is not portable.
-
-34:17: Cannot convert 'bytes' object to str implicitly. This is not portable to Py3.
-35:19: Cannot convert 'bytes' object to str implicitly. This is not portable to Py3.
-36:17: Cannot convert Unicode string to 'str' implicitly. This is not portable and requires explicit encoding.
-37:19: Cannot convert Unicode string to 'str' implicitly. This is not portable and requires explicit encoding.
-
-39:20: str objects do not support coercion to unicode, use a unicode string literal instead (u'')
-40:22: str objects do not support coercion to unicode, use a unicode string literal instead (u'')
-41:20: Cannot convert 'bytes' object to unicode implicitly, decoding required
-42:22: Cannot convert 'bytes' object to unicode implicitly, decoding required
-43:22: Cannot convert 'char*' to unicode implicitly, decoding required
-
-45:19: Cannot assign type 'str object' to 'tuple object'
-46:18: Cannot assign type 'unicode object' to 'tuple object'
-47:18: Cannot assign type 'bytes object' to 'tuple object'
-
-53:13: default encoding required for conversion from 'char *' to 'str object'
-54:13: default encoding required for conversion from 'char *' to 'str object'
-55:17: Cannot convert 'char*' to unicode implicitly, decoding required
-56:17: default encoding required for conversion from 'char *' to 'unicode object'
+29:20: Unicode literals do not support coercion to C types other than Py_UNICODE/Py_UCS4 (for characters) or Py_UNICODE* (for strings).
+30:22: Unicode objects only support coercion to Py_UNICODE*.
+31:22: 'str' objects do not support coercion to C types (use 'bytes'?).
+
+33:27: Cannot assign type 'char *' to 'Py_UNICODE *'
+34:27: Cannot convert 'bytes' object to Py_UNICODE*, use 'unicode'.
+35:27: 'str' objects do not support coercion to C types (use 'unicode'?).
+36:25: Cannot convert 'bytes' object to Py_UNICODE*, use 'unicode'.
+
+38:20: Cannot convert Unicode string to 'bytes' implicitly, encoding required.
+39:22: Cannot convert Unicode string to 'bytes' implicitly, encoding required.
+40:22: Cannot convert 'str' to 'bytes' implicitly. This is not portable.
+
+42:17: Cannot convert 'bytes' object to str implicitly. This is not portable to Py3.
+43:19: Cannot convert 'bytes' object to str implicitly. This is not portable to Py3.
+44:17: Cannot convert Unicode string to 'str' implicitly. This is not portable and requires explicit encoding.
+45:19: Cannot convert Unicode string to 'str' implicitly. This is not portable and requires explicit encoding.
+
+47:20: str objects do not support coercion to unicode, use a unicode string literal instead (u'')
+48:22: str objects do not support coercion to unicode, use a unicode string literal instead (u'')
+49:20: Cannot convert 'bytes' object to unicode implicitly, decoding required
+50:22: Cannot convert 'bytes' object to unicode implicitly, decoding required
+51:22: Cannot convert 'char*' to unicode implicitly, decoding required
+
+53:19: Cannot assign type 'str object' to 'tuple object'
+54:18: Cannot assign type 'unicode object' to 'tuple object'
+55:18: Cannot assign type 'bytes object' to 'tuple object'
+
+61:13: default encoding required for conversion from 'char *' to 'str object'
+62:13: default encoding required for conversion from 'char *' to 'str object'
+63:17: Cannot convert 'char*' to unicode implicitly, decoding required
+64:17: default encoding required for conversion from 'char *' to 'unicode object'
 """
--- a/tests/run/py_unicode_strings.pyx
+++ b/tests/run/py_unicode_strings.pyx
+# tag: py_unicode_strings
+
+import sys
+
+cimport cython
+from libc.string cimport memcpy, strcpy
+
+cdef bint Py_UNICODE_equal(const Py_UNICODE* u1, const Py_UNICODE* u2):
+    # Walk both zero-terminated strings in lockstep while the current units
+    # match and u1 has not ended.  (A non-zero u1[0] that equals u2[0]
+    # implies u2[0] is non-zero as well, so u2 needs no separate end check.)
+    while u1[0] != 0 and u1[0] == u2[0]:
+        u1 += 1
+        u2 += 1
+    # Equal only if both stopped on the same unit, i.e. on both terminators.
+    return u1[0] == u2[0]
+
+
+ctypedef Py_UNICODE* LPWSTR
+
+cdef unicode uobj = u'unicode\u1234'
+cdef unicode uobj1 = u'u'
+cdef Py_UNICODE* c_pu_str = u"unicode\u1234"
+cdef Py_UNICODE c_pu_arr[42]
+cdef LPWSTR c_wstr = u"unicode\u1234"
+cdef Py_UNICODE* c_pu_empty = u""
+cdef char* c_empty = ""
+cdef unicode uwide_literal = u'\U00020000\U00020001'
+cdef Py_UNICODE* c_pu_wide_literal = u'\U00020000\U00020001'
+
+memcpy(c_pu_arr, c_pu_str, sizeof(Py_UNICODE) * (len(uobj) + 1))
+
+
+def test_c_to_python():
+    """
+    >>> test_c_to_python()
+    """
+    # Py_UNICODE[] array, Py_UNICODE* pointer and LPWSTR typedef compared
+    # against a unicode object.
+    assert c_pu_arr == uobj
+    assert c_pu_str == uobj
+    assert c_wstr == uobj
+
+    # Slices with only a start index.
+    assert c_pu_arr[1:] == uobj[1:]
+    assert c_pu_str[1:] == uobj[1:]
+    assert c_wstr[1:] == uobj[1:]
+
+    # Slices with only a stop index.
+    assert c_pu_arr[:1] == uobj[:1]
+    # NOTE(review): the next assertion repeats the previous one verbatim.
+    assert c_pu_arr[:1] == uobj[:1]
+    assert c_pu_str[:1] == uobj[:1]
+    assert c_wstr[:1] == uobj[:1]
+
+    # Slices with both bounds.
+    assert c_pu_arr[1:7] == uobj[1:7]
+    assert c_pu_str[1:7] == uobj[1:7]
+    assert c_wstr[1:7] == uobj[1:7]
+
+    # Single-item indexing.
+    assert c_pu_arr[1] == uobj[1]
+    assert c_pu_str[1] == uobj[1]
+    assert c_wstr[1] == uobj[1]
+
+    # len() of the C strings: u"unicode\u1234" has 8 characters.
+    assert len(c_pu_str) == 8
+    assert len(c_pu_arr) == 8
+    assert len(c_wstr) == 8
+
+    # sizeof() still sees the C types: a 42-item array vs. a plain pointer.
+    assert sizeof(c_pu_arr) == sizeof(Py_UNICODE) * 42
+    assert sizeof(c_pu_str) == sizeof(void*)
+
+    # Two non-BMP characters: 2 units when Py_UNICODE is at least 4 bytes
+    # wide, otherwise 4 units (one surrogate pair per character).
+    assert c_pu_wide_literal == uwide_literal
+    if sizeof(Py_UNICODE) >= 4:
+        assert len(c_pu_wide_literal) == 2
+    else:
+        assert len(c_pu_wide_literal) == 4
+
+    if sys.version_info >= (3, 3):
+        # Make sure len(unicode) is not reverted to pre-3.3 behavior
+        assert len(uwide_literal) == 2
+
+    # Truth tests: literals follow string emptiness.
+    assert u'unicode'
+    assert not u''
+    # c_pu_empty points at an empty literal yet is expected to be true;
+    # presumably the C variables are truth-tested as pointers -- TODO confirm.
+    assert c_pu_str
+    assert c_pu_empty
+
+
+def test_python_to_c():
+    """
+    >>> test_python_to_c()
+    """
+    cdef unicode u
+
+    # unicode objects passed where a const Py_UNICODE* argument is expected,
+    # both implicitly and through an explicit <LPWSTR> cast.
+    assert Py_UNICODE_equal(c_pu_arr, uobj)
+    assert Py_UNICODE_equal(c_pu_str, uobj)
+    assert Py_UNICODE_equal(c_pu_str, <LPWSTR>uobj)
+    # Slices and items are first stored in the typed local 'u'; presumably
+    # because taking a Py_UNICODE* from a temporary Python value is rejected
+    # (compare tests/errors/charptr_from_temp.pyx) -- TODO confirm.
+    u = uobj[1:]
+    assert Py_UNICODE_equal(c_pu_str + 1, u)
+    assert Py_UNICODE_equal(c_wstr + 1, u)
+    u = uobj[:1]
+    assert Py_UNICODE_equal(<Py_UNICODE*>u"u", u)
+    u = uobj[1:7]
+    assert Py_UNICODE_equal(<Py_UNICODE*>u"nicode", u)
+    u = uobj[1]
+    assert Py_UNICODE_equal(<Py_UNICODE*>u"n", u)
+
+    # Non-BMP content, compared in both directions of the conversion.
+    assert Py_UNICODE_equal(uwide_literal, <Py_UNICODE*>c_pu_wide_literal)
+
+    # len() of a unicode literal counts the embedded NUL character, while
+    # len() of the same literal cast to Py_UNICODE* stops at it.
+    assert len(u"abc\0") == 4
+    assert len(<Py_UNICODE*>u"abc\0") == 3