Commit d61f929f authored by scoder's avatar scoder

Merge pull request #191 from nnemkin/py_unicode_strings

Py_UNICODE* string support
parents e6826689 e351aa7d
...@@ -873,6 +873,7 @@ class GlobalState(object): ...@@ -873,6 +873,7 @@ class GlobalState(object):
self.const_cname_counter = 1 self.const_cname_counter = 1
self.string_const_index = {} self.string_const_index = {}
self.pyunicode_ptr_const_index = {}
self.int_const_index = {} self.int_const_index = {}
self.py_constants = [] self.py_constants = []
...@@ -1016,6 +1017,15 @@ class GlobalState(object): ...@@ -1016,6 +1017,15 @@ class GlobalState(object):
c.add_py_version(py_version) c.add_py_version(py_version)
return c return c
def get_pyunicode_ptr_const(self, text):
    """Return the C name of the Py_UNICODE[] constant for ``text``.

    Names are cached in ``self.pyunicode_ptr_const_index``: the first
    request for a given ``text`` obtains a fresh name from
    ``self.new_const_cname()`` and records it, later requests for the
    same text get the recorded name back.
    """
    assert text.is_unicode
    index = self.pyunicode_ptr_const_index
    if text not in index:
        # first use of this literal: allocate and remember a name
        index[text] = self.new_const_cname()
    return index[text]
def get_py_string_const(self, text, identifier=None, def get_py_string_const(self, text, identifier=None,
is_str=False, unicode_value=None): is_str=False, unicode_value=None):
# return a Python string constant, creating a new one if necessary # return a Python string constant, creating a new one if necessary
...@@ -1141,6 +1151,17 @@ class GlobalState(object): ...@@ -1141,6 +1151,17 @@ class GlobalState(object):
for py_string in c.py_strings.values(): for py_string in c.py_strings.values():
py_strings.append((c.cname, len(py_string.cname), py_string)) py_strings.append((c.cname, len(py_string.cname), py_string))
for c, cname in self.pyunicode_ptr_const_index.items():
utf16_array, utf32_array = StringEncoding.encode_pyunicode_string(c)
if utf16_array:
# Narrow and wide representations differ
decls_writer.putln("#ifdef Py_UNICODE_WIDE")
decls_writer.putln("static Py_UNICODE %s[] = { %s };" % (cname, utf32_array))
if utf16_array:
decls_writer.putln("#else")
decls_writer.putln("static Py_UNICODE %s[] = { %s };" % (cname, utf16_array))
decls_writer.putln("#endif")
if py_strings: if py_strings:
self.use_utility_code(UtilityCode.load_cached("InitStrings", "StringTools.c")) self.use_utility_code(UtilityCode.load_cached("InitStrings", "StringTools.c"))
py_strings.sort() py_strings.sort()
...@@ -1435,6 +1456,9 @@ class CCodeWriter(object): ...@@ -1435,6 +1456,9 @@ class CCodeWriter(object):
def get_string_const(self, text): def get_string_const(self, text):
return self.globalstate.get_string_const(text).cname return self.globalstate.get_string_const(text).cname
def get_pyunicode_ptr_const(self, text):
    """Forward to ``self.globalstate.get_pyunicode_ptr_const(text)``.

    The value produced by the global state is handed back unchanged.
    """
    global_state = self.globalstate
    return global_state.get_pyunicode_ptr_const(text)
def get_py_string_const(self, text, identifier=None, def get_py_string_const(self, text, identifier=None,
is_str=False, unicode_value=None): is_str=False, unicode_value=None):
return self.globalstate.get_py_string_const( return self.globalstate.get_py_string_const(
......
...@@ -63,14 +63,16 @@ coercion_error_dict = { ...@@ -63,14 +63,16 @@ coercion_error_dict = {
# string related errors # string related errors
(Builtin.unicode_type, Builtin.bytes_type) : "Cannot convert Unicode string to 'bytes' implicitly, encoding required.", (Builtin.unicode_type, Builtin.bytes_type) : "Cannot convert Unicode string to 'bytes' implicitly, encoding required.",
(Builtin.unicode_type, Builtin.str_type) : "Cannot convert Unicode string to 'str' implicitly. This is not portable and requires explicit encoding.", (Builtin.unicode_type, Builtin.str_type) : "Cannot convert Unicode string to 'str' implicitly. This is not portable and requires explicit encoding.",
(Builtin.unicode_type, PyrexTypes.c_char_ptr_type) : "Unicode objects do not support coercion to C types.", (Builtin.unicode_type, PyrexTypes.c_char_ptr_type) : "Unicode objects only support coercion to Py_UNICODE*.",
(Builtin.unicode_type, PyrexTypes.c_uchar_ptr_type) : "Unicode objects do not support coercion to C types.", (Builtin.unicode_type, PyrexTypes.c_uchar_ptr_type) : "Unicode objects only support coercion to Py_UNICODE*.",
(Builtin.bytes_type, Builtin.unicode_type) : "Cannot convert 'bytes' object to unicode implicitly, decoding required", (Builtin.bytes_type, Builtin.unicode_type) : "Cannot convert 'bytes' object to unicode implicitly, decoding required",
(Builtin.bytes_type, Builtin.str_type) : "Cannot convert 'bytes' object to str implicitly. This is not portable to Py3.", (Builtin.bytes_type, Builtin.str_type) : "Cannot convert 'bytes' object to str implicitly. This is not portable to Py3.",
(Builtin.bytes_type, PyrexTypes.c_py_unicode_ptr_type) : "Cannot convert 'bytes' object to Py_UNICODE*, use 'unicode'.",
(Builtin.str_type, Builtin.unicode_type) : "str objects do not support coercion to unicode, use a unicode string literal instead (u'')", (Builtin.str_type, Builtin.unicode_type) : "str objects do not support coercion to unicode, use a unicode string literal instead (u'')",
(Builtin.str_type, Builtin.bytes_type) : "Cannot convert 'str' to 'bytes' implicitly. This is not portable.", (Builtin.str_type, Builtin.bytes_type) : "Cannot convert 'str' to 'bytes' implicitly. This is not portable.",
(Builtin.str_type, PyrexTypes.c_char_ptr_type) : "'str' objects do not support coercion to C types (use 'bytes'?).", (Builtin.str_type, PyrexTypes.c_char_ptr_type) : "'str' objects do not support coercion to C types (use 'bytes'?).",
(Builtin.str_type, PyrexTypes.c_uchar_ptr_type) : "'str' objects do not support coercion to C types (use 'bytes'?).", (Builtin.str_type, PyrexTypes.c_uchar_ptr_type) : "'str' objects do not support coercion to C types (use 'bytes'?).",
(Builtin.str_type, PyrexTypes.c_py_unicode_ptr_type) : "'str' objects do not support coercion to C types (use 'unicode'?).",
(PyrexTypes.c_char_ptr_type, Builtin.unicode_type) : "Cannot convert 'char*' to unicode implicitly, decoding required", (PyrexTypes.c_char_ptr_type, Builtin.unicode_type) : "Cannot convert 'char*' to unicode implicitly, decoding required",
(PyrexTypes.c_uchar_ptr_type, Builtin.unicode_type) : "Cannot convert 'char*' to unicode implicitly, decoding required", (PyrexTypes.c_uchar_ptr_type, Builtin.unicode_type) : "Cannot convert 'char*' to unicode implicitly, decoding required",
} }
...@@ -1171,8 +1173,8 @@ class BytesNode(ConstNode): ...@@ -1171,8 +1173,8 @@ class BytesNode(ConstNode):
return self.result_code return self.result_code
class UnicodeNode(PyConstNode): class UnicodeNode(ConstNode):
# A Python unicode object # A Py_UNICODE* or unicode literal
# #
# value EncodedString # value EncodedString
# bytes_value BytesLiteral the literal parsed as bytes string ('-3' unicode literals only) # bytes_value BytesLiteral the literal parsed as bytes string ('-3' unicode literals only)
...@@ -1213,7 +1215,11 @@ class UnicodeNode(PyConstNode): ...@@ -1213,7 +1215,11 @@ class UnicodeNode(PyConstNode):
if dst_type.is_string and self.bytes_value is not None: if dst_type.is_string and self.bytes_value is not None:
# special case: '-3' enforced unicode literal used in a C char* context # special case: '-3' enforced unicode literal used in a C char* context
return BytesNode(self.pos, value=self.bytes_value).coerce_to(dst_type, env) return BytesNode(self.pos, value=self.bytes_value).coerce_to(dst_type, env)
error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE or Py_UCS4.") if dst_type.is_pyunicode_ptr:
node = UnicodeNode(self.pos, value=self.value)
node.type = dst_type
return node
error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE/Py_UCS4 (for characters) or Py_UNICODE* (for strings).")
elif dst_type is not py_object_type: elif dst_type is not py_object_type:
if not self.check_for_coercion_error(dst_type, env): if not self.check_for_coercion_error(dst_type, env):
self.fail_assignment(dst_type) self.fail_assignment(dst_type)
...@@ -1225,11 +1231,18 @@ class UnicodeNode(PyConstNode): ...@@ -1225,11 +1231,18 @@ class UnicodeNode(PyConstNode):
## and (0xD800 <= self.value[0] <= 0xDBFF) ## and (0xD800 <= self.value[0] <= 0xDBFF)
## and (0xDC00 <= self.value[1] <= 0xDFFF)) ## and (0xDC00 <= self.value[1] <= 0xDFFF))
def coerce_to_boolean(self, env):
    """Replace this literal by a constant BoolNode.

    The boolean is the truthiness of ``self.value``; it is used both as
    the node's ``value`` and as its ``constant_result``.  ``env`` is not
    consulted.
    """
    truth = bool(self.value)
    return BoolNode(self.pos, constant_result=truth, value=truth)
def contains_surrogates(self): def contains_surrogates(self):
return _string_contains_surrogates(self.value) return _string_contains_surrogates(self.value)
def generate_evaluation_code(self, code): def generate_evaluation_code(self, code):
self.result_code = code.get_py_string_const(self.value) if self.type.is_pyobject:
self.result_code = code.get_py_string_const(self.value)
else:
self.result_code = code.get_pyunicode_ptr_const(self.value)
def calculate_result_code(self): def calculate_result_code(self):
return self.result_code return self.result_code
...@@ -2633,6 +2646,9 @@ class IndexNode(ExprNode): ...@@ -2633,6 +2646,9 @@ class IndexNode(ExprNode):
if base_type.is_string: if base_type.is_string:
# sliced C strings must coerce to Python # sliced C strings must coerce to Python
return bytes_type return bytes_type
elif base_type.is_pyunicode_ptr:
# sliced Py_UNICODE* strings must coerce to Python
return unicode_type
elif base_type in (unicode_type, bytes_type, str_type, list_type, tuple_type): elif base_type in (unicode_type, bytes_type, str_type, list_type, tuple_type):
# slicing these returns the same type # slicing these returns the same type
return base_type return base_type
...@@ -3446,6 +3462,8 @@ class SliceIndexNode(ExprNode): ...@@ -3446,6 +3462,8 @@ class SliceIndexNode(ExprNode):
base_type = self.base.infer_type(env) base_type = self.base.infer_type(env)
if base_type.is_string or base_type.is_cpp_class: if base_type.is_string or base_type.is_cpp_class:
return bytes_type return bytes_type
elif base_type.is_pyunicode_ptr:
return unicode_type
elif base_type in (bytes_type, str_type, unicode_type, elif base_type in (bytes_type, str_type, unicode_type,
list_type, tuple_type): list_type, tuple_type):
return base_type return base_type
...@@ -3510,6 +3528,8 @@ class SliceIndexNode(ExprNode): ...@@ -3510,6 +3528,8 @@ class SliceIndexNode(ExprNode):
base_type = self.base.type base_type = self.base.type
if base_type.is_string or base_type.is_cpp_string: if base_type.is_string or base_type.is_cpp_string:
self.type = default_str_type(env) self.type = default_str_type(env)
elif base_type.is_pyunicode_ptr:
self.type = unicode_type
elif base_type.is_ptr: elif base_type.is_ptr:
self.type = base_type self.type = base_type
elif base_type.is_array: elif base_type.is_array:
...@@ -3578,6 +3598,27 @@ class SliceIndexNode(ExprNode): ...@@ -3578,6 +3598,27 @@ class SliceIndexNode(ExprNode):
stop_code, stop_code,
start_code, start_code,
code.error_goto_if_null(result, self.pos))) code.error_goto_if_null(result, self.pos)))
elif self.base.type.is_pyunicode_ptr:
base_result = self.base.result()
if self.base.type != PyrexTypes.c_py_unicode_ptr_type:
base_result = '((const Py_UNICODE*)%s)' % base_result
if self.stop is None:
code.putln(
"%s = __Pyx_PyUnicode_FromUnicode(%s + %s); %s" % (
result,
base_result,
start_code,
code.error_goto_if_null(result, self.pos)))
else:
code.putln(
"%s = __Pyx_PyUnicode_FromUnicodeAndLength(%s + %s, %s - %s); %s" % (
result,
base_result,
start_code,
stop_code,
start_code,
code.error_goto_if_null(result, self.pos)))
elif self.base.type is unicode_type: elif self.base.type is unicode_type:
code.globalstate.use_utility_code( code.globalstate.use_utility_code(
UtilityCode.load_cached("PyUnicode_Substring", "StringTools.c")) UtilityCode.load_cached("PyUnicode_Substring", "StringTools.c"))
...@@ -4903,11 +4944,11 @@ class AttributeNode(ExprNode): ...@@ -4903,11 +4944,11 @@ class AttributeNode(ExprNode):
self.is_py_attr = 0 self.is_py_attr = 0
self.member = self.attribute self.member = self.attribute
if obj_type is None: if obj_type is None:
if self.obj.type.is_string: if self.obj.type.is_string or self.obj.type.is_pyunicode_ptr:
self.obj = self.obj.coerce_to_pyobject(env) self.obj = self.obj.coerce_to_pyobject(env)
obj_type = self.obj.type obj_type = self.obj.type
else: else:
if obj_type.is_string: if obj_type.is_string or obj_type.is_pyunicode_ptr:
obj_type = py_object_type obj_type = py_object_type
if obj_type.is_ptr or obj_type.is_array: if obj_type.is_ptr or obj_type.is_array:
obj_type = obj_type.base_type obj_type = obj_type.base_type
...@@ -8334,8 +8375,12 @@ class BinopNode(ExprNode): ...@@ -8334,8 +8375,12 @@ class BinopNode(ExprNode):
if self.is_py_operation_types(type1, type2): if self.is_py_operation_types(type1, type2):
if type2.is_string: if type2.is_string:
type2 = Builtin.bytes_type type2 = Builtin.bytes_type
elif type2.is_pyunicode_ptr:
type2 = Builtin.unicode_type
if type1.is_string: if type1.is_string:
type1 = Builtin.bytes_type type1 = Builtin.bytes_type
elif type1.is_pyunicode_ptr:
type1 = Builtin.unicode_type
elif self.operator == '%' \ elif self.operator == '%' \
and type1 in (Builtin.str_type, Builtin.unicode_type): and type1 in (Builtin.str_type, Builtin.unicode_type):
# note that b'%s' % b'abc' doesn't work in Py3 # note that b'%s' % b'abc' doesn't work in Py3
...@@ -8584,7 +8629,7 @@ class AddNode(NumBinopNode): ...@@ -8584,7 +8629,7 @@ class AddNode(NumBinopNode):
# '+' operator. # '+' operator.
def is_py_operation_types(self, type1, type2): def is_py_operation_types(self, type1, type2):
if type1.is_string and type2.is_string: if type1.is_string and type2.is_string or type1.is_pyunicode_ptr and type2.is_pyunicode_ptr:
return 1 return 1
else: else:
return NumBinopNode.is_py_operation_types(self, type1, type2) return NumBinopNode.is_py_operation_types(self, type1, type2)
...@@ -9947,7 +9992,7 @@ class CoerceToPyTypeNode(CoercionNode): ...@@ -9947,7 +9992,7 @@ class CoerceToPyTypeNode(CoercionNode):
# be specific about some known types # be specific about some known types
if arg.type.is_string or arg.type.is_cpp_string: if arg.type.is_string or arg.type.is_cpp_string:
self.type = default_str_type(env) self.type = default_str_type(env)
elif arg.type.is_unicode_char: elif arg.type.is_pyunicode_ptr or arg.type.is_unicode_char:
self.type = unicode_type self.type = unicode_type
elif arg.type.is_complex: elif arg.type.is_complex:
self.type = Builtin.complex_type self.type = Builtin.complex_type
...@@ -10062,13 +10107,13 @@ class CoerceFromPyTypeNode(CoercionNode): ...@@ -10062,13 +10107,13 @@ class CoerceFromPyTypeNode(CoercionNode):
if not result_type.create_from_py_utility_code(env): if not result_type.create_from_py_utility_code(env):
error(arg.pos, error(arg.pos,
"Cannot convert Python object to '%s'" % result_type) "Cannot convert Python object to '%s'" % result_type)
if self.type.is_string: if self.type.is_string or self.type.is_pyunicode_ptr:
if self.arg.is_ephemeral(): if self.arg.is_ephemeral():
error(arg.pos, error(arg.pos,
"Obtaining char* from temporary Python value") "Obtaining '%s' from temporary Python value" % result_type)
elif self.arg.is_name and self.arg.entry and self.arg.entry.is_pyglobal: elif self.arg.is_name and self.arg.entry and self.arg.entry.is_pyglobal:
warning(arg.pos, warning(arg.pos,
"Obtaining char* from externally modifiable global Python value", "Obtaining '%s' from externally modifiable global Python value" % result_type,
level=1) level=1)
def analyse_types(self, env): def analyse_types(self, env):
......
...@@ -1977,6 +1977,11 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform): ...@@ -1977,6 +1977,11 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform):
PyrexTypes.CFuncTypeArg("bytes", PyrexTypes.c_char_ptr_type, None) PyrexTypes.CFuncTypeArg("bytes", PyrexTypes.c_char_ptr_type, None)
]) ])
Pyx_Py_UNICODE_strlen_func_type = PyrexTypes.CFuncType(
PyrexTypes.c_size_t_type, [
PyrexTypes.CFuncTypeArg("unicode", PyrexTypes.c_py_unicode_ptr_type, None)
])
PyObject_Size_func_type = PyrexTypes.CFuncType( PyObject_Size_func_type = PyrexTypes.CFuncType(
PyrexTypes.c_py_ssize_t_type, [ PyrexTypes.c_py_ssize_t_type, [
PyrexTypes.CFuncTypeArg("obj", PyrexTypes.py_object_type, None) PyrexTypes.CFuncTypeArg("obj", PyrexTypes.py_object_type, None)
...@@ -1996,7 +2001,8 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform): ...@@ -1996,7 +2001,8 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform):
_ext_types_with_pysize = set(["cpython.array.array"]) _ext_types_with_pysize = set(["cpython.array.array"])
def _handle_simple_function_len(self, node, pos_args): def _handle_simple_function_len(self, node, pos_args):
"""Replace len(char*) by the equivalent call to strlen() and """Replace len(char*) by the equivalent call to strlen(),
len(Py_UNICODE*) by the equivalent Py_UNICODE_strlen() and
len(known_builtin_type) by an equivalent C-API call. len(known_builtin_type) by an equivalent C-API call.
""" """
if len(pos_args) != 1: if len(pos_args) != 1:
...@@ -2011,6 +2017,11 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform): ...@@ -2011,6 +2017,11 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform):
args = [arg], args = [arg],
is_temp = node.is_temp, is_temp = node.is_temp,
utility_code = UtilityCode.load_cached("IncludeStringH", "StringTools.c")) utility_code = UtilityCode.load_cached("IncludeStringH", "StringTools.c"))
elif arg.type.is_pyunicode_ptr:
new_node = ExprNodes.PythonCapiCallNode(
node.pos, "__Pyx_Py_UNICODE_strlen", self.Pyx_Py_UNICODE_strlen_func_type,
args = [arg],
is_temp = node.is_temp)
elif arg.type.is_pyobject: elif arg.type.is_pyobject:
cfunc_name = self._map_to_capi_len_function(arg.type) cfunc_name = self._map_to_capi_len_function(arg.type)
if cfunc_name is None: if cfunc_name is None:
......
...@@ -145,6 +145,7 @@ class PyrexType(BaseType): ...@@ -145,6 +145,7 @@ class PyrexType(BaseType):
# is_enum boolean Is a C enum type # is_enum boolean Is a C enum type
# is_typedef boolean Is a typedef type # is_typedef boolean Is a typedef type
# is_string boolean Is a C char * type # is_string boolean Is a C char * type
# is_pyunicode_ptr boolean Is a C Py_UNICODE * type
# is_cpp_string boolean Is a C++ std::string type # is_cpp_string boolean Is a C++ std::string type
# is_unicode_char boolean Is either Py_UCS4 or Py_UNICODE # is_unicode_char boolean Is either Py_UCS4 or Py_UNICODE
# is_returncode boolean Is used only to signal exceptions # is_returncode boolean Is used only to signal exceptions
...@@ -202,6 +203,7 @@ class PyrexType(BaseType): ...@@ -202,6 +203,7 @@ class PyrexType(BaseType):
is_enum = 0 is_enum = 0
is_typedef = 0 is_typedef = 0
is_string = 0 is_string = 0
is_pyunicode_ptr = 0
is_unicode_char = 0 is_unicode_char = 0
is_returncode = 0 is_returncode = 0
is_error = 0 is_error = 0
...@@ -873,7 +875,7 @@ class PyObjectType(PyrexType): ...@@ -873,7 +875,7 @@ class PyObjectType(PyrexType):
def assignable_from(self, src_type): def assignable_from(self, src_type):
# except for pointers, conversion will be attempted # except for pointers, conversion will be attempted
return not src_type.is_ptr or src_type.is_string return not src_type.is_ptr or src_type.is_string or src_type.is_pyunicode_ptr
def declaration_code(self, entity_code, def declaration_code(self, entity_code,
for_display = 0, dll_linkage = None, pyrex = 0): for_display = 0, dll_linkage = None, pyrex = 0):
...@@ -1163,7 +1165,7 @@ class CType(PyrexType): ...@@ -1163,7 +1165,7 @@ class CType(PyrexType):
def error_condition(self, result_code): def error_condition(self, result_code):
conds = [] conds = []
if self.is_string: if self.is_string or self.is_pyunicode_ptr:
conds.append("(!%s)" % result_code) conds.append("(!%s)" % result_code)
elif self.exception_value is not None: elif self.exception_value is not None:
conds.append("(%s == (%s)%s)" % (result_code, self.sign_and_name(), self.exception_value)) conds.append("(%s == (%s)%s)" % (result_code, self.sign_and_name(), self.exception_value))
...@@ -2180,6 +2182,9 @@ class CPointerBaseType(CType): ...@@ -2180,6 +2182,9 @@ class CPointerBaseType(CType):
if base_type.same_as(char_type): if base_type.same_as(char_type):
self.is_string = 1 self.is_string = 1
break break
else:
if base_type.same_as(c_py_unicode_type):
self.is_pyunicode_ptr = 1
if self.is_string and not base_type.is_error: if self.is_string and not base_type.is_error:
if base_type.signed: if base_type.signed:
...@@ -2191,10 +2196,17 @@ class CPointerBaseType(CType): ...@@ -2191,10 +2196,17 @@ class CPointerBaseType(CType):
if self.is_ptr: if self.is_ptr:
self.from_py_function = "__Pyx_PyObject_AsUString" self.from_py_function = "__Pyx_PyObject_AsUString"
self.exception_value = "NULL" self.exception_value = "NULL"
elif self.is_pyunicode_ptr and not base_type.is_error:
self.to_py_function = "__Pyx_PyUnicode_FromUnicode"
if self.is_ptr:
self.from_py_function = "__Pyx_PyUnicode_AsUnicode"
self.exception_value = "NULL"
def py_type_name(self): def py_type_name(self):
if self.is_string: if self.is_string:
return "bytes" return "bytes"
elif self.is_pyunicode_ptr:
return "unicode"
else: else:
return super(CPointerBaseType, self).py_type_name() return super(CPointerBaseType, self).py_type_name()
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
import re import re
import sys import sys
import array
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
_unicode, _str, _bytes = str, str, bytes _unicode, _str, _bytes = str, str, bytes
...@@ -262,3 +263,22 @@ def split_string_literal(s, limit=2000): ...@@ -262,3 +263,22 @@ def split_string_literal(s, limit=2000):
chunks.append(s[start:end]) chunks.append(s[start:end])
start = end start = end
return '""'.join(chunks) return '""'.join(chunks)
def encode_pyunicode_string(s):
    """Create Py_UNICODE[] representation of a given unicode string.

    Returns a ``(utf16, utf32)`` pair of strings.  Each one is a
    comma-separated list of decimal code unit values ending with the 0
    terminator, ready to be pasted into a C array initializer.

    ``utf16`` is the empty string when ``s`` contains no character above
    U+FFFF, i.e. when the UTF-16 and UTF-32 unit sequences are the same
    and a single array declaration is sufficient.
    """
    # The generic 'UTF-32' / 'UTF-16' codecs emit a BOM followed by the
    # data in native byte order, so the bytes can be reinterpreted as
    # native integers directly.
    utf32_array = array.array('i', s.encode('UTF-32'))
    assert utf32_array.itemsize == 4
    utf32_array.pop(0)     # Remove BOM
    utf32_array.append(0)  # Add NULL terminator

    utf16_array = []
    # The array always holds at least the terminator, so max() is safe.
    if max(utf32_array) > 65535:
        # Some character needs a surrogate pair: the narrow form differs.
        utf16_array = array.array('H', s.encode('UTF-16'))
        utf16_array.pop(0)     # Remove BOM
        utf16_array.append(0)  # Add NULL terminator

    # str() rather than unicode(): the latter name only exists on
    # Python 2, and the decimal digits are plain ASCII either way.
    return ",".join(map(str, utf16_array)), ",".join(map(str, utf32_array))
...@@ -24,6 +24,21 @@ static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(char*); ...@@ -24,6 +24,21 @@ static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(char*);
#define __Pyx_PyStr_FromUString(s) __Pyx_PyStr_FromString((char*)s) #define __Pyx_PyStr_FromUString(s) __Pyx_PyStr_FromString((char*)s)
#define __Pyx_PyUnicode_FromUString(s) __Pyx_PyUnicode_FromString((char*)s) #define __Pyx_PyUnicode_FromUString(s) __Pyx_PyUnicode_FromString((char*)s)
#if PY_MAJOR_VERSION < 3
/* Number of Py_UNICODE units in u before its terminating zero unit. */
static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u)
{
    const Py_UNICODE *pos = u;
    while (*pos != 0) {
        ++pos;
    }
    /* pos now points at the terminator, so the difference is the length */
    return (size_t)(pos - u);
}
#else
/* On Python 3 the name is simply an alias for Py_UNICODE_strlen. */
#define __Pyx_Py_UNICODE_strlen Py_UNICODE_strlen
#endif
#define __Pyx_PyUnicode_FromUnicode(u) PyUnicode_FromUnicode(u, __Pyx_Py_UNICODE_strlen(u))
#define __Pyx_PyUnicode_FromUnicodeAndLength PyUnicode_FromUnicode
#define __Pyx_PyUnicode_AsUnicode PyUnicode_AsUnicode
#define __Pyx_Owned_Py_None(b) (Py_INCREF(Py_None), Py_None) #define __Pyx_Owned_Py_None(b) (Py_INCREF(Py_None), Py_None)
#define __Pyx_PyBool_FromLong(b) ((b) ? (Py_INCREF(Py_True), Py_True) : (Py_INCREF(Py_False), Py_False)) #define __Pyx_PyBool_FromLong(b) ((b) ? (Py_INCREF(Py_True), Py_True) : (Py_INCREF(Py_False), Py_False))
static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*); static CYTHON_INLINE int __Pyx_PyObject_IsTrue(PyObject*);
......
...@@ -546,3 +546,56 @@ code will run in plain C code, (actually using a switch statement):: ...@@ -546,3 +546,56 @@ code will run in plain C code, (actually using a switch statement)::
Combined with the looping optimisation above, this can result in very Combined with the looping optimisation above, this can result in very
efficient character switching code, e.g. in unicode parsers. efficient character switching code, e.g. in unicode parsers.
Windows and wide character APIs
-------------------------------
Windows system APIs natively support Unicode in the form of
zero-terminated UTF-16 encoded :c:type:`wchar_t*` strings, so-called
"wide strings".
By default, Windows builds of CPython define :c:type:`Py_UNICODE` as
a synonym for :c:type:`wchar_t`. This makes the internal ``unicode``
representation compatible with UTF-16 and allows for efficient zero-copy
conversions. This also means that Windows builds are always
`Narrow Unicode builds`_ with all their caveats.
To aid interoperation with Windows APIs, Cython 0.19 supports wide
strings (in the form of :c:type:`Py_UNICODE*`) and implicitly converts
them to and from ``unicode`` string objects. These conversions behave the
same way as they do for :c:type:`char*` and ``bytes`` as described in
`Passing byte strings`_.
In addition to automatic conversion, unicode literals that appear
in a C context become C-level wide string literals, and the :py:func:`len`
built-in function is specialized to compute the length of a zero-terminated
:c:type:`Py_UNICODE*` string or array.
Here is an example of how one would call a Unicode API on Windows::
cdef extern from "Windows.h":
ctypedef Py_UNICODE WCHAR
ctypedef const WCHAR* LPCWSTR
ctypedef void* HWND
int MessageBoxW(HWND hWnd, LPCWSTR lpText, LPCWSTR lpCaption, int uType)
title = u"Windows Interop Demo - Python %d.%d.%d" % sys.version_info[:3]
MessageBoxW(NULL, u"Hello Cython \u263a", title, 0)
.. Warning::
The use of :c:type:`Py_UNICODE*` strings outside of Windows is
strongly discouraged. :c:type:`Py_UNICODE` is inherently not
portable between different platforms and Python versions.
CPython 3.3 has moved to a flexible internal representation of
unicode strings (:pep:`393`), making all :c:type:`Py_UNICODE` related
APIs deprecated and inefficient.
One consequence of the CPython 3.3 changes is that :py:func:`len` of
``unicode`` strings is always measured in *code points* ("characters"),
while Windows APIs expect the number of UTF-16 *code units*
(where each surrogate is counted individually). To always get the number
of code units, call :c:func:`PyUnicode_GetSize` directly.
# mode: error # mode: error
# tag: werror, charptr, conversion, temp # tag: werror, charptr, conversion, temp, py_unicode_strings
cdef bytes c_s = b"abc" cdef bytes c_s = b"abc"
s = b"abc" s = b"abc"
...@@ -18,7 +18,28 @@ cptr = s ...@@ -18,7 +18,28 @@ cptr = s
# temp => error # temp => error
cptr = s + b"cba" cptr = s + b"cba"
cdef unicode c_u = u"abc"
u = u"abc"
cdef Py_UNICODE* cuptr
# constant => ok
cuptr = u"xyz"
# global cdef variable => ok
cuptr = c_u
# pyglobal => warning
cuptr = u
# temp => error
cuptr = u + u"cba"
_ERRORS = """ _ERRORS = """
16:8: Obtaining char* from externally modifiable global Python value 16:8: Obtaining 'char *' from externally modifiable global Python value
19:9: Obtaining char* from temporary Python value 19:9: Obtaining 'char *' from temporary Python value
34:9: Obtaining 'Py_UNICODE *' from externally modifiable global Python value
37:10: Obtaining 'Py_UNICODE *' from temporary Python value
""" """
...@@ -15,5 +15,5 @@ _ERRORS = """ ...@@ -15,5 +15,5 @@ _ERRORS = """
4:14: Only single-character string literals can be coerced into ints. 4:14: Only single-character string literals can be coerced into ints.
5:14: Only single-character string literals can be coerced into ints. 5:14: Only single-character string literals can be coerced into ints.
8:15: Only single-character string literals can be coerced into ints. 8:15: Only single-character string literals can be coerced into ints.
11:14: Unicode literals do not support coercion to C types other than Py_UNICODE or Py_UCS4. 11:14: Unicode literals do not support coercion to C types other than Py_UNICODE/Py_UCS4 (for characters) or Py_UNICODE* (for strings).
""" """
# mode: error # mode: error
# coding: ASCII # coding: ASCII
# tag: py_unicode_strings
# ok: # ok:
cdef char* c1 = "abc" cdef char* c1 = "abc"
cdef str s1 = "abc" cdef str s1 = "abc"
cdef unicode u1 = u"abc" cdef unicode u1 = u"abc"
cdef Py_UNICODE* cu1 = u1
cdef bytes b1 = b"abc" cdef bytes b1 = b"abc"
cdef char* c2 = b"abc" cdef char* c2 = b"abc"
...@@ -21,12 +23,18 @@ o4 = c1 ...@@ -21,12 +23,18 @@ o4 = c1
o5 = b1 o5 = b1
o6 = s1 o6 = s1
o7 = u1 o7 = u1
o8 = cu1
# errors: # errors:
cdef char* c_f1 = u"abc" cdef char* c_f1 = u"abc"
cdef char* c_f2 = u1 cdef char* c_f2 = u1
cdef char* c_f3 = s1 cdef char* c_f3 = s1
cdef Py_UNICODE* cu_f1 = c1
cdef Py_UNICODE* cu_f2 = b1
cdef Py_UNICODE* cu_f3 = s1
cdef Py_UNICODE* cu_f4 = b"abc"
cdef bytes b_f1 = u"abc" cdef bytes b_f1 = u"abc"
cdef bytes b_f2 = u1 cdef bytes b_f2 = u1
cdef bytes b_f3 = s1 cdef bytes b_f3 = s1
...@@ -56,31 +64,36 @@ print <unicode>c1 ...@@ -56,31 +64,36 @@ print <unicode>c1
print <unicode>c1[1:2] print <unicode>c1[1:2]
_ERRORS = u""" _ERRORS = u"""
26:20: Unicode literals do not support coercion to C types other than Py_UNICODE or Py_UCS4. 29:20: Unicode literals do not support coercion to C types other than Py_UNICODE/Py_UCS4 (for characters) or Py_UNICODE* (for strings).
27:22: Unicode objects do not support coercion to C types. 30:22: Unicode objects only support coercion to Py_UNICODE*.
28:22: 'str' objects do not support coercion to C types (use 'bytes'?). 31:22: 'str' objects do not support coercion to C types (use 'bytes'?).
30:20: Cannot convert Unicode string to 'bytes' implicitly, encoding required. 33:27: Cannot assign type 'char *' to 'Py_UNICODE *'
31:22: Cannot convert Unicode string to 'bytes' implicitly, encoding required. 34:27: Cannot convert 'bytes' object to Py_UNICODE*, use 'unicode'.
32:22: Cannot convert 'str' to 'bytes' implicitly. This is not portable. 35:27: 'str' objects do not support coercion to C types (use 'unicode'?).
36:25: Cannot convert 'bytes' object to Py_UNICODE*, use 'unicode'.
34:17: Cannot convert 'bytes' object to str implicitly. This is not portable to Py3.
35:19: Cannot convert 'bytes' object to str implicitly. This is not portable to Py3. 38:20: Cannot convert Unicode string to 'bytes' implicitly, encoding required.
36:17: Cannot convert Unicode string to 'str' implicitly. This is not portable and requires explicit encoding. 39:22: Cannot convert Unicode string to 'bytes' implicitly, encoding required.
37:19: Cannot convert Unicode string to 'str' implicitly. This is not portable and requires explicit encoding. 40:22: Cannot convert 'str' to 'bytes' implicitly. This is not portable.
39:20: str objects do not support coercion to unicode, use a unicode string literal instead (u'') 42:17: Cannot convert 'bytes' object to str implicitly. This is not portable to Py3.
40:22: str objects do not support coercion to unicode, use a unicode string literal instead (u'') 43:19: Cannot convert 'bytes' object to str implicitly. This is not portable to Py3.
41:20: Cannot convert 'bytes' object to unicode implicitly, decoding required 44:17: Cannot convert Unicode string to 'str' implicitly. This is not portable and requires explicit encoding.
42:22: Cannot convert 'bytes' object to unicode implicitly, decoding required 45:19: Cannot convert Unicode string to 'str' implicitly. This is not portable and requires explicit encoding.
43:22: Cannot convert 'char*' to unicode implicitly, decoding required
47:20: str objects do not support coercion to unicode, use a unicode string literal instead (u'')
45:19: Cannot assign type 'str object' to 'tuple object' 48:22: str objects do not support coercion to unicode, use a unicode string literal instead (u'')
46:18: Cannot assign type 'unicode object' to 'tuple object' 49:20: Cannot convert 'bytes' object to unicode implicitly, decoding required
47:18: Cannot assign type 'bytes object' to 'tuple object' 50:22: Cannot convert 'bytes' object to unicode implicitly, decoding required
51:22: Cannot convert 'char*' to unicode implicitly, decoding required
53:13: default encoding required for conversion from 'char *' to 'str object'
54:13: default encoding required for conversion from 'char *' to 'str object' 53:19: Cannot assign type 'str object' to 'tuple object'
55:17: Cannot convert 'char*' to unicode implicitly, decoding required 54:18: Cannot assign type 'unicode object' to 'tuple object'
56:17: default encoding required for conversion from 'char *' to 'unicode object' 55:18: Cannot assign type 'bytes object' to 'tuple object'
61:13: default encoding required for conversion from 'char *' to 'str object'
62:13: default encoding required for conversion from 'char *' to 'str object'
63:17: Cannot convert 'char*' to unicode implicitly, decoding required
64:17: default encoding required for conversion from 'char *' to 'unicode object'
""" """
# tag: py_unicode_strings
import sys
cimport cython
from libc.string cimport memcpy, strcpy
cdef bint Py_UNICODE_equal(const Py_UNICODE* u1, const Py_UNICODE* u2):
    # Compare two zero-terminated Py_UNICODE strings unit by unit.
    # Walk forward while both hold the same non-zero unit; a matching
    # non-zero unit in u1 implies u2 has not reached its terminator either.
    cdef Py_ssize_t i = 0
    while u1[i] != 0 and u1[i] == u2[i]:
        i += 1
    # Equal only if both strings stopped on the same unit (the terminator).
    return u1[i] == u2[i]
ctypedef Py_UNICODE* LPWSTR
cdef unicode uobj = u'unicode\u1234'
cdef unicode uobj1 = u'u'
cdef Py_UNICODE* c_pu_str = u"unicode\u1234"
cdef Py_UNICODE c_pu_arr[42]
cdef LPWSTR c_wstr = u"unicode\u1234"
cdef Py_UNICODE* c_pu_empty = u""
cdef char* c_empty = ""
cdef unicode uwide_literal = u'\U00020000\U00020001'
cdef Py_UNICODE* c_pu_wide_literal = u'\U00020000\U00020001'
memcpy(c_pu_arr, c_pu_str, sizeof(Py_UNICODE) * (len(uobj) + 1))
def test_c_to_python():
"""
>>> test_c_to_python()
"""
assert c_pu_arr == uobj
assert c_pu_str == uobj
assert c_wstr == uobj
assert c_pu_arr[1:] == uobj[1:]
assert c_pu_str[1:] == uobj[1:]
assert c_wstr[1:] == uobj[1:]
assert c_pu_arr[:1] == uobj[:1]
assert c_pu_arr[:1] == uobj[:1]
assert c_pu_str[:1] == uobj[:1]
assert c_wstr[:1] == uobj[:1]
assert c_pu_arr[1:7] == uobj[1:7]
assert c_pu_str[1:7] == uobj[1:7]
assert c_wstr[1:7] == uobj[1:7]
assert c_pu_arr[1] == uobj[1]
assert c_pu_str[1] == uobj[1]
assert c_wstr[1] == uobj[1]
assert len(c_pu_str) == 8
assert len(c_pu_arr) == 8
assert len(c_wstr) == 8
assert sizeof(c_pu_arr) == sizeof(Py_UNICODE) * 42
assert sizeof(c_pu_str) == sizeof(void*)
assert c_pu_wide_literal == uwide_literal
if sizeof(Py_UNICODE) >= 4:
assert len(c_pu_wide_literal) == 2
else:
assert len(c_pu_wide_literal) == 4
if sys.version_info >= (3, 3):
# Make sure len(unicode) is not reverted to pre-3.3 behavior
assert len(uwide_literal) == 2
assert u'unicode'
assert not u''
assert c_pu_str
assert c_pu_empty
def test_python_to_c():
"""
>>> test_python_to_c()
"""
cdef unicode u
assert Py_UNICODE_equal(c_pu_arr, uobj)
assert Py_UNICODE_equal(c_pu_str, uobj)
assert Py_UNICODE_equal(c_pu_str, <LPWSTR>uobj)
u = uobj[1:]
assert Py_UNICODE_equal(c_pu_str + 1, u)
assert Py_UNICODE_equal(c_wstr + 1, u)
u = uobj[:1]
assert Py_UNICODE_equal(<Py_UNICODE*>u"u", u)
u = uobj[1:7]
assert Py_UNICODE_equal(<Py_UNICODE*>u"nicode", u)
u = uobj[1]
assert Py_UNICODE_equal(<Py_UNICODE*>u"n", u)
assert Py_UNICODE_equal(uwide_literal, <Py_UNICODE*>c_pu_wide_literal)
assert len(u"abc\0") == 4
assert len(<Py_UNICODE*>u"abc\0") == 3
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment