Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Labels
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Commits
Open sidebar
nexedi
cython
Commits
d61f929f
Commit
d61f929f
authored
Mar 05, 2013
by
scoder
Browse files
Options
Browse Files
Download
Plain Diff
Merge pull request #191 from nnemkin/py_unicode_strings
Py_UNICODE* string support
parents
e6826689
e351aa7d
Changes
11
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
361 additions
and
47 deletions
+361
-47
Cython/Compiler/Code.py
Cython/Compiler/Code.py
+24
-0
Cython/Compiler/ExprNodes.py
Cython/Compiler/ExprNodes.py
+58
-13
Cython/Compiler/Optimize.py
Cython/Compiler/Optimize.py
+12
-1
Cython/Compiler/PyrexTypes.py
Cython/Compiler/PyrexTypes.py
+14
-2
Cython/Compiler/StringEncoding.py
Cython/Compiler/StringEncoding.py
+20
-0
Cython/Utility/TypeConversion.c
Cython/Utility/TypeConversion.c
+15
-0
docs/src/tutorial/strings.rst
docs/src/tutorial/strings.rst
+53
-0
tests/errors/charptr_from_temp.pyx
tests/errors/charptr_from_temp.pyx
+24
-3
tests/errors/e_strcoerce.pyx
tests/errors/e_strcoerce.pyx
+1
-1
tests/errors/string_assignments.pyx
tests/errors/string_assignments.pyx
+40
-27
tests/run/py_unicode_strings.pyx
tests/run/py_unicode_strings.pyx
+100
-0
No files found.
Cython/Compiler/Code.py
View file @
d61f929f
...
...
@@ -873,6 +873,7 @@ class GlobalState(object):
self.const_cname_counter = 1
self.string_const_index = {}
self.pyunicode_ptr_const_index = {}
self.int_const_index = {}
self.py_constants = []
...
...
@@ -1016,6 +1017,15 @@ class GlobalState(object):
c
.
add_py_version
(
py_version
)
return
c
def get_pyunicode_ptr_const(self, text):
    """Return the C name of the Py_UNICODE[] constant for ``text``.

    The name is looked up in ``self.pyunicode_ptr_const_index``; when
    ``text`` has no entry yet, a fresh name is taken from
    ``self.new_const_cname()`` and recorded before being returned.
    """
    assert text.is_unicode
    index = self.pyunicode_ptr_const_index
    if text not in index:
        index[text] = self.new_const_cname()
    return index[text]
def
get_py_string_const
(
self
,
text
,
identifier
=
None
,
is_str
=
False
,
unicode_value
=
None
):
# return a Python string constant, creating a new one if necessary
...
...
@@ -1141,6 +1151,17 @@ class GlobalState(object):
for
py_string
in
c
.
py_strings
.
values
():
py_strings
.
append
((
c
.
cname
,
len
(
py_string
.
cname
),
py_string
))
for
c
,
cname
in
self
.
pyunicode_ptr_const_index
.
items
():
utf16_array
,
utf32_array
=
StringEncoding
.
encode_pyunicode_string
(
c
)
if
utf16_array
:
# Narrow and wide representations differ
decls_writer
.
putln
(
"#ifdef Py_UNICODE_WIDE"
)
decls_writer
.
putln
(
"static Py_UNICODE %s[] = { %s };"
%
(
cname
,
utf32_array
))
if
utf16_array
:
decls_writer
.
putln
(
"#else"
)
decls_writer
.
putln
(
"static Py_UNICODE %s[] = { %s };"
%
(
cname
,
utf16_array
))
decls_writer
.
putln
(
"#endif"
)
if
py_strings
:
self
.
use_utility_code
(
UtilityCode
.
load_cached
(
"InitStrings"
,
"StringTools.c"
))
py_strings
.
sort
()
...
...
@@ -1435,6 +1456,9 @@ class CCodeWriter(object):
def get_string_const(self, text):
    """Return the ``cname`` of the string constant that the global state
    hands out for ``text``."""
    string_const = self.globalstate.get_string_const(text)
    return string_const.cname
def get_pyunicode_ptr_const(self, text):
    """Delegate to the global state and return whatever it reports as the
    Py_UNICODE[] constant for ``text``."""
    state = self.globalstate
    return state.get_pyunicode_ptr_const(text)
def
get_py_string_const
(
self
,
text
,
identifier
=
None
,
is_str
=
False
,
unicode_value
=
None
):
return
self
.
globalstate
.
get_py_string_const
(
...
...
Cython/Compiler/ExprNodes.py
View file @
d61f929f
...
...
@@ -63,14 +63,16 @@ coercion_error_dict = {
# string related errors
(
Builtin
.
unicode_type
,
Builtin
.
bytes_type
)
:
"Cannot convert Unicode string to 'bytes' implicitly, encoding required."
,
(
Builtin
.
unicode_type
,
Builtin
.
str_type
)
:
"Cannot convert Unicode string to 'str' implicitly. This is not portable and requires explicit encoding."
,
(
Builtin
.
unicode_type
,
PyrexTypes
.
c_char_ptr_type
)
:
"Unicode objects
do not support coercion to C types
."
,
(
Builtin
.
unicode_type
,
PyrexTypes
.
c_uchar_ptr_type
)
:
"Unicode objects
do not support coercion to C types
."
,
(
Builtin
.
unicode_type
,
PyrexTypes
.
c_char_ptr_type
)
:
"Unicode objects
only support coercion to Py_UNICODE*
."
,
(
Builtin
.
unicode_type
,
PyrexTypes
.
c_uchar_ptr_type
)
:
"Unicode objects
only support coercion to Py_UNICODE*
."
,
(
Builtin
.
bytes_type
,
Builtin
.
unicode_type
)
:
"Cannot convert 'bytes' object to unicode implicitly, decoding required"
,
(
Builtin
.
bytes_type
,
Builtin
.
str_type
)
:
"Cannot convert 'bytes' object to str implicitly. This is not portable to Py3."
,
(
Builtin
.
bytes_type
,
PyrexTypes
.
c_py_unicode_ptr_type
)
:
"Cannot convert 'bytes' object to Py_UNICODE*, use 'unicode'."
,
(
Builtin
.
str_type
,
Builtin
.
unicode_type
)
:
"str objects do not support coercion to unicode, use a unicode string literal instead (u'')"
,
(
Builtin
.
str_type
,
Builtin
.
bytes_type
)
:
"Cannot convert 'str' to 'bytes' implicitly. This is not portable."
,
(
Builtin
.
str_type
,
PyrexTypes
.
c_char_ptr_type
)
:
"'str' objects do not support coercion to C types (use 'bytes'?)."
,
(
Builtin
.
str_type
,
PyrexTypes
.
c_uchar_ptr_type
)
:
"'str' objects do not support coercion to C types (use 'bytes'?)."
,
(
Builtin
.
str_type
,
PyrexTypes
.
c_py_unicode_ptr_type
)
:
"'str' objects do not support coercion to C types (use 'unicode'?)."
,
(
PyrexTypes
.
c_char_ptr_type
,
Builtin
.
unicode_type
)
:
"Cannot convert 'char*' to unicode implicitly, decoding required"
,
(
PyrexTypes
.
c_uchar_ptr_type
,
Builtin
.
unicode_type
)
:
"Cannot convert 'char*' to unicode implicitly, decoding required"
,
}
...
...
@@ -1171,8 +1173,8 @@ class BytesNode(ConstNode):
return
self
.
result_code
class
UnicodeNode
(
Py
ConstNode
):
# A Py
thon unicode object
class
UnicodeNode
(
ConstNode
):
# A Py
_UNICODE* or unicode literal
#
# value EncodedString
# bytes_value BytesLiteral the literal parsed as bytes string ('-3' unicode literals only)
...
...
@@ -1213,7 +1215,11 @@ class UnicodeNode(PyConstNode):
if
dst_type
.
is_string
and
self
.
bytes_value
is
not
None
:
# special case: '-3' enforced unicode literal used in a C char* context
return
BytesNode
(
self
.
pos
,
value
=
self
.
bytes_value
).
coerce_to
(
dst_type
,
env
)
error
(
self
.
pos
,
"Unicode literals do not support coercion to C types other than Py_UNICODE or Py_UCS4."
)
if
dst_type
.
is_pyunicode_ptr
:
node
=
UnicodeNode
(
self
.
pos
,
value
=
self
.
value
)
node
.
type
=
dst_type
return
node
error
(
self
.
pos
,
"Unicode literals do not support coercion to C types other than Py_UNICODE/Py_UCS4 (for characters) or Py_UNICODE* (for strings)."
)
elif
dst_type
is
not
py_object_type
:
if
not
self
.
check_for_coercion_error
(
dst_type
,
env
):
self
.
fail_assignment
(
dst_type
)
...
...
@@ -1225,11 +1231,18 @@ class UnicodeNode(PyConstNode):
## and (0xD800 <= self.value[0] <= 0xDBFF)
## and (0xDC00 <= self.value[1] <= 0xDFFF))
def coerce_to_boolean(self, env):
    """Replace this literal by a constant BoolNode.

    The boolean is the truth value of ``self.value`` and is used both as
    the node's ``value`` and as its ``constant_result``.  ``env`` is not
    used.
    """
    truth = bool(self.value)
    return BoolNode(self.pos, value=truth, constant_result=truth)
def contains_surrogates(self):
    """Report what _string_contains_surrogates() says about this literal's
    value."""
    literal = self.value
    return _string_contains_surrogates(literal)
def generate_evaluation_code(self, code):
    """Set ``self.result_code`` to the constant that holds this literal.

    A node typed as a Python object asks ``code`` for a Python string
    constant; any other type asks for a Py_UNICODE pointer constant.
    """
    if not self.type.is_pyobject:
        self.result_code = code.get_pyunicode_ptr_const(self.value)
        return
    self.result_code = code.get_py_string_const(self.value)
def calculate_result_code(self):
    """Return the node's ``result_code`` attribute unchanged."""
    stored_code = self.result_code
    return stored_code
...
...
@@ -2633,6 +2646,9 @@ class IndexNode(ExprNode):
if
base_type
.
is_string
:
# sliced C strings must coerce to Python
return
bytes_type
elif
base_type
.
is_pyunicode_ptr
:
# sliced Py_UNICODE* strings must coerce to Python
return
unicode_type
elif
base_type
in
(
unicode_type
,
bytes_type
,
str_type
,
list_type
,
tuple_type
):
# slicing these returns the same type
return
base_type
...
...
@@ -3446,6 +3462,8 @@ class SliceIndexNode(ExprNode):
base_type
=
self
.
base
.
infer_type
(
env
)
if
base_type
.
is_string
or
base_type
.
is_cpp_class
:
return
bytes_type
elif
base_type
.
is_pyunicode_ptr
:
return
unicode_type
elif
base_type
in
(
bytes_type
,
str_type
,
unicode_type
,
list_type
,
tuple_type
):
return
base_type
...
...
@@ -3510,6 +3528,8 @@ class SliceIndexNode(ExprNode):
base_type
=
self
.
base
.
type
if
base_type
.
is_string
or
base_type
.
is_cpp_string
:
self
.
type
=
default_str_type
(
env
)
elif
base_type
.
is_pyunicode_ptr
:
self
.
type
=
unicode_type
elif
base_type
.
is_ptr
:
self
.
type
=
base_type
elif
base_type
.
is_array
:
...
...
@@ -3578,6 +3598,27 @@ class SliceIndexNode(ExprNode):
stop_code
,
start_code
,
code
.
error_goto_if_null
(
result
,
self
.
pos
)))
elif
self
.
base
.
type
.
is_pyunicode_ptr
:
base_result
=
self
.
base
.
result
()
if
self
.
base
.
type
!=
PyrexTypes
.
c_py_unicode_ptr_type
:
base_result
=
'((const Py_UNICODE*)%s)'
%
base_result
if
self
.
stop
is
None
:
code
.
putln
(
"%s = __Pyx_PyUnicode_FromUnicode(%s + %s); %s"
%
(
result
,
base_result
,
start_code
,
code
.
error_goto_if_null
(
result
,
self
.
pos
)))
else
:
code
.
putln
(
"%s = __Pyx_PyUnicode_FromUnicodeAndLength(%s + %s, %s - %s); %s"
%
(
result
,
base_result
,
start_code
,
stop_code
,
start_code
,
code
.
error_goto_if_null
(
result
,
self
.
pos
)))
elif
self
.
base
.
type
is
unicode_type
:
code
.
globalstate
.
use_utility_code
(
UtilityCode
.
load_cached
(
"PyUnicode_Substring"
,
"StringTools.c"
))
...
...
@@ -4903,11 +4944,11 @@ class AttributeNode(ExprNode):
self
.
is_py_attr
=
0
self
.
member
=
self
.
attribute
if
obj_type
is
None
:
if
self
.
obj
.
type
.
is_string
:
if
self
.
obj
.
type
.
is_string
or
self
.
obj
.
type
.
is_pyunicode_ptr
:
self
.
obj
=
self
.
obj
.
coerce_to_pyobject
(
env
)
obj_type
=
self
.
obj
.
type
else
:
if
obj_type
.
is_string
:
if
obj_type
.
is_string
or
obj_type
.
is_pyunicode_ptr
:
obj_type
=
py_object_type
if
obj_type
.
is_ptr
or
obj_type
.
is_array
:
obj_type
=
obj_type
.
base_type
...
...
@@ -8334,8 +8375,12 @@ class BinopNode(ExprNode):
if
self
.
is_py_operation_types
(
type1
,
type2
):
if
type2
.
is_string
:
type2
=
Builtin
.
bytes_type
elif
type2
.
is_pyunicode_ptr
:
type2
=
Builtin
.
unicode_type
if
type1
.
is_string
:
type1
=
Builtin
.
bytes_type
elif
type1
.
is_pyunicode_ptr
:
type1
=
Builtin
.
unicode_type
elif
self
.
operator
==
'%'
\
and
type1
in
(
Builtin
.
str_type
,
Builtin
.
unicode_type
):
# note that b'%s' % b'abc' doesn't work in Py3
...
...
@@ -8584,7 +8629,7 @@ class AddNode(NumBinopNode):
# '+' operator.
def
is_py_operation_types
(
self
,
type1
,
type2
):
if
type1
.
is_string
and
type2
.
is_string
:
if
type1
.
is_string
and
type2
.
is_string
or
type1
.
is_pyunicode_ptr
and
type2
.
is_pyunicode_ptr
:
return
1
else
:
return
NumBinopNode
.
is_py_operation_types
(
self
,
type1
,
type2
)
...
...
@@ -9947,7 +9992,7 @@ class CoerceToPyTypeNode(CoercionNode):
# be specific about some known types
if
arg
.
type
.
is_string
or
arg
.
type
.
is_cpp_string
:
self
.
type
=
default_str_type
(
env
)
elif
arg
.
type
.
is_unicode_char
:
elif
arg
.
type
.
is_
pyunicode_ptr
or
arg
.
type
.
is_
unicode_char
:
self
.
type
=
unicode_type
elif
arg
.
type
.
is_complex
:
self
.
type
=
Builtin
.
complex_type
...
...
@@ -10062,13 +10107,13 @@ class CoerceFromPyTypeNode(CoercionNode):
if
not
result_type
.
create_from_py_utility_code
(
env
):
error
(
arg
.
pos
,
"Cannot convert Python object to '%s'"
%
result_type
)
if
self
.
type
.
is_string
:
if
self
.
type
.
is_string
or
self
.
type
.
is_pyunicode_ptr
:
if
self
.
arg
.
is_ephemeral
():
error
(
arg
.
pos
,
"Obtaining
char* from temporary Python value"
)
"Obtaining
'%s' from temporary Python value"
%
result_type
)
elif
self
.
arg
.
is_name
and
self
.
arg
.
entry
and
self
.
arg
.
entry
.
is_pyglobal
:
warning
(
arg
.
pos
,
"Obtaining
char* from externally modifiable global Python value"
,
"Obtaining
'%s' from externally modifiable global Python value"
%
result_type
,
level
=
1
)
def
analyse_types
(
self
,
env
):
...
...
Cython/Compiler/Optimize.py
View file @
d61f929f
...
...
@@ -1977,6 +1977,11 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform):
PyrexTypes
.
CFuncTypeArg
(
"bytes"
,
PyrexTypes
.
c_char_ptr_type
,
None
)
])
Pyx_Py_UNICODE_strlen_func_type
=
PyrexTypes
.
CFuncType
(
PyrexTypes
.
c_size_t_type
,
[
PyrexTypes
.
CFuncTypeArg
(
"unicode"
,
PyrexTypes
.
c_py_unicode_ptr_type
,
None
)
])
PyObject_Size_func_type
=
PyrexTypes
.
CFuncType
(
PyrexTypes
.
c_py_ssize_t_type
,
[
PyrexTypes
.
CFuncTypeArg
(
"obj"
,
PyrexTypes
.
py_object_type
,
None
)
...
...
@@ -1996,7 +2001,8 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform):
_ext_types_with_pysize
=
set
([
"cpython.array.array"
])
def
_handle_simple_function_len
(
self
,
node
,
pos_args
):
"""Replace len(char*) by the equivalent call to strlen() and
"""Replace len(char*) by the equivalent call to strlen(),
len(Py_UNICODE*) by the equivalent Py_UNICODE_strlen() and
len(known_builtin_type) by an equivalent C-API call.
"""
if
len
(
pos_args
)
!=
1
:
...
...
@@ -2011,6 +2017,11 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform):
args
=
[
arg
],
is_temp
=
node
.
is_temp
,
utility_code
=
UtilityCode
.
load_cached
(
"IncludeStringH"
,
"StringTools.c"
))
elif
arg
.
type
.
is_pyunicode_ptr
:
new_node
=
ExprNodes
.
PythonCapiCallNode
(
node
.
pos
,
"__Pyx_Py_UNICODE_strlen"
,
self
.
Pyx_Py_UNICODE_strlen_func_type
,
args
=
[
arg
],
is_temp
=
node
.
is_temp
)
elif
arg
.
type
.
is_pyobject
:
cfunc_name
=
self
.
_map_to_capi_len_function
(
arg
.
type
)
if
cfunc_name
is
None
:
...
...
Cython/Compiler/PyrexTypes.py
View file @
d61f929f
...
...
@@ -145,6 +145,7 @@ class PyrexType(BaseType):
# is_enum boolean Is a C enum type
# is_typedef boolean Is a typedef type
# is_string boolean Is a C char * type
# is_pyunicode_ptr boolean Is a C Py_UNICODE * type
# is_cpp_string boolean Is a C++ std::string type
# is_unicode_char boolean Is either Py_UCS4 or Py_UNICODE
# is_returncode boolean Is used only to signal exceptions
...
...
@@ -202,6 +203,7 @@ class PyrexType(BaseType):
is_enum
=
0
is_typedef
=
0
is_string
=
0
is_pyunicode_ptr
=
0
is_unicode_char
=
0
is_returncode
=
0
is_error
=
0
...
...
@@ -873,7 +875,7 @@ class PyObjectType(PyrexType):
def
assignable_from
(
self
,
src_type
):
# except for pointers, conversion will be attempted
return
not
src_type
.
is_ptr
or
src_type
.
is_string
return
not
src_type
.
is_ptr
or
src_type
.
is_string
or
src_type
.
is_pyunicode_ptr
def
declaration_code
(
self
,
entity_code
,
for_display
=
0
,
dll_linkage
=
None
,
pyrex
=
0
):
...
...
@@ -1163,7 +1165,7 @@ class CType(PyrexType):
def
error_condition
(
self
,
result_code
):
conds
=
[]
if
self
.
is_string
:
if
self
.
is_string
or
self
.
is_pyunicode_ptr
:
conds
.
append
(
"(!%s)"
%
result_code
)
elif
self
.
exception_value
is
not
None
:
conds
.
append
(
"(%s == (%s)%s)"
%
(
result_code
,
self
.
sign_and_name
(),
self
.
exception_value
))
...
...
@@ -2180,6 +2182,9 @@ class CPointerBaseType(CType):
if
base_type
.
same_as
(
char_type
):
self
.
is_string
=
1
break
else
:
if
base_type
.
same_as
(
c_py_unicode_type
):
self
.
is_pyunicode_ptr
=
1
if
self
.
is_string
and
not
base_type
.
is_error
:
if
base_type
.
signed
:
...
...
@@ -2191,10 +2196,17 @@ class CPointerBaseType(CType):
if
self
.
is_ptr
:
self
.
from_py_function
=
"__Pyx_PyObject_AsUString"
self
.
exception_value
=
"NULL"
elif
self
.
is_pyunicode_ptr
and
not
base_type
.
is_error
:
self
.
to_py_function
=
"__Pyx_PyUnicode_FromUnicode"
if
self
.
is_ptr
:
self
.
from_py_function
=
"__Pyx_PyUnicode_AsUnicode"
self
.
exception_value
=
"NULL"
def py_type_name(self):
    """Name the Python type that corresponds to this pointer type.

    Returns "bytes" when ``is_string`` is set, "unicode" when
    ``is_pyunicode_ptr`` is set, and otherwise defers to the parent
    class implementation.
    """
    if self.is_string:
        return "bytes"
    if self.is_pyunicode_ptr:
        return "unicode"
    return super(CPointerBaseType, self).py_type_name()
...
...
Cython/Compiler/StringEncoding.py
View file @
d61f929f
...
...
@@ -4,6 +4,7 @@
import
re
import
sys
import
array
if
sys
.
version_info
[
0
]
>=
3
:
_unicode
,
_str
,
_bytes
=
str
,
str
,
bytes
...
...
@@ -262,3 +263,22 @@ def split_string_literal(s, limit=2000):
chunks
.
append
(
s
[
start
:
end
])
start
=
end
return
'""'
.
join
(
chunks
)
def encode_pyunicode_string(s):
    """Create Py_UNICODE[] representation of a given unicode string.

    Returns a ``(utf16_array, utf32_array)`` pair of strings.  Each is a
    comma-separated list of decimal code units ending in a 0 terminator,
    ready to be pasted into a C array initialiser.  ``utf16_array`` is the
    empty string when no code point of ``s`` exceeds 65535, i.e. when the
    narrow and wide representations are identical.
    """
    # 'unicode' only exists on Python 2; referencing it unconditionally
    # raises NameError on Python 3, so fall back to 'str' there.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    utf32_array = array.array('i', s.encode('UTF-32'))
    assert utf32_array.itemsize == 4
    utf32_array.pop(0)      # Remove BOM
    utf32_array.append(0)   # Add NULL terminator

    for c in utf32_array:
        if c > 65535:
            # At least one code point needs a surrogate pair in UTF-16,
            # so the narrow representation differs from the wide one.
            utf16_array = array.array('H', s.encode('UTF-16'))
            utf16_array.pop(0)      # Remove BOM
            utf16_array.append(0)   # Add NULL terminator
            break
    else:
        utf16_array = []

    return (",".join(map(text_type, utf16_array)),
            ",".join(map(text_type, utf32_array)))
Cython/Utility/TypeConversion.c
View file @
d61f929f
...
...
@@ -24,6 +24,21 @@ static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(char*);
#define __Pyx_PyStr_FromUString(s) __Pyx_PyStr_FromString((char*)s)
#define __Pyx_PyUnicode_FromUString(s) __Pyx_PyUnicode_FromString((char*)s)
#if PY_MAJOR_VERSION < 3
/* Return the number of Py_UNICODE units in u before the terminating 0. */
static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u)
{
    const Py_UNICODE *cursor = u;
    /* Advance until the cursor rests on the zero terminator. */
    while (*cursor != 0) {
        ++cursor;
    }
    return cursor - u;
}
#else
#define __Pyx_Py_UNICODE_strlen Py_UNICODE_strlen
#endif
#define __Pyx_PyUnicode_FromUnicode(u) PyUnicode_FromUnicode(u, __Pyx_Py_UNICODE_strlen(u))
#define __Pyx_PyUnicode_FromUnicodeAndLength PyUnicode_FromUnicode
#define __Pyx_PyUnicode_AsUnicode PyUnicode_AsUnicode
#define __Pyx_Owned_Py_None(b) (Py_INCREF(Py_None), Py_None)
#define __Pyx_PyBool_FromLong(b) ((b) ? (Py_INCREF(Py_True), Py_True) : (Py_INCREF(Py_False), Py_False))
static
CYTHON_INLINE
int
__Pyx_PyObject_IsTrue
(
PyObject
*
);
...
...
docs/src/tutorial/strings.rst
View file @
d61f929f
...
...
@@ -546,3 +546,56 @@ code will run in plain C code, (actually using a switch statement)::
Combined with the looping optimisation above, this can result in very
efficient character switching code, e.g. in unicode parsers.
Windows and wide character APIs
-------------------------------
Windows system APIs natively support Unicode in the form of
zero-terminated UTF-16 encoded :c:type:`wchar_t*` strings, so-called
"wide strings".
By default, Windows builds of CPython define :c:type:`Py_UNICODE` as
a synonym for :c:type:`wchar_t`. This makes the internal ``unicode``
representation compatible with UTF-16 and allows for efficient zero-copy
conversions. This also means that Windows builds are always
`Narrow Unicode builds`_ with all the caveats.
To aid interoperation with Windows APIs, Cython 0.19 supports wide
strings (in the form of :c:type:`Py_UNICODE*`) and implicitly converts
them to and from ``unicode`` string objects. These conversions behave the
same way as they do for :c:type:`char*` and ``bytes`` as described in
`Passing byte strings`_.
In addition to automatic conversion, unicode literals that appear
in a C context become C-level wide string literals, and the :py:func:`len`
built-in function is specialized to compute the length of a zero-terminated
:c:type:`Py_UNICODE*` string or array.
Here is an example of how one would call a Unicode API on Windows::
cdef extern from "Windows.h":
ctypedef Py_UNICODE WCHAR
ctypedef const WCHAR* LPCWSTR
ctypedef void* HWND
int MessageBoxW(HWND hWnd, LPCWSTR lpText, LPCWSTR lpCaption, int uType)
title = u"Windows Interop Demo - Python %d.%d.%d" % sys.version_info[:3]
MessageBoxW(NULL, u"Hello Cython \u263a", title, 0)
.. Warning::
The use of :c:type:`Py_UNICODE*` strings outside of Windows is
strongly discouraged. :c:type:`Py_UNICODE` is inherently not
portable between different platforms and Python versions.
CPython 3.3 has moved to a flexible internal representation of
unicode strings (:pep:`393`), making all :c:type:`Py_UNICODE` related
APIs deprecated and inefficient.
One consequence of the CPython 3.3 changes is that :py:func:`len` of
``unicode`` strings is always measured in *code points* ("characters"),
while Windows APIs expect the number of UTF-16 *code units*
(where each surrogate is counted individually). To always get the number
of code units, call :c:func:`PyUnicode_GetSize` directly.
tests/errors/charptr_from_temp.pyx
View file @
d61f929f
# mode: error
# tag: werror, charptr, conversion, temp
# tag: werror, charptr, conversion, temp
, py_unicode_strings
cdef
bytes
c_s
=
b"abc"
s
=
b"abc"
...
...
@@ -18,7 +18,28 @@ cptr = s
# temp => error
cptr
=
s
+
b"cba"
cdef
unicode
c_u
=
u"abc"
u
=
u"abc"
cdef
Py_UNICODE
*
cuptr
# constant => ok
cuptr
=
u"xyz"
# global cdef variable => ok
cuptr
=
c_u
# pyglobal => warning
cuptr
=
u
# temp => error
cuptr
=
u
+
u"cba"
_ERRORS
=
"""
16:8: Obtaining char* from externally modifiable global Python value
19:9: Obtaining char* from temporary Python value
16:8: Obtaining 'char *' from externally modifiable global Python value
19:9: Obtaining 'char *' from temporary Python value
34:9: Obtaining 'Py_UNICODE *' from externally modifiable global Python value
37:10: Obtaining 'Py_UNICODE *' from temporary Python value
"""
tests/errors/e_strcoerce.pyx
View file @
d61f929f
...
...
@@ -15,5 +15,5 @@ _ERRORS = """
4:14: Only single-character string literals can be coerced into ints.
5:14: Only single-character string literals can be coerced into ints.
8:15: Only single-character string literals can be coerced into ints.
11:14: Unicode literals do not support coercion to C types other than Py_UNICODE
or Py_UCS4
.
11:14: Unicode literals do not support coercion to C types other than Py_UNICODE
/Py_UCS4 (for characters) or Py_UNICODE* (for strings)
.
"""
tests/errors/string_assignments.pyx
View file @
d61f929f
# mode: error
# coding: ASCII
# tag: py_unicode_strings
# ok:
cdef
char
*
c1
=
"abc"
cdef
str
s1
=
"abc"
cdef
unicode
u1
=
u"abc"
cdef
Py_UNICODE
*
cu1
=
u1
cdef
bytes
b1
=
b"abc"
cdef
char
*
c2
=
b"abc"
...
...
@@ -21,12 +23,18 @@ o4 = c1
o5
=
b1
o6
=
s1
o7
=
u1
o8
=
cu1
# errors:
cdef
char
*
c_f1
=
u"abc"
cdef
char
*
c_f2
=
u1
cdef
char
*
c_f3
=
s1
cdef
Py_UNICODE
*
cu_f1
=
c1
cdef
Py_UNICODE
*
cu_f2
=
b1
cdef
Py_UNICODE
*
cu_f3
=
s1
cdef
Py_UNICODE
*
cu_f4
=
b"abc"
cdef
bytes
b_f1
=
u"abc"
cdef
bytes
b_f2
=
u1
cdef
bytes
b_f3
=
s1
...
...
@@ -56,31 +64,36 @@ print <unicode>c1
print
<
unicode
>
c1
[
1
:
2
]
_ERRORS
=
u"""
26:20: Unicode literals do not support coercion to C types other than Py_UNICODE or Py_UCS4.
27:22: Unicode objects do not support coercion to C types.
28:22: 'str' objects do not support coercion to C types (use 'bytes'?).
30:20: Cannot convert Unicode string to 'bytes' implicitly, encoding required.
31:22: Cannot convert Unicode string to 'bytes' implicitly, encoding required.
32:22: Cannot convert 'str' to 'bytes' implicitly. This is not portable.
34:17: Cannot convert 'bytes' object to str implicitly. This is not portable to Py3.
35:19: Cannot convert 'bytes' object to str implicitly. This is not portable to Py3.
36:17: Cannot convert Unicode string to 'str' implicitly. This is not portable and requires explicit encoding.
37:19: Cannot convert Unicode string to 'str' implicitly. This is not portable and requires explicit encoding.
39:20: str objects do not support coercion to unicode, use a unicode string literal instead (u'')
40:22: str objects do not support coercion to unicode, use a unicode string literal instead (u'')
41:20: Cannot convert 'bytes' object to unicode implicitly, decoding required
42:22: Cannot convert 'bytes' object to unicode implicitly, decoding required
43:22: Cannot convert 'char*' to unicode implicitly, decoding required
45:19: Cannot assign type 'str object' to 'tuple object'
46:18: Cannot assign type 'unicode object' to 'tuple object'
47:18: Cannot assign type 'bytes object' to 'tuple object'
53:13: default encoding required for conversion from 'char *' to 'str object'
54:13: default encoding required for conversion from 'char *' to 'str object'
55:17: Cannot convert 'char*' to unicode implicitly, decoding required
56:17: default encoding required for conversion from 'char *' to 'unicode object'
29:20: Unicode literals do not support coercion to C types other than Py_UNICODE/Py_UCS4 (for characters) or Py_UNICODE* (for strings).
30:22: Unicode objects only support coercion to Py_UNICODE*.
31:22: 'str' objects do not support coercion to C types (use 'bytes'?).
33:27: Cannot assign type 'char *' to 'Py_UNICODE *'
34:27: Cannot convert 'bytes' object to Py_UNICODE*, use 'unicode'.
35:27: 'str' objects do not support coercion to C types (use 'unicode'?).
36:25: Cannot convert 'bytes' object to Py_UNICODE*, use 'unicode'.
38:20: Cannot convert Unicode string to 'bytes' implicitly, encoding required.
39:22: Cannot convert Unicode string to 'bytes' implicitly, encoding required.
40:22: Cannot convert 'str' to 'bytes' implicitly. This is not portable.
42:17: Cannot convert 'bytes' object to str implicitly. This is not portable to Py3.
43:19: Cannot convert 'bytes' object to str implicitly. This is not portable to Py3.
44:17: Cannot convert Unicode string to 'str' implicitly. This is not portable and requires explicit encoding.
45:19: Cannot convert Unicode string to 'str' implicitly. This is not portable and requires explicit encoding.
47:20: str objects do not support coercion to unicode, use a unicode string literal instead (u'')
48:22: str objects do not support coercion to unicode, use a unicode string literal instead (u'')
49:20: Cannot convert 'bytes' object to unicode implicitly, decoding required
50:22: Cannot convert 'bytes' object to unicode implicitly, decoding required
51:22: Cannot convert 'char*' to unicode implicitly, decoding required
53:19: Cannot assign type 'str object' to 'tuple object'
54:18: Cannot assign type 'unicode object' to 'tuple object'
55:18: Cannot assign type 'bytes object' to 'tuple object'
61:13: default encoding required for conversion from 'char *' to 'str object'
62:13: default encoding required for conversion from 'char *' to 'str object'
63:17: Cannot convert 'char*' to unicode implicitly, decoding required
64:17: default encoding required for conversion from 'char *' to 'unicode object'
"""
tests/run/py_unicode_strings.pyx
0 → 100644
View file @
d61f929f
# tag: py_unicode_strings
import
sys
cimport
cython
from
libc.string
cimport
memcpy
,
strcpy
cdef bint Py_UNICODE_equal(const Py_UNICODE* u1, const Py_UNICODE* u2):
    # Walk both zero-terminated buffers in lockstep and stop at the first
    # terminator or the first differing unit; the strings are equal exactly
    # when the units at the stopping position are equal.
    while True:
        if u1[0] == 0 or u2[0] == 0 or u1[0] != u2[0]:
            break
        u1 += 1
        u2 += 1
    return u1[0] == u2[0]
ctypedef
Py_UNICODE
*
LPWSTR
cdef
unicode
uobj
=
u'unicode
\
u1234
'
cdef
unicode
uobj1
=
u'u'
cdef
Py_UNICODE
*
c_pu_str
=
u"unicode
\
u1234
"
cdef
Py_UNICODE
c_pu_arr
[
42
]
cdef
LPWSTR
c_wstr
=
u"unicode
\
u1234
"
cdef
Py_UNICODE
*
c_pu_empty
=
u""
cdef
char
*
c_empty
=
""
cdef
unicode
uwide_literal
=
u'
\
U00020000
\
U00020001
'
cdef
Py_UNICODE
*
c_pu_wide_literal
=
u'
\
U00020000
\
U00020001
'
memcpy
(
c_pu_arr
,
c_pu_str
,
sizeof
(
Py_UNICODE
)
*
(
len
(
uobj
)
+
1
))
def test_c_to_python():
    """
    >>> test_c_to_python()
    """
    # Comparing a Py_UNICODE* / Py_UNICODE[] / LPWSTR value with a unicode
    # object exercises the implicit C -> Python conversion.
    assert c_pu_arr == uobj
    assert c_pu_str == uobj
    assert c_wstr == uobj

    # Open-ended slices of the C strings.
    assert c_pu_arr[1:] == uobj[1:]
    assert c_pu_str[1:] == uobj[1:]
    assert c_wstr[1:] == uobj[1:]

    # Slices with only a stop index.
    # NOTE(review): the c_pu_arr[:1] assertion appears twice - confirm
    # whether the second copy was meant to cover a different variable.
    assert c_pu_arr[:1] == uobj[:1]
    assert c_pu_arr[:1] == uobj[:1]
    assert c_pu_str[:1] == uobj[:1]
    assert c_wstr[:1] == uobj[:1]

    # Slices with both bounds.
    assert c_pu_arr[1:7] == uobj[1:7]
    assert c_pu_str[1:7] == uobj[1:7]
    assert c_wstr[1:7] == uobj[1:7]

    # Single-item indexing.
    assert c_pu_arr[1] == uobj[1]
    assert c_pu_str[1] == uobj[1]
    assert c_wstr[1] == uobj[1]

    # len() of the C strings: u'unicode\u1234' has 8 characters.
    assert len(c_pu_str) == 8
    assert len(c_pu_arr) == 8
    assert len(c_wstr) == 8

    # sizeof() must still see the declared 42-element array and the raw
    # pointer, not a converted Python object.
    assert sizeof(c_pu_arr) == sizeof(Py_UNICODE) * 42
    assert sizeof(c_pu_str) == sizeof(void*)

    # Literal with two characters outside the BMP: 2 units when Py_UNICODE
    # is at least 4 bytes wide, otherwise 4 (one surrogate pair each).
    assert c_pu_wide_literal == uwide_literal
    if sizeof(Py_UNICODE) >= 4:
        assert len(c_pu_wide_literal) == 2
    else:
        assert len(c_pu_wide_literal) == 4

    if sys.version_info >= (3, 3):
        # Make sure len(unicode) is not reverted to pre-3.3 behavior
        assert len(uwide_literal) == 2

    # Truth testing: unicode literals follow emptiness, while both C
    # pointers are expected to be true, including c_pu_empty (u"").
    assert u'unicode'
    assert not u''
    assert c_pu_str
    assert c_pu_empty
def test_python_to_c():
    """
    >>> test_python_to_c()
    """
    cdef unicode u

    # Passing a unicode object where Py_UNICODE_equal() takes a
    # const Py_UNICODE* exercises the Python -> C conversion, both
    # implicitly and through an explicit <LPWSTR> cast.
    assert Py_UNICODE_equal(c_pu_arr, uobj)
    assert Py_UNICODE_equal(c_pu_str, uobj)
    assert Py_UNICODE_equal(c_pu_str, <LPWSTR>uobj)
    # Each slice is first stored in the typed local 'u' and then passed on.
    # NOTE(review): presumably this avoids taking a pointer from a
    # temporary Python value - confirm.
    u = uobj[1:]
    assert Py_UNICODE_equal(c_pu_str + 1, u)
    assert Py_UNICODE_equal(c_wstr + 1, u)
    u = uobj[:1]
    assert Py_UNICODE_equal(<Py_UNICODE*>u"u", u)
    u = uobj[1:7]
    assert Py_UNICODE_equal(<Py_UNICODE*>u"nicode", u)
    u = uobj[1]
    assert Py_UNICODE_equal(<Py_UNICODE*>u"n", u)

    # Literal containing characters outside the BMP.
    assert Py_UNICODE_equal(uwide_literal, <Py_UNICODE*>c_pu_wide_literal)

    # The same literal counts 4 characters as a unicode object, but 3 once
    # cast to Py_UNICODE*, where len() stops at the embedded zero.
    assert len(u"abc\0") == 4
    assert len(<Py_UNICODE*>u"abc\0") == 3
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment