Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Gwenaël Samain
cython
Commits
a664239a
Commit
a664239a
authored
Mar 03, 2013
by
Nikita Nemkin
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Basic support for Py_UNICODE* strings.
parent
0d651b18
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
150 additions
and
16 deletions
+150
-16
Cython/Compiler/Code.py
Cython/Compiler/Code.py
+28
-0
Cython/Compiler/ExprNodes.py
Cython/Compiler/ExprNodes.py
+60
-13
Cython/Compiler/Optimize.py
Cython/Compiler/Optimize.py
+13
-1
Cython/Compiler/PyrexTypes.py
Cython/Compiler/PyrexTypes.py
+14
-2
Cython/Compiler/StringEncoding.py
Cython/Compiler/StringEncoding.py
+12
-0
Cython/Utility/StringTools.c
Cython/Utility/StringTools.c
+14
-0
Cython/Utility/TypeConversion.c
Cython/Utility/TypeConversion.c
+9
-0
No files found.
Cython/Compiler/Code.py
View file @
a664239a
...
...
@@ -778,6 +778,16 @@ class StringConst(object):
self.py_strings[key] = py_string
return py_string
class UnicodeConst(object):
    """Record describing one Py_UNICODE[] constant tracked by GlobalState.

    Attributes, both stored unchanged by the constructor:
      cname   string                     C name used for the array
      text    EncodedString (unicode)    the text of the constant
    """

    def __init__(self, cname, text):
        # Plain value holder: no validation or conversion happens here.
        self.cname, self.text = cname, text
class PyStringConst(object):
"""Global info about a Python string constant held by GlobalState.
"""
...
...
@@ -873,6 +883,7 @@ class GlobalState(object):
self.const_cname_counter = 1
self.string_const_index = {}
self.unicode_const_index = {}
self.int_const_index = {}
self.py_constants = []
...
...
@@ -1016,6 +1027,16 @@ class GlobalState(object):
c
.
add_py_version
(
py_version
)
return
c
def get_unicode_const(self, text):
    # Return the Py_UNICODE[] constant registered for 'text'.  On a miss,
    # a new cname is allocated and a fresh UnicodeConst is registered
    # under 'text' before being returned.
    assert text.is_unicode
    index = self.unicode_const_index
    if text in index:
        return index[text]
    const = UnicodeConst(self.new_const_cname(), text)
    index[text] = const
    return const
def
get_py_string_const
(
self
,
text
,
identifier
=
None
,
is_str
=
False
,
unicode_value
=
None
):
# return a Python string constant, creating a new one if necessary
...
...
@@ -1141,6 +1162,10 @@ class GlobalState(object):
for
py_string
in
c
.
py_strings
.
values
():
py_strings
.
append
((
c
.
cname
,
len
(
py_string
.
cname
),
py_string
))
for
c
in
self
.
unicode_const_index
.
values
():
decls_writer
.
putln
(
'static Py_UNICODE %s[] = { %s };'
%
(
c
.
cname
,
StringEncoding
.
encode_py_unicode_string
(
c
.
text
)))
if
py_strings
:
self
.
use_utility_code
(
UtilityCode
.
load_cached
(
"InitStrings"
,
"StringTools.c"
))
py_strings
.
sort
()
...
...
@@ -1435,6 +1460,9 @@ class CCodeWriter(object):
def
get_string_const
(
self
,
text
):
return
self
.
globalstate
.
get_string_const
(
text
).
cname
def get_unicode_const(self, text):
    # Ask the shared global state for the Py_UNICODE[] constant of 'text'
    # and hand back only its C name.
    const = self.globalstate.get_unicode_const(text)
    return const.cname
def
get_py_string_const
(
self
,
text
,
identifier
=
None
,
is_str
=
False
,
unicode_value
=
None
):
return
self
.
globalstate
.
get_py_string_const
(
...
...
Cython/Compiler/ExprNodes.py
View file @
a664239a
...
...
@@ -63,14 +63,16 @@ coercion_error_dict = {
# string related errors
(
Builtin
.
unicode_type
,
Builtin
.
bytes_type
)
:
"Cannot convert Unicode string to 'bytes' implicitly, encoding required."
,
(
Builtin
.
unicode_type
,
Builtin
.
str_type
)
:
"Cannot convert Unicode string to 'str' implicitly. This is not portable and requires explicit encoding."
,
(
Builtin
.
unicode_type
,
PyrexTypes
.
c_char_ptr_type
)
:
"Unicode objects
do not support coercion to C types
."
,
(
Builtin
.
unicode_type
,
PyrexTypes
.
c_uchar_ptr_type
)
:
"Unicode objects
do not support coercion to C types
."
,
(
Builtin
.
unicode_type
,
PyrexTypes
.
c_char_ptr_type
)
:
"Unicode objects
only support coercion to Py_UNICODE*
."
,
(
Builtin
.
unicode_type
,
PyrexTypes
.
c_uchar_ptr_type
)
:
"Unicode objects
only support coercion to Py_UNICODE*
."
,
(
Builtin
.
bytes_type
,
Builtin
.
unicode_type
)
:
"Cannot convert 'bytes' object to unicode implicitly, decoding required"
,
(
Builtin
.
bytes_type
,
Builtin
.
str_type
)
:
"Cannot convert 'bytes' object to str implicitly. This is not portable to Py3."
,
(
Builtin
.
bytes_type
,
PyrexTypes
.
c_py_unicode_ptr_type
)
:
"Cannot convert 'bytes' object to Py_UNICODE*, use 'unicode'."
,
(
Builtin
.
str_type
,
Builtin
.
unicode_type
)
:
"str objects do not support coercion to unicode, use a unicode string literal instead (u'')"
,
(
Builtin
.
str_type
,
Builtin
.
bytes_type
)
:
"Cannot convert 'str' to 'bytes' implicitly. This is not portable."
,
(
Builtin
.
str_type
,
PyrexTypes
.
c_char_ptr_type
)
:
"'str' objects do not support coercion to C types (use 'bytes'?)."
,
(
Builtin
.
str_type
,
PyrexTypes
.
c_uchar_ptr_type
)
:
"'str' objects do not support coercion to C types (use 'bytes'?)."
,
(
Builtin
.
str_type
,
PyrexTypes
.
c_py_unicode_ptr_type
)
:
"'str' objects do not support coercion to C types (use 'unicode'?)."
,
(
PyrexTypes
.
c_char_ptr_type
,
Builtin
.
unicode_type
)
:
"Cannot convert 'char*' to unicode implicitly, decoding required"
,
(
PyrexTypes
.
c_uchar_ptr_type
,
Builtin
.
unicode_type
)
:
"Cannot convert 'char*' to unicode implicitly, decoding required"
,
}
...
...
@@ -1171,8 +1173,8 @@ class BytesNode(ConstNode):
return
self
.
result_code
class
UnicodeNode
(
Py
ConstNode
):
# A Py
thon unicode object
class
UnicodeNode
(
ConstNode
):
# A Py
_UNICODE* or unicode literal
#
# value EncodedString
# bytes_value BytesLiteral the literal parsed as bytes string ('-3' unicode literals only)
...
...
@@ -1213,7 +1215,11 @@ class UnicodeNode(PyConstNode):
if
dst_type
.
is_string
and
self
.
bytes_value
is
not
None
:
# special case: '-3' enforced unicode literal used in a C char* context
return
BytesNode
(
self
.
pos
,
value
=
self
.
bytes_value
).
coerce_to
(
dst_type
,
env
)
error
(
self
.
pos
,
"Unicode literals do not support coercion to C types other than Py_UNICODE or Py_UCS4."
)
if
dst_type
.
is_unicode
:
node
=
UnicodeNode
(
self
.
pos
,
value
=
self
.
value
)
node
.
type
=
dst_type
return
node
error
(
self
.
pos
,
"Unicode literals do not support coercion to C types other than Py_UNICODE/Py_UCS4 (for characters) or Py_UNICODE* (for strings)."
)
elif
dst_type
is
not
py_object_type
:
if
not
self
.
check_for_coercion_error
(
dst_type
,
env
):
self
.
fail_assignment
(
dst_type
)
...
...
@@ -1225,11 +1231,20 @@ class UnicodeNode(PyConstNode):
## and (0xD800 <= self.value[0] <= 0xDBFF)
## and (0xDC00 <= self.value[1] <= 0xDFFF))
def coerce_to_boolean(self, env):
    # The truth value of the literal is known at compile time, so the
    # result is a BoolNode carrying it both as its value and as its
    # constant result.  'env' is not consulted.
    truth = bool(self.value)
    return BoolNode(
        self.pos,
        value=truth,
        constant_result=truth)
def contains_surrogates(self):
    # Delegates to the module-level helper, passing this node's text.
    literal_text = self.value
    return _string_contains_surrogates(literal_text)
def generate_evaluation_code(self, code):
    # Python object result: refer to the Python string constant that
    # 'code' provides for this text.
    if self.type.is_pyobject:
        self.result_code = code.get_py_string_const(self.value)
        return

    # Any other result type: refer to a Py_UNICODE[] constant instead,
    # warning first if the text contains surrogates.
    if self.contains_surrogates():
        warning(
            self.pos,
            "Py_UNICODE* literals with characters outside BMP are not portable.",
            level=1)
    self.result_code = code.get_unicode_const(self.value)
def calculate_result_code(self):
    # The C expression is stored on the node by
    # generate_evaluation_code(); this simply reports it.
    stored = self.result_code
    return stored
...
...
@@ -2633,6 +2648,9 @@ class IndexNode(ExprNode):
if
base_type
.
is_string
:
# sliced C strings must coerce to Python
return
bytes_type
elif
base_type
.
is_unicode
:
# sliced Py_UNICODE* strings must coerce to Python
return
unicode_type
elif
base_type
in
(
unicode_type
,
bytes_type
,
str_type
,
list_type
,
tuple_type
):
# slicing these returns the same type
return
base_type
...
...
@@ -3446,6 +3464,8 @@ class SliceIndexNode(ExprNode):
base_type
=
self
.
base
.
infer_type
(
env
)
if
base_type
.
is_string
or
base_type
.
is_cpp_class
:
return
bytes_type
elif
base_type
.
is_unicode
:
return
unicode_type
elif
base_type
in
(
bytes_type
,
str_type
,
unicode_type
,
list_type
,
tuple_type
):
return
base_type
...
...
@@ -3510,6 +3530,8 @@ class SliceIndexNode(ExprNode):
base_type
=
self
.
base
.
type
if
base_type
.
is_string
or
base_type
.
is_cpp_string
:
self
.
type
=
default_str_type
(
env
)
elif
base_type
.
is_unicode
:
self
.
type
=
unicode_type
elif
base_type
.
is_ptr
:
self
.
type
=
base_type
elif
base_type
.
is_array
:
...
...
@@ -3578,6 +3600,27 @@ class SliceIndexNode(ExprNode):
stop_code
,
start_code
,
code
.
error_goto_if_null
(
result
,
self
.
pos
)))
elif
self
.
base
.
type
.
is_unicode
:
base_result
=
self
.
base
.
result
()
if
self
.
base
.
type
!=
PyrexTypes
.
c_py_unicode_ptr_type
:
base_result
=
'((const Py_UNICODE*)%s)'
%
base_result
if
self
.
stop
is
None
:
code
.
putln
(
"%s = __Pyx_PyUnicode_FromUnicode(%s + %s); %s"
%
(
result
,
base_result
,
start_code
,
code
.
error_goto_if_null
(
result
,
self
.
pos
)))
else
:
code
.
putln
(
"%s = __Pyx_PyUnicode_FromUnicodeAndLength(%s + %s, %s - %s); %s"
%
(
result
,
base_result
,
start_code
,
stop_code
,
start_code
,
code
.
error_goto_if_null
(
result
,
self
.
pos
)))
elif
self
.
base
.
type
is
unicode_type
:
code
.
globalstate
.
use_utility_code
(
UtilityCode
.
load_cached
(
"PyUnicode_Substring"
,
"StringTools.c"
))
...
...
@@ -4903,11 +4946,11 @@ class AttributeNode(ExprNode):
self
.
is_py_attr
=
0
self
.
member
=
self
.
attribute
if
obj_type
is
None
:
if
self
.
obj
.
type
.
is_string
:
if
self
.
obj
.
type
.
is_string
or
self
.
obj
.
type
.
is_unicode
:
self
.
obj
=
self
.
obj
.
coerce_to_pyobject
(
env
)
obj_type
=
self
.
obj
.
type
else
:
if
obj_type
.
is_string
:
if
obj_type
.
is_string
or
obj_type
.
is_unicode
:
obj_type
=
py_object_type
if
obj_type
.
is_ptr
or
obj_type
.
is_array
:
obj_type
=
obj_type
.
base_type
...
...
@@ -8337,8 +8380,12 @@ class BinopNode(ExprNode):
if
self
.
is_py_operation_types
(
type1
,
type2
):
if
type2
.
is_string
:
type2
=
Builtin
.
bytes_type
elif
type2
.
is_unicode
:
type2
=
Builtin
.
unicode_type
if
type1
.
is_string
:
type1
=
Builtin
.
bytes_type
elif
type1
.
is_unicode
:
type1
=
Builtin
.
unicode_type
elif
self
.
operator
==
'%'
\
and
type1
in
(
Builtin
.
str_type
,
Builtin
.
unicode_type
):
# note that b'%s' % b'abc' doesn't work in Py3
...
...
@@ -8587,7 +8634,7 @@ class AddNode(NumBinopNode):
# '+' operator.
def
is_py_operation_types
(
self
,
type1
,
type2
):
if
type1
.
is_string
and
type2
.
is_string
:
if
type1
.
is_string
and
type2
.
is_string
or
type1
.
is_unicode
and
type2
.
is_unicode
:
return
1
else
:
return
NumBinopNode
.
is_py_operation_types
(
self
,
type1
,
type2
)
...
...
@@ -9950,7 +9997,7 @@ class CoerceToPyTypeNode(CoercionNode):
# be specific about some known types
if
arg
.
type
.
is_string
or
arg
.
type
.
is_cpp_string
:
self
.
type
=
default_str_type
(
env
)
elif
arg
.
type
.
is_unicode_char
:
elif
arg
.
type
.
is_unicode
or
arg
.
type
.
is_unicode
_char
:
self
.
type
=
unicode_type
elif
arg
.
type
.
is_complex
:
self
.
type
=
Builtin
.
complex_type
...
...
@@ -10065,13 +10112,13 @@ class CoerceFromPyTypeNode(CoercionNode):
if
not
result_type
.
create_from_py_utility_code
(
env
):
error
(
arg
.
pos
,
"Cannot convert Python object to '%s'"
%
result_type
)
if
self
.
type
.
is_string
:
if
self
.
type
.
is_string
or
self
.
type
.
is_unicode
:
if
self
.
arg
.
is_ephemeral
():
error
(
arg
.
pos
,
"Obtaining
char* from temporary Python value"
)
"Obtaining
'%s' from temporary Python value"
%
result_type
)
elif
self
.
arg
.
is_name
and
self
.
arg
.
entry
and
self
.
arg
.
entry
.
is_pyglobal
:
warning
(
arg
.
pos
,
"Obtaining
char* from externally modifiable global Python value"
,
"Obtaining
'%s' from externally modifiable global Python value"
%
result_type
,
level
=
1
)
def
analyse_types
(
self
,
env
):
...
...
Cython/Compiler/Optimize.py
View file @
a664239a
...
...
@@ -1977,6 +1977,11 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform):
PyrexTypes
.
CFuncTypeArg
(
"bytes"
,
PyrexTypes
.
c_char_ptr_type
,
None
)
])
Pyx_Py_UNICODE_strlen_func_type
=
PyrexTypes
.
CFuncType
(
PyrexTypes
.
c_size_t_type
,
[
PyrexTypes
.
CFuncTypeArg
(
"unicode"
,
PyrexTypes
.
c_py_unicode_ptr_type
,
None
)
])
PyObject_Size_func_type
=
PyrexTypes
.
CFuncType
(
PyrexTypes
.
c_py_ssize_t_type
,
[
PyrexTypes
.
CFuncTypeArg
(
"obj"
,
PyrexTypes
.
py_object_type
,
None
)
...
...
@@ -1996,7 +2001,8 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform):
_ext_types_with_pysize
=
set
([
"cpython.array.array"
])
def
_handle_simple_function_len
(
self
,
node
,
pos_args
):
"""Replace len(char*) by the equivalent call to strlen() and
"""Replace len(char*) by the equivalent call to strlen(),
len(Py_UNICODE*) by the equivalent Py_UNICODE_strlen() and
len(known_builtin_type) by an equivalent C-API call.
"""
if
len
(
pos_args
)
!=
1
:
...
...
@@ -2011,6 +2017,12 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform):
args
=
[
arg
],
is_temp
=
node
.
is_temp
,
utility_code
=
UtilityCode
.
load_cached
(
"IncludeStringH"
,
"StringTools.c"
))
elif
arg
.
type
.
is_unicode
:
new_node
=
ExprNodes
.
PythonCapiCallNode
(
node
.
pos
,
"__Pyx_Py_UNICODE_strlen"
,
self
.
Pyx_Py_UNICODE_strlen_func_type
,
args
=
[
arg
],
is_temp
=
node
.
is_temp
,
utility_code
=
UtilityCode
.
load_cached
(
"py_unicode_strlen"
,
"StringTools.c"
))
elif
arg
.
type
.
is_pyobject
:
cfunc_name
=
self
.
_map_to_capi_len_function
(
arg
.
type
)
if
cfunc_name
is
None
:
...
...
Cython/Compiler/PyrexTypes.py
View file @
a664239a
...
...
@@ -145,6 +145,7 @@ class PyrexType(BaseType):
# is_enum boolean Is a C enum type
# is_typedef boolean Is a typedef type
# is_string boolean Is a C char * type
# is_unicode boolean Is a C Py_UNICODE * type
# is_cpp_string boolean Is a C++ std::string type
# is_unicode_char boolean Is either Py_UCS4 or Py_UNICODE
# is_returncode boolean Is used only to signal exceptions
...
...
@@ -202,6 +203,7 @@ class PyrexType(BaseType):
is_enum
=
0
is_typedef
=
0
is_string
=
0
is_unicode
=
0
is_unicode_char
=
0
is_returncode
=
0
is_error
=
0
...
...
@@ -871,7 +873,7 @@ class PyObjectType(PyrexType):
def
assignable_from
(
self
,
src_type
):
# except for pointers, conversion will be attempted
return
not
src_type
.
is_ptr
or
src_type
.
is_string
return
not
src_type
.
is_ptr
or
src_type
.
is_string
or
src_type
.
is_unicode
def
declaration_code
(
self
,
entity_code
,
for_display
=
0
,
dll_linkage
=
None
,
pyrex
=
0
):
...
...
@@ -1161,7 +1163,7 @@ class CType(PyrexType):
def
error_condition
(
self
,
result_code
):
conds
=
[]
if
self
.
is_string
:
if
self
.
is_string
or
self
.
is_unicode
:
conds
.
append
(
"(!%s)"
%
result_code
)
elif
self
.
exception_value
is
not
None
:
conds
.
append
(
"(%s == (%s)%s)"
%
(
result_code
,
self
.
sign_and_name
(),
self
.
exception_value
))
...
...
@@ -2178,6 +2180,9 @@ class CPointerBaseType(CType):
if
base_type
.
same_as
(
char_type
):
self
.
is_string
=
1
break
else
:
if
base_type
.
same_as
(
c_py_unicode_type
):
self
.
is_unicode
=
1
if
self
.
is_string
and
not
base_type
.
is_error
:
if
base_type
.
signed
:
...
...
@@ -2189,10 +2194,17 @@ class CPointerBaseType(CType):
if
self
.
is_ptr
:
self
.
from_py_function
=
"__Pyx_PyObject_AsUString"
self
.
exception_value
=
"NULL"
elif
self
.
is_unicode
and
not
base_type
.
is_error
:
self
.
to_py_function
=
"__Pyx_PyUnicode_FromUnicode"
if
self
.
is_ptr
:
self
.
from_py_function
=
"__Pyx_PyUnicode_AsUnicode"
self
.
exception_value
=
"NULL"
def py_type_name(self):
    # char-based string types are reported as "bytes" and Py_UNICODE-based
    # ones as "unicode"; anything else is left to the base class.
    if self.is_string:
        return "bytes"
    if self.is_unicode:
        return "unicode"
    return super(CPointerBaseType, self).py_type_name()
...
...
Cython/Compiler/StringEncoding.py
View file @
a664239a
...
...
@@ -4,6 +4,7 @@
import
re
import
sys
import
array
if
sys
.
version_info
[
0
]
>=
3
:
_unicode
,
_str
,
_bytes
=
str
,
str
,
bytes
...
...
@@ -262,3 +263,14 @@ def split_string_literal(s, limit=2000):
chunks
.
append
(
s
[
start
:
end
])
start
=
end
return
'""'
.
join
(
chunks
)
def encode_py_unicode_string(s):
    """Create Py_UNICODE[] representation of a given unicode string.

    Returns a text string of comma-separated decimal UTF-16 code units,
    ending with a 0 terminator, suitable for use as a C array
    initialiser.  An empty input yields just u"0".
    """
    # Non-BMP characters will appear as surrogates, which is not compatible with
    # wide (UTF-32) Python builds. UnicodeNode will warn the user about this.

    # Encoding with an explicit byte order means no BOM is emitted, so
    # nothing has to be stripped, and the result does not depend on the
    # platform's native byte order or on the size of a C 'unsigned short'.
    raw = array.array('B', s.encode('UTF-16LE'))
    units = [raw[i] | (raw[i + 1] << 8) for i in range(0, len(raw), 2)]
    units.append(0)  # Add NULL terminator
    # u"%d" formatting gives a text string on both Python 2 and Python 3;
    # the 'unicode' builtin used previously does not exist on Python 3.
    return u",".join([u"%d" % unit for unit in units])
Cython/Utility/StringTools.c
View file @
a664239a
...
...
@@ -604,3 +604,17 @@ static CYTHON_INLINE char __Pyx_PyBytes_GetItemInt(PyObject* bytes, Py_ssize_t i
index
+=
PyBytes_GET_SIZE
(
bytes
);
return
PyBytes_AS_STRING
(
bytes
)[
index
];
}
/////////////// py_unicode_strlen.proto ///////////////

#if PY_VERSION_HEX < 0x03000000
/* Below Python 3.0: count the Py_UNICODE units that precede the first
 * zero unit.  'u' must point to a zero-terminated buffer. */
static CYTHON_INLINE size_t __Pyx_Py_UNICODE_strlen(const Py_UNICODE *u)
{
    const Py_UNICODE *cursor = u;
    while (*cursor) {
        cursor++;
    }
    return cursor - u;
}
#else
/* Python 3.0 and later: the name maps directly onto Py_UNICODE_strlen. */
#define __Pyx_Py_UNICODE_strlen Py_UNICODE_strlen
#endif
Cython/Utility/TypeConversion.c
View file @
a664239a
/////////////// TypeConversions.proto ///////////////
// @requires: py_unicode_strlen
/* Type Conversion Predeclarations */
...
...
@@ -24,6 +25,14 @@ static CYTHON_INLINE PyObject* __Pyx_PyUnicode_FromString(char*);
#define __Pyx_PyStr_FromUString(s) __Pyx_PyStr_FromString((char*)s)
#define __Pyx_PyUnicode_FromUString(s) __Pyx_PyUnicode_FromString((char*)s)
#define __Pyx_PyUnicode_FromUnicode(u) PyUnicode_FromUnicode(u, __Pyx_Py_UNICODE_strlen(u))
#define __Pyx_PyUnicode_FromUnicodeAndLength PyUnicode_FromUnicode
#if CYTHON_PEP393_ENABLED
#define __Pyx_PyUnicode_AsUnicode PyUnicode_AsUnicode
#else
#define __Pyx_PyUnicode_AsUnicode PyUnicode_AS_UNICODE
#endif
#define __Pyx_Owned_Py_None(b) (Py_INCREF(Py_None), Py_None)
#define __Pyx_PyBool_FromLong(b) ((b) ? (Py_INCREF(Py_True), Py_True) : (Py_INCREF(Py_False), Py_False))
static
CYTHON_INLINE
int
__Pyx_PyObject_IsTrue
(
PyObject
*
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment