Commit dc320e05 authored by Stefan Behnel

native support for Py_UNICODE, coercion between 1-character unicode literals...

native support for Py_UNICODE, coercion between 1-character unicode literals and Py_UNICODE, fix C iteration over unicode strings by using Py_UNICODE*
parent 025153fb
...@@ -860,7 +860,10 @@ class BytesNode(ConstNode): ...@@ -860,7 +860,10 @@ class BytesNode(ConstNode):
def coerce_to(self, dst_type, env): def coerce_to(self, dst_type, env):
if dst_type.is_int: if dst_type.is_int:
if not self.can_coerce_to_char_literal(): if not self.can_coerce_to_char_literal():
error(self.pos, "Only single-character strings can be coerced into ints.") error(self.pos, "Only single-character string literals can be coerced into ints.")
return self
if dst_type is PyrexTypes.c_py_unicode_type:
error(self.pos, "Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead.")
return self return self
return CharNode(self.pos, value=self.value) return CharNode(self.pos, value=self.value)
...@@ -915,13 +918,22 @@ class UnicodeNode(PyConstNode): ...@@ -915,13 +918,22 @@ class UnicodeNode(PyConstNode):
def coerce_to(self, dst_type, env): def coerce_to(self, dst_type, env):
if dst_type is self.type: if dst_type is self.type:
pass pass
elif dst_type is PyrexTypes.c_py_unicode_type:
if not self.can_coerce_to_char_literal():
error(self.pos, "Only single-character Unicode string literals can be coerced into Py_UNICODE.")
return self
int_value = ord(self.value)
return IntNode(self.pos, value=int_value, constant_result=int_value)
elif not dst_type.is_pyobject: elif not dst_type.is_pyobject:
error(self.pos, "Unicode objects do not support coercion to C types.") error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE.")
elif dst_type is not py_object_type: elif dst_type is not py_object_type:
if not self.check_for_coercion_error(dst_type): if not self.check_for_coercion_error(dst_type):
self.fail_assignment(dst_type) self.fail_assignment(dst_type)
return self return self
def can_coerce_to_char_literal(self):
return len(self.value) == 1
def generate_evaluation_code(self, code): def generate_evaluation_code(self, code):
self.result_code = code.get_py_string_const(self.value) self.result_code = code.get_py_string_const(self.value)
...@@ -5426,10 +5438,10 @@ class CmpNode(object): ...@@ -5426,10 +5438,10 @@ class CmpNode(object):
type1_can_be_int = False type1_can_be_int = False
type2_can_be_int = False type2_can_be_int = False
if isinstance(operand1, (StringNode, BytesNode)) \ if isinstance(operand1, (StringNode, BytesNode, UnicodeNode)) \
and operand1.can_coerce_to_char_literal(): and operand1.can_coerce_to_char_literal():
type1_can_be_int = True type1_can_be_int = True
if isinstance(operand2, (StringNode, BytesNode)) \ if isinstance(operand2, (StringNode, BytesNode, UnicodeNode)) \
and operand2.can_coerce_to_char_literal(): and operand2.can_coerce_to_char_literal():
type2_can_be_int = True type2_can_be_int = True
......
...@@ -137,7 +137,7 @@ class IterationTransform(Visitor.VisitorTransform): ...@@ -137,7 +137,7 @@ class IterationTransform(Visitor.VisitorTransform):
return node return node
PyUnicode_AS_UNICODE_func_type = PyrexTypes.CFuncType( PyUnicode_AS_UNICODE_func_type = PyrexTypes.CFuncType(
PyrexTypes.CPtrType(PyrexTypes.c_uint_type), [ # FIXME: return type is actually Py_UNICODE* PyrexTypes.CPtrType(PyrexTypes.c_py_unicode_type), [
PyrexTypes.CFuncTypeArg("s", Builtin.unicode_type, None) PyrexTypes.CFuncTypeArg("s", Builtin.unicode_type, None)
]) ])
......
...@@ -1851,6 +1851,7 @@ basic_c_type_names = ("void", "char", "int", "float", "double", "bint") ...@@ -1851,6 +1851,7 @@ basic_c_type_names = ("void", "char", "int", "float", "double", "bint")
special_basic_c_types = { special_basic_c_types = {
# name : (signed, longness) # name : (signed, longness)
"Py_UNICODE" : (0, 0),
"Py_ssize_t" : (2, 0), "Py_ssize_t" : (2, 0),
"size_t" : (0, 0), "size_t" : (0, 0),
} }
......
...@@ -863,6 +863,20 @@ class CAnonEnumType(CIntType): ...@@ -863,6 +863,20 @@ class CAnonEnumType(CIntType):
return 'int' return 'int'
class CPyUnicodeIntType(CIntType):
# Py_UNICODE
# Conversion from a unicode string to Py_UNICODE at runtime is not
# currently supported and may never be - we only convert from and
# to integers here. The maximum value for a Py_UNICODE is
# 1114111, so PyInt_FromLong() will do just fine here.
to_py_function = "PyInt_FromLong"
def sign_and_name(self):
return "Py_UNICODE"
class CPySSizeTType(CIntType): class CPySSizeTType(CIntType):
to_py_function = "PyInt_FromSsize_t" to_py_function = "PyInt_FromSsize_t"
...@@ -2075,14 +2089,15 @@ class ErrorType(PyrexType): ...@@ -2075,14 +2089,15 @@ class ErrorType(PyrexType):
rank_to_type_name = ( rank_to_type_name = (
"char", # 0 "char", # 0
"short", # 1 "short", # 1
"int", # 2 "Py_UNICODE", # 2
"long", # 3 "int", # 3
"Py_ssize_t", # 4 "long", # 4
"size_t", # 5 "Py_ssize_t", # 5
"PY_LONG_LONG", # 6 "size_t", # 6
"float", # 7 "PY_LONG_LONG", # 7
"double", # 8 "float", # 8
"long double", # 9 "double", # 9
"long double", # 10
) )
py_object_type = PyObjectType() py_object_type = PyObjectType()
...@@ -2093,29 +2108,30 @@ c_void_ptr_ptr_type = CPtrType(c_void_ptr_type) ...@@ -2093,29 +2108,30 @@ c_void_ptr_ptr_type = CPtrType(c_void_ptr_type)
c_uchar_type = CIntType(0, 0) c_uchar_type = CIntType(0, 0)
c_ushort_type = CIntType(1, 0) c_ushort_type = CIntType(1, 0)
c_uint_type = CIntType(2, 0) c_py_unicode_type = CPyUnicodeIntType(2, 0)
c_ulong_type = CIntType(3, 0) c_uint_type = CIntType(3, 0)
c_ulonglong_type = CIntType(6, 0) c_ulong_type = CIntType(4, 0)
c_ulonglong_type = CIntType(7, 0)
c_char_type = CIntType(0, 1) c_char_type = CIntType(0, 1)
c_short_type = CIntType(1, 1) c_short_type = CIntType(1, 1)
c_int_type = CIntType(2, 1) c_int_type = CIntType(3, 1)
c_long_type = CIntType(3, 1) c_long_type = CIntType(4, 1)
c_longlong_type = CIntType(6, 1) c_longlong_type = CIntType(7, 1)
c_schar_type = CIntType(0, 2) c_schar_type = CIntType(0, 2)
c_sshort_type = CIntType(1, 2) c_sshort_type = CIntType(1, 2)
c_sint_type = CIntType(2, 2) c_sint_type = CIntType(3, 2)
c_slong_type = CIntType(3, 2) c_slong_type = CIntType(4, 2)
c_slonglong_type = CIntType(6, 2) c_slonglong_type = CIntType(7, 2)
c_bint_type = CBIntType(2, 1) c_bint_type = CBIntType(3, 1)
c_py_ssize_t_type = CPySSizeTType(4, 2) c_py_ssize_t_type = CPySSizeTType(5, 2)
c_size_t_type = CSizeTType(5, 0) c_size_t_type = CSizeTType(6, 0)
c_float_type = CFloatType(7, math_h_modifier='f') c_float_type = CFloatType(8, math_h_modifier='f')
c_double_type = CFloatType(8) c_double_type = CFloatType(9)
c_longdouble_type = CFloatType(9, math_h_modifier='l') c_longdouble_type = CFloatType(10, math_h_modifier='l')
c_float_complex_type = CComplexType(c_float_type) c_float_complex_type = CComplexType(c_float_type)
c_double_complex_type = CComplexType(c_double_type) c_double_complex_type = CComplexType(c_double_type)
...@@ -2131,7 +2147,7 @@ c_int_ptr_type = CPtrType(c_int_type) ...@@ -2131,7 +2147,7 @@ c_int_ptr_type = CPtrType(c_int_type)
c_py_ssize_t_ptr_type = CPtrType(c_py_ssize_t_type) c_py_ssize_t_ptr_type = CPtrType(c_py_ssize_t_type)
c_size_t_ptr_type = CPtrType(c_size_t_type) c_size_t_ptr_type = CPtrType(c_size_t_type)
c_returncode_type = CIntType(2, 1, is_returncode = 1) c_returncode_type = CIntType(3, 1, is_returncode = 1)
c_anon_enum_type = CAnonEnumType(-1, 1) c_anon_enum_type = CAnonEnumType(-1, 1)
# the Py_buffer type is defined in Builtin.py # the Py_buffer type is defined in Builtin.py
...@@ -2165,6 +2181,7 @@ modifiers_and_name_to_type = { ...@@ -2165,6 +2181,7 @@ modifiers_and_name_to_type = {
(1, 0, "bint"): c_bint_type, (1, 0, "bint"): c_bint_type,
(0, 0, "size_t") : c_size_t_type, (0, 0, "size_t") : c_size_t_type,
(2, 0, "Py_ssize_t"): c_py_ssize_t_type, (2, 0, "Py_ssize_t"): c_py_ssize_t_type,
(0, 0, "Py_UNICODE"): c_py_unicode_type,
(1, 0, "float"): c_float_type, (1, 0, "float"): c_float_type,
(1, 0, "double"): c_double_type, (1, 0, "double"): c_double_type,
...@@ -2383,6 +2400,8 @@ def parse_basic_type(name): ...@@ -2383,6 +2400,8 @@ def parse_basic_type(name):
signed = 2 signed = 2
elif name == 'size_t': elif name == 'size_t':
signed = 0 signed = 0
elif name == 'Py_UNICODE':
signed = 0
else: else:
if name.startswith('u'): if name.startswith('u'):
name = name[1:] name = name[1:]
......
...@@ -174,7 +174,7 @@ except ImportError: ...@@ -174,7 +174,7 @@ except ImportError:
# Predefined types # Predefined types
int_types = ['char', 'short', 'int', 'long', 'longlong', 'Py_ssize_t', 'size_t'] int_types = ['char', 'short', 'Py_UNICODE', 'int', 'long', 'longlong', 'Py_ssize_t', 'size_t']
float_types = ['longdouble', 'double', 'float'] float_types = ['longdouble', 'double', 'float']
complex_types = ['longdoublecomplex', 'doublecomplex', 'floatcomplex', 'complex'] complex_types = ['longdoublecomplex', 'doublecomplex', 'floatcomplex', 'complex']
other_types = ['bint', 'void'] other_types = ['bint', 'void']
...@@ -183,7 +183,7 @@ gs = globals() ...@@ -183,7 +183,7 @@ gs = globals()
for name in int_types: for name in int_types:
gs[name] = typedef(py_int) gs[name] = typedef(py_int)
if not name.endswith('size_t'): if name != 'Py_UNICODE' and not name.endswith('size_t'):
gs['u'+name] = typedef(py_int) gs['u'+name] = typedef(py_int)
gs['s'+name] = typedef(py_int) gs['s'+name] = typedef(py_int)
......
...@@ -4,12 +4,14 @@ cdef int cx = "test" # fails ...@@ -4,12 +4,14 @@ cdef int cx = "test" # fails
cdef int x1 = "\xFF" # works cdef int x1 = "\xFF" # works
cdef int x2 = "\u0FFF" # fails cdef int x2 = "\u0FFF" # fails
cdef int x3 = u"\xFF" # fails
cdef Py_UNICODE u1 = u"\xFF" # works
cdef int u3 = u"\xFF" # fails
_ERRORS = u"""
2:14: Only single-character strings can be coerced into ints. _ERRORS = """
3:14: Only single-character strings can be coerced into ints. 2:14: Only single-character string literals can be coerced into ints.
6:15: Only single-character strings can be coerced into ints. 3:14: Only single-character string literals can be coerced into ints.
7:14: Unicode objects do not support coercion to C types. 6:15: Only single-character string literals can be coerced into ints.
9:14: Unicode literals do not support coercion to C types other than Py_UNICODE.
""" """
# -*- coding: iso-8859-1 -*-
cdef Py_UNICODE char_ASCII = u'A'
cdef Py_UNICODE char_KLINGON = u'\uF8D2'
def char_too_long_ASCII():
cdef Py_UNICODE c = u'AB'
def char_too_long_Unicode():
cdef Py_UNICODE c = u'A\uF8D2'
def char_too_long_bytes():
cdef Py_UNICODE c = b'AB'
def char_too_long_latin1():
cdef Py_UNICODE char_bytes_latin1 = b''
_ERRORS = """
7:24: Only single-character Unicode string literals can be coerced into Py_UNICODE.
10:24: Only single-character Unicode string literals can be coerced into Py_UNICODE.
13:24: Only single-character string literals can be coerced into ints.
16:40: Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead.
"""
...@@ -50,7 +50,7 @@ cdef list l_f2 = b1 ...@@ -50,7 +50,7 @@ cdef list l_f2 = b1
cdef list l_f3 = u1 cdef list l_f3 = u1
_ERRORS = u""" _ERRORS = u"""
25:20: Unicode objects do not support coercion to C types. 25:20: Unicode literals do not support coercion to C types other than Py_UNICODE.
26:22: Unicode objects do not support coercion to C types. 26:22: Unicode objects do not support coercion to C types.
27:22: 'str' objects do not support coercion to C types (use 'bytes'?). 27:22: 'str' objects do not support coercion to C types (use 'bytes'?).
......
...@@ -14,7 +14,7 @@ def for_in_bytes(bytes s): ...@@ -14,7 +14,7 @@ def for_in_bytes(bytes s):
'C' 'C'
""" """
for c in s: for c in s:
if c == 'C': if c == b'C':
return 'C' return 'C'
else: else:
return 'X' return 'X'
...@@ -28,21 +28,21 @@ def for_char_in_bytes(bytes s): ...@@ -28,21 +28,21 @@ def for_char_in_bytes(bytes s):
""" """
cdef char c cdef char c
for c in s: for c in s:
if c == 'C': if c == b'C':
return 'C' return 'C'
else: else:
return 'X' return 'X'
def for_int_in_unicode(unicode s): def for_pyunicode_in_unicode(unicode s):
""" """
>>> for_int_in_unicode(unicode_abc) >>> for_pyunicode_in_unicode(unicode_abc)
'X' 'X'
>>> for_int_in_unicode(unicode_ABC) >>> for_pyunicode_in_unicode(unicode_ABC)
'C' 'C'
""" """
cdef int c cdef Py_UNICODE c
for c in s: for c in s:
if c == 'C': if c == u'C':
return 'C' return 'C'
else: else:
return 'X' return 'X'
# -*- coding: iso-8859-1 -*-
cdef Py_UNICODE char_ASCII = u'A'
cdef Py_UNICODE char_KLINGON = u'\uF8D2'
def compare_ASCII():
"""
>>> compare_ASCII()
True
False
False
"""
print(char_ASCII == u'A')
print(char_ASCII == u'B')
print(char_ASCII == u'\uF8D2')
def compare_KLINGON():
"""
>>> compare_ASCII()
True
False
False
"""
print(char_KLINGON == u'\uF8D2')
print(char_KLINGON == u'A')
print(char_KLINGON == u'B')
def index_literal(int i):
"""
>>> index_literal(0) == '1'
True
>>> index_literal(-5) == '1'
True
>>> index_literal(2) == '3'
True
>>> index_literal(4) == '5'
True
"""
# runtime casts are not currently supported
#return <Py_UNICODE>(u"12345"[i])
return u"12345"[i]
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment