Commit dc320e05 authored by Stefan Behnel

native support for Py_UNICODE, coercion between 1-character unicode literals...

native support for Py_UNICODE, coercion between 1-character unicode literals and Py_UNICODE, fix C iteration over unicode strings by using Py_UNICODE*
parent 025153fb
......@@ -860,7 +860,10 @@ class BytesNode(ConstNode):
def coerce_to(self, dst_type, env):
if dst_type.is_int:
if not self.can_coerce_to_char_literal():
error(self.pos, "Only single-character strings can be coerced into ints.")
error(self.pos, "Only single-character string literals can be coerced into ints.")
return self
if dst_type is PyrexTypes.c_py_unicode_type:
error(self.pos, "Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead.")
return self
return CharNode(self.pos, value=self.value)
......@@ -915,13 +918,22 @@ class UnicodeNode(PyConstNode):
def coerce_to(self, dst_type, env):
if dst_type is self.type:
pass
elif dst_type is PyrexTypes.c_py_unicode_type:
if not self.can_coerce_to_char_literal():
error(self.pos, "Only single-character Unicode string literals can be coerced into Py_UNICODE.")
return self
int_value = ord(self.value)
return IntNode(self.pos, value=int_value, constant_result=int_value)
elif not dst_type.is_pyobject:
error(self.pos, "Unicode objects do not support coercion to C types.")
error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE.")
elif dst_type is not py_object_type:
if not self.check_for_coercion_error(dst_type):
self.fail_assignment(dst_type)
return self
def can_coerce_to_char_literal(self):
return len(self.value) == 1
def generate_evaluation_code(self, code):
self.result_code = code.get_py_string_const(self.value)
......@@ -5426,10 +5438,10 @@ class CmpNode(object):
type1_can_be_int = False
type2_can_be_int = False
if isinstance(operand1, (StringNode, BytesNode)) \
if isinstance(operand1, (StringNode, BytesNode, UnicodeNode)) \
and operand1.can_coerce_to_char_literal():
type1_can_be_int = True
if isinstance(operand2, (StringNode, BytesNode)) \
if isinstance(operand2, (StringNode, BytesNode, UnicodeNode)) \
and operand2.can_coerce_to_char_literal():
type2_can_be_int = True
......
......@@ -137,7 +137,7 @@ class IterationTransform(Visitor.VisitorTransform):
return node
PyUnicode_AS_UNICODE_func_type = PyrexTypes.CFuncType(
PyrexTypes.CPtrType(PyrexTypes.c_uint_type), [ # FIXME: return type is actually Py_UNICODE*
PyrexTypes.CPtrType(PyrexTypes.c_py_unicode_type), [
PyrexTypes.CFuncTypeArg("s", Builtin.unicode_type, None)
])
......
......@@ -1851,6 +1851,7 @@ basic_c_type_names = ("void", "char", "int", "float", "double", "bint")
special_basic_c_types = {
# name : (signed, longness)
"Py_UNICODE" : (0, 0),
"Py_ssize_t" : (2, 0),
"size_t" : (0, 0),
}
......
......@@ -863,6 +863,20 @@ class CAnonEnumType(CIntType):
return 'int'
class CPyUnicodeIntType(CIntType):
# Py_UNICODE
# Conversion from a unicode string to Py_UNICODE at runtime is not
# currently supported and may never be - we only convert from and
# to integers here. The maximum value for a Py_UNICODE is
# 1114111, so PyInt_FromLong() will do just fine here.
to_py_function = "PyInt_FromLong"
def sign_and_name(self):
return "Py_UNICODE"
class CPySSizeTType(CIntType):
to_py_function = "PyInt_FromSsize_t"
......@@ -2075,14 +2089,15 @@ class ErrorType(PyrexType):
rank_to_type_name = (
"char", # 0
"short", # 1
"int", # 2
"long", # 3
"Py_ssize_t", # 4
"size_t", # 5
"PY_LONG_LONG", # 6
"float", # 7
"double", # 8
"long double", # 9
"Py_UNICODE", # 2
"int", # 3
"long", # 4
"Py_ssize_t", # 5
"size_t", # 6
"PY_LONG_LONG", # 7
"float", # 8
"double", # 9
"long double", # 10
)
py_object_type = PyObjectType()
......@@ -2093,29 +2108,30 @@ c_void_ptr_ptr_type = CPtrType(c_void_ptr_type)
c_uchar_type = CIntType(0, 0)
c_ushort_type = CIntType(1, 0)
c_uint_type = CIntType(2, 0)
c_ulong_type = CIntType(3, 0)
c_ulonglong_type = CIntType(6, 0)
c_py_unicode_type = CPyUnicodeIntType(2, 0)
c_uint_type = CIntType(3, 0)
c_ulong_type = CIntType(4, 0)
c_ulonglong_type = CIntType(7, 0)
c_char_type = CIntType(0, 1)
c_short_type = CIntType(1, 1)
c_int_type = CIntType(2, 1)
c_long_type = CIntType(3, 1)
c_longlong_type = CIntType(6, 1)
c_int_type = CIntType(3, 1)
c_long_type = CIntType(4, 1)
c_longlong_type = CIntType(7, 1)
c_schar_type = CIntType(0, 2)
c_sshort_type = CIntType(1, 2)
c_sint_type = CIntType(2, 2)
c_slong_type = CIntType(3, 2)
c_slonglong_type = CIntType(6, 2)
c_sint_type = CIntType(3, 2)
c_slong_type = CIntType(4, 2)
c_slonglong_type = CIntType(7, 2)
c_bint_type = CBIntType(2, 1)
c_py_ssize_t_type = CPySSizeTType(4, 2)
c_size_t_type = CSizeTType(5, 0)
c_bint_type = CBIntType(3, 1)
c_py_ssize_t_type = CPySSizeTType(5, 2)
c_size_t_type = CSizeTType(6, 0)
c_float_type = CFloatType(7, math_h_modifier='f')
c_double_type = CFloatType(8)
c_longdouble_type = CFloatType(9, math_h_modifier='l')
c_float_type = CFloatType(8, math_h_modifier='f')
c_double_type = CFloatType(9)
c_longdouble_type = CFloatType(10, math_h_modifier='l')
c_float_complex_type = CComplexType(c_float_type)
c_double_complex_type = CComplexType(c_double_type)
......@@ -2131,7 +2147,7 @@ c_int_ptr_type = CPtrType(c_int_type)
c_py_ssize_t_ptr_type = CPtrType(c_py_ssize_t_type)
c_size_t_ptr_type = CPtrType(c_size_t_type)
c_returncode_type = CIntType(2, 1, is_returncode = 1)
c_returncode_type = CIntType(3, 1, is_returncode = 1)
c_anon_enum_type = CAnonEnumType(-1, 1)
# the Py_buffer type is defined in Builtin.py
......@@ -2165,6 +2181,7 @@ modifiers_and_name_to_type = {
(1, 0, "bint"): c_bint_type,
(0, 0, "size_t") : c_size_t_type,
(2, 0, "Py_ssize_t"): c_py_ssize_t_type,
(0, 0, "Py_UNICODE"): c_py_unicode_type,
(1, 0, "float"): c_float_type,
(1, 0, "double"): c_double_type,
......@@ -2383,6 +2400,8 @@ def parse_basic_type(name):
signed = 2
elif name == 'size_t':
signed = 0
elif name == 'Py_UNICODE':
signed = 0
else:
if name.startswith('u'):
name = name[1:]
......
......@@ -174,7 +174,7 @@ except ImportError:
# Predefined types
int_types = ['char', 'short', 'int', 'long', 'longlong', 'Py_ssize_t', 'size_t']
int_types = ['char', 'short', 'Py_UNICODE', 'int', 'long', 'longlong', 'Py_ssize_t', 'size_t']
float_types = ['longdouble', 'double', 'float']
complex_types = ['longdoublecomplex', 'doublecomplex', 'floatcomplex', 'complex']
other_types = ['bint', 'void']
......@@ -183,7 +183,7 @@ gs = globals()
for name in int_types:
gs[name] = typedef(py_int)
if not name.endswith('size_t'):
if name != 'Py_UNICODE' and not name.endswith('size_t'):
gs['u'+name] = typedef(py_int)
gs['s'+name] = typedef(py_int)
......
......@@ -4,12 +4,14 @@ cdef int cx = "test" # fails
cdef int x1 = "\xFF" # works
cdef int x2 = "\u0FFF" # fails
cdef int x3 = u"\xFF" # fails
cdef Py_UNICODE u1 = u"\xFF" # works
cdef int u3 = u"\xFF" # fails
_ERRORS = u"""
2:14: Only single-character strings can be coerced into ints.
3:14: Only single-character strings can be coerced into ints.
6:15: Only single-character strings can be coerced into ints.
7:14: Unicode objects do not support coercion to C types.
_ERRORS = """
2:14: Only single-character string literals can be coerced into ints.
3:14: Only single-character string literals can be coerced into ints.
6:15: Only single-character string literals can be coerced into ints.
9:14: Unicode literals do not support coercion to C types other than Py_UNICODE.
"""
# -*- coding: iso-8859-1 -*-
cdef Py_UNICODE char_ASCII = u'A'
cdef Py_UNICODE char_KLINGON = u'\uF8D2'
def char_too_long_ASCII():
cdef Py_UNICODE c = u'AB'
def char_too_long_Unicode():
cdef Py_UNICODE c = u'A\uF8D2'
def char_too_long_bytes():
cdef Py_UNICODE c = b'AB'
def char_too_long_latin1():
cdef Py_UNICODE char_bytes_latin1 = b'\xf6'
_ERRORS = """
7:24: Only single-character Unicode string literals can be coerced into Py_UNICODE.
10:24: Only single-character Unicode string literals can be coerced into Py_UNICODE.
13:24: Only single-character string literals can be coerced into ints.
16:40: Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead.
"""
......@@ -50,7 +50,7 @@ cdef list l_f2 = b1
cdef list l_f3 = u1
_ERRORS = u"""
25:20: Unicode objects do not support coercion to C types.
25:20: Unicode literals do not support coercion to C types other than Py_UNICODE.
26:22: Unicode objects do not support coercion to C types.
27:22: 'str' objects do not support coercion to C types (use 'bytes'?).
......
......@@ -14,7 +14,7 @@ def for_in_bytes(bytes s):
'C'
"""
for c in s:
if c == 'C':
if c == b'C':
return 'C'
else:
return 'X'
......@@ -28,21 +28,21 @@ def for_char_in_bytes(bytes s):
"""
cdef char c
for c in s:
if c == 'C':
if c == b'C':
return 'C'
else:
return 'X'
def for_int_in_unicode(unicode s):
def for_pyunicode_in_unicode(unicode s):
"""
>>> for_int_in_unicode(unicode_abc)
>>> for_pyunicode_in_unicode(unicode_abc)
'X'
>>> for_int_in_unicode(unicode_ABC)
>>> for_pyunicode_in_unicode(unicode_ABC)
'C'
"""
cdef int c
cdef Py_UNICODE c
for c in s:
if c == 'C':
if c == u'C':
return 'C'
else:
return 'X'
# -*- coding: iso-8859-1 -*-
cdef Py_UNICODE char_ASCII = u'A'
cdef Py_UNICODE char_KLINGON = u'\uF8D2'
def compare_ASCII():
    """
    >>> compare_ASCII()
    True
    False
    False
    """
    # Compare the module-level Py_UNICODE variable char_ASCII (declared above
    # as u'A') against single-character unicode literals. Only the first
    # literal is equal, which gives the True/False/False doctest output.
    print(char_ASCII == u'A')
    print(char_ASCII == u'B')
    print(char_ASCII == u'\uF8D2')
def compare_KLINGON():
    """
    >>> compare_KLINGON()
    True
    False
    False
    """
    # Compare the module-level Py_UNICODE variable char_KLINGON (declared
    # above as u'\uF8D2') against single-character unicode literals. Only
    # the first literal is equal. The doctest must call this function, not
    # compare_ASCII(), otherwise the non-ASCII comparison is never exercised.
    print(char_KLINGON == u'\uF8D2')
    print(char_KLINGON == u'A')
    print(char_KLINGON == u'B')
def index_literal(int i):
    """
    >>> index_literal(0) == '1'
    True
    >>> index_literal(-5) == '1'
    True
    >>> index_literal(2) == '3'
    True
    >>> index_literal(4) == '5'
    True
    """
    # Index the five-character unicode literal with a C int argument. The
    # doctests above expect index -5 to yield the same character as index 0.
    # runtime casts are not currently supported
    #return <Py_UNICODE>(u"12345"[i])
    return u"12345"[i]
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment