Commit e68b3b57 authored by Stefan Behnel's avatar Stefan Behnel

implemented Py_UCS4 type

--HG--
rename : tests/errors/py_unicode_type_errors.pyx => tests/errors/py_ucs4_type_errors.pyx
rename : tests/run/py_unicode_type.pyx => tests/run/py_ucs4_type.pyx
parent 7cdfd770
......@@ -981,8 +981,8 @@ class BytesNode(ConstNode):
if not self.can_coerce_to_char_literal():
error(self.pos, "Only single-character string literals can be coerced into ints.")
return self
if dst_type is PyrexTypes.c_py_unicode_type:
error(self.pos, "Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead.")
if dst_type.is_unicode_char:
error(self.pos, "Bytes literals cannot coerce to Py_UNICODE/Py_UCS4, use a unicode literal instead.")
return self
return CharNode(self.pos, value=self.value)
......@@ -1033,17 +1033,17 @@ class UnicodeNode(PyConstNode):
def coerce_to(self, dst_type, env):
if dst_type is self.type:
pass
elif dst_type is PyrexTypes.c_py_unicode_type:
elif dst_type.is_unicode_char:
if not self.can_coerce_to_char_literal():
error(self.pos, "Only single-character Unicode string literals can be coerced into Py_UNICODE.")
error(self.pos, "Only single-character Unicode string literals or surrogate pairs can be coerced into Py_UCS4/Py_UNICODE.")
return self
int_value = ord(self.value)
return IntNode(self.pos, value=int_value, constant_result=int_value)
return IntNode(self.pos, type=dst_type, value=str(int_value), constant_result=int_value)
elif not dst_type.is_pyobject:
if dst_type.is_string and self.bytes_value is not None:
# special case: '-3' enforced unicode literal used in a C char* context
return BytesNode(self.pos, value=self.bytes_value).coerce_to(dst_type, env)
error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE.")
error(self.pos, "Unicode literals do not support coercion to C types other than Py_UNICODE or Py_UCS4.")
elif dst_type is not py_object_type:
if not self.check_for_coercion_error(dst_type):
self.fail_assignment(dst_type)
......@@ -1051,6 +1051,9 @@ class UnicodeNode(PyConstNode):
def can_coerce_to_char_literal(self):
return len(self.value) == 1
## or (len(self.value) == 2
## and (0xD800 <= self.value[0] <= 0xDBFF)
## and (0xDC00 <= self.value[1] <= 0xDFFF))
def contains_surrogates(self):
# Check if the unicode string contains surrogate code points
......@@ -2165,8 +2168,8 @@ class IndexNode(ExprNode):
elif not skip_child_analysis:
self.index.analyse_types(env)
self.original_index_type = self.index.type
if base_type is PyrexTypes.c_py_unicode_type:
# we infer Py_UNICODE for unicode strings in some
if base_type.is_unicode_char:
# we infer Py_UNICODE/Py_UCS4 for unicode strings in some
# cases, but indexing must still work for them
if self.index.constant_result in (0, -1):
# FIXME: we know that this node is redundant -
......@@ -2188,7 +2191,7 @@ class IndexNode(ExprNode):
self.index = self.index.coerce_to_pyobject(env)
self.is_temp = 1
if self.index.type.is_int and base_type is unicode_type:
# Py_UNICODE will automatically coerce to a unicode string
# Py_UNICODE/Py_UCS4 will automatically coerce to a unicode string
# if required, so this is fast and safe
self.type = PyrexTypes.c_py_unicode_type
elif is_slice and base_type in (bytes_type, str_type, unicode_type, list_type, tuple_type):
......@@ -2253,7 +2256,7 @@ class IndexNode(ExprNode):
return "PyList_GET_ITEM(%s, %s)" % (self.base.result(), self.index.result())
elif self.base.type is tuple_type:
return "PyTuple_GET_ITEM(%s, %s)" % (self.base.result(), self.index.result())
elif self.base.type is unicode_type and self.type is PyrexTypes.c_py_unicode_type:
elif self.base.type is unicode_type and self.type.is_unicode_char:
return "PyUnicode_AS_UNICODE(%s)[%s]" % (self.base.result(), self.index.result())
elif (self.type.is_ptr or self.type.is_array) and self.type == self.base.type:
error(self.pos, "Invalid use of pointer slice")
......@@ -2332,7 +2335,7 @@ class IndexNode(ExprNode):
self.result(),
code.error_goto(self.pos)))
code.put_gotref(self.py_result())
elif self.type is PyrexTypes.c_py_unicode_type and self.base.type is unicode_type:
elif self.type.is_unicode_char and self.base.type is unicode_type:
assert self.index.type.is_int
index_code = self.index.result()
function = "__Pyx_GetItemInt_Unicode"
......@@ -5845,8 +5848,8 @@ class NumBinopNode(BinopNode):
self.operand2.result())
def is_py_operation_types(self, type1, type2):
return (type1 is PyrexTypes.c_py_unicode_type or
type2 is PyrexTypes.c_py_unicode_type or
return (type1.is_unicode_char or
type2.is_unicode_char or
BinopNode.is_py_operation_types(self, type1, type2))
def py_operation_function(self):
......@@ -6503,7 +6506,7 @@ class CmpNode(object):
return self.operator in ('in', 'not_in') and \
((self.operand1.type.is_int
and (self.operand2.type.is_string or self.operand2.type is bytes_type)) or
(self.operand1.type is PyrexTypes.c_py_unicode_type
(self.operand1.type.is_unicode_char
and self.operand2.type is unicode_type))
def is_ptr_contains(self):
......@@ -7166,7 +7169,7 @@ class CoerceToPyTypeNode(CoercionNode):
# be specific about some known types
if arg.type.is_string:
self.type = bytes_type
elif arg.type is PyrexTypes.c_py_unicode_type:
elif arg.type.is_unicode_char:
self.type = unicode_type
elif arg.type.is_complex:
self.type = Builtin.complex_type
......
......@@ -1936,7 +1936,7 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform):
node.pos, cfunc_name, self.PyObject_Size_func_type,
args = [arg],
is_temp = node.is_temp)
elif arg.type is PyrexTypes.c_py_unicode_type:
elif arg.type.is_unicode_char:
return ExprNodes.IntNode(node.pos, value='1', constant_result=1,
type=node.type)
else:
......@@ -2028,7 +2028,7 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform):
return node
arg = pos_args[0]
if isinstance(arg, ExprNodes.CoerceToPyTypeNode):
if arg.arg.type is PyrexTypes.c_py_unicode_type:
if arg.arg.type.is_unicode_char:
return arg.arg.coerce_to(node.type, self.current_env())
return node
......@@ -2191,7 +2191,7 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform):
return node
ustring = args[0]
if not isinstance(ustring, ExprNodes.CoerceToPyTypeNode) or \
ustring.arg.type is not PyrexTypes.c_py_unicode_type:
not ustring.arg.type.is_unicode_char:
return node
uchar = ustring.arg
method_name = node.function.attribute
......@@ -2230,7 +2230,7 @@ class OptimizeBuiltinCalls(Visitor.EnvTransform):
return node
ustring = args[0]
if not isinstance(ustring, ExprNodes.CoerceToPyTypeNode) or \
ustring.arg.type is not PyrexTypes.c_py_unicode_type:
not ustring.arg.type.is_unicode_char:
return node
uchar = ustring.arg
method_name = node.function.attribute
......
......@@ -2041,6 +2041,7 @@ basic_c_type_names = ("void", "char", "int", "float", "double", "bint")
special_basic_c_types = {
# name : (signed, longness)
"Py_UNICODE" : (0, 0),
"Py_UCS4" : (0, 0),
"Py_ssize_t" : (2, 0),
"ssize_t" : (2, 0),
"size_t" : (0, 0),
......
......@@ -49,6 +49,7 @@ class PyrexType(BaseType):
# is_typedef boolean Is a typedef type
# is_string boolean Is a C char * type
# is_unicode boolean Is a UTF-8 encoded C char * type
# is_unicode_char boolean Is either Py_UCS4 or Py_UNICODE
# is_returncode boolean Is used only to signal exceptions
# is_error boolean Is the dummy error type
# is_buffer boolean Is buffer access type
......@@ -101,6 +102,7 @@ class PyrexType(BaseType):
is_typedef = 0
is_string = 0
is_unicode = 0
is_unicode_char = 0
is_returncode = 0
is_error = 0
is_buffer = 0
......@@ -924,9 +926,78 @@ class CBIntType(CIntType):
return "<CNumericType bint>"
class CPyUCS4IntType(CIntType):
# Py_UCS4
is_unicode_char = True
# Py_UCS4 coerces from and to single character unicode strings (or
# at most two characters on 16bit Unicode builds), but we also
# allow Python integers as input. The value range for Py_UCS4
# is 0..1114111, which is checked when converting from an integer
# value.
to_py_function = "PyUnicode_FromOrdinal"
from_py_function = "__Pyx_PyObject_AsPy_UCS4"
def create_from_py_utility_code(self, env):
env.use_utility_code(pyobject_as_py_ucs4_utility_code)
return True
def sign_and_name(self):
return "Py_UCS4"
pyobject_as_py_ucs4_utility_code = UtilityCode(
proto='''
static CYTHON_INLINE Py_UCS4 __Pyx_PyObject_AsPy_UCS4(PyObject*);
''',
impl='''
static CYTHON_INLINE Py_UCS4 __Pyx_PyObject_AsPy_UCS4(PyObject* x) {
long ival;
if (PyUnicode_Check(x)) {
if (likely(PyUnicode_GET_SIZE(x) == 1)) {
return PyUnicode_AS_UNICODE(x)[0];
} else if (PyUnicode_GET_SIZE(x) == 2) {
Py_UCS4 high_val = PyUnicode_AS_UNICODE(x)[0];
if (high_val >= 0xD800 && high_val <= 0xDBFF) {
Py_UCS4 low_val = PyUnicode_AS_UNICODE(x)[1];
if (low_val >= 0xDC00 && low_val <= 0xDFFF) {
return 0x10000 | ((high_val & ((1<<10)-1)) << 10) | (low_val & ((1<<10)-1));
}
}
}
PyErr_Format(PyExc_ValueError,
"only single character unicode strings or surrogate pairs can be converted to Py_UCS4, got length "
#if PY_VERSION_HEX < 0x02050000
"%d",
#else
"%zd",
#endif
PyUnicode_GET_SIZE(x));
return (Py_UCS4)-1;
}
ival = __Pyx_PyInt_AsLong(x);
if (unlikely(ival < 0)) {
if (!PyErr_Occurred())
PyErr_SetString(PyExc_OverflowError,
"cannot convert negative value to Py_UCS4");
return (Py_UCS4)-1;
} else if (unlikely(ival > 1114111)) {
PyErr_SetString(PyExc_OverflowError,
"value too large to convert to Py_UCS4");
return (Py_UCS4)-1;
}
return (Py_UCS4)ival;
}
''')
class CPyUnicodeIntType(CIntType):
# Py_UNICODE
is_unicode_char = True
# Py_UNICODE coerces from and to single character unicode strings,
# but we also allow Python integers as input. The value range for
# Py_UNICODE is 0..1114111, which is checked when converting from
......@@ -2306,6 +2377,7 @@ c_anon_enum_type = CAnonEnumType(-1)
c_returncode_type = CReturnCodeType(RANK_INT)
c_bint_type = CBIntType(RANK_INT)
c_py_unicode_type = CPyUnicodeIntType(RANK_INT-0.5, UNSIGNED)
c_py_ucs4_type = CPyUCS4IntType(RANK_LONG-0.5, UNSIGNED)
c_py_ssize_t_type = CPySSizeTType(RANK_LONG+0.5, SIGNED)
c_ssize_t_type = CSSizeTType(RANK_LONG+0.5, SIGNED)
c_size_t_type = CSizeTType(RANK_LONG+0.5, UNSIGNED)
......@@ -2367,6 +2439,7 @@ modifiers_and_name_to_type = {
(1, 0, "bint"): c_bint_type,
(0, 0, "Py_UNICODE"): c_py_unicode_type,
(0, 0, "Py_UCS4"): c_py_ucs4_type,
(2, 0, "Py_ssize_t"): c_py_ssize_t_type,
(2, 0, "ssize_t") : c_ssize_t_type,
(0, 0, "size_t") : c_size_t_type,
......@@ -2614,6 +2687,8 @@ def parse_basic_type(name):
longness = 0
if name == 'Py_UNICODE':
signed = 0
elif name == 'Py_UCS4':
signed = 0
elif name == 'Py_ssize_t':
signed = 2
elif name == 'ssize_t':
......
# -*- coding: iso-8859-1 -*-
cdef Py_UCS4 char_ASCII = u'A'
cdef Py_UCS4 char_KLINGON = u'\uF8D2'
def char_too_long_ASCII():
cdef Py_UCS4 c = u'AB'
def char_too_long_Unicode():
cdef Py_UCS4 c = u'A\uF8D2'
def char_too_long_bytes():
cdef Py_UCS4 c = b'AB'
def char_too_long_latin1():
cdef Py_UCS4 char_bytes_latin1 = b'\xf6'
_ERRORS = """
7:21: Only single-character Unicode string literals or surrogate pairs can be coerced into Py_UCS4/Py_UNICODE.
10:21: Only single-character Unicode string literals or surrogate pairs can be coerced into Py_UCS4/Py_UNICODE.
13:21: Only single-character string literals can be coerced into ints.
16:37: Bytes literals cannot coerce to Py_UNICODE/Py_UCS4, use a unicode literal instead.
"""
......@@ -17,8 +17,8 @@ def char_too_long_latin1():
_ERRORS = """
7:24: Only single-character Unicode string literals can be coerced into Py_UNICODE.
10:24: Only single-character Unicode string literals can be coerced into Py_UNICODE.
7:24: Only single-character Unicode string literals or surrogate pairs can be coerced into Py_UCS4/Py_UNICODE.
10:24: Only single-character Unicode string literals or surrogate pairs can be coerced into Py_UCS4/Py_UNICODE.
13:24: Only single-character string literals can be coerced into ints.
16:40: Bytes literals cannot coerce to Py_UNICODE, use a unicode literal instead.
16:40: Bytes literals cannot coerce to Py_UNICODE/Py_UCS4, use a unicode literal instead.
"""
# -*- coding: iso-8859-1 -*-
cimport cython
cdef Py_UCS4 char_ASCII = u'A'
cdef Py_UCS4 char_KLINGON = u'\uF8D2'
def compare_ASCII():
"""
>>> compare_ASCII()
True
False
False
"""
print(char_ASCII == u'A')
print(char_ASCII == u'B')
print(char_ASCII == u'\uF8D2')
def compare_klingon():
"""
>>> compare_klingon()
True
False
False
"""
print(char_KLINGON == u'\uF8D2')
print(char_KLINGON == u'A')
print(char_KLINGON == u'B')
from cpython.unicode cimport PyUnicode_FromOrdinal
import sys
u0 = u'\x00'
u1 = u'\x01'
umax = PyUnicode_FromOrdinal(sys.maxunicode)
def unicode_ordinal(Py_UCS4 i):
"""
>>> ord(unicode_ordinal(0)) == 0
True
>>> ord(unicode_ordinal(1)) == 1
True
>>> ord(unicode_ordinal(sys.maxunicode)) == sys.maxunicode
True
>>> ord(unicode_ordinal(u0)) == 0
True
>>> ord(unicode_ordinal(u1)) == 1
True
>>> ord(unicode_ordinal(umax)) == sys.maxunicode
True
Value too small:
>>> unicode_ordinal(-1) #doctest: +ELLIPSIS
Traceback (most recent call last):
...
OverflowError: ...
Value too large:
>>> unicode_ordinal(sys.maxunicode+1) #doctest: +ELLIPSIS
Traceback (most recent call last):
...
OverflowError: ...
Less than one character:
>>> unicode_ordinal(u0[:0])
Traceback (most recent call last):
...
ValueError: only single character unicode strings or surrogate pairs can be converted to Py_UCS4, got length 0
More than one character:
>>> unicode_ordinal(u0+u1)
Traceback (most recent call last):
...
ValueError: only single character unicode strings or surrogate pairs can be converted to Py_UCS4, got length 2
"""
return i
@cython.test_assert_path_exists('//PythonCapiCallNode')
@cython.test_fail_if_path_exists('//SimpleCallNode')
def unicode_type_methods(Py_UCS4 uchar):
"""
>>> unicode_type_methods(ord('A'))
[True, True, False, False, False, False, False, True, True]
>>> unicode_type_methods(ord('a'))
[True, True, False, False, True, False, False, False, False]
>>> unicode_type_methods(ord('8'))
[True, False, True, True, False, True, False, False, False]
>>> unicode_type_methods(ord('\\t'))
[False, False, False, False, False, False, True, False, False]
"""
return [
# character types
uchar.isalnum(),
uchar.isalpha(),
uchar.isdecimal(),
uchar.isdigit(),
uchar.islower(),
uchar.isnumeric(),
uchar.isspace(),
uchar.istitle(),
uchar.isupper(),
]
@cython.test_assert_path_exists('//PythonCapiCallNode')
@cython.test_fail_if_path_exists('//SimpleCallNode')
def unicode_methods(Py_UCS4 uchar):
"""
>>> unicode_methods(ord('A')) == ['a', 'A', 'A']
True
>>> unicode_methods(ord('a')) == ['a', 'A', 'A']
True
"""
return [
# character conversion
uchar.lower(),
uchar.upper(),
uchar.title(),
]
@cython.test_assert_path_exists('//IntNode')
@cython.test_fail_if_path_exists('//SimpleCallNode',
'//PythonCapiCallNode')
def len_uchar(Py_UCS4 uchar):
"""
>>> len_uchar(ord('A'))
1
"""
return len(uchar)
def index_uchar(Py_UCS4 uchar, Py_ssize_t i):
"""
>>> index_uchar(ord('A'), 0) == ('A', 'A', 'A')
True
>>> index_uchar(ord('A'), -1) == ('A', 'A', 'A')
True
>>> index_uchar(ord('A'), 1)
Traceback (most recent call last):
IndexError: string index out of range
"""
return uchar[0], uchar[-1], uchar[i]
mixed_ustring = u'AbcDefGhIjKlmnoP'
lower_ustring = mixed_ustring.lower()
upper_ustring = mixed_ustring.lower()
@cython.test_assert_path_exists('//PythonCapiCallNode',
'//ForFromStatNode')
@cython.test_fail_if_path_exists('//SimpleCallNode',
'//ForInStatNode')
def count_lower_case_characters(unicode ustring):
"""
>>> count_lower_case_characters(mixed_ustring)
10
>>> count_lower_case_characters(lower_ustring)
16
"""
cdef Py_ssize_t count = 0
for uchar in ustring:
if uchar.islower():
count += 1
return count
@cython.test_assert_path_exists('//SwitchStatNode',
'//ForFromStatNode')
@cython.test_fail_if_path_exists('//ForInStatNode')
def iter_and_in():
"""
>>> iter_and_in()
a
b
e
f
h
"""
for c in u'abcdefgh':
if c in u'abCDefGh':
print c
@cython.test_assert_path_exists('//SwitchStatNode',
'//ForFromStatNode')
@cython.test_fail_if_path_exists('//ForInStatNode')
def index_and_in():
"""
>>> index_and_in()
1
3
4
7
8
"""
cdef int i
for i in range(1,9):
if u'abcdefgh'[-i] in u'abCDefGh':
print i
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment