Commit 12cf67fd authored by Stefan Behnel

fix unicode encoding optimisation when running in Python 3

parent f534aeb7
...@@ -2736,54 +2736,46 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform): ...@@ -2736,54 +2736,46 @@ class OptimizeBuiltinCalls(Visitor.MethodDispatcherTransform):
null_node = ExprNodes.NullNode(pos) null_node = ExprNodes.NullNode(pos)
if len(args) >= 2: if len(args) >= 2:
encoding_node = args[1] encoding, encoding_node = self._unpack_string_and_cstring_node(args[1])
if isinstance(encoding_node, ExprNodes.CoerceToPyTypeNode): if encoding_node is None:
encoding_node = encoding_node.arg
if isinstance(encoding_node, (ExprNodes.UnicodeNode, ExprNodes.StringNode,
ExprNodes.BytesNode)):
encoding = encoding_node.value
encoding_node = ExprNodes.BytesNode(encoding_node.pos, value=encoding,
type=PyrexTypes.c_char_ptr_type)
elif encoding_node.type is Builtin.bytes_type:
encoding = None
encoding_node = encoding_node.coerce_to(
PyrexTypes.c_char_ptr_type, self.current_env())
elif encoding_node.type.is_string:
encoding = None
else:
return None return None
else: else:
encoding = None encoding = None
encoding_node = null_node encoding_node = null_node
if len(args) == 3: if len(args) == 3:
error_handling_node = args[2] error_handling, error_handling_node = self._unpack_string_and_cstring_node(args[2])
if isinstance(error_handling_node, ExprNodes.CoerceToPyTypeNode): if error_handling_node is None:
error_handling_node = error_handling_node.arg return None
if isinstance(error_handling_node,
(ExprNodes.UnicodeNode, ExprNodes.StringNode,
ExprNodes.BytesNode)):
error_handling = error_handling_node.value
if error_handling == 'strict': if error_handling == 'strict':
error_handling_node = null_node error_handling_node = null_node
else:
error_handling_node = ExprNodes.BytesNode(
error_handling_node.pos, value=error_handling,
type=PyrexTypes.c_char_ptr_type)
elif error_handling_node.type is Builtin.bytes_type:
error_handling = None
error_handling_node = error_handling_node.coerce_to(
PyrexTypes.c_char_ptr_type, self.current_env())
elif error_handling_node.type.is_string:
error_handling = None
else:
return None
else: else:
error_handling = 'strict' error_handling = 'strict'
error_handling_node = null_node error_handling_node = null_node
return (encoding, encoding_node, error_handling, error_handling_node) return (encoding, encoding_node, error_handling, error_handling_node)
def _unpack_string_and_cstring_node(self, node):
    """Split a string-like argument node into its literal value and a C string node.

    Returns a tuple ``(value, cstring_node)``:

    * unicode literal: ``value`` is the literal's text and ``cstring_node``
      is a ``char*`` typed BytesNode holding its UTF-8 encoded form
    * str/bytes literal: ``value`` is the literal decoded as ISO-8859-1 and
      ``cstring_node`` is a ``char*`` typed BytesNode with the original value
    * non-literal ``bytes`` object: ``value`` is None and ``cstring_node``
      is the node coerced to ``char*``
    * C string typed node: ``value`` is None, the node is passed through
    * anything else: ``(None, None)`` - callers test the node for None to
      give up on the optimisation
    """
    if isinstance(node, ExprNodes.CoerceToPyTypeNode):
        # Look through a coercion to a Python object at the wrapped node.
        node = node.arg
    if isinstance(node, ExprNodes.UnicodeNode):
        encoding = node.value
        node = ExprNodes.BytesNode(
            node.pos, value=BytesLiteral(encoding.utf8encode()),
            type=PyrexTypes.c_char_ptr_type)
    elif isinstance(node, (ExprNodes.StringNode, ExprNodes.BytesNode)):
        encoding = node.value.decode('ISO-8859-1')
        node = ExprNodes.BytesNode(
            node.pos, value=node.value, type=PyrexTypes.c_char_ptr_type)
    elif node.type is Builtin.bytes_type:
        encoding = None
        node = node.coerce_to(PyrexTypes.c_char_ptr_type, self.current_env())
    elif node.type.is_string:
        encoding = None
    else:
        # Unsupported argument type: bind BOTH names. Previously only
        # 'node' was set here, so the return statement below raised
        # UnboundLocalError instead of reporting "not optimisable".
        encoding = node = None
    return encoding, node
def _handle_simple_method_str_endswith(self, node, args, is_unbound_method): def _handle_simple_method_str_endswith(self, node, args, is_unbound_method):
return self._inject_tailmatch( return self._inject_tailmatch(
node, args, is_unbound_method, 'str', 'endswith', node, args, is_unbound_method, 'str', 'endswith',
......
...@@ -3,50 +3,98 @@ ...@@ -3,50 +3,98 @@
__doc__ = u""" __doc__ = u"""
>>> len(u) >>> len(u)
15 15
>>> default == 'abcdefg'.encode()
True
>>> isinstance(utf8, _bytes)
True
>>> utf8 == u.encode('UTF-8')
True
>>> isinstance(utf8_strict, _bytes)
True
>>> utf8_strict == u.encode('UTF-8', 'strict')
True
>>> isinstance(ascii_replace, _bytes)
True
>>> ascii_replace == u.encode('ASCII', 'replace')
True
>>> isinstance(cp850_strict, _bytes)
True
>>> cp850_strict == u.encode('cp850', 'strict')
True
>>> isinstance(latin1, _bytes)
True
>>> latin1 == u.encode('latin-1')
True
>>> isinstance(latin1_constant, _bytes)
True
>>> latin1_constant == latin1
True
""" """
cimport cython
_bytes = bytes _bytes = bytes
cdef unicode text = u'abcäöüöéèâÁÀABC' cdef unicode text = u'abcäöüöéèâÁÀABC'
u = text u = text
default = u'abcdefg'.encode() def default():
"""
>>> default() == 'abcdefg'.encode()
True
"""
return u'abcdefg'.encode()
# Encodes the module-level 'text' to UTF-8, passing the encoding name as a
# unicode literal and no error mode.  The decorator asserts that a
# PythonCapiFunctionNode with cname "PyUnicode_AsUTF8String" exists for this
# function; the doctest compares the result against u.encode('UTF-8').
@cython.test_assert_path_exists('//PythonCapiFunctionNode[@cname = "PyUnicode_AsUTF8String"]')
def utf8():
"""
>>> isinstance(utf8(), _bytes)
True
>>> utf8() == u.encode('UTF-8')
True
"""
return text.encode(u'UTF-8')
# Encodes 'text' to UTF-8 with an explicit 'strict' error mode, both given as
# unicode literals.  The decorator asserts that a PythonCapiFunctionNode with
# cname "PyUnicode_AsUTF8String" exists, i.e. the same node as for utf8()
# (presumably because 'strict' is the default error mode - confirm).
@cython.test_assert_path_exists('//PythonCapiFunctionNode[@cname = "PyUnicode_AsUTF8String"]')
def utf8_strict():
"""
>>> isinstance(utf8_strict(), _bytes)
True
>>> utf8_strict() == u.encode('UTF-8', 'strict')
True
"""
return text.encode(u'UTF-8', u'strict')
utf8 = text.encode(u'UTF-8') @cython.test_assert_path_exists('//PythonCapiFunctionNode[@cname = "PyUnicode_AsUTF8String"]')
# Variant of utf8_strict() that passes the encoding and the error mode as
# unprefixed string literals instead of u'' literals.
def utf8_str_strict():
"""
>>> isinstance(utf8_str_strict(), _bytes)
True
>>> utf8_str_strict() == u.encode('UTF-8', 'strict')
True
"""
return text.encode('UTF-8', 'strict')
utf8_strict = text.encode(u'UTF-8', u'strict') @cython.test_assert_path_exists('//PythonCapiFunctionNode[@cname = "PyUnicode_AsUTF8String"]')
# Variant of utf8_strict() that passes the encoding and the error mode as
# b'' bytes literals instead of u'' literals.
def utf8_bytes_strict():
"""
>>> isinstance(utf8_bytes_strict(), _bytes)
True
>>> utf8_bytes_strict() == u.encode('UTF-8', 'strict')
True
"""
return text.encode(b'UTF-8', b'strict')
ascii_replace = text.encode(u'ASCII', u'replace') @cython.test_assert_path_exists('//PythonCapiFunctionNode[@cname = "PyUnicode_AsEncodedString"]')
# Encodes 'text' to ASCII with the non-default 'replace' error mode; the
# doctest compares the result against u.encode('ASCII', 'replace').
def ascii_replace():
"""
>>> isinstance(ascii_replace(), _bytes)
True
>>> ascii_replace() == u.encode('ASCII', 'replace')
True
"""
return text.encode(u'ASCII', u'replace')
cp850_strict = text.encode(u'cp850', u'strict') def cp850_strict():
"""
>>> isinstance(cp850_strict(), _bytes)
True
>>> cp850_strict() == u.encode('cp850', 'strict')
True
"""
return text.encode(u'cp850', u'strict')
latin1 = text.encode(u'latin-1') @cython.test_assert_path_exists('//PythonCapiFunctionNode[@cname = "PyUnicode_AsLatin1String"]')
# Encodes 'text' to Latin-1 (encoding name given as u'latin-1', no error
# mode); the doctest compares the result against u.encode('latin-1').
def latin1():
"""
>>> isinstance(latin1(), _bytes)
True
>>> latin1() == u.encode('latin-1')
True
"""
return text.encode(u'latin-1')
latin1_constant = u'abcäöüöéèâÁÀABC'.encode('latin1') @cython.test_fail_if_path_exists('//PythonCapiFunctionNode', '//SimpleCallNode')
# Encodes a unicode *literal* (rather than the 'text' variable) to Latin-1.
# The accompanying test_fail_if_path_exists decorator rejects any
# PythonCapiFunctionNode or SimpleCallNode here - presumably the encode() call
# on a literal is expected to be folded at compile time (confirm).
def latin1_constant():
"""
>>> isinstance(latin1_constant(), _bytes)
True
>>> latin1_constant() == latin1()
True
"""
return u'abcäöüöéèâÁÀABC'.encode('latin1')
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment