Commit ac90a80a authored by Stefan Behnel

support surrogates in unicode string literals in Py3.3

parent cd98dbdb
...@@ -1187,7 +1187,7 @@ class UnicodeNode(ConstNode): ...@@ -1187,7 +1187,7 @@ class UnicodeNode(ConstNode):
self.constant_result = self.value self.constant_result = self.value
def as_sliced_node(self, start, stop, step=None): def as_sliced_node(self, start, stop, step=None):
if _string_contains_surrogates(self.value[:stop]): if StringEncoding.string_contains_surrogates(self.value[:stop]):
# this is unsafe as it may give different results in different runtimes # this is unsafe as it may give different results in different runtimes
return None return None
value = StringEncoding.EncodedString(self.value[start:stop:step]) value = StringEncoding.EncodedString(self.value[start:stop:step])
...@@ -1236,11 +1236,30 @@ class UnicodeNode(ConstNode): ...@@ -1236,11 +1236,30 @@ class UnicodeNode(ConstNode):
return BoolNode(self.pos, value=bool_value, constant_result=bool_value) return BoolNode(self.pos, value=bool_value, constant_result=bool_value)
def contains_surrogates(self): def contains_surrogates(self):
return _string_contains_surrogates(self.value) return StringEncoding.string_contains_surrogates(self.value)
def generate_evaluation_code(self, code): def generate_evaluation_code(self, code):
if self.type.is_pyobject: if self.type.is_pyobject:
self.result_code = code.get_py_string_const(self.value) if self.contains_surrogates():
# surrogates are not really portable and cannot be
# decoded by the UTF-8 codec in Py3.3
self.result_code = code.get_py_const(py_object_type, 'ustring_')
data_cname = code.get_pyunicode_ptr_const(self.value)
code = code.get_cached_constants_writer()
code.mark_pos(self.pos)
code.putln(
"%s = PyUnicode_FromUnicode(%s, (sizeof(%s) / sizeof(Py_UNICODE))-1); %s" % (
self.result_code,
data_cname,
data_cname,
code.error_goto_if_null(self.result_code, self.pos)))
code.putln("#if CYTHON_PEP393_ENABLED")
code.putln(
code.error_goto_if_neg(
"PyUnicode_READY(%s)" % self.result_code, self.pos))
code.putln("#endif")
else:
self.result_code = code.get_py_string_const(self.value)
else: else:
self.result_code = code.get_pyunicode_ptr_const(self.value) self.result_code = code.get_pyunicode_ptr_const(self.value)
...@@ -1271,7 +1290,7 @@ class StringNode(PyConstNode): ...@@ -1271,7 +1290,7 @@ class StringNode(PyConstNode):
value = type(self.value)(self.value[start:stop:step]) value = type(self.value)(self.value[start:stop:step])
value.encoding = self.value.encoding value.encoding = self.value.encoding
if self.unicode_value is not None: if self.unicode_value is not None:
if _string_contains_surrogates(self.unicode_value[:stop]): if StringEncoding.string_contains_surrogates(self.unicode_value[:stop]):
# this is unsafe as it may give different results in different runtimes # this is unsafe as it may give different results in different runtimes
return None return None
unicode_value = StringEncoding.EncodedString( unicode_value = StringEncoding.EncodedString(
...@@ -1316,26 +1335,6 @@ class IdentifierStringNode(StringNode): ...@@ -1316,26 +1335,6 @@ class IdentifierStringNode(StringNode):
is_identifier = True is_identifier = True
def _string_contains_surrogates(ustring):
    """
    Tell whether ``ustring`` holds a character that a narrow (UTF-16)
    CPython build spells as a surrogate pair, i.e. as two separate
    code units.  Works on both wide (UCS-4) and narrow builds.
    """
    for char in ustring:
        code_point = ord(char)
        # A value above 0xFFFF can only be seen on wide builds.  On
        # narrow builds the same character shows up as a pair of code
        # units; testing for the lead unit (D800-DBFF) is enough here,
        # since its trail unit (DC00-DFFF) is likely to follow, and a
        # trail unit without a lead unit cannot form a pair by itself.
        if code_point > 0xFFFF or 0xD800 <= code_point <= 0xDBFF:
            return True
    return False
class ImagNode(AtomicExprNode): class ImagNode(AtomicExprNode):
# Imaginary number literal # Imaginary number literal
# #
......
...@@ -126,9 +126,28 @@ class EncodedString(_unicode): ...@@ -126,9 +126,28 @@ class EncodedString(_unicode):
assert self.encoding is None assert self.encoding is None
return self.encode("UTF-8") return self.encode("UTF-8")
@property
def is_unicode(self): def is_unicode(self):
return self.encoding is None return self.encoding is None
is_unicode = property(is_unicode)
def contains_surrogates(self):
    """Return what ``string_contains_surrogates()`` reports for this string."""
    return string_contains_surrogates(self)
def string_contains_surrogates(ustring):
    """
    Report whether the unicode string contains surrogate code points
    on either a wide (UCS-4) or a narrow (UTF-16) CPython platform.

    Returns True as soon as a character is found whose ordinal is
    above 0xFFFF (a character that a narrow platform would spell as
    two separate code units) or lies anywhere in the surrogate range
    0xD800-0xDFFF, which includes lone lead or trail surrogates.
    Returns False otherwise, including for the empty string.
    """
    for char in ustring:
        code_point = ord(char)
        # Ordinals above 0xFFFF can only show up on wide platforms;
        # narrow platforms expose the surrogate code units instead.
        if code_point > 0xFFFF or 0xD800 <= code_point <= 0xDFFF:
            return True
    return False
class BytesLiteral(_bytes): class BytesLiteral(_bytes):
# bytes subclass that is compatible with EncodedString # bytes subclass that is compatible with EncodedString
...@@ -155,6 +174,7 @@ class BytesLiteral(_bytes): ...@@ -155,6 +174,7 @@ class BytesLiteral(_bytes):
is_unicode = False is_unicode = False
char_from_escape_sequence = { char_from_escape_sequence = {
r'\a' : u'\a', r'\a' : u'\a',
r'\b' : u'\b', r'\b' : u'\b',
......
...@@ -17,6 +17,10 @@ __doc__ = br""" ...@@ -17,6 +17,10 @@ __doc__ = br"""
u'\x03g\xf8\uf8d2S\xf8k ik' u'\x03g\xf8\uf8d2S\xf8k ik'
>>> f >>> f
u'\xf8' u'\xf8'
>>> g
u'\udc00'
>>> h
u'\ud800'
>>> add >>> add
u'S\xf8k ik\xfc\xd6\xe4abc' u'S\xf8k ik\xfc\xd6\xe4abc'
>>> null >>> null
...@@ -36,6 +40,10 @@ __doc__ = br""" ...@@ -36,6 +40,10 @@ __doc__ = br"""
10 10
>>> len(f) >>> len(f)
1 1
>>> len(g)
1
>>> len(h)
1
>>> len(add) >>> len(add)
12 12
>>> len(null) >>> len(null)
...@@ -63,6 +71,10 @@ __doc__ = br""" ...@@ -63,6 +71,10 @@ __doc__ = br"""
True True
>>> f == u'\\xf8' # unescaped by Python >>> f == u'\\xf8' # unescaped by Python
True True
>>> g == u'\\udc00' # unescaped by Python (required by doctest)
True
>>> h == u'\\ud800' # unescaped by Python (required by doctest)
True
>>> k == u'\\N{SNOWMAN}' == u'\\u2603' >>> k == u'\\N{SNOWMAN}' == u'\\u2603'
True True
>>> add == u'Søk ik' + u'üÖä' + 'abc' >>> add == u'Søk ik' + u'üÖä' + 'abc'
...@@ -95,6 +107,8 @@ c = u'Søk ik' ...@@ -95,6 +107,8 @@ c = u'Søk ik'
d = u'üÖä' d = u'üÖä'
e = u'\x03\x67\xf8\uf8d2Søk ik' e = u'\x03\x67\xf8\uf8d2Søk ik'
f = u'\xf8' f = u'\xf8'
g = u'\udc00' # lone trail surrogate
h = u'\ud800' # lone lead surrogate
k = u'\N{SNOWMAN}' k = u'\N{SNOWMAN}'
add = u'Søk ik' + u'üÖä' + u'abc' add = u'Søk ik' + u'üÖä' + u'abc'
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment