fix surrogates in Unicode literals in Python 3.3 (the UTF-8 codec rejects them explicitly)

496d3224 · Stefan Behnel · 02a5f1e0 · 496d3224 · 496d3224 · 496d3224
Commit 496d3224 authored Jan 06, 2013 by Stefan Behnel
5 changed files
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -24,6 +24,13 @@ Features added
 Bugs fixed
 ----------

+* Surrogate code points in Unicode string literals failed to compile and/or
+  load in CPython 3.3.  To work around this change introduced by CPython,
+  Cython switched from UTF-8 to Python Unicode escapes ('\u0101') internally
+  for storing literal Unicode strings in C code.  This may add a slight
+  initialisation overhead if a large number of non-Latin1 characters are
+  used in the code.
+
 Other changes
 -------------


--- a/Cython/Compiler/Code.py
+++ b/Cython/Compiler/Code.py
@@ -991,7 +991,7 @@ class GlobalState(object):
    def get_string_const(self, text, py_version=None):
        # return a C string constant, creating a new one if necessary
        if text.is_unicode:
-            byte_string = text.utf8encode()
+            byte_string = text.escapeencode()
        else:
            byte_string = text.byteencode()
        try:
@@ -1006,7 +1006,7 @@ class GlobalState(object):
        # return a Python string constant, creating a new one if necessary
        py3str_cstring = None
        if is_str and unicode_value is not None \
-               and unicode_value.utf8encode() != text.byteencode():
+               and unicode_value.escapeencode() != text.byteencode():
            py3str_cstring = self.get_string_const(unicode_value, py_version=3)
            c_string = self.get_string_const(text, py_version=2)
        else:

--- a/Cython/Compiler/StringEncoding.py
+++ b/Cython/Compiler/StringEncoding.py
@@ -12,6 +12,8 @@ else:
    _unicode, _str, _bytes = unicode, str, str
    IS_PYTHON3 = False

+IS_PYTHON24 = sys.version_info[:2] < (2,5)
+
 empty_bytes = _bytes()
 empty_unicode = _unicode()

@@ -126,6 +128,13 @@ class EncodedString(_unicode):
        assert self.encoding is None
        return self.encode("UTF-8")

+    def escapeencode(self):
+        assert self.encoding is None
+        if IS_PYTHON24:
+            # work around bug in Py24 encoder
+            return self.replace(u'\\', u'\\\\').encode('unicode_escape')
+        return self.encode('unicode_escape')
+
    def is_unicode(self):
        return self.encoding is None
    is_unicode = property(is_unicode)
@@ -147,6 +156,9 @@ class BytesLiteral(_bytes):
    def utf8encode(self):
        assert False, "this is not a unicode string: %r" % self

+    def escapeencode(self):
+        assert False, "this is not a unicode string: %r" % self
+
    def __str__(self):
        """Fake-decode the byte string to unicode to support %
        formatting of unicode strings.
@@ -165,6 +177,8 @@ char_from_escape_sequence = {
    r'\v' : u'\v',
    }.get

+_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))
+
 def _to_escape_sequence(s):
    if s in '\n\r\t':
        return repr(s)[1:-1]
@@ -176,19 +190,22 @@ def _to_escape_sequence(s):
        # within a character sequence, oct passes much better than hex
        return ''.join(['\\%03o' % ord(c) for c in s])

-_c_special = ('\\', '??', '"') + tuple(map(chr, range(32)))
-_c_special_replacements = [(orig.encode('ASCII'),
-                            _to_escape_sequence(orig).encode('ASCII'))
-                           for orig in _c_special ]
-
-def _build_specials_test():
+def _build_specials_replacer():
    subexps = []
+    replacements = {}
    for special in _c_special:
        regexp = ''.join(['[%s]' % c.replace('\\', '\\\\') for c in special])
        subexps.append(regexp)
-    return re.compile('|'.join(subexps).encode('ASCII')).search
+        replacements[special.encode('ASCII')] = _to_escape_sequence(special).encode('ASCII')
+
+    sub = re.compile(('(%s)' % '|'.join(subexps)).encode('ASCII')).sub
+    def replace_specials(m):
+        return replacements[m.group(1)]
+    def replace(s):
+        return sub(replace_specials, s)
+    return replace

-_has_specials = _build_specials_test()
+_replace_specials = _build_specials_replacer()

 def escape_char(c):
    if IS_PYTHON3:
@@ -210,10 +227,7 @@ def escape_byte_string(s):
    encoded as ISO-8859-1, will result in the correct byte sequence
    being written.
    """
-    if _has_specials(s):
-        for special, replacement in _c_special_replacements:
-            if special in s:
-                s = s.replace(special, replacement)
+    s = _replace_specials(s)
    try:
        return s.decode("ASCII") # trial decoding: plain ASCII => done
    except UnicodeDecodeError:

--- a/Cython/Utility/StringTools.c
+++ b/Cython/Utility/StringTools.c
@@ -17,7 +17,7 @@ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {
    while (t->p) {
        #if PY_MAJOR_VERSION < 3
        if (t->is_unicode) {
-            *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL);
+            *t->p = PyUnicode_DecodeUnicodeEscape(t->s, t->n - 1, NULL);
        } else if (t->intern) {
            *t->p = PyString_InternFromString(t->s);
        } else {
@@ -25,12 +25,13 @@ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {
        }
        #else  /* Python 3+ has unicode identifiers */
        if (t->is_unicode | t->is_str) {
-            if (t->intern) {
-                *t->p = PyUnicode_InternFromString(t->s);
-            } else if (t->encoding) {
+            if (unlikely(t->encoding)) {
                *t->p = PyUnicode_Decode(t->s, t->n - 1, t->encoding, NULL);
            } else {
-                *t->p = PyUnicode_FromStringAndSize(t->s, t->n - 1);
+                *t->p = PyUnicode_DecodeUnicodeEscape(t->s, t->n - 1, NULL);
+            }
+            if (t->intern && likely(*t->p)) {
+                PyUnicode_InternInPlace(t->p);
            }
        } else {
            *t->p = PyBytes_FromStringAndSize(t->s, t->n - 1);

--- a/tests/run/unicodeliterals.pyx
+++ b/tests/run/unicodeliterals.pyx
@@ -17,6 +17,10 @@ __doc__ = br"""
    u'\x03g\xf8\uf8d2S\xf8k ik'
    >>> f
    u'\xf8'
+    >>> g
+    u'\udc00'
+    >>> h
+    u'\ud800'
    >>> add
    u'S\xf8k ik\xfc\xd6\xe4abc'
    >>> null
@@ -36,6 +40,10 @@ __doc__ = br"""
    10
    >>> len(f)
    1
+    >>> len(g)
+    1
+    >>> len(h)
+    1
    >>> len(add)
    12
    >>> len(null)
@@ -63,6 +71,10 @@ __doc__ = br"""
    True
    >>> f == u'\\xf8' # unescaped by Python
    True
+    >>> g == u'\\udc00' # unescaped by Python (required by doctest)
+    True
+    >>> h == u'\\ud800' # unescaped by Python (required by doctest)
+    True
    >>> add == u'Søk ik' + u'üÖä' + 'abc'
    True
    >>> null == u'\\x00' # unescaped by Python (required by doctest)
@@ -93,6 +105,8 @@ c = u'Søk ik'
 d = u'üÖä'
 e = u'\x03\x67\xf8\uf8d2Søk ik'
 f = u'\xf8'
+g = u'\udc00'   # lone trail surrogate
+h = u'\ud800'   # lone lead surrogate

 add = u'Søk ik' + u'üÖä' + u'abc'
 null = u'\x00'