Update copy of PyUnicode_EncodeRawUnicodeEscape in _pickle.

Add astral character test case.

Update copy of PyUnicode_EncodeRawUnicodeEscape in _pickle.
Add astral character test case.
554d878b · Alexandre Vassalotti · aa0e531e · 554d878b · 554d878b
Commit 554d878b authored Dec 27, 2008 by Alexandre Vassalotti
Show whitespace changes
Inline Side-by-side

Showing with 45 additions and 6 deletions

Lib/test/pickletester.py Lib/test/pickletester.py +9 -1

Modules/_pickle.c Modules/_pickle.c +36 -5

No files found.
--- a/Lib/test/pickletester.py
+++ b/Lib/test/pickletester.py
@@ -484,13 +484,21 @@ class AbstractPickleTests(unittest.TestCase):
            self.assertRaises(ValueError, self.loads, buf)

    def test_unicode(self):
-        endcases = ['', '<\\u>', '<\\\u1234>', '<\n>',  '<\\>']
+        endcases = ['', '<\\u>', '<\\\u1234>', '<\n>',  '<\\>',
+                    '<\\\U00012345>']
        for proto in protocols:
            for u in endcases:
                p = self.dumps(u, proto)
                u2 = self.loads(p)
                self.assertEqual(u2, u)

+    def test_unicode_high_plane(self):
+        t = '\U00012345'
+        for proto in protocols:
+            p = self.dumps(t, proto)
+            t2 = self.loads(p)
+            self.assertEqual(t2, t)
+
    def test_bytes(self):
        for proto in protocols:
            for u in b'', b'xyz', b'xyz'*100:

--- a/Modules/_pickle.c
+++ b/Modules/_pickle.c
@@ -1109,16 +1109,21 @@ raw_unicode_escape(const Py_UNICODE *s, Py_ssize_t size)
    static const char *hexdigits = "0123456789abcdef";

 #ifdef Py_UNICODE_WIDE
-    repr = PyBytes_FromStringAndSize(NULL, 10 * size);
+    const Py_ssize_t expandsize = 10;
 #else
-    repr = PyBytes_FromStringAndSize(NULL, 6 * size);
+    const Py_ssize_t expandsize = 6;
 #endif
+    
+    if (size > PY_SSIZE_T_MAX / expandsize)
+        return PyErr_NoMemory();
+    
+    repr = PyByteArray_FromStringAndSize(NULL, expandsize * size);
    if (repr == NULL)
        return NULL;
    if (size == 0)
        goto done;

-    p = q = PyBytes_AS_STRING(repr);
+    p = q = PyByteArray_AS_STRING(repr);
    while (size-- > 0) {
        Py_UNICODE ch = *s++;
 #ifdef Py_UNICODE_WIDE
@@ -1136,6 +1141,32 @@ raw_unicode_escape(const Py_UNICODE *s, Py_ssize_t size)
            *p++ = hexdigits[ch & 15];
        }
        else
+#else
+            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
+            if (ch >= 0xD800 && ch < 0xDC00) {
+                Py_UNICODE ch2;
+                Py_UCS4 ucs;
+
+                ch2 = *s++;
+                size--;
+                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
+                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
+                    *p++ = '\\';
+                    *p++ = 'U';
+                    *p++ = hexdigits[(ucs >> 28) & 0xf];
+                    *p++ = hexdigits[(ucs >> 24) & 0xf];
+                    *p++ = hexdigits[(ucs >> 20) & 0xf];
+                    *p++ = hexdigits[(ucs >> 16) & 0xf];
+                    *p++ = hexdigits[(ucs >> 12) & 0xf];
+                    *p++ = hexdigits[(ucs >> 8) & 0xf];
+                    *p++ = hexdigits[(ucs >> 4) & 0xf];
+                    *p++ = hexdigits[ucs & 0xf];
+                    continue;
+                }
+                /* Fall through: isolated surrogates are escaped as '\uxxxx' */
+                s--;
+                size++;
+            }
 #endif
        /* Map 16-bit characters to '\uxxxx' */
        if (ch >= 256 || ch == '\\' || ch == '\n') {
@@ -1153,7 +1184,7 @@ raw_unicode_escape(const Py_UNICODE *s, Py_ssize_t size)
    size = p - q;

  done:
-    result = PyBytes_FromStringAndSize(PyBytes_AS_STRING(repr), size);
+    result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size);
    Py_DECREF(repr);
    return result;
 }