Commit 554d878b authored by Alexandre Vassalotti

Update copy of PyUnicode_EncodeRawUnicodeEscape in _pickle.

Add astral character test case.
parent aa0e531e
...@@ -484,13 +484,21 @@ class AbstractPickleTests(unittest.TestCase): ...@@ -484,13 +484,21 @@ class AbstractPickleTests(unittest.TestCase):
self.assertRaises(ValueError, self.loads, buf) self.assertRaises(ValueError, self.loads, buf)
def test_unicode(self): def test_unicode(self):
endcases = ['', '<\\u>', '<\\\u1234>', '<\n>', '<\\>'] endcases = ['', '<\\u>', '<\\\u1234>', '<\n>', '<\\>',
'<\\\U00012345>']
for proto in protocols: for proto in protocols:
for u in endcases: for u in endcases:
p = self.dumps(u, proto) p = self.dumps(u, proto)
u2 = self.loads(p) u2 = self.loads(p)
self.assertEqual(u2, u) self.assertEqual(u2, u)
def test_unicode_high_plane(self):
    # A character outside the Basic Multilingual Plane must survive a
    # dumps/loads round trip unchanged under every pickle protocol.
    astral = '\U00012345'
    for proto in protocols:
        restored = self.loads(self.dumps(astral, proto))
        self.assertEqual(restored, astral)
def test_bytes(self): def test_bytes(self):
for proto in protocols: for proto in protocols:
for u in b'', b'xyz', b'xyz'*100: for u in b'', b'xyz', b'xyz'*100:
......
...@@ -1109,16 +1109,21 @@ raw_unicode_escape(const Py_UNICODE *s, Py_ssize_t size) ...@@ -1109,16 +1109,21 @@ raw_unicode_escape(const Py_UNICODE *s, Py_ssize_t size)
static const char *hexdigits = "0123456789abcdef"; static const char *hexdigits = "0123456789abcdef";
#ifdef Py_UNICODE_WIDE #ifdef Py_UNICODE_WIDE
repr = PyBytes_FromStringAndSize(NULL, 10 * size); const Py_ssize_t expandsize = 10;
#else #else
repr = PyBytes_FromStringAndSize(NULL, 6 * size); const Py_ssize_t expandsize = 6;
#endif #endif
if (size > PY_SSIZE_T_MAX / expandsize)
return PyErr_NoMemory();
repr = PyByteArray_FromStringAndSize(NULL, expandsize * size);
if (repr == NULL) if (repr == NULL)
return NULL; return NULL;
if (size == 0) if (size == 0)
goto done; goto done;
p = q = PyBytes_AS_STRING(repr); p = q = PyByteArray_AS_STRING(repr);
while (size-- > 0) { while (size-- > 0) {
Py_UNICODE ch = *s++; Py_UNICODE ch = *s++;
#ifdef Py_UNICODE_WIDE #ifdef Py_UNICODE_WIDE
...@@ -1136,6 +1141,32 @@ raw_unicode_escape(const Py_UNICODE *s, Py_ssize_t size) ...@@ -1136,6 +1141,32 @@ raw_unicode_escape(const Py_UNICODE *s, Py_ssize_t size)
*p++ = hexdigits[ch & 15]; *p++ = hexdigits[ch & 15];
} }
else else
#else
/* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
if (ch >= 0xD800 && ch < 0xDC00) {
Py_UNICODE ch2;
Py_UCS4 ucs;
ch2 = *s++;
size--;
if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
*p++ = '\\';
*p++ = 'U';
*p++ = hexdigits[(ucs >> 28) & 0xf];
*p++ = hexdigits[(ucs >> 24) & 0xf];
*p++ = hexdigits[(ucs >> 20) & 0xf];
*p++ = hexdigits[(ucs >> 16) & 0xf];
*p++ = hexdigits[(ucs >> 12) & 0xf];
*p++ = hexdigits[(ucs >> 8) & 0xf];
*p++ = hexdigits[(ucs >> 4) & 0xf];
*p++ = hexdigits[ucs & 0xf];
continue;
}
/* Fall through: isolated surrogates are copied as-is */
s--;
size++;
}
#endif #endif
/* Map 16-bit characters to '\uxxxx' */ /* Map 16-bit characters to '\uxxxx' */
if (ch >= 256 || ch == '\\' || ch == '\n') { if (ch >= 256 || ch == '\\' || ch == '\n') {
...@@ -1146,14 +1177,14 @@ raw_unicode_escape(const Py_UNICODE *s, Py_ssize_t size) ...@@ -1146,14 +1177,14 @@ raw_unicode_escape(const Py_UNICODE *s, Py_ssize_t size)
*p++ = hexdigits[(ch >> 4) & 0xf]; *p++ = hexdigits[(ch >> 4) & 0xf];
*p++ = hexdigits[ch & 15]; *p++ = hexdigits[ch & 15];
} }
/* Copy everything else as-is */ /* Copy everything else as-is */
else else
*p++ = (char) ch; *p++ = (char) ch;
} }
size = p - q; size = p - q;
done: done:
result = PyBytes_FromStringAndSize(PyBytes_AS_STRING(repr), size); result = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(repr), size);
Py_DECREF(repr); Py_DECREF(repr);
return result; return result;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment