Commit 8d57968c authored by Antoine Pitrou

Issue #9804: ascii() now always represents unicode surrogate pairs as
a single `\UXXXXXXXX`, regardless of whether the character is printable
or not.  Also, the "backslashreplace" error handler now joins surrogate
pairs into a single character on UCS-2 builds.
parent 2327df02
...@@ -179,6 +179,28 @@ class BuiltinTest(unittest.TestCase): ...@@ -179,6 +179,28 @@ class BuiltinTest(unittest.TestCase):
a = {} a = {}
a[0] = a a[0] = a
self.assertEqual(ascii(a), '{0: {...}}') self.assertEqual(ascii(a), '{0: {...}}')
# Advanced checks for unicode strings
def _check_uni(s):
self.assertEqual(ascii(s), repr(s))
_check_uni("'")
_check_uni('"')
_check_uni('"\'')
_check_uni('\0')
_check_uni('\r\n\t .')
# Unprintable non-ASCII characters
_check_uni('\x85')
_check_uni('\u1fff')
_check_uni('\U00012fff')
# Lone surrogates
_check_uni('\ud800')
_check_uni('\udfff')
# Issue #9804: surrogates should be joined even for printable
# wide characters (UCS-2 builds).
self.assertEqual(ascii('\U0001d121'), "'\\U0001d121'")
# All together
s = "'\0\"\n\r\t abcd\x85é\U00012fff\uD800\U0001D121xxx."
self.assertEqual(ascii(s),
r"""'\'\x00"\n\r\t abcd\x85\xe9\U00012fff\ud800\U0001d121xxx.'""")
def test_neg(self): def test_neg(self):
x = -sys.maxsize-1 x = -sys.maxsize-1
......
...@@ -577,17 +577,31 @@ class CodecCallbackTest(unittest.TestCase): ...@@ -577,17 +577,31 @@ class CodecCallbackTest(unittest.TestCase):
UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")), UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")),
("\\uffff", 1) ("\\uffff", 1)
) )
if sys.maxunicode>0xffff: # 1 on UCS-4 builds, 2 on UCS-2
self.assertEquals( len_wide = len("\U00010000")
codecs.backslashreplace_errors( self.assertEquals(
UnicodeEncodeError("ascii", "\U00010000", 0, 1, "ouch")), codecs.backslashreplace_errors(
("\\U00010000", 1) UnicodeEncodeError("ascii", "\U00010000",
) 0, len_wide, "ouch")),
self.assertEquals( ("\\U00010000", len_wide)
codecs.backslashreplace_errors( )
UnicodeEncodeError("ascii", "\U0010ffff", 0, 1, "ouch")), self.assertEquals(
("\\U0010ffff", 1) codecs.backslashreplace_errors(
) UnicodeEncodeError("ascii", "\U0010ffff",
0, len_wide, "ouch")),
("\\U0010ffff", len_wide)
)
# Lone surrogates (regardless of unicode width)
self.assertEquals(
codecs.backslashreplace_errors(
UnicodeEncodeError("ascii", "\ud800", 0, 1, "ouch")),
("\\ud800", 1)
)
self.assertEquals(
codecs.backslashreplace_errors(
UnicodeEncodeError("ascii", "\udfff", 0, 1, "ouch")),
("\\udfff", 1)
)
def test_badhandlerresults(self): def test_badhandlerresults(self):
results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) ) results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
......
...@@ -10,6 +10,11 @@ What's New in Python 3.2 Alpha 3? ...@@ -10,6 +10,11 @@ What's New in Python 3.2 Alpha 3?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #9804: ascii() now always represents unicode surrogate pairs as
a single ``\UXXXXXXXX``, regardless of whether the character is printable
or not. Also, the "backslashreplace" error handler now joins surrogate
pairs into a single character on UCS-2 builds.
- Issue #9757: memoryview objects get a release() method to release the - Issue #9757: memoryview objects get a release() method to release the
underlying buffer (previously this was only done when deallocating the underlying buffer (previously this was only done when deallocating the
memoryview), and gain support for the context management protocol. memoryview), and gain support for the context management protocol.
......
...@@ -678,6 +678,13 @@ static Py_UNICODE hexdigits[] = { ...@@ -678,6 +678,13 @@ static Py_UNICODE hexdigits[] = {
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{ {
#ifndef Py_UNICODE_WIDE
#define IS_SURROGATE_PAIR(p, end) \
(*p >= 0xD800 && *p <= 0xDBFF && (p + 1) < end && \
*(p + 1) >= 0xDC00 && *(p + 1) <= 0xDFFF)
#else
#define IS_SURROGATE_PAIR(p, end) 0
#endif
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
PyObject *restuple; PyObject *restuple;
PyObject *object; PyObject *object;
...@@ -702,7 +709,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) ...@@ -702,7 +709,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
else else
#endif #endif
if (*p >= 0x100) { if (*p >= 0x100) {
ressize += 1+1+4; if (IS_SURROGATE_PAIR(p, startp+end)) {
ressize += 1+1+8;
++p;
}
else
ressize += 1+1+4;
} }
else else
ressize += 1+1+2; ressize += 1+1+2;
...@@ -712,9 +724,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) ...@@ -712,9 +724,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
return NULL; return NULL;
for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
p < startp+end; ++p) { p < startp+end; ++p) {
Py_UNICODE c = *p; Py_UCS4 c = (Py_UCS4) *p;
*outp++ = '\\'; *outp++ = '\\';
#ifdef Py_UNICODE_WIDE if (IS_SURROGATE_PAIR(p, startp+end)) {
c = ((*p & 0x3FF) << 10) + (*(p + 1) & 0x3FF) + 0x10000;
++p;
}
if (c >= 0x00010000) { if (c >= 0x00010000) {
*outp++ = 'U'; *outp++ = 'U';
*outp++ = hexdigits[(c>>28)&0xf]; *outp++ = hexdigits[(c>>28)&0xf];
...@@ -724,9 +739,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) ...@@ -724,9 +739,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
*outp++ = hexdigits[(c>>12)&0xf]; *outp++ = hexdigits[(c>>12)&0xf];
*outp++ = hexdigits[(c>>8)&0xf]; *outp++ = hexdigits[(c>>8)&0xf];
} }
else else if (c >= 0x100) {
#endif
if (c >= 0x100) {
*outp++ = 'u'; *outp++ = 'u';
*outp++ = hexdigits[(c>>12)&0xf]; *outp++ = hexdigits[(c>>12)&0xf];
*outp++ = hexdigits[(c>>8)&0xf]; *outp++ = hexdigits[(c>>8)&0xf];
...@@ -746,6 +759,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) ...@@ -746,6 +759,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
wrong_exception_type(exc); wrong_exception_type(exc);
return NULL; return NULL;
} }
#undef IS_SURROGATE_PAIR
} }
/* This handler is declared static until someone demonstrates /* This handler is declared static until someone demonstrates
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment