Commit c9a8df24 authored by Antoine Pitrou

Merged revisions 84655 via svnmerge from

svn+ssh://pythondev@svn.python.org/python/branches/py3k

........
  r84655 | antoine.pitrou | 2010-09-09 22:30:23 +0200 (jeu., 09 sept. 2010) | 6 lines

  Issue #9804: ascii() now always represents unicode surrogate pairs as
  a single `\UXXXXXXXX`, regardless of whether the character is printable
  or not.  Also, the "backslashreplace" error handler now joins surrogate
  pairs into a single character on UCS-2 builds.
........
parent 8e0bb6a1
...@@ -174,6 +174,28 @@ class BuiltinTest(unittest.TestCase): ...@@ -174,6 +174,28 @@ class BuiltinTest(unittest.TestCase):
a = {} a = {}
a[0] = a a[0] = a
self.assertEqual(ascii(a), '{0: {...}}') self.assertEqual(ascii(a), '{0: {...}}')
# Advanced checks for unicode strings
def _check_uni(s):
self.assertEqual(ascii(s), repr(s))
_check_uni("'")
_check_uni('"')
_check_uni('"\'')
_check_uni('\0')
_check_uni('\r\n\t .')
# Unprintable non-ASCII characters
_check_uni('\x85')
_check_uni('\u1fff')
_check_uni('\U00012fff')
# Lone surrogates
_check_uni('\ud800')
_check_uni('\udfff')
# Issue #9804: surrogates should be joined even for printable
# wide characters (UCS-2 builds).
self.assertEqual(ascii('\U0001d121'), "'\\U0001d121'")
# All together
s = "'\0\"\n\r\t abcd\x85é\U00012fff\uD800\U0001D121xxx."
self.assertEqual(ascii(s),
r"""'\'\x00"\n\r\t abcd\x85\xe9\U00012fff\ud800\U0001d121xxx.'""")
def test_neg(self): def test_neg(self):
x = -sys.maxsize-1 x = -sys.maxsize-1
......
...@@ -577,16 +577,30 @@ class CodecCallbackTest(unittest.TestCase): ...@@ -577,16 +577,30 @@ class CodecCallbackTest(unittest.TestCase):
UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")), UnicodeEncodeError("ascii", "\uffff", 0, 1, "ouch")),
("\\uffff", 1) ("\\uffff", 1)
) )
if sys.maxunicode>0xffff: # 1 on UCS-4 builds, 2 on UCS-2
len_wide = len("\U00010000")
self.assertEquals( self.assertEquals(
codecs.backslashreplace_errors( codecs.backslashreplace_errors(
UnicodeEncodeError("ascii", "\U00010000", 0, 1, "ouch")), UnicodeEncodeError("ascii", "\U00010000",
("\\U00010000", 1) 0, len_wide, "ouch")),
("\\U00010000", len_wide)
) )
self.assertEquals( self.assertEquals(
codecs.backslashreplace_errors( codecs.backslashreplace_errors(
UnicodeEncodeError("ascii", "\U0010ffff", 0, 1, "ouch")), UnicodeEncodeError("ascii", "\U0010ffff",
("\\U0010ffff", 1) 0, len_wide, "ouch")),
("\\U0010ffff", len_wide)
)
# Lone surrogates (regardless of unicode width)
self.assertEquals(
codecs.backslashreplace_errors(
UnicodeEncodeError("ascii", "\ud800", 0, 1, "ouch")),
("\\ud800", 1)
)
self.assertEquals(
codecs.backslashreplace_errors(
UnicodeEncodeError("ascii", "\udfff", 0, 1, "ouch")),
("\\udfff", 1)
) )
def test_badhandlerresults(self): def test_badhandlerresults(self):
......
...@@ -12,6 +12,11 @@ What's New in Python 3.1.3? ...@@ -12,6 +12,11 @@ What's New in Python 3.1.3?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #9804: ascii() now always represents unicode surrogate pairs as
a single ``\UXXXXXXXX``, regardless of whether the character is printable
or not. Also, the "backslashreplace" error handler now joins surrogate
pairs into a single character on UCS-2 builds.
- Issue #9797: pystate.c wrongly assumed that zero couldn't be a valid - Issue #9797: pystate.c wrongly assumed that zero couldn't be a valid
thread-local storage key. thread-local storage key.
......
...@@ -678,6 +678,13 @@ static Py_UNICODE hexdigits[] = { ...@@ -678,6 +678,13 @@ static Py_UNICODE hexdigits[] = {
PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
{ {
/* IS_SURROGATE_PAIR(p, end): non-zero when the code unit at p is in the
   high-surrogate range 0xD800-0xDBFF, p + 1 is still before end, and the
   code unit at p + 1 is in the low-surrogate range 0xDC00-0xDFFF.
   When Py_UNICODE_WIDE is defined the macro expands to the constant 0,
   so the pair-joining branches that test it are never taken.
   NOTE(review): the arguments are not parenthesized in the expansion;
   the visible call sites pass `p` and `startp+end`, which still parse as
   intended, but more complex expressions may not. */
#ifndef Py_UNICODE_WIDE
#define IS_SURROGATE_PAIR(p, end) \
(*p >= 0xD800 && *p <= 0xDBFF && (p + 1) < end && \
*(p + 1) >= 0xDC00 && *(p + 1) <= 0xDFFF)
#else
#define IS_SURROGATE_PAIR(p, end) 0
#endif
if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) { if (PyObject_IsInstance(exc, PyExc_UnicodeEncodeError)) {
PyObject *restuple; PyObject *restuple;
PyObject *object; PyObject *object;
...@@ -702,6 +709,11 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) ...@@ -702,6 +709,11 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
else else
#endif #endif
if (*p >= 0x100) { if (*p >= 0x100) {
if (IS_SURROGATE_PAIR(p, startp+end)) {
ressize += 1+1+8;
++p;
}
else
ressize += 1+1+4; ressize += 1+1+4;
} }
else else
...@@ -712,9 +724,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) ...@@ -712,9 +724,12 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
return NULL; return NULL;
for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
p < startp+end; ++p) { p < startp+end; ++p) {
Py_UNICODE c = *p; Py_UCS4 c = (Py_UCS4) *p;
*outp++ = '\\'; *outp++ = '\\';
#ifdef Py_UNICODE_WIDE if (IS_SURROGATE_PAIR(p, startp+end)) {
c = ((*p & 0x3FF) << 10) + (*(p + 1) & 0x3FF) + 0x10000;
++p;
}
if (c >= 0x00010000) { if (c >= 0x00010000) {
*outp++ = 'U'; *outp++ = 'U';
*outp++ = hexdigits[(c>>28)&0xf]; *outp++ = hexdigits[(c>>28)&0xf];
...@@ -724,9 +739,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) ...@@ -724,9 +739,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
*outp++ = hexdigits[(c>>12)&0xf]; *outp++ = hexdigits[(c>>12)&0xf];
*outp++ = hexdigits[(c>>8)&0xf]; *outp++ = hexdigits[(c>>8)&0xf];
} }
else else if (c >= 0x100) {
#endif
if (c >= 0x100) {
*outp++ = 'u'; *outp++ = 'u';
*outp++ = hexdigits[(c>>12)&0xf]; *outp++ = hexdigits[(c>>12)&0xf];
*outp++ = hexdigits[(c>>8)&0xf]; *outp++ = hexdigits[(c>>8)&0xf];
...@@ -746,6 +759,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc) ...@@ -746,6 +759,7 @@ PyObject *PyCodec_BackslashReplaceErrors(PyObject *exc)
wrong_exception_type(exc); wrong_exception_type(exc);
return NULL; return NULL;
} }
#undef IS_SURROGATE_PAIR
} }
/* This handler is declared static until someone demonstrates /* This handler is declared static until someone demonstrates
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment