Commit e822b034 authored by Serhiy Storchaka

Issue #15866: The xmlcharrefreplace error handler no longer produces two XML
entities for a non-BMP character on a narrow build.
parent 5ad35148
......@@ -66,15 +66,34 @@ class CodecCallbackTest(unittest.TestCase):
# replace unencodable characters with numeric character entities.
# For ascii, latin-1 and charmaps this is completely implemented
# in C and should be reasonably fast.
s = u"\u30b9\u30d1\u30e2 \xe4nd eggs"
s = u"\u30b9\u30d1\u30e2 \xe4nd egg\u0161"
self.assertEqual(
s.encode("ascii", "xmlcharrefreplace"),
"&#12473;&#12497;&#12514; &#228;nd eggs"
"&#12473;&#12497;&#12514; &#228;nd egg&#353;"
)
self.assertEqual(
s.encode("latin-1", "xmlcharrefreplace"),
"&#12473;&#12497;&#12514; \xe4nd eggs"
"&#12473;&#12497;&#12514; \xe4nd egg&#353;"
)
self.assertEqual(
s.encode("iso-8859-15", "xmlcharrefreplace"),
"&#12473;&#12497;&#12514; \xe4nd egg\xa8"
)
def test_xmlcharrefreplace_with_surrogates(self):
tests = [(u'\U0001f49d', '💝'),
(u'\ud83d', '�'),
(u'\udc9d', '�'),
(u'\ud83d\udc9d', '💝' if len(u'\U0001f49d') > 1 else
'��'),
]
for encoding in ['ascii', 'latin1', 'iso-8859-15']:
for s, exp in tests:
self.assertEqual(s.encode(encoding, 'xmlcharrefreplace'),
exp, msg='%r.encode(%r)' % (s, encoding))
self.assertEqual((s+'X').encode(encoding, 'xmlcharrefreplace'),
exp+'X',
msg='%r.encode(%r)' % (s + 'X', encoding))
def test_xmlcharnamereplace(self):
# This time use a named character entity for unencodable
......
......@@ -1658,6 +1658,18 @@ class UnicodeTest(
self.assertEqual(unicode_encodedecimal(u"123\u20ac\u0660", "replace"),
b'123?0')
def test_encode_decimal_with_surrogates(self):
    """Check unicode_encodedecimal() with the xmlcharrefreplace handler.

    Each input is prefixed with the decimal digits u"123" and must come
    back as b'123' followed by the numeric character reference(s) for the
    non-BMP character or surrogate(s) that follow.
    """
    from _testcapi import unicode_encodedecimal
    # Expected values are spelled out as numeric character references in
    # bytes literals so they cannot be confused with the characters they
    # refer to.
    narrow_build = len(u'\U0001f49d') > 1
    tests = [
        (u'\U0001f49d', b'&#128157;'),
        (u'\ud83d', b'&#55357;'),
        (u'\udc9d', b'&#56477;'),
        # An explicit surrogate pair is the same string as U+1F49D on a
        # narrow build, but two separate code points on a wide build.
        (u'\ud83d\udc9d',
         b'&#128157;' if narrow_build else b'&#55357;&#56477;'),
    ]
    for s, exp in tests:
        self.assertEqual(
            unicode_encodedecimal(u"123" + s, "xmlcharrefreplace"),
            b'123' + exp)
def test_main():
    """Hand this module's name to test_support.run_unittest()."""
    test_support.run_unittest(__name__)
......
......@@ -9,6 +9,9 @@ What's New in Python 2.7.6?
Core and Builtins
-----------------
- Issue #15866: The xmlcharrefreplace error handler no longer produces two XML
entities for a non-BMP character on a narrow build.
- Issue #18184: PyUnicode_FromFormat() and PyUnicode_FromFormatV() now raise
OverflowError when an argument of %c format is out of range.
......
......@@ -1118,7 +1118,7 @@ unicode_encodedecimal(PyObject *self, PyObject *args)
if (!PyArg_ParseTuple(args, "u#|s", &unicode, &length, &errors))
return NULL;
decimal_length = length * 7; /* len('&#8364;') */
decimal_length = length * 10; /* len('&#1114111;') */
decimal = PyBytes_FromStringAndSize(NULL, decimal_length);
if (decimal == NULL)
return NULL;
......
......@@ -547,6 +547,37 @@ PyObject *PyUnicode_FromString(const char *u)
return PyUnicode_FromStringAndSize(u, size);
}
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * to by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing to the end of the buffer.
 * ('end' is used on narrow builds to detect a lone surrogate at the
 * end of the buffer that should be returned unchanged.)
 * The ptr and end arguments should be side-effect free and ptr must be an
 * lvalue.  The caller must guarantee ptr < end.
 * The type of the returned char is always Py_UCS4.
 *
 * Note: the macro advances ptr to the next char, so it might have
 * side-effects (especially if used with other macros).
 */

/* Helper macros used by _Py_UNICODE_NEXT.  The argument is parenthesized so
 * that an arbitrary expression can be passed safely. */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) \
    (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) \
    (0xDC00 <= (ch) && (ch) <= 0xDFFF)

/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
/* Wide build: every code point fits in one Py_UNICODE unit. */
#define _Py_UNICODE_NEXT(ptr, end) ((Py_UCS4)*(ptr)++)
#else
/* Narrow build: a pair is only joined when BOTH units lie inside
 * [ptr, end).  Requiring at least two remaining units keeps the macro from
 * reading (ptr)[1] at or beyond 'end', so a high surrogate in the last
 * position is returned unchanged as a lone surrogate. */
#define _Py_UNICODE_NEXT(ptr, end)                                      \
    ((((end) - (ptr) >= 2) &&                                           \
      _Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                          \
      _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                         \
     ((ptr) += 2, _Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) :  \
     (Py_UCS4)*(ptr)++)
#endif
#ifdef HAVE_WCHAR_H
#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
......@@ -3642,26 +3673,22 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
case 4: /* xmlcharrefreplace */
respos = str-PyString_AS_STRING(res);
/* determine replacement size (temporarily (mis)uses p) */
for (p = collstart, repsize = 0; p < collend; ++p) {
if (*p<10)
for (p = collstart, repsize = 0; p < collend;) {
Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
if (ch < 10)
repsize += 2+1+1;
else if (*p<100)
else if (ch < 100)
repsize += 2+2+1;
else if (*p<1000)
else if (ch < 1000)
repsize += 2+3+1;
else if (*p<10000)
else if (ch < 10000)
repsize += 2+4+1;
#ifndef Py_UNICODE_WIDE
else
else if (ch < 100000)
repsize += 2+5+1;
#else
else if (*p<100000)
repsize += 2+5+1;
else if (*p<1000000)
else if (ch < 1000000)
repsize += 2+6+1;
else
repsize += 2+7+1;
#endif
}
requiredsize = respos+repsize+(endp-collend);
if (requiredsize > ressize) {
......@@ -3673,8 +3700,9 @@ static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
ressize = requiredsize;
}
/* generate replacement (temporarily (mis)uses p) */
for (p = collstart; p < collend; ++p) {
str += sprintf(str, "&#%d;", (int)*p);
for (p = collstart; p < collend;) {
Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
str += sprintf(str, "&#%d;", (int)ch);
}
p = collend;
break;
......@@ -4649,11 +4677,20 @@ int charmap_encoding_error(
*inpos = collendpos;
break;
case 4: /* xmlcharrefreplace */
/* generate replacement (temporarily (mis)uses p) */
for (collpos = collstartpos; collpos < collendpos; ++collpos) {
/* generate replacement */
for (collpos = collstartpos; collpos < collendpos;) {
char buffer[2+29+1+1];
char *cp;
sprintf(buffer, "&#%d;", (int)p[collpos]);
Py_UCS4 ch = p[collpos++];
#ifndef Py_UNICODE_WIDE
if ((0xD800 <= ch && ch <= 0xDBFF) &&
(collpos < collendpos) &&
(0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
ch = ((((ch & 0x03FF) << 10) |
((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
}
#endif
sprintf(buffer, "&#%d;", (int)ch);
for (cp = buffer; *cp; ++cp) {
x = charmapencode_output(*cp, mapping, res, respos);
if (x==enc_EXCEPTION)
......@@ -5068,10 +5105,11 @@ PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
break;
case 4: /* xmlcharrefreplace */
/* generate replacement (temporarily (mis)uses p) */
for (p = collstart; p < collend; ++p) {
for (p = collstart; p < collend;) {
char buffer[2+29+1+1];
char *cp;
sprintf(buffer, "&#%d;", (int)*p);
Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
sprintf(buffer, "&#%d;", (int)ch);
if (charmaptranslate_makespace(&res, &str,
(str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
goto onError;
......@@ -5222,8 +5260,10 @@ int PyUnicode_EncodeDecimal(Py_UNICODE *s,
break;
case 4: /* xmlcharrefreplace */
/* generate replacement (temporarily (mis)uses p) */
for (p = collstart; p < collend; ++p)
output += sprintf(output, "&#%d;", (int)*p);
for (p = collstart; p < collend;) {
Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
output += sprintf(output, "&#%d;", ch);
}
p = collend;
break;
default:
......
......@@ -556,6 +556,7 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
PyObject *res;
Py_UNICODE *p;
Py_UNICODE *startp;
Py_UNICODE *e;
Py_UNICODE *outp;
int ressize;
if (PyUnicodeEncodeError_GetStart(exc, &start))
......@@ -565,26 +566,31 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
if (!(object = PyUnicodeEncodeError_GetObject(exc)))
return NULL;
startp = PyUnicode_AS_UNICODE(object);
for (p = startp+start, ressize = 0; p < startp+end; ++p) {
if (*p<10)
e = startp + end;
for (p = startp+start, ressize = 0; p < e;) {
Py_UCS4 ch = *p++;
#ifndef Py_UNICODE_WIDE
if ((0xD800 <= ch && ch <= 0xDBFF) &&
(p < e) &&
(0xDC00 <= *p && *p <= 0xDFFF)) {
ch = ((((ch & 0x03FF) << 10) |
((Py_UCS4)*p++ & 0x03FF)) + 0x10000);
}
#endif
if (ch < 10)
ressize += 2+1+1;
else if (*p<100)
else if (ch < 100)
ressize += 2+2+1;
else if (*p<1000)
else if (ch < 1000)
ressize += 2+3+1;
else if (*p<10000)
else if (ch < 10000)
ressize += 2+4+1;
#ifndef Py_UNICODE_WIDE
else
ressize += 2+5+1;
#else
else if (*p<100000)
else if (ch < 100000)
ressize += 2+5+1;
else if (*p<1000000)
else if (ch < 1000000)
ressize += 2+6+1;
else
ressize += 2+7+1;
#endif
}
/* allocate replacement */
res = PyUnicode_FromUnicode(NULL, ressize);
......@@ -593,40 +599,41 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
return NULL;
}
/* generate replacement */
for (p = startp+start, outp = PyUnicode_AS_UNICODE(res);
p < startp+end; ++p) {
Py_UNICODE c = *p;
for (p = startp+start, outp = PyUnicode_AS_UNICODE(res); p < e;) {
int digits;
int base;
Py_UCS4 ch = *p++;
#ifndef Py_UNICODE_WIDE
if ((0xD800 <= ch && ch <= 0xDBFF) &&
(p < startp+end) &&
(0xDC00 <= *p && *p <= 0xDFFF)) {
ch = ((((ch & 0x03FF) << 10) |
((Py_UCS4)*p++ & 0x03FF)) + 0x10000);
}
#endif
*outp++ = '&';
*outp++ = '#';
if (*p<10) {
if (ch < 10) {
digits = 1;
base = 1;
}
else if (*p<100) {
else if (ch < 100) {
digits = 2;
base = 10;
}
else if (*p<1000) {
else if (ch < 1000) {
digits = 3;
base = 100;
}
else if (*p<10000) {
else if (ch < 10000) {
digits = 4;
base = 1000;
}
#ifndef Py_UNICODE_WIDE
else {
digits = 5;
base = 10000;
}
#else
else if (*p<100000) {
else if (ch < 100000) {
digits = 5;
base = 10000;
}
else if (*p<1000000) {
else if (ch < 1000000) {
digits = 6;
base = 100000;
}
......@@ -634,10 +641,9 @@ PyObject *PyCodec_XMLCharRefReplaceErrors(PyObject *exc)
digits = 7;
base = 1000000;
}
#endif
while (digits-->0) {
*outp++ = '0' + c/base;
c %= base;
*outp++ = '0' + ch/base;
ch %= base;
base /= 10;
}
*outp++ = ';';
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment