Commit e9b11c1c, authored by Martin v. Löwis

Change decoders to use Unicode API instead of Py_UNICODE.

parent d03a4915
...@@ -1512,6 +1512,13 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length) ...@@ -1512,6 +1512,13 @@ unicode_resize(PyObject **p_unicode, Py_ssize_t length)
if (old_length == length) if (old_length == length)
return 0; return 0;
if (length == 0) {
Py_DECREF(*p_unicode);
*p_unicode = unicode_empty;
Py_INCREF(*p_unicode);
return 0;
}
if (!unicode_resizable(unicode)) { if (!unicode_resizable(unicode)) {
PyObject *copy = resize_copy(unicode, length); PyObject *copy = resize_copy(unicode, length);
if (copy == NULL) if (copy == NULL)
...@@ -1540,8 +1547,7 @@ PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) ...@@ -1540,8 +1547,7 @@ PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
return -1; return -1;
} }
unicode = *p_unicode; unicode = *p_unicode;
if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
|| _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
{ {
PyErr_BadInternalCall(); PyErr_BadInternalCall();
return -1; return -1;
...@@ -1549,6 +1555,36 @@ PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length) ...@@ -1549,6 +1555,36 @@ PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
return unicode_resize(p_unicode, length); return unicode_resize(p_unicode, length);
} }
/* Ensure that *p_unicode can store code points up to maxchar.

   If the current maximum character value of *p_unicode already covers
   maxchar, nothing is done.  Otherwise a new string of the same length is
   allocated with the larger maxchar, the existing characters are copied
   into it, the reference to the old object is released and *p_unicode is
   set to the new object.

   *p_unicode must be ready (asserted).

   Returns 0 on success and -1 on failure; on failure *p_unicode is left
   untouched. */
static int
unicode_widen(PyObject **p_unicode, int maxchar)
{
    PyObject *result;
    Py_ssize_t length;

    assert(PyUnicode_IS_READY(*p_unicode));
    if (maxchar <= PyUnicode_MAX_CHAR_VALUE(*p_unicode))
        return 0;
    length = PyUnicode_GET_LENGTH(*p_unicode);
    result = PyUnicode_New(length, maxchar);
    if (result == NULL)
        return -1;
    /* Do not report success when the copy itself failed: drop the
       replacement object instead of installing it. */
    if (PyUnicode_CopyCharacters(result, 0, *p_unicode, 0, length) < 0) {
        Py_DECREF(result);
        return -1;
    }
    Py_DECREF(*p_unicode);
    *p_unicode = result;
    return 0;
}
/* Store ch at index *pos of *p_unicode and advance *pos by one.

   The string is widened first (via unicode_widen) so that it can hold ch;
   this may replace *p_unicode with a different object.

   Returns 0 on success, -1 if widening failed (in which case nothing is
   written and *pos is not advanced). */
static int
unicode_putchar(PyObject **p_unicode, Py_ssize_t *pos,
                Py_UCS4 ch)
{
    PyObject *target;
    Py_ssize_t index;

    if (unicode_widen(p_unicode, ch) < 0)
        return -1;

    /* Re-read the object only after widening: it may have been replaced. */
    target = *p_unicode;
    index = *pos;
    PyUnicode_WRITE(PyUnicode_KIND(target), PyUnicode_DATA(target),
                    index, ch);
    *pos = index + 1;
    return 0;
}
static PyObject* static PyObject*
get_latin1_char(unsigned char ch) get_latin1_char(unsigned char ch)
{ {
...@@ -3581,19 +3617,18 @@ unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, ...@@ -3581,19 +3617,18 @@ unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
const char *encoding, const char *reason, const char *encoding, const char *reason,
const char **input, const char **inend, Py_ssize_t *startinpos, const char **input, const char **inend, Py_ssize_t *startinpos,
Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr, Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
PyObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr) PyObject **output, Py_ssize_t *outpos)
{ {
static char *argparse = "O!n;decoding error handler must return (str, int) tuple"; static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
PyObject *restuple = NULL; PyObject *restuple = NULL;
PyObject *repunicode = NULL; PyObject *repunicode = NULL;
Py_ssize_t outsize = PyUnicode_GET_SIZE(*output); Py_ssize_t outsize = PyUnicode_GET_LENGTH(*output);
Py_ssize_t insize; Py_ssize_t insize;
Py_ssize_t requiredsize; Py_ssize_t requiredsize;
Py_ssize_t newpos; Py_ssize_t newpos;
const Py_UNICODE *repptr;
PyObject *inputobj = NULL; PyObject *inputobj = NULL;
Py_ssize_t repsize; Py_ssize_t replen;
int res = -1; int res = -1;
if (*errorHandler == NULL) { if (*errorHandler == NULL) {
...@@ -3619,6 +3654,8 @@ unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, ...@@ -3619,6 +3654,8 @@ unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
} }
if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos)) if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
goto onError; goto onError;
if (PyUnicode_READY(repunicode) < 0)
goto onError;
/* Copy back the bytes variables, which might have been modified by the /* Copy back the bytes variables, which might have been modified by the
callback */ callback */
...@@ -3646,21 +3683,20 @@ unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler, ...@@ -3646,21 +3683,20 @@ unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
have+the replacement+the rest of the string (starting have+the replacement+the rest of the string (starting
at the new input position), so we won't have to check space at the new input position), so we won't have to check space
when there are no errors in the rest of the string) */ when there are no errors in the rest of the string) */
repptr = PyUnicode_AS_UNICODE(repunicode); replen = PyUnicode_GET_LENGTH(repunicode);
repsize = PyUnicode_GET_SIZE(repunicode); requiredsize = *outpos + replen + insize-newpos;
requiredsize = *outpos + repsize + insize-newpos;
if (requiredsize > outsize) { if (requiredsize > outsize) {
if (requiredsize<2*outsize) if (requiredsize<2*outsize)
requiredsize = 2*outsize; requiredsize = 2*outsize;
if (PyUnicode_Resize(output, requiredsize) < 0) if (unicode_resize(output, requiredsize) < 0)
goto onError; goto onError;
*outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
} }
if (unicode_widen(output, PyUnicode_MAX_CHAR_VALUE(repunicode)) < 0)
goto onError;
*endinpos = newpos; *endinpos = newpos;
*inptr = *input + newpos; *inptr = *input + newpos;
Py_UNICODE_COPY(*outptr, repptr, repsize); PyUnicode_CopyCharacters(*output, *outpos, repunicode, 0, replen);
*outptr += repsize; *outpos += replen;
*outpos += repsize;
/* we made it! */ /* we made it! */
res = 0; res = 0;
...@@ -3778,17 +3814,17 @@ PyUnicode_DecodeUTF7Stateful(const char *s, ...@@ -3778,17 +3814,17 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
Py_ssize_t outpos; Py_ssize_t outpos;
const char *e; const char *e;
PyObject *unicode; PyObject *unicode;
Py_UNICODE *p;
const char *errmsg = ""; const char *errmsg = "";
int inShift = 0; int inShift = 0;
Py_UNICODE *shiftOutStart; Py_ssize_t shiftOutStart;
unsigned int base64bits = 0; unsigned int base64bits = 0;
unsigned long base64buffer = 0; unsigned long base64buffer = 0;
Py_UNICODE surrogate = 0; Py_UNICODE surrogate = 0;
PyObject *errorHandler = NULL; PyObject *errorHandler = NULL;
PyObject *exc = NULL; PyObject *exc = NULL;
unicode = (PyObject*)_PyUnicode_New(size); /* Start off assuming it's all ASCII. Widen later as necessary. */
unicode = PyUnicode_New(size, 127);
if (!unicode) if (!unicode)
return NULL; return NULL;
if (size == 0) { if (size == 0) {
...@@ -3797,12 +3833,11 @@ PyUnicode_DecodeUTF7Stateful(const char *s, ...@@ -3797,12 +3833,11 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
return unicode; return unicode;
} }
p = PyUnicode_AS_UNICODE(unicode); shiftOutStart = outpos = 0;
shiftOutStart = p;
e = s + size; e = s + size;
while (s < e) { while (s < e) {
Py_UNICODE ch; Py_UCS4 ch;
restart: restart:
ch = (unsigned char) *s; ch = (unsigned char) *s;
...@@ -3820,13 +3855,10 @@ PyUnicode_DecodeUTF7Stateful(const char *s, ...@@ -3820,13 +3855,10 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
if (surrogate) { if (surrogate) {
/* expecting a second surrogate */ /* expecting a second surrogate */
if (outCh >= 0xDC00 && outCh <= 0xDFFF) { if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
#ifdef Py_UNICODE_WIDE Py_UCS4 ch2 = (((surrogate & 0x3FF)<<10)
*p++ = (((surrogate & 0x3FF)<<10) | (outCh & 0x3FF)) + 0x10000;
| (outCh & 0x3FF)) + 0x10000; if (unicode_putchar(&unicode, &outpos, ch2) < 0)
#else goto onError;
*p++ = surrogate;
*p++ = outCh;
#endif
surrogate = 0; surrogate = 0;
} }
else { else {
...@@ -3844,7 +3876,8 @@ PyUnicode_DecodeUTF7Stateful(const char *s, ...@@ -3844,7 +3876,8 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
goto utf7Error; goto utf7Error;
} }
else { else {
*p++ = outCh; if (unicode_putchar(&unicode, &outpos, outCh) < 0)
goto onError;
} }
} }
} }
...@@ -3872,7 +3905,8 @@ PyUnicode_DecodeUTF7Stateful(const char *s, ...@@ -3872,7 +3905,8 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
if (ch != '-') { if (ch != '-') {
/* '-' is absorbed; other terminating /* '-' is absorbed; other terminating
characters are preserved */ characters are preserved */
*p++ = ch; if (unicode_putchar(&unicode, &outpos, ch) < 0)
goto onError;
} }
} }
} }
...@@ -3881,16 +3915,18 @@ PyUnicode_DecodeUTF7Stateful(const char *s, ...@@ -3881,16 +3915,18 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
s++; /* consume '+' */ s++; /* consume '+' */
if (s < e && *s == '-') { /* '+-' encodes '+' */ if (s < e && *s == '-') { /* '+-' encodes '+' */
s++; s++;
*p++ = '+'; if (unicode_putchar(&unicode, &outpos, '+') < 0)
goto onError;
} }
else { /* begin base64-encoded section */ else { /* begin base64-encoded section */
inShift = 1; inShift = 1;
shiftOutStart = p; shiftOutStart = outpos;
base64bits = 0; base64bits = 0;
} }
} }
else if (DECODE_DIRECT(ch)) { /* character decodes as itself */ else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
*p++ = ch; if (unicode_putchar(&unicode, &outpos, ch) < 0)
goto onError;
s++; s++;
} }
else { else {
...@@ -3901,13 +3937,12 @@ PyUnicode_DecodeUTF7Stateful(const char *s, ...@@ -3901,13 +3937,12 @@ PyUnicode_DecodeUTF7Stateful(const char *s,
} }
continue; continue;
utf7Error: utf7Error:
outpos = p-PyUnicode_AS_UNICODE(unicode);
endinpos = s-starts; endinpos = s-starts;
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, &errorHandler, errors, &errorHandler,
"utf7", errmsg, "utf7", errmsg,
&starts, &e, &startinpos, &endinpos, &exc, &s, &starts, &e, &startinpos, &endinpos, &exc, &s,
&unicode, &outpos, &p)) &unicode, &outpos))
goto onError; goto onError;
} }
...@@ -3918,13 +3953,12 @@ utf7Error: ...@@ -3918,13 +3953,12 @@ utf7Error:
if (surrogate || if (surrogate ||
(base64bits >= 6) || (base64bits >= 6) ||
(base64bits > 0 && base64buffer != 0)) { (base64bits > 0 && base64buffer != 0)) {
outpos = p-PyUnicode_AS_UNICODE(unicode);
endinpos = size; endinpos = size;
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, &errorHandler, errors, &errorHandler,
"utf7", "unterminated shift sequence", "utf7", "unterminated shift sequence",
&starts, &e, &startinpos, &endinpos, &exc, &s, &starts, &e, &startinpos, &endinpos, &exc, &s,
&unicode, &outpos, &p)) &unicode, &outpos))
goto onError; goto onError;
if (s < e) if (s < e)
goto restart; goto restart;
...@@ -3934,7 +3968,7 @@ utf7Error: ...@@ -3934,7 +3968,7 @@ utf7Error:
/* return state */ /* return state */
if (consumed) { if (consumed) {
if (inShift) { if (inShift) {
p = shiftOutStart; /* back off output */ outpos = shiftOutStart; /* back off output */
*consumed = startinpos; *consumed = startinpos;
} }
else { else {
...@@ -3942,7 +3976,7 @@ utf7Error: ...@@ -3942,7 +3976,7 @@ utf7Error:
} }
} }
if (PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) if (unicode_resize(&unicode, outpos) < 0)
goto onError; goto onError;
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
...@@ -4208,7 +4242,7 @@ utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size, ...@@ -4208,7 +4242,7 @@ utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
err = 1; err = 1;
break; break;
} }
for (cont = p + 1; cont < (p + n); ++cont) { for (cont = p + 1; cont <= (p + n); ++cont) {
if ((*cont & 0xc0) != 0x80) { if ((*cont & 0xc0) != 0x80) {
err = 1; err = 1;
break; break;
...@@ -4229,19 +4263,23 @@ utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size, ...@@ -4229,19 +4263,23 @@ utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
return max_char; return max_char;
} }
/* Similar to PyUnicode_WRITE but can also write into wstr field /* Similar to PyUnicode_WRITE but may attempt to widen and resize the string
of the legacy unicode representation */ in case of errors. Implicit parameters: unicode, kind, data, has_errors,
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \ onError. Potential resizing overallocates, so the result needs to shrink
do { \ at the end.
const int k_ = (kind); \ */
if (k_ == PyUnicode_WCHAR_KIND) \ #define WRITE_MAYBE_FAIL(index, value) \
((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \ do { \
else if (k_ == PyUnicode_1BYTE_KIND) \ if (has_errors) { \
((unsigned char *)(buf))[(index)] = (unsigned char)(value); \ Py_ssize_t pos = index; \
else if (k_ == PyUnicode_2BYTE_KIND) \ if (pos > PyUnicode_GET_LENGTH(unicode) && \
((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \ unicode_resize(&unicode, pos + pos/8) < 0) \
else \ goto onError; \
((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \ if (unicode_putchar(&unicode, &pos, value) < 0) \
goto onError; \
} \
else \
PyUnicode_WRITE(kind, data, index, value); \
} while (0) } while (0)
PyObject * PyObject *
...@@ -4266,10 +4304,6 @@ PyUnicode_DecodeUTF8Stateful(const char *s, ...@@ -4266,10 +4304,6 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
int kind; int kind;
void *data; void *data;
int has_errors; int has_errors;
Py_UNICODE *error_outptr;
#if SIZEOF_WCHAR_T == 2
Py_ssize_t wchar_offset = 0;
#endif
if (size == 0) { if (size == 0) {
if (consumed) if (consumed)
...@@ -4278,28 +4312,23 @@ PyUnicode_DecodeUTF8Stateful(const char *s, ...@@ -4278,28 +4312,23 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
} }
maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size, maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
consumed, &has_errors); consumed, &has_errors);
if (has_errors) { if (has_errors)
unicode = (PyObject*)_PyUnicode_New(size); /* maxchar and size computation might be incorrect;
if (!unicode) code below widens and resizes as necessary. */
return NULL; unicode = PyUnicode_New(size, 127);
kind = PyUnicode_WCHAR_KIND; else
data = PyUnicode_AS_UNICODE(unicode);
assert(data != NULL);
}
else {
unicode = PyUnicode_New(unicode_size, maxchar); unicode = PyUnicode_New(unicode_size, maxchar);
if (!unicode) if (!unicode)
return NULL; return NULL;
/* When the string is ASCII only, just use memcpy and return. /* When the string is ASCII only, just use memcpy and return.
unicode_size may be != size if there is an incomplete UTF-8 unicode_size may be != size if there is an incomplete UTF-8
sequence at the end of the ASCII block. */ sequence at the end of the ASCII block. */
if (maxchar < 128 && size == unicode_size) { if (!has_errors && maxchar < 128 && size == unicode_size) {
Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size); Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
return unicode; return unicode;
}
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
} }
kind = PyUnicode_KIND(unicode);
data = PyUnicode_DATA(unicode);
/* Unpack UTF-8 encoded data */ /* Unpack UTF-8 encoded data */
i = 0; i = 0;
e = s + size; e = s + size;
...@@ -4327,15 +4356,15 @@ PyUnicode_DecodeUTF8Stateful(const char *s, ...@@ -4327,15 +4356,15 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
unsigned long value = *(unsigned long *) _s; unsigned long value = *(unsigned long *) _s;
if (value & ASCII_CHAR_MASK) if (value & ASCII_CHAR_MASK)
break; break;
WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]); WRITE_MAYBE_FAIL(_i+0, _s[0]);
WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]); WRITE_MAYBE_FAIL(_i+1, _s[1]);
WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]); WRITE_MAYBE_FAIL(_i+2, _s[2]);
WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]); WRITE_MAYBE_FAIL(_i+3, _s[3]);
#if (SIZEOF_LONG == 8) #if (SIZEOF_LONG == 8)
WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]); WRITE_MAYBE_FAIL(_i+4, _s[4]);
WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]); WRITE_MAYBE_FAIL(_i+5, _s[5]);
WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]); WRITE_MAYBE_FAIL(_i+6, _s[6]);
WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]); WRITE_MAYBE_FAIL(_i+7, _s[7]);
#endif #endif
_s += SIZEOF_LONG; _s += SIZEOF_LONG;
_i += SIZEOF_LONG; _i += SIZEOF_LONG;
...@@ -4349,7 +4378,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s, ...@@ -4349,7 +4378,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
} }
if (ch < 0x80) { if (ch < 0x80) {
WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); WRITE_MAYBE_FAIL(i++, ch);
s++; s++;
continue; continue;
} }
...@@ -4392,7 +4421,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s, ...@@ -4392,7 +4421,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
} }
ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f); ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
assert ((ch > 0x007F) && (ch <= 0x07FF)); assert ((ch > 0x007F) && (ch <= 0x07FF));
WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); WRITE_MAYBE_FAIL(i++, ch);
break; break;
case 3: case 3:
...@@ -4421,7 +4450,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s, ...@@ -4421,7 +4450,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
} }
ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f); ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
assert ((ch > 0x07FF) && (ch <= 0xFFFF)); assert ((ch > 0x07FF) && (ch <= 0xFFFF));
WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch); WRITE_MAYBE_FAIL(i++, ch);
break; break;
case 4: case 4:
...@@ -4446,86 +4475,56 @@ PyUnicode_DecodeUTF8Stateful(const char *s, ...@@ -4446,86 +4475,56 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
((s[2] & 0x3f) << 6) + (s[3] & 0x3f); ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
/* If the string is flexible or we have native UCS-4, write WRITE_MAYBE_FAIL(i++, ch);
directly.. */
if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
else {
/* compute and append the two surrogates: */
/* translate from 10000..10FFFF to 0..FFFF */
ch -= 0x10000;
/* high surrogate = top 10 bits added to D800 */
WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
(Py_UNICODE)(0xD800 + (ch >> 10)));
/* low surrogate = bottom 10 bits added to DC00 */
WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
(Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
}
#if SIZEOF_WCHAR_T == 2
wchar_offset++;
#endif
break; break;
} }
s += n; s += n;
continue; continue;
utf8Error: utf8Error:
/* If this is not yet a resizable string, make it one.. */ if (!has_errors) {
if (kind != PyUnicode_WCHAR_KIND) { PyObject *tmp;
const Py_UNICODE *u; Py_ssize_t k;
PyObject *new_unicode = (PyObject*)_PyUnicode_New(size); /* We encountered some error that wasn't detected in the original scan,
if (!new_unicode) e.g. an encoded surrogate character. The original maxchar computation may
have been incorrect, so redo it now. */
for (k = 0, maxchar = 0; k < i; k++)
maxchar = Py_MAX(maxchar, PyUnicode_READ(kind, data, k));
tmp = PyUnicode_New(PyUnicode_GET_LENGTH(unicode), maxchar);
if (tmp == NULL)
goto onError; goto onError;
u = PyUnicode_AsUnicode(unicode); PyUnicode_CopyCharacters(tmp, 0, unicode, 0, i);
if (!u)
goto onError;
#if SIZEOF_WCHAR_T == 2
i += wchar_offset;
#endif
Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
Py_DECREF(unicode); Py_DECREF(unicode);
unicode = new_unicode; unicode = tmp;
kind = 0; has_errors = 1;
data = PyUnicode_AS_UNICODE(new_unicode);
assert(data != NULL);
} }
error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, &errorHandler, errors, &errorHandler,
"utf8", errmsg, "utf8", errmsg,
&starts, &e, &startinpos, &endinpos, &exc, &s, &starts, &e, &startinpos, &endinpos, &exc, &s,
&unicode, &i, &error_outptr)) &unicode, &i))
goto onError; goto onError;
/* Update data because unicode_decode_call_errorhandler might have /* Update data because unicode_decode_call_errorhandler might have
re-created or resized the unicode object. */ re-created or resized the unicode object. */
data = PyUnicode_AS_UNICODE(unicode); data = PyUnicode_DATA(unicode);
kind = PyUnicode_KIND(unicode);
aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK); aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
} }
/* Ensure the unicode_size calculation above was correct: */ /* Ensure the unicode_size calculation above was correct: */
assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size); assert(has_errors || i == unicode_size);
if (consumed) if (consumed)
*consumed = s-starts; *consumed = s-starts;
/* Adjust length and ready string when it contained errors and /* Adjust length and ready string when it contained errors and
is of the old resizable kind. */ is of the old resizable kind. */
if (kind == PyUnicode_WCHAR_KIND) { if (has_errors) {
if (PyUnicode_Resize(&unicode, i) < 0) if (PyUnicode_Resize(&unicode, i) < 0)
goto onError; goto onError;
} }
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_XDECREF(exc); Py_XDECREF(exc);
#ifndef DONT_MAKE_RESULT_READY
if (_PyUnicode_READY_REPLACE(&unicode)) {
Py_DECREF(unicode);
return NULL;
}
#endif
assert(_PyUnicode_CheckConsistency(unicode, 1)); assert(_PyUnicode_CheckConsistency(unicode, 1));
return unicode; return unicode;
...@@ -4536,7 +4535,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s, ...@@ -4536,7 +4535,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
return NULL; return NULL;
} }
#undef WRITE_FLEXIBLE_OR_WSTR #undef WRITE_MAYBE_FAIL
#ifdef __APPLE__ #ifdef __APPLE__
...@@ -4871,13 +4870,6 @@ PyUnicode_DecodeUTF32Stateful(const char *s, ...@@ -4871,13 +4870,6 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
Py_ssize_t endinpos; Py_ssize_t endinpos;
Py_ssize_t outpos; Py_ssize_t outpos;
PyObject *unicode; PyObject *unicode;
Py_UNICODE *p;
#ifndef Py_UNICODE_WIDE
int pairs = 0;
const unsigned char *qq;
#else
const int pairs = 0;
#endif
const unsigned char *q, *e; const unsigned char *q, *e;
int bo = 0; /* assume native ordering by default */ int bo = 0; /* assume native ordering by default */
const char *errmsg = ""; const char *errmsg = "";
...@@ -4941,23 +4933,13 @@ PyUnicode_DecodeUTF32Stateful(const char *s, ...@@ -4941,23 +4933,13 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
iorder[3] = 0; iorder[3] = 0;
} }
/* On narrow builds we split characters outside the BMP into two
codepoints => count how much extra space we need. */
#ifndef Py_UNICODE_WIDE
for (qq = q; qq < e; qq += 4)
if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
pairs++;
#endif
/* This might be one to much, because of a BOM */ /* This might be one to much, because of a BOM */
unicode = (PyObject*)_PyUnicode_New((size+3)/4+pairs); unicode = PyUnicode_New((size+3)/4, 127);
if (!unicode) if (!unicode)
return NULL; return NULL;
if (size == 0) if (size == 0)
return unicode; return unicode;
outpos = 0;
/* Unpack UTF-32 encoded data */
p = PyUnicode_AS_UNICODE(unicode);
while (q < e) { while (q < e) {
Py_UCS4 ch; Py_UCS4 ch;
...@@ -4982,24 +4964,16 @@ PyUnicode_DecodeUTF32Stateful(const char *s, ...@@ -4982,24 +4964,16 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
endinpos = startinpos+4; endinpos = startinpos+4;
goto utf32Error; goto utf32Error;
} }
#ifndef Py_UNICODE_WIDE if (unicode_putchar(&unicode, &outpos, ch) < 0)
if (ch >= 0x10000) goto onError;
{
*p++ = 0xD800 | ((ch-0x10000) >> 10);
*p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
}
else
#endif
*p++ = ch;
q += 4; q += 4;
continue; continue;
utf32Error: utf32Error:
outpos = p-PyUnicode_AS_UNICODE(unicode);
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, &errorHandler, errors, &errorHandler,
"utf32", errmsg, "utf32", errmsg,
&starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q, &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
&unicode, &outpos, &p)) &unicode, &outpos))
goto onError; goto onError;
} }
...@@ -5010,7 +4984,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s, ...@@ -5010,7 +4984,7 @@ PyUnicode_DecodeUTF32Stateful(const char *s,
*consumed = (const char *)q-starts; *consumed = (const char *)q-starts;
/* Adjust length */ /* Adjust length */
if (PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) if (PyUnicode_Resize(&unicode, outpos) < 0)
goto onError; goto onError;
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
...@@ -5171,7 +5145,6 @@ PyUnicode_DecodeUTF16Stateful(const char *s, ...@@ -5171,7 +5145,6 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
Py_ssize_t endinpos; Py_ssize_t endinpos;
Py_ssize_t outpos; Py_ssize_t outpos;
PyObject *unicode; PyObject *unicode;
Py_UNICODE *p;
const unsigned char *q, *e, *aligned_end; const unsigned char *q, *e, *aligned_end;
int bo = 0; /* assume native ordering by default */ int bo = 0; /* assume native ordering by default */
int native_ordering = 0; int native_ordering = 0;
...@@ -5187,14 +5160,13 @@ PyUnicode_DecodeUTF16Stateful(const char *s, ...@@ -5187,14 +5160,13 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
/* Note: size will always be longer than the resulting Unicode /* Note: size will always be longer than the resulting Unicode
character count */ character count */
unicode = (PyObject*)_PyUnicode_New(size); unicode = PyUnicode_New(size, 127);
if (!unicode) if (!unicode)
return NULL; return NULL;
if (size == 0) if (size == 0)
return unicode; return unicode;
outpos = 0;
/* Unpack UTF-16 encoded data */
p = PyUnicode_AS_UNICODE(unicode);
q = (unsigned char *)s; q = (unsigned char *)s;
e = q + size - 1; e = q + size - 1;
...@@ -5254,68 +5226,51 @@ PyUnicode_DecodeUTF16Stateful(const char *s, ...@@ -5254,68 +5226,51 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
if (!((size_t) q & LONG_PTR_MASK)) { if (!((size_t) q & LONG_PTR_MASK)) {
/* Fast path for runs of non-surrogate chars. */ /* Fast path for runs of non-surrogate chars. */
register const unsigned char *_q = q; register const unsigned char *_q = q;
Py_UNICODE *_p = p; int kind = PyUnicode_KIND(unicode);
if (native_ordering) { void *data = PyUnicode_DATA(unicode);
/* Native ordering is simple: as long as the input cannot while (_q < aligned_end) {
possibly contain a surrogate char, do an unrolled copy unsigned long block = * (unsigned long *) _q;
of several 16-bit code points to the target object. unsigned short *pblock = (unsigned short*)&block;
The non-surrogate check is done on several input bytes Py_UCS4 maxch;
at a time (as many as a C 'long' can contain). */ if (native_ordering) {
while (_q < aligned_end) { /* Can use buffer directly */
unsigned long data = * (unsigned long *) _q; if (block & FAST_CHAR_MASK)
if (data & FAST_CHAR_MASK)
break; break;
_p[0] = ((unsigned short *) _q)[0];
_p[1] = ((unsigned short *) _q)[1];
#if (SIZEOF_LONG == 8)
_p[2] = ((unsigned short *) _q)[2];
_p[3] = ((unsigned short *) _q)[3];
#endif
_q += SIZEOF_LONG;
_p += SIZEOF_LONG / 2;
} }
} else {
else { /* Need to byte-swap */
/* Byteswapped ordering is similar, but we must decompose unsigned char *_p = (unsigned char*)pblock;
the copy bytewise, and take care of zero'ing out the if (block & SWAPPED_FAST_CHAR_MASK)
upper bytes if the target object is in 32-bit units
(that is, in UCS-4 builds). */
while (_q < aligned_end) {
unsigned long data = * (unsigned long *) _q;
if (data & SWAPPED_FAST_CHAR_MASK)
break; break;
/* Zero upper bytes in UCS-4 builds */ _p[0] = _q[1];
#if (Py_UNICODE_SIZE > 2) _p[1] = _q[0];
_p[0] = 0; _p[2] = _q[3];
_p[1] = 0; _p[3] = _q[2];
#if (SIZEOF_LONG == 8) #if (SIZEOF_LONG == 8)
_p[2] = 0; _p[4] = _q[5];
_p[3] = 0; _p[5] = _q[4];
_p[6] = _q[7];
_p[7] = _q[6];
#endif #endif
}
maxch = Py_MAX(pblock[0], pblock[1]);
#if SIZEOF_LONG == 8
maxch = Py_MAX(maxch, Py_MAX(pblock[2], pblock[3]));
#endif #endif
/* Issue #4916; UCS-4 builds on big endian machines must if (maxch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
fill the two last bytes of each 4-byte unit. */ if (unicode_widen(&unicode, maxch) < 0)
#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2) goto onError;
# define OFF 2 kind = PyUnicode_KIND(unicode);
#else data = PyUnicode_DATA(unicode);
# define OFF 0
#endif
((unsigned char *) _p)[OFF + 1] = _q[0];
((unsigned char *) _p)[OFF + 0] = _q[1];
((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
#if (SIZEOF_LONG == 8)
((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
#endif
#undef OFF
_q += SIZEOF_LONG;
_p += SIZEOF_LONG / 2;
} }
PyUnicode_WRITE(kind, data, outpos++, pblock[0]);
PyUnicode_WRITE(kind, data, outpos++, pblock[1]);
#if SIZEOF_LONG == 8
PyUnicode_WRITE(kind, data, outpos++, pblock[2]);
PyUnicode_WRITE(kind, data, outpos++, pblock[3]);
#endif
_q += SIZEOF_LONG;
} }
p = _p;
q = _q; q = _q;
if (q >= e) if (q >= e)
break; break;
...@@ -5325,7 +5280,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s, ...@@ -5325,7 +5280,8 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
q += 2; q += 2;
if (ch < 0xD800 || ch > 0xDFFF) { if (ch < 0xD800 || ch > 0xDFFF) {
*p++ = ch; if (unicode_putchar(&unicode, &outpos, ch) < 0)
goto onError;
continue; continue;
} }
...@@ -5340,12 +5296,10 @@ PyUnicode_DecodeUTF16Stateful(const char *s, ...@@ -5340,12 +5296,10 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo]; Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
q += 2; q += 2;
if (0xDC00 <= ch2 && ch2 <= 0xDFFF) { if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
#ifndef Py_UNICODE_WIDE if (unicode_putchar(&unicode, &outpos,
*p++ = ch; (((ch & 0x3FF)<<10) |
*p++ = ch2; (ch2 & 0x3FF)) + 0x10000) < 0)
#else goto onError;
*p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
#endif
continue; continue;
} }
else { else {
...@@ -5362,7 +5316,6 @@ PyUnicode_DecodeUTF16Stateful(const char *s, ...@@ -5362,7 +5316,6 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
/* Fall through to report the error */ /* Fall through to report the error */
utf16Error: utf16Error:
outpos = p - PyUnicode_AS_UNICODE(unicode);
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, errors,
&errorHandler, &errorHandler,
...@@ -5374,8 +5327,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s, ...@@ -5374,8 +5327,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
&exc, &exc,
(const char **)&q, (const char **)&q,
&unicode, &unicode,
&outpos, &outpos))
&p))
goto onError; goto onError;
} }
/* remaining byte at the end? (size should be even) */ /* remaining byte at the end? (size should be even) */
...@@ -5384,7 +5336,6 @@ PyUnicode_DecodeUTF16Stateful(const char *s, ...@@ -5384,7 +5336,6 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
errmsg = "truncated data"; errmsg = "truncated data";
startinpos = ((const char *)q) - starts; startinpos = ((const char *)q) - starts;
endinpos = ((const char *)e) + 1 - starts; endinpos = ((const char *)e) + 1 - starts;
outpos = p - PyUnicode_AS_UNICODE(unicode);
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, errors,
&errorHandler, &errorHandler,
...@@ -5396,8 +5347,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s, ...@@ -5396,8 +5347,7 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
&exc, &exc,
(const char **)&q, (const char **)&q,
&unicode, &unicode,
&outpos, &outpos))
&p))
goto onError; goto onError;
/* The remaining input chars are ignored if the callback /* The remaining input chars are ignored if the callback
chooses to skip the input */ chooses to skip the input */
...@@ -5411,17 +5361,11 @@ PyUnicode_DecodeUTF16Stateful(const char *s, ...@@ -5411,17 +5361,11 @@ PyUnicode_DecodeUTF16Stateful(const char *s,
*consumed = (const char *)q-starts; *consumed = (const char *)q-starts;
/* Adjust length */ /* Adjust length */
if (PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0) if (PyUnicode_Resize(&unicode, outpos) < 0)
goto onError; goto onError;
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_XDECREF(exc); Py_XDECREF(exc);
#ifndef DONT_MAKE_RESULT_READY
if (_PyUnicode_READY_REPLACE(&unicode)) {
Py_DECREF(unicode);
return NULL;
}
#endif
assert(_PyUnicode_CheckConsistency(unicode, 1)); assert(_PyUnicode_CheckConsistency(unicode, 1));
return unicode; return unicode;
...@@ -5613,31 +5557,26 @@ PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -5613,31 +5557,26 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
Py_ssize_t endinpos; Py_ssize_t endinpos;
int j; int j;
PyObject *v; PyObject *v;
Py_UNICODE *p;
const char *end; const char *end;
char* message; char* message;
Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */ Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
PyObject *errorHandler = NULL; PyObject *errorHandler = NULL;
PyObject *exc = NULL; PyObject *exc = NULL;
Py_ssize_t ascii_length; Py_ssize_t len;
Py_ssize_t i; Py_ssize_t i;
int kind;
void *data;
ascii_length = length_of_escaped_ascii_string(s, size); len = length_of_escaped_ascii_string(s, size);
/* After length_of_escaped_ascii_string() there are two alternatives, /* After length_of_escaped_ascii_string() there are two alternatives,
either the string is pure ASCII with named escapes like \n, etc. either the string is pure ASCII with named escapes like \n, etc.
and we determined it's exact size (common case) and we determined it's exact size (common case)
or it contains \x, \u, ... escape sequences. then we create a or it contains \x, \u, ... escape sequences. then we create a
legacy wchar string and resize it at the end of this function. */ legacy wchar string and resize it at the end of this function. */
if (ascii_length >= 0) { if (len >= 0) {
v = PyUnicode_New(ascii_length, 127); v = PyUnicode_New(len, 127);
if (!v) if (!v)
goto onError; goto onError;
assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND); assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
kind = PyUnicode_1BYTE_KIND;
data = PyUnicode_DATA(v);
} }
else { else {
/* Escaped strings will always be longer than the resulting /* Escaped strings will always be longer than the resulting
...@@ -5645,11 +5584,10 @@ PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -5645,11 +5584,10 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
length after conversion to the true value. length after conversion to the true value.
(but if the error callback returns a long replacement string (but if the error callback returns a long replacement string
we'll have to allocate more space) */ we'll have to allocate more space) */
v = (PyObject*)_PyUnicode_New(size); v = PyUnicode_New(size, 127);
if (!v) if (!v)
goto onError; goto onError;
kind = PyUnicode_WCHAR_KIND; len = size;
data = PyUnicode_AS_UNICODE(v);
} }
if (size == 0) if (size == 0)
...@@ -5662,18 +5600,14 @@ PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -5662,18 +5600,14 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
Py_UNICODE x; Py_UNICODE x;
int digits; int digits;
if (kind == PyUnicode_WCHAR_KIND) { /* The only case in which i == ascii_length is a backslash
assert(i < _PyUnicode_WSTR_LENGTH(v)); followed by a newline. */
} assert(i <= len);
else {
/* The only case in which i == ascii_length is a backslash
followed by a newline. */
assert(i <= ascii_length);
}
/* Non-escape characters are interpreted as Unicode ordinals */ /* Non-escape characters are interpreted as Unicode ordinals */
if (*s != '\\') { if (*s != '\\') {
WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++); if (unicode_putchar(&v, &i, (unsigned char) *s++) < 0)
goto onError;
continue; continue;
} }
...@@ -5684,32 +5618,33 @@ PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -5684,32 +5618,33 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
if (s > end) if (s > end)
c = '\0'; /* Invalid after \ */ c = '\0'; /* Invalid after \ */
if (kind == PyUnicode_WCHAR_KIND) { /* The only case in which i == ascii_length is a backslash
assert(i < _PyUnicode_WSTR_LENGTH(v)); followed by a newline. */
} assert(i < len || (i == len && c == '\n'));
else {
/* The only case in which i == ascii_length is a backslash
followed by a newline. */
assert(i < ascii_length || (i == ascii_length && c == '\n'));
}
switch (c) { switch (c) {
/* \x escapes */ /* \x escapes */
#define WRITECHAR(ch) \
do { \
if (unicode_putchar(&v, &i, ch) < 0) \
goto onError; \
}while(0)
case '\n': break; case '\n': break;
case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break; case '\\': WRITECHAR('\\'); break;
case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break; case '\'': WRITECHAR('\''); break;
case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break; case '\"': WRITECHAR('\"'); break;
case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break; case 'b': WRITECHAR('\b'); break;
/* FF */ /* FF */
case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break; case 'f': WRITECHAR('\014'); break;
case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break; case 't': WRITECHAR('\t'); break;
case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break; case 'n': WRITECHAR('\n'); break;
case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break; case 'r': WRITECHAR('\r'); break;
/* VT */ /* VT */
case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break; case 'v': WRITECHAR('\013'); break;
/* BEL, not classic C */ /* BEL, not classic C */
case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break; case 'a': WRITECHAR('\007'); break;
/* \OOO (octal) escapes */ /* \OOO (octal) escapes */
case '0': case '1': case '2': case '3': case '0': case '1': case '2': case '3':
...@@ -5720,7 +5655,7 @@ PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -5720,7 +5655,7 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
if (s < end && '0' <= *s && *s <= '7') if (s < end && '0' <= *s && *s <= '7')
x = (x<<3) + *s++ - '0'; x = (x<<3) + *s++ - '0';
} }
WRITE_WSTR(data, i++, x); WRITECHAR(x);
break; break;
/* hex escapes */ /* hex escapes */
...@@ -5742,30 +5677,27 @@ PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -5742,30 +5677,27 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
message = "truncated \\UXXXXXXXX escape"; message = "truncated \\UXXXXXXXX escape";
hexescape: hexescape:
chr = 0; chr = 0;
p = PyUnicode_AS_UNICODE(v) + i;
if (s+digits>end) { if (s+digits>end) {
endinpos = size; endinpos = size;
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, &errorHandler, errors, &errorHandler,
"unicodeescape", "end of string in escape sequence", "unicodeescape", "end of string in escape sequence",
&starts, &end, &startinpos, &endinpos, &exc, &s, &starts, &end, &startinpos, &endinpos, &exc, &s,
&v, &i, &p)) &v, &i))
goto onError; goto onError;
data = PyUnicode_AS_UNICODE(v);
goto nextByte; goto nextByte;
} }
for (j = 0; j < digits; ++j) { for (j = 0; j < digits; ++j) {
c = (unsigned char) s[j]; c = (unsigned char) s[j];
if (!Py_ISXDIGIT(c)) { if (!Py_ISXDIGIT(c)) {
endinpos = (s+j+1)-starts; endinpos = (s+j+1)-starts;
p = PyUnicode_AS_UNICODE(v) + i;
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, &errorHandler, errors, &errorHandler,
"unicodeescape", message, "unicodeescape", message,
&starts, &end, &startinpos, &endinpos, &exc, &s, &starts, &end, &startinpos, &endinpos, &exc, &s,
&v, &i, &p)) &v, &i))
goto onError; goto onError;
data = PyUnicode_AS_UNICODE(v); len = PyUnicode_GET_LENGTH(v);
goto nextByte; goto nextByte;
} }
chr = (chr<<4) & ~0xF; chr = (chr<<4) & ~0xF;
...@@ -5783,29 +5715,16 @@ PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -5783,29 +5715,16 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
break; break;
store: store:
/* when we get here, chr is a 32-bit unicode character */ /* when we get here, chr is a 32-bit unicode character */
if (chr <= 0xffff) if (chr <= 0x10ffff) {
/* UCS-2 character */ WRITECHAR(chr);
WRITE_WSTR(data, i++, chr);
else if (chr <= 0x10ffff) {
/* UCS-4 character. Either store directly, or as
surrogate pair. */
#ifdef Py_UNICODE_WIDE
WRITE_WSTR(data, i++, chr);
#else
chr -= 0x10000L;
WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
#endif
} else { } else {
endinpos = s-starts; endinpos = s-starts;
p = PyUnicode_AS_UNICODE(v) + i;
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, &errorHandler, errors, &errorHandler,
"unicodeescape", "illegal Unicode character", "unicodeescape", "illegal Unicode character",
&starts, &end, &startinpos, &endinpos, &exc, &s, &starts, &end, &startinpos, &endinpos, &exc, &s,
&v, &i, &p)) &v, &i))
goto onError; goto onError;
data = PyUnicode_AS_UNICODE(v);
} }
break; break;
...@@ -5834,48 +5753,39 @@ PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -5834,48 +5753,39 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
} }
} }
endinpos = s-starts; endinpos = s-starts;
p = PyUnicode_AS_UNICODE(v) + i;
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, &errorHandler, errors, &errorHandler,
"unicodeescape", message, "unicodeescape", message,
&starts, &end, &startinpos, &endinpos, &exc, &s, &starts, &end, &startinpos, &endinpos, &exc, &s,
&v, &i, &p)) &v, &i))
goto onError; goto onError;
data = PyUnicode_AS_UNICODE(v);
break; break;
default: default:
if (s > end) { if (s > end) {
assert(kind == PyUnicode_WCHAR_KIND);
message = "\\ at end of string"; message = "\\ at end of string";
s--; s--;
endinpos = s-starts; endinpos = s-starts;
p = PyUnicode_AS_UNICODE(v) + i;
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, &errorHandler, errors, &errorHandler,
"unicodeescape", message, "unicodeescape", message,
&starts, &end, &startinpos, &endinpos, &exc, &s, &starts, &end, &startinpos, &endinpos, &exc, &s,
&v, &i, &p)) &v, &i))
goto onError; goto onError;
data = PyUnicode_AS_UNICODE(v);
} }
else { else {
WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); WRITECHAR('\\');
WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]); WRITECHAR(s[-1]);
} }
break; break;
} }
nextByte: nextByte:
; ;
} }
/* Ensure the length prediction worked in case of ASCII strings */ #undef WRITECHAR
assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
if (kind == PyUnicode_WCHAR_KIND) if (PyUnicode_Resize(&v, i) < 0)
{ goto onError;
if (PyUnicode_Resize(&v, i) < 0)
goto onError;
}
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_XDECREF(exc); Py_XDECREF(exc);
#ifndef DONT_MAKE_RESULT_READY #ifndef DONT_MAKE_RESULT_READY
...@@ -6081,7 +5991,6 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, ...@@ -6081,7 +5991,6 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
Py_ssize_t endinpos; Py_ssize_t endinpos;
Py_ssize_t outpos; Py_ssize_t outpos;
PyObject *v; PyObject *v;
Py_UNICODE *p;
const char *end; const char *end;
const char *bs; const char *bs;
PyObject *errorHandler = NULL; PyObject *errorHandler = NULL;
...@@ -6091,12 +6000,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, ...@@ -6091,12 +6000,12 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
Unicode string, so we start with size here and then reduce the Unicode string, so we start with size here and then reduce the
length after conversion to the true value. (But decoding error length after conversion to the true value. (But decoding error
handler might have to resize the string) */ handler might have to resize the string) */
v = (PyObject*)_PyUnicode_New(size); v = PyUnicode_New(size, 127);
if (v == NULL) if (v == NULL)
goto onError; goto onError;
if (size == 0) if (size == 0)
return v; return v;
p = PyUnicode_AS_UNICODE(v); outpos = 0;
end = s + size; end = s + size;
while (s < end) { while (s < end) {
unsigned char c; unsigned char c;
...@@ -6106,7 +6015,8 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, ...@@ -6106,7 +6015,8 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
/* Non-escape characters are interpreted as Unicode ordinals */ /* Non-escape characters are interpreted as Unicode ordinals */
if (*s != '\\') { if (*s != '\\') {
*p++ = (unsigned char)*s++; if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
goto onError;
continue; continue;
} }
startinpos = s-starts; startinpos = s-starts;
...@@ -6117,19 +6027,19 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, ...@@ -6117,19 +6027,19 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
for (;s < end;) { for (;s < end;) {
if (*s != '\\') if (*s != '\\')
break; break;
*p++ = (unsigned char)*s++; if (unicode_putchar(&v, &outpos, (unsigned char)*s++) < 0)
goto onError;
} }
if (((s - bs) & 1) == 0 || if (((s - bs) & 1) == 0 ||
s >= end || s >= end ||
(*s != 'u' && *s != 'U')) { (*s != 'u' && *s != 'U')) {
continue; continue;
} }
p--; outpos--;
count = *s=='u' ? 4 : 8; count = *s=='u' ? 4 : 8;
s++; s++;
/* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */ /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
outpos = p-PyUnicode_AS_UNICODE(v);
for (x = 0, i = 0; i < count; ++i, ++s) { for (x = 0, i = 0; i < count; ++i, ++s) {
c = (unsigned char)*s; c = (unsigned char)*s;
if (!Py_ISXDIGIT(c)) { if (!Py_ISXDIGIT(c)) {
...@@ -6138,7 +6048,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, ...@@ -6138,7 +6048,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
errors, &errorHandler, errors, &errorHandler,
"rawunicodeescape", "truncated \\uXXXX", "rawunicodeescape", "truncated \\uXXXX",
&starts, &end, &startinpos, &endinpos, &exc, &s, &starts, &end, &startinpos, &endinpos, &exc, &s,
&v, &outpos, &p)) &v, &outpos))
goto onError; goto onError;
goto nextByte; goto nextByte;
} }
...@@ -6150,42 +6060,25 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, ...@@ -6150,42 +6060,25 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
else else
x += 10 + c - 'A'; x += 10 + c - 'A';
} }
if (x <= 0xffff) if (x <= 0x10ffff) {
/* UCS-2 character */ if (unicode_putchar(&v, &outpos, x) < 0)
*p++ = (Py_UNICODE) x; goto onError;
else if (x <= 0x10ffff) {
/* UCS-4 character. Either store directly, or as
surrogate pair. */
#ifdef Py_UNICODE_WIDE
*p++ = (Py_UNICODE) x;
#else
x -= 0x10000L;
*p++ = 0xD800 + (Py_UNICODE) (x >> 10);
*p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
#endif
} else { } else {
endinpos = s-starts; endinpos = s-starts;
outpos = p-PyUnicode_AS_UNICODE(v);
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, &errorHandler, errors, &errorHandler,
"rawunicodeescape", "\\Uxxxxxxxx out of range", "rawunicodeescape", "\\Uxxxxxxxx out of range",
&starts, &end, &startinpos, &endinpos, &exc, &s, &starts, &end, &startinpos, &endinpos, &exc, &s,
&v, &outpos, &p)) &v, &outpos))
goto onError; goto onError;
} }
nextByte: nextByte:
; ;
} }
if (PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) if (PyUnicode_Resize(&v, outpos) < 0)
goto onError; goto onError;
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_XDECREF(exc); Py_XDECREF(exc);
#ifndef DONT_MAKE_RESULT_READY
if (_PyUnicode_READY_REPLACE(&v)) {
Py_DECREF(v);
return NULL;
}
#endif
assert(_PyUnicode_CheckConsistency(v, 1)); assert(_PyUnicode_CheckConsistency(v, 1));
return v; return v;
...@@ -6311,34 +6204,27 @@ _PyUnicode_DecodeUnicodeInternal(const char *s, ...@@ -6311,34 +6204,27 @@ _PyUnicode_DecodeUnicodeInternal(const char *s,
Py_ssize_t endinpos; Py_ssize_t endinpos;
Py_ssize_t outpos; Py_ssize_t outpos;
PyObject *v; PyObject *v;
Py_UNICODE *p;
const char *end; const char *end;
const char *reason; const char *reason;
PyObject *errorHandler = NULL; PyObject *errorHandler = NULL;
PyObject *exc = NULL; PyObject *exc = NULL;
#ifdef Py_UNICODE_WIDE
Py_UNICODE unimax = PyUnicode_GetMax();
#endif
/* XXX overflow detection missing */ /* XXX overflow detection missing */
v = (PyObject*)_PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE); v = PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE, 127);
if (v == NULL) if (v == NULL)
goto onError; goto onError;
/* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH if (PyUnicode_GET_LENGTH(v) == 0)
as string was created with the old API. */
if (PyUnicode_GET_SIZE(v) == 0)
return v; return v;
p = PyUnicode_AS_UNICODE(v); outpos = 0;
end = s + size; end = s + size;
while (s < end) { while (s < end) {
memcpy(p, s, sizeof(Py_UNICODE)); Py_UCS4 ch = *(Py_UNICODE*)s;
/* We have to sanity check the raw data, otherwise doom looms for /* We have to sanity check the raw data, otherwise doom looms for
some malformed UCS-4 data. */ some malformed UCS-4 data. */
if ( if (
#ifdef Py_UNICODE_WIDE #ifdef Py_UNICODE_WIDE
*p > unimax || *p < 0 || ch > 0x10ffff ||
#endif #endif
end-s < Py_UNICODE_SIZE end-s < Py_UNICODE_SIZE
) )
...@@ -6352,31 +6238,25 @@ _PyUnicode_DecodeUnicodeInternal(const char *s, ...@@ -6352,31 +6238,25 @@ _PyUnicode_DecodeUnicodeInternal(const char *s,
endinpos = s - starts + Py_UNICODE_SIZE; endinpos = s - starts + Py_UNICODE_SIZE;
reason = "illegal code point (> 0x10FFFF)"; reason = "illegal code point (> 0x10FFFF)";
} }
outpos = p - PyUnicode_AS_UNICODE(v);
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, &errorHandler, errors, &errorHandler,
"unicode_internal", reason, "unicode_internal", reason,
&starts, &end, &startinpos, &endinpos, &exc, &s, &starts, &end, &startinpos, &endinpos, &exc, &s,
&v, &outpos, &p)) { &v, &outpos)) {
goto onError; goto onError;
} }
} }
else { else {
p++; if (unicode_putchar(&v, &outpos, ch) < 0)
goto onError;
s += Py_UNICODE_SIZE; s += Py_UNICODE_SIZE;
} }
} }
if (PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0) if (PyUnicode_Resize(&v, outpos) < 0)
goto onError; goto onError;
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_XDECREF(exc); Py_XDECREF(exc);
#ifndef DONT_MAKE_RESULT_READY
if (_PyUnicode_READY_REPLACE(&v)) {
Py_DECREF(v);
return NULL;
}
#endif
assert(_PyUnicode_CheckConsistency(v, 1)); assert(_PyUnicode_CheckConsistency(v, 1));
return v; return v;
...@@ -6749,7 +6629,8 @@ PyUnicode_DecodeASCII(const char *s, ...@@ -6749,7 +6629,8 @@ PyUnicode_DecodeASCII(const char *s,
{ {
const char *starts = s; const char *starts = s;
PyObject *v; PyObject *v;
Py_UNICODE *u; int kind;
void *data;
Py_ssize_t startinpos; Py_ssize_t startinpos;
Py_ssize_t endinpos; Py_ssize_t endinpos;
Py_ssize_t outpos; Py_ssize_t outpos;
...@@ -6797,42 +6678,38 @@ PyUnicode_DecodeASCII(const char *s, ...@@ -6797,42 +6678,38 @@ PyUnicode_DecodeASCII(const char *s,
if (!has_error) if (!has_error)
return unicode_fromascii((const unsigned char *)s, size); return unicode_fromascii((const unsigned char *)s, size);
v = (PyObject*)_PyUnicode_New(size); v = PyUnicode_New(size, 127);
if (v == NULL) if (v == NULL)
goto onError; goto onError;
if (size == 0) if (size == 0)
return v; return v;
u = PyUnicode_AS_UNICODE(v); kind = PyUnicode_KIND(v);
data = PyUnicode_DATA(v);
outpos = 0;
e = s + size; e = s + size;
while (s < e) { while (s < e) {
register unsigned char c = (unsigned char)*s; register unsigned char c = (unsigned char)*s;
if (c < 128) { if (c < 128) {
*u++ = c; PyUnicode_WRITE(kind, data, outpos++, c);
++s; ++s;
} }
else { else {
startinpos = s-starts; startinpos = s-starts;
endinpos = startinpos + 1; endinpos = startinpos + 1;
outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, &errorHandler, errors, &errorHandler,
"ascii", "ordinal not in range(128)", "ascii", "ordinal not in range(128)",
&starts, &e, &startinpos, &endinpos, &exc, &s, &starts, &e, &startinpos, &endinpos, &exc, &s,
&v, &outpos, &u)) &v, &outpos))
goto onError; goto onError;
kind = PyUnicode_KIND(v);
data = PyUnicode_DATA(v);
} }
} }
if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) if (PyUnicode_Resize(&v, outpos) < 0)
if (PyUnicode_Resize(&v, u - PyUnicode_AS_UNICODE(v)) < 0) goto onError;
goto onError;
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_XDECREF(exc); Py_XDECREF(exc);
#ifndef DONT_MAKE_RESULT_READY
if (_PyUnicode_READY_REPLACE(&v)) {
Py_DECREF(v);
return NULL;
}
#endif
assert(_PyUnicode_CheckConsistency(v, 1)); assert(_PyUnicode_CheckConsistency(v, 1));
return v; return v;
...@@ -7648,7 +7525,6 @@ PyUnicode_DecodeCharmap(const char *s, ...@@ -7648,7 +7525,6 @@ PyUnicode_DecodeCharmap(const char *s,
Py_ssize_t outpos; Py_ssize_t outpos;
const char *e; const char *e;
PyObject *v; PyObject *v;
Py_UNICODE *p;
Py_ssize_t extrachars = 0; Py_ssize_t extrachars = 0;
PyObject *errorHandler = NULL; PyObject *errorHandler = NULL;
PyObject *exc = NULL; PyObject *exc = NULL;
...@@ -7659,12 +7535,12 @@ PyUnicode_DecodeCharmap(const char *s, ...@@ -7659,12 +7535,12 @@ PyUnicode_DecodeCharmap(const char *s,
if (mapping == NULL) if (mapping == NULL)
return PyUnicode_DecodeLatin1(s, size, errors); return PyUnicode_DecodeLatin1(s, size, errors);
v = (PyObject*)_PyUnicode_New(size); v = PyUnicode_New(size, 127);
if (v == NULL) if (v == NULL)
goto onError; goto onError;
if (size == 0) if (size == 0)
return v; return v;
p = PyUnicode_AS_UNICODE(v); outpos = 0;
e = s + size; e = s + size;
if (PyUnicode_CheckExact(mapping)) { if (PyUnicode_CheckExact(mapping)) {
mapstring = PyUnicode_AS_UNICODE(mapping); mapstring = PyUnicode_AS_UNICODE(mapping);
...@@ -7678,19 +7554,19 @@ PyUnicode_DecodeCharmap(const char *s, ...@@ -7678,19 +7554,19 @@ PyUnicode_DecodeCharmap(const char *s,
if (x == 0xfffe) { if (x == 0xfffe) {
/* undefined mapping */ /* undefined mapping */
outpos = p-PyUnicode_AS_UNICODE(v);
startinpos = s-starts; startinpos = s-starts;
endinpos = startinpos+1; endinpos = startinpos+1;
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, &errorHandler, errors, &errorHandler,
"charmap", "character maps to <undefined>", "charmap", "character maps to <undefined>",
&starts, &e, &startinpos, &endinpos, &exc, &s, &starts, &e, &startinpos, &endinpos, &exc, &s,
&v, &outpos, &p)) { &v, &outpos)) {
goto onError; goto onError;
} }
continue; continue;
} }
*p++ = x; if (unicode_putchar(&v, &outpos, x) < 0)
goto onError;
++s; ++s;
} }
} }
...@@ -7724,18 +7600,18 @@ PyUnicode_DecodeCharmap(const char *s, ...@@ -7724,18 +7600,18 @@ PyUnicode_DecodeCharmap(const char *s,
Py_DECREF(x); Py_DECREF(x);
goto onError; goto onError;
} }
*p++ = (Py_UNICODE)value; if (unicode_putchar(&v, &outpos, value) < 0)
goto onError;
} }
else if (x == Py_None) { else if (x == Py_None) {
/* undefined mapping */ /* undefined mapping */
outpos = p-PyUnicode_AS_UNICODE(v);
startinpos = s-starts; startinpos = s-starts;
endinpos = startinpos+1; endinpos = startinpos+1;
if (unicode_decode_call_errorhandler( if (unicode_decode_call_errorhandler(
errors, &errorHandler, errors, &errorHandler,
"charmap", "character maps to <undefined>", "charmap", "character maps to <undefined>",
&starts, &e, &startinpos, &endinpos, &exc, &s, &starts, &e, &startinpos, &endinpos, &exc, &s,
&v, &outpos, &p)) { &v, &outpos)) {
Py_DECREF(x); Py_DECREF(x);
goto onError; goto onError;
} }
...@@ -7743,32 +7619,36 @@ PyUnicode_DecodeCharmap(const char *s, ...@@ -7743,32 +7619,36 @@ PyUnicode_DecodeCharmap(const char *s,
continue; continue;
} }
else if (PyUnicode_Check(x)) { else if (PyUnicode_Check(x)) {
Py_ssize_t targetsize = PyUnicode_GET_SIZE(x); Py_ssize_t targetsize;
if (targetsize == 1) if (PyUnicode_READY(x) < 0)
/* 1-1 mapping */ goto onError;
*p++ = *PyUnicode_AS_UNICODE(x); targetsize = PyUnicode_GET_LENGTH(x);
if (targetsize == 1) {
/* 1-1 mapping */
if (unicode_putchar(&v, &outpos,
PyUnicode_READ_CHAR(x, 0)) < 0)
goto onError;
}
else if (targetsize > 1) { else if (targetsize > 1) {
/* 1-n mapping */ /* 1-n mapping */
if (targetsize > extrachars) { if (targetsize > extrachars) {
/* resize first */ /* resize first */
Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
Py_ssize_t needed = (targetsize - extrachars) + \ Py_ssize_t needed = (targetsize - extrachars) + \
(targetsize << 2); (targetsize << 2);
extrachars += needed; extrachars += needed;
/* XXX overflow detection missing */ /* XXX overflow detection missing */
if (PyUnicode_Resize(&v, if (PyUnicode_Resize(&v,
PyUnicode_GET_SIZE(v) + needed) < 0) { PyUnicode_GET_LENGTH(v) + needed) < 0) {
Py_DECREF(x); Py_DECREF(x);
goto onError; goto onError;
} }
p = PyUnicode_AS_UNICODE(v) + oldpos;
} }
Py_UNICODE_COPY(p, if (unicode_widen(&v, PyUnicode_MAX_CHAR_VALUE(x)) < 0)
PyUnicode_AS_UNICODE(x), goto onError;
targetsize); PyUnicode_CopyCharacters(v, outpos, x, 0, targetsize);
p += targetsize; outpos += targetsize;
extrachars -= targetsize; extrachars -= targetsize;
} }
/* 1-0 mapping: skip the character */ /* 1-0 mapping: skip the character */
...@@ -7784,17 +7664,9 @@ PyUnicode_DecodeCharmap(const char *s, ...@@ -7784,17 +7664,9 @@ PyUnicode_DecodeCharmap(const char *s,
++s; ++s;
} }
} }
if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v)) if (PyUnicode_Resize(&v, outpos) < 0)
if (PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
goto onError;
Py_XDECREF(errorHandler); Py_XDECREF(errorHandler);
Py_XDECREF(exc); Py_XDECREF(exc);
#ifndef DONT_MAKE_RESULT_READY
if (_PyUnicode_READY_REPLACE(&v)) {
Py_DECREF(v);
return NULL;
}
#endif
assert(_PyUnicode_CheckConsistency(v, 1)); assert(_PyUnicode_CheckConsistency(v, 1));
return v; return v;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment