Commit 8faf8216 authored by Victor Stinner's avatar Victor Stinner

PyUnicode_FromWideChar() and PyUnicode_FromUnicode() raise a ValueError if a

character in not in range [U+0000; U+10ffff].
parent bc9f0c68
...@@ -66,6 +66,9 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ...@@ -66,6 +66,9 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
extern "C" { extern "C" {
#endif #endif
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff
#ifdef Py_DEBUG #ifdef Py_DEBUG
# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0) # define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else #else
...@@ -393,9 +396,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content) ...@@ -393,9 +396,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
} }
else { else {
assert(maxchar >= 0x10000); assert(maxchar >= 0x10000);
/* FIXME: Issue #13441: on Solaris, localeconv() and strxfrm() assert(maxchar <= MAX_UNICODE);
return characters outside the range U+0000-U+10FFFF. */
/* assert(maxchar <= 0x10FFFF); */
} }
} }
return 1; return 1;
...@@ -1295,36 +1296,37 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end, ...@@ -1295,36 +1296,37 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
Py_UCS4 *maxchar, Py_ssize_t *num_surrogates) Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
{ {
const wchar_t *iter; const wchar_t *iter;
Py_UCS4 ch;
assert(num_surrogates != NULL && maxchar != NULL); assert(num_surrogates != NULL && maxchar != NULL);
*num_surrogates = 0; *num_surrogates = 0;
*maxchar = 0; *maxchar = 0;
for (iter = begin; iter < end; ) { for (iter = begin; iter < end; ) {
if (*iter > *maxchar) {
*maxchar = *iter;
#if SIZEOF_WCHAR_T != 2
if (*maxchar >= 0x10000)
return 0;
#endif
}
#if SIZEOF_WCHAR_T == 2 #if SIZEOF_WCHAR_T == 2
if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0]) if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
&& (iter+1) < end && (iter+1) < end
&& Py_UNICODE_IS_LOW_SURROGATE(iter[1])) && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
{ {
Py_UCS4 surrogate_val; ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
surrogate_val = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
++(*num_surrogates); ++(*num_surrogates);
if (surrogate_val > *maxchar)
*maxchar = surrogate_val;
iter += 2; iter += 2;
} }
else else
iter++;
#else
iter++;
#endif #endif
{
ch = *iter;
iter++;
}
if (ch > *maxchar) {
*maxchar = ch;
if (*maxchar > MAX_UNICODE) {
PyErr_Format(PyExc_ValueError,
"character U+%x is not in range [U+0000; U+10ffff]",
ch);
return -1;
}
}
} }
return 0; return 0;
} }
...@@ -1669,8 +1671,7 @@ PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size) ...@@ -1669,8 +1671,7 @@ PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
&maxchar, &num_surrogates) == -1) &maxchar, &num_surrogates) == -1)
return NULL; return NULL;
unicode = PyUnicode_New(size - num_surrogates, unicode = PyUnicode_New(size - num_surrogates, maxchar);
maxchar);
if (!unicode) if (!unicode)
return NULL; return NULL;
...@@ -1808,7 +1809,7 @@ kind_maxchar_limit(unsigned int kind) ...@@ -1808,7 +1809,7 @@ kind_maxchar_limit(unsigned int kind)
return 0x10000; return 0x10000;
default: default:
assert(0 && "invalid kind"); assert(0 && "invalid kind");
return 0x10ffff; return MAX_UNICODE;
} }
} }
...@@ -2796,7 +2797,7 @@ PyObject * ...@@ -2796,7 +2797,7 @@ PyObject *
PyUnicode_FromOrdinal(int ordinal) PyUnicode_FromOrdinal(int ordinal)
{ {
PyObject *v; PyObject *v;
if (ordinal < 0 || ordinal > 0x10ffff) { if (ordinal < 0 || ordinal > MAX_UNICODE) {
PyErr_SetString(PyExc_ValueError, PyErr_SetString(PyExc_ValueError,
"chr() arg not in range(0x110000)"); "chr() arg not in range(0x110000)");
return NULL; return NULL;
...@@ -3472,7 +3473,7 @@ PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size) ...@@ -3472,7 +3473,7 @@ PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
four_bytes = PyUnicode_4BYTE_DATA(unicode); four_bytes = PyUnicode_4BYTE_DATA(unicode);
for (; four_bytes < ucs4_end; ++four_bytes, ++w) { for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
if (*four_bytes > 0xFFFF) { if (*four_bytes > 0xFFFF) {
assert(*four_bytes <= 0x10FFFF); assert(*four_bytes <= MAX_UNICODE);
/* encode surrogate pair in this case */ /* encode surrogate pair in this case */
*w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes); *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
*w = Py_UNICODE_LOW_SURROGATE(*four_bytes); *w = Py_UNICODE_LOW_SURROGATE(*four_bytes);
...@@ -4118,7 +4119,7 @@ _PyUnicode_EncodeUTF7(PyObject *str, ...@@ -4118,7 +4119,7 @@ _PyUnicode_EncodeUTF7(PyObject *str,
continue; continue;
encode_char: encode_char:
if (ch >= 0x10000) { if (ch >= 0x10000) {
assert(ch <= 0x10FFFF); assert(ch <= MAX_UNICODE);
/* code first surrogate */ /* code first surrogate */
base64bits += 16; base64bits += 16;
...@@ -4577,7 +4578,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s, ...@@ -4577,7 +4578,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
} }
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
((s[2] & 0x3f) << 6) + (s[3] & 0x3f); ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
WRITE_MAYBE_FAIL(i++, ch); WRITE_MAYBE_FAIL(i++, ch);
break; break;
...@@ -4714,7 +4715,7 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size) ...@@ -4714,7 +4715,7 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
} }
ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) + ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
((s[2] & 0x3f) << 6) + (s[3] & 0x3f); ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
assert ((ch > 0xFFFF) && (ch <= 0x10ffff)); assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));
#if SIZEOF_WCHAR_T == 4 #if SIZEOF_WCHAR_T == 4
*p++ = (wchar_t)ch; *p++ = (wchar_t)ch;
...@@ -4884,7 +4885,7 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors) ...@@ -4884,7 +4885,7 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
*p++ = (char)(0x80 | ((ch >> 6) & 0x3f)); *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
*p++ = (char)(0x80 | (ch & 0x3f)); *p++ = (char)(0x80 | (ch & 0x3f));
} else /* ch >= 0x10000 */ { } else /* ch >= 0x10000 */ {
assert(ch <= 0x10FFFF); assert(ch <= MAX_UNICODE);
/* Encode UCS4 Unicode ordinals */ /* Encode UCS4 Unicode ordinals */
*p++ = (char)(0xf0 | (ch >> 18)); *p++ = (char)(0xf0 | (ch >> 18));
*p++ = (char)(0x80 | ((ch >> 12) & 0x3f)); *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
...@@ -5792,7 +5793,7 @@ PyUnicode_DecodeUnicodeEscape(const char *s, ...@@ -5792,7 +5793,7 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
break; break;
store: store:
/* when we get here, chr is a 32-bit unicode character */ /* when we get here, chr is a 32-bit unicode character */
if (chr <= 0x10ffff) { if (chr <= MAX_UNICODE) {
WRITECHAR(chr); WRITECHAR(chr);
} else { } else {
endinpos = s-starts; endinpos = s-starts;
...@@ -5957,7 +5958,7 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode) ...@@ -5957,7 +5958,7 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
/* Map 21-bit characters to '\U00xxxxxx' */ /* Map 21-bit characters to '\U00xxxxxx' */
else if (ch >= 0x10000) { else if (ch >= 0x10000) {
assert(ch <= 0x10FFFF); assert(ch <= MAX_UNICODE);
*p++ = '\\'; *p++ = '\\';
*p++ = 'U'; *p++ = 'U';
*p++ = Py_hexdigits[(ch >> 28) & 0x0000000F]; *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
...@@ -6108,7 +6109,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s, ...@@ -6108,7 +6109,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
else else
x += 10 + c - 'A'; x += 10 + c - 'A';
} }
if (x <= 0x10ffff) { if (x <= MAX_UNICODE) {
if (unicode_putchar(&v, &outpos, x) < 0) if (unicode_putchar(&v, &outpos, x) < 0)
goto onError; goto onError;
} else { } else {
...@@ -6175,7 +6176,7 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode) ...@@ -6175,7 +6176,7 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Py_UCS4 ch = PyUnicode_READ(kind, data, pos); Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
/* Map 32-bit characters to '\Uxxxxxxxx' */ /* Map 32-bit characters to '\Uxxxxxxxx' */
if (ch >= 0x10000) { if (ch >= 0x10000) {
assert(ch <= 0x10FFFF); assert(ch <= MAX_UNICODE);
*p++ = '\\'; *p++ = '\\';
*p++ = 'U'; *p++ = 'U';
*p++ = Py_hexdigits[(ch >> 28) & 0xf]; *p++ = Py_hexdigits[(ch >> 28) & 0xf];
...@@ -6536,7 +6537,7 @@ unicode_encode_ucs1(PyObject *unicode, ...@@ -6536,7 +6537,7 @@ unicode_encode_ucs1(PyObject *unicode,
else if (ch < 1000000) else if (ch < 1000000)
repsize += 2+6+1; repsize += 2+6+1;
else { else {
assert(ch <= 0x10FFFF); assert(ch <= MAX_UNICODE);
repsize += 2+7+1; repsize += 2+7+1;
} }
} }
...@@ -9275,7 +9276,7 @@ fixup(PyObject *self, ...@@ -9275,7 +9276,7 @@ fixup(PyObject *self,
else if (maxchar_new <= 65535) else if (maxchar_new <= 65535)
maxchar_new = 65535; maxchar_new = 65535;
else else
maxchar_new = 1114111; /* 0x10ffff */ maxchar_new = MAX_UNICODE;
if (!maxchar_new && PyUnicode_CheckExact(self)) { if (!maxchar_new && PyUnicode_CheckExact(self)) {
/* fixfct should return TRUE if it modified the buffer. If /* fixfct should return TRUE if it modified the buffer. If
...@@ -13059,7 +13060,7 @@ formatchar(PyObject *v) ...@@ -13059,7 +13060,7 @@ formatchar(PyObject *v)
if (x == -1 && PyErr_Occurred()) if (x == -1 && PyErr_Occurred())
goto onError; goto onError;
if (x < 0 || x > 0x10ffff) { if (x < 0 || x > MAX_UNICODE) {
PyErr_SetString(PyExc_OverflowError, PyErr_SetString(PyExc_OverflowError,
"%c arg not in range(0x110000)"); "%c arg not in range(0x110000)");
return (Py_UCS4) -1; return (Py_UCS4) -1;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment