PyUnicode_FromWideChar() and PyUnicode_FromUnicode() raise a ValueError if a

character in not in range [U+0000; U+10ffff].

PyUnicode_FromWideChar() and PyUnicode_FromUnicode() raise a ValueError if a
character in not in range [U+0000; U+10ffff].
8faf8216 · Victor Stinner · bc9f0c68 · 8faf8216
Commit 8faf8216 authored Dec 08, 2011 by Victor Stinner
Show whitespace changes
Inline Side-by-side

Showing with 34 additions and 33 deletions

Objects/unicodeobject.c Objects/unicodeobject.c +34 -33

No files found.
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -66,6 +66,9 @@ OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 extern "C" {
 #endif

+/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
+#define MAX_UNICODE 0x10ffff
+
 #ifdef Py_DEBUG
 #  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
 #else
@@ -393,9 +396,7 @@ _PyUnicode_CheckConsistency(PyObject *op, int check_content)
        }
        else {
            assert(maxchar >= 0x10000);
-            /* FIXME: Issue #13441: on Solaris, localeconv() and strxfrm()
-               return characters outside the range U+0000-U+10FFFF. */
-            /* assert(maxchar <= 0x10FFFF); */
+            assert(maxchar <= MAX_UNICODE);
        }
    }
    return 1;
@@ -1295,36 +1296,37 @@ find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
 {
    const wchar_t *iter;
+    Py_UCS4 ch;

    assert(num_surrogates != NULL && maxchar != NULL);
    *num_surrogates = 0;
    *maxchar = 0;

    for (iter = begin; iter < end; ) {
-        if (*iter > *maxchar) {
-            *maxchar = *iter;
-#if SIZEOF_WCHAR_T != 2
-            if (*maxchar >= 0x10000)
-                return 0;
-#endif
-        }
 #if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
            && (iter+1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
        {
-            Py_UCS4 surrogate_val;
-            surrogate_val = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
+            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
            ++(*num_surrogates);
-            if (surrogate_val > *maxchar)
-                *maxchar = surrogate_val;
            iter += 2;
        }
        else
-            iter++;
-#else
-        iter++;
 #endif
+        {
+            ch = *iter;
+            iter++;
+        }
+        if (ch > *maxchar) {
+            *maxchar = ch;
+            if (*maxchar > MAX_UNICODE) {
+                PyErr_Format(PyExc_ValueError,
+                             "character U+%x is not in range [U+0000; U+10ffff]",
+                             ch);
+                return -1;
+            }
+        }
    }
    return 0;
 }
@@ -1669,8 +1671,7 @@ PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
                                &maxchar, &num_surrogates) == -1)
        return NULL;

-    unicode = PyUnicode_New(size - num_surrogates,
-                                                maxchar);
+    unicode = PyUnicode_New(size - num_surrogates, maxchar);
    if (!unicode)
        return NULL;

@@ -1808,7 +1809,7 @@ kind_maxchar_limit(unsigned int kind)
        return 0x10000;
    default:
        assert(0 && "invalid kind");
-        return 0x10ffff;
+        return MAX_UNICODE;
    }
 }

@@ -2796,7 +2797,7 @@ PyObject *
 PyUnicode_FromOrdinal(int ordinal)
 {
    PyObject *v;
-    if (ordinal < 0 || ordinal > 0x10ffff) {
+    if (ordinal < 0 || ordinal > MAX_UNICODE) {
        PyErr_SetString(PyExc_ValueError,
                        "chr() arg not in range(0x110000)");
        return NULL;
@@ -3472,7 +3473,7 @@ PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
-                    assert(*four_bytes <= 0x10FFFF);
+                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
@@ -4118,7 +4119,7 @@ _PyUnicode_EncodeUTF7(PyObject *str,
        continue;
 encode_char:
        if (ch >= 0x10000) {
-            assert(ch <= 0x10FFFF);
+            assert(ch <= MAX_UNICODE);

            /* code first surrogate */
            base64bits += 16;
@@ -4577,7 +4578,7 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
-            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
+            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

            WRITE_MAYBE_FAIL(i++, ch);
            break;
@@ -4714,7 +4715,7 @@ _Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
-            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
+            assert ((ch > 0xFFFF) && (ch <= MAX_UNICODE));

 #if SIZEOF_WCHAR_T == 4
            *p++ = (wchar_t)ch;
@@ -4884,7 +4885,7 @@ _PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        } else /* ch >= 0x10000 */ {
-            assert(ch <= 0x10FFFF);
+            assert(ch <= MAX_UNICODE);
            /* Encode UCS4 Unicode ordinals */
            *p++ = (char)(0xf0 | (ch >> 18));
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
@@ -5792,7 +5793,7 @@ PyUnicode_DecodeUnicodeEscape(const char *s,
                break;
        store:
            /* when we get here, chr is a 32-bit unicode character */
-            if (chr <= 0x10ffff) {
+            if (chr <= MAX_UNICODE) {
                WRITECHAR(chr);
            } else {
                endinpos = s-starts;
@@ -5957,7 +5958,7 @@ PyUnicode_AsUnicodeEscapeString(PyObject *unicode)

        /* Map 21-bit characters to '\U00xxxxxx' */
        else if (ch >= 0x10000) {
-            assert(ch <= 0x10FFFF);
+            assert(ch <= MAX_UNICODE);
            *p++ = '\\';
            *p++ = 'U';
            *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
@@ -6108,7 +6109,7 @@ PyUnicode_DecodeRawUnicodeEscape(const char *s,
            else
                x += 10 + c - 'A';
        }
-        if (x <= 0x10ffff) {
+        if (x <= MAX_UNICODE) {
            if (unicode_putchar(&v, &outpos, x) < 0)
                goto onError;
        } else {
@@ -6175,7 +6176,7 @@ PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
        /* Map 32-bit characters to '\Uxxxxxxxx' */
        if (ch >= 0x10000) {
-            assert(ch <= 0x10FFFF);
+            assert(ch <= MAX_UNICODE);
            *p++ = '\\';
            *p++ = 'U';
            *p++ = Py_hexdigits[(ch >> 28) & 0xf];
@@ -6536,7 +6537,7 @@ unicode_encode_ucs1(PyObject *unicode,
                    else if (ch < 1000000)
                        repsize += 2+6+1;
                    else {
-                        assert(ch <= 0x10FFFF);
+                        assert(ch <= MAX_UNICODE);
                        repsize += 2+7+1;
                    }
                }
@@ -9275,7 +9276,7 @@ fixup(PyObject *self,
    else if (maxchar_new <= 65535)
        maxchar_new = 65535;
    else
-        maxchar_new = 1114111; /* 0x10ffff */
+        maxchar_new = MAX_UNICODE;

    if (!maxchar_new && PyUnicode_CheckExact(self)) {
        /* fixfct should return TRUE if it modified the buffer. If
@@ -13059,7 +13060,7 @@ formatchar(PyObject *v)
        if (x == -1 && PyErr_Occurred())
            goto onError;

-        if (x < 0 || x > 0x10ffff) {
+        if (x < 0 || x > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "%c arg not in range(0x110000)");
            return (Py_UCS4) -1;