Issue #17043: The unicode-internal decoder no longer reads past the end of
the input buffer.

Issue #17043: The unicode-internal decoder no longer reads past the end of
the input buffer.
03ee12ed · Serhiy Storchaka · cf0904ff · 3fd4ab35 · 03ee12ed · 03ee12ed
Commit 03ee12ed authored Feb 07, 2013 by Serhiy Storchaka
Show whitespace changes
Inline Side-by-side

Showing with 25 additions and 26 deletions

Misc/NEWS Misc/NEWS +3 -0

Objects/unicodeobject.c Objects/unicodeobject.c +22 -26

No files found.
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -12,6 +12,9 @@ What's New in Python 3.3.1?
 Core and Builtins
 -----------------

+- Issue #17043: The unicode-internal decoder no longer read past the end of
+  input buffer.
+
 - Issue #17098: All modules now have __loader__ set even if they pre-exist the
  bootstrapping of importlib.


--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -6103,6 +6103,11 @@ _PyUnicode_DecodeUnicodeInternal(const char *s,
    while (s < end) {
        Py_UNICODE uch;
        Py_UCS4 ch;
+        if (end - s < Py_UNICODE_SIZE) {
+            endinpos = end-starts;
+            reason = "truncated input";
+            goto error;
+        }
        /* We copy the raw representation one byte at a time because the
           pointer may be unaligned (see test_codeccallbacks). */
        ((char *) &uch)[0] = s[0];
@@ -6112,37 +6117,18 @@ _PyUnicode_DecodeUnicodeInternal(const char *s,
        ((char *) &uch)[3] = s[3];
 #endif
        ch = uch;
-
+#ifdef Py_UNICODE_WIDE
        /* We have to sanity check the raw data, otherwise doom looms for
           some malformed UCS-4 data. */
-        if (
-#ifdef Py_UNICODE_WIDE
-            ch > 0x10ffff ||
-#endif
-            end-s < Py_UNICODE_SIZE
-            )
-        {
-            startinpos = s - starts;
-            if (end-s < Py_UNICODE_SIZE) {
-                endinpos = end-starts;
-                reason = "truncated input";
-            }
-            else {
+        if (ch > 0x10ffff) {
            endinpos = s - starts + Py_UNICODE_SIZE;
            reason = "illegal code point (> 0x10FFFF)";
+            goto error;
        }
-            if (unicode_decode_call_errorhandler(
-                    errors, &errorHandler,
-                    "unicode_internal", reason,
-                    &starts, &end, &startinpos, &endinpos, &exc, &s,
-                    &v, &outpos))
-                goto onError;
-            continue;
-        }
-
+#endif
        s += Py_UNICODE_SIZE;
 #ifndef Py_UNICODE_WIDE
-        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
+        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
        {
            Py_UNICODE uch2;
            ((char *) &uch2)[0] = s[0];
@@ -6157,6 +6143,16 @@ _PyUnicode_DecodeUnicodeInternal(const char *s,

        if (unicode_putchar(&v, &outpos, ch) < 0)
            goto onError;
+        continue;
+
+  error:
+        startinpos = s - starts;
+        if (unicode_decode_call_errorhandler(
+                errors, &errorHandler,
+                "unicode_internal", reason,
+                &starts, &end, &startinpos, &endinpos, &exc, &s,
+                &v, &outpos))
+            goto onError;
    }

    if (unicode_resize(&v, outpos) < 0)