cpython
Commit 7e2b870b
authored Nov 14, 2015 by Serhiy Storchaka

Issue #25388: Fixed tokenizer crash when processing undecodable source code
with a null byte.

Parents: 2463001a, 0d441119

Showing 3 changed files with 19 additions and 8 deletions (+19 -8):

    Lib/test/test_compile.py   +10  -0
    Misc/NEWS                   +3  -0
    Parser/tokenizer.c          +6  -8
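
For context, a minimal reproduction sketch of the bug this commit fixes, written against the CPython embedding API. It is illustrative only and not part of the commit: the file name bad.py, the build line, and the choice of PyRun_SimpleFileEx() are assumptions; the committed test below exercises the same path by running the interpreter on such a file in a subprocess. The source bytes are both undecodable as UTF-8 (the \xfd byte) and contain a null byte; on an unfixed interpreter this could crash the tokenizer, while a fixed one fails cleanly with a SyntaxError mentioning "Non-UTF-8".

    /* repro.c -- hypothetical repro sketch for Issue #25388 (not part of
     * this commit).  Build against libpython, e.g.: cc repro.c -lpython3.x */
    #include <Python.h>
    #include <stdio.h>

    int main(void)
    {
        /* Source that is undecodable as UTF-8 and contains a null byte. */
        static const char src[] = "#\x00\n#\xfd\n";
        FILE *fp = fopen("bad.py", "wb");
        if (fp == NULL)
            return 1;
        fwrite(src, 1, sizeof src - 1, fp);  /* sizeof-1 drops C's terminator */
        fclose(fp);

        Py_Initialize();
        fp = fopen("bad.py", "rb");
        if (fp != NULL) {
            /* Tokenizes and runs the file: before the fix the tokenizer
             * could crash here; afterwards it reports the SyntaxError. */
            PyRun_SimpleFileEx(fp, "bad.py", 1 /* close fp when done */);
        }
        Py_Finalize();
        return 0;
    }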
Lib/test/test_compile.py
@@ -516,6 +516,16 @@ if 1:
             res = script_helper.run_python_until_end(fn)[0]
         self.assertIn(b"Non-UTF-8", res.err)
 
+    def test_yet_more_evil_still_undecodable(self):
+        # Issue #25388
+        src = b"#\x00\n#\xfd\n"
+        with tempfile.TemporaryDirectory() as tmpd:
+            fn = os.path.join(tmpd, "bad.py")
+            with open(fn, "wb") as fp:
+                fp.write(src)
+            res = script_helper.run_python_until_end(fn)[0]
+            self.assertIn(b"Non-UTF-8", res.err)
+
     @support.cpython_only
     def test_compiler_recursion_limit(self):
         # Expected limit is sys.getrecursionlimit() * the scaling factor
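
A note on the helper: test.support.script_helper.run_python_until_end() launches a fresh interpreter on the given script in a subprocess and returns the run result together with the command line used; the test takes element [0] and inspects its err attribute, the captured stderr, for the "Non-UTF-8" SyntaxError text. Running the crash-prone input in a child process keeps a regression from taking the test runner down with it.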
Misc/NEWS
@@ -11,6 +11,9 @@ Release date: TBA
 Core and Builtins
 -----------------
 
+- Issue #25388: Fixed tokenizer crash when processing undecodable source code
+  with a null byte.
+
 - Issue #25462: The hash of the key now is calculated only once in most
   operations in C implementation of OrderedDict.
Parser/tokenizer.c
@@ -196,7 +196,8 @@ error_ret(struct tok_state *tok) /* XXX */
     tok->decoding_erred = 1;
     if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
         PyMem_FREE(tok->buf);
-    tok->buf = NULL;
+    tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
+    tok->done = E_DECODE;
     return NULL;                /* as if it were EOF */
 }
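
This hunk is the heart of the fix: error_ret() freed tok->buf but reset only tok->buf itself, leaving tok->cur, tok->end, tok->inp and tok->start dangling into the freed block, and it recorded no error state for the caller. The sketch below is illustrative code, not from CPython (struct cursor and cursor_fail are invented names); it shows the same owning-pointer-plus-aliases pattern and the reset-everything idiom the fix adopts.

    /* Illustrative sketch of the dangling-alias hazard fixed above. */
    #include <stdlib.h>

    struct cursor {
        char *buf;   /* owning pointer to the line buffer */
        char *cur;   /* read position, aliases into buf */
        char *end;   /* one past the last valid byte, aliases into buf */
    };

    static void cursor_fail(struct cursor *c)
    {
        free(c->buf);
        /* The old bug pattern: resetting only c->buf leaves c->cur and
         * c->end dangling into freed memory.  Reset every alias together,
         * as error_ret() now does for tok->buf/cur/end/inp/start. */
        c->buf = c->cur = c->end = NULL;
    }

    int main(void)
    {
        struct cursor c = { malloc(16), NULL, NULL };
        c.cur = c.buf;
        c.end = c.buf + 16;
        cursor_fail(&c);   /* all three pointers are now safely NULL */
        return 0;
    }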
@@ -952,11 +953,6 @@ tok_nextc(struct tok_state *tok)
         }
         buflen = PyBytes_GET_SIZE(u);
         buf = PyBytes_AS_STRING(u);
-        if (!buf) {
-            Py_DECREF(u);
-            tok->done = E_DECODE;
-            return EOF;
-        }
         newtok = PyMem_MALLOC(buflen+1);
         strcpy(newtok, buf);
         Py_DECREF(u);
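
PyBytes_AS_STRING() is the unchecked macro form of PyBytes_AsString(): it reaches directly into the bytes object's character storage and cannot return NULL for a valid bytes object, so the deleted NULL check was unreachable dead code rather than a behavior change.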
@@ -998,7 +994,6 @@ tok_nextc(struct tok_state *tok)
             if (tok->buf != NULL)
                 PyMem_FREE(tok->buf);
             tok->buf = newtok;
-            tok->line_start = tok->buf;
             tok->cur = tok->buf;
             tok->line_start = tok->buf;
             tok->inp = strchr(tok->buf, '\0');
@@ -1021,7 +1016,8 @@ tok_nextc(struct tok_state *tok)
         }
         if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
                   tok) == NULL) {
-            tok->done = E_EOF;
+            if (!tok->decoding_erred)
+                tok->done = E_EOF;
             done = 1;
         }
         else {
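
decoding_fgets() returns NULL both at real end-of-file and after a decoding failure (the latter goes through error_ret(), which now records E_DECODE and clears the buffer pointers). Unconditionally overwriting tok->done with E_EOF therefore masked the decode error; the added guard keeps E_DECODE intact, so the caller raises the "Non-UTF-8" SyntaxError instead of continuing as if input had simply ended.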
@@ -1055,6 +1051,8 @@ tok_nextc(struct tok_state *tok)
                 return EOF;
             }
             tok->buf = newbuf;
+            tok->cur = tok->buf + cur;
+            tok->line_start = tok->cur;
             tok->inp = tok->buf + curvalid;
             tok->end = tok->buf + newsize;
             tok->start = curstart < 0 ? NULL :
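
PyMem_REALLOC() may move the block. The surrounding code already rebased tok->inp, tok->end and tok->start onto the new buffer, but tok->cur and tok->line_start were left pointing into the old allocation; the two added lines rebase them the same way, using the offset cur saved before the reallocation.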