Issue #9319: Include the filename in "Non-UTF8 code ..." syntax error.

fe7c5b5b · Victor Stinner · 7f2fee36 · fe7c5b5b · fe7c5b5b · fe7c5b5b
Commit fe7c5b5b authored Apr 05, 2011 by Victor Stinner
6 changed files
--- a/Lib/test/test_imp.py
+++ b/Lib/test/test_imp.py
@@ -58,6 +58,12 @@ class ImportTests(unittest.TestCase):
            with imp.find_module('module_' + mod, self.test_path)[0] as fd:
                self.assertEqual(fd.encoding, encoding)

+        path = [os.path.dirname(__file__)]
+        self.assertRaisesRegex(SyntaxError,
+            r"Non-UTF-8 code starting with '\\xf6'"
+            r" in file .*badsyntax_pep3120.py",
+            imp.find_module, 'badsyntax_pep3120', path)
+
    def test_issue1267(self):
        for mod, encoding, _ in self.test_strings:
            fp, filename, info  = imp.find_module('module_' + mod,

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -10,6 +10,8 @@ What's New in Python 3.3 Alpha 1?
 Core and Builtins
 -----------------

+- Issue #9319: Include the filename in "Non-UTF8 code ..." syntax error.
+
 - Issue #10785: Store the filename as Unicode in the Python parser.

 - Issue #11619: _PyImport_LoadDynamicModule() doesn't encode the path to bytes

--- a/Parser/tokenizer.c
+++ b/Parser/tokenizer.c
@@ -1690,17 +1690,18 @@ PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
    return result;
 }

-/* Get -*- encoding -*- from a Python file.
+/* Get the encoding of a Python file. Check for the coding cookie and check if
+   the file starts with a BOM.

-   PyTokenizer_FindEncoding returns NULL when it can't find the encoding in
-   the first or second line of the file (in which case the encoding
-   should be assumed to be PyUnicode_GetDefaultEncoding()).
+   PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
+   encoding in the first or second line of the file (in which case the encoding
+   should be assumed to be UTF-8).
+
+   The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
+   by the caller. */

-   The char * returned is malloc'ed via PyMem_MALLOC() and thus must be freed
-   by the caller.
-*/
 char *
-PyTokenizer_FindEncoding(int fd)
+PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
 {
    struct tok_state *tok;
    FILE *fp;
@@ -1720,9 +1721,18 @@ PyTokenizer_FindEncoding(int fd)
        return NULL;
    }
 #ifndef PGEN
+    if (filename != NULL) {
+        Py_INCREF(filename);
+        tok->filename = filename;
+    }
+    else {
        tok->filename = PyUnicode_FromString("<string>");
-    if (tok->filename == NULL)
-        goto error;
+        if (tok->filename == NULL) {
+            fclose(fp);
+            PyTokenizer_Free(tok);
+            return encoding;
+        }
+    }
 #endif
    while (tok->lineno < 2 && tok->done == E_OK) {
        PyTokenizer_Get(tok, &p_start, &p_end);
@@ -1733,13 +1743,16 @@ PyTokenizer_FindEncoding(int fd)
        if (encoding)
        strcpy(encoding, tok->encoding);
    }
-#ifndef PGEN
-error:
-#endif
    PyTokenizer_Free(tok);
    return encoding;
 }

+char *
+PyTokenizer_FindEncoding(int fd)
+{
+    return PyTokenizer_FindEncodingFilename(fd, NULL);
+}
+
 #ifdef Py_DEBUG

 void

--- a/Parser/tokenizer.h
+++ b/Parser/tokenizer.h
@@ -75,7 +75,6 @@ extern void PyTokenizer_Free(struct tok_state *);
 extern int PyTokenizer_Get(struct tok_state *, char **, char **);
 extern char * PyTokenizer_RestoreEncoding(struct tok_state* tok,
                                          int len, int *offset);
-extern char * PyTokenizer_FindEncoding(int);

 #ifdef __cplusplus
 }

--- a/Python/import.c
+++ b/Python/import.c
@@ -124,12 +124,12 @@ static const Py_UNICODE PYC_TAG_UNICODE[] = {
 /* See _PyImport_FixupExtensionObject() below */
 static PyObject *extensions = NULL;

+/* Function from Parser/tokenizer.c */
+extern char * PyTokenizer_FindEncodingFilename(int, PyObject *);
+
 /* This table is defined in config.c: */
 extern struct _inittab _PyImport_Inittab[];

-/* Method from Parser/tokenizer.c */
-extern char * PyTokenizer_FindEncoding(int);
-
 struct _inittab *PyImport_Inittab = _PyImport_Inittab;

 /* these tables define the module suffixes that Python recognizes */
@@ -3540,9 +3540,9 @@ call_find_module(PyObject *name, PyObject *path_list)
    }
    if (fd != -1) {
        if (strchr(fdp->mode, 'b') == NULL) {
-            /* PyTokenizer_FindEncoding() returns PyMem_MALLOC'ed
+            /* PyTokenizer_FindEncodingFilename() returns PyMem_MALLOC'ed
               memory. */
-            found_encoding = PyTokenizer_FindEncoding(fd);
+            found_encoding = PyTokenizer_FindEncodingFilename(fd, pathobj);
            lseek(fd, 0, 0); /* Reset position */
            if (found_encoding == NULL && PyErr_Occurred()) {
                Py_XDECREF(pathobj);

--- a/Python/traceback.c
+++ b/Python/traceback.c
@@ -18,8 +18,8 @@
 #define MAX_FRAME_DEPTH 100
 #define MAX_NTHREADS 100

-/* Method from Parser/tokenizer.c */
-extern char * PyTokenizer_FindEncoding(int);
+/* Function from Parser/tokenizer.c */
+extern char * PyTokenizer_FindEncodingFilename(int, PyObject *);

 static PyObject *
 tb_dir(PyTracebackObject *self)
@@ -251,7 +251,7 @@ _Py_DisplaySourceLine(PyObject *f, PyObject *filename, int lineno, int indent)

    /* use the right encoding to decode the file as unicode */
    fd = PyObject_AsFileDescriptor(binary);
-    found_encoding = PyTokenizer_FindEncoding(fd);
+    found_encoding = PyTokenizer_FindEncodingFilename(fd, filename);
    encoding = (found_encoding != NULL) ? found_encoding : "utf-8";
    lseek(fd, 0, 0); /* Reset position */
    fob = PyObject_CallMethod(io, "TextIOWrapper", "Os", binary, encoding);