Merge pull request #476 from undingen/coding

Add support for specifying the encoding inside source files.

Merge pull request #476 from undingen/coding
Add support for specifying the encoding inside source files.
12abc42b · Kevin Modzelewski · 039ea8f2 · cea982c5 · 12abc42b · 91428d4e
Commit 12abc42b authored May 11, 2015 by Kevin Modzelewski
9 changed files
--- a/from_cpython/Include/fileobject.h
+++ b/from_cpython/Include/fileobject.h
@@ -51,7 +51,7 @@ PyAPI_FUNC(PyObject *) PyFile_FromString(char *, char *) PYSTON_NOEXCEPT;
 PyAPI_FUNC(void) PyFile_SetBufSize(PyObject *, int) PYSTON_NOEXCEPT;
 PyAPI_FUNC(int) PyFile_SetEncoding(PyObject *, const char *) PYSTON_NOEXCEPT;
 PyAPI_FUNC(int) PyFile_SetEncodingAndErrors(PyObject *, const char *, char *errors) PYSTON_NOEXCEPT;
-PyAPI_FUNC(PyObject *) PyFile_FromFile(FILE *, char *, char *,
+PyAPI_FUNC(PyObject *) PyFile_FromFile(FILE *, const char *, const char *,
                                             int (*)(FILE *)) PYSTON_NOEXCEPT;
 PyAPI_FUNC(FILE *) PyFile_AsFile(PyObject *) PYSTON_NOEXCEPT;
 PyAPI_FUNC(void) PyFile_IncUseCount(PyFileObject *) PYSTON_NOEXCEPT;

--- a/libpypa @ 91428d4e
+++ b/libpypa @ 91428d4e
-Subproject commit 51f55dd4068f116ca287369cedddce624de73da6
+Subproject commit 91428d4e7a72c53a058682f3a8d1993a90efdccd
--- a/src/codegen/parser.cpp
+++ b/src/codegen/parser.cpp
@@ -973,6 +973,7 @@ AST_Module* parse_string(const char* code) {

    FILE* f = fopen(tmp.c_str(), "w");
    fwrite(code, 1, size, f);
+    fputc('\n', f);
    fclose(f);

    AST_Module* m = parse_file(tmp.c_str());

--- a/src/codegen/pypa-parser.cpp
+++ b/src/codegen/pypa-parser.cpp
@@ -32,6 +32,7 @@
 #include "core/stats.h"
 #include "core/types.h"
 #include "core/util.h"
+#include "gc/collector.h"
 #include "runtime/capi.h"
 #include "runtime/objmodel.h"
 #include "runtime/types.h"
@@ -839,29 +840,256 @@ void pypaErrorHandler(pypa::Error e) {
    }
 }

-pypa::String pypaUnicodeEscapeDecoder(pypa::String s, bool raw_prefix, bool& error) {
+// Consumes the leading run of bytes with the high bit set, starting at *sPtr and stopping before `end`,
+// decodes that run as UTF-8 and re-encodes it with the codec named by `encoding`.
+//
+// *sPtr is advanced past the run before decoding is attempted, so it is updated even when NULL is returned.
+// Returns the encoded string object, or NULL if either the decode or the encode step fails.
+static PyObject* decode_utf8(const char** sPtr, const char* end, const char* encoding) noexcept {
+#ifdef Py_USING_UNICODE
+    const char* const run_begin = *sPtr;
+    const char* run_end = run_begin;
+    while (run_end < end && (*run_end & 0x80))
+        ++run_end;
+    *sPtr = run_end;
+
+    PyObject* decoded = PyUnicode_DecodeUTF8(run_begin, run_end - run_begin, NULL);
+    if (decoded == NULL)
+        return NULL;
+
+    PyObject* encoded = PyUnicode_AsEncodedString(decoded, encoding, NULL);
+    Py_DECREF(decoded);
+    return encoded;
+#else
+    Py_FatalError("decode_utf8 should not be called in this build.");
+    return NULL;
+#endif
+}
+
+#ifdef Py_USING_UNICODE
+// Decodes the text of a unicode literal (`s`, `len` bytes) into a unicode object; returns NULL on failure.
+//
+// When `encoding` is non-NULL and is not "iso-8859-1", the input is first rewritten into a temporary string:
+// each run of high-bit bytes is decoded as UTF-8 (via decode_utf8) and replaced by \UXXXXXXXX escapes, and
+// everything else is copied through.  The (possibly rewritten) text is then handed to
+// PyUnicode_DecodeRawUnicodeEscape when `rawmode` is non-zero, otherwise to PyUnicode_DecodeUnicodeEscape.
+static PyObject* decode_unicode(const char* s, size_t len, int rawmode, const char* encoding) noexcept {
+    PyObject* v;
+    PyObject* u = NULL;
+    char* buf;
+    char* p;
+    const char* end;
+    if (encoding != NULL && strcmp(encoding, "iso-8859-1")) {
+        /* check for integer overflow */
+        if (len > PY_SIZE_MAX / 6)
+            return NULL;
+        /* "<C3><A4>" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
+           "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
+        u = PyString_FromStringAndSize((char*)NULL, len * 6);
+        if (u == NULL)
+            return NULL;
+        p = buf = PyString_AsString(u);
+        end = s + len;
+        while (s < end) {
+            if (*s == '\\') {
+                *p++ = *s++;
+                /* A backslash that is the last input byte, or that precedes a non-ASCII byte, is spelled
+                   out as \u005c.  The s >= end test keeps us from reading (and then copying) the byte
+                   past the end of the input; a lone trailing backslash still fits the 1:6 budget. */
+                if (s >= end || (*s & 0x80)) {
+                    strcpy(p, "u005c");
+                    p += 5;
+                    if (s >= end)
+                        break;
+                }
+            }
+            if (*s & 0x80) { /* XXX inefficient */
+                PyObject* w;
+                char* r;
+                Py_ssize_t rn, i;
+                w = decode_utf8(&s, end, "utf-32-be");
+                if (w == NULL) {
+                    Py_DECREF(u);
+                    return NULL;
+                }
+                r = PyString_AsString(w);
+                rn = PyString_Size(w);
+                assert(rn % 4 == 0);
+                /* One \UXXXXXXXX escape (10 bytes) per 4-byte big-endian code unit. */
+                for (i = 0; i < rn; i += 4) {
+                    sprintf(p, "\\U%02x%02x%02x%02x", r[i + 0] & 0xFF, r[i + 1] & 0xFF, r[i + 2] & 0xFF,
+                            r[i + 3] & 0xFF);
+                    p += 10;
+                }
+                Py_DECREF(w);
+            } else {
+                *p++ = *s++;
+            }
+        }
+        /* From here on decode the rewritten buffer instead of the caller's input. */
+        len = p - buf;
+        s = buf;
+    }
+    if (rawmode)
+        v = PyUnicode_DecodeRawUnicodeEscape(s, len, NULL);
+    else
+        v = PyUnicode_DecodeUnicodeEscape(s, len, NULL);
+    Py_XDECREF(u);
+    return v;
+}
+#endif
+
+// Escape handler installed on pypa::ParserOptions: turns the text of a string literal into its final bytes.
+//
+//   s          - literal text as handed over by the parser
+//   encoding   - source encoding name; "utf-8" and "iso-8859-1" need no re-encoding for byte strings
+//   unicode    - true for unicode literals; the result is then the UTF-8 encoding of the decoded string
+//   raw_prefix - true for raw literals, whose backslash escapes are not interpreted
+//   error      - set to false on success; set to true on failure, in which case the returned string is
+//                the error message rather than decoded data
+pypa::String pypaEscapeDecoder(const pypa::String& s, const pypa::String& encoding, bool unicode, bool raw_prefix,
+                               bool& error) {
    try {
        error = false;
-        Box* unicode = NULL;
-        if (raw_prefix)
-            unicode = PyUnicode_DecodeRawUnicodeEscape(s.c_str(), s.size(), "strict");
-        else
-            unicode = PyUnicode_DecodeUnicodeEscape(s.c_str(), s.size(), "strict");
-        checkAndThrowCAPIException();
-        BoxedString* str_utf8 = (BoxedString*)PyUnicode_AsUTF8String(unicode);
-        checkAndThrowCAPIException();
-        return std::string(str_utf8->s);
+        if (unicode) {
+            PyObject* str = decode_unicode(s.c_str(), s.size(), raw_prefix, encoding.c_str());
+            if (!str)
+                throwCAPIException();
+            BoxedString* str_utf8 = (BoxedString*)PyUnicode_AsUTF8String(str);
+            // Check for failure before the assert below dereferences the result.
+            if (!str_utf8)
+                throwCAPIException();
+            assert(str_utf8->cls == str_cls);
+            return str_utf8->s.str();
+        }
+
+        bool need_encoding = encoding != "utf-8" && encoding != "iso-8859-1";
+        if (raw_prefix || s.find('\\') == pypa::String::npos) {
+            // No escapes to process: the bytes are either returned as-is or just re-encoded.
+            if (need_encoding) {
+                PyObject* u = PyUnicode_DecodeUTF8(s.c_str(), s.size(), NULL);
+                if (!u)
+                    throwCAPIException();
+                BoxedString* str = (BoxedString*)PyUnicode_AsEncodedString(u, encoding.c_str(), NULL);
+                // Encoding fails for characters the target codec cannot represent; report it
+                // instead of dereferencing NULL.
+                if (!str)
+                    throwCAPIException();
+                assert(str->cls == str_cls);
+                return str->s.str();
+            } else {
+                return s;
+            }
+        }
+
+        BoxedString* decoded = (BoxedString*)PyString_DecodeEscape(s.c_str(), s.size(), NULL, false,
+                                                                   need_encoding ? encoding.c_str() : NULL);
+        if (!decoded)
+            throwCAPIException();
+        assert(decoded->cls == str_cls);
+        return decoded->s.str();
    } catch (ExcInfo e) {
        error = true;
        BoxedString* error_message = str(e.value);
        if (error_message && error_message->cls == str_cls)
            return std::string(error_message->s);
-        return "Encountered an unknown error inside pypaUnicodeEscapeDecoder";
+        return "Encountered an unknown error inside pypaEscapeDecoder";
    }
 }

+// pypa::Reader implementation that hands source lines to the lexer.
+//
+// Lines are read byte-by-byte from `file` until set_encoding() succeeds; after that they are fetched through
+// the codec stream reader's `readline` and returned UTF-8 encoded.
+class PystonSourceReader : public pypa::Reader {
+public:
+    PystonSourceReader();
+    ~PystonSourceReader() override;
+
+    // The reader owns a FILE* and a registered GC root; a copy would close/deregister them twice.
+    PystonSourceReader(const PystonSourceReader&) = delete;
+    PystonSourceReader& operator=(const PystonSourceReader&) = delete;
+
+    // Opens `file_path` for reading; returns false if it cannot be opened.
+    bool open_file(const std::string& file_path);
+    // Closes the file, drops the readline root and resets the reader to the end-of-file state.
+    void close();
+
+    // Switches line reading over to a codec stream reader for `coding`; returns false on failure.
+    bool set_encoding(const std::string& coding) override;
+    std::string get_encoding() const { return encoding; }
+    // Returns the next line, or an empty string once eof() is true.
+    std::string get_line() override;
+    unsigned get_line_number() const override { return line_number; }
+    std::string get_filename() const override { return file_path; }
+    bool eof() const override { return is_eof; }
+
+private:
+    // Reads one byte from `file`; returns 0 and sets is_eof when the file is exhausted.
+    char next();
+
+    std::string file_path;
+    bool is_eof;
+    FILE* file;
+    unsigned line_number;
+    PyObject* readline; // bound `readline` of the codec stream reader; nullptr while reading raw bytes
+    std::string encoding;
+};
+
+// Starts out in the same state close() leaves behind: no file, no readline callable, already at EOF.
+PystonSourceReader::PystonSourceReader()
+    : file_path(), is_eof(true), file(nullptr), line_number(0), readline(nullptr), encoding() {
+}
+
+// Releases the file handle and the readline GC root, if any.
+PystonSourceReader::~PystonSourceReader() {
+    this->close();
+}
+
+// Opens `_file_path` for reading and resets the reader state for a fresh file.
+// Returns false (leaving the reader closed) if the file cannot be opened.
+bool PystonSourceReader::open_file(const std::string& _file_path) {
+    // Release whatever an earlier open_file()/set_encoding() left behind; otherwise the old FILE* would
+    // leak and the old readline root would never be deregistered.
+    close();
+
+    file = fopen(_file_path.c_str(), "r");
+    if (!file)
+        return false;
+
+    file_path = _file_path;
+    is_eof = false;
+    line_number = 0;
+    encoding.clear();
+    return true;
+}
+
+// Returns the reader to its idle state: closes the file, deregisters the readline root and marks EOF.
+// Safe to call when nothing is open.
+void PystonSourceReader::close() {
+    if (file != nullptr) {
+        fclose(file);
+        file = nullptr;
+    }
+
+    if (readline != nullptr) {
+        gc::deregisterPermanentRoot(readline);
+        readline = nullptr;
+    }
+
+    file_path.clear();
+    line_number = 0;
+    is_eof = true;
+}
+
+// Wraps the open file in a codec stream reader for `coding` and remembers its bound `readline`, so that
+// subsequent get_line() calls go through the codec.
+// Returns false if no file is open or if any of the C-API calls fails; the reader is left unchanged then.
+bool PystonSourceReader::set_encoding(const std::string& coding) {
+    if (!file)
+        return false;
+
+    PyObject* stream = PyFile_FromFile(file, file_path.c_str(), "rb", NULL);
+    if (stream == NULL)
+        return false;
+
+    PyObject* reader = PyCodec_StreamReader(coding.c_str(), stream, NULL);
+    if (reader == NULL)
+        return false;
+
+    PyObject* new_readline = PyObject_GetAttrString(reader, "readline");
+    if (new_readline == NULL)
+        return false;
+
+    // A repeated call must not leave the previous callable registered as a permanent root.
+    if (readline)
+        gc::deregisterPermanentRoot(readline);
+    readline = new_readline;
+    gc::registerPermanentRoot(readline);
+
+    // Record the name so get_encoding() reports the encoding that is actually in effect.
+    encoding = coding;
+    return true;
+}
+
+// Reads a single byte from the file.  Returns 0 once the end of the file has been reached, and latches
+// is_eof the first time fgetc() reports EOF.
+char PystonSourceReader::next() {
+    if (!is_eof) {
+        const int ch = fgetc(file);
+        if (ch != EOF)
+            return ch;
+        is_eof = true;
+    }
+    return 0;
+}
+
+// Returns the next source line, or an empty string when the reader is already at EOF.
+//
+// With a codec installed (readline != nullptr) the line comes from the codec's readline(); unicode results
+// are converted to UTF-8, and an empty result marks EOF.  Without one, bytes are read up to and including
+// the next '\n' or form feed ('\x0c'); the line counter is only advanced if EOF was not hit on the way.
+std::string PystonSourceReader::get_line() {
+    if (eof())
+        return std::string();
+
+    if (readline) {
+        BoxedString* text = (BoxedString*)runtimeCall(readline, ArgPassSpec(0), 0, 0, 0, 0, 0);
+        if (text->cls == unicode_cls) {
+            text = (BoxedString*)PyUnicode_AsUTF8String(text);
+            if (text == NULL) {
+                // Conversion failed: stop reading.
+                is_eof = true;
+                return std::string();
+            }
+        }
+        assert(text->cls == str_cls);
+        if (!text->size())
+            is_eof = true;
+        ++line_number;
+        return text->s;
+    }
+
+    std::string raw_line;
+    for (;;) {
+        const char ch = next();
+        if (eof())
+            break; // next() ran out of input; `ch` is not part of the line
+        raw_line.push_back(ch);
+        if (ch == '\n' || ch == '\x0c')
+            break;
+    }
+    if (!eof())
+        ++line_number;
+    return raw_line;
+}
+
 AST_Module* pypa_parse(char const* file_path) {
-    pypa::Lexer lexer(file_path);
+    auto reader = llvm::make_unique<PystonSourceReader>();
+    if (!reader->open_file(file_path))
+        return nullptr;
+
+    pypa::Lexer lexer(std::move(reader));
    pypa::SymbolTablePtr symbols;
    pypa::AstModulePtr module;
    pypa::ParserOptions options;
@@ -871,7 +1099,7 @@ AST_Module* pypa_parse(char const* file_path) {
    options.python3only = false;
    options.handle_future_errors = false;
    options.error_handler = pypaErrorHandler;
-    options.unicode_escape_handler = pypaUnicodeEscapeDecoder;
+    options.escape_handler = pypaEscapeDecoder;

    if (pypa::parse(lexer, module, symbols, options) && module) {
        return readModule(*module);

--- a/src/gc/collector.cpp
+++ b/src/gc/collector.cpp
@@ -136,6 +136,12 @@ void registerPermanentRoot(void* obj, bool allow_duplicates) {
    roots.insert(obj);
 }

+void deregisterPermanentRoot(void* obj) { // removes obj from the `roots` set
+    assert(global_heap.getAllocationFromInteriorPointer(obj)); // the heap lookup for obj must succeed
+    ASSERT(roots.count(obj), ""); // obj must currently be in `roots`
+    roots.erase(obj);
+}
+
 extern "C" PyObject* PyGC_AddRoot(PyObject* obj) noexcept {
    if (obj) {
        // Allow duplicates from CAPI code since they shouldn't have to know

--- a/src/gc/collector.h
+++ b/src/gc/collector.h
@@ -26,6 +26,8 @@ namespace gc {
 // (Note: this marks the gc allocation itself, not the pointer that points to one.  For that, use
 // a GCRootHandle)
 void registerPermanentRoot(void* root_obj, bool allow_duplicates = false);
+void deregisterPermanentRoot(void* root_obj);
+
 // Register an object that was not allocated through this collector, as a root for this collector.
 // The motivating usecase is statically-allocated PyTypeObject objects, which are full Python objects
 // even if they are not heap allocated.

--- a/src/runtime/file.cpp
+++ b/src/runtime/file.cpp
@@ -1134,7 +1134,7 @@ extern "C" void PyFile_SetFP(PyObject* _f, FILE* fp) noexcept {
    f->f_fp = fp;
 }

-extern "C" PyObject* PyFile_FromFile(FILE* fp, char* name, char* mode, int (*close)(FILE*)) noexcept {
+extern "C" PyObject* PyFile_FromFile(FILE* fp, const char* name, const char* mode, int (*close)(FILE*)) noexcept {
    return new BoxedFile(fp, name, mode, close);
 }


--- a/test/tests/coding_cp1252.py
+++ b/test/tests/coding_cp1252.py
+# coding: cp1252
+s = u""
+print ord(s), s
--- a/test/tests/coding_koi8.py
+++ b/test/tests/coding_koi8.py
+# -*- coding: koi8-r -*-
+def test(s):
+    print s, "len:", len(s)
+    for c in s:
+        print hex(ord(c)),
+    print ""
+test(u"ðÉÔÏÎ".encode("utf8"))