Add support for unicode literals when using pypa

We support the \u, \U and \N escape sequences and the unicode_literals option. In addition, this commit updates pypa to the latest version, which has unicode support.

Add support for unicode literals when using pypa
We support the \u, \U and \N escape sequences and the unicode_literals option. In addition, this commit updates pypa to the latest version, which has unicode support.
6fbad08e · Marius Wachtler · 075620c7 · 2ce3f0ef · 6fbad08e · 6fbad08e
Commit 6fbad08e authored Mar 17, 2015 by Marius Wachtler
8 changed files
--- a/libpypa @ 2ce3f0ef
+++ b/libpypa @ 2ce3f0ef
-Subproject commit 94fd3e1551188171fca8fb1d4bb7e2f916be33c4
+Subproject commit 2ce3f0ef83f6d3d4bdd1ab841e2ca4c3417d93a4
--- a/src/codegen/pypa-parser.cpp
+++ b/src/codegen/pypa-parser.cpp
@@ -31,6 +31,9 @@
 #include "core/stats.h"
 #include "core/types.h"
 #include "core/util.h"
+#include "runtime/capi.h"
+#include "runtime/objmodel.h"
+#include "runtime/types.h"
 namespace pypa {
 bool string_to_double(String const& s, double& result);
@@ -511,7 +514,7 @@ struct expr_dispatcher {
    ResultPtr read(pypa::AstStr& s) {
        AST_Str* ptr = new AST_Str();
        location(ptr, s);
-        ptr->str_type = AST_Str::STR;
+        ptr->str_type = s.unicode ? AST_Str::UNICODE : AST_Str::STR;
        ptr->str_data = s.value;
        return ptr;
    }
@@ -799,7 +802,7 @@ struct stmt_dispatcher {
        location(ptr, d);
        AST_Str* str = new AST_Str();
        ptr->value = str;
-        str->str_type = AST_Str::STR;
+        str->str_type = d.unicode ? AST_Str::UNICODE : AST_Str::STR;
        str->str_data = d.doc;
        return ptr;
    }
@@ -823,14 +826,32 @@ AST_Module* readModule(pypa::AstModule& t) {
 }
 void pypaErrorHandler(pypa::Error e) {
-    //    raiseSyntaxError
-    //    void raiseSyntaxError(const char* msg, int lineno, int col_offset, const
-    //    std::string& file, const std::string& func);
    if (e.type != pypa::ErrorType::SyntaxWarning) {
        raiseSyntaxError(e.message.c_str(), e.cur.line, e.cur.column, e.file_name, std::string());
    }
 }
+pypa::String pypaUnicodeEscapeDecoder(pypa::String s, bool raw_prefix, bool& error) { // registered below as options.unicode_escape_handler
+    try { // success path: returns the decoded text re-encoded as a UTF-8 byte string
+        error = false;
+        Box* unicode = NULL;
+        if (raw_prefix) // presumably set for r-prefixed literals (confirm against pypa)
+            unicode = PyUnicode_DecodeRawUnicodeEscape(s.c_str(), s.size(), "strict");
+        else
+            unicode = PyUnicode_DecodeUnicodeEscape(s.c_str(), s.size(), "strict");
+        checkAndThrowCAPIException(); // presumably throws ExcInfo if the decode call failed; keep directly after the call
+        BoxedString* str_utf8 = (BoxedString*)PyUnicode_AsUTF8String(unicode);
+        checkAndThrowCAPIException(); // same check for the UTF-8 encoding step
+        return str_utf8->s;
+    } catch (ExcInfo e) { // failure path: report via `error` and return a message instead of decoded text
+        error = true;
+        BoxedString* error_message = str(e.value); // stringify the exception value for the caller
+        if (error_message && error_message->cls == str_cls)
+            return error_message->s;
+        return "Encountered an unknown error inside pypaUnicodeEscapeDecoder"; // fallback when no plain str message is available
+    }
+}
 AST_Module* pypa_parse(char const* file_path) {
    pypa::Lexer lexer(file_path);
    pypa::SymbolTablePtr symbols;
@@ -842,6 +863,7 @@ AST_Module* pypa_parse(char const* file_path) {
    options.python3only = false;
    options.handle_future_errors = false;
    options.error_handler = pypaErrorHandler;
+    options.unicode_escape_handler = pypaUnicodeEscapeDecoder;
    if (pypa::parse(lexer, module, symbols, options) && module) {
        return readModule(*module);

--- a/src/jit.cpp
+++ b/src/jit.cpp
@@ -231,6 +231,7 @@ static int main(int argc, char** argv) {
            add_history(line);
+            try {
                AST_Module* m = parse_string(line);
                Timer _t("repl");
@@ -253,7 +254,6 @@ static int main(int argc, char** argv) {
                    m->body[0] = p;
                }
-            try {
                compileAndRunModule(m, main_module);
            } catch (ExcInfo e) {
                int retcode = 0xdeadbeef; // should never be seen

--- a/src/runtime/stacktrace.cpp
+++ b/src/runtime/stacktrace.cpp
@@ -118,11 +118,9 @@ void raiseSyntaxError(const char* msg, int lineno, int col_offset, const std::st
    Box* exc = runtimeCall(SyntaxError, ArgPassSpec(1), boxStrConstant(msg), NULL, NULL, NULL, NULL);
    auto tb = getTraceback();
-    // TODO: push the syntax error line back on it:
+    std::vector<const LineInfo*> entries = tb->lines;
-    //// TODO: leaks this!
+    entries.push_back(new LineInfo(lineno, col_offset, file, func));
-    // last_tb.push_back(new LineInfo(lineno, col_offset, file, func));
+    raiseRaw(ExcInfo(exc->cls, exc, new BoxedTraceback(std::move(entries))));
-    raiseRaw(ExcInfo(exc->cls, exc, tb));
 }
 void _printStacktrace() {

--- a/test/tests/json_test.py
+++ b/test/tests/json_test.py
-# skip-if: '-x' in EXTRA_JIT_ARGS
 from StringIO import StringIO
 import json

--- a/test/tests/prevent_nonascii_attrs.py
+++ b/test/tests/prevent_nonascii_attrs.py
-# skip-if: '-x' in EXTRA_JIT_ARGS
 def f(a):
    print a

--- a/test/tests/unicode_test.py
+++ b/test/tests/unicode_test.py
-# skip-if: '-x' in EXTRA_JIT_ARGS
 print repr(unicode())
 print repr(unicode('hello world'))
@@ -32,6 +30,7 @@ print u"Hello " + " World"
 def p(x):
    return [hex(ord(i)) for i in x]
 s = u"\u20AC" # euro sign
+print p(u"\N{EURO SIGN}")
 print p(s) 
 print p(s.encode("utf8"))
 print p(s.encode("utf16"))

--- a/test/tests/unicodedata_test.py
+++ b/test/tests/unicodedata_test.py
-# skip-if: '-x' in EXTRA_JIT_ARGS
 import unicodedata
 print unicodedata.lookup("EURO SIGN") == u"\u20ac"
 print unicodedata.name(u"/")