Commit bd80565f authored by Kevin Modzelewski

Support unicode literals in source files

Currently storing + passing unicode strings around as UTF-8-encoded bytestrings
in std::string; maybe it'd be nice to have the type system show that these are
actually unicode strings, or to use the CPython internal representation (UCS4?)
to reduce the number of encodings/decodings.
parent e0c1a8d9
......@@ -477,7 +477,20 @@ extern "C" int PyObject_IsSubclass(PyObject* derived, PyObject* cls) noexcept {
}
// Calls `callable` with an argument object built from a Py_BuildValue-style
// `format` string and the trailing varargs, using the "_SizeT" variant of the
// value builder (_Py_VaBuildValue_SizeT).
//
// - A NULL `callable` yields the result of null_error().
// - A NULL or empty `format` means "no arguments": an empty tuple is passed.
// - Otherwise the built value is handed to call_function_tail, whose result is
//   returned. NOTE(review): a failed build leaves `args` NULL; presumably
//   call_function_tail copes with that -- confirm against its definition.
extern "C" PyObject* _PyObject_CallFunction_SizeT(PyObject* callable, const char* format, ...) noexcept {
    va_list va;
    PyObject* args;

    if (callable == NULL)
        return null_error();

    if (format && *format) {
        va_start(va, format);
        args = _Py_VaBuildValue_SizeT(format, va);
        va_end(va);
    } else
        args = PyTuple_New(0);

    return call_function_tail(callable, args);
}
// Evaluates PyType_HasFeature on o's class (o->cls) for the Py_TPFLAGS_CHECKTYPES flag.
#define NEW_STYLE_NUMBER(o) PyType_HasFeature((o)->cls, Py_TPFLAGS_CHECKTYPES)
......
......@@ -69,6 +69,18 @@ static int countformat(const char* format, int endchar) noexcept {
return count;
}
#ifdef Py_USING_UNICODE
// Counts the Py_UNICODE units in `u` that precede the first zero unit, i.e.
// the Py_UNICODE counterpart of strlen. `u` must be zero-terminated, since the
// scan only stops at a zero unit.
static int _ustrlen(Py_UNICODE* u) {
    int length = 0;
    for (Py_UNICODE* cursor = u; *cursor != 0; ++cursor)
        ++length;
    return length;
}
#endif
static PyObject* do_mktuple(const char**, va_list*, int, int, int) noexcept;
// static PyObject *do_mklist(const char**, va_list *, int, int, int) noexcept;
// static PyObject *do_mkdict(const char**, va_list *, int, int, int) noexcept;
......@@ -162,7 +174,30 @@ static PyObject* do_mkvalue(const char** p_format, va_list* p_va, int flags) noe
}
return v;
}
#ifdef Py_USING_UNICODE
case 'u': {
PyObject* v;
Py_UNICODE* u = va_arg(*p_va, Py_UNICODE*);
Py_ssize_t n;
if (**p_format == '#') {
++*p_format;
if (flags & FLAG_SIZE_T)
n = va_arg(*p_va, Py_ssize_t);
else
n = va_arg(*p_va, int);
} else
n = -1;
if (u == NULL) {
v = Py_None;
Py_INCREF(v);
} else {
if (n < 0)
n = _ustrlen(u);
v = PyUnicode_FromUnicode(u, n);
}
return v;
}
#endif
default:
RELEASE_ASSERT(0, "%c", *((*p_format) - 1));
}
......
......@@ -33,6 +33,7 @@
#include "core/stats.h"
#include "core/thread_utils.h"
#include "core/util.h"
#include "runtime/capi.h"
#include "runtime/generator.h"
#include "runtime/import.h"
#include "runtime/inline/boxing.h"
......@@ -497,7 +498,9 @@ Value ASTInterpreter::visit_langPrimitive(AST_LangPrimitive* node) {
assert(node->args[1]->type == AST_TYPE::Str);
Value module = visit_expr(node->args[0]);
const std::string& name = ast_cast<AST_Str>(node->args[1])->s;
auto ast_str = ast_cast<AST_Str>(node->args[1]);
assert(ast_str->str_type == AST_Str::STR);
const std::string& name = ast_str->str_data;
assert(name.size());
v = importFrom(module.o, &name);
} else if (node->opcode == AST_LangPrimitive::IMPORT_NAME) {
......@@ -508,7 +511,9 @@ Value ASTInterpreter::visit_langPrimitive(AST_LangPrimitive* node) {
int level = static_cast<AST_Num*>(node->args[0])->n_int;
Value froms = visit_expr(node->args[1]);
const std::string& module_name = static_cast<AST_Str*>(node->args[2])->s;
auto ast_str = ast_cast<AST_Str>(node->args[2]);
assert(ast_str->str_type == AST_Str::STR);
const std::string& module_name = ast_str->str_data;
v = import(level, froms.o, &module_name);
} else if (node->opcode == AST_LangPrimitive::IMPORT_STAR) {
assert(node->args.size() == 1);
......@@ -996,7 +1001,13 @@ Value ASTInterpreter::visit_set(AST_Set* node) {
}
// Evaluates a string literal node according to its str_type:
//   STR     -> str_data is boxed as-is via boxString.
//   UNICODE -> str_data is passed to decodeUTF8StringPtr and its result is used.
// Any other str_type value trips the RELEASE_ASSERT, which reports the value.
Value ASTInterpreter::visit_str(AST_Str* node) {
    if (node->str_type == AST_Str::STR) {
        return boxString(node->str_data);
    } else if (node->str_type == AST_Str::UNICODE) {
        return decodeUTF8StringPtr(&node->str_data);
    } else {
        RELEASE_ASSERT(0, "%d", node->str_type);
    }
}
Value ASTInterpreter::visit_name(AST_Name* node) {
......
......@@ -1798,6 +1798,12 @@ CompilerVariable* makeStr(const std::string* s) {
return new ValuedCompilerVariable<const std::string*>(STR_CONSTANT, s, true);
}
// Emits a call to g.funcs.decodeUTF8StringPtr, passing the address of `*s`
// embedded as a constant pointer, and wraps the call's result in a
// ConcreteCompilerVariable typed from unicode_cls.
// NOTE(review): `s` is embedded by address, so it presumably has to outlive
// the generated code -- confirm with callers.
CompilerVariable* makeUnicode(IREmitter& emitter, const std::string* s) {
    auto* utf8_ptr = embedConstantPtr(s, g.llvm_str_type_ptr);
    llvm::Value* decoded = emitter.getBuilder()->CreateCall(g.funcs.decodeUTF8StringPtr, utf8_ptr);
    return new ConcreteCompilerVariable(typeFromClass(unicode_cls), decoded, true);
}
class VoidType : public ConcreteCompilerType {
public:
llvm::Type* llvmType() override { return g.void_; }
......
......@@ -379,6 +379,7 @@ ConcreteCompilerVariable* makeBool(bool);
ConcreteCompilerVariable* makeLong(IREmitter& emitter, std::string&);
ConcreteCompilerVariable* makePureImaginary(IREmitter& emitter, double imag);
CompilerVariable* makeStr(const std::string*);
CompilerVariable* makeUnicode(IREmitter& emitter, const std::string*);
CompilerVariable* makeFunction(IREmitter& emitter, CLFunction*, CompilerVariable* closure, bool isGenerator,
const std::vector<ConcreteCompilerVariable*>& defaults);
ConcreteCompilerVariable* undefVariable();
......
......@@ -515,7 +515,9 @@ private:
ConcreteCompilerVariable* converted_module = module->makeConverted(emitter, module->getBoxType());
module->decvref(emitter);
const std::string& name = ast_cast<AST_Str>(node->args[1])->s;
auto ast_str = ast_cast<AST_Str>(node->args[1]);
assert(ast_str->str_type == AST_Str::STR);
const std::string& name = ast_str->str_data;
assert(name.size());
llvm::Value* r = emitter.createCall2(unw_info, g.funcs.importFrom, converted_module->getValue(),
......@@ -558,7 +560,9 @@ private:
ConcreteCompilerVariable* converted_froms = froms->makeConverted(emitter, froms->getBoxType());
froms->decvref(emitter);
const std::string& module_name = static_cast<AST_Str*>(node->args[2])->s;
auto ast_str = ast_cast<AST_Str>(node->args[2]);
assert(ast_str->str_type == AST_Str::STR);
const std::string& module_name = ast_str->str_data;
llvm::Value* imported = emitter.createCall3(unw_info, g.funcs.import, getConstantInt(level, g.i32),
converted_froms->getValue(),
......@@ -1008,7 +1012,15 @@ private:
return new ConcreteCompilerVariable(SLICE, rtn, true);
}
// Compiles a string literal node according to its str_type:
//   STR     -> a string constant via makeStr(&node->str_data).
//   UNICODE -> a unicode value via makeUnicode(emitter, &node->str_data).
// Any other str_type value trips the RELEASE_ASSERT, which reports the value.
// `unw_info` is not used by this method.
CompilerVariable* evalStr(AST_Str* node, UnwindInfo unw_info) {
    if (node->str_type == AST_Str::STR) {
        return makeStr(&node->str_data);
    } else if (node->str_type == AST_Str::UNICODE) {
        return makeUnicode(emitter, &node->str_data);
    } else {
        RELEASE_ASSERT(0, "%d", node->str_type);
    }
}
CompilerVariable* evalSubscript(AST_Subscript* node, UnwindInfo unw_info) {
CompilerVariable* value = evalExpr(node->value, unw_info);
......
......@@ -154,7 +154,7 @@ def convert(n, f):
elif isinstance(v, str):
_print_str(v, f)
elif isinstance(v, unicode):
_print_str(v.encode("ascii"), f)
_print_str(v.encode("utf8"), f)
elif isinstance(v, bool):
f.write(struct.pack("B", v))
elif isinstance(v, int):
......
......@@ -661,12 +661,9 @@ AST_Str* read_str(BufferedReader* reader) {
rtn->lineno = reader->readULL();
if (rtn->str_type == AST_Str::STR) {
rtn->s = readString(reader);
rtn->str_data = readString(reader);
} else if (rtn->str_type == AST_Str::UNICODE) {
// Don't really support unicode for now...
printf("Warning: converting unicode literal to str\n");
rtn->str_type = AST_Str::STR;
rtn->s = readString(reader);
rtn->str_data = readString(reader);
} else {
RELEASE_ASSERT(0, "%d", rtn->str_type);
}
......
......@@ -504,7 +504,8 @@ struct expr_dispatcher {
// Converts a pypa string node into a newly allocated AST_Str. The node is
// tagged as a plain STR literal and its data is copied from s.value.
// location(ptr, s) is called first; presumably it copies the source position
// from `s` onto the new node -- confirm against its definition.
ResultPtr read(pypa::AstStr& s) {
    AST_Str* ptr = new AST_Str();
    location(ptr, s);
    ptr->str_type = AST_Str::STR;
    ptr->str_data = s.value;
    return ptr;
}
......@@ -792,7 +793,7 @@ struct stmt_dispatcher {
AST_Str* str = new AST_Str();
ptr->value = str;
str->str_type = AST_Str::STR;
str->s = d.doc;
str->str_data = d.doc;
return ptr;
}
};
......
......@@ -184,6 +184,7 @@ void initGlobalFuncs(GlobalState& g) {
GET(createLong);
GET(createPureImaginary);
GET(createSet);
GET(decodeUTF8StringPtr);
GET(getattr);
GET(setattr);
......
......@@ -34,7 +34,8 @@ struct GlobalFuncs {
llvm::Value* boxInt, *unboxInt, *boxFloat, *unboxFloat, *boxStringPtr, *boxCLFunction, *unboxCLFunction,
*boxInstanceMethod, *boxBool, *unboxBool, *createTuple, *createDict, *createList, *createSlice,
*createUserClass, *createClosure, *createGenerator, *createLong, *createSet, *createPureImaginary;
*createUserClass, *createClosure, *createGenerator, *createLong, *createSet, *createPureImaginary,
*decodeUTF8StringPtr;
llvm::Value* getattr, *setattr, *delattr, *delitem, *delGlobal, *nonzero, *binop, *compare, *augbinop, *unboxedLen,
*getitem, *getclsattr, *getGlobal, *setitem, *unaryop, *import, *importFrom, *importStar, *repr, *str,
*isinstance, *yield, *getPystonIter;
......
......@@ -1627,7 +1627,13 @@ bool PrintVisitor::visit_slice(AST_Slice* node) {
}
// Prints a string literal node to stdout:
//   STR     -> the bytes of str_data wrapped in double quotes (printed with
//              %s, so output stops at the first embedded NUL).
//   UNICODE -> the fixed placeholder "<unicode value>".
// Any other str_type value trips the RELEASE_ASSERT. Always returns false.
bool PrintVisitor::visit_str(AST_Str* node) {
    if (node->str_type == AST_Str::STR) {
        printf("\"%s\"", node->str_data.c_str());
    } else if (node->str_type == AST_Str::UNICODE) {
        printf("<unicode value>");
    } else {
        RELEASE_ASSERT(0, "%d", node->str_type);
    }
    return false;
}
......
......@@ -816,18 +816,21 @@ public:
// AST node for a string literal. `str_type` records which kind of literal it
// is and determines how `str_data` is to be interpreted.
class AST_Str : public AST_expr {
public:
    enum StrType {
        UNSET = 0x00, // set by the default constructor; no kind assigned yet
        STR = 0x10,
        UNICODE = 0x20,
    } str_type;

    // The meaning of str_data depends on str_type.  For STR, it's just the bytes value.
    // For UNICODE, it's the utf-8 encoded value.
    std::string str_data;

    virtual void accept(ASTVisitor* v);
    virtual void* accept_expr(ExprVisitor* v);

    // Leaves str_type as UNSET and str_data empty; the creator fills both in.
    AST_Str() : AST_expr(AST_TYPE::Str), str_type(UNSET) {}
    // Builds a STR literal holding a copy of `s`.
    AST_Str(const std::string& s) : AST_expr(AST_TYPE::Str), str_type(STR), str_data(s) {}
    // Builds a STR literal that takes over the contents of `s`. The parameter is a
    // non-const rvalue reference so that std::move really moves; const rvalues
    // bind to the copying overload above.
    AST_Str(std::string&& s) : AST_expr(AST_TYPE::Str), str_type(STR), str_data(std::move(s)) {}

    static const AST_TYPE::AST_TYPE TYPE = AST_TYPE::Str;
};
......
......@@ -594,7 +594,7 @@ private:
AST_Str* orig = ast_cast<AST_Str>(val);
AST_Str* made = new AST_Str();
made->str_type = orig->str_type;
made->s = orig->s;
made->str_data = orig->str_data;
made->col_offset = orig->col_offset;
made->lineno = orig->lineno;
return made;
......
......@@ -347,16 +347,40 @@ extern "C" Box* chr(Box* arg) {
return boxString(std::string(1, (char)n));
}
// Builtin ord(): returns a BoxedInt holding the value of the single element
// of `obj`.
//
// Accepted inputs are str, bytearray and (when Py_USING_UNICODE is defined)
// unicode objects of length exactly 1. For str and bytearray the byte is read
// as unsigned char, so bytes >= 0x80 give positive results.
//
// Errors are reported through raiseExcHelper with TypeError:
//   - any other type of object: "expected string of length 1, but <type> found"
//   - an accepted type whose length is not 1: "expected a character, but string
//     of length <n> found"
extern "C" Box* ord(Box* obj) {
    long ord;
    Py_ssize_t size;

    if (PyString_Check(obj)) {
        size = PyString_GET_SIZE(obj);
        if (size == 1) {
            ord = (long)((unsigned char)*PyString_AS_STRING(obj));
            return new BoxedInt(ord);
        }
    } else if (PyByteArray_Check(obj)) {
        size = PyByteArray_GET_SIZE(obj);
        if (size == 1) {
            ord = (long)((unsigned char)*PyByteArray_AS_STRING(obj));
            return new BoxedInt(ord);
        }
#ifdef Py_USING_UNICODE
    } else if (PyUnicode_Check(obj)) {
        size = PyUnicode_GET_SIZE(obj);
        if (size == 1) {
            ord = (long)*PyUnicode_AS_UNICODE(obj);
            return new BoxedInt(ord);
        }
#endif
    } else {
        raiseExcHelper(TypeError, "ord() expected string of length 1, but "
                                  "%.200s found",
                       obj->cls->tp_name);
    }

    // Only reached for an accepted type whose length is not 1, so `size` has
    // been assigned by one of the branches above.
    raiseExcHelper(TypeError, "ord() expected a character, "
                              "but string of length %zd found",
                   size);
}
Box* range(Box* start, Box* stop, Box* step) {
......
......@@ -65,6 +65,7 @@ void force() {
FORCE(createLong);
FORCE(createPureImaginary);
FORCE(createSet);
FORCE(decodeUTF8StringPtr);
FORCE(getattr);
FORCE(setattr);
......
......@@ -1172,6 +1172,13 @@ extern "C" PyObject* PyObject_Init(PyObject* op, PyTypeObject* tp) noexcept {
return op;
}
// Decodes the bytes of `*s` with PyUnicode_DecodeUTF8 in "strict" error mode
// and returns the resulting object. checkAndThrowCAPIException() runs right
// after the decode (presumably converting a pending C-API error into a C++
// exception -- confirm against its definition); the result is then asserted
// to be non-null.
Box* decodeUTF8StringPtr(const std::string* s) {
    const std::string& utf8 = *s;
    Box* decoded = PyUnicode_DecodeUTF8(utf8.c_str(), utf8.size(), "strict");
    checkAndThrowCAPIException();
    assert(decoded);
    return decoded;
}
bool TRACK_ALLOCATIONS = false;
void setupRuntime() {
root_hcls = HiddenClass::makeRoot();
......
......@@ -108,6 +108,7 @@ Box* boxString(const std::string& s);
Box* boxString(std::string&& s);
extern "C" BoxedString* boxStrConstant(const char* chars);
extern "C" BoxedString* boxStrConstantSize(const char* chars, size_t n);
extern "C" Box* decodeUTF8StringPtr(const std::string* s);
// creates an uninitialized string of length n; useful for directly constructing into the string and avoiding copies:
BoxedString* createUninitializedString(ssize_t n);
......
# skip-if: '-x' in EXTRA_JIT_ARGS
# allow-warning: import level 0 will be treated as -1
print repr(unicode())  # unicode object built with no arguments
print repr(unicode('hello world'))  # unicode object built from a str literal
# Some random unicode character:
u = u'\u0180'
print len(u)  # the literal is a single \u escape
print repr(u)
print repr(u.encode("utf8"))  # repr of the UTF-8 encoded form
# This is tricky, since we need to support file encodings, and then set stdout to UTF8:
# print u
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment