Commit 3f93988f authored by Stefan Behnel

initial support for unicode literals in UTF-8

parent 79f741da
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
# to be rebuilt next time pyrexc is run. # to be rebuilt next time pyrexc is run.
# #
string_prefixes = "cCrR" string_prefixes = "cCrRuU"
def make_lexicon(): def make_lexicon():
from Cython.Plex import \ from Cython.Plex import \
......
...@@ -1071,13 +1071,16 @@ class ModuleNode(Nodes.Node, Nodes.BlockNode): ...@@ -1071,13 +1071,16 @@ class ModuleNode(Nodes.Node, Nodes.BlockNode):
"static __Pyx_StringTabEntry %s[] = {" % "static __Pyx_StringTabEntry %s[] = {" %
Naming.stringtab_cname) Naming.stringtab_cname)
for entry in entries: for entry in entries:
print repr(entry.init), type(entry.init)
code.putln( code.putln(
"{&%s, %s, sizeof(%s)}," % ( "{&%s, %s, sizeof(%s), %d}," % (
entry.pystring_cname, entry.pystring_cname,
entry.cname, entry.cname,
entry.cname)) entry.cname,
isinstance(entry.init, unicode)
))
code.putln( code.putln(
"{0, 0, 0}") "{0, 0, 0, 0}")
code.putln( code.putln(
"};") "};")
......
...@@ -2600,7 +2600,7 @@ utility_function_predeclarations = \ ...@@ -2600,7 +2600,7 @@ utility_function_predeclarations = \
typedef struct {const char *s; const void **p;} __Pyx_CApiTabEntry; /*proto*/ typedef struct {const char *s; const void **p;} __Pyx_CApiTabEntry; /*proto*/
typedef struct {PyObject **p; char *s;} __Pyx_InternTabEntry; /*proto*/ typedef struct {PyObject **p; char *s;} __Pyx_InternTabEntry; /*proto*/
typedef struct {PyObject **p; char *s; long n;} __Pyx_StringTabEntry; /*proto*/ typedef struct {PyObject **p; char *s; long n; int is_unicode;} __Pyx_StringTabEntry; /*proto*/
#define __Pyx_PyBool_FromLong(b) ((b) ? (Py_INCREF(Py_True), Py_True) : (Py_INCREF(Py_False), Py_False)) #define __Pyx_PyBool_FromLong(b) ((b) ? (Py_INCREF(Py_True), Py_True) : (Py_INCREF(Py_False), Py_False))
static INLINE int __Pyx_PyObject_IsTrue(PyObject* x) { static INLINE int __Pyx_PyObject_IsTrue(PyObject* x) {
...@@ -3104,7 +3104,11 @@ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); /*proto*/ ...@@ -3104,7 +3104,11 @@ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); /*proto*/
""",""" ""","""
static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) { static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {
while (t->p) { while (t->p) {
if (t->is_unicode) {
*t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL);
} else {
*t->p = PyString_FromStringAndSize(t->s, t->n - 1); *t->p = PyString_FromStringAndSize(t->s, t->n - 1);
}
if (!*t->p) if (!*t->p)
return -1; return -1;
++t; ++t;
......
...@@ -493,7 +493,7 @@ def p_opt_string_literal(s): ...@@ -493,7 +493,7 @@ def p_opt_string_literal(s):
def p_string_literal(s): def p_string_literal(s):
# A single string or char literal. # A single string or char literal.
# Returns (kind, value) where kind in ('', 'c', 'r') # Returns (kind, value) where kind in ('', 'c', 'r', 'u')
if s.sy == 'STRING': if s.sy == 'STRING':
value = unquote(s.systring) value = unquote(s.systring)
s.next() s.next()
...@@ -502,7 +502,7 @@ def p_string_literal(s): ...@@ -502,7 +502,7 @@ def p_string_literal(s):
pos = s.position() pos = s.position()
#is_raw = s.systring[:1].lower() == "r" #is_raw = s.systring[:1].lower() == "r"
kind = s.systring[:1].lower() kind = s.systring[:1].lower()
if kind not in "cr": if kind not in "cru":
kind = '' kind = ''
chars = [] chars = []
while 1: while 1:
...@@ -513,6 +513,8 @@ def p_string_literal(s): ...@@ -513,6 +513,8 @@ def p_string_literal(s):
systr = s.systring systr = s.systring
if len(systr) == 1 and systr in "'\"\n": if len(systr) == 1 and systr in "'\"\n":
chars.append('\\') chars.append('\\')
if kind == 'u' and not isinstance(systr, unicode):
systr = systr.decode("UTF-8")
chars.append(systr) chars.append(systr)
elif sy == 'ESCAPE': elif sy == 'ESCAPE':
systr = s.systring systr = s.systring
...@@ -533,6 +535,8 @@ def p_string_literal(s): ...@@ -533,6 +535,8 @@ def p_string_literal(s):
chars.append('\\x0' + systr[2:]) chars.append('\\x0' + systr[2:])
elif c == '\n': elif c == '\n':
pass pass
elif c == 'u':
chars.append(systr)
else: else:
chars.append(r'\\' + systr[1:]) chars.append(r'\\' + systr[1:])
elif sy == 'NEWLINE': elif sy == 'NEWLINE':
...@@ -546,7 +550,10 @@ def p_string_literal(s): ...@@ -546,7 +550,10 @@ def p_string_literal(s):
"Unexpected token %r:%r in string literal" % "Unexpected token %r:%r in string literal" %
(sy, s.systring)) (sy, s.systring))
s.next() s.next()
value = join(chars, '') if kind == 'u':
value = u''.join(chars)
else:
value = ''.join(chars)
#print "p_string_literal: value =", repr(value) ### #print "p_string_literal: value =", repr(value) ###
return kind, value return kind, value
......
...@@ -705,6 +705,8 @@ class CStringType: ...@@ -705,6 +705,8 @@ class CStringType:
from_py_function = "PyString_AsString" from_py_function = "PyString_AsString"
def literal_code(self, value): def literal_code(self, value):
if isinstance(value, unicode):
value = value.encode("UTF-8")
return '"%s"' % value return '"%s"' % value
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment