Initial support for Unicode literals in UTF-8

3f93988f · Stefan Behnel · 79f741da
Commit 3f93988f authored Aug 02, 2007 by Stefan Behnel
5 changed files
--- a/Cython/Compiler/Lexicon.py
+++ b/Cython/Compiler/Lexicon.py
@@ -5,7 +5,7 @@
 #   to be rebuilt next time pyrexc is run.
 #

-string_prefixes = "cCrR"
+string_prefixes = "cCrRuU"

 def make_lexicon():
    from Cython.Plex import \

--- a/Cython/Compiler/ModuleNode.py
+++ b/Cython/Compiler/ModuleNode.py
@@ -1071,13 +1071,16 @@ class ModuleNode(Nodes.Node, Nodes.BlockNode):
                "static __Pyx_StringTabEntry %s[] = {" %
                    Naming.stringtab_cname)
            for entry in entries:
+                print repr(entry.init), type(entry.init)
                code.putln(
-                    "{&%s, %s, sizeof(%s)}," % (
+                    "{&%s, %s, sizeof(%s), %d}," % (
                        entry.pystring_cname,
                        entry.cname,
-                        entry.cname))
+                        entry.cname,
+                        isinstance(entry.init, unicode)
+                        ))
            code.putln(
-                "{0, 0, 0}")
+                "{0, 0, 0, 0}")
            code.putln(
                "};")
    

--- a/Cython/Compiler/Nodes.py
+++ b/Cython/Compiler/Nodes.py
@@ -2600,7 +2600,7 @@ utility_function_predeclarations = \

 typedef struct {const char *s; const void **p;} __Pyx_CApiTabEntry; /*proto*/
 typedef struct {PyObject **p; char *s;} __Pyx_InternTabEntry; /*proto*/
-typedef struct {PyObject **p; char *s; long n;} __Pyx_StringTabEntry; /*proto*/
+typedef struct {PyObject **p; char *s; long n; int is_unicode;} __Pyx_StringTabEntry; /*proto*/

 #define __Pyx_PyBool_FromLong(b) ((b) ? (Py_INCREF(Py_True), Py_True) : (Py_INCREF(Py_False), Py_False))
 static INLINE int __Pyx_PyObject_IsTrue(PyObject* x) {
@@ -3104,7 +3104,11 @@ static int __Pyx_InitStrings(__Pyx_StringTabEntry *t); /*proto*/
 ""","""
 static int __Pyx_InitStrings(__Pyx_StringTabEntry *t) {
    while (t->p) {
+        if (t->is_unicode) {
+            *t->p = PyUnicode_DecodeUTF8(t->s, t->n - 1, NULL);
+        } else {
            *t->p = PyString_FromStringAndSize(t->s, t->n - 1);
+        }
        if (!*t->p)
            return -1;
        ++t;

--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -493,7 +493,7 @@ def p_opt_string_literal(s):

 def p_string_literal(s):
    # A single string or char literal.
-    # Returns (kind, value) where kind in ('', 'c', 'r')
+    # Returns (kind, value) where kind in ('', 'c', 'r', 'u')
    if s.sy == 'STRING':
        value = unquote(s.systring)
        s.next()
@@ -502,7 +502,7 @@ def p_string_literal(s):
    pos = s.position()
    #is_raw = s.systring[:1].lower() == "r"
    kind = s.systring[:1].lower()
-    if kind not in "cr":
+    if kind not in "cru":
        kind = ''
    chars = []
    while 1:
@@ -513,6 +513,8 @@ def p_string_literal(s):
            systr = s.systring
            if len(systr) == 1 and systr in "'\"\n":
                chars.append('\\')
+            if kind == 'u' and not isinstance(systr, unicode):
+                systr = systr.decode("UTF-8")
            chars.append(systr)
        elif sy == 'ESCAPE':
            systr = s.systring
@@ -533,6 +535,8 @@ def p_string_literal(s):
                    chars.append('\\x0' + systr[2:])
                elif c == '\n':
                    pass
+                elif c == 'u':
+                    chars.append(systr)
                else:
                    chars.append(r'\\' + systr[1:])
        elif sy == 'NEWLINE':
@@ -546,7 +550,10 @@ def p_string_literal(s):
                "Unexpected token %r:%r in string literal" %
                    (sy, s.systring))
    s.next()
-    value = join(chars, '')
+    if kind == 'u':
+        value = u''.join(chars)
+    else:
+        value = ''.join(chars)
    #print "p_string_literal: value =", repr(value) ###
    return kind, value


--- a/Cython/Compiler/PyrexTypes.py
+++ b/Cython/Compiler/PyrexTypes.py
@@ -705,6 +705,8 @@ class CStringType:
    from_py_function = "PyString_AsString"

    def literal_code(self, value):
+        if isinstance(value, unicode):
+            value = value.encode("UTF-8")
        return '"%s"' % value