Use a dedicated UnicodeType and UnicodeNode to represent unicode literals

Fixes the unicode literal indexing problem (only for unicode strings, not for byte strings!)

Use a dedicated UnicodeType and UnicodeNode to represent unicode literals
Fixes the unicode literal indexing problem (only for unicode strings, not for byte strings!)
14986aea · Stefan Behnel · 0227fc22 · 14986aea · 14986aea · 14986aea
Commit 14986aea authored Aug 12, 2008 by Stefan Behnel
4 changed files
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -738,6 +738,29 @@ class StringNode(ConstNode):
            return self.entry.cname


+class UnicodeNode(PyConstNode):
+    #  A Python unicode string literal.
+    #  entry   Symtab.Entry
+
+    type = PyrexTypes.c_unicode_type
+
+    def analyse_types(self, env):
+        self.entry = env.add_string_const(self.value)
+        env.add_py_string(self.entry)
+
+    def calculate_result_code(self):
+        return self.entry.pystring_cname
+
+    def _coerce_to(self, dst_type, env):
+        if dst_type.is_pyobject:
+            # NOTE(review): returning self skips the normal coerce_to
+            # processing (e.g. a type test for extension types) -- confirm.
+            return self
+        # Non-Python target: coerce via a StringNode sharing this entry.
+        node = StringNode(self.pos, entry = self.entry, type = py_object_type)
+        return ConstNode.coerce_to(node, dst_type, env)
+
+
 class IdentifierStringNode(ConstNode):
    # A Python string that behaves like an identifier, e.g. for
    # keyword arguments in a call, or for imported names

--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -492,6 +492,8 @@ def p_atom(s):
        kind, value = p_cat_string_literal(s)
        if kind == 'c':
            return ExprNodes.CharNode(pos, value = value)
+        elif kind == 'u':
+            return ExprNodes.UnicodeNode(pos, value = value)
        else:
            return ExprNodes.StringNode(pos, value = value)
    elif sy == 'IDENT':

--- a/Cython/Compiler/PyrexTypes.py
+++ b/Cython/Compiler/PyrexTypes.py
@@ -998,20 +998,6 @@ class CStringType:
        return '"%s"' % Utils.escape_byte_string(value)


-class CUTF8StringType:
-    #  Mixin class for C unicode types.
-
-    is_string = 1
-    is_unicode = 1
-    
-    to_py_function = "PyUnicode_DecodeUTF8"
-    exception_value = "NULL"
-
-    def literal_code(self, value):
-        assert isinstance(value, str)
-        return '"%s"' % Utils.escape_byte_string(value)
-
-
 class CCharArrayType(CStringType, CArrayType):
    #  C 'char []' type.
    
@@ -1022,16 +1008,6 @@ class CCharArrayType(CStringType, CArrayType):
        CArrayType.__init__(self, c_char_type, size)
    

-class CUTF8CharArrayType(CUTF8StringType, CArrayType):
-    #  C 'char []' type.
-    
-    parsetuple_format = "s"
-    pymemberdef_typecode = "T_STRING_INPLACE"
-    
-    def __init__(self, size):
-        CArrayType.__init__(self, c_char_type, size)
-    
-
 class CCharPtrType(CStringType, CPtrType):
    # C 'char *' type.
    
@@ -1042,6 +1018,29 @@ class CCharPtrType(CStringType, CPtrType):
        CPtrType.__init__(self, c_char_type)


+class UnicodeType(BuiltinObjectType):
+    #  The Python unicode type.
+    #  Declared in C as a char array (see declaration_code).
+    is_string = 1
+    is_unicode = 1
+    parsetuple_format = "O"
+
+    def __init__(self):
+        BuiltinObjectType.__init__(self, "unicode", "PyUnicodeObject")
+
+    def literal_code(self, value):
+        # The value must be a str instance before it is escaped.
+        assert isinstance(value, str)
+        escaped = Utils.escape_byte_string(value)
+        return '"%s"' % escaped
+
+    def declaration_code(self, entity_code,
+            for_display = 0, dll_linkage = None, pyrex = 0):
+        if not (pyrex or for_display):
+            return "%s %s[]" % (public_decl("char", dll_linkage), entity_code)
+        return self.base_declaration_code(self.name, entity_code)
+
+
 class ErrorType(PyrexType):
    # Used to prevent propagation of error messages.
    
@@ -1106,7 +1105,7 @@ c_longdouble_type =  CFloatType(8, typestring="g")

 c_null_ptr_type =     CNullPtrType(c_void_type)
 c_char_array_type =   CCharArrayType(None)
-c_utf8_char_array_type =   CUTF8CharArrayType(None)
+c_unicode_type =      UnicodeType()
 c_char_ptr_type =     CCharPtrType()
 c_char_ptr_ptr_type = CPtrType(c_char_ptr_type)
 c_py_ssize_t_ptr_type =  CPtrType(c_py_ssize_t_type)

--- a/Cython/Compiler/Symtab.py
+++ b/Cython/Compiler/Symtab.py
@@ -504,7 +504,7 @@ class Scope:
        else:
            cname = self.new_const_cname()
        if value.is_unicode:
-            c_type = PyrexTypes.c_utf8_char_array_type
+            c_type = PyrexTypes.c_unicode_type
            value = value.utf8encode()
        else:
            c_type = PyrexTypes.c_char_array_type