Rewrite of the string literal handling code

String literals pass through the compiler as follows: - unicode string literals are stored as unicode strings and encoded to UTF-8 on the way out - byte string literals are stored as correctly encoded byte strings by unescaping the source string literal into the corresponding byte sequence. No further encoding is done later on! - char literals are stored as byte strings of length 1. This can be verified by the parser now, e.g. a non-ASCII char literal in UTF-8 source code will result in an error, as it would end up as two or more bytes in the C code, which can no longer be represented as a C char. Storing byte strings is necessary as we otherwise loose the ability to encode byte string literals on the way out. They do not necessarily contain only bytes that fit into the source code encoding as the source can use escape sequences to represent them. Previously, ASCII encoded source code could not contain byte string literals with properly escaped non-ASCII bytes. Another bug that was fixed: in Python, escape sequences behave different in unicode strings (where they represent the character code) and byte strings (where they represent a byte value). Previously, they resulted in the same byte value in Cython code. This is only a problem for non-ASCII escapes, since the character code and the byte value of ASCII characters are identical.

Rewrite of the string literal handling code
String literals pass through the compiler as follows: - unicode string literals are stored as unicode strings and encoded to UTF-8 on the way out - byte string literals are stored as correctly encoded byte strings by unescaping the source string literal into the corresponding byte sequence. No further encoding is done later on! - char literals are stored as byte strings of length 1. This can be verified by the parser now, e.g. a non-ASCII char literal in UTF-8 source code will result in an error, as it would end up as two or more bytes in the C code, which can no longer be represented as a C char. Storing byte strings is necessary as we otherwise loose the ability to encode byte string literals on the way out. They do not necessarily contain only bytes that fit into the source code encoding as the source can use escape sequences to represent them. Previously, ASCII encoded source code could not contain byte string literals with properly escaped non-ASCII bytes. Another bug that was fixed: in Python, escape sequences behave different in unicode strings (where they represent the character code) and byte strings (where they represent a byte value). Previously, they resulted in the same byte value in Cython code. This is only a problem for non-ASCII escapes, since the character code and the byte value of ASCII characters are identical.
2e8a0084 · Stefan Behnel · 7bc8549a · 2e8a0084 · 2e8a0084 · 2e8a0084
Commit 2e8a0084 authored Aug 15, 2008 by Stefan Behnel
15 changed files
--- a/Cython/Compiler/Buffer.py
+++ b/Cython/Compiler/Buffer.py
@@ -3,7 +3,7 @@ from Cython.Compiler.ModuleNode import ModuleNode
 from Cython.Compiler.Nodes import *
 from Cython.Compiler.ExprNodes import *
 from Cython.Compiler.TreeFragment import TreeFragment
-from Cython.Utils import EncodedString
+from Cython.Compiler.StringEncoding import EncodedString
 from Cython.Compiler.Errors import CompileError
 import Interpreter
 import PyrexTypes

--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -6,6 +6,7 @@ import operator
 from string import join
 from Errors import error, warning, InternalError
+import StringEncoding
 import Naming
 from Nodes import Node
 import PyrexTypes
@@ -14,7 +15,6 @@ from Builtin import list_type, tuple_type, dict_type, unicode_type
 import Symtab
 import Options
 from Annotate import AnnotationItem
-from Cython import Utils
 from Cython.Debugging import print_call_chain
 from DebugFlags import debug_disposal_code, debug_temp_alloc, \
@@ -640,10 +640,10 @@ class CharNode(ConstNode):
    type = PyrexTypes.c_char_type
    def compile_time_value(self, denv):
-        return ord(self.value.byteencode())
+        return ord(self.value)
    def calculate_result_code(self):
-        return "'%s'" % Utils.escape_character(self.value.byteencode())
+        return "'%s'" % StringEncoding.escape_character(self.value)
 class IntNode(ConstNode):

--- a/Cython/Compiler/Main.py
+++ b/Cython/Compiler/Main.py
@@ -397,6 +397,8 @@ class Context:
            finally:
                f.close()
        except UnicodeDecodeError, msg:
+            import traceback
+            traceback.print_exc()
            error((source_desc, 0, 0), "Decoding error, missing or incorrect coding=<encoding-name> at top of source (%s)" % msg)
        if Errors.num_errors > 0:
            raise CompileError

--- a/Cython/Compiler/ModuleNode.py
+++ b/Cython/Compiler/ModuleNode.py
@@ -23,7 +23,8 @@ import Version
 from Errors import error, warning
 from PyrexTypes import py_object_type
-from Cython.Utils import open_new_file, replace_suffix, escape_byte_string, EncodedString
+from Cython.Utils import open_new_file, replace_suffix
+from StringEncoding import escape_byte_string, EncodedString
 def check_c_classes(module_node):
@@ -514,9 +515,12 @@ class ModuleNode(Nodes.Node, Nodes.BlockNode):
        code.putln('static const char *%s;' % Naming.filename_cname)
        code.putln('static const char **%s;' % Naming.filetable_cname)
        if env.doc:
+            docstr = env.doc
+            if not isinstance(docstr, str):
+                docstr = docstr.utf8encode()
            code.putln('')
            code.putln('static char %s[] = "%s";' % (
-                    env.doc_cname, escape_byte_string(env.doc.utf8encode())))
+                    env.doc_cname, escape_byte_string(docstr)))
    def generate_extern_c_macro_definition(self, code):
        name = Naming.extern_c_macro

--- a/Cython/Compiler/Nodes.py
+++ b/Cython/Compiler/Nodes.py
@@ -13,7 +13,7 @@ from PyrexTypes import py_object_type, error_type, CTypedefType, CFuncType
 from Symtab import ModuleScope, LocalScope, GeneratorLocalScope, \
    StructOrUnionScope, PyClassScope, CClassScope
 from Cython.Utils import open_new_file, replace_suffix
-from Cython.Utils import EncodedString, escape_byte_string
+from StringEncoding import EncodedString, escape_byte_string
 import Options
 import ControlFlow
@@ -1516,10 +1516,13 @@ class DefNode(FuncDefNode):
        if proto_only:
            return
        if self.entry.doc and Options.docstrings:
+            docstr = self.entry.doc
+            if not isinstance(docstr, str):
+                docstr = docstr.utf8encode()
            code.putln(
                'static char %s[] = "%s";' % (
                    self.entry.doc_cname,
-                    escape_byte_string(self.entry.doc.utf8encode())))
+                    escape_byte_string(docstr)))
        if with_pymethdef:
            code.put(
                "static PyMethodDef %s = " % 

--- a/Cython/Compiler/ParseTreeTransforms.py
+++ b/Cython/Compiler/ParseTreeTransforms.py
@@ -3,7 +3,7 @@ from Cython.Compiler.ModuleNode import ModuleNode
 from Cython.Compiler.Nodes import *
 from Cython.Compiler.ExprNodes import *
 from Cython.Compiler.TreeFragment import TreeFragment
-from Cython.Utils import EncodedString
+from Cython.Compiler.StringEncoding import EncodedString
 from Cython.Compiler.Errors import CompileError
 try:
    set

--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
--- a/Cython/Compiler/PyrexTypes.py
+++ b/Cython/Compiler/PyrexTypes.py
@@ -2,7 +2,7 @@
 #   Pyrex - Types
 #
-from Cython import Utils
+import StringEncoding
 import Naming
 import copy
@@ -1000,7 +1000,7 @@ class CStringType:
    def literal_code(self, value):
        assert isinstance(value, str)
-        return '"%s"' % Utils.escape_byte_string(value)
+        return '"%s"' % StringEncoding.escape_byte_string(value)
 class CUTF8CharArrayType(CStringType, CArrayType):

--- a/Cython/Compiler/Scanning.py
+++ b/Cython/Compiler/Scanning.py
@@ -17,7 +17,7 @@ from Cython.Plex.Errors import UnrecognizedInput
 from Errors import CompileError, error
 from Lexicon import string_prefixes, raw_prefixes, make_lexicon
-from Cython import Utils
+from StringEncoding import EncodedString
 plex_version = getattr(Plex, '_version', None)
 #print "Plex version:", plex_version ###
@@ -413,7 +413,7 @@ class PyrexScanner(Scanner):
            if systring in self.resword_dict:
                sy = systring
            else:
-                systring = Utils.EncodedString(systring)
+                systring = EncodedString(systring)
                systring.encoding = self.source_encoding
        self.sy = sy
        self.systring = systring

--- a/Cython/Compiler/StringEncoding.py
+++ b/Cython/Compiler/StringEncoding.py
+#
+#   Cython -- encoding related tools
+#
+import re
+class UnicodeLiteralBuilder(object):
+    """Assemble a unicode string.
+    """
+    def __init__(self):
+        self.chars = []
+    def append(self, characters):
+        if isinstance(characters, str):
+            # this came from a Py2 string literal in the parser code
+            characters = characters.decode("ASCII")
+        assert isinstance(characters, unicode), str(type(characters))
+        self.chars.append(characters)
+    def append_charval(self, char_number):
+        self.chars.append( unichr(char_number) )
+    def getstring(self):
+        return EncodedString(u''.join(self.chars))
+class BytesLiteralBuilder(object):
+    """Assemble a byte string or char value.
+    """
+    def __init__(self, target_encoding):
+        self.chars = []
+        self.target_encoding = target_encoding
+    def append(self, characters):
+        if isinstance(characters, unicode):
+            characters = characters.encode(self.target_encoding)
+        assert isinstance(characters, str), str(type(characters))
+        self.chars.append(characters)
+    def append_charval(self, char_number):
+        self.chars.append( chr(char_number) )
+    def getstring(self):
+        # this *must* return a byte string! => fix it in Py3k!!
+        s = BytesLiteral(''.join(self.chars))
+        s.encoding = self.target_encoding
+        return s
+    def getchar(self):
+        # this *must* return a byte string! => fix it in Py3k!!
+        return self.getstring()
+class EncodedString(unicode):
+    # unicode string subclass to keep track of the original encoding.
+    # 'encoding' is None for unicode strings and the source encoding
+    # otherwise
+    encoding = None
+    def byteencode(self):
+        assert self.encoding is not None
+        return self.encode(self.encoding)
+    def utf8encode(self):
+        assert self.encoding is None
+        return self.encode("UTF-8")
+    def is_unicode(self):
+        return self.encoding is None
+    is_unicode = property(is_unicode)
+class BytesLiteral(str):
+    # str subclass that is compatible with EncodedString
+    encoding = None
+    def byteencode(self):
+        return str(self)
+    def utf8encode(self):
+        assert False, "this is not a unicode string: %r" % self
+    is_unicode = False
+char_from_escape_sequence = {
+    r'\a' : u'\a',
+    r'\b' : u'\b',
+    r'\f' : u'\f',
+    r'\n' : u'\n',
+    r'\r' : u'\r',
+    r'\t' : u'\t',
+    r'\v' : u'\v',
+    }.get
+def _to_escape_sequence(s):
+    if s in '\n\r\t':
+        return repr(s)[1:-1]
+    elif s == '"':
+        return r'\"'
+    else:
+        # within a character sequence, oct passes much better than hex
+        return ''.join(['\\%03o' % ord(c) for c in s])
+_c_special = ('\0', '\n', '\r', '\t', '??', '"')
+_c_special_replacements = zip(_c_special, map(_to_escape_sequence, _c_special))
+def _build_specials_test():
+    subexps = []
+    for special in _c_special:
+        regexp = ''.join(['[%s]' % c for c in special])
+        subexps.append(regexp)
+    return re.compile('|'.join(subexps)).search
+_has_specials = _build_specials_test()
+def escape_character(c):
+    if c in '\n\r\t\\':
+        return repr(c)[1:-1]
+    elif c == "'":
+        return "\\'"
+    n = ord(c)
+    if n < 32 or n > 127:
+        # hex works well for characters
+        return "\\x%02X" % n
+    else:
+        return c
+def escape_byte_string(s):
+    s = s.replace('\\', '\\\\')
+    if _has_specials(s):
+        for special, replacement in _c_special_replacements:
+            s = s.replace(special, replacement)
+    try:
+        s.decode("ASCII")
+        return s
+    except UnicodeDecodeError:
+        pass
+    l = []
+    append = l.append
+    for c in s:
+        o = ord(c)
+        if o >= 128:
+            append('\\%3o' % o)
+        else:
+            append(c)
+    return ''.join(l)
--- a/Cython/Compiler/Symtab.py
+++ b/Cython/Compiler/Symtab.py
@@ -5,6 +5,7 @@
 import re
 from Cython import Utils
 from Errors import warning, error, InternalError
+from StringEncoding import EncodedString
 import Options
 import Naming
 import PyrexTypes
@@ -684,14 +685,14 @@ class BuiltinScope(Scope):
            utility_code = None):
        # If python_equiv == "*", the Python equivalent has the same name
        # as the entry, otherwise it has the name specified by python_equiv.
-        name = Utils.EncodedString(name)
+        name = EncodedString(name)
        entry = self.declare_cfunction(name, type, None, cname)
        entry.utility_code = utility_code
        if python_equiv:
            if python_equiv == "*":
                python_equiv = name
            else:
-                python_equiv = Utils.EncodedString(python_equiv)
+                python_equiv = EncodedString(python_equiv)
            var_entry = Entry(python_equiv, python_equiv, py_object_type)
            var_entry.is_variable = 1
            var_entry.is_builtin = 1
@@ -699,7 +700,7 @@ class BuiltinScope(Scope):
        return entry
    def declare_builtin_type(self, name, cname):
-        name = Utils.EncodedString(name)
+        name = EncodedString(name)
        type = PyrexTypes.BuiltinObjectType(name, cname)
        type.set_scope(CClassScope(name, outer_scope=None, visibility='extern'))
        self.type_names[name] = 1
@@ -1370,7 +1371,7 @@ class CClassScope(ClassScope):
        if name == "__new__":
            warning(pos, "__new__ method of extension type will change semantics "
                "in a future version of Pyrex and Cython. Use __cinit__ instead.")
-            name = Utils.EncodedString("__cinit__")
+            name = EncodedString("__cinit__")
        entry = self.declare_var(name, py_object_type, pos, visibility='extern')
        special_sig = get_special_method_signature(name)
        if special_sig:
@@ -1387,7 +1388,7 @@ class CClassScope(ClassScope):
    def lookup_here(self, name):
        if name == "__new__":
-            name = Utils.EncodedString("__cinit__")
+            name = EncodedString("__cinit__")
        return ClassScope.lookup_here(self, name)
    def declare_cfunction(self, name, type, pos,

--- a/Cython/Compiler/TypeSlots.py
+++ b/Cython/Compiler/TypeSlots.py
@@ -3,9 +3,9 @@
 #           and associated know-how.
 #
-from Cython import Utils
 import Naming
 import PyrexTypes
+import StringEncoding
 import sys
 class Signature:
@@ -311,7 +311,7 @@ class DocStringSlot(SlotDescriptor):
                doc = scope.doc.utf8encode()
            else:
                doc = scope.doc.byteencode()
-            return '"%s"' % Utils.escape_byte_string(doc)
+            return '"%s"' % StringEncoding.escape_byte_string(doc)
        else:
            return "0"

--- a/Cython/Compiler/Visitor.py
+++ b/Cython/Compiler/Visitor.py
@@ -5,7 +5,7 @@ import inspect
 import Nodes
 import ExprNodes
 import Naming
-from Cython.Utils import EncodedString
+from StringEncoding import EncodedString
 class BasicVisitor(object):
    """A generic visitor base class which can be used for visiting any kind of object."""

--- a/Cython/Utils.py
+++ b/Cython/Utils.py
@@ -40,7 +40,7 @@ def file_newer_than(path, time):
    ftime = modification_time(path)
    return ftime > time
-# support for source file encoding detection and unicode decoding
+# support for source file encoding detection
 def encode_filename(filename):
    if isinstance(filename, unicode):
@@ -77,90 +77,6 @@ def open_source_file(source_filename, mode="rU"):
    encoding = detect_file_encoding(source_filename)
    return codecs.open(source_filename, mode=mode, encoding=encoding)
-class EncodedString(unicode):
-    # unicode string subclass to keep track of the original encoding.
-    # 'encoding' is None for unicode strings and the source encoding
-    # otherwise
-    encoding = None
-    def byteencode(self):
-        assert self.encoding is not None
-        return self.encode(self.encoding)
-    def utf8encode(self):
-        assert self.encoding is None
-        return self.encode("UTF-8")
-    def is_unicode(self):
-        return self.encoding is None
-    is_unicode = property(is_unicode)
-#    def __eq__(self, other):
-#        return unicode.__eq__(self, other) and \
-#            getattr(other, 'encoding', '') == self.encoding
-char_from_escape_sequence = {
-    r'\a' : '\a',
-    r'\b' : '\b',
-    r'\f' : '\f',
-    r'\n' : '\n',
-    r'\r' : '\r',
-    r'\t' : '\t',
-    r'\v' : '\v',
-    }.get
-def _to_escape_sequence(s):
-    if s in '\n\r\t':
-        return repr(s)[1:-1]
-    elif s == '"':
-        return r'\"'
-    else:
-        # within a character sequence, oct passes much better than hex
-        return ''.join(['\\%03o' % ord(c) for c in s])
-_c_special = ('\0', '\n', '\r', '\t', '??', '"')
-_c_special_replacements = zip(_c_special, map(_to_escape_sequence, _c_special))
-def _build_specials_test():
-    subexps = []
-    for special in _c_special:
-        regexp = ''.join(['[%s]' % c for c in special])
-        subexps.append(regexp)
-    return re.compile('|'.join(subexps)).search
-_has_specials = _build_specials_test()
-def escape_character(c):
-    if c in '\n\r\t\\':
-        return repr(c)[1:-1]
-    elif c == "'":
-        return "\\'"
-    elif ord(c) < 32:
-        # hex works well for characters
-        return "\\x%02X" % ord(c)
-    else:
-        return c
-def escape_byte_string(s):
-    s = s.replace('\\', '\\\\')
-    if _has_specials(s):
-        for special, replacement in _c_special_replacements:
-            s = s.replace(special, replacement)
-    try:
-        s.decode("ASCII")
-        return s
-    except UnicodeDecodeError:
-        pass
-    l = []
-    append = l.append
-    for c in s:
-        o = ord(c)
-        if o >= 128:
-            append('\\%3o' % o)
-        else:
-            append(c)
-    return ''.join(l)
 def long_literal(value):
    if isinstance(value, basestring):
        if len(value) < 2:

--- a/tests/run/charencoding.pyx
+++ b/tests/run/charencoding.pyx
+# coding: ASCII
+__doc__ = u"""
+>>> s = test()
+>>> assert s == ''.join([chr(i) for i in range(0x10,0xFF,0x11)] + [chr(0xFF)]), repr(s)
+"""
+def test():
+    cdef char s[17]
+    s[ 0] = c'\x10'
+    s[ 1] = c'\x21'
+    s[ 2] = c'\x32'
+    s[ 3] = c'\x43'
+    s[ 4] = c'\x54'
+    s[ 5] = c'\x65'
+    s[ 6] = c'\x76'
+    s[ 7] = c'\x87'
+    s[ 8] = c'\x98'
+    s[ 9] = c'\xA9'
+    s[10] = c'\xBA'
+    s[11] = c'\xCB'
+    s[12] = c'\xDC'
+    s[13] = c'\xED'
+    s[14] = c'\xFE'
+    s[15] = c'\xFF'
+    s[16] = c'\x00'
+    return s