unescape all string content in the parser and escape it on the way out

Otherwise, different ways of spelling the same special character can end up escaped correctly in some cases and incorrectly in others in the C file.

unescape all string content in the parser and escape it on the way out
Otherwise, different ways of spelling the same special character can end up escaped correctly in some cases and incorrectly in others in the C file.
62fc87e0 · Stefan Behnel · f6df9115 · 62fc87e0 · 62fc87e0 · 62fc87e0
Commit 62fc87e0 authored Aug 12, 2008 by Stefan Behnel
5 changed files
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -642,7 +642,13 @@ class CharNode(ConstNode):
        return ord(self.value)
    
    def calculate_result_code(self):
-        return "'%s'" % self.value
+        if self.value == "'":
+            return r"'\''"
+        char = ord(self.value)
+        if char < 32:
+            return "'\\x%02X'" % char
+        else:
+            return "'%s'" % self.value


 class IntNode(ConstNode):

--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -585,25 +585,28 @@ def p_string_literal(s):
        sy = s.sy
        #print "p_string_literal: sy =", sy, repr(s.systring) ###
        if sy == 'CHARS':
-            systr = s.systring
-            if len(systr) == 1 and systr in "'\"\n":
-                chars.append('\\')
-            chars.append(systr)
+            chars.append(s.systring)
        elif sy == 'ESCAPE':
            systr = s.systring
            if is_raw:
                if systr == '\\\n':
-                    chars.append(r'\\\n')
-                elif systr == r'\"':
-                    chars.append(r'\\\"')
-                elif systr == r'\\':
-                    chars.append(r'\\\\')
+                    chars.append('\n')
+                elif systr == '\\\"':
+                    chars.append('"')
+                elif systr == '\\\'':
+                    chars.append("'")
+                elif systr == '\\\\':
+                    chars.append('\\')
                else:
-                    chars.append('\\' + systr)
+                    chars.append(systr)
            else:
                c = systr[1]
-                if c in "'\"\\abfnrtv01234567":
-                    chars.append(systr)
+                if c in "01234567":
+                    chars.append(chr(int(systr[1:])))
+                elif c in "'\"\\":
+                    chars.append(c)
+                elif c in "abfnrtv":
+                    chars.append(Utils.char_from_escape_sequence(systr))
                elif c == '\n':
                    pass
                elif c in 'Uux':
@@ -616,11 +619,11 @@ def p_string_literal(s):
                    else:
                        # unicode escapes in plain byte strings are not unescaped
                        strval = systr
-                    chars.append(strval.replace('\\', '\\\\'))
+                    chars.append(strval)
                else:
                    chars.append(r'\\' + systr[1:])
        elif sy == 'NEWLINE':
-            chars.append(r'\n')
+            chars.append('\n')
        elif sy == 'END_STRING':
            break
        elif sy == 'EOF':
@@ -629,8 +632,11 @@ def p_string_literal(s):
            s.error(
                "Unexpected token %r:%r in string literal" %
                    (sy, s.systring))
+    string = u''.join(chars)
+    if kind == 'c' and len(string) != 1:
+        error(pos, u"invalid character literal: %r" % string)
    s.next()
-    value = Utils.EncodedString( u''.join(chars) )
+    value = Utils.EncodedString(string)
    if kind != 'u':
        value.encoding = s.source_encoding
    #print "p_string_literal: value =", repr(value) ###

--- a/Cython/Utils.py
+++ b/Cython/Utils.py
@@ -99,25 +99,39 @@ class EncodedString(unicode):
 #        return unicode.__eq__(self, other) and \
 #            getattr(other, 'encoding', '') == self.encoding

-def _to_oct_sequence(s):
+char_from_escape_sequence = {
+    r'\a' : '\a',
+    r'\b' : '\b',
+    r'\f' : '\f',
+    r'\n' : '\n',
+    r'\r' : '\r',
+    r'\t' : '\t',
+    r'\v' : '\v',
+    }.get
+
+def _to_escape_sequence(s):
    if s in '\n\r\t':
        return repr(s)[1:-1]
+    elif s == '"':
+        return r'\"'
    else:
+        # oct passes much better than hex
        return ''.join(['\\%03o' % ord(c) for c in s])

-_c_special = ('\0', '\n','\r','\t', '??', '<:', ':>', '<%', '%>', '%:', '%:')
-_c_special_replacements = zip(_c_special, map(_to_oct_sequence, _c_special))
+_c_special = ('\0', '\n', '\r', '\t', '??', '"')
+_c_special_replacements = zip(_c_special, map(_to_escape_sequence, _c_special))

-def _build_special_test():
+def _build_specials_test():
    subexps = []
    for special in _c_special:
        regexp = ''.join(['[%s]' % c for c in special])
        subexps.append(regexp)
-    return re.compile('(' + '|'.join(subexps) + ')').search
+    return re.compile('|'.join(subexps)).search

-_has_specials = _build_special_test()
+_has_specials = _build_specials_test()

 def escape_byte_string(s):
+    s = s.replace('\\', '\\\\')
    if _has_specials(s):
        for special, replacement in _c_special_replacements:
            s = s.replace(special, replacement)

--- a/tests/run/charescape.pyx
+++ b/tests/run/charescape.pyx
+__doc__ = u"""
+>>> s = test()
+>>> assert s == ''.join([chr(i) for i in range(1,49)]), s
+"""
+
+def test():
+    cdef char s[50]
+
+    s[ 0] = c'\0'
+    s[ 1] = c'\x01'
+    s[ 2] = c'\x02'
+    s[ 3] = c'\x03'
+    s[ 4] = c'\x04'
+    s[ 5] = c'\x05'
+    s[ 6] = c'\x06'
+    s[ 7] = c'\x07'
+    s[ 8] = c'\x08'
+    s[ 9] = c'\x09'
+    s[10] = c'\x0A'
+    s[11] = c'\x0B'
+    s[12] = c'\x0C'
+    s[13] = c'\x0D'
+    s[14] = c'\x0E'
+    s[15] = c'\x0F'
+    s[16] = c'\x10'
+    s[17] = c'\x11'
+    s[18] = c'\x12'
+    s[19] = c'\x13'
+    s[20] = c'\x14'
+    s[21] = c'\x15'
+    s[22] = c'\x16'
+    s[23] = c'\x17'
+    s[24] = c'\x18'
+    s[25] = c'\x19'
+    s[26] = c'\x1A'
+    s[27] = c'\x1B'
+    s[28] = c'\x1C'
+    s[29] = c'\x1D'
+    s[30] = c'\x1E'
+    s[31] = c'\x1F'
+    s[32] = c'\x20'
+    s[33] = c'\x21'
+    s[34] = c'\x22'
+    s[35] = c'\x23'
+    s[36] = c'\x24'
+    s[37] = c'\x25'
+    s[38] = c'\x26'
+    s[39] = c'\x27'
+    s[40] = c'\x28'
+    s[41] = c'\x29'
+    s[42] = c'\x2A'
+    s[43] = c'\x2B'
+    s[44] = c'\x2C'
+    s[45] = c'\x2D'
+    s[46] = c'\x2E'
+    s[47] = c'\x2F'
+    s[48] = c'\x30'
+
+    s[49] = c'\x00'
+
+    assert s[ 0] == c'\x00'
+    assert s[49] == c'\0'
+
+    return &s[1]
--- a/tests/run/strescapes.pyx
+++ b/tests/run/strescapes.pyx
@@ -4,6 +4,11 @@ __doc__ = u"""
 ... b'\\x1234',
 ... b'\\x0A12\\x0C34',
 ... b'\\x0A57',
+... b'\\x0A',
+... b'\\'',
+... b"\\'",
+... b"\\"",
+... b'\\"',
 ... b'abc\\x12def',
 ... u'\\u1234',
 ... u'\\U00001234',
@@ -28,14 +33,19 @@ __doc__ = u"""

 import sys
 if sys.version_info[0] < 3:
-    __doc__ = __doc__.replace(u" b'", u" '")
+    __doc__ = __doc__.replace(u" b'", u" '").replace(u' b"', u' "')
 else:
-    __doc__ = __doc__.replace(u" u'", u" '")
+    __doc__ = __doc__.replace(u" u'", u" '").replace(u' u"', u' "')

 c_strings = [
 (b'\x1234', 3),
 (b'\x0A12\x0C34', 6),
 (b'\x0A57', 3),
+(b'\x0A', 1),
+(b'\'', 1),
+(b"\'", 1),
+(b"\"", 1),
+(b'\"', 1),
 (b'abc\x12def', 7),
 (u'\u1234', 1),
 (u'\U00001234', 1),