Commit 62fc87e0 authored by Stefan Behnel

Unescape all string content in the parser and escape it on the way out.

Otherwise, different ways of spelling the same special character can end up either correctly escaped or unescaped in the C file.
parent f6df9115
......@@ -642,7 +642,13 @@ class CharNode(ConstNode):
return ord(self.value)
def calculate_result_code(self):
    """Return the C character literal that represents ``self.value``.

    ``self.value`` is expected to be a single, already unescaped
    character.  Characters that cannot appear verbatim between single
    quotes in C source are escaped here:

    * the single quote becomes ``'\\''``
    * the backslash becomes ``'\\\\'``
    * control characters (code points below 32) become two-digit
      hex escapes such as ``'\\x0A'``

    Everything else is emitted unchanged between single quotes.
    """
    value = self.value
    if value == "'":
        return r"'\''"
    if value == "\\":
        # A bare backslash would escape the closing quote and leave
        # the C literal unterminated.
        return r"'\\'"
    code = ord(value)
    if code < 32:
        return "'\\x%02X'" % code
    return "'%s'" % value
class IntNode(ConstNode):
......
......@@ -585,25 +585,28 @@ def p_string_literal(s):
sy = s.sy
#print "p_string_literal: sy =", sy, repr(s.systring) ###
if sy == 'CHARS':
systr = s.systring
if len(systr) == 1 and systr in "'\"\n":
chars.append('\\')
chars.append(systr)
chars.append(s.systring)
elif sy == 'ESCAPE':
systr = s.systring
if is_raw:
if systr == '\\\n':
chars.append(r'\\\n')
elif systr == r'\"':
chars.append(r'\\\"')
elif systr == r'\\':
chars.append(r'\\\\')
chars.append('\n')
elif systr == '\\\"':
chars.append('"')
elif systr == '\\\'':
chars.append("'")
elif systr == '\\\\':
chars.append('\\')
else:
chars.append('\\' + systr)
chars.append(systr)
else:
c = systr[1]
if c in "'\"\\abfnrtv01234567":
chars.append(systr)
if c in "01234567":
chars.append(chr(int(systr[1:])))
elif c in "'\"\\":
chars.append(c)
elif c in "abfnrtv":
chars.append(Utils.char_from_escape_sequence(systr))
elif c == '\n':
pass
elif c in 'Uux':
......@@ -616,11 +619,11 @@ def p_string_literal(s):
else:
# unicode escapes in plain byte strings are not unescaped
strval = systr
chars.append(strval.replace('\\', '\\\\'))
chars.append(strval)
else:
chars.append(r'\\' + systr[1:])
elif sy == 'NEWLINE':
chars.append(r'\n')
chars.append('\n')
elif sy == 'END_STRING':
break
elif sy == 'EOF':
......@@ -629,8 +632,11 @@ def p_string_literal(s):
s.error(
"Unexpected token %r:%r in string literal" %
(sy, s.systring))
string = u''.join(chars)
if kind == 'c' and len(string) != 1:
error(pos, u"invalid character literal: %r" % string)
s.next()
value = Utils.EncodedString( u''.join(chars) )
value = Utils.EncodedString(string)
if kind != 'u':
value.encoding = s.source_encoding
#print "p_string_literal: value =", repr(value) ###
......
......@@ -99,25 +99,39 @@ class EncodedString(unicode):
# return unicode.__eq__(self, other) and \
# getattr(other, 'encoding', '') == self.encoding
def _to_oct_sequence(s):
char_from_escape_sequence = {
r'\a' : '\a',
r'\b' : '\b',
r'\f' : '\f',
r'\n' : '\n',
r'\r' : '\r',
r'\t' : '\t',
r'\v' : '\v',
}.get
def _to_escape_sequence(s):
if s in '\n\r\t':
return repr(s)[1:-1]
elif s == '"':
return r'\"'
else:
# oct passes much better than hex
return ''.join(['\\%03o' % ord(c) for c in s])
_c_special = ('\0', '\n','\r','\t', '??', '<:', ':>', '<%', '%>', '%:', '%:')
_c_special_replacements = zip(_c_special, map(_to_oct_sequence, _c_special))
_c_special = ('\0', '\n', '\r', '\t', '??', '"')
_c_special_replacements = zip(_c_special, map(_to_escape_sequence, _c_special))
def _build_special_test():
def _build_specials_test():
subexps = []
for special in _c_special:
regexp = ''.join(['[%s]' % c for c in special])
subexps.append(regexp)
return re.compile('(' + '|'.join(subexps) + ')').search
return re.compile('|'.join(subexps)).search
_has_specials = _build_special_test()
_has_specials = _build_specials_test()
def escape_byte_string(s):
s = s.replace('\\', '\\\\')
if _has_specials(s):
for special, replacement in _c_special_replacements:
s = s.replace(special, replacement)
......
__doc__ = u"""
>>> s = test()
>>> assert s == ''.join([chr(i) for i in range(1,49)]), s
"""
def test():
# Fill a 50-element C char buffer one character literal at a time and
# return the data that starts at index 1.
cdef char s[50]
# Index 0 is written with the '\0' spelling of the zero byte.
s[ 0] = c'\0'
# Indices 1..48 hold the values 0x01..0x30, each spelled as a two-digit
# hex escape inside a character literal.
s[ 1] = c'\x01'
s[ 2] = c'\x02'
s[ 3] = c'\x03'
s[ 4] = c'\x04'
s[ 5] = c'\x05'
s[ 6] = c'\x06'
s[ 7] = c'\x07'
s[ 8] = c'\x08'
s[ 9] = c'\x09'
s[10] = c'\x0A'
s[11] = c'\x0B'
s[12] = c'\x0C'
s[13] = c'\x0D'
s[14] = c'\x0E'
s[15] = c'\x0F'
s[16] = c'\x10'
s[17] = c'\x11'
s[18] = c'\x12'
s[19] = c'\x13'
s[20] = c'\x14'
s[21] = c'\x15'
s[22] = c'\x16'
s[23] = c'\x17'
s[24] = c'\x18'
s[25] = c'\x19'
s[26] = c'\x1A'
s[27] = c'\x1B'
s[28] = c'\x1C'
s[29] = c'\x1D'
s[30] = c'\x1E'
s[31] = c'\x1F'
s[32] = c'\x20'
s[33] = c'\x21'
s[34] = c'\x22'
s[35] = c'\x23'
s[36] = c'\x24'
s[37] = c'\x25'
s[38] = c'\x26'
s[39] = c'\x27'
s[40] = c'\x28'
s[41] = c'\x29'
s[42] = c'\x2A'
s[43] = c'\x2B'
s[44] = c'\x2C'
s[45] = c'\x2D'
s[46] = c'\x2E'
s[47] = c'\x2F'
s[48] = c'\x30'
# The last slot is zero again, this time with the hex spelling.
# NOTE(review): presumably this acts as the terminator of the returned
# char* data - confirm.
s[49] = c'\x00'
# Each zero byte is checked against the other spelling, so '\0' and
# '\x00' must compare equal.
assert s[ 0] == c'\x00'
assert s[49] == c'\0'
# Start at index 1 to skip the leading zero byte; the module doctest
# compares the result with chr(1)..chr(48).
return &s[1]
......@@ -4,6 +4,11 @@ __doc__ = u"""
... b'\\x1234',
... b'\\x0A12\\x0C34',
... b'\\x0A57',
... b'\\x0A',
... b'\\'',
... b"\\'",
... b"\\"",
... b'\\"',
... b'abc\\x12def',
... u'\\u1234',
... u'\\U00001234',
......@@ -28,14 +33,19 @@ __doc__ = u"""
import sys
if sys.version_info[0] < 3:
__doc__ = __doc__.replace(u" b'", u" '")
__doc__ = __doc__.replace(u" b'", u" '").replace(u' b"', u' "')
else:
__doc__ = __doc__.replace(u" u'", u" '")
__doc__ = __doc__.replace(u" u'", u" '").replace(u' u"', u' "')
c_strings = [
(b'\x1234', 3),
(b'\x0A12\x0C34', 6),
(b'\x0A57', 3),
(b'\x0A', 1),
(b'\'', 1),
(b"\'", 1),
(b"\"", 1),
(b'\"', 1),
(b'abc\x12def', 7),
(u'\u1234', 1),
(u'\U00001234', 1),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment