Commit 62fc87e0 authored by Stefan Behnel

unescape all string content in the parser and escape it on the way out

Otherwise, different ways of spelling the same special character may or may not end up correctly escaped in the generated C file.
parent f6df9115
...@@ -642,7 +642,13 @@ class CharNode(ConstNode): ...@@ -642,7 +642,13 @@ class CharNode(ConstNode):
return ord(self.value) return ord(self.value)
def calculate_result_code(self): def calculate_result_code(self):
return "'%s'" % self.value if self.value == "'":
return r"'\''"
char = ord(self.value)
if char < 32:
return "'\\x%02X'" % char
else:
return "'%s'" % self.value
class IntNode(ConstNode): class IntNode(ConstNode):
......
...@@ -585,25 +585,28 @@ def p_string_literal(s): ...@@ -585,25 +585,28 @@ def p_string_literal(s):
sy = s.sy sy = s.sy
#print "p_string_literal: sy =", sy, repr(s.systring) ### #print "p_string_literal: sy =", sy, repr(s.systring) ###
if sy == 'CHARS': if sy == 'CHARS':
systr = s.systring chars.append(s.systring)
if len(systr) == 1 and systr in "'\"\n":
chars.append('\\')
chars.append(systr)
elif sy == 'ESCAPE': elif sy == 'ESCAPE':
systr = s.systring systr = s.systring
if is_raw: if is_raw:
if systr == '\\\n': if systr == '\\\n':
chars.append(r'\\\n') chars.append('\n')
elif systr == r'\"': elif systr == '\\\"':
chars.append(r'\\\"') chars.append('"')
elif systr == r'\\': elif systr == '\\\'':
chars.append(r'\\\\') chars.append("'")
elif systr == '\\\\':
chars.append('\\')
else: else:
chars.append('\\' + systr) chars.append(systr)
else: else:
c = systr[1] c = systr[1]
if c in "'\"\\abfnrtv01234567": if c in "01234567":
chars.append(systr) chars.append(chr(int(systr[1:])))
elif c in "'\"\\":
chars.append(c)
elif c in "abfnrtv":
chars.append(Utils.char_from_escape_sequence(systr))
elif c == '\n': elif c == '\n':
pass pass
elif c in 'Uux': elif c in 'Uux':
...@@ -616,11 +619,11 @@ def p_string_literal(s): ...@@ -616,11 +619,11 @@ def p_string_literal(s):
else: else:
# unicode escapes in plain byte strings are not unescaped # unicode escapes in plain byte strings are not unescaped
strval = systr strval = systr
chars.append(strval.replace('\\', '\\\\')) chars.append(strval)
else: else:
chars.append(r'\\' + systr[1:]) chars.append(r'\\' + systr[1:])
elif sy == 'NEWLINE': elif sy == 'NEWLINE':
chars.append(r'\n') chars.append('\n')
elif sy == 'END_STRING': elif sy == 'END_STRING':
break break
elif sy == 'EOF': elif sy == 'EOF':
...@@ -629,8 +632,11 @@ def p_string_literal(s): ...@@ -629,8 +632,11 @@ def p_string_literal(s):
s.error( s.error(
"Unexpected token %r:%r in string literal" % "Unexpected token %r:%r in string literal" %
(sy, s.systring)) (sy, s.systring))
string = u''.join(chars)
if kind == 'c' and len(string) != 1:
error(pos, u"invalid character literal: %r" % string)
s.next() s.next()
value = Utils.EncodedString( u''.join(chars) ) value = Utils.EncodedString(string)
if kind != 'u': if kind != 'u':
value.encoding = s.source_encoding value.encoding = s.source_encoding
#print "p_string_literal: value =", repr(value) ### #print "p_string_literal: value =", repr(value) ###
......
...@@ -99,25 +99,39 @@ class EncodedString(unicode): ...@@ -99,25 +99,39 @@ class EncodedString(unicode):
# return unicode.__eq__(self, other) and \ # return unicode.__eq__(self, other) and \
# getattr(other, 'encoding', '') == self.encoding # getattr(other, 'encoding', '') == self.encoding
def _to_oct_sequence(s): char_from_escape_sequence = {
r'\a' : '\a',
r'\b' : '\b',
r'\f' : '\f',
r'\n' : '\n',
r'\r' : '\r',
r'\t' : '\t',
r'\v' : '\v',
}.get
def _to_escape_sequence(s):
if s in '\n\r\t': if s in '\n\r\t':
return repr(s)[1:-1] return repr(s)[1:-1]
elif s == '"':
return r'\"'
else: else:
# oct passes much better than hex
return ''.join(['\\%03o' % ord(c) for c in s]) return ''.join(['\\%03o' % ord(c) for c in s])
_c_special = ('\0', '\n','\r','\t', '??', '<:', ':>', '<%', '%>', '%:', '%:') _c_special = ('\0', '\n', '\r', '\t', '??', '"')
_c_special_replacements = zip(_c_special, map(_to_oct_sequence, _c_special)) _c_special_replacements = zip(_c_special, map(_to_escape_sequence, _c_special))
def _build_special_test(): def _build_specials_test():
subexps = [] subexps = []
for special in _c_special: for special in _c_special:
regexp = ''.join(['[%s]' % c for c in special]) regexp = ''.join(['[%s]' % c for c in special])
subexps.append(regexp) subexps.append(regexp)
return re.compile('(' + '|'.join(subexps) + ')').search return re.compile('|'.join(subexps)).search
_has_specials = _build_special_test() _has_specials = _build_specials_test()
def escape_byte_string(s): def escape_byte_string(s):
s = s.replace('\\', '\\\\')
if _has_specials(s): if _has_specials(s):
for special, replacement in _c_special_replacements: for special, replacement in _c_special_replacements:
s = s.replace(special, replacement) s = s.replace(special, replacement)
......
__doc__ = u"""
>>> s = test()
>>> assert s == ''.join([chr(i) for i in range(1,49)]), s
"""
def test():
# Regression test for C character literals (c'...') written as escape
# sequences: every slot of the buffer is assigned from its own literal.
# The module doctest expects the returned value to equal
# ''.join([chr(i) for i in range(1,49)]).
cdef char s[50]
# Slot 0 uses the short '\0' spelling of the zero character; slot 49 below
# uses the '\x00' spelling of the same value.
s[ 0] = c'\0'
# Slots 1..48 hold the values 1..48, each spelled as a two-digit hex escape.
# Values below 0x20 are control characters.
s[ 1] = c'\x01'
s[ 2] = c'\x02'
s[ 3] = c'\x03'
s[ 4] = c'\x04'
s[ 5] = c'\x05'
s[ 6] = c'\x06'
s[ 7] = c'\x07'
s[ 8] = c'\x08'
s[ 9] = c'\x09'
s[10] = c'\x0A'
s[11] = c'\x0B'
s[12] = c'\x0C'
s[13] = c'\x0D'
s[14] = c'\x0E'
s[15] = c'\x0F'
s[16] = c'\x10'
s[17] = c'\x11'
s[18] = c'\x12'
s[19] = c'\x13'
s[20] = c'\x14'
s[21] = c'\x15'
s[22] = c'\x16'
s[23] = c'\x17'
s[24] = c'\x18'
s[25] = c'\x19'
s[26] = c'\x1A'
s[27] = c'\x1B'
s[28] = c'\x1C'
s[29] = c'\x1D'
s[30] = c'\x1E'
s[31] = c'\x1F'
# From 0x20 (space) onwards the values are printable ASCII characters.
s[32] = c'\x20'
s[33] = c'\x21'
# 0x22 is the double-quote character.
s[34] = c'\x22'
s[35] = c'\x23'
s[36] = c'\x24'
s[37] = c'\x25'
s[38] = c'\x26'
# 0x27 is the single-quote character, i.e. the delimiter of a char literal.
s[39] = c'\x27'
s[40] = c'\x28'
s[41] = c'\x29'
s[42] = c'\x2A'
s[43] = c'\x2B'
s[44] = c'\x2C'
s[45] = c'\x2D'
s[46] = c'\x2E'
s[47] = c'\x2F'
s[48] = c'\x30'
# Zero byte after the 48 payload values.
# NOTE(review): presumably this serves as the C string terminator when the
# returned pointer is converted to a Python string - confirm.
s[49] = c'\x00'
# Both spellings of the zero character must compare equal.
assert s[ 0] == c'\x00'
assert s[49] == c'\0'
# Skip the leading zero byte in slot 0 and return from slot 1 onwards.
return &s[1]
...@@ -4,6 +4,11 @@ __doc__ = u""" ...@@ -4,6 +4,11 @@ __doc__ = u"""
... b'\\x1234', ... b'\\x1234',
... b'\\x0A12\\x0C34', ... b'\\x0A12\\x0C34',
... b'\\x0A57', ... b'\\x0A57',
... b'\\x0A',
... b'\\'',
... b"\\'",
... b"\\"",
... b'\\"',
... b'abc\\x12def', ... b'abc\\x12def',
... u'\\u1234', ... u'\\u1234',
... u'\\U00001234', ... u'\\U00001234',
...@@ -28,14 +33,19 @@ __doc__ = u""" ...@@ -28,14 +33,19 @@ __doc__ = u"""
import sys import sys
if sys.version_info[0] < 3: if sys.version_info[0] < 3:
__doc__ = __doc__.replace(u" b'", u" '") __doc__ = __doc__.replace(u" b'", u" '").replace(u' b"', u' "')
else: else:
__doc__ = __doc__.replace(u" u'", u" '") __doc__ = __doc__.replace(u" u'", u" '").replace(u' u"', u' "')
c_strings = [ c_strings = [
(b'\x1234', 3), (b'\x1234', 3),
(b'\x0A12\x0C34', 6), (b'\x0A12\x0C34', 6),
(b'\x0A57', 3), (b'\x0A57', 3),
(b'\x0A', 1),
(b'\'', 1),
(b"\'", 1),
(b"\"", 1),
(b'\"', 1),
(b'abc\x12def', 7), (b'abc\x12def', 7),
(u'\u1234', 1), (u'\u1234', 1),
(u'\U00001234', 1), (u'\U00001234', 1),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment