Commit 71a59940 authored by Stefan Behnel

support for long unicode escapes ('\U...')

fixed unicode escape handling in byte strings
unescape \xXY in string literals, because in C such an escape can conflict with trailing hex digits - output string escaping will do the right thing
parent fcdc2a72
...@@ -64,7 +64,8 @@ def make_lexicon(): ...@@ -64,7 +64,8 @@ def make_lexicon():
two_hex = hexdigit + hexdigit two_hex = hexdigit + hexdigit
four_hex = two_hex + two_hex four_hex = two_hex + two_hex
escapeseq = Str("\\") + (two_oct | three_oct | two_hex | escapeseq = Str("\\") + (two_oct | three_oct | two_hex |
Str('u') + four_hex | Str('x') + two_hex | AnyChar) Str('u') + four_hex | Str('x') + two_hex |
Str('U') + four_hex + four_hex | AnyChar)
deco = Str("@") deco = Str("@")
bra = Any("([{") bra = Any("([{")
......
...@@ -2,7 +2,9 @@ ...@@ -2,7 +2,9 @@
# Pyrex Parser # Pyrex Parser
# #
import os, re import os
import re
import sys
from types import ListType, TupleType from types import ListType, TupleType
from Scanning import PyrexScanner, FileSourceDescriptor from Scanning import PyrexScanner, FileSourceDescriptor
import Nodes import Nodes
...@@ -604,18 +606,17 @@ def p_string_literal(s): ...@@ -604,18 +606,17 @@ def p_string_literal(s):
chars.append(systr) chars.append(systr)
elif c == '\n': elif c == '\n':
pass pass
elif c in 'ux': elif c in 'Uux':
if kind == 'u': if kind == 'u' or c == 'x':
try: chrval = int(systr[2:], 16)
chars.append( if chrval > sys.maxunicode:
systr.encode("ASCII").decode('unicode_escape'))
except UnicodeDecodeError:
s.error("Invalid unicode escape '%s'" % systr, s.error("Invalid unicode escape '%s'" % systr,
pos = pos) pos = pos)
elif c == 'x': strval = unichr(chrval)
chars.append('\\x0' + systr[2:])
else: else:
chars.append(systr) # unicode escapes in plain byte strings are not unescaped
strval = systr
chars.append(strval.replace('\\', '\\\\'))
else: else:
chars.append(r'\\' + systr[1:]) chars.append(r'\\' + systr[1:])
elif sy == 'NEWLINE': elif sy == 'NEWLINE':
......
# Regression test for escape sequences in string literals: the doctest below
# compares literals evaluated by the interpreter running the doctest
# (``py_strings``) against the module-level ``c_strings`` list, entry by entry.
# NOTE(review): presumably this module is compiled by the compiler under test,
# so ``c_strings`` holds that compiler's reading of the same literals -- confirm
# against the test runner.
#
# The doctest text is runtime data; the doubled backslashes keep the escape
# sequences literal inside this (non-raw) docstring so that doctest sees
# e.g. '\x1234' rather than the already-decoded characters.
__doc__ = u"""
>>> py_strings = [
...     '\\x1234',
...     '\\x0A12\\x0C34',
...     '\\x0A57',
...     'abc\\x12def',
...     u'\\u1234',
...     u'\\U00041234',
...     b'\\u1234',
...     b'\\U00041234',
...     ]
>>> for i, (py_string, c_string) in enumerate(zip(py_strings, c_strings)):
...     assert py_string == c_string, "%d: %r != %r" % (i, py_string, c_string)
"""

import sys

# One doctest source serves both major Python versions: on Python 2 the b''
# prefix is removed from the doctest literals, on Python 3 the u'' prefix is
# removed instead.  The leading space in the search string restricts the
# replacement to literal prefixes that follow whitespace.
if sys.version_info[0] < 3:
    __doc__ = __doc__.replace(" b'", " '")
else:
    __doc__ = __doc__.replace(" u'", " '")

# Reference literals, in the same order as ``py_strings`` in the doctest.
# The last two entries are byte strings containing \u / \U sequences, which
# are expected to stay as written (not be unescaped) in byte strings.
c_strings = [
    '\x1234',
    '\x0A12\x0C34',
    '\x0A57',
    'abc\x12def',
    u'\u1234',
    u'\U00041234',
    b'\u1234',
    b'\U00041234',
]
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment