support for long unicode escapes ('\U...')

fixed unicode escape handling in byte strings; unescape \xXY in string literals, as C allows it to conflict with trailing hex numbers - output string escaping will do the right thing

support for long unicode escapes ('\U...')
fixed unicode escape handling in byte strings; unescape \xXY in string literals, as C allows it to conflict with trailing hex numbers - output string escaping will do the right thing
71a59940 · Stefan Behnel · fcdc2a72 · 71a59940 · 71a59940 · 71a59940
Commit 71a59940 authored Aug 10, 2008 by Stefan Behnel
Showing with 47 additions and 11 deletions

Cython/Compiler/Lexicon.py Cython/Compiler/Lexicon.py +2 -1

Cython/Compiler/Parsing.py Cython/Compiler/Parsing.py +11 -10

tests/run/strescapes.pyx tests/run/strescapes.pyx +34 -0

No files found.
--- a/Cython/Compiler/Lexicon.py
+++ b/Cython/Compiler/Lexicon.py
@@ -64,7 +64,8 @@ def make_lexicon():
    two_hex = hexdigit + hexdigit
    four_hex = two_hex + two_hex
    escapeseq = Str("\\") + (two_oct | three_oct | two_hex |
-                             Str('u') + four_hex | Str('x') + two_hex | AnyChar)
+                             Str('u') + four_hex | Str('x') + two_hex |
+                             Str('U') + four_hex + four_hex | AnyChar)
    deco = Str("@")
    bra = Any("([{")

--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -2,7 +2,9 @@
 #   Pyrex Parser
 #
-import os, re
+import os
+import re
+import sys
 from types import ListType, TupleType
 from Scanning import PyrexScanner, FileSourceDescriptor
 import Nodes
@@ -604,18 +606,17 @@ def p_string_literal(s):
                    chars.append(systr)
                elif c == '\n':
                    pass
-                elif c in 'ux':
+                elif c in 'Uux':
-                    if kind == 'u':
+                    if kind == 'u' or c == 'x':
-                        try:
+                        chrval = int(systr[2:], 16)
-                            chars.append(
+                        if chrval > sys.maxunicode:
-                                systr.encode("ASCII").decode('unicode_escape'))
-                        except UnicodeDecodeError:
                            s.error("Invalid unicode escape '%s'" % systr,
                                    pos = pos)
-                    elif c == 'x':
+                        strval = unichr(chrval)
-                        chars.append('\\x0' + systr[2:])
                    else:
-                        chars.append(systr)
+                        # unicode escapes in plain byte strings are not unescaped
+                        strval = systr
+                    chars.append(strval.replace('\\', '\\\\'))
                else:
                    chars.append(r'\\' + systr[1:])
        elif sy == 'NEWLINE':

--- a/tests/run/strescapes.pyx
+++ b/tests/run/strescapes.pyx
+__doc__ = u"""
+>>> py_strings = [
+... '\\x1234',
+... '\\x0A12\\x0C34',
+... '\\x0A57',
+... 'abc\\x12def',
+... u'\\u1234',
+... u'\\U00041234',
+... b'\\u1234',
+... b'\\U00041234',
+... ]
+>>> for i, (py_string, c_string) in enumerate(zip(py_strings, c_strings)):
+...     assert py_string == c_string, "%d: %r != %r" % (i, py_string, c_string)
+"""
+import sys
+if sys.version_info[0] < 3:
+    __doc__ = __doc__.replace(" b'", " '")
+else:
+    __doc__ = __doc__.replace(" u'", " '")
+c_strings = [
+'\x1234',
+'\x0A12\x0C34',
+'\x0A57',
+'abc\x12def',
+u'\u1234',
+u'\U00041234',
+b'\u1234',
+b'\U00041234',
+]