Commit 2a9d8d45 authored by Stefan Behnel

implement \N{...} Unicode escapes for literals

parent 159a3b9a
......@@ -8,6 +8,8 @@ Cython Changelog
Features added
--------------
* Named Unicode escapes ("\N{...}") are supported.
* Python functions/classes provide the special attribute "__qualname__"
as defined by PEP 3155.
......
......@@ -66,6 +66,7 @@ def make_lexicon():
two_hex = hexdigit + hexdigit
four_hex = two_hex + two_hex
escapeseq = Str("\\") + (two_oct | three_oct |
Str('N{') + Rep(AnyBut('}')) + Str('}') |
Str('u') + four_hex | Str('x') + two_hex |
Str('U') + four_hex + four_hex | AnyChar)
......
......@@ -8,6 +8,7 @@ import cython
cython.declare(Nodes=object, ExprNodes=object, EncodedString=object)
import re
import unicodedata
from Cython.Compiler.Scanning import PyrexScanner, FileSourceDescriptor
import Nodes
......@@ -803,22 +804,26 @@ def p_string_literal(s, kind_override=None):
StringEncoding.char_from_escape_sequence(systr))
elif c == u'\n':
pass
elif c == u'x':
elif c == u'x': # \xXX
if len(systr) == 4:
chars.append_charval( int(systr[2:], 16) )
else:
s.error("Invalid hex escape '%s'" % systr)
elif c in u'Uu':
if kind in ('u', ''):
if len(systr) in (6,10):
elif c in u'NUu' and kind in ('u', ''): # \uxxxx, \Uxxxxxxxx, \N{...}
chrval = -1
if c == u'N':
try:
chrval = ord(unicodedata.lookup(systr[3:-1]))
except KeyError:
s.error("Unknown Unicode character name %r" % systr[3:-1])
elif len(systr) in (6,10):
chrval = int(systr[2:], 16)
if chrval > 1114111: # sys.maxunicode:
s.error("Invalid unicode escape '%s'" % systr)
chrval = -1
else:
s.error("Invalid unicode escape '%s'" % systr)
else:
# unicode escapes in byte strings are not unescaped
chrval = None
if chrval >= 0:
chars.append_uescape(chrval, systr)
else:
chars.append(u'\\' + systr[1:])
......
......@@ -132,9 +132,9 @@ __doc__ = ur"""
>>> len(bytes_uescape)
28
>>> (sys.version_info[0] >= 3 and sys.maxunicode == 1114111 and len(str_uescape) == 3 or
... sys.version_info[0] >= 3 and sys.maxunicode == 65535 and len(str_uescape) == 4 or
... sys.version_info[0] < 3 and len(str_uescape) == 17 or
>>> (sys.version_info[0] >= 3 and sys.maxunicode == 1114111 and len(str_uescape) == 4 or
... sys.version_info[0] >= 3 and sys.maxunicode == 65535 and len(str_uescape) == 5 or
... sys.version_info[0] < 3 and len(str_uescape) == 28 or
... len(str_uescape))
True
>>> (sys.version_info[0] >= 3 and str_uescape[0] == 'c' or
......@@ -143,6 +143,10 @@ __doc__ = ur"""
True
>>> print(str_uescape[-1])
B
>>> (sys.version_info[0] >= 3 and ord(str_uescape[-2]) == 0x2603 or
... sys.version_info[0] < 3 and str_uescape[-12:-1] == b'\\N{SNOWMAN}' or
... sys.version_info[0] >= 3 and ord(str_uescape[-2]) or str_uescape[-12:-1])
True
>>> newlines == "Aaa\n"
True
......@@ -185,7 +189,7 @@ bresc = br'\12\'\"\\'
uresc = ur'\12\'\"\\'
bytes_uescape = b'\u1234\U12345678\u\u1\u12\uX'
str_uescape = '\u0063\U00012345\x42'
str_uescape = '\u0063\U00012345\N{SNOWMAN}\x42'
newlines = "Aaa\n"
......
......@@ -75,6 +75,8 @@ __doc__ = br"""
True
>>> h == u'\\ud800' # unescaped by Python (required by doctest)
True
>>> k == u'\\N{SNOWMAN}' == u'\\u2603'
True
>>> add == u'Søk ik' + u'üÖä' + 'abc'
True
>>> null == u'\\x00' # unescaped by Python (required by doctest)
......@@ -107,6 +109,7 @@ e = u'\x03\x67\xf8\uf8d2Søk ik'
f = u'\xf8'
g = u'\udc00' # lone trail surrogate
h = u'\ud800' # lone lead surrogate
k = u'\N{SNOWMAN}'
add = u'Søk ik' + u'üÖä' + u'abc'
null = u'\x00'
......
......@@ -55,6 +55,8 @@ __doc__ = br"""
True
>>> f == u'\\xf8' # unescaped by Python
True
>>> k == u'ä' == u'\\N{LATIN SMALL LETTER A WITH DIAERESIS}'
True
>>> add == u'Søk ik' + u'üÖä' + 'abc'
True
>>> null == u'\\x00' # unescaped by Python (required by doctest)
......@@ -75,6 +77,7 @@ c = u'Søk ik'
d = u'üÖä'
e = u'\x03\x67\xf8\uf8d2Søk ik'
f = u'\xf8'
k = u'\N{LATIN SMALL LETTER A WITH DIAERESIS}'
add = u'Søk ik' + u'üÖä' + u'abc'
null = u'\x00'
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment