Commit 2a9d8d45 authored by Stefan Behnel

implement \N{...} Unicode escapes for literals

parent 159a3b9a
...@@ -8,6 +8,8 @@ Cython Changelog ...@@ -8,6 +8,8 @@ Cython Changelog
Features added Features added
-------------- --------------
* Named Unicode escapes ("\N{...}") are supported.
* Python functions/classes provide the special attribute "__qualname__" * Python functions/classes provide the special attribute "__qualname__"
as defined by PEP 3155. as defined by PEP 3155.
......
...@@ -66,6 +66,7 @@ def make_lexicon(): ...@@ -66,6 +66,7 @@ def make_lexicon():
two_hex = hexdigit + hexdigit two_hex = hexdigit + hexdigit
four_hex = two_hex + two_hex four_hex = two_hex + two_hex
escapeseq = Str("\\") + (two_oct | three_oct | escapeseq = Str("\\") + (two_oct | three_oct |
Str('N{') + Rep(AnyBut('}')) + Str('}') |
Str('u') + four_hex | Str('x') + two_hex | Str('u') + four_hex | Str('x') + two_hex |
Str('U') + four_hex + four_hex | AnyChar) Str('U') + four_hex + four_hex | AnyChar)
......
...@@ -8,6 +8,7 @@ import cython ...@@ -8,6 +8,7 @@ import cython
cython.declare(Nodes=object, ExprNodes=object, EncodedString=object) cython.declare(Nodes=object, ExprNodes=object, EncodedString=object)
import re import re
import unicodedata
from Cython.Compiler.Scanning import PyrexScanner, FileSourceDescriptor from Cython.Compiler.Scanning import PyrexScanner, FileSourceDescriptor
import Nodes import Nodes
...@@ -803,23 +804,27 @@ def p_string_literal(s, kind_override=None): ...@@ -803,23 +804,27 @@ def p_string_literal(s, kind_override=None):
StringEncoding.char_from_escape_sequence(systr)) StringEncoding.char_from_escape_sequence(systr))
elif c == u'\n': elif c == u'\n':
pass pass
elif c == u'x': elif c == u'x': # \xXX
if len(systr) == 4: if len(systr) == 4:
chars.append_charval( int(systr[2:], 16) ) chars.append_charval( int(systr[2:], 16) )
else: else:
s.error("Invalid hex escape '%s'" % systr) s.error("Invalid hex escape '%s'" % systr)
elif c in u'Uu': elif c in u'NUu' and kind in ('u', ''): # \uxxxx, \Uxxxxxxxx, \N{...}
if kind in ('u', ''): chrval = -1
if len(systr) in (6,10): if c == u'N':
chrval = int(systr[2:], 16) try:
if chrval > 1114111: # sys.maxunicode: chrval = ord(unicodedata.lookup(systr[3:-1]))
s.error("Invalid unicode escape '%s'" % systr) except KeyError:
else: s.error("Unknown Unicode character name %r" % systr[3:-1])
elif len(systr) in (6,10):
chrval = int(systr[2:], 16)
if chrval > 1114111: # sys.maxunicode:
s.error("Invalid unicode escape '%s'" % systr) s.error("Invalid unicode escape '%s'" % systr)
chrval = -1
else: else:
# unicode escapes in byte strings are not unescaped s.error("Invalid unicode escape '%s'" % systr)
chrval = None if chrval >= 0:
chars.append_uescape(chrval, systr) chars.append_uescape(chrval, systr)
else: else:
chars.append(u'\\' + systr[1:]) chars.append(u'\\' + systr[1:])
if is_python3_source and not has_non_ASCII_literal_characters \ if is_python3_source and not has_non_ASCII_literal_characters \
......
...@@ -132,9 +132,9 @@ __doc__ = ur""" ...@@ -132,9 +132,9 @@ __doc__ = ur"""
>>> len(bytes_uescape) >>> len(bytes_uescape)
28 28
>>> (sys.version_info[0] >= 3 and sys.maxunicode == 1114111 and len(str_uescape) == 3 or >>> (sys.version_info[0] >= 3 and sys.maxunicode == 1114111 and len(str_uescape) == 4 or
... sys.version_info[0] >= 3 and sys.maxunicode == 65535 and len(str_uescape) == 4 or ... sys.version_info[0] >= 3 and sys.maxunicode == 65535 and len(str_uescape) == 5 or
... sys.version_info[0] < 3 and len(str_uescape) == 17 or ... sys.version_info[0] < 3 and len(str_uescape) == 28 or
... len(str_uescape)) ... len(str_uescape))
True True
>>> (sys.version_info[0] >= 3 and str_uescape[0] == 'c' or >>> (sys.version_info[0] >= 3 and str_uescape[0] == 'c' or
...@@ -143,6 +143,10 @@ __doc__ = ur""" ...@@ -143,6 +143,10 @@ __doc__ = ur"""
True True
>>> print(str_uescape[-1]) >>> print(str_uescape[-1])
B B
>>> (sys.version_info[0] >= 3 and ord(str_uescape[-2]) == 0x2603 or
... sys.version_info[0] < 3 and str_uescape[-12:-1] == b'\\N{SNOWMAN}' or
... sys.version_info[0] >= 3 and ord(str_uescape[-2]) or str_uescape[-12:-1])
True
>>> newlines == "Aaa\n" >>> newlines == "Aaa\n"
True True
...@@ -185,7 +189,7 @@ bresc = br'\12\'\"\\' ...@@ -185,7 +189,7 @@ bresc = br'\12\'\"\\'
uresc = ur'\12\'\"\\' uresc = ur'\12\'\"\\'
bytes_uescape = b'\u1234\U12345678\u\u1\u12\uX' bytes_uescape = b'\u1234\U12345678\u\u1\u12\uX'
str_uescape = '\u0063\U00012345\x42' str_uescape = '\u0063\U00012345\N{SNOWMAN}\x42'
newlines = "Aaa\n" newlines = "Aaa\n"
......
...@@ -75,6 +75,8 @@ __doc__ = br""" ...@@ -75,6 +75,8 @@ __doc__ = br"""
True True
>>> h == u'\\ud800' # unescaped by Python (required by doctest) >>> h == u'\\ud800' # unescaped by Python (required by doctest)
True True
>>> k == u'\\N{SNOWMAN}' == u'\\u2603'
True
>>> add == u'Søk ik' + u'üÖä' + 'abc' >>> add == u'Søk ik' + u'üÖä' + 'abc'
True True
>>> null == u'\\x00' # unescaped by Python (required by doctest) >>> null == u'\\x00' # unescaped by Python (required by doctest)
...@@ -107,6 +109,7 @@ e = u'\x03\x67\xf8\uf8d2Søk ik' ...@@ -107,6 +109,7 @@ e = u'\x03\x67\xf8\uf8d2Søk ik'
f = u'\xf8' f = u'\xf8'
g = u'\udc00' # lone trail surrogate g = u'\udc00' # lone trail surrogate
h = u'\ud800' # lone lead surrogate h = u'\ud800' # lone lead surrogate
k = u'\N{SNOWMAN}'
add = u'Søk ik' + u'üÖä' + u'abc' add = u'Søk ik' + u'üÖä' + u'abc'
null = u'\x00' null = u'\x00'
......
...@@ -55,6 +55,8 @@ __doc__ = br""" ...@@ -55,6 +55,8 @@ __doc__ = br"""
True True
>>> f == u'\\xf8' # unescaped by Python >>> f == u'\\xf8' # unescaped by Python
True True
>>> k == u'ä' == u'\\N{LATIN SMALL LETTER A WITH DIAERESIS}'
True
>>> add == u'Sk ik' + u'' + 'abc' >>> add == u'Sk ik' + u'' + 'abc'
True True
>>> null == u'\\x00' # unescaped by Python (required by doctest) >>> null == u'\\x00' # unescaped by Python (required by doctest)
...@@ -75,6 +77,7 @@ c = u'S ...@@ -75,6 +77,7 @@ c = u'S
d = u'' d = u''
e = u'\x03\x67\xf8\uf8d2Sk ik' e = u'\x03\x67\xf8\uf8d2Sk ik'
f = u'\xf8' f = u'\xf8'
k = u'\N{LATIN SMALL LETTER A WITH DIAERESIS}'
add = u'Sk ik' + u'' + u'abc' add = u'Sk ik' + u'' + u'abc'
null = u'\x00' null = u'\x00'
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment