implement \N{...} Unicode escapes for literals

2a9d8d45 · Stefan Behnel · 159a3b9a
Commit 2a9d8d45 authored Jan 06, 2013 by Stefan Behnel
6 changed files
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -8,6 +8,8 @@ Cython Changelog
 Features added
 --------------

+* Named Unicode escapes ("\N{...}") are supported.
+
 * Python functions/classes provide the special attribute "__qualname__"
  as defined by PEP 3155.


--- a/Cython/Compiler/Lexicon.py
+++ b/Cython/Compiler/Lexicon.py
@@ -66,6 +66,7 @@ def make_lexicon():
    two_hex = hexdigit + hexdigit
    four_hex = two_hex + two_hex
    escapeseq = Str("\\") + (two_oct | three_oct |
+                             Str('N{') + Rep(AnyBut('}')) + Str('}') |
                             Str('u') + four_hex | Str('x') + two_hex |
                             Str('U') + four_hex + four_hex | AnyChar)


--- a/Cython/Compiler/Parsing.py
+++ b/Cython/Compiler/Parsing.py
@@ -8,6 +8,7 @@ import cython
 cython.declare(Nodes=object, ExprNodes=object, EncodedString=object)

 import re
+import unicodedata

 from Cython.Compiler.Scanning import PyrexScanner, FileSourceDescriptor
 import Nodes
@@ -803,23 +804,27 @@ def p_string_literal(s, kind_override=None):
                        StringEncoding.char_from_escape_sequence(systr))
                elif c == u'\n':
                    pass
-                elif c == u'x':
+                elif c == u'x':   # \xXX
                    if len(systr) == 4:
                        chars.append_charval( int(systr[2:], 16) )
                    else:
                        s.error("Invalid hex escape '%s'" % systr)
-                elif c in u'Uu':
-                    if kind in ('u', ''):
-                        if len(systr) in (6,10):
-                            chrval = int(systr[2:], 16)
-                            if chrval > 1114111: # sys.maxunicode:
-                                s.error("Invalid unicode escape '%s'" % systr)
-                        else:
+                elif c in u'NUu' and kind in ('u', ''):   # \uxxxx, \Uxxxxxxxx, \N{...}
+                    chrval = -1
+                    if c == u'N':
+                        try:
+                            chrval = ord(unicodedata.lookup(systr[3:-1]))
+                        except KeyError:
+                            s.error("Unknown Unicode character name %r" % systr[3:-1])
+                    elif len(systr) in (6,10):
+                        chrval = int(systr[2:], 16)
+                        if chrval > 1114111: # sys.maxunicode:
                            s.error("Invalid unicode escape '%s'" % systr)
+                            chrval = -1
                    else:
-                        # unicode escapes in byte strings are not unescaped
-                        chrval = None
-                    chars.append_uescape(chrval, systr)
+                        s.error("Invalid unicode escape '%s'" % systr)
+                    if chrval >= 0:
+                        chars.append_uescape(chrval, systr)
                else:
                    chars.append(u'\\' + systr[1:])
                    if is_python3_source and not has_non_ASCII_literal_characters \

--- a/tests/run/strliterals.pyx
+++ b/tests/run/strliterals.pyx
@@ -132,9 +132,9 @@ __doc__ = ur"""
    >>> len(bytes_uescape)
    28

-    >>> (sys.version_info[0] >= 3 and sys.maxunicode == 1114111 and len(str_uescape) == 3 or
-    ...  sys.version_info[0] >= 3 and sys.maxunicode == 65535   and len(str_uescape) == 4 or
-    ...  sys.version_info[0] <  3 and len(str_uescape) == 17 or
+    >>> (sys.version_info[0] >= 3 and sys.maxunicode == 1114111 and len(str_uescape) == 4 or
+    ...  sys.version_info[0] >= 3 and sys.maxunicode == 65535   and len(str_uescape) == 5 or
+    ...  sys.version_info[0] <  3 and len(str_uescape) == 28 or
    ...  len(str_uescape))
    True
    >>> (sys.version_info[0] >= 3 and str_uescape[0] == 'c' or
@@ -143,6 +143,10 @@ __doc__ = ur"""
    True
    >>> print(str_uescape[-1])
    B
+    >>> (sys.version_info[0] >= 3 and ord(str_uescape[-2]) == 0x2603 or
+    ...  sys.version_info[0] <  3 and str_uescape[-12:-1]  == b'\\N{SNOWMAN}' or
+    ...  sys.version_info[0] >= 3 and ord(str_uescape[-2]) or str_uescape[-12:-1])
+    True

    >>> newlines == "Aaa\n"
    True
@@ -185,7 +189,7 @@ bresc = br'\12\'\"\\'
 uresc = ur'\12\'\"\\'

 bytes_uescape = b'\u1234\U12345678\u\u1\u12\uX'
-str_uescape = '\u0063\U00012345\x42'
+str_uescape = '\u0063\U00012345\N{SNOWMAN}\x42'

 newlines = "Aaa\n"


--- a/tests/run/unicodeliterals.pyx
+++ b/tests/run/unicodeliterals.pyx
@@ -75,6 +75,8 @@ __doc__ = br"""
    True
    >>> h == u'\\ud800' # unescaped by Python (required by doctest)
    True
+    >>> k == u'\\N{SNOWMAN}' == u'\\u2603'
+    True
    >>> add == u'Søk ik' + u'üÖä' + 'abc'
    True
    >>> null == u'\\x00' # unescaped by Python (required by doctest)
@@ -107,6 +109,7 @@ e = u'\x03\x67\xf8\uf8d2Søk ik'
 f = u'\xf8'
 g = u'\udc00'   # lone trail surrogate
 h = u'\ud800'   # lone lead surrogate
+k = u'\N{SNOWMAN}'

 add = u'Søk ik' + u'üÖä' + u'abc'
 null = u'\x00'

--- a/tests/run/unicodeliteralslatin1.pyx
+++ b/tests/run/unicodeliteralslatin1.pyx
@@ -55,6 +55,8 @@ __doc__ = br"""
    True
    >>> f == u'\\xf8' # unescaped by Python
    True
+    >>> k == u'ä' == u'\\N{LATIN SMALL LETTER A WITH DIAERESIS}'
+    True
    >>> add == u'Søk ik' + u'üÖä' + 'abc'
    True
    >>> null == u'\\x00' # unescaped by Python (required by doctest)
@@ -75,6 +77,7 @@ c = u'Søk ik'
 d = u'üÖä'
 e = u'\x03\x67\xf8\uf8d2Søk ik'
 f = u'\xf8'
+k = u'\N{LATIN SMALL LETTER A WITH DIAERESIS}'

 add = u'Søk ik' + u'üÖä' + u'abc'
 null = u'\x00'