Commit a445feb7 authored by Serhiy Storchaka, committed by GitHub

bpo-30688: Support \N{name} escapes in re patterns. (GH-5588)

Co-authored-by: Jonathan Eunice <jonathan.eunice@gmail.com>
parent 2411292b
...@@ -468,13 +468,13 @@ Most of the standard escapes supported by Python string literals are also ...@@ -468,13 +468,13 @@ Most of the standard escapes supported by Python string literals are also
accepted by the regular expression parser:: accepted by the regular expression parser::
\a \b \f \n \a \b \f \n
\r \t \u \U \N \r \t \u
\v \x \\ \U \v \x \\
(Note that ``\b`` is used to represent word boundaries, and means "backspace" (Note that ``\b`` is used to represent word boundaries, and means "backspace"
only inside character classes.) only inside character classes.)
``'\u'`` and ``'\U'`` escape sequences are only recognized in Unicode ``'\u'``, ``'\U'``, and ``'\N'`` escape sequences are only recognized in Unicode
patterns. In bytes patterns they are errors. patterns. In bytes patterns they are errors.
Octal escapes are included in a limited form. If the first digit is a 0, or if Octal escapes are included in a limited form. If the first digit is a 0, or if
...@@ -488,6 +488,9 @@ three digits in length. ...@@ -488,6 +488,9 @@ three digits in length.
.. versionchanged:: 3.6 .. versionchanged:: 3.6
Unknown escapes consisting of ``'\'`` and an ASCII letter now are errors. Unknown escapes consisting of ``'\'`` and an ASCII letter now are errors.
.. versionchanged:: 3.8
The ``'\N{name}'`` escape sequence has been added. As in string literals,
it expands to the named Unicode character (e.g. ``'\N{EM DASH}'``).
.. seealso:: .. seealso::
......
...@@ -75,6 +75,8 @@ New Features ...@@ -75,6 +75,8 @@ New Features
Other Language Changes Other Language Changes
====================== ======================
* Added support for ``\N{name}`` escapes in :mod:`regular expressions <re>`.
(Contributed by Jonathan Eunice and Serhiy Storchaka in :issue:`30688`.)
New Modules New Modules
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# XXX: show string offset and offending character for all errors # XXX: show string offset and offending character for all errors
from sre_constants import * from sre_constants import *
import unicodedata
SPECIAL_CHARS = ".\\[{()*+?^$|" SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{" REPEAT_CHARS = "*+?{"
...@@ -264,19 +265,19 @@ class Tokenizer: ...@@ -264,19 +265,19 @@ class Tokenizer:
result += c result += c
self.__next() self.__next()
return result return result
def getuntil(self, terminator): def getuntil(self, terminator, name):
result = '' result = ''
while True: while True:
c = self.next c = self.next
self.__next() self.__next()
if c is None: if c is None:
if not result: if not result:
raise self.error("missing group name") raise self.error("missing " + name)
raise self.error("missing %s, unterminated name" % terminator, raise self.error("missing %s, unterminated name" % terminator,
len(result)) len(result))
if c == terminator: if c == terminator:
if not result: if not result:
raise self.error("missing group name", 1) raise self.error("missing " + name, 1)
break break
result += c result += c
return result return result
...@@ -322,6 +323,17 @@ def _class_escape(source, escape): ...@@ -322,6 +323,17 @@ def _class_escape(source, escape):
c = int(escape[2:], 16) c = int(escape[2:], 16)
chr(c) # raise ValueError for invalid code chr(c) # raise ValueError for invalid code
return LITERAL, c return LITERAL, c
elif c == "N" and source.istext:
# named unicode escape e.g. \N{EM DASH}
if not source.match('{'):
raise source.error("missing {")
charname = source.getuntil('}', 'character name')
try:
c = ord(unicodedata.lookup(charname))
except KeyError:
raise source.error("undefined character name %r" % charname,
len(charname) + len(r'\N{}'))
return LITERAL, c
elif c in OCTDIGITS: elif c in OCTDIGITS:
# octal escape (up to three digits) # octal escape (up to three digits)
escape += source.getwhile(2, OCTDIGITS) escape += source.getwhile(2, OCTDIGITS)
...@@ -370,6 +382,17 @@ def _escape(source, escape, state): ...@@ -370,6 +382,17 @@ def _escape(source, escape, state):
c = int(escape[2:], 16) c = int(escape[2:], 16)
chr(c) # raise ValueError for invalid code chr(c) # raise ValueError for invalid code
return LITERAL, c return LITERAL, c
elif c == "N" and source.istext:
# named unicode escape e.g. \N{EM DASH}
if not source.match('{'):
raise source.error("missing {")
charname = source.getuntil('}', 'character name')
try:
c = ord(unicodedata.lookup(charname))
except KeyError:
raise source.error("undefined character name %r" % charname,
len(charname) + len(r'\N{}'))
return LITERAL, c
elif c == "0": elif c == "0":
# octal escape # octal escape
escape += source.getwhile(2, OCTDIGITS) escape += source.getwhile(2, OCTDIGITS)
...@@ -679,13 +702,13 @@ def _parse(source, state, verbose, nested, first=False): ...@@ -679,13 +702,13 @@ def _parse(source, state, verbose, nested, first=False):
# python extensions # python extensions
if sourcematch("<"): if sourcematch("<"):
# named group: skip forward to end of name # named group: skip forward to end of name
name = source.getuntil(">") name = source.getuntil(">", "group name")
if not name.isidentifier(): if not name.isidentifier():
msg = "bad character in group name %r" % name msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1) raise source.error(msg, len(name) + 1)
elif sourcematch("="): elif sourcematch("="):
# named backreference # named backreference
name = source.getuntil(")") name = source.getuntil(")", "group name")
if not name.isidentifier(): if not name.isidentifier():
msg = "bad character in group name %r" % name msg = "bad character in group name %r" % name
raise source.error(msg, len(name) + 1) raise source.error(msg, len(name) + 1)
...@@ -748,7 +771,7 @@ def _parse(source, state, verbose, nested, first=False): ...@@ -748,7 +771,7 @@ def _parse(source, state, verbose, nested, first=False):
elif char == "(": elif char == "(":
# conditional backreference group # conditional backreference group
condname = source.getuntil(")") condname = source.getuntil(")", "group name")
if condname.isidentifier(): if condname.isidentifier():
condgroup = state.groupdict.get(condname) condgroup = state.groupdict.get(condname)
if condgroup is None: if condgroup is None:
...@@ -977,7 +1000,7 @@ def parse_template(source, pattern): ...@@ -977,7 +1000,7 @@ def parse_template(source, pattern):
name = "" name = ""
if not s.match("<"): if not s.match("<"):
raise s.error("missing <") raise s.error("missing <")
name = s.getuntil(">") name = s.getuntil(">", "group name")
if name.isidentifier(): if name.isidentifier():
try: try:
index = groupindex[name] index = groupindex[name]
......
...@@ -694,6 +694,42 @@ class ReTests(unittest.TestCase): ...@@ -694,6 +694,42 @@ class ReTests(unittest.TestCase):
with self.subTest(c): with self.subTest(c):
self.assertRaises(re.error, re.compile, '[\\%c]' % c) self.assertRaises(re.error, re.compile, '[\\%c]' % c)
def test_named_unicode_escapes(self):
    # Exercise \N{name} escapes in regular expression patterns: first the
    # spellings that must match, then the malformed or unsupported ones
    # that must be rejected with a specific error.

    # test individual Unicode named escapes
    self.assertTrue(re.match(r'\N{LESS-THAN SIGN}', '<'))
    # The same name spelled in lower case is expected to match as well.
    self.assertTrue(re.match(r'\N{less-than sign}', '<'))
    self.assertIsNone(re.match(r'\N{LESS-THAN SIGN}', '>'))
    # A character outside the Basic Multilingual Plane (U+1F40D).
    self.assertTrue(re.match(r'\N{SNAKE}', '\U0001f40d'))
    # A long multi-word name, split over two adjacent raw-string literals.
    self.assertTrue(re.match(r'\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH '
                             r'HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}',
                             '\ufbf9'))
    # Named escapes used as both endpoints of a character-class range:
    # '=' lies between '<' and '>', while ';' lies just below '<'.
    self.assertTrue(re.match(r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]',
                             '='))
    self.assertIsNone(re.match(r'[\N{LESS-THAN SIGN}-\N{GREATER-THAN SIGN}]',
                               ';'))

    # test errors in \N{name} handling - only valid names should pass
    # NOTE(review): checkPatternError is defined outside this view; its
    # arguments look like (pattern, expected message, expected error
    # position) -- confirm against the helper.
    # Each case is checked both at top level and inside a character class.
    self.checkPatternError(r'\N', 'missing {', 2)
    self.checkPatternError(r'[\N]', 'missing {', 3)
    self.checkPatternError(r'\N{', 'missing character name', 3)
    self.checkPatternError(r'[\N{', 'missing character name', 4)
    self.checkPatternError(r'\N{}', 'missing character name', 3)
    self.checkPatternError(r'[\N{}]', 'missing character name', 4)
    self.checkPatternError(r'\NSNAKE}', 'missing {', 2)
    self.checkPatternError(r'[\NSNAKE}]', 'missing {', 3)
    self.checkPatternError(r'\N{SNAKE',
                           'missing }, unterminated name', 3)
    self.checkPatternError(r'[\N{SNAKE]',
                           'missing }, unterminated name', 4)
    # Here the ']' comes before the closing '}', so it is expected to be
    # reported as part of the (undefined) character name.
    self.checkPatternError(r'[\N{SNAKE]}',
                           "undefined character name 'SNAKE]'", 1)
    self.checkPatternError(r'\N{SPAM}',
                           "undefined character name 'SPAM'", 0)
    self.checkPatternError(r'[\N{SPAM}]',
                           "undefined character name 'SPAM'", 1)
    # Bytes patterns: \N is expected to be rejected as a bad escape.
    self.checkPatternError(br'\N{LESS-THAN SIGN}', r'bad escape \N', 0)
    self.checkPatternError(br'[\N{LESS-THAN SIGN}]', r'bad escape \N', 1)
def test_string_boundaries(self): def test_string_boundaries(self):
# See http://bugs.python.org/issue10713 # See http://bugs.python.org/issue10713
self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1), self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
......
...@@ -441,6 +441,7 @@ Andy Eskilsson ...@@ -441,6 +441,7 @@ Andy Eskilsson
André Espaze André Espaze
Stefan Esser Stefan Esser
Nicolas Estibals Nicolas Estibals
Jonathan Eunice
Carey Evans Carey Evans
Stephen D Evans Stephen D Evans
Tim Everett Tim Everett
......
Added support for ``\N{name}`` escapes in regular expressions. Based on a
patch by Jonathan Eunice.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment