Commit 5908300e authored by Serhiy Storchaka, committed by GitHub

bpo-29995: re.escape() now escapes only special characters. (#1007)

parent a6e395df
...@@ -786,7 +786,7 @@ form. ...@@ -786,7 +786,7 @@ form.
.. function:: escape(pattern) .. function:: escape(pattern)
Escape all the characters in *pattern* except ASCII letters, numbers and ``'_'``. Escape special characters in *pattern*.
This is useful if you want to match an arbitrary literal string that may This is useful if you want to match an arbitrary literal string that may
have regular expression metacharacters in it. For example:: have regular expression metacharacters in it. For example::
...@@ -795,15 +795,19 @@ form. ...@@ -795,15 +795,19 @@ form.
>>> legal_chars = string.ascii_lowercase + string.digits + "!#$%&'*+-.^_`|~:" >>> legal_chars = string.ascii_lowercase + string.digits + "!#$%&'*+-.^_`|~:"
>>> print('[%s]+' % re.escape(legal_chars)) >>> print('[%s]+' % re.escape(legal_chars))
[abcdefghijklmnopqrstuvwxyz0123456789\!\#\$\%\&\'\*\+\-\.\^_\`\|\~\:]+ [abcdefghijklmnopqrstuvwxyz0123456789!\#\$%&'\*\+\-\.\^_`\|~:]+
>>> operators = ['+', '-', '*', '/', '**'] >>> operators = ['+', '-', '*', '/', '**']
>>> print('|'.join(map(re.escape, sorted(operators, reverse=True)))) >>> print('|'.join(map(re.escape, sorted(operators, reverse=True))))
\/|\-|\+|\*\*|\* /|\-|\+|\*\*|\*
.. versionchanged:: 3.3 .. versionchanged:: 3.3
The ``'_'`` character is no longer escaped. The ``'_'`` character is no longer escaped.
.. versionchanged:: 3.7
Only characters that can have special meaning in a regular expression
are escaped.
.. function:: purge() .. function:: purge()
......
...@@ -303,7 +303,7 @@ whatsnew/3.2,,:gz,">>> with tarfile.open(name='myarchive.tar.gz', mode='w:gz') a ...@@ -303,7 +303,7 @@ whatsnew/3.2,,:gz,">>> with tarfile.open(name='myarchive.tar.gz', mode='w:gz') a
whatsnew/3.2,,:location,zope9-location = ${zope9:location} whatsnew/3.2,,:location,zope9-location = ${zope9:location}
whatsnew/3.2,,:prefix,zope-conf = ${custom:prefix}/etc/zope.conf whatsnew/3.2,,:prefix,zope-conf = ${custom:prefix}/etc/zope.conf
library/re,,`,!#$%&'*+-.^_`|~: library/re,,`,!#$%&'*+-.^_`|~:
library/re,,`,\!\#\$\%\&\'\*\+\-\.\^_\`\|\~\: library/re,,`,!\#\$%&'\*\+\-\.\^_`\|~:
library/tarfile,,:xz,'x:xz' library/tarfile,,:xz,'x:xz'
library/xml.etree.elementtree,,:sometag,prefix:sometag library/xml.etree.elementtree,,:sometag,prefix:sometag
library/xml.etree.elementtree,,:fictional,"<actors xmlns:fictional=""http://characters.example.com""" library/xml.etree.elementtree,,:fictional,"<actors xmlns:fictional=""http://characters.example.com"""
......
...@@ -221,8 +221,8 @@ class ReplaceDialogTest(unittest.TestCase): ...@@ -221,8 +221,8 @@ class ReplaceDialogTest(unittest.TestCase):
self.assertIn('Invalid Replace Expression', showerror.message) self.assertIn('Invalid Replace Expression', showerror.message)
# test access method # test access method
self.engine.setcookedpat("\'") self.engine.setcookedpat("?")
equal(pv.get(), "\\'") equal(pv.get(), "\\?")
def test_replace_backwards(self): def test_replace_backwards(self):
equal = self.assertEqual equal = self.assertEqual
......
...@@ -241,39 +241,21 @@ def template(pattern, flags=0): ...@@ -241,39 +241,21 @@ def template(pattern, flags=0):
"Compile a template pattern, returning a pattern object" "Compile a template pattern, returning a pattern object"
return _compile(pattern, flags|T) return _compile(pattern, flags|T)
_alphanum_str = frozenset( # SPECIAL_CHARS
"_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890") # closing ')', '}' and ']'
_alphanum_bytes = frozenset( # '-' (a range in character set)
b"_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890") # '#' (comment) and WHITESPACE (ignored) in verbose mode
_special_chars_map = {i: '\\' + chr(i) for i in b'()[]{}?*+-|^$\\.# \t\n\r\v\f'}
def escape(pattern): def escape(pattern):
""" """
Escape all the characters in pattern except ASCII letters, numbers and '_'. Escape special characters in a string.
""" """
if isinstance(pattern, str): if isinstance(pattern, str):
alphanum = _alphanum_str return pattern.translate(_special_chars_map)
s = list(pattern)
for i, c in enumerate(pattern):
if c not in alphanum:
if c == "\000":
s[i] = "\\000"
else: else:
s[i] = "\\" + c pattern = str(pattern, 'latin1')
return "".join(s) return pattern.translate(_special_chars_map).encode('latin1')
else:
alphanum = _alphanum_bytes
s = []
esc = ord(b"\\")
for c in pattern:
if c in alphanum:
s.append(c)
else:
if c == 0:
s.extend(b"\\000")
else:
s.append(esc)
s.append(c)
return bytes(s)
# -------------------------------------------------------------------- # --------------------------------------------------------------------
# internals # internals
......
...@@ -904,7 +904,7 @@ class ReTests(unittest.TestCase): ...@@ -904,7 +904,7 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.search(r"a\s", "a ").group(0), "a ") self.assertEqual(re.search(r"a\s", "a ").group(0), "a ")
def assertMatch(self, pattern, text, match=None, span=None, def assertMatch(self, pattern, text, match=None, span=None,
matcher=re.match): matcher=re.fullmatch):
if match is None and span is None: if match is None and span is None:
# the pattern matches the whole text # the pattern matches the whole text
match = text match = text
...@@ -917,37 +917,38 @@ class ReTests(unittest.TestCase): ...@@ -917,37 +917,38 @@ class ReTests(unittest.TestCase):
self.assertEqual(m.group(), match) self.assertEqual(m.group(), match)
self.assertEqual(m.span(), span) self.assertEqual(m.span(), span)
LITERAL_CHARS = string.ascii_letters + string.digits + '!"%&\',/:;<=>@_`~'
def test_re_escape(self): def test_re_escape(self):
alnum_chars = string.ascii_letters + string.digits + '_'
p = ''.join(chr(i) for i in range(256)) p = ''.join(chr(i) for i in range(256))
for c in p: for c in p:
if c in alnum_chars:
self.assertEqual(re.escape(c), c)
elif c == '\x00':
self.assertEqual(re.escape(c), '\\000')
else:
self.assertEqual(re.escape(c), '\\' + c)
self.assertMatch(re.escape(c), c) self.assertMatch(re.escape(c), c)
self.assertMatch('[' + re.escape(c) + ']', c)
self.assertMatch('(?x)' + re.escape(c), c)
self.assertMatch(re.escape(p), p) self.assertMatch(re.escape(p), p)
for c in '-.]{}':
self.assertEqual(re.escape(c)[:1], '\\')
literal_chars = self.LITERAL_CHARS
self.assertEqual(re.escape(literal_chars), literal_chars)
def test_re_escape_byte(self): def test_re_escape_bytes(self):
alnum_chars = (string.ascii_letters + string.digits + '_').encode('ascii')
p = bytes(range(256)) p = bytes(range(256))
for i in p: for i in p:
b = bytes([i]) b = bytes([i])
if b in alnum_chars:
self.assertEqual(re.escape(b), b)
elif i == 0:
self.assertEqual(re.escape(b), b'\\000')
else:
self.assertEqual(re.escape(b), b'\\' + b)
self.assertMatch(re.escape(b), b) self.assertMatch(re.escape(b), b)
self.assertMatch(b'[' + re.escape(b) + b']', b)
self.assertMatch(b'(?x)' + re.escape(b), b)
self.assertMatch(re.escape(p), p) self.assertMatch(re.escape(p), p)
for i in b'-.]{}':
b = bytes([i])
self.assertEqual(re.escape(b)[:1], b'\\')
literal_chars = self.LITERAL_CHARS.encode('ascii')
self.assertEqual(re.escape(literal_chars), literal_chars)
def test_re_escape_non_ascii(self): def test_re_escape_non_ascii(self):
s = 'xxx\u2620\u2620\u2620xxx' s = 'xxx\u2620\u2620\u2620xxx'
s_escaped = re.escape(s) s_escaped = re.escape(s)
self.assertEqual(s_escaped, 'xxx\\\u2620\\\u2620\\\u2620xxx') self.assertEqual(s_escaped, s)
self.assertMatch(s_escaped, s) self.assertMatch(s_escaped, s)
self.assertMatch('.%s+.' % re.escape('\u2620'), s, self.assertMatch('.%s+.' % re.escape('\u2620'), s,
'x\u2620\u2620\u2620x', (2, 7), re.search) 'x\u2620\u2620\u2620x', (2, 7), re.search)
...@@ -955,7 +956,7 @@ class ReTests(unittest.TestCase): ...@@ -955,7 +956,7 @@ class ReTests(unittest.TestCase):
def test_re_escape_non_ascii_bytes(self): def test_re_escape_non_ascii_bytes(self):
b = 'y\u2620y\u2620y'.encode('utf-8') b = 'y\u2620y\u2620y'.encode('utf-8')
b_escaped = re.escape(b) b_escaped = re.escape(b)
self.assertEqual(b_escaped, b'y\\\xe2\\\x98\\\xa0y\\\xe2\\\x98\\\xa0y') self.assertEqual(b_escaped, b)
self.assertMatch(b_escaped, b) self.assertMatch(b_escaped, b)
res = re.findall(re.escape('\u2620'.encode('utf-8')), b) res = re.findall(re.escape('\u2620'.encode('utf-8')), b)
self.assertEqual(len(res), 2) self.assertEqual(len(res), 2)
......
...@@ -320,6 +320,8 @@ Library ...@@ -320,6 +320,8 @@ Library
- bpo-29998: Pickling and copying ImportError now preserves name and path - bpo-29998: Pickling and copying ImportError now preserves name and path
attributes. attributes.
- bpo-29995: re.escape() now escapes only regex special characters.
- bpo-29962: Add math.remainder operation, implementing remainder - bpo-29962: Add math.remainder operation, implementing remainder
as specified in IEEE 754. as specified in IEEE 754.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment