Commit 821a9d14 authored by Serhiy Storchaka, committed by GitHub

bpo-30340: Enhanced regular expressions optimization. (#1542)

This increased the performance of matching some patterns up to 25 times.
parent cbddf58c
...@@ -20,6 +20,7 @@ _LITERAL_CODES = {LITERAL, NOT_LITERAL} ...@@ -20,6 +20,7 @@ _LITERAL_CODES = {LITERAL, NOT_LITERAL}
_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT} _REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
_SUCCESS_CODES = {SUCCESS, FAILURE} _SUCCESS_CODES = {SUCCESS, FAILURE}
_ASSERT_CODES = {ASSERT, ASSERT_NOT} _ASSERT_CODES = {ASSERT, ASSERT_NOT}
_UNIT_CODES = _LITERAL_CODES | {ANY, IN}
# Sets of lowercase characters which have the same uppercase. # Sets of lowercase characters which have the same uppercase.
_equivalences = ( _equivalences = (
...@@ -125,7 +126,7 @@ def _compile(code, pattern, flags): ...@@ -125,7 +126,7 @@ def _compile(code, pattern, flags):
elif op in REPEATING_CODES: elif op in REPEATING_CODES:
if flags & SRE_FLAG_TEMPLATE: if flags & SRE_FLAG_TEMPLATE:
raise error("internal: unsupported template operator %r" % (op,)) raise error("internal: unsupported template operator %r" % (op,))
elif _simple(av) and op is not REPEAT: if _simple(av[2]):
if op is MAX_REPEAT: if op is MAX_REPEAT:
emit(REPEAT_ONE) emit(REPEAT_ONE)
else: else:
...@@ -404,10 +405,14 @@ def _bytes_to_codes(b): ...@@ -404,10 +405,14 @@ def _bytes_to_codes(b):
assert len(a) * a.itemsize == len(b) assert len(a) * a.itemsize == len(b)
return a.tolist() return a.tolist()
def _simple(av): def _simple(p):
# check if av is a "simple" operator # check if this subpattern is a "simple" operator
lo, hi = av[2].getwidth() if len(p) != 1:
return lo == hi == 1 and av[2][0][0] != SUBPATTERN return False
op, av = p[0]
if op is SUBPATTERN:
return av[0] is None and _simple(av[-1])
return op in _UNIT_CODES
def _generate_overlap_table(prefix): def _generate_overlap_table(prefix):
""" """
......
This diff is collapsed.
...@@ -1695,20 +1695,18 @@ class ReTests(unittest.TestCase): ...@@ -1695,20 +1695,18 @@ class ReTests(unittest.TestCase):
dump = '''\ dump = '''\
SUBPATTERN 1 0 0 SUBPATTERN 1 0 0
LITERAL 46 LITERAL 46
SUBPATTERN None 0 0 BRANCH
BRANCH IN
IN LITERAL 99
LITERAL 99 LITERAL 104
LITERAL 104 OR
OR LITERAL 112
LITERAL 112 LITERAL 121
LITERAL 121 GROUPREF_EXISTS 1
SUBPATTERN None 0 0 AT AT_END
GROUPREF_EXISTS 1 ELSE
AT AT_END LITERAL 58
ELSE LITERAL 32
LITERAL 58
LITERAL 32
''' '''
self.assertEqual(out.getvalue(), dump) self.assertEqual(out.getvalue(), dump)
# Debug output is output again even a second time (bypassing # Debug output is output again even a second time (bypassing
......
...@@ -326,6 +326,9 @@ Library ...@@ -326,6 +326,9 @@ Library
- bpo-30048: Fixed ``Task.cancel()`` can be ignored when the task is - bpo-30048: Fixed ``Task.cancel()`` can be ignored when the task is
running coroutine and the coroutine returned without any more ``await``. running coroutine and the coroutine returned without any more ``await``.
- bpo-30340: Enhanced regular expressions optimization. This increased
the performance of matching some patterns up to 25 times.
- bpo-30298: Weaken the condition of deprecation warnings for inline modifiers. - bpo-30298: Weaken the condition of deprecation warnings for inline modifiers.
Now allowed several subsequential inline modifiers at the start of the Now allowed several subsequential inline modifiers at the start of the
pattern (e.g. ``'(?i)(?s)...'``). In verbose mode whitespaces and comments pattern (e.g. ``'(?i)(?s)...'``). In verbose mode whitespaces and comments
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment