Commit 4a7f44a2 authored by animalize's avatar animalize Committed by Serhiy Storchaka

bpo-34294: re module, fix wrong capturing groups in rare cases. (GH-11546)

Capturing groups need to be reset between two consecutive SRE(match) calls inside the search loops; this fixes wrong capturing groups in rare cases.

Also add a missing index entry in re.rst.
parent 02c04f26
...@@ -371,6 +371,8 @@ The special characters are: ...@@ -371,6 +371,8 @@ The special characters are:
``(?#...)`` ``(?#...)``
A comment; the contents of the parentheses are simply ignored. A comment; the contents of the parentheses are simply ignored.
.. index:: single: (?=; in regular expressions
``(?=...)`` ``(?=...)``
Matches if ``...`` matches next, but doesn't consume any of the string. This is Matches if ``...`` matches next, but doesn't consume any of the string. This is
called a :dfn:`lookahead assertion`. For example, ``Isaac (?=Asimov)`` will match called a :dfn:`lookahead assertion`. For example, ``Isaac (?=Asimov)`` will match
......
...@@ -2067,6 +2067,40 @@ ELSE ...@@ -2067,6 +2067,40 @@ ELSE
self.assertEqual(m.group(), b'xyz') self.assertEqual(m.group(), b'xyz')
self.assertEqual(m2.group(), b'') self.assertEqual(m2.group(), b'')
def test_bug_34294(self):
# Issue 34294: wrong capturing groups.
# Groups captured during an earlier match attempt must not show up in
# the result of a later one.

# Exists since Python 2.
# At the position just before "\t" the lookahead captures group 1, but
# the trailing "x" then fails to match.  The match that succeeds one
# position later must report group 1 as unset.
s = "a\tx"
p = r"\b(?=(\t)|(x))x"
self.assertEqual(re.search(p, s).groups(), (None, 'x'))
# Introduced in Python 3.7.0.
# For the second (zero-width) match only "b" is left, so the optional
# group 2 does not participate: findall() reports it as '' and
# finditer() reports it as None.
s = "ab"
p = r"(?=(.)(.)?)"
self.assertEqual(re.findall(p, s),
[('a', 'b'), ('b', '')])
self.assertEqual([m.groups() for m in re.finditer(p, s)],
[('a', 'b'), ('b', None)])
# Test cases provided by issue 34294, introduced in Python 3.7.0.
# The optional "text" group is not matched for a self-closing tag, so it
# must come back as '' / None instead of a stale value.
p = r"(?=<(?P<tag>\w+)/?>(?:(?P<text>.+?)</(?P=tag)>)?)"
s = "<test><foo2/></test>"
self.assertEqual(re.findall(p, s),
[('test', '<foo2/>'), ('foo2', '')])
self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
[{'tag': 'test', 'text': '<foo2/>'},
{'tag': 'foo2', 'text': None}])
# Same pattern, with the self-closing tag following a complete element.
s = "<test>Hello</test><foo/>"
self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
[{'tag': 'test', 'text': 'Hello'},
{'tag': 'foo', 'text': None}])
# Two self-closing tags in a row: both must report "text" as None.
s = "<test>Hello</test><foo/><foo/>"
self.assertEqual([m.groupdict() for m in re.finditer(p, s)],
[{'tag': 'test', 'text': 'Hello'},
{'tag': 'foo', 'text': None},
{'tag': 'foo', 'text': None}])
class PatternReprTests(unittest.TestCase): class PatternReprTests(unittest.TestCase):
def check(self, pattern, expected): def check(self, pattern, expected):
......
Fix wrong capturing groups in rare cases in the :mod:`re` module.
:func:`re.search`, :func:`re.findall`, :func:`re.sub` and other functions
that scan through a string looking for a match now reset capturing groups
between two match attempts. Patch by Ma Lin.
\ No newline at end of file
...@@ -340,7 +340,7 @@ _sre_unicode_tolower_impl(PyObject *module, int character) ...@@ -340,7 +340,7 @@ _sre_unicode_tolower_impl(PyObject *module, int character)
LOCAL(void) LOCAL(void)
state_reset(SRE_STATE* state) state_reset(SRE_STATE* state)
{ {
/* FIXME: dynamic! */ /* state->mark will be set to 0 in SRE_OP_MARK dynamically. */
/*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/ /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
state->lastmark = -1; state->lastmark = -1;
......
...@@ -1363,6 +1363,10 @@ exit: ...@@ -1363,6 +1363,10 @@ exit:
return ret; /* should never get here */ return ret; /* should never get here */
} }
/* Clear the capture bookkeeping left behind by one SRE(match) attempt
   before a search loop retries from the next position.  Expands against
   a variable named `state` that must be in scope at the call site. */
#define RESET_CAPTURE_GROUP() \
    do { \
        state->lastindex = -1; \
        state->lastmark = -1; \
    } while (0)
LOCAL(Py_ssize_t) LOCAL(Py_ssize_t)
SRE(search)(SRE_STATE* state, SRE_CODE* pattern) SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
{ {
...@@ -1440,6 +1444,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) ...@@ -1440,6 +1444,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
if (status != 0) if (status != 0)
return status; return status;
++ptr; ++ptr;
RESET_CAPTURE_GROUP();
} }
return 0; return 0;
} }
...@@ -1487,6 +1492,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) ...@@ -1487,6 +1492,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
/* close but no cigar -- try again */ /* close but no cigar -- try again */
if (++ptr >= end) if (++ptr >= end)
return 0; return 0;
RESET_CAPTURE_GROUP();
} }
i = overlap[i]; i = overlap[i];
} while (i != 0); } while (i != 0);
...@@ -1510,6 +1516,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) ...@@ -1510,6 +1516,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
if (status != 0) if (status != 0)
break; break;
ptr++; ptr++;
RESET_CAPTURE_GROUP();
} }
} else { } else {
/* general case */ /* general case */
...@@ -1520,6 +1527,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern) ...@@ -1520,6 +1527,7 @@ SRE(search)(SRE_STATE* state, SRE_CODE* pattern)
state->must_advance = 0; state->must_advance = 0;
while (status == 0 && ptr < end) { while (status == 0 && ptr < end) {
ptr++; ptr++;
RESET_CAPTURE_GROUP();
TRACE(("|%p|%p|SEARCH\n", pattern, ptr)); TRACE(("|%p|%p|SEARCH\n", pattern, ptr));
state->start = state->ptr = ptr; state->start = state->ptr = ptr;
status = SRE(match)(state, pattern, 0); status = SRE(match)(state, pattern, 0);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment