Commit 7b3110b7 authored by Serhiy Storchaka

Issues #814253, #9179: Group references and conditional group references now
work in lookbehind assertions in regular expressions.
parent 0098ad02
...@@ -297,6 +297,9 @@ The special characters are: ...@@ -297,6 +297,9 @@ The special characters are:
>>> m.group(0) >>> m.group(0)
'egg' 'egg'
.. versionchanged:: 3.5
Added support for group references of fixed length.
``(?<!...)`` ``(?<!...)``
Matches if the current position in the string is not preceded by a match for Matches if the current position in the string is not preceded by a match for
``...``. This is called a :dfn:`negative lookbehind assertion`. Similar to ``...``. This is called a :dfn:`negative lookbehind assertion`. Similar to
......
...@@ -351,10 +351,11 @@ class Scanner: ...@@ -351,10 +351,11 @@ class Scanner:
s = sre_parse.Pattern() s = sre_parse.Pattern()
s.flags = flags s.flags = flags
for phrase, action in lexicon: for phrase, action in lexicon:
gid = s.opengroup()
p.append(sre_parse.SubPattern(s, [ p.append(sre_parse.SubPattern(s, [
(SUBPATTERN, (len(p)+1, sre_parse.parse(phrase, flags))), (SUBPATTERN, (gid, sre_parse.parse(phrase, flags))),
])) ]))
s.groups = len(p)+1 s.closegroup(gid, p[-1])
p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
self.scanner = sre_compile.compile(p) self.scanner = sre_compile.compile(p)
def scan(self, string): def scan(self, string):
......
...@@ -68,12 +68,15 @@ class Pattern: ...@@ -68,12 +68,15 @@ class Pattern:
# master pattern object. keeps track of global attributes # master pattern object. keeps track of global attributes
def __init__(self): def __init__(self):
self.flags = 0 self.flags = 0
self.open = []
self.groups = 1
self.groupdict = {} self.groupdict = {}
self.subpatterns = [None] # group 0
self.lookbehindgroups = None
@property
def groups(self):
return len(self.subpatterns)
def opengroup(self, name=None): def opengroup(self, name=None):
gid = self.groups gid = self.groups
self.groups = gid + 1 self.subpatterns.append(None)
if self.groups > MAXGROUPS: if self.groups > MAXGROUPS:
raise error("groups number is too large") raise error("groups number is too large")
if name is not None: if name is not None:
...@@ -82,12 +85,19 @@ class Pattern: ...@@ -82,12 +85,19 @@ class Pattern:
raise error("redefinition of group name %r as group %d; " raise error("redefinition of group name %r as group %d; "
"was group %d" % (name, gid, ogid)) "was group %d" % (name, gid, ogid))
self.groupdict[name] = gid self.groupdict[name] = gid
self.open.append(gid)
return gid return gid
def closegroup(self, gid): def closegroup(self, gid, p):
self.open.remove(gid) self.subpatterns[gid] = p
def checkgroup(self, gid): def checkgroup(self, gid):
return gid < self.groups and gid not in self.open return gid < self.groups and self.subpatterns[gid] is not None
def checklookbehindgroup(self, gid, source):
if self.lookbehindgroups is not None:
if not self.checkgroup(gid):
raise source.error('cannot refer to an open group')
if gid >= self.lookbehindgroups:
raise source.error('cannot refer to group defined in the same '
'lookbehind subpattern')
class SubPattern: class SubPattern:
# a subpattern, in intermediate form # a subpattern, in intermediate form
...@@ -183,7 +193,21 @@ class SubPattern: ...@@ -183,7 +193,21 @@ class SubPattern:
elif op in _UNITCODES: elif op in _UNITCODES:
lo = lo + 1 lo = lo + 1
hi = hi + 1 hi = hi + 1
elif op == SUCCESS: elif op is GROUPREF:
i, j = self.pattern.subpatterns[av].getwidth()
lo = lo + i
hi = hi + j
elif op is GROUPREF_EXISTS:
i, j = av[1].getwidth()
if av[2] is not None:
l, h = av[2].getwidth()
i = min(i, l)
j = max(j, h)
else:
i = 0
lo = lo + i
hi = hi + j
elif op is SUCCESS:
break break
self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT) self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
return self.width return self.width
...@@ -379,6 +403,7 @@ def _escape(source, escape, state): ...@@ -379,6 +403,7 @@ def _escape(source, escape, state):
if not state.checkgroup(group): if not state.checkgroup(group):
raise source.error("cannot refer to open group", raise source.error("cannot refer to open group",
len(escape)) len(escape))
state.checklookbehindgroup(group, source)
return GROUPREF, group return GROUPREF, group
raise ValueError raise ValueError
if len(escape) == 2: if len(escape) == 2:
...@@ -641,6 +666,7 @@ def _parse(source, state): ...@@ -641,6 +666,7 @@ def _parse(source, state):
if gid is None: if gid is None:
msg = "unknown group name: {0!r}".format(name) msg = "unknown group name: {0!r}".format(name)
raise source.error(msg, len(name) + 1) raise source.error(msg, len(name) + 1)
state.checklookbehindgroup(gid, source)
subpatternappend((GROUPREF, gid)) subpatternappend((GROUPREF, gid))
continue continue
else: else:
...@@ -668,7 +694,13 @@ def _parse(source, state): ...@@ -668,7 +694,13 @@ def _parse(source, state):
if char is None or char not in "=!": if char is None or char not in "=!":
raise source.error("syntax error") raise source.error("syntax error")
dir = -1 # lookbehind dir = -1 # lookbehind
lookbehindgroups = state.lookbehindgroups
if lookbehindgroups is None:
state.lookbehindgroups = state.groups
p = _parse_sub(source, state) p = _parse_sub(source, state)
if dir < 0:
if lookbehindgroups is None:
state.lookbehindgroups = None
if not sourcematch(")"): if not sourcematch(")"):
raise source.error("unbalanced parenthesis") raise source.error("unbalanced parenthesis")
if char == "=": if char == "=":
...@@ -701,6 +733,7 @@ def _parse(source, state): ...@@ -701,6 +733,7 @@ def _parse(source, state):
if condgroup >= MAXGROUPS: if condgroup >= MAXGROUPS:
raise source.error("the group number is too large", raise source.error("the group number is too large",
len(condname) + 1) len(condname) + 1)
state.checklookbehindgroup(condgroup, source)
elif char in FLAGS: elif char in FLAGS:
# flags # flags
state.flags |= FLAGS[char] state.flags |= FLAGS[char]
...@@ -726,7 +759,7 @@ def _parse(source, state): ...@@ -726,7 +759,7 @@ def _parse(source, state):
if not sourcematch(")"): if not sourcematch(")"):
raise source.error("unbalanced parenthesis") raise source.error("unbalanced parenthesis")
if group is not None: if group is not None:
state.closegroup(group) state.closegroup(group, p)
subpatternappend((SUBPATTERN, (group, p))) subpatternappend((SUBPATTERN, (group, p)))
else: else:
while True: while True:
......
...@@ -604,7 +604,7 @@ class ReTests(unittest.TestCase): ...@@ -604,7 +604,7 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0), self.assertEqual(re.match("a.*b", "a\n\nb", re.DOTALL).group(0),
"a\n\nb") "a\n\nb")
def test_non_consuming(self): def test_lookahead(self):
self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a") self.assertEqual(re.match("(a(?=\s[^a]))", "a b").group(1), "a")
self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a") self.assertEqual(re.match("(a(?=\s[^a]*))", "a b").group(1), "a")
self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a") self.assertEqual(re.match("(a(?=\s[abc]))", "a b").group(1), "a")
...@@ -618,6 +618,46 @@ class ReTests(unittest.TestCase): ...@@ -618,6 +618,46 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a") self.assertEqual(re.match(r"(a)(?!\s\1)", "a b").group(1), "a")
self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a") self.assertEqual(re.match(r"(a)(?!\s(abc|a))", "a b").group(1), "a")
# Group reference.
self.assertTrue(re.match(r'(a)b(?=\1)a', 'aba'))
self.assertIsNone(re.match(r'(a)b(?=\1)c', 'abac'))
# Conditional group reference.
self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(2)c|x))c', 'abc'))
self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(2)x|c))c', 'abc'))
self.assertIsNone(re.match(r'(?:(a)|(x))b(?=(?(1)b|x))c', 'abc'))
self.assertTrue(re.match(r'(?:(a)|(x))b(?=(?(1)c|x))c', 'abc'))
# Group used before defined.
self.assertTrue(re.match(r'(a)b(?=(?(2)x|c))(c)', 'abc'))
self.assertIsNone(re.match(r'(a)b(?=(?(2)b|x))(c)', 'abc'))
self.assertTrue(re.match(r'(a)b(?=(?(1)c|x))(c)', 'abc'))
def test_lookbehind(self):
self.assertTrue(re.match(r'ab(?<=b)c', 'abc'))
self.assertIsNone(re.match(r'ab(?<=c)c', 'abc'))
self.assertIsNone(re.match(r'ab(?<!b)c', 'abc'))
self.assertTrue(re.match(r'ab(?<!c)c', 'abc'))
# Group reference.
self.assertTrue(re.match(r'(a)a(?<=\1)c', 'aac'))
self.assertIsNone(re.match(r'(a)b(?<=\1)a', 'abaa'))
self.assertIsNone(re.match(r'(a)a(?<!\1)c', 'aac'))
self.assertTrue(re.match(r'(a)b(?<!\1)a', 'abaa'))
# Conditional group reference.
self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)x|c))c', 'abc'))
self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(2)b|x))c', 'abc'))
self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(2)x|b))c', 'abc'))
self.assertIsNone(re.match(r'(?:(a)|(x))b(?<=(?(1)c|x))c', 'abc'))
self.assertTrue(re.match(r'(?:(a)|(x))b(?<=(?(1)b|x))c', 'abc'))
# Group used before defined.
self.assertRaises(re.error, re.compile, r'(a)b(?<=(?(2)b|x))(c)')
self.assertIsNone(re.match(r'(a)b(?<=(?(1)c|x))(c)', 'abc'))
self.assertTrue(re.match(r'(a)b(?<=(?(1)b|x))(c)', 'abc'))
# Group defined in the same lookbehind pattern
self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)\2)(c)')
self.assertRaises(re.error, re.compile, r'(a)b(?<=(?P<a>.)(?P=a))(c)')
self.assertRaises(re.error, re.compile, r'(a)b(?<=(a)(?(2)b|x))(c)')
self.assertRaises(re.error, re.compile, r'(a)b(?<=(.)(?<=\2))(c)')
def test_ignore_case(self): def test_ignore_case(self):
self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC") self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC") self.assertEqual(re.match(b"abc", b"ABC", re.I).group(0), b"ABC")
......
...@@ -13,6 +13,9 @@ Core and Builtins ...@@ -13,6 +13,9 @@ Core and Builtins
Library Library
------- -------
- Issues #814253, #9179: Group references and conditional group references now
work in lookbehind assertions in regular expressions.
- Issue #23215: Multibyte codecs with custom error handlers that ignores errors - Issue #23215: Multibyte codecs with custom error handlers that ignores errors
consumed too much memory and raised SystemError or MemoryError. consumed too much memory and raised SystemError or MemoryError.
Original patch by Aleksi Torhamo. Original patch by Aleksi Torhamo.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment