Commit be9a4e5c authored by Serhiy Storchaka

Issue #433028: Added support of modifier spans in regular expressions.

parent ee73a657
...@@ -237,6 +237,16 @@ The special characters are: ...@@ -237,6 +237,16 @@ The special characters are:
*cannot* be retrieved after performing a match or referenced later in the *cannot* be retrieved after performing a match or referenced later in the
pattern. pattern.
``(?imsx-imsx:...)``
(Zero or more letters from the set ``'i'``, ``'m'``, ``'s'``, ``'x'``,
optionally followed by ``'-'`` followed by one or more letters from the
same set.) The letters set or remove the corresponding flags:
:const:`re.I` (ignore case), :const:`re.M` (multi-line), :const:`re.S`
(dot matches all), and :const:`re.X` (verbose), for the part of the
expression. (The flags are described in :ref:`contents-of-module-re`.)
.. versionadded:: 3.7
``(?P<name>...)`` ``(?P<name>...)``
Similar to regular parentheses, but the substring matched by the group is Similar to regular parentheses, but the substring matched by the group is
accessible via the symbolic group name *name*. Group names must be valid accessible via the symbolic group name *name*. Group names must be valid
......
...@@ -645,6 +645,15 @@ Protocol version 4 already supports this case. (Contributed by Serhiy ...@@ -645,6 +645,15 @@ Protocol version 4 already supports this case. (Contributed by Serhiy
Storchaka in :issue:`24164`.) Storchaka in :issue:`24164`.)
re
--
Added support of modifier spans in regular expressions. Examples:
``'(?i:p)ython'`` matches ``'python'`` and ``'Python'``, but not ``'PYTHON'``;
``'(?i)g(?-i:v)r'`` matches ``'GvR'`` and ``'gvr'``, but not ``'GVR'``.
(Contributed by Serhiy Storchaka in :issue:`433028`.)
readline readline
-------- --------
......
...@@ -352,7 +352,7 @@ class Scanner: ...@@ -352,7 +352,7 @@ class Scanner:
for phrase, action in lexicon: for phrase, action in lexicon:
gid = s.opengroup() gid = s.opengroup()
p.append(sre_parse.SubPattern(s, [ p.append(sre_parse.SubPattern(s, [
(SUBPATTERN, (gid, sre_parse.parse(phrase, flags))), (SUBPATTERN, (gid, 0, 0, sre_parse.parse(phrase, flags))),
])) ]))
s.closegroup(gid, p[-1]) s.closegroup(gid, p[-1])
p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
......
...@@ -71,7 +71,8 @@ def _compile(code, pattern, flags): ...@@ -71,7 +71,8 @@ def _compile(code, pattern, flags):
ASSERT_CODES = _ASSERT_CODES ASSERT_CODES = _ASSERT_CODES
if (flags & SRE_FLAG_IGNORECASE and if (flags & SRE_FLAG_IGNORECASE and
not (flags & SRE_FLAG_LOCALE) and not (flags & SRE_FLAG_LOCALE) and
flags & SRE_FLAG_UNICODE): flags & SRE_FLAG_UNICODE and
not (flags & SRE_FLAG_ASCII)):
fixes = _ignorecase_fixes fixes = _ignorecase_fixes
else: else:
fixes = None fixes = None
...@@ -137,14 +138,15 @@ def _compile(code, pattern, flags): ...@@ -137,14 +138,15 @@ def _compile(code, pattern, flags):
else: else:
emit(MIN_UNTIL) emit(MIN_UNTIL)
elif op is SUBPATTERN: elif op is SUBPATTERN:
if av[0]: group, add_flags, del_flags, p = av
if group:
emit(MARK) emit(MARK)
emit((av[0]-1)*2) emit((group-1)*2)
# _compile_info(code, av[1], flags) # _compile_info(code, p, (flags | add_flags) & ~del_flags)
_compile(code, av[1], flags) _compile(code, p, (flags | add_flags) & ~del_flags)
if av[0]: if group:
emit(MARK) emit(MARK)
emit((av[0]-1)*2+1) emit((group-1)*2+1)
elif op in SUCCESS_CODES: elif op in SUCCESS_CODES:
emit(op) emit(op)
elif op in ASSERT_CODES: elif op in ASSERT_CODES:
...@@ -172,7 +174,7 @@ def _compile(code, pattern, flags): ...@@ -172,7 +174,7 @@ def _compile(code, pattern, flags):
av = AT_MULTILINE.get(av, av) av = AT_MULTILINE.get(av, av)
if flags & SRE_FLAG_LOCALE: if flags & SRE_FLAG_LOCALE:
av = AT_LOCALE.get(av, av) av = AT_LOCALE.get(av, av)
elif flags & SRE_FLAG_UNICODE: elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
av = AT_UNICODE.get(av, av) av = AT_UNICODE.get(av, av)
emit(av) emit(av)
elif op is BRANCH: elif op is BRANCH:
...@@ -193,7 +195,7 @@ def _compile(code, pattern, flags): ...@@ -193,7 +195,7 @@ def _compile(code, pattern, flags):
emit(op) emit(op)
if flags & SRE_FLAG_LOCALE: if flags & SRE_FLAG_LOCALE:
av = CH_LOCALE[av] av = CH_LOCALE[av]
elif flags & SRE_FLAG_UNICODE: elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
av = CH_UNICODE[av] av = CH_UNICODE[av]
emit(av) emit(av)
elif op is GROUPREF: elif op is GROUPREF:
...@@ -237,7 +239,7 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None): ...@@ -237,7 +239,7 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
elif op is CATEGORY: elif op is CATEGORY:
if flags & SRE_FLAG_LOCALE: if flags & SRE_FLAG_LOCALE:
emit(CH_LOCALE[av]) emit(CH_LOCALE[av])
elif flags & SRE_FLAG_UNICODE: elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
emit(CH_UNICODE[av]) emit(CH_UNICODE[av])
else: else:
emit(av) emit(av)
...@@ -414,14 +416,16 @@ def _get_literal_prefix(pattern): ...@@ -414,14 +416,16 @@ def _get_literal_prefix(pattern):
prefix = [] prefix = []
prefixappend = prefix.append prefixappend = prefix.append
prefix_skip = None prefix_skip = None
got_all = True
for op, av in pattern.data: for op, av in pattern.data:
if op is LITERAL: if op is LITERAL:
prefixappend(av) prefixappend(av)
elif op is SUBPATTERN: elif op is SUBPATTERN:
prefix1, prefix_skip1, got_all = _get_literal_prefix(av[1]) group, add_flags, del_flags, p = av
if add_flags & SRE_FLAG_IGNORECASE:
break
prefix1, prefix_skip1, got_all = _get_literal_prefix(p)
if prefix_skip is None: if prefix_skip is None:
if av[0] is not None: if group is not None:
prefix_skip = len(prefix) prefix_skip = len(prefix)
elif prefix_skip1 is not None: elif prefix_skip1 is not None:
prefix_skip = len(prefix) + prefix_skip1 prefix_skip = len(prefix) + prefix_skip1
...@@ -429,32 +433,35 @@ def _get_literal_prefix(pattern): ...@@ -429,32 +433,35 @@ def _get_literal_prefix(pattern):
if not got_all: if not got_all:
break break
else: else:
got_all = False
break break
return prefix, prefix_skip, got_all else:
return prefix, prefix_skip, True
return prefix, prefix_skip, False
def _get_charset_prefix(pattern): def _get_charset_prefix(pattern):
charset = [] # not used charset = [] # not used
charsetappend = charset.append charsetappend = charset.append
if pattern.data: if pattern.data:
op, av = pattern.data[0] op, av = pattern.data[0]
if op is SUBPATTERN and av[1]: if op is SUBPATTERN:
op, av = av[1][0] group, add_flags, del_flags, p = av
if op is LITERAL: if p and not (add_flags & SRE_FLAG_IGNORECASE):
charsetappend((op, av)) op, av = p[0]
elif op is BRANCH: if op is LITERAL:
c = [] charsetappend((op, av))
cappend = c.append elif op is BRANCH:
for p in av[1]: c = []
if not p: cappend = c.append
break for p in av[1]:
op, av = p[0] if not p:
if op is LITERAL: break
cappend((op, av)) op, av = p[0]
if op is LITERAL:
cappend((op, av))
else:
break
else: else:
break charset = c
else:
charset = c
elif op is BRANCH: elif op is BRANCH:
c = [] c = []
cappend = c.append cappend = c.append
......
...@@ -65,6 +65,12 @@ FLAGS = { ...@@ -65,6 +65,12 @@ FLAGS = {
"u": SRE_FLAG_UNICODE, "u": SRE_FLAG_UNICODE,
} }
GLOBAL_FLAGS = (SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE |
SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE)
class Verbose(Exception):
    """Raised when a global inline flag group switches on verbose mode.

    ``_parse_flags`` raises this when a ``(?...)`` group adds
    ``SRE_FLAG_VERBOSE`` while the pattern state does not have it yet;
    ``parse`` catches it and parses the pattern again with the flag set.
    """
    pass
class Pattern: class Pattern:
# master pattern object. keeps track of global attributes # master pattern object. keeps track of global attributes
def __init__(self): def __init__(self):
...@@ -184,7 +190,7 @@ class SubPattern: ...@@ -184,7 +190,7 @@ class SubPattern:
lo = lo + i lo = lo + i
hi = hi + j hi = hi + j
elif op is SUBPATTERN: elif op is SUBPATTERN:
i, j = av[1].getwidth() i, j = av[-1].getwidth()
lo = lo + i lo = lo + i
hi = hi + j hi = hi + j
elif op in _REPEATCODES: elif op in _REPEATCODES:
...@@ -395,7 +401,7 @@ def _escape(source, escape, state): ...@@ -395,7 +401,7 @@ def _escape(source, escape, state):
pass pass
raise source.error("bad escape %s" % escape, len(escape)) raise source.error("bad escape %s" % escape, len(escape))
def _parse_sub(source, state, nested=True): def _parse_sub(source, state, verbose, nested=True):
# parse an alternation: a|b|c # parse an alternation: a|b|c
items = [] items = []
...@@ -403,7 +409,7 @@ def _parse_sub(source, state, nested=True): ...@@ -403,7 +409,7 @@ def _parse_sub(source, state, nested=True):
sourcematch = source.match sourcematch = source.match
start = source.tell() start = source.tell()
while True: while True:
itemsappend(_parse(source, state)) itemsappend(_parse(source, state, verbose))
if not sourcematch("|"): if not sourcematch("|"):
break break
...@@ -445,10 +451,10 @@ def _parse_sub(source, state, nested=True): ...@@ -445,10 +451,10 @@ def _parse_sub(source, state, nested=True):
subpattern.append((BRANCH, (None, items))) subpattern.append((BRANCH, (None, items)))
return subpattern return subpattern
def _parse_sub_cond(source, state, condgroup): def _parse_sub_cond(source, state, condgroup, verbose):
item_yes = _parse(source, state) item_yes = _parse(source, state, verbose)
if source.match("|"): if source.match("|"):
item_no = _parse(source, state) item_no = _parse(source, state, verbose)
if source.next == "|": if source.next == "|":
raise source.error("conditional backref with more than two branches") raise source.error("conditional backref with more than two branches")
else: else:
...@@ -457,7 +463,7 @@ def _parse_sub_cond(source, state, condgroup): ...@@ -457,7 +463,7 @@ def _parse_sub_cond(source, state, condgroup):
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no))) subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
return subpattern return subpattern
def _parse(source, state): def _parse(source, state, verbose):
# parse a simple pattern # parse a simple pattern
subpattern = SubPattern(state) subpattern = SubPattern(state)
...@@ -467,7 +473,6 @@ def _parse(source, state): ...@@ -467,7 +473,6 @@ def _parse(source, state):
sourcematch = source.match sourcematch = source.match
_len = len _len = len
_ord = ord _ord = ord
verbose = state.flags & SRE_FLAG_VERBOSE
while True: while True:
...@@ -621,6 +626,8 @@ def _parse(source, state): ...@@ -621,6 +626,8 @@ def _parse(source, state):
group = True group = True
name = None name = None
condgroup = None condgroup = None
add_flags = 0
del_flags = 0
if sourcematch("?"): if sourcematch("?"):
# options # options
char = sourceget() char = sourceget()
...@@ -682,7 +689,7 @@ def _parse(source, state): ...@@ -682,7 +689,7 @@ def _parse(source, state):
lookbehindgroups = state.lookbehindgroups lookbehindgroups = state.lookbehindgroups
if lookbehindgroups is None: if lookbehindgroups is None:
state.lookbehindgroups = state.groups state.lookbehindgroups = state.groups
p = _parse_sub(source, state) p = _parse_sub(source, state, verbose)
if dir < 0: if dir < 0:
if lookbehindgroups is None: if lookbehindgroups is None:
state.lookbehindgroups = None state.lookbehindgroups = None
...@@ -718,19 +725,13 @@ def _parse(source, state): ...@@ -718,19 +725,13 @@ def _parse(source, state):
raise source.error("invalid group reference", raise source.error("invalid group reference",
len(condname) + 1) len(condname) + 1)
state.checklookbehindgroup(condgroup, source) state.checklookbehindgroup(condgroup, source)
elif char in FLAGS: elif char in FLAGS or char == "-":
# flags # flags
while True: flags = _parse_flags(source, state, char)
state.flags |= FLAGS[char] if flags is None: # global flags
char = sourceget() continue
if char is None: add_flags, del_flags = flags
raise source.error("missing )") group = None
if char == ")":
break
if char not in FLAGS:
raise source.error("unknown flag", len(char))
verbose = state.flags & SRE_FLAG_VERBOSE
continue
else: else:
raise source.error("unknown extension ?" + char, raise source.error("unknown extension ?" + char,
len(char) + 1) len(char) + 1)
...@@ -742,15 +743,17 @@ def _parse(source, state): ...@@ -742,15 +743,17 @@ def _parse(source, state):
except error as err: except error as err:
raise source.error(err.msg, len(name) + 1) from None raise source.error(err.msg, len(name) + 1) from None
if condgroup: if condgroup:
p = _parse_sub_cond(source, state, condgroup) p = _parse_sub_cond(source, state, condgroup, verbose)
else: else:
p = _parse_sub(source, state) sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
not (del_flags & SRE_FLAG_VERBOSE))
p = _parse_sub(source, state, sub_verbose)
if not source.match(")"): if not source.match(")"):
raise source.error("missing ), unterminated subpattern", raise source.error("missing ), unterminated subpattern",
source.tell() - start) source.tell() - start)
if group is not None: if group is not None:
state.closegroup(group, p) state.closegroup(group, p)
subpatternappend((SUBPATTERN, (group, p))) subpatternappend((SUBPATTERN, (group, add_flags, del_flags, p)))
elif this == "^": elif this == "^":
subpatternappend((AT, AT_BEGINNING)) subpatternappend((AT, AT_BEGINNING))
...@@ -763,6 +766,53 @@ def _parse(source, state): ...@@ -763,6 +766,53 @@ def _parse(source, state):
return subpattern return subpattern
def _parse_flags(source, state, char):
    """Parse the flag letters of an inline flag group.

    Called after ``(?`` has been consumed, with *char* being the first
    character that follows it (a flag letter from ``FLAGS`` or ``"-"``).
    Characters are read from *source* until the terminating ``)`` or ``:``.

    Returns ``None`` for a global flag group (terminated by ``)``); in that
    case the collected flags are OR-ed into ``state.flags``.  Returns the
    tuple ``(add_flags, del_flags)`` for a scoped group (terminated by
    ``:``).

    Raises ``Verbose`` when a global group turns on ``SRE_FLAG_VERBOSE``
    that is not yet set in ``state.flags``, and ``source.error(...)`` for
    malformed or contradictory flag specifications.
    """
    sourceget = source.get
    add_flags = 0
    del_flags = 0
    if char != "-":
        # Collect the flags to switch on, up to ")", "-" or ":".
        while True:
            add_flags |= FLAGS[char]
            char = sourceget()
            if char is None:
                raise source.error("missing -, : or )")
            if char in ")-:":
                break
            if char not in FLAGS:
                msg = "unknown flag" if char.isalpha() else "missing -, : or )"
                raise source.error(msg, len(char))
    if char == ")":
        # Global flags: they apply to the whole pattern.
        if ((add_flags & SRE_FLAG_VERBOSE) and
                not (state.flags & SRE_FLAG_VERBOSE)):
            raise Verbose
        state.flags |= add_flags
        return None
    # From here on the group is scoped; global-only flags are not allowed.
    if add_flags & GLOBAL_FLAGS:
        raise source.error("bad inline flags: cannot turn on global flag", 1)
    if char == "-":
        # At least one flag letter must follow the "-".
        char = sourceget()
        if char is None:
            raise source.error("missing flag")
        if char not in FLAGS:
            msg = "unknown flag" if char.isalpha() else "missing flag"
            raise source.error(msg, len(char))
        # Collect the flags to switch off, up to ":".
        while True:
            del_flags |= FLAGS[char]
            char = sourceget()
            if char is None:
                raise source.error("missing :")
            if char == ":":
                break
            if char not in FLAGS:
                msg = "unknown flag" if char.isalpha() else "missing :"
                raise source.error(msg, len(char))
    assert char == ":"
    if del_flags & GLOBAL_FLAGS:
        raise source.error("bad inline flags: cannot turn off global flag", 1)
    if add_flags & del_flags:
        raise source.error("bad inline flags: flag turned on and off", 1)
    return add_flags, del_flags
def fix_flags(src, flags): def fix_flags(src, flags):
# Check and fix flags according to the type of pattern (str or bytes) # Check and fix flags according to the type of pattern (str or bytes)
if isinstance(src, str): if isinstance(src, str):
...@@ -789,18 +839,22 @@ def parse(str, flags=0, pattern=None): ...@@ -789,18 +839,22 @@ def parse(str, flags=0, pattern=None):
pattern.flags = flags pattern.flags = flags
pattern.str = str pattern.str = str
p = _parse_sub(source, pattern, 0) try:
p = _parse_sub(source, pattern, flags & SRE_FLAG_VERBOSE, False)
except Verbose:
# the VERBOSE flag was switched on inside the pattern. to be
# on the safe side, we'll parse the whole thing again...
pattern = Pattern()
pattern.flags = flags | SRE_FLAG_VERBOSE
pattern.str = str
p = _parse_sub(source, pattern, True, False)
p.pattern.flags = fix_flags(str, p.pattern.flags) p.pattern.flags = fix_flags(str, p.pattern.flags)
if source.next is not None: if source.next is not None:
assert source.next == ")" assert source.next == ")"
raise source.error("unbalanced parenthesis") raise source.error("unbalanced parenthesis")
if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:
# the VERBOSE flag was switched on inside the pattern. to be
# on the safe side, we'll parse the whole thing again...
return parse(str, p.pattern.flags)
if flags & SRE_FLAG_DEBUG: if flags & SRE_FLAG_DEBUG:
p.dump() p.dump()
......
...@@ -1376,6 +1376,38 @@ class ReTests(unittest.TestCase): ...@@ -1376,6 +1376,38 @@ class ReTests(unittest.TestCase):
self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE) self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE)
self.assertRaises(ValueError, re.compile, b'(?aL)') self.assertRaises(ValueError, re.compile, b'(?aL)')
def test_scoped_flags(self):
    # IGNORECASE switched on or off for the group only; the rest of the
    # pattern keeps the surrounding setting.
    self.assertTrue(re.match(r'(?i:a)b', 'Ab'))
    self.assertIsNone(re.match(r'(?i:a)b', 'aB'))
    self.assertIsNone(re.match(r'(?-i:a)b', 'Ab', re.IGNORECASE))
    self.assertTrue(re.match(r'(?-i:a)b', 'aB', re.IGNORECASE))
    self.assertIsNone(re.match(r'(?i:(?-i:a)b)', 'Ab'))
    self.assertTrue(re.match(r'(?i:(?-i:a)b)', 'aB'))

    # VERBOSE switched on or off for the group only: whitespace is
    # ignored exactly where verbose mode is in effect.
    self.assertTrue(re.match(r'(?x: a) b', 'a b'))
    self.assertIsNone(re.match(r'(?x: a) b', ' a b'))
    self.assertTrue(re.match(r'(?-x: a) b', ' ab', re.VERBOSE))
    self.assertIsNone(re.match(r'(?-x: a) b', 'ab', re.VERBOSE))

    # Invalid flag combinations in a scoped group.
    # NOTE(review): the last argument is presumably the expected error
    # position; checkPatternError is defined outside this view.
    self.checkPatternError(r'(?a:\w)',
                           'bad inline flags: cannot turn on global flag', 3)
    self.checkPatternError(r'(?a)(?-a:\w)',
                           'bad inline flags: cannot turn off global flag', 8)
    self.checkPatternError(r'(?i-i:a)',
                           'bad inline flags: flag turned on and off', 5)

    # Truncated or malformed flag groups.
    self.checkPatternError(r'(?-', 'missing flag', 3)
    self.checkPatternError(r'(?-+', 'missing flag', 3)
    self.checkPatternError(r'(?-z', 'unknown flag', 3)
    self.checkPatternError(r'(?-i', 'missing :', 4)
    self.checkPatternError(r'(?-i)', 'missing :', 4)
    self.checkPatternError(r'(?-i+', 'missing :', 4)
    self.checkPatternError(r'(?-iz', 'unknown flag', 4)
    self.checkPatternError(r'(?i:', 'missing ), unterminated subpattern', 0)
    self.checkPatternError(r'(?i', 'missing -, : or )', 3)
    self.checkPatternError(r'(?i+', 'missing -, : or )', 3)
    self.checkPatternError(r'(?iz', 'unknown flag', 3)
def test_bug_6509(self): def test_bug_6509(self):
# Replacement strings of both types must parse properly. # Replacement strings of both types must parse properly.
# all strings # all strings
...@@ -1538,9 +1570,9 @@ class ReTests(unittest.TestCase): ...@@ -1538,9 +1570,9 @@ class ReTests(unittest.TestCase):
with captured_stdout() as out: with captured_stdout() as out:
re.compile(pat, re.DEBUG) re.compile(pat, re.DEBUG)
dump = '''\ dump = '''\
SUBPATTERN 1 SUBPATTERN 1 0 0
LITERAL 46 LITERAL 46
SUBPATTERN None SUBPATTERN None 0 0
BRANCH BRANCH
IN IN
LITERAL 99 LITERAL 99
...@@ -1548,7 +1580,7 @@ SUBPATTERN None ...@@ -1548,7 +1580,7 @@ SUBPATTERN None
OR OR
LITERAL 112 LITERAL 112
LITERAL 121 LITERAL 121
SUBPATTERN None SUBPATTERN None 0 0
GROUPREF_EXISTS 1 GROUPREF_EXISTS 1
AT AT_END AT AT_END
ELSE ELSE
...@@ -1664,7 +1696,7 @@ SUBPATTERN None ...@@ -1664,7 +1696,7 @@ SUBPATTERN None
self.checkPatternError(r'(?P', 'unexpected end of pattern', 3) self.checkPatternError(r'(?P', 'unexpected end of pattern', 3)
self.checkPatternError(r'(?z)', 'unknown extension ?z', 1) self.checkPatternError(r'(?z)', 'unknown extension ?z', 1)
self.checkPatternError(r'(?iz)', 'unknown flag', 3) self.checkPatternError(r'(?iz)', 'unknown flag', 3)
self.checkPatternError(r'(?i', 'missing )', 3) self.checkPatternError(r'(?i', 'missing -, : or )', 3)
self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0) self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0)
self.checkPatternError(r'(?<', 'unexpected end of pattern', 3) self.checkPatternError(r'(?<', 'unexpected end of pattern', 3)
self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1) self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1)
......
...@@ -120,6 +120,8 @@ Core and Builtins ...@@ -120,6 +120,8 @@ Core and Builtins
Library Library
------- -------
- Issue #433028: Added support of modifier spans in regular expressions.
- Issue #24594: Validates persist parameter when opening MSI database - Issue #24594: Validates persist parameter when opening MSI database
- Issue #28047: Fixed calculation of line length used for the base64 CTE - Issue #28047: Fixed calculation of line length used for the base64 CTE
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment