Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
be9a4e5c
Commit
be9a4e5c
authored
Sep 10, 2016
by
Serhiy Storchaka
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Issue #433028: Added support of modifier spans in regular expressions.
parent
ee73a657
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
180 additions
and
66 deletions
+180
-66
Doc/library/re.rst
Doc/library/re.rst
+10
-0
Doc/whatsnew/3.6.rst
Doc/whatsnew/3.6.rst
+9
-0
Lib/re.py
Lib/re.py
+1
-1
Lib/sre_compile.py
Lib/sre_compile.py
+38
-31
Lib/sre_parse.py
Lib/sre_parse.py
+84
-30
Lib/test/test_re.py
Lib/test/test_re.py
+36
-4
Misc/NEWS
Misc/NEWS
+2
-0
No files found.
Doc/library/re.rst
View file @
be9a4e5c
...
...
@@ -237,6 +237,16 @@ The special characters are:
*cannot* be retrieved after performing a match or referenced later in the
pattern.
``(?imsx-imsx:...)``
(Zero or more letters from the set ``'i'``, ``'m'``, ``'s'``, ``'x'``,
optionally followed by ``'-'`` followed by one or more letters from the
same set.) The letters set or remove the corresponding flags:
:const:`re.I` (ignore case), :const:`re.M` (multi-line), :const:`re.S`
(dot matches all), and :const:`re.X` (verbose), for the part of the
expression. (The flags are described in :ref:`contents-of-module-re`.)
.. versionadded:: 3.6
``(?P<name>...)``
Similar to regular parentheses, but the substring matched by the group is
accessible via the symbolic group name *name*. Group names must be valid
...
...
Doc/whatsnew/3.6.rst
View file @
be9a4e5c
...
...
@@ -645,6 +645,15 @@ Protocol version 4 already supports this case. (Contributed by Serhiy
Storchaka in :issue:`24164`.)
re
--
Added support for modifier spans in regular expressions. Examples:
``'(?i:p)ython'`` matches ``'python'`` and ``'Python'``, but not ``'PYTHON'``;
``'(?i)g(?-i:v)r'`` matches ``'GvR'`` and ``'gvr'``, but not ``'GVR'``.
(Contributed by Serhiy Storchaka in :issue:`433028`.)
readline
--------
...
...
Lib/re.py
View file @
be9a4e5c
...
...
@@ -352,7 +352,7 @@ class Scanner:
for
phrase
,
action
in
lexicon
:
gid
=
s
.
opengroup
()
p
.
append
(
sre_parse
.
SubPattern
(
s
,
[
(
SUBPATTERN
,
(
gid
,
sre_parse
.
parse
(
phrase
,
flags
))),
(
SUBPATTERN
,
(
gid
,
0
,
0
,
sre_parse
.
parse
(
phrase
,
flags
))),
]))
s
.
closegroup
(
gid
,
p
[
-
1
])
p
=
sre_parse
.
SubPattern
(
s
,
[(
BRANCH
,
(
None
,
p
))])
...
...
Lib/sre_compile.py
View file @
be9a4e5c
...
...
@@ -71,7 +71,8 @@ def _compile(code, pattern, flags):
ASSERT_CODES
=
_ASSERT_CODES
if
(
flags
&
SRE_FLAG_IGNORECASE
and
not
(
flags
&
SRE_FLAG_LOCALE
)
and
flags
&
SRE_FLAG_UNICODE
):
flags
&
SRE_FLAG_UNICODE
and
not
(
flags
&
SRE_FLAG_ASCII
)):
fixes
=
_ignorecase_fixes
else
:
fixes
=
None
...
...
@@ -137,14 +138,15 @@ def _compile(code, pattern, flags):
else
:
emit
(
MIN_UNTIL
)
elif
op
is
SUBPATTERN
:
if
av
[
0
]:
group
,
add_flags
,
del_flags
,
p
=
av
if
group
:
emit
(
MARK
)
emit
((
av
[
0
]
-
1
)
*
2
)
# _compile_info(code,
av[1],
flags)
_compile
(
code
,
av
[
1
],
flags
)
if
av
[
0
]
:
emit
((
group
-
1
)
*
2
)
# _compile_info(code,
p, (flags | add_flags) & ~del_
flags)
_compile
(
code
,
p
,
(
flags
|
add_flags
)
&
~
del_
flags
)
if
group
:
emit
(
MARK
)
emit
((
av
[
0
]
-
1
)
*
2
+
1
)
emit
((
group
-
1
)
*
2
+
1
)
elif
op
in
SUCCESS_CODES
:
emit
(
op
)
elif
op
in
ASSERT_CODES
:
...
...
@@ -172,7 +174,7 @@ def _compile(code, pattern, flags):
av
=
AT_MULTILINE
.
get
(
av
,
av
)
if
flags
&
SRE_FLAG_LOCALE
:
av
=
AT_LOCALE
.
get
(
av
,
av
)
elif
flags
&
SRE_FLAG_UNICODE
:
elif
(
flags
&
SRE_FLAG_UNICODE
)
and
not
(
flags
&
SRE_FLAG_ASCII
)
:
av
=
AT_UNICODE
.
get
(
av
,
av
)
emit
(
av
)
elif
op
is
BRANCH
:
...
...
@@ -193,7 +195,7 @@ def _compile(code, pattern, flags):
emit
(
op
)
if
flags
&
SRE_FLAG_LOCALE
:
av
=
CH_LOCALE
[
av
]
elif
flags
&
SRE_FLAG_UNICODE
:
elif
(
flags
&
SRE_FLAG_UNICODE
)
and
not
(
flags
&
SRE_FLAG_ASCII
)
:
av
=
CH_UNICODE
[
av
]
emit
(
av
)
elif
op
is
GROUPREF
:
...
...
@@ -237,7 +239,7 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
elif
op
is
CATEGORY
:
if
flags
&
SRE_FLAG_LOCALE
:
emit
(
CH_LOCALE
[
av
])
elif
flags
&
SRE_FLAG_UNICODE
:
elif
(
flags
&
SRE_FLAG_UNICODE
)
and
not
(
flags
&
SRE_FLAG_ASCII
)
:
emit
(
CH_UNICODE
[
av
])
else
:
emit
(
av
)
...
...
@@ -414,14 +416,16 @@ def _get_literal_prefix(pattern):
prefix
=
[]
prefixappend
=
prefix
.
append
prefix_skip
=
None
got_all
=
True
for
op
,
av
in
pattern
.
data
:
if
op
is
LITERAL
:
prefixappend
(
av
)
elif
op
is
SUBPATTERN
:
prefix1
,
prefix_skip1
,
got_all
=
_get_literal_prefix
(
av
[
1
])
group
,
add_flags
,
del_flags
,
p
=
av
if
add_flags
&
SRE_FLAG_IGNORECASE
:
break
prefix1
,
prefix_skip1
,
got_all
=
_get_literal_prefix
(
p
)
if
prefix_skip
is
None
:
if
av
[
0
]
is
not
None
:
if
group
is
not
None
:
prefix_skip
=
len
(
prefix
)
elif
prefix_skip1
is
not
None
:
prefix_skip
=
len
(
prefix
)
+
prefix_skip1
...
...
@@ -429,32 +433,35 @@ def _get_literal_prefix(pattern):
if
not
got_all
:
break
else
:
got_all
=
False
break
return
prefix
,
prefix_skip
,
got_all
else
:
return
prefix
,
prefix_skip
,
True
return
prefix
,
prefix_skip
,
False
def
_get_charset_prefix
(
pattern
):
charset
=
[]
# not used
charsetappend
=
charset
.
append
if
pattern
.
data
:
op
,
av
=
pattern
.
data
[
0
]
if
op
is
SUBPATTERN
and
av
[
1
]:
op
,
av
=
av
[
1
][
0
]
if
op
is
LITERAL
:
charsetappend
((
op
,
av
))
elif
op
is
BRANCH
:
c
=
[]
cappend
=
c
.
append
for
p
in
av
[
1
]:
if
not
p
:
break
op
,
av
=
p
[
0
]
if
op
is
LITERAL
:
cappend
((
op
,
av
))
if
op
is
SUBPATTERN
:
group
,
add_flags
,
del_flags
,
p
=
av
if
p
and
not
(
add_flags
&
SRE_FLAG_IGNORECASE
):
op
,
av
=
p
[
0
]
if
op
is
LITERAL
:
charsetappend
((
op
,
av
))
elif
op
is
BRANCH
:
c
=
[]
cappend
=
c
.
append
for
p
in
av
[
1
]:
if
not
p
:
break
op
,
av
=
p
[
0
]
if
op
is
LITERAL
:
cappend
((
op
,
av
))
else
:
break
else
:
break
else
:
charset
=
c
charset
=
c
elif
op
is
BRANCH
:
c
=
[]
cappend
=
c
.
append
...
...
Lib/sre_parse.py
View file @
be9a4e5c
...
...
@@ -65,6 +65,12 @@ FLAGS = {
"u": SRE_FLAG_UNICODE,
}
GLOBAL_FLAGS = (SRE_FLAG_ASCII | SRE_FLAG_LOCALE | SRE_FLAG_UNICODE |
SRE_FLAG_DEBUG | SRE_FLAG_TEMPLATE)
class Verbose(Exception):
pass
class Pattern:
# master pattern object. keeps track of global attributes
def __init__(self):
...
...
@@ -184,7 +190,7 @@ class SubPattern:
lo = lo + i
hi = hi + j
elif op is SUBPATTERN:
i, j = av[1].getwidth()
i, j = av[
-
1].getwidth()
lo = lo + i
hi = hi + j
elif op in _REPEATCODES:
...
...
@@ -395,7 +401,7 @@ def _escape(source, escape, state):
pass
raise source.error("
bad
escape
%
s
" % escape, len(escape))
def _parse_sub(source, state, nested=True):
def _parse_sub(source, state,
verbose,
nested=True):
# parse an alternation: a|b|c
items = []
...
...
@@ -403,7 +409,7 @@ def _parse_sub(source, state, nested=True):
sourcematch = source.match
start = source.tell()
while True:
itemsappend(_parse(source, state))
itemsappend(_parse(source, state
, verbose
))
if not sourcematch("
|
"):
break
...
...
@@ -445,10 +451,10 @@ def _parse_sub(source, state, nested=True):
subpattern.append((BRANCH, (None, items)))
return subpattern
def _parse_sub_cond(source, state, condgroup):
item_yes = _parse(source, state)
def _parse_sub_cond(source, state, condgroup
, verbose
):
item_yes = _parse(source, state
, verbose
)
if source.match("
|
"):
item_no = _parse(source, state)
item_no = _parse(source, state
, verbose
)
if source.next == "
|
":
raise source.error("
conditional
backref
with
more
than
two
branches
")
else:
...
...
@@ -457,7 +463,7 @@ def _parse_sub_cond(source, state, condgroup):
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
return subpattern
def _parse(source, state):
def _parse(source, state
, verbose
):
# parse a simple pattern
subpattern = SubPattern(state)
...
...
@@ -467,7 +473,6 @@ def _parse(source, state):
sourcematch = source.match
_len = len
_ord = ord
verbose = state.flags & SRE_FLAG_VERBOSE
while True:
...
...
@@ -621,6 +626,8 @@ def _parse(source, state):
group
=
True
name
=
None
condgroup
=
None
add_flags
=
0
del_flags
=
0
if
sourcematch
(
"?"
):
# options
char
=
sourceget
()
...
...
@@ -682,7 +689,7 @@ def _parse(source, state):
lookbehindgroups
=
state
.
lookbehindgroups
if
lookbehindgroups
is
None
:
state
.
lookbehindgroups
=
state
.
groups
p
=
_parse_sub
(
source
,
state
)
p
=
_parse_sub
(
source
,
state
,
verbose
)
if
dir
<
0
:
if
lookbehindgroups
is
None
:
state
.
lookbehindgroups
=
None
...
...
@@ -718,19 +725,13 @@ def _parse(source, state):
raise
source
.
error
(
"invalid group reference"
,
len
(
condname
)
+
1
)
state
.
checklookbehindgroup
(
condgroup
,
source
)
elif
char
in
FLAGS
:
elif
char
in
FLAGS
or
char
==
"-"
:
# flags
while
True
:
state
.
flags
|=
FLAGS
[
char
]
char
=
sourceget
()
if
char
is
None
:
raise
source
.
error
(
"missing )"
)
if
char
==
")"
:
break
if
char
not
in
FLAGS
:
raise
source
.
error
(
"unknown flag"
,
len
(
char
))
verbose
=
state
.
flags
&
SRE_FLAG_VERBOSE
continue
flags
=
_parse_flags
(
source
,
state
,
char
)
if
flags
is
None
:
# global flags
continue
add_flags
,
del_flags
=
flags
group
=
None
else
:
raise
source
.
error
(
"unknown extension ?"
+
char
,
len
(
char
)
+
1
)
...
...
@@ -742,15 +743,17 @@ def _parse(source, state):
except
error
as
err
:
raise
source
.
error
(
err
.
msg
,
len
(
name
)
+
1
)
from
None
if
condgroup
:
p
=
_parse_sub_cond
(
source
,
state
,
condgroup
)
p
=
_parse_sub_cond
(
source
,
state
,
condgroup
,
verbose
)
else
:
p
=
_parse_sub
(
source
,
state
)
sub_verbose
=
((
verbose
or
(
add_flags
&
SRE_FLAG_VERBOSE
))
and
not
(
del_flags
&
SRE_FLAG_VERBOSE
))
p
=
_parse_sub
(
source
,
state
,
sub_verbose
)
if
not
source
.
match
(
")"
):
raise
source
.
error
(
"missing ), unterminated subpattern"
,
source
.
tell
()
-
start
)
if
group
is
not
None
:
state
.
closegroup
(
group
,
p
)
subpatternappend
((
SUBPATTERN
,
(
group
,
p
)))
subpatternappend
((
SUBPATTERN
,
(
group
,
add_flags
,
del_flags
,
p
)))
elif
this
==
"^"
:
subpatternappend
((
AT
,
AT_BEGINNING
))
...
...
@@ -763,6 +766,53 @@ def _parse(source, state):
return
subpattern
def _parse_flags(source, state, char):
    """Parse the flag letters of an inline ``(?flags...)`` construct.

    *char* is the character that follows ``(?`` (a key of ``FLAGS`` or
    ``"-"``); the remaining characters are pulled with ``source.get()``.

    Returns ``None`` for the global form ``(?flags)``, after OR-ing the
    collected flags into ``state.flags``.  Returns an
    ``(add_flags, del_flags)`` tuple for the scoped form
    ``(?flags-flags:``, once the terminating ``":"`` has been consumed.

    Raises ``Verbose`` when the global form switches on
    ``SRE_FLAG_VERBOSE`` while ``state.flags`` does not have it yet, and
    ``source.error(...)`` for malformed or contradictory flag sequences.
    """
    next_char = source.get
    turn_on = 0
    turn_off = 0

    if char != "-":
        # Letters in front of an optional "-" switch flags on.
        while True:
            turn_on |= FLAGS[char]
            char = next_char()
            if char is None:
                raise source.error("missing -, : or )")
            if char in ")-:":
                break
            if char not in FLAGS:
                if char.isalpha():
                    raise source.error("unknown flag", len(char))
                raise source.error("missing -, : or )", len(char))

    if char == ")":
        # Global form "(?flags)": the flags go into the shared state.
        enables_verbose = turn_on & SRE_FLAG_VERBOSE
        already_verbose = state.flags & SRE_FLAG_VERBOSE
        if enables_verbose and not already_verbose:
            # parse() catches this and re-parses the pattern in verbose mode.
            raise Verbose
        state.flags |= turn_on
        return None

    # From here on this is the scoped form, which rejects global-only flags.
    if turn_on & GLOBAL_FLAGS:
        raise source.error("bad inline flags: cannot turn on global flag", 1)

    if char == "-":
        # At least one flag letter has to follow the "-".
        char = next_char()
        if char is None:
            raise source.error("missing flag")
        if char not in FLAGS:
            if char.isalpha():
                raise source.error("unknown flag", len(char))
            raise source.error("missing flag", len(char))
        # Letters after "-" switch flags off; only ":" may end the run.
        while True:
            turn_off |= FLAGS[char]
            char = next_char()
            if char is None:
                raise source.error("missing :")
            if char == ":":
                break
            if char not in FLAGS:
                if char.isalpha():
                    raise source.error("unknown flag", len(char))
                raise source.error("missing :", len(char))

    assert char == ":"
    if turn_off & GLOBAL_FLAGS:
        raise source.error("bad inline flags: cannot turn off global flag", 1)
    if turn_on & turn_off:
        raise source.error("bad inline flags: flag turned on and off", 1)
    return turn_on, turn_off
def
fix_flags
(
src
,
flags
):
# Check and fix flags according to the type of pattern (str or bytes)
if
isinstance
(
src
,
str
):
...
...
@@ -789,18 +839,22 @@ def parse(str, flags=0, pattern=None):
pattern
.
flags
=
flags
pattern
.
str
=
str
p
=
_parse_sub
(
source
,
pattern
,
0
)
try
:
p
=
_parse_sub
(
source
,
pattern
,
flags
&
SRE_FLAG_VERBOSE
,
False
)
except
Verbose
:
# the VERBOSE flag was switched on inside the pattern. to be
# on the safe side, we'll parse the whole thing again...
pattern
=
Pattern
()
pattern
.
flags
=
flags
|
SRE_FLAG_VERBOSE
pattern
.
str
=
str
p
=
_parse_sub
(
source
,
pattern
,
True
,
False
)
p
.
pattern
.
flags
=
fix_flags
(
str
,
p
.
pattern
.
flags
)
if
source
.
next
is
not
None
:
assert
source
.
next
==
")"
raise
source
.
error
(
"unbalanced parenthesis"
)
if
not
(
flags
&
SRE_FLAG_VERBOSE
)
and
p
.
pattern
.
flags
&
SRE_FLAG_VERBOSE
:
# the VERBOSE flag was switched on inside the pattern. to be
# on the safe side, we'll parse the whole thing again...
return
parse
(
str
,
p
.
pattern
.
flags
)
if
flags
&
SRE_FLAG_DEBUG
:
p
.
dump
()
...
...
Lib/test/test_re.py
View file @
be9a4e5c
...
...
@@ -1376,6 +1376,38 @@ class ReTests(unittest.TestCase):
self.assertRaises(ValueError, re.compile, b'(?a)', re.LOCALE)
self.assertRaises(ValueError, re.compile, b'(?aL)')
def test_scoped_flags(self):
    """Check matching and error reporting for ``(?flags-flags:...)`` groups."""
    # (pattern, subject, compile flags, whether a match is expected)
    match_cases = [
        (r'(?i:a)b', 'Ab', 0, True),
        (r'(?i:a)b', 'aB', 0, False),
        (r'(?-i:a)b', 'Ab', re.IGNORECASE, False),
        (r'(?-i:a)b', 'aB', re.IGNORECASE, True),
        (r'(?i:(?-i:a)b)', 'Ab', 0, False),
        (r'(?i:(?-i:a)b)', 'aB', 0, True),
        (r'(?x: a) b', 'a b', 0, True),
        (r'(?x: a) b', ' a b', 0, False),
        (r'(?-x: a) b', ' ab', re.VERBOSE, True),
        (r'(?-x: a) b', 'ab', re.VERBOSE, False),
    ]
    for pattern, subject, flags, should_match in match_cases:
        found = re.match(pattern, subject, flags)
        if should_match:
            self.assertTrue(found)
        else:
            self.assertIsNone(found)

    # (pattern, expected error message, expected error position)
    error_cases = [
        (r'(?a:\w)', 'bad inline flags: cannot turn on global flag', 3),
        (r'(?a)(?-a:\w)', 'bad inline flags: cannot turn off global flag', 8),
        (r'(?i-i:a)', 'bad inline flags: flag turned on and off', 5),
        (r'(?-', 'missing flag', 3),
        (r'(?-+', 'missing flag', 3),
        (r'(?-z', 'unknown flag', 3),
        (r'(?-i', 'missing :', 4),
        (r'(?-i)', 'missing :', 4),
        (r'(?-i+', 'missing :', 4),
        (r'(?-iz', 'unknown flag', 4),
        (r'(?i:', 'missing ), unterminated subpattern', 0),
        (r'(?i', 'missing -, : or )', 3),
        (r'(?i+', 'missing -, : or )', 3),
        (r'(?iz', 'unknown flag', 3),
    ]
    for pattern, message, position in error_cases:
        self.checkPatternError(pattern, message, position)
def test_bug_6509(self):
# Replacement strings of both types must parse properly.
# all strings
...
...
@@ -1538,9 +1570,9 @@ class ReTests(unittest.TestCase):
with captured_stdout() as out:
re.compile(pat, re.DEBUG)
dump = '''
\
SUBPATTERN 1
SUBPATTERN 1
0 0
LITERAL 46
SUBPATTERN None
SUBPATTERN None
0 0
BRANCH
IN
LITERAL 99
...
...
@@ -1548,7 +1580,7 @@ SUBPATTERN None
OR
LITERAL 112
LITERAL 121
SUBPATTERN None
SUBPATTERN None
0 0
GROUPREF_EXISTS 1
AT AT_END
ELSE
...
...
@@ -1664,7 +1696,7 @@ SUBPATTERN None
self.checkPatternError(r'(?P', 'unexpected end of pattern', 3)
self.checkPatternError(r'(?z)', 'unknown extension ?z', 1)
self.checkPatternError(r'(?iz)', 'unknown flag', 3)
self.checkPatternError(r'(?i', 'missing )', 3)
self.checkPatternError(r'(?i', 'missing
-, : or
)', 3)
self.checkPatternError(r'(?#abc', 'missing ), unterminated comment', 0)
self.checkPatternError(r'(?<', 'unexpected end of pattern', 3)
self.checkPatternError(r'(?<>)', 'unknown extension ?<>', 1)
...
...
Misc/NEWS
View file @
be9a4e5c
...
...
@@ -120,6 +120,8 @@ Core and Builtins
Library
-------
-
Issue
#
433028
:
Added
support
of
modifier
spans
in
regular
expressions
.
-
Issue
#
24594
:
Validates
persist
parameter
when
opening
MSI
database
-
Issue
#
28047
:
Fixed
calculation
of
line
length
used
for
the
base64
CTE
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment