Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
05cb728d
Commit
05cb728d
authored
Nov 16, 2017
by
Serhiy Storchaka
Committed by
GitHub
Nov 16, 2017
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
bpo-30349: Raise FutureWarning for nested sets and set operations (#1553)
in regular expressions.
parent
3daaafb7
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
106 additions
and
9 deletions
+106
-9
Doc/library/re.rst
Doc/library/re.rst
+15
-1
Doc/tools/susp-ignored.csv
Doc/tools/susp-ignored.csv
+1
-1
Doc/whatsnew/3.7.rst
Doc/whatsnew/3.7.rst
+11
-0
Lib/email/_header_value_parser.py
Lib/email/_header_value_parser.py
+4
-5
Lib/re.py
Lib/re.py
+2
-1
Lib/sre_parse.py
Lib/sre_parse.py
+24
-0
Lib/test/test_re.py
Lib/test/test_re.py
+46
-1
Misc/NEWS.d/next/Library/2017-10-05-12-45-29.bpo-30349.6zKJsF.rst
...S.d/next/Library/2017-10-05-12-45-29.bpo-30349.6zKJsF.rst
+3
-0
No files found.
Doc/library/re.rst
View file @
05cb728d
...
...
@@ -200,6 +200,20 @@ The special characters are:
place it at the beginning of the set. For example, ``[()[\]{}]`` and
``[]()[{}]`` will both match a parenthesis.
* Support of nested sets and set operations as in `Unicode Technical
Standard #18`_ might be added in the future. This would change the
syntax, so to facilitate this change a :exc:`FutureWarning` will be raised
in ambiguous cases for the time being.
This includes sets starting with a literal ``'['`` or containing literal
character sequences ``'--'``, ``'&&'``, ``'~~'``, and ``'||'``. To
avoid a warning, escape them with a backslash.
.. _Unicode Technical Standard #18: https://unicode.org/reports/tr18/
.. versionchanged:: 3.7
:exc:`FutureWarning` is raised if a character set contains constructs
that will change semantically in the future.
``|``
``A|B``, where *A* and *B* can be arbitrary REs, creates a regular expression that
will match either *A* or *B*. An arbitrary number of REs can be separated by the
...
...
@@ -829,7 +843,7 @@ form.
>>> legal_chars = string.ascii_lowercase + string.digits + "!#$%&'*+-.^_`|~:"
>>> print('[%s]+' % re.escape(legal_chars))
[abcdefghijklmnopqrstuvwxyz0123456789!\#\$%
&'\*\+\-\.\^_`\|
~:]+
[abcdefghijklmnopqrstuvwxyz0123456789!\#\$%
\&'\*\+\-\.\^_`\|\
~:]+
>>> operators = ['+', '-', '*', '/', '**']
>>> print('|'.join(map(re.escape, sorted(operators, reverse=True))))
...
...
Doc/tools/susp-ignored.csv
View file @
05cb728d
...
...
@@ -300,7 +300,7 @@ whatsnew/3.2,,:gz,">>> with tarfile.open(name='myarchive.tar.gz', mode='w:gz') a
whatsnew/3.2,,:location,zope9-location = ${zope9:location}
whatsnew/3.2,,:prefix,zope-conf = ${custom:prefix}/etc/zope.conf
library/re,,`,!#$%&'*+-.^_`|~:
library/re,,`,!\#\$%
&'\*\+\-\.\^_`\|
~:
library/re,,`,!\#\$%
\&'\*\+\-\.\^_`\|\
~:
library/tarfile,,:xz,'x:xz'
library/xml.etree.elementtree,,:sometag,prefix:sometag
library/xml.etree.elementtree,,:fictional,"<actors xmlns:fictional=""http://characters.example.com"""
...
...
Doc/whatsnew/3.7.rst
View file @
05cb728d
...
...
@@ -700,6 +700,17 @@ Changes in the Python API
argument ``os.scandir`` instead of ``os.listdir`` when listing the directory
fails.
* Support of nested sets and set operations in regular expressions as in
`Unicode Technical Standard #18`_ might be added in the future. This would
change the syntax, so to facilitate this change a :exc:`FutureWarning` will
be raised in ambiguous cases for the time being.
This includes sets starting with a literal ``'['`` or containing literal
character sequences ``'--'``, ``'&&'``, ``'~~'``, and ``'||'``. To
avoid a warning, escape them with a backslash.
(Contributed by Serhiy Storchaka in :issue:`30349`.)
.. _Unicode Technical Standard #18: https://unicode.org/reports/tr18/
Changes in the C API
--------------------
...
...
Lib/email/_header_value_parser.py
View file @
05cb728d
...
...
@@ -1354,15 +1354,14 @@ RouteComponentMarker = ValueTerminal('@', 'route-component-marker')
_wsp_splitter
=
re
.
compile
(
r'([{}]+)'
.
format
(
''
.
join
(
WSP
))).
split
_non_atom_end_matcher
=
re
.
compile
(
r"[^{}]+"
.
format
(
''
.
join
(
ATOM_ENDS
).
replace
(
'
\
\
'
,
'
\
\
\
\
'
).
replace
(
']'
,
r'\
]
'
))).match
re
.
escape
(
''
.
join
(
ATOM_ENDS
)
))).
match
_non_printable_finder
=
re
.
compile
(
r"[\x00-\x20\x7F]"
).
findall
_non_token_end_matcher
=
re
.
compile
(
r"[^{}]+"
.
format
(
''.join(TOKEN_ENDS).replace('
\\
','
\\\\
').replace('
]
',r'
\
]
'
))).match
re
.
escape
(
''
.
join
(
TOKEN_ENDS
)
))).
match
_non_attribute_end_matcher
=
re
.
compile
(
r"[^{}]+"
.
format
(
''.join(ATTRIBUTE_ENDS).replace('
\\
','
\\\\
').replace('
]
',r'
\
]
'
))).match
re
.
escape
(
''
.
join
(
ATTRIBUTE_ENDS
)
))).
match
_non_extended_attribute_end_matcher
=
re
.
compile
(
r"[^{}]+"
.
format
(
''.join(EXTENDED_ATTRIBUTE_ENDS).replace(
'
\\
','
\\\\
').replace('
]
',r'
\
]
'))).match
re
.
escape
(
''
.
join
(
EXTENDED_ATTRIBUTE_ENDS
)))).
match
def
_validate_xtext
(
xtext
):
"""If input token contains ASCII non-printables, register a defect."""
...
...
Lib/re.py
View file @
05cb728d
...
...
@@ -251,8 +251,9 @@ def template(pattern, flags=0):
# SPECIAL_CHARS
# closing ')', '}' and ']'
# '-' (a range in character set)
# '&', '~', (extended character set operations)
# '#' (comment) and WHITESPACE (ignored) in verbose mode
_special_chars_map
=
{
i
:
'
\
\
'
+
chr
(
i
)
for
i
in
b'()[]{}?*+-|^$
\
\
.#
\
t
\
n
\
r
\
v
\
f
'
}
_special_chars_map
=
{
i
:
'
\
\
'
+
chr
(
i
)
for
i
in
b'()[]{}?*+-|^$
\
\
.
&~
#
\
t
\
n
\
r
\
v
\
f
'
}
def
escape
(
pattern
):
"""
...
...
Lib/sre_parse.py
View file @
05cb728d
...
...
@@ -517,6 +517,12 @@ def _parse(source, state, verbose, nested, first=False):
setappend
=
set
.
append
## if sourcematch(":"):
## pass # handle character classes
if
source
.
next
==
'['
:
import
warnings
warnings
.
warn
(
'Possible nested set at position %d'
%
source
.
tell
(),
FutureWarning
,
stacklevel
=
nested
+
6
)
negate
=
sourcematch
(
"^"
)
# check remaining characters
while
True
:
...
...
@@ -529,6 +535,17 @@ def _parse(source, state, verbose, nested, first=False):
elif
this
[
0
]
==
"
\
\
"
:
code1
=
_class_escape
(
source
,
this
)
else
:
if
set
and
this
in
'-&~|'
and
source
.
next
==
this
:
import
warnings
warnings
.
warn
(
'Possible set %s at position %d'
%
(
'difference'
if
this
==
'-'
else
'intersection'
if
this
==
'&'
else
'symmetric difference'
if
this
==
'~'
else
'union'
,
source
.
tell
()
-
1
),
FutureWarning
,
stacklevel
=
nested
+
6
)
code1
=
LITERAL
,
_ord
(
this
)
if
sourcematch
(
"-"
):
# potential range
...
...
@@ -545,6 +562,13 @@ def _parse(source, state, verbose, nested, first=False):
if
that
[
0
]
==
"
\
\
"
:
code2
=
_class_escape
(
source
,
that
)
else
:
if
that
==
'-'
:
import
warnings
warnings
.
warn
(
'Possible set difference at position %d'
%
(
source
.
tell
()
-
2
),
FutureWarning
,
stacklevel
=
nested
+
6
)
code2
=
LITERAL
,
_ord
(
that
)
if
code1
[
0
]
!=
LITERAL
or
code2
[
0
]
!=
LITERAL
:
msg
=
"bad character range %s-%s"
%
(
this
,
that
)
...
...
Lib/test/test_re.py
View file @
05cb728d
...
...
@@ -914,6 +914,51 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.search(r"
\
s
([
^
a
])
", "
b").group(1), "b")
self.assertEqual(re.search(r"
\
s
([
^
a
]
*
)
", "
bb").group(1), "bb")
def test_possible_set_operations(self):
s = bytes(range(128)).decode()
with self.assertWarns(FutureWarning):
p = re.compile(r'[0-9--1]')
self.assertEqual(p.findall(s), list('-./0123456789'))
self.assertEqual(re.findall(r'[--1]', s), list('-./01'))
with self.assertWarns(FutureWarning):
p = re.compile(r'[%--1]')
self.assertEqual(p.findall(s), list("
%&
'()*+,-1"))
with self.assertWarns(FutureWarning):
p = re.compile(r'
[
%--
]
')
self.assertEqual(p.findall(s), list("%&'
()
*+
,
-
"))
with self.assertWarns(FutureWarning):
p = re.compile(r'[0-9&&1]')
self.assertEqual(p.findall(s), list('&0123456789'))
with self.assertWarns(FutureWarning):
p = re.compile(r'[
\
d&&
1
]')
self.assertEqual(p.findall(s), list('&0123456789'))
self.assertEqual(re.findall(r'[&&1]', s), list('&1'))
with self.assertWarns(FutureWarning):
p = re.compile(r'[0-9||a]')
self.assertEqual(p.findall(s), list('0123456789a|'))
with self.assertWarns(FutureWarning):
p = re.compile(r'[
\
d||
a
]')
self.assertEqual(p.findall(s), list('0123456789a|'))
self.assertEqual(re.findall(r'[||1]', s), list('1|'))
with self.assertWarns(FutureWarning):
p = re.compile(r'[0-9~~1]')
self.assertEqual(p.findall(s), list('0123456789~'))
with self.assertWarns(FutureWarning):
p = re.compile(r'[
\
d~~
1
]')
self.assertEqual(p.findall(s), list('0123456789~'))
self.assertEqual(re.findall(r'[~~1]', s), list('1~'))
with self.assertWarns(FutureWarning):
p = re.compile(r'[[0-9]|]')
self.assertEqual(p.findall(s), list('0123456789[]'))
with self.assertWarns(FutureWarning):
p = re.compile(r'[[:digit:]|]')
self.assertEqual(p.findall(s), list(':[]dgit'))
def test_search_coverage(self):
self.assertEqual(re.search(r"
\
s
(
b
)
", "
b").group(1), "b")
self.assertEqual(re.search(r"
a
\
s
", "
a
").group(0), "
a
")
...
...
@@ -932,7 +977,7 @@ class ReTests(unittest.TestCase):
self.assertEqual(m.group(), match)
self.assertEqual(m.span(), span)
LITERAL_CHARS = string.ascii_letters + string.digits + '!"
%
&
\
',/:;<=>@_`~
'
LITERAL_CHARS = string.ascii_letters + string.digits + '!"
%
\
',/:;<=>@_`
'
def
test_re_escape
(
self
):
p
=
''
.
join
(
chr
(
i
)
for
i
in
range
(
256
))
...
...
Misc/NEWS.d/next/Library/2017-10-05-12-45-29.bpo-30349.6zKJsF.rst
0 → 100644
View file @
05cb728d
FutureWarning is now emitted if a regular expression contains character set
constructs that will change semantically in the future (nested sets and set
operations).
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment