Commit 83e80279 authored by Serhiy Storchaka

Issue #22818: Splitting on a pattern that could match an empty string now
raises a warning.  Patterns that can only match empty strings are now
rejected.
parent 32ca3dcb
...@@ -626,17 +626,37 @@ form. ...@@ -626,17 +626,37 @@ form.
That way, separator components are always found at the same relative That way, separator components are always found at the same relative
indices within the result list. indices within the result list.
Note that *split* will never split a string on an empty pattern match. .. note::
For example:
:func:`split` doesn't currently split a string on an empty pattern match.
For example:
>>> re.split('x*', 'axbc')
['a', 'bc']
>>> re.split('x*', 'foo') Even though ``'x*'`` also matches 0 'x' before 'a', between 'b' and 'c',
['foo'] and after 'c', currently these matches are ignored. The correct behavior
>>> re.split("(?m)^$", "foo\n\nbar\n") (i.e. splitting on empty matches too and returning ``['', 'a', 'b', 'c',
['foo\n\nbar\n'] '']``) will be implemented in future versions of Python, but since this
is a backward-incompatible change, a :exc:`FutureWarning` will be raised
in the meantime.
Patterns that can only match empty strings currently never split the
string. Since this doesn't match the expected behavior, a
:exc:`ValueError` will be raised starting from Python 3.5::
>>> re.split("^$", "foo\n\nbar\n", flags=re.M)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
...
ValueError: split() requires a non-empty pattern match.
.. versionchanged:: 3.1 .. versionchanged:: 3.1
Added the optional flags argument. Added the optional flags argument.
.. versionchanged:: 3.5
Splitting on a pattern that could match an empty string now raises
a warning. Patterns that can only match empty strings are now rejected.
.. function:: findall(pattern, string, flags=0) .. function:: findall(pattern, string, flags=0)
......
...@@ -482,6 +482,13 @@ Changes in the Python API ...@@ -482,6 +482,13 @@ Changes in the Python API
simply define :meth:`~importlib.machinery.Loader.create_module` to return simply define :meth:`~importlib.machinery.Loader.create_module` to return
``None`` (:issue:`23014`). ``None`` (:issue:`23014`).
* :func:`re.split` always ignored empty pattern matches, so the ``'x*'``
pattern worked the same as ``'x+'``, and the ``'\b'`` pattern never worked.
Now :func:`re.split` raises a warning if the pattern could match
an empty string. For compatibility use patterns that never match an empty
string (e.g. ``'x+'`` instead of ``'x*'``). Patterns that could only match
an empty string (such as ``'\b'``) now raise an error.
Changes in the C API Changes in the C API
-------------------- --------------------
......
...@@ -414,8 +414,11 @@ def _compile_info(code, pattern, flags): ...@@ -414,8 +414,11 @@ def _compile_info(code, pattern, flags):
# this contains min/max pattern width, and an optional literal # this contains min/max pattern width, and an optional literal
# prefix or a character map # prefix or a character map
lo, hi = pattern.getwidth() lo, hi = pattern.getwidth()
if hi > MAXCODE:
hi = MAXCODE
if lo == 0: if lo == 0:
return # not worth it code.extend([INFO, 4, 0, lo, hi])
return
# look for a literal prefix # look for a literal prefix
prefix = [] prefix = []
prefixappend = prefix.append prefixappend = prefix.append
...@@ -495,10 +498,7 @@ def _compile_info(code, pattern, flags): ...@@ -495,10 +498,7 @@ def _compile_info(code, pattern, flags):
else: else:
emit(MAXCODE) emit(MAXCODE)
prefix = prefix[:MAXCODE] prefix = prefix[:MAXCODE]
if hi < MAXCODE: emit(min(hi, MAXCODE))
emit(hi)
else:
emit(0)
# add literal prefix # add literal prefix
if prefix: if prefix:
emit(len(prefix)) # length emit(len(prefix)) # length
......
...@@ -251,28 +251,28 @@ class ReTests(unittest.TestCase): ...@@ -251,28 +251,28 @@ class ReTests(unittest.TestCase):
for string in ":a:b::c", S(":a:b::c"): for string in ":a:b::c", S(":a:b::c"):
self.assertTypedEqual(re.split(":", string), self.assertTypedEqual(re.split(":", string),
['', 'a', 'b', '', 'c']) ['', 'a', 'b', '', 'c'])
self.assertTypedEqual(re.split(":*", string), self.assertTypedEqual(re.split(":+", string),
['', 'a', 'b', 'c']) ['', 'a', 'b', 'c'])
self.assertTypedEqual(re.split("(:*)", string), self.assertTypedEqual(re.split("(:+)", string),
['', ':', 'a', ':', 'b', '::', 'c']) ['', ':', 'a', ':', 'b', '::', 'c'])
for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"), for string in (b":a:b::c", B(b":a:b::c"), bytearray(b":a:b::c"),
memoryview(b":a:b::c")): memoryview(b":a:b::c")):
self.assertTypedEqual(re.split(b":", string), self.assertTypedEqual(re.split(b":", string),
[b'', b'a', b'b', b'', b'c']) [b'', b'a', b'b', b'', b'c'])
self.assertTypedEqual(re.split(b":*", string), self.assertTypedEqual(re.split(b":+", string),
[b'', b'a', b'b', b'c']) [b'', b'a', b'b', b'c'])
self.assertTypedEqual(re.split(b"(:*)", string), self.assertTypedEqual(re.split(b"(:+)", string),
[b'', b':', b'a', b':', b'b', b'::', b'c']) [b'', b':', b'a', b':', b'b', b'::', b'c'])
for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432", for a, b, c in ("\xe0\xdf\xe7", "\u0430\u0431\u0432",
"\U0001d49c\U0001d49e\U0001d4b5"): "\U0001d49c\U0001d49e\U0001d4b5"):
string = ":%s:%s::%s" % (a, b, c) string = ":%s:%s::%s" % (a, b, c)
self.assertEqual(re.split(":", string), ['', a, b, '', c]) self.assertEqual(re.split(":", string), ['', a, b, '', c])
self.assertEqual(re.split(":*", string), ['', a, b, c]) self.assertEqual(re.split(":+", string), ['', a, b, c])
self.assertEqual(re.split("(:*)", string), self.assertEqual(re.split("(:+)", string),
['', ':', a, ':', b, '::', c]) ['', ':', a, ':', b, '::', c])
self.assertEqual(re.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c']) self.assertEqual(re.split("(?::+)", ":a:b::c"), ['', 'a', 'b', 'c'])
self.assertEqual(re.split("(:)*", ":a:b::c"), self.assertEqual(re.split("(:)+", ":a:b::c"),
['', ':', 'a', ':', 'b', ':', 'c']) ['', ':', 'a', ':', 'b', ':', 'c'])
self.assertEqual(re.split("([b:]+)", ":a:b::c"), self.assertEqual(re.split("([b:]+)", ":a:b::c"),
['', ':', 'a', ':b::', 'c']) ['', ':', 'a', ':b::', 'c'])
...@@ -282,13 +282,34 @@ class ReTests(unittest.TestCase): ...@@ -282,13 +282,34 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"), self.assertEqual(re.split("(?:b)|(?::+)", ":a:b::c"),
['', 'a', '', '', 'c']) ['', 'a', '', '', 'c'])
for sep, expected in [
(':*', ['', 'a', 'b', 'c']),
('(?::*)', ['', 'a', 'b', 'c']),
('(:*)', ['', ':', 'a', ':', 'b', '::', 'c']),
('(:)*', ['', ':', 'a', ':', 'b', ':', 'c']),
]:
with self.subTest(sep=sep), self.assertWarns(FutureWarning):
self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
for sep, expected in [
('', [':a:b::c']),
(r'\b', [':a:b::c']),
(r'(?=:)', [':a:b::c']),
(r'(?<=:)', [':a:b::c']),
]:
with self.subTest(sep=sep), self.assertRaises(ValueError):
self.assertTypedEqual(re.split(sep, ':a:b::c'), expected)
def test_qualified_re_split(self): def test_qualified_re_split(self):
self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c']) self.assertEqual(re.split(":", ":a:b::c", maxsplit=2), ['', 'a', 'b::c'])
self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d']) self.assertEqual(re.split(':', 'a:b:c:d', maxsplit=2), ['a', 'b', 'c:d'])
self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2), self.assertEqual(re.split("(:)", ":a:b::c", maxsplit=2),
['', ':', 'a', ':', 'b::c']) ['', ':', 'a', ':', 'b::c'])
self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2), self.assertEqual(re.split("(:+)", ":a:b::c", maxsplit=2),
['', ':', 'a', ':', 'b::c']) ['', ':', 'a', ':', 'b::c'])
with self.assertWarns(FutureWarning):
self.assertEqual(re.split("(:*)", ":a:b::c", maxsplit=2),
['', ':', 'a', ':', 'b::c'])
def test_re_findall(self): def test_re_findall(self):
self.assertEqual(re.findall(":+", "abc"), []) self.assertEqual(re.findall(":+", "abc"), [])
......
...@@ -232,6 +232,10 @@ Core and Builtins ...@@ -232,6 +232,10 @@ Core and Builtins
Library Library
------- -------
- Issue #22818: Splitting on a pattern that could match an empty string now
raises a warning. Patterns that can only match empty strings are now
rejected.
- Issue #23099: Closing io.BytesIO with exported buffer is rejected now to - Issue #23099: Closing io.BytesIO with exported buffer is rejected now to
prevent corrupting exported buffer. prevent corrupting exported buffer.
......
...@@ -863,6 +863,19 @@ pattern_split(PatternObject* self, PyObject* args, PyObject* kw) ...@@ -863,6 +863,19 @@ pattern_split(PatternObject* self, PyObject* args, PyObject* kw)
if (!string) if (!string)
return NULL; return NULL;
assert(self->codesize != 0);
if (self->code[0] != SRE_OP_INFO || self->code[3] == 0) {
if (self->code[0] == SRE_OP_INFO && self->code[4] == 0) {
PyErr_SetString(PyExc_ValueError,
"split() requires a non-empty pattern match.");
return NULL;
}
if (PyErr_WarnEx(PyExc_FutureWarning,
"split() requires a non-empty pattern match.",
1) < 0)
return NULL;
}
string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX); string = state_init(&state, self, string, 0, PY_SSIZE_T_MAX);
if (!string) if (!string)
return NULL; return NULL;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment