Commit 9baa5b2d authored by Serhiy Storchaka

Issue #22437: The number of capturing groups in a regular expression is no longer
limited to 100.
parent c31e6227
...@@ -217,6 +217,12 @@ os ...@@ -217,6 +217,12 @@ os
* :class:`os.stat_result` now has a :attr:`~os.stat_result.st_file_attributes` * :class:`os.stat_result` now has a :attr:`~os.stat_result.st_file_attributes`
attribute on Windows (contributed by Ben Hoyt in :issue:`21719`). attribute on Windows (contributed by Ben Hoyt in :issue:`21719`).
re
--
* The number of capturing groups in a regular expression is no longer limited to 100.
(Contributed by Serhiy Storchaka in :issue:`22437`.)
shutil shutil
------ ------
......
...@@ -470,12 +470,6 @@ def compile(p, flags=0): ...@@ -470,12 +470,6 @@ def compile(p, flags=0):
# print code # print code
# XXX: <fl> get rid of this limitation!
if p.pattern.groups > 100:
raise AssertionError(
"sorry, but this version only supports 100 named groups"
)
# map in either direction # map in either direction
groupindex = p.pattern.groupdict groupindex = p.pattern.groupdict
indexgroup = [None] * p.pattern.groups indexgroup = [None] * p.pattern.groups
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
MAGIC = 20031017 MAGIC = 20031017
from _sre import MAXREPEAT from _sre import MAXREPEAT, MAXGROUPS
# SRE standard exception (access as sre.error) # SRE standard exception (access as sre.error)
# should this really be here? # should this really be here?
......
...@@ -72,6 +72,8 @@ class Pattern: ...@@ -72,6 +72,8 @@ class Pattern:
def opengroup(self, name=None): def opengroup(self, name=None):
gid = self.groups gid = self.groups
self.groups = gid + 1 self.groups = gid + 1
if self.groups > MAXGROUPS:
raise error("groups number is too large")
if name is not None: if name is not None:
ogid = self.groupdict.get(name, None) ogid = self.groupdict.get(name, None)
if ogid is not None: if ogid is not None:
...@@ -695,8 +697,14 @@ def _parse(source, state): ...@@ -695,8 +697,14 @@ def _parse(source, state):
else: else:
try: try:
condgroup = int(condname) condgroup = int(condname)
if condgroup < 0:
raise ValueError
except ValueError: except ValueError:
raise error("bad character in group name") raise error("bad character in group name")
if not condgroup:
raise error("bad group number")
if condgroup >= MAXGROUPS:
raise error("the group number is too large")
else: else:
# flags # flags
if not source.next in FLAGS: if not source.next in FLAGS:
...@@ -822,6 +830,8 @@ def parse_template(source, pattern): ...@@ -822,6 +830,8 @@ def parse_template(source, pattern):
index = int(name) index = int(name)
if index < 0: if index < 0:
raise error("negative group number") raise error("negative group number")
if index >= MAXGROUPS:
raise error("the group number is too large")
except ValueError: except ValueError:
if not name.isidentifier(): if not name.isidentifier():
raise error("bad character in group name") raise error("bad character in group name")
......
...@@ -193,6 +193,7 @@ class ReTests(unittest.TestCase): ...@@ -193,6 +193,7 @@ class ReTests(unittest.TestCase):
def test_symbolic_groups(self): def test_symbolic_groups(self):
re.compile('(?P<a>x)(?P=a)(?(a)y)') re.compile('(?P<a>x)(?P=a)(?(a)y)')
re.compile('(?P<a1>x)(?P=a1)(?(a1)y)') re.compile('(?P<a1>x)(?P=a1)(?(a1)y)')
re.compile('(?P<a1>x)\1(?(1)y)')
self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)') self.assertRaises(re.error, re.compile, '(?P<a>)(?P<a>)')
self.assertRaises(re.error, re.compile, '(?Px)') self.assertRaises(re.error, re.compile, '(?Px)')
self.assertRaises(re.error, re.compile, '(?P=)') self.assertRaises(re.error, re.compile, '(?P=)')
...@@ -212,6 +213,10 @@ class ReTests(unittest.TestCase): ...@@ -212,6 +213,10 @@ class ReTests(unittest.TestCase):
re.compile('(?P<µ>x)(?P=µ)(?(µ)y)') re.compile('(?P<µ>x)(?P=µ)(?(µ)y)')
re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)') re.compile('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)(?P=𝔘𝔫𝔦𝔠𝔬𝔡𝔢)(?(𝔘𝔫𝔦𝔠𝔬𝔡𝔢)y)')
self.assertRaises(re.error, re.compile, '(?P<©>x)') self.assertRaises(re.error, re.compile, '(?P<©>x)')
# Support > 100 groups.
pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
pat = '(?:%s)(?(200)z|t)' % pat
self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
def test_symbolic_refs(self): def test_symbolic_refs(self):
self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx') self.assertRaises(re.error, re.sub, '(?P<a>x)', '\g<a', 'xx')
...@@ -228,6 +233,9 @@ class ReTests(unittest.TestCase): ...@@ -228,6 +233,9 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx') self.assertEqual(re.sub('(?P<µ>x)', r'\g<µ>', 'xx'), 'xx')
self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx') self.assertEqual(re.sub('(?P<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>x)', r'\g<𝔘𝔫𝔦𝔠𝔬𝔡𝔢>', 'xx'), 'xx')
self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx') self.assertRaises(re.error, re.sub, '(?P<a>x)', r'\g<©>', 'xx')
# Support > 100 groups.
pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
self.assertEqual(re.sub(pat, '\g<200>', 'xc8yzxc8y'), 'c8zc8')
def test_re_subn(self): def test_re_subn(self):
self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2)) self.assertEqual(re.subn("(?i)b+", "x", "bbbb BBBB"), ('x x', 2))
...@@ -404,6 +412,10 @@ class ReTests(unittest.TestCase): ...@@ -404,6 +412,10 @@ class ReTests(unittest.TestCase):
self.assertIsNone(p.match('abd')) self.assertIsNone(p.match('abd'))
self.assertIsNone(p.match('ac')) self.assertIsNone(p.match('ac'))
# Support > 100 groups.
pat = '|'.join('x(?P<a%d>%x)y' % (i, i) for i in range(1, 200 + 1))
pat = '(?:%s)(?(200)z)' % pat
self.assertEqual(re.match(pat, 'xc8yz').span(), (0, 5))
def test_re_groupref(self): def test_re_groupref(self):
self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(), self.assertEqual(re.match(r'^(\|)?([^()]+)\1$', '|a|').groups(),
...@@ -1070,8 +1082,10 @@ class ReTests(unittest.TestCase): ...@@ -1070,8 +1082,10 @@ class ReTests(unittest.TestCase):
# a RuntimeError is raised instead of OverflowError. # a RuntimeError is raised instead of OverflowError.
long_overflow = 2**128 long_overflow = 2**128
self.assertRaises(TypeError, re.finditer, "a", {}) self.assertRaises(TypeError, re.finditer, "a", {})
self.assertRaises(OverflowError, _sre.compile, "abc", 0, [long_overflow]) with self.assertRaises(OverflowError):
self.assertRaises(TypeError, _sre.compile, {}, 0, []) _sre.compile("abc", 0, [long_overflow], 0, [], [])
with self.assertRaises(TypeError):
_sre.compile({}, 0, [], 0, [], [])
def test_search_dot_unicode(self): def test_search_dot_unicode(self):
self.assertTrue(re.search("123.*-", '123abc-')) self.assertTrue(re.search("123.*-", '123abc-'))
......
...@@ -145,6 +145,9 @@ Core and Builtins ...@@ -145,6 +145,9 @@ Core and Builtins
Library Library
------- -------
- Issue #22437: The number of capturing groups in a regular expression is no longer
limited to 100.
- Issue #17442: InteractiveInterpreter now displays the full chained traceback - Issue #17442: InteractiveInterpreter now displays the full chained traceback
in its showtraceback method, to match the built in interactive interpreter. in its showtraceback method, to match the built in interactive interpreter.
......
...@@ -357,6 +357,11 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, ...@@ -357,6 +357,11 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
memset(state, 0, sizeof(SRE_STATE)); memset(state, 0, sizeof(SRE_STATE));
state->mark = PyMem_New(void *, pattern->groups * 2);
if (!state->mark) {
PyErr_NoMemory();
goto err;
}
state->lastmark = -1; state->lastmark = -1;
state->lastindex = -1; state->lastindex = -1;
...@@ -409,6 +414,8 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, ...@@ -409,6 +414,8 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
return string; return string;
err: err:
PyMem_Del(state->mark);
state->mark = NULL;
if (state->buffer.buf) if (state->buffer.buf)
PyBuffer_Release(&state->buffer); PyBuffer_Release(&state->buffer);
return NULL; return NULL;
...@@ -421,6 +428,8 @@ state_fini(SRE_STATE* state) ...@@ -421,6 +428,8 @@ state_fini(SRE_STATE* state)
PyBuffer_Release(&state->buffer); PyBuffer_Release(&state->buffer);
Py_XDECREF(state->string); Py_XDECREF(state->string);
data_stack_dealloc(state); data_stack_dealloc(state);
PyMem_Del(state->mark);
state->mark = NULL;
} }
/* calculate offset from start of string */ /* calculate offset from start of string */
...@@ -560,6 +569,7 @@ pattern_match(PatternObject *self, PyObject *args, PyObject *kwargs) ...@@ -560,6 +569,7 @@ pattern_match(PatternObject *self, PyObject *args, PyObject *kwargs)
PyObject *pattern = NULL; PyObject *pattern = NULL;
SRE_STATE state; SRE_STATE state;
Py_ssize_t status; Py_ssize_t status;
PyObject *match;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, if (!PyArg_ParseTupleAndKeywords(args, kwargs,
"|Onn$O:match", _keywords, "|Onn$O:match", _keywords,
...@@ -579,12 +589,14 @@ pattern_match(PatternObject *self, PyObject *args, PyObject *kwargs) ...@@ -579,12 +589,14 @@ pattern_match(PatternObject *self, PyObject *args, PyObject *kwargs)
status = sre_match(&state, PatternObject_GetCode(self), 0); status = sre_match(&state, PatternObject_GetCode(self), 0);
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
if (PyErr_Occurred()) if (PyErr_Occurred()) {
state_fini(&state);
return NULL; return NULL;
}
match = pattern_new_match(self, &state, status);
state_fini(&state); state_fini(&state);
return match;
return (PyObject *)pattern_new_match(self, &state, status);
} }
static PyObject* static PyObject*
...@@ -592,6 +604,7 @@ pattern_fullmatch(PatternObject* self, PyObject* args, PyObject* kw) ...@@ -592,6 +604,7 @@ pattern_fullmatch(PatternObject* self, PyObject* args, PyObject* kw)
{ {
SRE_STATE state; SRE_STATE state;
Py_ssize_t status; Py_ssize_t status;
PyObject *match;
PyObject *string = NULL, *string2 = NULL; PyObject *string = NULL, *string2 = NULL;
Py_ssize_t start = 0; Py_ssize_t start = 0;
...@@ -616,12 +629,14 @@ pattern_fullmatch(PatternObject* self, PyObject* args, PyObject* kw) ...@@ -616,12 +629,14 @@ pattern_fullmatch(PatternObject* self, PyObject* args, PyObject* kw)
status = sre_match(&state, PatternObject_GetCode(self), 1); status = sre_match(&state, PatternObject_GetCode(self), 1);
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
if (PyErr_Occurred()) if (PyErr_Occurred()) {
state_fini(&state);
return NULL; return NULL;
}
match = pattern_new_match(self, &state, status);
state_fini(&state); state_fini(&state);
return match;
return pattern_new_match(self, &state, status);
} }
static PyObject* static PyObject*
...@@ -629,6 +644,7 @@ pattern_search(PatternObject* self, PyObject* args, PyObject* kw) ...@@ -629,6 +644,7 @@ pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
{ {
SRE_STATE state; SRE_STATE state;
Py_ssize_t status; Py_ssize_t status;
PyObject *match;
PyObject *string = NULL, *string2 = NULL; PyObject *string = NULL, *string2 = NULL;
Py_ssize_t start = 0; Py_ssize_t start = 0;
...@@ -652,12 +668,14 @@ pattern_search(PatternObject* self, PyObject* args, PyObject* kw) ...@@ -652,12 +668,14 @@ pattern_search(PatternObject* self, PyObject* args, PyObject* kw)
TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr)); TRACE(("|%p|%p|END\n", PatternObject_GetCode(self), state.ptr));
state_fini(&state); if (PyErr_Occurred()) {
state_fini(&state);
if (PyErr_Occurred())
return NULL; return NULL;
}
return pattern_new_match(self, &state, status); match = pattern_new_match(self, &state, status);
state_fini(&state);
return match;
} }
static PyObject* static PyObject*
...@@ -1417,7 +1435,7 @@ _compile(PyObject* self_, PyObject* args) ...@@ -1417,7 +1435,7 @@ _compile(PyObject* self_, PyObject* args)
PyObject* groupindex = NULL; PyObject* groupindex = NULL;
PyObject* indexgroup = NULL; PyObject* indexgroup = NULL;
if (!PyArg_ParseTuple(args, "OiO!|nOO", &pattern, &flags, if (!PyArg_ParseTuple(args, "OiO!nOO", &pattern, &flags,
&PyList_Type, &code, &groups, &PyList_Type, &code, &groups,
&groupindex, &indexgroup)) &groupindex, &indexgroup))
return NULL; return NULL;
...@@ -1933,10 +1951,9 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) ...@@ -1933,10 +1951,9 @@ _validate_inner(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
static int static int
_validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups) _validate_outer(SRE_CODE *code, SRE_CODE *end, Py_ssize_t groups)
{ {
if (groups < 0 || groups > 100 || code >= end || end[-1] != SRE_OP_SUCCESS) if (groups < 0 || (size_t)groups > SRE_MAXGROUPS ||
code >= end || end[-1] != SRE_OP_SUCCESS)
FAIL; FAIL;
if (groups == 0) /* fix for simplejson */
groups = 100; /* 100 groups should always be safe */
return _validate_inner(code, end-1, groups); return _validate_inner(code, end-1, groups);
} }
...@@ -2747,6 +2764,12 @@ PyMODINIT_FUNC PyInit__sre(void) ...@@ -2747,6 +2764,12 @@ PyMODINIT_FUNC PyInit__sre(void)
Py_DECREF(x); Py_DECREF(x);
} }
x = PyLong_FromUnsignedLong(SRE_MAXGROUPS);
if (x) {
PyDict_SetItemString(d, "MAXGROUPS", x);
Py_DECREF(x);
}
x = PyUnicode_FromString(copyright); x = PyUnicode_FromString(copyright);
if (x) { if (x) {
PyDict_SetItemString(d, "copyright", x); PyDict_SetItemString(d, "copyright", x);
......
...@@ -18,8 +18,10 @@ ...@@ -18,8 +18,10 @@
#define SRE_CODE Py_UCS4 #define SRE_CODE Py_UCS4
#if SIZEOF_SIZE_T > 4 #if SIZEOF_SIZE_T > 4
# define SRE_MAXREPEAT (~(SRE_CODE)0) # define SRE_MAXREPEAT (~(SRE_CODE)0)
# define SRE_MAXGROUPS ((~(SRE_CODE)0) / 2)
#else #else
# define SRE_MAXREPEAT ((SRE_CODE)PY_SSIZE_T_MAX) # define SRE_MAXREPEAT ((SRE_CODE)PY_SSIZE_T_MAX)
# define SRE_MAXGROUPS ((SRE_CODE)PY_SSIZE_T_MAX / SIZEOF_SIZE_T / 2)
#endif #endif
typedef struct { typedef struct {
...@@ -52,9 +54,6 @@ typedef struct { ...@@ -52,9 +54,6 @@ typedef struct {
typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch); typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch);
/* FIXME: <fl> shouldn't be a constant, really... */
#define SRE_MARK_SIZE 200
typedef struct SRE_REPEAT_T { typedef struct SRE_REPEAT_T {
Py_ssize_t count; Py_ssize_t count;
SRE_CODE* pattern; /* points to REPEAT operator arguments */ SRE_CODE* pattern; /* points to REPEAT operator arguments */
...@@ -76,7 +75,7 @@ typedef struct { ...@@ -76,7 +75,7 @@ typedef struct {
/* registers */ /* registers */
Py_ssize_t lastindex; Py_ssize_t lastindex;
Py_ssize_t lastmark; Py_ssize_t lastmark;
void* mark[SRE_MARK_SIZE]; void** mark;
/* dynamically allocated stuff */ /* dynamically allocated stuff */
char* data_stack; char* data_stack;
size_t data_stack_size; size_t data_stack_size;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment