Commit 6d336a02, authored by Serhiy Storchaka, committed by GitHub

bpo-30285: Optimize case-insensitive matching and searching (#1482)

of regular expressions.
parent f93234bb
......@@ -208,6 +208,10 @@ Optimizations
using the :func:`os.scandir` function.
(Contributed by Serhiy Storchaka in :issue:`25996`.)
* Optimized case-insensitive matching and searching of :mod:`regular
expressions <re>`. Searching some patterns can now be up to 20 times faster.
(Contributed by Serhiy Storchaka in :issue:`30285`.)
Build and C API Changes
=======================
......
This diff is collapsed.
......@@ -891,15 +891,24 @@ class ReTests(unittest.TestCase):
lo = ord(c.lower())
self.assertEqual(_sre.ascii_tolower(i), lo)
self.assertEqual(_sre.unicode_tolower(i), lo)
iscased = c in string.ascii_letters
self.assertEqual(_sre.ascii_iscased(i), iscased)
self.assertEqual(_sre.unicode_iscased(i), iscased)
for i in list(range(128, 0x1000)) + [0x10400, 0x10428]:
c = chr(i)
self.assertEqual(_sre.ascii_tolower(i), i)
if i != 0x0130:
self.assertEqual(_sre.unicode_tolower(i), ord(c.lower()))
iscased = c != c.lower() or c != c.upper()
self.assertFalse(_sre.ascii_iscased(i))
self.assertEqual(_sre.unicode_iscased(i),
c != c.lower() or c != c.upper())
self.assertEqual(_sre.ascii_tolower(0x0130), 0x0130)
self.assertEqual(_sre.unicode_tolower(0x0130), ord('i'))
self.assertFalse(_sre.ascii_iscased(0x0130))
self.assertTrue(_sre.unicode_iscased(0x0130))
def test_not_literal(self):
self.assertEqual(re.search(r"\s([^a])", " b").group(1), "b")
......
......@@ -320,6 +320,9 @@ Extension Modules
Library
-------
- bpo-30285: Optimized case-insensitive matching and searching of regular
expressions.
- bpo-29990: Fix range checking in GB18030 decoder. Original patch by Ma Lin.
- bpo-29979: rewrite cgi.parse_multipart, reusing the FieldStorage class and
......
......@@ -273,6 +273,38 @@ _sre_getcodesize_impl(PyObject *module)
return sizeof(SRE_CODE);
}
/*[clinic input]
_sre.ascii_iscased -> bool
character: int
/
[clinic start generated code]*/
static int
_sre_ascii_iscased_impl(PyObject *module, int character)
/*[clinic end generated code: output=4f454b630fbd19a2 input=9f0bd952812c7ed3]*/
{
    /* Report whether sre_lower() or sre_upper() maps the code to a
       different value.  Returns 1 if either mapping changes it, else 0. */
    const unsigned int code = (unsigned int)character;

    if (sre_lower(code) != code) {
        return 1;
    }
    return sre_upper(code) != code;
}
/*[clinic input]
_sre.unicode_iscased -> bool
character: int
/
[clinic start generated code]*/
static int
_sre_unicode_iscased_impl(PyObject *module, int character)
/*[clinic end generated code: output=9c5ddee0dc2bc258 input=51e42c3b8dddb78e]*/
{
    /* Report whether sre_lower_unicode() or sre_upper_unicode() maps the
       code to a different value.  Returns 1 if either mapping changes it,
       else 0. */
    const unsigned int code = (unsigned int)character;

    if (sre_lower_unicode(code) != code) {
        return 1;
    }
    return sre_upper_unicode(code) != code;
}
/*[clinic input]
_sre.ascii_tolower -> int
......@@ -2750,6 +2782,8 @@ static PyTypeObject Scanner_Type = {
static PyMethodDef _functions[] = {
_SRE_COMPILE_METHODDEF
_SRE_GETCODESIZE_METHODDEF
_SRE_ASCII_ISCASED_METHODDEF
_SRE_UNICODE_ISCASED_METHODDEF
_SRE_ASCII_TOLOWER_METHODDEF
_SRE_UNICODE_TOLOWER_METHODDEF
{NULL, NULL}
......
......@@ -29,6 +29,68 @@ exit:
return return_value;
}
/* NOTE(review): this section appears to be Argument Clinic output (see the
   "clinic end generated code" checksum at the end of the file); hand edits
   may be lost or rejected on regeneration -- confirm before committing. */

/* Docstring for _sre.ascii_iscased: carries only the signature line,
   with no descriptive text. */
PyDoc_STRVAR(_sre_ascii_iscased__doc__,
"ascii_iscased($module, character, /)\n"
"--\n"
"\n");
/* Method-table entry exposing the wrapper as "ascii_iscased"; METH_O means
   the wrapper receives exactly one positional argument object. */
#define _SRE_ASCII_ISCASED_METHODDEF \
{"ascii_iscased", (PyCFunction)_sre_ascii_iscased, METH_O, _sre_ascii_iscased__doc__},
/* Forward declaration of the implementation, which is defined elsewhere. */
static int
_sre_ascii_iscased_impl(PyObject *module, int character);
/* Python-callable wrapper: converts `arg` to a C int, calls the
   implementation, and returns its result as a Python bool.
   Returns NULL (with an exception set) if argument conversion fails or the
   implementation signals an error. */
static PyObject *
_sre_ascii_iscased(PyObject *module, PyObject *arg)
{
PyObject *return_value = NULL;
int character;
int _return_value;
/* "i" converts the argument to a C int; the text after ':' is the
   function name used in error messages. */
if (!PyArg_Parse(arg, "i:ascii_iscased", &character)) {
goto exit;
}
_return_value = _sre_ascii_iscased_impl(module, character);
/* -1 counts as an error only when an exception is actually pending. */
if ((_return_value == -1) && PyErr_Occurred()) {
goto exit;
}
return_value = PyBool_FromLong((long)_return_value);
exit:
return return_value;
}
/* NOTE(review): this section appears to be Argument Clinic output (see the
   "clinic end generated code" checksum at the end of the file); hand edits
   may be lost or rejected on regeneration -- confirm before committing. */

/* Docstring for _sre.unicode_iscased: carries only the signature line,
   with no descriptive text. */
PyDoc_STRVAR(_sre_unicode_iscased__doc__,
"unicode_iscased($module, character, /)\n"
"--\n"
"\n");
/* Method-table entry exposing the wrapper as "unicode_iscased"; METH_O means
   the wrapper receives exactly one positional argument object. */
#define _SRE_UNICODE_ISCASED_METHODDEF \
{"unicode_iscased", (PyCFunction)_sre_unicode_iscased, METH_O, _sre_unicode_iscased__doc__},
/* Forward declaration of the implementation, which is defined elsewhere. */
static int
_sre_unicode_iscased_impl(PyObject *module, int character);
/* Python-callable wrapper: converts `arg` to a C int, calls the
   implementation, and returns its result as a Python bool.
   Returns NULL (with an exception set) if argument conversion fails or the
   implementation signals an error. */
static PyObject *
_sre_unicode_iscased(PyObject *module, PyObject *arg)
{
PyObject *return_value = NULL;
int character;
int _return_value;
/* "i" converts the argument to a C int; the text after ':' is the
   function name used in error messages. */
if (!PyArg_Parse(arg, "i:unicode_iscased", &character)) {
goto exit;
}
_return_value = _sre_unicode_iscased_impl(module, character);
/* -1 counts as an error only when an exception is actually pending. */
if ((_return_value == -1) && PyErr_Occurred()) {
goto exit;
}
return_value = PyBool_FromLong((long)_return_value);
exit:
return return_value;
}
PyDoc_STRVAR(_sre_ascii_tolower__doc__,
"ascii_tolower($module, character, /)\n"
"--\n"
......@@ -715,4 +777,4 @@ _sre_SRE_Scanner_search(ScannerObject *self, PyObject *Py_UNUSED(ignored))
{
return _sre_SRE_Scanner_search_impl(self);
}
/*[clinic end generated code: output=811e67d7f8f5052e input=a9049054013a1b77]*/
/*[clinic end generated code: output=5fe47c49e475cccb input=a9049054013a1b77]*/
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment