Commit c9325e19 authored by Serhiy Storchaka's avatar Serhiy Storchaka

Issue #17381: Fixed ranges handling in case-insensitive regular expressions.

parent 8e8308c4
...@@ -21,9 +21,6 @@ if _sre.CODESIZE == 2: ...@@ -21,9 +21,6 @@ if _sre.CODESIZE == 2:
else: else:
MAXCODE = 0xFFFFFFFFL MAXCODE = 0xFFFFFFFFL
def _identityfunction(x):
return x
_LITERAL_CODES = set([LITERAL, NOT_LITERAL]) _LITERAL_CODES = set([LITERAL, NOT_LITERAL])
_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT]) _REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
_SUCCESS_CODES = set([SUCCESS, FAILURE]) _SUCCESS_CODES = set([SUCCESS, FAILURE])
...@@ -52,7 +49,7 @@ def _compile(code, pattern, flags): ...@@ -52,7 +49,7 @@ def _compile(code, pattern, flags):
return _sre.getlower(literal, flags) return _sre.getlower(literal, flags)
else: else:
emit(OPCODES[op]) emit(OPCODES[op])
fixup = _identityfunction fixup = None
skip = _len(code); emit(0) skip = _len(code); emit(0)
_compile_charset(av, flags, code, fixup) _compile_charset(av, flags, code, fixup)
code[skip] = _len(code) - skip code[skip] = _len(code) - skip
...@@ -178,17 +175,15 @@ def _compile(code, pattern, flags): ...@@ -178,17 +175,15 @@ def _compile(code, pattern, flags):
def _compile_charset(charset, flags, code, fixup=None): def _compile_charset(charset, flags, code, fixup=None):
# compile charset subprogram # compile charset subprogram
emit = code.append emit = code.append
if fixup is None: for op, av in _optimize_charset(charset, fixup, flags & SRE_FLAG_UNICODE):
fixup = _identityfunction
for op, av in _optimize_charset(charset, fixup):
emit(OPCODES[op]) emit(OPCODES[op])
if op is NEGATE: if op is NEGATE:
pass pass
elif op is LITERAL: elif op is LITERAL:
emit(fixup(av)) emit(av)
elif op is RANGE: elif op is RANGE:
emit(fixup(av[0])) emit(av[0])
emit(fixup(av[1])) emit(av[1])
elif op is CHARSET: elif op is CHARSET:
code.extend(av) code.extend(av)
elif op is BIGCHARSET: elif op is BIGCHARSET:
...@@ -204,7 +199,7 @@ def _compile_charset(charset, flags, code, fixup=None): ...@@ -204,7 +199,7 @@ def _compile_charset(charset, flags, code, fixup=None):
raise error, "internal: unsupported set operator" raise error, "internal: unsupported set operator"
emit(OPCODES[FAILURE]) emit(OPCODES[FAILURE])
def _optimize_charset(charset, fixup): def _optimize_charset(charset, fixup, isunicode):
# internal: optimize character set # internal: optimize character set
out = [] out = []
tail = [] tail = []
...@@ -213,9 +208,15 @@ def _optimize_charset(charset, fixup): ...@@ -213,9 +208,15 @@ def _optimize_charset(charset, fixup):
while True: while True:
try: try:
if op is LITERAL: if op is LITERAL:
charmap[fixup(av)] = 1 i = av
if fixup:
i = fixup(i)
charmap[i] = 1
elif op is RANGE: elif op is RANGE:
for i in range(fixup(av[0]), fixup(av[1])+1): r = range(av[0], av[1]+1)
if fixup:
r = map(fixup, r)
for i in r:
charmap[i] = 1 charmap[i] = 1
elif op is NEGATE: elif op is NEGATE:
out.append((op, av)) out.append((op, av))
...@@ -227,6 +228,19 @@ def _optimize_charset(charset, fixup): ...@@ -227,6 +228,19 @@ def _optimize_charset(charset, fixup):
charmap += b'\0' * 0xff00 charmap += b'\0' * 0xff00
continue continue
# character set contains non-BMP character codes # character set contains non-BMP character codes
if fixup and isunicode and op is RANGE:
lo, hi = av
ranges = [av]
# There are only two ranges of cased astral characters:
# 10400-1044F (Deseret) and 118A0-118DF (Warang Citi).
_fixup_range(max(0x10000, lo), min(0x11fff, hi),
ranges, fixup)
for lo, hi in ranges:
if lo == hi:
tail.append((LITERAL, hi))
else:
tail.append((RANGE, (lo, hi)))
else:
tail.append((op, av)) tail.append((op, av))
break break
...@@ -253,8 +267,10 @@ def _optimize_charset(charset, fixup): ...@@ -253,8 +267,10 @@ def _optimize_charset(charset, fixup):
else: else:
out.append((RANGE, (p, q - 1))) out.append((RANGE, (p, q - 1)))
out += tail out += tail
if len(out) < len(charset): # if the case was changed or new representation is more compact
if fixup or len(out) < len(charset):
return out return out
# else original character set is good enough
return charset return charset
# use bitmap # use bitmap
...@@ -307,6 +323,24 @@ def _optimize_charset(charset, fixup): ...@@ -307,6 +323,24 @@ def _optimize_charset(charset, fixup):
out += tail out += tail
return out return out
def _fixup_range(lo, hi, ranges, fixup):
for i in map(fixup, range(lo, hi+1)):
for k, (lo, hi) in enumerate(ranges):
if i < lo:
if l == lo - 1:
ranges[k] = (i, hi)
else:
ranges.insert(k, (i, i))
break
elif i > hi:
if i == hi + 1:
ranges[k] = (lo, i)
break
else:
break
else:
ranges.append((i, i))
_CODEBITS = _sre.CODESIZE * 8 _CODEBITS = _sre.CODESIZE * 8
_BITS_TRANS = b'0' + b'1' * 255 _BITS_TRANS = b'0' + b'1' * 255
def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int): def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
......
...@@ -474,6 +474,43 @@ class ReTests(unittest.TestCase): ...@@ -474,6 +474,43 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a") self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa") self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
def test_ignore_case_range(self):
# Issues #3511, #17381.
self.assertTrue(re.match(r'[9-a]', '_', re.I))
self.assertIsNone(re.match(r'[9-A]', '_', re.I))
self.assertTrue(re.match(r'[\xc0-\xde]', '\xd7', re.I))
self.assertIsNone(re.match(r'[\xc0-\xde]', '\xf7', re.I))
self.assertTrue(re.match(r'[\xe0-\xfe]', '\xf7',re.I))
self.assertIsNone(re.match(r'[\xe0-\xfe]', '\xd7', re.I))
if have_unicode:
self.assertTrue(re.match(u(r'[9-a]'), u(r'_'), re.U | re.I))
self.assertIsNone(re.match(u(r'[9-A]'), u(r'_'), re.U | re.I))
self.assertTrue(re.match(u(r'[\xc0-\xde]'),
u(r'\xd7'), re.U | re.I))
self.assertIsNone(re.match(u(r'[\xc0-\xde]'),
u(r'\xf7'), re.U | re.I))
self.assertTrue(re.match(u(r'[\xe0-\xfe]'),
u(r'\xf7'), re.U | re.I))
self.assertIsNone(re.match(u(r'[\xe0-\xfe]'),
u(r'\xd7'), re.U | re.I))
self.assertTrue(re.match(u(r'[\u0430-\u045f]'),
u(r'\u0450'), re.U | re.I))
self.assertTrue(re.match(u(r'[\u0430-\u045f]'),
u(r'\u0400'), re.U | re.I))
self.assertTrue(re.match(u(r'[\u0400-\u042f]'),
u(r'\u0450'), re.U | re.I))
self.assertTrue(re.match(u(r'[\u0400-\u042f]'),
u(r'\u0400'), re.U | re.I))
if sys.maxunicode > 0xffff:
self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'),
u(r'\U00010428'), re.U | re.I))
self.assertTrue(re.match(u(r'[\U00010428-\U0001044f]'),
u(r'\U00010400'), re.U | re.I))
self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'),
u(r'\U00010428'), re.U | re.I))
self.assertTrue(re.match(u(r'[\U00010400-\U00010427]'),
u(r'\U00010400'), re.U | re.I))
def test_category(self): def test_category(self):
self.assertEqual(re.match(r"(\s)", " ").group(1), " ") self.assertEqual(re.match(r"(\s)", " ").group(1), " ")
......
...@@ -37,6 +37,8 @@ Core and Builtins ...@@ -37,6 +37,8 @@ Core and Builtins
Library Library
------- -------
- Issue #17381: Fixed ranges handling in case-insensitive regular expressions.
- Issue #19329: Optimized compiling charsets in regular expressions. - Issue #19329: Optimized compiling charsets in regular expressions.
- Issue #22410: Module level functions in the re module now cache compiled - Issue #22410: Module level functions in the re module now cache compiled
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment