Commit c04fcd40 authored by Serhiy Storchaka

Backported the optimization of compiling charsets in regular expressions
(issue #19329).  This is needed to apply the patch from issue #17381.
parent 34d1597b
...@@ -207,149 +207,124 @@ def _compile_charset(charset, flags, code, fixup=None): ...@@ -207,149 +207,124 @@ def _compile_charset(charset, flags, code, fixup=None):
def _optimize_charset(charset, fixup): def _optimize_charset(charset, fixup):
# internal: optimize character set # internal: optimize character set
out = [] out = []
outappend = out.append tail = []
charmap = [0]*256 charmap = bytearray(256)
try:
for op, av in charset: for op, av in charset:
if op is NEGATE: while True:
outappend((op, av)) try:
elif op is LITERAL: if op is LITERAL:
charmap[fixup(av)] = 1 charmap[fixup(av)] = 1
elif op is RANGE: elif op is RANGE:
for i in range(fixup(av[0]), fixup(av[1])+1): for i in range(fixup(av[0]), fixup(av[1])+1):
charmap[i] = 1 charmap[i] = 1
elif op is CATEGORY: elif op is NEGATE:
# XXX: could append to charmap tail out.append((op, av))
return charset # cannot compress else:
tail.append((op, av))
except IndexError: except IndexError:
# character set contains unicode characters if len(charmap) == 256:
return _optimize_unicode(charset, fixup) # character set contains non-UCS1 character codes
charmap += b'\0' * 0xff00
continue
# character set contains non-BMP character codes
tail.append((op, av))
break
# compress character map # compress character map
i = p = n = 0
runs = [] runs = []
runsappend = runs.append q = 0
for c in charmap: while True:
if c: p = charmap.find(b'\1', q)
if n == 0: if p < 0:
p = i break
n = n + 1 if len(runs) >= 2:
elif n: runs = None
runsappend((p, n)) break
n = 0 q = charmap.find(b'\0', p)
i = i + 1 if q < 0:
if n: runs.append((p, len(charmap)))
runsappend((p, n)) break
if len(runs) <= 2: runs.append((p, q))
if runs is not None:
# use literal/range # use literal/range
for p, n in runs: for p, q in runs:
if n == 1: if q - p == 1:
outappend((LITERAL, p)) out.append((LITERAL, p))
else: else:
outappend((RANGE, (p, p+n-1))) out.append((RANGE, (p, q - 1)))
out += tail
if len(out) < len(charset): if len(out) < len(charset):
return out return out
else: return charset
# use bitmap # use bitmap
if len(charmap) == 256:
data = _mk_bitmap(charmap) data = _mk_bitmap(charmap)
outappend((CHARSET, data)) out.append((CHARSET, data))
out += tail
return out return out
return charset
def _mk_bitmap(bits): # To represent a big charset, first a bitmap of all characters in the
data = [] # set is constructed. Then, this bitmap is sliced into chunks of 256
dataappend = data.append # characters, duplicate chunks are eliminated, and each chunk is
if _sre.CODESIZE == 2: # given a number. In the compiled expression, the charset is
start = (1, 0) # represented by a 32-bit word sequence, consisting of one word for
else: # the number of different chunks, a sequence of 256 bytes (64 words)
start = (1L, 0L) # of chunk numbers indexed by their original chunk position, and a
m, v = start # sequence of 256-bit chunks (8 words each).
for c in bits:
if c: # Compression is normally good: in a typical charset, large ranges of
v = v + m # Unicode will be either completely excluded (e.g. if only cyrillic
m = m + m # letters are to be matched), or completely included (e.g. if large
if m > MAXCODE: # subranges of Kanji match). These ranges will be represented by
dataappend(v) # chunks of all one-bits or all zero-bits.
m, v = start
return data # Matching can be also done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# To represent a big charset, first a bitmap of all characters in the # less significant byte is a bit index in the chunk (just like the
# set is constructed. Then, this bitmap is sliced into chunks of 256 # CHARSET matching).
# characters, duplicate chunks are eliminated, and each chunk is
# given a number. In the compiled expression, the charset is # In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
# represented by a 32-bit word sequence, consisting of one word for # of the basic multilingual plane; an efficient representation
# the number of different chunks, a sequence of 256 bytes (64 words) # for all of Unicode has not yet been developed.
# of chunk numbers indexed by their original chunk position, and a
# sequence of 256-bit chunks (8 words each). charmap = bytes(charmap) # should be hashable
# Compression is normally good: in a typical charset, large ranges of
# Unicode will be either completely excluded (e.g. if only cyrillic
# letters are to be matched), or completely included (e.g. if large
# subranges of Kanji match). These ranges will be represented by
# chunks of all one-bits or all zero-bits.
# Matching can be also done efficiently: the more significant byte of
# the Unicode character is an index into the chunk number, and the
# less significant byte is a bit index in the chunk (just like the
# CHARSET matching).
# In UCS-4 mode, the BIGCHARSET opcode still supports only subsets
# of the basic multilingual plane; an efficient representation
# for all of Unicode has not yet been developed. This means,
# in particular, that negated charsets cannot be represented as
# bigcharsets.
def _optimize_unicode(charset, fixup):
try:
import array
except ImportError:
return charset
charmap = [0]*65536
negate = 0
try:
for op, av in charset:
if op is NEGATE:
negate = 1
elif op is LITERAL:
charmap[fixup(av)] = 1
elif op is RANGE:
for i in xrange(fixup(av[0]), fixup(av[1])+1):
charmap[i] = 1
elif op is CATEGORY:
# XXX: could expand category
return charset # cannot compress
except IndexError:
# non-BMP characters
return charset
if negate:
if sys.maxunicode != 65535:
# XXX: negation does not work with big charsets
return charset
for i in xrange(65536):
charmap[i] = not charmap[i]
comps = {} comps = {}
mapping = [0]*256 mapping = bytearray(256)
block = 0 block = 0
data = [] data = bytearray()
for i in xrange(256): for i in range(0, 65536, 256):
chunk = tuple(charmap[i*256:(i+1)*256]) chunk = charmap[i: i + 256]
new = comps.setdefault(chunk, block) if chunk in comps:
mapping[i] = new mapping[i // 256] = comps[chunk]
if new == block: else:
block = block + 1 mapping[i // 256] = comps[chunk] = block
data = data + _mk_bitmap(chunk) block += 1
header = [block] data += chunk
data = _mk_bitmap(data)
data[0:0] = [block] + _bytes_to_codes(mapping)
out.append((BIGCHARSET, data))
out += tail
return out
_CODEBITS = _sre.CODESIZE * 8
_BITS_TRANS = b'0' + b'1' * 255
def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
s = bytes(bits).translate(_BITS_TRANS)[::-1]
return [_int(s[i - _CODEBITS: i], 2)
for i in range(len(s), 0, -_CODEBITS)]
def _bytes_to_codes(b):
# Convert block indices to word array
import array
if _sre.CODESIZE == 2: if _sre.CODESIZE == 2:
code = 'H' code = 'H'
else: else:
code = 'I' code = 'I'
# Convert block indices to byte array of 256 bytes a = array.array(code, bytes(b))
mapping = array.array('B', mapping).tostring() assert a.itemsize == _sre.CODESIZE
# Convert byte array to word array assert len(a) * a.itemsize == len(b)
mapping = array.array(code, mapping) return a.tolist()
assert mapping.itemsize == _sre.CODESIZE
header = header + mapping.tolist()
data[0:0] = header
return [(BIGCHARSET, data)]
def _simple(av): def _simple(av):
# check if av is a "simple" operator # check if av is a "simple" operator
......
...@@ -37,6 +37,8 @@ Core and Builtins ...@@ -37,6 +37,8 @@ Core and Builtins
Library Library
------- -------
- Issue #19329: Optimized compiling charsets in regular expressions.
- Issue #22410: Module level functions in the re module now cache compiled - Issue #22410: Module level functions in the re module now cache compiled
locale-dependent regular expressions taking into account the locale. locale-dependent regular expressions taking into account the locale.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment