Commit 2ea88051, authored by YOU and committed by Dylan Trotter

fix several bugs on re module (#122)

parent 08f9c2ed
......@@ -45,7 +45,8 @@ def compile(pattern, flags, code, groups=0, groupindex={}, indexgroup=[None]):
def getlower(char_ord, flags):
    """Return the ordinal of the lowercase form of character ``char_ord``.

    The character is lowercased when it is ASCII (``char_ord < 128``), when
    ``flags`` has SRE_FLAG_UNICODE set, or when ``flags`` has SRE_FLAG_LOCALE
    set and ``char_ord < 256``. In every other case ``char_ord`` is returned
    unchanged.
    """
    if (char_ord < 128) or (flags & SRE_FLAG_UNICODE) \
            or (flags & SRE_FLAG_LOCALE and char_ord < 256):
        # chr() replaces the earlier unichr() call.
        # NOTE(review): on Python 2 chr() only accepts 0..255, so an ordinal
        # >= 256 that gets here through SRE_FLAG_UNICODE would raise
        # ValueError -- confirm this is acceptable for the target runtime.
        return ord(chr(char_ord).lower())
    return char_ord
......
......@@ -194,8 +194,10 @@ def compile(pattern, flags=0):
def purge():
    """Clear the regular expression cache.

    Both module-level caches (``_cache`` and ``_cache_repl``) are rebound to
    fresh empty dicts. The previous dict objects are not mutated, so any
    reference held elsewhere keeps its old contents.
    """
    # The dict.clear() calls were commented out in favour of rebinding the
    # globals, so only the rebinding is kept here.
    module_globals = globals()
    module_globals['_cache'] = {}
    module_globals['_cache_repl'] = {}
def template(pattern, flags=0):
"Compile a template pattern, returning a pattern object"
......@@ -250,7 +252,8 @@ def _compile(*key):
raise error, v # invalid expression
if not bypass_cache:
if len(_cache) >= _MAXCACHE:
_cache.clear()
# _cache.clear()
globals()['_cache'] = {}
if p.flags & LOCALE:
if not _locale:
return p
......@@ -271,7 +274,8 @@ def _compile_repl(*key):
except error, v:
raise error, v # invalid expression
if len(_cache_repl) >= _MAXCACHE:
_cache_repl.clear()
# _cache_repl.clear()
globals()['_cache_repl'] = {}
_cache_repl[key] = p
return p
......
This diff is collapsed.
......@@ -11,7 +11,8 @@
"""Internal support module for sre"""
import _sre, sys
import sys
import _sre
import sre_parse
# TODO: Support from foo import * syntax.
......@@ -262,11 +263,9 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
def _optimize_charset(charset, fixup, fixes, isunicode):
# internal: optimize character set
out = []
outappend = out.append
tail = []
# charmap = bytearray(256)
charmap = [0] * 256
for op, av in charset:
while True:
try:
......@@ -319,85 +318,50 @@ def _optimize_charset(charset, fixup, fixes, isunicode):
break
# compress character map
i = p = n = 0
runs = []
runsappend = runs.append
for c in charmap:
if c:
if n == 0:
p = i
n = n + 1
elif n:
runsappend((p, n))
n = 0
i = i + 1
if n:
runsappend((p, n))
if len(runs) <= 2:
q = 0
def char_find(l, s, start):
i = start
while i < len(l):
if l[i] == s:
return i
i += 1
return -1
while True:
# p = charmap.find(b'\1', q)
p = char_find(charmap, 1, q)
if p < 0:
break
if len(runs) >= 2:
runs = None
break
# q = charmap.find(b'\0', p)
q = char_find(charmap, 0, p)
if q < 0:
runs.append((p, len(charmap)))
break
runs.append((p, q))
if runs is not None:
# use literal/range
for p, n in runs:
if n == 1:
outappend((LITERAL, p))
for p, q in runs:
if q - p == 1:
out.append((LITERAL, p))
else:
outappend((RANGE, (p, p + n - 1)))
if len(out) < len(charset):
out.append((RANGE, (p, q - 1)))
out += tail
# if the case was changed or new representation is more compact
if fixup or len(out) < len(charset):
return out
else:
# use bitmap
# else original character set is good enough
return charset
# use bitmap
if len(charmap) == 256:
data = _mk_bitmap(charmap)
outappend((CHARSET, data))
out.append((CHARSET, data))
out += tail
return out
return charset
# runs = []
# q = 0
# while True:
# p = charmap.find(b'\1', q)
# if p < 0:
# break
# if len(runs) >= 2:
# runs = None
# break
# q = charmap.find(b'\0', p)
# if q < 0:
# runs.append((p, len(charmap)))
# break
# runs.append((p, q))
# if runs is not None:
# # use literal/range
# for p, q in runs:
# if q - p == 1:
# out.append((LITERAL, p))
# else:
# out.append((RANGE, (p, q - 1)))
# out += tail
# # if the case was changed or new representation is more compact
# if fixup or len(out) < len(charset):
# return out
# # else original character set is good enough
# return charset
# # use bitmap
# if len(charmap) == 256:
# data = _mk_bitmap(charmap)
# out.append((CHARSET, data))
# out += tail
# return out
def _mk_bitmap(bits):
    """Pack a sequence of truthy/falsy flags into a list of integer words.

    Flags are consumed in order; within a word the first flag lands in the
    least significant bit. A word is emitted as soon as its bit mask grows
    past MAXCODE. Flags left in a partially filled final word are not
    emitted.
    """
    data = []
    # Both branches of the former _sre.CODESIZE test used the same (1, 0)
    # starting pair, so the test is dropped.
    mask, word = 1, 0
    for bit in bits:
        if bit:
            word += mask
        mask += mask
        if mask > MAXCODE:
            data.append(word)
            mask, word = 1, 0
    return data
# To represent a big charset, first a bitmap of all characters in the
# set is constructed. Then, this bitmap is sliced into chunks of 256
# characters, duplicate chunks are eliminated, and each chunk is
......@@ -422,7 +386,8 @@ def _mk_bitmap(bits):
# of the basic multilingual plane; an efficient representation
# for all of Unicode has not yet been developed.
charmap = bytes(charmap) # should be hashable
# charmap = bytes(charmap) # should be hashable
charmap = str(charmap) # should be hashable
comps = {}
# mapping = bytearray(256)
mapping = [0] * 256
......@@ -465,8 +430,26 @@ _CODEBITS = _sre.CODESIZE * 8
_BITS_TRANS = b'0' + b'1' * 255
# def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
# s = bytes(bits).translate(_BITS_TRANS)[::-1]
# return [_int(s[i - _CODEBITS: i], 2)
# r = [_int(s[i - _CODEBITS: i], 2)
# for i in range(len(s), 0, -_CODEBITS)]
# return r
def _mk_bitmap(bits):
    """Pack a sequence of truthy/falsy flags into a list of integer words.

    Flags are consumed in order; within a word the first flag lands in the
    least significant bit. A word is emitted as soon as its bit mask grows
    past MAXCODE. Flags left in a partially filled final word are not
    emitted.
    """
    data = []
    # Every word starts from mask 1 and value 0.
    mask, word = 1, 0
    for bit in bits:
        if bit:
            word += mask
        mask += mask
        if mask > MAXCODE:
            data.append(word)
            mask, word = 1, 0
    return data
def _bytes_to_codes(b):
    """Return ``b[:]``, a full-slice copy of the sequence ``b``."""
    return b[:]
......
......@@ -31,13 +31,15 @@ __all__ = [
'SRE_FLAG_LOCALE', 'SRE_FLAG_MULTILINE', 'SRE_FLAG_TEMPLATE',
'SRE_FLAG_UNICODE', 'SRE_FLAG_VERBOSE', 'SRE_INFO_CHARSET',
'SRE_INFO_LITERAL', 'SRE_INFO_PREFIX', 'SUBPATTERN', 'SUCCESS',
'SRE_FLAG_DEBUG', 'error'
'SRE_FLAG_DEBUG', 'MAXCODE', 'error'
]
# update when constants are added or removed
MAGIC = 20031017
MAXCODE = 65535
# try:
# from _sre import MAXREPEAT
# except ImportError:
......
......@@ -143,7 +143,8 @@ class SubPattern(object):
def __len__(self):
return len(self.data)
def __delitem__(self, index):
del self.data[index]
# del self.data[index]
self.data = self.data[:index] + self.data[index+1:]
def __getitem__(self, index):
if isinstance(index, slice):
return SubPattern(self.pattern, self.data[index])
......@@ -345,7 +346,7 @@ def _parse_sub(source, state, nested=1):
# check if all items share a common prefix
while 1:
prefix = None
prefix, common = None, False
for item in items:
if not item:
break
......@@ -356,10 +357,16 @@ def _parse_sub(source, state, nested=1):
else:
# all subitems start with a common "prefix".
# move it out of the branch
for item in items:
del item[0]
# for item in items:
# print "del", item[0], items
# del item[0]
for i in range(len(items)):
items[i] = items[i][1:]
subpatternappend(prefix)
continue # check next one
# continue # check next one
common = True
if common:
continue
break
# check if the branch can be replaced by a character set
......@@ -589,7 +596,8 @@ def _parse(source, state):
"%r" % name)
gid = state.groupdict.get(name)
if gid is None:
msg = "unknown group name: {0!r}".format(name)
# msg = "unknown group name: {0!r}".format(name)
msg = "unknown group name: %s" % (name)
raise error(msg)
# if state.lookbehind:
# import warnings
......@@ -651,7 +659,8 @@ def _parse(source, state):
if isname(condname):
condgroup = state.groupdict.get(condname)
if condgroup is None:
msg = "unknown group name: {0!r}".format(condname)
# msg = "unknown group name: {0!r}".format(condname)
msg = "unknown group name: %s" % (condname)
raise error(msg)
else:
try:
......@@ -783,7 +792,8 @@ def parse_template(source, pattern):
try:
index = pattern.groupindex[name]
except KeyError:
msg = "unknown group name: {0!r}".format(name)
# msg = "unknown group name: {0!r}".format(name)
msg = "unknown group name: %s" % (name)
raise IndexError(msg)
a((MARK, index))
elif c == "0":
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment