Commit 8c1a3565 authored by Fredrik Lundh

-- SRE 0.9.6 sync. this includes:

 + added "regs" attribute
 + fixed "pos" and "endpos" attributes
 + reset "lastindex" and "lastgroup" in scanner methods
 + removed (?P#id) syntax; the "lastindex" and "lastgroup"
   attributes are now always set
 + removed string module dependencies in sre_parse
 + better debugging support in sre_parse
 + various tweaks to build under 1.5.2
parent 902e1319
...@@ -10,9 +10,13 @@ ...@@ -10,9 +10,13 @@
# other compatibility work. # other compatibility work.
# #
# FIXME: change all FIXME's to XXX ;-)
import sre_compile import sre_compile
import sre_parse import sre_parse
import string
# flags # flags
I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE
L = LOCALE = sre_compile.SRE_FLAG_LOCALE L = LOCALE = sre_compile.SRE_FLAG_LOCALE
...@@ -53,6 +57,9 @@ def findall(pattern, string, maxsplit=0): ...@@ -53,6 +57,9 @@ def findall(pattern, string, maxsplit=0):
def compile(pattern, flags=0): def compile(pattern, flags=0):
return _compile(pattern, flags) return _compile(pattern, flags)
def purge():
_cache.clear()
def template(pattern, flags=0): def template(pattern, flags=0):
return _compile(pattern, flags|T) return _compile(pattern, flags|T)
...@@ -65,7 +72,7 @@ def escape(pattern): ...@@ -65,7 +72,7 @@ def escape(pattern):
s[i] = "\\000" s[i] = "\\000"
else: else:
s[i] = "\\" + c s[i] = "\\" + c
return pattern[:0].join(s) return _join(s, pattern)
# -------------------------------------------------------------------- # --------------------------------------------------------------------
# internals # internals
...@@ -73,10 +80,14 @@ def escape(pattern): ...@@ -73,10 +80,14 @@ def escape(pattern):
_cache = {} _cache = {}
_MAXCACHE = 100 _MAXCACHE = 100
def _join(seq, sep):
# internal: join into string having the same type as sep
return string.join(seq, sep[:0])
def _compile(pattern, flags=0): def _compile(pattern, flags=0):
# internal: compile pattern # internal: compile pattern
tp = type(pattern) tp = type(pattern)
if tp not in (type(""), type(u"")): if tp not in sre_compile.STRING_TYPES:
return pattern return pattern
key = (tp, pattern, flags) key = (tp, pattern, flags)
try: try:
...@@ -89,10 +100,6 @@ def _compile(pattern, flags=0): ...@@ -89,10 +100,6 @@ def _compile(pattern, flags=0):
_cache[key] = p _cache[key] = p
return p return p
def purge():
# clear pattern cache
_cache.clear()
def _sub(pattern, template, string, count=0): def _sub(pattern, template, string, count=0):
# internal: pattern.sub implementation hook # internal: pattern.sub implementation hook
return _subn(pattern, template, string, count)[0] return _subn(pattern, template, string, count)[0]
...@@ -120,7 +127,7 @@ def _subn(pattern, template, string, count=0): ...@@ -120,7 +127,7 @@ def _subn(pattern, template, string, count=0):
i = e i = e
n = n + 1 n = n + 1
append(string[i:]) append(string[i:])
return string[:0].join(s), n return _join(s, string[:0]), n
def _split(pattern, string, maxsplit=0): def _split(pattern, string, maxsplit=0):
# internal: pattern.split implementation hook # internal: pattern.split implementation hook
...@@ -161,11 +168,19 @@ copy_reg.pickle(type(_compile("")), _pickle, _compile) ...@@ -161,11 +168,19 @@ copy_reg.pickle(type(_compile("")), _pickle, _compile)
class Scanner: class Scanner:
def __init__(self, lexicon): def __init__(self, lexicon):
from sre_constants import BRANCH, SUBPATTERN, INDEX
self.lexicon = lexicon self.lexicon = lexicon
# combine phrases into a compound pattern
p = [] p = []
s = sre_parse.Pattern()
for phrase, action in lexicon: for phrase, action in lexicon:
p.append("(?:%s)(?P#%d)" % (phrase, len(p))) p.append(sre_parse.SubPattern(s, [
self.scanner = _compile("|".join(p)) (SUBPATTERN, (None, sre_parse.parse(phrase))),
(INDEX, len(p))
]))
p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
s.groups = len(p)
self.scanner = sre_compile.compile(p)
def scan(self, string): def scan(self, string):
result = [] result = []
append = result.append append = result.append
......
...@@ -197,10 +197,11 @@ def _compile(code, pattern, flags): ...@@ -197,10 +197,11 @@ def _compile(code, pattern, flags):
else: else:
emit(ATCODES[av]) emit(ATCODES[av])
elif op is BRANCH: elif op is BRANCH:
emit(OPCODES[op])
tail = [] tail = []
for av in av[1]: for av in av[1]:
emit(OPCODES[op])
skip = len(code); emit(0) skip = len(code); emit(0)
emit(MAXCODE) # save mark
_compile(code, av, flags) _compile(code, av, flags)
emit(OPCODES[JUMP]) emit(OPCODES[JUMP])
tail.append(len(code)); emit(0) tail.append(len(code)); emit(0)
...@@ -286,11 +287,18 @@ def _compile_info(code, pattern, flags): ...@@ -286,11 +287,18 @@ def _compile_info(code, pattern, flags):
emit(OPCODES[FAILURE]) emit(OPCODES[FAILURE])
code[skip] = len(code) - skip code[skip] = len(code) - skip
STRING_TYPES = [type("")]
try:
STRING_TYPES.append(type(unicode("")))
except NameError:
pass
def compile(p, flags=0): def compile(p, flags=0):
# internal: convert pattern list to internal format # internal: convert pattern list to internal format
# compile, as necessary # compile, as necessary
if type(p) in (type(""), type(u"")): if type(p) in STRING_TYPES:
import sre_parse import sre_parse
pattern = p pattern = p
p = sre_parse.parse(p, flags) p = sre_parse.parse(p, flags)
...@@ -308,6 +316,8 @@ def compile(p, flags=0): ...@@ -308,6 +316,8 @@ def compile(p, flags=0):
code.append(OPCODES[SUCCESS]) code.append(OPCODES[SUCCESS])
# print code
# FIXME: <fl> get rid of this limitation! # FIXME: <fl> get rid of this limitation!
assert p.pattern.groups <= 100,\ assert p.pattern.groups <= 100,\
"sorry, but this version only supports 100 named groups" "sorry, but this version only supports 100 named groups"
......
...@@ -172,7 +172,7 @@ CH_UNICODE = { ...@@ -172,7 +172,7 @@ CH_UNICODE = {
# flags # flags
SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking) SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking)
SRE_FLAG_IGNORECASE = 2 # case insensitive SRE_FLAG_IGNORECASE = 2 # case insensitive
SRE_FLAG_LOCALE = 4 # honor system locale SRE_FLAG_LOCALE = 4 # honour system locale
SRE_FLAG_MULTILINE = 8 # treat target as multiline string SRE_FLAG_MULTILINE = 8 # treat target as multiline string
SRE_FLAG_DOTALL = 16 # treat target as a single string SRE_FLAG_DOTALL = 16 # treat target as a single string
SRE_FLAG_UNICODE = 32 # use unicode locale SRE_FLAG_UNICODE = 32 # use unicode locale
......
...@@ -25,12 +25,12 @@ CHARMASK = 0xff ...@@ -25,12 +25,12 @@ CHARMASK = 0xff
SPECIAL_CHARS = ".\\[{()*+?^$|" SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{" REPEAT_CHARS = "*+?{"
DIGITS = tuple(string.digits) DIGITS = tuple("012345689")
OCTDIGITS = tuple("01234567") OCTDIGITS = tuple("01234567")
HEXDIGITS = tuple("0123456789abcdefABCDEF") HEXDIGITS = tuple("0123456789abcdefABCDEF")
WHITESPACE = tuple(string.whitespace) WHITESPACE = tuple(" \t\n\r\v\f")
ESCAPES = { ESCAPES = {
r"\a": (LITERAL, 7), r"\a": (LITERAL, 7),
...@@ -68,7 +68,8 @@ FLAGS = { ...@@ -68,7 +68,8 @@ FLAGS = {
"u": SRE_FLAG_UNICODE, "u": SRE_FLAG_UNICODE,
} }
class State: class Pattern:
# master pattern object. keeps track of global attributes
def __init__(self): def __init__(self):
self.flags = 0 self.flags = 0
self.groups = 1 self.groups = 1
...@@ -88,6 +89,33 @@ class SubPattern: ...@@ -88,6 +89,33 @@ class SubPattern:
data = [] data = []
self.data = data self.data = data
self.width = None self.width = None
def dump(self, level=0):
nl = 1
for op, av in self.data:
print level*" " + op,; nl = 0
if op == "in":
# member sublanguage
print; nl = 1
for op, a in av:
print (level+1)*" " + op, a
elif op == "branch":
print; nl = 1
i = 0
for a in av[1]:
if i > 0:
print level*" " + "or"
a.dump(level+1); nl = 1
i = i + 1
elif type(av) in (type(()), type([])):
for a in av:
if isinstance(a, SubPattern):
if not nl: print
a.dump(level+1); nl = 1
else:
print a, ; nl = 0
else:
print av, ; nl = 0
if not nl: print
def __repr__(self): def __repr__(self):
return repr(self.data) return repr(self.data)
def __len__(self): def __len__(self):
...@@ -255,10 +283,25 @@ def _escape(source, escape, state): ...@@ -255,10 +283,25 @@ def _escape(source, escape, state):
pass pass
raise error, "bogus escape: %s" % repr(escape) raise error, "bogus escape: %s" % repr(escape)
def _branch(pattern, items): def _parse_sub(source, state, nested=1):
# form a branch operator from a set of items # parse an alternation: a|b|c
subpattern = SubPattern(pattern) items = []
while 1:
items.append(_parse(source, state))
if source.match("|"):
continue
if not nested:
break
if not source.next or source.match(")"):
break
else:
raise error, "pattern not properly closed"
if len(items) == 1:
return items[0]
subpattern = SubPattern(state)
# check if all items share a common prefix # check if all items share a common prefix
while 1: while 1:
...@@ -285,7 +328,7 @@ def _branch(pattern, items): ...@@ -285,7 +328,7 @@ def _branch(pattern, items):
break break
else: else:
# we can store this as a character set instead of a # we can store this as a character set instead of a
# branch (FIXME: use a range if possible) # branch (the compiler may optimize this even more)
set = [] set = []
for item in items: for item in items:
set.append(item[0]) set.append(item[0])
...@@ -296,8 +339,7 @@ def _branch(pattern, items): ...@@ -296,8 +339,7 @@ def _branch(pattern, items):
return subpattern return subpattern
def _parse(source, state): def _parse(source, state):
# parse a simple pattern
# parse regular expression pattern into an operator list.
subpattern = SubPattern(state) subpattern = SubPattern(state)
...@@ -451,22 +493,6 @@ def _parse(source, state): ...@@ -451,22 +493,6 @@ def _parse(source, state):
if gid is None: if gid is None:
raise error, "unknown group name" raise error, "unknown group name"
subpattern.append((GROUPREF, gid)) subpattern.append((GROUPREF, gid))
elif source.match("#"):
index = ""
while 1:
char = source.get()
if char is None:
raise error, "unterminated index"
if char == ")":
break
index = index + char
try:
index = int(index)
if index < 0 or index > MAXREPEAT:
raise ValueError
except ValueError:
raise error, "illegal index"
subpattern.append((INDEX, index))
continue continue
else: else:
char = source.get() char = source.get()
...@@ -491,48 +517,27 @@ def _parse(source, state): ...@@ -491,48 +517,27 @@ def _parse(source, state):
raise error, "syntax error" raise error, "syntax error"
dir = -1 # lookbehind dir = -1 # lookbehind
char = source.get() char = source.get()
b = [] p = _parse_sub(source, state)
while 1: if char == "=":
p = _parse(source, state) subpattern.append((ASSERT, (dir, p)))
if source.next == ")": else:
if b: subpattern.append((ASSERT_NOT, (dir, p)))
b.append(p) continue
p = _branch(state, b)
if char == "=":
subpattern.append((ASSERT, (dir, p)))
else:
subpattern.append((ASSERT_NOT, (dir, p)))
break
elif source.match("|"):
b.append(p)
else:
raise error, "pattern not properly closed"
else: else:
# flags # flags
while FLAGS.has_key(source.next): while FLAGS.has_key(source.next):
state.flags = state.flags | FLAGS[source.get()] state.flags = state.flags | FLAGS[source.get()]
if group: if group:
# parse group contents # parse group contents
b = []
if group == 2: if group == 2:
# anonymous group # anonymous group
group = None group = None
else: else:
group = state.getgroup(name) group = state.getgroup(name)
while 1: p = _parse_sub(source, state)
p = _parse(source, state) subpattern.append((SUBPATTERN, (group, p)))
if group is not None: if group is not None:
p.append((INDEX, group)) p.append((INDEX, group))
if source.match(")"):
if b:
b.append(p)
p = _branch(state, b)
subpattern.append((SUBPATTERN, (group, p)))
break
elif source.match("|"):
b.append(p)
else:
raise error, "group not properly closed"
else: else:
while 1: while 1:
char = source.get() char = source.get()
...@@ -555,26 +560,24 @@ def _parse(source, state): ...@@ -555,26 +560,24 @@ def _parse(source, state):
return subpattern return subpattern
def parse(pattern, flags=0): def parse(str, flags=0):
# parse 're' pattern into list of (opcode, argument) tuples # parse 're' pattern into list of (opcode, argument) tuples
source = Tokenizer(pattern)
state = State() source = Tokenizer(str)
state.flags = flags
b = [] pattern = Pattern()
while 1: pattern.flags = flags
p = _parse(source, state)
tail = source.get() p = _parse_sub(source, pattern, 0)
if tail == "|":
b.append(p) tail = source.get()
elif tail == ")": if tail == ")":
raise error, "unbalanced parenthesis" raise error, "unbalanced parenthesis"
elif tail is None: elif tail:
if b: raise error, "bogus characters at end of regular expression"
b.append(p)
p = _branch(state, b) # p.dump()
break
else:
raise error, "bogus characters at end of regular expression"
return p return p
def parse_template(source, pattern): def parse_template(source, pattern):
...@@ -656,4 +659,4 @@ def expand_template(template, match): ...@@ -656,4 +659,4 @@ def expand_template(template, match):
if s is None: if s is None:
raise error, "empty group" raise error, "empty group"
a(s) a(s)
return sep.join(p) return string.join(p, sep)
test_sre test_sre
=== Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A') === Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A')
=== Failed incorrectly ('(a+)+\\1', 'aa', 0, 'found+"-"+g1', 'aa-a') === Failed incorrectly ('(a+)+\\1', 'aa', 0, 'found+"-"+g1', 'aa-a')
=== grouping error ('(a)(b)c|ab', 'ab', 0, 'found+"-"+g1+"-"+g2', 'ab-None-None') 'ab-None-b' should be 'ab-None-None'
=== grouping error ('(a)+b|aac', 'aac', 0, 'found+"-"+g1', 'aac-None') 'aac-a' should be 'aac-None'
=== Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A') === Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A')
This diff is collapsed.
/* /*
*
* Secret Labs' Regular Expression Engine * Secret Labs' Regular Expression Engine
* *
* regular expression matching engine * regular expression matching engine
...@@ -33,6 +34,7 @@ typedef struct { ...@@ -33,6 +34,7 @@ typedef struct {
typedef struct { typedef struct {
PyObject_VAR_HEAD PyObject_VAR_HEAD
PyObject* string; /* link to the target string */ PyObject* string; /* link to the target string */
PyObject* regs; /* cached list of matching spans */
PatternObject* pattern; /* link to the regex (pattern) object */ PatternObject* pattern; /* link to the regex (pattern) object */
int pos, endpos; /* current target slice */ int pos, endpos; /* current target slice */
int lastindex; /* last index marker seen by the engine (-1 if none) */ int lastindex; /* last index marker seen by the engine (-1 if none) */
...@@ -60,6 +62,9 @@ typedef struct { ...@@ -60,6 +62,9 @@ typedef struct {
void* beginning; /* start of original string */ void* beginning; /* start of original string */
void* start; /* start of current slice */ void* start; /* start of current slice */
void* end; /* end of original string */ void* end; /* end of original string */
/* attributes for the match object */
PyObject* string;
int pos, endpos;
/* character size */ /* character size */
int charsize; int charsize;
/* registers */ /* registers */
...@@ -78,7 +83,6 @@ typedef struct { ...@@ -78,7 +83,6 @@ typedef struct {
/* scanner (internal helper object) */ /* scanner (internal helper object) */
PyObject_HEAD PyObject_HEAD
PyObject* pattern; PyObject* pattern;
PyObject* string;
SRE_STATE state; SRE_STATE state;
} ScannerObject; } ScannerObject;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment