Commit 29c4ba9a authored by Fredrik Lundh's avatar Fredrik Lundh

SRE 0.9.8: passes the entire test suite

-- reverted REPEAT operator to use "repeat context" strategy
   (from 0.8.X), but done right this time.
-- got rid of backtracking stack; use nested SRE_MATCH calls
   instead (should probably put it back again in 0.9.9 ;-)
-- properly reset state in scanner mode
-- don't use aggressive inlining by default
parent 19c6afb4
......@@ -5,8 +5,12 @@
#
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
#
# This version of the SRE library can be redistributed under CNRI's
# Python 1.6 license. For any other use, please contact Secret Labs
# AB (info@pythonware.com).
#
# Portions of this engine have been developed in cooperation with
# CNRI. Hewlett-Packard provided funding for 2.0 integration and
# CNRI. Hewlett-Packard provided funding for 1.6 integration and
# other compatibility work.
#
......@@ -24,7 +28,7 @@ M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE
S = DOTALL = sre_compile.SRE_FLAG_DOTALL
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE
# sre extensions (may or may not be in 2.0 final)
# sre extensions (may or may not be in 1.6/2.0 final)
T = TEMPLATE = sre_compile.SRE_FLAG_TEMPLATE
U = UNICODE = sre_compile.SRE_FLAG_UNICODE
......@@ -168,15 +172,14 @@ copy_reg.pickle(type(_compile("")), _pickle, _compile)
class Scanner:
def __init__(self, lexicon):
from sre_constants import BRANCH, SUBPATTERN, INDEX
from sre_constants import BRANCH, SUBPATTERN
self.lexicon = lexicon
# combine phrases into a compound pattern
p = []
s = sre_parse.Pattern()
for phrase, action in lexicon:
p.append(sre_parse.SubPattern(s, [
(SUBPATTERN, (None, sre_parse.parse(phrase))),
(INDEX, len(p))
(SUBPATTERN, (len(p), sre_parse.parse(phrase))),
]))
p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
s.groups = len(p)
......
......@@ -5,9 +5,7 @@
#
# Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
#
# Portions of this engine have been developed in cooperation with
# CNRI. Hewlett-Packard provided funding for 2.0 integration and
# other compatibility work.
# See the sre.py file for information on usage and redistribution.
#
import _sre
......@@ -124,6 +122,7 @@ def _compile(code, pattern, flags):
emit(CHCODES[CATEGORY_NOT_LINEBREAK])
elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
if flags & SRE_FLAG_TEMPLATE:
raise error, "internal: unsupported template operator"
emit(OPCODES[REPEAT])
skip = len(code); emit(0)
emit(av[0])
......@@ -136,9 +135,8 @@ def _compile(code, pattern, flags):
if lo == 0:
raise error, "nothing to repeat"
if 0 and lo == hi == 1 and op is MAX_REPEAT:
# FIXME: <fl> need a better way to figure out when
# it's safe to use this one (in the parser, probably)
emit(OPCODES[MAX_REPEAT_ONE])
# FIXME: <fl> fast and wrong (but we'll fix that)
emit(OPCODES[REPEAT_ONE])
skip = len(code); emit(0)
emit(av[0])
emit(av[1])
......@@ -146,29 +144,24 @@ def _compile(code, pattern, flags):
emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip
else:
emit(OPCODES[op])
emit(OPCODES[REPEAT])
skip = len(code); emit(0)
emit(av[0])
emit(av[1])
mark = MAXCODE
if av[2][0][0] == SUBPATTERN:
# repeated subpattern
gid, foo = av[2][0][1]
if gid:
mark = (gid-1)*2
emit(mark)
_compile(code, av[2], flags)
emit(OPCODES[SUCCESS])
code[skip] = len(code) - skip
if op == MAX_REPEAT:
emit(OPCODES[MAX_UNTIL])
else:
emit(OPCODES[MIN_UNTIL])
elif op is SUBPATTERN:
gid = av[0]
if gid:
if av[0]:
emit(OPCODES[MARK])
emit((gid-1)*2)
emit((av[0]-1)*2)
_compile(code, av[1], flags)
if gid:
if av[0]:
emit(OPCODES[MARK])
emit((gid-1)*2+1)
emit((av[0]-1)*2+1)
elif op in (SUCCESS, FAILURE):
emit(OPCODES[op])
elif op in (ASSERT, ASSERT_NOT):
......@@ -197,11 +190,10 @@ def _compile(code, pattern, flags):
else:
emit(ATCODES[av])
elif op is BRANCH:
emit(OPCODES[op])
tail = []
for av in av[1]:
emit(OPCODES[op])
skip = len(code); emit(0)
emit(MAXCODE) # save mark
_compile(code, av, flags)
emit(OPCODES[JUMP])
tail.append(len(code)); emit(0)
......@@ -223,9 +215,6 @@ def _compile(code, pattern, flags):
else:
emit(OPCODES[op])
emit(av-1)
elif op in (MARK, INDEX):
emit(OPCODES[op])
emit(av)
else:
raise ValueError, ("unsupported operand type", op)
......@@ -294,16 +283,7 @@ try:
except NameError:
pass
def compile(p, flags=0):
# internal: convert pattern list to internal format
# compile, as necessary
if type(p) in STRING_TYPES:
import sre_parse
pattern = p
p = sre_parse.parse(p, flags)
else:
pattern = None
def _compile1(p, flags):
flags = p.pattern.flags | flags
code = []
......@@ -316,6 +296,20 @@ def compile(p, flags=0):
code.append(OPCODES[SUCCESS])
return code
def compile(p, flags=0):
# internal: convert pattern list to internal format
if type(p) in STRING_TYPES:
import sre_parse
pattern = p
p = sre_parse.parse(p, flags)
else:
pattern = None
code = _compile1(p, flags)
# print code
# FIXME: <fl> get rid of this limitation!
......
......@@ -6,9 +6,7 @@
#
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
#
# Portions of this engine have been developed in cooperation with
# CNRI. Hewlett-Packard provided funding for 2.0 integration and
# other compatibility work.
# See the sre.py file for information on usage and redistribution.
#
# should this really be here?
......@@ -33,15 +31,15 @@ GROUPREF = "groupref"
GROUPREF_IGNORE = "groupref_ignore"
IN = "in"
IN_IGNORE = "in_ignore"
INDEX = "index"
INFO = "info"
JUMP = "jump"
LITERAL = "literal"
LITERAL_IGNORE = "literal_ignore"
MARK = "mark"
MAX_REPEAT = "max_repeat"
MAX_REPEAT_ONE = "max_repeat_one"
MAX_UNTIL = "max_until"
MIN_REPEAT = "min_repeat"
MIN_UNTIL = "min_until"
NEGATE = "negate"
NOT_LITERAL = "not_literal"
NOT_LITERAL_IGNORE = "not_literal_ignore"
......@@ -91,19 +89,19 @@ OPCODES = [
CATEGORY,
CHARSET,
GROUPREF, GROUPREF_IGNORE,
INDEX,
IN, IN_IGNORE,
INFO,
JUMP,
LITERAL, LITERAL_IGNORE,
MARK,
MAX_REPEAT,
MAX_REPEAT_ONE,
MIN_REPEAT,
MAX_UNTIL,
MIN_UNTIL,
NOT_LITERAL, NOT_LITERAL_IGNORE,
NEGATE,
RANGE,
REPEAT
REPEAT,
REPEAT_ONE,
SUBPATTERN
]
......
......@@ -5,9 +5,7 @@
#
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
#
# Portions of this engine have been developed in cooperation with
# CNRI. Hewlett-Packard provided funding for 2.0 integration and
# other compatibility work.
# See the sre.py file for information on usage and redistribution.
#
import string, sys
......@@ -536,8 +534,6 @@ def _parse(source, state):
group = state.getgroup(name)
p = _parse_sub(source, state)
subpattern.append((SUBPATTERN, (group, p)))
if group is not None:
p.append((INDEX, group))
else:
while 1:
char = source.get()
......
......@@ -5,37 +5,29 @@
*
* partial history:
* 99-10-24 fl created (based on existing template matcher code)
* 99-11-13 fl added categories, branching, and more (0.2)
* 99-11-16 fl some tweaks to compile on non-Windows platforms
* 99-12-18 fl non-literals, generic maximizing repeat (0.3)
* 00-02-28 fl tons of changes (not all to the better ;-) (0.4)
* 00-03-06 fl first alpha, sort of (0.5)
* 00-03-14 fl removed most compatibility stuff (0.6)
* 00-05-10 fl towards third alpha (0.8.2)
* 00-05-13 fl added experimental scanner stuff (0.8.3)
* 00-05-27 fl final bug hunt (0.8.4)
* 00-06-21 fl less bugs, more taste (0.8.5)
* 00-06-25 fl major changes to better deal with nested repeats (0.9)
* 00-06-28 fl fixed findall (0.9.1)
* 00-06-29 fl fixed split, added more scanner features (0.9.2)
* 00-06-30 fl added fast search optimization (0.9.3)
* 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4)
* 00-07-02 fl added charset optimizations, etc (0.9.5)
* 00-07-03 fl store code in pattern object, lookbehind, etc
* 00-07-08 fl added regs attribute
* 00-07-18 fl changed branch operator to use failure stack
* 00-07-21 fl reset lastindex in scanner methods (0.9.6)
* 00-08-01 fl fixes for 1.6b1 (0.9.8)
*
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
*
* This version of the SRE library can be redistributed under CNRI's
* Python 1.6 license. For any other use, please contact Secret Labs
* AB (info@pythonware.com).
*
* Portions of this engine have been developed in cooperation with
* CNRI. Hewlett-Packard provided funding for 2.0 integration and
* CNRI. Hewlett-Packard provided funding for 1.6 integration and
* other compatibility work.
*/
#ifndef SRE_RECURSIVE
char copyright[] = " SRE 0.9.6 Copyright (c) 1997-2000 by Secret Labs AB ";
char copyright[] = " SRE 0.9.8 Copyright (c) 1997-2000 by Secret Labs AB ";
#include "Python.h"
......@@ -67,7 +59,7 @@ char copyright[] = " SRE 0.9.6 Copyright (c) 1997-2000 by Secret Labs AB ";
#define USE_FAST_SEARCH
/* enables aggressive inlining (always on for Visual C) */
#define USE_INLINE
#undef USE_INLINE
/* -------------------------------------------------------------------- */
......@@ -84,6 +76,7 @@ char copyright[] = " SRE 0.9.6 Copyright (c) 1997-2000 by Secret Labs AB ";
/* error codes */
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
#define SRE_ERROR_STATE -2 /* illegal state */
#define SRE_ERROR_MEMORY -9 /* out of memory */
#if defined(VERBOSE)
......@@ -216,56 +209,85 @@ sre_category(SRE_CODE category, unsigned int ch)
/* helpers */
LOCAL(int)
stack_free(SRE_STATE* state)
static void
mark_init(SRE_STATE* state)
{
if (state->stack) {
TRACE(("release stack\n"));
free(state->stack);
state->stack = NULL;
}
state->stacksize = 0;
return 0;
state->mark_stack = NULL;
state->mark_stack_size = state->mark_stack_base = 0;
}
static void
mark_fini(SRE_STATE* state)
{
if (state->mark_stack)
free(state->mark_stack);
mark_init(state);
}
static int /* shouldn't be LOCAL */
stack_extend(SRE_STATE* state, int lo, int hi)
static int
mark_save(SRE_STATE* state, int lo, int hi)
{
SRE_STACK* stack;
int stacksize;
void* stack;
int size;
int minsize, newsize;
/* grow the stack to a suitable size; we need at least lo entries,
at most hi entries. if for some reason hi is lower than lo, lo
wins */
if (hi <= lo)
return 0;
stacksize = state->stacksize;
size = (hi - lo) + 1;
if (stacksize == 0) {
newsize = state->mark_stack_size;
minsize = state->mark_stack_base + size;
if (newsize < minsize) {
/* create new stack */
stacksize = 512;
if (stacksize < lo)
stacksize = lo;
else if (stacksize > hi)
stacksize = hi;
TRACE(("allocate stack %d\n", stacksize));
stack = malloc(sizeof(SRE_STACK) * stacksize);
if (!newsize) {
newsize = 512;
if (newsize < minsize)
newsize = minsize;
TRACE(("allocate stack %d\n", newsize));
stack = malloc(sizeof(void*) * newsize);
} else {
/* grow the stack (typically by a factor of two) */
while (stacksize < lo)
stacksize = 2 * stacksize;
/* FIXME: <fl> could trim size if it's much larger than hi,
as long it's larger than lo */
TRACE(("grow stack to %d\n", stacksize));
stack = realloc(state->stack, sizeof(SRE_STACK) * stacksize);
/* grow the stack */
while (newsize < minsize)
newsize += newsize;
TRACE(("grow stack to %d\n", newsize));
stack = realloc(state->mark_stack, sizeof(void*) * newsize);
}
if (!stack) {
stack_free(state);
mark_fini(state);
return SRE_ERROR_MEMORY;
}
state->mark_stack = stack;
state->mark_stack_size = newsize;
}
TRACE(("copy %d:%d to %d\n", lo, hi, state->mark_stack_base));
memcpy(state->mark_stack + state->mark_stack_base, state->mark + lo,
size * sizeof(void*));
state->mark_stack_base += size;
return 0;
}
static int
mark_restore(SRE_STATE* state, int lo, int hi)
{
int size;
if (hi <= lo)
return 0;
size = (hi - lo) + 1;
state->mark_stack_base -= size;
state->stack = stack;
state->stacksize = stacksize;
TRACE(("copy %d:%d from %d\n", lo, hi, state->mark_stack_base));
memcpy(state->mark + lo, state->mark_stack + state->mark_stack_base,
size * sizeof(void*));
return 0;
}
......@@ -372,28 +394,28 @@ SRE_MEMBER(SRE_CODE* set, SRE_CODE ch)
return !ok;
case SRE_OP_LITERAL:
/* args: <literal> */
/* <LITERAL> <code> */
if (ch == set[0])
return ok;
set++;
break;
case SRE_OP_RANGE:
/* args: <lower> <upper> */
/* <RANGE> <lower> <upper> */
if (set[0] <= ch && ch <= set[1])
return ok;
set += 2;
break;
case SRE_OP_CHARSET:
/* args: <bitmap> (16 bits per code word) */
/* <CHARSET> <bitmap> (16 bits per code word) */
if (ch < 256 && (set[ch >> 4] & (1 << (ch & 15))))
return ok;
set += 16;
break;
case SRE_OP_CATEGORY:
/* args: <category> */
/* <CATEGORY> <code> */
if (sre_category(set[0], (int) ch))
return ok;
set += 1;
......@@ -415,39 +437,17 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
SRE_CHAR* end = state->end;
SRE_CHAR* ptr = state->ptr;
int stack;
int stackbase;
int lastmark;
int i, count;
SRE_STACK* sp;
/* FIXME: this is a hack! */
void* mark_copy[SRE_MARK_SIZE];
void* mark = NULL;
#define PUSH(skip_, mark_, max_)\
if (stack >= state->stacksize) {\
i = stack_extend(state, stack + 1, stackbase + max_);\
if (i < 0)\
return i;\
}\
TRACE(("%8d: stack[%d]\n", PTR(ptr), stack));\
sp = state->stack + (stack++);\
sp->ptr = ptr;\
sp->pattern = pattern + skip_;\
sp->mark = mark_;\
if (mark_ != 65535) {\
sp->mark0 = state->mark[mark_];\
sp->mark1 = state->mark[mark_+1];\
TRACE((" mark %d %d %d\n", mark_, PTR(state->mark[mark_]),\
PTR(state->mark[mark_+1])));\
}\
SRE_REPEAT* rp;
int lastmark;
SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */
TRACE(("%8d: enter\n", PTR(ptr)));
if (pattern[0] == SRE_OP_INFO) {
/* optimization info block */
/* args: <1=skip> <2=flags> <3=min> ... */
/* <INFO> <1=skip> <2=flags> <3=min> ... */
if (pattern[3] && (end - ptr) < pattern[3]) {
TRACE(("reject (got %d chars, need %d)\n",
(end - ptr), pattern[3]));
......@@ -456,11 +456,6 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
pattern += pattern[1] + 1;
}
stackbase = stack = state->stackbase;
lastmark = state->lastmark;
retry:
for (;;) {
switch (*pattern++) {
......@@ -468,30 +463,30 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_FAILURE:
/* immediate failure */
TRACE(("%8d: failure\n", PTR(ptr)));
goto failure;
return 0;
case SRE_OP_SUCCESS:
/* end of pattern */
TRACE(("%8d: success\n", PTR(ptr)));
state->ptr = ptr;
goto success;
return 1;
case SRE_OP_AT:
/* match at given position */
/* args: <at> */
/* <AT> <code> */
TRACE(("%8d: position %d\n", PTR(ptr), *pattern));
if (!SRE_AT(state, ptr, *pattern))
goto failure;
return 0;
pattern++;
break;
case SRE_OP_CATEGORY:
/* match at given category */
/* args: <category> */
/* <CATEGORY> <code> */
TRACE(("%8d: category %d [category %d]\n", PTR(ptr),
*ptr, *pattern));
if (ptr >= end || !sre_category(pattern[0], ptr[0]))
goto failure;
return 0;
TRACE(("%8d: category ok\n", PTR(ptr)));
pattern++;
ptr++;
......@@ -499,43 +494,44 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_LITERAL:
/* match literal string */
/* args: <code> */
/* <LITERAL> <code> */
TRACE(("%8d: literal %c\n", PTR(ptr), pattern[0]));
if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
goto failure;
return 0;
pattern++;
ptr++;
break;
case SRE_OP_NOT_LITERAL:
/* match anything that is not literal character */
/* args: <code> */
/* <NOT_LITERAL> <code> */
TRACE(("%8d: literal not %c\n", PTR(ptr), pattern[0]));
if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
goto failure;
return 0;
pattern++;
ptr++;
break;
case SRE_OP_ANY:
/* match anything */
/* <ANY> */
TRACE(("%8d: anything\n", PTR(ptr)));
if (ptr >= end)
goto failure;
return 0;
ptr++;
break;
case SRE_OP_IN:
/* match set member (or non_member) */
/* args: <skip> <set> */
/* <IN> <skip> <set> */
TRACE(("%8d: set %c\n", PTR(ptr), *ptr));
if (ptr >= end || !SRE_MEMBER(pattern + 1, *ptr))
goto failure;
return 0;
pattern += pattern[0];
ptr++;
break;
case SRE_OP_GROUP:
case SRE_OP_GROUPREF:
/* match backreference */
TRACE(("%8d: group %d\n", PTR(ptr), pattern[0]));
i = pattern[0];
......@@ -543,17 +539,17 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
if (!p || !e || e < p)
goto failure;
return 0;
while (p < e) {
if (ptr >= end || *ptr != *p)
goto failure;
return 0;
p++; ptr++;
}
}
pattern++;
break;
case SRE_OP_GROUP_IGNORE:
case SRE_OP_GROUPREF_IGNORE:
/* match backreference */
TRACE(("%8d: group ignore %d\n", PTR(ptr), pattern[0]));
i = pattern[0];
......@@ -561,11 +557,11 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
if (!p || !e || e < p)
goto failure;
return 0;
while (p < e) {
if (ptr >= end ||
state->lower(*ptr) != state->lower(*p))
goto failure;
return 0;
p++; ptr++;
}
}
......@@ -576,7 +572,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: literal lower(%c)\n", PTR(ptr), pattern[0]));
if (ptr >= end ||
state->lower(*ptr) != state->lower(*pattern))
goto failure;
return 0;
pattern++;
ptr++;
break;
......@@ -585,7 +581,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: literal not lower(%c)\n", PTR(ptr), pattern[0]));
if (ptr >= end ||
state->lower(*ptr) == state->lower(*pattern))
goto failure;
return 0;
pattern++;
ptr++;
break;
......@@ -594,359 +590,211 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr));
if (ptr >= end
|| !SRE_MEMBER(pattern+1, (SRE_CODE) state->lower(*ptr)))
goto failure;
return 0;
pattern += pattern[0];
ptr++;
break;
case SRE_OP_MARK:
/* set mark */
/* args: <mark> */
/* <MARK> <gid> */
TRACE(("%8d: set mark %d\n", PTR(ptr), pattern[0]));
if (state->lastmark < pattern[0]+1)
state->lastmark = pattern[0]+1;
if (!mark) {
mark = mark_copy;
memcpy(mark, state->mark, state->lastmark*sizeof(void*));
}
state->mark[pattern[0]] = ptr;
pattern++;
break;
case SRE_OP_INDEX:
/* set index */
/* args: <index> */
TRACE(("%8d: set index %d\n", PTR(ptr), pattern[0]));
state->lastindex = pattern[0];
i = pattern[0];
if (i & 1)
state->lastindex = i/2 + 1;
if (i > state->lastmark)
state->lastmark = i;
state->mark[i] = ptr;
pattern++;
break;
case SRE_OP_JUMP:
case SRE_OP_INFO:
/* jump forward */
/* args: <skip> */
/* <JUMP> <offset> */
TRACE(("%8d: jump +%d\n", PTR(ptr), pattern[0]));
pattern += pattern[0];
break;
case SRE_OP_ASSERT:
/* assert subpattern */
/* args: <skip> <back> <pattern> */
/* <ASSERT> <skip> <back> <pattern> */
TRACE(("%8d: assert subpattern %d\n", PTR(ptr), pattern[1]));
state->ptr = ptr - pattern[1];
if (state->ptr < state->beginning)
goto failure;
return 0;
i = SRE_MATCH(state, pattern + 2);
if (i < 0)
if (i <= 0)
return i;
if (!i)
goto failure;
if (pattern[1] > 0 && state->ptr != ptr)
return SRE_ERROR_STATE;
pattern += pattern[0];
break;
case SRE_OP_ASSERT_NOT:
/* assert not subpattern */
/* args: <skip> <pattern> */
/* <ASSERT_NOT> <skip> <back> <pattern> */
TRACE(("%8d: assert not subpattern %d\n", PTR(ptr), pattern[1]));
state->ptr = ptr - pattern[1];
if (state->ptr < state->beginning)
goto failure;
return 0;
i = SRE_MATCH(state, pattern + 2);
if (i < 0)
return i;
if (i)
goto failure;
return 0;
if (pattern[1] > 0 && state->ptr != ptr)
return SRE_ERROR_STATE;
pattern += pattern[0];
break;
case SRE_OP_BRANCH:
/* try an alternate branch */
/* format: <branch> <0=skip> <1=mark> <tail...> */
/* alternation */
/* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
TRACE(("%8d: branch\n", PTR(ptr)));
{
lastmark = state->lastmark;
while (pattern[0]) {
TRACE(("%8d: try branch\n", PTR(ptr)));
if (pattern[2] != SRE_OP_LITERAL ||
(ptr < end && (SRE_CODE) ptr[0] == pattern[3])) {
/* worth trying */
PUSH(pattern[0], pattern[3], 1);
pattern += 2;
} else
state->ptr = ptr;
i = SRE_MATCH(state, pattern + 1);
if (i)
return i;
}
while (state->lastmark > lastmark)
state->mark[state->lastmark--] = NULL;
pattern += pattern[0];
break;
#if 0
case SRE_OP_MAX_REPEAT_ONE:
/* match repeated sequence (maximizing regexp) */
/* this operator only works if the repeated item is
exactly one character wide, and we're not already
collecting backtracking points. for other cases,
use the MAX_REPEAT operator instead */
}
}
return 0;
/* args: <skip> <min> <max> <step> */
TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
case SRE_OP_REPEAT:
/* create repeat context. all the hard work is done
by the UNTIL operator */
/* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
TRACE(("%8d: repeat {%d,%d}\n", PTR(ptr),
pattern[1], pattern[2]));
count = 0;
if (pattern[3] == SRE_OP_ANY) {
/* repeated wildcard. skip to the end of the target
string, and backtrack from there */
/* FIXME: must look for line endings */
if (ptr + pattern[1] > end)
goto failure; /* cannot match */
count = pattern[2];
if (count > end - ptr)
count = end - ptr;
ptr += count;
} else if (pattern[3] == SRE_OP_LITERAL) {
/* repeated literal */
SRE_CODE chr = pattern[4];
while (count < (int) pattern[2]) {
if (ptr >= end || (SRE_CODE) ptr[0] != chr)
break;
ptr++;
count++;
}
state->ptr = ptr;
} else if (pattern[3] == SRE_OP_LITERAL_IGNORE) {
/* repeated literal */
SRE_CODE chr = pattern[4];
while (count < (int) pattern[2]) {
if (ptr >= end || (SRE_CODE) state->lower(*ptr) != chr)
break;
ptr++;
count++;
}
rep.count = -1;
rep.pattern = pattern;
} else if (pattern[3] == SRE_OP_NOT_LITERAL) {
/* repeated non-literal */
SRE_CODE chr = pattern[4];
while (count < (int) pattern[2]) {
if (ptr >= end || (SRE_CODE) ptr[0] == chr)
break;
ptr++;
count++;
}
/* install new repeat context */
rep.prev = state->repeat;
state->repeat = &rep;
} else if (pattern[3] == SRE_OP_NOT_LITERAL_IGNORE) {
/* repeated non-literal */
SRE_CODE chr = pattern[4];
while (count < (int) pattern[2]) {
if (ptr >= end || (SRE_CODE) state->lower(ptr[0]) == chr)
break;
ptr++;
count++;
}
i = SRE_MATCH(state, pattern + pattern[0]);
} else if (pattern[3] == SRE_OP_IN) {
/* repeated set */
while (count < (int) pattern[2]) {
if (ptr >= end || !SRE_MEMBER(pattern + 5, *ptr))
break;
ptr++;
count++;
}
state->repeat = rep.prev;
/* free(rp); */
} else {
/* repeated single character pattern */
state->ptr = ptr;
while (count < (int) pattern[2]) {
i = SRE_MATCH(state, pattern + 3);
if (i < 0)
return i;
if (!i)
break;
count++;
}
state->ptr = ptr;
ptr += count;
}
/* when we arrive here, count contains the number of
matches, and ptr points to the tail of the target
string. check if the rest of the pattern matches, and
backtrack if not. */
case SRE_OP_MAX_UNTIL:
/* maximizing repeat */
/* <REPEAT> <skip> <1=min> <2=max> item <MAX_UNTIL> tail */
TRACE(("%8d: repeat %d found\n", PTR(ptr), count));
/* FIXME: we probably need to deal with zero-width
matches in here... */
if (count < (int) pattern[1])
goto failure;
rp = state->repeat;
if (!rp)
return SRE_ERROR_STATE;
if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
/* tail is empty. we're finished */
TRACE(("%8d: tail is empty\n", PTR(ptr)));
state->ptr = ptr;
goto success;
} else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
/* tail starts with a literal. skip positions where
the rest of the pattern cannot possibly match */
SRE_CODE chr = pattern[pattern[0]+1];
TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
for (;;) {
TRACE(("%8d: scan for tail match\n", PTR(ptr)));
while (count >= (int) pattern[1] &&
(ptr >= end || *ptr != chr)) {
ptr--;
count--;
}
TRACE(("%8d: check tail\n", PTR(ptr)));
if (count < (int) pattern[1])
break;
state->ptr = ptr;
i = SRE_MATCH(state, pattern + pattern[0]);
if (i > 0) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
goto success;
}
TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
ptr--;
count--;
}
} else {
/* general case */
TRACE(("%8d: tail is pattern\n", PTR(ptr)));
while (count >= (int) pattern[1]) {
state->ptr = ptr;
i = SRE_MATCH(state, pattern + pattern[0]);
if (i < 0)
return i;
if (i) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
goto success;
}
TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
ptr--;
count--;
}
}
goto failure;
#endif
case SRE_OP_MAX_REPEAT:
/* match repeated sequence (maximizing regexp) */
/* args: <skip> <1=min> <2=max> <3=save> <4=item> */
count = rp->count + 1;
TRACE(("%8d: max repeat (%d %d)\n", PTR(ptr),
pattern[1], pattern[2]));
TRACE(("%8d: max until %d\n", PTR(ptr), count));
count = 0;
if (count < rp->pattern[1]) {
/* not enough matches */
TRACE(("%8d: match item (required)\n", PTR(ptr)));
rp->count = count;
i = SRE_MATCH(state, rp->pattern + 3);
if (i)
return i;
rp->count = count - 1;
state->ptr = ptr;
return 0;
}
/* match minimum number of items */
while (count < (int) pattern[1]) {
i = SRE_MATCH(state, pattern + 4);
if (i < 0)
if (count < rp->pattern[2] || rp->pattern[2] == 65535) {
TRACE(("%8d: match item (optional)\n", PTR(ptr)));
/* we may have enough matches, but if we can
match another item, do so */
rp->count = count;
lastmark = state->lastmark;
mark_save(state, 0, lastmark);
i = SRE_MATCH(state, rp->pattern + 3);
if (i)
return i;
if (!i)
goto failure;
if (state->ptr == ptr) {
/* if the match was successful but empty, set the
count to max and terminate the scanning loop */
count = (int) pattern[2];
break;
}
count++;
ptr = state->ptr;
mark_restore(state, 0, lastmark);
rp->count = count - 1;
state->ptr = ptr;
}
TRACE(("%8d: found %d leading items\n", PTR(ptr), count));
if (count < (int) pattern[1])
goto failure;
/* match maximum number of items, pushing alternate end
points to the stack */
while (pattern[2] == 65535 || count < (int) pattern[2]) {
/* this position is valid; add it to the retry
stack */
PUSH(pattern[0], pattern[3], pattern[2]);
/* match more stuff */
state->stackbase = stack;
i = SRE_MATCH(state, pattern + 4);
state->stackbase = stackbase;
if (i < 0)
/* cannot match more repeated items here. make sure the
tail matches */
TRACE(("%8d: match tail\n", PTR(ptr)));
state->repeat = rp->prev;
i = SRE_MATCH(state, pattern);
if (i) {
/* free(rp); */
return i;
if (!i)
break;
if (state->ptr == ptr) {
count = (int) pattern[2];
break;
}
/* move forward */
ptr = state->ptr;
count++;
}
state->repeat = rp;
return 0;
/* when we get here, count is the number of successful
matches, and ptr points to the tail. */
case SRE_OP_MIN_UNTIL:
/* minimizing repeat */
/* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
TRACE(("%8d: skip +%d\n", PTR(ptr), pattern[0]));
rp = state->repeat;
if (!rp)
return SRE_ERROR_STATE;
pattern += pattern[0];
break;
count = rp->count + 1;
case SRE_OP_MIN_REPEAT:
/* match repeated sequence (minimizing regexp) */
/* args: <skip> <1=min> <2=max> <3=save> <4=item> */
TRACE(("%8d: min until %d\n", PTR(ptr), count));
TRACE(("%8d: min repeat %d %d\n", PTR(ptr),
pattern[1], pattern[2]));
count = 0;
state->ptr = ptr;
/* match minimum number of items */
while (count < (int) pattern[1]) {
i = SRE_MATCH(state, pattern + 4);
if (i < 0)
if (count < rp->pattern[1]) {
/* not enough matches */
TRACE(("%8d: match item (required)\n", PTR(ptr)));
rp->count = count;
i = SRE_MATCH(state, rp->pattern + 3);
if (i)
return i;
if (!i)
goto failure;
count++;
}
/* move forward until the tail matches. */
while (count <= (int) pattern[2]) {
ptr = state->ptr;
i = SRE_MATCH(state, pattern + pattern[0]);
if (i > 0) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
goto success;
rp->count = count-1;
state->ptr = ptr;
return 0;
}
state->ptr = ptr; /* backtrack */
i = SRE_MATCH(state, pattern + 4);
if (i < 0)
/* see if the tail matches */
TRACE(("%8d: match tail\n", PTR(ptr)));
state->repeat = rp->prev;
i = SRE_MATCH(state, pattern);
if (i) {
/* free(rp); */
return i;
if (!i)
goto failure;
count++;
}
goto failure;
state->repeat = rp;
case SRE_OP_REPEAT:
/* TEMPLATE: match repeated sequence (no backtracking) */
/* args: <skip> <min> <max> */
TRACE(("%8d: repeat %d %d\n", PTR(ptr), pattern[1], pattern[2]));
count = 0;
state->ptr = ptr;
while (count < (int) pattern[2]) {
i = SRE_MATCH(state, pattern + 3);
if (i < 0)
if (count >= rp->pattern[2] && rp->pattern[2] != 65535)
return 0;
TRACE(("%8d: match item (optional)\n", PTR(ptr)));
rp->count = count;
i = SRE_MATCH(state, rp->pattern + 3);
if (i)
return i;
if (!i)
break;
if (state->ptr == ptr) {
count = (int) pattern[2];
break;
}
count++;
}
if (count <= (int) pattern[1])
goto failure;
TRACE(("%8d: repeat %d matches\n", PTR(ptr), count));
pattern += pattern[0];
ptr = state->ptr;
break;
rp->count = count - 1;
return 0;
default:
TRACE(("%8d: unknown opcode %d\n", PTR(ptr), pattern[-1]));
......@@ -954,33 +802,11 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
}
}
failure:
TRACE(("%8d: leave (failure)\n", PTR(ptr)));
if (stack-- > stackbase) {
TRACE(("%8d: pop stack[%d]\n", stack));
sp = state->stack + stack;
ptr = sp->ptr;
pattern = sp->pattern;
if (sp->mark != 65535) {
state->mark[sp->mark] = sp->mark0;
state->mark[sp->mark+1] = sp->mark1;
}
TRACE(("%8d: retry (%d)\n", PTR(ptr), stack));
goto retry;
}
state->lastmark = lastmark;
state->stackbase = stackbase;
if (mark)
memcpy(state->mark, mark, state->lastmark*sizeof(void*));
return 0;
success:
TRACE(("%8d: leave (success)\n", PTR(ptr)));
state->stackbase = stackbase;
return 1;
/* shouldn't end up here */
return SRE_ERROR_ILLEGAL;
}
LOCAL(int)
static int
SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
{
SRE_CHAR* ptr = state->start;
......@@ -994,7 +820,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
if (pattern[0] == SRE_OP_INFO) {
/* optimization info block */
/* args: <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
/* <INFO> <1=skip> <2=flags> <3=min> <4=max> <5=prefix info> */
flags = pattern[2];
......@@ -1191,6 +1017,24 @@ sre_getlower(PyObject* self, PyObject* args)
return Py_BuildValue("i", sre_lower(character));
}
LOCAL(void)
state_reset(SRE_STATE* state)
{
int i;
state->lastmark = 0;
/* FIXME: dynamic! */
for (i = 0; i < SRE_MARK_SIZE; i++)
state->mark[i] = NULL;
state->lastindex = -1;
state->repeat = NULL;
mark_fini(state);
}
LOCAL(PyObject*)
state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
int start, int end)
......@@ -1198,44 +1042,53 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
/* prepare state object */
PyBufferProcs *buffer;
int i, count;
int size, bytes;
void* ptr;
/* get pointer to string buffer */
buffer = string->ob_type->tp_as_buffer;
if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
buffer->bf_getsegcount(string, NULL) != 1) {
PyErr_SetString(PyExc_TypeError, "expected read-only buffer");
PyErr_SetString(PyExc_TypeError, "expected string or buffer");
return NULL;
}
/* determine buffer size */
count = buffer->bf_getreadbuffer(string, 0, &ptr);
if (count < 0) {
/* sanity check */
bytes = buffer->bf_getreadbuffer(string, 0, &ptr);
if (bytes < 0) {
PyErr_SetString(PyExc_TypeError, "buffer has negative size");
return NULL;
}
/* determine character size */
#if defined(HAVE_UNICODE)
state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1);
#if PY_VERSION_HEX >= 0x01060000
size = PyObject_Size(string);
#else
state->charsize = 1;
size = PyObject_Length(string);
#endif
count /= state->charsize;
if (PyString_Check(string) || bytes == size)
state->charsize = 1;
#if defined(HAVE_UNICODE)
else if (bytes == (int) (size * sizeof(Py_UNICODE)))
state->charsize = sizeof(Py_UNICODE);
#endif
else {
PyErr_SetString(PyExc_TypeError, "buffer size mismatch");
return NULL;
}
/* adjust boundaries */
if (start < 0)
start = 0;
else if (start > count)
start = count;
else if (start > size)
start = size;
if (end < 0)
end = 0;
else if (end > count)
end = count;
else if (end > size)
end = size;
state->beginning = ptr;
......@@ -1247,18 +1100,6 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
state->pos = start;
state->endpos = end;
state->lastmark = 0;
/* FIXME: dynamic! */
for (i = 0; i < SRE_MARK_SIZE; i++)
state->mark[i] = NULL;
state->lastindex = -1;
state->stack = NULL;
state->stackbase = 0;
state->stacksize = 0;
if (pattern->flags & SRE_FLAG_LOCALE)
state->lower = sre_lower_locale;
#if defined(HAVE_UNICODE)
......@@ -1268,6 +1109,10 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
else
state->lower = sre_lower;
state->mark_stack = NULL;
state_reset(state);
return string;
}
......@@ -1275,7 +1120,7 @@ LOCAL(void)
state_fini(SRE_STATE* state)
{
Py_XDECREF(state->string);
stack_free(state);
mark_fini(state);
}
LOCAL(PyObject*)
......@@ -2091,7 +1936,8 @@ scanner_match(ScannerObject* self, PyObject* args)
PyObject* match;
int status;
state->lastindex = -1;
state_reset(state);
state->ptr = state->start;
if (state->charsize == 1) {
......@@ -2121,7 +1967,8 @@ scanner_search(ScannerObject* self, PyObject* args)
PyObject* match;
int status;
state->lastindex = -1;
state_reset(state);
state->ptr = state->start;
if (state->charsize == 1) {
......
/*
*
* Secret Labs' Regular Expression Engine
*
* regular expression matching engine
......@@ -44,18 +43,15 @@ typedef struct {
typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch);
typedef struct {
/* stack elements */
SRE_CODE* pattern;
void* ptr;
int mark;
void* mark0;
void* mark1;
} SRE_STACK;
/* FIXME: <fl> shouldn't be a constant, really... */
#define SRE_MARK_SIZE 200
typedef struct SRE_REPEAT_T {
int count;
SRE_CODE* pattern; /* points to REPEAT operator arguments */
struct SRE_REPEAT_T *prev; /* points to previous repeat context */
} SRE_REPEAT;
typedef struct {
/* string pointers */
void* ptr; /* current position (also end of current slice) */
......@@ -71,16 +67,16 @@ typedef struct {
int lastindex;
int lastmark;
void* mark[SRE_MARK_SIZE];
/* backtracking stack */
SRE_STACK* stack;
int stacksize;
int stackbase;
/* dynamically allocated stuff */
void** mark_stack;
int mark_stack_size;
int mark_stack_base;
SRE_REPEAT *repeat; /* current repeat context */
/* hooks */
SRE_TOLOWER_HOOK lower;
} SRE_STATE;
typedef struct {
/* scanner (internal helper object) */
PyObject_HEAD
PyObject* pattern;
SRE_STATE state;
......
......@@ -21,24 +21,24 @@
#define SRE_OP_CALL 7
#define SRE_OP_CATEGORY 8
#define SRE_OP_CHARSET 9
#define SRE_OP_GROUP 10
#define SRE_OP_GROUP_IGNORE 11
#define SRE_OP_INDEX 12
#define SRE_OP_IN 13
#define SRE_OP_IN_IGNORE 14
#define SRE_OP_INFO 15
#define SRE_OP_JUMP 16
#define SRE_OP_LITERAL 17
#define SRE_OP_LITERAL_IGNORE 18
#define SRE_OP_MARK 19
#define SRE_OP_MAX_REPEAT 20
#define SRE_OP_MAX_REPEAT_ONE 21
#define SRE_OP_MIN_REPEAT 22
#define SRE_OP_NOT_LITERAL 23
#define SRE_OP_NOT_LITERAL_IGNORE 24
#define SRE_OP_NEGATE 25
#define SRE_OP_RANGE 26
#define SRE_OP_REPEAT 27
#define SRE_OP_GROUPREF 10
#define SRE_OP_GROUPREF_IGNORE 11
#define SRE_OP_IN 12
#define SRE_OP_IN_IGNORE 13
#define SRE_OP_INFO 14
#define SRE_OP_JUMP 15
#define SRE_OP_LITERAL 16
#define SRE_OP_LITERAL_IGNORE 17
#define SRE_OP_MARK 18
#define SRE_OP_MAX_UNTIL 19
#define SRE_OP_MIN_UNTIL 20
#define SRE_OP_NOT_LITERAL 21
#define SRE_OP_NOT_LITERAL_IGNORE 22
#define SRE_OP_NEGATE 23
#define SRE_OP_RANGE 24
#define SRE_OP_REPEAT 25
#define SRE_OP_REPEAT_ONE 26
#define SRE_OP_SUBPATTERN 27
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BOUNDARY 2
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment