Commit 51fe8e78 authored by Gustavo Niemeyer

Implemented non-recursive SRE matching.

parent b9db49db
...@@ -297,6 +297,15 @@ assertions, the contained pattern must only match strings of some ...@@ -297,6 +297,15 @@ assertions, the contained pattern must only match strings of some
fixed length. Patterns which start with negative lookbehind fixed length. Patterns which start with negative lookbehind
assertions may match at the beginning of the string being searched. assertions may match at the beginning of the string being searched.
\item[\code{(?(\var{id/name})yes-pattern|no-pattern)}] Will try to match
with \regexp{yes-pattern} if the group with given \var{id} or \var{name}
exists, and with \regexp{no-pattern} if it doesn't. \regexp{|no-pattern}
is optional and can be omitted. For example,
\regexp{(<)?(\e w+@\e w+(?:\e .\e w+)+)(?(1)>)} is a poor email matching
pattern, which will match with \code{'<user@host.com>'} as well as
\code{'user@host.com'}, but not with \code{'<user@host.com'}.
\versionadded{2.3}
\end{list} \end{list}
The special sequences consist of \character{\e} and a character from the The special sequences consist of \character{\e} and a character from the
......
...@@ -145,6 +145,19 @@ def _compile(code, pattern, flags): ...@@ -145,6 +145,19 @@ def _compile(code, pattern, flags):
else: else:
emit(OPCODES[op]) emit(OPCODES[op])
emit(av-1) emit(av-1)
elif op is GROUPREF_EXISTS:
emit(OPCODES[op])
emit((av[0]-1)*2)
skipyes = len(code); emit(0)
_compile(code, av[1], flags)
if av[2]:
emit(OPCODES[JUMP])
skipno = len(code); emit(0)
code[skipyes] = len(code) - skipyes + 1
_compile(code, av[2], flags)
code[skipno] = len(code) - skipno
else:
code[skipyes] = len(code) - skipyes + 1
else: else:
raise ValueError, ("unsupported operand type", op) raise ValueError, ("unsupported operand type", op)
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
# update when constants are added or removed # update when constants are added or removed
MAGIC = 20030419 MAGIC = 20031017
# max code word in this release # max code word in this release
...@@ -42,6 +42,7 @@ CATEGORY = "category" ...@@ -42,6 +42,7 @@ CATEGORY = "category"
CHARSET = "charset" CHARSET = "charset"
GROUPREF = "groupref" GROUPREF = "groupref"
GROUPREF_IGNORE = "groupref_ignore" GROUPREF_IGNORE = "groupref_ignore"
GROUPREF_EXISTS = "groupref_exists"
IN = "in" IN = "in"
IN_IGNORE = "in_ignore" IN_IGNORE = "in_ignore"
INFO = "info" INFO = "info"
...@@ -108,7 +109,7 @@ OPCODES = [ ...@@ -108,7 +109,7 @@ OPCODES = [
CALL, CALL,
CATEGORY, CATEGORY,
CHARSET, BIGCHARSET, CHARSET, BIGCHARSET,
GROUPREF, GROUPREF_IGNORE, GROUPREF, GROUPREF_EXISTS, GROUPREF_IGNORE,
IN, IN_IGNORE, IN, IN_IGNORE,
INFO, INFO,
JUMP, JUMP,
......
...@@ -364,6 +364,20 @@ def _parse_sub(source, state, nested=1): ...@@ -364,6 +364,20 @@ def _parse_sub(source, state, nested=1):
subpattern.append((BRANCH, (None, items))) subpattern.append((BRANCH, (None, items)))
return subpattern return subpattern
# Parse the body of a conditional backreference group,
# i.e. the "yes-pattern|no-pattern" part of (?(id/name)yes-pattern|no-pattern).
#
# source    -- tokenizer positioned just after the "(?(id/name)" prefix
# state     -- parser state, passed through to _parse() and SubPattern()
# condgroup -- group number whose participation selects the branch
#
# Returns a SubPattern holding a single GROUPREF_EXISTS item whose argument
# is the tuple (condgroup, item_yes, item_no); item_no is None when the
# pattern has no "|no-pattern" part.
# Raises error when a third branch is present or the group is not closed.
def _parse_sub_cond(source, state, condgroup):
# the "yes" branch is mandatory
item_yes = _parse(source, state)
if source.match("|"):
# optional "no" branch
item_no = _parse(source, state)
# at most two alternatives are allowed
if source.match("|"):
raise error, "conditional backref with more than two branches"
else:
item_no = None
# NOTE(review): the 0 argument presumably makes match() test for ")" without
# consuming it (the caller matches ")" again after this returns) -- confirm
# against the tokenizer's match() signature.
if source.next and not source.match(")", 0):
raise error, "pattern not properly closed"
subpattern = SubPattern(state)
subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))
return subpattern
def _parse(source, state): def _parse(source, state):
# parse a simple pattern # parse a simple pattern
...@@ -499,6 +513,7 @@ def _parse(source, state): ...@@ -499,6 +513,7 @@ def _parse(source, state):
elif this == "(": elif this == "(":
group = 1 group = 1
name = None name = None
condgroup = None
if source.match("?"): if source.match("?"):
group = 0 group = 0
# options # options
...@@ -568,6 +583,26 @@ def _parse(source, state): ...@@ -568,6 +583,26 @@ def _parse(source, state):
else: else:
subpattern.append((ASSERT_NOT, (dir, p))) subpattern.append((ASSERT_NOT, (dir, p)))
continue continue
elif source.match("("):
# conditional backreference group
condname = ""
while 1:
char = source.get()
if char is None:
raise error, "unterminated name"
if char == ")":
break
condname = condname + char
group = 2
if isname(condname):
condgroup = state.groupdict.get(condname)
if condgroup is None:
raise error, "unknown group name"
else:
try:
condgroup = atoi(condname)
except ValueError:
raise error, "bad character in group name"
else: else:
# flags # flags
if not source.next in FLAGS: if not source.next in FLAGS:
...@@ -581,6 +616,9 @@ def _parse(source, state): ...@@ -581,6 +616,9 @@ def _parse(source, state):
group = None group = None
else: else:
group = state.opengroup(name) group = state.opengroup(name)
if condgroup:
p = _parse_sub_cond(source, state, condgroup)
else:
p = _parse_sub(source, state) p = _parse_sub(source, state)
if not source.match(")"): if not source.match(")"):
raise error, "unbalanced parenthesis" raise error, "unbalanced parenthesis"
......
...@@ -169,7 +169,6 @@ class ReTests(unittest.TestCase): ...@@ -169,7 +169,6 @@ class ReTests(unittest.TestCase):
self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c')) self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
def test_re_groupref_exists(self): def test_re_groupref_exists(self):
return # not yet
self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(), self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', '(a)').groups(),
('(', 'a')) ('(', 'a'))
self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(), self.assertEqual(re.match('^(\()?([^()]+)(?(1)\))$', 'a').groups(),
...@@ -405,19 +404,20 @@ class ReTests(unittest.TestCase): ...@@ -405,19 +404,20 @@ class ReTests(unittest.TestCase):
self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0), self.assertEqual(re.match('.*?cd', 5000*'ab'+'c'+5000*'ab'+'cde').end(0),
20003) 20003)
self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001) self.assertEqual(re.match('.*?cd', 20000*'abc'+'de').end(0), 60001)
# non-simple '*?' still recurses and hits the recursion limit # non-simple '*?' still used to hit the recursion limit, before the
self.assertRaises(RuntimeError, re.search, '(a|b)*?c', 10000*'ab'+'cd') # non-recursive scheme was implemented.
self.assertEqual(re.search('(a|b)*?c', 10000*'ab'+'cd').end(0), 20001)
def test_bug_612074(self): def test_bug_612074(self):
pat=u"["+re.escape(u"\u2039")+u"]" pat=u"["+re.escape(u"\u2039")+u"]"
self.assertEqual(re.compile(pat) and 1, 1) self.assertEqual(re.compile(pat) and 1, 1)
def test_stack_overflow(self): def test_stack_overflow(self):
# nasty case that overflows the straightforward recursive # nasty cases that used to overflow the straightforward recursive
# implementation of repeated groups. # implementation of repeated groups.
self.assertRaises(RuntimeError, re.match, '(x)*', 50000*'x') self.assertEqual(re.match('(x)*', 50000*'x').group(1), 'x')
self.assertRaises(RuntimeError, re.match, '(x)*y', 50000*'x'+'y') self.assertEqual(re.match('(x)*y', 50000*'x'+'y').group(1), 'x')
self.assertRaises(RuntimeError, re.match, '(x)*?y', 50000*'x'+'y') self.assertEqual(re.match('(x)*?y', 50000*'x'+'y').group(1), 'x')
def test_scanner(self): def test_scanner(self):
def s_ident(scanner, token): return token def s_ident(scanner, token): return token
......
...@@ -61,6 +61,10 @@ Extension modules ...@@ -61,6 +61,10 @@ Extension modules
- Bug #814613: INET_ADDRSTRLEN fix needed for all compilers on SGI - Bug #814613: INET_ADDRSTRLEN fix needed for all compilers on SGI
- Implemented non-recursive SRE matching scheme (#757624).
- Implemented (?(id/name)yes|no) support in SRE (#572936).
Library Library
------- -------
......
...@@ -21,6 +21,7 @@ ...@@ -21,6 +21,7 @@
* 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum) * 2001-12-07 fl fixed memory leak in sub/subn (Guido van Rossum)
* 2002-11-09 fl fixed empty sub/subn return type * 2002-11-09 fl fixed empty sub/subn return type
* 2003-04-18 mvl fully support 4-byte codes * 2003-04-18 mvl fully support 4-byte codes
* 2003-10-17 gn implemented non-recursive scheme
* *
* Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved. * Copyright (c) 1997-2001 by Secret Labs AB. All rights reserved.
* *
...@@ -91,6 +92,9 @@ static char copyright[] = ...@@ -91,6 +92,9 @@ static char copyright[] =
#endif #endif
#endif #endif
/* enables usage of recursive scheme */
#undef USE_RECURSION
/* enables fast searching */ /* enables fast searching */
#define USE_FAST_SEARCH #define USE_FAST_SEARCH
...@@ -275,82 +279,33 @@ sre_category(SRE_CODE category, unsigned int ch) ...@@ -275,82 +279,33 @@ sre_category(SRE_CODE category, unsigned int ch)
/* helpers */ /* helpers */
static void static void
mark_fini(SRE_STATE* state) data_stack_dealloc(SRE_STATE* state)
{ {
if (state->mark_stack) { if (state->data_stack) {
free(state->mark_stack); free(state->data_stack);
state->mark_stack = NULL; state->data_stack = NULL;
} }
state->mark_stack_size = state->mark_stack_base = 0; state->data_stack_size = state->data_stack_base = 0;
} }
static int static int
mark_save(SRE_STATE* state, int lo, int hi, int *mark_stack_base) data_stack_grow(SRE_STATE* state, int size)
{ {
int minsize, cursize;
minsize = state->data_stack_base+size;
cursize = state->data_stack_size;
if (cursize < minsize) {
void* stack; void* stack;
int size; cursize = minsize+minsize/4+1024;
int minsize, newsize; TRACE(("allocate/grow stack %d\n", cursize));
stack = realloc(state->data_stack, cursize);
if (hi <= lo)
return 0;
size = (hi - lo) + 1;
newsize = state->mark_stack_size;
minsize = state->mark_stack_base + size;
if (newsize < minsize) {
/* create new stack */
if (!newsize) {
newsize = 512;
if (newsize < minsize)
newsize = minsize;
TRACE(("allocate stack %d\n", newsize));
stack = malloc(sizeof(void*) * newsize);
} else {
/* grow the stack */
while (newsize < minsize)
newsize += newsize;
TRACE(("grow stack to %d\n", newsize));
stack = realloc(state->mark_stack, sizeof(void*) * newsize);
}
if (!stack) { if (!stack) {
mark_fini(state); data_stack_dealloc(state);
return SRE_ERROR_MEMORY; return SRE_ERROR_MEMORY;
} }
state->mark_stack = stack; state->data_stack = stack;
state->mark_stack_size = newsize; state->data_stack_size = cursize;
} }
TRACE(("copy %d:%d to %d (%d)\n", lo, hi, state->mark_stack_base, size));
memcpy(state->mark_stack + state->mark_stack_base, state->mark + lo,
size * sizeof(void*));
state->mark_stack_base += size;
*mark_stack_base = state->mark_stack_base;
return 0;
}
static int
mark_restore(SRE_STATE* state, int lo, int hi, int *mark_stack_base)
{
int size;
if (hi <= lo)
return 0;
size = (hi - lo) + 1;
state->mark_stack_base = *mark_stack_base - size;
TRACE(("copy %d:%d from %d\n", lo, hi, state->mark_stack_base));
memcpy(state->mark + lo, state->mark_stack + state->mark_stack_base,
size * sizeof(void*));
return 0; return 0;
} }
...@@ -362,6 +317,7 @@ mark_restore(SRE_STATE* state, int lo, int hi, int *mark_stack_base) ...@@ -362,6 +317,7 @@ mark_restore(SRE_STATE* state, int lo, int hi, int *mark_stack_base)
#define SRE_CHARSET sre_charset #define SRE_CHARSET sre_charset
#define SRE_INFO sre_info #define SRE_INFO sre_info
#define SRE_MATCH sre_match #define SRE_MATCH sre_match
#define SRE_MATCH_CONTEXT sre_match_context
#define SRE_SEARCH sre_search #define SRE_SEARCH sre_search
#define SRE_LITERAL_TEMPLATE sre_literal_template #define SRE_LITERAL_TEMPLATE sre_literal_template
...@@ -374,6 +330,7 @@ mark_restore(SRE_STATE* state, int lo, int hi, int *mark_stack_base) ...@@ -374,6 +330,7 @@ mark_restore(SRE_STATE* state, int lo, int hi, int *mark_stack_base)
#undef SRE_LITERAL_TEMPLATE #undef SRE_LITERAL_TEMPLATE
#undef SRE_SEARCH #undef SRE_SEARCH
#undef SRE_MATCH #undef SRE_MATCH
#undef SRE_MATCH_CONTEXT
#undef SRE_INFO #undef SRE_INFO
#undef SRE_CHARSET #undef SRE_CHARSET
#undef SRE_COUNT #undef SRE_COUNT
...@@ -388,6 +345,7 @@ mark_restore(SRE_STATE* state, int lo, int hi, int *mark_stack_base) ...@@ -388,6 +345,7 @@ mark_restore(SRE_STATE* state, int lo, int hi, int *mark_stack_base)
#define SRE_CHARSET sre_ucharset #define SRE_CHARSET sre_ucharset
#define SRE_INFO sre_uinfo #define SRE_INFO sre_uinfo
#define SRE_MATCH sre_umatch #define SRE_MATCH sre_umatch
#define SRE_MATCH_CONTEXT sre_umatch_context
#define SRE_SEARCH sre_usearch #define SRE_SEARCH sre_usearch
#define SRE_LITERAL_TEMPLATE sre_uliteral_template #define SRE_LITERAL_TEMPLATE sre_uliteral_template
#endif #endif
...@@ -500,6 +458,9 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch) ...@@ -500,6 +458,9 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
for (;;) { for (;;) {
switch (*set++) { switch (*set++) {
case SRE_OP_FAILURE:
return !ok;
case SRE_OP_LITERAL: case SRE_OP_LITERAL:
/* <LITERAL> <code> */ /* <LITERAL> <code> */
if (ch == set[0]) if (ch == set[0])
...@@ -507,11 +468,11 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch) ...@@ -507,11 +468,11 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
set++; set++;
break; break;
case SRE_OP_RANGE: case SRE_OP_CATEGORY:
/* <RANGE> <lower> <upper> */ /* <CATEGORY> <code> */
if (set[0] <= ch && ch <= set[1]) if (sre_category(set[0], (int) ch))
return ok; return ok;
set += 2; set += 1;
break; break;
case SRE_OP_CHARSET: case SRE_OP_CHARSET:
...@@ -529,6 +490,17 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch) ...@@ -529,6 +490,17 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
} }
break; break;
case SRE_OP_RANGE:
/* <RANGE> <lower> <upper> */
if (set[0] <= ch && ch <= set[1])
return ok;
set += 2;
break;
case SRE_OP_NEGATE:
ok = !ok;
break;
case SRE_OP_BIGCHARSET: case SRE_OP_BIGCHARSET:
/* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */ /* <BIGCHARSET> <blockcount> <256 blockindices> <blocks> */
{ {
...@@ -556,20 +528,6 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch) ...@@ -556,20 +528,6 @@ SRE_CHARSET(SRE_CODE* set, SRE_CODE ch)
break; break;
} }
case SRE_OP_CATEGORY:
/* <CATEGORY> <code> */
if (sre_category(set[0], (int) ch))
return ok;
set += 1;
break;
case SRE_OP_NEGATE:
ok = !ok;
break;
case SRE_OP_FAILURE:
return !ok;
default: default:
/* internal error -- there's not much we can do about it /* internal error -- there's not much we can do about it
here, so let's just pretend it didn't match... */ here, so let's just pretend it didn't match... */
...@@ -594,6 +552,13 @@ SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount, int level) ...@@ -594,6 +552,13 @@ SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount, int level)
switch (pattern[0]) { switch (pattern[0]) {
case SRE_OP_IN:
/* repeated set */
TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
ptr++;
break;
case SRE_OP_ANY: case SRE_OP_ANY:
/* repeated dot wildcard. */ /* repeated dot wildcard. */
TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr)); TRACE(("|%p|%p|COUNT ANY\n", pattern, ptr));
...@@ -640,13 +605,6 @@ SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount, int level) ...@@ -640,13 +605,6 @@ SRE_COUNT(SRE_STATE* state, SRE_CODE* pattern, int maxcount, int level)
ptr++; ptr++;
break; break;
case SRE_OP_IN:
/* repeated set */
TRACE(("|%p|%p|COUNT IN\n", pattern, ptr));
while (ptr < end && SRE_CHARSET(pattern + 2, *ptr))
ptr++;
break;
default: default:
/* repeated single character pattern */ /* repeated single character pattern */
TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr)); TRACE(("|%p|%p|COUNT SUBPATTERN\n", pattern, ptr));
...@@ -724,35 +682,173 @@ SRE_INFO(SRE_STATE* state, SRE_CODE* pattern) ...@@ -724,35 +682,173 @@ SRE_INFO(SRE_STATE* state, SRE_CODE* pattern)
*/ */
#define LASTMARK_SAVE() \ #define LASTMARK_SAVE() \
do { \ do { \
lastmark = state->lastmark; \ ctx->lastmark = state->lastmark; \
lastindex = state->lastindex; \ ctx->lastindex = state->lastindex; \
} while (0) } while (0)
#define LASTMARK_RESTORE() \ #define LASTMARK_RESTORE() \
do { \ do { \
if (state->lastmark > lastmark) { \ state->lastmark = ctx->lastmark; \
memset(state->mark + lastmark + 1, 0, \ state->lastindex = ctx->lastindex; \
(state->lastmark - lastmark) * sizeof(void*)); \ } while (0)
state->lastmark = lastmark; \
state->lastindex = lastindex; \ #define RETURN_ERROR(i) do { return i; } while(0)
/* Exit helpers for the matching loop.
 *
 * RETURN_FAILURE / RETURN_SUCCESS store 0 / 1 in the local `ret` and jump to
 * the `exit` label of the function that expands them.
 *
 * RETURN_ON_ERROR(i)   -- leave via RETURN_ERROR(i) when i is negative.
 * RETURN_ON_SUCCESS(i) -- as RETURN_ON_ERROR, then RETURN_SUCCESS when i > 0.
 * RETURN_ON_FAILURE(i) -- as RETURN_ON_ERROR, then RETURN_FAILURE when i == 0.
 *
 * The argument is parenthesized in every comparison so that an expression
 * argument (e.g. a conditional expression) is compared as a whole.  Note that
 * `i` is still evaluated more than once; pass only side-effect-free
 * expressions.
 */
#define RETURN_FAILURE do { ret = 0; goto exit; } while(0)
#define RETURN_SUCCESS do { ret = 1; goto exit; } while(0)
#define RETURN_ON_ERROR(i) \
    do { if ((i) < 0) RETURN_ERROR(i); } while (0)
#define RETURN_ON_SUCCESS(i) \
    do { RETURN_ON_ERROR(i); if ((i) > 0) RETURN_SUCCESS; } while (0)
#define RETURN_ON_FAILURE(i) \
    do { RETURN_ON_ERROR(i); if ((i) == 0) RETURN_FAILURE; } while (0)
/* SFY(x): turn the macro argument into a string literal; used below to print
 * type names in TRACE output. */
#define SFY(x) #x
/* DATA_STACK_ALLOC(state, type, ptr)
 *
 * Reserve sizeof(type) bytes at the top of state->data_stack and make `ptr`
 * point at them.  The offset of the reservation is left in the local
 * `alloc_pos` of the expanding function.
 *
 * When the stack is too small it is enlarged with data_stack_grow(); a
 * negative result is returned directly from the expanding function.  After a
 * grow, the local `ctx` is looked up again from `ctx_pos` (unless ctx_pos is
 * -1), since data_stack_grow() reallocates the buffer and the old pointer
 * may no longer be valid.
 *
 * Requires the locals `alloc_pos`, `ctx_pos` and `ctx` to be in scope.
 */
#define DATA_STACK_ALLOC(state, type, ptr) \
do { \
alloc_pos = state->data_stack_base; \
TRACE(("allocating %s in %d (%d)\n", \
SFY(type), alloc_pos, sizeof(type))); \
if (state->data_stack_size < alloc_pos+sizeof(type)) { \
int j = data_stack_grow(state, sizeof(type)); \
if (j < 0) return j; \
if (ctx_pos != -1) \
DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
} \
ptr = (type*)(state->data_stack+alloc_pos); \
state->data_stack_base += sizeof(type); \
} while (0)
/* DATA_STACK_LOOKUP(state, type, ptr)
 * Point `ptr` at the object of the given type that occupies the last
 * sizeof(type) bytes below state->data_stack_base (the top of the stack).
 * The stack itself is not modified. */
#define DATA_STACK_LOOKUP(state, type, ptr) \
do { \
TRACE(("looking up %s in %d (%d)\n", SFY(type), \
state->data_stack_base-sizeof(type), sizeof(type))); \
ptr = (type*)(state->data_stack+state->data_stack_base-sizeof(type)); \
} while (0)
/* DATA_STACK_LOOKUP_AT(state, type, ptr, pos)
 * Point `ptr` at the object of the given type stored at byte offset `pos`
 * from the start of state->data_stack.  The stack itself is not modified. */
#define DATA_STACK_LOOKUP_AT(state, type, ptr, pos) \
do { \
TRACE(("looking up %s at %d\n", SFY(type), pos)); \
ptr = (type*)(state->data_stack+pos); \
} while (0)
#define DATA_STACK_PUSH(state, data, size) \
do { \
TRACE(("copy data in %p to %d (%d)\n", \
data, state->data_stack_base, size)); \
if (state->data_stack_size < state->data_stack_base+size) { \
int j = data_stack_grow(state, size); \
if (j < 0) return j; \
if (ctx_pos != -1) \
DATA_STACK_LOOKUP_AT(state, SRE_MATCH_CONTEXT, ctx, ctx_pos); \
} \ } \
memcpy(state->data_stack+state->data_stack_base, data, size); \
state->data_stack_base += size; \
} while (0)
/* DATA_STACK_POP(state, data, size, discard)
 * Copy the top `size` bytes of state->data_stack into `data`.  When
 * `discard` is non-zero the bytes are also removed from the stack by
 * lowering state->data_stack_base; otherwise they stay on the stack. */
#define DATA_STACK_POP(state, data, size, discard) \
do { \
TRACE(("copy data to %p from %d (%d)\n", \
data, state->data_stack_base-size, size)); \
memcpy(data, state->data_stack+state->data_stack_base-size, size); \
if (discard) \
state->data_stack_base -= size; \
} while (0)
/* DATA_STACK_POP_DISCARD(state, size)
 * Remove the top `size` bytes from state->data_stack without copying them
 * anywhere. */
#define DATA_STACK_POP_DISCARD(state, size) \
do { \
TRACE(("discard data from %d (%d)\n", \
state->data_stack_base-size, size)); \
state->data_stack_base -= size; \
} while(0)
/* Convenience wrappers around the DATA_STACK_* macros.  They all operate on
 * the variable named `state` in the expanding scope.
 *
 * DATA_PUSH / DATA_POP / DATA_POP_KEEP / DATA_POP_DISCARD take a pointer `x`
 * and use sizeof(*(x)) as the number of bytes to transfer:
 *   DATA_PUSH(x)         copy *x onto the stack
 *   DATA_POP(x)          copy the top of the stack into *x and remove it
 *   DATA_POP_KEEP(x)     copy the top of the stack into *x, leave it in place
 *   DATA_POP_DISCARD(x)  remove sizeof(*(x)) bytes without copying
 *
 * DATA_ALLOC / DATA_LOOKUP / DATA_LOOKUP_AT take a type `t` and a pointer
 * variable `p` and forward to the corresponding DATA_STACK_* macro.
 */
#define DATA_PUSH(x) \
DATA_STACK_PUSH(state, (x), sizeof(*(x)))
#define DATA_POP(x) \
DATA_STACK_POP(state, (x), sizeof(*(x)), 1)
#define DATA_POP_KEEP(x) \
DATA_STACK_POP(state, (x), sizeof(*(x)), 0)
#define DATA_POP_DISCARD(x) \
DATA_STACK_POP_DISCARD(state, sizeof(*(x)))
#define DATA_ALLOC(t,p) \
DATA_STACK_ALLOC(state, t, p)
#define DATA_LOOKUP(t,p) \
DATA_STACK_LOOKUP(state, t, p)
#define DATA_LOOKUP_AT(t,p,pos) \
DATA_STACK_LOOKUP_AT(state,t,p,pos)
/* MARK_PUSH(lastmark)
 * Save state->mark[0..lastmark] (lastmark+1 pointers) on the data stack.
 * Does nothing when lastmark is not positive.  The argument is first copied
 * into the local `i` of the expanding function, because the push may grow
 * (and therefore move) the data stack on which the argument itself can live. */
#define MARK_PUSH(lastmark) \
do if (lastmark > 0) { \
i = lastmark; /* ctx->lastmark may change if reallocated */ \
DATA_STACK_PUSH(state, state->mark, (i+1)*sizeof(void*)); \
} while (0)
/* MARK_POP(lastmark)
 * Restore state->mark[0..lastmark] from the top of the data stack and remove
 * the saved copy.  Does nothing when lastmark is not positive. */
#define MARK_POP(lastmark) \
do if (lastmark > 0) { \
DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 1); \
} while (0)
#define MARK_POP_KEEP(lastmark) \
do if (lastmark > 0) { \
DATA_STACK_POP(state, state->mark, (lastmark+1)*sizeof(void*), 0); \
} while (0) } while (0)
/* MARK_POP_DISCARD(lastmark)
 * Drop a saved copy of (lastmark+1) mark pointers from the top of the data
 * stack without touching state->mark.  Does nothing when lastmark is not
 * positive. */
#define MARK_POP_DISCARD(lastmark) \
do if (lastmark > 0) { \
DATA_STACK_POP_DISCARD(state, (lastmark+1)*sizeof(void*)); \
} while (0)
/* Identifiers stored in SRE_MATCH_CONTEXT.jump.  DO_JUMP() records one of
 * these in the context it creates; the top-level context uses JUMP_NONE.
 * NOTE(review): each value presumably selects the label at which the caller
 * resumes once the sub-match finishes -- the dispatch code is outside this
 * view, confirm there. */
#define JUMP_NONE 0
#define JUMP_MAX_UNTIL_1 1
#define JUMP_MAX_UNTIL_2 2
#define JUMP_MAX_UNTIL_3 3
#define JUMP_MIN_UNTIL_1 4
#define JUMP_MIN_UNTIL_2 5
#define JUMP_MIN_UNTIL_3 6
#define JUMP_REPEAT 7
#define JUMP_REPEAT_ONE_1 8
#define JUMP_REPEAT_ONE_2 9
#define JUMP_MIN_REPEAT_ONE 10
#define JUMP_BRANCH 11
#define JUMP_ASSERT 12
#define JUMP_ASSERT_NOT 13
/* DO_JUMP(jumpvalue, jumplabel, nextpattern)
 *
 * Start matching `nextpattern` in a fresh context instead of calling
 * SRE_MATCH recursively:
 *   - allocate a new SRE_MATCH_CONTEXT on the data stack,
 *   - record the current context position (last_ctx_pos), the JUMP_* id
 *     (jumpvalue) and the pattern to run,
 *   - make the new context current and jump to the `entrance` label.
 * `jumplabel:` is defined right after the jump.
 * NOTE(review): it is presumably the point where the caller resumes after the
 * sub-match; the code that jumps back is outside this view -- confirm.
 *
 * The macro expands to several statements plus a label, so it must be used
 * as a statement of its own (followed by ';'), never as the unbraced body of
 * an if/else/loop.  It relies on the locals ctx, nextctx, ctx_pos and
 * alloc_pos of the expanding function.
 *
 * The final line deliberately has no line-continuation backslash, so the
 * macro definition ends here regardless of what follows it in the file.
 */
#define DO_JUMP(jumpvalue, jumplabel, nextpattern) \
    DATA_ALLOC(SRE_MATCH_CONTEXT, nextctx); \
    nextctx->last_ctx_pos = ctx_pos; \
    nextctx->jump = jumpvalue; \
    nextctx->pattern = nextpattern; \
    ctx_pos = alloc_pos; \
    ctx = nextctx; \
    goto entrance; \
    jumplabel: \
    while (0) /* gcc doesn't like labels at end of scopes */
/* Per-invocation matching state, kept on state->data_stack in place of a C
 * stack frame.  One context is allocated at the start of SRE_MATCH and one
 * more by every DO_JUMP(). */
typedef struct {
int last_ctx_pos; /* data-stack offset of the context that created this one; -1 for the first */
int jump; /* JUMP_* id recorded when the context was created */
SRE_CHAR* ptr; /* current position in the subject string */
SRE_CODE* pattern; /* current position in the compiled pattern */
int count; /* repeat count used by the REPEAT_ONE / MIN_REPEAT_ONE opcodes */
int lastmark; /* state->lastmark as saved by LASTMARK_SAVE() */
int lastindex; /* state->lastindex as saved by LASTMARK_SAVE() */
union {
SRE_CODE chr; /* literal that starts the tail, used by REPEAT_ONE */
SRE_REPEAT* rep; /* state->repeat as seen on entry to BRANCH */
} u;
} SRE_MATCH_CONTEXT;
/* check if string matches the given pattern. returns <0 for
error, 0 for failure, and 1 for success */
LOCAL(int) LOCAL(int)
SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
{ {
/* check if string matches the given pattern. returns <0 for
error, 0 for failure, and 1 for success */
SRE_CHAR* end = state->end; SRE_CHAR* end = state->end;
SRE_CHAR* ptr = state->ptr; int alloc_pos, ctx_pos = -1;
int i, count; int i, ret = 0;
SRE_REPEAT* rp; int jump;
int lastmark, lastindex, mark_stack_base;
SRE_CODE chr;
SRE_REPEAT rep; /* FIXME: <fl> allocate in STATE instead */ SRE_MATCH_CONTEXT* ctx;
SRE_MATCH_CONTEXT* nextctx;
TRACE(("|%p|%p|ENTER %d\n", pattern, ptr, level)); TRACE(("|%p|%p|ENTER %d\n", pattern, state->ptr, level));
#if defined(USE_STACKCHECK) #if defined(USE_STACKCHECK)
if (level % 10 == 0 && PyOS_CheckStack()) if (level % 10 == 0 && PyOS_CheckStack())
...@@ -764,241 +860,204 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) ...@@ -764,241 +860,204 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
return SRE_ERROR_RECURSION_LIMIT; return SRE_ERROR_RECURSION_LIMIT;
#endif #endif
if (pattern[0] == SRE_OP_INFO) { DATA_ALLOC(SRE_MATCH_CONTEXT, ctx);
ctx->last_ctx_pos = -1;
ctx->jump = JUMP_NONE;
ctx->pattern = pattern;
ctx_pos = alloc_pos;
entrance:
ctx->ptr = state->ptr;
if (ctx->pattern[0] == SRE_OP_INFO) {
/* optimization info block */ /* optimization info block */
/* <INFO> <1=skip> <2=flags> <3=min> ... */ /* <INFO> <1=skip> <2=flags> <3=min> ... */
if (pattern[3] && (end - ptr) < pattern[3]) { if (ctx->pattern[3] && (end - ctx->ptr) < ctx->pattern[3]) {
TRACE(("reject (got %d chars, need %d)\n", TRACE(("reject (got %d chars, need %d)\n",
(end - ptr), pattern[3])); (end - ctx->ptr), ctx->pattern[3]));
return 0; RETURN_FAILURE;
} }
pattern += pattern[1] + 1; ctx->pattern += ctx->pattern[1] + 1;
} }
for (;;) { for (;;) {
switch (*pattern++) { switch (*ctx->pattern++) {
case SRE_OP_FAILURE: case SRE_OP_MARK:
/* immediate failure */ /* set mark */
TRACE(("|%p|%p|FAILURE\n", pattern, ptr)); /* <MARK> <gid> */
return 0; TRACE(("|%p|%p|MARK %d\n", ctx->pattern,
ctx->ptr, ctx->pattern[0]));
i = ctx->pattern[0];
if (i & 1)
state->lastindex = i/2 + 1;
if (i > state->lastmark) {
/* state->lastmark is the highest valid index in the
state->mark array. If it is increased by more than 1,
the intervening marks must be set to NULL to signal
that these marks have not been encountered. */
int j = state->lastmark + 1;
while (j < i)
state->mark[j++] = NULL;
state->lastmark = i;
}
state->mark[i] = ctx->ptr;
ctx->pattern++;
break;
case SRE_OP_LITERAL:
/* match literal string */
/* <LITERAL> <code> */
TRACE(("|%p|%p|LITERAL %d\n", ctx->pattern,
ctx->ptr, *ctx->pattern));
if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] != ctx->pattern[0])
RETURN_FAILURE;
ctx->pattern++;
ctx->ptr++;
break;
case SRE_OP_NOT_LITERAL:
/* match anything that is not literal character */
/* <NOT_LITERAL> <code> */
TRACE(("|%p|%p|NOT_LITERAL %d\n", ctx->pattern,
ctx->ptr, *ctx->pattern));
if (ctx->ptr >= end || (SRE_CODE) ctx->ptr[0] == ctx->pattern[0])
RETURN_FAILURE;
ctx->pattern++;
ctx->ptr++;
break;
case SRE_OP_SUCCESS: case SRE_OP_SUCCESS:
/* end of pattern */ /* end of pattern */
TRACE(("|%p|%p|SUCCESS\n", pattern, ptr)); TRACE(("|%p|%p|SUCCESS\n", ctx->pattern, ctx->ptr));
state->ptr = ptr; state->ptr = ctx->ptr;
return 1; RETURN_SUCCESS;
case SRE_OP_AT: case SRE_OP_AT:
/* match at given position */ /* match at given position */
/* <AT> <code> */ /* <AT> <code> */
TRACE(("|%p|%p|AT %d\n", pattern, ptr, *pattern)); TRACE(("|%p|%p|AT %d\n", ctx->pattern, ctx->ptr, *ctx->pattern));
if (!SRE_AT(state, ptr, *pattern)) if (!SRE_AT(state, ctx->ptr, *ctx->pattern))
return 0; RETURN_FAILURE;
pattern++; ctx->pattern++;
break; break;
case SRE_OP_CATEGORY: case SRE_OP_CATEGORY:
/* match at given category */ /* match at given category */
/* <CATEGORY> <code> */ /* <CATEGORY> <code> */
TRACE(("|%p|%p|CATEGORY %d\n", pattern, ptr, *pattern)); TRACE(("|%p|%p|CATEGORY %d\n", ctx->pattern,
if (ptr >= end || !sre_category(pattern[0], ptr[0])) ctx->ptr, *ctx->pattern));
return 0; if (ctx->ptr >= end || !sre_category(ctx->pattern[0], ctx->ptr[0]))
pattern++; RETURN_FAILURE;
ptr++; ctx->pattern++;
break; ctx->ptr++;
case SRE_OP_LITERAL:
/* match literal string */
/* <LITERAL> <code> */
TRACE(("|%p|%p|LITERAL %d\n", pattern, ptr, *pattern));
if (ptr >= end || (SRE_CODE) ptr[0] != pattern[0])
return 0;
pattern++;
ptr++;
break;
case SRE_OP_NOT_LITERAL:
/* match anything that is not literal character */
/* <NOT_LITERAL> <code> */
TRACE(("|%p|%p|NOT_LITERAL %d\n", pattern, ptr, *pattern));
if (ptr >= end || (SRE_CODE) ptr[0] == pattern[0])
return 0;
pattern++;
ptr++;
break; break;
case SRE_OP_ANY: case SRE_OP_ANY:
/* match anything (except a newline) */ /* match anything (except a newline) */
/* <ANY> */ /* <ANY> */
TRACE(("|%p|%p|ANY\n", pattern, ptr)); TRACE(("|%p|%p|ANY\n", ctx->pattern, ctx->ptr));
if (ptr >= end || SRE_IS_LINEBREAK(ptr[0])) if (ctx->ptr >= end || SRE_IS_LINEBREAK(ctx->ptr[0]))
return 0; RETURN_FAILURE;
ptr++; ctx->ptr++;
break; break;
case SRE_OP_ANY_ALL: case SRE_OP_ANY_ALL:
/* match anything */ /* match anything */
/* <ANY_ALL> */ /* <ANY_ALL> */
TRACE(("|%p|%p|ANY_ALL\n", pattern, ptr)); TRACE(("|%p|%p|ANY_ALL\n", ctx->pattern, ctx->ptr));
if (ptr >= end) if (ctx->ptr >= end)
return 0; RETURN_FAILURE;
ptr++; ctx->ptr++;
break; break;
case SRE_OP_IN: case SRE_OP_IN:
/* match set member (or non_member) */ /* match set member (or non_member) */
/* <IN> <skip> <set> */ /* <IN> <skip> <set> */
TRACE(("|%p|%p|IN\n", pattern, ptr)); TRACE(("|%p|%p|IN\n", ctx->pattern, ctx->ptr));
if (ptr >= end || !SRE_CHARSET(pattern + 1, *ptr)) if (ctx->ptr >= end || !SRE_CHARSET(ctx->pattern + 1, *ctx->ptr))
return 0; RETURN_FAILURE;
pattern += pattern[0]; ctx->pattern += ctx->pattern[0];
ptr++; ctx->ptr++;
break;
case SRE_OP_GROUPREF:
/* match backreference */
TRACE(("|%p|%p|GROUPREF %d\n", pattern, ptr, pattern[0]));
i = pattern[0];
{
SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
if (!p || !e || e < p)
return 0;
while (p < e) {
if (ptr >= end || *ptr != *p)
return 0;
p++; ptr++;
}
}
pattern++;
break;
case SRE_OP_GROUPREF_IGNORE:
/* match backreference */
TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", pattern, ptr, pattern[0]));
i = pattern[0];
{
SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
if (!p || !e || e < p)
return 0;
while (p < e) {
if (ptr >= end ||
state->lower(*ptr) != state->lower(*p))
return 0;
p++; ptr++;
}
}
pattern++;
break; break;
case SRE_OP_LITERAL_IGNORE: case SRE_OP_LITERAL_IGNORE:
TRACE(("|%p|%p|LITERAL_IGNORE %d\n", pattern, ptr, pattern[0])); TRACE(("|%p|%p|LITERAL_IGNORE %d\n",
if (ptr >= end || ctx->pattern, ctx->ptr, ctx->pattern[0]));
state->lower(*ptr) != state->lower(*pattern)) if (ctx->ptr >= end ||
return 0; state->lower(*ctx->ptr) != state->lower(*ctx->pattern))
pattern++; RETURN_FAILURE;
ptr++; ctx->pattern++;
ctx->ptr++;
break; break;
case SRE_OP_NOT_LITERAL_IGNORE: case SRE_OP_NOT_LITERAL_IGNORE:
TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n", pattern, ptr, *pattern)); TRACE(("|%p|%p|NOT_LITERAL_IGNORE %d\n",
if (ptr >= end || ctx->pattern, ctx->ptr, *ctx->pattern));
state->lower(*ptr) == state->lower(*pattern)) if (ctx->ptr >= end ||
return 0; state->lower(*ctx->ptr) == state->lower(*ctx->pattern))
pattern++; RETURN_FAILURE;
ptr++; ctx->pattern++;
ctx->ptr++;
break; break;
case SRE_OP_IN_IGNORE: case SRE_OP_IN_IGNORE:
TRACE(("|%p|%p|IN_IGNORE\n", pattern, ptr)); TRACE(("|%p|%p|IN_IGNORE\n", ctx->pattern, ctx->ptr));
if (ptr >= end if (ctx->ptr >= end
|| !SRE_CHARSET(pattern + 1, (SRE_CODE) state->lower(*ptr))) || !SRE_CHARSET(ctx->pattern+1,
return 0; (SRE_CODE)state->lower(*ctx->ptr)))
pattern += pattern[0]; RETURN_FAILURE;
ptr++; ctx->pattern += ctx->pattern[0];
break; ctx->ptr++;
case SRE_OP_MARK:
/* set mark */
/* <MARK> <gid> */
TRACE(("|%p|%p|MARK %d\n", pattern, ptr, pattern[0]));
i = pattern[0];
if (i & 1)
state->lastindex = i/2 + 1;
if (i > state->lastmark)
state->lastmark = i;
state->mark[i] = ptr;
pattern++;
break; break;
case SRE_OP_JUMP: case SRE_OP_JUMP:
case SRE_OP_INFO: case SRE_OP_INFO:
/* jump forward */ /* jump forward */
/* <JUMP> <offset> */ /* <JUMP> <offset> */
TRACE(("|%p|%p|JUMP %d\n", pattern, ptr, pattern[0])); TRACE(("|%p|%p|JUMP %d\n", ctx->pattern,
pattern += pattern[0]; ctx->ptr, ctx->pattern[0]));
break; ctx->pattern += ctx->pattern[0];
case SRE_OP_ASSERT:
/* assert subpattern */
/* <ASSERT> <skip> <back> <pattern> */
TRACE(("|%p|%p|ASSERT %d\n", pattern, ptr, pattern[1]));
state->ptr = ptr - pattern[1];
if (state->ptr < state->beginning)
return 0;
i = SRE_MATCH(state, pattern + 2, level + 1);
if (i <= 0)
return i;
pattern += pattern[0];
break;
case SRE_OP_ASSERT_NOT:
/* assert not subpattern */
/* <ASSERT_NOT> <skip> <back> <pattern> */
TRACE(("|%p|%p|ASSERT_NOT %d\n", pattern, ptr, pattern[1]));
state->ptr = ptr - pattern[1];
if (state->ptr >= state->beginning) {
i = SRE_MATCH(state, pattern + 2, level + 1);
if (i < 0)
return i;
if (i)
return 0;
}
pattern += pattern[0];
break; break;
case SRE_OP_BRANCH: case SRE_OP_BRANCH:
/* alternation */ /* alternation */
/* <BRANCH> <0=skip> code <JUMP> ... <NULL> */ /* <BRANCH> <0=skip> code <JUMP> ... <NULL> */
TRACE(("|%p|%p|BRANCH\n", pattern, ptr)); TRACE(("|%p|%p|BRANCH\n", ctx->pattern, ctx->ptr));
LASTMARK_SAVE(); LASTMARK_SAVE();
if (state->repeat) { ctx->u.rep = state->repeat;
i = mark_save(state, 0, lastmark, &mark_stack_base); if (ctx->u.rep)
if (i < 0) MARK_PUSH(ctx->lastmark);
return i; for (; ctx->pattern[0]; ctx->pattern += ctx->pattern[0]) {
} if (ctx->pattern[1] == SRE_OP_LITERAL &&
for (; pattern[0]; pattern += pattern[0]) { (ctx->ptr >= end ||
if (pattern[1] == SRE_OP_LITERAL && (SRE_CODE) *ctx->ptr != ctx->pattern[2]))
(ptr >= end || (SRE_CODE) *ptr != pattern[2]))
continue; continue;
if (pattern[1] == SRE_OP_IN && if (ctx->pattern[1] == SRE_OP_IN &&
(ptr >= end || !SRE_CHARSET(pattern + 3, (SRE_CODE) *ptr))) (ctx->ptr >= end ||
!SRE_CHARSET(ctx->pattern + 3, (SRE_CODE) *ctx->ptr)))
continue; continue;
state->ptr = ptr; state->ptr = ctx->ptr;
i = SRE_MATCH(state, pattern + 1, level + 1); #ifdef USE_RECURSION
if (i) ret = SRE_MATCH(state, ctx->pattern+1, level+1);
return i; #else
if (state->repeat) { DO_JUMP(JUMP_BRANCH, jump_branch, ctx->pattern+1);
i = mark_restore(state, 0, lastmark, &mark_stack_base); #endif
if (i < 0) if (ret) {
return i; if (ctx->u.rep)
} MARK_POP_DISCARD(ctx->lastmark);
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
if (ctx->u.rep)
MARK_POP_KEEP(ctx->lastmark);
LASTMARK_RESTORE(); LASTMARK_RESTORE();
} }
return 0; if (ctx->u.rep)
MARK_POP_DISCARD(ctx->lastmark);
RETURN_FAILURE;
case SRE_OP_REPEAT_ONE: case SRE_OP_REPEAT_ONE:
/* match repeated sequence (maximizing regexp) */ /* match repeated sequence (maximizing regexp) */
...@@ -1010,70 +1069,88 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) ...@@ -1010,70 +1069,88 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
/* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */ /* <REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
TRACE(("|%p|%p|REPEAT_ONE %d %d\n", pattern, ptr, TRACE(("|%p|%p|REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
pattern[1], pattern[2])); ctx->pattern[1], ctx->pattern[2]));
if (ptr + pattern[1] > end) if (ctx->ptr + ctx->pattern[1] > end)
return 0; /* cannot match */ RETURN_FAILURE; /* cannot match */
state->ptr = ptr; state->ptr = ctx->ptr;
count = SRE_COUNT(state, pattern + 3, pattern[2], level + 1); ctx->count = SRE_COUNT(state, ctx->pattern+3, ctx->pattern[2],
if (count < 0) level+1);
return count; RETURN_ON_ERROR(ctx->count);
ptr += count; ctx->ptr += ctx->count;
/* when we arrive here, count contains the number of /* when we arrive here, count contains the number of
matches, and ptr points to the tail of the target matches, and ctx->ptr points to the tail of the target
string. check if the rest of the pattern matches, string. check if the rest of the pattern matches,
and backtrack if not. */ and backtrack if not. */
if (count < (int) pattern[1]) if (ctx->count < (int) ctx->pattern[1])
return 0; RETURN_FAILURE;
if (pattern[pattern[0]] == SRE_OP_SUCCESS) { if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
/* tail is empty. we're finished */ /* tail is empty. we're finished */
state->ptr = ptr; state->ptr = ctx->ptr;
return 1; RETURN_SUCCESS;
} }
LASTMARK_SAVE(); LASTMARK_SAVE();
if (pattern[pattern[0]] == SRE_OP_LITERAL) { if (ctx->pattern[ctx->pattern[0]] == SRE_OP_LITERAL) {
/* tail starts with a literal. skip positions where /* tail starts with a literal. skip positions where
the rest of the pattern cannot possibly match */ the rest of the pattern cannot possibly match */
chr = pattern[pattern[0]+1]; ctx->u.chr = ctx->pattern[ctx->pattern[0]+1];
for (;;) { for (;;) {
while (count >= (int) pattern[1] && while (ctx->count >= (int) ctx->pattern[1] &&
(ptr >= end || *ptr != chr)) { (ctx->ptr >= end || *ctx->ptr != ctx->u.chr)) {
ptr--; ctx->ptr--;
count--; ctx->count--;
} }
if (count < (int) pattern[1]) if (ctx->count < (int) ctx->pattern[1])
break; break;
state->ptr = ptr; state->ptr = ctx->ptr;
i = SRE_MATCH(state, pattern + pattern[0], level + 1); #ifdef USE_RECURSION
if (i) ret = SRE_MATCH(state, ctx->pattern+ctx->pattern[0],
return i; level+1);
ptr--; #else
count--; DO_JUMP(JUMP_REPEAT_ONE_1, jump_repeat_one_1,
ctx->pattern+ctx->pattern[0]);
#endif
if (ret) {
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
LASTMARK_RESTORE(); LASTMARK_RESTORE();
ctx->ptr--;
ctx->count--;
} }
} else { } else {
/* general case */ /* general case */
while (count >= (int) pattern[1]) { while (ctx->count >= (int) ctx->pattern[1]) {
state->ptr = ptr; state->ptr = ctx->ptr;
i = SRE_MATCH(state, pattern + pattern[0], level + 1); #ifdef USE_RECURSION
if (i) ret = SRE_MATCH(state, ctx->pattern+ctx->pattern[0],
return i; level+1);
ptr--; #else
count--; DO_JUMP(JUMP_REPEAT_ONE_2, jump_repeat_one_2,
ctx->pattern+ctx->pattern[0]);
#endif
if (ret) {
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
ctx->ptr--;
ctx->count--;
LASTMARK_RESTORE(); LASTMARK_RESTORE();
} }
} }
return 0; RETURN_FAILURE;
case SRE_OP_MIN_REPEAT_ONE: case SRE_OP_MIN_REPEAT_ONE:
/* match repeated sequence (minimizing regexp) */ /* match repeated sequence (minimizing regexp) */
...@@ -1085,76 +1162,92 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) ...@@ -1085,76 +1162,92 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
/* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */ /* <MIN_REPEAT_ONE> <skip> <1=min> <2=max> item <SUCCESS> tail */
TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", pattern, ptr, TRACE(("|%p|%p|MIN_REPEAT_ONE %d %d\n", ctx->pattern, ctx->ptr,
pattern[1], pattern[2])); ctx->pattern[1], ctx->pattern[2]));
if (ptr + pattern[1] > end) if (ctx->ptr + ctx->pattern[1] > end)
return 0; /* cannot match */ RETURN_FAILURE; /* cannot match */
state->ptr = ptr; state->ptr = ctx->ptr;
if (pattern[1] == 0) if (ctx->pattern[1] == 0)
count = 0; ctx->count = 0;
else { else {
/* count using pattern min as the maximum */ /* count using pattern min as the maximum */
count = SRE_COUNT(state, pattern + 3, pattern[1], level + 1); ctx->count = SRE_COUNT(state, ctx->pattern+3,
ctx->pattern[1], level+1);
if (count < 0) RETURN_ON_ERROR(ctx->count);
return count; /* exception */ if (ctx->count < (int) ctx->pattern[1])
if (count < (int) pattern[1]) /* didn't match minimum number of times */
return 0; /* did not match minimum number of times */ RETURN_FAILURE;
ptr += count; /* advance past minimum matches of repeat */ /* advance past minimum matches of repeat */
ctx->ptr += ctx->count;
} }
if (pattern[pattern[0]] == SRE_OP_SUCCESS) { if (ctx->pattern[ctx->pattern[0]] == SRE_OP_SUCCESS) {
/* tail is empty. we're finished */ /* tail is empty. we're finished */
state->ptr = ptr; state->ptr = ctx->ptr;
return 1; RETURN_SUCCESS;
} else { } else {
/* general case */ /* general case */
int matchmax = ((int)pattern[2] == 65535);
int c;
LASTMARK_SAVE(); LASTMARK_SAVE();
while (matchmax || count <= (int) pattern[2]) { while ((int)ctx->pattern[2] == 65535
state->ptr = ptr; || ctx->count <= (int)ctx->pattern[2]) {
i = SRE_MATCH(state, pattern + pattern[0], level + 1); state->ptr = ctx->ptr;
if (i) #ifdef USE_RECURSION
return i; ret = SRE_MATCH(state, ctx->pattern+ctx->pattern[0],
state->ptr = ptr; level+1);
c = SRE_COUNT(state, pattern+3, 1, level+1); #else
if (c < 0) DO_JUMP(JUMP_MIN_REPEAT_ONE,jump_min_repeat_one,
return c; ctx->pattern+ctx->pattern[0]);
if (c == 0) #endif
if (ret) {
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
state->ptr = ctx->ptr;
ret = SRE_COUNT(state, ctx->pattern+3, 1, level+1);
RETURN_ON_ERROR(ret);
if (ret == 0)
break; break;
assert(c == 1); assert(ret == 1);
ptr++; ctx->ptr++;
count++; ctx->count++;
LASTMARK_RESTORE(); LASTMARK_RESTORE();
} }
} }
return 0; RETURN_FAILURE;
case SRE_OP_REPEAT: case SRE_OP_REPEAT:
/* create repeat context. all the hard work is done /* create repeat context. all the hard work is done
by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */ by the UNTIL operator (MAX_UNTIL, MIN_UNTIL) */
/* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */ /* <REPEAT> <skip> <1=min> <2=max> item <UNTIL> tail */
TRACE(("|%p|%p|REPEAT %d %d\n", pattern, ptr, TRACE(("|%p|%p|REPEAT %d %d\n", ctx->pattern, ctx->ptr,
pattern[1], pattern[2])); ctx->pattern[1], ctx->pattern[2]));
rep.count = -1;
rep.pattern = pattern;
/* install new repeat context */ /* install new repeat context */
rep.prev = state->repeat; ctx->u.rep = (SRE_REPEAT*) malloc(sizeof(*ctx->u.rep));
state->repeat = &rep; ctx->u.rep->count = -1;
ctx->u.rep->pattern = ctx->pattern;
state->ptr = ptr; ctx->u.rep->prev = state->repeat;
i = SRE_MATCH(state, pattern + pattern[0], level + 1); ctx->u.rep->last_ptr = NULL;
state->repeat = ctx->u.rep;
state->repeat = rep.prev;
state->ptr = ctx->ptr;
#ifdef USE_RECURSION
ret = SRE_MATCH(state, ctx->pattern+ctx->pattern[0], level+1);
#else
DO_JUMP(JUMP_REPEAT, jump_repeat, ctx->pattern+ctx->pattern[0]);
#endif
state->repeat = ctx->u.rep->prev;
free(ctx->u.rep);
return i; if (ret) {
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
RETURN_FAILURE;
case SRE_OP_MAX_UNTIL: case SRE_OP_MAX_UNTIL:
/* maximizing repeat */ /* maximizing repeat */
...@@ -1163,119 +1256,328 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level) ...@@ -1163,119 +1256,328 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern, int level)
/* FIXME: we probably need to deal with zero-width /* FIXME: we probably need to deal with zero-width
matches in here... */ matches in here... */
rp = state->repeat; ctx->u.rep = state->repeat;
if (!rp) if (!ctx->u.rep)
return SRE_ERROR_STATE; RETURN_ERROR(SRE_ERROR_STATE);
state->ptr = ptr; state->ptr = ctx->ptr;
count = rp->count + 1; ctx->count = ctx->u.rep->count+1;
TRACE(("|%p|%p|MAX_UNTIL %d\n", pattern, ptr, count)); TRACE(("|%p|%p|MAX_UNTIL %d\n", ctx->pattern,
ctx->ptr, ctx->count));
if (count < rp->pattern[1]) { if (ctx->count < ctx->u.rep->pattern[1]) {
/* not enough matches */ /* not enough matches */
rp->count = count; ctx->u.rep->count = ctx->count;
#ifdef USE_RECURSION
/* RECURSIVE */ /* RECURSIVE */
i = SRE_MATCH(state, rp->pattern + 3, level + 1); ret = SRE_MATCH(state, ctx->u.rep->pattern+3, level+1);
if (i) #else
return i; DO_JUMP(JUMP_MAX_UNTIL_1, jump_max_until_1,
rp->count = count - 1; ctx->u.rep->pattern+3);
state->ptr = ptr; #endif
return 0; if (ret) {
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
ctx->u.rep->count = ctx->count-1;
state->ptr = ctx->ptr;
RETURN_FAILURE;
} }
if (count < rp->pattern[2] || rp->pattern[2] == 65535) { if ((ctx->count < ctx->u.rep->pattern[2] ||
ctx->u.rep->pattern[2] == 65535) &&
state->ptr != ctx->u.rep->last_ptr) {
/* we may have enough matches, but if we can /* we may have enough matches, but if we can
match another item, do so */ match another item, do so */
rp->count = count; ctx->u.rep->count = ctx->count;
LASTMARK_SAVE(); LASTMARK_SAVE();
i = mark_save(state, 0, lastmark, &mark_stack_base); MARK_PUSH(ctx->lastmark);
if (i < 0) /* zero-width match protection */
return i; DATA_PUSH(&ctx->u.rep->last_ptr);
ctx->u.rep->last_ptr = state->ptr;
#ifdef USE_RECURSION
/* RECURSIVE */ /* RECURSIVE */
i = SRE_MATCH(state, rp->pattern + 3, level + 1); ret = SRE_MATCH(state, ctx->u.rep->pattern+3, level+1);
if (i) #else
return i; DO_JUMP(JUMP_MAX_UNTIL_2, jump_max_until_2,
i = mark_restore(state, 0, lastmark, &mark_stack_base); ctx->u.rep->pattern+3);
if (i < 0) #endif
return i; DATA_POP(&ctx->u.rep->last_ptr);
if (ret) {
MARK_POP_DISCARD(ctx->lastmark);
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
MARK_POP(ctx->lastmark);
LASTMARK_RESTORE(); LASTMARK_RESTORE();
rp->count = count - 1; ctx->u.rep->count = ctx->count-1;
state->ptr = ptr; state->ptr = ctx->ptr;
} }
/* cannot match more repeated items here. make sure the /* cannot match more repeated items here. make sure the
tail matches */ tail matches */
state->repeat = rp->prev; state->repeat = ctx->u.rep->prev;
i = SRE_MATCH(state, pattern, level + 1); #ifdef USE_RECURSION
if (i) ret = SRE_MATCH(state, ctx->pattern, level+1);
return i; #else
state->repeat = rp; DO_JUMP(JUMP_MAX_UNTIL_3, jump_max_until_3, ctx->pattern);
state->ptr = ptr; #endif
return 0; RETURN_ON_SUCCESS(ret);
state->repeat = ctx->u.rep;
state->ptr = ctx->ptr;
RETURN_FAILURE;
case SRE_OP_MIN_UNTIL: case SRE_OP_MIN_UNTIL:
/* minimizing repeat */ /* minimizing repeat */
/* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */ /* <REPEAT> <skip> <1=min> <2=max> item <MIN_UNTIL> tail */
rp = state->repeat; ctx->u.rep = state->repeat;
if (!rp) if (!ctx->u.rep)
return SRE_ERROR_STATE; RETURN_ERROR(SRE_ERROR_STATE);
state->ptr = ptr; state->ptr = ctx->ptr;
count = rp->count + 1; ctx->count = ctx->u.rep->count+1;
TRACE(("|%p|%p|MIN_UNTIL %d %p\n", pattern, ptr, count, TRACE(("|%p|%p|MIN_UNTIL %d %p\n", ctx->pattern,
rp->pattern)); ctx->ptr, ctx->count, ctx->u.rep->pattern));
if (count < rp->pattern[1]) { if (ctx->count < ctx->u.rep->pattern[1]) {
/* not enough matches */ /* not enough matches */
rp->count = count; ctx->u.rep->count = ctx->count;
#ifdef USE_RECURSION
/* RECURSIVE */ /* RECURSIVE */
i = SRE_MATCH(state, rp->pattern + 3, level + 1); ret = SRE_MATCH(state, ctx->u.rep->pattern+3, level+1);
if (i) #else
return i; DO_JUMP(JUMP_MIN_UNTIL_1, jump_min_until_1,
rp->count = count-1; ctx->u.rep->pattern+3);
state->ptr = ptr; #endif
return 0; if (ret) {
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
ctx->u.rep->count = ctx->count-1;
state->ptr = ctx->ptr;
RETURN_FAILURE;
} }
LASTMARK_SAVE(); LASTMARK_SAVE();
/* see if the tail matches */ /* see if the tail matches */
state->repeat = rp->prev; state->repeat = ctx->u.rep->prev;
i = SRE_MATCH(state, pattern, level + 1); #ifdef USE_RECURSION
if (i) ret = SRE_MATCH(state, ctx->pattern, level+1);
return i; #else
DO_JUMP(JUMP_MIN_UNTIL_2, jump_min_until_2, ctx->pattern);
state->ptr = ptr; #endif
state->repeat = rp; if (ret) {
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
if (count >= rp->pattern[2] && rp->pattern[2] != 65535) state->repeat = ctx->u.rep;
return 0; state->ptr = ctx->ptr;
LASTMARK_RESTORE(); LASTMARK_RESTORE();
rp->count = count; if (ctx->count >= ctx->u.rep->pattern[2]
&& ctx->u.rep->pattern[2] != 65535)
RETURN_FAILURE;
ctx->u.rep->count = ctx->count;
#ifdef USE_RECURSION
/* RECURSIVE */ /* RECURSIVE */
i = SRE_MATCH(state, rp->pattern + 3, level + 1); ret = SRE_MATCH(state, ctx->u.rep->pattern+3, level+1);
if (i) #else
return i; DO_JUMP(JUMP_MIN_UNTIL_3,jump_min_until_3,
rp->count = count - 1; ctx->u.rep->pattern+3);
state->ptr = ptr; #endif
if (ret) {
RETURN_ON_ERROR(ret);
RETURN_SUCCESS;
}
ctx->u.rep->count = ctx->count-1;
state->ptr = ctx->ptr;
RETURN_FAILURE;
return 0; case SRE_OP_GROUPREF:
/* match backreference */
TRACE(("|%p|%p|GROUPREF %d\n", ctx->pattern,
ctx->ptr, ctx->pattern[0]));
i = ctx->pattern[0];
{
int groupref = i+i;
if (groupref >= state->lastmark) {
RETURN_FAILURE;
} else {
SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
if (!p || !e || e < p)
RETURN_FAILURE;
while (p < e) {
if (ctx->ptr >= end || *ctx->ptr != *p)
RETURN_FAILURE;
p++; ctx->ptr++;
}
}
}
ctx->pattern++;
break;
default: case SRE_OP_GROUPREF_IGNORE:
TRACE(("|%p|%p|UNKNOWN %d\n", pattern, ptr, pattern[-1])); /* match backreference */
return SRE_ERROR_ILLEGAL; TRACE(("|%p|%p|GROUPREF_IGNORE %d\n", ctx->pattern,
ctx->ptr, ctx->pattern[0]));
i = ctx->pattern[0];
{
int groupref = i+i;
if (groupref >= state->lastmark) {
RETURN_FAILURE;
} else {
SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
if (!p || !e || e < p)
RETURN_FAILURE;
while (p < e) {
if (ctx->ptr >= end ||
state->lower(*ctx->ptr) != state->lower(*p))
RETURN_FAILURE;
p++; ctx->ptr++;
}
}
}
ctx->pattern++;
break;
case SRE_OP_GROUPREF_EXISTS:
TRACE(("|%p|%p|GROUPREF_EXISTS %d\n", ctx->pattern,
ctx->ptr, ctx->pattern[0]));
/* <GROUPREF_EXISTS> <group> <skip> codeyes <JUMP> codeno ... */
i = ctx->pattern[0];
{
int groupref = i+i;
if (groupref >= state->lastmark) {
ctx->pattern += ctx->pattern[1];
break;
} else {
SRE_CHAR* p = (SRE_CHAR*) state->mark[groupref];
SRE_CHAR* e = (SRE_CHAR*) state->mark[groupref+1];
if (!p || !e || e < p) {
ctx->pattern += ctx->pattern[1];
break;
}
}
}
ctx->pattern += 2;
break;
case SRE_OP_ASSERT:
/* assert subpattern */
/* <ASSERT> <skip> <back> <pattern> */
TRACE(("|%p|%p|ASSERT %d\n", ctx->pattern,
ctx->ptr, ctx->pattern[1]));
state->ptr = ctx->ptr - ctx->pattern[1];
if (state->ptr < state->beginning)
RETURN_FAILURE;
#ifdef USE_RECURSION
ret = SRE_MATCH(state, ctx->pattern+2, level+1);
#else
DO_JUMP(JUMP_ASSERT, jump_assert, ctx->pattern+2);
#endif
RETURN_ON_FAILURE(ret);
ctx->pattern += ctx->pattern[0];
break;
case SRE_OP_ASSERT_NOT:
/* assert not subpattern */
/* <ASSERT_NOT> <skip> <back> <pattern> */
TRACE(("|%p|%p|ASSERT_NOT %d\n", ctx->pattern,
ctx->ptr, ctx->pattern[1]));
state->ptr = ctx->ptr - ctx->pattern[1];
if (state->ptr >= state->beginning) {
#ifdef USE_RECURSION
ret = SRE_MATCH(state, ctx->pattern+2, level+1);
#else
DO_JUMP(JUMP_ASSERT_NOT, jump_assert_not, ctx->pattern+2);
#endif
if (ret) {
RETURN_ON_ERROR(ret);
RETURN_FAILURE;
} }
} }
ctx->pattern += ctx->pattern[0];
break;
case SRE_OP_FAILURE:
/* immediate failure */
TRACE(("|%p|%p|FAILURE\n", ctx->pattern, ctx->ptr));
RETURN_FAILURE;
default:
TRACE(("|%p|%p|UNKNOWN %d\n", ctx->pattern, ctx->ptr,
ctx->pattern[-1]));
RETURN_ERROR(SRE_ERROR_ILLEGAL);
}
}
exit:
ctx_pos = ctx->last_ctx_pos;
jump = ctx->jump;
DATA_POP_DISCARD(ctx);
if (ctx_pos == -1)
return ret;
DATA_LOOKUP_AT(SRE_MATCH_CONTEXT, ctx, ctx_pos);
#ifndef USE_RECURSION
switch (jump) {
case JUMP_MAX_UNTIL_2:
TRACE(("|%p|%p|JUMP_MAX_UNTIL_2\n", ctx->pattern, ctx->ptr));
goto jump_max_until_2;
case JUMP_MAX_UNTIL_3:
TRACE(("|%p|%p|JUMP_MAX_UNTIL_3\n", ctx->pattern, ctx->ptr));
goto jump_max_until_3;
case JUMP_MIN_UNTIL_2:
TRACE(("|%p|%p|JUMP_MIN_UNTIL_2\n", ctx->pattern, ctx->ptr));
goto jump_min_until_2;
case JUMP_MIN_UNTIL_3:
TRACE(("|%p|%p|JUMP_MIN_UNTIL_3\n", ctx->pattern, ctx->ptr));
goto jump_min_until_3;
case JUMP_BRANCH:
TRACE(("|%p|%p|JUMP_BRANCH\n", ctx->pattern, ctx->ptr));
goto jump_branch;
case JUMP_MAX_UNTIL_1:
TRACE(("|%p|%p|JUMP_MAX_UNTIL_1\n", ctx->pattern, ctx->ptr));
goto jump_max_until_1;
case JUMP_MIN_UNTIL_1:
TRACE(("|%p|%p|JUMP_MIN_UNTIL_1\n", ctx->pattern, ctx->ptr));
goto jump_min_until_1;
case JUMP_REPEAT:
TRACE(("|%p|%p|JUMP_REPEAT\n", ctx->pattern, ctx->ptr));
goto jump_repeat;
case JUMP_REPEAT_ONE_1:
TRACE(("|%p|%p|JUMP_REPEAT_ONE_1\n", ctx->pattern, ctx->ptr));
goto jump_repeat_one_1;
case JUMP_REPEAT_ONE_2:
TRACE(("|%p|%p|JUMP_REPEAT_ONE_2\n", ctx->pattern, ctx->ptr));
goto jump_repeat_one_2;
case JUMP_MIN_REPEAT_ONE:
TRACE(("|%p|%p|JUMP_MIN_REPEAT_ONE\n", ctx->pattern, ctx->ptr));
goto jump_min_repeat_one;
case JUMP_ASSERT:
TRACE(("|%p|%p|JUMP_ASSERT\n", ctx->pattern, ctx->ptr));
goto jump_assert;
case JUMP_ASSERT_NOT:
TRACE(("|%p|%p|JUMP_ASSERT_NOT\n", ctx->pattern, ctx->ptr));
goto jump_assert_not;
case JUMP_NONE:
TRACE(("|%p|%p|RETURN %d\n", ctx->pattern, ctx->ptr, ret));
break;
}
#endif
/* can't end up here */ return ret; /* should never get here */
/* return SRE_ERROR_ILLEGAL; -- see python-dev discussion */
} }
LOCAL(int) LOCAL(int)
...@@ -1511,16 +1813,15 @@ sre_getlower(PyObject* self, PyObject* args) ...@@ -1511,16 +1813,15 @@ sre_getlower(PyObject* self, PyObject* args)
LOCAL(void) LOCAL(void)
state_reset(SRE_STATE* state) state_reset(SRE_STATE* state)
{ {
state->lastmark = 0;
/* FIXME: dynamic! */ /* FIXME: dynamic! */
memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE); /*memset(state->mark, 0, sizeof(*state->mark) * SRE_MARK_SIZE);*/
state->lastmark = -1;
state->lastindex = -1; state->lastindex = -1;
state->repeat = NULL; state->repeat = NULL;
mark_fini(state); data_stack_dealloc(state);
} }
static void* static void*
...@@ -1600,6 +1901,7 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string, ...@@ -1600,6 +1901,7 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
memset(state, 0, sizeof(SRE_STATE)); memset(state, 0, sizeof(SRE_STATE));
state->lastmark = -1;
state->lastindex = -1; state->lastindex = -1;
ptr = getstring(string, &length, &charsize); ptr = getstring(string, &length, &charsize);
...@@ -1647,7 +1949,7 @@ LOCAL(void) ...@@ -1647,7 +1949,7 @@ LOCAL(void)
state_fini(SRE_STATE* state) state_fini(SRE_STATE* state)
{ {
Py_XDECREF(state->string); Py_XDECREF(state->string);
mark_fini(state); data_stack_dealloc(state);
} }
/* calculate offset from start of string */ /* calculate offset from start of string */
...@@ -1661,7 +1963,7 @@ state_getslice(SRE_STATE* state, int index, PyObject* string, int empty) ...@@ -1661,7 +1963,7 @@ state_getslice(SRE_STATE* state, int index, PyObject* string, int empty)
index = (index - 1) * 2; index = (index - 1) * 2;
if (string == Py_None || !state->mark[index] || !state->mark[index+1]) { if (string == Py_None || index >= state->lastmark || !state->mark[index] || !state->mark[index+1]) {
if (empty) if (empty)
/* want empty string */ /* want empty string */
i = j = 0; i = j = 0;
......
...@@ -55,6 +55,7 @@ typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch); ...@@ -55,6 +55,7 @@ typedef unsigned int (*SRE_TOLOWER_HOOK)(unsigned int ch);
typedef struct SRE_REPEAT_T { typedef struct SRE_REPEAT_T {
int count; int count;
SRE_CODE* pattern; /* points to REPEAT operator arguments */ SRE_CODE* pattern; /* points to REPEAT operator arguments */
void* last_ptr; /* helper to check for infinite loops */
struct SRE_REPEAT_T *prev; /* points to previous repeat context */ struct SRE_REPEAT_T *prev; /* points to previous repeat context */
} SRE_REPEAT; } SRE_REPEAT;
...@@ -74,10 +75,11 @@ typedef struct { ...@@ -74,10 +75,11 @@ typedef struct {
int lastmark; int lastmark;
void* mark[SRE_MARK_SIZE]; void* mark[SRE_MARK_SIZE];
/* dynamically allocated stuff */ /* dynamically allocated stuff */
void** mark_stack; char* data_stack;
int mark_stack_size; int data_stack_size;
int mark_stack_base; int data_stack_base;
SRE_REPEAT *repeat; /* current repeat context */ /* current repeat context */
SRE_REPEAT *repeat;
/* hooks */ /* hooks */
SRE_TOLOWER_HOOK lower; SRE_TOLOWER_HOOK lower;
} SRE_STATE; } SRE_STATE;
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
* See the _sre.c file for information on usage and redistribution. * See the _sre.c file for information on usage and redistribution.
*/ */
#define SRE_MAGIC 20030419 #define SRE_MAGIC 20031017
#define SRE_OP_FAILURE 0 #define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1 #define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2 #define SRE_OP_ANY 2
...@@ -25,24 +25,25 @@ ...@@ -25,24 +25,25 @@
#define SRE_OP_CHARSET 10 #define SRE_OP_CHARSET 10
#define SRE_OP_BIGCHARSET 11 #define SRE_OP_BIGCHARSET 11
#define SRE_OP_GROUPREF 12 #define SRE_OP_GROUPREF 12
#define SRE_OP_GROUPREF_IGNORE 13 #define SRE_OP_GROUPREF_EXISTS 13
#define SRE_OP_IN 14 #define SRE_OP_GROUPREF_IGNORE 14
#define SRE_OP_IN_IGNORE 15 #define SRE_OP_IN 15
#define SRE_OP_INFO 16 #define SRE_OP_IN_IGNORE 16
#define SRE_OP_JUMP 17 #define SRE_OP_INFO 17
#define SRE_OP_LITERAL 18 #define SRE_OP_JUMP 18
#define SRE_OP_LITERAL_IGNORE 19 #define SRE_OP_LITERAL 19
#define SRE_OP_MARK 20 #define SRE_OP_LITERAL_IGNORE 20
#define SRE_OP_MAX_UNTIL 21 #define SRE_OP_MARK 21
#define SRE_OP_MIN_UNTIL 22 #define SRE_OP_MAX_UNTIL 22
#define SRE_OP_NOT_LITERAL 23 #define SRE_OP_MIN_UNTIL 23
#define SRE_OP_NOT_LITERAL_IGNORE 24 #define SRE_OP_NOT_LITERAL 24
#define SRE_OP_NEGATE 25 #define SRE_OP_NOT_LITERAL_IGNORE 25
#define SRE_OP_RANGE 26 #define SRE_OP_NEGATE 26
#define SRE_OP_REPEAT 27 #define SRE_OP_RANGE 27
#define SRE_OP_REPEAT_ONE 28 #define SRE_OP_REPEAT 28
#define SRE_OP_SUBPATTERN 29 #define SRE_OP_REPEAT_ONE 29
#define SRE_OP_MIN_REPEAT_ONE 30 #define SRE_OP_SUBPATTERN 30
#define SRE_OP_MIN_REPEAT_ONE 31
#define SRE_AT_BEGINNING 0 #define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1 #define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BEGINNING_STRING 2 #define SRE_AT_BEGINNING_STRING 2
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment