Commit 8c1a3565 authored by Fredrik Lundh

-- SRE 0.9.6 sync. this includes:

 + added "regs" attribute
 + fixed "pos" and "endpos" attributes
 + reset "lastindex" and "lastgroup" in scanner methods
 + removed (?P#id) syntax; the "lastindex" and "lastgroup"
   attributes are now always set
 + removed string module dependencies in sre_parse
 + better debugging support in sre_parse
 + various tweaks to build under 1.5.2
parent 902e1319
......@@ -10,9 +10,13 @@
# other compatibility work.
#
# FIXME: change all FIXME's to XXX ;-)
import sre_compile
import sre_parse
import string
# flags
I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE
L = LOCALE = sre_compile.SRE_FLAG_LOCALE
......@@ -53,6 +57,9 @@ def findall(pattern, string, maxsplit=0):
def compile(pattern, flags=0):
return _compile(pattern, flags)
def purge():
_cache.clear()
def template(pattern, flags=0):
return _compile(pattern, flags|T)
......@@ -65,7 +72,7 @@ def escape(pattern):
s[i] = "\\000"
else:
s[i] = "\\" + c
return pattern[:0].join(s)
return _join(s, pattern)
# --------------------------------------------------------------------
# internals
......@@ -73,10 +80,14 @@ def escape(pattern):
_cache = {}
_MAXCACHE = 100
def _join(seq, sep):
# internal: join into string having the same type as sep
return string.join(seq, sep[:0])
def _compile(pattern, flags=0):
# internal: compile pattern
tp = type(pattern)
if tp not in (type(""), type(u"")):
if tp not in sre_compile.STRING_TYPES:
return pattern
key = (tp, pattern, flags)
try:
......@@ -89,10 +100,6 @@ def _compile(pattern, flags=0):
_cache[key] = p
return p
def purge():
# clear pattern cache
_cache.clear()
def _sub(pattern, template, string, count=0):
# internal: pattern.sub implementation hook
return _subn(pattern, template, string, count)[0]
......@@ -120,7 +127,7 @@ def _subn(pattern, template, string, count=0):
i = e
n = n + 1
append(string[i:])
return string[:0].join(s), n
return _join(s, string[:0]), n
def _split(pattern, string, maxsplit=0):
# internal: pattern.split implementation hook
......@@ -161,11 +168,19 @@ copy_reg.pickle(type(_compile("")), _pickle, _compile)
class Scanner:
def __init__(self, lexicon):
from sre_constants import BRANCH, SUBPATTERN, INDEX
self.lexicon = lexicon
# combine phrases into a compound pattern
p = []
s = sre_parse.Pattern()
for phrase, action in lexicon:
p.append("(?:%s)(?P#%d)" % (phrase, len(p)))
self.scanner = _compile("|".join(p))
p.append(sre_parse.SubPattern(s, [
(SUBPATTERN, (None, sre_parse.parse(phrase))),
(INDEX, len(p))
]))
p = sre_parse.SubPattern(s, [(BRANCH, (None, p))])
s.groups = len(p)
self.scanner = sre_compile.compile(p)
def scan(self, string):
result = []
append = result.append
......
......@@ -197,10 +197,11 @@ def _compile(code, pattern, flags):
else:
emit(ATCODES[av])
elif op is BRANCH:
emit(OPCODES[op])
tail = []
for av in av[1]:
emit(OPCODES[op])
skip = len(code); emit(0)
emit(MAXCODE) # save mark
_compile(code, av, flags)
emit(OPCODES[JUMP])
tail.append(len(code)); emit(0)
......@@ -286,11 +287,18 @@ def _compile_info(code, pattern, flags):
emit(OPCODES[FAILURE])
code[skip] = len(code) - skip
STRING_TYPES = [type("")]
try:
STRING_TYPES.append(type(unicode("")))
except NameError:
pass
def compile(p, flags=0):
# internal: convert pattern list to internal format
# compile, as necessary
if type(p) in (type(""), type(u"")):
if type(p) in STRING_TYPES:
import sre_parse
pattern = p
p = sre_parse.parse(p, flags)
......@@ -308,6 +316,8 @@ def compile(p, flags=0):
code.append(OPCODES[SUCCESS])
# print code
# FIXME: <fl> get rid of this limitation!
assert p.pattern.groups <= 100,\
"sorry, but this version only supports 100 named groups"
......
......@@ -172,7 +172,7 @@ CH_UNICODE = {
# flags
SRE_FLAG_TEMPLATE = 1 # template mode (disable backtracking)
SRE_FLAG_IGNORECASE = 2 # case insensitive
SRE_FLAG_LOCALE = 4 # honor system locale
SRE_FLAG_LOCALE = 4 # honour system locale
SRE_FLAG_MULTILINE = 8 # treat target as multiline string
SRE_FLAG_DOTALL = 16 # treat target as a single string
SRE_FLAG_UNICODE = 32 # use unicode locale
......
......@@ -25,12 +25,12 @@ CHARMASK = 0xff
SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{"
DIGITS = tuple(string.digits)
# character classes used while tokenizing numbers; spelled out as
# literals so that no "string" module lookup is needed.
# DIGITS must list every decimal digit, 0 through 9.
DIGITS = tuple("0123456789")
OCTDIGITS = tuple("01234567")
HEXDIGITS = tuple("0123456789abcdefABCDEF")
WHITESPACE = tuple(string.whitespace)
WHITESPACE = tuple(" \t\n\r\v\f")
ESCAPES = {
r"\a": (LITERAL, 7),
......@@ -68,7 +68,8 @@ FLAGS = {
"u": SRE_FLAG_UNICODE,
}
class State:
class Pattern:
# master pattern object. keeps track of global attributes
def __init__(self):
self.flags = 0
self.groups = 1
......@@ -88,6 +89,33 @@ class SubPattern:
data = []
self.data = data
self.width = None
def dump(self, level=0):
nl = 1
for op, av in self.data:
print level*" " + op,; nl = 0
if op == "in":
# member sublanguage
print; nl = 1
for op, a in av:
print (level+1)*" " + op, a
elif op == "branch":
print; nl = 1
i = 0
for a in av[1]:
if i > 0:
print level*" " + "or"
a.dump(level+1); nl = 1
i = i + 1
elif type(av) in (type(()), type([])):
for a in av:
if isinstance(a, SubPattern):
if not nl: print
a.dump(level+1); nl = 1
else:
print a, ; nl = 0
else:
print av, ; nl = 0
if not nl: print
def __repr__(self):
return repr(self.data)
def __len__(self):
......@@ -255,10 +283,25 @@ def _escape(source, escape, state):
pass
raise error, "bogus escape: %s" % repr(escape)
def _branch(pattern, items):
# form a branch operator from a set of items
def _parse_sub(source, state, nested=1):
# parse an alternation: a|b|c
items = []
while 1:
items.append(_parse(source, state))
if source.match("|"):
continue
if not nested:
break
if not source.next or source.match(")"):
break
else:
raise error, "pattern not properly closed"
if len(items) == 1:
return items[0]
subpattern = SubPattern(pattern)
subpattern = SubPattern(state)
# check if all items share a common prefix
while 1:
......@@ -285,7 +328,7 @@ def _branch(pattern, items):
break
else:
# we can store this as a character set instead of a
# branch (FIXME: use a range if possible)
# branch (the compiler may optimize this even more)
set = []
for item in items:
set.append(item[0])
......@@ -296,8 +339,7 @@ def _branch(pattern, items):
return subpattern
def _parse(source, state):
# parse regular expression pattern into an operator list.
# parse a simple pattern
subpattern = SubPattern(state)
......@@ -451,22 +493,6 @@ def _parse(source, state):
if gid is None:
raise error, "unknown group name"
subpattern.append((GROUPREF, gid))
elif source.match("#"):
index = ""
while 1:
char = source.get()
if char is None:
raise error, "unterminated index"
if char == ")":
break
index = index + char
try:
index = int(index)
if index < 0 or index > MAXREPEAT:
raise ValueError
except ValueError:
raise error, "illegal index"
subpattern.append((INDEX, index))
continue
else:
char = source.get()
......@@ -491,48 +517,27 @@ def _parse(source, state):
raise error, "syntax error"
dir = -1 # lookbehind
char = source.get()
b = []
while 1:
p = _parse(source, state)
if source.next == ")":
if b:
b.append(p)
p = _branch(state, b)
p = _parse_sub(source, state)
if char == "=":
subpattern.append((ASSERT, (dir, p)))
else:
subpattern.append((ASSERT_NOT, (dir, p)))
break
elif source.match("|"):
b.append(p)
else:
raise error, "pattern not properly closed"
continue
else:
# flags
while FLAGS.has_key(source.next):
state.flags = state.flags | FLAGS[source.get()]
if group:
# parse group contents
b = []
if group == 2:
# anonymous group
group = None
else:
group = state.getgroup(name)
while 1:
p = _parse(source, state)
p = _parse_sub(source, state)
subpattern.append((SUBPATTERN, (group, p)))
if group is not None:
p.append((INDEX, group))
if source.match(")"):
if b:
b.append(p)
p = _branch(state, b)
subpattern.append((SUBPATTERN, (group, p)))
break
elif source.match("|"):
b.append(p)
else:
raise error, "group not properly closed"
else:
while 1:
char = source.get()
......@@ -555,26 +560,24 @@ def _parse(source, state):
return subpattern
def parse(pattern, flags=0):
def parse(str, flags=0):
# parse 're' pattern into list of (opcode, argument) tuples
source = Tokenizer(pattern)
state = State()
state.flags = flags
b = []
while 1:
p = _parse(source, state)
source = Tokenizer(str)
pattern = Pattern()
pattern.flags = flags
p = _parse_sub(source, pattern, 0)
tail = source.get()
if tail == "|":
b.append(p)
elif tail == ")":
if tail == ")":
raise error, "unbalanced parenthesis"
elif tail is None:
if b:
b.append(p)
p = _branch(state, b)
break
else:
elif tail:
raise error, "bogus characters at end of regular expression"
# p.dump()
return p
def parse_template(source, pattern):
......@@ -656,4 +659,4 @@ def expand_template(template, match):
if s is None:
raise error, "empty group"
a(s)
return sep.join(p)
return string.join(p, sep)
test_sre
=== Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A')
=== Failed incorrectly ('(a+)+\\1', 'aa', 0, 'found+"-"+g1', 'aa-a')
=== grouping error ('(a)(b)c|ab', 'ab', 0, 'found+"-"+g1+"-"+g2', 'ab-None-None') 'ab-None-b' should be 'ab-None-None'
=== grouping error ('(a)+b|aac', 'aac', 0, 'found+"-"+g1', 'aac-None') 'aac-a' should be 'aac-None'
=== Failed incorrectly ('^(.+)?B', 'AB', 0, 'g1', 'A')
/* -*- Mode: C; tab-width: 4 -*-
*
/*
* Secret Labs' Regular Expression Engine
*
* regular expression matching engine
......@@ -23,6 +22,9 @@
* 00-06-30 fl added assert (lookahead) primitives, etc (0.9.4)
* 00-07-02 fl added charset optimizations, etc (0.9.5)
* 00-07-03 fl store code in pattern object, lookbehind, etc
* 00-07-08 fl added regs attribute
* 00-07-18 fl changed branch operator to use failure stack
* 00-07-21 fl reset lastindex in scanner methods (0.9.6)
*
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
*
......@@ -33,7 +35,7 @@
#ifndef SRE_RECURSIVE
char copyright[] = " SRE 0.9.5 Copyright (c) 1997-2000 by Secret Labs AB ";
char copyright[] = " SRE 0.9.6 Copyright (c) 1997-2000 by Secret Labs AB ";
#include "Python.h"
......@@ -51,30 +53,40 @@ char copyright[] = " SRE 0.9.5 Copyright (c) 1997-2000 by Secret Labs AB ";
#define MODULE "sre"
/* defining this one enables tracing */
#undef DEBUG
#undef VERBOSE
#if PY_VERSION_HEX >= 0x01060000
/* defining this enables unicode support (default under 1.6a1 and later) */
#define HAVE_UNICODE
#endif
/* -------------------------------------------------------------------- */
/* optional features */
/* enables fast searching */
#define USE_FAST_SEARCH
/* enables aggressive inlining (always on for Visual C) */
#define USE_INLINE
/* -------------------------------------------------------------------- */
#if defined(_MSC_VER)
#pragma optimize("agtw", on) /* doesn't seem to make much difference... */
#pragma warning(disable: 4710) /* who cares if functions are not inlined ;-) */
/* fastest possible local call under MSVC */
#define LOCAL(type) static __inline type __fastcall
#else
#elif defined(USE_INLINE)
#define LOCAL(type) static inline type
#else
#define LOCAL(type) static type
#endif
/* error codes */
#define SRE_ERROR_ILLEGAL -1 /* illegal opcode */
#define SRE_ERROR_MEMORY -9 /* out of memory */
#if defined(DEBUG)
#if defined(VERBOSE)
#define TRACE(v) printf v
#else
#define TRACE(v)
......@@ -150,7 +162,7 @@ static unsigned int sre_lower_unicode(unsigned int ch)
#define SRE_UNI_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_UNI_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch))
#define SRE_UNI_IS_ALNUM(ch) Py_UNICODE_ISALNUM((Py_UNICODE)(ch))
#define SRE_UNI_IS_WORD(ch) (SRE_IS_ALNUM((ch)) || (ch) == '_')
#define SRE_UNI_IS_WORD(ch) (SRE_UNI_IS_ALNUM((ch)) || (ch) == '_')
#endif
LOCAL(int)
......@@ -413,6 +425,24 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
void* mark_copy[SRE_MARK_SIZE];
void* mark = NULL;
#define PUSH(skip_, mark_, max_)\
if (stack >= state->stacksize) {\
i = stack_extend(state, stack + 1, stackbase + max_);\
if (i < 0)\
return i;\
}\
TRACE(("%8d: stack[%d]\n", PTR(ptr), stack));\
sp = state->stack + (stack++);\
sp->ptr = ptr;\
sp->pattern = pattern + skip_;\
sp->mark = mark_;\
if (mark_ != 65535) {\
sp->mark0 = state->mark[mark_];\
sp->mark1 = state->mark[mark_+1];\
TRACE((" mark %d %d %d\n", mark_, PTR(state->mark[mark_]),\
PTR(state->mark[mark_+1])));\
}\
TRACE(("%8d: enter\n", PTR(ptr)));
if (pattern[0] == SRE_OP_INFO) {
......@@ -629,6 +659,19 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
pattern += pattern[0];
break;
case SRE_OP_BRANCH:
/* try an alternate branch */
/* format: <branch> <0=skip> <1=mark> <tail...> */
TRACE(("%8d: branch\n", PTR(ptr)));
if (pattern[2] != SRE_OP_LITERAL ||
(ptr < end && (SRE_CODE) ptr[0] == pattern[3])) {
/* worth trying */
PUSH(pattern[0], pattern[3], 1);
pattern += 2;
} else
pattern += pattern[0];
break;
#if 0
case SRE_OP_MAX_REPEAT_ONE:
/* match repeated sequence (maximizing regexp) */
......@@ -817,26 +860,10 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
points to the stack */
while (pattern[2] == 65535 || count < (int) pattern[2]) {
/* this position was valid; add it to the retry
/* this position is valid; add it to the retry
stack */
if (stack >= state->stacksize) {
i = stack_extend(state, stack + 1,
stackbase + pattern[2]);
if (i < 0)
return i; /* out of memory */
}
TRACE(("%8d: stack[%d]\n", PTR(ptr), stack));
TRACE((" ptr %d mark %d %d %d\n",
PTR(ptr), pattern[3], PTR(mark0), PTR(mark1)));
sp = state->stack + stack;
sp->ptr = ptr;
sp->pattern = pattern + pattern[0];
sp->mark = pattern[3];
if (pattern[3] != 65535) {
sp->mark0 = state->mark[pattern[3]];
sp->mark1 = state->mark[pattern[3]+1];
}
stack++;
PUSH(pattern[0], pattern[3], pattern[2]);
/* match more stuff */
state->stackbase = stack;
i = SRE_MATCH(state, pattern + 4);
state->stackbase = stackbase;
......@@ -896,28 +923,6 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
}
goto failure;
case SRE_OP_BRANCH:
/* match one of several subpatterns */
/* format: <branch> <size> <head> ... <null> <tail> */
TRACE(("%8d: branch\n", PTR(ptr)));
while (*pattern) {
if (pattern[1] != SRE_OP_LITERAL ||
(ptr < end && (SRE_CODE) ptr[0] == pattern[2])) {
TRACE(("%8d: branch check\n", PTR(ptr)));
state->ptr = ptr;
i = SRE_MATCH(state, pattern + 1);
if (i < 0)
return i;
if (i) {
TRACE(("%8d: branch succeeded\n", PTR(ptr)));
goto success;
}
}
pattern += *pattern;
}
TRACE(("%8d: branch failed\n", PTR(ptr)));
goto failure;
case SRE_OP_REPEAT:
/* TEMPLATE: match repeated sequence (no backtracking) */
/* args: <skip> <min> <max> */
......@@ -1127,7 +1132,11 @@ _compile(PyObject* self_, PyObject* args)
if (!code)
return NULL;
#if PY_VERSION_HEX >= 0x01060000
n = PySequence_Size(code);
#else
n = PySequence_Length(code);
#endif
self = PyObject_NEW_VAR(PatternObject, &Pattern_Type, 100*n);
if (!self) {
......@@ -1183,7 +1192,8 @@ sre_getlower(PyObject* self, PyObject* args)
}
LOCAL(PyObject*)
state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args)
state_init(SRE_STATE* state, PatternObject* pattern, PyObject* string,
int start, int end)
{
/* prepare state object */
......@@ -1191,12 +1201,6 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args)
int i, count;
void* ptr;
PyObject* string;
int start = 0;
int end = INT_MAX;
if (!PyArg_ParseTuple(args, "O|ii", &string, &start, &end))
return NULL;
/* get pointer to string buffer */
buffer = string->ob_type->tp_as_buffer;
if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
......@@ -1238,6 +1242,11 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args)
state->start = (void*) ((char*) ptr + start * state->charsize);
state->end = (void*) ((char*) ptr + end * state->charsize);
Py_INCREF(string);
state->string = string;
state->pos = start;
state->endpos = end;
state->lastmark = 0;
/* FIXME: dynamic! */
......@@ -1265,6 +1274,7 @@ state_init(SRE_STATE* state, PatternObject* pattern, PyObject* args)
LOCAL(void)
state_fini(SRE_STATE* state)
{
/* tear down a state object: drop the state's reference to the target
   string (XDECREF, so a NULL string is tolerated), then hand the
   state to stack_free */
Py_XDECREF(state->string);
stack_free(state);
}
......@@ -1288,8 +1298,7 @@ state_getslice(SRE_STATE* state, int index, PyObject* string)
}
static PyObject*
pattern_new_match(PatternObject* pattern, SRE_STATE* state,
PyObject* string, int status)
pattern_new_match(PatternObject* pattern, SRE_STATE* state, int status)
{
/* create match object (from state object) */
......@@ -1309,19 +1318,20 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state,
Py_INCREF(pattern);
match->pattern = pattern;
Py_INCREF(string);
match->string = string;
Py_INCREF(state->string);
match->string = state->string;
match->regs = NULL;
match->groups = pattern->groups+1;
/* fill in group slices */
base = (char*) state->beginning;
n = state->charsize;
/* group zero */
match->mark[0] = ((char*) state->start - base) / n;
match->mark[1] = ((char*) state->ptr - base) / n;
/* fill in the rest of the groups */
for (i = j = 0; i < pattern->groups; i++, j+=2)
if (j+1 <= state->lastmark && state->mark[j] && state->mark[j+1]) {
match->mark[j+2] = ((char*) state->mark[j] - base) / n;
......@@ -1329,10 +1339,10 @@ pattern_new_match(PatternObject* pattern, SRE_STATE* state,
} else
match->mark[j+2] = match->mark[j+3] = -1; /* undefined */
match->lastindex = state->lastindex;
match->pos = state->pos;
match->endpos = state->endpos;
match->pos = ((char*) state->start - base) / n;
match->endpos = ((char*) state->end - base) / n;
match->lastindex = state->lastindex;
return (PyObject*) match;
......@@ -1356,14 +1366,19 @@ pattern_scanner(PatternObject* pattern, PyObject* args)
/* create search state object */
ScannerObject* self;
PyObject* string;
int start = 0;
int end = INT_MAX;
if (!PyArg_ParseTuple(args, "O|ii:scanner", &string, &start, &end))
return NULL;
/* create match object (with room for extra group marks) */
/* create scanner object */
self = PyObject_NEW(ScannerObject, &Scanner_Type);
if (!self)
return NULL;
string = state_init(&self->state, pattern, args);
string = state_init(&self->state, pattern, string, start, end);
if (!string) {
PyObject_Del(self);
return NULL;
......@@ -1372,9 +1387,6 @@ pattern_scanner(PatternObject* pattern, PyObject* args)
Py_INCREF(pattern);
self->pattern = (PyObject*) pattern;
Py_INCREF(string);
self->string = string;
return (PyObject*) self;
}
......@@ -1390,10 +1402,15 @@ static PyObject*
pattern_match(PatternObject* self, PyObject* args)
{
SRE_STATE state;
PyObject* string;
int status;
string = state_init(&state, self, args);
PyObject* string;
int start = 0;
int end = INT_MAX;
if (!PyArg_ParseTuple(args, "O|ii:match", &string, &start, &end))
return NULL;
string = state_init(&state, self, string, start, end);
if (!string)
return NULL;
......@@ -1409,17 +1426,22 @@ pattern_match(PatternObject* self, PyObject* args)
state_fini(&state);
return pattern_new_match(self, &state, string, status);
return pattern_new_match(self, &state, status);
}
static PyObject*
pattern_search(PatternObject* self, PyObject* args)
{
SRE_STATE state;
PyObject* string;
int status;
string = state_init(&state, self, args);
PyObject* string;
int start = 0;
int end = INT_MAX;
if (!PyArg_ParseTuple(args, "O|ii:search", &string, &start, &end))
return NULL;
string = state_init(&state, self, string, start, end);
if (!string)
return NULL;
......@@ -1433,7 +1455,7 @@ pattern_search(PatternObject* self, PyObject* args)
state_fini(&state);
return pattern_new_match(self, &state, string, status);
return pattern_new_match(self, &state, status);
}
static PyObject*
......@@ -1467,7 +1489,7 @@ pattern_sub(PatternObject* self, PyObject* args)
PyObject* template;
PyObject* string;
PyObject* count = Py_False; /* zero */
if (!PyArg_ParseTuple(args, "OO|O", &template, &string, &count))
if (!PyArg_ParseTuple(args, "OO|O:sub", &template, &string, &count))
return NULL;
/* delegate to Python code */
......@@ -1480,7 +1502,7 @@ pattern_subn(PatternObject* self, PyObject* args)
PyObject* template;
PyObject* string;
PyObject* count = Py_False; /* zero */
if (!PyArg_ParseTuple(args, "OO|O", &template, &string, &count))
if (!PyArg_ParseTuple(args, "OO|O:subn", &template, &string, &count))
return NULL;
/* delegate to Python code */
......@@ -1492,7 +1514,7 @@ pattern_split(PatternObject* self, PyObject* args)
{
PyObject* string;
PyObject* maxsplit = Py_False; /* zero */
if (!PyArg_ParseTuple(args, "O|O", &string, &maxsplit))
if (!PyArg_ParseTuple(args, "O|O:split", &string, &maxsplit))
return NULL;
/* delegate to Python code */
......@@ -1503,12 +1525,17 @@ static PyObject*
pattern_findall(PatternObject* self, PyObject* args)
{
SRE_STATE state;
PyObject* string;
PyObject* list;
int status;
int i;
string = state_init(&state, self, args);
PyObject* string;
int start = 0;
int end = INT_MAX;
if (!PyArg_ParseTuple(args, "O|ii:findall", &string, &start, &end))
return NULL;
string = state_init(&state, self, string, start, end);
if (!string)
return NULL;
......@@ -1657,6 +1684,7 @@ statichere PyTypeObject Pattern_Type = {
static void
match_dealloc(MatchObject* self)
{
Py_XDECREF(self->regs);
Py_XDECREF(self->string);
Py_DECREF(self->pattern);
PyObject_DEL(self);
......@@ -1758,7 +1786,7 @@ match_groups(MatchObject* self, PyObject* args)
int index;
PyObject* def = Py_None;
if (!PyArg_ParseTuple(args, "|O", &def))
if (!PyArg_ParseTuple(args, "|O:groups", &def))
return NULL;
result = PyTuple_New(self->groups-1);
......@@ -1786,7 +1814,7 @@ match_groupdict(MatchObject* self, PyObject* args)
int index;
PyObject* def = Py_None;
if (!PyArg_ParseTuple(args, "|O", &def))
if (!PyArg_ParseTuple(args, "|O:groupdict", &def))
return NULL;
result = PyDict_New();
......@@ -1830,7 +1858,7 @@ match_start(MatchObject* self, PyObject* args)
int index;
PyObject* index_ = Py_False; /* zero */
if (!PyArg_ParseTuple(args, "|O", &index_))
if (!PyArg_ParseTuple(args, "|O:start", &index_))
return NULL;
index = match_getindex(self, index_);
......@@ -1857,7 +1885,7 @@ match_end(MatchObject* self, PyObject* args)
int index;
PyObject* index_ = Py_False; /* zero */
if (!PyArg_ParseTuple(args, "|O", &index_))
if (!PyArg_ParseTuple(args, "|O:end", &index_))
return NULL;
index = match_getindex(self, index_);
......@@ -1878,13 +1906,40 @@ match_end(MatchObject* self, PyObject* args)
return Py_BuildValue("i", self->mark[index*2+1]);
}
LOCAL(PyObject*)
_pair(int i1, int i2)
{
    /* build the 2-tuple (i1, i2) of Python ints.  returns the new
       tuple, or NULL if creating the tuple or either int fails */
    PyObject* first;
    PyObject* second;
    PyObject* result = PyTuple_New(2);

    if (result == NULL)
        return NULL;

    first = PyInt_FromLong(i1);
    if (first == NULL) {
        Py_DECREF(result);
        return NULL;
    }
    /* SET_ITEM takes over the reference to the int */
    PyTuple_SET_ITEM(result, 0, first);

    second = PyInt_FromLong(i2);
    if (second == NULL) {
        Py_DECREF(result);
        return NULL;
    }
    PyTuple_SET_ITEM(result, 1, second);

    return result;
}
static PyObject*
match_span(MatchObject* self, PyObject* args)
{
int index;
PyObject* index_ = Py_False; /* zero */
if (!PyArg_ParseTuple(args, "|O", &index_))
if (!PyArg_ParseTuple(args, "|O:span", &index_))
return NULL;
index = match_getindex(self, index_);
......@@ -1903,7 +1958,33 @@ match_span(MatchObject* self, PyObject* args)
return Py_BuildValue("OO", Py_None, Py_None);
}
return Py_BuildValue("ii", self->mark[index*2], self->mark[index*2+1]);
return _pair(self->mark[index*2], self->mark[index*2+1]);
}
static PyObject*
match_regs(MatchObject* self)
{
    /* build the "regs" tuple: one (start, end) pair per group, read
       from self->mark.  the tuple is stored in self->regs and also
       returned to the caller; returns NULL on failure */
    int group;
    PyObject* span;
    PyObject* result = PyTuple_New(self->groups);

    if (result == NULL)
        return NULL;

    group = 0;
    while (group < self->groups) {
        span = _pair(self->mark[group*2], self->mark[group*2+1]);
        if (span == NULL) {
            Py_DECREF(result);
            return NULL;
        }
        PyTuple_SET_ITEM(result, group, span);
        group++;
    }

    /* two owners: the match object keeps one reference, the caller
       receives the other */
    self->regs = result;
    Py_INCREF(result);

    return result;
}
static PyMethodDef match_methods[] = {
......@@ -1928,7 +2009,6 @@ match_getattr(MatchObject* self, char* name)
PyErr_Clear();
if (!strcmp(name, "lastindex")) {
/* experimental */
if (self->lastindex >= 0)
return Py_BuildValue("i", self->lastindex);
Py_INCREF(Py_None);
......@@ -1936,7 +2016,6 @@ match_getattr(MatchObject* self, char* name)
}
if (!strcmp(name, "lastgroup")) {
/* experimental */
if (self->pattern->indexgroup && self->lastindex >= 0) {
PyObject* result = PySequence_GetItem(
self->pattern->indexgroup, self->lastindex
......@@ -1950,8 +2029,21 @@ match_getattr(MatchObject* self, char* name)
}
if (!strcmp(name, "string")) {
if (self->string) {
Py_INCREF(self->string);
return self->string;
} else {
Py_INCREF(Py_None);
return Py_None;
}
}
if (!strcmp(name, "regs")) {
if (self->regs) {
Py_INCREF(self->regs);
return self->regs;
} else
return match_regs(self);
}
if (!strcmp(name, "re")) {
......@@ -1988,7 +2080,6 @@ static void
scanner_dealloc(ScannerObject* self)
{
state_fini(&self->state);
Py_DECREF(self->string);
Py_DECREF(self->pattern);
PyObject_DEL(self);
}
......@@ -2000,6 +2091,7 @@ scanner_match(ScannerObject* self, PyObject* args)
PyObject* match;
int status;
state->lastindex = -1;
state->ptr = state->start;
if (state->charsize == 1) {
......@@ -2011,7 +2103,7 @@ scanner_match(ScannerObject* self, PyObject* args)
}
match = pattern_new_match((PatternObject*) self->pattern,
state, self->string, status);
state, status);
if (status == 0 || state->ptr == state->start)
state->start = (void*) ((char*) state->ptr + state->charsize);
......@@ -2029,6 +2121,7 @@ scanner_search(ScannerObject* self, PyObject* args)
PyObject* match;
int status;
state->lastindex = -1;
state->ptr = state->start;
if (state->charsize == 1) {
......@@ -2040,7 +2133,7 @@ scanner_search(ScannerObject* self, PyObject* args)
}
match = pattern_new_match((PatternObject*) self->pattern,
state, self->string, status);
state, status);
if (status == 0 || state->ptr == state->start)
state->start = (void*) ((char*) state->ptr + state->charsize);
......
/*
*
* Secret Labs' Regular Expression Engine
*
* regular expression matching engine
......@@ -33,6 +34,7 @@ typedef struct {
typedef struct {
PyObject_VAR_HEAD
PyObject* string; /* link to the target string */
PyObject* regs; /* cached list of matching spans */
PatternObject* pattern; /* link to the regex (pattern) object */
int pos, endpos; /* current target slice */
int lastindex; /* last index marker seen by the engine (-1 if none) */
......@@ -60,6 +62,9 @@ typedef struct {
void* beginning; /* start of original string */
void* start; /* start of current slice */
void* end; /* end of original string */
/* attributes for the match object */
PyObject* string;
int pos, endpos;
/* character size */
int charsize;
/* registers */
......@@ -78,7 +83,6 @@ typedef struct {
/* scanner (internal helper object) */
PyObject_HEAD
PyObject* pattern;
PyObject* string;
SRE_STATE state;
} ScannerObject;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment