Commit b1aa1951 authored by Jeremy Hylton's avatar Jeremy Hylton

Fredrik Lundh: here's the 96.6% version of SRE

parent 0292d78e
# -*- Mode: Python; tab-width: 4 -*-
#
# Secret Labs' Regular Expression Engine
# $Id$
......@@ -7,39 +6,127 @@
#
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
#
# This code can only be used for 1.6 alpha testing. All other use
# require explicit permission from Secret Labs AB.
#
# Portions of this engine have been developed in cooperation with
# CNRI. Hewlett-Packard provided funding for 1.6 integration and
# other compatibility work.
#
"""
this is a long string
"""
import sre_compile
# flags
I = IGNORECASE = sre_compile.SRE_FLAG_IGNORECASE
L = LOCALE = sre_compile.SRE_FLAG_LOCALE
M = MULTILINE = sre_compile.SRE_FLAG_MULTILINE
S = DOTALL = sre_compile.SRE_FLAG_DOTALL
X = VERBOSE = sre_compile.SRE_FLAG_VERBOSE
# --------------------------------------------------------------------
# public interface
def compile(pattern, flags=0):
return sre_compile.compile(pattern, _fixflags(flags))
# FIXME: add docstrings
def match(pattern, string, flags=0):
return compile(pattern, _fixflags(flags)).match(string)
return _compile(pattern, flags).match(string)
def search(pattern, string, flags=0):
return compile(pattern, _fixflags(flags)).search(string)
return _compile(pattern, flags).search(string)
def sub(pattern, repl, string, count=0):
return _compile(pattern).sub(repl, string, count)
def subn(pattern, repl, string, count=0):
return _compile(pattern).subn(repl, string, count)
def split(pattern, string, maxsplit=0):
return _compile(pattern).split(string, maxsplit)
# FIXME: etc
def findall(pattern, string, maxsplit=0):
return _compile(pattern).findall(string, maxsplit)
def compile(pattern, flags=0):
return _compile(pattern, flags)
def escape(pattern):
s = list(pattern)
for i in range(len(pattern)):
c = pattern[i]
if not ("a" <= c <= "z" or "A" <= c <= "Z" or "0" <= c <= "9"):
if c == "\000":
s[i] = "\\000"
else:
s[i] = "\\" + c
return pattern[:0].join(s)
# --------------------------------------------------------------------
# helpers
# internals
_cache = {}
_MAXCACHE = 100
def _compile(pattern, flags=0):
# internal: compile pattern
tp = type(pattern)
if tp not in (type(""), type(u"")):
return pattern
key = (tp, pattern, flags)
try:
return _cache[key]
except KeyError:
pass
p = sre_compile.compile(pattern, flags)
if len(_cache) >= _MAXCACHE:
_cache.clear()
_cache[key] = p
return p
def _sub(pattern, template, string, count=0):
# internal: pattern.sub implementation hook
return _subn(pattern, template, string, count)[0]
def _expand(match, template):
# internal: expand template
return template # FIXME
def _fixflags(flags):
# convert flag bitmask to sequence
assert not flags
return ()
def _subn(pattern, template, string, count=0):
# internal: pattern.subn implementation hook
if callable(template):
filter = callable
else:
# FIXME: prepare template
def filter(match, template=template):
return _expand(match, template)
n = i = 0
s = []
append = s.append
c = pattern.cursor(string)
while not count or n < count:
m = c.search()
if not m:
break
j = m.start()
if j > i:
append(string[i:j])
append(filter(m))
i = m.end()
n = n + 1
if i < len(string):
append(string[i:])
return string[:0].join(s), n
def _split(pattern, string, maxsplit=0):
# internal: pattern.split implementation hook
n = i = 0
s = []
append = s.append
c = pattern.cursor(string)
while not maxsplit or n < maxsplit:
m = c.search()
if not m:
break
j = m.start()
append(string[i:j])
i = m.end()
n = n + 1
if i < len(string):
append(string[i:])
return s
......@@ -14,9 +14,6 @@
# other compatibility work.
#
# FIXME: <fl> formalize (objectify?) and document the compiler code
# format, so that other frontends can use this compiler
import array, string, sys
import _sre
......@@ -45,64 +42,70 @@ class Code:
self.data.append(code)
def todata(self):
# print self.data
try:
return array.array(WORDSIZE, self.data).tostring()
except OverflowError:
print self.data
raise
def _lower(literal):
# return _sre._lower(literal) # FIXME
return string.lower(literal)
def _compile(code, pattern, flags):
def _compile(code, pattern, flags, level=0):
append = code.append
for op, av in pattern:
if op is ANY:
if "s" in flags:
append(CODES[op]) # any character at all!
if flags & SRE_FLAG_DOTALL:
append(OPCODES[op]) # any character at all!
else:
append(CODES[NOT_LITERAL])
append(10)
append(OPCODES[CATEGORY])
append(CHCODES[CATEGORY_NOT_LINEBREAK])
elif op in (SUCCESS, FAILURE):
append(CODES[op])
append(OPCODES[op])
elif op is AT:
append(CODES[op])
append(POSITIONS[av])
append(OPCODES[op])
if flags & SRE_FLAG_MULTILINE:
append(ATCODES[AT_MULTILINE[av]])
else:
append(ATCODES[av])
elif op is BRANCH:
append(CODES[op])
append(OPCODES[op])
tail = []
for av in av[1]:
skip = len(code); append(0)
_compile(code, av, flags)
append(CODES[JUMP])
_compile(code, av, flags, level)
append(OPCODES[JUMP])
tail.append(len(code)); append(0)
code[skip] = len(code) - skip
append(0) # end of branch
for tail in tail:
code[tail] = len(code) - tail
elif op is CALL:
append(CODES[op])
append(OPCODES[op])
skip = len(code); append(0)
_compile(code, av, flags)
append(CODES[SUCCESS])
_compile(code, av, flags, level+1)
append(OPCODES[SUCCESS])
code[skip] = len(code) - skip
elif op is CATEGORY: # not used by current parser
append(CODES[op])
append(CATEGORIES[av])
append(OPCODES[op])
if flags & SRE_FLAG_LOCALE:
append(CH_LOCALE[CHCODES[av]])
else:
append(CHCODES[av])
elif op is GROUP:
if "i" in flags:
append(CODES[MAP_IGNORE[op]])
if flags & SRE_FLAG_IGNORECASE:
append(OPCODES[OP_IGNORE[op]])
else:
append(CODES[op])
append(av)
append(OPCODES[op])
append(av-1)
elif op is IN:
if "i" in flags:
append(CODES[MAP_IGNORE[op]])
if flags & SRE_FLAG_IGNORECASE:
append(OPCODES[OP_IGNORE[op]])
def fixup(literal):
return ord(_lower(literal))
return ord(literal.lower())
else:
append(CODES[op])
append(OPCODES[op])
fixup = ord
skip = len(code); append(0)
for op, av in av:
append(CODES[op])
append(OPCODES[op])
if op is NEGATE:
pass
elif op is LITERAL:
......@@ -111,58 +114,60 @@ def _compile(code, pattern, flags):
append(fixup(av[0]))
append(fixup(av[1]))
elif op is CATEGORY:
append(CATEGORIES[av])
if flags & SRE_FLAG_LOCALE:
append(CH_LOCALE[CHCODES[av]])
else:
append(CHCODES[av])
else:
raise ValueError, "unsupported set operator"
append(CODES[FAILURE])
append(OPCODES[FAILURE])
code[skip] = len(code) - skip
elif op in (LITERAL, NOT_LITERAL):
if "i" in flags:
append(CODES[MAP_IGNORE[op]])
append(ord(_lower(av)))
if flags & SRE_FLAG_IGNORECASE:
append(OPCODES[OP_IGNORE[op]])
append(ord(av.lower()))
else:
append(CODES[op])
append(OPCODES[op])
append(ord(av))
elif op is MARK:
append(CODES[op])
append(OPCODES[op])
append(av)
elif op in (REPEAT, MIN_REPEAT, MAX_REPEAT):
lo, hi = av[2].getwidth()
if lo == 0:
raise SyntaxError, "cannot repeat zero-width items"
if lo == hi == 1 and op is MAX_REPEAT:
append(CODES[MAX_REPEAT_ONE])
append(OPCODES[MAX_REPEAT_ONE])
skip = len(code); append(0)
append(av[0])
append(av[1])
_compile(code, av[2], flags)
append(CODES[SUCCESS])
_compile(code, av[2], flags, level+1)
append(OPCODES[SUCCESS])
code[skip] = len(code) - skip
else:
append(CODES[op])
append(OPCODES[op])
skip = len(code); append(0)
append(av[0])
append(av[1])
_compile(code, av[2], flags)
_compile(code, av[2], flags, level+1)
if op is MIN_REPEAT:
append(CODES[MIN_UNTIL])
append(OPCODES[MIN_UNTIL])
else:
# FIXME: MAX_REPEAT PROBABLY DOESN'T WORK (?)
append(CODES[MAX_UNTIL])
append(OPCODES[MAX_UNTIL])
code[skip] = len(code) - skip
elif op is SUBPATTERN:
## group = av[0]
## if group:
## append(CODES[MARK])
## append((group-1)*2)
_compile(code, av[1], flags)
## if group:
## append(CODES[MARK])
## append((group-1)*2+1)
group = av[0]
if group:
append(OPCODES[MARK])
append((group-1)*2)
_compile(code, av[1], flags, level+1)
if group:
append(OPCODES[MARK])
append((group-1)*2+1)
else:
raise ValueError, ("unsupported operand type", op)
def compile(p, flags=()):
def compile(p, flags=0):
# convert pattern list to internal format
if type(p) in (type(""), type(u"")):
import sre_parse
......@@ -170,12 +175,10 @@ def compile(p, flags=()):
p = sre_parse.parse(p)
else:
pattern = None
# print p.getwidth()
# print p
flags = p.pattern.flags | flags
code = Code()
_compile(code, p.data, p.pattern.flags)
code.append(CODES[SUCCESS])
# print list(code.data)
_compile(code, p.data, flags)
code.append(OPCODES[SUCCESS])
data = code.todata()
if 0: # debugging
print
......@@ -183,5 +186,8 @@ def compile(p, flags=()):
import sre_disasm
sre_disasm.disasm(data)
print "-" * 68
# print len(data), p.pattern.groups, len(p.pattern.groupdict)
return _sre.compile(pattern, data, p.pattern.groups-1, p.pattern.groupdict)
return _sre.compile(
pattern, flags,
data,
p.pattern.groups-1, p.pattern.groupdict
)
......@@ -48,20 +48,31 @@ SUBPATTERN = "subpattern"
# positions
AT_BEGINNING = "at_beginning"
AT_BEGINNING_LINE = "at_beginning_line"
AT_BOUNDARY = "at_boundary"
AT_NON_BOUNDARY = "at_non_boundary"
AT_END = "at_end"
AT_END_LINE = "at_end_line"
# categories
CATEGORY_DIGIT = "category_digit"
CATEGORY_NOT_DIGIT = "category_not_digit"
CATEGORY_SPACE = "category_space"
CATEGORY_NOT_SPACE = "category_not_space"
CATEGORY_WORD = "category_word"
CATEGORY_NOT_WORD = "category_not_word"
CATEGORY_LINEBREAK = "category_linebreak"
CATEGORY_NOT_LINEBREAK = "category_not_linebreak"
CATEGORY_LOC_DIGIT = "category_loc_digit"
CATEGORY_LOC_NOT_DIGIT = "category_loc_not_digit"
CATEGORY_LOC_SPACE = "category_loc_space"
CATEGORY_LOC_NOT_SPACE = "category_loc_not_space"
CATEGORY_LOC_WORD = "category_loc_word"
CATEGORY_LOC_NOT_WORD = "category_loc_not_word"
CATEGORY_LOC_LINEBREAK = "category_loc_linebreak"
CATEGORY_LOC_NOT_LINEBREAK = "category_loc_not_linebreak"
CODES = [
OPCODES = [
# failure=0 success=1 (just because it looks better that way :-)
FAILURE, SUCCESS,
......@@ -87,45 +98,75 @@ CODES = [
]
# convert to dictionary
c = {}
i = 0
for code in CODES:
c[code] = i
ATCODES = [
AT_BEGINNING, AT_BEGINNING_LINE, AT_BOUNDARY,
AT_NON_BOUNDARY, AT_END, AT_END_LINE
]
CHCODES = [
CATEGORY_DIGIT, CATEGORY_NOT_DIGIT, CATEGORY_SPACE,
CATEGORY_NOT_SPACE, CATEGORY_WORD, CATEGORY_NOT_WORD,
CATEGORY_LINEBREAK, CATEGORY_NOT_LINEBREAK, CATEGORY_LOC_DIGIT,
CATEGORY_LOC_NOT_DIGIT, CATEGORY_LOC_SPACE,
CATEGORY_LOC_NOT_SPACE, CATEGORY_LOC_WORD, CATEGORY_LOC_NOT_WORD,
CATEGORY_LOC_LINEBREAK, CATEGORY_LOC_NOT_LINEBREAK
]
def makedict(list):
d = {}
i = 0
for item in list:
d[item] = i
i = i + 1
CODES = c
return d
OPCODES = makedict(OPCODES)
ATCODES = makedict(ATCODES)
CHCODES = makedict(CHCODES)
# replacement operations for "ignore case" mode
MAP_IGNORE = {
OP_IGNORE = {
GROUP: GROUP_IGNORE,
IN: IN_IGNORE,
LITERAL: LITERAL_IGNORE,
NOT_LITERAL: NOT_LITERAL_IGNORE
}
POSITIONS = {
AT_BEGINNING: ord("a"),
AT_BOUNDARY: ord("b"),
AT_NON_BOUNDARY: ord("B"),
AT_END: ord("z"),
AT_MULTILINE = {
AT_BEGINNING: AT_BEGINNING_LINE,
AT_END: AT_END_LINE
}
CATEGORIES = {
CATEGORY_DIGIT: ord("d"),
CATEGORY_NOT_DIGIT: ord("D"),
CATEGORY_SPACE: ord("s"),
CATEGORY_NOT_SPACE: ord("S"),
CATEGORY_WORD: ord("w"),
CATEGORY_NOT_WORD: ord("W"),
CH_LOCALE = {
CATEGORY_DIGIT: CATEGORY_LOC_DIGIT,
CATEGORY_NOT_DIGIT: CATEGORY_LOC_NOT_DIGIT,
CATEGORY_SPACE: CATEGORY_LOC_SPACE,
CATEGORY_NOT_SPACE: CATEGORY_LOC_NOT_SPACE,
CATEGORY_WORD: CATEGORY_LOC_WORD,
CATEGORY_NOT_WORD: CATEGORY_LOC_NOT_WORD,
CATEGORY_LINEBREAK: CATEGORY_LOC_LINEBREAK,
CATEGORY_NOT_LINEBREAK: CATEGORY_LOC_NOT_LINEBREAK
}
# flags
SRE_FLAG_TEMPLATE = 1 # NYI
SRE_FLAG_IGNORECASE = 2
SRE_FLAG_LOCALE = 4
SRE_FLAG_MULTILINE = 8
SRE_FLAG_DOTALL = 16
SRE_FLAG_VERBOSE = 32
if __name__ == "__main__":
import string
items = CODES.items()
def dump(f, d, prefix):
items = d.items()
items.sort(lambda a, b: cmp(a[1], b[1]))
f = open("sre_constants.h", "w")
f.write("/* generated by sre_constants.py */\n")
for k, v in items:
f.write("#define SRE_OP_" + string.upper(k) + " " + str(v) + "\n")
f.write("#define %s_%s %s\n" % (prefix, string.upper(k), v))
f = open("sre_constants.h", "w")
f.write("/* generated from sre_constants.py */\n")
dump(f, OPCODES, "SRE_OP")
dump(f, ATCODES, "SRE")
dump(f, CHCODES, "SRE")
f.close()
print "done"
......@@ -6,13 +6,16 @@
* simple regular expression matching engine
*
* partial history:
* 99-10-24 fl created (bits and pieces from the template matcher)
* 99-10-24 fl created (based on the template matcher)
* 99-11-13 fl added categories, branching, and more (0.2)
* 99-11-16 fl some tweaks to compile on non-Windows platforms
* 99-12-18 fl non-literals, generic maximizing repeat (0.3)
* 99-02-28 fl tons of changes (not all to the better ;-) (0.4)
* 99-03-06 fl first alpha, sort of (0.5)
* 99-03-14 fl removed most compatibility stuff (0.6)
* 99-05-10 fl towards third alpha (0.8.2)
* 99-05-13 fl added experimental cursor stuff (0.8.3)
* 99-05-27 fl final bug hunt (0.8.4)
*
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
*
......@@ -26,7 +29,7 @@
#ifndef SRE_RECURSIVE
char copyright[] = " SRE 0.6 Copyright (c) 1997-2000 by Secret Labs AB ";
char copyright[] = " SRE 0.8.4 Copyright (c) 1997-2000 by Secret Labs AB ";
#include "Python.h"
......@@ -40,7 +43,7 @@ char copyright[] = " SRE 0.6 Copyright (c) 1997-2000 by Secret Labs AB ";
#define INT_MAX 2147483647
#endif
#include <ctype.h> /* temporary hack */
#include <ctype.h>
/* defining this one enables tracing */
#undef DEBUG
......@@ -59,61 +62,69 @@ char copyright[] = " SRE 0.6 Copyright (c) 1997-2000 by Secret Labs AB ";
#ifdef DEBUG
#define TRACE(v) printf v
#define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning)
#else
#define TRACE(v)
#endif
#define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning)
#define SRE_CODE unsigned short /* unsigned short or larger */
typedef struct {
/* string pointers */
void* ptr; /* current position (also end of current slice) */
void* beginning; /* start of original string */
void* start; /* start of current slice */
void* end; /* end of original string */
/* character size */
int charsize;
/* registers */
int marks;
void* mark[64]; /* FIXME: <fl> should be dynamically allocated! */
/* FIXME */
/* backtracking stack */
void** stack;
int stacksize;
int stackbase;
} SRE_STATE;
#if 1 /* FIXME: <fl> fix this one! */
#define SRE_TO_LOWER Py_UNICODE_TOLOWER
#define SRE_IS_DIGIT Py_UNICODE_ISDIGIT
#define SRE_IS_SPACE Py_UNICODE_ISSPACE
#define SRE_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
#else
#define SRE_TO_LOWER(ch) ((ch) < 256 ? tolower((ch)) : ch)
#define SRE_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
#define SRE_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
/* -------------------------------------------------------------------- */
/* search engine state */
/* unicode character predicates */
#define SRE_TO_LOWER(ch) Py_UNICODE_TOLOWER((Py_UNICODE)(ch))
#define SRE_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_IS_LINEBREAK(ch) ((ch) == '\n')
/* #define SRE_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch)) */
#define SRE_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
#endif
#define SRE_IS_WORD(ch) (SRE_IS_ALNUM((ch)) || (ch) == '_')
/* locale-specific character predicates */
#define SRE_LOC_TO_LOWER(ch) ((ch) < 256 ? tolower((ch)) : ch)
#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
LOCAL(int)
sre_category(SRE_CODE category, unsigned int ch)
{
switch (category) {
case 'd':
case SRE_CATEGORY_DIGIT:
return SRE_IS_DIGIT(ch);
case 'D':
case SRE_CATEGORY_NOT_DIGIT:
return !SRE_IS_DIGIT(ch);
case 's':
case SRE_CATEGORY_SPACE:
return SRE_IS_SPACE(ch);
case 'S':
case SRE_CATEGORY_NOT_SPACE:
return !SRE_IS_SPACE(ch);
case 'w':
case SRE_CATEGORY_WORD:
return SRE_IS_WORD(ch);
case 'W':
case SRE_CATEGORY_NOT_WORD:
return !SRE_IS_WORD(ch);
case SRE_CATEGORY_LINEBREAK:
return SRE_IS_LINEBREAK(ch);
case SRE_CATEGORY_NOT_LINEBREAK:
return !SRE_IS_LINEBREAK(ch);
case SRE_CATEGORY_LOC_DIGIT:
return SRE_LOC_IS_DIGIT(ch);
case SRE_CATEGORY_LOC_NOT_DIGIT:
return !SRE_LOC_IS_DIGIT(ch);
case SRE_CATEGORY_LOC_SPACE:
return SRE_LOC_IS_SPACE(ch);
case SRE_CATEGORY_LOC_NOT_SPACE:
return !SRE_LOC_IS_SPACE(ch);
case SRE_CATEGORY_LOC_WORD:
return SRE_LOC_IS_WORD(ch);
case SRE_CATEGORY_LOC_NOT_WORD:
return !SRE_LOC_IS_WORD(ch);
case SRE_CATEGORY_LOC_LINEBREAK:
return SRE_LOC_IS_LINEBREAK(ch);
case SRE_CATEGORY_LOC_NOT_LINEBREAK:
return !SRE_LOC_IS_LINEBREAK(ch);
}
return 0;
}
......@@ -174,7 +185,7 @@ _stack_extend(SRE_STATE* state, int lo, int hi)
return 0;
}
/* set things up for the 8-bit version */
/* generate 8-bit version */
#define SRE_CHAR unsigned char
#define SRE_AT sre_at
......@@ -192,7 +203,7 @@ _stack_extend(SRE_STATE* state, int lo, int hi)
#undef SRE_AT
#undef SRE_CHAR
/* set things up for the 16-bit unicode version */
/* generate 16-bit unicode version */
#define SRE_CHAR Py_UNICODE
#define SRE_AT sre_uat
......@@ -211,20 +222,22 @@ _stack_extend(SRE_STATE* state, int lo, int hi)
LOCAL(int)
SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
{
/* check if pointer is at given position. return 1 if so, 0
otherwise */
/* check if pointer is at given position */
int this, that;
switch (at) {
case 'a':
/* beginning */
case SRE_AT_BEGINNING:
return ((void*) ptr == state->beginning);
case 'z':
/* end */
case SRE_AT_BEGINNING_LINE:
return ((void*) ptr == state->beginning ||
SRE_IS_LINEBREAK((int) ptr[-1]));
case SRE_AT_END:
return ((void*) ptr == state->end);
case 'b':
/* word boundary */
case SRE_AT_END_LINE:
return ((void*) ptr == state->end ||
SRE_IS_LINEBREAK((int) ptr[0]));
case SRE_AT_BOUNDARY:
if (state->beginning == state->end)
return 0;
that = ((void*) ptr > state->beginning) ?
......@@ -232,8 +245,7 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
this = ((void*) ptr < state->end) ?
SRE_IS_WORD((int) ptr[0]) : 0;
return this != that;
case 'B':
/* word non-boundary */
case SRE_AT_NON_BOUNDARY:
if (state->beginning == state->end)
return 0;
that = ((void*) ptr > state->beginning) ?
......@@ -249,8 +261,7 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
LOCAL(int)
SRE_MEMBER(SRE_CODE* set, SRE_CHAR ch)
{
/* check if character is a member of the given set. return 1 if
so, 0 otherwise */
/* check if character is a member of the given set */
int ok = 1;
......@@ -301,37 +312,50 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
int stackbase;
int i, count;
for (;;) {
/* FIXME: this is one ugly hack */
void* *mark = NULL;
void* mark_data[64];
TRACE(("[%p]\n", pattern));
for (;;) {
switch (*pattern++) {
case SRE_OP_FAILURE:
/* immediate failure */
TRACE(("%8d: failure\n", PTR(ptr)));
return 0;
goto failure;
case SRE_OP_SUCCESS:
/* end of pattern */
TRACE(("%8d: success\n", PTR(ptr)));
state->ptr = ptr;
return 1;
goto success;
case SRE_OP_AT:
/* match at given position */
/* args: <at> */
TRACE(("%8d: match at \\%c\n", PTR(ptr), *pattern));
if (!SRE_AT(state, ptr, *pattern))
return 0;
goto failure;
pattern++;
break;
case SRE_OP_CATEGORY:
/* match at given category */
/* args: <category> */
TRACE(("%8d: category match at \\%c\n", PTR(ptr), *pattern));
if (ptr >= end || !sre_category(pattern[0], ptr[0]))
goto failure;
pattern++;
ptr++;
break;
case SRE_OP_LITERAL:
/* match literal character */
/* args: <code> */
TRACE(("%8d: literal %c\n", PTR(ptr), (SRE_CHAR) *pattern));
if (ptr >= end || *ptr != (SRE_CHAR) *pattern)
return 0;
goto failure;
pattern++;
ptr++;
break;
......@@ -341,7 +365,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* args: <code> */
TRACE(("%8d: literal not %c\n", PTR(ptr), (SRE_CHAR) *pattern));
if (ptr >= end || *ptr == (SRE_CHAR) *pattern)
return 0;
goto failure;
pattern++;
ptr++;
break;
......@@ -350,7 +374,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* match anything */
TRACE(("%8d: any\n", PTR(ptr)));
if (ptr >= end)
return 0;
goto failure;
ptr++;
break;
......@@ -359,23 +383,47 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* args: <skip> <set> */
TRACE(("%8d: set %c\n", PTR(ptr), *ptr));
if (ptr >= end || !SRE_MEMBER(pattern + 1, *ptr))
return 0;
goto failure;
pattern += pattern[0];
ptr++;
break;
case SRE_OP_GROUP:
/* match backreference */
TRACE(("%8d: group %d\n", PTR(ptr), pattern[0]));
i = pattern[0];
{
/* FIXME: optimize size! */
/* FIXME: optimize! */
SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
TRACE(("%8d: group %p %p\n", PTR(ptr), p, e));
if (!p || !e || e < p)
return 0;
goto failure;
while (p < e) {
TRACE(("%8d: group test %c %c\n", PTR(ptr), *ptr, *p));
if (ptr >= end || *ptr != *p)
return 0;
goto failure;
p++; ptr++;
}
}
pattern++;
break;
case SRE_OP_GROUP_IGNORE:
/* match backreference */
TRACE(("%8d: group ignore %d\n", PTR(ptr), pattern[0]));
i = pattern[0];
{
/* FIXME: optimize! */
SRE_CHAR* p = (SRE_CHAR*) state->mark[i+i];
SRE_CHAR* e = (SRE_CHAR*) state->mark[i+i+1];
TRACE(("%8d: group %p %p\n", PTR(ptr), p, e));
if (!p || !e || e < p)
goto failure;
while (p < e) {
TRACE(("%8d: group test %c %c\n", PTR(ptr), *ptr, *p));
if (ptr >= end || SRE_TO_LOWER(*ptr) != SRE_TO_LOWER(*p))
goto failure;
p++; ptr++;
}
}
......@@ -385,7 +433,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case SRE_OP_LITERAL_IGNORE:
TRACE(("%8d: literal lower(%c)\n", PTR(ptr), (SRE_CHAR) *pattern));
if (ptr >= end || SRE_TO_LOWER(*ptr) != (SRE_CHAR) *pattern)
return 0;
goto failure;
pattern++;
ptr++;
break;
......@@ -394,7 +442,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: literal not lower(%c)\n", PTR(ptr),
(SRE_CHAR) *pattern));
if (ptr >= end || SRE_TO_LOWER(*ptr) == (SRE_CHAR) *pattern)
return 0;
goto failure;
pattern++;
ptr++;
break;
......@@ -403,7 +451,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: set lower(%c)\n", PTR(ptr), *ptr));
if (ptr >= end
|| !SRE_MEMBER(pattern+1, (SRE_CHAR) SRE_TO_LOWER(*ptr)))
return 0;
goto failure;
pattern += pattern[0];
ptr++;
break;
......@@ -412,6 +460,10 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* set mark */
/* args: <mark> */
TRACE(("%8d: set mark(%d)\n", PTR(ptr), pattern[0]));
if (!mark) {
mark = mark_data;
memcpy(mark, state->mark, sizeof(state->mark));
}
state->mark[pattern[0]] = ptr;
pattern++;
break;
......@@ -429,21 +481,18 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: match subpattern\n", PTR(ptr)));
state->ptr = ptr;
if (!SRE_MATCH(state, pattern + 1))
return 0;
goto failure;
pattern += pattern[0];
ptr = state->ptr;
break;
case SRE_OP_MAX_REPEAT_ONE:
/* match repeated sequence (maximizing regexp). this
variant only works if the repeated item is exactly one
character wide, and we're not already collecting
/* match repeated sequence (maximizing regexp) */
/* this variant only works if the repeated item is exactly
one character wide, and we're not already collecting
backtracking points. for other cases, use the
MAX_REPEAT operator instead */
/* args: <skip> <min> <max> <step> */
TRACE(("%8d: max repeat one {%d,%d}\n", PTR(ptr),
pattern[1], pattern[2]));
......@@ -454,7 +503,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
string, and backtrack from there */
/* FIXME: must look for line endings */
if (ptr + pattern[1] > end)
return 0; /* cannot match */
goto failure; /* cannot match */
count = pattern[2];
if (count > end - ptr)
count = end - ptr;
......@@ -515,7 +564,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
while (count < (int) pattern[2]) {
i = SRE_MATCH(state, pattern + 3);
if (i < 0)
return i;
goto failure;
if (i == 0)
break;
count++;
......@@ -529,23 +578,20 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
string. check if the rest of the pattern matches, and
backtrack if not. */
/* FIXME: <fl> this is a mess. fix it! */
TRACE(("%8d: repeat %d found\n", PTR(ptr), count));
if (count < (int) pattern[1])
return 0;
goto failure;
if (pattern[pattern[0]] == SRE_OP_SUCCESS) {
/* tail is empty. we're finished */
TRACE(("%8d: tail is empty\n", PTR(ptr)));
state->ptr = ptr;
return 1;
goto success;
} else if (pattern[pattern[0]] == SRE_OP_LITERAL) {
/* tail starts with a literal. we can speed things up
by skipping positions where the rest of the pattern
cannot possibly match */
/* tail starts with a literal. skip positions where
the rest of the pattern cannot possibly match */
SRE_CHAR chr = (SRE_CHAR) pattern[pattern[0]+1];
TRACE(("%8d: tail is literal %d\n", PTR(ptr), chr));
for (;;) {
......@@ -562,7 +608,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
i = SRE_MATCH(state, pattern + pattern[0]);
if (i > 0) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
return 1;
goto success;
}
TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
ptr--;
......@@ -570,23 +616,21 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
}
} else {
/* general case */
TRACE(("%8d: tail is pattern\n", PTR(ptr)));
while (count >= (int) pattern[1]) {
state->ptr = ptr;
i = SRE_MATCH(state, pattern + pattern[0]);
if (i > 0) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
return 1;
goto success;
}
TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
ptr--;
count--;
}
}
return 0; /* failure! */
/* ----------------------------------------------------------------------- */
/* FIXME: the following section is just plain broken */
goto failure;
case SRE_OP_MAX_REPEAT:
/* match repeated sequence (maximizing regexp). repeated
......@@ -611,7 +655,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
i = _stack_extend(state, stackbase + count + 1,
stackbase + pattern[2]);
if (i < 0)
return i;
goto failure;
}
state->stack[stackbase + count] = ptr;
/* check if we can match another item */
......@@ -642,7 +686,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
ptr points to the tail. */
if (count < (int) pattern[1])
return 0;
goto failure;
/* make sure that rest of the expression matches. if it
doesn't, backtrack */
......@@ -659,7 +703,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
state->stackbase = stackbase;
if (i > 0) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
return 1;
goto success;
}
/* backtrack! */
......@@ -673,10 +717,10 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
state->stackbase = stackbase;
if (i > 0) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
return 1;
goto success;
}
}
return 0; /* failure! */
goto failure;
case SRE_OP_MAX_UNTIL:
/* match repeated sequence (maximizing regexp). repeated
......@@ -684,13 +728,11 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE(("%8d: max until\n", PTR(ptr)));
state->ptr = ptr;
return 2; /* always succeeds, for now... */
/* end of totally broken section */
/* ----------------------------------------------------------------------- */
goto success; /* always succeeds, for now... */
case SRE_OP_MIN_REPEAT:
/* match repeated sequence (minimizing regexp) */
/* FIXME: HERE BE BUGS! */
TRACE(("%8d: min repeat %d %d\n", PTR(ptr),
pattern[1], pattern[2]));
count = 0;
......@@ -699,7 +741,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
while (count < (int) pattern[1]) {
i = SRE_MATCH(state, pattern + 3);
if (i <= 0)
return i;
goto failure;
count++;
}
/* move forward until the tail matches. */
......@@ -708,22 +750,22 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
i = SRE_MATCH(state, pattern + pattern[0]);
if (i > 0) {
TRACE(("%8d: repeat %d picked\n", PTR(ptr), count));
return 1;
goto success;
}
TRACE(("%8d: BACKTRACK\n", PTR(ptr)));
state->ptr = ptr; /* backtrack */
i = SRE_MATCH(state, pattern + 3);
if (i <= 0)
return i;
goto failure;
count++;
}
return 0; /* failure! */
goto failure;
case SRE_OP_MIN_UNTIL:
/* end of repeat group */
TRACE(("%8d: min until\n", PTR(ptr)));
state->ptr = ptr;
return 2; /* always succeeds, for now... */
goto success; /* always succeeds, for now... */
case SRE_OP_BRANCH:
/* match one of several subpatterns */
......@@ -737,13 +779,13 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
i = SRE_MATCH(state, pattern + 1);
if (i > 0) {
TRACE(("%8d: branch succeeded\n", PTR(ptr)));
return 1;
goto success;
}
}
pattern += *pattern;
}
TRACE(("%8d: branch failed\n", PTR(ptr)));
return 0; /* failure! */
goto failure;
case SRE_OP_REPEAT:
/* TEMPLATE: match repeated sequence (no backtracking) */
......@@ -758,7 +800,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
count++;
}
if (count <= (int) pattern[1])
return 0;
goto failure;
TRACE(("%8d: repeat %d matches\n", PTR(ptr), count));
pattern += pattern[0];
ptr = state->ptr;
......@@ -768,6 +810,14 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
return SRE_ERROR_ILLEGAL;
}
}
failure:
if (mark)
memcpy(state->mark, mark, sizeof(state->mark));
return 0;
success:
return 1;
}
LOCAL(int)
......@@ -832,6 +882,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
staticforward PyTypeObject Pattern_Type;
staticforward PyTypeObject Match_Type;
staticforward PyTypeObject Cursor_Type;
static PyObject *
_compile(PyObject* self_, PyObject* args)
......@@ -841,20 +892,25 @@ _compile(PyObject* self_, PyObject* args)
PatternObject* self;
PyObject* pattern;
int flags = 0;
PyObject* code;
int groups = 0;
PyObject* groupindex = NULL;
if (!PyArg_ParseTuple(args, "OO!|iO", &pattern,
&PyString_Type, &code, &groups, &groupindex))
if (!PyArg_ParseTuple(args, "OiO!|iO", &pattern, &flags,
&PyString_Type, &code,
&groups, &groupindex))
return NULL;
self = PyObject_New(PatternObject, &Pattern_Type);
self = PyObject_NEW(PatternObject, &Pattern_Type);
if (self == NULL)
return NULL;
Py_INCREF(pattern);
self->pattern = pattern;
self->flags = flags;
Py_INCREF(code);
self->code = code;
......@@ -872,6 +928,69 @@ _getcodesize(PyObject* self_, PyObject* args)
return Py_BuildValue("i", sizeof(SRE_CODE));
}
LOCAL(PyObject*)
_setup(SRE_STATE* state, PyObject* args)
{
/* prepare state object */
PyBufferProcs *buffer;
int i, count;
void* ptr;
PyObject* string;
int start = 0;
int end = INT_MAX;
if (!PyArg_ParseTuple(args, "O|ii", &string, &start, &end))
return NULL;
/* get pointer to string buffer */
buffer = string->ob_type->tp_as_buffer;
if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
buffer->bf_getsegcount(string, NULL) != 1) {
PyErr_SetString(PyExc_TypeError, "expected read-only buffer");
return NULL;
}
/* determine buffer size */
count = buffer->bf_getreadbuffer(string, 0, &ptr);
if (count < 0) {
/* sanity check */
PyErr_SetString(PyExc_TypeError, "buffer has negative size");
return NULL;
}
/* determine character size */
state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1);
count /= state->charsize;
/* adjust boundaries */
if (start < 0)
start = 0;
else if (start > count)
start = count;
if (end < 0)
end = 0;
else if (end > count)
end = count;
state->beginning = ptr;
state->start = (void*) ((char*) ptr + start * state->charsize);
state->end = (void*) ((char*) ptr + end * state->charsize);
/* FIXME: dynamic! */
for (i = 0; i < 64; i++)
state->mark[i] = NULL;
state->stack = NULL;
state->stackbase = 0;
state->stacksize = 0;
return string;
}
static PyObject*
_pattern_new_match(PatternObject* pattern, SRE_STATE* state,
PyObject* string, int status)
......@@ -886,7 +1005,7 @@ _pattern_new_match(PatternObject* pattern, SRE_STATE* state,
if (status > 0) {
/* create match object (with room for extra group marks) */
match = PyObject_NewVar(MatchObject, &Match_Type, 2*pattern->groups);
match = PyObject_NEW_VAR(MatchObject, &Match_Type, 2*pattern->groups);
if (match == NULL)
return NULL;
......@@ -930,70 +1049,32 @@ _pattern_new_match(PatternObject* pattern, SRE_STATE* state,
return Py_None;
}
/* -------------------------------------------------------------------- */
/* pattern methods */
LOCAL(PyObject*)
_setup(SRE_STATE* state, PyObject* args)
static PyObject*
_pattern_cursor(PyObject* pattern, PyObject* args)
{
/* prepare state object */
PyBufferProcs *buffer;
int i, count;
void* ptr;
/* create search state object */
CursorObject* self;
PyObject* string;
int start = 0;
int end = INT_MAX;
if (!PyArg_ParseTuple(args, "O|ii", &string, &start, &end))
return NULL;
/* get pointer to string buffer */
buffer = string->ob_type->tp_as_buffer;
if (!buffer || !buffer->bf_getreadbuffer || !buffer->bf_getsegcount ||
buffer->bf_getsegcount(string, NULL) != 1) {
PyErr_SetString(PyExc_TypeError, "expected read-only buffer");
/* create match object (with room for extra group marks) */
self = PyObject_NEW(CursorObject, &Cursor_Type);
if (self == NULL)
return NULL;
}
/* determine buffer size */
count = buffer->bf_getreadbuffer(string, 0, &ptr);
if (count < 0) {
/* sanity check */
PyErr_SetString(PyExc_TypeError, "buffer has negative size");
string = _setup(&self->state, args);
if (!string) {
/* FIXME: dealloc cursor object */
return NULL;
}
/* determine character size */
state->charsize = (PyUnicode_Check(string) ? sizeof(Py_UNICODE) : 1);
count /= state->charsize;
/* adjust boundaries */
if (start < 0)
start = 0;
else if (start > count)
start = count;
if (end < 0)
end = 0;
else if (end > count)
end = count;
state->beginning = ptr;
state->start = (void*) ((char*) ptr + start * state->charsize);
state->end = (void*) ((char*) ptr + end * state->charsize);
/* FIXME: dynamic! */
for (i = 0; i < 64; i++)
state->mark[i] = NULL;
Py_INCREF(pattern);
self->pattern = pattern;
state->stack = NULL;
state->stackbase = 0;
state->stacksize = 0;
Py_INCREF(string);
self->string = string;
return string;
return (PyObject*) self;
}
static void
......@@ -1002,7 +1083,7 @@ _pattern_dealloc(PatternObject* self)
Py_XDECREF(self->code);
Py_XDECREF(self->pattern);
Py_XDECREF(self->groupindex);
PyObject_Del(self);
PyMem_DEL(self);
}
static PyObject*
......@@ -1052,11 +1133,71 @@ _pattern_search(PatternObject* self, PyObject* args)
}
static PyObject*
_pattern_findall(PatternObject* self, PyObject* args)
call(char* function, PyObject* args)
{
PyObject* name;
PyObject* module;
PyObject* func;
PyObject* result;
name = PyString_FromString("sre");
if (!name)
return NULL;
module = PyImport_Import(name);
Py_DECREF(name);
if (!module)
return NULL;
func = PyObject_GetAttrString(module, function);
Py_DECREF(module);
if (!func)
return NULL;
result = PyObject_CallObject(func, args);
Py_DECREF(func);
Py_DECREF(args);
return result;
}
static PyObject*
_pattern_sub(PatternObject* self, PyObject* args)
{
PyObject* template;
PyObject* string;
PyObject* count;
if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
return NULL;
/* delegate to Python code */
return call("_sub", Py_BuildValue("OOOO", self, template, string, count));
}
static PyObject*
_pattern_subn(PatternObject* self, PyObject* args)
{
PyObject* template;
PyObject* string;
PyObject* count;
if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
return NULL;
/* delegate to Python code */
return call("_subn", Py_BuildValue("OOOO", self, template, string, count));
}
static PyObject*
_pattern_split(PatternObject* self, PyObject* args)
{
/* FIXME: not sure about the semantics here. this is good enough
for SXP, though... */
PyObject* string;
PyObject* maxsplit;
if (!PyArg_ParseTuple(args, "OO", &string, &maxsplit))
return NULL;
/* delegate to Python code */
return call("_split", Py_BuildValue("OOO", self, string, maxsplit));
}
static PyObject*
_pattern_findall(PatternObject* self, PyObject* args)
{
SRE_STATE state;
PyObject* string;
PyObject* list;
......@@ -1085,6 +1226,10 @@ _pattern_findall(PatternObject* self, PyObject* args)
if (status == 0)
state.ptr = (void*) ((char*) state.start + 1);
/* FIXME: if one group is defined, slice that group
instead. if multiple groups are defined, add tuple
containing all slices */
item = PySequence_GetSlice(
string,
((char*) state.start - (char*) state.beginning),
......@@ -1121,7 +1266,12 @@ error:
static PyMethodDef _pattern_methods[] = {
{"match", (PyCFunction) _pattern_match, 1},
{"search", (PyCFunction) _pattern_search, 1},
{"sub", (PyCFunction) _pattern_sub, 1},
{"subn", (PyCFunction) _pattern_subn, 1},
{"split", (PyCFunction) _pattern_split, 1},
{"findall", (PyCFunction) _pattern_findall, 1},
/* experimental */
{"cursor", (PyCFunction) _pattern_cursor, 1},
{NULL, NULL}
};
......@@ -1143,6 +1293,14 @@ _pattern_getattr(PatternObject* self, char* name)
return self->pattern;
}
if (!strcmp(name, "flags"))
return Py_BuildValue("i", self->flags);
if (!strcmp(name, "groupindex") && self->groupindex) {
Py_INCREF(self->groupindex);
return self->groupindex;
}
PyErr_SetString(PyExc_AttributeError, name);
return NULL;
}
......@@ -1163,7 +1321,7 @@ _match_dealloc(MatchObject* self)
{
Py_XDECREF(self->string);
Py_DECREF(self->pattern);
PyObject_Del(self);
PyMem_DEL(self);
}
static PyObject*
......@@ -1244,6 +1402,8 @@ _match_groups(MatchObject* self, PyObject* args)
PyObject* result;
int index;
/* FIXME: <fl> handle default value! */
result = PyTuple_New(self->groups-1);
if (!result)
return NULL;
......@@ -1269,6 +1429,8 @@ _match_groupdict(MatchObject* self, PyObject* args)
PyObject* keys;
int index;
/* FIXME: <fl> handle default value! */
result = PyDict_New();
if (!result)
return NULL;
......@@ -1367,7 +1529,8 @@ _match_span(MatchObject* self, PyObject* args)
if (self->mark[index*2] < 0) {
Py_INCREF(Py_None);
return Py_None;
Py_INCREF(Py_None);
return Py_BuildValue("OO", Py_None, Py_None);
}
return Py_BuildValue("ii", self->mark[index*2], self->mark[index*2+1]);
......@@ -1394,24 +1557,20 @@ _match_getattr(MatchObject* self, char* name)
PyErr_Clear();
/* attributes! */
/* attributes */
if (!strcmp(name, "string")) {
Py_INCREF(self->string);
return self->string;
}
if (!strcmp(name, "regs"))
/* FIXME: should return the whole list! */
return Py_BuildValue("((i,i))", self->mark[0], self->mark[1]);
if (!strcmp(name, "re")) {
Py_INCREF(self->pattern);
return (PyObject*) self->pattern;
}
if (!strcmp(name, "groupindex") && self->pattern->groupindex) {
Py_INCREF(self->pattern->groupindex);
return self->pattern->groupindex;
}
if (!strcmp(name, "pos"))
return Py_BuildValue("i", 0); /* FIXME */
if (!strcmp(name, "endpos"))
return Py_BuildValue("i", 0); /* FIXME */
......@@ -1432,6 +1591,106 @@ statichere PyTypeObject Match_Type = {
(getattrfunc)_match_getattr, /*tp_getattr*/
};
/* -------------------------------------------------------------------- */
/* cursor methods (experimental) */
static void
_cursor_dealloc(CursorObject* self)
{
_stack_free(&self->state);
Py_DECREF(self->string);
Py_DECREF(self->pattern);
PyMem_DEL(self);
}
static PyObject*
_cursor_match(CursorObject* self, PyObject* args)
{
SRE_STATE* state = &self->state;
PyObject* match;
int status;
state->ptr = state->start;
if (state->charsize == 1) {
status = sre_match(state, PatternObject_GetCode(self->pattern));
} else {
status = sre_umatch(state, PatternObject_GetCode(self->pattern));
}
match = _pattern_new_match((PatternObject*) self->pattern,
state, self->string, status);
if (status >= 0)
state->start = state->ptr;
else
state->start = (char*) state->ptr + state->charsize;
return match;
}
static PyObject*
_cursor_search(CursorObject* self, PyObject* args)
{
SRE_STATE* state = &self->state;
PyObject* match;
int status;
state->ptr = state->start;
if (state->charsize == 1) {
status = sre_search(state, PatternObject_GetCode(self->pattern));
} else {
status = sre_usearch(state, PatternObject_GetCode(self->pattern));
}
match = _pattern_new_match((PatternObject*) self->pattern,
state, self->string, status);
if (status >= 0)
state->start = state->ptr;
return match;
}
static PyMethodDef _cursor_methods[] = {
{"match", (PyCFunction) _cursor_match, 0},
{"search", (PyCFunction) _cursor_search, 0},
{NULL, NULL}
};
static PyObject*
_cursor_getattr(CursorObject* self, char* name)
{
PyObject* res;
res = Py_FindMethod(_cursor_methods, (PyObject*) self, name);
if (res)
return res;
PyErr_Clear();
/* attributes */
if (!strcmp(name, "pattern")) {
Py_INCREF(self->pattern);
return self->pattern;
}
PyErr_SetString(PyExc_AttributeError, name);
return NULL;
}
statichere PyTypeObject Cursor_Type = {
PyObject_HEAD_INIT(NULL)
0, "Cursor",
sizeof(CursorObject), /* size of basic object */
0,
(destructor)_cursor_dealloc, /*tp_dealloc*/
0, /*tp_print*/
(getattrfunc)_cursor_getattr, /*tp_getattr*/
};
static PyMethodDef _functions[] = {
{"compile", _compile, 1},
{"getcodesize", _getcodesize, 1},
......@@ -1445,7 +1704,8 @@ __declspec(dllexport)
init_sre()
{
/* Patch object types */
Pattern_Type.ob_type = Match_Type.ob_type = &PyType_Type;
Pattern_Type.ob_type = Match_Type.ob_type =
Cursor_Type.ob_type = &PyType_Type;
Py_InitModule("_sre", _functions);
}
......
......@@ -14,17 +14,18 @@
#include "sre_constants.h"
/* Python objects */
typedef struct {
PyObject_HEAD
PyObject* code; /* link to the code string object */
PyObject* pattern; /* link to the pattern source (or None) */
int groups;
PyObject* groupindex;
/* compatibility */
PyObject* pattern; /* pattern source (or None) */
int flags; /* flags used when compiling pattern source */
} PatternObject;
#define PatternObject_GetCode(o) ((void*) PyString_AS_STRING((o)->code))
#define PatternObject_GetCode(o)\
((void*) PyString_AS_STRING(((PatternObject*)(o))->code))
typedef struct {
PyObject_HEAD
......@@ -34,5 +35,28 @@ typedef struct {
int mark[2];
} MatchObject;
#endif
typedef struct {
/* string pointers */
void* ptr; /* current position (also end of current slice) */
void* beginning; /* start of original string */
void* start; /* start of current slice */
void* end; /* end of original string */
/* character size */
int charsize;
/* registers */
int marks;
void* mark[64]; /* FIXME: <fl> should be dynamically allocated! */
/* backtracking stack */
void** stack;
int stacksize;
int stackbase;
} SRE_STATE;
typedef struct {
PyObject_HEAD
PyObject* pattern;
PyObject* string;
SRE_STATE state;
} CursorObject;
#endif
/* generated by sre_constants.py */
/* generated from sre_constants.py */
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
......@@ -25,3 +25,25 @@
#define SRE_OP_NEGATE 23
#define SRE_OP_RANGE 24
#define SRE_OP_REPEAT 25
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BOUNDARY 2
#define SRE_AT_NON_BOUNDARY 3
#define SRE_AT_END 4
#define SRE_AT_END_LINE 5
#define SRE_CATEGORY_DIGIT 0
#define SRE_CATEGORY_NOT_DIGIT 1
#define SRE_CATEGORY_SPACE 2
#define SRE_CATEGORY_NOT_SPACE 3
#define SRE_CATEGORY_WORD 4
#define SRE_CATEGORY_NOT_WORD 5
#define SRE_CATEGORY_LINEBREAK 6
#define SRE_CATEGORY_NOT_LINEBREAK 7
#define SRE_CATEGORY_LOC_DIGIT 8
#define SRE_CATEGORY_LOC_NOT_DIGIT 9
#define SRE_CATEGORY_LOC_SPACE 10
#define SRE_CATEGORY_LOC_NOT_SPACE 11
#define SRE_CATEGORY_LOC_WORD 12
#define SRE_CATEGORY_LOC_NOT_WORD 13
#define SRE_CATEGORY_LOC_LINEBREAK 14
#define SRE_CATEGORY_LOC_NOT_LINEBREAK 15
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment