Commit 004c1e1d authored by Guido van Rossum's avatar Guido van Rossum

Latest from Jeffrey Ollie.

Infinite failure stack, some bugs fixed (fastmap, star_jump, register bug).
parent 1681429b
/*
regexpr.c
Author: Tatu Ylonen <ylo@ngs.fi>
Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
Permission to use, copy, modify, distribute, and sell this software
and its documentation for any purpose is hereby granted without fee,
provided that the above copyright notice appear in all copies. This
software is provided "as is" without express or implied warranty.
Created: Thu Sep 26 17:14:05 1991 ylo
Last modified: Mon Nov 4 17:06:48 1991 ylo
Ported to Think C: 19 Jan 1992 guido@cwi.nl
This code draws many ideas from the regular expression packages by
Henry Spencer of the University of Toronto and Richard Stallman of the
Free Software Foundation.
Emacs-specific code and syntax table code is almost directly borrowed
from GNU regexp.
*/
/* regexpr.c
*
* Author: Tatu Ylonen <ylo@ngs.fi>
*
* Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
*
* Permission to use, copy, modify, distribute, and sell this software
* and its documentation for any purpose is hereby granted without
* fee, provided that the above copyright notice appear in all copies.
* This software is provided "as is" without express or implied
* warranty.
*
* Created: Thu Sep 26 17:14:05 1991 ylo
* Last modified: Mon Nov 4 17:06:48 1991 ylo
* Ported to Think C: 19 Jan 1992 guido@cwi.nl
*
* This code draws many ideas from the regular expression packages by
* Henry Spencer of the University of Toronto and Richard Stallman of
* the Free Software Foundation.
*
* Emacs-specific code and syntax table code is almost directly borrowed
* from GNU regexp.
*
* Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
* 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
* Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
* probably one or two others that I'm forgetting.
*
* $Id$ */
#include "config.h" /* For Win* specific redefinition of printf c.s. */
#include "myproto.h" /* For Py_PROTO macro --Guido */
#include "myproto.h" /* For PROTO macro --Guido */
#include <stdio.h>
#ifndef NDEBUG
#define NDEBUG 1
#endif
#include <assert.h>
#include "regexpr.h"
......@@ -48,8 +57,316 @@ char *realloc();
#endif /* __STDC__ */
#endif /* THINK_C */
#define MACRO_BEGIN do {
#define MACRO_END } while (0)
/* The stack implementation is taken from an idea by Andrew Kuchling.
* It's a doubly linked list of arrays. The advantage of this over a
* simple linked list is that the number of mallocs required is
* reduced. It also makes it possible to statically allocate enough
* space so that small patterns don't ever need to call malloc.
*
* The advantage over a single array that is periodically realloced
* when more space is needed is that we avoid ever copying the
* stack. */
/* item_t is the basic stack element. Defined as a union of
* structures so that registers, failure points, and counters can all
* be pushed/popped from the same stack. There's nothing built into the
* item to keep track of whether a certain stack item is a register, a
* failure point, or a counter; the code that pops an item must already
* know which member to read. */
typedef union item_t
{
/* Saved contents of one register (written by SET_REG_START and
* SET_REG_END, restored by POP_FAILURE). */
struct
{
int num; /* register number: index into the start/end/changed arrays */
int level; /* value of changed[num] before the register was overwritten */
char *start; /* saved start[num] */
char *end; /* saved end[num] */
} reg;
/* A failure (backtrack) point (written by PUSH_FAILURE and
* UPDATE_FAILURE, consumed by POP_FAILURE). */
struct
{
int count; /* match_state.count at the time of the push */
int level; /* match_state.level at the time of the push */
int phantom; /* 0 when pushed by PUSH_FAILURE, 1 when pushed by UPDATE_FAILURE */
char *code; /* code pointer handed back by POP_FAILURE */
char *text; /* text pointer handed back by POP_FAILURE; entries whose text is NULL are skipped there */
} fail;
/* A counter. NOTE(review): none of the stack macros defined next to
* this type read or write this member -- confirm where it is used. */
struct
{
int num;
int level;
int count;
} cntr;
} item_t;
#define STACK_PAGE_SIZE 256
#define NUM_REGISTERS 256
/* A 'page' of stack items: a fixed-size array of STACK_PAGE_SIZE items
* plus the two links that chain pages into a doubly linked list. */
typedef struct item_page_t
{
item_t items[STACK_PAGE_SIZE];
struct item_page_t *prev; /* older page; NULL on the first page */
struct item_page_t *next; /* newer page, linked in on demand by STACK_NEXT; NULL if none yet */
} item_page_t;
/* Backtracking state: the failure stack plus the current contents of
* every register. Initialise with NEW_STATE and release with
* FREE_STATE. */
typedef struct match_state
{
/* Structure to encapsulate the stack. */
struct
{
/* Index into the current page. It names the slot the next push
* will use, so the top item is at index - 1. If index == 0 and
* you need to pop an item, move to the previous page and set
* index = STACK_PAGE_SIZE - 1. Otherwise decrement index to
* pop an item. If index == STACK_PAGE_SIZE and you need to
* push an item, move to the next page and set index = 0. If
* there is no next page, allocate a new page and link it in.
* Otherwise, increment index to push an item. */
int index;
item_page_t *current; /* Pointer to the current page. */
item_page_t first; /* First page is statically allocated. */
} stack;
/* Current start and end pointer of each register. */
char *start[NUM_REGISTERS];
char *end[NUM_REGISTERS];
/* For each register, the value of 'level' at which its contents were
* last saved on the stack (see SET_REG_START / SET_REG_END). */
int changed[NUM_REGISTERS];
/* The number of registers that have been pushed onto the stack
* since the last failure point. */
int count;
/* Used to control when registers need to be pushed onto the
* stack. */
int level;
/* The number of failure points on the stack. */
int point;
} match_state;
/* Discard the top 'count' stack items without looking at them.
*
* 'on_error' is a statement that is executed when the discard would run
* off the bottom of the stack (the current page has no previous page).
* It must transfer control (goto/return): if it falls through, the
* expansion goes on to follow the NULL 'prev' pointer.
*
* The expansion is two statements that are not wrapped in a block, so
* use this macro only inside a brace-enclosed statement list. */
#define STACK_DISCARD(stack, count, on_error) \
stack.index -= count; \
while (stack.index < 0) \
{ \
if (stack.current->prev == NULL) \
on_error; \
stack.current = stack.current->prev; \
stack.index += STACK_PAGE_SIZE; \
}
/* Pop one item: step the stack back by one slot and store a pointer to
* the item that was on top in 'top'. The item itself is left in place,
* so the pointer stays usable until the next push overwrites it.
*
* 'on_error' is executed when the stack is already empty (index 0 on a
* page without a previous page). It must transfer control; if it falls
* through, the expansion follows the NULL 'prev' pointer.
*
* The expansion is an if/else followed by a separate assignment and is
* not wrapped in a block, so use it only inside a brace-enclosed
* statement list. */
#define STACK_PREV(stack, top, on_error) \
if (stack.index == 0) \
{ \
if (stack.current->prev == NULL) \
on_error; \
stack.current = stack.current->prev; \
stack.index = STACK_PAGE_SIZE - 1; \
} \
else \
stack.index--; \
top = &(stack.current->items[stack.index])
/* Push one item: reserve the next free slot and store a pointer to it
* in 'top'. The slot is not initialised; the caller fills it in.
*
* When the current page is full the macro moves to the next page,
* allocating and linking a new one with malloc if none exists yet.
* 'on_error' is executed when that malloc fails. It must transfer
* control; if it falls through, the expansion dereferences the NULL
* 'next' pointer.
*
* The expansion is an if statement followed by a separate assignment
* and is not wrapped in a block, so use it only inside a brace-enclosed
* statement list. */
#define STACK_NEXT(stack, top, on_error) \
if (stack.index == STACK_PAGE_SIZE) \
{ \
if (stack.current->next == NULL) \
{ \
stack.current->next = malloc(sizeof(item_page_t)); \
if (stack.current->next == NULL) \
on_error; \
stack.current->next->prev = stack.current; \
stack.current->next->next = NULL; \
} \
stack.current = stack.current->next; \
stack.index = 0; \
} \
top = &(stack.current->items[stack.index++])
/* Store a pointer to the item that is 'count' items back in the
* stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
* STACK_TOP(stack, top, on_error). The stack itself is not modified.
*
* 'on_error' is executed when fewer than 'count' items are on the
* stack. It must transfer control; if it falls through, the expansion
* follows the NULL 'prev' pointer.
*
* The expansion is a block that declares locals named 'index' and
* 'current', so the 'top' and 'count' arguments must not refer to
* variables with those names. */
#define STACK_BACK(stack, top, count, on_error) \
{ \
int index; \
item_page_t *current; \
current = stack.current; \
index = stack.index - (count); \
while (index < 0) \
{ \
if (current->prev == NULL) \
on_error; \
current = current->prev; \
index += STACK_PAGE_SIZE; \
} \
top = &(current->items[index]); \
}
/* Store a pointer to the top item on the stack in 'top' without
* popping it. Execute the 'on_error' code if there are no items on the
* stack. 'on_error' must transfer control; if it falls through, the
* expansion dereferences the NULL 'prev' pointer.
*
* When index is 0 the top item is the last slot of the previous page.
*
* The expansion is a bare if/else, so a following 'else' at the call
* site would bind to it; use it only as a complete statement inside a
* brace-enclosed statement list. */
#define STACK_TOP(stack, top, on_error) \
if (stack.index == 0) \
{ \
if (stack.current->prev == NULL) \
on_error; \
top = &(stack.current->prev->items[STACK_PAGE_SIZE - 1]); \
} \
else \
top = &(stack.current->items[stack.index - 1])
/* Test to see if the stack is empty: true only when the index is 0 and
* the current page has no previous page. Expands to an expression. */
#define STACK_EMPTY(stack) ((stack.index == 0) && \
(stack.current->prev == NULL))
/* Initialize a state object: zero the whole match_state (including the
* built-in first stack page), point the stack at that first page and
* start at level 1.
*
* Expands to three statements that are not wrapped in a block, so use
* it only inside a brace-enclosed statement list. */
#define NEW_STATE(state) \
memset(&state, 0, sizeof(match_state)); \
state.stack.current = &state.stack.first; \
state.level = 1
/* Free any memory that might have been malloc'd: every stack page after
* the built-in first page is unlinked and freed.
*
* state.stack.current is used as the scratch pointer, so if any page
* was freed it is left pointing at freed memory; the state must be
* re-initialised with NEW_STATE before it is used again. */
#define FREE_STATE(state) \
while(state.stack.first.next != NULL) \
{ \
state.stack.current = state.stack.first.next; \
state.stack.first.next = state.stack.current->next; \
free(state.stack.current); \
}
/* Return the start of register 'reg' (the current value, not a saved
* one from the stack). */
#define GET_REG_START(state, reg) (state.start[reg])
/* Return the end of register 'reg' (the current value, not a saved one
* from the stack). */
#define GET_REG_END(state, reg) (state.end[reg])
/* Set the start of register 'reg'. If the state of the register needs
* saving, push it on the stack.
*
* The register is saved when its 'changed' level is below the current
* level, i.e. it has not been saved yet at this level. The saved item
* holds the old start, the old end and the old 'changed' value; the
* register is then marked as changed at the current level and
* state.count is incremented.
*
* 'on_error' is passed to STACK_NEXT and is executed if a new stack
* page cannot be allocated. 'reg' is evaluated several times.
*
* The expansion is an if statement followed by a separate assignment
* and is not wrapped in a block, so use it only inside a brace-enclosed
* statement list. */
#define SET_REG_START(state, reg, text, on_error) \
if(state.changed[reg] < state.level) \
{ \
item_t *item; \
STACK_NEXT(state.stack, item, on_error); \
item->reg.num = reg; \
item->reg.start = state.start[reg]; \
item->reg.end = state.end[reg]; \
item->reg.level = state.changed[reg]; \
state.changed[reg] = state.level; \
state.count++; \
} \
state.start[reg] = text
/* Set the end of register 'reg'. If the state of the register needs
* saving, push it on the stack.
*
* The save logic is the same as in SET_REG_START: when the register's
* 'changed' level is below the current level, its old start, end and
* 'changed' value are pushed, the register is marked as changed at the
* current level and state.count is incremented.
*
* 'on_error' is passed to STACK_NEXT and is executed if a new stack
* page cannot be allocated. 'reg' is evaluated several times.
*
* The expansion is an if statement followed by a separate assignment
* and is not wrapped in a block, so use it only inside a brace-enclosed
* statement list. */
#define SET_REG_END(state, reg, text, on_error) \
if(state.changed[reg] < state.level) \
{ \
item_t *item; \
STACK_NEXT(state.stack, item, on_error); \
item->reg.num = reg; \
item->reg.start = state.start[reg]; \
item->reg.end = state.end[reg]; \
item->reg.level = state.changed[reg]; \
state.changed[reg] = state.level; \
state.count++; \
} \
state.end[reg] = text
/* Push a new failure point that resumes at code 'xcode' with text
* position 'xtext'.
*
* The item records the current state.count and state.level and is
* marked as non-phantom. Afterwards the count of saved registers is
* reset to 0, and both the level and the number of failure points are
* incremented.
*
* 'on_error' is passed to STACK_NEXT and is executed if a new stack
* page cannot be allocated. */
#define PUSH_FAILURE(state, xcode, xtext, on_error) \
{ \
item_t *item; \
STACK_NEXT(state.stack, item, on_error); \
item->fail.code = xcode; \
item->fail.text = xtext; \
item->fail.count = state.count; \
item->fail.level = state.level; \
item->fail.phantom = 0; \
state.count = 0; \
state.level++; \
state.point++; \
}
/* Update the last failure point with a new position in the text.
*
* The two commented-out definitions below are earlier versions that
* were left in the file; only the third definition is active. */
/* #define UPDATE_FAILURE(state, xtext, on_error) \ */
/* { \ */
/* item_t *item; \ */
/* STACK_DISCARD(state.stack, state.count, on_error); \ */
/* STACK_TOP(state.stack, item, on_error); \ */
/* item->fail.text = xtext; \ */
/* state.count = 0; \ */
/* } */
/* #define UPDATE_FAILURE(state, xtext, on_error) \ */
/* { \ */
/* item_t *item; \ */
/* STACK_BACK(state.stack, item, state.count + 1, on_error); \ */
/* item->fail.text = xtext; \ */
/* } */
/* Active definition.
*
* The most recent failure point lies state.count + 1 items back,
* beneath the registers saved since it was pushed.
*
* - If that failure point is not a phantom, it is left untouched and a
*   new phantom failure point is pushed on top: same code pointer, text
*   'xtext', current count and level. The count is then reset and the
*   level and number of failure points are incremented.
* - If it is already a phantom, the registers saved since it was pushed
*   are discarded and its text pointer is overwritten with 'xtext'. The
*   count is reset and the level is incremented; the number of failure
*   points is unchanged.
*
* 'on_error' is passed to the stack macros and is executed on stack
* underflow or when a new stack page cannot be allocated. */
#define UPDATE_FAILURE(state, xtext, on_error) \
{ \
item_t *item; \
STACK_BACK(state.stack, item, state.count + 1, on_error); \
if (!item->fail.phantom) \
{ \
item_t *item2; \
STACK_NEXT(state.stack, item2, on_error); \
item2->fail.code = item->fail.code; \
item2->fail.text = xtext; \
item2->fail.count = state.count; \
item2->fail.level = state.level; \
item2->fail.phantom = 1; \
state.count = 0; \
state.level++; \
state.point++; \
} \
else \
{ \
STACK_DISCARD(state.stack, state.count, on_error); \
STACK_TOP(state.stack, item, on_error); \
item->fail.text = xtext; \
state.count = 0; \
state.level++; \
} \
}
/* Backtrack to the most recent usable failure point.
*
* Each round first pops the state.count register items saved since the
* last failure point and restores those registers (start, end and
* 'changed' level). It then pops the failure item itself, stores its
* code and text pointers in 'xcode' and 'xtext', restores state.count
* and state.level from it and decrements the number of failure points.
* Rounds repeat while the popped failure item has a NULL text pointer,
* so such entries are skipped.
*
* 'on_empty' is executed when there is no failure item left to pop;
* 'on_error' is executed when the stack underflows while restoring
* registers. Both must transfer control, because STACK_PREV goes on to
* follow a NULL page pointer if they fall through. */
#define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
{ \
item_t *item; \
do \
{ \
while(state.count > 0) \
{ \
STACK_PREV(state.stack, item, on_error); \
state.start[item->reg.num] = item->reg.start; \
state.end[item->reg.num] = item->reg.end; \
state.changed[item->reg.num] = item->reg.level; \
state.count--; \
} \
STACK_PREV(state.stack, item, on_empty); \
xcode = item->fail.code; \
xtext = item->fail.text; \
state.count = item->fail.count; \
state.level = item->fail.level; \
state.point--; \
} \
while (item->fail.text == NULL); \
}
enum regexp_compiled_ops /* opcodes for compiled regexp */
{
......@@ -73,9 +390,6 @@ enum regexp_compiled_ops /* opcodes for compiled regexp */
Cwordend, /* match at end of word */
Cwordbound, /* match if at word boundary */
Cnotwordbound, /* match if not at word boundary */
#ifdef emacs
Cemacs_at_dot, /* emacs only: matches at dot */
#endif /* emacs */
Csyntaxspec, /* matches syntax code (1 byte follows) */
Cnotsyntaxspec /* matches if syntax code does not match (1 byte foll)*/
};
......@@ -106,11 +420,6 @@ enum regexp_syntax_op /* syntax codes for plain and quoted characters */
Rwordend, /* end of word */
Rwordbound, /* word bound */
Rnotwordbound, /* not word bound */
#ifdef emacs
Remacs_at_dot, /* emacs: at dot */
Remacs_syntaxspec, /* syntaxspec */
Remacs_notsyntaxspec, /* notsyntaxspec */
#endif /* emacs */
Rnum_ops
};
......@@ -126,38 +435,15 @@ static int regexp_ansi_sequences;
#define NUM_LEVELS 5 /* number of precedence levels in use */
#define MAX_NESTING 100 /* max nesting level of operators */
#ifdef emacs
/* This code is for emacs compatibility only. */
#include "config.h"
#include "lisp.h"
#include "buffer.h"
#include "syntax.h"
/* emacs defines NULL in some strange way? */
#undef NULL
#define NULL 0
#else /* emacs */
#define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
#define Sword 1
#ifdef SYNTAX_TABLE
char *re_syntax_table;
#else
static char re_syntax_table[256];
#endif /* SYNTAX_TABLE */
#endif /* emacs */
static void re_compile_initialize Py_PROTO((void));
static void re_compile_initialize()
static void re_compile_initialize(void)
{
int a;
#if !defined(emacs) && !defined(SYNTAX_TABLE)
static int syntax_table_inited = 0;
if (!syntax_table_inited)
......@@ -171,7 +457,6 @@ static void re_compile_initialize()
for (a = '0'; a <= '9'; a++)
re_syntax_table[a] = Sword;
}
#endif /* !emacs && !SYNTAX_TABLE */
re_compile_initialized = 1;
for (a = 0; a < 256; a++)
{
......@@ -214,11 +499,6 @@ static void re_compile_initialize()
regexp_plain_ops['.'] = Ranychar;
if (!(regexp_syntax & RE_NO_GNU_EXTENSIONS))
{
#ifdef emacs
regexp_quoted_ops['='] = Remacs_at_dot;
regexp_quoted_ops['s'] = Remacs_syntaxspec;
regexp_quoted_ops['S'] = Remacs_notsyntaxspec;
#endif /* emacs */
regexp_quoted_ops['w'] = Rwordchar;
regexp_quoted_ops['W'] = Rnotwordchar;
regexp_quoted_ops['<'] = Rwordbeg;
......@@ -250,8 +530,7 @@ static void re_compile_initialize()
regexp_ansi_sequences = (regexp_syntax & RE_ANSI_HEX) != 0;
}
int re_set_syntax(syntax)
int syntax;
int re_set_syntax(int syntax)
{
int ret;
......@@ -262,9 +541,7 @@ int syntax;
return ret;
}
static int hex_char_to_decimal Py_PROTO((int));
static int hex_char_to_decimal(ch)
int ch;
static int hex_char_to_decimal(int ch)
{
if (ch >= '0' && ch <= '9')
return ch - '0';
......@@ -275,224 +552,659 @@ int ch;
return 16;
}
char *re_compile_pattern(regex, size, bufp)
char *regex;
int size;
regexp_t bufp;
static void re_compile_fastmap_aux(char *code,
int pos,
char *visited,
char *can_be_null,
char *fastmap)
{
int a, pos, op, current_level, level, opcode;
int pattern_offset = 0, alloc;
int starts[NUM_LEVELS * MAX_NESTING], starts_base;
int future_jumps[MAX_NESTING], num_jumps;
unsigned char ch = '\0';
char *pattern, *translate;
int next_register, paren_depth, num_open_registers, open_registers[RE_NREGS];
int beginning_context;
#define NEXTCHAR(var) \
MACRO_BEGIN \
if (pos >= size) \
goto ends_prematurely; \
(var) = regex[pos]; \
pos++; \
MACRO_END
#define ALLOC(amount) \
MACRO_BEGIN \
if (pattern_offset+(amount) > alloc) \
{ \
alloc += 256 + (amount); \
pattern = realloc(pattern, alloc); \
if (!pattern) \
goto out_of_memory; \
} \
MACRO_END
#define STORE(ch) pattern[pattern_offset++] = (ch)
#define CURRENT_LEVEL_START (starts[starts_base + current_level])
#define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
#define PUSH_LEVEL_STARTS if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
starts_base += NUM_LEVELS; \
else \
goto too_complex
#define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
#define PUT_ADDR(offset,addr) \
MACRO_BEGIN \
int disp = (addr) - (offset) - 2; \
pattern[(offset)] = disp & 0xff; \
pattern[(offset)+1] = (disp>>8) & 0xff; \
MACRO_END
#define INSERT_JUMP(pos,type,addr) \
MACRO_BEGIN \
int a, p = (pos), t = (type), ad = (addr); \
for (a = pattern_offset - 1; a >= p; a--) \
pattern[a + 3] = pattern[a]; \
pattern[p] = t; \
PUT_ADDR(p+1,ad); \
pattern_offset += 3; \
MACRO_END
#define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
#define SET_FIELDS \
MACRO_BEGIN \
bufp->allocated = alloc; \
bufp->buffer = pattern; \
bufp->used = pattern_offset; \
MACRO_END
#define GETHEX(var) \
MACRO_BEGIN \
char gethex_ch, gethex_value; \
NEXTCHAR(gethex_ch); \
gethex_value = hex_char_to_decimal(gethex_ch); \
if (gethex_value == 16) \
goto hex_error; \
NEXTCHAR(gethex_ch); \
gethex_ch = hex_char_to_decimal(gethex_ch); \
if (gethex_ch == 16) \
goto hex_error; \
(var) = gethex_value * 16 + gethex_ch; \
MACRO_END
#define ANSI_TRANSLATE(ch) \
MACRO_BEGIN \
switch (ch) \
{ \
case 'a': \
case 'A': \
ch = 7; /* audible bell */ \
break; \
case 'b': \
case 'B': \
ch = 8; /* backspace */ \
break; \
case 'f': \
case 'F': \
ch = 12; /* form feed */ \
break; \
case 'n': \
case 'N': \
ch = 10; /* line feed */ \
break; \
case 'r': \
case 'R': \
ch = 13; /* carriage return */ \
break; \
case 't': \
case 'T': \
ch = 9; /* tab */ \
break; \
case 'v': \
case 'V': \
ch = 11; /* vertical tab */ \
break; \
case 'x': /* hex code */ \
case 'X': \
GETHEX(ch); \
break; \
default: \
/* other characters passed through */ \
if (translate) \
ch = translate[(unsigned char)ch]; \
break; \
} \
MACRO_END
int a;
int b;
int syntaxcode;
if (!re_compile_initialized)
re_compile_initialize();
bufp->used = 0;
bufp->fastmap_accurate = 0;
bufp->uses_registers = 0;
translate = bufp->translate;
pattern = bufp->buffer;
alloc = bufp->allocated;
if (alloc == 0 || pattern == NULL)
{
alloc = 256;
pattern = malloc(alloc);
if (!pattern)
goto out_of_memory;
}
pattern_offset = 0;
starts_base = 0;
num_jumps = 0;
current_level = 0;
SET_LEVEL_START;
num_open_registers = 0;
next_register = 1;
paren_depth = 0;
beginning_context = 1;
op = -1;
/* we use Rend dummy to ensure that pending jumps are updated (due to
low priority of Rend) before exiting the loop. */
pos = 0;
while (op != Rend)
if (visited[pos])
return; /* we have already been here */
visited[pos] = 1;
for (;;)
switch (code[pos++])
{
if (pos >= size)
op = Rend;
else
case Cend:
{
NEXTCHAR(ch);
if (translate)
ch = translate[(unsigned char)ch];
op = regexp_plain_ops[(unsigned char)ch];
if (op == Rquote)
*can_be_null = 1;
return;
}
case Cbol:
case Cbegbuf:
case Cendbuf:
case Cwordbeg:
case Cwordend:
case Cwordbound:
case Cnotwordbound:
{
NEXTCHAR(ch);
op = regexp_quoted_ops[(unsigned char)ch];
if (op == Rnormal && regexp_ansi_sequences)
ANSI_TRANSLATE(ch);
break;
}
case Csyntaxspec:
{
syntaxcode = code[pos++];
for (a = 0; a < 256; a++)
if (SYNTAX(a) == syntaxcode)
fastmap[a] = 1;
return;
}
level = regexp_precedences[op];
/* printf("ch='%c' op=%d level=%d current_level=%d curlevstart=%d\n",
ch, op, level, current_level, CURRENT_LEVEL_START); */
if (level > current_level)
case Cnotsyntaxspec:
{
for (current_level++; current_level < level; current_level++)
SET_LEVEL_START;
SET_LEVEL_START;
syntaxcode = code[pos++];
for (a = 0; a < 256; a++)
if (SYNTAX(a) != syntaxcode)
fastmap[a] = 1;
return;
}
else
if (level < current_level)
case Ceol:
{
current_level = level;
for (;num_jumps > 0 &&
future_jumps[num_jumps-1] >= CURRENT_LEVEL_START;
num_jumps--)
PUT_ADDR(future_jumps[num_jumps-1], pattern_offset);
fastmap['\n'] = 1;
if (*can_be_null == 0)
*can_be_null = 2; /* can match null, but only at end of buffer*/
return;
}
switch (op)
case Cset:
{
case Rend:
break;
case Rnormal:
normal_char:
opcode = Cexact;
for (a = 0; a < 256/8; a++)
if (code[pos + a] != 0)
for (b = 0; b < 8; b++)
if (code[pos + a] & (1 << b))
fastmap[(a << 3) + b] = 1;
pos += 256/8;
return;
}
case Cexact:
{
fastmap[(unsigned char)code[pos]] = 1;
return;
}
case Canychar:
{
for (a = 0; a < 256; a++)
if (a != '\n')
fastmap[a] = 1;
return;
}
case Cstart_memory:
case Cend_memory:
{
pos++;
break;
}
case Cmatch_memory:
{
for (a = 0; a < 256; a++)
fastmap[a] = 1;
*can_be_null = 1;
return;
}
case Cjump:
case Cdummy_failure_jump:
case Cupdate_failure_jump:
case Cstar_jump:
{
a = (unsigned char)code[pos++];
a |= (unsigned char)code[pos++] << 8;
pos += (int)(short)a;
if (visited[pos])
{
/* argh... the regexp contains empty loops. This is not
good, as this may cause a failure stack overflow when
matching. Oh well. */
/* this path leads nowhere; pursue other paths. */
return;
}
visited[pos] = 1;
break;
}
case Cfailure_jump:
{
a = (unsigned char)code[pos++];
a |= (unsigned char)code[pos++] << 8;
a = pos + (int)(short)a;
re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap);
break;
}
default:
{
abort(); /* probably some opcode is missing from this switch */
/*NOTREACHED*/
}
}
}
/* Build a fastmap for the compiled code in 'buffer' ('used' bytes long),
 * starting the analysis at offset 'pos'.
 *
 * 'fastmap' must point to 256 bytes; it is cleared and then filled in by
 * re_compile_fastmap_aux. '*can_be_null' is reset to 0 before the
 * analysis and may be set by it.
 *
 * The scratch array that marks visited code offsets lives on the C stack
 * for small patterns and is malloc'd for larger ones.
 *
 * Returns 1 on success and 0 if the scratch array could not be
 * allocated (in which case neither output has been touched). */
static int re_do_compile_fastmap(char *buffer,
                                 int used,
                                 int pos,
                                 char *can_be_null,
                                 char *fastmap)
{
    char stack_marks[512];
    char *visited = stack_marks;

    /* Fall back to the heap only when the pattern does not fit. */
    if (used > sizeof(stack_marks))
    {
        visited = malloc(used);
        if (!visited)
            return 0;
    }

    *can_be_null = 0;
    memset(fastmap, 0, 256);
    memset(visited, 0, used);

    re_compile_fastmap_aux(buffer, pos, visited, can_be_null, fastmap);

    if (visited != stack_marks)
        free(visited);
    return 1;
}
/* Compute the fastmap and the anchor kind for a compiled pattern.
 *
 * Does nothing when the pattern has no fastmap buffer or when the
 * fastmap is already marked accurate. If the fastmap cannot be built
 * (re_do_compile_fastmap returns 0) the function returns without marking
 * the fastmap accurate and without touching the anchor.
 *
 * The anchor is 1 when the code starts with Cbol, 2 when it starts with
 * Cbegbuf, and 0 otherwise. */
void re_compile_fastmap(regexp_t bufp)
{
    int built;

    if (!bufp->fastmap)
        return;
    if (bufp->fastmap_accurate)
        return;

    assert(bufp->used > 0);

    built = re_do_compile_fastmap(bufp->buffer,
                                  bufp->used,
                                  0,
                                  &bufp->can_be_null,
                                  bufp->fastmap);
    if (!built)
        return;

    switch (bufp->buffer[0])
    {
    case Cbol:
        bufp->anchor = 1; /* begline */
        break;
    case Cbegbuf:
        bufp->anchor = 2; /* begbuf */
        break;
    default:
        bufp->anchor = 0; /* none */
        break;
    }
    bufp->fastmap_accurate = 1;
}
/*
* star is coded as:
* 1: failure_jump 2
* ... code for operand of star
* star_jump 1
* 2: ... code after star
*
* We change the star_jump to update_failure_jump if we can determine
* that it is safe to do so; otherwise we change it to an ordinary
* jump.
*
* plus is coded as
*
* jump 2
* 1: failure_jump 3
* 2: ... code for operand of plus
* star_jump 1
* 3: ... code after plus
*
* For star_jump considerations this is processed identically to star.
*
*/
/* Rewrite one Cstar_jump instruction.
 *
 * 'code' points at the two displacement bytes that follow the
 * Cstar_jump opcode. The instruction is changed in place to
 * Cupdate_failure_jump when the loop body provably begins with something
 * that cannot also begin the text after the loop, and to a plain Cjump
 * otherwise.
 *
 * Returns 1 when the instruction has been rewritten, and 0 when the loop
 * body contains Cend or an opcode this function does not know (the
 * caller treats that as an optimization error).
 *
 * 'ch' is an unsigned char because it indexes map[256]; with a plain
 * char, pattern bytes >= 0x80 would become negative indices on
 * platforms where char is signed. */
static int re_optimize_star_jump(regexp_t bufp, char *code)
{
    char map[256];
    char can_be_null;
    char *p1;
    char *p2;
    unsigned char ch;
    int a;
    int b;

    /* Decode the signed 16-bit little-endian displacement. */
    a = (unsigned char)*code++;
    a |= (unsigned char)*code++ << 8;
    a = (int)(short)a;

    p1 = code + a + 3; /* skip the failure_jump */
    assert(p1[-3] == Cfailure_jump);
    p2 = code;
    /* p1 points inside loop, p2 points to after loop */
    if (!re_do_compile_fastmap(bufp->buffer, bufp->used,
                               p2 - bufp->buffer, &can_be_null, map))
        goto make_normal_jump;

    /* If we might introduce a new update point inside the
     * loop, we can't optimize because then update_jump would
     * update a wrong failure point. Thus we have to be
     * quite careful here.
     */

    /* loop until we find something that consumes a character */
  loop_p1:
    switch (*p1++)
    {
    case Cbol:
    case Ceol:
    case Cbegbuf:
    case Cendbuf:
    case Cwordbeg:
    case Cwordend:
    case Cwordbound:
    case Cnotwordbound:
    {
        goto loop_p1;
    }
    case Cstart_memory:
    case Cend_memory:
    {
        p1++; /* skip the register number */
        goto loop_p1;
    }
    case Cexact:
    {
        ch = (unsigned char)*p1++;
        if (map[ch])
            goto make_normal_jump;
        break;
    }
    case Canychar:
    {
        for (b = 0; b < 256; b++)
            if (b != '\n' && map[b])
                goto make_normal_jump;
        break;
    }
    case Cset:
    {
        for (b = 0; b < 256; b++)
            if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
                goto make_normal_jump;
        p1 += 256/8;
        break;
    }
    default:
    {
        goto make_normal_jump;
    }
    }

    /* now we know that we can't backtrack. */
    while (p1 != p2 - 3)
    {
        switch (*p1++)
        {
        case Cend:
        {
            return 0;
        }
        case Cbol:
        case Ceol:
        case Canychar:
        case Cbegbuf:
        case Cendbuf:
        case Cwordbeg:
        case Cwordend:
        case Cwordbound:
        case Cnotwordbound:
        {
            break;
        }
        case Cset:
        {
            p1 += 256/8;
            break;
        }
        case Cexact:
        case Cstart_memory:
        case Cend_memory:
        case Cmatch_memory:
        case Csyntaxspec:
        case Cnotsyntaxspec:
        {
            p1++;
            break;
        }
        case Cjump:
        case Cstar_jump:
        case Cfailure_jump:
        case Cupdate_failure_jump:
        case Cdummy_failure_jump:
        {
            /* any jump inside the loop body defeats the optimization */
            goto make_normal_jump;
        }
        default:
        {
            return 0;
        }
        }
    }

    /* make_update_jump: the whole loop body was scanned without finding
     * a jump, so the star_jump can update the failure point in place. */
    code -= 3;
    a += 3; /* jump to after the Cfailure_jump */
    code[0] = Cupdate_failure_jump;
    code[1] = a & 0xff;
    code[2] = a >> 8;
    return 1;

  make_normal_jump:
    code -= 3;
    *code = Cjump;
    return 1;
}
/* Walk the compiled pattern from its first opcode and hand every
 * Cstar_jump to re_optimize_star_jump.
 *
 * Each opcode is followed by a fixed number of operand bytes, which are
 * skipped to reach the next opcode.
 *
 * Returns 1 when Cend is reached, and 0 when an unknown opcode is met
 * or re_optimize_star_jump reports failure. */
static int re_optimize(regexp_t bufp)
{
    char *cursor = bufp->buffer;

    for (;;)
    {
        int operand_bytes;

        switch (*cursor++)
        {
        case Cend:
            return 1;

        /* opcodes without operands */
        case Canychar:
        case Cbol:
        case Ceol:
        case Cbegbuf:
        case Cendbuf:
        case Cwordbeg:
        case Cwordend:
        case Cwordbound:
        case Cnotwordbound:
            operand_bytes = 0;
            break;

        /* a 256-bit membership bitmap follows */
        case Cset:
            operand_bytes = 256/8;
            break;

        /* one operand byte follows */
        case Cexact:
        case Cstart_memory:
        case Cend_memory:
        case Cmatch_memory:
        case Csyntaxspec:
        case Cnotsyntaxspec:
            operand_bytes = 1;
            break;

        /* rewritten in place, then skipped like any other jump */
        case Cstar_jump:
            if (!re_optimize_star_jump(bufp, cursor))
                return 0;
            operand_bytes = 2;
            break;

        /* a two-byte displacement follows */
        case Cupdate_failure_jump:
        case Cjump:
        case Cdummy_failure_jump:
        case Cfailure_jump:
            operand_bytes = 2;
            break;

        default:
            return 0;
        }

        cursor += operand_bytes;
    }
}
/* Fetch the next pattern character into 'var' and advance 'pos'.
* Jumps to the label 'ends_prematurely' when the pattern is exhausted.
* Relies on 'regex', 'pos' and 'size' and on that label being in scope
* at the point of use. */
#define NEXTCHAR(var) \
{ \
if (pos >= size) \
goto ends_prematurely; \
(var) = regex[pos]; \
pos++; \
}
/* Make sure the output buffer has room for 'amount' more bytes after
 * 'pattern_offset', growing it by 'amount' + 256 bytes when it does
 * not. Jumps to the label 'out_of_memory' when realloc fails.
 *
 * The realloc result goes into a temporary first: on failure the old
 * buffer is still owned by 'pattern' (realloc does not free it) and
 * 'alloc' still describes its real size, so the error path neither
 * leaks the buffer nor records a size that was never allocated.
 *
 * Relies on 'pattern', 'pattern_offset' and 'alloc' and on that label
 * being in scope at the point of use. */
#define ALLOC(amount) \
{ \
    if (pattern_offset+(amount) > alloc) \
    { \
        int alloc_wanted = alloc + 256 + (amount); \
        void *alloc_grown = realloc(pattern, alloc_wanted); \
        if (!alloc_grown) \
            goto out_of_memory; \
        pattern = alloc_grown; \
        alloc = alloc_wanted; \
    } \
}
/* Append one byte to the output buffer. Does not check for room; the
* visible callers reserve space with ALLOC first. */
#define STORE(ch) pattern[pattern_offset++] = (ch)
/* Output offset recorded for the current precedence level. */
#define CURRENT_LEVEL_START (starts[starts_base + current_level])
/* Record the current output offset for the current precedence level. */
#define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
/* Move to a fresh group of NUM_LEVELS slots in 'starts', or jump to the
* label 'too_complex' when MAX_NESTING groups are already in use.
* Expands to a bare if/else, so a following 'else' at the call site
* would bind to it. */
#define PUSH_LEVEL_STARTS \
if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
starts_base += NUM_LEVELS; \
else \
goto too_complex
/* Return to the previous group of slots in 'starts'. No underflow check
* is made here. */
#define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
/* Store a jump displacement in the two bytes at pattern[offset]. The
* displacement is 'addr' relative to the byte just after those two
* bytes (addr - offset - 2) and is written low byte first. 'offset' is
* evaluated several times. */
#define PUT_ADDR(offset,addr) \
{ \
int disp = (addr) - (offset) - 2; \
pattern[(offset)] = disp & 0xff; \
pattern[(offset)+1] = (disp>>8) & 0xff; \
}
/* Insert a three-byte jump instruction at output offset 'pos'.
* Everything from 'pos' to the end of the emitted code is moved up by
* three bytes, the opcode 'type' is written at 'pos', the displacement
* to 'addr' is written after it with PUT_ADDR, and 'pattern_offset'
* grows by 3. The macro does not grow the buffer itself; the visible
* callers reserve the space with ALLOC beforehand. */
#define INSERT_JUMP(pos,type,addr) \
{ \
int a, p = (pos), t = (type), ad = (addr); \
for (a = pattern_offset - 1; a >= p; a--) \
pattern[a + 3] = pattern[a]; \
pattern[p] = t; \
PUT_ADDR(p+1,ad); \
pattern_offset += 3; \
}
/* Set bit number 'bit' in the bitmap that starts at buf[offset]; bit 0
* is the least significant bit of the first byte. */
#define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
/* Copy the local buffer bookkeeping ('alloc', 'pattern',
* 'pattern_offset') back into the pattern object 'bufp'. */
#define SET_FIELDS \
{ \
bufp->allocated = alloc; \
bufp->buffer = pattern; \
bufp->used = pattern_offset; \
}
/* Read two hexadecimal digits from the pattern with NEXTCHAR and store
* the value they encode (first digit is the high nibble) in 'var'.
* Jumps to the label 'hex_error' when either character makes
* hex_char_to_decimal return 16, and to 'ends_prematurely' (through
* NEXTCHAR) when the pattern ends first. */
#define GETHEX(var) \
{ \
char gethex_ch, gethex_value; \
NEXTCHAR(gethex_ch); \
gethex_value = hex_char_to_decimal(gethex_ch); \
if (gethex_value == 16) \
goto hex_error; \
NEXTCHAR(gethex_ch); \
gethex_ch = hex_char_to_decimal(gethex_ch); \
if (gethex_ch == 16) \
goto hex_error; \
(var) = gethex_value * 16 + gethex_ch; \
}
/* Replace the escape letter in 'ch' with the character it stands for.
* The letters a, b, f, n, r, t and v are recognised in either case and
* map to the control codes noted on each branch. x or X reads two hex
* digits with GETHEX. Any other character is passed through the
* 'translate' table when that pointer is non-NULL, and is otherwise
* left unchanged.
*
* 'ch' is both read and assigned, so it must be an lvalue. */
#define ANSI_TRANSLATE(ch) \
{ \
switch (ch) \
{ \
case 'a': \
case 'A': \
{ \
ch = 7; /* audible bell */ \
break; \
} \
case 'b': \
case 'B': \
{ \
ch = 8; /* backspace */ \
break; \
} \
case 'f': \
case 'F': \
{ \
ch = 12; /* form feed */ \
break; \
} \
case 'n': \
case 'N': \
{ \
ch = 10; /* line feed */ \
break; \
} \
case 'r': \
case 'R': \
{ \
ch = 13; /* carriage return */ \
break; \
} \
case 't': \
case 'T': \
{ \
ch = 9; /* tab */ \
break; \
} \
case 'v': \
case 'V': \
{ \
ch = 11; /* vertical tab */ \
break; \
} \
case 'x': /* hex code */ \
case 'X': \
{ \
GETHEX(ch); \
break; \
} \
default: \
{ \
/* other characters passed through */ \
if (translate) \
ch = translate[(unsigned char)ch]; \
break; \
} \
} \
}
char *re_compile_pattern(char *regex, int size, regexp_t bufp)
{
int a;
int pos;
int op;
int current_level;
int level;
int opcode;
int pattern_offset, alloc;
int starts[NUM_LEVELS * MAX_NESTING];
int starts_base;
int future_jumps[MAX_NESTING];
int num_jumps;
unsigned char ch;
char *pattern;
char *translate;
int next_register;
int paren_depth;
int num_open_registers;
int open_registers[RE_NREGS];
int beginning_context;
if (!re_compile_initialized)
re_compile_initialize();
bufp->used = 0;
bufp->fastmap_accurate = 0;
bufp->uses_registers = 0;
translate = bufp->translate;
pattern = bufp->buffer;
alloc = bufp->allocated;
if (alloc == 0 || pattern == NULL)
{
alloc = 256;
pattern = malloc(alloc);
if (!pattern)
goto out_of_memory;
}
pattern_offset = 0;
starts_base = 0;
num_jumps = 0;
current_level = 0;
SET_LEVEL_START;
num_open_registers = 0;
next_register = 1;
paren_depth = 0;
beginning_context = 1;
op = -1;
/* we use Rend dummy to ensure that pending jumps are updated (due to
low priority of Rend) before exiting the loop. */
pos = 0;
while (op != Rend)
{
if (pos >= size)
op = Rend;
else
{
NEXTCHAR(ch);
if (translate)
ch = translate[(unsigned char)ch];
op = regexp_plain_ops[(unsigned char)ch];
if (op == Rquote)
{
NEXTCHAR(ch);
op = regexp_quoted_ops[(unsigned char)ch];
if (op == Rnormal && regexp_ansi_sequences)
ANSI_TRANSLATE(ch);
}
}
level = regexp_precedences[op];
/* printf("ch='%c' op=%d level=%d current_level=%d curlevstart=%d\n",
ch, op, level, current_level, CURRENT_LEVEL_START); */
if (level > current_level)
{
for (current_level++; current_level < level; current_level++)
SET_LEVEL_START;
SET_LEVEL_START;
}
else
if (level < current_level)
{
current_level = level;
for (;num_jumps > 0 &&
future_jumps[num_jumps-1] >= CURRENT_LEVEL_START;
num_jumps--)
PUT_ADDR(future_jumps[num_jumps-1], pattern_offset);
}
switch (op)
{
case Rend:
{
break;
}
case Rnormal:
{
normal_char:
opcode = Cexact;
store_opcode_and_arg: /* opcode & ch must be set */
SET_LEVEL_START;
ALLOC(2);
STORE(opcode);
STORE(ch);
break;
}
case Ranychar:
{
opcode = Canychar;
store_opcode:
SET_LEVEL_START;
ALLOC(1);
STORE(opcode);
break;
}
case Rquote:
{
abort();
/*NOTREACHED*/
}
case Rbol:
{
if (!beginning_context)
if (regexp_context_indep_ops)
goto op_error;
......@@ -500,7 +1212,9 @@ regexp_t bufp;
goto normal_char;
opcode = Cbol;
goto store_opcode;
}
case Reol:
{
if (!((pos >= size) ||
((regexp_syntax & RE_NO_BK_VBAR) ?
(regex[pos] == '\174') :
......@@ -518,7 +1232,9 @@ regexp_t bufp;
goto store_opcode;
/* NOTREACHED */
break;
}
case Roptional:
{
if (beginning_context)
if (regexp_context_indep_ops)
goto op_error;
......@@ -530,8 +1246,10 @@ regexp_t bufp;
INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
pattern_offset + 3);
break;
}
case Rstar:
case Rplus:
{
if (beginning_context)
if (regexp_context_indep_ops)
goto op_error;
......@@ -547,7 +1265,9 @@ regexp_t bufp;
INSERT_JUMP(CURRENT_LEVEL_START, Cdummy_failure_jump,
CURRENT_LEVEL_START + 6);
break;
}
case Ror:
{
ALLOC(6);
INSERT_JUMP(CURRENT_LEVEL_START, Cfailure_jump,
pattern_offset + 6);
......@@ -559,7 +1279,9 @@ regexp_t bufp;
STORE(0);
SET_LEVEL_START;
break;
}
case Ropenpar:
{
SET_LEVEL_START;
if (next_register < RE_NREGS)
{
......@@ -575,7 +1297,9 @@ regexp_t bufp;
current_level = 0;
SET_LEVEL_START;
break;
}
case Rclosepar:
{
if (paren_depth <= 0)
goto parenthesis_error;
POP_LEVEL_STARTS;
......@@ -590,7 +1314,9 @@ regexp_t bufp;
STORE(open_registers[num_open_registers]);
}
break;
}
case Rmemory:
{
if (ch == '0')
goto bad_match_register;
assert(ch >= '0' && ch <= '9');
......@@ -598,7 +1324,9 @@ regexp_t bufp;
opcode = Cmatch_memory;
ch -= '0';
goto store_opcode_and_arg;
}
case Rextended_memory:
{
NEXTCHAR(ch);
if (ch < '0' || ch > '9')
goto bad_match_register;
......@@ -611,9 +1339,14 @@ regexp_t bufp;
bufp->uses_registers = 1;
opcode = Cmatch_memory;
goto store_opcode_and_arg;
}
case Ropenset:
{
int complement,prev,offset,range,firstchar;
int complement;
int prev;
int offset;
int range;
int firstchar;
SET_LEVEL_START;
ALLOC(1+256/8);
......@@ -673,53 +1406,52 @@ regexp_t bufp;
break;
}
case Rbegbuf:
{
opcode = Cbegbuf;
goto store_opcode;
}
case Rendbuf:
{
opcode = Cendbuf;
goto store_opcode;
}
case Rwordchar:
{
opcode = Csyntaxspec;
ch = Sword;
goto store_opcode_and_arg;
}
case Rnotwordchar:
{
opcode = Cnotsyntaxspec;
ch = Sword;
goto store_opcode_and_arg;
}
case Rwordbeg:
{
opcode = Cwordbeg;
goto store_opcode;
}
case Rwordend:
{
opcode = Cwordend;
goto store_opcode;
}
case Rwordbound:
{
opcode = Cwordbound;
goto store_opcode;
}
case Rnotwordbound:
{
opcode = Cnotwordbound;
goto store_opcode;
#ifdef emacs
case Remacs_at_dot:
opcode = Cemacs_at_dot;
goto store_opcode;
case Remacs_syntaxspec:
NEXTCHAR(ch);
if (translate)
ch = translate[(unsigned char)ch];
opcode = Csyntaxspec;
ch = syntax_spec_code[(unsigned char)ch];
goto store_opcode_and_arg;
case Remacs_notsyntaxspec:
NEXTCHAR(ch);
if (translate)
ch = translate[(unsigned char)ch];
opcode = Cnotsyntaxspec;
ch = syntax_spec_code[(unsigned char)ch];
goto store_opcode_and_arg;
#endif /* emacs */
}
default:
{
abort();
}
}
beginning_context = (op == Ropenpar || op == Ror);
}
if (starts_base != 0)
......@@ -728,6 +1460,8 @@ regexp_t bufp;
ALLOC(1);
STORE(Cend);
SET_FIELDS;
if(!re_optimize(bufp))
return "Optimization error";
return NULL;
op_error:
......@@ -758,6 +1492,7 @@ regexp_t bufp;
SET_FIELDS;
return "Regular expression too complex";
}
#undef CHARAT
#undef NEXTCHAR
#undef GETHEX
......@@ -772,643 +1507,349 @@ regexp_t bufp;
#undef SETBIT
#undef SET_FIELDS
static void re_compile_fastmap_aux
Py_PROTO((char *, int, char *, char *, char *));
/* Follow the compiled code from offset 'pos' and record in 'fastmap'
 * every byte value that can begin a match along that path.
 *
 *   code         compiled pattern
 *   pos          offset at which to start
 *   visited      one flag byte per code offset; set for every offset at
 *                which a scan has started or a jump has landed, so that
 *                loops in the code terminate
 *   can_be_null  set to 1 when the end of the pattern (or a
 *                back-reference) can be reached without consuming a
 *                character, and to 2 when only an end-of-line test was
 *                reached and the value was still 0
 *   fastmap      256 flag bytes, one per possible first character
 *
 * A back-reference (Cmatch_memory) can begin with any character, so it
 * marks the whole fastmap; leaving the map untouched there would let a
 * fastmap-driven search skip positions where the pattern can match. */
static void re_compile_fastmap_aux(char *code,
                                   int pos,
                                   char *visited,
                                   char *can_be_null,
                                   char *fastmap)
{
    int a, b, syntaxcode;

    if (visited[pos])
        return; /* we have already been here */
    visited[pos] = 1;
    for (;;)
        switch (code[pos++])
        {
        case Cend:
            *can_be_null = 1;
            return;
        case Cbol:
        case Cbegbuf:
        case Cendbuf:
        case Cwordbeg:
        case Cwordend:
        case Cwordbound:
        case Cnotwordbound:
#ifdef emacs
        case Cemacs_at_dot:
#endif /* emacs */
            /* zero-width tests: keep scanning */
            break;
        case Csyntaxspec:
            syntaxcode = code[pos++];
            for (a = 0; a < 256; a++)
                if (SYNTAX(a) == syntaxcode)
                    fastmap[a] = 1;
            return;
        case Cnotsyntaxspec:
            syntaxcode = code[pos++];
            for (a = 0; a < 256; a++)
                if (SYNTAX(a) != syntaxcode)
                    fastmap[a] = 1;
            return;
        case Ceol:
            fastmap['\n'] = 1;
            if (*can_be_null == 0)
                *can_be_null = 2; /* can match null, but only at end of buffer*/
            return;
        case Cset:
            /* copy the 256-bit membership bitmap into the fastmap */
            for (a = 0; a < 256/8; a++)
                if (code[pos + a] != 0)
                    for (b = 0; b < 8; b++)
                        if (code[pos + a] & (1 << b))
                            fastmap[(a << 3) + b] = 1;
            pos += 256/8;
            return;
        case Cexact:
            fastmap[(unsigned char)code[pos]] = 1;
            return;
        case Canychar:
            for (a = 0; a < 256; a++)
                if (a != '\n')
                    fastmap[a] = 1;
            return;
        case Cstart_memory:
        case Cend_memory:
            pos++; /* skip the register number */
            break;
        case Cmatch_memory:
            /* the referenced group may start with any character */
            for (a = 0; a < 256; a++)
                fastmap[a] = 1;
            *can_be_null = 1;
            return;
        case Cjump:
        case Cdummy_failure_jump:
        case Cupdate_failure_jump:
        case Cstar_jump:
            /* signed 16-bit little-endian displacement */
            a = (unsigned char)code[pos++];
            a |= (unsigned char)code[pos++] << 8;
            pos += (int)(short)a;
            if (visited[pos])
            {
                /* argh... the regexp contains empty loops. This is not
                   good, as this may cause a failure stack overflow when
                   matching. Oh well. */
                /* this path leads nowhere; pursue other paths. */
                return;
            }
            visited[pos] = 1;
            break;
        case Cfailure_jump:
            /* explore the jump target first, then fall on through */
            a = (unsigned char)code[pos++];
            a |= (unsigned char)code[pos++] << 8;
            a = pos + (int)(short)a;
            re_compile_fastmap_aux(code, a, visited, can_be_null, fastmap);
            break;
        default:
            abort(); /* probably some opcode is missing from this switch */
            /*NOTREACHED*/
        }
}
static int re_do_compile_fastmap Py_PROTO((char *, int, int, char *, char *));

/* Build FASTMAP (256 entries) and *CAN_BE_NULL for the compiled code in
   BUFFER (USED bytes long), starting the scan at offset POS.

   The per-offset "seen" flags needed by re_compile_fastmap_aux live in
   a 512-byte local array when the code is small enough, and in a
   temporary heap block otherwise.

   Returns 1 on success, 0 if the temporary block could not be
   allocated (in which case the outputs are left untouched). */
static int re_do_compile_fastmap(buffer, used, pos, can_be_null, fastmap)
     char *buffer, *fastmap, *can_be_null;
     int used, pos;
{
  char local_flags[512];
  char *seen = local_flags;
  int heap_allocated = 0;

  if (!(used <= sizeof(local_flags)))
    {
      seen = malloc(used);
      if (seen == NULL)
	return 0;
      heap_allocated = 1;
    }

  /* start from a clean slate: nothing seen, nothing in the map */
  memset(seen, 0, used);
  memset(fastmap, 0, 256);
  *can_be_null = 0;

  re_compile_fastmap_aux(buffer, pos, seen, can_be_null, fastmap);

  if (heap_allocated)
    free(seen);
  return 1;
}
#define PREFETCH if (text == textend) goto fail
void re_compile_fastmap(bufp)
regexp_t bufp;
#define NEXTCHAR(var) \
PREFETCH; \
var = (unsigned char)*text++; \
if (translate) \
var = translate[var]
int re_match(regexp_t bufp,
char *string,
int size,
int pos,
regexp_registers_t old_regs)
{
if (!bufp->fastmap || bufp->fastmap_accurate)
return;
assert(bufp->used > 0);
if (!re_do_compile_fastmap(bufp->buffer, bufp->used, 0, &bufp->can_be_null,
bufp->fastmap))
return;
if (bufp->buffer[0] == Cbol)
bufp->anchor = 1; /* begline */
else
if (bufp->buffer[0] == Cbegbuf)
bufp->anchor = 2; /* begbuf */
else
bufp->anchor = 0; /* none */
bufp->fastmap_accurate = 1;
}
#define INITIAL_FAILURES 128 /* initial # failure points to allocate */
#define MAX_FAILURES 4100L /* max # of failure points before failing */
char *code;
char *translate;
char *text;
char *textstart;
char *textend;
int a;
int b;
int ch;
int reg;
int match_end;
char *regstart;
char *regend;
int regsize;
match_state state;
assert(pos >= 0 && size >= 0);
assert(pos <= size);
text = string + pos;
textstart = string;
textend = string + size;
int re_match_2(bufp, string1, size1, string2, size2, pos, regs, mstop)
regexp_t bufp;
char *string1, *string2;
int size1, size2, pos, mstop;
regexp_registers_t regs;
{
struct failure_point { char *text, *partend, *code; }
*failure_stack_start, *failure_sp, *failure_stack_end,
initial_failure_stack[INITIAL_FAILURES];
char *code, *translate, *text, *textend, *partend, *part_2_end;
char *regstart_text[RE_NREGS], *regstart_partend[RE_NREGS];
char *regend_text[RE_NREGS], *regend_partend[RE_NREGS];
int a, b, ch, reg, regch, match_end;
char *regtext, *regpartend, *regtextend;
#define PREFETCH \
MACRO_BEGIN \
if (text == partend) \
{ \
if (text == textend) \
goto fail; \
text = string2; \
partend = part_2_end; \
} \
MACRO_END
code = bufp->buffer;
#define NEXTCHAR(var) \
MACRO_BEGIN \
PREFETCH; \
(var) = (unsigned char)*text++; \
if (translate) \
(var) = (unsigned char)translate[(var)]; \
MACRO_END
translate = bufp->translate;
/* translated = NULL; */
/* if (bufp->translate) */
/* { */
/* char *t1; */
/* char *t2; */
assert(pos >= 0 && size1 >= 0 && size2 >= 0 && mstop >= 0);
assert(mstop <= size1 + size2);
assert(pos <= mstop);
/* translated = malloc(size); */
/* if (translated == NULL) */
/* goto error; */
if (pos <= size1)
{
text = string1 + pos;
if (mstop <= size1)
{
partend = string1 + mstop;
textend = partend;
}
else
{
partend = string1 + size1;
textend = string2 + mstop - size1;
}
part_2_end = string2 + mstop - size1;
}
else
{
text = string2 + pos - size1;
partend = string2 + mstop - size1;
textend = partend;
part_2_end = partend;
}
/* t1 = string; */
/* t2 = translated; */
/* while(t1 < textend) */
/* *t2++ = bufp->translate[*t1++]; */
if (bufp->uses_registers && regs != NULL)
for (a = 0; a < RE_NREGS; a++)
regend_text[a] = NULL;
/* text = translated + pos; */
/* textstart = translated; */
/* textend = translated + size; */
/* } */
code = bufp->buffer;
translate = bufp->translate;
failure_stack_start = failure_sp = initial_failure_stack;
failure_stack_end = initial_failure_stack + INITIAL_FAILURES;
#if 0
/* re_search_2 has already done this, and otherwise we get little benefit
from this. So I'll leave this out. */
if (bufp->fastmap_accurate && !bufp->can_be_null &&
text != textend &&
!bufp->fastmap[translate ?
(unsigned char)translate[(unsigned char)*text] :
(unsigned char)*text])
return -1; /* it can't possibly match */
#endif
NEW_STATE(state);
continue_matching:
for (;;)
{
switch (*code++)
{
case Cend:
if (partend != part_2_end)
match_end = text - string1;
else
match_end = text - string2 + size1;
if (regs)
{
regs->start[0] = pos;
regs->end[0] = match_end;
match_end = text - textstart;
if (old_regs)
{
old_regs->start[0] = pos;
old_regs->end[0] = match_end;
if (!bufp->uses_registers)
{
for (a = 1; a < RE_NREGS; a++)
{
regs->start[a] = -1;
regs->end[a] = -1;
old_regs->start[a] = -1;
old_regs->end[a] = -1;
}
}
else
{
for (a = 1; a < RE_NREGS; a++)
{
if (regend_text[a] == NULL)
if ((GET_REG_START(state, a) == NULL) ||
(GET_REG_END(state, a) == NULL))
{
regs->start[a] = -1;
regs->end[a] = -1;
old_regs->start[a] = -1;
old_regs->end[a] = -1;
continue;
}
if (regstart_partend[a] != part_2_end)
regs->start[a] = regstart_text[a] - string1;
else
regs->start[a] = regstart_text[a] - string2 + size1;
if (regend_partend[a] != part_2_end)
regs->end[a] = regend_text[a] - string1;
else
regs->end[a] = regend_text[a] - string2 + size1;
old_regs->start[a] = GET_REG_START(state, a) - textstart;
old_regs->end[a] = GET_REG_END(state, a) - textstart;
}
}
}
if (failure_stack_start != initial_failure_stack)
free((char *)failure_stack_start);
/* if(translated) */
/* free(translated); */
FREE_STATE(state);
return match_end - pos;
case Cbol:
if (text == string1 || text[-1] == '\n') /* text[-1] always valid */
break;
goto fail;
case Ceol:
if (text == string2 + size2 ||
(text == string1 + size1 ?
(size2 == 0 || *string2 == '\n') :
*text == '\n'))
break;
goto fail;
case Cset:
NEXTCHAR(ch);
if (code[ch/8] & (1<<(ch & 7)))
{
code += 256/8;
break;
}
goto fail;
case Cexact:
NEXTCHAR(ch);
if (ch != (unsigned char)*code++)
goto fail;
break;
case Canychar:
NEXTCHAR(ch);
if (ch == '\n')
goto fail;
break;
case Cstart_memory:
reg = *code++;
regstart_text[reg] = text;
regstart_partend[reg] = partend;
break;
case Cend_memory:
reg = *code++;
regend_text[reg] = text;
regend_partend[reg] = partend;
break;
case Cmatch_memory:
reg = *code++;
if (regend_text[reg] == NULL)
goto fail; /* or should we just match nothing? */
regtext = regstart_text[reg];
regtextend = regend_text[reg];
if (regstart_partend[reg] == regend_partend[reg])
regpartend = regtextend;
else
regpartend = string1 + size1;
for (;regtext != regtextend;)
case Cbol:
{
NEXTCHAR(ch);
if (regtext == regpartend)
regtext = string2;
regch = (unsigned char)*regtext++;
if (translate)
regch = (unsigned char)translate[regch];
if (regch != ch)
if (text == textstart || text[-1] == '\n')
goto continue_matching;
goto fail;
}
break;
case Cstar_jump:
/* star is coded as:
1: failure_jump 2
... code for operand of star
star_jump 1
2: ... code after star
We change the star_jump to update_failure_jump if we can determine
that it is safe to do so; otherwise we change it to an ordinary
jump.
plus is coded as
jump 2
1: failure_jump 3
2: ... code for operand of plus
star_jump 1
3: ... code after plus
For star_jump considerations this is processed identically
to star. */
a = (unsigned char)*code++;
a |= (unsigned char)*code++ << 8;
a = (int)(short)a;
{
char map[256], can_be_null;
char *p1, *p2;
p1 = code + a + 3; /* skip the failure_jump */
assert(p1[-3] == Cfailure_jump);
p2 = code;
/* p1 points inside loop, p2 points to after loop */
if (!re_do_compile_fastmap(bufp->buffer, bufp->used,
p2 - bufp->buffer, &can_be_null, map))
goto make_normal_jump;
/* If we might introduce a new update point inside the loop,
we can't optimize because then update_jump would update a
wrong failure point. Thus we have to be quite careful here. */
loop_p1:
/* loop until we find something that consumes a character */
switch (*p1++)
{
case Cbol:
case Ceol:
case Cbegbuf:
case Cendbuf:
case Cwordbeg:
case Cwordend:
case Cwordbound:
case Cnotwordbound:
#ifdef emacs
case Cemacs_at_dot:
#endif /* emacs */
goto loop_p1;
case Cstart_memory:
case Cend_memory:
p1++;
goto loop_p1;
case Cexact:
ch = (unsigned char)*p1++;
if (map[ch])
goto make_normal_jump;
break;
case Canychar:
for (b = 0; b < 256; b++)
if (b != '\n' && map[b])
goto make_normal_jump;
break;
case Cset:
for (b = 0; b < 256; b++)
if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
goto make_normal_jump;
p1 += 256/8;
break;
default:
goto make_normal_jump;
}
/* now we know that we can't backtrack. */
while (p1 != p2 - 3)
{
switch (*p1++)
{
case Cend:
abort(); /* we certainly shouldn't get this inside loop */
/*NOTREACHED*/
case Cbol:
case Ceol:
case Canychar:
case Cbegbuf:
case Cendbuf:
case Cwordbeg:
case Cwordend:
case Cwordbound:
case Cnotwordbound:
#ifdef emacs
case Cemacs_at_dot:
#endif /* emacs */
break;
if (text == textend || *text == '\n')
goto continue_matching;
goto fail;
}
case Cset:
p1 += 256/8;
break;
{
NEXTCHAR(ch);
if (code[ch/8] & (1<<(ch & 7)))
{
code += 256/8;
goto continue_matching;
}
goto fail;
}
case Cexact:
{
NEXTCHAR(ch);
if (ch != (unsigned char)*code++)
goto fail;
/* { */
/* char *p1 = code - 2; */
/* ch = *(code - 1); */
/* POP_FAILURE(state, code, text, goto done_matching, goto error); */
/* while ((code == p1) && (*text != ch)) */
/* POP_FAILURE(state, code, text, goto done_matching, goto error); */
/* if ((code == p1) && (*text == ch)) */
/* { */
/* code += 2; */
/* text++; */
/* } */
/* } */
goto continue_matching;
}
case Canychar:
{
NEXTCHAR(ch);
if (ch == '\n')
goto fail;
goto continue_matching;
}
case Cstart_memory:
{
reg = *code++;
SET_REG_START(state, reg, text, goto error);
goto continue_matching;
}
case Cend_memory:
case Cmatch_memory:
case Csyntaxspec:
case Cnotsyntaxspec:
p1++;
break;
case Cjump:
case Cstar_jump:
case Cfailure_jump:
case Cupdate_failure_jump:
case Cdummy_failure_jump:
goto make_normal_jump;
default:
printf("regexpr.c: processing star_jump: unknown op %d\n", p1[-1]);
break;
{
reg = *code++;
SET_REG_END(state, reg, text, goto error);
goto continue_matching;
}
case Cmatch_memory:
{
reg = *code++;
regstart = GET_REG_START(state, reg);
regend = GET_REG_END(state, reg);
if ((regstart == NULL) || (regend == NULL))
goto fail; /* or should we just match nothing? */
regsize = regend - regstart;
if (regsize > (textend - text))
goto fail;
if(translate)
{
for (; regstart < regend; regstart++, text++)
if (translate[*regstart] != translate[*text])
goto fail;
}
goto make_update_jump;
else
for (; regstart < regend; regstart++, text++)
if (*regstart != *text)
goto fail;
/* if (memcmp(text, regstart, regsize) != 0)
goto fail;
text += regsize; */
goto continue_matching;
}
make_normal_jump:
/* printf("changing to normal jump\n"); */
code -= 3;
*code = Cjump;
break;
make_update_jump:
/* printf("changing to update jump\n"); */
code -= 2;
a += 3; /* jump to after the Cfailure_jump */
code[-1] = Cupdate_failure_jump;
code[0] = a & 0xff;
code[1] = a >> 8;
/* fall to next case */
case Cupdate_failure_jump:
failure_sp[-1].text = text;
failure_sp[-1].partend = partend;
{
UPDATE_FAILURE(state, text, goto error);
/* fall to next case */
}
/* treat Cstar_jump just like Cjump if it hasn't been optimized */
case Cstar_jump:
case Cjump:
{
a = (unsigned char)*code++;
a |= (unsigned char)*code++ << 8;
code += (int)(short)a;
break;
goto continue_matching;
}
case Cdummy_failure_jump:
case Cfailure_jump:
if (failure_sp == failure_stack_end)
{
if (failure_stack_start != initial_failure_stack)
goto error;
failure_stack_start = (struct failure_point *)
malloc(MAX_FAILURES * sizeof(*failure_stack_start));
if (failure_stack_start == NULL)
{
failure_stack_start = initial_failure_stack;
goto error;
}
failure_stack_end = failure_stack_start + MAX_FAILURES;
memcpy((char *)failure_stack_start, (char *)initial_failure_stack,
INITIAL_FAILURES * sizeof(*failure_stack_start));
failure_sp = failure_stack_start + INITIAL_FAILURES;
}
a = (unsigned char)*code++;
a |= (unsigned char)*code++ << 8;
a = (int)(short)a;
if (code[-3] == Cdummy_failure_jump)
{ /* this is only used in plus */
assert(*code == Cfailure_jump);
b = (unsigned char)code[1];
b |= (unsigned char)code[2] << 8;
failure_sp->code = code + (int)(short)b + 3;
failure_sp->text = NULL;
PUSH_FAILURE(state, code + (int)(short)b + 3, NULL, goto error);
code += a;
goto continue_matching;
}
else
case Cfailure_jump:
{
failure_sp->code = code + a;
failure_sp->text = text;
failure_sp->partend = partend;
a = (unsigned char)*code++;
a |= (unsigned char)*code++ << 8;
a = (int)(short)a;
PUSH_FAILURE(state, code + a, text, goto error);
goto continue_matching;
}
failure_sp++;
break;
case Cbegbuf:
if (text == string1)
break;
{
if (text == textstart)
goto continue_matching;
goto fail;
}
case Cendbuf:
if (size2 == 0 ? text == string1 + size1 : text == string2 + size2)
break;
{
if (text == textend)
goto continue_matching;
goto fail;
}
case Cwordbeg:
if (text == string2 + size2)
goto fail;
if (size2 == 0 && text == string1 + size1)
{
if (text == textend)
goto fail;
if (SYNTAX(text == string1 + size1 ? *string1 : *text) != Sword)
if (SYNTAX(*text) != Sword)
goto fail;
if (text == string1)
break;
if (text == textstart)
goto continue_matching;
if (SYNTAX(text[-1]) != Sword)
break;
goto continue_matching;
goto fail;
}
case Cwordend:
if (text == string1)
{
if (text == textstart)
goto fail;
if (SYNTAX(text[-1]) != Sword)
goto fail;
if (text == string2 + size2)
break;
if (size2 == 0 && text == string1 + size1)
break;
if (text == textend)
goto continue_matching;
if (SYNTAX(*text) == Sword)
goto fail;
break;
goto continue_matching;
}
case Cwordbound:
{
/* Note: as in gnu regexp, this also matches at the beginning
and end of buffer. */
if (text == string1 || text == string2 + size2 ||
(size2 == 0 && text == string1 + size1))
break;
if ((SYNTAX(text[-1]) == Sword) ^
(SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword))
break;
* and end of buffer. */
if (text == textstart || text == textend)
goto continue_matching;
if ((SYNTAX(text[-1]) == Sword) ^ (SYNTAX(*text) == Sword))
goto continue_matching;
goto fail;
}
case Cnotwordbound:
{
/* Note: as in gnu regexp, this never matches at the beginning
and end of buffer. */
if (text == string1 || text == string2 + size2 ||
(size2 == 0 && text == string1 + size1))
* and end of buffer. */
if (text == textstart || text == textend)
goto fail;
if (!((SYNTAX(text[-1]) == Sword) ^
(SYNTAX(text == string1 + size1 ? *string2 : *text) == Sword)))
if (!((SYNTAX(text[-1]) == Sword) ^ (SYNTAX(*text) == Sword)))
goto fail;
break;
goto continue_matching;
}
case Csyntaxspec:
{
NEXTCHAR(ch);
if (SYNTAX(ch) != (unsigned char)*code++)
goto fail;
break;
goto continue_matching;
}
case Cnotsyntaxspec:
{
NEXTCHAR(ch);
if (SYNTAX(ch) != (unsigned char)*code++)
break;
goto fail;
#ifdef emacs
case Cemacs_at_dot:
if (PTR_CHAR_POS((unsigned char *)text) + 1 != point)
goto fail;
break;
#endif /* emacs */
goto continue_matching;
}
default:
{
abort();
/*NOTREACHED*/
}
}
#if 0 /* This line is never reached --Guido */
abort();
#endif
/*NOTREACHED*/
/*
*NOTREACHED
*/
fail:
if (failure_sp != failure_stack_start)
{
failure_sp--;
text = failure_sp->text;
if (text == NULL)
goto fail;
partend = failure_sp->partend;
code = failure_sp->code;
POP_FAILURE(state, code, text, goto done_matching, goto error);
goto continue_matching;
}
if (failure_stack_start != initial_failure_stack)
free((char *)failure_stack_start);
done_matching:
/* if(translated != NULL) */
/* free(translated); */
FREE_STATE(state);
return -1;
error:
if (failure_stack_start != initial_failure_stack)
free((char *)failure_stack_start);
/* if (translated != NULL) */
/* free(translated); */
FREE_STATE(state);
return -2;
}
#undef PREFETCH
#undef NEXTCHAR
#undef PUSH_FAILURE
int re_match(bufp, string, size, pos, regs)
regexp_t bufp;
char *string;
int size, pos;
regexp_registers_t regs;
int re_search(regexp_t bufp,
char *string,
int size,
int pos,
int range,
regexp_registers_t regs)
{
return re_match_2(bufp, string, size, (char *)NULL, 0, pos, regs, size);
}
int re_search_2(bufp, string1, size1, string2, size2, pos, range, regs,
mstop)
regexp_t bufp;
char *string1, *string2;
int size1, size2, pos, range, mstop;
regexp_registers_t regs;
{
char *fastmap, *translate, *text, *partstart, *partend;
int dir, ret;
char *fastmap;
char *translate;
char *text;
char *partstart;
char *partend;
int dir;
int ret;
char anchor;
assert(size1 >= 0 && size2 >= 0 && pos >= 0 && mstop >= 0);
assert(pos + range >= 0 && pos + range <= size1 + size2); /* Bugfix by ylo */
assert(pos <= mstop);
assert(size >= 0 && pos >= 0);
assert(pos + range >= 0 && pos + range <= size); /* Bugfix by ylo */
fastmap = bufp->fastmap;
translate = bufp->translate;
......@@ -1417,6 +1858,7 @@ regexp_registers_t regs;
anchor = bufp->anchor;
if (bufp->can_be_null == 1) /* can_be_null == 2: can match null at eob */
fastmap = NULL;
if (range < 0)
{
dir = -1;
......@@ -1424,59 +1866,39 @@ regexp_registers_t regs;
}
else
dir = 1;
if (anchor == 2)
if (pos != 0)
return -1;
else
range = 0;
for (; range >= 0; range--, pos += dir)
{
if (fastmap)
{
if (dir == 1)
{ /* searching forwards */
if (pos < size1)
{
text = string1 + pos;
if (pos + range > size1)
partend = string1 + size1;
else
partend = string1 + pos + range;
}
else
{
text = string2 + pos - size1;
partend = string2 + pos + range - size1;
}
text = string + pos;
partend = string + size;
partstart = text;
if (translate)
while (text != partend &&
!fastmap[(unsigned char)
translate[(unsigned char)*text]])
!fastmap[(unsigned char) translate[(unsigned char)*text]])
text++;
else
while (text != partend && !fastmap[(unsigned char)*text])
text++;
pos += text - partstart;
range -= text - partstart;
if (pos == size1 + size2 && bufp->can_be_null == 0)
if (pos == size && bufp->can_be_null == 0)
return -1;
}
else
{ /* searching backwards */
if (pos <= size1)
{
text = string1 + pos;
partstart = string1 + pos - range;
}
else
{
text = string2 + pos - size1;
if (range < pos - size1)
partstart = string2 + pos - size1 - range;
else
partstart = string2;
}
text = string + pos;
partstart = string + pos - range;
partend = text;
if (translate)
while (text != partstart &&
......@@ -1493,13 +1915,11 @@ regexp_registers_t regs;
}
if (anchor == 1)
{ /* anchored to begline */
if (pos > 0 &&
(pos <= size1 ? string1[pos - 1] :
string2[pos - size1 - 1]) != '\n')
if (pos > 0 && string[pos - 1])
continue;
}
assert(pos >= 0 && pos <= size1 + size2);
ret = re_match_2(bufp, string1, size1, string2, size2, pos, regs, mstop);
assert(pos >= 0 && pos <= size);
ret = re_match(bufp, string, size, pos, regs);
if (ret >= 0)
return pos;
if (ret == -2)
......@@ -1507,198 +1927,3 @@ regexp_registers_t regs;
}
return -1;
}
/* Search a single string for the compiled pattern.  This simply
   forwards to re_search_2 with an empty second string and with the
   match limit (mstop) set to SIZE, and returns whatever re_search_2
   returns. */
int re_search(bufp, string, size, startpos, range, regs)
     regexp_t bufp;
     char *string;
     int size;
     int startpos;
     int range;
     regexp_registers_t regs;
{
  int found;

  found = re_search_2(bufp, string, size, (char *)NULL, 0,
		      startpos, range, regs, size);
  return found;
}
#ifdef UNUSED
/* Pattern buffer shared by re_comp and re_exec.  Being static, all of
   its fields start out zero/NULL. */
static struct re_pattern_buffer re_comp_buf;

/* Compile S into the shared pattern buffer.

   If S is NULL, nothing is compiled: the previously compiled pattern
   is kept, and an error string is returned only if no pattern buffer
   exists yet.

   Returns NULL on success, otherwise a static error message (either
   one produced here or the one returned by re_compile_pattern). */
char *re_comp(s)
     char *s;
{
  if (s == NULL)
    {
      if (!re_comp_buf.buffer)
	return "No previous regular expression";
      return NULL;
    }
  if (!re_comp_buf.buffer)
    {
      /* the buffer will be allocated automatically */
      re_comp_buf.translate = NULL;
      /* Allocate the fastmap only once: this branch is taken again on
	 a later call if the pattern buffer still does not exist, and
	 allocating unconditionally would leak the earlier block. */
      if (re_comp_buf.fastmap == NULL)
	re_comp_buf.fastmap = malloc(256);
      if (re_comp_buf.fastmap == NULL)
	return "Out of memory";
    }
  return re_compile_pattern(s, strlen(s), &re_comp_buf);
}
/* Search S for the pattern held in re_comp_buf, trying every start
   position from 0 through strlen(S) and recording no registers.
   Returns 1 if re_search reports a match position (>= 0), else 0. */
int re_exec(s)
     char *s;
{
  int length;
  int where;

  length = strlen(s);
  where = re_search(&re_comp_buf, s, length, 0, length,
		    (regexp_registers_t)NULL);
  return where >= 0;
}
#endif
#ifdef TEST_REGEXP
int main()
{
char buf[500];
char *cp;
struct re_pattern_buffer exp;
struct re_registers regs;
int a,pos;
char fastmap[256];
exp.allocated = 0;
exp.buffer = 0;
exp.translate = NULL;
exp.fastmap = fastmap;
/* re_set_syntax(RE_NO_BK_PARENS|RE_NO_BK_VBAR|RE_ANSI_HEX); */
while (1)
{
printf("Enter regexp:\n");
gets(buf);
cp=re_compile_pattern(buf, strlen(buf), &exp);
if (cp)
{
printf("Error: %s\n", cp);
continue;
}
re_compile_fastmap(&exp);
printf("dump:\n");
for (pos = 0; pos < exp.used;)
{
printf("%d: ", pos);
switch (exp.buffer[pos++])
{
case Cend:
strcpy(buf, "end");
break;
case Cbol:
strcpy(buf, "bol");
break;
case Ceol:
strcpy(buf, "eol");
break;
case Cset:
strcpy(buf, "set ");
for (a = 0; a < 256/8; a++)
sprintf(buf+strlen(buf)," %02x",
(unsigned char)exp.buffer[pos++]);
break;
case Cexact:
sprintf(buf, "exact '%c' 0x%x", exp.buffer[pos],
(unsigned char)exp.buffer[pos]);
pos++;
break;
case Canychar:
strcpy(buf, "anychar");
break;
case Cstart_memory:
sprintf(buf, "start_memory %d", exp.buffer[pos++]);
break;
case Cend_memory:
sprintf(buf, "end_memory %d", exp.buffer[pos++]);
break;
case Cmatch_memory:
sprintf(buf, "match_memory %d", exp.buffer[pos++]);
break;
case Cjump:
case Cdummy_failure_jump:
case Cstar_jump:
case Cfailure_jump:
case Cupdate_failure_jump:
a = (unsigned char)exp.buffer[pos++];
a += (unsigned char)exp.buffer[pos++] << 8;
a = (int)(short)a;
switch (exp.buffer[pos-3])
{
case Cjump:
cp = "jump";
break;
case Cstar_jump:
cp = "star_jump";
break;
case Cfailure_jump:
cp = "failure_jump";
break;
case Cupdate_failure_jump:
cp = "update_failure_jump";
break;
case Cdummy_failure_jump:
cp = "dummy_failure_jump";
break;
default:
cp = "unknown jump";
break;
}
sprintf(buf, "%s %d", cp, a + pos);
break;
case Cbegbuf:
strcpy(buf,"begbuf");
break;
case Cendbuf:
strcpy(buf,"endbuf");
break;
case Cwordbeg:
strcpy(buf,"wordbeg");
break;
case Cwordend:
strcpy(buf,"wordend");
break;
case Cwordbound:
strcpy(buf,"wordbound");
break;
case Cnotwordbound:
strcpy(buf,"notwordbound");
break;
default:
sprintf(buf, "unknown code %d",
(unsigned char)exp.buffer[pos - 1]);
break;
}
printf("%s\n", buf);
}
printf("can_be_null = %d uses_registers = %d anchor = %d\n",
exp.can_be_null, exp.uses_registers, exp.anchor);
printf("fastmap:");
for (a = 0; a < 256; a++)
if (exp.fastmap[a])
printf(" %d", a);
printf("\n");
printf("Enter strings. An empty line terminates.\n");
while (fgets(buf, sizeof(buf), stdin))
{
if (buf[0] == '\n')
break;
a = re_search(&exp, buf, strlen(buf), 0, strlen(buf), &regs);
printf("search returns %d\n", a);
if (a != -1)
{
for (a = 0; a < RE_NREGS; a++)
{
printf("buf %d: %d to %d\n", a, regs.start[a], regs.end[a]);
}
}
}
}
}
#endif /* TEST_REGEXP */
......@@ -69,9 +69,7 @@ typedef struct re_registers
#define re_set_syntax _Py_re_set_syntax
#define re_compile_pattern _Py_re_compile_pattern
#define re_match _Py_re_match
#define re_match_2 _Py_re_match_2
#define re_search _Py_re_search
#define re_search_2 _Py_re_search_2
#define re_compile_fastmap _Py_re_compile_fastmap
#define re_comp _Py_re_comp
#define re_exec _Py_re_exec
......@@ -96,20 +94,12 @@ char *re_compile_pattern(char *regex, int regex_size, regexp_t compiled);
translation table, or NULL if it is not used. */
int re_match(regexp_t compiled, char *string, int size, int pos,
regexp_registers_t regs);
regexp_registers_t old_regs);
/* This tries to match the regexp against the string. This returns the
length of the matched portion, or -1 if the pattern could not be
matched and -2 if an error (such as failure stack overflow) is
encountered. */
int re_match_2(regexp_t compiled, char *string1, int size1,
char *string2, int size2, int pos, regexp_registers_t regs,
int mstop);
/* This tries to match the regexp to the concatenation of string1 and
string2. This returns the length of the matched portion, or -1 if the
pattern could not be matched and -2 if an error (such as failure stack
overflow) is encountered. */
int re_search(regexp_t compiled, char *string, int size, int startpos,
int range, regexp_registers_t regs);
/* This searches for a substring matching the regexp. This returns the first
......@@ -119,12 +109,6 @@ int re_search(regexp_t compiled, char *string, int size, int startpos,
which a match must not go. This returns -1 if no match is found, and
-2 if an error (such as failure stack overflow) is encountered. */
int re_search_2(regexp_t compiled, char *string1, int size1,
char *string2, int size2, int startpos, int range,
regexp_registers_t regs, int mstop);
/* This is like re_search, but search from the concatenation of string1 and
string2. */
void re_compile_fastmap(regexp_t compiled);
/* This computes the fastmap for the regexp. For this to have any effect,
the calling program must have initialized the fastmap field to point
......@@ -146,9 +130,7 @@ extern int re_syntax;
int re_set_syntax();
char *re_compile_pattern();
int re_match();
int re_match_2();
int re_search();
int re_search_2();
void re_compile_fastmap();
char *re_comp();
int re_exec();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment