Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
004c1e1d
Commit
004c1e1d
authored
May 09, 1997
by
Guido van Rossum
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Latest from Jeffrey Ollie.
Infinite failure stack, some bugs fixed (fastmap, star_jump, register bug).
parent
1681429b
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
1680 additions
and
1473 deletions
+1680
-1473
Modules/regexpr.c
Modules/regexpr.c
+1679
-1454
Modules/regexpr.h
Modules/regexpr.h
+1
-19
No files found.
Modules/regexpr.c
View file @
004c1e1d
/*
regexpr.c
Author: Tatu Ylonen <ylo@ngs.fi>
Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
Permission to use, copy, modify, distribute, and sell this software
and its documentation for any purpose is hereby granted without fee,
provided that the above copyright notice appear in all copies. This
software is provided "as is" without express or implied warranty.
Created: Thu Sep 26 17:14:05 1991 ylo
Last modified: Mon Nov 4 17:06:48 1991 ylo
Ported to Think C: 19 Jan 1992 guido@cwi.nl
This code draws many ideas from the regular expression packages by
Henry Spencer of the University of Toronto and Richard Stallman of the
Free Software Foundation.
Emacs-specific code and syntax table code is almost directly borrowed
from GNU regexp.
*/
/* regexpr.c
*
* Author: Tatu Ylonen <ylo@ngs.fi>
*
* Copyright (c) 1991 Tatu Ylonen, Espoo, Finland
*
* Permission to use, copy, modify, distribute, and sell this software
* and its documentation for any purpose is hereby granted without
* fee, provided that the above copyright notice appear in all copies.
* This software is provided "as is" without express or implied
* warranty.
*
* Created: Thu Sep 26 17:14:05 1991 ylo
* Last modified: Mon Nov 4 17:06:48 1991 ylo
* Ported to Think C: 19 Jan 1992 guido@cwi.nl
*
* This code draws many ideas from the regular expression packages by
* Henry Spencer of the University of Toronto and Richard Stallman of
* the Free Software Foundation.
*
* Emacs-specific code and syntax table code is almost directly borrowed
* from GNU regexp.
*
* Bugs fixed and lots of reorganization by Jeffrey C. Ollie, April
* 1997 Thanks for bug reports and ideas from Andrew Kuchling, Tim
* Peters, Guido van Rossum, Ka-Ping Yee, Sjoerd Mullender, and
* probably one or two others that I'm forgetting.
*
* $Id$ */
#include "config.h"
/* For Win* specific redefinition of printf c.s. */
#include "myproto.h"
/* For Py_PROTO macro --Guido */
#include "myproto.h"
/* For PROTO macro --Guido */
#include <stdio.h>
#ifndef NDEBUG
#define NDEBUG 1
#endif
#include <assert.h>
#include "regexpr.h"
...
...
@@ -48,8 +57,316 @@ char *realloc();
#endif
/* __STDC__ */
#endif
/* THINK_C */
#define MACRO_BEGIN do {
#define MACRO_END } while (0)
/* The stack implementation is taken from an idea by Andrew Kuchling.
 * It's a doubly linked list of arrays. The advantage of this over a
 * simple linked list is that the number of mallocs required is
 * reduced. It also makes it possible to statically allocate enough
 * space so that small patterns don't ever need to call malloc.
 *
 * The advantage over a single array that is periodically realloced
 * when more space is needed is that we avoid ever copying the
 * stack. */
/* item_t is the unit stored on the matcher's stack.  It is a union of
 * three record layouts -- a saved register, a failure point and a
 * counter -- so that any of them can be pushed and popped through the
 * same slot.  The union carries no tag: nothing inside an item says
 * which of the three layouts it currently holds. */
typedef union item_t
{
	/* Saved register, as written by SET_REG_START / SET_REG_END. */
	struct
	{
		int num;	/* register number */
		int level;	/* the register's previous 'changed' level */
		char *start;	/* previous start of the register */
		char *end;	/* previous end of the register */
	} reg;

	/* Failure point, as written by PUSH_FAILURE / UPDATE_FAILURE. */
	struct
	{
		int count;	/* state.count at the time of the push */
		int level;	/* state.level at the time of the push */
		int phantom;	/* 1 when created by UPDATE_FAILURE, else 0 */
		char *code;	/* position in the compiled pattern */
		char *text;	/* position in the text being matched */
	} fail;

	/* Counter record. */
	struct
	{
		int num;
		int level;
		int count;
	} cntr;
} item_t;
#define STACK_PAGE_SIZE 256
#define NUM_REGISTERS 256
/* One 'page' of the stack: a fixed array of STACK_PAGE_SIZE items plus
 * the links that chain pages into a doubly linked list. */
typedef struct item_page_t
{
	item_t items[STACK_PAGE_SIZE];	/* the stack slots of this page */
	struct item_page_t *prev;	/* page below this one, NULL on the first page */
	struct item_page_t *next;	/* page above this one, NULL if none exists yet */
} item_page_t;
/* All the mutable state of one matching attempt: the stack of saved
 * registers / failure points, the register contents, and the
 * bookkeeping counters used by the stack macros. */
typedef struct match_state
{
	/* The stack itself. */
	struct
	{
		/* Index into the current page of the slot the next push
		 * will use.  To pop an item when index == 0, move to the
		 * previous page and set index = STACK_PAGE_SIZE - 1;
		 * otherwise just decrement index.  To push an item when
		 * index == STACK_PAGE_SIZE, move to the next page
		 * (allocating and linking one in if there is none) and
		 * set index = 0; then store at index and increment it. */
		int index;
		/* Pointer to the current page. */
		item_page_t *current;
		/* The first page lives inside the state object itself, so
		 * it is never malloc'd. */
		item_page_t first;
	} stack;

	/* Start and end of every register. */
	char *start[NUM_REGISTERS];
	char *end[NUM_REGISTERS];

	/* For every register, the value 'level' had when the register
	 * was last saved on the stack. */
	int changed[NUM_REGISTERS];

	/* The number of registers that have been pushed onto the stack
	 * since the last failure point. */
	int count;

	/* Used to control when registers need to be pushed onto the
	 * stack. */
	int level;

	/* The number of failure points on the stack. */
	int point;
} match_state;
/* Discard the top 'count' stack items.
 *
 * Only the stack position is moved back (stepping to earlier pages as
 * needed); the discarded items themselves are not touched.
 *
 * 'on_error' is a statement that is executed when there is no previous
 * page left to step back to, i.e. fewer than 'count' items are on the
 * stack.  It is expected to transfer control; if it does not, the
 * macro goes on to follow the NULL 'prev' pointer.
 *
 * Note that this expands to more than one statement and that 'count'
 * is not parenthesized in the expansion, so use it only as a full
 * statement inside a braced block. */
#define STACK_DISCARD(stack, count, on_error) \
stack.index -= count; \
while (stack.index < 0) \
{ \
if (stack.current->prev == NULL) \
on_error; \
stack.current = stack.current->prev; \
stack.index += STACK_PAGE_SIZE; \
}
/* Store a pointer to the previous item on the stack. Used to pop an
 * item off of the stack.
 *
 * The stack position is moved back by one item (moving to the previous
 * page when the current one is exhausted) and 'top' is set to point at
 * the item that was just popped.
 *
 * 'on_error' is a statement executed when the stack is empty (index 0
 * on a page with no predecessor).  It is expected to transfer control;
 * otherwise the macro continues and follows the NULL 'prev' pointer.
 *
 * Expands to an if/else followed by an assignment, so it must be used
 * as a full statement inside a braced block. */
#define STACK_PREV(stack, top, on_error) \
if (stack.index == 0) \
{ \
if (stack.current->prev == NULL) \
on_error; \
stack.current = stack.current->prev; \
stack.index = STACK_PAGE_SIZE - 1; \
} \
else \
stack.index--; \
top = &(stack.current->items[stack.index])
/* Store a pointer to the next item on the stack. Used to push an item
 * on to the stack.
 *
 * 'top' is set to the slot the caller should fill in, and the stack
 * index is advanced past it.  When the current page is full the macro
 * moves to the next page; a new page is malloc'd and linked in only if
 * no next page exists yet, so pages allocated earlier are reused.
 *
 * 'on_error' is a statement executed when malloc fails.  It is
 * expected to transfer control; otherwise the macro continues and
 * dereferences the NULL page pointer.
 *
 * Expands to an if followed by an assignment, so it must be used as a
 * full statement inside a braced block. */
#define STACK_NEXT(stack, top, on_error) \
if (stack.index == STACK_PAGE_SIZE) \
{ \
if (stack.current->next == NULL) \
{ \
stack.current->next = malloc(sizeof(item_page_t)); \
if (stack.current->next == NULL) \
on_error; \
stack.current->next->prev = stack.current; \
stack.current->next->next = NULL; \
} \
stack.current = stack.current->next; \
stack.index = 0; \
} \
top = &(stack.current->items[stack.index++])
/* Store a pointer to the item that is 'count' items back in the
 * stack. STACK_BACK(stack, top, 1, on_error) is equivalent to
 * STACK_TOP(stack, top, on_error).
 *
 * The stack itself is not modified: the walk back over the pages is
 * done on private copies of the page pointer and index.
 *
 * 'on_error' is a statement executed when fewer than 'count' items are
 * on the stack.  It is expected to transfer control; otherwise the
 * macro continues and follows the NULL 'prev' pointer.
 *
 * The two temporaries carry macro-specific names so that they cannot
 * capture identifiers (such as a caller's own 'index' or 'current')
 * that appear in the 'count', 'top' or 'on_error' arguments. */
#define STACK_BACK(stack, top, count, on_error) \
{ \
  int stack_back_index; \
  item_page_t *stack_back_page; \
  stack_back_page = stack.current; \
  stack_back_index = stack.index - (count); \
  while (stack_back_index < 0) \
    { \
      if (stack_back_page->prev == NULL) \
        on_error; \
      stack_back_page = stack_back_page->prev; \
      stack_back_index += STACK_PAGE_SIZE; \
    } \
  top = &(stack_back_page->items[stack_back_index]); \
}
/* Store a pointer to the top item on the stack. Execute the
 * 'on_error' code if there are no items on the stack.
 *
 * The stack is not modified.  When the current page holds no items the
 * top item is the last slot of the previous page.  'on_error' is
 * expected to transfer control; otherwise the macro continues and
 * follows the NULL 'prev' pointer.
 *
 * Expands to an unbraced if/else, so it must be used as a full
 * statement inside a braced block. */
#define STACK_TOP(stack, top, on_error) \
if (stack.index == 0) \
{ \
if (stack.current->prev == NULL) \
on_error; \
top = &(stack.current->prev->items[STACK_PAGE_SIZE - 1]); \
} \
else \
top = &(stack.current->items[stack.index - 1])
/* Test to see if the stack is empty: true when the index is 0 and the
 * current page has no predecessor.  Usable as an expression; does not
 * modify the stack. */
#define STACK_EMPTY(stack) ((stack.index == 0) && \
(stack.current->prev == NULL))
/* Initialize a state object: zero the whole match_state, point the
 * stack at its embedded first page, and start at level 1.
 *
 * Expands to three statements (the last one without a trailing
 * semicolon), so use it only as a full statement inside a braced
 * block. */
#define NEW_STATE(state) \
memset(&state, 0, sizeof(match_state)); \
state.stack.current = &state.stack.first; \
state.level = 1
/* Free any memory that might have been malloc'd: every page linked
 * after the embedded first page is unlinked and freed.
 *
 * state.stack.current is used as the scratch pointer for the walk, so
 * if any page was freed it is left pointing at freed memory, and
 * state.stack.index is not reset.  The state must be re-initialized
 * (see NEW_STATE) before the stack is used again. */
#define FREE_STATE(state) \
while(state.stack.first.next != NULL) \
{ \
state.stack.current = state.stack.first.next; \
state.stack.first.next = state.stack.current->next; \
free(state.stack.current); \
}
/* Return the start of register 'reg' (an lvalue expression; no bounds
 * check is made on 'reg'). */
#define GET_REG_START(state, reg) (state.start[reg])
/* Return the end of register 'reg' (an lvalue expression; no bounds
 * check is made on 'reg'). */
#define GET_REG_END(state, reg) (state.end[reg])
/* Set the start of register 'reg'. If the state of the register needs
 * saving, push it on the stack.
 *
 * The register needs saving when its 'changed' level is below the
 * current state.level.  In that case its current start, end and
 * 'changed' level are pushed as a 'reg' item, the register is marked
 * as changed at the current level, and state.count is incremented.
 * 'on_error' is handed through to STACK_NEXT.
 *
 * 'reg' is evaluated several times.  Expands to an if block followed
 * by an assignment, so use it only as a full statement inside a braced
 * block. */
#define SET_REG_START(state, reg, text, on_error) \
if(state.changed[reg] < state.level) \
{ \
item_t *item; \
STACK_NEXT(state.stack, item, on_error); \
item->reg.num = reg; \
item->reg.start = state.start[reg]; \
item->reg.end = state.end[reg]; \
item->reg.level = state.changed[reg]; \
state.changed[reg] = state.level; \
state.count++; \
} \
state.start[reg] = text
/* Set the end of register 'reg'. If the state of the register needs
 * saving, push it on the stack.
 *
 * The save logic is the same as in SET_REG_START: when the register's
 * 'changed' level is below the current state.level, its current
 * start, end and 'changed' level are pushed as a 'reg' item, the
 * register is marked as changed at the current level, and state.count
 * is incremented.  'on_error' is handed through to STACK_NEXT.
 *
 * 'reg' is evaluated several times.  Expands to an if block followed
 * by an assignment, so use it only as a full statement inside a braced
 * block. */
#define SET_REG_END(state, reg, text, on_error) \
if(state.changed[reg] < state.level) \
{ \
item_t *item; \
STACK_NEXT(state.stack, item, on_error); \
item->reg.num = reg; \
item->reg.start = state.start[reg]; \
item->reg.end = state.end[reg]; \
item->reg.level = state.changed[reg]; \
state.changed[reg] = state.level; \
state.count++; \
} \
state.end[reg] = text
/* Push a failure point on the stack.
 *
 * The new item records 'xcode' and 'xtext' together with the current
 * state.count and state.level, and is marked as not being a phantom.
 * Afterwards state.count is reset to 0 and both state.level and
 * state.point are incremented.  'on_error' is handed through to
 * STACK_NEXT. */
#define PUSH_FAILURE(state, xcode, xtext, on_error) \
{ \
item_t *item; \
STACK_NEXT(state.stack, item, on_error); \
item->fail.code = xcode; \
item->fail.text = xtext; \
item->fail.count = state.count; \
item->fail.level = state.level; \
item->fail.phantom = 0; \
state.count = 0; \
state.level++; \
state.point++; \
}
/* Update the last failure point with a new position in the text.
 *
 * Two earlier variants of this macro are left commented out below; the
 * definition in effect is the third one.
 *
 * The active version first looks at the item that lies state.count + 1
 * entries below the top of the stack (state.count being the number of
 * registers pushed since the last failure point).
 *
 *  - If that item is not a phantom, a new failure item is pushed that
 *    copies its 'code', takes 'xtext' as its text position, records
 *    the current state.count and state.level and is marked as a
 *    phantom; state.count is then reset to 0 and state.level and
 *    state.point are incremented.
 *
 *  - If it is a phantom, the state.count items above it are discarded
 *    (without being restored into the registers), its text position is
 *    overwritten with 'xtext', state.count is reset to 0 and
 *    state.level is incremented.
 *
 * 'on_error' is handed through to the stack macros used. */
/* #define UPDATE_FAILURE(state, xtext, on_error) \ */
/* { \ */
/* item_t *item; \ */
/* STACK_DISCARD(state.stack, state.count, on_error); \ */
/* STACK_TOP(state.stack, item, on_error); \ */
/* item->fail.text = xtext; \ */
/* state.count = 0; \ */
/* } */
/* #define UPDATE_FAILURE(state, xtext, on_error) \ */
/* { \ */
/* item_t *item; \ */
/* STACK_BACK(state.stack, item, state.count + 1, on_error); \ */
/* item->fail.text = xtext; \ */
/* } */
#define UPDATE_FAILURE(state, xtext, on_error) \
{ \
item_t *item; \
STACK_BACK(state.stack, item, state.count + 1, on_error); \
if (!item->fail.phantom) \
{ \
item_t *item2; \
STACK_NEXT(state.stack, item2, on_error); \
item2->fail.code = item->fail.code; \
item2->fail.text = xtext; \
item2->fail.count = state.count; \
item2->fail.level = state.level; \
item2->fail.phantom = 1; \
state.count = 0; \
state.level++; \
state.point++; \
} \
else \
{ \
STACK_DISCARD(state.stack, state.count, on_error); \
STACK_TOP(state.stack, item, on_error); \
item->fail.text = xtext; \
state.count = 0; \
state.level++; \
} \
}
/* Pop the most recent usable failure point off the stack.
 *
 * Each round first pops the state.count register items pushed since
 * the last failure point and restores the start, end and 'changed'
 * level of the registers they describe.  It then pops the failure item
 * itself, copies its code and text positions into 'xcode' and 'xtext',
 * restores state.count and state.level from it and decrements
 * state.point.  Rounds are repeated for as long as the failure item
 * just popped has a NULL text position.
 *
 * 'on_error' is executed if the stack runs out while register items
 * are being popped; 'on_empty' is executed if it runs out when a
 * failure item is expected.  Both are expected to transfer control. */
#define POP_FAILURE(state, xcode, xtext, on_empty, on_error) \
{ \
item_t *item; \
do \
{ \
while(state.count > 0) \
{ \
STACK_PREV(state.stack, item, on_error); \
state.start[item->reg.num] = item->reg.start; \
state.end[item->reg.num] = item->reg.end; \
state.changed[item->reg.num] = item->reg.level; \
state.count--; \
} \
STACK_PREV(state.stack, item, on_empty); \
xcode = item->fail.code; \
xtext = item->fail.text; \
state.count = item->fail.count; \
state.level = item->fail.level; \
state.point--; \
} \
while (item->fail.text == NULL); \
}
enum
regexp_compiled_ops
/* opcodes for compiled regexp */
{
...
...
@@ -73,9 +390,6 @@ enum regexp_compiled_ops /* opcodes for compiled regexp */
Cwordend
,
/* match at end of word */
Cwordbound
,
/* match if at word boundary */
Cnotwordbound
,
/* match if not at word boundary */
#ifdef emacs
Cemacs_at_dot
,
/* emacs only: matches at dot */
#endif
/* emacs */
Csyntaxspec
,
/* matches syntax code (1 byte follows) */
Cnotsyntaxspec
/* matches if syntax code does not match (1 byte foll)*/
};
...
...
@@ -106,11 +420,6 @@ enum regexp_syntax_op /* syntax codes for plain and quoted characters */
Rwordend
,
/* end of word */
Rwordbound
,
/* word bound */
Rnotwordbound
,
/* not word bound */
#ifdef emacs
Remacs_at_dot
,
/* emacs: at dot */
Remacs_syntaxspec
,
/* syntaxspec */
Remacs_notsyntaxspec
,
/* notsyntaxspec */
#endif
/* emacs */
Rnum_ops
};
...
...
@@ -126,38 +435,15 @@ static int regexp_ansi_sequences;
#define NUM_LEVELS 5
/* number of precedence levels in use */
#define MAX_NESTING 100
/* max nesting level of operators */
#ifdef emacs
/* This code is for emacs compatibility only. */
#include "config.h"
#include "lisp.h"
#include "buffer.h"
#include "syntax.h"
/* emacs defines NULL in some strange way? */
#undef NULL
#define NULL 0
#else
/* emacs */
#define SYNTAX(ch) re_syntax_table[(unsigned char)(ch)]
#define Sword 1
#ifdef SYNTAX_TABLE
char
*
re_syntax_table
;
#else
static
char
re_syntax_table
[
256
];
#endif
/* SYNTAX_TABLE */
#endif
/* emacs */
static
void
re_compile_initialize
Py_PROTO
((
void
));
static
void
re_compile_initialize
()
static
void
re_compile_initialize
(
void
)
{
int
a
;
#if !defined(emacs) && !defined(SYNTAX_TABLE)
static
int
syntax_table_inited
=
0
;
if
(
!
syntax_table_inited
)
...
...
@@ -171,7 +457,6 @@ static void re_compile_initialize()
for
(
a
=
'0'
;
a
<=
'9'
;
a
++
)
re_syntax_table
[
a
]
=
Sword
;
}
#endif
/* !emacs && !SYNTAX_TABLE */
re_compile_initialized
=
1
;
for
(
a
=
0
;
a
<
256
;
a
++
)
{
...
...
@@ -214,11 +499,6 @@ static void re_compile_initialize()
regexp_plain_ops
[
'.'
]
=
Ranychar
;
if
(
!
(
regexp_syntax
&
RE_NO_GNU_EXTENSIONS
))
{
#ifdef emacs
regexp_quoted_ops
[
'='
]
=
Remacs_at_dot
;
regexp_quoted_ops
[
's'
]
=
Remacs_syntaxspec
;
regexp_quoted_ops
[
'S'
]
=
Remacs_notsyntaxspec
;
#endif
/* emacs */
regexp_quoted_ops
[
'w'
]
=
Rwordchar
;
regexp_quoted_ops
[
'W'
]
=
Rnotwordchar
;
regexp_quoted_ops
[
'<'
]
=
Rwordbeg
;
...
...
@@ -250,8 +530,7 @@ static void re_compile_initialize()
regexp_ansi_sequences
=
(
regexp_syntax
&
RE_ANSI_HEX
)
!=
0
;
}
int
re_set_syntax
(
syntax
)
int
syntax
;
int
re_set_syntax
(
int
syntax
)
{
int
ret
;
...
...
@@ -262,9 +541,7 @@ int syntax;
return
ret
;
}
static
int
hex_char_to_decimal
Py_PROTO
((
int
));
static
int
hex_char_to_decimal
(
ch
)
int
ch
;
static
int
hex_char_to_decimal
(
int
ch
)
{
if
(
ch
>=
'0'
&&
ch
<=
'9'
)
return
ch
-
'0'
;
...
...
@@ -275,224 +552,659 @@ int ch;
return
16
;
}
char
*
re_compile_pattern
(
regex
,
size
,
bufp
)
char
*
regex
;
int
size
;
regexp_t
bufp
;
static
void
re_compile_fastmap_aux
(
char
*
code
,
int
pos
,
char
*
visited
,
char
*
can_be_null
,
char
*
fastmap
)
{
int
a
,
pos
,
op
,
current_level
,
level
,
opcode
;
int
pattern_offset
=
0
,
alloc
;
int
starts
[
NUM_LEVELS
*
MAX_NESTING
],
starts_base
;
int
future_jumps
[
MAX_NESTING
],
num_jumps
;
unsigned
char
ch
=
'\0'
;
char
*
pattern
,
*
translate
;
int
next_register
,
paren_depth
,
num_open_registers
,
open_registers
[
RE_NREGS
];
int
beginning_context
;
#define NEXTCHAR(var) \
MACRO_BEGIN \
if (pos >= size) \
goto ends_prematurely; \
(var) = regex[pos]; \
pos++; \
MACRO_END
#define ALLOC(amount) \
MACRO_BEGIN \
if (pattern_offset+(amount) > alloc) \
{ \
alloc += 256 + (amount); \
pattern = realloc(pattern, alloc); \
if (!pattern) \
goto out_of_memory; \
} \
MACRO_END
#define STORE(ch) pattern[pattern_offset++] = (ch)
#define CURRENT_LEVEL_START (starts[starts_base + current_level])
#define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
#define PUSH_LEVEL_STARTS if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
starts_base += NUM_LEVELS; \
else \
goto too_complex
#define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
#define PUT_ADDR(offset,addr) \
MACRO_BEGIN \
int disp = (addr) - (offset) - 2; \
pattern[(offset)] = disp & 0xff; \
pattern[(offset)+1] = (disp>>8) & 0xff; \
MACRO_END
#define INSERT_JUMP(pos,type,addr) \
MACRO_BEGIN \
int a, p = (pos), t = (type), ad = (addr); \
for (a = pattern_offset - 1; a >= p; a--) \
pattern[a + 3] = pattern[a]; \
pattern[p] = t; \
PUT_ADDR(p+1,ad); \
pattern_offset += 3; \
MACRO_END
#define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
#define SET_FIELDS \
MACRO_BEGIN \
bufp->allocated = alloc; \
bufp->buffer = pattern; \
bufp->used = pattern_offset; \
MACRO_END
#define GETHEX(var) \
MACRO_BEGIN \
char gethex_ch, gethex_value; \
NEXTCHAR(gethex_ch); \
gethex_value = hex_char_to_decimal(gethex_ch); \
if (gethex_value == 16) \
goto hex_error; \
NEXTCHAR(gethex_ch); \
gethex_ch = hex_char_to_decimal(gethex_ch); \
if (gethex_ch == 16) \
goto hex_error; \
(var) = gethex_value * 16 + gethex_ch; \
MACRO_END
#define ANSI_TRANSLATE(ch) \
MACRO_BEGIN \
switch (ch) \
{ \
case 'a': \
case 'A': \
ch = 7;
/* audible bell */
\
break; \
case 'b': \
case 'B': \
ch = 8;
/* backspace */
\
break; \
case 'f': \
case 'F': \
ch = 12;
/* form feed */
\
break; \
case 'n': \
case 'N': \
ch = 10;
/* line feed */
\
break; \
case 'r': \
case 'R': \
ch = 13;
/* carriage return */
\
break; \
case 't': \
case 'T': \
ch = 9;
/* tab */
\
break; \
case 'v': \
case 'V': \
ch = 11;
/* vertical tab */
\
break; \
case 'x':
/* hex code */
\
case 'X': \
GETHEX(ch); \
break; \
default: \
/* other characters passed through */
\
if (translate) \
ch = translate[(unsigned char)ch]; \
break; \
} \
MACRO_END
int
a
;
int
b
;
int
syntaxcode
;
if
(
!
re_compile_initialized
)
re_compile_initialize
();
bufp
->
used
=
0
;
bufp
->
fastmap_accurate
=
0
;
bufp
->
uses_registers
=
0
;
translate
=
bufp
->
translate
;
pattern
=
bufp
->
buffer
;
alloc
=
bufp
->
allocated
;
if
(
alloc
==
0
||
pattern
==
NULL
)
{
alloc
=
256
;
pattern
=
malloc
(
alloc
);
if
(
!
pattern
)
goto
out_of_memory
;
}
pattern_offset
=
0
;
starts_base
=
0
;
num_jumps
=
0
;
current_level
=
0
;
SET_LEVEL_START
;
num_open_registers
=
0
;
next_register
=
1
;
paren_depth
=
0
;
beginning_context
=
1
;
op
=
-
1
;
/* we use Rend dummy to ensure that pending jumps are updated (due to
low priority of Rend) before exiting the loop. */
pos
=
0
;
while
(
op
!=
Rend
)
if
(
visited
[
pos
])
return
;
/* we have already been here */
visited
[
pos
]
=
1
;
for
(;;)
switch
(
code
[
pos
++
])
{
if
(
pos
>=
size
)
op
=
Rend
;
else
case
Cend
:
{
NEXTCHAR
(
ch
);
if
(
translate
)
ch
=
translate
[(
unsigned
char
)
ch
];
op
=
regexp_plain_ops
[(
unsigned
char
)
ch
];
if
(
op
==
Rquote
)
*
can_be_null
=
1
;
return
;
}
case
Cbol
:
case
Cbegbuf
:
case
Cendbuf
:
case
Cwordbeg
:
case
Cwordend
:
case
Cwordbound
:
case
Cnotwordbound
:
{
NEXTCHAR
(
ch
);
op
=
regexp_quoted_ops
[(
unsigned
char
)
ch
];
if
(
op
==
Rnormal
&&
regexp_ansi_sequences
)
ANSI_TRANSLATE
(
ch
);
break
;
}
case
Csyntaxspec
:
{
syntaxcode
=
code
[
pos
++
];
for
(
a
=
0
;
a
<
256
;
a
++
)
if
(
SYNTAX
(
a
)
==
syntaxcode
)
fastmap
[
a
]
=
1
;
return
;
}
level
=
regexp_precedences
[
op
];
/* printf("ch='%c' op=%d level=%d current_level=%d curlevstart=%d\n",
ch, op, level, current_level, CURRENT_LEVEL_START); */
if
(
level
>
current_level
)
case
Cnotsyntaxspec
:
{
for
(
current_level
++
;
current_level
<
level
;
current_level
++
)
SET_LEVEL_START
;
SET_LEVEL_START
;
syntaxcode
=
code
[
pos
++
];
for
(
a
=
0
;
a
<
256
;
a
++
)
if
(
SYNTAX
(
a
)
!=
syntaxcode
)
fastmap
[
a
]
=
1
;
return
;
}
else
if
(
level
<
current_level
)
case
Ceol
:
{
current_level
=
level
;
for
(;
num_jumps
>
0
&&
future_jumps
[
num_jumps
-
1
]
>=
CURRENT_LEVEL_START
;
num_jumps
--
)
PUT_ADDR
(
future_jumps
[
num_jumps
-
1
],
pattern_offset
);
fastmap
[
'\n'
]
=
1
;
if
(
*
can_be_null
==
0
)
*
can_be_null
=
2
;
/* can match null, but only at end of buffer*/
return
;
}
switch
(
op
)
case
Cset
:
{
case
Rend
:
break
;
case
Rnormal
:
normal_char:
opcode
=
Cexact
;
for
(
a
=
0
;
a
<
256
/
8
;
a
++
)
if
(
code
[
pos
+
a
]
!=
0
)
for
(
b
=
0
;
b
<
8
;
b
++
)
if
(
code
[
pos
+
a
]
&
(
1
<<
b
))
fastmap
[(
a
<<
3
)
+
b
]
=
1
;
pos
+=
256
/
8
;
return
;
}
case
Cexact
:
{
fastmap
[(
unsigned
char
)
code
[
pos
]]
=
1
;
return
;
}
case
Canychar
:
{
for
(
a
=
0
;
a
<
256
;
a
++
)
if
(
a
!=
'\n'
)
fastmap
[
a
]
=
1
;
return
;
}
case
Cstart_memory
:
case
Cend_memory
:
{
pos
++
;
break
;
}
case
Cmatch_memory
:
{
for
(
a
=
0
;
a
<
256
;
a
++
)
fastmap
[
a
]
=
1
;
*
can_be_null
=
1
;
return
;
}
case
Cjump
:
case
Cdummy_failure_jump
:
case
Cupdate_failure_jump
:
case
Cstar_jump
:
{
a
=
(
unsigned
char
)
code
[
pos
++
];
a
|=
(
unsigned
char
)
code
[
pos
++
]
<<
8
;
pos
+=
(
int
)(
short
)
a
;
if
(
visited
[
pos
])
{
/* argh... the regexp contains empty loops. This is not
good, as this may cause a failure stack overflow when
matching. Oh well. */
/* this path leads nowhere; pursue other paths. */
return
;
}
visited
[
pos
]
=
1
;
break
;
}
case
Cfailure_jump
:
{
a
=
(
unsigned
char
)
code
[
pos
++
];
a
|=
(
unsigned
char
)
code
[
pos
++
]
<<
8
;
a
=
pos
+
(
int
)(
short
)
a
;
re_compile_fastmap_aux
(
code
,
a
,
visited
,
can_be_null
,
fastmap
);
break
;
}
default:
{
abort
();
/* probably some opcode is missing from this switch */
/*NOTREACHED*/
}
}
}
/* Build a fastmap for the compiled code in 'buffer', starting the
 * analysis at offset 'pos'.
 *
 *   buffer       compiled pattern code
 *   used         number of bytes of 'buffer' in use
 *   pos          offset in 'buffer' at which the analysis starts
 *   can_be_null  output; cleared here, then filled in by
 *                re_compile_fastmap_aux
 *   fastmap      output; 256-byte map, cleared here, then filled in by
 *                re_compile_fastmap_aux
 *
 * A scratch array with one byte per code position is needed to record
 * which positions have been visited.  Patterns that fit use an array
 * on the C stack; longer ones get a temporary malloc'd array.
 *
 * Returns 1 on success and 0 if the scratch array could not be
 * allocated. */
static int re_do_compile_fastmap(char *buffer, int used, int pos,
				 char *can_be_null, char *fastmap)
{
	char local_marks[512];
	char *visited = local_marks;

	if (used > sizeof(local_marks))
	{
		/* too long for the on-stack array */
		visited = malloc(used);
		if (visited == NULL)
			return 0;
	}

	*can_be_null = 0;
	memset(fastmap, 0, 256);
	memset(visited, 0, used);

	re_compile_fastmap_aux(buffer, pos, visited, can_be_null, fastmap);

	if (visited != local_marks)
		free(visited);
	return 1;
}
/* Compute the fastmap and the anchor for a compiled pattern.
 *
 * Nothing is done when 'bufp' has no fastmap buffer or when its
 * fastmap is already marked accurate.  Otherwise the fastmap and
 * can_be_null are computed from the start of the compiled code, and
 * 'anchor' is derived from the first opcode:
 *
 *   1  the pattern starts with Cbol     (begline)
 *   2  the pattern starts with Cbegbuf  (begbuf)
 *   0  anything else                    (none)
 *
 * On success fastmap_accurate is set to 1.  If the fastmap could not
 * be built the function returns with 'anchor' and 'fastmap_accurate'
 * left as they were. */
void re_compile_fastmap(regexp_t bufp)
{
	if (!bufp->fastmap || bufp->fastmap_accurate)
		return;

	assert(bufp->used > 0);

	if (re_do_compile_fastmap(bufp->buffer, bufp->used, 0,
				  &bufp->can_be_null, bufp->fastmap) == 0)
		return;

	switch (bufp->buffer[0])
	{
	case Cbol:
		bufp->anchor = 1;	/* begline */
		break;
	case Cbegbuf:
		bufp->anchor = 2;	/* begbuf */
		break;
	default:
		bufp->anchor = 0;	/* none */
		break;
	}

	bufp->fastmap_accurate = 1;
}
/*
 * star is coded as:
 * 1: failure_jump 2
 *    ... code for operand of star
 *    star_jump 1
 * 2: ... code after star
 *
 * We change the star_jump to update_failure_jump if we can determine
 * that it is safe to do so; otherwise we change it to an ordinary
 * jump.
 *
 * plus is coded as
 *
 *    jump 2
 * 1: failure_jump 3
 * 2: ... code for operand of plus
 *    star_jump 1
 * 3: ... code after plus
 *
 * For star_jump considerations this is processed identically to star.
 *
 * Arguments:
 *   bufp  the compiled pattern that contains the jump
 *   code  points at the two-byte displacement that follows the
 *         Cstar_jump opcode (the opcode itself is at code[-1])
 *
 * Returns 1 after rewriting the opcode in place (to
 * Cupdate_failure_jump or to Cjump), and 0 if a Cend or an unknown
 * opcode is met while scanning the body of the loop.
 */
static int re_optimize_star_jump(regexp_t bufp, char *code)
{
	char map[256];
	char can_be_null;
	char *p1;
	char *p2;
	/* Must be unsigned: it is used to index map[], and a plain char
	 * may be signed, which would turn bytes >= 0x80 into negative
	 * indexes. */
	unsigned char ch;
	int a;
	int b;

	/* decode the signed 16 bit displacement, low byte first */
	a = (unsigned char)*code++;
	a |= (unsigned char)*code++ << 8;
	a = (int)(short)a;

	p1 = code + a + 3; /* skip the failure_jump */
	assert(p1[-3] == Cfailure_jump);
	p2 = code;
	/* p1 points inside loop, p2 points to after loop */
	if (!re_do_compile_fastmap(bufp->buffer, bufp->used,
				   p2 - bufp->buffer, &can_be_null, map))
		goto make_normal_jump;

	/* If we might introduce a new update point inside the
	 * loop, we can't optimize because then update_jump would
	 * update a wrong failure point.  Thus we have to be
	 * quite careful here.
	 */

	/* loop until we find something that consumes a character */
  loop_p1:
	switch (*p1++)
	{
	case Cbol:
	case Ceol:
	case Cbegbuf:
	case Cendbuf:
	case Cwordbeg:
	case Cwordend:
	case Cwordbound:
	case Cnotwordbound:
	{
		goto loop_p1;
	}
	case Cstart_memory:
	case Cend_memory:
	{
		p1++;
		goto loop_p1;
	}
	case Cexact:
	{
		/* the loop starts with a character that may also start
		 * what follows the loop: no optimization */
		ch = (unsigned char)*p1++;
		if (map[ch])
			goto make_normal_jump;
		break;
	}
	case Canychar:
	{
		for (b = 0; b < 256; b++)
			if (b != '\n' && map[b])
				goto make_normal_jump;
		break;
	}
	case Cset:
	{
		for (b = 0; b < 256; b++)
			if ((p1[b >> 3] & (1 << (b & 7))) && map[b])
				goto make_normal_jump;
		p1 += 256/8;
		break;
	}
	default:
	{
		goto make_normal_jump;
	}
	}

	/* now we know that we can't backtrack. */
	while (p1 != p2 - 3)
	{
		switch (*p1++)
		{
		case Cend:
		{
			return 0;
		}
		case Cbol:
		case Ceol:
		case Canychar:
		case Cbegbuf:
		case Cendbuf:
		case Cwordbeg:
		case Cwordend:
		case Cwordbound:
		case Cnotwordbound:
		{
			break;
		}
		case Cset:
		{
			p1 += 256/8;
			break;
		}
		case Cexact:
		case Cstart_memory:
		case Cend_memory:
		case Cmatch_memory:
		case Csyntaxspec:
		case Cnotsyntaxspec:
		{
			p1++;
			break;
		}
		case Cjump:
		case Cstar_jump:
		case Cfailure_jump:
		case Cupdate_failure_jump:
		case Cdummy_failure_jump:
		{
			goto make_normal_jump;
		}
		default:
		{
			return 0;
		}
		}
	}

	/* make an update jump */
	code -= 3;
	a += 3;  /* jump to after the Cfailure_jump */
	code[0] = Cupdate_failure_jump;
	code[1] = a & 0xff;
	code[2] = a >> 8;
	return 1;

  make_normal_jump:
	code -= 3;
	*code = Cjump;
	return 1;
}
/* Walk over the compiled code in 'bufp' one instruction at a time and
 * hand every Cstar_jump to re_optimize_star_jump for rewriting.
 *
 * Returns 1 when the terminating Cend is reached, and 0 when
 * re_optimize_star_jump fails or an opcode is met that this walker
 * does not know. */
static int re_optimize(regexp_t bufp)
{
	char *cursor = bufp->buffer;

	for (;;)
	{
		switch (*cursor++)
		{
		case Cend:
			return 1;

		/* opcodes without an operand */
		case Canychar:
		case Cbol:
		case Ceol:
		case Cbegbuf:
		case Cendbuf:
		case Cwordbeg:
		case Cwordend:
		case Cwordbound:
		case Cnotwordbound:
			continue;

		/* followed by a 256 bit set */
		case Cset:
			cursor += 256/8;
			continue;

		/* followed by a one byte operand */
		case Cexact:
		case Cstart_memory:
		case Cend_memory:
		case Cmatch_memory:
		case Csyntaxspec:
		case Cnotsyntaxspec:
			cursor += 1;
			continue;

		/* rewrite the jump, then step over its displacement */
		case Cstar_jump:
			if (re_optimize_star_jump(bufp, cursor) == 0)
				return 0;
			cursor += 2;
			continue;

		/* followed by a two byte displacement */
		case Cupdate_failure_jump:
		case Cjump:
		case Cdummy_failure_jump:
		case Cfailure_jump:
			cursor += 2;
			continue;

		default:
			return 0;
		}
	}
}
/* Fetch the next character of the pattern source into 'var' and
 * advance 'pos'.  Jumps to the 'ends_prematurely' label when 'pos' has
 * already reached 'size'.  Relies on 'regex', 'pos', 'size' and that
 * label being in scope where the macro is used. */
#define NEXTCHAR(var) \
{ \
if (pos >= size) \
goto ends_prematurely; \
(var) = regex[pos]; \
pos++; \
}
/* Make sure there is room for 'amount' more bytes of compiled code
 * behind 'pattern_offset', growing 'pattern' by amount + 256 bytes
 * when there is not.  Jumps to the 'out_of_memory' label if the buffer
 * cannot be grown.
 *
 * The result of realloc is kept in a temporary and 'pattern' / 'alloc'
 * are only updated once it has succeeded: on failure realloc leaves
 * the old block allocated, so overwriting 'pattern' with NULL would
 * lose the only pointer to it, and 'alloc' would no longer describe
 * the buffer.
 * NOTE(review): the code at 'out_of_memory' is not visible here; it is
 * assumed to be happy with 'pattern' and 'alloc' still describing the
 * old, valid buffer -- confirm.
 *
 * 'amount' is evaluated more than once.  Relies on 'pattern', 'alloc',
 * 'pattern_offset' and the label being in scope where the macro is
 * used. */
#define ALLOC(amount) \
{ \
  if (pattern_offset+(amount) > alloc) \
    { \
      int alloc_grown = alloc + 256 + (amount); \
      char *alloc_moved = realloc(pattern, alloc_grown); \
      if (!alloc_moved) \
        goto out_of_memory; \
      pattern = alloc_moved; \
      alloc = alloc_grown; \
    } \
}
/* Append one byte to the compiled code and advance 'pattern_offset'.
 * No room check is made here; callers reserve space with ALLOC
 * first. */
#define STORE(ch) pattern[pattern_offset++] = (ch)
/* The recorded start offset for the current precedence level. */
#define CURRENT_LEVEL_START (starts[starts_base + current_level])
/* Record the current end of the compiled code as the start offset for
 * the current precedence level. */
#define SET_LEVEL_START starts[starts_base + current_level] = pattern_offset
/* Move 'starts_base' up by one group of NUM_LEVELS entries, or jump to
 * the 'too_complex' label when 'starts_base' has already reached
 * (MAX_NESTING-1)*NUM_LEVELS.  Expands to an unbraced if/else, so use
 * it only as a full statement inside a braced block. */
#define PUSH_LEVEL_STARTS \
if (starts_base < (MAX_NESTING-1)*NUM_LEVELS) \
starts_base += NUM_LEVELS; \
else \
goto too_complex
/* Move 'starts_base' back down by one group of NUM_LEVELS entries.
 * No check is made that a matching PUSH_LEVEL_STARTS preceded it. */
#define POP_LEVEL_STARTS starts_base -= NUM_LEVELS
/* Store a jump displacement at pattern[offset] and pattern[offset+1].
 * The displacement is addr - offset - 2, i.e. relative to the byte
 * that follows the two displacement bytes, and is written low byte
 * first.  'offset' is evaluated more than once. */
#define PUT_ADDR(offset,addr) \
{ \
int disp = (addr) - (offset) - 2; \
pattern[(offset)] = disp & 0xff; \
pattern[(offset)+1] = (disp>>8) & 0xff; \
}
/* Insert a three byte jump instruction of kind 'type', targeting
 * 'addr', at offset 'pos' of the compiled code.  The code from 'pos'
 * up to 'pattern_offset' is moved up by three bytes to make room and
 * 'pattern_offset' is advanced by three.  No room check is made here;
 * callers reserve space with ALLOC first.  The arguments are copied
 * into locals, so each is evaluated once. */
#define INSERT_JUMP(pos,type,addr) \
{ \
int a, p = (pos), t = (type), ad = (addr); \
for (a = pattern_offset - 1; a >= p; a--) \
pattern[a + 3] = pattern[a]; \
pattern[p] = t; \
PUT_ADDR(p+1,ad); \
pattern_offset += 3; \
}
/* Set bit number 'bit' in the bitmap that starts at buf[offset]; bit 0
 * is the least significant bit of the first byte.  'bit' is evaluated
 * twice. */
#define SETBIT(buf,offset,bit) (buf)[(offset)+(bit)/8] |= (1<<((bit) & 7))
/* Copy the local buffer bookkeeping ('alloc', 'pattern',
 * 'pattern_offset') back into the allocated / buffer / used fields of
 * 'bufp'. */
#define SET_FIELDS \
{ \
bufp->allocated = alloc; \
bufp->buffer = pattern; \
bufp->used = pattern_offset; \
}
/* Read two characters from the pattern source with NEXTCHAR, convert
 * each of them with hex_char_to_decimal, and store
 * first * 16 + second in 'var'.  A conversion result of 16 is treated
 * as "not a hex digit" and jumps to the 'hex_error' label; running out
 * of pattern is handled by NEXTCHAR.  The intermediate values are held
 * in plain char variables. */
#define GETHEX(var) \
{ \
char gethex_ch, gethex_value; \
NEXTCHAR(gethex_ch); \
gethex_value = hex_char_to_decimal(gethex_ch); \
if (gethex_value == 16) \
goto hex_error; \
NEXTCHAR(gethex_ch); \
gethex_ch = hex_char_to_decimal(gethex_ch); \
if (gethex_ch == 16) \
goto hex_error; \
(var) = gethex_value * 16 + gethex_ch; \
}
/* Translate the character that followed a backslash, ANSI C style.
 * 'ch' is rewritten in place:
 *
 *   a A  ->  7   audible bell
 *   b B  ->  8   backspace
 *   f F  -> 12   form feed
 *   n N  -> 10   line feed
 *   r R  -> 13   carriage return
 *   t T  ->  9   tab
 *   v V  -> 11   vertical tab
 *   x X  -> the value of the two hex digits that follow (see GETHEX)
 *
 * Any other character is passed through, after being mapped through
 * the 'translate' table when one is set.
 *
 * 'ch' must be an lvalue; it is evaluated more than once.  Relies on
 * 'translate' and on everything GETHEX needs being in scope where the
 * macro is used.  Every line of the definition, including the ones
 * that carry a comment, has to end in a backslash. */
#define ANSI_TRANSLATE(ch) \
{ \
  switch (ch) \
    { \
    case 'a': \
    case 'A': \
      { \
        ch = 7; /* audible bell */ \
        break; \
      } \
    case 'b': \
    case 'B': \
      { \
        ch = 8; /* backspace */ \
        break; \
      } \
    case 'f': \
    case 'F': \
      { \
        ch = 12; /* form feed */ \
        break; \
      } \
    case 'n': \
    case 'N': \
      { \
        ch = 10; /* line feed */ \
        break; \
      } \
    case 'r': \
    case 'R': \
      { \
        ch = 13; /* carriage return */ \
        break; \
      } \
    case 't': \
    case 'T': \
      { \
        ch = 9; /* tab */ \
        break; \
      } \
    case 'v': \
    case 'V': \
      { \
        ch = 11; /* vertical tab */ \
        break; \
      } \
    case 'x': /* hex code */ \
    case 'X': \
      { \
        GETHEX(ch); \
        break; \
      } \
    default: \
      { \
        /* other characters passed through */ \
        if (translate) \
          ch = translate[(unsigned char)ch]; \
        break; \
      } \
    } \
}
char
*
re_compile_pattern
(
char
*
regex
,
int
size
,
regexp_t
bufp
)
{
int
a
;
int
pos
;
int
op
;
int
current_level
;
int
level
;
int
opcode
;
int
pattern_offset
,
alloc
;
int
starts
[
NUM_LEVELS
*
MAX_NESTING
];
int
starts_base
;
int
future_jumps
[
MAX_NESTING
];
int
num_jumps
;
unsigned
char
ch
;
char
*
pattern
;
char
*
translate
;
int
next_register
;
int
paren_depth
;
int
num_open_registers
;
int
open_registers
[
RE_NREGS
];
int
beginning_context
;
if
(
!
re_compile_initialized
)
re_compile_initialize
();
bufp
->
used
=
0
;
bufp
->
fastmap_accurate
=
0
;
bufp
->
uses_registers
=
0
;
translate
=
bufp
->
translate
;
pattern
=
bufp
->
buffer
;
alloc
=
bufp
->
allocated
;
if
(
alloc
==
0
||
pattern
==
NULL
)
{
alloc
=
256
;
pattern
=
malloc
(
alloc
);
if
(
!
pattern
)
goto
out_of_memory
;
}
pattern_offset
=
0
;
starts_base
=
0
;
num_jumps
=
0
;
current_level
=
0
;
SET_LEVEL_START
;
num_open_registers
=
0
;
next_register
=
1
;
paren_depth
=
0
;
beginning_context
=
1
;
op
=
-
1
;
/* we use Rend dummy to ensure that pending jumps are updated (due to
low priority of Rend) before exiting the loop. */
pos
=
0
;
while
(
op
!=
Rend
)
{
if
(
pos
>=
size
)
op
=
Rend
;
else
{
NEXTCHAR
(
ch
);
if
(
translate
)
ch
=
translate
[(
unsigned
char
)
ch
];
op
=
regexp_plain_ops
[(
unsigned
char
)
ch
];
if
(
op
==
Rquote
)
{
NEXTCHAR
(
ch
);
op
=
regexp_quoted_ops
[(
unsigned
char
)
ch
];
if
(
op
==
Rnormal
&&
regexp_ansi_sequences
)
ANSI_TRANSLATE
(
ch
);
}
}
level
=
regexp_precedences
[
op
];
/* printf("ch='%c' op=%d level=%d current_level=%d curlevstart=%d\n",
ch, op, level, current_level, CURRENT_LEVEL_START); */
if
(
level
>
current_level
)
{
for
(
current_level
++
;
current_level
<
level
;
current_level
++
)
SET_LEVEL_START
;
SET_LEVEL_START
;
}
else
if
(
level
<
current_level
)
{
current_level
=
level
;
for
(;
num_jumps
>
0
&&
future_jumps
[
num_jumps
-
1
]
>=
CURRENT_LEVEL_START
;
num_jumps
--
)
PUT_ADDR
(
future_jumps
[
num_jumps
-
1
],
pattern_offset
);
}
switch
(
op
)
{
case
Rend
:
{
break
;
}
case
Rnormal
:
{
normal_char:
opcode
=
Cexact
;
store_opcode_and_arg:
/* opcode & ch must be set */
SET_LEVEL_START
;
ALLOC
(
2
);
STORE
(
opcode
);
STORE
(
ch
);
break
;
}
case
Ranychar
:
{
opcode
=
Canychar
;
store_opcode:
SET_LEVEL_START
;
ALLOC
(
1
);
STORE
(
opcode
);
break
;
}
case
Rquote
:
{
abort
();
/*NOTREACHED*/
}
case
Rbol
:
{
if
(
!
beginning_context
)
if
(
regexp_context_indep_ops
)
goto
op_error
;
...
...
@@ -500,7 +1212,9 @@ regexp_t bufp;
goto
normal_char
;
opcode
=
Cbol
;
goto
store_opcode
;
}
case
Reol
:
{
if
(
!
((
pos
>=
size
)
||
((
regexp_syntax
&
RE_NO_BK_VBAR
)
?
(
regex
[
pos
]
==
'\174'
)
:
...
...
@@ -518,7 +1232,9 @@ regexp_t bufp;
goto
store_opcode
;
/* NOTREACHED */
break
;
}
case
Roptional
:
{
if
(
beginning_context
)
if
(
regexp_context_indep_ops
)
goto
op_error
;
...
...
@@ -530,8 +1246,10 @@ regexp_t bufp;
INSERT_JUMP
(
CURRENT_LEVEL_START
,
Cfailure_jump
,
pattern_offset
+
3
);
break
;
}
case
Rstar
:
case
Rplus
:
{
if
(
beginning_context
)
if
(
regexp_context_indep_ops
)
goto
op_error
;
...
...
@@ -547,7 +1265,9 @@ regexp_t bufp;
INSERT_JUMP
(
CURRENT_LEVEL_START
,
Cdummy_failure_jump
,
CURRENT_LEVEL_START
+
6
);
break
;
}
case
Ror
:
{
ALLOC
(
6
);
INSERT_JUMP
(
CURRENT_LEVEL_START
,
Cfailure_jump
,
pattern_offset
+
6
);
...
...
@@ -559,7 +1279,9 @@ regexp_t bufp;
STORE
(
0
);
SET_LEVEL_START
;
break
;
}
case
Ropenpar
:
{
SET_LEVEL_START
;
if
(
next_register
<
RE_NREGS
)
{
...
...
@@ -575,7 +1297,9 @@ regexp_t bufp;
current_level
=
0
;
SET_LEVEL_START
;
break
;
}
case
Rclosepar
:
{
if
(
paren_depth
<=
0
)
goto
parenthesis_error
;
POP_LEVEL_STARTS
;
...
...
@@ -590,7 +1314,9 @@ regexp_t bufp;
STORE
(
open_registers
[
num_open_registers
]);
}
break
;
}
case
Rmemory
:
{
if
(
ch
==
'0'
)
goto
bad_match_register
;
assert
(
ch
>=
'0'
&&
ch
<=
'9'
);
...
...
@@ -598,7 +1324,9 @@ regexp_t bufp;
opcode
=
Cmatch_memory
;
ch
-=
'0'
;
goto
store_opcode_and_arg
;
}
case
Rextended_memory
:
{
NEXTCHAR
(
ch
);
if
(
ch
<
'0'
||
ch
>
'9'
)
goto
bad_match_register
;
...
...
@@ -611,9 +1339,14 @@ regexp_t bufp;
bufp
->
uses_registers
=
1
;
opcode
=
Cmatch_memory
;
goto
store_opcode_and_arg
;
}
case
Ropenset
:
{
int
complement
,
prev
,
offset
,
range
,
firstchar
;
int
complement
;
int
prev
;
int
offset
;
int
range
;
int
firstchar
;
SET_LEVEL_START
;
ALLOC
(
1
+
256
/
8
);
...
...
@@ -673,53 +1406,52 @@ regexp_t bufp;
break
;
}
case
Rbegbuf
:
{
opcode
=
Cbegbuf
;
goto
store_opcode
;
}
case
Rendbuf
:
{
opcode
=
Cendbuf
;
goto
store_opcode
;
}
case
Rwordchar
:
{
opcode
=
Csyntaxspec
;
ch
=
Sword
;
goto
store_opcode_and_arg
;
}
case
Rnotwordchar
:
{
opcode
=
Cnotsyntaxspec
;
ch
=
Sword
;
goto
store_opcode_and_arg
;
}
case
Rwordbeg
:
{
opcode
=
Cwordbeg
;
goto
store_opcode
;
}
case
Rwordend
:
{
opcode
=
Cwordend
;
goto
store_opcode
;
}
case
Rwordbound
:
{
opcode
=
Cwordbound
;
goto
store_opcode
;
}
case
Rnotwordbound
:
{
opcode
=
Cnotwordbound
;
goto
store_opcode
;
#ifdef emacs
case
Remacs_at_dot
:
opcode
=
Cemacs_at_dot
;
goto
store_opcode
;
case
Remacs_syntaxspec
:
NEXTCHAR
(
ch
);
if
(
translate
)
ch
=
translate
[(
unsigned
char
)
ch
];
opcode
=
Csyntaxspec
;
ch
=
syntax_spec_code
[(
unsigned
char
)
ch
];
goto
store_opcode_and_arg
;
case
Remacs_notsyntaxspec
:
NEXTCHAR
(
ch
);
if
(
translate
)
ch
=
translate
[(
unsigned
char
)
ch
];
opcode
=
Cnotsyntaxspec
;
ch
=
syntax_spec_code
[(
unsigned
char
)
ch
];
goto
store_opcode_and_arg
;
#endif
/* emacs */
}
default:
{
abort
();
}
}
beginning_context
=
(
op
==
Ropenpar
||
op
==
Ror
);
}
if
(
starts_base
!=
0
)
...
...
@@ -728,6 +1460,8 @@ regexp_t bufp;
ALLOC
(
1
);
STORE
(
Cend
);
SET_FIELDS
;
if
(
!
re_optimize
(
bufp
))
return
"Optimization error"
;
return
NULL
;
op_error:
...
...
@@ -758,6 +1492,7 @@ regexp_t bufp;
SET_FIELDS
;
return
"Regular expression too complex"
;
}
#undef CHARAT
#undef NEXTCHAR
#undef GETHEX
...
...
@@ -772,643 +1507,349 @@ regexp_t bufp;
#undef SETBIT
#undef SET_FIELDS
static void re_compile_fastmap_aux
	Py_PROTO((char *, int, char *, char *, char *));

/*
 * Scan the compiled pattern in `code', beginning at offset `pos', and
 * mark in `fastmap' (indexed by character value, 256 entries) the
 * characters accepted by the first character-consuming opcode reached
 * on each path.
 *
 * `visited' holds one flag per code offset; an offset whose flag is
 * already set is not scanned again.
 *
 * `*can_be_null' is set to 1 when Cend or Cmatch_memory is reached,
 * and to 2 when Ceol is reached while it is still 0.
 *
 * Jump opcodes carry a two-byte operand, low byte first, which is a
 * signed 16-bit displacement relative to the byte after the operand.
 * Cfailure_jump scans its target recursively and then carries on with
 * the code following the jump; the other jumps simply move `pos'.
 *
 * An opcode not listed in the switch calls abort().
 */
static void re_compile_fastmap_aux(code, pos, visited, can_be_null, fastmap)
char *code;
int pos;
char *visited;
char *can_be_null;
char *fastmap;
{
	int op;		/* opcode currently being examined */
	int ch;		/* character value used as fastmap index */
	int operand;	/* one-byte argument of a syntaxspec opcode */
	int disp;	/* raw 16-bit jump displacement */

	if (visited[pos])
		return;		/* this offset was scanned before */
	visited[pos] = 1;

	for (;;) {
		op = code[pos++];
		switch (op) {
		case Cend:
			*can_be_null = 1;
			return;

		/* opcodes without an operand that do not end the scan */
		case Cbol:
		case Cbegbuf:
		case Cendbuf:
		case Cwordbeg:
		case Cwordend:
		case Cwordbound:
		case Cnotwordbound:
#ifdef emacs
		case Cemacs_at_dot:
#endif /* emacs */
			break;

		case Csyntaxspec:
		case Cnotsyntaxspec:
			operand = code[pos++];
			for (ch = 0; ch < 256; ch++) {
				if (op == Csyntaxspec
				    ? (SYNTAX(ch) == operand)
				    : (SYNTAX(ch) != operand))
					fastmap[ch] = 1;
			}
			return;

		case Ceol:
			/* can match null, but only at end of buffer */
			if (*can_be_null == 0)
				*can_be_null = 2;
			fastmap['\n'] = 1;
			return;

		case Cset:
			/* 256-bit membership bitmap follows the opcode */
			for (ch = 0; ch < 256; ch++) {
				if (code[pos + (ch >> 3)] & (1 << (ch & 7)))
					fastmap[ch] = 1;
			}
			return;

		case Cexact:
			fastmap[(unsigned char)code[pos]] = 1;
			return;

		case Canychar:
			for (ch = 0; ch < 256; ch++) {
				if (ch == '\n')
					continue;
				fastmap[ch] = 1;
			}
			return;

		case Cstart_memory:
		case Cend_memory:
			pos += 1;	/* step over the one-byte operand */
			break;

		case Cmatch_memory:
			/* should this ever happen for sensible patterns??? */
			*can_be_null = 1;
			return;

		case Cjump:
		case Cdummy_failure_jump:
		case Cupdate_failure_jump:
		case Cstar_jump:
			disp = (unsigned char)code[pos];
			disp |= (unsigned char)code[pos + 1] << 8;
			pos += 2;
			pos += (int)(short)disp;
			if (visited[pos]) {
				/* The regexp contains empty loops.  This is
				   not good, as it may cause a failure stack
				   overflow when matching.  This path leads
				   nowhere; pursue other paths. */
				return;
			}
			visited[pos] = 1;
			break;

		case Cfailure_jump:
			disp = (unsigned char)code[pos];
			disp |= (unsigned char)code[pos + 1] << 8;
			pos += 2;
			re_compile_fastmap_aux(code, pos + (int)(short)disp,
					       visited, can_be_null, fastmap);
			break;

		default:
			/* probably some opcode is missing from this switch */
			abort();
			/*NOTREACHED*/
		}
	}
}
static int re_do_compile_fastmap
	Py_PROTO((char *, int, int, char *, char *));

/*
 * Build a fastmap for the compiled code in `buffer' (`used' bytes),
 * scanning from offset `pos'.
 *
 * Clears `*can_be_null' and the 256-byte `fastmap', then lets
 * re_compile_fastmap_aux() fill them in.  The per-offset "visited"
 * flags live in a 512-byte array on the stack when `used' fits in it,
 * and in a malloc()ed array otherwise.
 *
 * Returns 1 on success, 0 when that allocation fails (in which case
 * neither output has been touched).
 */
static int re_do_compile_fastmap(buffer, used, pos, can_be_null, fastmap)
char *buffer;
int used;
int pos;
char *can_be_null;
char *fastmap;
{
	char stack_flags[512];
	char *flags = stack_flags;

	if (used > sizeof(stack_flags)) {
		flags = malloc(used);
		if (!flags)
			return 0;
	}

	memset(flags, 0, used);
	memset(fastmap, 0, 256);
	*can_be_null = 0;

	re_compile_fastmap_aux(buffer, pos, flags, can_be_null, fastmap);

	if (flags != stack_flags)
		free(flags);
	return 1;
}
#define PREFETCH if (text == textend) goto fail
void
re_compile_fastmap
(
bufp
)
regexp_t
bufp
;
#define NEXTCHAR(var) \
PREFETCH; \
var = (unsigned char)*text++; \
if (translate) \
var = translate[var]
int
re_match
(
regexp_t
bufp
,
char
*
string
,
int
size
,
int
pos
,
regexp_registers_t
old_regs
)
{
if
(
!
bufp
->
fastmap
||
bufp
->
fastmap_accurate
)
return
;
assert
(
bufp
->
used
>
0
);
if
(
!
re_do_compile_fastmap
(
bufp
->
buffer
,
bufp
->
used
,
0
,
&
bufp
->
can_be_null
,
bufp
->
fastmap
))
return
;
if
(
bufp
->
buffer
[
0
]
==
Cbol
)
bufp
->
anchor
=
1
;
/* begline */
else
if
(
bufp
->
buffer
[
0
]
==
Cbegbuf
)
bufp
->
anchor
=
2
;
/* begbuf */
else
bufp
->
anchor
=
0
;
/* none */
bufp
->
fastmap_accurate
=
1
;
}
#define INITIAL_FAILURES 128
/* initial # failure points to allocate */
#define MAX_FAILURES 4100L
/* max # of failure points before failing */
char
*
code
;
char
*
translate
;
char
*
text
;
char
*
textstart
;
char
*
textend
;
int
a
;
int
b
;
int
ch
;
int
reg
;
int
match_end
;
char
*
regstart
;
char
*
regend
;
int
regsize
;
match_state
state
;
assert
(
pos
>=
0
&&
size
>=
0
);
assert
(
pos
<=
size
);
text
=
string
+
pos
;
textstart
=
string
;
textend
=
string
+
size
;
int
re_match_2
(
bufp
,
string1
,
size1
,
string2
,
size2
,
pos
,
regs
,
mstop
)
regexp_t
bufp
;
char
*
string1
,
*
string2
;
int
size1
,
size2
,
pos
,
mstop
;
regexp_registers_t
regs
;
{
struct
failure_point
{
char
*
text
,
*
partend
,
*
code
;
}
*
failure_stack_start
,
*
failure_sp
,
*
failure_stack_end
,
initial_failure_stack
[
INITIAL_FAILURES
];
char
*
code
,
*
translate
,
*
text
,
*
textend
,
*
partend
,
*
part_2_end
;
char
*
regstart_text
[
RE_NREGS
],
*
regstart_partend
[
RE_NREGS
];
char
*
regend_text
[
RE_NREGS
],
*
regend_partend
[
RE_NREGS
];
int
a
,
b
,
ch
,
reg
,
regch
,
match_end
;
char
*
regtext
,
*
regpartend
,
*
regtextend
;
#define PREFETCH \
MACRO_BEGIN \
if (text == partend) \
{ \
if (text == textend) \
goto fail; \
text = string2; \
partend = part_2_end; \
} \
MACRO_END
code
=
bufp
->
buffer
;
#define NEXTCHAR(var) \
MACRO_BEGIN \
PREFETCH; \
(var) = (unsigned char)*text++; \
if (translate) \
(var) = (unsigned char)translate[(var)]; \
MACRO_END
translate
=
bufp
->
translate
;
/* translated = NULL; */
/* if (bufp->translate) */
/* { */
/* char *t1; */
/* char *t2; */
assert
(
pos
>=
0
&&
size1
>=
0
&&
size2
>=
0
&&
mstop
>=
0
);
assert
(
mstop
<=
size1
+
size2
);
assert
(
pos
<=
mstop
);
/* translated = malloc(size); */
/* if (translated == NULL) */
/* goto error; */
if
(
pos
<=
size1
)
{
text
=
string1
+
pos
;
if
(
mstop
<=
size1
)
{
partend
=
string1
+
mstop
;
textend
=
partend
;
}
else
{
partend
=
string1
+
size1
;
textend
=
string2
+
mstop
-
size1
;
}
part_2_end
=
string2
+
mstop
-
size1
;
}
else
{
text
=
string2
+
pos
-
size1
;
partend
=
string2
+
mstop
-
size1
;
textend
=
partend
;
part_2_end
=
partend
;
}
/* t1 = string; */
/* t2 = translated; */
/* while(t1 < textend) */
/* *t2++ = bufp->translate[*t1++]; */
if
(
bufp
->
uses_registers
&&
regs
!=
NULL
)
for
(
a
=
0
;
a
<
RE_NREGS
;
a
++
)
regend_text
[
a
]
=
NULL
;
/* text = translated + pos; */
/* textstart = translated; */
/* textend = translated + size; */
/* } */
code
=
bufp
->
buffer
;
translate
=
bufp
->
translate
;
failure_stack_start
=
failure_sp
=
initial_failure_stack
;
failure_stack_end
=
initial_failure_stack
+
INITIAL_FAILURES
;
#if 0
/* re_search_2 has already done this, and otherwise we get little benefit
from this. So I'll leave this out. */
if (bufp->fastmap_accurate && !bufp->can_be_null &&
text != textend &&
!bufp->fastmap[translate ?
(unsigned char)translate[(unsigned char)*text] :
(unsigned char)*text])
return -1; /* it can't possibly match */
#endif
NEW_STATE
(
state
);
continue_matching:
for
(;;)
{
switch
(
*
code
++
)
{
case
Cend
:
if
(
partend
!=
part_2_end
)
match_end
=
text
-
string1
;
else
match_end
=
text
-
string2
+
size1
;
if
(
regs
)
{
regs
->
start
[
0
]
=
pos
;
regs
->
end
[
0
]
=
match_end
;
match_end
=
text
-
textstart
;
if
(
old_regs
)
{
old_regs
->
start
[
0
]
=
pos
;
old_regs
->
end
[
0
]
=
match_end
;
if
(
!
bufp
->
uses_registers
)
{
for
(
a
=
1
;
a
<
RE_NREGS
;
a
++
)
{
regs
->
start
[
a
]
=
-
1
;
regs
->
end
[
a
]
=
-
1
;
old_
regs
->
start
[
a
]
=
-
1
;
old_
regs
->
end
[
a
]
=
-
1
;
}
}
else
{
for
(
a
=
1
;
a
<
RE_NREGS
;
a
++
)
{
if
(
regend_text
[
a
]
==
NULL
)
if
((
GET_REG_START
(
state
,
a
)
==
NULL
)
||
(
GET_REG_END
(
state
,
a
)
==
NULL
))
{
regs
->
start
[
a
]
=
-
1
;
regs
->
end
[
a
]
=
-
1
;
old_
regs
->
start
[
a
]
=
-
1
;
old_
regs
->
end
[
a
]
=
-
1
;
continue
;
}
if
(
regstart_partend
[
a
]
!=
part_2_end
)
regs
->
start
[
a
]
=
regstart_text
[
a
]
-
string1
;
else
regs
->
start
[
a
]
=
regstart_text
[
a
]
-
string2
+
size1
;
if
(
regend_partend
[
a
]
!=
part_2_end
)
regs
->
end
[
a
]
=
regend_text
[
a
]
-
string1
;
else
regs
->
end
[
a
]
=
regend_text
[
a
]
-
string2
+
size1
;
old_regs
->
start
[
a
]
=
GET_REG_START
(
state
,
a
)
-
textstart
;
old_regs
->
end
[
a
]
=
GET_REG_END
(
state
,
a
)
-
textstart
;
}
}
}
if
(
failure_stack_start
!=
initial_failure_stack
)
free
((
char
*
)
failure_stack_start
);
/* if(translated) */
/* free(translated); */
FREE_STATE
(
state
);
return
match_end
-
pos
;
case
Cbol
:
if
(
text
==
string1
||
text
[
-
1
]
==
'\n'
)
/* text[-1] always valid */
break
;
goto
fail
;
case
Ceol
:
if
(
text
==
string2
+
size2
||
(
text
==
string1
+
size1
?
(
size2
==
0
||
*
string2
==
'\n'
)
:
*
text
==
'\n'
))
break
;
goto
fail
;
case
Cset
:
NEXTCHAR
(
ch
);
if
(
code
[
ch
/
8
]
&
(
1
<<
(
ch
&
7
)))
{
code
+=
256
/
8
;
break
;
}
goto
fail
;
case
Cexact
:
NEXTCHAR
(
ch
);
if
(
ch
!=
(
unsigned
char
)
*
code
++
)
goto
fail
;
break
;
case
Canychar
:
NEXTCHAR
(
ch
);
if
(
ch
==
'\n'
)
goto
fail
;
break
;
case
Cstart_memory
:
reg
=
*
code
++
;
regstart_text
[
reg
]
=
text
;
regstart_partend
[
reg
]
=
partend
;
break
;
case
Cend_memory
:
reg
=
*
code
++
;
regend_text
[
reg
]
=
text
;
regend_partend
[
reg
]
=
partend
;
break
;
case
Cmatch_memory
:
reg
=
*
code
++
;
if
(
regend_text
[
reg
]
==
NULL
)
goto
fail
;
/* or should we just match nothing? */
regtext
=
regstart_text
[
reg
];
regtextend
=
regend_text
[
reg
];
if
(
regstart_partend
[
reg
]
==
regend_partend
[
reg
])
regpartend
=
regtextend
;
else
regpartend
=
string1
+
size1
;
for
(;
regtext
!=
regtextend
;)
case
Cbol
:
{
NEXTCHAR
(
ch
);
if
(
regtext
==
regpartend
)
regtext
=
string2
;
regch
=
(
unsigned
char
)
*
regtext
++
;
if
(
translate
)
regch
=
(
unsigned
char
)
translate
[
regch
];
if
(
regch
!=
ch
)
if
(
text
==
textstart
||
text
[
-
1
]
==
'\n'
)
goto
continue_matching
;
goto
fail
;
}
break
;
case
Cstar_jump
:
/* star is coded as:
1: failure_jump 2
... code for operand of star
star_jump 1
2: ... code after star
We change the star_jump to update_failure_jump if we can determine
that it is safe to do so; otherwise we change it to an ordinary
jump.
plus is coded as
jump 2
1: failure_jump 3
2: ... code for operand of plus
star_jump 1
3: ... code after plus
For star_jump considerations this is processed identically
to star. */
a
=
(
unsigned
char
)
*
code
++
;
a
|=
(
unsigned
char
)
*
code
++
<<
8
;
a
=
(
int
)(
short
)
a
;
{
char
map
[
256
],
can_be_null
;
char
*
p1
,
*
p2
;
p1
=
code
+
a
+
3
;
/* skip the failure_jump */
assert
(
p1
[
-
3
]
==
Cfailure_jump
);
p2
=
code
;
/* p1 points inside loop, p2 points to after loop */
if
(
!
re_do_compile_fastmap
(
bufp
->
buffer
,
bufp
->
used
,
p2
-
bufp
->
buffer
,
&
can_be_null
,
map
))
goto
make_normal_jump
;
/* If we might introduce a new update point inside the loop,
we can't optimize because then update_jump would update a
wrong failure point. Thus we have to be quite careful here. */
loop_p1:
/* loop until we find something that consumes a character */
switch
(
*
p1
++
)
{
case
Cbol
:
case
Ceol
:
case
Cbegbuf
:
case
Cendbuf
:
case
Cwordbeg
:
case
Cwordend
:
case
Cwordbound
:
case
Cnotwordbound
:
#ifdef emacs
case
Cemacs_at_dot
:
#endif
/* emacs */
goto
loop_p1
;
case
Cstart_memory
:
case
Cend_memory
:
p1
++
;
goto
loop_p1
;
case
Cexact
:
ch
=
(
unsigned
char
)
*
p1
++
;
if
(
map
[
ch
])
goto
make_normal_jump
;
break
;
case
Canychar
:
for
(
b
=
0
;
b
<
256
;
b
++
)
if
(
b
!=
'\n'
&&
map
[
b
])
goto
make_normal_jump
;
break
;
case
Cset
:
for
(
b
=
0
;
b
<
256
;
b
++
)
if
((
p1
[
b
>>
3
]
&
(
1
<<
(
b
&
7
)))
&&
map
[
b
])
goto
make_normal_jump
;
p1
+=
256
/
8
;
break
;
default:
goto
make_normal_jump
;
}
/* now we know that we can't backtrack. */
while
(
p1
!=
p2
-
3
)
{
switch
(
*
p1
++
)
{
case
Cend
:
abort
();
/* we certainly shouldn't get this inside loop */
/*NOTREACHED*/
case
Cbol
:
case
Ceol
:
case
Canychar
:
case
Cbegbuf
:
case
Cendbuf
:
case
Cwordbeg
:
case
Cwordend
:
case
Cwordbound
:
case
Cnotwordbound
:
#ifdef emacs
case
Cemacs_at_dot
:
#endif
/* emacs */
break
;
if
(
text
==
textend
||
*
text
==
'\n'
)
goto
continue_matching
;
goto
fail
;
}
case
Cset
:
p1
+=
256
/
8
;
break
;
{
NEXTCHAR
(
ch
);
if
(
code
[
ch
/
8
]
&
(
1
<<
(
ch
&
7
)))
{
code
+=
256
/
8
;
goto
continue_matching
;
}
goto
fail
;
}
case
Cexact
:
{
NEXTCHAR
(
ch
);
if
(
ch
!=
(
unsigned
char
)
*
code
++
)
goto
fail
;
/* { */
/* char *p1 = code - 2; */
/* ch = *(code - 1); */
/* POP_FAILURE(state, code, text, goto done_matching, goto error); */
/* while ((code == p1) && (*text != ch)) */
/* POP_FAILURE(state, code, text, goto done_matching, goto error); */
/* if ((code == p1) && (*text == ch)) */
/* { */
/* code += 2; */
/* text++; */
/* } */
/* } */
goto
continue_matching
;
}
case
Canychar
:
{
NEXTCHAR
(
ch
);
if
(
ch
==
'\n'
)
goto
fail
;
goto
continue_matching
;
}
case
Cstart_memory
:
{
reg
=
*
code
++
;
SET_REG_START
(
state
,
reg
,
text
,
goto
error
);
goto
continue_matching
;
}
case
Cend_memory
:
case
Cmatch_memory
:
case
Csyntaxspec
:
case
Cnotsyntaxspec
:
p1
++
;
break
;
case
Cjump
:
case
Cstar_jump
:
case
Cfailure_jump
:
case
Cupdate_failure_jump
:
case
Cdummy_failure_jump
:
goto
make_normal_jump
;
default:
printf
(
"regexpr.c: processing star_jump: unknown op %d
\n
"
,
p1
[
-
1
]);
break
;
{
reg
=
*
code
++
;
SET_REG_END
(
state
,
reg
,
text
,
goto
error
);
goto
continue_matching
;
}
case
Cmatch_memory
:
{
reg
=
*
code
++
;
regstart
=
GET_REG_START
(
state
,
reg
);
regend
=
GET_REG_END
(
state
,
reg
);
if
((
regstart
==
NULL
)
||
(
regend
==
NULL
))
goto
fail
;
/* or should we just match nothing? */
regsize
=
regend
-
regstart
;
if
(
regsize
>
(
textend
-
text
))
goto
fail
;
if
(
translate
)
{
for
(;
regstart
<
regend
;
regstart
++
,
text
++
)
if
(
translate
[
*
regstart
]
!=
translate
[
*
text
])
goto
fail
;
}
goto
make_update_jump
;
else
for
(;
regstart
<
regend
;
regstart
++
,
text
++
)
if
(
*
regstart
!=
*
text
)
goto
fail
;
/* if (memcmp(text, regstart, regsize) != 0)
goto fail;
text += regsize; */
goto
continue_matching
;
}
make_normal_jump:
/* printf("changing to normal jump\n"); */
code
-=
3
;
*
code
=
Cjump
;
break
;
make_update_jump:
/* printf("changing to update jump\n"); */
code
-=
2
;
a
+=
3
;
/* jump to after the Cfailure_jump */
code
[
-
1
]
=
Cupdate_failure_jump
;
code
[
0
]
=
a
&
0xff
;
code
[
1
]
=
a
>>
8
;
/* fall to next case */
case
Cupdate_failure_jump
:
failure_sp
[
-
1
].
text
=
text
;
failure_sp
[
-
1
].
partend
=
partend
;
{
UPDATE_FAILURE
(
state
,
text
,
goto
error
)
;
/* fall to next case */
}
/* treat Cstar_jump just like Cjump if it hasn't been optimized */
case
Cstar_jump
:
case
Cjump
:
{
a
=
(
unsigned
char
)
*
code
++
;
a
|=
(
unsigned
char
)
*
code
++
<<
8
;
code
+=
(
int
)(
short
)
a
;
break
;
goto
continue_matching
;
}
case
Cdummy_failure_jump
:
case
Cfailure_jump
:
if
(
failure_sp
==
failure_stack_end
)
{
if
(
failure_stack_start
!=
initial_failure_stack
)
goto
error
;
failure_stack_start
=
(
struct
failure_point
*
)
malloc
(
MAX_FAILURES
*
sizeof
(
*
failure_stack_start
));
if
(
failure_stack_start
==
NULL
)
{
failure_stack_start
=
initial_failure_stack
;
goto
error
;
}
failure_stack_end
=
failure_stack_start
+
MAX_FAILURES
;
memcpy
((
char
*
)
failure_stack_start
,
(
char
*
)
initial_failure_stack
,
INITIAL_FAILURES
*
sizeof
(
*
failure_stack_start
));
failure_sp
=
failure_stack_start
+
INITIAL_FAILURES
;
}
a
=
(
unsigned
char
)
*
code
++
;
a
|=
(
unsigned
char
)
*
code
++
<<
8
;
a
=
(
int
)(
short
)
a
;
if
(
code
[
-
3
]
==
Cdummy_failure_jump
)
{
/* this is only used in plus */
assert
(
*
code
==
Cfailure_jump
);
b
=
(
unsigned
char
)
code
[
1
];
b
|=
(
unsigned
char
)
code
[
2
]
<<
8
;
failure_sp
->
code
=
code
+
(
int
)(
short
)
b
+
3
;
failure_sp
->
text
=
NULL
;
PUSH_FAILURE
(
state
,
code
+
(
int
)(
short
)
b
+
3
,
NULL
,
goto
error
);
code
+=
a
;
goto
continue_matching
;
}
else
case
Cfailure_jump
:
{
failure_sp
->
code
=
code
+
a
;
failure_sp
->
text
=
text
;
failure_sp
->
partend
=
partend
;
a
=
(
unsigned
char
)
*
code
++
;
a
|=
(
unsigned
char
)
*
code
++
<<
8
;
a
=
(
int
)(
short
)
a
;
PUSH_FAILURE
(
state
,
code
+
a
,
text
,
goto
error
);
goto
continue_matching
;
}
failure_sp
++
;
break
;
case
Cbegbuf
:
if
(
text
==
string1
)
break
;
{
if
(
text
==
textstart
)
goto
continue_matching
;
goto
fail
;
}
case
Cendbuf
:
if
(
size2
==
0
?
text
==
string1
+
size1
:
text
==
string2
+
size2
)
break
;
{
if
(
text
==
textend
)
goto
continue_matching
;
goto
fail
;
}
case
Cwordbeg
:
if
(
text
==
string2
+
size2
)
goto
fail
;
if
(
size2
==
0
&&
text
==
string1
+
size1
)
{
if
(
text
==
textend
)
goto
fail
;
if
(
SYNTAX
(
text
==
string1
+
size1
?
*
string1
:
*
text
)
!=
Sword
)
if
(
SYNTAX
(
*
text
)
!=
Sword
)
goto
fail
;
if
(
text
==
string1
)
break
;
if
(
text
==
textstart
)
goto
continue_matching
;
if
(
SYNTAX
(
text
[
-
1
])
!=
Sword
)
break
;
goto
continue_matching
;
goto
fail
;
}
case
Cwordend
:
if
(
text
==
string1
)
{
if
(
text
==
textstart
)
goto
fail
;
if
(
SYNTAX
(
text
[
-
1
])
!=
Sword
)
goto
fail
;
if
(
text
==
string2
+
size2
)
break
;
if
(
size2
==
0
&&
text
==
string1
+
size1
)
break
;
if
(
text
==
textend
)
goto
continue_matching
;
if
(
SYNTAX
(
*
text
)
==
Sword
)
goto
fail
;
break
;
goto
continue_matching
;
}
case
Cwordbound
:
{
/* Note: as in gnu regexp, this also matches at the beginning
and end of buffer. */
if
(
text
==
string1
||
text
==
string2
+
size2
||
(
size2
==
0
&&
text
==
string1
+
size1
))
break
;
if
((
SYNTAX
(
text
[
-
1
])
==
Sword
)
^
(
SYNTAX
(
text
==
string1
+
size1
?
*
string2
:
*
text
)
==
Sword
))
break
;
* and end of buffer. */
if
(
text
==
textstart
||
text
==
textend
)
goto
continue_matching
;
if
((
SYNTAX
(
text
[
-
1
])
==
Sword
)
^
(
SYNTAX
(
*
text
)
==
Sword
))
goto
continue_matching
;
goto
fail
;
}
case
Cnotwordbound
:
{
/* Note: as in gnu regexp, this never matches at the beginning
and end of buffer. */
if
(
text
==
string1
||
text
==
string2
+
size2
||
(
size2
==
0
&&
text
==
string1
+
size1
))
* and end of buffer. */
if
(
text
==
textstart
||
text
==
textend
)
goto
fail
;
if
(
!
((
SYNTAX
(
text
[
-
1
])
==
Sword
)
^
(
SYNTAX
(
text
==
string1
+
size1
?
*
string2
:
*
text
)
==
Sword
)))
if
(
!
((
SYNTAX
(
text
[
-
1
])
==
Sword
)
^
(
SYNTAX
(
*
text
)
==
Sword
)))
goto
fail
;
break
;
goto
continue_matching
;
}
case
Csyntaxspec
:
{
NEXTCHAR
(
ch
);
if
(
SYNTAX
(
ch
)
!=
(
unsigned
char
)
*
code
++
)
goto
fail
;
break
;
goto
continue_matching
;
}
case
Cnotsyntaxspec
:
{
NEXTCHAR
(
ch
);
if
(
SYNTAX
(
ch
)
!=
(
unsigned
char
)
*
code
++
)
break
;
goto
fail
;
#ifdef emacs
case
Cemacs_at_dot
:
if
(
PTR_CHAR_POS
((
unsigned
char
*
)
text
)
+
1
!=
point
)
goto
fail
;
break
;
#endif
/* emacs */
goto
continue_matching
;
}
default:
{
abort
();
/*NOTREACHED*/
}
}
#if 0 /* This line is never reached --Guido */
abort();
#endif
/*NOTREACHED*/
/*
*NOTREACHED
*/
fail:
if
(
failure_sp
!=
failure_stack_start
)
{
failure_sp
--
;
text
=
failure_sp
->
text
;
if
(
text
==
NULL
)
goto
fail
;
partend
=
failure_sp
->
partend
;
code
=
failure_sp
->
code
;
POP_FAILURE
(
state
,
code
,
text
,
goto
done_matching
,
goto
error
);
goto
continue_matching
;
}
if
(
failure_stack_start
!=
initial_failure_stack
)
free
((
char
*
)
failure_stack_start
);
done_matching:
/* if(translated != NULL) */
/* free(translated); */
FREE_STATE
(
state
);
return
-
1
;
error:
if
(
failure_stack_start
!=
initial_failure_stack
)
free
((
char
*
)
failure_stack_start
);
/* if (translated != NULL) */
/* free(translated); */
FREE_STATE
(
state
);
return
-
2
;
}
#undef PREFETCH
#undef NEXTCHAR
#undef PUSH_FAILURE
int
re_match
(
bufp
,
string
,
size
,
pos
,
regs
)
regexp_t
bufp
;
char
*
string
;
int
size
,
pos
;
regexp_registers_t
regs
;
int
re_search
(
regexp_t
bufp
,
char
*
string
,
int
size
,
int
pos
,
int
range
,
regexp_registers_t
regs
)
{
return
re_match_2
(
bufp
,
string
,
size
,
(
char
*
)
NULL
,
0
,
pos
,
regs
,
size
);
}
int
re_search_2
(
bufp
,
string1
,
size1
,
string2
,
size2
,
pos
,
range
,
regs
,
mstop
)
regexp_t
bufp
;
char
*
string1
,
*
string2
;
int
size1
,
size2
,
pos
,
range
,
mstop
;
regexp_registers_t
regs
;
{
char
*
fastmap
,
*
translate
,
*
text
,
*
partstart
,
*
partend
;
int
dir
,
ret
;
char
*
fastmap
;
char
*
translate
;
char
*
text
;
char
*
partstart
;
char
*
partend
;
int
dir
;
int
ret
;
char
anchor
;
assert
(
size1
>=
0
&&
size2
>=
0
&&
pos
>=
0
&&
mstop
>=
0
);
assert
(
pos
+
range
>=
0
&&
pos
+
range
<=
size1
+
size2
);
/* Bugfix by ylo */
assert
(
pos
<=
mstop
);
assert
(
size
>=
0
&&
pos
>=
0
);
assert
(
pos
+
range
>=
0
&&
pos
+
range
<=
size
);
/* Bugfix by ylo */
fastmap
=
bufp
->
fastmap
;
translate
=
bufp
->
translate
;
...
...
@@ -1417,6 +1858,7 @@ regexp_registers_t regs;
anchor
=
bufp
->
anchor
;
if
(
bufp
->
can_be_null
==
1
)
/* can_be_null == 2: can match null at eob */
fastmap
=
NULL
;
if
(
range
<
0
)
{
dir
=
-
1
;
...
...
@@ -1424,59 +1866,39 @@ regexp_registers_t regs;
}
else
dir
=
1
;
if
(
anchor
==
2
)
if
(
pos
!=
0
)
return
-
1
;
else
range
=
0
;
for
(;
range
>=
0
;
range
--
,
pos
+=
dir
)
{
if
(
fastmap
)
{
if
(
dir
==
1
)
{
/* searching forwards */
if
(
pos
<
size1
)
{
text
=
string1
+
pos
;
if
(
pos
+
range
>
size1
)
partend
=
string1
+
size1
;
else
partend
=
string1
+
pos
+
range
;
}
else
{
text
=
string2
+
pos
-
size1
;
partend
=
string2
+
pos
+
range
-
size1
;
}
text
=
string
+
pos
;
partend
=
string
+
size
;
partstart
=
text
;
if
(
translate
)
while
(
text
!=
partend
&&
!
fastmap
[(
unsigned
char
)
translate
[(
unsigned
char
)
*
text
]])
!
fastmap
[(
unsigned
char
)
translate
[(
unsigned
char
)
*
text
]])
text
++
;
else
while
(
text
!=
partend
&&
!
fastmap
[(
unsigned
char
)
*
text
])
text
++
;
pos
+=
text
-
partstart
;
range
-=
text
-
partstart
;
if
(
pos
==
size1
+
size2
&&
bufp
->
can_be_null
==
0
)
if
(
pos
==
size
&&
bufp
->
can_be_null
==
0
)
return
-
1
;
}
else
{
/* searching backwards */
if
(
pos
<=
size1
)
{
text
=
string1
+
pos
;
partstart
=
string1
+
pos
-
range
;
}
else
{
text
=
string2
+
pos
-
size1
;
if
(
range
<
pos
-
size1
)
partstart
=
string2
+
pos
-
size1
-
range
;
else
partstart
=
string2
;
}
text
=
string
+
pos
;
partstart
=
string
+
pos
-
range
;
partend
=
text
;
if
(
translate
)
while
(
text
!=
partstart
&&
...
...
@@ -1493,13 +1915,11 @@ regexp_registers_t regs;
}
if
(
anchor
==
1
)
{
/* anchored to begline */
if
(
pos
>
0
&&
(
pos
<=
size1
?
string1
[
pos
-
1
]
:
string2
[
pos
-
size1
-
1
])
!=
'\n'
)
if
(
pos
>
0
&&
string
[
pos
-
1
])
continue
;
}
assert
(
pos
>=
0
&&
pos
<=
size1
+
size2
);
ret
=
re_match_2
(
bufp
,
string1
,
size1
,
string2
,
size2
,
pos
,
regs
,
mstop
);
assert
(
pos
>=
0
&&
pos
<=
size
);
ret
=
re_match
(
bufp
,
string
,
size
,
pos
,
regs
);
if
(
ret
>=
0
)
return
pos
;
if
(
ret
==
-
2
)
...
...
@@ -1507,198 +1927,3 @@ regexp_registers_t regs;
}
return
-
1
;
}
/*
 * Single-string front end for re_search_2(): `string' is passed as
 * the first part, the second part is a null pointer of length 0, and
 * `size' is also passed as the final (mstop) argument.  The result of
 * re_search_2() is returned unchanged.
 */
int re_search(bufp, string, size, startpos, range, regs)
regexp_t bufp;
char *string;
int size;
int startpos;
int range;
regexp_registers_t regs;
{
	int result;

	result = re_search_2(bufp,
			     string, size,
			     (char *)NULL, 0,
			     startpos, range, regs,
			     size);
	return result;
}
#ifdef UNUSED
/* Pattern buffer shared by re_comp() and re_exec(). */
static struct re_pattern_buffer re_comp_buf;

/*
 * Compile `s' into the shared pattern buffer.
 *
 * With s == NULL nothing is compiled: the result is NULL when the
 * shared buffer already has compiled code, "Out of memory" otherwise.
 *
 * On first use a 256-byte fastmap is allocated and the translate
 * table is cleared.  Returns "Out of memory" if that allocation
 * fails; otherwise returns whatever re_compile_pattern() returns.
 */
char *re_comp(s)
char *s;
{
	if (s == NULL) {
		if (re_comp_buf.buffer)
			return NULL;
		return "Out of memory";
	}

	if (!re_comp_buf.buffer) {
		/* the code buffer itself will be allocated automatically */
		re_comp_buf.fastmap = malloc(256);
		re_comp_buf.translate = NULL;
		if (re_comp_buf.fastmap == NULL)
			return "Out of memory";
	}

	return re_compile_pattern(s, strlen(s), &re_comp_buf);
}
/*
 * Search the NUL-terminated string `s' with the pattern held in the
 * shared re_comp_buf, starting at position 0 with a range equal to
 * the string length and without registers.  Returns 1 when
 * re_search() reports a non-negative position, 0 otherwise.
 */
int re_exec(s)
char *s;
{
	int length;
	int where;

	length = strlen(s);
	where = re_search(&re_comp_buf, s, length, 0, length,
			  (regexp_registers_t)NULL);
	return where >= 0;
}
#endif
#ifdef TEST_REGEXP
int
main
()
{
char
buf
[
500
];
char
*
cp
;
struct
re_pattern_buffer
exp
;
struct
re_registers
regs
;
int
a
,
pos
;
char
fastmap
[
256
];
exp
.
allocated
=
0
;
exp
.
buffer
=
0
;
exp
.
translate
=
NULL
;
exp
.
fastmap
=
fastmap
;
/* re_set_syntax(RE_NO_BK_PARENS|RE_NO_BK_VBAR|RE_ANSI_HEX); */
while
(
1
)
{
printf
(
"Enter regexp:
\n
"
);
gets
(
buf
);
cp
=
re_compile_pattern
(
buf
,
strlen
(
buf
),
&
exp
);
if
(
cp
)
{
printf
(
"Error: %s
\n
"
,
cp
);
continue
;
}
re_compile_fastmap
(
&
exp
);
printf
(
"dump:
\n
"
);
for
(
pos
=
0
;
pos
<
exp
.
used
;)
{
printf
(
"%d: "
,
pos
);
switch
(
exp
.
buffer
[
pos
++
])
{
case
Cend
:
strcpy
(
buf
,
"end"
);
break
;
case
Cbol
:
strcpy
(
buf
,
"bol"
);
break
;
case
Ceol
:
strcpy
(
buf
,
"eol"
);
break
;
case
Cset
:
strcpy
(
buf
,
"set "
);
for
(
a
=
0
;
a
<
256
/
8
;
a
++
)
sprintf
(
buf
+
strlen
(
buf
),
" %02x"
,
(
unsigned
char
)
exp
.
buffer
[
pos
++
]);
break
;
case
Cexact
:
sprintf
(
buf
,
"exact '%c' 0x%x"
,
exp
.
buffer
[
pos
],
(
unsigned
char
)
exp
.
buffer
[
pos
]);
pos
++
;
break
;
case
Canychar
:
strcpy
(
buf
,
"anychar"
);
break
;
case
Cstart_memory
:
sprintf
(
buf
,
"start_memory %d"
,
exp
.
buffer
[
pos
++
]);
break
;
case
Cend_memory
:
sprintf
(
buf
,
"end_memory %d"
,
exp
.
buffer
[
pos
++
]);
break
;
case
Cmatch_memory
:
sprintf
(
buf
,
"match_memory %d"
,
exp
.
buffer
[
pos
++
]);
break
;
case
Cjump
:
case
Cdummy_failure_jump
:
case
Cstar_jump
:
case
Cfailure_jump
:
case
Cupdate_failure_jump
:
a
=
(
unsigned
char
)
exp
.
buffer
[
pos
++
];
a
+=
(
unsigned
char
)
exp
.
buffer
[
pos
++
]
<<
8
;
a
=
(
int
)(
short
)
a
;
switch
(
exp
.
buffer
[
pos
-
3
])
{
case
Cjump
:
cp
=
"jump"
;
break
;
case
Cstar_jump
:
cp
=
"star_jump"
;
break
;
case
Cfailure_jump
:
cp
=
"failure_jump"
;
break
;
case
Cupdate_failure_jump
:
cp
=
"update_failure_jump"
;
break
;
case
Cdummy_failure_jump
:
cp
=
"dummy_failure_jump"
;
break
;
default:
cp
=
"unknown jump"
;
break
;
}
sprintf
(
buf
,
"%s %d"
,
cp
,
a
+
pos
);
break
;
case
Cbegbuf
:
strcpy
(
buf
,
"begbuf"
);
break
;
case
Cendbuf
:
strcpy
(
buf
,
"endbuf"
);
break
;
case
Cwordbeg
:
strcpy
(
buf
,
"wordbeg"
);
break
;
case
Cwordend
:
strcpy
(
buf
,
"wordend"
);
break
;
case
Cwordbound
:
strcpy
(
buf
,
"wordbound"
);
break
;
case
Cnotwordbound
:
strcpy
(
buf
,
"notwordbound"
);
break
;
default:
sprintf
(
buf
,
"unknown code %d"
,
(
unsigned
char
)
exp
.
buffer
[
pos
-
1
]);
break
;
}
printf
(
"%s
\n
"
,
buf
);
}
printf
(
"can_be_null = %d uses_registers = %d anchor = %d
\n
"
,
exp
.
can_be_null
,
exp
.
uses_registers
,
exp
.
anchor
);
printf
(
"fastmap:"
);
for
(
a
=
0
;
a
<
256
;
a
++
)
if
(
exp
.
fastmap
[
a
])
printf
(
" %d"
,
a
);
printf
(
"
\n
"
);
printf
(
"Enter strings. An empty line terminates.
\n
"
);
while
(
fgets
(
buf
,
sizeof
(
buf
),
stdin
))
{
if
(
buf
[
0
]
==
'\n'
)
break
;
a
=
re_search
(
&
exp
,
buf
,
strlen
(
buf
),
0
,
strlen
(
buf
),
&
regs
);
printf
(
"search returns %d
\n
"
,
a
);
if
(
a
!=
-
1
)
{
for
(
a
=
0
;
a
<
RE_NREGS
;
a
++
)
{
printf
(
"buf %d: %d to %d
\n
"
,
a
,
regs
.
start
[
a
],
regs
.
end
[
a
]);
}
}
}
}
}
#endif
/* TEST_REGEXP */
Modules/regexpr.h
View file @
004c1e1d
...
...
@@ -69,9 +69,7 @@ typedef struct re_registers
#define re_set_syntax _Py_re_set_syntax
#define re_compile_pattern _Py_re_compile_pattern
#define re_match _Py_re_match
#define re_match_2 _Py_re_match_2
#define re_search _Py_re_search
#define re_search_2 _Py_re_search_2
#define re_compile_fastmap _Py_re_compile_fastmap
#define re_comp _Py_re_comp
#define re_exec _Py_re_exec
...
...
@@ -96,20 +94,12 @@ char *re_compile_pattern(char *regex, int regex_size, regexp_t compiled);
translation table, or NULL if it is not used. */
int
re_match
(
regexp_t
compiled
,
char
*
string
,
int
size
,
int
pos
,
regexp_registers_t
regs
);
regexp_registers_t
old_
regs
);
/* This tries to match the regexp against the string. This returns the
length of the matched portion, or -1 if the pattern could not be
matched and -2 if an error (such as failure stack overflow) is
encountered. */
int
re_match_2
(
regexp_t
compiled
,
char
*
string1
,
int
size1
,
char
*
string2
,
int
size2
,
int
pos
,
regexp_registers_t
regs
,
int
mstop
);
/* This tries to match the regexp to the concatenation of string1 and
string2. This returns the length of the matched portion, or -1 if the
pattern could not be matched and -2 if an error (such as failure stack
overflow) is encountered. */
int
re_search
(
regexp_t
compiled
,
char
*
string
,
int
size
,
int
startpos
,
int
range
,
regexp_registers_t
regs
);
/* This searches for a substring matching the regexp. This returns the first
...
...
@@ -119,12 +109,6 @@ int re_search(regexp_t compiled, char *string, int size, int startpos,
which a match must not go. This returns -1 if no match is found, and
-2 if an error (such as failure stack overflow) is encountered. */
int
re_search_2
(
regexp_t
compiled
,
char
*
string1
,
int
size1
,
char
*
string2
,
int
size2
,
int
startpos
,
int
range
,
regexp_registers_t
regs
,
int
mstop
);
/* This is like re_search, but search from the concatenation of string1 and
string2. */
void
re_compile_fastmap
(
regexp_t
compiled
);
/* This computes the fastmap for the regexp. For this to have any effect,
the calling program must have initialized the fastmap field to point
...
...
@@ -146,9 +130,7 @@ extern int re_syntax;
int
re_set_syntax
();
char
*
re_compile_pattern
();
int
re_match
();
int
re_match_2
();
int
re_search
();
int
re_search_2
();
void
re_compile_fastmap
();
char
*
re_comp
();
int
re_exec
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment