Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
b1aa1951
Commit
b1aa1951
authored
Jun 01, 2000
by
Jeremy Hylton
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Fredrik Lundh: here's the 96.6% version of SRE
parent
0292d78e
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
743 additions
and
303 deletions
+743
-303
Lib/sre.py
Lib/sre.py
+105
-18
Lib/sre_compile.py
Lib/sre_compile.py
+70
-64
Lib/sre_constants.py
Lib/sre_constants.py
+68
-27
Modules/_sre.c
Modules/_sre.c
+448
-188
Modules/sre.h
Modules/sre.h
+29
-5
Modules/sre_constants.h
Modules/sre_constants.h
+23
-1
No files found.
Lib/sre.py
View file @
b1aa1951
# -*- Mode: Python; tab-width: 4 -*-
#
# Secret Labs' Regular Expression Engine
# $Id$
...
...
@@ -7,39 +6,127 @@
#
# Copyright (c) 1998-2000 by Secret Labs AB. All rights reserved.
#
# This code can only be used for 1.6 alpha testing. All other use
# require explicit permission from Secret Labs AB.
#
# Portions of this engine have been developed in cooperation with
# CNRI. Hewlett-Packard provided funding for 1.6 integration and
# other compatibility work.
#
"""
this is a long string
"""
import
sre_compile
# flags
I
=
IGNORECASE
=
sre_compile
.
SRE_FLAG_IGNORECASE
L
=
LOCALE
=
sre_compile
.
SRE_FLAG_LOCALE
M
=
MULTILINE
=
sre_compile
.
SRE_FLAG_MULTILINE
S
=
DOTALL
=
sre_compile
.
SRE_FLAG_DOTALL
X
=
VERBOSE
=
sre_compile
.
SRE_FLAG_VERBOSE
# --------------------------------------------------------------------
# public interface
def
compile
(
pattern
,
flags
=
0
):
return
sre_compile
.
compile
(
pattern
,
_fixflags
(
flags
))
# FIXME: add docstrings
def
match
(
pattern
,
string
,
flags
=
0
):
return
compile
(
pattern
,
_fixflags
(
flags
)
).
match
(
string
)
return
_compile
(
pattern
,
flags
).
match
(
string
)
def
search
(
pattern
,
string
,
flags
=
0
):
return
compile
(
pattern
,
_fixflags
(
flags
)).
search
(
string
)
return
_compile
(
pattern
,
flags
).
search
(
string
)
def
sub
(
pattern
,
repl
,
string
,
count
=
0
):
return
_compile
(
pattern
).
sub
(
repl
,
string
,
count
)
def
subn
(
pattern
,
repl
,
string
,
count
=
0
):
return
_compile
(
pattern
).
subn
(
repl
,
string
,
count
)
def
split
(
pattern
,
string
,
maxsplit
=
0
):
return
_compile
(
pattern
).
split
(
string
,
maxsplit
)
# FIXME: etc
def
findall
(
pattern
,
string
,
maxsplit
=
0
):
return
_compile
(
pattern
).
findall
(
string
,
maxsplit
)
def
compile
(
pattern
,
flags
=
0
):
return
_compile
(
pattern
,
flags
)
def
escape
(
pattern
):
s
=
list
(
pattern
)
for
i
in
range
(
len
(
pattern
)):
c
=
pattern
[
i
]
if
not
(
"a"
<=
c
<=
"z"
or
"A"
<=
c
<=
"Z"
or
"0"
<=
c
<=
"9"
):
if
c
==
"
\
000
"
:
s
[
i
]
=
"
\
\
000"
else
:
s
[
i
]
=
"
\
\
"
+
c
return
pattern
[:
0
].
join
(
s
)
# --------------------------------------------------------------------
# helpers
# internals
_cache
=
{}
_MAXCACHE
=
100
def
_compile
(
pattern
,
flags
=
0
):
# internal: compile pattern
tp
=
type
(
pattern
)
if
tp
not
in
(
type
(
""
),
type
(
u""
)):
return
pattern
key
=
(
tp
,
pattern
,
flags
)
try
:
return
_cache
[
key
]
except
KeyError
:
pass
p
=
sre_compile
.
compile
(
pattern
,
flags
)
if
len
(
_cache
)
>=
_MAXCACHE
:
_cache
.
clear
()
_cache
[
key
]
=
p
return
p
def
_sub
(
pattern
,
template
,
string
,
count
=
0
):
# internal: pattern.sub implementation hook
return
_subn
(
pattern
,
template
,
string
,
count
)[
0
]
def
_expand
(
match
,
template
):
# internal: expand template
return
template
# FIXME
def
_fixflags
(
flags
):
# convert flag bitmask to sequence
assert
not
flags
return
()
def
_subn
(
pattern
,
template
,
string
,
count
=
0
):
# internal: pattern.subn implementation hook
if
callable
(
template
):
filter
=
callable
else
:
# FIXME: prepare template
def
filter
(
match
,
template
=
template
):
return
_expand
(
match
,
template
)
n
=
i
=
0
s
=
[]
append
=
s
.
append
c
=
pattern
.
cursor
(
string
)
while
not
count
or
n
<
count
:
m
=
c
.
search
()
if
not
m
:
break
j
=
m
.
start
()
if
j
>
i
:
append
(
string
[
i
:
j
])
append
(
filter
(
m
))
i
=
m
.
end
()
n
=
n
+
1
if
i
<
len
(
string
):
append
(
string
[
i
:])
return
string
[:
0
].
join
(
s
),
n
def
_split
(
pattern
,
string
,
maxsplit
=
0
):
# internal: pattern.split implementation hook
n
=
i
=
0
s
=
[]
append
=
s
.
append
c
=
pattern
.
cursor
(
string
)
while
not
maxsplit
or
n
<
maxsplit
:
m
=
c
.
search
()
if
not
m
:
break
j
=
m
.
start
()
append
(
string
[
i
:
j
])
i
=
m
.
end
()
n
=
n
+
1
if
i
<
len
(
string
):
append
(
string
[
i
:])
return
s
Lib/sre_compile.py
View file @
b1aa1951
...
...
@@ -14,9 +14,6 @@
# other compatibility work.
#
# FIXME: <fl> formalize (objectify?) and document the compiler code
# format, so that other frontends can use this compiler
import
array
,
string
,
sys
import
_sre
...
...
@@ -45,64 +42,70 @@ class Code:
self
.
data
.
append
(
code
)
def
todata
(
self
):
# print self.data
try
:
return
array
.
array
(
WORDSIZE
,
self
.
data
).
tostring
()
except
OverflowError
:
print
self
.
data
raise
def
_lower
(
literal
):
# return _sre._lower(literal) # FIXME
return
string
.
lower
(
literal
)
def
_compile
(
code
,
pattern
,
flags
):
def
_compile
(
code
,
pattern
,
flags
,
level
=
0
):
append
=
code
.
append
for
op
,
av
in
pattern
:
if
op
is
ANY
:
if
"s"
in
flags
:
append
(
CODES
[
op
])
# any character at all!
if
flags
&
SRE_FLAG_DOTALL
:
append
(
OP
CODES
[
op
])
# any character at all!
else
:
append
(
CODES
[
NOT_LITERAL
])
append
(
10
)
append
(
OPCODES
[
CATEGORY
])
append
(
CHCODES
[
CATEGORY_NOT_LINEBREAK
]
)
elif
op
in
(
SUCCESS
,
FAILURE
):
append
(
CODES
[
op
])
append
(
OP
CODES
[
op
])
elif
op
is
AT
:
append
(
CODES
[
op
])
append
(
POSITIONS
[
av
])
append
(
OPCODES
[
op
])
if
flags
&
SRE_FLAG_MULTILINE
:
append
(
ATCODES
[
AT_MULTILINE
[
av
]])
else
:
append
(
ATCODES
[
av
])
elif
op
is
BRANCH
:
append
(
CODES
[
op
])
append
(
OP
CODES
[
op
])
tail
=
[]
for
av
in
av
[
1
]:
skip
=
len
(
code
);
append
(
0
)
_compile
(
code
,
av
,
flags
)
append
(
CODES
[
JUMP
])
_compile
(
code
,
av
,
flags
,
level
)
append
(
OP
CODES
[
JUMP
])
tail
.
append
(
len
(
code
));
append
(
0
)
code
[
skip
]
=
len
(
code
)
-
skip
append
(
0
)
# end of branch
for
tail
in
tail
:
code
[
tail
]
=
len
(
code
)
-
tail
elif
op
is
CALL
:
append
(
CODES
[
op
])
append
(
OP
CODES
[
op
])
skip
=
len
(
code
);
append
(
0
)
_compile
(
code
,
av
,
flags
)
append
(
CODES
[
SUCCESS
])
_compile
(
code
,
av
,
flags
,
level
+
1
)
append
(
OP
CODES
[
SUCCESS
])
code
[
skip
]
=
len
(
code
)
-
skip
elif
op
is
CATEGORY
:
# not used by current parser
append
(
CODES
[
op
])
append
(
CATEGORIES
[
av
])
append
(
OPCODES
[
op
])
if
flags
&
SRE_FLAG_LOCALE
:
append
(
CH_LOCALE
[
CHCODES
[
av
]])
else
:
append
(
CHCODES
[
av
])
elif
op
is
GROUP
:
if
"i"
in
flags
:
append
(
CODES
[
MA
P_IGNORE
[
op
]])
if
flags
&
SRE_FLAG_IGNORECASE
:
append
(
OPCODES
[
O
P_IGNORE
[
op
]])
else
:
append
(
CODES
[
op
])
append
(
av
)
append
(
OP
CODES
[
op
])
append
(
av
-
1
)
elif
op
is
IN
:
if
"i"
in
flags
:
append
(
CODES
[
MA
P_IGNORE
[
op
]])
if
flags
&
SRE_FLAG_IGNORECASE
:
append
(
OPCODES
[
O
P_IGNORE
[
op
]])
def
fixup
(
literal
):
return
ord
(
_lower
(
literal
))
return
ord
(
literal
.
lower
(
))
else
:
append
(
CODES
[
op
])
append
(
OP
CODES
[
op
])
fixup
=
ord
skip
=
len
(
code
);
append
(
0
)
for
op
,
av
in
av
:
append
(
CODES
[
op
])
append
(
OP
CODES
[
op
])
if
op
is
NEGATE
:
pass
elif
op
is
LITERAL
:
...
...
@@ -111,58 +114,60 @@ def _compile(code, pattern, flags):
append
(
fixup
(
av
[
0
]))
append
(
fixup
(
av
[
1
]))
elif
op
is
CATEGORY
:
append
(
CATEGORIES
[
av
])
if
flags
&
SRE_FLAG_LOCALE
:
append
(
CH_LOCALE
[
CHCODES
[
av
]])
else
:
append
(
CHCODES
[
av
])
else
:
raise
ValueError
,
"unsupported set operator"
append
(
CODES
[
FAILURE
])
append
(
OP
CODES
[
FAILURE
])
code
[
skip
]
=
len
(
code
)
-
skip
elif
op
in
(
LITERAL
,
NOT_LITERAL
):
if
"i"
in
flags
:
append
(
CODES
[
MA
P_IGNORE
[
op
]])
append
(
ord
(
_lower
(
av
)))
if
flags
&
SRE_FLAG_IGNORECASE
:
append
(
OPCODES
[
O
P_IGNORE
[
op
]])
append
(
ord
(
av
.
lower
(
)))
else
:
append
(
CODES
[
op
])
append
(
OP
CODES
[
op
])
append
(
ord
(
av
))
elif
op
is
MARK
:
append
(
CODES
[
op
])
append
(
OP
CODES
[
op
])
append
(
av
)
elif
op
in
(
REPEAT
,
MIN_REPEAT
,
MAX_REPEAT
):
lo
,
hi
=
av
[
2
].
getwidth
()
if
lo
==
0
:
raise
SyntaxError
,
"cannot repeat zero-width items"
if
lo
==
hi
==
1
and
op
is
MAX_REPEAT
:
append
(
CODES
[
MAX_REPEAT_ONE
])
append
(
OP
CODES
[
MAX_REPEAT_ONE
])
skip
=
len
(
code
);
append
(
0
)
append
(
av
[
0
])
append
(
av
[
1
])
_compile
(
code
,
av
[
2
],
flags
)
append
(
CODES
[
SUCCESS
])
_compile
(
code
,
av
[
2
],
flags
,
level
+
1
)
append
(
OP
CODES
[
SUCCESS
])
code
[
skip
]
=
len
(
code
)
-
skip
else
:
append
(
CODES
[
op
])
append
(
OP
CODES
[
op
])
skip
=
len
(
code
);
append
(
0
)
append
(
av
[
0
])
append
(
av
[
1
])
_compile
(
code
,
av
[
2
],
flags
)
_compile
(
code
,
av
[
2
],
flags
,
level
+
1
)
if
op
is
MIN_REPEAT
:
append
(
CODES
[
MIN_UNTIL
])
append
(
OP
CODES
[
MIN_UNTIL
])
else
:
# FIXME: MAX_REPEAT PROBABLY DOESN'T WORK (?)
append
(
CODES
[
MAX_UNTIL
])
append
(
OPCODES
[
MAX_UNTIL
])
code
[
skip
]
=
len
(
code
)
-
skip
elif
op
is
SUBPATTERN
:
##
group = av[0]
##
if group:
## append(
CODES[MARK])
##
append((group-1)*2)
_compile
(
code
,
av
[
1
],
flags
)
##
if group:
## append(
CODES[MARK])
##
append((group-1)*2+1)
group
=
av
[
0
]
if
group
:
append
(
OP
CODES
[
MARK
])
append
((
group
-
1
)
*
2
)
_compile
(
code
,
av
[
1
],
flags
,
level
+
1
)
if
group
:
append
(
OP
CODES
[
MARK
])
append
((
group
-
1
)
*
2
+
1
)
else
:
raise
ValueError
,
(
"unsupported operand type"
,
op
)
def
compile
(
p
,
flags
=
()
):
def
compile
(
p
,
flags
=
0
):
# convert pattern list to internal format
if
type
(
p
)
in
(
type
(
""
),
type
(
u""
)):
import
sre_parse
...
...
@@ -170,12 +175,10 @@ def compile(p, flags=()):
p
=
sre_parse
.
parse
(
p
)
else
:
pattern
=
None
# print p.getwidth()
# print p
flags
=
p
.
pattern
.
flags
|
flags
code
=
Code
()
_compile
(
code
,
p
.
data
,
p
.
pattern
.
flags
)
code
.
append
(
CODES
[
SUCCESS
])
# print list(code.data)
_compile
(
code
,
p
.
data
,
flags
)
code
.
append
(
OPCODES
[
SUCCESS
])
data
=
code
.
todata
()
if
0
:
# debugging
print
...
...
@@ -183,5 +186,8 @@ def compile(p, flags=()):
import
sre_disasm
sre_disasm
.
disasm
(
data
)
print
"-"
*
68
# print len(data), p.pattern.groups, len(p.pattern.groupdict)
return
_sre
.
compile
(
pattern
,
data
,
p
.
pattern
.
groups
-
1
,
p
.
pattern
.
groupdict
)
return
_sre
.
compile
(
pattern
,
flags
,
data
,
p
.
pattern
.
groups
-
1
,
p
.
pattern
.
groupdict
)
Lib/sre_constants.py
View file @
b1aa1951
...
...
@@ -48,20 +48,31 @@ SUBPATTERN = "subpattern"
# positions
AT_BEGINNING
=
"at_beginning"
AT_BEGINNING_LINE
=
"at_beginning_line"
AT_BOUNDARY
=
"at_boundary"
AT_NON_BOUNDARY
=
"at_non_boundary"
AT_END
=
"at_end"
AT_END_LINE
=
"at_end_line"
# categories
CATEGORY_DIGIT
=
"category_digit"
CATEGORY_NOT_DIGIT
=
"category_not_digit"
CATEGORY_SPACE
=
"category_space"
CATEGORY_NOT_SPACE
=
"category_not_space"
CATEGORY_WORD
=
"category_word"
CATEGORY_NOT_WORD
=
"category_not_word"
CATEGORY_LINEBREAK
=
"category_linebreak"
CATEGORY_NOT_LINEBREAK
=
"category_not_linebreak"
CATEGORY_LOC_DIGIT
=
"category_loc_digit"
CATEGORY_LOC_NOT_DIGIT
=
"category_loc_not_digit"
CATEGORY_LOC_SPACE
=
"category_loc_space"
CATEGORY_LOC_NOT_SPACE
=
"category_loc_not_space"
CATEGORY_LOC_WORD
=
"category_loc_word"
CATEGORY_LOC_NOT_WORD
=
"category_loc_not_word"
CATEGORY_LOC_LINEBREAK
=
"category_loc_linebreak"
CATEGORY_LOC_NOT_LINEBREAK
=
"category_loc_not_linebreak"
CODES
=
[
OP
CODES
=
[
# failure=0 success=1 (just because it looks better that way :-)
FAILURE
,
SUCCESS
,
...
...
@@ -87,45 +98,75 @@ CODES = [
]
# convert to dictionary
c
=
{}
i
=
0
for
code
in
CODES
:
c
[
code
]
=
i
ATCODES
=
[
AT_BEGINNING
,
AT_BEGINNING_LINE
,
AT_BOUNDARY
,
AT_NON_BOUNDARY
,
AT_END
,
AT_END_LINE
]
CHCODES
=
[
CATEGORY_DIGIT
,
CATEGORY_NOT_DIGIT
,
CATEGORY_SPACE
,
CATEGORY_NOT_SPACE
,
CATEGORY_WORD
,
CATEGORY_NOT_WORD
,
CATEGORY_LINEBREAK
,
CATEGORY_NOT_LINEBREAK
,
CATEGORY_LOC_DIGIT
,
CATEGORY_LOC_NOT_DIGIT
,
CATEGORY_LOC_SPACE
,
CATEGORY_LOC_NOT_SPACE
,
CATEGORY_LOC_WORD
,
CATEGORY_LOC_NOT_WORD
,
CATEGORY_LOC_LINEBREAK
,
CATEGORY_LOC_NOT_LINEBREAK
]
def
makedict
(
list
):
d
=
{}
i
=
0
for
item
in
list
:
d
[
item
]
=
i
i
=
i
+
1
CODES
=
c
return
d
OPCODES
=
makedict
(
OPCODES
)
ATCODES
=
makedict
(
ATCODES
)
CHCODES
=
makedict
(
CHCODES
)
# replacement operations for "ignore case" mode
MA
P_IGNORE
=
{
O
P_IGNORE
=
{
GROUP
:
GROUP_IGNORE
,
IN
:
IN_IGNORE
,
LITERAL
:
LITERAL_IGNORE
,
NOT_LITERAL
:
NOT_LITERAL_IGNORE
}
POSITIONS
=
{
AT_BEGINNING
:
ord
(
"a"
),
AT_BOUNDARY
:
ord
(
"b"
),
AT_NON_BOUNDARY
:
ord
(
"B"
),
AT_END
:
ord
(
"z"
),
AT_MULTILINE
=
{
AT_BEGINNING
:
AT_BEGINNING_LINE
,
AT_END
:
AT_END_LINE
}
CATEGORIES
=
{
CATEGORY_DIGIT
:
ord
(
"d"
),
CATEGORY_NOT_DIGIT
:
ord
(
"D"
),
CATEGORY_SPACE
:
ord
(
"s"
),
CATEGORY_NOT_SPACE
:
ord
(
"S"
),
CATEGORY_WORD
:
ord
(
"w"
),
CATEGORY_NOT_WORD
:
ord
(
"W"
),
CH_LOCALE
=
{
CATEGORY_DIGIT
:
CATEGORY_LOC_DIGIT
,
CATEGORY_NOT_DIGIT
:
CATEGORY_LOC_NOT_DIGIT
,
CATEGORY_SPACE
:
CATEGORY_LOC_SPACE
,
CATEGORY_NOT_SPACE
:
CATEGORY_LOC_NOT_SPACE
,
CATEGORY_WORD
:
CATEGORY_LOC_WORD
,
CATEGORY_NOT_WORD
:
CATEGORY_LOC_NOT_WORD
,
CATEGORY_LINEBREAK
:
CATEGORY_LOC_LINEBREAK
,
CATEGORY_NOT_LINEBREAK
:
CATEGORY_LOC_NOT_LINEBREAK
}
# flags
SRE_FLAG_TEMPLATE
=
1
# NYI
SRE_FLAG_IGNORECASE
=
2
SRE_FLAG_LOCALE
=
4
SRE_FLAG_MULTILINE
=
8
SRE_FLAG_DOTALL
=
16
SRE_FLAG_VERBOSE
=
32
if
__name__
==
"__main__"
:
import
string
items
=
CODES
.
items
()
def
dump
(
f
,
d
,
prefix
):
items
=
d
.
items
()
items
.
sort
(
lambda
a
,
b
:
cmp
(
a
[
1
],
b
[
1
]))
f
=
open
(
"sre_constants.h"
,
"w"
)
f
.
write
(
"/* generated by sre_constants.py */
\
n
"
)
for
k
,
v
in
items
:
f
.
write
(
"#define SRE_OP_"
+
string
.
upper
(
k
)
+
" "
+
str
(
v
)
+
"
\
n
"
)
f
.
write
(
"#define %s_%s %s
\
n
"
%
(
prefix
,
string
.
upper
(
k
),
v
))
f
=
open
(
"sre_constants.h"
,
"w"
)
f
.
write
(
"/* generated from sre_constants.py */
\
n
"
)
dump
(
f
,
OPCODES
,
"SRE_OP"
)
dump
(
f
,
ATCODES
,
"SRE"
)
dump
(
f
,
CHCODES
,
"SRE"
)
f
.
close
()
print
"done"
Modules/_sre.c
View file @
b1aa1951
...
...
@@ -6,13 +6,16 @@
* simple regular expression matching engine
*
* partial history:
* 99-10-24 fl created (b
its and pieces from
the template matcher)
* 99-10-24 fl created (b
ased on
the template matcher)
* 99-11-13 fl added categories, branching, and more (0.2)
* 99-11-16 fl some tweaks to compile on non-Windows platforms
* 99-12-18 fl non-literals, generic maximizing repeat (0.3)
* 99-02-28 fl tons of changes (not all to the better ;-) (0.4)
* 99-03-06 fl first alpha, sort of (0.5)
* 99-03-14 fl removed most compatibility stuff (0.6)
* 99-05-10 fl towards third alpha (0.8.2)
* 99-05-13 fl added experimental cursor stuff (0.8.3)
* 99-05-27 fl final bug hunt (0.8.4)
*
* Copyright (c) 1997-2000 by Secret Labs AB. All rights reserved.
*
...
...
@@ -26,7 +29,7 @@
#ifndef SRE_RECURSIVE
char
copyright
[]
=
" SRE 0.
6
Copyright (c) 1997-2000 by Secret Labs AB "
;
char
copyright
[]
=
" SRE 0.
8.4
Copyright (c) 1997-2000 by Secret Labs AB "
;
#include "Python.h"
...
...
@@ -40,7 +43,7 @@ char copyright[] = " SRE 0.6 Copyright (c) 1997-2000 by Secret Labs AB ";
#define INT_MAX 2147483647
#endif
#include <ctype.h>
/* temporary hack */
#include <ctype.h>
/* defining this one enables tracing */
#undef DEBUG
...
...
@@ -59,61 +62,69 @@ char copyright[] = " SRE 0.6 Copyright (c) 1997-2000 by Secret Labs AB ";
#ifdef DEBUG
#define TRACE(v) printf v
#define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning)
#else
#define TRACE(v)
#endif
#define PTR(ptr) ((SRE_CHAR*) (ptr) - (SRE_CHAR*) state->beginning)
#define SRE_CODE unsigned short
/* unsigned short or larger */
typedef
struct
{
/* string pointers */
void
*
ptr
;
/* current position (also end of current slice) */
void
*
beginning
;
/* start of original string */
void
*
start
;
/* start of current slice */
void
*
end
;
/* end of original string */
/* character size */
int
charsize
;
/* registers */
int
marks
;
void
*
mark
[
64
];
/* FIXME: <fl> should be dynamically allocated! */
/* FIXME */
/* backtracking stack */
void
**
stack
;
int
stacksize
;
int
stackbase
;
}
SRE_STATE
;
#if 1
/* FIXME: <fl> fix this one! */
#define SRE_TO_LOWER Py_UNICODE_TOLOWER
#define SRE_IS_DIGIT Py_UNICODE_ISDIGIT
#define SRE_IS_SPACE Py_UNICODE_ISSPACE
#define SRE_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
#else
#define SRE_TO_LOWER(ch) ((ch) < 256 ? tolower((ch)) : ch)
#define SRE_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
#define SRE_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
/* -------------------------------------------------------------------- */
/* search engine state */
/* unicode character predicates */
#define SRE_TO_LOWER(ch) Py_UNICODE_TOLOWER((Py_UNICODE)(ch))
#define SRE_IS_DIGIT(ch) Py_UNICODE_ISDIGIT((Py_UNICODE)(ch))
#define SRE_IS_SPACE(ch) Py_UNICODE_ISSPACE((Py_UNICODE)(ch))
#define SRE_IS_LINEBREAK(ch) ((ch) == '\n')
/* #define SRE_IS_LINEBREAK(ch) Py_UNICODE_ISLINEBREAK((Py_UNICODE)(ch)) */
#define SRE_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
#endif
#define SRE_IS_WORD(ch) (SRE_IS_ALNUM((ch)) || (ch) == '_')
/* locale-specific character predicates */
#define SRE_LOC_TO_LOWER(ch) ((ch) < 256 ? tolower((ch)) : ch)
#define SRE_LOC_IS_DIGIT(ch) ((ch) < 256 ? isdigit((ch)) : 0)
#define SRE_LOC_IS_SPACE(ch) ((ch) < 256 ? isspace((ch)) : 0)
#define SRE_LOC_IS_LINEBREAK(ch) ((ch) == '\n')
#define SRE_LOC_IS_ALNUM(ch) ((ch) < 256 ? isalnum((ch)) : 0)
#define SRE_LOC_IS_WORD(ch) (SRE_LOC_IS_ALNUM((ch)) || (ch) == '_')
LOCAL
(
int
)
sre_category
(
SRE_CODE
category
,
unsigned
int
ch
)
{
switch
(
category
)
{
case
'd'
:
case
SRE_CATEGORY_DIGIT
:
return
SRE_IS_DIGIT
(
ch
);
case
'D'
:
case
SRE_CATEGORY_NOT_DIGIT
:
return
!
SRE_IS_DIGIT
(
ch
);
case
's'
:
case
SRE_CATEGORY_SPACE
:
return
SRE_IS_SPACE
(
ch
);
case
'S'
:
case
SRE_CATEGORY_NOT_SPACE
:
return
!
SRE_IS_SPACE
(
ch
);
case
'w'
:
case
SRE_CATEGORY_WORD
:
return
SRE_IS_WORD
(
ch
);
case
'W'
:
case
SRE_CATEGORY_NOT_WORD
:
return
!
SRE_IS_WORD
(
ch
);
case
SRE_CATEGORY_LINEBREAK
:
return
SRE_IS_LINEBREAK
(
ch
);
case
SRE_CATEGORY_NOT_LINEBREAK
:
return
!
SRE_IS_LINEBREAK
(
ch
);
case
SRE_CATEGORY_LOC_DIGIT
:
return
SRE_LOC_IS_DIGIT
(
ch
);
case
SRE_CATEGORY_LOC_NOT_DIGIT
:
return
!
SRE_LOC_IS_DIGIT
(
ch
);
case
SRE_CATEGORY_LOC_SPACE
:
return
SRE_LOC_IS_SPACE
(
ch
);
case
SRE_CATEGORY_LOC_NOT_SPACE
:
return
!
SRE_LOC_IS_SPACE
(
ch
);
case
SRE_CATEGORY_LOC_WORD
:
return
SRE_LOC_IS_WORD
(
ch
);
case
SRE_CATEGORY_LOC_NOT_WORD
:
return
!
SRE_LOC_IS_WORD
(
ch
);
case
SRE_CATEGORY_LOC_LINEBREAK
:
return
SRE_LOC_IS_LINEBREAK
(
ch
);
case
SRE_CATEGORY_LOC_NOT_LINEBREAK
:
return
!
SRE_LOC_IS_LINEBREAK
(
ch
);
}
return
0
;
}
...
...
@@ -174,7 +185,7 @@ _stack_extend(SRE_STATE* state, int lo, int hi)
return
0
;
}
/*
set things up for th
e 8-bit version */
/*
generat
e 8-bit version */
#define SRE_CHAR unsigned char
#define SRE_AT sre_at
...
...
@@ -192,7 +203,7 @@ _stack_extend(SRE_STATE* state, int lo, int hi)
#undef SRE_AT
#undef SRE_CHAR
/*
set things up for th
e 16-bit unicode version */
/*
generat
e 16-bit unicode version */
#define SRE_CHAR Py_UNICODE
#define SRE_AT sre_uat
...
...
@@ -211,20 +222,22 @@ _stack_extend(SRE_STATE* state, int lo, int hi)
LOCAL
(
int
)
SRE_AT
(
SRE_STATE
*
state
,
SRE_CHAR
*
ptr
,
SRE_CODE
at
)
{
/* check if pointer is at given position. return 1 if so, 0
otherwise */
/* check if pointer is at given position */
int
this
,
that
;
switch
(
at
)
{
case
'a'
:
/* beginning */
case
SRE_AT_BEGINNING
:
return
((
void
*
)
ptr
==
state
->
beginning
);
case
'z'
:
/* end */
case
SRE_AT_BEGINNING_LINE
:
return
((
void
*
)
ptr
==
state
->
beginning
||
SRE_IS_LINEBREAK
((
int
)
ptr
[
-
1
]));
case
SRE_AT_END
:
return
((
void
*
)
ptr
==
state
->
end
);
case
'b'
:
/* word boundary */
case
SRE_AT_END_LINE
:
return
((
void
*
)
ptr
==
state
->
end
||
SRE_IS_LINEBREAK
((
int
)
ptr
[
0
]));
case
SRE_AT_BOUNDARY
:
if
(
state
->
beginning
==
state
->
end
)
return
0
;
that
=
((
void
*
)
ptr
>
state
->
beginning
)
?
...
...
@@ -232,8 +245,7 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
this
=
((
void
*
)
ptr
<
state
->
end
)
?
SRE_IS_WORD
((
int
)
ptr
[
0
])
:
0
;
return
this
!=
that
;
case
'B'
:
/* word non-boundary */
case
SRE_AT_NON_BOUNDARY
:
if
(
state
->
beginning
==
state
->
end
)
return
0
;
that
=
((
void
*
)
ptr
>
state
->
beginning
)
?
...
...
@@ -249,8 +261,7 @@ SRE_AT(SRE_STATE* state, SRE_CHAR* ptr, SRE_CODE at)
LOCAL
(
int
)
SRE_MEMBER
(
SRE_CODE
*
set
,
SRE_CHAR
ch
)
{
/* check if character is a member of the given set. return 1 if
so, 0 otherwise */
/* check if character is a member of the given set */
int
ok
=
1
;
...
...
@@ -301,37 +312,50 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
int
stackbase
;
int
i
,
count
;
for
(;;)
{
/* FIXME: this is one ugly hack */
void
*
*
mark
=
NULL
;
void
*
mark_data
[
64
];
TRACE
((
"[%p]
\n
"
,
pattern
));
for
(;;)
{
switch
(
*
pattern
++
)
{
case
SRE_OP_FAILURE
:
/* immediate failure */
TRACE
((
"%8d: failure
\n
"
,
PTR
(
ptr
)));
return
0
;
goto
failure
;
case
SRE_OP_SUCCESS
:
/* end of pattern */
TRACE
((
"%8d: success
\n
"
,
PTR
(
ptr
)));
state
->
ptr
=
ptr
;
return
1
;
goto
success
;
case
SRE_OP_AT
:
/* match at given position */
/* args: <at> */
TRACE
((
"%8d: match at
\\
%c
\n
"
,
PTR
(
ptr
),
*
pattern
));
if
(
!
SRE_AT
(
state
,
ptr
,
*
pattern
))
return
0
;
goto
failure
;
pattern
++
;
break
;
case
SRE_OP_CATEGORY
:
/* match at given category */
/* args: <category> */
TRACE
((
"%8d: category match at
\\
%c
\n
"
,
PTR
(
ptr
),
*
pattern
));
if
(
ptr
>=
end
||
!
sre_category
(
pattern
[
0
],
ptr
[
0
]))
goto
failure
;
pattern
++
;
ptr
++
;
break
;
case
SRE_OP_LITERAL
:
/* match literal character */
/* args: <code> */
TRACE
((
"%8d: literal %c
\n
"
,
PTR
(
ptr
),
(
SRE_CHAR
)
*
pattern
));
if
(
ptr
>=
end
||
*
ptr
!=
(
SRE_CHAR
)
*
pattern
)
return
0
;
goto
failure
;
pattern
++
;
ptr
++
;
break
;
...
...
@@ -341,7 +365,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* args: <code> */
TRACE
((
"%8d: literal not %c
\n
"
,
PTR
(
ptr
),
(
SRE_CHAR
)
*
pattern
));
if
(
ptr
>=
end
||
*
ptr
==
(
SRE_CHAR
)
*
pattern
)
return
0
;
goto
failure
;
pattern
++
;
ptr
++
;
break
;
...
...
@@ -350,7 +374,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* match anything */
TRACE
((
"%8d: any
\n
"
,
PTR
(
ptr
)));
if
(
ptr
>=
end
)
return
0
;
goto
failure
;
ptr
++
;
break
;
...
...
@@ -359,23 +383,47 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* args: <skip> <set> */
TRACE
((
"%8d: set %c
\n
"
,
PTR
(
ptr
),
*
ptr
));
if
(
ptr
>=
end
||
!
SRE_MEMBER
(
pattern
+
1
,
*
ptr
))
return
0
;
goto
failure
;
pattern
+=
pattern
[
0
];
ptr
++
;
break
;
case
SRE_OP_GROUP
:
/* match backreference */
TRACE
((
"%8d: group %d
\n
"
,
PTR
(
ptr
),
pattern
[
0
]));
i
=
pattern
[
0
];
{
/* FIXME: optimize
size
! */
/* FIXME: optimize! */
SRE_CHAR
*
p
=
(
SRE_CHAR
*
)
state
->
mark
[
i
+
i
];
SRE_CHAR
*
e
=
(
SRE_CHAR
*
)
state
->
mark
[
i
+
i
+
1
];
TRACE
((
"%8d: group %p %p
\n
"
,
PTR
(
ptr
),
p
,
e
));
if
(
!
p
||
!
e
||
e
<
p
)
return
0
;
goto
failure
;
while
(
p
<
e
)
{
TRACE
((
"%8d: group test %c %c
\n
"
,
PTR
(
ptr
),
*
ptr
,
*
p
));
if
(
ptr
>=
end
||
*
ptr
!=
*
p
)
return
0
;
goto
failure
;
p
++
;
ptr
++
;
}
}
pattern
++
;
break
;
case
SRE_OP_GROUP_IGNORE
:
/* match backreference */
TRACE
((
"%8d: group ignore %d
\n
"
,
PTR
(
ptr
),
pattern
[
0
]));
i
=
pattern
[
0
];
{
/* FIXME: optimize! */
SRE_CHAR
*
p
=
(
SRE_CHAR
*
)
state
->
mark
[
i
+
i
];
SRE_CHAR
*
e
=
(
SRE_CHAR
*
)
state
->
mark
[
i
+
i
+
1
];
TRACE
((
"%8d: group %p %p
\n
"
,
PTR
(
ptr
),
p
,
e
));
if
(
!
p
||
!
e
||
e
<
p
)
goto
failure
;
while
(
p
<
e
)
{
TRACE
((
"%8d: group test %c %c
\n
"
,
PTR
(
ptr
),
*
ptr
,
*
p
));
if
(
ptr
>=
end
||
SRE_TO_LOWER
(
*
ptr
)
!=
SRE_TO_LOWER
(
*
p
))
goto
failure
;
p
++
;
ptr
++
;
}
}
...
...
@@ -385,7 +433,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
case
SRE_OP_LITERAL_IGNORE
:
TRACE
((
"%8d: literal lower(%c)
\n
"
,
PTR
(
ptr
),
(
SRE_CHAR
)
*
pattern
));
if
(
ptr
>=
end
||
SRE_TO_LOWER
(
*
ptr
)
!=
(
SRE_CHAR
)
*
pattern
)
return
0
;
goto
failure
;
pattern
++
;
ptr
++
;
break
;
...
...
@@ -394,7 +442,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE
((
"%8d: literal not lower(%c)
\n
"
,
PTR
(
ptr
),
(
SRE_CHAR
)
*
pattern
));
if
(
ptr
>=
end
||
SRE_TO_LOWER
(
*
ptr
)
==
(
SRE_CHAR
)
*
pattern
)
return
0
;
goto
failure
;
pattern
++
;
ptr
++
;
break
;
...
...
@@ -403,7 +451,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE
((
"%8d: set lower(%c)
\n
"
,
PTR
(
ptr
),
*
ptr
));
if
(
ptr
>=
end
||
!
SRE_MEMBER
(
pattern
+
1
,
(
SRE_CHAR
)
SRE_TO_LOWER
(
*
ptr
)))
return
0
;
goto
failure
;
pattern
+=
pattern
[
0
];
ptr
++
;
break
;
...
...
@@ -412,6 +460,10 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
/* set mark */
/* args: <mark> */
TRACE
((
"%8d: set mark(%d)
\n
"
,
PTR
(
ptr
),
pattern
[
0
]));
if
(
!
mark
)
{
mark
=
mark_data
;
memcpy
(
mark
,
state
->
mark
,
sizeof
(
state
->
mark
));
}
state
->
mark
[
pattern
[
0
]]
=
ptr
;
pattern
++
;
break
;
...
...
@@ -429,21 +481,18 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE
((
"%8d: match subpattern
\n
"
,
PTR
(
ptr
)));
state
->
ptr
=
ptr
;
if
(
!
SRE_MATCH
(
state
,
pattern
+
1
))
return
0
;
goto
failure
;
pattern
+=
pattern
[
0
];
ptr
=
state
->
ptr
;
break
;
case
SRE_OP_MAX_REPEAT_ONE
:
/* match repeated sequence (maximizing regexp). this
variant only works if the repeated item is exactly one
character wide, and we're not already collecting
/* match repeated sequence (maximizing regexp) */
/* this variant only works if the repeated item is exactly
one character wide, and we're not already collecting
backtracking points. for other cases, use the
MAX_REPEAT operator instead */
/* args: <skip> <min> <max> <step> */
TRACE
((
"%8d: max repeat one {%d,%d}
\n
"
,
PTR
(
ptr
),
pattern
[
1
],
pattern
[
2
]));
...
...
@@ -454,7 +503,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
string, and backtrack from there */
/* FIXME: must look for line endings */
if
(
ptr
+
pattern
[
1
]
>
end
)
return
0
;
/* cannot match */
goto
failure
;
/* cannot match */
count
=
pattern
[
2
];
if
(
count
>
end
-
ptr
)
count
=
end
-
ptr
;
...
...
@@ -515,7 +564,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
while
(
count
<
(
int
)
pattern
[
2
])
{
i
=
SRE_MATCH
(
state
,
pattern
+
3
);
if
(
i
<
0
)
return
i
;
goto
failure
;
if
(
i
==
0
)
break
;
count
++
;
...
...
@@ -529,23 +578,20 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
string. check if the rest of the pattern matches, and
backtrack if not. */
/* FIXME: <fl> this is a mess. fix it! */
TRACE
((
"%8d: repeat %d found
\n
"
,
PTR
(
ptr
),
count
));
if
(
count
<
(
int
)
pattern
[
1
])
return
0
;
goto
failure
;
if
(
pattern
[
pattern
[
0
]]
==
SRE_OP_SUCCESS
)
{
/* tail is empty. we're finished */
TRACE
((
"%8d: tail is empty
\n
"
,
PTR
(
ptr
)));
state
->
ptr
=
ptr
;
return
1
;
goto
success
;
}
else
if
(
pattern
[
pattern
[
0
]]
==
SRE_OP_LITERAL
)
{
/* tail starts with a literal. we can speed things up
by skipping positions where the rest of the pattern
cannot possibly match */
/* tail starts with a literal. skip positions where
the rest of the pattern cannot possibly match */
SRE_CHAR
chr
=
(
SRE_CHAR
)
pattern
[
pattern
[
0
]
+
1
];
TRACE
((
"%8d: tail is literal %d
\n
"
,
PTR
(
ptr
),
chr
));
for
(;;)
{
...
...
@@ -562,7 +608,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
i
=
SRE_MATCH
(
state
,
pattern
+
pattern
[
0
]);
if
(
i
>
0
)
{
TRACE
((
"%8d: repeat %d picked
\n
"
,
PTR
(
ptr
),
count
));
return
1
;
goto
success
;
}
TRACE
((
"%8d: BACKTRACK
\n
"
,
PTR
(
ptr
)));
ptr
--
;
...
...
@@ -570,23 +616,21 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
}
}
else
{
/* general case */
TRACE
((
"%8d: tail is pattern
\n
"
,
PTR
(
ptr
)));
while
(
count
>=
(
int
)
pattern
[
1
])
{
state
->
ptr
=
ptr
;
i
=
SRE_MATCH
(
state
,
pattern
+
pattern
[
0
]);
if
(
i
>
0
)
{
TRACE
((
"%8d: repeat %d picked
\n
"
,
PTR
(
ptr
),
count
));
return
1
;
goto
success
;
}
TRACE
((
"%8d: BACKTRACK
\n
"
,
PTR
(
ptr
)));
ptr
--
;
count
--
;
}
}
return
0
;
/* failure! */
/* ----------------------------------------------------------------------- */
/* FIXME: the following section is just plain broken */
goto
failure
;
case
SRE_OP_MAX_REPEAT
:
/* match repeated sequence (maximizing regexp). repeated
...
...
@@ -611,7 +655,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
i
=
_stack_extend
(
state
,
stackbase
+
count
+
1
,
stackbase
+
pattern
[
2
]);
if
(
i
<
0
)
return
i
;
goto
failure
;
}
state
->
stack
[
stackbase
+
count
]
=
ptr
;
/* check if we can match another item */
...
...
@@ -642,7 +686,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
ptr points to the tail. */
if
(
count
<
(
int
)
pattern
[
1
])
return
0
;
goto
failure
;
/* make sure that rest of the expression matches. if it
doesn't, backtrack */
...
...
@@ -659,7 +703,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
state
->
stackbase
=
stackbase
;
if
(
i
>
0
)
{
TRACE
((
"%8d: repeat %d picked
\n
"
,
PTR
(
ptr
),
count
));
return
1
;
goto
success
;
}
/* backtrack! */
...
...
@@ -673,10 +717,10 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
state
->
stackbase
=
stackbase
;
if
(
i
>
0
)
{
TRACE
((
"%8d: repeat %d picked
\n
"
,
PTR
(
ptr
),
count
));
return
1
;
goto
success
;
}
}
return
0
;
/* failure! */
goto
failure
;
case
SRE_OP_MAX_UNTIL
:
/* match repeated sequence (maximizing regexp). repeated
...
...
@@ -684,13 +728,11 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
TRACE
((
"%8d: max until
\n
"
,
PTR
(
ptr
)));
state
->
ptr
=
ptr
;
return
2
;
/* always succeeds, for now... */
/* end of totally broken section */
/* ----------------------------------------------------------------------- */
goto
success
;
/* always succeeds, for now... */
case
SRE_OP_MIN_REPEAT
:
/* match repeated sequence (minimizing regexp) */
/* FIXME: HERE BE BUGS! */
TRACE
((
"%8d: min repeat %d %d
\n
"
,
PTR
(
ptr
),
pattern
[
1
],
pattern
[
2
]));
count
=
0
;
...
...
@@ -699,7 +741,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
while
(
count
<
(
int
)
pattern
[
1
])
{
i
=
SRE_MATCH
(
state
,
pattern
+
3
);
if
(
i
<=
0
)
return
i
;
goto
failure
;
count
++
;
}
/* move forward until the tail matches. */
...
...
@@ -708,22 +750,22 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
i
=
SRE_MATCH
(
state
,
pattern
+
pattern
[
0
]);
if
(
i
>
0
)
{
TRACE
((
"%8d: repeat %d picked
\n
"
,
PTR
(
ptr
),
count
));
return
1
;
goto
success
;
}
TRACE
((
"%8d: BACKTRACK
\n
"
,
PTR
(
ptr
)));
state
->
ptr
=
ptr
;
/* backtrack */
i
=
SRE_MATCH
(
state
,
pattern
+
3
);
if
(
i
<=
0
)
return
i
;
goto
failure
;
count
++
;
}
return
0
;
/* failure! */
goto
failure
;
case
SRE_OP_MIN_UNTIL
:
/* end of repeat group */
TRACE
((
"%8d: min until
\n
"
,
PTR
(
ptr
)));
state
->
ptr
=
ptr
;
return
2
;
/* always succeeds, for now... */
goto
success
;
/* always succeeds, for now... */
case
SRE_OP_BRANCH
:
/* match one of several subpatterns */
...
...
@@ -737,13 +779,13 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
i
=
SRE_MATCH
(
state
,
pattern
+
1
);
if
(
i
>
0
)
{
TRACE
((
"%8d: branch succeeded
\n
"
,
PTR
(
ptr
)));
return
1
;
goto
success
;
}
}
pattern
+=
*
pattern
;
}
TRACE
((
"%8d: branch failed
\n
"
,
PTR
(
ptr
)));
return
0
;
/* failure! */
goto
failure
;
case
SRE_OP_REPEAT
:
/* TEMPLATE: match repeated sequence (no backtracking) */
...
...
@@ -758,7 +800,7 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
count
++
;
}
if
(
count
<=
(
int
)
pattern
[
1
])
return
0
;
goto
failure
;
TRACE
((
"%8d: repeat %d matches
\n
"
,
PTR
(
ptr
),
count
));
pattern
+=
pattern
[
0
];
ptr
=
state
->
ptr
;
...
...
@@ -768,6 +810,14 @@ SRE_MATCH(SRE_STATE* state, SRE_CODE* pattern)
return
SRE_ERROR_ILLEGAL
;
}
}
failure:
if
(
mark
)
memcpy
(
state
->
mark
,
mark
,
sizeof
(
state
->
mark
));
return
0
;
success:
return
1
;
}
LOCAL
(
int
)
...
...
@@ -832,6 +882,7 @@ SRE_SEARCH(SRE_STATE* state, SRE_CODE* pattern)
staticforward
PyTypeObject
Pattern_Type
;
staticforward
PyTypeObject
Match_Type
;
staticforward
PyTypeObject
Cursor_Type
;
static
PyObject
*
_compile
(
PyObject
*
self_
,
PyObject
*
args
)
...
...
@@ -841,20 +892,25 @@ _compile(PyObject* self_, PyObject* args)
PatternObject
*
self
;
PyObject
*
pattern
;
int
flags
=
0
;
PyObject
*
code
;
int
groups
=
0
;
PyObject
*
groupindex
=
NULL
;
if
(
!
PyArg_ParseTuple
(
args
,
"OO!|iO"
,
&
pattern
,
&
PyString_Type
,
&
code
,
&
groups
,
&
groupindex
))
if
(
!
PyArg_ParseTuple
(
args
,
"OiO!|iO"
,
&
pattern
,
&
flags
,
&
PyString_Type
,
&
code
,
&
groups
,
&
groupindex
))
return
NULL
;
self
=
PyObject_N
ew
(
PatternObject
,
&
Pattern_Type
);
self
=
PyObject_N
EW
(
PatternObject
,
&
Pattern_Type
);
if
(
self
==
NULL
)
return
NULL
;
Py_INCREF
(
pattern
);
self
->
pattern
=
pattern
;
self
->
flags
=
flags
;
Py_INCREF
(
code
);
self
->
code
=
code
;
...
...
@@ -872,6 +928,69 @@ _getcodesize(PyObject* self_, PyObject* args)
return
Py_BuildValue
(
"i"
,
sizeof
(
SRE_CODE
));
}
LOCAL(PyObject*)
_setup(SRE_STATE* state, PyObject* args)
{
    /* Prepare `state` for matching against the target passed in `args`.

       `args` is parsed as (string[, start[, end]]).  The target must
       expose a single-segment read buffer.  On success the string
       pointers, character size, mark table and backtracking stack
       fields of `state` are (re)initialised and the target object is
       returned; no reference is added to it here.  On failure NULL is
       returned with an exception set. */
    PyObject* string;
    PyBufferProcs* procs;
    void* base;
    int length;
    int slot;
    int start = 0;
    int end = INT_MAX;

    if (!PyArg_ParseTuple(args, "O|ii", &string, &start, &end))
        return NULL;

    /* locate the buffer interface of the target object */
    procs = string->ob_type->tp_as_buffer;
    if (!(procs && procs->bf_getreadbuffer && procs->bf_getsegcount &&
          procs->bf_getsegcount(string, NULL) == 1)) {
        PyErr_SetString(PyExc_TypeError, "expected read-only buffer");
        return NULL;
    }

    /* fetch the buffer base address and its size in bytes */
    length = procs->bf_getreadbuffer(string, 0, &base);
    if (length < 0) {
        /* sanity check */
        PyErr_SetString(PyExc_TypeError, "buffer has negative size");
        return NULL;
    }

    /* character width: Py_UNICODE for unicode objects, else one byte */
    if (PyUnicode_Check(string))
        state->charsize = sizeof(Py_UNICODE);
    else
        state->charsize = 1;

    /* convert the byte count into a character count */
    length /= state->charsize;

    /* clamp both boundaries into [0, length] */
    if (start < 0)
        start = 0;
    else if (start > length)
        start = length;

    if (end < 0)
        end = 0;
    else if (end > length)
        end = length;

    state->beginning = base;
    state->start = (void*) ((char*) base + start * state->charsize);
    state->end = (void*) ((char*) base + end * state->charsize);

    /* FIXME: dynamic! */
    slot = 0;
    while (slot < 64) {
        state->mark[slot] = NULL;
        slot++;
    }

    /* no backtracking stack allocated yet */
    state->stack = NULL;
    state->stackbase = 0;
    state->stacksize = 0;

    return string;
}
static
PyObject
*
_pattern_new_match
(
PatternObject
*
pattern
,
SRE_STATE
*
state
,
PyObject
*
string
,
int
status
)
...
...
@@ -886,7 +1005,7 @@ _pattern_new_match(PatternObject* pattern, SRE_STATE* state,
if
(
status
>
0
)
{
/* create match object (with room for extra group marks) */
match
=
PyObject_N
ewVar
(
MatchObject
,
&
Match_Type
,
2
*
pattern
->
groups
);
match
=
PyObject_N
EW_VAR
(
MatchObject
,
&
Match_Type
,
2
*
pattern
->
groups
);
if
(
match
==
NULL
)
return
NULL
;
...
...
@@ -930,70 +1049,32 @@ _pattern_new_match(PatternObject* pattern, SRE_STATE* state,
return
Py_None
;
}
/* -------------------------------------------------------------------- */
/* pattern methods */
LOCAL
(
PyObject
*
)
_setup
(
SRE_STATE
*
state
,
PyObject
*
args
)
static
PyObject
*
_pattern_cursor
(
PyObject
*
pattern
,
PyObject
*
args
)
{
/* prepare state object */
PyBufferProcs
*
buffer
;
int
i
,
count
;
void
*
ptr
;
/* create search state object */
CursorObject
*
self
;
PyObject
*
string
;
int
start
=
0
;
int
end
=
INT_MAX
;
if
(
!
PyArg_ParseTuple
(
args
,
"O|ii"
,
&
string
,
&
start
,
&
end
))
return
NULL
;
/* get pointer to string buffer */
buffer
=
string
->
ob_type
->
tp_as_buffer
;
if
(
!
buffer
||
!
buffer
->
bf_getreadbuffer
||
!
buffer
->
bf_getsegcount
||
buffer
->
bf_getsegcount
(
string
,
NULL
)
!=
1
)
{
PyErr_SetString
(
PyExc_TypeError
,
"expected read-only buffer"
);
/* create match object (with room for extra group marks) */
self
=
PyObject_NEW
(
CursorObject
,
&
Cursor_Type
);
if
(
self
==
NULL
)
return
NULL
;
}
/* determine buffer size */
count
=
buffer
->
bf_getreadbuffer
(
string
,
0
,
&
ptr
);
if
(
count
<
0
)
{
/* sanity check */
PyErr_SetString
(
PyExc_TypeError
,
"buffer has negative size"
);
string
=
_setup
(
&
self
->
state
,
args
);
if
(
!
string
)
{
/* FIXME: dealloc cursor object */
return
NULL
;
}
/* determine character size */
state
->
charsize
=
(
PyUnicode_Check
(
string
)
?
sizeof
(
Py_UNICODE
)
:
1
);
count
/=
state
->
charsize
;
/* adjust boundaries */
if
(
start
<
0
)
start
=
0
;
else
if
(
start
>
count
)
start
=
count
;
if
(
end
<
0
)
end
=
0
;
else
if
(
end
>
count
)
end
=
count
;
state
->
beginning
=
ptr
;
state
->
start
=
(
void
*
)
((
char
*
)
ptr
+
start
*
state
->
charsize
);
state
->
end
=
(
void
*
)
((
char
*
)
ptr
+
end
*
state
->
charsize
);
/* FIXME: dynamic! */
for
(
i
=
0
;
i
<
64
;
i
++
)
state
->
mark
[
i
]
=
NULL
;
Py_INCREF
(
pattern
);
self
->
pattern
=
pattern
;
state
->
stack
=
NULL
;
state
->
stackbase
=
0
;
state
->
stacksize
=
0
;
Py_INCREF
(
string
);
self
->
string
=
string
;
return
string
;
return
(
PyObject
*
)
self
;
}
static
void
...
...
@@ -1002,7 +1083,7 @@ _pattern_dealloc(PatternObject* self)
Py_XDECREF
(
self
->
code
);
Py_XDECREF
(
self
->
pattern
);
Py_XDECREF
(
self
->
groupindex
);
Py
Object_Del
(
self
);
Py
Mem_DEL
(
self
);
}
static
PyObject
*
...
...
@@ -1052,11 +1133,71 @@ _pattern_search(PatternObject* self, PyObject* args)
}
static
PyObject
*
_pattern_findall
(
PatternObject
*
self
,
PyObject
*
args
)
call(char* function, PyObject* args)
{
    /* Call the attribute named `function` of the Python module "sre"
       with the argument tuple `args`.

       Returns the call's result (a new reference), or NULL with an
       exception set.

       This helper takes ownership of `args`: the caller's reference is
       released on every path, including the error paths.  `args` may be
       NULL (for instance when the caller's Py_BuildValue failed); NULL
       is then returned immediately and the pending exception is left
       untouched. */
    PyObject* name;
    PyObject* module;
    PyObject* func;
    PyObject* result;

    if (!args)
        return NULL;

    name = PyString_FromString("sre");
    if (!name) {
        Py_DECREF(args);
        return NULL;
    }

    module = PyImport_Import(name);
    Py_DECREF(name);
    if (!module) {
        Py_DECREF(args);
        return NULL;
    }

    func = PyObject_GetAttrString(module, function);
    Py_DECREF(module);
    if (!func) {
        Py_DECREF(args);
        return NULL;
    }

    result = PyObject_CallObject(func, args);

    Py_DECREF(func);
    Py_DECREF(args);

    return result;
}
static PyObject*
_pattern_sub(PatternObject* self, PyObject* args)
{
    /* pattern.sub(template, string, count): parse the three required
       arguments and forward them, together with the pattern object, to
       the "_sub" helper reached through call().  Returns whatever the
       helper returns, or NULL with an exception set. */
    PyObject* template;
    PyObject* string;
    PyObject* count;
    PyObject* callargs;

    if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
        return NULL;

    /* Py_BuildValue returns NULL on failure; do not hand that to
       call(), which releases its argument tuple */
    callargs = Py_BuildValue("OOOO", self, template, string, count);
    if (!callargs)
        return NULL;

    /* delegate to Python code */
    return call("_sub", callargs);
}
static PyObject*
_pattern_subn(PatternObject* self, PyObject* args)
{
    /* pattern.subn(template, string, count): parse the three required
       arguments and forward them, together with the pattern object, to
       the "_subn" helper reached through call().  Returns whatever the
       helper returns, or NULL with an exception set. */
    PyObject* template;
    PyObject* string;
    PyObject* count;
    PyObject* callargs;

    if (!PyArg_ParseTuple(args, "OOO", &template, &string, &count))
        return NULL;

    /* Py_BuildValue returns NULL on failure; do not hand that to
       call(), which releases its argument tuple */
    callargs = Py_BuildValue("OOOO", self, template, string, count);
    if (!callargs)
        return NULL;

    /* delegate to Python code */
    return call("_subn", callargs);
}
static PyObject*
_pattern_split(PatternObject* self, PyObject* args)
{
    /* pattern.split(string, maxsplit): parse the two required arguments
       and forward them, together with the pattern object, to the
       "_split" helper reached through call().  Returns whatever the
       helper returns, or NULL with an exception set. */

    /* FIXME: not sure about the semantics here.  this is good enough
       for SXP, though... */
    PyObject* string;
    PyObject* maxsplit;
    PyObject* callargs;

    if (!PyArg_ParseTuple(args, "OO", &string, &maxsplit))
        return NULL;

    /* Py_BuildValue returns NULL on failure; do not hand that to
       call(), which releases its argument tuple */
    callargs = Py_BuildValue("OOO", self, string, maxsplit);
    if (!callargs)
        return NULL;

    /* delegate to Python code */
    return call("_split", callargs);
}
static
PyObject
*
_pattern_findall
(
PatternObject
*
self
,
PyObject
*
args
)
{
SRE_STATE
state
;
PyObject
*
string
;
PyObject
*
list
;
...
...
@@ -1085,6 +1226,10 @@ _pattern_findall(PatternObject* self, PyObject* args)
if
(
status
==
0
)
state
.
ptr
=
(
void
*
)
((
char
*
)
state
.
start
+
1
);
/* FIXME: if one group is defined, slice that group
instead. if multiple groups are defined, add tuple
containing all slices */
item
=
PySequence_GetSlice
(
string
,
((
char
*
)
state
.
start
-
(
char
*
)
state
.
beginning
),
...
...
@@ -1121,7 +1266,12 @@ error:
/* method table for pattern objects; every entry uses the tuple-based
   calling convention (METH_VARARGS has the value 1) */
static PyMethodDef _pattern_methods[] = {
    {"match",   (PyCFunction) _pattern_match,   METH_VARARGS},
    {"search",  (PyCFunction) _pattern_search,  METH_VARARGS},
    {"sub",     (PyCFunction) _pattern_sub,     METH_VARARGS},
    {"subn",    (PyCFunction) _pattern_subn,    METH_VARARGS},
    {"split",   (PyCFunction) _pattern_split,   METH_VARARGS},
    {"findall", (PyCFunction) _pattern_findall, METH_VARARGS},
    /* experimental */
    {"cursor",  (PyCFunction) _pattern_cursor,  METH_VARARGS},
    {NULL, NULL} /* sentinel */
};
...
...
@@ -1143,6 +1293,14 @@ _pattern_getattr(PatternObject* self, char* name)
return
self
->
pattern
;
}
if
(
!
strcmp
(
name
,
"flags"
))
return
Py_BuildValue
(
"i"
,
self
->
flags
);
if
(
!
strcmp
(
name
,
"groupindex"
)
&&
self
->
groupindex
)
{
Py_INCREF
(
self
->
groupindex
);
return
self
->
groupindex
;
}
PyErr_SetString
(
PyExc_AttributeError
,
name
);
return
NULL
;
}
...
...
@@ -1163,7 +1321,7 @@ _match_dealloc(MatchObject* self)
{
Py_XDECREF
(
self
->
string
);
Py_DECREF
(
self
->
pattern
);
Py
Object_Del
(
self
);
Py
Mem_DEL
(
self
);
}
static
PyObject
*
...
...
@@ -1244,6 +1402,8 @@ _match_groups(MatchObject* self, PyObject* args)
PyObject
*
result
;
int
index
;
/* FIXME: <fl> handle default value! */
result
=
PyTuple_New
(
self
->
groups
-
1
);
if
(
!
result
)
return
NULL
;
...
...
@@ -1269,6 +1429,8 @@ _match_groupdict(MatchObject* self, PyObject* args)
PyObject
*
keys
;
int
index
;
/* FIXME: <fl> handle default value! */
result
=
PyDict_New
();
if
(
!
result
)
return
NULL
;
...
...
@@ -1367,7 +1529,8 @@ _match_span(MatchObject* self, PyObject* args)
if
(
self
->
mark
[
index
*
2
]
<
0
)
{
Py_INCREF
(
Py_None
);
return
Py_None
;
Py_INCREF
(
Py_None
);
return
Py_BuildValue
(
"OO"
,
Py_None
,
Py_None
);
}
return
Py_BuildValue
(
"ii"
,
self
->
mark
[
index
*
2
],
self
->
mark
[
index
*
2
+
1
]);
...
...
@@ -1394,24 +1557,20 @@ _match_getattr(MatchObject* self, char* name)
PyErr_Clear
();
/* attributes
!
*/
/* attributes */
if
(
!
strcmp
(
name
,
"string"
))
{
Py_INCREF
(
self
->
string
);
return
self
->
string
;
}
if
(
!
strcmp
(
name
,
"regs"
))
/* FIXME: should return the whole list! */
return
Py_BuildValue
(
"((i,i))"
,
self
->
mark
[
0
],
self
->
mark
[
1
]);
if
(
!
strcmp
(
name
,
"re"
))
{
Py_INCREF
(
self
->
pattern
);
return
(
PyObject
*
)
self
->
pattern
;
}
if
(
!
strcmp
(
name
,
"groupindex"
)
&&
self
->
pattern
->
groupindex
)
{
Py_INCREF
(
self
->
pattern
->
groupindex
);
return
self
->
pattern
->
groupindex
;
}
if
(
!
strcmp
(
name
,
"pos"
))
return
Py_BuildValue
(
"i"
,
0
);
/* FIXME */
if
(
!
strcmp
(
name
,
"endpos"
))
return
Py_BuildValue
(
"i"
,
0
);
/* FIXME */
...
...
@@ -1432,6 +1591,106 @@ statichere PyTypeObject Match_Type = {
(
getattrfunc
)
_match_getattr
,
/*tp_getattr*/
};
/* -------------------------------------------------------------------- */
/* cursor methods (experimental) */
static void
_cursor_dealloc(CursorObject* self)
{
    /* Destructor for cursor objects: free the backtracking stack held
       by the embedded search state, drop the references to the target
       string and the pattern, then release the object's memory. */
    SRE_STATE* state = &self->state;

    _stack_free(state);

    Py_DECREF(self->string);
    Py_DECREF(self->pattern);

    PyMem_DEL(self);
}
static PyObject*
_cursor_match(CursorObject* self, PyObject* args)
{
    /* cursor.match(): run the matcher at the cursor's current start
       position and return the result of _pattern_new_match() for that
       status.  `args` is not used.  The byte-wide or the unicode
       matcher is chosen from the character size stored in the state. */
    SRE_STATE* st = &self->state;
    void* code = PatternObject_GetCode(self->pattern);
    PyObject* result;
    int rc;

    st->ptr = st->start;

    if (st->charsize == 1)
        rc = sre_match(st, code);
    else
        rc = sre_umatch(st, code);

    result = _pattern_new_match(
        (PatternObject*) self->pattern, st, self->string, rc
        );

    /* move the cursor: continue from where the matcher stopped, or
       step one character past it when the matcher reported an error.
       NOTE(review): a status of 0 (no match) also takes the first
       branch, so the cursor may not advance in that case -- confirm
       whether that is intended. */
    st->start = (rc >= 0)
        ? st->ptr
        : (void*) ((char*) st->ptr + st->charsize);

    return result;
}
static PyObject*
_cursor_search(CursorObject* self, PyObject* args)
{
    /* cursor.search(): run the search routine from the cursor's current
       start position and return the result of _pattern_new_match() for
       that status.  `args` is not used.  The byte-wide or the unicode
       search routine is chosen from the character size stored in the
       state. */
    SRE_STATE* st = &self->state;
    void* code = PatternObject_GetCode(self->pattern);
    PyObject* result;
    int rc;

    st->ptr = st->start;

    if (st->charsize == 1)
        rc = sre_search(st, code);
    else
        rc = sre_usearch(st, code);

    result = _pattern_new_match(
        (PatternObject*) self->pattern, st, self->string, rc
        );

    /* unless the engine reported an error, the next call continues
       from where this one stopped */
    if (!(rc < 0))
        st->start = st->ptr;

    return result;
}
/* method table for cursor objects; both entries are registered with
   flag value 0 */
static PyMethodDef _cursor_methods[] = {
    {"match",  (PyCFunction) _cursor_match,  0},
    {"search", (PyCFunction) _cursor_search, 0},
    {NULL, NULL} /* sentinel */
};
static PyObject*
_cursor_getattr(CursorObject* self, char* name)
{
    /* Attribute lookup for cursor objects.  Methods listed in
       _cursor_methods take precedence; otherwise the only attribute is
       "pattern", returned as a new reference.  Any other name raises
       AttributeError. */
    PyObject* method;

    method = Py_FindMethod(_cursor_methods, (PyObject*) self, name);
    if (method != NULL)
        return method;

    /* not a method: discard the lookup error and try attributes */
    PyErr_Clear();

    /* attributes */
    if (strcmp(name, "pattern") != 0) {
        PyErr_SetString(PyExc_AttributeError, name);
        return NULL;
    }

    Py_INCREF(self->pattern);
    return self->pattern;
}
/* type object for cursor objects; ob_type is left NULL here */
statichere PyTypeObject Cursor_Type = {
    PyObject_HEAD_INIT(NULL)
    0,
    "Cursor",
    sizeof(CursorObject), /* size of basic object */
    0,
    (destructor) _cursor_dealloc, /*tp_dealloc*/
    0, /*tp_print*/
    (getattrfunc) _cursor_getattr, /*tp_getattr*/
};
static
PyMethodDef
_functions
[]
=
{
{
"compile"
,
_compile
,
1
},
{
"getcodesize"
,
_getcodesize
,
1
},
...
...
@@ -1445,7 +1704,8 @@ __declspec(dllexport)
init_sre
()
{
/* Patch object types */
Pattern_Type
.
ob_type
=
Match_Type
.
ob_type
=
&
PyType_Type
;
Pattern_Type
.
ob_type
=
Match_Type
.
ob_type
=
Cursor_Type
.
ob_type
=
&
PyType_Type
;
Py_InitModule
(
"_sre"
,
_functions
);
}
...
...
Modules/sre.h
View file @
b1aa1951
...
...
@@ -14,17 +14,18 @@
#include "sre_constants.h"
/* Python objects */
typedef
struct
{
PyObject_HEAD
PyObject
*
code
;
/* link to the code string object */
PyObject
*
pattern
;
/* link to the pattern source (or None) */
int
groups
;
PyObject
*
groupindex
;
/* compatibility */
PyObject
*
pattern
;
/* pattern source (or None) */
int
flags
;
/* flags used when compiling pattern source */
}
PatternObject
;
#define PatternObject_GetCode(o) ((void*) PyString_AS_STRING((o)->code))
#define PatternObject_GetCode(o)\
((void*) PyString_AS_STRING(((PatternObject*)(o))->code))
typedef
struct
{
PyObject_HEAD
...
...
@@ -34,5 +35,28 @@ typedef struct {
int
mark
[
2
];
}
MatchObject
;
#endif
/* matching engine state */
typedef struct {
    /* string pointers */
    void* ptr;       /* current position (also end of current slice) */
    void* beginning; /* start of original string */
    void* start;     /* start of current slice */
    void* end;       /* end of original string */

    /* character size */
    int charsize;

    /* registers */
    int marks; /* NOTE(review): presumably the number of marks in use -- confirm */
    void* mark[64]; /* FIXME: <fl> should be dynamically allocated! */

    /* backtracking stack */
    void** stack;
    int stacksize;
    int stackbase;
} SRE_STATE;
/* cursor object: a pattern, a target string and the engine state that
   ties them together */
typedef struct {
    PyObject_HEAD
    PyObject* pattern; /* pattern object this cursor was created from */
    PyObject* string;  /* target string object */
    SRE_STATE state;   /* embedded matching engine state */
} CursorObject;
#endif
Modules/sre_constants.h
View file @
b1aa1951
/* generated
by
sre_constants.py */
/* generated
from
sre_constants.py */
#define SRE_OP_FAILURE 0
#define SRE_OP_SUCCESS 1
#define SRE_OP_ANY 2
...
...
@@ -25,3 +25,25 @@
#define SRE_OP_NEGATE 23
#define SRE_OP_RANGE 24
#define SRE_OP_REPEAT 25
#define SRE_AT_BEGINNING 0
#define SRE_AT_BEGINNING_LINE 1
#define SRE_AT_BOUNDARY 2
#define SRE_AT_NON_BOUNDARY 3
#define SRE_AT_END 4
#define SRE_AT_END_LINE 5
#define SRE_CATEGORY_DIGIT 0
#define SRE_CATEGORY_NOT_DIGIT 1
#define SRE_CATEGORY_SPACE 2
#define SRE_CATEGORY_NOT_SPACE 3
#define SRE_CATEGORY_WORD 4
#define SRE_CATEGORY_NOT_WORD 5
#define SRE_CATEGORY_LINEBREAK 6
#define SRE_CATEGORY_NOT_LINEBREAK 7
#define SRE_CATEGORY_LOC_DIGIT 8
#define SRE_CATEGORY_LOC_NOT_DIGIT 9
#define SRE_CATEGORY_LOC_SPACE 10
#define SRE_CATEGORY_LOC_NOT_SPACE 11
#define SRE_CATEGORY_LOC_WORD 12
#define SRE_CATEGORY_LOC_NOT_WORD 13
#define SRE_CATEGORY_LOC_LINEBREAK 14
#define SRE_CATEGORY_LOC_NOT_LINEBREAK 15
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment