Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
90a07913
Commit
90a07913
authored
Jun 30, 2000
by
Fredrik Lundh
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
- pedantic: make sure "python -t" doesn't complain...
parent
df02d0b3
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
587 additions
and
587 deletions
+587
-587
Lib/sre.py
Lib/sre.py
+11
-11
Lib/sre_compile.py
Lib/sre_compile.py
+153
-153
Lib/sre_parse.py
Lib/sre_parse.py
+418
-418
Lib/test/test_sre.py
Lib/test/test_sre.py
+5
-5
No files found.
Lib/sre.py
View file @
90a07913
...
...
@@ -98,7 +98,7 @@ def _subn(pattern, template, string, count=0):
if
callable
(
template
):
filter
=
template
else
:
template
=
sre_parse
.
parse_template
(
template
,
pattern
)
template
=
sre_parse
.
parse_template
(
template
,
pattern
)
def
filter
(
match
,
template
=
template
):
return
sre_parse
.
expand_template
(
template
,
match
)
n
=
i
=
0
...
...
@@ -109,11 +109,11 @@ def _subn(pattern, template, string, count=0):
m
=
c
.
search
()
if
not
m
:
break
b
,
e
=
m
.
span
()
b
,
e
=
m
.
span
()
if
i
<
b
:
append
(
string
[
i
:
b
])
append
(
filter
(
m
))
i
=
e
i
=
e
n
=
n
+
1
append
(
string
[
i
:])
return
string
[:
0
].
join
(
s
),
n
...
...
@@ -130,15 +130,15 @@ def _split(pattern, string, maxsplit=0):
m
=
c
.
search
()
if
not
m
:
break
b
,
e
=
m
.
span
()
if
b
==
e
:
if
i
>=
len
(
string
):
break
continue
b
,
e
=
m
.
span
()
if
b
==
e
:
if
i
>=
len
(
string
):
break
continue
append
(
string
[
i
:
b
])
if
g
and
b
!=
e
:
extend
(
m
.
groups
())
i
=
e
if
g
and
b
!=
e
:
extend
(
m
.
groups
())
i
=
e
n
=
n
+
1
append
(
string
[
i
:])
return
s
Lib/sre_compile.py
View file @
90a07913
...
...
@@ -18,7 +18,7 @@ from sre_constants import *
# find an array type code that matches the engine's code size
for
WORDSIZE
in
"BHil"
:
if
len
(
array
.
array
(
WORDSIZE
,
[
0
]).
tostring
())
==
_sre
.
getcodesize
():
break
break
else
:
raise
RuntimeError
,
"cannot find a useable array type"
...
...
@@ -26,132 +26,132 @@ def _compile(code, pattern, flags):
# internal: compile a (sub)pattern
emit
=
code
.
append
for
op
,
av
in
pattern
:
if
op
is
ANY
:
if
flags
&
SRE_FLAG_DOTALL
:
emit
(
OPCODES
[
op
])
else
:
emit
(
OPCODES
[
CATEGORY
])
emit
(
CHCODES
[
CATEGORY_NOT_LINEBREAK
])
elif
op
in
(
SUCCESS
,
FAILURE
):
emit
(
OPCODES
[
op
])
elif
op
is
AT
:
emit
(
OPCODES
[
op
])
if
flags
&
SRE_FLAG_MULTILINE
:
emit
(
ATCODES
[
AT_MULTILINE
[
av
]])
else
:
emit
(
ATCODES
[
av
])
elif
op
is
BRANCH
:
emit
(
OPCODES
[
op
])
tail
=
[]
for
av
in
av
[
1
]:
skip
=
len
(
code
);
emit
(
0
)
_compile
(
code
,
av
,
flags
)
emit
(
OPCODES
[
JUMP
])
tail
.
append
(
len
(
code
));
emit
(
0
)
code
[
skip
]
=
len
(
code
)
-
skip
emit
(
0
)
# end of branch
for
tail
in
tail
:
code
[
tail
]
=
len
(
code
)
-
tail
elif
op
is
CALL
:
emit
(
OPCODES
[
op
])
skip
=
len
(
code
);
emit
(
0
)
_compile
(
code
,
av
,
flags
)
emit
(
OPCODES
[
SUCCESS
])
code
[
skip
]
=
len
(
code
)
-
skip
elif
op
is
CATEGORY
:
emit
(
OPCODES
[
op
])
if
flags
&
SRE_FLAG_LOCALE
:
emit
(
CHCODES
[
CH_LOCALE
[
av
]])
elif
flags
&
SRE_FLAG_UNICODE
:
emit
(
CHCODES
[
CH_UNICODE
[
av
]])
else
:
emit
(
CHCODES
[
av
])
elif
op
is
GROUP
:
if
flags
&
SRE_FLAG_IGNORECASE
:
emit
(
OPCODES
[
OP_IGNORE
[
op
]])
else
:
emit
(
OPCODES
[
op
])
emit
(
av
-
1
)
elif
op
is
IN
:
if
flags
&
SRE_FLAG_IGNORECASE
:
emit
(
OPCODES
[
OP_IGNORE
[
op
]])
def
fixup
(
literal
,
flags
=
flags
):
return
_sre
.
getlower
(
ord
(
literal
),
flags
)
else
:
emit
(
OPCODES
[
op
])
fixup
=
ord
skip
=
len
(
code
);
emit
(
0
)
for
op
,
av
in
av
:
emit
(
OPCODES
[
op
])
if
op
is
NEGATE
:
pass
elif
op
is
LITERAL
:
emit
(
fixup
(
av
))
elif
op
is
RANGE
:
emit
(
fixup
(
av
[
0
]))
emit
(
fixup
(
av
[
1
]))
elif
op
is
CATEGORY
:
if
flags
&
SRE_FLAG_LOCALE
:
emit
(
CHCODES
[
CH_LOCALE
[
av
]])
elif
flags
&
SRE_FLAG_UNICODE
:
emit
(
CHCODES
[
CH_UNICODE
[
av
]])
else
:
emit
(
CHCODES
[
av
])
else
:
raise
error
,
"internal: unsupported set operator"
emit
(
OPCODES
[
FAILURE
])
code
[
skip
]
=
len
(
code
)
-
skip
elif
op
in
(
LITERAL
,
NOT_LITERAL
):
if
flags
&
SRE_FLAG_IGNORECASE
:
emit
(
OPCODES
[
OP_IGNORE
[
op
]])
else
:
emit
(
OPCODES
[
op
])
emit
(
ord
(
av
))
elif
op
is
MARK
:
emit
(
OPCODES
[
op
])
emit
(
av
)
elif
op
in
(
REPEAT
,
MIN_REPEAT
,
MAX_REPEAT
):
if
flags
&
SRE_FLAG_TEMPLATE
:
emit
(
OPCODES
[
REPEAT
])
skip
=
len
(
code
);
emit
(
0
)
emit
(
av
[
0
])
emit
(
av
[
1
])
_compile
(
code
,
av
[
2
],
flags
)
emit
(
OPCODES
[
SUCCESS
])
code
[
skip
]
=
len
(
code
)
-
skip
else
:
lo
,
hi
=
av
[
2
].
getwidth
()
if
lo
==
0
:
raise
error
,
"nothing to repeat"
if
0
and
lo
==
hi
==
1
and
op
is
MAX_REPEAT
:
# FIXME: <fl> need a better way to figure out when
# it's safe to use this one (in the parser, probably)
emit
(
OPCODES
[
MAX_REPEAT_ONE
])
skip
=
len
(
code
);
emit
(
0
)
emit
(
av
[
0
])
emit
(
av
[
1
])
_compile
(
code
,
av
[
2
],
flags
)
emit
(
OPCODES
[
SUCCESS
])
code
[
skip
]
=
len
(
code
)
-
skip
else
:
emit
(
OPCODES
[
op
])
skip
=
len
(
code
);
emit
(
0
)
emit
(
av
[
0
])
emit
(
av
[
1
])
_compile
(
code
,
av
[
2
],
flags
)
emit
(
OPCODES
[
SUCCESS
])
code
[
skip
]
=
len
(
code
)
-
skip
elif
op
is
SUBPATTERN
:
group
=
av
[
0
]
if
group
:
emit
(
OPCODES
[
MARK
])
emit
((
group
-
1
)
*
2
)
_compile
(
code
,
av
[
1
],
flags
)
if
group
:
emit
(
OPCODES
[
MARK
])
emit
((
group
-
1
)
*
2
+
1
)
else
:
raise
ValueError
,
(
"unsupported operand type"
,
op
)
if
op
is
ANY
:
if
flags
&
SRE_FLAG_DOTALL
:
emit
(
OPCODES
[
op
])
else
:
emit
(
OPCODES
[
CATEGORY
])
emit
(
CHCODES
[
CATEGORY_NOT_LINEBREAK
])
elif
op
in
(
SUCCESS
,
FAILURE
):
emit
(
OPCODES
[
op
])
elif
op
is
AT
:
emit
(
OPCODES
[
op
])
if
flags
&
SRE_FLAG_MULTILINE
:
emit
(
ATCODES
[
AT_MULTILINE
[
av
]])
else
:
emit
(
ATCODES
[
av
])
elif
op
is
BRANCH
:
emit
(
OPCODES
[
op
])
tail
=
[]
for
av
in
av
[
1
]:
skip
=
len
(
code
);
emit
(
0
)
_compile
(
code
,
av
,
flags
)
emit
(
OPCODES
[
JUMP
])
tail
.
append
(
len
(
code
));
emit
(
0
)
code
[
skip
]
=
len
(
code
)
-
skip
emit
(
0
)
# end of branch
for
tail
in
tail
:
code
[
tail
]
=
len
(
code
)
-
tail
elif
op
is
CALL
:
emit
(
OPCODES
[
op
])
skip
=
len
(
code
);
emit
(
0
)
_compile
(
code
,
av
,
flags
)
emit
(
OPCODES
[
SUCCESS
])
code
[
skip
]
=
len
(
code
)
-
skip
elif
op
is
CATEGORY
:
emit
(
OPCODES
[
op
])
if
flags
&
SRE_FLAG_LOCALE
:
emit
(
CHCODES
[
CH_LOCALE
[
av
]])
elif
flags
&
SRE_FLAG_UNICODE
:
emit
(
CHCODES
[
CH_UNICODE
[
av
]])
else
:
emit
(
CHCODES
[
av
])
elif
op
is
GROUP
:
if
flags
&
SRE_FLAG_IGNORECASE
:
emit
(
OPCODES
[
OP_IGNORE
[
op
]])
else
:
emit
(
OPCODES
[
op
])
emit
(
av
-
1
)
elif
op
is
IN
:
if
flags
&
SRE_FLAG_IGNORECASE
:
emit
(
OPCODES
[
OP_IGNORE
[
op
]])
def
fixup
(
literal
,
flags
=
flags
):
return
_sre
.
getlower
(
ord
(
literal
),
flags
)
else
:
emit
(
OPCODES
[
op
])
fixup
=
ord
skip
=
len
(
code
);
emit
(
0
)
for
op
,
av
in
av
:
emit
(
OPCODES
[
op
])
if
op
is
NEGATE
:
pass
elif
op
is
LITERAL
:
emit
(
fixup
(
av
))
elif
op
is
RANGE
:
emit
(
fixup
(
av
[
0
]))
emit
(
fixup
(
av
[
1
]))
elif
op
is
CATEGORY
:
if
flags
&
SRE_FLAG_LOCALE
:
emit
(
CHCODES
[
CH_LOCALE
[
av
]])
elif
flags
&
SRE_FLAG_UNICODE
:
emit
(
CHCODES
[
CH_UNICODE
[
av
]])
else
:
emit
(
CHCODES
[
av
])
else
:
raise
error
,
"internal: unsupported set operator"
emit
(
OPCODES
[
FAILURE
])
code
[
skip
]
=
len
(
code
)
-
skip
elif
op
in
(
LITERAL
,
NOT_LITERAL
):
if
flags
&
SRE_FLAG_IGNORECASE
:
emit
(
OPCODES
[
OP_IGNORE
[
op
]])
else
:
emit
(
OPCODES
[
op
])
emit
(
ord
(
av
))
elif
op
is
MARK
:
emit
(
OPCODES
[
op
])
emit
(
av
)
elif
op
in
(
REPEAT
,
MIN_REPEAT
,
MAX_REPEAT
):
if
flags
&
SRE_FLAG_TEMPLATE
:
emit
(
OPCODES
[
REPEAT
])
skip
=
len
(
code
);
emit
(
0
)
emit
(
av
[
0
])
emit
(
av
[
1
])
_compile
(
code
,
av
[
2
],
flags
)
emit
(
OPCODES
[
SUCCESS
])
code
[
skip
]
=
len
(
code
)
-
skip
else
:
lo
,
hi
=
av
[
2
].
getwidth
()
if
lo
==
0
:
raise
error
,
"nothing to repeat"
if
0
and
lo
==
hi
==
1
and
op
is
MAX_REPEAT
:
# FIXME: <fl> need a better way to figure out when
# it's safe to use this one (in the parser, probably)
emit
(
OPCODES
[
MAX_REPEAT_ONE
])
skip
=
len
(
code
);
emit
(
0
)
emit
(
av
[
0
])
emit
(
av
[
1
])
_compile
(
code
,
av
[
2
],
flags
)
emit
(
OPCODES
[
SUCCESS
])
code
[
skip
]
=
len
(
code
)
-
skip
else
:
emit
(
OPCODES
[
op
])
skip
=
len
(
code
);
emit
(
0
)
emit
(
av
[
0
])
emit
(
av
[
1
])
_compile
(
code
,
av
[
2
],
flags
)
emit
(
OPCODES
[
SUCCESS
])
code
[
skip
]
=
len
(
code
)
-
skip
elif
op
is
SUBPATTERN
:
group
=
av
[
0
]
if
group
:
emit
(
OPCODES
[
MARK
])
emit
((
group
-
1
)
*
2
)
_compile
(
code
,
av
[
1
],
flags
)
if
group
:
emit
(
OPCODES
[
MARK
])
emit
((
group
-
1
)
*
2
+
1
)
else
:
raise
ValueError
,
(
"unsupported operand type"
,
op
)
def
_compile_info
(
code
,
pattern
,
flags
):
# internal: compile an info block. in the current version,
...
...
@@ -159,15 +159,15 @@ def _compile_info(code, pattern, flags):
# if any
lo
,
hi
=
pattern
.
getwidth
()
if
lo
==
0
:
return
# not worth it
return
# not worth it
# look for a literal prefix
prefix
=
[]
if
not
(
flags
&
SRE_FLAG_IGNORECASE
):
for
op
,
av
in
pattern
.
data
:
if
op
is
LITERAL
:
prefix
.
append
(
ord
(
av
))
else
:
break
for
op
,
av
in
pattern
.
data
:
if
op
is
LITERAL
:
prefix
.
append
(
ord
(
av
))
else
:
break
# add an info block
emit
=
code
.
append
emit
(
OPCODES
[
INFO
])
...
...
@@ -175,25 +175,25 @@ def _compile_info(code, pattern, flags):
# literal flag
mask
=
0
if
len
(
prefix
)
==
len
(
pattern
.
data
):
mask
=
1
mask
=
1
emit
(
mask
)
# pattern length
emit
(
lo
)
if
hi
<
32768
:
emit
(
hi
)
emit
(
hi
)
else
:
emit
(
0
)
emit
(
0
)
# add literal prefix
emit
(
len
(
prefix
))
if
prefix
:
code
.
extend
(
prefix
)
# generate overlap table
table
=
[
-
1
]
+
([
0
]
*
len
(
prefix
))
for
i
in
range
(
len
(
prefix
)):
table
[
i
+
1
]
=
table
[
i
]
+
1
while
table
[
i
+
1
]
>
0
and
prefix
[
i
]
!=
prefix
[
table
[
i
+
1
]
-
1
]:
table
[
i
+
1
]
=
table
[
table
[
i
+
1
]
-
1
]
+
1
code
.
extend
(
table
[
1
:])
# don't store first entry
code
.
extend
(
prefix
)
# generate overlap table
table
=
[
-
1
]
+
([
0
]
*
len
(
prefix
))
for
i
in
range
(
len
(
prefix
)):
table
[
i
+
1
]
=
table
[
i
]
+
1
while
table
[
i
+
1
]
>
0
and
prefix
[
i
]
!=
prefix
[
table
[
i
+
1
]
-
1
]:
table
[
i
+
1
]
=
table
[
table
[
i
+
1
]
-
1
]
+
1
code
.
extend
(
table
[
1
:])
# don't store first entry
code
[
skip
]
=
len
(
code
)
-
skip
def
compile
(
p
,
flags
=
0
):
...
...
@@ -201,11 +201,11 @@ def compile(p, flags=0):
# compile, as necessary
if
type
(
p
)
in
(
type
(
""
),
type
(
u""
)):
import
sre_parse
pattern
=
p
p
=
sre_parse
.
parse
(
p
)
import
sre_parse
pattern
=
p
p
=
sre_parse
.
parse
(
p
)
else
:
pattern
=
None
pattern
=
None
flags
=
p
.
pattern
.
flags
|
flags
code
=
[]
...
...
@@ -220,10 +220,10 @@ def compile(p, flags=0):
# FIXME: <fl> get rid of this limitation!
assert
p
.
pattern
.
groups
<=
100
,
\
"sorry, but this version only supports 100 named groups"
"sorry, but this version only supports 100 named groups"
return
_sre
.
compile
(
pattern
,
flags
,
array
.
array
(
WORDSIZE
,
code
).
tostring
(),
p
.
pattern
.
groups
-
1
,
p
.
pattern
.
groupdict
)
pattern
,
flags
,
array
.
array
(
WORDSIZE
,
code
).
tostring
(),
p
.
pattern
.
groups
-
1
,
p
.
pattern
.
groupdict
)
Lib/sre_parse.py
View file @
90a07913
...
...
@@ -67,106 +67,106 @@ FLAGS = {
class State:
def __init__(self):
self.flags = 0
self.groups = 1
self.groupdict = {}
self.flags = 0
self.groups = 1
self.groupdict = {}
def getgroup(self, name=None):
gid = self.groups
self.groups = gid + 1
if name:
self.groupdict[name] = gid
return gid
gid = self.groups
self.groups = gid + 1
if name:
self.groupdict[name] = gid
return gid
class SubPattern:
# a subpattern, in intermediate form
def __init__(self, pattern, data=None):
self.pattern = pattern
if not data:
data = []
self.data = data
self.width = None
self.pattern = pattern
if not data:
data = []
self.data = data
self.width = None
def __repr__(self):
return repr(self.data)
return repr(self.data)
def __len__(self):
return len(self.data)
return len(self.data)
def __delitem__(self, index):
del self.data[index]
del self.data[index]
def __getitem__(self, index):
return self.data[index]
return self.data[index]
def __setitem__(self, index, code):
self.data[index] = code
self.data[index] = code
def __getslice__(self, start, stop):
return SubPattern(self.pattern, self.data[start:stop])
return SubPattern(self.pattern, self.data[start:stop])
def insert(self, index, code):
self.data.insert(index, code)
self.data.insert(index, code)
def append(self, code):
self.data.append(code)
self.data.append(code)
def getwidth(self):
# determine the width (min, max) for this subpattern
if self.width:
return self.width
lo = hi = 0L
for op, av in self.data:
if op is BRANCH:
l = sys.maxint
h = 0
for av in av[1]:
i, j = av.getwidth()
l = min(l, i)
h = min(h, j)
lo = lo + i
hi = hi + j
elif op is CALL:
i, j = av.getwidth()
lo = lo + i
hi = hi + j
elif op is SUBPATTERN:
i, j = av[1].getwidth()
lo = lo + i
hi = hi + j
elif op in (MIN_REPEAT, MAX_REPEAT):
i, j = av[2].getwidth()
lo = lo + long(i) * av[0]
hi = hi + long(j) * av[1]
elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
lo = lo + 1
hi = hi + 1
elif op == SUCCESS:
break
self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
return self.width
# determine the width (min, max) for this subpattern
if self.width:
return self.width
lo = hi = 0L
for op, av in self.data:
if op is BRANCH:
l = sys.maxint
h = 0
for av in av[1]:
i, j = av.getwidth()
l = min(l, i)
h = min(h, j)
lo = lo + i
hi = hi + j
elif op is CALL:
i, j = av.getwidth()
lo = lo + i
hi = hi + j
elif op is SUBPATTERN:
i, j = av[1].getwidth()
lo = lo + i
hi = hi + j
elif op in (MIN_REPEAT, MAX_REPEAT):
i, j = av[2].getwidth()
lo = lo + long(i) * av[0]
hi = hi + long(j) * av[1]
elif op in (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY):
lo = lo + 1
hi = hi + 1
elif op == SUCCESS:
break
self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))
return self.width
class Tokenizer:
def __init__(self, string):
self.index = 0
self.string = string
self.next = self.__next()
self.index = 0
self.string = string
self.next = self.__next()
def __next(self):
if self.index >= len(self.string):
return None
char = self.string[self.index]
if char[0] == "
\\
":
try:
c = self.string[self.index + 1]
except IndexError:
raise error, "
bogus
escape
"
char = char + c
self.index = self.index + len(char)
return char
if self.index >= len(self.string):
return None
char = self.string[self.index]
if char[0] == "
\\
":
try:
c = self.string[self.index + 1]
except IndexError:
raise error, "
bogus
escape
"
char = char + c
self.index = self.index + len(char)
return char
def match(self, char):
if char == self.next:
self.next = self.__next()
return 1
return 0
if char == self.next:
self.next = self.__next()
return 1
return 0
def match_set(self, set):
if self.next and self.next in set:
self.next = self.__next()
return 1
return 0
if self.next and self.next in set:
self.next = self.__next()
return 1
return 0
def get(self):
this = self.next
self.next = self.__next()
return this
this = self.next
self.next = self.__next()
return this
def isident(char):
return "
a
" <= char <= "
z
" or "
A
" <= char <= "
Z
" or char == "
_
"
...
...
@@ -180,83 +180,83 @@ def isname(name):
# expression instead, but I seem to have certain bootstrapping
# problems here ;-)
if not isident(name[0]):
return 0
return 0
for char in name:
if not isident(char) and not isdigit(char):
return 0
if not isident(char) and not isdigit(char):
return 0
return 1
def _group(escape, groups):
# check if the escape string represents a valid group
try:
group = int(escape[1:])
if group and group < groups:
return group
group = int(escape[1:])
if group and group < groups:
return group
except ValueError:
pass
pass
return None # not a valid group
def _class_escape(source, escape):
# handle escape code inside character class
code = ESCAPES.get(escape)
if code:
return code
return code
code = CATEGORIES.get(escape)
if code:
return code
return code
try:
if escape[1:2] == "
x
":
while source.next in HEXDIGITS:
escape = escape + source.get()
escape = escape[2:]
# FIXME: support unicode characters!
return LITERAL, chr(int(escape[-4:], 16) & 0xff)
elif str(escape[1:2]) in OCTDIGITS:
while source.next in OCTDIGITS:
escape = escape + source.get()
escape = escape[1:]
# FIXME: support unicode characters!
return LITERAL, chr(int(escape[-6:], 8) & 0xff)
if len(escape) == 2:
return LITERAL, escape[1]
if escape[1:2] == "
x
":
while source.next in HEXDIGITS:
escape = escape + source.get()
escape = escape[2:]
# FIXME: support unicode characters!
return LITERAL, chr(int(escape[-4:], 16) & 0xff)
elif str(escape[1:2]) in OCTDIGITS:
while source.next in OCTDIGITS:
escape = escape + source.get()
escape = escape[1:]
# FIXME: support unicode characters!
return LITERAL, chr(int(escape[-6:], 8) & 0xff)
if len(escape) == 2:
return LITERAL, escape[1]
except ValueError:
pass
pass
raise error, "
bogus
escape
:
%
s
" % repr(escape)
def _escape(source, escape, state):
# handle escape code in expression
code = CATEGORIES.get(escape)
if code:
return code
return code
code = ESCAPES.get(escape)
if code:
return code
return code
try:
if escape[1:2] == "
x
":
while source.next in HEXDIGITS:
escape = escape + source.get()
escape = escape[2:]
# FIXME: support unicode characters!
return LITERAL, chr(int(escape[-4:], 16) & 0xff)
elif escape[1:2] in DIGITS:
while 1:
group = _group(escape, state.groups)
if group:
if (not source.next or
not _group(escape + source.next, state.groups)):
return GROUP, group
escape = escape + source.get()
elif source.next in OCTDIGITS:
escape = escape + source.get()
else:
break
escape = escape[1:]
# FIXME: support unicode characters!
return LITERAL, chr(int(escape[-6:], 8) & 0xff)
if len(escape) == 2:
return LITERAL, escape[1]
if escape[1:2] == "
x
":
while source.next in HEXDIGITS:
escape = escape + source.get()
escape = escape[2:]
# FIXME: support unicode characters!
return LITERAL, chr(int(escape[-4:], 16) & 0xff)
elif escape[1:2] in DIGITS:
while 1:
group = _group(escape, state.groups)
if group:
if (not source.next or
not _group(escape + source.next, state.groups)):
return GROUP, group
escape = escape + source.get()
elif source.next in OCTDIGITS:
escape = escape + source.get()
else:
break
escape = escape[1:]
# FIXME: support unicode characters!
return LITERAL, chr(int(escape[-6:], 8) & 0xff)
if len(escape) == 2:
return LITERAL, escape[1]
except ValueError:
pass
pass
raise error, "
bogus
escape
:
%
s
" % repr(escape)
...
...
@@ -268,35 +268,35 @@ def _branch(pattern, items):
# check if all items share a common prefix
while 1:
prefix = None
for item in items:
if not item:
break
if prefix is None:
prefix = item[0]
elif item[0] != prefix:
break
else:
# all subitems start with a common "
prefix
".
# move it out of the branch
for item in items:
del item[0]
subpattern.append(prefix)
continue # check next one
break
prefix = None
for item in items:
if not item:
break
if prefix is None:
prefix = item[0]
elif item[0] != prefix:
break
else:
# all subitems start with a common "
prefix
".
# move it out of the branch
for item in items:
del item[0]
subpattern.append(prefix)
continue # check next one
break
# check if the branch can be replaced by a character set
for item in items:
if len(item) != 1 or item[0][0] != LITERAL:
break
if len(item) != 1 or item[0][0] != LITERAL:
break
else:
# we can store this as a character set instead of a
# branch (FIXME: use a range if possible)
set = []
for item in items:
set.append(item[0])
subpattern.append((IN, set))
return subpattern
# we can store this as a character set instead of a
# branch (FIXME: use a range if possible)
set = []
for item in items:
set.append(item[0])
subpattern.append((IN, set))
return subpattern
subpattern.append((BRANCH, (None, items)))
return subpattern
...
...
@@ -309,197 +309,197 @@ def _parse(source, state, flags=0):
while 1:
if source.next in ("
|
", "
)
"):
break # end of subpattern
this = source.get()
if this is None:
break # end of pattern
if state.flags & SRE_FLAG_VERBOSE:
# skip whitespace and comments
if this in WHITESPACE:
continue
if this == "
#":
while
1
:
this
=
source
.
get
()
if
this
in
(
None
,
"
\
n
"
):
break
continue
if
this
and
this
[
0
]
not
in
SPECIAL_CHARS
:
subpattern
.
append
((
LITERAL
,
this
))
elif
this
==
"["
:
# character set
set
=
[]
##
if source.match(":"):
##
pass # handle character classes
if
source
.
match
(
"^"
):
set
.
append
((
NEGATE
,
None
))
# check remaining characters
start
=
set
[:]
while
1
:
this
=
source
.
get
()
if
this
==
"]"
and
set
!=
start
:
break
elif
this
and
this
[
0
]
==
"
\
\
"
:
code1
=
_class_escape
(
source
,
this
)
elif
this
:
code1
=
LITERAL
,
this
else
:
raise
error
,
"unexpected end of regular expression"
if
source
.
match
(
"-"
):
# potential range
this
=
source
.
get
()
if
this
==
"]"
:
set
.
append
(
code1
)
set
.
append
((
LITERAL
,
"-"
))
break
else
:
if
this
[
0
]
==
"
\
\
"
:
code2
=
_class_escape
(
source
,
this
)
else
:
code2
=
LITERAL
,
this
if
code1
[
0
]
!=
LITERAL
or
code2
[
0
]
!=
LITERAL
:
raise
error
,
"illegal range"
if
len
(
code1
[
1
])
!=
1
or
len
(
code2
[
1
])
!=
1
:
raise
error
,
"illegal range"
set
.
append
((
RANGE
,
(
code1
[
1
],
code2
[
1
])))
else
:
if
code1
[
0
]
is
IN
:
code1
=
code1
[
1
][
0
]
set
.
append
(
code1
)
# FIXME: <fl> move set optimization to compiler!
if
len
(
set
)
==
1
and
set
[
0
][
0
]
is
LITERAL
:
subpattern
.
append
(
set
[
0
])
# optimization
elif
len
(
set
)
==
2
and
set
[
0
][
0
]
is
NEGATE
and
set
[
1
][
0
]
is
LITERAL
:
subpattern
.
append
((
NOT_LITERAL
,
set
[
1
][
1
]))
# optimization
else
:
# FIXME: <fl> add charmap optimization
subpattern
.
append
((
IN
,
set
))
elif
this
and
this
[
0
]
in
REPEAT_CHARS
:
# repeat previous item
if
this
==
"?"
:
min
,
max
=
0
,
1
elif
this
==
"*"
:
min
,
max
=
0
,
MAXREPEAT
elif
this
==
"+"
:
min
,
max
=
1
,
MAXREPEAT
elif
this
==
"{"
:
min
,
max
=
0
,
MAXREPEAT
lo
=
hi
=
""
while
source
.
next
in
DIGITS
:
lo
=
lo
+
source
.
get
()
if
source
.
match
(
","
):
while
source
.
next
in
DIGITS
:
hi
=
hi
+
source
.
get
()
else
:
hi
=
lo
if
not
source
.
match
(
"}"
):
raise
error
,
"bogus range"
if
lo
:
min
=
int
(
lo
)
if
hi
:
max
=
int
(
hi
)
# FIXME: <fl> check that hi >= lo!
else
:
raise
error
,
"not supported"
# figure out which item to repeat
if
subpattern
:
item
=
subpattern
[
-
1
:]
else
:
raise
error
,
"nothing to repeat"
if
source
.
match
(
"?"
):
subpattern
[
-
1
]
=
(
MIN_REPEAT
,
(
min
,
max
,
item
))
else
:
subpattern
[
-
1
]
=
(
MAX_REPEAT
,
(
min
,
max
,
item
))
elif
this
==
"."
:
subpattern
.
append
((
ANY
,
None
))
elif
this
==
"("
:
group
=
1
name
=
None
if
source
.
match
(
"?"
):
group
=
0
# options
if
source
.
match
(
"P"
):
# python extensions
if
source
.
match
(
"<"
):
# named group: skip forward to end of name
name
=
""
while
1
:
char
=
source
.
get
()
if
char
is
None
:
raise
error
,
"unterminated name"
if
char
==
">"
:
break
name
=
name
+
char
group
=
1
if
not
isname
(
name
):
raise
error
,
"illegal character in group name"
elif
source
.
match
(
"="
):
# named backreference
raise
error
,
"not yet implemented"
else
:
char
=
source
.
get
()
if
char
is
None
:
raise
error
,
"unexpected end of pattern"
raise
error
,
"unknown specifier: ?P%s"
%
char
elif
source
.
match
(
":"
):
# non-capturing group
group
=
2
elif
source
.
match
(
"#"
):
# comment
while
1
:
if
source
.
next
is
None
or
source
.
next
==
")"
:
break
source
.
get
()
else
:
# flags
while
FLAGS
.
has_key
(
source
.
next
):
state
.
flags
=
state
.
flags
|
FLAGS
[
source
.
get
()]
if
group
:
# parse group contents
b
=
[]
if
group
==
2
:
# anonymous group
group
=
None
else
:
group
=
state
.
getgroup
(
name
)
while
1
:
p
=
_parse
(
source
,
state
,
flags
)
if
source
.
match
(
")"
):
if
b
:
b
.
append
(
p
)
p
=
_branch
(
state
,
b
)
subpattern
.
append
((
SUBPATTERN
,
(
group
,
p
)))
break
elif
source
.
match
(
"|"
):
b
.
append
(
p
)
else
:
raise
error
,
"group not properly closed"
else
:
while
1
:
char
=
source
.
get
()
if
char
is
None
or
char
==
")"
:
break
raise
error
,
"unknown extension"
elif
this
==
"^"
:
subpattern
.
append
((
AT
,
AT_BEGINNING
))
elif
this
==
"$"
:
subpattern
.
append
((
AT
,
AT_END
))
elif
this
and
this
[
0
]
==
"
\
\
"
:
code
=
_escape
(
source
,
this
,
state
)
subpattern
.
append
(
code
)
else
:
raise
error
,
"parser error"
if source.next in ("
|
", "
)
"):
break # end of subpattern
this = source.get()
if this is None:
break # end of pattern
if state.flags & SRE_FLAG_VERBOSE:
# skip whitespace and comments
if this in WHITESPACE:
continue
if this == "
#":
while
1
:
this
=
source
.
get
()
if
this
in
(
None
,
"
\
n
"
):
break
continue
if
this
and
this
[
0
]
not
in
SPECIAL_CHARS
:
subpattern
.
append
((
LITERAL
,
this
))
elif
this
==
"["
:
# character set
set
=
[]
##
if source.match(":"):
##
pass # handle character classes
if
source
.
match
(
"^"
):
set
.
append
((
NEGATE
,
None
))
# check remaining characters
start
=
set
[:]
while
1
:
this
=
source
.
get
()
if
this
==
"]"
and
set
!=
start
:
break
elif
this
and
this
[
0
]
==
"
\
\
"
:
code1
=
_class_escape
(
source
,
this
)
elif
this
:
code1
=
LITERAL
,
this
else
:
raise
error
,
"unexpected end of regular expression"
if
source
.
match
(
"-"
):
# potential range
this
=
source
.
get
()
if
this
==
"]"
:
set
.
append
(
code1
)
set
.
append
((
LITERAL
,
"-"
))
break
else
:
if
this
[
0
]
==
"
\
\
"
:
code2
=
_class_escape
(
source
,
this
)
else
:
code2
=
LITERAL
,
this
if
code1
[
0
]
!=
LITERAL
or
code2
[
0
]
!=
LITERAL
:
raise
error
,
"illegal range"
if
len
(
code1
[
1
])
!=
1
or
len
(
code2
[
1
])
!=
1
:
raise
error
,
"illegal range"
set
.
append
((
RANGE
,
(
code1
[
1
],
code2
[
1
])))
else
:
if
code1
[
0
]
is
IN
:
code1
=
code1
[
1
][
0
]
set
.
append
(
code1
)
# FIXME: <fl> move set optimization to compiler!
if
len
(
set
)
==
1
and
set
[
0
][
0
]
is
LITERAL
:
subpattern
.
append
(
set
[
0
])
# optimization
elif
len
(
set
)
==
2
and
set
[
0
][
0
]
is
NEGATE
and
set
[
1
][
0
]
is
LITERAL
:
subpattern
.
append
((
NOT_LITERAL
,
set
[
1
][
1
]))
# optimization
else
:
# FIXME: <fl> add charmap optimization
subpattern
.
append
((
IN
,
set
))
elif
this
and
this
[
0
]
in
REPEAT_CHARS
:
# repeat previous item
if
this
==
"?"
:
min
,
max
=
0
,
1
elif
this
==
"*"
:
min
,
max
=
0
,
MAXREPEAT
elif
this
==
"+"
:
min
,
max
=
1
,
MAXREPEAT
elif
this
==
"{"
:
min
,
max
=
0
,
MAXREPEAT
lo
=
hi
=
""
while
source
.
next
in
DIGITS
:
lo
=
lo
+
source
.
get
()
if
source
.
match
(
","
):
while
source
.
next
in
DIGITS
:
hi
=
hi
+
source
.
get
()
else
:
hi
=
lo
if
not
source
.
match
(
"}"
):
raise
error
,
"bogus range"
if
lo
:
min
=
int
(
lo
)
if
hi
:
max
=
int
(
hi
)
# FIXME: <fl> check that hi >= lo!
else
:
raise
error
,
"not supported"
# figure out which item to repeat
if
subpattern
:
item
=
subpattern
[
-
1
:]
else
:
raise
error
,
"nothing to repeat"
if
source
.
match
(
"?"
):
subpattern
[
-
1
]
=
(
MIN_REPEAT
,
(
min
,
max
,
item
))
else
:
subpattern
[
-
1
]
=
(
MAX_REPEAT
,
(
min
,
max
,
item
))
elif
this
==
"."
:
subpattern
.
append
((
ANY
,
None
))
elif
this
==
"("
:
group
=
1
name
=
None
if
source
.
match
(
"?"
):
group
=
0
# options
if
source
.
match
(
"P"
):
# python extensions
if
source
.
match
(
"<"
):
# named group: skip forward to end of name
name
=
""
while
1
:
char
=
source
.
get
()
if
char
is
None
:
raise
error
,
"unterminated name"
if
char
==
">"
:
break
name
=
name
+
char
group
=
1
if
not
isname
(
name
):
raise
error
,
"illegal character in group name"
elif
source
.
match
(
"="
):
# named backreference
raise
error
,
"not yet implemented"
else
:
char
=
source
.
get
()
if
char
is
None
:
raise
error
,
"unexpected end of pattern"
raise
error
,
"unknown specifier: ?P%s"
%
char
elif
source
.
match
(
":"
):
# non-capturing group
group
=
2
elif
source
.
match
(
"#"
):
# comment
while
1
:
if
source
.
next
is
None
or
source
.
next
==
")"
:
break
source
.
get
()
else
:
# flags
while
FLAGS
.
has_key
(
source
.
next
):
state
.
flags
=
state
.
flags
|
FLAGS
[
source
.
get
()]
if
group
:
# parse group contents
b
=
[]
if
group
==
2
:
# anonymous group
group
=
None
else
:
group
=
state
.
getgroup
(
name
)
while
1
:
p
=
_parse
(
source
,
state
,
flags
)
if
source
.
match
(
")"
):
if
b
:
b
.
append
(
p
)
p
=
_branch
(
state
,
b
)
subpattern
.
append
((
SUBPATTERN
,
(
group
,
p
)))
break
elif
source
.
match
(
"|"
):
b
.
append
(
p
)
else
:
raise
error
,
"group not properly closed"
else
:
while
1
:
char
=
source
.
get
()
if
char
is
None
or
char
==
")"
:
break
raise
error
,
"unknown extension"
elif
this
==
"^"
:
subpattern
.
append
((
AT
,
AT_BEGINNING
))
elif
this
==
"$"
:
subpattern
.
append
((
AT
,
AT_END
))
elif
this
and
this
[
0
]
==
"
\
\
"
:
code
=
_escape
(
source
,
this
,
state
)
subpattern
.
append
(
code
)
else
:
raise
error
,
"parser error"
return
subpattern
...
...
@@ -509,19 +509,19 @@ def parse(pattern, flags=0):
state
=
State
()
b
=
[]
while
1
:
p
=
_parse
(
source
,
state
,
flags
)
tail
=
source
.
get
()
if
tail
==
"|"
:
b
.
append
(
p
)
elif
tail
==
")"
:
raise
error
,
"unbalanced parenthesis"
elif
tail
is
None
:
if
b
:
b
.
append
(
p
)
p
=
_branch
(
state
,
b
)
break
else
:
raise
error
,
"bogus characters at end of regular expression"
p
=
_parse
(
source
,
state
,
flags
)
tail
=
source
.
get
()
if
tail
==
"|"
:
b
.
append
(
p
)
elif
tail
==
")"
:
raise
error
,
"unbalanced parenthesis"
elif
tail
is
None
:
if
b
:
b
.
append
(
p
)
p
=
_branch
(
state
,
b
)
break
else
:
raise
error
,
"bogus characters at end of regular expression"
return
p
def
parse_template
(
source
,
pattern
):
...
...
@@ -531,59 +531,59 @@ def parse_template(source, pattern):
p
=
[]
a
=
p
.
append
while
1
:
this
=
s
.
get
()
if
this
is
None
:
break
# end of replacement string
if
this
and
this
[
0
]
==
"
\
\
"
:
# group
if
this
==
"
\
\
g"
:
name
=
""
if
s
.
match
(
"<"
):
while
1
:
char
=
s
.
get
()
if
char
is
None
:
raise
error
,
"unterminated group name"
if
char
==
">"
:
break
name
=
name
+
char
if
not
name
:
raise
error
,
"bad group name"
try
:
index
=
int
(
name
)
except
ValueError
:
if
not
isname
(
name
):
raise
error
,
"illegal character in group name"
try
:
index
=
pattern
.
groupindex
[
name
]
except
KeyError
:
raise
IndexError
,
"unknown group name"
a
((
MARK
,
index
))
elif
len
(
this
)
>
1
and
this
[
1
]
in
DIGITS
:
code
=
None
while
1
:
group
=
_group
(
this
,
pattern
.
groups
+
1
)
if
group
:
if
(
not
s
.
next
or
not
_group
(
this
+
s
.
next
,
pattern
.
groups
+
1
)):
code
=
MARK
,
int
(
group
)
break
elif
s
.
next
in
OCTDIGITS
:
this
=
this
+
s
.
get
()
else
:
break
if
not
code
:
this
=
this
[
1
:]
# FIXME: support unicode characters!
code
=
LITERAL
,
chr
(
int
(
this
[
-
6
:],
8
)
&
0xff
)
a
(
code
)
else
:
try
:
a
(
ESCAPES
[
this
])
except
KeyError
:
for
c
in
this
:
a
((
LITERAL
,
c
))
else
:
a
((
LITERAL
,
this
))
this
=
s
.
get
()
if
this
is
None
:
break
# end of replacement string
if
this
and
this
[
0
]
==
"
\
\
"
:
# group
if
this
==
"
\
\
g"
:
name
=
""
if
s
.
match
(
"<"
):
while
1
:
char
=
s
.
get
()
if
char
is
None
:
raise
error
,
"unterminated group name"
if
char
==
">"
:
break
name
=
name
+
char
if
not
name
:
raise
error
,
"bad group name"
try
:
index
=
int
(
name
)
except
ValueError
:
if
not
isname
(
name
):
raise
error
,
"illegal character in group name"
try
:
index
=
pattern
.
groupindex
[
name
]
except
KeyError
:
raise
IndexError
,
"unknown group name"
a
((
MARK
,
index
))
elif
len
(
this
)
>
1
and
this
[
1
]
in
DIGITS
:
code
=
None
while
1
:
group
=
_group
(
this
,
pattern
.
groups
+
1
)
if
group
:
if
(
not
s
.
next
or
not
_group
(
this
+
s
.
next
,
pattern
.
groups
+
1
)):
code
=
MARK
,
int
(
group
)
break
elif
s
.
next
in
OCTDIGITS
:
this
=
this
+
s
.
get
()
else
:
break
if
not
code
:
this
=
this
[
1
:]
# FIXME: support unicode characters!
code
=
LITERAL
,
chr
(
int
(
this
[
-
6
:],
8
)
&
0xff
)
a
(
code
)
else
:
try
:
a
(
ESCAPES
[
this
])
except
KeyError
:
for
c
in
this
:
a
((
LITERAL
,
c
))
else
:
a
((
LITERAL
,
this
))
return
p
def
expand_template
(
template
,
match
):
...
...
@@ -592,11 +592,11 @@ def expand_template(template, match):
p
=
[]
a
=
p
.
append
for
c
,
s
in
template
:
if
c
is
LITERAL
:
a
(
s
)
elif
c
is
MARK
:
s
=
match
.
group
(
s
)
if
s
is
None
:
raise
error
,
"empty group"
a
(
s
)
if
c
is
LITERAL
:
a
(
s
)
elif
c
is
MARK
:
s
=
match
.
group
(
s
)
if
s
is
None
:
raise
error
,
"empty group"
a
(
s
)
return
match
.
string
[:
0
].
join
(
p
)
Lib/test/test_sre.py
View file @
90a07913
# FIXME: this is basically test_re.py, with a few
# FIXME: this is basically test_re.py, with a few
minor changes
import
sys
sys
.
path
=
[
'.'
]
+
sys
.
path
...
...
@@ -337,7 +337,7 @@ for t in tests:
print repr(repl)+'
should
be
'+repr(expected)
else:
print '
===
Failed
incorrectly
', t
continue
continue
# Try the match on a unicode string, and check that it
# still succeeds.
...
...
@@ -359,9 +359,9 @@ for t in tests:
if
pattern
[:
2
]
!=
'
\
\
B'
and
pattern
[
-
2
:]
!=
'
\
\
B'
:
obj
=
sre
.
compile
(
pattern
)
result
=
obj
.
search
(
s
,
result
.
start
(
0
),
result
.
end
(
0
)
+
1
)
if
result
==
None
:
print
'=== Failed on range-limited match'
,
t
result
=
obj
.
search
(
s
,
result
.
start
(
0
),
result
.
end
(
0
)
+
1
)
if
result
==
None
:
print
'=== Failed on range-limited match'
,
t
# Try the match with IGNORECASE enabled, and check that it
# still succeeds.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment