Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
8f417748
Commit
8f417748
authored
Apr 02, 2000
by
Andrew M. Kuchling
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
This patch looks large, but it just deletes the ^M characters and
untabifies the files. No actual code changes were made.
parent
bd83b7ee
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
482 additions
and
482 deletions
+482
-482
Lib/sre_compile.py
Lib/sre_compile.py
+124
-124
Lib/sre_constants.py
Lib/sre_constants.py
+1
-1
Lib/sre_parse.py
Lib/sre_parse.py
+357
-357
No files found.
Lib/sre_compile.py
View file @
8f417748
...
...
@@ -26,7 +26,7 @@ from sre_constants import *
# find an array type code that matches the engine's code size
for
WORDSIZE
in
"BHil"
:
if
len
(
array
.
array
(
WORDSIZE
,
[
0
]).
tostring
())
==
_sre
.
getcodesize
():
break
break
else
:
raise
RuntimeError
,
"cannot find a useable array type"
...
...
@@ -34,18 +34,18 @@ else:
class
Code
:
def
__init__
(
self
):
self
.
data
=
[]
self
.
data
=
[]
def
__len__
(
self
):
return
len
(
self
.
data
)
return
len
(
self
.
data
)
def
__getitem__
(
self
,
index
):
return
self
.
data
[
index
]
return
self
.
data
[
index
]
def
__setitem__
(
self
,
index
,
code
):
self
.
data
[
index
]
=
code
self
.
data
[
index
]
=
code
def
append
(
self
,
code
):
self
.
data
.
append
(
code
)
self
.
data
.
append
(
code
)
def
todata
(
self
):
# print self.data
return
array
.
array
(
WORDSIZE
,
self
.
data
).
tostring
()
# print self.data
return
array
.
array
(
WORDSIZE
,
self
.
data
).
tostring
()
def
_lower
(
literal
):
# return _sre._lower(literal) # FIXME
...
...
@@ -54,122 +54,122 @@ def _lower(literal):
def
_compile
(
code
,
pattern
,
flags
):
append
=
code
.
append
for
op
,
av
in
pattern
:
if
op
is
ANY
:
if
"s"
in
flags
:
append
(
CODES
[
op
])
# any character at all!
else
:
append
(
CODES
[
NOT_LITERAL
])
append
(
10
)
elif
op
in
(
SUCCESS
,
FAILURE
):
append
(
CODES
[
op
])
elif
op
is
AT
:
append
(
CODES
[
op
])
append
(
POSITIONS
[
av
])
elif
op
is
BRANCH
:
append
(
CODES
[
op
])
tail
=
[]
for
av
in
av
[
1
]:
skip
=
len
(
code
);
append
(
0
)
_compile
(
code
,
av
,
flags
)
append
(
CODES
[
JUMP
])
tail
.
append
(
len
(
code
));
append
(
0
)
code
[
skip
]
=
len
(
code
)
-
skip
append
(
0
)
# end of branch
for
tail
in
tail
:
code
[
tail
]
=
len
(
code
)
-
tail
elif
op
is
CALL
:
append
(
CODES
[
op
])
skip
=
len
(
code
);
append
(
0
)
_compile
(
code
,
av
,
flags
)
append
(
CODES
[
SUCCESS
])
code
[
skip
]
=
len
(
code
)
-
skip
elif
op
is
CATEGORY
:
# not used by current parser
append
(
CODES
[
op
])
append
(
CATEGORIES
[
av
])
elif
op
is
GROUP
:
if
"i"
in
flags
:
append
(
CODES
[
MAP_IGNORE
[
op
]])
else
:
append
(
CODES
[
op
])
append
(
av
)
elif
op
is
IN
:
if
"i"
in
flags
:
append
(
CODES
[
MAP_IGNORE
[
op
]])
def
fixup
(
literal
):
return
ord
(
_lower
(
literal
))
else
:
append
(
CODES
[
op
])
fixup
=
ord
skip
=
len
(
code
);
append
(
0
)
for
op
,
av
in
av
:
append
(
CODES
[
op
])
if
op
is
NEGATE
:
pass
elif
op
is
LITERAL
:
append
(
fixup
(
av
))
elif
op
is
RANGE
:
append
(
fixup
(
av
[
0
]))
append
(
fixup
(
av
[
1
]))
elif
op
is
CATEGORY
:
append
(
CATEGORIES
[
av
])
else
:
raise
ValueError
,
"unsupported set operator"
append
(
CODES
[
FAILURE
])
code
[
skip
]
=
len
(
code
)
-
skip
elif
op
in
(
LITERAL
,
NOT_LITERAL
):
if
"i"
in
flags
:
append
(
CODES
[
MAP_IGNORE
[
op
]])
append
(
ord
(
_lower
(
av
)))
else
:
append
(
CODES
[
op
])
append
(
ord
(
av
))
elif
op
is
MARK
:
append
(
CODES
[
op
])
append
(
av
)
elif
op
in
(
REPEAT
,
MIN_REPEAT
,
MAX_REPEAT
):
lo
,
hi
=
av
[
2
].
getwidth
()
if
lo
==
0
:
raise
SyntaxError
,
"cannot repeat zero-width items"
if
lo
==
hi
==
1
and
op
is
MAX_REPEAT
:
append
(
CODES
[
MAX_REPEAT_ONE
])
skip
=
len
(
code
);
append
(
0
)
append
(
av
[
0
])
append
(
av
[
1
])
_compile
(
code
,
av
[
2
],
flags
)
append
(
CODES
[
SUCCESS
])
code
[
skip
]
=
len
(
code
)
-
skip
else
:
append
(
CODES
[
op
])
skip
=
len
(
code
);
append
(
0
)
append
(
av
[
0
])
append
(
av
[
1
])
_compile
(
code
,
av
[
2
],
flags
)
if
op
is
MIN_REPEAT
:
append
(
CODES
[
MIN_UNTIL
])
else
:
# FIXME: MAX_REPEAT PROBABLY DOESN'T WORK (?)
append
(
CODES
[
MAX_UNTIL
])
code
[
skip
]
=
len
(
code
)
-
skip
elif
op
is
SUBPATTERN
:
##
group = av[0]
##
if group:
##
append(CODES[MARK])
##
append((group-1)*2)
_compile
(
code
,
av
[
1
],
flags
)
##
if group:
##
append(CODES[MARK])
##
append((group-1)*2+1)
else
:
raise
ValueError
,
(
"unsupported operand type"
,
op
)
if
op
is
ANY
:
if
"s"
in
flags
:
append
(
CODES
[
op
])
# any character at all!
else
:
append
(
CODES
[
NOT_LITERAL
])
append
(
10
)
elif
op
in
(
SUCCESS
,
FAILURE
):
append
(
CODES
[
op
])
elif
op
is
AT
:
append
(
CODES
[
op
])
append
(
POSITIONS
[
av
])
elif
op
is
BRANCH
:
append
(
CODES
[
op
])
tail
=
[]
for
av
in
av
[
1
]:
skip
=
len
(
code
);
append
(
0
)
_compile
(
code
,
av
,
flags
)
append
(
CODES
[
JUMP
])
tail
.
append
(
len
(
code
));
append
(
0
)
code
[
skip
]
=
len
(
code
)
-
skip
append
(
0
)
# end of branch
for
tail
in
tail
:
code
[
tail
]
=
len
(
code
)
-
tail
elif
op
is
CALL
:
append
(
CODES
[
op
])
skip
=
len
(
code
);
append
(
0
)
_compile
(
code
,
av
,
flags
)
append
(
CODES
[
SUCCESS
])
code
[
skip
]
=
len
(
code
)
-
skip
elif
op
is
CATEGORY
:
# not used by current parser
append
(
CODES
[
op
])
append
(
CATEGORIES
[
av
])
elif
op
is
GROUP
:
if
"i"
in
flags
:
append
(
CODES
[
MAP_IGNORE
[
op
]])
else
:
append
(
CODES
[
op
])
append
(
av
)
elif
op
is
IN
:
if
"i"
in
flags
:
append
(
CODES
[
MAP_IGNORE
[
op
]])
def
fixup
(
literal
):
return
ord
(
_lower
(
literal
))
else
:
append
(
CODES
[
op
])
fixup
=
ord
skip
=
len
(
code
);
append
(
0
)
for
op
,
av
in
av
:
append
(
CODES
[
op
])
if
op
is
NEGATE
:
pass
elif
op
is
LITERAL
:
append
(
fixup
(
av
))
elif
op
is
RANGE
:
append
(
fixup
(
av
[
0
]))
append
(
fixup
(
av
[
1
]))
elif
op
is
CATEGORY
:
append
(
CATEGORIES
[
av
])
else
:
raise
ValueError
,
"unsupported set operator"
append
(
CODES
[
FAILURE
])
code
[
skip
]
=
len
(
code
)
-
skip
elif
op
in
(
LITERAL
,
NOT_LITERAL
):
if
"i"
in
flags
:
append
(
CODES
[
MAP_IGNORE
[
op
]])
append
(
ord
(
_lower
(
av
)))
else
:
append
(
CODES
[
op
])
append
(
ord
(
av
))
elif
op
is
MARK
:
append
(
CODES
[
op
])
append
(
av
)
elif
op
in
(
REPEAT
,
MIN_REPEAT
,
MAX_REPEAT
):
lo
,
hi
=
av
[
2
].
getwidth
()
if
lo
==
0
:
raise
SyntaxError
,
"cannot repeat zero-width items"
if
lo
==
hi
==
1
and
op
is
MAX_REPEAT
:
append
(
CODES
[
MAX_REPEAT_ONE
])
skip
=
len
(
code
);
append
(
0
)
append
(
av
[
0
])
append
(
av
[
1
])
_compile
(
code
,
av
[
2
],
flags
)
append
(
CODES
[
SUCCESS
])
code
[
skip
]
=
len
(
code
)
-
skip
else
:
append
(
CODES
[
op
])
skip
=
len
(
code
);
append
(
0
)
append
(
av
[
0
])
append
(
av
[
1
])
_compile
(
code
,
av
[
2
],
flags
)
if
op
is
MIN_REPEAT
:
append
(
CODES
[
MIN_UNTIL
])
else
:
# FIXME: MAX_REPEAT PROBABLY DOESN'T WORK (?)
append
(
CODES
[
MAX_UNTIL
])
code
[
skip
]
=
len
(
code
)
-
skip
elif
op
is
SUBPATTERN
:
##
group = av[0]
##
if group:
##
append(CODES[MARK])
##
append((group-1)*2)
_compile
(
code
,
av
[
1
],
flags
)
##
if group:
##
append(CODES[MARK])
##
append((group-1)*2+1)
else
:
raise
ValueError
,
(
"unsupported operand type"
,
op
)
def
compile
(
p
,
flags
=
()):
# convert pattern list to internal format
if
type
(
p
)
is
type
(
""
):
import
sre_parse
pattern
=
p
p
=
sre_parse
.
parse
(
p
)
import
sre_parse
pattern
=
p
p
=
sre_parse
.
parse
(
p
)
else
:
pattern
=
None
pattern
=
None
# print p.getwidth()
# print p
code
=
Code
()
...
...
@@ -178,10 +178,10 @@ def compile(p, flags=()):
# print list(code.data)
data
=
code
.
todata
()
if
0
:
# debugging
print
print
"-"
*
68
import
sre_disasm
sre_disasm
.
disasm
(
data
)
print
"-"
*
68
print
print
"-"
*
68
import
sre_disasm
sre_disasm
.
disasm
(
data
)
print
"-"
*
68
# print len(data), p.pattern.groups, len(p.pattern.groupdict)
return
_sre
.
compile
(
pattern
,
data
,
p
.
pattern
.
groups
-
1
,
p
.
pattern
.
groupdict
)
Lib/sre_constants.py
View file @
8f417748
...
...
@@ -126,6 +126,6 @@ if __name__ == "__main__":
f
=
open
(
"sre_constants.h"
,
"w"
)
f
.
write
(
"/* generated by sre_constants.py */
\
n
"
)
for
k
,
v
in
items
:
f
.
write
(
"#define SRE_OP_"
+
string
.
upper
(
k
)
+
" "
+
str
(
v
)
+
"
\
n
"
)
f
.
write
(
"#define SRE_OP_"
+
string
.
upper
(
k
)
+
" "
+
str
(
v
)
+
"
\
n
"
)
f
.
close
()
print
"done"
Lib/sre_parse.py
View file @
8f417748
...
...
@@ -55,168 +55,168 @@ CATEGORIES = {
class
Pattern
:
# FIXME: <fl> rename class, and store flags in here too!
def
__init__
(
self
):
self
.
flags
=
[]
self
.
groups
=
1
self
.
groupdict
=
{}
self
.
flags
=
[]
self
.
groups
=
1
self
.
groupdict
=
{}
def
getgroup
(
self
,
name
=
None
):
gid
=
self
.
groups
self
.
groups
=
gid
+
1
if
name
:
self
.
groupdict
[
name
]
=
gid
return
gid
gid
=
self
.
groups
self
.
groups
=
gid
+
1
if
name
:
self
.
groupdict
[
name
]
=
gid
return
gid
def
setflag
(
self
,
flag
):
if
flag
not
in
self
.
flags
:
self
.
flags
.
append
(
flag
)
if
flag
not
in
self
.
flags
:
self
.
flags
.
append
(
flag
)
class
SubPattern
:
# a subpattern, in intermediate form
def
__init__
(
self
,
pattern
,
data
=
None
):
self
.
pattern
=
pattern
if
not
data
:
data
=
[]
self
.
data
=
data
self
.
flags
=
[]
self
.
width
=
None
self
.
pattern
=
pattern
if
not
data
:
data
=
[]
self
.
data
=
data
self
.
flags
=
[]
self
.
width
=
None
def
__repr__
(
self
):
return
repr
(
self
.
data
)
return
repr
(
self
.
data
)
def
__len__
(
self
):
return
len
(
self
.
data
)
return
len
(
self
.
data
)
def
__delitem__
(
self
,
index
):
del
self
.
data
[
index
]
del
self
.
data
[
index
]
def
__getitem__
(
self
,
index
):
return
self
.
data
[
index
]
return
self
.
data
[
index
]
def
__setitem__
(
self
,
index
,
code
):
self
.
data
[
index
]
=
code
self
.
data
[
index
]
=
code
def
__getslice__
(
self
,
start
,
stop
):
return
SubPattern
(
self
.
pattern
,
self
.
data
[
start
:
stop
])
return
SubPattern
(
self
.
pattern
,
self
.
data
[
start
:
stop
])
def
insert
(
self
,
index
,
code
):
self
.
data
.
insert
(
index
,
code
)
self
.
data
.
insert
(
index
,
code
)
def
append
(
self
,
code
):
self
.
data
.
append
(
code
)
self
.
data
.
append
(
code
)
def
getwidth
(
self
):
# determine the width (min, max) for this subpattern
if
self
.
width
:
return
self
.
width
lo
=
hi
=
0L
for
op
,
av
in
self
.
data
:
if
op
is
BRANCH
:
l
=
sys
.
maxint
h
=
0
for
av
in
av
[
1
]:
i
,
j
=
av
.
getwidth
()
l
=
min
(
l
,
i
)
h
=
min
(
h
,
j
)
lo
=
lo
+
i
hi
=
hi
+
j
elif
op
is
CALL
:
i
,
j
=
av
.
getwidth
()
lo
=
lo
+
i
hi
=
hi
+
j
elif
op
is
SUBPATTERN
:
i
,
j
=
av
[
1
].
getwidth
()
lo
=
lo
+
i
hi
=
hi
+
j
elif
op
in
(
MIN_REPEAT
,
MAX_REPEAT
):
i
,
j
=
av
[
2
].
getwidth
()
lo
=
lo
+
i
*
av
[
0
]
hi
=
hi
+
j
*
av
[
1
]
elif
op
in
(
ANY
,
RANGE
,
IN
,
LITERAL
,
NOT_LITERAL
,
CATEGORY
):
lo
=
lo
+
1
hi
=
hi
+
1
elif
op
==
SUCCESS
:
break
self
.
width
=
int
(
min
(
lo
,
sys
.
maxint
)),
int
(
min
(
hi
,
sys
.
maxint
))
return
self
.
width
# determine the width (min, max) for this subpattern
if
self
.
width
:
return
self
.
width
lo
=
hi
=
0L
for
op
,
av
in
self
.
data
:
if
op
is
BRANCH
:
l
=
sys
.
maxint
h
=
0
for
av
in
av
[
1
]:
i
,
j
=
av
.
getwidth
()
l
=
min
(
l
,
i
)
h
=
min
(
h
,
j
)
lo
=
lo
+
i
hi
=
hi
+
j
elif
op
is
CALL
:
i
,
j
=
av
.
getwidth
()
lo
=
lo
+
i
hi
=
hi
+
j
elif
op
is
SUBPATTERN
:
i
,
j
=
av
[
1
].
getwidth
()
lo
=
lo
+
i
hi
=
hi
+
j
elif
op
in
(
MIN_REPEAT
,
MAX_REPEAT
):
i
,
j
=
av
[
2
].
getwidth
()
lo
=
lo
+
i
*
av
[
0
]
hi
=
hi
+
j
*
av
[
1
]
elif
op
in
(
ANY
,
RANGE
,
IN
,
LITERAL
,
NOT_LITERAL
,
CATEGORY
):
lo
=
lo
+
1
hi
=
hi
+
1
elif
op
==
SUCCESS
:
break
self
.
width
=
int
(
min
(
lo
,
sys
.
maxint
)),
int
(
min
(
hi
,
sys
.
maxint
))
return
self
.
width
def
set
(
self
,
flag
):
if
not
flag
in
self
.
flags
:
self
.
flags
.
append
(
flag
)
if
not
flag
in
self
.
flags
:
self
.
flags
.
append
(
flag
)
def
reset
(
self
,
flag
):
if
flag
in
self
.
flags
:
self
.
flags
.
remove
(
flag
)
if
flag
in
self
.
flags
:
self
.
flags
.
remove
(
flag
)
class
Tokenizer
:
def
__init__
(
self
,
string
):
self
.
string
=
list
(
string
)
self
.
next
=
self
.
__next
()
self
.
string
=
list
(
string
)
self
.
next
=
self
.
__next
()
def
__next
(
self
):
if
not
self
.
string
:
return
None
char
=
self
.
string
[
0
]
if
char
[
0
]
==
"
\
\
"
:
try
:
c
=
self
.
string
[
1
]
except
IndexError
:
raise
SyntaxError
,
"bogus escape"
char
=
char
+
c
try
:
if
c
==
"x"
:
# hexadecimal constant
for
i
in
xrange
(
2
,
sys
.
maxint
):
c
=
self
.
string
[
i
]
if
c
not
in
HEXDIGITS
:
break
char
=
char
+
c
elif
c
in
string
.
digits
:
# decimal (or octal) number
for
i
in
xrange
(
2
,
sys
.
maxint
):
c
=
self
.
string
[
i
]
# FIXME: if larger than current number of
# groups, interpret as an octal number
if
c
not
in
string
.
digits
:
break
char
=
char
+
c
except
IndexError
:
pass
# use what we've got this far
del
self
.
string
[
0
:
len
(
char
)]
return
char
if
not
self
.
string
:
return
None
char
=
self
.
string
[
0
]
if
char
[
0
]
==
"
\
\
"
:
try
:
c
=
self
.
string
[
1
]
except
IndexError
:
raise
SyntaxError
,
"bogus escape"
char
=
char
+
c
try
:
if
c
==
"x"
:
# hexadecimal constant
for
i
in
xrange
(
2
,
sys
.
maxint
):
c
=
self
.
string
[
i
]
if
c
not
in
HEXDIGITS
:
break
char
=
char
+
c
elif
c
in
string
.
digits
:
# decimal (or octal) number
for
i
in
xrange
(
2
,
sys
.
maxint
):
c
=
self
.
string
[
i
]
# FIXME: if larger than current number of
# groups, interpret as an octal number
if
c
not
in
string
.
digits
:
break
char
=
char
+
c
except
IndexError
:
pass
# use what we've got this far
del
self
.
string
[
0
:
len
(
char
)]
return
char
def
match
(
self
,
char
):
if
char
==
self
.
next
:
self
.
next
=
self
.
__next
()
return
1
return
0
if
char
==
self
.
next
:
self
.
next
=
self
.
__next
()
return
1
return
0
def
match_set
(
self
,
set
):
if
self
.
next
in
set
:
self
.
next
=
self
.
__next
()
return
1
return
0
if
self
.
next
in
set
:
self
.
next
=
self
.
__next
()
return
1
return
0
def
get
(
self
):
this
=
self
.
next
self
.
next
=
self
.
__next
()
return
this
this
=
self
.
next
self
.
next
=
self
.
__next
()
return
this
def
_fixescape
(
escape
,
character_class
=
0
):
# convert escape to (type, value)
if
character_class
:
# inside a character class, we'll look in the character
# escapes dictionary first
code
=
ESCAPES
.
get
(
escape
)
if
code
:
return
code
code
=
CATEGORIES
.
get
(
escape
)
# inside a character class, we'll look in the character
# escapes dictionary first
code
=
ESCAPES
.
get
(
escape
)
if
code
:
return
code
code
=
CATEGORIES
.
get
(
escape
)
else
:
code
=
CATEGORIES
.
get
(
escape
)
if
code
:
return
code
code
=
ESCAPES
.
get
(
escape
)
code
=
CATEGORIES
.
get
(
escape
)
if
code
:
return
code
code
=
ESCAPES
.
get
(
escape
)
if
code
:
return
code
return
code
if
not
character_class
:
try
:
group
=
int
(
escape
[
1
:])
# FIXME: only valid if group <= current number of groups
return
GROUP
,
group
except
ValueError
:
pass
try
:
group
=
int
(
escape
[
1
:])
# FIXME: only valid if group <= current number of groups
return
GROUP
,
group
except
ValueError
:
pass
try
:
if
escape
[
1
:
2
]
==
"x"
:
escape
=
escape
[
2
:]
return
LITERAL
,
chr
(
string
.
atoi
(
escape
[
-
2
:],
16
)
&
0xff
)
elif
escape
[
1
:
2
]
in
string
.
digits
:
return
LITERAL
,
chr
(
string
.
atoi
(
escape
[
1
:],
8
)
&
0xff
)
elif
len
(
escape
)
==
2
:
return
LITERAL
,
escape
[
1
]
if
escape
[
1
:
2
]
==
"x"
:
escape
=
escape
[
2
:]
return
LITERAL
,
chr
(
string
.
atoi
(
escape
[
-
2
:],
16
)
&
0xff
)
elif
escape
[
1
:
2
]
in
string
.
digits
:
return
LITERAL
,
chr
(
string
.
atoi
(
escape
[
1
:],
8
)
&
0xff
)
elif
len
(
escape
)
==
2
:
return
LITERAL
,
escape
[
1
]
except
ValueError
:
pass
pass
raise
SyntaxError
,
"bogus escape: %s"
%
repr
(
escape
)
def
_branch
(
subpattern
,
items
):
...
...
@@ -226,35 +226,35 @@ def _branch(subpattern, items):
# check if all items share a common prefix
while
1
:
prefix
=
None
for
item
in
items
:
if
not
item
:
break
if
prefix
is
None
:
prefix
=
item
[
0
]
elif
item
[
0
]
!=
prefix
:
break
else
:
# all subitems start with a common "prefix".
# move it out of the branch
for
item
in
items
:
del
item
[
0
]
subpattern
.
append
(
prefix
)
continue
# check next one
break
prefix
=
None
for
item
in
items
:
if
not
item
:
break
if
prefix
is
None
:
prefix
=
item
[
0
]
elif
item
[
0
]
!=
prefix
:
break
else
:
# all subitems start with a common "prefix".
# move it out of the branch
for
item
in
items
:
del
item
[
0
]
subpattern
.
append
(
prefix
)
continue
# check next one
break
# check if the branch can be replaced by a character set
for
item
in
items
:
if
len
(
item
)
!=
1
or
item
[
0
][
0
]
!=
LITERAL
:
break
if
len
(
item
)
!=
1
or
item
[
0
][
0
]
!=
LITERAL
:
break
else
:
# we can store this as a character set instead of a
# branch (FIXME: use a range if possible)
set
=
[]
for
item
in
items
:
set
.
append
(
item
[
0
])
subpattern
.
append
((
IN
,
set
))
return
# we can store this as a character set instead of a
# branch (FIXME: use a range if possible)
set
=
[]
for
item
in
items
:
set
.
append
(
item
[
0
])
subpattern
.
append
((
IN
,
set
))
return
subpattern
.
append
((
BRANCH
,
(
None
,
items
)))
...
...
@@ -268,178 +268,178 @@ def _parse(source, pattern, flags=()):
while
1
:
if
source
.
next
in
(
"|"
,
")"
):
break
# end of subpattern
this
=
source
.
get
()
if
this
is
None
:
break
# end of pattern
if
this
and
this
[
0
]
not
in
SPECIAL_CHARS
:
subpattern
.
append
((
LITERAL
,
this
))
elif
this
==
"["
:
# character set
set
=
[]
##
if source.match(":"):
##
pass # handle character classes
if
source
.
match
(
"^"
):
set
.
append
((
NEGATE
,
None
))
# check remaining characters
start
=
set
[:]
while
1
:
this
=
source
.
get
()
if
this
==
"]"
and
set
!=
start
:
break
elif
this
and
this
[
0
]
==
"
\
\
"
:
code1
=
_fixescape
(
this
,
1
)
elif
this
:
code1
=
LITERAL
,
this
else
:
raise
SyntaxError
,
"unexpected end of regular expression"
if
source
.
match
(
"-"
):
# potential range
this
=
source
.
get
()
if
this
==
"]"
:
set
.
append
(
code1
)
set
.
append
((
LITERAL
,
"-"
))
break
else
:
if
this
[
0
]
==
"
\
\
"
:
code2
=
_fixescape
(
this
,
1
)
else
:
code2
=
LITERAL
,
this
if
code1
[
0
]
!=
LITERAL
or
code2
[
0
]
!=
LITERAL
:
raise
SyntaxError
,
"illegal range"
if
len
(
code1
[
1
])
!=
1
or
len
(
code2
[
1
])
!=
1
:
raise
SyntaxError
,
"illegal range"
set
.
append
((
RANGE
,
(
code1
[
1
],
code2
[
1
])))
else
:
if
code1
[
0
]
is
IN
:
code1
=
code1
[
1
][
0
]
set
.
append
(
code1
)
# FIXME: <fl> move set optimization to support function
if
len
(
set
)
==
1
and
set
[
0
][
0
]
is
LITERAL
:
subpattern
.
append
(
set
[
0
])
# optimization
elif
len
(
set
)
==
2
and
set
[
0
][
0
]
is
NEGATE
and
set
[
1
][
0
]
is
LITERAL
:
subpattern
.
append
((
NOT_LITERAL
,
set
[
1
][
1
]))
# optimization
else
:
# FIXME: <fl> add charmap optimization
subpattern
.
append
((
IN
,
set
))
elif
this
and
this
[
0
]
in
REPEAT_CHARS
:
# repeat previous item
if
this
==
"?"
:
min
,
max
=
0
,
1
elif
this
==
"*"
:
min
,
max
=
0
,
sys
.
maxint
elif
this
==
"+"
:
min
,
max
=
1
,
sys
.
maxint
elif
this
==
"{"
:
min
,
max
=
0
,
sys
.
maxint
lo
=
hi
=
""
while
source
.
next
in
string
.
digits
:
lo
=
lo
+
source
.
get
()
if
source
.
match
(
","
):
while
source
.
next
in
string
.
digits
:
hi
=
hi
+
source
.
get
()
else
:
hi
=
lo
if
not
source
.
match
(
"}"
):
raise
SyntaxError
,
"bogus range"
if
lo
:
min
=
int
(
lo
)
if
hi
:
max
=
int
(
hi
)
# FIXME: <fl> check that hi >= lo!
else
:
raise
SyntaxError
,
"not supported"
# figure out which item to repeat
# FIXME: should back up to the right mark, right?
if
subpattern
:
index
=
len
(
subpattern
)
-
1
while
subpattern
[
index
][
0
]
is
MARK
:
index
=
index
-
1
item
=
subpattern
[
index
:
index
+
1
]
else
:
raise
SyntaxError
,
"nothing to repeat"
if
source
.
match
(
"?"
):
subpattern
[
index
]
=
(
MIN_REPEAT
,
(
min
,
max
,
item
))
else
:
subpattern
[
index
]
=
(
MAX_REPEAT
,
(
min
,
max
,
item
))
elif
this
==
"."
:
subpattern
.
append
((
ANY
,
None
))
elif
this
==
"("
:
group
=
1
name
=
None
if
source
.
match
(
"?"
):
group
=
0
# options
if
source
.
match
(
"P"
):
# named group: skip forward to end of name
if
source
.
match
(
"<"
):
name
=
""
while
1
:
char
=
source
.
get
()
if
char
in
(
">"
,
None
):
break
name
=
name
+
char
group
=
1
elif
source
.
match
(
":"
):
# non-capturing group
group
=
2
elif
source
.
match_set
(
"iI"
):
pattern
.
setflag
(
"i"
)
elif
source
.
match_set
(
"lL"
):
pattern
.
setflag
(
"l"
)
elif
source
.
match_set
(
"mM"
):
pattern
.
setflag
(
"m"
)
elif
source
.
match_set
(
"sS"
):
pattern
.
setflag
(
"s"
)
elif
source
.
match_set
(
"xX"
):
pattern
.
setflag
(
"x"
)
if
group
:
# parse group contents
b
=
[]
if
group
==
2
:
# anonymous group
group
=
None
else
:
group
=
pattern
.
getgroup
(
name
)
if
group
:
subpattern
.
append
((
MARK
,
(
group
-
1
)
*
2
))
while
1
:
p
=
_parse
(
source
,
pattern
,
flags
)
if
source
.
match
(
")"
):
if
b
:
b
.
append
(
p
)
_branch
(
subpattern
,
b
)
else
:
subpattern
.
append
((
SUBPATTERN
,
(
group
,
p
)))
break
elif
source
.
match
(
"|"
):
b
.
append
(
p
)
else
:
raise
SyntaxError
,
"group not properly closed"
if
group
:
subpattern
.
append
((
MARK
,
(
group
-
1
)
*
2
+
1
))
else
:
# FIXME: should this really be a while loop?
while
source
.
get
()
not
in
(
")"
,
None
):
pass
elif
this
==
"^"
:
subpattern
.
append
((
AT
,
AT_BEGINNING
))
elif
this
==
"$"
:
subpattern
.
append
((
AT
,
AT_END
))
elif
this
and
this
[
0
]
==
"
\
\
"
:
code
=
_fixescape
(
this
)
subpattern
.
append
(
code
)
else
:
raise
SyntaxError
,
"parser error"
if
source
.
next
in
(
"|"
,
")"
):
break
# end of subpattern
this
=
source
.
get
()
if
this
is
None
:
break
# end of pattern
if
this
and
this
[
0
]
not
in
SPECIAL_CHARS
:
subpattern
.
append
((
LITERAL
,
this
))
elif
this
==
"["
:
# character set
set
=
[]
##
if source.match(":"):
##
pass # handle character classes
if
source
.
match
(
"^"
):
set
.
append
((
NEGATE
,
None
))
# check remaining characters
start
=
set
[:]
while
1
:
this
=
source
.
get
()
if
this
==
"]"
and
set
!=
start
:
break
elif
this
and
this
[
0
]
==
"
\
\
"
:
code1
=
_fixescape
(
this
,
1
)
elif
this
:
code1
=
LITERAL
,
this
else
:
raise
SyntaxError
,
"unexpected end of regular expression"
if
source
.
match
(
"-"
):
# potential range
this
=
source
.
get
()
if
this
==
"]"
:
set
.
append
(
code1
)
set
.
append
((
LITERAL
,
"-"
))
break
else
:
if
this
[
0
]
==
"
\
\
"
:
code2
=
_fixescape
(
this
,
1
)
else
:
code2
=
LITERAL
,
this
if
code1
[
0
]
!=
LITERAL
or
code2
[
0
]
!=
LITERAL
:
raise
SyntaxError
,
"illegal range"
if
len
(
code1
[
1
])
!=
1
or
len
(
code2
[
1
])
!=
1
:
raise
SyntaxError
,
"illegal range"
set
.
append
((
RANGE
,
(
code1
[
1
],
code2
[
1
])))
else
:
if
code1
[
0
]
is
IN
:
code1
=
code1
[
1
][
0
]
set
.
append
(
code1
)
# FIXME: <fl> move set optimization to support function
if
len
(
set
)
==
1
and
set
[
0
][
0
]
is
LITERAL
:
subpattern
.
append
(
set
[
0
])
# optimization
elif
len
(
set
)
==
2
and
set
[
0
][
0
]
is
NEGATE
and
set
[
1
][
0
]
is
LITERAL
:
subpattern
.
append
((
NOT_LITERAL
,
set
[
1
][
1
]))
# optimization
else
:
# FIXME: <fl> add charmap optimization
subpattern
.
append
((
IN
,
set
))
elif
this
and
this
[
0
]
in
REPEAT_CHARS
:
# repeat previous item
if
this
==
"?"
:
min
,
max
=
0
,
1
elif
this
==
"*"
:
min
,
max
=
0
,
sys
.
maxint
elif
this
==
"+"
:
min
,
max
=
1
,
sys
.
maxint
elif
this
==
"{"
:
min
,
max
=
0
,
sys
.
maxint
lo
=
hi
=
""
while
source
.
next
in
string
.
digits
:
lo
=
lo
+
source
.
get
()
if
source
.
match
(
","
):
while
source
.
next
in
string
.
digits
:
hi
=
hi
+
source
.
get
()
else
:
hi
=
lo
if
not
source
.
match
(
"}"
):
raise
SyntaxError
,
"bogus range"
if
lo
:
min
=
int
(
lo
)
if
hi
:
max
=
int
(
hi
)
# FIXME: <fl> check that hi >= lo!
else
:
raise
SyntaxError
,
"not supported"
# figure out which item to repeat
# FIXME: should back up to the right mark, right?
if
subpattern
:
index
=
len
(
subpattern
)
-
1
while
subpattern
[
index
][
0
]
is
MARK
:
index
=
index
-
1
item
=
subpattern
[
index
:
index
+
1
]
else
:
raise
SyntaxError
,
"nothing to repeat"
if
source
.
match
(
"?"
):
subpattern
[
index
]
=
(
MIN_REPEAT
,
(
min
,
max
,
item
))
else
:
subpattern
[
index
]
=
(
MAX_REPEAT
,
(
min
,
max
,
item
))
elif
this
==
"."
:
subpattern
.
append
((
ANY
,
None
))
elif
this
==
"("
:
group
=
1
name
=
None
if
source
.
match
(
"?"
):
group
=
0
# options
if
source
.
match
(
"P"
):
# named group: skip forward to end of name
if
source
.
match
(
"<"
):
name
=
""
while
1
:
char
=
source
.
get
()
if
char
in
(
">"
,
None
):
break
name
=
name
+
char
group
=
1
elif
source
.
match
(
":"
):
# non-capturing group
group
=
2
elif
source
.
match_set
(
"iI"
):
pattern
.
setflag
(
"i"
)
elif
source
.
match_set
(
"lL"
):
pattern
.
setflag
(
"l"
)
elif
source
.
match_set
(
"mM"
):
pattern
.
setflag
(
"m"
)
elif
source
.
match_set
(
"sS"
):
pattern
.
setflag
(
"s"
)
elif
source
.
match_set
(
"xX"
):
pattern
.
setflag
(
"x"
)
if
group
:
# parse group contents
b
=
[]
if
group
==
2
:
# anonymous group
group
=
None
else
:
group
=
pattern
.
getgroup
(
name
)
if
group
:
subpattern
.
append
((
MARK
,
(
group
-
1
)
*
2
))
while
1
:
p
=
_parse
(
source
,
pattern
,
flags
)
if
source
.
match
(
")"
):
if
b
:
b
.
append
(
p
)
_branch
(
subpattern
,
b
)
else
:
subpattern
.
append
((
SUBPATTERN
,
(
group
,
p
)))
break
elif
source
.
match
(
"|"
):
b
.
append
(
p
)
else
:
raise
SyntaxError
,
"group not properly closed"
if
group
:
subpattern
.
append
((
MARK
,
(
group
-
1
)
*
2
+
1
))
else
:
# FIXME: should this really be a while loop?
while
source
.
get
()
not
in
(
")"
,
None
):
pass
elif
this
==
"^"
:
subpattern
.
append
((
AT
,
AT_BEGINNING
))
elif
this
==
"$"
:
subpattern
.
append
((
AT
,
AT_END
))
elif
this
and
this
[
0
]
==
"
\
\
"
:
code
=
_fixescape
(
this
)
subpattern
.
append
(
code
)
else
:
raise
SyntaxError
,
"parser error"
return
subpattern
...
...
@@ -448,20 +448,20 @@ def parse(source, flags=()):
g
=
Pattern
()
b
=
[]
while
1
:
p
=
_parse
(
s
,
g
,
flags
)
tail
=
s
.
get
()
if
tail
==
"|"
:
b
.
append
(
p
)
elif
tail
==
")"
:
raise
SyntaxError
,
"unbalanced parenthesis"
elif
tail
is
None
:
if
b
:
b
.
append
(
p
)
p
=
SubPattern
(
g
)
_branch
(
p
,
b
)
break
else
:
raise
SyntaxError
,
"bogus characters at end of regular expression"
p
=
_parse
(
s
,
g
,
flags
)
tail
=
s
.
get
()
if
tail
==
"|"
:
b
.
append
(
p
)
elif
tail
==
")"
:
raise
SyntaxError
,
"unbalanced parenthesis"
elif
tail
is
None
:
if
b
:
b
.
append
(
p
)
p
=
SubPattern
(
g
)
_branch
(
p
,
b
)
break
else
:
raise
SyntaxError
,
"bogus characters at end of regular expression"
return
p
if
__name__
==
"__main__"
:
...
...
@@ -469,23 +469,23 @@ if __name__ == "__main__":
from
testpatterns
import
PATTERNS
a
=
b
=
c
=
0
for
pattern
,
flags
in
PATTERNS
:
if
flags
:
continue
print
"-"
*
68
try
:
p
=
parse
(
pattern
)
print
repr
(
pattern
),
"->"
pprint
(
p
.
data
)
import
sre_compile
try
:
code
=
sre_compile
.
compile
(
p
)
c
=
c
+
1
except
:
pass
a
=
a
+
1
except
SyntaxError
,
v
:
print
"**"
,
repr
(
pattern
),
v
b
=
b
+
1
if
flags
:
continue
print
"-"
*
68
try
:
p
=
parse
(
pattern
)
print
repr
(
pattern
),
"->"
pprint
(
p
.
data
)
import
sre_compile
try
:
code
=
sre_compile
.
compile
(
p
)
c
=
c
+
1
except
:
pass
a
=
a
+
1
except
SyntaxError
,
v
:
print
"**"
,
repr
(
pattern
),
v
b
=
b
+
1
print
"-"
*
68
print
a
,
"of"
,
b
,
"patterns successfully parsed"
print
c
,
"of"
,
b
,
"patterns successfully compiled"
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment