Commit 0415f0cc authored by Guido van Rossum's avatar Guido van Rossum

New "re" regular expression support.

The new re module was written by Andrew Kuchling and uses the pcre
code in ../Modules/.  The old re module has been renamed to re1,
just in case you need it for comparison.
parent b28c97f7
...@@ -2,42 +2,27 @@ ...@@ -2,42 +2,27 @@
# -*- mode: python -*- # -*- mode: python -*-
# $Id$ # $Id$
import string
import reop
# reop.error and re.error should be the same, since exceptions can be
# raised from either module.
error = reop.error # 're error'
from reop import NORMAL, CHARCLASS, REPLACEMENT
from reop import CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET
from reop import WORD_BOUNDARY, NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER
# compilation flags import sys
import string
IGNORECASE = I = 0x01 from pcre import *
MULTILINE = M = 0x02
DOTALL = S = 0x04
VERBOSE = X = 0x08
repetition_operators = ['*', '*?', '+', '+?', '?', '??', '{n}', '{n}?', [ NORMAL, CHARCLASS, REPLACEMENT ] = range(3)
'{n,}', '{n,}?', '{n,m}', '{n,m}?'] [ CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET, WORD_BOUNDARY, NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER ] = range(9)
# #
# # First, the public part of the interface:
# #
def valid_identifier(id): # pcre.error and re.error should be the same, since exceptions can be
if len(id) == 0: # raised from either module.
return 0
if (not reop.syntax_table[id[0]] & reop.word) or \ # compilation flags
(reop.syntax_table[id[0]] & reop.digit):
return 0 I = IGNORECASE
for char in id[1:]: M = MULTILINE
if not reop.syntax_table[char] & reop.word: S = DOTALL
return 0 X = VERBOSE
return 1
# #
# #
...@@ -83,60 +68,17 @@ def split(pattern, string, maxsplit=0): ...@@ -83,60 +68,17 @@ def split(pattern, string, maxsplit=0):
# #
# #
def _expand(m, repl):
results = []
index = 0
size = len(repl)
while index < size:
found = string.find(repl, '\\', index)
if found < 0:
results.append(repl[index:])
break
if found > index:
results.append(repl[index:found])
escape_type, value, index = expand_escape(repl, found+1, REPLACEMENT)
if escape_type == CHAR:
results.append(value)
elif escape_type == MEMORY_REFERENCE:
r = m.group(value)
if r is None:
raise error, ('group "' + str(value) + '" did not contribute '
'to the match')
results.append(m.group(value))
else:
raise error, "bad escape in replacement"
return string.join(results, '')
class RegexObject: class RegexObject:
def __init__(self, pattern, flags, code, num_regs, groupindex): def __init__(self, pattern, flags, code, groupindex):
self.code = code self.code = code
self.num_regs = num_regs
self.flags = flags self.flags = flags
self.pattern = pattern self.pattern = pattern
self.groupindex = groupindex self.groupindex = groupindex
self.fastmap = build_fastmap(code)
if code[0].name == 'bol':
self.anchor = 1
elif code[0].name == 'begbuf':
self.anchor = 2
else:
self.anchor = 0
self.buffer = assemble(code)
def search(self, string, pos=0): def search(self, string, pos=0):
regs = reop.search(self.buffer, regs = self.code.match(string, pos, 0)
self.num_regs,
self.flags,
self.fastmap.can_be_null,
self.fastmap.fastmap(),
self.anchor,
string,
pos)
if regs is None: if regs is None:
return None return None
self.num_regs=len(regs)
return MatchObject(self, return MatchObject(self,
string, string,
...@@ -144,17 +86,10 @@ class RegexObject: ...@@ -144,17 +86,10 @@ class RegexObject:
regs) regs)
def match(self, string, pos=0): def match(self, string, pos=0):
regs = reop.match(self.buffer, regs = self.code.match(string, pos, ANCHORED)
self.num_regs,
self.flags,
self.fastmap.can_be_null,
self.fastmap.fastmap(),
self.anchor,
string,
pos)
if regs is None: if regs is None:
return None return None
self.num_regs=len(regs)/2
return MatchObject(self, return MatchObject(self,
string, string,
pos, pos,
...@@ -165,13 +100,13 @@ class RegexObject: ...@@ -165,13 +100,13 @@ class RegexObject:
def subn(self, repl, source, count=0): def subn(self, repl, source, count=0):
if count < 0: if count < 0:
raise ValueError, "negative substibution count" raise error, "negative substitution count"
if count == 0: if count == 0:
import sys import sys
count = sys.maxint count = sys.maxint
if type(repl) == type(''): if type(repl) == type(''):
if '\\' in repl: if '\\' in repl:
repl = lambda m, r=repl: _expand(m, r) repl = lambda m, r=repl: pcre_expand(m, r)
else: else:
repl = lambda m, r=repl: r repl = lambda m, r=repl: r
n = 0 # Number of matches n = 0 # Number of matches
...@@ -275,7 +210,8 @@ class MatchObject: ...@@ -275,7 +210,8 @@ class MatchObject:
g = self.re.groupindex[g] g = self.re.groupindex[g]
except (KeyError, TypeError): except (KeyError, TypeError):
raise IndexError, ('group "' + g + '" is undefined') raise IndexError, ('group "' + g + '" is undefined')
if (self.regs[g][0] == -1) or (self.regs[g][1] == -1): if len(self.regs)<=g: raise IndexError, ('group "' + str(g) + '" is undefined')
elif (self.regs[g][0] == -1) or (self.regs[g][1] == -1):
result.append(None) result.append(None)
else: else:
result.append(self.string[self.regs[g][0]:self.regs[g][1]]) result.append(self.string[self.regs[g][0]:self.regs[g][1]])
...@@ -286,364 +222,56 @@ class MatchObject: ...@@ -286,364 +222,56 @@ class MatchObject:
else: else:
return () return ()
#
# A set of classes to make assembly a bit easier, if a bit verbose.
#
class Instruction:
def __init__(self, opcode, size=1):
self.opcode = opcode
self.size = size
def assemble(self, position, labels):
return self.opcode
def __repr__(self):
return '%-15s' % (self.name)
class End(Instruction):
name = 'end'
def __init__(self):
Instruction.__init__(self, chr(0))
class Bol(Instruction):
name = 'bol'
def __init__(self):
self.name = 'bol'
Instruction.__init__(self, chr(1))
class Eol(Instruction):
name = 'eol'
def __init__(self):
Instruction.__init__(self, chr(2))
class Set(Instruction):
name = 'set'
def __init__(self, set, flags=0):
self.set = set
if flags & IGNORECASE: self.set=map(string.lower, self.set)
if len(set)==1:
# If only one element, use the "exact" opcode (it'll be faster)
Instruction.__init__(self, chr(4), 2)
else:
# Use the "set" opcode
Instruction.__init__(self, chr(3), 33)
def assemble(self, position, labels):
if len(self.set)==1:
# If only one character in set, generate an "exact" opcode
return self.opcode + self.set[0]
result = self.opcode
temp = 0
for i, c in map(lambda x: (x, chr(x)), range(256)):
if c in self.set:
temp = temp | (1 << (i & 7))
if (i % 8) == 7:
result = result + chr(temp)
temp = 0
return result
def __repr__(self):
result = '%-15s' % (self.name)
self.set.sort()
# XXX this should print more intelligently
for char in self.set:
result = result + char
return result
class Exact(Instruction):
name = 'exact'
def __init__(self, char, flags):
self.char = char
if flags & IGNORECASE: self.char=string.lower(self.char)
Instruction.__init__(self, chr(4), 2)
def assemble(self, position, labels):
return self.opcode + self.char
def __repr__(self):
return '%-15s %s' % (self.name, `self.char`)
class AnyChar(Instruction):
name = 'anychar'
def __init__(self):
Instruction.__init__(self, chr(5))
def assemble(self, position, labels):
return self.opcode
class MemoryInstruction(Instruction):
def __init__(self, opcode, register):
self.register = register
Instruction.__init__(self, opcode, 2)
def assemble(self, position, labels):
return self.opcode + chr(self.register)
def __repr__(self):
return '%-15s %i' % (self.name, self.register)
class StartMemory(MemoryInstruction):
name = 'start_memory'
def __init__(self, register):
MemoryInstruction.__init__(self, chr(6), register)
class EndMemory(MemoryInstruction):
name = 'end_memory'
def __init__(self, register):
MemoryInstruction.__init__(self, chr(7), register)
class MatchMemory(MemoryInstruction):
name = 'match_memory'
def __init__(self, register):
MemoryInstruction.__init__(self, chr(8), register)
class JumpInstruction(Instruction):
def __init__(self, opcode, label):
self.label = label
Instruction.__init__(self, opcode, 3)
def compute_offset(self, start, dest):
return dest - (start + 3)
def pack_offset(self, offset):
if offset > 32767:
raise error, 'offset out of range (pos)'
elif offset < -32768:
raise error, 'offset out of range (neg)'
elif offset < 0:
offset = offset + 65536
return chr(offset & 0xff) + chr((offset >> 8) & 0xff)
def assemble(self, position, labels):
return self.opcode + \
self.pack_offset(self.compute_offset(position,
labels[self.label]))
def __repr__(self):
return '%-15s %i' % (self.name, self.label)
class Jump(JumpInstruction):
name = 'jump'
def __init__(self, label):
JumpInstruction.__init__(self, chr(9), label)
class StarJump(JumpInstruction):
name = 'star_jump'
def __init__(self, label):
JumpInstruction.__init__(self, chr(10), label)
class FailureJump(JumpInstruction):
name = 'failure_jump'
def __init__(self, label):
JumpInstruction.__init__(self, chr(11), label)
class UpdateFailureJump(JumpInstruction):
name = 'update_failure_jump'
def __init__(self, label):
JumpInstruction.__init__(self, chr(12), label)
class DummyFailureJump(JumpInstruction):
name = 'dummy_failure_jump'
def __init__(self, label):
JumpInstruction.__init__(self, chr(13), label)
class BegBuf(Instruction):
name = 'begbuf'
def __init__(self):
Instruction.__init__(self, chr(14))
class EndBuf(Instruction):
name = 'endbuf'
def __init__(self):
Instruction.__init__(self, chr(15))
class WordBeg(Instruction):
name = 'wordbeg'
def __init__(self):
Instruction.__init__(self, chr(16))
class WordEnd(Instruction):
name = 'wordend'
def __init__(self):
Instruction.__init__(self, chr(17))
class WordBound(Instruction):
name = 'wordbound'
def __init__(self):
Instruction.__init__(self, chr(18))
class NotWordBound(Instruction):
name = 'notwordbound'
def __init__(self):
Instruction.__init__(self, chr(19))
class SyntaxSpec(Instruction):
name = 'syntaxspec'
def __init__(self, syntax):
self.syntax = syntax
Instruction.__init__(self, chr(20), 2)
def assemble(self, postition, labels):
return self.opcode + chr(self.syntax)
class NotSyntaxSpec(Instruction):
name = 'notsyntaxspec'
def __init__(self, syntax):
self.syntax = syntax
Instruction.__init__(self, chr(21), 2)
def assemble(self, postition, labels):
return self.opcode + chr(self.syntax)
class Label(Instruction):
name = 'label'
def __init__(self, label):
self.label = label
Instruction.__init__(self, '', 0)
def __repr__(self):
return '%-15s %i' % (self.name, self.label)
class OpenParen(Instruction):
name = '('
def __init__(self, register):
self.register = register
Instruction.__init__(self, '', 0)
def assemble(self, position, labels):
raise error, 'unmatched open parenthesis'
class Alternation(Instruction):
name = '|'
def __init__(self):
Instruction.__init__(self, '', 0)
def assemble(self, position, labels):
raise error, 'an alternation was not taken care of'
#
#
#
def assemble(instructions):
labels = {}
position = 0
pass1 = []
for instruction in instructions:
if instruction.name == 'label':
labels[instruction.label] = position
else:
pass1.append((position, instruction))
position = position + instruction.size
pass2 = ''
for position, instruction in pass1:
pass2 = pass2 + instruction.assemble(position, labels)
return pass2
#
#
#
def escape(pattern): def escape(pattern):
result = [] result = []
alphanum=string.letters+'_'+string.digits
for char in pattern: for char in pattern:
if not reop.syntax_table[char] & reop.word: if char not in alphanum:
result.append('\\') result.append('\\')
result.append(char) result.append(char)
return string.join(result, '') return string.join(result, '')
# def valid_identifier(id):
# import string
# if len(id) == 0:
return 0
def registers_used(instructions): if id[0] not in string.letters+'_':
result = [] return 0
for instruction in instructions: for char in id[1:]:
if (instruction.name in ['set_memory', 'end_memory']) and \ if not syntax_table[char] & word:
(instruction.register not in result): return 0
result.append(instruction.register) return 1
return result
#
#
#
class Fastmap:
def __init__(self):
self.map = ['\000']*256
self.can_be_null = 0
def add(self, char):
self.map[ord(char)] = '\001'
def fastmap(self):
return string.join(self.map, '')
def __getitem__(self, char):
return ord(self.map[ord(char)])
def __repr__(self):
self.map.sort()
return 'Fastmap(' + `self.can_be_null` + ', ' + `self.map` + ')'
#
#
#
def find_label(code, label):
line = 0
for instruction in code:
if (instruction.name == 'label') and (instruction.label == label):
return line + 1
line = line + 1
def build_fastmap_aux(code, pos, visited, fastmap):
if visited[pos]:
return
while 1:
instruction = code[pos]
visited[pos] = 1
pos = pos + 1
if instruction.name == 'end':
fastmap.can_be_null = 1
return
elif instruction.name == 'syntaxspec':
for char in map(chr, range(256)):
if reop.syntax_table[char] & instruction.syntax:
fastmap.add(char)
return
elif instruction.name == 'notsyntaxspec':
for char in map(chr, range(256)):
if not reop.syntax_table[char] & instruction.syntax:
fastmap.add(char)
return
elif instruction.name == 'eol':
fastmap.add('\n')
if fastmap.can_be_null == 0:
fastmap.can_be_null = 2
return
elif instruction.name == 'set':
for char in instruction.set:
fastmap.add(char)
return
elif instruction.name == 'exact':
fastmap.add(instruction.char)
elif instruction.name == 'anychar':
for char in map(chr, range(256)):
if char != '\n':
fastmap.add(char)
return
elif instruction.name == 'match_memory':
for char in map(chr, range(256)):
fastmap.add(char)
fastmap.can_be_null = 1
return
elif instruction.name in ['jump', 'dummy_failure_jump', \
'update_failure_jump', 'star_jump']:
pos = find_label(code, instruction.label)
if visited[pos]:
return
visited[pos] = 1
elif instruction.name == 'failure_jump':
build_fastmap_aux(code,
find_label(code, instruction.label),
visited,
fastmap)
def build_fastmap(code, pos=0):
visited = [0] * len(code)
fastmap = Fastmap()
build_fastmap_aux(code, pos, visited, fastmap)
return fastmap
# def compile(pattern, flags=0):
# groupindex={}
# code=pcre_compile(pattern, flags, groupindex)
return RegexObject(pattern, flags, code, groupindex)
#[NORMAL, CHARCLASS, REPLACEMENT] = range(3) def _expand(m, repl):
#[CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET, WORD_BOUNDARY, results = []
# NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER] = range(9) index = 0
size = len(repl)
while index < size:
found = string.find(repl, '\\', index)
if found < 0:
results.append(repl[index:])
break
if found > index:
results.append(repl[index:found])
escape_type, value, index = _expand_escape(repl, found+1, REPLACEMENT)
if escape_type == CHAR:
results.append(value)
elif escape_type == MEMORY_REFERENCE:
r = m.group(value)
if r is None:
raise error, ('group "' + str(value) + '" did not contribute '
'to the match')
results.append(m.group(value))
else:
raise error, "bad escape in replacement"
return string.join(results, '')
def expand_escape(pattern, index, context=NORMAL): def _expand_escape(pattern, index, context=NORMAL):
if index >= len(pattern): if index >= len(pattern):
raise error, 'escape ends too soon' raise error, 'escape ends too soon'
...@@ -676,7 +304,7 @@ def expand_escape(pattern, index, context=NORMAL): ...@@ -676,7 +304,7 @@ def expand_escape(pattern, index, context=NORMAL):
# what it's doing (and Python in turn passes it on to sscanf, # what it's doing (and Python in turn passes it on to sscanf,
# so that *it* doesn't incorrectly 2nd-guess what C does!) # so that *it* doesn't incorrectly 2nd-guess what C does!)
char = eval ('"' + pattern[index-1:end] + '"') char = eval ('"' + pattern[index-1:end] + '"')
assert len(char) == 1 # assert len(char) == 1
return CHAR, char, end return CHAR, char, end
elif pattern[index] == 'b': elif pattern[index] == 'b':
...@@ -707,75 +335,21 @@ def expand_escape(pattern, index, context=NORMAL): ...@@ -707,75 +335,21 @@ def expand_escape(pattern, index, context=NORMAL):
raise error, ('\\' + pattern[index] + ' is not allowed') raise error, ('\\' + pattern[index] + ' is not allowed')
elif pattern[index] == 'w': elif pattern[index] == 'w':
if context == NORMAL:
return SYNTAX, reop.word, index + 1
elif context == CHARCLASS:
set = []
for char in reop.syntax_table.keys():
if reop.syntax_table[char] & reop.word:
set.append(char)
return SET, set, index + 1
else:
return CHAR, 'w', index + 1 return CHAR, 'w', index + 1
elif pattern[index] == 'W': elif pattern[index] == 'W':
if context == NORMAL:
return NOT_SYNTAX, reop.word, index + 1
elif context == CHARCLASS:
set = []
for char in reop.syntax_table.keys():
if not reop.syntax_table[char] & reop.word:
set.append(char)
return SET, set, index + 1
else:
return CHAR, 'W', index + 1 return CHAR, 'W', index + 1
elif pattern[index] == 's': elif pattern[index] == 's':
if context == NORMAL:
return SYNTAX, reop.whitespace, index + 1
elif context == CHARCLASS:
set = []
for char in reop.syntax_table.keys():
if reop.syntax_table[char] & reop.whitespace:
set.append(char)
return SET, set, index + 1
else:
return CHAR, 's', index + 1 return CHAR, 's', index + 1
elif pattern[index] == 'S': elif pattern[index] == 'S':
if context == NORMAL:
return NOT_SYNTAX, reop.whitespace, index + 1
elif context == CHARCLASS:
set = []
for char in reop.syntax_table.keys():
if not reop.syntax_table[char] & reop.whitespace:
set.append(char)
return SET, set, index + 1
else:
return CHAR, 'S', index + 1 return CHAR, 'S', index + 1
elif pattern[index] == 'd': elif pattern[index] == 'd':
if context == NORMAL:
return SYNTAX, reop.digit, index + 1
elif context == CHARCLASS:
set = []
for char in reop.syntax_table.keys():
if reop.syntax_table[char] & reop.digit:
set.append(char)
return SET, set, index + 1
else:
return CHAR, 'd', index + 1 return CHAR, 'd', index + 1
elif pattern[index] == 'D': elif pattern[index] == 'D':
if context == NORMAL:
return NOT_SYNTAX, reop.digit, index + 1
elif context == CHARCLASS:
set = []
for char in reop.syntax_table.keys():
if not reop.syntax_table[char] & reop.digit:
set.append(char)
return SET, set, index + 1
else:
return CHAR, 'D', index + 1 return CHAR, 'D', index + 1
elif pattern[index] in '0123456789': elif pattern[index] in '0123456789':
...@@ -854,655 +428,3 @@ def expand_escape(pattern, index, context=NORMAL): ...@@ -854,655 +428,3 @@ def expand_escape(pattern, index, context=NORMAL):
else: else:
return CHAR, pattern[index], index + 1 return CHAR, pattern[index], index + 1
def compile(pattern, flags=0):
stack = []
label = 0
register = 1
groupindex = {}
lastop = ''
# look for embedded pattern modifiers at the beginning of the pattern
index = 0
if len(pattern) >= 3 and \
(pattern[:2] == '(?') and \
(pattern[2] in 'iImMsSxX'):
index = 2
while (index < len(pattern)) and (pattern[index] != ')'):
if pattern[index] in 'iI':
flags = flags | IGNORECASE
elif pattern[index] in 'mM':
flags = flags | MULTILINE
elif pattern[index] in 'sS':
flags = flags | DOTALL
elif pattern[index] in 'xX':
flags = flags | VERBOSE
else:
raise error, 'unknown modifier'
index = index + 1
index = index + 1
# compile the rest of the pattern
while (index < len(pattern)):
char = pattern[index]
index = index + 1
if char == '\\':
escape_type, value, index = expand_escape(pattern, index)
if escape_type == CHAR:
stack.append([Exact(value, flags)])
lastop = '\\' + value
elif escape_type == MEMORY_REFERENCE:
if value >= register:
raise error, ('cannot reference a register '
'not yet used')
stack.append([MatchMemory(value)])
lastop = '\\1'
elif escape_type == BEGINNING_OF_BUFFER:
stack.append([BegBuf()])
lastop = '\\A'
elif escape_type == END_OF_BUFFER:
stack.append([EndBuf()])
lastop = '\\Z'
elif escape_type == WORD_BOUNDARY:
stack.append([WordBound()])
lastop = '\\b'
elif escape_type == NOT_WORD_BOUNDARY:
stack.append([NotWordBound()])
lastop = '\\B'
elif escape_type == SYNTAX:
stack.append([SyntaxSpec(value)])
if value == reop.word:
lastop = '\\w'
elif value == reop.whitespace:
lastop = '\\s'
elif value == reop.digit:
lastop = '\\d'
else:
lastop = '\\?'
elif escape_type == NOT_SYNTAX:
stack.append([NotSyntaxSpec(value)])
if value == reop.word:
lastop = '\\W'
elif value == reop.whitespace:
lastop = '\\S'
elif value == reop.digit:
lastop = '\\D'
else:
lastop = '\\?'
elif escape_type == SET:
raise error, 'cannot use set escape type here'
else:
raise error, 'unknown escape type'
elif char == '|':
expr = []
while (len(stack) != 0) and \
(stack[-1][0].name != '(') and \
(stack[-1][0].name != '|'):
expr = stack[-1] + expr
del stack[-1]
stack.append([FailureJump(label)] + \
expr + \
[Jump(-1),
Label(label)])
stack.append([Alternation()])
label = label + 1
lastop = '|'
elif char == '(':
if index >= len(pattern):
raise error, 'no matching close paren'
elif pattern[index] == '?':
# Perl style (?...) extensions
index = index + 1
if index >= len(pattern):
raise error, 'extension ends prematurely'
elif pattern[index] == 'P':
# Python extensions
index = index + 1
if index >= len(pattern):
raise error, 'extension ends prematurely'
elif pattern[index] == '<':
# Handle Python symbolic group names (?P<...>...)
index = index + 1
end = string.find(pattern, '>', index)
if end == -1:
raise error, 'no end to symbolic group name'
name = pattern[index:end]
if not valid_identifier(name):
raise error, ('symbolic group name must be a '
'valid identifier')
index = end + 1
groupindex[name] = register
stack.append([OpenParen(register)])
register = register + 1
lastop = '('
elif pattern[index] == '=':
# backreference to symbolic group name
if index >= len(pattern):
raise error, '(?P= at the end of the pattern'
start = index + 1
end = string.find(pattern, ')', start)
if end == -1:
raise error, 'no ) to end symbolic group name'
name = pattern[start:end]
if name not in groupindex.keys():
raise error, ('symbolic group name ' + name + \
' has not been used yet')
stack.append([MatchMemory(groupindex[name])])
index = end + 1
lastop = '(?P=)'
else:
raise error, ('unknown Python extension: ' + \
pattern[index])
elif pattern[index] == ':':
# grouping, but no registers
index = index + 1
stack.append([OpenParen(-1)])
lastop = '('
elif pattern[index] == '#':
# comment
index = index + 1
end = string.find(pattern, ')', index)
if end == -1:
raise error, 'no end to comment'
index = end + 1
# do not change lastop
elif pattern[index] == '=':
raise error, ('zero-width positive lookahead '
'assertion is unsupported')
elif pattern[index] == '!':
raise error, ('zero-width negative lookahead '
'assertion is unsupported')
elif pattern[index] in 'iImMsSxX':
raise error, ('embedded pattern modifiers are only '
'allowed at the beginning of the pattern')
else:
raise error, 'unknown extension'
else:
stack.append([OpenParen(register)])
register = register + 1
lastop = '('
elif char == ')':
# make one expression out of everything on the stack up to
# the marker left by the last parenthesis
expr = []
while (len(stack) > 0) and (stack[-1][0].name != '('):
expr = stack[-1] + expr
del stack[-1]
if len(stack) == 0:
raise error, 'too many close parens'
# remove markers left by alternation
expr = filter(lambda x: x.name != '|', expr)
# clean up jumps inserted by alternation
need_label = 0
for i in range(len(expr)):
if (expr[i].name == 'jump') and (expr[i].label == -1):
expr[i] = Jump(label)
need_label = 1
if need_label:
expr.append(Label(label))
label = label + 1
if stack[-1][0].register > 0:
expr = [StartMemory(stack[-1][0].register)] + \
expr + \
[EndMemory(stack[-1][0].register)]
del stack[-1]
stack.append(expr)
lastop = ')'
elif char == '{':
if len(stack) == 0:
raise error, 'no expression to repeat'
end = string.find(pattern, '}', index)
if end == -1:
raise error, ('no close curly bracket to match'
' open curly bracket')
fields = map(string.strip,
string.split(pattern[index:end], ','))
index = end + 1
minimal = 0
if (index < len(pattern)) and (pattern[index] == '?'):
minimal = 1
index = index + 1
if len(fields) == 1:
# {n} or {n}? (there's really no difference)
try:
count = string.atoi(fields[0])
except ValueError:
raise error, ('count must be an integer '
'inside curly braces')
if count > 65535:
raise error, 'repeat count out of range'
expr = []
while count > 0:
expr = expr + stack[-1]
count = count - 1
del stack[-1]
stack.append(expr)
if minimal:
lastop = '{n}?'
else:
lastop = '{n}'
elif len(fields) == 2:
# {n,} or {n,m}
if fields[1] == '':
# {n,}
try:
min = string.atoi(fields[0])
except ValueError:
raise error, ('minimum must be an integer '
'inside curly braces')
if min > 65535:
raise error, 'minimum repeat count out of range'
expr = []
while min > 0:
expr = expr + stack[-1]
min = min - 1
if minimal:
expr = expr + \
([Jump(label + 1),
Label(label)] + \
stack[-1] + \
[Label(label + 1),
FailureJump(label)])
lastop = '{n,}?'
else:
expr = expr + \
([Label(label),
FailureJump(label + 1)] +
stack[-1] +
[StarJump(label),
Label(label + 1)])
lastop = '{n,}'
del stack[-1]
stack.append(expr)
label = label + 2
else:
# {n,m}
try:
min = string.atoi(fields[0])
except ValueError:
raise error, ('minimum must be an integer '
'inside curly braces')
try:
max = string.atoi(fields[1])
except ValueError:
raise error, ('maximum must be an integer '
'inside curly braces')
if min > 65535:
raise error, ('minumim repeat count out '
'of range')
if max > 65535:
raise error, ('maximum repeat count out '
'of range')
if min > max:
raise error, ('minimum repeat count must be '
'less than the maximum '
'repeat count')
expr = []
while min > 0:
expr = expr + stack[-1]
min = min - 1
max = max - 1
if minimal:
while max > 0:
expr = expr + \
[FailureJump(label),
Jump(label + 1),
Label(label)] + \
stack[-1] + \
[Label(label + 1)]
max = max - 1
label = label + 2
del stack[-1]
stack.append(expr)
lastop = '{n,m}?'
else:
while max > 0:
expr = expr + \
[FailureJump(label)] + \
stack[-1]
max = max - 1
del stack[-1]
stack.append(expr + [Label(label)])
label = label + 1
lastop = '{n,m}'
else:
raise error, ('there need to be one or two fields '
'in a {} expression')
elif char == '}':
raise error, 'unbalanced close curly brace'
elif char == '*':
# Kleene closure
if len(stack) == 0:
raise error, '* needs something to repeat'
if lastop in ['(', '|']:
raise error, '* needs something to repeat'
if lastop in repetition_operators:
raise error, 'nested repetition operators'
if (index < len(pattern)) and (pattern[index] == '?'):
# non-greedy matching
expr = [Jump(label + 1),
Label(label)] + \
stack[-1] + \
[Label(label + 1),
FailureJump(label)]
index = index + 1
lastop = '*?'
else:
# greedy matching
expr = [Label(label),
FailureJump(label + 1)] + \
stack[-1] + \
[StarJump(label),
Label(label + 1)]
lastop = '*'
del stack[-1]
stack.append(expr)
label = label + 2
elif char == '+':
# positive closure
if len(stack) == 0:
raise error, '+ needs something to repeat'
if lastop in ['(', '|']:
raise error, '+ needs something to repeat'
if lastop in repetition_operators:
raise error, 'nested repetition operators'
if (index < len(pattern)) and (pattern[index] == '?'):
# non-greedy
expr = [Label(label)] + \
stack[-1] + \
[FailureJump(label)]
label = label + 1
index = index + 1
lastop = '+?'
else:
# greedy
expr = [DummyFailureJump(label + 1),
Label(label),
FailureJump(label + 2),
Label(label + 1)] + \
stack[-1] + \
[StarJump(label),
Label(label + 2)]
label = label + 3
lastop = '+'
del stack[-1]
stack.append(expr)
elif char == '?':
if len(stack) == 0:
raise error, 'need something to be optional'
if len(stack) == 0:
raise error, '? needs something to repeat'
if lastop in ['(', '|']:
raise error, '? needs something to repeat'
if lastop in repetition_operators:
raise error, 'nested repetition operators'
if (index < len(pattern)) and (pattern[index] == '?'):
# non-greedy matching
expr = [FailureJump(label),
Jump(label + 1),
Label(label)] + \
stack[-1] + \
[Label(label + 1)]
label = label + 2
index = index + 1
lastop = '??'
else:
# greedy matching
expr = [FailureJump(label)] + \
stack[-1] + \
[Label(label)]
label = label + 1
lastop = '?'
del stack[-1]
stack.append(expr)
elif char == '.':
if flags & DOTALL:
stack.append([Set(map(chr, range(256)), flags)])
else:
stack.append([AnyChar()])
lastop = '.'
elif char == '^':
if flags & MULTILINE:
stack.append([Bol()])
else:
stack.append([BegBuf()])
lastop = '^'
elif char == '$':
if flags & MULTILINE:
stack.append([Eol()])
else:
stack.append([EndBuf()])
lastop = '$'
elif char == '#':
if flags & VERBOSE:
# comment
index = index + 1
end = string.find(pattern, '\n', index)
if end == -1:
index = len(pattern)
else:
index = end + 1
# do not change lastop
else:
stack.append([Exact(char, flags)])
lastop = '#'
elif char in string.whitespace:
if not (flags & VERBOSE):
stack.append([Exact(char, flags)])
lastop = char
elif char == '[':
# compile character class
if index >= len(pattern):
raise error, 'unclosed character class'
negate = 0
last = ''
set = []
if pattern[index] == '^':
negate = 1
index = index + 1
if index >= len(pattern):
raise error, 'unclosed character class'
if pattern[index] == ']':
set.append(']')
index = index + 1
if index >= len(pattern):
raise error, 'unclosed character class'
elif pattern[index] == '-':
set.append('-')
index = index + 1
if index >= len(pattern):
raise error, 'unclosed character class'
while (index < len(pattern)) and (pattern[index] != ']'):
next = pattern[index]
index = index + 1
if next == '-':
if index >= len(pattern):
raise error, 'incomplete range in character class'
elif pattern[index] == ']':
set.append('-')
else:
if last == '':
raise error, ('improper use of range in '
'character class')
start = last
if pattern[index] == '\\':
escape_type,
value,
index = expand_escape(pattern,
index + 1,
CHARCLASS)
if escape_type == CHAR:
end = value
else:
raise error, ('illegal escape in character '
'class range')
else:
end = pattern[index]
index = index + 1
if start > end:
raise error, ('range arguments out of order '
'in character class')
for char in map(chr, range(ord(start), ord(end) + 1)):
if char not in set:
set.append(char)
last = ''
elif next == '\\':
# expand syntax meta-characters and add to set
if index >= len(pattern):
raise error, 'incomplete set'
escape_type, value, index = expand_escape(pattern,
index,
CHARCLASS)
if escape_type == CHAR:
set.append(value)
last = value
elif escape_type == SET:
for char in value:
if char not in set:
set.append(char)
last = ''
else:
raise error, 'illegal escape type in character class'
else:
if next not in set:
set.append(next)
last = next
if (index >= len(pattern)) or ( pattern[index] != ']'):
raise error, 'incomplete set'
index = index + 1
if negate:
# If case is being ignored, then both upper- and lowercase
# versions of the letters must be excluded.
if flags & IGNORECASE: set=set+map(string.upper, set)
notset = []
for char in map(chr, range(256)):
if char not in set:
notset.append(char)
if len(notset) == 0:
raise error, 'empty negated set'
stack.append([Set(notset, flags)])
else:
if len(set) == 0:
raise error, 'empty set'
stack.append([Set(set, flags)])
lastop = '[]'
else:
stack.append([Exact(char, flags)])
lastop = char
code = []
while len(stack) > 0:
if stack[-1][0].name == '(':
raise error, 'too many open parens'
code = stack[-1] + code
del stack[-1]
if len(code) == 0:
raise error, 'no code generated'
code = filter(lambda x: x.name != '|', code)
need_label = 0
for i in range(len(code)):
if (code[i].name == 'jump') and (code[i].label == -1):
code[i] = Jump(label)
need_label = 1
if need_label:
code.append(Label(label))
label = label + 1
code.append(End())
# print code
return RegexObject(pattern, flags, code, register, groupindex)
# Replace expand_escape and _expand functions with their C equivalents.
# If you suspect bugs in the C versions, comment out the next two lines
expand_escape = reop.expand_escape
_expand = reop._expand
#!/usr/bin/env python
# -*- mode: python -*-
# $Id$
import string
import reop
# reop.error and re.error should be the same, since exceptions can be
# raised from either module.
error = reop.error # 're error'
from reop import NORMAL, CHARCLASS, REPLACEMENT
from reop import CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET
from reop import WORD_BOUNDARY, NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER
# compilation flags
IGNORECASE = I = 0x01
MULTILINE = M = 0x02
DOTALL = S = 0x04
VERBOSE = X = 0x08
repetition_operators = ['*', '*?', '+', '+?', '?', '??', '{n}', '{n}?',
'{n,}', '{n,}?', '{n,m}', '{n,m}?']
#
#
#
def valid_identifier(id):
if len(id) == 0:
return 0
if (not reop.syntax_table[id[0]] & reop.word) or \
(reop.syntax_table[id[0]] & reop.digit):
return 0
for char in id[1:]:
if not reop.syntax_table[char] & reop.word:
return 0
return 1
#
#
#
_cache = {}
_MAXCACHE = 20
def _cachecompile(pattern, flags=0):
key = (pattern, flags)
try:
return _cache[key]
except KeyError:
pass
value = compile(pattern, flags)
if len(_cache) >= _MAXCACHE:
_cache.clear()
_cache[key] = value
return value
def match(pattern, string, flags=0):
return _cachecompile(pattern, flags).match(string)
def search(pattern, string, flags=0):
return _cachecompile(pattern, flags).search(string)
def sub(pattern, repl, string, count=0):
if type(pattern) == type(''):
pattern = _cachecompile(pattern)
return pattern.sub(repl, string, count)
def subn(pattern, repl, string, count=0):
if type(pattern) == type(''):
pattern = _cachecompile(pattern)
return pattern.subn(repl, string, count)
def split(pattern, string, maxsplit=0):
if type(pattern) == type(''):
pattern = _cachecompile(pattern)
return pattern.split(string, maxsplit)
#
#
#
def _expand(m, repl):
results = []
index = 0
size = len(repl)
while index < size:
found = string.find(repl, '\\', index)
if found < 0:
results.append(repl[index:])
break
if found > index:
results.append(repl[index:found])
escape_type, value, index = expand_escape(repl, found+1, REPLACEMENT)
if escape_type == CHAR:
results.append(value)
elif escape_type == MEMORY_REFERENCE:
r = m.group(value)
if r is None:
raise error, ('group "' + str(value) + '" did not contribute '
'to the match')
results.append(m.group(value))
else:
raise error, "bad escape in replacement"
return string.join(results, '')
class RegexObject:
def __init__(self, pattern, flags, code, num_regs, groupindex):
self.code = code
self.num_regs = num_regs
self.flags = flags
self.pattern = pattern
self.groupindex = groupindex
self.fastmap = build_fastmap(code)
if code[0].name == 'bol':
self.anchor = 1
elif code[0].name == 'begbuf':
self.anchor = 2
else:
self.anchor = 0
self.buffer = assemble(code)
def search(self, string, pos=0):
regs = reop.search(self.buffer,
self.num_regs,
self.flags,
self.fastmap.can_be_null,
self.fastmap.fastmap(),
self.anchor,
string,
pos)
if regs is None:
return None
return MatchObject(self,
string,
pos,
regs)
def match(self, string, pos=0):
regs = reop.match(self.buffer,
self.num_regs,
self.flags,
self.fastmap.can_be_null,
self.fastmap.fastmap(),
self.anchor,
string,
pos)
if regs is None:
return None
return MatchObject(self,
string,
pos,
regs)
def sub(self, repl, string, count=0):
return self.subn(repl, string, count)[0]
def subn(self, repl, source, count=0):
if count < 0:
raise ValueError, "negative substibution count"
if count == 0:
import sys
count = sys.maxint
if type(repl) == type(''):
if '\\' in repl:
repl = lambda m, r=repl: _expand(m, r)
else:
repl = lambda m, r=repl: r
n = 0 # Number of matches
pos = 0 # Where to start searching
lastmatch = -1 # End of last match
results = [] # Substrings making up the result
end = len(source)
while n < count and pos <= end:
m = self.search(source, pos)
if not m:
break
i, j = m.span(0)
if i == j == lastmatch:
# Empty match adjacent to previous match
pos = pos + 1
results.append(source[lastmatch:pos])
continue
if pos < i:
results.append(source[pos:i])
results.append(repl(m))
pos = lastmatch = j
if i == j:
# Last match was empty; don't try here again
pos = pos + 1
results.append(source[lastmatch:pos])
n = n + 1
results.append(source[pos:])
return (string.join(results, ''), n)
def split(self, source, maxsplit=0):
if maxsplit < 0:
raise error, "negative split count"
if maxsplit == 0:
import sys
maxsplit = sys.maxint
n = 0
pos = 0
lastmatch = 0
results = []
end = len(source)
while n < maxsplit:
m = self.search(source, pos)
if not m:
break
i, j = m.span(0)
if i == j:
# Empty match
if pos >= end:
break
pos = pos+1
continue
results.append(source[lastmatch:i])
g = m.group()
if g:
results[len(results):] = list(g)
pos = lastmatch = j
results.append(source[lastmatch:])
return results
class MatchObject:
def __init__(self, re, string, pos, regs):
self.re = re
self.string = string
self.pos = pos
self.regs = regs
def start(self, g):
if type(g) == type(''):
try:
g = self.re.groupindex[g]
except (KeyError, TypeError):
raise IndexError, ('group "' + g + '" is undefined')
return self.regs[g][0]
def end(self, g):
if type(g) == type(''):
try:
g = self.re.groupindex[g]
except (KeyError, TypeError):
raise IndexError, ('group "' + g + '" is undefined')
return self.regs[g][1]
def span(self, g):
if type(g) == type(''):
try:
g = self.re.groupindex[g]
except (KeyError, TypeError):
raise IndexError, ('group "' + g + '" is undefined')
return self.regs[g]
def group(self, *groups):
if len(groups) == 0:
groups = range(1, self.re.num_regs)
use_all = 1
else:
use_all = 0
result = []
for g in groups:
if type(g) == type(''):
try:
g = self.re.groupindex[g]
except (KeyError, TypeError):
raise IndexError, ('group "' + g + '" is undefined')
if (self.regs[g][0] == -1) or (self.regs[g][1] == -1):
result.append(None)
else:
result.append(self.string[self.regs[g][0]:self.regs[g][1]])
if use_all or len(result) > 1:
return tuple(result)
elif len(result) == 1:
return result[0]
else:
return ()
#
# A set of classes to make assembly a bit easier, if a bit verbose.
#
class Instruction:
def __init__(self, opcode, size=1):
self.opcode = opcode
self.size = size
def assemble(self, position, labels):
return self.opcode
def __repr__(self):
return '%-15s' % (self.name)
class End(Instruction):
name = 'end'
def __init__(self):
Instruction.__init__(self, chr(0))
class Bol(Instruction):
name = 'bol'
def __init__(self):
self.name = 'bol'
Instruction.__init__(self, chr(1))
class Eol(Instruction):
name = 'eol'
def __init__(self):
Instruction.__init__(self, chr(2))
class Set(Instruction):
name = 'set'
def __init__(self, set, flags=0):
self.set = set
if flags & IGNORECASE: self.set=map(string.lower, self.set)
if len(set)==1:
# If only one element, use the "exact" opcode (it'll be faster)
Instruction.__init__(self, chr(4), 2)
else:
# Use the "set" opcode
Instruction.__init__(self, chr(3), 33)
def assemble(self, position, labels):
if len(self.set)==1:
# If only one character in set, generate an "exact" opcode
return self.opcode + self.set[0]
result = self.opcode
temp = 0
for i, c in map(lambda x: (x, chr(x)), range(256)):
if c in self.set:
temp = temp | (1 << (i & 7))
if (i % 8) == 7:
result = result + chr(temp)
temp = 0
return result
def __repr__(self):
result = '%-15s' % (self.name)
self.set.sort()
# XXX this should print more intelligently
for char in self.set:
result = result + char
return result
class Exact(Instruction):
name = 'exact'
def __init__(self, char, flags):
self.char = char
if flags & IGNORECASE: self.char=string.lower(self.char)
Instruction.__init__(self, chr(4), 2)
def assemble(self, position, labels):
return self.opcode + self.char
def __repr__(self):
return '%-15s %s' % (self.name, `self.char`)
class AnyChar(Instruction):
name = 'anychar'
def __init__(self):
Instruction.__init__(self, chr(5))
def assemble(self, position, labels):
return self.opcode
class MemoryInstruction(Instruction):
def __init__(self, opcode, register):
self.register = register
Instruction.__init__(self, opcode, 2)
def assemble(self, position, labels):
return self.opcode + chr(self.register)
def __repr__(self):
return '%-15s %i' % (self.name, self.register)
class StartMemory(MemoryInstruction):
name = 'start_memory'
def __init__(self, register):
MemoryInstruction.__init__(self, chr(6), register)
class EndMemory(MemoryInstruction):
name = 'end_memory'
def __init__(self, register):
MemoryInstruction.__init__(self, chr(7), register)
class MatchMemory(MemoryInstruction):
name = 'match_memory'
def __init__(self, register):
MemoryInstruction.__init__(self, chr(8), register)
class JumpInstruction(Instruction):
def __init__(self, opcode, label):
self.label = label
Instruction.__init__(self, opcode, 3)
def compute_offset(self, start, dest):
return dest - (start + 3)
def pack_offset(self, offset):
if offset > 32767:
raise error, 'offset out of range (pos)'
elif offset < -32768:
raise error, 'offset out of range (neg)'
elif offset < 0:
offset = offset + 65536
return chr(offset & 0xff) + chr((offset >> 8) & 0xff)
def assemble(self, position, labels):
return self.opcode + \
self.pack_offset(self.compute_offset(position,
labels[self.label]))
def __repr__(self):
return '%-15s %i' % (self.name, self.label)
class Jump(JumpInstruction):
name = 'jump'
def __init__(self, label):
JumpInstruction.__init__(self, chr(9), label)
class StarJump(JumpInstruction):
name = 'star_jump'
def __init__(self, label):
JumpInstruction.__init__(self, chr(10), label)
class FailureJump(JumpInstruction):
name = 'failure_jump'
def __init__(self, label):
JumpInstruction.__init__(self, chr(11), label)
class UpdateFailureJump(JumpInstruction):
name = 'update_failure_jump'
def __init__(self, label):
JumpInstruction.__init__(self, chr(12), label)
class DummyFailureJump(JumpInstruction):
name = 'dummy_failure_jump'
def __init__(self, label):
JumpInstruction.__init__(self, chr(13), label)
class BegBuf(Instruction):
name = 'begbuf'
def __init__(self):
Instruction.__init__(self, chr(14))
class EndBuf(Instruction):
name = 'endbuf'
def __init__(self):
Instruction.__init__(self, chr(15))
class WordBeg(Instruction):
name = 'wordbeg'
def __init__(self):
Instruction.__init__(self, chr(16))
class WordEnd(Instruction):
name = 'wordend'
def __init__(self):
Instruction.__init__(self, chr(17))
class WordBound(Instruction):
name = 'wordbound'
def __init__(self):
Instruction.__init__(self, chr(18))
class NotWordBound(Instruction):
name = 'notwordbound'
def __init__(self):
Instruction.__init__(self, chr(19))
class SyntaxSpec(Instruction):
name = 'syntaxspec'
def __init__(self, syntax):
self.syntax = syntax
Instruction.__init__(self, chr(20), 2)
def assemble(self, postition, labels):
return self.opcode + chr(self.syntax)
class NotSyntaxSpec(Instruction):
name = 'notsyntaxspec'
def __init__(self, syntax):
self.syntax = syntax
Instruction.__init__(self, chr(21), 2)
def assemble(self, postition, labels):
return self.opcode + chr(self.syntax)
class Label(Instruction):
name = 'label'
def __init__(self, label):
self.label = label
Instruction.__init__(self, '', 0)
def __repr__(self):
return '%-15s %i' % (self.name, self.label)
class OpenParen(Instruction):
name = '('
def __init__(self, register):
self.register = register
Instruction.__init__(self, '', 0)
def assemble(self, position, labels):
raise error, 'unmatched open parenthesis'
class Alternation(Instruction):
name = '|'
def __init__(self):
Instruction.__init__(self, '', 0)
def assemble(self, position, labels):
raise error, 'an alternation was not taken care of'
#
#
#
def assemble(instructions):
labels = {}
position = 0
pass1 = []
for instruction in instructions:
if instruction.name == 'label':
labels[instruction.label] = position
else:
pass1.append((position, instruction))
position = position + instruction.size
pass2 = ''
for position, instruction in pass1:
pass2 = pass2 + instruction.assemble(position, labels)
return pass2
#
#
#
def escape(pattern):
result = []
for char in pattern:
if not reop.syntax_table[char] & reop.word:
result.append('\\')
result.append(char)
return string.join(result, '')
#
#
#
def registers_used(instructions):
result = []
for instruction in instructions:
if (instruction.name in ['set_memory', 'end_memory']) and \
(instruction.register not in result):
result.append(instruction.register)
return result
#
#
#
class Fastmap:
def __init__(self):
self.map = ['\000']*256
self.can_be_null = 0
def add(self, char):
self.map[ord(char)] = '\001'
def fastmap(self):
return string.join(self.map, '')
def __getitem__(self, char):
return ord(self.map[ord(char)])
def __repr__(self):
self.map.sort()
return 'Fastmap(' + `self.can_be_null` + ', ' + `self.map` + ')'
#
#
#
def find_label(code, label):
line = 0
for instruction in code:
if (instruction.name == 'label') and (instruction.label == label):
return line + 1
line = line + 1
def build_fastmap_aux(code, pos, visited, fastmap):
if visited[pos]:
return
while 1:
instruction = code[pos]
visited[pos] = 1
pos = pos + 1
if instruction.name == 'end':
fastmap.can_be_null = 1
return
elif instruction.name == 'syntaxspec':
for char in map(chr, range(256)):
if reop.syntax_table[char] & instruction.syntax:
fastmap.add(char)
return
elif instruction.name == 'notsyntaxspec':
for char in map(chr, range(256)):
if not reop.syntax_table[char] & instruction.syntax:
fastmap.add(char)
return
elif instruction.name == 'eol':
fastmap.add('\n')
if fastmap.can_be_null == 0:
fastmap.can_be_null = 2
return
elif instruction.name == 'set':
for char in instruction.set:
fastmap.add(char)
return
elif instruction.name == 'exact':
fastmap.add(instruction.char)
elif instruction.name == 'anychar':
for char in map(chr, range(256)):
if char != '\n':
fastmap.add(char)
return
elif instruction.name == 'match_memory':
for char in map(chr, range(256)):
fastmap.add(char)
fastmap.can_be_null = 1
return
elif instruction.name in ['jump', 'dummy_failure_jump', \
'update_failure_jump', 'star_jump']:
pos = find_label(code, instruction.label)
if visited[pos]:
return
visited[pos] = 1
elif instruction.name == 'failure_jump':
build_fastmap_aux(code,
find_label(code, instruction.label),
visited,
fastmap)
def build_fastmap(code, pos=0):
visited = [0] * len(code)
fastmap = Fastmap()
build_fastmap_aux(code, pos, visited, fastmap)
return fastmap
#
#
#
#[NORMAL, CHARCLASS, REPLACEMENT] = range(3)
#[CHAR, MEMORY_REFERENCE, SYNTAX, NOT_SYNTAX, SET, WORD_BOUNDARY,
# NOT_WORD_BOUNDARY, BEGINNING_OF_BUFFER, END_OF_BUFFER] = range(9)
def expand_escape(pattern, index, context=NORMAL):
if index >= len(pattern):
raise error, 'escape ends too soon'
elif pattern[index] == 't':
return CHAR, chr(9), index + 1
elif pattern[index] == 'n':
return CHAR, chr(10), index + 1
elif pattern[index] == 'v':
return CHAR, chr(11), index + 1
elif pattern[index] == 'r':
return CHAR, chr(13), index + 1
elif pattern[index] == 'f':
return CHAR, chr(12), index + 1
elif pattern[index] == 'a':
return CHAR, chr(7), index + 1
elif pattern[index] == 'x':
# CAUTION: this is the Python rule, not the Perl rule!
end = index + 1 # Skip over the 'x' character
while (end < len(pattern)) and (pattern[end] in string.hexdigits):
end = end + 1
if end == index:
raise error, "\\x must be followed by hex digit(s)"
# let Python evaluate it, so we don't incorrectly 2nd-guess
# what it's doing (and Python in turn passes it on to sscanf,
# so that *it* doesn't incorrectly 2nd-guess what C does!)
char = eval ('"' + pattern[index-1:end] + '"')
assert len(char) == 1
return CHAR, char, end
elif pattern[index] == 'b':
if context != NORMAL:
return CHAR, chr(8), index + 1
else:
return WORD_BOUNDARY, '', index + 1
elif pattern[index] == 'B':
if context != NORMAL:
return CHAR, 'B', index + 1
else:
return NOT_WORD_BOUNDARY, '', index + 1
elif pattern[index] == 'A':
if context != NORMAL:
return CHAR, 'A', index + 1
else:
return BEGINNING_OF_BUFFER, '', index + 1
elif pattern[index] == 'Z':
if context != NORMAL:
return CHAR, 'Z', index + 1
else:
return END_OF_BUFFER, '', index + 1
elif pattern[index] in 'GluLUQE':
raise error, ('\\' + pattern[index] + ' is not allowed')
elif pattern[index] == 'w':
if context == NORMAL:
return SYNTAX, reop.word, index + 1
elif context == CHARCLASS:
set = []
for char in reop.syntax_table.keys():
if reop.syntax_table[char] & reop.word:
set.append(char)
return SET, set, index + 1
else:
return CHAR, 'w', index + 1
elif pattern[index] == 'W':
if context == NORMAL:
return NOT_SYNTAX, reop.word, index + 1
elif context == CHARCLASS:
set = []
for char in reop.syntax_table.keys():
if not reop.syntax_table[char] & reop.word:
set.append(char)
return SET, set, index + 1
else:
return CHAR, 'W', index + 1
elif pattern[index] == 's':
if context == NORMAL:
return SYNTAX, reop.whitespace, index + 1
elif context == CHARCLASS:
set = []
for char in reop.syntax_table.keys():
if reop.syntax_table[char] & reop.whitespace:
set.append(char)
return SET, set, index + 1
else:
return CHAR, 's', index + 1
elif pattern[index] == 'S':
if context == NORMAL:
return NOT_SYNTAX, reop.whitespace, index + 1
elif context == CHARCLASS:
set = []
for char in reop.syntax_table.keys():
if not reop.syntax_table[char] & reop.whitespace:
set.append(char)
return SET, set, index + 1
else:
return CHAR, 'S', index + 1
elif pattern[index] == 'd':
if context == NORMAL:
return SYNTAX, reop.digit, index + 1
elif context == CHARCLASS:
set = []
for char in reop.syntax_table.keys():
if reop.syntax_table[char] & reop.digit:
set.append(char)
return SET, set, index + 1
else:
return CHAR, 'd', index + 1
elif pattern[index] == 'D':
if context == NORMAL:
return NOT_SYNTAX, reop.digit, index + 1
elif context == CHARCLASS:
set = []
for char in reop.syntax_table.keys():
if not reop.syntax_table[char] & reop.digit:
set.append(char)
return SET, set, index + 1
else:
return CHAR, 'D', index + 1
elif pattern[index] in '0123456789':
if pattern[index] == '0':
if (index + 1 < len(pattern)) and \
(pattern[index + 1] in string.octdigits):
if (index + 2 < len(pattern)) and \
(pattern[index + 2] in string.octdigits):
value = string.atoi(pattern[index:index + 3], 8)
index = index + 3
else:
value = string.atoi(pattern[index:index + 2], 8)
index = index + 2
else:
value = 0
index = index + 1
if value > 255:
raise error, 'octal value out of range'
return CHAR, chr(value), index
else:
if (index + 1 < len(pattern)) and \
(pattern[index + 1] in string.digits):
if (index + 2 < len(pattern)) and \
(pattern[index + 2] in string.octdigits) and \
(pattern[index + 1] in string.octdigits) and \
(pattern[index] in string.octdigits):
value = string.atoi(pattern[index:index + 3], 8)
if value > 255:
raise error, 'octal value out of range'
return CHAR, chr(value), index + 3
else:
value = string.atoi(pattern[index:index + 2])
if (value < 1) or (value > 99):
raise error, 'memory reference out of range'
if context == CHARCLASS:
raise error, ('cannot reference a register from '
'inside a character class')
return MEMORY_REFERENCE, value, index + 2
else:
if context == CHARCLASS:
raise error, ('cannot reference a register from '
'inside a character class')
value = string.atoi(pattern[index])
return MEMORY_REFERENCE, value, index + 1
elif pattern[index] == 'g':
if context != REPLACEMENT:
return CHAR, 'g', index + 1
index = index + 1
if index >= len(pattern):
raise error, 'unfinished symbolic reference'
if pattern[index] != '<':
raise error, 'missing < in symbolic reference'
index = index + 1
end = string.find(pattern, '>', index)
if end == -1:
raise error, 'unfinished symbolic reference'
value = pattern[index:end]
if not valid_identifier(value):
raise error, 'illegal symbolic reference'
return MEMORY_REFERENCE, value, end + 1
else:
return CHAR, pattern[index], index + 1
def compile(pattern, flags=0):
stack = []
label = 0
register = 1
groupindex = {}
lastop = ''
# look for embedded pattern modifiers at the beginning of the pattern
index = 0
if len(pattern) >= 3 and \
(pattern[:2] == '(?') and \
(pattern[2] in 'iImMsSxX'):
index = 2
while (index < len(pattern)) and (pattern[index] != ')'):
if pattern[index] in 'iI':
flags = flags | IGNORECASE
elif pattern[index] in 'mM':
flags = flags | MULTILINE
elif pattern[index] in 'sS':
flags = flags | DOTALL
elif pattern[index] in 'xX':
flags = flags | VERBOSE
else:
raise error, 'unknown modifier'
index = index + 1
index = index + 1
# compile the rest of the pattern
while (index < len(pattern)):
char = pattern[index]
index = index + 1
if char == '\\':
escape_type, value, index = expand_escape(pattern, index)
if escape_type == CHAR:
stack.append([Exact(value, flags)])
lastop = '\\' + value
elif escape_type == MEMORY_REFERENCE:
if value >= register:
raise error, ('cannot reference a register '
'not yet used')
stack.append([MatchMemory(value)])
lastop = '\\1'
elif escape_type == BEGINNING_OF_BUFFER:
stack.append([BegBuf()])
lastop = '\\A'
elif escape_type == END_OF_BUFFER:
stack.append([EndBuf()])
lastop = '\\Z'
elif escape_type == WORD_BOUNDARY:
stack.append([WordBound()])
lastop = '\\b'
elif escape_type == NOT_WORD_BOUNDARY:
stack.append([NotWordBound()])
lastop = '\\B'
elif escape_type == SYNTAX:
stack.append([SyntaxSpec(value)])
if value == reop.word:
lastop = '\\w'
elif value == reop.whitespace:
lastop = '\\s'
elif value == reop.digit:
lastop = '\\d'
else:
lastop = '\\?'
elif escape_type == NOT_SYNTAX:
stack.append([NotSyntaxSpec(value)])
if value == reop.word:
lastop = '\\W'
elif value == reop.whitespace:
lastop = '\\S'
elif value == reop.digit:
lastop = '\\D'
else:
lastop = '\\?'
elif escape_type == SET:
raise error, 'cannot use set escape type here'
else:
raise error, 'unknown escape type'
elif char == '|':
expr = []
while (len(stack) != 0) and \
(stack[-1][0].name != '(') and \
(stack[-1][0].name != '|'):
expr = stack[-1] + expr
del stack[-1]
stack.append([FailureJump(label)] + \
expr + \
[Jump(-1),
Label(label)])
stack.append([Alternation()])
label = label + 1
lastop = '|'
elif char == '(':
if index >= len(pattern):
raise error, 'no matching close paren'
elif pattern[index] == '?':
# Perl style (?...) extensions
index = index + 1
if index >= len(pattern):
raise error, 'extension ends prematurely'
elif pattern[index] == 'P':
# Python extensions
index = index + 1
if index >= len(pattern):
raise error, 'extension ends prematurely'
elif pattern[index] == '<':
# Handle Python symbolic group names (?P<...>...)
index = index + 1
end = string.find(pattern, '>', index)
if end == -1:
raise error, 'no end to symbolic group name'
name = pattern[index:end]
if not valid_identifier(name):
raise error, ('symbolic group name must be a '
'valid identifier')
index = end + 1
groupindex[name] = register
stack.append([OpenParen(register)])
register = register + 1
lastop = '('
elif pattern[index] == '=':
# backreference to symbolic group name
if index >= len(pattern):
raise error, '(?P= at the end of the pattern'
start = index + 1
end = string.find(pattern, ')', start)
if end == -1:
raise error, 'no ) to end symbolic group name'
name = pattern[start:end]
if name not in groupindex.keys():
raise error, ('symbolic group name ' + name + \
' has not been used yet')
stack.append([MatchMemory(groupindex[name])])
index = end + 1
lastop = '(?P=)'
else:
raise error, ('unknown Python extension: ' + \
pattern[index])
elif pattern[index] == ':':
# grouping, but no registers
index = index + 1
stack.append([OpenParen(-1)])
lastop = '('
elif pattern[index] == '#':
# comment
index = index + 1
end = string.find(pattern, ')', index)
if end == -1:
raise error, 'no end to comment'
index = end + 1
# do not change lastop
elif pattern[index] == '=':
raise error, ('zero-width positive lookahead '
'assertion is unsupported')
elif pattern[index] == '!':
raise error, ('zero-width negative lookahead '
'assertion is unsupported')
elif pattern[index] in 'iImMsSxX':
raise error, ('embedded pattern modifiers are only '
'allowed at the beginning of the pattern')
else:
raise error, 'unknown extension'
else:
stack.append([OpenParen(register)])
register = register + 1
lastop = '('
elif char == ')':
# make one expression out of everything on the stack up to
# the marker left by the last parenthesis
expr = []
while (len(stack) > 0) and (stack[-1][0].name != '('):
expr = stack[-1] + expr
del stack[-1]
if len(stack) == 0:
raise error, 'too many close parens'
# remove markers left by alternation
expr = filter(lambda x: x.name != '|', expr)
# clean up jumps inserted by alternation
need_label = 0
for i in range(len(expr)):
if (expr[i].name == 'jump') and (expr[i].label == -1):
expr[i] = Jump(label)
need_label = 1
if need_label:
expr.append(Label(label))
label = label + 1
if stack[-1][0].register > 0:
expr = [StartMemory(stack[-1][0].register)] + \
expr + \
[EndMemory(stack[-1][0].register)]
del stack[-1]
stack.append(expr)
lastop = ')'
elif char == '{':
if len(stack) == 0:
raise error, 'no expression to repeat'
end = string.find(pattern, '}', index)
if end == -1:
raise error, ('no close curly bracket to match'
' open curly bracket')
fields = map(string.strip,
string.split(pattern[index:end], ','))
index = end + 1
minimal = 0
if (index < len(pattern)) and (pattern[index] == '?'):
minimal = 1
index = index + 1
if len(fields) == 1:
# {n} or {n}? (there's really no difference)
try:
count = string.atoi(fields[0])
except ValueError:
raise error, ('count must be an integer '
'inside curly braces')
if count > 65535:
raise error, 'repeat count out of range'
expr = []
while count > 0:
expr = expr + stack[-1]
count = count - 1
del stack[-1]
stack.append(expr)
if minimal:
lastop = '{n}?'
else:
lastop = '{n}'
elif len(fields) == 2:
# {n,} or {n,m}
if fields[1] == '':
# {n,}
try:
min = string.atoi(fields[0])
except ValueError:
raise error, ('minimum must be an integer '
'inside curly braces')
if min > 65535:
raise error, 'minimum repeat count out of range'
expr = []
while min > 0:
expr = expr + stack[-1]
min = min - 1
if minimal:
expr = expr + \
([Jump(label + 1),
Label(label)] + \
stack[-1] + \
[Label(label + 1),
FailureJump(label)])
lastop = '{n,}?'
else:
expr = expr + \
([Label(label),
FailureJump(label + 1)] +
stack[-1] +
[StarJump(label),
Label(label + 1)])
lastop = '{n,}'
del stack[-1]
stack.append(expr)
label = label + 2
else:
# {n,m}
try:
min = string.atoi(fields[0])
except ValueError:
raise error, ('minimum must be an integer '
'inside curly braces')
try:
max = string.atoi(fields[1])
except ValueError:
raise error, ('maximum must be an integer '
'inside curly braces')
if min > 65535:
raise error, ('minumim repeat count out '
'of range')
if max > 65535:
raise error, ('maximum repeat count out '
'of range')
if min > max:
raise error, ('minimum repeat count must be '
'less than the maximum '
'repeat count')
expr = []
while min > 0:
expr = expr + stack[-1]
min = min - 1
max = max - 1
if minimal:
while max > 0:
expr = expr + \
[FailureJump(label),
Jump(label + 1),
Label(label)] + \
stack[-1] + \
[Label(label + 1)]
max = max - 1
label = label + 2
del stack[-1]
stack.append(expr)
lastop = '{n,m}?'
else:
while max > 0:
expr = expr + \
[FailureJump(label)] + \
stack[-1]
max = max - 1
del stack[-1]
stack.append(expr + [Label(label)])
label = label + 1
lastop = '{n,m}'
else:
raise error, ('there need to be one or two fields '
'in a {} expression')
elif char == '}':
raise error, 'unbalanced close curly brace'
elif char == '*':
# Kleene closure
if len(stack) == 0:
raise error, '* needs something to repeat'
if lastop in ['(', '|']:
raise error, '* needs something to repeat'
if lastop in repetition_operators:
raise error, 'nested repetition operators'
if (index < len(pattern)) and (pattern[index] == '?'):
# non-greedy matching
expr = [Jump(label + 1),
Label(label)] + \
stack[-1] + \
[Label(label + 1),
FailureJump(label)]
index = index + 1
lastop = '*?'
else:
# greedy matching
expr = [Label(label),
FailureJump(label + 1)] + \
stack[-1] + \
[StarJump(label),
Label(label + 1)]
lastop = '*'
del stack[-1]
stack.append(expr)
label = label + 2
elif char == '+':
# positive closure
if len(stack) == 0:
raise error, '+ needs something to repeat'
if lastop in ['(', '|']:
raise error, '+ needs something to repeat'
if lastop in repetition_operators:
raise error, 'nested repetition operators'
if (index < len(pattern)) and (pattern[index] == '?'):
# non-greedy
expr = [Label(label)] + \
stack[-1] + \
[FailureJump(label)]
label = label + 1
index = index + 1
lastop = '+?'
else:
# greedy
expr = [DummyFailureJump(label + 1),
Label(label),
FailureJump(label + 2),
Label(label + 1)] + \
stack[-1] + \
[StarJump(label),
Label(label + 2)]
label = label + 3
lastop = '+'
del stack[-1]
stack.append(expr)
elif char == '?':
if len(stack) == 0:
raise error, 'need something to be optional'
if len(stack) == 0:
raise error, '? needs something to repeat'
if lastop in ['(', '|']:
raise error, '? needs something to repeat'
if lastop in repetition_operators:
raise error, 'nested repetition operators'
if (index < len(pattern)) and (pattern[index] == '?'):
# non-greedy matching
expr = [FailureJump(label),
Jump(label + 1),
Label(label)] + \
stack[-1] + \
[Label(label + 1)]
label = label + 2
index = index + 1
lastop = '??'
else:
# greedy matching
expr = [FailureJump(label)] + \
stack[-1] + \
[Label(label)]
label = label + 1
lastop = '?'
del stack[-1]
stack.append(expr)
elif char == '.':
if flags & DOTALL:
stack.append([Set(map(chr, range(256)), flags)])
else:
stack.append([AnyChar()])
lastop = '.'
elif char == '^':
if flags & MULTILINE:
stack.append([Bol()])
else:
stack.append([BegBuf()])
lastop = '^'
elif char == '$':
if flags & MULTILINE:
stack.append([Eol()])
else:
stack.append([EndBuf()])
lastop = '$'
elif char == '#':
if flags & VERBOSE:
# comment
index = index + 1
end = string.find(pattern, '\n', index)
if end == -1:
index = len(pattern)
else:
index = end + 1
# do not change lastop
else:
stack.append([Exact(char, flags)])
lastop = '#'
elif char in string.whitespace:
if not (flags & VERBOSE):
stack.append([Exact(char, flags)])
lastop = char
elif char == '[':
# compile character class
if index >= len(pattern):
raise error, 'unclosed character class'
negate = 0
last = ''
set = []
if pattern[index] == '^':
negate = 1
index = index + 1
if index >= len(pattern):
raise error, 'unclosed character class'
if pattern[index] == ']':
set.append(']')
index = index + 1
if index >= len(pattern):
raise error, 'unclosed character class'
elif pattern[index] == '-':
set.append('-')
index = index + 1
if index >= len(pattern):
raise error, 'unclosed character class'
while (index < len(pattern)) and (pattern[index] != ']'):
next = pattern[index]
index = index + 1
if next == '-':
if index >= len(pattern):
raise error, 'incomplete range in character class'
elif pattern[index] == ']':
set.append('-')
else:
if last == '':
raise error, ('improper use of range in '
'character class')
start = last
if pattern[index] == '\\':
escape_type,
value,
index = expand_escape(pattern,
index + 1,
CHARCLASS)
if escape_type == CHAR:
end = value
else:
raise error, ('illegal escape in character '
'class range')
else:
end = pattern[index]
index = index + 1
if start > end:
raise error, ('range arguments out of order '
'in character class')
for char in map(chr, range(ord(start), ord(end) + 1)):
if char not in set:
set.append(char)
last = ''
elif next == '\\':
# expand syntax meta-characters and add to set
if index >= len(pattern):
raise error, 'incomplete set'
escape_type, value, index = expand_escape(pattern,
index,
CHARCLASS)
if escape_type == CHAR:
set.append(value)
last = value
elif escape_type == SET:
for char in value:
if char not in set:
set.append(char)
last = ''
else:
raise error, 'illegal escape type in character class'
else:
if next not in set:
set.append(next)
last = next
if (index >= len(pattern)) or ( pattern[index] != ']'):
raise error, 'incomplete set'
index = index + 1
if negate:
# If case is being ignored, then both upper- and lowercase
# versions of the letters must be excluded.
if flags & IGNORECASE: set=set+map(string.upper, set)
notset = []
for char in map(chr, range(256)):
if char not in set:
notset.append(char)
if len(notset) == 0:
raise error, 'empty negated set'
stack.append([Set(notset, flags)])
else:
if len(set) == 0:
raise error, 'empty set'
stack.append([Set(set, flags)])
lastop = '[]'
else:
stack.append([Exact(char, flags)])
lastop = char
code = []
while len(stack) > 0:
if stack[-1][0].name == '(':
raise error, 'too many open parens'
code = stack[-1] + code
del stack[-1]
if len(code) == 0:
raise error, 'no code generated'
code = filter(lambda x: x.name != '|', code)
need_label = 0
for i in range(len(code)):
if (code[i].name == 'jump') and (code[i].label == -1):
code[i] = Jump(label)
need_label = 1
if need_label:
code.append(Label(label))
label = label + 1
code.append(End())
# print code
return RegexObject(pattern, flags, code, register, groupindex)
# Replace expand_escape and _expand functions with their C equivalents.
# If you suspect bugs in the C versions, comment out the next two lines
expand_escape = reop.expand_escape
_expand = reop._expand
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment