Commit a6e1ad27 authored by Rob Pike's avatar Rob Pike

two easy optimizations for regexp:

	1) if char class contains a single character, make it a single character.
		(this is used to quote, e.g. [.] rather than \.
	2) if regexp begins with ordinary text substring, use plain string match to start engine

R=rsc
CC=golang-dev
https://golang.org/cl/157095
parent 398927e6
...@@ -60,6 +60,8 @@ type tester struct { ...@@ -60,6 +60,8 @@ type tester struct {
} }
var matches = []tester{ var matches = []tester{
tester{`a+`, "baaab", vec{1, 4}},
tester{"abcd..", "abcdef", vec{0, 6}},
tester{``, "", vec{0, 0}}, tester{``, "", vec{0, 0}},
tester{`a`, "a", vec{0, 1}}, tester{`a`, "a", vec{0, 1}},
tester{`x`, "y", vec{}}, tester{`x`, "y", vec{}},
...@@ -78,6 +80,8 @@ var matches = []tester{ ...@@ -78,6 +80,8 @@ var matches = []tester{
tester{`[a\-\]z]+`, "az]-bcz", vec{0, 4}}, tester{`[a\-\]z]+`, "az]-bcz", vec{0, 4}},
tester{`[^\n]+`, "abcd\n", vec{0, 4}}, tester{`[^\n]+`, "abcd\n", vec{0, 4}},
tester{`[日本語]+`, "日本語日本語", vec{0, 18}}, tester{`[日本語]+`, "日本語日本語", vec{0, 18}},
tester{`日本語+`, "日本語", vec{0, 9}},
tester{`日本語+`, "日本語語語語", vec{0, 18}},
tester{`()`, "", vec{0, 0, 0, 0}}, tester{`()`, "", vec{0, 0, 0, 0}},
tester{`(a)`, "a", vec{0, 1, 0, 1}}, tester{`(a)`, "a", vec{0, 1, 0, 1}},
tester{`(.)(.)`, "日a", vec{0, 4, 0, 3, 3, 4}}, tester{`(.)(.)`, "日a", vec{0, 4, 0, 3, 3, 4}},
...@@ -89,6 +93,7 @@ var matches = []tester{ ...@@ -89,6 +93,7 @@ var matches = []tester{
tester{`(((a|b|c)*)(d))`, "abcd", vec{0, 4, 0, 4, 0, 3, 2, 3, 3, 4}}, tester{`(((a|b|c)*)(d))`, "abcd", vec{0, 4, 0, 4, 0, 3, 2, 3, 3, 4}},
tester{`a*(|(b))c*`, "aacc", vec{0, 4, 2, 2, -1, -1}}, tester{`a*(|(b))c*`, "aacc", vec{0, 4, 2, 2, -1, -1}},
tester{`(.*).*`, "ab", vec{0, 2, 0, 2}}, tester{`(.*).*`, "ab", vec{0, 2, 0, 2}},
tester{`[.]`, ".", vec{0, 1}},
} }
func compileTest(t *testing.T, expr string, error os.Error) *Regexp { func compileTest(t *testing.T, expr string, error os.Error) *Regexp {
......
...@@ -27,6 +27,7 @@ import ( ...@@ -27,6 +27,7 @@ import (
"container/vector"; "container/vector";
"io"; "io";
"os"; "os";
"strings";
"utf8"; "utf8";
) )
...@@ -70,10 +71,12 @@ func (c *common) setIndex(i int) { c._index = i } ...@@ -70,10 +71,12 @@ func (c *common) setIndex(i int) { c._index = i }
// Regexp is the representation of a compiled regular expression. // Regexp is the representation of a compiled regular expression.
// The public interface is entirely through methods. // The public interface is entirely through methods.
type Regexp struct { type Regexp struct {
expr string; // the original expression expr string; // the original expression
inst *vector.Vector; prefix string; // initial plain text string
start instr; prefixBytes []byte; // initial plain text bytes
nbra int; // number of brackets in expression, for subexpressions inst *vector.Vector;
start instr;
nbra int; // number of brackets in expression, for subexpressions
} }
const ( const (
...@@ -315,6 +318,12 @@ func (p *parser) charClass() instr { ...@@ -315,6 +318,12 @@ func (p *parser) charClass() instr {
p.re.add(nl); p.re.add(nl);
return nl; return nl;
} }
// Special common case: "[a]" -> "a"
if !cc.negate && cc.ranges.Len() == 2 && cc.ranges.At(0) == cc.ranges.At(1) {
c := newChar(cc.ranges.At(0));
p.re.add(c);
return c;
}
p.re.add(cc); p.re.add(cc);
return cc; return cc;
case '-': // do this before backslash processing case '-': // do this before backslash processing
...@@ -573,6 +582,7 @@ func (re *Regexp) eliminateNops() { ...@@ -573,6 +582,7 @@ func (re *Regexp) eliminateNops() {
} }
func (re *Regexp) dump() { func (re *Regexp) dump() {
print("prefix <", re.prefix, ">\n");
for i := 0; i < re.inst.Len(); i++ { for i := 0; i < re.inst.Len(); i++ {
inst := re.inst.At(i).(instr); inst := re.inst.At(i).(instr);
print(inst.index(), ": "); print(inst.index(), ": ");
...@@ -606,9 +616,42 @@ func (re *Regexp) doParse() os.Error { ...@@ -606,9 +616,42 @@ func (re *Regexp) doParse() os.Error {
re.dump(); re.dump();
println(); println();
} }
if p.error == nil {
re.setPrefix();
if debug {
re.dump();
println();
}
}
return p.error; return p.error;
} }
// return regular text at the beginning of str
func (re *Regexp) setPrefix() {
var b []byte;
var utf = make([]byte, utf8.UTFMax);
// First instruction is start; skip that.
i := re.inst.At(0).(instr).next().index();
for i < re.inst.Len() {
inst := re.inst.At(i).(instr);
// stop if this is not a char
if inst.kind() != _CHAR {
break
}
// stop if this char starts a closure; liberal but easy test: is an ALT next?
if re.inst.At(inst.next().index()).(instr).kind() == _ALT {
break
}
n := utf8.EncodeRune(inst.(*_Char).char, utf);
b = bytes.Add(b, utf[0:n]);
i = inst.next().index();
}
// point start instruction to first non-CHAR
re.inst.At(0).(instr).setNext(re.inst.At(i).(instr));
re.prefixBytes = b;
re.prefix = string(b);
}
// Compile parses a regular expression and returns, if successful, a Regexp // Compile parses a regular expression and returns, if successful, a Regexp
// object that can be used to match against text. // object that can be used to match against text.
func Compile(str string) (regexp *Regexp, error os.Error) { func Compile(str string) (regexp *Regexp, error os.Error) {
...@@ -691,9 +734,17 @@ func (re *Regexp) addState(s []state, inst instr, match []int, pos, end int) []s ...@@ -691,9 +734,17 @@ func (re *Regexp) addState(s []state, inst instr, match []int, pos, end int) []s
return s; return s;
} }
func noMatch(nbra int) []int {
match := make([]int, 2*(nbra+1));
for i := range match {
match[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
}
return match;
}
// Accepts either string or bytes - the logic is identical either way. // Accepts either string or bytes - the logic is identical either way.
// If bytes == nil, scan str. // If bytes == nil, scan str.
func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int { func (re *Regexp) doExecute(str string, bytestr []byte, pos int) []int {
var s [2][]state; // TODO: use a vector when state values (not ptrs) can be vector elements var s [2][]state; // TODO: use a vector when state values (not ptrs) can be vector elements
s[0] = make([]state, 10)[0:0]; s[0] = make([]state, 10)[0:0];
s[1] = make([]state, 10)[0:0]; s[1] = make([]state, 10)[0:0];
...@@ -701,16 +752,26 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int { ...@@ -701,16 +752,26 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int {
var final state; var final state;
found := false; found := false;
end := len(str); end := len(str);
if bytes != nil { if bytestr != nil {
end = len(bytes) end = len(bytestr)
}
// fast check for initial plain substring
if re.prefix != "" {
var advance int;
if bytestr == nil {
advance = strings.Index(str[pos:len(str)], re.prefix)
} else {
advance = bytes.Index(bytestr[pos:len(bytestr)], re.prefixBytes)
}
if advance == -1 {
return []int{}
}
pos += advance + len(re.prefix);
} }
for pos <= end { for pos <= end {
if !found { if !found {
// prime the pump if we haven't seen a match yet // prime the pump if we haven't seen a match yet
match := make([]int, 2*(re.nbra+1)); match := noMatch(re.nbra);
for i := 0; i < len(match); i++ {
match[i] = -1 // no match seen; catches cases like "a(b)?c" on "ac"
}
match[0] = pos; match[0] = pos;
s[out] = re.addState(s[out], re.start.next(), match, pos, end); s[out] = re.addState(s[out], re.start.next(), match, pos, end);
} }
...@@ -723,10 +784,10 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int { ...@@ -723,10 +784,10 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int {
charwidth := 1; charwidth := 1;
c := endOfFile; c := endOfFile;
if pos < end { if pos < end {
if bytes == nil { if bytestr == nil {
c, charwidth = utf8.DecodeRuneInString(str[pos:end]) c, charwidth = utf8.DecodeRuneInString(str[pos:end])
} else { } else {
c, charwidth = utf8.DecodeRune(bytes[pos:end]) c, charwidth = utf8.DecodeRune(bytestr[pos:end])
} }
} }
pos += charwidth; pos += charwidth;
...@@ -769,6 +830,10 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int { ...@@ -769,6 +830,10 @@ func (re *Regexp) doExecute(str string, bytes []byte, pos int) []int {
} }
} }
} }
// if match found, back up start of match by width of prefix.
if re.prefix != "" && len(final.match) > 0 {
final.match[0] -= len(re.prefix)
}
return final.match; return final.match;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment