Commit a1e7cd97 authored by Russ Cox

exp/regexp: implement regexp API using exp/regexp/syntax

Still need to write tests for new syntax
and fix bugs that the tests find, but this
is a good checkpoint.

All tests pass.

Compared against existing regexp:

benchmark                                old ns/op    new ns/op    delta
regexp.BenchmarkLiteral                       1869          620  -66.83%
regexp.BenchmarkNotLiteral                    9489         7823  -17.56%
regexp.BenchmarkMatchClass                   10372         8386  -19.15%
regexp.BenchmarkMatchClass_InRange           10800         7750  -28.24%
regexp.BenchmarkReplaceAll                   13492         8519  -36.86%
regexp.BenchmarkAnchoredLiteralShortNonMatch   747          339  -54.62%
regexp.BenchmarkAnchoredLiteralLongNonMatch    599          335  -44.07%
regexp.BenchmarkAnchoredShortMatch            2137          917  -57.09%
regexp.BenchmarkAnchoredLongMatch             2029          917  -54.81%

R=r, r
CC=golang-dev, sam.thorogood
https://golang.org/cl/4820046
parent fc2480da
# Copyright 2011 The Go Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
include ../../../Make.inc

TARG=exp/regexp

# Source files of the package. The last entry carries no trailing
# backslash, so the line continuation cannot swallow the include below.
GOFILES=\
	exec.go\
	regexp.go

include ../../../Make.pkg
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package regexp
import (
"os"
"strings"
"testing"
)
// good_re lists patterns that are expected to compile without error.
// TestGoodCompile passes each entry to compileTest with a nil expected error.
var good_re = []string{
``,
`.`,
`^.$`,
`a`,
`a*`,
`a+`,
`a?`,
`a|b`,
`a*|b*`,
`(a*|b)(c*|d)`,
`[a-z]`,
`[a-abc-c\-\]\[]`,
`[a-z]+`,
`[abc]`,
`[^1234]`,
`[^\n]`,
`\!\\`,
}
/*
type stringError struct {
re string
err os.Error
}
var bad_re = []stringError{
{`*`, ErrBareClosure},
{`+`, ErrBareClosure},
{`?`, ErrBareClosure},
{`(abc`, ErrUnmatchedLpar},
{`abc)`, ErrUnmatchedRpar},
{`x[a-z`, ErrUnmatchedLbkt},
{`abc]`, ErrUnmatchedRbkt},
{`[z-a]`, ErrBadRange},
{`abc\`, ErrExtraneousBackslash},
{`a**`, ErrBadClosure},
{`a*+`, ErrBadClosure},
{`a??`, ErrBadClosure},
{`\x`, ErrBadBackslash},
}
*/
// compileTest compiles expr and reports a test error when the error
// returned by Compile differs from the expected error value.
// It returns whatever Regexp Compile produced so the caller can keep
// testing with it.
func compileTest(t *testing.T, expr string, error os.Error) *Regexp {
	re, err := Compile(expr)
	if err != error {
		// Hand err to t.Error directly instead of calling err.String():
		// err is nil when an expected failure did not happen, and a
		// method call on the nil interface would panic.
		t.Error("compiling `", expr, "`; unexpected error: ", err)
	}
	return re
}
// TestGoodCompile checks that every pattern in good_re compiles cleanly.
func TestGoodCompile(t *testing.T) {
	for _, expr := range good_re {
		compileTest(t, expr, nil)
	}
}
/*
func TestBadCompile(t *testing.T) {
for i := 0; i < len(bad_re); i++ {
compileTest(t, bad_re[i].re, bad_re[i].err)
}
}
*/
// matchTest compiles test.pat and verifies that MatchString and Match both
// report a match exactly when the table entry lists at least one match.
func matchTest(t *testing.T, test *FindTest) {
	re := compileTest(t, test.pat, nil)
	if re == nil {
		return
	}
	if got := re.MatchString(test.text); got != (len(test.matches) > 0) {
		t.Errorf("MatchString failure on %s: %t should be %t", test, got, len(test.matches) > 0)
	}
	// Repeat the check through the byte-slice entry point.
	if got := re.Match([]byte(test.text)); got != (len(test.matches) > 0) {
		t.Errorf("Match failure on %s: %t should be %t", test, got, len(test.matches) > 0)
	}
}
// TestMatch runs matchTest over every entry of findTests.
func TestMatch(t *testing.T) {
	for k := 0; k < len(findTests); k++ {
		test := findTests[k]
		matchTest(t, &test)
	}
}
// matchFunctionTest checks the package-level MatchString function against
// one table entry: it must report a match exactly when the entry lists at
// least one expected match.
func matchFunctionTest(t *testing.T, test *FindTest) {
	m, err := MatchString(test.pat, test.text)
	if err != nil {
		// The boolean result means nothing after a failed call, so report
		// the error and skip the comparison.
		t.Errorf("MatchString error on %s: %v", test, err)
		return
	}
	if m != (len(test.matches) > 0) {
		t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0)
	}
}
// TestMatchFunction runs matchFunctionTest over every entry of findTests.
func TestMatchFunction(t *testing.T) {
	for k := 0; k < len(findTests); k++ {
		test := findTests[k]
		matchFunctionTest(t, &test)
	}
}
// ReplaceTest is one row of the replaceTests table.
type ReplaceTest struct {
	pattern     string // regular expression to compile
	replacement string // text substituted for each match
	input       string // text the replacement runs over
	output      string // expected result
}
// replaceTests is the table driving TestReplaceAll. Each row gives, in
// order, the pattern, the replacement, the input and the expected output.
var replaceTests = []ReplaceTest{
// Test empty input and/or replacement, with pattern that matches the empty string.
{"", "", "", ""},
{"", "x", "", "x"},
{"", "", "abc", "abc"},
{"", "x", "abc", "xaxbxcx"},
// Test empty input and/or replacement, with pattern that does not match the empty string.
{"b", "", "", ""},
{"b", "x", "", ""},
{"b", "", "abc", "ac"},
{"b", "x", "abc", "axc"},
{"y", "", "", ""},
{"y", "x", "", ""},
{"y", "", "abc", "abc"},
{"y", "x", "abc", "abc"},
// Multibyte characters -- verify that we don't try to match in the middle
// of a character.
{"[a-c]*", "x", "\u65e5", "x\u65e5x"},
{"[^\u65e5]", "x", "abc\u65e5def", "xxx\u65e5xxx"},
// Start and end of a string.
{"^[a-c]*", "x", "abcdabc", "xdabc"},
{"[a-c]*$", "x", "abcdabc", "abcdx"},
{"^[a-c]*$", "x", "abcdabc", "abcdabc"},
{"^[a-c]*", "x", "abc", "x"},
{"[a-c]*$", "x", "abc", "x"},
{"^[a-c]*$", "x", "abc", "x"},
{"^[a-c]*", "x", "dabce", "xdabce"},
{"[a-c]*$", "x", "dabce", "dabcex"},
{"^[a-c]*$", "x", "dabce", "dabce"},
{"^[a-c]*", "x", "", "x"},
{"[a-c]*$", "x", "", "x"},
{"^[a-c]*$", "x", "", "x"},
{"^[a-c]+", "x", "abcdabc", "xdabc"},
{"[a-c]+$", "x", "abcdabc", "abcdx"},
{"^[a-c]+$", "x", "abcdabc", "abcdabc"},
{"^[a-c]+", "x", "abc", "x"},
{"[a-c]+$", "x", "abc", "x"},
{"^[a-c]+$", "x", "abc", "x"},
{"^[a-c]+", "x", "dabce", "dabce"},
{"[a-c]+$", "x", "dabce", "dabce"},
{"^[a-c]+$", "x", "dabce", "dabce"},
{"^[a-c]+", "x", "", ""},
{"[a-c]+$", "x", "", ""},
{"^[a-c]+$", "x", "", ""},
// Other cases.
{"abc", "def", "abcdefg", "defdefg"},
{"bc", "BC", "abcbcdcdedef", "aBCBCdcdedef"},
{"abc", "", "abcdabc", "d"},
{"x", "xXx", "xxxXxxx", "xXxxXxxXxXxXxxXxxXx"},
{"abc", "d", "", ""},
{"abc", "d", "abc", "d"},
{".+", "x", "abc", "x"},
{"[a-c]*", "x", "def", "xdxexfx"},
{"[a-c]+", "x", "abcbcdcdedef", "xdxdedef"},
{"[a-c]*", "x", "abcbcdcdedef", "xdxdxexdxexfx"},
}
// ReplaceFuncTest is one row of the replaceFuncTests table.
type ReplaceFuncTest struct {
	pattern     string              // regular expression to compile
	replacement func(string) string // computes the substitute for each match
	input       string              // text the replacement runs over
	output      string              // expected result
}
// replaceFuncTests is the table driving TestReplaceAllFunc. Every row uses
// a replacement function that wraps the matched text in "x" and "y".
var replaceFuncTests = []ReplaceFuncTest{
{"[a-c]", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxayxbyxcydef"},
{"[a-c]+", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxabcydef"},
{"[a-c]*", func(s string) string { return "x" + s + "y" }, "defabcdef", "xydxyexyfxabcydxyexyfxy"},
}
// TestReplaceAll runs every replaceTests row through ReplaceAllString and
// ReplaceAll and compares each result with the expected output.
func TestReplaceAll(t *testing.T) {
	for _, tc := range replaceTests {
		re, err := Compile(tc.pattern)
		if err != nil {
			t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err)
			continue
		}
		// String entry point.
		if got := re.ReplaceAllString(tc.input, tc.replacement); got != tc.output {
			t.Errorf("%q.Replace(%q,%q) = %q; want %q",
				tc.pattern, tc.input, tc.replacement, got, tc.output)
		}
		// Byte-slice entry point.
		replaced := re.ReplaceAll([]byte(tc.input), []byte(tc.replacement))
		if got := string(replaced); got != tc.output {
			t.Errorf("%q.Replace(%q,%q) = %q; want %q",
				tc.pattern, tc.input, tc.replacement, got, tc.output)
		}
	}
}
// TestReplaceAllFunc runs every replaceFuncTests row through
// ReplaceAllStringFunc and ReplaceAllFunc and compares each result with
// the expected output.
func TestReplaceAllFunc(t *testing.T) {
	for _, tc := range replaceFuncTests {
		re, err := Compile(tc.pattern)
		if err != nil {
			t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err)
			continue
		}
		actual := re.ReplaceAllStringFunc(tc.input, tc.replacement)
		if actual != tc.output {
			// The replacement is a func value, which %q cannot format, so
			// the message names it "fn" instead of passing it to Errorf.
			t.Errorf("%q.ReplaceFunc(%q,fn) = %q; want %q",
				tc.pattern, tc.input, actual, tc.output)
		}
		// now try bytes
		actual = string(re.ReplaceAllFunc([]byte(tc.input), func(s []byte) []byte { return []byte(tc.replacement(string(s))) }))
		if actual != tc.output {
			t.Errorf("%q.ReplaceFunc(%q,fn) = %q; want %q",
				tc.pattern, tc.input, actual, tc.output)
		}
	}
}
// MetaTest is one row of the metaTests table, shared by TestQuoteMeta and
// TestLiteralPrefix.
type MetaTest struct {
	pattern   string // input pattern
	output    string // expected QuoteMeta result
	literal   string // expected LiteralPrefix string
	isLiteral bool   // expected LiteralPrefix completeness flag
}
// metaTests is the table shared by TestQuoteMeta and TestLiteralPrefix.
// Each row gives the pattern, its expected QuoteMeta output, its expected
// literal prefix, and whether that prefix is expected to be complete.
var metaTests = []MetaTest{
{``, ``, ``, true},
{`foo`, `foo`, `foo`, true},
{`foo\.\$`, `foo\\\.\\\$`, `foo.$`, true}, // has meta but no operator
{`foo.\$`, `foo\.\\\$`, `foo`, false}, // has escaped operators and real operators
{`!@#$%^&*()_+-=[{]}\|,<.>/?~`, `!@#\$%\^&\*\(\)_\+-=\[\{\]\}\\\|,<\.>/\?~`, `!@#`, false},
}
// TestQuoteMeta checks, for each metaTests row, that QuoteMeta yields the
// expected string and that the quoted pattern, once compiled, matches the
// original text literally.
func TestQuoteMeta(t *testing.T) {
	const (
		repl     = "xyz"
		expected = "abcxyzdef"
	)
	for _, tc := range metaTests {
		quoted := QuoteMeta(tc.pattern)
		if quoted != tc.output {
			t.Errorf("QuoteMeta(`%s`) = `%s`; want `%s`",
				tc.pattern, quoted, tc.output)
			continue
		}
		// The round-trip check is only run for non-empty patterns.
		if tc.pattern == "" {
			continue
		}
		re, err := Compile(quoted)
		if err != nil {
			t.Errorf("Unexpected error compiling QuoteMeta(`%s`): %v", tc.pattern, err)
			continue
		}
		// Embed the raw pattern text and replace it via the quoted regexp.
		src := "abc" + tc.pattern + "def"
		if replaced := re.ReplaceAllString(src, repl); replaced != expected {
			t.Errorf("QuoteMeta(`%s`).Replace(`%s`,`%s`) = `%s`; want `%s`",
				tc.pattern, src, repl, replaced, expected)
		}
	}
}
// TestLiteralPrefix checks both results of LiteralPrefix against the
// literal and isLiteral columns of metaTests.
func TestLiteralPrefix(t *testing.T) {
	for _, tc := range metaTests {
		// LiteralPrefix is a method, so the pattern has to be compiled first.
		prefix, full := MustCompile(tc.pattern).LiteralPrefix()
		if full != tc.isLiteral {
			t.Errorf("LiteralPrefix(`%s`) = %t; want %t", tc.pattern, full, tc.isLiteral)
		}
		if prefix != tc.literal {
			t.Errorf("LiteralPrefix(`%s`) = `%s`; want `%s`", tc.pattern, prefix, tc.literal)
		}
	}
}
// numSubexpCase pairs a pattern with the subexpression count that
// TestNumSubexp expects NumSubexp to return for it.
type numSubexpCase struct {
	input    string // pattern to compile
	expected int    // expected NumSubexp result
}
// numSubexpCases is the table driving TestNumSubexp. The last two rows use
// escaped parentheses, which the expected counts do not include.
var numSubexpCases = []numSubexpCase{
{``, 0},
{`.*`, 0},
{`abba`, 0},
{`ab(b)a`, 1},
{`ab(.*)a`, 1},
{`(.*)ab(.*)a`, 2},
{`(.*)(ab)(.*)a`, 3},
{`(.*)((a)b)(.*)a`, 4},
{`(.*)(\(ab)(.*)a`, 3},
{`(.*)(\(a\)b)(.*)a`, 3},
}
// TestNumSubexp checks NumSubexp against every row of numSubexpCases.
func TestNumSubexp(t *testing.T) {
	for _, c := range numSubexpCases {
		if n := MustCompile(c.input).NumSubexp(); n != c.expected {
			t.Errorf("NumSubexp for %q returned %d, expected %d", c.input, n, c.expected)
		}
	}
}
// BenchmarkLiteral times matching the literal "y" against fifty 'x'
// characters followed by a single 'y'.
func BenchmarkLiteral(b *testing.B) {
	x := strings.Repeat("x", 50) + "y"
	b.StopTimer()
	re := MustCompile("y")
	b.StartTimer()
	for n := b.N; n > 0; n-- {
		if re.MatchString(x) {
			continue
		}
		println("no match!")
		break
	}
}
// BenchmarkNotLiteral times matching ".y" against fifty 'x' characters
// followed by a single 'y'.
func BenchmarkNotLiteral(b *testing.B) {
	x := strings.Repeat("x", 50) + "y"
	b.StopTimer()
	re := MustCompile(".y")
	b.StartTimer()
	for n := b.N; n > 0; n-- {
		if re.MatchString(x) {
			continue
		}
		println("no match!")
		break
	}
}
// BenchmarkMatchClass times matching the class [abcdw] against eighty 'x'
// characters followed by a single 'w'.
func BenchmarkMatchClass(b *testing.B) {
	b.StopTimer()
	x := strings.Repeat("xxxx", 20) + "w"
	re := MustCompile("[abcdw]")
	b.StartTimer()
	for n := b.N; n > 0; n-- {
		if re.MatchString(x) {
			continue
		}
		println("no match!")
		break
	}
}
// BenchmarkMatchClass_InRange times matching the class [ac] against eighty
// 'b' characters followed by a single 'c'.
func BenchmarkMatchClass_InRange(b *testing.B) {
	b.StopTimer()
	// 'b' lies between 'a' and 'c', so a range check on the character
	// class cannot reject it early.
	x := strings.Repeat("bbbb", 20) + "c"
	re := MustCompile("[ac]")
	b.StartTimer()
	for n := b.N; n > 0; n-- {
		if re.MatchString(x) {
			continue
		}
		println("no match!")
		break
	}
}
// BenchmarkReplaceAll times deleting the letters matched by [cjrw] from
// the lowercase alphabet.
func BenchmarkReplaceAll(b *testing.B) {
	x := "abcdefghijklmnopqrstuvwxyz"
	b.StopTimer()
	re := MustCompile("[cjrw]")
	b.StartTimer()
	for n := b.N; n > 0; n-- {
		re.ReplaceAllString(x, "")
	}
}
// BenchmarkAnchoredLiteralShortNonMatch times the anchored pattern
// "^zbc(d|e)" against the lowercase alphabet.
func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) {
	b.StopTimer()
	x := []byte("abcdefghijklmnopqrstuvwxyz")
	re := MustCompile("^zbc(d|e)")
	b.StartTimer()
	for n := b.N; n > 0; n-- {
		re.Match(x)
	}
}
// BenchmarkAnchoredLiteralLongNonMatch times the anchored pattern
// "^zbc(d|e)" against the lowercase alphabet doubled fifteen times.
func BenchmarkAnchoredLiteralLongNonMatch(b *testing.B) {
	b.StopTimer()
	x := []byte("abcdefghijklmnopqrstuvwxyz")
	// Each pass appends the slice to itself, doubling its length.
	for doublings := 15; doublings > 0; doublings-- {
		x = append(x, x...)
	}
	re := MustCompile("^zbc(d|e)")
	b.StartTimer()
	for n := b.N; n > 0; n-- {
		re.Match(x)
	}
}
// BenchmarkAnchoredShortMatch times the anchored pattern "^.bc(d|e)"
// against the lowercase alphabet.
func BenchmarkAnchoredShortMatch(b *testing.B) {
	b.StopTimer()
	x := []byte("abcdefghijklmnopqrstuvwxyz")
	re := MustCompile("^.bc(d|e)")
	b.StartTimer()
	for n := b.N; n > 0; n-- {
		re.Match(x)
	}
}
// BenchmarkAnchoredLongMatch times the anchored pattern "^.bc(d|e)"
// against the lowercase alphabet doubled fifteen times.
func BenchmarkAnchoredLongMatch(b *testing.B) {
	b.StopTimer()
	x := []byte("abcdefghijklmnopqrstuvwxyz")
	// Each pass appends the slice to itself, doubling its length.
	for doublings := 15; doublings > 0; doublings-- {
		x = append(x, x...)
	}
	re := MustCompile("^.bc(d|e)")
	b.StartTimer()
	for n := b.N; n > 0; n-- {
		re.Match(x)
	}
}
package regexp
import "exp/regexp/syntax"
// A queue is a 'sparse array' holding pending threads of execution.
// See http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html
//
// sparse is indexed by instruction pc and holds that pc's index in dense.
// A pc is on the queue only when its sparse slot points inside dense and
// the dense entry found there records the same pc.
type queue struct {
sparse []uint32
dense []entry
}
// An entry is an entry on a queue.
// It holds both the instruction pc and the actual thread.
// Some queue entries are just place holders so that the machine
// knows it has considered that pc. Such entries have t == nil.
type entry struct {
pc uint32
t *thread
}
// A thread is the state of a single path through the machine:
// an instruction and a corresponding capture array.
// See http://swtch.com/~rsc/regexp/regexp2.html
//
// cap is filled by copying the capture positions in effect when the
// thread is queued.
type thread struct {
inst *syntax.Inst
cap []int
}
// A machine holds all the state during an NFA simulation for p.
// Threads are recycled through pool rather than allocated anew for each
// queue entry.
type machine struct {
re *Regexp // corresponding Regexp
p *syntax.Prog // compiled program
q0, q1 queue // two queues for runq, nextq
pool []*thread // pool of available threads
matched bool // whether a match was found
matchcap []int // capture information for the match
}
// progMachine builds a machine for running the program p.
// Both queues are sized to the number of instructions in p, and the
// capture slice always has room for at least two positions.
func progMachine(p *syntax.Prog) *machine {
	n := len(p.Inst)
	ncap := p.NumCap
	if ncap < 2 {
		ncap = 2
	}
	return &machine{
		p:        p,
		q0:       queue{make([]uint32, n), make([]entry, 0, n)},
		q1:       queue{make([]uint32, n), make([]entry, 0, n)},
		matchcap: make([]int, ncap),
	}
}
// alloc returns a thread positioned at instruction i.
// A thread is taken from the free pool when one is available; otherwise a
// new one is created with capture storage matching the capacity of
// m.matchcap. Either way the capture slice is resized to len(m.matchcap).
func (m *machine) alloc(i *syntax.Inst) *thread {
	var t *thread
	if last := len(m.pool) - 1; last >= 0 {
		t, m.pool = m.pool[last], m.pool[:last]
	} else {
		t = &thread{cap: make([]int, cap(m.matchcap))}
	}
	t.inst = i
	t.cap = t.cap[:len(m.matchcap)]
	return t
}
// free puts t back on the pool of reusable threads.
func (m *machine) free(t *thread) {
	pool := append(m.pool, t)
	m.pool = pool
}
// match runs the machine over the input starting at pos.
// It reports whether a match was found.
// If so, m.matchcap holds the submatch information.
//
// The loop keeps a two-rune window over the input: rune/width describe
// the rune at pos and rune1/width1 the rune after it. This lets the
// empty-width flags for the next position be computed before stepping.
func (m *machine) match(i input, pos int) bool {
	startCond := m.re.cond
	if startCond == ^syntax.EmptyOp(0) { // impossible
		return false
	}
	m.matched = false
	for k := range m.matchcap {
		m.matchcap[k] = -1
	}
	runq, nextq := &m.q0, &m.q1
	rune, rune1 := endOfText, endOfText
	width, width1 := 0, 0
	rune, width = i.step(pos)
	if rune != endOfText {
		rune1, width1 = i.step(pos + width)
	}
	// TODO: Let caller specify the initial flag setting.
	// For now assume pos == 0 is beginning of text and
	// pos != 0 is not even beginning of line.
	// TODO: Word boundary.
	var flag syntax.EmptyOp
	if pos == 0 {
		flag = syntax.EmptyBeginText | syntax.EmptyBeginLine
	}
	// The initial flag describes the conditions holding at pos itself, so
	// end-of-line and end-of-text depend on the rune at pos, not on the
	// lookahead rune that follows it.
	if rune == '\n' {
		flag |= syntax.EmptyEndLine
	}
	if rune == endOfText {
		flag |= syntax.EmptyEndText
	}
	for {
		if len(runq.dense) == 0 {
			if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
				// Anchored match, past beginning of text.
				break
			}
			if m.matched {
				// Have match; finished exploring alternatives.
				break
			}
			if len(m.re.prefix) > 0 && rune1 != m.re.prefixRune && i.canCheckPrefix() {
				// Match requires literal prefix; fast search for it.
				advance := i.index(m.re, pos)
				if advance < 0 {
					break
				}
				pos += advance
				rune, width = i.step(pos)
				rune1, width1 = i.step(pos + width)
			}
		}
		if !m.matched {
			// No match yet: start a fresh thread at pos.
			if len(m.matchcap) > 0 {
				m.matchcap[0] = pos
			}
			m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag)
		}
		// Compute the flags for the position after the current rune:
		// rune now precedes that position and rune1 sits at it.
		// TODO: word boundary
		flag = 0
		if rune == '\n' {
			flag |= syntax.EmptyBeginLine
		}
		if rune1 == '\n' {
			flag |= syntax.EmptyEndLine
		}
		if rune1 == endOfText {
			flag |= syntax.EmptyEndText
		}
		m.step(runq, nextq, pos, pos+width, rune, flag)
		if width == 0 {
			// Nothing was consumed, so there is nowhere left to advance to.
			break
		}
		// Slide the window forward by one rune and swap the queues.
		pos += width
		rune, width = rune1, width1
		if rune != endOfText {
			rune1, width1 = i.step(pos + width)
		}
		runq, nextq = nextq, runq
	}
	m.clear(nextq)
	return m.matched
}
// clear returns every thread held on q to the free pool and empties q.
// Placeholder entries, which carry no thread, are skipped.
func (m *machine) clear(q *queue) {
	for k := range q.dense {
		if t := q.dense[k].t; t != nil {
			m.free(t)
		}
	}
	q.dense = q.dense[:0]
}
// step executes one step of the machine, running each of the threads
// on runq and appending new threads to nextq.
// The step processes the rune c (which may be endOfText),
// which starts at position pos and ends at nextPos.
// nextCond gives the setting for the empty-width flags after c.
// On return runq is empty and every thread that was on it has been freed.
func (m *machine) step(runq, nextq *queue, pos, nextPos, c int, nextCond syntax.EmptyOp) {
for j := 0; j < len(runq.dense); j++ {
d := &runq.dense[j]
t := d.t
if t == nil {
// Placeholder entry: it records a visited pc but carries no thread.
continue
}
/*
* If we support leftmost-longest matching:
if longest && matched && match[0] < t.cap[0] {
m.free(t)
continue
}
*/
i := t.inst
switch i.Op {
default:
panic("bad inst")
case syntax.InstMatch:
// Record where the match ends and publish this thread's captures.
if len(t.cap) > 0 {
t.cap[1] = pos
copy(m.matchcap, t.cap)
}
m.matched = true
// Free the threads queued after this one, then empty runq;
// the emptied queue also makes the outer loop stop.
for _, d := range runq.dense[j+1:] {
if d.t != nil {
m.free(d.t)
}
}
runq.dense = runq.dense[:0]
case syntax.InstRune:
// The thread survives into nextq only if its instruction accepts c.
if i.MatchRune(c) {
m.add(nextq, i.Out, nextPos, t.cap, nextCond)
}
}
// The thread is finished either way; add copied its captures if needed.
m.free(t)
}
runq.dense = runq.dense[:0]
}
// add adds an entry to q for pc, unless the q already has such an entry.
// It also recursively adds an entry for all instructions reachable from pc by following
// empty-width conditions satisfied by cond. pos gives the current position
// in the input.
// cap holds the capture positions in effect; it is modified temporarily
// while following capture instructions and restored before add returns.
func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond syntax.EmptyOp) {
// pc 0 is never queued.
if pc == 0 {
return
}
// Sparse-set membership test: pc is present only if its sparse slot
// points at a dense entry that records the same pc.
if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc {
return
}
// Append a placeholder entry first so recursive calls see pc as visited.
j := len(q.dense)
q.dense = q.dense[:j+1]
d := &q.dense[j]
d.t = nil
d.pc = pc
q.sparse[pc] = uint32(j)
i := &m.p.Inst[pc]
switch i.Op {
default:
panic("unhandled")
case syntax.InstFail:
// nothing
case syntax.InstAlt, syntax.InstAltMatch:
// Follow both branches, Out before Arg.
m.add(q, i.Out, pos, cap, cond)
m.add(q, i.Arg, pos, cap, cond)
case syntax.InstEmptyWidth:
// Continue only if every condition required by Arg is set in cond.
if syntax.EmptyOp(i.Arg)&^cond == 0 {
m.add(q, i.Out, pos, cap, cond)
}
case syntax.InstNop:
m.add(q, i.Out, pos, cap, cond)
case syntax.InstCapture:
if int(i.Arg) < len(cap) {
// Record pos in the capture slot for the recursive call only,
// then restore the previous value.
opos := cap[i.Arg]
cap[i.Arg] = pos
m.add(q, i.Out, pos, cap, cond)
cap[i.Arg] = opos
} else {
// Slot lies outside the tracked captures: just follow the instruction.
m.add(q, i.Out, pos, cap, cond)
}
case syntax.InstMatch, syntax.InstRune:
// These instructions need a real thread carrying its own copy of the captures.
t := m.alloc(i)
if len(t.cap) > 0 {
copy(t.cap, cap)
}
d.t = t
}
}
// empty is a zero-length slice that is not nil.
// doExecute returns it for a successful match when no captures were
// requested, which avoids allocating a fresh slice each time.
var empty = []int{}
// doExecute finds the leftmost match in the input and returns the
// positions of its subexpressions, limited to ncap values.
// It returns nil when there is no match, and the shared non-nil empty
// slice when there is a match but ncap is 0.
func (re *Regexp) doExecute(i input, pos int, ncap int) []int {
	m := re.get()
	m.matchcap = m.matchcap[:ncap]
	var result []int
	if m.match(i, pos) {
		if ncap == 0 {
			result = empty // empty but not nil
		} else {
			// Copy the captures out before the machine is handed back.
			result = make([]int, ncap)
			copy(result, m.matchcap)
		}
	}
	re.put(m)
	return result
}
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package regexp
import (
"fmt"
"strings"
"testing"
)
// For each pattern/text pair, what is the expected output of each function?
// We can derive the textual results from the indexed results, the non-submatch
// results from the submatched results, the single results from the 'all' results,
// and the byte results from the string results. Therefore the table includes
// only the FindAllStringSubmatchIndex result.
//
// FindTest is one such pair: pat is the pattern, text the input, and
// matches holds one index slice per expected match (nil for no match).
type FindTest struct {
pat string
text string
matches [][]int
}
// String describes the test by its pattern and text, both quoted, for use
// in failure messages.
func (t FindTest) String() string {
	const layout = "pat: %#q text: %#q"
	return fmt.Sprintf(layout, t.pat, t.text)
}
// findTests is the table shared by the Find* tests and the match tests.
// Each row gives a pattern, an input text, and the expected matches as
// built by build (nil when no match is expected).
var findTests = []FindTest{
{``, ``, build(1, 0, 0)},
{`^abcdefg`, "abcdefg", build(1, 0, 7)},
{`a+`, "baaab", build(1, 1, 4)},
{"abcd..", "abcdef", build(1, 0, 6)},
{`a`, "a", build(1, 0, 1)},
{`x`, "y", nil},
{`b`, "abc", build(1, 1, 2)},
{`.`, "a", build(1, 0, 1)},
{`.*`, "abcdef", build(1, 0, 6)},
{`^`, "abcde", build(1, 0, 0)},
{`$`, "abcde", build(1, 5, 5)},
{`^abcd$`, "abcd", build(1, 0, 4)},
{`^bcd'`, "abcdef", nil},
{`^abcd$`, "abcde", nil},
{`a+`, "baaab", build(1, 1, 4)},
{`a*`, "baaab", build(3, 0, 0, 1, 4, 5, 5)},
{`[a-z]+`, "abcd", build(1, 0, 4)},
{`[^a-z]+`, "ab1234cd", build(1, 2, 6)},
{`[a\-\]z]+`, "az]-bcz", build(2, 0, 4, 6, 7)},
{`[^\n]+`, "abcd\n", build(1, 0, 4)},
{`[日本語]+`, "日本語日本語", build(1, 0, 18)},
{`日本語+`, "日本語", build(1, 0, 9)},
{`日本語+`, "日本語語語語", build(1, 0, 18)},
{`()`, "", build(1, 0, 0, 0, 0)},
{`(a)`, "a", build(1, 0, 1, 0, 1)},
{`(.)(.)`, "日a", build(1, 0, 4, 0, 3, 3, 4)},
{`(.*)`, "", build(1, 0, 0, 0, 0)},
{`(.*)`, "abcd", build(1, 0, 4, 0, 4)},
{`(..)(..)`, "abcd", build(1, 0, 4, 0, 2, 2, 4)},
{`(([^xyz]*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 3, 4)},
{`((a|b|c)*(d))`, "abcd", build(1, 0, 4, 0, 4, 2, 3, 3, 4)},
{`(((a|b|c)*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 2, 3, 3, 4)},
{`\a\f\n\r\t\v`, "\a\f\n\r\t\v", build(1, 0, 6)},
{`[\a\f\n\r\t\v]+`, "\a\f\n\r\t\v", build(1, 0, 6)},
{`a*(|(b))c*`, "aacc", build(1, 0, 4, 2, 2, -1, -1)},
{`(.*).*`, "ab", build(1, 0, 2, 0, 2)},
{`[.]`, ".", build(1, 0, 1)},
{`/$`, "/abc/", build(1, 4, 5)},
{`/$`, "/abc", nil},
// multiple matches
{`.`, "abc", build(3, 0, 1, 1, 2, 2, 3)},
{`(.)`, "abc", build(3, 0, 1, 0, 1, 1, 2, 1, 2, 2, 3, 2, 3)},
{`.(.)`, "abcd", build(2, 0, 2, 1, 2, 2, 4, 3, 4)},
{`ab*`, "abbaab", build(3, 0, 3, 3, 4, 4, 6)},
{`a(b*)`, "abbaab", build(3, 0, 3, 1, 3, 3, 4, 4, 4, 4, 6, 5, 6)},
// fixed bugs
{`ab$`, "cab", build(1, 1, 3)},
{`axxb$`, "axxcb", nil},
{`data`, "daXY data", build(1, 5, 9)},
{`da(.)a$`, "daXY data", build(1, 5, 9, 7, 8)},
{`zx+`, "zzx", build(1, 1, 3)},
// can backslash-escape any punctuation
{`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`,
`!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)},
{`[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~]+`,
`!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)},
{"\\`", "`", build(1, 0, 1)},
{"[\\`]+", "`", build(1, 0, 1)},
// long set of matches (longer than startSize)
{
".",
"qwertyuiopasdfghjklzxcvbnm1234567890",
build(36, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20,
20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30,
30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36),
},
}
// build is a helper to construct a [][]int by extracting n sequences from x.
// The result represents n matches, each described by len(x)/n index values.
// It panics if n is not positive or if x does not divide evenly into n
// sequences, since either means the table entry is malformed.
func build(n int, x ...int) [][]int {
	// Validate up front. A bounds check inside the loop could never fire,
	// because n*(len(x)/n) <= len(x), so an uneven entry would otherwise
	// be truncated silently.
	if n <= 0 || len(x)%n != 0 {
		panic("invalid build entry")
	}
	runLength := len(x) / n
	ret := make([][]int, n)
	for i := range ret {
		ret[i] = make([]int, runLength)
		// copy stops at len(ret[i]), so only this run is taken.
		copy(ret[i], x[i*runLength:])
	}
	return ret
}
// First the simple cases.
// TestFind checks that each compiled pattern reports its source through
// String and that Find returns the text of the first expected match.
func TestFind(t *testing.T) {
	for _, test := range findTests {
		re := MustCompile(test.pat)
		if src := re.String(); src != test.pat {
			t.Errorf("String() = `%s`; should be `%s`", src, test.pat)
		}
		result := re.Find([]byte(test.text))
		if len(test.matches) == 0 && len(result) == 0 {
			// ok
		} else if test.matches == nil && result != nil {
			t.Errorf("expected no match; got one: %s", test)
		} else if test.matches != nil && result == nil {
			t.Errorf("expected match; got none: %s", test)
		} else if test.matches != nil && result != nil {
			first := test.matches[0]
			expect := test.text[first[0]:first[1]]
			if expect != string(result) {
				t.Errorf("expected %q got %q: %s", expect, result, test)
			}
		}
	}
}
// TestFindString checks that FindString returns the text of the first
// expected match for every findTests row.
func TestFindString(t *testing.T) {
	for _, test := range findTests {
		result := MustCompile(test.pat).FindString(test.text)
		if len(test.matches) == 0 && len(result) == 0 {
			// ok
		} else if test.matches == nil && result != "" {
			t.Errorf("expected no match; got one: %s", test)
		} else if test.matches != nil && result == "" {
			// An empty result is ambiguous: it means either no match or an
			// empty match, so it is an error only if the expected match is
			// non-empty.
			first := test.matches[0]
			if first[0] != first[1] {
				t.Errorf("expected match; got none: %s", test)
			}
		} else if test.matches != nil && result != "" {
			first := test.matches[0]
			expect := test.text[first[0]:first[1]]
			if expect != result {
				t.Errorf("expected %q got %q: %s", expect, result, test)
			}
		}
	}
}
// testFindIndex compares an index-pair result with the first expected
// match of test, reporting any disagreement through t.
func testFindIndex(test *FindTest, result []int, t *testing.T) {
	if len(test.matches) == 0 && len(result) == 0 {
		// ok
		return
	}
	if test.matches == nil && result != nil {
		t.Errorf("expected no match; got one: %s", test)
		return
	}
	if test.matches != nil && result == nil {
		t.Errorf("expected match; got none: %s", test)
		return
	}
	if test.matches != nil && result != nil {
		expect := test.matches[0]
		if expect[0] != result[0] || expect[1] != result[1] {
			t.Errorf("expected %v got %v: %s", expect, result, test)
		}
	}
}
// TestFindIndex checks FindIndex against every findTests row.
func TestFindIndex(t *testing.T) {
	for _, test := range findTests {
		re := MustCompile(test.pat)
		testFindIndex(&test, re.FindIndex([]byte(test.text)), t)
	}
}
// TestFindStringIndex checks FindStringIndex against every findTests row.
func TestFindStringIndex(t *testing.T) {
	for _, test := range findTests {
		re := MustCompile(test.pat)
		testFindIndex(&test, re.FindStringIndex(test.text), t)
	}
}
// TestFindReaderIndex checks FindReaderIndex against every findTests row,
// feeding the text through a strings reader.
func TestFindReaderIndex(t *testing.T) {
	for _, test := range findTests {
		re := MustCompile(test.pat)
		testFindIndex(&test, re.FindReaderIndex(strings.NewReader(test.text)), t)
	}
}
// Now come the simple All cases.
// TestFindAll checks that FindAll returns the text of every expected
// match, in order, for each findTests row.
func TestFindAll(t *testing.T) {
	for _, test := range findTests {
		result := MustCompile(test.pat).FindAll([]byte(test.text), -1)
		switch {
		case test.matches == nil && result == nil:
			// ok
		case test.matches == nil && result != nil:
			t.Errorf("expected no match; got one: %s", test)
		case test.matches != nil && result == nil:
			// Errorf rather than Fatalf: one failing row must not hide the
			// results of the remaining rows.
			t.Errorf("expected match; got none: %s", test)
		case test.matches != nil && result != nil:
			if len(test.matches) != len(result) {
				t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
				continue
			}
			for k, e := range test.matches {
				expect := test.text[e[0]:e[1]]
				if expect != string(result[k]) {
					t.Errorf("match %d: expected %q got %q: %s", k, expect, result[k], test)
				}
			}
		}
	}
}
// TestFindAllString checks that FindAllString returns the text of every
// expected match, in order, for each findTests row.
func TestFindAllString(t *testing.T) {
	for _, test := range findTests {
		result := MustCompile(test.pat).FindAllString(test.text, -1)
		switch {
		case test.matches == nil && result == nil:
			// ok
		case test.matches == nil && result != nil:
			t.Errorf("expected no match; got one: %s", test)
		case test.matches != nil && result == nil:
			t.Errorf("expected match; got none: %s", test)
		case test.matches != nil && result != nil:
			if len(test.matches) != len(result) {
				t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
				continue
			}
			for k, e := range test.matches {
				expect := test.text[e[0]:e[1]]
				if expect != result[k] {
					// Report the failing match and its index, not the
					// whole result slice.
					t.Errorf("match %d: expected %q got %q: %s", k, expect, result[k], test)
				}
			}
		}
	}
}
// testFindAllIndex compares a list of index pairs with the expected
// matches of test, looking only at the first two indexes of each match.
func testFindAllIndex(test *FindTest, result [][]int, t *testing.T) {
	wantNone, gotNone := test.matches == nil, result == nil
	if wantNone && gotNone {
		// ok
		return
	}
	if wantNone {
		t.Errorf("expected no match; got one: %s", test)
		return
	}
	if gotNone {
		t.Errorf("expected match; got none: %s", test)
		return
	}
	if len(test.matches) != len(result) {
		t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
		return
	}
	for k, e := range test.matches {
		got := result[k]
		if e[0] != got[0] || e[1] != got[1] {
			t.Errorf("match %d: expected %v got %v: %s", k, e, got, test)
		}
	}
}
// TestFindAllIndex checks FindAllIndex against every findTests row.
func TestFindAllIndex(t *testing.T) {
	for _, test := range findTests {
		re := MustCompile(test.pat)
		testFindAllIndex(&test, re.FindAllIndex([]byte(test.text), -1), t)
	}
}
// TestFindAllStringIndex checks FindAllStringIndex against every
// findTests row.
func TestFindAllStringIndex(t *testing.T) {
	for _, test := range findTests {
		re := MustCompile(test.pat)
		testFindAllIndex(&test, re.FindAllStringIndex(test.text, -1), t)
	}
}
// Now come the Submatch cases.
// testSubmatchBytes compares the byte-slice submatches in result with the
// index pairs in submatches for match number n of test.
// A pair starting with -1 marks a submatch that should be nil.
func testSubmatchBytes(test *FindTest, n int, submatches []int, result [][]byte, t *testing.T) {
	if len(submatches) != len(result)*2 {
		t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test)
		return
	}
	for idx, got := range result {
		lo, hi := submatches[2*idx], submatches[2*idx+1]
		if lo == -1 {
			if got != nil {
				t.Errorf("match %d: expected nil got %q: %s", n, result, test)
			}
			continue
		}
		expect := test.text[lo:hi]
		if expect != string(got) {
			t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test)
			return
		}
	}
}
// TestFindSubmatch checks FindSubmatch against the first expected match
// of every findTests row.
func TestFindSubmatch(t *testing.T) {
	for _, test := range findTests {
		result := MustCompile(test.pat).FindSubmatch([]byte(test.text))
		wantNone, gotNone := test.matches == nil, result == nil
		if wantNone && gotNone {
			// ok
			continue
		}
		if wantNone {
			t.Errorf("expected no match; got one: %s", test)
			continue
		}
		if gotNone {
			t.Errorf("expected match; got none: %s", test)
			continue
		}
		testSubmatchBytes(&test, 0, test.matches[0], result, t)
	}
}
// testSubmatchString compares the string submatches in result with the
// index pairs in submatches for match number n of test.
// A pair starting with -1 marks a submatch that should be the empty string.
func testSubmatchString(test *FindTest, n int, submatches []int, result []string, t *testing.T) {
	if len(submatches) != len(result)*2 {
		t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test)
		return
	}
	for idx, got := range result {
		lo, hi := submatches[2*idx], submatches[2*idx+1]
		if lo == -1 {
			if got != "" {
				t.Errorf("match %d: expected nil got %q: %s", n, result, test)
			}
			continue
		}
		expect := test.text[lo:hi]
		if expect != got {
			t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test)
			return
		}
	}
}
// TestFindStringSubmatch checks FindStringSubmatch against the first
// expected match of every findTests row.
func TestFindStringSubmatch(t *testing.T) {
	for _, test := range findTests {
		result := MustCompile(test.pat).FindStringSubmatch(test.text)
		wantNone, gotNone := test.matches == nil, result == nil
		if wantNone && gotNone {
			// ok
			continue
		}
		if wantNone {
			t.Errorf("expected no match; got one: %s", test)
			continue
		}
		if gotNone {
			t.Errorf("expected match; got none: %s", test)
			continue
		}
		testSubmatchString(&test, 0, test.matches[0], result, t)
	}
}
// testSubmatchIndices compares the expected and actual submatch index
// slices for match number n of test, element by element.
func testSubmatchIndices(test *FindTest, n int, expect, result []int, t *testing.T) {
	if len(expect) != len(result) {
		t.Errorf("match %d: expected %d matches; got %d: %s", n, len(expect)/2, len(result)/2, test)
		return
	}
	for k := range expect {
		if expect[k] != result[k] {
			t.Errorf("match %d: submatch error: expected %v got %v: %s", n, expect, result, test)
		}
	}
}
// testFindSubmatchIndex compares a submatch index result with the first
// expected match of test.
func testFindSubmatchIndex(test *FindTest, result []int, t *testing.T) {
	wantNone, gotNone := test.matches == nil, result == nil
	if wantNone && gotNone {
		// ok
		return
	}
	if wantNone {
		t.Errorf("expected no match; got one: %s", test)
		return
	}
	if gotNone {
		t.Errorf("expected match; got none: %s", test)
		return
	}
	testSubmatchIndices(test, 0, test.matches[0], result, t)
}
// TestFindSubmatchIndex checks FindSubmatchIndex against every findTests row.
func TestFindSubmatchIndex(t *testing.T) {
	for _, test := range findTests {
		re := MustCompile(test.pat)
		testFindSubmatchIndex(&test, re.FindSubmatchIndex([]byte(test.text)), t)
	}
}
// TestFindStringSubmatchIndex checks FindStringSubmatchIndex against every
// findTests row.
func TestFindStringSubmatchIndex(t *testing.T) {
	for _, test := range findTests {
		re := MustCompile(test.pat)
		testFindSubmatchIndex(&test, re.FindStringSubmatchIndex(test.text), t)
	}
}
// TestFindReaderSubmatchIndex checks FindReaderSubmatchIndex against every
// findTests row, feeding the text through a strings reader.
func TestFindReaderSubmatchIndex(t *testing.T) {
	for _, test := range findTests {
		re := MustCompile(test.pat)
		testFindSubmatchIndex(&test, re.FindReaderSubmatchIndex(strings.NewReader(test.text)), t)
	}
}
// Now come the monster AllSubmatch cases.
// TestFindAllSubmatch checks FindAllSubmatch against every expected match
// of every findTests row.
func TestFindAllSubmatch(t *testing.T) {
	for _, test := range findTests {
		result := MustCompile(test.pat).FindAllSubmatch([]byte(test.text), -1)
		wantNone, gotNone := test.matches == nil, result == nil
		if wantNone && gotNone {
			// ok
			continue
		}
		if wantNone {
			t.Errorf("expected no match; got one: %s", test)
			continue
		}
		if gotNone {
			t.Errorf("expected match; got none: %s", test)
			continue
		}
		if len(test.matches) != len(result) {
			t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
			continue
		}
		for k, match := range test.matches {
			testSubmatchBytes(&test, k, match, result[k], t)
		}
	}
}
// TestFindAllStringSubmatch checks FindAllStringSubmatch against every
// expected match of every findTests row.
func TestFindAllStringSubmatch(t *testing.T) {
	for _, test := range findTests {
		result := MustCompile(test.pat).FindAllStringSubmatch(test.text, -1)
		wantNone, gotNone := test.matches == nil, result == nil
		if wantNone && gotNone {
			// ok
			continue
		}
		if wantNone {
			t.Errorf("expected no match; got one: %s", test)
			continue
		}
		if gotNone {
			t.Errorf("expected match; got none: %s", test)
			continue
		}
		if len(test.matches) != len(result) {
			t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
			continue
		}
		for k, match := range test.matches {
			testSubmatchString(&test, k, match, result[k], t)
		}
	}
}
// testFindAllSubmatchIndex compares a list of submatch index slices with
// all expected matches of test.
func testFindAllSubmatchIndex(test *FindTest, result [][]int, t *testing.T) {
	wantNone, gotNone := test.matches == nil, result == nil
	if wantNone && gotNone {
		// ok
		return
	}
	if wantNone {
		t.Errorf("expected no match; got one: %s", test)
		return
	}
	if gotNone {
		t.Errorf("expected match; got none: %s", test)
		return
	}
	if len(test.matches) != len(result) {
		t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
		return
	}
	for k, match := range test.matches {
		testSubmatchIndices(test, k, match, result[k], t)
	}
}
// TestFindAllSubmatchIndex checks FindAllSubmatchIndex against every
// findTests row.
func TestFindAllSubmatchIndex(t *testing.T) {
	for _, test := range findTests {
		re := MustCompile(test.pat)
		testFindAllSubmatchIndex(&test, re.FindAllSubmatchIndex([]byte(test.text), -1), t)
	}
}
// TestFindAllStringSubmatchIndex checks FindAllStringSubmatchIndex (string
// input, n = -1) against every entry of findTests.
func TestFindAllStringSubmatchIndex(t *testing.T) {
	for _, test := range findTests {
		re := MustCompile(test.pat)
		got := re.FindAllStringSubmatchIndex(test.text, -1)
		testFindAllSubmatchIndex(&test, got, t)
	}
}
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package regexp implements a simple regular expression library.
//
// The syntax of the regular expressions accepted is the same
// general syntax used by Perl, Python, and other languages.
// More precisely, it is the syntax accepted by RE2 and described at
// http://code.google.com/p/re2/wiki/Syntax, except for \C.
//
// All characters are UTF-8-encoded code points.
//
// There are 16 methods of Regexp that match a regular expression and identify
// the matched text. Their names are matched by this regular expression:
//
// Find(All)?(String)?(Submatch)?(Index)?
//
// If 'All' is present, the routine matches successive non-overlapping
// matches of the entire expression. Empty matches abutting a preceding
// match are ignored. The return value is a slice containing the successive
// return values of the corresponding non-'All' routine. These routines take
// an extra integer argument, n; if n >= 0, the function returns at most n
// matches/submatches.
//
// If 'String' is present, the argument is a string; otherwise it is a slice
// of bytes; return values are adjusted as appropriate.
//
// If 'Submatch' is present, the return value is a slice identifying the
// successive submatches of the expression. Submatches are matches of
// parenthesized subexpressions within the regular expression, numbered from
// left to right in order of opening parenthesis. Submatch 0 is the match of
// the entire expression, submatch 1 the match of the first parenthesized
// subexpression, and so on.
//
// If 'Index' is present, matches and submatches are identified by byte index
// pairs within the input string: result[2*n:2*n+2] identifies the indexes of
// the nth submatch. The pair for n==0 identifies the match of the entire
// expression. If 'Index' is not present, the match is identified by the
// text of the match/submatch. If an index is negative, it means that
// subexpression did not match any string in the input.
//
// There is also a subset of the methods that can be applied to text read
// from a RuneReader:
//
// MatchReader, FindReaderIndex, FindReaderSubmatchIndex
//
// This set may grow. Note that regular expression matches may need to
// examine text beyond the text returned by a match, so the methods that
// match text from a RuneReader may read arbitrarily far into the input
// before returning.
//
// (There are a few other methods that do not match this pattern.)
//
package regexp
import (
"bytes"
"exp/regexp/syntax"
"io"
"os"
"strings"
"sync"
"utf8"
)
// debug is a package-level switch that starts out false.
// NOTE(review): nothing in this part of the file reads it; presumably the
// matching engine consults it for diagnostics — confirm before relying on it.
var debug bool
// Error is the package's own type for a parsing error; the underlying
// string is the message.
type Error string

// String returns the message carried by e.
func (e Error) String() string {
	msg := string(e)
	return msg
}
// Regexp is the representation of a compiled regular expression.
// The public interface is entirely through methods.
// A Regexp is safe for concurrent use by multiple goroutines.
type Regexp struct {
// read-only after Compile
expr string // as passed to Compile
prog *syntax.Prog // compiled program
// prefix and prefixComplete come from prog.Prefix(); prefixBytes and
// prefixRune are derived from prefix by Compile and stay at their zero
// values when prefix is empty.
prefix string // required prefix in unanchored matches
prefixBytes []byte // prefix, as a []byte
prefixComplete bool // prefix is the entire regexp
prefixRune int // first rune in prefix
// cond is the value of prog.StartCond(), recorded by Compile.
cond syntax.EmptyOp // empty-width conditions required at start of match
// cache of machines for running regexp
// mu guards machine; get and put hold it while popping from or
// appending to the slice.
mu sync.Mutex
machine []*machine
}
// String returns the expression text that was handed to Compile to build re.
func (re *Regexp) String() string {
	src := re.expr
	return src
}
// Compile parses expr with Perl syntax flags, compiles the parse tree to a
// program and wraps it in a Regexp. On a parse or compile failure it returns
// nil and the error.
//
// The literal prefix and start condition reported by the program are
// recorded on the Regexp at this point.
func Compile(expr string) (*Regexp, os.Error) {
	tree, err := syntax.Parse(expr, syntax.Perl)
	if err != nil {
		return nil, err
	}
	prog, err := syntax.Compile(tree)
	if err != nil {
		return nil, err
	}

	prefix, complete := prog.Prefix()
	re := &Regexp{
		expr:           expr,
		prog:           prog,
		prefix:         prefix,
		prefixComplete: complete,
		cond:           prog.StartCond(),
	}
	if prefix != "" {
		// TODO(rsc): Remove this allocation by adding
		// IndexString to package bytes.
		re.prefixBytes = []byte(prefix)
		re.prefixRune, _ = utf8.DecodeRuneInString(prefix)
	}
	return re, nil
}
// get hands out a machine for matching re. A machine is popped from re's
// cache when one is available; otherwise a new one is built from re.prog
// and bound to re. The cache is only touched while re.mu is held.
func (re *Regexp) get() *machine {
	var (
		m      *machine
		cached bool
	)
	re.mu.Lock()
	if last := len(re.machine) - 1; last >= 0 {
		m, cached = re.machine[last], true
		re.machine = re.machine[:last]
	}
	re.mu.Unlock()
	if cached {
		return m
	}

	// Cache was empty: allocate outside the lock.
	m = progMachine(re.prog)
	m.re = re
	return m
}
// put gives z back to re's machine cache so a later get can reuse it.
// The cache is never trimmed, so it grows to the largest number of
// machines that were in use at the same time. (It goes away when re is
// garbage collected.)
func (re *Regexp) put(z *machine) {
	re.mu.Lock()
	defer re.mu.Unlock()
	re.machine = append(re.machine, z)
}
// MustCompile behaves like Compile except that a failure to compile str
// causes a panic instead of an error return. It is intended for
// initializing package-level variables that hold compiled expressions.
func MustCompile(str string) *Regexp {
	re, err := Compile(str)
	if err == nil {
		return re
	}
	panic(`regexp: compiling "` + str + `": ` + err.String())
}
// NumSubexp reports how many parenthesized subexpressions re contains.
func (re *Regexp) NumSubexp() int {
	// NumCap has one slot for each ( and one for each ), so halve it;
	// the pair belonging to $0 is then discounted.
	pairs := re.prog.NumCap / 2
	return pairs - 1
}
// endOfText is the rune value the input implementations' step methods
// return (together with width 0) when there is nothing left to read.
const endOfText = -1
// input abstracts different representations of the input text. It provides
// one-character lookahead.
//
// Implementations in this file: inputString, inputBytes and inputReader.
type input interface {
// step returns the rune at byte offset pos and its width, or
// (endOfText, 0) when no rune is available there.
step(pos int) (rune int, width int) // advance one rune
canCheckPrefix() bool // can we look ahead without losing info?
// hasPrefix reports whether the input begins with re's literal prefix.
// The reader-backed input always answers false.
hasPrefix(re *Regexp) bool
// index returns the offset, relative to pos, of the first occurrence of
// re's literal prefix at or after pos, or -1 if there is none. The
// reader-backed input always answers -1.
index(re *Regexp, pos int) int
}
// inputString is the input implementation backed by a string.
type inputString struct {
	str string
}

// newInputString wraps str in an inputString.
func newInputString(str string) *inputString {
	in := new(inputString)
	in.str = str
	return in
}

// step decodes the rune that starts at byte offset pos and returns it with
// its width. At or past the end of the string it returns (endOfText, 0).
func (i *inputString) step(pos int) (int, int) {
	if pos >= len(i.str) {
		return endOfText, 0
	}
	return utf8.DecodeRuneInString(i.str[pos:])
}

// canCheckPrefix is always true for a string: the text can be inspected
// freely.
func (i *inputString) canCheckPrefix() bool {
	return true
}

// hasPrefix reports whether the string starts with re.prefix.
func (i *inputString) hasPrefix(re *Regexp) bool {
	text, want := i.str, re.prefix
	return strings.HasPrefix(text, want)
}

// index returns the offset of re.prefix within the text from pos onward,
// measured from pos, or -1 if it does not occur.
func (i *inputString) index(re *Regexp, pos int) int {
	rest := i.str[pos:]
	return strings.Index(rest, re.prefix)
}
// inputBytes is the input implementation backed by a byte slice.
type inputBytes struct {
	str []byte
}

// newInputBytes wraps str in an inputBytes; the slice is not copied.
func newInputBytes(str []byte) *inputBytes {
	in := new(inputBytes)
	in.str = str
	return in
}

// step decodes the rune that starts at byte offset pos and returns it with
// its width. At or past the end of the slice it returns (endOfText, 0).
func (i *inputBytes) step(pos int) (int, int) {
	if pos >= len(i.str) {
		return endOfText, 0
	}
	return utf8.DecodeRune(i.str[pos:])
}

// canCheckPrefix is always true for a byte slice: the text can be
// inspected freely.
func (i *inputBytes) canCheckPrefix() bool {
	return true
}

// hasPrefix reports whether the slice starts with re.prefixBytes.
func (i *inputBytes) hasPrefix(re *Regexp) bool {
	text, want := i.str, re.prefixBytes
	return bytes.HasPrefix(text, want)
}

// index returns the offset of re.prefixBytes within the text from pos
// onward, measured from pos, or -1 if it does not occur.
func (i *inputBytes) index(re *Regexp, pos int) int {
	rest := i.str[pos:]
	return bytes.Index(rest, re.prefixBytes)
}
// inputReader is the input implementation that pulls runes from an
// io.RuneReader.
type inputReader struct {
	r     io.RuneReader
	atEOT bool // set once ReadRune has returned an error
	pos   int  // sum of the widths of the runes read so far
}

// newInputReader wraps r, starting at position 0.
func newInputReader(r io.RuneReader) *inputReader {
	in := new(inputReader)
	in.r = r
	return in
}

// step reads the next rune from the reader and returns it with its width.
// Before the end of text has been seen, a request for any pos other than
// the current read position yields (endOfText, 0) without reading. A read
// error marks the end of text and also yields (endOfText, 0).
func (i *inputReader) step(pos int) (int, int) {
	if pos != i.pos && !i.atEOT {
		return endOfText, 0
	}
	r, w, err := i.r.ReadRune()
	if err == nil {
		i.pos += w
		return r, w
	}
	i.atEOT = true
	return endOfText, 0
}

// canCheckPrefix is always false for a reader.
func (i *inputReader) canCheckPrefix() bool {
	return false
}

// hasPrefix always reports false; the reader is not inspected.
func (i *inputReader) hasPrefix(re *Regexp) bool {
	return false
}

// index always returns -1; the reader is not searched.
func (i *inputReader) index(re *Regexp, pos int) int {
	return -1
}
// LiteralPrefix returns the literal string with which every match of re
// must begin. complete is true when that literal makes up the whole
// regular expression.
func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
	prefix, complete = re.prefix, re.prefixComplete
	return prefix, complete
}
// MatchReader reports whether re matches the text supplied by the
// RuneReader r: true for a match, false otherwise.
func (re *Regexp) MatchReader(r io.RuneReader) bool {
	m := re.doExecute(newInputReader(r), 0, 0)
	return m != nil
}
// MatchString reports whether re matches the string s: true for a match,
// false otherwise.
func (re *Regexp) MatchString(s string) bool {
	m := re.doExecute(newInputString(s), 0, 0)
	return m != nil
}
// Match reports whether re matches the byte slice b: true for a match,
// false otherwise.
func (re *Regexp) Match(b []byte) bool {
	m := re.doExecute(newInputBytes(b), 0, 0)
	return m != nil
}
// MatchReader compiles pattern and reports whether it matches the text read
// from r. If pattern does not compile, matched is false and the compile
// error is returned. Anything more involved should go through Compile and
// the full Regexp interface.
func MatchReader(pattern string, r io.RuneReader) (matched bool, error os.Error) {
	re, err := Compile(pattern)
	if err == nil {
		matched = re.MatchReader(r)
	}
	return matched, err
}
// MatchString compiles pattern and reports whether it matches s. If pattern
// does not compile, matched is false and the compile error is returned.
// Anything more involved should go through Compile and the full Regexp
// interface.
func MatchString(pattern string, s string) (matched bool, error os.Error) {
	re, err := Compile(pattern)
	if err == nil {
		matched = re.MatchString(s)
	}
	return matched, err
}
// Match compiles pattern and reports whether it matches the byte slice b.
// If pattern does not compile, matched is false and the compile error is
// returned. Anything more involved should go through Compile and the full
// Regexp interface.
func Match(pattern string, b []byte) (matched bool, error os.Error) {
	re, err := Compile(pattern)
	if err == nil {
		matched = re.Match(b)
	}
	return matched, err
}
// ReplaceAllString returns a copy of src with every match of re replaced by
// repl. The replacement is inserted verbatim: expressions such as \1 or $1
// are not interpreted.
func (re *Regexp) ReplaceAllString(src, repl string) string {
	constant := func(string) string { return repl }
	return re.ReplaceAllStringFunc(src, constant)
}
// ReplaceAllStringFunc returns a copy of src in which all matches for the
// Regexp have been replaced by the return value of function repl (whose
// first argument is the matched string). No support is provided for
// expressions (e.g. \1 or $1) in the replacement string.
func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
	lastMatchEnd := 0 // end position of the most recent match
	searchPos := 0    // position where we next look for a match
	// The string input holds nothing but src, so build it once instead of
	// once per search.
	in := newInputString(src)
	buf := new(bytes.Buffer)
	for searchPos <= len(src) {
		a := re.doExecute(in, searchPos, 2)
		if len(a) == 0 {
			break // no more matches
		}

		// Copy the unmatched characters before this match.
		buf.WriteString(src[lastMatchEnd:a[0]])

		// Now insert a copy of the replacement string, but not for a
		// match of the empty string immediately after another match.
		// (Otherwise, we get double replacement for patterns that
		// match both empty and nonempty strings.)
		if a[1] > lastMatchEnd || a[0] == 0 {
			buf.WriteString(repl(src[a[0]:a[1]]))
		}
		lastMatchEnd = a[1]

		// Advance past this match; always advance at least one character.
		_, width := utf8.DecodeRuneInString(src[searchPos:])
		switch {
		case searchPos+width > a[1]:
			searchPos += width
		case searchPos+1 > a[1]:
			// This clause is only needed at the end of the input
			// string. In that case, DecodeRuneInString returns width=0.
			searchPos++
		default:
			searchPos = a[1]
		}
	}

	// Copy the unmatched characters after the last match.
	buf.WriteString(src[lastMatchEnd:])
	return buf.String()
}
// ReplaceAll returns a copy of src with every match of re replaced by repl.
// The replacement is inserted verbatim: expressions such as \1 or $1 are
// not interpreted.
func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
	constant := func([]byte) []byte { return repl }
	return re.ReplaceAllFunc(src, constant)
}
// ReplaceAllFunc returns a copy of src in which all matches for the
// Regexp have been replaced by the return value of function repl (whose
// first argument is the matched []byte). No support is provided for
// expressions (e.g. \1 or $1) in the replacement text.
func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
	lastMatchEnd := 0 // end position of the most recent match
	searchPos := 0    // position where we next look for a match
	// The byte input holds nothing but src, so build it once instead of
	// once per search.
	in := newInputBytes(src)
	buf := new(bytes.Buffer)
	for searchPos <= len(src) {
		a := re.doExecute(in, searchPos, 2)
		if len(a) == 0 {
			break // no more matches
		}

		// Copy the unmatched characters before this match.
		buf.Write(src[lastMatchEnd:a[0]])

		// Now insert a copy of the replacement text, but not for a
		// match of the empty string immediately after another match.
		// (Otherwise, we get double replacement for patterns that
		// match both empty and nonempty strings.)
		if a[1] > lastMatchEnd || a[0] == 0 {
			buf.Write(repl(src[a[0]:a[1]]))
		}
		lastMatchEnd = a[1]

		// Advance past this match; always advance at least one character.
		_, width := utf8.DecodeRune(src[searchPos:])
		switch {
		case searchPos+width > a[1]:
			searchPos += width
		case searchPos+1 > a[1]:
			// This clause is only needed at the end of the input.
			// In that case, DecodeRune returns width=0.
			searchPos++
		default:
			searchPos = a[1]
		}
	}

	// Copy the unmatched characters after the last match.
	buf.Write(src[lastMatchEnd:])
	return buf.Bytes()
}
// specialBytes lists the bytes that QuoteMeta escapes.
var specialBytes = []byte(`\.+*?()|[]{}^$`)

// special reports whether b is one of specialBytes.
func special(b byte) bool {
	return bytes.IndexByte(specialBytes, b) != -1
}

// QuoteMeta returns s with a backslash placed in front of every regular
// expression metacharacter, so that the result is a regular expression
// matching the literal text s. For example, QuoteMeta(`[foo]`) returns
// `\[foo\]`.
func QuoteMeta(s string) string {
	// Worst case every byte needs an escape, hence twice the length.
	quoted := make([]byte, 0, 2*len(s))
	// Walking bytes rather than runes is fine: every metacharacter is ASCII.
	for i := 0; i < len(s); i++ {
		c := s[i]
		if special(c) {
			quoted = append(quoted, '\\')
		}
		quoted = append(quoted, c)
	}
	return string(quoted)
}
// allMatches finds successive matches of re and calls deliver once for each
// accepted match, passing the capture index slice returned by the search.
// The text searched is b if b is non-nil, otherwise s. At most n matches
// are delivered. An empty match that begins where the previous match ended
// is skipped and does not count toward n.
func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
	// The input wrappers hold nothing but the text, so a single one serves
	// every search below.
	var in input
	end := len(s)
	if b != nil {
		in = newInputBytes(b)
		end = len(b)
	} else {
		in = newInputString(s)
	}

	for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
		matches := re.doExecute(in, pos, re.prog.NumCap)
		if len(matches) == 0 {
			break
		}

		accept := true
		if matches[1] == pos {
			// We've found an empty match.
			if matches[0] == prevMatchEnd {
				// We don't allow an empty match right
				// after a previous match, so ignore it.
				accept = false
			}
			// Move forward by one rune so the next search makes
			// progress. step reports width 0 only at the end of the
			// text, in which case pos is pushed past end to stop.
			if _, width := in.step(pos); width > 0 {
				pos += width
			} else {
				pos = end + 1
			}
		} else {
			pos = matches[1]
		}
		prevMatchEnd = matches[1]

		if accept {
			deliver(matches)
			i++
		}
	}
}
// Find returns the text of the leftmost match of re in b, as a sub-slice of
// b. It returns nil if there is no match.
func (re *Regexp) Find(b []byte) []byte {
	loc := re.doExecute(newInputBytes(b), 0, 2)
	if loc != nil {
		return b[loc[0]:loc[1]]
	}
	return nil
}
// FindIndex returns a two-element slice giving the location of the leftmost
// match of re in b; the matched text is b[loc[0]:loc[1]].
// It returns nil if there is no match.
func (re *Regexp) FindIndex(b []byte) (loc []int) {
	if m := re.doExecute(newInputBytes(b), 0, 2); m != nil {
		loc = m[0:2]
	}
	return loc
}
// FindString returns the text of the leftmost match of re in s. The result
// is the empty string both when there is no match and when re matches an
// empty string; use FindStringIndex or FindStringSubmatch when the two
// cases have to be told apart.
func (re *Regexp) FindString(s string) string {
	loc := re.doExecute(newInputString(s), 0, 2)
	if loc != nil {
		return s[loc[0]:loc[1]]
	}
	return ""
}
// FindStringIndex returns a two-element slice giving the location of the
// leftmost match of re in s; the matched text is s[loc[0]:loc[1]].
// It returns nil if there is no match.
func (re *Regexp) FindStringIndex(s string) []int {
	var loc []int
	if m := re.doExecute(newInputString(s), 0, 2); m != nil {
		loc = m[0:2]
	}
	return loc
}
// FindReaderIndex returns a two-element slice giving the location of the
// leftmost match of re in the text read from the RuneReader: the match
// spans offsets loc[0] up to, but not including, loc[1] of that text.
// It returns nil if there is no match.
func (re *Regexp) FindReaderIndex(r io.RuneReader) []int {
	var loc []int
	if m := re.doExecute(newInputReader(r), 0, 2); m != nil {
		loc = m[0:2]
	}
	return loc
}
// FindSubmatch returns the text of the leftmost match of re in b followed
// by the text of each of its subexpressions, as described under 'Submatch'
// in the package comment. The entry for a subexpression whose start index
// is negative is left nil.
// It returns nil if there is no match.
func (re *Regexp) FindSubmatch(b []byte) [][]byte {
	caps := re.doExecute(newInputBytes(b), 0, re.prog.NumCap)
	if caps == nil {
		return nil
	}
	groups := make([][]byte, len(caps)/2)
	for g := 0; g < len(groups); g++ {
		lo, hi := caps[2*g], caps[2*g+1]
		if lo >= 0 {
			groups[g] = b[lo:hi]
		}
	}
	return groups
}
// FindSubmatchIndex returns the index pairs locating the leftmost match of
// re in b and each of its subexpressions, as described under 'Submatch' and
// 'Index' in the package comment.
// It returns nil if there is no match.
func (re *Regexp) FindSubmatchIndex(b []byte) []int {
	in := newInputBytes(b)
	return re.doExecute(in, 0, re.prog.NumCap)
}
// FindStringSubmatch returns the text of the leftmost match of re in s
// followed by the text of each of its subexpressions, as described under
// 'Submatch' in the package comment. The entry for a subexpression whose
// start index is negative is left as the empty string.
// It returns nil if there is no match.
func (re *Regexp) FindStringSubmatch(s string) []string {
	caps := re.doExecute(newInputString(s), 0, re.prog.NumCap)
	if caps == nil {
		return nil
	}
	groups := make([]string, len(caps)/2)
	for g := 0; g < len(groups); g++ {
		lo, hi := caps[2*g], caps[2*g+1]
		if lo >= 0 {
			groups[g] = s[lo:hi]
		}
	}
	return groups
}
// FindStringSubmatchIndex returns the index pairs locating the leftmost
// match of re in s and each of its subexpressions, as described under
// 'Submatch' and 'Index' in the package comment.
// It returns nil if there is no match.
func (re *Regexp) FindStringSubmatchIndex(s string) []int {
	in := newInputString(s)
	return re.doExecute(in, 0, re.prog.NumCap)
}
// FindReaderSubmatchIndex returns the index pairs locating the leftmost
// match of re in the text read from the RuneReader and each of its
// subexpressions, as described under 'Submatch' and 'Index' in the package
// comment. It returns nil if there is no match.
func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
	in := newInputReader(r)
	return re.doExecute(in, 0, re.prog.NumCap)
}
// startSize is the initial capacity given to the result slices that the
// FindAll* methods build up.
const startSize = 10 // The size at which to start a slice in the 'All' routines.
// FindAll is the 'All' version of Find: it returns the text of each
// successive match of re in b, as described under 'All' in the package
// comment. A negative n is replaced by len(b)+1 before searching.
// It returns nil if there is no match.
func (re *Regexp) FindAll(b []byte, n int) [][]byte {
	if n < 0 {
		n = len(b) + 1
	}
	found := make([][]byte, 0, startSize)
	collect := func(m []int) {
		found = append(found, b[m[0]:m[1]])
	}
	re.allMatches("", b, n, collect)
	if len(found) > 0 {
		return found
	}
	return nil
}
// FindAllIndex is the 'All' version of FindIndex: it returns the location
// of each successive match of re in b, as described under 'All' in the
// package comment. A negative n is replaced by len(b)+1 before searching.
// It returns nil if there is no match.
func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
	if n < 0 {
		n = len(b) + 1
	}
	found := make([][]int, 0, startSize)
	collect := func(m []int) {
		found = append(found, m[0:2])
	}
	re.allMatches("", b, n, collect)
	if len(found) > 0 {
		return found
	}
	return nil
}
// FindAllString is the 'All' version of FindString: it returns the text of
// each successive match of re in s, as described under 'All' in the package
// comment. A negative n is replaced by len(s)+1 before searching.
// It returns nil if there is no match.
func (re *Regexp) FindAllString(s string, n int) []string {
	if n < 0 {
		n = len(s) + 1
	}
	found := make([]string, 0, startSize)
	collect := func(m []int) {
		found = append(found, s[m[0]:m[1]])
	}
	re.allMatches(s, nil, n, collect)
	if len(found) > 0 {
		return found
	}
	return nil
}
// FindAllStringIndex is the 'All' version of FindStringIndex: it returns
// the location of each successive match of re in s, as described under
// 'All' in the package comment. A negative n is replaced by len(s)+1 before
// searching.
// It returns nil if there is no match.
func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
	if n < 0 {
		n = len(s) + 1
	}
	found := make([][]int, 0, startSize)
	collect := func(m []int) {
		found = append(found, m[0:2])
	}
	re.allMatches(s, nil, n, collect)
	if len(found) > 0 {
		return found
	}
	return nil
}
// FindAllSubmatch is the 'All' version of FindSubmatch: for each successive
// match of re in b it returns the text of the match and of its
// subexpressions, as described under 'All' in the package comment. The
// entry for a subexpression whose start index is negative is left nil.
// A negative n is replaced by len(b)+1 before searching.
// It returns nil if there is no match.
func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
	if n < 0 {
		n = len(b) + 1
	}
	found := make([][][]byte, 0, startSize)
	collect := func(m []int) {
		groups := make([][]byte, len(m)/2)
		for g := 0; g < len(groups); g++ {
			lo, hi := m[2*g], m[2*g+1]
			if lo >= 0 {
				groups[g] = b[lo:hi]
			}
		}
		found = append(found, groups)
	}
	re.allMatches("", b, n, collect)
	if len(found) > 0 {
		return found
	}
	return nil
}
// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex: for each
// successive match of re in b it returns the full slice of index pairs, as
// described under 'All' in the package comment. A negative n is replaced by
// len(b)+1 before searching.
// It returns nil if there is no match.
func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
	if n < 0 {
		n = len(b) + 1
	}
	found := make([][]int, 0, startSize)
	collect := func(m []int) {
		found = append(found, m)
	}
	re.allMatches("", b, n, collect)
	if len(found) > 0 {
		return found
	}
	return nil
}
// FindAllStringSubmatch is the 'All' version of FindStringSubmatch: for
// each successive match of re in s it returns the text of the match and of
// its subexpressions, as described under 'All' in the package comment. The
// entry for a subexpression whose start index is negative is left as the
// empty string. A negative n is replaced by len(s)+1 before searching.
// It returns nil if there is no match.
func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
	if n < 0 {
		n = len(s) + 1
	}
	found := make([][]string, 0, startSize)
	collect := func(m []int) {
		groups := make([]string, len(m)/2)
		for g := 0; g < len(groups); g++ {
			lo, hi := m[2*g], m[2*g+1]
			if lo >= 0 {
				groups[g] = s[lo:hi]
			}
		}
		found = append(found, groups)
	}
	re.allMatches(s, nil, n, collect)
	if len(found) > 0 {
		return found
	}
	return nil
}
// FindAllStringSubmatchIndex is the 'All' version of
// FindStringSubmatchIndex: for each successive match of re in s it returns
// the full slice of index pairs, as described under 'All' in the package
// comment. A negative n is replaced by len(s)+1 before searching.
// It returns nil if there is no match.
func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
	if n < 0 {
		n = len(s) + 1
	}
	found := make([][]int, 0, startSize)
	collect := func(m []int) {
		found = append(found, m)
	}
	re.allMatches(s, nil, n, collect)
	if len(found) > 0 {
		return found
	}
	return nil
}
......@@ -86,6 +86,7 @@ func Compile(re *Regexp) (*Prog, os.Error) {
// init gives the compiler a fresh Prog and emits an InstFail instruction
// into it.
func (c *compiler) init() {
	// NumCap starts at 2: the implicit ( and ) for the whole match, $0.
	c.p = &Prog{NumCap: 2}
	c.inst(InstFail)
}
......
......@@ -55,6 +55,61 @@ func (p *Prog) String() string {
return b.String()
}
// skipNop starts at pc and follows Out links through any no-op or capture
// instructions. It returns a pointer to the first instruction that is
// neither.
func (p *Prog) skipNop(pc uint32) *Inst {
	inst := &p.Inst[pc]
	for {
		if inst.Op != InstNop && inst.Op != InstCapture {
			break
		}
		inst = &p.Inst[inst.Out]
	}
	return inst
}
// Prefix returns the literal string that every match of the program must
// start with. complete is true when the instruction reached after the
// prefix is InstMatch, i.e. the prefix is the entire match.
func (p *Prog) Prefix() (prefix string, complete bool) {
	inst := p.skipNop(uint32(p.Start))

	// No leading single-rune instruction: the prefix is empty and no
	// buffer needs to be allocated.
	if !(inst.Op == InstRune && len(inst.Rune) == 1) {
		return "", inst.Op == InstMatch
	}

	// Collect runes for as long as single-rune instructions follow.
	var lit bytes.Buffer
	for {
		lit.WriteRune(inst.Rune[0])
		inst = p.skipNop(inst.Out)
		if inst.Op != InstRune || len(inst.Rune) != 1 {
			break
		}
	}
	prefix, complete = lit.String(), inst.Op == InstMatch
	return prefix, complete
}
// StartCond returns the union of the empty-width conditions found at the
// start of the program, which must hold in any match. Capture and no-op
// instructions are stepped over along the way. It returns ^EmptyOp(0) if
// an InstFail is reached, meaning no match is possible.
func (p *Prog) StartCond() EmptyOp {
	flags := EmptyOp(0)
	inst := &p.Inst[uint32(p.Start)]
	for {
		op := inst.Op
		if op == InstFail {
			return ^EmptyOp(0)
		}
		if op == InstEmptyWidth {
			flags |= EmptyOp(inst.Arg)
		} else if op != InstCapture && op != InstNop {
			// First instruction of any other kind ends the scan.
			break
		}
		inst = &p.Inst[inst.Out]
	}
	return flags
}
// MatchRune returns true if the instruction matches (and consumes) r.
// It should only be called when i.Op == InstRune.
func (i *Inst) MatchRune(r int) bool {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment