Commit a1e7cd97 authored by Russ Cox's avatar Russ Cox

exp/regexp: implement regexp API using exp/regexp/syntax

Still need to write tests for new syntax
and fix bugs that the tests find, but this
is a good check point.

All tests pass.

Compared against existing regexp:

benchmark                                old ns/op    new ns/op    delta
regexp.BenchmarkLiteral                       1869          620  -66.83%
regexp.BenchmarkNotLiteral                    9489         7823  -17.56%
regexp.BenchmarkMatchClass                   10372         8386  -19.15%
regexp.BenchmarkMatchClass_InRange           10800         7750  -28.24%
regexp.BenchmarkReplaceAll                   13492         8519  -36.86%
regexp.BenchmarkAnchoredLiteralShortNonMatch   747          339  -54.62%
regexp.BenchmarkAnchoredLiteralLongNonMatch    599          335  -44.07%
regexp.BenchmarkAnchoredShortMatch            2137          917  -57.09%
regexp.BenchmarkAnchoredLongMatch             2029          917  -54.81%

R=r, r
CC=golang-dev, sam.thorogood
parent fc2480da
# Copyright 2011 The Go Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
include ../../../
include ../../../Make.pkg
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package regexp
import (
var good_re = []string{
type stringError struct {
re string
err os.Error
var bad_re = []stringError{
{`*`, ErrBareClosure},
{`+`, ErrBareClosure},
{`?`, ErrBareClosure},
{`(abc`, ErrUnmatchedLpar},
{`abc)`, ErrUnmatchedRpar},
{`x[a-z`, ErrUnmatchedLbkt},
{`abc]`, ErrUnmatchedRbkt},
{`[z-a]`, ErrBadRange},
{`abc\`, ErrExtraneousBackslash},
{`a**`, ErrBadClosure},
{`a*+`, ErrBadClosure},
{`a??`, ErrBadClosure},
{`\x`, ErrBadBackslash},
func compileTest(t *testing.T, expr string, error os.Error) *Regexp {
re, err := Compile(expr)
if err != error {
t.Error("compiling `", expr, "`; unexpected error: ", err.String())
return re
func TestGoodCompile(t *testing.T) {
for i := 0; i < len(good_re); i++ {
compileTest(t, good_re[i], nil)
func TestBadCompile(t *testing.T) {
for i := 0; i < len(bad_re); i++ {
compileTest(t, bad_re[i].re, bad_re[i].err)
func matchTest(t *testing.T, test *FindTest) {
re := compileTest(t, test.pat, nil)
if re == nil {
m := re.MatchString(test.text)
if m != (len(test.matches) > 0) {
t.Errorf("MatchString failure on %s: %t should be %t", test, m, len(test.matches) > 0)
// now try bytes
m = re.Match([]byte(test.text))
if m != (len(test.matches) > 0) {
t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0)
func TestMatch(t *testing.T) {
for _, test := range findTests {
matchTest(t, &test)
func matchFunctionTest(t *testing.T, test *FindTest) {
m, err := MatchString(test.pat, test.text)
if err == nil {
if m != (len(test.matches) > 0) {
t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0)
func TestMatchFunction(t *testing.T) {
for _, test := range findTests {
matchFunctionTest(t, &test)
type ReplaceTest struct {
pattern, replacement, input, output string
var replaceTests = []ReplaceTest{
// Test empty input and/or replacement, with pattern that matches the empty string.
{"", "", "", ""},
{"", "x", "", "x"},
{"", "", "abc", "abc"},
{"", "x", "abc", "xaxbxcx"},
// Test empty input and/or replacement, with pattern that does not match the empty string.
{"b", "", "", ""},
{"b", "x", "", ""},
{"b", "", "abc", "ac"},
{"b", "x", "abc", "axc"},
{"y", "", "", ""},
{"y", "x", "", ""},
{"y", "", "abc", "abc"},
{"y", "x", "abc", "abc"},
// Multibyte characters -- verify that we don't try to match in the middle
// of a character.
{"[a-c]*", "x", "\u65e5", "x\u65e5x"},
{"[^\u65e5]", "x", "abc\u65e5def", "xxx\u65e5xxx"},
// Start and end of a string.
{"^[a-c]*", "x", "abcdabc", "xdabc"},
{"[a-c]*$", "x", "abcdabc", "abcdx"},
{"^[a-c]*$", "x", "abcdabc", "abcdabc"},
{"^[a-c]*", "x", "abc", "x"},
{"[a-c]*$", "x", "abc", "x"},
{"^[a-c]*$", "x", "abc", "x"},
{"^[a-c]*", "x", "dabce", "xdabce"},
{"[a-c]*$", "x", "dabce", "dabcex"},
{"^[a-c]*$", "x", "dabce", "dabce"},
{"^[a-c]*", "x", "", "x"},
{"[a-c]*$", "x", "", "x"},
{"^[a-c]*$", "x", "", "x"},
{"^[a-c]+", "x", "abcdabc", "xdabc"},
{"[a-c]+$", "x", "abcdabc", "abcdx"},
{"^[a-c]+$", "x", "abcdabc", "abcdabc"},
{"^[a-c]+", "x", "abc", "x"},
{"[a-c]+$", "x", "abc", "x"},
{"^[a-c]+$", "x", "abc", "x"},
{"^[a-c]+", "x", "dabce", "dabce"},
{"[a-c]+$", "x", "dabce", "dabce"},
{"^[a-c]+$", "x", "dabce", "dabce"},
{"^[a-c]+", "x", "", ""},
{"[a-c]+$", "x", "", ""},
{"^[a-c]+$", "x", "", ""},
// Other cases.
{"abc", "def", "abcdefg", "defdefg"},
{"bc", "BC", "abcbcdcdedef", "aBCBCdcdedef"},
{"abc", "", "abcdabc", "d"},
{"x", "xXx", "xxxXxxx", "xXxxXxxXxXxXxxXxxXx"},
{"abc", "d", "", ""},
{"abc", "d", "abc", "d"},
{".+", "x", "abc", "x"},
{"[a-c]*", "x", "def", "xdxexfx"},
{"[a-c]+", "x", "abcbcdcdedef", "xdxdedef"},
{"[a-c]*", "x", "abcbcdcdedef", "xdxdxexdxexfx"},
type ReplaceFuncTest struct {
pattern string
replacement func(string) string
input, output string
var replaceFuncTests = []ReplaceFuncTest{
{"[a-c]", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxayxbyxcydef"},
{"[a-c]+", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxabcydef"},
{"[a-c]*", func(s string) string { return "x" + s + "y" }, "defabcdef", "xydxyexyfxabcydxyexyfxy"},
func TestReplaceAll(t *testing.T) {
for _, tc := range replaceTests {
re, err := Compile(tc.pattern)
if err != nil {
t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err)
actual := re.ReplaceAllString(tc.input, tc.replacement)
if actual != tc.output {
t.Errorf("%q.Replace(%q,%q) = %q; want %q",
tc.pattern, tc.input, tc.replacement, actual, tc.output)
// now try bytes
actual = string(re.ReplaceAll([]byte(tc.input), []byte(tc.replacement)))
if actual != tc.output {
t.Errorf("%q.Replace(%q,%q) = %q; want %q",
tc.pattern, tc.input, tc.replacement, actual, tc.output)
func TestReplaceAllFunc(t *testing.T) {
for _, tc := range replaceFuncTests {
re, err := Compile(tc.pattern)
if err != nil {
t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err)
actual := re.ReplaceAllStringFunc(tc.input, tc.replacement)
if actual != tc.output {
t.Errorf("%q.ReplaceFunc(%q,%q) = %q; want %q",
tc.pattern, tc.input, tc.replacement, actual, tc.output)
// now try bytes
actual = string(re.ReplaceAllFunc([]byte(tc.input), func(s []byte) []byte { return []byte(tc.replacement(string(s))) }))
if actual != tc.output {
t.Errorf("%q.ReplaceFunc(%q,%q) = %q; want %q",
tc.pattern, tc.input, tc.replacement, actual, tc.output)
type MetaTest struct {
pattern, output, literal string
isLiteral bool
var metaTests = []MetaTest{
{``, ``, ``, true},
{`foo`, `foo`, `foo`, true},
{`foo\.\$`, `foo\\\.\\\$`, `foo.$`, true}, // has meta but no operator
{`foo.\$`, `foo\.\\\$`, `foo`, false}, // has escaped operators and real operators
{`!@#$%^&*()_+-=[{]}\|,<.>/?~`, `!@#\$%\^&\*\(\)_\+-=\[\{\]\}\\\|,<\.>/\?~`, `!@#`, false},
func TestQuoteMeta(t *testing.T) {
for _, tc := range metaTests {
// Verify that QuoteMeta returns the expected string.
quoted := QuoteMeta(tc.pattern)
if quoted != tc.output {
t.Errorf("QuoteMeta(`%s`) = `%s`; want `%s`",
tc.pattern, quoted, tc.output)
// Verify that the quoted string is in fact treated as expected
// by Compile -- i.e. that it matches the original, unquoted string.
if tc.pattern != "" {
re, err := Compile(quoted)
if err != nil {
t.Errorf("Unexpected error compiling QuoteMeta(`%s`): %v", tc.pattern, err)
src := "abc" + tc.pattern + "def"
repl := "xyz"
replaced := re.ReplaceAllString(src, repl)
expected := "abcxyzdef"
if replaced != expected {
t.Errorf("QuoteMeta(`%s`).Replace(`%s`,`%s`) = `%s`; want `%s`",
tc.pattern, src, repl, replaced, expected)
func TestLiteralPrefix(t *testing.T) {
for _, tc := range metaTests {
// Literal method needs to scan the pattern.
re := MustCompile(tc.pattern)
str, complete := re.LiteralPrefix()
if complete != tc.isLiteral {
t.Errorf("LiteralPrefix(`%s`) = %t; want %t", tc.pattern, complete, tc.isLiteral)
if str != tc.literal {
t.Errorf("LiteralPrefix(`%s`) = `%s`; want `%s`", tc.pattern, str, tc.literal)
type numSubexpCase struct {
input string
expected int
var numSubexpCases = []numSubexpCase{
{``, 0},
{`.*`, 0},
{`abba`, 0},
{`ab(b)a`, 1},
{`ab(.*)a`, 1},
{`(.*)ab(.*)a`, 2},
{`(.*)(ab)(.*)a`, 3},
{`(.*)((a)b)(.*)a`, 4},
{`(.*)(\(ab)(.*)a`, 3},
{`(.*)(\(a\)b)(.*)a`, 3},
func TestNumSubexp(t *testing.T) {
for _, c := range numSubexpCases {
re := MustCompile(c.input)
n := re.NumSubexp()
if n != c.expected {
t.Errorf("NumSubexp for %q returned %d, expected %d", c.input, n, c.expected)
func BenchmarkLiteral(b *testing.B) {
x := strings.Repeat("x", 50) + "y"
re := MustCompile("y")
for i := 0; i < b.N; i++ {
if !re.MatchString(x) {
println("no match!")
func BenchmarkNotLiteral(b *testing.B) {
x := strings.Repeat("x", 50) + "y"
re := MustCompile(".y")
for i := 0; i < b.N; i++ {
if !re.MatchString(x) {
println("no match!")
func BenchmarkMatchClass(b *testing.B) {
x := strings.Repeat("xxxx", 20) + "w"
re := MustCompile("[abcdw]")
for i := 0; i < b.N; i++ {
if !re.MatchString(x) {
println("no match!")
func BenchmarkMatchClass_InRange(b *testing.B) {
// 'b' is between 'a' and 'c', so the charclass
// range checking is no help here.
x := strings.Repeat("bbbb", 20) + "c"
re := MustCompile("[ac]")
for i := 0; i < b.N; i++ {
if !re.MatchString(x) {
println("no match!")
func BenchmarkReplaceAll(b *testing.B) {
x := "abcdefghijklmnopqrstuvwxyz"
re := MustCompile("[cjrw]")
for i := 0; i < b.N; i++ {
re.ReplaceAllString(x, "")
func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) {
x := []byte("abcdefghijklmnopqrstuvwxyz")
re := MustCompile("^zbc(d|e)")
for i := 0; i < b.N; i++ {
func BenchmarkAnchoredLiteralLongNonMatch(b *testing.B) {
x := []byte("abcdefghijklmnopqrstuvwxyz")
for i := 0; i < 15; i++ {
x = append(x, x...)
re := MustCompile("^zbc(d|e)")
for i := 0; i < b.N; i++ {
func BenchmarkAnchoredShortMatch(b *testing.B) {
x := []byte("abcdefghijklmnopqrstuvwxyz")
re := MustCompile("^.bc(d|e)")
for i := 0; i < b.N; i++ {
func BenchmarkAnchoredLongMatch(b *testing.B) {
x := []byte("abcdefghijklmnopqrstuvwxyz")
for i := 0; i < 15; i++ {
x = append(x, x...)
re := MustCompile("^.bc(d|e)")
for i := 0; i < b.N; i++ {
package regexp
import "exp/regexp/syntax"
// A queue is a 'sparse array' holding pending threads of execution.
// It pairs a sparse index with a dense list of entries: add looks at
// sparse[pc] to find the slot pc would occupy in dense, so membership
// can be tested without scanning the list.
// NOTE(review): the reference that followed "See" is missing from this copy.
type queue struct {
sparse []uint32 // indexed by pc; position of pc's entry in dense
dense []entry // entries in insertion order; emptied by reslicing to [:0]
// An entry is an entry on a queue.
// It holds both the instruction pc and the actual thread.
// Some queue entries are just place holders so that the machine
// knows it has considered that pc. Such entries have t == nil.
type entry struct {
pc uint32 // index of the instruction in the program's Inst slice
t *thread // thread parked at pc, or nil for a place holder
// A thread is the state of a single path through the machine:
// an instruction and a corresponding capture array.
// NOTE(review): the reference that followed "See" is missing from this copy.
type thread struct {
inst *syntax.Inst // instruction the thread will execute in step
cap []int // capture positions recorded along this path
// A machine holds all the state during an NFA simulation for p.
// The two queues are allocated once by progMachine; threads are
// recycled through pool by alloc and free.
type machine struct {
re *Regexp // corresponding Regexp
p *syntax.Prog // compiled program
q0, q1 queue // two queues for runq, nextq
pool []*thread // pool of available threads
matched bool // whether a match was found
matchcap []int // capture information for the match
// progMachine returns a new machine running the prog p.
// Each queue gets a sparse index and a dense capacity equal to the
// number of instructions in p, and matchcap gets p.NumCap slots,
// but never fewer than two.
func progMachine(p *syntax.Prog) *machine {
m := &machine{p: p}
n := len(m.p.Inst)
m.q0 = queue{make([]uint32, n), make([]entry, 0, n)}
m.q1 = queue{make([]uint32, n), make([]entry, 0, n)}
ncap := p.NumCap
if ncap < 2 {
// Slots 0 and 1 are written by match and step for the overall match.
ncap = 2
m.matchcap = make([]int, ncap)
return m
// alloc allocates a new thread with the given instruction.
// It uses the free pool if possible.
// The returned thread's cap slice has the same length as m.matchcap.
func (m *machine) alloc(i *syntax.Inst) *thread {
var t *thread
if n := len(m.pool); n > 0 {
// Take the most recently freed thread off the end of the pool.
t = m.pool[n-1]
m.pool = m.pool[:n-1]
} else {
t = new(thread)
// Allocate to matchcap's capacity, not its current length:
// doExecute reslices matchcap on every call.
t.cap = make([]int, cap(m.matchcap))
t.cap = t.cap[:len(m.matchcap)]
t.inst = i
return t
// free returns t to the free pool.
// The thread's fields are left as they are; it is simply appended to
// m.pool, from which alloc hands it out again.
func (m *machine) free(t *thread) {
m.pool = append(m.pool, t)
// match runs the machine over the input starting at pos.
// It reports whether a match was found.
// If so, m.matchcap holds the submatch information.
//
// NOTE(review): this copy of the function is damaged. Closing braces,
// bare break/continue/return statements and several expressions (the
// right-hand side of startCond, the operands of the prefix check and
// the first argument to i.index) are missing, so the control flow
// below cannot be read literally.
func (m *machine) match(i input, pos int) bool {
startCond :=
if startCond == ^syntax.EmptyOp(0) { // impossible
return false
m.matched = false
// Reset every capture slot; a negative position means "did not match".
for i := range m.matchcap {
m.matchcap[i] = -1
runq, nextq := &m.q0, &m.q1
// rune/width describe the input at pos; rune1/width1 are one rune
// of lookahead beyond it.
rune, rune1 := endOfText, endOfText
width, width1 := 0, 0
rune, width = i.step(pos)
if rune != endOfText {
rune1, width1 = i.step(pos + width)
// TODO: Let caller specify the initial flag setting.
// For now assume pos == 0 is beginning of text and
// pos != 0 is not even beginning of line.
// TODO: Word boundary.
var flag syntax.EmptyOp
if pos == 0 {
flag = syntax.EmptyBeginText | syntax.EmptyBeginLine
// Update flag using lookahead rune.
if rune1 == '\n' {
flag |= syntax.EmptyEndLine
if rune1 == endOfText {
flag |= syntax.EmptyEndText
for {
// With no live threads, decide whether the search can stop or skip ahead.
if len(runq.dense) == 0 {
if startCond&syntax.EmptyBeginText != 0 && pos != 0 {
// Anchored match, past beginning of text.
if m.matched {
// Have match; finished exploring alternatives.
if len( > 0 && rune1 != && i.canCheckPrefix() {
// Match requires literal prefix; fast search for it.
advance := i.index(, pos)
if advance < 0 {
pos += advance
rune, width = i.step(pos)
rune1, width1 = i.step(pos + width)
// Until a match has been found, start a new thread at the current
// position, recording pos as the start of the overall match.
if !m.matched {
if len(m.matchcap) > 0 {
m.matchcap[0] = pos
m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag)
// TODO: word boundary
// Recompute flag as the empty-width conditions that hold after
// rune; it is passed to step as nextCond.
flag = 0
if rune == '\n' {
flag |= syntax.EmptyBeginLine
if rune1 == '\n' {
flag |= syntax.EmptyEndLine
if rune1 == endOfText {
flag |= syntax.EmptyEndText
m.step(runq, nextq, pos, pos+width, rune, flag)
if width == 0 {
// Advance one rune, shifting the lookahead into the current slot.
pos += width
rune, width = rune1, width1
if rune != endOfText {
rune1, width1 = i.step(pos + width)
// Threads that step added to nextq become the run queue for the next rune.
runq, nextq = nextq, runq
return m.matched
// clear frees all threads on the thread queue.
// After the loop the queue is emptied by reslicing dense to length zero.
// NOTE(review): the statement inside the d.t != nil check is missing
// from this copy, as are the closing braces.
func (m *machine) clear(q *queue) {
for _, d := range q.dense {
if d.t != nil {
q.dense = q.dense[:0]
// step executes one step of the machine, running each of the threads
// on runq and appending new threads to nextq.
// The step processes the rune c (which may be endOfText),
// which starts at position pos and ends at nextPos.
// nextCond gives the setting for the empty-width flags after c.
//
// NOTE(review): this copy is damaged. The two lines about
// leftmost-longest matching appear to be the remains of a commented-out
// block whose delimiters were lost, and closing braces, the switch's
// default label and the statements inside several if bodies are missing.
func (m *machine) step(runq, nextq *queue, pos, nextPos, c int, nextCond syntax.EmptyOp) {
for j := 0; j < len(runq.dense); j++ {
d := &runq.dense[j]
t := d.t
// Entries with t == nil are place holders and carry no thread.
if t == nil {
* If we support leftmost-longest matching:
if longest && matched && match[0] < t.cap[0] {
i := t.inst
switch i.Op {
panic("bad inst")
case syntax.InstMatch:
// Record pos as the end of the overall match and publish the
// thread's captures as the machine's result.
if len(t.cap) > 0 {
t.cap[1] = pos
copy(m.matchcap, t.cap)
m.matched = true
// The entries after j are visited and the run queue is then emptied.
for _, d := range runq.dense[j+1:] {
if d.t != nil {
runq.dense = runq.dense[:0]
case syntax.InstRune:
// If the instruction accepts c, continue this path at i.Out on nextq.
if i.MatchRune(c) {
m.add(nextq, i.Out, nextPos, t.cap, nextCond)
runq.dense = runq.dense[:0]
// add adds an entry to q for pc, unless the q already has such an entry.
// It also recursively adds an entry for all instructions reachable from pc by following
// empty-width conditions satisfied by cond. pos gives the current position
// in the input.
// cap is the capture state copied into any thread that is created; add
// overwrites a slot of cap while following a capture instruction and
// restores the previous value afterwards.
// NOTE(review): closing braces and the bodies of the first two if
// statements are missing from this copy.
func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond syntax.EmptyOp) {
if pc == 0 {
// Membership test: pc is on q only if its sparse slot points at a
// dense entry in range that names pc.
if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc {
// Append a place holder entry (t == nil) for pc. It is given a
// thread below only for InstMatch and InstRune.
j := len(q.dense)
q.dense = q.dense[:j+1]
d := &q.dense[j]
d.t = nil
d.pc = pc
q.sparse[pc] = uint32(j)
i := &m.p.Inst[pc]
switch i.Op {
case syntax.InstFail:
// nothing
case syntax.InstAlt, syntax.InstAltMatch:
// Explore both branches, i.Out first.
m.add(q, i.Out, pos, cap, cond)
m.add(q, i.Arg, pos, cap, cond)
case syntax.InstEmptyWidth:
// Follow the edge only if every condition in i.Arg is set in cond.
if syntax.EmptyOp(i.Arg)&^cond == 0 {
m.add(q, i.Out, pos, cap, cond)
case syntax.InstNop:
m.add(q, i.Out, pos, cap, cond)
case syntax.InstCapture:
if int(i.Arg) < len(cap) {
// Record pos in the slot for the recursive call, then put
// the old value back.
opos := cap[i.Arg]
cap[i.Arg] = pos
m.add(q, i.Out, pos, cap, cond)
cap[i.Arg] = opos
} else {
// The slot is outside cap; continue without recording.
m.add(q, i.Out, pos, cap, cond)
case syntax.InstMatch, syntax.InstRune:
// These get a real thread holding its own copy of the captures.
t := m.alloc(i)
if len(t.cap) > 0 {
copy(t.cap, cap)
d.t = t
// empty is a shared zero-length slice that is deliberately not nil.
// doExecute returns it for a successful match when the caller asked
// for 0 captures, which avoids allocating a result each time.
var empty = []int{}
// doExecute finds the leftmost match in the input and returns
// the position of its subexpressions.
// It returns nil when there is no match. ncap is the number of capture
// slots wanted; on success the result is a newly allocated slice of
// that length, or the shared empty slice when ncap is 0.
// NOTE(review): closing braces are missing from this copy, and other
// statements may be missing as well.
func (re *Regexp) doExecute(i input, pos int, ncap int) []int {
m := re.get()
// Limit the machine's capture slice to what the caller asked for.
m.matchcap = m.matchcap[:ncap]
if !m.match(i, pos) {
return nil
if ncap == 0 {
return empty // empty but not nil
// Copy the captures so the result does not alias m.matchcap.
cap := make([]int, ncap)
copy(cap, m.matchcap)
return cap
// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package regexp
import (
// For each pattern/text pair, what is the expected output of each function?
// We can derive the textual results from the indexed results, the non-submatch
// results from the submatched results, the single results from the 'all' results,
// and the byte results from the string results. Therefore the table includes
// only the FindAllStringSubmatchIndex result.
// FindTest is one row of the find test table: a pattern, the text to
// search, and the expected result.
type FindTest struct {
	pat     string  // regular expression source
	text    string  // input to search
	matches [][]int // expected index values, one slice per match; nil means no match
}

// String formats the pattern and text of the test for use in failure
// messages. The expected matches are not included.
func (t FindTest) String() string {
	return fmt.Sprintf("pat: %#q text: %#q", t.pat, t.text)
}
var findTests = []FindTest{
{``, ``, build(1, 0, 0)},
{`^abcdefg`, "abcdefg", build(1, 0, 7)},
{`a+`, "baaab", build(1, 1, 4)},
{"abcd..", "abcdef", build(1, 0, 6)},
{`a`, "a", build(1, 0, 1)},
{`x`, "y", nil},
{`b`, "abc", build(1, 1, 2)},
{`.`, "a", build(1, 0, 1)},
{`.*`, "abcdef", build(1, 0, 6)},
{`^`, "abcde", build(1, 0, 0)},
{`$`, "abcde", build(1, 5, 5)},
{`^abcd$`, "abcd", build(1, 0, 4)},
{`^bcd'`, "abcdef", nil},
{`^abcd$`, "abcde", nil},
{`a+`, "baaab", build(1, 1, 4)},
{`a*`, "baaab", build(3, 0, 0, 1, 4, 5, 5)},
{`[a-z]+`, "abcd", build(1, 0, 4)},
{`[^a-z]+`, "ab1234cd", build(1, 2, 6)},
{`[a\-\]z]+`, "az]-bcz", build(2, 0, 4, 6, 7)},
{`[^\n]+`, "abcd\n", build(1, 0, 4)},
{`[日本語]+`, "日本語日本語", build(1, 0, 18)},
{`日本語+`, "日本語", build(1, 0, 9)},
{`日本語+`, "日本語語語語", build(1, 0, 18)},
{`()`, "", build(1, 0, 0, 0, 0)},
{`(a)`, "a", build(1, 0, 1, 0, 1)},
{`(.)(.)`, "日a", build(1, 0, 4, 0, 3, 3, 4)},
{`(.*)`, "", build(1, 0, 0, 0, 0)},
{`(.*)`, "abcd", build(1, 0, 4, 0, 4)},
{`(..)(..)`, "abcd", build(1, 0, 4, 0, 2, 2, 4)},
{`(([^xyz]*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 3, 4)},
{`((a|b|c)*(d))`, "abcd", build(1, 0, 4, 0, 4, 2, 3, 3, 4)},
{`(((a|b|c)*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 2, 3, 3, 4)},
{`\a\f\n\r\t\v`, "\a\f\n\r\t\v", build(1, 0, 6)},
{`[\a\f\n\r\t\v]+`, "\a\f\n\r\t\v", build(1, 0, 6)},
{`a*(|(b))c*`, "aacc", build(1, 0, 4, 2, 2, -1, -1)},
{`(.*).*`, "ab", build(1, 0, 2, 0, 2)},
{`[.]`, ".", build(1, 0, 1)},
{`/$`, "/abc/", build(1, 4, 5)},
{`/$`, "/abc", nil},
// multiple matches
{`.`, "abc", build(3, 0, 1, 1, 2, 2, 3)},
{`(.)`, "abc", build(3, 0, 1, 0, 1, 1, 2, 1, 2, 2, 3, 2, 3)},
{`.(.)`, "abcd", build(2, 0, 2, 1, 2, 2, 4, 3, 4)},
{`ab*`, "abbaab", build(3, 0, 3, 3, 4, 4, 6)},
{`a(b*)`, "abbaab", build(3, 0, 3, 1, 3, 3, 4, 4, 4, 4, 6, 5, 6)},
// fixed bugs
{`ab$`, "cab", build(1, 1, 3)},
{`axxb$`, "axxcb", nil},
{`data`, "daXY data", build(1, 5, 9)},
{`da(.)a$`, "daXY data", build(1, 5, 9, 7, 8)},
{`zx+`, "zzx", build(1, 1, 3)},
// can backslash-escape any punctuation
`!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)},
`!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)},
{"\\`", "`", build(1, 0, 1)},
{"[\\`]+", "`", build(1, 0, 1)},
// long set of matches (longer than startSize)
build(36, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20,
20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30,
30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36),
// build is a helper to construct a [][]int by extracting n sequences from x.
// This represents n matches, each described by len(x)/n index values
// taken from x in order. n must be positive: the run length is computed
// as len(x)/n.
func build(n int, x ...int) [][]int {
	ret := make([][]int, n)
	runLength := len(x) / n
	j := 0
	for i := range ret {
		ret[i] = make([]int, runLength)
		// copy stops at len(ret[i]), so only the next runLength values are taken.
		copy(ret[i], x[j:])
		j += runLength
		if j > len(x) {
			panic("invalid build entry")
		}
	}
	return ret
}
// First the simple cases.
func TestFind(t *testing.T) {
for _, test := range findTests {
re := MustCompile(test.pat)
if re.String() != test.pat {
t.Errorf("String() = `%s`; should be `%s`", re.String(), test.pat)
result := re.Find([]byte(test.text))
switch {
case len(test.matches) == 0 && len(result) == 0:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Errorf("expected match; got none: %s", test)
case test.matches != nil && result != nil:
expect := test.text[test.matches[0][0]:test.matches[0][1]]
if expect != string(result) {
t.Errorf("expected %q got %q: %s", expect, result, test)
func TestFindString(t *testing.T) {
for _, test := range findTests {
result := MustCompile(test.pat).FindString(test.text)
switch {
case len(test.matches) == 0 && len(result) == 0:
// ok
case test.matches == nil && result != "":
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == "":
// Tricky because an empty result has two meanings: no match or empty match.
if test.matches[0][0] != test.matches[0][1] {
t.Errorf("expected match; got none: %s", test)
case test.matches != nil && result != "":
expect := test.text[test.matches[0][0]:test.matches[0][1]]
if expect != result {
t.Errorf("expected %q got %q: %s", expect, result, test)
func testFindIndex(test *FindTest, result []int, t *testing.T) {
switch {
case len(test.matches) == 0 && len(result) == 0:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Errorf("expected match; got none: %s", test)
case test.matches != nil && result != nil:
expect := test.matches[0]
if expect[0] != result[0] || expect[1] != result[1] {
t.Errorf("expected %v got %v: %s", expect, result, test)
func TestFindIndex(t *testing.T) {
for _, test := range findTests {
testFindIndex(&test, MustCompile(test.pat).FindIndex([]byte(test.text)), t)
func TestFindStringIndex(t *testing.T) {
for _, test := range findTests {
testFindIndex(&test, MustCompile(test.pat).FindStringIndex(test.text), t)
func TestFindReaderIndex(t *testing.T) {
for _, test := range findTests {
testFindIndex(&test, MustCompile(test.pat).FindReaderIndex(strings.NewReader(test.text)), t)
// Now come the simple All cases.
func TestFindAll(t *testing.T) {
for _, test := range findTests {
result := MustCompile(test.pat).FindAll([]byte(test.text), -1)
switch {
case test.matches == nil && result == nil:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Fatalf("expected match; got none: %s", test)
case test.matches != nil && result != nil:
if len(test.matches) != len(result) {
t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
for k, e := range test.matches {
expect := test.text[e[0]:e[1]]
if expect != string(result[k]) {
t.Errorf("match %d: expected %q got %q: %s", k, expect, result[k], test)
func TestFindAllString(t *testing.T) {
for _, test := range findTests {
result := MustCompile(test.pat).FindAllString(test.text, -1)
switch {
case test.matches == nil && result == nil:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Errorf("expected match; got none: %s", test)
case test.matches != nil && result != nil:
if len(test.matches) != len(result) {
t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
for k, e := range test.matches {
expect := test.text[e[0]:e[1]]
if expect != result[k] {
t.Errorf("expected %q got %q: %s", expect, result, test)
func testFindAllIndex(test *FindTest, result [][]int, t *testing.T) {
switch {
case test.matches == nil && result == nil:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Errorf("expected match; got none: %s", test)
case test.matches != nil && result != nil:
if len(test.matches) != len(result) {
t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
for k, e := range test.matches {
if e[0] != result[k][0] || e[1] != result[k][1] {
t.Errorf("match %d: expected %v got %v: %s", k, e, result[k], test)
func TestFindAllIndex(t *testing.T) {
for _, test := range findTests {
testFindAllIndex(&test, MustCompile(test.pat).FindAllIndex([]byte(test.text), -1), t)
func TestFindAllStringIndex(t *testing.T) {
for _, test := range findTests {
testFindAllIndex(&test, MustCompile(test.pat).FindAllStringIndex(test.text, -1), t)
// Now come the Submatch cases.
func testSubmatchBytes(test *FindTest, n int, submatches []int, result [][]byte, t *testing.T) {
if len(submatches) != len(result)*2 {
t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test)
for k := 0; k < len(submatches); k += 2 {
if submatches[k] == -1 {
if result[k/2] != nil {
t.Errorf("match %d: expected nil got %q: %s", n, result, test)
expect := test.text[submatches[k]:submatches[k+1]]
if expect != string(result[k/2]) {
t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test)
func TestFindSubmatch(t *testing.T) {
for _, test := range findTests {
result := MustCompile(test.pat).FindSubmatch([]byte(test.text))
switch {
case test.matches == nil && result == nil:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Errorf("expected match; got none: %s", test)
case test.matches != nil && result != nil:
testSubmatchBytes(&test, 0, test.matches[0], result, t)
func testSubmatchString(test *FindTest, n int, submatches []int, result []string, t *testing.T) {
if len(submatches) != len(result)*2 {
t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test)
for k := 0; k < len(submatches); k += 2 {
if submatches[k] == -1 {
if result[k/2] != "" {
t.Errorf("match %d: expected nil got %q: %s", n, result, test)
expect := test.text[submatches[k]:submatches[k+1]]
if expect != result[k/2] {
t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test)
func TestFindStringSubmatch(t *testing.T) {
for _, test := range findTests {
result := MustCompile(test.pat).FindStringSubmatch(test.text)
switch {
case test.matches == nil && result == nil:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Errorf("expected match; got none: %s", test)
case test.matches != nil && result != nil:
testSubmatchString(&test, 0, test.matches[0], result, t)
func testSubmatchIndices(test *FindTest, n int, expect, result []int, t *testing.T) {
if len(expect) != len(result) {
t.Errorf("match %d: expected %d matches; got %d: %s", n, len(expect)/2, len(result)/2, test)
for k, e := range expect {
if e != result[k] {
t.Errorf("match %d: submatch error: expected %v got %v: %s", n, expect, result, test)
func testFindSubmatchIndex(test *FindTest, result []int, t *testing.T) {
switch {
case test.matches == nil && result == nil:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Errorf("expected match; got none: %s", test)
case test.matches != nil && result != nil:
testSubmatchIndices(test, 0, test.matches[0], result, t)
func TestFindSubmatchIndex(t *testing.T) {
for _, test := range findTests {
testFindSubmatchIndex(&test, MustCompile(test.pat).FindSubmatchIndex([]byte(test.text)), t)
func TestFindStringSubmatchIndex(t *testing.T) {
for _, test := range findTests {
testFindSubmatchIndex(&test, MustCompile(test.pat).FindStringSubmatchIndex(test.text), t)
func TestFindReaderSubmatchIndex(t *testing.T) {
for _, test := range findTests {
testFindSubmatchIndex(&test, MustCompile(test.pat).FindReaderSubmatchIndex(strings.NewReader(test.text)), t)
// Now come the monster AllSubmatch cases.
func TestFindAllSubmatch(t *testing.T) {
for _, test := range findTests {
result := MustCompile(test.pat).FindAllSubmatch([]byte(test.text), -1)
switch {
case test.matches == nil && result == nil:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Errorf("expected match; got none: %s", test)
case len(test.matches) != len(result):
t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
case test.matches != nil && result != nil:
for k, match := range test.matches {
testSubmatchBytes(&test, k, match, result[k], t)
func TestFindAllStringSubmatch(t *testing.T) {
for _, test := range findTests {
result := MustCompile(test.pat).FindAllStringSubmatch(test.text, -1)
switch {
case test.matches == nil && result == nil:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Errorf("expected match; got none: %s", test)
case len(test.matches) != len(result):
t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
case test.matches != nil && result != nil:
for k, match := range test.matches {
testSubmatchString(&test, k, match, result[k], t)
func testFindAllSubmatchIndex(test *FindTest, result [][]int, t *testing.T) {
switch {
case test.matches == nil && result == nil:
// ok
case test.matches == nil && result != nil:
t.Errorf("expected no match; got one: %s", test)
case test.matches != nil && result == nil:
t.Errorf("expected match; got none: %s", test)
case len(test.matches) != len(result):
t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test)
case test.matches != nil && result != nil:
for k, match := range test.matches {
testSubmatchIndices(test, k, match, result[k], t)
func TestFindAllSubmatchIndex(t *testing.T) {
for _, test := range findTests {
testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllSubmatchIndex([]byte(test.text), -1), t)
func TestFindAllStringSubmatchIndex(t *testing.T) {
for _, test := range findTests {
testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllStringSubmatchIndex(test.text, -1), t)
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package regexp implements a simple regular expression library.
// The syntax of the regular expressions accepted is the same
// general syntax used by Perl, Python, and other languages.
// More precisely, it is the syntax accepted by RE2 and described at
// https://golang.org/s/re2syntax, except for \C.
// All characters are UTF-8-encoded code points.
// There are 16 methods of Regexp that match a regular expression and identify
// the matched text. Their names are matched by this regular expression:
// Find(All)?(String)?(Submatch)?(Index)?
// If 'All' is present, the routine matches successive non-overlapping
// matches of the entire expression. Empty matches abutting a preceding
// match are ignored. The return value is a slice containing the successive
// return values of the corresponding non-'All' routine. These routines take
// an extra integer argument, n; if n >= 0, the function returns at most n
// matches/submatches.
// If 'String' is present, the argument is a string; otherwise it is a slice
// of bytes; return values are adjusted as appropriate.
// If 'Submatch' is present, the return value is a slice identifying the
// successive submatches of the expression. Submatches are matches of
// parenthesized subexpressions within the regular expression, numbered from
// left to right in order of opening parenthesis. Submatch 0 is the match of
// the entire expression, submatch 1 the match of the first parenthesized
// subexpression, and so on.
// If 'Index' is present, matches and submatches are identified by byte index
// pairs within the input string: result[2*n:2*n+2] identifies the indexes of
// the nth submatch. The pair for n==0 identifies the match of the entire
// expression. If 'Index' is not present, the match is identified by the
// text of the match/submatch. If an index is negative, it means that
// subexpression did not match any string in the input.
// There is also a subset of the methods that can be applied to text read
// from a RuneReader:
// MatchReader, FindReaderIndex, FindReaderSubmatchIndex
// This set may grow. Note that regular expression matches may need to
// examine text beyond the text returned by a match, so the methods that
// match text from a RuneReader may read arbitrarily far into the input
// before returning.
// (There are a few other methods that do not match this pattern.)
package regexp
import (
// debug is a package-level switch that is off by default.
// NOTE(review): its readers are outside this chunk — confirm what it enables.
var debug = false
// Error is the local type for a parsing error.
type Error string

// String returns the error message text.
func (e Error) String() string {
	return string(e)
}
// Regexp is the representation of a compiled regular expression.
// The public interface is entirely through methods.
// A Regexp is safe for concurrent use by multiple goroutines.
type Regexp struct {
// read-only after Compile
expr string // as passed to Compile
prog *syntax.Prog // compiled program
prefix string // required prefix in unanchored matches
prefixBytes []byte // prefix, as a []byte
prefixComplete bool // prefix is the entire regexp
prefixRune int // first rune in prefix
cond syntax.EmptyOp // empty-width conditions required at start of match
// cache of machines for running regexp
mu sync.Mutex
machine []*machine
// String returns the source text used to compile the regular expression.
func (re *Regexp) String() string {
return re.expr
// Compile parses a regular expression and returns, if successful, a Regexp
// object that can be used to match against text.
func Compile(expr string) (*Regexp, os.Error) {
re, err := syntax.Parse(expr, syntax.Perl)
if err != nil {
return nil, err
prog, err := syntax.Compile(re)
if err != nil {
return nil, err
regexp := &Regexp{
expr: expr,
prog: prog,
regexp.prefix, regexp.prefixComplete = prog.Prefix()
if regexp.prefix != "" {
// TODO(rsc): Remove this allocation by adding
// IndexString to package bytes.
regexp.prefixBytes = []byte(regexp.prefix)
regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
regexp.cond = prog.StartCond()
return regexp, nil
// get returns a machine to use for matching re.
// It uses the re's machine cache if possible, to avoid
// unnecessary allocation.
func (re *Regexp) get() *machine {
if n := len(re.machine); n > 0 {
z := re.machine[n-1]
re.machine = re.machine[:n-1]
return z
z := progMachine(re.prog) = re
return z
// put returns a machine to the re's machine cache.
// There is no attempt to limit the size of the cache, so it will
// grow to the maximum number of simultaneous matches
// run using re. (The cache empties when re gets garbage collected.)
func (re *Regexp) put(z *machine) {
re.machine = append(re.machine, z)
// MustCompile is like Compile but panics if the expression cannot be parsed.
// It simplifies safe initialization of global variables holding compiled regular
// expressions.
func MustCompile(str string) *Regexp {
regexp, error := Compile(str)
if error != nil {
panic(`regexp: compiling "` + str + `": ` + error.String())
return regexp
// NumSubexp returns the number of parenthesized subexpressions in this Regexp.
func (re *Regexp) NumSubexp() int {
// NumCap/2 because captures count ( and ) separately.
// -1 because NumCap counts $0 but NumSubexp does not.
return re.prog.NumCap/2 - 1
const endOfText = -1
// input abstracts different representations of the input text. It provides
// one-character lookahead.
type input interface {
step(pos int) (rune int, width int) // advance one rune
canCheckPrefix() bool // can we look ahead without losing info?
hasPrefix(re *Regexp) bool
index(re *Regexp, pos int) int
// inputString scans a string.
type inputString struct {
str string
func newInputString(str string) *inputString {
return &inputString{str: str}
func (i *inputString) step(pos int) (int, int) {
if pos < len(i.str) {
return utf8.DecodeRuneInString(i.str[pos:len(i.str)])
return endOfText, 0
func (i *inputString) canCheckPrefix() bool {
return true
func (i *inputString) hasPrefix(re *Regexp) bool {
return strings.HasPrefix(i.str, re.prefix)
func (i *inputString) index(re *Regexp, pos int) int {
return strings.Index(i.str[pos:], re.prefix)
// inputBytes scans a byte slice.
type inputBytes struct {
str []byte
func newInputBytes(str []byte) *inputBytes {
return &inputBytes{str: str}
func (i *inputBytes) step(pos int) (int, int) {
if pos < len(i.str) {
return utf8.DecodeRune(i.str[pos:len(i.str)])
return endOfText, 0
func (i *inputBytes) canCheckPrefix() bool {
return true
func (i *inputBytes) hasPrefix(re *Regexp) bool {
return bytes.HasPrefix(i.str, re.prefixBytes)
func (i *inputBytes) index(re *Regexp, pos int) int {
return bytes.Index(i.str[pos:], re.prefixBytes)
// inputReader scans a RuneReader.
type inputReader struct {
r io.RuneReader
atEOT bool
pos int
func newInputReader(r io.RuneReader) *inputReader {
return &inputReader{r: r}
func (i *inputReader) step(pos int) (int, int) {
if !i.atEOT && pos != i.pos {
return endOfText, 0
r, w, err := i.r.ReadRune()
if err != nil {
i.atEOT = true
return endOfText, 0
i.pos += w
return r, w
func (i *inputReader) canCheckPrefix() bool {
return false
func (i *inputReader) hasPrefix(re *Regexp) bool {
return false
func (i *inputReader) index(re *Regexp, pos int) int {
return -1
// LiteralPrefix returns a literal string that must begin any match
// of the regular expression re. It returns the boolean true if the
// literal string comprises the entire regular expression.
func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
return re.prefix, re.prefixComplete
// MatchReader returns whether the Regexp matches the text read by the
// RuneReader. The return value is a boolean: true for match, false for no
// match.
func (re *Regexp) MatchReader(r io.RuneReader) bool {
return re.doExecute(newInputReader(r), 0, 0) != nil
// MatchString returns whether the Regexp matches the string s.
// The return value is a boolean: true for match, false for no match.
func (re *Regexp) MatchString(s string) bool {
return re.doExecute(newInputString(s), 0, 0) != nil
// Match returns whether the Regexp matches the byte slice b.
// The return value is a boolean: true for match, false for no match.
func (re *Regexp) Match(b []byte) bool {
return re.doExecute(newInputBytes(b), 0, 0) != nil
// MatchReader checks whether a textual regular expression matches the text
// read by the RuneReader. More complicated queries need to use Compile and
// the full Regexp interface.
func MatchReader(pattern string, r io.RuneReader) (matched bool, error os.Error) {
re, err := Compile(pattern)
if err != nil {
return false, err
return re.MatchReader(r), nil
// MatchString checks whether a textual regular expression
// matches a string. More complicated queries need
// to use Compile and the full Regexp interface.
func MatchString(pattern string, s string) (matched bool, error os.Error) {
re, err := Compile(pattern)
if err != nil {
return false, err
return re.MatchString(s), nil
// Match checks whether a textual regular expression
// matches a byte slice. More complicated queries need
// to use Compile and the full Regexp interface.
func Match(pattern string, b []byte) (matched bool, error os.Error) {
re, err := Compile(pattern)
if err != nil {
return false, err
return re.Match(b), nil
// ReplaceAllString returns a copy of src in which all matches for the Regexp
// have been replaced by repl. No support is provided for expressions
// (e.g. \1 or $1) in the replacement string.
func (re *Regexp) ReplaceAllString(src, repl string) string {
return re.ReplaceAllStringFunc(src, func(string) string { return repl })
// ReplaceAllStringFunc returns a copy of src in which all matches for the
// Regexp have been replaced by the return value of of function repl (whose
// first argument is the matched string). No support is provided for
// expressions (e.g. \1 or $1) in the replacement string.
func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
lastMatchEnd := 0 // end position of the most recent match
searchPos := 0 // position where we next look for a match
buf := new(bytes.Buffer)
for searchPos <= len(src) {
a := re.doExecute(newInputString(src), searchPos, 2)
if len(a) == 0 {
break // no more matches
// Copy the unmatched characters before this match.
io.WriteString(buf, src[lastMatchEnd:a[0]])
// Now insert a copy of the replacement string, but not for a
// match of the empty string immediately after another match.
// (Otherwise, we get double replacement for patterns that
// match both empty and nonempty strings.)
if a[1] > lastMatchEnd || a[0] == 0 {
io.WriteString(buf, repl(src[a[0]:a[1]]))
lastMatchEnd = a[1]
// Advance past this match; always advance at least one character.
_, width := utf8.DecodeRuneInString(src[searchPos:])
if searchPos+width > a[1] {
searchPos += width
} else if searchPos+1 > a[1] {
// This clause is only needed at the end of the input
// string. In that case, DecodeRuneInString returns width=0.
} else {
searchPos = a[1]
// Copy the unmatched characters after the last match.
io.WriteString(buf, src[lastMatchEnd:])
return buf.String()
// ReplaceAll returns a copy of src in which all matches for the Regexp
// have been replaced by repl. No support is provided for expressions
// (e.g. \1 or $1) in the replacement text.
func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
return re.ReplaceAllFunc(src, func([]byte) []byte { return repl })
// ReplaceAllFunc returns a copy of src in which all matches for the
// Regexp have been replaced by the return value of of function repl (whose
// first argument is the matched []byte). No support is provided for
// expressions (e.g. \1 or $1) in the replacement string.
func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
lastMatchEnd := 0 // end position of the most recent match
searchPos := 0 // position where we next look for a match
buf := new(bytes.Buffer)
for searchPos <= len(src) {
a := re.doExecute(newInputBytes(src), searchPos, 2)
if len(a) == 0 {
break // no more matches
// Copy the unmatched characters before this match.
// Now insert a copy of the replacement string, but not for a
// match of the empty string immediately after another match.
// (Otherwise, we get double replacement for patterns that
// match both empty and nonempty strings.)
if a[1] > lastMatchEnd || a[0] == 0 {
lastMatchEnd = a[1]
// Advance past this match; always advance at least one character.
_, width := utf8.DecodeRune(src[searchPos:])
if searchPos+width > a[1] {
searchPos += width
} else if searchPos+1 > a[1] {
// This clause is only needed at the end of the input
// string. In that case, DecodeRuneInString returns width=0.
} else {
searchPos = a[1]
// Copy the unmatched characters after the last match.
return buf.Bytes()
// specialBytes lists every byte that QuoteMeta escapes.
var specialBytes = []byte(`\.+*?()|[]{}^$`)

// special reports whether b is a regular expression metacharacter.
func special(b byte) bool {
	return bytes.IndexByte(specialBytes, b) >= 0
}

// QuoteMeta returns a string that quotes all regular expression metacharacters
// inside the argument text; the returned string is a regular expression matching
// the literal text.  For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
func QuoteMeta(s string) string {
	// Worst case: every input byte is preceded by a backslash.
	b := make([]byte, 2*len(s))

	// A byte loop is correct because all metacharacters are ASCII.
	j := 0
	for i := 0; i < len(s); i++ {
		if special(s[i]) {
			b[j] = '\\'
			j++
		}
		b[j] = s[i]
		j++
	}
	return string(b[0:j])
}
// Find matches in slice b if b is non-nil, otherwise find matches in string s.
func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
var end int
if b == nil {
end = len(s)
} else {
end = len(b)
for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
var in input
if b == nil {
in = newInputString(s)
} else {
in = newInputBytes(b)
matches := re.doExecute(in, pos, re.prog.NumCap)
if len(matches) == 0 {
accept := true
if matches[1] == pos {
// We've found an empty match.
if matches[0] == prevMatchEnd {
// We don't allow an empty match right
// after a previous match, so ignore it.
accept = false
var width int
// TODO: use step()
if b == nil {
_, width = utf8.DecodeRuneInString(s[pos:end])
} else {
_, width = utf8.DecodeRune(b[pos:end])
if width > 0 {
pos += width
} else {
pos = end + 1
} else {
pos = matches[1]
prevMatchEnd = matches[1]
if accept {
// Find returns a slice holding the text of the leftmost match in b of the regular expression.
// A return value of nil indicates no match.
func (re *Regexp) Find(b []byte) []byte {
a := re.doExecute(newInputBytes(b), 0, 2)
if a == nil {
return nil
return b[a[0]:a[1]]
// FindIndex returns a two-element slice of integers defining the location of
// the leftmost match in b of the regular expression. The match itself is at
// b[loc[0]:loc[1]].
// A return value of nil indicates no match.
func (re *Regexp) FindIndex(b []byte) (loc []int) {
a := re.doExecute(newInputBytes(b), 0, 2)
if a == nil {
return nil
return a[0:2]
// FindString returns a string holding the text of the leftmost match in s of the regular
// expression. If there is no match, the return value is an empty string,
// but it will also be empty if the regular expression successfully matches
// an empty string. Use FindStringIndex or FindStringSubmatch if it is
// necessary to distinguish these cases.
func (re *Regexp) FindString(s string) string {
a := re.doExecute(newInputString(s), 0, 2)
if a == nil {
return ""
return s[a[0]:a[1]]
// FindStringIndex returns a two-element slice of integers defining the
// location of the leftmost match in s of the regular expression. The match
// itself is at s[loc[0]:loc[1]].
// A return value of nil indicates no match.
func (re *Regexp) FindStringIndex(s string) []int {
a := re.doExecute(newInputString(s), 0, 2)
if a == nil {
return nil
return a[0:2]
// FindReaderIndex returns a two-element slice of integers defining the
// location of the leftmost match of the regular expression in text read from
// the RuneReader. The match itself is at s[loc[0]:loc[1]]. A return
// value of nil indicates no match.
func (re *Regexp) FindReaderIndex(r io.RuneReader) []int {
a := re.doExecute(newInputReader(r), 0, 2)
if a == nil {
return nil
return a[0:2]
// FindSubmatch returns a slice of slices holding the text of the leftmost
// match of the regular expression in b and the matches, if any, of its
// subexpressions, as defined by the 'Submatch' descriptions in the package
// comment.
// A return value of nil indicates no match.
func (re *Regexp) FindSubmatch(b []byte) [][]byte {
a := re.doExecute(newInputBytes(b), 0, re.prog.NumCap)
if a == nil {
return nil
ret := make([][]byte, len(a)/2)
for i := range ret {
if a[2*i] >= 0 {
ret[i] = b[a[2*i]:a[2*i+1]]
return ret
// FindSubmatchIndex returns a slice holding the index pairs identifying the
// leftmost match of the regular expression in b and the matches, if any, of
// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
// in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindSubmatchIndex(b []byte) []int {
return re.doExecute(newInputBytes(b), 0, re.prog.NumCap)
// FindStringSubmatch returns a slice of strings holding the text of the
// leftmost match of the regular expression in s and the matches, if any, of
// its subexpressions, as defined by the 'Submatch' description in the
// package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindStringSubmatch(s string) []string {
a := re.doExecute(newInputString(s), 0, re.prog.NumCap)
if a == nil {
return nil
ret := make([]string, len(a)/2)
for i := range ret {
if a[2*i] >= 0 {
ret[i] = s[a[2*i]:a[2*i+1]]
return ret
// FindStringSubmatchIndex returns a slice holding the index pairs
// identifying the leftmost match of the regular expression in s and the
// matches, if any, of its subexpressions, as defined by the 'Submatch' and
// 'Index' descriptions in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindStringSubmatchIndex(s string) []int {
return re.doExecute(newInputString(s), 0, re.prog.NumCap)
// FindReaderSubmatchIndex returns a slice holding the index pairs
// identifying the leftmost match of the regular expression of text read by
// the RuneReader, and the matches, if any, of its subexpressions, as defined
// by the 'Submatch' and 'Index' descriptions in the package comment. A
// return value of nil indicates no match.
func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
return re.doExecute(newInputReader(r), 0, re.prog.NumCap)
const startSize = 10 // The size at which to start a slice in the 'All' routines.
// FindAll is the 'All' version of Find; it returns a slice of all successive
// matches of the expression, as defined by the 'All' description in the
// package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAll(b []byte, n int) [][]byte {
if n < 0 {
n = len(b) + 1
result := make([][]byte, 0, startSize)
re.allMatches("", b, n, func(match []int) {
result = append(result, b[match[0]:match[1]])
if len(result) == 0 {
return nil
return result
// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
// successive matches of the expression, as defined by the 'All' description
// in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
if n < 0 {
n = len(b) + 1
result := make([][]int, 0, startSize)
re.allMatches("", b, n, func(match []int) {
result = append(result, match[0:2])
if len(result) == 0 {
return nil
return result
// FindAllString is the 'All' version of FindString; it returns a slice of all
// successive matches of the expression, as defined by the 'All' description
// in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllString(s string, n int) []string {
if n < 0 {
n = len(s) + 1
result := make([]string, 0, startSize)
re.allMatches(s, nil, n, func(match []int) {
result = append(result, s[match[0]:match[1]])
if len(result) == 0 {
return nil
return result
// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
// slice of all successive matches of the expression, as defined by the 'All'
// description in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
if n < 0 {
n = len(s) + 1
result := make([][]int, 0, startSize)
re.allMatches(s, nil, n, func(match []int) {
result = append(result, match[0:2])
if len(result) == 0 {
return nil
return result
// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
// of all successive matches of the expression, as defined by the 'All'
// description in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
if n < 0 {
n = len(b) + 1
result := make([][][]byte, 0, startSize)
re.allMatches("", b, n, func(match []int) {
slice := make([][]byte, len(match)/2)
for j := range slice {
if match[2*j] >= 0 {
slice[j] = b[match[2*j]:match[2*j+1]]
result = append(result, slice)
if len(result) == 0 {
return nil
return result
// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
// a slice of all successive matches of the expression, as defined by the
// 'All' description in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
if n < 0 {
n = len(b) + 1
result := make([][]int, 0, startSize)
re.allMatches("", b, n, func(match []int) {
result = append(result, match)
if len(result) == 0 {
return nil
return result
// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
// returns a slice of all successive matches of the expression, as defined by
// the 'All' description in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
if n < 0 {
n = len(s) + 1
result := make([][]string, 0, startSize)
re.allMatches(s, nil, n, func(match []int) {
slice := make([]string, len(match)/2)
for j := range slice {
if match[2*j] >= 0 {
slice[j] = s[match[2*j]:match[2*j+1]]
result = append(result, slice)
if len(result) == 0 {
return nil
return result
// FindAllStringSubmatchIndex is the 'All' version of
// FindStringSubmatchIndex; it returns a slice of all successive matches of
// the expression, as defined by the 'All' description in the package
// comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
if n < 0 {
n = len(s) + 1
result := make([][]int, 0, startSize)
re.allMatches(s, nil, n, func(match []int) {
result = append(result, match)
if len(result) == 0 {
return nil
return result
......@@ -86,6 +86,7 @@ func Compile(re *Regexp) (*Prog, os.Error) {
func (c *compiler) init() {
c.p = new(Prog)
c.p.NumCap = 2 // implicit ( and ) for whole match $0
......@@ -55,6 +55,61 @@ func (p *Prog) String() string {
return b.String()
// skipNop follows any no-op or capturing instructions
// and returns the resulting pc.
func (p *Prog) skipNop(pc uint32) *Inst {
i := &p.Inst[pc]
for i.Op == InstNop || i.Op == InstCapture {
pc = i.Out
i = &p.Inst[pc]
return i
// Prefix returns a literal string that all matches for the
// regexp must start with. Complete is true if the prefix
// is the entire match.
func (p *Prog) Prefix() (prefix string, complete bool) {
i := p.skipNop(uint32(p.Start))
// Avoid allocation of buffer if prefix is empty.
if i.Op != InstRune || len(i.Rune) != 1 {
return "", i.Op == InstMatch
// Have prefix; gather characters.
var buf bytes.Buffer
for i.Op == InstRune && len(i.Rune) == 1 {
i = p.skipNop(i.Out)
return buf.String(), i.Op == InstMatch
// StartCond returns the leading empty-width conditions that must
// be true in any match. It returns ^EmptyOp(0) if no matches are possible.
func (p *Prog) StartCond() EmptyOp {
var flag EmptyOp
pc := uint32(p.Start)
i := &p.Inst[pc]
for {
switch i.Op {
case InstEmptyWidth:
flag |= EmptyOp(i.Arg)
case InstFail:
return ^EmptyOp(0)
case InstCapture, InstNop:
// skip
break Loop
pc = i.Out
i = &p.Inst[pc]
return flag
// MatchRune returns true if the instruction matches (and consumes) r.
// It should only be called when i.Op == InstRune.
func (i *Inst) MatchRune(r int) bool {
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment