regexp.go 35.7 KB
Newer Older
1
// Copyright 2009 The Go Authors. All rights reserved.
2 3 4
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

Russ Cox's avatar
Russ Cox committed
5
// Package regexp implements regular expression search.
6
//
7 8 9
// The syntax of the regular expressions accepted is the same
// general syntax used by Perl, Python, and other languages.
// More precisely, it is the syntax accepted by RE2 and described at
10
// https://golang.org/s/re2syntax, except for \C.
11
// For an overview of the syntax, run
12
//   go doc regexp/syntax
13
//
14 15 16 17 18
// The regexp implementation provided by this package is
// guaranteed to run in time linear in the size of the input.
// (This is a property not guaranteed by most open source
// implementations of regular expressions.) For more information
// about this property, see
19
//	https://swtch.com/~rsc/regexp/regexp1.html
20 21
// or any book about automata theory.
//
22
// All characters are UTF-8-encoded code points.
23 24
//
// There are 16 methods of Regexp that match a regular expression and identify
25
// the matched text. Their names are matched by this regular expression:
26 27 28 29
//
//	Find(All)?(String)?(Submatch)?(Index)?
//
// If 'All' is present, the routine matches successive non-overlapping
30 31 32
// matches of the entire expression. Empty matches abutting a preceding
// match are ignored. The return value is a slice containing the successive
// return values of the corresponding non-'All' routine. These routines take
33 34
// an extra integer argument, n. If n >= 0, the function returns at most n
// matches/submatches; otherwise, it returns all of them.
35 36 37 38 39
//
// If 'String' is present, the argument is a string; otherwise it is a slice
// of bytes; return values are adjusted as appropriate.
//
// If 'Submatch' is present, the return value is a slice identifying the
40 41 42 43 44
// successive submatches of the expression. Submatches are matches of
// parenthesized subexpressions (also known as capturing groups) within the
// regular expression, numbered from left to right in order of opening
// parenthesis. Submatch 0 is the match of the entire expression, submatch 1
// the match of the first parenthesized subexpression, and so on.
45 46 47
//
// If 'Index' is present, matches and submatches are identified by byte index
// pairs within the input string: result[2*n:2*n+1] identifies the indexes of
48 49 50
// the nth submatch. The pair for n==0 identifies the match of the entire
// expression. If 'Index' is not present, the match is identified by the
// text of the match/submatch. If an index is negative, it means that
51 52
// subexpression did not match any string in the input.
//
53 54 55 56 57
// There is also a subset of the methods that can be applied to text read
// from a RuneReader:
//
//	MatchReader, FindReaderIndex, FindReaderSubmatchIndex
//
58
// This set may grow. Note that regular expression matches may need to
59 60 61 62
// examine text beyond the text returned by a match, so the methods that
// match text from a RuneReader may read arbitrarily far into the input
// before returning.
//
63
// (There are a few other methods that do not match this pattern.)
64
//
65 66
package regexp

Rob Pike's avatar
Rob Pike committed
67
import (
68 69
	"bytes"
	"io"
70 71
	"regexp/syntax"
	"strconv"
72
	"strings"
73
	"sync"
74
	"unicode"
75
	"unicode/utf8"
Rob Pike's avatar
Rob Pike committed
76 77
)

78
// Regexp is the representation of a compiled regular expression.
79 80
// A Regexp is safe for concurrent use by multiple goroutines,
// except for configuration methods, such as Longest.
81
type Regexp struct {
82 83 84 85 86 87
	expr           string       // as passed to Compile
	prog           *syntax.Prog // compiled program
	onepass        *onePassProg // onepass program or nil
	numSubexp      int
	maxBitStateLen int
	subexpNames    []string
88 89
	prefix         string         // required prefix in unanchored matches
	prefixBytes    []byte         // prefix, as a []byte
Russ Cox's avatar
Russ Cox committed
90
	prefixRune     rune           // first rune in prefix
91
	prefixEnd      uint32         // pc for last rune in prefix
92 93
	mpool          int            // pool for machines
	matchcap       int            // size of recorded match lengths
94
	prefixComplete bool           // prefix is the entire regexp
95
	cond           syntax.EmptyOp // empty-width conditions required at start of match
96 97 98 99

	// This field can be modified by the Longest method,
	// but it is otherwise read-only.
	longest bool // whether regexp prefers leftmost-longest match
100 101
}

102 103
// String returns the source text used to compile the regular expression.
func (re *Regexp) String() string {
104 105 106
	return re.expr
}

107 108
// Copy returns a new Regexp object copied from re.
//
109 110
// When using a Regexp in multiple goroutines, giving each goroutine
// its own copy helps to avoid lock contention.
111
func (re *Regexp) Copy() *Regexp {
112 113
	re2 := *re
	return &re2
114 115
}

116 117 118 119 120 121 122 123 124 125
// Compile parses a regular expression and returns, if successful,
// a Regexp object that can be used to match against text.
//
// When matching against text, the regexp returns a match that
// begins as early as possible in the input (leftmost), and among those
// it chooses the one that a backtracking search would have found first.
// This so-called leftmost-first matching is the same semantics
// that Perl, Python, and other implementations use, although this
// package implements it without the expense of backtracking.
// For POSIX leftmost-longest matching, see CompilePOSIX.
126
func Compile(expr string) (*Regexp, error) {
127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
	return compile(expr, syntax.Perl, false)
}

// CompilePOSIX is like Compile but restricts the regular expression
// to POSIX ERE (egrep) syntax and changes the match semantics to
// leftmost-longest.
//
// That is, when matching against text, the regexp returns a match that
// begins as early as possible in the input (leftmost), and among those
// it chooses a match that is as long as possible.
// This so-called leftmost-longest matching is the same semantics
// that early regular expression implementations used and that POSIX
// specifies.
//
// However, there can be multiple leftmost-longest matches, with different
// submatch choices, and here this package diverges from POSIX.
// Among the possible leftmost-longest matches, this package chooses
// the one that a backtracking search would have found first, while POSIX
// specifies that the match be chosen to maximize the length of the first
// subexpression, then the second, and so on from left to right.
// The POSIX rule is computationally prohibitive and not even well-defined.
148
// See https://swtch.com/~rsc/regexp/regexp2.html#posix for details.
149
func CompilePOSIX(expr string) (*Regexp, error) {
150 151 152
	return compile(expr, syntax.POSIX, true)
}

153
// Longest makes future searches prefer the leftmost-longest match.
154 155 156
// That is, when matching against text, the regexp returns a match that
// begins as early as possible in the input (leftmost), and among those
// it chooses a match that is as long as possible.
157 158
// This method modifies the Regexp and may not be called concurrently
// with any other methods.
159 160 161 162
func (re *Regexp) Longest() {
	re.longest = true
}

163
func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
164 165 166 167 168
	re, err := syntax.Parse(expr, mode)
	if err != nil {
		return nil, err
	}
	maxCap := re.MaxCap()
Russ Cox's avatar
Russ Cox committed
169 170
	capNames := re.CapNames()

171 172 173 174 175
	re = re.Simplify()
	prog, err := syntax.Compile(re)
	if err != nil {
		return nil, err
	}
176 177 178 179
	matchcap := prog.NumCap
	if matchcap < 2 {
		matchcap = 2
	}
180
	regexp := &Regexp{
181 182 183 184 185 186 187 188
		expr:        expr,
		prog:        prog,
		onepass:     compileOnePass(prog),
		numSubexp:   maxCap,
		subexpNames: capNames,
		cond:        prog.StartCond(),
		longest:     longest,
		matchcap:    matchcap,
189
	}
190
	if regexp.onepass == nil {
191
		regexp.prefix, regexp.prefixComplete = prog.Prefix()
192
		regexp.maxBitStateLen = maxBitStateLen(prog)
193
	} else {
194
		regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog)
195
	}
196 197 198 199 200 201
	if regexp.prefix != "" {
		// TODO(rsc): Remove this allocation by adding
		// IndexString to package bytes.
		regexp.prefixBytes = []byte(regexp.prefix)
		regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
	}
202 203 204 205 206 207 208 209

	n := len(prog.Inst)
	i := 0
	for matchSize[i] != 0 && matchSize[i] < n {
		i++
	}
	regexp.mpool = i

210 211 212
	return regexp, nil
}

213 214 215 216 217 218 219 220 221 222 223
// Pools of *machine for use during (*Regexp).doExecute,
// split up by the size of the execution queues.
// matchPool[i] machines have queue size matchSize[i].
// On a 64-bit system each queue entry is 16 bytes,
// so matchPool[0] has 16*2*128 = 4kB queues, etc.
// The final matchPool is a catch-all for very large queues.
var (
	matchSize = [...]int{128, 512, 2048, 16384, 0}
	matchPool [len(matchSize)]sync.Pool
)

224 225 226 227
// get returns a machine to use for matching re.
// It uses the re's machine cache if possible, to avoid
// unnecessary allocation.
func (re *Regexp) get() *machine {
228 229 230 231 232 233 234 235 236 237 238
	m, ok := matchPool[re.mpool].Get().(*machine)
	if !ok {
		m = new(machine)
	}
	m.re = re
	m.p = re.prog
	if cap(m.matchcap) < re.matchcap {
		m.matchcap = make([]int, re.matchcap)
		for _, t := range m.pool {
			t.cap = make([]int, re.matchcap)
		}
239
	}
240

241 242 243 244 245 246 247 248 249 250 251 252
	// Allocate queues if needed.
	// Or reallocate, for "large" match pool.
	n := matchSize[re.mpool]
	if n == 0 { // large pool
		n = len(re.prog.Inst)
	}
	if len(m.q0.sparse) < n {
		m.q0 = queue{make([]uint32, n), make([]entry, 0, n)}
		m.q1 = queue{make([]uint32, n), make([]entry, 0, n)}
	}
	return m
}
253

254 255 256 257 258 259
// put returns a machine to the correct machine pool.
func (re *Regexp) put(m *machine) {
	m.re = nil
	m.p = nil
	m.inputs.clear()
	matchPool[re.mpool].Put(m)
260 261 262 263 264 265
}

// MustCompile is like Compile but panics if the expression cannot be parsed.
// It simplifies safe initialization of global variables holding compiled regular
// expressions.
func MustCompile(str string) *Regexp {
266 267 268
	regexp, err := Compile(str)
	if err != nil {
		panic(`regexp: Compile(` + quote(str) + `): ` + err.Error())
269
	}
270
	return regexp
271
}
Rob Pike's avatar
Rob Pike committed
272

273 274 275 276
// MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed.
// It simplifies safe initialization of global variables holding compiled regular
// expressions.
func MustCompilePOSIX(str string) *Regexp {
277 278 279
	regexp, err := CompilePOSIX(str)
	if err != nil {
		panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + err.Error())
280
	}
281
	return regexp
282 283
}

284 285 286
func quote(s string) string {
	if strconv.CanBackquote(s) {
		return "`" + s + "`"
287
	}
288
	return strconv.Quote(s)
289 290
}

291 292 293
// NumSubexp returns the number of parenthesized subexpressions in this Regexp.
func (re *Regexp) NumSubexp() int {
	return re.numSubexp
Rob Pike's avatar
Rob Pike committed
294 295
}

Russ Cox's avatar
Russ Cox committed
296
// SubexpNames returns the names of the parenthesized subexpressions
297
// in this Regexp. The name for the first sub-expression is names[1],
Russ Cox's avatar
Russ Cox committed
298 299
// so that if m is a match slice, the name for m[i] is SubexpNames()[i].
// Since the Regexp as a whole cannot be named, names[0] is always
300
// the empty string. The slice should not be modified.
Russ Cox's avatar
Russ Cox committed
301 302 303 304
func (re *Regexp) SubexpNames() []string {
	return re.subexpNames
}

Russ Cox's avatar
Russ Cox committed
305
const endOfText rune = -1
Rob Pike's avatar
Rob Pike committed
306

307 308 309
// input abstracts different representations of the input text. It provides
// one-character lookahead.
type input interface {
Russ Cox's avatar
Russ Cox committed
310 311
	step(pos int) (r rune, width int) // advance one rune
	canCheckPrefix() bool             // can we look ahead without losing info?
312 313
	hasPrefix(re *Regexp) bool
	index(re *Regexp, pos int) int
314
	context(pos int) lazyFlag
315 316 317 318 319 320 321
}

// inputString scans a string.
type inputString struct {
	str string
}

Russ Cox's avatar
Russ Cox committed
322
func (i *inputString) step(pos int) (rune, int) {
323
	if pos < len(i.str) {
Russ Cox's avatar
Russ Cox committed
324 325
		c := i.str[pos]
		if c < utf8.RuneSelf {
Russ Cox's avatar
Russ Cox committed
326
			return rune(c), 1
Russ Cox's avatar
Russ Cox committed
327 328
		}
		return utf8.DecodeRuneInString(i.str[pos:])
329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344
	}
	return endOfText, 0
}

func (i *inputString) canCheckPrefix() bool {
	return true
}

func (i *inputString) hasPrefix(re *Regexp) bool {
	return strings.HasPrefix(i.str, re.prefix)
}

func (i *inputString) index(re *Regexp, pos int) int {
	return strings.Index(i.str[pos:], re.prefix)
}

345
func (i *inputString) context(pos int) lazyFlag {
Russ Cox's avatar
Russ Cox committed
346
	r1, r2 := endOfText, endOfText
347 348 349 350 351 352
	// 0 < pos && pos <= len(i.str)
	if uint(pos-1) < uint(len(i.str)) {
		r1 = rune(i.str[pos-1])
		if r1 >= utf8.RuneSelf {
			r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
		}
353
	}
354 355 356 357 358 359
	// 0 <= pos && pos < len(i.str)
	if uint(pos) < uint(len(i.str)) {
		r2 = rune(i.str[pos])
		if r2 >= utf8.RuneSelf {
			r2, _ = utf8.DecodeRuneInString(i.str[pos:])
		}
360
	}
361
	return newLazyFlag(r1, r2)
362 363
}

364 365 366 367 368
// inputBytes scans a byte slice.
type inputBytes struct {
	str []byte
}

Russ Cox's avatar
Russ Cox committed
369
func (i *inputBytes) step(pos int) (rune, int) {
370
	if pos < len(i.str) {
Russ Cox's avatar
Russ Cox committed
371 372
		c := i.str[pos]
		if c < utf8.RuneSelf {
Russ Cox's avatar
Russ Cox committed
373
			return rune(c), 1
Russ Cox's avatar
Russ Cox committed
374 375
		}
		return utf8.DecodeRune(i.str[pos:])
376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391
	}
	return endOfText, 0
}

func (i *inputBytes) canCheckPrefix() bool {
	return true
}

func (i *inputBytes) hasPrefix(re *Regexp) bool {
	return bytes.HasPrefix(i.str, re.prefixBytes)
}

func (i *inputBytes) index(re *Regexp, pos int) int {
	return bytes.Index(i.str[pos:], re.prefixBytes)
}

392
func (i *inputBytes) context(pos int) lazyFlag {
Russ Cox's avatar
Russ Cox committed
393
	r1, r2 := endOfText, endOfText
394 395 396 397 398 399
	// 0 < pos && pos <= len(i.str)
	if uint(pos-1) < uint(len(i.str)) {
		r1 = rune(i.str[pos-1])
		if r1 >= utf8.RuneSelf {
			r1, _ = utf8.DecodeLastRune(i.str[:pos])
		}
400
	}
401 402 403 404 405 406
	// 0 <= pos && pos < len(i.str)
	if uint(pos) < uint(len(i.str)) {
		r2 = rune(i.str[pos])
		if r2 >= utf8.RuneSelf {
			r2, _ = utf8.DecodeRune(i.str[pos:])
		}
407
	}
408
	return newLazyFlag(r1, r2)
409 410
}

411 412 413 414 415 416 417
// inputReader scans a RuneReader.
type inputReader struct {
	r     io.RuneReader
	atEOT bool
	pos   int
}

Russ Cox's avatar
Russ Cox committed
418
func (i *inputReader) step(pos int) (rune, int) {
419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443
	if !i.atEOT && pos != i.pos {
		return endOfText, 0

	}
	r, w, err := i.r.ReadRune()
	if err != nil {
		i.atEOT = true
		return endOfText, 0
	}
	i.pos += w
	return r, w
}

func (i *inputReader) canCheckPrefix() bool {
	return false
}

func (i *inputReader) hasPrefix(re *Regexp) bool {
	return false
}

func (i *inputReader) index(re *Regexp, pos int) int {
	return -1
}

444 445
func (i *inputReader) context(pos int) lazyFlag {
	return 0 // not used
Rob Pike's avatar
Rob Pike committed
446 447
}

448
// LiteralPrefix returns a literal string that must begin any match
449
// of the regular expression re. It returns the boolean true if the
450 451
// literal string comprises the entire regular expression.
func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
452
	return re.prefix, re.prefixComplete
453 454
}

455 456
// MatchReader reports whether the text returned by the RuneReader
// contains any match of the regular expression re.
457
func (re *Regexp) MatchReader(r io.RuneReader) bool {
458
	return re.doMatch(r, nil, "")
459 460
}

461 462
// MatchString reports whether the string s
// contains any match of the regular expression re.
463
func (re *Regexp) MatchString(s string) bool {
464
	return re.doMatch(nil, nil, s)
465
}
466

467 468
// Match reports whether the byte slice b
// contains any match of the regular expression re.
469
func (re *Regexp) Match(b []byte) bool {
470
	return re.doMatch(nil, b, "")
471
}
472

473 474 475
// MatchReader reports whether the text returned by the RuneReader
// contains any match of the regular expression pattern.
// More complicated queries need to use Compile and the full Regexp interface.
Brad Fitzpatrick's avatar
Brad Fitzpatrick committed
476
func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) {
477 478 479 480 481 482
	re, err := Compile(pattern)
	if err != nil {
		return false, err
	}
	return re.MatchReader(r), nil
}
483

484 485 486
// MatchString reports whether the string s
// contains any match of the regular expression pattern.
// More complicated queries need to use Compile and the full Regexp interface.
Brad Fitzpatrick's avatar
Brad Fitzpatrick committed
487
func MatchString(pattern string, s string) (matched bool, err error) {
488
	re, err := Compile(pattern)
489
	if err != nil {
490
		return false, err
491
	}
492
	return re.MatchString(s), nil
493 494
}

495
// Match reports whether the byte slice b
496 497
// contains any match of the regular expression pattern.
// More complicated queries need to use Compile and the full Regexp interface.
Brad Fitzpatrick's avatar
Brad Fitzpatrick committed
498
func Match(pattern string, b []byte) (matched bool, err error) {
499
	re, err := Compile(pattern)
500
	if err != nil {
501
		return false, err
502
	}
503
	return re.Match(b), nil
504
}
505

506
// ReplaceAllString returns a copy of src, replacing matches of the Regexp
507
// with the replacement string repl. Inside repl, $ signs are interpreted as
508
// in Expand, so for instance $1 represents the text of the first submatch.
509
func (re *Regexp) ReplaceAllString(src, repl string) string {
510
	n := 2
511
	if strings.Contains(repl, "$") {
512 513 514 515 516 517 518 519
		n = 2 * (re.numSubexp + 1)
	}
	b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
		return re.expand(dst, repl, nil, src, match)
	})
	return string(b)
}

520
// ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp
521
// with the replacement string repl. The replacement repl is substituted directly,
522 523 524 525 526
// without using Expand.
func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
	return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
		return append(dst, repl...)
	}))
527 528
}

529
// ReplaceAllStringFunc returns a copy of src in which all matches of the
530
// Regexp have been replaced by the return value of function repl applied
531
// to the matched substring. The replacement returned by repl is substituted
532
// directly, without using Expand.
533
func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
534 535 536 537 538 539 540
	b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
		return append(dst, repl(src[match[0]:match[1]])...)
	})
	return string(b)
}

func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
541 542
	lastMatchEnd := 0 // end position of the most recent match
	searchPos := 0    // position where we next look for a match
543 544 545 546 547 548 549
	var buf []byte
	var endPos int
	if bsrc != nil {
		endPos = len(bsrc)
	} else {
		endPos = len(src)
	}
550 551 552 553
	if nmatch > re.prog.NumCap {
		nmatch = re.prog.NumCap
	}

554
	var dstCap [2]int
555
	for searchPos <= endPos {
556
		a := re.doExecute(nil, bsrc, src, searchPos, nmatch, dstCap[:0])
557
		if len(a) == 0 {
558
			break // no more matches
559 560 561
		}

		// Copy the unmatched characters before this match.
562 563 564 565 566
		if bsrc != nil {
			buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
		} else {
			buf = append(buf, src[lastMatchEnd:a[0]]...)
		}
567 568 569 570 571 572

		// Now insert a copy of the replacement string, but not for a
		// match of the empty string immediately after another match.
		// (Otherwise, we get double replacement for patterns that
		// match both empty and nonempty strings.)
		if a[1] > lastMatchEnd || a[0] == 0 {
573
			buf = repl(buf, a)
574
		}
575
		lastMatchEnd = a[1]
576 577

		// Advance past this match; always advance at least one character.
578 579 580 581 582 583
		var width int
		if bsrc != nil {
			_, width = utf8.DecodeRune(bsrc[searchPos:])
		} else {
			_, width = utf8.DecodeRuneInString(src[searchPos:])
		}
584
		if searchPos+width > a[1] {
585
			searchPos += width
586
		} else if searchPos+1 > a[1] {
587
			// This clause is only needed at the end of the input
588
			// string. In that case, DecodeRuneInString returns width=0.
589
			searchPos++
590
		} else {
591
			searchPos = a[1]
592 593 594 595
		}
	}

	// Copy the unmatched characters after the last match.
596 597 598 599 600
	if bsrc != nil {
		buf = append(buf, bsrc[lastMatchEnd:]...)
	} else {
		buf = append(buf, src[lastMatchEnd:]...)
	}
601

602
	return buf
603 604
}

605
// ReplaceAll returns a copy of src, replacing matches of the Regexp
606
// with the replacement text repl. Inside repl, $ signs are interpreted as
607
// in Expand, so for instance $1 represents the text of the first submatch.
608
func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
609 610 611 612 613 614 615 616
	n := 2
	if bytes.IndexByte(repl, '$') >= 0 {
		n = 2 * (re.numSubexp + 1)
	}
	srepl := ""
	b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
		if len(srepl) != len(repl) {
			srepl = string(repl)
617
		}
618 619 620 621
		return re.expand(dst, srepl, src, "", match)
	})
	return b
}
622

623
// ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp
624
// with the replacement bytes repl. The replacement repl is substituted directly,
625 626 627 628 629 630
// without using Expand.
func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
	return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
		return append(dst, repl...)
	})
}
631

632
// ReplaceAllFunc returns a copy of src in which all matches of the
633
// Regexp have been replaced by the return value of function repl applied
634
// to the matched byte slice. The replacement returned by repl is substituted
635 636 637 638 639
// directly, without using Expand.
func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
	return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
		return append(dst, repl(src[match[0]:match[1]])...)
	})
640 641
}

642 643
// Bitmap used by func special to check whether a character needs to be escaped.
var specialBytes [16]byte
644

645
// special reports whether byte b needs to be escaped by QuoteMeta.
646
func special(b byte) bool {
647 648 649 650 651 652 653
	return b < utf8.RuneSelf && specialBytes[b%16]&(1<<(b/16)) != 0
}

func init() {
	for _, b := range []byte(`\.+*?()|[]{}^$`) {
		specialBytes[b%16] |= 1 << (b / 16)
	}
654 655
}

656
// QuoteMeta returns a string that escapes all regular expression metacharacters
657
// inside the argument text; the returned string is a regular expression matching
658
// the literal text.
659 660
func QuoteMeta(s string) string {
	// A byte loop is correct because all metacharacters are ASCII.
661 662 663 664 665 666 667 668 669 670 671 672 673 674 675
	var i int
	for i = 0; i < len(s); i++ {
		if special(s[i]) {
			break
		}
	}
	// No meta characters found, so return original string.
	if i >= len(s) {
		return s
	}

	b := make([]byte, 2*len(s)-i)
	copy(b, s[:i])
	j := i
	for ; i < len(s); i++ {
676
		if special(s[i]) {
677 678
			b[j] = '\\'
			j++
679
		}
680 681
		b[j] = s[i]
		j++
682
	}
683
	return string(b[:j])
684 685
}

686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702
// The number of capture values in the program may correspond
// to fewer capturing expressions than are in the regexp.
// For example, "(a){0}" turns into an empty program, so the
// maximum capture in the program is 0 but we need to return
// an expression for \1.  Pad appends -1s to the slice a as needed.
func (re *Regexp) pad(a []int) []int {
	if a == nil {
		// No match.
		return nil
	}
	n := (1 + re.numSubexp) * 2
	for len(a) < n {
		a = append(a, -1)
	}
	return a
}

703 704 705
// allMatches calls deliver at most n times
// with the location of successive matches in the input text.
// The input text is b if non-nil, otherwise s.
706
func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
707
	var end int
708
	if b == nil {
709
		end = len(s)
710
	} else {
711
		end = len(b)
712 713 714
	}

	for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
715
		matches := re.doExecute(nil, b, s, pos, re.prog.NumCap, nil)
716
		if len(matches) == 0 {
717
			break
718 719
		}

720
		accept := true
721 722 723 724 725
		if matches[1] == pos {
			// We've found an empty match.
			if matches[0] == prevMatchEnd {
				// We don't allow an empty match right
				// after a previous match, so ignore it.
726
				accept = false
727
			}
728
			var width int
729
			// TODO: use step()
730
			if b == nil {
731
				_, width = utf8.DecodeRuneInString(s[pos:end])
732
			} else {
733
				_, width = utf8.DecodeRune(b[pos:end])
734 735
			}
			if width > 0 {
736
				pos += width
737
			} else {
738
				pos = end + 1
739 740
			}
		} else {
741
			pos = matches[1]
742
		}
743
		prevMatchEnd = matches[1]
744 745

		if accept {
746
			deliver(re.pad(matches))
747
			i++
748 749 750 751
		}
	}
}

752 753 754
// Find returns a slice holding the text of the leftmost match in b of the regular expression.
// A return value of nil indicates no match.
func (re *Regexp) Find(b []byte) []byte {
755 756
	var dstCap [2]int
	a := re.doExecute(nil, b, "", 0, 2, dstCap[:0])
757 758 759 760 761 762 763
	if a == nil {
		return nil
	}
	return b[a[0]:a[1]]
}

// FindIndex returns a two-element slice of integers defining the location of
764
// the leftmost match in b of the regular expression. The match itself is at
765 766 767
// b[loc[0]:loc[1]].
// A return value of nil indicates no match.
func (re *Regexp) FindIndex(b []byte) (loc []int) {
768
	a := re.doExecute(nil, b, "", 0, 2, nil)
769 770 771 772 773 774 775
	if a == nil {
		return nil
	}
	return a[0:2]
}

// FindString returns a string holding the text of the leftmost match in s of the regular
776
// expression. If there is no match, the return value is an empty string,
777
// but it will also be empty if the regular expression successfully matches
778
// an empty string. Use FindStringIndex or FindStringSubmatch if it is
779 780
// necessary to distinguish these cases.
func (re *Regexp) FindString(s string) string {
781 782
	var dstCap [2]int
	a := re.doExecute(nil, nil, s, 0, 2, dstCap[:0])
783 784 785 786 787 788 789
	if a == nil {
		return ""
	}
	return s[a[0]:a[1]]
}

// FindStringIndex returns a two-element slice of integers defining the
790
// location of the leftmost match in s of the regular expression. The match
791 792
// itself is at s[loc[0]:loc[1]].
// A return value of nil indicates no match.
793
func (re *Regexp) FindStringIndex(s string) (loc []int) {
794
	a := re.doExecute(nil, nil, s, 0, 2, nil)
795 796 797 798 799 800 801 802
	if a == nil {
		return nil
	}
	return a[0:2]
}

// FindReaderIndex returns a two-element slice of integers defining the
// location of the leftmost match of the regular expression in text read from
803
// the RuneReader. The match text was found in the input stream at
804 805
// byte offset loc[0] through loc[1]-1.
// A return value of nil indicates no match.
806
func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) {
807
	a := re.doExecute(r, nil, "", 0, 2, nil)
808 809 810 811 812 813 814 815 816 817 818 819
	if a == nil {
		return nil
	}
	return a[0:2]
}

// FindSubmatch returns a slice of slices holding the text of the leftmost
// match of the regular expression in b and the matches, if any, of its
// subexpressions, as defined by the 'Submatch' descriptions in the package
// comment.
// A return value of nil indicates no match.
func (re *Regexp) FindSubmatch(b []byte) [][]byte {
820 821
	var dstCap [4]int
	a := re.doExecute(nil, b, "", 0, re.prog.NumCap, dstCap[:0])
822 823 824
	if a == nil {
		return nil
	}
825
	ret := make([][]byte, 1+re.numSubexp)
826
	for i := range ret {
827
		if 2*i < len(a) && a[2*i] >= 0 {
828 829 830 831 832 833
			ret[i] = b[a[2*i]:a[2*i+1]]
		}
	}
	return ret
}

834 835
// Expand appends template to dst and returns the result; during the
// append, Expand replaces variables in the template with corresponding
836
// matches drawn from src. The match slice should have been returned by
837
// FindSubmatchIndex.
838
//
839 840
// In the template, a variable is denoted by a substring of the form
// $name or ${name}, where name is a non-empty sequence of letters,
841
// digits, and underscores. A purely numeric name like $1 refers to
842
// the submatch with the corresponding index; other names refer to
843
// capturing parentheses named with the (?P<name>...) syntax. A
844
// reference to an out of range or unmatched index or a name that is not
845
// present in the regular expression is replaced with an empty slice.
846
//
847 848
// In the $name form, name is taken to be as long as possible: $1x is
// equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0.
849
//
850 851 852 853 854 855 856
// To insert a literal $ in the output, use $$ in the template.
func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte {
	return re.expand(dst, string(template), src, "", match)
}

// ExpandString is like Expand but the template and source are strings.
// It appends to and returns a byte slice in order to give the calling
Russ Cox's avatar
Russ Cox committed
857
// code control over allocation.
858 859 860 861 862 863
func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte {
	return re.expand(dst, template, nil, src, match)
}

func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte {
	for len(template) > 0 {
864
		i := strings.Index(template, "$")
865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884
		if i < 0 {
			break
		}
		dst = append(dst, template[:i]...)
		template = template[i:]
		if len(template) > 1 && template[1] == '$' {
			// Treat $$ as $.
			dst = append(dst, '$')
			template = template[2:]
			continue
		}
		name, num, rest, ok := extract(template)
		if !ok {
			// Malformed; treat $ as raw text.
			dst = append(dst, '$')
			template = template[1:]
			continue
		}
		template = rest
		if num >= 0 {
885
			if 2*num+1 < len(match) && match[2*num] >= 0 {
886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961
				if bsrc != nil {
					dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...)
				} else {
					dst = append(dst, src[match[2*num]:match[2*num+1]]...)
				}
			}
		} else {
			for i, namei := range re.subexpNames {
				if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 {
					if bsrc != nil {
						dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...)
					} else {
						dst = append(dst, src[match[2*i]:match[2*i+1]]...)
					}
					break
				}
			}
		}
	}
	dst = append(dst, template...)
	return dst
}

// extract returns the name from a leading "$name" or "${name}" in str.
// If it is a number, extract returns num set to that number; otherwise num = -1.
func extract(str string) (name string, num int, rest string, ok bool) {
	if len(str) < 2 || str[0] != '$' {
		return
	}
	brace := false
	if str[1] == '{' {
		brace = true
		str = str[2:]
	} else {
		str = str[1:]
	}
	i := 0
	for i < len(str) {
		rune, size := utf8.DecodeRuneInString(str[i:])
		if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' {
			break
		}
		i += size
	}
	if i == 0 {
		// empty name is not okay
		return
	}
	name = str[:i]
	if brace {
		if i >= len(str) || str[i] != '}' {
			// missing closing brace
			return
		}
		i++
	}

	// Parse number.
	num = 0
	for i := 0; i < len(name); i++ {
		if name[i] < '0' || '9' < name[i] || num >= 1e8 {
			num = -1
			break
		}
		num = num*10 + int(name[i]) - '0'
	}
	// Disallow leading zeros.
	if name[0] == '0' && len(name) > 1 {
		num = -1
	}

	rest = str[i:]
	ok = true
	return
}

962 963 964 965 966 967
// FindSubmatchIndex returns a slice holding the index pairs identifying the
// leftmost match of the regular expression in b and the matches, if any, of
// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
// in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindSubmatchIndex(b []byte) []int {
968
	return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap, nil))
969 970 971 972 973 974 975 976
}

// FindStringSubmatch returns a slice of strings holding the text of the
// leftmost match of the regular expression in s and the matches, if any, of
// its subexpressions, as defined by the 'Submatch' description in the
// package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindStringSubmatch(s string) []string {
977 978
	var dstCap [4]int
	a := re.doExecute(nil, nil, s, 0, re.prog.NumCap, dstCap[:0])
979 980 981
	if a == nil {
		return nil
	}
982
	ret := make([]string, 1+re.numSubexp)
983
	for i := range ret {
984
		if 2*i < len(a) && a[2*i] >= 0 {
985 986 987 988 989 990 991 992 993 994 995 996
			ret[i] = s[a[2*i]:a[2*i+1]]
		}
	}
	return ret
}

// FindStringSubmatchIndex returns a slice holding the index pairs
// identifying the leftmost match of the regular expression in s and the
// matches, if any, of its subexpressions, as defined by the 'Submatch' and
// 'Index' descriptions in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindStringSubmatchIndex(s string) []int {
997
	return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap, nil))
998 999 1000 1001 1002
}

// FindReaderSubmatchIndex returns a slice holding the index pairs
// identifying the leftmost match of the regular expression of text read by
// the RuneReader, and the matches, if any, of its subexpressions, as defined
1003
// by the 'Submatch' and 'Index' descriptions in the package comment. A
1004 1005
// return value of nil indicates no match.
func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
1006
	return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap, nil))
1007 1008
}

1009
const startSize = 10 // The size at which to start a slice in the 'All' routines.
1010

1011 1012 1013 1014 1015 1016 1017 1018
// FindAll is the 'All' version of Find; it returns a slice of all successive
// matches of the expression, as defined by the 'All' description in the
// package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAll(b []byte, n int) [][]byte {
	if n < 0 {
		n = len(b) + 1
	}
1019
	var result [][]byte
1020
	re.allMatches("", b, n, func(match []int) {
1021 1022 1023
		if result == nil {
			result = make([][]byte, 0, startSize)
		}
Russ Cox's avatar
Russ Cox committed
1024
		result = append(result, b[match[0]:match[1]])
1025
	})
Russ Cox's avatar
Russ Cox committed
1026
	return result
1027 1028 1029 1030 1031 1032 1033 1034 1035 1036
}

// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
// successive matches of the expression, as defined by the 'All' description
// in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
	if n < 0 {
		n = len(b) + 1
	}
1037
	var result [][]int
1038
	re.allMatches("", b, n, func(match []int) {
1039 1040 1041
		if result == nil {
			result = make([][]int, 0, startSize)
		}
Russ Cox's avatar
Russ Cox committed
1042
		result = append(result, match[0:2])
1043
	})
Russ Cox's avatar
Russ Cox committed
1044
	return result
1045 1046 1047 1048 1049 1050 1051 1052 1053 1054
}

// FindAllString is the 'All' version of FindString; it returns a slice of all
// successive matches of the expression, as defined by the 'All' description
// in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllString(s string, n int) []string {
	if n < 0 {
		n = len(s) + 1
	}
1055
	var result []string
1056
	re.allMatches(s, nil, n, func(match []int) {
1057 1058 1059
		if result == nil {
			result = make([]string, 0, startSize)
		}
Russ Cox's avatar
Russ Cox committed
1060
		result = append(result, s[match[0]:match[1]])
1061
	})
Russ Cox's avatar
Russ Cox committed
1062
	return result
1063 1064 1065 1066 1067 1068 1069 1070 1071 1072
}

// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
// slice of all successive matches of the expression, as defined by the 'All'
// description in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
	if n < 0 {
		n = len(s) + 1
	}
1073
	var result [][]int
1074
	re.allMatches(s, nil, n, func(match []int) {
1075 1076 1077
		if result == nil {
			result = make([][]int, 0, startSize)
		}
Russ Cox's avatar
Russ Cox committed
1078
		result = append(result, match[0:2])
1079
	})
Russ Cox's avatar
Russ Cox committed
1080
	return result
1081 1082 1083 1084 1085 1086 1087 1088 1089 1090
}

// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
// of all successive matches of the expression, as defined by the 'All'
// description in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
	if n < 0 {
		n = len(b) + 1
	}
1091
	var result [][][]byte
1092
	re.allMatches("", b, n, func(match []int) {
1093 1094 1095
		if result == nil {
			result = make([][][]byte, 0, startSize)
		}
1096 1097 1098 1099 1100 1101
		slice := make([][]byte, len(match)/2)
		for j := range slice {
			if match[2*j] >= 0 {
				slice[j] = b[match[2*j]:match[2*j+1]]
			}
		}
Russ Cox's avatar
Russ Cox committed
1102
		result = append(result, slice)
1103
	})
Russ Cox's avatar
Russ Cox committed
1104
	return result
1105 1106 1107 1108 1109 1110 1111 1112 1113 1114
}

// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
// a slice of all successive matches of the expression, as defined by the
// 'All' description in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
	if n < 0 {
		n = len(b) + 1
	}
1115
	var result [][]int
1116
	re.allMatches("", b, n, func(match []int) {
1117 1118 1119
		if result == nil {
			result = make([][]int, 0, startSize)
		}
Russ Cox's avatar
Russ Cox committed
1120
		result = append(result, match)
1121
	})
Russ Cox's avatar
Russ Cox committed
1122
	return result
1123 1124 1125 1126 1127 1128 1129 1130 1131 1132
}

// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
// returns a slice of all successive matches of the expression, as defined by
// the 'All' description in the package comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
	if n < 0 {
		n = len(s) + 1
	}
1133
	var result [][]string
1134
	re.allMatches(s, nil, n, func(match []int) {
1135 1136 1137
		if result == nil {
			result = make([][]string, 0, startSize)
		}
1138 1139 1140 1141 1142 1143
		slice := make([]string, len(match)/2)
		for j := range slice {
			if match[2*j] >= 0 {
				slice[j] = s[match[2*j]:match[2*j+1]]
			}
		}
Russ Cox's avatar
Russ Cox committed
1144
		result = append(result, slice)
1145
	})
Russ Cox's avatar
Russ Cox committed
1146
	return result
1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157
}

// FindAllStringSubmatchIndex is the 'All' version of
// FindStringSubmatchIndex; it returns a slice of all successive matches of
// the expression, as defined by the 'All' description in the package
// comment.
// A return value of nil indicates no match.
func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
	if n < 0 {
		n = len(s) + 1
	}
1158
	var result [][]int
1159
	re.allMatches(s, nil, n, func(match []int) {
1160 1161 1162
		if result == nil {
			result = make([][]int, 0, startSize)
		}
Russ Cox's avatar
Russ Cox committed
1163
		result = append(result, match)
1164
	})
Russ Cox's avatar
Russ Cox committed
1165
	return result
1166
}
Rick Arnold's avatar
Rick Arnold committed
1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215

// Split slices s into substrings separated by the expression and returns a slice of
// the substrings between those expression matches.
//
// The slice returned by this method consists of all the substrings of s
// not contained in the slice returned by FindAllString. When called on an expression
// that contains no metacharacters, it is equivalent to strings.SplitN.
//
// Example:
//   s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5)
//   // s: ["", "b", "b", "c", "cadaaae"]
//
// The count determines the number of substrings to return:
//   n > 0: at most n substrings; the last substring will be the unsplit remainder.
//   n == 0: the result is nil (zero substrings)
//   n < 0: all substrings
func (re *Regexp) Split(s string, n int) []string {

	if n == 0 {
		return nil
	}

	if len(re.expr) > 0 && len(s) == 0 {
		return []string{""}
	}

	matches := re.FindAllStringIndex(s, n)
	strings := make([]string, 0, len(matches))

	beg := 0
	end := 0
	for _, match := range matches {
		if n > 0 && len(strings) >= n-1 {
			break
		}

		end = match[0]
		if match[1] != 0 {
			strings = append(strings, s[beg:end])
		}
		beg = match[1]
	}

	if end != len(s) {
		strings = append(strings, s[beg:])
	}

	return strings
}