bytes, strings: fix regression in IndexRune

In all previous versions of Go, the behavior of IndexRune(s, r) where r was utf.RuneError was that it would effectively return the index of any invalid UTF-8 byte sequence (include RuneError). Optimizations made in http://golang.org/cl/28537 and http://golang.org/cl/28546 altered this undocumented behavior such that RuneError would only match on the RuneError rune itself. Although, the new behavior is arguably reasonable, it did break code that depended on the previous behavior. Thus, we add special checks to ensure that we preserve the old behavior. There is a slight performance hit for correctness: benchmark old ns/op new ns/op delta BenchmarkIndexRune/10-4 19.3 21.6 +11.92% BenchmarkIndexRune/32-4 33.6 35.2 +4.76% This only occurs on small strings. The performance hit for larger strings is neglible and not shown. Fixes #17611 Change-Id: I1d863a741213d46c40b2e1724c41245df52502a5 Reviewed-on: https://go-review.googlesource.com/32123 Run-TryBot: Joe Tsai <thebrokentoaster@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>

bytes, strings: fix regression in IndexRune
In all previous versions of Go, the behavior of IndexRune(s, r) where r was utf.RuneError was that it would effectively return the index of any invalid UTF-8 byte sequence (include RuneError). Optimizations made in http://golang.org/cl/28537 and http://golang.org/cl/28546 altered this undocumented behavior such that RuneError would only match on the RuneError rune itself. Although, the new behavior is arguably reasonable, it did break code that depended on the previous behavior. Thus, we add special checks to ensure that we preserve the old behavior. There is a slight performance hit for correctness: benchmark old ns/op new ns/op delta BenchmarkIndexRune/10-4 19.3 21.6 +11.92% BenchmarkIndexRune/32-4 33.6 35.2 +4.76% This only occurs on small strings. The performance hit for larger strings is neglible and not shown. Fixes #17611 Change-Id: I1d863a741213d46c40b2e1724c41245df52502a5 Reviewed-on: https://go-review.googlesource.com/32123 Run-TryBot: Joe Tsai <thebrokentoaster@gmail.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
4b266578 · Joe Tsai · Joe Tsai · 4f1e7be5 · 4b266578 · 4b266578
Commit 4b266578 authored Oct 26, 2016 by Joe Tsai Committed by Joe Tsai Oct 26, 2016
4 changed files
--- a/src/bytes/bytes.go
+++ b/src/bytes/bytes.go
@@ -130,13 +130,28 @@ func LastIndexByte(s []byte, c byte) int {
 // IndexRune interprets s as a sequence of UTF-8-encoded Unicode code points.
 // It returns the byte index of the first occurrence in s of the given rune.
 // It returns -1 if rune is not present in s.
+// If r is utf8.RuneError, it returns the first instance of any
+// invalid UTF-8 byte sequence.
 func IndexRune(s []byte, r rune) int {
-	if r < utf8.RuneSelf {
+	switch {
+	case 0 <= r && r < utf8.RuneSelf:
 		return IndexByte(s, byte(r))
+	case r == utf8.RuneError:
+		for i := 0; i < len(s); {
+			r1, n := utf8.DecodeRune(s[i:])
+			if r1 == utf8.RuneError {
+				return i
+			}
+			i += n
+		}
+		return -1
+	case !utf8.ValidRune(r):
+		return -1
+	default:
+		var b [utf8.UTFMax]byte
+		n := utf8.EncodeRune(b[:], r)
+		return Index(s, b[:n])
 	}
-	var b [utf8.UTFMax]byte
-	n := utf8.EncodeRune(b[:], r)
-	return Index(s, b[:n])
 }

 // IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.

--- a/src/bytes/bytes_test.go
+++ b/src/bytes/bytes_test.go
@@ -185,15 +185,6 @@ var lastIndexAnyTests = []BinOpTest{
 	{dots + dots + dots, " ", -1},
 }

-var indexRuneTests = []BinOpTest{
-	{"", "a", -1},
-	{"", "☺", -1},
-	{"foo", "☹", -1},
-	{"foo", "o", 1},
-	{"foo☺bar", "☺", 3},
-	{"foo☺☻☹bar", "☹", 9},
-}
-
 // Execute f on each test case.  funcName should be the name of f; it's used
 // in failure reports.
 func runIndexTests(t *testing.T, f func(s, sep []byte) int, funcName string, testCases []BinOpTest) {
@@ -348,17 +339,42 @@ func TestIndexByteSmall(t *testing.T) {
 }

 func TestIndexRune(t *testing.T) {
-	for _, tt := range indexRuneTests {
-		a := []byte(tt.a)
-		r, _ := utf8.DecodeRuneInString(tt.b)
-		pos := IndexRune(a, r)
-		if pos != tt.i {
-			t.Errorf(`IndexRune(%q, '%c') = %v`, tt.a, r, pos)
+	tests := []struct {
+		in   string
+		rune rune
+		want int
+	}{
+		{"", 'a', -1},
+		{"", '☺', -1},
+		{"foo", '☹', -1},
+		{"foo", 'o', 1},
+		{"foo☺bar", '☺', 3},
+		{"foo☺☻☹bar", '☹', 9},
+		{"a A x", 'A', 2},
+		{"some_text=some_value", '=', 9},
+		{"☺a", 'a', 3},
+		{"a☻☺b", '☺', 4},
+
+		// RuneError should match any invalid UTF-8 byte sequence.
+		{"�", '�', 0},
+		{"\xff", '�', 0},
+		{"☻x�", '�', len("☻x")},
+		{"☻x\xe2\x98", '�', len("☻x")},
+		{"☻x\xe2\x98�", '�', len("☻x")},
+		{"☻x\xe2\x98x", '�', len("☻x")},
+
+		// Invalid rune values should never match.
+		{"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", -1, -1},
+		{"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", 0xD800, -1}, // Surrogate pair
+		{"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", utf8.MaxRune + 1, -1},
+	}
+	for _, tt := range tests {
+		if got := IndexRune([]byte(tt.in), tt.rune); got != tt.want {
+			t.Errorf("IndexRune(%q, %d) = %v; want %v", tt.in, tt.rune, got, tt.want)
 		}
 	}

 	haystack := []byte("test世界")
-
 	allocs := testing.AllocsPerRun(1000, func() {
 		if i := IndexRune(haystack, 's'); i != 2 {
 			t.Fatalf("'s' at %d; want 2", i)
@@ -368,7 +384,7 @@ func TestIndexRune(t *testing.T) {
 		}
 	})
 	if allocs != 0 {
-		t.Errorf(`expected no allocations, got %f`, allocs)
+		t.Errorf("expected no allocations, got %f", allocs)
 	}
 }


--- a/src/strings/strings.go
+++ b/src/strings/strings.go
@@ -145,12 +145,24 @@ func LastIndex(s, sep string) int {

 // IndexRune returns the index of the first instance of the Unicode code point
 // r, or -1 if rune is not present in s.
+// If r is utf8.RuneError, it returns the first instance of any
+// invalid UTF-8 byte sequence.
 func IndexRune(s string, r rune) int {
-	if r < utf8.RuneSelf {
+	switch {
+	case 0 <= r && r < utf8.RuneSelf:
 		return IndexByte(s, byte(r))
+	case r == utf8.RuneError:
+		for i, r := range s {
+			if r == utf8.RuneError {
+				return i
+			}
+		}
+		return -1
+	case !utf8.ValidRune(r):
+		return -1
+	default:
+		return Index(s, string(r))
 	}
-
-	return Index(s, string(r))
 }

 // IndexAny returns the index of the first instance of any Unicode code point

--- a/src/strings/strings_test.go
+++ b/src/strings/strings_test.go
@@ -240,21 +240,39 @@ func TestIndexRandom(t *testing.T) {
 	}
 }

-var indexRuneTests = []struct {
-	s    string
-	rune rune
-	out  int
-}{
-	{"a A x", 'A', 2},
-	{"some_text=some_value", '=', 9},
-	{"☺a", 'a', 3},
-	{"a☻☺b", '☺', 4},
-}
-
 func TestIndexRune(t *testing.T) {
-	for _, test := range indexRuneTests {
-		if actual := IndexRune(test.s, test.rune); actual != test.out {
-			t.Errorf("IndexRune(%q,%d)= %v; want %v", test.s, test.rune, actual, test.out)
+	tests := []struct {
+		in   string
+		rune rune
+		want int
+	}{
+		{"", 'a', -1},
+		{"", '☺', -1},
+		{"foo", '☹', -1},
+		{"foo", 'o', 1},
+		{"foo☺bar", '☺', 3},
+		{"foo☺☻☹bar", '☹', 9},
+		{"a A x", 'A', 2},
+		{"some_text=some_value", '=', 9},
+		{"☺a", 'a', 3},
+		{"a☻☺b", '☺', 4},
+
+		// RuneError should match any invalid UTF-8 byte sequence.
+		{"�", '�', 0},
+		{"\xff", '�', 0},
+		{"☻x�", '�', len("☻x")},
+		{"☻x\xe2\x98", '�', len("☻x")},
+		{"☻x\xe2\x98�", '�', len("☻x")},
+		{"☻x\xe2\x98x", '�', len("☻x")},
+
+		// Invalid rune values should never match.
+		{"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", -1, -1},
+		{"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", 0xD800, -1}, // Surrogate pair
+		{"a☺b☻c☹d\xe2\x98�\xff�\xed\xa0\x80", utf8.MaxRune + 1, -1},
+	}
+	for _, tt := range tests {
+		if got := IndexRune(tt.in, tt.rune); got != tt.want {
+			t.Errorf("IndexRune(%q, %d) = %v; want %v", tt.in, tt.rune, got, tt.want)
 		}
 	}

@@ -267,9 +285,8 @@ func TestIndexRune(t *testing.T) {
 			t.Fatalf("'世' at %d; want 4", i)
 		}
 	})
-
 	if allocs != 0 {
-		t.Errorf(`expected no allocations, got %f`, allocs)
+		t.Errorf("expected no allocations, got %f", allocs)
 	}
 }