bytes, strings: optimize multi-byte index operations on s390x

Use vector instructions to speed up indexing operations for short strings (64 bytes or less). bytes_s390x.go and strings_s390x.go are based on their amd64 equivalents in CL 31690. bytes package: name old time/op new time/op delta Index/10 40.3ns ± 7% 11.3ns ± 4% -72.06% (p=0.000 n=10+10) Index/32 196ns ± 1% 27ns ± 2% -86.25% (p=0.000 n=10+10) Index/4K 28.9µs ± 1% 1.5µs ± 2% -94.94% (p=0.000 n=9+9) Index/4M 30.1ms ± 2% 1.5ms ± 3% -94.94% (p=0.000 n=10+10) Index/64M 549ms ±13% 28ms ± 3% -94.87% (p=0.000 n=10+9) IndexEasy/10 18.8ns ±11% 11.5ns ± 2% -38.81% (p=0.000 n=10+10) IndexEasy/32 23.6ns ± 6% 28.1ns ± 3% +19.29% (p=0.000 n=10+10) IndexEasy/4K 251ns ± 5% 223ns ± 8% -11.04% (p=0.000 n=10+10) IndexEasy/4M 318µs ± 9% 266µs ± 8% -16.42% (p=0.000 n=10+10) IndexEasy/64M 14.7ms ±16% 13.2ms ±11% -10.22% (p=0.001 n=10+10) strings package: name old time/op new time/op delta IndexRune 88.1ns ±16% 28.9ns ± 4% -67.20% (p=0.000 n=10+10) IndexRuneLongString 456ns ± 7% 34ns ± 3% -92.50% (p=0.000 n=10+10) IndexRuneFastPath 12.9ns ±14% 11.1ns ± 6% -13.84% (p=0.000 n=10+10) Index 13.0ns ± 7% 11.3ns ± 4% -13.31% (p=0.000 n=10+10) IndexHard1 3.38ms ± 9% 0.07ms ± 1% -97.79% (p=0.000 n=10+10) IndexHard2 3.58ms ± 7% 0.37ms ± 2% -89.78% (p=0.000 n=10+10) IndexHard3 3.47ms ± 7% 0.75ms ± 1% -78.52% (p=0.000 n=10+10) IndexHard4 3.56ms ± 6% 1.34ms ± 0% -62.39% (p=0.000 n=9+9) Change-Id: If36c2afb8c02e80fcaa1cf5ec2abb0a2be08c7d1 Reviewed-on: https://go-review.googlesource.com/32447 Run-TryBot: Michael Munday <munday@ca.ibm.com> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>

bytes, strings: optimize multi-byte index operations on s390x
Use vector instructions to speed up indexing operations for short strings (64 bytes or less). bytes_s390x.go and strings_s390x.go are based on their amd64 equivalents in CL 31690. bytes package: name old time/op new time/op delta Index/10 40.3ns ± 7% 11.3ns ± 4% -72.06% (p=0.000 n=10+10) Index/32 196ns ± 1% 27ns ± 2% -86.25% (p=0.000 n=10+10) Index/4K 28.9µs ± 1% 1.5µs ± 2% -94.94% (p=0.000 n=9+9) Index/4M 30.1ms ± 2% 1.5ms ± 3% -94.94% (p=0.000 n=10+10) Index/64M 549ms ±13% 28ms ± 3% -94.87% (p=0.000 n=10+9) IndexEasy/10 18.8ns ±11% 11.5ns ± 2% -38.81% (p=0.000 n=10+10) IndexEasy/32 23.6ns ± 6% 28.1ns ± 3% +19.29% (p=0.000 n=10+10) IndexEasy/4K 251ns ± 5% 223ns ± 8% -11.04% (p=0.000 n=10+10) IndexEasy/4M 318µs ± 9% 266µs ± 8% -16.42% (p=0.000 n=10+10) IndexEasy/64M 14.7ms ±16% 13.2ms ±11% -10.22% (p=0.001 n=10+10) strings package: name old time/op new time/op delta IndexRune 88.1ns ±16% 28.9ns ± 4% -67.20% (p=0.000 n=10+10) IndexRuneLongString 456ns ± 7% 34ns ± 3% -92.50% (p=0.000 n=10+10) IndexRuneFastPath 12.9ns ±14% 11.1ns ± 6% -13.84% (p=0.000 n=10+10) Index 13.0ns ± 7% 11.3ns ± 4% -13.31% (p=0.000 n=10+10) IndexHard1 3.38ms ± 9% 0.07ms ± 1% -97.79% (p=0.000 n=10+10) IndexHard2 3.58ms ± 7% 0.37ms ± 2% -89.78% (p=0.000 n=10+10) IndexHard3 3.47ms ± 7% 0.75ms ± 1% -78.52% (p=0.000 n=10+10) IndexHard4 3.56ms ± 6% 1.34ms ± 0% -62.39% (p=0.000 n=9+9) Change-Id: If36c2afb8c02e80fcaa1cf5ec2abb0a2be08c7d1 Reviewed-on: https://go-review.googlesource.com/32447 Run-TryBot: Michael Munday <munday@ca.ibm.com> Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
47c58b46 · Michael Munday · 11ff4b21 · 47c58b46 · 47c58b46 · 47c58b46
Commit 47c58b46 authored Oct 01, 2016 by Michael Munday
5 changed files
--- a/src/bytes/bytes_generic.go
+++ b/src/bytes/bytes_generic.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-// +build !amd64
+// +build !amd64,!s390x

 package bytes


--- a/src/bytes/bytes_s390x.go
+++ b/src/bytes/bytes_s390x.go
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytes
+
+//go:noescape
+
+// indexShortStr returns the index of the first instance of sep in s,
+// or -1 if sep is not present in s.
+// indexShortStr requires 2 <= len(sep) <= shortStringLen
+func indexShortStr(s, c []byte) int // ../runtime/asm_s390x.s
+
+// supportsVX reports whether the vector facility is available.
+// indexShortStr must not be called if the vector facility is not
+// available.
+func supportsVX() bool // ../runtime/asm_s390x.s
+
+var shortStringLen = -1
+
+func init() {
+	if supportsVX() {
+		shortStringLen = 64
+	}
+}
+
+// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
+func Index(s, sep []byte) int {
+	n := len(sep)
+	switch {
+	case n == 0:
+		return 0
+	case n == 1:
+		return IndexByte(s, sep[0])
+	case n <= shortStringLen:
+		// Use brute force when s and sep both are small
+		if len(s) <= 64 {
+			return indexShortStr(s, sep)
+		}
+		c := sep[0]
+		i := 0
+		t := s[:len(s)-n+1]
+		fails := 0
+		for i < len(t) {
+			if t[i] != c {
+				// IndexByte skips 16/32 bytes per iteration,
+				// so it's faster than indexShortStr.
+				o := IndexByte(t[i:], c)
+				if o < 0 {
+					return -1
+				}
+				i += o
+			}
+			if Equal(s[i:i+n], sep) {
+				return i
+			}
+			fails++
+			i++
+			// Switch to indexShortStr when IndexByte produces too many false positives.
+			// Too many means more that 1 error per 8 characters.
+			// Allow some errors in the beginning.
+			if fails > (i+16)/8 {
+				r := indexShortStr(s[i:], sep)
+				if r >= 0 {
+					return r + i
+				}
+				return -1
+			}
+		}
+		return -1
+	case n == len(s):
+		if Equal(sep, s) {
+			return 0
+		}
+		return -1
+	case n > len(s):
+		return -1
+	}
+	// Rabin-Karp search
+	hashsep, pow := hashStr(sep)
+	var h uint32
+	for i := 0; i < n; i++ {
+		h = h*primeRK + uint32(s[i])
+	}
+	if h == hashsep && Equal(s[:n], sep) {
+		return 0
+	}
+	for i := n; i < len(s); {
+		h *= primeRK
+		h += uint32(s[i])
+		h -= pow * uint32(s[i-n])
+		i++
+		if h == hashsep && Equal(s[i-n:i], sep) {
+			return i - n
+		}
+	}
+	return -1
+}
+
+// primeRK is the prime base used in Rabin-Karp algorithm.
+const primeRK = 16777619
+
+// hashStr returns the hash and the appropriate multiplicative
+// factor for use in Rabin-Karp algorithm.
+func hashStr(sep []byte) (uint32, uint32) {
+	hash := uint32(0)
+	for i := 0; i < len(sep); i++ {
+		hash = hash*primeRK + uint32(sep[i])
+	}
+	var pow, sq uint32 = 1, primeRK
+	for i := len(sep); i > 0; i >>= 1 {
+		if i&1 != 0 {
+			pow *= sq
+		}
+		sq *= sq
+	}
+	return hash, pow
+}
--- a/src/runtime/asm_s390x.s
+++ b/src/runtime/asm_s390x.s
@@ -1076,6 +1076,230 @@ TEXT runtime·cmpbodyclc(SB),NOSPLIT|NOFRAME,$0-0
 	CLC	$1, 0(R3), 0(R5)
 	RET

+// func supportsVX() bool
+TEXT strings·supportsVX(SB),NOSPLIT,$0-1
+	MOVBZ	runtime·cpu+facilities_hasVX(SB), R0
+	MOVB	R0, ret+0(FP)
+	RET
+
+// func supportsVX() bool
+TEXT bytes·supportsVX(SB),NOSPLIT,$0-1
+	MOVBZ	runtime·cpu+facilities_hasVX(SB), R0
+	MOVB	R0, ret+0(FP)
+	RET
+
+// func indexShortStr(s, sep string) int
+// Caller must confirm availability of vx facility before calling.
+TEXT strings·indexShortStr(SB),NOSPLIT|NOFRAME,$0-40
+	LMG	s+0(FP), R1, R2   // R1=&s[0],   R2=len(s)
+	LMG	sep+16(FP), R3, R4 // R3=&sep[0], R4=len(sep)
+	MOVD	$ret+32(FP), R5
+	BR	runtime·indexShortStr(SB)
+
+// func indexShortStr(s, sep []byte) int
+// Caller must confirm availability of vx facility before calling.
+TEXT bytes·indexShortStr(SB),NOSPLIT|NOFRAME,$0-56
+	LMG	s+0(FP), R1, R2    // R1=&s[0],   R2=len(s)
+	LMG	sep+24(FP), R3, R4 // R3=&sep[0], R4=len(sep)
+	MOVD	$ret+48(FP), R5
+	BR	runtime·indexShortStr(SB)
+
+// s: string we are searching
+// sep: string to search for
+// R1=&s[0], R2=len(s)
+// R3=&sep[0], R4=len(sep)
+// R5=&ret (int)
+// Caller must confirm availability of vx facility before calling.
+TEXT runtime·indexShortStr(SB),NOSPLIT|NOFRAME,$0
+	CMPBGT	R4, R2, notfound
+	ADD	R1, R2
+	SUB	R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
+	CMPBEQ	R4, $0, notfound
+	SUB	$1, R4 // R4=len(sep)-1 for use as VLL index
+	VLL	R4, (R3), V0 // contains first 16 bytes of sep
+	MOVD	R1, R7
+index2plus:
+	CMPBNE	R4, $1, index3plus
+	MOVD	$15(R7), R9
+	CMPBGE	R9, R2, index2to16
+	VGBM	$0xaaaa, V31       // 0xff00ff00ff00ff00...
+	VONE	V16
+	VREPH	$0, V0, V1
+	CMPBGE	R9, R2, index2to16
+index2loop:
+	VL	0(R7), V2          // 16 bytes, even indices
+	VL	1(R7), V4          // 16 bytes, odd indices
+	VCEQH	V1, V2, V5         // compare even indices
+	VCEQH	V1, V4, V6         // compare odd indices
+	VSEL	V5, V6, V31, V7    // merge even and odd indices
+	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
+	BLT	foundV17
+	MOVD	$16(R7), R7        // R7+=16
+	ADD	$15, R7, R9
+	CMPBLE	R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
+	CMPBLE	R7, R2, index2to16
+	BR	notfound
+
+index3plus:
+	CMPBNE	R4, $2, index4plus
+	ADD	$15, R7, R9
+	CMPBGE	R9, R2, index2to16
+	MOVD	$1, R0
+	VGBM	$0xaaaa, V31       // 0xff00ff00ff00ff00...
+	VONE	V16
+	VREPH	$0, V0, V1
+	VREPB	$2, V0, V8
+index3loop:
+	VL	(R7), V2           // load 16-bytes into V2
+	VLL	R0, 16(R7), V3     // load 2-bytes into V3
+	VSLDB	$1, V2, V3, V4     // V4=(V2:V3)<<1
+	VSLDB	$2, V2, V3, V9     // V9=(V2:V3)<<2
+	VCEQH	V1, V2, V5         // compare 2-byte even indices
+	VCEQH	V1, V4, V6         // compare 2-byte odd indices
+	VCEQB	V8, V9, V10        // compare last bytes
+	VSEL	V5, V6, V31, V7    // merge even and odd indices
+	VN	V7, V10, V7        // AND indices with last byte
+	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
+	BLT	foundV17
+	MOVD	$16(R7), R7        // R7+=16
+	ADD	$15, R7, R9
+	CMPBLE	R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
+	CMPBLE	R7, R2, index2to16
+	BR	notfound
+
+index4plus:
+	CMPBNE	R4, $3, index5plus
+	ADD	$15, R7, R9
+	CMPBGE	R9, R2, index2to16
+	MOVD	$2, R0
+	VGBM	$0x8888, V29       // 0xff000000ff000000...
+	VGBM	$0x2222, V30       // 0x0000ff000000ff00...
+	VGBM	$0xcccc, V31       // 0xffff0000ffff0000...
+	VONE	V16
+	VREPF	$0, V0, V1
+index4loop:
+	VL	(R7), V2           // load 16-bytes into V2
+	VLL	R0, 16(R7), V3     // load 3-bytes into V3
+	VSLDB	$1, V2, V3, V4     // V4=(V2:V3)<<1
+	VSLDB	$2, V2, V3, V9     // V9=(V2:V3)<<1
+	VSLDB	$3, V2, V3, V10    // V10=(V2:V3)<<1
+	VCEQF	V1, V2, V5         // compare index 0, 4, ...
+	VCEQF	V1, V4, V6         // compare index 1, 5, ...
+	VCEQF	V1, V9, V11        // compare index 2, 6, ...
+	VCEQF	V1, V10, V12       // compare index 3, 7, ...
+	VSEL	V5, V6, V29, V13   // merge index 0, 1, 4, 5, ...
+	VSEL	V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
+	VSEL	V13, V14, V31, V7  // final merge
+	VFEEBS	V16, V7, V17       // find leftmost index, set condition to 1 if found
+	BLT	foundV17
+	MOVD	$16(R7), R7        // R7+=16
+	ADD	$15, R7, R9
+	CMPBLE	R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
+	CMPBLE	R7, R2, index2to16
+	BR	notfound
+
+index5plus:
+	CMPBGT	R4, $15, index17plus
+index2to16:
+	CMPBGT	R7, R2, notfound
+	MOVD	$1(R7), R8
+	CMPBGT	R8, R2, index2to16tail
+index2to16loop:
+	// unrolled 2x
+	VLL	R4, (R7), V1
+	VLL	R4, 1(R7), V2
+	VCEQGS	V0, V1, V3
+	BEQ	found
+	MOVD	$1(R7), R7
+	VCEQGS	V0, V2, V4
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLT	R7, R2, index2to16loop
+	CMPBGT	R7, R2, notfound
+index2to16tail:
+	VLL	R4, (R7), V1
+	VCEQGS	V0, V1, V2
+	BEQ	found
+	BR	notfound
+
+index17plus:
+	CMPBGT	R4, $31, index33plus
+	SUB	$16, R4, R0
+	VLL	R0, 16(R3), V1
+	VONE	V7
+index17to32loop:
+	VL	(R7), V2
+	VLL	R0, 16(R7), V3
+	VCEQG	V0, V2, V4
+	VCEQG	V1, V3, V5
+	VN	V4, V5, V6
+	VCEQGS	V6, V7, V8
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLE  R7, R2, index17to32loop
+	BR	notfound
+
+index33plus:
+	CMPBGT	R4, $47, index49plus
+	SUB	$32, R4, R0
+	VL	16(R3), V1
+	VLL	R0, 32(R3), V2
+	VONE	V11
+index33to48loop:
+	VL	(R7), V3
+	VL	16(R7), V4
+	VLL	R0, 32(R7), V5
+	VCEQG	V0, V3, V6
+	VCEQG	V1, V4, V7
+	VCEQG	V2, V5, V8
+	VN	V6, V7, V9
+	VN	V8, V9, V10
+	VCEQGS	V10, V11, V12
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLE  R7, R2, index33to48loop
+	BR	notfound
+
+index49plus:
+	CMPBGT	R4, $63, index65plus
+	SUB	$48, R4, R0
+	VL	16(R3), V1
+	VL	32(R3), V2
+	VLL	R0, 48(R3), V3
+	VONE	V15
+index49to64loop:
+	VL	(R7), V4
+	VL	16(R7), V5
+	VL	32(R7), V6
+	VLL	R0, 48(R7), V7
+	VCEQG	V0, V4, V8
+	VCEQG	V1, V5, V9
+	VCEQG	V2, V6, V10
+	VCEQG	V3, V7, V11
+	VN	V8, V9, V12
+	VN	V10, V11, V13
+	VN	V12, V13, V14
+	VCEQGS	V14, V15, V16
+	BEQ	found
+	MOVD	$1(R7), R7
+	CMPBLE  R7, R2, index49to64loop
+notfound:
+	MOVD	$-1, (R5)
+	RET
+
+index65plus:
+	// not implemented
+	MOVD	$0, (R0)
+	RET
+
+foundV17: // index is in doubleword V17[0]
+	VLGVG	$0, V17, R8
+	ADD	R8, R7
+found:
+	SUB	R1, R7
+	MOVD	R7, (R5)
+	RET
+
 // This is called from .init_array and follows the platform, not Go, ABI.
 // We are overly conservative. We could only save the registers we use.
 // However, since this function is only called once per loaded module

--- a/src/strings/strings_generic.go
+++ b/src/strings/strings_generic.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.

-// +build !amd64
+// +build !amd64,!s390x

 package strings


--- a/src/strings/strings_s390x.go
+++ b/src/strings/strings_s390x.go
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package strings
+
+//go:noescape
+
+// indexShortStr returns the index of the first instance of sep in s,
+// or -1 if sep is not present in s.
+// indexShortStr requires 2 <= len(sep) <= shortStringLen
+func indexShortStr(s, sep string) int // ../runtime/asm_$GOARCH.s
+
+// supportsVX reports whether the vector facility is available.
+// indexShortStr must not be called if the vector facility is not
+// available.
+func supportsVX() bool // ../runtime/asm_s390x.s
+
+var shortStringLen = -1
+
+func init() {
+	if supportsVX() {
+		shortStringLen = 64
+	}
+}
+
+// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
+func Index(s, sep string) int {
+	n := len(sep)
+	switch {
+	case n == 0:
+		return 0
+	case n == 1:
+		return IndexByte(s, sep[0])
+	case n <= shortStringLen:
+		// Use brute force when s and sep both are small
+		if len(s) <= 64 {
+			return indexShortStr(s, sep)
+		}
+		c := sep[0]
+		i := 0
+		t := s[:len(s)-n+1]
+		fails := 0
+		for i < len(t) {
+			if t[i] != c {
+				// IndexByte skips 16/32 bytes per iteration,
+				// so it's faster than indexShortStr.
+				o := IndexByte(t[i:], c)
+				if o < 0 {
+					return -1
+				}
+				i += o
+			}
+			if s[i:i+n] == sep {
+				return i
+			}
+			fails++
+			i++
+			// Switch to indexShortStr when IndexByte produces too many false positives.
+			// Too many means more that 1 error per 8 characters.
+			// Allow some errors in the beginning.
+			if fails > (i+16)/8 {
+				r := indexShortStr(s[i:], sep)
+				if r >= 0 {
+					return r + i
+				}
+				return -1
+			}
+		}
+		return -1
+	case n == len(s):
+		if sep == s {
+			return 0
+		}
+		return -1
+	case n > len(s):
+		return -1
+	}
+	// Rabin-Karp search
+	hashsep, pow := hashStr(sep)
+	var h uint32
+	for i := 0; i < n; i++ {
+		h = h*primeRK + uint32(s[i])
+	}
+	if h == hashsep && s[:n] == sep {
+		return 0
+	}
+	for i := n; i < len(s); {
+		h *= primeRK
+		h += uint32(s[i])
+		h -= pow * uint32(s[i-n])
+		i++
+		if h == hashsep && s[i-n:i] == sep {
+			return i - n
+		}
+	}
+	return -1
+}