Commit a96e117a authored by Keith Randall

runtime: amd64, use 4-byte ops for memmove of 4 bytes

memmove used to use 2 2-byte load/store pairs to move 4 bytes.
When the result is loaded with a single 4-byte load, it caused
a store-to-load forwarding stall.  To avoid the stall,
special case memmove to use 4 byte ops for the 4 byte copy case.

We already have a special case for 8-byte copies.
386 already specializes 4-byte copies.
I'll do 2-byte copies also, but not for 1.8.

benchmark                 old ns/op     new ns/op     delta
BenchmarkIssue18740-8     7567          4799          -36.58%

3-byte copies get a bit slower.  Other copies are unchanged.
name         old time/op   new time/op   delta
Memmove/3-8   4.76ns ± 5%   5.26ns ± 3%  +10.50%  (p=0.000 n=10+10)

Fixes #18740

Change-Id: Iec82cbac0ecfee80fa3c8fc83828f9a1819c3c74
Reviewed-on: https://go-review.googlesource.com/35567
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: David Chase <drchase@google.com>
parent 4cce27a3
...@@ -146,10 +146,16 @@ move_1or2: ...@@ -146,10 +146,16 @@ move_1or2:
move_0: move_0:
RET RET
move_3or4: move_3or4:
CMPQ BX, $4
JB move_3
MOVL (SI), AX
MOVL AX, (DI)
RET
move_3:
MOVW (SI), AX MOVW (SI), AX
MOVW -2(SI)(BX*1), CX MOVB 2(SI), CX
MOVW AX, (DI) MOVW AX, (DI)
MOVW CX, -2(DI)(BX*1) MOVB CX, 2(DI)
RET RET
move_5through7: move_5through7:
MOVL (SI), AX MOVL (SI), AX
......
...@@ -6,6 +6,7 @@ package runtime_test ...@@ -6,6 +6,7 @@ package runtime_test
import ( import (
"crypto/rand" "crypto/rand"
"encoding/binary"
"fmt" "fmt"
"internal/race" "internal/race"
. "runtime" . "runtime"
...@@ -447,3 +448,22 @@ func BenchmarkCopyFat1024(b *testing.B) { ...@@ -447,3 +448,22 @@ func BenchmarkCopyFat1024(b *testing.B) {
_ = y _ = y
} }
} }
// BenchmarkIssue18740 times a 4-byte copy that is immediately followed by a
// single 4-byte load of the copy's destination.
func BenchmarkIssue18740(b *testing.B) {
// This tests that memmove uses one 4-byte load/store to move 4 bytes.
// It used to do 2 2-byte load/stores, which leads to a pipeline stall
// when we try to read the result with one 4-byte load.
var buf [4]byte
for j := 0; j < b.N; j++ {
s := uint32(0)
// Walk g in 4-byte steps: 1024 copy+load pairs per benchmark iteration.
for i := 0; i < 4096; i += 4 {
// buf holds 4 bytes, so copy moves exactly 4 bytes starting at g[i].
copy(buf[:], g[i:])
// Read the just-written bytes back as one 32-bit value.
s += binary.LittleEndian.Uint32(buf[:])
}
// Publish the sum through sink (declared outside this view); presumably
// this keeps the compiler from discarding the loop — TODO confirm.
sink = uint64(s)
}
}
// TODO: 2 byte and 8 byte benchmarks also.
//
// g is the 4096-byte source array that BenchmarkIssue18740 copies from.
// Nothing in the visible code writes to it.
var g [4096]byte
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment