Commit 2421c6e3, authored by Ilya Tocar, committed by Keith Randall

runtime: optimize duffzero for amd64.

Use MOVUPS to zero 16 bytes at a time.

results (haswell):

name             old time/op  new time/op  delta
ClearFat8-48     0.62ns ± 2%  0.62ns ± 1%     ~     (p=0.085 n=20+15)
ClearFat12-48    0.93ns ± 2%  0.93ns ± 2%     ~     (p=0.757 n=19+19)
ClearFat16-48    1.23ns ± 1%  1.23ns ± 1%     ~     (p=0.896 n=19+17)
ClearFat24-48    1.85ns ± 2%  1.84ns ± 0%   -0.51%  (p=0.023 n=20+15)
ClearFat32-48    2.45ns ± 0%  2.46ns ± 2%     ~     (p=0.053 n=17+18)
ClearFat40-48    1.99ns ± 0%  0.92ns ± 2%  -53.54%  (p=0.000 n=19+20)
ClearFat48-48    2.15ns ± 1%  0.92ns ± 2%  -56.93%  (p=0.000 n=19+20)
ClearFat56-48    2.46ns ± 1%  1.23ns ± 0%  -49.98%  (p=0.000 n=19+14)
ClearFat64-48    2.76ns ± 0%  2.14ns ± 1%  -22.21%  (p=0.000 n=17+17)
ClearFat128-48   5.21ns ± 0%  3.99ns ± 0%  -23.46%  (p=0.000 n=17+19)
ClearFat256-48   10.3ns ± 4%   7.7ns ± 0%  -25.37%  (p=0.000 n=20+17)
ClearFat512-48   20.2ns ± 4%  15.0ns ± 1%  -25.58%  (p=0.000 n=20+17)
ClearFat1024-48  39.7ns ± 2%  29.7ns ± 0%  -25.05%  (p=0.000 n=19+19)

Change-Id: I200401eec971b2dd2450c0651c51e378bd982405
Reviewed-on: https://go-review.googlesource.com/14408
Reviewed-by: Keith Randall <khr@golang.org>
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 2027b00e
This diff is collapsed.
...@@ -135,6 +135,7 @@ var progtable = [x86.ALAST]obj.ProgInfo{ ...@@ -135,6 +135,7 @@ var progtable = [x86.ALAST]obj.ProgInfo{
x86.AMOVL: {Flags: gc.SizeL | gc.LeftRead | gc.RightWrite | gc.Move}, x86.AMOVL: {Flags: gc.SizeL | gc.LeftRead | gc.RightWrite | gc.Move},
x86.AMOVQ: {Flags: gc.SizeQ | gc.LeftRead | gc.RightWrite | gc.Move}, x86.AMOVQ: {Flags: gc.SizeQ | gc.LeftRead | gc.RightWrite | gc.Move},
x86.AMOVW: {Flags: gc.SizeW | gc.LeftRead | gc.RightWrite | gc.Move}, x86.AMOVW: {Flags: gc.SizeW | gc.LeftRead | gc.RightWrite | gc.Move},
x86.AMOVUPS: {Flags: gc.LeftRead | gc.RightWrite | gc.Move},
x86.AMOVSB: {Flags: gc.OK, Reguse: DI | SI, Regset: DI | SI}, x86.AMOVSB: {Flags: gc.OK, Reguse: DI | SI, Regset: DI | SI},
x86.AMOVSL: {Flags: gc.OK, Reguse: DI | SI, Regset: DI | SI}, x86.AMOVSL: {Flags: gc.OK, Reguse: DI | SI, Regset: DI | SI},
x86.AMOVSQ: {Flags: gc.OK, Reguse: DI | SI, Regset: DI | SI}, x86.AMOVSQ: {Flags: gc.OK, Reguse: DI | SI, Regset: DI | SI},
...@@ -246,6 +247,7 @@ var progtable = [x86.ALAST]obj.ProgInfo{ ...@@ -246,6 +247,7 @@ var progtable = [x86.ALAST]obj.ProgInfo{
x86.AXORL: {Flags: gc.SizeL | gc.LeftRead | RightRdwr | gc.SetCarry}, x86.AXORL: {Flags: gc.SizeL | gc.LeftRead | RightRdwr | gc.SetCarry},
x86.AXORQ: {Flags: gc.SizeQ | gc.LeftRead | RightRdwr | gc.SetCarry}, x86.AXORQ: {Flags: gc.SizeQ | gc.LeftRead | RightRdwr | gc.SetCarry},
x86.AXORW: {Flags: gc.SizeW | gc.LeftRead | RightRdwr | gc.SetCarry}, x86.AXORW: {Flags: gc.SizeW | gc.LeftRead | RightRdwr | gc.SetCarry},
x86.AXORPS: {Flags: gc.LeftRead | RightRdwr},
} }
func progflags(p *obj.Prog) uint32 { func progflags(p *obj.Prog) uint32 {
......
...@@ -5,196 +5,102 @@ ...@@ -5,196 +5,102 @@
#include "textflag.h" #include "textflag.h"
TEXT runtime·duffzero(SB), NOSPLIT, $0-0 TEXT runtime·duffzero(SB), NOSPLIT, $0-0
MOVQ AX,(DI) MOVUPS X0,(DI)
MOVQ AX,8(DI) MOVUPS X0,16(DI)
MOVQ AX,16(DI) MOVUPS X0,32(DI)
MOVQ AX,24(DI) MOVUPS X0,48(DI)
ADDQ $32,DI ADDQ $64,DI
MOVQ AX,(DI) MOVUPS X0,(DI)
MOVQ AX,8(DI) MOVUPS X0,16(DI)
MOVQ AX,16(DI) MOVUPS X0,32(DI)
MOVQ AX,24(DI) MOVUPS X0,48(DI)
ADDQ $32,DI ADDQ $64,DI
MOVQ AX,(DI) MOVUPS X0,(DI)
MOVQ AX,8(DI) MOVUPS X0,16(DI)
MOVQ AX,16(DI) MOVUPS X0,32(DI)
MOVQ AX,24(DI) MOVUPS X0,48(DI)
ADDQ $32,DI ADDQ $64,DI
MOVQ AX,(DI) MOVUPS X0,(DI)
MOVQ AX,8(DI) MOVUPS X0,16(DI)
MOVQ AX,16(DI) MOVUPS X0,32(DI)
MOVQ AX,24(DI) MOVUPS X0,48(DI)
ADDQ $32,DI ADDQ $64,DI
MOVQ AX,(DI) MOVUPS X0,(DI)
MOVQ AX,8(DI) MOVUPS X0,16(DI)
MOVQ AX,16(DI) MOVUPS X0,32(DI)
MOVQ AX,24(DI) MOVUPS X0,48(DI)
ADDQ $32,DI ADDQ $64,DI
MOVQ AX,(DI) MOVUPS X0,(DI)
MOVQ AX,8(DI) MOVUPS X0,16(DI)
MOVQ AX,16(DI) MOVUPS X0,32(DI)
MOVQ AX,24(DI) MOVUPS X0,48(DI)
ADDQ $32,DI ADDQ $64,DI
MOVQ AX,(DI) MOVUPS X0,(DI)
MOVQ AX,8(DI) MOVUPS X0,16(DI)
MOVQ AX,16(DI) MOVUPS X0,32(DI)
MOVQ AX,24(DI) MOVUPS X0,48(DI)
ADDQ $32,DI ADDQ $64,DI
MOVQ AX,(DI) MOVUPS X0,(DI)
MOVQ AX,8(DI) MOVUPS X0,16(DI)
MOVQ AX,16(DI) MOVUPS X0,32(DI)
MOVQ AX,24(DI) MOVUPS X0,48(DI)
ADDQ $32,DI ADDQ $64,DI
MOVQ AX,(DI) MOVUPS X0,(DI)
MOVQ AX,8(DI) MOVUPS X0,16(DI)
MOVQ AX,16(DI) MOVUPS X0,32(DI)
MOVQ AX,24(DI) MOVUPS X0,48(DI)
ADDQ $32,DI ADDQ $64,DI
MOVQ AX,(DI) MOVUPS X0,(DI)
MOVQ AX,8(DI) MOVUPS X0,16(DI)
MOVQ AX,16(DI) MOVUPS X0,32(DI)
MOVQ AX,24(DI) MOVUPS X0,48(DI)
ADDQ $32,DI ADDQ $64,DI
MOVQ AX,(DI) MOVUPS X0,(DI)
MOVQ AX,8(DI) MOVUPS X0,16(DI)
MOVQ AX,16(DI) MOVUPS X0,32(DI)
MOVQ AX,24(DI) MOVUPS X0,48(DI)
ADDQ $32,DI ADDQ $64,DI
MOVQ AX,(DI) MOVUPS X0,(DI)
MOVQ AX,8(DI) MOVUPS X0,16(DI)
MOVQ AX,16(DI) MOVUPS X0,32(DI)
MOVQ AX,24(DI) MOVUPS X0,48(DI)
ADDQ $32,DI ADDQ $64,DI
MOVQ AX,(DI) MOVUPS X0,(DI)
MOVQ AX,8(DI) MOVUPS X0,16(DI)
MOVQ AX,16(DI) MOVUPS X0,32(DI)
MOVQ AX,24(DI) MOVUPS X0,48(DI)
ADDQ $32,DI ADDQ $64,DI
MOVQ AX,(DI) MOVUPS X0,(DI)
MOVQ AX,8(DI) MOVUPS X0,16(DI)
MOVQ AX,16(DI) MOVUPS X0,32(DI)
MOVQ AX,24(DI) MOVUPS X0,48(DI)
ADDQ $32,DI ADDQ $64,DI
MOVQ AX,(DI) MOVUPS X0,(DI)
MOVQ AX,8(DI) MOVUPS X0,16(DI)
MOVQ AX,16(DI) MOVUPS X0,32(DI)
MOVQ AX,24(DI) MOVUPS X0,48(DI)
ADDQ $32,DI ADDQ $64,DI
MOVQ AX,(DI) MOVUPS X0,(DI)
MOVQ AX,8(DI) MOVUPS X0,16(DI)
MOVQ AX,16(DI) MOVUPS X0,32(DI)
MOVQ AX,24(DI) MOVUPS X0,48(DI)
ADDQ $32,DI ADDQ $64,DI
MOVQ AX,(DI)
MOVQ AX,8(DI)
MOVQ AX,16(DI)
MOVQ AX,24(DI)
ADDQ $32,DI
MOVQ AX,(DI)
MOVQ AX,8(DI)
MOVQ AX,16(DI)
MOVQ AX,24(DI)
ADDQ $32,DI
MOVQ AX,(DI)
MOVQ AX,8(DI)
MOVQ AX,16(DI)
MOVQ AX,24(DI)
ADDQ $32,DI
MOVQ AX,(DI)
MOVQ AX,8(DI)
MOVQ AX,16(DI)
MOVQ AX,24(DI)
ADDQ $32,DI
MOVQ AX,(DI)
MOVQ AX,8(DI)
MOVQ AX,16(DI)
MOVQ AX,24(DI)
ADDQ $32,DI
MOVQ AX,(DI)
MOVQ AX,8(DI)
MOVQ AX,16(DI)
MOVQ AX,24(DI)
ADDQ $32,DI
MOVQ AX,(DI)
MOVQ AX,8(DI)
MOVQ AX,16(DI)
MOVQ AX,24(DI)
ADDQ $32,DI
MOVQ AX,(DI)
MOVQ AX,8(DI)
MOVQ AX,16(DI)
MOVQ AX,24(DI)
ADDQ $32,DI
MOVQ AX,(DI)
MOVQ AX,8(DI)
MOVQ AX,16(DI)
MOVQ AX,24(DI)
ADDQ $32,DI
MOVQ AX,(DI)
MOVQ AX,8(DI)
MOVQ AX,16(DI)
MOVQ AX,24(DI)
ADDQ $32,DI
MOVQ AX,(DI)
MOVQ AX,8(DI)
MOVQ AX,16(DI)
MOVQ AX,24(DI)
ADDQ $32,DI
MOVQ AX,(DI)
MOVQ AX,8(DI)
MOVQ AX,16(DI)
MOVQ AX,24(DI)
ADDQ $32,DI
MOVQ AX,(DI)
MOVQ AX,8(DI)
MOVQ AX,16(DI)
MOVQ AX,24(DI)
ADDQ $32,DI
MOVQ AX,(DI)
MOVQ AX,8(DI)
MOVQ AX,16(DI)
MOVQ AX,24(DI)
ADDQ $32,DI
MOVQ AX,(DI)
MOVQ AX,8(DI)
MOVQ AX,16(DI)
MOVQ AX,24(DI)
ADDQ $32,DI
STOSQ
STOSQ
STOSQ
STOSQ
RET RET
TEXT runtime·duffcopy(SB), NOSPLIT, $0-0 TEXT runtime·duffcopy(SB), NOSPLIT, $0-0
......
...@@ -60,21 +60,18 @@ func gen(arch string, tags, zero, copy func(io.Writer)) { ...@@ -60,21 +60,18 @@ func gen(arch string, tags, zero, copy func(io.Writer)) {
func notags(w io.Writer) { fmt.Fprintln(w) } func notags(w io.Writer) { fmt.Fprintln(w) }
func zeroAMD64(w io.Writer) { func zeroAMD64(w io.Writer) {
// AX: zero // X0: zero
// DI: ptr to memory to be zeroed // DI: ptr to memory to be zeroed
// DI is updated as a side effect. // DI is updated as a side effect.
fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $0-0") fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT, $0-0")
for i := 0; i < 31; i++ { for i := 0; i < 16; i++ {
fmt.Fprintln(w, "\tMOVQ\tAX,(DI)") fmt.Fprintln(w, "\tMOVUPS\tX0,(DI)")
fmt.Fprintln(w, "\tMOVQ\tAX,8(DI)") fmt.Fprintln(w, "\tMOVUPS\tX0,16(DI)")
fmt.Fprintln(w, "\tMOVQ\tAX,16(DI)") fmt.Fprintln(w, "\tMOVUPS\tX0,32(DI)")
fmt.Fprintln(w, "\tMOVQ\tAX,24(DI)") fmt.Fprintln(w, "\tMOVUPS\tX0,48(DI)")
fmt.Fprintln(w, "\tADDQ\t$32,DI") fmt.Fprintln(w, "\tADDQ\t$64,DI")
fmt.Fprintln(w) fmt.Fprintln(w)
} }
for i := 0; i < 4; i++ {
fmt.Fprintln(w, "\tSTOSQ")
}
fmt.Fprintln(w, "\tRET") fmt.Fprintln(w, "\tRET")
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment