Commit a143f5d6, authored by Philip Hofer, committed by Josh Bleecher Snyder

cmd/internal/obj/arm: improve static branch prediction for wrapper prologue

This is a follow-up to CL 36893.

Move the unlikely branch in the wrapper prologue to the end
of the function, where it has minimal impact on the instruction
cache. Static branch prediction also typically assumes that a
forward branch is not taken, which matches this rarely taken path.

Updates #19042

sort benchmarks:
name                  old time/op  new time/op  delta
SearchWrappers-4      1.44µs ± 0%  1.45µs ± 0%  +1.15%  (p=0.000 n=9+10)
SortString1K-4        1.02ms ± 0%  1.04ms ± 0%  +2.39%  (p=0.000 n=10+10)
SortString1K_Slice-4   960µs ± 0%   989µs ± 0%  +2.95%  (p=0.000 n=9+10)
StableString1K-4       218µs ± 0%   213µs ± 0%  -2.13%  (p=0.000 n=10+10)
SortInt1K-4            541µs ± 0%   543µs ± 0%  +0.30%  (p=0.003 n=9+10)
StableInt1K-4          760µs ± 1%   763µs ± 1%  +0.38%  (p=0.011 n=10+10)
StableInt1K_Slice-4    840µs ± 1%   779µs ± 0%  -7.31%  (p=0.000 n=9+10)
SortInt64K-4          55.2ms ± 0%  55.4ms ± 1%  +0.34%  (p=0.012 n=10+8)
SortInt64K_Slice-4    56.2ms ± 0%  55.6ms ± 1%  -1.16%  (p=0.000 n=10+10)
StableInt64K-4        70.9ms ± 1%  71.0ms ± 0%    ~     (p=0.315 n=10+7)
Sort1e2-4              250µs ± 0%   249µs ± 1%    ~     (p=0.315 n=9+10)
Stable1e2-4            600µs ± 0%   594µs ± 0%  -1.09%  (p=0.000 n=9+10)
Sort1e4-4             51.2ms ± 0%  51.4ms ± 1%  +0.40%  (p=0.001 n=9+10)
Stable1e4-4            204ms ± 1%   199ms ± 1%  -2.27%  (p=0.000 n=10+10)
Sort1e6-4              8.42s ± 0%   8.44s ± 0%  +0.28%  (p=0.000 n=8+9)
Stable1e6-4            43.3s ± 0%   42.5s ± 1%  -1.89%  (p=0.000 n=9+9)

Change-Id: I827559aa557fdba211a38ce3f77137b471c5c67e
Reviewed-on: https://go-review.googlesource.com/37611
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
Reviewed-by: Josh Bleecher Snyder <josharian@gmail.com>
parent a2cc8b20
...@@ -341,8 +341,6 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) { ...@@ -341,8 +341,6 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
q = p q = p
} }
var p1 *obj.Prog
var p2 *obj.Prog
var q2 *obj.Prog var q2 *obj.Prog
for p := cursym.Text; p != nil; p = p.Link { for p := cursym.Text; p != nil; p = p.Link {
o := p.As o := p.As
...@@ -391,22 +389,24 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) { ...@@ -391,22 +389,24 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
// if(g->panic != nil && g->panic->argp == FP) g->panic->argp = bottom-of-frame // if(g->panic != nil && g->panic->argp == FP) g->panic->argp = bottom-of-frame
// //
// MOVW g_panic(g), R1 // MOVW g_panic(g), R1
// CMP $0, R1 // CMP $0, R1
// B.EQ end // B.NE checkargp
// end:
// NOP
// ... function ...
// checkargp:
// MOVW panic_argp(R1), R2 // MOVW panic_argp(R1), R2
// ADD $(autosize+4), R13, R3 // ADD $(autosize+4), R13, R3
// CMP R2, R3 // CMP R2, R3
// B.NE end // B.NE end
// ADD $4, R13, R4 // ADD $4, R13, R4
// MOVW R4, panic_argp(R1) // MOVW R4, panic_argp(R1)
// end: // B end
// NOP
// //
// The NOP is needed to give the jumps somewhere to land. // The NOP is needed to give the jumps somewhere to land.
// It is a liblink NOP, not an ARM NOP: it encodes to 0 instruction bytes. // It is a liblink NOP, not an ARM NOP: it encodes to 0 instruction bytes.
p = obj.Appendp(ctxt, p) p = obj.Appendp(ctxt, p)
p.As = AMOVW p.As = AMOVW
p.From.Type = obj.TYPE_MEM p.From.Type = obj.TYPE_MEM
p.From.Reg = REGG p.From.Reg = REGG
...@@ -420,20 +420,34 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) { ...@@ -420,20 +420,34 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
p.From.Offset = 0 p.From.Offset = 0
p.Reg = REG_R1 p.Reg = REG_R1
p = obj.Appendp(ctxt, p) // B.NE checkargp
p.As = ABEQ bne := obj.Appendp(ctxt, p)
p.To.Type = obj.TYPE_BRANCH bne.As = ABNE
p1 = p bne.To.Type = obj.TYPE_BRANCH
p = obj.Appendp(ctxt, p) // end: NOP
p.As = AMOVW end := obj.Appendp(ctxt, bne)
p.From.Type = obj.TYPE_MEM end.As = obj.ANOP
p.From.Reg = REG_R1
p.From.Offset = 0 // Panic.argp
p.To.Type = obj.TYPE_REG
p.To.Reg = REG_R2
p = obj.Appendp(ctxt, p) // find end of function
var last *obj.Prog
for last = end; last.Link != nil; last = last.Link {
}
// MOVW panic_argp(R1), R2
mov := obj.Appendp(ctxt, last)
mov.As = AMOVW
mov.From.Type = obj.TYPE_MEM
mov.From.Reg = REG_R1
mov.From.Offset = 0 // Panic.argp
mov.To.Type = obj.TYPE_REG
mov.To.Reg = REG_R2
// B.NE branch target is MOVW above
bne.Pcond = mov
// ADD $(autosize+4), R13, R3
p = obj.Appendp(ctxt, mov)
p.As = AADD p.As = AADD
p.From.Type = obj.TYPE_CONST p.From.Type = obj.TYPE_CONST
p.From.Offset = int64(autosize) + 4 p.From.Offset = int64(autosize) + 4
...@@ -441,17 +455,20 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) { ...@@ -441,17 +455,20 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
p.To.Type = obj.TYPE_REG p.To.Type = obj.TYPE_REG
p.To.Reg = REG_R3 p.To.Reg = REG_R3
// CMP R2, R3
p = obj.Appendp(ctxt, p) p = obj.Appendp(ctxt, p)
p.As = ACMP p.As = ACMP
p.From.Type = obj.TYPE_REG p.From.Type = obj.TYPE_REG
p.From.Reg = REG_R2 p.From.Reg = REG_R2
p.Reg = REG_R3 p.Reg = REG_R3
// B.NE end
p = obj.Appendp(ctxt, p) p = obj.Appendp(ctxt, p)
p.As = ABNE p.As = ABNE
p.To.Type = obj.TYPE_BRANCH p.To.Type = obj.TYPE_BRANCH
p2 = p p.Pcond = end
// ADD $4, R13, R4
p = obj.Appendp(ctxt, p) p = obj.Appendp(ctxt, p)
p.As = AADD p.As = AADD
p.From.Type = obj.TYPE_CONST p.From.Type = obj.TYPE_CONST
...@@ -460,6 +477,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) { ...@@ -460,6 +477,7 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
p.To.Type = obj.TYPE_REG p.To.Type = obj.TYPE_REG
p.To.Reg = REG_R4 p.To.Reg = REG_R4
// MOVW R4, panic_argp(R1)
p = obj.Appendp(ctxt, p) p = obj.Appendp(ctxt, p)
p.As = AMOVW p.As = AMOVW
p.From.Type = obj.TYPE_REG p.From.Type = obj.TYPE_REG
...@@ -468,11 +486,14 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) { ...@@ -468,11 +486,14 @@ func preprocess(ctxt *obj.Link, cursym *obj.LSym) {
p.To.Reg = REG_R1 p.To.Reg = REG_R1
p.To.Offset = 0 // Panic.argp p.To.Offset = 0 // Panic.argp
// B end
p = obj.Appendp(ctxt, p) p = obj.Appendp(ctxt, p)
p.As = AB
p.To.Type = obj.TYPE_BRANCH
p.Pcond = end
p.As = obj.ANOP // reset for subsequent passes
p1.Pcond = p p = end
p2.Pcond = p
} }
case obj.ARET: case obj.ARET:
...@@ -702,7 +723,7 @@ func softfloat(ctxt *obj.Link, cursym *obj.LSym) { ...@@ -702,7 +723,7 @@ func softfloat(ctxt *obj.Link, cursym *obj.LSym) {
} }
func stacksplit(ctxt *obj.Link, p *obj.Prog, framesize int32) *obj.Prog { func stacksplit(ctxt *obj.Link, p *obj.Prog, framesize int32) *obj.Prog {
// MOVW g_stackguard(g), R1 // MOVW g_stackguard(g), R1
p = obj.Appendp(ctxt, p) p = obj.Appendp(ctxt, p)
p.As = AMOVW p.As = AMOVW
...@@ -748,11 +769,11 @@ func stacksplit(ctxt *obj.Link, p *obj.Prog, framesize int32) *obj.Prog { ...@@ -748,11 +769,11 @@ func stacksplit(ctxt *obj.Link, p *obj.Prog, framesize int32) *obj.Prog {
// SP-stackguard+StackGuard < framesize + (StackGuard-StackSmall) // SP-stackguard+StackGuard < framesize + (StackGuard-StackSmall)
// The +StackGuard on both sides is required to keep the left side positive: // The +StackGuard on both sides is required to keep the left side positive:
// SP is allowed to be slightly below stackguard. See stack.h. // SP is allowed to be slightly below stackguard. See stack.h.
// CMP $StackPreempt, R1 // CMP $StackPreempt, R1
// MOVW.NE $StackGuard(SP), R2 // MOVW.NE $StackGuard(SP), R2
// SUB.NE R1, R2 // SUB.NE R1, R2
// MOVW.NE $(framesize+(StackGuard-StackSmall)), R3 // MOVW.NE $(framesize+(StackGuard-StackSmall)), R3
// CMP.NE R3, R2 // CMP.NE R3, R2
p = obj.Appendp(ctxt, p) p = obj.Appendp(ctxt, p)
p.As = ACMP p.As = ACMP
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment