Commit 23bd9191 authored by Lynn Boger

cmd/compile: improve LoweredZero performance for ppc64x

This change improves the performance of the LoweredZero rule
on ppc64x by emitting unrolled doubleword stores and a CTR-based
loop in place of the previous MOVDU/CMP/BLT store-and-compare loop.

The improvement can be seen in the runtime ClearFat
benchmarks (times in ns/op):

benchmark                    old ns/op     new ns/op     delta
BenchmarkClearFat12-16       2.40          0.69          -71.25%
BenchmarkClearFat16-16       9.98          0.93          -90.68%
BenchmarkClearFat24-16       4.75          0.93          -80.42%
BenchmarkClearFat32-16       6.02          0.93          -84.55%
BenchmarkClearFat40-16       7.19          1.16          -83.87%
BenchmarkClearFat48-16       15.0          1.39          -90.73%
BenchmarkClearFat56-16       9.95          1.62          -83.72%
BenchmarkClearFat64-16       18.0          1.86          -89.67%
BenchmarkClearFat128-16      30.0          8.08          -73.07%
BenchmarkClearFat256-16      52.5          11.3          -78.48%
BenchmarkClearFat512-16      97.0          19.0          -80.41%
BenchmarkClearFat1024-16     244           34.2          -85.98%
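For context, a minimal sketch of the kind of Go code these benchmarks exercise (my own illustration; the fat type and clearFat function are hypothetical, not from the runtime benchmark). Assigning the zero value to a fixed-size object compiles to the ClearFat/LoweredZero path when the size is known at compile time:

    package main

    type fat struct {
        b [64]byte // fixed 64-byte object
    }

    //go:noinline
    func clearFat(p *fat) {
        *p = fat{} // compiles to an inline zeroing sequence on ppc64x
    }

    func main() {
        var f fat
        clearFat(&f)
    }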

Fixes: #19532

Change-Id: If493e28bc1d8e61bc79978498be9f5336a36cd3f
Reviewed-on: https://go-review.googlesource.com/38096
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Michael Munday <munday@ca.ibm.com>
parent d972dc2d
src/cmd/compile/internal/ppc64/ssa.go
@@ -831,62 +831,135 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
 		ssaGenISEL(v, ppc64.C_COND_EQ, iselRegs[1], v.Reg())
 	case ssa.OpPPC64LoweredZero:
-		// Similar to how this is done on ARM,
-		// except that PPC MOVDU x,off(y) is *(y+off) = x; y=y+off
-		// not store-and-increment.
-		// Therefore R3 should be dest-align
-		// and arg1 should be dest+size-align
-		// HOWEVER, the input dest address cannot be dest-align because
-		// that does not necessarily address valid memory and it's not
-		// known how that might be optimized. Therefore, correct it in
-		// in the expansion:
-		//
-		//	ADD   -8,R3,R3
-		//	MOVDU R0, 8(R3)
-		//	CMP   R3, Rarg1
-		//	BL    -2(PC)
-		// arg1 is the address of the last element to zero
-		// auxint is alignment
-		var sz int64
-		var movu obj.As
-		switch {
-		case v.AuxInt%8 == 0:
-			sz = 8
-			movu = ppc64.AMOVDU
-		case v.AuxInt%4 == 0:
-			sz = 4
-			movu = ppc64.AMOVWZU // MOVWU instruction not implemented
-		case v.AuxInt%2 == 0:
-			sz = 2
-			movu = ppc64.AMOVHU
-		default:
-			sz = 1
-			movu = ppc64.AMOVBU
-		}
-		p := gc.Prog(ppc64.AADD)
-		p.Reg = v.Args[0].Reg()
-		p.From.Type = obj.TYPE_CONST
-		p.From.Offset = -sz
-		p.To.Type = obj.TYPE_REG
-		p.To.Reg = v.Args[0].Reg()
-		p = gc.Prog(movu)
-		p.From.Type = obj.TYPE_REG
-		p.From.Reg = ppc64.REG_R0
-		p.To.Type = obj.TYPE_MEM
-		p.To.Reg = v.Args[0].Reg()
-		p.To.Offset = sz
-		p2 := gc.Prog(ppc64.ACMPU)
-		p2.From.Type = obj.TYPE_REG
-		p2.From.Reg = v.Args[0].Reg()
-		p2.To.Reg = v.Args[1].Reg()
-		p2.To.Type = obj.TYPE_REG
-		p3 := gc.Prog(ppc64.ABLT)
-		p3.To.Type = obj.TYPE_BRANCH
-		gc.Patch(p3, p)
+		// unaligned data doesn't hurt performance
+		// for these instructions on power8 or later
+
+		// for sizes >= 64 generate a loop as follows:
+
+		// set up loop counter in CTR, used by BC
+		//	MOVD len/32,REG_TMP
+		//	MOVD REG_TMP,CTR
+		// loop:
+		//	MOVD R0,(R3)
+		//	MOVD R0,8(R3)
+		//	MOVD R0,16(R3)
+		//	MOVD R0,24(R3)
+		//	ADD  $32,R3
+		//	BC   16, 0, loop
+		//
+		// any remainder is done as described below
+
+		// for sizes < 64 bytes, first clear as many doublewords as possible,
+		// then handle the remainder
+		//	MOVD R0,(R3)
+		//	MOVD R0,8(R3)
+		//	.... etc.
+		//
+		// the remainder bytes are cleared using one or more
+		// of the following instructions with the appropriate
+		// offsets depending which instructions are needed
+		//
+		//	MOVW R0,n1(R3)	4 bytes
+		//	MOVH R0,n2(R3)	2 bytes
+		//	MOVB R0,n3(R3)	1 byte
+		//
+		// 7 bytes: MOVW, MOVH, MOVB
+		// 6 bytes: MOVW, MOVH
+		// 5 bytes: MOVW, MOVB
+		// 3 bytes: MOVH, MOVB
+
+		// each loop iteration does 32 bytes
+		ctr := v.AuxInt / 32
+
+		// remainder bytes
+		rem := v.AuxInt % 32
+
+		// only generate a loop if there is more
+		// than 1 iteration.
+		if ctr > 1 {
+			// Set up CTR loop counter
+			p := gc.Prog(ppc64.AMOVD)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = ctr
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REGTMP
+
+			p = gc.Prog(ppc64.AMOVD)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REGTMP
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = ppc64.REG_CTR
+
+			// generate 4 MOVDs
+			// when this is a loop then the top must be saved
+			var top *obj.Prog
+			for offset := int64(0); offset < 32; offset += 8 {
+				// This is the top of loop
+				p := gc.Prog(ppc64.AMOVD)
+				p.From.Type = obj.TYPE_REG
+				p.From.Reg = ppc64.REG_R0
+				p.To.Type = obj.TYPE_MEM
+				p.To.Reg = v.Args[0].Reg()
+				p.To.Offset = offset
+				// Save the top of loop
+				if top == nil {
+					top = p
+				}
+			}
+
+			// Increment address for the
+			// 4 doublewords just zeroed.
+			p = gc.Prog(ppc64.AADD)
+			p.Reg = v.Args[0].Reg()
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = 32
+			p.To.Type = obj.TYPE_REG
+			p.To.Reg = v.Args[0].Reg()
+
+			// Branch back to top of loop
+			// based on CTR
+			// BC with BO_BCTR generates bdnz
+			p = gc.Prog(ppc64.ABC)
+			p.From.Type = obj.TYPE_CONST
+			p.From.Offset = ppc64.BO_BCTR
+			p.Reg = ppc64.REG_R0
+			p.To.Type = obj.TYPE_BRANCH
+			gc.Patch(p, top)
+		}
+
+		// when ctr == 1 the loop was not generated but
+		// there are at least 32 bytes to clear, so add
+		// that to the remainder to generate the code
+		// to clear those doublewords
+		if ctr == 1 {
+			rem += 32
+		}
+
+		// clear the remainder starting at offset zero
+		offset := int64(0)
+
+		// first clear as many doublewords as possible
+		// then clear remaining sizes as available
+		for rem > 0 {
+			op, size := ppc64.AMOVB, int64(1)
+			switch {
+			case rem >= 8:
+				op, size = ppc64.AMOVD, 8
+			case rem >= 4:
+				op, size = ppc64.AMOVW, 4
+			case rem >= 2:
+				op, size = ppc64.AMOVH, 2
+			}
+			p := gc.Prog(op)
+			p.From.Type = obj.TYPE_REG
+			p.From.Reg = ppc64.REG_R0
+			p.To.Type = obj.TYPE_MEM
+			p.To.Reg = v.Args[0].Reg()
+			p.To.Offset = offset
+			rem -= size
+			offset += size
+		}
 	case ssa.OpPPC64LoweredMove:
 		// Similar to how this is done on ARM,
......
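To make the size decomposition above concrete, here is a small standalone Go sketch (my own, not part of the commit; plan is a hypothetical name) that mirrors the ctr/rem logic of the new lowering and prints the store sequence it would pick for a given size:

    package main

    import "fmt"

    // plan mirrors the new lowering: 32-byte loop iterations counted
    // in CTR, then a greedy doubleword/word/halfword/byte remainder.
    func plan(size int64) {
        ctr := size / 32
        rem := size % 32
        if ctr > 1 {
            fmt.Printf("loop: %d iterations of 4 x MOVD (32 bytes each)\n", ctr)
        } else if ctr == 1 {
            // a single iteration is emitted inline rather than as a loop
            rem += 32
        }
        // clear the remainder starting at offset zero
        for offset := int64(0); rem > 0; {
            op, sz := "MOVB", int64(1)
            switch {
            case rem >= 8:
                op, sz = "MOVD", 8
            case rem >= 4:
                op, sz = "MOVW", 4
            case rem >= 2:
                op, sz = "MOVH", 2
            }
            fmt.Printf("%s R0,%d(R3)\n", op, offset)
            rem -= sz
            offset += sz
        }
    }

    func main() {
        plan(72) // a 2-iteration loop, then one MOVD for the last 8 bytes
    }

For a 72-byte zero this prints a 2-iteration loop followed by a single MOVD at offset 0, matching the code above (the remainder offsets restart at zero because the loop advances R3 as it goes).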
src/cmd/compile/internal/ssa/gen/PPC64.rules
@@ -485,60 +485,73 @@
 (Store {t} ptr val mem) && t.(Type).Size() == 2 -> (MOVHstore ptr val mem)
 (Store {t} ptr val mem) && t.(Type).Size() == 1 -> (MOVBstore ptr val mem)
 
+// Using Zero instead of LoweredZero allows the
+// target address to be folded where possible.
 (Zero [0] _ mem) -> mem
 (Zero [1] destptr mem) -> (MOVBstorezero destptr mem)
-(Zero [2] {t} destptr mem) && t.(Type).Alignment()%2 == 0 ->
-	(MOVHstorezero destptr mem)
 (Zero [2] destptr mem) ->
-	(MOVBstorezero [1] destptr
-		(MOVBstorezero [0] destptr mem))
-(Zero [4] {t} destptr mem) && t.(Type).Alignment()%4 == 0 ->
-	(MOVWstorezero destptr mem)
-(Zero [4] {t} destptr mem) && t.(Type).Alignment()%2 == 0 ->
-	(MOVHstorezero [2] destptr
-		(MOVHstorezero [0] destptr mem))
-(Zero [4] destptr mem) ->
-	(MOVBstorezero [3] destptr
-		(MOVBstorezero [2] destptr
-			(MOVBstorezero [1] destptr
-				(MOVBstorezero [0] destptr mem))))
-(Zero [8] {t} destptr mem) && t.(Type).Alignment()%8 == 0 ->
-	(MOVDstorezero [0] destptr mem)
-(Zero [8] {t} destptr mem) && t.(Type).Alignment()%4 == 0 ->
-	(MOVWstorezero [4] destptr
-		(MOVWstorezero [0] destptr mem))
-(Zero [8] {t} destptr mem) && t.(Type).Alignment()%2 == 0 ->
-	(MOVHstorezero [6] destptr
-		(MOVHstorezero [4] destptr
-			(MOVHstorezero [2] destptr
-				(MOVHstorezero [0] destptr mem))))
+	(MOVHstorezero destptr mem)
 (Zero [3] destptr mem) ->
 	(MOVBstorezero [2] destptr
-		(MOVBstorezero [1] destptr
-			(MOVBstorezero [0] destptr mem)))
+		(MOVHstorezero destptr mem))
+(Zero [4] destptr mem) ->
+	(MOVWstorezero destptr mem)
+(Zero [5] destptr mem) ->
+	(MOVBstorezero [4] destptr
+		(MOVWstorezero destptr mem))
+(Zero [6] destptr mem) ->
+	(MOVHstorezero [4] destptr
+		(MOVWstorezero destptr mem))
+(Zero [7] destptr mem) ->
+	(MOVBstorezero [6] destptr
+		(MOVHstorezero [4] destptr
+			(MOVWstorezero destptr mem)))
+(Zero [8] destptr mem) ->
+	(MOVDstorezero destptr mem)
 
 // Zero small numbers of words directly.
-(Zero [16] {t} destptr mem) && t.(Type).Alignment()%8 == 0 ->
+(Zero [12] destptr mem) ->
+	(MOVWstorezero [8] destptr
+		(MOVDstorezero [0] destptr mem))
+(Zero [16] destptr mem) ->
 	(MOVDstorezero [8] destptr
 		(MOVDstorezero [0] destptr mem))
-(Zero [24] {t} destptr mem) && t.(Type).Alignment()%8 == 0 ->
+(Zero [24] destptr mem) ->
 	(MOVDstorezero [16] destptr
 		(MOVDstorezero [8] destptr
 			(MOVDstorezero [0] destptr mem)))
-(Zero [32] {t} destptr mem) && t.(Type).Alignment()%8 == 0 ->
+(Zero [32] destptr mem) ->
 	(MOVDstorezero [24] destptr
 		(MOVDstorezero [16] destptr
 			(MOVDstorezero [8] destptr
 				(MOVDstorezero [0] destptr mem))))
-
-// Large zeroing uses a loop
-(Zero [s] {t} ptr mem)
-	&& (s > 512 || config.noDuffDevice) || t.(Type).Alignment()%8 != 0 ->
-	(LoweredZero [t.(Type).Alignment()]
-		ptr
-		(ADDconst <ptr.Type> ptr [s-moveSize(t.(Type).Alignment(), config)])
-		mem)
+(Zero [40] destptr mem) ->
+	(MOVDstorezero [32] destptr
+		(MOVDstorezero [24] destptr
+			(MOVDstorezero [16] destptr
+				(MOVDstorezero [8] destptr
+					(MOVDstorezero [0] destptr mem)))))
+(Zero [48] destptr mem) ->
+	(MOVDstorezero [40] destptr
+		(MOVDstorezero [32] destptr
+			(MOVDstorezero [24] destptr
+				(MOVDstorezero [16] destptr
+					(MOVDstorezero [8] destptr
+						(MOVDstorezero [0] destptr mem))))))
+(Zero [56] destptr mem) ->
+	(MOVDstorezero [48] destptr
+		(MOVDstorezero [40] destptr
+			(MOVDstorezero [32] destptr
+				(MOVDstorezero [24] destptr
+					(MOVDstorezero [16] destptr
+						(MOVDstorezero [8] destptr
+							(MOVDstorezero [0] destptr mem)))))))
+
+// Handle cases not handled above
+(Zero [s] ptr mem) -> (LoweredZero [s] ptr mem)
 
 // moves
 (Move [0] _ _ mem) -> mem
......
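A hedged illustration (mine, not from the commit; block and reset are hypothetical names): under the new rules, zeroing a 40-byte value should match the (Zero [40] ...) case above and expand to five doubleword stores instead of a LoweredZero loop. The generated assembly can be inspected with go build -gcflags=-S on a ppc64x target.

    package main

    type block struct {
        a [5]uint64 // 40 bytes, so (Zero [40] ...) should match
    }

    //go:noinline
    func reset(b *block) {
        // expected: five MOVD R0,n(R3) stores rather than a loop
        *b = block{}
    }

    func main() {
        var b block
        reset(&b)
    }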
src/cmd/compile/internal/ssa/gen/PPC64Ops.go
@@ -312,19 +312,37 @@ func init() {
 		// large or unaligned zeroing
 		// arg0 = address of memory to zero (in R3, changed as side effect)
-		// arg1 = address of the last element to zero
-		// arg2 = mem
 		// returns mem
-		//	ADD	-8,R3,R3 // intermediate value not valid GC ptr, cannot expose to opt+GC
-		//	MOVDU	R0, 8(R3)
-		//	CMP	R3, Rarg1
-		//	BLE	-2(PC)
+		//
+		// a loop is generated when there is more than one iteration
+		// needed to clear 4 doublewords
+		//
+		//	MOVD	$len/32,R31
+		//	MOVD	R31,CTR
+		//	loop:
+		//	MOVD	R0,(R3)
+		//	MOVD	R0,8(R3)
+		//	MOVD	R0,16(R3)
+		//	MOVD	R0,24(R3)
+		//	ADD	R3,32
+		//	BC	loop
+		// remaining doubleword clears generated as needed
+		//	MOVD	R0,(R3)
+		//	MOVD	R0,8(R3)
+		//	MOVD	R0,16(R3)
+		//	MOVD	R0,24(R3)
+		// one or more of these to clear remainder < 8 bytes
+		//	MOVW	R0,n1(R3)
+		//	MOVH	R0,n2(R3)
+		//	MOVB	R0,n3(R3)
 		{
 			name:      "LoweredZero",
 			aux:       "Int64",
-			argLength: 3,
+			argLength: 2,
 			reg: regInfo{
-				inputs:   []regMask{buildReg("R3"), gp},
+				inputs:   []regMask{buildReg("R3")},
 				clobbers: buildReg("R3"),
 			},
 			clobberFlags: true,
......
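A note for readers: the opGen.go hunk below is generated output, not hand-edited code. The operation is defined in PPC64Ops.go above, and the opcode table is then regenerated (conventionally by running the generator package under cmd/compile/internal/ssa/gen), which is why the same argLen and register-input change appears in both files.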
src/cmd/compile/internal/ssa/opGen.go
@@ -17368,13 +17368,12 @@ var opcodeTable = [...]opInfo{
 	{
 		name:           "LoweredZero",
 		auxType:        auxInt64,
-		argLen:         3,
+		argLen:         2,
 		clobberFlags:   true,
 		faultOnNilArg0: true,
 		reg: regInfo{
 			inputs: []inputInfo{
 				{0, 8}, // R3
-				{1, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
 			},
 			clobbers: 8, // R3
 		},
......