Commit 8304d107 authored by Lynn Boger

cmd/compile: ppc64x intrinsics for math/bits

This adds math/bits intrinsics for OnesCount, Len, TrailingZeros on
ppc64x.

benchmark                       old ns/op     new ns/op     delta
BenchmarkLeadingZeros-16        4.26          1.71          -59.86%
BenchmarkLeadingZeros16-16      3.04          1.83          -39.80%
BenchmarkLeadingZeros32-16      3.31          1.82          -45.02%
BenchmarkLeadingZeros64-16      3.69          1.71          -53.66%
BenchmarkTrailingZeros-16       2.55          1.62          -36.47%
BenchmarkTrailingZeros32-16     2.55          1.77          -30.59%
BenchmarkTrailingZeros64-16     2.78          1.62          -41.73%
BenchmarkOnesCount-16           3.19          0.93          -70.85%
BenchmarkOnesCount32-16         2.55          1.18          -53.73%
BenchmarkOnesCount64-16         3.22          0.93          -71.12%
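
For context, a minimal sketch (not part of the change) of the math/bits calls this commit intrinsifies; with the rules added below, each call should lower to a short ppc64x instruction sequence, which can be confirmed by inspecting the generated assembly (e.g. with go tool compile -S):

// Illustrative only: exercises the newly intrinsified math/bits calls.
package main

import (
	"fmt"
	"math/bits"
)

func main() {
	var x uint64 = 0xf0
	fmt.Println(bits.OnesCount64(x))            // 4  - population count (POPCNTD)
	fmt.Println(bits.Len64(x))                  // 8  - bit length (64 - count leading zeros)
	fmt.Println(bits.TrailingZeros64(x))        // 4  - ctz, via popcount of (x-1) &^ x
	fmt.Println(bits.LeadingZeros32(uint32(x))) // 24 - leading zeros, handled through Len
}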

Update #18616

I also made a change to bits_test.go because, when debugging some failures,
the output was not reporting the arguments as they were actually passed to
the functions under test.

Change-Id: Ia58d31d1777cf4582a4505f85b11a1202ca07d3e
Reviewed-on: https://go-review.googlesource.com/41630
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
Reviewed-by: Keith Randall <khr@golang.org>
parent a4864094
......@@ -2730,12 +2730,12 @@ func init() {
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz64, types.Types[TINT], args[0])
},
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
addF("math/bits", "TrailingZeros32",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpCtz32, types.Types[TINT], args[0])
},
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
addF("math/bits", "TrailingZeros16",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
x := s.newValue1(ssa.OpZeroExt16to32, types.Types[TUINT32], args[0])
......@@ -2776,7 +2776,7 @@ func init() {
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpBitLen64, types.Types[TINT], args[0])
},
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
addF("math/bits", "Len32",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
if s.config.PtrSize == 4 {
......@@ -2785,7 +2785,7 @@ func init() {
x := s.newValue1(ssa.OpZeroExt32to64, types.Types[TUINT64], args[0])
return s.newValue1(ssa.OpBitLen64, types.Types[TINT], x)
},
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
addF("math/bits", "Len16",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
if s.config.PtrSize == 4 {
......@@ -2795,7 +2795,7 @@ func init() {
x := s.newValue1(ssa.OpZeroExt16to64, types.Types[TUINT64], args[0])
return s.newValue1(ssa.OpBitLen64, types.Types[TINT], x)
},
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
// Note: disabled on AMD64 because the Go code is faster!
addF("math/bits", "Len8",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
......@@ -2806,7 +2806,7 @@ func init() {
x := s.newValue1(ssa.OpZeroExt8to64, types.Types[TUINT64], args[0])
return s.newValue1(ssa.OpBitLen64, types.Types[TINT], x)
},
sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
addF("math/bits", "Len",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
......@@ -2815,7 +2815,7 @@ func init() {
}
return s.newValue1(ssa.OpBitLen64, types.Types[TINT], args[0])
},
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS)
sys.AMD64, sys.ARM64, sys.ARM, sys.S390X, sys.MIPS, sys.PPC64)
// LeadingZeros is handled because it trivially calls Len.
addF("math/bits", "Reverse64",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
......@@ -2845,7 +2845,7 @@ func init() {
return s.newValue1(ssa.OpBitRev64, types.Types[TINT], args[0])
},
sys.ARM64)
makeOnesCount := func(op64 ssa.Op, op32 ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
makeOnesCountAMD64 := func(op64 ssa.Op, op32 ssa.Op) func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
aux := s.lookupSymbol(n, &ssa.ExternSymbol{Sym: syslook("support_popcnt").Sym.Linksym()})
addr := s.entryNewValue1A(ssa.OpAddr, types.Types[TBOOL].PtrTo(), aux, s.sb)
......@@ -2881,17 +2881,27 @@ func init() {
}
}
addF("math/bits", "OnesCount64",
makeOnesCount(ssa.OpPopCount64, ssa.OpPopCount64),
makeOnesCountAMD64(ssa.OpPopCount64, ssa.OpPopCount64),
sys.AMD64)
addF("math/bits", "OnesCount64",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount64, types.Types[TINT], args[0])
},
sys.PPC64)
addF("math/bits", "OnesCount32",
makeOnesCount(ssa.OpPopCount32, ssa.OpPopCount32),
makeOnesCountAMD64(ssa.OpPopCount32, ssa.OpPopCount32),
sys.AMD64)
addF("math/bits", "OnesCount32",
func(s *state, n *Node, args []*ssa.Value) *ssa.Value {
return s.newValue1(ssa.OpPopCount32, types.Types[TINT], args[0])
},
sys.PPC64)
addF("math/bits", "OnesCount16",
makeOnesCount(ssa.OpPopCount16, ssa.OpPopCount16),
makeOnesCountAMD64(ssa.OpPopCount16, ssa.OpPopCount16),
sys.AMD64)
// Note: no OnesCount8, the Go implementation is faster - just a table load.
addF("math/bits", "OnesCount",
makeOnesCount(ssa.OpPopCount64, ssa.OpPopCount32),
makeOnesCountAMD64(ssa.OpPopCount64, ssa.OpPopCount32),
sys.AMD64)
/******** sync/atomic ********/
......
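
The registrations above follow a common pattern: narrower widths are zero-extended and reuse the wider SSA op, and LeadingZeros needs no entries of its own because it trivially calls Len. A small self-contained illustration of the identities this relies on (plain Go sketch, not compiler code):

package main

import (
	"fmt"
	"math/bits"
)

func main() {
	var x uint16 = 0x0123

	// Narrow widths reduce to the 64-bit op after zero extension:
	// the extra high bits are all zero, so the bit length is unchanged.
	fmt.Println(bits.Len16(x) == bits.Len64(uint64(x))) // true

	// LeadingZeros is derived from Len: LeadingZeros64(y) == 64 - Len64(y).
	var y uint64 = 1 << 40
	fmt.Println(bits.LeadingZeros64(y) == 64-bits.Len64(y)) // true
}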
......@@ -596,7 +596,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_REG
p.To.Reg = ppc64.REGTMP // Ignored; this is for the carry effect.
case ssa.OpPPC64NEG, ssa.OpPPC64FNEG, ssa.OpPPC64FSQRT, ssa.OpPPC64FSQRTS, ssa.OpPPC64FCTIDZ, ssa.OpPPC64FCTIWZ, ssa.OpPPC64FCFID, ssa.OpPPC64FRSP:
case ssa.OpPPC64NEG, ssa.OpPPC64FNEG, ssa.OpPPC64FSQRT, ssa.OpPPC64FSQRTS, ssa.OpPPC64FCTIDZ, ssa.OpPPC64FCTIWZ, ssa.OpPPC64FCFID, ssa.OpPPC64FRSP, ssa.OpPPC64CNTLZD, ssa.OpPPC64CNTLZW, ssa.OpPPC64POPCNTD, ssa.OpPPC64POPCNTW, ssa.OpPPC64POPCNTB:
r := v.Reg()
p := s.Prog(v.Op.Asm())
p.To.Type = obj.TYPE_REG
......
......@@ -244,6 +244,17 @@
// (Addr {sym} base) -> (ADDconst {sym} base)
(OffPtr [off] ptr) -> (ADD (MOVDconst <typ.Int64> [off]) ptr)
(Ctz64 x) -> (POPCNTD (ANDN <types.Int64> (ADDconst <types.Int64> [-1] x) x))
(Ctz32 x) -> (POPCNTW (MOVWZreg (ANDN <types.Int> (ADDconst <types.Int> [-1] x) x)))
(BitLen64 x) -> (SUB (MOVDconst [64]) (CNTLZD <types.Int> x))
(BitLen32 x) -> (SUB (MOVDconst [32]) (CNTLZW <types.Int> x))
(PopCount64 x) -> (POPCNTD x)
(PopCount32 x) -> (POPCNTW (MOVWZreg x))
(PopCount16 x) -> (POPCNTW (MOVHZreg x))
(PopCount8 x) -> (POPCNTB (MOVBreg x))
(And64 x y) -> (AND x y)
(And32 x y) -> (AND x y)
(And16 x y) -> (AND x y)
......
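
The Ctz rules rely on the identity ctz(x) = popcount((x-1) &^ x): subtracting one turns the trailing zeros into ones and clears the lowest set bit, and the ANDN with x then drops every bit still set in x, leaving exactly the former trailing-zero positions set (for x == 0 the mask is all ones, giving the full width). BitLen uses len(x) = width - clz(x). A quick self-contained check of both identities in plain Go (illustrative only, not generated compiler code):

package main

import (
	"fmt"
	"math/bits"
)

// ctzViaPopcount mirrors the (Ctz64 x) rule: POPCNTD((x-1) &^ x).
func ctzViaPopcount(x uint64) int {
	return bits.OnesCount64((x - 1) &^ x)
}

// bitLenViaClz mirrors the (BitLen64 x) rule: 64 - CNTLZD(x).
func bitLenViaClz(x uint64) int {
	return 64 - bits.LeadingZeros64(x)
}

func main() {
	for _, x := range []uint64{0, 1, 0x80, 0xf0f0, 1 << 63} {
		fmt.Println(ctzViaPopcount(x) == bits.TrailingZeros64(x),
			bitLenViaClz(x) == bits.Len64(x))
	}
}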
......@@ -198,6 +198,13 @@ func init() {
{name: "ROTLconst", argLength: 1, reg: gp11, asm: "ROTL", aux: "Int64"}, // arg0 rotate left by auxInt bits
{name: "ROTLWconst", argLength: 1, reg: gp11, asm: "ROTLW", aux: "Int64"}, // uint32(arg0) rotate left by auxInt bits
{name: "CNTLZD", argLength: 1, reg: gp11, asm: "CNTLZD", clobberFlags: true}, // count leading zeros
{name: "CNTLZW", argLength: 1, reg: gp11, asm: "CNTLZW", clobberFlags: true}, // count leading zeros (32 bit)
{name: "POPCNTD", argLength: 1, reg: gp11, asm: "POPCNTD"}, // number of set bits in arg0
{name: "POPCNTW", argLength: 1, reg: gp11, asm: "POPCNTW"}, // number of set bits in each word of arg0 placed in corresponding word
{name: "POPCNTB", argLength: 1, reg: gp11, asm: "POPCNTB"}, // number of set bits in each byte of arg0 placed in corresonding byte
{name: "FDIV", argLength: 2, reg: fp21, asm: "FDIV"}, // arg0/arg1
{name: "FDIVS", argLength: 2, reg: fp21, asm: "FDIVS"}, // arg0/arg1
......
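
Note that POPCNTW and POPCNTB are not plain 64-bit popcounts: as the op comments say, they count set bits per 32-bit word (respectively per byte) and place each count in the corresponding word/byte of the result. That is why the PopCount32/PopCount16 rules above zero-extend first: with the upper word forced to zero its per-word count is zero, so the low word of the POPCNTW result holds the full answer. A rough Go model of the per-word behaviour (my own sketch, not compiler code):

package main

import (
	"fmt"
	"math/bits"
)

// popcntwModel models the per-word POPCNTW behaviour described above:
// the popcount of each 32-bit word of x lands in the corresponding word.
func popcntwModel(x uint64) uint64 {
	lo := uint64(bits.OnesCount32(uint32(x)))
	hi := uint64(bits.OnesCount32(uint32(x >> 32)))
	return hi<<32 | lo
}

func main() {
	var x uint32 = 0xF0F0F0F0
	// Zero-extending first (the MOVWZreg in the rule) zeroes the upper word,
	// so its count is 0 and the low word carries the total.
	fmt.Println(popcntwModel(uint64(x)))                                // 16
	fmt.Println(popcntwModel(uint64(x)) == uint64(bits.OnesCount32(x))) // true
}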
......@@ -1294,6 +1294,11 @@ const (
OpPPC64SLWconst
OpPPC64ROTLconst
OpPPC64ROTLWconst
OpPPC64CNTLZD
OpPPC64CNTLZW
OpPPC64POPCNTD
OpPPC64POPCNTW
OpPPC64POPCNTB
OpPPC64FDIV
OpPPC64FDIVS
OpPPC64DIVD
......@@ -16568,6 +16573,73 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "CNTLZD",
argLen: 1,
clobberFlags: true,
asm: ppc64.ACNTLZD,
reg: regInfo{
inputs: []inputInfo{
{0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
},
outputs: []outputInfo{
{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
},
},
},
{
name: "CNTLZW",
argLen: 1,
clobberFlags: true,
asm: ppc64.ACNTLZW,
reg: regInfo{
inputs: []inputInfo{
{0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
},
outputs: []outputInfo{
{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
},
},
},
{
name: "POPCNTD",
argLen: 1,
asm: ppc64.APOPCNTD,
reg: regInfo{
inputs: []inputInfo{
{0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
},
outputs: []outputInfo{
{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
},
},
},
{
name: "POPCNTW",
argLen: 1,
asm: ppc64.APOPCNTW,
reg: regInfo{
inputs: []inputInfo{
{0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
},
outputs: []outputInfo{
{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
},
},
},
{
name: "POPCNTB",
argLen: 1,
asm: ppc64.APOPCNTB,
reg: regInfo{
inputs: []inputInfo{
{0, 1073733630}, // SP SB R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
},
outputs: []outputInfo{
{0, 1073733624}, // R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R14 R15 R16 R17 R18 R19 R20 R21 R22 R23 R24 R25 R26 R27 R28 R29
},
},
},
{
name: "FDIV",
argLen: 2,
......
......@@ -69,6 +69,10 @@ func rewriteValuePPC64(v *Value) bool {
return rewriteValuePPC64_OpAtomicStore64_0(v)
case OpAvg64u:
return rewriteValuePPC64_OpAvg64u_0(v)
case OpBitLen32:
return rewriteValuePPC64_OpBitLen32_0(v)
case OpBitLen64:
return rewriteValuePPC64_OpBitLen64_0(v)
case OpClosureCall:
return rewriteValuePPC64_OpClosureCall_0(v)
case OpCom16:
......@@ -97,6 +101,10 @@ func rewriteValuePPC64(v *Value) bool {
return rewriteValuePPC64_OpConstNil_0(v)
case OpConvert:
return rewriteValuePPC64_OpConvert_0(v)
case OpCtz32:
return rewriteValuePPC64_OpCtz32_0(v)
case OpCtz64:
return rewriteValuePPC64_OpCtz64_0(v)
case OpCvt32Fto32:
return rewriteValuePPC64_OpCvt32Fto32_0(v)
case OpCvt32Fto64:
......@@ -465,6 +473,14 @@ func rewriteValuePPC64(v *Value) bool {
return rewriteValuePPC64_OpPPC64XOR_0(v)
case OpPPC64XORconst:
return rewriteValuePPC64_OpPPC64XORconst_0(v)
case OpPopCount16:
return rewriteValuePPC64_OpPopCount16_0(v)
case OpPopCount32:
return rewriteValuePPC64_OpPopCount32_0(v)
case OpPopCount64:
return rewriteValuePPC64_OpPopCount64_0(v)
case OpPopCount8:
return rewriteValuePPC64_OpPopCount8_0(v)
case OpRound32F:
return rewriteValuePPC64_OpRound32F_0(v)
case OpRound64F:
......@@ -988,6 +1004,46 @@ func rewriteValuePPC64_OpAvg64u_0(v *Value) bool {
return true
}
}
func rewriteValuePPC64_OpBitLen32_0(v *Value) bool {
b := v.Block
_ = b
types := &b.Func.Config.Types
_ = types
// match: (BitLen32 x)
// cond:
// result: (SUB (MOVDconst [32]) (CNTLZW <types.Int> x))
for {
x := v.Args[0]
v.reset(OpPPC64SUB)
v0 := b.NewValue0(v.Pos, OpPPC64MOVDconst, types.Int64)
v0.AuxInt = 32
v.AddArg(v0)
v1 := b.NewValue0(v.Pos, OpPPC64CNTLZW, types.Int)
v1.AddArg(x)
v.AddArg(v1)
return true
}
}
func rewriteValuePPC64_OpBitLen64_0(v *Value) bool {
b := v.Block
_ = b
types := &b.Func.Config.Types
_ = types
// match: (BitLen64 x)
// cond:
// result: (SUB (MOVDconst [64]) (CNTLZD <types.Int> x))
for {
x := v.Args[0]
v.reset(OpPPC64SUB)
v0 := b.NewValue0(v.Pos, OpPPC64MOVDconst, types.Int64)
v0.AuxInt = 64
v.AddArg(v0)
v1 := b.NewValue0(v.Pos, OpPPC64CNTLZD, types.Int)
v1.AddArg(x)
v.AddArg(v1)
return true
}
}
func rewriteValuePPC64_OpClosureCall_0(v *Value) bool {
// match: (ClosureCall [argwid] entry closure mem)
// cond:
......@@ -1155,6 +1211,50 @@ func rewriteValuePPC64_OpConvert_0(v *Value) bool {
return true
}
}
func rewriteValuePPC64_OpCtz32_0(v *Value) bool {
b := v.Block
_ = b
types := &b.Func.Config.Types
_ = types
// match: (Ctz32 x)
// cond:
// result: (POPCNTW (MOVWZreg (ANDN <types.Int> (ADDconst <types.Int> [-1] x) x)))
for {
x := v.Args[0]
v.reset(OpPPC64POPCNTW)
v0 := b.NewValue0(v.Pos, OpPPC64MOVWZreg, types.Int64)
v1 := b.NewValue0(v.Pos, OpPPC64ANDN, types.Int)
v2 := b.NewValue0(v.Pos, OpPPC64ADDconst, types.Int)
v2.AuxInt = -1
v2.AddArg(x)
v1.AddArg(v2)
v1.AddArg(x)
v0.AddArg(v1)
v.AddArg(v0)
return true
}
}
func rewriteValuePPC64_OpCtz64_0(v *Value) bool {
b := v.Block
_ = b
types := &b.Func.Config.Types
_ = types
// match: (Ctz64 x)
// cond:
// result: (POPCNTD (ANDN <types.Int64> (ADDconst <types.Int64> [-1] x) x))
for {
x := v.Args[0]
v.reset(OpPPC64POPCNTD)
v0 := b.NewValue0(v.Pos, OpPPC64ANDN, types.Int64)
v1 := b.NewValue0(v.Pos, OpPPC64ADDconst, types.Int64)
v1.AuxInt = -1
v1.AddArg(x)
v0.AddArg(v1)
v0.AddArg(x)
v.AddArg(v0)
return true
}
}
func rewriteValuePPC64_OpCvt32Fto32_0(v *Value) bool {
b := v.Block
_ = b
......@@ -7944,6 +8044,68 @@ func rewriteValuePPC64_OpPPC64XORconst_0(v *Value) bool {
}
return false
}
func rewriteValuePPC64_OpPopCount16_0(v *Value) bool {
b := v.Block
_ = b
types := &b.Func.Config.Types
_ = types
// match: (PopCount16 x)
// cond:
// result: (POPCNTW (MOVHZreg x))
for {
x := v.Args[0]
v.reset(OpPPC64POPCNTW)
v0 := b.NewValue0(v.Pos, OpPPC64MOVHZreg, types.Int64)
v0.AddArg(x)
v.AddArg(v0)
return true
}
}
func rewriteValuePPC64_OpPopCount32_0(v *Value) bool {
b := v.Block
_ = b
types := &b.Func.Config.Types
_ = types
// match: (PopCount32 x)
// cond:
// result: (POPCNTW (MOVWZreg x))
for {
x := v.Args[0]
v.reset(OpPPC64POPCNTW)
v0 := b.NewValue0(v.Pos, OpPPC64MOVWZreg, types.Int64)
v0.AddArg(x)
v.AddArg(v0)
return true
}
}
func rewriteValuePPC64_OpPopCount64_0(v *Value) bool {
// match: (PopCount64 x)
// cond:
// result: (POPCNTD x)
for {
x := v.Args[0]
v.reset(OpPPC64POPCNTD)
v.AddArg(x)
return true
}
}
func rewriteValuePPC64_OpPopCount8_0(v *Value) bool {
b := v.Block
_ = b
types := &b.Func.Config.Types
_ = types
// match: (PopCount8 x)
// cond:
// result: (POPCNTB (MOVBreg x))
for {
x := v.Args[0]
v.reset(OpPPC64POPCNTB)
v0 := b.NewValue0(v.Pos, OpPPC64MOVBreg, types.Int64)
v0.AddArg(x)
v.AddArg(v0)
return true
}
}
func rewriteValuePPC64_OpRound32F_0(v *Value) bool {
// match: (Round32F x)
// cond:
......
......@@ -254,26 +254,26 @@ func testOnesCount(t *testing.T, x uint64, want int) {
if x <= 1<<8-1 {
got := OnesCount8(uint8(x))
if got != want {
t.Fatalf("OnesCount8(%#02x) == %d; want %d", x, got, want)
t.Fatalf("OnesCount8(%#02x) == %d; want %d", uint8(x), got, want)
}
}
if x <= 1<<16-1 {
got := OnesCount16(uint16(x))
if got != want {
t.Fatalf("OnesCount16(%#04x) == %d; want %d", x, got, want)
t.Fatalf("OnesCount16(%#04x) == %d; want %d", uint16(x), got, want)
}
}
if x <= 1<<32-1 {
got := OnesCount32(uint32(x))
if got != want {
t.Fatalf("OnesCount32(%#08x) == %d; want %d", x, got, want)
t.Fatalf("OnesCount32(%#08x) == %d; want %d", uint32(x), got, want)
}
if UintSize == 32 {
got = OnesCount(uint(x))
if got != want {
t.Fatalf("OnesCount(%#08x) == %d; want %d", x, got, want)
t.Fatalf("OnesCount(%#08x) == %d; want %d", uint32(x), got, want)
}
}
}
......