Commit 659dd4f1 authored by Cherry Zhang's avatar Cherry Zhang

cmd/compile: add more ARM64 optimizations

- Use machine instructions for uint64<->float conversions
- Do not enforce alignment on Zero/Move
	ARM64 supports unaligned load/stores, but only aligned offset
	or small offset can be encoded into instructions.
- Do combined loads

Change-Id: Iffca7dd0f13070b17b784861ce5a30af584680eb
Reviewed-on: https://go-review.googlesource.com/27086Reviewed-by: default avatarDavid Chase <drchase@google.com>
parent cda633b3
......@@ -75,6 +75,9 @@ var progtable = [arm64.ALAST & obj.AMask]obj.ProgInfo{
arm64.AADDS & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RegRead | gc.RightWrite | gc.SetCarry},
arm64.ACSET & obj.AMask: {Flags: gc.SizeQ | gc.RightWrite},
arm64.ACSEL & obj.AMask: {Flags: gc.SizeQ | gc.RegRead | gc.RightWrite},
arm64.AREV & obj.AMask: {Flags: gc.SizeQ | gc.LeftRead | gc.RightWrite},
arm64.AREVW & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RightWrite},
arm64.AREV16W & obj.AMask: {Flags: gc.SizeL | gc.LeftRead | gc.RightWrite},
// Floating point.
arm64.AFADDD & obj.AMask: {Flags: gc.SizeD | gc.LeftRead | gc.RegRead | gc.RightWrite},
......
......@@ -482,7 +482,10 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
ssa.OpARM64UCVTFS,
ssa.OpARM64UCVTFD,
ssa.OpARM64FCVTSD,
ssa.OpARM64FCVTDS:
ssa.OpARM64FCVTDS,
ssa.OpARM64REV,
ssa.OpARM64REVW,
ssa.OpARM64REV16W:
p := gc.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = gc.SSARegNum(v.Args[0])
......@@ -519,30 +522,13 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
// CMP Rarg1, R16
// BLE -2(PC)
// arg1 is the address of the last element to zero
// auxint is alignment
var sz int64
var mov obj.As
switch {
case v.AuxInt%8 == 0:
sz = 8
mov = arm64.AMOVD
case v.AuxInt%4 == 0:
sz = 4
mov = arm64.AMOVW
case v.AuxInt%2 == 0:
sz = 2
mov = arm64.AMOVH
default:
sz = 1
mov = arm64.AMOVB
}
p := gc.Prog(mov)
p := gc.Prog(arm64.AMOVD)
p.Scond = arm64.C_XPOST
p.From.Type = obj.TYPE_REG
p.From.Reg = arm64.REGZERO
p.To.Type = obj.TYPE_MEM
p.To.Reg = arm64.REG_R16
p.To.Offset = sz
p.To.Offset = 8
p2 := gc.Prog(arm64.ACMP)
p2.From.Type = obj.TYPE_REG
p2.From.Reg = gc.SSARegNum(v.Args[1])
......@@ -556,37 +542,20 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
// CMP Rarg2, R16
// BLE -3(PC)
// arg2 is the address of the last element of src
// auxint is alignment
var sz int64
var mov obj.As
switch {
case v.AuxInt%8 == 0:
sz = 8
mov = arm64.AMOVD
case v.AuxInt%4 == 0:
sz = 4
mov = arm64.AMOVW
case v.AuxInt%2 == 0:
sz = 2
mov = arm64.AMOVH
default:
sz = 1
mov = arm64.AMOVB
}
p := gc.Prog(mov)
p := gc.Prog(arm64.AMOVD)
p.Scond = arm64.C_XPOST
p.From.Type = obj.TYPE_MEM
p.From.Reg = arm64.REG_R16
p.From.Offset = sz
p.From.Offset = 8
p.To.Type = obj.TYPE_REG
p.To.Reg = arm64.REGTMP
p2 := gc.Prog(mov)
p2 := gc.Prog(arm64.AMOVD)
p2.Scond = arm64.C_XPOST
p2.From.Type = obj.TYPE_REG
p2.From.Reg = arm64.REGTMP
p2.To.Type = obj.TYPE_MEM
p2.To.Reg = arm64.REG_R17
p2.To.Offset = sz
p2.To.Offset = 8
p3 := gc.Prog(arm64.ACMP)
p3.From.Type = obj.TYPE_REG
p3.From.Reg = gc.SSARegNum(v.Args[2])
......
......@@ -1344,6 +1344,14 @@ var fpConvOpToSSA32 = map[twoTypes]twoOpsAndType{
twoTypes{TFLOAT64, TUINT32}: twoOpsAndType{ssa.OpCvt64Fto32U, ssa.OpCopy, TUINT32},
}
// uint64<->float conversions, only on machines that have intructions for that
var uint64fpConvOpToSSA = map[twoTypes]twoOpsAndType{
twoTypes{TUINT64, TFLOAT32}: twoOpsAndType{ssa.OpCopy, ssa.OpCvt64Uto32F, TUINT64},
twoTypes{TUINT64, TFLOAT64}: twoOpsAndType{ssa.OpCopy, ssa.OpCvt64Uto64F, TUINT64},
twoTypes{TFLOAT32, TUINT64}: twoOpsAndType{ssa.OpCvt32Fto64U, ssa.OpCopy, TUINT64},
twoTypes{TFLOAT64, TUINT64}: twoOpsAndType{ssa.OpCvt64Fto64U, ssa.OpCopy, TUINT64},
}
var shiftOpToSSA = map[opAndTwoTypes]ssa.Op{
opAndTwoTypes{OLSH, TINT8, TUINT8}: ssa.OpLsh8x8,
opAndTwoTypes{OLSH, TUINT8, TUINT8}: ssa.OpLsh8x8,
......@@ -1665,6 +1673,11 @@ func (s *state) expr(n *Node) *ssa.Value {
conv = conv1
}
}
if Thearch.LinkArch.Name == "arm64" {
if conv1, ok1 := uint64fpConvOpToSSA[twoTypes{s.concreteEtype(ft), s.concreteEtype(tt)}]; ok1 {
conv = conv1
}
}
if !ok {
s.Fatalf("weird float conversion %s -> %s", ft, tt)
}
......
......@@ -206,6 +206,9 @@ func init() {
{name: "FNEGS", argLength: 1, reg: fp11, asm: "FNEGS"}, // -arg0, float32
{name: "FNEGD", argLength: 1, reg: fp11, asm: "FNEGD"}, // -arg0, float64
{name: "FSQRTD", argLength: 1, reg: fp11, asm: "FSQRTD"}, // sqrt(arg0), float64
{name: "REV", argLength: 1, reg: gp11, asm: "REV"}, // byte reverse, 64-bit
{name: "REVW", argLength: 1, reg: gp11, asm: "REVW"}, // byte reverse, 32-bit
{name: "REV16W", argLength: 1, reg: gp11, asm: "REV16W"}, // byte reverse in each 16-bit halfword, 32-bit
// shifts
{name: "SLL", argLength: 2, reg: gp21, asm: "LSL"}, // arg0 << arg1, shift amount is mod 64
......@@ -356,7 +359,6 @@ func init() {
// arg0 = address of memory to zero (in R16 aka arm64.REGRT1, changed as side effect)
// arg1 = address of the last element to zero
// arg2 = mem
// auxint = alignment
// returns mem
// MOVD.P ZR, 8(R16)
// CMP Rarg1, R16
......@@ -365,7 +367,6 @@ func init() {
// the-end-of-the-memory - 8 is with the area to zero, ok to spill.
{
name: "LoweredZero",
aux: "Int64",
argLength: 3,
reg: regInfo{
inputs: []regMask{buildReg("R16"), gp},
......@@ -379,7 +380,6 @@ func init() {
// arg1 = address of src memory (in R16 aka arm64.REGRT1, changed as side effect)
// arg2 = address of the last element of src
// arg3 = mem
// auxint = alignment
// returns mem
// MOVD.P 8(R16), Rtmp
// MOVD.P Rtmp, 8(R17)
......@@ -389,7 +389,6 @@ func init() {
// the-end-of-src - 8 is within the area to copy, ok to spill.
{
name: "LoweredMove",
aux: "Int64",
argLength: 4,
reg: regInfo{
inputs: []regMask{buildReg("R17"), buildReg("R16"), gp},
......
......@@ -437,6 +437,10 @@ var genericOps = []opData{
{name: "Cvt32Uto64F", argLength: 1}, // uint32 -> float64, only used on 32-bit arch
{name: "Cvt32Fto32U", argLength: 1}, // float32 -> uint32, only used on 32-bit arch
{name: "Cvt64Fto32U", argLength: 1}, // float64 -> uint32, only used on 32-bit arch
{name: "Cvt64Uto32F", argLength: 1}, // uint64 -> float32, only used on archs that has the instruction
{name: "Cvt64Uto64F", argLength: 1}, // uint64 -> float64, only used on archs that has the instruction
{name: "Cvt32Fto64U", argLength: 1}, // float32 -> uint64, only used on archs that has the instruction
{name: "Cvt64Fto64U", argLength: 1}, // float64 -> uint64, only used on archs that has the instruction
// pseudo-ops for breaking Tuple
{name: "Select0", argLength: 1}, // the first component of a tuple
......
......@@ -841,6 +841,9 @@ const (
OpARM64FNEGS
OpARM64FNEGD
OpARM64FSQRTD
OpARM64REV
OpARM64REVW
OpARM64REV16W
OpARM64SLL
OpARM64SLLconst
OpARM64SRL
......@@ -1377,6 +1380,10 @@ const (
OpCvt32Uto64F
OpCvt32Fto32U
OpCvt64Fto32U
OpCvt64Uto32F
OpCvt64Uto64F
OpCvt32Fto64U
OpCvt64Fto64U
OpSelect0
OpSelect1
)
......@@ -10412,6 +10419,45 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "REV",
argLen: 1,
asm: arm64.AREV,
reg: regInfo{
inputs: []inputInfo{
{0, 268173311}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g
},
outputs: []outputInfo{
{0, 133955583}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26
},
},
},
{
name: "REVW",
argLen: 1,
asm: arm64.AREVW,
reg: regInfo{
inputs: []inputInfo{
{0, 268173311}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g
},
outputs: []outputInfo{
{0, 133955583}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26
},
},
},
{
name: "REV16W",
argLen: 1,
asm: arm64.AREV16W,
reg: regInfo{
inputs: []inputInfo{
{0, 268173311}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26 g
},
outputs: []outputInfo{
{0, 133955583}, // R0 R1 R2 R3 R4 R5 R6 R7 R8 R9 R10 R11 R12 R13 R14 R15 R16 R17 R19 R20 R21 R22 R23 R24 R25 R26
},
},
},
{
name: "SLL",
argLen: 2,
......@@ -11762,7 +11808,6 @@ var opcodeTable = [...]opInfo{
},
{
name: "LoweredZero",
auxType: auxInt64,
argLen: 3,
clobberFlags: true,
reg: regInfo{
......@@ -11775,7 +11820,6 @@ var opcodeTable = [...]opInfo{
},
{
name: "LoweredMove",
auxType: auxInt64,
argLen: 4,
clobberFlags: true,
reg: regInfo{
......@@ -14859,6 +14903,26 @@ var opcodeTable = [...]opInfo{
argLen: 1,
generic: true,
},
{
name: "Cvt64Uto32F",
argLen: 1,
generic: true,
},
{
name: "Cvt64Uto64F",
argLen: 1,
generic: true,
},
{
name: "Cvt32Fto64U",
argLen: 1,
generic: true,
},
{
name: "Cvt64Fto64U",
argLen: 1,
generic: true,
},
{
name: "Select0",
argLen: 1,
......
......@@ -149,6 +149,18 @@ func canMergeSym(x, y interface{}) bool {
return x == nil || y == nil
}
// isArg returns whether s is an arg symbol
func isArg(s interface{}) bool {
_, ok := s.(*ArgSymbol)
return ok
}
// isAuto returns whether s is an auto symbol
func isAuto(s interface{}) bool {
_, ok := s.(*AutoSymbol)
return ok
}
// nlz returns the number of leading zeros.
func nlz(x int64) int64 {
// log2(0) == 1, so nlz(0) == 64
......
......@@ -2245,12 +2245,13 @@ func asmout(ctxt *obj.Link, p *obj.Prog, o *Optab, out []uint32) {
case 20: /* movT R,O(R) -> strT */
v := int32(regoff(ctxt, &p.To))
sz := int32(1 << uint(movesize(p.As)))
r := int(p.To.Reg)
if r == 0 {
r = int(o.param)
}
if v < 0 { /* unscaled 9-bit signed */
if v < 0 || v%sz != 0 { /* unscaled 9-bit signed */
o1 = olsr9s(ctxt, int32(opstr9(ctxt, p.As)), v, r, int(p.From.Reg))
} else {
v = int32(offsetshift(ctxt, int64(v), int(o.a3)))
......@@ -2259,16 +2260,16 @@ func asmout(ctxt *obj.Link, p *obj.Prog, o *Optab, out []uint32) {
case 21: /* movT O(R),R -> ldrT */
v := int32(regoff(ctxt, &p.From))
sz := int32(1 << uint(movesize(p.As)))
r := int(p.From.Reg)
if r == 0 {
r = int(o.param)
}
if v < 0 { /* unscaled 9-bit signed */
if v < 0 || v%sz != 0 { /* unscaled 9-bit signed */
o1 = olsr9s(ctxt, int32(opldr9(ctxt, p.As)), v, r, int(p.To.Reg))
} else {
v = int32(offsetshift(ctxt, int64(v), int(o.a1)))
//print("offset=%lld v=%ld a1=%d\n", instoffset, v, o->a1);
o1 = olsr12u(ctxt, int32(opldr12(ctxt, p.As)), v, r, int(p.To.Reg))
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment