Commit 4a33af6b authored by Keith Randall's avatar Keith Randall

[dev.ssa] cmd/compile: more 386 port changes

Fix up zero/move code, including duff calls and rep movs.

Handle the new ops generated by dec64.rules.

Fix constant shifts.

Change-Id: I7d89194b29b04311bfafa0fd93b9f5644af04df9
Reviewed-on: https://go-review.googlesource.com/25033
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarDavid Chase <drchase@google.com>
parent 1b0404c4
...@@ -83,8 +83,7 @@ ...@@ -83,8 +83,7 @@
(Not x) -> (XORLconst [1] x) (Not x) -> (XORLconst [1] x)
// Lowering pointer arithmetic // Lowering pointer arithmetic
(OffPtr [off] ptr) && is32Bit(off) -> (ADDLconst [off] ptr) (OffPtr [off] ptr) -> (ADDLconst [off] ptr)
(OffPtr [off] ptr) -> (ADDL (MOVLconst [off]) ptr)
(Bswap32 x) -> (BSWAPL x) (Bswap32 x) -> (BSWAPL x)
...@@ -99,6 +98,9 @@ ...@@ -99,6 +98,9 @@
(ZeroExt8to32 x) -> (MOVBLZX x) (ZeroExt8to32 x) -> (MOVBLZX x)
(ZeroExt16to32 x) -> (MOVWLZX x) (ZeroExt16to32 x) -> (MOVWLZX x)
(Signmask x) -> (SARLconst x [31])
(Zeromask x) -> (SBBLcarrymask (CMPL (MOVLconst [0]) x))
// Lowering truncation // Lowering truncation
// Because we ignore high parts of registers, truncates are just copies. // Because we ignore high parts of registers, truncates are just copies.
(Trunc16to8 x) -> x (Trunc16to8 x) -> x
...@@ -161,6 +163,26 @@ ...@@ -161,6 +163,26 @@
(Rsh8x16 <t> x y) -> (SARB <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPWconst y [8]))))) (Rsh8x16 <t> x y) -> (SARB <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPWconst y [8])))))
(Rsh8x8 <t> x y) -> (SARB <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPBconst y [8]))))) (Rsh8x8 <t> x y) -> (SARB <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPBconst y [8])))))
// constant shifts
// generic opt rewrites all constant shifts to shift by Const64
(Lsh32x64 x (Const64 [c])) && uint64(c) < 32 -> (SHLLconst x [c])
(Rsh32x64 x (Const64 [c])) && uint64(c) < 32 -> (SARLconst x [c])
(Rsh32Ux64 x (Const64 [c])) && uint64(c) < 32 -> (SHRLconst x [c])
(Lsh16x64 x (Const64 [c])) && uint64(c) < 16 -> (SHLLconst x [c])
(Rsh16x64 x (Const64 [c])) && uint64(c) < 16 -> (SARWconst x [c])
(Rsh16Ux64 x (Const64 [c])) && uint64(c) < 16 -> (SHRWconst x [c])
(Lsh8x64 x (Const64 [c])) && uint64(c) < 8 -> (SHLLconst x [c])
(Rsh8x64 x (Const64 [c])) && uint64(c) < 8 -> (SARBconst x [c])
(Rsh8Ux64 x (Const64 [c])) && uint64(c) < 8 -> (SHRBconst x [c])
// large constant shifts
(Lsh32x64 _ (Const64 [c])) && uint64(c) >= 32 -> (Const32 [0])
(Rsh32Ux64 _ (Const64 [c])) && uint64(c) >= 32 -> (Const32 [0])
(Lsh16x64 _ (Const64 [c])) && uint64(c) >= 16 -> (Const16 [0])
(Rsh16Ux64 _ (Const64 [c])) && uint64(c) >= 16 -> (Const16 [0])
(Lsh8x64 _ (Const64 [c])) && uint64(c) >= 8 -> (Const8 [0])
(Rsh8Ux64 _ (Const64 [c])) && uint64(c) >= 8 -> (Const8 [0])
// Lowering comparisons // Lowering comparisons
(Less32 x y) -> (SETL (CMPL x y)) (Less32 x y) -> (SETL (CMPL x y))
(Less16 x y) -> (SETL (CMPW x y)) (Less16 x y) -> (SETL (CMPW x y))
...@@ -241,7 +263,6 @@ ...@@ -241,7 +263,6 @@
(Move [s] dst src mem) && SizeAndAlign(s).Size() == 1 -> (MOVBstore dst (MOVBload src mem) mem) (Move [s] dst src mem) && SizeAndAlign(s).Size() == 1 -> (MOVBstore dst (MOVBload src mem) mem)
(Move [s] dst src mem) && SizeAndAlign(s).Size() == 2 -> (MOVWstore dst (MOVWload src mem) mem) (Move [s] dst src mem) && SizeAndAlign(s).Size() == 2 -> (MOVWstore dst (MOVWload src mem) mem)
(Move [s] dst src mem) && SizeAndAlign(s).Size() == 4 -> (MOVLstore dst (MOVLload src mem) mem) (Move [s] dst src mem) && SizeAndAlign(s).Size() == 4 -> (MOVLstore dst (MOVLload src mem) mem)
(Move [s] dst src mem) && SizeAndAlign(s).Size() == 16 -> (MOVOstore dst (MOVOload src mem) mem)
(Move [s] dst src mem) && SizeAndAlign(s).Size() == 3 -> (Move [s] dst src mem) && SizeAndAlign(s).Size() == 3 ->
(MOVBstore [2] dst (MOVBload [2] src mem) (MOVBstore [2] dst (MOVBload [2] src mem)
(MOVWstore dst (MOVWload src mem) mem)) (MOVWstore dst (MOVWload src mem) mem))
...@@ -254,21 +275,32 @@ ...@@ -254,21 +275,32 @@
(Move [s] dst src mem) && SizeAndAlign(s).Size() == 7 -> (Move [s] dst src mem) && SizeAndAlign(s).Size() == 7 ->
(MOVLstore [3] dst (MOVLload [3] src mem) (MOVLstore [3] dst (MOVLload [3] src mem)
(MOVLstore dst (MOVLload src mem) mem)) (MOVLstore dst (MOVLload src mem) mem))
(Move [s] dst src mem) && SizeAndAlign(s).Size() == 8 ->
(MOVLstore [4] dst (MOVLload [4] src mem)
(MOVLstore dst (MOVLload src mem) mem))
// Adjust moves to be a multiple of 4 bytes.
(Move [s] dst src mem)
&& SizeAndAlign(s).Size() > 8 && SizeAndAlign(s).Size()%4 != 0 ->
(Move [SizeAndAlign(s).Size()-SizeAndAlign(s).Size()%4]
(ADDLconst <dst.Type> dst [SizeAndAlign(s).Size()%4])
(ADDLconst <src.Type> src [SizeAndAlign(s).Size()%4])
(MOVLstore dst (MOVLload src mem) mem))
// Medium copying uses a duff device. // Medium copying uses a duff device.
(Move [s] dst src mem) (Move [s] dst src mem)
&& SizeAndAlign(s).Size() >= 32 && SizeAndAlign(s).Size() <= 16*64 && SizeAndAlign(s).Size()%16 == 0 && SizeAndAlign(s).Size() > 8 && SizeAndAlign(s).Size() <= 4*128 && SizeAndAlign(s).Size()%4 == 0
&& !config.noDuffDevice -> && !config.noDuffDevice ->
(DUFFCOPY [14*(64-SizeAndAlign(s).Size()/16)] dst src mem) (DUFFCOPY [10*(128-SizeAndAlign(s).Size()/4)] dst src mem)
// 14 and 64 are magic constants. 14 is the number of bytes to encode: // 10 and 128 are magic constants. 10 is the number of bytes to encode:
// MOVUPS (SI), X0 // MOVL (SI), CX
// ADDL $16, SI // ADDL $4, SI
// MOVUPS X0, (DI) // MOVL CX, (DI)
// ADDL $16, DI // ADDL $4, DI
// and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy. // and 128 is the number of such blocks. See src/runtime/duff_386.s:duffcopy.
// Large copying uses REP MOVSL. // Large copying uses REP MOVSL.
(Move [s] dst src mem) && (SizeAndAlign(s).Size() > 16*64 || config.noDuffDevice) && SizeAndAlign(s).Size()%8 == 0 -> (Move [s] dst src mem) && (SizeAndAlign(s).Size() > 4*128 || config.noDuffDevice) && SizeAndAlign(s).Size()%4 == 0 ->
(REPMOVSL dst src (MOVLconst [SizeAndAlign(s).Size()/4]) mem) (REPMOVSL dst src (MOVLconst [SizeAndAlign(s).Size()/4]) mem)
// Lowering Zero instructions // Lowering Zero instructions
...@@ -309,11 +341,22 @@ ...@@ -309,11 +341,22 @@
(MOVLstoreconst [makeValAndOff(0,4)] destptr (MOVLstoreconst [makeValAndOff(0,4)] destptr
(MOVLstoreconst [0] destptr mem)))) (MOVLstoreconst [0] destptr mem))))
// Medium zeroing uses a duff device.
(Zero [s] destptr mem)
&& SizeAndAlign(s).Size() > 16
&& SizeAndAlign(s).Size() <= 4*128
&& SizeAndAlign(s).Size()%4 == 0
&& !config.noDuffDevice ->
(DUFFZERO [1*(128-SizeAndAlign(s).Size()/4)] destptr (MOVLconst [0]) mem)
// 1 and 128 are magic constants. 1 is the number of bytes to encode STOSL.
// 128 is the number of STOSL instructions in duffzero.
// See src/runtime/duff_386.s:duffzero.
// Large zeroing uses REP STOSQ. // Large zeroing uses REP STOSQ.
(Zero [s] destptr mem) (Zero [s] destptr mem)
&& (SizeAndAlign(s).Size() > 1024 || (config.noDuffDevice && SizeAndAlign(s).Size() > 32)) && (SizeAndAlign(s).Size() > 4*128 || (config.noDuffDevice && SizeAndAlign(s).Size() > 16))
&& SizeAndAlign(s).Size()%8 == 0 -> && SizeAndAlign(s).Size()%4 == 0 ->
(REPSTOSL destptr (MOVLconst [SizeAndAlign(s).Size()/8]) (MOVLconst [0]) mem) (REPSTOSL destptr (MOVLconst [SizeAndAlign(s).Size()/4]) (MOVLconst [0]) mem)
// Lowering constants // Lowering constants
(Const8 [val]) -> (MOVLconst [val]) (Const8 [val]) -> (MOVLconst [val])
...@@ -596,14 +639,12 @@ ...@@ -596,14 +639,12 @@
(MOVBload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVBload [off1+off2] {sym} ptr mem) (MOVBload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVBload [off1+off2] {sym} ptr mem)
(MOVSSload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVSSload [off1+off2] {sym} ptr mem) (MOVSSload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVSSload [off1+off2] {sym} ptr mem)
(MOVSDload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVSDload [off1+off2] {sym} ptr mem) (MOVSDload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVSDload [off1+off2] {sym} ptr mem)
(MOVOload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVOload [off1+off2] {sym} ptr mem)
(MOVLstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVLstore [off1+off2] {sym} ptr val mem) (MOVLstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVLstore [off1+off2] {sym} ptr val mem)
(MOVWstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVWstore [off1+off2] {sym} ptr val mem) (MOVWstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVWstore [off1+off2] {sym} ptr val mem)
(MOVBstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVBstore [off1+off2] {sym} ptr val mem) (MOVBstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVBstore [off1+off2] {sym} ptr val mem)
(MOVSSstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVSSstore [off1+off2] {sym} ptr val mem) (MOVSSstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVSSstore [off1+off2] {sym} ptr val mem)
(MOVSDstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVSDstore [off1+off2] {sym} ptr val mem) (MOVSDstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVSDstore [off1+off2] {sym} ptr val mem)
(MOVOstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVOstore [off1+off2] {sym} ptr val mem)
// Fold constants into stores. // Fold constants into stores.
(MOVLstore [off] {sym} ptr (MOVLconst [c]) mem) && validOff(off) -> (MOVLstore [off] {sym} ptr (MOVLconst [c]) mem) && validOff(off) ->
...@@ -633,8 +674,6 @@ ...@@ -633,8 +674,6 @@
(MOVSSload [off1+off2] {mergeSym(sym1,sym2)} base mem) (MOVSSload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVSDload [off1] {sym1} (LEAL [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) -> (MOVSDload [off1] {sym1} (LEAL [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVSDload [off1+off2] {mergeSym(sym1,sym2)} base mem) (MOVSDload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVOload [off1] {sym1} (LEAL [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVOload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVBLSXload [off1] {sym1} (LEAL [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) -> (MOVBLSXload [off1] {sym1} (LEAL [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVBLSXload [off1+off2] {mergeSym(sym1,sym2)} base mem) (MOVBLSXload [off1+off2] {mergeSym(sym1,sym2)} base mem)
...@@ -651,8 +690,6 @@ ...@@ -651,8 +690,6 @@
(MOVSSstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) (MOVSSstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
(MOVSDstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) -> (MOVSDstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVSDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem) (MOVSDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
(MOVOstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVOstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
(MOVLstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) -> (MOVLstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) ->
(MOVLstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem) (MOVLstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
......
...@@ -330,8 +330,6 @@ func init() { ...@@ -330,8 +330,6 @@ func init() {
{name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem"}, // store byte in arg1 to arg0+auxint+aux. arg2=mem {name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem"}, // store byte in arg1 to arg0+auxint+aux. arg2=mem
{name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem"}, // store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem {name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem"}, // store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem
{name: "MOVLstore", argLength: 3, reg: gpstore, asm: "MOVL", aux: "SymOff", typ: "Mem"}, // store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem {name: "MOVLstore", argLength: 3, reg: gpstore, asm: "MOVL", aux: "SymOff", typ: "Mem"}, // store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem
{name: "MOVOload", argLength: 2, reg: fpload, asm: "MOVUPS", aux: "SymOff", typ: "Int128"}, // load 16 bytes from arg0+auxint+aux. arg1=mem
{name: "MOVOstore", argLength: 3, reg: fpstore, asm: "MOVUPS", aux: "SymOff", typ: "Mem"}, // store 16 bytes in arg1 to arg0+auxint+aux. arg2=mem
// indexed loads/stores // indexed loads/stores
{name: "MOVBloadidx1", argLength: 3, reg: gploadidx, asm: "MOVBLZX", aux: "SymOff"}, // load a byte from arg0+arg1+auxint+aux. arg2=mem {name: "MOVBloadidx1", argLength: 3, reg: gploadidx, asm: "MOVBLZX", aux: "SymOff"}, // load a byte from arg0+arg1+auxint+aux. arg2=mem
...@@ -360,7 +358,7 @@ func init() { ...@@ -360,7 +358,7 @@ func init() {
{name: "MOVLstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem"}, // store low 4 bytes of ... arg1 ... {name: "MOVLstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem"}, // store low 4 bytes of ... arg1 ...
{name: "MOVLstoreconstidx4", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem"}, // store low 4 bytes of ... 4*arg1 ... {name: "MOVLstoreconstidx4", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem"}, // store low 4 bytes of ... 4*arg1 ...
// arg0 = (duff-adjusted) pointer to start of memory to zero // arg0 = pointer to start of memory to zero
// arg1 = value to store (will always be zero) // arg1 = value to store (will always be zero)
// arg2 = mem // arg2 = mem
// auxint = offset into duffzero code to start executing // auxint = offset into duffzero code to start executing
...@@ -370,11 +368,10 @@ func init() { ...@@ -370,11 +368,10 @@ func init() {
aux: "Int64", aux: "Int64",
argLength: 3, argLength: 3,
reg: regInfo{ reg: regInfo{
inputs: []regMask{buildReg("DI"), buildReg("X0")}, inputs: []regMask{buildReg("DI"), buildReg("AX")},
clobbers: buildReg("DI FLAGS"), clobbers: buildReg("DI FLAGS"),
}, },
}, },
{name: "MOVOconst", reg: regInfo{nil, 0, []regMask{fp}}, typ: "Int128", aux: "Int128", rematerializeable: true},
// arg0 = address of memory to zero // arg0 = address of memory to zero
// arg1 = # of 4-byte words to zero // arg1 = # of 4-byte words to zero
...@@ -407,7 +404,7 @@ func init() { ...@@ -407,7 +404,7 @@ func init() {
argLength: 3, argLength: 3,
reg: regInfo{ reg: regInfo{
inputs: []regMask{buildReg("DI"), buildReg("SI")}, inputs: []regMask{buildReg("DI"), buildReg("SI")},
clobbers: buildReg("DI SI X0 FLAGS"), // uses X0 as a temporary clobbers: buildReg("DI SI CX FLAGS"), // uses CX as a temporary
}, },
}, },
......
...@@ -400,8 +400,8 @@ ...@@ -400,8 +400,8 @@
(Zero [SizeAndAlign(s).Size()-8] (ADDQconst [8] destptr) (MOVQstore destptr (MOVQconst [0]) mem)) (Zero [SizeAndAlign(s).Size()-8] (ADDQconst [8] destptr) (MOVQstore destptr (MOVQconst [0]) mem))
(Zero [s] destptr mem) (Zero [s] destptr mem)
&& SizeAndAlign(s).Size() <= 1024 && SizeAndAlign(s).Size()%16 == 0 && !config.noDuffDevice -> && SizeAndAlign(s).Size() <= 1024 && SizeAndAlign(s).Size()%16 == 0 && !config.noDuffDevice ->
(DUFFZERO [duffStart(SizeAndAlign(s).Size())] (DUFFZERO [duffStartAMD64(SizeAndAlign(s).Size())]
(ADDQconst [duffAdj(SizeAndAlign(s).Size())] destptr) (MOVOconst [0]) (ADDQconst [duffAdjAMD64(SizeAndAlign(s).Size())] destptr) (MOVOconst [0])
mem) mem)
// Large zeroing uses REP STOSQ. // Large zeroing uses REP STOSQ.
......
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// This file contains rules to decompose [u]int32 types on 32-bit // This file contains rules to decompose [u]int64 types on 32-bit
// architectures. These rules work together with the decomposeBuiltIn // architectures. These rules work together with the decomposeBuiltIn
// pass which handles phis of these types. // pass which handles phis of these types.
......
...@@ -285,8 +285,6 @@ const ( ...@@ -285,8 +285,6 @@ const (
Op386MOVBstore Op386MOVBstore
Op386MOVWstore Op386MOVWstore
Op386MOVLstore Op386MOVLstore
Op386MOVOload
Op386MOVOstore
Op386MOVBloadidx1 Op386MOVBloadidx1
Op386MOVWloadidx1 Op386MOVWloadidx1
Op386MOVWloadidx2 Op386MOVWloadidx2
...@@ -306,7 +304,6 @@ const ( ...@@ -306,7 +304,6 @@ const (
Op386MOVLstoreconstidx1 Op386MOVLstoreconstidx1
Op386MOVLstoreconstidx4 Op386MOVLstoreconstidx4
Op386DUFFZERO Op386DUFFZERO
Op386MOVOconst
Op386REPSTOSL Op386REPSTOSL
Op386CALLstatic Op386CALLstatic
Op386CALLclosure Op386CALLclosure
...@@ -3152,32 +3149,6 @@ var opcodeTable = [...]opInfo{ ...@@ -3152,32 +3149,6 @@ var opcodeTable = [...]opInfo{
}, },
}, },
}, },
{
name: "MOVOload",
auxType: auxSymOff,
argLen: 2,
asm: x86.AMOVUPS,
reg: regInfo{
inputs: []inputInfo{
{0, 65791}, // AX CX DX BX SP BP SI DI SB
},
outputs: []outputInfo{
{0, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7
},
},
},
{
name: "MOVOstore",
auxType: auxSymOff,
argLen: 3,
asm: x86.AMOVUPS,
reg: regInfo{
inputs: []inputInfo{
{1, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7
{0, 65791}, // AX CX DX BX SP BP SI DI SB
},
},
},
{ {
name: "MOVBloadidx1", name: "MOVBloadidx1",
auxType: auxSymOff, auxType: auxSymOff,
...@@ -3418,22 +3389,11 @@ var opcodeTable = [...]opInfo{ ...@@ -3418,22 +3389,11 @@ var opcodeTable = [...]opInfo{
reg: regInfo{ reg: regInfo{
inputs: []inputInfo{ inputs: []inputInfo{
{0, 128}, // DI {0, 128}, // DI
{1, 256}, // X0 {1, 1}, // AX
}, },
clobbers: 131200, // DI FLAGS clobbers: 131200, // DI FLAGS
}, },
}, },
{
name: "MOVOconst",
auxType: auxInt128,
argLen: 0,
rematerializeable: true,
reg: regInfo{
outputs: []outputInfo{
{0, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7
},
},
},
{ {
name: "REPSTOSL", name: "REPSTOSL",
argLen: 4, argLen: 4,
...@@ -3502,7 +3462,7 @@ var opcodeTable = [...]opInfo{ ...@@ -3502,7 +3462,7 @@ var opcodeTable = [...]opInfo{
{0, 128}, // DI {0, 128}, // DI
{1, 64}, // SI {1, 64}, // SI
}, },
clobbers: 131520, // SI DI X0 FLAGS clobbers: 131266, // CX SI DI FLAGS
}, },
}, },
{ {
......
...@@ -254,39 +254,38 @@ func isSamePtr(p1, p2 *Value) bool { ...@@ -254,39 +254,38 @@ func isSamePtr(p1, p2 *Value) bool {
return false return false
} }
// DUFFZERO consists of repeated blocks of 4 MOVUPSs + ADD, func duffStartAMD64(size int64) int64 {
// See runtime/mkduff.go. x, _ := duffAMD64(size)
const (
dzBlocks = 16 // number of MOV/ADD blocks
dzBlockLen = 4 // number of clears per block
dzBlockSize = 19 // size of instructions in a single block
dzMovSize = 4 // size of single MOV instruction w/ offset
dzAddSize = 4 // size of single ADD instruction
dzClearStep = 16 // number of bytes cleared by each MOV instruction
dzTailLen = 4 // number of final STOSQ instructions
dzTailSize = 2 // size of single STOSQ instruction
dzClearLen = dzClearStep * dzBlockLen // bytes cleared by one block
dzSize = dzBlocks * dzBlockSize
)
func duffStart(size int64) int64 {
x, _ := duff(size)
return x return x
} }
func duffAdj(size int64) int64 { func duffAdjAMD64(size int64) int64 {
_, x := duff(size) _, x := duffAMD64(size)
return x return x
} }
// duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes) // duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
// required to use the duffzero mechanism for a block of the given size. // required to use the duffzero mechanism for a block of the given size.
func duff(size int64) (int64, int64) { func duffAMD64(size int64) (int64, int64) {
// DUFFZERO consists of repeated blocks of 4 MOVUPSs + ADD,
// See runtime/mkduff.go.
const (
dzBlocks = 16 // number of MOV/ADD blocks
dzBlockLen = 4 // number of clears per block
dzBlockSize = 19 // size of instructions in a single block
dzMovSize = 4 // size of single MOV instruction w/ offset
dzAddSize = 4 // size of single ADD instruction
dzClearStep = 16 // number of bytes cleared by each MOV instruction
dzTailLen = 4 // number of final STOSQ instructions
dzTailSize = 2 // size of single STOSQ instruction
dzClearLen = dzClearStep * dzBlockLen // bytes cleared by one block
dzSize = dzBlocks * dzBlockSize
)
if size < 32 || size > 1024 || size%dzClearStep != 0 { if size < 32 || size > 1024 || size%dzClearStep != 0 {
panic("bad duffzero size") panic("bad duffzero size")
} }
// TODO: arch-dependent
steps := size / dzClearStep steps := size / dzClearStep
blocks := steps / dzBlockLen blocks := steps / dzBlockLen
steps %= dzBlockLen steps %= dzBlockLen
......
This diff is collapsed.
...@@ -17415,7 +17415,7 @@ func rewriteValueAMD64_OpZero(v *Value, config *Config) bool { ...@@ -17415,7 +17415,7 @@ func rewriteValueAMD64_OpZero(v *Value, config *Config) bool {
} }
// match: (Zero [s] destptr mem) // match: (Zero [s] destptr mem)
// cond: SizeAndAlign(s).Size() <= 1024 && SizeAndAlign(s).Size()%16 == 0 && !config.noDuffDevice // cond: SizeAndAlign(s).Size() <= 1024 && SizeAndAlign(s).Size()%16 == 0 && !config.noDuffDevice
// result: (DUFFZERO [duffStart(SizeAndAlign(s).Size())] (ADDQconst [duffAdj(SizeAndAlign(s).Size())] destptr) (MOVOconst [0]) mem) // result: (DUFFZERO [duffStartAMD64(SizeAndAlign(s).Size())] (ADDQconst [duffAdjAMD64(SizeAndAlign(s).Size())] destptr) (MOVOconst [0]) mem)
for { for {
s := v.AuxInt s := v.AuxInt
destptr := v.Args[0] destptr := v.Args[0]
...@@ -17424,9 +17424,9 @@ func rewriteValueAMD64_OpZero(v *Value, config *Config) bool { ...@@ -17424,9 +17424,9 @@ func rewriteValueAMD64_OpZero(v *Value, config *Config) bool {
break break
} }
v.reset(OpAMD64DUFFZERO) v.reset(OpAMD64DUFFZERO)
v.AuxInt = duffStart(SizeAndAlign(s).Size()) v.AuxInt = duffStartAMD64(SizeAndAlign(s).Size())
v0 := b.NewValue0(v.Line, OpAMD64ADDQconst, config.fe.TypeUInt64()) v0 := b.NewValue0(v.Line, OpAMD64ADDQconst, config.fe.TypeUInt64())
v0.AuxInt = duffAdj(SizeAndAlign(s).Size()) v0.AuxInt = duffAdjAMD64(SizeAndAlign(s).Size())
v0.AddArg(destptr) v0.AddArg(destptr)
v.AddArg(v0) v.AddArg(v0)
v1 := b.NewValue0(v.Line, OpAMD64MOVOconst, TypeInt128) v1 := b.NewValue0(v.Line, OpAMD64MOVOconst, TypeInt128)
......
...@@ -101,11 +101,14 @@ func storeByType(t ssa.Type) obj.As { ...@@ -101,11 +101,14 @@ func storeByType(t ssa.Type) obj.As {
// moveByType returns the reg->reg move instruction of the given type. // moveByType returns the reg->reg move instruction of the given type.
func moveByType(t ssa.Type) obj.As { func moveByType(t ssa.Type) obj.As {
if t.IsFloat() { if t.IsFloat() {
// Moving the whole sse2 register is faster switch t.Size() {
// than moving just the correct low portion of it. case 4:
// There is no xmm->xmm move with 1 byte opcode, return x86.AMOVSS
// so use movups, which has 2 byte opcode. case 8:
return x86.AMOVUPS return x86.AMOVSD
default:
panic(fmt.Sprintf("bad float register width %d:%s", t.Size(), t))
}
} else { } else {
switch t.Size() { switch t.Size() {
case 1: case 1:
...@@ -115,8 +118,6 @@ func moveByType(t ssa.Type) obj.As { ...@@ -115,8 +118,6 @@ func moveByType(t ssa.Type) obj.As {
return x86.AMOVL return x86.AMOVL
case 4: case 4:
return x86.AMOVL return x86.AMOVL
case 16:
return x86.AMOVUPS // int128s are in SSE registers
default: default:
panic(fmt.Sprintf("bad int register width %d:%s", t.Size(), t)) panic(fmt.Sprintf("bad int register width %d:%s", t.Size(), t))
} }
...@@ -448,7 +449,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { ...@@ -448,7 +449,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Val = math.Float64frombits(uint64(v.AuxInt)) p.From.Val = math.Float64frombits(uint64(v.AuxInt))
p.To.Type = obj.TYPE_REG p.To.Type = obj.TYPE_REG
p.To.Reg = x p.To.Reg = x
case ssa.Op386MOVSSload, ssa.Op386MOVSDload, ssa.Op386MOVLload, ssa.Op386MOVWload, ssa.Op386MOVBload, ssa.Op386MOVBLSXload, ssa.Op386MOVWLSXload, ssa.Op386MOVOload: case ssa.Op386MOVSSload, ssa.Op386MOVSDload, ssa.Op386MOVLload, ssa.Op386MOVWload, ssa.Op386MOVBload, ssa.Op386MOVBLSXload, ssa.Op386MOVWLSXload:
p := gc.Prog(v.Op.Asm()) p := gc.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM p.From.Type = obj.TYPE_MEM
p.From.Reg = gc.SSARegNum(v.Args[0]) p.From.Reg = gc.SSARegNum(v.Args[0])
...@@ -496,7 +497,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { ...@@ -496,7 +497,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
gc.AddAux(&p.From, v) gc.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG p.To.Type = obj.TYPE_REG
p.To.Reg = gc.SSARegNum(v) p.To.Reg = gc.SSARegNum(v)
case ssa.Op386MOVSSstore, ssa.Op386MOVSDstore, ssa.Op386MOVLstore, ssa.Op386MOVWstore, ssa.Op386MOVBstore, ssa.Op386MOVOstore: case ssa.Op386MOVSSstore, ssa.Op386MOVSDstore, ssa.Op386MOVLstore, ssa.Op386MOVWstore, ssa.Op386MOVBstore:
p := gc.Prog(v.Op.Asm()) p := gc.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG p.From.Type = obj.TYPE_REG
p.From.Reg = gc.SSARegNum(v.Args[1]) p.From.Reg = gc.SSARegNum(v.Args[1])
...@@ -584,12 +585,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { ...@@ -584,12 +585,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_ADDR p.To.Type = obj.TYPE_ADDR
p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg)) p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg))
p.To.Offset = v.AuxInt p.To.Offset = v.AuxInt
case ssa.Op386MOVOconst:
if v.AuxInt != 0 {
v.Unimplementedf("MOVOconst can only do constant=0")
}
r := gc.SSARegNum(v)
opregreg(x86.AXORPS, r, r)
case ssa.Op386DUFFCOPY: case ssa.Op386DUFFCOPY:
p := gc.Prog(obj.ADUFFCOPY) p := gc.Prog(obj.ADUFFCOPY)
p.To.Type = obj.TYPE_ADDR p.To.Type = obj.TYPE_ADDR
...@@ -828,8 +823,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { ...@@ -828,8 +823,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
case ssa.Op386MOVLload, ssa.Op386MOVWload, ssa.Op386MOVBload, case ssa.Op386MOVLload, ssa.Op386MOVWload, ssa.Op386MOVBload,
ssa.Op386MOVLstore, ssa.Op386MOVWstore, ssa.Op386MOVBstore, ssa.Op386MOVLstore, ssa.Op386MOVWstore, ssa.Op386MOVBstore,
ssa.Op386MOVBLSXload, ssa.Op386MOVWLSXload, ssa.Op386MOVBLSXload, ssa.Op386MOVWLSXload,
ssa.Op386MOVSSload, ssa.Op386MOVSDload, ssa.Op386MOVOload, ssa.Op386MOVSSload, ssa.Op386MOVSDload,
ssa.Op386MOVSSstore, ssa.Op386MOVSDstore, ssa.Op386MOVOstore: ssa.Op386MOVSSstore, ssa.Op386MOVSDstore:
if w.Args[0] == v.Args[0] && w.Aux == nil && w.AuxInt >= 0 && w.AuxInt < minZeroPage { if w.Args[0] == v.Args[0] && w.Aux == nil && w.AuxInt >= 0 && w.AuxInt < minZeroPage {
if gc.Debug_checknil != 0 && int(v.Line) > 1 { if gc.Debug_checknil != 0 && int(v.Line) > 1 {
gc.Warnl(v.Line, "removed nil check") gc.Warnl(v.Line, "removed nil check")
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment