Commit 4a33af6b authored by Keith Randall

[dev.ssa] cmd/compile: more 386 port changes

Fix up zero/move code, including duff calls and rep movs.

Handle the new ops generated by dec64.rules.

Fix constant shifts.

Change-Id: I7d89194b29b04311bfafa0fd93b9f5644af04df9
Reviewed-on: https://go-review.googlesource.com/25033
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: David Chase <drchase@google.com>
parent 1b0404c4
@@ -83,8 +83,7 @@
(Not x) -> (XORLconst [1] x)
// Lowering pointer arithmetic
(OffPtr [off] ptr) && is32Bit(off) -> (ADDLconst [off] ptr)
(OffPtr [off] ptr) -> (ADDL (MOVLconst [off]) ptr)
(OffPtr [off] ptr) -> (ADDLconst [off] ptr)
(Bswap32 x) -> (BSWAPL x)
@@ -99,6 +98,9 @@
(ZeroExt8to32 x) -> (MOVBLZX x)
(ZeroExt16to32 x) -> (MOVWLZX x)
(Signmask x) -> (SARLconst x [31])
(Zeromask x) -> (SBBLcarrymask (CMPL (MOVLconst [0]) x))
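The Zeromask lowering relies on a flags trick: CMPL computes (MOVLconst [0]) minus x, which sets the carry flag exactly when x is nonzero (0 - x borrows), and SBBLcarrymask materializes that carry as 0 or -1. In plain Go, the value it computes is (an illustrative sketch, not compiler code):

	func zeromask(x uint32) uint32 {
		if x != 0 {
			return ^uint32(0) // carry set: SBB of a register with itself is all ones
		}
		return 0 // carry clear: 0 - 0 does not borrow
	}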
// Lowering truncation
// Because we ignore high parts of registers, truncates are just copies.
(Trunc16to8 x) -> x
@@ -161,6 +163,26 @@
(Rsh8x16 <t> x y) -> (SARB <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPWconst y [8])))))
(Rsh8x8 <t> x y) -> (SARB <t> x (ORL <y.Type> y (NOTL <y.Type> (SBBLcarrymask <y.Type> (CMPBconst y [8])))))
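The variable-shift rules above all share one idea: hardware shifts mask the count, but Go requires an out-of-range signed shift to drain in sign bits. SBBLcarrymask (CMPWconst y [8]) is all ones when y < 8 and zero otherwise; NOTL inverts that, so the ORL leaves an in-range y alone and turns an out-of-range y into an all-ones count. A plain-Go sketch of the Rsh8x16 semantics (illustrative only):

	func rsh8x16(x int8, y uint16) int8 {
		if y >= 8 {
			return x >> 7 // all-ones count: the SAR fills with copies of the sign bit
		}
		return x >> y
	}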
// constant shifts
// generic opt rewrites all constant shifts to shift by Const64
(Lsh32x64 x (Const64 [c])) && uint64(c) < 32 -> (SHLLconst x [c])
(Rsh32x64 x (Const64 [c])) && uint64(c) < 32 -> (SARLconst x [c])
(Rsh32Ux64 x (Const64 [c])) && uint64(c) < 32 -> (SHRLconst x [c])
(Lsh16x64 x (Const64 [c])) && uint64(c) < 16 -> (SHLLconst x [c])
(Rsh16x64 x (Const64 [c])) && uint64(c) < 16 -> (SARWconst x [c])
(Rsh16Ux64 x (Const64 [c])) && uint64(c) < 16 -> (SHRWconst x [c])
(Lsh8x64 x (Const64 [c])) && uint64(c) < 8 -> (SHLLconst x [c])
(Rsh8x64 x (Const64 [c])) && uint64(c) < 8 -> (SARBconst x [c])
(Rsh8Ux64 x (Const64 [c])) && uint64(c) < 8 -> (SHRBconst x [c])
// large constant shifts
(Lsh32x64 _ (Const64 [c])) && uint64(c) >= 32 -> (Const32 [0])
(Rsh32Ux64 _ (Const64 [c])) && uint64(c) >= 32 -> (Const32 [0])
(Lsh16x64 _ (Const64 [c])) && uint64(c) >= 16 -> (Const16 [0])
(Rsh16Ux64 _ (Const64 [c])) && uint64(c) >= 16 -> (Const16 [0])
(Lsh8x64 _ (Const64 [c])) && uint64(c) >= 8 -> (Const8 [0])
(Rsh8Ux64 _ (Const64 [c])) && uint64(c) >= 8 -> (Const8 [0])
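For context, rulegen turns each line above into a matcher in the generated rewrite386.go (whose diff is collapsed further down this page). For the 32-bit left shift, the generated function plausibly looks like this (a sketch reconstructed from the generated-code style visible in the rewriteValueAMD64_OpZero excerpt below, not copied from the commit):

	func rewriteValue386_OpLsh32x64(v *Value, config *Config) bool {
		// match: (Lsh32x64 x (Const64 [c]))
		// cond: uint64(c) < 32
		// result: (SHLLconst x [c])
		for {
			x := v.Args[0]
			v_1 := v.Args[1]
			if v_1.Op != OpConst64 {
				break
			}
			c := v_1.AuxInt
			if !(uint64(c) < 32) {
				break
			}
			v.reset(Op386SHLLconst)
			v.AuxInt = c
			v.AddArg(x)
			return true
		}
		// match: (Lsh32x64 _ (Const64 [c]))
		// cond: uint64(c) >= 32
		// result: (Const32 [0])
		for {
			v_1 := v.Args[1]
			if v_1.Op != OpConst64 {
				break
			}
			c := v_1.AuxInt
			if !(uint64(c) >= 32) {
				break
			}
			v.reset(OpConst32)
			v.AuxInt = 0
			return true
		}
		return false
	}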
// Lowering comparisons
(Less32 x y) -> (SETL (CMPL x y))
(Less16 x y) -> (SETL (CMPW x y))
@@ -241,7 +263,6 @@
(Move [s] dst src mem) && SizeAndAlign(s).Size() == 1 -> (MOVBstore dst (MOVBload src mem) mem)
(Move [s] dst src mem) && SizeAndAlign(s).Size() == 2 -> (MOVWstore dst (MOVWload src mem) mem)
(Move [s] dst src mem) && SizeAndAlign(s).Size() == 4 -> (MOVLstore dst (MOVLload src mem) mem)
(Move [s] dst src mem) && SizeAndAlign(s).Size() == 16 -> (MOVOstore dst (MOVOload src mem) mem)
(Move [s] dst src mem) && SizeAndAlign(s).Size() == 3 ->
(MOVBstore [2] dst (MOVBload [2] src mem)
(MOVWstore dst (MOVWload src mem) mem))
@@ -254,21 +275,32 @@
(Move [s] dst src mem) && SizeAndAlign(s).Size() == 7 ->
(MOVLstore [3] dst (MOVLload [3] src mem)
(MOVLstore dst (MOVLload src mem) mem))
(Move [s] dst src mem) && SizeAndAlign(s).Size() == 8 ->
(MOVLstore [4] dst (MOVLload [4] src mem)
(MOVLstore dst (MOVLload src mem) mem))
// Adjust moves to be a multiple of 4 bytes.
(Move [s] dst src mem)
&& SizeAndAlign(s).Size() > 8 && SizeAndAlign(s).Size()%4 != 0 ->
(Move [SizeAndAlign(s).Size()-SizeAndAlign(s).Size()%4]
(ADDLconst <dst.Type> dst [SizeAndAlign(s).Size()%4])
(ADDLconst <src.Type> src [SizeAndAlign(s).Size()%4])
(MOVLstore dst (MOVLload src mem) mem))
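Worked example: a 13-byte Move has 13%4 == 1, so this rule emits one MOVLstore covering bytes [0,4) at the original pointers and re-issues a 12-byte Move at dst+1/src+1. Bytes [1,4) are written twice with identical data, which is harmless because the compiler only emits Move for non-overlapping regions.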
// Medium copying uses a duff device.
(Move [s] dst src mem)
&& SizeAndAlign(s).Size() >= 32 && SizeAndAlign(s).Size() <= 16*64 && SizeAndAlign(s).Size()%16 == 0
&& SizeAndAlign(s).Size() > 8 && SizeAndAlign(s).Size() <= 4*128 && SizeAndAlign(s).Size()%4 == 0
&& !config.noDuffDevice ->
(DUFFCOPY [14*(64-SizeAndAlign(s).Size()/16)] dst src mem)
// 14 and 64 are magic constants. 14 is the number of bytes to encode:
// MOVUPS (SI), X0
// ADDL $16, SI
// MOVUPS X0, (DI)
// ADDL $16, DI
// and 64 is the number of such blocks. See src/runtime/duff_amd64.s:duffcopy.
(DUFFCOPY [10*(128-SizeAndAlign(s).Size()/4)] dst src mem)
// 10 and 128 are magic constants. 10 is the number of bytes to encode:
// MOVL (SI), CX
// ADDL $4, SI
// MOVL CX, (DI)
// ADDL $4, DI
// and 128 is the number of such blocks. See src/runtime/duff_386.s:duffcopy.
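Sanity-checking the arithmetic: a 64-byte copy needs 64/4 == 16 of those 10-byte blocks, so the AuxInt is 10*(128-16) == 1120, meaning the call enters duffcopy 1120 bytes in, skipping 112 blocks and executing only the last 16.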
// Large copying uses REP MOVSL.
(Move [s] dst src mem) && (SizeAndAlign(s).Size() > 16*64 || config.noDuffDevice) && SizeAndAlign(s).Size()%8 == 0 ->
(Move [s] dst src mem) && (SizeAndAlign(s).Size() > 4*128 || config.noDuffDevice) && SizeAndAlign(s).Size()%4 == 0 ->
(REPMOVSL dst src (MOVLconst [SizeAndAlign(s).Size()/4]) mem)
// Lowering Zero instructions
@@ -309,11 +341,22 @@
(MOVLstoreconst [makeValAndOff(0,4)] destptr
(MOVLstoreconst [0] destptr mem))))
// Medium zeroing uses a duff device.
(Zero [s] destptr mem)
&& SizeAndAlign(s).Size() > 16
&& SizeAndAlign(s).Size() <= 4*128
&& SizeAndAlign(s).Size()%4 == 0
&& !config.noDuffDevice ->
(DUFFZERO [1*(128-SizeAndAlign(s).Size()/4)] destptr (MOVLconst [0]) mem)
// 1 and 128 are magic constants. 1 is the number of bytes to encode STOSL.
// 128 is the number of STOSL instructions in duffzero.
// See src/runtime/duff_386.s:duffzero.
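Both duff offsets above are instances of one formula: code bytes per block times the number of blocks to skip. A hypothetical helper (name and factoring invented for illustration; the rules above inline the arithmetic) makes that explicit:

	// duffEntryOffset386 returns how many bytes into a 386 duff routine
	// execution should begin so that exactly size bytes are processed.
	// blockSize is the code size of one 4-byte block: 10 for duffcopy
	// (MOVL/ADDL/MOVL/ADDL), 1 for duffzero (a single STOSL).
	func duffEntryOffset386(blockSize, size int64) int64 {
		const blocks = 128 // both 386 duff routines contain 128 blocks
		return blockSize * (blocks - size/4)
	}

For example, duffEntryOffset386(1, 64) == 112: the zeroer enters 112 bytes in and runs only the final 16 STOSL instructions.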
// Large zeroing uses REP STOSL.
(Zero [s] destptr mem)
&& (SizeAndAlign(s).Size() > 1024 || (config.noDuffDevice && SizeAndAlign(s).Size() > 32))
&& SizeAndAlign(s).Size()%8 == 0 ->
(REPSTOSL destptr (MOVLconst [SizeAndAlign(s).Size()/8]) (MOVLconst [0]) mem)
&& (SizeAndAlign(s).Size() > 4*128 || (config.noDuffDevice && SizeAndAlign(s).Size() > 16))
&& SizeAndAlign(s).Size()%4 == 0 ->
(REPSTOSL destptr (MOVLconst [SizeAndAlign(s).Size()/4]) (MOVLconst [0]) mem)
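So a 4096-byte Zero, for instance, becomes REPSTOSL destptr (MOVLconst [1024]) (MOVLconst [0]) mem: REP STOSL stores the zero word from AX once per count in CX, advancing DI by 4 bytes each iteration.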
// Lowering constants
(Const8 [val]) -> (MOVLconst [val])
@@ -596,14 +639,12 @@
(MOVBload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVBload [off1+off2] {sym} ptr mem)
(MOVSSload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVSSload [off1+off2] {sym} ptr mem)
(MOVSDload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVSDload [off1+off2] {sym} ptr mem)
(MOVOload [off1] {sym} (ADDLconst [off2] ptr) mem) && is32Bit(off1+off2) -> (MOVOload [off1+off2] {sym} ptr mem)
(MOVLstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVLstore [off1+off2] {sym} ptr val mem)
(MOVWstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVWstore [off1+off2] {sym} ptr val mem)
(MOVBstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVBstore [off1+off2] {sym} ptr val mem)
(MOVSSstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVSSstore [off1+off2] {sym} ptr val mem)
(MOVSDstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVSDstore [off1+off2] {sym} ptr val mem)
(MOVOstore [off1] {sym} (ADDLconst [off2] ptr) val mem) && is32Bit(off1+off2) -> (MOVOstore [off1+off2] {sym} ptr val mem)
// Fold constants into stores.
(MOVLstore [off] {sym} ptr (MOVLconst [c]) mem) && validOff(off) ->
@@ -633,8 +674,6 @@
(MOVSSload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVSDload [off1] {sym1} (LEAL [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVSDload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVOload [off1] {sym1} (LEAL [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVOload [off1+off2] {mergeSym(sym1,sym2)} base mem)
(MOVBLSXload [off1] {sym1} (LEAL [off2] {sym2} base) mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVBLSXload [off1+off2] {mergeSym(sym1,sym2)} base mem)
@@ -651,8 +690,6 @@
(MOVSSstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
(MOVSDstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVSDstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
(MOVOstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem) && is32Bit(off1+off2) && canMergeSym(sym1, sym2) ->
(MOVOstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
(MOVLstoreconst [sc] {sym1} (LEAL [off] {sym2} ptr) mem) && canMergeSym(sym1, sym2) && ValAndOff(sc).canAdd(off) ->
(MOVLstoreconst [ValAndOff(sc).add(off)] {mergeSym(sym1, sym2)} ptr mem)
......
@@ -330,8 +330,6 @@ func init() {
{name: "MOVBstore", argLength: 3, reg: gpstore, asm: "MOVB", aux: "SymOff", typ: "Mem"}, // store byte in arg1 to arg0+auxint+aux. arg2=mem
{name: "MOVWstore", argLength: 3, reg: gpstore, asm: "MOVW", aux: "SymOff", typ: "Mem"}, // store 2 bytes in arg1 to arg0+auxint+aux. arg2=mem
{name: "MOVLstore", argLength: 3, reg: gpstore, asm: "MOVL", aux: "SymOff", typ: "Mem"}, // store 4 bytes in arg1 to arg0+auxint+aux. arg2=mem
{name: "MOVOload", argLength: 2, reg: fpload, asm: "MOVUPS", aux: "SymOff", typ: "Int128"}, // load 16 bytes from arg0+auxint+aux. arg1=mem
{name: "MOVOstore", argLength: 3, reg: fpstore, asm: "MOVUPS", aux: "SymOff", typ: "Mem"}, // store 16 bytes in arg1 to arg0+auxint+aux. arg2=mem
// indexed loads/stores
{name: "MOVBloadidx1", argLength: 3, reg: gploadidx, asm: "MOVBLZX", aux: "SymOff"}, // load a byte from arg0+arg1+auxint+aux. arg2=mem
@@ -360,7 +358,7 @@ func init() {
{name: "MOVLstoreconstidx1", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem"}, // store low 4 bytes of ... arg1 ...
{name: "MOVLstoreconstidx4", argLength: 3, reg: gpstoreconstidx, asm: "MOVL", aux: "SymValAndOff", typ: "Mem"}, // store low 4 bytes of ... 4*arg1 ...
// arg0 = (duff-adjusted) pointer to start of memory to zero
// arg0 = pointer to start of memory to zero
// arg1 = value to store (will always be zero)
// arg2 = mem
// auxint = offset into duffzero code to start executing
@@ -370,11 +368,10 @@ func init() {
aux: "Int64",
argLength: 3,
reg: regInfo{
inputs: []regMask{buildReg("DI"), buildReg("X0")},
inputs: []regMask{buildReg("DI"), buildReg("AX")},
clobbers: buildReg("DI FLAGS"),
},
},
{name: "MOVOconst", reg: regInfo{nil, 0, []regMask{fp}}, typ: "Int128", aux: "Int128", rematerializeable: true},
// arg0 = address of memory to zero
// arg1 = # of 4-byte words to zero
@@ -407,7 +404,7 @@ func init() {
argLength: 3,
reg: regInfo{
inputs: []regMask{buildReg("DI"), buildReg("SI")},
clobbers: buildReg("DI SI X0 FLAGS"), // uses X0 as a temporary
clobbers: buildReg("DI SI CX FLAGS"), // uses CX as a temporary
},
},
......
@@ -400,8 +400,8 @@
(Zero [SizeAndAlign(s).Size()-8] (ADDQconst [8] destptr) (MOVQstore destptr (MOVQconst [0]) mem))
(Zero [s] destptr mem)
&& SizeAndAlign(s).Size() <= 1024 && SizeAndAlign(s).Size()%16 == 0 && !config.noDuffDevice ->
(DUFFZERO [duffStart(SizeAndAlign(s).Size())]
(ADDQconst [duffAdj(SizeAndAlign(s).Size())] destptr) (MOVOconst [0])
(DUFFZERO [duffStartAMD64(SizeAndAlign(s).Size())]
(ADDQconst [duffAdjAMD64(SizeAndAlign(s).Size())] destptr) (MOVOconst [0])
mem)
// Large zeroing uses REP STOSQ.
......
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This file contains rules to decompose [u]int32 types on 32-bit
// This file contains rules to decompose [u]int64 types on 32-bit
// architectures. These rules work together with the decomposeBuiltIn
// pass which handles phis of these types.
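For context (paraphrased from dec64.rules, not part of this diff): that file represents each 64-bit value on a 32-bit target as an Int64Make of two 32-bit halves, with accessor rules such as (Int64Hi (Int64Make hi _)) -> hi, and lowers arithmetic like Add64 into pairs of 32-bit add-with-carry ops. The Zeromask lowering added above exists to handle ops that these rules produce.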
......
@@ -285,8 +285,6 @@ const (
Op386MOVBstore
Op386MOVWstore
Op386MOVLstore
Op386MOVOload
Op386MOVOstore
Op386MOVBloadidx1
Op386MOVWloadidx1
Op386MOVWloadidx2
@@ -306,7 +304,6 @@ const (
Op386MOVLstoreconstidx1
Op386MOVLstoreconstidx4
Op386DUFFZERO
Op386MOVOconst
Op386REPSTOSL
Op386CALLstatic
Op386CALLclosure
@@ -3152,32 +3149,6 @@ var opcodeTable = [...]opInfo{
},
},
},
{
name: "MOVOload",
auxType: auxSymOff,
argLen: 2,
asm: x86.AMOVUPS,
reg: regInfo{
inputs: []inputInfo{
{0, 65791}, // AX CX DX BX SP BP SI DI SB
},
outputs: []outputInfo{
{0, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7
},
},
},
{
name: "MOVOstore",
auxType: auxSymOff,
argLen: 3,
asm: x86.AMOVUPS,
reg: regInfo{
inputs: []inputInfo{
{1, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7
{0, 65791}, // AX CX DX BX SP BP SI DI SB
},
},
},
{
name: "MOVBloadidx1",
auxType: auxSymOff,
@@ -3418,22 +3389,11 @@ var opcodeTable = [...]opInfo{
reg: regInfo{
inputs: []inputInfo{
{0, 128}, // DI
{1, 256}, // X0
{1, 1}, // AX
},
clobbers: 131200, // DI FLAGS
},
},
{
name: "MOVOconst",
auxType: auxInt128,
argLen: 0,
rematerializeable: true,
reg: regInfo{
outputs: []outputInfo{
{0, 65280}, // X0 X1 X2 X3 X4 X5 X6 X7
},
},
},
{
name: "REPSTOSL",
argLen: 4,
@@ -3502,7 +3462,7 @@ var opcodeTable = [...]opInfo{
{0, 128}, // DI
{1, 64}, // SI
},
clobbers: 131520, // SI DI X0 FLAGS
clobbers: 131266, // CX SI DI FLAGS
},
},
{
......
@@ -254,39 +254,38 @@ func isSamePtr(p1, p2 *Value) bool {
return false
}
// DUFFZERO consists of repeated blocks of 4 MOVUPSs + ADD,
// See runtime/mkduff.go.
const (
dzBlocks = 16 // number of MOV/ADD blocks
dzBlockLen = 4 // number of clears per block
dzBlockSize = 19 // size of instructions in a single block
dzMovSize = 4 // size of single MOV instruction w/ offset
dzAddSize = 4 // size of single ADD instruction
dzClearStep = 16 // number of bytes cleared by each MOV instruction
dzTailLen = 4 // number of final STOSQ instructions
dzTailSize = 2 // size of single STOSQ instruction
dzClearLen = dzClearStep * dzBlockLen // bytes cleared by one block
dzSize = dzBlocks * dzBlockSize
)
func duffStart(size int64) int64 {
x, _ := duff(size)
func duffStartAMD64(size int64) int64 {
x, _ := duffAMD64(size)
return x
}
func duffAdj(size int64) int64 {
_, x := duff(size)
func duffAdjAMD64(size int64) int64 {
_, x := duffAMD64(size)
return x
}
// duff returns the offset (from duffzero, in bytes) and pointer adjust (in bytes)
// required to use the duffzero mechanism for a block of the given size.
func duff(size int64) (int64, int64) {
func duffAMD64(size int64) (int64, int64) {
// DUFFZERO consists of repeated blocks of 4 MOVUPSs + ADD,
// See runtime/mkduff.go.
const (
dzBlocks = 16 // number of MOV/ADD blocks
dzBlockLen = 4 // number of clears per block
dzBlockSize = 19 // size of instructions in a single block
dzMovSize = 4 // size of single MOV instruction w/ offset
dzAddSize = 4 // size of single ADD instruction
dzClearStep = 16 // number of bytes cleared by each MOV instruction
dzTailLen = 4 // number of final STOSQ instructions
dzTailSize = 2 // size of single STOSQ instruction
dzClearLen = dzClearStep * dzBlockLen // bytes cleared by one block
dzSize = dzBlocks * dzBlockSize
)
if size < 32 || size > 1024 || size%dzClearStep != 0 {
panic("bad duffzero size")
}
// TODO: arch-dependent
steps := size / dzClearStep
blocks := steps / dzBlockLen
steps %= dzBlockLen
......
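The duff → duffAMD64 rename matters because these layout constants are genuinely AMD64-specific: 16 blocks of 19 code bytes, each clearing 64 bytes with four MOVUPS stores, plus a STOSQ tail, so callers need both an entry offset and a pointer adjustment. The 386 duffzero is 128 one-byte STOSL instructions, which is why 386.rules can write its entry offset inline as 1*(128-size/4) with no adjustment at all. As a spot check, duffAMD64(1024) should yield offset 0 and adjustment 0: 1024 bytes is exactly 16 full blocks, so execution starts at the top of the routine.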
This diff is collapsed.
@@ -17415,7 +17415,7 @@ func rewriteValueAMD64_OpZero(v *Value, config *Config) bool {
}
// match: (Zero [s] destptr mem)
// cond: SizeAndAlign(s).Size() <= 1024 && SizeAndAlign(s).Size()%16 == 0 && !config.noDuffDevice
// result: (DUFFZERO [duffStart(SizeAndAlign(s).Size())] (ADDQconst [duffAdj(SizeAndAlign(s).Size())] destptr) (MOVOconst [0]) mem)
// result: (DUFFZERO [duffStartAMD64(SizeAndAlign(s).Size())] (ADDQconst [duffAdjAMD64(SizeAndAlign(s).Size())] destptr) (MOVOconst [0]) mem)
for {
s := v.AuxInt
destptr := v.Args[0]
@@ -17424,9 +17424,9 @@ func rewriteValueAMD64_OpZero(v *Value, config *Config) bool {
break
}
v.reset(OpAMD64DUFFZERO)
v.AuxInt = duffStart(SizeAndAlign(s).Size())
v.AuxInt = duffStartAMD64(SizeAndAlign(s).Size())
v0 := b.NewValue0(v.Line, OpAMD64ADDQconst, config.fe.TypeUInt64())
v0.AuxInt = duffAdj(SizeAndAlign(s).Size())
v0.AuxInt = duffAdjAMD64(SizeAndAlign(s).Size())
v0.AddArg(destptr)
v.AddArg(v0)
v1 := b.NewValue0(v.Line, OpAMD64MOVOconst, TypeInt128)
......
@@ -101,11 +101,14 @@ func storeByType(t ssa.Type) obj.As {
// moveByType returns the reg->reg move instruction of the given type.
func moveByType(t ssa.Type) obj.As {
if t.IsFloat() {
// Moving the whole sse2 register is faster
// than moving just the correct low portion of it.
// There is no xmm->xmm move with 1 byte opcode,
// so use movups, which has 2 byte opcode.
return x86.AMOVUPS
switch t.Size() {
case 4:
return x86.AMOVSS
case 8:
return x86.AMOVSD
default:
panic(fmt.Sprintf("bad float register width %d:%s", t.Size(), t))
}
} else {
switch t.Size() {
case 1:
@@ -115,8 +118,6 @@ func moveByType(t ssa.Type) obj.As {
return x86.AMOVL
case 4:
return x86.AMOVL
case 16:
return x86.AMOVUPS // int128s are in SSE registers
default:
panic(fmt.Sprintf("bad int register width %d:%s", t.Size(), t))
}
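moveByType feeds the register-to-register copy path of ssaGenValue. A minimal sketch of that call site (reconstructed from the surrounding code's style — opregreg and gc.SSARegNum appear elsewhere in this file — not verbatim from the commit):

	// genRegCopy emits a reg->reg move, choosing MOVSS/MOVSD/MOVL etc. by type.
	func genRegCopy(v *ssa.Value) {
		x := gc.SSARegNum(v.Args[0])
		y := gc.SSARegNum(v)
		if x != y {
			opregreg(moveByType(v.Type), y, x)
		}
	}

With the MOVO ops gone from the 386 port, no 16-byte values live in registers anymore, which is why the case 16 arm below is deleted as dead code.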
@@ -448,7 +449,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.From.Val = math.Float64frombits(uint64(v.AuxInt))
p.To.Type = obj.TYPE_REG
p.To.Reg = x
case ssa.Op386MOVSSload, ssa.Op386MOVSDload, ssa.Op386MOVLload, ssa.Op386MOVWload, ssa.Op386MOVBload, ssa.Op386MOVBLSXload, ssa.Op386MOVWLSXload, ssa.Op386MOVOload:
case ssa.Op386MOVSSload, ssa.Op386MOVSDload, ssa.Op386MOVLload, ssa.Op386MOVWload, ssa.Op386MOVBload, ssa.Op386MOVBLSXload, ssa.Op386MOVWLSXload:
p := gc.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_MEM
p.From.Reg = gc.SSARegNum(v.Args[0])
@@ -496,7 +497,7 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
gc.AddAux(&p.From, v)
p.To.Type = obj.TYPE_REG
p.To.Reg = gc.SSARegNum(v)
case ssa.Op386MOVSSstore, ssa.Op386MOVSDstore, ssa.Op386MOVLstore, ssa.Op386MOVWstore, ssa.Op386MOVBstore, ssa.Op386MOVOstore:
case ssa.Op386MOVSSstore, ssa.Op386MOVSDstore, ssa.Op386MOVLstore, ssa.Op386MOVWstore, ssa.Op386MOVBstore:
p := gc.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
p.From.Reg = gc.SSARegNum(v.Args[1])
@@ -584,12 +585,6 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_ADDR
p.To.Sym = gc.Linksym(gc.Pkglookup("duffzero", gc.Runtimepkg))
p.To.Offset = v.AuxInt
case ssa.Op386MOVOconst:
if v.AuxInt != 0 {
v.Unimplementedf("MOVOconst can only do constant=0")
}
r := gc.SSARegNum(v)
opregreg(x86.AXORPS, r, r)
case ssa.Op386DUFFCOPY:
p := gc.Prog(obj.ADUFFCOPY)
p.To.Type = obj.TYPE_ADDR
@@ -828,8 +823,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
case ssa.Op386MOVLload, ssa.Op386MOVWload, ssa.Op386MOVBload,
ssa.Op386MOVLstore, ssa.Op386MOVWstore, ssa.Op386MOVBstore,
ssa.Op386MOVBLSXload, ssa.Op386MOVWLSXload,
ssa.Op386MOVSSload, ssa.Op386MOVSDload, ssa.Op386MOVOload,
ssa.Op386MOVSSstore, ssa.Op386MOVSDstore, ssa.Op386MOVOstore:
ssa.Op386MOVSSload, ssa.Op386MOVSDload,
ssa.Op386MOVSSstore, ssa.Op386MOVSDstore:
if w.Args[0] == v.Args[0] && w.Aux == nil && w.AuxInt >= 0 && w.AuxInt < minZeroPage {
if gc.Debug_checknil != 0 && int(v.Line) > 1 {
gc.Warnl(v.Line, "removed nil check")
......