Commit 9c99512d authored by Ilya Tocar

cmd/compile/internal/ssa: combine consecutive loads and stores on amd64

Sometimes (often for calls) we generate code like this:

MOVQ  (addr),AX
MOVQ  8(addr),BX
MOVQ  AX,(otheraddr)
MOVQ  BX,8(otheraddr)

Replace it with

MOVUPS (addr),X0
MOVUPS X0,(otheraddr)

For completeness, do the same for 8-, 16- and 32-bit loads/stores too.
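
For illustration, the new f76 test in this change exercises the 64-bit case: the two 8-byte results of the inner call are copied into the outer call's argument area, and the rule added here merges that copy into the MOVUPS pair shown above.

func f76(a, b uint64) (uint64, uint64) {
	return f76(f76(a, b))
}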
Shaves 1% from the code sections of the go tool.

/localdisk/itocar/golang/bin/go 10293917
go_old 10334877 [40960 bytes]

read-only data = 682 bytes (0.040769%)
global text (code) = 38961 bytes (1.036503%)
Total difference 39643 bytes (0.674628%)

Updates #6853

Change-Id: I1f0d2f60273a63a079b58927cd1c4e3429d2e7ae
Reviewed-on: https://go-review.googlesource.com/57130
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
parent b40831b1
......@@ -24,9 +24,10 @@ import (
// architecture-specific, and they are grouped in arrays of tests, one
// for each architecture.
//
// Each asmTest consists in a function to be compiled and an array of
// regexps that will be matched to the generated assembly. For
// example, the following amd64 test
// Each asmTest consists of a function to compile, an array of
// positiveRegexps that must match the generated assembly, and an
// array of negativeRegexps that must not match the generated assembly.
// For example, the following amd64 test
//
// {
// `
......@@ -35,10 +36,11 @@ import (
// }
// `,
// []string{"\tSHLQ\t[$]6,"},
// []string{"MULQ"}
// }
//
// verifies that the code the compiler generates for a multiplication
// by 64 contains a 'SHLQ' instruction.
// by 64 contains a 'SHLQ' instruction and does not contain a 'MULQ'.
//
// Since all the tests for a given architecture are dumped in the same
// file, the function names must be unique. As a workaround for this
......@@ -52,6 +54,7 @@ import (
// }
// `,
// []string{"\tSHLQ\t[$]6,"},
// []string{"MULQ"}
// }
//
// Each '$'-function will be given a unique name of form f<N>_<arch>,
......@@ -124,16 +127,22 @@ func funcAsm(t *testing.T, asm string, funcName string) string {
type asmTest struct {
// function to compile
function string
// regexps that must match the generated assembly
regexps []string
// positiveRegexps that must match the generated assembly
positiveRegexps []string
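// negativeRegexps that must not match the generated assembly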
negativeRegexps []string
}
func (at asmTest) verifyAsm(t *testing.T, fa string) {
for _, r := range at.regexps {
for _, r := range at.positiveRegexps {
if b, err := regexp.MatchString(r, fa); !b || err != nil {
t.Errorf("expected:%s\ngo:%s\nasm:%s\n", r, at.function, fa)
}
}
for _, r := range at.negativeRegexps {
if b, err := regexp.MatchString(r, fa); b || err != nil {
t.Errorf("not expected:%s\ngo:%s\nasm:%s\n", r, at.function, fa)
}
}
}
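
For illustration (not part of the change), a minimal standalone sketch of the check verifyAsm performs, using only the standard library regexp package:

package main

import (
	"fmt"
	"regexp"
)

func main() {
	asm := "\tSHLQ\t$6, AX\n"             // stand-in for compiler output
	positive := []string{"\tSHLQ\t\\$6,"} // must match
	negative := []string{"MULQ"}          // must not match
	for _, r := range positive {
		if ok, err := regexp.MatchString(r, asm); !ok || err != nil {
			fmt.Println("expected:", r)
		}
	}
	for _, r := range negative {
		if ok, err := regexp.MatchString(r, asm); ok || err != nil {
			fmt.Println("not expected:", r)
		}
	}
}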
type asmTests struct {
......@@ -214,7 +223,7 @@ var allAsmTests = []*asmTests{
{
arch: "amd64",
os: "linux",
imports: []string{"encoding/binary", "math", "math/bits", "unsafe"},
imports: []string{"encoding/binary", "math", "math/bits", "unsafe", "runtime"},
tests: linuxAMD64Tests,
},
{
......@@ -262,6 +271,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tSHLQ\t\\$6,"},
[]string{},
},
{
`
......@@ -270,6 +280,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tSHLQ\t\\$5,", "\tLEAQ\t\\(.*\\)\\(.*\\*2\\),"},
[]string{},
},
// Load-combining tests.
{
......@@ -279,6 +290,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tMOVQ\t\\(.*\\),"},
[]string{},
},
{
`
......@@ -287,6 +299,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tMOVQ\t\\(.*\\)\\(.*\\*1\\),"},
[]string{},
},
{
`
......@@ -295,6 +308,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tMOVL\t\\(.*\\),"},
[]string{},
},
{
`
......@@ -303,6 +317,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tMOVL\t\\(.*\\)\\(.*\\*1\\),"},
[]string{},
},
{
`
......@@ -311,6 +326,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSWAPQ\t"},
[]string{},
},
{
`
......@@ -319,6 +335,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSWAPQ\t"},
[]string{},
},
{
`
......@@ -327,6 +344,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSWAPQ\t"},
[]string{},
},
{
`
......@@ -335,6 +353,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSWAPQ\t"},
[]string{},
},
{
`
......@@ -343,6 +362,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSWAPL\t"},
[]string{},
},
{
`
......@@ -351,6 +371,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSWAPL\t"},
[]string{},
},
{
`
......@@ -359,6 +380,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSWAPL\t"},
[]string{},
},
{
`
......@@ -367,6 +389,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSWAPL\t"},
[]string{},
},
{
`
......@@ -375,6 +398,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tROLW\t\\$8,"},
[]string{},
},
{
`
......@@ -383,6 +407,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tROLW\t\\$8,"},
[]string{},
},
{
`
......@@ -391,6 +416,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tROLW\t\\$8,"},
[]string{},
},
{
`
......@@ -399,6 +425,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tROLW\t\\$8,"},
[]string{},
},
// Structure zeroing. See issue #18370.
{
......@@ -411,6 +438,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tXORPS\tX., X", "\tMOVUPS\tX., \\(.*\\)", "\tMOVQ\t\\$0, 16\\(.*\\)"},
[]string{},
},
// SSA-able composite literal initialization. Issue 18872.
{
......@@ -424,6 +452,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tMOVQ\t[$]1", "\tMOVQ\t[$]2", "\tMOVQ\t[$]3", "\tMOVQ\t[$]4"},
[]string{},
},
// Also test struct containing pointers (this was special because of write barriers).
{
......@@ -436,6 +465,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tXORPS\tX., X", "\tMOVUPS\tX., \\(.*\\)", "\tMOVQ\t\\$0, 16\\(.*\\)", "\tCALL\truntime\\.writebarrierptr\\(SB\\)"},
[]string{},
},
// Rotate tests
{
......@@ -445,6 +475,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tROLQ\t[$]7,"},
[]string{},
},
{
`
......@@ -453,6 +484,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tROLQ\t[$]7,"},
[]string{},
},
{
`
......@@ -461,6 +493,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tROLQ\t[$]7,"},
[]string{},
},
{
`
......@@ -469,6 +502,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tROLL\t[$]7,"},
[]string{},
},
{
`
......@@ -477,6 +511,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tROLL\t[$]7,"},
[]string{},
},
{
`
......@@ -485,6 +520,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tROLL\t[$]7,"},
[]string{},
},
{
`
......@@ -493,6 +529,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tROLW\t[$]7,"},
[]string{},
},
{
`
......@@ -501,6 +538,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tROLW\t[$]7,"},
[]string{},
},
{
`
......@@ -509,6 +547,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tROLW\t[$]7,"},
[]string{},
},
{
`
......@@ -517,6 +556,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tROLB\t[$]7,"},
[]string{},
},
{
`
......@@ -525,6 +565,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tROLB\t[$]7,"},
[]string{},
},
{
`
......@@ -533,6 +574,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tROLB\t[$]7,"},
[]string{},
},
// Rotate after inlining (see issue 18254).
{
......@@ -545,6 +587,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tROLL\t[$]7,"},
[]string{},
},
{
`
......@@ -553,6 +596,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tMOVQ\t[$]5,"},
[]string{},
},
// Direct use of constants in fast map access calls. Issue 19015.
{
......@@ -563,6 +607,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tMOVQ\t[$]5,"},
[]string{},
},
{
`
......@@ -571,6 +616,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\"abc\""},
[]string{},
},
{
`
......@@ -580,6 +626,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\"abc\""},
[]string{},
},
// Bit test ops on amd64, issue 18943.
{
......@@ -592,6 +639,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBTQ\t"},
[]string{},
},
{
`
......@@ -600,6 +648,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBTQ\t"},
[]string{},
},
{
`
......@@ -611,6 +660,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBTQ\t\\$60"},
[]string{},
},
{
`
......@@ -619,6 +669,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBTQ\t\\$60"},
[]string{},
},
// Intrinsic tests for math/bits
{
......@@ -628,6 +679,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSFQ\t", "\tMOVL\t\\$64,", "\tCMOVQEQ\t"},
[]string{},
},
{
`
......@@ -636,6 +688,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSFQ\t", "\tORQ\t[^$]", "\tMOVQ\t\\$4294967296,"},
[]string{},
},
{
`
......@@ -644,6 +697,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSFQ\t", "\tORQ\t\\$65536,"},
[]string{},
},
{
`
......@@ -652,6 +706,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSFQ\t", "\tORQ\t\\$256,"},
[]string{},
},
{
`
......@@ -660,6 +715,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSWAPQ\t"},
[]string{},
},
{
`
......@@ -668,6 +724,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSWAPL\t"},
[]string{},
},
{
`
......@@ -676,6 +733,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tROLW\t\\$8,"},
[]string{},
},
{
`
......@@ -684,6 +742,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSRQ\t"},
[]string{},
},
{
`
......@@ -692,6 +751,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSRQ\t"},
[]string{},
},
{
`
......@@ -700,6 +760,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSRQ\t"},
[]string{},
},
/* see ssa.go
{
......@@ -709,6 +770,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSRQ\t"},
[]string{},
},
*/
{
......@@ -718,6 +780,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSRQ\t"},
[]string{},
},
{
`
......@@ -726,6 +789,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSRQ\t"},
[]string{},
},
{
`
......@@ -734,6 +798,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSRQ\t"},
[]string{},
},
{
`
......@@ -742,6 +807,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSRQ\t"},
[]string{},
},
/* see ssa.go
{
......@@ -751,6 +817,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSRQ\t"},
[]string{},
},
*/
{
......@@ -760,6 +827,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tBSRQ\t"},
[]string{},
},
{
`
......@@ -767,6 +835,7 @@ var linuxAMD64Tests = []*asmTest{
return bits.OnesCount64(x)
}`,
[]string{"\tPOPCNTQ\t", "support_popcnt"},
[]string{},
},
{
`
......@@ -774,6 +843,7 @@ var linuxAMD64Tests = []*asmTest{
return bits.OnesCount32(x)
}`,
[]string{"\tPOPCNTL\t", "support_popcnt"},
[]string{},
},
{
`
......@@ -781,6 +851,7 @@ var linuxAMD64Tests = []*asmTest{
return bits.OnesCount16(x)
}`,
[]string{"\tPOPCNTL\t", "support_popcnt"},
[]string{},
},
{
`
......@@ -788,6 +859,7 @@ var linuxAMD64Tests = []*asmTest{
return bits.OnesCount(x)
}`,
[]string{"\tPOPCNTQ\t", "support_popcnt"},
[]string{},
},
// multiplication merging tests
{
......@@ -796,6 +868,7 @@ var linuxAMD64Tests = []*asmTest{
return 15*n + 31*n
}`,
[]string{"\tIMULQ\t[$]46"}, // 46*n
[]string{},
},
{
`
......@@ -803,6 +876,7 @@ var linuxAMD64Tests = []*asmTest{
return 5*n + 7*(n+1) + 11*(n+2)
}`,
[]string{"\tIMULQ\t[$]23", "\tADDQ\t[$]29"}, // 23*n + 29
[]string{},
},
{
`
......@@ -810,6 +884,7 @@ var linuxAMD64Tests = []*asmTest{
return a*n + 19*n
}`,
[]string{"\tADDQ\t[$]19", "\tIMULQ"}, // (a+19)*n
[]string{},
},
// see issue 19595.
......@@ -821,6 +896,7 @@ var linuxAMD64Tests = []*asmTest{
*q += x
}`,
[]string{"\tADDQ\t\\("},
[]string{},
},
{
`
......@@ -831,6 +907,7 @@ var linuxAMD64Tests = []*asmTest{
}
}`,
[]string{"\tADDQ\t[A-Z]"},
[]string{},
},
// Floating-point strength reduction
{
......@@ -839,6 +916,7 @@ var linuxAMD64Tests = []*asmTest{
return f * 2.0
}`,
[]string{"\tADDSD\t"},
[]string{},
},
{
`
......@@ -846,6 +924,7 @@ var linuxAMD64Tests = []*asmTest{
return f / 16.0
}`,
[]string{"\tMULSD\t"},
[]string{},
},
{
`
......@@ -853,6 +932,7 @@ var linuxAMD64Tests = []*asmTest{
return f / 0.125
}`,
[]string{"\tMULSD\t"},
[]string{},
},
{
`
......@@ -860,6 +940,7 @@ var linuxAMD64Tests = []*asmTest{
return f / 0.5
}`,
[]string{"\tADDSD\t"},
[]string{},
},
// Check that compare to constant string uses 2/4/8 byte compares
{
......@@ -868,6 +949,7 @@ var linuxAMD64Tests = []*asmTest{
return a == "xx"
}`,
[]string{"\tCMPW\t[A-Z]"},
[]string{},
},
{
`
......@@ -875,6 +957,7 @@ var linuxAMD64Tests = []*asmTest{
return a == "xxxx"
}`,
[]string{"\tCMPL\t[A-Z]"},
[]string{},
},
{
`
......@@ -882,6 +965,7 @@ var linuxAMD64Tests = []*asmTest{
return a == "xxxxxxxx"
}`,
[]string{"\tCMPQ\t[A-Z]"},
[]string{},
},
// Non-constant rotate
{
......@@ -890,6 +974,7 @@ var linuxAMD64Tests = []*asmTest{
return x << z | x >> (64-z)
}`,
[]string{"\tROLQ\t"},
[]string{},
},
{
`func rot64r(x uint64, y int) uint64 {
......@@ -897,6 +982,7 @@ var linuxAMD64Tests = []*asmTest{
return x >> z | x << (64-z)
}`,
[]string{"\tRORQ\t"},
[]string{},
},
{
`func rot32l(x uint32, y int) uint32 {
......@@ -904,6 +990,7 @@ var linuxAMD64Tests = []*asmTest{
return x << z | x >> (32-z)
}`,
[]string{"\tROLL\t"},
[]string{},
},
{
`func rot32r(x uint32, y int) uint32 {
......@@ -911,6 +998,7 @@ var linuxAMD64Tests = []*asmTest{
return x >> z | x << (32-z)
}`,
[]string{"\tRORL\t"},
[]string{},
},
{
`func rot16l(x uint16, y int) uint16 {
......@@ -918,6 +1006,7 @@ var linuxAMD64Tests = []*asmTest{
return x << z | x >> (16-z)
}`,
[]string{"\tROLW\t"},
[]string{},
},
{
`func rot16r(x uint16, y int) uint16 {
......@@ -925,6 +1014,7 @@ var linuxAMD64Tests = []*asmTest{
return x >> z | x << (16-z)
}`,
[]string{"\tRORW\t"},
[]string{},
},
{
`func rot8l(x uint8, y int) uint8 {
......@@ -932,6 +1022,7 @@ var linuxAMD64Tests = []*asmTest{
return x << z | x >> (8-z)
}`,
[]string{"\tROLB\t"},
[]string{},
},
{
`func rot8r(x uint8, y int) uint8 {
......@@ -939,6 +1030,7 @@ var linuxAMD64Tests = []*asmTest{
return x >> z | x << (8-z)
}`,
[]string{"\tRORB\t"},
[]string{},
},
// Check that array compare uses 2/4/8 byte compares
{
......@@ -947,6 +1039,7 @@ var linuxAMD64Tests = []*asmTest{
return a == b
}`,
[]string{"\tCMPW\t[A-Z]"},
[]string{},
},
{
`
......@@ -954,6 +1047,7 @@ var linuxAMD64Tests = []*asmTest{
return a == b
}`,
[]string{"\tCMPL\t[A-Z]"},
[]string{},
},
{
`
......@@ -961,6 +1055,7 @@ var linuxAMD64Tests = []*asmTest{
return a == b
}`,
[]string{"\tCMPQ\t[A-Z]"},
[]string{},
},
{
`
......@@ -968,6 +1063,7 @@ var linuxAMD64Tests = []*asmTest{
return *((*[4]byte)(a)) != *((*[4]byte)(b))
}`,
[]string{"\tCMPL\t[A-Z]"},
[]string{},
},
{
// make sure assembly output has matching offset and base register.
......@@ -979,6 +1075,56 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"b\\+40\\(SP\\)"},
[]string{},
},
{
// check load combining
`
func f73(a, b byte) (byte, byte) {
return f73(f73(a, b))
}
`,
[]string{"\tMOVW\t"},
[]string{},
},
{
`
func f74(a, b uint16) (uint16, uint16) {
return f74(f74(a, b))
}
`,
[]string{"\tMOVL\t"},
[]string{},
},
{
`
func f75(a, b uint32) (uint32, uint32) {
return f75(f75(a, b))
}
`,
[]string{"\tMOVQ\t"},
[]string{},
},
{
`
func f76(a, b uint64) (uint64, uint64) {
return f76(f76(a, b))
}
`,
[]string{"\tMOVUPS\t"},
[]string{},
},
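// (f73-f76 cover each width: byte pairs merge into MOVW, uint16 into MOVL,
// uint32 into MOVQ, and uint64 into a MOVUPS pair.)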
// Make sure we don't put pointers in SSE registers across safe points
// (the garbage collector does not scan SSE registers, so a pointer held
// there across a safepoint could be missed).
{
`
func $(p, q *[2]*int) {
a, b := p[0], p[1]
runtime.GC()
q[0], q[1] = a, b
}
`,
[]string{},
[]string{"MOVUPS"},
},
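// (Note that this test relies solely on negativeRegexps: the positive list
// is empty, and the "MOVUPS" negative asserts that the 16-byte combine does
// not fire when a runtime.GC() safepoint sits between the loads and stores.)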
{
// check that stack store is optimized away
......@@ -989,6 +1135,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"TEXT\t.*, [$]0-8"},
[]string{},
},
// math.Abs using integer registers
{
......@@ -998,6 +1145,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tSHLQ\t[$]1,", "\tSHRQ\t[$]1,"},
[]string{},
},
// math.Copysign using integer registers
{
......@@ -1007,6 +1155,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tSHLQ\t[$]1,", "\tSHRQ\t[$]1,", "\tSHRQ\t[$]63,", "\tSHLQ\t[$]63,", "\tORQ\t"},
[]string{},
},
// int <-> fp moves
{
......@@ -1016,6 +1165,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tMOVQ\tX.*, [^X].*"},
[]string{},
},
{
`
......@@ -1024,6 +1174,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tMOVL\tX.*, [^X].*"},
[]string{},
},
{
`
......@@ -1032,6 +1183,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tMOVQ\t[^X].*, X.*"},
[]string{},
},
{
`
......@@ -1040,6 +1192,7 @@ var linuxAMD64Tests = []*asmTest{
}
`,
[]string{"\tMOVL\t[^X].*, X.*"},
[]string{},
},
}
......@@ -1051,6 +1204,7 @@ var linux386Tests = []*asmTest{
}
`,
[]string{"\tMOVL\t\\(.*\\),"},
[]string{},
},
{
`
......@@ -1059,6 +1213,7 @@ var linux386Tests = []*asmTest{
}
`,
[]string{"\tMOVL\t\\(.*\\)\\(.*\\*1\\),"},
[]string{},
},
// multiplication merging tests
......@@ -1068,6 +1223,7 @@ var linux386Tests = []*asmTest{
return 9*n + 14*n
}`,
[]string{"\tIMULL\t[$]23"}, // 23*n
[]string{},
},
{
`
......@@ -1075,6 +1231,7 @@ var linux386Tests = []*asmTest{
return 19*a + a*n
}`,
[]string{"\tADDL\t[$]19", "\tIMULL"}, // (n+19)*a
[]string{},
},
{
// check that stack store is optimized away
......@@ -1085,6 +1242,7 @@ var linux386Tests = []*asmTest{
}
`,
[]string{"TEXT\t.*, [$]0-4"},
[]string{},
},
}
......@@ -1096,6 +1254,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tMOVWBR\t\\(.*\\),"},
[]string{},
},
{
`
......@@ -1104,6 +1263,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tMOVWBR\t\\(.*\\)\\(.*\\*1\\),"},
[]string{},
},
{
`
......@@ -1112,6 +1272,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tMOVDBR\t\\(.*\\),"},
[]string{},
},
{
`
......@@ -1120,6 +1281,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tMOVDBR\t\\(.*\\)\\(.*\\*1\\),"},
[]string{},
},
{
`
......@@ -1128,6 +1290,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tMOVWZ\t\\(.*\\),"},
[]string{},
},
{
`
......@@ -1136,6 +1299,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tMOVWZ\t\\(.*\\)\\(.*\\*1\\),"},
[]string{},
},
{
`
......@@ -1144,6 +1308,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tMOVD\t\\(.*\\),"},
[]string{},
},
{
`
......@@ -1152,6 +1317,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tMOVD\t\\(.*\\)\\(.*\\*1\\),"},
[]string{},
},
{
`
......@@ -1160,6 +1326,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tRLLG\t[$]7,"},
[]string{},
},
{
`
......@@ -1168,6 +1335,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tRLLG\t[$]7,"},
[]string{},
},
{
`
......@@ -1176,6 +1344,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tRLLG\t[$]7,"},
[]string{},
},
{
`
......@@ -1184,6 +1353,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tRLL\t[$]7,"},
[]string{},
},
{
`
......@@ -1192,6 +1362,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tRLL\t[$]7,"},
[]string{},
},
{
`
......@@ -1200,6 +1371,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tRLL\t[$]7,"},
[]string{},
},
// Fused multiply-add/sub instructions.
{
......@@ -1209,6 +1381,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tFMADD\t"},
[]string{},
},
{
`
......@@ -1217,6 +1390,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tFMSUB\t"},
[]string{},
},
{
`
......@@ -1225,6 +1399,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tFMADDS\t"},
[]string{},
},
{
`
......@@ -1233,6 +1408,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tFMSUBS\t"},
[]string{},
},
// Intrinsic tests for math/bits
{
......@@ -1242,6 +1418,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tFLOGR\t"},
[]string{},
},
{
`
......@@ -1250,6 +1427,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tFLOGR\t", "\tMOVWZ\t"},
[]string{},
},
{
`
......@@ -1258,6 +1436,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tFLOGR\t", "\tOR\t\\$65536,"},
[]string{},
},
{
`
......@@ -1266,6 +1445,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tFLOGR\t", "\tOR\t\\$256,"},
[]string{},
},
// Intrinsic tests for math/bits
{
......@@ -1275,6 +1455,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tMOVDBR\t"},
[]string{},
},
{
`
......@@ -1283,6 +1464,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tMOVWBR\t"},
[]string{},
},
{
`
......@@ -1291,6 +1473,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tFLOGR\t"},
[]string{},
},
{
`
......@@ -1299,6 +1482,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tFLOGR\t"},
[]string{},
},
{
`
......@@ -1307,6 +1491,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tFLOGR\t"},
[]string{},
},
{
`
......@@ -1315,6 +1500,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tFLOGR\t"},
[]string{},
},
{
`
......@@ -1323,6 +1509,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tFLOGR\t"},
[]string{},
},
{
`
......@@ -1331,6 +1518,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tFLOGR\t"},
[]string{},
},
{
`
......@@ -1339,6 +1527,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tFLOGR\t"},
[]string{},
},
{
`
......@@ -1347,6 +1536,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tFLOGR\t"},
[]string{},
},
{
`
......@@ -1355,6 +1545,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tFLOGR\t"},
[]string{},
},
{
`
......@@ -1363,6 +1554,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"\tFLOGR\t"},
[]string{},
},
{
// check that stack store is optimized away
......@@ -1373,6 +1565,7 @@ var linuxS390XTests = []*asmTest{
}
`,
[]string{"TEXT\t.*, [$]0-8"},
[]string{},
},
}
......@@ -1384,6 +1577,7 @@ var linuxARMTests = []*asmTest{
}
`,
[]string{"\tMOVW\tR[0-9]+@>25,"},
[]string{},
},
{
`
......@@ -1392,6 +1586,7 @@ var linuxARMTests = []*asmTest{
}
`,
[]string{"\tMOVW\tR[0-9]+@>25,"},
[]string{},
},
{
`
......@@ -1400,6 +1595,7 @@ var linuxARMTests = []*asmTest{
}
`,
[]string{"\tMOVW\tR[0-9]+@>25,"},
[]string{},
},
{
`
......@@ -1408,6 +1604,7 @@ var linuxARMTests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1416,6 +1613,7 @@ var linuxARMTests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1424,6 +1622,7 @@ var linuxARMTests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1432,6 +1631,7 @@ var linuxARMTests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1440,6 +1640,7 @@ var linuxARMTests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1448,6 +1649,7 @@ var linuxARMTests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1456,6 +1658,7 @@ var linuxARMTests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1464,6 +1667,7 @@ var linuxARMTests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1472,6 +1676,7 @@ var linuxARMTests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1480,6 +1685,7 @@ var linuxARMTests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
// make sure assembly output has matching offset and base register.
......@@ -1491,6 +1697,7 @@ var linuxARMTests = []*asmTest{
}
`,
[]string{"b\\+4\\(FP\\)"},
[]string{},
},
{
// check that stack store is optimized away
......@@ -1501,6 +1708,7 @@ var linuxARMTests = []*asmTest{
}
`,
[]string{"TEXT\t.*, [$]-4-4"},
[]string{},
},
}
......@@ -1512,6 +1720,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tROR\t[$]57,"},
[]string{},
},
{
`
......@@ -1520,6 +1729,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tROR\t[$]57,"},
[]string{},
},
{
`
......@@ -1528,6 +1738,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tROR\t[$]57,"},
[]string{},
},
{
`
......@@ -1536,6 +1747,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tRORW\t[$]25,"},
[]string{},
},
{
`
......@@ -1544,6 +1756,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tRORW\t[$]25,"},
[]string{},
},
{
`
......@@ -1552,6 +1765,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tRORW\t[$]25,"},
[]string{},
},
{
`
......@@ -1560,6 +1774,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tREV\t"},
[]string{},
},
{
`
......@@ -1568,6 +1783,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tREVW\t"},
[]string{},
},
{
`
......@@ -1576,6 +1792,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1584,6 +1801,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1592,6 +1810,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1600,6 +1819,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1608,6 +1828,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1616,6 +1837,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1624,6 +1846,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1632,6 +1855,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1640,6 +1864,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1648,6 +1873,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1656,6 +1882,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tAND\t"},
[]string{},
},
{
`
......@@ -1664,6 +1891,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tAND\t"},
[]string{},
},
{
// make sure offsets are folded into load and store.
......@@ -1674,6 +1902,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"\tMOVD\t\"\"\\.a\\+[0-9]+\\(FP\\), R[0-9]+", "\tMOVD\tR[0-9]+, \"\"\\.b\\+[0-9]+\\(FP\\)"},
[]string{},
},
{
// check that stack store is optimized away
......@@ -1684,6 +1913,7 @@ var linuxARM64Tests = []*asmTest{
}
`,
[]string{"TEXT\t.*, [$]-8-8"},
[]string{},
},
}
......@@ -1695,6 +1925,7 @@ var linuxMIPSTests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1703,6 +1934,7 @@ var linuxMIPSTests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1711,6 +1943,7 @@ var linuxMIPSTests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1719,6 +1952,7 @@ var linuxMIPSTests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1727,6 +1961,7 @@ var linuxMIPSTests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1735,6 +1970,7 @@ var linuxMIPSTests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1743,6 +1979,7 @@ var linuxMIPSTests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1751,6 +1988,7 @@ var linuxMIPSTests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1759,6 +1997,7 @@ var linuxMIPSTests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
`
......@@ -1767,6 +2006,7 @@ var linuxMIPSTests = []*asmTest{
}
`,
[]string{"\tCLZ\t"},
[]string{},
},
{
// check that stack store is optimized away
......@@ -1777,6 +2017,7 @@ var linuxMIPSTests = []*asmTest{
}
`,
[]string{"TEXT\t.*, [$]-4-4"},
[]string{},
},
}
......@@ -1789,6 +2030,7 @@ var linuxPPC64LETests = []*asmTest{
}
`,
[]string{"\tFMADD\t"},
[]string{},
},
{
`
......@@ -1797,6 +2039,7 @@ var linuxPPC64LETests = []*asmTest{
}
`,
[]string{"\tFMSUB\t"},
[]string{},
},
{
`
......@@ -1805,6 +2048,7 @@ var linuxPPC64LETests = []*asmTest{
}
`,
[]string{"\tFMADDS\t"},
[]string{},
},
{
`
......@@ -1813,6 +2057,7 @@ var linuxPPC64LETests = []*asmTest{
}
`,
[]string{"\tFMSUBS\t"},
[]string{},
},
{
`
......@@ -1821,6 +2066,7 @@ var linuxPPC64LETests = []*asmTest{
}
`,
[]string{"\tROTLW\t"},
[]string{},
},
{
`
......@@ -1829,6 +2075,7 @@ var linuxPPC64LETests = []*asmTest{
}
`,
[]string{"\tROTLW\t"},
[]string{},
},
{
`
......@@ -1837,6 +2084,7 @@ var linuxPPC64LETests = []*asmTest{
}
`,
[]string{"\tROTLW\t"},
[]string{},
},
{
`
......@@ -1845,6 +2093,7 @@ var linuxPPC64LETests = []*asmTest{
}
`,
[]string{"\tROTL\t"},
[]string{},
},
{
`
......@@ -1853,6 +2102,7 @@ var linuxPPC64LETests = []*asmTest{
}
`,
[]string{"\tROTL\t"},
[]string{},
},
{
`
......@@ -1861,6 +2111,7 @@ var linuxPPC64LETests = []*asmTest{
}
`,
[]string{"\tROTL\t"},
[]string{},
},
{
// check that stack store is optimized away
......@@ -1871,6 +2122,7 @@ var linuxPPC64LETests = []*asmTest{
}
`,
[]string{"TEXT\t.*, [$]0-8"},
[]string{},
},
}
......@@ -2327,6 +2327,58 @@
&& clobber(x)
-> (MOVQstoreidx1 [i-4] {s} p (SHLQconst <idx.Type> [2] idx) w0 mem)
(MOVBstore [i] {s} p
x1:(MOVBload [j] {s2} p2 mem)
mem2:(MOVBstore [i-1] {s} p
x2:(MOVBload [j-1] {s2} p2 mem) mem))
&& x1.Uses == 1
&& x2.Uses == 1
&& mem2.Uses == 1
&& clobber(x1)
&& clobber(x2)
&& clobber(mem2)
-> (MOVWstore [i-1] {s} p (MOVWload [j-1] {s2} p2 mem) mem)
(MOVWstore [i] {s} p
x1:(MOVWload [j] {s2} p2 mem)
mem2:(MOVWstore [i-2] {s} p
x2:(MOVWload [j-2] {s2} p2 mem) mem))
&& x1.Uses == 1
&& x2.Uses == 1
&& mem2.Uses == 1
&& clobber(x1)
&& clobber(x2)
&& clobber(mem2)
-> (MOVLstore [i-2] {s} p (MOVLload [j-2] {s2} p2 mem) mem)
(MOVLstore [i] {s} p
x1:(MOVLload [j] {s2} p2 mem)
mem2:(MOVLstore [i-4] {s} p
x2:(MOVLload [j-4] {s2} p2 mem) mem))
&& x1.Uses == 1
&& x2.Uses == 1
&& mem2.Uses == 1
&& clobber(x1)
&& clobber(x2)
&& clobber(mem2)
-> (MOVQstore [i-4] {s} p (MOVQload [j-4] {s2} p2 mem) mem)
// This is somewhat tricky. There may be pointers in SSE registers due to the rule below.
// However, those registers shouldn't live across a GC safepoint.
(MOVQstore [i] {s} p
x1:(MOVQload [j] {s2} p2 mem)
mem2:(MOVQstore [i-8] {s} p
x2:(MOVQload [j-8] {s2} p2 mem) mem))
&& x1.Uses == 1
&& x2.Uses == 1
&& mem2.Uses == 1
&& config.useSSE
&& clobber(x1)
&& clobber(x2)
&& clobber(mem2)
-> (MOVOstore [i-8] {s} p (MOVOload [j-8] {s2} p2 mem) mem)
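// Illustrative effect of the byte rule in this group (registers and
// offsets are made up; the real operands come from the matched values):
//   MOVB (SI), AX    MOVB AX, (DI)    // store at i-1 of load at j-1
//   MOVB 1(SI), BX   MOVB BX, 1(DI)   // store at i   of load at j
// becomes
//   MOVW (SI), AX    MOVW AX, (DI)
// The wider rules chain the same way: MOVW->MOVL, MOVL->MOVQ, and
// MOVQ->MOVO (the last one guarded by config.useSSE).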
// amd64p32 rules
// same as the rules above, but with 32-bit instead of 64-bit pointer arithmetic.
// LEAQ,ADDQ -> LEAL,ADDL
......@@ -154,7 +154,7 @@ func rewriteValueAMD64(v *Value) bool {
case OpAMD64MOVQloadidx8:
return rewriteValueAMD64_OpAMD64MOVQloadidx8_0(v)
case OpAMD64MOVQstore:
return rewriteValueAMD64_OpAMD64MOVQstore_0(v)
return rewriteValueAMD64_OpAMD64MOVQstore_0(v) || rewriteValueAMD64_OpAMD64MOVQstore_10(v)
case OpAMD64MOVQstoreconst:
return rewriteValueAMD64_OpAMD64MOVQstoreconst_0(v)
case OpAMD64MOVQstoreconstidx1:
......@@ -5690,6 +5690,10 @@ func rewriteValueAMD64_OpAMD64MOVBstore_0(v *Value) bool {
return false
}
func rewriteValueAMD64_OpAMD64MOVBstore_10(v *Value) bool {
b := v.Block
_ = b
typ := &b.Func.Config.Types
_ = typ
// match: (MOVBstore [i] {s} p (SHRQconst [8] w) x:(MOVBstore [i-1] {s} p w mem))
// cond: x.Uses == 1 && clobber(x)
// result: (MOVWstore [i-1] {s} p w mem)
......@@ -5785,6 +5789,73 @@ func rewriteValueAMD64_OpAMD64MOVBstore_10(v *Value) bool {
v.AddArg(mem)
return true
}
// match: (MOVBstore [i] {s} p x1:(MOVBload [j] {s2} p2 mem) mem2:(MOVBstore [i-1] {s} p x2:(MOVBload [j-1] {s2} p2 mem) mem))
// cond: x1.Uses == 1 && x2.Uses == 1 && mem2.Uses == 1 && clobber(x1) && clobber(x2) && clobber(mem2)
// result: (MOVWstore [i-1] {s} p (MOVWload [j-1] {s2} p2 mem) mem)
for {
i := v.AuxInt
s := v.Aux
_ = v.Args[2]
p := v.Args[0]
x1 := v.Args[1]
if x1.Op != OpAMD64MOVBload {
break
}
j := x1.AuxInt
s2 := x1.Aux
_ = x1.Args[1]
p2 := x1.Args[0]
mem := x1.Args[1]
mem2 := v.Args[2]
if mem2.Op != OpAMD64MOVBstore {
break
}
if mem2.AuxInt != i-1 {
break
}
if mem2.Aux != s {
break
}
_ = mem2.Args[2]
if p != mem2.Args[0] {
break
}
x2 := mem2.Args[1]
if x2.Op != OpAMD64MOVBload {
break
}
if x2.AuxInt != j-1 {
break
}
if x2.Aux != s2 {
break
}
_ = x2.Args[1]
if p2 != x2.Args[0] {
break
}
if mem != x2.Args[1] {
break
}
if mem != mem2.Args[2] {
break
}
if !(x1.Uses == 1 && x2.Uses == 1 && mem2.Uses == 1 && clobber(x1) && clobber(x2) && clobber(mem2)) {
break
}
v.reset(OpAMD64MOVWstore)
v.AuxInt = i - 1
v.Aux = s
v.AddArg(p)
v0 := b.NewValue0(v.Pos, OpAMD64MOVWload, typ.UInt16)
v0.AuxInt = j - 1
v0.Aux = s2
v0.AddArg(p2)
v0.AddArg(mem)
v.AddArg(v0)
v.AddArg(mem)
return true
}
// match: (MOVBstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem)
// cond: canMergeSym(sym1, sym2)
// result: (MOVBstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
......@@ -7810,6 +7881,77 @@ func rewriteValueAMD64_OpAMD64MOVLstore_0(v *Value) bool {
return false
}
func rewriteValueAMD64_OpAMD64MOVLstore_10(v *Value) bool {
b := v.Block
_ = b
typ := &b.Func.Config.Types
_ = typ
// match: (MOVLstore [i] {s} p x1:(MOVLload [j] {s2} p2 mem) mem2:(MOVLstore [i-4] {s} p x2:(MOVLload [j-4] {s2} p2 mem) mem))
// cond: x1.Uses == 1 && x2.Uses == 1 && mem2.Uses == 1 && clobber(x1) && clobber(x2) && clobber(mem2)
// result: (MOVQstore [i-4] {s} p (MOVQload [j-4] {s2} p2 mem) mem)
for {
i := v.AuxInt
s := v.Aux
_ = v.Args[2]
p := v.Args[0]
x1 := v.Args[1]
if x1.Op != OpAMD64MOVLload {
break
}
j := x1.AuxInt
s2 := x1.Aux
_ = x1.Args[1]
p2 := x1.Args[0]
mem := x1.Args[1]
mem2 := v.Args[2]
if mem2.Op != OpAMD64MOVLstore {
break
}
if mem2.AuxInt != i-4 {
break
}
if mem2.Aux != s {
break
}
_ = mem2.Args[2]
if p != mem2.Args[0] {
break
}
x2 := mem2.Args[1]
if x2.Op != OpAMD64MOVLload {
break
}
if x2.AuxInt != j-4 {
break
}
if x2.Aux != s2 {
break
}
_ = x2.Args[1]
if p2 != x2.Args[0] {
break
}
if mem != x2.Args[1] {
break
}
if mem != mem2.Args[2] {
break
}
if !(x1.Uses == 1 && x2.Uses == 1 && mem2.Uses == 1 && clobber(x1) && clobber(x2) && clobber(mem2)) {
break
}
v.reset(OpAMD64MOVQstore)
v.AuxInt = i - 4
v.Aux = s
v.AddArg(p)
v0 := b.NewValue0(v.Pos, OpAMD64MOVQload, typ.UInt64)
v0.AuxInt = j - 4
v0.Aux = s2
v0.AddArg(p2)
v0.AddArg(mem)
v.AddArg(v0)
v.AddArg(mem)
return true
}
// match: (MOVLstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem)
// cond: canMergeSym(sym1, sym2)
// result: (MOVLstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
......@@ -9345,6 +9487,10 @@ func rewriteValueAMD64_OpAMD64MOVQloadidx8_0(v *Value) bool {
return false
}
func rewriteValueAMD64_OpAMD64MOVQstore_0(v *Value) bool {
b := v.Block
_ = b
config := b.Func.Config
_ = config
// match: (MOVQstore [off1] {sym} (ADDQconst [off2] ptr) val mem)
// cond: is32Bit(off1+off2)
// result: (MOVQstore [off1+off2] {sym} ptr val mem)
......@@ -9510,6 +9656,73 @@ func rewriteValueAMD64_OpAMD64MOVQstore_0(v *Value) bool {
v.AddArg(mem)
return true
}
// match: (MOVQstore [i] {s} p x1:(MOVQload [j] {s2} p2 mem) mem2:(MOVQstore [i-8] {s} p x2:(MOVQload [j-8] {s2} p2 mem) mem))
// cond: x1.Uses == 1 && x2.Uses == 1 && mem2.Uses == 1 && config.useSSE && clobber(x1) && clobber(x2) && clobber(mem2)
// result: (MOVOstore [i-8] {s} p (MOVOload [j-8] {s2} p2 mem) mem)
for {
i := v.AuxInt
s := v.Aux
_ = v.Args[2]
p := v.Args[0]
x1 := v.Args[1]
if x1.Op != OpAMD64MOVQload {
break
}
j := x1.AuxInt
s2 := x1.Aux
_ = x1.Args[1]
p2 := x1.Args[0]
mem := x1.Args[1]
mem2 := v.Args[2]
if mem2.Op != OpAMD64MOVQstore {
break
}
if mem2.AuxInt != i-8 {
break
}
if mem2.Aux != s {
break
}
_ = mem2.Args[2]
if p != mem2.Args[0] {
break
}
x2 := mem2.Args[1]
if x2.Op != OpAMD64MOVQload {
break
}
if x2.AuxInt != j-8 {
break
}
if x2.Aux != s2 {
break
}
_ = x2.Args[1]
if p2 != x2.Args[0] {
break
}
if mem != x2.Args[1] {
break
}
if mem != mem2.Args[2] {
break
}
if !(x1.Uses == 1 && x2.Uses == 1 && mem2.Uses == 1 && config.useSSE && clobber(x1) && clobber(x2) && clobber(mem2)) {
break
}
v.reset(OpAMD64MOVOstore)
v.AuxInt = i - 8
v.Aux = s
v.AddArg(p)
v0 := b.NewValue0(v.Pos, OpAMD64MOVOload, types.TypeInt128)
v0.AuxInt = j - 8
v0.Aux = s2
v0.AddArg(p2)
v0.AddArg(mem)
v.AddArg(v0)
v.AddArg(mem)
return true
}
// match: (MOVQstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem)
// cond: canMergeSym(sym1, sym2)
// result: (MOVQstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)
......@@ -9602,6 +9815,9 @@ func rewriteValueAMD64_OpAMD64MOVQstore_0(v *Value) bool {
v.AddArg(mem)
return true
}
return false
}
func rewriteValueAMD64_OpAMD64MOVQstore_10(v *Value) bool {
// match: (MOVQstore [off] {sym} ptr (MOVQf2i val) mem)
// cond:
// result: (MOVSDstore [off] {sym} ptr val mem)
......@@ -12334,6 +12550,77 @@ func rewriteValueAMD64_OpAMD64MOVWstore_0(v *Value) bool {
return false
}
func rewriteValueAMD64_OpAMD64MOVWstore_10(v *Value) bool {
b := v.Block
_ = b
typ := &b.Func.Config.Types
_ = typ
// match: (MOVWstore [i] {s} p x1:(MOVWload [j] {s2} p2 mem) mem2:(MOVWstore [i-2] {s} p x2:(MOVWload [j-2] {s2} p2 mem) mem))
// cond: x1.Uses == 1 && x2.Uses == 1 && mem2.Uses == 1 && clobber(x1) && clobber(x2) && clobber(mem2)
// result: (MOVLstore [i-2] {s} p (MOVLload [j-2] {s2} p2 mem) mem)
for {
i := v.AuxInt
s := v.Aux
_ = v.Args[2]
p := v.Args[0]
x1 := v.Args[1]
if x1.Op != OpAMD64MOVWload {
break
}
j := x1.AuxInt
s2 := x1.Aux
_ = x1.Args[1]
p2 := x1.Args[0]
mem := x1.Args[1]
mem2 := v.Args[2]
if mem2.Op != OpAMD64MOVWstore {
break
}
if mem2.AuxInt != i-2 {
break
}
if mem2.Aux != s {
break
}
_ = mem2.Args[2]
if p != mem2.Args[0] {
break
}
x2 := mem2.Args[1]
if x2.Op != OpAMD64MOVWload {
break
}
if x2.AuxInt != j-2 {
break
}
if x2.Aux != s2 {
break
}
_ = x2.Args[1]
if p2 != x2.Args[0] {
break
}
if mem != x2.Args[1] {
break
}
if mem != mem2.Args[2] {
break
}
if !(x1.Uses == 1 && x2.Uses == 1 && mem2.Uses == 1 && clobber(x1) && clobber(x2) && clobber(mem2)) {
break
}
v.reset(OpAMD64MOVLstore)
v.AuxInt = i - 2
v.Aux = s
v.AddArg(p)
v0 := b.NewValue0(v.Pos, OpAMD64MOVLload, typ.UInt32)
v0.AuxInt = j - 2
v0.Aux = s2
v0.AddArg(p2)
v0.AddArg(mem)
v.AddArg(v0)
v.AddArg(mem)
return true
}
// match: (MOVWstore [off1] {sym1} (LEAL [off2] {sym2} base) val mem)
// cond: canMergeSym(sym1, sym2)
// result: (MOVWstore [off1+off2] {mergeSym(sym1,sym2)} base val mem)