Commit 009c002c authored by Ilya Tocar, committed by Russ Cox

cmd/internal/obj/x86: add AVX2 instructions needed for sha1/sha512/sha256 acceleration

This adds: VPSHUFB, VPSHUFD, VPERM2F128, VPALIGNR, VPADDQ, VPADDD, VPSRLDQ,
VPSLLDQ, VPSRLQ, VPSLLQ, VPSRLD, VPSLLD, VPOR, VPBLENDD, VINSERTI128,
VPERM2I128, RORXL, RORXQ.

Change-Id: Ief27190ee6acfa86b109262af5d999bc101e923d
Reviewed-on: https://go-review.googlesource.com/22606
Run-TryBot: Ilya Tocar <ilya.tocar@intel.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Russ Cox <rsc@golang.org>
parent 2e32efc4

src/cmd/asm/internal/arch/amd64.go (new file)
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This file encapsulates some of the odd characteristics of the
// AMD64 instruction set, to minimize its interaction
// with the core of the assembler.

package arch

import (
    "cmd/internal/obj"
    "cmd/internal/obj/x86"
)

// IsAMD4OP reports whether the op (as defined by an x86.A* constant) is
// a 4-operand instruction; in Go assembly these take the form
// op $imm8, reg, reg, reg.
func IsAMD4OP(op obj.As) bool {
    switch op {
    case x86.AVPERM2F128,
        x86.AVPALIGNR,
        x86.AVPERM2I128,
        x86.AVINSERTI128,
        x86.AVPBLENDD:
        return true
    }
    return false
}
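A hedged, self-contained sketch of the check this predicate gates: in Go assembly these VEX ops are spelled with the imm8 first (e.g. VPALIGNR $4, Y2, Y3, Y0), and a three-operand spelling must be rejected before encoding. The is4op map and main are illustrative stand-ins, not part of the commit.

    package main

    import "fmt"

    // is4op mirrors the op set in IsAMD4OP above.
    var is4op = map[string]bool{
        "VPERM2F128": true, "VPALIGNR": true, "VPERM2I128": true,
        "VINSERTI128": true, "VPBLENDD": true,
    }

    func main() {
        // "VPALIGNR $4, Y2, Y3, Y0" parses into an op plus 4 operands;
        // a 3-operand spelling must be caught here, because a missing
        // imm8 would later be indistinguishable from a legal $0.
        op, operands := "VPALIGNR", 3
        if is4op[op] && operands != 4 {
            fmt.Printf("%s: 4 operands required, got %d\n", op, operands)
        }
    }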
src/cmd/asm/internal/asm/asm.go
@@ -568,6 +568,15 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
     prog.From = a[0]
     prog.Reg = p.getRegister(prog, op, &a[1])
     prog.To = a[2]
+case sys.AMD64:
+    // Catch the missing operand here, because we store the immediate
+    // as part of From3, and obj/x86/asm6 cannot distinguish a missing
+    // operand from the legal value 0.
+    if arch.IsAMD4OP(op) {
+        p.errorf("4 operands required, but only 3 are provided for %s instruction", obj.Aconv(op))
+    }
+    prog.From = a[0]
+    prog.From3 = newAddr(a[1])
+    prog.To = a[2]
 case sys.ARM64:
     // ARM64 instructions with one input and two outputs.
     if arch.IsARM64STLXR(op) {
@@ -583,7 +592,7 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
     prog.From = a[0]
     prog.Reg = p.getRegister(prog, op, &a[1])
     prog.To = a[2]
-case sys.AMD64, sys.I386:
+case sys.I386:
     prog.From = a[0]
     prog.From3 = newAddr(a[1])
     prog.To = a[2]
@@ -640,6 +649,23 @@ func (p *Parser) asmInstruction(op obj.As, cond string, a []obj.Addr) {
     prog.Reg = r1
     break
 }
+if p.arch.Family == sys.AMD64 {
+    // 4-operand instructions have the form ymm1, ymm2, ymm3/m256, imm8.
+    // From3 is always just a register, so we store imm8 in its Offset
+    // field to avoid increasing the size of Prog.
+    prog.From = a[1]
+    prog.From3 = newAddr(a[2])
+    if a[0].Type != obj.TYPE_CONST {
+        p.errorf("first operand must be an immediate in %s instruction", obj.Aconv(op))
+    }
+    if prog.From3.Type != obj.TYPE_REG {
+        p.errorf("third operand must be a register in %s instruction", obj.Aconv(op))
+    }
+    prog.From3.Offset = int64(p.getImmediate(prog, op, &a[0]))
+    prog.To = a[3]
+    prog.RegTo2 = -1
+    break
+}
 if p.arch.Family == sys.ARM64 {
     prog.From = a[0]
     prog.Reg = p.getRegister(prog, op, &a[1])
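The trick above is worth spelling out: the imm8 never gets its own operand slot. It is parked in the Offset field of the register-typed From3 and recovered at encoding time. A minimal stand-alone sketch (the addr type is a stand-in for obj.Addr; the values are illustrative):

    package main

    import "fmt"

    // addr mirrors only the obj.Addr fields this trick touches.
    type addr struct {
        Reg    string
        Offset int64
    }

    func main() {
        // Parser side: for "VPBLENDD $7, Y1, Y2, Y3", From3 is the Y2
        // register operand, and the leading $7 is stored in its Offset.
        from3 := addr{Reg: "Y2"}
        from3.Offset = 7

        // Assembler side (asm6): the immediate is recovered as a single
        // byte and emitted after the modrm byte.
        fmt.Printf("imm8 = %#x\n", byte(from3.Offset)) // imm8 = 0x7
    }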
src/cmd/internal/obj/util.go
@@ -140,6 +140,11 @@ func (p *Prog) String() string {
     fmt.Fprintf(&buf, "%.5d (%v)\t%v%s", p.Pc, p.Line(), Aconv(p.As), sc)
     sep := "\t"
+    quadOpAmd64 := p.RegTo2 == -1
+    if quadOpAmd64 {
+        fmt.Fprintf(&buf, "%s$%d", sep, p.From3.Offset)
+        sep = ", "
+    }
     if p.From.Type != TYPE_NONE {
         fmt.Fprintf(&buf, "%s%v", sep, Dconv(p, &p.From))
         sep = ", "
@@ -153,6 +158,8 @@ func (p *Prog) String() string {
     if p.From3.Type == TYPE_CONST && (p.As == ATEXT || p.As == AGLOBL) {
         // Special case - omit $.
         fmt.Fprintf(&buf, "%s%d", sep, p.From3.Offset)
+    } else if quadOpAmd64 {
+        fmt.Fprintf(&buf, "%s%v", sep, Rconv(int(p.From3.Reg)))
     } else {
         fmt.Fprintf(&buf, "%s%v", sep, Dconv(p, p.From3))
     }
@@ -161,7 +168,7 @@ func (p *Prog) String() string {
     if p.To.Type != TYPE_NONE {
         fmt.Fprintf(&buf, "%s%v", sep, Dconv(p, &p.To))
     }
-    if p.RegTo2 != REG_NONE {
+    if p.RegTo2 != REG_NONE && !quadOpAmd64 {
         fmt.Fprintf(&buf, "%s%v", sep, Rconv(int(p.RegTo2)))
     }
     return buf.String()
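Taken together, these three hunks make a 4-operand Prog print back in source order: imm8 first, then From, then From3's register, then To. A self-contained sketch of the same sentinel logic (miniProg and its field names are hypothetical stand-ins for obj.Prog, not the real type):

    package main

    import "fmt"

    // miniProg mirrors just enough of obj.Prog to show the printing path.
    type miniProg struct {
        As     string
        From   string // ymm2
        From3R string // ymm3; the imm8 rides alongside it
        Imm8   int64  // stands in for From3.Offset
        To     string // ymm1
        RegTo2 int    // -1 marks the AMD64 4-operand form
    }

    func (p miniProg) String() string {
        if p.RegTo2 == -1 {
            // imm8 first, then From, then From3's register, then To,
            // mirroring the quadOpAmd64 branches above.
            return fmt.Sprintf("%s\t$%d, %s, %s, %s", p.As, p.Imm8, p.From, p.From3R, p.To)
        }
        return fmt.Sprintf("%s\t%s, %s", p.As, p.From, p.To)
    }

    func main() {
        p := miniProg{As: "VPALIGNR", From: "Y2", From3R: "Y3", Imm8: 4, To: "Y0", RegTo2: -1}
        fmt.Println(p) // VPALIGNR	$4, Y2, Y3, Y0
    }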
src/cmd/internal/obj/x86/a.out.go
@@ -785,6 +785,24 @@ const (
     AVPAND
     AVPTEST
     AVPBROADCASTB
+    AVPSHUFB
+    AVPSHUFD
+    AVPERM2F128
+    AVPALIGNR
+    AVPADDQ
+    AVPADDD
+    AVPSRLDQ
+    AVPSLLDQ
+    AVPSRLQ
+    AVPSLLQ
+    AVPSRLD
+    AVPSLLD
+    AVPOR
+    AVPBLENDD
+    AVINSERTI128
+    AVPERM2I128
+    ARORXL
+    ARORXQ
     // from 386
     AJCXZW
src/cmd/internal/obj/x86/anames.go
@@ -720,6 +720,24 @@ var Anames = []string{
     "VPAND",
     "VPTEST",
     "VPBROADCASTB",
+    "VPSHUFB",
+    "VPSHUFD",
+    "VPERM2F128",
+    "VPALIGNR",
+    "VPADDQ",
+    "VPADDD",
+    "VPSRLDQ",
+    "VPSLLDQ",
+    "VPSRLQ",
+    "VPSLLQ",
+    "VPSRLD",
+    "VPSLLD",
+    "VPOR",
+    "VPBLENDD",
+    "VINSERTI128",
+    "VPERM2I128",
+    "RORXL",
+    "RORXQ",
     "JCXZW",
     "FCMOVCC",
     "FCMOVCS",
src/cmd/internal/obj/x86/asm6.go
@@ -208,6 +208,9 @@ const (
     Zvex_rm_v_r
     Zvex_r_v_rm
     Zvex_v_rm_r
+    Zvex_i_rm_r
+    Zvex_i_r_v
+    Zvex_i_rm_v_r
     Zmax
 )
@@ -847,6 +850,35 @@ var yvex_xy3 = []ytab{
     {Yym, Yyr, Yyr, Zvex_rm_v_r, 2},
 }
+
+var yvex_ri3 = []ytab{
+    {Yi8, Ymb, Yrl, Zvex_i_rm_r, 2},
+}
+
+var yvex_xyi3 = []ytab{
+    {Yi8, Yxm, Yxr, Zvex_i_rm_r, 2},
+    {Yi8, Yym, Yyr, Zvex_i_rm_r, 2},
+}
+
+var yvex_yyi4 = []ytab{ // TODO: don't hide the 4-operand form; some versions have an xmm variant.
+    {Yym, Yyr, Yyr, Zvex_i_rm_v_r, 2},
+}
+
+var yvex_xyi4 = []ytab{
+    {Yxm, Yyr, Yyr, Zvex_i_rm_v_r, 2},
+}
+
+var yvex_shift = []ytab{
+    {Yi8, Yxr, Yxr, Zvex_i_r_v, 3},
+    {Yi8, Yyr, Yyr, Zvex_i_r_v, 3},
+    {Yxm, Yxr, Yxr, Zvex_rm_v_r, 2},
+    {Yxm, Yyr, Yyr, Zvex_rm_v_r, 2},
+}
+
+var yvex_shift_dq = []ytab{
+    {Yi8, Yxr, Yxr, Zvex_i_r_v, 3},
+    {Yi8, Yyr, Yyr, Zvex_i_r_v, 3},
+}
+
 var yvex_r3 = []ytab{
     {Yml, Yrl, Yrl, Zvex_rm_v_r, 2},
     {Yml, Yrl, Yrl, Zvex_rm_v_r, 2},
@@ -1679,6 +1711,24 @@ var optab =
     {AVPAND, yvex_xy3, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0xDB, VEX_256_66_0F_WIG, 0xDB}},
     {AVPBROADCASTB, yvex_vpbroadcast, Pvex, [23]uint8{VEX_128_66_0F38_W0, 0x78, VEX_256_66_0F38_W0, 0x78}},
     {AVPTEST, yvex_xy2, Pvex, [23]uint8{VEX_128_66_0F38_WIG, 0x17, VEX_256_66_0F38_WIG, 0x17}},
+    {AVPSHUFB, yvex_xy3, Pvex, [23]uint8{VEX_128_66_0F38_WIG, 0x00, VEX_256_66_0F38_WIG, 0x00}},
+    {AVPSHUFD, yvex_xyi3, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0x70, VEX_256_66_0F_WIG, 0x70}},
+    {AVPOR, yvex_xy3, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0xeb, VEX_256_66_0F_WIG, 0xeb}},
+    {AVPADDQ, yvex_xy3, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0xd4, VEX_256_66_0F_WIG, 0xd4}},
+    {AVPADDD, yvex_xy3, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0xfe, VEX_256_66_0F_WIG, 0xfe}},
+    {AVPSLLD, yvex_shift, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0x72, 0xf0, VEX_256_66_0F_WIG, 0x72, 0xf0, VEX_128_66_0F_WIG, 0xf2, VEX_256_66_0F_WIG, 0xf2}},
+    {AVPSLLQ, yvex_shift, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0x73, 0xf0, VEX_256_66_0F_WIG, 0x73, 0xf0, VEX_128_66_0F_WIG, 0xf3, VEX_256_66_0F_WIG, 0xf3}},
+    {AVPSRLD, yvex_shift, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0x72, 0xd0, VEX_256_66_0F_WIG, 0x72, 0xd0, VEX_128_66_0F_WIG, 0xd2, VEX_256_66_0F_WIG, 0xd2}},
+    {AVPSRLQ, yvex_shift, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0x73, 0xd0, VEX_256_66_0F_WIG, 0x73, 0xd0, VEX_128_66_0F_WIG, 0xd3, VEX_256_66_0F_WIG, 0xd3}},
+    {AVPSRLDQ, yvex_shift_dq, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0x73, 0xd8, VEX_256_66_0F_WIG, 0x73, 0xd8}},
+    {AVPSLLDQ, yvex_shift_dq, Pvex, [23]uint8{VEX_128_66_0F_WIG, 0x73, 0xf8, VEX_256_66_0F_WIG, 0x73, 0xf8}},
+    {AVPERM2F128, yvex_yyi4, Pvex, [23]uint8{VEX_256_66_0F3A_W0, 0x06}},
+    {AVPALIGNR, yvex_yyi4, Pvex, [23]uint8{VEX_256_66_0F3A_WIG, 0x0f}},
+    {AVPBLENDD, yvex_yyi4, Pvex, [23]uint8{VEX_256_66_0F3A_WIG, 0x02}},
+    {AVINSERTI128, yvex_xyi4, Pvex, [23]uint8{VEX_256_66_0F3A_WIG, 0x38}},
+    {AVPERM2I128, yvex_yyi4, Pvex, [23]uint8{VEX_256_66_0F3A_WIG, 0x46}},
+    {ARORXL, yvex_ri3, Pvex, [23]uint8{VEX_LZ_F2_0F3A_W0, 0xf0}},
+    {ARORXQ, yvex_ri3, Pvex, [23]uint8{VEX_LZ_F2_0F3A_W1, 0xf0}},
     {AXACQUIRE, ynone, Px, [23]uint8{0xf2}},
     {AXRELEASE, ynone, Px, [23]uint8{0xf3}},
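The ytab rows and the flat op-byte array are tied together by a running offset: doasm scans an instruction's ytab rows, advancing a cursor z by each non-matching row's zoffset, so the row that matches knows where its opcode bytes start. A self-contained sketch of that selection for AVPSLLD (row data copied from above; the string-based operand classes and matcher are simplified stand-ins):

    package main

    import "fmt"

    // row is a cut-down ytab: operand classes plus how many op bytes it owns.
    type row struct {
        from, from3, to string
        zoffset         int
    }

    // yvexShift mirrors yvex_shift above: imm8 forms first (3 op bytes:
    // VEX info, opcode, modrm base), register forms after (2 bytes).
    var yvexShift = []row{
        {"Yi8", "Yxr", "Yxr", 3},
        {"Yi8", "Yyr", "Yyr", 3},
        {"Yxm", "Yxr", "Yxr", 2},
        {"Yxm", "Yyr", "Yyr", 2},
    }

    func main() {
        // Op bytes from the AVPSLLD optab entry above.
        op := []string{"VEX_128_66_0F_WIG", "0x72", "0xf0",
            "VEX_256_66_0F_WIG", "0x72", "0xf0",
            "VEX_128_66_0F_WIG", "0xf2", "VEX_256_66_0F_WIG", "0xf2"}

        // VPSLLD $3, Y5, Y5 classifies as (Yi8, Yyr, Yyr): row 1, so z = 3.
        z := 0
        for _, r := range yvexShift {
            if r.from == "Yi8" && r.from3 == "Yyr" && r.to == "Yyr" {
                break
            }
            z += r.zoffset
        }
        fmt.Println(op[z : z+3]) // [VEX_256_66_0F_WIG 0x72 0xf0]
    }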
@@ -3189,9 +3239,16 @@ var bpduff2 = []byte{
 // https://en.wikipedia.org/wiki/VEX_prefix#Technical_description
 func asmvex(ctxt *obj.Link, rm, v, r *obj.Addr, vex, opcode uint8) {
     ctxt.Vexflag = 1
-    rexR := regrex[r.Reg] & Rxr
-    rexB := regrex[rm.Reg] & Rxb
-    rexX := regrex[rm.Index] & Rxx
+    rexR := 0
+    if r != nil {
+        rexR = regrex[r.Reg] & Rxr
+    }
+    rexB := 0
+    rexX := 0
+    if rm != nil {
+        rexB = regrex[rm.Reg] & Rxb
+        rexX = regrex[rm.Index] & Rxx
+    }
     vexM := (vex >> 3) & 0xF
     vexWLP := vex & 0x87
     vexV := byte(0)
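The two mask lines above define how the one-byte VEX_* constants are packed: bits 6:3 select the opcode map, and the vexWLP remainder carries W, L and the mandatory prefix. A sketch of that unpacking; the exact W/L/pp bit positions are an assumption consistent with the masks and constant names such as VEX_256_66_0F3A_W0:

    package main

    import "fmt"

    // decodeVex splits a packed vex byte the way asmvex's masks imply.
    func decodeVex(vex uint8) (m, w, l, pp uint8) {
        m = (vex >> 3) & 0xF // opcode map: 1=0F, 2=0F38, 3=0F3A (assumed)
        wlp := vex & 0x87    // W in bit 7; L and pp assumed in bits 2..0
        w = wlp >> 7
        l = (wlp >> 2) & 1
        pp = wlp & 3 // 0=none, 1=66, 2=F3, 3=F2 (assumed)
        return
    }

    func main() {
        // Under this layout, VEX_256_66_0F3A_W0 would be 0x1d:
        // map 0F3A (3), W=0, L=256 (1), pp=66 (1).
        m, w, l, pp := decodeVex(0x1d)
        fmt.Println(m, w, l, pp) // 3 0 1 1
    }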
@@ -3477,6 +3534,27 @@ func doasm(ctxt *obj.Link, p *obj.Prog) {
     asmvex(ctxt, &p.From, p.From3, &p.To, o.op[z], o.op[z+1])
     asmand(ctxt, p, &p.From, &p.To)
+
+case Zvex_i_r_v:
+    asmvex(ctxt, p.From3, &p.To, nil, o.op[z], o.op[z+1])
+    regnum := byte(0x7)
+    if p.From3.Reg >= REG_X0 && p.From3.Reg <= REG_X15 {
+        regnum &= byte(p.From3.Reg - REG_X0)
+    } else {
+        regnum &= byte(p.From3.Reg - REG_Y0)
+    }
+    ctxt.AsmBuf.Put1(byte(o.op[z+2]) | regnum)
+    ctxt.AsmBuf.Put1(byte(p.From.Offset))
+
+case Zvex_i_rm_v_r:
+    asmvex(ctxt, &p.From, p.From3, &p.To, o.op[z], o.op[z+1])
+    asmand(ctxt, p, &p.From, &p.To)
+    ctxt.AsmBuf.Put1(byte(p.From3.Offset))
+
+case Zvex_i_rm_r:
+    asmvex(ctxt, p.From3, nil, &p.To, o.op[z], o.op[z+1])
+    asmand(ctxt, p, p.From3, &p.To)
+    ctxt.AsmBuf.Put1(byte(p.From.Offset))
+
 case Zvex_v_rm_r:
     asmvex(ctxt, p.From3, &p.From, &p.To, o.op[z], o.op[z+1])
     asmand(ctxt, p, p.From3, &p.To)
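In the Zvex_i_r_v case the shifted register never goes through asmand: the table's third op byte already fixes mod=11 and the opcode-extension reg field, and the register's low three bits are ORed into rm. A worked example for VPSLLD $3, X5, X5, using the 0x72/0xf0 bytes from the AVPSLLD optab row (reading 0xf0 as mod=11 with the /6 extension is an assumption based on that constant):

    package main

    import "fmt"

    func main() {
        const modrmBase = 0xf0        // o.op[z+2]: 0b11_110_000, mod=11, reg=/6
        regnum := byte(0x7) & byte(5) // X5 - X0, as in the REG_X0..REG_X15 branch
        fmt.Printf("modrm=%#x imm8=%#x\n", modrmBase|regnum, byte(3))
        // modrm=0xf5 imm8=0x3: VEX prefix, 0x72, 0xf5, 0x03 on the wire.
    }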