Commit 7bacfc64 authored by Vlad Krasnov's avatar Vlad Krasnov Committed by Adam Langley

crypto/elliptic,crypto/ecdsa: P256 amd64 assembly

This is based on the implementation used in OpenSSL, from a
submission by Shay Gueron and myself. Besides using assembly,
this implementation employs several optimizations described in:

    S.Gueron and V.Krasnov, "Fast prime field elliptic-curve
                             cryptography with 256-bit primes"

In addition a new and improved modular inverse modulo N is
implemented here.

The performance measured on a Haswell based Macbook Pro shows 21X
speedup for the sign and 9X for the verify operations.
The operation BaseMult is 30X faster (and the Diffie-Hellman/ECDSA
key generation that use it are sped up as well).

The adaptation to Go with the help of Filippo Valsorda

Updated the submission for faster verify/ecdh, fixed some asm syntax
and API problems and added benchmarks.

Change-Id: I86a33636747d5c92f15e0c8344caa2e7e07e0028
Reviewed-on: https://go-review.googlesource.com/8968
Run-TryBot: Adam Langley <agl@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarAdam Langley <agl@golang.org>
parent 50fa6467
...@@ -27,6 +27,17 @@ import ( ...@@ -27,6 +27,17 @@ import (
"math/big" "math/big"
) )
// A invertible implements fast inverse mod Curve.Params().N
type invertible interface {
// Inverse returns the inverse of k in GF(P)
Inverse(k *big.Int) *big.Int
}
// combinedMult implements fast multiplication S1*g + S2*p (g - generator, p - arbitrary point)
type combinedMult interface {
CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int)
}
const ( const (
aesIV = "IV for ECDSA CTR" aesIV = "IV for ECDSA CTR"
) )
...@@ -179,7 +190,12 @@ func Sign(rand io.Reader, priv *PrivateKey, hash []byte) (r, s *big.Int, err err ...@@ -179,7 +190,12 @@ func Sign(rand io.Reader, priv *PrivateKey, hash []byte) (r, s *big.Int, err err
return return
} }
if in, ok := priv.Curve.(invertible); ok {
kInv = in.Inverse(k)
} else {
kInv = fermatInverse(k, N) kInv = fermatInverse(k, N)
}
r, _ = priv.Curve.ScalarBaseMult(k.Bytes()) r, _ = priv.Curve.ScalarBaseMult(k.Bytes())
r.Mod(r, N) r.Mod(r, N)
if r.Sign() != 0 { if r.Sign() != 0 {
...@@ -214,16 +230,29 @@ func Verify(pub *PublicKey, hash []byte, r, s *big.Int) bool { ...@@ -214,16 +230,29 @@ func Verify(pub *PublicKey, hash []byte, r, s *big.Int) bool {
return false return false
} }
e := hashToInt(hash, c) e := hashToInt(hash, c)
w := new(big.Int).ModInverse(s, N)
var w *big.Int
if in, ok := c.(invertible); ok {
w = in.Inverse(s)
} else {
w = new(big.Int).ModInverse(s, N)
}
u1 := e.Mul(e, w) u1 := e.Mul(e, w)
u1.Mod(u1, N) u1.Mod(u1, N)
u2 := w.Mul(r, w) u2 := w.Mul(r, w)
u2.Mod(u2, N) u2.Mod(u2, N)
// Check if implements S1*g + S2*p
var x, y *big.Int
if opt, ok := c.(combinedMult); ok {
x, y = opt.CombinedMult(pub.X, pub.Y, u1.Bytes(), u2.Bytes())
} else {
x1, y1 := c.ScalarBaseMult(u1.Bytes()) x1, y1 := c.ScalarBaseMult(u1.Bytes())
x2, y2 := c.ScalarMult(pub.X, pub.Y, u2.Bytes()) x2, y2 := c.ScalarMult(pub.X, pub.Y, u2.Bytes())
x, y := c.Add(x1, y1, x2, y2) x, y = c.Add(x1, y1, x2, y2)
}
if x.Sign() == 0 && y.Sign() == 0 { if x.Sign() == 0 && y.Sign() == 0 {
return false return false
} }
......
...@@ -42,6 +42,41 @@ func TestKeyGeneration(t *testing.T) { ...@@ -42,6 +42,41 @@ func TestKeyGeneration(t *testing.T) {
testKeyGeneration(t, elliptic.P521(), "p521") testKeyGeneration(t, elliptic.P521(), "p521")
} }
func BenchmarkSignP256(b *testing.B) {
b.ResetTimer()
p256 := elliptic.P256()
hashed := []byte("testing")
priv, _ := GenerateKey(p256, rand.Reader)
b.ResetTimer()
for i := 0; i < b.N; i++ {
_, _, _ = Sign(rand.Reader, priv, hashed)
}
}
func BenchmarkVerifyP256(b *testing.B) {
b.ResetTimer()
p256 := elliptic.P256()
hashed := []byte("testing")
priv, _ := GenerateKey(p256, rand.Reader)
r, s, _ := Sign(rand.Reader, priv, hashed)
b.ResetTimer()
for i := 0; i < b.N; i++ {
Verify(&priv.PublicKey, hashed, r, s)
}
}
func BenchmarkKeyGeneration(b *testing.B) {
b.ResetTimer()
p256 := elliptic.P256()
b.ResetTimer()
for i := 0; i < b.N; i++ {
GenerateKey(p256, rand.Reader)
}
}
func testSignAndVerify(t *testing.T, c elliptic.Curve, tag string) { func testSignAndVerify(t *testing.T, c elliptic.Curve, tag string) {
priv, _ := GenerateKey(c, rand.Reader) priv, _ := GenerateKey(c, rand.Reader)
......
...@@ -441,6 +441,18 @@ func BenchmarkBaseMultP256(b *testing.B) { ...@@ -441,6 +441,18 @@ func BenchmarkBaseMultP256(b *testing.B) {
} }
} }
func BenchmarkScalarMultP256(b *testing.B) {
b.ResetTimer()
p256 := P256()
_, x, y, _ := GenerateKey(p256, rand.Reader)
priv, _, _, _ := GenerateKey(p256, rand.Reader)
b.StartTimer()
for i := 0; i < b.N; i++ {
p256.ScalarMult(x, y, priv)
}
}
func TestMarshal(t *testing.T) { func TestMarshal(t *testing.T) {
p224 := P224() p224 := P224()
_, x, y, err := GenerateKey(p224, rand.Reader) _, x, y, err := GenerateKey(p224, rand.Reader)
......
...@@ -2,6 +2,8 @@ ...@@ -2,6 +2,8 @@
// Use of this source code is governed by a BSD-style // Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file. // license that can be found in the LICENSE file.
// +build !amd64
package elliptic package elliptic
// This file contains a constant-time, 32-bit implementation of P256. // This file contains a constant-time, 32-bit implementation of P256.
......
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This file contains the Go wrapper for the constant-time, 64-bit assembly
// implementation of P256. The optimizations performed here are described in
// detail in:
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
// 256-bit primes"
// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf
// +build amd64
package elliptic
import (
"math/big"
"sync"
)
type (
p256Curve struct {
*CurveParams
}
p256Point struct {
xyz [12]uint64
}
)
var (
p256 p256Curve
p256Precomputed *[37][64 * 8]uint64
precomputeOnce sync.Once
)
func initP256() {
// See FIPS 186-3, section D.2.3
p256.CurveParams = &CurveParams{Name: "P-256"}
p256.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10)
p256.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10)
p256.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16)
p256.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16)
p256.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16)
p256.BitSize = 256
}
func (curve p256Curve) Params() *CurveParams {
return curve.CurveParams
}
// Functions implemented in p256_asm_amd64.s
// Montgomery multiplication modulo P256
func p256Mul(res, in1, in2 []uint64)
// Montgomery square modulo P256
func p256Sqr(res, in []uint64)
// Montgomery multiplication by 1
func p256FromMont(res, in []uint64)
// iff cond == 1 val <- -val
func p256NegCond(val []uint64, cond int)
// if cond == 0 res <- b; else res <- a
func p256MovCond(res, a, b []uint64, cond int)
// Endianess swap
func p256BigToLittle(res []uint64, in []byte)
func p256LittleToBig(res []byte, in []uint64)
// Constant time table access
func p256Select(point, table []uint64, idx int)
func p256SelectBase(point, table []uint64, idx int)
// Montgomery multiplication modulo Ord(G)
func p256OrdMul(res, in1, in2 []uint64)
// Montgomery square modulo Ord(G), repeated n times
func p256OrdSqr(res, in []uint64, n int)
// Point add with in2 being affine point
// If sign == 1 -> in2 = -in2
// If sel == 0 -> res = in1
// if zero == 0 -> res = in2
func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
// Point add
func p256PointAddAsm(res, in1, in2 []uint64)
// Point double
func p256PointDoubleAsm(res, in []uint64)
func (curve p256Curve) Inverse(k *big.Int) *big.Int {
if k.Cmp(p256.N) >= 0 {
// This should never happen.
reducedK := new(big.Int).Mod(k, p256.N)
k = reducedK
}
// table will store precomputed powers of x. The four words at index
// 4×i store x^(i+1).
var table [4 * 15]uint64
x := make([]uint64, 4)
fromBig(x[:], k)
// This code operates in the Montgomery domain where R = 2^256 mod n
// and n is the order of the scalar field. (See initP256 for the
// value.) Elements in the Montgomery domain take the form a×R and
// multiplication of x and y in the calculates (x × y × R^-1) mod n. RR
// is R×R mod n thus the Montgomery multiplication x and RR gives x×R,
// i.e. converts x into the Montgomery domain.
RR := []uint64{0x83244c95be79eea2, 0x4699799c49bd6fa6, 0x2845b2392b6bec59, 0x66e12d94f3d95620}
p256OrdMul(table[:4], x, RR)
// Prepare the table, no need in constant time access, because the
// power is not a secret. (Entry 0 is never used.)
for i := 2; i < 16; i += 2 {
p256OrdSqr(table[4*(i-1):], table[4*((i/2)-1):], 1)
p256OrdMul(table[4*i:], table[4*(i-1):], table[:4])
}
x[0] = table[4*14+0] // f
x[1] = table[4*14+1]
x[2] = table[4*14+2]
x[3] = table[4*14+3]
p256OrdSqr(x, x, 4)
p256OrdMul(x, x, table[4*14:4*14+4]) // ff
t := make([]uint64, 4, 4)
t[0] = x[0]
t[1] = x[1]
t[2] = x[2]
t[3] = x[3]
p256OrdSqr(x, x, 8)
p256OrdMul(x, x, t) // ffff
t[0] = x[0]
t[1] = x[1]
t[2] = x[2]
t[3] = x[3]
p256OrdSqr(x, x, 16)
p256OrdMul(x, x, t) // ffffffff
t[0] = x[0]
t[1] = x[1]
t[2] = x[2]
t[3] = x[3]
p256OrdSqr(x, x, 64) // ffffffff0000000000000000
p256OrdMul(x, x, t) // ffffffff00000000ffffffff
p256OrdSqr(x, x, 32) // ffffffff00000000ffffffff00000000
p256OrdMul(x, x, t) // ffffffff00000000ffffffffffffffff
// Remaining 32 windows
expLo := [32]byte{0xb, 0xc, 0xe, 0x6, 0xf, 0xa, 0xa, 0xd, 0xa, 0x7, 0x1, 0x7, 0x9, 0xe, 0x8, 0x4, 0xf, 0x3, 0xb, 0x9, 0xc, 0xa, 0xc, 0x2, 0xf, 0xc, 0x6, 0x3, 0x2, 0x5, 0x4, 0xf}
for i := 0; i < 32; i++ {
p256OrdSqr(x, x, 4)
p256OrdMul(x, x, table[4*(expLo[i]-1):])
}
// Multiplying by one in the Montgomery domain converts a Montgomery
// value out of the domain.
one := []uint64{1, 0, 0, 0}
p256OrdMul(x, x, one)
xOut := make([]byte, 32)
p256LittleToBig(xOut, x)
return new(big.Int).SetBytes(xOut)
}
// fromBig converts a *big.Int into a format used by this code.
func fromBig(out []uint64, big *big.Int) {
for i := range out {
out[i] = 0
}
for i, v := range big.Bits() {
out[i] = uint64(v)
}
}
// p256GetScalar endian-swaps the big-endian scalar value from in and writes it
// to out. If the scalar is equal or greater than the order of the group, it's
// reduced modulo that order.
func p256GetScalar(out []uint64, in []byte) {
n := new(big.Int).SetBytes(in)
if n.Cmp(p256.N) >= 0 {
n.Mod(n, p256.N)
}
fromBig(out, n)
}
// p256Mul operates in a Montgomery domain with R = 2^256 mod p, where p is the
// underlying field of the curve. (See initP256 for the value.) Thus rr here is
// R×R mod p. See comment in Inverse about how this is used.
var rr = []uint64{0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd}
func maybeReduceModP(in *big.Int) *big.Int {
if in.Cmp(p256.P) < 0 {
return in
}
return new(big.Int).Mod(in, p256.P)
}
func (curve p256Curve) CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) {
scalarReversed := make([]uint64, 4)
var r1, r2 p256Point
p256GetScalar(scalarReversed, baseScalar)
r1.p256BaseMult(scalarReversed)
p256GetScalar(scalarReversed, scalar)
fromBig(r2.xyz[0:4], maybeReduceModP(bigX))
fromBig(r2.xyz[4:8], maybeReduceModP(bigY))
p256Mul(r2.xyz[0:4], r2.xyz[0:4], rr[:])
p256Mul(r2.xyz[4:8], r2.xyz[4:8], rr[:])
// This sets r2's Z value to 1, in the Montgomery domain.
r2.xyz[8] = 0x0000000000000001
r2.xyz[9] = 0xffffffff00000000
r2.xyz[10] = 0xffffffffffffffff
r2.xyz[11] = 0x00000000fffffffe
r2.p256ScalarMult(scalarReversed)
p256PointAddAsm(r1.xyz[:], r1.xyz[:], r2.xyz[:])
return r1.p256PointToAffine()
}
func (curve p256Curve) ScalarBaseMult(scalar []byte) (x, y *big.Int) {
scalarReversed := make([]uint64, 4)
p256GetScalar(scalarReversed, scalar)
var r p256Point
r.p256BaseMult(scalarReversed)
return r.p256PointToAffine()
}
func (curve p256Curve) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) {
scalarReversed := make([]uint64, 4)
p256GetScalar(scalarReversed, scalar)
var r p256Point
fromBig(r.xyz[0:4], maybeReduceModP(bigX))
fromBig(r.xyz[4:8], maybeReduceModP(bigY))
p256Mul(r.xyz[0:4], r.xyz[0:4], rr[:])
p256Mul(r.xyz[4:8], r.xyz[4:8], rr[:])
// This sets r2's Z value to 1, in the Montgomery domain.
r.xyz[8] = 0x0000000000000001
r.xyz[9] = 0xffffffff00000000
r.xyz[10] = 0xffffffffffffffff
r.xyz[11] = 0x00000000fffffffe
r.p256ScalarMult(scalarReversed)
return r.p256PointToAffine()
}
func (p *p256Point) p256PointToAffine() (x, y *big.Int) {
zInv := make([]uint64, 4)
zInvSq := make([]uint64, 4)
p256Inverse(zInv, p.xyz[8:12])
p256Sqr(zInvSq, zInv)
p256Mul(zInv, zInv, zInvSq)
p256Mul(zInvSq, p.xyz[0:4], zInvSq)
p256Mul(zInv, p.xyz[4:8], zInv)
p256FromMont(zInvSq, zInvSq)
p256FromMont(zInv, zInv)
xOut := make([]byte, 32)
yOut := make([]byte, 32)
p256LittleToBig(xOut, zInvSq)
p256LittleToBig(yOut, zInv)
return new(big.Int).SetBytes(xOut), new(big.Int).SetBytes(yOut)
}
// p256Inverse sets out to in^-1 mod p.
func p256Inverse(out, in []uint64) {
var stack [6 * 4]uint64
p2 := stack[4*0 : 4*0+4]
p4 := stack[4*1 : 4*1+4]
p8 := stack[4*2 : 4*2+4]
p16 := stack[4*3 : 4*3+4]
p32 := stack[4*4 : 4*4+4]
p256Sqr(out, in)
p256Mul(p2, out, in) // 3*p
p256Sqr(out, p2)
p256Sqr(out, out)
p256Mul(p4, out, p2) // f*p
p256Sqr(out, p4)
p256Sqr(out, out)
p256Sqr(out, out)
p256Sqr(out, out)
p256Mul(p8, out, p4) // ff*p
p256Sqr(out, p8)
for i := 0; i < 7; i++ {
p256Sqr(out, out)
}
p256Mul(p16, out, p8) // ffff*p
p256Sqr(out, p16)
for i := 0; i < 15; i++ {
p256Sqr(out, out)
}
p256Mul(p32, out, p16) // ffffffff*p
p256Sqr(out, p32)
for i := 0; i < 31; i++ {
p256Sqr(out, out)
}
p256Mul(out, out, in)
for i := 0; i < 32*4; i++ {
p256Sqr(out, out)
}
p256Mul(out, out, p32)
for i := 0; i < 32; i++ {
p256Sqr(out, out)
}
p256Mul(out, out, p32)
for i := 0; i < 16; i++ {
p256Sqr(out, out)
}
p256Mul(out, out, p16)
for i := 0; i < 8; i++ {
p256Sqr(out, out)
}
p256Mul(out, out, p8)
p256Sqr(out, out)
p256Sqr(out, out)
p256Sqr(out, out)
p256Sqr(out, out)
p256Mul(out, out, p4)
p256Sqr(out, out)
p256Sqr(out, out)
p256Mul(out, out, p2)
p256Sqr(out, out)
p256Sqr(out, out)
p256Mul(out, out, in)
}
func (p *p256Point) p256StorePoint(r *[16 * 4 * 3]uint64, index int) {
copy(r[index*12:], p.xyz[:])
}
func boothW5(in uint) (int, int) {
var s uint = ^((in >> 5) - 1)
var d uint = (1 << 6) - in - 1
d = (d & s) | (in & (^s))
d = (d >> 1) + (d & 1)
return int(d), int(s & 1)
}
func boothW7(in uint) (int, int) {
var s uint = ^((in >> 7) - 1)
var d uint = (1 << 8) - in - 1
d = (d & s) | (in & (^s))
d = (d >> 1) + (d & 1)
return int(d), int(s & 1)
}
func initTable() {
p256Precomputed = new([37][64 * 8]uint64)
basePoint := []uint64{
0x79e730d418a9143c, 0x75ba95fc5fedb601, 0x79fb732b77622510, 0x18905f76a53755c6,
0xddf25357ce95560a, 0x8b4ab8e4ba19e45c, 0xd2e88688dd21f325, 0x8571ff1825885d85,
0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe,
}
t1 := make([]uint64, 12)
t2 := make([]uint64, 12)
copy(t2, basePoint)
zInv := make([]uint64, 4)
zInvSq := make([]uint64, 4)
for j := 0; j < 64; j++ {
copy(t1, t2)
for i := 0; i < 37; i++ {
// The window size is 7 so we need to double 7 times.
if i != 0 {
for k := 0; k < 7; k++ {
p256PointDoubleAsm(t1, t1)
}
}
// Convert the point to affine form. (Its values are
// still in Montgomery form however.)
p256Inverse(zInv, t1[8:12])
p256Sqr(zInvSq, zInv)
p256Mul(zInv, zInv, zInvSq)
p256Mul(t1[:4], t1[:4], zInvSq)
p256Mul(t1[4:8], t1[4:8], zInv)
copy(t1[8:12], basePoint[8:12])
// Update the table entry
copy(p256Precomputed[i][j*8:], t1[:8])
}
if j == 0 {
p256PointDoubleAsm(t2, basePoint)
} else {
p256PointAddAsm(t2, t2, basePoint)
}
}
}
func (p *p256Point) p256BaseMult(scalar []uint64) {
precomputeOnce.Do(initTable)
wvalue := (scalar[0] << 1) & 0xff
sel, sign := boothW7(uint(wvalue))
p256SelectBase(p.xyz[0:8], p256Precomputed[0][0:], sel)
p256NegCond(p.xyz[4:8], sign)
// (This is one, in the Montgomery domain.)
p.xyz[8] = 0x0000000000000001
p.xyz[9] = 0xffffffff00000000
p.xyz[10] = 0xffffffffffffffff
p.xyz[11] = 0x00000000fffffffe
var t0 p256Point
// (This is one, in the Montgomery domain.)
t0.xyz[8] = 0x0000000000000001
t0.xyz[9] = 0xffffffff00000000
t0.xyz[10] = 0xffffffffffffffff
t0.xyz[11] = 0x00000000fffffffe
index := uint(6)
zero := sel
for i := 1; i < 37; i++ {
if index < 192 {
wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0xff
} else {
wvalue = (scalar[index/64] >> (index % 64)) & 0xff
}
index += 7
sel, sign = boothW7(uint(wvalue))
p256SelectBase(t0.xyz[0:8], p256Precomputed[i][0:], sel)
p256PointAddAffineAsm(p.xyz[0:12], p.xyz[0:12], t0.xyz[0:8], sign, sel, zero)
zero |= sel
}
}
func (p *p256Point) p256ScalarMult(scalar []uint64) {
// precomp is a table of precomputed points that stores powers of p
// from p^1 to p^16.
var precomp [16 * 4 * 3]uint64
var t0, t1, t2, t3 p256Point
// Prepare the table
p.p256StorePoint(&precomp, 0) // 1
p256PointDoubleAsm(t0.xyz[:], p.xyz[:])
p256PointDoubleAsm(t1.xyz[:], t0.xyz[:])
p256PointDoubleAsm(t2.xyz[:], t1.xyz[:])
p256PointDoubleAsm(t3.xyz[:], t2.xyz[:])
t0.p256StorePoint(&precomp, 1) // 2
t1.p256StorePoint(&precomp, 3) // 4
t2.p256StorePoint(&precomp, 7) // 8
t3.p256StorePoint(&precomp, 15) // 16
p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:])
p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:])
t0.p256StorePoint(&precomp, 2) // 3
t1.p256StorePoint(&precomp, 4) // 5
t2.p256StorePoint(&precomp, 8) // 9
p256PointDoubleAsm(t0.xyz[:], t0.xyz[:])
p256PointDoubleAsm(t1.xyz[:], t1.xyz[:])
t0.p256StorePoint(&precomp, 5) // 6
t1.p256StorePoint(&precomp, 9) // 10
p256PointAddAsm(t2.xyz[:], t0.xyz[:], p.xyz[:])
p256PointAddAsm(t1.xyz[:], t1.xyz[:], p.xyz[:])
t2.p256StorePoint(&precomp, 6) // 7
t1.p256StorePoint(&precomp, 10) // 11
p256PointDoubleAsm(t0.xyz[:], t0.xyz[:])
p256PointDoubleAsm(t2.xyz[:], t2.xyz[:])
t0.p256StorePoint(&precomp, 11) // 12
t2.p256StorePoint(&precomp, 13) // 14
p256PointAddAsm(t0.xyz[:], t0.xyz[:], p.xyz[:])
p256PointAddAsm(t2.xyz[:], t2.xyz[:], p.xyz[:])
t0.p256StorePoint(&precomp, 12) // 13
t2.p256StorePoint(&precomp, 14) // 15
// Start scanning the window from top bit
index := uint(254)
var sel, sign int
wvalue := (scalar[index/64] >> (index % 64)) & 0x3f
sel, _ = boothW5(uint(wvalue))
p256Select(p.xyz[0:12], precomp[0:], sel)
zero := sel
for index > 4 {
index -= 5
p256PointDoubleAsm(p.xyz[:], p.xyz[:])
p256PointDoubleAsm(p.xyz[:], p.xyz[:])
p256PointDoubleAsm(p.xyz[:], p.xyz[:])
p256PointDoubleAsm(p.xyz[:], p.xyz[:])
p256PointDoubleAsm(p.xyz[:], p.xyz[:])
if index < 192 {
wvalue = ((scalar[index/64] >> (index % 64)) + (scalar[index/64+1] << (64 - (index % 64)))) & 0x3f
} else {
wvalue = (scalar[index/64] >> (index % 64)) & 0x3f
}
sel, sign = boothW5(uint(wvalue))
p256Select(t0.xyz[0:], precomp[0:], sel)
p256NegCond(t0.xyz[4:8], sign)
p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:])
p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel)
p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero)
zero |= sel
}
p256PointDoubleAsm(p.xyz[:], p.xyz[:])
p256PointDoubleAsm(p.xyz[:], p.xyz[:])
p256PointDoubleAsm(p.xyz[:], p.xyz[:])
p256PointDoubleAsm(p.xyz[:], p.xyz[:])
p256PointDoubleAsm(p.xyz[:], p.xyz[:])
wvalue = (scalar[0] << 1) & 0x3f
sel, sign = boothW5(uint(wvalue))
p256Select(t0.xyz[0:], precomp[0:], sel)
p256NegCond(t0.xyz[4:8], sign)
p256PointAddAsm(t1.xyz[:], p.xyz[:], t0.xyz[:])
p256MovCond(t1.xyz[0:12], t1.xyz[0:12], p.xyz[0:12], sel)
p256MovCond(p.xyz[0:12], t1.xyz[0:12], t0.xyz[0:12], zero)
}
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// This file contains constant-time, 64-bit assembly implementation of
// P256. The optimizations performed here are described in detail in:
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
// 256-bit primes"
// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf
#include "textflag.h"
#define res_ptr DI
#define x_ptr SI
#define y_ptr CX
#define acc0 R8
#define acc1 R9
#define acc2 R10
#define acc3 R11
#define acc4 R12
#define acc5 R13
#define t0 R14
#define t1 R15
DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
GLOBL p256const0<>(SB), 8, $8
GLOBL p256const1<>(SB), 8, $8
GLOBL p256ordK0<>(SB), 8, $8
GLOBL p256ord<>(SB), 8, $32
GLOBL p256one<>(SB), 8, $32
/* ---------------------------------------*/
// func p256LittleToBig(res []byte, in []uint64)
TEXT ·p256LittleToBig(SB),NOSPLIT,$0
JMP ·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256BigToLittle(res []uint64, in []byte)
TEXT ·p256BigToLittle(SB),NOSPLIT,$0
MOVQ res+0(FP), res_ptr
MOVQ in+24(FP), x_ptr
MOVQ (8*0)(x_ptr), acc0
MOVQ (8*1)(x_ptr), acc1
MOVQ (8*2)(x_ptr), acc2
MOVQ (8*3)(x_ptr), acc3
BSWAPQ acc0
BSWAPQ acc1
BSWAPQ acc2
BSWAPQ acc3
MOVQ acc3, (8*0)(res_ptr)
MOVQ acc2, (8*1)(res_ptr)
MOVQ acc1, (8*2)(res_ptr)
MOVQ acc0, (8*3)(res_ptr)
RET
/* ---------------------------------------*/
// func p256MovCond(res, a, b []uint64, cond int)
// If cond == 0 res=b, else res=a
TEXT ·p256MovCond(SB),NOSPLIT,$0
MOVQ res+0(FP), res_ptr
MOVQ a+24(FP), x_ptr
MOVQ b+48(FP), y_ptr
MOVQ cond+72(FP), X12
PXOR X13, X13
PSHUFD $0, X12, X12
PCMPEQL X13, X12
MOVOU X12, X0
PANDN (16*0)(x_ptr), X0
MOVOU X12, X1
PANDN (16*1)(x_ptr), X1
MOVOU X12, X2
PANDN (16*2)(x_ptr), X2
MOVOU X12, X3
PANDN (16*3)(x_ptr), X3
MOVOU X12, X4
PANDN (16*4)(x_ptr), X4
MOVOU X12, X5
PANDN (16*5)(x_ptr), X5
MOVOU (16*0)(y_ptr), X6
MOVOU (16*1)(y_ptr), X7
MOVOU (16*2)(y_ptr), X8
MOVOU (16*3)(y_ptr), X9
MOVOU (16*4)(y_ptr), X10
MOVOU (16*5)(y_ptr), X11
PAND X12, X6
PAND X12, X7
PAND X12, X8
PAND X12, X9
PAND X12, X10
PAND X12, X11
PXOR X6, X0
PXOR X7, X1
PXOR X8, X2
PXOR X9, X3
PXOR X10, X4
PXOR X11, X5
MOVOU X0, (16*0)(res_ptr)
MOVOU X1, (16*1)(res_ptr)
MOVOU X2, (16*2)(res_ptr)
MOVOU X3, (16*3)(res_ptr)
MOVOU X4, (16*4)(res_ptr)
MOVOU X5, (16*5)(res_ptr)
RET
/* ---------------------------------------*/
// func p256NegCond(val []uint64, cond int)
TEXT ·p256NegCond(SB),NOSPLIT,$0
MOVQ val+0(FP), res_ptr
MOVQ cond+24(FP), t0
// acc = poly
MOVQ $-1, acc0
MOVQ p256const0<>(SB), acc1
MOVQ $0, acc2
MOVQ p256const1<>(SB), acc3
// Load the original value
MOVQ (8*0)(res_ptr), acc5
MOVQ (8*1)(res_ptr), x_ptr
MOVQ (8*2)(res_ptr), y_ptr
MOVQ (8*3)(res_ptr), t1
// Speculatively subtract
SUBQ acc5, acc0
SBBQ x_ptr, acc1
SBBQ y_ptr, acc2
SBBQ t1, acc3
// If condition is 0, keep original value
TESTQ t0, t0
CMOVQEQ acc5, acc0
CMOVQEQ x_ptr, acc1
CMOVQEQ y_ptr, acc2
CMOVQEQ t1, acc3
// Store result
MOVQ acc0, (8*0)(res_ptr)
MOVQ acc1, (8*1)(res_ptr)
MOVQ acc2, (8*2)(res_ptr)
MOVQ acc3, (8*3)(res_ptr)
RET
/* ---------------------------------------*/
// func p256Sqr(res, in []uint64)
TEXT ·p256Sqr(SB),NOSPLIT,$0
MOVQ res+0(FP), res_ptr
MOVQ in+24(FP), x_ptr
// y[1:] * y[0]
MOVQ (8*0)(x_ptr), t0
MOVQ (8*1)(x_ptr), AX
MULQ t0
MOVQ AX, acc1
MOVQ DX, acc2
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, acc3
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, acc4
// y[2:] * y[1]
MOVQ (8*1)(x_ptr), t0
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, acc5
// y[3] * y[2]
MOVQ (8*2)(x_ptr), t0
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, y_ptr
XORQ t1, t1
// *2
ADDQ acc1, acc1
ADCQ acc2, acc2
ADCQ acc3, acc3
ADCQ acc4, acc4
ADCQ acc5, acc5
ADCQ y_ptr, y_ptr
ADCQ $0, t1
// Missing products
MOVQ (8*0)(x_ptr), AX
MULQ AX
MOVQ AX, acc0
MOVQ DX, t0
MOVQ (8*1)(x_ptr), AX
MULQ AX
ADDQ t0, acc1
ADCQ AX, acc2
ADCQ $0, DX
MOVQ DX, t0
MOVQ (8*2)(x_ptr), AX
MULQ AX
ADDQ t0, acc3
ADCQ AX, acc4
ADCQ $0, DX
MOVQ DX, t0
MOVQ (8*3)(x_ptr), AX
MULQ AX
ADDQ t0, acc5
ADCQ AX, y_ptr
ADCQ DX, t1
MOVQ t1, x_ptr
// First reduction step
MOVQ acc0, AX
MOVQ acc0, t1
SHLQ $32, acc0
MULQ p256const1<>(SB)
SHRQ $32, t1
ADDQ acc0, acc1
ADCQ t1, acc2
ADCQ AX, acc3
ADCQ $0, DX
MOVQ DX, acc0
// Second reduction step
MOVQ acc1, AX
MOVQ acc1, t1
SHLQ $32, acc1
MULQ p256const1<>(SB)
SHRQ $32, t1
ADDQ acc1, acc2
ADCQ t1, acc3
ADCQ AX, acc0
ADCQ $0, DX
MOVQ DX, acc1
// Third reduction step
MOVQ acc2, AX
MOVQ acc2, t1
SHLQ $32, acc2
MULQ p256const1<>(SB)
SHRQ $32, t1
ADDQ acc2, acc3
ADCQ t1, acc0
ADCQ AX, acc1
ADCQ $0, DX
MOVQ DX, acc2
// Last reduction step
XORQ t0, t0
MOVQ acc3, AX
MOVQ acc3, t1
SHLQ $32, acc3
MULQ p256const1<>(SB)
SHRQ $32, t1
ADDQ acc3, acc0
ADCQ t1, acc1
ADCQ AX, acc2
ADCQ $0, DX
MOVQ DX, acc3
// Add bits [511:256] of the sqr result
ADCQ acc4, acc0
ADCQ acc5, acc1
ADCQ y_ptr, acc2
ADCQ x_ptr, acc3
ADCQ $0, t0
MOVQ acc0, acc4
MOVQ acc1, acc5
MOVQ acc2, y_ptr
MOVQ acc3, t1
// Subtract p256
SUBQ $-1, acc0
SBBQ p256const0<>(SB) ,acc1
SBBQ $0, acc2
SBBQ p256const1<>(SB), acc3
SBBQ $0, t0
CMOVQCS acc4, acc0
CMOVQCS acc5, acc1
CMOVQCS y_ptr, acc2
CMOVQCS t1, acc3
MOVQ acc0, (8*0)(res_ptr)
MOVQ acc1, (8*1)(res_ptr)
MOVQ acc2, (8*2)(res_ptr)
MOVQ acc3, (8*3)(res_ptr)
RET
/* ---------------------------------------*/
// func p256Mul(res, in1, in2 []uint64)
TEXT ·p256Mul(SB),NOSPLIT,$0
MOVQ res+0(FP), res_ptr
MOVQ in1+24(FP), x_ptr
MOVQ in2+48(FP), y_ptr
// x * y[0]
MOVQ (8*0)(y_ptr), t0
MOVQ (8*0)(x_ptr), AX
MULQ t0
MOVQ AX, acc0
MOVQ DX, acc1
MOVQ (8*1)(x_ptr), AX
MULQ t0
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, acc2
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, acc3
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, acc4
XORQ acc5, acc5
// First reduction step
MOVQ acc0, AX
MOVQ acc0, t1
SHLQ $32, acc0
MULQ p256const1<>(SB)
SHRQ $32, t1
ADDQ acc0, acc1
ADCQ t1, acc2
ADCQ AX, acc3
ADCQ DX, acc4
ADCQ $0, acc5
XORQ acc0, acc0
// x * y[1]
MOVQ (8*1)(y_ptr), t0
MOVQ (8*0)(x_ptr), AX
MULQ t0
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*1)(x_ptr), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ DX, acc5
ADCQ $0, acc0
// Second reduction step
MOVQ acc1, AX
MOVQ acc1, t1
SHLQ $32, acc1
MULQ p256const1<>(SB)
SHRQ $32, t1
ADDQ acc1, acc2
ADCQ t1, acc3
ADCQ AX, acc4
ADCQ DX, acc5
ADCQ $0, acc0
XORQ acc1, acc1
// x * y[2]
MOVQ (8*2)(y_ptr), t0
MOVQ (8*0)(x_ptr), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*1)(x_ptr), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ DX, acc0
ADCQ $0, acc1
// Third reduction step
MOVQ acc2, AX
MOVQ acc2, t1
SHLQ $32, acc2
MULQ p256const1<>(SB)
SHRQ $32, t1
ADDQ acc2, acc3
ADCQ t1, acc4
ADCQ AX, acc5
ADCQ DX, acc0
ADCQ $0, acc1
XORQ acc2, acc2
// x * y[3]
MOVQ (8*3)(y_ptr), t0
MOVQ (8*0)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*1)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
ADCQ $0, acc2
// Last reduction step
MOVQ acc3, AX
MOVQ acc3, t1
SHLQ $32, acc3
MULQ p256const1<>(SB)
SHRQ $32, t1
ADDQ acc3, acc4
ADCQ t1, acc5
ADCQ AX, acc0
ADCQ DX, acc1
ADCQ $0, acc2
// Copy result [255:0]
MOVQ acc4, x_ptr
MOVQ acc5, acc3
MOVQ acc0, t0
MOVQ acc1, t1
// Subtract p256
SUBQ $-1, acc4
SBBQ p256const0<>(SB) ,acc5
SBBQ $0, acc0
SBBQ p256const1<>(SB), acc1
SBBQ $0, acc2
CMOVQCS x_ptr, acc4
CMOVQCS acc3, acc5
CMOVQCS t0, acc0
CMOVQCS t1, acc1
MOVQ acc4, (8*0)(res_ptr)
MOVQ acc5, (8*1)(res_ptr)
MOVQ acc0, (8*2)(res_ptr)
MOVQ acc1, (8*3)(res_ptr)
RET
/* ---------------------------------------*/
// func p256FromMont(res, in []uint64)
TEXT ·p256FromMont(SB),NOSPLIT,$0
MOVQ res+0(FP), res_ptr
MOVQ in+24(FP), x_ptr
MOVQ (8*0)(x_ptr), acc0
MOVQ (8*1)(x_ptr), acc1
MOVQ (8*2)(x_ptr), acc2
MOVQ (8*3)(x_ptr), acc3
XORQ acc4, acc4
// Only reduce, no multiplications are needed
// First stage
MOVQ acc0, AX
MOVQ acc0, t1
SHLQ $32, acc0
MULQ p256const1<>(SB)
SHRQ $32, t1
ADDQ acc0, acc1
ADCQ t1, acc2
ADCQ AX, acc3
ADCQ DX, acc4
XORQ acc5, acc5
// Second stage
MOVQ acc1, AX
MOVQ acc1, t1
SHLQ $32, acc1
MULQ p256const1<>(SB)
SHRQ $32, t1
ADDQ acc1, acc2
ADCQ t1, acc3
ADCQ AX, acc4
ADCQ DX, acc5
XORQ acc0, acc0
// Third stage
MOVQ acc2, AX
MOVQ acc2, t1
SHLQ $32, acc2
MULQ p256const1<>(SB)
SHRQ $32, t1
ADDQ acc2, acc3
ADCQ t1, acc4
ADCQ AX, acc5
ADCQ DX, acc0
XORQ acc1, acc1
// Last stage
MOVQ acc3, AX
MOVQ acc3, t1
SHLQ $32, acc3
MULQ p256const1<>(SB)
SHRQ $32, t1
ADDQ acc3, acc4
ADCQ t1, acc5
ADCQ AX, acc0
ADCQ DX, acc1
MOVQ acc4, x_ptr
MOVQ acc5, acc3
MOVQ acc0, t0
MOVQ acc1, t1
SUBQ $-1, acc4
SBBQ p256const0<>(SB), acc5
SBBQ $0, acc0
SBBQ p256const1<>(SB), acc1
CMOVQCS x_ptr, acc4
CMOVQCS acc3, acc5
CMOVQCS t0, acc0
CMOVQCS t1, acc1
MOVQ acc4, (8*0)(res_ptr)
MOVQ acc5, (8*1)(res_ptr)
MOVQ acc0, (8*2)(res_ptr)
MOVQ acc1, (8*3)(res_ptr)
RET
/* ---------------------------------------*/
// Constant time point access to arbitrary point table.
// Indexed from 1 to 15, with -1 offset
// (index 0 is implicitly point at infinity)
// func p256Select(point, table []uint64, idx int)
TEXT ·p256Select(SB),NOSPLIT,$0
MOVQ idx+48(FP),AX
MOVQ table+24(FP),DI
MOVQ point+0(FP),DX
PXOR X15, X15 // X15 = 0
PCMPEQL X14, X14 // X14 = -1
PSUBL X14, X15 // X15 = 1
MOVL AX, X14
PSHUFD $0, X14, X14
PXOR X0, X0
PXOR X1, X1
PXOR X2, X2
PXOR X3, X3
PXOR X4, X4
PXOR X5, X5
MOVQ $16, AX
MOVOU X15, X13
loop_select:
MOVOU X13, X12
PADDL X15, X13
PCMPEQL X14, X12
MOVOU (16*0)(DI), X6
MOVOU (16*1)(DI), X7
MOVOU (16*2)(DI), X8
MOVOU (16*3)(DI), X9
MOVOU (16*4)(DI), X10
MOVOU (16*5)(DI), X11
ADDQ $(16*6), DI
PAND X12, X6
PAND X12, X7
PAND X12, X8
PAND X12, X9
PAND X12, X10
PAND X12, X11
PXOR X6, X0
PXOR X7, X1
PXOR X8, X2
PXOR X9, X3
PXOR X10, X4
PXOR X11, X5
DECQ AX
JNE loop_select
MOVOU X0, (16*0)(DX)
MOVOU X1, (16*1)(DX)
MOVOU X2, (16*2)(DX)
MOVOU X3, (16*3)(DX)
MOVOU X4, (16*4)(DX)
MOVOU X5, (16*5)(DX)
RET
/* ---------------------------------------*/
// Constant time point access to base point table.
// func p256SelectBase(point, table []uint64, idx int)
TEXT ·p256SelectBase(SB),NOSPLIT,$0
MOVQ idx+48(FP),AX
MOVQ table+24(FP),DI
MOVQ point+0(FP),DX
PXOR X15, X15 // X15 = 0
PCMPEQL X14, X14 // X14 = -1
PSUBL X14, X15 // X15 = 1
MOVL AX, X14
PSHUFD $0, X14, X14
PXOR X0, X0
PXOR X1, X1
PXOR X2, X2
PXOR X3, X3
MOVQ $32, AX
MOVOU X15, X13
loop_select_base:
MOVOU X13, X12
PADDL X15, X13
PCMPEQL X14, X12
MOVOU (16*0)(DI), X4
MOVOU (16*1)(DI), X5
MOVOU (16*2)(DI), X6
MOVOU (16*3)(DI), X7
MOVOU (16*4)(DI), X8
MOVOU (16*5)(DI), X9
MOVOU (16*6)(DI), X10
MOVOU (16*7)(DI), X11
ADDQ $(16*8), DI
PAND X12, X4
PAND X12, X5
PAND X12, X6
PAND X12, X7
MOVOU X13, X12
PADDL X15, X13
PCMPEQL X14, X12
PAND X12, X8
PAND X12, X9
PAND X12, X10
PAND X12, X11
PXOR X4, X0
PXOR X5, X1
PXOR X6, X2
PXOR X7, X3
PXOR X8, X0
PXOR X9, X1
PXOR X10, X2
PXOR X11, X3
DECQ AX
JNE loop_select_base
MOVOU X0, (16*0)(DX)
MOVOU X1, (16*1)(DX)
MOVOU X2, (16*2)(DX)
MOVOU X3, (16*3)(DX)
RET
/* ---------------------------------------*/
// func p256OrdMul(res, in1, in2 []uint64)
TEXT ·p256OrdMul(SB),NOSPLIT,$0
MOVQ res+0(FP), res_ptr
MOVQ in1+24(FP), x_ptr
MOVQ in2+48(FP), y_ptr
// x * y[0]
MOVQ (8*0)(y_ptr), t0
MOVQ (8*0)(x_ptr), AX
MULQ t0
MOVQ AX, acc0
MOVQ DX, acc1
MOVQ (8*1)(x_ptr), AX
MULQ t0
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, acc2
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, acc3
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, acc4
XORQ acc5, acc5
// First reduction step
MOVQ acc0, AX
MULQ p256ordK0<>(SB)
MOVQ AX, t0
MOVQ p256ord<>+0x00(SB), AX
MULQ t0
ADDQ AX, acc0
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc1
ADCQ $0, DX
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256ord<>+0x10(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256ord<>+0x18(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ DX, acc4
ADCQ $0, acc5
// x * y[1]
MOVQ (8*1)(y_ptr), t0
MOVQ (8*0)(x_ptr), AX
MULQ t0
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*1)(x_ptr), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ DX, acc5
ADCQ $0, acc0
// Second reduction step
MOVQ acc1, AX
MULQ p256ordK0<>(SB)
MOVQ AX, t0
MOVQ p256ord<>+0x00(SB), AX
MULQ t0
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256ord<>+0x10(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256ord<>+0x18(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ DX, acc5
ADCQ $0, acc0
// x * y[2]
MOVQ (8*2)(y_ptr), t0
MOVQ (8*0)(x_ptr), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*1)(x_ptr), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ DX, acc0
ADCQ $0, acc1
// Third reduction step
MOVQ acc2, AX
MULQ p256ordK0<>(SB)
MOVQ AX, t0
MOVQ p256ord<>+0x00(SB), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256ord<>+0x10(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256ord<>+0x18(SB), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ DX, acc0
ADCQ $0, acc1
// x * y[3]
MOVQ (8*3)(y_ptr), t0
MOVQ (8*0)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*1)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
ADCQ $0, acc2
// Last reduction step
MOVQ acc3, AX
MULQ p256ordK0<>(SB)
MOVQ AX, t0
MOVQ p256ord<>+0x00(SB), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256ord<>+0x10(SB), AX
MULQ t0
ADDQ t1, acc5
ADCQ $0, DX
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256ord<>+0x18(SB), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ DX, acc1
ADCQ $0, acc2
// Copy result [255:0]
MOVQ acc4, x_ptr
MOVQ acc5, acc3
MOVQ acc0, t0
MOVQ acc1, t1
// Subtract p256
SUBQ p256ord<>+0x00(SB), acc4
SBBQ p256ord<>+0x08(SB) ,acc5
SBBQ p256ord<>+0x10(SB), acc0
SBBQ p256ord<>+0x18(SB), acc1
SBBQ $0, acc2
CMOVQCS x_ptr, acc4
CMOVQCS acc3, acc5
CMOVQCS t0, acc0
CMOVQCS t1, acc1
MOVQ acc4, (8*0)(res_ptr)
MOVQ acc5, (8*1)(res_ptr)
MOVQ acc0, (8*2)(res_ptr)
MOVQ acc1, (8*3)(res_ptr)
RET
/* ---------------------------------------*/
// func p256OrdSqr(res, in []uint64, n int)
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
MOVQ res+0(FP), res_ptr
MOVQ in+24(FP), x_ptr
MOVQ n+48(FP), BX
ordSqrLoop:
// y[1:] * y[0]
MOVQ (8*0)(x_ptr), t0
MOVQ (8*1)(x_ptr), AX
MULQ t0
MOVQ AX, acc1
MOVQ DX, acc2
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, acc3
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, acc4
// y[2:] * y[1]
MOVQ (8*1)(x_ptr), t0
MOVQ (8*2)(x_ptr), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ t1, acc4
ADCQ $0, DX
ADDQ AX, acc4
ADCQ $0, DX
MOVQ DX, acc5
// y[3] * y[2]
MOVQ (8*2)(x_ptr), t0
MOVQ (8*3)(x_ptr), AX
MULQ t0
ADDQ AX, acc5
ADCQ $0, DX
MOVQ DX, y_ptr
XORQ t1, t1
// *2
ADDQ acc1, acc1
ADCQ acc2, acc2
ADCQ acc3, acc3
ADCQ acc4, acc4
ADCQ acc5, acc5
ADCQ y_ptr, y_ptr
ADCQ $0, t1
// Missing products
MOVQ (8*0)(x_ptr), AX
MULQ AX
MOVQ AX, acc0
MOVQ DX, t0
MOVQ (8*1)(x_ptr), AX
MULQ AX
ADDQ t0, acc1
ADCQ AX, acc2
ADCQ $0, DX
MOVQ DX, t0
MOVQ (8*2)(x_ptr), AX
MULQ AX
ADDQ t0, acc3
ADCQ AX, acc4
ADCQ $0, DX
MOVQ DX, t0
MOVQ (8*3)(x_ptr), AX
MULQ AX
ADDQ t0, acc5
ADCQ AX, y_ptr
ADCQ DX, t1
MOVQ t1, x_ptr
// First reduction step
MOVQ acc0, AX
MULQ p256ordK0<>(SB)
MOVQ AX, t0
MOVQ p256ord<>+0x00(SB), AX
MULQ t0
ADDQ AX, acc0
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc1
ADCQ $0, DX
ADDQ AX, acc1
MOVQ t0, t1
ADCQ DX, acc2
ADCQ $0, t1
SUBQ t0, acc2
SBBQ $0, t1
MOVQ t0, AX
MOVQ t0, DX
MOVQ t0, acc0
SHLQ $32, AX
SHRQ $32, DX
ADDQ t1, acc3
ADCQ $0, acc0
SUBQ AX, acc3
SBBQ DX, acc0
// Second reduction step
MOVQ acc1, AX
MULQ p256ordK0<>(SB)
MOVQ AX, t0
MOVQ p256ord<>+0x00(SB), AX
MULQ t0
ADDQ AX, acc1
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc2
ADCQ $0, DX
ADDQ AX, acc2
MOVQ t0, t1
ADCQ DX, acc3
ADCQ $0, t1
SUBQ t0, acc3
SBBQ $0, t1
MOVQ t0, AX
MOVQ t0, DX
MOVQ t0, acc1
SHLQ $32, AX
SHRQ $32, DX
ADDQ t1, acc0
ADCQ $0, acc1
SUBQ AX, acc0
SBBQ DX, acc1
// Third reduction step
MOVQ acc2, AX
MULQ p256ordK0<>(SB)
MOVQ AX, t0
MOVQ p256ord<>+0x00(SB), AX
MULQ t0
ADDQ AX, acc2
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc3
ADCQ $0, DX
ADDQ AX, acc3
MOVQ t0, t1
ADCQ DX, acc0
ADCQ $0, t1
SUBQ t0, acc0
SBBQ $0, t1
MOVQ t0, AX
MOVQ t0, DX
MOVQ t0, acc2
SHLQ $32, AX
SHRQ $32, DX
ADDQ t1, acc1
ADCQ $0, acc2
SUBQ AX, acc1
SBBQ DX, acc2
// Last reduction step
MOVQ acc3, AX
MULQ p256ordK0<>(SB)
MOVQ AX, t0
MOVQ p256ord<>+0x00(SB), AX
MULQ t0
ADDQ AX, acc3
ADCQ $0, DX
MOVQ DX, t1
MOVQ p256ord<>+0x08(SB), AX
MULQ t0
ADDQ t1, acc0
ADCQ $0, DX
ADDQ AX, acc0
ADCQ $0, DX
MOVQ DX, t1
MOVQ t0, t1
ADCQ DX, acc1
ADCQ $0, t1
SUBQ t0, acc1
SBBQ $0, t1
MOVQ t0, AX
MOVQ t0, DX
MOVQ t0, acc3
SHLQ $32, AX
SHRQ $32, DX
ADDQ t1, acc2
ADCQ $0, acc3
SUBQ AX, acc2
SBBQ DX, acc3
XORQ t0, t0
// Add bits [511:256] of the sqr result
ADCQ acc4, acc0
ADCQ acc5, acc1
ADCQ y_ptr, acc2
ADCQ x_ptr, acc3
ADCQ $0, t0
MOVQ acc0, acc4
MOVQ acc1, acc5
MOVQ acc2, y_ptr
MOVQ acc3, t1
// Subtract p256
SUBQ p256ord<>+0x00(SB), acc0
SBBQ p256ord<>+0x08(SB) ,acc1
SBBQ p256ord<>+0x10(SB), acc2
SBBQ p256ord<>+0x18(SB), acc3
SBBQ $0, t0
CMOVQCS acc4, acc0
CMOVQCS acc5, acc1
CMOVQCS y_ptr, acc2
CMOVQCS t1, acc3
MOVQ acc0, (8*0)(res_ptr)
MOVQ acc1, (8*1)(res_ptr)
MOVQ acc2, (8*2)(res_ptr)
MOVQ acc3, (8*3)(res_ptr)
MOVQ res_ptr, x_ptr
DECQ BX
JNE ordSqrLoop
RET
/* ---------------------------------------*/
#undef res_ptr
#undef x_ptr
#undef y_ptr
#undef acc0
#undef acc1
#undef acc2
#undef acc3
#undef acc4
#undef acc5
#undef t0
#undef t1
/* ---------------------------------------*/
#define mul0 AX
#define mul1 DX
#define acc0 BX
#define acc1 CX
#define acc2 R8
#define acc3 R9
#define acc4 R10
#define acc5 R11
#define acc6 R12
#define acc7 R13
#define t0 R14
#define t1 R15
#define t2 DI
#define t3 SI
#define hlp BP
/* ---------------------------------------*/
TEXT p256SubInternal(SB),NOSPLIT,$0
XORQ mul0, mul0
SUBQ t0, acc4
SBBQ t1, acc5
SBBQ t2, acc6
SBBQ t3, acc7
SBBQ $0, mul0
MOVQ acc4, acc0
MOVQ acc5, acc1
MOVQ acc6, acc2
MOVQ acc7, acc3
ADDQ $-1, acc4
ADCQ p256const0<>(SB), acc5
ADCQ $0, acc6
ADCQ p256const1<>(SB), acc7
ADCQ $0, mul0
CMOVQNE acc0, acc4
CMOVQNE acc1, acc5
CMOVQNE acc2, acc6
CMOVQNE acc3, acc7
RET
/* ---------------------------------------*/
TEXT p256MulInternal(SB),NOSPLIT,$0
MOVQ acc4, mul0
MULQ t0
MOVQ mul0, acc0
MOVQ mul1, acc1
MOVQ acc4, mul0
MULQ t1
ADDQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, acc2
MOVQ acc4, mul0
MULQ t2
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, acc3
MOVQ acc4, mul0
MULQ t3
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, acc4
MOVQ acc5, mul0
MULQ t0
ADDQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ acc5, mul0
MULQ t1
ADDQ hlp, acc2
ADCQ $0, mul1
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ acc5, mul0
MULQ t2
ADDQ hlp, acc3
ADCQ $0, mul1
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ acc5, mul0
MULQ t3
ADDQ hlp, acc4
ADCQ $0, mul1
ADDQ mul0, acc4
ADCQ $0, mul1
MOVQ mul1, acc5
MOVQ acc6, mul0
MULQ t0
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ acc6, mul0
MULQ t1
ADDQ hlp, acc3
ADCQ $0, mul1
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ acc6, mul0
MULQ t2
ADDQ hlp, acc4
ADCQ $0, mul1
ADDQ mul0, acc4
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ acc6, mul0
MULQ t3
ADDQ hlp, acc5
ADCQ $0, mul1
ADDQ mul0, acc5
ADCQ $0, mul1
MOVQ mul1, acc6
MOVQ acc7, mul0
MULQ t0
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ acc7, mul0
MULQ t1
ADDQ hlp, acc4
ADCQ $0, mul1
ADDQ mul0, acc4
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ acc7, mul0
MULQ t2
ADDQ hlp, acc5
ADCQ $0, mul1
ADDQ mul0, acc5
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ acc7, mul0
MULQ t3
ADDQ hlp, acc6
ADCQ $0, mul1
ADDQ mul0, acc6
ADCQ $0, mul1
MOVQ mul1, acc7
// First reduction step
MOVQ acc0, mul0
MOVQ acc0, hlp
SHLQ $32, acc0
MULQ p256const1<>(SB)
SHRQ $32, hlp
ADDQ acc0, acc1
ADCQ hlp, acc2
ADCQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, acc0
// Second reduction step
MOVQ acc1, mul0
MOVQ acc1, hlp
SHLQ $32, acc1
MULQ p256const1<>(SB)
SHRQ $32, hlp
ADDQ acc1, acc2
ADCQ hlp, acc3
ADCQ mul0, acc0
ADCQ $0, mul1
MOVQ mul1, acc1
// Third reduction step
MOVQ acc2, mul0
MOVQ acc2, hlp
SHLQ $32, acc2
MULQ p256const1<>(SB)
SHRQ $32, hlp
ADDQ acc2, acc3
ADCQ hlp, acc0
ADCQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, acc2
// Last reduction step
MOVQ acc3, mul0
MOVQ acc3, hlp
SHLQ $32, acc3
MULQ p256const1<>(SB)
SHRQ $32, hlp
ADDQ acc3, acc0
ADCQ hlp, acc1
ADCQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, acc3
BYTE $0x48; BYTE $0xc7; BYTE $0xc5; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00 // MOVQ $0, BP
// Add bits [511:256] of the result
ADCQ acc0, acc4
ADCQ acc1, acc5
ADCQ acc2, acc6
ADCQ acc3, acc7
ADCQ $0, hlp
// Copy result
MOVQ acc4, acc0
MOVQ acc5, acc1
MOVQ acc6, acc2
MOVQ acc7, acc3
// Subtract p256
SUBQ $-1, acc4
SBBQ p256const0<>(SB) ,acc5
SBBQ $0, acc6
SBBQ p256const1<>(SB), acc7
SBBQ $0, hlp
// If the result of the subtraction is negative, restore the previous result
CMOVQCS acc0, acc4
CMOVQCS acc1, acc5
CMOVQCS acc2, acc6
CMOVQCS acc3, acc7
RET
/* ---------------------------------------*/
TEXT p256SqrInternal(SB),NOSPLIT,$0
MOVQ acc4, mul0
MULQ acc5
MOVQ mul0, acc1
MOVQ mul1, acc2
MOVQ acc4, mul0
MULQ acc6
ADDQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, acc3
MOVQ acc4, mul0
MULQ acc7
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, t0
MOVQ acc5, mul0
MULQ acc6
ADDQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, hlp
MOVQ acc5, mul0
MULQ acc7
ADDQ hlp, t0
ADCQ $0, mul1
ADDQ mul0, t0
ADCQ $0, mul1
MOVQ mul1, t1
MOVQ acc6, mul0
MULQ acc7
ADDQ mul0, t1
ADCQ $0, mul1
MOVQ mul1, t2
XORQ t3, t3
// *2
ADDQ acc1, acc1
ADCQ acc2, acc2
ADCQ acc3, acc3
ADCQ t0, t0
ADCQ t1, t1
ADCQ t2, t2
ADCQ $0, t3
// Missing products
MOVQ acc4, mul0
MULQ mul0
MOVQ mul0, acc0
MOVQ DX, acc4
MOVQ acc5, mul0
MULQ mul0
ADDQ acc4, acc1
ADCQ mul0, acc2
ADCQ $0, DX
MOVQ DX, acc4
MOVQ acc6, mul0
MULQ mul0
ADDQ acc4, acc3
ADCQ mul0, t0
ADCQ $0, DX
MOVQ DX, acc4
MOVQ acc7, mul0
MULQ mul0
ADDQ acc4, t1
ADCQ mul0, t2
ADCQ DX, t3
// First reduction step
MOVQ acc0, mul0
MOVQ acc0, hlp
SHLQ $32, acc0
MULQ p256const1<>(SB)
SHRQ $32, hlp
ADDQ acc0, acc1
ADCQ hlp, acc2
ADCQ mul0, acc3
ADCQ $0, mul1
MOVQ mul1, acc0
// Second reduction step
MOVQ acc1, mul0
MOVQ acc1, hlp
SHLQ $32, acc1
MULQ p256const1<>(SB)
SHRQ $32, hlp
ADDQ acc1, acc2
ADCQ hlp, acc3
ADCQ mul0, acc0
ADCQ $0, mul1
MOVQ mul1, acc1
// Third reduction step
MOVQ acc2, mul0
MOVQ acc2, hlp
SHLQ $32, acc2
MULQ p256const1<>(SB)
SHRQ $32, hlp
ADDQ acc2, acc3
ADCQ hlp, acc0
ADCQ mul0, acc1
ADCQ $0, mul1
MOVQ mul1, acc2
// Last reduction step
MOVQ acc3, mul0
MOVQ acc3, hlp
SHLQ $32, acc3
MULQ p256const1<>(SB)
SHRQ $32, hlp
ADDQ acc3, acc0
ADCQ hlp, acc1
ADCQ mul0, acc2
ADCQ $0, mul1
MOVQ mul1, acc3
BYTE $0x48; BYTE $0xc7; BYTE $0xc5; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00 // MOVQ $0, BP
// Add bits [511:256] of the result
ADCQ acc0, t0
ADCQ acc1, t1
ADCQ acc2, t2
ADCQ acc3, t3
ADCQ $0, hlp
// Copy result
MOVQ t0, acc4
MOVQ t1, acc5
MOVQ t2, acc6
MOVQ t3, acc7
// Subtract p256
SUBQ $-1, acc4
SBBQ p256const0<>(SB) ,acc5
SBBQ $0, acc6
SBBQ p256const1<>(SB), acc7
SBBQ $0, hlp
// If the result of the subtraction is negative, restore the previous result
CMOVQCS t0, acc4
CMOVQCS t1, acc5
CMOVQCS t2, acc6
CMOVQCS t3, acc7
RET
/* ---------------------------------------*/
#define p256MulBy2Inline\
XORQ mul0, mul0;\
ADDQ acc4, acc4;\
ADCQ acc5, acc5;\
ADCQ acc6, acc6;\
ADCQ acc7, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, t0;\
SBBQ p256const0<>(SB), t1;\
SBBQ $0, t2;\
SBBQ p256const1<>(SB), t3;\
SBBQ $0, mul0;\
CMOVQCS acc4, t0;\
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
/* ---------------------------------------*/
#define p256AddInline \
XORQ mul0, mul0;\
ADDQ t0, acc4;\
ADCQ t1, acc5;\
ADCQ t2, acc6;\
ADCQ t3, acc7;\
ADCQ $0, mul0;\
MOVQ acc4, t0;\
MOVQ acc5, t1;\
MOVQ acc6, t2;\
MOVQ acc7, t3;\
SUBQ $-1, t0;\
SBBQ p256const0<>(SB), t1;\
SBBQ $0, t2;\
SBBQ p256const1<>(SB), t3;\
SBBQ $0, mul0;\
CMOVQCS acc4, t0;\
CMOVQCS acc5, t1;\
CMOVQCS acc6, t2;\
CMOVQCS acc7, t3;
/* ---------------------------------------*/
#define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7
#define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3
#define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3)
#define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3)
#define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3
#define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7
/* ---------------------------------------*/
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define xout(off) (32*5 + off)(SP)
#define yout(off) (32*6 + off)(SP)
#define zout(off) (32*7 + off)(SP)
#define s2(off) (32*8 + off)(SP)
#define z1sqr(off) (32*9 + off)(SP)
#define h(off) (32*10 + off)(SP)
#define r(off) (32*11 + off)(SP)
#define hsqr(off) (32*12 + off)(SP)
#define rsqr(off) (32*13 + off)(SP)
#define hcub(off) (32*14 + off)(SP)
#define rptr (32*15)(SP)
#define sel_save (32*15 + 8)(SP)
#define zero_save (32*15 + 8 + 4)(SP)
// func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
TEXT ·p256PointAddAffineAsm(SB),0,$512-96
// Move input to stack in order to free registers
MOVQ res+0(FP), AX
MOVQ in1+24(FP), BX
MOVQ in2+48(FP), CX
MOVQ sign+72(FP), DX
MOVQ sel+80(FP), t1
MOVQ zero+88(FP), t2
MOVOU (16*0)(BX), X0
MOVOU (16*1)(BX), X1
MOVOU (16*2)(BX), X2
MOVOU (16*3)(BX), X3
MOVOU (16*4)(BX), X4
MOVOU (16*5)(BX), X5
MOVOU X0, x1in(16*0)
MOVOU X1, x1in(16*1)
MOVOU X2, y1in(16*0)
MOVOU X3, y1in(16*1)
MOVOU X4, z1in(16*0)
MOVOU X5, z1in(16*1)
MOVOU (16*0)(CX), X0
MOVOU (16*1)(CX), X1
MOVOU X0, x2in(16*0)
MOVOU X1, x2in(16*1)
// Store pointer to result
MOVQ mul0, rptr
MOVL t1, sel_save
MOVL t2, zero_save
// Negate y2in based on sign
MOVQ (16*2 + 8*0)(CX), acc4
MOVQ (16*2 + 8*1)(CX), acc5
MOVQ (16*2 + 8*2)(CX), acc6
MOVQ (16*2 + 8*3)(CX), acc7
MOVQ $-1, acc0
MOVQ p256const0<>(SB), acc1
MOVQ $0, acc2
MOVQ p256const1<>(SB), acc3
XORQ mul0, mul0
// Speculatively subtract
SUBQ acc4, acc0
SBBQ acc5, acc1
SBBQ acc6, acc2
SBBQ acc7, acc3
SBBQ $0, mul0
MOVQ acc0, t0
MOVQ acc1, t1
MOVQ acc2, t2
MOVQ acc3, t3
// Add in case the operand was > p256
ADDQ $-1, acc0
ADCQ p256const0<>(SB), acc1
ADCQ $0, acc2
ADCQ p256const1<>(SB), acc3
ADCQ $0, mul0
CMOVQNE t0, acc0
CMOVQNE t1, acc1
CMOVQNE t2, acc2
CMOVQNE t3, acc3
// If condition is 0, keep original value
TESTQ DX, DX
CMOVQEQ acc4, acc0
CMOVQEQ acc5, acc1
CMOVQEQ acc6, acc2
CMOVQEQ acc7, acc3
// Store result
MOVQ acc0, y2in(8*0)
MOVQ acc1, y2in(8*1)
MOVQ acc2, y2in(8*2)
MOVQ acc3, y2in(8*3)
// Begin point add
LDacc (z1in)
CALL p256SqrInternal(SB) // z1ˆ2
ST (z1sqr)
LDt (x2in)
CALL p256MulInternal(SB) // x2 * z1ˆ2
LDt (x1in)
CALL p256SubInternal(SB) // h = u2 - u1
ST (h)
LDt (z1in)
CALL p256MulInternal(SB) // z3 = h * z1
ST (zout)
LDacc (z1sqr)
CALL p256MulInternal(SB) // z1ˆ3
LDt (y2in)
CALL p256MulInternal(SB) // s2 = y2 * z1ˆ3
ST (s2)
LDt (y1in)
CALL p256SubInternal(SB) // r = s2 - s1
ST (r)
CALL p256SqrInternal(SB) // rsqr = rˆ2
ST (rsqr)
LDacc (h)
CALL p256SqrInternal(SB) // hsqr = hˆ2
ST (hsqr)
LDt (h)
CALL p256MulInternal(SB) // hcub = hˆ3
ST (hcub)
LDt (y1in)
CALL p256MulInternal(SB) // y1 * hˆ3
ST (s2)
LDacc (x1in)
LDt (hsqr)
CALL p256MulInternal(SB) // u1 * hˆ2
ST (h)
p256MulBy2Inline // u1 * hˆ2 * 2, inline
LDacc (rsqr)
CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2
LDt (hcub)
CALL p256SubInternal(SB)
ST (xout)
MOVQ acc4, t0
MOVQ acc5, t1
MOVQ acc6, t2
MOVQ acc7, t3
LDacc (h)
CALL p256SubInternal(SB)
LDt (r)
CALL p256MulInternal(SB)
LDt (s2)
CALL p256SubInternal(SB)
ST (yout)
// Load stored values from stack
MOVQ rptr, AX
MOVL sel_save, BX
MOVL zero_save, CX
// The result is not valid if (sel == 0), conditional choose
MOVOU xout(16*0), X0
MOVOU xout(16*1), X1
MOVOU yout(16*0), X2
MOVOU yout(16*1), X3
MOVOU zout(16*0), X4
MOVOU zout(16*1), X5
MOVL BX, X6
MOVL CX, X7
PXOR X8, X8
PCMPEQL X9, X9
PSHUFD $0, X6, X6
PSHUFD $0, X7, X7
PCMPEQL X8, X6
PCMPEQL X8, X7
MOVOU X6, X15
PANDN X9, X15
MOVOU x1in(16*0), X9
MOVOU x1in(16*1), X10
MOVOU y1in(16*0), X11
MOVOU y1in(16*1), X12
MOVOU z1in(16*0), X13
MOVOU z1in(16*1), X14
PAND X15, X0
PAND X15, X1
PAND X15, X2
PAND X15, X3
PAND X15, X4
PAND X15, X5
PAND X6, X9
PAND X6, X10
PAND X6, X11
PAND X6, X12
PAND X6, X13
PAND X6, X14
PXOR X9, X0
PXOR X10, X1
PXOR X11, X2
PXOR X12, X3
PXOR X13, X4
PXOR X14, X5
// Similarly if zero == 0
PCMPEQL X9, X9
MOVOU X7, X15
PANDN X9, X15
MOVOU x2in(16*0), X9
MOVOU x2in(16*1), X10
MOVOU y2in(16*0), X11
MOVOU y2in(16*1), X12
MOVOU p256one<>+0x00(SB), X13
MOVOU p256one<>+0x10(SB), X14
PAND X15, X0
PAND X15, X1
PAND X15, X2
PAND X15, X3
PAND X15, X4
PAND X15, X5
PAND X7, X9
PAND X7, X10
PAND X7, X11
PAND X7, X12
PAND X7, X13
PAND X7, X14
PXOR X9, X0
PXOR X10, X1
PXOR X11, X2
PXOR X12, X3
PXOR X13, X4
PXOR X14, X5
// Finally output the result
MOVOU X0, (16*0)(AX)
MOVOU X1, (16*1)(AX)
MOVOU X2, (16*2)(AX)
MOVOU X3, (16*3)(AX)
MOVOU X4, (16*4)(AX)
MOVOU X5, (16*5)(AX)
MOVQ $0, rptr
RET
#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef xout
#undef yout
#undef zout
#undef s2
#undef z1sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
#undef sel_save
#undef zero_save
/* ---------------------------------------*/
#define x1in(off) (32*0 + off)(SP)
#define y1in(off) (32*1 + off)(SP)
#define z1in(off) (32*2 + off)(SP)
#define x2in(off) (32*3 + off)(SP)
#define y2in(off) (32*4 + off)(SP)
#define z2in(off) (32*5 + off)(SP)
#define xout(off) (32*6 + off)(SP)
#define yout(off) (32*7 + off)(SP)
#define zout(off) (32*8 + off)(SP)
#define u1(off) (32*9 + off)(SP)
#define u2(off) (32*10 + off)(SP)
#define s1(off) (32*11 + off)(SP)
#define s2(off) (32*12 + off)(SP)
#define z1sqr(off) (32*13 + off)(SP)
#define z2sqr(off) (32*14 + off)(SP)
#define h(off) (32*15 + off)(SP)
#define r(off) (32*16 + off)(SP)
#define hsqr(off) (32*17 + off)(SP)
#define rsqr(off) (32*18 + off)(SP)
#define hcub(off) (32*19 + off)(SP)
#define rptr (32*20)(SP)
//func p256PointAddAsm(res, in1, in2 []uint64)
TEXT ·p256PointAddAsm(SB),0,$672-72
// Move input to stack in order to free registers
MOVQ res+0(FP), AX
MOVQ in1+24(FP), BX
MOVQ in2+48(FP), CX
MOVOU (16*0)(BX), X0
MOVOU (16*1)(BX), X1
MOVOU (16*2)(BX), X2
MOVOU (16*3)(BX), X3
MOVOU (16*4)(BX), X4
MOVOU (16*5)(BX), X5
MOVOU X0, x1in(16*0)
MOVOU X1, x1in(16*1)
MOVOU X2, y1in(16*0)
MOVOU X3, y1in(16*1)
MOVOU X4, z1in(16*0)
MOVOU X5, z1in(16*1)
MOVOU (16*0)(CX), X0
MOVOU (16*1)(CX), X1
MOVOU (16*2)(CX), X2
MOVOU (16*3)(CX), X3
MOVOU (16*4)(CX), X4
MOVOU (16*5)(CX), X5
MOVOU X0, x2in(16*0)
MOVOU X1, x2in(16*1)
MOVOU X2, y2in(16*0)
MOVOU X3, y2in(16*1)
MOVOU X4, z2in(16*0)
MOVOU X5, z2in(16*1)
// Store pointer to result
MOVQ AX, rptr
// Begin point add
LDacc (z2in)
CALL p256SqrInternal(SB) // z2ˆ2
ST (z2sqr)
LDt (z2in)
CALL p256MulInternal(SB) // z2ˆ3
LDt (y1in)
CALL p256MulInternal(SB) // s1 = z2ˆ3*y1
ST (s1)
LDacc (z1in)
CALL p256SqrInternal(SB) // z1ˆ2
ST (z1sqr)
LDt (z1in)
CALL p256MulInternal(SB) // z1ˆ3
LDt (y2in)
CALL p256MulInternal(SB) // s2 = z1ˆ3*y2
ST (s2)
LDt (s1)
CALL p256SubInternal(SB) // r = s2 - s1
ST (r)
LDacc (z2sqr)
LDt (x1in)
CALL p256MulInternal(SB) // u1 = x1 * z2ˆ2
ST (u1)
LDacc (z1sqr)
LDt (x2in)
CALL p256MulInternal(SB) // u2 = x2 * z1ˆ2
ST (u2)
LDt (u1)
CALL p256SubInternal(SB) // h = u2 - u1
ST (h)
LDacc (r)
CALL p256SqrInternal(SB) // rsqr = rˆ2
ST (rsqr)
LDacc (h)
CALL p256SqrInternal(SB) // hsqr = hˆ2
ST (hsqr)
LDt (h)
CALL p256MulInternal(SB) // hcub = hˆ3
ST (hcub)
LDt (s1)
CALL p256MulInternal(SB)
ST (s2)
LDacc (z1in)
LDt (z2in)
CALL p256MulInternal(SB) // z1 * z2
LDt (h)
CALL p256MulInternal(SB) // z1 * z2 * h
ST (zout)
LDacc (hsqr)
LDt (u1)
CALL p256MulInternal(SB) // hˆ2 * u1
ST (u2)
p256MulBy2Inline // u1 * hˆ2 * 2, inline
LDacc (rsqr)
CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2
LDt (hcub)
CALL p256SubInternal(SB)
ST (xout)
MOVQ acc4, t0
MOVQ acc5, t1
MOVQ acc6, t2
MOVQ acc7, t3
LDacc (u2)
CALL p256SubInternal(SB)
LDt (r)
CALL p256MulInternal(SB)
LDt (s2)
CALL p256SubInternal(SB)
ST (yout)
MOVOU xout(16*0), X0
MOVOU xout(16*1), X1
MOVOU yout(16*0), X2
MOVOU yout(16*1), X3
MOVOU zout(16*0), X4
MOVOU zout(16*1), X5
// Finally output the result
MOVQ rptr, AX
MOVQ $0, rptr
MOVOU X0, (16*0)(AX)
MOVOU X1, (16*1)(AX)
MOVOU X2, (16*2)(AX)
MOVOU X3, (16*3)(AX)
MOVOU X4, (16*4)(AX)
MOVOU X5, (16*5)(AX)
RET
#undef x1in
#undef y1in
#undef z1in
#undef x2in
#undef y2in
#undef z2in
#undef xout
#undef yout
#undef zout
#undef s1
#undef s2
#undef u1
#undef u2
#undef z1sqr
#undef z2sqr
#undef h
#undef r
#undef hsqr
#undef rsqr
#undef hcub
#undef rptr
/* ---------------------------------------*/
#define x(off) (32*0 + off)(SP)
#define y(off) (32*1 + off)(SP)
#define z(off) (32*2 + off)(SP)
#define s(off) (32*3 + off)(SP)
#define m(off) (32*4 + off)(SP)
#define zsqr(off) (32*5 + off)(SP)
#define tmp(off) (32*6 + off)(SP)
#define rptr (32*7)(SP)
//func p256PointDoubleAsm(res, in []uint64)
TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48
// Move input to stack in order to free registers
MOVQ res+0(FP), AX
MOVQ in+24(FP), BX
MOVOU (16*0)(BX), X0
MOVOU (16*1)(BX), X1
MOVOU (16*2)(BX), X2
MOVOU (16*3)(BX), X3
MOVOU (16*4)(BX), X4
MOVOU (16*5)(BX), X5
MOVOU X0, x(16*0)
MOVOU X1, x(16*1)
MOVOU X2, y(16*0)
MOVOU X3, y(16*1)
MOVOU X4, z(16*0)
MOVOU X5, z(16*1)
// Store pointer to result
MOVQ AX, rptr
// Begin point double
LDacc (z)
CALL p256SqrInternal(SB)
ST (zsqr)
LDt (x)
p256AddInline
STt (m)
LDacc (z)
LDt (y)
CALL p256MulInternal(SB)
p256MulBy2Inline
MOVQ rptr, AX
// Store z
MOVQ t0, (16*4 + 8*0)(AX)
MOVQ t1, (16*4 + 8*1)(AX)
MOVQ t2, (16*4 + 8*2)(AX)
MOVQ t3, (16*4 + 8*3)(AX)
LDacc (x)
LDt (zsqr)
CALL p256SubInternal(SB)
LDt (m)
CALL p256MulInternal(SB)
ST (m)
// Multiply by 3
p256MulBy2Inline
LDacc (m)
p256AddInline
STt (m)
////////////////////////
LDacc (y)
p256MulBy2Inline
t2acc
CALL p256SqrInternal(SB)
ST (s)
CALL p256SqrInternal(SB)
// Divide by 2
XORQ mul0, mul0
MOVQ acc4, t0
MOVQ acc5, t1
MOVQ acc6, t2
MOVQ acc7, t3
ADDQ $-1, acc4
ADCQ p256const0<>(SB), acc5
ADCQ $0, acc6
ADCQ p256const1<>(SB), acc7
ADCQ $0, mul0
TESTQ $1, t0
CMOVQEQ t0, acc4
CMOVQEQ t1, acc5
CMOVQEQ t2, acc6
CMOVQEQ t3, acc7
ANDQ t0, mul0
SHRQ $1, acc4:acc5
SHRQ $1, acc5:acc6
SHRQ $1, acc6:acc7
SHRQ $1, acc7:mul0
ST (y)
/////////////////////////
LDacc (x)
LDt (s)
CALL p256MulInternal(SB)
ST (s)
p256MulBy2Inline
STt (tmp)
LDacc (m)
CALL p256SqrInternal(SB)
LDt (tmp)
CALL p256SubInternal(SB)
MOVQ rptr, AX
// Store x
MOVQ acc4, (16*0 + 8*0)(AX)
MOVQ acc5, (16*0 + 8*1)(AX)
MOVQ acc6, (16*0 + 8*2)(AX)
MOVQ acc7, (16*0 + 8*3)(AX)
acc2t
LDacc (s)
CALL p256SubInternal(SB)
LDt (m)
CALL p256MulInternal(SB)
LDt (y)
CALL p256SubInternal(SB)
MOVQ rptr, AX
// Store y
MOVQ acc4, (16*2 + 8*0)(AX)
MOVQ acc5, (16*2 + 8*1)(AX)
MOVQ acc6, (16*2 + 8*2)(AX)
MOVQ acc7, (16*2 + 8*3)(AX)
///////////////////////
MOVQ $0, rptr
RET
/* ---------------------------------------*/
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment