Commit 854ae03d authored by Mike Strosaker's avatar Mike Strosaker Committed by David Chase

crypto/sha512: improve performance for sha512.block on ppc64le

Adds an assembly implementation of sha512.block for ppc64le to improve its
performance.  This implementation is largely based on the original amd64
implementation, unrolling the 80 iterations of the inner loop.

Fixes #17660

benchmark               old ns/op     new ns/op     delta
BenchmarkHash8Bytes     1715          1133          -33.94%
BenchmarkHash1K         10098         5513          -45.41%
BenchmarkHash8K         68004         35278         -48.12%

benchmark               old MB/s     new MB/s     speedup
BenchmarkHash8Bytes     4.66         7.06         1.52x
BenchmarkHash1K         101.40       185.72       1.83x
BenchmarkHash8K         120.46       232.21       1.93x

Change-Id: Ifd55a49a24cb159b3a09a8e928c3f37727aca103
Reviewed-on: https://go-review.googlesource.com/32320Reviewed-by: default avatarCarlos Eduardo Seo <cseo@linux.vnet.ibm.com>
Reviewed-by: default avatarDavid Chase <drchase@google.com>
Run-TryBot: David Chase <drchase@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent 051cf380
......@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build amd64 s390x
// +build amd64 s390x ppc64le
package sha512
......
......@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64,!s390x
// +build !amd64,!s390x,!ppc64le
package sha512
......
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
#include "textflag.h"
// SHA512 block routine. See sha512block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
// http://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
//
// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 79 {
// T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
// T2 = BIGSIGMA0(a) + Maj(a,b,c)
// h = g
// g = f
// f = e
// e = d + T1
// d = c
// c = b
// b = a
// a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7
// Wt = Mt; for 0 <= t <= 15
#define MSGSCHEDULE0(index) \
MOVD (index*8)(R6), R14; \
RLWNM $24, R14, $-1, R21; \
RLWMI $8, R14, $0x00FF0000, R21; \
RLWMI $8, R14, $0x000000FF, R21; \
SLD $32, R21; \
SRD $32, R14, R20; \
RLWNM $24, R20, $-1, R14; \
RLWMI $8, R20, $0x00FF0000, R14; \
RLWMI $8, R20, $0x000000FF, R14; \
OR R21, R14; \
MOVD R14, (index*8)(R9)
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
// SIGMA0(x) = ROTR(1,x) XOR ROTR(8,x) XOR SHR(7,x)
// SIGMA1(x) = ROTR(19,x) XOR ROTR(61,x) XOR SHR(6,x)
#define MSGSCHEDULE1(index) \
MOVD ((index-2)*8)(R9), R14; \
MOVD R14, R16; \
RLDCL $64-19, R14, $-1, R14; \
MOVD R16, R17; \
RLDCL $64-61, R16, $-1, R16; \
SRD $6, R17; \
MOVD ((index-15)*8)(R9), R15; \
XOR R16, R14; \
MOVD R15, R16; \
XOR R17, R14; \
RLDCL $64-1, R15, $-1, R15; \
MOVD R16, R17; \
SRD $7, R17; \
RLDCL $64-8, R16, $-1, R16; \
MOVD ((index-7)*8)(R9), R21; \
ADD R21, R14; \
XOR R16, R15; \
XOR R17, R15; \
MOVD ((index-16)*8)(R9), R21; \
ADD R21, R15; \
ADD R15, R14; \
MOVD R14, ((index)*8)(R9)
// Calculate T1 in R14 - uses R14, R16 and R17 registers.
// h is also used as an accumulator. Wt is passed in R14.
// T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + Kt + Wt
// BIGSIGMA1(x) = ROTR(14,x) XOR ROTR(18,x) XOR ROTR(41,x)
// Ch(x, y, z) = (x AND y) XOR (NOT x AND z)
#define SHA512T1(const, e, f, g, h) \
MOVD $const, R17; \
ADD R14, h; \
MOVD e, R14; \
ADD R17, h; \
MOVD e, R16; \
RLDCL $64-14, R14, $-1, R14; \
MOVD e, R17; \
RLDCL $64-18, R16, $-1, R16; \
XOR R16, R14; \
MOVD e, R16; \
RLDCL $64-41, R17, $-1, R17; \
AND f, R16; \
XOR R14, R17; \
MOVD e, R14; \
NOR R14, R14, R14; \
ADD R17, h; \
AND g, R14; \
XOR R16, R14; \
ADD h, R14
// Calculate T2 in R15 - uses R15, R16, R17 and R8 registers.
// T2 = BIGSIGMA0(a) + Maj(a, b, c)
// BIGSIGMA0(x) = ROTR(28,x) XOR ROTR(34,x) XOR ROTR(39,x)
// Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
#define SHA512T2(a, b, c) \
MOVD a, R8; \
MOVD c, R15; \
RLDCL $64-28, R8, $-1, R8; \
MOVD a, R17; \
AND b, R15; \
RLDCL $64-34, R17, $-1, R17; \
MOVD a, R16; \
AND c, R16; \
XOR R17, R8; \
XOR R16, R15; \
MOVD a, R17; \
MOVD b, R16; \
RLDCL $64-39, R17, $-1, R17; \
AND a, R16; \
XOR R16, R15; \
XOR R17, R8; \
ADD R8, R15
// Calculate T1 and T2, then e = d + T1 and a = T1 + T2.
// The values for e and a are stored in d and h, ready for rotation.
#define SHA512ROUND(index, const, a, b, c, d, e, f, g, h) \
SHA512T1(const, e, f, g, h); \
SHA512T2(a, b, c); \
MOVD R15, h; \
ADD R14, d; \
ADD R14, h
#define SHA512ROUND0(index, const, a, b, c, d, e, f, g, h) \
MSGSCHEDULE0(index); \
SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
#define SHA512ROUND1(index, const, a, b, c, d, e, f, g, h) \
MSGSCHEDULE1(index); \
SHA512ROUND(index, const, a, b, c, d, e, f, g, h)
// func block(dig *digest, p []byte)
TEXT ·block(SB),0,$680-32
MOVD p_base+8(FP), R6
MOVD p_len+16(FP), R7
SRD $7, R7
SLD $7, R7
ADD R6, R7, R8
MOVD R8, 640(R1)
CMP R6, R8
BEQ end
MOVD dig+0(FP), R9
MOVD (0*8)(R9), R22 // a = H0
MOVD (1*8)(R9), R23 // b = H1
MOVD (2*8)(R9), R24 // c = H2
MOVD (3*8)(R9), R25 // d = H3
MOVD (4*8)(R9), R26 // e = H4
MOVD (5*8)(R9), R27 // f = H5
MOVD (6*8)(R9), R28 // g = H6
MOVD (7*8)(R9), R29 // h = H7
loop:
MOVD R1, R9 // R9: message schedule
SHA512ROUND0(0, 0x428a2f98d728ae22, R22, R23, R24, R25, R26, R27, R28, R29)
SHA512ROUND0(1, 0x7137449123ef65cd, R29, R22, R23, R24, R25, R26, R27, R28)
SHA512ROUND0(2, 0xb5c0fbcfec4d3b2f, R28, R29, R22, R23, R24, R25, R26, R27)
SHA512ROUND0(3, 0xe9b5dba58189dbbc, R27, R28, R29, R22, R23, R24, R25, R26)
SHA512ROUND0(4, 0x3956c25bf348b538, R26, R27, R28, R29, R22, R23, R24, R25)
SHA512ROUND0(5, 0x59f111f1b605d019, R25, R26, R27, R28, R29, R22, R23, R24)
SHA512ROUND0(6, 0x923f82a4af194f9b, R24, R25, R26, R27, R28, R29, R22, R23)
SHA512ROUND0(7, 0xab1c5ed5da6d8118, R23, R24, R25, R26, R27, R28, R29, R22)
SHA512ROUND0(8, 0xd807aa98a3030242, R22, R23, R24, R25, R26, R27, R28, R29)
SHA512ROUND0(9, 0x12835b0145706fbe, R29, R22, R23, R24, R25, R26, R27, R28)
SHA512ROUND0(10, 0x243185be4ee4b28c, R28, R29, R22, R23, R24, R25, R26, R27)
SHA512ROUND0(11, 0x550c7dc3d5ffb4e2, R27, R28, R29, R22, R23, R24, R25, R26)
SHA512ROUND0(12, 0x72be5d74f27b896f, R26, R27, R28, R29, R22, R23, R24, R25)
SHA512ROUND0(13, 0x80deb1fe3b1696b1, R25, R26, R27, R28, R29, R22, R23, R24)
SHA512ROUND0(14, 0x9bdc06a725c71235, R24, R25, R26, R27, R28, R29, R22, R23)
SHA512ROUND0(15, 0xc19bf174cf692694, R23, R24, R25, R26, R27, R28, R29, R22)
SHA512ROUND1(16, 0xe49b69c19ef14ad2, R22, R23, R24, R25, R26, R27, R28, R29)
SHA512ROUND1(17, 0xefbe4786384f25e3, R29, R22, R23, R24, R25, R26, R27, R28)
SHA512ROUND1(18, 0x0fc19dc68b8cd5b5, R28, R29, R22, R23, R24, R25, R26, R27)
SHA512ROUND1(19, 0x240ca1cc77ac9c65, R27, R28, R29, R22, R23, R24, R25, R26)
SHA512ROUND1(20, 0x2de92c6f592b0275, R26, R27, R28, R29, R22, R23, R24, R25)
SHA512ROUND1(21, 0x4a7484aa6ea6e483, R25, R26, R27, R28, R29, R22, R23, R24)
SHA512ROUND1(22, 0x5cb0a9dcbd41fbd4, R24, R25, R26, R27, R28, R29, R22, R23)
SHA512ROUND1(23, 0x76f988da831153b5, R23, R24, R25, R26, R27, R28, R29, R22)
SHA512ROUND1(24, 0x983e5152ee66dfab, R22, R23, R24, R25, R26, R27, R28, R29)
SHA512ROUND1(25, 0xa831c66d2db43210, R29, R22, R23, R24, R25, R26, R27, R28)
SHA512ROUND1(26, 0xb00327c898fb213f, R28, R29, R22, R23, R24, R25, R26, R27)
SHA512ROUND1(27, 0xbf597fc7beef0ee4, R27, R28, R29, R22, R23, R24, R25, R26)
SHA512ROUND1(28, 0xc6e00bf33da88fc2, R26, R27, R28, R29, R22, R23, R24, R25)
SHA512ROUND1(29, 0xd5a79147930aa725, R25, R26, R27, R28, R29, R22, R23, R24)
SHA512ROUND1(30, 0x06ca6351e003826f, R24, R25, R26, R27, R28, R29, R22, R23)
SHA512ROUND1(31, 0x142929670a0e6e70, R23, R24, R25, R26, R27, R28, R29, R22)
SHA512ROUND1(32, 0x27b70a8546d22ffc, R22, R23, R24, R25, R26, R27, R28, R29)
SHA512ROUND1(33, 0x2e1b21385c26c926, R29, R22, R23, R24, R25, R26, R27, R28)
SHA512ROUND1(34, 0x4d2c6dfc5ac42aed, R28, R29, R22, R23, R24, R25, R26, R27)
SHA512ROUND1(35, 0x53380d139d95b3df, R27, R28, R29, R22, R23, R24, R25, R26)
SHA512ROUND1(36, 0x650a73548baf63de, R26, R27, R28, R29, R22, R23, R24, R25)
SHA512ROUND1(37, 0x766a0abb3c77b2a8, R25, R26, R27, R28, R29, R22, R23, R24)
SHA512ROUND1(38, 0x81c2c92e47edaee6, R24, R25, R26, R27, R28, R29, R22, R23)
SHA512ROUND1(39, 0x92722c851482353b, R23, R24, R25, R26, R27, R28, R29, R22)
SHA512ROUND1(40, 0xa2bfe8a14cf10364, R22, R23, R24, R25, R26, R27, R28, R29)
SHA512ROUND1(41, 0xa81a664bbc423001, R29, R22, R23, R24, R25, R26, R27, R28)
SHA512ROUND1(42, 0xc24b8b70d0f89791, R28, R29, R22, R23, R24, R25, R26, R27)
SHA512ROUND1(43, 0xc76c51a30654be30, R27, R28, R29, R22, R23, R24, R25, R26)
SHA512ROUND1(44, 0xd192e819d6ef5218, R26, R27, R28, R29, R22, R23, R24, R25)
SHA512ROUND1(45, 0xd69906245565a910, R25, R26, R27, R28, R29, R22, R23, R24)
SHA512ROUND1(46, 0xf40e35855771202a, R24, R25, R26, R27, R28, R29, R22, R23)
SHA512ROUND1(47, 0x106aa07032bbd1b8, R23, R24, R25, R26, R27, R28, R29, R22)
SHA512ROUND1(48, 0x19a4c116b8d2d0c8, R22, R23, R24, R25, R26, R27, R28, R29)
SHA512ROUND1(49, 0x1e376c085141ab53, R29, R22, R23, R24, R25, R26, R27, R28)
SHA512ROUND1(50, 0x2748774cdf8eeb99, R28, R29, R22, R23, R24, R25, R26, R27)
SHA512ROUND1(51, 0x34b0bcb5e19b48a8, R27, R28, R29, R22, R23, R24, R25, R26)
SHA512ROUND1(52, 0x391c0cb3c5c95a63, R26, R27, R28, R29, R22, R23, R24, R25)
SHA512ROUND1(53, 0x4ed8aa4ae3418acb, R25, R26, R27, R28, R29, R22, R23, R24)
SHA512ROUND1(54, 0x5b9cca4f7763e373, R24, R25, R26, R27, R28, R29, R22, R23)
SHA512ROUND1(55, 0x682e6ff3d6b2b8a3, R23, R24, R25, R26, R27, R28, R29, R22)
SHA512ROUND1(56, 0x748f82ee5defb2fc, R22, R23, R24, R25, R26, R27, R28, R29)
SHA512ROUND1(57, 0x78a5636f43172f60, R29, R22, R23, R24, R25, R26, R27, R28)
SHA512ROUND1(58, 0x84c87814a1f0ab72, R28, R29, R22, R23, R24, R25, R26, R27)
SHA512ROUND1(59, 0x8cc702081a6439ec, R27, R28, R29, R22, R23, R24, R25, R26)
SHA512ROUND1(60, 0x90befffa23631e28, R26, R27, R28, R29, R22, R23, R24, R25)
SHA512ROUND1(61, 0xa4506cebde82bde9, R25, R26, R27, R28, R29, R22, R23, R24)
SHA512ROUND1(62, 0xbef9a3f7b2c67915, R24, R25, R26, R27, R28, R29, R22, R23)
SHA512ROUND1(63, 0xc67178f2e372532b, R23, R24, R25, R26, R27, R28, R29, R22)
SHA512ROUND1(64, 0xca273eceea26619c, R22, R23, R24, R25, R26, R27, R28, R29)
SHA512ROUND1(65, 0xd186b8c721c0c207, R29, R22, R23, R24, R25, R26, R27, R28)
SHA512ROUND1(66, 0xeada7dd6cde0eb1e, R28, R29, R22, R23, R24, R25, R26, R27)
SHA512ROUND1(67, 0xf57d4f7fee6ed178, R27, R28, R29, R22, R23, R24, R25, R26)
SHA512ROUND1(68, 0x06f067aa72176fba, R26, R27, R28, R29, R22, R23, R24, R25)
SHA512ROUND1(69, 0x0a637dc5a2c898a6, R25, R26, R27, R28, R29, R22, R23, R24)
SHA512ROUND1(70, 0x113f9804bef90dae, R24, R25, R26, R27, R28, R29, R22, R23)
SHA512ROUND1(71, 0x1b710b35131c471b, R23, R24, R25, R26, R27, R28, R29, R22)
SHA512ROUND1(72, 0x28db77f523047d84, R22, R23, R24, R25, R26, R27, R28, R29)
SHA512ROUND1(73, 0x32caab7b40c72493, R29, R22, R23, R24, R25, R26, R27, R28)
SHA512ROUND1(74, 0x3c9ebe0a15c9bebc, R28, R29, R22, R23, R24, R25, R26, R27)
SHA512ROUND1(75, 0x431d67c49c100d4c, R27, R28, R29, R22, R23, R24, R25, R26)
SHA512ROUND1(76, 0x4cc5d4becb3e42b6, R26, R27, R28, R29, R22, R23, R24, R25)
SHA512ROUND1(77, 0x597f299cfc657e2a, R25, R26, R27, R28, R29, R22, R23, R24)
SHA512ROUND1(78, 0x5fcb6fab3ad6faec, R24, R25, R26, R27, R28, R29, R22, R23)
SHA512ROUND1(79, 0x6c44198c4a475817, R23, R24, R25, R26, R27, R28, R29, R22)
MOVD dig+0(FP), R9
MOVD (0*8)(R9), R21
ADD R21, R22 // H0 = a + H0
MOVD R22, (0*8)(R9)
MOVD (1*8)(R9), R21
ADD R21, R23 // H1 = b + H1
MOVD R23, (1*8)(R9)
MOVD (2*8)(R9), R21
ADD R21, R24 // H2 = c + H2
MOVD R24, (2*8)(R9)
MOVD (3*8)(R9), R21
ADD R21, R25 // H3 = d + H3
MOVD R25, (3*8)(R9)
MOVD (4*8)(R9), R21
ADD R21, R26 // H4 = e + H4
MOVD R26, (4*8)(R9)
MOVD (5*8)(R9), R21
ADD R21, R27 // H5 = f + H5
MOVD R27, (5*8)(R9)
MOVD (6*8)(R9), R21
ADD R21, R28 // H6 = g + H6
MOVD R28, (6*8)(R9)
MOVD (7*8)(R9), R21
ADD R21, R29 // H7 = h + H7
MOVD R29, (7*8)(R9)
ADD $128, R6
MOVD 640(R1), R21
CMPU R6, R21
BLT loop
end:
RET
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment