Commit e7edc7e2 authored by Adam Langley

vendor: update golang.org/x/crypto/chacha20poly1305

This change updates the vendored chacha20poly1305 package to match
revision 14f9af67c679edd414f72f13d67c917447113df2 of x/crypto.

Change-Id: I05a4ba86578b0f0cdb1ed7dd50fee3b38bb48cf5
Reviewed-on: https://go-review.googlesource.com/31312
Run-TryBot: Adam Langley <agl@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent bb4c40b2
...@@ -82,82 +82,82 @@ ...@@ -82,82 +82,82 @@
#define TT2 BB3 #define TT2 BB3
#define TT3 CC3 #define TT3 CC3
// ChaCha20 constants // ChaCha20 constants
DATA chacha20Constants<>+0x00(SB)/4, $0x61707865 DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
DATA chacha20Constants<>+0x04(SB)/4, $0x3320646e DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
DATA chacha20Constants<>+0x08(SB)/4, $0x79622d32 DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
DATA chacha20Constants<>+0x0c(SB)/4, $0x6b206574 DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
DATA chacha20Constants<>+0x10(SB)/4, $0x61707865 DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
DATA chacha20Constants<>+0x14(SB)/4, $0x3320646e DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
DATA chacha20Constants<>+0x18(SB)/4, $0x79622d32 DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
DATA chacha20Constants<>+0x1c(SB)/4, $0x6b206574 DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
// <<< 16 with PSHUFB // <<< 16 with PSHUFB
DATA rol16<>+0x00(SB)/8, $0x0504070601000302 DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
DATA rol16<>+0x10(SB)/8, $0x0504070601000302 DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
DATA rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
// <<< 8 with PSHUFB // <<< 8 with PSHUFB
DATA rol8<>+0x00(SB)/8, $0x0605040702010003 DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
DATA rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA rol8<>+0x10(SB)/8, $0x0605040702010003 DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
DATA rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
DATA avx2InitMask<>+0x00(SB)/8, $0x0 DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
DATA avx2InitMask<>+0x08(SB)/8, $0x0 DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
DATA avx2InitMask<>+0x10(SB)/8, $0x1 DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
DATA avx2InitMask<>+0x18(SB)/8, $0x0 DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
DATA avx2IncMask<>+0x00(SB)/8, $0x2 DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
DATA avx2IncMask<>+0x08(SB)/8, $0x0 DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
DATA avx2IncMask<>+0x10(SB)/8, $0x2 DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
DATA avx2IncMask<>+0x18(SB)/8, $0x0 DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
// Poly1305 key clamp // Poly1305 key clamp
DATA polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
DATA polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
DATA polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA sseIncMask<>+0x00(SB)/8, $0x1 DATA ·sseIncMask<>+0x00(SB)/8, $0x1
DATA sseIncMask<>+0x08(SB)/8, $0x0 DATA ·sseIncMask<>+0x08(SB)/8, $0x0
// To load/store the last < 16 bytes in a buffer // To load/store the last < 16 bytes in a buffer
DATA andMask<>+0x00(SB)/8, $0x00000000000000ff DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
DATA andMask<>+0x08(SB)/8, $0x0000000000000000 DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
DATA andMask<>+0x10(SB)/8, $0x000000000000ffff DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
DATA andMask<>+0x18(SB)/8, $0x0000000000000000 DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
DATA andMask<>+0x28(SB)/8, $0x0000000000000000 DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
DATA andMask<>+0x38(SB)/8, $0x0000000000000000 DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
DATA andMask<>+0x48(SB)/8, $0x0000000000000000 DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0x58(SB)/8, $0x0000000000000000 DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
DATA andMask<>+0x68(SB)/8, $0x0000000000000000 DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x78(SB)/8, $0x0000000000000000 DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x88(SB)/8, $0x00000000000000ff DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x98(SB)/8, $0x000000000000ffff DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
GLOBL chacha20Constants<>(SB), (NOPTR+RODATA), $32 GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
GLOBL rol16<>(SB), (NOPTR+RODATA), $32 GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
GLOBL rol8<>(SB), (NOPTR+RODATA), $32 GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
GLOBL sseIncMask<>(SB), (NOPTR+RODATA), $16 GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
GLOBL avx2IncMask<>(SB), (NOPTR+RODATA), $32 GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
GLOBL avx2InitMask<>(SB), (NOPTR+RODATA), $32 GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
GLOBL polyClampMask<>(SB), (NOPTR+RODATA), $32 GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
GLOBL andMask<>(SB), (NOPTR+RODATA), $240 GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
// No PALIGNR in Go ASM yet (but VPALIGNR is present). // No PALIGNR in Go ASM yet (but VPALIGNR is present).
#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3 #define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4 #define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
...@@ -185,15 +185,15 @@ GLOBL andMask<>(SB), (NOPTR+RODATA), $240 ...@@ -185,15 +185,15 @@ GLOBL andMask<>(SB), (NOPTR+RODATA), $240
#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15 #define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
// Some macros // Some macros
#define chachaQR(A, B, C, D, T) \ #define chachaQR(A, B, C, D, T) \
PADDD B, A; PXOR A, D; PSHUFB rol16<>(SB), D \ PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D \
PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \ PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
PADDD B, A; PXOR A, D; PSHUFB rol8<>(SB), D \ PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D \
PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
#define chachaQR_AVX2(A, B, C, D, T) \ #define chachaQR_AVX2(A, B, C, D, T) \
VPADDD B, A, A; VPXOR A, D, D; VPSHUFB rol16<>(SB), D, D \ VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \
VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \ VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
VPADDD B, A, A; VPXOR A, D, D; VPSHUFB rol8<>(SB), D, D \ VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \
VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2 #define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
...@@ -286,7 +286,7 @@ TEXT ·chacha20Poly1305Open(SB), 0, $288-97 ...@@ -286,7 +286,7 @@ TEXT ·chacha20Poly1305Open(SB), 0, $288-97
JBE openSSE128 // About 16% faster JBE openSSE128 // About 16% faster
// For long buffers, prepare the poly key first // For long buffers, prepare the poly key first
MOVOU chacha20Constants<>(SB), A0 MOVOU ·chacha20Constants<>(SB), A0
MOVOU (1*16)(keyp), B0 MOVOU (1*16)(keyp), B0
MOVOU (2*16)(keyp), C0 MOVOU (2*16)(keyp), C0
MOVOU (3*16)(keyp), D0 MOVOU (3*16)(keyp), D0
...@@ -307,10 +307,10 @@ openSSEPreparePolyKey: ...@@ -307,10 +307,10 @@ openSSEPreparePolyKey:
JNE openSSEPreparePolyKey JNE openSSEPreparePolyKey
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
PADDL chacha20Constants<>(SB), A0; PADDL state1Store, B0 PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
// Clamp and store the key // Clamp and store the key
PAND polyClampMask<>(SB), A0 PAND ·polyClampMask<>(SB), A0
MOVO A0, rStore; MOVO B0, sStore MOVO A0, rStore; MOVO B0, sStore
// Hash AAD // Hash AAD
...@@ -322,10 +322,10 @@ openSSEMainLoop: ...@@ -322,10 +322,10 @@ openSSEMainLoop:
JB openSSEMainLoopDone JB openSSEMainLoopDone
// Load state, increment counter blocks // Load state, increment counter blocks
MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
// Store counters // Store counters
MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
...@@ -370,7 +370,7 @@ openSSEInternalLoop: ...@@ -370,7 +370,7 @@ openSSEInternalLoop:
JG openSSEInternalLoop JG openSSEInternalLoop
// Add in the state // Add in the state
PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
...@@ -446,9 +446,9 @@ openSSEFinalize: ...@@ -446,9 +446,9 @@ openSSEFinalize:
// Special optimization for buffers smaller than 129 bytes // Special optimization for buffers smaller than 129 bytes
openSSE128: openSSE128:
// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
MOVOU chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
MOVQ $10, itr2 MOVQ $10, itr2
...@@ -465,13 +465,13 @@ openSSE128InnerCipherLoop: ...@@ -465,13 +465,13 @@ openSSE128InnerCipherLoop:
JNE openSSE128InnerCipherLoop JNE openSSE128InnerCipherLoop
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
PADDL T2, C1; PADDL T2, C2 PADDL T2, C1; PADDL T2, C2
PADDL T3, D1; PADDL sseIncMask<>(SB), T3; PADDL T3, D2 PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
// Clamp and store the key // Clamp and store the key
PAND polyClampMask<>(SB), A0 PAND ·polyClampMask<>(SB), A0
MOVOU A0, rStore; MOVOU B0, sStore MOVOU A0, rStore; MOVOU B0, sStore
// Hash // Hash
...@@ -509,7 +509,7 @@ openSSETail16: ...@@ -509,7 +509,7 @@ openSSETail16:
// We can safely load the CT from the end, because it is padded with the MAC // We can safely load the CT from the end, because it is padded with the MAC
MOVQ inl, itr2 MOVQ inl, itr2
SHLQ $4, itr2 SHLQ $4, itr2
LEAQ andMask<>(SB), t0 LEAQ ·andMask<>(SB), t0
MOVOU (inp), T0 MOVOU (inp), T0
ADDQ inl, inp ADDQ inl, inp
PAND -16(t0)(itr2*1), T0 PAND -16(t0)(itr2*1), T0
...@@ -534,7 +534,7 @@ openSSETail16Store: ...@@ -534,7 +534,7 @@ openSSETail16Store:
// Special optimization for the last 64 bytes of ciphertext // Special optimization for the last 64 bytes of ciphertext
openSSETail64: openSSETail64:
// Need to decrypt up to 64 bytes - prepare single block // Need to decrypt up to 64 bytes - prepare single block
MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr0Store MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
XORQ itr2, itr2 XORQ itr2, itr2
MOVQ inl, itr1 MOVQ inl, itr1
CMPQ itr1, $16 CMPQ itr1, $16
...@@ -559,7 +559,7 @@ openSSETail64LoopB: ...@@ -559,7 +559,7 @@ openSSETail64LoopB:
CMPQ itr2, $160 CMPQ itr2, $160
JNE openSSETail64LoopB JNE openSSETail64LoopB
PADDL chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0 PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
openSSETail64DecLoop: openSSETail64DecLoop:
CMPQ inl, $16 CMPQ inl, $16
...@@ -583,8 +583,8 @@ openSSETail64DecLoopDone: ...@@ -583,8 +583,8 @@ openSSETail64DecLoopDone:
// Special optimization for the last 128 bytes of ciphertext // Special optimization for the last 128 bytes of ciphertext
openSSETail128: openSSETail128:
// Need to decrypt up to 128 bytes - prepare two blocks // Need to decrypt up to 128 bytes - prepare two blocks
MOVO chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr0Store MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr1Store MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
XORQ itr2, itr2 XORQ itr2, itr2
MOVQ inl, itr1 MOVQ inl, itr1
ANDQ $-16, itr1 ANDQ $-16, itr1
...@@ -609,7 +609,7 @@ openSSETail128LoopB: ...@@ -609,7 +609,7 @@ openSSETail128LoopB:
CMPQ itr2, $160 CMPQ itr2, $160
JNE openSSETail128LoopB JNE openSSETail128LoopB
PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
PADDL state1Store, B0; PADDL state1Store, B1 PADDL state1Store, B0; PADDL state1Store, B1
PADDL state2Store, C0; PADDL state2Store, C1 PADDL state2Store, C0; PADDL state2Store, C1
PADDL ctr1Store, D0; PADDL ctr0Store, D1 PADDL ctr1Store, D0; PADDL ctr0Store, D1
...@@ -627,9 +627,9 @@ openSSETail128LoopB: ...@@ -627,9 +627,9 @@ openSSETail128LoopB:
// Special optimization for the last 192 bytes of ciphertext // Special optimization for the last 192 bytes of ciphertext
openSSETail192: openSSETail192:
// Need to decrypt up to 192 bytes - prepare three blocks // Need to decrypt up to 192 bytes - prepare three blocks
MOVO chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL sseIncMask<>(SB), D2; MOVO D2, ctr0Store MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr1Store MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr2Store MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
MOVQ inl, itr1 MOVQ inl, itr1
MOVQ $160, itr2 MOVQ $160, itr2
...@@ -674,7 +674,7 @@ openSSLTail192LoopB: ...@@ -674,7 +674,7 @@ openSSLTail192LoopB:
polyMul polyMul
openSSLTail192Store: openSSLTail192Store:
PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2 PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
...@@ -696,10 +696,10 @@ openSSLTail192Store: ...@@ -696,10 +696,10 @@ openSSLTail192Store:
// Special optimization for the last 256 bytes of ciphertext // Special optimization for the last 256 bytes of ciphertext
openSSETail256: openSSETail256:
// Need to decrypt up to 256 bytes - prepare four blocks // Need to decrypt up to 256 bytes - prepare four blocks
MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
// Store counters // Store counters
MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
...@@ -744,7 +744,7 @@ openSSETail256HashLoop: ...@@ -744,7 +744,7 @@ openSSETail256HashLoop:
JB openSSETail256HashLoop JB openSSETail256HashLoop
// Add in the state // Add in the state
PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
...@@ -779,11 +779,11 @@ openSSETail256HashLoop: ...@@ -779,11 +779,11 @@ openSSETail256HashLoop:
// ------------------------- AVX2 Code ---------------------------------------- // ------------------------- AVX2 Code ----------------------------------------
chacha20Poly1305Open_AVX2: chacha20Poly1305Open_AVX2:
VZEROUPPER VZEROUPPER
VMOVDQU chacha20Constants<>(SB), AA0 VMOVDQU ·chacha20Constants<>(SB), AA0
BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
VPADDD avx2InitMask<>(SB), DD0, DD0 VPADDD ·avx2InitMask<>(SB), DD0, DD0
// Special optimization, for very short buffers // Special optimization, for very short buffers
CMPQ inl, $192 CMPQ inl, $192
...@@ -805,7 +805,7 @@ openAVX2PreparePolyKey: ...@@ -805,7 +805,7 @@ openAVX2PreparePolyKey:
DECQ itr2 DECQ itr2
JNE openAVX2PreparePolyKey JNE openAVX2PreparePolyKey
VPADDD chacha20Constants<>(SB), AA0, AA0 VPADDD ·chacha20Constants<>(SB), AA0, AA0
VPADDD state1StoreAVX2, BB0, BB0 VPADDD state1StoreAVX2, BB0, BB0
VPADDD state2StoreAVX2, CC0, CC0 VPADDD state2StoreAVX2, CC0, CC0
VPADDD ctr3StoreAVX2, DD0, DD0 VPADDD ctr3StoreAVX2, DD0, DD0
...@@ -813,7 +813,7 @@ openAVX2PreparePolyKey: ...@@ -813,7 +813,7 @@ openAVX2PreparePolyKey:
VPERM2I128 $0x02, AA0, BB0, TT0 VPERM2I128 $0x02, AA0, BB0, TT0
// Clamp and store poly key // Clamp and store poly key
VPAND polyClampMask<>(SB), TT0, TT0 VPAND ·polyClampMask<>(SB), TT0, TT0
VMOVDQA TT0, rsStoreAVX2 VMOVDQA TT0, rsStoreAVX2
// Stream for the first 64 bytes // Stream for the first 64 bytes
...@@ -846,10 +846,10 @@ openAVX2MainLoop: ...@@ -846,10 +846,10 @@ openAVX2MainLoop:
JB openAVX2MainLoopDone JB openAVX2MainLoopDone
// Load state, increment counter blocks, store the incremented counters // Load state, increment counter blocks, store the incremented counters
VMOVDQU chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
VMOVDQA ctr3StoreAVX2, DD0; VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
XORQ itr1, itr1 XORQ itr1, itr1
...@@ -860,7 +860,7 @@ openAVX2InternalLoop: ...@@ -860,7 +860,7 @@ openAVX2InternalLoop:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
polyMulStage1_AVX2 polyMulStage1_AVX2
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
polyMulStage2_AVX2 polyMulStage2_AVX2
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
...@@ -874,7 +874,7 @@ openAVX2InternalLoop: ...@@ -874,7 +874,7 @@ openAVX2InternalLoop:
polyMulReduceStage polyMulReduceStage
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
polyAdd(2*8(inp)(itr1*1)) polyAdd(2*8(inp)(itr1*1))
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
polyMulStage1_AVX2 polyMulStage1_AVX2
...@@ -892,7 +892,7 @@ openAVX2InternalLoop: ...@@ -892,7 +892,7 @@ openAVX2InternalLoop:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
polyMulStage3_AVX2 polyMulStage3_AVX2
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
polyMulReduceStage polyMulReduceStage
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
...@@ -908,7 +908,7 @@ openAVX2InternalLoop: ...@@ -908,7 +908,7 @@ openAVX2InternalLoop:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
polyMulStage2_AVX2 polyMulStage2_AVX2
VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
polyMulStage3_AVX2 polyMulStage3_AVX2
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
...@@ -925,7 +925,7 @@ openAVX2InternalLoop: ...@@ -925,7 +925,7 @@ openAVX2InternalLoop:
CMPQ itr1, $480 CMPQ itr1, $480
JNE openAVX2InternalLoop JNE openAVX2InternalLoop
VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
...@@ -974,7 +974,7 @@ openAVX2192: ...@@ -974,7 +974,7 @@ openAVX2192:
VMOVDQA AA0, AA1 VMOVDQA AA0, AA1
VMOVDQA BB0, BB1 VMOVDQA BB0, BB1
VMOVDQA CC0, CC1 VMOVDQA CC0, CC1
VPADDD avx2IncMask<>(SB), DD0, DD1 VPADDD ·avx2IncMask<>(SB), DD0, DD1
VMOVDQA AA0, AA2 VMOVDQA AA0, AA2
VMOVDQA BB0, BB2 VMOVDQA BB0, BB2
VMOVDQA CC0, CC2 VMOVDQA CC0, CC2
...@@ -1000,7 +1000,7 @@ openAVX2192InnerCipherLoop: ...@@ -1000,7 +1000,7 @@ openAVX2192InnerCipherLoop:
VPERM2I128 $0x02, AA0, BB0, TT0 VPERM2I128 $0x02, AA0, BB0, TT0
// Clamp and store poly key // Clamp and store poly key
VPAND polyClampMask<>(SB), TT0, TT0 VPAND ·polyClampMask<>(SB), TT0, TT0
VMOVDQA TT0, rsStoreAVX2 VMOVDQA TT0, rsStoreAVX2
// Stream for up to 192 bytes // Stream for up to 192 bytes
...@@ -1072,8 +1072,8 @@ openAVX2ShortDone: ...@@ -1072,8 +1072,8 @@ openAVX2ShortDone:
// Special optimization for buffers smaller than 321 bytes // Special optimization for buffers smaller than 321 bytes
openAVX2320: openAVX2320:
// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD avx2IncMask<>(SB), DD0, DD1 VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD avx2IncMask<>(SB), DD1, DD2 VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
MOVQ $10, itr2 MOVQ $10, itr2
...@@ -1089,18 +1089,18 @@ openAVX2320InnerCipherLoop: ...@@ -1089,18 +1089,18 @@ openAVX2320InnerCipherLoop:
DECQ itr2 DECQ itr2
JNE openAVX2320InnerCipherLoop JNE openAVX2320InnerCipherLoop
VMOVDQA chacha20Constants<>(SB), TT0 VMOVDQA ·chacha20Constants<>(SB), TT0
VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
VMOVDQA avx2IncMask<>(SB), TT0 VMOVDQA ·avx2IncMask<>(SB), TT0
VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
VPADDD TT3, DD2, DD2 VPADDD TT3, DD2, DD2
// Clamp and store poly key // Clamp and store poly key
VPERM2I128 $0x02, AA0, BB0, TT0 VPERM2I128 $0x02, AA0, BB0, TT0
VPAND polyClampMask<>(SB), TT0, TT0 VPAND ·polyClampMask<>(SB), TT0, TT0
VMOVDQA TT0, rsStoreAVX2 VMOVDQA TT0, rsStoreAVX2
// Stream for up to 320 bytes // Stream for up to 320 bytes
...@@ -1120,11 +1120,11 @@ openAVX2320InnerCipherLoop: ...@@ -1120,11 +1120,11 @@ openAVX2320InnerCipherLoop:
// Special optimization for the last 128 bytes of ciphertext // Special optimization for the last 128 bytes of ciphertext
openAVX2Tail128: openAVX2Tail128:
// Need to decrypt up to 128 bytes - prepare two blocks // Need to decrypt up to 128 bytes - prepare two blocks
VMOVDQA chacha20Constants<>(SB), AA1 VMOVDQA ·chacha20Constants<>(SB), AA1
VMOVDQA state1StoreAVX2, BB1 VMOVDQA state1StoreAVX2, BB1
VMOVDQA state2StoreAVX2, CC1 VMOVDQA state2StoreAVX2, CC1
VMOVDQA ctr3StoreAVX2, DD1 VMOVDQA ctr3StoreAVX2, DD1
VPADDD avx2IncMask<>(SB), DD1, DD1 VPADDD ·avx2IncMask<>(SB), DD1, DD1
VMOVDQA DD1, DD0 VMOVDQA DD1, DD0
XORQ itr2, itr2 XORQ itr2, itr2
...@@ -1153,7 +1153,7 @@ openAVX2Tail128LoopB: ...@@ -1153,7 +1153,7 @@ openAVX2Tail128LoopB:
CMPQ itr2, $160 CMPQ itr2, $160
JNE openAVX2Tail128LoopB JNE openAVX2Tail128LoopB
VPADDD chacha20Constants<>(SB), AA1, AA1 VPADDD ·chacha20Constants<>(SB), AA1, AA1
VPADDD state1StoreAVX2, BB1, BB1 VPADDD state1StoreAVX2, BB1, BB1
VPADDD state2StoreAVX2, CC1, CC1 VPADDD state2StoreAVX2, CC1, CC1
VPADDD DD0, DD1, DD1 VPADDD DD0, DD1, DD1
...@@ -1196,12 +1196,12 @@ openAVX2TailDone: ...@@ -1196,12 +1196,12 @@ openAVX2TailDone:
// Special optimization for the last 256 bytes of ciphertext // Special optimization for the last 256 bytes of ciphertext
openAVX2Tail256: openAVX2Tail256:
// Need to decrypt up to 256 bytes - prepare four blocks // Need to decrypt up to 256 bytes - prepare four blocks
VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
VMOVDQA ctr3StoreAVX2, DD0 VMOVDQA ctr3StoreAVX2, DD0
VPADDD avx2IncMask<>(SB), DD0, DD0 VPADDD ·avx2IncMask<>(SB), DD0, DD0
VPADDD avx2IncMask<>(SB), DD0, DD1 VPADDD ·avx2IncMask<>(SB), DD0, DD1
VMOVDQA DD0, TT1 VMOVDQA DD0, TT1
VMOVDQA DD1, TT2 VMOVDQA DD1, TT2
...@@ -1255,7 +1255,7 @@ openAVX2Tail256Hash: ...@@ -1255,7 +1255,7 @@ openAVX2Tail256Hash:
// Store 128 bytes safely, then go to store loop // Store 128 bytes safely, then go to store loop
openAVX2Tail256HashEnd: openAVX2Tail256HashEnd:
VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
...@@ -1274,13 +1274,13 @@ openAVX2Tail256HashEnd: ...@@ -1274,13 +1274,13 @@ openAVX2Tail256HashEnd:
// Special optimization for the last 384 bytes of ciphertext // Special optimization for the last 384 bytes of ciphertext
openAVX2Tail384: openAVX2Tail384:
// Need to decrypt up to 384 bytes - prepare six blocks // Need to decrypt up to 384 bytes - prepare six blocks
VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
VMOVDQA ctr3StoreAVX2, DD0 VMOVDQA ctr3StoreAVX2, DD0
VPADDD avx2IncMask<>(SB), DD0, DD0 VPADDD ·avx2IncMask<>(SB), DD0, DD0
VPADDD avx2IncMask<>(SB), DD0, DD1 VPADDD ·avx2IncMask<>(SB), DD0, DD1
VPADDD avx2IncMask<>(SB), DD1, DD2 VPADDD ·avx2IncMask<>(SB), DD1, DD2
VMOVDQA DD0, ctr0StoreAVX2 VMOVDQA DD0, ctr0StoreAVX2
VMOVDQA DD1, ctr1StoreAVX2 VMOVDQA DD1, ctr1StoreAVX2
VMOVDQA DD2, ctr2StoreAVX2 VMOVDQA DD2, ctr2StoreAVX2
...@@ -1339,7 +1339,7 @@ openAVX2Tail384Hash: ...@@ -1339,7 +1339,7 @@ openAVX2Tail384Hash:
// Store 256 bytes safely, then go to store loop // Store 256 bytes safely, then go to store loop
openAVX2Tail384HashEnd: openAVX2Tail384HashEnd:
VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
...@@ -1358,10 +1358,10 @@ openAVX2Tail384HashEnd: ...@@ -1358,10 +1358,10 @@ openAVX2Tail384HashEnd:
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Special optimization for the last 512 bytes of ciphertext // Special optimization for the last 512 bytes of ciphertext
openAVX2Tail512: openAVX2Tail512:
VMOVDQU chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
VMOVDQA ctr3StoreAVX2, DD0; VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
XORQ itr1, itr1 XORQ itr1, itr1
MOVQ inp, itr2 MOVQ inp, itr2
...@@ -1374,7 +1374,7 @@ openAVX2Tail512LoopB: ...@@ -1374,7 +1374,7 @@ openAVX2Tail512LoopB:
openAVX2Tail512LoopA: openAVX2Tail512LoopA:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2 VMOVDQA CC3, tmpStoreAVX2
...@@ -1387,7 +1387,7 @@ openAVX2Tail512LoopA: ...@@ -1387,7 +1387,7 @@ openAVX2Tail512LoopA:
polyMulAVX2 polyMulAVX2
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2 VMOVDQA CC3, tmpStoreAVX2
...@@ -1401,7 +1401,7 @@ openAVX2Tail512LoopA: ...@@ -1401,7 +1401,7 @@ openAVX2Tail512LoopA:
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
polyAdd(2*8(itr2)) polyAdd(2*8(itr2))
...@@ -1415,7 +1415,7 @@ openAVX2Tail512LoopA: ...@@ -1415,7 +1415,7 @@ openAVX2Tail512LoopA:
VMOVDQA tmpStoreAVX2, CC3 VMOVDQA tmpStoreAVX2, CC3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2 VMOVDQA CC3, tmpStoreAVX2
...@@ -1448,7 +1448,7 @@ openAVX2Tail512HashLoop: ...@@ -1448,7 +1448,7 @@ openAVX2Tail512HashLoop:
JMP openAVX2Tail512HashLoop JMP openAVX2Tail512HashLoop
openAVX2Tail512HashEnd: openAVX2Tail512HashEnd:
VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
...@@ -1493,7 +1493,7 @@ TEXT ·chacha20Poly1305Seal(SB), 0, $288-96 ...@@ -1493,7 +1493,7 @@ TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
JBE sealSSE128 // About 15% faster JBE sealSSE128 // About 15% faster
// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
MOVOU chacha20Constants<>(SB), A0 MOVOU ·chacha20Constants<>(SB), A0
MOVOU (1*16)(keyp), B0 MOVOU (1*16)(keyp), B0
MOVOU (2*16)(keyp), C0 MOVOU (2*16)(keyp), C0
MOVOU (3*16)(keyp), D0 MOVOU (3*16)(keyp), D0
...@@ -1503,9 +1503,9 @@ TEXT ·chacha20Poly1305Seal(SB), 0, $288-96 ...@@ -1503,9 +1503,9 @@ TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
MOVO C0, state2Store MOVO C0, state2Store
// Load state, increment counter blocks // Load state, increment counter blocks
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
// Store counters // Store counters
MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
...@@ -1535,13 +1535,13 @@ sealSSEIntroLoop: ...@@ -1535,13 +1535,13 @@ sealSSEIntroLoop:
JNE sealSSEIntroLoop JNE sealSSEIntroLoop
// Add in the state // Add in the state
PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
// Clamp and store the key // Clamp and store the key
PAND polyClampMask<>(SB), A0 PAND ·polyClampMask<>(SB), A0
MOVO A0, rStore MOVO A0, rStore
MOVO B0, sStore MOVO B0, sStore
...@@ -1585,10 +1585,10 @@ sealSSEIntroLoop: ...@@ -1585,10 +1585,10 @@ sealSSEIntroLoop:
sealSSEMainLoop: sealSSEMainLoop:
// Load state, increment counter blocks // Load state, increment counter blocks
MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0 MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3 MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
// Store counters // Store counters
MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
...@@ -1627,7 +1627,7 @@ sealSSEInnerLoop: ...@@ -1627,7 +1627,7 @@ sealSSEInnerLoop:
JG sealSSEInnerLoop JG sealSSEInnerLoop
// Add in the state // Add in the state
PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3 PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
...@@ -1683,11 +1683,11 @@ sealSSEInnerLoop: ...@@ -1683,11 +1683,11 @@ sealSSEInnerLoop:
// Special optimization for the last 64 bytes of plaintext // Special optimization for the last 64 bytes of plaintext
sealSSETail64: sealSSETail64:
// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
MOVO chacha20Constants<>(SB), A1 MOVO ·chacha20Constants<>(SB), A1
MOVO state1Store, B1 MOVO state1Store, B1
MOVO state2Store, C1 MOVO state2Store, C1
MOVO ctr3Store, D1 MOVO ctr3Store, D1
PADDL sseIncMask<>(SB), D1 PADDL ·sseIncMask<>(SB), D1
MOVO D1, ctr0Store MOVO D1, ctr0Store
sealSSETail64LoopA: sealSSETail64LoopA:
...@@ -1710,7 +1710,7 @@ sealSSETail64LoopB: ...@@ -1710,7 +1710,7 @@ sealSSETail64LoopB:
DECQ itr2 DECQ itr2
JGE sealSSETail64LoopB JGE sealSSETail64LoopB
PADDL chacha20Constants<>(SB), A1 PADDL ·chacha20Constants<>(SB), A1
PADDL state1Store, B1 PADDL state1Store, B1
PADDL state2Store, C1 PADDL state2Store, C1
PADDL ctr0Store, D1 PADDL ctr0Store, D1
...@@ -1721,8 +1721,8 @@ sealSSETail64LoopB: ...@@ -1721,8 +1721,8 @@ sealSSETail64LoopB:
// Special optimization for the last 128 bytes of plaintext // Special optimization for the last 128 bytes of plaintext
sealSSETail128: sealSSETail128:
// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr0Store MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr1Store MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
sealSSETail128LoopA: sealSSETail128LoopA:
// Perform ChaCha rounds, while hashing the previously encrypted ciphertext // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
...@@ -1747,7 +1747,7 @@ sealSSETail128LoopB: ...@@ -1747,7 +1747,7 @@ sealSSETail128LoopB:
DECQ itr2 DECQ itr2
JGE sealSSETail128LoopB JGE sealSSETail128LoopB
PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
PADDL state1Store, B0; PADDL state1Store, B1 PADDL state1Store, B0; PADDL state1Store, B1
PADDL state2Store, C0; PADDL state2Store, C1 PADDL state2Store, C0; PADDL state2Store, C1
PADDL ctr0Store, D0; PADDL ctr1Store, D1 PADDL ctr0Store, D0; PADDL ctr1Store, D1
...@@ -1766,9 +1766,9 @@ sealSSETail128LoopB: ...@@ -1766,9 +1766,9 @@ sealSSETail128LoopB:
// Special optimization for the last 192 bytes of plaintext // Special optimization for the last 192 bytes of plaintext
sealSSETail192: sealSSETail192:
// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr0Store MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr1Store MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2; MOVO D2, ctr2Store MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
sealSSETail192LoopA: sealSSETail192LoopA:
// Perform ChaCha rounds, while hashing the previously encrypted ciphertext // Perform ChaCha rounds, while hashing the previously encrypted ciphertext
...@@ -1797,7 +1797,7 @@ sealSSETail192LoopB: ...@@ -1797,7 +1797,7 @@ sealSSETail192LoopB:
DECQ itr2 DECQ itr2
JGE sealSSETail192LoopB JGE sealSSETail192LoopB
PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2 PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
...@@ -1823,9 +1823,9 @@ sealSSETail192LoopB: ...@@ -1823,9 +1823,9 @@ sealSSETail192LoopB:
// Special seal optimization for buffers smaller than 129 bytes // Special seal optimization for buffers smaller than 129 bytes
sealSSE128: sealSSE128:
// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we need to process three blocks
MOVOU chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1 MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2 MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
MOVQ $10, itr2 MOVQ $10, itr2
...@@ -1842,11 +1842,11 @@ sealSSE128InnerCipherLoop: ...@@ -1842,11 +1842,11 @@ sealSSE128InnerCipherLoop:
JNE sealSSE128InnerCipherLoop JNE sealSSE128InnerCipherLoop
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2 PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
PADDL T2, C1; PADDL T2, C2 PADDL T2, C1; PADDL T2, C2
PADDL T3, D1; PADDL sseIncMask<>(SB), T3; PADDL T3, D2 PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
PAND polyClampMask<>(SB), A0 PAND ·polyClampMask<>(SB), A0
MOVOU A0, rStore MOVOU A0, rStore
MOVOU B0, sStore MOVOU B0, sStore
...@@ -1903,7 +1903,7 @@ sealSSETail: ...@@ -1903,7 +1903,7 @@ sealSSETail:
// We can only load the plaintext one byte at a time to avoid reading past the end of the buffer // We can only load the plaintext one byte at a time to avoid reading past the end of the buffer
MOVQ inl, itr2 MOVQ inl, itr2
SHLQ $4, itr2 SHLQ $4, itr2
LEAQ andMask<>(SB), t0 LEAQ ·andMask<>(SB), t0
MOVQ inl, itr1 MOVQ inl, itr1
LEAQ -1(inp)(inl*1), inp LEAQ -1(inp)(inl*1), inp
XORQ t2, t2 XORQ t2, t2
...@@ -1963,11 +1963,11 @@ sealSSEFinalize: ...@@ -1963,11 +1963,11 @@ sealSSEFinalize:
// ------------------------- AVX2 Code ---------------------------------------- // ------------------------- AVX2 Code ----------------------------------------
chacha20Poly1305Seal_AVX2: chacha20Poly1305Seal_AVX2:
VZEROUPPER VZEROUPPER
VMOVDQU chacha20Constants<>(SB), AA0 VMOVDQU ·chacha20Constants<>(SB), AA0
BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
VPADDD avx2InitMask<>(SB), DD0, DD0 VPADDD ·avx2InitMask<>(SB), DD0, DD0
// Special optimizations, for very short buffers // Special optimizations, for very short buffers
CMPQ inl, $192 CMPQ inl, $192
...@@ -1979,9 +1979,9 @@ chacha20Poly1305Seal_AVX2: ...@@ -1979,9 +1979,9 @@ chacha20Poly1305Seal_AVX2:
VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2 VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2 VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
VPADDD avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2 VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
VPADDD avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2 VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
VPADDD avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2 VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
VMOVDQA DD3, ctr3StoreAVX2 VMOVDQA DD3, ctr3StoreAVX2
MOVQ $10, itr2 MOVQ $10, itr2
...@@ -2012,7 +2012,7 @@ sealAVX2IntroLoop: ...@@ -2012,7 +2012,7 @@ sealAVX2IntroLoop:
DECQ itr2 DECQ itr2
JNE sealAVX2IntroLoop JNE sealAVX2IntroLoop
VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
...@@ -2022,7 +2022,7 @@ sealAVX2IntroLoop: ...@@ -2022,7 +2022,7 @@ sealAVX2IntroLoop:
VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95 VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
// Clamp and store poly key // Clamp and store poly key
VPAND polyClampMask<>(SB), DD0, DD0 VPAND ·polyClampMask<>(SB), DD0, DD0
VMOVDQA DD0, rsStoreAVX2 VMOVDQA DD0, rsStoreAVX2
// Hash AD // Hash AD
...@@ -2068,11 +2068,11 @@ sealAVX2IntroLoop: ...@@ -2068,11 +2068,11 @@ sealAVX2IntroLoop:
JBE sealAVX2Tail512 JBE sealAVX2Tail512
// We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop // We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
VMOVDQA ctr3StoreAVX2, DD0 VMOVDQA ctr3StoreAVX2, DD0
VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3 VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
VMOVDQA CC3, tmpStoreAVX2 VMOVDQA CC3, tmpStoreAVX2
...@@ -2100,7 +2100,7 @@ sealAVX2IntroLoop: ...@@ -2100,7 +2100,7 @@ sealAVX2IntroLoop:
VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2 VMOVDQA CC3, tmpStoreAVX2
...@@ -2116,10 +2116,10 @@ sealAVX2IntroLoop: ...@@ -2116,10 +2116,10 @@ sealAVX2IntroLoop:
sealAVX2MainLoop: sealAVX2MainLoop:
// Load state, increment counter blocks, store the incremented counters // Load state, increment counter blocks, store the incremented counters
VMOVDQU chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
VMOVDQA ctr3StoreAVX2, DD0; VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3 VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
MOVQ $10, itr1 MOVQ $10, itr1
...@@ -2128,7 +2128,7 @@ sealAVX2InternalLoop: ...@@ -2128,7 +2128,7 @@ sealAVX2InternalLoop:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
polyMulStage1_AVX2 polyMulStage1_AVX2
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
polyMulStage2_AVX2 polyMulStage2_AVX2
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
...@@ -2144,7 +2144,7 @@ sealAVX2InternalLoop: ...@@ -2144,7 +2144,7 @@ sealAVX2InternalLoop:
sealAVX2InternalLoopStart: sealAVX2InternalLoopStart:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
polyAdd(2*8(oup)) polyAdd(2*8(oup))
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
polyMulStage1_AVX2 polyMulStage1_AVX2
...@@ -2162,7 +2162,7 @@ sealAVX2InternalLoopStart: ...@@ -2162,7 +2162,7 @@ sealAVX2InternalLoopStart:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
polyMulStage3_AVX2 polyMulStage3_AVX2
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
polyMulReduceStage polyMulReduceStage
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
...@@ -2178,7 +2178,7 @@ sealAVX2InternalLoopStart: ...@@ -2178,7 +2178,7 @@ sealAVX2InternalLoopStart:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
polyMulStage2_AVX2 polyMulStage2_AVX2
VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
polyMulStage3_AVX2 polyMulStage3_AVX2
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
...@@ -2195,7 +2195,7 @@ sealAVX2InternalLoopStart: ...@@ -2195,7 +2195,7 @@ sealAVX2InternalLoopStart:
DECQ itr1 DECQ itr1
JNE sealAVX2InternalLoop JNE sealAVX2InternalLoop
VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
...@@ -2250,7 +2250,7 @@ seal192AVX2: ...@@ -2250,7 +2250,7 @@ seal192AVX2:
VMOVDQA AA0, AA1 VMOVDQA AA0, AA1
VMOVDQA BB0, BB1 VMOVDQA BB0, BB1
VMOVDQA CC0, CC1 VMOVDQA CC0, CC1
VPADDD avx2IncMask<>(SB), DD0, DD1 VPADDD ·avx2IncMask<>(SB), DD0, DD1
VMOVDQA AA0, AA2 VMOVDQA AA0, AA2
VMOVDQA BB0, BB2 VMOVDQA BB0, BB2
VMOVDQA CC0, CC2 VMOVDQA CC0, CC2
...@@ -2276,7 +2276,7 @@ sealAVX2192InnerCipherLoop: ...@@ -2276,7 +2276,7 @@ sealAVX2192InnerCipherLoop:
VPERM2I128 $0x02, AA0, BB0, TT0 VPERM2I128 $0x02, AA0, BB0, TT0
// Clamp and store poly key // Clamp and store poly key
VPAND polyClampMask<>(SB), TT0, TT0 VPAND ·polyClampMask<>(SB), TT0, TT0
VMOVDQA TT0, rsStoreAVX2 VMOVDQA TT0, rsStoreAVX2
// Stream for up to 192 bytes // Stream for up to 192 bytes
...@@ -2359,8 +2359,8 @@ sealAVX2ShortDone: ...@@ -2359,8 +2359,8 @@ sealAVX2ShortDone:
// Special optimization for buffers smaller than 321 bytes // Special optimization for buffers smaller than 321 bytes
seal320AVX2: seal320AVX2:
// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD avx2IncMask<>(SB), DD0, DD1 VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD avx2IncMask<>(SB), DD1, DD2 VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
MOVQ $10, itr2 MOVQ $10, itr2
...@@ -2376,18 +2376,18 @@ sealAVX2320InnerCipherLoop: ...@@ -2376,18 +2376,18 @@ sealAVX2320InnerCipherLoop:
DECQ itr2 DECQ itr2
JNE sealAVX2320InnerCipherLoop JNE sealAVX2320InnerCipherLoop
VMOVDQA chacha20Constants<>(SB), TT0 VMOVDQA ·chacha20Constants<>(SB), TT0
VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
VMOVDQA avx2IncMask<>(SB), TT0 VMOVDQA ·avx2IncMask<>(SB), TT0
VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
VPADDD TT3, DD2, DD2 VPADDD TT3, DD2, DD2
// Clamp and store poly key // Clamp and store poly key
VPERM2I128 $0x02, AA0, BB0, TT0 VPERM2I128 $0x02, AA0, BB0, TT0
VPAND polyClampMask<>(SB), TT0, TT0 VPAND ·polyClampMask<>(SB), TT0, TT0
VMOVDQA TT0, rsStoreAVX2 VMOVDQA TT0, rsStoreAVX2
// Stream for up to 320 bytes // Stream for up to 320 bytes
...@@ -2409,11 +2409,11 @@ sealAVX2Tail128: ...@@ -2409,11 +2409,11 @@ sealAVX2Tail128:
// Need to decrypt up to 128 bytes - prepare two blocks // Need to decrypt up to 128 bytes - prepare two blocks
// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
VMOVDQA chacha20Constants<>(SB), AA0 VMOVDQA ·chacha20Constants<>(SB), AA0
VMOVDQA state1StoreAVX2, BB0 VMOVDQA state1StoreAVX2, BB0
VMOVDQA state2StoreAVX2, CC0 VMOVDQA state2StoreAVX2, CC0
VMOVDQA ctr3StoreAVX2, DD0 VMOVDQA ctr3StoreAVX2, DD0
VPADDD avx2IncMask<>(SB), DD0, DD0 VPADDD ·avx2IncMask<>(SB), DD0, DD0
VMOVDQA DD0, DD1 VMOVDQA DD0, DD1
sealAVX2Tail128LoopA: sealAVX2Tail128LoopA:
...@@ -2440,7 +2440,7 @@ sealAVX2Tail128LoopB: ...@@ -2440,7 +2440,7 @@ sealAVX2Tail128LoopB:
DECQ itr2 DECQ itr2
JGE sealAVX2Tail128LoopB JGE sealAVX2Tail128LoopB
VPADDD chacha20Constants<>(SB), AA0, AA1 VPADDD ·chacha20Constants<>(SB), AA0, AA1
VPADDD state1StoreAVX2, BB0, BB1 VPADDD state1StoreAVX2, BB0, BB1
VPADDD state2StoreAVX2, CC0, CC1 VPADDD state2StoreAVX2, CC0, CC1
VPADDD DD1, DD0, DD1 VPADDD DD1, DD0, DD1
...@@ -2457,12 +2457,12 @@ sealAVX2Tail256: ...@@ -2457,12 +2457,12 @@ sealAVX2Tail256:
// Need to decrypt up to 256 bytes - prepare two blocks // Need to decrypt up to 256 bytes - prepare two blocks
// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA chacha20Constants<>(SB), AA1 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1 VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1 VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
VMOVDQA ctr3StoreAVX2, DD0 VMOVDQA ctr3StoreAVX2, DD0
VPADDD avx2IncMask<>(SB), DD0, DD0 VPADDD ·avx2IncMask<>(SB), DD0, DD0
VPADDD avx2IncMask<>(SB), DD0, DD1 VPADDD ·avx2IncMask<>(SB), DD0, DD1
VMOVDQA DD0, TT1 VMOVDQA DD0, TT1
VMOVDQA DD1, TT2 VMOVDQA DD1, TT2
...@@ -2490,7 +2490,7 @@ sealAVX2Tail256LoopB: ...@@ -2490,7 +2490,7 @@ sealAVX2Tail256LoopB:
DECQ itr2 DECQ itr2
JGE sealAVX2Tail256LoopB JGE sealAVX2Tail256LoopB
VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
...@@ -2516,11 +2516,11 @@ sealAVX2Tail384: ...@@ -2516,11 +2516,11 @@ sealAVX2Tail384:
// Need to decrypt up to 384 bytes - prepare two blocks // Need to decrypt up to 384 bytes - prepare two blocks
// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
VMOVDQA ctr3StoreAVX2, DD0 VMOVDQA ctr3StoreAVX2, DD0
VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2 VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3 VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
sealAVX2Tail384LoopA: sealAVX2Tail384LoopA:
...@@ -2547,7 +2547,7 @@ sealAVX2Tail384LoopB: ...@@ -2547,7 +2547,7 @@ sealAVX2Tail384LoopB:
DECQ itr2 DECQ itr2
JGE sealAVX2Tail384LoopB JGE sealAVX2Tail384LoopB
VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2 VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
...@@ -2579,11 +2579,11 @@ sealAVX2Tail512: ...@@ -2579,11 +2579,11 @@ sealAVX2Tail512:
// Need to decrypt up to 512 bytes - prepare two blocks // Need to decrypt up to 512 bytes - prepare two blocks
// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
// If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed // If we got here before the main loop - there are 448 encrypted bytes waiting to be hashed
VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
VMOVDQA ctr3StoreAVX2, DD0 VMOVDQA ctr3StoreAVX2, DD0
VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3 VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
sealAVX2Tail512LoopA: sealAVX2Tail512LoopA:
...@@ -2594,7 +2594,7 @@ sealAVX2Tail512LoopA: ...@@ -2594,7 +2594,7 @@ sealAVX2Tail512LoopA:
sealAVX2Tail512LoopB: sealAVX2Tail512LoopB:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2 VMOVDQA CC3, tmpStoreAVX2
...@@ -2607,7 +2607,7 @@ sealAVX2Tail512LoopB: ...@@ -2607,7 +2607,7 @@ sealAVX2Tail512LoopB:
polyMulAVX2 polyMulAVX2
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2 VMOVDQA CC3, tmpStoreAVX2
...@@ -2621,7 +2621,7 @@ sealAVX2Tail512LoopB: ...@@ -2621,7 +2621,7 @@ sealAVX2Tail512LoopB:
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3 VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
polyAdd(2*8(oup)) polyAdd(2*8(oup))
...@@ -2635,7 +2635,7 @@ sealAVX2Tail512LoopB: ...@@ -2635,7 +2635,7 @@ sealAVX2Tail512LoopB:
VMOVDQA tmpStoreAVX2, CC3 VMOVDQA tmpStoreAVX2, CC3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3 VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2 VMOVDQA CC3, tmpStoreAVX2
...@@ -2653,7 +2653,7 @@ sealAVX2Tail512LoopB: ...@@ -2653,7 +2653,7 @@ sealAVX2Tail512LoopB:
DECQ itr2 DECQ itr2
JGE sealAVX2Tail512LoopB JGE sealAVX2Tail512LoopB
VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3 VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment