Commit e7edc7e2 authored by Adam Langley's avatar Adam Langley

vendor: update golang.org/x/crypto/chacha20poly1305

This change updates the vendored chacha20poly1305 package to match
revision 14f9af67c679edd414f72f13d67c917447113df2 of x/crypto.

Change-Id: I05a4ba86578b0f0cdb1ed7dd50fee3b38bb48cf5
Reviewed-on: https://go-review.googlesource.com/31312
Run-TryBot: Adam Langley <agl@golang.org>
Reviewed-by: default avatarBrad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
parent bb4c40b2
......@@ -82,82 +82,82 @@
#define TT2 BB3
#define TT3 CC3
// ChaCha20 constants
DATA chacha20Constants<>+0x00(SB)/4, $0x61707865
DATA chacha20Constants<>+0x04(SB)/4, $0x3320646e
DATA chacha20Constants<>+0x08(SB)/4, $0x79622d32
DATA chacha20Constants<>+0x0c(SB)/4, $0x6b206574
DATA chacha20Constants<>+0x10(SB)/4, $0x61707865
DATA chacha20Constants<>+0x14(SB)/4, $0x3320646e
DATA chacha20Constants<>+0x18(SB)/4, $0x79622d32
DATA chacha20Constants<>+0x1c(SB)/4, $0x6b206574
DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865
DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e
DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32
DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574
DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865
DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e
DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32
DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574
// <<< 16 with PSHUFB
DATA rol16<>+0x00(SB)/8, $0x0504070601000302
DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
DATA rol16<>+0x10(SB)/8, $0x0504070601000302
DATA rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302
DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
// <<< 8 with PSHUFB
DATA rol8<>+0x00(SB)/8, $0x0605040702010003
DATA rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA rol8<>+0x10(SB)/8, $0x0605040702010003
DATA rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
DATA avx2InitMask<>+0x00(SB)/8, $0x0
DATA avx2InitMask<>+0x08(SB)/8, $0x0
DATA avx2InitMask<>+0x10(SB)/8, $0x1
DATA avx2InitMask<>+0x18(SB)/8, $0x0
DATA avx2IncMask<>+0x00(SB)/8, $0x2
DATA avx2IncMask<>+0x08(SB)/8, $0x0
DATA avx2IncMask<>+0x10(SB)/8, $0x2
DATA avx2IncMask<>+0x18(SB)/8, $0x0
DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003
DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
DATA ·avx2InitMask<>+0x00(SB)/8, $0x0
DATA ·avx2InitMask<>+0x08(SB)/8, $0x0
DATA ·avx2InitMask<>+0x10(SB)/8, $0x1
DATA ·avx2InitMask<>+0x18(SB)/8, $0x0
DATA ·avx2IncMask<>+0x00(SB)/8, $0x2
DATA ·avx2IncMask<>+0x08(SB)/8, $0x0
DATA ·avx2IncMask<>+0x10(SB)/8, $0x2
DATA ·avx2IncMask<>+0x18(SB)/8, $0x0
// Poly1305 key clamp
DATA polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
DATA polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
DATA polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA sseIncMask<>+0x00(SB)/8, $0x1
DATA sseIncMask<>+0x08(SB)/8, $0x0
DATA ·sseIncMask<>+0x00(SB)/8, $0x1
DATA ·sseIncMask<>+0x08(SB)/8, $0x0
// To load/store the last < 16 bytes in a buffer
DATA andMask<>+0x00(SB)/8, $0x00000000000000ff
DATA andMask<>+0x08(SB)/8, $0x0000000000000000
DATA andMask<>+0x10(SB)/8, $0x000000000000ffff
DATA andMask<>+0x18(SB)/8, $0x0000000000000000
DATA andMask<>+0x20(SB)/8, $0x0000000000ffffff
DATA andMask<>+0x28(SB)/8, $0x0000000000000000
DATA andMask<>+0x30(SB)/8, $0x00000000ffffffff
DATA andMask<>+0x38(SB)/8, $0x0000000000000000
DATA andMask<>+0x40(SB)/8, $0x000000ffffffffff
DATA andMask<>+0x48(SB)/8, $0x0000000000000000
DATA andMask<>+0x50(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0x58(SB)/8, $0x0000000000000000
DATA andMask<>+0x60(SB)/8, $0x00ffffffffffffff
DATA andMask<>+0x68(SB)/8, $0x0000000000000000
DATA andMask<>+0x70(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x78(SB)/8, $0x0000000000000000
DATA andMask<>+0x80(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x88(SB)/8, $0x00000000000000ff
DATA andMask<>+0x90(SB)/8, $0xffffffffffffffff
DATA andMask<>+0x98(SB)/8, $0x000000000000ffff
DATA andMask<>+0xa0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xa8(SB)/8, $0x0000000000ffffff
DATA andMask<>+0xb0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xb8(SB)/8, $0x00000000ffffffff
DATA andMask<>+0xc0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xc8(SB)/8, $0x000000ffffffffff
DATA andMask<>+0xd0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
DATA andMask<>+0xe0(SB)/8, $0xffffffffffffffff
DATA andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
GLOBL chacha20Constants<>(SB), (NOPTR+RODATA), $32
GLOBL rol16<>(SB), (NOPTR+RODATA), $32
GLOBL rol8<>(SB), (NOPTR+RODATA), $32
GLOBL sseIncMask<>(SB), (NOPTR+RODATA), $16
GLOBL avx2IncMask<>(SB), (NOPTR+RODATA), $32
GLOBL avx2InitMask<>(SB), (NOPTR+RODATA), $32
GLOBL polyClampMask<>(SB), (NOPTR+RODATA), $32
GLOBL andMask<>(SB), (NOPTR+RODATA), $240
DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff
DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff
DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff
DATA ·andMask<>+0x28(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff
DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff
DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff
DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff
DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000
DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff
DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff
DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff
DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff
DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff
DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff
DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff
DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff
GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32
GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32
GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32
GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16
GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32
GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32
GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32
GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240
// No PALIGNR in Go ASM yet (but VPALIGNR is present).
#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3
#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4
......@@ -185,15 +185,15 @@ GLOBL andMask<>(SB), (NOPTR+RODATA), $240
#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15
// Some macros
#define chachaQR(A, B, C, D, T) \
PADDD B, A; PXOR A, D; PSHUFB rol16<>(SB), D \
PADDD B, A; PXOR A, D; PSHUFB ·rol16<>(SB), D \
PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \
PADDD B, A; PXOR A, D; PSHUFB rol8<>(SB), D \
PADDD B, A; PXOR A, D; PSHUFB ·rol8<>(SB), D \
PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B
#define chachaQR_AVX2(A, B, C, D, T) \
VPADDD B, A, A; VPXOR A, D, D; VPSHUFB rol16<>(SB), D, D \
VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \
VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \
VPADDD B, A, A; VPXOR A, D, D; VPSHUFB rol8<>(SB), D, D \
VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \
VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B
#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2
......@@ -286,7 +286,7 @@ TEXT ·chacha20Poly1305Open(SB), 0, $288-97
JBE openSSE128 // About 16% faster
// For long buffers, prepare the poly key first
MOVOU chacha20Constants<>(SB), A0
MOVOU ·chacha20Constants<>(SB), A0
MOVOU (1*16)(keyp), B0
MOVOU (2*16)(keyp), C0
MOVOU (3*16)(keyp), D0
......@@ -307,10 +307,10 @@ openSSEPreparePolyKey:
JNE openSSEPreparePolyKey
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
PADDL chacha20Constants<>(SB), A0; PADDL state1Store, B0
PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0
// Clamp and store the key
PAND polyClampMask<>(SB), A0
PAND ·polyClampMask<>(SB), A0
MOVO A0, rStore; MOVO B0, sStore
// Hash AAD
......@@ -322,10 +322,10 @@ openSSEMainLoop:
JB openSSEMainLoopDone
// Load state, increment counter blocks
MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2
MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3
MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
// Store counters
MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
......@@ -370,7 +370,7 @@ openSSEInternalLoop:
JG openSSEInternalLoop
// Add in the state
PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3
PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
......@@ -446,9 +446,9 @@ openSSEFinalize:
// Special optimization for buffers smaller than 129 bytes
openSSE128:
// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
MOVOU chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2
MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
MOVQ $10, itr2
......@@ -465,13 +465,13 @@ openSSE128InnerCipherLoop:
JNE openSSE128InnerCipherLoop
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2
PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
PADDL T2, C1; PADDL T2, C2
PADDL T3, D1; PADDL sseIncMask<>(SB), T3; PADDL T3, D2
PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
// Clamp and store the key
PAND polyClampMask<>(SB), A0
PAND ·polyClampMask<>(SB), A0
MOVOU A0, rStore; MOVOU B0, sStore
// Hash
......@@ -509,7 +509,7 @@ openSSETail16:
// We can safely load the CT from the end, because it is padded with the MAC
MOVQ inl, itr2
SHLQ $4, itr2
LEAQ andMask<>(SB), t0
LEAQ ·andMask<>(SB), t0
MOVOU (inp), T0
ADDQ inl, inp
PAND -16(t0)(itr2*1), T0
......@@ -534,7 +534,7 @@ openSSETail16Store:
// Special optimization for the last 64 bytes of ciphertext
openSSETail64:
// Need to decrypt up to 64 bytes - prepare single block
MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr0Store
MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
XORQ itr2, itr2
MOVQ inl, itr1
CMPQ itr1, $16
......@@ -559,7 +559,7 @@ openSSETail64LoopB:
CMPQ itr2, $160
JNE openSSETail64LoopB
PADDL chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0
openSSETail64DecLoop:
CMPQ inl, $16
......@@ -583,8 +583,8 @@ openSSETail64DecLoopDone:
// Special optimization for the last 128 bytes of ciphertext
openSSETail128:
// Need to decrypt up to 128 bytes - prepare two blocks
MOVO chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr0Store
MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr1Store
MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr0Store
MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store
XORQ itr2, itr2
MOVQ inl, itr1
ANDQ $-16, itr1
......@@ -609,7 +609,7 @@ openSSETail128LoopB:
CMPQ itr2, $160
JNE openSSETail128LoopB
PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1
PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
PADDL state1Store, B0; PADDL state1Store, B1
PADDL state2Store, C0; PADDL state2Store, C1
PADDL ctr1Store, D0; PADDL ctr0Store, D1
......@@ -627,9 +627,9 @@ openSSETail128LoopB:
// Special optimization for the last 192 bytes of ciphertext
openSSETail192:
// Need to decrypt up to 192 bytes - prepare three blocks
MOVO chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL sseIncMask<>(SB), D2; MOVO D2, ctr0Store
MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr1Store
MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr2Store
MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store
MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store
MOVQ inl, itr1
MOVQ $160, itr2
......@@ -674,7 +674,7 @@ openSSLTail192LoopB:
polyMul
openSSLTail192Store:
PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2
PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2
......@@ -696,10 +696,10 @@ openSSLTail192Store:
// Special optimization for the last 256 bytes of ciphertext
openSSETail256:
// Need to decrypt up to 256 bytes - prepare four blocks
MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2
MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3
MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
// Store counters
MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
......@@ -744,7 +744,7 @@ openSSETail256HashLoop:
JB openSSETail256HashLoop
// Add in the state
PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3
PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
......@@ -779,11 +779,11 @@ openSSETail256HashLoop:
// ------------------------- AVX2 Code ----------------------------------------
chacha20Poly1305Open_AVX2:
VZEROUPPER
VMOVDQU chacha20Constants<>(SB), AA0
VMOVDQU ·chacha20Constants<>(SB), AA0
BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
VPADDD avx2InitMask<>(SB), DD0, DD0
VPADDD ·avx2InitMask<>(SB), DD0, DD0
// Special optimization, for very short buffers
CMPQ inl, $192
......@@ -805,7 +805,7 @@ openAVX2PreparePolyKey:
DECQ itr2
JNE openAVX2PreparePolyKey
VPADDD chacha20Constants<>(SB), AA0, AA0
VPADDD ·chacha20Constants<>(SB), AA0, AA0
VPADDD state1StoreAVX2, BB0, BB0
VPADDD state2StoreAVX2, CC0, CC0
VPADDD ctr3StoreAVX2, DD0, DD0
......@@ -813,7 +813,7 @@ openAVX2PreparePolyKey:
VPERM2I128 $0x02, AA0, BB0, TT0
// Clamp and store poly key
VPAND polyClampMask<>(SB), TT0, TT0
VPAND ·polyClampMask<>(SB), TT0, TT0
VMOVDQA TT0, rsStoreAVX2
// Stream for the first 64 bytes
......@@ -846,10 +846,10 @@ openAVX2MainLoop:
JB openAVX2MainLoopDone
// Load state, increment counter blocks, store the incremented counters
VMOVDQU chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
VMOVDQA ctr3StoreAVX2, DD0; VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3
VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
XORQ itr1, itr1
......@@ -860,7 +860,7 @@ openAVX2InternalLoop:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
polyMulStage1_AVX2
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
polyMulStage2_AVX2
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
......@@ -874,7 +874,7 @@ openAVX2InternalLoop:
polyMulReduceStage
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
polyAdd(2*8(inp)(itr1*1))
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
polyMulStage1_AVX2
......@@ -892,7 +892,7 @@ openAVX2InternalLoop:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
polyMulStage3_AVX2
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
polyMulReduceStage
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
......@@ -908,7 +908,7 @@ openAVX2InternalLoop:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
polyMulStage2_AVX2
VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
polyMulStage3_AVX2
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
......@@ -925,7 +925,7 @@ openAVX2InternalLoop:
CMPQ itr1, $480
JNE openAVX2InternalLoop
VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
......@@ -974,7 +974,7 @@ openAVX2192:
VMOVDQA AA0, AA1
VMOVDQA BB0, BB1
VMOVDQA CC0, CC1
VPADDD avx2IncMask<>(SB), DD0, DD1
VPADDD ·avx2IncMask<>(SB), DD0, DD1
VMOVDQA AA0, AA2
VMOVDQA BB0, BB2
VMOVDQA CC0, CC2
......@@ -1000,7 +1000,7 @@ openAVX2192InnerCipherLoop:
VPERM2I128 $0x02, AA0, BB0, TT0
// Clamp and store poly key
VPAND polyClampMask<>(SB), TT0, TT0
VPAND ·polyClampMask<>(SB), TT0, TT0
VMOVDQA TT0, rsStoreAVX2
// Stream for up to 192 bytes
......@@ -1072,8 +1072,8 @@ openAVX2ShortDone:
// Special optimization for buffers smaller than 321 bytes
openAVX2320:
// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD avx2IncMask<>(SB), DD0, DD1
VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD avx2IncMask<>(SB), DD1, DD2
VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
MOVQ $10, itr2
......@@ -1089,18 +1089,18 @@ openAVX2320InnerCipherLoop:
DECQ itr2
JNE openAVX2320InnerCipherLoop
VMOVDQA chacha20Constants<>(SB), TT0
VMOVDQA ·chacha20Constants<>(SB), TT0
VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
VMOVDQA avx2IncMask<>(SB), TT0
VMOVDQA ·avx2IncMask<>(SB), TT0
VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
VPADDD TT3, DD2, DD2
// Clamp and store poly key
VPERM2I128 $0x02, AA0, BB0, TT0
VPAND polyClampMask<>(SB), TT0, TT0
VPAND ·polyClampMask<>(SB), TT0, TT0
VMOVDQA TT0, rsStoreAVX2
// Stream for up to 320 bytes
......@@ -1120,11 +1120,11 @@ openAVX2320InnerCipherLoop:
// Special optimization for the last 128 bytes of ciphertext
openAVX2Tail128:
// Need to decrypt up to 128 bytes - prepare two blocks
VMOVDQA chacha20Constants<>(SB), AA1
VMOVDQA ·chacha20Constants<>(SB), AA1
VMOVDQA state1StoreAVX2, BB1
VMOVDQA state2StoreAVX2, CC1
VMOVDQA ctr3StoreAVX2, DD1
VPADDD avx2IncMask<>(SB), DD1, DD1
VPADDD ·avx2IncMask<>(SB), DD1, DD1
VMOVDQA DD1, DD0
XORQ itr2, itr2
......@@ -1153,7 +1153,7 @@ openAVX2Tail128LoopB:
CMPQ itr2, $160
JNE openAVX2Tail128LoopB
VPADDD chacha20Constants<>(SB), AA1, AA1
VPADDD ·chacha20Constants<>(SB), AA1, AA1
VPADDD state1StoreAVX2, BB1, BB1
VPADDD state2StoreAVX2, CC1, CC1
VPADDD DD0, DD1, DD1
......@@ -1196,12 +1196,12 @@ openAVX2TailDone:
// Special optimization for the last 256 bytes of ciphertext
openAVX2Tail256:
// Need to decrypt up to 256 bytes - prepare four blocks
VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1
VMOVDQA ctr3StoreAVX2, DD0
VPADDD avx2IncMask<>(SB), DD0, DD0
VPADDD avx2IncMask<>(SB), DD0, DD1
VPADDD ·avx2IncMask<>(SB), DD0, DD0
VPADDD ·avx2IncMask<>(SB), DD0, DD1
VMOVDQA DD0, TT1
VMOVDQA DD1, TT2
......@@ -1255,7 +1255,7 @@ openAVX2Tail256Hash:
// Store 128 bytes safely, then go to store loop
openAVX2Tail256HashEnd:
VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
......@@ -1274,13 +1274,13 @@ openAVX2Tail256HashEnd:
// Special optimization for the last 384 bytes of ciphertext
openAVX2Tail384:
// Need to decrypt up to 384 bytes - prepare six blocks
VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
VMOVDQA ctr3StoreAVX2, DD0
VPADDD avx2IncMask<>(SB), DD0, DD0
VPADDD avx2IncMask<>(SB), DD0, DD1
VPADDD avx2IncMask<>(SB), DD1, DD2
VPADDD ·avx2IncMask<>(SB), DD0, DD0
VPADDD ·avx2IncMask<>(SB), DD0, DD1
VPADDD ·avx2IncMask<>(SB), DD1, DD2
VMOVDQA DD0, ctr0StoreAVX2
VMOVDQA DD1, ctr1StoreAVX2
VMOVDQA DD2, ctr2StoreAVX2
......@@ -1339,7 +1339,7 @@ openAVX2Tail384Hash:
// Store 256 bytes safely, then go to store loop
openAVX2Tail384HashEnd:
VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2
......@@ -1358,10 +1358,10 @@ openAVX2Tail384HashEnd:
// ----------------------------------------------------------------------------
// Special optimization for the last 512 bytes of ciphertext
openAVX2Tail512:
VMOVDQU chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
VMOVDQA ctr3StoreAVX2, DD0; VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3
VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
XORQ itr1, itr1
MOVQ inp, itr2
......@@ -1374,7 +1374,7 @@ openAVX2Tail512LoopB:
openAVX2Tail512LoopA:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
......@@ -1387,7 +1387,7 @@ openAVX2Tail512LoopA:
polyMulAVX2
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
......@@ -1401,7 +1401,7 @@ openAVX2Tail512LoopA:
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
polyAdd(2*8(itr2))
......@@ -1415,7 +1415,7 @@ openAVX2Tail512LoopA:
VMOVDQA tmpStoreAVX2, CC3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
......@@ -1448,7 +1448,7 @@ openAVX2Tail512HashLoop:
JMP openAVX2Tail512HashLoop
openAVX2Tail512HashEnd:
VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
......@@ -1493,7 +1493,7 @@ TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
JBE sealSSE128 // About 15% faster
// In the seal case - prepare the poly key + 3 blocks of stream in the first iteration
MOVOU chacha20Constants<>(SB), A0
MOVOU ·chacha20Constants<>(SB), A0
MOVOU (1*16)(keyp), B0
MOVOU (2*16)(keyp), C0
MOVOU (3*16)(keyp), D0
......@@ -1503,9 +1503,9 @@ TEXT ·chacha20Poly1305Seal(SB), 0, $288-96
MOVO C0, state2Store
// Load state, increment counter blocks
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2
MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
// Store counters
MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
......@@ -1535,13 +1535,13 @@ sealSSEIntroLoop:
JNE sealSSEIntroLoop
// Add in the state
PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3
PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
// Clamp and store the key
PAND polyClampMask<>(SB), A0
PAND ·polyClampMask<>(SB), A0
MOVO A0, rStore
MOVO B0, sStore
......@@ -1585,10 +1585,10 @@ sealSSEIntroLoop:
sealSSEMainLoop:
// Load state, increment counter blocks
MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2
MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL sseIncMask<>(SB), D3
MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3
// Store counters
MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store
......@@ -1627,7 +1627,7 @@ sealSSEInnerLoop:
JG sealSSEInnerLoop
// Add in the state
PADDD chacha20Constants<>(SB), A0; PADDD chacha20Constants<>(SB), A1; PADDD chacha20Constants<>(SB), A2; PADDD chacha20Constants<>(SB), A3
PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3
PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3
PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3
PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3
......@@ -1683,11 +1683,11 @@ sealSSEInnerLoop:
// Special optimization for the last 64 bytes of plaintext
sealSSETail64:
// Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes
MOVO chacha20Constants<>(SB), A1
MOVO ·chacha20Constants<>(SB), A1
MOVO state1Store, B1
MOVO state2Store, C1
MOVO ctr3Store, D1
PADDL sseIncMask<>(SB), D1
PADDL ·sseIncMask<>(SB), D1
MOVO D1, ctr0Store
sealSSETail64LoopA:
......@@ -1710,7 +1710,7 @@ sealSSETail64LoopB:
DECQ itr2
JGE sealSSETail64LoopB
PADDL chacha20Constants<>(SB), A1
PADDL ·chacha20Constants<>(SB), A1
PADDL state1Store, B1
PADDL state2Store, C1
PADDL ctr0Store, D1
......@@ -1721,8 +1721,8 @@ sealSSETail64LoopB:
// Special optimization for the last 128 bytes of plaintext
sealSSETail128:
// Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes
MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr0Store
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr1Store
MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
sealSSETail128LoopA:
// Perform ChaCha rounds, while hashing the prevsiosly encrpyted ciphertext
......@@ -1747,7 +1747,7 @@ sealSSETail128LoopB:
DECQ itr2
JGE sealSSETail128LoopB
PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1
PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1
PADDL state1Store, B0; PADDL state1Store, B1
PADDL state2Store, C0; PADDL state2Store, C1
PADDL ctr0Store, D0; PADDL ctr1Store, D1
......@@ -1766,9 +1766,9 @@ sealSSETail128LoopB:
// Special optimization for the last 192 bytes of plaintext
sealSSETail192:
// Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes
MOVO chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL sseIncMask<>(SB), D0; MOVO D0, ctr0Store
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1; MOVO D1, ctr1Store
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2; MOVO D2, ctr2Store
MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store
sealSSETail192LoopA:
// Perform ChaCha rounds, while hashing the prevsiosly encrpyted ciphertext
......@@ -1797,7 +1797,7 @@ sealSSETail192LoopB:
DECQ itr2
JGE sealSSETail192LoopB
PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2
PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2
PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2
PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2
......@@ -1823,9 +1823,9 @@ sealSSETail192LoopB:
// Special seal optimization for buffers smaller than 129 bytes
sealSSE128:
// For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks
MOVOU chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL sseIncMask<>(SB), D2
MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0
MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1
MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2
MOVO B0, T1; MOVO C0, T2; MOVO D1, T3
MOVQ $10, itr2
......@@ -1842,11 +1842,11 @@ sealSSE128InnerCipherLoop:
JNE sealSSE128InnerCipherLoop
// A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
PADDL chacha20Constants<>(SB), A0; PADDL chacha20Constants<>(SB), A1; PADDL chacha20Constants<>(SB), A2
PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2
PADDL T1, B0; PADDL T1, B1; PADDL T1, B2
PADDL T2, C1; PADDL T2, C2
PADDL T3, D1; PADDL sseIncMask<>(SB), T3; PADDL T3, D2
PAND polyClampMask<>(SB), A0
PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2
PAND ·polyClampMask<>(SB), A0
MOVOU A0, rStore
MOVOU B0, sStore
......@@ -1903,7 +1903,7 @@ sealSSETail:
// We can only load the PT one byte at a time to avoid read after end of buffer
MOVQ inl, itr2
SHLQ $4, itr2
LEAQ andMask<>(SB), t0
LEAQ ·andMask<>(SB), t0
MOVQ inl, itr1
LEAQ -1(inp)(inl*1), inp
XORQ t2, t2
......@@ -1963,11 +1963,11 @@ sealSSEFinalize:
// ------------------------- AVX2 Code ----------------------------------------
chacha20Poly1305Seal_AVX2:
VZEROUPPER
VMOVDQU chacha20Constants<>(SB), AA0
VMOVDQU ·chacha20Constants<>(SB), AA0
BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14
BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12
BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4
VPADDD avx2InitMask<>(SB), DD0, DD0
VPADDD ·avx2InitMask<>(SB), DD0, DD0
// Special optimizations, for very short buffers
CMPQ inl, $192
......@@ -1979,9 +1979,9 @@ chacha20Poly1305Seal_AVX2:
VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2
VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2
VPADDD avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
VPADDD avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
VPADDD avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2
VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2
VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2
VMOVDQA DD3, ctr3StoreAVX2
MOVQ $10, itr2
......@@ -2012,7 +2012,7 @@ sealAVX2IntroLoop:
DECQ itr2
JNE sealAVX2IntroLoop
VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
......@@ -2022,7 +2022,7 @@ sealAVX2IntroLoop:
VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95
// Clamp and store poly key
VPAND polyClampMask<>(SB), DD0, DD0
VPAND ·polyClampMask<>(SB), DD0, DD0
VMOVDQA DD0, rsStoreAVX2
// Hash AD
......@@ -2068,11 +2068,11 @@ sealAVX2IntroLoop:
JBE sealAVX2Tail512
// We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
VMOVDQA ctr3StoreAVX2, DD0
VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3
VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
VMOVDQA CC3, tmpStoreAVX2
......@@ -2100,7 +2100,7 @@ sealAVX2IntroLoop:
VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
......@@ -2116,10 +2116,10 @@ sealAVX2IntroLoop:
sealAVX2MainLoop:
// Load state, increment counter blocks, store the incremented counters
VMOVDQU chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
VMOVDQA ctr3StoreAVX2, DD0; VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3
VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
MOVQ $10, itr1
......@@ -2128,7 +2128,7 @@ sealAVX2InternalLoop:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
polyMulStage1_AVX2
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
polyMulStage2_AVX2
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
......@@ -2144,7 +2144,7 @@ sealAVX2InternalLoop:
sealAVX2InternalLoopStart:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
polyAdd(2*8(oup))
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
polyMulStage1_AVX2
......@@ -2162,7 +2162,7 @@ sealAVX2InternalLoopStart:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
polyMulStage3_AVX2
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
polyMulReduceStage
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
......@@ -2178,7 +2178,7 @@ sealAVX2InternalLoopStart:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
polyMulStage2_AVX2
VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
polyMulStage3_AVX2
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
......@@ -2195,7 +2195,7 @@ sealAVX2InternalLoopStart:
DECQ itr1
JNE sealAVX2InternalLoop
VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
......@@ -2250,7 +2250,7 @@ seal192AVX2:
VMOVDQA AA0, AA1
VMOVDQA BB0, BB1
VMOVDQA CC0, CC1
VPADDD avx2IncMask<>(SB), DD0, DD1
VPADDD ·avx2IncMask<>(SB), DD0, DD1
VMOVDQA AA0, AA2
VMOVDQA BB0, BB2
VMOVDQA CC0, CC2
......@@ -2276,7 +2276,7 @@ sealAVX2192InnerCipherLoop:
VPERM2I128 $0x02, AA0, BB0, TT0
// Clamp and store poly key
VPAND polyClampMask<>(SB), TT0, TT0
VPAND ·polyClampMask<>(SB), TT0, TT0
VMOVDQA TT0, rsStoreAVX2
// Stream for up to 192 bytes
......@@ -2359,8 +2359,8 @@ sealAVX2ShortDone:
// Special optimization for buffers smaller than 321 bytes
seal320AVX2:
// For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks
VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD avx2IncMask<>(SB), DD0, DD1
VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD avx2IncMask<>(SB), DD1, DD2
VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1
VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2
VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3
MOVQ $10, itr2
......@@ -2376,18 +2376,18 @@ sealAVX2320InnerCipherLoop:
DECQ itr2
JNE sealAVX2320InnerCipherLoop
VMOVDQA chacha20Constants<>(SB), TT0
VMOVDQA ·chacha20Constants<>(SB), TT0
VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2
VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2
VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2
VMOVDQA avx2IncMask<>(SB), TT0
VMOVDQA ·avx2IncMask<>(SB), TT0
VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3
VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3
VPADDD TT3, DD2, DD2
// Clamp and store poly key
VPERM2I128 $0x02, AA0, BB0, TT0
VPAND polyClampMask<>(SB), TT0, TT0
VPAND ·polyClampMask<>(SB), TT0, TT0
VMOVDQA TT0, rsStoreAVX2
// Stream for up to 320 bytes
......@@ -2409,11 +2409,11 @@ sealAVX2Tail128:
// Need to decrypt up to 128 bytes - prepare two blocks
// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
VMOVDQA chacha20Constants<>(SB), AA0
VMOVDQA ·chacha20Constants<>(SB), AA0
VMOVDQA state1StoreAVX2, BB0
VMOVDQA state2StoreAVX2, CC0
VMOVDQA ctr3StoreAVX2, DD0
VPADDD avx2IncMask<>(SB), DD0, DD0
VPADDD ·avx2IncMask<>(SB), DD0, DD0
VMOVDQA DD0, DD1
sealAVX2Tail128LoopA:
......@@ -2440,7 +2440,7 @@ sealAVX2Tail128LoopB:
DECQ itr2
JGE sealAVX2Tail128LoopB
VPADDD chacha20Constants<>(SB), AA0, AA1
VPADDD ·chacha20Constants<>(SB), AA0, AA1
VPADDD state1StoreAVX2, BB0, BB1
VPADDD state2StoreAVX2, CC0, CC1
VPADDD DD1, DD0, DD1
......@@ -2457,12 +2457,12 @@ sealAVX2Tail256:
// Need to decrypt up to 256 bytes - prepare two blocks
// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA chacha20Constants<>(SB), AA1
VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1
VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1
VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1
VMOVDQA ctr3StoreAVX2, DD0
VPADDD avx2IncMask<>(SB), DD0, DD0
VPADDD avx2IncMask<>(SB), DD0, DD1
VPADDD ·avx2IncMask<>(SB), DD0, DD0
VPADDD ·avx2IncMask<>(SB), DD0, DD1
VMOVDQA DD0, TT1
VMOVDQA DD1, TT2
......@@ -2490,7 +2490,7 @@ sealAVX2Tail256LoopB:
DECQ itr2
JGE sealAVX2Tail256LoopB
VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1
VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1
......@@ -2516,11 +2516,11 @@ sealAVX2Tail384:
// Need to decrypt up to 384 bytes - prepare two blocks
// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2
VMOVDQA ctr3StoreAVX2, DD0
VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2
VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2
VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3
sealAVX2Tail384LoopA:
......@@ -2547,7 +2547,7 @@ sealAVX2Tail384LoopB:
DECQ itr2
JGE sealAVX2Tail384LoopB
VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2
VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2
......@@ -2579,11 +2579,11 @@ sealAVX2Tail512:
// Need to decrypt up to 512 bytes - prepare two blocks
// If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed
// If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed
VMOVDQA chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3
VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3
VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3
VMOVDQA ctr3StoreAVX2, DD0
VPADDD avx2IncMask<>(SB), DD0, DD0; VPADDD avx2IncMask<>(SB), DD0, DD1; VPADDD avx2IncMask<>(SB), DD1, DD2; VPADDD avx2IncMask<>(SB), DD2, DD3
VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3
VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2
sealAVX2Tail512LoopA:
......@@ -2594,7 +2594,7 @@ sealAVX2Tail512LoopA:
sealAVX2Tail512LoopB:
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
......@@ -2607,7 +2607,7 @@ sealAVX2Tail512LoopB:
polyMulAVX2
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
......@@ -2621,7 +2621,7 @@ sealAVX2Tail512LoopB:
VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol16<>(SB), DD0, DD0; VPSHUFB rol16<>(SB), DD1, DD1; VPSHUFB rol16<>(SB), DD2, DD2; VPSHUFB rol16<>(SB), DD3, DD3
VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
polyAdd(2*8(oup))
......@@ -2635,7 +2635,7 @@ sealAVX2Tail512LoopB:
VMOVDQA tmpStoreAVX2, CC3
VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3
VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3
VPSHUFB rol8<>(SB), DD0, DD0; VPSHUFB rol8<>(SB), DD1, DD1; VPSHUFB rol8<>(SB), DD2, DD2; VPSHUFB rol8<>(SB), DD3, DD3
VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3
VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3
VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3
VMOVDQA CC3, tmpStoreAVX2
......@@ -2653,7 +2653,7 @@ sealAVX2Tail512LoopB:
DECQ itr2
JGE sealAVX2Tail512LoopB
VPADDD chacha20Constants<>(SB), AA0, AA0; VPADDD chacha20Constants<>(SB), AA1, AA1; VPADDD chacha20Constants<>(SB), AA2, AA2; VPADDD chacha20Constants<>(SB), AA3, AA3
VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3
VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3
VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3
VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment