Commit f9b1d646 authored by Dave Watson, committed by Herbert Xu

crypto: aesni - Merge GCM_ENC_DEC

The GCM_ENC_DEC routines for AVX and AVX2 are identical, except they
call separate sub-macros.  Pass the macros as arguments, and merge them.
This facilitates additional refactoring, by requiring changes in only
one place.

The GCM_ENC_DEC macro was moved above the CONFIG_AS_AVX* ifdefs,
since it will be used by both AVX and AVX2.
Signed-off-by: Dave Watson <davejwatson@fb.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 00c9fe37
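The merge relies on gas macro arguments being plain text substitution: a .macro parameter can carry the name of another macro, and \ARG then invokes it. A minimal sketch of the pattern with hypothetical names (STEP_AVX, STEP_AVX2 and GCM_BODY are illustrative only, not taken from the patch):

# Sketch only: illustrative macro names, not part of this patch.
.macro STEP_AVX REG
	vpxor	\REG, \REG, \REG	# AVX flavour of one step
.endm
.macro STEP_AVX2 REG
	vpxor	\REG, \REG, \REG	# AVX2 flavour of the same step
.endm
# Shared body: STEP holds the name of the sub-macro to expand.
.macro GCM_BODY STEP
	\STEP	%xmm0			# becomes "STEP_AVX %xmm0" or "STEP_AVX2 %xmm0"
.endm
	GCM_BODY STEP_AVX		# one body, two instantiations
	GCM_BODY STEP_AVX2

The entry points at the end of the diff instantiate GCM_ENC_DEC in exactly this way, passing either the _AVX or the _AVX2 helpers.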
@@ -280,6 +280,320 @@ VARIABLE_OFFSET = 16*8
vaesenclast 16*10(arg1), \XMM0, \XMM0
.endm
# combined for GCM encrypt and decrypt functions
# clobbering all xmm registers
# clobbering r10, r11, r12, r13, r14, r15
.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC
#the number of pushes must equal STACK_OFFSET
push %r12
push %r13
push %r14
push %r15
mov %rsp, %r14
sub $VARIABLE_OFFSET, %rsp
and $~63, %rsp # align rsp to 64 bytes
vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
and $-16, %r13 # r13 = r13 - (r13 mod 16)
mov %r13, %r12
shr $4, %r12
and $7, %r12
jz _initial_num_blocks_is_0\@
cmp $7, %r12
je _initial_num_blocks_is_7\@
cmp $6, %r12
je _initial_num_blocks_is_6\@
cmp $5, %r12
je _initial_num_blocks_is_5\@
cmp $4, %r12
je _initial_num_blocks_is_4\@
cmp $3, %r12
je _initial_num_blocks_is_3\@
cmp $2, %r12
je _initial_num_blocks_is_2\@
jmp _initial_num_blocks_is_1\@
_initial_num_blocks_is_7\@:
\INITIAL_BLOCKS 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*7, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_6\@:
\INITIAL_BLOCKS 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*6, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_5\@:
\INITIAL_BLOCKS 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*5, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_4\@:
\INITIAL_BLOCKS 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*4, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_3\@:
\INITIAL_BLOCKS 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*3, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_2\@:
\INITIAL_BLOCKS 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*2, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_1\@:
\INITIAL_BLOCKS 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*1, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_0\@:
\INITIAL_BLOCKS 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
_initial_blocks_encrypted\@:
cmp $0, %r13
je _zero_cipher_left\@
sub $128, %r13
je _eight_cipher_left\@
vmovd %xmm9, %r15d
and $255, %r15d
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
_encrypt_by_8_new\@:
cmp $(255-8), %r15d
jg _encrypt_by_8\@
add $8, %r15b
\GHASH_8_ENCRYPT_8_PARALLEL %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
add $128, %r11
sub $128, %r13
jne _encrypt_by_8_new\@
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
jmp _eight_cipher_left\@
_encrypt_by_8\@:
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
add $8, %r15b
\GHASH_8_ENCRYPT_8_PARALLEL %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
add $128, %r11
sub $128, %r13
jne _encrypt_by_8_new\@
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
_eight_cipher_left\@:
\GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8
_zero_cipher_left\@:
cmp $16, arg4
jl _only_less_than_16\@
mov arg4, %r13
and $15, %r13 # r13 = (arg4 mod 16)
je _multiple_of_16_bytes\@
# handle the last <16 Byte block separately
vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
sub $16, %r11
add %r13, %r11
vmovdqu (arg3, %r11), %xmm1 # receive the last <16 Byte block
lea SHIFT_MASK+16(%rip), %r12
sub %r13, %r12 # adjust the shuffle mask pointer to be
# able to shift 16-r13 bytes (r13 is the
# number of bytes in plaintext mod 16)
vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask
vpshufb %xmm2, %xmm1, %xmm1 # shift right 16-r13 bytes
jmp _final_ghash_mul\@
_only_less_than_16\@:
# check for 0 length
mov arg4, %r13
and $15, %r13 # r13 = (arg4 mod 16)
je _multiple_of_16_bytes\@
# handle the last <16 Byte block separately
vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Yn)
lea SHIFT_MASK+16(%rip), %r12
sub %r13, %r12 # adjust the shuffle mask pointer to be
# able to shift 16-r13 bytes (r13 is the
# number of bytes in plaintext mod 16)
_get_last_16_byte_loop\@:
movb (arg3, %r11), %al
movb %al, TMP1 (%rsp , %r11)
add $1, %r11
cmp %r13, %r11
jne _get_last_16_byte_loop\@
vmovdqu TMP1(%rsp), %xmm1
sub $16, %r11
_final_ghash_mul\@:
.if \ENC_DEC == DEC
vmovdqa %xmm1, %xmm2
vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
# mask out top 16-r13 bytes of xmm9
vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
vpand %xmm1, %xmm2, %xmm2
vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
vpxor %xmm2, %xmm14, %xmm14
#GHASH computation for the last <16 Byte block
\GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
sub %r13, %r11
add $16, %r11
.else
vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn)
vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to
# mask out top 16-r13 bytes of xmm9
vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
vpxor %xmm9, %xmm14, %xmm14
#GHASH computation for the last <16 Byte block
\GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
sub %r13, %r11
add $16, %r11
vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext
.endif
#############################
# output r13 Bytes
vmovq %xmm9, %rax
cmp $8, %r13
jle _less_than_8_bytes_left\@
mov %rax, (arg2 , %r11)
add $8, %r11
vpsrldq $8, %xmm9, %xmm9
vmovq %xmm9, %rax
sub $8, %r13
_less_than_8_bytes_left\@:
movb %al, (arg2 , %r11)
add $1, %r11
shr $8, %rax
sub $1, %r13
jne _less_than_8_bytes_left\@
#############################
_multiple_of_16_bytes\@:
mov arg7, %r12 # r12 = aadLen (number of bytes)
shl $3, %r12 # convert into number of bits
vmovd %r12d, %xmm15 # len(A) in xmm15
shl $3, arg4 # len(C) in bits (*128)
vmovq arg4, %xmm1
vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000
vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C)
vpxor %xmm15, %xmm14, %xmm14
\GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation
vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap
mov arg5, %rax # rax = *Y0
vmovdqu (%rax), %xmm9 # xmm9 = Y0
ENCRYPT_SINGLE_BLOCK %xmm9 # E(K, Y0)
vpxor %xmm14, %xmm9, %xmm9
_return_T\@:
mov arg8, %r10 # r10 = authTag
mov arg9, %r11 # r11 = auth_tag_len
cmp $16, %r11
je _T_16\@
cmp $8, %r11
jl _T_4\@
_T_8\@:
vmovq %xmm9, %rax
mov %rax, (%r10)
add $8, %r10
sub $8, %r11
vpsrldq $8, %xmm9, %xmm9
cmp $0, %r11
je _return_T_done\@
_T_4\@:
vmovd %xmm9, %eax
mov %eax, (%r10)
add $4, %r10
sub $4, %r11
vpsrldq $4, %xmm9, %xmm9
cmp $0, %r11
je _return_T_done\@
_T_123\@:
vmovd %xmm9, %eax
cmp $2, %r11
jl _T_1\@
mov %ax, (%r10)
cmp $2, %r11
je _return_T_done\@
add $2, %r10
sar $16, %eax
_T_1\@:
mov %al, (%r10)
jmp _return_T_done\@
_T_16\@:
vmovdqu %xmm9, (%r10)
_return_T_done\@:
mov %r14, %rsp
pop %r15
pop %r14
pop %r13
pop %r12
.endm
#ifdef CONFIG_AS_AVX
###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
@@ -873,316 +1187,128 @@ _initial_blocks_done\@:
vaesenc \T1, \XMM5, \XMM5
vaesenc \T1, \XMM6, \XMM6
vaesenc \T1, \XMM7, \XMM7
vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP5(%rsp), \T1
vmovdqa HashKey_4(arg1), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3
vpxor \T3, \T7, \T7
vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3
vmovdqa HashKey_4_k(arg1), \T5
vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6
vmovdqu 16*7(arg1), \T1
vaesenc \T1, \XMM1, \XMM1
vaesenc \T1, \XMM2, \XMM2
vaesenc \T1, \XMM3, \XMM3
vaesenc \T1, \XMM4, \XMM4
vaesenc \T1, \XMM5, \XMM5
vaesenc \T1, \XMM6, \XMM6
vaesenc \T1, \XMM7, \XMM7
vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP6(%rsp), \T1
vmovdqa HashKey_3(arg1), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3
vpxor \T3, \T7, \T7
vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3
vmovdqa HashKey_3_k(arg1), \T5
vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6
vmovdqu 16*8(arg1), \T1
vaesenc \T1, \XMM1, \XMM1
vaesenc \T1, \XMM2, \XMM2
vaesenc \T1, \XMM3, \XMM3
vaesenc \T1, \XMM4, \XMM4
vaesenc \T1, \XMM5, \XMM5
vaesenc \T1, \XMM6, \XMM6
vaesenc \T1, \XMM7, \XMM7
vaesenc \T1, \XMM8, \XMM8
vmovdqa TMP7(%rsp), \T1
vmovdqa HashKey_2(arg1), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3
vpxor \T3, \T7, \T7
vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3
vmovdqa HashKey_2_k(arg1), \T5
vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6
#######################################################################
vmovdqu 16*9(arg1), \T5
vaesenc \T5, \XMM1, \XMM1
vaesenc \T5, \XMM2, \XMM2
vaesenc \T5, \XMM3, \XMM3
vaesenc \T5, \XMM4, \XMM4
vaesenc \T5, \XMM5, \XMM5
vaesenc \T5, \XMM6, \XMM6
vaesenc \T5, \XMM7, \XMM7
vaesenc \T5, \XMM8, \XMM8
vmovdqa TMP8(%rsp), \T1
vmovdqa HashKey(arg1), \T5
vpclmulqdq $0x11, \T5, \T1, \T3
vpxor \T3, \T4, \T4
vpclmulqdq $0x00, \T5, \T1, \T3
vpxor \T3, \T7, \T7
vpshufd $0b01001110, \T1, \T3
vpxor \T1, \T3, \T3
vmovdqa HashKey_k(arg1), \T5
vpclmulqdq $0x10, \T5, \T3, \T3
vpxor \T3, \T6, \T6
vpxor \T4, \T6, \T6
vpxor \T7, \T6, \T6
vmovdqu 16*10(arg1), \T5
i = 0
j = 1
setreg
.rep 8
vpxor 16*i(arg3, %r11), \T5, \T2
.if \ENC_DEC == ENC
vaesenclast \T2, reg_j, reg_j
.else
vaesenclast \T2, reg_j, \T3
vmovdqu 16*i(arg3, %r11), reg_j
vmovdqu \T3, 16*i(arg2, %r11)
.endif
i = (i+1)
j = (j+1)
setreg
.endr
#######################################################################
vpslldq $8, \T6, \T3 # shift-L T3 2 DWs
vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs
vpxor \T3, \T7, \T7
vpxor \T4, \T6, \T6 # accumulate the results in T6:T7
#######################################################################
#first phase of the reduction
#######################################################################
vpslld $31, \T7, \T2 # packed right shifting << 31
vpslld $30, \T7, \T3 # packed right shifting shift << 30
vpslld $25, \T7, \T4 # packed right shifting shift << 25
vpxor \T3, \T2, \T2 # xor the shifted versions
vpxor \T4, \T2, \T2
vpsrldq $4, \T2, \T1 # shift-R T1 1 DW
vpslldq $12, \T2, \T2 # shift-L T2 3 DWs
vpxor \T2, \T7, \T7 # first phase of the reduction complete
#######################################################################
.if \ENC_DEC == ENC
vmovdqu \XMM1, 16*0(arg2,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM2, 16*1(arg2,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM3, 16*2(arg2,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM4, 16*3(arg2,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM5, 16*4(arg2,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM6, 16*5(arg2,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM7, 16*6(arg2,%r11) # Write to the Ciphertext buffer
vmovdqu \XMM8, 16*7(arg2,%r11) # Write to the Ciphertext buffer
.endif
#######################################################################
#second phase of the reduction
vpsrld $1, \T7, \T2 # packed left shifting >> 1
vpsrld $2, \T7, \T3 # packed left shifting >> 2
vpsrld $7, \T7, \T4 # packed left shifting >> 7
vpxor \T3, \T2, \T2 # xor the shifted versions
vpxor \T4, \T2, \T2
vpxor \T1, \T2, \T2
vpxor \T2, \T7, \T7
vpxor \T7, \T6, \T6 # the result is in T6
#######################################################################
vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap
vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap
vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap
vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap
vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap
vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap
vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap
vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap
vpxor \T6, \XMM1, \XMM1
.endm
# GHASH the last 4 ciphertext blocks.
.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8
## Karatsuba Method
vpshufd $0b01001110, \XMM1, \T2
vpxor \XMM1, \T2, \T2
vmovdqa HashKey_8(arg1), \T5
vpclmulqdq $0x11, \T5, \XMM1, \T6
vpclmulqdq $0x00, \T5, \XMM1, \T7
vmovdqa HashKey_8_k(arg1), \T3
vpclmulqdq $0x00, \T3, \T2, \XMM1
######################
vpshufd $0b01001110, \XMM2, \T2
vpxor \XMM2, \T2, \T2
vmovdqa HashKey_7(arg1), \T5
vpclmulqdq $0x11, \T5, \XMM2, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM2, \T4
vpxor \T4, \T7, \T7
vmovdqa HashKey_7_k(arg1), \T3
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
######################
vpshufd $0b01001110, \XMM3, \T2
vpxor \XMM3, \T2, \T2
vmovdqa HashKey_6(arg1), \T5
vpclmulqdq $0x11, \T5, \XMM3, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM3, \T4
vpxor \T4, \T7, \T7
vmovdqa HashKey_6_k(arg1), \T3
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
######################
vpshufd $0b01001110, \XMM4, \T2
vpxor \XMM4, \T2, \T2
vmovdqa HashKey_5(arg1), \T5
vpclmulqdq $0x11, \T5, \XMM4, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM4, \T4
vpxor \T4, \T7, \T7
vmovdqa HashKey_5_k(arg1), \T3
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
######################
[... remainder of GHASH_LAST_8_AVX: the \XMM5..\XMM8 Karatsuba blocks, the
accumulation of the partial products and the two-phase reduction (the result
ends up in \T6); unchanged by this commit ...]
@@ -1195,8 +1321,18 @@ _initial_blocks_done\@:
@@ -1207,325 +1343,187 @@ _initial_blocks_done\@:
[... removal of the old GCM_ENC_DEC_AVX ENC_DEC macro; its body matches the
new GCM_ENC_DEC added above, except that it calls INITIAL_BLOCKS_AVX,
GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX and GHASH_MUL_AVX directly ...]
############################################################# #############################################################
#void aesni_gcm_precomp_avx_gen2 #void aesni_gcm_precomp_avx_gen2
# (gcm_data *my_ctx_data, # (gcm_data *my_ctx_data,
...@@ -1593,7 +1591,7 @@ ENDPROC(aesni_gcm_precomp_avx_gen2) ...@@ -1593,7 +1591,7 @@ ENDPROC(aesni_gcm_precomp_avx_gen2)
# Valid values are 16 (most likely), 12 or 8. */ # Valid values are 16 (most likely), 12 or 8. */
############################################################################### ###############################################################################
ENTRY(aesni_gcm_enc_avx_gen2) ENTRY(aesni_gcm_enc_avx_gen2)
GCM_ENC_DEC_AVX ENC GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX ENC
ret ret
ENDPROC(aesni_gcm_enc_avx_gen2) ENDPROC(aesni_gcm_enc_avx_gen2)
...@@ -1614,7 +1612,7 @@ ENDPROC(aesni_gcm_enc_avx_gen2) ...@@ -1614,7 +1612,7 @@ ENDPROC(aesni_gcm_enc_avx_gen2)
# Valid values are 16 (most likely), 12 or 8. */ # Valid values are 16 (most likely), 12 or 8. */
############################################################################### ###############################################################################
ENTRY(aesni_gcm_dec_avx_gen2) ENTRY(aesni_gcm_dec_avx_gen2)
GCM_ENC_DEC_AVX DEC GCM_ENC_DEC INITIAL_BLOCKS_AVX GHASH_8_ENCRYPT_8_PARALLEL_AVX GHASH_LAST_8_AVX GHASH_MUL_AVX DEC
ret ret
ENDPROC(aesni_gcm_dec_avx_gen2) ENDPROC(aesni_gcm_dec_avx_gen2)
#endif /* CONFIG_AS_AVX */ #endif /* CONFIG_AS_AVX */
@@ -2379,476 +2377,163 @@ _initial_blocks_done\@:
vpshufd $0b01001110, \XMM2, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM2, \T2, \T2
vpxor \T5, \T3, \T3
vpclmulqdq $0x11, \T5, \XMM2, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM2, \T4
vpxor \T4, \T7, \T7
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
######################
vmovdqa HashKey_6(arg1), \T5
vpshufd $0b01001110, \XMM3, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM3, \T2, \T2
vpxor \T5, \T3, \T3
vpclmulqdq $0x11, \T5, \XMM3, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM3, \T4
vpxor \T4, \T7, \T7
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
######################
vmovdqa HashKey_5(arg1), \T5
vpshufd $0b01001110, \XMM4, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM4, \T2, \T2
vpxor \T5, \T3, \T3
vpclmulqdq $0x11, \T5, \XMM4, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM4, \T4
vpxor \T4, \T7, \T7
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
######################
vmovdqa HashKey_4(arg1), \T5
vpshufd $0b01001110, \XMM5, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM5, \T2, \T2
vpxor \T5, \T3, \T3
vpclmulqdq $0x11, \T5, \XMM5, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM5, \T4
vpxor \T4, \T7, \T7
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
######################
vmovdqa HashKey_3(arg1), \T5
vpshufd $0b01001110, \XMM6, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM6, \T2, \T2
vpxor \T5, \T3, \T3
vpclmulqdq $0x11, \T5, \XMM6, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM6, \T4
vpxor \T4, \T7, \T7
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
######################
vmovdqa HashKey_2(arg1), \T5
vpshufd $0b01001110, \XMM7, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM7, \T2, \T2
vpxor \T5, \T3, \T3
vpclmulqdq $0x11, \T5, \XMM7, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM7, \T4
vpxor \T4, \T7, \T7
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
######################
vmovdqa HashKey(arg1), \T5
vpshufd $0b01001110, \XMM8, \T2
vpshufd $0b01001110, \T5, \T3
vpxor \XMM8, \T2, \T2
vpxor \T5, \T3, \T3
vpclmulqdq $0x11, \T5, \XMM8, \T4
vpxor \T4, \T6, \T6
vpclmulqdq $0x00, \T5, \XMM8, \T4
vpxor \T4, \T7, \T7
vpclmulqdq $0x00, \T3, \T2, \T2
vpxor \T2, \XMM1, \XMM1
vpxor \T6, \XMM1, \XMM1
vpxor \T7, \XMM1, \T2
vpslldq $8, \T2, \T4
vpsrldq $8, \T2, \T2
vpxor \T4, \T7, \T7
vpxor \T2, \T6, \T6 # <T6:T7> holds the result of the
# accumulated carry-less multiplications
#######################################################################
#first phase of the reduction
vmovdqa POLY2(%rip), \T3
vpclmulqdq $0x01, \T7, \T3, \T2
vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs
vpxor \T2, \T7, \T7 # first phase of the reduction complete
#######################################################################
#second phase of the reduction
vpclmulqdq $0x00, \T7, \T3, \T2
vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)
vpclmulqdq $0x10, \T7, \T3, \T4
vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)
vpxor \T2, \T4, \T4 # second phase of the reduction complete
#######################################################################
vpxor \T4, \T6, \T6 # the result is in T6
.endm
# combined for GCM encrypt and decrypt functions
# clobbering all xmm registers
# clobbering r10, r11, r12, r13, r14, r15
.macro GCM_ENC_DEC_AVX2 ENC_DEC
#the number of pushes must equal STACK_OFFSET
push %r12
push %r13
push %r14
push %r15
mov %rsp, %r14
sub $VARIABLE_OFFSET, %rsp
and $~63, %rsp # align rsp to 64 bytes
vmovdqu HashKey(arg1), %xmm13 # xmm13 = HashKey
mov arg4, %r13 # save the number of bytes of plaintext/ciphertext
and $-16, %r13 # r13 = r13 - (r13 mod 16)
mov %r13, %r12
shr $4, %r12
and $7, %r12
jz _initial_num_blocks_is_0\@
cmp $7, %r12
je _initial_num_blocks_is_7\@
cmp $6, %r12
je _initial_num_blocks_is_6\@
cmp $5, %r12
je _initial_num_blocks_is_5\@
cmp $4, %r12
je _initial_num_blocks_is_4\@
cmp $3, %r12
je _initial_num_blocks_is_3\@
cmp $2, %r12
je _initial_num_blocks_is_2\@
jmp _initial_num_blocks_is_1\@
_initial_num_blocks_is_7\@:
INITIAL_BLOCKS_AVX2 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*7, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_6\@:
INITIAL_BLOCKS_AVX2 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*6, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_5\@:
INITIAL_BLOCKS_AVX2 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*5, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_4\@:
INITIAL_BLOCKS_AVX2 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*4, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_3\@:
INITIAL_BLOCKS_AVX2 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*3, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_2\@:
INITIAL_BLOCKS_AVX2 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*2, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_1\@:
INITIAL_BLOCKS_AVX2 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
sub $16*1, %r13
jmp _initial_blocks_encrypted\@
_initial_num_blocks_is_0\@:
INITIAL_BLOCKS_AVX2 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
_initial_blocks_encrypted\@:
cmp $0, %r13
je _zero_cipher_left\@
sub $128, %r13
je _eight_cipher_left\@
[... remainder of the removed GCM_ENC_DEC_AVX2 ENC_DEC macro: counter handling,
the encrypt-by-8 loops, the partial-block and tag-output code and the stack
epilogue; its body matches the new GCM_ENC_DEC added above, except that it
calls INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2
and GHASH_MUL_AVX2 directly ...]
#############################################################
#void aesni_gcm_precomp_avx_gen4
# (gcm_data *my_ctx_data,
@@ -2918,7 +2603,7 @@ ENDPROC(aesni_gcm_precomp_avx_gen4)
# Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_enc_avx_gen4)
GCM_ENC_DEC_AVX2 ENC
GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 ENC
ret
ENDPROC(aesni_gcm_enc_avx_gen4)
@@ -2939,7 +2624,7 @@ ENDPROC(aesni_gcm_enc_avx_gen4)
# Valid values are 16 (most likely), 12 or 8. */
###############################################################################
ENTRY(aesni_gcm_dec_avx_gen4)
GCM_ENC_DEC_AVX2 DEC
GCM_ENC_DEC INITIAL_BLOCKS_AVX2 GHASH_8_ENCRYPT_8_PARALLEL_AVX2 GHASH_LAST_8_AVX2 GHASH_MUL_AVX2 DEC
ret
ENDPROC(aesni_gcm_dec_avx_gen4)
......