Commit 0487ccac, authored by Sabrina Dubroca, committed by Herbert Xu

crypto: aesni - make non-AVX AES-GCM work with any aadlen

This is the first step to make the aesni AES-GCM implementation
generic. The current code was written for rfc4106, so it handles only
some specific sizes of associated data.
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent f4857f4c
...@@ -89,6 +89,29 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 ...@@ -89,6 +89,29 @@ SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F: .octa 0xffffffffffffffffffffffffffffffff ALL_F: .octa 0xffffffffffffffffffffffffffffffff
.octa 0x00000000000000000000000000000000 .octa 0x00000000000000000000000000000000
/*
 * aad_shift_arr: table of 17 byte-shuffle control masks, 16 bytes each
 * (17 * 16 = 272 bytes, matching the .size directive below).
 *
 * The AAD tail code multiplies the leftover AAD byte count by 16 and uses
 * the result as a byte offset into this table, then applies the selected
 * mask with PSHUFB_XMM. Entry n therefore corresponds to n leftover bytes.
 *
 * A mask byte of 0xff has its top bit set, so the shuffle writes zero to
 * that destination byte. The other mask bytes are source byte indices:
 * they take the n valid bytes from where the 4-byte/8-byte tail loads left
 * them (starting at byte 12, 8, 4 or 0 of the register, depending on how
 * many loads were needed) and pack them at the low end of the register.
 */
.section .rodata
.align 16
.type aad_shift_arr, @object
.size aad_shift_arr, 272
aad_shift_arr:
/* n = 0: no leftover bytes, every destination byte is zeroed */
.octa 0xffffffffffffffffffffffffffffffff
/* n = 1..4: valid bytes start at source byte 12 */
.octa 0xffffffffffffffffffffffffffffff0C
.octa 0xffffffffffffffffffffffffffff0D0C
.octa 0xffffffffffffffffffffffffff0E0D0C
.octa 0xffffffffffffffffffffffff0F0E0D0C
/* n = 5..8: valid bytes start at source byte 8 */
.octa 0xffffffffffffffffffffff0C0B0A0908
.octa 0xffffffffffffffffffff0D0C0B0A0908
.octa 0xffffffffffffffffff0E0D0C0B0A0908
.octa 0xffffffffffffffff0F0E0D0C0B0A0908
/* n = 9..12: valid bytes start at source byte 4 */
.octa 0xffffffffffffff0C0B0A090807060504
.octa 0xffffffffffff0D0C0B0A090807060504
.octa 0xffffffffff0E0D0C0B0A090807060504
.octa 0xffffffff0F0E0D0C0B0A090807060504
/* n = 13..16: valid bytes start at source byte 0 (n = 16 is the identity) */
.octa 0xffffff0C0B0A09080706050403020100
.octa 0xffff0D0C0B0A09080706050403020100
.octa 0xff0E0D0C0B0A09080706050403020100
.octa 0x0F0E0D0C0B0A09080706050403020100
.text .text
...@@ -252,31 +275,65 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation ...@@ -252,31 +275,65 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
mov arg8, %r12 # %r12 = aadLen mov arg8, %r12 # %r12 = aadLen
mov %r12, %r11 mov %r12, %r11
pxor %xmm\i, %xmm\i pxor %xmm\i, %xmm\i
pxor \XMM2, \XMM2
_get_AAD_loop\num_initial_blocks\operation: cmp $16, %r11
movd (%r10), \TMP1 jl _get_AAD_rest8\num_initial_blocks\operation
pslldq $12, \TMP1 _get_AAD_blocks\num_initial_blocks\operation:
psrldq $4, %xmm\i movdqu (%r10), %xmm\i
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
pxor %xmm\i, \XMM2
GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
add $16, %r10
sub $16, %r12
sub $16, %r11
cmp $16, %r11
jge _get_AAD_blocks\num_initial_blocks\operation
movdqu \XMM2, %xmm\i
cmp $0, %r11
je _get_AAD_done\num_initial_blocks\operation
pxor %xmm\i,%xmm\i
/* read the last <16B of AAD. since we have at least 4B of
data right after the AAD (the ICV, and maybe some CT), we can
read 4B/8B blocks safely, and then get rid of the extra stuff */
_get_AAD_rest8\num_initial_blocks\operation:
cmp $4, %r11
jle _get_AAD_rest4\num_initial_blocks\operation
movq (%r10), \TMP1
add $8, %r10
sub $8, %r11
pslldq $8, \TMP1
psrldq $8, %xmm\i
pxor \TMP1, %xmm\i pxor \TMP1, %xmm\i
jmp _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_rest4\num_initial_blocks\operation:
cmp $0, %r11
jle _get_AAD_rest0\num_initial_blocks\operation
mov (%r10), %eax
movq %rax, \TMP1
add $4, %r10 add $4, %r10
sub $4, %r12 sub $4, %r10
jne _get_AAD_loop\num_initial_blocks\operation pslldq $12, \TMP1
cmp $16, %r11
je _get_AAD_loop2_done\num_initial_blocks\operation
mov $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
psrldq $4, %xmm\i psrldq $4, %xmm\i
sub $4, %r12 pxor \TMP1, %xmm\i
cmp %r11, %r12 _get_AAD_rest0\num_initial_blocks\operation:
jne _get_AAD_loop2\num_initial_blocks\operation /* finalize: shift out the extra bytes we read, and align
left. since pslldq can only shift by an immediate, we use
_get_AAD_loop2_done\num_initial_blocks\operation: pshufb and an array of shuffle masks */
movq %r12, %r11
salq $4, %r11
movdqu aad_shift_arr(%r11), \TMP1
PSHUFB_XMM \TMP1, %xmm\i
_get_AAD_rest_final\num_initial_blocks\operation:
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
pxor \XMM2, %xmm\i
GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
_get_AAD_done\num_initial_blocks\operation:
xor %r11, %r11 # initialise the data pointer offset as zero xor %r11, %r11 # initialise the data pointer offset as zero
# start AES for num_initial_blocks blocks # start AES for num_initial_blocks blocks
mov %arg5, %rax # %rax = *Y0 mov %arg5, %rax # %rax = *Y0
...@@ -322,7 +379,7 @@ aes_loop_initial_dec\num_initial_blocks: ...@@ -322,7 +379,7 @@ aes_loop_initial_dec\num_initial_blocks:
# prepare plaintext/ciphertext for GHASH computation # prepare plaintext/ciphertext for GHASH computation
.endr .endr
.endif .endif
GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
# apply GHASH on num_initial_blocks blocks # apply GHASH on num_initial_blocks blocks
.if \i == 5 .if \i == 5
...@@ -477,27 +534,65 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation ...@@ -477,27 +534,65 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
mov arg8, %r12 # %r12 = aadLen mov arg8, %r12 # %r12 = aadLen
mov %r12, %r11 mov %r12, %r11
pxor %xmm\i, %xmm\i pxor %xmm\i, %xmm\i
_get_AAD_loop\num_initial_blocks\operation: pxor \XMM2, \XMM2
movd (%r10), \TMP1
pslldq $12, \TMP1 cmp $16, %r11
psrldq $4, %xmm\i jl _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
movdqu (%r10), %xmm\i
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
pxor %xmm\i, \XMM2
GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
add $16, %r10
sub $16, %r12
sub $16, %r11
cmp $16, %r11
jge _get_AAD_blocks\num_initial_blocks\operation
movdqu \XMM2, %xmm\i
cmp $0, %r11
je _get_AAD_done\num_initial_blocks\operation
pxor %xmm\i,%xmm\i
/* read the last <16B of AAD. since we have at least 4B of
data right after the AAD (the ICV, and maybe some PT), we can
read 4B/8B blocks safely, and then get rid of the extra stuff */
_get_AAD_rest8\num_initial_blocks\operation:
cmp $4, %r11
jle _get_AAD_rest4\num_initial_blocks\operation
movq (%r10), \TMP1
add $8, %r10
sub $8, %r11
pslldq $8, \TMP1
psrldq $8, %xmm\i
pxor \TMP1, %xmm\i pxor \TMP1, %xmm\i
jmp _get_AAD_rest8\num_initial_blocks\operation
_get_AAD_rest4\num_initial_blocks\operation:
cmp $0, %r11
jle _get_AAD_rest0\num_initial_blocks\operation
mov (%r10), %eax
movq %rax, \TMP1
add $4, %r10 add $4, %r10
sub $4, %r12 sub $4, %r10
jne _get_AAD_loop\num_initial_blocks\operation pslldq $12, \TMP1
cmp $16, %r11
je _get_AAD_loop2_done\num_initial_blocks\operation
mov $16, %r12
_get_AAD_loop2\num_initial_blocks\operation:
psrldq $4, %xmm\i psrldq $4, %xmm\i
sub $4, %r12 pxor \TMP1, %xmm\i
cmp %r11, %r12 _get_AAD_rest0\num_initial_blocks\operation:
jne _get_AAD_loop2\num_initial_blocks\operation /* finalize: shift out the extra bytes we read, and align
_get_AAD_loop2_done\num_initial_blocks\operation: left. since pslldq can only shift by an immediate, we use
pshufb and an array of shuffle masks */
movq %r12, %r11
salq $4, %r11
movdqu aad_shift_arr(%r11), \TMP1
PSHUFB_XMM \TMP1, %xmm\i
_get_AAD_rest_final\num_initial_blocks\operation:
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
pxor \XMM2, %xmm\i
GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
_get_AAD_done\num_initial_blocks\operation:
xor %r11, %r11 # initialise the data pointer offset as zero xor %r11, %r11 # initialise the data pointer offset as zero
# start AES for num_initial_blocks blocks # start AES for num_initial_blocks blocks
mov %arg5, %rax # %rax = *Y0 mov %arg5, %rax # %rax = *Y0
...@@ -543,7 +638,7 @@ aes_loop_initial_enc\num_initial_blocks: ...@@ -543,7 +638,7 @@ aes_loop_initial_enc\num_initial_blocks:
# prepare plaintext/ciphertext for GHASH computation # prepare plaintext/ciphertext for GHASH computation
.endr .endr
.endif .endif
GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
# apply GHASH on num_initial_blocks blocks # apply GHASH on num_initial_blocks blocks
.if \i == 5 .if \i == 5
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment