Commit 5b3da651, authored by Ard Biesheuvel, committed by Herbert Xu

crypto: arm64/crct10dif-ce - yield NEON after every block of input

Avoid excessive scheduling delays under a preemptible kernel by
yielding the NEON after every block of input.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 4e530fba
...@@ -74,13 +74,19 @@ ...@@ -74,13 +74,19 @@
.text .text
.cpu generic+crypto .cpu generic+crypto
arg1_low32 .req w0 arg1_low32 .req w19
arg2 .req x1 arg2 .req x20
arg3 .req x2 arg3 .req x21
vzr .req v13 vzr .req v13
ENTRY(crc_t10dif_pmull) ENTRY(crc_t10dif_pmull)
frame_push 3, 128
mov arg1_low32, w0
mov arg2, x1
mov arg3, x2
movi vzr.16b, #0 // init zero register movi vzr.16b, #0 // init zero register
// adjust the 16-bit initial_crc value, scale it to 32 bits // adjust the 16-bit initial_crc value, scale it to 32 bits
...@@ -175,8 +181,25 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 ) ...@@ -175,8 +181,25 @@ CPU_LE( ext v12.16b, v12.16b, v12.16b, #8 )
subs arg3, arg3, #128 subs arg3, arg3, #128
// check if there is another 64B in the buffer to be able to fold // check if there is another 64B in the buffer to be able to fold
b.ge _fold_64_B_loop b.lt _fold_64_B_end
if_will_cond_yield_neon
stp q0, q1, [sp, #.Lframe_local_offset]
stp q2, q3, [sp, #.Lframe_local_offset + 32]
stp q4, q5, [sp, #.Lframe_local_offset + 64]
stp q6, q7, [sp, #.Lframe_local_offset + 96]
do_cond_yield_neon
ldp q0, q1, [sp, #.Lframe_local_offset]
ldp q2, q3, [sp, #.Lframe_local_offset + 32]
ldp q4, q5, [sp, #.Lframe_local_offset + 64]
ldp q6, q7, [sp, #.Lframe_local_offset + 96]
ldr_l q10, rk3, x8
movi vzr.16b, #0 // init zero register
endif_yield_neon
b _fold_64_B_loop
_fold_64_B_end:
// at this point, the buffer pointer is pointing at the last y Bytes // at this point, the buffer pointer is pointing at the last y Bytes
// of the buffer the 64B of folded data is in 4 of the vector // of the buffer the 64B of folded data is in 4 of the vector
// registers: v0, v1, v2, v3 // registers: v0, v1, v2, v3
...@@ -304,6 +327,7 @@ _barrett: ...@@ -304,6 +327,7 @@ _barrett:
_cleanup: _cleanup:
// scale the result back to 16 bits // scale the result back to 16 bits
lsr x0, x0, #16 lsr x0, x0, #16
frame_pop
ret ret
_less_than_128: _less_than_128:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment