Commit 4e530fba authored by Ard Biesheuvel, committed by Herbert Xu

crypto: arm64/crc32-ce - yield NEON after every block of input

Avoid excessive scheduling delays under a preemptible kernel by
yielding the NEON after every block of input.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 7c50136a
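The frame_push/frame_pop pairing and the if_will_cond_yield_neon / do_cond_yield_neon / endif_yield_neon macros used in the diff below roughly follow the generic arm64 cond-yield pattern. A minimal sketch of that pattern is shown here; the function name, loop counter and spilled registers are illustrative only, not taken from the patched file:

ENTRY(example_neon_func)		// hypothetical function, for illustration only
	frame_push	4, 64		// save fp/lr plus x19-x22, reserve 64 bytes of local stack
0:	/* ... process one block of input with the NEON unit ... */
	subs		x20, x20, #1	// x20: remaining blocks (illustrative)
	b.eq		1f

	if_will_cond_yield_neon		// only taken when a reschedule is pending
	stp		q1, q2, [sp, #.Lframe_local_offset]	// spill live NEON state
	do_cond_yield_neon		// release the NEON unit, yield, then reacquire it
	ldp		q1, q2, [sp, #.Lframe_local_offset]	// restore the spilled state
	endif_yield_neon
	b		0b

1:	frame_pop
	ret
ENDPROC(example_neon_func)

Any NEON register that is not spilled around do_cond_yield_neon has to be treated as clobbered, which is why the hunk below also reloads qCONSTANT and reinitialises vzr before branching back to loop_64.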
@@ -100,9 +100,10 @@
 	dCONSTANT	.req	d0
 	qCONSTANT	.req	q0
 
-	BUF		.req	x0
-	LEN		.req	x1
-	CRC		.req	x2
+	BUF		.req	x19
+	LEN		.req	x20
+	CRC		.req	x21
+	CONST		.req	x22
 
 	vzr		.req	v9
 
@@ -123,7 +124,14 @@ ENTRY(crc32_pmull_le)
 ENTRY(crc32c_pmull_le)
 	adr_l		x3, .Lcrc32c_constants
 
-0:	bic		LEN, LEN, #15
+0:	frame_push	4, 64
+
+	mov		BUF, x0
+	mov		LEN, x1
+	mov		CRC, x2
+	mov		CONST, x3
+
+	bic		LEN, LEN, #15
 	ld1		{v1.16b-v4.16b}, [BUF], #0x40
 	movi		vzr.16b, #0
 	fmov		dCONSTANT, CRC
@@ -132,7 +140,7 @@ ENTRY(crc32c_pmull_le)
 	cmp		LEN, #0x40
 	b.lt		less_64
 
-	ldr		qCONSTANT, [x3]
+	ldr		qCONSTANT, [CONST]
 
 loop_64:		/* 64 bytes Full cache line folding */
 	sub		LEN, LEN, #0x40
@@ -162,10 +170,21 @@ loop_64:		/* 64 bytes Full cache line folding */
 	eor		v4.16b, v4.16b, v8.16b
 
 	cmp		LEN, #0x40
-	b.ge		loop_64
+	b.lt		less_64
+
+	if_will_cond_yield_neon
+	stp		q1, q2, [sp, #.Lframe_local_offset]
+	stp		q3, q4, [sp, #.Lframe_local_offset + 32]
+	do_cond_yield_neon
+	ldp		q1, q2, [sp, #.Lframe_local_offset]
+	ldp		q3, q4, [sp, #.Lframe_local_offset + 32]
+	ldr		qCONSTANT, [CONST]
+	movi		vzr.16b, #0
+	endif_yield_neon
+	b		loop_64
 
 less_64:		/* Folding cache line into 128bit */
-	ldr		qCONSTANT, [x3, #16]
+	ldr		qCONSTANT, [CONST, #16]
 
 	pmull2		v5.1q, v1.2d, vCONSTANT.2d
 	pmull		v1.1q, v1.1d, vCONSTANT.1d
@@ -204,8 +223,8 @@ fold_64:
 	eor		v1.16b, v1.16b, v2.16b
 
 	/* final 32-bit fold */
-	ldr		dCONSTANT, [x3, #32]
-	ldr		d3, [x3, #40]
+	ldr		dCONSTANT, [CONST, #32]
+	ldr		d3, [CONST, #40]
 
 	ext		v2.16b, v1.16b, vzr.16b, #4
 	and		v1.16b, v1.16b, v3.16b
@@ -213,7 +232,7 @@ fold_64:
 	eor		v1.16b, v1.16b, v2.16b
 
 	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
-	ldr		qCONSTANT, [x3, #48]
+	ldr		qCONSTANT, [CONST, #48]
 
 	and		v2.16b, v1.16b, v3.16b
 	ext		v2.16b, vzr.16b, v2.16b, #8
@@ -223,6 +242,7 @@ fold_64:
 	eor		v1.16b, v1.16b, v2.16b
 	mov		w0, v1.s[1]
 
+	frame_pop
 	ret
 ENDPROC(crc32_pmull_le)
 ENDPROC(crc32c_pmull_le)