Commit 6c1b0da1 authored by Ard Biesheuvel, committed by Herbert Xu

crypto: arm64/crct10dif - preparatory refactor for 8x8 PMULL version

Reorganize the CRC-T10DIF asm routine so we can easily instantiate an
alternative version based on 8x8 polynomial multiplication in a
subsequent patch.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent 598b7d41
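
The gist of the change is easiest to see in the glue code at the bottom of the diff: the C driver no longer calls the asm symbol directly but goes through a function pointer bound once at module init, while the asm body becomes a crc_t10dif_pmull macro instantiated here as crc_t10dif_pmull_p64. Below is a minimal, compilable userspace sketch of that dispatch pattern; the crc_t10dif_pmull_p8 stub and the have_pmull64 flag are assumptions standing in for the 8x8 PMULL variant and CPU-feature check expected in the follow-up patch, not part of this commit.

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the two asm routines: the p64 version is what this commit
 * provides; the p8 version is hypothetical until the follow-up patch. */
static uint16_t crc_t10dif_pmull_p64(uint16_t crc, const uint8_t *buf, uint64_t len)
{
	(void)buf; (void)len;
	return crc;		/* stub: real code folds with 64x64 PMULL */
}

static uint16_t crc_t10dif_pmull_p8(uint16_t crc, const uint8_t *buf, uint64_t len)
{
	(void)buf; (void)len;
	return crc;		/* stub: would fold with 8x8 PMULL */
}

/* Single indirection point, mirroring the static crc_t10dif_pmull
 * function pointer the patch adds to the glue code. */
static uint16_t (*crc_t10dif_pmull)(uint16_t, const uint8_t *, uint64_t);

int main(void)
{
	int have_pmull64 = 1;	/* placeholder for a CPU-feature check */

	/* Bind the implementation once at init time, as
	 * crc_t10dif_mod_init() now does for the p64 variant. */
	crc_t10dif_pmull = have_pmull64 ? crc_t10dif_pmull_p64
					: crc_t10dif_pmull_p8;

	uint8_t data[4] = { 0x31, 0x32, 0x33, 0x34 };
	printf("crc = %#x\n", crc_t10dif_pmull(0, data, sizeof(data)));
	return 0;
}
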
--- a/arch/arm64/crypto/crct10dif-ce-core.S
+++ b/arch/arm64/crypto/crct10dif-ce-core.S
@@ -80,7 +80,46 @@
 
 	vzr		.req	v13
 
-ENTRY(crc_t10dif_pmull)
+	.macro		fold64, p, reg1, reg2
+	ldp		q11, q12, [arg2], #0x20
+
+	__pmull_\p	v8, \reg1, v10, 2
+	__pmull_\p	\reg1, \reg1, v10
+
+CPU_LE(	rev64		v11.16b, v11.16b		)
+CPU_LE(	rev64		v12.16b, v12.16b		)
+
+	__pmull_\p	v9, \reg2, v10, 2
+	__pmull_\p	\reg2, \reg2, v10
+
+CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
+CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
+
+	eor		\reg1\().16b, \reg1\().16b, v8.16b
+	eor		\reg2\().16b, \reg2\().16b, v9.16b
+	eor		\reg1\().16b, \reg1\().16b, v11.16b
+	eor		\reg2\().16b, \reg2\().16b, v12.16b
+	.endm
+
+	.macro		fold16, p, reg, rk
+	__pmull_\p	v8, \reg, v10
+	__pmull_\p	\reg, \reg, v10, 2
+	.ifnb		\rk
+	ldr_l		q10, \rk, x8
+	.endif
+	eor		v7.16b, v7.16b, v8.16b
+	eor		v7.16b, v7.16b, \reg\().16b
+	.endm
+
+	.macro		__pmull_p64, rd, rn, rm, n
+	.ifb		\n
+	pmull		\rd\().1q, \rn\().1d, \rm\().1d
+	.else
+	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
+	.endif
+	.endm
+
+	.macro		crc_t10dif_pmull, p
 	frame_push	3, 128
 
 	mov		arg1_low32, w0
@@ -96,7 +135,7 @@ ENTRY(crc_t10dif_pmull)
 	cmp		arg3, #256
 
 	// for sizes less than 128, we can't fold 64B at a time...
-	b.lt		_less_than_128
+	b.lt		.L_less_than_128_\@
 
 	// load the initial crc value
 	// crc value does not need to be byte-reflected, but it needs
@@ -147,41 +186,19 @@ CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
 	// buffer. The _fold_64_B_loop will fold 64B at a time
 	// until we have 64+y Bytes of buffer
 
 	// fold 64B at a time. This section of the code folds 4 vector
 	// registers in parallel
-_fold_64_B_loop:
-
-	.macro		fold64, reg1, reg2
-	ldp		q11, q12, [arg2], #0x20
-
-	pmull2		v8.1q, \reg1\().2d, v10.2d
-	pmull		\reg1\().1q, \reg1\().1d, v10.1d
-
-CPU_LE(	rev64		v11.16b, v11.16b		)
-CPU_LE(	rev64		v12.16b, v12.16b		)
-
-	pmull2		v9.1q, \reg2\().2d, v10.2d
-	pmull		\reg2\().1q, \reg2\().1d, v10.1d
-
-CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
-CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
-
-	eor		\reg1\().16b, \reg1\().16b, v8.16b
-	eor		\reg2\().16b, \reg2\().16b, v9.16b
-	eor		\reg1\().16b, \reg1\().16b, v11.16b
-	eor		\reg2\().16b, \reg2\().16b, v12.16b
-	.endm
+.L_fold_64_B_loop_\@:
 
-	fold64		v0, v1
-	fold64		v2, v3
-	fold64		v4, v5
-	fold64		v6, v7
+	fold64		\p, v0, v1
+	fold64		\p, v2, v3
+	fold64		\p, v4, v5
+	fold64		\p, v6, v7
 
 	subs		arg3, arg3, #128
 
 	// check if there is another 64B in the buffer to be able to fold
-	b.lt		_fold_64_B_end
+	b.lt		.L_fold_64_B_end_\@
 
 	if_will_cond_yield_neon
 	stp		q0, q1, [sp, #.Lframe_local_offset]
@@ -197,9 +214,9 @@ CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)
 	movi		vzr.16b, #0		// init zero register
 	endif_yield_neon
 
-	b		_fold_64_B_loop
+	b		.L_fold_64_B_loop_\@
 
-_fold_64_B_end:
+.L_fold_64_B_end_\@:
 	// at this point, the buffer pointer is pointing at the last y Bytes
 	// of the buffer the 64B of folded data is in 4 of the vector
 	// registers: v0, v1, v2, v3
@@ -209,37 +226,27 @@ _fold_64_B_end:
 
 	ldr_l		q10, rk9, x8
 
-	.macro		fold16, reg, rk
-	pmull		v8.1q, \reg\().1d, v10.1d
-	pmull2		\reg\().1q, \reg\().2d, v10.2d
-	.ifnb		\rk
-	ldr_l		q10, \rk, x8
-	.endif
-	eor		v7.16b, v7.16b, v8.16b
-	eor		v7.16b, v7.16b, \reg\().16b
-	.endm
-
-	fold16		v0, rk11
-	fold16		v1, rk13
-	fold16		v2, rk15
-	fold16		v3, rk17
-	fold16		v4, rk19
-	fold16		v5, rk1
-	fold16		v6
+	fold16		\p, v0, rk11
+	fold16		\p, v1, rk13
+	fold16		\p, v2, rk15
+	fold16		\p, v3, rk17
+	fold16		\p, v4, rk19
+	fold16		\p, v5, rk1
+	fold16		\p, v6
 
 	// instead of 64, we add 48 to the loop counter to save 1 instruction
 	// from the loop instead of a cmp instruction, we use the negative
 	// flag with the jl instruction
 	adds		arg3, arg3, #(128-16)
-	b.lt		_final_reduction_for_128
+	b.lt		.L_final_reduction_for_128_\@
 
 	// now we have 16+y bytes left to reduce. 16 Bytes is in register v7
 	// and the rest is in memory. We can fold 16 bytes at a time if y>=16
 	// continue folding 16B at a time
 
-_16B_reduction_loop:
-	pmull		v8.1q, v7.1d, v10.1d
-	pmull2		v7.1q, v7.2d, v10.2d
+.L_16B_reduction_loop_\@:
+	__pmull_\p	v8, v7, v10
+	__pmull_\p	v7, v7, v10, 2
 	eor		v7.16b, v7.16b, v8.16b
 
 	ldr		q0, [arg2], #16
@@ -251,22 +258,22 @@ CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
 	// instead of a cmp instruction, we utilize the flags with the
 	// jge instruction equivalent of: cmp arg3, 16-16
 	// check if there is any more 16B in the buffer to be able to fold
-	b.ge		_16B_reduction_loop
+	b.ge		.L_16B_reduction_loop_\@
 
 	// now we have 16+z bytes left to reduce, where 0<= z < 16.
 	// first, we reduce the data in the xmm7 register
-_final_reduction_for_128:
+.L_final_reduction_for_128_\@:
 	// check if any more data to fold. If not, compute the CRC of
 	// the final 128 bits
 	adds		arg3, arg3, #16
-	b.eq		_128_done
+	b.eq		.L_128_done_\@
 
 	// here we are getting data that is less than 16 bytes.
 	// since we know that there was data before the pointer, we can
 	// offset the input pointer before the actual point, to receive
 	// exactly 16 bytes. after that the registers need to be adjusted.
-_get_last_two_regs:
+.L_get_last_two_regs_\@:
 	add		arg2, arg2, arg3
 	ldr		q1, [arg2, #-16]
 CPU_LE(	rev64		v1.16b, v1.16b		)
@@ -291,47 +298,46 @@ CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
 	bsl		v0.16b, v2.16b, v1.16b
 
 	// fold 16 Bytes
-	pmull		v8.1q, v7.1d, v10.1d
-	pmull2		v7.1q, v7.2d, v10.2d
+	__pmull_\p	v8, v7, v10
+	__pmull_\p	v7, v7, v10, 2
 	eor		v7.16b, v7.16b, v8.16b
 	eor		v7.16b, v7.16b, v0.16b
 
-_128_done:
+.L_128_done_\@:
 	// compute crc of a 128-bit value
 	ldr_l		q10, rk5, x8		// rk5 and rk6 in xmm10
 
 	// 64b fold
 	ext		v0.16b, vzr.16b, v7.16b, #8
 	mov		v7.d[0], v7.d[1]
-	pmull		v7.1q, v7.1d, v10.1d
+	__pmull_\p	v7, v7, v10
 	eor		v7.16b, v7.16b, v0.16b
 
 	// 32b fold
 	ext		v0.16b, v7.16b, vzr.16b, #4
 	mov		v7.s[3], vzr.s[0]
-	pmull2		v0.1q, v0.2d, v10.2d
+	__pmull_\p	v0, v0, v10, 2
 	eor		v7.16b, v7.16b, v0.16b
 
 	// barrett reduction
-_barrett:
 	ldr_l		q10, rk7, x8
 	mov		v0.d[0], v7.d[1]
 
-	pmull		v0.1q, v0.1d, v10.1d
+	__pmull_\p	v0, v0, v10
 	ext		v0.16b, vzr.16b, v0.16b, #12
-	pmull2		v0.1q, v0.2d, v10.2d
+	__pmull_\p	v0, v0, v10, 2
 	ext		v0.16b, vzr.16b, v0.16b, #12
 	eor		v7.16b, v7.16b, v0.16b
 	mov		w0, v7.s[1]
 
-_cleanup:
+.L_cleanup_\@:
 	// scale the result back to 16 bits
 	lsr		x0, x0, #16
 	frame_pop
 	ret
 
-_less_than_128:
-	cbz		arg3, _cleanup
+.L_less_than_128_\@:
+	cbz		arg3, .L_cleanup_\@
 
 	movi		v0.16b, #0
 	mov		v0.s[3], arg1_low32	// get the initial crc value
@@ -342,20 +348,20 @@ CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
 	eor		v7.16b, v7.16b, v0.16b	// xor the initial crc value
 
 	cmp		arg3, #16
-	b.eq		_128_done		// exactly 16 left
-	b.lt		_less_than_16_left
+	b.eq		.L_128_done_\@		// exactly 16 left
+	b.lt		.L_less_than_16_left_\@
 
 	ldr_l		q10, rk1, x8		// rk1 and rk2 in xmm10
 
 	// update the counter. subtract 32 instead of 16 to save one
 	// instruction from the loop
 	subs		arg3, arg3, #32
-	b.ge		_16B_reduction_loop
+	b.ge		.L_16B_reduction_loop_\@
 
 	add		arg3, arg3, #16
-	b		_get_last_two_regs
+	b		.L_get_last_two_regs_\@
 
-_less_than_16_left:
+.L_less_than_16_left_\@:
 	// shl r9, 4
 	adr_l		x0, tbl_shf_table + 16
 	sub		x0, x0, arg3
@@ -363,8 +369,12 @@ _less_than_16_left:
 	movi		v9.16b, #0x80
 	eor		v0.16b, v0.16b, v9.16b
 	tbl		v7.16b, {v7.16b}, v0.16b
-	b		_128_done
-ENDPROC(crc_t10dif_pmull)
+	b		.L_128_done_\@
+	.endm
+
+ENTRY(crc_t10dif_pmull_p64)
+	crc_t10dif_pmull	p64
+ENDPROC(crc_t10dif_pmull_p64)
 
 	// precomputed constants
 	// these constants are precomputed from the poly:
--- a/arch/arm64/crypto/crct10dif-ce-glue.c
+++ b/arch/arm64/crypto/crct10dif-ce-glue.c
@@ -22,7 +22,9 @@
 
 #define CRC_T10DIF_PMULL_CHUNK_SIZE	16U
 
-asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 buf[], u64 len);
+asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 buf[], u64 len);
+
+static u16 (*crc_t10dif_pmull)(u16 init_crc, const u8 buf[], u64 len);
 
 static int crct10dif_init(struct shash_desc *desc)
 {
@@ -85,6 +87,8 @@ static struct shash_alg crc_t10dif_alg = {
 
 static int __init crc_t10dif_mod_init(void)
 {
+	crc_t10dif_pmull = crc_t10dif_pmull_p64;
+
 	return crypto_register_shash(&crc_t10dif_alg);
 }