Commit 2e5d2f33 authored by Ard Biesheuvel, committed by Herbert Xu

crypto: arm64/aes-blk - improve XTS mask handling

The Crypto Extension instantiation of the aes-modes.S collection of
skciphers uses only 15 NEON registers for the round key array, whereas
the pure NEON flavor uses 16 NEON registers for the AES S-box.

This means we have a spare register available that we can use to hold
the XTS mask vector, removing the need to reload it at every iteration
of the inner loop.

Since the pure NEON version does not permit this optimization, tweak
the macros so we can factor out this functionality. Also, replace the
literal load with a short sequence to compose the mask vector.

On Cortex-A53, this results in a ~4% speedup.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent dd597fb3
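
Note on the tweak arithmetic: the next_tweak macro changed below multiplies the 128-bit XTS tweak by x in GF(2^128), using the { 1, 0x87 } constant held in the mask vector to fold the per-lane carries back in. The following is a minimal C sketch of that computation, for illustration only; the function name xts_next_tweak is hypothetical and this is not the kernel's code.

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative sketch (not kernel code): what the next_tweak macro computes.
     * The 128-bit XTS tweak is held as two little-endian 64-bit lanes and is
     * multiplied by x in GF(2^128); the constants 1 and 0x87 held in the
     * xtsmask vector fold the per-lane carries back in. */
    static void xts_next_tweak(uint64_t t[2])
    {
            uint64_t carry_lo = t[0] >> 63;              /* bit shifted out of the low lane  */
            uint64_t carry_hi = t[1] >> 63;              /* bit shifted out of the high lane */

            t[0] = (t[0] << 1) ^ (carry_hi ? 0x87 : 0);  /* reduce by x^7 + x^2 + x + 1 */
            t[1] = (t[1] << 1) ^ carry_lo;               /* propagate carry into the high lane */
    }

    int main(void)
    {
            uint64_t t[2] = { 0x8000000000000000ULL, 0x8000000000000000ULL };

            xts_next_tweak(t);
            /* prints 0000000000000001 0000000000000087 */
            printf("%016llx %016llx\n", (unsigned long long)t[1], (unsigned long long)t[0]);
            return 0;
    }

With the Crypto Extensions build, the mask stays resident in v16 for the whole loop (xts_reload_mask is empty there), which is what removes the per-iteration reload.
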
--- a/arch/arm64/crypto/aes-ce.S
+++ b/arch/arm64/crypto/aes-ce.S
@@ -17,6 +17,11 @@
 
 	.arch		armv8-a+crypto
 
+	xtsmask		.req	v16
+
+	.macro		xts_reload_mask, tmp
+	.endm
+
 	/* preload all round keys */
 	.macro		load_round_keys, rounds, rk
 	cmp		\rounds, #12
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -340,17 +340,19 @@ AES_ENDPROC(aes_ctr_encrypt)
 	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
 	 */
 
-	.macro		next_tweak, out, in, const, tmp
+	.macro		next_tweak, out, in, tmp
 	sshr		\tmp\().2d,  \in\().2d,   #63
-	and		\tmp\().16b, \tmp\().16b, \const\().16b
+	and		\tmp\().16b, \tmp\().16b, xtsmask.16b
 	add		\out\().2d,  \in\().2d,   \in\().2d
 	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
 	eor		\out\().16b, \out\().16b, \tmp\().16b
 	.endm
 
-.Lxts_mul_x:
-CPU_LE(	.quad		1, 0x87 )
-CPU_BE(	.quad		0x87, 1 )
+	.macro		xts_load_mask, tmp
+	movi		xtsmask.2s, #0x1
+	movi		\tmp\().2s, #0x87
+	uzp1		xtsmask.4s, xtsmask.4s, \tmp\().4s
+	.endm
 
 AES_ENTRY(aes_xts_encrypt)
 	stp		x29, x30, [sp, #-16]!
@@ -362,24 +364,24 @@ AES_ENTRY(aes_xts_encrypt)
 	enc_prepare	w3, x5, x8
 	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
 	enc_switch_key	w3, x2, x8
-	ldr		q7, .Lxts_mul_x
+	xts_load_mask	v8
 	b		.LxtsencNx
 
 .Lxtsencnotfirst:
 	enc_prepare	w3, x2, x8
 .LxtsencloopNx:
-	ldr		q7, .Lxts_mul_x
-	next_tweak	v4, v4, v7, v8
+	xts_reload_mask	v8
+	next_tweak	v4, v4, v8
 .LxtsencNx:
 	subs		w4, w4, #4
 	bmi		.Lxtsenc1x
 	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
-	next_tweak	v5, v4, v7, v8
+	next_tweak	v5, v4, v8
 	eor		v0.16b, v0.16b, v4.16b
-	next_tweak	v6, v5, v7, v8
+	next_tweak	v6, v5, v8
 	eor		v1.16b, v1.16b, v5.16b
 	eor		v2.16b, v2.16b, v6.16b
-	next_tweak	v7, v6, v7, v8
+	next_tweak	v7, v6, v8
 	eor		v3.16b, v3.16b, v7.16b
 	bl		aes_encrypt_block4x
 	eor		v3.16b, v3.16b, v7.16b
@@ -401,7 +403,7 @@ AES_ENTRY(aes_xts_encrypt)
 	st1		{v0.16b}, [x0], #16
 	subs		w4, w4, #1
 	beq		.Lxtsencout
-	next_tweak	v4, v4, v7, v8
+	next_tweak	v4, v4, v8
 	b		.Lxtsencloop
 .Lxtsencout:
 	st1		{v4.16b}, [x6]
@@ -420,24 +422,24 @@ AES_ENTRY(aes_xts_decrypt)
 	enc_prepare	w3, x5, x8
 	encrypt_block	v4, w3, x5, x8, w7		/* first tweak */
 	dec_prepare	w3, x2, x8
-	ldr		q7, .Lxts_mul_x
+	xts_load_mask	v8
 	b		.LxtsdecNx
 
 .Lxtsdecnotfirst:
 	dec_prepare	w3, x2, x8
 .LxtsdecloopNx:
-	ldr		q7, .Lxts_mul_x
-	next_tweak	v4, v4, v7, v8
+	xts_reload_mask	v8
+	next_tweak	v4, v4, v8
 .LxtsdecNx:
 	subs		w4, w4, #4
 	bmi		.Lxtsdec1x
 	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
-	next_tweak	v5, v4, v7, v8
+	next_tweak	v5, v4, v8
 	eor		v0.16b, v0.16b, v4.16b
-	next_tweak	v6, v5, v7, v8
+	next_tweak	v6, v5, v8
 	eor		v1.16b, v1.16b, v5.16b
 	eor		v2.16b, v2.16b, v6.16b
-	next_tweak	v7, v6, v7, v8
+	next_tweak	v7, v6, v8
 	eor		v3.16b, v3.16b, v7.16b
 	bl		aes_decrypt_block4x
 	eor		v3.16b, v3.16b, v7.16b
@@ -459,7 +461,7 @@ AES_ENTRY(aes_xts_decrypt)
 	st1		{v0.16b}, [x0], #16
 	subs		w4, w4, #1
 	beq		.Lxtsdecout
-	next_tweak	v4, v4, v7, v8
+	next_tweak	v4, v4, v8
 	b		.Lxtsdecloop
.Lxtsdecout:
 	st1		{v4.16b}, [x6]
--- a/arch/arm64/crypto/aes-neon.S
+++ b/arch/arm64/crypto/aes-neon.S
@@ -14,6 +14,12 @@
 #define AES_ENTRY(func)		ENTRY(neon_ ## func)
 #define AES_ENDPROC(func)	ENDPROC(neon_ ## func)
 
+	xtsmask		.req	v7
+
+	.macro		xts_reload_mask, tmp
+	xts_load_mask	\tmp
+	.endm
+
 	/* multiply by polynomial 'x' in GF(2^8) */
 	.macro		mul_by_x, out, in, temp, const
 	sshr		\temp, \in, #7
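
A side note on the literal-load replacement: the movi/movi/uzp1 sequence in xts_load_mask composes the same { 1, 0x87 } pair of quadwords that the removed .Lxts_mul_x literal supplied, and because it builds the value directly in register lanes rather than loading it from memory, no separate CPU_LE/CPU_BE variants are needed. Below is a small host-side C sketch of the lane shuffle, assuming little-endian lane order; it is an illustration, not kernel code.

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    /* Sketch of the lane arithmetic in xts_load_mask (host-side illustration,
     * assuming little-endian lane order; not kernel code). */
    int main(void)
    {
            /* movi xtsmask.2s, #0x1 and movi \tmp\().2s, #0x87 each write the
             * low 64 bits of a 128-bit register and clear the upper half. */
            uint32_t xtsmask[4] = { 0x01, 0x01, 0, 0 };
            uint32_t tmp[4]     = { 0x87, 0x87, 0, 0 };
            uint32_t uzp1[4];

            /* uzp1 .4s concatenates the even-numbered 32-bit lanes of its sources */
            uzp1[0] = xtsmask[0];
            uzp1[1] = xtsmask[2];
            uzp1[2] = tmp[0];
            uzp1[3] = tmp[2];

            uint64_t quads[2];
            memcpy(quads, uzp1, sizeof(quads));
            assert(quads[0] == 0x1 && quads[1] == 0x87);   /* same as .quad 1, 0x87 */
            return 0;
    }
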