crypto: arm64/aes-neon-blk - tweak performance for low end cores

The non-bitsliced AES implementation using the NEON is highly sensitive to micro-architectural details, and, as it turns out, the Cortex-A53 on the Raspberry Pi 3 is a core that can benefit from this code, given that its scalar AES performance is abysmal (32.9 cycles per byte). The new bitsliced AES code manages 19.8 cycles per byte on this core, but can only operate on 8 blocks at a time, which is not supported by all chaining modes. With a bit of tweaking, we can get the plain NEON code to run at 22.0 cycles per byte, making it useful for sequential modes like CBC encryption. (Like bitsliced NEON, the plain NEON implementation does not use any lookup tables, which makes it easy on the D-cache, and invulnerable to cache timing attacks) So tweak the plain NEON AES code to use tbl instructions rather than shl/sri pairs, and to avoid the need to reload permutation vectors or other constants from memory in every round. Also, improve the decryption performance by switching to 16x8 pmul instructions for the performing the multiplications in GF(2^8). To allow the ECB and CBC encrypt routines to be reused by the bitsliced NEON code in a subsequent patch, export them from the module. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

crypto: arm64/aes-neon-blk - tweak performance for low end cores
The non-bitsliced AES implementation using the NEON is highly sensitive to micro-architectural details, and, as it turns out, the Cortex-A53 on the Raspberry Pi 3 is a core that can benefit from this code, given that its scalar AES performance is abysmal (32.9 cycles per byte). The new bitsliced AES code manages 19.8 cycles per byte on this core, but can only operate on 8 blocks at a time, which is not supported by all chaining modes. With a bit of tweaking, we can get the plain NEON code to run at 22.0 cycles per byte, making it useful for sequential modes like CBC encryption. (Like bitsliced NEON, the plain NEON implementation does not use any lookup tables, which makes it easy on the D-cache, and invulnerable to cache timing attacks) So tweak the plain NEON AES code to use tbl instructions rather than shl/sri pairs, and to avoid the need to reload permutation vectors or other constants from memory in every round. Also, improve the decryption performance by switching to 16x8 pmul instructions for the performing the multiplications in GF(2^8). To allow the ECB and CBC encrypt routines to be reused by the bitsliced NEON code in a subsequent patch, export them from the module. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
4edd7d01 · Ard Biesheuvel · Herbert Xu · c458c4ad · 4edd7d01 · 4edd7d01
Commit 4edd7d01 authored Jan 28, 2017 by Ard Biesheuvel Committed by Herbert Xu Feb 03, 2017
Hide whitespace changes
Inline Side-by-side

Showing with 102 additions and 135 deletions

arch/arm64/crypto/aes-glue.c arch/arm64/crypto/aes-glue.c +2 -0

arch/arm64/crypto/aes-neon.S arch/arm64/crypto/aes-neon.S +100 -135

No files found.
--- a/arch/arm64/crypto/aes-glue.c
+++ b/arch/arm64/crypto/aes-glue.c
@@ -409,5 +409,7 @@ static int __init aes_init(void)
 module_cpu_feature_match(AES, aes_init);
 #else
 module_init(aes_init);
+EXPORT_SYMBOL(neon_aes_ecb_encrypt);
+EXPORT_SYMBOL(neon_aes_cbc_encrypt);
 #endif
 module_exit(aes_exit);
--- a/arch/arm64/crypto/aes-neon.S
+++ b/arch/arm64/crypto/aes-neon.S
 /*
 * linux/arch/arm64/crypto/aes-neon.S - AES cipher for ARMv8 NEON
 *
- * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
+ * Copyright (C) 2013 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
@@ -17,17 +17,25 @@
 	/* multiply by polynomial 'x' in GF(2^8) */
 	.macro		mul_by_x, out, in, temp, const
 	sshr		\temp, \in, #7
-	add		\out, \in, \in
+	shl		\out, \in, #1
 	and		\temp, \temp, \const
 	eor		\out, \out, \temp
 	.endm

+	/* multiply by polynomial 'x^2' in GF(2^8) */
+	.macro		mul_by_x2, out, in, temp, const
+	ushr		\temp, \in, #6
+	shl		\out, \in, #2
+	pmul		\temp, \temp, \const
+	eor		\out, \out, \temp
+	.endm
+
 	/* preload the entire Sbox */
 	.macro		prepare, sbox, shiftrows, temp
 	adr		\temp, \sbox
-	movi		v12.16b, #0x40
+	movi		v12.16b, #0x1b
 	ldr		q13, \shiftrows
-	movi		v14.16b, #0x1b
+	ldr		q14, .Lror32by8
 	ld1		{v16.16b-v19.16b}, [\temp], #64
 	ld1		{v20.16b-v23.16b}, [\temp], #64
 	ld1		{v24.16b-v27.16b}, [\temp], #64
@@ -50,37 +58,31 @@

 	/* apply SubBytes transformation using the the preloaded Sbox */
 	.macro		sub_bytes, in
-	sub		v9.16b, \in\().16b, v12.16b
+	sub		v9.16b, \in\().16b, v15.16b
 	tbl		\in\().16b, {v16.16b-v19.16b}, \in\().16b
-	sub		v10.16b, v9.16b, v12.16b
+	sub		v10.16b, v9.16b, v15.16b
 	tbx		\in\().16b, {v20.16b-v23.16b}, v9.16b
-	sub		v11.16b, v10.16b, v12.16b
+	sub		v11.16b, v10.16b, v15.16b
 	tbx		\in\().16b, {v24.16b-v27.16b}, v10.16b
 	tbx		\in\().16b, {v28.16b-v31.16b}, v11.16b
 	.endm

 	/* apply MixColumns transformation */
-	.macro		mix_columns, in
-	mul_by_x	v10.16b, \in\().16b, v9.16b, v14.16b
-	rev32		v8.8h, \in\().8h
-	eor		\in\().16b, v10.16b, \in\().16b
-	shl		v9.4s, v8.4s, #24
-	shl		v11.4s, \in\().4s, #24
-	sri		v9.4s, v8.4s, #8
-	sri		v11.4s, \in\().4s, #8
-	eor		v9.16b, v9.16b, v8.16b
-	eor		v10.16b, v10.16b, v9.16b
-	eor		\in\().16b, v10.16b, v11.16b
-	.endm
-
+	.macro		mix_columns, in, enc
+	.if		\enc == 0
 	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
-	.macro		inv_mix_columns, in
-	mul_by_x	v11.16b, \in\().16b, v10.16b, v14.16b
-	mul_by_x	v11.16b, v11.16b, v10.16b, v14.16b
-	eor		\in\().16b, \in\().16b, v11.16b
-	rev32		v11.8h, v11.8h
-	eor		\in\().16b, \in\().16b, v11.16b
-	mix_columns	\in
+	mul_by_x2	v8.16b, \in\().16b, v9.16b, v12.16b
+	eor		\in\().16b, \in\().16b, v8.16b
+	rev32		v8.8h, v8.8h
+	eor		\in\().16b, \in\().16b, v8.16b
+	.endif
+
+	mul_by_x	v9.16b, \in\().16b, v8.16b, v12.16b
+	rev32		v8.8h, \in\().8h
+	eor		v8.16b, v8.16b, v9.16b
+	eor		\in\().16b, \in\().16b, v8.16b
+	tbl		\in\().16b, {\in\().16b}, v14.16b
+	eor		\in\().16b, \in\().16b, v8.16b
 	.endm

 	.macro		do_block, enc, in, rounds, rk, rkp, i
@@ -88,16 +90,13 @@
 	add		\rkp, \rk, #16
 	mov		\i, \rounds
 1111:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
+	movi		v15.16b, #0x40
 	tbl		\in\().16b, {\in\().16b}, v13.16b	/* ShiftRows */
 	sub_bytes	\in
-	ld1		{v15.4s}, [\rkp], #16
 	subs		\i, \i, #1
+	ld1		{v15.4s}, [\rkp], #16
 	beq		2222f
-	.if		\enc == 1
-	mix_columns	\in
-	.else
-	inv_mix_columns	\in
-	.endif
+	mix_columns	\in, \enc
 	b		1111b
 2222:	eor		\in\().16b, \in\().16b, v15.16b		/* ^round key */
 	.endm
@@ -116,139 +115,114 @@
 	 */

 	.macro		sub_bytes_2x, in0, in1
-	sub		v8.16b, \in0\().16b, v12.16b
-	sub		v9.16b, \in1\().16b, v12.16b
+	sub		v8.16b, \in0\().16b, v15.16b
 	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
+	sub		v9.16b, \in1\().16b, v15.16b
 	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
-	sub		v10.16b, v8.16b, v12.16b
-	sub		v11.16b, v9.16b, v12.16b
+	sub		v10.16b, v8.16b, v15.16b
 	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
+	sub		v11.16b, v9.16b, v15.16b
 	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
-	sub		v8.16b, v10.16b, v12.16b
-	sub		v9.16b, v11.16b, v12.16b
+	sub		v8.16b, v10.16b, v15.16b
 	tbx		\in0\().16b, {v24.16b-v27.16b}, v10.16b
+	sub		v9.16b, v11.16b, v15.16b
 	tbx		\in1\().16b, {v24.16b-v27.16b}, v11.16b
 	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
 	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
 	.endm

 	.macro		sub_bytes_4x, in0, in1, in2, in3
-	sub		v8.16b, \in0\().16b, v12.16b
+	sub		v8.16b, \in0\().16b, v15.16b
 	tbl		\in0\().16b, {v16.16b-v19.16b}, \in0\().16b
-	sub		v9.16b, \in1\().16b, v12.16b
+	sub		v9.16b, \in1\().16b, v15.16b
 	tbl		\in1\().16b, {v16.16b-v19.16b}, \in1\().16b
-	sub		v10.16b, \in2\().16b, v12.16b
+	sub		v10.16b, \in2\().16b, v15.16b
 	tbl		\in2\().16b, {v16.16b-v19.16b}, \in2\().16b
-	sub		v11.16b, \in3\().16b, v12.16b
+	sub		v11.16b, \in3\().16b, v15.16b
 	tbl		\in3\().16b, {v16.16b-v19.16b}, \in3\().16b
 	tbx		\in0\().16b, {v20.16b-v23.16b}, v8.16b
 	tbx		\in1\().16b, {v20.16b-v23.16b}, v9.16b
-	sub		v8.16b, v8.16b, v12.16b
+	sub		v8.16b, v8.16b, v15.16b
 	tbx		\in2\().16b, {v20.16b-v23.16b}, v10.16b
-	sub		v9.16b, v9.16b, v12.16b
+	sub		v9.16b, v9.16b, v15.16b
 	tbx		\in3\().16b, {v20.16b-v23.16b}, v11.16b
-	sub		v10.16b, v10.16b, v12.16b
+	sub		v10.16b, v10.16b, v15.16b
 	tbx		\in0\().16b, {v24.16b-v27.16b}, v8.16b
-	sub		v11.16b, v11.16b, v12.16b
+	sub		v11.16b, v11.16b, v15.16b
 	tbx		\in1\().16b, {v24.16b-v27.16b}, v9.16b
-	sub		v8.16b, v8.16b, v12.16b
+	sub		v8.16b, v8.16b, v15.16b
 	tbx		\in2\().16b, {v24.16b-v27.16b}, v10.16b
-	sub		v9.16b, v9.16b, v12.16b
+	sub		v9.16b, v9.16b, v15.16b
 	tbx		\in3\().16b, {v24.16b-v27.16b}, v11.16b
-	sub		v10.16b, v10.16b, v12.16b
+	sub		v10.16b, v10.16b, v15.16b
 	tbx		\in0\().16b, {v28.16b-v31.16b}, v8.16b
-	sub		v11.16b, v11.16b, v12.16b
+	sub		v11.16b, v11.16b, v15.16b
 	tbx		\in1\().16b, {v28.16b-v31.16b}, v9.16b
 	tbx		\in2\().16b, {v28.16b-v31.16b}, v10.16b
 	tbx		\in3\().16b, {v28.16b-v31.16b}, v11.16b
 	.endm

 	.macro		mul_by_x_2x, out0, out1, in0, in1, tmp0, tmp1, const
-	sshr		\tmp0\().16b, \in0\().16b,  #7
-	add		\out0\().16b, \in0\().16b,  \in0\().16b
-	sshr		\tmp1\().16b, \in1\().16b,  #7
+	sshr		\tmp0\().16b, \in0\().16b, #7
+	shl		\out0\().16b, \in0\().16b, #1
+	sshr		\tmp1\().16b, \in1\().16b, #7
 	and		\tmp0\().16b, \tmp0\().16b, \const\().16b
-	add		\out1\().16b, \in1\().16b,  \in1\().16b
+	shl		\out1\().16b, \in1\().16b, #1
 	and		\tmp1\().16b, \tmp1\().16b, \const\().16b
 	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
 	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
 	.endm

-	.macro		mix_columns_2x, in0, in1
-	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
-	rev32		v10.8h, \in0\().8h
-	rev32		v11.8h, \in1\().8h
-	eor		\in0\().16b, v8.16b, \in0\().16b
-	eor		\in1\().16b, v9.16b, \in1\().16b
-	shl		v12.4s, v10.4s, #24
-	shl		v13.4s, v11.4s, #24
-	eor		v8.16b, v8.16b, v10.16b
-	sri		v12.4s, v10.4s, #8
-	shl		v10.4s, \in0\().4s, #24
-	eor		v9.16b, v9.16b, v11.16b
-	sri		v13.4s, v11.4s, #8
-	shl		v11.4s, \in1\().4s, #24
-	sri		v10.4s, \in0\().4s, #8
-	eor		\in0\().16b, v8.16b, v12.16b
-	sri		v11.4s, \in1\().4s, #8
-	eor		\in1\().16b, v9.16b, v13.16b
-	eor		\in0\().16b, v10.16b, \in0\().16b
-	eor		\in1\().16b, v11.16b, \in1\().16b
+	.macro		mul_by_x2_2x, out0, out1, in0, in1, tmp0, tmp1, const
+	ushr		\tmp0\().16b, \in0\().16b, #6
+	shl		\out0\().16b, \in0\().16b, #2
+	ushr		\tmp1\().16b, \in1\().16b, #6
+	pmul		\tmp0\().16b, \tmp0\().16b, \const\().16b
+	shl		\out1\().16b, \in1\().16b, #2
+	pmul		\tmp1\().16b, \tmp1\().16b, \const\().16b
+	eor		\out0\().16b, \out0\().16b, \tmp0\().16b
+	eor		\out1\().16b, \out1\().16b, \tmp1\().16b
 	.endm

-	.macro		inv_mix_cols_2x, in0, in1
-	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
-	mul_by_x_2x	v8, v9, v8, v9, v10, v11, v14
+	.macro		mix_columns_2x, in0, in1, enc
+	.if		\enc == 0
+	/* Inverse MixColumns: pre-multiply by { 5, 0, 4, 0 } */
+	mul_by_x2_2x	v8, v9, \in0, \in1, v10, v11, v12
 	eor		\in0\().16b, \in0\().16b, v8.16b
-	eor		\in1\().16b, \in1\().16b, v9.16b
 	rev32		v8.8h, v8.8h
-	rev32		v9.8h, v9.8h
-	eor		\in0\().16b, \in0\().16b, v8.16b
-	eor		\in1\().16b, \in1\().16b, v9.16b
-	mix_columns_2x	\in0, \in1
-	.endm
-
-	.macro		inv_mix_cols_4x, in0, in1, in2, in3
-	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v14
-	mul_by_x_2x	v10, v11, \in2, \in3, v12, v13, v14
-	mul_by_x_2x	v8, v9, v8, v9, v12, v13, v14
-	mul_by_x_2x	v10, v11, v10, v11, v12, v13, v14
-	eor		\in0\().16b, \in0\().16b, v8.16b
 	eor		\in1\().16b, \in1\().16b, v9.16b
-	eor		\in2\().16b, \in2\().16b, v10.16b
-	eor		\in3\().16b, \in3\().16b, v11.16b
-	rev32		v8.8h, v8.8h
 	rev32		v9.8h, v9.8h
-	rev32		v10.8h, v10.8h
-	rev32		v11.8h, v11.8h
 	eor		\in0\().16b, \in0\().16b, v8.16b
 	eor		\in1\().16b, \in1\().16b, v9.16b
-	eor		\in2\().16b, \in2\().16b, v10.16b
-	eor		\in3\().16b, \in3\().16b, v11.16b
-	mix_columns_2x	\in0, \in1
-	mix_columns_2x	\in2, \in3
+	.endif
+
+	mul_by_x_2x	v8, v9, \in0, \in1, v10, v11, v12
+	rev32		v10.8h, \in0\().8h
+	rev32		v11.8h, \in1\().8h
+	eor		v10.16b, v10.16b, v8.16b
+	eor		v11.16b, v11.16b, v9.16b
+	eor		\in0\().16b, \in0\().16b, v10.16b
+	eor		\in1\().16b, \in1\().16b, v11.16b
+	tbl		\in0\().16b, {\in0\().16b}, v14.16b
+	tbl		\in1\().16b, {\in1\().16b}, v14.16b
+	eor		\in0\().16b, \in0\().16b, v10.16b
+	eor		\in1\().16b, \in1\().16b, v11.16b
 	.endm

-	.macro		do_block_2x, enc, in0, in1 rounds, rk, rkp, i
+	.macro		do_block_2x, enc, in0, in1, rounds, rk, rkp, i
 	ld1		{v15.4s}, [\rk]
 	add		\rkp, \rk, #16
 	mov		\i, \rounds
 1111:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
-	sub_bytes_2x	\in0, \in1
+	movi		v15.16b, #0x40
 	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
-	ld1		{v15.4s}, [\rkp], #16
+	sub_bytes_2x	\in0, \in1
 	subs		\i, \i, #1
+	ld1		{v15.4s}, [\rkp], #16
 	beq		2222f
-	.if		\enc == 1
-	mix_columns_2x	\in0, \in1
-	ldr		q13, .LForward_ShiftRows
-	.else
-	inv_mix_cols_2x	\in0, \in1
-	ldr		q13, .LReverse_ShiftRows
-	.endif
-	movi		v12.16b, #0x40
+	mix_columns_2x	\in0, \in1, \enc
 	b		1111b
 2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
@@ -262,23 +236,17 @@
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
 	eor		\in2\().16b, \in2\().16b, v15.16b	/* ^round key */
 	eor		\in3\().16b, \in3\().16b, v15.16b	/* ^round key */
-	sub_bytes_4x	\in0, \in1, \in2, \in3
+	movi		v15.16b, #0x40
 	tbl		\in0\().16b, {\in0\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in1\().16b, {\in1\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in2\().16b, {\in2\().16b}, v13.16b	/* ShiftRows */
 	tbl		\in3\().16b, {\in3\().16b}, v13.16b	/* ShiftRows */
-	ld1		{v15.4s}, [\rkp], #16
+	sub_bytes_4x	\in0, \in1, \in2, \in3
 	subs		\i, \i, #1
+	ld1		{v15.4s}, [\rkp], #16
 	beq		2222f
-	.if		\enc == 1
-	mix_columns_2x	\in0, \in1
-	mix_columns_2x	\in2, \in3
-	ldr		q13, .LForward_ShiftRows
-	.else
-	inv_mix_cols_4x	\in0, \in1, \in2, \in3
-	ldr		q13, .LReverse_ShiftRows
-	.endif
-	movi		v12.16b, #0x40
+	mix_columns_2x	\in0, \in1, \enc
+	mix_columns_2x	\in2, \in3, \enc
 	b		1111b
 2222:	eor		\in0\().16b, \in0\().16b, v15.16b	/* ^round key */
 	eor		\in1\().16b, \in1\().16b, v15.16b	/* ^round key */
@@ -305,19 +273,7 @@
 #include "aes-modes.S"

 	.text
-	.align		4
-.LForward_ShiftRows:
-CPU_LE(	.byte		0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3	)
-CPU_LE(	.byte		0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb	)
-CPU_BE(	.byte		0xb, 0x6, 0x1, 0xc, 0x7, 0x2, 0xd, 0x8	)
-CPU_BE(	.byte		0x3, 0xe, 0x9, 0x4, 0xf, 0xa, 0x5, 0x0	)
-
-.LReverse_ShiftRows:
-CPU_LE(	.byte		0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb	)
-CPU_LE(	.byte		0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3	)
-CPU_BE(	.byte		0x3, 0x6, 0x9, 0xc, 0xf, 0x2, 0x5, 0x8	)
-CPU_BE(	.byte		0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0	)
-
+	.align		6
 .LForward_Sbox:
 	.byte		0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
 	.byte		0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
@@ -385,3 +341,12 @@ CPU_BE(	.byte		0xb, 0xe, 0x1, 0x4, 0x7, 0xa, 0xd, 0x0	)
 	.byte		0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
 	.byte		0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
 	.byte		0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+
+.LForward_ShiftRows:
+	.octa		0x0b06010c07020d08030e09040f0a0500
+
+.LReverse_ShiftRows:
+	.octa		0x0306090c0f0205080b0e0104070a0d00
+
+.Lror32by8:
+	.octa		0x0c0f0e0d080b0a090407060500030201