crypto: x86/aes-xts - wire up VAES + AVX10/512 implementation

Add an AES-XTS implementation "xts-aes-vaes-avx10_512" for x86_64 CPUs with the VAES, VPCLMULQDQ, and either AVX10/512 or AVX512BW + AVX512VL extensions. This implementation uses zmm registers to operate on four AES blocks at a time. The assembly code is instantiated using a macro so that most of the source code is shared with other implementations. To avoid downclocking on older Intel CPU models, an exclusion list is used to prevent this 512-bit implementation from being used by default on some CPU models. They will use xts-aes-vaes-avx10_256 instead. For now, this exclusion list is simply coded into aesni-intel_glue.c. It may make sense to eventually move it into a more central location. xts-aes-vaes-avx10_512 is slightly faster than xts-aes-vaes-avx10_256 on some current CPUs. E.g., on AMD Zen 4, AES-256-XTS decryption throughput increases by 13% with 4096-byte inputs, or 14% with 512-byte inputs. On Intel Sapphire Rapids, AES-256-XTS decryption throughput increases by 2% with 4096-byte inputs, or 3% with 512-byte inputs. Future CPUs may provide stronger 512-bit support, in which case a larger benefit should be seen. Signed-off-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

crypto: x86/aes-xts - wire up VAES + AVX10/512 implementation
Add an AES-XTS implementation "xts-aes-vaes-avx10_512" for x86_64 CPUs with the VAES, VPCLMULQDQ, and either AVX10/512 or AVX512BW + AVX512VL extensions. This implementation uses zmm registers to operate on four AES blocks at a time. The assembly code is instantiated using a macro so that most of the source code is shared with other implementations. To avoid downclocking on older Intel CPU models, an exclusion list is used to prevent this 512-bit implementation from being used by default on some CPU models. They will use xts-aes-vaes-avx10_256 instead. For now, this exclusion list is simply coded into aesni-intel_glue.c. It may make sense to eventually move it into a more central location. xts-aes-vaes-avx10_512 is slightly faster than xts-aes-vaes-avx10_256 on some current CPUs. E.g., on AMD Zen 4, AES-256-XTS decryption throughput increases by 13% with 4096-byte inputs, or 14% with 512-byte inputs. On Intel Sapphire Rapids, AES-256-XTS decryption throughput increases by 2% with 4096-byte inputs, or 3% with 512-byte inputs. Future CPUs may provide stronger 512-bit support, in which case a larger benefit should be seen. Signed-off-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
aa2197f5 · Eric Biggers · Herbert Xu · ee63fea0 · aa2197f5 · aa2197f5
Commit aa2197f5 authored Mar 29, 2024 by Eric Biggers Committed by Herbert Xu Apr 05, 2024
Hide whitespace changes
Inline Side-by-side

Showing with 41 additions and 0 deletions

arch/x86/crypto/aes-xts-avx-x86_64.S arch/x86/crypto/aes-xts-avx-x86_64.S +9 -0

arch/x86/crypto/aesni-intel_glue.c arch/x86/crypto/aesni-intel_glue.c +32 -0

No files found.
--- a/arch/x86/crypto/aes-xts-avx-x86_64.S
+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S
@@ -826,4 +826,13 @@ SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256)
 SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256)
 	_aes_xts_crypt	0
 SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256)
+
+.set	VL, 64
+.set	USE_AVX10, 1
+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512)
+	_aes_xts_crypt	1
+SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512)
+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512)
+	_aes_xts_crypt	0
+SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512)
 #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -1298,8 +1298,29 @@ DEFINE_XTS_ALG(aesni_avx, "xts-aes-aesni-avx", 500);
 #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ)
 DEFINE_XTS_ALG(vaes_avx2, "xts-aes-vaes-avx2", 600);
 DEFINE_XTS_ALG(vaes_avx10_256, "xts-aes-vaes-avx10_256", 700);
+DEFINE_XTS_ALG(vaes_avx10_512, "xts-aes-vaes-avx10_512", 800);
 #endif

+/*
+ * This is a list of CPU models that are known to suffer from downclocking when
+ * zmm registers (512-bit vectors) are used.  On these CPUs, the AES-XTS
+ * implementation with zmm registers won't be used by default.  An
+ * implementation with ymm registers (256-bit vectors) will be used instead.
+ */
+static const struct x86_cpu_id zmm_exclusion_list[] = {
+	{ .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_SKYLAKE_X },
+	{ .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_X },
+	{ .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_D },
+	{ .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE },
+	{ .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_L },
+	{ .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_NNPI },
+	{ .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_TIGERLAKE_L },
+	{ .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_TIGERLAKE },
+	/* Allow Rocket Lake and later, and Sapphire Rapids and later. */
+	/* Also allow AMD CPUs (starting with Zen 4, the first with AVX-512). */
+	{},
+};
+
 static int __init register_xts_algs(void)
 {
 	int err;
@@ -1333,6 +1354,14 @@ static int __init register_xts_algs(void)
 					     &aes_xts_simdalg_vaes_avx10_256);
 	if (err)
 		return err;
+
+	if (x86_match_cpu(zmm_exclusion_list))
+		aes_xts_alg_vaes_avx10_512.base.cra_priority = 1;
+
+	err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_512, 1,
+					     &aes_xts_simdalg_vaes_avx10_512);
+	if (err)
+		return err;
 #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */
 	return 0;
 }
@@ -1349,6 +1378,9 @@ static void unregister_xts_algs(void)
 	if (aes_xts_simdalg_vaes_avx10_256)
 		simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_256, 1,
 					  &aes_xts_simdalg_vaes_avx10_256);
+	if (aes_xts_simdalg_vaes_avx10_512)
+		simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_512, 1,
+					  &aes_xts_simdalg_vaes_avx10_512);
 #endif
 }
 #else /* CONFIG_X86_64 */