Commit 6a8ce1ef authored by Tim Chen's avatar Tim Chen Committed by Herbert Xu

crypto: crc32c - Optimize CRC32C calculation with PCLMULQDQ instruction

This patch adds the crc_pcl function that calculates CRC32C checksum using the
PCLMULQDQ instruction on processors that support this feature. This will
provide speedup over using CRC32 instruction only.
The usage of PCLMULQDQ necessitate the invocation of kernel_fpu_begin and
kernel_fpu_end and incur some overhead.  So the new crc_pcl function is only
invoked for buffer size of 512 bytes or more.  Larger sized
buffers will expect to see greater speedup.  This feature is best used coupled
with eager_fpu which reduces the kernel_fpu_begin/end overhead.  For
buffer size of 1K the speedup is around 1.6x and for buffer size greater than
4K, the speedup is around 3x compared to original implementation in crc32c-intel
module. Test was performed on Sandy Bridge based platform with constant frequency
set for cpu.

A white paper detailing the algorithm can be found here:
http://download.intel.com/design/intarch/papers/323405.pdfSigned-off-by: default avatarTim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: default avatarHerbert Xu <herbert@gondor.apana.org.au>
parent 35b80920
......@@ -48,3 +48,4 @@ aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o
crc32c-intel-y := crc32c-intel_glue.o
crc32c-intel-$(CONFIG_CRYPTO_CRC32C_X86_64) += crc32c-pcl-intel-asm_64.o
......@@ -32,6 +32,8 @@
#include <asm/cpufeature.h>
#include <asm/cpu_device_id.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#define CHKSUM_BLOCK_SIZE 1
#define CHKSUM_DIGEST_SIZE 4
......@@ -44,6 +46,31 @@
#define REX_PRE
#endif
#ifdef CONFIG_X86_64
/*
* use carryless multiply version of crc32c when buffer
* size is >= 512 (when eager fpu is enabled) or
* >= 1024 (when eager fpu is disabled) to account
* for fpu state save/restore overhead.
*/
#define CRC32C_PCL_BREAKEVEN_EAGERFPU 512
#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU 1024
asmlinkage unsigned int crc_pcl(const u8 *buffer, int len,
unsigned int crc_init);
static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU;
#if defined(X86_FEATURE_EAGER_FPU)
#define set_pcl_breakeven_point() \
do { \
if (!use_eager_fpu()) \
crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU; \
} while (0)
#else
#define set_pcl_breakeven_point() \
(crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU)
#endif
#endif /* CONFIG_X86_64 */
static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
{
while (length--) {
......@@ -154,6 +181,52 @@ static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
return 0;
}
#ifdef CONFIG_X86_64
static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data,
unsigned int len)
{
u32 *crcp = shash_desc_ctx(desc);
/*
* use faster PCL version if datasize is large enough to
* overcome kernel fpu state save/restore overhead
*/
if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) {
kernel_fpu_begin();
*crcp = crc_pcl(data, len, *crcp);
kernel_fpu_end();
} else
*crcp = crc32c_intel_le_hw(*crcp, data, len);
return 0;
}
static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len,
u8 *out)
{
if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) {
kernel_fpu_begin();
*(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp));
kernel_fpu_end();
} else
*(__le32 *)out =
~cpu_to_le32(crc32c_intel_le_hw(*crcp, data, len));
return 0;
}
static int crc32c_pcl_intel_finup(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return __crc32c_pcl_intel_finup(shash_desc_ctx(desc), data, len, out);
}
static int crc32c_pcl_intel_digest(struct shash_desc *desc, const u8 *data,
unsigned int len, u8 *out)
{
return __crc32c_pcl_intel_finup(crypto_shash_ctx(desc->tfm), data, len,
out);
}
#endif /* CONFIG_X86_64 */
static struct shash_alg alg = {
.setkey = crc32c_intel_setkey,
.init = crc32c_intel_init,
......@@ -184,6 +257,14 @@ static int __init crc32c_intel_mod_init(void)
{
if (!x86_match_cpu(crc32c_cpu_id))
return -ENODEV;
#ifdef CONFIG_X86_64
if (cpu_has_pclmulqdq) {
alg.update = crc32c_pcl_intel_update;
alg.finup = crc32c_pcl_intel_finup;
alg.digest = crc32c_pcl_intel_digest;
set_pcl_breakeven_point();
}
#endif
return crypto_register_shash(&alg);
}
......
This diff is collapsed.
......@@ -324,9 +324,19 @@ config CRYPTO_CRC32C
by iSCSI for header and data digests and by others.
See Castagnoli93. Module will be crc32c.
config CRYPTO_CRC32C_X86_64
bool
depends on X86 && 64BIT
select CRYPTO_HASH
help
In Intel processor with SSE4.2 supported, the processor will
support CRC32C calculation using hardware accelerated CRC32
instruction optimized with PCLMULQDQ instruction when available.
config CRYPTO_CRC32C_INTEL
tristate "CRC32c INTEL hardware acceleration"
depends on X86
select CRYPTO_CRC32C_X86_64 if 64BIT
select CRYPTO_HASH
help
In Intel processor with SSE4.2 supported, the processor will
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment