Commit 937c30d7 authored by Jussi Kivilinna's avatar Jussi Kivilinna Committed by Herbert Xu

crypto: serpent - add 8-way parallel x86_64/SSE2 assembler implementation

Patch adds x86_64/SSE2 assembler implementation of serpent cipher. Assembler
functions crypt data in eigth block chunks (two 4 block chunk SSE2 operations
in parallel to improve performance on out-of-order CPUs). Glue code is based
on one from AES-NI implementation, so requests from irq context are redirected
to cryptd.

v2:
 - add missing include of linux/module.h
   (appearently crypto.h used to include module.h, which changed for 3.2 by
    commit 7c926402)

Patch has been tested with tcrypt and automated filesystem tests.

Tcrypt benchmarks results (serpent-sse2/serpent_generic speed ratios):

AMD Phenom II 1055T (fam:16, model:10):

size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     1.03x   1.01x   1.03x   1.05x   1.00x   0.99x
64B     1.00x   1.01x   1.02x   1.04x   1.02x   1.01x
256B    2.34x   2.41x   0.99x   2.43x   2.39x   2.40x
1024B   2.51x   2.57x   1.00x   2.59x   2.56x   2.56x
8192B   2.50x   2.54x   1.00x   2.55x   2.57x   2.57x

Intel Celeron T1600 (fam:6, model:15, step:13):

size    ecb-enc ecb-dec cbc-enc cbc-dec ctr-enc ctr-dec
16B     0.97x   0.97x   1.01x   1.01x   1.01x   1.02x
64B     1.00x   1.00x   1.00x   1.02x   1.01x   1.01x
256B    3.41x   3.35x   1.00x   3.39x   3.42x   3.44x
1024B   3.75x   3.72x   0.99x   3.74x   3.75x   3.75x
8192B   3.70x   3.68x   0.99x   3.68x   3.69x   3.69x

Full output:
 http://koti.mbnet.fi/axh/kernel/crypto/phenom-ii-1055t/serpent-generic.txt
 http://koti.mbnet.fi/axh/kernel/crypto/phenom-ii-1055t/serpent-sse2.txt
 http://koti.mbnet.fi/axh/kernel/crypto/celeron-t1600/serpent-generic.txt
 http://koti.mbnet.fi/axh/kernel/crypto/celeron-t1600/serpent-sse2.txtSigned-off-by: default avatarJussi Kivilinna <jussi.kivilinna@mbnet.fi>
Signed-off-by: default avatarHerbert Xu <herbert@gondor.apana.org.au>
parent d19978f5
......@@ -11,6 +11,7 @@ obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o
obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
obj-$(CONFIG_CRYPTO_SERPENT_SSE2_X86_64) += serpent-sse2-x86_64.o
obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
......@@ -26,6 +27,7 @@ blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o
twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o
salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
serpent-sse2-x86_64-y := serpent-sse2-x86_64-asm_64.o serpent_sse2_glue.o
aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
......
This diff is collapsed.
This diff is collapsed.
#ifndef ASM_X86_SERPENT_H
#define ASM_X86_SERPENT_H
#include <linux/crypto.h>
#include <crypto/serpent.h>
#define SERPENT_PARALLEL_BLOCKS 8
asmlinkage void __serpent_enc_blk_8way(struct serpent_ctx *ctx, u8 *dst,
const u8 *src, bool xor);
asmlinkage void serpent_dec_blk_8way(struct serpent_ctx *ctx, u8 *dst,
const u8 *src);
static inline void serpent_enc_blk_xway(struct serpent_ctx *ctx, u8 *dst,
const u8 *src)
{
__serpent_enc_blk_8way(ctx, dst, src, false);
}
static inline void serpent_enc_blk_xway_xor(struct serpent_ctx *ctx, u8 *dst,
const u8 *src)
{
__serpent_enc_blk_8way(ctx, dst, src, true);
}
static inline void serpent_dec_blk_xway(struct serpent_ctx *ctx, u8 *dst,
const u8 *src)
{
serpent_dec_blk_8way(ctx, dst, src);
}
#endif
......@@ -766,6 +766,23 @@ config CRYPTO_SERPENT
See also:
<http://www.cl.cam.ac.uk/~rja14/serpent.html>
config CRYPTO_SERPENT_SSE2_X86_64
tristate "Serpent cipher algorithm (x86_64/SSE2)"
depends on X86 && 64BIT
select CRYPTO_ALGAPI
select CRYPTO_SERPENT
help
Serpent cipher algorithm, by Anderson, Biham & Knudsen.
Keys are allowed to be from 0 to 256 bits in length, in steps
of 8 bits.
This module provides Serpent cipher algorithm that processes eigth
blocks parallel using SSE2 instruction set.
See also:
<http://www.cl.cam.ac.uk/~rja14/serpent.html>
config CRYPTO_TEA
tristate "TEA, XTEA and XETA cipher algorithms"
select CRYPTO_ALGAPI
......
......@@ -1534,6 +1534,21 @@ static int alg_test_null(const struct alg_test_desc *desc,
/* Please keep this list sorted by algorithm name. */
static const struct alg_test_desc alg_test_descs[] = {
{
.alg = "__cbc-serpent-sse2",
.test = alg_test_null,
.suite = {
.cipher = {
.enc = {
.vecs = NULL,
.count = 0
},
.dec = {
.vecs = NULL,
.count = 0
}
}
}
}, {
.alg = "__driver-cbc-aes-aesni",
.test = alg_test_null,
.suite = {
......@@ -1548,6 +1563,21 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}
}
}, {
.alg = "__driver-cbc-serpent-sse2",
.test = alg_test_null,
.suite = {
.cipher = {
.enc = {
.vecs = NULL,
.count = 0
},
.dec = {
.vecs = NULL,
.count = 0
}
}
}
}, {
.alg = "__driver-ecb-aes-aesni",
.test = alg_test_null,
......@@ -1563,6 +1593,21 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}
}
}, {
.alg = "__driver-ecb-serpent-sse2",
.test = alg_test_null,
.suite = {
.cipher = {
.enc = {
.vecs = NULL,
.count = 0
},
.dec = {
.vecs = NULL,
.count = 0
}
}
}
}, {
.alg = "__ghash-pclmulqdqni",
.test = alg_test_null,
......@@ -1745,6 +1790,21 @@ static const struct alg_test_desc alg_test_descs[] = {
}
}
}
}, {
.alg = "cryptd(__driver-ecb-serpent-sse2)",
.test = alg_test_null,
.suite = {
.cipher = {
.enc = {
.vecs = NULL,
.count = 0
},
.dec = {
.vecs = NULL,
.count = 0
}
}
}
}, {
.alg = "cryptd(__ghash-pclmulqdqni)",
.test = alg_test_null,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment