Commit 5386e5d1 authored by Herbert Xu's avatar Herbert Xu

Revert "crypto: arm64/ARM: NEON accelerated ChaCha20"

This patch reverts the following commits:

8621caa0
80966672

I should not have applied them because they had already been
obsoleted by a subsequent patch series.  They also cause a build
failure because of the subsequent commit 9ae433bc.

Fixes: 9ae433bc ("crypto: chacha20 - convert generic and...")
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent fb91a661
......@@ -130,10 +130,4 @@ config CRYPTO_CRC32_ARM_CE
depends on KERNEL_MODE_NEON && CRC32
select CRYPTO_HASH
config CRYPTO_CHACHA20_NEON
tristate "NEON accelerated ChaCha20 symmetric cipher"
depends on KERNEL_MODE_NEON
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
endif
......@@ -8,7 +8,6 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
ce-obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
ce-obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
......@@ -41,7 +40,6 @@ aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
ghash-arm-ce-y := ghash-ce-core.o ghash-ce-glue.o
crct10dif-arm-ce-y := crct10dif-ce-core.o crct10dif-ce-glue.o
crc32-arm-ce-y:= crc32-ce-core.o crc32-ce-glue.o
chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
quiet_cmd_perl = PERL $@
cmd_perl = $(PERL) $(<) > $(@)
......
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
*
* Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Based on:
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <linux/linkage.h>
.text
.fpu neon
.align 5
ENTRY(chacha20_block_xor_neon)
// r0: Input state matrix, s
// r1: 1 data block output, o
// r2: 1 data block input, i
//
// This function encrypts one ChaCha20 block by loading the state matrix
// in four NEON registers. It performs matrix operation on four words in
// parallel, but requires shuffling to rearrange the words after each
// round.
//
// Register roles: q0-q3 hold the working rows x0..x3, q8-q11 keep a copy
// of the original rows s0..s3 for the final addition, q4 is the scratch
// register for the rotates (q4-q7 later receive the input block), r3 is
// the double-round counter and ip is an address temporary.
//
// x0..3 = s0..3
add ip, r0, #0x20 // ip = s + 32, the second half of the state
vld1.32 {q0-q1}, [r0]
vld1.32 {q2-q3}, [ip]
// keep the unmodified state in q8-q11 for the final "x + s" step
vmov q8, q0
vmov q9, q1
vmov q10, q2
vmov q11, q3
mov r3, #10 // number of passes through the double round below
.Ldoubleround:
// First half of the double round, on the rows as loaded.
// Each rotl32(v, n) is built from a vshl by n plus a vsri by 32 - n,
// with q4 holding the pre-rotate value.
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
vadd.i32 q0, q0, q1
veor q4, q3, q0
vshl.u32 q3, q4, #16
vsri.u32 q3, q4, #16
// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
vadd.i32 q2, q2, q3
veor q4, q1, q2
vshl.u32 q1, q4, #12
vsri.u32 q1, q4, #20
// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vadd.i32 q0, q0, q1
veor q4, q3, q0
vshl.u32 q3, q4, #8
vsri.u32 q3, q4, #24
// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vadd.i32 q2, q2, q3
veor q4, q1, q2
vshl.u32 q1, q4, #7
vsri.u32 q1, q4, #25
// Rotate rows 1-3 by 4, 8 and 12 bytes so the second half of the
// double round operates on the shuffled words.
// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
vext.8 q1, q1, q1, #4
// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
vext.8 q2, q2, q2, #8
// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
vext.8 q3, q3, q3, #12
// Second half of the double round: same operation sequence as above,
// applied to the shuffled rows.
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
vadd.i32 q0, q0, q1
veor q4, q3, q0
vshl.u32 q3, q4, #16
vsri.u32 q3, q4, #16
// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
vadd.i32 q2, q2, q3
veor q4, q1, q2
vshl.u32 q1, q4, #12
vsri.u32 q1, q4, #20
// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
vadd.i32 q0, q0, q1
veor q4, q3, q0
vshl.u32 q3, q4, #8
vsri.u32 q3, q4, #24
// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
vadd.i32 q2, q2, q3
veor q4, q1, q2
vshl.u32 q1, q4, #7
vsri.u32 q1, q4, #25
// Rotate rows 1-3 by the complementary byte counts (12, 8, 4) to put
// the words back in their original positions.
// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
vext.8 q1, q1, q1, #12
// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
vext.8 q2, q2, q2, #8
// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
vext.8 q3, q3, q3, #4
subs r3, r3, #1
bne .Ldoubleround
// Load the input block into q4-q7 (q4 is no longer needed as scratch).
add ip, r2, #0x20
vld1.8 {q4-q5}, [r2]
vld1.8 {q6-q7}, [ip]
// o0 = i0 ^ (x0 + s0)
vadd.i32 q0, q0, q8
veor q0, q0, q4
// o1 = i1 ^ (x1 + s1)
vadd.i32 q1, q1, q9
veor q1, q1, q5
// o2 = i2 ^ (x2 + s2)
vadd.i32 q2, q2, q10
veor q2, q2, q6
// o3 = i3 ^ (x3 + s3)
vadd.i32 q3, q3, q11
veor q3, q3, q7
// Write the result to the output buffer.
add ip, r1, #0x20
vst1.8 {q0-q1}, [r1]
vst1.8 {q2-q3}, [ip]
bx lr
ENDPROC(chacha20_block_xor_neon)
.align 5
ENTRY(chacha20_4block_xor_neon)
// Prologue: r4-r6 are used further down as ldmia destinations, and a
// 32-byte aligned scratch buffer is carved out below the stack pointer.
// The original sp is kept in ip and restored right before returning.
push {r4-r6, lr}
mov ip, sp // preserve the stack pointer
sub r3, sp, #0x20 // allocate a 32 byte buffer
bic r3, r3, #0x1f // aligned to 32 bytes
mov sp, r3
// r0: Input state matrix, s
// r1: 4 data blocks output, o
// r2: 4 data blocks input, i
//
// This function encrypts four consecutive ChaCha20 blocks by loading
// the state matrix in NEON registers four times. The algorithm performs
// each operation on the corresponding word of each state matrix, hence
// requires no word shuffling. For final XORing step we transpose the
// matrix by interleaving 32- and then 64-bit words, which allows us to
// do XOR in NEON registers.
//
// Register roles: q0-q15 hold state words x0..x15, one block per 32-bit
// lane. All sixteen q registers are therefore occupied, so x8/x9
// (q8-q9) are spilled to the stack buffer whenever q8-q9 are needed as
// scratch for the shift-based rotates. r3 is first an address
// temporary and then the double-round counter.
//
// x0..15[0-3] = s0..3[0..3]
add r3, r0, #0x20
vld1.32 {q0-q1}, [r0]
vld1.32 {q2-q3}, [r3]
adr r3, CTRINC
// Broadcast each state word into its own register, working from x15
// down to x0: the sources d0-d7 are q0-q3 themselves, so those are
// overwritten last. q11 briefly holds CTRINC before receiving x11.
vdup.32 q15, d7[1]
vdup.32 q14, d7[0]
vld1.32 {q11}, [r3, :128]
vdup.32 q13, d6[1]
vdup.32 q12, d6[0]
vadd.i32 q12, q12, q11 // x12 += counter values 0-3
vdup.32 q11, d5[1]
vdup.32 q10, d5[0]
vdup.32 q9, d4[1]
vdup.32 q8, d4[0]
vdup.32 q7, d3[1]
vdup.32 q6, d3[0]
vdup.32 q5, d2[1]
vdup.32 q4, d2[0]
vdup.32 q3, d1[1]
vdup.32 q2, d1[0]
vdup.32 q1, d0[1]
vdup.32 q0, d0[0]
mov r3, #10 // number of passes through the double round below
.Ldoubleround4:
// The rotate by 16 is done in place with vrev32.16, which swaps the two
// 16-bit halves of every 32-bit word and so needs no scratch register.
// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
vadd.i32 q0, q0, q4
vadd.i32 q1, q1, q5
vadd.i32 q2, q2, q6
vadd.i32 q3, q3, q7
veor q12, q12, q0
veor q13, q13, q1
veor q14, q14, q2
veor q15, q15, q3
vrev32.16 q12, q12
vrev32.16 q13, q13
vrev32.16 q14, q14
vrev32.16 q15, q15
// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
vadd.i32 q8, q8, q12
vadd.i32 q9, q9, q13
vadd.i32 q10, q10, q14
vadd.i32 q11, q11, q15
// Spill the updated x8/x9; q8-q9 now serve as scratch for the
// vshl/vsri rotate pairs until the matching reload.
vst1.32 {q8-q9}, [sp, :256]
veor q8, q4, q8
veor q9, q5, q9
vshl.u32 q4, q8, #12
vshl.u32 q5, q9, #12
vsri.u32 q4, q8, #20
vsri.u32 q5, q9, #20
veor q8, q6, q10
veor q9, q7, q11
vshl.u32 q6, q8, #12
vshl.u32 q7, q9, #12
vsri.u32 q6, q8, #20
vsri.u32 q7, q9, #20
// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
vadd.i32 q0, q0, q4
vadd.i32 q1, q1, q5
vadd.i32 q2, q2, q6
vadd.i32 q3, q3, q7
veor q8, q12, q0
veor q9, q13, q1
vshl.u32 q12, q8, #8
vshl.u32 q13, q9, #8
vsri.u32 q12, q8, #24
vsri.u32 q13, q9, #24
veor q8, q14, q2
veor q9, q15, q3
vshl.u32 q14, q8, #8
vshl.u32 q15, q9, #8
vsri.u32 q14, q8, #24
vsri.u32 q15, q9, #24
vld1.32 {q8-q9}, [sp, :256] // reload the spilled x8/x9
// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
vadd.i32 q8, q8, q12
vadd.i32 q9, q9, q13
vadd.i32 q10, q10, q14
vadd.i32 q11, q11, q15
vst1.32 {q8-q9}, [sp, :256] // spill x8/x9 again, q8-q9 become scratch
veor q8, q4, q8
veor q9, q5, q9
vshl.u32 q4, q8, #7
vshl.u32 q5, q9, #7
vsri.u32 q4, q8, #25
vsri.u32 q5, q9, #25
veor q8, q6, q10
veor q9, q7, q11
vshl.u32 q6, q8, #7
vshl.u32 q7, q9, #7
vsri.u32 q6, q8, #25
vsri.u32 q7, q9, #25
vld1.32 {q8-q9}, [sp, :256] // reload the spilled x8/x9
// Second half of the double round: the same operations, but with the
// word pairings rotated as listed in the comments below. This is done
// by picking different registers, so no data is moved.
// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
vadd.i32 q0, q0, q5
vadd.i32 q1, q1, q6
vadd.i32 q2, q2, q7
vadd.i32 q3, q3, q4
veor q15, q15, q0
veor q12, q12, q1
veor q13, q13, q2
veor q14, q14, q3
vrev32.16 q15, q15
vrev32.16 q12, q12
vrev32.16 q13, q13
vrev32.16 q14, q14
// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
vadd.i32 q10, q10, q15
vadd.i32 q11, q11, q12
vadd.i32 q8, q8, q13
vadd.i32 q9, q9, q14
vst1.32 {q8-q9}, [sp, :256] // spill x8/x9, q8-q9 become scratch
veor q8, q7, q8
veor q9, q4, q9
vshl.u32 q7, q8, #12
vshl.u32 q4, q9, #12
vsri.u32 q7, q8, #20
vsri.u32 q4, q9, #20
veor q8, q5, q10
veor q9, q6, q11
vshl.u32 q5, q8, #12
vshl.u32 q6, q9, #12
vsri.u32 q5, q8, #20
vsri.u32 q6, q9, #20
// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
vadd.i32 q0, q0, q5
vadd.i32 q1, q1, q6
vadd.i32 q2, q2, q7
vadd.i32 q3, q3, q4
veor q8, q15, q0
veor q9, q12, q1
vshl.u32 q15, q8, #8
vshl.u32 q12, q9, #8
vsri.u32 q15, q8, #24
vsri.u32 q12, q9, #24
veor q8, q13, q2
veor q9, q14, q3
vshl.u32 q13, q8, #8
vshl.u32 q14, q9, #8
vsri.u32 q13, q8, #24
vsri.u32 q14, q9, #24
vld1.32 {q8-q9}, [sp, :256] // reload the spilled x8/x9
// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
vadd.i32 q10, q10, q15
vadd.i32 q11, q11, q12
vadd.i32 q8, q8, q13
vadd.i32 q9, q9, q14
vst1.32 {q8-q9}, [sp, :256] // spill x8/x9, q8-q9 become scratch
veor q8, q7, q8
veor q9, q4, q9
vshl.u32 q7, q8, #7
vshl.u32 q4, q9, #7
vsri.u32 q7, q8, #25
vsri.u32 q4, q9, #25
veor q8, q5, q10
veor q9, q6, q11
vshl.u32 q5, q8, #7
vshl.u32 q6, q9, #7
vsri.u32 q5, q8, #25
vsri.u32 q6, q9, #25
// Loop control. When another pass follows, x8/x9 are reloaded from
// the stack first. On the last pass they stay spilled, because q8-q9
// are needed as temporaries by the finalisation code at 0: below.
subs r3, r3, #1
beq 0f
vld1.32 {q8-q9}, [sp, :256]
b .Ldoubleround4
// Finalisation: add the original state words back in. Each ldmia
// fetches four state words into r3-r6 and advances r0 by 16 bytes.
// x0[0-3] += s0[0]
// x1[0-3] += s0[1]
// x2[0-3] += s0[2]
// x3[0-3] += s0[3]
0: ldmia r0!, {r3-r6}
vdup.32 q8, r3
vdup.32 q9, r4
vadd.i32 q0, q0, q8
vadd.i32 q1, q1, q9
vdup.32 q8, r5
vdup.32 q9, r6
vadd.i32 q2, q2, q8
vadd.i32 q3, q3, q9
// x4[0-3] += s1[0]
// x5[0-3] += s1[1]
// x6[0-3] += s1[2]
// x7[0-3] += s1[3]
ldmia r0!, {r3-r6}
vdup.32 q8, r3
vdup.32 q9, r4
vadd.i32 q4, q4, q8
vadd.i32 q5, q5, q9
vdup.32 q8, r5
vdup.32 q9, r6
vadd.i32 q6, q6, q8
vadd.i32 q7, q7, q9
// interleave 32-bit words in state n, n+1
vzip.32 q0, q1
vzip.32 q2, q3
vzip.32 q4, q5
vzip.32 q6, q7
// interleave 64-bit words in state n, n+2
vswp d1, d4
vswp d3, d6
vswp d9, d12
vswp d11, d14
// xor with corresponding input, write to output
vld1.8 {q8-q9}, [r2]!
veor q8, q8, q0
veor q9, q9, q4
vst1.8 {q8-q9}, [r1]!
// q0 and q4 have been consumed by the store above, so they are free
// to act as temporaries while x8..x15 are finalised. First bring the
// spilled x8/x9 back into q8-q9.
vld1.32 {q8-q9}, [sp, :256]
// x8[0-3] += s2[0]
// x9[0-3] += s2[1]
// x10[0-3] += s2[2]
// x11[0-3] += s2[3]
ldmia r0!, {r3-r6}
vdup.32 q0, r3
vdup.32 q4, r4
vadd.i32 q8, q8, q0
vadd.i32 q9, q9, q4
vdup.32 q0, r5
vdup.32 q4, r6
vadd.i32 q10, q10, q0
vadd.i32 q11, q11, q4
// x12[0-3] += s3[0]
// x13[0-3] += s3[1]
// x14[0-3] += s3[2]
// x15[0-3] += s3[3]
ldmia r0!, {r3-r6}
vdup.32 q0, r3
vdup.32 q4, r4
adr r3, CTRINC // r3 was consumed by the vdup above; reuse it as an address
vadd.i32 q12, q12, q0
vld1.32 {q0}, [r3, :128]
vadd.i32 q13, q13, q4
vadd.i32 q12, q12, q0 // x12 += counter values 0-3
vdup.32 q0, r5
vdup.32 q4, r6
vadd.i32 q14, q14, q0
vadd.i32 q15, q15, q4
// interleave 32-bit words in state n, n+1
vzip.32 q8, q9
vzip.32 q10, q11
vzip.32 q12, q13
vzip.32 q14, q15
// interleave 64-bit words in state n, n+2
vswp d17, d20
vswp d19, d22
vswp d25, d28
vswp d27, d30
// Move q1's contents to q4 so that q0-q1 can be used as the load/store
// pair for every remaining 32-byte chunk. Each chunk below is loaded
// from the input, XORed with one register pair and stored to the
// output, with r2/r1 post-incremented except on the final chunk.
vmov q4, q1
vld1.8 {q0-q1}, [r2]!
veor q0, q0, q8
veor q1, q1, q12
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]!
veor q0, q0, q2
veor q1, q1, q6
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]!
veor q0, q0, q10
veor q1, q1, q14
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]!
veor q0, q0, q4
veor q1, q1, q5
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]!
veor q0, q0, q9
veor q1, q1, q13
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]!
veor q0, q0, q3
veor q1, q1, q7
vst1.8 {q0-q1}, [r1]!
vld1.8 {q0-q1}, [r2]
veor q0, q0, q11
veor q1, q1, q15
vst1.8 {q0-q1}, [r1]
// Epilogue: drop the aligned scratch buffer and return.
mov sp, ip
pop {r4-r6, pc}
ENDPROC(chacha20_4block_xor_neon)
// Per-lane counter offsets: added to state word 12 by
// chacha20_4block_xor_neon so that the four lanes use consecutive counter
// values. The loads of this table carry a :128 alignment hint.
// NOTE(review): .align 4 is presumably a 2^4 = 16 byte alignment with
// this assembler, which is what the :128 hint needs - confirm.
.align 4
CTRINC: .word 0, 1, 2, 3
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, ARM NEON functions
*
* Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Based on:
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <crypto/algapi.h>
#include <crypto/chacha20.h>
#include <linux/crypto.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
/*
 * Low-level block routines, declared asmlinkage and defined outside this
 * file (presumably in the accompanying NEON assembly source - confirm).
 * Judging by how chacha20_dosimd() advances its pointers, the first
 * handles one CHACHA20_BLOCK_SIZE block per call and the second four.
 */
asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
/*
 * Run the NEON ChaCha20 routines over @bytes bytes of @src, writing the
 * result to @dst.
 *
 * Data is consumed four blocks at a time while possible, then one block at
 * a time. A trailing partial block is bounced through a block-sized stack
 * buffer so that the assembly routine always works on a whole block.
 * state[12] is bumped by the number of whole blocks processed.
 */
static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
			    unsigned int bytes)
{
	const unsigned int quad = CHACHA20_BLOCK_SIZE * 4;
	u8 buf[CHACHA20_BLOCK_SIZE];

	/* Bulk path: four blocks per call. */
	for (; bytes >= quad; bytes -= quad) {
		chacha20_4block_xor_neon(state, dst, src);
		src += quad;
		dst += quad;
		state[12] += 4;
	}

	/* Remaining whole blocks, one per call. */
	for (; bytes >= CHACHA20_BLOCK_SIZE; bytes -= CHACHA20_BLOCK_SIZE) {
		chacha20_block_xor_neon(state, dst, src);
		src += CHACHA20_BLOCK_SIZE;
		dst += CHACHA20_BLOCK_SIZE;
		state[12]++;
	}

	if (!bytes)
		return;

	/* Partial tail: process in place in buf, copy out only @bytes. */
	memcpy(buf, src, bytes);
	chacha20_block_xor_neon(state, buf, buf);
	memcpy(dst, buf, bytes);
}
/*
 * blkcipher encrypt/decrypt handler (wired up for both in the algorithm
 * descriptor).
 *
 * Requests of at most one block, and calls made while may_use_simd()
 * reports false, are handed to crypto_chacha20_crypt(). Everything else
 * is walked in chunks and processed by chacha20_dosimd() inside a
 * kernel_neon_begin()/kernel_neon_end() section.
 *
 * Returns the status of the last walk operation.
 */
static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst,
			 struct scatterlist *src, unsigned int nbytes)
{
	u32 state[16];
	struct blkcipher_walk walk;
	int err;

	if (nbytes <= CHACHA20_BLOCK_SIZE || !may_use_simd())
		return crypto_chacha20_crypt(desc, dst, src, nbytes);

	blkcipher_walk_init(&walk, dst, src, nbytes);
	err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE);

	crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv);

	kernel_neon_begin();

	/* Whole blocks first; any sub-block remainder goes back to the walk. */
	for (;;) {
		unsigned int avail = walk.nbytes;
		unsigned int tail = avail % CHACHA20_BLOCK_SIZE;

		if (avail < CHACHA20_BLOCK_SIZE)
			break;

		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
				avail - tail);
		err = blkcipher_walk_done(desc, &walk, tail);
	}

	/* Whatever is left is shorter than one block. */
	if (walk.nbytes) {
		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
				walk.nbytes);
		err = blkcipher_walk_done(desc, &walk, 0);
	}

	kernel_neon_end();

	return err;
}
/*
 * Algorithm descriptor handed to crypto_register_alg() and
 * crypto_unregister_alg() by the module init/exit functions.
 */
static struct crypto_alg alg = {
.cra_name = "chacha20",
.cra_driver_name = "chacha20-neon",
/* NOTE(review): presumably chosen to outrank the generic chacha20 driver - confirm */
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
/* NOTE(review): a block size of 1 presumably marks stream-cipher semantics - confirm */
.cra_blocksize = 1,
.cra_type = &crypto_blkcipher_type,
.cra_ctxsize = sizeof(struct chacha20_ctx),
/* buffers are requested with u32 alignment */
.cra_alignmask = sizeof(u32) - 1,
.cra_module = THIS_MODULE,
.cra_u = {
.blkcipher = {
/* min == max: exactly one key size is accepted */
.min_keysize = CHACHA20_KEY_SIZE,
.max_keysize = CHACHA20_KEY_SIZE,
.ivsize = CHACHA20_IV_SIZE,
.geniv = "seqiv",
.setkey = crypto_chacha20_setkey,
/* the same handler serves both directions */
.encrypt = chacha20_simd,
.decrypt = chacha20_simd,
},
},
};
/*
 * Module entry point: register the algorithm, but only when the CPU
 * advertises NEON in elf_hwcap. Returns -ENODEV otherwise, or whatever
 * crypto_register_alg() reports.
 */
static int __init chacha20_simd_mod_init(void)
{
	return (elf_hwcap & HWCAP_NEON) ? crypto_register_alg(&alg) : -ENODEV;
}
/* Module exit point: unregister the algorithm registered at init time. */
static void __exit chacha20_simd_mod_fini(void)
{
crypto_unregister_alg(&alg);
}
/* Hook the init/exit handlers into the module loader. */
module_init(chacha20_simd_mod_init);
module_exit(chacha20_simd_mod_fini);
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
/* NOTE(review): presumably allows auto-loading on a request for "chacha20" - confirm */
MODULE_ALIAS_CRYPTO("chacha20");
......@@ -72,10 +72,4 @@ config CRYPTO_CRC32_ARM64
depends on ARM64
select CRYPTO_HASH
config CRYPTO_CHACHA20_NEON
tristate "NEON accelerated ChaCha20 symmetric cipher"
depends on KERNEL_MODE_NEON
select CRYPTO_BLKCIPHER
select CRYPTO_CHACHA20
endif
......@@ -41,9 +41,6 @@ sha256-arm64-y := sha256-glue.o sha256-core.o
obj-$(CONFIG_CRYPTO_SHA512_ARM64) += sha512-arm64.o
sha512-arm64-y := sha512-glue.o sha512-core.o
obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha20-neon.o
chacha20-neon-y := chacha20-neon-core.o chacha20-neon-glue.o
AFLAGS_aes-ce.o := -DINTERLEAVE=4
AFLAGS_aes-neon.o := -DINTERLEAVE=4
......
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
*
* Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Based on:
* ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <linux/linkage.h>
.text
.align 6
ENTRY(chacha20_block_xor_neon)
// x0: Input state matrix, s
// x1: 1 data block output, o
// x2: 1 data block input, i
//
// This function encrypts one ChaCha20 block by loading the state matrix
// in four NEON registers. It performs matrix operation on four words in
// parallel, but requires shuffling to rearrange the words after each
// round.
//
// Register roles: v0-v3 hold the working rows x0..x3, v8-v11 hold a
// second copy of the input state for the final addition, v4 is scratch
// for the shift-based rotates (v4-v7 later receive the input block) and
// x3 is the double-round counter.
//
// x0..3 = s0..3
ld1 {v0.4s-v3.4s}, [x0]
ld1 {v8.4s-v11.4s}, [x0] // second load of the same state, kept unmodified
mov x3, #10 // number of passes through the double round below
.Ldoubleround:
// First half of the double round, on the rows as loaded.
// The rotate by 16 uses rev32 on 16-bit elements (in place); the other
// rotates are a shl by n plus an sri by 32 - n, with v4 as scratch.
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
add v0.4s, v0.4s, v1.4s
eor v3.16b, v3.16b, v0.16b
rev32 v3.8h, v3.8h
// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
add v2.4s, v2.4s, v3.4s
eor v4.16b, v1.16b, v2.16b
shl v1.4s, v4.4s, #12
sri v1.4s, v4.4s, #20
// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
add v0.4s, v0.4s, v1.4s
eor v4.16b, v3.16b, v0.16b
shl v3.4s, v4.4s, #8
sri v3.4s, v4.4s, #24
// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
add v2.4s, v2.4s, v3.4s
eor v4.16b, v1.16b, v2.16b
shl v1.4s, v4.4s, #7
sri v1.4s, v4.4s, #25
// Rotate rows 1-3 by 4, 8 and 12 bytes so the second half of the
// double round operates on the shuffled words.
// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
ext v1.16b, v1.16b, v1.16b, #4
// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
ext v2.16b, v2.16b, v2.16b, #8
// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
ext v3.16b, v3.16b, v3.16b, #12
// Second half of the double round: same operation sequence as above,
// applied to the shuffled rows.
// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
add v0.4s, v0.4s, v1.4s
eor v3.16b, v3.16b, v0.16b
rev32 v3.8h, v3.8h
// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
add v2.4s, v2.4s, v3.4s
eor v4.16b, v1.16b, v2.16b
shl v1.4s, v4.4s, #12
sri v1.4s, v4.4s, #20
// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
add v0.4s, v0.4s, v1.4s
eor v4.16b, v3.16b, v0.16b
shl v3.4s, v4.4s, #8
sri v3.4s, v4.4s, #24
// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
add v2.4s, v2.4s, v3.4s
eor v4.16b, v1.16b, v2.16b
shl v1.4s, v4.4s, #7
sri v1.4s, v4.4s, #25
// Rotate rows 1-3 by the complementary byte counts (12, 8, 4) to put
// the words back in their original positions.
// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
ext v1.16b, v1.16b, v1.16b, #12
// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
ext v2.16b, v2.16b, v2.16b, #8
// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
ext v3.16b, v3.16b, v3.16b, #4
subs x3, x3, #1
b.ne .Ldoubleround
// Load the input block into v4-v7 (v4 is no longer needed as scratch).
ld1 {v4.16b-v7.16b}, [x2]
// o0 = i0 ^ (x0 + s0)
add v0.4s, v0.4s, v8.4s
eor v0.16b, v0.16b, v4.16b
// o1 = i1 ^ (x1 + s1)
add v1.4s, v1.4s, v9.4s
eor v1.16b, v1.16b, v5.16b
// o2 = i2 ^ (x2 + s2)
add v2.4s, v2.4s, v10.4s
eor v2.16b, v2.16b, v6.16b
// o3 = i3 ^ (x3 + s3)
add v3.4s, v3.4s, v11.4s
eor v3.16b, v3.16b, v7.16b
// Write the result to the output buffer.
st1 {v0.16b-v3.16b}, [x1]
ret
ENDPROC(chacha20_block_xor_neon)
.align 6
ENTRY(chacha20_4block_xor_neon)
// x0: Input state matrix, s
// x1: 4 data blocks output, o
// x2: 4 data blocks input, i
//
// This function encrypts four consecutive ChaCha20 blocks by loading
// the state matrix in NEON registers four times. The algorithm performs
// each operation on the corresponding word of each state matrix, hence
// requires no word shuffling. For final XORing step we transpose the
// matrix by interleaving 32- and then 64-bit words, which allows us to
// do XOR in NEON registers.
//
// Register roles: v0-v15 hold state words x0..x15, one block per 32-bit
// lane. v16 holds the CTRINC lane offsets until the input data is
// loaded. v17-v20 are scratch for the shift-based rotates, and v17-v24
// later receive the re-broadcast state for the final addition. x3 is
// first an address temporary and then the double-round counter. x4 is
// a cursor over the state for the initial loads. x0, x1 and x2 are
// post-incremented during finalisation.
//
adr x3, CTRINC
ld1 {v16.4s}, [x3]
// x0..15[0-3] = s0..3[0..3]
// Each ld4r reads four consecutive state words and broadcasts each one
// across all lanes of its own register.
mov x4, x0
ld4r { v0.4s- v3.4s}, [x4], #16
ld4r { v4.4s- v7.4s}, [x4], #16
ld4r { v8.4s-v11.4s}, [x4], #16
ld4r {v12.4s-v15.4s}, [x4]
// x12 += counter values 0-3
add v12.4s, v12.4s, v16.4s
mov x3, #10 // number of passes through the double round below
.Ldoubleround4:
// The rotate by 16 is done in place with rev32 on 16-bit elements; the
// other rotates are a shl by n plus an sri by 32 - n via v17-v20.
// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
add v0.4s, v0.4s, v4.4s
add v1.4s, v1.4s, v5.4s
add v2.4s, v2.4s, v6.4s
add v3.4s, v3.4s, v7.4s
eor v12.16b, v12.16b, v0.16b
eor v13.16b, v13.16b, v1.16b
eor v14.16b, v14.16b, v2.16b
eor v15.16b, v15.16b, v3.16b
rev32 v12.8h, v12.8h
rev32 v13.8h, v13.8h
rev32 v14.8h, v14.8h
rev32 v15.8h, v15.8h
// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
add v8.4s, v8.4s, v12.4s
add v9.4s, v9.4s, v13.4s
add v10.4s, v10.4s, v14.4s
add v11.4s, v11.4s, v15.4s
eor v17.16b, v4.16b, v8.16b
eor v18.16b, v5.16b, v9.16b
eor v19.16b, v6.16b, v10.16b
eor v20.16b, v7.16b, v11.16b
shl v4.4s, v17.4s, #12
shl v5.4s, v18.4s, #12
shl v6.4s, v19.4s, #12
shl v7.4s, v20.4s, #12
sri v4.4s, v17.4s, #20
sri v5.4s, v18.4s, #20
sri v6.4s, v19.4s, #20
sri v7.4s, v20.4s, #20
// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
add v0.4s, v0.4s, v4.4s
add v1.4s, v1.4s, v5.4s
add v2.4s, v2.4s, v6.4s
add v3.4s, v3.4s, v7.4s
eor v17.16b, v12.16b, v0.16b
eor v18.16b, v13.16b, v1.16b
eor v19.16b, v14.16b, v2.16b
eor v20.16b, v15.16b, v3.16b
shl v12.4s, v17.4s, #8
shl v13.4s, v18.4s, #8
shl v14.4s, v19.4s, #8
shl v15.4s, v20.4s, #8
sri v12.4s, v17.4s, #24
sri v13.4s, v18.4s, #24
sri v14.4s, v19.4s, #24
sri v15.4s, v20.4s, #24
// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
add v8.4s, v8.4s, v12.4s
add v9.4s, v9.4s, v13.4s
add v10.4s, v10.4s, v14.4s
add v11.4s, v11.4s, v15.4s
eor v17.16b, v4.16b, v8.16b
eor v18.16b, v5.16b, v9.16b
eor v19.16b, v6.16b, v10.16b
eor v20.16b, v7.16b, v11.16b
shl v4.4s, v17.4s, #7
shl v5.4s, v18.4s, #7
shl v6.4s, v19.4s, #7
shl v7.4s, v20.4s, #7
sri v4.4s, v17.4s, #25
sri v5.4s, v18.4s, #25
sri v6.4s, v19.4s, #25
sri v7.4s, v20.4s, #25
// Second half of the double round: the same operations, but with the
// word pairings rotated as listed in the comments below. This is done
// by picking different registers, so no data is moved.
// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
add v0.4s, v0.4s, v5.4s
add v1.4s, v1.4s, v6.4s
add v2.4s, v2.4s, v7.4s
add v3.4s, v3.4s, v4.4s
eor v15.16b, v15.16b, v0.16b
eor v12.16b, v12.16b, v1.16b
eor v13.16b, v13.16b, v2.16b
eor v14.16b, v14.16b, v3.16b
rev32 v15.8h, v15.8h
rev32 v12.8h, v12.8h
rev32 v13.8h, v13.8h
rev32 v14.8h, v14.8h
// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
add v10.4s, v10.4s, v15.4s
add v11.4s, v11.4s, v12.4s
add v8.4s, v8.4s, v13.4s
add v9.4s, v9.4s, v14.4s
eor v17.16b, v5.16b, v10.16b
eor v18.16b, v6.16b, v11.16b
eor v19.16b, v7.16b, v8.16b
eor v20.16b, v4.16b, v9.16b
shl v5.4s, v17.4s, #12
shl v6.4s, v18.4s, #12
shl v7.4s, v19.4s, #12
shl v4.4s, v20.4s, #12
sri v5.4s, v17.4s, #20
sri v6.4s, v18.4s, #20
sri v7.4s, v19.4s, #20
sri v4.4s, v20.4s, #20
// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
add v0.4s, v0.4s, v5.4s
add v1.4s, v1.4s, v6.4s
add v2.4s, v2.4s, v7.4s
add v3.4s, v3.4s, v4.4s
eor v17.16b, v15.16b, v0.16b
eor v18.16b, v12.16b, v1.16b
eor v19.16b, v13.16b, v2.16b
eor v20.16b, v14.16b, v3.16b
shl v15.4s, v17.4s, #8
shl v12.4s, v18.4s, #8
shl v13.4s, v19.4s, #8
shl v14.4s, v20.4s, #8
sri v15.4s, v17.4s, #24
sri v12.4s, v18.4s, #24
sri v13.4s, v19.4s, #24
sri v14.4s, v20.4s, #24
// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
add v10.4s, v10.4s, v15.4s
add v11.4s, v11.4s, v12.4s
add v8.4s, v8.4s, v13.4s
add v9.4s, v9.4s, v14.4s
eor v17.16b, v5.16b, v10.16b
eor v18.16b, v6.16b, v11.16b
eor v19.16b, v7.16b, v8.16b
eor v20.16b, v4.16b, v9.16b
shl v5.4s, v17.4s, #7
shl v6.4s, v18.4s, #7
shl v7.4s, v19.4s, #7
shl v4.4s, v20.4s, #7
sri v5.4s, v17.4s, #25
sri v6.4s, v18.4s, #25
sri v7.4s, v19.4s, #25
sri v4.4s, v20.4s, #25
subs x3, x3, #1
b.ne .Ldoubleround4
// Finalisation: re-broadcast the original state words with ld4r
// (post-incrementing x0) and add them to the working state.
// x0[0-3] += s0[0]
// x1[0-3] += s0[1]
// x2[0-3] += s0[2]
// x3[0-3] += s0[3]
ld4r {v17.4s-v20.4s}, [x0], #16
add v0.4s, v0.4s, v17.4s
add v1.4s, v1.4s, v18.4s
add v2.4s, v2.4s, v19.4s
add v3.4s, v3.4s, v20.4s
// x4[0-3] += s1[0]
// x5[0-3] += s1[1]
// x6[0-3] += s1[2]
// x7[0-3] += s1[3]
ld4r {v21.4s-v24.4s}, [x0], #16
add v4.4s, v4.4s, v21.4s
add v5.4s, v5.4s, v22.4s
add v6.4s, v6.4s, v23.4s
add v7.4s, v7.4s, v24.4s
// x8[0-3] += s2[0]
// x9[0-3] += s2[1]
// x10[0-3] += s2[2]
// x11[0-3] += s2[3]
ld4r {v17.4s-v20.4s}, [x0], #16
add v8.4s, v8.4s, v17.4s
add v9.4s, v9.4s, v18.4s
add v10.4s, v10.4s, v19.4s
add v11.4s, v11.4s, v20.4s
// x12[0-3] += s3[0]
// x13[0-3] += s3[1]
// x14[0-3] += s3[2]
// x15[0-3] += s3[3]
ld4r {v21.4s-v24.4s}, [x0]
add v12.4s, v12.4s, v21.4s
add v13.4s, v13.4s, v22.4s
add v14.4s, v14.4s, v23.4s
add v15.4s, v15.4s, v24.4s
// x12 += counter values 0-3
add v12.4s, v12.4s, v16.4s
// v16 (CTRINC) is dead from here on; v16-v23 receive the first 128
// bytes of input.
ld1 {v16.16b-v19.16b}, [x2], #64
ld1 {v20.16b-v23.16b}, [x2], #64
// Transpose, step 1. zip1 results go to temporaries v24-v31 because
// the following zip2 still needs both original source registers.
// interleave 32-bit words in state n, n+1
zip1 v24.4s, v0.4s, v1.4s
zip1 v25.4s, v2.4s, v3.4s
zip1 v26.4s, v4.4s, v5.4s
zip1 v27.4s, v6.4s, v7.4s
zip1 v28.4s, v8.4s, v9.4s
zip1 v29.4s, v10.4s, v11.4s
zip1 v30.4s, v12.4s, v13.4s
zip1 v31.4s, v14.4s, v15.4s
zip2 v1.4s, v0.4s, v1.4s
zip2 v3.4s, v2.4s, v3.4s
zip2 v5.4s, v4.4s, v5.4s
zip2 v7.4s, v6.4s, v7.4s
zip2 v9.4s, v8.4s, v9.4s
zip2 v11.4s, v10.4s, v11.4s
zip2 v13.4s, v12.4s, v13.4s
zip2 v15.4s, v14.4s, v15.4s
mov v0.16b, v24.16b
mov v2.16b, v25.16b
mov v4.16b, v26.16b
mov v6.16b, v27.16b
mov v8.16b, v28.16b
mov v10.16b, v29.16b
mov v12.16b, v30.16b
mov v14.16b, v31.16b
// Transpose, step 2, using the same zip1-to-temporary pattern.
// interleave 64-bit words in state n, n+2
zip1 v24.2d, v0.2d, v2.2d
zip1 v25.2d, v1.2d, v3.2d
zip1 v26.2d, v4.2d, v6.2d
zip1 v27.2d, v5.2d, v7.2d
zip1 v28.2d, v8.2d, v10.2d
zip1 v29.2d, v9.2d, v11.2d
zip1 v30.2d, v12.2d, v14.2d
zip1 v31.2d, v13.2d, v15.2d
zip2 v2.2d, v0.2d, v2.2d
zip2 v3.2d, v1.2d, v3.2d
zip2 v6.2d, v4.2d, v6.2d
zip2 v7.2d, v5.2d, v7.2d
zip2 v10.2d, v8.2d, v10.2d
zip2 v11.2d, v9.2d, v11.2d
zip2 v14.2d, v12.2d, v14.2d
zip2 v15.2d, v13.2d, v15.2d
mov v0.16b, v24.16b
mov v1.16b, v25.16b
mov v4.16b, v26.16b
mov v5.16b, v27.16b
mov v8.16b, v28.16b
mov v9.16b, v29.16b
mov v12.16b, v30.16b
mov v13.16b, v31.16b
// v24-v31 are free again; load the remaining 128 bytes of input.
ld1 {v24.16b-v27.16b}, [x2], #64
ld1 {v28.16b-v31.16b}, [x2]
// xor with corresponding input, write to output
eor v16.16b, v16.16b, v0.16b
eor v17.16b, v17.16b, v4.16b
eor v18.16b, v18.16b, v8.16b
eor v19.16b, v19.16b, v12.16b
st1 {v16.16b-v19.16b}, [x1], #64
eor v20.16b, v20.16b, v2.16b
eor v21.16b, v21.16b, v6.16b
eor v22.16b, v22.16b, v10.16b
eor v23.16b, v23.16b, v14.16b
st1 {v20.16b-v23.16b}, [x1], #64
eor v24.16b, v24.16b, v1.16b
eor v25.16b, v25.16b, v5.16b
eor v26.16b, v26.16b, v9.16b
eor v27.16b, v27.16b, v13.16b
st1 {v24.16b-v27.16b}, [x1], #64
eor v28.16b, v28.16b, v3.16b
eor v29.16b, v29.16b, v7.16b
eor v30.16b, v30.16b, v11.16b
eor v31.16b, v31.16b, v15.16b
st1 {v28.16b-v31.16b}, [x1]
ret
ENDPROC(chacha20_4block_xor_neon)
// Per-lane counter offsets: added to state word 12 by
// chacha20_4block_xor_neon so that the four lanes use consecutive counter
// values.
CTRINC: .word 0, 1, 2, 3
/*
* ChaCha20 256-bit cipher algorithm, RFC7539, arm64 NEON functions
*
* Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* Based on:
* ChaCha20 256-bit cipher algorithm, RFC7539, SIMD glue code
*
* Copyright (C) 2015 Martin Willi
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*/
#include <crypto/algapi.h>
#include <crypto/chacha20.h>
#include <linux/crypto.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <asm/neon.h>
/*
 * Low-level block routines, declared asmlinkage and defined outside this
 * file (presumably in the accompanying NEON assembly source - confirm).
 * Judging by how chacha20_dosimd() advances its pointers, the first
 * handles one CHACHA20_BLOCK_SIZE block per call and the second four.
 */
asmlinkage void chacha20_block_xor_neon(u32 *state, u8 *dst, const u8 *src);
asmlinkage void chacha20_4block_xor_neon(u32 *state, u8 *dst, const u8 *src);
/*
 * Run the NEON ChaCha20 routines over @bytes bytes of @src, writing the
 * result to @dst.
 *
 * Data is consumed four blocks at a time while possible, then one block at
 * a time. A trailing partial block is bounced through a block-sized stack
 * buffer so that the assembly routine always works on a whole block.
 * state[12] is bumped by the number of whole blocks processed.
 */
static void chacha20_dosimd(u32 *state, u8 *dst, const u8 *src,
			    unsigned int bytes)
{
	const unsigned int quad = CHACHA20_BLOCK_SIZE * 4;
	u8 buf[CHACHA20_BLOCK_SIZE];

	/* Bulk path: four blocks per call. */
	for (; bytes >= quad; bytes -= quad) {
		chacha20_4block_xor_neon(state, dst, src);
		src += quad;
		dst += quad;
		state[12] += 4;
	}

	/* Remaining whole blocks, one per call. */
	for (; bytes >= CHACHA20_BLOCK_SIZE; bytes -= CHACHA20_BLOCK_SIZE) {
		chacha20_block_xor_neon(state, dst, src);
		src += CHACHA20_BLOCK_SIZE;
		dst += CHACHA20_BLOCK_SIZE;
		state[12]++;
	}

	if (!bytes)
		return;

	/* Partial tail: process in place in buf, copy out only @bytes. */
	memcpy(buf, src, bytes);
	chacha20_block_xor_neon(state, buf, buf);
	memcpy(dst, buf, bytes);
}
/*
 * blkcipher encrypt/decrypt handler (wired up for both in the algorithm
 * descriptor).
 *
 * Requests of at most one block are handed to crypto_chacha20_crypt().
 * Everything else is walked in chunks and processed by chacha20_dosimd()
 * inside a kernel_neon_begin()/kernel_neon_end() section.
 *
 * Returns the status of the last walk operation.
 */
static int chacha20_simd(struct blkcipher_desc *desc, struct scatterlist *dst,
			 struct scatterlist *src, unsigned int nbytes)
{
	u32 state[16];
	struct blkcipher_walk walk;
	int err;

	if (nbytes <= CHACHA20_BLOCK_SIZE)
		return crypto_chacha20_crypt(desc, dst, src, nbytes);

	blkcipher_walk_init(&walk, dst, src, nbytes);
	err = blkcipher_walk_virt_block(desc, &walk, CHACHA20_BLOCK_SIZE);

	crypto_chacha20_init(state, crypto_blkcipher_ctx(desc->tfm), walk.iv);

	kernel_neon_begin();

	/* Whole blocks first; any sub-block remainder goes back to the walk. */
	for (;;) {
		unsigned int avail = walk.nbytes;
		unsigned int tail = avail % CHACHA20_BLOCK_SIZE;

		if (avail < CHACHA20_BLOCK_SIZE)
			break;

		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
				avail - tail);
		err = blkcipher_walk_done(desc, &walk, tail);
	}

	/* Whatever is left is shorter than one block. */
	if (walk.nbytes) {
		chacha20_dosimd(state, walk.dst.virt.addr, walk.src.virt.addr,
				walk.nbytes);
		err = blkcipher_walk_done(desc, &walk, 0);
	}

	kernel_neon_end();

	return err;
}
/*
 * Algorithm descriptor handed to crypto_register_alg() and
 * crypto_unregister_alg() by the module init/exit functions.
 */
static struct crypto_alg alg = {
.cra_name = "chacha20",
.cra_driver_name = "chacha20-neon",
/* NOTE(review): presumably chosen to outrank the generic chacha20 driver - confirm */
.cra_priority = 300,
.cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER,
/* NOTE(review): a block size of 1 presumably marks stream-cipher semantics - confirm */
.cra_blocksize = 1,
.cra_type = &crypto_blkcipher_type,
.cra_ctxsize = sizeof(struct chacha20_ctx),
/* buffers are requested with u32 alignment */
.cra_alignmask = sizeof(u32) - 1,
.cra_module = THIS_MODULE,
.cra_u = {
.blkcipher = {
/* min == max: exactly one key size is accepted */
.min_keysize = CHACHA20_KEY_SIZE,
.max_keysize = CHACHA20_KEY_SIZE,
.ivsize = CHACHA20_IV_SIZE,
.geniv = "seqiv",
.setkey = crypto_chacha20_setkey,
/* the same handler serves both directions */
.encrypt = chacha20_simd,
.decrypt = chacha20_simd,
},
},
};
/*
 * Module entry point: register the algorithm and return whatever
 * crypto_register_alg() reports. No CPU feature check is made here.
 */
static int __init chacha20_simd_mod_init(void)
{
return crypto_register_alg(&alg);
}
/* Module exit point: unregister the algorithm registered at init time. */
static void __exit chacha20_simd_mod_fini(void)
{
crypto_unregister_alg(&alg);
}
/* Hook the init/exit handlers into the module loader. */
module_init(chacha20_simd_mod_init);
module_exit(chacha20_simd_mod_fini);
MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel@linaro.org>");
MODULE_LICENSE("GPL v2");
/* NOTE(review): presumably allows auto-loading on a request for "chacha20" - confirm */
MODULE_ALIAS_CRYPTO("chacha20");
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment