Commit 596f674d authored by Herbert Xu

crypto: p10-aes-gcm - Revert implementation

Revert the changes that added p10-aes-gcm:

	0781bbd7 ("crypto: p10-aes-gcm - A perl script to process PowerPC assembler source")
	41a6437a ("crypto: p10-aes-gcm - Supporting functions for ghash")
	3b47ecca ("crypto: p10-aes-gcm - Supporting functions for AES")
	ca68a96c ("crypto: p10-aes-gcm - An accelerated AES/GCM stitched implementation")
	cc40379b ("crypto: p10-aes-gcm - Glue code for AES/GCM stitched implementation")
	3c657e86 ("crypto: p10-aes-gcm - Update Kconfig and Makefile")

These changes fail to build in many configurations and are not ready
for prime time.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
parent f81c1d4a
......@@ -94,15 +94,4 @@ config CRYPTO_AES_PPC_SPE
architecture specific assembler implementations that work on 1KB
tables or 256 bytes S-boxes.
config CRYPTO_P10_AES_GCM
tristate "Stitched AES/GCM acceleration support on P10+ CPU (PPC)"
depends on PPC64
select CRYPTO_LIB_AES
select CRYPTO_ALGAPI
select CRYPTO_AEAD
default m
help
Support for cryptographic acceleration instructions on Power10+ CPU.
This module supports stitched acceleration for AES/GCM in hardware.
endmenu
......@@ -13,7 +13,6 @@ obj-$(CONFIG_CRYPTO_SHA256_PPC_SPE) += sha256-ppc-spe.o
obj-$(CONFIG_CRYPTO_CRC32C_VPMSUM) += crc32c-vpmsum.o
obj-$(CONFIG_CRYPTO_CRCT10DIF_VPMSUM) += crct10dif-vpmsum.o
obj-$(CONFIG_CRYPTO_VPMSUM_TESTER) += crc-vpmsum_test.o
obj-$(CONFIG_CRYPTO_P10_AES_GCM) += p10-aes-gcm-crypto.o
aes-ppc-spe-y := aes-spe-core.o aes-spe-keys.o aes-tab-4k.o aes-spe-modes.o aes-spe-glue.o
md5-ppc-y := md5-asm.o md5-glue.o
......@@ -22,12 +21,3 @@ sha1-ppc-spe-y := sha1-spe-asm.o sha1-spe-glue.o
sha256-ppc-spe-y := sha256-spe-asm.o sha256-spe-glue.o
crc32c-vpmsum-y := crc32c-vpmsum_asm.o crc32c-vpmsum_glue.o
crct10dif-vpmsum-y := crct10dif-vpmsum_asm.o crct10dif-vpmsum_glue.o
p10-aes-gcm-crypto-y := p10-aes-gcm-glue.o p10_aes_gcm.o ghashp8-ppc.o aesp8-ppc.o
quiet_cmd_perl = PERL $@
cmd_perl = $(PERL) $< $(if $(CONFIG_CPU_LITTLE_ENDIAN), linux-ppc64le, linux-ppc64) > $@
targets += aesp8-ppc.S ghashp8-ppc.S
$(obj)/aesp8-ppc.S $(obj)/ghashp8-ppc.S: $(obj)/%.S: $(src)/%.pl FORCE
$(call if_changed,perl)
#! /usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0
# This code is taken from CRYPTOGAMs[1] and is included here using the option
# in the license to distribute the code under the GPL. Therefore this program
# is free software; you can redistribute it and/or modify it under the terms of
# the GNU General Public License version 2 as published by the Free Software
# Foundation.
#
# [1] https://www.openssl.org/~appro/cryptogams/
# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# * Redistributions of source code must retain copyright notices,
# this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# * Neither the name of the CRYPTOGAMS nor the names of its
# copyright holder and contributors may be used to endorse or
# promote products derived from this software without specific
# prior written permission.
#
# ALTERNATIVELY, provided that this notice is retained in full, this
# product may be distributed under the terms of the GNU General Public
# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
# those given above.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for AES instructions as per PowerISA
# specification version 2.07, first implemented by POWER8 processor.
# The module is endian-agnostic in sense that it supports both big-
# and little-endian cases. Data alignment in parallelizable modes is
# handled with VSX loads and stores, which implies MSR.VSX flag being
# set. It should also be noted that ISA specification doesn't prohibit
# alignment exceptions for these instructions on page boundaries.
# Initially alignment was handled in pure AltiVec/VMX way [when data
# is aligned programmatically, which in turn guarantees exception-
# free execution], but it turned to hamper performance when vcipher
# instructions are interleaved. It's reckoned that eventual
# misalignment penalties at page boundaries are in average lower
# than additional overhead in pure AltiVec approach.
#
# May 2016
#
# Add XTS subroutine, 9x on little- and 12x improvement on big-endian
# systems were measured.
#
######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
# CBC en-/decrypt CTR XTS
# POWER8[le] 3.96/0.72 0.74 1.1
# POWER8[be] 3.75/0.65 0.66 1.0
# Command-line configuration: the first argument selects the ABI
# "flavour" (e.g. linux-ppc64, linux-ppc64le), which picks 64- vs
# 32-bit mnemonics and stack-layout constants used by the generated
# assembler.  Dies on an unrecognized flavour.
$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;		# pointer/GPR size in bytes
	$LRSAVE	=2*$SIZE_T;	# LR save-slot offset per 64-bit ABI
	$STU	="stdu";	# store-with-update (frame push)
	$POP	="ld";
	$PUSH	="std";
	$UCMP	="cmpld";	# unsigned compare
	$SHL	="sldi";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
	$SHL	="slwi";
} else { die "nonsense $flavour"; }

# Non-zero when the flavour targets little-endian (name ends in "le").
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;

# Locate the ppc-xlate.pl post-processor next to this script or in the
# perlasm directory, then pipe everything we print on STDOUT through it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# BUGFIX: use low-precedence "or" instead of "||" here.  "||" binds to
# the concatenated command string ("..." . shift), which is always
# true, so a failed open() could never reach the die.
open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$FRAME=8*$SIZE_T;	# minimal stack-frame size for this ABI
$prefix="aes_p8";	# symbol prefix for all generated entry points

$sp="r1";		# stack pointer
$vrsave="r12";		# scratch GPR used to save/restore VRSAVE (SPR 256)
#########################################################################
{{{ # Key setup procedures #
# Emits .${prefix}_set_encrypt_key and .${prefix}_set_decrypt_key.
# Encrypt-key setup expands a 128/192/256-bit user key into the round
# key schedule (return value in r3: 0 ok, -1 null pointer, -2 bad key
# length, per the Lenc_key_abort paths below).  Decrypt-key setup calls
# Lset_encrypt_key and then swaps the schedule end-for-end (Ldeckey loop).
# Register roles for the generated code:
my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
# NOTE: the heredoc below is string content consumed by ppc-xlate.pl
# ("?"-prefixed and "le?"-prefixed lines are conditional on endianness);
# it is emitted verbatim and intentionally left untouched here.
$code.=<<___;
.machine "any"
.text
.align 7
rcon:
.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 ?rev
.long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev
.long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev
.long 0,0,0,0 ?asis
Lconsts:
mflr r0
bcl 20,31,\$+4
mflr $ptr #vvvvv "distance between . and rcon
addi $ptr,$ptr,-0x48
mtlr r0
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.asciz "AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.globl .${prefix}_set_encrypt_key
Lset_encrypt_key:
mflr r11
$PUSH r11,$LRSAVE($sp)
li $ptr,-1
${UCMP}i $inp,0
beq- Lenc_key_abort # if ($inp==0) return -1;
${UCMP}i $out,0
beq- Lenc_key_abort # if ($out==0) return -1;
li $ptr,-2
cmpwi $bits,128
blt- Lenc_key_abort
cmpwi $bits,256
bgt- Lenc_key_abort
andi. r0,$bits,0x3f
bne- Lenc_key_abort
lis r0,0xfff0
mfspr $vrsave,256
mtspr 256,r0
bl Lconsts
mtlr r11
neg r9,$inp
lvx $in0,0,$inp
addi $inp,$inp,15 # 15 is not typo
lvsr $key,0,r9 # borrow $key
li r8,0x20
cmpwi $bits,192
lvx $in1,0,$inp
le?vspltisb $mask,0x0f # borrow $mask
lvx $rcon,0,$ptr
le?vxor $key,$key,$mask # adjust for byte swap
lvx $mask,r8,$ptr
addi $ptr,$ptr,0x10
vperm $in0,$in0,$in1,$key # align [and byte swap in LE]
li $cnt,8
vxor $zero,$zero,$zero
mtctr $cnt
?lvsr $outperm,0,$out
vspltisb $outmask,-1
lvx $outhead,0,$out
?vperm $outmask,$zero,$outmask,$outperm
blt Loop128
addi $inp,$inp,8
beq L192
addi $inp,$inp,8
b L256
.align 4
Loop128:
vperm $key,$in0,$in0,$mask # rotate-n-splat
vsldoi $tmp,$zero,$in0,12 # >>32
vperm $outtail,$in0,$in0,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
vcipherlast $key,$key,$rcon
stvx $stage,0,$out
addi $out,$out,16
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
vadduwm $rcon,$rcon,$rcon
vxor $in0,$in0,$key
bdnz Loop128
lvx $rcon,0,$ptr # last two round keys
vperm $key,$in0,$in0,$mask # rotate-n-splat
vsldoi $tmp,$zero,$in0,12 # >>32
vperm $outtail,$in0,$in0,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
vcipherlast $key,$key,$rcon
stvx $stage,0,$out
addi $out,$out,16
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
vadduwm $rcon,$rcon,$rcon
vxor $in0,$in0,$key
vperm $key,$in0,$in0,$mask # rotate-n-splat
vsldoi $tmp,$zero,$in0,12 # >>32
vperm $outtail,$in0,$in0,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
vcipherlast $key,$key,$rcon
stvx $stage,0,$out
addi $out,$out,16
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
vxor $in0,$in0,$key
vperm $outtail,$in0,$in0,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
stvx $stage,0,$out
addi $inp,$out,15 # 15 is not typo
addi $out,$out,0x50
li $rounds,10
b Ldone
.align 4
L192:
lvx $tmp,0,$inp
li $cnt,4
vperm $outtail,$in0,$in0,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
stvx $stage,0,$out
addi $out,$out,16
vperm $in1,$in1,$tmp,$key # align [and byte swap in LE]
vspltisb $key,8 # borrow $key
mtctr $cnt
vsububm $mask,$mask,$key # adjust the mask
Loop192:
vperm $key,$in1,$in1,$mask # roate-n-splat
vsldoi $tmp,$zero,$in0,12 # >>32
vcipherlast $key,$key,$rcon
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
vsldoi $stage,$zero,$in1,8
vspltw $tmp,$in0,3
vxor $tmp,$tmp,$in1
vsldoi $in1,$zero,$in1,12 # >>32
vadduwm $rcon,$rcon,$rcon
vxor $in1,$in1,$tmp
vxor $in0,$in0,$key
vxor $in1,$in1,$key
vsldoi $stage,$stage,$in0,8
vperm $key,$in1,$in1,$mask # rotate-n-splat
vsldoi $tmp,$zero,$in0,12 # >>32
vperm $outtail,$stage,$stage,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
vcipherlast $key,$key,$rcon
stvx $stage,0,$out
addi $out,$out,16
vsldoi $stage,$in0,$in1,8
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vperm $outtail,$stage,$stage,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
stvx $stage,0,$out
addi $out,$out,16
vspltw $tmp,$in0,3
vxor $tmp,$tmp,$in1
vsldoi $in1,$zero,$in1,12 # >>32
vadduwm $rcon,$rcon,$rcon
vxor $in1,$in1,$tmp
vxor $in0,$in0,$key
vxor $in1,$in1,$key
vperm $outtail,$in0,$in0,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
stvx $stage,0,$out
addi $inp,$out,15 # 15 is not typo
addi $out,$out,16
bdnz Loop192
li $rounds,12
addi $out,$out,0x20
b Ldone
.align 4
L256:
lvx $tmp,0,$inp
li $cnt,7
li $rounds,14
vperm $outtail,$in0,$in0,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
stvx $stage,0,$out
addi $out,$out,16
vperm $in1,$in1,$tmp,$key # align [and byte swap in LE]
mtctr $cnt
Loop256:
vperm $key,$in1,$in1,$mask # rotate-n-splat
vsldoi $tmp,$zero,$in0,12 # >>32
vperm $outtail,$in1,$in1,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
vcipherlast $key,$key,$rcon
stvx $stage,0,$out
addi $out,$out,16
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
vadduwm $rcon,$rcon,$rcon
vxor $in0,$in0,$key
vperm $outtail,$in0,$in0,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
stvx $stage,0,$out
addi $inp,$out,15 # 15 is not typo
addi $out,$out,16
bdz Ldone
vspltw $key,$in0,3 # just splat
vsldoi $tmp,$zero,$in1,12 # >>32
vsbox $key,$key
vxor $in1,$in1,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in1,$in1,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in1,$in1,$tmp
vxor $in1,$in1,$key
b Loop256
.align 4
Ldone:
lvx $in1,0,$inp # redundant in aligned case
vsel $in1,$outhead,$in1,$outmask
stvx $in1,0,$inp
li $ptr,0
mtspr 256,$vrsave
stw $rounds,0($out)
Lenc_key_abort:
mr r3,$ptr
blr
.long 0
.byte 0,12,0x14,1,0,0,3,0
.long 0
.size .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
.globl .${prefix}_set_decrypt_key
$STU $sp,-$FRAME($sp)
mflr r10
$PUSH r10,$FRAME+$LRSAVE($sp)
bl Lset_encrypt_key
mtlr r10
cmpwi r3,0
bne- Ldec_key_abort
slwi $cnt,$rounds,4
subi $inp,$out,240 # first round key
srwi $rounds,$rounds,1
add $out,$inp,$cnt # last round key
mtctr $rounds
Ldeckey:
lwz r0, 0($inp)
lwz r6, 4($inp)
lwz r7, 8($inp)
lwz r8, 12($inp)
addi $inp,$inp,16
lwz r9, 0($out)
lwz r10,4($out)
lwz r11,8($out)
lwz r12,12($out)
stw r0, 0($out)
stw r6, 4($out)
stw r7, 8($out)
stw r8, 12($out)
subi $out,$out,16
stw r9, -16($inp)
stw r10,-12($inp)
stw r11,-8($inp)
stw r12,-4($inp)
bdnz Ldeckey
xor r3,r3,r3 # return value
Ldec_key_abort:
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,0,3,0
.long 0
.size .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
___
}}}
#########################################################################
{{{ # Single block en- and decrypt procedures #
# gen_block($dir): emits a single-block AES routine.  $dir is "en" or
# "de"; for "de" the $n prefix becomes "n", turning vcipher/vcipherlast
# into vncipher/vncipherlast in the template below.  Called twice at
# the bottom of this section to produce both directions.
sub gen_block () {
my $dir = shift;
my $n = $dir eq "de" ? "n" : "";
# Argument registers of the generated function (inp, out, key, plus
# scratch for the round count and a running offset):
my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
# Heredoc is emitted verbatim — string content, not perl code.
$code.=<<___;
.globl .${prefix}_${dir}crypt
lwz $rounds,240($key)
lis r0,0xfc00
mfspr $vrsave,256
li $idx,15 # 15 is not typo
mtspr 256,r0
lvx v0,0,$inp
neg r11,$out
lvx v1,$idx,$inp
lvsl v2,0,$inp # inpperm
le?vspltisb v4,0x0f
?lvsl v3,0,r11 # outperm
le?vxor v2,v2,v4
li $idx,16
vperm v0,v0,v1,v2 # align [and byte swap in LE]
lvx v1,0,$key
?lvsl v5,0,$key # keyperm
srwi $rounds,$rounds,1
lvx v2,$idx,$key
addi $idx,$idx,16
subi $rounds,$rounds,1
?vperm v1,v1,v2,v5 # align round key
vxor v0,v0,v1
lvx v1,$idx,$key
addi $idx,$idx,16
mtctr $rounds
Loop_${dir}c:
?vperm v2,v2,v1,v5
v${n}cipher v0,v0,v2
lvx v2,$idx,$key
addi $idx,$idx,16
?vperm v1,v1,v2,v5
v${n}cipher v0,v0,v1
lvx v1,$idx,$key
addi $idx,$idx,16
bdnz Loop_${dir}c
?vperm v2,v2,v1,v5
v${n}cipher v0,v0,v2
lvx v2,$idx,$key
?vperm v1,v1,v2,v5
v${n}cipherlast v0,v0,v1
vspltisb v2,-1
vxor v1,v1,v1
li $idx,15 # 15 is not typo
?vperm v2,v1,v2,v3 # outmask
le?vxor v3,v3,v4
lvx v1,0,$out # outhead
vperm v0,v0,v0,v3 # rotate [and byte swap in LE]
vsel v1,v1,v0,v2
lvx v4,$idx,$out
stvx v1,0,$out
vsel v0,v0,v4,v2
stvx v0,$idx,$out
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
.size .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
___
}
# The "&" call form is required here: it bypasses the empty "()"
# prototype on gen_block, which would otherwise reject arguments.
&gen_block("en");
&gen_block("de");
}}}
#########################################################################
{{{ # CBC en- and decrypt procedures #
# Register map for the CBC routines emitted below: GPR argument
# registers (inp, out, len, key, ivp, enc flag, plus scratch), and the
# VR working set for round keys, the in-flight block, and the
# alignment/permutation helpers.
my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
map("v$_",(4..10));
# Emits .${prefix}_cbc_encrypt: one-block-at-a-time CBC encryption
# (Lcbc_enc) and decryption (Lcbc_dec); decryption of >=128 bytes
# branches to the 8x-unrolled _aesp8_cbc_decrypt8x emitted in the next
# section.  Heredoc is emitted verbatim — string content, not perl code.
$code.=<<___;
.globl .${prefix}_cbc_encrypt
${UCMP}i $len,16
bltlr-
cmpwi $enc,0 # test direction
lis r0,0xffe0
mfspr $vrsave,256
mtspr 256,r0
li $idx,15
vxor $rndkey0,$rndkey0,$rndkey0
le?vspltisb $tmp,0x0f
lvx $ivec,0,$ivp # load [unaligned] iv
lvsl $inpperm,0,$ivp
lvx $inptail,$idx,$ivp
le?vxor $inpperm,$inpperm,$tmp
vperm $ivec,$ivec,$inptail,$inpperm
neg r11,$inp
?lvsl $keyperm,0,$key # prepare for unaligned key
lwz $rounds,240($key)
lvsr $inpperm,0,r11 # prepare for unaligned load
lvx $inptail,0,$inp
addi $inp,$inp,15 # 15 is not typo
le?vxor $inpperm,$inpperm,$tmp
?lvsr $outperm,0,$out # prepare for unaligned store
vspltisb $outmask,-1
lvx $outhead,0,$out
?vperm $outmask,$rndkey0,$outmask,$outperm
le?vxor $outperm,$outperm,$tmp
srwi $rounds,$rounds,1
li $idx,16
subi $rounds,$rounds,1
beq Lcbc_dec
Lcbc_enc:
vmr $inout,$inptail
lvx $inptail,0,$inp
addi $inp,$inp,16
mtctr $rounds
subi $len,$len,16 # len-=16
lvx $rndkey0,0,$key
vperm $inout,$inout,$inptail,$inpperm
lvx $rndkey1,$idx,$key
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key
addi $idx,$idx,16
vxor $inout,$inout,$ivec
Loop_cbc_enc:
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vcipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vcipher $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key
addi $idx,$idx,16
bdnz Loop_cbc_enc
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vcipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key
li $idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vcipherlast $ivec,$inout,$rndkey0
${UCMP}i $len,16
vperm $tmp,$ivec,$ivec,$outperm
vsel $inout,$outhead,$tmp,$outmask
vmr $outhead,$tmp
stvx $inout,0,$out
addi $out,$out,16
bge Lcbc_enc
b Lcbc_done
.align 4
Lcbc_dec:
${UCMP}i $len,128
bge _aesp8_cbc_decrypt8x
vmr $tmp,$inptail
lvx $inptail,0,$inp
addi $inp,$inp,16
mtctr $rounds
subi $len,$len,16 # len-=16
lvx $rndkey0,0,$key
vperm $tmp,$tmp,$inptail,$inpperm
lvx $rndkey1,$idx,$key
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $inout,$tmp,$rndkey0
lvx $rndkey0,$idx,$key
addi $idx,$idx,16
Loop_cbc_dec:
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vncipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vncipher $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key
addi $idx,$idx,16
bdnz Loop_cbc_dec
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vncipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key
li $idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vncipherlast $inout,$inout,$rndkey0
${UCMP}i $len,16
vxor $inout,$inout,$ivec
vmr $ivec,$tmp
vperm $tmp,$inout,$inout,$outperm
vsel $inout,$outhead,$tmp,$outmask
vmr $outhead,$tmp
stvx $inout,0,$out
addi $out,$out,16
bge Lcbc_dec
Lcbc_done:
addi $out,$out,-1
lvx $inout,0,$out # redundant in aligned case
vsel $inout,$outhead,$inout,$outmask
stvx $inout,0,$out
neg $enc,$ivp # write [unaligned] iv
li $idx,15 # 15 is not typo
vxor $rndkey0,$rndkey0,$rndkey0
vspltisb $outmask,-1
le?vspltisb $tmp,0x0f
?lvsl $outperm,0,$enc
?vperm $outmask,$rndkey0,$outmask,$outperm
le?vxor $outperm,$outperm,$tmp
lvx $outhead,0,$ivp
vperm $ivec,$ivec,$ivec,$outperm
vsel $inout,$outhead,$ivec,$outmask
lvx $inptail,$idx,$ivp
stvx $inout,0,$ivp
vsel $inout,$ivec,$inptail,$outmask
stvx $inout,$idx,$ivp
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,6,0
.long 0
___
#########################################################################
{{ # Optimized CBC decrypt procedure #
# Register map for the 8x-unrolled decrypt path: $key_ points at the
# on-stack copy of the key schedule; $x00..$x70 hold the 0x00..0x70
# block offsets; in0-7/out0-7 carry eight blocks in flight.
my $key_="r11";
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
# v26-v31 last 6 round keys
my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
# Emits _aesp8_cbc_decrypt8x, the bulk (>=128-byte) CBC decrypt path
# reached from Lcbc_dec above.  It saves v20-v31 and r26-r31 per ABI,
# off-loads permuted round keys to the stack, decrypts eight blocks per
# iteration, handles a 1..7-block tail via the Lcbc_dec8x_* switch, then
# wipes the on-stack key copies and restores registers.  Heredoc is
# emitted verbatim — string content, not perl code.
$code.=<<___;
.align 5
_aesp8_cbc_decrypt8x:
$STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
li r10,`$FRAME+8*16+15`
li r11,`$FRAME+8*16+31`
stvx v20,r10,$sp # ABI says so
addi r10,r10,32
stvx v21,r11,$sp
addi r11,r11,32
stvx v22,r10,$sp
addi r10,r10,32
stvx v23,r11,$sp
addi r11,r11,32
stvx v24,r10,$sp
addi r10,r10,32
stvx v25,r11,$sp
addi r11,r11,32
stvx v26,r10,$sp
addi r10,r10,32
stvx v27,r11,$sp
addi r11,r11,32
stvx v28,r10,$sp
addi r10,r10,32
stvx v29,r11,$sp
addi r11,r11,32
stvx v30,r10,$sp
stvx v31,r11,$sp
li r0,-1
stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
li $x10,0x10
$PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
li $x20,0x20
$PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
li $x30,0x30
$PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
li $x40,0x40
$PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
li $x50,0x50
$PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
li $x60,0x60
$PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
li $x70,0x70
mtspr 256,r0
subi $rounds,$rounds,3 # -4 in total
subi $len,$len,128 # bias
lvx $rndkey0,$x00,$key # load key schedule
lvx v30,$x10,$key
addi $key,$key,0x20
lvx v31,$x00,$key
?vperm $rndkey0,$rndkey0,v30,$keyperm
addi $key_,$sp,$FRAME+15
mtctr $rounds
Load_cbc_dec_key:
?vperm v24,v30,v31,$keyperm
lvx v30,$x10,$key
addi $key,$key,0x20
stvx v24,$x00,$key_ # off-load round[1]
?vperm v25,v31,v30,$keyperm
lvx v31,$x00,$key
stvx v25,$x10,$key_ # off-load round[2]
addi $key_,$key_,0x20
bdnz Load_cbc_dec_key
lvx v26,$x10,$key
?vperm v24,v30,v31,$keyperm
lvx v27,$x20,$key
stvx v24,$x00,$key_ # off-load round[3]
?vperm v25,v31,v26,$keyperm
lvx v28,$x30,$key
stvx v25,$x10,$key_ # off-load round[4]
addi $key_,$sp,$FRAME+15 # rewind $key_
?vperm v26,v26,v27,$keyperm
lvx v29,$x40,$key
?vperm v27,v27,v28,$keyperm
lvx v30,$x50,$key
?vperm v28,v28,v29,$keyperm
lvx v31,$x60,$key
?vperm v29,v29,v30,$keyperm
lvx $out0,$x70,$key # borrow $out0
?vperm v30,v30,v31,$keyperm
lvx v24,$x00,$key_ # pre-load round[1]
?vperm v31,v31,$out0,$keyperm
lvx v25,$x10,$key_ # pre-load round[2]
#lvx $inptail,0,$inp # "caller" already did this
#addi $inp,$inp,15 # 15 is not typo
subi $inp,$inp,15 # undo "caller"
le?li $idx,8
lvx_u $in0,$x00,$inp # load first 8 "words"
le?lvsl $inpperm,0,$idx
le?vspltisb $tmp,0x0f
lvx_u $in1,$x10,$inp
le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u
lvx_u $in2,$x20,$inp
le?vperm $in0,$in0,$in0,$inpperm
lvx_u $in3,$x30,$inp
le?vperm $in1,$in1,$in1,$inpperm
lvx_u $in4,$x40,$inp
le?vperm $in2,$in2,$in2,$inpperm
vxor $out0,$in0,$rndkey0
lvx_u $in5,$x50,$inp
le?vperm $in3,$in3,$in3,$inpperm
vxor $out1,$in1,$rndkey0
lvx_u $in6,$x60,$inp
le?vperm $in4,$in4,$in4,$inpperm
vxor $out2,$in2,$rndkey0
lvx_u $in7,$x70,$inp
addi $inp,$inp,0x80
le?vperm $in5,$in5,$in5,$inpperm
vxor $out3,$in3,$rndkey0
le?vperm $in6,$in6,$in6,$inpperm
vxor $out4,$in4,$rndkey0
le?vperm $in7,$in7,$in7,$inpperm
vxor $out5,$in5,$rndkey0
vxor $out6,$in6,$rndkey0
vxor $out7,$in7,$rndkey0
mtctr $rounds
b Loop_cbc_dec8x
.align 5
Loop_cbc_dec8x:
vncipher $out0,$out0,v24
vncipher $out1,$out1,v24
vncipher $out2,$out2,v24
vncipher $out3,$out3,v24
vncipher $out4,$out4,v24
vncipher $out5,$out5,v24
vncipher $out6,$out6,v24
vncipher $out7,$out7,v24
lvx v24,$x20,$key_ # round[3]
addi $key_,$key_,0x20
vncipher $out0,$out0,v25
vncipher $out1,$out1,v25
vncipher $out2,$out2,v25
vncipher $out3,$out3,v25
vncipher $out4,$out4,v25
vncipher $out5,$out5,v25
vncipher $out6,$out6,v25
vncipher $out7,$out7,v25
lvx v25,$x10,$key_ # round[4]
bdnz Loop_cbc_dec8x
subic $len,$len,128 # $len-=128
vncipher $out0,$out0,v24
vncipher $out1,$out1,v24
vncipher $out2,$out2,v24
vncipher $out3,$out3,v24
vncipher $out4,$out4,v24
vncipher $out5,$out5,v24
vncipher $out6,$out6,v24
vncipher $out7,$out7,v24
subfe. r0,r0,r0 # borrow?-1:0
vncipher $out0,$out0,v25
vncipher $out1,$out1,v25
vncipher $out2,$out2,v25
vncipher $out3,$out3,v25
vncipher $out4,$out4,v25
vncipher $out5,$out5,v25
vncipher $out6,$out6,v25
vncipher $out7,$out7,v25
and r0,r0,$len
vncipher $out0,$out0,v26
vncipher $out1,$out1,v26
vncipher $out2,$out2,v26
vncipher $out3,$out3,v26
vncipher $out4,$out4,v26
vncipher $out5,$out5,v26
vncipher $out6,$out6,v26
vncipher $out7,$out7,v26
add $inp,$inp,r0 # $inp is adjusted in such
# way that at exit from the
# loop inX-in7 are loaded
# with last "words"
vncipher $out0,$out0,v27
vncipher $out1,$out1,v27
vncipher $out2,$out2,v27
vncipher $out3,$out3,v27
vncipher $out4,$out4,v27
vncipher $out5,$out5,v27
vncipher $out6,$out6,v27
vncipher $out7,$out7,v27
addi $key_,$sp,$FRAME+15 # rewind $key_
vncipher $out0,$out0,v28
vncipher $out1,$out1,v28
vncipher $out2,$out2,v28
vncipher $out3,$out3,v28
vncipher $out4,$out4,v28
vncipher $out5,$out5,v28
vncipher $out6,$out6,v28
vncipher $out7,$out7,v28
lvx v24,$x00,$key_ # re-pre-load round[1]
vncipher $out0,$out0,v29
vncipher $out1,$out1,v29
vncipher $out2,$out2,v29
vncipher $out3,$out3,v29
vncipher $out4,$out4,v29
vncipher $out5,$out5,v29
vncipher $out6,$out6,v29
vncipher $out7,$out7,v29
lvx v25,$x10,$key_ # re-pre-load round[2]
vncipher $out0,$out0,v30
vxor $ivec,$ivec,v31 # xor with last round key
vncipher $out1,$out1,v30
vxor $in0,$in0,v31
vncipher $out2,$out2,v30
vxor $in1,$in1,v31
vncipher $out3,$out3,v30
vxor $in2,$in2,v31
vncipher $out4,$out4,v30
vxor $in3,$in3,v31
vncipher $out5,$out5,v30
vxor $in4,$in4,v31
vncipher $out6,$out6,v30
vxor $in5,$in5,v31
vncipher $out7,$out7,v30
vxor $in6,$in6,v31
vncipherlast $out0,$out0,$ivec
vncipherlast $out1,$out1,$in0
lvx_u $in0,$x00,$inp # load next input block
vncipherlast $out2,$out2,$in1
lvx_u $in1,$x10,$inp
vncipherlast $out3,$out3,$in2
le?vperm $in0,$in0,$in0,$inpperm
lvx_u $in2,$x20,$inp
vncipherlast $out4,$out4,$in3
le?vperm $in1,$in1,$in1,$inpperm
lvx_u $in3,$x30,$inp
vncipherlast $out5,$out5,$in4
le?vperm $in2,$in2,$in2,$inpperm
lvx_u $in4,$x40,$inp
vncipherlast $out6,$out6,$in5
le?vperm $in3,$in3,$in3,$inpperm
lvx_u $in5,$x50,$inp
vncipherlast $out7,$out7,$in6
le?vperm $in4,$in4,$in4,$inpperm
lvx_u $in6,$x60,$inp
vmr $ivec,$in7
le?vperm $in5,$in5,$in5,$inpperm
lvx_u $in7,$x70,$inp
addi $inp,$inp,0x80
le?vperm $out0,$out0,$out0,$inpperm
le?vperm $out1,$out1,$out1,$inpperm
stvx_u $out0,$x00,$out
le?vperm $in6,$in6,$in6,$inpperm
vxor $out0,$in0,$rndkey0
le?vperm $out2,$out2,$out2,$inpperm
stvx_u $out1,$x10,$out
le?vperm $in7,$in7,$in7,$inpperm
vxor $out1,$in1,$rndkey0
le?vperm $out3,$out3,$out3,$inpperm
stvx_u $out2,$x20,$out
vxor $out2,$in2,$rndkey0
le?vperm $out4,$out4,$out4,$inpperm
stvx_u $out3,$x30,$out
vxor $out3,$in3,$rndkey0
le?vperm $out5,$out5,$out5,$inpperm
stvx_u $out4,$x40,$out
vxor $out4,$in4,$rndkey0
le?vperm $out6,$out6,$out6,$inpperm
stvx_u $out5,$x50,$out
vxor $out5,$in5,$rndkey0
le?vperm $out7,$out7,$out7,$inpperm
stvx_u $out6,$x60,$out
vxor $out6,$in6,$rndkey0
stvx_u $out7,$x70,$out
addi $out,$out,0x80
vxor $out7,$in7,$rndkey0
mtctr $rounds
beq Loop_cbc_dec8x # did $len-=128 borrow?
addic. $len,$len,128
beq Lcbc_dec8x_done
nop
nop
Loop_cbc_dec8x_tail: # up to 7 "words" tail...
vncipher $out1,$out1,v24
vncipher $out2,$out2,v24
vncipher $out3,$out3,v24
vncipher $out4,$out4,v24
vncipher $out5,$out5,v24
vncipher $out6,$out6,v24
vncipher $out7,$out7,v24
lvx v24,$x20,$key_ # round[3]
addi $key_,$key_,0x20
vncipher $out1,$out1,v25
vncipher $out2,$out2,v25
vncipher $out3,$out3,v25
vncipher $out4,$out4,v25
vncipher $out5,$out5,v25
vncipher $out6,$out6,v25
vncipher $out7,$out7,v25
lvx v25,$x10,$key_ # round[4]
bdnz Loop_cbc_dec8x_tail
vncipher $out1,$out1,v24
vncipher $out2,$out2,v24
vncipher $out3,$out3,v24
vncipher $out4,$out4,v24
vncipher $out5,$out5,v24
vncipher $out6,$out6,v24
vncipher $out7,$out7,v24
vncipher $out1,$out1,v25
vncipher $out2,$out2,v25
vncipher $out3,$out3,v25
vncipher $out4,$out4,v25
vncipher $out5,$out5,v25
vncipher $out6,$out6,v25
vncipher $out7,$out7,v25
vncipher $out1,$out1,v26
vncipher $out2,$out2,v26
vncipher $out3,$out3,v26
vncipher $out4,$out4,v26
vncipher $out5,$out5,v26
vncipher $out6,$out6,v26
vncipher $out7,$out7,v26
vncipher $out1,$out1,v27
vncipher $out2,$out2,v27
vncipher $out3,$out3,v27
vncipher $out4,$out4,v27
vncipher $out5,$out5,v27
vncipher $out6,$out6,v27
vncipher $out7,$out7,v27
vncipher $out1,$out1,v28
vncipher $out2,$out2,v28
vncipher $out3,$out3,v28
vncipher $out4,$out4,v28
vncipher $out5,$out5,v28
vncipher $out6,$out6,v28
vncipher $out7,$out7,v28
vncipher $out1,$out1,v29
vncipher $out2,$out2,v29
vncipher $out3,$out3,v29
vncipher $out4,$out4,v29
vncipher $out5,$out5,v29
vncipher $out6,$out6,v29
vncipher $out7,$out7,v29
vncipher $out1,$out1,v30
vxor $ivec,$ivec,v31 # last round key
vncipher $out2,$out2,v30
vxor $in1,$in1,v31
vncipher $out3,$out3,v30
vxor $in2,$in2,v31
vncipher $out4,$out4,v30
vxor $in3,$in3,v31
vncipher $out5,$out5,v30
vxor $in4,$in4,v31
vncipher $out6,$out6,v30
vxor $in5,$in5,v31
vncipher $out7,$out7,v30
vxor $in6,$in6,v31
cmplwi $len,32 # switch($len)
blt Lcbc_dec8x_one
nop
beq Lcbc_dec8x_two
cmplwi $len,64
blt Lcbc_dec8x_three
nop
beq Lcbc_dec8x_four
cmplwi $len,96
blt Lcbc_dec8x_five
nop
beq Lcbc_dec8x_six
Lcbc_dec8x_seven:
vncipherlast $out1,$out1,$ivec
vncipherlast $out2,$out2,$in1
vncipherlast $out3,$out3,$in2
vncipherlast $out4,$out4,$in3
vncipherlast $out5,$out5,$in4
vncipherlast $out6,$out6,$in5
vncipherlast $out7,$out7,$in6
vmr $ivec,$in7
le?vperm $out1,$out1,$out1,$inpperm
le?vperm $out2,$out2,$out2,$inpperm
stvx_u $out1,$x00,$out
le?vperm $out3,$out3,$out3,$inpperm
stvx_u $out2,$x10,$out
le?vperm $out4,$out4,$out4,$inpperm
stvx_u $out3,$x20,$out
le?vperm $out5,$out5,$out5,$inpperm
stvx_u $out4,$x30,$out
le?vperm $out6,$out6,$out6,$inpperm
stvx_u $out5,$x40,$out
le?vperm $out7,$out7,$out7,$inpperm
stvx_u $out6,$x50,$out
stvx_u $out7,$x60,$out
addi $out,$out,0x70
b Lcbc_dec8x_done
.align 5
Lcbc_dec8x_six:
vncipherlast $out2,$out2,$ivec
vncipherlast $out3,$out3,$in2
vncipherlast $out4,$out4,$in3
vncipherlast $out5,$out5,$in4
vncipherlast $out6,$out6,$in5
vncipherlast $out7,$out7,$in6
vmr $ivec,$in7
le?vperm $out2,$out2,$out2,$inpperm
le?vperm $out3,$out3,$out3,$inpperm
stvx_u $out2,$x00,$out
le?vperm $out4,$out4,$out4,$inpperm
stvx_u $out3,$x10,$out
le?vperm $out5,$out5,$out5,$inpperm
stvx_u $out4,$x20,$out
le?vperm $out6,$out6,$out6,$inpperm
stvx_u $out5,$x30,$out
le?vperm $out7,$out7,$out7,$inpperm
stvx_u $out6,$x40,$out
stvx_u $out7,$x50,$out
addi $out,$out,0x60
b Lcbc_dec8x_done
.align 5
Lcbc_dec8x_five:
vncipherlast $out3,$out3,$ivec
vncipherlast $out4,$out4,$in3
vncipherlast $out5,$out5,$in4
vncipherlast $out6,$out6,$in5
vncipherlast $out7,$out7,$in6
vmr $ivec,$in7
le?vperm $out3,$out3,$out3,$inpperm
le?vperm $out4,$out4,$out4,$inpperm
stvx_u $out3,$x00,$out
le?vperm $out5,$out5,$out5,$inpperm
stvx_u $out4,$x10,$out
le?vperm $out6,$out6,$out6,$inpperm
stvx_u $out5,$x20,$out
le?vperm $out7,$out7,$out7,$inpperm
stvx_u $out6,$x30,$out
stvx_u $out7,$x40,$out
addi $out,$out,0x50
b Lcbc_dec8x_done
.align 5
Lcbc_dec8x_four:
vncipherlast $out4,$out4,$ivec
vncipherlast $out5,$out5,$in4
vncipherlast $out6,$out6,$in5
vncipherlast $out7,$out7,$in6
vmr $ivec,$in7
le?vperm $out4,$out4,$out4,$inpperm
le?vperm $out5,$out5,$out5,$inpperm
stvx_u $out4,$x00,$out
le?vperm $out6,$out6,$out6,$inpperm
stvx_u $out5,$x10,$out
le?vperm $out7,$out7,$out7,$inpperm
stvx_u $out6,$x20,$out
stvx_u $out7,$x30,$out
addi $out,$out,0x40
b Lcbc_dec8x_done
.align 5
Lcbc_dec8x_three:
vncipherlast $out5,$out5,$ivec
vncipherlast $out6,$out6,$in5
vncipherlast $out7,$out7,$in6
vmr $ivec,$in7
le?vperm $out5,$out5,$out5,$inpperm
le?vperm $out6,$out6,$out6,$inpperm
stvx_u $out5,$x00,$out
le?vperm $out7,$out7,$out7,$inpperm
stvx_u $out6,$x10,$out
stvx_u $out7,$x20,$out
addi $out,$out,0x30
b Lcbc_dec8x_done
.align 5
Lcbc_dec8x_two:
vncipherlast $out6,$out6,$ivec
vncipherlast $out7,$out7,$in6
vmr $ivec,$in7
le?vperm $out6,$out6,$out6,$inpperm
le?vperm $out7,$out7,$out7,$inpperm
stvx_u $out6,$x00,$out
stvx_u $out7,$x10,$out
addi $out,$out,0x20
b Lcbc_dec8x_done
.align 5
Lcbc_dec8x_one:
vncipherlast $out7,$out7,$ivec
vmr $ivec,$in7
le?vperm $out7,$out7,$out7,$inpperm
stvx_u $out7,0,$out
addi $out,$out,0x10
Lcbc_dec8x_done:
le?vperm $ivec,$ivec,$ivec,$inpperm
stvx_u $ivec,0,$ivp # write [unaligned] iv
li r10,`$FRAME+15`
li r11,`$FRAME+31`
stvx $inpperm,r10,$sp # wipe copies of round keys
addi r10,r10,32
stvx $inpperm,r11,$sp
addi r11,r11,32
stvx $inpperm,r10,$sp
addi r10,r10,32
stvx $inpperm,r11,$sp
addi r11,r11,32
stvx $inpperm,r10,$sp
addi r10,r10,32
stvx $inpperm,r11,$sp
addi r11,r11,32
stvx $inpperm,r10,$sp
addi r10,r10,32
stvx $inpperm,r11,$sp
addi r11,r11,32
mtspr 256,$vrsave
lvx v20,r10,$sp # ABI says so
addi r10,r10,32
lvx v21,r11,$sp
addi r11,r11,32
lvx v22,r10,$sp
addi r10,r10,32
lvx v23,r11,$sp
addi r11,r11,32
lvx v24,r10,$sp
addi r10,r10,32
lvx v25,r11,$sp
addi r11,r11,32
lvx v26,r10,$sp
addi r10,r10,32
lvx v27,r11,$sp
addi r11,r11,32
lvx v28,r10,$sp
addi r10,r10,32
lvx v29,r11,$sp
addi r11,r11,32
lvx v30,r10,$sp
lvx v31,r11,$sp
$POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
$POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
$POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
$POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
$POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
$POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
blr
.long 0
.byte 0,12,0x14,0,0x80,6,6,0
.long 0
.size .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
___
# Close the optimized-decrypt scope and the whole CBC section.
}} }}}
#########################################################################
{{{ # CTR procedure[s] #
####################### WARNING: Here be dragons! #######################
#
# This code is written as 'ctr32', based on a 32-bit counter used
# upstream. The kernel does *not* use a 32-bit counter. The kernel uses
# a 128-bit counter.
#
# This leads to subtle changes from the upstream code: the counter
# is incremented with vaddu_q_m rather than vaddu_w_m. This occurs in
# both the bulk (8 blocks at a time) path, and in the individual block
# path. Be aware of this when doing updates.
#
# See:
# 1d4aa0b4c181 ("crypto: vmx - Fixing AES-CTR counter bug")
# 009b30ac7444 ("crypto: vmx - CTR: always increment IV as quadword")
# https://github.com/openssl/openssl/pull/8942
#
#########################################################################
# Register map for the scalar CTR path.  r3..r10 carry the C calling
# convention arguments (inp, out, len, key, ivp); the remaining GPRs and
# VRs are scratch.  $one holds the 128-bit constant 1 used to advance the
# counter ($ivec) with vadduqm (quadword add -- see the warning above).
my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
map("v$_",(4..11));
my $dat=$tmp;
# Emit the scalar (one block per Loop_ctr32_enc iteration) entry point.
# When 8 or more blocks remain it branches straight into the bulk routine
# _aesp8_ctr32_encrypt8x emitted below.  The lvsl/lvsr + vperm sequences
# implement the classic AltiVec unaligned load/store dance, and the
# le?/be? prefixes select little-/big-endian variants at assembly time.
$code.=<<___;
.globl .${prefix}_ctr32_encrypt_blocks
${UCMP}i $len,1
bltlr-
lis r0,0xfff0
mfspr $vrsave,256
mtspr 256,r0
li $idx,15
vxor $rndkey0,$rndkey0,$rndkey0
le?vspltisb $tmp,0x0f
lvx $ivec,0,$ivp # load [unaligned] iv
lvsl $inpperm,0,$ivp
lvx $inptail,$idx,$ivp
vspltisb $one,1
le?vxor $inpperm,$inpperm,$tmp
vperm $ivec,$ivec,$inptail,$inpperm
vsldoi $one,$rndkey0,$one,1
neg r11,$inp
?lvsl $keyperm,0,$key # prepare for unaligned key
lwz $rounds,240($key)
lvsr $inpperm,0,r11 # prepare for unaligned load
lvx $inptail,0,$inp
addi $inp,$inp,15 # 15 is not typo
le?vxor $inpperm,$inpperm,$tmp
srwi $rounds,$rounds,1
li $idx,16
subi $rounds,$rounds,1
${UCMP}i $len,8
bge _aesp8_ctr32_encrypt8x
?lvsr $outperm,0,$out # prepare for unaligned store
vspltisb $outmask,-1
lvx $outhead,0,$out
?vperm $outmask,$rndkey0,$outmask,$outperm
le?vxor $outperm,$outperm,$tmp
lvx $rndkey0,0,$key
mtctr $rounds
lvx $rndkey1,$idx,$key
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $inout,$ivec,$rndkey0
lvx $rndkey0,$idx,$key
addi $idx,$idx,16
b Loop_ctr32_enc
.align 5
Loop_ctr32_enc:
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vcipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vcipher $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key
addi $idx,$idx,16
bdnz Loop_ctr32_enc
vadduqm $ivec,$ivec,$one # Kernel change for 128-bit
vmr $dat,$inptail
lvx $inptail,0,$inp
addi $inp,$inp,16
subic. $len,$len,1 # blocks--
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vcipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key
vperm $dat,$dat,$inptail,$inpperm
li $idx,16
?vperm $rndkey1,$rndkey0,$rndkey1,$keyperm
lvx $rndkey0,0,$key
vxor $dat,$dat,$rndkey1 # last round key
vcipherlast $inout,$inout,$dat
lvx $rndkey1,$idx,$key
addi $idx,$idx,16
vperm $inout,$inout,$inout,$outperm
vsel $dat,$outhead,$inout,$outmask
mtctr $rounds
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vmr $outhead,$inout
vxor $inout,$ivec,$rndkey0
lvx $rndkey0,$idx,$key
addi $idx,$idx,16
stvx $dat,0,$out
addi $out,$out,16
bne Loop_ctr32_enc
addi $out,$out,-1
lvx $inout,0,$out # redundant in aligned case
vsel $inout,$outhead,$inout,$outmask
stvx $inout,0,$out
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,6,0
.long 0
___
#########################################################################
{{ # Optimized CTR procedure #
# Register map for the bulk (8-blocks-per-iteration) CTR path.  $key_
# points at an on-stack copy of the aligned round keys; $x00..$x70 are
# the 0x00..0x70 byte offsets used with the indexed vector loads/stores.
my $key_="r11";
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
# v26-v31 last 6 round keys
my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
my ($two,$three,$four)=($outhead,$outperm,$outmask);
# Emit the bulk routine reached from ctr32_encrypt_blocks when $len>=8
# blocks.  It spills the nonvolatile VRs/GPRs per the ABI, pre-aligns the
# key schedule onto the stack, keeps 8 counter values in flight, and at
# the end dispatches on the residual count (Lctr32_enc8x_one..eight)
# before wiping the on-stack key copies and restoring state.  Counter
# increments use vadduqm (128-bit) -- see the kernel warning above.
$code.=<<___;
.align 5
_aesp8_ctr32_encrypt8x:
$STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
li r10,`$FRAME+8*16+15`
li r11,`$FRAME+8*16+31`
stvx v20,r10,$sp # ABI says so
addi r10,r10,32
stvx v21,r11,$sp
addi r11,r11,32
stvx v22,r10,$sp
addi r10,r10,32
stvx v23,r11,$sp
addi r11,r11,32
stvx v24,r10,$sp
addi r10,r10,32
stvx v25,r11,$sp
addi r11,r11,32
stvx v26,r10,$sp
addi r10,r10,32
stvx v27,r11,$sp
addi r11,r11,32
stvx v28,r10,$sp
addi r10,r10,32
stvx v29,r11,$sp
addi r11,r11,32
stvx v30,r10,$sp
stvx v31,r11,$sp
li r0,-1
stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
li $x10,0x10
$PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
li $x20,0x20
$PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
li $x30,0x30
$PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
li $x40,0x40
$PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
li $x50,0x50
$PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
li $x60,0x60
$PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
li $x70,0x70
mtspr 256,r0
subi $rounds,$rounds,3 # -4 in total
lvx $rndkey0,$x00,$key # load key schedule
lvx v30,$x10,$key
addi $key,$key,0x20
lvx v31,$x00,$key
?vperm $rndkey0,$rndkey0,v30,$keyperm
addi $key_,$sp,$FRAME+15
mtctr $rounds
Load_ctr32_enc_key:
?vperm v24,v30,v31,$keyperm
lvx v30,$x10,$key
addi $key,$key,0x20
stvx v24,$x00,$key_ # off-load round[1]
?vperm v25,v31,v30,$keyperm
lvx v31,$x00,$key
stvx v25,$x10,$key_ # off-load round[2]
addi $key_,$key_,0x20
bdnz Load_ctr32_enc_key
lvx v26,$x10,$key
?vperm v24,v30,v31,$keyperm
lvx v27,$x20,$key
stvx v24,$x00,$key_ # off-load round[3]
?vperm v25,v31,v26,$keyperm
lvx v28,$x30,$key
stvx v25,$x10,$key_ # off-load round[4]
addi $key_,$sp,$FRAME+15 # rewind $key_
?vperm v26,v26,v27,$keyperm
lvx v29,$x40,$key
?vperm v27,v27,v28,$keyperm
lvx v30,$x50,$key
?vperm v28,v28,v29,$keyperm
lvx v31,$x60,$key
?vperm v29,v29,v30,$keyperm
lvx $out0,$x70,$key # borrow $out0
?vperm v30,v30,v31,$keyperm
lvx v24,$x00,$key_ # pre-load round[1]
?vperm v31,v31,$out0,$keyperm
lvx v25,$x10,$key_ # pre-load round[2]
vadduqm $two,$one,$one
subi $inp,$inp,15 # undo "caller"
$SHL $len,$len,4
vadduqm $out1,$ivec,$one # counter values ...
vadduqm $out2,$ivec,$two # (do all ctr adds as 128-bit)
vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0]
le?li $idx,8
vadduqm $out3,$out1,$two
vxor $out1,$out1,$rndkey0
le?lvsl $inpperm,0,$idx
vadduqm $out4,$out2,$two
vxor $out2,$out2,$rndkey0
le?vspltisb $tmp,0x0f
vadduqm $out5,$out3,$two
vxor $out3,$out3,$rndkey0
le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u
vadduqm $out6,$out4,$two
vxor $out4,$out4,$rndkey0
vadduqm $out7,$out5,$two
vxor $out5,$out5,$rndkey0
vadduqm $ivec,$out6,$two # next counter value
vxor $out6,$out6,$rndkey0
vxor $out7,$out7,$rndkey0
mtctr $rounds
b Loop_ctr32_enc8x
.align 5
Loop_ctr32_enc8x:
vcipher $out0,$out0,v24
vcipher $out1,$out1,v24
vcipher $out2,$out2,v24
vcipher $out3,$out3,v24
vcipher $out4,$out4,v24
vcipher $out5,$out5,v24
vcipher $out6,$out6,v24
vcipher $out7,$out7,v24
Loop_ctr32_enc8x_middle:
lvx v24,$x20,$key_ # round[3]
addi $key_,$key_,0x20
vcipher $out0,$out0,v25
vcipher $out1,$out1,v25
vcipher $out2,$out2,v25
vcipher $out3,$out3,v25
vcipher $out4,$out4,v25
vcipher $out5,$out5,v25
vcipher $out6,$out6,v25
vcipher $out7,$out7,v25
lvx v25,$x10,$key_ # round[4]
bdnz Loop_ctr32_enc8x
subic r11,$len,256 # $len-256, borrow $key_
vcipher $out0,$out0,v24
vcipher $out1,$out1,v24
vcipher $out2,$out2,v24
vcipher $out3,$out3,v24
vcipher $out4,$out4,v24
vcipher $out5,$out5,v24
vcipher $out6,$out6,v24
vcipher $out7,$out7,v24
subfe r0,r0,r0 # borrow?-1:0
vcipher $out0,$out0,v25
vcipher $out1,$out1,v25
vcipher $out2,$out2,v25
vcipher $out3,$out3,v25
vcipher $out4,$out4,v25
vcipher $out5,$out5,v25
vcipher $out6,$out6,v25
vcipher $out7,$out7,v25
and r0,r0,r11
addi $key_,$sp,$FRAME+15 # rewind $key_
vcipher $out0,$out0,v26
vcipher $out1,$out1,v26
vcipher $out2,$out2,v26
vcipher $out3,$out3,v26
vcipher $out4,$out4,v26
vcipher $out5,$out5,v26
vcipher $out6,$out6,v26
vcipher $out7,$out7,v26
lvx v24,$x00,$key_ # re-pre-load round[1]
subic $len,$len,129 # $len-=129
vcipher $out0,$out0,v27
addi $len,$len,1 # $len-=128 really
vcipher $out1,$out1,v27
vcipher $out2,$out2,v27
vcipher $out3,$out3,v27
vcipher $out4,$out4,v27
vcipher $out5,$out5,v27
vcipher $out6,$out6,v27
vcipher $out7,$out7,v27
lvx v25,$x10,$key_ # re-pre-load round[2]
vcipher $out0,$out0,v28
lvx_u $in0,$x00,$inp # load input
vcipher $out1,$out1,v28
lvx_u $in1,$x10,$inp
vcipher $out2,$out2,v28
lvx_u $in2,$x20,$inp
vcipher $out3,$out3,v28
lvx_u $in3,$x30,$inp
vcipher $out4,$out4,v28
lvx_u $in4,$x40,$inp
vcipher $out5,$out5,v28
lvx_u $in5,$x50,$inp
vcipher $out6,$out6,v28
lvx_u $in6,$x60,$inp
vcipher $out7,$out7,v28
lvx_u $in7,$x70,$inp
addi $inp,$inp,0x80
vcipher $out0,$out0,v29
le?vperm $in0,$in0,$in0,$inpperm
vcipher $out1,$out1,v29
le?vperm $in1,$in1,$in1,$inpperm
vcipher $out2,$out2,v29
le?vperm $in2,$in2,$in2,$inpperm
vcipher $out3,$out3,v29
le?vperm $in3,$in3,$in3,$inpperm
vcipher $out4,$out4,v29
le?vperm $in4,$in4,$in4,$inpperm
vcipher $out5,$out5,v29
le?vperm $in5,$in5,$in5,$inpperm
vcipher $out6,$out6,v29
le?vperm $in6,$in6,$in6,$inpperm
vcipher $out7,$out7,v29
le?vperm $in7,$in7,$in7,$inpperm
add $inp,$inp,r0 # $inp is adjusted in such
# way that at exit from the
# loop inX-in7 are loaded
# with last "words"
subfe. r0,r0,r0 # borrow?-1:0
vcipher $out0,$out0,v30
vxor $in0,$in0,v31 # xor with last round key
vcipher $out1,$out1,v30
vxor $in1,$in1,v31
vcipher $out2,$out2,v30
vxor $in2,$in2,v31
vcipher $out3,$out3,v30
vxor $in3,$in3,v31
vcipher $out4,$out4,v30
vxor $in4,$in4,v31
vcipher $out5,$out5,v30
vxor $in5,$in5,v31
vcipher $out6,$out6,v30
vxor $in6,$in6,v31
vcipher $out7,$out7,v30
vxor $in7,$in7,v31
bne Lctr32_enc8x_break # did $len-129 borrow?
vcipherlast $in0,$out0,$in0
vcipherlast $in1,$out1,$in1
vadduqm $out1,$ivec,$one # counter values ...
vcipherlast $in2,$out2,$in2
vadduqm $out2,$ivec,$two
vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0]
vcipherlast $in3,$out3,$in3
vadduqm $out3,$out1,$two
vxor $out1,$out1,$rndkey0
vcipherlast $in4,$out4,$in4
vadduqm $out4,$out2,$two
vxor $out2,$out2,$rndkey0
vcipherlast $in5,$out5,$in5
vadduqm $out5,$out3,$two
vxor $out3,$out3,$rndkey0
vcipherlast $in6,$out6,$in6
vadduqm $out6,$out4,$two
vxor $out4,$out4,$rndkey0
vcipherlast $in7,$out7,$in7
vadduqm $out7,$out5,$two
vxor $out5,$out5,$rndkey0
le?vperm $in0,$in0,$in0,$inpperm
vadduqm $ivec,$out6,$two # next counter value
vxor $out6,$out6,$rndkey0
le?vperm $in1,$in1,$in1,$inpperm
vxor $out7,$out7,$rndkey0
mtctr $rounds
vcipher $out0,$out0,v24
stvx_u $in0,$x00,$out
le?vperm $in2,$in2,$in2,$inpperm
vcipher $out1,$out1,v24
stvx_u $in1,$x10,$out
le?vperm $in3,$in3,$in3,$inpperm
vcipher $out2,$out2,v24
stvx_u $in2,$x20,$out
le?vperm $in4,$in4,$in4,$inpperm
vcipher $out3,$out3,v24
stvx_u $in3,$x30,$out
le?vperm $in5,$in5,$in5,$inpperm
vcipher $out4,$out4,v24
stvx_u $in4,$x40,$out
le?vperm $in6,$in6,$in6,$inpperm
vcipher $out5,$out5,v24
stvx_u $in5,$x50,$out
le?vperm $in7,$in7,$in7,$inpperm
vcipher $out6,$out6,v24
stvx_u $in6,$x60,$out
vcipher $out7,$out7,v24
stvx_u $in7,$x70,$out
addi $out,$out,0x80
b Loop_ctr32_enc8x_middle
.align 5
Lctr32_enc8x_break:
cmpwi $len,-0x60
blt Lctr32_enc8x_one
nop
beq Lctr32_enc8x_two
cmpwi $len,-0x40
blt Lctr32_enc8x_three
nop
beq Lctr32_enc8x_four
cmpwi $len,-0x20
blt Lctr32_enc8x_five
nop
beq Lctr32_enc8x_six
cmpwi $len,0x00
blt Lctr32_enc8x_seven
Lctr32_enc8x_eight:
vcipherlast $out0,$out0,$in0
vcipherlast $out1,$out1,$in1
vcipherlast $out2,$out2,$in2
vcipherlast $out3,$out3,$in3
vcipherlast $out4,$out4,$in4
vcipherlast $out5,$out5,$in5
vcipherlast $out6,$out6,$in6
vcipherlast $out7,$out7,$in7
le?vperm $out0,$out0,$out0,$inpperm
le?vperm $out1,$out1,$out1,$inpperm
stvx_u $out0,$x00,$out
le?vperm $out2,$out2,$out2,$inpperm
stvx_u $out1,$x10,$out
le?vperm $out3,$out3,$out3,$inpperm
stvx_u $out2,$x20,$out
le?vperm $out4,$out4,$out4,$inpperm
stvx_u $out3,$x30,$out
le?vperm $out5,$out5,$out5,$inpperm
stvx_u $out4,$x40,$out
le?vperm $out6,$out6,$out6,$inpperm
stvx_u $out5,$x50,$out
le?vperm $out7,$out7,$out7,$inpperm
stvx_u $out6,$x60,$out
stvx_u $out7,$x70,$out
addi $out,$out,0x80
b Lctr32_enc8x_done
.align 5
Lctr32_enc8x_seven:
vcipherlast $out0,$out0,$in1
vcipherlast $out1,$out1,$in2
vcipherlast $out2,$out2,$in3
vcipherlast $out3,$out3,$in4
vcipherlast $out4,$out4,$in5
vcipherlast $out5,$out5,$in6
vcipherlast $out6,$out6,$in7
le?vperm $out0,$out0,$out0,$inpperm
le?vperm $out1,$out1,$out1,$inpperm
stvx_u $out0,$x00,$out
le?vperm $out2,$out2,$out2,$inpperm
stvx_u $out1,$x10,$out
le?vperm $out3,$out3,$out3,$inpperm
stvx_u $out2,$x20,$out
le?vperm $out4,$out4,$out4,$inpperm
stvx_u $out3,$x30,$out
le?vperm $out5,$out5,$out5,$inpperm
stvx_u $out4,$x40,$out
le?vperm $out6,$out6,$out6,$inpperm
stvx_u $out5,$x50,$out
stvx_u $out6,$x60,$out
addi $out,$out,0x70
b Lctr32_enc8x_done
.align 5
Lctr32_enc8x_six:
vcipherlast $out0,$out0,$in2
vcipherlast $out1,$out1,$in3
vcipherlast $out2,$out2,$in4
vcipherlast $out3,$out3,$in5
vcipherlast $out4,$out4,$in6
vcipherlast $out5,$out5,$in7
le?vperm $out0,$out0,$out0,$inpperm
le?vperm $out1,$out1,$out1,$inpperm
stvx_u $out0,$x00,$out
le?vperm $out2,$out2,$out2,$inpperm
stvx_u $out1,$x10,$out
le?vperm $out3,$out3,$out3,$inpperm
stvx_u $out2,$x20,$out
le?vperm $out4,$out4,$out4,$inpperm
stvx_u $out3,$x30,$out
le?vperm $out5,$out5,$out5,$inpperm
stvx_u $out4,$x40,$out
stvx_u $out5,$x50,$out
addi $out,$out,0x60
b Lctr32_enc8x_done
.align 5
Lctr32_enc8x_five:
vcipherlast $out0,$out0,$in3
vcipherlast $out1,$out1,$in4
vcipherlast $out2,$out2,$in5
vcipherlast $out3,$out3,$in6
vcipherlast $out4,$out4,$in7
le?vperm $out0,$out0,$out0,$inpperm
le?vperm $out1,$out1,$out1,$inpperm
stvx_u $out0,$x00,$out
le?vperm $out2,$out2,$out2,$inpperm
stvx_u $out1,$x10,$out
le?vperm $out3,$out3,$out3,$inpperm
stvx_u $out2,$x20,$out
le?vperm $out4,$out4,$out4,$inpperm
stvx_u $out3,$x30,$out
stvx_u $out4,$x40,$out
addi $out,$out,0x50
b Lctr32_enc8x_done
.align 5
Lctr32_enc8x_four:
vcipherlast $out0,$out0,$in4
vcipherlast $out1,$out1,$in5
vcipherlast $out2,$out2,$in6
vcipherlast $out3,$out3,$in7
le?vperm $out0,$out0,$out0,$inpperm
le?vperm $out1,$out1,$out1,$inpperm
stvx_u $out0,$x00,$out
le?vperm $out2,$out2,$out2,$inpperm
stvx_u $out1,$x10,$out
le?vperm $out3,$out3,$out3,$inpperm
stvx_u $out2,$x20,$out
stvx_u $out3,$x30,$out
addi $out,$out,0x40
b Lctr32_enc8x_done
.align 5
Lctr32_enc8x_three:
vcipherlast $out0,$out0,$in5
vcipherlast $out1,$out1,$in6
vcipherlast $out2,$out2,$in7
le?vperm $out0,$out0,$out0,$inpperm
le?vperm $out1,$out1,$out1,$inpperm
stvx_u $out0,$x00,$out
le?vperm $out2,$out2,$out2,$inpperm
stvx_u $out1,$x10,$out
stvx_u $out2,$x20,$out
addi $out,$out,0x30
b Lctr32_enc8x_done
.align 5
Lctr32_enc8x_two:
vcipherlast $out0,$out0,$in6
vcipherlast $out1,$out1,$in7
le?vperm $out0,$out0,$out0,$inpperm
le?vperm $out1,$out1,$out1,$inpperm
stvx_u $out0,$x00,$out
stvx_u $out1,$x10,$out
addi $out,$out,0x20
b Lctr32_enc8x_done
.align 5
Lctr32_enc8x_one:
vcipherlast $out0,$out0,$in7
le?vperm $out0,$out0,$out0,$inpperm
stvx_u $out0,0,$out
addi $out,$out,0x10
Lctr32_enc8x_done:
li r10,`$FRAME+15`
li r11,`$FRAME+31`
stvx $inpperm,r10,$sp # wipe copies of round keys
addi r10,r10,32
stvx $inpperm,r11,$sp
addi r11,r11,32
stvx $inpperm,r10,$sp
addi r10,r10,32
stvx $inpperm,r11,$sp
addi r11,r11,32
stvx $inpperm,r10,$sp
addi r10,r10,32
stvx $inpperm,r11,$sp
addi r11,r11,32
stvx $inpperm,r10,$sp
addi r10,r10,32
stvx $inpperm,r11,$sp
addi r11,r11,32
mtspr 256,$vrsave
lvx v20,r10,$sp # ABI says so
addi r10,r10,32
lvx v21,r11,$sp
addi r11,r11,32
lvx v22,r10,$sp
addi r10,r10,32
lvx v23,r11,$sp
addi r11,r11,32
lvx v24,r10,$sp
addi r10,r10,32
lvx v25,r11,$sp
addi r11,r11,32
lvx v26,r10,$sp
addi r10,r10,32
lvx v27,r11,$sp
addi r11,r11,32
lvx v28,r10,$sp
addi r10,r10,32
lvx v29,r11,$sp
addi r11,r11,32
lvx v30,r10,$sp
lvx v31,r11,$sp
$POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
$POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
$POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
$POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
$POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
$POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
blr
.long 0
.byte 0,12,0x14,0,0x80,6,6,0
.long 0
.size .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
___
}} }}}
#########################################################################
{{{ # XTS procedures #
# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len, #
# const AES_KEY *key1, const AES_KEY *key2, #
# [const] unsigned char iv[16]); #
# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which #
# input tweak value is assumed to be encrypted already, and last tweak #
# value, one suitable for consecutive call on same chunk of data, is #
# written back to original buffer. In addition, in "tweak chaining" #
# mode only complete input blocks are processed. #
# Register map for the scalar XTS paths.  $taillen reuses $key2's GPR
# once the tweak has been computed; $eighty7 holds the GF(2^128)
# reduction constant 0x870101..01 used to advance the tweak.
my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) = map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout) = map("v$_",(0..2));
my ($output,$inptail,$inpperm,$leperm,$keyperm) = map("v$_",(3..7));
my ($tweak,$seven,$eighty7,$tmp,$tweak1) = map("v$_",(8..12));
my $taillen = $key2;
($inp,$idx) = ($idx,$inp); # reassign
# Emit the scalar XTS encrypt and decrypt entry points.  Each encrypts
# the IV under $key2 to form the initial tweak (unless $key2 is NULL,
# i.e. "tweak chaining"), processes one block per loop iteration, and
# implements ciphertext stealing for a partial final block via the
# Loop_xts_*_steal byte-copy loops.  Lengths of 96 bytes or more branch
# to the 6x-unrolled routines emitted further below.
$code.=<<___;
.globl .${prefix}_xts_encrypt
mr $inp,r3 # reassign
li r3,-1
${UCMP}i $len,16
bltlr-
lis r0,0xfff0
mfspr r12,256 # save vrsave
li r11,0
mtspr 256,r0
vspltisb $seven,0x07 # 0x070707..07
le?lvsl $leperm,r11,r11
le?vspltisb $tmp,0x0f
le?vxor $leperm,$leperm,$seven
li $idx,15
lvx $tweak,0,$ivp # load [unaligned] iv
lvsl $inpperm,0,$ivp
lvx $inptail,$idx,$ivp
le?vxor $inpperm,$inpperm,$tmp
vperm $tweak,$tweak,$inptail,$inpperm
neg r11,$inp
lvsr $inpperm,0,r11 # prepare for unaligned load
lvx $inout,0,$inp
addi $inp,$inp,15 # 15 is not typo
le?vxor $inpperm,$inpperm,$tmp
${UCMP}i $key2,0 # key2==NULL?
beq Lxts_enc_no_key2
?lvsl $keyperm,0,$key2 # prepare for unaligned key
lwz $rounds,240($key2)
srwi $rounds,$rounds,1
subi $rounds,$rounds,1
li $idx,16
lvx $rndkey0,0,$key2
lvx $rndkey1,$idx,$key2
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $tweak,$tweak,$rndkey0
lvx $rndkey0,$idx,$key2
addi $idx,$idx,16
mtctr $rounds
Ltweak_xts_enc:
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vcipher $tweak,$tweak,$rndkey1
lvx $rndkey1,$idx,$key2
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vcipher $tweak,$tweak,$rndkey0
lvx $rndkey0,$idx,$key2
addi $idx,$idx,16
bdnz Ltweak_xts_enc
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vcipher $tweak,$tweak,$rndkey1
lvx $rndkey1,$idx,$key2
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vcipherlast $tweak,$tweak,$rndkey0
li $ivp,0 # don't chain the tweak
b Lxts_enc
Lxts_enc_no_key2:
li $idx,-16
and $len,$len,$idx # in "tweak chaining"
# mode only complete
# blocks are processed
Lxts_enc:
lvx $inptail,0,$inp
addi $inp,$inp,16
?lvsl $keyperm,0,$key1 # prepare for unaligned key
lwz $rounds,240($key1)
srwi $rounds,$rounds,1
subi $rounds,$rounds,1
li $idx,16
vslb $eighty7,$seven,$seven # 0x808080..80
vor $eighty7,$eighty7,$seven # 0x878787..87
vspltisb $tmp,1 # 0x010101..01
vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01
${UCMP}i $len,96
bge _aesp8_xts_encrypt6x
andi. $taillen,$len,15
subic r0,$len,32
subi $taillen,$taillen,16
subfe r0,r0,r0
and r0,r0,$taillen
add $inp,$inp,r0
lvx $rndkey0,0,$key1
lvx $rndkey1,$idx,$key1
addi $idx,$idx,16
vperm $inout,$inout,$inptail,$inpperm
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $inout,$inout,$tweak
vxor $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key1
addi $idx,$idx,16
mtctr $rounds
b Loop_xts_enc
.align 5
Loop_xts_enc:
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vcipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key1
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vcipher $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key1
addi $idx,$idx,16
bdnz Loop_xts_enc
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vcipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key1
li $idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $rndkey0,$rndkey0,$tweak
vcipherlast $output,$inout,$rndkey0
le?vperm $tmp,$output,$output,$leperm
be?nop
le?stvx_u $tmp,0,$out
be?stvx_u $output,0,$out
addi $out,$out,16
subic. $len,$len,16
beq Lxts_enc_done
vmr $inout,$inptail
lvx $inptail,0,$inp
addi $inp,$inp,16
lvx $rndkey0,0,$key1
lvx $rndkey1,$idx,$key1
addi $idx,$idx,16
subic r0,$len,32
subfe r0,r0,r0
and r0,r0,$taillen
add $inp,$inp,r0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vand $tmp,$tmp,$eighty7
vxor $tweak,$tweak,$tmp
vperm $inout,$inout,$inptail,$inpperm
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $inout,$inout,$tweak
vxor $output,$output,$rndkey0 # just in case $len<16
vxor $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key1
addi $idx,$idx,16
mtctr $rounds
${UCMP}i $len,16
bge Loop_xts_enc
vxor $output,$output,$tweak
lvsr $inpperm,0,$len # $inpperm is no longer needed
vxor $inptail,$inptail,$inptail # $inptail is no longer needed
vspltisb $tmp,-1
vperm $inptail,$inptail,$tmp,$inpperm
vsel $inout,$inout,$output,$inptail
subi r11,$out,17
subi $out,$out,16
mtctr $len
li $len,16
Loop_xts_enc_steal:
lbzu r0,1(r11)
stb r0,16(r11)
bdnz Loop_xts_enc_steal
mtctr $rounds
b Loop_xts_enc # one more time...
Lxts_enc_done:
${UCMP}i $ivp,0
beq Lxts_enc_ret
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vand $tmp,$tmp,$eighty7
vxor $tweak,$tweak,$tmp
le?vperm $tweak,$tweak,$tweak,$leperm
stvx_u $tweak,0,$ivp
Lxts_enc_ret:
mtspr 256,r12 # restore vrsave
li r3,0
blr
.long 0
.byte 0,12,0x04,0,0x80,6,6,0
.long 0
.size .${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
.globl .${prefix}_xts_decrypt
mr $inp,r3 # reassign
li r3,-1
${UCMP}i $len,16
bltlr-
lis r0,0xfff8
mfspr r12,256 # save vrsave
li r11,0
mtspr 256,r0
andi. r0,$len,15
neg r0,r0
andi. r0,r0,16
sub $len,$len,r0
vspltisb $seven,0x07 # 0x070707..07
le?lvsl $leperm,r11,r11
le?vspltisb $tmp,0x0f
le?vxor $leperm,$leperm,$seven
li $idx,15
lvx $tweak,0,$ivp # load [unaligned] iv
lvsl $inpperm,0,$ivp
lvx $inptail,$idx,$ivp
le?vxor $inpperm,$inpperm,$tmp
vperm $tweak,$tweak,$inptail,$inpperm
neg r11,$inp
lvsr $inpperm,0,r11 # prepare for unaligned load
lvx $inout,0,$inp
addi $inp,$inp,15 # 15 is not typo
le?vxor $inpperm,$inpperm,$tmp
${UCMP}i $key2,0 # key2==NULL?
beq Lxts_dec_no_key2
?lvsl $keyperm,0,$key2 # prepare for unaligned key
lwz $rounds,240($key2)
srwi $rounds,$rounds,1
subi $rounds,$rounds,1
li $idx,16
lvx $rndkey0,0,$key2
lvx $rndkey1,$idx,$key2
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $tweak,$tweak,$rndkey0
lvx $rndkey0,$idx,$key2
addi $idx,$idx,16
mtctr $rounds
Ltweak_xts_dec:
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vcipher $tweak,$tweak,$rndkey1
lvx $rndkey1,$idx,$key2
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vcipher $tweak,$tweak,$rndkey0
lvx $rndkey0,$idx,$key2
addi $idx,$idx,16
bdnz Ltweak_xts_dec
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vcipher $tweak,$tweak,$rndkey1
lvx $rndkey1,$idx,$key2
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vcipherlast $tweak,$tweak,$rndkey0
li $ivp,0 # don't chain the tweak
b Lxts_dec
Lxts_dec_no_key2:
neg $idx,$len
andi. $idx,$idx,15
add $len,$len,$idx # in "tweak chaining"
# mode only complete
# blocks are processed
Lxts_dec:
lvx $inptail,0,$inp
addi $inp,$inp,16
?lvsl $keyperm,0,$key1 # prepare for unaligned key
lwz $rounds,240($key1)
srwi $rounds,$rounds,1
subi $rounds,$rounds,1
li $idx,16
vslb $eighty7,$seven,$seven # 0x808080..80
vor $eighty7,$eighty7,$seven # 0x878787..87
vspltisb $tmp,1 # 0x010101..01
vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01
${UCMP}i $len,96
bge _aesp8_xts_decrypt6x
lvx $rndkey0,0,$key1
lvx $rndkey1,$idx,$key1
addi $idx,$idx,16
vperm $inout,$inout,$inptail,$inpperm
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $inout,$inout,$tweak
vxor $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key1
addi $idx,$idx,16
mtctr $rounds
${UCMP}i $len,16
blt Ltail_xts_dec
be?b Loop_xts_dec
.align 5
Loop_xts_dec:
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vncipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key1
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vncipher $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key1
addi $idx,$idx,16
bdnz Loop_xts_dec
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vncipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key1
li $idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $rndkey0,$rndkey0,$tweak
vncipherlast $output,$inout,$rndkey0
le?vperm $tmp,$output,$output,$leperm
be?nop
le?stvx_u $tmp,0,$out
be?stvx_u $output,0,$out
addi $out,$out,16
subic. $len,$len,16
beq Lxts_dec_done
vmr $inout,$inptail
lvx $inptail,0,$inp
addi $inp,$inp,16
lvx $rndkey0,0,$key1
lvx $rndkey1,$idx,$key1
addi $idx,$idx,16
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vand $tmp,$tmp,$eighty7
vxor $tweak,$tweak,$tmp
vperm $inout,$inout,$inptail,$inpperm
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $inout,$inout,$tweak
vxor $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key1
addi $idx,$idx,16
mtctr $rounds
${UCMP}i $len,16
bge Loop_xts_dec
Ltail_xts_dec:
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak1,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vand $tmp,$tmp,$eighty7
vxor $tweak1,$tweak1,$tmp
subi $inp,$inp,16
add $inp,$inp,$len
vxor $inout,$inout,$tweak # :-(
vxor $inout,$inout,$tweak1 # :-)
Loop_xts_dec_short:
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vncipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key1
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vncipher $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key1
addi $idx,$idx,16
bdnz Loop_xts_dec_short
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vncipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key1
li $idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $rndkey0,$rndkey0,$tweak1
vncipherlast $output,$inout,$rndkey0
le?vperm $tmp,$output,$output,$leperm
be?nop
le?stvx_u $tmp,0,$out
be?stvx_u $output,0,$out
vmr $inout,$inptail
lvx $inptail,0,$inp
#addi $inp,$inp,16
lvx $rndkey0,0,$key1
lvx $rndkey1,$idx,$key1
addi $idx,$idx,16
vperm $inout,$inout,$inptail,$inpperm
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
lvsr $inpperm,0,$len # $inpperm is no longer needed
vxor $inptail,$inptail,$inptail # $inptail is no longer needed
vspltisb $tmp,-1
vperm $inptail,$inptail,$tmp,$inpperm
vsel $inout,$inout,$output,$inptail
vxor $rndkey0,$rndkey0,$tweak
vxor $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key1
addi $idx,$idx,16
subi r11,$out,1
mtctr $len
li $len,16
Loop_xts_dec_steal:
lbzu r0,1(r11)
stb r0,16(r11)
bdnz Loop_xts_dec_steal
mtctr $rounds
b Loop_xts_dec # one more time...
Lxts_dec_done:
${UCMP}i $ivp,0
beq Lxts_dec_ret
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vand $tmp,$tmp,$eighty7
vxor $tweak,$tweak,$tmp
le?vperm $tweak,$tweak,$tweak,$leperm
stvx_u $tweak,0,$ivp
Lxts_dec_ret:
mtspr 256,r12 # restore vrsave
li r3,0
blr
.long 0
.byte 0,12,0x04,0,0x80,6,6,0
.long 0
.size .${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
___
#########################################################################
{{ # Optimized XTS procedures #
# Register allocation for the 6x-interleaved XTS encrypt/decrypt paths.
# NOTE(review): these lexicals are interpolated into the $code heredoc
# that follows; changing any value changes the emitted assembly.
my $key_=$key2;
# GPRs holding the byte offsets 0x00..0x70 used to address eight
# consecutive 16-byte blocks relative to $inp/$out/$key.
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
$x00=0 if ($flavour =~ /osx/); # OS X assembler: literal 0, not r0
# Six input blocks, six output blocks, six tweak values — one VR each.
my ($in0, $in1, $in2, $in3, $in4, $in5 )=map("v$_",(0..5));
my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
# v26-v31 last 6 round keys
my ($keyperm)=($out0); # aliases with "caller", redundant assignment
my $taillen=$x70; # tail length ($len & 15) for ciphertext stealing
$code.=<<___;
.align 5
_aesp8_xts_encrypt6x:
$STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
mflr r11
li r7,`$FRAME+8*16+15`
li r3,`$FRAME+8*16+31`
$PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
stvx v20,r7,$sp # ABI says so
addi r7,r7,32
stvx v21,r3,$sp
addi r3,r3,32
stvx v22,r7,$sp
addi r7,r7,32
stvx v23,r3,$sp
addi r3,r3,32
stvx v24,r7,$sp
addi r7,r7,32
stvx v25,r3,$sp
addi r3,r3,32
stvx v26,r7,$sp
addi r7,r7,32
stvx v27,r3,$sp
addi r3,r3,32
stvx v28,r7,$sp
addi r7,r7,32
stvx v29,r3,$sp
addi r3,r3,32
stvx v30,r7,$sp
stvx v31,r3,$sp
li r0,-1
stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
li $x10,0x10
$PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
li $x20,0x20
$PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
li $x30,0x30
$PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
li $x40,0x40
$PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
li $x50,0x50
$PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
li $x60,0x60
$PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
li $x70,0x70
mtspr 256,r0
subi $rounds,$rounds,3 # -4 in total
lvx $rndkey0,$x00,$key1 # load key schedule
lvx v30,$x10,$key1
addi $key1,$key1,0x20
lvx v31,$x00,$key1
?vperm $rndkey0,$rndkey0,v30,$keyperm
addi $key_,$sp,$FRAME+15
mtctr $rounds
Load_xts_enc_key:
?vperm v24,v30,v31,$keyperm
lvx v30,$x10,$key1
addi $key1,$key1,0x20
stvx v24,$x00,$key_ # off-load round[1]
?vperm v25,v31,v30,$keyperm
lvx v31,$x00,$key1
stvx v25,$x10,$key_ # off-load round[2]
addi $key_,$key_,0x20
bdnz Load_xts_enc_key
lvx v26,$x10,$key1
?vperm v24,v30,v31,$keyperm
lvx v27,$x20,$key1
stvx v24,$x00,$key_ # off-load round[3]
?vperm v25,v31,v26,$keyperm
lvx v28,$x30,$key1
stvx v25,$x10,$key_ # off-load round[4]
addi $key_,$sp,$FRAME+15 # rewind $key_
?vperm v26,v26,v27,$keyperm
lvx v29,$x40,$key1
?vperm v27,v27,v28,$keyperm
lvx v30,$x50,$key1
?vperm v28,v28,v29,$keyperm
lvx v31,$x60,$key1
?vperm v29,v29,v30,$keyperm
lvx $twk5,$x70,$key1 # borrow $twk5
?vperm v30,v30,v31,$keyperm
lvx v24,$x00,$key_ # pre-load round[1]
?vperm v31,v31,$twk5,$keyperm
lvx v25,$x10,$key_ # pre-load round[2]
vperm $in0,$inout,$inptail,$inpperm
subi $inp,$inp,31 # undo "caller"
vxor $twk0,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vand $tmp,$tmp,$eighty7
vxor $out0,$in0,$twk0
vxor $tweak,$tweak,$tmp
lvx_u $in1,$x10,$inp
vxor $twk1,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
le?vperm $in1,$in1,$in1,$leperm
vand $tmp,$tmp,$eighty7
vxor $out1,$in1,$twk1
vxor $tweak,$tweak,$tmp
lvx_u $in2,$x20,$inp
andi. $taillen,$len,15
vxor $twk2,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
le?vperm $in2,$in2,$in2,$leperm
vand $tmp,$tmp,$eighty7
vxor $out2,$in2,$twk2
vxor $tweak,$tweak,$tmp
lvx_u $in3,$x30,$inp
sub $len,$len,$taillen
vxor $twk3,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
le?vperm $in3,$in3,$in3,$leperm
vand $tmp,$tmp,$eighty7
vxor $out3,$in3,$twk3
vxor $tweak,$tweak,$tmp
lvx_u $in4,$x40,$inp
subi $len,$len,0x60
vxor $twk4,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
le?vperm $in4,$in4,$in4,$leperm
vand $tmp,$tmp,$eighty7
vxor $out4,$in4,$twk4
vxor $tweak,$tweak,$tmp
lvx_u $in5,$x50,$inp
addi $inp,$inp,0x60
vxor $twk5,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
le?vperm $in5,$in5,$in5,$leperm
vand $tmp,$tmp,$eighty7
vxor $out5,$in5,$twk5
vxor $tweak,$tweak,$tmp
vxor v31,v31,$rndkey0
mtctr $rounds
b Loop_xts_enc6x
.align 5
Loop_xts_enc6x:
vcipher $out0,$out0,v24
vcipher $out1,$out1,v24
vcipher $out2,$out2,v24
vcipher $out3,$out3,v24
vcipher $out4,$out4,v24
vcipher $out5,$out5,v24
lvx v24,$x20,$key_ # round[3]
addi $key_,$key_,0x20
vcipher $out0,$out0,v25
vcipher $out1,$out1,v25
vcipher $out2,$out2,v25
vcipher $out3,$out3,v25
vcipher $out4,$out4,v25
vcipher $out5,$out5,v25
lvx v25,$x10,$key_ # round[4]
bdnz Loop_xts_enc6x
subic $len,$len,96 # $len-=96
vxor $in0,$twk0,v31 # xor with last round key
vcipher $out0,$out0,v24
vcipher $out1,$out1,v24
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk0,$tweak,$rndkey0
vaddubm $tweak,$tweak,$tweak
vcipher $out2,$out2,v24
vcipher $out3,$out3,v24
vsldoi $tmp,$tmp,$tmp,15
vcipher $out4,$out4,v24
vcipher $out5,$out5,v24
subfe. r0,r0,r0 # borrow?-1:0
vand $tmp,$tmp,$eighty7
vcipher $out0,$out0,v25
vcipher $out1,$out1,v25
vxor $tweak,$tweak,$tmp
vcipher $out2,$out2,v25
vcipher $out3,$out3,v25
vxor $in1,$twk1,v31
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk1,$tweak,$rndkey0
vcipher $out4,$out4,v25
vcipher $out5,$out5,v25
and r0,r0,$len
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vcipher $out0,$out0,v26
vcipher $out1,$out1,v26
vand $tmp,$tmp,$eighty7
vcipher $out2,$out2,v26
vcipher $out3,$out3,v26
vxor $tweak,$tweak,$tmp
vcipher $out4,$out4,v26
vcipher $out5,$out5,v26
add $inp,$inp,r0 # $inp is adjusted in such
# way that at exit from the
# loop inX-in5 are loaded
# with last "words"
vxor $in2,$twk2,v31
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk2,$tweak,$rndkey0
vaddubm $tweak,$tweak,$tweak
vcipher $out0,$out0,v27
vcipher $out1,$out1,v27
vsldoi $tmp,$tmp,$tmp,15
vcipher $out2,$out2,v27
vcipher $out3,$out3,v27
vand $tmp,$tmp,$eighty7
vcipher $out4,$out4,v27
vcipher $out5,$out5,v27
addi $key_,$sp,$FRAME+15 # rewind $key_
vxor $tweak,$tweak,$tmp
vcipher $out0,$out0,v28
vcipher $out1,$out1,v28
vxor $in3,$twk3,v31
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk3,$tweak,$rndkey0
vcipher $out2,$out2,v28
vcipher $out3,$out3,v28
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vcipher $out4,$out4,v28
vcipher $out5,$out5,v28
lvx v24,$x00,$key_ # re-pre-load round[1]
vand $tmp,$tmp,$eighty7
vcipher $out0,$out0,v29
vcipher $out1,$out1,v29
vxor $tweak,$tweak,$tmp
vcipher $out2,$out2,v29
vcipher $out3,$out3,v29
vxor $in4,$twk4,v31
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk4,$tweak,$rndkey0
vcipher $out4,$out4,v29
vcipher $out5,$out5,v29
lvx v25,$x10,$key_ # re-pre-load round[2]
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vcipher $out0,$out0,v30
vcipher $out1,$out1,v30
vand $tmp,$tmp,$eighty7
vcipher $out2,$out2,v30
vcipher $out3,$out3,v30
vxor $tweak,$tweak,$tmp
vcipher $out4,$out4,v30
vcipher $out5,$out5,v30
vxor $in5,$twk5,v31
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk5,$tweak,$rndkey0
vcipherlast $out0,$out0,$in0
lvx_u $in0,$x00,$inp # load next input block
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vcipherlast $out1,$out1,$in1
lvx_u $in1,$x10,$inp
vcipherlast $out2,$out2,$in2
le?vperm $in0,$in0,$in0,$leperm
lvx_u $in2,$x20,$inp
vand $tmp,$tmp,$eighty7
vcipherlast $out3,$out3,$in3
le?vperm $in1,$in1,$in1,$leperm
lvx_u $in3,$x30,$inp
vcipherlast $out4,$out4,$in4
le?vperm $in2,$in2,$in2,$leperm
lvx_u $in4,$x40,$inp
vxor $tweak,$tweak,$tmp
vcipherlast $tmp,$out5,$in5 # last block might be needed
# in stealing mode
le?vperm $in3,$in3,$in3,$leperm
lvx_u $in5,$x50,$inp
addi $inp,$inp,0x60
le?vperm $in4,$in4,$in4,$leperm
le?vperm $in5,$in5,$in5,$leperm
le?vperm $out0,$out0,$out0,$leperm
le?vperm $out1,$out1,$out1,$leperm
stvx_u $out0,$x00,$out # store output
vxor $out0,$in0,$twk0
le?vperm $out2,$out2,$out2,$leperm
stvx_u $out1,$x10,$out
vxor $out1,$in1,$twk1
le?vperm $out3,$out3,$out3,$leperm
stvx_u $out2,$x20,$out
vxor $out2,$in2,$twk2
le?vperm $out4,$out4,$out4,$leperm
stvx_u $out3,$x30,$out
vxor $out3,$in3,$twk3
le?vperm $out5,$tmp,$tmp,$leperm
stvx_u $out4,$x40,$out
vxor $out4,$in4,$twk4
le?stvx_u $out5,$x50,$out
be?stvx_u $tmp, $x50,$out
vxor $out5,$in5,$twk5
addi $out,$out,0x60
mtctr $rounds
beq Loop_xts_enc6x # did $len-=96 borrow?
addic. $len,$len,0x60
beq Lxts_enc6x_zero
cmpwi $len,0x20
blt Lxts_enc6x_one
nop
beq Lxts_enc6x_two
cmpwi $len,0x40
blt Lxts_enc6x_three
nop
beq Lxts_enc6x_four
Lxts_enc6x_five:
vxor $out0,$in1,$twk0
vxor $out1,$in2,$twk1
vxor $out2,$in3,$twk2
vxor $out3,$in4,$twk3
vxor $out4,$in5,$twk4
bl _aesp8_xts_enc5x
le?vperm $out0,$out0,$out0,$leperm
vmr $twk0,$twk5 # unused tweak
le?vperm $out1,$out1,$out1,$leperm
stvx_u $out0,$x00,$out # store output
le?vperm $out2,$out2,$out2,$leperm
stvx_u $out1,$x10,$out
le?vperm $out3,$out3,$out3,$leperm
stvx_u $out2,$x20,$out
vxor $tmp,$out4,$twk5 # last block prep for stealing
le?vperm $out4,$out4,$out4,$leperm
stvx_u $out3,$x30,$out
stvx_u $out4,$x40,$out
addi $out,$out,0x50
bne Lxts_enc6x_steal
b Lxts_enc6x_done
.align 4
Lxts_enc6x_four:
vxor $out0,$in2,$twk0
vxor $out1,$in3,$twk1
vxor $out2,$in4,$twk2
vxor $out3,$in5,$twk3
vxor $out4,$out4,$out4
bl _aesp8_xts_enc5x
le?vperm $out0,$out0,$out0,$leperm
vmr $twk0,$twk4 # unused tweak
le?vperm $out1,$out1,$out1,$leperm
stvx_u $out0,$x00,$out # store output
le?vperm $out2,$out2,$out2,$leperm
stvx_u $out1,$x10,$out
vxor $tmp,$out3,$twk4 # last block prep for stealing
le?vperm $out3,$out3,$out3,$leperm
stvx_u $out2,$x20,$out
stvx_u $out3,$x30,$out
addi $out,$out,0x40
bne Lxts_enc6x_steal
b Lxts_enc6x_done
.align 4
Lxts_enc6x_three:
vxor $out0,$in3,$twk0
vxor $out1,$in4,$twk1
vxor $out2,$in5,$twk2
vxor $out3,$out3,$out3
vxor $out4,$out4,$out4
bl _aesp8_xts_enc5x
le?vperm $out0,$out0,$out0,$leperm
vmr $twk0,$twk3 # unused tweak
le?vperm $out1,$out1,$out1,$leperm
stvx_u $out0,$x00,$out # store output
vxor $tmp,$out2,$twk3 # last block prep for stealing
le?vperm $out2,$out2,$out2,$leperm
stvx_u $out1,$x10,$out
stvx_u $out2,$x20,$out
addi $out,$out,0x30
bne Lxts_enc6x_steal
b Lxts_enc6x_done
.align 4
Lxts_enc6x_two:
vxor $out0,$in4,$twk0
vxor $out1,$in5,$twk1
vxor $out2,$out2,$out2
vxor $out3,$out3,$out3
vxor $out4,$out4,$out4
bl _aesp8_xts_enc5x
le?vperm $out0,$out0,$out0,$leperm
vmr $twk0,$twk2 # unused tweak
vxor $tmp,$out1,$twk2 # last block prep for stealing
le?vperm $out1,$out1,$out1,$leperm
stvx_u $out0,$x00,$out # store output
stvx_u $out1,$x10,$out
addi $out,$out,0x20
bne Lxts_enc6x_steal
b Lxts_enc6x_done
.align 4
Lxts_enc6x_one:
vxor $out0,$in5,$twk0
nop
Loop_xts_enc1x:
vcipher $out0,$out0,v24
lvx v24,$x20,$key_ # round[3]
addi $key_,$key_,0x20
vcipher $out0,$out0,v25
lvx v25,$x10,$key_ # round[4]
bdnz Loop_xts_enc1x
add $inp,$inp,$taillen
cmpwi $taillen,0
vcipher $out0,$out0,v24
subi $inp,$inp,16
vcipher $out0,$out0,v25
lvsr $inpperm,0,$taillen
vcipher $out0,$out0,v26
lvx_u $in0,0,$inp
vcipher $out0,$out0,v27
addi $key_,$sp,$FRAME+15 # rewind $key_
vcipher $out0,$out0,v28
lvx v24,$x00,$key_ # re-pre-load round[1]
vcipher $out0,$out0,v29
lvx v25,$x10,$key_ # re-pre-load round[2]
vxor $twk0,$twk0,v31
le?vperm $in0,$in0,$in0,$leperm
vcipher $out0,$out0,v30
vperm $in0,$in0,$in0,$inpperm
vcipherlast $out0,$out0,$twk0
vmr $twk0,$twk1 # unused tweak
vxor $tmp,$out0,$twk1 # last block prep for stealing
le?vperm $out0,$out0,$out0,$leperm
stvx_u $out0,$x00,$out # store output
addi $out,$out,0x10
bne Lxts_enc6x_steal
b Lxts_enc6x_done
.align 4
Lxts_enc6x_zero:
cmpwi $taillen,0
beq Lxts_enc6x_done
add $inp,$inp,$taillen
subi $inp,$inp,16
lvx_u $in0,0,$inp
lvsr $inpperm,0,$taillen # $in5 is no more
le?vperm $in0,$in0,$in0,$leperm
vperm $in0,$in0,$in0,$inpperm
vxor $tmp,$tmp,$twk0
Lxts_enc6x_steal:
vxor $in0,$in0,$twk0
vxor $out0,$out0,$out0
vspltisb $out1,-1
vperm $out0,$out0,$out1,$inpperm
vsel $out0,$in0,$tmp,$out0 # $tmp is last block, remember?
subi r30,$out,17
subi $out,$out,16
mtctr $taillen
Loop_xts_enc6x_steal:
lbzu r0,1(r30)
stb r0,16(r30)
bdnz Loop_xts_enc6x_steal
li $taillen,0
mtctr $rounds
b Loop_xts_enc1x # one more time...
.align 4
Lxts_enc6x_done:
${UCMP}i $ivp,0
beq Lxts_enc6x_ret
vxor $tweak,$twk0,$rndkey0
le?vperm $tweak,$tweak,$tweak,$leperm
stvx_u $tweak,0,$ivp
Lxts_enc6x_ret:
mtlr r11
li r10,`$FRAME+15`
li r11,`$FRAME+31`
stvx $seven,r10,$sp # wipe copies of round keys
addi r10,r10,32
stvx $seven,r11,$sp
addi r11,r11,32
stvx $seven,r10,$sp
addi r10,r10,32
stvx $seven,r11,$sp
addi r11,r11,32
stvx $seven,r10,$sp
addi r10,r10,32
stvx $seven,r11,$sp
addi r11,r11,32
stvx $seven,r10,$sp
addi r10,r10,32
stvx $seven,r11,$sp
addi r11,r11,32
mtspr 256,$vrsave
lvx v20,r10,$sp # ABI says so
addi r10,r10,32
lvx v21,r11,$sp
addi r11,r11,32
lvx v22,r10,$sp
addi r10,r10,32
lvx v23,r11,$sp
addi r11,r11,32
lvx v24,r10,$sp
addi r10,r10,32
lvx v25,r11,$sp
addi r11,r11,32
lvx v26,r10,$sp
addi r10,r10,32
lvx v27,r11,$sp
addi r11,r11,32
lvx v28,r10,$sp
addi r10,r10,32
lvx v29,r11,$sp
addi r11,r11,32
lvx v30,r10,$sp
lvx v31,r11,$sp
$POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
$POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
$POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
$POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
$POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
$POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
blr
.long 0
.byte 0,12,0x04,1,0x80,6,6,0
.long 0
.align 5
_aesp8_xts_enc5x:
vcipher $out0,$out0,v24
vcipher $out1,$out1,v24
vcipher $out2,$out2,v24
vcipher $out3,$out3,v24
vcipher $out4,$out4,v24
lvx v24,$x20,$key_ # round[3]
addi $key_,$key_,0x20
vcipher $out0,$out0,v25
vcipher $out1,$out1,v25
vcipher $out2,$out2,v25
vcipher $out3,$out3,v25
vcipher $out4,$out4,v25
lvx v25,$x10,$key_ # round[4]
bdnz _aesp8_xts_enc5x
add $inp,$inp,$taillen
cmpwi $taillen,0
vcipher $out0,$out0,v24
vcipher $out1,$out1,v24
vcipher $out2,$out2,v24
vcipher $out3,$out3,v24
vcipher $out4,$out4,v24
subi $inp,$inp,16
vcipher $out0,$out0,v25
vcipher $out1,$out1,v25
vcipher $out2,$out2,v25
vcipher $out3,$out3,v25
vcipher $out4,$out4,v25
vxor $twk0,$twk0,v31
vcipher $out0,$out0,v26
lvsr $inpperm,r0,$taillen # $in5 is no more
vcipher $out1,$out1,v26
vcipher $out2,$out2,v26
vcipher $out3,$out3,v26
vcipher $out4,$out4,v26
vxor $in1,$twk1,v31
vcipher $out0,$out0,v27
lvx_u $in0,0,$inp
vcipher $out1,$out1,v27
vcipher $out2,$out2,v27
vcipher $out3,$out3,v27
vcipher $out4,$out4,v27
vxor $in2,$twk2,v31
addi $key_,$sp,$FRAME+15 # rewind $key_
vcipher $out0,$out0,v28
vcipher $out1,$out1,v28
vcipher $out2,$out2,v28
vcipher $out3,$out3,v28
vcipher $out4,$out4,v28
lvx v24,$x00,$key_ # re-pre-load round[1]
vxor $in3,$twk3,v31
vcipher $out0,$out0,v29
le?vperm $in0,$in0,$in0,$leperm
vcipher $out1,$out1,v29
vcipher $out2,$out2,v29
vcipher $out3,$out3,v29
vcipher $out4,$out4,v29
lvx v25,$x10,$key_ # re-pre-load round[2]
vxor $in4,$twk4,v31
vcipher $out0,$out0,v30
vperm $in0,$in0,$in0,$inpperm
vcipher $out1,$out1,v30
vcipher $out2,$out2,v30
vcipher $out3,$out3,v30
vcipher $out4,$out4,v30
vcipherlast $out0,$out0,$twk0
vcipherlast $out1,$out1,$in1
vcipherlast $out2,$out2,$in2
vcipherlast $out3,$out3,$in3
vcipherlast $out4,$out4,$in4
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.align 5
_aesp8_xts_decrypt6x:
$STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
mflr r11
li r7,`$FRAME+8*16+15`
li r3,`$FRAME+8*16+31`
$PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
stvx v20,r7,$sp # ABI says so
addi r7,r7,32
stvx v21,r3,$sp
addi r3,r3,32
stvx v22,r7,$sp
addi r7,r7,32
stvx v23,r3,$sp
addi r3,r3,32
stvx v24,r7,$sp
addi r7,r7,32
stvx v25,r3,$sp
addi r3,r3,32
stvx v26,r7,$sp
addi r7,r7,32
stvx v27,r3,$sp
addi r3,r3,32
stvx v28,r7,$sp
addi r7,r7,32
stvx v29,r3,$sp
addi r3,r3,32
stvx v30,r7,$sp
stvx v31,r3,$sp
li r0,-1
stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
li $x10,0x10
$PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
li $x20,0x20
$PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
li $x30,0x30
$PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
li $x40,0x40
$PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
li $x50,0x50
$PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
li $x60,0x60
$PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
li $x70,0x70
mtspr 256,r0
subi $rounds,$rounds,3 # -4 in total
lvx $rndkey0,$x00,$key1 # load key schedule
lvx v30,$x10,$key1
addi $key1,$key1,0x20
lvx v31,$x00,$key1
?vperm $rndkey0,$rndkey0,v30,$keyperm
addi $key_,$sp,$FRAME+15
mtctr $rounds
Load_xts_dec_key:
?vperm v24,v30,v31,$keyperm
lvx v30,$x10,$key1
addi $key1,$key1,0x20
stvx v24,$x00,$key_ # off-load round[1]
?vperm v25,v31,v30,$keyperm
lvx v31,$x00,$key1
stvx v25,$x10,$key_ # off-load round[2]
addi $key_,$key_,0x20
bdnz Load_xts_dec_key
lvx v26,$x10,$key1
?vperm v24,v30,v31,$keyperm
lvx v27,$x20,$key1
stvx v24,$x00,$key_ # off-load round[3]
?vperm v25,v31,v26,$keyperm
lvx v28,$x30,$key1
stvx v25,$x10,$key_ # off-load round[4]
addi $key_,$sp,$FRAME+15 # rewind $key_
?vperm v26,v26,v27,$keyperm
lvx v29,$x40,$key1
?vperm v27,v27,v28,$keyperm
lvx v30,$x50,$key1
?vperm v28,v28,v29,$keyperm
lvx v31,$x60,$key1
?vperm v29,v29,v30,$keyperm
lvx $twk5,$x70,$key1 # borrow $twk5
?vperm v30,v30,v31,$keyperm
lvx v24,$x00,$key_ # pre-load round[1]
?vperm v31,v31,$twk5,$keyperm
lvx v25,$x10,$key_ # pre-load round[2]
vperm $in0,$inout,$inptail,$inpperm
subi $inp,$inp,31 # undo "caller"
vxor $twk0,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vand $tmp,$tmp,$eighty7
vxor $out0,$in0,$twk0
vxor $tweak,$tweak,$tmp
lvx_u $in1,$x10,$inp
vxor $twk1,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
le?vperm $in1,$in1,$in1,$leperm
vand $tmp,$tmp,$eighty7
vxor $out1,$in1,$twk1
vxor $tweak,$tweak,$tmp
lvx_u $in2,$x20,$inp
andi. $taillen,$len,15
vxor $twk2,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
le?vperm $in2,$in2,$in2,$leperm
vand $tmp,$tmp,$eighty7
vxor $out2,$in2,$twk2
vxor $tweak,$tweak,$tmp
lvx_u $in3,$x30,$inp
sub $len,$len,$taillen
vxor $twk3,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
le?vperm $in3,$in3,$in3,$leperm
vand $tmp,$tmp,$eighty7
vxor $out3,$in3,$twk3
vxor $tweak,$tweak,$tmp
lvx_u $in4,$x40,$inp
subi $len,$len,0x60
vxor $twk4,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
le?vperm $in4,$in4,$in4,$leperm
vand $tmp,$tmp,$eighty7
vxor $out4,$in4,$twk4
vxor $tweak,$tweak,$tmp
lvx_u $in5,$x50,$inp
addi $inp,$inp,0x60
vxor $twk5,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
le?vperm $in5,$in5,$in5,$leperm
vand $tmp,$tmp,$eighty7
vxor $out5,$in5,$twk5
vxor $tweak,$tweak,$tmp
vxor v31,v31,$rndkey0
mtctr $rounds
b Loop_xts_dec6x
.align 5
Loop_xts_dec6x:
vncipher $out0,$out0,v24
vncipher $out1,$out1,v24
vncipher $out2,$out2,v24
vncipher $out3,$out3,v24
vncipher $out4,$out4,v24
vncipher $out5,$out5,v24
lvx v24,$x20,$key_ # round[3]
addi $key_,$key_,0x20
vncipher $out0,$out0,v25
vncipher $out1,$out1,v25
vncipher $out2,$out2,v25
vncipher $out3,$out3,v25
vncipher $out4,$out4,v25
vncipher $out5,$out5,v25
lvx v25,$x10,$key_ # round[4]
bdnz Loop_xts_dec6x
subic $len,$len,96 # $len-=96
vxor $in0,$twk0,v31 # xor with last round key
vncipher $out0,$out0,v24
vncipher $out1,$out1,v24
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk0,$tweak,$rndkey0
vaddubm $tweak,$tweak,$tweak
vncipher $out2,$out2,v24
vncipher $out3,$out3,v24
vsldoi $tmp,$tmp,$tmp,15
vncipher $out4,$out4,v24
vncipher $out5,$out5,v24
subfe. r0,r0,r0 # borrow?-1:0
vand $tmp,$tmp,$eighty7
vncipher $out0,$out0,v25
vncipher $out1,$out1,v25
vxor $tweak,$tweak,$tmp
vncipher $out2,$out2,v25
vncipher $out3,$out3,v25
vxor $in1,$twk1,v31
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk1,$tweak,$rndkey0
vncipher $out4,$out4,v25
vncipher $out5,$out5,v25
and r0,r0,$len
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vncipher $out0,$out0,v26
vncipher $out1,$out1,v26
vand $tmp,$tmp,$eighty7
vncipher $out2,$out2,v26
vncipher $out3,$out3,v26
vxor $tweak,$tweak,$tmp
vncipher $out4,$out4,v26
vncipher $out5,$out5,v26
add $inp,$inp,r0 # $inp is adjusted in such
# way that at exit from the
# loop inX-in5 are loaded
# with last "words"
vxor $in2,$twk2,v31
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk2,$tweak,$rndkey0
vaddubm $tweak,$tweak,$tweak
vncipher $out0,$out0,v27
vncipher $out1,$out1,v27
vsldoi $tmp,$tmp,$tmp,15
vncipher $out2,$out2,v27
vncipher $out3,$out3,v27
vand $tmp,$tmp,$eighty7
vncipher $out4,$out4,v27
vncipher $out5,$out5,v27
addi $key_,$sp,$FRAME+15 # rewind $key_
vxor $tweak,$tweak,$tmp
vncipher $out0,$out0,v28
vncipher $out1,$out1,v28
vxor $in3,$twk3,v31
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk3,$tweak,$rndkey0
vncipher $out2,$out2,v28
vncipher $out3,$out3,v28
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vncipher $out4,$out4,v28
vncipher $out5,$out5,v28
lvx v24,$x00,$key_ # re-pre-load round[1]
vand $tmp,$tmp,$eighty7
vncipher $out0,$out0,v29
vncipher $out1,$out1,v29
vxor $tweak,$tweak,$tmp
vncipher $out2,$out2,v29
vncipher $out3,$out3,v29
vxor $in4,$twk4,v31
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk4,$tweak,$rndkey0
vncipher $out4,$out4,v29
vncipher $out5,$out5,v29
lvx v25,$x10,$key_ # re-pre-load round[2]
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vncipher $out0,$out0,v30
vncipher $out1,$out1,v30
vand $tmp,$tmp,$eighty7
vncipher $out2,$out2,v30
vncipher $out3,$out3,v30
vxor $tweak,$tweak,$tmp
vncipher $out4,$out4,v30
vncipher $out5,$out5,v30
vxor $in5,$twk5,v31
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk5,$tweak,$rndkey0
vncipherlast $out0,$out0,$in0
lvx_u $in0,$x00,$inp # load next input block
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vncipherlast $out1,$out1,$in1
lvx_u $in1,$x10,$inp
vncipherlast $out2,$out2,$in2
le?vperm $in0,$in0,$in0,$leperm
lvx_u $in2,$x20,$inp
vand $tmp,$tmp,$eighty7
vncipherlast $out3,$out3,$in3
le?vperm $in1,$in1,$in1,$leperm
lvx_u $in3,$x30,$inp
vncipherlast $out4,$out4,$in4
le?vperm $in2,$in2,$in2,$leperm
lvx_u $in4,$x40,$inp
vxor $tweak,$tweak,$tmp
vncipherlast $out5,$out5,$in5
le?vperm $in3,$in3,$in3,$leperm
lvx_u $in5,$x50,$inp
addi $inp,$inp,0x60
le?vperm $in4,$in4,$in4,$leperm
le?vperm $in5,$in5,$in5,$leperm
le?vperm $out0,$out0,$out0,$leperm
le?vperm $out1,$out1,$out1,$leperm
stvx_u $out0,$x00,$out # store output
vxor $out0,$in0,$twk0
le?vperm $out2,$out2,$out2,$leperm
stvx_u $out1,$x10,$out
vxor $out1,$in1,$twk1
le?vperm $out3,$out3,$out3,$leperm
stvx_u $out2,$x20,$out
vxor $out2,$in2,$twk2
le?vperm $out4,$out4,$out4,$leperm
stvx_u $out3,$x30,$out
vxor $out3,$in3,$twk3
le?vperm $out5,$out5,$out5,$leperm
stvx_u $out4,$x40,$out
vxor $out4,$in4,$twk4
stvx_u $out5,$x50,$out
vxor $out5,$in5,$twk5
addi $out,$out,0x60
mtctr $rounds
beq Loop_xts_dec6x # did $len-=96 borrow?
addic. $len,$len,0x60
beq Lxts_dec6x_zero
cmpwi $len,0x20
blt Lxts_dec6x_one
nop
beq Lxts_dec6x_two
cmpwi $len,0x40
blt Lxts_dec6x_three
nop
beq Lxts_dec6x_four
Lxts_dec6x_five:
vxor $out0,$in1,$twk0
vxor $out1,$in2,$twk1
vxor $out2,$in3,$twk2
vxor $out3,$in4,$twk3
vxor $out4,$in5,$twk4
bl _aesp8_xts_dec5x
le?vperm $out0,$out0,$out0,$leperm
vmr $twk0,$twk5 # unused tweak
vxor $twk1,$tweak,$rndkey0
le?vperm $out1,$out1,$out1,$leperm
stvx_u $out0,$x00,$out # store output
vxor $out0,$in0,$twk1
le?vperm $out2,$out2,$out2,$leperm
stvx_u $out1,$x10,$out
le?vperm $out3,$out3,$out3,$leperm
stvx_u $out2,$x20,$out
le?vperm $out4,$out4,$out4,$leperm
stvx_u $out3,$x30,$out
stvx_u $out4,$x40,$out
addi $out,$out,0x50
bne Lxts_dec6x_steal
b Lxts_dec6x_done
.align 4
Lxts_dec6x_four:
vxor $out0,$in2,$twk0
vxor $out1,$in3,$twk1
vxor $out2,$in4,$twk2
vxor $out3,$in5,$twk3
vxor $out4,$out4,$out4
bl _aesp8_xts_dec5x
le?vperm $out0,$out0,$out0,$leperm
vmr $twk0,$twk4 # unused tweak
vmr $twk1,$twk5
le?vperm $out1,$out1,$out1,$leperm
stvx_u $out0,$x00,$out # store output
vxor $out0,$in0,$twk5
le?vperm $out2,$out2,$out2,$leperm
stvx_u $out1,$x10,$out
le?vperm $out3,$out3,$out3,$leperm
stvx_u $out2,$x20,$out
stvx_u $out3,$x30,$out
addi $out,$out,0x40
bne Lxts_dec6x_steal
b Lxts_dec6x_done
.align 4
Lxts_dec6x_three:
vxor $out0,$in3,$twk0
vxor $out1,$in4,$twk1
vxor $out2,$in5,$twk2
vxor $out3,$out3,$out3
vxor $out4,$out4,$out4
bl _aesp8_xts_dec5x
le?vperm $out0,$out0,$out0,$leperm
vmr $twk0,$twk3 # unused tweak
vmr $twk1,$twk4
le?vperm $out1,$out1,$out1,$leperm
stvx_u $out0,$x00,$out # store output
vxor $out0,$in0,$twk4
le?vperm $out2,$out2,$out2,$leperm
stvx_u $out1,$x10,$out
stvx_u $out2,$x20,$out
addi $out,$out,0x30
bne Lxts_dec6x_steal
b Lxts_dec6x_done
.align 4
Lxts_dec6x_two:
vxor $out0,$in4,$twk0
vxor $out1,$in5,$twk1
vxor $out2,$out2,$out2
vxor $out3,$out3,$out3
vxor $out4,$out4,$out4
bl _aesp8_xts_dec5x
le?vperm $out0,$out0,$out0,$leperm
vmr $twk0,$twk2 # unused tweak
vmr $twk1,$twk3
le?vperm $out1,$out1,$out1,$leperm
stvx_u $out0,$x00,$out # store output
vxor $out0,$in0,$twk3
stvx_u $out1,$x10,$out
addi $out,$out,0x20
bne Lxts_dec6x_steal
b Lxts_dec6x_done
.align 4
Lxts_dec6x_one:
vxor $out0,$in5,$twk0
nop
Loop_xts_dec1x:
vncipher $out0,$out0,v24
lvx v24,$x20,$key_ # round[3]
addi $key_,$key_,0x20
vncipher $out0,$out0,v25
lvx v25,$x10,$key_ # round[4]
bdnz Loop_xts_dec1x
subi r0,$taillen,1
vncipher $out0,$out0,v24
andi. r0,r0,16
cmpwi $taillen,0
vncipher $out0,$out0,v25
sub $inp,$inp,r0
vncipher $out0,$out0,v26
lvx_u $in0,0,$inp
vncipher $out0,$out0,v27
addi $key_,$sp,$FRAME+15 # rewind $key_
vncipher $out0,$out0,v28
lvx v24,$x00,$key_ # re-pre-load round[1]
vncipher $out0,$out0,v29
lvx v25,$x10,$key_ # re-pre-load round[2]
vxor $twk0,$twk0,v31
le?vperm $in0,$in0,$in0,$leperm
vncipher $out0,$out0,v30
mtctr $rounds
vncipherlast $out0,$out0,$twk0
vmr $twk0,$twk1 # unused tweak
vmr $twk1,$twk2
le?vperm $out0,$out0,$out0,$leperm
stvx_u $out0,$x00,$out # store output
addi $out,$out,0x10
vxor $out0,$in0,$twk2
bne Lxts_dec6x_steal
b Lxts_dec6x_done
.align 4
Lxts_dec6x_zero:
cmpwi $taillen,0
beq Lxts_dec6x_done
lvx_u $in0,0,$inp
le?vperm $in0,$in0,$in0,$leperm
vxor $out0,$in0,$twk1
Lxts_dec6x_steal:
vncipher $out0,$out0,v24
lvx v24,$x20,$key_ # round[3]
addi $key_,$key_,0x20
vncipher $out0,$out0,v25
lvx v25,$x10,$key_ # round[4]
bdnz Lxts_dec6x_steal
add $inp,$inp,$taillen
vncipher $out0,$out0,v24
cmpwi $taillen,0
vncipher $out0,$out0,v25
lvx_u $in0,0,$inp
vncipher $out0,$out0,v26
lvsr $inpperm,0,$taillen # $in5 is no more
vncipher $out0,$out0,v27
addi $key_,$sp,$FRAME+15 # rewind $key_
vncipher $out0,$out0,v28
lvx v24,$x00,$key_ # re-pre-load round[1]
vncipher $out0,$out0,v29
lvx v25,$x10,$key_ # re-pre-load round[2]
vxor $twk1,$twk1,v31
le?vperm $in0,$in0,$in0,$leperm
vncipher $out0,$out0,v30
vperm $in0,$in0,$in0,$inpperm
vncipherlast $tmp,$out0,$twk1
le?vperm $out0,$tmp,$tmp,$leperm
le?stvx_u $out0,0,$out
be?stvx_u $tmp,0,$out
vxor $out0,$out0,$out0
vspltisb $out1,-1
vperm $out0,$out0,$out1,$inpperm
vsel $out0,$in0,$tmp,$out0
vxor $out0,$out0,$twk0
subi r30,$out,1
mtctr $taillen
Loop_xts_dec6x_steal:
lbzu r0,1(r30)
stb r0,16(r30)
bdnz Loop_xts_dec6x_steal
li $taillen,0
mtctr $rounds
b Loop_xts_dec1x # one more time...
.align 4
Lxts_dec6x_done:
${UCMP}i $ivp,0
beq Lxts_dec6x_ret
vxor $tweak,$twk0,$rndkey0
le?vperm $tweak,$tweak,$tweak,$leperm
stvx_u $tweak,0,$ivp
Lxts_dec6x_ret:
mtlr r11
li r10,`$FRAME+15`
li r11,`$FRAME+31`
stvx $seven,r10,$sp # wipe copies of round keys
addi r10,r10,32
stvx $seven,r11,$sp
addi r11,r11,32
stvx $seven,r10,$sp
addi r10,r10,32
stvx $seven,r11,$sp
addi r11,r11,32
stvx $seven,r10,$sp
addi r10,r10,32
stvx $seven,r11,$sp
addi r11,r11,32
stvx $seven,r10,$sp
addi r10,r10,32
stvx $seven,r11,$sp
addi r11,r11,32
mtspr 256,$vrsave
lvx v20,r10,$sp # ABI says so
addi r10,r10,32
lvx v21,r11,$sp
addi r11,r11,32
lvx v22,r10,$sp
addi r10,r10,32
lvx v23,r11,$sp
addi r11,r11,32
lvx v24,r10,$sp
addi r10,r10,32
lvx v25,r11,$sp
addi r11,r11,32
lvx v26,r10,$sp
addi r10,r10,32
lvx v27,r11,$sp
addi r11,r11,32
lvx v28,r10,$sp
addi r10,r10,32
lvx v29,r11,$sp
addi r11,r11,32
lvx v30,r10,$sp
lvx v31,r11,$sp
$POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
$POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
$POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
$POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
$POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
$POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
blr
.long 0
.byte 0,12,0x04,1,0x80,6,6,0
.long 0
.align 5
_aesp8_xts_dec5x:
vncipher $out0,$out0,v24
vncipher $out1,$out1,v24
vncipher $out2,$out2,v24
vncipher $out3,$out3,v24
vncipher $out4,$out4,v24
lvx v24,$x20,$key_ # round[3]
addi $key_,$key_,0x20
vncipher $out0,$out0,v25
vncipher $out1,$out1,v25
vncipher $out2,$out2,v25
vncipher $out3,$out3,v25
vncipher $out4,$out4,v25
lvx v25,$x10,$key_ # round[4]
bdnz _aesp8_xts_dec5x
subi r0,$taillen,1
vncipher $out0,$out0,v24
vncipher $out1,$out1,v24
vncipher $out2,$out2,v24
vncipher $out3,$out3,v24
vncipher $out4,$out4,v24
andi. r0,r0,16
cmpwi $taillen,0
vncipher $out0,$out0,v25
vncipher $out1,$out1,v25
vncipher $out2,$out2,v25
vncipher $out3,$out3,v25
vncipher $out4,$out4,v25
vxor $twk0,$twk0,v31
sub $inp,$inp,r0
vncipher $out0,$out0,v26
vncipher $out1,$out1,v26
vncipher $out2,$out2,v26
vncipher $out3,$out3,v26
vncipher $out4,$out4,v26
vxor $in1,$twk1,v31
vncipher $out0,$out0,v27
lvx_u $in0,0,$inp
vncipher $out1,$out1,v27
vncipher $out2,$out2,v27
vncipher $out3,$out3,v27
vncipher $out4,$out4,v27
vxor $in2,$twk2,v31
addi $key_,$sp,$FRAME+15 # rewind $key_
vncipher $out0,$out0,v28
vncipher $out1,$out1,v28
vncipher $out2,$out2,v28
vncipher $out3,$out3,v28
vncipher $out4,$out4,v28
lvx v24,$x00,$key_ # re-pre-load round[1]
vxor $in3,$twk3,v31
vncipher $out0,$out0,v29
le?vperm $in0,$in0,$in0,$leperm
vncipher $out1,$out1,v29
vncipher $out2,$out2,v29
vncipher $out3,$out3,v29
vncipher $out4,$out4,v29
lvx v25,$x10,$key_ # re-pre-load round[2]
vxor $in4,$twk4,v31
vncipher $out0,$out0,v30
vncipher $out1,$out1,v30
vncipher $out2,$out2,v30
vncipher $out3,$out3,v30
vncipher $out4,$out4,v30
vncipherlast $out0,$out0,$twk0
vncipherlast $out1,$out1,$in1
vncipherlast $out2,$out2,$in2
vncipherlast $out3,$out3,$in3
vncipherlast $out4,$out4,$in4
mtctr $rounds
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
___
}} }}}
# Post-process the accumulated $code line by line: evaluate backtick
# arithmetic placeholders, emit the constants table as endian-correct
# .byte data, and rewrite '?'-prefixed endian-sensitive mnemonics for
# the target $flavour. Output goes to STDOUT (the generated .s file).
my $consts=1;
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
# constants table endian-specific conversion
if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
my $conv=$3;
my @bytes=();
# convert to endian-agnostic format
if ($1 eq "long") {
foreach (split(/,\s*/,$2)) {
my $l = /^0/?oct:int;
push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
}
} else {
@bytes = map(/^0/?oct:int,split(/,\s*/,$2));
}
# little-endian conversion: ?inv XORs each byte with 0xf,
# ?rev reverses the byte order of the constant
if ($flavour =~ /le$/o) {
SWITCH: for($conv) {
/\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
/\?rev/ && do { @bytes=reverse(@bytes); last; };
}
}
# emit as raw bytes so the constant is correct on either endianness
print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
next;
}
$consts=0 if (m/Lconsts:/o); # end of table
# instructions prefixed with '?' are endian-specific and need
# to be adjusted accordingly...
# The s/// chain is first-match-wins via 'or'; exactly one rewrite
# applies per line, so the order of the alternatives matters.
if ($flavour =~ /le$/o) { # little-endian
s/le\?//o or
s/be\?/#be#/o or
s/\?lvsr/lvsl/o or
s/\?lvsl/lvsr/o or
s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
} else { # big-endian
# keep be? lines, comment out le? lines, strip the bare '?' marker
s/le\?/#le#/o or
s/be\?//o or
s/\?([a-z]+)/$1/o;
}
print $_,"\n";
}
close STDOUT;
#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0
# This code is taken from the OpenSSL project but the author (Andy Polyakov)
# has relicensed it under the GPLv2. Therefore this program is free software;
# you can redistribute it and/or modify it under the terms of the GNU General
# Public License version 2 as published by the Free Software Foundation.
#
# The original headers, including the original license headers, are
# included below for completeness.
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see https://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# GHASH for PowerISA v2.07.
#
# July 2014
#
# Accurate performance measurements are problematic, because it's
# always virtualized setup with possibly throttled processor.
# Relative comparison is therefore more informative. This initial
# version is ~2.1x slower than hardware-assisted AES-128-CTR, ~12x
# faster than "4-bit" integer-only compiler-generated 64-bit code.
# "Initial version" means that there is room for further improvement.
# Command-line arguments: assembler flavour (e.g. linux64le) and output
# file name, both forwarded to the ppc-xlate.pl back-end.
$flavour=shift;
$output =shift;

# Pointer-size-dependent conventions for the generated code.
if ($flavour =~ /64/) {
	$SIZE_T=8;
	$LRSAVE=2*$SIZE_T;
	$STU="stdu";
	$POP="ld";
	$PUSH="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$POP="lwz";
	$PUSH="stw";
} else { die "nonsense $flavour"; }

# Locate the ppc-xlate.pl pre-processor next to this script or in the
# OpenSSL perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe all of our output through the translator.  Use low-precedence
# "or" here: with "||" the check binds to the (always true) string
# argument, so a failed open would go completely unnoticed.
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";

my ($Xip,$Htbl,$inp,$len)=map("r$_",(3..6));	# argument block

my ($Xl,$Xm,$Xh,$IN)=map("v$_",(0..3));
my ($zero,$t0,$t1,$t2,$xC2,$H,$Hh,$Hl,$lemask)=map("v$_",(4..12));
my ($Xl1,$Xm1,$Xh1,$IN1,$H2,$H2h,$H2l)=map("v$_",(13..19));
my $vrsave="r12";
my ($t4,$t5,$t6) = ($Hl,$H,$Hh);
# Assembler template.  Everything between here and the closing "___" is
# a string emitted through ppc-xlate.pl, so no lines may be inserted
# inside it.  le?/be? prefixes mark endian-specific instructions that
# the output loop below resolves.  Entry points generated:
#   gcm_init_p8     - compute table {0xc2.0, H.lo, H, H.hi} from H
#   gcm_init_htable - as above plus H^2..H^4 powers (256-byte table)
#   gcm_gmult_p8    - Xi = Xi * H mod P
#   gcm_ghash_p8    - Xi = (Xi ^ inp[0..len)) * H mod P, 16 bytes/iter
$code=<<___;
.machine "any"
.text
.globl .gcm_init_p8
lis r0,0xfff0
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $H,0,r4 # load H
le?xor r7,r7,r7
le?addi r7,r7,0x8 # need a vperm start with 08
le?lvsr 5,0,r7
le?vspltisb 6,0x0f
le?vxor 5,5,6 # set a b-endian mask
le?vperm $H,$H,$H,5
vspltisb $xC2,-16 # 0xf0
vspltisb $t0,1 # one
vaddubm $xC2,$xC2,$xC2 # 0xe0
vxor $zero,$zero,$zero
vor $xC2,$xC2,$t0 # 0xe1
vsldoi $xC2,$xC2,$zero,15 # 0xe1...
vsldoi $t1,$zero,$t0,1 # ...1
vaddubm $xC2,$xC2,$xC2 # 0xc2...
vspltisb $t2,7
vor $xC2,$xC2,$t1 # 0xc2....01
vspltb $t1,$H,0 # most significant byte
vsl $H,$H,$t0 # H<<=1
vsrab $t1,$t1,$t2 # broadcast carry bit
vand $t1,$t1,$xC2
vxor $H,$H,$t1 # twisted H
vsldoi $H,$H,$H,8 # twist even more ...
vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
vsldoi $Hl,$zero,$H,8 # ... and split
vsldoi $Hh,$H,$zero,8
stvx_u $xC2,0,r3 # save pre-computed table
stvx_u $Hl,r8,r3
stvx_u $H, r9,r3
stvx_u $Hh,r10,r3
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
.size .gcm_init_p8,.-.gcm_init_p8
.globl .gcm_init_htable
lis r0,0xfff0
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $H,0,r4 # load H
vspltisb $xC2,-16 # 0xf0
vspltisb $t0,1 # one
vaddubm $xC2,$xC2,$xC2 # 0xe0
vxor $zero,$zero,$zero
vor $xC2,$xC2,$t0 # 0xe1
vsldoi $xC2,$xC2,$zero,15 # 0xe1...
vsldoi $t1,$zero,$t0,1 # ...1
vaddubm $xC2,$xC2,$xC2 # 0xc2...
vspltisb $t2,7
vor $xC2,$xC2,$t1 # 0xc2....01
vspltb $t1,$H,0 # most significant byte
vsl $H,$H,$t0 # H<<=1
vsrab $t1,$t1,$t2 # broadcast carry bit
vand $t1,$t1,$xC2
vxor $IN,$H,$t1 # twisted H
vsldoi $H,$IN,$IN,8 # twist even more ...
vsldoi $xC2,$zero,$xC2,8 # 0xc2.0
vsldoi $Hl,$zero,$H,8 # ... and split
vsldoi $Hh,$H,$zero,8
stvx_u $xC2,0,r3 # save pre-computed table
stvx_u $Hl,r8,r3
li r8,0x40
stvx_u $H, r9,r3
li r9,0x50
stvx_u $Hh,r10,r3
li r10,0x60
vpmsumd $Xl,$IN,$Hl # H.lo·H.lo
vpmsumd $Xm,$IN,$H # H.hi·H.lo+H.lo·H.hi
vpmsumd $Xh,$IN,$Hh # H.hi·H.hi
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
vxor $IN1,$Xl,$t1
vsldoi $H2,$IN1,$IN1,8
vsldoi $H2l,$zero,$H2,8
vsldoi $H2h,$H2,$zero,8
stvx_u $H2l,r8,r3 # save H^2
li r8,0x70
stvx_u $H2,r9,r3
li r9,0x80
stvx_u $H2h,r10,r3
li r10,0x90
vpmsumd $Xl,$IN,$H2l # H.lo·H^2.lo
vpmsumd $Xl1,$IN1,$H2l # H^2.lo·H^2.lo
vpmsumd $Xm,$IN,$H2 # H.hi·H^2.lo+H.lo·H^2.hi
vpmsumd $Xm1,$IN1,$H2 # H^2.hi·H^2.lo+H^2.lo·H^2.hi
vpmsumd $Xh,$IN,$H2h # H.hi·H^2.hi
vpmsumd $Xh1,$IN1,$H2h # H^2.hi·H^2.hi
vpmsumd $t2,$Xl,$xC2 # 1st reduction phase
vpmsumd $t6,$Xl1,$xC2 # 1st reduction phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vsldoi $t4,$Xm1,$zero,8
vsldoi $t5,$zero,$Xm1,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vxor $Xl1,$Xl1,$t4
vxor $Xh1,$Xh1,$t5
vsldoi $Xl,$Xl,$Xl,8
vsldoi $Xl1,$Xl1,$Xl1,8
vxor $Xl,$Xl,$t2
vxor $Xl1,$Xl1,$t6
vsldoi $t1,$Xl,$Xl,8 # 2nd reduction phase
vsldoi $t5,$Xl1,$Xl1,8 # 2nd reduction phase
vpmsumd $Xl,$Xl,$xC2
vpmsumd $Xl1,$Xl1,$xC2
vxor $t1,$t1,$Xh
vxor $t5,$t5,$Xh1
vxor $Xl,$Xl,$t1
vxor $Xl1,$Xl1,$t5
vsldoi $H,$Xl,$Xl,8
vsldoi $H2,$Xl1,$Xl1,8
vsldoi $Hl,$zero,$H,8
vsldoi $Hh,$H,$zero,8
vsldoi $H2l,$zero,$H2,8
vsldoi $H2h,$H2,$zero,8
stvx_u $Hl,r8,r3 # save H^3
li r8,0xa0
stvx_u $H,r9,r3
li r9,0xb0
stvx_u $Hh,r10,r3
li r10,0xc0
stvx_u $H2l,r8,r3 # save H^4
stvx_u $H2,r9,r3
stvx_u $H2h,r10,r3
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
.size .gcm_init_htable,.-.gcm_init_htable
.globl .gcm_gmult_p8
lis r0,0xfff8
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $IN,0,$Xip # load Xi
lvx_u $Hl,r8,$Htbl # load pre-computed table
le?lvsl $lemask,r0,r0
lvx_u $H, r9,$Htbl
le?vspltisb $t0,0x07
lvx_u $Hh,r10,$Htbl
le?vxor $lemask,$lemask,$t0
lvx_u $xC2,0,$Htbl
le?vperm $IN,$IN,$IN,$lemask
vxor $zero,$zero,$zero
vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
vpmsumd $t2,$Xl,$xC2 # 1st phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
vsldoi $t1,$Xl,$Xl,8 # 2nd phase
vpmsumd $Xl,$Xl,$xC2
vxor $t1,$t1,$Xh
vxor $Xl,$Xl,$t1
le?vperm $Xl,$Xl,$Xl,$lemask
stvx_u $Xl,0,$Xip # write out Xi
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,2,0
.long 0
.size .gcm_gmult_p8,.-.gcm_gmult_p8
.globl .gcm_ghash_p8
lis r0,0xfff8
li r8,0x10
mfspr $vrsave,256
li r9,0x20
mtspr 256,r0
li r10,0x30
lvx_u $Xl,0,$Xip # load Xi
lvx_u $Hl,r8,$Htbl # load pre-computed table
le?lvsl $lemask,r0,r0
lvx_u $H, r9,$Htbl
le?vspltisb $t0,0x07
lvx_u $Hh,r10,$Htbl
le?vxor $lemask,$lemask,$t0
lvx_u $xC2,0,$Htbl
le?vperm $Xl,$Xl,$Xl,$lemask
vxor $zero,$zero,$zero
lvx_u $IN,0,$inp
addi $inp,$inp,16
subi $len,$len,16
le?vperm $IN,$IN,$IN,$lemask
vxor $IN,$IN,$Xl
b Loop
.align 5
Loop:
subic $len,$len,16
vpmsumd $Xl,$IN,$Hl # H.lo·Xi.lo
subfe. r0,r0,r0 # borrow?-1:0
vpmsumd $Xm,$IN,$H # H.hi·Xi.lo+H.lo·Xi.hi
and r0,r0,$len
vpmsumd $Xh,$IN,$Hh # H.hi·Xi.hi
add $inp,$inp,r0
vpmsumd $t2,$Xl,$xC2 # 1st phase
vsldoi $t0,$Xm,$zero,8
vsldoi $t1,$zero,$Xm,8
vxor $Xl,$Xl,$t0
vxor $Xh,$Xh,$t1
vsldoi $Xl,$Xl,$Xl,8
vxor $Xl,$Xl,$t2
lvx_u $IN,0,$inp
addi $inp,$inp,16
vsldoi $t1,$Xl,$Xl,8 # 2nd phase
vpmsumd $Xl,$Xl,$xC2
le?vperm $IN,$IN,$IN,$lemask
vxor $t1,$t1,$Xh
vxor $IN,$IN,$t1
vxor $IN,$IN,$Xl
beq Loop # did $len-=16 borrow?
vxor $Xl,$Xl,$t1
le?vperm $Xl,$Xl,$Xl,$lemask
stvx_u $Xl,0,$Xip # write out Xi
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,4,0
.long 0
.size .gcm_ghash_p8,.-.gcm_ghash_p8
.asciz "GHASH for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___
# Emit the generated code, resolving the le?/be? endian-specific
# prefixes: the prefix matching the target flavour is stripped, the
# other one turns its line into an assembler comment.
foreach (split("\n",$code)) {
	if ($flavour =~ /le$/o) {	# little-endian
		s/le\?//o		or
		s/be\?/#be#/o;
	} else {
		s/le\?/#le#/o	or
		s/be\?//o;
	}
	print $_,"\n";
}

# Output goes through a pipe to ppc-xlate.pl, so a write error may only
# surface when the buffers are flushed at close time — check for it.
close STDOUT or die "error closing STDOUT: $!";	# enforce flush
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Glue code for accelerated AES-GCM stitched implementation for ppc64le.
*
* Copyright 2022- IBM Inc. All rights reserved
*/
#include <asm/unaligned.h>
#include <asm/simd.h>
#include <asm/switch_to.h>
#include <crypto/aes.h>
#include <crypto/algapi.h>	/* was included twice; once is enough */
#include <crypto/b128ops.h>
#include <crypto/gf128mul.h>
#include <crypto/internal/simd.h>
#include <crypto/internal/aead.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/skcipher.h>
#include <crypto/scatterwalk.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/module.h>
#include <linux/types.h>

/* HWCAP2 bit number for Power10 (ISA 3.1) used by module_cpu_feature_match. */
#define	PPC_MODULE_FEATURE_P10	(32 + ilog2(PPC_FEATURE2_ARCH_3_1))
#define	PPC_ALIGN		16	/* alignment for VSX-accessed buffers */
#define	GCM_IV_SIZE		12	/* GCM nonce length in bytes */

MODULE_DESCRIPTION("PPC64le AES-GCM with Stitched implementation");
/* Fixed: the author string was missing the closing '>' of the address. */
MODULE_AUTHOR("Danny Tsen <dtsen@linux.ibm.com>");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS_CRYPTO("aes");
/* Entry points provided by the CRYPTOGAMS/IBM assembler files. */
asmlinkage int aes_p8_set_encrypt_key(const u8 *userKey, const int bits,
				      void *key);
asmlinkage void aes_p8_encrypt(const u8 *in, u8 *out, const void *key);
/* Stitched AES-CTR + GHASH over a contiguous buffer of len bytes. */
asmlinkage void aes_p10_gcm_encrypt(u8 *in, u8 *out, size_t len,
				    void *rkey, u8 *iv, void *Xi);
asmlinkage void aes_p10_gcm_decrypt(u8 *in, u8 *out, size_t len,
				    void *rkey, u8 *iv, void *Xi);
/* Build the H, H^2..H^4 power table used by the stitched code. */
asmlinkage void gcm_init_htable(unsigned char htable[256], unsigned char Xi[16]);
/* GHASH alen bytes (multiple of 16) of aad into Xi using Htable. */
asmlinkage void gcm_ghash_p8(unsigned char *Xi, unsigned char *Htable,
		unsigned char *aad, unsigned int alen);

/* Expanded AES round keys; layout shared with the assembler. */
struct aes_key {
	u8 key[AES_MAX_KEYLENGTH];
	u64 rounds;
};

/* Per-request GCM bookkeeping; field offsets are relied on by the asm. */
struct gcm_ctx {
	u8 iv[16];		/* current counter block */
	u8 ivtag[16];		/* E(K, IV|1), xored into the final tag */
	u8 aad_hash[16];	/* GHASH of the associated data */
	u64 aadLen;
	u64 Plen;	/* offset 56 - used in aes_p10_gcm_{en/de}crypt */
};

struct Hash_ctx {
	u8 H[16];	/* subkey */
	u8 Htable[256];	/* Xi, Hash table(offset 32) */
};

/* Transform context: just the expanded encryption key (CTR mode). */
struct p10_aes_gcm_ctx {
	struct aes_key enc_key;
};
/* Enter a VSX region: the vector unit must not be preempted while in use. */
static void vsx_begin(void)
{
	preempt_disable();
	enable_kernel_vsx();
}
/* Leave a VSX region opened by vsx_begin(). */
static void vsx_end(void)
{
	disable_kernel_vsx();
	preempt_enable();
}
static void set_subkey(unsigned char *hash)
{
*(u64 *)&hash[0] = be64_to_cpup((__be64 *)&hash[0]);
*(u64 *)&hash[8] = be64_to_cpup((__be64 *)&hash[8]);
}
/*
 * Compute aad if any.
 * - Hash aad and copy to Xi.
 *
 * Whole 16-byte blocks are hashed directly; a trailing partial block is
 * zero-padded (by XOR into nXi) and hashed as one final block.  The
 * result lands in both gctx->aad_hash and Xi (hash->Htable[0..15]).
 */
static void set_aad(struct gcm_ctx *gctx, struct Hash_ctx *hash,
	unsigned char *aad, int alen)
{
	int i;
	u8 nXi[16] = {0, };

	gctx->aadLen = alen;
	i = alen & ~0xf;	/* bytes contained in whole blocks */
	if (i) {
		gcm_ghash_p8(nXi, hash->Htable+32, aad, i);
		aad += i;
		alen -= i;
	}
	if (alen) {
		/* Fold the zero-padded tail into the running hash. */
		for (i = 0; i < alen; i++)
			nXi[i] ^= aad[i];
		memset(gctx->aad_hash, 0, 16);
		gcm_ghash_p8(gctx->aad_hash, hash->Htable+32, nXi, 16);
	} else {
		memcpy(gctx->aad_hash, nXi, 16);
	}

	memcpy(hash->Htable, gctx->aad_hash, 16);
}
/*
 * Initialize per-request GCM state:
 * - derive the hash subkey H = E(K, 0) (hash->H is zeroed by the
 *   caller) and build its power table at Htable+32,
 * - form J0 = IV || 0x00000001 and precompute ivtag = E(K, J0),
 * - store the counter block for the first data block (counter = 2),
 * - GHASH any associated data into Xi.
 */
static void gcmp10_init(struct gcm_ctx *gctx, u8 *iv, unsigned char *rdkey,
	struct Hash_ctx *hash, u8 *assoc, unsigned int assoclen)
{
	__be32 counter = cpu_to_be32(1);

	aes_p8_encrypt(hash->H, hash->H, rdkey);	/* H = E(K, 0^128) */
	set_subkey(hash->H);
	gcm_init_htable(hash->Htable+32, hash->H);

	*((__be32 *)(iv+12)) = counter;

	gctx->Plen = 0;

	/*
	 * Encrypt counter vector as iv tag and increment counter.
	 */
	aes_p8_encrypt(iv, gctx->ivtag, rdkey);

	counter = cpu_to_be32(2);
	*((__be32 *)(iv+12)) = counter;
	memcpy(gctx->iv, iv, 16);

	gctx->aadLen = assoclen;
	memset(gctx->aad_hash, 0, 16);
	if (assoclen)
		set_aad(gctx, hash, assoc, assoclen);
}
static void finish_tag(struct gcm_ctx *gctx, struct Hash_ctx *hash, int len)
{
int i;
unsigned char len_ac[16 + PPC_ALIGN];
unsigned char *aclen = PTR_ALIGN((void *)len_ac, PPC_ALIGN);
__be64 clen = cpu_to_be64(len << 3);
__be64 alen = cpu_to_be64(gctx->aadLen << 3);
if (len == 0 && gctx->aadLen == 0) {
memcpy(hash->Htable, gctx->ivtag, 16);
return;
}
/*
* Len is in bits.
*/
*((__be64 *)(aclen)) = alen;
*((__be64 *)(aclen+8)) = clen;
/*
* hash (AAD len and len)
*/
gcm_ghash_p8(hash->Htable, hash->Htable+32, aclen, 16);
for (i = 0; i < 16; i++)
hash->Htable[i] ^= gctx->ivtag[i];
}
/*
 * setauthsize hook: validate the requested ICV length.  GCM permits
 * tags of 4, 8 and 12..16 bytes; everything else is -EINVAL.
 * (@tfm is unused.)
 */
static int set_authsize(struct crypto_aead *tfm, unsigned int authsize)
{
	if (authsize == 4 || authsize == 8)
		return 0;
	if (authsize >= 12 && authsize <= 16)
		return 0;
	return -EINVAL;
}
/*
 * setkey hook: expand the AES key with the P8 assembler helper.  Any
 * nonzero return from the helper is reported as -EINVAL.
 */
static int p10_aes_gcm_setkey(struct crypto_aead *aead, const u8 *key,
			      unsigned int keylen)
{
	struct crypto_tfm *tfm = crypto_aead_tfm(aead);
	struct p10_aes_gcm_ctx *ctx = crypto_tfm_ctx(tfm);
	int ret;

	vsx_begin();	/* key expansion uses the vector unit */
	ret = aes_p8_set_encrypt_key(key, keylen * 8, &ctx->enc_key);
	vsx_end();

	return ret ? -EINVAL : 0;
}
/*
 * Shared body for the AEAD .encrypt/.decrypt hooks (@enc: 1 = encrypt,
 * 0 = decrypt).  Linearizes the AAD, initializes per-request GCM state
 * on the stack, walks the payload with the skcipher walk API running
 * the stitched assembler over each span, then finalizes the tag: on
 * encrypt it is appended to dst, on decrypt it is compared against the
 * tag at the end of src and -EBADMSG returned on mismatch.
 */
static int p10_aes_gcm_crypt(struct aead_request *req, int enc)
{
	struct crypto_tfm *tfm = req->base.tfm;
	struct p10_aes_gcm_ctx *ctx = crypto_tfm_ctx(tfm);
	u8 databuf[sizeof(struct gcm_ctx) + PPC_ALIGN];
	struct gcm_ctx *gctx = PTR_ALIGN((void *)databuf, PPC_ALIGN);
	u8 hashbuf[sizeof(struct Hash_ctx) + PPC_ALIGN];
	struct Hash_ctx *hash = PTR_ALIGN((void *)hashbuf, PPC_ALIGN);
	struct scatter_walk assoc_sg_walk;
	struct skcipher_walk walk;
	u8 *assocmem = NULL;
	u8 *assoc;
	unsigned int assoclen = req->assoclen;
	unsigned int cryptlen = req->cryptlen;
	unsigned char ivbuf[AES_BLOCK_SIZE+PPC_ALIGN];
	unsigned char *iv = PTR_ALIGN((void *)ivbuf, PPC_ALIGN);
	int ret;
	unsigned long auth_tag_len = crypto_aead_authsize(__crypto_aead_cast(tfm));
	u8 otag[16];
	int total_processed = 0;

	memset(databuf, 0, sizeof(databuf));
	memset(hashbuf, 0, sizeof(hashbuf));	/* gcmp10_init needs H == 0 */
	memset(ivbuf, 0, sizeof(ivbuf));
	memcpy(iv, req->iv, GCM_IV_SIZE);

	/* Linearize assoc, if not already linear */
	/*
	 * NOTE(review): this only checks the first sg entry's length, so
	 * it presumably means "whole AAD fits in the first entry"; the
	 * second conjunct makes the test true for any non-empty first
	 * entry — verify against callers with multi-entry AAD.
	 */
	if (req->src->length >= assoclen && req->src->length) {
		scatterwalk_start(&assoc_sg_walk, req->src);
		assoc = scatterwalk_map(&assoc_sg_walk);
	} else {
		gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ?
			      GFP_KERNEL : GFP_ATOMIC;

		/* assoc can be any length, so must be on heap */
		assocmem = kmalloc(assoclen, flags);
		if (unlikely(!assocmem))
			return -ENOMEM;
		assoc = assocmem;

		scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0);
	}

	vsx_begin();
	gcmp10_init(gctx, iv, (unsigned char *) &ctx->enc_key, hash, assoc, assoclen);
	vsx_end();

	/* AAD has been hashed; release the mapping or the bounce buffer. */
	if (!assocmem)
		scatterwalk_unmap(assoc);
	else
		kfree(assocmem);

	if (enc)
		ret = skcipher_walk_aead_encrypt(&walk, req, false);
	else
		ret = skcipher_walk_aead_decrypt(&walk, req, false);
	if (ret)
		return ret;

	while (walk.nbytes > 0 && ret == 0) {
		vsx_begin();
		if (enc)
			aes_p10_gcm_encrypt(walk.src.virt.addr,
					    walk.dst.virt.addr,
					    walk.nbytes,
					    &ctx->enc_key, gctx->iv, hash->Htable);
		else
			aes_p10_gcm_decrypt(walk.src.virt.addr,
					    walk.dst.virt.addr,
					    walk.nbytes,
					    &ctx->enc_key, gctx->iv, hash->Htable);
		vsx_end();

		total_processed += walk.nbytes;
		ret = skcipher_walk_done(&walk, 0);
	}
	if (ret)
		return ret;

	/* Finalize hash */
	vsx_begin();
	finish_tag(gctx, hash, total_processed);
	vsx_end();

	/* copy Xi to end of dst */
	if (enc)
		scatterwalk_map_and_copy(hash->Htable, req->dst, req->assoclen + cryptlen,
					 auth_tag_len, 1);
	else {
		/* Decrypt: fetch the transmitted tag from the end of src. */
		scatterwalk_map_and_copy(otag, req->src,
					 req->assoclen + cryptlen - auth_tag_len,
					 auth_tag_len, 0);

		/* Constant-time comparison; wipe Xi on failure. */
		if (crypto_memneq(otag, hash->Htable, auth_tag_len)) {
			memzero_explicit(hash->Htable, 16);
			return -EBADMSG;
		}
	}

	return 0;
}
/* AEAD .encrypt hook: thin wrapper around the shared crypt path. */
static int p10_aes_gcm_encrypt(struct aead_request *req)
{
	return p10_aes_gcm_crypt(req, 1);
}
/* AEAD .decrypt hook: thin wrapper around the shared crypt path. */
static int p10_aes_gcm_decrypt(struct aead_request *req)
{
	return p10_aes_gcm_crypt(req, 0);
}
/* AEAD algorithm descriptor registered with the crypto API. */
static struct aead_alg gcm_aes_alg = {
	.ivsize			= GCM_IV_SIZE,
	.maxauthsize		= 16,

	.setauthsize		= set_authsize,
	.setkey			= p10_aes_gcm_setkey,
	.encrypt		= p10_aes_gcm_encrypt,
	.decrypt		= p10_aes_gcm_decrypt,

	.base.cra_name		= "gcm(aes)",
	.base.cra_driver_name	= "p10_aes_gcm",
	.base.cra_priority	= 2100,	/* above generic and P8 drivers */
	.base.cra_blocksize	= 1,	/* stream-cipher-like (CTR) */
	.base.cra_ctxsize	= sizeof(struct p10_aes_gcm_ctx),
	.base.cra_module	= THIS_MODULE,
};
/* Module init: register the AEAD implementation. */
static int __init p10_init(void)
{
	return crypto_register_aead(&gcm_aes_alg);
}
/* Module exit: unregister the AEAD implementation. */
static void __exit p10_exit(void)
{
	crypto_unregister_aead(&gcm_aes_alg);
}

/* Only load on CPUs advertising ISA 3.1 (Power10). */
module_cpu_feature_match(PPC_MODULE_FEATURE_P10, p10_init);
module_exit(p10_exit);
/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated AES-GCM stitched implementation for ppc64le.
#
# Copyright 2022- IBM Inc. All rights reserved
#
#===================================================================================
# Written by Danny Tsen <dtsen@linux.ibm.com>
#
# GHASH is based on the Karatsuba multiplication method.
#
# Xi xor X1
#
# X1 * H^4 + X2 * H^3 + x3 * H^2 + X4 * H =
# (X1.h * H4.h + X1.l * H4.l + X1 * H4) +
# (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
# (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
# (X4.h * H.h + X4.l * H.l + X4 * H)
#
# Xi = v0
# H Poly = v2
# Hash keys = v3 - v14
# ( H.l, H, H.h)
# ( H^2.l, H^2, H^2.h)
# ( H^3.l, H^3, H^3.h)
# ( H^4.l, H^4, H^4.h)
#
# v30 is IV
# v31 - counter 1
#
# AES used,
# vs0 - vs14 for round keys
# v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
#
# This implementation uses stitched AES-GCM approach to improve overall performance.
# AES is implemented with 8x blocks and GHASH is using 2 4x blocks.
#
# ===================================================================================
#
.machine "any"
.abiversion 1
.text
# 4x loops
# v15 - v18 - input states
# vs1 - vs9 - round keys
#
# Middle AES rounds (1..9) for 4 blocks (v15-v18); round keys are read
# from vs1-vs9 four at a time via xxlor into v19-v22/v23.
.macro Loop_aes_middle4x
# rounds 1-4
xxlor 19+32, 1, 1
xxlor 20+32, 2, 2
xxlor 21+32, 3, 3
xxlor 22+32, 4, 4
vcipher 15, 15, 19
vcipher 16, 16, 19
vcipher 17, 17, 19
vcipher 18, 18, 19
vcipher 15, 15, 20
vcipher 16, 16, 20
vcipher 17, 17, 20
vcipher 18, 18, 20
vcipher 15, 15, 21
vcipher 16, 16, 21
vcipher 17, 17, 21
vcipher 18, 18, 21
vcipher 15, 15, 22
vcipher 16, 16, 22
vcipher 17, 17, 22
vcipher 18, 18, 22
# rounds 5-8
xxlor 19+32, 5, 5
xxlor 20+32, 6, 6
xxlor 21+32, 7, 7
xxlor 22+32, 8, 8
vcipher 15, 15, 19
vcipher 16, 16, 19
vcipher 17, 17, 19
vcipher 18, 18, 19
vcipher 15, 15, 20
vcipher 16, 16, 20
vcipher 17, 17, 20
vcipher 18, 18, 20
vcipher 15, 15, 21
vcipher 16, 16, 21
vcipher 17, 17, 21
vcipher 18, 18, 21
vcipher 15, 15, 22
vcipher 16, 16, 22
vcipher 17, 17, 22
vcipher 18, 18, 22
# round 9
xxlor 23+32, 9, 9
vcipher 15, 15, 23
vcipher 16, 16, 23
vcipher 17, 17, 23
vcipher 18, 18, 23
.endm
# 8x loops
# v15 - v22 - input states
# vs1 - vs9 - round keys
#
# Middle AES rounds (1..9) for 8 blocks; final/extra rounds for 192- and
# 256-bit keys are handled by the caller.
.macro Loop_aes_middle8x
# rounds 1-4
xxlor 23+32, 1, 1
xxlor 24+32, 2, 2
xxlor 25+32, 3, 3
xxlor 26+32, 4, 4
vcipher 15, 15, 23
vcipher 16, 16, 23
vcipher 17, 17, 23
vcipher 18, 18, 23
vcipher 19, 19, 23
vcipher 20, 20, 23
vcipher 21, 21, 23
vcipher 22, 22, 23
vcipher 15, 15, 24
vcipher 16, 16, 24
vcipher 17, 17, 24
vcipher 18, 18, 24
vcipher 19, 19, 24
vcipher 20, 20, 24
vcipher 21, 21, 24
vcipher 22, 22, 24
vcipher 15, 15, 25
vcipher 16, 16, 25
vcipher 17, 17, 25
vcipher 18, 18, 25
vcipher 19, 19, 25
vcipher 20, 20, 25
vcipher 21, 21, 25
vcipher 22, 22, 25
vcipher 15, 15, 26
vcipher 16, 16, 26
vcipher 17, 17, 26
vcipher 18, 18, 26
vcipher 19, 19, 26
vcipher 20, 20, 26
vcipher 21, 21, 26
vcipher 22, 22, 26
# rounds 5-8
xxlor 23+32, 5, 5
xxlor 24+32, 6, 6
xxlor 25+32, 7, 7
xxlor 26+32, 8, 8
vcipher 15, 15, 23
vcipher 16, 16, 23
vcipher 17, 17, 23
vcipher 18, 18, 23
vcipher 19, 19, 23
vcipher 20, 20, 23
vcipher 21, 21, 23
vcipher 22, 22, 23
vcipher 15, 15, 24
vcipher 16, 16, 24
vcipher 17, 17, 24
vcipher 18, 18, 24
vcipher 19, 19, 24
vcipher 20, 20, 24
vcipher 21, 21, 24
vcipher 22, 22, 24
vcipher 15, 15, 25
vcipher 16, 16, 25
vcipher 17, 17, 25
vcipher 18, 18, 25
vcipher 19, 19, 25
vcipher 20, 20, 25
vcipher 21, 21, 25
vcipher 22, 22, 25
vcipher 15, 15, 26
vcipher 16, 16, 26
vcipher 17, 17, 26
vcipher 18, 18, 26
vcipher 19, 19, 26
vcipher 20, 20, 26
vcipher 21, 21, 26
vcipher 22, 22, 26
# round 9
xxlor 23+32, 9, 9
vcipher 15, 15, 23
vcipher 16, 16, 23
vcipher 17, 17, 23
vcipher 18, 18, 23
vcipher 19, 19, 23
vcipher 20, 20, 23
vcipher 21, 21, 23
vcipher 22, 22, 23
.endm
# Middle AES rounds (1..9) for a single block in v15; round keys are
# read from vs1-vs9.
.macro Loop_aes_middle_1x
xxlor 19+32, 1, 1
xxlor 20+32, 2, 2
xxlor 21+32, 3, 3
xxlor 22+32, 4, 4
vcipher 15, 15, 19
vcipher 15, 15, 20
vcipher 15, 15, 21
vcipher 15, 15, 22
xxlor 19+32, 5, 5
xxlor 20+32, 6, 6
xxlor 21+32, 7, 7
xxlor 22+32, 8, 8
vcipher 15, 15, 19
vcipher 15, 15, 20
vcipher 15, 15, 21
vcipher 15, 15, 22
xxlor 19+32, 9, 9
vcipher 15, 15, 19
.endm
#
# Compute 4x hash values based on Karatsuba method.
#
# In:  v0 = Xi, v15-v18 = 4 input blocks, v2 = H poly,
#      v3-v14 = hash key powers (H^4 used with v15 ... H with v18).
# Out: v0 = updated Xi.  Clobbers v23-v29.
#
ppc_aes_gcm_ghash:
vxor 15, 15, 0
# low products
vpmsumd 23, 12, 15 # H4.L * X.L
vpmsumd 24, 9, 16
vpmsumd 25, 6, 17
vpmsumd 26, 3, 18
vxor 23, 23, 24
vxor 23, 23, 25
vxor 23, 23, 26 # L
# middle products
vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
vpmsumd 26, 7, 17
vpmsumd 27, 4, 18
vxor 24, 24, 25
vxor 24, 24, 26
vxor 24, 24, 27 # M
# sum hash and reduction with H Poly
vpmsumd 28, 23, 2 # reduction
vxor 29, 29, 29
vsldoi 26, 24, 29, 8 # mL
vsldoi 29, 29, 24, 8 # mH
vxor 23, 23, 26 # mL + L
vsldoi 23, 23, 23, 8 # swap
vxor 23, 23, 28
# high products
vpmsumd 24, 14, 15 # H4.H * X.H
vpmsumd 25, 11, 16
vpmsumd 26, 8, 17
vpmsumd 27, 5, 18
vxor 24, 24, 25
vxor 24, 24, 26
vxor 24, 24, 27
vxor 24, 24, 29
# sum hash and reduction with H Poly
vsldoi 27, 23, 23, 8 # swap
vpmsumd 23, 23, 2
vxor 27, 27, 24
vxor 23, 23, 27
xxlor 32, 23+32, 23+32 # update hash
blr
#
# Combine two 4x ghash
# v15 - v22 - input blocks
#
# Runs the 4x Karatsuba GHASH twice, feeding the first result as Xi
# into the second group.  In: v0 = Xi; Out: v0 = updated Xi.
# Clobbers v23-v29.
#
.macro ppc_aes_gcm_ghash2_4x
# first 4x hash
vxor 15, 15, 0 # Xi + X
vpmsumd 23, 12, 15 # H4.L * X.L
vpmsumd 24, 9, 16
vpmsumd 25, 6, 17
vpmsumd 26, 3, 18
vxor 23, 23, 24
vxor 23, 23, 25
vxor 23, 23, 26 # L
vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L
vpmsumd 26, 7, 17
vpmsumd 27, 4, 18
vxor 24, 24, 25
vxor 24, 24, 26
# sum hash and reduction with H Poly
vpmsumd 28, 23, 2 # reduction
vxor 29, 29, 29
vxor 24, 24, 27 # M
vsldoi 26, 24, 29, 8 # mL
vsldoi 29, 29, 24, 8 # mH
vxor 23, 23, 26 # mL + L
vsldoi 23, 23, 23, 8 # swap
vxor 23, 23, 28
vpmsumd 24, 14, 15 # H4.H * X.H
vpmsumd 25, 11, 16
vpmsumd 26, 8, 17
vpmsumd 27, 5, 18
vxor 24, 24, 25
vxor 24, 24, 26
vxor 24, 24, 27 # H
vxor 24, 24, 29 # H + mH
# sum hash and reduction with H Poly
vsldoi 27, 23, 23, 8 # swap
vpmsumd 23, 23, 2
vxor 27, 27, 24
vxor 27, 23, 27 # 1st Xi
# 2nd 4x hash
vpmsumd 24, 9, 20
vpmsumd 25, 6, 21
vpmsumd 26, 3, 22
vxor 19, 19, 27 # Xi + X
vpmsumd 23, 12, 19 # H4.L * X.L
vxor 23, 23, 24
vxor 23, 23, 25
vxor 23, 23, 26 # L
vpmsumd 24, 13, 19 # H4.L * X.H + H4.H * X.L
vpmsumd 25, 10, 20 # H3.L * X1.H + H3.H * X1.L
vpmsumd 26, 7, 21
vpmsumd 27, 4, 22
vxor 24, 24, 25
vxor 24, 24, 26
# sum hash and reduction with H Poly
vpmsumd 28, 23, 2 # reduction
vxor 29, 29, 29
vxor 24, 24, 27 # M
vsldoi 26, 24, 29, 8 # mL
vsldoi 29, 29, 24, 8 # mH
vxor 23, 23, 26 # mL + L
vsldoi 23, 23, 23, 8 # swap
vxor 23, 23, 28
vpmsumd 24, 14, 19 # H4.H * X.H
vpmsumd 25, 11, 20
vpmsumd 26, 8, 21
vpmsumd 27, 5, 22
vxor 24, 24, 25
vxor 24, 24, 26
vxor 24, 24, 27 # H
vxor 24, 24, 29 # H + mH
# sum hash and reduction with H Poly
vsldoi 27, 23, 23, 8 # swap
vpmsumd 23, 23, 2
vxor 27, 27, 24
vxor 23, 23, 27
xxlor 32, 23+32, 23+32 # update hash
.endm
#
# Compute update single hash
#
# GHASH one block: v28 = input block, v0 = Xi in/out, v3-v5 = H.l/H/H.h,
# v2 = H poly.  Clobbers v19-v27.
#
.macro ppc_update_hash_1x
vxor 28, 28, 0
vxor 19, 19, 19
vpmsumd 22, 3, 28 # L
vpmsumd 23, 4, 28 # M
vpmsumd 24, 5, 28 # H
# 1st reduction phase
vpmsumd 27, 22, 2 # reduction
vsldoi 25, 23, 19, 8 # mL
vsldoi 26, 19, 23, 8 # mH
vxor 22, 22, 25 # LL + LL
vxor 24, 24, 26 # HH + HH
vsldoi 22, 22, 22, 8 # swap
vxor 22, 22, 27
# 2nd reduction phase
vsldoi 20, 22, 22, 8 # swap
vpmsumd 22, 22, 2 # reduction
vxor 20, 20, 24
vxor 22, 22, 20
vmr 0, 22 # update hash
.endm
# Allocate a 640-byte frame and save the non-volatile state used by the
# stitched routines: GPRs r14-r21, VRs v20-v31 (at offset 256) and
# VSRs vs14-vs22 (at offset 464); LR is stored at 656(1).
.macro SAVE_REGS
stdu 1,-640(1)
mflr 0
std 14,112(1)
std 15,120(1)
std 16,128(1)
std 17,136(1)
std 18,144(1)
std 19,152(1)
std 20,160(1)
std 21,168(1)
li 9, 256
stvx 20, 9, 1
addi 9, 9, 16
stvx 21, 9, 1
addi 9, 9, 16
stvx 22, 9, 1
addi 9, 9, 16
stvx 23, 9, 1
addi 9, 9, 16
stvx 24, 9, 1
addi 9, 9, 16
stvx 25, 9, 1
addi 9, 9, 16
stvx 26, 9, 1
addi 9, 9, 16
stvx 27, 9, 1
addi 9, 9, 16
stvx 28, 9, 1
addi 9, 9, 16
stvx 29, 9, 1
addi 9, 9, 16
stvx 30, 9, 1
addi 9, 9, 16
stvx 31, 9, 1
stxv 14, 464(1)
stxv 15, 480(1)
stxv 16, 496(1)
stxv 17, 512(1)
stxv 18, 528(1)
stxv 19, 544(1)
stxv 20, 560(1)
stxv 21, 576(1)
stxv 22, 592(1)
std 0, 656(1)
.endm
# Restore everything stored by SAVE_REGS (in reverse: VSRs, VRs, LR,
# GPRs) and pop the 640-byte frame.
.macro RESTORE_REGS
lxv 14, 464(1)
lxv 15, 480(1)
lxv 16, 496(1)
lxv 17, 512(1)
lxv 18, 528(1)
lxv 19, 544(1)
lxv 20, 560(1)
lxv 21, 576(1)
lxv 22, 592(1)
li 9, 256
lvx 20, 9, 1
addi 9, 9, 16
lvx 21, 9, 1
addi 9, 9, 16
lvx 22, 9, 1
addi 9, 9, 16
lvx 23, 9, 1
addi 9, 9, 16
lvx 24, 9, 1
addi 9, 9, 16
lvx 25, 9, 1
addi 9, 9, 16
lvx 26, 9, 1
addi 9, 9, 16
lvx 27, 9, 1
addi 9, 9, 16
lvx 28, 9, 1
addi 9, 9, 16
lvx 29, 9, 1
addi 9, 9, 16
lvx 30, 9, 1
addi 9, 9, 16
lvx 31, 9, 1
ld 0, 656(1)
ld 14,112(1)
ld 15,120(1)
ld 16,128(1)
ld 17,136(1)
ld 18,144(1)
ld 19,152(1)
ld 20,160(1)
ld 21,168(1)
mtlr 0
addi 1, 1, 640
.endm
# Load Xi (v0), the H polynomial (v2) and the hash key powers H..H^4
# (v3-v14, each as .l/mid/.h triple) from the gcm_table pointed to by r8.
.macro LOAD_HASH_TABLE
# Load Xi
lxvb16x 32, 0, 8 # load Xi
# load Hash - h^4, h^3, h^2, h
li 10, 32
lxvd2x 2+32, 10, 8 # H Poli
li 10, 48
lxvd2x 3+32, 10, 8 # Hl
li 10, 64
lxvd2x 4+32, 10, 8 # H
li 10, 80
lxvd2x 5+32, 10, 8 # Hh
li 10, 96
lxvd2x 6+32, 10, 8 # H^2l
li 10, 112
lxvd2x 7+32, 10, 8 # H^2
li 10, 128
lxvd2x 8+32, 10, 8 # H^2h
li 10, 144
lxvd2x 9+32, 10, 8 # H^3l
li 10, 160
lxvd2x 10+32, 10, 8 # H^3
li 10, 176
lxvd2x 11+32, 10, 8 # H^3h
li 10, 192
lxvd2x 12+32, 10, 8 # H^4l
li 10, 208
lxvd2x 13+32, 10, 8 # H^4
li 10, 224
lxvd2x 14+32, 10, 8 # H^4h
.endm
#
# aes_p10_gcm_encrypt (const void *inp, void *out, size_t len,
# const char *rk, unsigned char iv[16], void *Xip);
#
# r3 - inp
# r4 - out
# r5 - len
# r6 - AES round keys
# r7 - iv and other data
# r8 - Xi, HPoli, hash keys
#
# rounds is at offset 240 in rk
# Xi is at 0 in gcm_table (Xip).
#
# Main path processes 8 blocks per iteration (AES-CTR stitched with a
# 2x4 GHASH), then single 16-byte blocks, then a masked partial block.
# Partial-block state is carried in the u64 at 56(r7) (gcm_ctx.Plen).
#
.global aes_p10_gcm_encrypt
.align 5
aes_p10_gcm_encrypt:
SAVE_REGS
LOAD_HASH_TABLE
# initialize ICB: GHASH( IV ), IV - r7
lxvb16x 30+32, 0, 7 # load IV - v30
mr 12, 5 # length
li 11, 0 # block index
# counter 1
vxor 31, 31, 31
vspltisb 22, 1
vsldoi 31, 31, 22,1 # counter 1
# load round key to VSR
lxv 0, 0(6)
lxv 1, 0x10(6)
lxv 2, 0x20(6)
lxv 3, 0x30(6)
lxv 4, 0x40(6)
lxv 5, 0x50(6)
lxv 6, 0x60(6)
lxv 7, 0x70(6)
lxv 8, 0x80(6)
lxv 9, 0x90(6)
lxv 10, 0xa0(6)
# load rounds - 10 (128), 12 (192), 14 (256)
lwz 9,240(6)
#
# vxor state, state, w # addroundkey
xxlor 32+29, 0, 0
vxor 15, 30, 29 # IV + round key - add round key 0
cmpdi 9, 10
beq Loop_aes_gcm_8x
# load 2 more round keys (v11, v12)
lxv 11, 0xb0(6)
lxv 12, 0xc0(6)
cmpdi 9, 12
beq Loop_aes_gcm_8x
# load 2 more round keys (v11, v12, v13, v14)
lxv 13, 0xd0(6)
lxv 14, 0xe0(6)
cmpdi 9, 14
beq Loop_aes_gcm_8x
b aes_gcm_out
.align 5
Loop_aes_gcm_8x:
mr 14, 3
mr 9, 4
#
# check partial block
#
Continue_partial_check:
ld 15, 56(7)
cmpdi 15, 0
beq Continue
bgt Final_block
cmpdi 15, 16
blt Final_block
Continue:
# n blocks
li 10, 128
divdu 10, 12, 10 # n 128 bytes-blocks
cmpdi 10, 0
beq Loop_last_block
# prepare 8 counter blocks, pre-xored with round key 0 (v29)
vaddudm 30, 30, 31 # IV + counter
vxor 16, 30, 29
vaddudm 30, 30, 31
vxor 17, 30, 29
vaddudm 30, 30, 31
vxor 18, 30, 29
vaddudm 30, 30, 31
vxor 19, 30, 29
vaddudm 30, 30, 31
vxor 20, 30, 29
vaddudm 30, 30, 31
vxor 21, 30, 29
vaddudm 30, 30, 31
vxor 22, 30, 29
mtctr 10
# byte offsets of blocks 1-7 within a 128-byte chunk
li 15, 16
li 16, 32
li 17, 48
li 18, 64
li 19, 80
li 20, 96
li 21, 112
lwz 10, 240(6)
Loop_8x_block:
lxvb16x 15, 0, 14 # load block
lxvb16x 16, 15, 14 # load block
lxvb16x 17, 16, 14 # load block
lxvb16x 18, 17, 14 # load block
lxvb16x 19, 18, 14 # load block
lxvb16x 20, 19, 14 # load block
lxvb16x 21, 20, 14 # load block
lxvb16x 22, 21, 14 # load block
addi 14, 14, 128
Loop_aes_middle8x
xxlor 23+32, 10, 10
cmpdi 10, 10
beq Do_next_ghash
# 192 bits
xxlor 24+32, 11, 11
vcipher 15, 15, 23
vcipher 16, 16, 23
vcipher 17, 17, 23
vcipher 18, 18, 23
vcipher 19, 19, 23
vcipher 20, 20, 23
vcipher 21, 21, 23
vcipher 22, 22, 23
vcipher 15, 15, 24
vcipher 16, 16, 24
vcipher 17, 17, 24
vcipher 18, 18, 24
vcipher 19, 19, 24
vcipher 20, 20, 24
vcipher 21, 21, 24
vcipher 22, 22, 24
xxlor 23+32, 12, 12
cmpdi 10, 12
beq Do_next_ghash
# 256 bits
xxlor 24+32, 13, 13
vcipher 15, 15, 23
vcipher 16, 16, 23
vcipher 17, 17, 23
vcipher 18, 18, 23
vcipher 19, 19, 23
vcipher 20, 20, 23
vcipher 21, 21, 23
vcipher 22, 22, 23
vcipher 15, 15, 24
vcipher 16, 16, 24
vcipher 17, 17, 24
vcipher 18, 18, 24
vcipher 19, 19, 24
vcipher 20, 20, 24
vcipher 21, 21, 24
vcipher 22, 22, 24
xxlor 23+32, 14, 14
cmpdi 10, 14
beq Do_next_ghash
b aes_gcm_out
Do_next_ghash:
#
# last round
vcipherlast 15, 15, 23
vcipherlast 16, 16, 23
# xor keystream with plaintext and store ciphertext
xxlxor 47, 47, 15
stxvb16x 47, 0, 9 # store output
xxlxor 48, 48, 16
stxvb16x 48, 15, 9 # store output
vcipherlast 17, 17, 23
vcipherlast 18, 18, 23
xxlxor 49, 49, 17
stxvb16x 49, 16, 9 # store output
xxlxor 50, 50, 18
stxvb16x 50, 17, 9 # store output
vcipherlast 19, 19, 23
vcipherlast 20, 20, 23
xxlxor 51, 51, 19
stxvb16x 51, 18, 9 # store output
xxlxor 52, 52, 20
stxvb16x 52, 19, 9 # store output
vcipherlast 21, 21, 23
vcipherlast 22, 22, 23
xxlxor 53, 53, 21
stxvb16x 53, 20, 9 # store output
xxlxor 54, 54, 22
stxvb16x 54, 21, 9 # store output
addi 9, 9, 128
# ghash here
# hash the 8 ciphertext blocks (still in v15-v22 after the xor)
ppc_aes_gcm_ghash2_4x
# set up the next 8 counter blocks
xxlor 27+32, 0, 0
vaddudm 30, 30, 31 # IV + counter
vmr 29, 30
vxor 15, 30, 27 # add round key
vaddudm 30, 30, 31
vxor 16, 30, 27
vaddudm 30, 30, 31
vxor 17, 30, 27
vaddudm 30, 30, 31
vxor 18, 30, 27
vaddudm 30, 30, 31
vxor 19, 30, 27
vaddudm 30, 30, 31
vxor 20, 30, 27
vaddudm 30, 30, 31
vxor 21, 30, 27
vaddudm 30, 30, 31
vxor 22, 30, 27
addi 12, 12, -128
addi 11, 11, 128
bdnz Loop_8x_block
vmr 30, 29
stxvb16x 30+32, 0, 7 # update IV
Loop_last_block:
cmpdi 12, 0
beq aes_gcm_out
# loop last few blocks
li 10, 16
divdu 10, 12, 10
mtctr 10
lwz 10, 240(6)
cmpdi 12, 16
blt Final_block
Next_rem_block:
lxvb16x 15, 0, 14 # load block
Loop_aes_middle_1x
xxlor 23+32, 10, 10
cmpdi 10, 10
beq Do_next_1x
# 192 bits
xxlor 24+32, 11, 11
vcipher 15, 15, 23
vcipher 15, 15, 24
xxlor 23+32, 12, 12
cmpdi 10, 12
beq Do_next_1x
# 256 bits
xxlor 24+32, 13, 13
vcipher 15, 15, 23
vcipher 15, 15, 24
xxlor 23+32, 14, 14
cmpdi 10, 14
beq Do_next_1x
# falls through for 14 rounds
Do_next_1x:
vcipherlast 15, 15, 23
xxlxor 47, 47, 15
stxvb16x 47, 0, 9 # store output
addi 14, 14, 16
addi 9, 9, 16
# hash the single ciphertext block
vmr 28, 15
ppc_update_hash_1x
addi 12, 12, -16
addi 11, 11, 16
xxlor 19+32, 0, 0
vaddudm 30, 30, 31 # IV + counter
vxor 15, 30, 19 # add round key
bdnz Next_rem_block
li 15, 0
std 15, 56(7) # clear partial?
stxvb16x 30+32, 0, 7 # update IV
cmpdi 12, 0
beq aes_gcm_out
Final_block:
# fewer than 16 bytes left: encrypt one masked partial block
lwz 10, 240(6)
Loop_aes_middle_1x
xxlor 23+32, 10, 10
cmpdi 10, 10
beq Do_final_1x
# 192 bits
xxlor 24+32, 11, 11
vcipher 15, 15, 23
vcipher 15, 15, 24
xxlor 23+32, 12, 12
cmpdi 10, 12
beq Do_final_1x
# 256 bits
xxlor 24+32, 13, 13
vcipher 15, 15, 23
vcipher 15, 15, 24
xxlor 23+32, 14, 14
cmpdi 10, 14
beq Do_final_1x
Do_final_1x:
vcipherlast 15, 15, 23
# check partial block
li 21, 0 # encrypt
ld 15, 56(7) # partial?
cmpdi 15, 0
beq Normal_block
bl Do_partial_block
cmpdi 12, 0
ble aes_gcm_out
b Continue_partial_check
Normal_block:
lxvb16x 15, 0, 14 # load last block
xxlxor 47, 47, 15
# create partial block mask
li 15, 16
sub 15, 15, 12 # index to the mask
vspltisb 16, -1 # first 16 bytes - 0xffff...ff
vspltisb 17, 0 # second 16 bytes - 0x0000...00
li 10, 192
stvx 16, 10, 1
addi 10, 10, 16
stvx 17, 10, 1
addi 10, 1, 192
lxvb16x 16, 15, 10 # load partial block mask
xxland 47, 47, 16
# hash the masked final block
vmr 28, 15
ppc_update_hash_1x
# * should store only the remaining bytes.
bl Write_partial_block
stxvb16x 30+32, 0, 7 # update IV
std 12, 56(7) # update partial?
li 16, 16
stxvb16x 32, 0, 8 # write out Xi
stxvb16x 32, 16, 8 # write out Xi
b aes_gcm_out
#
# Compute data mask
#
# Builds a byte mask in \_mask that is 0x00 for the first \_start
# bytes, 0xff for the next \_end bytes, 0x00 after; uses the scratch
# area at 192(r1).  Clobbers v16, v17, r10.
#
.macro GEN_MASK _mask _start _end
vspltisb 16, -1 # first 16 bytes - 0xffff...ff
vspltisb 17, 0 # second 16 bytes - 0x0000...00
li 10, 192
stxvb16x 17+32, 10, 1
add 10, 10, \_start
stxvb16x 16+32, 10, 1
add 10, 10, \_end
stxvb16x 17+32, 10, 1
addi 10, 1, 192
lxvb16x \_mask, 0, 10 # load partial block mask
.endm
#
# Handle multiple partial blocks for encrypt and decrypt
# operations.
#
# In: r15 = bytes already in the partial block, r5/r12 = lengths,
#     v15 = keystream block, r21 = 0 for encrypt / nonzero for decrypt.
# Continues an unfinished 16-byte block across calls, updating Xi,
# the IV and the partial count at 56(r7).
#
Do_partial_block:
add 17, 15, 5
cmpdi 17, 16
bgt Big_block
GEN_MASK 18, 15, 5
b _Partial
Big_block:
li 16, 16
GEN_MASK 18, 15, 16
_Partial:
lxvb16x 17+32, 0, 14 # load last block
# shift the input right by the existing partial length
sldi 16, 15, 3
mtvsrdd 32+16, 0, 16
vsro 17, 17, 16
xxlxor 47, 47, 17+32
xxland 47, 47, 18
vxor 0, 0, 0 # clear Xi
vmr 28, 15
cmpdi 21, 0 # encrypt/decrypt ops?
beq Skip_decrypt
# for decrypt, hash the masked ciphertext input instead
xxland 32+28, 32+17, 18
Skip_decrypt:
ppc_update_hash_1x
# fold the new partial hash into the stored Xi copy
li 16, 16
lxvb16x 32+29, 16, 8
vxor 0, 0, 29
stxvb16x 32, 0, 8 # save Xi
stxvb16x 32, 16, 8 # save Xi
# store partial block
# loop the rest of the stream if any
sldi 16, 15, 3
mtvsrdd 32+16, 0, 16
vslo 15, 15, 16
#stxvb16x 15+32, 0, 9 # last block
li 16, 16
sub 17, 16, 15 # 16 - partial
add 16, 15, 5
cmpdi 16, 16
bgt Larger_16
mr 17, 5
Larger_16:
# write partial
li 10, 192
stxvb16x 15+32, 10, 1 # save current block
# byte-by-byte copy of r17 output bytes
addi 10, 9, -1
addi 16, 1, 191
mtctr 17 # move partial byte count
Write_last_partial:
lbzu 18, 1(16)
stbu 18, 1(10)
bdnz Write_last_partial
# Complete loop partial
add 14, 14, 17
add 9, 9, 17
sub 12, 12, 17
add 11, 11, 17
add 15, 15, 5
cmpdi 15, 16
blt Save_partial
# the 16-byte block is complete: advance the counter, reset partial
vaddudm 30, 30, 31
stxvb16x 30+32, 0, 7 # update IV
xxlor 32+29, 0, 0
vxor 15, 30, 29 # IV + round key - add round key 0
li 15, 0
std 15, 56(7) # partial done - clear
b Partial_done
Save_partial:
std 15, 56(7) # partial
Partial_done:
blr
#
# Write partial block
# r9 - output
# r12 - remaining bytes
# v15 - partial input data
#
# Stages v15 at 192(r1) and byte-copies r12 bytes to the output buffer.
# NOTE(review): clobbers r10, r14, r15, r16 and ctr -- callers must not
# rely on those afterwards (r14 is the input pointer elsewhere).
Write_partial_block:
li 10, 192
stxvb16x 15+32, 10, 1 # last block
# pre-increment copy loop, hence the -1 / 191 pointer bias
addi 10, 9, -1
addi 16, 1, 191
mtctr 12 # remaining bytes
li 15, 0
Write_last_byte:
lbzu 14, 1(16)
stbu 14, 1(10)
bdnz Write_last_byte
blr
# Common exit path: store the GHASH state (v0) to the Xi buffer (r8),
# return the number of bytes processed (blocks done r11 + tail r12)
# in r3, and restore the non-volatile registers saved by SAVE_REGS.
aes_gcm_out:
# out = state
stxvb16x 32, 0, 8 # write out Xi
add 3, 11, 12 # return count
RESTORE_REGS
blr
#
# 8x Decrypt
#
# aes_p10_gcm_decrypt -- entry point for the 8x-stitched AES/GCM decrypt.
# Register use (inferred from the loads/stores below, mirroring encrypt):
#   r3 - input, r4 - output, r5 - length in bytes
#   r6 - expanded round keys, round count at 240(r6)
#   r7 - IV/ICB state, partial-block count at 56(r7)
#   r8 - Xi (GHASH accumulator)
.global aes_p10_gcm_decrypt
.align 5
aes_p10_gcm_decrypt:
SAVE_REGS
LOAD_HASH_TABLE
# initialize ICB: GHASH( IV ), IV - r7
lxvb16x 30+32, 0, 7 # load IV - v30
mr 12, 5 # length
li 11, 0 # block index
# counter 1
vxor 31, 31, 31
vspltisb 22, 1
vsldoi 31, 31, 22,1 # counter 1
# load round key to VSR
lxv 0, 0(6)
lxv 1, 0x10(6)
lxv 2, 0x20(6)
lxv 3, 0x30(6)
lxv 4, 0x40(6)
lxv 5, 0x50(6)
lxv 6, 0x60(6)
lxv 7, 0x70(6)
lxv 8, 0x80(6)
lxv 9, 0x90(6)
lxv 10, 0xa0(6)
# load rounds - 10 (128), 12 (192), 14 (256)
lwz 9,240(6)
#
# vxor state, state, w # addroundkey
xxlor 32+29, 0, 0
vxor 15, 30, 29 # IV + round key - add round key 0
# pick the key schedule length by round count; any other value
# falls through to aes_gcm_out without processing
cmpdi 9, 10
beq Loop_aes_gcm_8x_dec
# load 2 more round keys (v11, v12)
lxv 11, 0xb0(6)
lxv 12, 0xc0(6)
cmpdi 9, 12
beq Loop_aes_gcm_8x_dec
# load 2 more round keys (v11, v12, v13, v14)
lxv 13, 0xd0(6)
lxv 14, 0xe0(6)
cmpdi 9, 14
beq Loop_aes_gcm_8x_dec
b aes_gcm_out
.align 5
# Main decrypt loop: processes 8 blocks (128 bytes) per iteration in
# CTR mode, then folds the ciphertext into GHASH 8 blocks at a time.
Loop_aes_gcm_8x_dec:
mr 14, 3
mr 9, 4
#
# check partial block
#
Continue_partial_check_dec:
ld 15, 56(7)
cmpdi 15, 0
beq Continue_dec
bgt Final_block_dec
# NOTE(review): the partial count is 0..15, so any non-zero value is
# caught by the bgt above; this signed compare/blt looks unreachable --
# kept as-is, matches the encrypt path.
cmpdi 15, 16
blt Final_block_dec
Continue_dec:
# n blcoks
li 10, 128
divdu 10, 12, 10 # n 128 bytes-blocks
cmpdi 10, 0
beq Loop_last_block_dec
# pre-compute 8 counter blocks (v15..v22), each already XORed with
# round key 0 held in v29
vaddudm 30, 30, 31 # IV + counter
vxor 16, 30, 29
vaddudm 30, 30, 31
vxor 17, 30, 29
vaddudm 30, 30, 31
vxor 18, 30, 29
vaddudm 30, 30, 31
vxor 19, 30, 29
vaddudm 30, 30, 31
vxor 20, 30, 29
vaddudm 30, 30, 31
vxor 21, 30, 29
vaddudm 30, 30, 31
vxor 22, 30, 29
mtctr 10
# byte offsets for the 8-block loads/stores
li 15, 16
li 16, 32
li 17, 48
li 18, 64
li 19, 80
li 20, 96
li 21, 112
# r10 now holds the round count again (ctr holds the loop count)
lwz 10, 240(6)
Loop_8x_block_dec:
# load 8 ciphertext blocks into VSR 15..22
lxvb16x 15, 0, 14 # load block
lxvb16x 16, 15, 14 # load block
lxvb16x 17, 16, 14 # load block
lxvb16x 18, 17, 14 # load block
lxvb16x 19, 18, 14 # load block
lxvb16x 20, 19, 14 # load block
lxvb16x 21, 20, 14 # load block
lxvb16x 22, 21, 14 # load block
addi 14, 14, 128
# rounds 1..9 for all 8 counter blocks
Loop_aes_middle8x
xxlor 23+32, 10, 10
cmpdi 10, 10
beq Do_next_ghash_dec
# 192 bits
xxlor 24+32, 11, 11
vcipher 15, 15, 23
vcipher 16, 16, 23
vcipher 17, 17, 23
vcipher 18, 18, 23
vcipher 19, 19, 23
vcipher 20, 20, 23
vcipher 21, 21, 23
vcipher 22, 22, 23
vcipher 15, 15, 24
vcipher 16, 16, 24
vcipher 17, 17, 24
vcipher 18, 18, 24
vcipher 19, 19, 24
vcipher 20, 20, 24
vcipher 21, 21, 24
vcipher 22, 22, 24
xxlor 23+32, 12, 12
cmpdi 10, 12
beq Do_next_ghash_dec
# 256 bits
xxlor 24+32, 13, 13
vcipher 15, 15, 23
vcipher 16, 16, 23
vcipher 17, 17, 23
vcipher 18, 18, 23
vcipher 19, 19, 23
vcipher 20, 20, 23
vcipher 21, 21, 23
vcipher 22, 22, 23
vcipher 15, 15, 24
vcipher 16, 16, 24
vcipher 17, 17, 24
vcipher 18, 18, 24
vcipher 19, 19, 24
vcipher 20, 20, 24
vcipher 21, 21, 24
vcipher 22, 22, 24
xxlor 23+32, 14, 14
cmpdi 10, 14
beq Do_next_ghash_dec
# unexpected round count: bail out
b aes_gcm_out
Do_next_ghash_dec:
#
# last round
# finish the keystream and XOR it with the loaded ciphertext
# (still in VSR 47..54) to produce plaintext output
vcipherlast 15, 15, 23
vcipherlast 16, 16, 23
xxlxor 47, 47, 15
stxvb16x 47, 0, 9 # store output
xxlxor 48, 48, 16
stxvb16x 48, 15, 9 # store output
vcipherlast 17, 17, 23
vcipherlast 18, 18, 23
xxlxor 49, 49, 17
stxvb16x 49, 16, 9 # store output
xxlxor 50, 50, 18
stxvb16x 50, 17, 9 # store output
vcipherlast 19, 19, 23
vcipherlast 20, 20, 23
xxlxor 51, 51, 19
stxvb16x 51, 18, 9 # store output
xxlxor 52, 52, 20
stxvb16x 52, 19, 9 # store output
vcipherlast 21, 21, 23
vcipherlast 22, 22, 23
xxlxor 53, 53, 21
stxvb16x 53, 20, 9 # store output
xxlxor 54, 54, 22
stxvb16x 54, 21, 9 # store output
addi 9, 9, 128
# decrypt hashes the ciphertext: copy the saved input blocks
# (VSR 15..22) back into v15..v22 for GHASH
xxlor 15+32, 15, 15
xxlor 16+32, 16, 16
xxlor 17+32, 17, 17
xxlor 18+32, 18, 18
xxlor 19+32, 19, 19
xxlor 20+32, 20, 20
xxlor 21+32, 21, 21
xxlor 22+32, 22, 22
# ghash here
ppc_aes_gcm_ghash2_4x
# refresh the next 8 counter blocks; v29 keeps the last counter so
# the IV can be rolled back to it after the loop
xxlor 27+32, 0, 0
vaddudm 30, 30, 31 # IV + counter
vmr 29, 30
vxor 15, 30, 27 # add round key
vaddudm 30, 30, 31
vxor 16, 30, 27
vaddudm 30, 30, 31
vxor 17, 30, 27
vaddudm 30, 30, 31
vxor 18, 30, 27
vaddudm 30, 30, 31
vxor 19, 30, 27
vaddudm 30, 30, 31
vxor 20, 30, 27
vaddudm 30, 30, 31
vxor 21, 30, 27
vaddudm 30, 30, 31
vxor 22, 30, 27
addi 12, 12, -128
addi 11, 11, 128
bdnz Loop_8x_block_dec
vmr 30, 29
stxvb16x 30+32, 0, 7 # update IV
# Process the remaining whole 16-byte blocks one at a time.
Loop_last_block_dec:
cmpdi 12, 0
beq aes_gcm_out
# loop last few blocks
li 10, 16
divdu 10, 12, 10
mtctr 10
# r10 re-used for the round count from here on
lwz 10, 240(6)
cmpdi 12, 16
blt Final_block_dec
Next_rem_block_dec:
lxvb16x 15, 0, 14 # load block
# rounds 1..9 on the single counter block in v15
Loop_aes_middle_1x
xxlor 23+32, 10, 10
cmpdi 10, 10
beq Do_next_1x_dec
# 192 bits
xxlor 24+32, 11, 11
vcipher 15, 15, 23
vcipher 15, 15, 24
xxlor 23+32, 12, 12
cmpdi 10, 12
beq Do_next_1x_dec
# 256 bits
xxlor 24+32, 13, 13
vcipher 15, 15, 23
vcipher 15, 15, 24
xxlor 23+32, 14, 14
cmpdi 10, 14
beq Do_next_1x_dec
Do_next_1x_dec:
vcipherlast 15, 15, 23
# plaintext = keystream XOR ciphertext (ciphertext still in VSR 15)
xxlxor 47, 47, 15
stxvb16x 47, 0, 9 # store output
addi 14, 14, 16
addi 9, 9, 16
# hash the ciphertext block (v28) rather than the plaintext
xxlor 28+32, 15, 15
#vmr 28, 15
ppc_update_hash_1x
addi 12, 12, -16
addi 11, 11, 16
# next counter block, pre-XORed with round key 0
xxlor 19+32, 0, 0
vaddudm 30, 30, 31 # IV + counter
vxor 15, 30, 19 # add round key
bdnz Next_rem_block_dec
li 15, 0
std 15, 56(7) # clear partial?
stxvb16x 30+32, 0, 7 # update IV
cmpdi 12, 0
beq aes_gcm_out
# Handle the final sub-16-byte tail (and any buffered partial block).
Final_block_dec:
lwz 10, 240(6)
Loop_aes_middle_1x
xxlor 23+32, 10, 10
cmpdi 10, 10
beq Do_final_1x_dec
# 192 bits
xxlor 24+32, 11, 11
vcipher 15, 15, 23
vcipher 15, 15, 24
xxlor 23+32, 12, 12
cmpdi 10, 12
beq Do_final_1x_dec
# 256 bits
xxlor 24+32, 13, 13
vcipher 15, 15, 23
vcipher 15, 15, 24
xxlor 23+32, 14, 14
cmpdi 10, 14
beq Do_final_1x_dec
Do_final_1x_dec:
vcipherlast 15, 15, 23
# check partial block
li 21, 1 # decrypt
ld 15, 56(7) # partial?
cmpdi 15, 0
beq Normal_block_dec
# continue an existing partial block, then resume the main loop if
# bytes remain
bl Do_partial_block
cmpdi 12, 0
ble aes_gcm_out
b Continue_partial_check_dec
Normal_block_dec:
lxvb16x 15, 0, 14 # load last block
xxlxor 47, 47, 15
# create partial block mask
li 15, 16
sub 15, 15, 12 # index to the mask
vspltisb 16, -1 # first 16 bytes - 0xffff...ff
vspltisb 17, 0 # second 16 bytes - 0x0000...00
# build 0xff..ff00..00 at 192(r1) and load at the byte offset so the
# mask keeps only the first r12 valid bytes
li 10, 192
stvx 16, 10, 1
addi 10, 10, 16
stvx 17, 10, 1
addi 10, 1, 192
lxvb16x 16, 15, 10 # load partial block mask
xxland 47, 47, 16
# decrypt: hash the masked ciphertext (VSR 15), not the plaintext
xxland 32+28, 15, 16
#vmr 28, 15
ppc_update_hash_1x
# * should store only the remaining bytes.
bl Write_partial_block
stxvb16x 30+32, 0, 7 # update IV
std 12, 56(7) # update partial?
li 16, 16
stxvb16x 32, 0, 8 # write out Xi
stxvb16x 32, 16, 8 # write out Xi
b aes_gcm_out
#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0
# PowerPC assembler distiller by <appro>.
#
# Usage: ppc-xlate.pl <flavour> <output>
#   Reads perlasm-style PowerPC source from ARGV/STDIN and writes
#   flavour-specific assembler (linux/aix/osx, 32/64, le/be) to <output>.
my $flavour = shift;   # target flavour string, e.g. "linux64le"
my $output  = shift;   # destination file; STDOUT is redirected into it

# Bug fix: the original `open STDOUT,">$output" || die ...` never died on
# failure -- `||` binds to the filename string, not to open().  Use
# three-arg open (no mode injection via the filename) with low-precedence
# `or` so a failed open actually aborts.
open STDOUT, '>', $output or die "can't open $output: $!";

my %GLOBALS;                     # original symbol name -> mangled name
my $dotinlocallabels = ($flavour =~ /linux/) ? 1 : 0;   # Lfoo -> .Lfoo
################################################################
# directives which need special treatment on different platforms
################################################################
# .globl handler: mangle the symbol per platform ABI and record the
# mapping in %GLOBALS (keyed by the *unmangled* name) for later use.
my $globl = sub {
    my ($dir, $name) = @_;            # the directive itself is ignored
    my $slot = \$GLOBALS{$name};      # record under the original name
    (my $sym = $name) =~ s|^[\.\_]||; # strip one leading dot/underscore
    my $out;
    if ($flavour =~ /aix/) {
        $sym = ".$sym";
    }
    elsif ($flavour =~ /osx/) {
        $sym = "_$sym";
    }
    elsif ($flavour =~ /linux/) {
        $out = "_GLOBAL($sym)";
    }
    $out = ".globl $sym\nalign 5\n$sym:" unless $out;
    $$slot = $sym;
    $out;
};
# .text handler: AIX uses a csect; 64-bit LE Linux additionally needs
# the ELFv2 .abiversion marker.
my $text = sub {
    my $out = ($flavour =~ /aix/) ? ".csect\t.text[PR],7" : ".text";
    if ($flavour =~ /linux.*64le/) {
        $out = ".abiversion 2\n" . $out;
    }
    $out;
};
# .machine handler: the OS X assembler wants a concrete CPU instead of
# "any", and the quotes stripped.
my $machine = sub {
    my ($dir, $arch) = @_;
    if ($flavour =~ /osx/) {
        $arch =~ s/\"//g;
        if ($arch eq "any") {
            $arch = ($flavour =~ /64/) ? "ppc970-64" : "ppc970";
        }
    }
    ".machine $arch";
};
# .size handler: only ELF (Linux) uses it; 64-bit ABIs also get a size
# record for the dot-prefixed entry symbol.
my $size = sub {
    return "" unless $flavour =~ /linux/;
    shift;
    (my $name = shift) =~ s|^[\.\_]||;
    my $dot = ($flavour =~ /64$/) ? "." : "";
    my $out = ".size $name,.-" . $dot . $name;
    $out .= "\n.size .$name,.-.$name" if $flavour =~ /64$/;
    $out;
};
# .asciz handler: expand a quoted string to NUL-terminated .byte data;
# anything that is not a single quoted string collapses to nothing.
my $asciz = sub {
    shift;
    my $text = join(",", @_);
    return "" unless $text =~ /^"(.*)"$/;
    ".byte " . join(",", unpack("C*", $1), 0) . "\n.align 2";
};
# .quad handler: split 64-bit constants into endianness-ordered .long
# pairs (32-bit assemblers choke on .quad); symbolic operands are kept
# as .quad and left for the assembler to resolve.
my $quad = sub {
    shift;
    my @out;
    for my $val (@_) {
        my ($hi, $lo);
        if ($val =~ /^0x([0-9a-f]*?)([0-9a-f]{1,8})$/io) {
            $hi = $1 ? "0x$1" : "0";
            $lo = "0x$2";
        }
        elsif ($val =~ /^([0-9]+)$/o) {
            $hi = $1 >> 32;             # error-prone with 32-bit perl
            $lo = $1 & 0xffffffff;
        }
        else {
            ($hi, $lo) = (undef, $val); # symbolic operand
        }
        if (defined($hi)) {
            push @out, ($flavour =~ /le$/o) ? ".long\t$lo,$hi"
                                            : ".long\t$hi,$lo";
        }
        else {
            push @out, ".quad $lo";
        }
    }
    join("\n", @out);
};
################################################################
# simplified mnemonics not handled by at least one assembler
################################################################
# cmplw: raw-encode for old 32-bit GNU as, pass through elsewhere.
my $cmplw = sub {
    my $f  = shift;
    my $cr = ($#_ > 1) ? shift : 0;   # optional leading CR field
    if ($flavour =~ /linux.*32/) {
        # Some out-of-date 32-bit GNU assembler just can't handle cmplw...
        return " .long " . sprintf "0x%x", 31<<26|$cr<<23|$_[0]<<16|$_[1]<<11|64;
    }
    " cmplw " . join(',', $cr, @_);
};
# bdnz: only overridden for non-Linux flavours (GNU as handles the
# mnemonic natively, so on Linux the handler must stay undef and the
# line passes through untranslated).
#
# Fix: the original `my $bdnz = sub {...} if (cond);` relied on a `my`
# declaration with a statement modifier, which perlsyn documents as
# having undefined behaviour.  Declare unconditionally and assign under
# the condition -- $bdnz is deterministically undef on Linux, exactly
# the behaviour the eval-based mnemonic dispatch expects.
my $bdnz;
if ($flavour !~ /linux/) {
    $bdnz = sub {
        my $f = shift;
        my $bo = $f=~/[\+\-]/ ? 16+9 : 16; # optional "to be taken" hint
        " bc $bo,0,".shift;
    };
}
# Conditional branch-to-LR forms carrying static prediction hints.
# GNU as rejects the most recent hint syntax, so on Linux the raw
# instruction word is emitted instead of the mnemonic.
my $bltlr = sub {
    my $f  = shift;
    my $bo = ($f =~ /\-/) ? 12+2 : 12;   # optional "not to be taken" hint
    return " .long " . sprintf "0x%x", 19<<26|$bo<<21|16<<1
        if $flavour =~ /linux/;          # GNU as doesn't allow most recent hints
    " bclr $bo,0";
};
my $bnelr = sub {
    my $f  = shift;
    my $bo = ($f =~ /\-/) ? 4+2 : 4;     # optional "not to be taken" hint
    return " .long " . sprintf "0x%x", 19<<26|$bo<<21|2<<16|16<<1
        if $flavour =~ /linux/;          # GNU as doesn't allow most recent hints
    " bclr $bo,2";
};
my $beqlr = sub {
    my $f  = shift;
    my $bo = ($f =~ /-/) ? 12+2 : 12;    # optional "not to be taken" hint
    return " .long " . sprintf "0x%X", 19<<26|$bo<<21|2<<16|16<<1
        if $flavour =~ /linux/;          # GNU as doesn't allow most recent hints
    " bclr $bo,2";
};
# GNU assembler can't handle extrdi rA,rS,16,48, or when sum of last two
# arguments is 64, with "operand out of range" error.
# Emit the canonical rldicl form that extrdi abbreviates instead.
my $extrdi = sub {
    my ($f, $ra, $rs, $n, $b) = @_;
    my $shift = ($b + $n) & 63;
    my $mbits = 64 - $n;
    " rldicl $ra,$rs,$shift,$mbits";
};
# vmr: expressed via vor dst,src,src for assemblers missing the alias.
my $vmr = sub {
    my ($f, $dst, $src) = @_;
    " vor $dst,$src,$src";
};
# Some ABIs specify vrsave, special-purpose register #256, as reserved
# for system use.
my $no_vrsave = ($flavour =~ /linux-ppc64le/);
# mtspr: on ABIs where vrsave is reserved, replace writes to SPR 256
# with a no-op (or rA,rA,rA); otherwise pass through.
my $mtspr = sub {
    my ($f, $idx, $ra) = @_;
    return " or $ra,$ra,$ra" if $idx == 256 && $no_vrsave;
    " mtspr $idx,$ra";
};
# mfspr: reads of the reserved vrsave yield an all-ones value instead.
my $mfspr = sub {
    my ($f, $rd, $idx) = @_;
    return " li $rd,-1" if $idx == 256 && $no_vrsave;
    " mfspr $rd,$idx";
};
# PowerISA 2.06 stuff
# Raw-encode an X-form VSX memory access: primary opcode 31, the
# VRT/RA/RB register fields, and the extended opcode doubled with the
# low bit set ($op*2+1).
sub vsxmem_op {
    my ($f, $vrt, $ra, $rb, $op) = @_;
    my $word = (31<<26) | ($vrt<<21) | ($ra<<16) | ($rb<<11) | ($op*2+1);
    " .long " . sprintf "0x%X", $word;
}
# made-up unaligned memory reference AltiVec/VMX instructions
my $lvx_u   = sub { vsxmem_op(@_, 844); };  # lxvd2x
my $stvx_u  = sub { vsxmem_op(@_, 972); };  # stxvd2x
my $lvdx_u  = sub { vsxmem_op(@_, 588); };  # lxsdx
my $stvdx_u = sub { vsxmem_op(@_, 716); };  # stxsdx
my $lvx_4w  = sub { vsxmem_op(@_, 780); };  # lxvw4x
my $stvx_4w = sub { vsxmem_op(@_, 908); };  # stxvw4x
# PowerISA 2.07 stuff
# Raw-encode a VX-form vector instruction: primary opcode 4, the
# VRT/VRA/VRB register fields, and the extended opcode in the low bits.
sub vcrypto_op {
    my ($f, $vrt, $vra, $vrb, $op) = @_;
    " .long ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|$op;
}
my $vcipher = sub { vcrypto_op(@_, 1288); };
my $vcipherlast = sub { vcrypto_op(@_, 1289); };
my $vncipher = sub { vcrypto_op(@_, 1352); };
my $vncipherlast= sub { vcrypto_op(@_, 1353); };
my $vsbox = sub { vcrypto_op(@_, 0, 1480); };   # VRB field is always 0
my $vshasigmad = sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1730); };
my $vshasigmaw = sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1666); };
my $vpmsumb = sub { vcrypto_op(@_, 1032); };
my $vpmsumd = sub { vcrypto_op(@_, 1224); };
# Fix: this handler was misspelled "$vpmsubh".  The real mnemonic is
# vpmsumh (Vector Polynomial Multiply-Sum Halfword, extended opcode
# 1096); since the main loop dispatches by eval("\$$mnemonic"), the
# typo'd variable could never match any instruction and vpmsumh lines
# would have been passed through to the assembler untranslated.
my $vpmsumh = sub { vcrypto_op(@_, 1096); };
my $vpmsumw = sub { vcrypto_op(@_, 1160); };
my $vaddudm = sub { vcrypto_op(@_, 192); };
my $vadduqm = sub { vcrypto_op(@_, 256); };
# mtsle: raw-encode the set-little-endian instruction.
my $mtsle = sub {
    my ($f, $arg) = @_;
    " .long ".sprintf "0x%X",(31<<26)|($arg<<21)|(147*2);
};
print "#include <asm/ppc_asm.h>\n" if $flavour =~ /linux/;
# Main translation loop: strip comments/whitespace, normalise local
# labels, then dispatch the mnemonic (looked up by name via eval) to a
# handler closure above; unhandled mnemonics pass through verbatim.
while($line=<>) {
$line =~ s|[#!;].*$||; # get rid of asm-style comments...
$line =~ s|/\*.*\*/||; # ... and C-style comments...
$line =~ s|^\s+||; # ... and skip white spaces in beginning...
$line =~ s|\s+$||; # ... and at the end
{
# canonicalise local labels, then re-dot them if the target wants it
$line =~ s|\b\.L(\w+)|L$1|g; # common denominator for Locallabel
$line =~ s|\bL(\w+)|\.L$1|g if ($dotinlocallabels);
}
{
# split off "[.]mnemonic[./+/-]" -- $c is the leading dot (directive
# marker), $f a trailing predict-hint/record suffix, rest stays in $line
$line =~ s|^\s*(\.?)(\w+)([\.\+\-]?)\s*||;
my $c = $1; $c = "\t" if ($c eq "");
my $mnemonic = $2;
my $f = $3;
# look up a handler closure by the mnemonic's name (e.g. $vcipher)
my $opcode = eval("\$$mnemonic");
# strip register-name prefixes (r3 -> 3 etc.), except for directives
# and on OS X whose assembler wants the names
$line =~ s/\b(c?[rf]|v|vs)([0-9]+)\b/$2/g if ($c ne "." and $flavour !~ /osx/);
if (ref($opcode) eq 'CODE') { $line = &$opcode($f,split(',',$line)); }
elsif ($mnemonic) { $line = $c.$mnemonic.$f."\t".$line; }
}
print $line if ($line);
print "\n";
}
close STDOUT;
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment