linux/drivers/crypto/vmx/aesp8-ppc.pl
Daniel Axtens dcf7b48212 crypto: vmx - fix copy-paste error in CTR mode
The original assembly imported from OpenSSL has two copy-paste
errors in handling CTR mode. When dealing with a 2 or 3 block tail,
the code branches to the CBC decryption exit path, rather than to
the CTR exit path.

This leads to corruption of the IV, which leads to subsequent blocks
being corrupted.

This can be detected with libkcapi test suite, which is available at
https://github.com/smuellerDD/libkcapi

Reported-by: Ondrej Mosnáček <omosnacek@gmail.com>
Fixes: 5c380d623e ("crypto: vmx - Add support for VMS instructions by ASM")
Cc: stable@vger.kernel.org
Signed-off-by: Daniel Axtens <dja@axtens.net>
Tested-by: Michael Ellerman <mpe@ellerman.id.au>
Tested-by: Ondrej Mosnacek <omosnacek@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2019-03-22 20:57:28 +08:00

3829 lines
93 KiB
Raku

#! /usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0
# This code is taken from CRYPTOGAMs[1] and is included here using the option
# in the license to distribute the code under the GPL. Therefore this program
# is free software; you can redistribute it and/or modify it under the terms of
# the GNU General Public License version 2 as published by the Free Software
# Foundation.
#
# [1] https://www.openssl.org/~appro/cryptogams/
# Copyright (c) 2006-2017, CRYPTOGAMS by <appro@openssl.org>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# * Redistributions of source code must retain copyright notices,
# this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials
# provided with the distribution.
#
# * Neither the name of the CRYPTOGAMS nor the names of its
# copyright holder and contributors may be used to endorse or
# promote products derived from this software without specific
# prior written permission.
#
# ALTERNATIVELY, provided that this notice is retained in full, this
# product may be distributed under the terms of the GNU General Public
# License (GPL), in which case the provisions of the GPL apply INSTEAD OF
# those given above.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for AES instructions as per PowerISA
# specification version 2.07, first implemented by POWER8 processor.
# The module is endian-agnostic in sense that it supports both big-
# and little-endian cases. Data alignment in parallelizable modes is
# handled with VSX loads and stores, which implies MSR.VSX flag being
# set. It should also be noted that ISA specification doesn't prohibit
# alignment exceptions for these instructions on page boundaries.
# Initially alignment was handled in pure AltiVec/VMX way [when data
# is aligned programmatically, which in turn guarantees exception-
# free execution], but it turned to hamper performance when vcipher
# instructions are interleaved. It's reckoned that eventual
# misalignment penalties at page boundaries are in average lower
# than additional overhead in pure AltiVec approach.
#
# May 2016
#
# Add XTS subroutine, 9x on little- and 12x improvement on big-endian
# systems were measured.
#
######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
# CBC en-/decrypt CTR XTS
# POWER8[le] 3.96/0.72 0.74 1.1
# POWER8[be] 3.75/0.65 0.66 1.0
$flavour = shift;
if ($flavour =~ /64/) {
$SIZE_T =8;
$LRSAVE =2*$SIZE_T;
$STU ="stdu";
$POP ="ld";
$PUSH ="std";
$UCMP ="cmpld";
$SHL ="sldi";
} elsif ($flavour =~ /32/) {
$SIZE_T =4;
$LRSAVE =$SIZE_T;
$STU ="stwu";
$POP ="lwz";
$PUSH ="stw";
$UCMP ="cmplw";
$SHL ="slwi";
} else { die "nonsense $flavour"; }
$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
$FRAME=8*$SIZE_T;
$prefix="aes_p8";
$sp="r1";
$vrsave="r12";
#########################################################################
{{{ # Key setup procedures #
my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
$code.=<<___;
.machine "any"
.text
.align 7
rcon:
.long 0x01000000, 0x01000000, 0x01000000, 0x01000000 ?rev
.long 0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000 ?rev
.long 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c ?rev
.long 0,0,0,0 ?asis
Lconsts:
mflr r0
bcl 20,31,\$+4
mflr $ptr #vvvvv "distance between . and rcon
addi $ptr,$ptr,-0x48
mtlr r0
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.asciz "AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.globl .${prefix}_set_encrypt_key
Lset_encrypt_key:
mflr r11
$PUSH r11,$LRSAVE($sp)
li $ptr,-1
${UCMP}i $inp,0
beq- Lenc_key_abort # if ($inp==0) return -1;
${UCMP}i $out,0
beq- Lenc_key_abort # if ($out==0) return -1;
li $ptr,-2
cmpwi $bits,128
blt- Lenc_key_abort
cmpwi $bits,256
bgt- Lenc_key_abort
andi. r0,$bits,0x3f
bne- Lenc_key_abort
lis r0,0xfff0
mfspr $vrsave,256
mtspr 256,r0
bl Lconsts
mtlr r11
neg r9,$inp
lvx $in0,0,$inp
addi $inp,$inp,15 # 15 is not typo
lvsr $key,0,r9 # borrow $key
li r8,0x20
cmpwi $bits,192
lvx $in1,0,$inp
le?vspltisb $mask,0x0f # borrow $mask
lvx $rcon,0,$ptr
le?vxor $key,$key,$mask # adjust for byte swap
lvx $mask,r8,$ptr
addi $ptr,$ptr,0x10
vperm $in0,$in0,$in1,$key # align [and byte swap in LE]
li $cnt,8
vxor $zero,$zero,$zero
mtctr $cnt
?lvsr $outperm,0,$out
vspltisb $outmask,-1
lvx $outhead,0,$out
?vperm $outmask,$zero,$outmask,$outperm
blt Loop128
addi $inp,$inp,8
beq L192
addi $inp,$inp,8
b L256
.align 4
Loop128:
vperm $key,$in0,$in0,$mask # rotate-n-splat
vsldoi $tmp,$zero,$in0,12 # >>32
vperm $outtail,$in0,$in0,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
vcipherlast $key,$key,$rcon
stvx $stage,0,$out
addi $out,$out,16
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
vadduwm $rcon,$rcon,$rcon
vxor $in0,$in0,$key
bdnz Loop128
lvx $rcon,0,$ptr # last two round keys
vperm $key,$in0,$in0,$mask # rotate-n-splat
vsldoi $tmp,$zero,$in0,12 # >>32
vperm $outtail,$in0,$in0,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
vcipherlast $key,$key,$rcon
stvx $stage,0,$out
addi $out,$out,16
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
vadduwm $rcon,$rcon,$rcon
vxor $in0,$in0,$key
vperm $key,$in0,$in0,$mask # rotate-n-splat
vsldoi $tmp,$zero,$in0,12 # >>32
vperm $outtail,$in0,$in0,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
vcipherlast $key,$key,$rcon
stvx $stage,0,$out
addi $out,$out,16
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
vxor $in0,$in0,$key
vperm $outtail,$in0,$in0,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
stvx $stage,0,$out
addi $inp,$out,15 # 15 is not typo
addi $out,$out,0x50
li $rounds,10
b Ldone
.align 4
L192:
lvx $tmp,0,$inp
li $cnt,4
vperm $outtail,$in0,$in0,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
stvx $stage,0,$out
addi $out,$out,16
vperm $in1,$in1,$tmp,$key # align [and byte swap in LE]
vspltisb $key,8 # borrow $key
mtctr $cnt
vsububm $mask,$mask,$key # adjust the mask
Loop192:
vperm $key,$in1,$in1,$mask # roate-n-splat
vsldoi $tmp,$zero,$in0,12 # >>32
vcipherlast $key,$key,$rcon
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
vsldoi $stage,$zero,$in1,8
vspltw $tmp,$in0,3
vxor $tmp,$tmp,$in1
vsldoi $in1,$zero,$in1,12 # >>32
vadduwm $rcon,$rcon,$rcon
vxor $in1,$in1,$tmp
vxor $in0,$in0,$key
vxor $in1,$in1,$key
vsldoi $stage,$stage,$in0,8
vperm $key,$in1,$in1,$mask # rotate-n-splat
vsldoi $tmp,$zero,$in0,12 # >>32
vperm $outtail,$stage,$stage,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
vcipherlast $key,$key,$rcon
stvx $stage,0,$out
addi $out,$out,16
vsldoi $stage,$in0,$in1,8
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vperm $outtail,$stage,$stage,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
stvx $stage,0,$out
addi $out,$out,16
vspltw $tmp,$in0,3
vxor $tmp,$tmp,$in1
vsldoi $in1,$zero,$in1,12 # >>32
vadduwm $rcon,$rcon,$rcon
vxor $in1,$in1,$tmp
vxor $in0,$in0,$key
vxor $in1,$in1,$key
vperm $outtail,$in0,$in0,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
stvx $stage,0,$out
addi $inp,$out,15 # 15 is not typo
addi $out,$out,16
bdnz Loop192
li $rounds,12
addi $out,$out,0x20
b Ldone
.align 4
L256:
lvx $tmp,0,$inp
li $cnt,7
li $rounds,14
vperm $outtail,$in0,$in0,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
stvx $stage,0,$out
addi $out,$out,16
vperm $in1,$in1,$tmp,$key # align [and byte swap in LE]
mtctr $cnt
Loop256:
vperm $key,$in1,$in1,$mask # rotate-n-splat
vsldoi $tmp,$zero,$in0,12 # >>32
vperm $outtail,$in1,$in1,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
vcipherlast $key,$key,$rcon
stvx $stage,0,$out
addi $out,$out,16
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in0,$in0,$tmp
vadduwm $rcon,$rcon,$rcon
vxor $in0,$in0,$key
vperm $outtail,$in0,$in0,$outperm # rotate
vsel $stage,$outhead,$outtail,$outmask
vmr $outhead,$outtail
stvx $stage,0,$out
addi $inp,$out,15 # 15 is not typo
addi $out,$out,16
bdz Ldone
vspltw $key,$in0,3 # just splat
vsldoi $tmp,$zero,$in1,12 # >>32
vsbox $key,$key
vxor $in1,$in1,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in1,$in1,$tmp
vsldoi $tmp,$zero,$tmp,12 # >>32
vxor $in1,$in1,$tmp
vxor $in1,$in1,$key
b Loop256
.align 4
Ldone:
lvx $in1,0,$inp # redundant in aligned case
vsel $in1,$outhead,$in1,$outmask
stvx $in1,0,$inp
li $ptr,0
mtspr 256,$vrsave
stw $rounds,0($out)
Lenc_key_abort:
mr r3,$ptr
blr
.long 0
.byte 0,12,0x14,1,0,0,3,0
.long 0
.size .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
.globl .${prefix}_set_decrypt_key
$STU $sp,-$FRAME($sp)
mflr r10
$PUSH r10,$FRAME+$LRSAVE($sp)
bl Lset_encrypt_key
mtlr r10
cmpwi r3,0
bne- Ldec_key_abort
slwi $cnt,$rounds,4
subi $inp,$out,240 # first round key
srwi $rounds,$rounds,1
add $out,$inp,$cnt # last round key
mtctr $rounds
Ldeckey:
lwz r0, 0($inp)
lwz r6, 4($inp)
lwz r7, 8($inp)
lwz r8, 12($inp)
addi $inp,$inp,16
lwz r9, 0($out)
lwz r10,4($out)
lwz r11,8($out)
lwz r12,12($out)
stw r0, 0($out)
stw r6, 4($out)
stw r7, 8($out)
stw r8, 12($out)
subi $out,$out,16
stw r9, -16($inp)
stw r10,-12($inp)
stw r11,-8($inp)
stw r12,-4($inp)
bdnz Ldeckey
xor r3,r3,r3 # return value
Ldec_key_abort:
addi $sp,$sp,$FRAME
blr
.long 0
.byte 0,12,4,1,0x80,0,3,0
.long 0
.size .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
___
}}}
#########################################################################
{{{ # Single block en- and decrypt procedures #
sub gen_block () {
my $dir = shift;
my $n = $dir eq "de" ? "n" : "";
my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
$code.=<<___;
.globl .${prefix}_${dir}crypt
lwz $rounds,240($key)
lis r0,0xfc00
mfspr $vrsave,256
li $idx,15 # 15 is not typo
mtspr 256,r0
lvx v0,0,$inp
neg r11,$out
lvx v1,$idx,$inp
lvsl v2,0,$inp # inpperm
le?vspltisb v4,0x0f
?lvsl v3,0,r11 # outperm
le?vxor v2,v2,v4
li $idx,16
vperm v0,v0,v1,v2 # align [and byte swap in LE]
lvx v1,0,$key
?lvsl v5,0,$key # keyperm
srwi $rounds,$rounds,1
lvx v2,$idx,$key
addi $idx,$idx,16
subi $rounds,$rounds,1
?vperm v1,v1,v2,v5 # align round key
vxor v0,v0,v1
lvx v1,$idx,$key
addi $idx,$idx,16
mtctr $rounds
Loop_${dir}c:
?vperm v2,v2,v1,v5
v${n}cipher v0,v0,v2
lvx v2,$idx,$key
addi $idx,$idx,16
?vperm v1,v1,v2,v5
v${n}cipher v0,v0,v1
lvx v1,$idx,$key
addi $idx,$idx,16
bdnz Loop_${dir}c
?vperm v2,v2,v1,v5
v${n}cipher v0,v0,v2
lvx v2,$idx,$key
?vperm v1,v1,v2,v5
v${n}cipherlast v0,v0,v1
vspltisb v2,-1
vxor v1,v1,v1
li $idx,15 # 15 is not typo
?vperm v2,v1,v2,v3 # outmask
le?vxor v3,v3,v4
lvx v1,0,$out # outhead
vperm v0,v0,v0,v3 # rotate [and byte swap in LE]
vsel v1,v1,v0,v2
lvx v4,$idx,$out
stvx v1,0,$out
vsel v0,v0,v4,v2
stvx v0,$idx,$out
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,3,0
.long 0
.size .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
#########################################################################
{{{ # CBC en- and decrypt procedures #
my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
map("v$_",(4..10));
$code.=<<___;
.globl .${prefix}_cbc_encrypt
${UCMP}i $len,16
bltlr-
cmpwi $enc,0 # test direction
lis r0,0xffe0
mfspr $vrsave,256
mtspr 256,r0
li $idx,15
vxor $rndkey0,$rndkey0,$rndkey0
le?vspltisb $tmp,0x0f
lvx $ivec,0,$ivp # load [unaligned] iv
lvsl $inpperm,0,$ivp
lvx $inptail,$idx,$ivp
le?vxor $inpperm,$inpperm,$tmp
vperm $ivec,$ivec,$inptail,$inpperm
neg r11,$inp
?lvsl $keyperm,0,$key # prepare for unaligned key
lwz $rounds,240($key)
lvsr $inpperm,0,r11 # prepare for unaligned load
lvx $inptail,0,$inp
addi $inp,$inp,15 # 15 is not typo
le?vxor $inpperm,$inpperm,$tmp
?lvsr $outperm,0,$out # prepare for unaligned store
vspltisb $outmask,-1
lvx $outhead,0,$out
?vperm $outmask,$rndkey0,$outmask,$outperm
le?vxor $outperm,$outperm,$tmp
srwi $rounds,$rounds,1
li $idx,16
subi $rounds,$rounds,1
beq Lcbc_dec
Lcbc_enc:
vmr $inout,$inptail
lvx $inptail,0,$inp
addi $inp,$inp,16
mtctr $rounds
subi $len,$len,16 # len-=16
lvx $rndkey0,0,$key
vperm $inout,$inout,$inptail,$inpperm
lvx $rndkey1,$idx,$key
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key
addi $idx,$idx,16
vxor $inout,$inout,$ivec
Loop_cbc_enc:
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vcipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vcipher $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key
addi $idx,$idx,16
bdnz Loop_cbc_enc
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vcipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key
li $idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vcipherlast $ivec,$inout,$rndkey0
${UCMP}i $len,16
vperm $tmp,$ivec,$ivec,$outperm
vsel $inout,$outhead,$tmp,$outmask
vmr $outhead,$tmp
stvx $inout,0,$out
addi $out,$out,16
bge Lcbc_enc
b Lcbc_done
.align 4
Lcbc_dec:
${UCMP}i $len,128
bge _aesp8_cbc_decrypt8x
vmr $tmp,$inptail
lvx $inptail,0,$inp
addi $inp,$inp,16
mtctr $rounds
subi $len,$len,16 # len-=16
lvx $rndkey0,0,$key
vperm $tmp,$tmp,$inptail,$inpperm
lvx $rndkey1,$idx,$key
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $inout,$tmp,$rndkey0
lvx $rndkey0,$idx,$key
addi $idx,$idx,16
Loop_cbc_dec:
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vncipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vncipher $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key
addi $idx,$idx,16
bdnz Loop_cbc_dec
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vncipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key
li $idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vncipherlast $inout,$inout,$rndkey0
${UCMP}i $len,16
vxor $inout,$inout,$ivec
vmr $ivec,$tmp
vperm $tmp,$inout,$inout,$outperm
vsel $inout,$outhead,$tmp,$outmask
vmr $outhead,$tmp
stvx $inout,0,$out
addi $out,$out,16
bge Lcbc_dec
Lcbc_done:
addi $out,$out,-1
lvx $inout,0,$out # redundant in aligned case
vsel $inout,$outhead,$inout,$outmask
stvx $inout,0,$out
neg $enc,$ivp # write [unaligned] iv
li $idx,15 # 15 is not typo
vxor $rndkey0,$rndkey0,$rndkey0
vspltisb $outmask,-1
le?vspltisb $tmp,0x0f
?lvsl $outperm,0,$enc
?vperm $outmask,$rndkey0,$outmask,$outperm
le?vxor $outperm,$outperm,$tmp
lvx $outhead,0,$ivp
vperm $ivec,$ivec,$ivec,$outperm
vsel $inout,$outhead,$ivec,$outmask
lvx $inptail,$idx,$ivp
stvx $inout,0,$ivp
vsel $inout,$ivec,$inptail,$outmask
stvx $inout,$idx,$ivp
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,6,0
.long 0
___
#########################################################################
{{ # Optimized CBC decrypt procedure #
my $key_="r11";
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
# v26-v31 last 6 round keys
my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
$code.=<<___;
.align 5
_aesp8_cbc_decrypt8x:
$STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
li r10,`$FRAME+8*16+15`
li r11,`$FRAME+8*16+31`
stvx v20,r10,$sp # ABI says so
addi r10,r10,32
stvx v21,r11,$sp
addi r11,r11,32
stvx v22,r10,$sp
addi r10,r10,32
stvx v23,r11,$sp
addi r11,r11,32
stvx v24,r10,$sp
addi r10,r10,32
stvx v25,r11,$sp
addi r11,r11,32
stvx v26,r10,$sp
addi r10,r10,32
stvx v27,r11,$sp
addi r11,r11,32
stvx v28,r10,$sp
addi r10,r10,32
stvx v29,r11,$sp
addi r11,r11,32
stvx v30,r10,$sp
stvx v31,r11,$sp
li r0,-1
stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
li $x10,0x10
$PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
li $x20,0x20
$PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
li $x30,0x30
$PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
li $x40,0x40
$PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
li $x50,0x50
$PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
li $x60,0x60
$PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
li $x70,0x70
mtspr 256,r0
subi $rounds,$rounds,3 # -4 in total
subi $len,$len,128 # bias
lvx $rndkey0,$x00,$key # load key schedule
lvx v30,$x10,$key
addi $key,$key,0x20
lvx v31,$x00,$key
?vperm $rndkey0,$rndkey0,v30,$keyperm
addi $key_,$sp,$FRAME+15
mtctr $rounds
Load_cbc_dec_key:
?vperm v24,v30,v31,$keyperm
lvx v30,$x10,$key
addi $key,$key,0x20
stvx v24,$x00,$key_ # off-load round[1]
?vperm v25,v31,v30,$keyperm
lvx v31,$x00,$key
stvx v25,$x10,$key_ # off-load round[2]
addi $key_,$key_,0x20
bdnz Load_cbc_dec_key
lvx v26,$x10,$key
?vperm v24,v30,v31,$keyperm
lvx v27,$x20,$key
stvx v24,$x00,$key_ # off-load round[3]
?vperm v25,v31,v26,$keyperm
lvx v28,$x30,$key
stvx v25,$x10,$key_ # off-load round[4]
addi $key_,$sp,$FRAME+15 # rewind $key_
?vperm v26,v26,v27,$keyperm
lvx v29,$x40,$key
?vperm v27,v27,v28,$keyperm
lvx v30,$x50,$key
?vperm v28,v28,v29,$keyperm
lvx v31,$x60,$key
?vperm v29,v29,v30,$keyperm
lvx $out0,$x70,$key # borrow $out0
?vperm v30,v30,v31,$keyperm
lvx v24,$x00,$key_ # pre-load round[1]
?vperm v31,v31,$out0,$keyperm
lvx v25,$x10,$key_ # pre-load round[2]
#lvx $inptail,0,$inp # "caller" already did this
#addi $inp,$inp,15 # 15 is not typo
subi $inp,$inp,15 # undo "caller"
le?li $idx,8
lvx_u $in0,$x00,$inp # load first 8 "words"
le?lvsl $inpperm,0,$idx
le?vspltisb $tmp,0x0f
lvx_u $in1,$x10,$inp
le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u
lvx_u $in2,$x20,$inp
le?vperm $in0,$in0,$in0,$inpperm
lvx_u $in3,$x30,$inp
le?vperm $in1,$in1,$in1,$inpperm
lvx_u $in4,$x40,$inp
le?vperm $in2,$in2,$in2,$inpperm
vxor $out0,$in0,$rndkey0
lvx_u $in5,$x50,$inp
le?vperm $in3,$in3,$in3,$inpperm
vxor $out1,$in1,$rndkey0
lvx_u $in6,$x60,$inp
le?vperm $in4,$in4,$in4,$inpperm
vxor $out2,$in2,$rndkey0
lvx_u $in7,$x70,$inp
addi $inp,$inp,0x80
le?vperm $in5,$in5,$in5,$inpperm
vxor $out3,$in3,$rndkey0
le?vperm $in6,$in6,$in6,$inpperm
vxor $out4,$in4,$rndkey0
le?vperm $in7,$in7,$in7,$inpperm
vxor $out5,$in5,$rndkey0
vxor $out6,$in6,$rndkey0
vxor $out7,$in7,$rndkey0
mtctr $rounds
b Loop_cbc_dec8x
.align 5
Loop_cbc_dec8x:
vncipher $out0,$out0,v24
vncipher $out1,$out1,v24
vncipher $out2,$out2,v24
vncipher $out3,$out3,v24
vncipher $out4,$out4,v24
vncipher $out5,$out5,v24
vncipher $out6,$out6,v24
vncipher $out7,$out7,v24
lvx v24,$x20,$key_ # round[3]
addi $key_,$key_,0x20
vncipher $out0,$out0,v25
vncipher $out1,$out1,v25
vncipher $out2,$out2,v25
vncipher $out3,$out3,v25
vncipher $out4,$out4,v25
vncipher $out5,$out5,v25
vncipher $out6,$out6,v25
vncipher $out7,$out7,v25
lvx v25,$x10,$key_ # round[4]
bdnz Loop_cbc_dec8x
subic $len,$len,128 # $len-=128
vncipher $out0,$out0,v24
vncipher $out1,$out1,v24
vncipher $out2,$out2,v24
vncipher $out3,$out3,v24
vncipher $out4,$out4,v24
vncipher $out5,$out5,v24
vncipher $out6,$out6,v24
vncipher $out7,$out7,v24
subfe. r0,r0,r0 # borrow?-1:0
vncipher $out0,$out0,v25
vncipher $out1,$out1,v25
vncipher $out2,$out2,v25
vncipher $out3,$out3,v25
vncipher $out4,$out4,v25
vncipher $out5,$out5,v25
vncipher $out6,$out6,v25
vncipher $out7,$out7,v25
and r0,r0,$len
vncipher $out0,$out0,v26
vncipher $out1,$out1,v26
vncipher $out2,$out2,v26
vncipher $out3,$out3,v26
vncipher $out4,$out4,v26
vncipher $out5,$out5,v26
vncipher $out6,$out6,v26
vncipher $out7,$out7,v26
add $inp,$inp,r0 # $inp is adjusted in such
# way that at exit from the
# loop inX-in7 are loaded
# with last "words"
vncipher $out0,$out0,v27
vncipher $out1,$out1,v27
vncipher $out2,$out2,v27
vncipher $out3,$out3,v27
vncipher $out4,$out4,v27
vncipher $out5,$out5,v27
vncipher $out6,$out6,v27
vncipher $out7,$out7,v27
addi $key_,$sp,$FRAME+15 # rewind $key_
vncipher $out0,$out0,v28
vncipher $out1,$out1,v28
vncipher $out2,$out2,v28
vncipher $out3,$out3,v28
vncipher $out4,$out4,v28
vncipher $out5,$out5,v28
vncipher $out6,$out6,v28
vncipher $out7,$out7,v28
lvx v24,$x00,$key_ # re-pre-load round[1]
vncipher $out0,$out0,v29
vncipher $out1,$out1,v29
vncipher $out2,$out2,v29
vncipher $out3,$out3,v29
vncipher $out4,$out4,v29
vncipher $out5,$out5,v29
vncipher $out6,$out6,v29
vncipher $out7,$out7,v29
lvx v25,$x10,$key_ # re-pre-load round[2]
vncipher $out0,$out0,v30
vxor $ivec,$ivec,v31 # xor with last round key
vncipher $out1,$out1,v30
vxor $in0,$in0,v31
vncipher $out2,$out2,v30
vxor $in1,$in1,v31
vncipher $out3,$out3,v30
vxor $in2,$in2,v31
vncipher $out4,$out4,v30
vxor $in3,$in3,v31
vncipher $out5,$out5,v30
vxor $in4,$in4,v31
vncipher $out6,$out6,v30
vxor $in5,$in5,v31
vncipher $out7,$out7,v30
vxor $in6,$in6,v31
vncipherlast $out0,$out0,$ivec
vncipherlast $out1,$out1,$in0
lvx_u $in0,$x00,$inp # load next input block
vncipherlast $out2,$out2,$in1
lvx_u $in1,$x10,$inp
vncipherlast $out3,$out3,$in2
le?vperm $in0,$in0,$in0,$inpperm
lvx_u $in2,$x20,$inp
vncipherlast $out4,$out4,$in3
le?vperm $in1,$in1,$in1,$inpperm
lvx_u $in3,$x30,$inp
vncipherlast $out5,$out5,$in4
le?vperm $in2,$in2,$in2,$inpperm
lvx_u $in4,$x40,$inp
vncipherlast $out6,$out6,$in5
le?vperm $in3,$in3,$in3,$inpperm
lvx_u $in5,$x50,$inp
vncipherlast $out7,$out7,$in6
le?vperm $in4,$in4,$in4,$inpperm
lvx_u $in6,$x60,$inp
vmr $ivec,$in7
le?vperm $in5,$in5,$in5,$inpperm
lvx_u $in7,$x70,$inp
addi $inp,$inp,0x80
le?vperm $out0,$out0,$out0,$inpperm
le?vperm $out1,$out1,$out1,$inpperm
stvx_u $out0,$x00,$out
le?vperm $in6,$in6,$in6,$inpperm
vxor $out0,$in0,$rndkey0
le?vperm $out2,$out2,$out2,$inpperm
stvx_u $out1,$x10,$out
le?vperm $in7,$in7,$in7,$inpperm
vxor $out1,$in1,$rndkey0
le?vperm $out3,$out3,$out3,$inpperm
stvx_u $out2,$x20,$out
vxor $out2,$in2,$rndkey0
le?vperm $out4,$out4,$out4,$inpperm
stvx_u $out3,$x30,$out
vxor $out3,$in3,$rndkey0
le?vperm $out5,$out5,$out5,$inpperm
stvx_u $out4,$x40,$out
vxor $out4,$in4,$rndkey0
le?vperm $out6,$out6,$out6,$inpperm
stvx_u $out5,$x50,$out
vxor $out5,$in5,$rndkey0
le?vperm $out7,$out7,$out7,$inpperm
stvx_u $out6,$x60,$out
vxor $out6,$in6,$rndkey0
stvx_u $out7,$x70,$out
addi $out,$out,0x80
vxor $out7,$in7,$rndkey0
mtctr $rounds
beq Loop_cbc_dec8x # did $len-=128 borrow?
addic. $len,$len,128
beq Lcbc_dec8x_done
nop
nop
Loop_cbc_dec8x_tail: # up to 7 "words" tail...
vncipher $out1,$out1,v24
vncipher $out2,$out2,v24
vncipher $out3,$out3,v24
vncipher $out4,$out4,v24
vncipher $out5,$out5,v24
vncipher $out6,$out6,v24
vncipher $out7,$out7,v24
lvx v24,$x20,$key_ # round[3]
addi $key_,$key_,0x20
vncipher $out1,$out1,v25
vncipher $out2,$out2,v25
vncipher $out3,$out3,v25
vncipher $out4,$out4,v25
vncipher $out5,$out5,v25
vncipher $out6,$out6,v25
vncipher $out7,$out7,v25
lvx v25,$x10,$key_ # round[4]
bdnz Loop_cbc_dec8x_tail
vncipher $out1,$out1,v24
vncipher $out2,$out2,v24
vncipher $out3,$out3,v24
vncipher $out4,$out4,v24
vncipher $out5,$out5,v24
vncipher $out6,$out6,v24
vncipher $out7,$out7,v24
vncipher $out1,$out1,v25
vncipher $out2,$out2,v25
vncipher $out3,$out3,v25
vncipher $out4,$out4,v25
vncipher $out5,$out5,v25
vncipher $out6,$out6,v25
vncipher $out7,$out7,v25
vncipher $out1,$out1,v26
vncipher $out2,$out2,v26
vncipher $out3,$out3,v26
vncipher $out4,$out4,v26
vncipher $out5,$out5,v26
vncipher $out6,$out6,v26
vncipher $out7,$out7,v26
vncipher $out1,$out1,v27
vncipher $out2,$out2,v27
vncipher $out3,$out3,v27
vncipher $out4,$out4,v27
vncipher $out5,$out5,v27
vncipher $out6,$out6,v27
vncipher $out7,$out7,v27
vncipher $out1,$out1,v28
vncipher $out2,$out2,v28
vncipher $out3,$out3,v28
vncipher $out4,$out4,v28
vncipher $out5,$out5,v28
vncipher $out6,$out6,v28
vncipher $out7,$out7,v28
vncipher $out1,$out1,v29
vncipher $out2,$out2,v29
vncipher $out3,$out3,v29
vncipher $out4,$out4,v29
vncipher $out5,$out5,v29
vncipher $out6,$out6,v29
vncipher $out7,$out7,v29
vncipher $out1,$out1,v30
vxor $ivec,$ivec,v31 # last round key
vncipher $out2,$out2,v30
vxor $in1,$in1,v31
vncipher $out3,$out3,v30
vxor $in2,$in2,v31
vncipher $out4,$out4,v30
vxor $in3,$in3,v31
vncipher $out5,$out5,v30
vxor $in4,$in4,v31
vncipher $out6,$out6,v30
vxor $in5,$in5,v31
vncipher $out7,$out7,v30
vxor $in6,$in6,v31
cmplwi $len,32 # switch($len)
blt Lcbc_dec8x_one
nop
beq Lcbc_dec8x_two
cmplwi $len,64
blt Lcbc_dec8x_three
nop
beq Lcbc_dec8x_four
cmplwi $len,96
blt Lcbc_dec8x_five
nop
beq Lcbc_dec8x_six
Lcbc_dec8x_seven:
vncipherlast $out1,$out1,$ivec
vncipherlast $out2,$out2,$in1
vncipherlast $out3,$out3,$in2
vncipherlast $out4,$out4,$in3
vncipherlast $out5,$out5,$in4
vncipherlast $out6,$out6,$in5
vncipherlast $out7,$out7,$in6
vmr $ivec,$in7
le?vperm $out1,$out1,$out1,$inpperm
le?vperm $out2,$out2,$out2,$inpperm
stvx_u $out1,$x00,$out
le?vperm $out3,$out3,$out3,$inpperm
stvx_u $out2,$x10,$out
le?vperm $out4,$out4,$out4,$inpperm
stvx_u $out3,$x20,$out
le?vperm $out5,$out5,$out5,$inpperm
stvx_u $out4,$x30,$out
le?vperm $out6,$out6,$out6,$inpperm
stvx_u $out5,$x40,$out
le?vperm $out7,$out7,$out7,$inpperm
stvx_u $out6,$x50,$out
stvx_u $out7,$x60,$out
addi $out,$out,0x70
b Lcbc_dec8x_done
.align 5
Lcbc_dec8x_six:
vncipherlast $out2,$out2,$ivec
vncipherlast $out3,$out3,$in2
vncipherlast $out4,$out4,$in3
vncipherlast $out5,$out5,$in4
vncipherlast $out6,$out6,$in5
vncipherlast $out7,$out7,$in6
vmr $ivec,$in7
le?vperm $out2,$out2,$out2,$inpperm
le?vperm $out3,$out3,$out3,$inpperm
stvx_u $out2,$x00,$out
le?vperm $out4,$out4,$out4,$inpperm
stvx_u $out3,$x10,$out
le?vperm $out5,$out5,$out5,$inpperm
stvx_u $out4,$x20,$out
le?vperm $out6,$out6,$out6,$inpperm
stvx_u $out5,$x30,$out
le?vperm $out7,$out7,$out7,$inpperm
stvx_u $out6,$x40,$out
stvx_u $out7,$x50,$out
addi $out,$out,0x60
b Lcbc_dec8x_done
.align 5
Lcbc_dec8x_five:
vncipherlast $out3,$out3,$ivec
vncipherlast $out4,$out4,$in3
vncipherlast $out5,$out5,$in4
vncipherlast $out6,$out6,$in5
vncipherlast $out7,$out7,$in6
vmr $ivec,$in7
le?vperm $out3,$out3,$out3,$inpperm
le?vperm $out4,$out4,$out4,$inpperm
stvx_u $out3,$x00,$out
le?vperm $out5,$out5,$out5,$inpperm
stvx_u $out4,$x10,$out
le?vperm $out6,$out6,$out6,$inpperm
stvx_u $out5,$x20,$out
le?vperm $out7,$out7,$out7,$inpperm
stvx_u $out6,$x30,$out
stvx_u $out7,$x40,$out
addi $out,$out,0x50
b Lcbc_dec8x_done
.align 5
Lcbc_dec8x_four:
vncipherlast $out4,$out4,$ivec
vncipherlast $out5,$out5,$in4
vncipherlast $out6,$out6,$in5
vncipherlast $out7,$out7,$in6
vmr $ivec,$in7
le?vperm $out4,$out4,$out4,$inpperm
le?vperm $out5,$out5,$out5,$inpperm
stvx_u $out4,$x00,$out
le?vperm $out6,$out6,$out6,$inpperm
stvx_u $out5,$x10,$out
le?vperm $out7,$out7,$out7,$inpperm
stvx_u $out6,$x20,$out
stvx_u $out7,$x30,$out
addi $out,$out,0x40
b Lcbc_dec8x_done
.align 5
Lcbc_dec8x_three:
vncipherlast $out5,$out5,$ivec
vncipherlast $out6,$out6,$in5
vncipherlast $out7,$out7,$in6
vmr $ivec,$in7
le?vperm $out5,$out5,$out5,$inpperm
le?vperm $out6,$out6,$out6,$inpperm
stvx_u $out5,$x00,$out
le?vperm $out7,$out7,$out7,$inpperm
stvx_u $out6,$x10,$out
stvx_u $out7,$x20,$out
addi $out,$out,0x30
b Lcbc_dec8x_done
.align 5
Lcbc_dec8x_two:
vncipherlast $out6,$out6,$ivec
vncipherlast $out7,$out7,$in6
vmr $ivec,$in7
le?vperm $out6,$out6,$out6,$inpperm
le?vperm $out7,$out7,$out7,$inpperm
stvx_u $out6,$x00,$out
stvx_u $out7,$x10,$out
addi $out,$out,0x20
b Lcbc_dec8x_done
.align 5
Lcbc_dec8x_one:
vncipherlast $out7,$out7,$ivec
vmr $ivec,$in7
le?vperm $out7,$out7,$out7,$inpperm
stvx_u $out7,0,$out
addi $out,$out,0x10
Lcbc_dec8x_done:
le?vperm $ivec,$ivec,$ivec,$inpperm
stvx_u $ivec,0,$ivp # write [unaligned] iv
li r10,`$FRAME+15`
li r11,`$FRAME+31`
stvx $inpperm,r10,$sp # wipe copies of round keys
addi r10,r10,32
stvx $inpperm,r11,$sp
addi r11,r11,32
stvx $inpperm,r10,$sp
addi r10,r10,32
stvx $inpperm,r11,$sp
addi r11,r11,32
stvx $inpperm,r10,$sp
addi r10,r10,32
stvx $inpperm,r11,$sp
addi r11,r11,32
stvx $inpperm,r10,$sp
addi r10,r10,32
stvx $inpperm,r11,$sp
addi r11,r11,32
mtspr 256,$vrsave
lvx v20,r10,$sp # ABI says so
addi r10,r10,32
lvx v21,r11,$sp
addi r11,r11,32
lvx v22,r10,$sp
addi r10,r10,32
lvx v23,r11,$sp
addi r11,r11,32
lvx v24,r10,$sp
addi r10,r10,32
lvx v25,r11,$sp
addi r11,r11,32
lvx v26,r10,$sp
addi r10,r10,32
lvx v27,r11,$sp
addi r11,r11,32
lvx v28,r10,$sp
addi r10,r10,32
lvx v29,r11,$sp
addi r11,r11,32
lvx v30,r10,$sp
lvx v31,r11,$sp
$POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
$POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
$POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
$POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
$POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
$POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
blr
.long 0
.byte 0,12,0x14,0,0x80,6,6,0
.long 0
.size .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
___
}} }}}
#########################################################################
{{{ # CTR procedure[s] #
my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)= map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
map("v$_",(4..11));
my $dat=$tmp;
$code.=<<___;
.globl .${prefix}_ctr32_encrypt_blocks
${UCMP}i $len,1
bltlr-
lis r0,0xfff0
mfspr $vrsave,256
mtspr 256,r0
li $idx,15
vxor $rndkey0,$rndkey0,$rndkey0
le?vspltisb $tmp,0x0f
lvx $ivec,0,$ivp # load [unaligned] iv
lvsl $inpperm,0,$ivp
lvx $inptail,$idx,$ivp
vspltisb $one,1
le?vxor $inpperm,$inpperm,$tmp
vperm $ivec,$ivec,$inptail,$inpperm
vsldoi $one,$rndkey0,$one,1
neg r11,$inp
?lvsl $keyperm,0,$key # prepare for unaligned key
lwz $rounds,240($key)
lvsr $inpperm,0,r11 # prepare for unaligned load
lvx $inptail,0,$inp
addi $inp,$inp,15 # 15 is not typo
le?vxor $inpperm,$inpperm,$tmp
srwi $rounds,$rounds,1
li $idx,16
subi $rounds,$rounds,1
${UCMP}i $len,8
bge _aesp8_ctr32_encrypt8x
?lvsr $outperm,0,$out # prepare for unaligned store
vspltisb $outmask,-1
lvx $outhead,0,$out
?vperm $outmask,$rndkey0,$outmask,$outperm
le?vxor $outperm,$outperm,$tmp
lvx $rndkey0,0,$key
mtctr $rounds
lvx $rndkey1,$idx,$key
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $inout,$ivec,$rndkey0
lvx $rndkey0,$idx,$key
addi $idx,$idx,16
b Loop_ctr32_enc
.align 5
Loop_ctr32_enc:
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vcipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vcipher $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key
addi $idx,$idx,16
bdnz Loop_ctr32_enc
vadduwm $ivec,$ivec,$one
vmr $dat,$inptail
lvx $inptail,0,$inp
addi $inp,$inp,16
subic. $len,$len,1 # blocks--
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vcipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key
vperm $dat,$dat,$inptail,$inpperm
li $idx,16
?vperm $rndkey1,$rndkey0,$rndkey1,$keyperm
lvx $rndkey0,0,$key
vxor $dat,$dat,$rndkey1 # last round key
vcipherlast $inout,$inout,$dat
lvx $rndkey1,$idx,$key
addi $idx,$idx,16
vperm $inout,$inout,$inout,$outperm
vsel $dat,$outhead,$inout,$outmask
mtctr $rounds
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vmr $outhead,$inout
vxor $inout,$ivec,$rndkey0
lvx $rndkey0,$idx,$key
addi $idx,$idx,16
stvx $dat,0,$out
addi $out,$out,16
bne Loop_ctr32_enc
addi $out,$out,-1
lvx $inout,0,$out # redundant in aligned case
vsel $inout,$outhead,$inout,$outmask
stvx $inout,0,$out
mtspr 256,$vrsave
blr
.long 0
.byte 0,12,0x14,0,0,0,6,0
.long 0
___
#########################################################################
{{ # Optimized CTR procedure #
my $key_="r11";
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
# v26-v31 last 6 round keys
my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
my ($two,$three,$four)=($outhead,$outperm,$outmask);
$code.=<<___;
.align 5
_aesp8_ctr32_encrypt8x:
$STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
li r10,`$FRAME+8*16+15`
li r11,`$FRAME+8*16+31`
stvx v20,r10,$sp # ABI says so
addi r10,r10,32
stvx v21,r11,$sp
addi r11,r11,32
stvx v22,r10,$sp
addi r10,r10,32
stvx v23,r11,$sp
addi r11,r11,32
stvx v24,r10,$sp
addi r10,r10,32
stvx v25,r11,$sp
addi r11,r11,32
stvx v26,r10,$sp
addi r10,r10,32
stvx v27,r11,$sp
addi r11,r11,32
stvx v28,r10,$sp
addi r10,r10,32
stvx v29,r11,$sp
addi r11,r11,32
stvx v30,r10,$sp
stvx v31,r11,$sp
li r0,-1
stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
li $x10,0x10
$PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
li $x20,0x20
$PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
li $x30,0x30
$PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
li $x40,0x40
$PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
li $x50,0x50
$PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
li $x60,0x60
$PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
li $x70,0x70
mtspr 256,r0
subi $rounds,$rounds,3 # -4 in total
lvx $rndkey0,$x00,$key # load key schedule
lvx v30,$x10,$key
addi $key,$key,0x20
lvx v31,$x00,$key
?vperm $rndkey0,$rndkey0,v30,$keyperm
addi $key_,$sp,$FRAME+15
mtctr $rounds
Load_ctr32_enc_key:
?vperm v24,v30,v31,$keyperm
lvx v30,$x10,$key
addi $key,$key,0x20
stvx v24,$x00,$key_ # off-load round[1]
?vperm v25,v31,v30,$keyperm
lvx v31,$x00,$key
stvx v25,$x10,$key_ # off-load round[2]
addi $key_,$key_,0x20
bdnz Load_ctr32_enc_key
lvx v26,$x10,$key
?vperm v24,v30,v31,$keyperm
lvx v27,$x20,$key
stvx v24,$x00,$key_ # off-load round[3]
?vperm v25,v31,v26,$keyperm
lvx v28,$x30,$key
stvx v25,$x10,$key_ # off-load round[4]
addi $key_,$sp,$FRAME+15 # rewind $key_
?vperm v26,v26,v27,$keyperm
lvx v29,$x40,$key
?vperm v27,v27,v28,$keyperm
lvx v30,$x50,$key
?vperm v28,v28,v29,$keyperm
lvx v31,$x60,$key
?vperm v29,v29,v30,$keyperm
lvx $out0,$x70,$key # borrow $out0
?vperm v30,v30,v31,$keyperm
lvx v24,$x00,$key_ # pre-load round[1]
?vperm v31,v31,$out0,$keyperm
lvx v25,$x10,$key_ # pre-load round[2]
vadduqm $two,$one,$one
subi $inp,$inp,15 # undo "caller"
$SHL $len,$len,4
vadduqm $out1,$ivec,$one # counter values ...
vadduqm $out2,$ivec,$two
vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0]
le?li $idx,8
vadduqm $out3,$out1,$two
vxor $out1,$out1,$rndkey0
le?lvsl $inpperm,0,$idx
vadduqm $out4,$out2,$two
vxor $out2,$out2,$rndkey0
le?vspltisb $tmp,0x0f
vadduqm $out5,$out3,$two
vxor $out3,$out3,$rndkey0
le?vxor $inpperm,$inpperm,$tmp # transform for lvx_u/stvx_u
vadduqm $out6,$out4,$two
vxor $out4,$out4,$rndkey0
vadduqm $out7,$out5,$two
vxor $out5,$out5,$rndkey0
vadduqm $ivec,$out6,$two # next counter value
vxor $out6,$out6,$rndkey0
vxor $out7,$out7,$rndkey0
mtctr $rounds
b Loop_ctr32_enc8x
.align 5
Loop_ctr32_enc8x:
vcipher $out0,$out0,v24
vcipher $out1,$out1,v24
vcipher $out2,$out2,v24
vcipher $out3,$out3,v24
vcipher $out4,$out4,v24
vcipher $out5,$out5,v24
vcipher $out6,$out6,v24
vcipher $out7,$out7,v24
Loop_ctr32_enc8x_middle:
lvx v24,$x20,$key_ # round[3]
addi $key_,$key_,0x20
vcipher $out0,$out0,v25
vcipher $out1,$out1,v25
vcipher $out2,$out2,v25
vcipher $out3,$out3,v25
vcipher $out4,$out4,v25
vcipher $out5,$out5,v25
vcipher $out6,$out6,v25
vcipher $out7,$out7,v25
lvx v25,$x10,$key_ # round[4]
bdnz Loop_ctr32_enc8x
subic r11,$len,256 # $len-256, borrow $key_
vcipher $out0,$out0,v24
vcipher $out1,$out1,v24
vcipher $out2,$out2,v24
vcipher $out3,$out3,v24
vcipher $out4,$out4,v24
vcipher $out5,$out5,v24
vcipher $out6,$out6,v24
vcipher $out7,$out7,v24
subfe r0,r0,r0 # borrow?-1:0
vcipher $out0,$out0,v25
vcipher $out1,$out1,v25
vcipher $out2,$out2,v25
vcipher $out3,$out3,v25
vcipher $out4,$out4,v25
vcipher $out5,$out5,v25
vcipher $out6,$out6,v25
vcipher $out7,$out7,v25
and r0,r0,r11
addi $key_,$sp,$FRAME+15 # rewind $key_
vcipher $out0,$out0,v26
vcipher $out1,$out1,v26
vcipher $out2,$out2,v26
vcipher $out3,$out3,v26
vcipher $out4,$out4,v26
vcipher $out5,$out5,v26
vcipher $out6,$out6,v26
vcipher $out7,$out7,v26
lvx v24,$x00,$key_ # re-pre-load round[1]
subic $len,$len,129 # $len-=129
vcipher $out0,$out0,v27
addi $len,$len,1 # $len-=128 really
vcipher $out1,$out1,v27
vcipher $out2,$out2,v27
vcipher $out3,$out3,v27
vcipher $out4,$out4,v27
vcipher $out5,$out5,v27
vcipher $out6,$out6,v27
vcipher $out7,$out7,v27
lvx v25,$x10,$key_ # re-pre-load round[2]
vcipher $out0,$out0,v28
lvx_u $in0,$x00,$inp # load input
vcipher $out1,$out1,v28
lvx_u $in1,$x10,$inp
vcipher $out2,$out2,v28
lvx_u $in2,$x20,$inp
vcipher $out3,$out3,v28
lvx_u $in3,$x30,$inp
vcipher $out4,$out4,v28
lvx_u $in4,$x40,$inp
vcipher $out5,$out5,v28
lvx_u $in5,$x50,$inp
vcipher $out6,$out6,v28
lvx_u $in6,$x60,$inp
vcipher $out7,$out7,v28
lvx_u $in7,$x70,$inp
addi $inp,$inp,0x80
vcipher $out0,$out0,v29
le?vperm $in0,$in0,$in0,$inpperm
vcipher $out1,$out1,v29
le?vperm $in1,$in1,$in1,$inpperm
vcipher $out2,$out2,v29
le?vperm $in2,$in2,$in2,$inpperm
vcipher $out3,$out3,v29
le?vperm $in3,$in3,$in3,$inpperm
vcipher $out4,$out4,v29
le?vperm $in4,$in4,$in4,$inpperm
vcipher $out5,$out5,v29
le?vperm $in5,$in5,$in5,$inpperm
vcipher $out6,$out6,v29
le?vperm $in6,$in6,$in6,$inpperm
vcipher $out7,$out7,v29
le?vperm $in7,$in7,$in7,$inpperm
add $inp,$inp,r0 # $inp is adjusted in such
# way that at exit from the
# loop inX-in7 are loaded
# with last "words"
subfe. r0,r0,r0 # borrow?-1:0
vcipher $out0,$out0,v30
vxor $in0,$in0,v31 # xor with last round key
vcipher $out1,$out1,v30
vxor $in1,$in1,v31
vcipher $out2,$out2,v30
vxor $in2,$in2,v31
vcipher $out3,$out3,v30
vxor $in3,$in3,v31
vcipher $out4,$out4,v30
vxor $in4,$in4,v31
vcipher $out5,$out5,v30
vxor $in5,$in5,v31
vcipher $out6,$out6,v30
vxor $in6,$in6,v31
vcipher $out7,$out7,v30
vxor $in7,$in7,v31
bne Lctr32_enc8x_break # did $len-129 borrow?
vcipherlast $in0,$out0,$in0
vcipherlast $in1,$out1,$in1
vadduqm $out1,$ivec,$one # counter values ...
vcipherlast $in2,$out2,$in2
vadduqm $out2,$ivec,$two
vxor $out0,$ivec,$rndkey0 # ... xored with rndkey[0]
vcipherlast $in3,$out3,$in3
vadduqm $out3,$out1,$two
vxor $out1,$out1,$rndkey0
vcipherlast $in4,$out4,$in4
vadduqm $out4,$out2,$two
vxor $out2,$out2,$rndkey0
vcipherlast $in5,$out5,$in5
vadduqm $out5,$out3,$two
vxor $out3,$out3,$rndkey0
vcipherlast $in6,$out6,$in6
vadduqm $out6,$out4,$two
vxor $out4,$out4,$rndkey0
vcipherlast $in7,$out7,$in7
vadduqm $out7,$out5,$two
vxor $out5,$out5,$rndkey0
le?vperm $in0,$in0,$in0,$inpperm
vadduqm $ivec,$out6,$two # next counter value
vxor $out6,$out6,$rndkey0
le?vperm $in1,$in1,$in1,$inpperm
vxor $out7,$out7,$rndkey0
mtctr $rounds
vcipher $out0,$out0,v24
stvx_u $in0,$x00,$out
le?vperm $in2,$in2,$in2,$inpperm
vcipher $out1,$out1,v24
stvx_u $in1,$x10,$out
le?vperm $in3,$in3,$in3,$inpperm
vcipher $out2,$out2,v24
stvx_u $in2,$x20,$out
le?vperm $in4,$in4,$in4,$inpperm
vcipher $out3,$out3,v24
stvx_u $in3,$x30,$out
le?vperm $in5,$in5,$in5,$inpperm
vcipher $out4,$out4,v24
stvx_u $in4,$x40,$out
le?vperm $in6,$in6,$in6,$inpperm
vcipher $out5,$out5,v24
stvx_u $in5,$x50,$out
le?vperm $in7,$in7,$in7,$inpperm
vcipher $out6,$out6,v24
stvx_u $in6,$x60,$out
vcipher $out7,$out7,v24
stvx_u $in7,$x70,$out
addi $out,$out,0x80
b Loop_ctr32_enc8x_middle
.align 5
Lctr32_enc8x_break:
cmpwi $len,-0x60
blt Lctr32_enc8x_one
nop
beq Lctr32_enc8x_two
cmpwi $len,-0x40
blt Lctr32_enc8x_three
nop
beq Lctr32_enc8x_four
cmpwi $len,-0x20
blt Lctr32_enc8x_five
nop
beq Lctr32_enc8x_six
cmpwi $len,0x00
blt Lctr32_enc8x_seven
Lctr32_enc8x_eight:
vcipherlast $out0,$out0,$in0
vcipherlast $out1,$out1,$in1
vcipherlast $out2,$out2,$in2
vcipherlast $out3,$out3,$in3
vcipherlast $out4,$out4,$in4
vcipherlast $out5,$out5,$in5
vcipherlast $out6,$out6,$in6
vcipherlast $out7,$out7,$in7
le?vperm $out0,$out0,$out0,$inpperm
le?vperm $out1,$out1,$out1,$inpperm
stvx_u $out0,$x00,$out
le?vperm $out2,$out2,$out2,$inpperm
stvx_u $out1,$x10,$out
le?vperm $out3,$out3,$out3,$inpperm
stvx_u $out2,$x20,$out
le?vperm $out4,$out4,$out4,$inpperm
stvx_u $out3,$x30,$out
le?vperm $out5,$out5,$out5,$inpperm
stvx_u $out4,$x40,$out
le?vperm $out6,$out6,$out6,$inpperm
stvx_u $out5,$x50,$out
le?vperm $out7,$out7,$out7,$inpperm
stvx_u $out6,$x60,$out
stvx_u $out7,$x70,$out
addi $out,$out,0x80
b Lctr32_enc8x_done
.align 5
Lctr32_enc8x_seven:
vcipherlast $out0,$out0,$in1
vcipherlast $out1,$out1,$in2
vcipherlast $out2,$out2,$in3
vcipherlast $out3,$out3,$in4
vcipherlast $out4,$out4,$in5
vcipherlast $out5,$out5,$in6
vcipherlast $out6,$out6,$in7
le?vperm $out0,$out0,$out0,$inpperm
le?vperm $out1,$out1,$out1,$inpperm
stvx_u $out0,$x00,$out
le?vperm $out2,$out2,$out2,$inpperm
stvx_u $out1,$x10,$out
le?vperm $out3,$out3,$out3,$inpperm
stvx_u $out2,$x20,$out
le?vperm $out4,$out4,$out4,$inpperm
stvx_u $out3,$x30,$out
le?vperm $out5,$out5,$out5,$inpperm
stvx_u $out4,$x40,$out
le?vperm $out6,$out6,$out6,$inpperm
stvx_u $out5,$x50,$out
stvx_u $out6,$x60,$out
addi $out,$out,0x70
b Lctr32_enc8x_done
.align 5
Lctr32_enc8x_six:
vcipherlast $out0,$out0,$in2
vcipherlast $out1,$out1,$in3
vcipherlast $out2,$out2,$in4
vcipherlast $out3,$out3,$in5
vcipherlast $out4,$out4,$in6
vcipherlast $out5,$out5,$in7
le?vperm $out0,$out0,$out0,$inpperm
le?vperm $out1,$out1,$out1,$inpperm
stvx_u $out0,$x00,$out
le?vperm $out2,$out2,$out2,$inpperm
stvx_u $out1,$x10,$out
le?vperm $out3,$out3,$out3,$inpperm
stvx_u $out2,$x20,$out
le?vperm $out4,$out4,$out4,$inpperm
stvx_u $out3,$x30,$out
le?vperm $out5,$out5,$out5,$inpperm
stvx_u $out4,$x40,$out
stvx_u $out5,$x50,$out
addi $out,$out,0x60
b Lctr32_enc8x_done
.align 5
Lctr32_enc8x_five:
vcipherlast $out0,$out0,$in3
vcipherlast $out1,$out1,$in4
vcipherlast $out2,$out2,$in5
vcipherlast $out3,$out3,$in6
vcipherlast $out4,$out4,$in7
le?vperm $out0,$out0,$out0,$inpperm
le?vperm $out1,$out1,$out1,$inpperm
stvx_u $out0,$x00,$out
le?vperm $out2,$out2,$out2,$inpperm
stvx_u $out1,$x10,$out
le?vperm $out3,$out3,$out3,$inpperm
stvx_u $out2,$x20,$out
le?vperm $out4,$out4,$out4,$inpperm
stvx_u $out3,$x30,$out
stvx_u $out4,$x40,$out
addi $out,$out,0x50
b Lctr32_enc8x_done
.align 5
Lctr32_enc8x_four:
vcipherlast $out0,$out0,$in4
vcipherlast $out1,$out1,$in5
vcipherlast $out2,$out2,$in6
vcipherlast $out3,$out3,$in7
le?vperm $out0,$out0,$out0,$inpperm
le?vperm $out1,$out1,$out1,$inpperm
stvx_u $out0,$x00,$out
le?vperm $out2,$out2,$out2,$inpperm
stvx_u $out1,$x10,$out
le?vperm $out3,$out3,$out3,$inpperm
stvx_u $out2,$x20,$out
stvx_u $out3,$x30,$out
addi $out,$out,0x40
b Lctr32_enc8x_done
.align 5
Lctr32_enc8x_three:
vcipherlast $out0,$out0,$in5
vcipherlast $out1,$out1,$in6
vcipherlast $out2,$out2,$in7
le?vperm $out0,$out0,$out0,$inpperm
le?vperm $out1,$out1,$out1,$inpperm
stvx_u $out0,$x00,$out
le?vperm $out2,$out2,$out2,$inpperm
stvx_u $out1,$x10,$out
stvx_u $out2,$x20,$out
addi $out,$out,0x30
b Lctr32_enc8x_done
.align 5
Lctr32_enc8x_two:
vcipherlast $out0,$out0,$in6
vcipherlast $out1,$out1,$in7
le?vperm $out0,$out0,$out0,$inpperm
le?vperm $out1,$out1,$out1,$inpperm
stvx_u $out0,$x00,$out
stvx_u $out1,$x10,$out
addi $out,$out,0x20
b Lctr32_enc8x_done
.align 5
Lctr32_enc8x_one:
vcipherlast $out0,$out0,$in7
le?vperm $out0,$out0,$out0,$inpperm
stvx_u $out0,0,$out
addi $out,$out,0x10
Lctr32_enc8x_done:
li r10,`$FRAME+15`
li r11,`$FRAME+31`
stvx $inpperm,r10,$sp # wipe copies of round keys
addi r10,r10,32
stvx $inpperm,r11,$sp
addi r11,r11,32
stvx $inpperm,r10,$sp
addi r10,r10,32
stvx $inpperm,r11,$sp
addi r11,r11,32
stvx $inpperm,r10,$sp
addi r10,r10,32
stvx $inpperm,r11,$sp
addi r11,r11,32
stvx $inpperm,r10,$sp
addi r10,r10,32
stvx $inpperm,r11,$sp
addi r11,r11,32
mtspr 256,$vrsave
lvx v20,r10,$sp # ABI says so
addi r10,r10,32
lvx v21,r11,$sp
addi r11,r11,32
lvx v22,r10,$sp
addi r10,r10,32
lvx v23,r11,$sp
addi r11,r11,32
lvx v24,r10,$sp
addi r10,r10,32
lvx v25,r11,$sp
addi r11,r11,32
lvx v26,r10,$sp
addi r10,r10,32
lvx v27,r11,$sp
addi r11,r11,32
lvx v28,r10,$sp
addi r10,r10,32
lvx v29,r11,$sp
addi r11,r11,32
lvx v30,r10,$sp
lvx v31,r11,$sp
$POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
$POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
$POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
$POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
$POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
$POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
blr
.long 0
.byte 0,12,0x14,0,0x80,6,6,0
.long 0
.size .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
___
}} }}}
#########################################################################
{{{ # XTS procedures #
# int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len, #
# const AES_KEY *key1, const AES_KEY *key2, #
# [const] unsigned char iv[16]); #
# If $key2 is NULL, then a "tweak chaining" mode is engaged, in which #
# input tweak value is assumed to be encrypted already, and last tweak #
# value, one suitable for consecutive call on same chunk of data, is #
# written back to original buffer. In addition, in "tweak chaining" #
# mode only complete input blocks are processed. #
my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) = map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout) = map("v$_",(0..2));
my ($output,$inptail,$inpperm,$leperm,$keyperm) = map("v$_",(3..7));
my ($tweak,$seven,$eighty7,$tmp,$tweak1) = map("v$_",(8..12));
my $taillen = $key2;
($inp,$idx) = ($idx,$inp); # reassign
$code.=<<___;
.globl .${prefix}_xts_encrypt
mr $inp,r3 # reassign
li r3,-1
${UCMP}i $len,16
bltlr-
lis r0,0xfff0
mfspr r12,256 # save vrsave
li r11,0
mtspr 256,r0
vspltisb $seven,0x07 # 0x070707..07
le?lvsl $leperm,r11,r11
le?vspltisb $tmp,0x0f
le?vxor $leperm,$leperm,$seven
li $idx,15
lvx $tweak,0,$ivp # load [unaligned] iv
lvsl $inpperm,0,$ivp
lvx $inptail,$idx,$ivp
le?vxor $inpperm,$inpperm,$tmp
vperm $tweak,$tweak,$inptail,$inpperm
neg r11,$inp
lvsr $inpperm,0,r11 # prepare for unaligned load
lvx $inout,0,$inp
addi $inp,$inp,15 # 15 is not typo
le?vxor $inpperm,$inpperm,$tmp
${UCMP}i $key2,0 # key2==NULL?
beq Lxts_enc_no_key2
?lvsl $keyperm,0,$key2 # prepare for unaligned key
lwz $rounds,240($key2)
srwi $rounds,$rounds,1
subi $rounds,$rounds,1
li $idx,16
lvx $rndkey0,0,$key2
lvx $rndkey1,$idx,$key2
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $tweak,$tweak,$rndkey0
lvx $rndkey0,$idx,$key2
addi $idx,$idx,16
mtctr $rounds
Ltweak_xts_enc:
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vcipher $tweak,$tweak,$rndkey1
lvx $rndkey1,$idx,$key2
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vcipher $tweak,$tweak,$rndkey0
lvx $rndkey0,$idx,$key2
addi $idx,$idx,16
bdnz Ltweak_xts_enc
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vcipher $tweak,$tweak,$rndkey1
lvx $rndkey1,$idx,$key2
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vcipherlast $tweak,$tweak,$rndkey0
li $ivp,0 # don't chain the tweak
b Lxts_enc
Lxts_enc_no_key2:
li $idx,-16
and $len,$len,$idx # in "tweak chaining"
# mode only complete
# blocks are processed
Lxts_enc:
lvx $inptail,0,$inp
addi $inp,$inp,16
?lvsl $keyperm,0,$key1 # prepare for unaligned key
lwz $rounds,240($key1)
srwi $rounds,$rounds,1
subi $rounds,$rounds,1
li $idx,16
vslb $eighty7,$seven,$seven # 0x808080..80
vor $eighty7,$eighty7,$seven # 0x878787..87
vspltisb $tmp,1 # 0x010101..01
vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01
${UCMP}i $len,96
bge _aesp8_xts_encrypt6x
andi. $taillen,$len,15
subic r0,$len,32
subi $taillen,$taillen,16
subfe r0,r0,r0
and r0,r0,$taillen
add $inp,$inp,r0
lvx $rndkey0,0,$key1
lvx $rndkey1,$idx,$key1
addi $idx,$idx,16
vperm $inout,$inout,$inptail,$inpperm
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $inout,$inout,$tweak
vxor $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key1
addi $idx,$idx,16
mtctr $rounds
b Loop_xts_enc
.align 5
Loop_xts_enc:
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vcipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key1
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vcipher $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key1
addi $idx,$idx,16
bdnz Loop_xts_enc
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vcipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key1
li $idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $rndkey0,$rndkey0,$tweak
vcipherlast $output,$inout,$rndkey0
le?vperm $tmp,$output,$output,$leperm
be?nop
le?stvx_u $tmp,0,$out
be?stvx_u $output,0,$out
addi $out,$out,16
subic. $len,$len,16
beq Lxts_enc_done
vmr $inout,$inptail
lvx $inptail,0,$inp
addi $inp,$inp,16
lvx $rndkey0,0,$key1
lvx $rndkey1,$idx,$key1
addi $idx,$idx,16
subic r0,$len,32
subfe r0,r0,r0
and r0,r0,$taillen
add $inp,$inp,r0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vand $tmp,$tmp,$eighty7
vxor $tweak,$tweak,$tmp
vperm $inout,$inout,$inptail,$inpperm
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $inout,$inout,$tweak
vxor $output,$output,$rndkey0 # just in case $len<16
vxor $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key1
addi $idx,$idx,16
mtctr $rounds
${UCMP}i $len,16
bge Loop_xts_enc
vxor $output,$output,$tweak
lvsr $inpperm,0,$len # $inpperm is no longer needed
vxor $inptail,$inptail,$inptail # $inptail is no longer needed
vspltisb $tmp,-1
vperm $inptail,$inptail,$tmp,$inpperm
vsel $inout,$inout,$output,$inptail
subi r11,$out,17
subi $out,$out,16
mtctr $len
li $len,16
Loop_xts_enc_steal:
lbzu r0,1(r11)
stb r0,16(r11)
bdnz Loop_xts_enc_steal
mtctr $rounds
b Loop_xts_enc # one more time...
Lxts_enc_done:
${UCMP}i $ivp,0
beq Lxts_enc_ret
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vand $tmp,$tmp,$eighty7
vxor $tweak,$tweak,$tmp
le?vperm $tweak,$tweak,$tweak,$leperm
stvx_u $tweak,0,$ivp
Lxts_enc_ret:
mtspr 256,r12 # restore vrsave
li r3,0
blr
.long 0
.byte 0,12,0x04,0,0x80,6,6,0
.long 0
.size .${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
.globl .${prefix}_xts_decrypt
mr $inp,r3 # reassign
li r3,-1
${UCMP}i $len,16
bltlr-
lis r0,0xfff8
mfspr r12,256 # save vrsave
li r11,0
mtspr 256,r0
andi. r0,$len,15
neg r0,r0
andi. r0,r0,16
sub $len,$len,r0
vspltisb $seven,0x07 # 0x070707..07
le?lvsl $leperm,r11,r11
le?vspltisb $tmp,0x0f
le?vxor $leperm,$leperm,$seven
li $idx,15
lvx $tweak,0,$ivp # load [unaligned] iv
lvsl $inpperm,0,$ivp
lvx $inptail,$idx,$ivp
le?vxor $inpperm,$inpperm,$tmp
vperm $tweak,$tweak,$inptail,$inpperm
neg r11,$inp
lvsr $inpperm,0,r11 # prepare for unaligned load
lvx $inout,0,$inp
addi $inp,$inp,15 # 15 is not typo
le?vxor $inpperm,$inpperm,$tmp
${UCMP}i $key2,0 # key2==NULL?
beq Lxts_dec_no_key2
?lvsl $keyperm,0,$key2 # prepare for unaligned key
lwz $rounds,240($key2)
srwi $rounds,$rounds,1
subi $rounds,$rounds,1
li $idx,16
lvx $rndkey0,0,$key2
lvx $rndkey1,$idx,$key2
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $tweak,$tweak,$rndkey0
lvx $rndkey0,$idx,$key2
addi $idx,$idx,16
mtctr $rounds
Ltweak_xts_dec:
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vcipher $tweak,$tweak,$rndkey1
lvx $rndkey1,$idx,$key2
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vcipher $tweak,$tweak,$rndkey0
lvx $rndkey0,$idx,$key2
addi $idx,$idx,16
bdnz Ltweak_xts_dec
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vcipher $tweak,$tweak,$rndkey1
lvx $rndkey1,$idx,$key2
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vcipherlast $tweak,$tweak,$rndkey0
li $ivp,0 # don't chain the tweak
b Lxts_dec
Lxts_dec_no_key2:
neg $idx,$len
andi. $idx,$idx,15
add $len,$len,$idx # in "tweak chaining"
# mode only complete
# blocks are processed
Lxts_dec:
lvx $inptail,0,$inp
addi $inp,$inp,16
?lvsl $keyperm,0,$key1 # prepare for unaligned key
lwz $rounds,240($key1)
srwi $rounds,$rounds,1
subi $rounds,$rounds,1
li $idx,16
vslb $eighty7,$seven,$seven # 0x808080..80
vor $eighty7,$eighty7,$seven # 0x878787..87
vspltisb $tmp,1 # 0x010101..01
vsldoi $eighty7,$eighty7,$tmp,15 # 0x870101..01
${UCMP}i $len,96
bge _aesp8_xts_decrypt6x
lvx $rndkey0,0,$key1
lvx $rndkey1,$idx,$key1
addi $idx,$idx,16
vperm $inout,$inout,$inptail,$inpperm
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $inout,$inout,$tweak
vxor $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key1
addi $idx,$idx,16
mtctr $rounds
${UCMP}i $len,16
blt Ltail_xts_dec
be?b Loop_xts_dec
.align 5
Loop_xts_dec:
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vncipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key1
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vncipher $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key1
addi $idx,$idx,16
bdnz Loop_xts_dec
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vncipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key1
li $idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $rndkey0,$rndkey0,$tweak
vncipherlast $output,$inout,$rndkey0
le?vperm $tmp,$output,$output,$leperm
be?nop
le?stvx_u $tmp,0,$out
be?stvx_u $output,0,$out
addi $out,$out,16
subic. $len,$len,16
beq Lxts_dec_done
vmr $inout,$inptail
lvx $inptail,0,$inp
addi $inp,$inp,16
lvx $rndkey0,0,$key1
lvx $rndkey1,$idx,$key1
addi $idx,$idx,16
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vand $tmp,$tmp,$eighty7
vxor $tweak,$tweak,$tmp
vperm $inout,$inout,$inptail,$inpperm
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $inout,$inout,$tweak
vxor $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key1
addi $idx,$idx,16
mtctr $rounds
${UCMP}i $len,16
bge Loop_xts_dec
Ltail_xts_dec:
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak1,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vand $tmp,$tmp,$eighty7
vxor $tweak1,$tweak1,$tmp
subi $inp,$inp,16
add $inp,$inp,$len
vxor $inout,$inout,$tweak # :-(
vxor $inout,$inout,$tweak1 # :-)
Loop_xts_dec_short:
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vncipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key1
addi $idx,$idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vncipher $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key1
addi $idx,$idx,16
bdnz Loop_xts_dec_short
?vperm $rndkey1,$rndkey1,$rndkey0,$keyperm
vncipher $inout,$inout,$rndkey1
lvx $rndkey1,$idx,$key1
li $idx,16
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
vxor $rndkey0,$rndkey0,$tweak1
vncipherlast $output,$inout,$rndkey0
le?vperm $tmp,$output,$output,$leperm
be?nop
le?stvx_u $tmp,0,$out
be?stvx_u $output,0,$out
vmr $inout,$inptail
lvx $inptail,0,$inp
#addi $inp,$inp,16
lvx $rndkey0,0,$key1
lvx $rndkey1,$idx,$key1
addi $idx,$idx,16
vperm $inout,$inout,$inptail,$inpperm
?vperm $rndkey0,$rndkey0,$rndkey1,$keyperm
lvsr $inpperm,0,$len # $inpperm is no longer needed
vxor $inptail,$inptail,$inptail # $inptail is no longer needed
vspltisb $tmp,-1
vperm $inptail,$inptail,$tmp,$inpperm
vsel $inout,$inout,$output,$inptail
vxor $rndkey0,$rndkey0,$tweak
vxor $inout,$inout,$rndkey0
lvx $rndkey0,$idx,$key1
addi $idx,$idx,16
subi r11,$out,1
mtctr $len
li $len,16
Loop_xts_dec_steal:
lbzu r0,1(r11)
stb r0,16(r11)
bdnz Loop_xts_dec_steal
mtctr $rounds
b Loop_xts_dec # one more time...
Lxts_dec_done:
${UCMP}i $ivp,0
beq Lxts_dec_ret
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vand $tmp,$tmp,$eighty7
vxor $tweak,$tweak,$tmp
le?vperm $tweak,$tweak,$tweak,$leperm
stvx_u $tweak,0,$ivp
Lxts_dec_ret:
mtspr 256,r12 # restore vrsave
li r3,0
blr
.long 0
.byte 0,12,0x04,0,0x80,6,6,0
.long 0
.size .${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
___
#########################################################################
{{ # Optimized XTS procedures #
my $key_=$key2;
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
$x00=0 if ($flavour =~ /osx/);
my ($in0, $in1, $in2, $in3, $in4, $in5 )=map("v$_",(0..5));
my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
my $rndkey0="v23"; # v24-v25 rotating buffer for first found keys
# v26-v31 last 6 round keys
my ($keyperm)=($out0); # aliases with "caller", redundant assignment
my $taillen=$x70;
$code.=<<___;
.align 5
_aesp8_xts_encrypt6x:
$STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
mflr r11
li r7,`$FRAME+8*16+15`
li r3,`$FRAME+8*16+31`
$PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
stvx v20,r7,$sp # ABI says so
addi r7,r7,32
stvx v21,r3,$sp
addi r3,r3,32
stvx v22,r7,$sp
addi r7,r7,32
stvx v23,r3,$sp
addi r3,r3,32
stvx v24,r7,$sp
addi r7,r7,32
stvx v25,r3,$sp
addi r3,r3,32
stvx v26,r7,$sp
addi r7,r7,32
stvx v27,r3,$sp
addi r3,r3,32
stvx v28,r7,$sp
addi r7,r7,32
stvx v29,r3,$sp
addi r3,r3,32
stvx v30,r7,$sp
stvx v31,r3,$sp
li r0,-1
stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
li $x10,0x10
$PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
li $x20,0x20
$PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
li $x30,0x30
$PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
li $x40,0x40
$PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
li $x50,0x50
$PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
li $x60,0x60
$PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
li $x70,0x70
mtspr 256,r0
subi $rounds,$rounds,3 # -4 in total
lvx $rndkey0,$x00,$key1 # load key schedule
lvx v30,$x10,$key1
addi $key1,$key1,0x20
lvx v31,$x00,$key1
?vperm $rndkey0,$rndkey0,v30,$keyperm
addi $key_,$sp,$FRAME+15
mtctr $rounds
Load_xts_enc_key:
?vperm v24,v30,v31,$keyperm
lvx v30,$x10,$key1
addi $key1,$key1,0x20
stvx v24,$x00,$key_ # off-load round[1]
?vperm v25,v31,v30,$keyperm
lvx v31,$x00,$key1
stvx v25,$x10,$key_ # off-load round[2]
addi $key_,$key_,0x20
bdnz Load_xts_enc_key
lvx v26,$x10,$key1
?vperm v24,v30,v31,$keyperm
lvx v27,$x20,$key1
stvx v24,$x00,$key_ # off-load round[3]
?vperm v25,v31,v26,$keyperm
lvx v28,$x30,$key1
stvx v25,$x10,$key_ # off-load round[4]
addi $key_,$sp,$FRAME+15 # rewind $key_
?vperm v26,v26,v27,$keyperm
lvx v29,$x40,$key1
?vperm v27,v27,v28,$keyperm
lvx v30,$x50,$key1
?vperm v28,v28,v29,$keyperm
lvx v31,$x60,$key1
?vperm v29,v29,v30,$keyperm
lvx $twk5,$x70,$key1 # borrow $twk5
?vperm v30,v30,v31,$keyperm
lvx v24,$x00,$key_ # pre-load round[1]
?vperm v31,v31,$twk5,$keyperm
lvx v25,$x10,$key_ # pre-load round[2]
vperm $in0,$inout,$inptail,$inpperm
subi $inp,$inp,31 # undo "caller"
vxor $twk0,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vand $tmp,$tmp,$eighty7
vxor $out0,$in0,$twk0
vxor $tweak,$tweak,$tmp
lvx_u $in1,$x10,$inp
vxor $twk1,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
le?vperm $in1,$in1,$in1,$leperm
vand $tmp,$tmp,$eighty7
vxor $out1,$in1,$twk1
vxor $tweak,$tweak,$tmp
lvx_u $in2,$x20,$inp
andi. $taillen,$len,15
vxor $twk2,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
le?vperm $in2,$in2,$in2,$leperm
vand $tmp,$tmp,$eighty7
vxor $out2,$in2,$twk2
vxor $tweak,$tweak,$tmp
lvx_u $in3,$x30,$inp
sub $len,$len,$taillen
vxor $twk3,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
le?vperm $in3,$in3,$in3,$leperm
vand $tmp,$tmp,$eighty7
vxor $out3,$in3,$twk3
vxor $tweak,$tweak,$tmp
lvx_u $in4,$x40,$inp
subi $len,$len,0x60
vxor $twk4,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
le?vperm $in4,$in4,$in4,$leperm
vand $tmp,$tmp,$eighty7
vxor $out4,$in4,$twk4
vxor $tweak,$tweak,$tmp
lvx_u $in5,$x50,$inp
addi $inp,$inp,0x60
vxor $twk5,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
le?vperm $in5,$in5,$in5,$leperm
vand $tmp,$tmp,$eighty7
vxor $out5,$in5,$twk5
vxor $tweak,$tweak,$tmp
vxor v31,v31,$rndkey0
mtctr $rounds
b Loop_xts_enc6x
.align 5
Loop_xts_enc6x:
vcipher $out0,$out0,v24
vcipher $out1,$out1,v24
vcipher $out2,$out2,v24
vcipher $out3,$out3,v24
vcipher $out4,$out4,v24
vcipher $out5,$out5,v24
lvx v24,$x20,$key_ # round[3]
addi $key_,$key_,0x20
vcipher $out0,$out0,v25
vcipher $out1,$out1,v25
vcipher $out2,$out2,v25
vcipher $out3,$out3,v25
vcipher $out4,$out4,v25
vcipher $out5,$out5,v25
lvx v25,$x10,$key_ # round[4]
bdnz Loop_xts_enc6x
subic $len,$len,96 # $len-=96
vxor $in0,$twk0,v31 # xor with last round key
vcipher $out0,$out0,v24
vcipher $out1,$out1,v24
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk0,$tweak,$rndkey0
vaddubm $tweak,$tweak,$tweak
vcipher $out2,$out2,v24
vcipher $out3,$out3,v24
vsldoi $tmp,$tmp,$tmp,15
vcipher $out4,$out4,v24
vcipher $out5,$out5,v24
subfe. r0,r0,r0 # borrow?-1:0
vand $tmp,$tmp,$eighty7
vcipher $out0,$out0,v25
vcipher $out1,$out1,v25
vxor $tweak,$tweak,$tmp
vcipher $out2,$out2,v25
vcipher $out3,$out3,v25
vxor $in1,$twk1,v31
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk1,$tweak,$rndkey0
vcipher $out4,$out4,v25
vcipher $out5,$out5,v25
and r0,r0,$len
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vcipher $out0,$out0,v26
vcipher $out1,$out1,v26
vand $tmp,$tmp,$eighty7
vcipher $out2,$out2,v26
vcipher $out3,$out3,v26
vxor $tweak,$tweak,$tmp
vcipher $out4,$out4,v26
vcipher $out5,$out5,v26
add $inp,$inp,r0 # $inp is adjusted in such
# way that at exit from the
# loop inX-in5 are loaded
# with last "words"
vxor $in2,$twk2,v31
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk2,$tweak,$rndkey0
vaddubm $tweak,$tweak,$tweak
vcipher $out0,$out0,v27
vcipher $out1,$out1,v27
vsldoi $tmp,$tmp,$tmp,15
vcipher $out2,$out2,v27
vcipher $out3,$out3,v27
vand $tmp,$tmp,$eighty7
vcipher $out4,$out4,v27
vcipher $out5,$out5,v27
addi $key_,$sp,$FRAME+15 # rewind $key_
vxor $tweak,$tweak,$tmp
vcipher $out0,$out0,v28
vcipher $out1,$out1,v28
vxor $in3,$twk3,v31
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk3,$tweak,$rndkey0
vcipher $out2,$out2,v28
vcipher $out3,$out3,v28
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vcipher $out4,$out4,v28
vcipher $out5,$out5,v28
lvx v24,$x00,$key_ # re-pre-load round[1]
vand $tmp,$tmp,$eighty7
vcipher $out0,$out0,v29
vcipher $out1,$out1,v29
vxor $tweak,$tweak,$tmp
vcipher $out2,$out2,v29
vcipher $out3,$out3,v29
vxor $in4,$twk4,v31
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk4,$tweak,$rndkey0
vcipher $out4,$out4,v29
vcipher $out5,$out5,v29
lvx v25,$x10,$key_ # re-pre-load round[2]
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vcipher $out0,$out0,v30
vcipher $out1,$out1,v30
vand $tmp,$tmp,$eighty7
vcipher $out2,$out2,v30
vcipher $out3,$out3,v30
vxor $tweak,$tweak,$tmp
vcipher $out4,$out4,v30
vcipher $out5,$out5,v30
vxor $in5,$twk5,v31
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk5,$tweak,$rndkey0
vcipherlast $out0,$out0,$in0
lvx_u $in0,$x00,$inp # load next input block
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vcipherlast $out1,$out1,$in1
lvx_u $in1,$x10,$inp
vcipherlast $out2,$out2,$in2
le?vperm $in0,$in0,$in0,$leperm
lvx_u $in2,$x20,$inp
vand $tmp,$tmp,$eighty7
vcipherlast $out3,$out3,$in3
le?vperm $in1,$in1,$in1,$leperm
lvx_u $in3,$x30,$inp
vcipherlast $out4,$out4,$in4
le?vperm $in2,$in2,$in2,$leperm
lvx_u $in4,$x40,$inp
vxor $tweak,$tweak,$tmp
vcipherlast $tmp,$out5,$in5 # last block might be needed
# in stealing mode
le?vperm $in3,$in3,$in3,$leperm
lvx_u $in5,$x50,$inp
addi $inp,$inp,0x60
le?vperm $in4,$in4,$in4,$leperm
le?vperm $in5,$in5,$in5,$leperm
le?vperm $out0,$out0,$out0,$leperm
le?vperm $out1,$out1,$out1,$leperm
stvx_u $out0,$x00,$out # store output
vxor $out0,$in0,$twk0
le?vperm $out2,$out2,$out2,$leperm
stvx_u $out1,$x10,$out
vxor $out1,$in1,$twk1
le?vperm $out3,$out3,$out3,$leperm
stvx_u $out2,$x20,$out
vxor $out2,$in2,$twk2
le?vperm $out4,$out4,$out4,$leperm
stvx_u $out3,$x30,$out
vxor $out3,$in3,$twk3
le?vperm $out5,$tmp,$tmp,$leperm
stvx_u $out4,$x40,$out
vxor $out4,$in4,$twk4
le?stvx_u $out5,$x50,$out
be?stvx_u $tmp, $x50,$out
vxor $out5,$in5,$twk5
addi $out,$out,0x60
mtctr $rounds
beq Loop_xts_enc6x # did $len-=96 borrow?
addic. $len,$len,0x60
beq Lxts_enc6x_zero
cmpwi $len,0x20
blt Lxts_enc6x_one
nop
beq Lxts_enc6x_two
cmpwi $len,0x40
blt Lxts_enc6x_three
nop
beq Lxts_enc6x_four
Lxts_enc6x_five:
vxor $out0,$in1,$twk0
vxor $out1,$in2,$twk1
vxor $out2,$in3,$twk2
vxor $out3,$in4,$twk3
vxor $out4,$in5,$twk4
bl _aesp8_xts_enc5x
le?vperm $out0,$out0,$out0,$leperm
vmr $twk0,$twk5 # unused tweak
le?vperm $out1,$out1,$out1,$leperm
stvx_u $out0,$x00,$out # store output
le?vperm $out2,$out2,$out2,$leperm
stvx_u $out1,$x10,$out
le?vperm $out3,$out3,$out3,$leperm
stvx_u $out2,$x20,$out
vxor $tmp,$out4,$twk5 # last block prep for stealing
le?vperm $out4,$out4,$out4,$leperm
stvx_u $out3,$x30,$out
stvx_u $out4,$x40,$out
addi $out,$out,0x50
bne Lxts_enc6x_steal
b Lxts_enc6x_done
.align 4
Lxts_enc6x_four:
vxor $out0,$in2,$twk0
vxor $out1,$in3,$twk1
vxor $out2,$in4,$twk2
vxor $out3,$in5,$twk3
vxor $out4,$out4,$out4
bl _aesp8_xts_enc5x
le?vperm $out0,$out0,$out0,$leperm
vmr $twk0,$twk4 # unused tweak
le?vperm $out1,$out1,$out1,$leperm
stvx_u $out0,$x00,$out # store output
le?vperm $out2,$out2,$out2,$leperm
stvx_u $out1,$x10,$out
vxor $tmp,$out3,$twk4 # last block prep for stealing
le?vperm $out3,$out3,$out3,$leperm
stvx_u $out2,$x20,$out
stvx_u $out3,$x30,$out
addi $out,$out,0x40
bne Lxts_enc6x_steal
b Lxts_enc6x_done
.align 4
Lxts_enc6x_three:
vxor $out0,$in3,$twk0
vxor $out1,$in4,$twk1
vxor $out2,$in5,$twk2
vxor $out3,$out3,$out3
vxor $out4,$out4,$out4
bl _aesp8_xts_enc5x
le?vperm $out0,$out0,$out0,$leperm
vmr $twk0,$twk3 # unused tweak
le?vperm $out1,$out1,$out1,$leperm
stvx_u $out0,$x00,$out # store output
vxor $tmp,$out2,$twk3 # last block prep for stealing
le?vperm $out2,$out2,$out2,$leperm
stvx_u $out1,$x10,$out
stvx_u $out2,$x20,$out
addi $out,$out,0x30
bne Lxts_enc6x_steal
b Lxts_enc6x_done
.align 4
Lxts_enc6x_two:
vxor $out0,$in4,$twk0
vxor $out1,$in5,$twk1
vxor $out2,$out2,$out2
vxor $out3,$out3,$out3
vxor $out4,$out4,$out4
bl _aesp8_xts_enc5x
le?vperm $out0,$out0,$out0,$leperm
vmr $twk0,$twk2 # unused tweak
vxor $tmp,$out1,$twk2 # last block prep for stealing
le?vperm $out1,$out1,$out1,$leperm
stvx_u $out0,$x00,$out # store output
stvx_u $out1,$x10,$out
addi $out,$out,0x20
bne Lxts_enc6x_steal
b Lxts_enc6x_done
.align 4
Lxts_enc6x_one:
vxor $out0,$in5,$twk0
nop
Loop_xts_enc1x:
vcipher $out0,$out0,v24
lvx v24,$x20,$key_ # round[3]
addi $key_,$key_,0x20
vcipher $out0,$out0,v25
lvx v25,$x10,$key_ # round[4]
bdnz Loop_xts_enc1x
add $inp,$inp,$taillen
cmpwi $taillen,0
vcipher $out0,$out0,v24
subi $inp,$inp,16
vcipher $out0,$out0,v25
lvsr $inpperm,0,$taillen
vcipher $out0,$out0,v26
lvx_u $in0,0,$inp
vcipher $out0,$out0,v27
addi $key_,$sp,$FRAME+15 # rewind $key_
vcipher $out0,$out0,v28
lvx v24,$x00,$key_ # re-pre-load round[1]
vcipher $out0,$out0,v29
lvx v25,$x10,$key_ # re-pre-load round[2]
vxor $twk0,$twk0,v31
le?vperm $in0,$in0,$in0,$leperm
vcipher $out0,$out0,v30
vperm $in0,$in0,$in0,$inpperm
vcipherlast $out0,$out0,$twk0
vmr $twk0,$twk1 # unused tweak
vxor $tmp,$out0,$twk1 # last block prep for stealing
le?vperm $out0,$out0,$out0,$leperm
stvx_u $out0,$x00,$out # store output
addi $out,$out,0x10
bne Lxts_enc6x_steal
b Lxts_enc6x_done
.align 4
Lxts_enc6x_zero:
cmpwi $taillen,0
beq Lxts_enc6x_done
add $inp,$inp,$taillen
subi $inp,$inp,16
lvx_u $in0,0,$inp
lvsr $inpperm,0,$taillen # $in5 is no more
le?vperm $in0,$in0,$in0,$leperm
vperm $in0,$in0,$in0,$inpperm
vxor $tmp,$tmp,$twk0
Lxts_enc6x_steal:
vxor $in0,$in0,$twk0
vxor $out0,$out0,$out0
vspltisb $out1,-1
vperm $out0,$out0,$out1,$inpperm
vsel $out0,$in0,$tmp,$out0 # $tmp is last block, remember?
subi r30,$out,17
subi $out,$out,16
mtctr $taillen
Loop_xts_enc6x_steal:
lbzu r0,1(r30)
stb r0,16(r30)
bdnz Loop_xts_enc6x_steal
li $taillen,0
mtctr $rounds
b Loop_xts_enc1x # one more time...
.align 4
Lxts_enc6x_done:
${UCMP}i $ivp,0
beq Lxts_enc6x_ret
vxor $tweak,$twk0,$rndkey0
le?vperm $tweak,$tweak,$tweak,$leperm
stvx_u $tweak,0,$ivp
Lxts_enc6x_ret:
mtlr r11
li r10,`$FRAME+15`
li r11,`$FRAME+31`
stvx $seven,r10,$sp # wipe copies of round keys
addi r10,r10,32
stvx $seven,r11,$sp
addi r11,r11,32
stvx $seven,r10,$sp
addi r10,r10,32
stvx $seven,r11,$sp
addi r11,r11,32
stvx $seven,r10,$sp
addi r10,r10,32
stvx $seven,r11,$sp
addi r11,r11,32
stvx $seven,r10,$sp
addi r10,r10,32
stvx $seven,r11,$sp
addi r11,r11,32
mtspr 256,$vrsave
lvx v20,r10,$sp # ABI says so
addi r10,r10,32
lvx v21,r11,$sp
addi r11,r11,32
lvx v22,r10,$sp
addi r10,r10,32
lvx v23,r11,$sp
addi r11,r11,32
lvx v24,r10,$sp
addi r10,r10,32
lvx v25,r11,$sp
addi r11,r11,32
lvx v26,r10,$sp
addi r10,r10,32
lvx v27,r11,$sp
addi r11,r11,32
lvx v28,r10,$sp
addi r10,r10,32
lvx v29,r11,$sp
addi r11,r11,32
lvx v30,r10,$sp
lvx v31,r11,$sp
$POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
$POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
$POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
$POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
$POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
$POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
blr
.long 0
.byte 0,12,0x04,1,0x80,6,6,0
.long 0
.align 5
_aesp8_xts_enc5x:
vcipher $out0,$out0,v24
vcipher $out1,$out1,v24
vcipher $out2,$out2,v24
vcipher $out3,$out3,v24
vcipher $out4,$out4,v24
lvx v24,$x20,$key_ # round[3]
addi $key_,$key_,0x20
vcipher $out0,$out0,v25
vcipher $out1,$out1,v25
vcipher $out2,$out2,v25
vcipher $out3,$out3,v25
vcipher $out4,$out4,v25
lvx v25,$x10,$key_ # round[4]
bdnz _aesp8_xts_enc5x
add $inp,$inp,$taillen
cmpwi $taillen,0
vcipher $out0,$out0,v24
vcipher $out1,$out1,v24
vcipher $out2,$out2,v24
vcipher $out3,$out3,v24
vcipher $out4,$out4,v24
subi $inp,$inp,16
vcipher $out0,$out0,v25
vcipher $out1,$out1,v25
vcipher $out2,$out2,v25
vcipher $out3,$out3,v25
vcipher $out4,$out4,v25
vxor $twk0,$twk0,v31
vcipher $out0,$out0,v26
lvsr $inpperm,r0,$taillen # $in5 is no more
vcipher $out1,$out1,v26
vcipher $out2,$out2,v26
vcipher $out3,$out3,v26
vcipher $out4,$out4,v26
vxor $in1,$twk1,v31
vcipher $out0,$out0,v27
lvx_u $in0,0,$inp
vcipher $out1,$out1,v27
vcipher $out2,$out2,v27
vcipher $out3,$out3,v27
vcipher $out4,$out4,v27
vxor $in2,$twk2,v31
addi $key_,$sp,$FRAME+15 # rewind $key_
vcipher $out0,$out0,v28
vcipher $out1,$out1,v28
vcipher $out2,$out2,v28
vcipher $out3,$out3,v28
vcipher $out4,$out4,v28
lvx v24,$x00,$key_ # re-pre-load round[1]
vxor $in3,$twk3,v31
vcipher $out0,$out0,v29
le?vperm $in0,$in0,$in0,$leperm
vcipher $out1,$out1,v29
vcipher $out2,$out2,v29
vcipher $out3,$out3,v29
vcipher $out4,$out4,v29
lvx v25,$x10,$key_ # re-pre-load round[2]
vxor $in4,$twk4,v31
vcipher $out0,$out0,v30
vperm $in0,$in0,$in0,$inpperm
vcipher $out1,$out1,v30
vcipher $out2,$out2,v30
vcipher $out3,$out3,v30
vcipher $out4,$out4,v30
vcipherlast $out0,$out0,$twk0
vcipherlast $out1,$out1,$in1
vcipherlast $out2,$out2,$in2
vcipherlast $out3,$out3,$in3
vcipherlast $out4,$out4,$in4
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
.align 5
_aesp8_xts_decrypt6x:
$STU $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
mflr r11
li r7,`$FRAME+8*16+15`
li r3,`$FRAME+8*16+31`
$PUSH r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
stvx v20,r7,$sp # ABI says so
addi r7,r7,32
stvx v21,r3,$sp
addi r3,r3,32
stvx v22,r7,$sp
addi r7,r7,32
stvx v23,r3,$sp
addi r3,r3,32
stvx v24,r7,$sp
addi r7,r7,32
stvx v25,r3,$sp
addi r3,r3,32
stvx v26,r7,$sp
addi r7,r7,32
stvx v27,r3,$sp
addi r3,r3,32
stvx v28,r7,$sp
addi r7,r7,32
stvx v29,r3,$sp
addi r3,r3,32
stvx v30,r7,$sp
stvx v31,r3,$sp
li r0,-1
stw $vrsave,`$FRAME+21*16-4`($sp) # save vrsave
li $x10,0x10
$PUSH r26,`$FRAME+21*16+0*$SIZE_T`($sp)
li $x20,0x20
$PUSH r27,`$FRAME+21*16+1*$SIZE_T`($sp)
li $x30,0x30
$PUSH r28,`$FRAME+21*16+2*$SIZE_T`($sp)
li $x40,0x40
$PUSH r29,`$FRAME+21*16+3*$SIZE_T`($sp)
li $x50,0x50
$PUSH r30,`$FRAME+21*16+4*$SIZE_T`($sp)
li $x60,0x60
$PUSH r31,`$FRAME+21*16+5*$SIZE_T`($sp)
li $x70,0x70
mtspr 256,r0
subi $rounds,$rounds,3 # -4 in total
lvx $rndkey0,$x00,$key1 # load key schedule
lvx v30,$x10,$key1
addi $key1,$key1,0x20
lvx v31,$x00,$key1
?vperm $rndkey0,$rndkey0,v30,$keyperm
addi $key_,$sp,$FRAME+15
mtctr $rounds
Load_xts_dec_key:
?vperm v24,v30,v31,$keyperm
lvx v30,$x10,$key1
addi $key1,$key1,0x20
stvx v24,$x00,$key_ # off-load round[1]
?vperm v25,v31,v30,$keyperm
lvx v31,$x00,$key1
stvx v25,$x10,$key_ # off-load round[2]
addi $key_,$key_,0x20
bdnz Load_xts_dec_key
lvx v26,$x10,$key1
?vperm v24,v30,v31,$keyperm
lvx v27,$x20,$key1
stvx v24,$x00,$key_ # off-load round[3]
?vperm v25,v31,v26,$keyperm
lvx v28,$x30,$key1
stvx v25,$x10,$key_ # off-load round[4]
addi $key_,$sp,$FRAME+15 # rewind $key_
?vperm v26,v26,v27,$keyperm
lvx v29,$x40,$key1
?vperm v27,v27,v28,$keyperm
lvx v30,$x50,$key1
?vperm v28,v28,v29,$keyperm
lvx v31,$x60,$key1
?vperm v29,v29,v30,$keyperm
lvx $twk5,$x70,$key1 # borrow $twk5
?vperm v30,v30,v31,$keyperm
lvx v24,$x00,$key_ # pre-load round[1]
?vperm v31,v31,$twk5,$keyperm
lvx v25,$x10,$key_ # pre-load round[2]
vperm $in0,$inout,$inptail,$inpperm
subi $inp,$inp,31 # undo "caller"
vxor $twk0,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vand $tmp,$tmp,$eighty7
vxor $out0,$in0,$twk0
vxor $tweak,$tweak,$tmp
lvx_u $in1,$x10,$inp
vxor $twk1,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
le?vperm $in1,$in1,$in1,$leperm
vand $tmp,$tmp,$eighty7
vxor $out1,$in1,$twk1
vxor $tweak,$tweak,$tmp
lvx_u $in2,$x20,$inp
andi. $taillen,$len,15
vxor $twk2,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
le?vperm $in2,$in2,$in2,$leperm
vand $tmp,$tmp,$eighty7
vxor $out2,$in2,$twk2
vxor $tweak,$tweak,$tmp
lvx_u $in3,$x30,$inp
sub $len,$len,$taillen
vxor $twk3,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
le?vperm $in3,$in3,$in3,$leperm
vand $tmp,$tmp,$eighty7
vxor $out3,$in3,$twk3
vxor $tweak,$tweak,$tmp
lvx_u $in4,$x40,$inp
subi $len,$len,0x60
vxor $twk4,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
le?vperm $in4,$in4,$in4,$leperm
vand $tmp,$tmp,$eighty7
vxor $out4,$in4,$twk4
vxor $tweak,$tweak,$tmp
lvx_u $in5,$x50,$inp
addi $inp,$inp,0x60
vxor $twk5,$tweak,$rndkey0
vsrab $tmp,$tweak,$seven # next tweak value
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
le?vperm $in5,$in5,$in5,$leperm
vand $tmp,$tmp,$eighty7
vxor $out5,$in5,$twk5
vxor $tweak,$tweak,$tmp
vxor v31,v31,$rndkey0
mtctr $rounds
b Loop_xts_dec6x
.align 5
Loop_xts_dec6x:
vncipher $out0,$out0,v24
vncipher $out1,$out1,v24
vncipher $out2,$out2,v24
vncipher $out3,$out3,v24
vncipher $out4,$out4,v24
vncipher $out5,$out5,v24
lvx v24,$x20,$key_ # round[3]
addi $key_,$key_,0x20
vncipher $out0,$out0,v25
vncipher $out1,$out1,v25
vncipher $out2,$out2,v25
vncipher $out3,$out3,v25
vncipher $out4,$out4,v25
vncipher $out5,$out5,v25
lvx v25,$x10,$key_ # round[4]
bdnz Loop_xts_dec6x
subic $len,$len,96 # $len-=96
vxor $in0,$twk0,v31 # xor with last round key
vncipher $out0,$out0,v24
vncipher $out1,$out1,v24
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk0,$tweak,$rndkey0
vaddubm $tweak,$tweak,$tweak
vncipher $out2,$out2,v24
vncipher $out3,$out3,v24
vsldoi $tmp,$tmp,$tmp,15
vncipher $out4,$out4,v24
vncipher $out5,$out5,v24
subfe. r0,r0,r0 # borrow?-1:0
vand $tmp,$tmp,$eighty7
vncipher $out0,$out0,v25
vncipher $out1,$out1,v25
vxor $tweak,$tweak,$tmp
vncipher $out2,$out2,v25
vncipher $out3,$out3,v25
vxor $in1,$twk1,v31
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk1,$tweak,$rndkey0
vncipher $out4,$out4,v25
vncipher $out5,$out5,v25
and r0,r0,$len
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vncipher $out0,$out0,v26
vncipher $out1,$out1,v26
vand $tmp,$tmp,$eighty7
vncipher $out2,$out2,v26
vncipher $out3,$out3,v26
vxor $tweak,$tweak,$tmp
vncipher $out4,$out4,v26
vncipher $out5,$out5,v26
add $inp,$inp,r0 # $inp is adjusted in such
# way that at exit from the
# loop inX-in5 are loaded
# with last "words"
vxor $in2,$twk2,v31
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk2,$tweak,$rndkey0
vaddubm $tweak,$tweak,$tweak
vncipher $out0,$out0,v27
vncipher $out1,$out1,v27
vsldoi $tmp,$tmp,$tmp,15
vncipher $out2,$out2,v27
vncipher $out3,$out3,v27
vand $tmp,$tmp,$eighty7
vncipher $out4,$out4,v27
vncipher $out5,$out5,v27
addi $key_,$sp,$FRAME+15 # rewind $key_
vxor $tweak,$tweak,$tmp
vncipher $out0,$out0,v28
vncipher $out1,$out1,v28
vxor $in3,$twk3,v31
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk3,$tweak,$rndkey0
vncipher $out2,$out2,v28
vncipher $out3,$out3,v28
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vncipher $out4,$out4,v28
vncipher $out5,$out5,v28
lvx v24,$x00,$key_ # re-pre-load round[1]
vand $tmp,$tmp,$eighty7
vncipher $out0,$out0,v29
vncipher $out1,$out1,v29
vxor $tweak,$tweak,$tmp
vncipher $out2,$out2,v29
vncipher $out3,$out3,v29
vxor $in4,$twk4,v31
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk4,$tweak,$rndkey0
vncipher $out4,$out4,v29
vncipher $out5,$out5,v29
lvx v25,$x10,$key_ # re-pre-load round[2]
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vncipher $out0,$out0,v30
vncipher $out1,$out1,v30
vand $tmp,$tmp,$eighty7
vncipher $out2,$out2,v30
vncipher $out3,$out3,v30
vxor $tweak,$tweak,$tmp
vncipher $out4,$out4,v30
vncipher $out5,$out5,v30
vxor $in5,$twk5,v31
vsrab $tmp,$tweak,$seven # next tweak value
vxor $twk5,$tweak,$rndkey0
vncipherlast $out0,$out0,$in0
lvx_u $in0,$x00,$inp # load next input block
vaddubm $tweak,$tweak,$tweak
vsldoi $tmp,$tmp,$tmp,15
vncipherlast $out1,$out1,$in1
lvx_u $in1,$x10,$inp
vncipherlast $out2,$out2,$in2
le?vperm $in0,$in0,$in0,$leperm
lvx_u $in2,$x20,$inp
vand $tmp,$tmp,$eighty7
vncipherlast $out3,$out3,$in3
le?vperm $in1,$in1,$in1,$leperm
lvx_u $in3,$x30,$inp
vncipherlast $out4,$out4,$in4
le?vperm $in2,$in2,$in2,$leperm
lvx_u $in4,$x40,$inp
vxor $tweak,$tweak,$tmp
vncipherlast $out5,$out5,$in5
le?vperm $in3,$in3,$in3,$leperm
lvx_u $in5,$x50,$inp
addi $inp,$inp,0x60
le?vperm $in4,$in4,$in4,$leperm
le?vperm $in5,$in5,$in5,$leperm
le?vperm $out0,$out0,$out0,$leperm
le?vperm $out1,$out1,$out1,$leperm
stvx_u $out0,$x00,$out # store output
vxor $out0,$in0,$twk0
le?vperm $out2,$out2,$out2,$leperm
stvx_u $out1,$x10,$out
vxor $out1,$in1,$twk1
le?vperm $out3,$out3,$out3,$leperm
stvx_u $out2,$x20,$out
vxor $out2,$in2,$twk2
le?vperm $out4,$out4,$out4,$leperm
stvx_u $out3,$x30,$out
vxor $out3,$in3,$twk3
le?vperm $out5,$out5,$out5,$leperm
stvx_u $out4,$x40,$out
vxor $out4,$in4,$twk4
stvx_u $out5,$x50,$out
vxor $out5,$in5,$twk5
addi $out,$out,0x60
mtctr $rounds
beq Loop_xts_dec6x # did $len-=96 borrow?
addic. $len,$len,0x60
beq Lxts_dec6x_zero
cmpwi $len,0x20
blt Lxts_dec6x_one
nop
beq Lxts_dec6x_two
cmpwi $len,0x40
blt Lxts_dec6x_three
nop
beq Lxts_dec6x_four
Lxts_dec6x_five:
vxor $out0,$in1,$twk0
vxor $out1,$in2,$twk1
vxor $out2,$in3,$twk2
vxor $out3,$in4,$twk3
vxor $out4,$in5,$twk4
bl _aesp8_xts_dec5x
le?vperm $out0,$out0,$out0,$leperm
vmr $twk0,$twk5 # unused tweak
vxor $twk1,$tweak,$rndkey0
le?vperm $out1,$out1,$out1,$leperm
stvx_u $out0,$x00,$out # store output
vxor $out0,$in0,$twk1
le?vperm $out2,$out2,$out2,$leperm
stvx_u $out1,$x10,$out
le?vperm $out3,$out3,$out3,$leperm
stvx_u $out2,$x20,$out
le?vperm $out4,$out4,$out4,$leperm
stvx_u $out3,$x30,$out
stvx_u $out4,$x40,$out
addi $out,$out,0x50
bne Lxts_dec6x_steal
b Lxts_dec6x_done
.align 4
Lxts_dec6x_four:
vxor $out0,$in2,$twk0
vxor $out1,$in3,$twk1
vxor $out2,$in4,$twk2
vxor $out3,$in5,$twk3
vxor $out4,$out4,$out4
bl _aesp8_xts_dec5x
le?vperm $out0,$out0,$out0,$leperm
vmr $twk0,$twk4 # unused tweak
vmr $twk1,$twk5
le?vperm $out1,$out1,$out1,$leperm
stvx_u $out0,$x00,$out # store output
vxor $out0,$in0,$twk5
le?vperm $out2,$out2,$out2,$leperm
stvx_u $out1,$x10,$out
le?vperm $out3,$out3,$out3,$leperm
stvx_u $out2,$x20,$out
stvx_u $out3,$x30,$out
addi $out,$out,0x40
bne Lxts_dec6x_steal
b Lxts_dec6x_done
.align 4
Lxts_dec6x_three:
vxor $out0,$in3,$twk0
vxor $out1,$in4,$twk1
vxor $out2,$in5,$twk2
vxor $out3,$out3,$out3
vxor $out4,$out4,$out4
bl _aesp8_xts_dec5x
le?vperm $out0,$out0,$out0,$leperm
vmr $twk0,$twk3 # unused tweak
vmr $twk1,$twk4
le?vperm $out1,$out1,$out1,$leperm
stvx_u $out0,$x00,$out # store output
vxor $out0,$in0,$twk4
le?vperm $out2,$out2,$out2,$leperm
stvx_u $out1,$x10,$out
stvx_u $out2,$x20,$out
addi $out,$out,0x30
bne Lxts_dec6x_steal
b Lxts_dec6x_done
.align 4
Lxts_dec6x_two:
vxor $out0,$in4,$twk0
vxor $out1,$in5,$twk1
vxor $out2,$out2,$out2
vxor $out3,$out3,$out3
vxor $out4,$out4,$out4
bl _aesp8_xts_dec5x
le?vperm $out0,$out0,$out0,$leperm
vmr $twk0,$twk2 # unused tweak
vmr $twk1,$twk3
le?vperm $out1,$out1,$out1,$leperm
stvx_u $out0,$x00,$out # store output
vxor $out0,$in0,$twk3
stvx_u $out1,$x10,$out
addi $out,$out,0x20
bne Lxts_dec6x_steal
b Lxts_dec6x_done
.align 4
Lxts_dec6x_one:
vxor $out0,$in5,$twk0
nop
Loop_xts_dec1x:
vncipher $out0,$out0,v24
lvx v24,$x20,$key_ # round[3]
addi $key_,$key_,0x20
vncipher $out0,$out0,v25
lvx v25,$x10,$key_ # round[4]
bdnz Loop_xts_dec1x
subi r0,$taillen,1
vncipher $out0,$out0,v24
andi. r0,r0,16
cmpwi $taillen,0
vncipher $out0,$out0,v25
sub $inp,$inp,r0
vncipher $out0,$out0,v26
lvx_u $in0,0,$inp
vncipher $out0,$out0,v27
addi $key_,$sp,$FRAME+15 # rewind $key_
vncipher $out0,$out0,v28
lvx v24,$x00,$key_ # re-pre-load round[1]
vncipher $out0,$out0,v29
lvx v25,$x10,$key_ # re-pre-load round[2]
vxor $twk0,$twk0,v31
le?vperm $in0,$in0,$in0,$leperm
vncipher $out0,$out0,v30
mtctr $rounds
vncipherlast $out0,$out0,$twk0
vmr $twk0,$twk1 # unused tweak
vmr $twk1,$twk2
le?vperm $out0,$out0,$out0,$leperm
stvx_u $out0,$x00,$out # store output
addi $out,$out,0x10
vxor $out0,$in0,$twk2
bne Lxts_dec6x_steal
b Lxts_dec6x_done
.align 4
Lxts_dec6x_zero:
cmpwi $taillen,0
beq Lxts_dec6x_done
lvx_u $in0,0,$inp
le?vperm $in0,$in0,$in0,$leperm
vxor $out0,$in0,$twk1
Lxts_dec6x_steal:
vncipher $out0,$out0,v24
lvx v24,$x20,$key_ # round[3]
addi $key_,$key_,0x20
vncipher $out0,$out0,v25
lvx v25,$x10,$key_ # round[4]
bdnz Lxts_dec6x_steal
add $inp,$inp,$taillen
vncipher $out0,$out0,v24
cmpwi $taillen,0
vncipher $out0,$out0,v25
lvx_u $in0,0,$inp
vncipher $out0,$out0,v26
lvsr $inpperm,0,$taillen # $in5 is no more
vncipher $out0,$out0,v27
addi $key_,$sp,$FRAME+15 # rewind $key_
vncipher $out0,$out0,v28
lvx v24,$x00,$key_ # re-pre-load round[1]
vncipher $out0,$out0,v29
lvx v25,$x10,$key_ # re-pre-load round[2]
vxor $twk1,$twk1,v31
le?vperm $in0,$in0,$in0,$leperm
vncipher $out0,$out0,v30
vperm $in0,$in0,$in0,$inpperm
vncipherlast $tmp,$out0,$twk1
le?vperm $out0,$tmp,$tmp,$leperm
le?stvx_u $out0,0,$out
be?stvx_u $tmp,0,$out
vxor $out0,$out0,$out0
vspltisb $out1,-1
vperm $out0,$out0,$out1,$inpperm
vsel $out0,$in0,$tmp,$out0
vxor $out0,$out0,$twk0
subi r30,$out,1
mtctr $taillen
Loop_xts_dec6x_steal:
lbzu r0,1(r30)
stb r0,16(r30)
bdnz Loop_xts_dec6x_steal
li $taillen,0
mtctr $rounds
b Loop_xts_dec1x # one more time...
.align 4
Lxts_dec6x_done:
${UCMP}i $ivp,0
beq Lxts_dec6x_ret
vxor $tweak,$twk0,$rndkey0
le?vperm $tweak,$tweak,$tweak,$leperm
stvx_u $tweak,0,$ivp
Lxts_dec6x_ret:
mtlr r11
li r10,`$FRAME+15`
li r11,`$FRAME+31`
stvx $seven,r10,$sp # wipe copies of round keys
addi r10,r10,32
stvx $seven,r11,$sp
addi r11,r11,32
stvx $seven,r10,$sp
addi r10,r10,32
stvx $seven,r11,$sp
addi r11,r11,32
stvx $seven,r10,$sp
addi r10,r10,32
stvx $seven,r11,$sp
addi r11,r11,32
stvx $seven,r10,$sp
addi r10,r10,32
stvx $seven,r11,$sp
addi r11,r11,32
mtspr 256,$vrsave
lvx v20,r10,$sp # ABI says so
addi r10,r10,32
lvx v21,r11,$sp
addi r11,r11,32
lvx v22,r10,$sp
addi r10,r10,32
lvx v23,r11,$sp
addi r11,r11,32
lvx v24,r10,$sp
addi r10,r10,32
lvx v25,r11,$sp
addi r11,r11,32
lvx v26,r10,$sp
addi r10,r10,32
lvx v27,r11,$sp
addi r11,r11,32
lvx v28,r10,$sp
addi r10,r10,32
lvx v29,r11,$sp
addi r11,r11,32
lvx v30,r10,$sp
lvx v31,r11,$sp
$POP r26,`$FRAME+21*16+0*$SIZE_T`($sp)
$POP r27,`$FRAME+21*16+1*$SIZE_T`($sp)
$POP r28,`$FRAME+21*16+2*$SIZE_T`($sp)
$POP r29,`$FRAME+21*16+3*$SIZE_T`($sp)
$POP r30,`$FRAME+21*16+4*$SIZE_T`($sp)
$POP r31,`$FRAME+21*16+5*$SIZE_T`($sp)
addi $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
blr
.long 0
.byte 0,12,0x04,1,0x80,6,6,0
.long 0
.align 5
_aesp8_xts_dec5x:
vncipher $out0,$out0,v24
vncipher $out1,$out1,v24
vncipher $out2,$out2,v24
vncipher $out3,$out3,v24
vncipher $out4,$out4,v24
lvx v24,$x20,$key_ # round[3]
addi $key_,$key_,0x20
vncipher $out0,$out0,v25
vncipher $out1,$out1,v25
vncipher $out2,$out2,v25
vncipher $out3,$out3,v25
vncipher $out4,$out4,v25
lvx v25,$x10,$key_ # round[4]
bdnz _aesp8_xts_dec5x
subi r0,$taillen,1
vncipher $out0,$out0,v24
vncipher $out1,$out1,v24
vncipher $out2,$out2,v24
vncipher $out3,$out3,v24
vncipher $out4,$out4,v24
andi. r0,r0,16
cmpwi $taillen,0
vncipher $out0,$out0,v25
vncipher $out1,$out1,v25
vncipher $out2,$out2,v25
vncipher $out3,$out3,v25
vncipher $out4,$out4,v25
vxor $twk0,$twk0,v31
sub $inp,$inp,r0
vncipher $out0,$out0,v26
vncipher $out1,$out1,v26
vncipher $out2,$out2,v26
vncipher $out3,$out3,v26
vncipher $out4,$out4,v26
vxor $in1,$twk1,v31
vncipher $out0,$out0,v27
lvx_u $in0,0,$inp
vncipher $out1,$out1,v27
vncipher $out2,$out2,v27
vncipher $out3,$out3,v27
vncipher $out4,$out4,v27
vxor $in2,$twk2,v31
addi $key_,$sp,$FRAME+15 # rewind $key_
vncipher $out0,$out0,v28
vncipher $out1,$out1,v28
vncipher $out2,$out2,v28
vncipher $out3,$out3,v28
vncipher $out4,$out4,v28
lvx v24,$x00,$key_ # re-pre-load round[1]
vxor $in3,$twk3,v31
vncipher $out0,$out0,v29
le?vperm $in0,$in0,$in0,$leperm
vncipher $out1,$out1,v29
vncipher $out2,$out2,v29
vncipher $out3,$out3,v29
vncipher $out4,$out4,v29
lvx v25,$x10,$key_ # re-pre-load round[2]
vxor $in4,$twk4,v31
vncipher $out0,$out0,v30
vncipher $out1,$out1,v30
vncipher $out2,$out2,v30
vncipher $out3,$out3,v30
vncipher $out4,$out4,v30
vncipherlast $out0,$out0,$twk0
vncipherlast $out1,$out1,$in1
vncipherlast $out2,$out2,$in2
vncipherlast $out3,$out3,$in3
vncipherlast $out4,$out4,$in4
mtctr $rounds
blr
.long 0
.byte 0,12,0x14,0,0,0,0,0
___
}} }}}
my $consts=1;
foreach(split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/geo;
# constants table endian-specific conversion
if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
my $conv=$3;
my @bytes=();
# convert to endian-agnostic format
if ($1 eq "long") {
foreach (split(/,\s*/,$2)) {
my $l = /^0/?oct:int;
push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
}
} else {
@bytes = map(/^0/?oct:int,split(/,\s*/,$2));
}
# little-endian conversion
if ($flavour =~ /le$/o) {
SWITCH: for($conv) {
/\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
/\?rev/ && do { @bytes=reverse(@bytes); last; };
}
}
#emit
print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
next;
}
$consts=0 if (m/Lconsts:/o); # end of table
# instructions prefixed with '?' are endian-specific and need
# to be adjusted accordingly...
if ($flavour =~ /le$/o) { # little-endian
s/le\?//o or
s/be\?/#be#/o or
s/\?lvsr/lvsl/o or
s/\?lvsl/lvsr/o or
s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
} else { # big-endian
s/le\?/#le#/o or
s/be\?//o or
s/\?([a-z]+)/$1/o;
}
print $_,"\n";
}
close STDOUT;