/* SPDX-License-Identifier: GPL-2.0-or-later */
#
# Accelerated AES-GCM stitched implementation for ppc64le.
#
# Copyright 2022- IBM Inc. All rights reserved
#
#===================================================================================
# Written by Danny Tsen <dtsen@linux.ibm.com>
#
# GHASH is based on the Karatsuba multiplication method.
#
# Xi xor X1
#
# X1 * H^4 + X2 * H^3 + X3 * H^2 + X4 * H =
# (X1.h * H^4.h + X1.l * H^4.l + X1 * H^4) +
# (X2.h * H^3.h + X2.l * H^3.l + X2 * H^3) +
# (X3.h * H^2.h + X3.l * H^2.l + X3 * H^2) +
# (X4.h * H.h + X4.l * H.l + X4 * H)
#
# Xi = v0
# H Poly = v2
# Hash keys = v3 - v14
# ( H.l, H, H.h)
# ( H^2.l, H^2, H^2.h)
# ( H^3.l, H^3, H^3.h)
# ( H^4.l, H^4, H^4.h)
#
# v30 is IV
# v31 - counter 1
#
# Registers used for AES:
# vs0 - vs14 for round keys
# v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
#
# This implementation uses a stitched AES-GCM approach to improve overall performance.
# AES is implemented with 8x blocks and GHASH uses two 4x blocks.
#
# ===================================================================================
#
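# Rough mapping of the formula above onto the vpmsumd partial products used
# below (an illustrative summary of ppc_aes_gcm_ghash/ppc_aes_gcm_ghash2_4x):
#
#   L = sum of the low-half products   (the H^i.l keys)
#   M = sum of the cross products      (the middle H^i keys)
#   H = sum of the high-half products  (the H^i.h keys)
#
# M is split into mL/mH and folded into L and H, then the result is reduced
# twice with the field polynomial in v2 to form the updated Xi.
#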
#include <asm/ppc_asm.h>
#include <linux/linkage.h>
.machine "any"
.text
# 4x loops
# v15 - v18 - input states
# vs1 - vs9 - round keys
#
.macro Loop_aes_middle4x
xxlor 19+32, 1, 1
xxlor 20+32, 2, 2
xxlor 21+32, 3, 3
xxlor 22+32, 4, 4
vcipher 15, 15, 19
vcipher 16, 16, 19
vcipher 17, 17, 19
vcipher 18, 18, 19
vcipher 15, 15, 20
vcipher 16, 16, 20
vcipher 17, 17, 20
vcipher 18, 18, 20
vcipher 15, 15, 21
vcipher 16, 16, 21
vcipher 17, 17, 21
vcipher 18, 18, 21
vcipher 15, 15, 22
vcipher 16, 16, 22
vcipher 17, 17, 22
vcipher 18, 18, 22
xxlor 19+32, 5, 5
xxlor 20+32, 6, 6
xxlor 21+32, 7, 7
xxlor 22+32, 8, 8
vcipher 15, 15, 19
vcipher 16, 16, 19
vcipher 17, 17, 19
vcipher 18, 18, 19
vcipher 15, 15, 20
vcipher 16, 16, 20
vcipher 17, 17, 20
vcipher 18, 18, 20
vcipher 15, 15, 21
vcipher 16, 16, 21
vcipher 17, 17, 21
vcipher 18, 18, 21
vcipher 15, 15, 22
vcipher 16, 16, 22
vcipher 17, 17, 22
vcipher 18, 18, 22
xxlor 23+32, 9, 9
vcipher 15, 15, 23
vcipher 16, 16, 23
vcipher 17, 17, 23
vcipher 18, 18, 23
.endm
# 8x loops
# v15 - v22 - input states
# vs1 - vs9 - round keys
#
.macro Loop_aes_middle8x
xxlor 23+32, 1, 1
xxlor 24+32, 2, 2
xxlor 25+32, 3, 3
xxlor 26+32, 4, 4
vcipher 15, 15, 23
vcipher 16, 16, 23
vcipher 17, 17, 23
vcipher 18, 18, 23
vcipher 19, 19, 23
vcipher 20, 20, 23
vcipher 21, 21, 23
vcipher 22, 22, 23
vcipher 15, 15, 24
vcipher 16, 16, 24
vcipher 17, 17, 24
vcipher 18, 18, 24
vcipher 19, 19, 24
vcipher 20, 20, 24
vcipher 21, 21, 24
vcipher 22, 22, 24
vcipher 15, 15, 25
vcipher 16, 16, 25
vcipher 17, 17, 25
vcipher 18, 18, 25
vcipher 19, 19, 25
vcipher 20, 20, 25
vcipher 21, 21, 25
vcipher 22, 22, 25
vcipher 15, 15, 26
vcipher 16, 16, 26
vcipher 17, 17, 26
vcipher 18, 18, 26
vcipher 19, 19, 26
vcipher 20, 20, 26
vcipher 21, 21, 26
vcipher 22, 22, 26
xxlor 23+32, 5, 5
xxlor 24+32, 6, 6
xxlor 25+32, 7, 7
xxlor 26+32, 8, 8
vcipher 15, 15, 23
vcipher 16, 16, 23
vcipher 17, 17, 23
vcipher 18, 18, 23
vcipher 19, 19, 23
vcipher 20, 20, 23
vcipher 21, 21, 23
vcipher 22, 22, 23
vcipher 15, 15, 24
vcipher 16, 16, 24
vcipher 17, 17, 24
vcipher 18, 18, 24
vcipher 19, 19, 24
vcipher 20, 20, 24
vcipher 21, 21, 24
vcipher 22, 22, 24
vcipher 15, 15, 25
vcipher 16, 16, 25
vcipher 17, 17, 25
vcipher 18, 18, 25
vcipher 19, 19, 25
vcipher 20, 20, 25
vcipher 21, 21, 25
vcipher 22, 22, 25
vcipher 15, 15, 26
vcipher 16, 16, 26
vcipher 17, 17, 26
vcipher 18, 18, 26
vcipher 19, 19, 26
vcipher 20, 20, 26
vcipher 21, 21, 26
vcipher 22, 22, 26
xxlor 23+32, 9, 9
vcipher 15, 15, 23
vcipher 16, 16, 23
vcipher 17, 17, 23
vcipher 18, 18, 23
vcipher 19, 19, 23
vcipher 20, 20, 23
vcipher 21, 21, 23
vcipher 22, 22, 23
.endm
.macro Loop_aes_middle_1x
xxlor 19+32, 1, 1
xxlor 20+32, 2, 2
xxlor 21+32, 3, 3
xxlor 22+32, 4, 4
vcipher 15, 15, 19
vcipher 15, 15, 20
vcipher 15, 15, 21
vcipher 15, 15, 22
xxlor 19+32, 5, 5
xxlor 20+32, 6, 6
xxlor 21+32, 7, 7
xxlor 22+32, 8, 8
vcipher 15, 15, 19
vcipher 15, 15, 20
vcipher 15, 15, 21
vcipher 15, 15, 22
xxlor 19+32, 9, 9
vcipher 15, 15, 19
.endm
#
# Compute 4x hash values based on the Karatsuba method.
#
.macro ppc_aes_gcm_ghash
vxor 15, 15, 0
vpmsumd 23, 12, 15 # H4.L * X.L
vpmsumd 24, 9, 16
vpmsumd 25, 6, 17
vpmsumd 26, 3, 18
vxor 23, 23, 24
vxor 23, 23, 25
vxor 23, 23, 26 # L
vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
vpmsumd 25, 10, 16 # H3.L * X2.H + H3.H * X2.L
vpmsumd 26, 7, 17
vpmsumd 27, 4, 18
vxor 24, 24, 25
vxor 24, 24, 26
vxor 24, 24, 27 # M
# sum hash and reduction with H Poly
vpmsumd 28, 23, 2 # reduction
vxor 29, 29, 29
vsldoi 26, 24, 29, 8 # mL
vsldoi 29, 29, 24, 8 # mH
vxor 23, 23, 26 # mL + L
vsldoi 23, 23, 23, 8 # swap
vxor 23, 23, 28
vpmsumd 24, 14, 15 # H4.H * X.H
vpmsumd 25, 11, 16
vpmsumd 26, 8, 17
vpmsumd 27, 5, 18
vxor 24, 24, 25
vxor 24, 24, 26
vxor 24, 24, 27
vxor 24, 24, 29
# sum hash and reduction with H Poly
vsldoi 27, 23, 23, 8 # swap
vpmsumd 23, 23, 2
vxor 27, 27, 24
vxor 23, 23, 27
xxlor 32, 23+32, 23+32 # update hash
.endm
#
# Combine two 4x ghash
# v15 - v22 - input blocks
#
.macro ppc_aes_gcm_ghash2_4x
# first 4x hash
vxor 15, 15, 0 # Xi + X
vpmsumd 23, 12, 15 # H4.L * X.L
vpmsumd 24, 9, 16
vpmsumd 25, 6, 17
vpmsumd 26, 3, 18
vxor 23, 23, 24
vxor 23, 23, 25
vxor 23, 23, 26 # L
vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L
vpmsumd 25, 10, 16 # H3.L * X2.H + H3.H * X2.L
vpmsumd 26, 7, 17
vpmsumd 27, 4, 18
vxor 24, 24, 25
vxor 24, 24, 26
# sum hash and reduction with H Poly
vpmsumd 28, 23, 2 # reduction
vxor 29, 29, 29
vxor 24, 24, 27 # M
vsldoi 26, 24, 29, 8 # mL
vsldoi 29, 29, 24, 8 # mH
vxor 23, 23, 26 # mL + L
vsldoi 23, 23, 23, 8 # swap
vxor 23, 23, 28
vpmsumd 24, 14, 15 # H4.H * X.H
vpmsumd 25, 11, 16
vpmsumd 26, 8, 17
vpmsumd 27, 5, 18
vxor 24, 24, 25
vxor 24, 24, 26
vxor 24, 24, 27 # H
vxor 24, 24, 29 # H + mH
# sum hash and reduction with H Poly
vsldoi 27, 23, 23, 8 # swap
vpmsumd 23, 23, 2
vxor 27, 27, 24
vxor 27, 23, 27 # 1st Xi
# 2nd 4x hash
vpmsumd 24, 9, 20
vpmsumd 25, 6, 21
vpmsumd 26, 3, 22
vxor 19, 19, 27 # Xi + X
vpmsumd 23, 12, 19 # H4.L * X.L
vxor 23, 23, 24
vxor 23, 23, 25
vxor 23, 23, 26 # L
vpmsumd 24, 13, 19 # H4.L * X.H + H4.H * X.L
vpmsumd 25, 10, 20 # H3.L * X1.H + H3.H * X1.L
vpmsumd 26, 7, 21
vpmsumd 27, 4, 22
vxor 24, 24, 25
vxor 24, 24, 26
# sum hash and reduction with H Poly
vpmsumd 28, 23, 2 # reduction
vxor 29, 29, 29
vxor 24, 24, 27 # M
vsldoi 26, 24, 29, 8 # mL
vsldoi 29, 29, 24, 8 # mH
vxor 23, 23, 26 # mL + L
vsldoi 23, 23, 23, 8 # swap
vxor 23, 23, 28
vpmsumd 24, 14, 19 # H4.H * X.H
vpmsumd 25, 11, 20
vpmsumd 26, 8, 21
vpmsumd 27, 5, 22
vxor 24, 24, 25
vxor 24, 24, 26
vxor 24, 24, 27 # H
vxor 24, 24, 29 # H + mH
# sum hash and reduction with H Poly
vsldoi 27, 23, 23, 8 # swap
vpmsumd 23, 23, 2
vxor 27, 27, 24
vxor 23, 23, 27
xxlor 32, 23+32, 23+32 # update hash
.endm
#
# Compute and update a single hash value
#
.macro ppc_update_hash_1x
vxor 28, 28, 0
vxor 19, 19, 19
vpmsumd 22, 3, 28 # L
vpmsumd 23, 4, 28 # M
vpmsumd 24, 5, 28 # H
vpmsumd 27, 22, 2 # reduction
vsldoi 25, 23, 19, 8 # mL
vsldoi 26, 19, 23, 8 # mH
vxor 22, 22, 25 # L + mL
vxor 24, 24, 26 # H + mH
vsldoi 22, 22, 22, 8 # swap
vxor 22, 22, 27
vsldoi 20, 22, 22, 8 # swap
vpmsumd 22, 22, 2 # reduction
vxor 20, 20, 24
vxor 22, 22, 20
vmr 0, 22 # update hash
.endm
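#
# SAVE_REGS/RESTORE_REGS below use a 640-byte stack frame (offsets derived
# from the stores; the layout is this file's own convention):
#   112 - 168(r1)  non-volatile GPRs r14 - r21
#   256 - 432(r1)  non-volatile VRs  v20 - v31
#   464 - 592(r1)  non-volatile VSRs vs14 - vs22
#   656(r1)        saved LR (the caller's ELFv2 LR save slot)
# The area at 192(r1) is scratch space for the partial-block masks further down.
#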
.macro SAVE_REGS
stdu 1,-640(1)
mflr 0
std 14,112(1)
std 15,120(1)
std 16,128(1)
std 17,136(1)
std 18,144(1)
std 19,152(1)
std 20,160(1)
std 21,168(1)
li 9, 256
stvx 20, 9, 1
addi 9, 9, 16
stvx 21, 9, 1
addi 9, 9, 16
stvx 22, 9, 1
addi 9, 9, 16
stvx 23, 9, 1
addi 9, 9, 16
stvx 24, 9, 1
addi 9, 9, 16
stvx 25, 9, 1
addi 9, 9, 16
stvx 26, 9, 1
addi 9, 9, 16
stvx 27, 9, 1
addi 9, 9, 16
stvx 28, 9, 1
addi 9, 9, 16
stvx 29, 9, 1
addi 9, 9, 16
stvx 30, 9, 1
addi 9, 9, 16
stvx 31, 9, 1
stxv 14, 464(1)
stxv 15, 480(1)
stxv 16, 496(1)
stxv 17, 512(1)
stxv 18, 528(1)
stxv 19, 544(1)
stxv 20, 560(1)
stxv 21, 576(1)
stxv 22, 592(1)
std 0, 656(1)
.endm
.macro RESTORE_REGS
lxv 14, 464(1)
lxv 15, 480(1)
lxv 16, 496(1)
lxv 17, 512(1)
lxv 18, 528(1)
lxv 19, 544(1)
lxv 20, 560(1)
lxv 21, 576(1)
lxv 22, 592(1)
li 9, 256
lvx 20, 9, 1
addi 9, 9, 16
lvx 21, 9, 1
addi 9, 9, 16
lvx 22, 9, 1
addi 9, 9, 16
lvx 23, 9, 1
addi 9, 9, 16
lvx 24, 9, 1
addi 9, 9, 16
lvx 25, 9, 1
addi 9, 9, 16
lvx 26, 9, 1
addi 9, 9, 16
lvx 27, 9, 1
addi 9, 9, 16
lvx 28, 9, 1
addi 9, 9, 16
lvx 29, 9, 1
addi 9, 9, 16
lvx 30, 9, 1
addi 9, 9, 16
lvx 31, 9, 1
ld 0, 656(1)
ld 14,112(1)
ld 15,120(1)
ld 16,128(1)
ld 17,136(1)
ld 18,144(1)
ld 19,152(1)
ld 20,160(1)
ld 21,168(1)
mtlr 0
addi 1, 1, 640
.endm
.macro LOAD_HASH_TABLE
# Load Xi
lxvb16x 32, 0, 8 # load Xi
# load Hash - h^4, h^3, h^2, h
li 10, 32
lxvd2x 2+32, 10, 8 # H Poly
li 10, 48
lxvd2x 3+32, 10, 8 # Hl
li 10, 64
lxvd2x 4+32, 10, 8 # H
li 10, 80
lxvd2x 5+32, 10, 8 # Hh
li 10, 96
lxvd2x 6+32, 10, 8 # H^2l
li 10, 112
lxvd2x 7+32, 10, 8 # H^2
li 10, 128
lxvd2x 8+32, 10, 8 # H^2h
li 10, 144
lxvd2x 9+32, 10, 8 # H^3l
li 10, 160
lxvd2x 10+32, 10, 8 # H^3
li 10, 176
lxvd2x 11+32, 10, 8 # H^3h
li 10, 192
lxvd2x 12+32, 10, 8 # H^4l
li 10, 208
lxvd2x 13+32, 10, 8 # H^4
li 10, 224
lxvd2x 14+32, 10, 8 # H^4h
.endm
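#
# Byte offsets into the table at r8, as consumed by LOAD_HASH_TABLE above
# (the caller is assumed to have precomputed and stored these):
#     0  Xi            32  H Poly
#    48  H.l      64  H       80  H.h
#    96  H^2.l   112  H^2    128  H^2.h
#   144  H^3.l   160  H^3    176  H^3.h
#   192  H^4.l   208  H^4    224  H^4.h
#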
#
# aes_p10_gcm_encrypt (const void *inp, void *out, size_t len,
# const char *rk, unsigned char iv[16], void *Xip);
#
# r3 - inp
# r4 - out
# r5 - len
# r6 - AES round keys
# r7 - iv and other data
# r8 - Xi, HPoly, hash keys
#
# rounds is at offset 240 in rk
# Xi is at 0 in gcm_table (Xip).
#
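# A minimal C-level caller sketch, assuming the prototype above (the real
# setup of the round keys and gcm_table lives in the glue code; the buffer
# names here are illustrative):
#
#   unsigned char iv[16];   /* IV/ICB, consumed at the entry point below   */
#   /* rk:  expanded AES round keys, round count stored at rk + 240        */
#   /* Xip: gcm_table with Xi at offset 0, laid out as described above     */
#   aes_p10_gcm_encrypt(src, dst, nbytes, rk, iv, Xip);
#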
_GLOBAL(aes_p10_gcm_encrypt)
.align 5
SAVE_REGS
LOAD_HASH_TABLE
# initialize ICB: GHASH( IV ), IV - r7
lxvb16x 30+32, 0, 7 # load IV - v30
mr 12, 5 # length
li 11, 0 # block index
# counter 1
vxor 31, 31, 31
vspltisb 22, 1
vsldoi 31, 31, 22, 1 # counter 1
# load round key to VSR
lxv 0, 0(6)
lxv 1, 0x10(6)
lxv 2, 0x20(6)
lxv 3, 0x30(6)
lxv 4, 0x40(6)
lxv 5, 0x50(6)
lxv 6, 0x60(6)
lxv 7, 0x70(6)
lxv 8, 0x80(6)
lxv 9, 0x90(6)
lxv 10, 0xa0(6)
# load rounds - 10 (128), 12 (192), 14 (256)
lwz 9,240(6)
#
# vxor state, state, w # addroundkey
xxlor 32+29, 0, 0
vxor 15, 30, 29 # IV + round key - add round key 0
cmpdi 9, 10
beq Loop_aes_gcm_8x
# load 2 more round keys (vs11, vs12)
lxv 11, 0xb0(6)
lxv 12, 0xc0(6)
cmpdi 9, 12
beq Loop_aes_gcm_8x
# load 2 more round keys (vs13, vs14)
lxv 13, 0xd0(6)
lxv 14, 0xe0(6)
cmpdi 9, 14
beq Loop_aes_gcm_8x
b aes_gcm_out
.align 5
Loop_aes_gcm_8x:
mr 14, 3
mr 9, 4
#
# check partial block
#
Continue_partial_check:
ld 15, 56(7)
cmpdi 15, 0
beq Continue
bgt Final_block
cmpdi 15, 16
blt Final_block
Continue:
# n blocks
li 10, 128
divdu 10, 12, 10 # n 128-byte blocks
cmpdi 10, 0
beq Loop_last_block
vaddudm 30, 30, 31 # IV + counter
vxor 16, 30, 29
vaddudm 30, 30, 31
vxor 17, 30, 29
vaddudm 30, 30, 31
vxor 18, 30, 29
vaddudm 30, 30, 31
vxor 19, 30, 29
vaddudm 30, 30, 31
vxor 20, 30, 29
vaddudm 30, 30, 31
vxor 21, 30, 29
vaddudm 30, 30, 31
vxor 22, 30, 29
mtctr 10
li 15, 16
li 16, 32
li 17, 48
li 18, 64
li 19, 80
li 20, 96
li 21, 112
lwz 10, 240(6)
Loop_8x_block:
lxvb16x 15, 0, 14 # load block
lxvb16x 16, 15, 14 # load block
lxvb16x 17, 16, 14 # load block
lxvb16x 18, 17, 14 # load block
lxvb16x 19, 18, 14 # load block
lxvb16x 20, 19, 14 # load block
lxvb16x 21, 20, 14 # load block
lxvb16x 22, 21, 14 # load block
addi 14, 14, 128
Loop_aes_middle8x
xxlor 23+32, 10, 10
cmpdi 10, 10
beq Do_next_ghash
# 192 bits
xxlor 24+32, 11, 11
vcipher 15, 15, 23
vcipher 16, 16, 23
vcipher 17, 17, 23
vcipher 18, 18, 23
vcipher 19, 19, 23
vcipher 20, 20, 23
vcipher 21, 21, 23
vcipher 22, 22, 23
vcipher 15, 15, 24
vcipher 16, 16, 24
vcipher 17, 17, 24
vcipher 18, 18, 24
vcipher 19, 19, 24
vcipher 20, 20, 24
vcipher 21, 21, 24
vcipher 22, 22, 24
xxlor 23+32, 12, 12
cmpdi 10, 12
beq Do_next_ghash
# 256 bits
xxlor 24+32, 13, 13
vcipher 15, 15, 23
vcipher 16, 16, 23
vcipher 17, 17, 23
vcipher 18, 18, 23
vcipher 19, 19, 23
vcipher 20, 20, 23
vcipher 21, 21, 23
vcipher 22, 22, 23
vcipher 15, 15, 24
vcipher 16, 16, 24
vcipher 17, 17, 24
vcipher 18, 18, 24
vcipher 19, 19, 24
vcipher 20, 20, 24
vcipher 21, 21, 24
vcipher 22, 22, 24
xxlor 23+32, 14, 14
cmpdi 10, 14
beq Do_next_ghash
b aes_gcm_out
Do_next_ghash:
#
# last round
vcipherlast 15, 15, 23
vcipherlast 16, 16, 23
xxlxor 47, 47, 15
stxvb16x 47, 0, 9 # store output
xxlxor 48, 48, 16
stxvb16x 48, 15, 9 # store output
vcipherlast 17, 17, 23
vcipherlast 18, 18, 23
xxlxor 49, 49, 17
stxvb16x 49, 16, 9 # store output
xxlxor 50, 50, 18
stxvb16x 50, 17, 9 # store output
vcipherlast 19, 19, 23
vcipherlast 20, 20, 23
xxlxor 51, 51, 19
stxvb16x 51, 18, 9 # store output
xxlxor 52, 52, 20
stxvb16x 52, 19, 9 # store output
vcipherlast 21, 21, 23
vcipherlast 22, 22, 23
xxlxor 53, 53, 21
stxvb16x 53, 20, 9 # store output
xxlxor 54, 54, 22
stxvb16x 54, 21, 9 # store output
addi 9, 9, 128
# ghash here
ppc_aes_gcm_ghash2_4x
xxlor 27+32, 0, 0
vaddudm 30, 30, 31 # IV + counter
vmr 29, 30
vxor 15, 30, 27 # add round key
vaddudm 30, 30, 31
vxor 16, 30, 27
vaddudm 30, 30, 31
vxor 17, 30, 27
vaddudm 30, 30, 31
vxor 18, 30, 27
vaddudm 30, 30, 31
vxor 19, 30, 27
vaddudm 30, 30, 31
vxor 20, 30, 27
vaddudm 30, 30, 31
vxor 21, 30, 27
vaddudm 30, 30, 31
vxor 22, 30, 27
addi 12, 12, -128
addi 11, 11, 128
bdnz Loop_8x_block
vmr 30, 29
stxvb16x 30+32, 0, 7 # update IV
Loop_last_block:
cmpdi 12, 0
beq aes_gcm_out
# loop last few blocks
li 10, 16
divdu 10, 12, 10
mtctr 10
lwz 10, 240(6)
cmpdi 12, 16
blt Final_block
Next_rem_block:
lxvb16x 15, 0, 14 # load block
Loop_aes_middle_1x
xxlor 23+32, 10, 10
cmpdi 10, 10
beq Do_next_1x
# 192 bits
xxlor 24+32, 11, 11
vcipher 15, 15, 23
vcipher 15, 15, 24
xxlor 23+32, 12, 12
cmpdi 10, 12
beq Do_next_1x
# 256 bits
xxlor 24+32, 13, 13
vcipher 15, 15, 23
vcipher 15, 15, 24
xxlor 23+32, 14, 14
cmpdi 10, 14
beq Do_next_1x
Do_next_1x:
vcipherlast 15, 15, 23
xxlxor 47, 47, 15
stxvb16x 47, 0, 9 # store output
addi 14, 14, 16
addi 9, 9, 16
vmr 28, 15
ppc_update_hash_1x
addi 12, 12, -16
addi 11, 11, 16
xxlor 19+32, 0, 0
vaddudm 30, 30, 31 # IV + counter
vxor 15, 30, 19 # add round key
bdnz Next_rem_block
li 15, 0
std 15, 56(7) # clear partial?
stxvb16x 30+32, 0, 7 # update IV
cmpdi 12, 0
beq aes_gcm_out
Final_block:
lwz 10, 240(6)
Loop_aes_middle_1x
xxlor 23+32, 10, 10
cmpdi 10, 10
beq Do_final_1x
# 192 bits
xxlor 24+32, 11, 11
vcipher 15, 15, 23
vcipher 15, 15, 24
xxlor 23+32, 12, 12
cmpdi 10, 12
beq Do_final_1x
# 256 bits
xxlor 24+32, 13, 13
vcipher 15, 15, 23
vcipher 15, 15, 24
xxlor 23+32, 14, 14
cmpdi 10, 14
beq Do_final_1x
Do_final_1x:
vcipherlast 15, 15, 23
# check partial block
li 21, 0 # encrypt
ld 15, 56(7) # partial?
cmpdi 15, 0
beq Normal_block
bl Do_partial_block
cmpdi 12, 0
ble aes_gcm_out
b Continue_partial_check
Normal_block:
lxvb16x 15, 0, 14 # load last block
xxlxor 47, 47, 15
# create partial block mask
li 15, 16
sub 15, 15, 12 # index to the mask
vspltisb 16, -1 # first 16 bytes - 0xffff...ff
vspltisb 17, 0 # second 16 bytes - 0x0000...00
li 10, 192
stvx 16, 10, 1
addi 10, 10, 16
stvx 17, 10, 1
addi 10, 1, 192
lxvb16x 16, 15, 10 # load partial block mask
xxland 47, 47, 16
vmr 28, 15
ppc_update_hash_1x
# Note: should store only the remaining bytes.
bl Write_partial_block
stxvb16x 30+32, 0, 7 # update IV
std 12, 56(7) # update partial?
li 16, 16
stxvb16x 32, 0, 8 # write out Xi
stxvb16x 32, 16, 8 # write out Xi
b aes_gcm_out
#
# Compute data mask
#
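# The macro below fills the 192(r1) scratch area so that the loaded mask has
# _start leading zero bytes followed by 0xff bytes (and zeros again past
# _start + _end).  For example, _start = 3 and _end = 5 yields
# 00 00 00 ff ff ff ff ff followed by eight zero bytes.
#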
.macro GEN_MASK _mask _start _end
vspltisb 16, -1 # first 16 bytes - 0xffff...ff
vspltisb 17, 0 # second 16 bytes - 0x0000...00
li 10, 192
stxvb16x 17+32, 10, 1
add 10, 10, \_start
stxvb16x 16+32, 10, 1
add 10, 10, \_end
stxvb16x 17+32, 10, 1
addi 10, 1, 192
lxvb16x \_mask, 0, 10 # load partial block mask
.endm
#
# Handle multiple partial blocks for encrypt and decrypt
# operations.
#
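# On entry r15 holds the byte count already buffered in the partial block
# (loaded from 56(r7)) and r21 selects encrypt (0) or decrypt (1), as set by
# the callers.  The new bytes are masked in with GEN_MASK, folded into Xi via
# ppc_update_hash_1x, copied out a byte at a time by Write_last_partial, and
# the partial count at 56(r7) is updated, or cleared once a full 16-byte
# block has been consumed.
#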
SYM_FUNC_START_LOCAL(Do_partial_block)
add 17, 15, 5
cmpdi 17, 16
bgt Big_block
GEN_MASK 18, 15, 5
b _Partial
SYM_FUNC_END(Do_partial_block)
Big_block:
li 16, 16
GEN_MASK 18, 15, 16
_Partial:
lxvb16x 17+32, 0, 14 # load last block
sldi 16, 15, 3
mtvsrdd 32+16, 0, 16
vsro 17, 17, 16
xxlxor 47, 47, 17+32
xxland 47, 47, 18
vxor 0, 0, 0 # clear Xi
vmr 28, 15
cmpdi 21, 0 # encrypt/decrypt ops?
beq Skip_decrypt
xxland 32+28, 32+17, 18
Skip_decrypt:
ppc_update_hash_1x
li 16, 16
lxvb16x 32+29, 16, 8
vxor 0, 0, 29
stxvb16x 32, 0, 8 # save Xi
stxvb16x 32, 16, 8 # save Xi
# store partial block
# loop the rest of the stream if any
sldi 16, 15, 3
mtvsrdd 32+16, 0, 16
vslo 15, 15, 16
#stxvb16x 15+32, 0, 9 # last block
li 16, 16
sub 17, 16, 15 # 16 - partial
add 16, 15, 5
cmpdi 16, 16
bgt Larger_16
mr 17, 5
Larger_16:
# write partial
li 10, 192
stxvb16x 15+32, 10, 1 # save current block
addi 10, 9, -1
addi 16, 1, 191
mtctr 17 # move partial byte count
Write_last_partial:
lbzu 18, 1(16)
stbu 18, 1(10)
bdnz Write_last_partial
# Complete loop partial
add 14, 14, 17
add 9, 9, 17
sub 12, 12, 17
add 11, 11, 17
add 15, 15, 5
cmpdi 15, 16
blt Save_partial
vaddudm 30, 30, 31
stxvb16x 30+32, 0, 7 # update IV
xxlor 32+29, 0, 0
vxor 15, 30, 29 # IV + round key - add round key 0
li 15, 0
std 15, 56(7) # partial done - clear
b Partial_done
Save_partial:
std 15, 56(7) # partial
Partial_done:
blr
#
# Write partial block
# r9 - output
# r12 - remaining bytes
# v15 - partial input data
#
SYM_FUNC_START_LOCAL(Write_partial_block)
li 10, 192
stxvb16x 15+32, 10, 1 # last block
addi 10, 9, -1
addi 16, 1, 191
mtctr 12 # remaining bytes
li 15, 0
Write_last_byte:
lbzu 14, 1(16)
stbu 14, 1(10)
bdnz Write_last_byte
blr
SYM_FUNC_END(Write_partial_block)
aes_gcm_out:
# out = state
stxvb16x 32, 0, 8 # write out Xi
add 3, 11, 12 # return count
RESTORE_REGS
blr
#
# 8x Decrypt
#
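# Register usage mirrors aes_p10_gcm_encrypt above:
# r3 - inp, r4 - out, r5 - len, r6 - AES round keys,
# r7 - iv and other data, r8 - Xi, HPoly, hash keys
#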
_GLOBAL(aes_p10_gcm_decrypt)
.align 5
SAVE_REGS
LOAD_HASH_TABLE
# initialize ICB: GHASH( IV ), IV - r7
lxvb16x 30+32, 0, 7 # load IV - v30
mr 12, 5 # length
li 11, 0 # block index
# counter 1
vxor 31, 31, 31
vspltisb 22, 1
vsldoi 31, 31, 22, 1 # counter 1
# load round key to VSR
lxv 0, 0(6)
lxv 1, 0x10(6)
lxv 2, 0x20(6)
lxv 3, 0x30(6)
lxv 4, 0x40(6)
lxv 5, 0x50(6)
lxv 6, 0x60(6)
lxv 7, 0x70(6)
lxv 8, 0x80(6)
lxv 9, 0x90(6)
lxv 10, 0xa0(6)
# load rounds - 10 (128), 12 (192), 14 (256)
lwz 9,240(6)
#
# vxor state, state, w # addroundkey
xxlor 32+29, 0, 0
vxor 15, 30, 29 # IV + round key - add round key 0
cmpdi 9, 10
beq Loop_aes_gcm_8x_dec
# load 2 more round keys (vs11, vs12)
lxv 11, 0xb0(6)
lxv 12, 0xc0(6)
cmpdi 9, 12
beq Loop_aes_gcm_8x_dec
# load 2 more round keys (vs13, vs14)
lxv 13, 0xd0(6)
lxv 14, 0xe0(6)
cmpdi 9, 14
beq Loop_aes_gcm_8x_dec
b aes_gcm_out
.align 5
Loop_aes_gcm_8x_dec:
mr 14, 3
mr 9, 4
#
# check partial block
#
Continue_partial_check_dec:
ld 15, 56(7)
cmpdi 15, 0
beq Continue_dec
bgt Final_block_dec
cmpdi 15, 16
blt Final_block_dec
Continue_dec:
# n blocks
li 10, 128
divdu 10, 12, 10 # n 128-byte blocks
cmpdi 10, 0
beq Loop_last_block_dec
vaddudm 30, 30, 31 # IV + counter
vxor 16, 30, 29
vaddudm 30, 30, 31
vxor 17, 30, 29
vaddudm 30, 30, 31
vxor 18, 30, 29
vaddudm 30, 30, 31
vxor 19, 30, 29
vaddudm 30, 30, 31
vxor 20, 30, 29
vaddudm 30, 30, 31
vxor 21, 30, 29
vaddudm 30, 30, 31
vxor 22, 30, 29
mtctr 10
li 15, 16
li 16, 32
li 17, 48
li 18, 64
li 19, 80
li 20, 96
li 21, 112
lwz 10, 240(6)
Loop_8x_block_dec:
lxvb16x 15, 0, 14 # load block
lxvb16x 16, 15, 14 # load block
lxvb16x 17, 16, 14 # load block
lxvb16x 18, 17, 14 # load block
lxvb16x 19, 18, 14 # load block
lxvb16x 20, 19, 14 # load block
lxvb16x 21, 20, 14 # load block
lxvb16x 22, 21, 14 # load block
addi 14, 14, 128
Loop_aes_middle8x
xxlor 23+32, 10, 10
cmpdi 10, 10
beq Do_next_ghash_dec
# 192 bits
xxlor 24+32, 11, 11
vcipher 15, 15, 23
vcipher 16, 16, 23
vcipher 17, 17, 23
vcipher 18, 18, 23
vcipher 19, 19, 23
vcipher 20, 20, 23
vcipher 21, 21, 23
vcipher 22, 22, 23
vcipher 15, 15, 24
vcipher 16, 16, 24
vcipher 17, 17, 24
vcipher 18, 18, 24
vcipher 19, 19, 24
vcipher 20, 20, 24
vcipher 21, 21, 24
vcipher 22, 22, 24
xxlor 23+32, 12, 12
cmpdi 10, 12
beq Do_next_ghash_dec
# 256 bits
xxlor 24+32, 13, 13
vcipher 15, 15, 23
vcipher 16, 16, 23
vcipher 17, 17, 23
vcipher 18, 18, 23
vcipher 19, 19, 23
vcipher 20, 20, 23
vcipher 21, 21, 23
vcipher 22, 22, 23
vcipher 15, 15, 24
vcipher 16, 16, 24
vcipher 17, 17, 24
vcipher 18, 18, 24
vcipher 19, 19, 24
vcipher 20, 20, 24
vcipher 21, 21, 24
vcipher 22, 22, 24
xxlor 23+32, 14, 14
cmpdi 10, 14
beq Do_next_ghash_dec
b aes_gcm_out
Do_next_ghash_dec:
#
# last round
vcipherlast 15, 15, 23
vcipherlast 16, 16, 23
xxlxor 47, 47, 15
stxvb16x 47, 0, 9 # store output
xxlxor 48, 48, 16
stxvb16x 48, 15, 9 # store output
vcipherlast 17, 17, 23
vcipherlast 18, 18, 23
xxlxor 49, 49, 17
stxvb16x 49, 16, 9 # store output
xxlxor 50, 50, 18
stxvb16x 50, 17, 9 # store output
vcipherlast 19, 19, 23
vcipherlast 20, 20, 23
xxlxor 51, 51, 19
stxvb16x 51, 18, 9 # store output
xxlxor 52, 52, 20
stxvb16x 52, 19, 9 # store output
vcipherlast 21, 21, 23
vcipherlast 22, 22, 23
xxlxor 53, 53, 21
stxvb16x 53, 20, 9 # store output
xxlxor 54, 54, 22
stxvb16x 54, 21, 9 # store output
addi 9, 9, 128
xxlor 15+32, 15, 15
xxlor 16+32, 16, 16
xxlor 17+32, 17, 17
xxlor 18+32, 18, 18
xxlor 19+32, 19, 19
xxlor 20+32, 20, 20
xxlor 21+32, 21, 21
xxlor 22+32, 22, 22
# ghash here
ppc_aes_gcm_ghash2_4x
xxlor 27+32, 0, 0
vaddudm 30, 30, 31 # IV + counter
vmr 29, 30
vxor 15, 30, 27 # add round key
vaddudm 30, 30, 31
vxor 16, 30, 27
vaddudm 30, 30, 31
vxor 17, 30, 27
vaddudm 30, 30, 31
vxor 18, 30, 27
vaddudm 30, 30, 31
vxor 19, 30, 27
vaddudm 30, 30, 31
vxor 20, 30, 27
vaddudm 30, 30, 31
vxor 21, 30, 27
vaddudm 30, 30, 31
vxor 22, 30, 27
addi 12, 12, -128
addi 11, 11, 128
bdnz Loop_8x_block_dec
vmr 30, 29
stxvb16x 30+32, 0, 7 # update IV
Loop_last_block_dec:
cmpdi 12, 0
beq aes_gcm_out
# loop last few blocks
li 10, 16
divdu 10, 12, 10
mtctr 10
lwz 10, 240(6)
cmpdi 12, 16
blt Final_block_dec
Next_rem_block_dec:
lxvb16x 15, 0, 14 # load block
Loop_aes_middle_1x
xxlor 23+32, 10, 10
cmpdi 10, 10
beq Do_next_1x_dec
# 192 bits
xxlor 24+32, 11, 11
vcipher 15, 15, 23
vcipher 15, 15, 24
xxlor 23+32, 12, 12
cmpdi 10, 12
beq Do_next_1x_dec
# 256 bits
xxlor 24+32, 13, 13
vcipher 15, 15, 23
vcipher 15, 15, 24
xxlor 23+32, 14, 14
cmpdi 10, 14
beq Do_next_1x_dec
Do_next_1x_dec:
vcipherlast 15, 15, 23
xxlxor 47, 47, 15
stxvb16x 47, 0, 9 # store output
addi 14, 14, 16
addi 9, 9, 16
xxlor 28+32, 15, 15
#vmr 28, 15
ppc_update_hash_1x
addi 12, 12, -16
addi 11, 11, 16
xxlor 19+32, 0, 0
vaddudm 30, 30, 31 # IV + counter
vxor 15, 30, 19 # add round key
bdnz Next_rem_block_dec
li 15, 0
std 15, 56(7) # clear partial?
stxvb16x 30+32, 0, 7 # update IV
cmpdi 12, 0
beq aes_gcm_out
Final_block_dec:
lwz 10, 240(6)
Loop_aes_middle_1x
xxlor 23+32, 10, 10
cmpdi 10, 10
beq Do_final_1x_dec
# 192 bits
xxlor 24+32, 11, 11
vcipher 15, 15, 23
vcipher 15, 15, 24
xxlor 23+32, 12, 12
cmpdi 10, 12
beq Do_final_1x_dec
# 256 bits
xxlor 24+32, 13, 13
vcipher 15, 15, 23
vcipher 15, 15, 24
xxlor 23+32, 14, 14
cmpdi 10, 14
beq Do_final_1x_dec
Do_final_1x_dec:
vcipherlast 15, 15, 23
# check partial block
li 21, 1 # decrypt
ld 15, 56(7) # partial?
cmpdi 15, 0
beq Normal_block_dec
bl Do_partial_block
cmpdi 12, 0
ble aes_gcm_out
b Continue_partial_check_dec
Normal_block_dec:
lxvb16x 15, 0, 14 # load last block
xxlxor 47, 47, 15
# create partial block mask
li 15, 16
sub 15, 15, 12 # index to the mask
vspltisb 16, -1 # first 16 bytes - 0xffff...ff
vspltisb 17, 0 # second 16 bytes - 0x0000...00
li 10, 192
stvx 16, 10, 1
addi 10, 10, 16
stvx 17, 10, 1
addi 10, 1, 192
lxvb16x 16, 15, 10 # load partial block mask
xxland 47, 47, 16
xxland 32+28, 15, 16
#vmr 28, 15
ppc_update_hash_1x
# Note: should store only the remaining bytes.
bl Write_partial_block
stxvb16x 30+32, 0, 7 # update IV
std 12, 56(7) # update partial?
li 16, 16
stxvb16x 32, 0, 8 # write out Xi
stxvb16x 32, 16, 8 # write out Xi
b aes_gcm_out