6499e8cfaa
This trims off a couple of instructions of the total size of the core AES transform by reordering the final branch in the AES-192 code path with the rounds that are performed regardless of whether the branch is taken or not. Other than the slight size reduction, this has no performance benefit. Fix up a comment regarding the prototype of this function while we're at it. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
518 lines
11 KiB
ArmAsm
518 lines
11 KiB
ArmAsm
/*
|
|
* aes-ce-core.S - AES in CBC/CTR/XTS mode using ARMv8 Crypto Extensions
|
|
*
|
|
* Copyright (C) 2015 Linaro Ltd <ard.biesheuvel@linaro.org>
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License version 2 as
|
|
* published by the Free Software Foundation.
|
|
*/
|
|
|
|
#include <linux/linkage.h>
|
|
#include <asm/assembler.h>
|
|
|
|
.text
|
|
.fpu crypto-neon-fp-armv8
|
|
.align 3
|
|
|
|
.macro		enc_round, state, key
	@ One regular encryption round on the block in state: aese with
	@ the given round key, immediately followed by aesmc.
	aese.8		\state, \key
	aesmc.8		\state, \state
.endm
|
|
|
|
.macro		dec_round, state, key
	@ One regular decryption round on the block in state: aesd with
	@ the given round key, immediately followed by aesimc.
	aesd.8		\state, \key
	aesimc.8	\state, \state
.endm
|
|
|
|
.macro		enc_dround, key1, key2
	@ Two consecutive encryption rounds on the single block in q0,
	@ using key1 first and key2 second.
	enc_round	q0, \key1
	enc_round	q0, \key2
.endm
|
|
|
|
.macro		dec_dround, key1, key2
	@ Two consecutive decryption rounds on the single block in q0,
	@ using key1 first and key2 second.
	dec_round	q0, \key1
	dec_round	q0, \key2
.endm
|
|
|
|
.macro		enc_fround, key1, key2, key3
	@ Tail of the encryption of q0: one regular round with key1,
	@ then the final round, which is aese with key2 without a
	@ following aesmc, finished by an xor with key3.
	enc_round	q0, \key1
	aese.8		q0, \key2
	veor		q0, q0, \key3
.endm
|
|
|
|
.macro		dec_fround, key1, key2, key3
	@ Tail of the decryption of q0: one regular round with key1,
	@ then the final round, which is aesd with key2 without a
	@ following aesimc, finished by an xor with key3.
	dec_round	q0, \key1
	aesd.8		q0, \key2
	veor		q0, q0, \key3
.endm
|
|
|
|
.macro		enc_dround_3x, key1, key2
	@ Two encryption rounds on each of q0, q1 and q2. All three
	@ blocks complete the round with key1 before any of them starts
	@ the round with key2, so the expansion is:
	@   key1: q0, q1, q2   then   key2: q0, q1, q2
	.irp		rk, \key1, \key2
	.irp		blk, q0, q1, q2
	enc_round	\blk, \rk
	.endr
	.endr
.endm
|
|
|
|
.macro		dec_dround_3x, key1, key2
	@ Two decryption rounds on each of q0, q1 and q2. All three
	@ blocks complete the round with key1 before any of them starts
	@ the round with key2, so the expansion is:
	@   key1: q0, q1, q2   then   key2: q0, q1, q2
	.irp		rk, \key1, \key2
	.irp		blk, q0, q1, q2
	dec_round	\blk, \rk
	.endr
	.endr
.endm
|
|
|
|
.macro		enc_fround_3x, key1, key2, key3
	@ Tail of the encryption of q0, q1 and q2. Each of the three
	@ steps is applied to all three blocks before the next step
	@ starts.

	@ regular round with key1
	.irp		blk, q0, q1, q2
	enc_round	\blk, \key1
	.endr

	@ final round: aese with key2, no aesmc
	.irp		blk, q0, q1, q2
	aese.8		\blk, \key2
	.endr

	@ xor with key3
	.irp		blk, q0, q1, q2
	veor		\blk, \blk, \key3
	.endr
.endm
|
|
|
|
.macro		dec_fround_3x, key1, key2, key3
	@ Tail of the decryption of q0, q1 and q2. Each of the three
	@ steps is applied to all three blocks before the next step
	@ starts.

	@ regular round with key1
	.irp		blk, q0, q1, q2
	dec_round	\blk, \key1
	.endr

	@ final round: aesd with key2, no aesimc
	.irp		blk, q0, q1, q2
	aesd.8		\blk, \key2
	.endr

	@ xor with key3
	.irp		blk, q0, q1, q2
	veor		\blk, \blk, \key3
	.endr
.endm
|
|
|
|
.macro		do_block, dround, fround
	@ Body shared by the four core transforms. dround and fround
	@ name the double-round and final-round macros to expand.
	@
	@ On entry:
	@   ip      address of the 3rd round key
	@   r3      number of rounds, compared against 12 below
	@   q8, q9  first two round keys
	@   q14     last round key
	@
	@ The round keys in between are streamed through q10 - q13 (which
	@ are overwritten), each pair being loaded one step ahead of the
	@ double round that consumes it. ip is advanced by the loads.
	@
	@ The flags set by the cmp are only consumed by the blo/beq
	@ further down; the NEON loads and AES instructions in between
	@ leave them alone.
	cmp		r3, #12			@ which key size?

	vld1.8		{q10-q11}, [ip]!	@ round keys 2, 3
	\dround		q8, q9
	vld1.8		{q12-q13}, [ip]!	@ round keys 4, 5
	\dround		q10, q11
	vld1.8		{q10-q11}, [ip]!	@ round keys 6, 7
	\dround		q12, q13
	vld1.8		{q12-q13}, [ip]!	@ round keys 8, 9
	\dround		q10, q11
	blo		0f			@ AES-128: 10 rounds

	vld1.8		{q10-q11}, [ip]!	@ round keys 10, 11
	\dround		q12, q13
	beq		1f			@ AES-192: 12 rounds

	vld1.8		{q12-q13}, [ip]		@ round keys 12, 13
	\dround		q10, q11

	@ AES-128 and the 14 round path both finish from q12/q13
0:	\fround		q12, q13, q14
	bx		lr

	@ AES-192 finishes from q10/q11
1:	\fround		q10, q11, q14
	bx		lr
.endm
|
|
|
|
/*
 * Internal, non-AAPCS compliant functions that implement the core AES
 * transforms. These should preserve all registers except q0 - q2,
 * q10 - q13 (used to stream in the remaining round keys) and ip.
 * Arguments:
 *   q0        : first in/output block
 *   q1        : second in/output block (_3x version only)
 *   q2        : third in/output block (_3x version only)
 *   q8        : first round key
 *   q9        : second round key
 *   q14       : final round key
 *   r2        : address of round key array
 *   r3        : number of rounds
 */
|
|
.align		6
aes_encrypt:
	@ Encrypt the single block in q0. Round keys are expected in
	@ q8, q9 and q14, with r2 = round key array and r3 = rounds.
	add		ip, r2, #32		@ ip = address of 3rd round key
.Laes_encrypt_tweak:
	@ ce_aes_xts_init branches to the label above with ip already
	@ pointing into the round keys of its second key.
	do_block	enc_dround, enc_fround
ENDPROC(aes_encrypt)
|
|
|
|
.align		6
aes_decrypt:
	@ Decrypt the single block in q0. Round keys are expected in
	@ q8, q9 and q14, with r2 = round key array and r3 = rounds.
	add		ip, r2, #32		@ ip = address of 3rd round key
	do_block	dec_dround, dec_fround
ENDPROC(aes_decrypt)
|
|
|
|
.align		6
aes_encrypt_3x:
	@ Encrypt the three blocks in q0, q1 and q2 in parallel. Round
	@ keys are expected in q8, q9 and q14, with r2 = round key array
	@ and r3 = rounds.
	add		ip, r2, #32		@ ip = address of 3rd round key
	do_block	enc_dround_3x, enc_fround_3x
ENDPROC(aes_encrypt_3x)
|
|
|
|
.align		6
aes_decrypt_3x:
	@ Decrypt the three blocks in q0, q1 and q2 in parallel. Round
	@ keys are expected in q8, q9 and q14, with r2 = round key array
	@ and r3 = rounds.
	add		ip, r2, #32		@ ip = address of 3rd round key
	do_block	dec_dround_3x, dec_fround_3x
ENDPROC(aes_decrypt_3x)
|
|
|
|
.macro		prepare_key, rk, rounds
	@ Set up the round key registers the core transforms expect:
	@   q8, q9 = first two round keys, read from rk
	@   q14    = round key at rk + 16 * rounds, i.e. the last one
	@ Clobbers ip.
	add		ip, \rk, \rounds, lsl #4
	vld1.8		{q8-q9}, [\rk]		@ load first 2 round keys
	vld1.8		{q14}, [ip]		@ load last round key
.endm
|
|
|
|
/*
 * ce_aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		      int blocks)
 * ce_aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		      int blocks)
 */
|
|
ENTRY(ce_aes_ecb_encrypt)
	@ r0 = out, r1 = in, r2 = rk, r3 = rounds, [sp] = blocks
	push		{r4, lr}
	ldr		r4, [sp, #8]		@ r4 = blocks (above the 2 saved regs)
	prepare_key	r2, r3

	@ bulk path: three blocks per iteration
.Lecbencloop3x:
	subs		r4, r4, #3
	bmi		.Lecbenc1x		@ fewer than 3 blocks left
	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 pt blocks
	vld1.8		{q2}, [r1, :64]!
	bl		aes_encrypt_3x
	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 ct blocks
	vst1.8		{q2}, [r0, :64]!
	b		.Lecbencloop3x

	@ tail path: the remaining 0, 1 or 2 blocks, one at a time
.Lecbenc1x:
	adds		r4, r4, #3		@ undo the last subtraction
	beq		.Lecbencout		@ nothing left
.Lecbencloop:
	vld1.8		{q0}, [r1, :64]!
	bl		aes_encrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lecbencloop

.Lecbencout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_encrypt)
|
|
|
|
ENTRY(ce_aes_ecb_decrypt)
	@ r0 = out, r1 = in, r2 = rk, r3 = rounds, [sp] = blocks
	push		{r4, lr}
	ldr		r4, [sp, #8]		@ r4 = blocks (above the 2 saved regs)
	prepare_key	r2, r3

	@ bulk path: three blocks per iteration
.Lecbdecloop3x:
	subs		r4, r4, #3
	bmi		.Lecbdec1x		@ fewer than 3 blocks left
	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 ct blocks
	vld1.8		{q2}, [r1, :64]!
	bl		aes_decrypt_3x
	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 pt blocks
	vst1.8		{q2}, [r0, :64]!
	b		.Lecbdecloop3x

	@ tail path: the remaining 0, 1 or 2 blocks, one at a time
.Lecbdec1x:
	adds		r4, r4, #3		@ undo the last subtraction
	beq		.Lecbdecout		@ nothing left
.Lecbdecloop:
	vld1.8		{q0}, [r1, :64]!
	bl		aes_decrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lecbdecloop

.Lecbdecout:
	pop		{r4, pc}
ENDPROC(ce_aes_ecb_decrypt)
|
|
|
|
/*
 * ce_aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		      int blocks, u8 iv[])
 * ce_aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		      int blocks, u8 iv[])
 */
|
|
ENTRY(ce_aes_cbc_encrypt)
	@ r0 = out, r1 = in, r2 = rk, r3 = rounds,
	@ [sp] = blocks, [sp, #4] = iv
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = iv

	@ The loop below tests its counter at the bottom, so a block
	@ count of zero has to be filtered out up front. In that case
	@ no data is touched and the iv is left as it was.
	teq		r4, #0
	beq		.Lcbcencout

	vld1.8		{q0}, [r5]		@ q0 = iv
	prepare_key	r2, r3
.Lcbcencloop:
	vld1.8		{q1}, [r1, :64]!	@ get next pt block
	veor		q0, q0, q1		@ ..and xor with iv
	bl		aes_encrypt
	vst1.8		{q0}, [r0, :64]!	@ ct block is also the next iv
	subs		r4, r4, #1
	bne		.Lcbcencloop
	vst1.8		{q0}, [r5]		@ return last ct block as iv
.Lcbcencout:
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_encrypt)
|
|
|
|
ENTRY(ce_aes_cbc_decrypt)
	@ r0 = out, r1 = in, r2 = rk, r3 = rounds,
	@ [sp] = blocks, [sp, #4] = iv
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = iv
	vld1.8		{q6}, [r5]		@ keep iv in q6
	prepare_key	r2, r3

	@ bulk path: three blocks per iteration
.Lcbcdecloop3x:
	subs		r4, r4, #3
	bmi		.Lcbcdec1x		@ fewer than 3 blocks left
	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 ct blocks
	vld1.8		{q2}, [r1, :64]!
	vmov		q3, q0			@ keep copies of the ct blocks,
	vmov		q4, q1			@ each one is the chaining value
	vmov		q5, q2			@ of the block that follows it
	bl		aes_decrypt_3x
	veor		q0, q0, q6		@ block 0 ^ iv
	veor		q1, q1, q3		@ block 1 ^ ct block 0
	veor		q2, q2, q4		@ block 2 ^ ct block 1
	vmov		q6, q5			@ ct block 2 becomes the next iv
	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 pt blocks
	vst1.8		{q2}, [r0, :64]!
	b		.Lcbcdecloop3x

	@ tail path: the remaining 0, 1 or 2 blocks, one at a time
.Lcbcdec1x:
	adds		r4, r4, #3		@ undo the last subtraction
	beq		.Lcbcdecout		@ nothing left
	vmov		q15, q14		@ preserve last round key
.Lcbcdecloop:
	vld1.8		{q0}, [r1, :64]!	@ get next ct block
	@ Fold the chaining value into the last round key, so that the
	@ closing xor of the final round applies both in one go.
	veor		q14, q15, q6		@ combine prev ct with last key
	vmov		q6, q0			@ this ct block is the next iv
	bl		aes_decrypt
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	bne		.Lcbcdecloop

.Lcbcdecout:
	vst1.8		{q6}, [r5]		@ hand the next iv back
	pop		{r4-r6, pc}
ENDPROC(ce_aes_cbc_decrypt)
|
|
|
|
/*
 * ce_aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
 *		      int blocks, u8 ctr[])
 */
ENTRY(ce_aes_ctr_encrypt)
	push		{r4-r6, lr}
	ldrd		r4, r5, [sp, #16]	@ r4 = blocks, r5 = ctr
	vld1.8		{q6}, [r5]		@ load ctr
	prepare_key	r2, r3

	@ s27 is the last 32-bit word of q6. Its byte swapped value is
	@ tracked in r6 so that it can be incremented with plain adds.
	vmov		r6, s27			@ keep swabbed ctr in r6
	rev		r6, r6
	cmn		r6, r4			@ 32 bit overflow?
	bcs		.Lctrloop		@ yes: only the 1x loop handles carry

	@ bulk path: three blocks per iteration, no carry possible
.Lctrloop3x:
	subs		r4, r4, #3
	bmi		.Lctr1x			@ fewer than 3 blocks left
	add		r6, r6, #1		@ r6 = ctr + 1
	vmov		q0, q6			@ q0 = ctr
	vmov		q1, q6
	rev		ip, r6
	add		r6, r6, #1		@ r6 = ctr + 2
	vmov		q2, q6
	vmov		s7, ip			@ last word of q1: ctr + 1
	rev		ip, r6
	add		r6, r6, #1		@ r6 = ctr + 3
	vmov		s11, ip			@ last word of q2: ctr + 2
	vld1.8		{q3-q4}, [r1, :64]!	@ get 3 input blocks
	vld1.8		{q5}, [r1, :64]!
	bl		aes_encrypt_3x
	veor		q0, q0, q3		@ xor keystream into the input
	veor		q1, q1, q4
	veor		q2, q2, q5
	rev		ip, r6
	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 output blocks
	vst1.8		{q2}, [r0, :64]!
	vmov		s27, ip			@ q6 = ctr + 3
	b		.Lctrloop3x

.Lctr1x:
	adds		r4, r4, #3		@ undo the last subtraction
	beq		.Lctrout		@ nothing left

	@ one block per iteration, with full carry propagation
.Lctrloop:
	vmov		q0, q6
	bl		aes_encrypt		@ q0 = keystream block
	subs		r4, r4, #1
	bmi		.Lctrhalfblock		@ blocks < 0 means 1/2 block
	vld1.8		{q3}, [r1, :64]!
	veor		q3, q0, q3
	vst1.8		{q3}, [r0, :64]!

	adds		r6, r6, #1		@ increment BE ctr
	rev		ip, r6			@ rev and vmov leave the carry
	vmov		s27, ip			@ from the adds intact
	bcs		.Lctrcarry
	teq		r4, #0
	bne		.Lctrloop

.Lctrout:
	vst1.8		{q6}, [r5]		@ hand the updated ctr back
	pop		{r4-r6, pc}

	@ Only 8 bytes are read, xored with the first half of the
	@ keystream block and written; ctr is not written back.
.Lctrhalfblock:
	vld1.8		{d1}, [r1, :64]
	veor		d0, d0, d1
	vst1.8		{d0}, [r0, :64]
	pop		{r4-r6, pc}

	@ The last word of the counter wrapped around: ripple the carry
	@ through the remaining words of q6, stopping at the first word
	@ that does not wrap itself.
.Lctrcarry:
	.irp		sreg, s26, s25, s24
	vmov		ip, \sreg		@ load next word of ctr
	rev		ip, ip			@ ... to handle the carry
	adds		ip, ip, #1
	rev		ip, ip
	vmov		\sreg, ip
	bcc		0f
	.endr
0:	teq		r4, #0
	beq		.Lctrout
	b		.Lctrloop
ENDPROC(ce_aes_ctr_encrypt)
|
|
|
|
/*
 * ce_aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
 *		      int blocks, u8 iv[], u8 const rk2[], int first)
 * ce_aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
 *		      int blocks, u8 iv[], u8 const rk2[], int first)
 */
|
|
|
|
.macro		next_tweak, out, in, const, tmp
	@ Derive the next tweak from in and place it in out. const is
	@ expected to hold the pair of 64-bit values found at
	@ .Lxts_mul_x; tmp is scratch.
	@
	@ Each 64-bit half of in is doubled, and the bit shifted out of
	@ one half is fed into the other half through the matching
	@ constant. The mask has to be taken before out is written,
	@ because callers pass the same register for out and in.

	@ tmp = all ones in every half of in whose top bit is set
	vshr.s64	\tmp, \in, #63
	@ ... reduced to the constant belonging to that half
	vand		\tmp, \tmp, \const
	@ double both halves
	vadd.u64	\out, \in, \in
	@ swap the halves of tmp so each lands on the opposite half
	vext.8		\tmp, \tmp, \tmp, #8
	veor		\out, \out, \tmp
.endm
|
|
|
|
.align		3
.Lxts_mul_x:
	.quad		1, 0x87			@ constants used by next_tweak

	@ Prologue shared by ce_aes_xts_encrypt and ce_aes_xts_decrypt.
	@ It is entered with bl right after their push of r4-r6 and lr,
	@ so the stacked arguments of the caller start at sp + 16.
	@
	@ On return:
	@   q7 = the two constants above
	@   r4 = blocks, r5 = iv pointer
	@   q0 = iv, encrypted with key 2 if first == 1
	@   r6 = first if first != 1, else the address of key 2
ce_aes_xts_init:
	vldr		d14, .Lxts_mul_x	@ d14/d15 form q7
	vldr		d15, .Lxts_mul_x + 8

	ldrd		r4, r5, [sp, #16]	@ load args
	ldr		r6, [sp, #28]		@ r6 = first
	vld1.8		{q0}, [r5]		@ load iv
	teq		r6, #1			@ start of a block?
	bxne		lr			@ no: return the iv untouched

	@ Encrypt the IV in q0 with the second AES key. This should only
	@ be done at the start of a block.
	ldr		r6, [sp, #24]		@ load AES key 2
	prepare_key	r6, r3
	add		ip, r6, #32		@ 3rd round key of key 2
	b		.Laes_encrypt_tweak	@ tail call
ENDPROC(ce_aes_xts_init)
|
|
|
|
ENTRY(ce_aes_xts_encrypt)
	@ r0 = out, r1 = in, r2 = rk1, r3 = rounds; the remaining
	@ arguments are fetched from the stack by ce_aes_xts_init.
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	@ load the round keys of key 1 for the data blocks
	prepare_key	r2, r3
	vmov		q3, q0			@ q3 = current tweak

	@ r6 is zero when the prologue left the stored tweak as it
	@ was; that tweak then has to be advanced before use.
	teq		r6, #0			@ start of a block?
	bne		.Lxtsenc3x

	@ bulk path: three blocks per iteration, tweaks in q3, q4, q5
.Lxtsencloop3x:
	next_tweak	q3, q3, q7, q6
.Lxtsenc3x:
	subs		r4, r4, #3
	bmi		.Lxtsenc1x		@ fewer than 3 blocks left
	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 pt blocks
	vld1.8		{q2}, [r1, :64]!
	next_tweak	q4, q3, q7, q6
	veor		q0, q0, q3		@ xor tweak in before ...
	next_tweak	q5, q4, q7, q6
	veor		q1, q1, q4
	veor		q2, q2, q5
	bl		aes_encrypt_3x
	veor		q0, q0, q3		@ ... and after the cipher
	veor		q1, q1, q4
	veor		q2, q2, q5
	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 ct blocks
	vst1.8		{q2}, [r0, :64]!
	vmov		q3, q5			@ q3 = last tweak used
	teq		r4, #0
	beq		.Lxtsencout
	b		.Lxtsencloop3x

	@ tail path: the remaining blocks one at a time, tweak in q3
.Lxtsenc1x:
	adds		r4, r4, #3		@ undo the last subtraction
	beq		.Lxtsencout
.Lxtsencloop:
	vld1.8		{q0}, [r1, :64]!
	veor		q0, q0, q3
	bl		aes_encrypt
	veor		q0, q0, q3
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	beq		.Lxtsencout
	next_tweak	q3, q3, q7, q6
	b		.Lxtsencloop

.Lxtsencout:
	vst1.8		{q3}, [r5]		@ store the last tweak used
	pop		{r4-r6, pc}
ENDPROC(ce_aes_xts_encrypt)
|
|
|
|
|
|
ENTRY(ce_aes_xts_decrypt)
	@ r0 = out, r1 = in, r2 = rk1, r3 = rounds; the remaining
	@ arguments are fetched from the stack by ce_aes_xts_init.
	push		{r4-r6, lr}

	bl		ce_aes_xts_init		@ run shared prologue
	@ load the round keys of key 1 for the data blocks
	prepare_key	r2, r3
	vmov		q3, q0			@ q3 = current tweak

	@ r6 is zero when the prologue left the stored tweak as it
	@ was; that tweak then has to be advanced before use.
	teq		r6, #0			@ start of a block?
	bne		.Lxtsdec3x

	@ bulk path: three blocks per iteration, tweaks in q3, q4, q5
.Lxtsdecloop3x:
	next_tweak	q3, q3, q7, q6
.Lxtsdec3x:
	subs		r4, r4, #3
	bmi		.Lxtsdec1x		@ fewer than 3 blocks left
	vld1.8		{q0-q1}, [r1, :64]!	@ get 3 ct blocks
	vld1.8		{q2}, [r1, :64]!
	next_tweak	q4, q3, q7, q6
	veor		q0, q0, q3		@ xor tweak in before ...
	next_tweak	q5, q4, q7, q6
	veor		q1, q1, q4
	veor		q2, q2, q5
	bl		aes_decrypt_3x
	veor		q0, q0, q3		@ ... and after the cipher
	veor		q1, q1, q4
	veor		q2, q2, q5
	vst1.8		{q0-q1}, [r0, :64]!	@ write 3 pt blocks
	vst1.8		{q2}, [r0, :64]!
	vmov		q3, q5			@ q3 = last tweak used
	teq		r4, #0
	beq		.Lxtsdecout
	b		.Lxtsdecloop3x

	@ tail path: the remaining blocks one at a time, tweak in q3
.Lxtsdec1x:
	adds		r4, r4, #3		@ undo the last subtraction
	beq		.Lxtsdecout
.Lxtsdecloop:
	vld1.8		{q0}, [r1, :64]!
	veor		q0, q0, q3
	@ aes_decrypt points ip at the 3rd round key by itself, so
	@ there is nothing to set up here beyond the block in q0
	bl		aes_decrypt
	veor		q0, q0, q3
	vst1.8		{q0}, [r0, :64]!
	subs		r4, r4, #1
	beq		.Lxtsdecout
	next_tweak	q3, q3, q7, q6
	b		.Lxtsdecloop

.Lxtsdecout:
	vst1.8		{q3}, [r5]		@ store the last tweak used
	pop		{r4-r6, pc}
ENDPROC(ce_aes_xts_decrypt)
|
|
|
|
/*
 * u32 ce_aes_sub(u32 input) - use the aese instruction to perform the
 *                             AES sbox substitution on each byte in
 *                             'input'
 */
ENTRY(ce_aes_sub)
	veor		q0, q0, q0		@ all-zero state
	vdup.32		q1, r0			@ input replicated into all 4 words
	aese.8		q0, q1
	vmov		r0, s0			@ return the first word of the result
	bx		lr
ENDPROC(ce_aes_sub)
|
|
|
|
/*
 * void ce_aes_invert(u8 *dst, u8 *src) - perform the Inverse MixColumns
 *                                        operation on round key *src
 */
ENTRY(ce_aes_invert)
	vld1.8		{q0}, [r1]		@ q0 = 16 byte round key at src
	aesimc.8	q0, q0
	vst1.8		{q0}, [r0]		@ store the result at dst
	bx		lr
ENDPROC(ce_aes_invert)
|