linux/arch/arm64/crypto/ghash-ce-core.S
Ard Biesheuvel f10dc56c64 crypto: arm64 - revert NEON yield for fast AEAD implementations
As it turns out, checking the TIF_NEED_RESCHED flag after each
iteration results in a significant performance regression (~10%)
when running fast algorithms (i.e., ones that use special instructions
and operate in the < 4 cycles per byte range) on in-order cores with
comparatively slow memory accesses such as the Cortex-A53.

Given the speed of these ciphers, and the fact that the page based
nature of the AEAD scatterwalk API guarantees that the core NEON
transform is never invoked with more than a single page's worth of
input, we can estimate the worst case duration of any resulting
scheduling blackout: on a 1 GHz Cortex-A53 running with 64k pages,
processing a page's worth of input at 4 cycles per byte results in
a delay of ~250 us, which is a reasonable upper bound.

So let's remove the yield checks from the fused AES-CCM and AES-GCM
routines entirely.

This reverts commit 7b67ae4d5c and
partially reverts commit 7c50136a8a.

Fixes: 7c50136a8a ("crypto: arm64/aes-ghash - yield NEON after every ...")
Fixes: 7b67ae4d5c ("crypto: arm64/aes-ccm - yield NEON after every ...")
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Acked-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
2018-08-07 17:26:23 +08:00

466 lines
9.7 KiB
ArmAsm

/*
* Accelerated GHASH implementation with ARMv8 PMULL instructions.
*
* Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 as published
* by the Free Software Foundation.
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
SHASH .req v0
SHASH2 .req v1
T1 .req v2
T2 .req v3
MASK .req v4
XL .req v5
XM .req v6
XH .req v7
IN1 .req v7
k00_16 .req v8
k32_48 .req v9
t3 .req v10
t4 .req v11
t5 .req v12
t6 .req v13
t7 .req v14
t8 .req v15
t9 .req v16
perm1 .req v17
perm2 .req v18
perm3 .req v19
sh1 .req v20
sh2 .req v21
sh3 .req v22
sh4 .req v23
ss1 .req v24
ss2 .req v25
ss3 .req v26
ss4 .req v27
.text
.arch armv8-a+crypto
.macro __pmull_p64, rd, rn, rm
pmull \rd\().1q, \rn\().1d, \rm\().1d
.endm
.macro __pmull2_p64, rd, rn, rm
pmull2 \rd\().1q, \rn\().2d, \rm\().2d
.endm
.macro __pmull_p8, rq, ad, bd
ext t3.8b, \ad\().8b, \ad\().8b, #1 // A1
ext t5.8b, \ad\().8b, \ad\().8b, #2 // A2
ext t7.8b, \ad\().8b, \ad\().8b, #3 // A3
__pmull_p8_\bd \rq, \ad
.endm
.macro __pmull2_p8, rq, ad, bd
tbl t3.16b, {\ad\().16b}, perm1.16b // A1
tbl t5.16b, {\ad\().16b}, perm2.16b // A2
tbl t7.16b, {\ad\().16b}, perm3.16b // A3
__pmull2_p8_\bd \rq, \ad
.endm
.macro __pmull_p8_SHASH, rq, ad
__pmull_p8_tail \rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
.endm
.macro __pmull_p8_SHASH2, rq, ad
__pmull_p8_tail \rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
.endm
.macro __pmull2_p8_SHASH, rq, ad
__pmull_p8_tail \rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
.endm
.macro __pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
pmull\t t3.8h, t3.\nb, \bd // F = A1*B
pmull\t t4.8h, \ad, \b1\().\nb // E = A*B1
pmull\t t5.8h, t5.\nb, \bd // H = A2*B
pmull\t t6.8h, \ad, \b2\().\nb // G = A*B2
pmull\t t7.8h, t7.\nb, \bd // J = A3*B
pmull\t t8.8h, \ad, \b3\().\nb // I = A*B3
pmull\t t9.8h, \ad, \b4\().\nb // K = A*B4
pmull\t \rq\().8h, \ad, \bd // D = A*B
eor t3.16b, t3.16b, t4.16b // L = E + F
eor t5.16b, t5.16b, t6.16b // M = G + H
eor t7.16b, t7.16b, t8.16b // N = I + J
uzp1 t4.2d, t3.2d, t5.2d
uzp2 t3.2d, t3.2d, t5.2d
uzp1 t6.2d, t7.2d, t9.2d
uzp2 t7.2d, t7.2d, t9.2d
// t3 = (L) (P0 + P1) << 8
// t5 = (M) (P2 + P3) << 16
eor t4.16b, t4.16b, t3.16b
and t3.16b, t3.16b, k32_48.16b
// t7 = (N) (P4 + P5) << 24
// t9 = (K) (P6 + P7) << 32
eor t6.16b, t6.16b, t7.16b
and t7.16b, t7.16b, k00_16.16b
eor t4.16b, t4.16b, t3.16b
eor t6.16b, t6.16b, t7.16b
zip2 t5.2d, t4.2d, t3.2d
zip1 t3.2d, t4.2d, t3.2d
zip2 t9.2d, t6.2d, t7.2d
zip1 t7.2d, t6.2d, t7.2d
ext t3.16b, t3.16b, t3.16b, #15
ext t5.16b, t5.16b, t5.16b, #14
ext t7.16b, t7.16b, t7.16b, #13
ext t9.16b, t9.16b, t9.16b, #12
eor t3.16b, t3.16b, t5.16b
eor t7.16b, t7.16b, t9.16b
eor \rq\().16b, \rq\().16b, t3.16b
eor \rq\().16b, \rq\().16b, t7.16b
.endm
.macro __pmull_pre_p64
movi MASK.16b, #0xe1
shl MASK.2d, MASK.2d, #57
.endm
.macro __pmull_pre_p8
// k00_16 := 0x0000000000000000_000000000000ffff
// k32_48 := 0x00000000ffffffff_0000ffffffffffff
movi k32_48.2d, #0xffffffff
mov k32_48.h[2], k32_48.h[0]
ushr k00_16.2d, k32_48.2d, #32
// prepare the permutation vectors
mov_q x5, 0x080f0e0d0c0b0a09
movi T1.8b, #8
dup perm1.2d, x5
eor perm1.16b, perm1.16b, T1.16b
ushr perm2.2d, perm1.2d, #8
ushr perm3.2d, perm1.2d, #16
ushr T1.2d, perm1.2d, #24
sli perm2.2d, perm1.2d, #56
sli perm3.2d, perm1.2d, #48
sli T1.2d, perm1.2d, #40
// precompute loop invariants
tbl sh1.16b, {SHASH.16b}, perm1.16b
tbl sh2.16b, {SHASH.16b}, perm2.16b
tbl sh3.16b, {SHASH.16b}, perm3.16b
tbl sh4.16b, {SHASH.16b}, T1.16b
ext ss1.8b, SHASH2.8b, SHASH2.8b, #1
ext ss2.8b, SHASH2.8b, SHASH2.8b, #2
ext ss3.8b, SHASH2.8b, SHASH2.8b, #3
ext ss4.8b, SHASH2.8b, SHASH2.8b, #4
.endm
//
// PMULL (64x64->128) based reduction for CPUs that can do
// it in a single instruction.
//
.macro __pmull_reduce_p64
pmull T2.1q, XL.1d, MASK.1d
eor XM.16b, XM.16b, T1.16b
mov XH.d[0], XM.d[1]
mov XM.d[1], XL.d[0]
eor XL.16b, XM.16b, T2.16b
ext T2.16b, XL.16b, XL.16b, #8
pmull XL.1q, XL.1d, MASK.1d
.endm
//
// Alternative reduction for CPUs that lack support for the
// 64x64->128 PMULL instruction
//
.macro __pmull_reduce_p8
eor XM.16b, XM.16b, T1.16b
mov XL.d[1], XM.d[0]
mov XH.d[0], XM.d[1]
shl T1.2d, XL.2d, #57
shl T2.2d, XL.2d, #62
eor T2.16b, T2.16b, T1.16b
shl T1.2d, XL.2d, #63
eor T2.16b, T2.16b, T1.16b
ext T1.16b, XL.16b, XH.16b, #8
eor T2.16b, T2.16b, T1.16b
mov XL.d[1], T2.d[0]
mov XH.d[0], T2.d[1]
ushr T2.2d, XL.2d, #1
eor XH.16b, XH.16b, XL.16b
eor XL.16b, XL.16b, T2.16b
ushr T2.2d, T2.2d, #6
ushr XL.2d, XL.2d, #1
.endm
.macro __pmull_ghash, pn
frame_push 5
mov x19, x0
mov x20, x1
mov x21, x2
mov x22, x3
mov x23, x4
0: ld1 {SHASH.2d}, [x22]
ld1 {XL.2d}, [x20]
ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
eor SHASH2.16b, SHASH2.16b, SHASH.16b
__pmull_pre_\pn
/* do the head block first, if supplied */
cbz x23, 1f
ld1 {T1.2d}, [x23]
mov x23, xzr
b 2f
1: ld1 {T1.2d}, [x21], #16
sub w19, w19, #1
2: /* multiply XL by SHASH in GF(2^128) */
CPU_LE( rev64 T1.16b, T1.16b )
ext T2.16b, XL.16b, XL.16b, #8
ext IN1.16b, T1.16b, T1.16b, #8
eor T1.16b, T1.16b, T2.16b
eor XL.16b, XL.16b, IN1.16b
__pmull2_\pn XH, XL, SHASH // a1 * b1
eor T1.16b, T1.16b, XL.16b
__pmull_\pn XL, XL, SHASH // a0 * b0
__pmull_\pn XM, T1, SHASH2 // (a1 + a0)(b1 + b0)
eor T2.16b, XL.16b, XH.16b
ext T1.16b, XL.16b, XH.16b, #8
eor XM.16b, XM.16b, T2.16b
__pmull_reduce_\pn
eor T2.16b, T2.16b, XH.16b
eor XL.16b, XL.16b, T2.16b
cbz w19, 3f
if_will_cond_yield_neon
st1 {XL.2d}, [x20]
do_cond_yield_neon
b 0b
endif_yield_neon
b 1b
3: st1 {XL.2d}, [x20]
frame_pop
ret
.endm
/*
* void pmull_ghash_update(int blocks, u64 dg[], const char *src,
* struct ghash_key const *k, const char *head)
*/
ENTRY(pmull_ghash_update_p64)
__pmull_ghash p64
ENDPROC(pmull_ghash_update_p64)
ENTRY(pmull_ghash_update_p8)
__pmull_ghash p8
ENDPROC(pmull_ghash_update_p8)
KS .req v8
CTR .req v9
INP .req v10
.macro load_round_keys, rounds, rk
cmp \rounds, #12
blo 2222f /* 128 bits */
beq 1111f /* 192 bits */
ld1 {v17.4s-v18.4s}, [\rk], #32
1111: ld1 {v19.4s-v20.4s}, [\rk], #32
2222: ld1 {v21.4s-v24.4s}, [\rk], #64
ld1 {v25.4s-v28.4s}, [\rk], #64
ld1 {v29.4s-v31.4s}, [\rk]
.endm
.macro enc_round, state, key
aese \state\().16b, \key\().16b
aesmc \state\().16b, \state\().16b
.endm
.macro enc_block, state, rounds
cmp \rounds, #12
b.lo 2222f /* 128 bits */
b.eq 1111f /* 192 bits */
enc_round \state, v17
enc_round \state, v18
1111: enc_round \state, v19
enc_round \state, v20
2222: .irp key, v21, v22, v23, v24, v25, v26, v27, v28, v29
enc_round \state, \key
.endr
aese \state\().16b, v30.16b
eor \state\().16b, \state\().16b, v31.16b
.endm
.macro pmull_gcm_do_crypt, enc
ld1 {SHASH.2d}, [x4]
ld1 {XL.2d}, [x1]
ldr x8, [x5, #8] // load lower counter
load_round_keys w7, x6
movi MASK.16b, #0xe1
ext SHASH2.16b, SHASH.16b, SHASH.16b, #8
CPU_LE( rev x8, x8 )
shl MASK.2d, MASK.2d, #57
eor SHASH2.16b, SHASH2.16b, SHASH.16b
.if \enc == 1
ldr x10, [sp]
ld1 {KS.16b}, [x10]
.endif
0: ld1 {CTR.8b}, [x5] // load upper counter
ld1 {INP.16b}, [x3], #16
rev x9, x8
add x8, x8, #1
sub w0, w0, #1
ins CTR.d[1], x9 // set lower counter
.if \enc == 1
eor INP.16b, INP.16b, KS.16b // encrypt input
st1 {INP.16b}, [x2], #16
.endif
rev64 T1.16b, INP.16b
cmp w7, #12
b.ge 2f // AES-192/256?
1: enc_round CTR, v21
ext T2.16b, XL.16b, XL.16b, #8
ext IN1.16b, T1.16b, T1.16b, #8
enc_round CTR, v22
eor T1.16b, T1.16b, T2.16b
eor XL.16b, XL.16b, IN1.16b
enc_round CTR, v23
pmull2 XH.1q, SHASH.2d, XL.2d // a1 * b1
eor T1.16b, T1.16b, XL.16b
enc_round CTR, v24
pmull XL.1q, SHASH.1d, XL.1d // a0 * b0
pmull XM.1q, SHASH2.1d, T1.1d // (a1 + a0)(b1 + b0)
enc_round CTR, v25
ext T1.16b, XL.16b, XH.16b, #8
eor T2.16b, XL.16b, XH.16b
eor XM.16b, XM.16b, T1.16b
enc_round CTR, v26
eor XM.16b, XM.16b, T2.16b
pmull T2.1q, XL.1d, MASK.1d
enc_round CTR, v27
mov XH.d[0], XM.d[1]
mov XM.d[1], XL.d[0]
enc_round CTR, v28
eor XL.16b, XM.16b, T2.16b
enc_round CTR, v29
ext T2.16b, XL.16b, XL.16b, #8
aese CTR.16b, v30.16b
pmull XL.1q, XL.1d, MASK.1d
eor T2.16b, T2.16b, XH.16b
eor KS.16b, CTR.16b, v31.16b
eor XL.16b, XL.16b, T2.16b
.if \enc == 0
eor INP.16b, INP.16b, KS.16b
st1 {INP.16b}, [x2], #16
.endif
cbnz w0, 0b
CPU_LE( rev x8, x8 )
st1 {XL.2d}, [x1]
str x8, [x5, #8] // store lower counter
.if \enc == 1
st1 {KS.16b}, [x10]
.endif
ret
2: b.eq 3f // AES-192?
enc_round CTR, v17
enc_round CTR, v18
3: enc_round CTR, v19
enc_round CTR, v20
b 1b
.endm
/*
* void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
* struct ghash_key const *k, u8 ctr[],
* int rounds, u8 ks[])
*/
ENTRY(pmull_gcm_encrypt)
pmull_gcm_do_crypt 1
ENDPROC(pmull_gcm_encrypt)
/*
* void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
* struct ghash_key const *k, u8 ctr[],
* int rounds)
*/
ENTRY(pmull_gcm_decrypt)
pmull_gcm_do_crypt 0
ENDPROC(pmull_gcm_decrypt)
/*
* void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
*/
ENTRY(pmull_gcm_encrypt_block)
cbz x2, 0f
load_round_keys w3, x2
0: ld1 {v0.16b}, [x1]
enc_block v0, w3
st1 {v0.16b}, [x0]
ret
ENDPROC(pmull_gcm_encrypt_block)