forked from Minki/linux
crypto: arm/ghash-ce - implement support for 4-way aggregation
Speed up the GHASH algorithm based on 64-bit polynomial multiplication by adding support for 4-way aggregation. This improves throughput by ~85% on Cortex-A53, from 1.7 cycles per byte to 0.9 cycles per byte. When combined with AES into GCM, throughput improves by ~25%, from 3.8 cycles per byte to 3.0 cycles per byte. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
parent
ab8085c130
commit
00227e3a1d
@ -99,6 +99,7 @@ config CRYPTO_GHASH_ARM_CE
|
||||
depends on KERNEL_MODE_NEON
|
||||
select CRYPTO_HASH
|
||||
select CRYPTO_CRYPTD
|
||||
select CRYPTO_GF128MUL
|
||||
help
|
||||
Use an implementation of GHASH (used by the GCM AEAD chaining mode)
|
||||
that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
|
||||
|
@ -63,6 +63,33 @@
|
||||
k48 .req d31
|
||||
SHASH2_p64 .req d31
|
||||
|
||||
/*
 * Extra key registers for the 4-way aggregated p64 path.
 * pmull_ghash_update_p64 loads SHASH, HH, HH3 and HH4 from consecutive
 * 16-byte slots at [r3]; presumably these are h, h2, h3 and h4 of
 * struct ghash_key -- TODO confirm against the C caller.
 * HH34 is not loaded: its halves are computed as
 *   HH34_L = HH3_L ^ HH3_H,  HH34_H = HH4_L ^ HH4_H
 * and serve as the key operand of the "(a1 + a0)(b1 + b0)" products.
 */
HH .req q10
|
||||
HH3 .req q11
|
||||
HH4 .req q12
|
||||
HH34 .req q13
|
||||
|
||||
/* 64-bit halves of the registers above (qN overlays d(2N) and d(2N+1)). */
HH_L .req d20
|
||||
HH_H .req d21
|
||||
HH3_L .req d22
|
||||
HH3_H .req d23
|
||||
HH4_L .req d24
|
||||
HH4_H .req d25
|
||||
HH34_L .req d26
|
||||
HH34_H .req d27
|
||||
/* Set to HH_L ^ HH_H by pmull_ghash_update_p64; middle-product operand for HH. */
SHASH2_H .req d29
|
||||
|
||||
/*
 * Second working set for the 4-way loop. XL2/XM2 and T3 first receive input
 * blocks (vld1.8 {XL2-XM2} / {T3-T2}); XL2, XM2 and XH2 are then reused as
 * destinations for the per-block partial products that get XORed into
 * XL, XM and XH.
 */
XL2 .req q5
|
||||
XM2 .req q6
|
||||
XH2 .req q7
|
||||
T3 .req q8
|
||||
|
||||
/* 64-bit halves of XL2, XM2 and T3 (no halves are aliased for XH2 here). */
XL2_L .req d10
|
||||
XL2_H .req d11
|
||||
XM2_L .req d12
|
||||
XM2_H .req d13
|
||||
T3_L .req d16
|
||||
T3_H .req d17
|
||||
|
||||
.text
|
||||
.fpu crypto-neon-fp-armv8
|
||||
|
||||
@ -175,12 +202,77 @@
|
||||
beq 0f
|
||||
vld1.64 {T1}, [ip]
|
||||
teq r0, #0
|
||||
b 1f
|
||||
b 3f
|
||||
|
||||
0: vld1.64 {T1}, [r2]!
|
||||
0: .ifc \pn, p64
|
||||
tst r0, #3 // skip until #blocks is a
|
||||
bne 2f // round multiple of 4
|
||||
|
||||
vld1.8 {XL2-XM2}, [r2]!
|
||||
1: vld1.8 {T3-T2}, [r2]!
|
||||
vrev64.8 XL2, XL2
|
||||
vrev64.8 XM2, XM2
|
||||
|
||||
subs r0, r0, #4
|
||||
|
||||
vext.8 T1, XL2, XL2, #8
|
||||
veor XL2_H, XL2_H, XL_L
|
||||
veor XL, XL, T1
|
||||
|
||||
vrev64.8 T3, T3
|
||||
vrev64.8 T1, T2
|
||||
|
||||
vmull.p64 XH, HH4_H, XL_H // a1 * b1
|
||||
veor XL2_H, XL2_H, XL_H
|
||||
vmull.p64 XL, HH4_L, XL_L // a0 * b0
|
||||
vmull.p64 XM, HH34_H, XL2_H // (a1 + a0)(b1 + b0)
|
||||
|
||||
vmull.p64 XH2, HH3_H, XM2_L // a1 * b1
|
||||
veor XM2_L, XM2_L, XM2_H
|
||||
vmull.p64 XL2, HH3_L, XM2_H // a0 * b0
|
||||
vmull.p64 XM2, HH34_L, XM2_L // (a1 + a0)(b1 + b0)
|
||||
|
||||
veor XH, XH, XH2
|
||||
veor XL, XL, XL2
|
||||
veor XM, XM, XM2
|
||||
|
||||
vmull.p64 XH2, HH_H, T3_L // a1 * b1
|
||||
veor T3_L, T3_L, T3_H
|
||||
vmull.p64 XL2, HH_L, T3_H // a0 * b0
|
||||
vmull.p64 XM2, SHASH2_H, T3_L // (a1 + a0)(b1 + b0)
|
||||
|
||||
veor XH, XH, XH2
|
||||
veor XL, XL, XL2
|
||||
veor XM, XM, XM2
|
||||
|
||||
vmull.p64 XH2, SHASH_H, T1_L // a1 * b1
|
||||
veor T1_L, T1_L, T1_H
|
||||
vmull.p64 XL2, SHASH_L, T1_H // a0 * b0
|
||||
vmull.p64 XM2, SHASH2_p64, T1_L // (a1 + a0)(b1 + b0)
|
||||
|
||||
veor XH, XH, XH2
|
||||
veor XL, XL, XL2
|
||||
veor XM, XM, XM2
|
||||
|
||||
beq 4f
|
||||
|
||||
vld1.8 {XL2-XM2}, [r2]!
|
||||
|
||||
veor T1, XL, XH
|
||||
veor XM, XM, T1
|
||||
|
||||
__pmull_reduce_p64
|
||||
|
||||
veor T1, T1, XH
|
||||
veor XL, XL, T1
|
||||
|
||||
b 1b
|
||||
.endif
|
||||
|
||||
2: vld1.64 {T1}, [r2]!
|
||||
subs r0, r0, #1
|
||||
|
||||
1: /* multiply XL by SHASH in GF(2^128) */
|
||||
3: /* multiply XL by SHASH in GF(2^128) */
|
||||
#ifndef CONFIG_CPU_BIG_ENDIAN
|
||||
vrev64.8 T1, T1
|
||||
#endif
|
||||
@ -193,7 +285,7 @@
|
||||
__pmull_\pn XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l @ a0 * b0
|
||||
__pmull_\pn XM, T1_L, SHASH2_\pn @ (a1+a0)(b1+b0)
|
||||
|
||||
veor T1, XL, XH
|
||||
4: veor T1, XL, XH
|
||||
veor XM, XM, T1
|
||||
|
||||
__pmull_reduce_\pn
|
||||
@ -212,8 +304,14 @@
|
||||
* struct ghash_key const *k, const char *head)
|
||||
*/
|
||||
ENTRY(pmull_ghash_update_p64)
|
||||
vld1.64 {SHASH}, [r3]
|
||||
vld1.64 {SHASH}, [r3]!
|
||||
vld1.64 {HH}, [r3]!
|
||||
vld1.64 {HH3-HH4}, [r3]
|
||||
|
||||
veor SHASH2_p64, SHASH_L, SHASH_H
|
||||
veor SHASH2_H, HH_L, HH_H
|
||||
veor HH34_L, HH3_L, HH3_H
|
||||
veor HH34_H, HH4_L, HH4_H
|
||||
|
||||
vmov.i8 MASK, #0xe1
|
||||
vshl.u64 MASK, MASK, #57
|
||||
|
@ -1,7 +1,7 @@
|
||||
/*
|
||||
* Accelerated GHASH implementation with ARMv8 vmull.p64 instructions.
|
||||
*
|
||||
* Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
|
||||
* Copyright (C) 2015 - 2018 Linaro Ltd. <ard.biesheuvel@linaro.org>
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 as published
|
||||
@ -28,8 +28,10 @@ MODULE_ALIAS_CRYPTO("ghash");
|
||||
#define GHASH_DIGEST_SIZE 16
|
||||
|
||||
/*
 * Per-transform GHASH key material, filled in by ghash_setkey().
 *
 * NOTE(review): this listing is a rendered diff without +/- markers, so
 * removed and added members appear together. Presumably 'a'/'b' are the
 * members being replaced by 'h' -- confirm against the real file before
 * relying on the layout.
 */
struct ghash_key {
|
||||
u64 a;	/* NOTE(review): looks like a pre-change member -- confirm */
|
||||
u64 b;	/* NOTE(review): looks like a pre-change member -- confirm */
|
||||
u64 h[2];	/* ghash_reflect() of the raw key */
|
||||
u64 h2[2];	/* ghash_reflect() after one gf128mul_lle() by the key */
|
||||
u64 h3[2];	/* ghash_reflect() after two gf128mul_lle() by the key */
|
||||
u64 h4[2];	/* ghash_reflect() after three gf128mul_lle() by the key */
|
||||
};
|
||||
|
||||
struct ghash_desc_ctx {
|
||||
@ -117,26 +119,40 @@ static int ghash_final(struct shash_desc *desc, u8 *dst)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Convert a key value from its be128 form into the two native u64 words
 * kept in struct ghash_key.
 *
 * Both halves are byte-swapped to CPU order and the 128-bit quantity is
 * shifted left by one bit: h[0] is built from k->b and h[1] from k->a, with
 * the bit shifted out of each half carried into the bottom of the other.
 * When the top bit of k->a was set, h[1] is additionally XORed with
 * 0xc200000000000000.
 *
 * @h: output, two u64 words
 * @k: input value; not modified
 */
static void ghash_reflect(u64 h[], const be128 *k)
{
	const u64 hi = be64_to_cpu(k->a);
	const u64 lo = be64_to_cpu(k->b);
	const u64 msb = hi >> 63;

	h[0] = (lo << 1) | msb;
	h[1] = (hi << 1) | (lo >> 63);

	if (msb)
		h[1] ^= 0xc200000000000000UL;
}
|
||||
|
||||
/*
 * Install a GHASH key on the transform.
 *
 * Rejects any key whose length is not GHASH_BLOCK_SIZE: sets
 * CRYPTO_TFM_RES_BAD_KEY_LEN on the tfm and returns -EINVAL. Otherwise
 * stores the key in struct ghash_key and returns 0.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers, so
 * removed and added statements are interleaved. Presumably the statements
 * that use 'a'/'b' are the old code and the ghash_reflect()/gf128mul_lle()
 * sequence is the replacement -- confirm against the real file.
 */
static int ghash_setkey(struct crypto_shash *tfm,
|
||||
const u8 *inkey, unsigned int keylen)
|
||||
{
|
||||
struct ghash_key *key = crypto_shash_ctx(tfm);
|
||||
u64 a, b;
|
||||
be128 h, k;
|
||||
|
||||
if (keylen != GHASH_BLOCK_SIZE) {
|
||||
crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
/* perform multiplication by 'x' in GF(2^128) */
|
||||
b = get_unaligned_be64(inkey);
|
||||
a = get_unaligned_be64(inkey + 8);
|
||||
/* Copy the raw key bytes into a be128 and store its reflected form in h. */
memcpy(&k, inkey, GHASH_BLOCK_SIZE);
|
||||
ghash_reflect(key->h, &k);
|
||||
|
||||
key->a = (a << 1) | (b >> 63);
|
||||
key->b = (b << 1) | (a >> 63);
|
||||
/*
 * Start from h = k and pass it through gf128mul_lle() with k once per
 * stored entry; presumably h2, h3 and h4 therefore hold the key raised to
 * the 2nd, 3rd and 4th power -- verify against gf128mul_lle() semantics.
 */
h = k;
|
||||
gf128mul_lle(&h, &k);
|
||||
ghash_reflect(key->h2, &h);
|
||||
|
||||
if (b >> 63)
|
||||
key->b ^= 0xc200000000000000UL;
|
||||
gf128mul_lle(&h, &k);
|
||||
ghash_reflect(key->h3, &h);
|
||||
|
||||
gf128mul_lle(&h, &k);
|
||||
ghash_reflect(key->h4, &h);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user