crypto: arm/crct10dif - Implement plain NEON variant

The CRC-T10DIF algorithm produces a 16-bit CRC, and this is reflected in
the folding coefficients, which are also only 16 bits wide.

This means that the polynomial multiplications involving these
coefficients can be performed using 8-bit long polynomial multiplication
(8x8 -> 16) in only a few steps, and this is an instruction that is part
of the base NEON ISA, which is all that most real ARMv7 cores implement.
(The 64-bit PMULL instruction is part of the crypto extensions, which
are only implemented by 64-bit cores.)

The final reduction is a bit more involved, but we can delegate that to
the generic CRC-T10DIF implementation after folding the entire input
into a 16 byte vector.

This results in a speedup of around 6.6x on Cortex-A72 running in 32-bit
mode. On Cortex-A8 (BeagleBone White), the results are substantially
better than that, but not sufficiently reproducible (with tcrypt) to
quote a number here.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
Ard Biesheuvel 2024-11-05 17:09:06 +01:00 committed by Herbert Xu
parent 802d8d110c
commit e7c1d1c9b2
2 changed files with 134 additions and 9 deletions

View File

@ -112,6 +112,82 @@
FOLD_CONST_L .req q10l
FOLD_CONST_H .req q10h
/*
* Pairwise long polynomial multiplication of two 16-bit values
*
* { w0, w1 }, { y0, y1 }
*
* by two 64-bit values
*
* { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
*
* where each vector element is a byte, ordered from least to most
* significant. The resulting 80-bit vectors are XOR'ed together.
*
* This can be implemented using 8x8 long polynomial multiplication, by
* reorganizing the input so that each pairwise 8x8 multiplication
* produces one of the terms from the decomposition below, and
* combining the results of each rank and shifting them into place.
*
* Rank
* 0 w0*x0 ^ | y0*z0 ^
* 1 (w0*x1 ^ w1*x0) << 8 ^ | (y0*z1 ^ y1*z0) << 8 ^
* 2 (w0*x2 ^ w1*x1) << 16 ^ | (y0*z2 ^ y1*z1) << 16 ^
* 3 (w0*x3 ^ w1*x2) << 24 ^ | (y0*z3 ^ y1*z2) << 24 ^
* 4 (w0*x4 ^ w1*x3) << 32 ^ | (y0*z4 ^ y1*z3) << 32 ^
* 5 (w0*x5 ^ w1*x4) << 40 ^ | (y0*z5 ^ y1*z4) << 40 ^
* 6 (w0*x6 ^ w1*x5) << 48 ^ | (y0*z6 ^ y1*z5) << 48 ^
* 7 (w0*x7 ^ w1*x6) << 56 ^ | (y0*z7 ^ y1*z6) << 56 ^
* 8 w1*x7 << 64 | y1*z7 << 64
*
* The inputs can be reorganized into
*
* { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
* { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
*
* and after performing 8x8->16 bit long polynomial multiplication of
* each of the halves of the first vector with those of the second one,
* we obtain the following four vectors of 16-bit elements:
*
* a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
* b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
* c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
* d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
*
* Results b and c can be XORed together, as the vector elements have
* matching ranks. Then, the final XOR can be pulled forward, and
* applied between the halves of each of the remaining three vectors,
* which are then shifted into place, and XORed together to produce the
* final 80-bit result.
*/
// Multiply the two 16-bit constants held in v16 (via its _L/_H halves) by
// the two 64-bit halves of v64 using only 8x8->16 polynomial multiplies,
// and leave the XOR of both products in v64.
//
// Expects r4 to point at the 16-byte permutation table (.L16x64perm).
// Uses q11-q15 as scratch, and clobbers lr because of the 'bl' below.
.macro pmull16x64_p8, v16, v64
// q11 = v64 rotated down by one byte, so that the unzip below can split
// the input into its even and odd numbered bytes
vext.8 q11, \v64, \v64, #1
// q12 = byte indices used to replicate the bytes of the 16-bit constants
vld1.64 {q12}, [r4, :128]
// d22 = odd bytes { x1, x3, x5, x7, z1, z3, z5, z7 }
// d23 = even bytes { x0, x2, x4, x6, z0, z2, z4, z6 }
// (v64 itself receives leftovers here and is overwritten at the end)
vuzp.8 q11, \v64
// d24 = { w0, w0, w0, w0, y0, y0, y0, y0 }
vtbl.8 d24, {\v16\()_L-\v16\()_H}, d24
// d25 = { w1, w1, w1, w1, y1, y1, y1, y1 }
vtbl.8 d25, {\v16\()_L-\v16\()_H}, d25
// shared out-of-line part: returns the partial results in q12 and q14
bl __pmull16x64_p8
// combine the two partial results into the final value
veor \v64, q12, q14
.endm
// Out-of-line body shared by every expansion of the pmull16x64_p8 macro.
//
// Inputs:  d22 = odd bytes of the 64-bit operands
//          d23 = even bytes of the 64-bit operands
//          d24 = replicated low bytes of the 16-bit constants (w0/y0)
//          d25 = replicated high bytes of the 16-bit constants (w1/y1)
// Outputs: q12 and q14, which the caller XORs together
// Scratch: q13, q15
__pmull16x64_p8:
// a := even bytes * w0/y0
vmull.p8 q13, d23, d24
// c := even bytes * w1/y1
vmull.p8 q14, d23, d25
// b := odd bytes * w0/y0
vmull.p8 q15, d22, d24
// d := odd bytes * w1/y1 (overwrites the d24/d25 inputs, no longer needed)
vmull.p8 q12, d22, d25
// b and c have matching ranks, so merge them first
veor q14, q14, q15
// pull the final XOR forward: fold the upper half of each vector into
// its lower half (d, then a, then b ^ c)
veor d24, d24, d25
veor d26, d26, d27
veor d28, d28, d29
// clear the upper halves so the rotations below shift in zero bytes
vmov.i32 d25, #0
vmov.i32 d29, #0
// move d up by two bytes (16 bits)
vext.8 q12, q12, q12, #14
// move b ^ c up by one byte (8 bits)
vext.8 q14, q14, q14, #15
// a needs no shift: merge it into the low 64 bits of q12
veor d24, d24, d26
bx lr
ENDPROC(__pmull16x64_p8)
.macro pmull16x64_p64, v16, v64
vmull.p64 q11, \v64\()l, \v16\()_L
vmull.p64 \v64, \v64\()h, \v16\()_H
@ -249,9 +325,9 @@ CPU_LE( vrev64.8 q0, q0 )
vswp q0l, q0h
// q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
mov_l r3, .Lbyteshift_table + 16
sub r3, r3, len
vld1.8 {q2}, [r3]
mov_l r1, .Lbyteshift_table + 16
sub r1, r1, len
vld1.8 {q2}, [r1]
vtbl.8 q1l, {q7l-q7h}, q2l
vtbl.8 q1h, {q7l-q7h}, q2h
@ -341,9 +417,20 @@ ENTRY(crc_t10dif_pmull64)
vmov.u16 r0, q0l[0]
bx lr
ENDPROC(crc_t10dif_pmull64)
// void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len,
//                        u8 out[16])
//
// Plain NEON variant: stores a 16-byte vector through the pointer in r3
// instead of returning a finished CRC. The store below carries a 128-bit
// alignment hint, so 'out' must be 16-byte aligned.
ENTRY(crc_t10dif_pmull8)
// r4 holds the permutation table pointer; lr is clobbered by the 'bl'
// inside the pmull16x64_p8 macro, so both have to be preserved
push {r4, lr}
mov_l r4, .L16x64perm
// NOTE(review): the crct10dif macro body is not visible here; presumably
// it performs the folding and leaves the remaining 16 bytes in q7 - confirm
crct10dif p8
// reorder the bytes of q7 before storing: reverse the bytes within each
// 64-bit half (little-endian builds only), then swap the two halves
CPU_LE( vrev64.8 q7, q7 )
vswp q7l, q7h
// r3 is expected to still hold the fourth argument ('out') at this point
vst1.64 {q7}, [r3, :128]
pop {r4, pc}
ENDPROC(crc_t10dif_pmull8)
.section ".rodata", "a"
.align 4
@ -376,3 +463,6 @@ ENDPROC(crc_t10dif_pmull64)
.byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
.L16x64perm:
.quad 0x808080800000000, 0x909090901010101

View File

@ -20,6 +20,8 @@
#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len,
u8 out[16]);
static int crct10dif_init(struct shash_desc *desc)
{
@ -45,6 +47,27 @@ static int crct10dif_update_ce(struct shash_desc *desc, const u8 *data,
return 0;
}
static int crct10dif_update_neon(struct shash_desc *desc, const u8 *data,
unsigned int length)
{
u16 *crcp = shash_desc_ctx(desc);
u8 buf[16] __aligned(16);
u16 crc = *crcp;
if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
kernel_neon_begin();
crc_t10dif_pmull8(crc, data, length, buf);
kernel_neon_end();
crc = 0;
data = buf;
length = sizeof(buf);
}
*crcp = crc_t10dif_generic(crc, data, length);
return 0;
}
static int crct10dif_final(struct shash_desc *desc, u8 *out)
{
u16 *crc = shash_desc_ctx(desc);
@ -53,7 +76,19 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out)
return 0;
}
static struct shash_alg crc_t10dif_alg = {
static struct shash_alg algs[] = {{
.digestsize = CRC_T10DIF_DIGEST_SIZE,
.init = crct10dif_init,
.update = crct10dif_update_neon,
.final = crct10dif_final,
.descsize = CRC_T10DIF_DIGEST_SIZE,
.base.cra_name = "crct10dif",
.base.cra_driver_name = "crct10dif-arm-neon",
.base.cra_priority = 150,
.base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
}, {
.digestsize = CRC_T10DIF_DIGEST_SIZE,
.init = crct10dif_init,
.update = crct10dif_update_ce,
@ -65,19 +100,19 @@ static struct shash_alg crc_t10dif_alg = {
.base.cra_priority = 200,
.base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
};
}};
static int __init crc_t10dif_mod_init(void)
{
if (!(elf_hwcap2 & HWCAP2_PMULL))
if (!(elf_hwcap & HWCAP_NEON))
return -ENODEV;
return crypto_register_shash(&crc_t10dif_alg);
return crypto_register_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL));
}
static void __exit crc_t10dif_mod_exit(void)
{
crypto_unregister_shash(&crc_t10dif_alg);
crypto_unregister_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL));
}
module_init(crc_t10dif_mod_init);