crypto: arm/crct10dif - Macroify PMULL asm code

To allow an alternative version of the PMULL based CRC-T10DIF algorithm
to be created, turn the bulk of it into a macro, except for the final
reduction, which will only be used by the existing version.

Reviewed-by: Eric Biggers <ebiggers@google.com>
Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Ard Biesheuvel, 2024-11-05 17:09:05 +01:00; committed by Herbert Xu
commit 802d8d110c (parent fcf27785ae)
2 changed files with 83 additions and 81 deletions
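
The rework leans on two GNU assembler features: a macro argument substituted into another macro's name (pmull16x64_\p), so each call site dispatches to a per-variant multiply helper chosen at instantiation time, and \@, which expands to a per-invocation counter so the local labels inside the shared body stay unique however many times the body is instantiated. Below is a minimal standalone sketch of both mechanisms (an editorial illustration, not code from the patch; the register aliases and the demo_fold body are made up for this sketch):

	.syntax	unified
	.arch	armv8-a
	.fpu	crypto-neon-fp-armv8
	.text

	// Register aliases in the style of the real file; the particular
	// D registers are chosen arbitrarily for this sketch.
	FOLD_CONST_L	.req	d20
	FOLD_CONST_H	.req	d21
	q7l		.req	d14
	q7h		.req	d15

	// 64x64 -> 128 carryless multiply-and-fold, as in the patch.
	.macro		pmull16x64_p64, v16, v64
	vmull.p64	q11, \v64\()l, \v16\()_L
	vmull.p64	\v64, \v64\()h, \v16\()_H
	veor		\v64, \v64, q11
	.endm

	// Shared body: \p selects the multiply helper by name, and \@ makes
	// the loop label unique for every instantiation of demo_fold.
	.macro		demo_fold, p
.Lloop\@:
	pmull16x64_\p	FOLD_CONST, q7		// expands to pmull16x64_p64
	subs		r2, r2, #16
	bge		.Lloop\@
	.endm

demo_p64:
	demo_fold	p64			// first instantiation
	demo_fold	p64			// second one assembles too: no label clash
	bx		lr

This is what lets the patch instantiate the whole crct10dif body once per multiplier variant without duplicating the folding code or renaming every label by hand.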

arch/arm/crypto/crct10dif-ce-core.S

@@ -112,48 +112,42 @@
 	FOLD_CONST_L	.req	q10l
 	FOLD_CONST_H	.req	q10h
 
+	.macro		pmull16x64_p64, v16, v64
+	vmull.p64	q11, \v64\()l, \v16\()_L
+	vmull.p64	\v64, \v64\()h, \v16\()_H
+	veor		\v64, \v64, q11
+	.endm
+
 	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
 	// into reg1, reg2.
-	.macro		fold_32_bytes, reg1, reg2
-	vld1.64		{q11-q12}, [buf]!
+	.macro		fold_32_bytes, reg1, reg2, p
+	vld1.64		{q8-q9}, [buf]!
 
-	vmull.p64	q8, \reg1\()h, FOLD_CONST_H
-	vmull.p64	\reg1, \reg1\()l, FOLD_CONST_L
-	vmull.p64	q9, \reg2\()h, FOLD_CONST_H
-	vmull.p64	\reg2, \reg2\()l, FOLD_CONST_L
+	pmull16x64_\p	FOLD_CONST, \reg1
+	pmull16x64_\p	FOLD_CONST, \reg2
 
-CPU_LE(	vrev64.8	q11, q11	)
-CPU_LE(	vrev64.8	q12, q12	)
-	vswp		q11l, q11h
-	vswp		q12l, q12h
+CPU_LE(	vrev64.8	q8, q8	)
+CPU_LE(	vrev64.8	q9, q9	)
+	vswp		q8l, q8h
+	vswp		q9l, q9h
 
 	veor.8		\reg1, \reg1, q8
 	veor.8		\reg2, \reg2, q9
-	veor.8		\reg1, \reg1, q11
-	veor.8		\reg2, \reg2, q12
 	.endm
 
 	// Fold src_reg into dst_reg, optionally loading the next fold constants
-	.macro		fold_16_bytes, src_reg, dst_reg, load_next_consts
-	vmull.p64	q8, \src_reg\()l, FOLD_CONST_L
-	vmull.p64	\src_reg, \src_reg\()h, FOLD_CONST_H
+	.macro		fold_16_bytes, src_reg, dst_reg, p, load_next_consts
+	pmull16x64_\p	FOLD_CONST, \src_reg
 	.ifnb		\load_next_consts
 	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!
 	.endif
-	veor.8		\dst_reg, \dst_reg, q8
 	veor.8		\dst_reg, \dst_reg, \src_reg
 	.endm
 
-//
-// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
-//
-// Assumes len >= 16.
-//
-ENTRY(crc_t10dif_pmull)
+	.macro		crct10dif, p
 	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
 	cmp		len, #256
-	blt		.Lless_than_256_bytes
+	blt		.Lless_than_256_bytes\@
 
 	mov_l		fold_consts_ptr, .Lfold_across_128_bytes_consts
@@ -194,27 +188,27 @@ CPU_LE(	vrev64.8	q7, q7	)
 
 	// While >= 128 data bytes remain (not counting q0-q7), fold the 128
 	// bytes q0-q7 into them, storing the result back into q0-q7.
-.Lfold_128_bytes_loop:
-	fold_32_bytes	q0, q1
-	fold_32_bytes	q2, q3
-	fold_32_bytes	q4, q5
-	fold_32_bytes	q6, q7
+.Lfold_128_bytes_loop\@:
+	fold_32_bytes	q0, q1, \p
+	fold_32_bytes	q2, q3, \p
+	fold_32_bytes	q4, q5, \p
+	fold_32_bytes	q6, q7, \p
 	subs		len, len, #128
-	bge		.Lfold_128_bytes_loop
+	bge		.Lfold_128_bytes_loop\@
 
 	// Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.
 
 	// Fold across 64 bytes.
 	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!
-	fold_16_bytes	q0, q4
-	fold_16_bytes	q1, q5
-	fold_16_bytes	q2, q6
-	fold_16_bytes	q3, q7, 1
+	fold_16_bytes	q0, q4, \p
+	fold_16_bytes	q1, q5, \p
+	fold_16_bytes	q2, q6, \p
+	fold_16_bytes	q3, q7, \p, 1
 	// Fold across 32 bytes.
-	fold_16_bytes	q4, q6
-	fold_16_bytes	q5, q7, 1
+	fold_16_bytes	q4, q6, \p
+	fold_16_bytes	q5, q7, \p, 1
 	// Fold across 16 bytes.
-	fold_16_bytes	q6, q7
+	fold_16_bytes	q6, q7, \p
 
 	// Add 128 to get the correct number of data bytes remaining in 0...127
 	// (not counting q7), following the previous extra subtraction by 128.
@@ -224,25 +218,23 @@ CPU_LE(	vrev64.8	q7, q7	)
 
 	// While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
 	// into them, storing the result back into q7.
-	blt		.Lfold_16_bytes_loop_done
-.Lfold_16_bytes_loop:
-	vmull.p64	q8, q7l, FOLD_CONST_L
-	vmull.p64	q7, q7h, FOLD_CONST_H
-	veor.8		q7, q7, q8
+	blt		.Lfold_16_bytes_loop_done\@
+.Lfold_16_bytes_loop\@:
+	pmull16x64_\p	FOLD_CONST, q7
 	vld1.64		{q0}, [buf]!
 CPU_LE(	vrev64.8	q0, q0	)
 	vswp		q0l, q0h
 	veor.8		q7, q7, q0
 	subs		len, len, #16
-	bge		.Lfold_16_bytes_loop
+	bge		.Lfold_16_bytes_loop\@
 
-.Lfold_16_bytes_loop_done:
+.Lfold_16_bytes_loop_done\@:
 	// Add 16 to get the correct number of data bytes remaining in 0...15
 	// (not counting q7), following the previous extra subtraction by 16.
 	adds		len, len, #16
-	beq		.Lreduce_final_16_bytes
+	beq		.Lreduce_final_16_bytes\@
 
-.Lhandle_partial_segment:
+.Lhandle_partial_segment\@:
 	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
 	// 16 bytes are in q7 and the rest are the remaining data in 'buf'. To
 	// do this without needing a fold constant for each possible 'len',
@@ -277,12 +269,46 @@ CPU_LE(	vrev64.8	q0, q0	)
 	vbsl.8		q2, q1, q0
 
 	// Fold the first chunk into the second chunk, storing the result in q7.
-	vmull.p64	q0, q3l, FOLD_CONST_L
-	vmull.p64	q7, q3h, FOLD_CONST_H
-	veor.8		q7, q7, q0
-	veor.8		q7, q7, q2
+	pmull16x64_\p	FOLD_CONST, q3
+	veor.8		q7, q3, q2
+	b		.Lreduce_final_16_bytes\@
+
+.Lless_than_256_bytes\@:
+	// Checksumming a buffer of length 16...255 bytes
+
+	mov_l		fold_consts_ptr, .Lfold_across_16_bytes_consts
+
+	// Load the first 16 data bytes.
+	vld1.64		{q7}, [buf]!
+CPU_LE(	vrev64.8	q7, q7	)
+	vswp		q7l, q7h
+
+	// XOR the first 16 data *bits* with the initial CRC value.
+	vmov.i8		q0h, #0
+	vmov.u16	q0h[3], init_crc
+	veor.8		q7h, q7h, q0h
+
+	// Load the fold-across-16-bytes constants.
+	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!
+
+	cmp		len, #16
+	beq		.Lreduce_final_16_bytes\@	// len == 16
+	subs		len, len, #32
+	addlt		len, len, #16
+	blt		.Lhandle_partial_segment\@	// 17 <= len <= 31
+	b		.Lfold_16_bytes_loop\@		// 32 <= len <= 255
+
+.Lreduce_final_16_bytes\@:
+	.endm
+
+//
+// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
+//
+// Assumes len >= 16.
+//
+ENTRY(crc_t10dif_pmull64)
+	crct10dif	p64
 
-.Lreduce_final_16_bytes:
 	// Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
 
 	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
@@ -316,31 +342,7 @@ CPU_LE(	vrev64.8	q0, q0	)
 	vmov.u16	r0, q0l[0]
 	bx		lr
-
-.Lless_than_256_bytes:
-	// Checksumming a buffer of length 16...255 bytes
-
-	mov_l		fold_consts_ptr, .Lfold_across_16_bytes_consts
-
-	// Load the first 16 data bytes.
-	vld1.64		{q7}, [buf]!
-CPU_LE(	vrev64.8	q7, q7	)
-	vswp		q7l, q7h
-
-	// XOR the first 16 data *bits* with the initial CRC value.
-	vmov.i8		q0h, #0
-	vmov.u16	q0h[3], init_crc
-	veor.8		q7h, q7h, q0h
-
-	// Load the fold-across-16-bytes constants.
-	vld1.64		{FOLD_CONSTS}, [fold_consts_ptr, :128]!
-
-	cmp		len, #16
-	beq		.Lreduce_final_16_bytes	// len == 16
-	subs		len, len, #32
-	addlt		len, len, #16
-	blt		.Lhandle_partial_segment	// 17 <= len <= 31
-	b		.Lfold_16_bytes_loop	// 32 <= len <= 255
-ENDPROC(crc_t10dif_pmull)
+ENDPROC(crc_t10dif_pmull64)
 
 .section	".rodata", "a"
 	.align		4
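
For context, this is the kind of addition the macroification is meant to enable (an editorial sketch, not part of this commit: the names pmull16x64_p8 and crc_t10dif_pmull8, and the elided bodies, are hypothetical placeholders). An alternative variant would only need to provide its own multiply helper and a second entry point that instantiates the shared body:

	// Hypothetical multiply helper for cores without the 64x64 polynomial
	// multiply; a real version would synthesize it from vmull.p8.
	.macro		pmull16x64_p8, v16, v64
	// ... vmull.p8 based multiply of \v64 by \v16, result left in \v64 ...
	.endm

ENTRY(crc_t10dif_pmull8)
	crct10dif	p8		// same folding body, different multiplier
	// ... variant-specific handling of the final 16 bytes left in q7 ...
ENDPROC(crc_t10dif_pmull8)

Because the final reduction stays outside the macro, only the existing p64 entry point falls through into it; a variant like the one sketched above would finish the last 16 bytes its own way.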

arch/arm/crypto/crct10dif-ce-glue.c

@@ -19,7 +19,7 @@
 #define CRC_T10DIF_PMULL_CHUNK_SIZE	16U
 
-asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
+asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
 
 static int crct10dif_init(struct shash_desc *desc)
 {
@@ -29,14 +29,14 @@ static int crct10dif_init(struct shash_desc *desc)
 	return 0;
 }
 
-static int crct10dif_update(struct shash_desc *desc, const u8 *data,
+static int crct10dif_update_ce(struct shash_desc *desc, const u8 *data,
 			       unsigned int length)
 {
 	u16 *crc = shash_desc_ctx(desc);
 
 	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
 		kernel_neon_begin();
-		*crc = crc_t10dif_pmull(*crc, data, length);
+		*crc = crc_t10dif_pmull64(*crc, data, length);
 		kernel_neon_end();
 	} else {
 		*crc = crc_t10dif_generic(*crc, data, length);
@@ -56,7 +56,7 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out)
 static struct shash_alg crc_t10dif_alg = {
 	.digestsize		= CRC_T10DIF_DIGEST_SIZE,
 	.init			= crct10dif_init,
-	.update			= crct10dif_update,
+	.update			= crct10dif_update_ce,
 	.final			= crct10dif_final,
 	.descsize		= CRC_T10DIF_DIGEST_SIZE,