mirror of
https://github.com/torvalds/linux.git
synced 2024-11-24 21:21:41 +00:00
crypto: arm/crct10dif - Macroify PMULL asm code
To allow an alternative version of the PMULL-based CRC-T10DIF algorithm to be created, turn the bulk of it into a macro, except for the final reduction, which will only be used by the existing version. Reviewed-by: Eric Biggers <ebiggers@google.com> Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
This commit is contained in:
parent
fcf27785ae
commit
802d8d110c
@ -112,48 +112,42 @@
|
|||||||
FOLD_CONST_L .req q10l
|
FOLD_CONST_L .req q10l
|
||||||
FOLD_CONST_H .req q10h
|
FOLD_CONST_H .req q10h
|
||||||
|
|
||||||
|
.macro pmull16x64_p64, v16, v64
|
||||||
|
vmull.p64 q11, \v64\()l, \v16\()_L
|
||||||
|
vmull.p64 \v64, \v64\()h, \v16\()_H
|
||||||
|
veor \v64, \v64, q11
|
||||||
|
.endm
|
||||||
|
|
||||||
// Fold reg1, reg2 into the next 32 data bytes, storing the result back
|
// Fold reg1, reg2 into the next 32 data bytes, storing the result back
|
||||||
// into reg1, reg2.
|
// into reg1, reg2.
|
||||||
.macro fold_32_bytes, reg1, reg2
|
.macro fold_32_bytes, reg1, reg2, p
|
||||||
vld1.64 {q11-q12}, [buf]!
|
vld1.64 {q8-q9}, [buf]!
|
||||||
|
|
||||||
vmull.p64 q8, \reg1\()h, FOLD_CONST_H
|
pmull16x64_\p FOLD_CONST, \reg1
|
||||||
vmull.p64 \reg1, \reg1\()l, FOLD_CONST_L
|
pmull16x64_\p FOLD_CONST, \reg2
|
||||||
vmull.p64 q9, \reg2\()h, FOLD_CONST_H
|
|
||||||
vmull.p64 \reg2, \reg2\()l, FOLD_CONST_L
|
|
||||||
|
|
||||||
CPU_LE( vrev64.8 q11, q11 )
|
CPU_LE( vrev64.8 q8, q8 )
|
||||||
CPU_LE( vrev64.8 q12, q12 )
|
CPU_LE( vrev64.8 q9, q9 )
|
||||||
vswp q11l, q11h
|
vswp q8l, q8h
|
||||||
vswp q12l, q12h
|
vswp q9l, q9h
|
||||||
|
|
||||||
veor.8 \reg1, \reg1, q8
|
veor.8 \reg1, \reg1, q8
|
||||||
veor.8 \reg2, \reg2, q9
|
veor.8 \reg2, \reg2, q9
|
||||||
veor.8 \reg1, \reg1, q11
|
|
||||||
veor.8 \reg2, \reg2, q12
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
// Fold src_reg into dst_reg, optionally loading the next fold constants
|
// Fold src_reg into dst_reg, optionally loading the next fold constants
|
||||||
.macro fold_16_bytes, src_reg, dst_reg, load_next_consts
|
.macro fold_16_bytes, src_reg, dst_reg, p, load_next_consts
|
||||||
vmull.p64 q8, \src_reg\()l, FOLD_CONST_L
|
pmull16x64_\p FOLD_CONST, \src_reg
|
||||||
vmull.p64 \src_reg, \src_reg\()h, FOLD_CONST_H
|
|
||||||
.ifnb \load_next_consts
|
.ifnb \load_next_consts
|
||||||
vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
|
vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
|
||||||
.endif
|
.endif
|
||||||
veor.8 \dst_reg, \dst_reg, q8
|
|
||||||
veor.8 \dst_reg, \dst_reg, \src_reg
|
veor.8 \dst_reg, \dst_reg, \src_reg
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
//
|
.macro crct10dif, p
|
||||||
// u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
|
|
||||||
//
|
|
||||||
// Assumes len >= 16.
|
|
||||||
//
|
|
||||||
ENTRY(crc_t10dif_pmull)
|
|
||||||
|
|
||||||
// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
|
// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
|
||||||
cmp len, #256
|
cmp len, #256
|
||||||
blt .Lless_than_256_bytes
|
blt .Lless_than_256_bytes\@
|
||||||
|
|
||||||
mov_l fold_consts_ptr, .Lfold_across_128_bytes_consts
|
mov_l fold_consts_ptr, .Lfold_across_128_bytes_consts
|
||||||
|
|
||||||
@ -194,27 +188,27 @@ CPU_LE( vrev64.8 q7, q7 )
|
|||||||
|
|
||||||
// While >= 128 data bytes remain (not counting q0-q7), fold the 128
|
// While >= 128 data bytes remain (not counting q0-q7), fold the 128
|
||||||
// bytes q0-q7 into them, storing the result back into q0-q7.
|
// bytes q0-q7 into them, storing the result back into q0-q7.
|
||||||
.Lfold_128_bytes_loop:
|
.Lfold_128_bytes_loop\@:
|
||||||
fold_32_bytes q0, q1
|
fold_32_bytes q0, q1, \p
|
||||||
fold_32_bytes q2, q3
|
fold_32_bytes q2, q3, \p
|
||||||
fold_32_bytes q4, q5
|
fold_32_bytes q4, q5, \p
|
||||||
fold_32_bytes q6, q7
|
fold_32_bytes q6, q7, \p
|
||||||
subs len, len, #128
|
subs len, len, #128
|
||||||
bge .Lfold_128_bytes_loop
|
bge .Lfold_128_bytes_loop\@
|
||||||
|
|
||||||
// Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.
|
// Now fold the 112 bytes in q0-q6 into the 16 bytes in q7.
|
||||||
|
|
||||||
// Fold across 64 bytes.
|
// Fold across 64 bytes.
|
||||||
vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
|
vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
|
||||||
fold_16_bytes q0, q4
|
fold_16_bytes q0, q4, \p
|
||||||
fold_16_bytes q1, q5
|
fold_16_bytes q1, q5, \p
|
||||||
fold_16_bytes q2, q6
|
fold_16_bytes q2, q6, \p
|
||||||
fold_16_bytes q3, q7, 1
|
fold_16_bytes q3, q7, \p, 1
|
||||||
// Fold across 32 bytes.
|
// Fold across 32 bytes.
|
||||||
fold_16_bytes q4, q6
|
fold_16_bytes q4, q6, \p
|
||||||
fold_16_bytes q5, q7, 1
|
fold_16_bytes q5, q7, \p, 1
|
||||||
// Fold across 16 bytes.
|
// Fold across 16 bytes.
|
||||||
fold_16_bytes q6, q7
|
fold_16_bytes q6, q7, \p
|
||||||
|
|
||||||
// Add 128 to get the correct number of data bytes remaining in 0...127
|
// Add 128 to get the correct number of data bytes remaining in 0...127
|
||||||
// (not counting q7), following the previous extra subtraction by 128.
|
// (not counting q7), following the previous extra subtraction by 128.
|
||||||
@ -224,25 +218,23 @@ CPU_LE( vrev64.8 q7, q7 )
|
|||||||
|
|
||||||
// While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
|
// While >= 16 data bytes remain (not counting q7), fold the 16 bytes q7
|
||||||
// into them, storing the result back into q7.
|
// into them, storing the result back into q7.
|
||||||
blt .Lfold_16_bytes_loop_done
|
blt .Lfold_16_bytes_loop_done\@
|
||||||
.Lfold_16_bytes_loop:
|
.Lfold_16_bytes_loop\@:
|
||||||
vmull.p64 q8, q7l, FOLD_CONST_L
|
pmull16x64_\p FOLD_CONST, q7
|
||||||
vmull.p64 q7, q7h, FOLD_CONST_H
|
|
||||||
veor.8 q7, q7, q8
|
|
||||||
vld1.64 {q0}, [buf]!
|
vld1.64 {q0}, [buf]!
|
||||||
CPU_LE( vrev64.8 q0, q0 )
|
CPU_LE( vrev64.8 q0, q0 )
|
||||||
vswp q0l, q0h
|
vswp q0l, q0h
|
||||||
veor.8 q7, q7, q0
|
veor.8 q7, q7, q0
|
||||||
subs len, len, #16
|
subs len, len, #16
|
||||||
bge .Lfold_16_bytes_loop
|
bge .Lfold_16_bytes_loop\@
|
||||||
|
|
||||||
.Lfold_16_bytes_loop_done:
|
.Lfold_16_bytes_loop_done\@:
|
||||||
// Add 16 to get the correct number of data bytes remaining in 0...15
|
// Add 16 to get the correct number of data bytes remaining in 0...15
|
||||||
// (not counting q7), following the previous extra subtraction by 16.
|
// (not counting q7), following the previous extra subtraction by 16.
|
||||||
adds len, len, #16
|
adds len, len, #16
|
||||||
beq .Lreduce_final_16_bytes
|
beq .Lreduce_final_16_bytes\@
|
||||||
|
|
||||||
.Lhandle_partial_segment:
|
.Lhandle_partial_segment\@:
|
||||||
// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
|
// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
|
||||||
// 16 bytes are in q7 and the rest are the remaining data in 'buf'. To
|
// 16 bytes are in q7 and the rest are the remaining data in 'buf'. To
|
||||||
// do this without needing a fold constant for each possible 'len',
|
// do this without needing a fold constant for each possible 'len',
|
||||||
@ -277,12 +269,46 @@ CPU_LE( vrev64.8 q0, q0 )
|
|||||||
vbsl.8 q2, q1, q0
|
vbsl.8 q2, q1, q0
|
||||||
|
|
||||||
// Fold the first chunk into the second chunk, storing the result in q7.
|
// Fold the first chunk into the second chunk, storing the result in q7.
|
||||||
vmull.p64 q0, q3l, FOLD_CONST_L
|
pmull16x64_\p FOLD_CONST, q3
|
||||||
vmull.p64 q7, q3h, FOLD_CONST_H
|
veor.8 q7, q3, q2
|
||||||
veor.8 q7, q7, q0
|
b .Lreduce_final_16_bytes\@
|
||||||
veor.8 q7, q7, q2
|
|
||||||
|
.Lless_than_256_bytes\@:
|
||||||
|
// Checksumming a buffer of length 16...255 bytes
|
||||||
|
|
||||||
|
mov_l fold_consts_ptr, .Lfold_across_16_bytes_consts
|
||||||
|
|
||||||
|
// Load the first 16 data bytes.
|
||||||
|
vld1.64 {q7}, [buf]!
|
||||||
|
CPU_LE( vrev64.8 q7, q7 )
|
||||||
|
vswp q7l, q7h
|
||||||
|
|
||||||
|
// XOR the first 16 data *bits* with the initial CRC value.
|
||||||
|
vmov.i8 q0h, #0
|
||||||
|
vmov.u16 q0h[3], init_crc
|
||||||
|
veor.8 q7h, q7h, q0h
|
||||||
|
|
||||||
|
// Load the fold-across-16-bytes constants.
|
||||||
|
vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
|
||||||
|
|
||||||
|
cmp len, #16
|
||||||
|
beq .Lreduce_final_16_bytes\@ // len == 16
|
||||||
|
subs len, len, #32
|
||||||
|
addlt len, len, #16
|
||||||
|
blt .Lhandle_partial_segment\@ // 17 <= len <= 31
|
||||||
|
b .Lfold_16_bytes_loop\@ // 32 <= len <= 255
|
||||||
|
|
||||||
|
.Lreduce_final_16_bytes\@:
|
||||||
|
.endm
|
||||||
|
|
||||||
|
//
|
||||||
|
// u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
|
||||||
|
//
|
||||||
|
// Assumes len >= 16.
|
||||||
|
//
|
||||||
|
ENTRY(crc_t10dif_pmull64)
|
||||||
|
crct10dif p64
|
||||||
|
|
||||||
.Lreduce_final_16_bytes:
|
|
||||||
// Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
|
// Reduce the 128-bit value M(x), stored in q7, to the final 16-bit CRC.
|
||||||
|
|
||||||
// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
|
// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
|
||||||
@ -316,31 +342,7 @@ CPU_LE( vrev64.8 q0, q0 )
|
|||||||
vmov.u16 r0, q0l[0]
|
vmov.u16 r0, q0l[0]
|
||||||
bx lr
|
bx lr
|
||||||
|
|
||||||
.Lless_than_256_bytes:
|
ENDPROC(crc_t10dif_pmull64)
|
||||||
// Checksumming a buffer of length 16...255 bytes
|
|
||||||
|
|
||||||
mov_l fold_consts_ptr, .Lfold_across_16_bytes_consts
|
|
||||||
|
|
||||||
// Load the first 16 data bytes.
|
|
||||||
vld1.64 {q7}, [buf]!
|
|
||||||
CPU_LE( vrev64.8 q7, q7 )
|
|
||||||
vswp q7l, q7h
|
|
||||||
|
|
||||||
// XOR the first 16 data *bits* with the initial CRC value.
|
|
||||||
vmov.i8 q0h, #0
|
|
||||||
vmov.u16 q0h[3], init_crc
|
|
||||||
veor.8 q7h, q7h, q0h
|
|
||||||
|
|
||||||
// Load the fold-across-16-bytes constants.
|
|
||||||
vld1.64 {FOLD_CONSTS}, [fold_consts_ptr, :128]!
|
|
||||||
|
|
||||||
cmp len, #16
|
|
||||||
beq .Lreduce_final_16_bytes // len == 16
|
|
||||||
subs len, len, #32
|
|
||||||
addlt len, len, #16
|
|
||||||
blt .Lhandle_partial_segment // 17 <= len <= 31
|
|
||||||
b .Lfold_16_bytes_loop // 32 <= len <= 255
|
|
||||||
ENDPROC(crc_t10dif_pmull)
|
|
||||||
|
|
||||||
.section ".rodata", "a"
|
.section ".rodata", "a"
|
||||||
.align 4
|
.align 4
|
||||||
|
@ -19,7 +19,7 @@
|
|||||||
|
|
||||||
#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
|
#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
|
||||||
|
|
||||||
asmlinkage u16 crc_t10dif_pmull(u16 init_crc, const u8 *buf, size_t len);
|
asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
|
||||||
|
|
||||||
static int crct10dif_init(struct shash_desc *desc)
|
static int crct10dif_init(struct shash_desc *desc)
|
||||||
{
|
{
|
||||||
@ -29,14 +29,14 @@ static int crct10dif_init(struct shash_desc *desc)
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
static int crct10dif_update(struct shash_desc *desc, const u8 *data,
|
static int crct10dif_update_ce(struct shash_desc *desc, const u8 *data,
|
||||||
unsigned int length)
|
unsigned int length)
|
||||||
{
|
{
|
||||||
u16 *crc = shash_desc_ctx(desc);
|
u16 *crc = shash_desc_ctx(desc);
|
||||||
|
|
||||||
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
|
if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
|
||||||
kernel_neon_begin();
|
kernel_neon_begin();
|
||||||
*crc = crc_t10dif_pmull(*crc, data, length);
|
*crc = crc_t10dif_pmull64(*crc, data, length);
|
||||||
kernel_neon_end();
|
kernel_neon_end();
|
||||||
} else {
|
} else {
|
||||||
*crc = crc_t10dif_generic(*crc, data, length);
|
*crc = crc_t10dif_generic(*crc, data, length);
|
||||||
@ -56,7 +56,7 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out)
|
|||||||
static struct shash_alg crc_t10dif_alg = {
|
static struct shash_alg crc_t10dif_alg = {
|
||||||
.digestsize = CRC_T10DIF_DIGEST_SIZE,
|
.digestsize = CRC_T10DIF_DIGEST_SIZE,
|
||||||
.init = crct10dif_init,
|
.init = crct10dif_init,
|
||||||
.update = crct10dif_update,
|
.update = crct10dif_update_ce,
|
||||||
.final = crct10dif_final,
|
.final = crct10dif_final,
|
||||||
.descsize = CRC_T10DIF_DIGEST_SIZE,
|
.descsize = CRC_T10DIF_DIGEST_SIZE,
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user