crypto: arm64/crct10dif - Remove remaining 64x64 PMULL fallback code

The only remaining user of the fallback implementation of 64x64
polynomial multiplication using 8x8 PMULL instructions is the final
reduction from a 16-byte vector to a 16-bit CRC.

The fallback code is complicated and messy, and this reduction has
little impact on overall performance, so instead, let's calculate the
final CRC by passing the 16-byte vector to the generic CRC-T10DIF
implementation when running the fallback version.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Author: Ard Biesheuvel <ardb@kernel.org>
Date:   2024-11-05 17:09:03 +01:00
Committed-by: Herbert Xu <herbert@gondor.apana.org.au>
commit 779cee8209 (parent 67dfb1b73f)
2 changed files with 68 additions and 194 deletions
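
For context: the generic implementation that now finishes the fallback path is a plain CRC-16 over GF(2) with the T10-DIF polynomial G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1 (0x8bb7), MSB first, zero seed, no reflection. A minimal bit-at-a-time model of what crc_t10dif_generic() computes (the kernel version is table-driven, but the function is the same):

	#include <stdint.h>
	#include <stddef.h>

	/* CRC-T10DIF: poly 0x8bb7, init 0, MSB first, no reflection, no xorout. */
	static uint16_t crc_t10dif_bitwise(uint16_t crc, const uint8_t *buf, size_t len)
	{
		while (len--) {
			crc ^= (uint16_t)(*buf++) << 8;
			for (int i = 0; i < 8; i++)
				crc = (crc & 0x8000) ? (uint16_t)((crc << 1) ^ 0x8bb7)
						     : (uint16_t)(crc << 1);
		}
		return crc;	/* standard check: crc over "123456789" is 0xd0db */
	}

Because the NEON code folds the buffer down to a 16-byte value that stays congruent to the original message polynomial modulo G(x), with the seed already folded in, running this function over the residue with a zero seed produces the same CRC as running it over the whole input.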

arch/arm64/crypto/crct10dif-ce-core.S

@@ -74,137 +74,18 @@
 	init_crc	.req	w0
 	buf		.req	x1
 	len		.req	x2
-	fold_consts_ptr	.req	x3
+	fold_consts_ptr	.req	x5
 	fold_consts	.req	v10
 
-	ad		.req	v14
-
-	k00_16		.req	v15
-	k32_48		.req	v16
-
 	t3		.req	v17
 	t4		.req	v18
 	t5		.req	v19
 	t6		.req	v20
 	t7		.req	v21
 	t8		.req	v22
-	t9		.req	v23
 
-	perm1		.req	v24
-	perm2		.req	v25
-	perm3		.req	v26
-	perm4		.req	v27
+	perm		.req	v27
 
-	bd1		.req	v28
-	bd2		.req	v29
-	bd3		.req	v30
-	bd4		.req	v31
-
-	.macro		__pmull_init_p64
-	.endm
-
-	.macro		__pmull_pre_p64, bd
-	.endm
-
-	.macro		__pmull_init_p8
-	// k00_16 := 0x0000000000000000_000000000000ffff
-	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
-	movi		k32_48.2d, #0xffffffff
-	mov		k32_48.h[2], k32_48.h[0]
-	ushr		k00_16.2d, k32_48.2d, #32
-
-	// prepare the permutation vectors
-	mov_q		x5, 0x080f0e0d0c0b0a09
-	movi		perm4.8b, #8
-	dup		perm1.2d, x5
-	eor		perm1.16b, perm1.16b, perm4.16b
-	ushr		perm2.2d, perm1.2d, #8
-	ushr		perm3.2d, perm1.2d, #16
-	ushr		perm4.2d, perm1.2d, #24
-	sli		perm2.2d, perm1.2d, #56
-	sli		perm3.2d, perm1.2d, #48
-	sli		perm4.2d, perm1.2d, #40
-
-	// Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
-	movi		bd1.4h, #8, lsl #8
-	orr		bd1.2s, #1, lsl #16
-	orr		bd1.2s, #1, lsl #24
-	zip1		bd1.16b, bd1.16b, bd1.16b
-	zip1		bd1.16b, bd1.16b, bd1.16b
-	.endm
-
-	.macro		__pmull_pre_p8, bd
-	tbl		bd1.16b, {\bd\().16b}, perm1.16b
-	tbl		bd2.16b, {\bd\().16b}, perm2.16b
-	tbl		bd3.16b, {\bd\().16b}, perm3.16b
-	tbl		bd4.16b, {\bd\().16b}, perm4.16b
-	.endm
-
-SYM_FUNC_START_LOCAL(__pmull_p8_core)
-.L__pmull_p8_core:
-	ext		t4.8b, ad.8b, ad.8b, #1			// A1
-	ext		t5.8b, ad.8b, ad.8b, #2			// A2
-	ext		t6.8b, ad.8b, ad.8b, #3			// A3
-
-	pmull		t4.8h, t4.8b, fold_consts.8b		// F = A1*B
-	pmull		t8.8h, ad.8b, bd1.8b			// E = A*B1
-	pmull		t5.8h, t5.8b, fold_consts.8b		// H = A2*B
-	pmull		t7.8h, ad.8b, bd2.8b			// G = A*B2
-	pmull		t6.8h, t6.8b, fold_consts.8b		// J = A3*B
-	pmull		t9.8h, ad.8b, bd3.8b			// I = A*B3
-	pmull		t3.8h, ad.8b, bd4.8b			// K = A*B4
-	b		0f
-
-.L__pmull_p8_core2:
-	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
-	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
-	tbl		t6.16b, {ad.16b}, perm3.16b		// A3
-
-	pmull2		t4.8h, t4.16b, fold_consts.16b		// F = A1*B
-	pmull2		t8.8h, ad.16b, bd1.16b			// E = A*B1
-	pmull2		t5.8h, t5.16b, fold_consts.16b		// H = A2*B
-	pmull2		t7.8h, ad.16b, bd2.16b			// G = A*B2
-	pmull2		t6.8h, t6.16b, fold_consts.16b		// J = A3*B
-	pmull2		t9.8h, ad.16b, bd3.16b			// I = A*B3
-	pmull2		t3.8h, ad.16b, bd4.16b			// K = A*B4
-
-0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
-	eor		t5.16b, t5.16b, t7.16b			// M = G + H
-	eor		t6.16b, t6.16b, t9.16b			// N = I + J
-
-	uzp1		t8.2d, t4.2d, t5.2d
-	uzp2		t4.2d, t4.2d, t5.2d
-	uzp1		t7.2d, t6.2d, t3.2d
-	uzp2		t6.2d, t6.2d, t3.2d
-
-	// t4 = (L) (P0 + P1) << 8
-	// t5 = (M) (P2 + P3) << 16
-	eor		t8.16b, t8.16b, t4.16b
-	and		t4.16b, t4.16b, k32_48.16b
-
-	// t6 = (N) (P4 + P5) << 24
-	// t7 = (K) (P6 + P7) << 32
-	eor		t7.16b, t7.16b, t6.16b
-	and		t6.16b, t6.16b, k00_16.16b
-
-	eor		t8.16b, t8.16b, t4.16b
-	eor		t7.16b, t7.16b, t6.16b
-
-	zip2		t5.2d, t8.2d, t4.2d
-	zip1		t4.2d, t8.2d, t4.2d
-	zip2		t3.2d, t7.2d, t6.2d
-	zip1		t6.2d, t7.2d, t6.2d
-
-	ext		t4.16b, t4.16b, t4.16b, #15
-	ext		t5.16b, t5.16b, t5.16b, #14
-	ext		t6.16b, t6.16b, t6.16b, #13
-	ext		t3.16b, t3.16b, t3.16b, #12
-
-	eor		t4.16b, t4.16b, t5.16b
-	eor		t6.16b, t6.16b, t3.16b
-	ret
-SYM_FUNC_END(__pmull_p8_core)
-
 	.macro		pmull16x64_p64, a16, b64, c64
 	pmull2		\c64\().1q, \a16\().2d, \b64\().2d
@@ -266,7 +147,7 @@ SYM_FUNC_END(__pmull_p8_core)
 	 */
 	.macro		pmull16x64_p8, a16, b64, c64
 	ext		t7.16b, \b64\().16b, \b64\().16b, #1
-	tbl		t5.16b, {\a16\().16b}, bd1.16b
+	tbl		t5.16b, {\a16\().16b}, perm.16b
 	uzp1		t7.16b, \b64\().16b, t7.16b
 	bl		__pmull_p8_16x64
 	ext		\b64\().16b, t4.16b, t4.16b, #15
@@ -292,22 +173,6 @@ SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
 	ret
 SYM_FUNC_END(__pmull_p8_16x64)
 
-	.macro		__pmull_p8, rq, ad, bd, i
-	.ifnc		\bd, fold_consts
-	.err
-	.endif
-	mov		ad.16b, \ad\().16b
-	.ifb		\i
-	pmull		\rq\().8h, \ad\().8b, \bd\().8b		// D = A*B
-	.else
-	pmull2		\rq\().8h, \ad\().16b, \bd\().16b	// D = A*B
-	.endif
-	bl		.L__pmull_p8_core\i
-	eor		\rq\().16b, \rq\().16b, t4.16b
-	eor		\rq\().16b, \rq\().16b, t6.16b
-	.endm
-
 	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
 	// into reg1, reg2.
@@ -340,16 +205,7 @@ CPU_LE(	ext	v12.16b, v12.16b, v12.16b, #8	)
 	eor		\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
 	.endm
 
-	.macro		__pmull_p64, rd, rn, rm, n
-	.ifb		\n
-	pmull		\rd\().1q, \rn\().1d, \rm\().1d
-	.else
-	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
-	.endif
-	.endm
-
 	.macro		crc_t10dif_pmull, p
-	__pmull_init_\p
 
 	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
 	cmp		len, #256
@@ -479,47 +335,7 @@ CPU_LE(	ext	v0.16b, v0.16b, v0.16b, #8	)
 	pmull16x64_\p	fold_consts, v3, v0
 	eor		v7.16b, v3.16b, v0.16b
 	eor		v7.16b, v7.16b, v2.16b
-
-.Lreduce_final_16_bytes_\@:
-	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
-
-	movi		v2.16b, #0		// init zero register
-
-	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
-	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
-	__pmull_pre_\p	fold_consts
-
-	// Fold the high 64 bits into the low 64 bits, while also multiplying by
-	// x^64. This produces a 128-bit value congruent to x^64 * M(x) and
-	// whose low 48 bits are 0.
-	ext		v0.16b, v2.16b, v7.16b, #8
-	__pmull_\p	v7, v7, fold_consts, 2	// high bits * x^48 * (x^80 mod G(x))
-	eor		v0.16b, v0.16b, v7.16b	// + low bits * x^64
-
-	// Fold the high 32 bits into the low 96 bits. This produces a 96-bit
-	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
-	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
-	mov		v0.s[3], v2.s[0]	// zero high 32 bits
-	__pmull_\p	v1, v1, fold_consts	// high 32 bits * x^48 * (x^48 mod G(x))
-	eor		v0.16b, v0.16b, v1.16b	// + low bits
-
-	// Load G(x) and floor(x^48 / G(x)).
-	ld1		{fold_consts.2d}, [fold_consts_ptr]
-	__pmull_pre_\p	fold_consts
-
-	// Use Barrett reduction to compute the final CRC value.
-	__pmull_\p	v1, v0, fold_consts, 2	// high 32 bits * floor(x^48 / G(x))
-	ushr		v1.2d, v1.2d, #32	// /= x^32
-	__pmull_\p	v1, v1, fold_consts	// *= G(x)
-
-	ushr		v0.2d, v0.2d, #48
-	eor		v0.16b, v0.16b, v1.16b	// + low 16 nonzero bits
-	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
-	umov		w0, v0.h[0]
-
-	.ifc		\p, p8
-	frame_pop
-	.endif
-	ret
+	b		.Lreduce_final_16_bytes_\@
 
 .Lless_than_256_bytes_\@:
 	// Checksumming a buffer of length 16...255 bytes
@@ -545,6 +361,8 @@ CPU_LE(	ext	v7.16b, v7.16b, v7.16b, #8	)
 	b.ge		.Lfold_16_bytes_loop_\@		// 32 <= len <= 255
 
 	add		len, len, #16
 	b		.Lhandle_partial_segment_\@	// 17 <= len <= 31
+
+.Lreduce_final_16_bytes_\@:
 	.endm
 
 //
@@ -554,7 +372,22 @@
 //
 SYM_FUNC_START(crc_t10dif_pmull_p8)
 	frame_push	1
+
+	// Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
+	movi		perm.4h, #8, lsl #8
+	orr		perm.2s, #1, lsl #16
+	orr		perm.2s, #1, lsl #24
+	zip1		perm.16b, perm.16b, perm.16b
+	zip1		perm.16b, perm.16b, perm.16b
+
 	crc_t10dif_pmull p8
+
+CPU_LE(	rev64		v7.16b, v7.16b			)
+CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
+	str		q7, [x3]
+
+	frame_pop
+	ret
 SYM_FUNC_END(crc_t10dif_pmull_p8)
 
 	.align		5
@@ -565,6 +398,41 @@ SYM_FUNC_END(crc_t10dif_pmull_p8)
 //
 SYM_FUNC_START(crc_t10dif_pmull_p64)
 	crc_t10dif_pmull p64
+
+	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.
+
+	movi		v2.16b, #0		// init zero register
+
+	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
+	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
+
+	// Fold the high 64 bits into the low 64 bits, while also multiplying by
+	// x^64. This produces a 128-bit value congruent to x^64 * M(x) and
+	// whose low 48 bits are 0.
+	ext		v0.16b, v2.16b, v7.16b, #8
+	pmull2		v7.1q, v7.2d, fold_consts.2d	// high bits * x^48 * (x^80 mod G(x))
+	eor		v0.16b, v0.16b, v7.16b		// + low bits * x^64
+
+	// Fold the high 32 bits into the low 96 bits. This produces a 96-bit
+	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
+	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
+	mov		v0.s[3], v2.s[0]		// zero high 32 bits
+	pmull		v1.1q, v1.1d, fold_consts.1d	// high 32 bits * x^48 * (x^48 mod G(x))
+	eor		v0.16b, v0.16b, v1.16b		// + low bits
+
+	// Load G(x) and floor(x^48 / G(x)).
+	ld1		{fold_consts.2d}, [fold_consts_ptr]
+
+	// Use Barrett reduction to compute the final CRC value.
+	pmull2		v1.1q, v0.2d, fold_consts.2d	// high 32 bits * floor(x^48 / G(x))
+	ushr		v1.2d, v1.2d, #32		// /= x^32
+	pmull		v1.1q, v1.1d, fold_consts.1d	// *= G(x)
+
+	ushr		v0.2d, v0.2d, #48
+	eor		v0.16b, v0.16b, v1.16b		// + low 16 nonzero bits
+	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.
+	umov		w0, v0.h[0]
+	ret
 SYM_FUNC_END(crc_t10dif_pmull_p64)
 
 	.section	".rodata", "a"
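
The Barrett step above is the standard trick for dividing by G(x) without a division: precompute mu = floor(x^(2n) / G(x)); then for any V(x) of degree below 2n, floor(V/G) = floor(floor(V/x^n) * mu / x^n), and the remainder follows from one more carry-less multiply (addition and subtraction coincide in GF(2)). A scalar C sketch of the same identity with n = 16, reducing a 32-bit polynomial directly rather than through the x^48-scaled constants the assembly loads (clmul32() and polymod() are illustrative helpers, not kernel APIs):

	#include <stdint.h>
	#include <stdio.h>

	/* Carry-less (GF(2)) multiply of two 32-bit polynomials. */
	static uint64_t clmul32(uint32_t a, uint32_t b)
	{
		uint64_t r = 0;

		for (int i = 0; i < 32; i++)
			if ((b >> i) & 1)
				r ^= (uint64_t)a << i;
		return r;
	}

	/* Reference: remainder mod G(x) by shift-and-subtract long division. */
	static uint16_t polymod(uint32_t v)
	{
		for (int i = 31; i >= 16; i--)
			if ((v >> i) & 1)
				v ^= (uint32_t)0x18bb7 << (i - 16);
		return (uint16_t)v;
	}

	int main(void)
	{
		const uint32_t G = 0x18bb7;	/* G(x), x^16 term included */
		uint64_t rem = 1ULL << 32;	/* x^32 */
		uint32_t mu = 0;

		/* mu = floor(x^32 / G(x)), by polynomial long division. */
		for (int i = 16; i >= 0; i--)
			if ((rem >> (i + 16)) & 1) {
				mu |= 1U << i;
				rem ^= (uint64_t)G << i;
			}

		uint32_t v = 0xdeadbeef;	/* any 32-bit polynomial */
		uint32_t q = (uint32_t)(clmul32(v >> 16, mu) >> 16);	/* floor(V/G) */
		uint16_t r = (uint16_t)(v ^ clmul32(q, G));	/* high bits cancel */

		printf("barrett %04x, long division %04x\n", r, polymod(v));
		return 0;
	}

The assembly performs the same computation with pmull/pmull2 as the carry-less multiplies, staged against x^48 so that each step operates on a full 64-bit limb.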

arch/arm64/crypto/crct10dif-ce-glue.c

@@ -20,7 +20,8 @@
 
 #define CRC_T10DIF_PMULL_CHUNK_SIZE	16U
 
-asmlinkage u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
+asmlinkage void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
+				    u8 out[16]);
 asmlinkage u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
 
 static int crct10dif_init(struct shash_desc *desc)
@@ -34,16 +35,21 @@ static int crct10dif_init(struct shash_desc *desc)
 static int crct10dif_update_pmull_p8(struct shash_desc *desc, const u8 *data,
 				     unsigned int length)
 {
-	u16 *crc = shash_desc_ctx(desc);
+	u16 *crcp = shash_desc_ctx(desc);
+	u16 crc = *crcp;
+	u8 buf[16];
 
-	if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
+	if (length > CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
 		kernel_neon_begin();
-		*crc = crc_t10dif_pmull_p8(*crc, data, length);
+		crc_t10dif_pmull_p8(crc, data, length, buf);
 		kernel_neon_end();
-	} else {
-		*crc = crc_t10dif_generic(*crc, data, length);
+
+		crc = 0;
+		data = buf;
+		length = sizeof(buf);
 	}
 
+	*crcp = crc_t10dif_generic(crc, data, length);
 	return 0;
 }
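
The net effect of the glue change is easier to see with the kernel plumbing stripped away. A hypothetical host-side restatement of crct10dif_update_pmull_p8() after this patch (crct10dif_update_p8_model() and its bool parameter are illustrative; kernel_neon_begin()/end() and the shash descriptor handling are omitted):

	#include <stdint.h>
	#include <stdbool.h>
	#include <stddef.h>

	/* Prototypes as declared in the glue file, transliterated to stdint types. */
	void crc_t10dif_pmull_p8(uint16_t init_crc, const uint8_t *buf, size_t len,
				 uint8_t out[16]);
	uint16_t crc_t10dif_generic(uint16_t crc, const uint8_t *data, size_t len);

	static uint16_t crct10dif_update_p8_model(uint16_t crc, const uint8_t *data,
						  size_t length, bool simd_usable)
	{
		uint8_t buf[16];

		if (length > 16 /* CRC_T10DIF_PMULL_CHUNK_SIZE */ && simd_usable) {
			/* NEON: fold (crc, data) down to a 16-byte residue that
			 * stays congruent to the message modulo G(x). */
			crc_t10dif_pmull_p8(crc, data, length, buf);

			/* The seed is already mixed into the residue. */
			crc = 0;
			data = buf;
			length = sizeof(buf);
		}

		/* The generic table-driven code finishes (or fully handles) the job. */
		return crc_t10dif_generic(crc, data, length);
	}

Note that the guard also changed from >= to >: at exactly 16 bytes the NEON helper could only hand back another 16-byte residue for the generic code to process, so calling crc_t10dif_generic() directly is strictly cheaper there.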