powerpc/net: Implement powerpc specific csum_shift() to remove branch

The current implementation of csum_shift() branches on the parity of
'offset':

	000002f8 <csum_block_add>:
	     2f8:	70 a5 00 01 	andi.   r5,r5,1
	     2fc:	41 a2 00 08 	beq     304 <csum_block_add+0xc>
	     300:	54 84 c0 3e 	rotlwi  r4,r4,24
	     304:	7c 63 20 14 	addc    r3,r3,r4
	     308:	7c 63 01 94 	addze   r3,r3
	     30c:	4e 80 00 20 	blr

Use the least-significant bit of 'offset' directly as the input to the
rotation instead of branching:

	000002f8 <csum_block_add>:
	     2f8:	54 a5 1f 38 	rlwinm  r5,r5,3,28,28
	     2fc:	20 a5 00 20 	subfic  r5,r5,32
	     300:	5c 84 28 3e 	rotlw   r4,r4,r5
	     304:	7c 63 20 14 	addc    r3,r3,r4
	     308:	7c 63 01 94 	addze   r3,r3
	     30c:	4e 80 00 20 	blr

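Also rotate left instead of right, which saves one more instruction.
This has no impact on the final sum: rotating left by 8 differs from
rotating right by 8 only by a 16-bit rotation, which swaps the two
16-bit halves, and those halves are added together when the checksum
is folded to 16 bits.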

	000002f8 <csum_block_add>:
	     2f8:	54 a5 1f 38 	rlwinm  r5,r5,3,28,28
	     2fc:	5c 84 28 3e 	rotlw   r4,r4,r5
	     300:	7c 63 20 14 	addc    r3,r3,r4
	     304:	7c 63 01 94 	addze   r3,r3
	     308:	4e 80 00 20 	blr

It seems that only powerpc benefits from a branchless implementation.
Other major architectures such as ARM or x86 get better code with
the generic implementation and its branch.

Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Christophe Leroy 2022-03-08 17:12:10 +01:00 committed by David S. Miller
parent 8ef1dc4d20
commit 3af722cb73
2 changed files with 9 additions and 0 deletions

View File

@ -112,6 +112,13 @@ static __always_inline __wsum csum_add(__wsum csum, __wsum addend)
#endif #endif
} }
#define HAVE_ARCH_CSUM_SHIFT
static __always_inline __wsum csum_shift(__wsum sum, int offset)
{
	/*
	 * Bit 0 of offset is turned directly into the rotate count: an odd
	 * offset rotates the sum left by 8 bits, an even offset by 0 bits.
	 * Computing the count arithmetically avoids a conditional on the
	 * parity of offset.
	 */
	unsigned int rot = (offset & 1) << 3;
	u32 rotated = rol32((__force u32)sum, rot);

	return (__force __wsum)rotated;
}
/* /*
* This is a version of ip_compute_csum() optimized for IP headers, * This is a version of ip_compute_csum() optimized for IP headers,
* which always checksum on 4 octet boundaries. ihl is the number * which always checksum on 4 octet boundaries. ihl is the number

View File

@ -80,6 +80,7 @@ static __always_inline __sum16 csum16_sub(__sum16 csum, __be16 addend)
return csum16_add(csum, ~addend); return csum16_add(csum, ~addend);
} }
#ifndef HAVE_ARCH_CSUM_SHIFT
static __always_inline __wsum csum_shift(__wsum sum, int offset) static __always_inline __wsum csum_shift(__wsum sum, int offset)
{ {
/* rotate sum to align it with a 16b boundary */ /* rotate sum to align it with a 16b boundary */
@ -87,6 +88,7 @@ static __always_inline __wsum csum_shift(__wsum sum, int offset)
return (__force __wsum)ror32((__force u32)sum, 8); return (__force __wsum)ror32((__force u32)sum, 8);
return sum; return sum;
} }
#endif
static __always_inline __wsum static __always_inline __wsum
csum_block_add(__wsum csum, __wsum csum2, int offset) csum_block_add(__wsum csum, __wsum csum2, int offset)