f317820cb6
On CPUs with 64-byte last level cache lines, this yields roughly 10% better performance, independent of CPU vendor or specific model (as far as I was able to test). Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: H. Peter Anvin <hpa@zytor.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Link: http://lkml.kernel.org/r/5093E4B802000078000A615E@nat28.tlf.novell.com Signed-off-by: Ingo Molnar <mingo@kernel.org>
28 lines
677 B
C
28 lines
677 B
C
#ifndef _ASM_X86_XOR_64_H
|
|
#define _ASM_X86_XOR_64_H
|
|
|
|
static struct xor_block_template xor_block_sse = {
|
|
.name = "generic_sse",
|
|
.do_2 = xor_sse_2,
|
|
.do_3 = xor_sse_3,
|
|
.do_4 = xor_sse_4,
|
|
.do_5 = xor_sse_5,
|
|
};
|
|
|
|
|
|
/* Also try the AVX routines */
|
|
#include <asm/xor_avx.h>
|
|
|
|
/*
 * We force the use of the SSE xor block because it can write around L2.
 * We may also be able to load into the L1 only, depending on how the CPU
 * deals with a load to a line that is being prefetched.
 *
 * Any earlier definition of XOR_TRY_TEMPLATES is dropped and replaced.
 * The replacement expands AVX_XOR_SPEED first, then passes the SSE
 * templates to xor_speed(): xor_block_sse_pf64 (presumably a 64-byte
 * prefetch variant, defined elsewhere) followed by xor_block_sse.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a
 * single statement, e.g. as the unbraced arm of an if/else.
 */
#undef XOR_TRY_TEMPLATES
#define XOR_TRY_TEMPLATES				\
	do {						\
		AVX_XOR_SPEED;				\
		xor_speed(&xor_block_sse_pf64);		\
		xor_speed(&xor_block_sse);		\
	} while (0)
|
|
|
|
#endif /* _ASM_X86_XOR_64_H */
|