linux/tools/arch/x86/lib/memset_64.S
Arnaldo Carvalho de Melo efe80f9c90 tools headers: Update the copy of x86's mem{cpy,set}_64.S used in 'perf bench'
This is to get the changes from:

  94ea9c0521 ("x86/headers: Replace #include <asm/export.h> with #include <linux/export.h>")
  10f4c9b9a3 ("x86/asm: Fix build of UML with KASAN")

That addresses these perf tools build warning:

  Warning: Kernel ABI header differences:
    diff -u tools/arch/x86/lib/memcpy_64.S arch/x86/lib/memcpy_64.S
    diff -u tools/arch/x86/lib/memset_64.S arch/x86/lib/memset_64.S

Cc: Adrian Hunter <adrian.hunter@intel.com>
Cc: Ian Rogers <irogers@google.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Masahiro Yamada <masahiroy@kernel.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Vincent Whitchurch <vincent.whitchurch@axis.com>
Link: https://lore.kernel.org/lkml/ZbkIKpKdNqOFdMwJ@kernel.org
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
2024-01-30 11:31:15 -03:00

118 lines
2.4 KiB
ArmAsm

/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright 2002 Andi Kleen, SuSE Labs */
#include <linux/export.h>
#include <linux/linkage.h>
#include <asm/cpufeatures.h>
#include <asm/alternative.h>
.section .noinstr.text, "ax"
/*
* ISO C memset - set a memory block to a byte value. This function uses fast
* string to get better performance than the original function. The code is
* simpler and shorter than the original function as well.
*
* rdi destination
* rsi value (char)
* rdx count (bytes)
*
* rax original destination
*
* The FSRS alternative should be done inline (avoiding the call and
* the disgusting return handling), but that would require some help
* from the compiler for better calling conventions.
*
* The 'rep stosb' itself is small enough to replace the call, but all
* the register moves blow up the code. And two of them are "needed"
* only for the return value that is the same as the source input,
* which the compiler could/should do much better anyway.
*/
SYM_FUNC_START(__memset)
ALTERNATIVE "jmp memset_orig", "", X86_FEATURE_FSRS
movq %rdi,%r9
movb %sil,%al
movq %rdx,%rcx
rep stosb
movq %r9,%rax
RET
SYM_FUNC_END(__memset)
EXPORT_SYMBOL(__memset)
SYM_FUNC_ALIAS_MEMFUNC(memset, __memset)
EXPORT_SYMBOL(memset)
SYM_FUNC_START_LOCAL(memset_orig)
movq %rdi,%r10
/* expand byte value */
movzbl %sil,%ecx
movabs $0x0101010101010101,%rax
imulq %rcx,%rax
/* align dst */
movl %edi,%r9d
andl $7,%r9d
jnz .Lbad_alignment
.Lafter_bad_alignment:
movq %rdx,%rcx
shrq $6,%rcx
jz .Lhandle_tail
.p2align 4
.Lloop_64:
decq %rcx
movq %rax,(%rdi)
movq %rax,8(%rdi)
movq %rax,16(%rdi)
movq %rax,24(%rdi)
movq %rax,32(%rdi)
movq %rax,40(%rdi)
movq %rax,48(%rdi)
movq %rax,56(%rdi)
leaq 64(%rdi),%rdi
jnz .Lloop_64
/* Handle tail in loops. The loops should be faster than hard
to predict jump tables. */
.p2align 4
.Lhandle_tail:
movl %edx,%ecx
andl $63&(~7),%ecx
jz .Lhandle_7
shrl $3,%ecx
.p2align 4
.Lloop_8:
decl %ecx
movq %rax,(%rdi)
leaq 8(%rdi),%rdi
jnz .Lloop_8
.Lhandle_7:
andl $7,%edx
jz .Lende
.p2align 4
.Lloop_1:
decl %edx
movb %al,(%rdi)
leaq 1(%rdi),%rdi
jnz .Lloop_1
.Lende:
movq %r10,%rax
RET
.Lbad_alignment:
cmpq $7,%rdx
jbe .Lhandle_7
movq %rax,(%rdi) /* unaligned store */
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
subq %r8,%rdx
jmp .Lafter_bad_alignment
.Lfinal:
SYM_FUNC_END(memset_orig)