84d95ad4cb
Make alternatives replace single JMPs instead of whole memset functions, thus decreasing the amount of instructions copied during patching time at boot. While at it, make it use the REP_GOOD version by default which means alternatives NOP out the JMP to the other versions, as REP_GOOD is set by default on the majority of relevant x86 processors. Signed-off-by: Borislav Petkov <bp@suse.de>
144 lines
2.7 KiB
ArmAsm
144 lines
2.7 KiB
ArmAsm
/* Copyright 2002 Andi Kleen, SuSE Labs */
|
|
|
|
#include <linux/linkage.h>
|
|
#include <asm/dwarf2.h>
|
|
#include <asm/cpufeature.h>
|
|
#include <asm/alternative-asm.h>
|
|
|
|
.weak memset
|
|
|
|
/*
|
|
* ISO C memset - set a memory block to a byte value. This function uses fast
|
|
* string to get better performance than the original function. The code is
|
|
* simpler and shorter than the orignal function as well.
|
|
*
|
|
* rdi destination
|
|
* rsi value (char)
|
|
* rdx count (bytes)
|
|
*
|
|
* rax original destination
|
|
*/
|
|
ENTRY(memset)
|
|
ENTRY(__memset)
|
|
/*
|
|
* Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
|
|
* to use it when possible. If not available, use fast string instructions.
|
|
*
|
|
* Otherwise, use original memset function.
|
|
*/
|
|
ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
|
|
"jmp memset_erms", X86_FEATURE_ERMS
|
|
|
|
movq %rdi,%r9
|
|
movq %rdx,%rcx
|
|
andl $7,%edx
|
|
shrq $3,%rcx
|
|
/* expand byte value */
|
|
movzbl %sil,%esi
|
|
movabs $0x0101010101010101,%rax
|
|
imulq %rsi,%rax
|
|
rep stosq
|
|
movl %edx,%ecx
|
|
rep stosb
|
|
movq %r9,%rax
|
|
ret
|
|
ENDPROC(memset)
|
|
ENDPROC(__memset)
|
|
|
|
/*
|
|
* ISO C memset - set a memory block to a byte value. This function uses
|
|
* enhanced rep stosb to override the fast string function.
|
|
* The code is simpler and shorter than the fast string function as well.
|
|
*
|
|
* rdi destination
|
|
* rsi value (char)
|
|
* rdx count (bytes)
|
|
*
|
|
* rax original destination
|
|
*/
|
|
ENTRY(memset_erms)
|
|
movq %rdi,%r9
|
|
movb %sil,%al
|
|
movq %rdx,%rcx
|
|
rep stosb
|
|
movq %r9,%rax
|
|
ret
|
|
ENDPROC(memset_erms)
|
|
|
|
ENTRY(memset_orig)
|
|
CFI_STARTPROC
|
|
movq %rdi,%r10
|
|
|
|
/* expand byte value */
|
|
movzbl %sil,%ecx
|
|
movabs $0x0101010101010101,%rax
|
|
imulq %rcx,%rax
|
|
|
|
/* align dst */
|
|
movl %edi,%r9d
|
|
andl $7,%r9d
|
|
jnz .Lbad_alignment
|
|
CFI_REMEMBER_STATE
|
|
.Lafter_bad_alignment:
|
|
|
|
movq %rdx,%rcx
|
|
shrq $6,%rcx
|
|
jz .Lhandle_tail
|
|
|
|
.p2align 4
|
|
.Lloop_64:
|
|
decq %rcx
|
|
movq %rax,(%rdi)
|
|
movq %rax,8(%rdi)
|
|
movq %rax,16(%rdi)
|
|
movq %rax,24(%rdi)
|
|
movq %rax,32(%rdi)
|
|
movq %rax,40(%rdi)
|
|
movq %rax,48(%rdi)
|
|
movq %rax,56(%rdi)
|
|
leaq 64(%rdi),%rdi
|
|
jnz .Lloop_64
|
|
|
|
/* Handle tail in loops. The loops should be faster than hard
|
|
to predict jump tables. */
|
|
.p2align 4
|
|
.Lhandle_tail:
|
|
movl %edx,%ecx
|
|
andl $63&(~7),%ecx
|
|
jz .Lhandle_7
|
|
shrl $3,%ecx
|
|
.p2align 4
|
|
.Lloop_8:
|
|
decl %ecx
|
|
movq %rax,(%rdi)
|
|
leaq 8(%rdi),%rdi
|
|
jnz .Lloop_8
|
|
|
|
.Lhandle_7:
|
|
andl $7,%edx
|
|
jz .Lende
|
|
.p2align 4
|
|
.Lloop_1:
|
|
decl %edx
|
|
movb %al,(%rdi)
|
|
leaq 1(%rdi),%rdi
|
|
jnz .Lloop_1
|
|
|
|
.Lende:
|
|
movq %r10,%rax
|
|
ret
|
|
|
|
CFI_RESTORE_STATE
|
|
.Lbad_alignment:
|
|
cmpq $7,%rdx
|
|
jbe .Lhandle_7
|
|
movq %rax,(%rdi) /* unaligned store */
|
|
movq $8,%r8
|
|
subq %r9,%r8
|
|
addq %r8,%rdi
|
|
subq %r8,%rdx
|
|
jmp .Lafter_bad_alignment
|
|
.Lfinal:
|
|
CFI_ENDPROC
|
|
ENDPROC(memset_orig)
|