mirror of
https://github.com/torvalds/linux.git
synced 2024-12-30 14:52:05 +00:00
7269e8812a
In order to avoid unnecessary chains of branches, rather than implementing memcpy()/memset()'s access to their alternative implementations via a jump, patch the (larger) original function directly. The memcpy() part of this is slightly subtle: while alternative instruction patching does itself use memcpy(), with the replacement block being less than 64-bytes in size the main loop of the original function doesn't get used for copying memcpy_c() over memcpy(), and hence we can safely write over its beginning. Also note that the CFI annotations are fine for both variants of each of the functions. Signed-off-by: Jan Beulich <jbeulich@novell.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andrew Morton <akpm@linux-foundation.org> LKML-Reference: <4B2BB8D30200007800026AF2@vpn.id2.novell.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
144 lines
2.3 KiB
ArmAsm
144 lines
2.3 KiB
ArmAsm
/* Copyright 2002 Andi Kleen */
|
|
|
|
#include <linux/linkage.h>
|
|
|
|
#include <asm/cpufeature.h>
|
|
#include <asm/dwarf2.h>
|
|
|
|
/*
|
|
* memcpy - Copy a memory block.
|
|
*
|
|
* Input:
|
|
* rdi destination
|
|
* rsi source
|
|
* rdx count
|
|
*
|
|
* Output:
|
|
* rax original destination
|
|
*/
|
|
|
|
/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the beginning of the unrolled memcpy() via the
 * alternative instructions framework, so it has the same register
 * contract as memcpy():
 *
 * Input:
 *	rdi	destination
 *	rsi	source
 *	rdx	count (full 64 bits are honoured)
 *
 * Output:
 *	rax	original destination
 *
 * Clobbers: rcx, rdx, rsi, rdi, flags.
 *
 * The sequence must stay short (well under 64 bytes): the patching
 * code itself uses memcpy() to copy it over the head of memcpy().
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
	movq %rdi, %rax		/* return value = original destination */

	/*
	 * Derive the qword count from the whole 64-bit length. Using the
	 * 32-bit registers here would silently truncate copies of 4 GiB
	 * and above.
	 */
	movq %rdx, %rcx
	shrq $3, %rcx		/* rcx = number of whole 8-byte words */
	andl $7, %edx		/* edx = leftover bytes, 0..7 */
	rep movsq
	movl %edx, %ecx		/* 32-bit write zero-extends into rcx */
	rep movsb
	ret
.Lmemcpy_e:
	.previous
|
|
|
|
ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC

	/*
	 * Unrolled memcpy(): rdi = destination, rsi = source, rdx = count.
	 * Returns the original destination in rax.
	 * Clobbers rcx, rsi, rdi, r8-r11 and flags; rdx is left intact.
	 *
	 * Put the number of full 64-byte blocks into %rcx.
	 * The block count is taken from the full 64-bit %rdx so that
	 * counts of 4 GiB and above are not truncated.
	 * Tail portion is handled at the end:
	 */
	movq %rdi, %rax
	movq %rdx, %rcx
	shrq $6, %rcx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	/*
	 * We decrement the loop index here - and the zero-flag is
	 * checked at the end of the loop (the movq/leaq instructions
	 * in between do not change the zero flag):
	 */
	decq %rcx

	/*
	 * Move one 64-byte block as four groups of 16 bytes
	 * (two loads followed by two stores each):
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r8
	movq %r11, 0*8(%rdi)
	movq %r8, 1*8(%rdi)

	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r10
	movq %r9, 2*8(%rdi)
	movq %r10, 3*8(%rdi)

	movq 4*8(%rsi), %r11
	movq 5*8(%rsi), %r8
	movq %r11, 4*8(%rdi)
	movq %r8, 5*8(%rdi)

	movq 6*8(%rsi), %r9
	movq 7*8(%rsi), %r10
	movq %r9, 6*8(%rdi)
	movq %r10, 7*8(%rdi)

	/* leaq advances the pointers without touching the flags */
	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi

	jnz .Lloop_64

.Lhandle_tail:
	/*
	 * Remaining bytes are count % 64; only the low six bits of the
	 * count matter from here on, so 32-bit arithmetic is sufficient.
	 * %ecx = number of whole 8-byte words left (0..7).
	 */
	movl %edx, %ecx
	andl $63, %ecx
	shrl $3, %ecx
	jz .Lhandle_7

	.p2align 4
.Lloop_8:
	/* decl sets ZF for the jnz below; movq/leaq leave it alone */
	decl %ecx
	movq (%rsi), %r8
	movq %r8, (%rdi)
	leaq 8(%rdi), %rdi
	leaq 8(%rsi), %rsi
	jnz .Lloop_8

.Lhandle_7:
	/* %ecx = trailing bytes left (count % 8) */
	movl %edx, %ecx
	andl $7, %ecx
	jz .Lend

	.p2align 4
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lend:
	ret
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)
|
|
|
|
/*
 * Alternative-instructions record for memcpy().
 *
 * CPUs flagged with X86_FEATURE_REP_GOOD are faster with the string
 * copy instructions, and that variant is also much simpler, so on
 * those CPUs the code at .Lmemcpy_c is patched over the start of
 * memcpy().
 */

	.pushsection .altinstructions, "a"
	.balign 8
	.quad memcpy			/* code to be patched */
	.quad .Lmemcpy_c		/* replacement code */
	.byte X86_FEATURE_REP_GOOD	/* feature bit that enables the patch */

	/*
	 * The patched length is deliberately the replacement length and
	 * not the size of the whole function: memcpy() itself is used to
	 * apply alternatives, so padding the rest of it with nops would
	 * overwrite code that is still executing - reboot is the only
	 * outcome...
	 */
	.byte .Lmemcpy_e - .Lmemcpy_c	/* bytes of memcpy() to replace */
	.byte .Lmemcpy_e - .Lmemcpy_c	/* bytes of replacement code */
	.popsection
|