mirror of https://github.com/torvalds/linux.git (synced 2024-12-12 14:12:51 +00:00)
b4623d4e5b
Provide an s390-specific memmove implementation that is faster than the generic, byte-wise copying implementation. For non-destructive (as defined by the mvc instruction) memmove operations, the following table compares the old default implementation with the new s390-specific implementation:

    size     old      new
       1     1ns      8ns
       2     2ns      8ns
       4     4ns      8ns
       8     7ns      8ns
      16    17ns      8ns
      32    35ns      8ns
      64    65ns      9ns
     128   146ns     10ns
     256   298ns     11ns
     512   537ns     11ns
    1024  1193ns     19ns
    2048  2405ns     36ns

Only for very small sizes is the old implementation faster. For overlapping memmoves, where the mvc instruction can't be used, the new implementation is as slow as the old one.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
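The overlap rule behind "non-destructive (as defined by the mvc instruction)" can be stated in C: mvc copies strictly left to right, one byte at a time, so a forward copy only clobbers not-yet-read source bytes when the destination starts inside the source range. A minimal sketch of the predicate that the two clgr checks at the top of memmove implement (the function name is illustrative, not from the source):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    /*
     * True when a left-to-right (mvc-style) copy cannot overwrite
     * source bytes that have not been read yet; mirrors the two
     * clgr checks at the entry of memmove below.
     */
    static bool mvc_forward_is_safe(uintptr_t dest, uintptr_t src, size_t n)
    {
            return dest <= src || dest >= src + n;
    }
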
131 lines
2.3 KiB
ArmAsm
/*
 * String handling functions.
 *
 * Copyright IBM Corp. 2012
 */

#include <linux/linkage.h>
#include <asm/export.h>

/*
 * void *memmove(void *dest, const void *src, size_t n)
 */
ENTRY(memmove)
	ltgr	%r4,%r4			# test n for zero
	lgr	%r1,%r2			# %r1 = dest; %r2 stays as return value
	bzr	%r14			# return if n == 0
	clgr	%r2,%r3
	jnh	.Lmemmove_forward	# dest <= src: forward copy is safe
	la	%r5,0(%r4,%r3)		# %r5 = src + n
	clgr	%r2,%r5
	jl	.Lmemmove_reverse	# dest < src + n: destructive overlap
.Lmemmove_forward:
	aghi	%r4,-1			# mvc/ex encode the length as n - 1
	srlg	%r0,%r4,8		# %r0 = number of full 256-byte chunks
	ltgr	%r0,%r0
	jz	.Lmemmove_rest
.Lmemmove_loop:
	mvc	0(256,%r1),0(%r3)	# copy 256 bytes per iteration
	la	%r1,256(%r1)
	la	%r3,256(%r3)
	brctg	%r0,.Lmemmove_loop
.Lmemmove_rest:
	larl	%r5,.Lmemmove_mvc
	ex	%r4,0(%r5)		# execute mvc template with remainder length
	br	%r14
.Lmemmove_reverse:
	aghi	%r4,-1			# %r4 = index of the last byte
.Lmemmove_reverse_loop:
	ic	%r0,0(%r4,%r3)		# copy byte-wise from the end
	stc	%r0,0(%r4,%r1)
	brctg	%r4,.Lmemmove_reverse_loop
	ic	%r0,0(%r4,%r3)		# brctg stops at 0; copy the first byte
	stc	%r0,0(%r4,%r1)
	br	%r14
.Lmemmove_mvc:
	mvc	0(1,%r1),0(%r3)		# length patched at runtime via ex
EXPORT_SYMBOL(memmove)
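
/*
 * An editor's C sketch of the logic above (illustration only, not part
 * of the original file): the forward branch is what mvc executes 256
 * bytes at a time, the reverse branch matches the byte-wise ic/stc
 * loop used when the buffers overlap destructively.
 *
 * void *memmove(void *dest, const void *src, size_t n)
 * {
 *	unsigned char *d = dest;
 *	const unsigned char *s = src;
 *
 *	if (d <= s || d >= s + n) {
 *		while (n--)
 *			*d++ = *s++;
 *	} else {
 *		while (n--)
 *			d[n] = s[n];
 *	}
 *	return dest;
 * }
 */
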
/*
 * memset implementation
 *
 * This code corresponds to the C construct below. We do distinguish
 * between clearing (c == 0) and setting a memory array (c != 0) simply
 * because nearly all memset invocations in the kernel clear memory and
 * the xc instruction is preferred in such cases.
 *
 * void *memset(void *s, int c, size_t n)
 * {
 *	if (likely(c == 0))
 *		return __builtin_memset(s, 0, n);
 *	return __builtin_memset(s, c, n);
 * }
 */
ENTRY(memset)
	ltgr	%r4,%r4			# test n for zero
	bzr	%r14
	ltgr	%r3,%r3			# test c for zero
	jnz	.Lmemset_fill
	aghi	%r4,-1			# xc/ex encode the length as n - 1
	srlg	%r3,%r4,8		# %r3 = number of full 256-byte chunks
	ltgr	%r3,%r3
	lgr	%r1,%r2
	jz	.Lmemset_clear_rest
.Lmemset_clear_loop:
	xc	0(256,%r1),0(%r1)	# xor with itself clears 256 bytes
	la	%r1,256(%r1)
	brctg	%r3,.Lmemset_clear_loop
.Lmemset_clear_rest:
	larl	%r3,.Lmemset_xc
	ex	%r4,0(%r3)		# clear the remainder via the xc template
	br	%r14
.Lmemset_fill:
	stc	%r3,0(%r2)		# store c at s[0]
	cghi	%r4,1
	lgr	%r1,%r2
	ber	%r14			# done if n == 1
	aghi	%r4,-2			# one byte stored; mvc length is n - 1
	srlg	%r3,%r4,8		# %r3 = number of full 256-byte chunks
	ltgr	%r3,%r3
	jz	.Lmemset_fill_rest
.Lmemset_fill_loop:
	mvc	1(256,%r1),0(%r1)	# overlapping mvc propagates the byte
	la	%r1,256(%r1)
	brctg	%r3,.Lmemset_fill_loop
.Lmemset_fill_rest:
	larl	%r3,.Lmemset_mvc
	ex	%r4,0(%r3)		# fill the remainder via the mvc template
	br	%r14
.Lmemset_xc:
	xc	0(1,%r1),0(%r1)		# length patched at runtime via ex
.Lmemset_mvc:
	mvc	1(1,%r1),0(%r1)		# length patched at runtime via ex
EXPORT_SYMBOL(memset)
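
/*
 * An editor's sketch of the fill path above (illustration only, not
 * part of the original file; assumes c != 0 and n > 0, as the checks
 * at the ENTRY point guarantee): because mvc copies strictly left to
 * right, the overlapping "mvc 1(256,%r1),0(%r1)" re-reads each byte it
 * has just written and thereby replicates the byte stored by stc.
 * Byte-wise C equivalent of that propagation:
 *
 * void *memset_fill(void *s, int c, size_t n)
 * {
 *	unsigned char *p = s;
 *	size_t i;
 *
 *	p[0] = c;
 *	for (i = 1; i < n; i++)
 *		p[i] = p[i - 1];
 *	return s;
 * }
 */
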
/*
 * memcpy implementation
 *
 * void *memcpy(void *dest, const void *src, size_t n)
 */
ENTRY(memcpy)
	ltgr	%r4,%r4			# test n for zero
	bzr	%r14
	aghi	%r4,-1			# mvc/ex encode the length as n - 1
	srlg	%r5,%r4,8		# %r5 = number of full 256-byte chunks
	ltgr	%r5,%r5
	lgr	%r1,%r2			# %r1 = dest; %r2 stays as return value
	jnz	.Lmemcpy_loop
.Lmemcpy_rest:
	larl	%r5,.Lmemcpy_mvc
	ex	%r4,0(%r5)		# copy the remainder via the mvc template
	br	%r14
.Lmemcpy_loop:
	mvc	0(256,%r1),0(%r3)	# copy 256 bytes per iteration
	la	%r1,256(%r1)
	la	%r3,256(%r3)
	brctg	%r5,.Lmemcpy_loop
	j	.Lmemcpy_rest
.Lmemcpy_mvc:
	mvc	0(1,%r1),0(%r3)		# length patched at runtime via ex
EXPORT_SYMBOL(memcpy)
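
/*
 * An editor's sketch of the length split above (illustration only, not
 * part of the original file; assumes n > 0, as the bzr check
 * guarantees): the loop handles the full 256-byte mvc chunks, and the
 * final "ex %r4,0(%r5)" patches the mvc template's length field with
 * the low byte of n - 1 to copy the remaining 1..256 bytes.
 *
 * void *memcpy(void *dest, const void *src, size_t n)
 * {
 *	unsigned char *d = dest;
 *	const unsigned char *s = src;
 *	size_t chunks = (n - 1) >> 8;
 *	size_t rest = ((n - 1) & 0xff) + 1;
 *
 *	while (chunks--) {
 *		size_t i;
 *
 *		for (i = 0; i < 256; i++)
 *			*d++ = *s++;
 *	}
 *	while (rest--)
 *		*d++ = *s++;
 *	return dest;
 * }
 */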