linux/arch/s390/lib/mem.S
Heiko Carstens 993fef95b9 s390: optimize memset implementation
Like for the memset16/32/64 variants avoid that subsequent mvc
instructions depend on each other since that might have negative
performance impacts.

This patch is currently hardly relevant since at least gcc 7.1
generates only inline memset code and not a single memset call.
However there is no reason to not provide an optimized version
just in case gcc generates memset calls again, like it did in
the past.

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
2017-10-09 11:18:07 +02:00

177 lines
3.3 KiB
ArmAsm

/*
* String handling functions.
*
* Copyright IBM Corp. 2012
*/
#include <linux/linkage.h>
#include <asm/export.h>
/*
 * void *memmove(void *dest, const void *src, size_t n)
 *
 * Copy n bytes from src to dest; the two areas may overlap.
 *
 * In:   %r2 = dest, %r3 = src, %r4 = n
 * Out:  %r2 = dest (this routine never writes %r2)
 * Uses: %r0, %r1, %r3, %r4, %r5 and the condition code
 *
 * A byte-wise backward copy is used only when src < dest < src + n.
 * Every other case is copied in ascending order with mvc, 256 bytes
 * per loop iteration plus a final piece of 1..256 bytes.
 */
ENTRY(memmove)
/*
 * Return immediately for n == 0. The lgr between ltgr and bzr does not
 * alter the condition code, so bzr still sees the result of ltgr.
 * %r1 becomes the working destination pointer.
 */
ltgr %r4,%r4
lgr %r1,%r2
bzr %r14
/* %r4 = n - 1: index of the last byte; its low 8 bits feed ex below */
aghi %r4,-1
/* dest <= src (unsigned compare): an ascending copy is safe */
clgr %r2,%r3
jnh .Lmemmove_forward
/* %r5 = src + n; dest below that means dest lies inside the source area */
la %r5,1(%r4,%r3)
clgr %r2,%r5
jl .Lmemmove_reverse
.Lmemmove_forward:
/* %r0 = (n - 1) / 256 = number of full 256-byte chunks for the loop */
srlg %r0,%r4,8
/* srlg does not set the condition code, hence the explicit test */
ltgr %r0,%r0
jz .Lmemmove_forward_remainder
.Lmemmove_forward_loop:
mvc 0(256,%r1),0(%r3)
la %r1,256(%r1)
la %r3,256(%r3)
/* decrement %r0 and loop while it is non-zero */
brctg %r0,.Lmemmove_forward_loop
.Lmemmove_forward_remainder:
/*
 * Run the mvc template with the low 8 bits of %r4 ORed into its length
 * field: this copies the remaining ((n - 1) & 255) + 1 bytes.
 */
larl %r5,.Lmemmove_mvc
ex %r4,0(%r5)
br %r14
.Lmemmove_reverse:
/*
 * Backward copy, one byte per iteration, starting at index n - 1.
 * This path is only reached with n >= 2 (dest > src and dest < src + n),
 * so %r4 >= 1 here. brctg leaves the loop once %r4 has reached 0.
 */
ic %r0,0(%r4,%r3)
stc %r0,0(%r4,%r1)
brctg %r4,.Lmemmove_reverse
/* %r4 == 0: copy the first byte, which the loop did not handle */
ic %r0,0(%r4,%r3)
stc %r0,0(%r4,%r1)
br %r14
/* Template for ex above; never reached by fall-through (follows br) */
.Lmemmove_mvc:
mvc 0(1,%r1),0(%r3)
EXPORT_SYMBOL(memmove)
/*
 * memset implementation
 *
 * This code corresponds to the C construct below. We distinguish
 * between clearing (c == 0) and setting a memory array (c != 0) simply
 * because nearly all memset invocations in the kernel clear memory and
 * the xc instruction is preferred in such cases.
 *
 * void *memset(void *s, int c, size_t n)
 * {
 *	if (likely(c == 0))
 *		return __builtin_memset(s, 0, n);
 *	return __builtin_memset(s, c, n);
 * }
 *
 * In:   %r2 = s, %r3 = c, %r4 = n
 * Out:  %r2 = s (this routine never writes %r2)
 * Uses: %r1, %r3, %r4, %r5 and the condition code
 */
ENTRY(memset)
/* n == 0: nothing to do */
ltgr %r4,%r4
bzr %r14
/*
 * The whole 64-bit register holding c is tested. A value whose low byte
 * is zero but which is non-zero elsewhere takes the fill path, which
 * stores only the low byte (stc), so the result is the same.
 */
ltgr %r3,%r3
jnz .Lmemset_fill
/* ---- clear path: c == 0, so %r3 is free to serve as chunk counter ---- */
/* %r4 = n - 1; its low 8 bits become the xc length code for ex below */
aghi %r4,-1
/* %r3 = (n - 1) / 256 = number of full 256-byte chunks */
srlg %r3,%r4,8
/* the lgr between ltgr and jz does not alter the condition code */
ltgr %r3,%r3
lgr %r1,%r2
jz .Lmemset_clear_remainder
.Lmemset_clear_loop:
/* XOR of an area with itself zeroes it */
xc 0(256,%r1),0(%r1)
la %r1,256(%r1)
brctg %r3,.Lmemset_clear_loop
.Lmemset_clear_remainder:
/* clear the remaining ((n - 1) & 255) + 1 bytes via the xc template */
larl %r3,.Lmemset_xc
ex %r4,0(%r3)
br %r14
/* ---- fill path: c != 0 ---- */
.Lmemset_fill:
/* n == 1 needs just one byte store; lgr keeps the condition code of cghi */
cghi %r4,1
lgr %r1,%r2
je .Lmemset_fill_exit
/*
 * %r4 = n - 2: one byte of each piece is written by stc, and the mvc
 * length code is "length - 1", which accounts for the second byte.
 */
aghi %r4,-2
/* %r5 = (n - 2) / 256 = number of full 256-byte chunks */
srlg %r5,%r4,8
ltgr %r5,%r5
jz .Lmemset_fill_remainder
.Lmemset_fill_loop:
/*
 * Seed the first byte of the chunk, then let the overlapping mvc
 * propagate it through the following 255 bytes. Seeding every chunk
 * separately keeps each mvc independent of the previous one.
 */
stc %r3,0(%r1)
mvc 1(255,%r1),0(%r1)
la %r1,256(%r1)
brctg %r5,.Lmemset_fill_loop
.Lmemset_fill_remainder:
/* seed byte plus ((n - 2) & 255) + 1 propagated bytes */
stc %r3,0(%r1)
larl %r5,.Lmemset_mvc
ex %r4,0(%r5)
br %r14
.Lmemset_fill_exit:
stc %r3,0(%r1)
br %r14
/* Templates for the ex instructions above; never reached by fall-through */
.Lmemset_xc:
xc 0(1,%r1),0(%r1)
.Lmemset_mvc:
mvc 1(1,%r1),0(%r1)
EXPORT_SYMBOL(memset)
/*
 * memcpy implementation
 *
 * void *memcpy(void *dest, const void *src, size_t n)
 *
 * In:   %r2 = dest, %r3 = src, %r4 = n
 * Out:  %r2 = dest (this routine never writes %r2)
 * Uses: %r1, %r3, %r4, %r5 and the condition code
 *
 * Copies in ascending address order with mvc; no overlap check is made.
 * Copies of up to 256 bytes reach the final ex without a taken branch;
 * the 256-byte loop is placed out of line and jumps back afterwards.
 */
ENTRY(memcpy)
/* n == 0: nothing to do */
ltgr %r4,%r4
bzr %r14
/* %r4 = n - 1; its low 8 bits become the mvc length code for ex below */
aghi %r4,-1
/* %r5 = (n - 1) / 256 = number of full 256-byte chunks */
srlg %r5,%r4,8
/* the lgr between ltgr and jnz does not alter the condition code */
ltgr %r5,%r5
lgr %r1,%r2
jnz .Lmemcpy_loop
.Lmemcpy_remainder:
/* copy the remaining ((n - 1) & 255) + 1 bytes via the mvc template */
larl %r5,.Lmemcpy_mvc
ex %r4,0(%r5)
br %r14
.Lmemcpy_loop:
mvc 0(256,%r1),0(%r3)
la %r1,256(%r1)
la %r3,256(%r3)
/* decrement %r5 and loop while it is non-zero */
brctg %r5,.Lmemcpy_loop
j .Lmemcpy_remainder
/* Template for ex above; never reached by fall-through (follows j) */
.Lmemcpy_mvc:
mvc 0(1,%r1),0(%r3)
EXPORT_SYMBOL(memcpy)
/*
 * __memset16/32/64
 *
 * void *__memset16(uint16_t *s, uint16_t v, size_t count)
 * void *__memset32(uint32_t *s, uint32_t v, size_t count)
 * void *__memset64(uint64_t *s, uint64_t v, size_t count)
 *
 * Macro parameters:
 *   bits  - suffix of the generated function name and its local labels
 *   bytes - size of one element in bytes
 *   insn  - store instruction that writes one element from %r3
 *
 * In:   %r2 = s, %r3 = v, %r4 = count
 * Out:  %r2 = s (this routine never writes %r2)
 * Uses: %r1, %r4, %r5 and the condition code
 *
 * NOTE(review): the code treats %r4 as a length in bytes (it is compared
 * with and reduced by the element size), although the prototypes call it
 * "count". Presumably the C callers pass a byte length - confirm.
 * NOTE(review): a non-zero length smaller than one element is not handled;
 * the subtraction below would wrap around. Assumed not to occur - confirm.
 */
.macro __MEMSET bits,bytes,insn
ENTRY(__memset\bits)
/* length 0: nothing to do */
ltgr %r4,%r4
bzr %r14
/* exactly one element: a single store is enough */
cghi %r4,\bytes
je .L__memset_exit\bits
/*
 * %r4 = length - bytes - 1: one element of each piece is written by the
 * store instruction, and the mvc length code is "length - 1".
 */
aghi %r4,-(\bytes+1)
/* %r5 = number of full 256-byte chunks */
srlg %r5,%r4,8
/* the lgr between ltgr and jz does not alter the condition code */
ltgr %r5,%r5
lgr %r1,%r2
jz .L__memset_remainder\bits
.L__memset_loop\bits:
/*
 * Store one element at the start of the chunk, then let the overlapping
 * mvc replicate it through the rest of the 256 bytes. Each chunk is
 * seeded on its own, so no mvc reads what the previous mvc wrote.
 */
\insn %r3,0(%r1)
mvc \bytes(256-\bytes,%r1),0(%r1)
la %r1,256(%r1)
brctg %r5,.L__memset_loop\bits
.L__memset_remainder\bits:
/* one stored element plus (%r4 & 255) + 1 replicated bytes */
\insn %r3,0(%r1)
larl %r5,.L__memset_mvc\bits
ex %r4,0(%r5)
br %r14
.L__memset_exit\bits:
/* %r1 has not been set up on this path, so store through %r2 */
\insn %r3,0(%r2)
br %r14
/* Template for ex above; never reached by fall-through (follows br) */
.L__memset_mvc\bits:
mvc \bytes(1,%r1),0(%r1)
.endm
/* Instantiate the 16-, 32- and 64-bit variants */
__MEMSET 16,2,sth
EXPORT_SYMBOL(__memset16)
__MEMSET 32,4,st
EXPORT_SYMBOL(__memset32)
__MEMSET 64,8,stg
EXPORT_SYMBOL(__memset64)