mirror of
https://github.com/torvalds/linux.git
synced 2024-12-06 19:11:31 +00:00
60e0a09db2
On ThunderX T88 pass 1 and pass 2, there is no hardware prefetching, so we need to patch in explicit software prefetching instructions. Prefetching improves this code by 60% over the original code and 2x over the code without prefetching for the affected hardware, using the benchmark code at https://github.com/apinski-cavium/copy_page_benchmark. Signed-off-by: Andrew Pinski <apinski@cavium.com> Signed-off-by: Will Deacon <will.deacon@arm.com> Tested-by: Andrew Pinski <apinski@cavium.com> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
94 lines
2.1 KiB
ArmAsm
94 lines
2.1 KiB
ArmAsm
/*
|
|
* Copyright (C) 2012 ARM Ltd.
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License version 2 as
|
|
* published by the Free Software Foundation.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#include <linux/linkage.h>
|
|
#include <linux/const.h>
|
|
#include <asm/assembler.h>
|
|
#include <asm/page.h>
|
|
#include <asm/cpufeature.h>
|
|
#include <asm/alternative.h>
|
|
|
|
/*
 * Copy a page from src to dest (both are page aligned)
 *
 * Parameters:
 *	x0 - dest
 *	x1 - src
 *
 * Clobbers:
 *	x0, x1 (advanced through the pages), x2 - x17 (data) and the
 *	condition flags. x18 (the AAPCS64 platform register) is left
 *	untouched.
 *
 * The page is moved in 128-byte blocks. The loop is software pipelined:
 * each iteration stores the block loaded by the previous iteration while
 * loading the next one, so the first block is loaded before the loop and
 * the last block is stored after it.
 *
 * Loop termination relies on dest being page aligned: x0 is kept 256
 * bytes ahead of the current store address, and the loop ends once x0
 * reaches the next page boundary.
 */
ENTRY(copy_page)
alternative_if_not ARM64_HAS_NO_HW_PREFETCH
	nop
	nop
	nop
alternative_else
	// Prefetch the three 128-byte blocks following the first one. The
	// in-loop prefetch starts at src + 512, so src + 384 must be
	// covered here or it would never be prefetched.
	prfm	pldl1strm, [x1, #128]
	prfm	pldl1strm, [x1, #256]
	prfm	pldl1strm, [x1, #384]
alternative_endif

	// Prime the pipeline: load block 0 into x2 - x17.
	ldp	x2, x3, [x1]
	ldp	x4, x5, [x1, #16]
	ldp	x6, x7, [x1, #32]
	ldp	x8, x9, [x1, #48]
	ldp	x10, x11, [x1, #64]
	ldp	x12, x13, [x1, #80]
	ldp	x14, x15, [x1, #96]
	ldp	x16, x17, [x1, #112]

	// x0 = store address + 256, x1 = address of the next block to load.
	// Biasing x0 lets the page offset of x0 act as the loop counter, so
	// no extra register is needed.
	add	x0, x0, #256
	add	x1, x1, #128
1:
	// Z is set when x0 hits the page boundary, i.e. when this iteration
	// loads the final block. Nothing below modifies the flags before the
	// b.ne at the bottom of the loop.
	tst	x0, #(PAGE_SIZE - 1)

alternative_if_not ARM64_HAS_NO_HW_PREFETCH
	nop
alternative_else
	// Stay three blocks ahead of the loads. Near the end of the page
	// this reaches past src + PAGE_SIZE; PRFM is only a hint.
	prfm	pldl1strm, [x1, #384]
alternative_endif

	// Store the previously loaded block, refilling each register pair
	// with data from the next block as soon as it has been stored.
	stnp	x2, x3, [x0, #-256]
	ldp	x2, x3, [x1]
	stnp	x4, x5, [x0, #16 - 256]
	ldp	x4, x5, [x1, #16]
	stnp	x6, x7, [x0, #32 - 256]
	ldp	x6, x7, [x1, #32]
	stnp	x8, x9, [x0, #48 - 256]
	ldp	x8, x9, [x1, #48]
	stnp	x10, x11, [x0, #64 - 256]
	ldp	x10, x11, [x1, #64]
	stnp	x12, x13, [x0, #80 - 256]
	ldp	x12, x13, [x1, #80]
	stnp	x14, x15, [x0, #96 - 256]
	ldp	x14, x15, [x1, #96]
	stnp	x16, x17, [x0, #112 - 256]
	ldp	x16, x17, [x1, #112]

	add	x0, x0, #128
	add	x1, x1, #128

	b.ne	1b

	// Drain the pipeline: store the final block.
	stnp	x2, x3, [x0, #-256]
	stnp	x4, x5, [x0, #16 - 256]
	stnp	x6, x7, [x0, #32 - 256]
	stnp	x8, x9, [x0, #48 - 256]
	stnp	x10, x11, [x0, #64 - 256]
	stnp	x12, x13, [x0, #80 - 256]
	stnp	x14, x15, [x0, #96 - 256]
	stnp	x16, x17, [x0, #112 - 256]

	ret
ENDPROC(copy_page)
|