mirror of
https://github.com/torvalds/linux.git
synced 2024-12-05 18:41:23 +00:00
a894e8ed09
Andy Chiu <andy.chiu@sifive.com> says: This series provides support for running Vector in kernel mode. Additionally, kernel-mode Vector can be configured to run without turning off preemption on a CONFIG_PREEMPT kernel. Along with the support, we add Vector-optimized copy_{to,from}_user and provide a simple threshold to decide when to run the vectorized functions. We decided to drop vectorized memcpy/memset/memmove for the moment due to the concern of memory side-effect in kernel_vector_begin(). The detailed description can be found at v9[0]. This series is composed of 4 parts: patch 1-4: adds basic support for kernel-mode Vector patch 5: includes vectorized copy_{to,from}_user into the kernel patch 6: refactor context switch code in fpu [1] patch 7-10: provides some code refactors and support for preemptible kernel-mode Vector. This series can be merged if we feel any part of {1~4, 5, 6, 7~10} is mature enough. This patch is tested on a QEMU with V and verified that booting, normal userspace operations all work as usual with thresholds set to 0. Also, we test by launching multiple kernel threads which continuously execute and verify Vector operations in the background. The module that tests these operations is expected to be upstreamed later. * b4-shazam-merge: riscv: vector: allow kernel-mode Vector with preemption riscv: vector: use kmem_cache to manage vector context riscv: vector: use a mask to write vstate_ctrl riscv: vector: do not pass task_struct into riscv_v_vstate_{save,restore}() riscv: fpu: drop SR_SD bit checking riscv: lib: vectorize copy_to_user/copy_from_user riscv: sched: defer restoring Vector context for user riscv: Add vector extension XOR implementation riscv: vector: make Vector always available for softirq context riscv: Add support for kernel mode vector Link: https://lore.kernel.org/r/20240115055929.4736-1-andy.chiu@sifive.com Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
247 lines
5.6 KiB
ArmAsm
247 lines
5.6 KiB
ArmAsm
#include <linux/linkage.h>
|
|
#include <linux/export.h>
|
|
#include <asm/asm.h>
|
|
#include <asm/asm-extable.h>
|
|
#include <asm/csr.h>
|
|
#include <asm/hwcap.h>
|
|
#include <asm/alternative-macros.h>
|
|
|
|
/*
 * fixup op reg addr lbl
 *
 * Emits the single instruction "\op \reg, \addr" at local label 100 and
 * passes that instruction's address (100b) together with \lbl to
 * _asm_extable.
 * NOTE(review): presumably a fault on the emitted instruction resumes
 * execution at \lbl; _asm_extable is defined outside this file - confirm.
 * Callers must not reuse the numeric label 100 around an invocation.
 */
.macro fixup op reg addr lbl
|
|
100:
|
|
\op \reg, \addr
|
|
_asm_extable 100b, \lbl
|
|
.endm
|
|
|
|
/*
 * __asm_copy_to_user (also exported as __asm_copy_from_user, which is
 * declared as an alias of the same code at the end of this block)
 *
 * Inputs, per the register-allocation notes inside the function:
 *   a0 - dst
 *   a1 - src
 *   a2 - size in bytes
 * Result in a0:
 *   0 when the copy loops run to completion; otherwise, from the fixup
 *   code at label 10, the distance between the current dst pointer and
 *   the terminal dst address saved in t5 on entry.
 * Registers written by the scalar path: a0-a7 and t0-t6.
 * SR_SUM is set in CSR_STATUS for the duration of the scalar copy and
 * cleared again on both exit paths.
 */
SYM_FUNC_START(__asm_copy_to_user)
|
|
#ifdef CONFIG_RISCV_ISA_V
|
|
/*
 * Vector dispatch: the first ALTERNATIVE variant jumps straight to the
 * scalar copy and the second variant is a nop. When execution falls
 * through, copies shorter than riscv_v_usercopy_threshold still take
 * the scalar path; everything else tail-calls enter_vector_usercopy.
 * NOTE(review): presumably the nop variant is selected when
 * RISCV_ISA_EXT_v is available - ALTERNATIVE is defined elsewhere,
 * confirm.
 */
ALTERNATIVE("j fallback_scalar_usercopy", "nop", 0, RISCV_ISA_EXT_v, CONFIG_RISCV_ISA_V)
|
|
REG_L t0, riscv_v_usercopy_threshold
|
|
bltu a2, t0, fallback_scalar_usercopy
|
|
tail enter_vector_usercopy
|
|
#endif
|
|
SYM_FUNC_START(fallback_scalar_usercopy)
|
|
|
|
/* Enable access to user memory */
|
|
li t6, SR_SUM
|
|
csrs CSR_STATUS, t6
|
|
|
|
/*
|
|
* Save the terminal address which will be used to compute the number
|
|
* of bytes copied in case of a fixup exception.
|
|
*/
|
|
add t5, a0, a2
|
|
|
|
/*
|
|
* Register allocation for code below:
|
|
* a0 - start of uncopied dst
|
|
* a1 - start of uncopied src
|
|
* a2 - size
|
|
* t0 - end of uncopied dst
|
|
*/
|
|
add t0, a0, a2
|
|
|
|
/*
|
|
* Use byte copy only if too small.
|
|
* SZREG holds 4 for RV32 and 8 for RV64
|
|
*/
|
|
li a3, 9*SZREG /* size must be larger than size in word_copy */
|
|
bltu a2, a3, .Lbyte_copy_tail
|
|
|
|
/*
|
|
* Copy first bytes until dst is aligned to word boundary.
|
|
* a0 - start of dst
|
|
* t1 - start of aligned dst
|
|
*/
|
|
/* t1 = a0 rounded up to the next multiple of SZREG */
addi t1, a0, SZREG-1
|
|
andi t1, t1, ~(SZREG-1)
|
|
/* dst is already aligned, skip */
|
|
beq a0, t1, .Lskip_align_dst
|
|
1:
|
|
/* a5 - one byte for copying data */
|
|
fixup lb a5, 0(a1), 10f
|
|
addi a1, a1, 1 /* src */
|
|
fixup sb a5, 0(a0), 10f
|
|
addi a0, a0, 1 /* dst */
|
|
bltu a0, t1, 1b /* t1 - start of aligned dst */
|
|
|
|
.Lskip_align_dst:
|
|
/*
|
|
* Now dst is aligned.
|
|
* Use shift-copy if src is misaligned.
|
|
* Use word-copy if both src and dst are aligned, because
|
|
* shift-copy cannot be used when no shifting is required.
|
|
*/
|
|
/* a1 - start of src */
|
|
/* a3 = byte offset of src within its SZREG-sized word */
andi a3, a1, SZREG-1
|
|
bnez a3, .Lshift_copy
|
|
|
|
.Lword_copy:
|
|
/*
|
|
* Both src and dst are aligned, unrolled word copy
|
|
*
|
|
* a0 - start of aligned dst
|
|
* a1 - start of aligned src
|
|
* t0 - end of aligned dst
|
|
*/
|
|
addi t0, t0, -(8*SZREG) /* not to over run */
|
|
/*
 * Each pass loads eight registers and then stores them. a0 and a1 are
 * advanced only after all sixteen accesses, so if any of them faults,
 * a0 still points at the start of the current 8*SZREG group when the
 * fixup code computes its result.
 */
2:
|
|
fixup REG_L a4, 0(a1), 10f
|
|
fixup REG_L a5, SZREG(a1), 10f
|
|
fixup REG_L a6, 2*SZREG(a1), 10f
|
|
fixup REG_L a7, 3*SZREG(a1), 10f
|
|
fixup REG_L t1, 4*SZREG(a1), 10f
|
|
fixup REG_L t2, 5*SZREG(a1), 10f
|
|
fixup REG_L t3, 6*SZREG(a1), 10f
|
|
fixup REG_L t4, 7*SZREG(a1), 10f
|
|
fixup REG_S a4, 0(a0), 10f
|
|
fixup REG_S a5, SZREG(a0), 10f
|
|
fixup REG_S a6, 2*SZREG(a0), 10f
|
|
fixup REG_S a7, 3*SZREG(a0), 10f
|
|
fixup REG_S t1, 4*SZREG(a0), 10f
|
|
fixup REG_S t2, 5*SZREG(a0), 10f
|
|
fixup REG_S t3, 6*SZREG(a0), 10f
|
|
fixup REG_S t4, 7*SZREG(a0), 10f
|
|
addi a0, a0, 8*SZREG
|
|
addi a1, a1, 8*SZREG
|
|
bltu a0, t0, 2b
|
|
|
|
addi t0, t0, 8*SZREG /* revert to original value */
|
|
j .Lbyte_copy_tail
|
|
|
|
.Lshift_copy:
|
|
|
|
/*
|
|
* Word copy with shifting.
|
|
* For misaligned copy we still perform aligned word copy, but
|
|
* we need to use the value fetched from the previous iteration and
|
|
* do some shifts.
|
|
* This is safe because reading is less than a word size.
|
|
*
|
|
* a0 - start of aligned dst
|
|
* a1 - start of src
|
|
* a3 - a1 & mask:(SZREG-1)
|
|
* t0 - end of uncopied dst
|
|
* t1 - end of aligned dst
|
|
*/
|
|
/* calculating aligned word boundary for dst */
|
|
andi t1, t0, ~(SZREG-1)
|
|
/* Converting unaligned src to aligned src */
|
|
andi a1, a1, ~(SZREG-1)
|
|
|
|
/*
|
|
* Calculate shifts
|
|
* t3 - prev shift
|
|
* t4 - current shift
|
|
*/
|
|
slli t3, a3, 3 /* converting bytes in a3 to bits */
|
|
li a5, SZREG*8
|
|
sub t4, a5, t3
|
|
|
|
/* Load the first word to combine with second word */
|
|
fixup REG_L a5, 0(a1), 10f
|
|
|
|
3:
|
|
/* Main shifting copy
|
|
*
|
|
* a0 - start of aligned dst
|
|
* a1 - start of aligned src
|
|
* t1 - end of aligned dst
|
|
*/
|
|
|
|
/* At least one iteration will be executed */
|
|
/*
 * a5 carries the most recently loaded src word across iterations: its
 * value shifted right by t3 is OR-ed with the next src word shifted
 * left by t4 to form one dst word. a2 is reused as scratch here, so it
 * no longer holds the size from this point on.
 */
srl a4, a5, t3
|
|
fixup REG_L a5, SZREG(a1), 10f
|
|
addi a1, a1, SZREG
|
|
sll a2, a5, t4
|
|
or a2, a2, a4
|
|
fixup REG_S a2, 0(a0), 10f
|
|
addi a0, a0, SZREG
|
|
bltu a0, t1, 3b
|
|
|
|
/* Revert src to original unaligned value */
|
|
add a1, a1, a3
|
|
|
|
.Lbyte_copy_tail:
|
|
/*
|
|
* Byte copy anything left.
|
|
*
|
|
* a0 - start of remaining dst
|
|
* a1 - start of remaining src
|
|
* t0 - end of remaining dst
|
|
*/
|
|
bgeu a0, t0, .Lout_copy_user /* check if end of copy */
|
|
4:
|
|
fixup lb a5, 0(a1), 10f
|
|
addi a1, a1, 1 /* src */
|
|
fixup sb a5, 0(a0), 10f
|
|
addi a0, a0, 1 /* dst */
|
|
bltu a0, t0, 4b /* t0 - end of dst */
|
|
|
|
.Lout_copy_user:
|
|
/* Disable access to user memory */
|
|
csrc CSR_STATUS, t6
|
|
/* Normal completion: return 0 in a0 */
li a0, 0
|
|
ret
|
|
|
|
/* Exception fixup code */
|
|
10:
|
|
/* Disable access to user memory */
|
|
csrc CSR_STATUS, t6
|
|
/* a0 = terminal dst address saved in t5 minus the current dst pointer */
sub a0, t5, a0
|
|
ret
|
|
SYM_FUNC_END(__asm_copy_to_user)
|
|
SYM_FUNC_END(fallback_scalar_usercopy)
|
|
EXPORT_SYMBOL(__asm_copy_to_user)
|
|
/* __asm_copy_from_user is declared as an alias of the routine above */
SYM_FUNC_ALIAS(__asm_copy_from_user, __asm_copy_to_user)
|
|
EXPORT_SYMBOL(__asm_copy_from_user)
|
|
|
|
|
|
/*
 * __clear_user
 *
 * Stores zero over the region that starts at a0 and ends at the
 * terminal address a3 = a0 + a1, i.e. a0 is the start address and a1
 * the length in bytes. SZREG-sized stores are used for the aligned
 * middle part and byte stores for an unaligned head and for the tail.
 * Result in a0:
 *   0 when all stores complete; otherwise, from the fixup code at
 *   label 11, a3 minus the current pointer.
 * Registers written: a0, a3, t0, t1, t6.
 * SR_SUM is set in CSR_STATUS on entry and cleared on both exit paths.
 */
SYM_FUNC_START(__clear_user)
|
|
|
|
/* Enable access to user memory */
|
|
li t6, SR_SUM
|
|
csrs CSR_STATUS, t6
|
|
|
|
add a3, a0, a1
|
|
addi t0, a0, SZREG-1
|
|
andi t1, a3, ~(SZREG-1)
|
|
andi t0, t0, ~(SZREG-1)
|
|
/*
|
|
* a3: terminal address of target region
|
|
* t0: a0 rounded up to an SZREG boundary (first aligned address at or above the start)
|
|
* t1: a3 rounded down to an SZREG boundary (last aligned address at or below the end)
|
|
*/
|
|
/* No SZREG-sized aligned span between t0 and t1: use byte stores only */
bgeu t0, t1, 2f
|
|
/* Start is not aligned: byte-store up to t0 first */
bltu a0, t0, 4f
|
|
1:
|
|
fixup REG_S, zero, (a0), 11f
|
|
addi a0, a0, SZREG
|
|
bltu a0, t1, 1b
|
|
2:
|
|
/* Anything left between a0 and the terminal address is cleared bytewise */
bltu a0, a3, 5f
|
|
|
|
3:
|
|
/* Disable access to user memory */
|
|
csrc CSR_STATUS, t6
|
|
/* Normal completion: return 0 in a0 */
li a0, 0
|
|
ret
|
|
4: /* Edge case: unaligned start */
|
|
fixup sb, zero, (a0), 11f
|
|
addi a0, a0, 1
|
|
bltu a0, t0, 4b
|
|
/* This path is reached only when t0 < t1, so at least one aligned store follows */
j 1b
|
|
5: /* Edge case: remainder */
|
|
fixup sb, zero, (a0), 11f
|
|
addi a0, a0, 1
|
|
bltu a0, a3, 5b
|
|
j 3b
|
|
|
|
/* Exception fixup code */
|
|
11:
|
|
/* Disable access to user memory */
|
|
csrc CSR_STATUS, t6
|
|
/* a0 = terminal address minus the current pointer */
sub a0, a3, a0
|
|
ret
|
|
SYM_FUNC_END(__clear_user)
|
|
EXPORT_SYMBOL(__clear_user)
|