mirror of
https://github.com/torvalds/linux.git
synced 2024-11-26 14:12:06 +00:00
Merge patch series "riscv: Use Kconfig to set unaligned access speed"
Charlie Jenkins <charlie@rivosinc.com> says: If the hardware unaligned access speed is known at compile time, it is possible to avoid running the unaligned access speed probe to speedup boot-time. * b4-shazam-merge: riscv: Set unaligned access speed at compile time riscv: Decouple emulated unaligned accesses from access speed riscv: Only check online cpus for emulated accesses riscv: lib: Introduce has_fast_unaligned_access() Link: https://lore.kernel.org/r/20240308-disable_misaligned_probe_config-v9-0-a388770ba0ce@rivosinc.com Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
This commit is contained in:
commit
2b2ca35467
@ -704,27 +704,61 @@ config THREAD_SIZE_ORDER
|
||||
affects irq stack size, which is equal to thread stack size.
|
||||
|
||||
config RISCV_MISALIGNED
|
||||
bool "Support misaligned load/store traps for kernel and userspace"
|
||||
bool
|
||||
select SYSCTL_ARCH_UNALIGN_ALLOW
|
||||
default y
|
||||
help
|
||||
Say Y here if you want the kernel to embed support for misaligned
|
||||
load/store for both kernel and userspace. When disable, misaligned
|
||||
accesses will generate SIGBUS in userspace and panic in kernel.
|
||||
Embed support for emulating misaligned loads and stores.
|
||||
|
||||
choice
|
||||
prompt "Unaligned Accesses Support"
|
||||
default RISCV_PROBE_UNALIGNED_ACCESS
|
||||
help
|
||||
This determines the level of support for unaligned accesses. This
|
||||
information is used by the kernel to perform optimizations. It is also
|
||||
exposed to user space via the hwprobe syscall. The hardware will be
|
||||
probed at boot by default.
|
||||
|
||||
config RISCV_PROBE_UNALIGNED_ACCESS
|
||||
bool "Probe for hardware unaligned access support"
|
||||
select RISCV_MISALIGNED
|
||||
help
|
||||
During boot, the kernel will run a series of tests to determine the
|
||||
speed of unaligned accesses. This probing will dynamically determine
|
||||
the speed of unaligned accesses on the underlying system. If unaligned
|
||||
memory accesses trap into the kernel as they are not supported by the
|
||||
system, the kernel will emulate the unaligned accesses to preserve the
|
||||
UABI.
|
||||
|
||||
config RISCV_EMULATED_UNALIGNED_ACCESS
|
||||
bool "Emulate unaligned access where system support is missing"
|
||||
select RISCV_MISALIGNED
|
||||
help
|
||||
If unaligned memory accesses trap into the kernel as they are not
|
||||
supported by the system, the kernel will emulate the unaligned
|
||||
accesses to preserve the UABI. When the underlying system does support
|
||||
unaligned accesses, the unaligned accesses are assumed to be slow.
|
||||
|
||||
config RISCV_SLOW_UNALIGNED_ACCESS
|
||||
bool "Assume the system supports slow unaligned memory accesses"
|
||||
depends on NONPORTABLE
|
||||
help
|
||||
Assume that the system supports slow unaligned memory accesses. The
|
||||
kernel and userspace programs may not be able to run at all on systems
|
||||
that do not support unaligned memory accesses.
|
||||
|
||||
config RISCV_EFFICIENT_UNALIGNED_ACCESS
|
||||
bool "Assume the CPU supports fast unaligned memory accesses"
|
||||
bool "Assume the system supports fast unaligned memory accesses"
|
||||
depends on NONPORTABLE
|
||||
select DCACHE_WORD_ACCESS if MMU
|
||||
select HAVE_EFFICIENT_UNALIGNED_ACCESS
|
||||
help
|
||||
Say Y here if you want the kernel to assume that the CPU supports
|
||||
efficient unaligned memory accesses. When enabled, this option
|
||||
improves the performance of the kernel on such CPUs. However, the
|
||||
kernel will run much more slowly, or will not be able to run at all,
|
||||
on CPUs that do not support efficient unaligned memory accesses.
|
||||
Assume that the system supports fast unaligned memory accesses. When
|
||||
enabled, this option improves the performance of the kernel on such
|
||||
systems. However, the kernel and userspace programs will run much more
|
||||
slowly, or will not be able to run at all, on systems that do not
|
||||
support efficient unaligned memory accesses.
|
||||
|
||||
If unsure what to do here, say N.
|
||||
endchoice
|
||||
|
||||
endmenu # "Platform type"
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-only */
|
||||
/*
|
||||
* Copyright 2022-2023 Rivos, Inc
|
||||
* Copyright 2022-2024 Rivos, Inc
|
||||
*/
|
||||
|
||||
#ifndef _ASM_CPUFEATURE_H
|
||||
@ -28,29 +28,38 @@ struct riscv_isainfo {
|
||||
|
||||
DECLARE_PER_CPU(struct riscv_cpuinfo, riscv_cpuinfo);
|
||||
|
||||
DECLARE_PER_CPU(long, misaligned_access_speed);
|
||||
|
||||
/* Per-cpu ISA extensions. */
|
||||
extern struct riscv_isainfo hart_isa[NR_CPUS];
|
||||
|
||||
void riscv_user_isa_enable(void);
|
||||
|
||||
#ifdef CONFIG_RISCV_MISALIGNED
|
||||
bool unaligned_ctl_available(void);
|
||||
bool check_unaligned_access_emulated(int cpu);
|
||||
#if defined(CONFIG_RISCV_MISALIGNED)
|
||||
bool check_unaligned_access_emulated_all_cpus(void);
|
||||
void unaligned_emulation_finish(void);
|
||||
bool unaligned_ctl_available(void);
|
||||
DECLARE_PER_CPU(long, misaligned_access_speed);
|
||||
#else
|
||||
static inline bool unaligned_ctl_available(void)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline bool check_unaligned_access_emulated(int cpu)
|
||||
#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
|
||||
DECLARE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
|
||||
|
||||
static __always_inline bool has_fast_unaligned_accesses(void)
|
||||
{
|
||||
return false;
|
||||
return static_branch_likely(&fast_unaligned_access_speed_key);
|
||||
}
|
||||
#else
|
||||
static __always_inline bool has_fast_unaligned_accesses(void)
|
||||
{
|
||||
if (IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS))
|
||||
return true;
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline void unaligned_emulation_finish(void) {}
|
||||
#endif
|
||||
|
||||
unsigned long riscv_get_elf_hwcap(void);
|
||||
@ -135,6 +144,4 @@ static __always_inline bool riscv_cpu_has_extension_unlikely(int cpu, const unsi
|
||||
return __riscv_isa_extension_available(hart_isa[cpu].isa, ext);
|
||||
}
|
||||
|
||||
DECLARE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);
|
||||
|
||||
#endif
|
||||
|
@ -38,7 +38,6 @@ extra-y += vmlinux.lds
|
||||
obj-y += head.o
|
||||
obj-y += soc.o
|
||||
obj-$(CONFIG_RISCV_ALTERNATIVE) += alternative.o
|
||||
obj-y += copy-unaligned.o
|
||||
obj-y += cpu.o
|
||||
obj-y += cpufeature.o
|
||||
obj-y += entry.o
|
||||
@ -62,6 +61,9 @@ obj-y += tests/
|
||||
obj-$(CONFIG_MMU) += vdso.o vdso/
|
||||
|
||||
obj-$(CONFIG_RISCV_MISALIGNED) += traps_misaligned.o
|
||||
obj-$(CONFIG_RISCV_MISALIGNED) += unaligned_access_speed.o
|
||||
obj-$(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS) += copy-unaligned.o
|
||||
|
||||
obj-$(CONFIG_FPU) += fpu.o
|
||||
obj-$(CONFIG_RISCV_ISA_V) += vector.o
|
||||
obj-$(CONFIG_RISCV_ISA_V) += kernel_mode_vector.o
|
||||
|
@ -11,7 +11,6 @@
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/cpuhotplug.h>
|
||||
#include <linux/ctype.h>
|
||||
#include <linux/jump_label.h>
|
||||
#include <linux/log2.h>
|
||||
#include <linux/memory.h>
|
||||
#include <linux/module.h>
|
||||
@ -21,20 +20,12 @@
|
||||
#include <asm/cacheflush.h>
|
||||
#include <asm/cpufeature.h>
|
||||
#include <asm/hwcap.h>
|
||||
#include <asm/hwprobe.h>
|
||||
#include <asm/patch.h>
|
||||
#include <asm/processor.h>
|
||||
#include <asm/vector.h>
|
||||
|
||||
#include "copy-unaligned.h"
|
||||
|
||||
#define NUM_ALPHA_EXTS ('z' - 'a' + 1)
|
||||
|
||||
#define MISALIGNED_ACCESS_JIFFIES_LG2 1
|
||||
#define MISALIGNED_BUFFER_SIZE 0x4000
|
||||
#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
|
||||
#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
|
||||
|
||||
unsigned long elf_hwcap __read_mostly;
|
||||
|
||||
/* Host ISA bitmap */
|
||||
@ -43,11 +34,6 @@ static DECLARE_BITMAP(riscv_isa, RISCV_ISA_EXT_MAX) __read_mostly;
|
||||
/* Per-cpu ISA extensions. */
|
||||
struct riscv_isainfo hart_isa[NR_CPUS];
|
||||
|
||||
/* Performance information */
|
||||
DEFINE_PER_CPU(long, misaligned_access_speed);
|
||||
|
||||
static cpumask_t fast_misaligned_access;
|
||||
|
||||
/**
|
||||
* riscv_isa_extension_base() - Get base extension word
|
||||
*
|
||||
@ -707,247 +693,6 @@ unsigned long riscv_get_elf_hwcap(void)
|
||||
return hwcap;
|
||||
}
|
||||
|
||||
static int check_unaligned_access(void *param)
|
||||
{
|
||||
int cpu = smp_processor_id();
|
||||
u64 start_cycles, end_cycles;
|
||||
u64 word_cycles;
|
||||
u64 byte_cycles;
|
||||
int ratio;
|
||||
unsigned long start_jiffies, now;
|
||||
struct page *page = param;
|
||||
void *dst;
|
||||
void *src;
|
||||
long speed = RISCV_HWPROBE_MISALIGNED_SLOW;
|
||||
|
||||
if (check_unaligned_access_emulated(cpu))
|
||||
return 0;
|
||||
|
||||
/* Make an unaligned destination buffer. */
|
||||
dst = (void *)((unsigned long)page_address(page) | 0x1);
|
||||
/* Unalign src as well, but differently (off by 1 + 2 = 3). */
|
||||
src = dst + (MISALIGNED_BUFFER_SIZE / 2);
|
||||
src += 2;
|
||||
word_cycles = -1ULL;
|
||||
/* Do a warmup. */
|
||||
__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
|
||||
preempt_disable();
|
||||
start_jiffies = jiffies;
|
||||
while ((now = jiffies) == start_jiffies)
|
||||
cpu_relax();
|
||||
|
||||
/*
|
||||
* For a fixed amount of time, repeatedly try the function, and take
|
||||
* the best time in cycles as the measurement.
|
||||
*/
|
||||
while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
|
||||
start_cycles = get_cycles64();
|
||||
/* Ensure the CSR read can't reorder WRT to the copy. */
|
||||
mb();
|
||||
__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
|
||||
/* Ensure the copy ends before the end time is snapped. */
|
||||
mb();
|
||||
end_cycles = get_cycles64();
|
||||
if ((end_cycles - start_cycles) < word_cycles)
|
||||
word_cycles = end_cycles - start_cycles;
|
||||
}
|
||||
|
||||
byte_cycles = -1ULL;
|
||||
__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
|
||||
start_jiffies = jiffies;
|
||||
while ((now = jiffies) == start_jiffies)
|
||||
cpu_relax();
|
||||
|
||||
while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
|
||||
start_cycles = get_cycles64();
|
||||
mb();
|
||||
__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
|
||||
mb();
|
||||
end_cycles = get_cycles64();
|
||||
if ((end_cycles - start_cycles) < byte_cycles)
|
||||
byte_cycles = end_cycles - start_cycles;
|
||||
}
|
||||
|
||||
preempt_enable();
|
||||
|
||||
/* Don't divide by zero. */
|
||||
if (!word_cycles || !byte_cycles) {
|
||||
pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
|
||||
cpu);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (word_cycles < byte_cycles)
|
||||
speed = RISCV_HWPROBE_MISALIGNED_FAST;
|
||||
|
||||
ratio = div_u64((byte_cycles * 100), word_cycles);
|
||||
pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
|
||||
cpu,
|
||||
ratio / 100,
|
||||
ratio % 100,
|
||||
(speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow");
|
||||
|
||||
per_cpu(misaligned_access_speed, cpu) = speed;
|
||||
|
||||
/*
|
||||
* Set the value of fast_misaligned_access of a CPU. These operations
|
||||
* are atomic to avoid race conditions.
|
||||
*/
|
||||
if (speed == RISCV_HWPROBE_MISALIGNED_FAST)
|
||||
cpumask_set_cpu(cpu, &fast_misaligned_access);
|
||||
else
|
||||
cpumask_clear_cpu(cpu, &fast_misaligned_access);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void check_unaligned_access_nonboot_cpu(void *param)
|
||||
{
|
||||
unsigned int cpu = smp_processor_id();
|
||||
struct page **pages = param;
|
||||
|
||||
if (smp_processor_id() != 0)
|
||||
check_unaligned_access(pages[cpu]);
|
||||
}
|
||||
|
||||
DEFINE_STATIC_KEY_FALSE(fast_misaligned_access_speed_key);
|
||||
|
||||
static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
|
||||
{
|
||||
if (cpumask_weight(mask) == weight)
|
||||
static_branch_enable_cpuslocked(&fast_misaligned_access_speed_key);
|
||||
else
|
||||
static_branch_disable_cpuslocked(&fast_misaligned_access_speed_key);
|
||||
}
|
||||
|
||||
static void set_unaligned_access_static_branches_except_cpu(int cpu)
|
||||
{
|
||||
/*
|
||||
* Same as set_unaligned_access_static_branches, except excludes the
|
||||
* given CPU from the result. When a CPU is hotplugged into an offline
|
||||
* state, this function is called before the CPU is set to offline in
|
||||
* the cpumask, and thus the CPU needs to be explicitly excluded.
|
||||
*/
|
||||
|
||||
cpumask_t fast_except_me;
|
||||
|
||||
cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
|
||||
cpumask_clear_cpu(cpu, &fast_except_me);
|
||||
|
||||
modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
|
||||
}
|
||||
|
||||
static void set_unaligned_access_static_branches(void)
|
||||
{
|
||||
/*
|
||||
* This will be called after check_unaligned_access_all_cpus so the
|
||||
* result of unaligned access speed for all CPUs will be available.
|
||||
*
|
||||
* To avoid the number of online cpus changing between reading
|
||||
* cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
|
||||
* held before calling this function.
|
||||
*/
|
||||
|
||||
cpumask_t fast_and_online;
|
||||
|
||||
cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);
|
||||
|
||||
modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
|
||||
}
|
||||
|
||||
static int lock_and_set_unaligned_access_static_branch(void)
|
||||
{
|
||||
cpus_read_lock();
|
||||
set_unaligned_access_static_branches();
|
||||
cpus_read_unlock();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
|
||||
|
||||
static int riscv_online_cpu(unsigned int cpu)
|
||||
{
|
||||
static struct page *buf;
|
||||
|
||||
/* We are already set since the last check */
|
||||
if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
|
||||
goto exit;
|
||||
|
||||
buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
|
||||
if (!buf) {
|
||||
pr_warn("Allocation failure, not measuring misaligned performance\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
check_unaligned_access(buf);
|
||||
__free_pages(buf, MISALIGNED_BUFFER_ORDER);
|
||||
|
||||
exit:
|
||||
set_unaligned_access_static_branches();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int riscv_offline_cpu(unsigned int cpu)
|
||||
{
|
||||
set_unaligned_access_static_branches_except_cpu(cpu);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Measure unaligned access on all CPUs present at boot in parallel. */
|
||||
static int check_unaligned_access_all_cpus(void)
|
||||
{
|
||||
unsigned int cpu;
|
||||
unsigned int cpu_count = num_possible_cpus();
|
||||
struct page **bufs = kzalloc(cpu_count * sizeof(struct page *),
|
||||
GFP_KERNEL);
|
||||
|
||||
if (!bufs) {
|
||||
pr_warn("Allocation failure, not measuring misaligned performance\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate separate buffers for each CPU so there's no fighting over
|
||||
* cache lines.
|
||||
*/
|
||||
for_each_cpu(cpu, cpu_online_mask) {
|
||||
bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
|
||||
if (!bufs[cpu]) {
|
||||
pr_warn("Allocation failure, not measuring misaligned performance\n");
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
/* Check everybody except 0, who stays behind to tend jiffies. */
|
||||
on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
|
||||
|
||||
/* Check core 0. */
|
||||
smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
|
||||
|
||||
/*
|
||||
* Setup hotplug callbacks for any new CPUs that come online or go
|
||||
* offline.
|
||||
*/
|
||||
cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
|
||||
riscv_online_cpu, riscv_offline_cpu);
|
||||
|
||||
out:
|
||||
unaligned_emulation_finish();
|
||||
for_each_cpu(cpu, cpu_online_mask) {
|
||||
if (bufs[cpu])
|
||||
__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
|
||||
}
|
||||
|
||||
kfree(bufs);
|
||||
return 0;
|
||||
}
|
||||
|
||||
arch_initcall(check_unaligned_access_all_cpus);
|
||||
|
||||
void riscv_user_isa_enable(void)
|
||||
{
|
||||
if (riscv_cpu_has_extension_unlikely(smp_processor_id(), RISCV_ISA_EXT_ZICBOZ))
|
||||
|
@ -147,6 +147,7 @@ static bool hwprobe_ext0_has(const struct cpumask *cpus, unsigned long ext)
|
||||
return (pair.value & ext);
|
||||
}
|
||||
|
||||
#if defined(CONFIG_RISCV_PROBE_UNALIGNED_ACCESS)
|
||||
static u64 hwprobe_misaligned(const struct cpumask *cpus)
|
||||
{
|
||||
int cpu;
|
||||
@ -169,6 +170,18 @@ static u64 hwprobe_misaligned(const struct cpumask *cpus)
|
||||
|
||||
return perf;
|
||||
}
|
||||
#else
|
||||
static u64 hwprobe_misaligned(const struct cpumask *cpus)
|
||||
{
|
||||
if (IS_ENABLED(CONFIG_RISCV_EFFICIENT_UNALIGNED_ACCESS))
|
||||
return RISCV_HWPROBE_MISALIGNED_FAST;
|
||||
|
||||
if (IS_ENABLED(CONFIG_RISCV_EMULATED_UNALIGNED_ACCESS) && unaligned_ctl_available())
|
||||
return RISCV_HWPROBE_MISALIGNED_EMULATED;
|
||||
|
||||
return RISCV_HWPROBE_MISALIGNED_SLOW;
|
||||
}
|
||||
#endif
|
||||
|
||||
static void hwprobe_one_pair(struct riscv_hwprobe *pair,
|
||||
const struct cpumask *cpus)
|
||||
|
@ -413,7 +413,9 @@ int handle_misaligned_load(struct pt_regs *regs)
|
||||
|
||||
perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, addr);
|
||||
|
||||
#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
|
||||
*this_cpu_ptr(&misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_EMULATED;
|
||||
#endif
|
||||
|
||||
if (!unaligned_enabled)
|
||||
return -1;
|
||||
@ -596,7 +598,7 @@ int handle_misaligned_store(struct pt_regs *regs)
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool check_unaligned_access_emulated(int cpu)
|
||||
static bool check_unaligned_access_emulated(int cpu)
|
||||
{
|
||||
long *mas_ptr = per_cpu_ptr(&misaligned_access_speed, cpu);
|
||||
unsigned long tmp_var, tmp_val;
|
||||
@ -623,7 +625,7 @@ bool check_unaligned_access_emulated(int cpu)
|
||||
return misaligned_emu_detected;
|
||||
}
|
||||
|
||||
void unaligned_emulation_finish(void)
|
||||
bool check_unaligned_access_emulated_all_cpus(void)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
@ -632,13 +634,12 @@ void unaligned_emulation_finish(void)
|
||||
* accesses emulated since tasks requesting such control can run on any
|
||||
* CPU.
|
||||
*/
|
||||
for_each_present_cpu(cpu) {
|
||||
if (per_cpu(misaligned_access_speed, cpu) !=
|
||||
RISCV_HWPROBE_MISALIGNED_EMULATED) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
for_each_online_cpu(cpu)
|
||||
if (!check_unaligned_access_emulated(cpu))
|
||||
return false;
|
||||
|
||||
unaligned_ctl = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool unaligned_ctl_available(void)
|
||||
|
282
arch/riscv/kernel/unaligned_access_speed.c
Normal file
282
arch/riscv/kernel/unaligned_access_speed.c
Normal file
@ -0,0 +1,282 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-only
|
||||
/*
|
||||
* Copyright 2024 Rivos Inc.
|
||||
*/
|
||||
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/cpumask.h>
|
||||
#include <linux/jump_label.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/smp.h>
|
||||
#include <linux/types.h>
|
||||
#include <asm/cpufeature.h>
|
||||
#include <asm/hwprobe.h>
|
||||
|
||||
#include "copy-unaligned.h"
|
||||
|
||||
#define MISALIGNED_ACCESS_JIFFIES_LG2 1
|
||||
#define MISALIGNED_BUFFER_SIZE 0x4000
|
||||
#define MISALIGNED_BUFFER_ORDER get_order(MISALIGNED_BUFFER_SIZE)
|
||||
#define MISALIGNED_COPY_SIZE ((MISALIGNED_BUFFER_SIZE / 2) - 0x80)
|
||||
|
||||
DEFINE_PER_CPU(long, misaligned_access_speed);
|
||||
|
||||
#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
|
||||
static cpumask_t fast_misaligned_access;
|
||||
static int check_unaligned_access(void *param)
|
||||
{
|
||||
int cpu = smp_processor_id();
|
||||
u64 start_cycles, end_cycles;
|
||||
u64 word_cycles;
|
||||
u64 byte_cycles;
|
||||
int ratio;
|
||||
unsigned long start_jiffies, now;
|
||||
struct page *page = param;
|
||||
void *dst;
|
||||
void *src;
|
||||
long speed = RISCV_HWPROBE_MISALIGNED_SLOW;
|
||||
|
||||
if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
|
||||
return 0;
|
||||
|
||||
/* Make an unaligned destination buffer. */
|
||||
dst = (void *)((unsigned long)page_address(page) | 0x1);
|
||||
/* Unalign src as well, but differently (off by 1 + 2 = 3). */
|
||||
src = dst + (MISALIGNED_BUFFER_SIZE / 2);
|
||||
src += 2;
|
||||
word_cycles = -1ULL;
|
||||
/* Do a warmup. */
|
||||
__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
|
||||
preempt_disable();
|
||||
start_jiffies = jiffies;
|
||||
while ((now = jiffies) == start_jiffies)
|
||||
cpu_relax();
|
||||
|
||||
/*
|
||||
* For a fixed amount of time, repeatedly try the function, and take
|
||||
* the best time in cycles as the measurement.
|
||||
*/
|
||||
while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
|
||||
start_cycles = get_cycles64();
|
||||
/* Ensure the CSR read can't reorder WRT to the copy. */
|
||||
mb();
|
||||
__riscv_copy_words_unaligned(dst, src, MISALIGNED_COPY_SIZE);
|
||||
/* Ensure the copy ends before the end time is snapped. */
|
||||
mb();
|
||||
end_cycles = get_cycles64();
|
||||
if ((end_cycles - start_cycles) < word_cycles)
|
||||
word_cycles = end_cycles - start_cycles;
|
||||
}
|
||||
|
||||
byte_cycles = -1ULL;
|
||||
__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
|
||||
start_jiffies = jiffies;
|
||||
while ((now = jiffies) == start_jiffies)
|
||||
cpu_relax();
|
||||
|
||||
while (time_before(jiffies, now + (1 << MISALIGNED_ACCESS_JIFFIES_LG2))) {
|
||||
start_cycles = get_cycles64();
|
||||
mb();
|
||||
__riscv_copy_bytes_unaligned(dst, src, MISALIGNED_COPY_SIZE);
|
||||
mb();
|
||||
end_cycles = get_cycles64();
|
||||
if ((end_cycles - start_cycles) < byte_cycles)
|
||||
byte_cycles = end_cycles - start_cycles;
|
||||
}
|
||||
|
||||
preempt_enable();
|
||||
|
||||
/* Don't divide by zero. */
|
||||
if (!word_cycles || !byte_cycles) {
|
||||
pr_warn("cpu%d: rdtime lacks granularity needed to measure unaligned access speed\n",
|
||||
cpu);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (word_cycles < byte_cycles)
|
||||
speed = RISCV_HWPROBE_MISALIGNED_FAST;
|
||||
|
||||
ratio = div_u64((byte_cycles * 100), word_cycles);
|
||||
pr_info("cpu%d: Ratio of byte access time to unaligned word access is %d.%02d, unaligned accesses are %s\n",
|
||||
cpu,
|
||||
ratio / 100,
|
||||
ratio % 100,
|
||||
(speed == RISCV_HWPROBE_MISALIGNED_FAST) ? "fast" : "slow");
|
||||
|
||||
per_cpu(misaligned_access_speed, cpu) = speed;
|
||||
|
||||
/*
|
||||
* Set the value of fast_misaligned_access of a CPU. These operations
|
||||
* are atomic to avoid race conditions.
|
||||
*/
|
||||
if (speed == RISCV_HWPROBE_MISALIGNED_FAST)
|
||||
cpumask_set_cpu(cpu, &fast_misaligned_access);
|
||||
else
|
||||
cpumask_clear_cpu(cpu, &fast_misaligned_access);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void check_unaligned_access_nonboot_cpu(void *param)
|
||||
{
|
||||
unsigned int cpu = smp_processor_id();
|
||||
struct page **pages = param;
|
||||
|
||||
if (smp_processor_id() != 0)
|
||||
check_unaligned_access(pages[cpu]);
|
||||
}
|
||||
|
||||
DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
|
||||
|
||||
static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
|
||||
{
|
||||
if (cpumask_weight(mask) == weight)
|
||||
static_branch_enable_cpuslocked(&fast_unaligned_access_speed_key);
|
||||
else
|
||||
static_branch_disable_cpuslocked(&fast_unaligned_access_speed_key);
|
||||
}
|
||||
|
||||
static void set_unaligned_access_static_branches_except_cpu(int cpu)
|
||||
{
|
||||
/*
|
||||
* Same as set_unaligned_access_static_branches, except excludes the
|
||||
* given CPU from the result. When a CPU is hotplugged into an offline
|
||||
* state, this function is called before the CPU is set to offline in
|
||||
* the cpumask, and thus the CPU needs to be explicitly excluded.
|
||||
*/
|
||||
|
||||
cpumask_t fast_except_me;
|
||||
|
||||
cpumask_and(&fast_except_me, &fast_misaligned_access, cpu_online_mask);
|
||||
cpumask_clear_cpu(cpu, &fast_except_me);
|
||||
|
||||
modify_unaligned_access_branches(&fast_except_me, num_online_cpus() - 1);
|
||||
}
|
||||
|
||||
static void set_unaligned_access_static_branches(void)
|
||||
{
|
||||
/*
|
||||
* This will be called after check_unaligned_access_all_cpus so the
|
||||
* result of unaligned access speed for all CPUs will be available.
|
||||
*
|
||||
* To avoid the number of online cpus changing between reading
|
||||
* cpu_online_mask and calling num_online_cpus, cpus_read_lock must be
|
||||
* held before calling this function.
|
||||
*/
|
||||
|
||||
cpumask_t fast_and_online;
|
||||
|
||||
cpumask_and(&fast_and_online, &fast_misaligned_access, cpu_online_mask);
|
||||
|
||||
modify_unaligned_access_branches(&fast_and_online, num_online_cpus());
|
||||
}
|
||||
|
||||
static int lock_and_set_unaligned_access_static_branch(void)
|
||||
{
|
||||
cpus_read_lock();
|
||||
set_unaligned_access_static_branches();
|
||||
cpus_read_unlock();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
|
||||
|
||||
static int riscv_online_cpu(unsigned int cpu)
|
||||
{
|
||||
static struct page *buf;
|
||||
|
||||
/* We are already set since the last check */
|
||||
if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_UNKNOWN)
|
||||
goto exit;
|
||||
|
||||
buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
|
||||
if (!buf) {
|
||||
pr_warn("Allocation failure, not measuring misaligned performance\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
check_unaligned_access(buf);
|
||||
__free_pages(buf, MISALIGNED_BUFFER_ORDER);
|
||||
|
||||
exit:
|
||||
set_unaligned_access_static_branches();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int riscv_offline_cpu(unsigned int cpu)
|
||||
{
|
||||
set_unaligned_access_static_branches_except_cpu(cpu);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Measure unaligned access speed on all CPUs present at boot in parallel. */
|
||||
static int check_unaligned_access_speed_all_cpus(void)
|
||||
{
|
||||
unsigned int cpu;
|
||||
unsigned int cpu_count = num_possible_cpus();
|
||||
struct page **bufs = kzalloc(cpu_count * sizeof(struct page *),
|
||||
GFP_KERNEL);
|
||||
|
||||
if (!bufs) {
|
||||
pr_warn("Allocation failure, not measuring misaligned performance\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Allocate separate buffers for each CPU so there's no fighting over
|
||||
* cache lines.
|
||||
*/
|
||||
for_each_cpu(cpu, cpu_online_mask) {
|
||||
bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
|
||||
if (!bufs[cpu]) {
|
||||
pr_warn("Allocation failure, not measuring misaligned performance\n");
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
/* Check everybody except 0, who stays behind to tend jiffies. */
|
||||
on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
|
||||
|
||||
/* Check core 0. */
|
||||
smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
|
||||
|
||||
/*
|
||||
* Setup hotplug callbacks for any new CPUs that come online or go
|
||||
* offline.
|
||||
*/
|
||||
cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
|
||||
riscv_online_cpu, riscv_offline_cpu);
|
||||
|
||||
out:
|
||||
for_each_cpu(cpu, cpu_online_mask) {
|
||||
if (bufs[cpu])
|
||||
__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
|
||||
}
|
||||
|
||||
kfree(bufs);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int check_unaligned_access_all_cpus(void)
|
||||
{
|
||||
bool all_cpus_emulated = check_unaligned_access_emulated_all_cpus();
|
||||
|
||||
if (!all_cpus_emulated)
|
||||
return check_unaligned_access_speed_all_cpus();
|
||||
|
||||
return 0;
|
||||
}
|
||||
#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
|
||||
static int check_unaligned_access_all_cpus(void)
|
||||
{
|
||||
check_unaligned_access_emulated_all_cpus();
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
arch_initcall(check_unaligned_access_all_cpus);
|
@ -3,7 +3,7 @@
|
||||
* Checksum library
|
||||
*
|
||||
* Influenced by arch/arm64/lib/csum.c
|
||||
* Copyright (C) 2023 Rivos Inc.
|
||||
* Copyright (C) 2023-2024 Rivos Inc.
|
||||
*/
|
||||
#include <linux/bitops.h>
|
||||
#include <linux/compiler.h>
|
||||
@ -318,10 +318,7 @@ unsigned int do_csum(const unsigned char *buff, int len)
|
||||
* branches. The largest chunk of overlap was delegated into the
|
||||
* do_csum_common function.
|
||||
*/
|
||||
if (static_branch_likely(&fast_misaligned_access_speed_key))
|
||||
return do_csum_no_alignment(buff, len);
|
||||
|
||||
if (((unsigned long)buff & OFFSET_MASK) == 0)
|
||||
if (has_fast_unaligned_accesses() || (((unsigned long)buff & OFFSET_MASK) == 0))
|
||||
return do_csum_no_alignment(buff, len);
|
||||
|
||||
return do_csum_with_alignment(buff, len);
|
||||
|
Loading…
Reference in New Issue
Block a user