Merge branch 'x86/entry' into ras/core

to fixup conflicts in arch/x86/kernel/cpu/mce/core.c so MCE specific follow
up patches can be applied without creating a horrible merge conflict
afterwards.
This commit is contained in:
Thomas Gleixner
2020-06-11 15:17:57 +02:00
15151 changed files with 949470 additions and 411984 deletions

View File

@@ -90,7 +90,6 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o
obj-y += apic/
obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
obj-$(CONFIG_LIVEPATCH) += livepatch.o
obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o
obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o
obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o
@@ -102,9 +101,7 @@ obj-$(CONFIG_KEXEC_FILE) += kexec-bzimage64.o
obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
obj-y += kprobes/
obj-$(CONFIG_MODULES) += module.o
ifeq ($(CONFIG_X86_32),y)
obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
endif
obj-$(CONFIG_X86_32) += doublefault_32.o
obj-$(CONFIG_KGDB) += kgdb.o
obj-$(CONFIG_VM86) += vm86_32.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o

View File

@@ -20,11 +20,11 @@
#include <linux/pci.h>
#include <linux/efi-bgrt.h>
#include <linux/serial_core.h>
#include <linux/pgtable.h>
#include <asm/e820/api.h>
#include <asm/irqdomain.h>
#include <asm/pci_x86.h>
#include <asm/pgtable.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
#include <asm/io.h>

View File

@@ -10,9 +10,9 @@
#include <linux/memblock.h>
#include <linux/dmi.h>
#include <linux/cpumask.h>
#include <linux/pgtable.h>
#include <asm/segment.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/realmode.h>

View File

@@ -18,7 +18,6 @@
#include <asm/text-patching.h>
#include <asm/alternative.h>
#include <asm/sections.h>
#include <asm/pgtable.h>
#include <asm/mce.h>
#include <asm/nmi.h>
#include <asm/cacheflush.h>
@@ -783,6 +782,61 @@ void __init_or_module text_poke_early(void *addr, const void *opcode,
}
}
typedef struct {
struct mm_struct *mm;
} temp_mm_state_t;
/*
* Using a temporary mm allows to set temporary mappings that are not accessible
* by other CPUs. Such mappings are needed to perform sensitive memory writes
* that override the kernel memory protections (e.g., W^X), without exposing the
* temporary page-table mappings that are required for these write operations to
* other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
* mapping is torn down.
*
* Context: The temporary mm needs to be used exclusively by a single core. To
* harden security IRQs must be disabled while the temporary mm is
* loaded, thereby preventing interrupt handler bugs from overriding
* the kernel memory protection.
*/
static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
{
temp_mm_state_t temp_state;
lockdep_assert_irqs_disabled();
temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
switch_mm_irqs_off(NULL, mm, current);
/*
* If breakpoints are enabled, disable them while the temporary mm is
* used. Userspace might set up watchpoints on addresses that are used
* in the temporary mm, which would lead to wrong signals being sent or
* crashes.
*
* Note that breakpoints are not disabled selectively, which also causes
* kernel breakpoints (e.g., perf's) to be disabled. This might be
* undesirable, but still seems reasonable as the code that runs in the
* temporary mm should be short.
*/
if (hw_breakpoint_active())
hw_breakpoint_disable();
return temp_state;
}
static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
{
lockdep_assert_irqs_disabled();
switch_mm_irqs_off(NULL, prev_state.mm, current);
/*
* Restore the breakpoints if they were disabled before the temporary mm
* was loaded.
*/
if (hw_breakpoint_active())
hw_breakpoint_restore();
}
__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;
@@ -957,28 +1011,29 @@ struct bp_patching_desc {
static struct bp_patching_desc *bp_desc;
static inline struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
static __always_inline
struct bp_patching_desc *try_get_desc(struct bp_patching_desc **descp)
{
struct bp_patching_desc *desc = READ_ONCE(*descp); /* rcu_dereference */
struct bp_patching_desc *desc = __READ_ONCE(*descp); /* rcu_dereference */
if (!desc || !atomic_inc_not_zero(&desc->refs))
if (!desc || !arch_atomic_inc_not_zero(&desc->refs))
return NULL;
return desc;
}
static inline void put_desc(struct bp_patching_desc *desc)
static __always_inline void put_desc(struct bp_patching_desc *desc)
{
smp_mb__before_atomic();
atomic_dec(&desc->refs);
arch_atomic_dec(&desc->refs);
}
static inline void *text_poke_addr(struct text_poke_loc *tp)
static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
{
return _stext + tp->rel_addr;
}
static int notrace patch_cmp(const void *key, const void *elt)
static __always_inline int patch_cmp(const void *key, const void *elt)
{
struct text_poke_loc *tp = (struct text_poke_loc *) elt;
@@ -988,9 +1043,8 @@ static int notrace patch_cmp(const void *key, const void *elt)
return 1;
return 0;
}
NOKPROBE_SYMBOL(patch_cmp);
int notrace poke_int3_handler(struct pt_regs *regs)
int noinstr poke_int3_handler(struct pt_regs *regs)
{
struct bp_patching_desc *desc;
struct text_poke_loc *tp;
@@ -1023,9 +1077,9 @@ int notrace poke_int3_handler(struct pt_regs *regs)
* Skip the binary search if there is a single member in the vector.
*/
if (unlikely(desc->nr_entries > 1)) {
tp = bsearch(ip, desc->vec, desc->nr_entries,
sizeof(struct text_poke_loc),
patch_cmp);
tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
sizeof(struct text_poke_loc),
patch_cmp);
if (!tp)
goto out_put;
} else {
@@ -1064,7 +1118,6 @@ out_put:
put_desc(desc);
return ret;
}
NOKPROBE_SYMBOL(poke_int3_handler);
#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
static struct text_poke_loc tp_vec[TP_VEC_MAX];

View File

@@ -33,7 +33,6 @@
#include <linux/atomic.h>
#include <linux/dma-direct.h>
#include <asm/mtrr.h>
#include <asm/pgtable.h>
#include <asm/proto.h>
#include <asm/iommu.h>
#include <asm/gart.h>
@@ -159,7 +158,7 @@ static void dump_leak(void)
return;
dump = 1;
show_stack(NULL, NULL);
show_stack(NULL, NULL, KERN_ERR);
debug_dma_dump_mappings(NULL);
}
#endif

View File

@@ -345,56 +345,3 @@ out_noapbt:
apb_timer_block_enabled = 0;
panic("failed to enable APB timer\n");
}
/* called before apb_timer_enable, use early map */
unsigned long apbt_quick_calibrate(void)
{
int i, scale;
u64 old, new;
u64 t1, t2;
unsigned long khz = 0;
u32 loop, shift;
apbt_set_mapping();
dw_apb_clocksource_start(clocksource_apbt);
/* check if the timer can count down, otherwise return */
old = dw_apb_clocksource_read(clocksource_apbt);
i = 10000;
while (--i) {
if (old != dw_apb_clocksource_read(clocksource_apbt))
break;
}
if (!i)
goto failed;
/* count 16 ms */
loop = (apbt_freq / 1000) << 4;
/* restart the timer to ensure it won't get to 0 in the calibration */
dw_apb_clocksource_start(clocksource_apbt);
old = dw_apb_clocksource_read(clocksource_apbt);
old += loop;
t1 = rdtsc();
do {
new = dw_apb_clocksource_read(clocksource_apbt);
} while (new < old);
t2 = rdtsc();
shift = 5;
if (unlikely(loop >> shift == 0)) {
printk(KERN_INFO
"APBT TSC calibration failed, not enough resolution\n");
return 0;
}
scale = (int)div_u64((t2 - t1), loop >> shift);
khz = (scale * (apbt_freq / 1000)) >> shift;
printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
return khz;
failed:
return 0;
}

View File

@@ -352,8 +352,6 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
* According to Intel, MFENCE can do the serialization here.
*/
asm volatile("mfence" : : : "memory");
printk_once(KERN_DEBUG "TSC deadline timer enabled\n");
return;
}
@@ -546,46 +544,20 @@ static struct clock_event_device lapic_clockevent = {
};
static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
static u32 hsx_deadline_rev(void)
{
switch (boot_cpu_data.x86_stepping) {
case 0x02: return 0x3a; /* EP */
case 0x04: return 0x0f; /* EX */
}
static const struct x86_cpu_id deadline_match[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x2, 0x2), 0x3a), /* EP */
X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(HASWELL_X, X86_STEPPINGS(0x4, 0x4), 0x0f), /* EX */
return ~0U;
}
static u32 bdx_deadline_rev(void)
{
switch (boot_cpu_data.x86_stepping) {
case 0x02: return 0x00000011;
case 0x03: return 0x0700000e;
case 0x04: return 0x0f00000c;
case 0x05: return 0x0e000003;
}
return ~0U;
}
static u32 skx_deadline_rev(void)
{
switch (boot_cpu_data.x86_stepping) {
case 0x03: return 0x01000136;
case 0x04: return 0x02000014;
}
if (boot_cpu_data.x86_stepping > 4)
return 0;
return ~0U;
}
static const struct x86_cpu_id deadline_match[] = {
X86_MATCH_INTEL_FAM6_MODEL( HASWELL_X, &hsx_deadline_rev),
X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_X, 0x0b000020),
X86_MATCH_INTEL_FAM6_MODEL( BROADWELL_D, &bdx_deadline_rev),
X86_MATCH_INTEL_FAM6_MODEL( SKYLAKE_X, &skx_deadline_rev),
X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x2, 0x2), 0x00000011),
X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x3, 0x3), 0x0700000e),
X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x4, 0x4), 0x0f00000c),
X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(BROADWELL_D, X86_STEPPINGS(0x5, 0x5), 0x0e000003),
X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x3, 0x3), 0x01000136),
X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x4, 0x4), 0x02000014),
X86_MATCH_INTEL_FAM6_MODEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x5, 0xf), 0),
X86_MATCH_INTEL_FAM6_MODEL( HASWELL, 0x22),
X86_MATCH_INTEL_FAM6_MODEL( HASWELL_L, 0x20),
@@ -603,34 +575,29 @@ static const struct x86_cpu_id deadline_match[] = {
{},
};
static void apic_check_deadline_errata(void)
static __init bool apic_validate_deadline_timer(void)
{
const struct x86_cpu_id *m;
u32 rev;
if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) ||
boot_cpu_has(X86_FEATURE_HYPERVISOR))
return;
if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER))
return false;
if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
return true;
m = x86_match_cpu(deadline_match);
if (!m)
return;
return true;
/*
* Function pointers will have the MSB set due to address layout,
* immediate revisions will not.
*/
if ((long)m->driver_data < 0)
rev = ((u32 (*)(void))(m->driver_data))();
else
rev = (u32)m->driver_data;
rev = (u32)m->driver_data;
if (boot_cpu_data.microcode >= rev)
return;
return true;
setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER);
pr_err(FW_BUG "TSC_DEADLINE disabled due to Errata; "
"please update microcode to version: 0x%x (or later)\n", rev);
return false;
}
/*
@@ -1121,23 +1088,14 @@ static void local_apic_timer_interrupt(void)
* [ if a single-CPU system runs an SMP kernel then we call the local
* interrupt as well. Thus we cannot inline the local irq ... ]
*/
__visible void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs)
DEFINE_IDTENTRY_SYSVEC(sysvec_apic_timer_interrupt)
{
struct pt_regs *old_regs = set_irq_regs(regs);
/*
* NOTE! We'd better ACK the irq immediately,
* because timer handling can be slow.
*
* update_process_times() expects us to have done irq_enter().
* Besides, if we don't timer interrupts ignore the global
* interrupt lock, which is the WrongThing (tm) to do.
*/
entering_ack_irq();
ack_APIC_irq();
trace_local_timer_entry(LOCAL_TIMER_VECTOR);
local_apic_timer_interrupt();
trace_local_timer_exit(LOCAL_TIMER_VECTOR);
exiting_irq();
set_irq_regs(old_regs);
}
@@ -2092,7 +2050,8 @@ void __init init_apic_mappings(void)
{
unsigned int new_apicid;
apic_check_deadline_errata();
if (apic_validate_deadline_timer())
pr_debug("TSC deadline timer available\n");
if (x2apic_mode) {
boot_cpu_physical_apicid = read_apic_id();
@@ -2152,15 +2111,21 @@ void __init register_lapic_address(unsigned long address)
* Local APIC interrupts
*/
/*
* This interrupt should _never_ happen with our APIC/SMP architecture
/**
* spurious_interrupt - Catch all for interrupts raised on unused vectors
* @regs: Pointer to pt_regs on stack
* @vector: The vector number
*
* This is invoked from ASM entry code to catch all interrupts which
* trigger on an entry which is routed to the common_spurious idtentry
* point.
*
* Also called from sysvec_spurious_apic_interrupt().
*/
__visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
DEFINE_IDTENTRY_IRQ(spurious_interrupt)
{
u8 vector = ~regs->orig_ax;
u32 v;
entering_irq();
trace_spurious_apic_entry(vector);
inc_irq_stat(irq_spurious_count);
@@ -2190,13 +2155,17 @@ __visible void __irq_entry smp_spurious_interrupt(struct pt_regs *regs)
}
out:
trace_spurious_apic_exit(vector);
exiting_irq();
}
DEFINE_IDTENTRY_SYSVEC(sysvec_spurious_apic_interrupt)
{
__spurious_interrupt(regs, SPURIOUS_APIC_VECTOR);
}
/*
* This interrupt should never happen with our APIC/SMP architecture
*/
__visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
DEFINE_IDTENTRY_SYSVEC(sysvec_error_interrupt)
{
static const char * const error_interrupt_reason[] = {
"Send CS error", /* APIC Error Bit 0 */
@@ -2210,7 +2179,6 @@ __visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
};
u32 v, i = 0;
entering_irq();
trace_error_apic_entry(ERROR_APIC_VECTOR);
/* First tickle the hardware, only then report what went on. -- REW */
@@ -2234,7 +2202,6 @@ __visible void __irq_entry smp_error_interrupt(struct pt_regs *regs)
apic_printk(APIC_DEBUG, KERN_CONT "\n");
trace_error_apic_exit(ERROR_APIC_VECTOR);
exiting_irq();
}
/**

View File

@@ -12,11 +12,11 @@
*/
#include <linux/types.h>
#include <linux/init.h>
#include <linux/pgtable.h>
#include <asm/numachip/numachip.h>
#include <asm/numachip/numachip_csr.h>
#include <asm/pgtable.h>
#include "local.h"

View File

@@ -154,19 +154,6 @@ static inline bool mp_is_legacy_irq(int irq)
return irq >= 0 && irq < nr_legacy_irqs();
}
/*
* Initialize all legacy IRQs and all pins on the first IOAPIC
* if we have legacy interrupt controller. Kernel boot option "pirq="
* may rely on non-legacy pins on the first IOAPIC.
*/
static inline int mp_init_irq_at_boot(int ioapic, int irq)
{
if (!nr_legacy_irqs())
return 0;
return ioapic == 0 || mp_is_legacy_irq(irq);
}
static inline struct irq_domain *mp_ioapic_irqdomain(int ioapic)
{
return ioapics[ioapic].irqdomain;

View File

@@ -115,7 +115,8 @@ msi_set_affinity(struct irq_data *irqd, const struct cpumask *mask, bool force)
* denote it as spurious which is no harm as this is a rare event
* and interrupt handlers have to cope with spurious interrupts
* anyway. If the vector is unused, then it is marked so it won't
* trigger the 'No irq handler for vector' warning in do_IRQ().
* trigger the 'No irq handler for vector' warning in
* common_interrupt().
*
* This requires to hold vector lock to prevent concurrent updates to
* the affected vector.

View File

@@ -861,13 +861,13 @@ static void free_moved_vector(struct apic_chip_data *apicd)
apicd->move_in_progress = 0;
}
asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void)
DEFINE_IDTENTRY_SYSVEC(sysvec_irq_move_cleanup)
{
struct hlist_head *clhead = this_cpu_ptr(&cleanup_list);
struct apic_chip_data *apicd;
struct hlist_node *tmp;
entering_ack_irq();
ack_APIC_irq();
/* Prevent vectors vanishing under us */
raw_spin_lock(&vector_lock);
@@ -892,7 +892,6 @@ asmlinkage __visible void __irq_entry smp_irq_move_cleanup_interrupt(void)
}
raw_spin_unlock(&vector_lock);
exiting_irq();
}
static void __send_cleanup_vector(struct apic_chip_data *apicd)

View File

@@ -30,8 +30,6 @@ static enum uv_system_type uv_system_type;
static int uv_hubbed_system;
static int uv_hubless_system;
static u64 gru_start_paddr, gru_end_paddr;
static u64 gru_dist_base, gru_first_node_paddr = -1LL, gru_last_node_paddr;
static u64 gru_dist_lmask, gru_dist_umask;
static union uvh_apicid uvh_apicid;
/* Unpack OEM/TABLE ID's to be NULL terminated strings */
@@ -48,11 +46,9 @@ static struct {
unsigned int gnode_shift;
} uv_cpuid;
int uv_min_hub_revision_id;
EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
static int uv_min_hub_revision_id;
unsigned int uv_apicid_hibits;
EXPORT_SYMBOL_GPL(uv_apicid_hibits);
static struct apic apic_x2apic_uv_x;
static struct uv_hub_info_s uv_hub_info_node0;
@@ -85,20 +81,7 @@ static unsigned long __init uv_early_read_mmr(unsigned long addr)
static inline bool is_GRU_range(u64 start, u64 end)
{
if (gru_dist_base) {
u64 su = start & gru_dist_umask; /* Upper (incl pnode) bits */
u64 sl = start & gru_dist_lmask; /* Base offset bits */
u64 eu = end & gru_dist_umask;
u64 el = end & gru_dist_lmask;
/* Must reside completely within a single GRU range: */
return (sl == gru_dist_base && el == gru_dist_base &&
su >= gru_first_node_paddr &&
su <= gru_last_node_paddr &&
eu == su);
} else {
return start >= gru_start_paddr && end <= gru_end_paddr;
}
return start >= gru_start_paddr && end <= gru_end_paddr;
}
static bool uv_is_untracked_pat_range(u64 start, u64 end)
@@ -385,11 +368,10 @@ int is_uv_hubbed(int uvtype)
}
EXPORT_SYMBOL_GPL(is_uv_hubbed);
int is_uv_hubless(int uvtype)
static int is_uv_hubless(int uvtype)
{
return (uv_hubless_system & uvtype);
}
EXPORT_SYMBOL_GPL(is_uv_hubless);
void **__uv_hub_info_list;
EXPORT_SYMBOL_GPL(__uv_hub_info_list);
@@ -417,12 +399,6 @@ static __initdata struct uv_gam_range_s *_gr_table;
#define SOCK_EMPTY ((unsigned short)~0)
extern int uv_hub_info_version(void)
{
return UV_HUB_INFO_VERSION;
}
EXPORT_SYMBOL(uv_hub_info_version);
/* Default UV memory block size is 2GB */
static unsigned long mem_block_size __initdata = (2UL << 30);
@@ -590,12 +566,21 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
static void uv_send_IPI_one(int cpu, int vector)
{
unsigned long apicid;
int pnode;
unsigned long apicid = per_cpu(x86_cpu_to_apicid, cpu);
int pnode = uv_apicid_to_pnode(apicid);
unsigned long dmode, val;
apicid = per_cpu(x86_cpu_to_apicid, cpu);
pnode = uv_apicid_to_pnode(apicid);
uv_hub_send_ipi(pnode, apicid, vector);
if (vector == NMI_VECTOR)
dmode = dest_NMI;
else
dmode = dest_Fixed;
val = (1UL << UVH_IPI_INT_SEND_SHFT) |
((apicid | uv_apicid_hibits) << UVH_IPI_INT_APIC_ID_SHFT) |
(dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) |
(vector << UVH_IPI_INT_VECTOR_SHFT);
uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
}
static void uv_send_IPI_mask(const struct cpumask *mask, int vector)
@@ -797,42 +782,6 @@ static __init void map_high(char *id, unsigned long base, int pshift, int bshift
init_extra_mapping_wb(paddr, bytes);
}
static __init void map_gru_distributed(unsigned long c)
{
union uvh_rh_gam_gru_overlay_config_mmr_u gru;
u64 paddr;
unsigned long bytes;
int nid;
gru.v = c;
/* Only base bits 42:28 relevant in dist mode */
gru_dist_base = gru.v & 0x000007fff0000000UL;
if (!gru_dist_base) {
pr_info("UV: Map GRU_DIST base address NULL\n");
return;
}
bytes = 1UL << UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
gru_dist_lmask = ((1UL << uv_hub_info->m_val) - 1) & ~(bytes - 1);
gru_dist_umask = ~((1UL << uv_hub_info->m_val) - 1);
gru_dist_base &= gru_dist_lmask; /* Clear bits above M */
for_each_online_node(nid) {
paddr = ((u64)uv_node_to_pnode(nid) << uv_hub_info->m_val) |
gru_dist_base;
init_extra_mapping_wb(paddr, bytes);
gru_first_node_paddr = min(paddr, gru_first_node_paddr);
gru_last_node_paddr = max(paddr, gru_last_node_paddr);
}
/* Save upper (63:M) bits of address only for is_GRU_range */
gru_first_node_paddr &= gru_dist_umask;
gru_last_node_paddr &= gru_dist_umask;
pr_debug("UV: Map GRU_DIST base 0x%016llx 0x%016llx - 0x%016llx\n", gru_dist_base, gru_first_node_paddr, gru_last_node_paddr);
}
static __init void map_gru_high(int max_pnode)
{
union uvh_rh_gam_gru_overlay_config_mmr_u gru;
@@ -846,12 +795,6 @@ static __init void map_gru_high(int max_pnode)
return;
}
/* Only UV3 has distributed GRU mode */
if (is_uv3_hub() && gru.s3.mode) {
map_gru_distributed(gru.v);
return;
}
base = (gru.v & mask) >> shift;
map_high("GRU", base, shift, shift, max_pnode, map_wb);
gru_start_paddr = ((u64)base << shift);

View File

@@ -57,9 +57,6 @@ int main(void)
BLANK();
#undef ENTRY
OFFSET(TSS_ist, tss_struct, x86_tss.ist);
DEFINE(DB_STACK_OFFSET, offsetof(struct cea_exception_stacks, DB_stack) -
offsetof(struct cea_exception_stacks, DB1_stack));
BLANK();
#ifdef CONFIG_STACKPROTECTOR

View File

@@ -3,6 +3,7 @@
#include <linux/types.h>
#include <linux/audit.h>
#include <asm/unistd.h>
#include <asm/audit.h>
static unsigned dir_class[] = {
#include <asm-generic/audit_dir_write.h>
@@ -41,7 +42,6 @@ int audit_classify_arch(int arch)
int audit_classify_syscall(int abi, unsigned syscall)
{
#ifdef CONFIG_IA32_EMULATION
extern int ia32_classify_syscall(unsigned);
if (abi == AUDIT_ARCH_I386)
return ia32_classify_syscall(syscall);
#endif

View File

@@ -10,10 +10,10 @@
*/
#include <linux/interrupt.h>
#include <asm/acrn.h>
#include <asm/apic.h>
#include <asm/desc.h>
#include <asm/hypervisor.h>
#include <asm/idtentry.h>
#include <asm/irq_regs.h>
static uint32_t __init acrn_detect(void)
@@ -24,7 +24,7 @@ static uint32_t __init acrn_detect(void)
static void __init acrn_init_platform(void)
{
/* Setup the IDT for ACRN hypervisor callback */
alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, acrn_hv_callback_vector);
alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_acrn_hv_callback);
}
static bool acrn_x2apic_available(void)
@@ -39,7 +39,7 @@ static bool acrn_x2apic_available(void)
static void (*acrn_intr_handler)(void);
__visible void __irq_entry acrn_hv_vector_handler(struct pt_regs *regs)
DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback)
{
struct pt_regs *old_regs = set_irq_regs(regs);
@@ -50,13 +50,12 @@ __visible void __irq_entry acrn_hv_vector_handler(struct pt_regs *regs)
* will block the interrupt whose vector is lower than
* HYPERVISOR_CALLBACK_VECTOR.
*/
entering_ack_irq();
ack_APIC_irq();
inc_irq_stat(irq_hv_callback_count);
if (acrn_intr_handler)
acrn_intr_handler();
exiting_irq();
set_irq_regs(old_regs);
}

View File

@@ -18,6 +18,7 @@
#include <asm/pci-direct.h>
#include <asm/delay.h>
#include <asm/debugreg.h>
#include <asm/resctrl.h>
#ifdef CONFIG_X86_64
# include <asm/mmconfig.h>
@@ -597,6 +598,8 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
x86_amd_ls_cfg_ssbd_mask = 1ULL << bit;
}
}
resctrl_cpu_detect(c);
}
static void early_detect_mem_encrypt(struct cpuinfo_x86 *c)
@@ -1142,8 +1145,7 @@ static const int amd_erratum_383[] =
/* #1054: Instructions Retired Performance Counter May Be Inaccurate */
static const int amd_erratum_1054[] =
AMD_OSVW_ERRATUM(0, AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf));
AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf));
static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum)
{

View File

@@ -15,6 +15,7 @@
#include <linux/nospec.h>
#include <linux/prctl.h>
#include <linux/sched/smt.h>
#include <linux/pgtable.h>
#include <asm/spec-ctrl.h>
#include <asm/cmdline.h>
@@ -26,7 +27,6 @@
#include <asm/vmx.h>
#include <asm/paravirt.h>
#include <asm/alternative.h>
#include <asm/pgtable.h>
#include <asm/set_memory.h>
#include <asm/intel-family.h>
#include <asm/e820/api.h>
@@ -41,6 +41,7 @@ static void __init l1tf_select_mitigation(void);
static void __init mds_select_mitigation(void);
static void __init mds_print_mitigation(void);
static void __init taa_select_mitigation(void);
static void __init srbds_select_mitigation(void);
/* The base value of the SPEC_CTRL MSR that always has to be preserved. */
u64 x86_spec_ctrl_base;
@@ -108,6 +109,7 @@ void __init check_bugs(void)
l1tf_select_mitigation();
mds_select_mitigation();
taa_select_mitigation();
srbds_select_mitigation();
/*
* As MDS and TAA mitigations are inter-related, print MDS
@@ -397,6 +399,97 @@ static int __init tsx_async_abort_parse_cmdline(char *str)
}
early_param("tsx_async_abort", tsx_async_abort_parse_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "SRBDS: " fmt
enum srbds_mitigations {
SRBDS_MITIGATION_OFF,
SRBDS_MITIGATION_UCODE_NEEDED,
SRBDS_MITIGATION_FULL,
SRBDS_MITIGATION_TSX_OFF,
SRBDS_MITIGATION_HYPERVISOR,
};
static enum srbds_mitigations srbds_mitigation __ro_after_init = SRBDS_MITIGATION_FULL;
static const char * const srbds_strings[] = {
[SRBDS_MITIGATION_OFF] = "Vulnerable",
[SRBDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode",
[SRBDS_MITIGATION_FULL] = "Mitigation: Microcode",
[SRBDS_MITIGATION_TSX_OFF] = "Mitigation: TSX disabled",
[SRBDS_MITIGATION_HYPERVISOR] = "Unknown: Dependent on hypervisor status",
};
static bool srbds_off;
void update_srbds_msr(void)
{
u64 mcu_ctrl;
if (!boot_cpu_has_bug(X86_BUG_SRBDS))
return;
if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
return;
if (srbds_mitigation == SRBDS_MITIGATION_UCODE_NEEDED)
return;
rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
switch (srbds_mitigation) {
case SRBDS_MITIGATION_OFF:
case SRBDS_MITIGATION_TSX_OFF:
mcu_ctrl |= RNGDS_MITG_DIS;
break;
case SRBDS_MITIGATION_FULL:
mcu_ctrl &= ~RNGDS_MITG_DIS;
break;
default:
break;
}
wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl);
}
static void __init srbds_select_mitigation(void)
{
u64 ia32_cap;
if (!boot_cpu_has_bug(X86_BUG_SRBDS))
return;
/*
* Check to see if this is one of the MDS_NO systems supporting
* TSX that are only exposed to SRBDS when TSX is enabled.
*/
ia32_cap = x86_read_arch_cap_msr();
if ((ia32_cap & ARCH_CAP_MDS_NO) && !boot_cpu_has(X86_FEATURE_RTM))
srbds_mitigation = SRBDS_MITIGATION_TSX_OFF;
else if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR;
else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL))
srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED;
else if (cpu_mitigations_off() || srbds_off)
srbds_mitigation = SRBDS_MITIGATION_OFF;
update_srbds_msr();
pr_info("%s\n", srbds_strings[srbds_mitigation]);
}
static int __init srbds_parse_cmdline(char *str)
{
if (!str)
return -EINVAL;
if (!boot_cpu_has_bug(X86_BUG_SRBDS))
return 0;
srbds_off = !strcmp(str, "off");
return 0;
}
early_param("srbds", srbds_parse_cmdline);
#undef pr_fmt
#define pr_fmt(fmt) "Spectre V1 : " fmt
@@ -1528,6 +1621,11 @@ static char *ibpb_state(void)
return "";
}
static ssize_t srbds_show_state(char *buf)
{
return sprintf(buf, "%s\n", srbds_strings[srbds_mitigation]);
}
static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
char *buf, unsigned int bug)
{
@@ -1572,6 +1670,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr
case X86_BUG_ITLB_MULTIHIT:
return itlb_multihit_show_state(buf);
case X86_BUG_SRBDS:
return srbds_show_state(buf);
default:
break;
}
@@ -1618,4 +1719,9 @@ ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr
{
return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT);
}
ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf)
{
return cpu_show_common(dev, attr, buf, X86_BUG_SRBDS);
}
#endif

View File

@@ -21,6 +21,7 @@
#include <linux/smp.h>
#include <linux/io.h>
#include <linux/syscore_ops.h>
#include <linux/pgtable.h>
#include <asm/stackprotector.h>
#include <asm/perf_event.h>
@@ -35,7 +36,6 @@
#include <asm/vsyscall.h>
#include <linux/topology.h>
#include <linux/cpumask.h>
#include <asm/pgtable.h>
#include <linux/atomic.h>
#include <asm/proto.h>
#include <asm/setup.h>
@@ -387,7 +387,30 @@ set_register:
bits_missing);
}
}
EXPORT_SYMBOL(native_write_cr4);
#if IS_MODULE(CONFIG_LKDTM)
EXPORT_SYMBOL_GPL(native_write_cr4);
#endif
void cr4_update_irqsoff(unsigned long set, unsigned long clear)
{
unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4);
lockdep_assert_irqs_disabled();
newval = (cr4 & ~clear) | set;
if (newval != cr4) {
this_cpu_write(cpu_tlbstate.cr4, newval);
__write_cr4(newval);
}
}
EXPORT_SYMBOL(cr4_update_irqsoff);
/* Read the CR4 shadow. */
unsigned long cr4_read_shadow(void)
{
return this_cpu_read(cpu_tlbstate.cr4);
}
EXPORT_SYMBOL_GPL(cr4_read_shadow);
void cr4_init(void)
{
@@ -854,30 +877,6 @@ static void init_speculation_control(struct cpuinfo_x86 *c)
}
}
static void init_cqm(struct cpuinfo_x86 *c)
{
if (!cpu_has(c, X86_FEATURE_CQM_LLC)) {
c->x86_cache_max_rmid = -1;
c->x86_cache_occ_scale = -1;
return;
}
/* will be overridden if occupancy monitoring exists */
c->x86_cache_max_rmid = cpuid_ebx(0xf);
if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) ||
cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) ||
cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) {
u32 eax, ebx, ecx, edx;
/* QoS sub-leaf, EAX=0Fh, ECX=1 */
cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx);
c->x86_cache_max_rmid = ecx;
c->x86_cache_occ_scale = ebx;
}
}
void get_cpu_cap(struct cpuinfo_x86 *c)
{
u32 eax, ebx, ecx, edx;
@@ -945,7 +944,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
init_scattered_cpuid_features(c);
init_speculation_control(c);
init_cqm(c);
/*
* Clear/Set all flags overridden by options, after probe.
@@ -1075,9 +1073,30 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = {
{}
};
static bool __init cpu_matches(unsigned long which)
#define VULNBL_INTEL_STEPPINGS(model, steppings, issues) \
X86_MATCH_VENDOR_FAM_MODEL_STEPPINGS_FEATURE(INTEL, 6, \
INTEL_FAM6_##model, steppings, \
X86_FEATURE_ANY, issues)
#define SRBDS BIT(0)
static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = {
VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(HASWELL, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(HASWELL_L, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(HASWELL_G, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, SRBDS),
VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0xC), SRBDS),
VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0xD), SRBDS),
{}
};
static bool __init cpu_matches(const struct x86_cpu_id *table, unsigned long which)
{
const struct x86_cpu_id *m = x86_match_cpu(cpu_vuln_whitelist);
const struct x86_cpu_id *m = x86_match_cpu(table);
return m && !!(m->driver_data & which);
}
@@ -1097,31 +1116,34 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
u64 ia32_cap = x86_read_arch_cap_msr();
/* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */
if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) &&
!(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO))
setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT);
if (cpu_matches(NO_SPECULATION))
if (cpu_matches(cpu_vuln_whitelist, NO_SPECULATION))
return;
setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
if (!cpu_matches(NO_SPECTRE_V2))
if (!cpu_matches(cpu_vuln_whitelist, NO_SPECTRE_V2))
setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
if (!cpu_matches(NO_SSB) && !(ia32_cap & ARCH_CAP_SSB_NO) &&
if (!cpu_matches(cpu_vuln_whitelist, NO_SSB) &&
!(ia32_cap & ARCH_CAP_SSB_NO) &&
!cpu_has(c, X86_FEATURE_AMD_SSB_NO))
setup_force_cpu_bug(X86_BUG_SPEC_STORE_BYPASS);
if (ia32_cap & ARCH_CAP_IBRS_ALL)
setup_force_cpu_cap(X86_FEATURE_IBRS_ENHANCED);
if (!cpu_matches(NO_MDS) && !(ia32_cap & ARCH_CAP_MDS_NO)) {
if (!cpu_matches(cpu_vuln_whitelist, NO_MDS) &&
!(ia32_cap & ARCH_CAP_MDS_NO)) {
setup_force_cpu_bug(X86_BUG_MDS);
if (cpu_matches(MSBDS_ONLY))
if (cpu_matches(cpu_vuln_whitelist, MSBDS_ONLY))
setup_force_cpu_bug(X86_BUG_MSBDS_ONLY);
}
if (!cpu_matches(NO_SWAPGS))
if (!cpu_matches(cpu_vuln_whitelist, NO_SWAPGS))
setup_force_cpu_bug(X86_BUG_SWAPGS);
/*
@@ -1139,7 +1161,16 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
(ia32_cap & ARCH_CAP_TSX_CTRL_MSR)))
setup_force_cpu_bug(X86_BUG_TAA);
if (cpu_matches(NO_MELTDOWN))
/*
* SRBDS affects CPUs which support RDRAND or RDSEED and are listed
* in the vulnerability blacklist.
*/
if ((cpu_has(c, X86_FEATURE_RDRAND) ||
cpu_has(c, X86_FEATURE_RDSEED)) &&
cpu_matches(cpu_vuln_blacklist, SRBDS))
setup_force_cpu_bug(X86_BUG_SRBDS);
if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
return;
/* Rogue Data Cache Load? No! */
@@ -1148,7 +1179,7 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
if (cpu_matches(NO_L1TF))
if (cpu_matches(cpu_vuln_whitelist, NO_L1TF))
return;
setup_force_cpu_bug(X86_BUG_L1TF);
@@ -1377,20 +1408,6 @@ static void generic_identify(struct cpuinfo_x86 *c)
#endif
}
static void x86_init_cache_qos(struct cpuinfo_x86 *c)
{
/*
* The heavy lifting of max_rmid and cache_occ_scale are handled
* in get_cpu_cap(). Here we just set the max_rmid for the boot_cpu
* in case CQM bits really aren't there in this CPU.
*/
if (c != &boot_cpu_data) {
boot_cpu_data.x86_cache_max_rmid =
min(boot_cpu_data.x86_cache_max_rmid,
c->x86_cache_max_rmid);
}
}
/*
* Validate that ACPI/mptables have the same information about the
* effective APIC id and update the package map.
@@ -1503,7 +1520,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
#endif
x86_init_rdrand(c);
x86_init_cache_qos(c);
setup_pku(c);
/*
@@ -1591,6 +1607,7 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c)
mtrr_ap_init();
validate_apic_and_package_id(c);
x86_spec_ctrl_setup_ap();
update_srbds_msr();
}
static __init int setup_noclflush(char *arg)
@@ -1689,25 +1706,6 @@ void syscall_init(void)
X86_EFLAGS_IOPL|X86_EFLAGS_AC|X86_EFLAGS_NT);
}
DEFINE_PER_CPU(int, debug_stack_usage);
DEFINE_PER_CPU(u32, debug_idt_ctr);
void debug_stack_set_zero(void)
{
this_cpu_inc(debug_idt_ctr);
load_current_idt();
}
NOKPROBE_SYMBOL(debug_stack_set_zero);
void debug_stack_reset(void)
{
if (WARN_ON(!this_cpu_read(debug_idt_ctr)))
return;
if (this_cpu_dec_return(debug_idt_ctr) == 0)
load_current_idt();
}
NOKPROBE_SYMBOL(debug_stack_reset);
#else /* CONFIG_X86_64 */
DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;

View File

@@ -77,6 +77,7 @@ extern void detect_ht(struct cpuinfo_x86 *c);
unsigned int aperfmperf_get_khz(int cpu);
extern void x86_spec_ctrl_setup_ap(void);
extern void update_srbds_msr(void);
extern u64 x86_read_arch_cap_msr(void);

View File

@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/kernel.h>
#include <linux/pgtable.h>
#include <linux/string.h>
#include <linux/bitops.h>
@@ -11,7 +12,6 @@
#include <linux/uaccess.h>
#include <asm/cpufeature.h>
#include <asm/pgtable.h>
#include <asm/msr.h>
#include <asm/bugs.h>
#include <asm/cpu.h>
@@ -22,6 +22,7 @@
#include <asm/cpu_device_id.h>
#include <asm/cmdline.h>
#include <asm/traps.h>
#include <asm/resctrl.h>
#ifdef CONFIG_X86_64
#include <linux/topology.h>
@@ -322,6 +323,11 @@ static void early_init_intel(struct cpuinfo_x86 *c)
detect_ht_early(c);
}
static void bsp_init_intel(struct cpuinfo_x86 *c)
{
resctrl_cpu_detect(c);
}
#ifdef CONFIG_X86_32
/*
* Early probe support logic for ppro memory erratum #50
@@ -961,6 +967,7 @@ static const struct cpu_dev intel_cpu_dev = {
#endif
.c_detect_tlb = intel_detect_tlb,
.c_early_init = early_init_intel,
.c_bsp_init = bsp_init_intel,
.c_init = init_intel,
.c_x86_vendor = X86_VENDOR_INTEL,
};
@@ -1119,35 +1126,53 @@ void switch_to_sld(unsigned long tifn)
sld_update_msr(!(tifn & _TIF_SLD));
}
#define SPLIT_LOCK_CPU(model) {X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY}
/*
* The following processors have the split lock detection feature. But
* since they don't have the IA32_CORE_CAPABILITIES MSR, the feature cannot
* be enumerated. Enable it by family and model matching on these
* processors.
* Bits in the IA32_CORE_CAPABILITIES are not architectural, so they should
* only be trusted if it is confirmed that a CPU model implements a
* specific feature at a particular bit position.
*
* The possible driver data field values:
*
* - 0: CPU models that are known to have the per-core split-lock detection
* feature even though they do not enumerate IA32_CORE_CAPABILITIES.
*
* - 1: CPU models which may enumerate IA32_CORE_CAPABILITIES and if so use
* bit 5 to enumerate the per-core split-lock detection feature.
*/
static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
SPLIT_LOCK_CPU(INTEL_FAM6_ICELAKE_X),
SPLIT_LOCK_CPU(INTEL_FAM6_ICELAKE_L),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, 0),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L, 0),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT, 1),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_D, 1),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_TREMONT_L, 1),
{}
};
void __init cpu_set_core_cap_bits(struct cpuinfo_x86 *c)
{
u64 ia32_core_caps = 0;
const struct x86_cpu_id *m;
u64 ia32_core_caps;
if (c->x86_vendor != X86_VENDOR_INTEL)
if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
return;
if (cpu_has(c, X86_FEATURE_CORE_CAPABILITIES)) {
/* Enumerate features reported in IA32_CORE_CAPABILITIES MSR. */
m = x86_match_cpu(split_lock_cpu_ids);
if (!m)
return;
switch (m->driver_data) {
case 0:
break;
case 1:
if (!cpu_has(c, X86_FEATURE_CORE_CAPABILITIES))
return;
rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps);
} else if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
/* Enumerate split lock detection by family and model. */
if (x86_match_cpu(split_lock_cpu_ids))
ia32_core_caps |= MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT;
if (!(ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT))
return;
break;
default:
return;
}
if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT)
split_lock_setup();
split_lock_setup();
}

View File

@@ -39,13 +39,18 @@ const struct x86_cpu_id *x86_match_cpu(const struct x86_cpu_id *match)
const struct x86_cpu_id *m;
struct cpuinfo_x86 *c = &boot_cpu_data;
for (m = match; m->vendor | m->family | m->model | m->feature; m++) {
for (m = match;
m->vendor | m->family | m->model | m->steppings | m->feature;
m++) {
if (m->vendor != X86_VENDOR_ANY && c->x86_vendor != m->vendor)
continue;
if (m->family != X86_FAMILY_ANY && c->x86 != m->family)
continue;
if (m->model != X86_MODEL_ANY && c->x86_model != m->model)
continue;
if (m->steppings != X86_STEPPING_ANY &&
!(BIT(c->x86_stepping) & m->steppings))
continue;
if (m->feature != X86_FEATURE_ANY && !cpu_has(c, m->feature))
continue;
return m;

View File

@@ -921,14 +921,13 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc)
mce_log(&m);
}
asmlinkage __visible void __irq_entry smp_deferred_error_interrupt(struct pt_regs *regs)
DEFINE_IDTENTRY_SYSVEC(sysvec_deferred_error)
{
entering_irq();
trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR);
inc_irq_stat(irq_deferred_error_count);
deferred_error_int_vector();
trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR);
exiting_ack_irq();
ack_APIC_irq();
}
/*

View File

@@ -42,6 +42,8 @@
#include <linux/export.h>
#include <linux/jump_label.h>
#include <linux/set_memory.h>
#include <linux/task_work.h>
#include <linux/hardirq.h>
#include <asm/intel-family.h>
#include <asm/processor.h>
@@ -128,7 +130,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
/* Do initial initialization of a struct mce */
void mce_setup(struct mce *m)
noinstr void mce_setup(struct mce *m)
{
memset(m, 0, sizeof(struct mce));
m->cpu = m->extcpu = smp_processor_id();
@@ -138,12 +140,12 @@ void mce_setup(struct mce *m)
m->cpuid = cpuid_eax(1);
m->socketid = cpu_data(m->extcpu).phys_proc_id;
m->apicid = cpu_data(m->extcpu).initial_apicid;
rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap);
m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP);
if (this_cpu_has(X86_FEATURE_INTEL_PPIN))
rdmsrl(MSR_PPIN, m->ppin);
m->ppin = __rdmsr(MSR_PPIN);
else if (this_cpu_has(X86_FEATURE_AMD_PPIN))
rdmsrl(MSR_AMD_PPIN, m->ppin);
m->ppin = __rdmsr(MSR_AMD_PPIN);
m->microcode = boot_cpu_data.microcode;
}
@@ -1057,23 +1059,6 @@ static void mce_clear_state(unsigned long *toclear)
}
}
static int do_memory_failure(struct mce *m)
{
int flags = MF_ACTION_REQUIRED;
int ret;
pr_err("Uncorrected hardware memory error in user-access at %llx", m->addr);
if (!(m->mcgstatus & MCG_STATUS_RIPV))
flags |= MF_MUST_KILL;
ret = memory_failure(m->addr >> PAGE_SHIFT, flags);
if (ret)
pr_err("Memory error not recovered");
else
set_mce_nospec(m->addr >> PAGE_SHIFT);
return ret;
}
/*
* Cases where we avoid rendezvous handler timeout:
* 1) If this CPU is offline.
@@ -1086,13 +1071,15 @@ static int do_memory_failure(struct mce *m)
* kdump kernel establishing a new #MC handler where a broadcasted MCE
* might not get handled properly.
*/
static bool __mc_check_crashing_cpu(int cpu)
static noinstr bool mce_check_crashing_cpu(void)
{
unsigned int cpu = smp_processor_id();
if (cpu_is_offline(cpu) ||
(crashing_cpu != -1 && crashing_cpu != cpu)) {
u64 mcgstatus;
mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS);
mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS);
if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) {
if (mcgstatus & MCG_STATUS_LMCES)
@@ -1100,7 +1087,7 @@ static bool __mc_check_crashing_cpu(int cpu)
}
if (mcgstatus & MCG_STATUS_RIPV) {
mce_wrmsrl(MSR_IA32_MCG_STATUS, 0);
__wrmsr(MSR_IA32_MCG_STATUS, 0, 0);
return true;
}
}
@@ -1175,6 +1162,29 @@ static void __mc_scan_banks(struct mce *m, struct mce *final,
*m = *final;
}
static void kill_me_now(struct callback_head *ch)
{
force_sig(SIGBUS);
}
static void kill_me_maybe(struct callback_head *cb)
{
struct task_struct *p = container_of(cb, struct task_struct, mce_kill_me);
int flags = MF_ACTION_REQUIRED;
pr_err("Uncorrected hardware memory error in user-access at %llx", p->mce_addr);
if (!(p->mce_status & MCG_STATUS_RIPV))
flags |= MF_MUST_KILL;
if (!memory_failure(p->mce_addr >> PAGE_SHIFT, flags)) {
set_mce_nospec(p->mce_addr >> PAGE_SHIFT);
return;
}
pr_err("Memory error not recovered");
kill_me_now(cb);
}
/*
* The actual machine check handler. This only handles real
* exceptions when something got corrupted coming in through int 18.
@@ -1193,12 +1203,11 @@ static void __mc_scan_banks(struct mce *m, struct mce *final,
* backing the user stack, tracing that reads the user stack will cause
* potentially infinite recursion.
*/
void notrace do_machine_check(struct pt_regs *regs, long error_code)
void noinstr do_machine_check(struct pt_regs *regs)
{
DECLARE_BITMAP(valid_banks, MAX_NR_BANKS);
DECLARE_BITMAP(toclear, MAX_NR_BANKS);
struct mca_config *cfg = &mca_cfg;
int cpu = smp_processor_id();
struct mce m, *final;
char *msg = NULL;
int worst = 0;
@@ -1227,11 +1236,6 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code)
*/
int lmce = 1;
if (__mc_check_crashing_cpu(cpu))
return;
ist_enter(regs);
this_cpu_inc(mce_exception_count);
mce_gather_info(&m, regs);
@@ -1319,17 +1323,19 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code)
sync_core();
if (worst != MCE_AR_SEVERITY && !kill_it)
goto out_ist;
return;
/* Fault was in user mode and we need to take some action */
if ((m.cs & 3) == 3) {
ist_begin_non_atomic(regs);
local_irq_enable();
/* If this triggers there is no way to recover. Die hard. */
BUG_ON(!on_thread_stack() || !user_mode(regs));
if (kill_it || do_memory_failure(&m))
force_sig(SIGBUS);
local_irq_disable();
ist_end_non_atomic();
current->mce_addr = m.addr;
current->mce_status = m.mcgstatus;
current->mce_kill_me.func = kill_me_maybe;
if (kill_it)
current->mce_kill_me.func = kill_me_now;
task_work_add(current, &current->mce_kill_me, true);
} else {
/*
* Handle an MCE which has happened in kernel space but from
@@ -1341,16 +1347,12 @@ void notrace do_machine_check(struct pt_regs *regs, long error_code)
* proper one.
*/
if (m.kflags & MCE_IN_KERNEL_RECOV) {
if (!fixup_exception(regs, X86_TRAP_MC, error_code, 0))
if (!fixup_exception(regs, X86_TRAP_MC, 0, 0))
mce_panic("Failed kernel mode recovery", &m, msg);
}
}
out_ist:
ist_exit(regs);
}
EXPORT_SYMBOL_GPL(do_machine_check);
NOKPROBE_SYMBOL(do_machine_check);
#ifndef CONFIG_MEMORY_FAILURE
int memory_failure(unsigned long pfn, int flags)
@@ -1876,21 +1878,84 @@ bool filter_mce(struct mce *m)
}
/* Handle unconfigured int18 (should never happen) */
static void unexpected_machine_check(struct pt_regs *regs, long error_code)
static noinstr void unexpected_machine_check(struct pt_regs *regs)
{
instrumentation_begin();
pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
smp_processor_id());
instrumentation_end();
}
/* Call the installed machine check handler for this CPU setup. */
void (*machine_check_vector)(struct pt_regs *, long error_code) =
unexpected_machine_check;
void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check;
dotraplinkage notrace void do_mce(struct pt_regs *regs, long error_code)
static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
{
machine_check_vector(regs, error_code);
/*
* Only required when from kernel mode. See
* mce_check_crashing_cpu() for details.
*/
if (machine_check_vector == do_machine_check &&
mce_check_crashing_cpu())
return;
nmi_enter();
/*
* The call targets are marked noinstr, but objtool can't figure
* that out because it's an indirect call. Annotate it.
*/
instrumentation_begin();
trace_hardirqs_off_finish();
machine_check_vector(regs);
if (regs->flags & X86_EFLAGS_IF)
trace_hardirqs_on_prepare();
instrumentation_end();
nmi_exit();
}
NOKPROBE_SYMBOL(do_mce);
static __always_inline void exc_machine_check_user(struct pt_regs *regs)
{
idtentry_enter_user(regs);
instrumentation_begin();
machine_check_vector(regs);
instrumentation_end();
idtentry_exit_user(regs);
}
#ifdef CONFIG_X86_64
/* MCE hit kernel mode */
DEFINE_IDTENTRY_MCE(exc_machine_check)
{
unsigned long dr7;
dr7 = local_db_save();
exc_machine_check_kernel(regs);
local_db_restore(dr7);
}
/* The user mode variant. */
DEFINE_IDTENTRY_MCE_USER(exc_machine_check)
{
unsigned long dr7;
dr7 = local_db_save();
exc_machine_check_user(regs);
local_db_restore(dr7);
}
#else
/* 32bit unified entry point */
DEFINE_IDTENTRY_MCE(exc_machine_check)
{
unsigned long dr7;
dr7 = local_db_save();
if (user_mode(regs))
exc_machine_check_user(regs);
else
exc_machine_check_kernel(regs);
local_db_restore(dr7);
}
#endif
/*
* Called for each booted CPU to set up machine checks.

View File

@@ -146,9 +146,9 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs)
regs.cs = m->cs;
pregs = &regs;
}
/* in mcheck exeception handler, irq will be disabled */
/* do_machine_check() expects interrupts disabled -- at least */
local_irq_save(flags);
do_machine_check(pregs, 0);
do_machine_check(pregs);
local_irq_restore(flags);
m->finished = 0;
}

View File

@@ -9,7 +9,7 @@
#include <asm/mce.h>
/* Pointer to the installed machine check handler for this CPU setup. */
extern void (*machine_check_vector)(struct pt_regs *, long error_code);
extern void (*machine_check_vector)(struct pt_regs *);
enum severity_level {
MCE_NO_SEVERITY,

View File

@@ -7,6 +7,7 @@
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/smp.h>
#include <linux/hardirq.h>
#include <asm/processor.h>
#include <asm/traps.h>
@@ -20,12 +21,11 @@
int mce_p5_enabled __read_mostly;
/* Machine check handler for Pentium class Intel CPUs: */
static void pentium_machine_check(struct pt_regs *regs, long error_code)
static noinstr void pentium_machine_check(struct pt_regs *regs)
{
u32 loaddr, hi, lotype;
ist_enter(regs);
instrumentation_begin();
rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
@@ -38,8 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
}
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
ist_exit(regs);
instrumentation_end();
}
/* Set up machine check reporting for processors with Intel style MCE: */

View File

@@ -614,14 +614,13 @@ static void unexpected_thermal_interrupt(void)
static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
asmlinkage __visible void __irq_entry smp_thermal_interrupt(struct pt_regs *regs)
DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
{
entering_irq();
trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
inc_irq_stat(irq_thermal_count);
smp_thermal_vector();
trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
exiting_ack_irq();
ack_APIC_irq();
}
/* Thermal monitoring depends on APIC, ACPI and clock modulation */

View File

@@ -21,12 +21,11 @@ static void default_threshold_interrupt(void)
void (*mce_threshold_vector)(void) = default_threshold_interrupt;
asmlinkage __visible void __irq_entry smp_threshold_interrupt(struct pt_regs *regs)
DEFINE_IDTENTRY_SYSVEC(sysvec_threshold)
{
entering_irq();
trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR);
inc_irq_stat(irq_threshold_count);
mce_threshold_vector();
trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR);
exiting_ack_irq();
ack_APIC_irq();
}

View File

@@ -6,6 +6,7 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/hardirq.h>
#include <asm/processor.h>
#include <asm/traps.h>
@@ -16,14 +17,12 @@
#include "internal.h"
/* Machine check handler for WinChip C6: */
static void winchip_machine_check(struct pt_regs *regs, long error_code)
static noinstr void winchip_machine_check(struct pt_regs *regs)
{
ist_enter(regs);
instrumentation_begin();
pr_emerg("CPU0: Machine Check Exception.\n");
add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
ist_exit(regs);
instrumentation_end();
}
/* Set up machine check reporting on the Winchip C6 series */

View File

@@ -545,8 +545,7 @@ static int __wait_for_cpus(atomic_t *t, long long timeout)
/*
* Returns:
* < 0 - on error
* 0 - no update done
* 1 - microcode was updated
* 0 - success (no update done or microcode was updated)
*/
static int __reload_late(void *info)
{
@@ -573,11 +572,11 @@ static int __reload_late(void *info)
else
goto wait_for_siblings;
if (err > UCODE_NFOUND) {
pr_warn("Error reloading microcode on CPU %d\n", cpu);
if (err >= UCODE_NFOUND) {
if (err == UCODE_ERROR)
pr_warn("Error reloading microcode on CPU %d\n", cpu);
ret = -1;
} else if (err == UCODE_UPDATED || err == UCODE_OK) {
ret = 1;
}
wait_for_siblings:
@@ -608,7 +607,7 @@ static int microcode_reload_late(void)
atomic_set(&late_cpus_out, 0);
ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask);
if (ret > 0)
if (ret == 0)
microcode_check();
pr_info("Reload completed, microcode revision: 0x%x\n", boot_cpu_data.microcode);
@@ -649,7 +648,7 @@ static ssize_t reload_store(struct device *dev,
put:
put_online_cpus();
if (ret >= 0)
if (ret == 0)
ret = size;
return ret;

View File

@@ -23,6 +23,7 @@
#include <asm/hyperv-tlfs.h>
#include <asm/mshyperv.h>
#include <asm/desc.h>
#include <asm/idtentry.h>
#include <asm/irq_regs.h>
#include <asm/i8259.h>
#include <asm/apic.h>
@@ -40,11 +41,10 @@ static void (*hv_stimer0_handler)(void);
static void (*hv_kexec_handler)(void);
static void (*hv_crash_handler)(struct pt_regs *regs);
__visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs)
DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_callback)
{
struct pt_regs *old_regs = set_irq_regs(regs);
entering_irq();
inc_irq_stat(irq_hv_callback_count);
if (vmbus_handler)
vmbus_handler();
@@ -52,7 +52,6 @@ __visible void __irq_entry hyperv_vector_handler(struct pt_regs *regs)
if (ms_hyperv.hints & HV_DEPRECATING_AEOI_RECOMMENDED)
ack_APIC_irq();
exiting_irq();
set_irq_regs(old_regs);
}
@@ -73,19 +72,16 @@ EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq);
* Routines to do per-architecture handling of stimer0
* interrupts when in Direct Mode
*/
__visible void __irq_entry hv_stimer0_vector_handler(struct pt_regs *regs)
DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0)
{
struct pt_regs *old_regs = set_irq_regs(regs);
entering_irq();
inc_irq_stat(hyperv_stimer0_count);
if (hv_stimer0_handler)
hv_stimer0_handler();
add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0);
ack_APIC_irq();
exiting_irq();
set_irq_regs(old_regs);
}
@@ -227,8 +223,8 @@ static void __init ms_hyperv_init_platform(void)
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
pr_info("Hyper-V: features 0x%x, hints 0x%x\n",
ms_hyperv.features, ms_hyperv.hints);
pr_info("Hyper-V: features 0x%x, hints 0x%x, misc 0x%x\n",
ms_hyperv.features, ms_hyperv.hints, ms_hyperv.misc_features);
ms_hyperv.max_vp_index = cpuid_eax(HYPERV_CPUID_IMPLEMENT_LIMITS);
ms_hyperv.max_lp_index = cpuid_ebx(HYPERV_CPUID_IMPLEMENT_LIMITS);
@@ -263,6 +259,16 @@ static void __init ms_hyperv_init_platform(void)
cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
}
/*
* Hyper-V expects to get crash register data or kmsg when
* crash enlightment is available and system crashes. Set
* crash_kexec_post_notifiers to be true to make sure that
* calling crash enlightment interface before running kdump
* kernel.
*/
if (ms_hyperv.misc_features & HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE)
crash_kexec_post_notifiers = true;
#ifdef CONFIG_X86_LOCAL_APIC
if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS &&
ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) {
@@ -321,17 +327,19 @@ static void __init ms_hyperv_init_platform(void)
x86_platform.apic_post_init = hyperv_init;
hyperv_setup_mmu_ops();
/* Setup the IDT for hypervisor callback */
alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, hyperv_callback_vector);
alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, asm_sysvec_hyperv_callback);
/* Setup the IDT for reenlightenment notifications */
if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT)
if (ms_hyperv.features & HV_X64_ACCESS_REENLIGHTENMENT) {
alloc_intr_gate(HYPERV_REENLIGHTENMENT_VECTOR,
hyperv_reenlightenment_vector);
asm_sysvec_hyperv_reenlightenment);
}
/* Setup the IDT for stimer0 */
if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE)
if (ms_hyperv.misc_features & HV_STIMER_DIRECT_MODE_AVAILABLE) {
alloc_intr_gate(HYPERV_STIMER0_VECTOR,
hv_stimer0_callback_vector);
asm_sysvec_hyperv_stimer0);
}
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;

View File

@@ -761,7 +761,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
/* Flush all TLBs via a mov %cr3, %reg; mov %reg, %cr3 */
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
flush_tlb_local();
/* Save MTRR state */
rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
@@ -778,7 +778,7 @@ static void post_set(void) __releases(set_atomicity_lock)
{
/* Flush TLBs (no need to flush caches - they are disabled) */
count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
__flush_tlb();
flush_tlb_local();
/* Intel (P6) standard MTRRs */
mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);

View File

@@ -63,6 +63,10 @@ static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
case 15:
return msr - MSR_P4_BPU_PERFCTR0;
}
fallthrough;
case X86_VENDOR_ZHAOXIN:
case X86_VENDOR_CENTAUR:
return msr - MSR_ARCH_PERFMON_PERFCTR0;
}
return 0;
}
@@ -92,6 +96,10 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
case 15:
return msr - MSR_P4_BSU_ESCR0;
}
fallthrough;
case X86_VENDOR_ZHAOXIN:
case X86_VENDOR_CENTAUR:
return msr - MSR_ARCH_PERFMON_EVENTSEL0;
}
return 0;

View File

@@ -22,7 +22,7 @@
#include <linux/cpuhotplug.h>
#include <asm/intel-family.h>
#include <asm/resctrl_sched.h>
#include <asm/resctrl.h>
#include "internal.h"
/* Mutex to protect rdtgroup access. */
@@ -578,6 +578,8 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
d->id = id;
cpumask_set_cpu(cpu, &d->cpu_mask);
rdt_domain_reconfigure_cdp(r);
if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
kfree(d);
return;
@@ -956,6 +958,36 @@ static __init void rdt_init_res_defs(void)
static enum cpuhp_state rdt_online;
/* Runs once on the BSP during boot. */
void resctrl_cpu_detect(struct cpuinfo_x86 *c)
{
if (!cpu_has(c, X86_FEATURE_CQM_LLC)) {
c->x86_cache_max_rmid = -1;
c->x86_cache_occ_scale = -1;
c->x86_cache_mbm_width_offset = -1;
return;
}
/* will be overridden if occupancy monitoring exists */
c->x86_cache_max_rmid = cpuid_ebx(0xf);
if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC) ||
cpu_has(c, X86_FEATURE_CQM_MBM_TOTAL) ||
cpu_has(c, X86_FEATURE_CQM_MBM_LOCAL)) {
u32 eax, ebx, ecx, edx;
/* QoS sub-leaf, EAX=0Fh, ECX=1 */
cpuid_count(0xf, 1, &eax, &ebx, &ecx, &edx);
c->x86_cache_max_rmid = ecx;
c->x86_cache_occ_scale = ebx;
if (c->x86_vendor == X86_VENDOR_INTEL)
c->x86_cache_mbm_width_offset = eax & 0xff;
else
c->x86_cache_mbm_width_offset = -1;
}
}
static int __init resctrl_late_init(void)
{
struct rdt_resource *r;

View File

@@ -495,14 +495,16 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
return ret;
}
void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
struct rdtgroup *rdtgrp, int evtid, int first)
void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
struct rdt_domain *d, struct rdtgroup *rdtgrp,
int evtid, int first)
{
/*
* setup the parameters to send to the IPI to read the data.
*/
rr->rgrp = rdtgrp;
rr->evtid = evtid;
rr->r = r;
rr->d = d;
rr->val = 0;
rr->first = first;
@@ -539,7 +541,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
goto out;
}
mon_event_read(&rr, d, rdtgrp, evtid, false);
mon_event_read(&rr, r, d, rdtgrp, evtid, false);
if (rr.val & RMID_VAL_ERROR)
seq_puts(m, "Error\n");

View File

@@ -31,7 +31,7 @@
#define CQM_LIMBOCHECK_INTERVAL 1000
#define MBM_CNTR_WIDTH 24
#define MBM_CNTR_WIDTH_BASE 24
#define MBM_OVERFLOW_INTERVAL 1000
#define MAX_MBA_BW 100u
#define MBA_IS_LINEAR 0x4
@@ -40,6 +40,12 @@
#define RMID_VAL_ERROR BIT_ULL(63)
#define RMID_VAL_UNAVAIL BIT_ULL(62)
/*
* With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for
* data to be returned. The counter width is discovered from the hardware
* as an offset from MBM_CNTR_WIDTH_BASE.
*/
#define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE)
struct rdt_fs_context {
@@ -87,6 +93,7 @@ union mon_data_bits {
struct rmid_read {
struct rdtgroup *rgrp;
struct rdt_resource *r;
struct rdt_domain *d;
int evtid;
bool first;
@@ -460,6 +467,7 @@ struct rdt_resource {
struct list_head evt_list;
int num_rmid;
unsigned int mon_scale;
unsigned int mbm_width;
unsigned long fflags;
};
@@ -587,8 +595,9 @@ void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
unsigned int dom_id);
void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
struct rdt_domain *d);
void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
struct rdtgroup *rdtgrp, int evtid, int first);
void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
struct rdt_domain *d, struct rdtgroup *rdtgrp,
int evtid, int first);
void mbm_setup_overflow_handler(struct rdt_domain *dom,
unsigned long delay_ms);
void mbm_handle_overflow(struct work_struct *work);
@@ -601,5 +610,6 @@ bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
void __check_limbo(struct rdt_domain *d, bool force_free);
bool cbm_validate_intel(char *buf, u32 *data, struct rdt_resource *r);
bool cbm_validate_amd(char *buf, u32 *data, struct rdt_resource *r);
void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
#endif /* _ASM_X86_RESCTRL_INTERNAL_H */

View File

@@ -214,9 +214,9 @@ void free_rmid(u32 rmid)
list_add_tail(&entry->list, &rmid_free_lru);
}
static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr)
static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr, unsigned int width)
{
u64 shift = 64 - MBM_CNTR_WIDTH, chunks;
u64 shift = 64 - width, chunks;
chunks = (cur_msr << shift) - (prev_msr << shift);
return chunks >>= shift;
@@ -256,7 +256,7 @@ static int __mon_event_count(u32 rmid, struct rmid_read *rr)
return 0;
}
chunks = mbm_overflow_count(m->prev_msr, tval);
chunks = mbm_overflow_count(m->prev_msr, tval, rr->r->mbm_width);
m->chunks += chunks;
m->prev_msr = tval;
@@ -278,7 +278,7 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
return;
chunks = mbm_overflow_count(m->prev_bw_msr, tval);
chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width);
m->chunks_bw += chunks;
m->chunks = m->chunks_bw;
cur_bw = (chunks * r->mon_scale) >> 20;
@@ -433,11 +433,12 @@ static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
}
}
static void mbm_update(struct rdt_domain *d, int rmid)
static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
{
struct rmid_read rr;
rr.first = false;
rr.r = r;
rr.d = d;
/*
@@ -510,6 +511,7 @@ void mbm_handle_overflow(struct work_struct *work)
struct rdtgroup *prgrp, *crgrp;
int cpu = smp_processor_id();
struct list_head *head;
struct rdt_resource *r;
struct rdt_domain *d;
mutex_lock(&rdtgroup_mutex);
@@ -517,16 +519,18 @@ void mbm_handle_overflow(struct work_struct *work)
if (!static_branch_likely(&rdt_mon_enable_key))
goto out_unlock;
d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]);
r = &rdt_resources_all[RDT_RESOURCE_L3];
d = get_domain_from_cpu(cpu, r);
if (!d)
goto out_unlock;
list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
mbm_update(d, prgrp->mon.rmid);
mbm_update(r, d, prgrp->mon.rmid);
head = &prgrp->mon.crdtgrp_list;
list_for_each_entry(crgrp, head, mon.crdtgrp_list)
mbm_update(d, crgrp->mon.rmid);
mbm_update(r, d, crgrp->mon.rmid);
if (is_mba_sc(NULL))
update_mba_bw(prgrp, d);
@@ -614,11 +618,18 @@ static void l3_mon_evt_init(struct rdt_resource *r)
int rdt_get_mon_l3_config(struct rdt_resource *r)
{
unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset;
unsigned int cl_size = boot_cpu_data.x86_cache_size;
int ret;
r->mon_scale = boot_cpu_data.x86_cache_occ_scale;
r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
r->mbm_width = MBM_CNTR_WIDTH_BASE;
if (mbm_offset > 0 && mbm_offset <= MBM_CNTR_WIDTH_OFFSET_MAX)
r->mbm_width += mbm_offset;
else if (mbm_offset > MBM_CNTR_WIDTH_OFFSET_MAX)
pr_warn("Ignoring impossible MBM counter offset\n");
/*
* A reasonable upper limit on the max threshold is the number

View File

@@ -24,7 +24,7 @@
#include <asm/cacheflush.h>
#include <asm/intel-family.h>
#include <asm/resctrl_sched.h>
#include <asm/resctrl.h>
#include <asm/perf_event.h>
#include "../../events/perf_event.h" /* For X86_CONFIG() */
@@ -1326,9 +1326,9 @@ int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
* pseudo-locked region will still be here on return.
*
* The mutex has to be released temporarily to avoid a potential
* deadlock with the mm->mmap_sem semaphore which is obtained in
* the device_create() and debugfs_create_dir() callpath below
* as well as before the mmap() callback is called.
* deadlock with the mm->mmap_lock which is obtained in the
* device_create() and debugfs_create_dir() callpath below as well as
* before the mmap() callback is called.
*/
mutex_unlock(&rdtgroup_mutex);

View File

@@ -29,7 +29,7 @@
#include <uapi/linux/magic.h>
#include <asm/resctrl_sched.h>
#include <asm/resctrl.h>
#include "internal.h"
DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
@@ -1859,6 +1859,19 @@ static int set_cache_qos_cfg(int level, bool enable)
return 0;
}
/* Restore the qos cfg state when a domain comes online */
void rdt_domain_reconfigure_cdp(struct rdt_resource *r)
{
if (!r->alloc_capable)
return;
if (r == &rdt_resources_all[RDT_RESOURCE_L2DATA])
l2_qos_cfg_update(&r->alloc_enabled);
if (r == &rdt_resources_all[RDT_RESOURCE_L3DATA])
l3_qos_cfg_update(&r->alloc_enabled);
}
/*
* Enable or disable the MBA software controller
* which helps user specify bandwidth in MBps.
@@ -2459,7 +2472,7 @@ static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
goto out_destroy;
if (is_mbm_event(mevt->evtid))
mon_event_read(&rr, d, prgrp, mevt->evtid, true);
mon_event_read(&rr, r, d, prgrp, mevt->evtid, true);
}
kernfs_activate(kn);
return 0;
@@ -3072,7 +3085,8 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
* If the rdtgroup is a mon group and parent directory
* is a valid "mon_groups" directory, remove the mon group.
*/
if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) {
if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn &&
rdtgrp != &rdtgroup_default) {
if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
ret = rdtgroup_ctrl_remove(kn, rdtgrp);
@@ -3185,10 +3199,10 @@ int __init rdtgroup_init(void)
* during the debugfs directory creation also &sb->s_type->i_mutex_key
* (the lockdep class of inode->i_rwsem). Other filesystem
* interactions (eg. SyS_getdents) have the lock ordering:
* &sb->s_type->i_mutex_key --> &mm->mmap_sem
* During mmap(), called with &mm->mmap_sem, the rdtgroup_mutex
* &sb->s_type->i_mutex_key --> &mm->mmap_lock
* During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex
* is taken, thus creating dependency:
* &mm->mmap_sem --> rdtgroup_mutex for the latter that can cause
* &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause
* issues considering the other two lock dependencies.
* By creating the debugfs directory here we avoid a dependency
* that may cause deadlock (even though file operations cannot

View File

@@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/crash_core.h>
#include <linux/pgtable.h>
#include <asm/pgtable.h>
#include <asm/setup.h>
void arch_crash_save_vmcoreinfo(void)

View File

@@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-only
#include <linux/crash_core.h>
#include <linux/pgtable.h>
#include <asm/pgtable.h>
#include <asm/setup.h>
void arch_crash_save_vmcoreinfo(void)

View File

@@ -6,12 +6,10 @@
#include <linux/fs.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/desc.h>
#include <asm/traps.h>
extern void double_fault(void);
#define ptr_ok(x) ((x) > PAGE_OFFSET && (x) < PAGE_OFFSET + MAXMEM)
#define TSS(x) this_cpu_read(cpu_tss_rw.x86_tss.x)
@@ -22,7 +20,7 @@ static void set_df_gdt_entry(unsigned int cpu);
* Called by double_fault with CR0.TS and EFLAGS.NT cleared. The CPU thinks
* we're running the doublefault task. Cannot return.
*/
asmlinkage notrace void __noreturn doublefault_shim(void)
asmlinkage noinstr void __noreturn doublefault_shim(void)
{
unsigned long cr2;
struct pt_regs regs;
@@ -41,7 +39,7 @@ asmlinkage notrace void __noreturn doublefault_shim(void)
* Fill in pt_regs. A downside of doing this in C is that the unwinder
* won't see it (no ENCODE_FRAME_POINTER), so a nested stack dump
* won't successfully unwind to the source of the double fault.
* The main dump from do_double_fault() is fine, though, since it
* The main dump from exc_double_fault() is fine, though, since it
* uses these regs directly.
*
* If anyone ever cares, this could be moved to asm.
@@ -71,7 +69,7 @@ asmlinkage notrace void __noreturn doublefault_shim(void)
regs.cx = TSS(cx);
regs.bx = TSS(bx);
do_double_fault(&regs, 0, cr2);
exc_double_fault(&regs, 0, cr2);
/*
* x86_32 does not save the original CR3 anywhere on a task switch.
@@ -85,7 +83,6 @@ asmlinkage notrace void __noreturn doublefault_shim(void)
*/
panic("cannot return from double fault\n");
}
NOKPROBE_SYMBOL(doublefault_shim);
DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
.tss = {
@@ -96,7 +93,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct doublefault_stack, doublefault_stack) = {
.ldt = 0,
.io_bitmap_base = IO_BITMAP_OFFSET_INVALID,
.ip = (unsigned long) double_fault,
.ip = (unsigned long) asm_exc_double_fault,
.flags = X86_EFLAGS_FIXED,
.es = __USER_DS,
.cs = __KERNEL_CS,

View File

@@ -65,7 +65,7 @@ bool in_entry_stack(unsigned long *stack, struct stack_info *info)
}
static void printk_stack_address(unsigned long address, int reliable,
char *log_lvl)
const char *log_lvl)
{
touch_nmi_watchdog();
printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address);
@@ -160,7 +160,7 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs,
}
void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, char *log_lvl)
unsigned long *stack, const char *log_lvl)
{
struct unwind_state state;
struct stack_info stack_info = {0};
@@ -279,7 +279,8 @@ next:
}
}
void show_stack(struct task_struct *task, unsigned long *sp)
void show_stack(struct task_struct *task, unsigned long *sp,
const char *loglvl)
{
task = task ? : current;
@@ -290,7 +291,7 @@ void show_stack(struct task_struct *task, unsigned long *sp)
if (!sp && task == current)
sp = get_stack_pointer(current, NULL);
show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT);
show_trace_log_lvl(task, NULL, sp, loglvl);
}
void show_stack_regs(struct pt_regs *regs)

View File

@@ -87,7 +87,6 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info)
static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info)
{
#ifdef CONFIG_DOUBLEFAULT
struct cpu_entry_area *cea = get_cpu_entry_area(raw_smp_processor_id());
struct doublefault_stack *ss = &cea->doublefault_stack;
@@ -103,9 +102,6 @@ static bool in_doublefault_stack(unsigned long *stack, struct stack_info *info)
info->next_sp = (unsigned long *)this_cpu_read(cpu_tss_rw.x86_tss.sp);
return true;
#else
return false;
#endif
}

View File

@@ -22,15 +22,13 @@
static const char * const exception_stack_names[] = {
[ ESTACK_DF ] = "#DF",
[ ESTACK_NMI ] = "NMI",
[ ESTACK_DB2 ] = "#DB2",
[ ESTACK_DB1 ] = "#DB1",
[ ESTACK_DB ] = "#DB",
[ ESTACK_MCE ] = "#MC",
};
const char *stack_type_name(enum stack_type type)
{
BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
if (type == STACK_TYPE_IRQ)
return "IRQ";
@@ -79,7 +77,6 @@ static const
struct estack_pages estack_pages[CEA_ESTACK_PAGES] ____cacheline_aligned = {
EPAGERANGE(DF),
EPAGERANGE(NMI),
EPAGERANGE(DB1),
EPAGERANGE(DB),
EPAGERANGE(MCE),
};
@@ -91,7 +88,7 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info)
struct pt_regs *regs;
unsigned int k;
BUILD_BUG_ON(N_EXCEPTION_STACKS != 6);
BUILD_BUG_ON(N_EXCEPTION_STACKS != 4);
begin = (unsigned long)__this_cpu_read(cea_exception_stacks);
/*
@@ -183,7 +180,8 @@ recursion_check:
*/
if (visit_mask) {
if (*visit_mask & (1UL << info->type)) {
printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
if (task == current)
printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type);
goto unknown;
}
*visit_mask |= 1UL << info->type;

View File

@@ -910,14 +910,6 @@ static int __init parse_memmap_one(char *p)
return -EINVAL;
if (!strncmp(p, "exactmap", 8)) {
#ifdef CONFIG_CRASH_DUMP
/*
* If we are doing a crash dump, we still need to know
* the real memory size before the original memory map is
* reset.
*/
saved_max_pfn = e820__end_of_ram_pfn();
#endif
e820_table->nr_entries = 0;
userdef = 1;
return 0;

View File

@@ -8,6 +8,7 @@
#include <linux/pci_regs.h>
#include <linux/pci_ids.h>
#include <linux/errno.h>
#include <linux/pgtable.h>
#include <asm/io.h>
#include <asm/processor.h>
#include <asm/fcntl.h>
@@ -15,12 +16,8 @@
#include <xen/hvc-console.h>
#include <asm/pci-direct.h>
#include <asm/fixmap.h>
#include <asm/intel-mid.h>
#include <asm/pgtable.h>
#include <linux/usb/ehci_def.h>
#include <linux/usb/xhci-dbgp.h>
#include <linux/efi.h>
#include <asm/efi.h>
#include <asm/pci_x86.h>
/* Simple VGA output */

View File

@@ -29,7 +29,7 @@
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/random.h>
#include <asm/pgtable.h>
#include <linux/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/setup.h>
#include <asm/espfix.h>

View File

@@ -291,15 +291,13 @@ void fpu__drop(struct fpu *fpu)
}
/*
* Clear FPU registers by setting them up from
* the init fpstate:
* Clear FPU registers by setting them up from the init fpstate.
* Caller must do fpregs_[un]lock() around it.
*/
static inline void copy_init_fpstate_to_fpregs(void)
static inline void copy_init_fpstate_to_fpregs(u64 features_mask)
{
fpregs_lock();
if (use_xsave())
copy_kernel_to_xregs(&init_fpstate.xsave, -1);
copy_kernel_to_xregs(&init_fpstate.xsave, features_mask);
else if (static_cpu_has(X86_FEATURE_FXSR))
copy_kernel_to_fxregs(&init_fpstate.fxsave);
else
@@ -307,9 +305,6 @@ static inline void copy_init_fpstate_to_fpregs(void)
if (boot_cpu_has(X86_FEATURE_OSPKE))
copy_init_pkru_to_fpregs();
fpregs_mark_activate();
fpregs_unlock();
}
/*
@@ -318,18 +313,40 @@ static inline void copy_init_fpstate_to_fpregs(void)
* Called by sys_execve(), by the signal handler code and by various
* error paths.
*/
void fpu__clear(struct fpu *fpu)
static void fpu__clear(struct fpu *fpu, bool user_only)
{
WARN_ON_FPU(fpu != &current->thread.fpu); /* Almost certainly an anomaly */
WARN_ON_FPU(fpu != &current->thread.fpu);
fpu__drop(fpu);
if (!static_cpu_has(X86_FEATURE_FPU)) {
fpu__drop(fpu);
fpu__initialize(fpu);
return;
}
/*
* Make sure fpstate is cleared and initialized.
*/
fpu__initialize(fpu);
if (static_cpu_has(X86_FEATURE_FPU))
copy_init_fpstate_to_fpregs();
fpregs_lock();
if (user_only) {
if (!fpregs_state_valid(fpu, smp_processor_id()) &&
xfeatures_mask_supervisor())
copy_kernel_to_xregs(&fpu->state.xsave,
xfeatures_mask_supervisor());
copy_init_fpstate_to_fpregs(xfeatures_mask_user());
} else {
copy_init_fpstate_to_fpregs(xfeatures_mask_all);
}
fpregs_mark_activate();
fpregs_unlock();
}
void fpu__clear_user_states(struct fpu *fpu)
{
fpu__clear(fpu, true);
}
void fpu__clear_all(struct fpu *fpu)
{
fpu__clear(fpu, false);
}
/*

View File

@@ -224,7 +224,8 @@ static void __init fpu__init_system_xstate_size_legacy(void)
*/
u64 __init fpu__get_supported_xfeatures_mask(void)
{
return XCNTXT_MASK;
return XFEATURE_MASK_USER_SUPPORTED |
XFEATURE_MASK_SUPERVISOR_SUPPORTED;
}
/* Legacy code to initialize eager fpu mode. */

View File

@@ -139,7 +139,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
} else {
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
if (!ret)
ret = validate_xstate_header(&xsave->header);
ret = validate_user_xstate_header(&xsave->header);
}
/*

View File

@@ -211,9 +211,9 @@ retry:
}
static inline void
sanitize_restored_xstate(union fpregs_state *state,
struct user_i387_ia32_struct *ia32_env,
u64 xfeatures, int fx_only)
sanitize_restored_user_xstate(union fpregs_state *state,
struct user_i387_ia32_struct *ia32_env,
u64 user_xfeatures, int fx_only)
{
struct xregs_state *xsave = &state->xsave;
struct xstate_header *header = &xsave->header;
@@ -226,13 +226,22 @@ sanitize_restored_xstate(union fpregs_state *state,
*/
/*
* Init the state that is not present in the memory
* layout and not enabled by the OS.
* 'user_xfeatures' might have bits clear which are
* set in header->xfeatures. This represents features that
* were in init state prior to a signal delivery, and need
* to be reset back to the init state. Clear any user
* feature bits which are set in the kernel buffer to get
* them back to the init state.
*
* Supervisor state is unchanged by input from userspace.
* Ensure supervisor state bits stay set and supervisor
* state is not modified.
*/
if (fx_only)
header->xfeatures = XFEATURE_MASK_FPSSE;
else
header->xfeatures &= xfeatures;
header->xfeatures &= user_xfeatures |
xfeatures_mask_supervisor();
}
if (use_fxsr()) {
@@ -252,16 +261,24 @@ sanitize_restored_xstate(union fpregs_state *state,
*/
static int copy_user_to_fpregs_zeroing(void __user *buf, u64 xbv, int fx_only)
{
u64 init_bv;
int r;
if (use_xsave()) {
if (fx_only) {
u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE;
copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
return copy_user_to_fxregs(buf);
} else {
u64 init_bv = xfeatures_mask & ~xbv;
if (unlikely(init_bv))
init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE;
r = copy_user_to_fxregs(buf);
if (!r)
copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
return copy_user_to_xregs(buf, xbv);
return r;
} else {
init_bv = xfeatures_mask_user() & ~xbv;
r = copy_user_to_xregs(buf, xbv);
if (!r && unlikely(init_bv))
copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
return r;
}
} else if (use_fxsr()) {
return copy_user_to_fxregs(buf);
@@ -277,7 +294,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
struct task_struct *tsk = current;
struct fpu *fpu = &tsk->thread.fpu;
struct user_i387_ia32_struct env;
u64 xfeatures = 0;
u64 user_xfeatures = 0;
int fx_only = 0;
int ret = 0;
@@ -285,7 +302,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
IS_ENABLED(CONFIG_IA32_EMULATION));
if (!buf) {
fpu__clear(fpu);
fpu__clear_user_states(fpu);
return 0;
}
@@ -310,32 +327,14 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
trace_x86_fpu_xstate_check_failed(fpu);
} else {
state_size = fx_sw_user.xstate_size;
xfeatures = fx_sw_user.xfeatures;
user_xfeatures = fx_sw_user.xfeatures;
}
}
/*
* The current state of the FPU registers does not matter. By setting
* TIF_NEED_FPU_LOAD unconditionally it is ensured that the our xstate
* is not modified on context switch and that the xstate is considered
* to be loaded again on return to userland (overriding last_cpu avoids
* the optimisation).
*/
set_thread_flag(TIF_NEED_FPU_LOAD);
__fpu_invalidate_fpregs_state(fpu);
if ((unsigned long)buf_fx % 64)
fx_only = 1;
/*
* For 32-bit frames with fxstate, copy the fxstate so it can be
* reconstructed later.
*/
if (ia32_fxstate) {
ret = __copy_from_user(&env, buf, sizeof(env));
if (ret)
goto err_out;
envp = &env;
} else {
if (!ia32_fxstate) {
/*
* Attempt to restore the FPU registers directly from user
* memory. For that to succeed, the user access cannot cause
@@ -345,20 +344,65 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
*/
fpregs_lock();
pagefault_disable();
ret = copy_user_to_fpregs_zeroing(buf_fx, xfeatures, fx_only);
ret = copy_user_to_fpregs_zeroing(buf_fx, user_xfeatures, fx_only);
pagefault_enable();
if (!ret) {
/*
* Restore supervisor states: previous context switch
* etc has done XSAVES and saved the supervisor states
* in the kernel buffer from which they can be restored
* now.
*
* We cannot do a single XRSTORS here - which would
* be nice - because the rest of the FPU registers are
* being restored from a user buffer directly. The
* single XRSTORS happens below, when the user buffer
* has been copied to the kernel one.
*/
if (test_thread_flag(TIF_NEED_FPU_LOAD) &&
xfeatures_mask_supervisor())
copy_kernel_to_xregs(&fpu->state.xsave,
xfeatures_mask_supervisor());
fpregs_mark_activate();
fpregs_unlock();
return 0;
}
fpregs_deactivate(fpu);
fpregs_unlock();
} else {
/*
* For 32-bit frames with fxstate, copy the fxstate so it can
* be reconstructed later.
*/
ret = __copy_from_user(&env, buf, sizeof(env));
if (ret)
goto err_out;
envp = &env;
}
/*
* By setting TIF_NEED_FPU_LOAD it is ensured that our xstate is
* not modified on context switch and that the xstate is considered
* to be loaded again on return to userland (overriding last_cpu avoids
* the optimisation).
*/
fpregs_lock();
if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
/*
* Supervisor states are not modified by user space input. Save
* current supervisor states first and invalidate the FPU regs.
*/
if (xfeatures_mask_supervisor())
copy_supervisor_to_kernel(&fpu->state.xsave);
set_thread_flag(TIF_NEED_FPU_LOAD);
}
__fpu_invalidate_fpregs_state(fpu);
fpregs_unlock();
if (use_xsave() && !fx_only) {
u64 init_bv = xfeatures_mask & ~xfeatures;
u64 init_bv = xfeatures_mask_user() & ~user_xfeatures;
if (using_compacted_format()) {
ret = copy_user_to_xstate(&fpu->state.xsave, buf_fx);
@@ -366,17 +410,24 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
ret = __copy_from_user(&fpu->state.xsave, buf_fx, state_size);
if (!ret && state_size > offsetof(struct xregs_state, header))
ret = validate_xstate_header(&fpu->state.xsave.header);
ret = validate_user_xstate_header(&fpu->state.xsave.header);
}
if (ret)
goto err_out;
sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only);
sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures,
fx_only);
fpregs_lock();
if (unlikely(init_bv))
copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
ret = copy_kernel_to_xregs_err(&fpu->state.xsave, xfeatures);
/*
* Restore previously saved supervisor xstates along with
* copied-in user xstates.
*/
ret = copy_kernel_to_xregs_err(&fpu->state.xsave,
user_xfeatures | xfeatures_mask_supervisor());
} else if (use_fxsr()) {
ret = __copy_from_user(&fpu->state.fxsave, buf_fx, state_size);
@@ -385,11 +436,14 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
goto err_out;
}
sanitize_restored_xstate(&fpu->state, envp, xfeatures, fx_only);
sanitize_restored_user_xstate(&fpu->state, envp, user_xfeatures,
fx_only);
fpregs_lock();
if (use_xsave()) {
u64 init_bv = xfeatures_mask & ~XFEATURE_MASK_FPSSE;
u64 init_bv;
init_bv = xfeatures_mask_user() & ~XFEATURE_MASK_FPSSE;
copy_kernel_to_xregs(&init_fpstate.xsave, init_bv);
}
@@ -410,7 +464,7 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)
err_out:
if (ret)
fpu__clear(fpu);
fpu__clear_user_states(fpu);
return ret;
}
@@ -465,7 +519,7 @@ void fpu__init_prepare_fx_sw_frame(void)
fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
fx_sw_reserved.extended_size = size;
fx_sw_reserved.xfeatures = xfeatures_mask;
fx_sw_reserved.xfeatures = xfeatures_mask_user();
fx_sw_reserved.xstate_size = fpu_user_xstate_size;
if (IS_ENABLED(CONFIG_IA32_EMULATION) ||

View File

@@ -54,13 +54,15 @@ static short xsave_cpuid_features[] __initdata = {
};
/*
* Mask of xstate features supported by the CPU and the kernel:
* This represents the full set of bits that should ever be set in a kernel
* XSAVE buffer, both supervisor and user xstates.
*/
u64 xfeatures_mask __read_mostly;
u64 xfeatures_mask_all __read_mostly;
static unsigned int xstate_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_sizes[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_comp_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
static unsigned int xstate_supervisor_only_offsets[XFEATURE_MAX] = { [ 0 ... XFEATURE_MAX - 1] = -1};
/*
* The XSAVE area of kernel can be in standard or compacted format;
@@ -76,7 +78,7 @@ unsigned int fpu_user_xstate_size;
*/
int cpu_has_xfeatures(u64 xfeatures_needed, const char **feature_name)
{
u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask;
u64 xfeatures_missing = xfeatures_needed & ~xfeatures_mask_all;
if (unlikely(feature_name)) {
long xfeature_idx, max_idx;
@@ -150,7 +152,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
* None of the feature bits are in init state. So nothing else
* to do for us, as the memory layout is up to date.
*/
if ((xfeatures & xfeatures_mask) == xfeatures_mask)
if ((xfeatures & xfeatures_mask_all) == xfeatures_mask_all)
return;
/*
@@ -177,7 +179,7 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
* in a special way already:
*/
feature_bit = 0x2;
xfeatures = (xfeatures_mask & ~xfeatures) >> 2;
xfeatures = (xfeatures_mask_user() & ~xfeatures) >> 2;
/*
* Update all the remaining memory layouts according to their
@@ -205,30 +207,39 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
*/
void fpu__init_cpu_xstate(void)
{
if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask)
u64 unsup_bits;
if (!boot_cpu_has(X86_FEATURE_XSAVE) || !xfeatures_mask_all)
return;
/*
* Make it clear that XSAVES supervisor states are not yet
* implemented should anyone expect it to work by changing
* bits in XFEATURE_MASK_* macros and XCR0.
* Unsupported supervisor xstates should not be found in
* the xfeatures mask.
*/
WARN_ONCE((xfeatures_mask & XFEATURE_MASK_SUPERVISOR),
"x86/fpu: XSAVES supervisor states are not yet implemented.\n");
unsup_bits = xfeatures_mask_all & XFEATURE_MASK_SUPERVISOR_UNSUPPORTED;
WARN_ONCE(unsup_bits, "x86/fpu: Found unsupported supervisor xstates: 0x%llx\n",
unsup_bits);
xfeatures_mask &= ~XFEATURE_MASK_SUPERVISOR;
xfeatures_mask_all &= ~XFEATURE_MASK_SUPERVISOR_UNSUPPORTED;
cr4_set_bits(X86_CR4_OSXSAVE);
xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
/*
* XCR_XFEATURE_ENABLED_MASK (aka. XCR0) sets user features
* managed by XSAVE{C, OPT, S} and XRSTOR{S}. Only XSAVE user
* states can be set here.
*/
xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user());
/*
* MSR_IA32_XSS sets supervisor states managed by XSAVES.
*/
if (boot_cpu_has(X86_FEATURE_XSAVES))
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
}
/*
* Note that in the future we will likely need a pair of
* functions here: one for user xstates and the other for
* system xstates. For now, they are the same.
*/
static int xfeature_enabled(enum xfeature xfeature)
static bool xfeature_enabled(enum xfeature xfeature)
{
return !!(xfeatures_mask & (1UL << xfeature));
return xfeatures_mask_all & BIT_ULL(xfeature);
}
/*
@@ -382,6 +393,33 @@ static void __init setup_xstate_comp_offsets(void)
}
}
/*
* Setup offsets of a supervisor-state-only XSAVES buffer:
*
* The offsets stored in xstate_comp_offsets[] only work for one specific
* value of the Requested Feature BitMap (RFBM). In cases where a different
* RFBM value is used, a different set of offsets is required. This set of
* offsets is for when RFBM=xfeatures_mask_supervisor().
*/
static void __init setup_supervisor_only_offsets(void)
{
unsigned int next_offset;
int i;
next_offset = FXSAVE_SIZE + XSAVE_HDR_SIZE;
for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
if (!xfeature_enabled(i) || !xfeature_is_supervisor(i))
continue;
if (xfeature_is_aligned(i))
next_offset = ALIGN(next_offset, 64);
xstate_supervisor_only_offsets[i] = next_offset;
next_offset += xstate_sizes[i];
}
}
/*
* Print out xstate component offsets and sizes
*/
@@ -415,7 +453,7 @@ static void __init setup_init_fpu_buf(void)
if (boot_cpu_has(X86_FEATURE_XSAVES))
init_fpstate.xsave.header.xcomp_bv = XCOMP_BV_COMPACTED_FORMAT |
xfeatures_mask;
xfeatures_mask_all;
/*
* Init all the features state with header.xfeatures being 0x0
@@ -438,7 +476,7 @@ static int xfeature_uncompacted_offset(int xfeature_nr)
* format. Checking a supervisor state's uncompacted offset is
* an error.
*/
if (XFEATURE_MASK_SUPERVISOR & BIT_ULL(xfeature_nr)) {
if (XFEATURE_MASK_SUPERVISOR_ALL & BIT_ULL(xfeature_nr)) {
WARN_ONCE(1, "No fixed offset for xstate %d\n", xfeature_nr);
return -1;
}
@@ -472,10 +510,10 @@ int using_compacted_format(void)
}
/* Validate an xstate header supplied by userspace (ptrace or sigreturn) */
int validate_xstate_header(const struct xstate_header *hdr)
int validate_user_xstate_header(const struct xstate_header *hdr)
{
/* No unknown or supervisor features may be set */
if (hdr->xfeatures & (~xfeatures_mask | XFEATURE_MASK_SUPERVISOR))
if (hdr->xfeatures & ~xfeatures_mask_user())
return -EINVAL;
/* Userspace must use the uncompacted format */
@@ -610,15 +648,12 @@ static void do_extra_xstate_size_checks(void)
/*
* Get total size of enabled xstates in XCR0/xfeatures_mask.
* Get total size of enabled xstates in XCR0 | IA32_XSS.
*
* Note the SDM's wording here. "sub-function 0" only enumerates
* the size of the *user* states. If we use it to size a buffer
* that we use 'XSAVES' on, we could potentially overflow the
* buffer because 'XSAVES' saves system states too.
*
* Note that we do not currently set any bits on IA32_XSS so
* 'XCR0 | IA32_XSS == XCR0' for now.
*/
static unsigned int __init get_xsaves_size(void)
{
@@ -700,7 +735,7 @@ static int __init init_xstate_size(void)
*/
static void fpu__init_disable_system_xstate(void)
{
xfeatures_mask = 0;
xfeatures_mask_all = 0;
cr4_clear_bits(X86_CR4_OSXSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
}
@@ -735,16 +770,26 @@ void __init fpu__init_system_xstate(void)
return;
}
/*
* Find user xstates supported by the processor.
*/
cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
xfeatures_mask = eax + ((u64)edx << 32);
xfeatures_mask_all = eax + ((u64)edx << 32);
if ((xfeatures_mask & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
/*
* Find supervisor xstates supported by the processor.
*/
cpuid_count(XSTATE_CPUID, 1, &eax, &ebx, &ecx, &edx);
xfeatures_mask_all |= ecx + ((u64)edx << 32);
if ((xfeatures_mask_user() & XFEATURE_MASK_FPSSE) != XFEATURE_MASK_FPSSE) {
/*
* This indicates that something really unexpected happened
* with the enumeration. Disable XSAVE and try to continue
* booting without it. This is too early to BUG().
*/
pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n", xfeatures_mask);
pr_err("x86/fpu: FP/SSE not present amongst the CPU's xstate features: 0x%llx.\n",
xfeatures_mask_all);
goto out_disable;
}
@@ -753,10 +798,10 @@ void __init fpu__init_system_xstate(void)
*/
for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
if (!boot_cpu_has(xsave_cpuid_features[i]))
xfeatures_mask &= ~BIT(i);
xfeatures_mask_all &= ~BIT_ULL(i);
}
xfeatures_mask &= fpu__get_supported_xfeatures_mask();
xfeatures_mask_all &= fpu__get_supported_xfeatures_mask();
/* Enable xstate instructions to be able to continue with initialization: */
fpu__init_cpu_xstate();
@@ -768,15 +813,16 @@ void __init fpu__init_system_xstate(void)
* Update info used for ptrace frames; use standard-format size and no
* supervisor xstates:
*/
update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask & ~XFEATURE_MASK_SUPERVISOR);
update_regset_xstate_info(fpu_user_xstate_size, xfeatures_mask_user());
fpu__init_prepare_fx_sw_frame();
setup_init_fpu_buf();
setup_xstate_comp_offsets();
setup_supervisor_only_offsets();
print_xstate_offset_size();
pr_info("x86/fpu: Enabled xstate features 0x%llx, context size is %d bytes, using '%s' format.\n",
xfeatures_mask,
xfeatures_mask_all,
fpu_kernel_xstate_size,
boot_cpu_has(X86_FEATURE_XSAVES) ? "compacted" : "standard");
return;
@@ -795,7 +841,14 @@ void fpu__resume_cpu(void)
* Restore XCR0 on xsave capable CPUs:
*/
if (boot_cpu_has(X86_FEATURE_XSAVE))
xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask);
xsetbv(XCR_XFEATURE_ENABLED_MASK, xfeatures_mask_user());
/*
* Restore IA32_XSS. The same CPUID bit enumerates support
* of XSAVES and MSR_IA32_XSS.
*/
if (boot_cpu_has(X86_FEATURE_XSAVES))
wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor());
}
/*
@@ -840,10 +893,9 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
/*
* We should not ever be requesting features that we
* have not enabled. Remember that xfeatures_mask is
* what we write to the XCR0 register.
* have not enabled.
*/
WARN_ONCE(!(xfeatures_mask & BIT_ULL(xfeature_nr)),
WARN_ONCE(!(xfeatures_mask_all & BIT_ULL(xfeature_nr)),
"get of unsupported state");
/*
* This assumes the last 'xsave*' instruction to
@@ -957,18 +1009,31 @@ static inline bool xfeatures_mxcsr_quirk(u64 xfeatures)
return true;
}
/*
* This is similar to user_regset_copyout(), but will not add offset to
* the source data pointer or increment pos, count, kbuf, and ubuf.
*/
static inline void
__copy_xstate_to_kernel(void *kbuf, const void *data,
unsigned int offset, unsigned int size, unsigned int size_total)
static void fill_gap(unsigned to, void **kbuf, unsigned *pos, unsigned *count)
{
if (offset < size_total) {
unsigned int copy = min(size, size_total - offset);
if (*pos < to) {
unsigned size = to - *pos;
memcpy(kbuf + offset, data, copy);
if (size > *count)
size = *count;
memcpy(*kbuf, (void *)&init_fpstate.xsave + *pos, size);
*kbuf += size;
*pos += size;
*count -= size;
}
}
static void copy_part(unsigned offset, unsigned size, void *from,
void **kbuf, unsigned *pos, unsigned *count)
{
fill_gap(offset, kbuf, pos, count);
if (size > *count)
size = *count;
if (size) {
memcpy(*kbuf, from, size);
*kbuf += size;
*pos += size;
*count -= size;
}
}
@@ -981,8 +1046,9 @@ __copy_xstate_to_kernel(void *kbuf, const void *data,
*/
int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int offset_start, unsigned int size_total)
{
unsigned int offset, size;
struct xstate_header header;
const unsigned off_mxcsr = offsetof(struct fxregs_state, mxcsr);
unsigned count = size_total;
int i;
/*
@@ -996,48 +1062,44 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
*/
memset(&header, 0, sizeof(header));
header.xfeatures = xsave->header.xfeatures;
header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
header.xfeatures &= xfeatures_mask_user();
if (header.xfeatures & XFEATURE_MASK_FP)
copy_part(0, off_mxcsr,
&xsave->i387, &kbuf, &offset_start, &count);
if (header.xfeatures & (XFEATURE_MASK_SSE | XFEATURE_MASK_YMM))
copy_part(off_mxcsr, MXCSR_AND_FLAGS_SIZE,
&xsave->i387.mxcsr, &kbuf, &offset_start, &count);
if (header.xfeatures & XFEATURE_MASK_FP)
copy_part(offsetof(struct fxregs_state, st_space), 128,
&xsave->i387.st_space, &kbuf, &offset_start, &count);
if (header.xfeatures & XFEATURE_MASK_SSE)
copy_part(xstate_offsets[XFEATURE_MASK_SSE], 256,
&xsave->i387.xmm_space, &kbuf, &offset_start, &count);
/*
* Fill xsave->i387.sw_reserved value for ptrace frame:
*/
copy_part(offsetof(struct fxregs_state, sw_reserved), 48,
xstate_fx_sw_bytes, &kbuf, &offset_start, &count);
/*
* Copy xregs_state->header:
*/
offset = offsetof(struct xregs_state, header);
size = sizeof(header);
copy_part(offsetof(struct xregs_state, header), sizeof(header),
&header, &kbuf, &offset_start, &count);
__copy_xstate_to_kernel(kbuf, &header, offset, size, size_total);
for (i = 0; i < XFEATURE_MAX; i++) {
for (i = FIRST_EXTENDED_XFEATURE; i < XFEATURE_MAX; i++) {
/*
* Copy only in-use xstates:
*/
if ((header.xfeatures >> i) & 1) {
void *src = __raw_xsave_addr(xsave, i);
offset = xstate_offsets[i];
size = xstate_sizes[i];
/* The next component has to fit fully into the output buffer: */
if (offset + size > size_total)
break;
__copy_xstate_to_kernel(kbuf, src, offset, size, size_total);
copy_part(xstate_offsets[i], xstate_sizes[i],
src, &kbuf, &offset_start, &count);
}
}
if (xfeatures_mxcsr_quirk(header.xfeatures)) {
offset = offsetof(struct fxregs_state, mxcsr);
size = MXCSR_AND_FLAGS_SIZE;
__copy_xstate_to_kernel(kbuf, &xsave->i387.mxcsr, offset, size, size_total);
}
/*
* Fill xsave->i387.sw_reserved value for ptrace frame:
*/
offset = offsetof(struct fxregs_state, sw_reserved);
size = sizeof(xstate_fx_sw_bytes);
__copy_xstate_to_kernel(kbuf, xstate_fx_sw_bytes, offset, size, size_total);
fill_gap(size_total, &kbuf, &offset_start, &count);
return 0;
}
@@ -1080,7 +1142,7 @@ int copy_xstate_to_user(void __user *ubuf, struct xregs_state *xsave, unsigned i
*/
memset(&header, 0, sizeof(header));
header.xfeatures = xsave->header.xfeatures;
header.xfeatures &= ~XFEATURE_MASK_SUPERVISOR;
header.xfeatures &= xfeatures_mask_user();
/*
* Copy xregs_state->header:
@@ -1147,7 +1209,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
memcpy(&hdr, kbuf + offset, size);
if (validate_xstate_header(&hdr))
if (validate_user_xstate_header(&hdr))
return -EINVAL;
for (i = 0; i < XFEATURE_MAX; i++) {
@@ -1173,7 +1235,7 @@ int copy_kernel_to_xstate(struct xregs_state *xsave, const void *kbuf)
* The state that came in from userspace was user-state only.
* Mask all the user states out of 'xfeatures':
*/
xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR;
xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;
/*
* Add back in the features that came in from userspace:
@@ -1201,7 +1263,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
if (__copy_from_user(&hdr, ubuf + offset, size))
return -EFAULT;
if (validate_xstate_header(&hdr))
if (validate_user_xstate_header(&hdr))
return -EINVAL;
for (i = 0; i < XFEATURE_MAX; i++) {
@@ -1229,7 +1291,7 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
* The state that came in from userspace was user-state only.
* Mask all the user states out of 'xfeatures':
*/
xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR;
xsave->header.xfeatures &= XFEATURE_MASK_SUPERVISOR_ALL;
/*
* Add back in the features that came in from userspace:
@@ -1239,6 +1301,61 @@ int copy_user_to_xstate(struct xregs_state *xsave, const void __user *ubuf)
return 0;
}
/*
* Save only supervisor states to the kernel buffer. This blows away all
* old states, and is intended to be used only in __fpu__restore_sig(), where
* user states are restored from the user buffer.
*/
void copy_supervisor_to_kernel(struct xregs_state *xstate)
{
struct xstate_header *header;
u64 max_bit, min_bit;
u32 lmask, hmask;
int err, i;
if (WARN_ON(!boot_cpu_has(X86_FEATURE_XSAVES)))
return;
if (!xfeatures_mask_supervisor())
return;
max_bit = __fls(xfeatures_mask_supervisor());
min_bit = __ffs(xfeatures_mask_supervisor());
lmask = xfeatures_mask_supervisor();
hmask = xfeatures_mask_supervisor() >> 32;
XSTATE_OP(XSAVES, xstate, lmask, hmask, err);
/* We should never fault when copying to a kernel buffer: */
if (WARN_ON_FPU(err))
return;
/*
* At this point, the buffer has only supervisor states and must be
* converted back to normal kernel format.
*/
header = &xstate->header;
header->xcomp_bv |= xfeatures_mask_all;
/*
* This only moves states up in the buffer. Start with
* the last state and move backwards so that states are
* not overwritten until after they are moved. Note:
* memmove() allows overlapping src/dst buffers.
*/
for (i = max_bit; i >= min_bit; i--) {
u8 *xbuf = (u8 *)xstate;
if (!((header->xfeatures >> i) & 1))
continue;
/* Move xfeature 'i' into its normal location */
memmove(xbuf + xstate_comp_offsets[i],
xbuf + xstate_supervisor_only_offsets[i],
xstate_sizes[i]);
}
}
#ifdef CONFIG_PROC_PID_ARCH_STATUS
/*
* Report the amount of time elapsed in millisecond since last AVX512

View File

@@ -282,7 +282,8 @@ static inline void tramp_free(void *tramp) { }
/* Defined as markers to the end of the ftrace default trampolines */
extern void ftrace_regs_caller_end(void);
extern void ftrace_epilogue(void);
extern void ftrace_regs_caller_ret(void);
extern void ftrace_caller_end(void);
extern void ftrace_caller_op_ptr(void);
extern void ftrace_regs_caller_op_ptr(void);
@@ -334,7 +335,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
call_offset = (unsigned long)ftrace_regs_call;
} else {
start_offset = (unsigned long)ftrace_caller;
end_offset = (unsigned long)ftrace_epilogue;
end_offset = (unsigned long)ftrace_caller_end;
op_offset = (unsigned long)ftrace_caller_op_ptr;
call_offset = (unsigned long)ftrace_call;
}
@@ -366,6 +367,13 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
if (WARN_ON(ret < 0))
goto fail;
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
ip = trampoline + (ftrace_regs_caller_ret - ftrace_regs_caller);
ret = probe_kernel_read(ip, (void *)retq, RET_SIZE);
if (WARN_ON(ret < 0))
goto fail;
}
/*
* The address of the ftrace_ops that is used for this trampoline
* is stored at the end of the trampoline. This will be used to
@@ -407,7 +415,8 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size)
set_vm_flush_reset_perms(trampoline);
set_memory_ro((unsigned long)trampoline, npages);
if (likely(system_state != SYSTEM_BOOTING))
set_memory_ro((unsigned long)trampoline, npages);
set_memory_x((unsigned long)trampoline, npages);
return (unsigned long)trampoline;
fail:
@@ -415,6 +424,32 @@ fail:
return 0;
}
void set_ftrace_ops_ro(void)
{
struct ftrace_ops *ops;
unsigned long start_offset;
unsigned long end_offset;
unsigned long npages;
unsigned long size;
do_for_each_ftrace_op(ops, ftrace_ops_list) {
if (!(ops->flags & FTRACE_OPS_FL_ALLOC_TRAMP))
continue;
if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) {
start_offset = (unsigned long)ftrace_regs_caller;
end_offset = (unsigned long)ftrace_regs_caller_end;
} else {
start_offset = (unsigned long)ftrace_caller;
end_offset = (unsigned long)ftrace_caller_end;
}
size = end_offset - start_offset;
size = size + RET_SIZE + sizeof(void *);
npages = DIV_ROUND_UP(size, PAGE_SIZE);
set_memory_ro((unsigned long)ops->trampoline, npages);
} while_for_each_ftrace_op(ops);
}
static unsigned long calc_trampoline_call_offset(bool save_regs)
{
unsigned long start_offset;

View File

@@ -189,5 +189,5 @@ return_to_handler:
movl %eax, %ecx
popl %edx
popl %eax
JMP_NOSPEC %ecx
JMP_NOSPEC ecx
#endif

View File

@@ -12,7 +12,7 @@
#include <asm/frame.h>
.code64
.section .entry.text, "ax"
.section .text, "ax"
#ifdef CONFIG_FRAME_POINTER
/* Save parent and function stack frames (rip and rbp) */
@@ -23,7 +23,7 @@
#endif /* CONFIG_FRAME_POINTER */
/* Size of stack used to save mcount regs in save_mcount_regs */
#define MCOUNT_REG_SIZE (SS+8 + MCOUNT_FRAME_SIZE)
#define MCOUNT_REG_SIZE (FRAME_SIZE + MCOUNT_FRAME_SIZE)
/*
* gcc -pg option adds a call to 'mcount' in most functions.
@@ -77,7 +77,7 @@
/*
* We add enough stack to save all regs.
*/
subq $(MCOUNT_REG_SIZE - MCOUNT_FRAME_SIZE), %rsp
subq $(FRAME_SIZE), %rsp
movq %rax, RAX(%rsp)
movq %rcx, RCX(%rsp)
movq %rdx, RDX(%rsp)
@@ -157,8 +157,12 @@ SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL)
* think twice before adding any new code or changing the
* layout here.
*/
SYM_INNER_LABEL(ftrace_epilogue, SYM_L_GLOBAL)
SYM_INNER_LABEL(ftrace_caller_end, SYM_L_GLOBAL)
jmp ftrace_epilogue
SYM_FUNC_END(ftrace_caller);
SYM_FUNC_START(ftrace_epilogue)
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL)
jmp ftrace_stub
@@ -170,14 +174,12 @@ SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL)
*/
SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK)
retq
SYM_FUNC_END(ftrace_caller)
SYM_FUNC_END(ftrace_epilogue)
SYM_FUNC_START(ftrace_regs_caller)
/* Save the current flags before any operations that can change them */
pushfq
UNWIND_HINT_SAVE
/* added 8 bytes to save flags */
save_mcount_regs 8
/* save_mcount_regs fills in first two parameters */
@@ -233,10 +235,13 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
movq ORIG_RAX(%rsp), %rax
movq %rax, MCOUNT_REG_SIZE-8(%rsp)
/* If ORIG_RAX is anything but zero, make this a call to that */
/*
* If ORIG_RAX is anything but zero, make this a call to that.
* See arch_ftrace_set_direct_caller().
*/
movq ORIG_RAX(%rsp), %rax
cmpq $0, %rax
je 1f
testq %rax, %rax
jz 1f
/* Swap the flags with orig_rax */
movq MCOUNT_REG_SIZE(%rsp), %rdi
@@ -244,20 +249,14 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
movq %rax, MCOUNT_REG_SIZE(%rsp)
restore_mcount_regs 8
/* Restore flags */
popfq
jmp 2f
SYM_INNER_LABEL(ftrace_regs_caller_ret, SYM_L_GLOBAL);
UNWIND_HINT_RET_OFFSET
jmp ftrace_epilogue
1: restore_mcount_regs
2:
/*
* The stack layout is nondetermistic here, depending on which path was
* taken. This confuses objtool and ORC, rightfully so. For now,
* pretend the stack always looks like the non-direct case.
*/
UNWIND_HINT_RESTORE
/* Restore flags */
popfq
@@ -268,7 +267,6 @@ SYM_INNER_LABEL(ftrace_regs_call, SYM_L_GLOBAL)
* to the return.
*/
SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
jmp ftrace_epilogue
SYM_FUNC_END(ftrace_regs_caller)
@@ -303,7 +301,7 @@ trace:
* function tracing is enabled.
*/
movq ftrace_trace_function, %r8
CALL_NOSPEC %r8
CALL_NOSPEC r8
restore_mcount_regs
jmp fgraph_trace
@@ -340,6 +338,6 @@ SYM_CODE_START(return_to_handler)
movq 8(%rsp), %rdx
movq (%rsp), %rax
addq $24, %rsp
JMP_NOSPEC %rdi
JMP_NOSPEC rdi
SYM_CODE_END(return_to_handler)
#endif

View File

@@ -20,13 +20,13 @@
#include <linux/io.h>
#include <linux/memblock.h>
#include <linux/mem_encrypt.h>
#include <linux/pgtable.h>
#include <asm/processor.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/setup.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/sections.h>
#include <asm/kdebug.h>

View File

@@ -13,8 +13,8 @@
#include <linux/linkage.h>
#include <linux/threads.h>
#include <linux/init.h>
#include <linux/pgtable.h>
#include <asm/segment.h>
#include <asm/pgtable.h>
#include <asm/page.h>
#include <asm/msr.h>
#include <asm/cache.h>
@@ -29,15 +29,16 @@
#ifdef CONFIG_PARAVIRT_XXL
#include <asm/asm-offsets.h>
#include <asm/paravirt.h>
#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg
#else
#define INTERRUPT_RETURN iretq
#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg
#endif
/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
/*
* We are not able to switch in one step to the final KERNEL ADDRESS SPACE
* because we need identity-mapped pages.
*
*/
#define l4_index(x) (((x) >> 39) & 511)
#define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))

View File

@@ -32,6 +32,8 @@
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <asm/user.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>
/* Per cpu debug control register value */
DEFINE_PER_CPU(unsigned long, cpu_dr7);
@@ -97,6 +99,8 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
unsigned long *dr7;
int i;
lockdep_assert_irqs_disabled();
for (i = 0; i < HBP_NUM; i++) {
struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);
@@ -115,6 +119,12 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
dr7 = this_cpu_ptr(&cpu_dr7);
*dr7 |= encode_dr7(i, info->len, info->type);
/*
* Ensure we first write cpu_dr7 before we set the DR7 register.
* This ensures an NMI never see cpu_dr7 0 when DR7 is not.
*/
barrier();
set_debugreg(*dr7, 7);
if (info->mask)
set_dr_addr_mask(info->mask, i);
@@ -134,9 +144,11 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
void arch_uninstall_hw_breakpoint(struct perf_event *bp)
{
struct arch_hw_breakpoint *info = counter_arch_bp(bp);
unsigned long *dr7;
unsigned long dr7;
int i;
lockdep_assert_irqs_disabled();
for (i = 0; i < HBP_NUM; i++) {
struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]);
@@ -149,12 +161,20 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp)
if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
return;
dr7 = this_cpu_ptr(&cpu_dr7);
*dr7 &= ~__encode_dr7(i, info->len, info->type);
dr7 = this_cpu_read(cpu_dr7);
dr7 &= ~__encode_dr7(i, info->len, info->type);
set_debugreg(*dr7, 7);
set_debugreg(dr7, 7);
if (info->mask)
set_dr_addr_mask(0, i);
/*
* Ensure the write to cpu_dr7 is after we've set the DR7 register.
* This ensures an NMI never see cpu_dr7 0 when DR7 is not.
*/
barrier();
this_cpu_write(cpu_dr7, dr7);
}
static int arch_bp_generic_len(int x86_len)
@@ -227,10 +247,76 @@ int arch_check_bp_in_kernelspace(struct arch_hw_breakpoint *hw)
return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX);
}
/*
* Checks whether the range [addr, end], overlaps the area [base, base + size).
*/
static inline bool within_area(unsigned long addr, unsigned long end,
unsigned long base, unsigned long size)
{
return end >= base && addr < (base + size);
}
/*
* Checks whether the range from addr to end, inclusive, overlaps the fixed
* mapped CPU entry area range or other ranges used for CPU entry.
*/
static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
{
int cpu;
/* CPU entry erea is always used for CPU entry */
if (within_area(addr, end, CPU_ENTRY_AREA_BASE,
CPU_ENTRY_AREA_TOTAL_SIZE))
return true;
for_each_possible_cpu(cpu) {
/* The original rw GDT is being used after load_direct_gdt() */
if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu),
GDT_SIZE))
return true;
/*
* cpu_tss_rw is not directly referenced by hardware, but
* cpu_tss_rw is also used in CPU entry code,
*/
if (within_area(addr, end,
(unsigned long)&per_cpu(cpu_tss_rw, cpu),
sizeof(struct tss_struct)))
return true;
/*
* cpu_tlbstate.user_pcid_flush_mask is used for CPU entry.
* If a data breakpoint on it, it will cause an unwanted #DB.
* Protect the full cpu_tlbstate structure to be sure.
*/
if (within_area(addr, end,
(unsigned long)&per_cpu(cpu_tlbstate, cpu),
sizeof(struct tlb_state)))
return true;
}
return false;
}
static int arch_build_bp_info(struct perf_event *bp,
const struct perf_event_attr *attr,
struct arch_hw_breakpoint *hw)
{
unsigned long bp_end;
bp_end = attr->bp_addr + attr->bp_len - 1;
if (bp_end < attr->bp_addr)
return -EINVAL;
/*
* Prevent any breakpoint of any type that overlaps the CPU
* entry area and data. This protects the IST stacks and also
* reduces the chance that we ever find out what happens if
* there's a data breakpoint on the GDT, IDT, or TSS.
*/
if (within_cpu_entry(attr->bp_addr, bp_end))
return -EINVAL;
hw->address = attr->bp_addr;
hw->mask = 0;
@@ -439,7 +525,7 @@ static int hw_breakpoint_handler(struct die_args *args)
{
int i, cpu, rc = NOTIFY_STOP;
struct perf_event *bp;
unsigned long dr7, dr6;
unsigned long dr6;
unsigned long *dr6_p;
/* The DR6 value is pointed by args->err */
@@ -454,9 +540,6 @@ static int hw_breakpoint_handler(struct die_args *args)
if ((dr6 & DR_TRAP_BITS) == 0)
return NOTIFY_DONE;
get_debugreg(dr7, 7);
/* Disable breakpoints during exception handling */
set_debugreg(0UL, 7);
/*
* Assert that local interrupts are disabled
* Reset the DRn bits in the virtualized register value.
@@ -513,7 +596,6 @@ static int hw_breakpoint_handler(struct die_args *args)
(dr6 & (~DR_TRAP_BITS)))
rc = NOTIFY_DONE;
set_debugreg(dr7, 7);
put_cpu();
return rc;

View File

@@ -15,11 +15,11 @@
#include <linux/acpi.h>
#include <linux/io.h>
#include <linux/delay.h>
#include <linux/pgtable.h>
#include <linux/atomic.h>
#include <asm/timer.h>
#include <asm/hw_irq.h>
#include <asm/pgtable.h>
#include <asm/desc.h>
#include <asm/apic.h>
#include <asm/i8259.h>

View File

@@ -4,6 +4,8 @@
*/
#include <linux/interrupt.h>
#include <asm/cpu_entry_area.h>
#include <asm/set_memory.h>
#include <asm/traps.h>
#include <asm/proto.h>
#include <asm/desc.h>
@@ -51,15 +53,23 @@ struct idt_data {
#define TSKG(_vector, _gdt) \
G(_vector, NULL, DEFAULT_STACK, GATE_TASK, DPL0, _gdt << 3)
#define IDT_TABLE_SIZE (IDT_ENTRIES * sizeof(gate_desc))
static bool idt_setup_done __initdata;
/*
* Early traps running on the DEFAULT_STACK because the other interrupt
* stacks work only after cpu_init().
*/
static const __initconst struct idt_data early_idts[] = {
INTG(X86_TRAP_DB, debug),
SYSG(X86_TRAP_BP, int3),
INTG(X86_TRAP_DB, asm_exc_debug),
SYSG(X86_TRAP_BP, asm_exc_int3),
#ifdef CONFIG_X86_32
INTG(X86_TRAP_PF, page_fault),
/*
* Not possible on 64-bit. See idt_setup_early_pf() for details.
*/
INTG(X86_TRAP_PF, asm_exc_page_fault),
#endif
};
@@ -70,33 +80,33 @@ static const __initconst struct idt_data early_idts[] = {
* set up TSS.
*/
static const __initconst struct idt_data def_idts[] = {
INTG(X86_TRAP_DE, divide_error),
INTG(X86_TRAP_NMI, nmi),
INTG(X86_TRAP_BR, bounds),
INTG(X86_TRAP_UD, invalid_op),
INTG(X86_TRAP_NM, device_not_available),
INTG(X86_TRAP_OLD_MF, coprocessor_segment_overrun),
INTG(X86_TRAP_TS, invalid_TSS),
INTG(X86_TRAP_NP, segment_not_present),
INTG(X86_TRAP_SS, stack_segment),
INTG(X86_TRAP_GP, general_protection),
INTG(X86_TRAP_SPURIOUS, spurious_interrupt_bug),
INTG(X86_TRAP_MF, coprocessor_error),
INTG(X86_TRAP_AC, alignment_check),
INTG(X86_TRAP_XF, simd_coprocessor_error),
INTG(X86_TRAP_DE, asm_exc_divide_error),
INTG(X86_TRAP_NMI, asm_exc_nmi),
INTG(X86_TRAP_BR, asm_exc_bounds),
INTG(X86_TRAP_UD, asm_exc_invalid_op),
INTG(X86_TRAP_NM, asm_exc_device_not_available),
INTG(X86_TRAP_OLD_MF, asm_exc_coproc_segment_overrun),
INTG(X86_TRAP_TS, asm_exc_invalid_tss),
INTG(X86_TRAP_NP, asm_exc_segment_not_present),
INTG(X86_TRAP_SS, asm_exc_stack_segment),
INTG(X86_TRAP_GP, asm_exc_general_protection),
INTG(X86_TRAP_SPURIOUS, asm_exc_spurious_interrupt_bug),
INTG(X86_TRAP_MF, asm_exc_coprocessor_error),
INTG(X86_TRAP_AC, asm_exc_alignment_check),
INTG(X86_TRAP_XF, asm_exc_simd_coprocessor_error),
#ifdef CONFIG_X86_32
TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS),
#else
INTG(X86_TRAP_DF, double_fault),
INTG(X86_TRAP_DF, asm_exc_double_fault),
#endif
INTG(X86_TRAP_DB, debug),
INTG(X86_TRAP_DB, asm_exc_debug),
#ifdef CONFIG_X86_MCE
INTG(X86_TRAP_MC, &machine_check),
INTG(X86_TRAP_MC, asm_exc_machine_check),
#endif
SYSG(X86_TRAP_OF, overflow),
SYSG(X86_TRAP_OF, asm_exc_overflow),
#if defined(CONFIG_IA32_EMULATION)
SYSG(IA32_SYSCALL_VECTOR, entry_INT80_compat),
#elif defined(CONFIG_X86_32)
@@ -109,95 +119,63 @@ static const __initconst struct idt_data def_idts[] = {
*/
static const __initconst struct idt_data apic_idts[] = {
#ifdef CONFIG_SMP
INTG(RESCHEDULE_VECTOR, reschedule_interrupt),
INTG(CALL_FUNCTION_VECTOR, call_function_interrupt),
INTG(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt),
INTG(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt),
INTG(REBOOT_VECTOR, reboot_interrupt),
INTG(RESCHEDULE_VECTOR, asm_sysvec_reschedule_ipi),
INTG(CALL_FUNCTION_VECTOR, asm_sysvec_call_function),
INTG(CALL_FUNCTION_SINGLE_VECTOR, asm_sysvec_call_function_single),
INTG(IRQ_MOVE_CLEANUP_VECTOR, asm_sysvec_irq_move_cleanup),
INTG(REBOOT_VECTOR, asm_sysvec_reboot),
#endif
#ifdef CONFIG_X86_THERMAL_VECTOR
INTG(THERMAL_APIC_VECTOR, thermal_interrupt),
INTG(THERMAL_APIC_VECTOR, asm_sysvec_thermal),
#endif
#ifdef CONFIG_X86_MCE_THRESHOLD
INTG(THRESHOLD_APIC_VECTOR, threshold_interrupt),
INTG(THRESHOLD_APIC_VECTOR, asm_sysvec_threshold),
#endif
#ifdef CONFIG_X86_MCE_AMD
INTG(DEFERRED_ERROR_VECTOR, deferred_error_interrupt),
INTG(DEFERRED_ERROR_VECTOR, asm_sysvec_deferred_error),
#endif
#ifdef CONFIG_X86_LOCAL_APIC
INTG(LOCAL_TIMER_VECTOR, apic_timer_interrupt),
INTG(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi),
INTG(LOCAL_TIMER_VECTOR, asm_sysvec_apic_timer_interrupt),
INTG(X86_PLATFORM_IPI_VECTOR, asm_sysvec_x86_platform_ipi),
# ifdef CONFIG_HAVE_KVM
INTG(POSTED_INTR_VECTOR, kvm_posted_intr_ipi),
INTG(POSTED_INTR_WAKEUP_VECTOR, kvm_posted_intr_wakeup_ipi),
INTG(POSTED_INTR_NESTED_VECTOR, kvm_posted_intr_nested_ipi),
INTG(POSTED_INTR_VECTOR, asm_sysvec_kvm_posted_intr_ipi),
INTG(POSTED_INTR_WAKEUP_VECTOR, asm_sysvec_kvm_posted_intr_wakeup_ipi),
INTG(POSTED_INTR_NESTED_VECTOR, asm_sysvec_kvm_posted_intr_nested_ipi),
# endif
# ifdef CONFIG_IRQ_WORK
INTG(IRQ_WORK_VECTOR, irq_work_interrupt),
INTG(IRQ_WORK_VECTOR, asm_sysvec_irq_work),
# endif
#ifdef CONFIG_X86_UV
INTG(UV_BAU_MESSAGE, uv_bau_message_intr1),
#endif
INTG(SPURIOUS_APIC_VECTOR, spurious_interrupt),
INTG(ERROR_APIC_VECTOR, error_interrupt),
# ifdef CONFIG_X86_UV
INTG(UV_BAU_MESSAGE, asm_sysvec_uv_bau_message),
# endif
INTG(SPURIOUS_APIC_VECTOR, asm_sysvec_spurious_apic_interrupt),
INTG(ERROR_APIC_VECTOR, asm_sysvec_error_interrupt),
#endif
};
#ifdef CONFIG_X86_64
/*
* Early traps running on the DEFAULT_STACK because the other interrupt
* stacks work only after cpu_init().
*/
static const __initconst struct idt_data early_pf_idts[] = {
INTG(X86_TRAP_PF, page_fault),
};
/*
* Override for the debug_idt. Same as the default, but with interrupt
* stack set to DEFAULT_STACK (0). Required for NMI trap handling.
*/
static const __initconst struct idt_data dbg_idts[] = {
INTG(X86_TRAP_DB, debug),
};
#endif
/* Must be page-aligned because the real IDT is used in a fixmap. */
gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
/* Must be page-aligned because the real IDT is used in the cpu entry area */
static gate_desc idt_table[IDT_ENTRIES] __page_aligned_bss;
struct desc_ptr idt_descr __ro_after_init = {
.size = (IDT_ENTRIES * 2 * sizeof(unsigned long)) - 1,
.size = IDT_TABLE_SIZE - 1,
.address = (unsigned long) idt_table,
};
#ifdef CONFIG_X86_64
/* No need to be aligned, but done to keep all IDTs defined the same way. */
gate_desc debug_idt_table[IDT_ENTRIES] __page_aligned_bss;
void load_current_idt(void)
{
lockdep_assert_irqs_disabled();
load_idt(&idt_descr);
}
/*
* The exceptions which use Interrupt stacks. They are setup after
* cpu_init() when the TSS has been initialized.
*/
static const __initconst struct idt_data ist_idts[] = {
ISTG(X86_TRAP_DB, debug, IST_INDEX_DB),
ISTG(X86_TRAP_NMI, nmi, IST_INDEX_NMI),
ISTG(X86_TRAP_DF, double_fault, IST_INDEX_DF),
#ifdef CONFIG_X86_MCE
ISTG(X86_TRAP_MC, &machine_check, IST_INDEX_MCE),
#endif
};
/*
* Override for the debug_idt. Same as the default, but with interrupt
* stack set to DEFAULT_STACK (0). Required for NMI trap handling.
*/
const struct desc_ptr debug_idt_descr = {
.size = IDT_ENTRIES * 16 - 1,
.address = (unsigned long) debug_idt_table,
};
#ifdef CONFIG_X86_F00F_BUG
bool idt_is_f00f_address(unsigned long address)
{
return ((address - idt_descr.address) >> 3) == 6;
}
#endif
static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
@@ -214,7 +192,7 @@ static inline void idt_init_desc(gate_desc *gate, const struct idt_data *d)
#endif
}
static void
static __init void
idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sys)
{
gate_desc desc;
@@ -227,7 +205,7 @@ idt_setup_from_table(gate_desc *idt, const struct idt_data *t, int size, bool sy
}
}
static void set_intr_gate(unsigned int n, const void *addr)
static __init void set_intr_gate(unsigned int n, const void *addr)
{
struct idt_data data;
@@ -266,6 +244,27 @@ void __init idt_setup_traps(void)
}
#ifdef CONFIG_X86_64
/*
* Early traps running on the DEFAULT_STACK because the other interrupt
* stacks work only after cpu_init().
*/
static const __initconst struct idt_data early_pf_idts[] = {
INTG(X86_TRAP_PF, asm_exc_page_fault),
};
/*
* The exceptions which use Interrupt stacks. They are setup after
* cpu_init() when the TSS has been initialized.
*/
static const __initconst struct idt_data ist_idts[] = {
ISTG(X86_TRAP_DB, asm_exc_debug, IST_INDEX_DB),
ISTG(X86_TRAP_NMI, asm_exc_nmi, IST_INDEX_NMI),
ISTG(X86_TRAP_DF, asm_exc_double_fault, IST_INDEX_DF),
#ifdef CONFIG_X86_MCE
ISTG(X86_TRAP_MC, asm_exc_machine_check, IST_INDEX_MCE),
#endif
};
/**
* idt_setup_early_pf - Initialize the idt table with early pagefault handler
*
@@ -273,8 +272,10 @@ void __init idt_setup_traps(void)
* cpu_init() is invoked and sets up TSS. The IST variant is installed
* after that.
*
* FIXME: Why is 32bit and 64bit installing the PF handler at different
* places in the early setup code?
* Note, that X86_64 cannot install the real #PF handler in
* idt_setup_early_traps() because the memory intialization needs the #PF
* handler from the early_idt_handler_array to initialize the early page
* tables.
*/
void __init idt_setup_early_pf(void)
{
@@ -289,18 +290,21 @@ void __init idt_setup_ist_traps(void)
{
idt_setup_from_table(idt_table, ist_idts, ARRAY_SIZE(ist_idts), true);
}
/**
* idt_setup_debugidt_traps - Initialize the debug idt table with debug traps
*/
void __init idt_setup_debugidt_traps(void)
{
memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16);
idt_setup_from_table(debug_idt_table, dbg_idts, ARRAY_SIZE(dbg_idts), false);
}
#endif
static void __init idt_map_in_cea(void)
{
/*
* Set the IDT descriptor to a fixed read-only location in the cpu
* entry area, so that the "sidt" instruction will not leak the
* location of the kernel, and to defend the IDT against arbitrary
* memory write vulnerabilities.
*/
cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
PAGE_KERNEL_RO);
idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
}
/**
* idt_setup_apic_and_irq_gates - Setup APIC/SMP and normal interrupt gates
*/
@@ -318,11 +322,23 @@ void __init idt_setup_apic_and_irq_gates(void)
#ifdef CONFIG_X86_LOCAL_APIC
for_each_clear_bit_from(i, system_vectors, NR_VECTORS) {
set_bit(i, system_vectors);
/*
* Don't set the non assigned system vectors in the
* system_vectors bitmap. Otherwise they show up in
* /proc/interrupts.
*/
entry = spurious_entries_start + 8 * (i - FIRST_SYSTEM_VECTOR);
set_intr_gate(i, entry);
}
#endif
/* Map IDT into CPU entry area and reload it. */
idt_map_in_cea();
load_idt(&idt_descr);
/* Make the IDT table read only */
set_memory_ro((unsigned long)&idt_table, 1);
idt_setup_done = true;
}
/**
@@ -352,16 +368,14 @@ void idt_invalidate(void *addr)
load_idt(&idt);
}
void __init update_intr_gate(unsigned int n, const void *addr)
void __init alloc_intr_gate(unsigned int n, const void *addr)
{
if (WARN_ON_ONCE(!test_bit(n, system_vectors)))
if (WARN_ON(n < FIRST_SYSTEM_VECTOR))
return;
set_intr_gate(n, addr);
}
void alloc_intr_gate(unsigned int n, const void *addr)
{
BUG_ON(n < FIRST_SYSTEM_VECTOR);
if (!test_and_set_bit(n, system_vectors))
if (WARN_ON(idt_setup_done))
return;
if (!WARN_ON(test_and_set_bit(n, system_vectors)))
set_intr_gate(n, addr);
}

View File

@@ -33,15 +33,15 @@ void io_bitmap_share(struct task_struct *tsk)
set_tsk_thread_flag(tsk, TIF_IO_BITMAP);
}
static void task_update_io_bitmap(void)
static void task_update_io_bitmap(struct task_struct *tsk)
{
struct thread_struct *t = &current->thread;
struct thread_struct *t = &tsk->thread;
if (t->iopl_emul == 3 || t->io_bitmap) {
/* TSS update is handled on exit to user space */
set_thread_flag(TIF_IO_BITMAP);
set_tsk_thread_flag(tsk, TIF_IO_BITMAP);
} else {
clear_thread_flag(TIF_IO_BITMAP);
clear_tsk_thread_flag(tsk, TIF_IO_BITMAP);
/* Invalidate TSS */
preempt_disable();
tss_update_io_bitmap();
@@ -49,12 +49,12 @@ static void task_update_io_bitmap(void)
}
}
void io_bitmap_exit(void)
void io_bitmap_exit(struct task_struct *tsk)
{
struct io_bitmap *iobm = current->thread.io_bitmap;
struct io_bitmap *iobm = tsk->thread.io_bitmap;
current->thread.io_bitmap = NULL;
task_update_io_bitmap();
tsk->thread.io_bitmap = NULL;
task_update_io_bitmap(tsk);
if (iobm && refcount_dec_and_test(&iobm->refcnt))
kfree(iobm);
}
@@ -102,7 +102,7 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on)
if (!iobm)
return -ENOMEM;
refcount_set(&iobm->refcnt, 1);
io_bitmap_exit();
io_bitmap_exit(current);
}
/*
@@ -134,7 +134,7 @@ long ksys_ioperm(unsigned long from, unsigned long num, int turn_on)
}
/* All permissions dropped? */
if (max_long == UINT_MAX) {
io_bitmap_exit();
io_bitmap_exit(current);
return 0;
}
@@ -192,7 +192,7 @@ SYSCALL_DEFINE1(iopl, unsigned int, level)
}
t->iopl_emul = level;
task_update_io_bitmap();
task_update_io_bitmap(current);
return 0;
}

View File

@@ -13,12 +13,14 @@
#include <linux/export.h>
#include <linux/irq.h>
#include <asm/irq_stack.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/irq.h>
#include <asm/mce.h>
#include <asm/hw_irq.h>
#include <asm/desc.h>
#include <asm/traps.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/irq_vectors.h>
@@ -26,9 +28,6 @@
DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat);
EXPORT_PER_CPU_SYMBOL(irq_stat);
DEFINE_PER_CPU(struct pt_regs *, irq_regs);
EXPORT_PER_CPU_SYMBOL(irq_regs);
atomic_t irq_err_count;
/*
@@ -224,35 +223,35 @@ u64 arch_irq_stat(void)
return sum;
}
static __always_inline void handle_irq(struct irq_desc *desc,
struct pt_regs *regs)
{
if (IS_ENABLED(CONFIG_X86_64))
run_on_irqstack_cond(desc->handle_irq, desc, regs);
else
__handle_irq(desc, regs);
}
/*
* do_IRQ handles all normal device IRQ's (the special
* SMP cross-CPU interrupts have their own specific
* handlers).
* common_interrupt() handles all normal device IRQ's (the special SMP
* cross-CPU interrupts have their own entry points).
*/
__visible void __irq_entry do_IRQ(struct pt_regs *regs)
DEFINE_IDTENTRY_IRQ(common_interrupt)
{
struct pt_regs *old_regs = set_irq_regs(regs);
struct irq_desc * desc;
/* high bit used in ret_from_ code */
unsigned vector = ~regs->orig_ax;
struct irq_desc *desc;
entering_irq();
/* entering_irq() tells RCU that we're not quiescent. Check it. */
/* entry code tells RCU that we're not quiescent. Check it. */
RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");
desc = __this_cpu_read(vector_irq[vector]);
if (likely(!IS_ERR_OR_NULL(desc))) {
if (IS_ENABLED(CONFIG_X86_32))
handle_irq(desc, regs);
else
generic_handle_irq_desc(desc);
handle_irq(desc, regs);
} else {
ack_APIC_irq();
if (desc == VECTOR_UNUSED) {
pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n",
pr_emerg_ratelimited("%s: %d.%u No irq handler for vector\n",
__func__, smp_processor_id(),
vector);
} else {
@@ -260,8 +259,6 @@ __visible void __irq_entry do_IRQ(struct pt_regs *regs)
}
}
exiting_irq();
set_irq_regs(old_regs);
}
@@ -271,17 +268,16 @@ void (*x86_platform_ipi_callback)(void) = NULL;
/*
* Handler for X86_PLATFORM_IPI_VECTOR.
*/
__visible void __irq_entry smp_x86_platform_ipi(struct pt_regs *regs)
DEFINE_IDTENTRY_SYSVEC(sysvec_x86_platform_ipi)
{
struct pt_regs *old_regs = set_irq_regs(regs);
entering_ack_irq();
ack_APIC_irq();
trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR);
inc_irq_stat(x86_platform_ipis);
if (x86_platform_ipi_callback)
x86_platform_ipi_callback();
trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR);
exiting_irq();
set_irq_regs(old_regs);
}
#endif
@@ -302,41 +298,29 @@ EXPORT_SYMBOL_GPL(kvm_set_posted_intr_wakeup_handler);
/*
* Handler for POSTED_INTERRUPT_VECTOR.
*/
__visible void smp_kvm_posted_intr_ipi(struct pt_regs *regs)
DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_ipi)
{
struct pt_regs *old_regs = set_irq_regs(regs);
entering_ack_irq();
ack_APIC_irq();
inc_irq_stat(kvm_posted_intr_ipis);
exiting_irq();
set_irq_regs(old_regs);
}
/*
* Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
*/
__visible void smp_kvm_posted_intr_wakeup_ipi(struct pt_regs *regs)
DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_posted_intr_wakeup_ipi)
{
struct pt_regs *old_regs = set_irq_regs(regs);
entering_ack_irq();
ack_APIC_irq();
inc_irq_stat(kvm_posted_intr_wakeup_ipis);
kvm_posted_intr_wakeup_handler();
exiting_irq();
set_irq_regs(old_regs);
}
/*
* Handler for POSTED_INTERRUPT_NESTED_VECTOR.
*/
__visible void smp_kvm_posted_intr_nested_ipi(struct pt_regs *regs)
DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_kvm_posted_intr_nested_ipi)
{
struct pt_regs *old_regs = set_irq_regs(regs);
entering_ack_irq();
ack_APIC_irq();
inc_irq_stat(kvm_posted_intr_nested_ipis);
exiting_irq();
set_irq_regs(old_regs);
}
#endif

View File

@@ -148,7 +148,7 @@ void do_softirq_own_stack(void)
call_on_stack(__do_softirq, isp);
}
void handle_irq(struct irq_desc *desc, struct pt_regs *regs)
void __handle_irq(struct irq_desc *desc, struct pt_regs *regs)
{
int overflow = check_stack_overflow();

View File

@@ -20,6 +20,7 @@
#include <linux/sched/task_stack.h>
#include <asm/cpu_entry_area.h>
#include <asm/irq_stack.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
@@ -43,7 +44,7 @@ static int map_irq_stack(unsigned int cpu)
pages[i] = pfn_to_page(pa >> PAGE_SHIFT);
}
va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL);
va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, VM_MAP, PAGE_KERNEL);
if (!va)
return -ENOMEM;
@@ -70,3 +71,8 @@ int irq_init_percpu_irqstack(unsigned int cpu)
return 0;
return map_irq_stack(cpu);
}
void do_softirq_own_stack(void)
{
run_on_irqstack_cond(__do_softirq, NULL, NULL);
}

View File

@@ -9,18 +9,18 @@
#include <linux/irq_work.h>
#include <linux/hardirq.h>
#include <asm/apic.h>
#include <asm/idtentry.h>
#include <asm/trace/irq_vectors.h>
#include <linux/interrupt.h>
#ifdef CONFIG_X86_LOCAL_APIC
__visible void __irq_entry smp_irq_work_interrupt(struct pt_regs *regs)
DEFINE_IDTENTRY_SYSVEC(sysvec_irq_work)
{
ipi_entering_ack_irq();
ack_APIC_irq();
trace_irq_work_entry(IRQ_WORK_VECTOR);
inc_irq_stat(apic_irq_work_irqs);
irq_work_run();
trace_irq_work_exit(IRQ_WORK_VECTOR);
exiting_irq();
}
void arch_irq_work_raise(void)

View File

@@ -16,11 +16,11 @@
#include <linux/acpi.h>
#include <linux/io.h>
#include <linux/delay.h>
#include <linux/pgtable.h>
#include <linux/atomic.h>
#include <asm/timer.h>
#include <asm/hw_irq.h>
#include <asm/pgtable.h>
#include <asm/desc.h>
#include <asm/apic.h>
#include <asm/setup.h>

View File

@@ -39,8 +39,7 @@ static bool __read_mostly sched_itmt_capable;
unsigned int __read_mostly sysctl_sched_itmt_enabled;
static int sched_itmt_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned int old_sysctl;
int ret;

View File

@@ -41,11 +41,11 @@
#include <linux/kasan.h>
#include <linux/moduleloader.h>
#include <linux/vmalloc.h>
#include <linux/pgtable.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <linux/uaccess.h>
#include <asm/alternative.h>
#include <asm/insn.h>
@@ -1073,13 +1073,6 @@ NOKPROBE_SYMBOL(kprobe_fault_handler);
int __init arch_populate_kprobe_blacklist(void)
{
int ret;
ret = kprobe_add_area_blacklist((unsigned long)__irqentry_text_start,
(unsigned long)__irqentry_text_end);
if (ret)
return ret;
return kprobe_add_area_blacklist((unsigned long)__entry_text_start,
(unsigned long)__entry_text_end);
}

View File

@@ -16,11 +16,11 @@
#include <linux/kallsyms.h>
#include <linux/ftrace.h>
#include <linux/frame.h>
#include <linux/pgtable.h>
#include <asm/text-patching.h>
#include <asm/cacheflush.h>
#include <asm/desc.h>
#include <asm/pgtable.h>
#include <linux/uaccess.h>
#include <asm/alternative.h>
#include <asm/insn.h>
@@ -286,9 +286,7 @@ static int can_optimize(unsigned long paddr)
* stack handling and registers setup.
*/
if (((paddr >= (unsigned long)__entry_text_start) &&
(paddr < (unsigned long)__entry_text_end)) ||
((paddr >= (unsigned long)__irqentry_text_start) &&
(paddr < (unsigned long)__irqentry_text_end)))
(paddr < (unsigned long)__entry_text_end)))
return 0;
/* Check there is enough space for a relative jump. */

View File

@@ -35,6 +35,8 @@
#include <asm/tlb.h>
#include <asm/cpuidle_haltpoll.h>
DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled);
static int kvmapf = 1;
static int __init parse_no_kvmapf(char *arg)
@@ -73,7 +75,6 @@ struct kvm_task_sleep_node {
struct swait_queue_head wq;
u32 token;
int cpu;
bool halted;
};
static struct kvm_task_sleep_head {
@@ -96,77 +97,64 @@ static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
return NULL;
}
/*
* @interrupt_kernel: Is this called from a routine which interrupts the kernel
* (other than user space)?
*/
void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
static bool kvm_async_pf_queue_task(u32 token, struct kvm_task_sleep_node *n)
{
u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
struct kvm_task_sleep_node n, *e;
DECLARE_SWAITQUEUE(wait);
rcu_irq_enter();
struct kvm_task_sleep_node *e;
raw_spin_lock(&b->lock);
e = _find_apf_task(b, token);
if (e) {
/* dummy entry exist -> wake up was delivered ahead of PF */
hlist_del(&e->link);
kfree(e);
raw_spin_unlock(&b->lock);
rcu_irq_exit();
return;
kfree(e);
return false;
}
n.token = token;
n.cpu = smp_processor_id();
n.halted = is_idle_task(current) ||
(IS_ENABLED(CONFIG_PREEMPT_COUNT)
? preempt_count() > 1 || rcu_preempt_depth()
: interrupt_kernel);
init_swait_queue_head(&n.wq);
hlist_add_head(&n.link, &b->list);
n->token = token;
n->cpu = smp_processor_id();
init_swait_queue_head(&n->wq);
hlist_add_head(&n->link, &b->list);
raw_spin_unlock(&b->lock);
return true;
}
/*
* kvm_async_pf_task_wait_schedule - Wait for pagefault to be handled
* @token: Token to identify the sleep node entry
*
* Invoked from the async pagefault handling code or from the VM exit page
* fault handler. In both cases RCU is watching.
*/
void kvm_async_pf_task_wait_schedule(u32 token)
{
struct kvm_task_sleep_node n;
DECLARE_SWAITQUEUE(wait);
lockdep_assert_irqs_disabled();
if (!kvm_async_pf_queue_task(token, &n))
return;
for (;;) {
if (!n.halted)
prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
if (hlist_unhashed(&n.link))
break;
rcu_irq_exit();
if (!n.halted) {
local_irq_enable();
schedule();
local_irq_disable();
} else {
/*
* We cannot reschedule. So halt.
*/
native_safe_halt();
local_irq_disable();
}
rcu_irq_enter();
local_irq_enable();
schedule();
local_irq_disable();
}
if (!n.halted)
finish_swait(&n.wq, &wait);
rcu_irq_exit();
return;
finish_swait(&n.wq, &wait);
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait_schedule);
static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
hlist_del_init(&n->link);
if (n->halted)
smp_send_reschedule(n->cpu);
else if (swq_has_sleeper(&n->wq))
if (swq_has_sleeper(&n->wq))
swake_up_one(&n->wq);
}
@@ -175,12 +163,13 @@ static void apf_task_wake_all(void)
int i;
for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
struct hlist_node *p, *next;
struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
struct kvm_task_sleep_node *n;
struct hlist_node *p, *next;
raw_spin_lock(&b->lock);
hlist_for_each_safe(p, next, &b->list) {
struct kvm_task_sleep_node *n =
hlist_entry(p, typeof(*n), link);
n = hlist_entry(p, typeof(*n), link);
if (n->cpu == smp_processor_id())
apf_task_wake_one(n);
}
@@ -221,46 +210,64 @@ again:
n->cpu = smp_processor_id();
init_swait_queue_head(&n->wq);
hlist_add_head(&n->link, &b->list);
} else
} else {
apf_task_wake_one(n);
}
raw_spin_unlock(&b->lock);
return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
u32 kvm_read_and_reset_pf_reason(void)
noinstr u32 kvm_read_and_reset_apf_flags(void)
{
u32 reason = 0;
u32 flags = 0;
if (__this_cpu_read(apf_reason.enabled)) {
reason = __this_cpu_read(apf_reason.reason);
__this_cpu_write(apf_reason.reason, 0);
flags = __this_cpu_read(apf_reason.flags);
__this_cpu_write(apf_reason.flags, 0);
}
return reason;
return flags;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
EXPORT_SYMBOL_GPL(kvm_read_and_reset_apf_flags);
dotraplinkage void
do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
noinstr bool __kvm_handle_async_pf(struct pt_regs *regs, u32 token)
{
switch (kvm_read_and_reset_pf_reason()) {
default:
do_page_fault(regs, error_code, address);
break;
u32 reason = kvm_read_and_reset_apf_flags();
bool rcu_exit;
switch (reason) {
case KVM_PV_REASON_PAGE_NOT_PRESENT:
/* page is swapped out by the host. */
kvm_async_pf_task_wait((u32)address, !user_mode(regs));
break;
case KVM_PV_REASON_PAGE_READY:
rcu_irq_enter();
kvm_async_pf_task_wake((u32)address);
rcu_irq_exit();
break;
default:
return false;
}
rcu_exit = idtentry_enter_cond_rcu(regs);
instrumentation_begin();
/*
* If the host managed to inject an async #PF into an interrupt
* disabled region, then die hard as this is not going to end well
* and the host side is seriously broken.
*/
if (unlikely(!(regs->flags & X86_EFLAGS_IF)))
panic("Host injected async #PF in interrupt disabled region\n");
if (reason == KVM_PV_REASON_PAGE_NOT_PRESENT) {
if (unlikely(!(user_mode(regs))))
panic("Host injected async #PF in kernel mode\n");
/* Page is swapped out by the host. */
kvm_async_pf_task_wait_schedule(token);
} else {
kvm_async_pf_task_wake(token);
}
instrumentation_end();
idtentry_exit_cond_rcu(regs, rcu_exit);
return true;
}
NOKPROBE_SYMBOL(do_async_page_fault);
static void __init paravirt_ops_setup(void)
{
@@ -306,11 +313,11 @@ static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
static void kvm_guest_cpu_init(void)
{
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
u64 pa;
#ifdef CONFIG_PREEMPTION
pa |= KVM_ASYNC_PF_SEND_ALWAYS;
#endif
WARN_ON_ONCE(!static_branch_likely(&kvm_async_pf_enabled));
pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));
pa |= KVM_ASYNC_PF_ENABLED;
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
@@ -318,12 +325,12 @@ static void kvm_guest_cpu_init(void)
wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
__this_cpu_write(apf_reason.enabled, 1);
printk(KERN_INFO"KVM setup async PF for cpu %d\n",
smp_processor_id());
pr_info("KVM setup async PF for cpu %d\n", smp_processor_id());
}
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
unsigned long pa;
/* Size alignment is implied but just to make it explicit. */
BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
__this_cpu_write(kvm_apic_eoi, 0);
@@ -344,8 +351,7 @@ static void kvm_pv_disable_apf(void)
wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
__this_cpu_write(apf_reason.enabled, 0);
printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
smp_processor_id());
pr_info("Unregister pv shared memory for cpu %d\n", smp_processor_id());
}
static void kvm_pv_guest_cpu_reboot(void *unused)
@@ -592,12 +598,6 @@ static int kvm_cpu_down_prepare(unsigned int cpu)
}
#endif
static void __init kvm_apf_trap_init(void)
{
update_intr_gate(X86_TRAP_PF, async_page_fault);
}
static void kvm_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
@@ -632,8 +632,6 @@ static void __init kvm_guest_init(void)
register_reboot_notifier(&kvm_pv_reboot_nb);
for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
raw_spin_lock_init(&async_pf_sleepers[i].lock);
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
x86_init.irqs.trap_init = kvm_apf_trap_init;
if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
has_steal_clock = 1;
@@ -649,6 +647,9 @@ static void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
apic_set_eoi_write(kvm_guest_apic_eoi_write);
if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf)
static_branch_enable(&kvm_async_pf_enabled);
#ifdef CONFIG_SMP
smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;

View File

@@ -8,7 +8,7 @@
*
* Lock order:
* contex.ldt_usr_sem
* mmap_sem
* mmap_lock
* context.lock
*/

View File

@@ -1,53 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* livepatch.c - x86-specific Kernel Live Patching Core
*/
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/livepatch.h>
#include <asm/text-patching.h>
/* Apply per-object alternatives. Based on x86 module_finalize() */
void arch_klp_init_object_loaded(struct klp_patch *patch,
struct klp_object *obj)
{
int cnt;
struct klp_modinfo *info;
Elf_Shdr *s, *alt = NULL, *para = NULL;
void *aseg, *pseg;
const char *objname;
char sec_objname[MODULE_NAME_LEN];
char secname[KSYM_NAME_LEN];
info = patch->mod->klp_info;
objname = obj->name ? obj->name : "vmlinux";
/* See livepatch core code for BUILD_BUG_ON() explanation */
BUILD_BUG_ON(MODULE_NAME_LEN < 56 || KSYM_NAME_LEN != 128);
for (s = info->sechdrs; s < info->sechdrs + info->hdr.e_shnum; s++) {
/* Apply per-object .klp.arch sections */
cnt = sscanf(info->secstrings + s->sh_name,
".klp.arch.%55[^.].%127s",
sec_objname, secname);
if (cnt != 2)
continue;
if (strcmp(sec_objname, objname))
continue;
if (!strcmp(".altinstructions", secname))
alt = s;
if (!strcmp(".parainstructions", secname))
para = s;
}
if (alt) {
aseg = (void *) alt->sh_addr;
apply_alternatives(aseg, aseg + alt->sh_size);
}
if (para) {
pseg = (void *) para->sh_addr;
apply_paravirt(pseg, pseg + para->sh_size);
}
}

View File

@@ -13,7 +13,6 @@
#include <linux/gfp.h>
#include <linux/io.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>

View File

@@ -19,7 +19,6 @@
#include <linux/efi.h>
#include <asm/init.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/io_apic.h>

View File

@@ -18,10 +18,10 @@
#include <linux/gfp.h>
#include <linux/jump_label.h>
#include <linux/random.h>
#include <linux/memory.h>
#include <asm/text-patching.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/setup.h>
#include <asm/unwind.h>
@@ -126,11 +126,12 @@ int apply_relocate(Elf32_Shdr *sechdrs,
return 0;
}
#else /*X86_64*/
int apply_relocate_add(Elf64_Shdr *sechdrs,
static int __apply_relocate_add(Elf64_Shdr *sechdrs,
const char *strtab,
unsigned int symindex,
unsigned int relsec,
struct module *me)
struct module *me,
void *(*write)(void *dest, const void *src, size_t len))
{
unsigned int i;
Elf64_Rela *rel = (void *)sechdrs[relsec].sh_addr;
@@ -162,19 +163,19 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
case R_X86_64_64:
if (*(u64 *)loc != 0)
goto invalid_relocation;
*(u64 *)loc = val;
write(loc, &val, 8);
break;
case R_X86_64_32:
if (*(u32 *)loc != 0)
goto invalid_relocation;
*(u32 *)loc = val;
write(loc, &val, 4);
if (val != *(u32 *)loc)
goto overflow;
break;
case R_X86_64_32S:
if (*(s32 *)loc != 0)
goto invalid_relocation;
*(s32 *)loc = val;
write(loc, &val, 4);
if ((s64)val != *(s32 *)loc)
goto overflow;
break;
@@ -183,7 +184,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
if (*(u32 *)loc != 0)
goto invalid_relocation;
val -= (u64)loc;
*(u32 *)loc = val;
write(loc, &val, 4);
#if 0
if ((s64)val != *(s32 *)loc)
goto overflow;
@@ -193,7 +194,7 @@ int apply_relocate_add(Elf64_Shdr *sechdrs,
if (*(u64 *)loc != 0)
goto invalid_relocation;
val -= (u64)loc;
*(u64 *)loc = val;
write(loc, &val, 8);
break;
default:
pr_err("%s: Unknown rela relocation: %llu\n",
@@ -215,6 +216,33 @@ overflow:
me->name);
return -ENOEXEC;
}
int apply_relocate_add(Elf64_Shdr *sechdrs,
const char *strtab,
unsigned int symindex,
unsigned int relsec,
struct module *me)
{
int ret;
bool early = me->state == MODULE_STATE_UNFORMED;
void *(*write)(void *, const void *, size_t) = memcpy;
if (!early) {
write = text_poke;
mutex_lock(&text_mutex);
}
ret = __apply_relocate_add(sechdrs, strtab, symindex, relsec, me,
write);
if (!early) {
text_poke_sync();
mutex_unlock(&text_mutex);
}
return ret;
}
#endif
int module_finalize(const Elf_Ehdr *hdr,

View File

@@ -25,10 +25,6 @@
#include <linux/atomic.h>
#include <linux/sched/clock.h>
#if defined(CONFIG_EDAC)
#include <linux/edac.h>
#endif
#include <asm/cpu_entry_area.h>
#include <asm/traps.h>
#include <asm/mach_traps.h>
@@ -307,7 +303,7 @@ NOKPROBE_SYMBOL(unknown_nmi_error);
static DEFINE_PER_CPU(bool, swallow_nmi);
static DEFINE_PER_CPU(unsigned long, last_nmi_rip);
static void default_do_nmi(struct pt_regs *regs)
static noinstr void default_do_nmi(struct pt_regs *regs)
{
unsigned char reason = 0;
int handled;
@@ -333,6 +329,9 @@ static void default_do_nmi(struct pt_regs *regs)
__this_cpu_write(last_nmi_rip, regs->ip);
instrumentation_begin();
trace_hardirqs_off_finish();
handled = nmi_handle(NMI_LOCAL, regs);
__this_cpu_add(nmi_stats.normal, handled);
if (handled) {
@@ -346,7 +345,7 @@ static void default_do_nmi(struct pt_regs *regs)
*/
if (handled > 1)
__this_cpu_write(swallow_nmi, true);
return;
goto out;
}
/*
@@ -378,7 +377,7 @@ static void default_do_nmi(struct pt_regs *regs)
#endif
__this_cpu_add(nmi_stats.external, 1);
raw_spin_unlock(&nmi_reason_lock);
return;
goto out;
}
raw_spin_unlock(&nmi_reason_lock);
@@ -416,8 +415,12 @@ static void default_do_nmi(struct pt_regs *regs)
__this_cpu_add(nmi_stats.swallow, 1);
else
unknown_nmi_error(reason, regs);
out:
if (regs->flags & X86_EFLAGS_IF)
trace_hardirqs_on_prepare();
instrumentation_end();
}
NOKPROBE_SYMBOL(default_do_nmi);
/*
* NMIs can page fault or hit breakpoints which will cause it to lose
@@ -471,44 +474,9 @@ enum nmi_states {
};
static DEFINE_PER_CPU(enum nmi_states, nmi_state);
static DEFINE_PER_CPU(unsigned long, nmi_cr2);
static DEFINE_PER_CPU(unsigned long, nmi_dr7);
#ifdef CONFIG_X86_64
/*
* In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without
* some care, the inner breakpoint will clobber the outer breakpoint's
* stack.
*
* If a breakpoint is being processed, and the debug stack is being
* used, if an NMI comes in and also hits a breakpoint, the stack
* pointer will be set to the same fixed address as the breakpoint that
* was interrupted, causing that stack to be corrupted. To handle this
* case, check if the stack that was interrupted is the debug stack, and
* if so, change the IDT so that new breakpoints will use the current
* stack and not switch to the fixed address. On return of the NMI,
* switch back to the original IDT.
*/
static DEFINE_PER_CPU(int, update_debug_stack);
static bool notrace is_debug_stack(unsigned long addr)
{
struct cea_exception_stacks *cs = __this_cpu_read(cea_exception_stacks);
unsigned long top = CEA_ESTACK_TOP(cs, DB);
unsigned long bot = CEA_ESTACK_BOT(cs, DB1);
if (__this_cpu_read(debug_stack_usage))
return true;
/*
* Note, this covers the guard page between DB and DB1 as well to
* avoid two checks. But by all means @addr can never point into
* the guard page.
*/
return addr >= bot && addr < top;
}
NOKPROBE_SYMBOL(is_debug_stack);
#endif
dotraplinkage notrace void
do_nmi(struct pt_regs *regs, long error_code)
DEFINE_IDTENTRY_NMI(exc_nmi)
{
if (IS_ENABLED(CONFIG_SMP) && cpu_is_offline(smp_processor_id()))
return;
@@ -521,18 +489,7 @@ do_nmi(struct pt_regs *regs, long error_code)
this_cpu_write(nmi_cr2, read_cr2());
nmi_restart:
#ifdef CONFIG_X86_64
/*
* If we interrupted a breakpoint, it is possible that
* the nmi handler will have breakpoints too. We need to
* change the IDT such that breakpoints that happen here
* continue to use the NMI stack.
*/
if (unlikely(is_debug_stack(regs->sp))) {
debug_stack_set_zero();
this_cpu_write(update_debug_stack, 1);
}
#endif
this_cpu_write(nmi_dr7, local_db_save());
nmi_enter();
@@ -543,12 +500,7 @@ nmi_restart:
nmi_exit();
#ifdef CONFIG_X86_64
if (unlikely(this_cpu_read(update_debug_stack))) {
debug_stack_reset();
this_cpu_write(update_debug_stack, 0);
}
#endif
local_db_restore(this_cpu_read(nmi_dr7));
if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
write_cr2(this_cpu_read(nmi_cr2));
@@ -558,7 +510,6 @@ nmi_restart:
if (user_mode(regs))
mds_user_clear_cpu_buffers();
}
NOKPROBE_SYMBOL(do_nmi);
void stop_nmi(void)
{

View File

@@ -13,13 +13,13 @@
#include <linux/bcd.h>
#include <linux/highmem.h>
#include <linux/kprobes.h>
#include <linux/pgtable.h>
#include <asm/bug.h>
#include <asm/paravirt.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/setup.h>
#include <asm/pgtable.h>
#include <asm/time.h>
#include <asm/pgalloc.h>
#include <asm/irq.h>
@@ -160,25 +160,6 @@ unsigned paravirt_patch_insns(void *insn_buff, unsigned len,
return insn_len;
}
static void native_flush_tlb(void)
{
__native_flush_tlb();
}
/*
* Global pages have to be flushed a bit differently. Not a real
* performance problem because this does not happen often.
*/
static void native_flush_tlb_global(void)
{
__native_flush_tlb_global();
}
static void native_flush_tlb_one_user(unsigned long addr)
{
__native_flush_tlb_one_user(addr);
}
struct static_key paravirt_steal_enabled;
struct static_key paravirt_steal_rq_enabled;
@@ -359,7 +340,7 @@ struct paravirt_patch_template pv_ops = {
#endif /* CONFIG_PARAVIRT_XXL */
/* Mmu ops. */
.mmu.flush_tlb_user = native_flush_tlb,
.mmu.flush_tlb_user = native_flush_tlb_local,
.mmu.flush_tlb_kernel = native_flush_tlb_global,
.mmu.flush_tlb_one_user = native_flush_tlb_one_user,
.mmu.flush_tlb_others = native_flush_tlb_others,

View File

@@ -96,7 +96,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
}
/*
* Free current thread data structures etc..
* Free thread data structures etc..
*/
void exit_thread(struct task_struct *tsk)
{
@@ -104,7 +104,7 @@ void exit_thread(struct task_struct *tsk)
struct fpu *fpu = &t->fpu;
if (test_thread_flag(TIF_IO_BITMAP))
io_bitmap_exit();
io_bitmap_exit(tsk);
free_vm86(t);
@@ -191,7 +191,7 @@ void flush_thread(void)
flush_ptrace_hw_breakpoint(tsk);
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
fpu__clear(&tsk->thread.fpu);
fpu__clear_all(&tsk->thread.fpu);
}
void disable_TSC(void)
@@ -612,6 +612,17 @@ void speculation_ctrl_update_current(void)
preempt_enable();
}
static inline void cr4_toggle_bits_irqsoff(unsigned long mask)
{
unsigned long newval, cr4 = this_cpu_read(cpu_tlbstate.cr4);
newval = cr4 ^ mask;
if (newval != cr4) {
this_cpu_write(cpu_tlbstate.cr4, newval);
__write_cr4(newval);
}
}
void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
{
unsigned long tifp, tifn;

View File

@@ -39,7 +39,6 @@
#include <linux/kdebug.h>
#include <linux/syscalls.h>
#include <asm/pgtable.h>
#include <asm/ldt.h>
#include <asm/processor.h>
#include <asm/fpu/internal.h>
@@ -52,7 +51,7 @@
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/vm86.h>
#include <asm/resctrl_sched.h>
#include <asm/resctrl.h>
#include <asm/proto.h>
#include "process.h"

View File

@@ -40,7 +40,6 @@
#include <linux/ftrace.h>
#include <linux/syscalls.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/fpu/internal.h>
#include <asm/mmu_context.h>
@@ -52,7 +51,7 @@
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
#include <asm/resctrl_sched.h>
#include <asm/resctrl.h>
#include <asm/unistd.h>
#include <asm/fsgsbase.h>
#ifdef CONFIG_IA32_EMULATION

View File

@@ -28,7 +28,6 @@
#include <linux/nospec.h>
#include <linux/uaccess.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/fpu/internal.h>
#include <asm/fpu/signal.h>

View File

@@ -11,13 +11,13 @@
#include <linux/tboot.h>
#include <linux/delay.h>
#include <linux/frame.h>
#include <linux/pgtable.h>
#include <acpi/reboot.h>
#include <asm/io.h>
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/desc.h>
#include <asm/hpet.h>
#include <asm/pgtable.h>
#include <asm/proto.h>
#include <asm/reboot_fixups.h>
#include <asm/reboot.h>

View File

@@ -237,6 +237,9 @@ static u64 __init get_ramdisk_image(void)
ramdisk_image |= (u64)boot_params.ext_ramdisk_image << 32;
if (ramdisk_image == 0)
ramdisk_image = phys_initrd_start;
return ramdisk_image;
}
static u64 __init get_ramdisk_size(void)
@@ -245,6 +248,9 @@ static u64 __init get_ramdisk_size(void)
ramdisk_size |= (u64)boot_params.ext_ramdisk_size << 32;
if (ramdisk_size == 0)
ramdisk_size = phys_initrd_size;
return ramdisk_size;
}

View File

@@ -287,9 +287,9 @@ void __init setup_per_cpu_areas(void)
/*
* Sync back kernel address range again. We already did this in
* setup_arch(), but percpu data also needs to be available in
* the smpboot asm. We can't reliably pick up percpu mappings
* using vmalloc_fault(), because exception dispatch needs
* percpu data.
* the smpboot asm and arch_sync_kernel_mappings() doesn't sync to
* swapper_pg_dir on 32-bit. The per-cpu mappings need to be available
* there too.
*
* FIXME: Can the later sync in setup_cpu_entry_areas() replace
* this call?

View File

@@ -37,6 +37,7 @@
#include <asm/vm86.h>
#ifdef CONFIG_X86_64
#include <linux/compat.h>
#include <asm/proto.h>
#include <asm/ia32_unistd.h>
#endif /* CONFIG_X86_64 */
@@ -511,6 +512,31 @@ Efault:
}
#endif /* CONFIG_X86_32 */
#ifdef CONFIG_X86_X32_ABI
static int x32_copy_siginfo_to_user(struct compat_siginfo __user *to,
const struct kernel_siginfo *from)
{
struct compat_siginfo new;
copy_siginfo_to_external32(&new, from);
if (from->si_signo == SIGCHLD) {
new._sifields._sigchld_x32._utime = from->si_utime;
new._sifields._sigchld_x32._stime = from->si_stime;
}
if (copy_to_user(to, &new, sizeof(struct compat_siginfo)))
return -EFAULT;
return 0;
}
int copy_siginfo_to_user32(struct compat_siginfo __user *to,
const struct kernel_siginfo *from)
{
if (in_x32_syscall())
return x32_copy_siginfo_to_user(to, from);
return __copy_siginfo_to_user32(to, from);
}
#endif /* CONFIG_X86_X32_ABI */
static int x32_setup_rt_frame(struct ksignal *ksig,
compat_sigset_t *set,
struct pt_regs *regs)
@@ -543,7 +569,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
user_access_end();
if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true))
if (x32_copy_siginfo_to_user(&frame->info, &ksig->info))
return -EFAULT;
}
@@ -732,7 +758,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
/*
* Ensure the signal handler starts with the new fpu state.
*/
fpu__clear(fpu);
fpu__clear_user_states(fpu);
}
signal_setup_done(failed, ksig, stepping);
}

View File

@@ -27,6 +27,7 @@
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/apic.h>
#include <asm/idtentry.h>
#include <asm/nmi.h>
#include <asm/mce.h>
#include <asm/trace/irq_vectors.h>
@@ -130,13 +131,11 @@ static int smp_stop_nmi_callback(unsigned int val, struct pt_regs *regs)
/*
* this function calls the 'stop' function on all other CPUs in the system.
*/
asmlinkage __visible void smp_reboot_interrupt(void)
DEFINE_IDTENTRY_SYSVEC(sysvec_reboot)
{
ipi_entering_ack_irq();
ack_APIC_irq();
cpu_emergency_vmxoff();
stop_this_cpu(NULL);
irq_exit();
}
static int register_stop_handler(void)
@@ -221,47 +220,33 @@ static void native_stop_other_cpus(int wait)
/*
* Reschedule call back. KVM uses this interrupt to force a cpu out of
* guest mode
* guest mode.
*/
__visible void __irq_entry smp_reschedule_interrupt(struct pt_regs *regs)
DEFINE_IDTENTRY_SYSVEC_SIMPLE(sysvec_reschedule_ipi)
{
ack_APIC_irq();
trace_reschedule_entry(RESCHEDULE_VECTOR);
inc_irq_stat(irq_resched_count);
kvm_set_cpu_l1tf_flush_l1d();
if (trace_resched_ipi_enabled()) {
/*
* scheduler_ipi() might call irq_enter() as well, but
* nested calls are fine.
*/
irq_enter();
trace_reschedule_entry(RESCHEDULE_VECTOR);
scheduler_ipi();
trace_reschedule_exit(RESCHEDULE_VECTOR);
irq_exit();
return;
}
scheduler_ipi();
trace_reschedule_exit(RESCHEDULE_VECTOR);
}
__visible void __irq_entry smp_call_function_interrupt(struct pt_regs *regs)
DEFINE_IDTENTRY_SYSVEC(sysvec_call_function)
{
ipi_entering_ack_irq();
ack_APIC_irq();
trace_call_function_entry(CALL_FUNCTION_VECTOR);
inc_irq_stat(irq_call_count);
generic_smp_call_function_interrupt();
trace_call_function_exit(CALL_FUNCTION_VECTOR);
exiting_irq();
}
__visible void __irq_entry smp_call_function_single_interrupt(struct pt_regs *r)
DEFINE_IDTENTRY_SYSVEC(sysvec_call_function_single)
{
ipi_entering_ack_irq();
ack_APIC_irq();
trace_call_function_single_entry(CALL_FUNCTION_SINGLE_VECTOR);
inc_irq_stat(irq_call_count);
generic_smp_call_function_single_interrupt();
trace_call_function_single_exit(CALL_FUNCTION_SINGLE_VECTOR);
exiting_irq();
}
static int __init nonmi_ipi_setup(char *str)

View File

@@ -55,6 +55,7 @@
#include <linux/gfp.h>
#include <linux/cpuidle.h>
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -63,7 +64,6 @@
#include <asm/realmode.h>
#include <asm/cpu.h>
#include <asm/numa.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/mtrr.h>
#include <asm/mwait.h>
@@ -147,7 +147,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
*((volatile u32 *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = 0;
}
static void init_freq_invariance(void);
static void init_freq_invariance(bool secondary);
/*
* Report back to the Boot Processor during boot time or to the caller processor
@@ -185,7 +185,7 @@ static void smp_callin(void)
*/
set_cpu_sibling_map(raw_smp_processor_id());
init_freq_invariance();
init_freq_invariance(true);
/*
* Get our bogomips.
@@ -266,6 +266,14 @@ static void notrace start_secondary(void *unused)
wmb();
cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
/*
* Prevent tail call to cpu_startup_entry() because the stack protector
* guard has been changed a couple of function calls up, in
* boot_init_stack_canary() and must not be checked before tail calling
* another function.
*/
prevent_tail_call_optimization();
}
/**
@@ -1341,7 +1349,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
set_sched_topology(x86_topology);
set_cpu_sibling_map(0);
init_freq_invariance();
init_freq_invariance(false);
smp_sanity_check();
switch (apic_intr_mode) {
@@ -1376,12 +1384,12 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
speculative_store_bypass_ht_init();
}
void arch_enable_nonboot_cpus_begin(void)
void arch_thaw_secondary_cpus_begin(void)
{
set_mtrr_aps_delayed_init();
}
void arch_enable_nonboot_cpus_end(void)
void arch_thaw_secondary_cpus_end(void)
{
mtrr_aps_init();
}
@@ -1849,24 +1857,25 @@ static bool slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#define ICPU(model) \
{X86_VENDOR_INTEL, 6, model, X86_FEATURE_APERFMPERF, 0}
#define X86_MATCH(model) \
X86_MATCH_VENDOR_FAM_MODEL_FEATURE(INTEL, 6, \
INTEL_FAM6_##model, X86_FEATURE_APERFMPERF, NULL)
static const struct x86_cpu_id has_knl_turbo_ratio_limits[] = {
ICPU(INTEL_FAM6_XEON_PHI_KNL),
ICPU(INTEL_FAM6_XEON_PHI_KNM),
X86_MATCH(XEON_PHI_KNL),
X86_MATCH(XEON_PHI_KNM),
{}
};
static const struct x86_cpu_id has_skx_turbo_ratio_limits[] = {
ICPU(INTEL_FAM6_SKYLAKE_X),
X86_MATCH(SKYLAKE_X),
{}
};
static const struct x86_cpu_id has_glm_turbo_ratio_limits[] = {
ICPU(INTEL_FAM6_ATOM_GOLDMONT),
ICPU(INTEL_FAM6_ATOM_GOLDMONT_D),
ICPU(INTEL_FAM6_ATOM_GOLDMONT_PLUS),
X86_MATCH(ATOM_GOLDMONT),
X86_MATCH(ATOM_GOLDMONT_D),
X86_MATCH(ATOM_GOLDMONT_PLUS),
{}
};
@@ -1877,9 +1886,6 @@ static bool knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq,
int err, i;
u64 msr;
if (!x86_match_cpu(has_knl_turbo_ratio_limits))
return false;
err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
if (err)
return false;
@@ -1945,18 +1951,23 @@ static bool skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int size)
static bool core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq)
{
u64 msr;
int err;
err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq);
if (err)
return false;
err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, turbo_freq);
err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr);
if (err)
return false;
*base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
*turbo_freq = (*turbo_freq >> 24) & 0xFF; /* 4C turbo */
*base_freq = (*base_freq >> 8) & 0xFF; /* max P state */
*turbo_freq = (msr >> 24) & 0xFF; /* 4C turbo */
/* The CPU may have less than 4 cores */
if (!*turbo_freq)
*turbo_freq = msr & 0xFF; /* 1C turbo */
return true;
}
@@ -1972,7 +1983,8 @@ static bool intel_set_max_freq_ratio(void)
skx_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
goto out;
if (knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
if (x86_match_cpu(has_knl_turbo_ratio_limits) &&
knl_set_max_freq_ratio(&base_freq, &turbo_freq, 1))
goto out;
if (x86_match_cpu(has_skx_turbo_ratio_limits) &&
@@ -1985,13 +1997,22 @@ static bool intel_set_max_freq_ratio(void)
return false;
out:
/*
* Some hypervisors advertise X86_FEATURE_APERFMPERF
* but then fill all MSR's with zeroes.
*/
if (!base_freq) {
pr_debug("Couldn't determine cpu base frequency, necessary for scale-invariant accounting.\n");
return false;
}
arch_turbo_freq_ratio = div_u64(turbo_freq * SCHED_CAPACITY_SCALE,
base_freq);
arch_set_max_freq_ratio(turbo_disabled());
return true;
}
static void init_counter_refs(void *arg)
static void init_counter_refs(void)
{
u64 aperf, mperf;
@@ -2002,18 +2023,25 @@ static void init_counter_refs(void *arg)
this_cpu_write(arch_prev_mperf, mperf);
}
static void init_freq_invariance(void)
static void init_freq_invariance(bool secondary)
{
bool ret = false;
if (smp_processor_id() != 0 || !boot_cpu_has(X86_FEATURE_APERFMPERF))
if (!boot_cpu_has(X86_FEATURE_APERFMPERF))
return;
if (secondary) {
if (static_branch_likely(&arch_scale_freq_key)) {
init_counter_refs();
}
return;
}
if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
ret = intel_set_max_freq_ratio();
if (ret) {
on_each_cpu(init_counter_refs, NULL, 1);
init_counter_refs();
static_branch_enable(&arch_scale_freq_key);
} else {
pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");

View File

@@ -135,26 +135,30 @@ static int cp_stat64(struct stat64 __user *ubuf, struct kstat *stat)
typeof(ubuf->st_gid) gid = 0;
SET_UID(uid, from_kuid_munged(current_user_ns(), stat->uid));
SET_GID(gid, from_kgid_munged(current_user_ns(), stat->gid));
if (!access_ok(ubuf, sizeof(struct stat64)) ||
__put_user(huge_encode_dev(stat->dev), &ubuf->st_dev) ||
__put_user(stat->ino, &ubuf->__st_ino) ||
__put_user(stat->ino, &ubuf->st_ino) ||
__put_user(stat->mode, &ubuf->st_mode) ||
__put_user(stat->nlink, &ubuf->st_nlink) ||
__put_user(uid, &ubuf->st_uid) ||
__put_user(gid, &ubuf->st_gid) ||
__put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev) ||
__put_user(stat->size, &ubuf->st_size) ||
__put_user(stat->atime.tv_sec, &ubuf->st_atime) ||
__put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec) ||
__put_user(stat->mtime.tv_sec, &ubuf->st_mtime) ||
__put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
__put_user(stat->ctime.tv_sec, &ubuf->st_ctime) ||
__put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
__put_user(stat->blksize, &ubuf->st_blksize) ||
__put_user(stat->blocks, &ubuf->st_blocks))
if (!user_write_access_begin(ubuf, sizeof(struct stat64)))
return -EFAULT;
unsafe_put_user(huge_encode_dev(stat->dev), &ubuf->st_dev, Efault);
unsafe_put_user(stat->ino, &ubuf->__st_ino, Efault);
unsafe_put_user(stat->ino, &ubuf->st_ino, Efault);
unsafe_put_user(stat->mode, &ubuf->st_mode, Efault);
unsafe_put_user(stat->nlink, &ubuf->st_nlink, Efault);
unsafe_put_user(uid, &ubuf->st_uid, Efault);
unsafe_put_user(gid, &ubuf->st_gid, Efault);
unsafe_put_user(huge_encode_dev(stat->rdev), &ubuf->st_rdev, Efault);
unsafe_put_user(stat->size, &ubuf->st_size, Efault);
unsafe_put_user(stat->atime.tv_sec, &ubuf->st_atime, Efault);
unsafe_put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec, Efault);
unsafe_put_user(stat->mtime.tv_sec, &ubuf->st_mtime, Efault);
unsafe_put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec, Efault);
unsafe_put_user(stat->ctime.tv_sec, &ubuf->st_ctime, Efault);
unsafe_put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec, Efault);
unsafe_put_user(stat->blksize, &ubuf->st_blksize, Efault);
unsafe_put_user(stat->blocks, &ubuf->st_blocks, Efault);
user_access_end();
return 0;
Efault:
user_write_access_end();
return -EFAULT;
}
COMPAT_SYSCALL_DEFINE2(ia32_stat64, const char __user *, filename,

View File

@@ -23,7 +23,6 @@
#include <asm/realmode.h>
#include <asm/processor.h>
#include <asm/bootparam.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/swiotlb.h>
#include <asm/fixmap.h>
@@ -35,8 +34,7 @@
#include "../realmode/rm/wakeup.h"
/* Global pointer to shared data; NULL means no measured launch. */
struct tboot *tboot __read_mostly;
EXPORT_SYMBOL(tboot);
static struct tboot *tboot __read_mostly;
/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
#define AP_WAIT_TIMEOUT 1
@@ -46,6 +44,11 @@ EXPORT_SYMBOL(tboot);
static u8 tboot_uuid[16] __initdata = TBOOT_UUID;
bool tboot_enabled(void)
{
return tboot != NULL;
}
void __init tboot_probe(void)
{
/* Look for valid page-aligned address for shared page. */
@@ -90,7 +93,7 @@ static struct mm_struct tboot_mm = {
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
.mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
MMAP_LOCK_INITIALIZER(init_mm)
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
};

View File

@@ -103,6 +103,9 @@ static __init void x86_late_time_init(void)
*/
x86_init.irqs.intr_mode_init();
tsc_init();
if (static_cpu_has(X86_FEATURE_WAITPKG))
use_tpause_delay();
}
/*

View File

@@ -25,20 +25,3 @@ void trace_pagefault_unreg(void)
{
static_branch_dec(&trace_pagefault_key);
}
#ifdef CONFIG_SMP
DEFINE_STATIC_KEY_FALSE(trace_resched_ipi_key);
int trace_resched_ipi_reg(void)
{
static_branch_inc(&trace_resched_ipi_key);
return 0;
}
void trace_resched_ipi_unreg(void)
{
static_branch_dec(&trace_resched_ipi_key);
}
#endif

View File

@@ -37,10 +37,12 @@
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/io.h>
#include <linux/hardirq.h>
#include <linux/atomic.h>
#include <asm/stacktrace.h>
#include <asm/processor.h>
#include <asm/debugreg.h>
#include <linux/atomic.h>
#include <asm/text-patching.h>
#include <asm/ftrace.h>
#include <asm/traps.h>
@@ -82,78 +84,6 @@ static inline void cond_local_irq_disable(struct pt_regs *regs)
local_irq_disable();
}
/*
* In IST context, we explicitly disable preemption. This serves two
* purposes: it makes it much less likely that we would accidentally
* schedule in IST context and it will force a warning if we somehow
* manage to schedule by accident.
*/
void ist_enter(struct pt_regs *regs)
{
if (user_mode(regs)) {
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
} else {
/*
* We might have interrupted pretty much anything. In
* fact, if we're a machine check, we can even interrupt
* NMI processing. We don't want in_nmi() to return true,
* but we need to notify RCU.
*/
rcu_nmi_enter();
}
preempt_disable();
/* This code is a bit fragile. Test it. */
RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work");
}
NOKPROBE_SYMBOL(ist_enter);
void ist_exit(struct pt_regs *regs)
{
preempt_enable_no_resched();
if (!user_mode(regs))
rcu_nmi_exit();
}
/**
* ist_begin_non_atomic() - begin a non-atomic section in an IST exception
* @regs: regs passed to the IST exception handler
*
* IST exception handlers normally cannot schedule. As a special
* exception, if the exception interrupted userspace code (i.e.
* user_mode(regs) would return true) and the exception was not
* a double fault, it can be safe to schedule. ist_begin_non_atomic()
* begins a non-atomic section within an ist_enter()/ist_exit() region.
* Callers are responsible for enabling interrupts themselves inside
* the non-atomic section, and callers must call ist_end_non_atomic()
* before ist_exit().
*/
void ist_begin_non_atomic(struct pt_regs *regs)
{
BUG_ON(!user_mode(regs));
/*
* Sanity check: we need to be on the normal thread stack. This
* will catch asm bugs and any attempt to use ist_preempt_enable
* from double_fault.
*/
BUG_ON(!on_thread_stack());
preempt_enable_no_resched();
}
/**
* ist_end_non_atomic() - begin a non-atomic section in an IST exception
*
* Ends a non-atomic section started with ist_begin_non_atomic().
*/
void ist_end_non_atomic(void)
{
preempt_disable();
}
int is_valid_bugaddr(unsigned long addr)
{
unsigned short ud;
@@ -215,7 +145,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str,
* process no chance to handle the signal and notice the
* kernel fault information, so that won't result in polluting
* the information about previously queued, but not yet
* delivered, faults. See also do_general_protection below.
* delivered, faults. See also exc_general_protection below.
*/
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = trapnr;
@@ -271,31 +201,79 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
NOTIFY_STOP) {
cond_local_irq_enable(regs);
do_trap(trapnr, signr, str, regs, error_code, sicode, addr);
cond_local_irq_disable(regs);
}
}
#define IP ((void __user *)uprobe_get_trap_addr(regs))
#define DO_ERROR(trapnr, signr, sicode, addr, str, name) \
dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
{ \
do_error_trap(regs, error_code, str, trapnr, signr, sicode, addr); \
/*
* Posix requires to provide the address of the faulting instruction for
* SIGILL (#UD) and SIGFPE (#DE) in the si_addr member of siginfo_t.
*
* This address is usually regs->ip, but when an uprobe moved the code out
* of line then regs->ip points to the XOL code which would confuse
* anything which analyzes the fault address vs. the unmodified binary. If
* a trap happened in XOL code then uprobe maps regs->ip back to the
* original instruction address.
*/
static __always_inline void __user *error_get_trap_addr(struct pt_regs *regs)
{
return (void __user *)uprobe_get_trap_addr(regs);
}
DO_ERROR(X86_TRAP_DE, SIGFPE, FPE_INTDIV, IP, "divide error", divide_error)
DO_ERROR(X86_TRAP_OF, SIGSEGV, 0, NULL, "overflow", overflow)
DO_ERROR(X86_TRAP_UD, SIGILL, ILL_ILLOPN, IP, "invalid opcode", invalid_op)
DO_ERROR(X86_TRAP_OLD_MF, SIGFPE, 0, NULL, "coprocessor segment overrun", coprocessor_segment_overrun)
DO_ERROR(X86_TRAP_TS, SIGSEGV, 0, NULL, "invalid TSS", invalid_TSS)
DO_ERROR(X86_TRAP_NP, SIGBUS, 0, NULL, "segment not present", segment_not_present)
DO_ERROR(X86_TRAP_SS, SIGBUS, 0, NULL, "stack segment", stack_segment)
#undef IP
DEFINE_IDTENTRY(exc_divide_error)
{
do_error_trap(regs, 0, "divide_error", X86_TRAP_DE, SIGFPE,
FPE_INTDIV, error_get_trap_addr(regs));
}
dotraplinkage void do_alignment_check(struct pt_regs *regs, long error_code)
DEFINE_IDTENTRY(exc_overflow)
{
do_error_trap(regs, 0, "overflow", X86_TRAP_OF, SIGSEGV, 0, NULL);
}
#ifdef CONFIG_X86_F00F_BUG
void handle_invalid_op(struct pt_regs *regs)
#else
static inline void handle_invalid_op(struct pt_regs *regs)
#endif
{
do_error_trap(regs, 0, "invalid opcode", X86_TRAP_UD, SIGILL,
ILL_ILLOPN, error_get_trap_addr(regs));
}
DEFINE_IDTENTRY(exc_invalid_op)
{
handle_invalid_op(regs);
}
DEFINE_IDTENTRY(exc_coproc_segment_overrun)
{
do_error_trap(regs, 0, "coprocessor segment overrun",
X86_TRAP_OLD_MF, SIGFPE, 0, NULL);
}
DEFINE_IDTENTRY_ERRORCODE(exc_invalid_tss)
{
do_error_trap(regs, error_code, "invalid TSS", X86_TRAP_TS, SIGSEGV,
0, NULL);
}
DEFINE_IDTENTRY_ERRORCODE(exc_segment_not_present)
{
do_error_trap(regs, error_code, "segment not present", X86_TRAP_NP,
SIGBUS, 0, NULL);
}
DEFINE_IDTENTRY_ERRORCODE(exc_stack_segment)
{
do_error_trap(regs, error_code, "stack segment", X86_TRAP_SS, SIGBUS,
0, NULL);
}
DEFINE_IDTENTRY_ERRORCODE(exc_alignment_check)
{
char *str = "alignment check";
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
if (notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_AC, SIGBUS) == NOTIFY_STOP)
return;
@@ -326,7 +304,6 @@ __visible void __noreturn handle_stack_overflow(const char *message,
}
#endif
#if defined(CONFIG_X86_64) || defined(CONFIG_DOUBLEFAULT)
/*
* Runs on an IST stack for x86_64 and on a special task stack for x86_32.
*
@@ -342,12 +319,19 @@ __visible void __noreturn handle_stack_overflow(const char *message,
* from the TSS. Returning is, in principle, okay, but changes to regs will
* be lost. If, for some reason, we need to return to a context with modified
* regs, the shim code could be adjusted to synchronize the registers.
*
* The 32bit #DF shim provides CR2 already as an argument. On 64bit it needs
* to be read before doing anything else.
*/
dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsigned long cr2)
DEFINE_IDTENTRY_DF(exc_double_fault)
{
static const char str[] = "double fault";
struct task_struct *tsk = current;
#ifdef CONFIG_VMAP_STACK
unsigned long address = read_cr2();
#endif
#ifdef CONFIG_X86_ESPFIX64
extern unsigned char native_irq_return_iret[];
@@ -363,13 +347,14 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign
* The net result is that our #GP handler will think that we
* entered from usermode with the bad user context.
*
* No need for ist_enter here because we don't use RCU.
* No need for nmi_enter() here because we don't use RCU.
*/
if (((long)regs->sp >> P4D_SHIFT) == ESPFIX_PGD_ENTRY &&
regs->cs == __KERNEL_CS &&
regs->ip == (unsigned long)native_irq_return_iret)
{
struct pt_regs *gpregs = (struct pt_regs *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
unsigned long *p = (unsigned long *)regs->sp;
/*
* regs->sp points to the failing IRET frame on the
@@ -377,7 +362,11 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign
* in gpregs->ss through gpregs->ip.
*
*/
memmove(&gpregs->ip, (void *)regs->sp, 5*8);
gpregs->ip = p[0];
gpregs->cs = p[1];
gpregs->flags = p[2];
gpregs->sp = p[3];
gpregs->ss = p[4];
gpregs->orig_ax = 0; /* Missing (lost) #GP error code */
/*
@@ -391,14 +380,15 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign
* which is what the stub expects, given that the faulting
* RIP will be the IRET instruction.
*/
regs->ip = (unsigned long)general_protection;
regs->ip = (unsigned long)asm_exc_general_protection;
regs->sp = (unsigned long)&gpregs->orig_ax;
return;
}
#endif
ist_enter(regs);
nmi_enter();
instrumentation_begin();
notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
tsk->thread.error_code = error_code;
@@ -442,28 +432,31 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code, unsign
* stack even if the actual trigger for the double fault was
* something else.
*/
if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE)
handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2);
if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) {
handle_stack_overflow("kernel stack overflow (double-fault)",
regs, address);
}
#endif
pr_emerg("PANIC: double fault, error_code: 0x%lx\n", error_code);
die("double fault", regs, error_code);
panic("Machine halted.");
instrumentation_end();
}
#endif
dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
DEFINE_IDTENTRY(exc_bounds)
{
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
if (notify_die(DIE_TRAP, "bounds", regs, error_code,
if (notify_die(DIE_TRAP, "bounds", regs, 0,
X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
return;
cond_local_irq_enable(regs);
if (!user_mode(regs))
die("bounds", regs, error_code);
die("bounds", regs, 0);
do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, 0, NULL);
do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, 0, 0, NULL);
cond_local_irq_disable(regs);
}
enum kernel_gp_hint {
@@ -510,7 +503,7 @@ static enum kernel_gp_hint get_kernel_gp_address(struct pt_regs *regs,
#define GPFSTR "general protection fault"
dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code)
DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
{
char desc[sizeof(GPFSTR) + 50 + 2*sizeof(unsigned long) + 1] = GPFSTR;
enum kernel_gp_hint hint = GP_NO_HINT;
@@ -518,17 +511,17 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code)
unsigned long gp_addr;
int ret;
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
cond_local_irq_enable(regs);
if (static_cpu_has(X86_FEATURE_UMIP)) {
if (user_mode(regs) && fixup_umip_exception(regs))
return;
goto exit;
}
if (v8086_mode(regs)) {
local_irq_enable();
handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
local_irq_disable();
return;
}
@@ -540,12 +533,11 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code)
show_signal(tsk, SIGSEGV, "", desc, regs, error_code);
force_sig(SIGSEGV);
return;
goto exit;
}
if (fixup_exception(regs, X86_TRAP_GP, error_code, 0))
return;
goto exit;
tsk->thread.error_code = error_code;
tsk->thread.trap_nr = X86_TRAP_GP;
@@ -557,11 +549,11 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code)
if (!preemptible() &&
kprobe_running() &&
kprobe_fault_handler(regs, X86_TRAP_GP))
return;
goto exit;
ret = notify_die(DIE_GPF, desc, regs, error_code, X86_TRAP_GP, SIGSEGV);
if (ret == NOTIFY_STOP)
return;
goto exit;
if (error_code)
snprintf(desc, sizeof(desc), "segment-related " GPFSTR);
@@ -583,55 +575,74 @@ dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code)
die_addr(desc, regs, error_code, gp_addr);
exit:
cond_local_irq_disable(regs);
}
NOKPROBE_SYMBOL(do_general_protection);
dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
static bool do_int3(struct pt_regs *regs)
{
if (poke_int3_handler(regs))
return;
/*
* Unlike any other non-IST entry, we can be called from a kprobe in
* non-CONTEXT_KERNEL kernel mode or even during context tracking
* state changes. Make sure that we wake up RCU even if we're coming
* from kernel code.
*
* This means that we can't schedule even if we came from a
* preemptible kernel context. That's okay.
*/
if (!user_mode(regs)) {
rcu_nmi_enter();
preempt_disable();
}
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
int res;
#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
goto exit;
if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
return true;
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
#ifdef CONFIG_KPROBES
if (kprobe_int3_handler(regs))
goto exit;
return true;
#endif
res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP);
if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
SIGTRAP) == NOTIFY_STOP)
goto exit;
return res == NOTIFY_STOP;
}
static void do_int3_user(struct pt_regs *regs)
{
if (do_int3(regs))
return;
cond_local_irq_enable(regs);
do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, 0, NULL);
do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, 0, 0, NULL);
cond_local_irq_disable(regs);
}
exit:
if (!user_mode(regs)) {
preempt_enable_no_resched();
rcu_nmi_exit();
DEFINE_IDTENTRY_RAW(exc_int3)
{
/*
* poke_int3_handler() is completely self contained code; it does (and
* must) *NOT* call out to anything, lest it hits upon yet another
* INT3.
*/
if (poke_int3_handler(regs))
return;
/*
* idtentry_enter_user() uses static_branch_{,un}likely() and therefore
* can trigger INT3, hence poke_int3_handler() must be done
* before. If the entry came from kernel mode, then use nmi_enter()
* because the INT3 could have been hit in any context including
* NMI.
*/
if (user_mode(regs)) {
idtentry_enter_user(regs);
instrumentation_begin();
do_int3_user(regs);
instrumentation_end();
idtentry_exit_user(regs);
} else {
nmi_enter();
instrumentation_begin();
trace_hardirqs_off_finish();
if (!do_int3(regs))
die("int3", regs, 0);
if (regs->flags & X86_EFLAGS_IF)
trace_hardirqs_on_prepare();
instrumentation_end();
nmi_exit();
}
}
NOKPROBE_SYMBOL(do_int3);
#ifdef CONFIG_X86_64
/*
@@ -639,21 +650,20 @@ NOKPROBE_SYMBOL(do_int3);
* to switch to the normal thread stack if the interrupted code was in
* user mode. The actual stack switch is done in entry_64.S
*/
asmlinkage __visible notrace struct pt_regs *sync_regs(struct pt_regs *eregs)
asmlinkage __visible noinstr struct pt_regs *sync_regs(struct pt_regs *eregs)
{
struct pt_regs *regs = (struct pt_regs *)this_cpu_read(cpu_current_top_of_stack) - 1;
if (regs != eregs)
*regs = *eregs;
return regs;
}
NOKPROBE_SYMBOL(sync_regs);
struct bad_iret_stack {
void *error_entry_ret;
struct pt_regs regs;
};
asmlinkage __visible notrace
asmlinkage __visible noinstr
struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
{
/*
@@ -664,19 +674,21 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
* just below the IRET frame) and we want to pretend that the
* exception came from the IRET target.
*/
struct bad_iret_stack *new_stack =
(struct bad_iret_stack *)this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
struct bad_iret_stack tmp, *new_stack =
(struct bad_iret_stack *)__this_cpu_read(cpu_tss_rw.x86_tss.sp0) - 1;
/* Copy the IRET target to the new stack. */
memmove(&new_stack->regs.ip, (void *)s->regs.sp, 5*8);
/* Copy the IRET target to the temporary storage. */
memcpy(&tmp.regs.ip, (void *)s->regs.sp, 5*8);
/* Copy the remainder of the stack from the current stack. */
memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
memcpy(&tmp, s, offsetof(struct bad_iret_stack, regs.ip));
/* Update the entry stack */
memcpy(new_stack, &tmp, sizeof(tmp));
BUG_ON(!user_mode(&new_stack->regs));
return new_stack;
}
NOKPROBE_SYMBOL(fixup_bad_iret);
#endif
static bool is_sysenter_singlestep(struct pt_regs *regs)
@@ -702,6 +714,43 @@ static bool is_sysenter_singlestep(struct pt_regs *regs)
#endif
}
static __always_inline void debug_enter(unsigned long *dr6, unsigned long *dr7)
{
/*
* Disable breakpoints during exception handling; recursive exceptions
* are exceedingly 'fun'.
*
* Since this function is NOKPROBE, and that also applies to
* HW_BREAKPOINT_X, we can't hit a breakpoint before this (XXX except a
* HW_BREAKPOINT_W on our stack)
*
* Entry text is excluded for HW_BP_X and cpu_entry_area, which
* includes the entry stack is excluded for everything.
*/
*dr7 = local_db_save();
/*
* The Intel SDM says:
*
* Certain debug exceptions may clear bits 0-3. The remaining
* contents of the DR6 register are never cleared by the
* processor. To avoid confusion in identifying debug
* exceptions, debug handlers should clear the register before
* returning to the interrupted task.
*
* Keep it simple: clear DR6 immediately.
*/
get_debugreg(*dr6, 6);
set_debugreg(0, 6);
/* Filter out all the reserved bits which are preset to 1 */
*dr6 &= ~DR6_RESERVED;
}
static __always_inline void debug_exit(unsigned long dr7)
{
local_db_restore(dr7);
}
/*
* Our handling of the processor debug registers is non-trivial.
* We do not clear them on entry and exit from the kernel. Therefore
@@ -726,86 +775,54 @@ static bool is_sysenter_singlestep(struct pt_regs *regs)
*
* May run on IST stack.
*/
dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
static void handle_debug(struct pt_regs *regs, unsigned long dr6, bool user)
{
struct task_struct *tsk = current;
int user_icebp = 0;
unsigned long dr6;
bool user_icebp;
int si_code;
ist_enter(regs);
get_debugreg(dr6, 6);
/*
* The Intel SDM says:
*
* Certain debug exceptions may clear bits 0-3. The remaining
* contents of the DR6 register are never cleared by the
* processor. To avoid confusion in identifying debug
* exceptions, debug handlers should clear the register before
* returning to the interrupted task.
*
* Keep it simple: clear DR6 immediately.
*/
set_debugreg(0, 6);
/* Filter out all the reserved bits which are preset to 1 */
dr6 &= ~DR6_RESERVED;
/*
* The SDM says "The processor clears the BTF flag when it
* generates a debug exception." Clear TIF_BLOCKSTEP to keep
* TIF_BLOCKSTEP in sync with the hardware BTF flag.
*/
clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
clear_thread_flag(TIF_BLOCKSTEP);
if (unlikely(!user_mode(regs) && (dr6 & DR_STEP) &&
is_sysenter_singlestep(regs))) {
dr6 &= ~DR_STEP;
if (!dr6)
goto exit;
/*
* else we might have gotten a single-step trap and hit a
* watchpoint at the same time, in which case we should fall
* through and handle the watchpoint.
*/
}
/*
* If DR6 is zero, no point in trying to handle it. The kernel is
* not using INT1.
*/
if (!user && !dr6)
return;
/*
* If dr6 has no reason to give us about the origin of this trap,
* then it's very likely the result of an icebp/int01 trap.
* User wants a sigtrap for that.
*/
if (!dr6 && user_mode(regs))
user_icebp = 1;
user_icebp = user && !dr6;
/* Store the virtualized DR6 value */
tsk->thread.debugreg6 = dr6;
#ifdef CONFIG_KPROBES
if (kprobe_debug_handler(regs))
goto exit;
if (kprobe_debug_handler(regs)) {
return;
}
#endif
if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, error_code,
SIGTRAP) == NOTIFY_STOP)
goto exit;
/*
* Let others (NMI) know that the debug stack is in use
* as we may switch to the interrupt stack.
*/
debug_stack_usage_inc();
if (notify_die(DIE_DEBUG, "debug", regs, (long)&dr6, 0,
SIGTRAP) == NOTIFY_STOP) {
return;
}
/* It's safe to allow irq's after DR6 has been saved */
cond_local_irq_enable(regs);
if (v8086_mode(regs)) {
handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
X86_TRAP_DB);
cond_local_irq_disable(regs);
debug_stack_usage_dec();
goto exit;
handle_vm86_trap((struct kernel_vm86_regs *) regs, 0,
X86_TRAP_DB);
goto out;
}
if (WARN_ON_ONCE((dr6 & DR_STEP) && !user_mode(regs))) {
@@ -819,23 +836,91 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
regs->flags &= ~X86_EFLAGS_TF;
}
si_code = get_si_code(tsk->thread.debugreg6);
if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
send_sigtrap(regs, error_code, si_code);
cond_local_irq_disable(regs);
debug_stack_usage_dec();
send_sigtrap(regs, 0, si_code);
exit:
ist_exit(regs);
out:
cond_local_irq_disable(regs);
}
NOKPROBE_SYMBOL(do_debug);
static __always_inline void exc_debug_kernel(struct pt_regs *regs,
unsigned long dr6)
{
nmi_enter();
instrumentation_begin();
trace_hardirqs_off_finish();
/*
* Catch SYSENTER with TF set and clear DR_STEP. If this hit a
* watchpoint at the same time then that will still be handled.
*/
if ((dr6 & DR_STEP) && is_sysenter_singlestep(regs))
dr6 &= ~DR_STEP;
handle_debug(regs, dr6, false);
if (regs->flags & X86_EFLAGS_IF)
trace_hardirqs_on_prepare();
instrumentation_end();
nmi_exit();
}
static __always_inline void exc_debug_user(struct pt_regs *regs,
unsigned long dr6)
{
idtentry_enter_user(regs);
instrumentation_begin();
handle_debug(regs, dr6, true);
instrumentation_end();
idtentry_exit_user(regs);
}
#ifdef CONFIG_X86_64
/* IST stack entry */
DEFINE_IDTENTRY_DEBUG(exc_debug)
{
unsigned long dr6, dr7;
debug_enter(&dr6, &dr7);
exc_debug_kernel(regs, dr6);
debug_exit(dr7);
}
/* User entry, runs on regular task stack */
DEFINE_IDTENTRY_DEBUG_USER(exc_debug)
{
unsigned long dr6, dr7;
debug_enter(&dr6, &dr7);
exc_debug_user(regs, dr6);
debug_exit(dr7);
}
#else
/* 32 bit does not have separate entry points. */
DEFINE_IDTENTRY_DEBUG(exc_debug)
{
unsigned long dr6, dr7;
debug_enter(&dr6, &dr7);
if (user_mode(regs))
exc_debug_user(regs, dr6);
else
exc_debug_kernel(regs, dr6);
debug_exit(dr7);
}
#endif
/*
* Note that we play around with the 'TS' bit in an attempt to get
* the correct behaviour even in the presence of the asynchronous
* IRQ13 behaviour
*/
static void math_error(struct pt_regs *regs, int error_code, int trapnr)
static void math_error(struct pt_regs *regs, int trapnr)
{
struct task_struct *task = current;
struct fpu *fpu = &task->thread.fpu;
@@ -846,16 +931,16 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
cond_local_irq_enable(regs);
if (!user_mode(regs)) {
if (fixup_exception(regs, trapnr, error_code, 0))
return;
if (fixup_exception(regs, trapnr, 0, 0))
goto exit;
task->thread.error_code = error_code;
task->thread.error_code = 0;
task->thread.trap_nr = trapnr;
if (notify_die(DIE_TRAP, str, regs, error_code,
trapnr, SIGFPE) != NOTIFY_STOP)
die(str, regs, error_code);
return;
if (notify_die(DIE_TRAP, str, regs, 0, trapnr,
SIGFPE) != NOTIFY_STOP)
die(str, regs, 0);
goto exit;
}
/*
@@ -864,32 +949,37 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
fpu__save(fpu);
task->thread.trap_nr = trapnr;
task->thread.error_code = error_code;
task->thread.error_code = 0;
si_code = fpu__exception_code(fpu, trapnr);
/* Retry when we get spurious exceptions: */
if (!si_code)
return;
goto exit;
force_sig_fault(SIGFPE, si_code,
(void __user *)uprobe_get_trap_addr(regs));
exit:
cond_local_irq_disable(regs);
}
dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
DEFINE_IDTENTRY(exc_coprocessor_error)
{
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
math_error(regs, error_code, X86_TRAP_MF);
math_error(regs, X86_TRAP_MF);
}
dotraplinkage void
do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
DEFINE_IDTENTRY(exc_simd_coprocessor_error)
{
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
math_error(regs, error_code, X86_TRAP_XF);
if (IS_ENABLED(CONFIG_X86_INVD_BUG)) {
/* AMD 486 bug: INVD in CPL 0 raises #XF instead of #GP */
if (!static_cpu_has(X86_FEATURE_XMM)) {
__exc_general_protection(regs, 0);
return;
}
}
math_error(regs, X86_TRAP_XF);
}
dotraplinkage void
do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
DEFINE_IDTENTRY(exc_spurious_interrupt_bug)
{
/*
* This addresses a Pentium Pro Erratum:
@@ -912,13 +1002,10 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
*/
}
dotraplinkage void
do_device_not_available(struct pt_regs *regs, long error_code)
DEFINE_IDTENTRY(exc_device_not_available)
{
unsigned long cr0 = read_cr0();
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
#ifdef CONFIG_MATH_EMULATION
if (!boot_cpu_has(X86_FEATURE_FPU) && (cr0 & X86_CR0_EM)) {
struct math_emu_info info = { };
@@ -927,6 +1014,8 @@ do_device_not_available(struct pt_regs *regs, long error_code)
info.regs = regs;
math_emulate(&info);
cond_local_irq_disable(regs);
return;
}
#endif
@@ -941,22 +1030,20 @@ do_device_not_available(struct pt_regs *regs, long error_code)
* to kill the task than getting stuck in a never-ending
* loop of #NM faults.
*/
die("unexpected #NM exception", regs, error_code);
die("unexpected #NM exception", regs, 0);
}
}
NOKPROBE_SYMBOL(do_device_not_available);
#ifdef CONFIG_X86_32
dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
DEFINE_IDTENTRY_SW(iret_error)
{
RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
local_irq_enable();
if (notify_die(DIE_TRAP, "iret exception", regs, error_code,
if (notify_die(DIE_TRAP, "iret exception", regs, 0,
X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) {
do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, 0,
ILL_BADSTK, (void __user *)NULL);
}
local_irq_disable();
}
#endif
@@ -967,23 +1054,10 @@ void __init trap_init(void)
idt_setup_traps();
/*
* Set the IDT descriptor to a fixed read-only location, so that the
* "sidt" instruction will not leak the location of the kernel, and
* to defend the IDT against arbitrary memory write vulnerabilities.
* It will be reloaded in cpu_init() */
cea_set_pte(CPU_ENTRY_AREA_RO_IDT_VADDR, __pa_symbol(idt_table),
PAGE_KERNEL_RO);
idt_descr.address = CPU_ENTRY_AREA_RO_IDT;
/*
* Should be a barrier for any external CPU state:
*/
cpu_init();
idt_setup_ist_traps();
x86_init.irqs.trap_init();
idt_setup_debugidt_traps();
}

View File

@@ -41,6 +41,7 @@ EXPORT_SYMBOL(tsc_khz);
* TSC can be unstable due to cpufreq or due to unsynced TSCs
*/
static int __read_mostly tsc_unstable;
static unsigned int __initdata tsc_early_khz;
static DEFINE_STATIC_KEY_FALSE(__use_tsc);
@@ -59,6 +60,12 @@ struct cyc2ns {
static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
static int __init tsc_early_khz_setup(char *buf)
{
return kstrtouint(buf, 0, &tsc_early_khz);
}
early_param("tsc_early_khz", tsc_early_khz_setup);
__always_inline void cyc2ns_read_begin(struct cyc2ns_data *data)
{
int seq, idx;
@@ -1412,7 +1419,10 @@ static bool __init determine_cpu_tsc_frequencies(bool early)
if (early) {
cpu_khz = x86_platform.calibrate_cpu();
tsc_khz = x86_platform.calibrate_tsc();
if (tsc_early_khz)
tsc_khz = tsc_early_khz;
else
tsc_khz = x86_platform.calibrate_tsc();
} else {
/* We should not be here with non-native cpu calibration */
WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu);

View File

@@ -81,7 +81,7 @@
#define UMIP_INST_SLDT 3 /* 0F 00 /0 */
#define UMIP_INST_STR 4 /* 0F 00 /1 */
const char * const umip_insns[5] = {
static const char * const umip_insns[5] = {
[UMIP_INST_SGDT] = "SGDT",
[UMIP_INST_SIDT] = "SIDT",
[UMIP_INST_SMSW] = "SMSW",

View File

@@ -74,13 +74,7 @@ static bool in_entry_code(unsigned long ip)
{
char *addr = (char *)ip;
if (addr >= __entry_text_start && addr < __entry_text_end)
return true;
if (addr >= __irqentry_text_start && addr < __irqentry_text_end)
return true;
return false;
return addr >= __entry_text_start && addr < __entry_text_end;
}
static inline unsigned long *last_frame(struct unwind_state *state)
@@ -344,6 +338,9 @@ bad_address:
if (IS_ENABLED(CONFIG_X86_32))
goto the_end;
if (state->task != current)
goto the_end;
if (state->regs) {
printk_deferred_once(KERN_WARNING
"WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n",

View File

@@ -8,19 +8,21 @@
#include <asm/orc_lookup.h>
#define orc_warn(fmt, ...) \
printk_deferred_once(KERN_WARNING pr_fmt("WARNING: " fmt), ##__VA_ARGS__)
printk_deferred_once(KERN_WARNING "WARNING: " fmt, ##__VA_ARGS__)
#define orc_warn_current(args...) \
({ \
if (state->task == current) \
orc_warn(args); \
})
extern int __start_orc_unwind_ip[];
extern int __stop_orc_unwind_ip[];
extern struct orc_entry __start_orc_unwind[];
extern struct orc_entry __stop_orc_unwind[];
static DEFINE_MUTEX(sort_mutex);
int *cur_orc_ip_table = __start_orc_unwind_ip;
struct orc_entry *cur_orc_table = __start_orc_unwind;
unsigned int lookup_num_blocks;
bool orc_init;
static bool orc_init __ro_after_init;
static unsigned int lookup_num_blocks __ro_after_init;
static inline unsigned long orc_ip(const int *ip)
{
@@ -142,9 +144,6 @@ static struct orc_entry *orc_find(unsigned long ip)
{
static struct orc_entry *orc;
if (!orc_init)
return NULL;
if (ip == 0)
return &null_orc_entry;
@@ -189,6 +188,10 @@ static struct orc_entry *orc_find(unsigned long ip)
#ifdef CONFIG_MODULES
static DEFINE_MUTEX(sort_mutex);
static int *cur_orc_ip_table = __start_orc_unwind_ip;
static struct orc_entry *cur_orc_table = __start_orc_unwind;
static void orc_sort_swap(void *_a, void *_b, int size)
{
struct orc_entry *orc_a, *orc_b;
@@ -317,12 +320,19 @@ EXPORT_SYMBOL_GPL(unwind_get_return_address);
unsigned long *unwind_get_return_address_ptr(struct unwind_state *state)
{
struct task_struct *task = state->task;
if (unwind_done(state))
return NULL;
if (state->regs)
return &state->regs->ip;
if (task != current && state->sp == task->thread.sp) {
struct inactive_task_frame *frame = (void *)task->thread.sp;
return &frame->ret_addr;
}
if (state->sp)
return (unsigned long *)state->sp - 1;
@@ -381,9 +391,38 @@ static bool deref_stack_iret_regs(struct unwind_state *state, unsigned long addr
return true;
}
/*
* If state->regs is non-NULL, and points to a full pt_regs, just get the reg
* value from state->regs.
*
* Otherwise, if state->regs just points to IRET regs, and the previous frame
* had full regs, it's safe to get the value from the previous regs. This can
* happen when early/late IRQ entry code gets interrupted by an NMI.
*/
static bool get_reg(struct unwind_state *state, unsigned int reg_off,
unsigned long *val)
{
unsigned int reg = reg_off/8;
if (!state->regs)
return false;
if (state->full_regs) {
*val = ((unsigned long *)state->regs)[reg];
return true;
}
if (state->prev_regs) {
*val = ((unsigned long *)state->prev_regs)[reg];
return true;
}
return false;
}
bool unwind_next_frame(struct unwind_state *state)
{
unsigned long ip_p, sp, orig_ip = state->ip, prev_sp = state->sp;
unsigned long ip_p, sp, tmp, orig_ip = state->ip, prev_sp = state->sp;
enum stack_type prev_type = state->stack_info.type;
struct orc_entry *orc;
bool indirect = false;
@@ -445,43 +484,39 @@ bool unwind_next_frame(struct unwind_state *state)
break;
case ORC_REG_R10:
if (!state->regs || !state->full_regs) {
orc_warn("missing regs for base reg R10 at ip %pB\n",
(void *)state->ip);
if (!get_reg(state, offsetof(struct pt_regs, r10), &sp)) {
orc_warn_current("missing R10 value at %pB\n",
(void *)state->ip);
goto err;
}
sp = state->regs->r10;
break;
case ORC_REG_R13:
if (!state->regs || !state->full_regs) {
orc_warn("missing regs for base reg R13 at ip %pB\n",
(void *)state->ip);
if (!get_reg(state, offsetof(struct pt_regs, r13), &sp)) {
orc_warn_current("missing R13 value at %pB\n",
(void *)state->ip);
goto err;
}
sp = state->regs->r13;
break;
case ORC_REG_DI:
if (!state->regs || !state->full_regs) {
orc_warn("missing regs for base reg DI at ip %pB\n",
(void *)state->ip);
if (!get_reg(state, offsetof(struct pt_regs, di), &sp)) {
orc_warn_current("missing RDI value at %pB\n",
(void *)state->ip);
goto err;
}
sp = state->regs->di;
break;
case ORC_REG_DX:
if (!state->regs || !state->full_regs) {
orc_warn("missing regs for base reg DX at ip %pB\n",
(void *)state->ip);
if (!get_reg(state, offsetof(struct pt_regs, dx), &sp)) {
orc_warn_current("missing DX value at %pB\n",
(void *)state->ip);
goto err;
}
sp = state->regs->dx;
break;
default:
orc_warn("unknown SP base reg %d for ip %pB\n",
orc_warn("unknown SP base reg %d at %pB\n",
orc->sp_reg, (void *)state->ip);
goto err;
}
@@ -504,44 +539,48 @@ bool unwind_next_frame(struct unwind_state *state)
state->sp = sp;
state->regs = NULL;
state->prev_regs = NULL;
state->signal = false;
break;
case ORC_TYPE_REGS:
if (!deref_stack_regs(state, sp, &state->ip, &state->sp)) {
orc_warn("can't dereference registers at %p for ip %pB\n",
(void *)sp, (void *)orig_ip);
orc_warn_current("can't access registers at %pB\n",
(void *)orig_ip);
goto err;
}
state->regs = (struct pt_regs *)sp;
state->prev_regs = NULL;
state->full_regs = true;
state->signal = true;
break;
case ORC_TYPE_REGS_IRET:
if (!deref_stack_iret_regs(state, sp, &state->ip, &state->sp)) {
orc_warn("can't dereference iret registers at %p for ip %pB\n",
(void *)sp, (void *)orig_ip);
orc_warn_current("can't access iret registers at %pB\n",
(void *)orig_ip);
goto err;
}
if (state->full_regs)
state->prev_regs = state->regs;
state->regs = (void *)sp - IRET_FRAME_OFFSET;
state->full_regs = false;
state->signal = true;
break;
default:
orc_warn("unknown .orc_unwind entry type %d for ip %pB\n",
orc_warn("unknown .orc_unwind entry type %d at %pB\n",
orc->type, (void *)orig_ip);
break;
goto err;
}
/* Find BP: */
switch (orc->bp_reg) {
case ORC_REG_UNDEFINED:
if (state->regs && state->full_regs)
state->bp = state->regs->bp;
if (get_reg(state, offsetof(struct pt_regs, bp), &tmp))
state->bp = tmp;
break;
case ORC_REG_PREV_SP:
@@ -564,8 +603,8 @@ bool unwind_next_frame(struct unwind_state *state)
if (state->stack_info.type == prev_type &&
on_stack(&state->stack_info, (void *)state->sp, sizeof(long)) &&
state->sp <= prev_sp) {
orc_warn("stack going in the wrong direction? ip=%pB\n",
(void *)orig_ip);
orc_warn_current("stack going in the wrong direction? at %pB\n",
(void *)orig_ip);
goto err;
}
@@ -588,17 +627,20 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
memset(state, 0, sizeof(*state));
state->task = task;
if (!orc_init)
goto err;
/*
* Refuse to unwind the stack of a task while it's executing on another
* CPU. This check is racy, but that's ok: the unwinder has other
* checks to prevent it from going off the rails.
*/
if (task_on_another_cpu(task))
goto done;
goto err;
if (regs) {
if (user_mode(regs))
goto done;
goto the_end;
state->ip = regs->ip;
state->sp = regs->sp;
@@ -631,6 +673,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
* generate some kind of backtrace if this happens.
*/
void *next_page = (void *)PAGE_ALIGN((unsigned long)state->sp);
state->error = true;
if (get_stack_info(next_page, state->task, &state->stack_info,
&state->stack_mask))
return;
@@ -651,13 +694,14 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
/* Otherwise, skip ahead to the user-specified starting frame: */
while (!unwind_done(state) &&
(!on_stack(&state->stack_info, first_frame, sizeof(long)) ||
state->sp <= (unsigned long)first_frame))
state->sp < (unsigned long)first_frame))
unwind_next_frame(state);
return;
done:
err:
state->error = true;
the_end:
state->stack_info.type = STACK_TYPE_UNKNOWN;
return;
}
EXPORT_SYMBOL_GPL(__unwind_start);

View File

@@ -171,7 +171,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
pte_t *pte;
int i;
down_write(&mm->mmap_sem);
mmap_write_lock(mm);
pgd = pgd_offset(mm, 0xA0000);
if (pgd_none_or_clear_bad(pgd))
goto out;
@@ -197,7 +197,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
}
pte_unmap_unlock(pte, ptl);
out:
up_write(&mm->mmap_sem);
mmap_write_unlock(mm);
flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false);
}

View File

@@ -134,7 +134,6 @@ SECTIONS
KPROBES_TEXT
ALIGN_ENTRY_TEXT_BEGIN
ENTRY_TEXT
IRQENTRY_TEXT
ALIGN_ENTRY_TEXT_END
SOFTIRQENTRY_TEXT
*(.fixup)

View File

@@ -79,7 +79,6 @@ struct x86_init_ops x86_init __initdata = {
.irqs = {
.pre_vector_init = init_ISA_irqs,
.intr_init = native_init_IRQ,
.trap_init = x86_init_noop,
.intr_mode_select = apic_intr_mode_select,
.intr_mode_init = apic_intr_mode_init
},