Merge drm/drm-next into drm-intel-next

Sync up with upstream.

Signed-off-by: Jani Nikula <jani.nikula@intel.com>
This commit is contained in:
Jani Nikula
2021-03-11 08:19:46 +02:00
11502 changed files with 484355 additions and 286620 deletions

View File

@@ -116,7 +116,6 @@ obj-$(CONFIG_VM86) += vm86_32.o
obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
obj-$(CONFIG_HPET_TIMER) += hpet.o
obj-$(CONFIG_APB_TIMER) += apb_timer.o
obj-$(CONFIG_AMD_NB) += amd_nb.o
obj-$(CONFIG_DEBUG_NMI_SELFTEST) += nmi_selftest.o

View File

@@ -1,5 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
OBJECT_FILES_NON_STANDARD_wakeup_$(BITS).o := y
obj-$(CONFIG_ACPI) += boot.o
obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o

View File

@@ -1,12 +1,14 @@
/* SPDX-License-Identifier: GPL-2.0-only */
.text
#include <linux/linkage.h>
#include <linux/objtool.h>
#include <asm/segment.h>
#include <asm/pgtable_types.h>
#include <asm/page_types.h>
#include <asm/msr.h>
#include <asm/asm-offsets.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>
# Copyright 2003 Pavel Machek <pavel@suse.cz
@@ -39,6 +41,7 @@ SYM_FUNC_START(wakeup_long64)
movq saved_rbp, %rbp
movq saved_rip, %rax
ANNOTATE_RETPOLINE_SAFE
jmp *%rax
SYM_FUNC_END(wakeup_long64)
@@ -126,6 +129,7 @@ SYM_FUNC_START(do_suspend_lowlevel)
FRAME_END
jmp restore_processor_state
SYM_FUNC_END(do_suspend_lowlevel)
STACK_FRAME_NON_STANDARD do_suspend_lowlevel
.data
saved_rbp: .quad 0

View File

@@ -1,347 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* apb_timer.c: Driver for Langwell APB timers
*
* (C) Copyright 2009 Intel Corporation
* Author: Jacob Pan (jacob.jun.pan@intel.com)
*
* Note:
* Langwell is the south complex of Intel Moorestown MID platform. There are
* eight external timers in total that can be used by the operating system.
* The timer information, such as frequency and addresses, is provided to the
* OS via SFI tables.
* Timer interrupts are routed via FW/HW emulated IOAPIC independently via
* individual redirection table entries (RTE).
* Unlike HPET, there is no master counter, therefore one of the timers are
* used as clocksource. The overall allocation looks like:
* - timer 0 - NR_CPUs for per cpu timer
* - one timer for clocksource
* - one timer for watchdog driver.
* It is also worth notice that APB timer does not support true one-shot mode,
* free-running mode will be used here to emulate one-shot mode.
* APB timer can also be used as broadcast timer along with per cpu local APIC
* timer, but by default APB timer has higher rating than local APIC timers.
*/
#include <linux/delay.h>
#include <linux/dw_apb_timer.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/pm.h>
#include <linux/sfi.h>
#include <linux/interrupt.h>
#include <linux/cpu.h>
#include <linux/irq.h>
#include <asm/fixmap.h>
#include <asm/apb_timer.h>
#include <asm/intel-mid.h>
#include <asm/time.h>
#define APBT_CLOCKEVENT_RATING 110
#define APBT_CLOCKSOURCE_RATING 250
#define APBT_CLOCKEVENT0_NUM (0)
#define APBT_CLOCKSOURCE_NUM (2)
static phys_addr_t apbt_address;
static int apb_timer_block_enabled;
static void __iomem *apbt_virt_address;
/*
* Common DW APB timer info
*/
static unsigned long apbt_freq;
struct apbt_dev {
struct dw_apb_clock_event_device *timer;
unsigned int num;
int cpu;
unsigned int irq;
char name[10];
};
static struct dw_apb_clocksource *clocksource_apbt;
static inline void __iomem *adev_virt_addr(struct apbt_dev *adev)
{
return apbt_virt_address + adev->num * APBTMRS_REG_SIZE;
}
static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
#ifdef CONFIG_SMP
static unsigned int apbt_num_timers_used;
#endif
static inline void apbt_set_mapping(void)
{
struct sfi_timer_table_entry *mtmr;
int phy_cs_timer_id = 0;
if (apbt_virt_address) {
pr_debug("APBT base already mapped\n");
return;
}
mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
if (mtmr == NULL) {
printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
APBT_CLOCKEVENT0_NUM);
return;
}
apbt_address = (phys_addr_t)mtmr->phys_addr;
if (!apbt_address) {
printk(KERN_WARNING "No timer base from SFI, use default\n");
apbt_address = APBT_DEFAULT_BASE;
}
apbt_virt_address = ioremap(apbt_address, APBT_MMAP_SIZE);
if (!apbt_virt_address) {
pr_debug("Failed mapping APBT phy address at %lu\n",\
(unsigned long)apbt_address);
goto panic_noapbt;
}
apbt_freq = mtmr->freq_hz;
sfi_free_mtmr(mtmr);
/* Now figure out the physical timer id for clocksource device */
mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM);
if (mtmr == NULL)
goto panic_noapbt;
/* Now figure out the physical timer id */
pr_debug("Use timer %d for clocksource\n",
(int)(mtmr->phys_addr & 0xff) / APBTMRS_REG_SIZE);
phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff) /
APBTMRS_REG_SIZE;
clocksource_apbt = dw_apb_clocksource_init(APBT_CLOCKSOURCE_RATING,
"apbt0", apbt_virt_address + phy_cs_timer_id *
APBTMRS_REG_SIZE, apbt_freq);
return;
panic_noapbt:
panic("Failed to setup APB system timer\n");
}
static inline void apbt_clear_mapping(void)
{
iounmap(apbt_virt_address);
apbt_virt_address = NULL;
}
static int __init apbt_clockevent_register(void)
{
struct sfi_timer_table_entry *mtmr;
struct apbt_dev *adev = this_cpu_ptr(&cpu_apbt_dev);
mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
if (mtmr == NULL) {
printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
APBT_CLOCKEVENT0_NUM);
return -ENODEV;
}
adev->num = smp_processor_id();
adev->timer = dw_apb_clockevent_init(smp_processor_id(), "apbt0",
intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ?
APBT_CLOCKEVENT_RATING - 100 : APBT_CLOCKEVENT_RATING,
adev_virt_addr(adev), 0, apbt_freq);
/* Firmware does EOI handling for us. */
adev->timer->eoi = NULL;
if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {
global_clock_event = &adev->timer->ced;
printk(KERN_DEBUG "%s clockevent registered as global\n",
global_clock_event->name);
}
dw_apb_clockevent_register(adev->timer);
sfi_free_mtmr(mtmr);
return 0;
}
#ifdef CONFIG_SMP
static void apbt_setup_irq(struct apbt_dev *adev)
{
irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
}
/* Should be called with per cpu */
void apbt_setup_secondary_clock(void)
{
struct apbt_dev *adev;
int cpu;
/* Don't register boot CPU clockevent */
cpu = smp_processor_id();
if (!cpu)
return;
adev = this_cpu_ptr(&cpu_apbt_dev);
if (!adev->timer) {
adev->timer = dw_apb_clockevent_init(cpu, adev->name,
APBT_CLOCKEVENT_RATING, adev_virt_addr(adev),
adev->irq, apbt_freq);
adev->timer->eoi = NULL;
} else {
dw_apb_clockevent_resume(adev->timer);
}
printk(KERN_INFO "Registering CPU %d clockevent device %s, cpu %08x\n",
cpu, adev->name, adev->cpu);
apbt_setup_irq(adev);
dw_apb_clockevent_register(adev->timer);
return;
}
/*
* this notify handler process CPU hotplug events. in case of S0i3, nonboot
* cpus are disabled/enabled frequently, for performance reasons, we keep the
* per cpu timer irq registered so that we do need to do free_irq/request_irq.
*
* TODO: it might be more reliable to directly disable percpu clockevent device
* without the notifier chain. currently, cpu 0 may get interrupts from other
* cpu timers during the offline process due to the ordering of notification.
* the extra interrupt is harmless.
*/
static int apbt_cpu_dead(unsigned int cpu)
{
struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu);
dw_apb_clockevent_pause(adev->timer);
if (system_state == SYSTEM_RUNNING) {
pr_debug("skipping APBT CPU %u offline\n", cpu);
} else {
pr_debug("APBT clockevent for cpu %u offline\n", cpu);
dw_apb_clockevent_stop(adev->timer);
}
return 0;
}
static __init int apbt_late_init(void)
{
if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT ||
!apb_timer_block_enabled)
return 0;
return cpuhp_setup_state(CPUHP_X86_APB_DEAD, "x86/apb:dead", NULL,
apbt_cpu_dead);
}
fs_initcall(apbt_late_init);
#else
void apbt_setup_secondary_clock(void) {}
#endif /* CONFIG_SMP */
static int apbt_clocksource_register(void)
{
u64 start, now;
u64 t1;
/* Start the counter, use timer 2 as source, timer 0/1 for event */
dw_apb_clocksource_start(clocksource_apbt);
/* Verify whether apbt counter works */
t1 = dw_apb_clocksource_read(clocksource_apbt);
start = rdtsc();
/*
* We don't know the TSC frequency yet, but waiting for
* 200000 TSC cycles is safe:
* 4 GHz == 50us
* 1 GHz == 200us
*/
do {
rep_nop();
now = rdtsc();
} while ((now - start) < 200000UL);
/* APBT is the only always on clocksource, it has to work! */
if (t1 == dw_apb_clocksource_read(clocksource_apbt))
panic("APBT counter not counting. APBT disabled\n");
dw_apb_clocksource_register(clocksource_apbt);
return 0;
}
/*
* Early setup the APBT timer, only use timer 0 for booting then switch to
* per CPU timer if possible.
* returns 1 if per cpu apbt is setup
* returns 0 if no per cpu apbt is chosen
* panic if set up failed, this is the only platform timer on Moorestown.
*/
void __init apbt_time_init(void)
{
#ifdef CONFIG_SMP
int i;
struct sfi_timer_table_entry *p_mtmr;
struct apbt_dev *adev;
#endif
if (apb_timer_block_enabled)
return;
apbt_set_mapping();
if (!apbt_virt_address)
goto out_noapbt;
/*
* Read the frequency and check for a sane value, for ESL model
* we extend the possible clock range to allow time scaling.
*/
if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
pr_debug("APBT has invalid freq 0x%lx\n", apbt_freq);
goto out_noapbt;
}
if (apbt_clocksource_register()) {
pr_debug("APBT has failed to register clocksource\n");
goto out_noapbt;
}
if (!apbt_clockevent_register())
apb_timer_block_enabled = 1;
else {
pr_debug("APBT has failed to register clockevent\n");
goto out_noapbt;
}
#ifdef CONFIG_SMP
/* kernel cmdline disable apb timer, so we will use lapic timers */
if (intel_mid_timer_options == INTEL_MID_TIMER_LAPIC_APBT) {
printk(KERN_INFO "apbt: disabled per cpu timer\n");
return;
}
pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus());
if (num_possible_cpus() <= sfi_mtimer_num)
apbt_num_timers_used = num_possible_cpus();
else
apbt_num_timers_used = 1;
pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
/* here we set up per CPU timer data structure */
for (i = 0; i < apbt_num_timers_used; i++) {
adev = &per_cpu(cpu_apbt_dev, i);
adev->num = i;
adev->cpu = i;
p_mtmr = sfi_get_mtmr(i);
if (p_mtmr)
adev->irq = p_mtmr->irq;
else
printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
snprintf(adev->name, sizeof(adev->name) - 1, "apbt%d", i);
}
#endif
return;
out_noapbt:
apbt_clear_mapping();
apb_timer_block_enabled = 0;
panic("failed to enable APB timer\n");
}

View File

@@ -41,6 +41,7 @@
#include <asm/perf_event.h>
#include <asm/x86_init.h>
#include <linux/atomic.h>
#include <asm/barrier.h>
#include <asm/mpspec.h>
#include <asm/i8259.h>
#include <asm/proto.h>
@@ -477,6 +478,9 @@ static int lapic_next_deadline(unsigned long delta,
{
u64 tsc;
/* This MSR is special and need a special fence: */
weak_wrmsr_fence();
tsc = rdtsc();
wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
return 0;
@@ -1743,6 +1747,7 @@ void apic_ap_setup(void)
#ifdef CONFIG_X86_X2APIC
int x2apic_mode;
EXPORT_SYMBOL_GPL(x2apic_mode);
enum {
X2APIC_OFF,
@@ -2133,18 +2138,11 @@ void __init register_lapic_address(unsigned long address)
* Local APIC interrupts
*/
/**
* spurious_interrupt - Catch all for interrupts raised on unused vectors
* @regs: Pointer to pt_regs on stack
* @vector: The vector number
*
* This is invoked from ASM entry code to catch all interrupts which
* trigger on an entry which is routed to the common_spurious idtentry
* point.
*
* Also called from sysvec_spurious_apic_interrupt().
/*
* Common handling code for spurious_interrupt and spurious_vector entry
* points below. No point in allowing the compiler to inline it twice.
*/
DEFINE_IDTENTRY_IRQ(spurious_interrupt)
static noinline void handle_spurious_interrupt(u8 vector)
{
u32 v;
@@ -2179,9 +2177,23 @@ out:
trace_spurious_apic_exit(vector);
}
/**
* spurious_interrupt - Catch all for interrupts raised on unused vectors
* @regs: Pointer to pt_regs on stack
* @vector: The vector number
*
* This is invoked from ASM entry code to catch all interrupts which
* trigger on an entry which is routed to the common_spurious idtentry
* point.
*/
DEFINE_IDTENTRY_IRQ(spurious_interrupt)
{
handle_spurious_interrupt(vector);
}
DEFINE_IDTENTRY_SYSVEC(sysvec_spurious_apic_interrupt)
{
__spurious_interrupt(regs, SPURIOUS_APIC_VECTOR);
handle_spurious_interrupt(SPURIOUS_APIC_VECTOR);
}
/*

View File

@@ -198,7 +198,7 @@ static int __init parse_noapic(char *str)
}
early_param("noapic", parse_noapic);
/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
/* Will be called in mpparse/ACPI codes for saving IRQ info */
void mp_save_irq(struct mpc_intsrc *m)
{
int i;
@@ -2863,7 +2863,7 @@ int mp_register_ioapic(int id, u32 address, u32 gsi_base,
/*
* If mp_register_ioapic() is called during early boot stage when
* walking ACPI/SFI/DT tables, it's too early to create irqdomain,
* walking ACPI/DT tables, it's too early to create irqdomain,
* we are still using bootmem allocator. So delay it to setup_IO_APIC().
*/
if (hotplug) {

View File

@@ -29,7 +29,8 @@ static void x2apic_send_IPI(int cpu, int vector)
{
u32 dest = per_cpu(x86_cpu_to_logical_apicid, cpu);
x2apic_wrmsr_fence();
/* x2apic MSRs are special and need a special fence: */
weak_wrmsr_fence();
__x2apic_send_IPI_dest(dest, vector, APIC_DEST_LOGICAL);
}
@@ -41,7 +42,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
unsigned long flags;
u32 dest;
x2apic_wrmsr_fence();
/* x2apic MSRs are special and need a special fence: */
weak_wrmsr_fence();
local_irq_save(flags);
tmpmsk = this_cpu_cpumask_var_ptr(ipi_mask);

View File

@@ -43,7 +43,8 @@ static void x2apic_send_IPI(int cpu, int vector)
{
u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
x2apic_wrmsr_fence();
/* x2apic MSRs are special and need a special fence: */
weak_wrmsr_fence();
__x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL);
}
@@ -54,7 +55,8 @@ __x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
unsigned long this_cpu;
unsigned long flags;
x2apic_wrmsr_fence();
/* x2apic MSRs are special and need a special fence: */
weak_wrmsr_fence();
local_irq_save(flags);
@@ -125,7 +127,8 @@ void __x2apic_send_IPI_shorthand(int vector, u32 which)
{
unsigned long cfg = __prepare_ICR(which, vector, 0);
x2apic_wrmsr_fence();
/* x2apic MSRs are special and need a special fence: */
weak_wrmsr_fence();
native_x2apic_icr_write(cfg, 0);
}

View File

@@ -13,9 +13,6 @@ int main(void)
{
#ifdef CONFIG_PARAVIRT
#ifdef CONFIG_PARAVIRT_XXL
OFFSET(PV_CPU_usergs_sysret64, paravirt_patch_template,
cpu.usergs_sysret64);
OFFSET(PV_CPU_swapgs, paravirt_patch_template, cpu.swapgs);
#ifdef CONFIG_DEBUG_ENTRY
OFFSET(PV_IRQ_save_fl, paravirt_patch_template, irq.save_fl);
#endif

View File

@@ -10,6 +10,8 @@
*/
#include <linux/interrupt.h>
#include <asm/acrn.h>
#include <asm/apic.h>
#include <asm/cpufeatures.h>
#include <asm/desc.h>
@@ -19,7 +21,7 @@
static u32 __init acrn_detect(void)
{
return hypervisor_cpuid_base("ACRNACRNACRN", 0);
return acrn_cpuid_base();
}
static void __init acrn_init_platform(void)
@@ -55,6 +57,18 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_acrn_hv_callback)
set_irq_regs(old_regs);
}
void acrn_setup_intr_handler(void (*handler)(void))
{
acrn_intr_handler = handler;
}
EXPORT_SYMBOL_GPL(acrn_setup_intr_handler);
void acrn_remove_intr_handler(void)
{
acrn_intr_handler = NULL;
}
EXPORT_SYMBOL_GPL(acrn_remove_intr_handler);
const __initconst struct hypervisor_x86 x86_hyper_acrn = {
.name = "ACRN",
.detect = acrn_detect,

View File

@@ -542,12 +542,12 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
u32 ecx;
ecx = cpuid_ecx(0x8000001e);
nodes_per_socket = ((ecx >> 8) & 7) + 1;
__max_die_per_package = nodes_per_socket = ((ecx >> 8) & 7) + 1;
} else if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) {
u64 value;
rdmsrl(MSR_FAM10H_NODE_ID, value);
nodes_per_socket = ((value >> 3) & 7) + 1;
__max_die_per_package = nodes_per_socket = ((value >> 3) & 7) + 1;
}
if (!boot_cpu_has(X86_FEATURE_AMD_SSBD) &&

View File

@@ -960,6 +960,9 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
if (c->extended_cpuid_level >= 0x8000000a)
c->x86_capability[CPUID_8000_000A_EDX] = cpuid_edx(0x8000000a);
if (c->extended_cpuid_level >= 0x8000001f)
c->x86_capability[CPUID_8000_001F_EAX] = cpuid_eax(0x8000001f);
init_scattered_cpuid_features(c);
init_speculation_control(c);
@@ -1739,8 +1742,8 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
&init_task;
EXPORT_PER_CPU_SYMBOL(current_task);
DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
DEFINE_PER_CPU(void *, hardirq_stack_ptr);
DEFINE_PER_CPU(bool, hardirq_stack_inuse);
DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);

View File

@@ -24,6 +24,7 @@
#include <asm/traps.h>
#include <asm/resctrl.h>
#include <asm/numa.h>
#include <asm/thermal.h>
#ifdef CONFIG_X86_64
#include <linux/topology.h>
@@ -719,6 +720,8 @@ static void init_intel(struct cpuinfo_x86 *c)
tsx_disable();
split_lock_init();
intel_init_thermal(c);
}
#ifdef CONFIG_X86_32
@@ -1159,6 +1162,7 @@ static const struct x86_cpu_id split_lock_cpu_ids[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE, 1),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, 1),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE, 1),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, 1),
{}
};

View File

@@ -9,8 +9,6 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
mce-inject-y := inject.o
obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
obj-$(CONFIG_ACPI_APEI) += apei.o
obj-$(CONFIG_X86_MCELOG_LEGACY) += dev-mcelog.o

View File

@@ -877,6 +877,12 @@ static atomic_t mce_executing;
*/
static atomic_t mce_callin;
/*
* Track which CPUs entered the MCA broadcast synchronization and which not in
* order to print holdouts.
*/
static cpumask_t mce_missing_cpus = CPU_MASK_ALL;
/*
* Check if a timeout waiting for other CPUs happened.
*/
@@ -894,8 +900,12 @@ static int mce_timed_out(u64 *t, const char *msg)
if (!mca_cfg.monarch_timeout)
goto out;
if ((s64)*t < SPINUNIT) {
if (mca_cfg.tolerant <= 1)
if (mca_cfg.tolerant <= 1) {
if (cpumask_and(&mce_missing_cpus, cpu_online_mask, &mce_missing_cpus))
pr_emerg("CPUs not responding to MCE broadcast (may include false positives): %*pbl\n",
cpumask_pr_args(&mce_missing_cpus));
mce_panic(msg, NULL, NULL);
}
cpu_missing = 1;
return 1;
}
@@ -1006,6 +1016,7 @@ static int mce_start(int *no_way_out)
* is updated before mce_callin.
*/
order = atomic_inc_return(&mce_callin);
cpumask_clear_cpu(smp_processor_id(), &mce_missing_cpus);
/*
* Wait for everyone.
@@ -1114,6 +1125,7 @@ static int mce_end(int order)
reset:
atomic_set(&global_nwo, 0);
atomic_set(&mce_callin, 0);
cpumask_setall(&mce_missing_cpus);
barrier();
/*
@@ -1992,10 +2004,9 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs)
* that out because it's an indirect call. Annotate it.
*/
instrumentation_begin();
trace_hardirqs_off_finish();
machine_check_vector(regs);
if (regs->flags & X86_EFLAGS_IF)
trace_hardirqs_on_prepare();
instrumentation_end();
irqentry_nmi_exit(regs, irq_state);
}
@@ -2004,7 +2015,9 @@ static __always_inline void exc_machine_check_user(struct pt_regs *regs)
{
irqentry_enter_from_user_mode(regs);
instrumentation_begin();
machine_check_vector(regs);
instrumentation_end();
irqentry_exit_to_user_mode(regs);
}
@@ -2177,7 +2190,6 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
mcheck_intel_therm_init();
mce_register_decode_chain(&early_nb);
mce_register_decode_chain(&mce_uc_nb);
mce_register_decode_chain(&mce_default_nb);
@@ -2712,6 +2724,7 @@ static void mce_reset(void)
atomic_set(&mce_executing, 0);
atomic_set(&mce_callin, 0);
atomic_set(&global_nwo, 0);
cpumask_setall(&mce_missing_cpus);
}
static int fake_panic_get(void *data, u64 *val)

View File

@@ -531,7 +531,6 @@ static void intel_imc_init(struct cpuinfo_x86 *c)
void mce_intel_feature_init(struct cpuinfo_x86 *c)
{
intel_init_thermal(c);
intel_init_cmci();
intel_init_lmce();
intel_ppin_init(c);

View File

@@ -1,739 +0,0 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Thermal throttle event support code (such as syslog messaging and rate
* limiting) that was factored out from x86_64 (mce_intel.c) and i386 (p4.c).
*
* This allows consistent reporting of CPU thermal throttle events.
*
* Maintains a counter in /sys that keeps track of the number of thermal
* events, such that the user knows how bad the thermal problem might be
* (since the logging to syslog is rate limited).
*
* Author: Dmitriy Zavin (dmitriyz@google.com)
*
* Credits: Adapted from Zwane Mwaikambo's original code in mce_intel.c.
* Inspired by Ross Biro's and Al Borchers' counter code.
*/
#include <linux/interrupt.h>
#include <linux/notifier.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/smp.h>
#include <linux/cpu.h>
#include <asm/processor.h>
#include <asm/traps.h>
#include <asm/apic.h>
#include <asm/mce.h>
#include <asm/msr.h>
#include <asm/trace/irq_vectors.h>
#include "internal.h"
/* How long to wait between reporting thermal events */
#define CHECK_INTERVAL (300 * HZ)
#define THERMAL_THROTTLING_EVENT 0
#define POWER_LIMIT_EVENT 1
/**
* struct _thermal_state - Represent the current thermal event state
* @next_check: Stores the next timestamp, when it is allowed
* to log the next warning message.
* @last_interrupt_time: Stores the timestamp for the last threshold
* high event.
* @therm_work: Delayed workqueue structure
* @count: Stores the current running count for thermal
* or power threshold interrupts.
* @last_count: Stores the previous running count for thermal
* or power threshold interrupts.
* @max_time_ms: This shows the maximum amount of time CPU was
* in throttled state for a single thermal
* threshold high to low state.
* @total_time_ms: This is a cumulative time during which CPU was
* in the throttled state.
* @rate_control_active: Set when a throttling message is logged.
* This is used for the purpose of rate-control.
* @new_event: Stores the last high/low status of the
* THERM_STATUS_PROCHOT or
* THERM_STATUS_POWER_LIMIT.
* @level: Stores whether this _thermal_state instance is
* for a CORE level or for PACKAGE level.
* @sample_index: Index for storing the next sample in the buffer
* temp_samples[].
* @sample_count: Total number of samples collected in the buffer
* temp_samples[].
* @average: The last moving average of temperature samples
* @baseline_temp: Temperature at which thermal threshold high
* interrupt was generated.
* @temp_samples: Storage for temperature samples to calculate
* moving average.
*
* This structure is used to represent data related to thermal state for a CPU.
* There is a separate storage for core and package level for each CPU.
*/
struct _thermal_state {
u64 next_check;
u64 last_interrupt_time;
struct delayed_work therm_work;
unsigned long count;
unsigned long last_count;
unsigned long max_time_ms;
unsigned long total_time_ms;
bool rate_control_active;
bool new_event;
u8 level;
u8 sample_index;
u8 sample_count;
u8 average;
u8 baseline_temp;
u8 temp_samples[3];
};
struct thermal_state {
struct _thermal_state core_throttle;
struct _thermal_state core_power_limit;
struct _thermal_state package_throttle;
struct _thermal_state package_power_limit;
struct _thermal_state core_thresh0;
struct _thermal_state core_thresh1;
struct _thermal_state pkg_thresh0;
struct _thermal_state pkg_thresh1;
};
/* Callback to handle core threshold interrupts */
int (*platform_thermal_notify)(__u64 msr_val);
EXPORT_SYMBOL(platform_thermal_notify);
/* Callback to handle core package threshold_interrupts */
int (*platform_thermal_package_notify)(__u64 msr_val);
EXPORT_SYMBOL_GPL(platform_thermal_package_notify);
/* Callback support of rate control, return true, if
* callback has rate control */
bool (*platform_thermal_package_rate_control)(void);
EXPORT_SYMBOL_GPL(platform_thermal_package_rate_control);
static DEFINE_PER_CPU(struct thermal_state, thermal_state);
static atomic_t therm_throt_en = ATOMIC_INIT(0);
static u32 lvtthmr_init __read_mostly;
#ifdef CONFIG_SYSFS
#define define_therm_throt_device_one_ro(_name) \
static DEVICE_ATTR(_name, 0444, \
therm_throt_device_show_##_name, \
NULL) \
#define define_therm_throt_device_show_func(event, name) \
\
static ssize_t therm_throt_device_show_##event##_##name( \
struct device *dev, \
struct device_attribute *attr, \
char *buf) \
{ \
unsigned int cpu = dev->id; \
ssize_t ret; \
\
preempt_disable(); /* CPU hotplug */ \
if (cpu_online(cpu)) { \
ret = sprintf(buf, "%lu\n", \
per_cpu(thermal_state, cpu).event.name); \
} else \
ret = 0; \
preempt_enable(); \
\
return ret; \
}
define_therm_throt_device_show_func(core_throttle, count);
define_therm_throt_device_one_ro(core_throttle_count);
define_therm_throt_device_show_func(core_power_limit, count);
define_therm_throt_device_one_ro(core_power_limit_count);
define_therm_throt_device_show_func(package_throttle, count);
define_therm_throt_device_one_ro(package_throttle_count);
define_therm_throt_device_show_func(package_power_limit, count);
define_therm_throt_device_one_ro(package_power_limit_count);
define_therm_throt_device_show_func(core_throttle, max_time_ms);
define_therm_throt_device_one_ro(core_throttle_max_time_ms);
define_therm_throt_device_show_func(package_throttle, max_time_ms);
define_therm_throt_device_one_ro(package_throttle_max_time_ms);
define_therm_throt_device_show_func(core_throttle, total_time_ms);
define_therm_throt_device_one_ro(core_throttle_total_time_ms);
define_therm_throt_device_show_func(package_throttle, total_time_ms);
define_therm_throt_device_one_ro(package_throttle_total_time_ms);
static struct attribute *thermal_throttle_attrs[] = {
&dev_attr_core_throttle_count.attr,
&dev_attr_core_throttle_max_time_ms.attr,
&dev_attr_core_throttle_total_time_ms.attr,
NULL
};
static const struct attribute_group thermal_attr_group = {
.attrs = thermal_throttle_attrs,
.name = "thermal_throttle"
};
#endif /* CONFIG_SYSFS */
#define CORE_LEVEL 0
#define PACKAGE_LEVEL 1
#define THERM_THROT_POLL_INTERVAL HZ
#define THERM_STATUS_PROCHOT_LOG BIT(1)
#define THERM_STATUS_CLEAR_CORE_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11) | BIT(13) | BIT(15))
#define THERM_STATUS_CLEAR_PKG_MASK (BIT(1) | BIT(3) | BIT(5) | BIT(7) | BIT(9) | BIT(11))
static void clear_therm_status_log(int level)
{
int msr;
u64 mask, msr_val;
if (level == CORE_LEVEL) {
msr = MSR_IA32_THERM_STATUS;
mask = THERM_STATUS_CLEAR_CORE_MASK;
} else {
msr = MSR_IA32_PACKAGE_THERM_STATUS;
mask = THERM_STATUS_CLEAR_PKG_MASK;
}
rdmsrl(msr, msr_val);
msr_val &= mask;
wrmsrl(msr, msr_val & ~THERM_STATUS_PROCHOT_LOG);
}
static void get_therm_status(int level, bool *proc_hot, u8 *temp)
{
int msr;
u64 msr_val;
if (level == CORE_LEVEL)
msr = MSR_IA32_THERM_STATUS;
else
msr = MSR_IA32_PACKAGE_THERM_STATUS;
rdmsrl(msr, msr_val);
if (msr_val & THERM_STATUS_PROCHOT_LOG)
*proc_hot = true;
else
*proc_hot = false;
*temp = (msr_val >> 16) & 0x7F;
}
static void __maybe_unused throttle_active_work(struct work_struct *work)
{
struct _thermal_state *state = container_of(to_delayed_work(work),
struct _thermal_state, therm_work);
unsigned int i, avg, this_cpu = smp_processor_id();
u64 now = get_jiffies_64();
bool hot;
u8 temp;
get_therm_status(state->level, &hot, &temp);
/* temperature value is offset from the max so lesser means hotter */
if (!hot && temp > state->baseline_temp) {
if (state->rate_control_active)
pr_info("CPU%d: %s temperature/speed normal (total events = %lu)\n",
this_cpu,
state->level == CORE_LEVEL ? "Core" : "Package",
state->count);
state->rate_control_active = false;
return;
}
if (time_before64(now, state->next_check) &&
state->rate_control_active)
goto re_arm;
state->next_check = now + CHECK_INTERVAL;
if (state->count != state->last_count) {
/* There was one new thermal interrupt */
state->last_count = state->count;
state->average = 0;
state->sample_count = 0;
state->sample_index = 0;
}
state->temp_samples[state->sample_index] = temp;
state->sample_count++;
state->sample_index = (state->sample_index + 1) % ARRAY_SIZE(state->temp_samples);
if (state->sample_count < ARRAY_SIZE(state->temp_samples))
goto re_arm;
avg = 0;
for (i = 0; i < ARRAY_SIZE(state->temp_samples); ++i)
avg += state->temp_samples[i];
avg /= ARRAY_SIZE(state->temp_samples);
if (state->average > avg) {
pr_warn("CPU%d: %s temperature is above threshold, cpu clock is throttled (total events = %lu)\n",
this_cpu,
state->level == CORE_LEVEL ? "Core" : "Package",
state->count);
state->rate_control_active = true;
}
state->average = avg;
re_arm:
clear_therm_status_log(state->level);
schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
}
/***
* therm_throt_process - Process thermal throttling event from interrupt
* @curr: Whether the condition is current or not (boolean), since the
* thermal interrupt normally gets called both when the thermal
* event begins and once the event has ended.
*
* This function is called by the thermal interrupt after the
* IRQ has been acknowledged.
*
* It will take care of rate limiting and printing messages to the syslog.
*/
static void therm_throt_process(bool new_event, int event, int level)
{
struct _thermal_state *state;
unsigned int this_cpu = smp_processor_id();
bool old_event;
u64 now;
struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
now = get_jiffies_64();
if (level == CORE_LEVEL) {
if (event == THERMAL_THROTTLING_EVENT)
state = &pstate->core_throttle;
else if (event == POWER_LIMIT_EVENT)
state = &pstate->core_power_limit;
else
return;
} else if (level == PACKAGE_LEVEL) {
if (event == THERMAL_THROTTLING_EVENT)
state = &pstate->package_throttle;
else if (event == POWER_LIMIT_EVENT)
state = &pstate->package_power_limit;
else
return;
} else
return;
old_event = state->new_event;
state->new_event = new_event;
if (new_event)
state->count++;
if (event != THERMAL_THROTTLING_EVENT)
return;
if (new_event && !state->last_interrupt_time) {
bool hot;
u8 temp;
get_therm_status(state->level, &hot, &temp);
/*
* Ignore short temperature spike as the system is not close
* to PROCHOT. 10C offset is large enough to ignore. It is
* already dropped from the high threshold temperature.
*/
if (temp > 10)
return;
state->baseline_temp = temp;
state->last_interrupt_time = now;
schedule_delayed_work_on(this_cpu, &state->therm_work, THERM_THROT_POLL_INTERVAL);
} else if (old_event && state->last_interrupt_time) {
unsigned long throttle_time;
throttle_time = jiffies_delta_to_msecs(now - state->last_interrupt_time);
if (throttle_time > state->max_time_ms)
state->max_time_ms = throttle_time;
state->total_time_ms += throttle_time;
state->last_interrupt_time = 0;
}
}
static int thresh_event_valid(int level, int event)
{
struct _thermal_state *state;
unsigned int this_cpu = smp_processor_id();
struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
u64 now = get_jiffies_64();
if (level == PACKAGE_LEVEL)
state = (event == 0) ? &pstate->pkg_thresh0 :
&pstate->pkg_thresh1;
else
state = (event == 0) ? &pstate->core_thresh0 :
&pstate->core_thresh1;
if (time_before64(now, state->next_check))
return 0;
state->next_check = now + CHECK_INTERVAL;
return 1;
}
static bool int_pln_enable;
static int __init int_pln_enable_setup(char *s)
{
int_pln_enable = true;
return 1;
}
__setup("int_pln_enable", int_pln_enable_setup);
#ifdef CONFIG_SYSFS
/* Add/Remove thermal_throttle interface for CPU device: */
static int thermal_throttle_add_dev(struct device *dev, unsigned int cpu)
{
int err;
struct cpuinfo_x86 *c = &cpu_data(cpu);
err = sysfs_create_group(&dev->kobj, &thermal_attr_group);
if (err)
return err;
if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_core_power_limit_count.attr,
thermal_attr_group.name);
if (err)
goto del_group;
}
if (cpu_has(c, X86_FEATURE_PTS)) {
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_throttle_count.attr,
thermal_attr_group.name);
if (err)
goto del_group;
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_throttle_max_time_ms.attr,
thermal_attr_group.name);
if (err)
goto del_group;
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_throttle_total_time_ms.attr,
thermal_attr_group.name);
if (err)
goto del_group;
if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable) {
err = sysfs_add_file_to_group(&dev->kobj,
&dev_attr_package_power_limit_count.attr,
thermal_attr_group.name);
if (err)
goto del_group;
}
}
return 0;
del_group:
sysfs_remove_group(&dev->kobj, &thermal_attr_group);
return err;
}
static void thermal_throttle_remove_dev(struct device *dev)
{
sysfs_remove_group(&dev->kobj, &thermal_attr_group);
}
/* Get notified when a cpu comes on/off. Be hotplug friendly. */
static int thermal_throttle_online(unsigned int cpu)
{
struct thermal_state *state = &per_cpu(thermal_state, cpu);
struct device *dev = get_cpu_device(cpu);
u32 l;
state->package_throttle.level = PACKAGE_LEVEL;
state->core_throttle.level = CORE_LEVEL;
INIT_DELAYED_WORK(&state->package_throttle.therm_work, throttle_active_work);
INIT_DELAYED_WORK(&state->core_throttle.therm_work, throttle_active_work);
/* Unmask the thermal vector after the above workqueues are initialized. */
l = apic_read(APIC_LVTTHMR);
apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
return thermal_throttle_add_dev(dev, cpu);
}
static int thermal_throttle_offline(unsigned int cpu)
{
struct thermal_state *state = &per_cpu(thermal_state, cpu);
struct device *dev = get_cpu_device(cpu);
u32 l;
/* Mask the thermal vector before draining evtl. pending work */
l = apic_read(APIC_LVTTHMR);
apic_write(APIC_LVTTHMR, l | APIC_LVT_MASKED);
cancel_delayed_work_sync(&state->package_throttle.therm_work);
cancel_delayed_work_sync(&state->core_throttle.therm_work);
state->package_throttle.rate_control_active = false;
state->core_throttle.rate_control_active = false;
thermal_throttle_remove_dev(dev);
return 0;
}
static __init int thermal_throttle_init_device(void)
{
int ret;
if (!atomic_read(&therm_throt_en))
return 0;
ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online",
thermal_throttle_online,
thermal_throttle_offline);
return ret < 0 ? ret : 0;
}
device_initcall(thermal_throttle_init_device);
#endif /* CONFIG_SYSFS */
static void notify_package_thresholds(__u64 msr_val)
{
bool notify_thres_0 = false;
bool notify_thres_1 = false;
if (!platform_thermal_package_notify)
return;
/* lower threshold check */
if (msr_val & THERM_LOG_THRESHOLD0)
notify_thres_0 = true;
/* higher threshold check */
if (msr_val & THERM_LOG_THRESHOLD1)
notify_thres_1 = true;
if (!notify_thres_0 && !notify_thres_1)
return;
if (platform_thermal_package_rate_control &&
platform_thermal_package_rate_control()) {
/* Rate control is implemented in callback */
platform_thermal_package_notify(msr_val);
return;
}
/* lower threshold reached */
if (notify_thres_0 && thresh_event_valid(PACKAGE_LEVEL, 0))
platform_thermal_package_notify(msr_val);
/* higher threshold reached */
if (notify_thres_1 && thresh_event_valid(PACKAGE_LEVEL, 1))
platform_thermal_package_notify(msr_val);
}
static void notify_thresholds(__u64 msr_val)
{
/* check whether the interrupt handler is defined;
* otherwise simply return
*/
if (!platform_thermal_notify)
return;
/* lower threshold reached */
if ((msr_val & THERM_LOG_THRESHOLD0) &&
thresh_event_valid(CORE_LEVEL, 0))
platform_thermal_notify(msr_val);
/* higher threshold reached */
if ((msr_val & THERM_LOG_THRESHOLD1) &&
thresh_event_valid(CORE_LEVEL, 1))
platform_thermal_notify(msr_val);
}
/* Thermal transition interrupt handler */
static void intel_thermal_interrupt(void)
{
__u64 msr_val;
if (static_cpu_has(X86_FEATURE_HWP))
wrmsrl_safe(MSR_HWP_STATUS, 0);
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
/* Check for violation of core thermal thresholds*/
notify_thresholds(msr_val);
therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
THERMAL_THROTTLING_EVENT,
CORE_LEVEL);
if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
POWER_LIMIT_EVENT,
CORE_LEVEL);
if (this_cpu_has(X86_FEATURE_PTS)) {
rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
/* check violations of package thermal thresholds */
notify_package_thresholds(msr_val);
therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
THERMAL_THROTTLING_EVENT,
PACKAGE_LEVEL);
if (this_cpu_has(X86_FEATURE_PLN) && int_pln_enable)
therm_throt_process(msr_val &
PACKAGE_THERM_STATUS_POWER_LIMIT,
POWER_LIMIT_EVENT,
PACKAGE_LEVEL);
}
}
static void unexpected_thermal_interrupt(void)
{
pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
smp_processor_id());
}
static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
{
trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
inc_irq_stat(irq_thermal_count);
smp_thermal_vector();
trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
ack_APIC_irq();
}
/* Thermal monitoring depends on APIC, ACPI and clock modulation */
static int intel_thermal_supported(struct cpuinfo_x86 *c)
{
if (!boot_cpu_has(X86_FEATURE_APIC))
return 0;
if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
return 0;
return 1;
}
void __init mcheck_intel_therm_init(void)
{
/*
* This function is only called on boot CPU. Save the init thermal
* LVT value on BSP and use that value to restore APs' thermal LVT
* entry BIOS programmed later
*/
if (intel_thermal_supported(&boot_cpu_data))
lvtthmr_init = apic_read(APIC_LVTTHMR);
}
void intel_init_thermal(struct cpuinfo_x86 *c)
{
unsigned int cpu = smp_processor_id();
int tm2 = 0;
u32 l, h;
if (!intel_thermal_supported(c))
return;
/*
* First check if its enabled already, in which case there might
* be some SMM goo which handles it, so we can't even put a handler
* since it might be delivered via SMI already:
*/
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
h = lvtthmr_init;
/*
* The initial value of thermal LVT entries on all APs always reads
* 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
* sequence to them and LVT registers are reset to 0s except for
* the mask bits which are set to 1s when APs receive INIT IPI.
* If BIOS takes over the thermal interrupt and sets its interrupt
* delivery mode to SMI (not fixed), it restores the value that the
* BIOS has programmed on AP based on BSP's info we saved since BIOS
* is always setting the same value for all threads/cores.
*/
if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
apic_write(APIC_LVTTHMR, lvtthmr_init);
if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
if (system_state == SYSTEM_BOOTING)
pr_debug("CPU%d: Thermal monitoring handled by SMI\n", cpu);
return;
}
/* early Pentium M models use different method for enabling TM2 */
if (cpu_has(c, X86_FEATURE_TM2)) {
if (c->x86 == 6 && (c->x86_model == 9 || c->x86_model == 13)) {
rdmsr(MSR_THERM2_CTL, l, h);
if (l & MSR_THERM2_CTL_TM_SELECT)
tm2 = 1;
} else if (l & MSR_IA32_MISC_ENABLE_TM2)
tm2 = 1;
}
/* We'll mask the thermal vector in the lapic till we're ready: */
h = THERMAL_APIC_VECTOR | APIC_DM_FIXED | APIC_LVT_MASKED;
apic_write(APIC_LVTTHMR, h);
rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
wrmsr(MSR_IA32_THERM_INTERRUPT,
(l | (THERM_INT_LOW_ENABLE
| THERM_INT_HIGH_ENABLE)) & ~THERM_INT_PLN_ENABLE, h);
else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
wrmsr(MSR_IA32_THERM_INTERRUPT,
l | (THERM_INT_LOW_ENABLE
| THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
else
wrmsr(MSR_IA32_THERM_INTERRUPT,
l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
if (cpu_has(c, X86_FEATURE_PTS)) {
rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
if (cpu_has(c, X86_FEATURE_PLN) && !int_pln_enable)
wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
(l | (PACKAGE_THERM_INT_LOW_ENABLE
| PACKAGE_THERM_INT_HIGH_ENABLE))
& ~PACKAGE_THERM_INT_PLN_ENABLE, h);
else if (cpu_has(c, X86_FEATURE_PLN) && int_pln_enable)
wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
l | (PACKAGE_THERM_INT_LOW_ENABLE
| PACKAGE_THERM_INT_HIGH_ENABLE
| PACKAGE_THERM_INT_PLN_ENABLE), h);
else
wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
l | (PACKAGE_THERM_INT_LOW_ENABLE
| PACKAGE_THERM_INT_HIGH_ENABLE), h);
}
smp_thermal_vector = intel_thermal_interrupt;
rdmsr(MSR_IA32_MISC_ENABLE, l, h);
wrmsr(MSR_IA32_MISC_ENABLE, l | MSR_IA32_MISC_ENABLE_TM1, h);
pr_info_once("CPU0: Thermal monitoring enabled (%s)\n",
tm2 ? "TM2" : "TM1");
/* enable thermal throttle processing */
atomic_set(&therm_throt_en, 1);
}

View File

@@ -830,7 +830,7 @@ static const struct attribute_group cpu_root_microcode_group = {
.attrs = cpu_root_microcode_attrs,
};
int __init microcode_init(void)
static int __init microcode_init(void)
{
struct cpuinfo_x86 *c = &boot_cpu_data;
int error;

View File

@@ -31,6 +31,11 @@
#include <asm/reboot.h>
#include <asm/nmi.h>
#include <clocksource/hyperv_timer.h>
#include <asm/numa.h>
/* Is Linux running as the root partition? */
bool hv_root_partition;
EXPORT_SYMBOL_GPL(hv_root_partition);
struct ms_hyperv_info ms_hyperv;
EXPORT_SYMBOL_GPL(ms_hyperv);
@@ -135,14 +140,32 @@ static void hv_machine_shutdown(void)
{
if (kexec_in_progress && hv_kexec_handler)
hv_kexec_handler();
/*
* Call hv_cpu_die() on all the CPUs, otherwise later the hypervisor
* corrupts the old VP Assist Pages and can crash the kexec kernel.
*/
if (kexec_in_progress && hyperv_init_cpuhp > 0)
cpuhp_remove_state(hyperv_init_cpuhp);
/* The function calls stop_other_cpus(). */
native_machine_shutdown();
/* Disable the hypercall page when there is only 1 active CPU. */
if (kexec_in_progress)
hyperv_cleanup();
}
static void hv_machine_crash_shutdown(struct pt_regs *regs)
{
if (hv_crash_handler)
hv_crash_handler(regs);
/* The function calls crash_smp_send_stop(). */
native_machine_crash_shutdown(regs);
/* Disable the hypercall page when there is only 1 active CPU. */
hyperv_cleanup();
}
#endif /* CONFIG_KEXEC_CORE */
#endif /* CONFIG_HYPERV */
@@ -208,6 +231,32 @@ static void __init hv_smp_prepare_boot_cpu(void)
hv_init_spinlocks();
#endif
}
static void __init hv_smp_prepare_cpus(unsigned int max_cpus)
{
#ifdef CONFIG_X86_64
int i;
int ret;
#endif
native_smp_prepare_cpus(max_cpus);
#ifdef CONFIG_X86_64
for_each_present_cpu(i) {
if (i == 0)
continue;
ret = hv_call_add_logical_proc(numa_cpu_node(i), i, cpu_physical_id(i));
BUG_ON(ret);
}
for_each_present_cpu(i) {
if (i == 0)
continue;
ret = hv_call_create_vp(numa_cpu_node(i), hv_current_partition_id, i, i);
BUG_ON(ret);
}
#endif
}
#endif
static void __init ms_hyperv_init_platform(void)
@@ -225,6 +274,7 @@ static void __init ms_hyperv_init_platform(void)
* Extract the features and hints
*/
ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
ms_hyperv.features_b = cpuid_ebx(HYPERV_CPUID_FEATURES);
ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
@@ -237,6 +287,22 @@ static void __init ms_hyperv_init_platform(void)
pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n",
ms_hyperv.max_vp_index, ms_hyperv.max_lp_index);
/*
* Check CPU management privilege.
*
* To mirror what Windows does we should extract CPU management
* features and use the ReservedIdentityBit to detect if Linux is the
* root partition. But that requires negotiating CPU management
* interface (a process to be finalized).
*
* For now, use the privilege flag as the indicator for running as
* root.
*/
if (cpuid_ebx(HYPERV_CPUID_FEATURES) & HV_CPU_MANAGEMENT) {
hv_root_partition = true;
pr_info("Hyper-V: running as root partition\n");
}
/*
* Extract host information.
*/
@@ -259,6 +325,14 @@ static void __init ms_hyperv_init_platform(void)
x86_platform.calibrate_cpu = hv_get_tsc_khz;
}
if (ms_hyperv.features_b & HV_ISOLATION) {
ms_hyperv.isolation_config_a = cpuid_eax(HYPERV_CPUID_ISOLATION_CONFIG);
ms_hyperv.isolation_config_b = cpuid_ebx(HYPERV_CPUID_ISOLATION_CONFIG);
pr_info("Hyper-V: Isolation Config: Group A 0x%x, Group B 0x%x\n",
ms_hyperv.isolation_config_a, ms_hyperv.isolation_config_b);
}
if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED) {
ms_hyperv.nested_features =
cpuid_eax(HYPERV_CPUID_NESTED_FEATURES);
@@ -348,6 +422,8 @@ static void __init ms_hyperv_init_platform(void)
# ifdef CONFIG_SMP
smp_ops.smp_prepare_boot_cpu = hv_smp_prepare_boot_cpu;
if (hv_root_partition)
smp_ops.smp_prepare_cpus = hv_smp_prepare_cpus;
# endif
/*

View File

@@ -537,9 +537,9 @@ static void __init print_out_mtrr_range_state(void)
if (!size_base)
continue;
size_base = to_size_factor(size_base, &size_factor),
size_base = to_size_factor(size_base, &size_factor);
start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
start_base = to_size_factor(start_base, &start_factor),
start_base = to_size_factor(start_base, &start_factor);
type = range_state[i].type;
pr_debug("reg %d, base: %ld%cB, range: %ld%cB, type %s\n",

View File

@@ -3,7 +3,6 @@
* This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
* because MTRRs can span up to 40 bits (36bits on most modern x86)
*/
#define DEBUG
#include <linux/export.h>
#include <linux/init.h>
@@ -167,9 +166,6 @@ static u8 mtrr_type_lookup_variable(u64 start, u64 end, u64 *partial_end,
*repeat = 0;
*uniform = 1;
/* Make end inclusive instead of exclusive */
end--;
prev_match = MTRR_TYPE_INVALID;
for (i = 0; i < num_var_ranges; ++i) {
unsigned short start_state, end_state, inclusive;
@@ -261,6 +257,9 @@ u8 mtrr_type_lookup(u64 start, u64 end, u8 *uniform)
int repeat;
u64 partial_end;
/* Make end inclusive instead of exclusive */
end--;
if (!mtrr_state_set)
return MTRR_TYPE_INVALID;

View File

@@ -31,8 +31,6 @@
System Programming Guide; Section 9.11. (1997 edition - PPro).
*/
#define DEBUG
#include <linux/types.h> /* FIXME: kvm_para.h needs this */
#include <linux/stop_machine.h>

View File

@@ -3,7 +3,7 @@
* local apic based NMI watchdog for various CPUs.
*
* This file also handles reservation of performance counters for coordination
* with other users (like oprofile).
* with other users.
*
* Note that these events normally don't tick when the CPU idles. This means
* the frequency varies with CPU load.
@@ -105,15 +105,6 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
}
/* checks for a bit availability (hack for oprofile) */
int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
{
BUG_ON(counter > NMI_MAX_COUNTER_BITS);
return !test_bit(counter, perfctr_nmi_owner);
}
EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
int reserve_perfctr_nmi(unsigned int msr)
{
unsigned int counter;

View File

@@ -572,6 +572,7 @@ union cpuid_0x10_x_edx {
void rdt_last_cmd_clear(void);
void rdt_last_cmd_puts(const char *s);
__printf(1, 2)
void rdt_last_cmd_printf(const char *fmt, ...);
void rdt_ctrl_update(void *arg);

View File

@@ -525,89 +525,70 @@ static void rdtgroup_remove(struct rdtgroup *rdtgrp)
kfree(rdtgrp);
}
struct task_move_callback {
struct callback_head work;
struct rdtgroup *rdtgrp;
};
static void move_myself(struct callback_head *head)
static void _update_task_closid_rmid(void *task)
{
struct task_move_callback *callback;
struct rdtgroup *rdtgrp;
callback = container_of(head, struct task_move_callback, work);
rdtgrp = callback->rdtgrp;
/*
* If resource group was deleted before this task work callback
* was invoked, then assign the task to root group and free the
* resource group.
* If the task is still current on this CPU, update PQR_ASSOC MSR.
* Otherwise, the MSR is updated when the task is scheduled in.
*/
if (atomic_dec_and_test(&rdtgrp->waitcount) &&
(rdtgrp->flags & RDT_DELETED)) {
current->closid = 0;
current->rmid = 0;
rdtgroup_remove(rdtgrp);
}
if (task == current)
resctrl_sched_in();
}
if (unlikely(current->flags & PF_EXITING))
goto out;
preempt_disable();
/* update PQR_ASSOC MSR to make resource group go into effect */
resctrl_sched_in();
preempt_enable();
out:
kfree(callback);
static void update_task_closid_rmid(struct task_struct *t)
{
if (IS_ENABLED(CONFIG_SMP) && task_curr(t))
smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1);
else
_update_task_closid_rmid(t);
}
static int __rdtgroup_move_task(struct task_struct *tsk,
struct rdtgroup *rdtgrp)
{
struct task_move_callback *callback;
int ret;
callback = kzalloc(sizeof(*callback), GFP_KERNEL);
if (!callback)
return -ENOMEM;
callback->work.func = move_myself;
callback->rdtgrp = rdtgrp;
/* If the task is already in rdtgrp, no need to move the task. */
if ((rdtgrp->type == RDTCTRL_GROUP && tsk->closid == rdtgrp->closid &&
tsk->rmid == rdtgrp->mon.rmid) ||
(rdtgrp->type == RDTMON_GROUP && tsk->rmid == rdtgrp->mon.rmid &&
tsk->closid == rdtgrp->mon.parent->closid))
return 0;
/*
* Take a refcount, so rdtgrp cannot be freed before the
* callback has been invoked.
* Set the task's closid/rmid before the PQR_ASSOC MSR can be
* updated by them.
*
* For ctrl_mon groups, move both closid and rmid.
* For monitor groups, can move the tasks only from
* their parent CTRL group.
*/
atomic_inc(&rdtgrp->waitcount);
ret = task_work_add(tsk, &callback->work, TWA_RESUME);
if (ret) {
/*
* Task is exiting. Drop the refcount and free the callback.
* No need to check the refcount as the group cannot be
* deleted before the write function unlocks rdtgroup_mutex.
*/
atomic_dec(&rdtgrp->waitcount);
kfree(callback);
rdt_last_cmd_puts("Task exited\n");
} else {
/*
* For ctrl_mon groups move both closid and rmid.
* For monitor groups, can move the tasks only from
* their parent CTRL group.
*/
if (rdtgrp->type == RDTCTRL_GROUP) {
tsk->closid = rdtgrp->closid;
tsk->rmid = rdtgrp->mon.rmid;
} else if (rdtgrp->type == RDTMON_GROUP) {
if (rdtgrp->mon.parent->closid == tsk->closid) {
tsk->rmid = rdtgrp->mon.rmid;
} else {
rdt_last_cmd_puts("Can't move task to different control group\n");
ret = -EINVAL;
}
if (rdtgrp->type == RDTCTRL_GROUP) {
WRITE_ONCE(tsk->closid, rdtgrp->closid);
WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid);
} else if (rdtgrp->type == RDTMON_GROUP) {
if (rdtgrp->mon.parent->closid == tsk->closid) {
WRITE_ONCE(tsk->rmid, rdtgrp->mon.rmid);
} else {
rdt_last_cmd_puts("Can't move task to different control group\n");
return -EINVAL;
}
}
return ret;
/*
* Ensure the task's closid and rmid are written before determining if
* the task is current that will decide if it will be interrupted.
*/
barrier();
/*
* By now, the task's closid and rmid are set. If the task is current
* on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
* group go into effect. If the task is not current, the MSR will be
* updated when the task is scheduled in.
*/
update_task_closid_rmid(tsk);
return 0;
}
static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
@@ -2329,22 +2310,18 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
for_each_process_thread(p, t) {
if (!from || is_closid_match(t, from) ||
is_rmid_match(t, from)) {
t->closid = to->closid;
t->rmid = to->mon.rmid;
WRITE_ONCE(t->closid, to->closid);
WRITE_ONCE(t->rmid, to->mon.rmid);
#ifdef CONFIG_SMP
/*
* This is safe on x86 w/o barriers as the ordering
* of writing to task_cpu() and t->on_cpu is
* reverse to the reading here. The detection is
* inaccurate as tasks might move or schedule
* before the smp function call takes place. In
* such a case the function call is pointless, but
* If the task is on a CPU, set the CPU in the mask.
* The detection is inaccurate as tasks might move or
* schedule before the smp function call takes place.
* In such a case the function call is pointless, but
* there is no other side effect.
*/
if (mask && t->on_cpu)
if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t))
cpumask_set_cpu(task_cpu(t), mask);
#endif
}
}
read_unlock(&tasklist_lock);

View File

@@ -40,11 +40,6 @@ static const struct cpuid_bit cpuid_bits[] = {
{ X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 },
{ X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 },
{ X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 },
{ X86_FEATURE_SME, CPUID_EAX, 0, 0x8000001f, 0 },
{ X86_FEATURE_SEV, CPUID_EAX, 1, 0x8000001f, 0 },
{ X86_FEATURE_SEV_ES, CPUID_EAX, 3, 0x8000001f, 0 },
{ X86_FEATURE_SME_COHERENT, CPUID_EAX, 10, 0x8000001f, 0 },
{ X86_FEATURE_VM_PAGE_FLUSH, CPUID_EAX, 2, 0x8000001f, 0 },
{ 0, 0, 0, 0, 0 }
};

View File

@@ -72,6 +72,9 @@ static int sgx_release(struct inode *inode, struct file *file)
synchronize_srcu(&encl->srcu);
mmu_notifier_unregister(&encl_mm->mmu_notifier, encl_mm->mm);
kfree(encl_mm);
/* 'encl_mm' is gone, put encl_mm->encl reference: */
kref_put(&encl->refcount, sgx_encl_release);
}
kref_put(&encl->refcount, sgx_encl_release);

View File

@@ -141,7 +141,6 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
struct sgx_encl_page *entry;
unsigned long phys_addr;
struct sgx_encl *encl;
unsigned long pfn;
vm_fault_t ret;
encl = vma->vm_private_data;
@@ -168,13 +167,6 @@ static vm_fault_t sgx_vma_fault(struct vm_fault *vmf)
phys_addr = sgx_get_epc_phys_addr(entry->epc_page);
/* Check if another thread got here first to insert the PTE. */
if (!follow_pfn(vma, addr, &pfn)) {
mutex_unlock(&encl->lock);
return VM_FAULT_NOPAGE;
}
ret = vmf_insert_pfn(vma, addr, PFN_DOWN(phys_addr));
if (ret != VM_FAULT_NOPAGE) {
mutex_unlock(&encl->lock);
@@ -481,6 +473,9 @@ static void sgx_mmu_notifier_free(struct mmu_notifier *mn)
{
struct sgx_encl_mm *encl_mm = container_of(mn, struct sgx_encl_mm, mmu_notifier);
/* 'encl_mm' is going away, put encl_mm->encl reference: */
kref_put(&encl_mm->encl->refcount, sgx_encl_release);
kfree(encl_mm);
}
@@ -534,6 +529,8 @@ int sgx_encl_mm_add(struct sgx_encl *encl, struct mm_struct *mm)
if (!encl_mm)
return -ENOMEM;
/* Grab a refcount for the encl_mm->encl reference: */
kref_get(&encl->refcount);
encl_mm->encl = encl;
encl_mm->mm = mm;
encl_mm->mmu_notifier.ops = &sgx_mmu_notifier_ops;

View File

@@ -700,25 +700,27 @@ static bool __init sgx_page_cache_init(void)
return true;
}
static void __init sgx_init(void)
static int __init sgx_init(void)
{
int ret;
int i;
if (!cpu_feature_enabled(X86_FEATURE_SGX))
return;
return -ENODEV;
if (!sgx_page_cache_init())
return;
return -ENOMEM;
if (!sgx_page_reclaimer_init())
if (!sgx_page_reclaimer_init()) {
ret = -ENOMEM;
goto err_page_cache;
}
ret = sgx_drv_init();
if (ret)
goto err_kthread;
return;
return 0;
err_kthread:
kthread_stop(ksgxd_tsk);
@@ -728,6 +730,8 @@ err_page_cache:
vfree(sgx_epc_sections[i].pages);
memunmap(sgx_epc_sections[i].virt_addr);
}
return ret;
}
device_initcall(sgx_init);

View File

@@ -25,10 +25,10 @@
#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
#ifdef CONFIG_SMP
unsigned int __max_die_per_package __read_mostly = 1;
EXPORT_SYMBOL(__max_die_per_package);
#ifdef CONFIG_SMP
/*
* Check if given CPUID extended toplogy "leaf" is implemented
*/

View File

@@ -128,12 +128,21 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac
static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info *info)
{
unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long));
unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr);
unsigned long *begin;
/*
* This is a software stack, so 'end' can be a valid stack pointer.
* It just means the stack is empty.
* @end points directly to the top most stack entry to avoid a -8
* adjustment in the stack switch hotpath. Adjust it back before
* calculating @begin.
*/
end++;
begin = end - (IRQ_STACK_SIZE / sizeof(long));
/*
* Due to the switching logic RSP can never be == @end because the
* final operation is 'popq %rsp' which means after that RSP points
* to the original stack and not to @end.
*/
if (stack < begin || stack >= end)
return false;
@@ -143,8 +152,9 @@ static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info
info->end = end;
/*
* The next stack pointer is the first thing pushed by the entry code
* after switching to the irq stack.
* The next stack pointer is stored at the top of the irq stack
* before switching to the irq stack. Actual stack entries are all
* below that.
*/
info->next_sp = (unsigned long *)*(end - 1);

View File

@@ -121,7 +121,7 @@ int copy_fpregs_to_fpstate(struct fpu *fpu)
}
EXPORT_SYMBOL(copy_fpregs_to_fpstate);
void kernel_fpu_begin(void)
void kernel_fpu_begin_mask(unsigned int kfpu_mask)
{
preempt_disable();
@@ -141,13 +141,14 @@ void kernel_fpu_begin(void)
}
__cpu_invalidate_fpregs_state();
if (boot_cpu_has(X86_FEATURE_XMM))
/* Put sane initial values into the control registers. */
if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM))
ldmxcsr(MXCSR_DEFAULT);
if (boot_cpu_has(X86_FEATURE_FPU))
if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU))
asm volatile ("fninit");
}
EXPORT_SYMBOL_GPL(kernel_fpu_begin);
EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask);
void kernel_fpu_end(void)
{

View File

@@ -167,14 +167,14 @@ void fpstate_sanitize_xstate(struct fpu *fpu)
fx->fop = 0;
fx->rip = 0;
fx->rdp = 0;
memset(&fx->st_space[0], 0, 128);
memset(fx->st_space, 0, sizeof(fx->st_space));
}
/*
* SSE is in init state
*/
if (!(xfeatures & XFEATURE_MASK_SSE))
memset(&fx->xmm_space[0], 0, 256);
memset(fx->xmm_space, 0, sizeof(fx->xmm_space));
/*
* First two features are FPU and SSE, which above we handled

View File

@@ -184,6 +184,7 @@ SYM_INNER_LABEL(ftrace_graph_call, SYM_L_GLOBAL)
* It is also used to copy the retq for trampolines.
*/
SYM_INNER_LABEL_ALIGN(ftrace_stub, SYM_L_WEAK)
UNWIND_HINT_FUNC
retq
SYM_FUNC_END(ftrace_epilogue)
@@ -276,7 +277,7 @@ SYM_INNER_LABEL(ftrace_regs_caller_end, SYM_L_GLOBAL)
restore_mcount_regs 8
/* Restore flags */
popfq
UNWIND_HINT_RET_OFFSET
UNWIND_HINT_FUNC
jmp ftrace_epilogue
SYM_FUNC_END(ftrace_regs_caller)
@@ -333,8 +334,7 @@ SYM_FUNC_START(ftrace_graph_caller)
retq
SYM_FUNC_END(ftrace_graph_caller)
SYM_CODE_START(return_to_handler)
UNWIND_HINT_EMPTY
SYM_FUNC_START(return_to_handler)
subq $24, %rsp
/* Save the return values */
@@ -349,5 +349,5 @@ SYM_CODE_START(return_to_handler)
movq (%rsp), %rax
addq $24, %rsp
JMP_NOSPEC rdi
SYM_CODE_END(return_to_handler)
SYM_FUNC_END(return_to_handler)
#endif

View File

@@ -269,6 +269,20 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
CPU_ENTRY_AREA_TOTAL_SIZE))
return true;
/*
* When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU
* GSBASE value via __per_cpu_offset or pcpu_unit_offsets.
*/
#ifdef CONFIG_SMP
if (within_area(addr, end, (unsigned long)__per_cpu_offset,
sizeof(unsigned long) * nr_cpu_ids))
return true;
#else
if (within_area(addr, end, (unsigned long)&pcpu_unit_offsets,
sizeof(pcpu_unit_offsets)))
return true;
#endif
for_each_possible_cpu(cpu) {
/* The original rw GDT is being used after load_direct_gdt() */
if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu),
@@ -293,6 +307,14 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
(unsigned long)&per_cpu(cpu_tlbstate, cpu),
sizeof(struct tlb_state)))
return true;
/*
* When in guest (X86_FEATURE_HYPERVISOR), local_db_save()
* will read per-cpu cpu_dr7 before clear dr7 register.
*/
if (within_area(addr, end, (unsigned long)&per_cpu(cpu_dr7, cpu),
sizeof(cpu_dr7)))
return true;
}
return false;
@@ -491,15 +513,12 @@ static int hw_breakpoint_handler(struct die_args *args)
struct perf_event *bp;
unsigned long *dr6_p;
unsigned long dr6;
bool bpx;
/* The DR6 value is pointed by args->err */
dr6_p = (unsigned long *)ERR_PTR(args->err);
dr6 = *dr6_p;
/* If it's a single step, TRAP bits are random */
if (dr6 & DR_STEP)
return NOTIFY_DONE;
/* Do an early return if no trap bits are set in DR6 */
if ((dr6 & DR_TRAP_BITS) == 0)
return NOTIFY_DONE;
@@ -509,28 +528,29 @@ static int hw_breakpoint_handler(struct die_args *args)
if (likely(!(dr6 & (DR_TRAP0 << i))))
continue;
/*
* The counter may be concurrently released but that can only
* occur from a call_rcu() path. We can then safely fetch
* the breakpoint, use its callback, touch its counter
* while we are in an rcu_read_lock() path.
*/
rcu_read_lock();
bp = this_cpu_read(bp_per_reg[i]);
if (!bp)
continue;
bpx = bp->hw.info.type == X86_BREAKPOINT_EXECUTE;
/*
* TF and data breakpoints are traps and can be merged, however
* instruction breakpoints are faults and will be raised
* separately.
*
* However DR6 can indicate both TF and instruction
* breakpoints. In that case take TF as that has precedence and
* delay the instruction breakpoint for the next exception.
*/
if (bpx && (dr6 & DR_STEP))
continue;
/*
* Reset the 'i'th TRAP bit in dr6 to denote completion of
* exception handling
*/
(*dr6_p) &= ~(DR_TRAP0 << i);
/*
* bp can be NULL due to lazy debug register switching
* or due to concurrent perf counter removing.
*/
if (!bp) {
rcu_read_unlock();
break;
}
perf_bp_event(bp, args->regs);
@@ -538,11 +558,10 @@ static int hw_breakpoint_handler(struct die_args *args)
* Set up resume flag to avoid breakpoint recursion when
* returning back to origin.
*/
if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE)
if (bpx)
args->regs->flags |= X86_EFLAGS_RF;
rcu_read_unlock();
}
/*
* Further processing in do_debug() is needed for a) user-space
* breakpoints (to generate signals) and b) when the system has

View File

@@ -21,6 +21,7 @@
#include <asm/hw_irq.h>
#include <asm/desc.h>
#include <asm/traps.h>
#include <asm/thermal.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/irq_vectors.h>
@@ -227,7 +228,7 @@ static __always_inline void handle_irq(struct irq_desc *desc,
struct pt_regs *regs)
{
if (IS_ENABLED(CONFIG_X86_64))
run_irq_on_irqstack_cond(desc->handle_irq, desc, regs);
generic_handle_irq_desc(desc);
else
__handle_irq(desc, regs);
}
@@ -374,3 +375,23 @@ void fixup_irqs(void)
}
}
#endif
#ifdef CONFIG_X86_THERMAL_VECTOR
static void smp_thermal_vector(void)
{
if (x86_thermal_enabled())
intel_thermal_interrupt();
else
pr_err("CPU%d: Unexpected LVT thermal interrupt!\n",
smp_processor_id());
}
DEFINE_IDTENTRY_SYSVEC(sysvec_thermal)
{
trace_thermal_apic_entry(THERMAL_APIC_VECTOR);
inc_irq_stat(irq_thermal_count);
smp_thermal_vector();
trace_thermal_apic_exit(THERMAL_APIC_VECTOR);
ack_APIC_irq();
}
#endif

View File

@@ -22,6 +22,7 @@
#include <asm/apic.h>
#include <asm/nospec-branch.h>
#include <asm/softirq_stack.h>
#ifdef CONFIG_DEBUG_STACKOVERFLOW

View File

@@ -20,6 +20,7 @@
#include <linux/sched/task_stack.h>
#include <asm/cpu_entry_area.h>
#include <asm/softirq_stack.h>
#include <asm/irq_stack.h>
#include <asm/io_apic.h>
#include <asm/apic.h>
@@ -48,7 +49,8 @@ static int map_irq_stack(unsigned int cpu)
if (!va)
return -ENOMEM;
per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE;
/* Store actual TOS to avoid adjustment in the hotpath */
per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
return 0;
}
#else
@@ -60,7 +62,8 @@ static int map_irq_stack(unsigned int cpu)
{
void *va = per_cpu_ptr(&irq_stack_backing_store, cpu);
per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE;
/* Store actual TOS to avoid adjustment in the hotpath */
per_cpu(hardirq_stack_ptr, cpu) = va + IRQ_STACK_SIZE - 8;
return 0;
}
#endif
@@ -71,8 +74,3 @@ int irq_init_percpu_irqstack(unsigned int cpu)
return 0;
return map_irq_stack(cpu);
}
void do_softirq_own_stack(void)
{
run_on_irqstack_cond(__do_softirq, NULL);
}

View File

@@ -13,14 +13,3 @@ SYM_FUNC_START(native_save_fl)
ret
SYM_FUNC_END(native_save_fl)
EXPORT_SYMBOL(native_save_fl)
/*
* void native_restore_fl(unsigned long flags)
* %eax/%rdi: flags
*/
SYM_FUNC_START(native_restore_fl)
push %_ASM_ARG1
popf
ret
SYM_FUNC_END(native_restore_fl)
EXPORT_SYMBOL(native_restore_fl)

View File

@@ -132,26 +132,6 @@ void synthesize_relcall(void *dest, void *from, void *to)
}
NOKPROBE_SYMBOL(synthesize_relcall);
/*
* Skip the prefixes of the instruction.
*/
static kprobe_opcode_t *skip_prefixes(kprobe_opcode_t *insn)
{
insn_attr_t attr;
attr = inat_get_opcode_attribute((insn_byte_t)*insn);
while (inat_is_legacy_prefix(attr)) {
insn++;
attr = inat_get_opcode_attribute((insn_byte_t)*insn);
}
#ifdef CONFIG_X86_64
if (inat_is_rex_prefix(attr))
insn++;
#endif
return insn;
}
NOKPROBE_SYMBOL(skip_prefixes);
/*
* Returns non-zero if INSN is boostable.
* RIP relative instructions are adjusted at copying time in 64 bits mode
@@ -311,25 +291,6 @@ static int can_probe(unsigned long paddr)
return (addr == paddr);
}
/*
* Returns non-zero if opcode modifies the interrupt flag.
*/
static int is_IF_modifier(kprobe_opcode_t *insn)
{
/* Skip prefixes */
insn = skip_prefixes(insn);
switch (*insn) {
case 0xfa: /* cli */
case 0xfb: /* sti */
case 0xcf: /* iret/iretd */
case 0x9d: /* popf/popfd */
return 1;
}
return 0;
}
/*
* Copy an instruction with recovering modified instruction by kprobes
* and adjust the displacement if the instruction uses the %rip-relative
@@ -411,9 +372,9 @@ static int prepare_boost(kprobe_opcode_t *buf, struct kprobe *p,
synthesize_reljump(buf + len, p->ainsn.insn + len,
p->addr + insn->length);
len += JMP32_INSN_SIZE;
p->ainsn.boostable = true;
p->ainsn.boostable = 1;
} else {
p->ainsn.boostable = false;
p->ainsn.boostable = 0;
}
return len;
@@ -450,6 +411,67 @@ void free_insn_page(void *page)
module_memfree(page);
}
static void set_resume_flags(struct kprobe *p, struct insn *insn)
{
insn_byte_t opcode = insn->opcode.bytes[0];
switch (opcode) {
case 0xfa: /* cli */
case 0xfb: /* sti */
case 0x9d: /* popf/popfd */
/* Check whether the instruction modifies Interrupt Flag or not */
p->ainsn.if_modifier = 1;
break;
case 0x9c: /* pushfl */
p->ainsn.is_pushf = 1;
break;
case 0xcf: /* iret */
p->ainsn.if_modifier = 1;
fallthrough;
case 0xc2: /* ret/lret */
case 0xc3:
case 0xca:
case 0xcb:
case 0xea: /* jmp absolute -- ip is correct */
/* ip is already adjusted, no more changes required */
p->ainsn.is_abs_ip = 1;
/* Without resume jump, this is boostable */
p->ainsn.boostable = 1;
break;
case 0xe8: /* call relative - Fix return addr */
p->ainsn.is_call = 1;
break;
#ifdef CONFIG_X86_32
case 0x9a: /* call absolute -- same as call absolute, indirect */
p->ainsn.is_call = 1;
p->ainsn.is_abs_ip = 1;
break;
#endif
case 0xff:
opcode = insn->opcode.bytes[1];
if ((opcode & 0x30) == 0x10) {
/*
* call absolute, indirect
* Fix return addr; ip is correct.
* But this is not boostable
*/
p->ainsn.is_call = 1;
p->ainsn.is_abs_ip = 1;
break;
} else if (((opcode & 0x31) == 0x20) ||
((opcode & 0x31) == 0x21)) {
/*
* jmp near and far, absolute indirect
* ip is correct.
*/
p->ainsn.is_abs_ip = 1;
/* Without resume jump, this is boostable */
p->ainsn.boostable = 1;
}
break;
}
}
static int arch_copy_kprobe(struct kprobe *p)
{
struct insn insn;
@@ -467,8 +489,8 @@ static int arch_copy_kprobe(struct kprobe *p)
*/
len = prepare_boost(buf, p, &insn);
/* Check whether the instruction modifies Interrupt Flag or not */
p->ainsn.if_modifier = is_IF_modifier(buf);
/* Analyze the opcode and set resume flags */
set_resume_flags(p, &insn);
/* Also, displacement change doesn't affect the first byte */
p->opcode = buf[0];
@@ -491,6 +513,9 @@ int arch_prepare_kprobe(struct kprobe *p)
if (!can_probe((unsigned long)p->addr))
return -EILSEQ;
memset(&p->ainsn, 0, sizeof(p->ainsn));
/* insn: must be on special executable page on x86. */
p->ainsn.insn = get_insn_slot();
if (!p->ainsn.insn)
@@ -806,11 +831,6 @@ NOKPROBE_SYMBOL(trampoline_handler);
* 2) If the single-stepped instruction was a call, the return address
* that is atop the stack is the address following the copied instruction.
* We need to make it the address following the original instruction.
*
* If this is the first time we've single-stepped the instruction at
* this probepoint, and the instruction is boostable, boost it: add a
* jump instruction after the copied instruction, that jumps to the next
* instruction after the probepoint.
*/
static void resume_execution(struct kprobe *p, struct pt_regs *regs,
struct kprobe_ctlblk *kcb)
@@ -818,60 +838,20 @@ static void resume_execution(struct kprobe *p, struct pt_regs *regs,
unsigned long *tos = stack_addr(regs);
unsigned long copy_ip = (unsigned long)p->ainsn.insn;
unsigned long orig_ip = (unsigned long)p->addr;
kprobe_opcode_t *insn = p->ainsn.insn;
/* Skip prefixes */
insn = skip_prefixes(insn);
regs->flags &= ~X86_EFLAGS_TF;
switch (*insn) {
case 0x9c: /* pushfl */
/* Fixup the contents of top of stack */
if (p->ainsn.is_pushf) {
*tos &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF);
*tos |= kcb->kprobe_old_flags;
break;
case 0xc2: /* iret/ret/lret */
case 0xc3:
case 0xca:
case 0xcb:
case 0xcf:
case 0xea: /* jmp absolute -- ip is correct */
/* ip is already adjusted, no more changes required */
p->ainsn.boostable = true;
goto no_change;
case 0xe8: /* call relative - Fix return addr */
} else if (p->ainsn.is_call) {
*tos = orig_ip + (*tos - copy_ip);
break;
#ifdef CONFIG_X86_32
case 0x9a: /* call absolute -- same as call absolute, indirect */
*tos = orig_ip + (*tos - copy_ip);
goto no_change;
#endif
case 0xff:
if ((insn[1] & 0x30) == 0x10) {
/*
* call absolute, indirect
* Fix return addr; ip is correct.
* But this is not boostable
*/
*tos = orig_ip + (*tos - copy_ip);
goto no_change;
} else if (((insn[1] & 0x31) == 0x20) ||
((insn[1] & 0x31) == 0x21)) {
/*
* jmp near and far, absolute indirect
* ip is correct. And this is boostable
*/
p->ainsn.boostable = true;
goto no_change;
}
break;
default:
break;
}
regs->ip += orig_ip - copy_ip;
if (!p->ainsn.is_abs_ip)
regs->ip += orig_ip - copy_ip;
no_change:
restore_btf();
}
NOKPROBE_SYMBOL(resume_execution);

View File

@@ -398,9 +398,15 @@ static void free_ldt_pgtables(struct mm_struct *mm)
if (!boot_cpu_has(X86_FEATURE_PTI))
return;
tlb_gather_mmu(&tlb, mm, start, end);
/*
* Although free_pgd_range() is intended for freeing user
* page-tables, it also works out for kernel mappings on x86.
* We use tlb_gather_mmu_fullmm() to avoid confusing the
* range-tracking logic in __tlb_adjust_range().
*/
tlb_gather_mmu_fullmm(&tlb, mm);
free_pgd_range(&tlb, start, end, start, end);
tlb_finish_mmu(&tlb, start, end);
tlb_finish_mmu(&tlb);
#endif
}

View File

@@ -114,6 +114,7 @@ int apply_relocate(Elf32_Shdr *sechdrs,
*location += sym->st_value;
break;
case R_386_PC32:
case R_386_PLT32:
/* Add the value, subtract its position */
*location += sym->st_value - (uint32_t)location;
break;

View File

@@ -182,6 +182,13 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
err = security_locked_down(LOCKDOWN_MSR);
if (err)
break;
err = filter_write(regs[1]);
if (err)
return err;
add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
err = wrmsr_safe_regs_on_cpu(cpu, regs);
if (err)
break;

View File

@@ -135,8 +135,7 @@ unsigned paravirt_patch_default(u8 type, void *insn_buff,
else if (opfunc == _paravirt_ident_64)
ret = paravirt_patch_ident_64(insn_buff, len);
else if (type == PARAVIRT_PATCH(cpu.iret) ||
type == PARAVIRT_PATCH(cpu.usergs_sysret64))
else if (type == PARAVIRT_PATCH(cpu.iret))
/* If operation requires a jmp, then jmp */
ret = paravirt_patch_jmp(insn_buff, opfunc, addr, len);
#endif
@@ -170,7 +169,6 @@ static u64 native_steal_clock(int cpu)
/* These are in entry.S */
extern void native_iret(void);
extern void native_usergs_sysret64(void);
static struct resource reserve_ioports = {
.start = 0,
@@ -310,9 +308,7 @@ struct paravirt_patch_template pv_ops = {
.cpu.load_sp0 = native_load_sp0,
.cpu.usergs_sysret64 = native_usergs_sysret64,
.cpu.iret = native_iret,
.cpu.swapgs = native_swapgs,
#ifdef CONFIG_X86_IOPL_IOPERM
.cpu.invalidate_io_bitmap = native_tss_invalidate_io_bitmap,
@@ -324,7 +320,6 @@ struct paravirt_patch_template pv_ops = {
/* Irq ops. */
.irq.save_fl = __PV_IS_CALLEE_SAVE(native_save_fl),
.irq.restore_fl = __PV_IS_CALLEE_SAVE(native_restore_fl),
.irq.irq_disable = __PV_IS_CALLEE_SAVE(native_irq_disable),
.irq.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
.irq.safe_halt = native_safe_halt,

View File

@@ -25,10 +25,7 @@ struct patch_xxl {
const unsigned char mmu_read_cr2[3];
const unsigned char mmu_read_cr3[3];
const unsigned char mmu_write_cr3[3];
const unsigned char irq_restore_fl[2];
const unsigned char cpu_wbinvd[2];
const unsigned char cpu_usergs_sysret64[6];
const unsigned char cpu_swapgs[3];
const unsigned char mov64[3];
};
@@ -39,11 +36,7 @@ static const struct patch_xxl patch_data_xxl = {
.mmu_read_cr2 = { 0x0f, 0x20, 0xd0 }, // mov %cr2, %[re]ax
.mmu_read_cr3 = { 0x0f, 0x20, 0xd8 }, // mov %cr3, %[re]ax
.mmu_write_cr3 = { 0x0f, 0x22, 0xdf }, // mov %rdi, %cr3
.irq_restore_fl = { 0x57, 0x9d }, // push %rdi; popfq
.cpu_wbinvd = { 0x0f, 0x09 }, // wbinvd
.cpu_usergs_sysret64 = { 0x0f, 0x01, 0xf8,
0x48, 0x0f, 0x07 }, // swapgs; sysretq
.cpu_swapgs = { 0x0f, 0x01, 0xf8 }, // swapgs
.mov64 = { 0x48, 0x89, 0xf8 }, // mov %rdi, %rax
};
@@ -76,7 +69,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr,
switch (type) {
#ifdef CONFIG_PARAVIRT_XXL
PATCH_CASE(irq, restore_fl, xxl, insn_buff, len);
PATCH_CASE(irq, save_fl, xxl, insn_buff, len);
PATCH_CASE(irq, irq_enable, xxl, insn_buff, len);
PATCH_CASE(irq, irq_disable, xxl, insn_buff, len);
@@ -85,8 +77,6 @@ unsigned int native_patch(u8 type, void *insn_buff, unsigned long addr,
PATCH_CASE(mmu, read_cr3, xxl, insn_buff, len);
PATCH_CASE(mmu, write_cr3, xxl, insn_buff, len);
PATCH_CASE(cpu, usergs_sysret64, xxl, insn_buff, len);
PATCH_CASE(cpu, swapgs, xxl, insn_buff, len);
PATCH_CASE(cpu, wbinvd, xxl, insn_buff, len);
#endif

View File

@@ -4,9 +4,6 @@
#include <linux/string.h>
#include <linux/kallsyms.h>
#define DEBUG 1
static struct iommu_table_entry * __init
find_dependents_of(struct iommu_table_entry *start,
struct iommu_table_entry *finish,

View File

@@ -161,7 +161,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
#endif
/* Kernel thread ? */
if (unlikely(p->flags & PF_KTHREAD)) {
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(childregs, 0, sizeof(struct pt_regs));
kthread_frame_init(frame, sp, arg);
return 0;

View File

@@ -539,7 +539,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
int cpu = smp_processor_id();
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
this_cpu_read(irq_count) != -1);
this_cpu_read(hardirq_stack_inuse));
if (!test_thread_flag(TIF_NEED_FPU_LOAD))
switch_fpu_prepare(prev_fpu, cpu);

View File

@@ -704,6 +704,9 @@ void ptrace_disable(struct task_struct *child)
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
static const struct user_regset_view user_x86_32_view; /* Initialized below. */
#endif
#ifdef CONFIG_X86_64
static const struct user_regset_view user_x86_64_view; /* Initialized below. */
#endif
long arch_ptrace(struct task_struct *child, long request,
unsigned long addr, unsigned long data)
@@ -711,6 +714,14 @@ long arch_ptrace(struct task_struct *child, long request,
int ret;
unsigned long __user *datap = (unsigned long __user *)data;
#ifdef CONFIG_X86_64
/* This is native 64-bit ptrace() */
const struct user_regset_view *regset_view = &user_x86_64_view;
#else
/* This is native 32-bit ptrace() */
const struct user_regset_view *regset_view = &user_x86_32_view;
#endif
switch (request) {
/* read the word at location addr in the USER area. */
case PTRACE_PEEKUSR: {
@@ -749,28 +760,28 @@ long arch_ptrace(struct task_struct *child, long request,
case PTRACE_GETREGS: /* Get all gp regs from the child. */
return copy_regset_to_user(child,
task_user_regset_view(current),
regset_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_SETREGS: /* Set all gp regs in the child. */
return copy_regset_from_user(child,
task_user_regset_view(current),
regset_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_GETFPREGS: /* Get the child FPU state. */
return copy_regset_to_user(child,
task_user_regset_view(current),
regset_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
case PTRACE_SETFPREGS: /* Set the child FPU state. */
return copy_regset_from_user(child,
task_user_regset_view(current),
regset_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
@@ -1152,28 +1163,28 @@ static long x32_arch_ptrace(struct task_struct *child,
case PTRACE_GETREGS: /* Get all gp regs from the child. */
return copy_regset_to_user(child,
task_user_regset_view(current),
&user_x86_64_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_SETREGS: /* Set all gp regs in the child. */
return copy_regset_from_user(child,
task_user_regset_view(current),
&user_x86_64_view,
REGSET_GENERAL,
0, sizeof(struct user_regs_struct),
datap);
case PTRACE_GETFPREGS: /* Get the child FPU state. */
return copy_regset_to_user(child,
task_user_regset_view(current),
&user_x86_64_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
case PTRACE_SETFPREGS: /* Set the child FPU state. */
return copy_regset_from_user(child,
task_user_regset_view(current),
&user_x86_64_view,
REGSET_FP,
0, sizeof(struct user_i387_struct),
datap);
@@ -1309,6 +1320,25 @@ void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask)
xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask;
}
/*
* This is used by the core dump code to decide which regset to dump. The
* core dump code writes out the resulting .e_machine and the corresponding
* regsets. This is suboptimal if the task is messing around with its CS.L
* field, but at worst the core dump will end up missing some information.
*
* Unfortunately, it is also used by the broken PTRACE_GETREGSET and
* PTRACE_SETREGSET APIs. These APIs look at the .regsets field but have
* no way to make sure that the e_machine they use matches the caller's
* expectations. The result is that the data format returned by
* PTRACE_GETREGSET depends on the returned CS field (and even the offset
* of the returned CS field depends on its value!) and the data format
* accepted by PTRACE_SETREGSET is determined by the old CS value. The
* upshot is that it is basically impossible to use these APIs correctly.
*
* The best way to fix it in the long run would probably be to add new
* improved ptrace() APIs to read and write registers reliably, possibly by
* allowing userspace to select the ELF e_machine variant that they expect.
*/
const struct user_regset_view *task_user_regset_view(struct task_struct *task)
{
#ifdef CONFIG_IA32_EMULATION

View File

@@ -477,6 +477,15 @@ static const struct dmi_system_id reboot_dmi_table[] __initconst = {
},
},
{ /* PCIe Wifi card isn't detected after reboot otherwise */
.callback = set_pci_reboot,
.ident = "Zotac ZBOX CI327 nano",
.matches = {
DMI_MATCH(DMI_SYS_VENDOR, "NA"),
DMI_MATCH(DMI_PRODUCT_NAME, "ZBOX-CI327NANO-GS-01"),
},
},
/* Sony */
{ /* Handle problems with rebooting on Sony VGN-Z540N */
.callback = set_bios_reboot,
@@ -538,31 +547,21 @@ static void emergency_vmx_disable_all(void)
local_irq_disable();
/*
* We need to disable VMX on all CPUs before rebooting, otherwise
* we risk hanging up the machine, because the CPU ignores INIT
* signals when VMX is enabled.
* Disable VMX on all CPUs before rebooting, otherwise we risk hanging
* the machine, because the CPU blocks INIT when it's in VMX root.
*
* We can't take any locks and we may be on an inconsistent
* state, so we use NMIs as IPIs to tell the other CPUs to disable
* VMX and halt.
* We can't take any locks and we may be on an inconsistent state, so
* use NMIs as IPIs to tell the other CPUs to exit VMX root and halt.
*
* For safety, we will avoid running the nmi_shootdown_cpus()
* stuff unnecessarily, but we don't have a way to check
* if other CPUs have VMX enabled. So we will call it only if the
* CPU we are running on has VMX enabled.
*
* We will miss cases where VMX is not enabled on all CPUs. This
* shouldn't do much harm because KVM always enable VMX on all
* CPUs anyway. But we can miss it on the small window where KVM
* is still enabling VMX.
* Do the NMI shootdown even if VMX if off on _this_ CPU, as that
* doesn't prevent a different CPU from being in VMX root operation.
*/
if (cpu_has_vmx() && cpu_vmx_enabled()) {
/* Disable VMX on this CPU. */
cpu_vmxoff();
if (cpu_has_vmx()) {
/* Safely force _this_ CPU out of VMX root operation. */
__cpu_emergency_vmxoff();
/* Halt and disable VMX on the other CPUs */
/* Halt and exit VMX root operation on the other CPUs. */
nmi_shootdown_cpus(vmxoff_nmi);
}
}

View File

@@ -16,7 +16,6 @@
#include <linux/memblock.h>
#include <linux/pci.h>
#include <linux/root_dev.h>
#include <linux/sfi.h>
#include <linux/hugetlb.h>
#include <linux/tboot.h>
#include <linux/usb/xhci-dbgp.h>
@@ -1185,7 +1184,6 @@ void __init setup_arch(char **cmdline_p)
* Read APIC and some other early information from ACPI tables.
*/
acpi_boot_init();
sfi_init();
x86_dtb_init();
/*

View File

@@ -305,14 +305,14 @@ static enum es_result vc_ioio_exitinfo(struct es_em_ctxt *ctxt, u64 *exitinfo)
case 0xe4:
case 0xe5:
*exitinfo |= IOIO_TYPE_IN;
*exitinfo |= (u64)insn->immediate.value << 16;
*exitinfo |= (u8)insn->immediate.value << 16;
break;
/* OUT immediate opcodes */
case 0xe6:
case 0xe7:
*exitinfo |= IOIO_TYPE_OUT;
*exitinfo |= (u64)insn->immediate.value << 16;
*exitinfo |= (u8)insn->immediate.value << 16;
break;
/* IN register opcodes */

View File

@@ -225,7 +225,7 @@ static inline u64 sev_es_rd_ghcb_msr(void)
return __rdmsr(MSR_AMD64_SEV_ES_GHCB);
}
static inline void sev_es_wr_ghcb_msr(u64 val)
static __always_inline void sev_es_wr_ghcb_msr(u64 val)
{
u32 low, high;
@@ -286,6 +286,12 @@ static enum es_result vc_write_mem(struct es_em_ctxt *ctxt,
u16 d2;
u8 d1;
/* If instruction ran in kernel mode and the I/O buffer is in kernel space */
if (!user_mode(ctxt->regs) && !access_ok(target, size)) {
memcpy(dst, buf, size);
return ES_OK;
}
switch (size) {
case 1:
memcpy(&d1, buf, 1);
@@ -335,6 +341,12 @@ static enum es_result vc_read_mem(struct es_em_ctxt *ctxt,
u16 d2;
u8 d1;
/* If instruction ran in kernel mode and the I/O buffer is in kernel space */
if (!user_mode(ctxt->regs) && !access_ok(s, size)) {
memcpy(buf, src, size);
return ES_OK;
}
switch (size) {
case 1:
if (get_user(d1, s))

View File

@@ -56,6 +56,7 @@
#include <linux/numa.h>
#include <linux/pgtable.h>
#include <linux/overflow.h>
#include <linux/syscore_ops.h>
#include <asm/acpi.h>
#include <asm/desc.h>
@@ -1832,6 +1833,7 @@ void arch_set_max_freq_ratio(bool turbo_disabled)
arch_max_freq_ratio = turbo_disabled ? SCHED_CAPACITY_SCALE :
arch_turbo_freq_ratio;
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_ratio);
static bool turbo_disabled(void)
{
@@ -2083,6 +2085,23 @@ static void init_counter_refs(void)
this_cpu_write(arch_prev_mperf, mperf);
}
#ifdef CONFIG_PM_SLEEP
static struct syscore_ops freq_invariance_syscore_ops = {
.resume = init_counter_refs,
};
static void register_freq_invariance_syscore_ops(void)
{
/* Bail out if registered already. */
if (freq_invariance_syscore_ops.node.prev)
return;
register_syscore_ops(&freq_invariance_syscore_ops);
}
#else
static inline void register_freq_invariance_syscore_ops(void) {}
#endif
static void init_freq_invariance(bool secondary, bool cppc_ready)
{
bool ret = false;
@@ -2109,6 +2128,7 @@ static void init_freq_invariance(bool secondary, bool cppc_ready)
if (ret) {
init_counter_refs();
static_branch_enable(&arch_scale_freq_key);
register_freq_invariance_syscore_ops();
pr_info("Estimated ratio of average max frequency by base frequency (times 1024): %llu\n", arch_max_freq_ratio);
} else {
pr_debug("Couldn't determine max cpu frequency, necessary for scale-invariant accounting.\n");

View File

@@ -11,14 +11,26 @@ enum insn_type {
RET = 3, /* tramp / site cond-tail-call */
};
/*
* data16 data16 xorq %rax, %rax - a single 5 byte instruction that clears %rax
* The REX.W cancels the effect of any data16.
*/
static const u8 xor5rax[] = { 0x66, 0x66, 0x48, 0x31, 0xc0 };
static void __ref __static_call_transform(void *insn, enum insn_type type, void *func)
{
const void *emulate = NULL;
int size = CALL_INSN_SIZE;
const void *code;
switch (type) {
case CALL:
code = text_gen_insn(CALL_INSN_OPCODE, insn, func);
if (func == &__static_call_return0) {
emulate = code;
code = &xor5rax;
}
break;
case NOP:
@@ -41,7 +53,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void
if (unlikely(system_state == SYSTEM_BOOTING))
return text_poke_early(insn, code, size);
text_poke_bp(insn, code, size, NULL);
text_poke_bp(insn, code, size, emulate);
}
static void __static_call_validate(void *insn, bool tail)
@@ -54,7 +66,8 @@ static void __static_call_validate(void *insn, bool tail)
return;
} else {
if (opcode == CALL_INSN_OPCODE ||
!memcmp(insn, ideal_nops[NOP_ATOMIC5], 5))
!memcmp(insn, ideal_nops[NOP_ATOMIC5], 5) ||
!memcmp(insn, xor5rax, 5))
return;
}

View File

@@ -127,12 +127,17 @@ static int enable_single_step(struct task_struct *child)
regs->flags |= X86_EFLAGS_TF;
/*
* Always set TIF_SINGLESTEP - this guarantees that
* we single-step system calls etc.. This will also
* Always set TIF_SINGLESTEP. This will also
* cause us to set TF when returning to user mode.
*/
set_tsk_thread_flag(child, TIF_SINGLESTEP);
/*
* Ensure that a trap is triggered once stepping out of a system
* call prior to executing any user instruction.
*/
set_task_syscall_work(child, SYSCALL_EXIT_TRAP);
oflags = regs->flags;
/* Set TF on the kernel stack.. */
@@ -230,6 +235,7 @@ void user_disable_single_step(struct task_struct *child)
/* Always clear TIF_SINGLESTEP... */
clear_tsk_thread_flag(child, TIF_SINGLESTEP);
clear_task_syscall_work(child, SYSCALL_EXIT_TRAP);
/* But touch TF only if it was set by us.. */
if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF))

View File

@@ -90,14 +90,10 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
unsigned long, prot, unsigned long, flags,
unsigned long, fd, unsigned long, off)
{
long error;
error = -EINVAL;
if (off & ~PAGE_MASK)
goto out;
return -EINVAL;
error = ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
out:
return error;
return ksys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
}
static void find_start_end(unsigned long addr, unsigned long flags,

View File

@@ -471,7 +471,7 @@ bool unwind_next_frame(struct unwind_state *state)
break;
case ORC_REG_SP_INDIRECT:
sp = state->sp + orc->sp_offset;
sp = state->sp;
indirect = true;
break;
@@ -521,6 +521,9 @@ bool unwind_next_frame(struct unwind_state *state)
if (indirect) {
if (!deref_stack_reg(state, sp, &sp))
goto err;
if (orc->sp_reg == ORC_REG_SP_INDIRECT)
sp += orc->sp_offset;
}
/* Find IP, SP and possibly regs: */

View File

@@ -134,7 +134,11 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
unsafe_put_user(regs->ds, &user->regs.ds, Efault_end);
unsafe_put_user(regs->fs, &user->regs.fs, Efault_end);
unsafe_put_user(regs->gs, &user->regs.gs, Efault_end);
unsafe_put_user(vm86->screen_bitmap, &user->screen_bitmap, Efault_end);
/*
* Don't write screen_bitmap in case some user had a value there
* and expected it to remain unchanged.
*/
user_access_end();
@@ -160,49 +164,6 @@ Efault:
do_exit(SIGSEGV);
}
static void mark_screen_rdonly(struct mm_struct *mm)
{
struct vm_area_struct *vma;
spinlock_t *ptl;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
int i;
mmap_write_lock(mm);
pgd = pgd_offset(mm, 0xA0000);
if (pgd_none_or_clear_bad(pgd))
goto out;
p4d = p4d_offset(pgd, 0xA0000);
if (p4d_none_or_clear_bad(p4d))
goto out;
pud = pud_offset(p4d, 0xA0000);
if (pud_none_or_clear_bad(pud))
goto out;
pmd = pmd_offset(pud, 0xA0000);
if (pmd_trans_huge(*pmd)) {
vma = find_vma(mm, 0xA0000);
split_huge_pmd(vma, pmd, 0xA0000);
}
if (pmd_none_or_clear_bad(pmd))
goto out;
pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
for (i = 0; i < 32; i++) {
if (pte_present(*pte))
set_pte(pte, pte_wrprotect(*pte));
pte++;
}
pte_unmap_unlock(pte, ptl);
out:
mmap_write_unlock(mm);
flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, PAGE_SHIFT, false);
}
static int do_vm86_irq_handling(int subfunction, int irqnumber);
static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);
@@ -282,6 +243,15 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
offsetof(struct vm86_struct, int_revectored)))
return -EFAULT;
/* VM86_SCREEN_BITMAP had numerous bugs and appears to have no users. */
if (v.flags & VM86_SCREEN_BITMAP) {
char comm[TASK_COMM_LEN];
pr_info_once("vm86: '%s' uses VM86_SCREEN_BITMAP, which is no longer supported\n", get_task_comm(comm, current));
return -EINVAL;
}
memset(&vm86regs, 0, sizeof(vm86regs));
vm86regs.pt.bx = v.regs.ebx;
@@ -302,7 +272,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
vm86regs.gs = v.regs.gs;
vm86->flags = v.flags;
vm86->screen_bitmap = v.screen_bitmap;
vm86->cpu_type = v.cpu_type;
if (copy_from_user(&vm86->int_revectored,
@@ -370,9 +339,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
update_task_stack(tsk);
preempt_enable();
if (vm86->flags & VM86_SCREEN_BITMAP)
mark_screen_rdonly(tsk->mm);
memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
return regs->ax;
}