linux/arch/x86/kernel/apic/x2apic_phys.c
Dave Hansen 25a068b8e9 x86/apic: Add extra serialization for non-serializing MSRs
Jan Kiszka reported that the x2apic_wrmsr_fence() function uses a plain
MFENCE while the Intel SDM (10.12.3 MSR Access in x2APIC Mode) calls for
MFENCE; LFENCE.

Short summary: we have special MSRs that have weaker ordering than all
the rest. Add fencing consistent with current SDM recommendations.

This is not known to cause any issues in practice, only in theory.

Longer story below:

The reason the kernel uses a different semantic is that the SDM changed
(roughly in late 2017). The SDM changed because folks at Intel were
auditing all of the recommended fences in the SDM and realized that the
x2apic fences were insufficient.

Why was the pain MFENCE judged insufficient?

WRMSR itself is normally a serializing instruction. No fences are needed
because the instruction itself serializes everything.

But, there are explicit exceptions for this serializing behavior written
into the WRMSR instruction documentation for two classes of MSRs:
IA32_TSC_DEADLINE and the X2APIC MSRs.

Back to x2apic: WRMSR is *not* serializing in this specific case.
But why is MFENCE insufficient? MFENCE makes writes visible, but
only affects load/store instructions. WRMSR is unfortunately not a
load/store instruction and is unaffected by MFENCE. This means that a
non-serializing WRMSR could be reordered by the CPU to execute before
the writes made visible by the MFENCE have even occurred in the first
place.

This means that an x2apic IPI could theoretically be triggered before
there is any (visible) data to process.

Does this affect anything in practice? I honestly don't know. It seems
quite possible that by the time an interrupt gets to consume the (not
yet) MFENCE'd data, it has become visible, mostly by accident.

To be safe, add the SDM-recommended fences for all x2apic WRMSRs.

This also leaves open the question of the _other_ weakly-ordered WRMSR:
MSR_IA32_TSC_DEADLINE. While it has the same ordering architecture as
the x2APIC MSRs, it seems substantially less likely to be a problem in
practice. While writes to the in-memory Local Vector Table (LVT) might
theoretically be reordered with respect to a weakly-ordered WRMSR like
TSC_DEADLINE, the SDM has this to say:

  In x2APIC mode, the WRMSR instruction is used to write to the LVT
  entry. The processor ensures the ordering of this write and any
  subsequent WRMSR to the deadline; no fencing is required.

But, that might still leave xAPIC exposed. The safest thing to do for
now is to add the extra, recommended LFENCE.

 [ bp: Massage commit message, fix typos, drop accidentally added
   newline to tools/arch/x86/include/asm/barrier.h. ]

Reported-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@vger.kernel.org>
Link: https://lkml.kernel.org/r/20200305174708.F77040DD@viggo.jf.intel.com
2021-02-04 19:36:31 +01:00

201 lines
4.6 KiB
C

// SPDX-License-Identifier: GPL-2.0
#include <linux/cpumask.h>
#include <linux/acpi.h>
#include "local.h"
int x2apic_phys;
static struct apic apic_x2apic_phys;
static u32 x2apic_max_apicid __ro_after_init;
void __init x2apic_set_max_apicid(u32 apicid)
{
x2apic_max_apicid = apicid;
}
static int __init set_x2apic_phys_mode(char *arg)
{
x2apic_phys = 1;
return 0;
}
early_param("x2apic_phys", set_x2apic_phys_mode);
static bool x2apic_fadt_phys(void)
{
#ifdef CONFIG_ACPI
if ((acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID) &&
(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
printk(KERN_DEBUG "System requires x2apic physical mode\n");
return true;
}
#endif
return false;
}
static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
return x2apic_enabled() && (x2apic_phys || x2apic_fadt_phys());
}
static void x2apic_send_IPI(int cpu, int vector)
{
u32 dest = per_cpu(x86_cpu_to_apicid, cpu);
/* x2apic MSRs are special and need a special fence: */
weak_wrmsr_fence();
__x2apic_send_IPI_dest(dest, vector, APIC_DEST_PHYSICAL);
}
static void
__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
{
unsigned long query_cpu;
unsigned long this_cpu;
unsigned long flags;
/* x2apic MSRs are special and need a special fence: */
weak_wrmsr_fence();
local_irq_save(flags);
this_cpu = smp_processor_id();
for_each_cpu(query_cpu, mask) {
if (apic_dest == APIC_DEST_ALLBUT && this_cpu == query_cpu)
continue;
__x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
vector, APIC_DEST_PHYSICAL);
}
local_irq_restore(flags);
}
static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
{
__x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
}
static void
x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
{
__x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
}
static void x2apic_send_IPI_allbutself(int vector)
{
__x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLBUT);
}
static void x2apic_send_IPI_all(int vector)
{
__x2apic_send_IPI_shorthand(vector, APIC_DEST_ALLINC);
}
static void init_x2apic_ldr(void)
{
}
static int x2apic_phys_probe(void)
{
if (x2apic_mode && (x2apic_phys || x2apic_fadt_phys()))
return 1;
return apic == &apic_x2apic_phys;
}
/* Common x2apic functions, also used by x2apic_cluster */
int x2apic_apic_id_valid(u32 apicid)
{
if (x2apic_max_apicid && apicid > x2apic_max_apicid)
return 0;
return 1;
}
int x2apic_apic_id_registered(void)
{
return 1;
}
void __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
{
unsigned long cfg = __prepare_ICR(0, vector, dest);
native_x2apic_icr_write(cfg, apicid);
}
void __x2apic_send_IPI_shorthand(int vector, u32 which)
{
unsigned long cfg = __prepare_ICR(which, vector, 0);
/* x2apic MSRs are special and need a special fence: */
weak_wrmsr_fence();
native_x2apic_icr_write(cfg, 0);
}
unsigned int x2apic_get_apic_id(unsigned long id)
{
return id;
}
u32 x2apic_set_apic_id(unsigned int id)
{
return id;
}
int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
{
return initial_apicid >> index_msb;
}
void x2apic_send_IPI_self(int vector)
{
apic_write(APIC_SELF_IPI, vector);
}
static struct apic apic_x2apic_phys __ro_after_init = {
.name = "physical x2apic",
.probe = x2apic_phys_probe,
.acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
.apic_id_valid = x2apic_apic_id_valid,
.apic_id_registered = x2apic_apic_id_registered,
.delivery_mode = APIC_DELIVERY_MODE_FIXED,
.dest_mode_logical = false,
.disable_esr = 0,
.check_apicid_used = NULL,
.init_apic_ldr = init_x2apic_ldr,
.ioapic_phys_id_map = NULL,
.setup_apic_routing = NULL,
.cpu_present_to_apicid = default_cpu_present_to_apicid,
.apicid_to_cpu_present = NULL,
.check_phys_apicid_present = default_check_phys_apicid_present,
.phys_pkg_id = x2apic_phys_pkg_id,
.get_apic_id = x2apic_get_apic_id,
.set_apic_id = x2apic_set_apic_id,
.calc_dest_apicid = apic_default_calc_apicid,
.send_IPI = x2apic_send_IPI,
.send_IPI_mask = x2apic_send_IPI_mask,
.send_IPI_mask_allbutself = x2apic_send_IPI_mask_allbutself,
.send_IPI_allbutself = x2apic_send_IPI_allbutself,
.send_IPI_all = x2apic_send_IPI_all,
.send_IPI_self = x2apic_send_IPI_self,
.inquire_remote_apic = NULL,
.read = native_apic_msr_read,
.write = native_apic_msr_write,
.eoi_write = native_apic_msr_eoi_write,
.icr_read = native_x2apic_icr_read,
.icr_write = native_x2apic_icr_write,
.wait_icr_idle = native_x2apic_wait_icr_idle,
.safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
};
apic_driver(apic_x2apic_phys);