- Fix out-of-spec hardware (1st gen Hygon) which does not implement

MSR_AMD64_SEV even though the spec clearly states so, and check CPUID
 bits first.
 
 - Send only one signal to a task when it is a SEGV_PKUERR si_code type.
 
 - Do away with all the wankery of reserving X amount of memory in
 the first megabyte to prevent BIOS corrupting it and simply and
 unconditionally reserve the whole first megabyte.
 
 - Make alternatives NOP optimization work at an arbitrary position
 within the patched sequence because the compiler can put single-byte
 NOPs for alignment anywhere in the sequence (32-bit retpoline), vs our
 previous assumption that the NOPs are only appended.
 
 - Force-disable ENQCMD[S] instructions support and remove update_pasid()
 because of insufficient protection against FPU state modification in an
 interrupt context, among other xstate horrors which are being addressed
 at the moment. This one limits the fallout until proper enablement.
 
 - Use cpu_feature_enabled() in the idxd driver so that it can be
 build-time disabled through the defines in .../asm/disabled-features.h.
 
 - Fix LVT thermal setup for SMI delivery mode by making sure the APIC
 LVT value is read before APIC initialization so that softlockups during
 boot do not happen at least on one machine.
 
 - Mark all legacy interrupts as legacy vectors when the IO-APIC is
 disabled and when all legacy interrupts are routed through the PIC.
 -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEEzv7L6UO9uDPlPSfHEsHwGGHeVUoFAmC8fdEACgkQEsHwGGHe
 VUqO5A/+IbIo8myl8VPjw6HRnHgY8rsYRjxdtmVhbaMi5XOmTMfVA9zJ6QALxseo
 Mar8bmWcezEs0/FmNvk1vEOtIgZvRVy5RqXbu3W2EgWICuzRWbj822q+KrkbY0tH
 1GWjcZQO8VlgeuQsukyj5QHaBLffpn3Fh1XB8r0cktZvwciM+LRNMnK8d6QjqxNM
 ctTX4wdI6kc076pOi7MhKxSe+/xo5Wnf27lClLMOcsO/SS42KqgeRM5psWqxihhL
 j6Y3Oe+Nm+7GKF8y841PUSlwjgWmlZa6UkR6DBTP7DGnHDa5hMpzxYvHOquq/SbA
 leV9OLqI0iWs56kSzbEcXo7do1kld62KjsA2KtUhJfVAtm+igQLh5G0jESBwrWca
 TBWaE5kt6s8wP7LXeg26o4U8XD8vqEH88Tmsjlgqb/t/PKDV9PMGvNpF00dPZFo6
 Jhj2yntJYjLQYoAQLuQm5pfnKhZy3KKvk7ViGcnp3iN9i4eU9HzawIiXnliNOrTI
 ohQ9KoRhy1Cx0UfLkR+cdK4ks0u26DC2/Ewt0CE5AP/CQ1rX6Zbv2gFLjSpy7yQo
 6A99HEpbaLuy3kDt5vn91viPNUlOveuIXIdHp6u+zgFfx88eLUoEvfR135aV/Gyh
 p5PJm/BO99KByQzFCnilkp7nBeKtnKYSmUojA6JsZKjzJimSPYo=
 =zRI1
 -----END PGP SIGNATURE-----

Merge tag 'x86_urgent_for_v5.13-rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 fixes from Borislav Petkov:
 "A bunch of x86/urgent stuff accumulated for the last two weeks so
  lemme unload it to you.

  It should be all totally risk-free, of course. :-)

   - Fix out-of-spec hardware (1st gen Hygon) which does not implement
     MSR_AMD64_SEV even though the spec clearly states so, and check
     CPUID bits first.

   - Send only one signal to a task when it is a SEGV_PKUERR si_code
     type.

   - Do away with all the wankery of reserving X amount of memory in the
     first megabyte to prevent BIOS corrupting it and simply and
     unconditionally reserve the whole first megabyte.

   - Make alternatives NOP optimization work at an arbitrary position
     within the patched sequence because the compiler can put
     single-byte NOPs for alignment anywhere in the sequence (32-bit
     retpoline), vs our previous assumption that the NOPs are only
     appended.

   - Force-disable ENQCMD[S] instructions support and remove
     update_pasid() because of insufficient protection against FPU state
     modification in an interrupt context, among other xstate horrors
     which are being addressed at the moment. This one limits the
     fallout until proper enablement.

   - Use cpu_feature_enabled() in the idxd driver so that it can be
     build-time disabled through the defines in disabled-features.h.

   - Fix LVT thermal setup for SMI delivery mode by making sure the APIC
     LVT value is read before APIC initialization so that softlockups
     during boot do not happen at least on one machine.

   - Mark all legacy interrupts as legacy vectors when the IO-APIC is
     disabled and when all legacy interrupts are routed through the PIC"

* tag 'x86_urgent_for_v5.13-rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/sev: Check SME/SEV support in CPUID first
  x86/fault: Don't send SIGSEGV twice on SEGV_PKUERR
  x86/setup: Always reserve the first 1M of RAM
  x86/alternative: Optimize single-byte NOPs at an arbitrary position
  x86/cpufeatures: Force disable X86_FEATURE_ENQCMD and remove update_pasid()
  dmaengine: idxd: Use cpu_feature_enabled()
  x86/thermal: Fix LVT thermal setup for SMI delivery mode
  x86/apic: Mark _all_ legacy interrupts when IO/APIC is missing
This commit is contained in:
Linus Torvalds 2021-06-06 12:25:43 -07:00
commit 773ac53bbf
16 changed files with 145 additions and 126 deletions

View File

@ -174,6 +174,7 @@ static inline int apic_is_clustered_box(void)
extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask);
extern void lapic_assign_system_vectors(void);
extern void lapic_assign_legacy_vector(unsigned int isairq, bool replace);
extern void lapic_update_legacy_vectors(void);
extern void lapic_online(void);
extern void lapic_offline(void);
extern bool apic_needs_pit(void);

View File

@ -56,11 +56,8 @@
# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31))
#endif
#ifdef CONFIG_IOMMU_SUPPORT
# define DISABLE_ENQCMD 0
#else
# define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31))
#endif
/* Force disable because it's broken beyond repair */
#define DISABLE_ENQCMD (1 << (X86_FEATURE_ENQCMD & 31))
#ifdef CONFIG_X86_SGX
# define DISABLE_SGX 0

View File

@ -106,10 +106,6 @@ extern int cpu_has_xfeatures(u64 xfeatures_mask, const char **feature_name);
*/
#define PASID_DISABLED 0
#ifdef CONFIG_IOMMU_SUPPORT
/* Update current's PASID MSR/state by mm's PASID. */
void update_pasid(void);
#else
static inline void update_pasid(void) { }
#endif
#endif /* _ASM_X86_FPU_API_H */

View File

@ -584,13 +584,6 @@ static inline void switch_fpu_finish(struct fpu *new_fpu)
pkru_val = pk->pkru;
}
__write_pkru(pkru_val);
/*
* Expensive PASID MSR write will be avoided in update_pasid() because
* TIF_NEED_FPU_LOAD was set. And the PASID state won't be updated
* unless it's different from mm->pasid to reduce overhead.
*/
update_pasid();
}
#endif /* _ASM_X86_FPU_INTERNAL_H */

View File

@ -3,11 +3,13 @@
#define _ASM_X86_THERMAL_H
#ifdef CONFIG_X86_THERMAL_VECTOR
void therm_lvt_init(void);
void intel_init_thermal(struct cpuinfo_x86 *c);
bool x86_thermal_enabled(void);
void intel_thermal_interrupt(void);
#else
static inline void intel_init_thermal(struct cpuinfo_x86 *c) { }
static inline void therm_lvt_init(void) { }
static inline void intel_init_thermal(struct cpuinfo_x86 *c) { }
#endif
#endif /* _ASM_X86_THERMAL_H */

View File

@ -182,42 +182,70 @@ done:
n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
}
/*
* optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
*
* @instr: instruction byte stream
* @instrlen: length of the above
* @off: offset within @instr where the first NOP has been detected
*
* Return: number of NOPs found (and replaced).
*/
static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
{
unsigned long flags;
int i = off, nnops;
while (i < instrlen) {
if (instr[i] != 0x90)
break;
i++;
}
nnops = i - off;
if (nnops <= 1)
return nnops;
local_irq_save(flags);
add_nops(instr + off, nnops);
local_irq_restore(flags);
DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
return nnops;
}
/*
* "noinline" to cause control flow change and thus invalidate I$ and
* cause refetch after modification.
*/
static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
{
unsigned long flags;
struct insn insn;
int nop, i = 0;
int i = 0;
/*
* Jump over the non-NOP insns, the remaining bytes must be single-byte
* NOPs, optimize them.
* Jump over the non-NOP insns and optimize single-byte NOPs into bigger
* ones.
*/
for (;;) {
if (insn_decode_kernel(&insn, &instr[i]))
return;
/*
* See if this and any potentially following NOPs can be
* optimized.
*/
if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
break;
i += optimize_nops_range(instr, a->instrlen, i);
else
i += insn.length;
if ((i += insn.length) >= a->instrlen)
if (i >= a->instrlen)
return;
}
for (nop = i; i < a->instrlen; i++) {
if (WARN_ONCE(instr[i] != 0x90, "Not a NOP at 0x%px\n", &instr[i]))
return;
}
local_irq_save(flags);
add_nops(instr + nop, i - nop);
local_irq_restore(flags);
DUMP_BYTES(instr, a->instrlen, "%px: [%d:%d) optimized NOPs: ",
instr, nop, a->instrlen);
}
/*

View File

@ -2604,6 +2604,7 @@ static void __init apic_bsp_setup(bool upmode)
end_local_APIC_setup();
irq_remap_enable_fault_handling();
setup_IO_APIC();
lapic_update_legacy_vectors();
}
#ifdef CONFIG_UP_LATE_INIT

View File

@ -738,6 +738,26 @@ void lapic_assign_legacy_vector(unsigned int irq, bool replace)
irq_matrix_assign_system(vector_matrix, ISA_IRQ_VECTOR(irq), replace);
}
void __init lapic_update_legacy_vectors(void)
{
unsigned int i;
if (IS_ENABLED(CONFIG_X86_IO_APIC) && nr_ioapics > 0)
return;
/*
* If the IO/APIC is disabled via config, kernel command line or
* lack of enumeration then all legacy interrupts are routed
* through the PIC. Make sure that they are marked as legacy
* vectors. PIC_CASCADE_IRQ has already been marked in
* lapic_assign_system_vectors().
*/
for (i = 0; i < nr_legacy_irqs(); i++) {
if (i != PIC_CASCADE_IR)
lapic_assign_legacy_vector(i, true);
}
}
void __init lapic_assign_system_vectors(void)
{
unsigned int i, vector = 0;

View File

@ -1402,60 +1402,3 @@ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns,
return 0;
}
#endif /* CONFIG_PROC_PID_ARCH_STATUS */
#ifdef CONFIG_IOMMU_SUPPORT
void update_pasid(void)
{
u64 pasid_state;
u32 pasid;
if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
return;
if (!current->mm)
return;
pasid = READ_ONCE(current->mm->pasid);
/* Set the valid bit in the PASID MSR/state only for valid pasid. */
pasid_state = pasid == PASID_DISABLED ?
pasid : pasid | MSR_IA32_PASID_VALID;
/*
* No need to hold fregs_lock() since the task's fpstate won't
* be changed by others (e.g. ptrace) while the task is being
* switched to or is in IPI.
*/
if (!test_thread_flag(TIF_NEED_FPU_LOAD)) {
/* The MSR is active and can be directly updated. */
wrmsrl(MSR_IA32_PASID, pasid_state);
} else {
struct fpu *fpu = &current->thread.fpu;
struct ia32_pasid_state *ppasid_state;
struct xregs_state *xsave;
/*
* The CPU's xstate registers are not currently active. Just
* update the PASID state in the memory buffer here. The
* PASID MSR will be loaded when returning to user mode.
*/
xsave = &fpu->state.xsave;
xsave->header.xfeatures |= XFEATURE_MASK_PASID;
ppasid_state = get_xsave_addr(xsave, XFEATURE_PASID);
/*
* Since XFEATURE_MASK_PASID is set in xfeatures, ppasid_state
* won't be NULL and no need to check its value.
*
* Only update the task's PASID state when it's different
* from the mm's pasid.
*/
if (ppasid_state->pasid != pasid_state) {
/*
* Invalid fpregs so that state restoring will pick up
* the PASID state.
*/
__fpu_invalidate_fpregs_state(fpu);
ppasid_state->pasid = pasid_state;
}
}
}
#endif /* CONFIG_IOMMU_SUPPORT */

View File

@ -44,6 +44,7 @@
#include <asm/pci-direct.h>
#include <asm/prom.h>
#include <asm/proto.h>
#include <asm/thermal.h>
#include <asm/unwind.h>
#include <asm/vsyscall.h>
#include <linux/vmalloc.h>
@ -637,11 +638,11 @@ static void __init trim_snb_memory(void)
* them from accessing certain memory ranges, namely anything below
* 1M and in the pages listed in bad_pages[] above.
*
* To avoid these pages being ever accessed by SNB gfx devices
* reserve all memory below the 1 MB mark and bad_pages that have
* not already been reserved at boot time.
* To avoid these pages being ever accessed by SNB gfx devices reserve
* bad_pages that have not already been reserved at boot time.
* All memory below the 1 MB mark is anyway reserved later during
* setup_arch(), so there is no need to reserve it here.
*/
memblock_reserve(0, 1<<20);
for (i = 0; i < ARRAY_SIZE(bad_pages); i++) {
if (memblock_reserve(bad_pages[i], PAGE_SIZE))
@ -733,14 +734,14 @@ static void __init early_reserve_memory(void)
* The first 4Kb of memory is a BIOS owned area, but generally it is
* not listed as such in the E820 table.
*
* Reserve the first memory page and typically some additional
* memory (64KiB by default) since some BIOSes are known to corrupt
* low memory. See the Kconfig help text for X86_RESERVE_LOW.
* Reserve the first 64K of memory since some BIOSes are known to
* corrupt low memory. After the real mode trampoline is allocated the
* rest of the memory below 640k is reserved.
*
* In addition, make sure page 0 is always reserved because on
* systems with L1TF its contents can be leaked to user processes.
*/
memblock_reserve(0, ALIGN(reserve_low, PAGE_SIZE));
memblock_reserve(0, SZ_64K);
early_reserve_initrd();
@ -751,6 +752,7 @@ static void __init early_reserve_memory(void)
reserve_ibft_region();
reserve_bios_regions();
trim_snb_memory();
}
/*
@ -1081,14 +1083,20 @@ void __init setup_arch(char **cmdline_p)
(max_pfn_mapped<<PAGE_SHIFT) - 1);
#endif
reserve_real_mode();
/*
* Reserving memory causing GPU hangs on Sandy Bridge integrated
* graphics devices should be done after we allocated memory under
* 1M for the real mode trampoline.
* Find free memory for the real mode trampoline and place it
* there.
* If there is not enough free memory under 1M, on EFI-enabled
* systems there will be additional attempt to reclaim the memory
* for the real mode trampoline at efi_free_boot_services().
*
* Unconditionally reserve the entire first 1M of RAM because
* BIOSes are know to corrupt low memory and several
* hundred kilobytes are not worth complex detection what memory gets
* clobbered. Moreover, on machines with SandyBridge graphics or in
* setups that use crashkernel the entire 1M is reserved anyway.
*/
trim_snb_memory();
reserve_real_mode();
init_mem_mapping();
@ -1226,6 +1234,14 @@ void __init setup_arch(char **cmdline_p)
x86_init.timers.wallclock_init();
/*
* This needs to run before setup_local_APIC() which soft-disables the
* local APIC temporarily and that masks the thermal LVT interrupt,
* leading to softlockups on machines which have configured SMI
* interrupt delivery.
*/
therm_lvt_init();
mcheck_init();
register_refined_jiffies(CLOCK_TICK_RATE);

View File

@ -836,8 +836,8 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
if (si_code == SEGV_PKUERR)
force_sig_pkuerr((void __user *)address, pkey);
force_sig_fault(SIGSEGV, si_code, (void __user *)address);
else
force_sig_fault(SIGSEGV, si_code, (void __user *)address);
local_irq_disable();
}

View File

@ -504,10 +504,6 @@ void __init sme_enable(struct boot_params *bp)
#define AMD_SME_BIT BIT(0)
#define AMD_SEV_BIT BIT(1)
/* Check the SEV MSR whether SEV or SME is enabled */
sev_status = __rdmsr(MSR_AMD64_SEV);
feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
/*
* Check for the SME/SEV feature:
* CPUID Fn8000_001F[EAX]
@ -519,11 +515,16 @@ void __init sme_enable(struct boot_params *bp)
eax = 0x8000001f;
ecx = 0;
native_cpuid(&eax, &ebx, &ecx, &edx);
if (!(eax & feature_mask))
/* Check whether SEV or SME is supported */
if (!(eax & (AMD_SEV_BIT | AMD_SME_BIT)))
return;
me_mask = 1UL << (ebx & 0x3f);
/* Check the SEV MSR whether SEV or SME is enabled */
sev_status = __rdmsr(MSR_AMD64_SEV);
feature_mask = (sev_status & MSR_AMD64_SEV_ENABLED) ? AMD_SEV_BIT : AMD_SME_BIT;
/* Check if memory encryption is enabled */
if (feature_mask == AMD_SME_BIT) {
/*

View File

@ -450,6 +450,18 @@ void __init efi_free_boot_services(void)
size -= rm_size;
}
/*
* Don't free memory under 1M for two reasons:
* - BIOS might clobber it
* - Crash kernel needs it to be reserved
*/
if (start + size < SZ_1M)
continue;
if (start < SZ_1M) {
size -= (SZ_1M - start);
start = SZ_1M;
}
memblock_free_late(start, size);
}

View File

@ -29,14 +29,16 @@ void __init reserve_real_mode(void)
/* Has to be under 1M so we can execute real-mode AP code. */
mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
if (!mem) {
if (!mem)
pr_info("No sub-1M memory is available for the trampoline\n");
return;
}
else
set_real_mode_mem(mem);
memblock_reserve(mem, size);
set_real_mode_mem(mem);
crash_reserve_low_1M();
/*
* Unconditionally reserve the entire fisrt 1M, see comment in
* setup_arch().
*/
memblock_reserve(0, SZ_1M);
}
static void sme_sev_setup_real_mode(struct trampoline_header *th)

View File

@ -745,12 +745,12 @@ static int __init idxd_init_module(void)
* If the CPU does not support MOVDIR64B or ENQCMDS, there's no point in
* enumerating the device. We can not utilize it.
*/
if (!boot_cpu_has(X86_FEATURE_MOVDIR64B)) {
if (!cpu_feature_enabled(X86_FEATURE_MOVDIR64B)) {
pr_warn("idxd driver failed to load without MOVDIR64B.\n");
return -ENODEV;
}
if (!boot_cpu_has(X86_FEATURE_ENQCMD))
if (!cpu_feature_enabled(X86_FEATURE_ENQCMD))
pr_warn("Platform does not have ENQCMD(S) support.\n");
else
support_enqcmd = true;

View File

@ -621,6 +621,17 @@ bool x86_thermal_enabled(void)
return atomic_read(&therm_throt_en);
}
void __init therm_lvt_init(void)
{
/*
* This function is only called on boot CPU. Save the init thermal
* LVT value on BSP and use that value to restore APs' thermal LVT
* entry BIOS programmed later
*/
if (intel_thermal_supported(&boot_cpu_data))
lvtthmr_init = apic_read(APIC_LVTTHMR);
}
void intel_init_thermal(struct cpuinfo_x86 *c)
{
unsigned int cpu = smp_processor_id();
@ -630,10 +641,6 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
if (!intel_thermal_supported(c))
return;
/* On the BSP? */
if (c == &boot_cpu_data)
lvtthmr_init = apic_read(APIC_LVTTHMR);
/*
* First check if its enabled already, in which case there might
* be some SMM goo which handles it, so we can't even put a handler