Merge branch 'linus' into x86/apic

Pull in upstream changes to avoid conflicts
This commit is contained in:
Thomas Gleixner
2015-08-05 23:55:52 +02:00
832 changed files with 6928 additions and 4736 deletions

View File

@@ -409,12 +409,6 @@ static void __setup_vector_irq(int cpu)
int irq, vector;
struct apic_chip_data *data;
/*
* vector_lock will make sure that we don't run into irq vector
* assignments that might be happening on another cpu in parallel,
* while we setup our initial vector to irq mappings.
*/
raw_spin_lock(&vector_lock);
/* Mark the inuse vectors */
for_each_active_irq(irq) {
data = apic_chip_data(irq_get_irq_data(irq));
@@ -436,16 +430,16 @@ static void __setup_vector_irq(int cpu)
if (!cpumask_test_cpu(cpu, data->domain))
per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
}
raw_spin_unlock(&vector_lock);
}
/*
* Setup the vector to irq mappings.
* Setup the vector to irq mappings. Must be called with vector_lock held.
*/
void setup_vector_irq(int cpu)
{
int irq;
lockdep_assert_held(&vector_lock);
/*
* On most of the platforms, legacy PIC delivers the interrupts on the
* boot cpu. But there are certain platforms where PIC interrupts are

View File

@@ -951,6 +951,14 @@ static u64 intel_cqm_event_count(struct perf_event *event)
if (!cqm_group_leader(event))
return 0;
/*
* Getting up-to-date values requires an SMP IPI which is not
* possible if we're being called in interrupt context. Return
* the cached values instead.
*/
if (unlikely(in_interrupt()))
goto out;
/*
* Notice that we don't perform the reading of an RMID
* atomically, because we can't hold a spin lock across the

View File

@@ -175,7 +175,9 @@ static __init void early_serial_init(char *s)
}
if (*s) {
if (kstrtoul(s, 0, &baud) < 0 || baud == 0)
baud = simple_strtoull(s, &e, 0);
if (baud == 0 || s == e)
baud = DEFAULT_BAUD;
}

View File

@@ -131,25 +131,24 @@ void __init init_espfix_bsp(void)
init_espfix_random();
/* The rest is the same as for any other processor */
init_espfix_ap();
init_espfix_ap(0);
}
void init_espfix_ap(void)
void init_espfix_ap(int cpu)
{
unsigned int cpu, page;
unsigned int page;
unsigned long addr;
pud_t pud, *pud_p;
pmd_t pmd, *pmd_p;
pte_t pte, *pte_p;
int n;
int n, node;
void *stack_page;
pteval_t ptemask;
/* We only have to do this once... */
if (likely(this_cpu_read(espfix_stack)))
if (likely(per_cpu(espfix_stack, cpu)))
return; /* Already initialized */
cpu = smp_processor_id();
addr = espfix_base_addr(cpu);
page = cpu/ESPFIX_STACKS_PER_PAGE;
@@ -165,12 +164,15 @@ void init_espfix_ap(void)
if (stack_page)
goto unlock_done;
node = cpu_to_node(cpu);
ptemask = __supported_pte_mask;
pud_p = &espfix_pud_page[pud_index(addr)];
pud = *pud_p;
if (!pud_present(pud)) {
pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP);
struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
pmd_p = (pmd_t *)page_address(page);
pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask));
paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PUD_CLONES; n++)
@@ -180,7 +182,9 @@ void init_espfix_ap(void)
pmd_p = pmd_offset(&pud, addr);
pmd = *pmd_p;
if (!pmd_present(pmd)) {
pte_p = (pte_t *)__get_free_page(PGALLOC_GFP);
struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0);
pte_p = (pte_t *)page_address(page);
pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask));
paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);
for (n = 0; n < ESPFIX_PMD_CLONES; n++)
@@ -188,7 +192,7 @@ void init_espfix_ap(void)
}
pte_p = pte_offset_kernel(&pmd, addr);
stack_page = (void *)__get_free_page(GFP_KERNEL);
stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0));
pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask));
for (n = 0; n < ESPFIX_PTE_CLONES; n++)
set_pte(&pte_p[n*PTE_STRIDE], pte);
@@ -199,7 +203,7 @@ void init_espfix_ap(void)
unlock_done:
mutex_unlock(&espfix_init_mutex);
done:
this_cpu_write(espfix_stack, addr);
this_cpu_write(espfix_waddr, (unsigned long)stack_page
+ (addr & ~PAGE_MASK));
per_cpu(espfix_stack, cpu) = addr;
per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page
+ (addr & ~PAGE_MASK);
}

View File

@@ -4,6 +4,8 @@
#include <asm/fpu/internal.h>
#include <asm/tlbflush.h>
#include <linux/sched.h>
/*
* Initialize the TS bit in CR0 according to the style of context-switches
* we are using:
@@ -136,6 +138,43 @@ static void __init fpu__init_system_generic(void)
unsigned int xstate_size;
EXPORT_SYMBOL_GPL(xstate_size);
/* Enforce that 'MEMBER' is the last field of 'TYPE': */
#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
BUILD_BUG_ON(sizeof(TYPE) != offsetofend(TYPE, MEMBER))
/*
* We append the 'struct fpu' to the task_struct:
*/
static void __init fpu__init_task_struct_size(void)
{
int task_size = sizeof(struct task_struct);
/*
* Subtract off the static size of the register state.
* It potentially has a bunch of padding.
*/
task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state);
/*
* Add back the dynamically-calculated register state
* size.
*/
task_size += xstate_size;
/*
* We dynamically size 'struct fpu', so we require that
* it be at the end of 'thread_struct' and that
* 'thread_struct' be at the end of 'task_struct'. If
* you hit a compile error here, check the structure to
* see if something got added to the end.
*/
CHECK_MEMBER_AT_END_OF(struct fpu, state);
CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
arch_task_struct_size = task_size;
}
/*
* Set up the xstate_size based on the legacy FPU context size.
*
@@ -287,6 +326,7 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
fpu__init_system_generic();
fpu__init_system_xstate_size_legacy();
fpu__init_system_xstate();
fpu__init_task_struct_size();
fpu__init_system_ctx_switch();
}
@@ -311,9 +351,15 @@ static int __init x86_noxsave_setup(char *s)
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
setup_clear_cpu_cap(X86_FEATURE_AVX);
setup_clear_cpu_cap(X86_FEATURE_AVX2);
setup_clear_cpu_cap(X86_FEATURE_AVX512F);
setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
setup_clear_cpu_cap(X86_FEATURE_MPX);
return 1;
}

View File

@@ -161,11 +161,12 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
/* Kill off the identity-map trampoline */
reset_early_page_tables();
kasan_map_early_shadow(early_level4_pgt);
/* clear bss before set_intr_gate with early_idt_handler */
clear_bss();
clear_page(init_level4_pgt);
kasan_early_init();
for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
set_intr_gate(i, early_idt_handler_array[i]);
load_idt((const struct desc_ptr *)&idt_descr);
@@ -177,12 +178,9 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
*/
load_ucode_bsp();
clear_page(init_level4_pgt);
/* set init_level4_pgt kernel high mapping*/
init_level4_pgt[511] = early_level4_pgt[511];
kasan_map_early_shadow(init_level4_pgt);
x86_64_start_reservations(real_mode_data);
}

View File

@@ -516,38 +516,9 @@ ENTRY(phys_base)
/* This must match the first entry in level2_kernel_pgt */
.quad 0x0000000000000000
#ifdef CONFIG_KASAN
#define FILL(VAL, COUNT) \
.rept (COUNT) ; \
.quad (VAL) ; \
.endr
NEXT_PAGE(kasan_zero_pte)
FILL(kasan_zero_page - __START_KERNEL_map + _KERNPG_TABLE, 512)
NEXT_PAGE(kasan_zero_pmd)
FILL(kasan_zero_pte - __START_KERNEL_map + _KERNPG_TABLE, 512)
NEXT_PAGE(kasan_zero_pud)
FILL(kasan_zero_pmd - __START_KERNEL_map + _KERNPG_TABLE, 512)
#undef FILL
#endif
#include "../../x86/xen/xen-head.S"
__PAGE_ALIGNED_BSS
NEXT_PAGE(empty_zero_page)
.skip PAGE_SIZE
#ifdef CONFIG_KASAN
/*
* This page used as early shadow. We don't use empty_zero_page
* at early stages, stack instrumentation could write some garbage
* to this page.
* Latter we reuse it as zero shadow for large ranges of memory
* that allowed to access, but not instrumented by kasan
* (vmalloc/vmemmap ...).
*/
NEXT_PAGE(kasan_zero_page)
.skip PAGE_SIZE
#endif

View File

@@ -347,15 +347,23 @@ int check_irq_vectors_for_cpu_disable(void)
if (!desc)
continue;
/*
* Protect against concurrent action removal,
* affinity changes etc.
*/
raw_spin_lock(&desc->lock);
data = irq_desc_get_irq_data(desc);
cpumask_copy(&affinity_new,
irq_data_get_affinity_mask(data));
cpumask_clear_cpu(this_cpu, &affinity_new);
/* Do not count inactive or per-cpu irqs. */
if (!irq_has_action(irq) || irqd_is_per_cpu(data))
if (!irq_has_action(irq) || irqd_is_per_cpu(data)) {
raw_spin_unlock(&desc->lock);
continue;
}
raw_spin_unlock(&desc->lock);
/*
* A single irq may be mapped to multiple
* cpu's vector_irq[] (for example IOAPIC cluster
@@ -386,6 +394,9 @@ int check_irq_vectors_for_cpu_disable(void)
* vector. If the vector is marked in the used vectors
* bitmap or an irq is assigned to it, we don't count
* it as available.
*
* As this is an inaccurate snapshot anyway, we can do
* this w/o holding vector_lock.
*/
for (vector = FIRST_EXTERNAL_VECTOR;
vector < first_system_vector; vector++) {
@@ -487,6 +498,11 @@ void fixup_irqs(void)
*/
mdelay(1);
/*
* We can walk the vector array of this cpu without holding
* vector_lock because the cpu is already marked !online, so
* nothing else will touch it.
*/
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
unsigned int irr;
@@ -498,9 +514,9 @@ void fixup_irqs(void)
irq = __this_cpu_read(vector_irq[vector]);
desc = irq_to_desc(irq);
raw_spin_lock(&desc->lock);
data = irq_desc_get_irq_data(desc);
chip = irq_data_get_irq_chip(data);
raw_spin_lock(&desc->lock);
if (chip->irq_retrigger) {
chip->irq_retrigger(data);
__this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED);

View File

@@ -408,15 +408,15 @@ static void default_do_nmi(struct pt_regs *regs)
NOKPROBE_SYMBOL(default_do_nmi);
/*
* NMIs can hit breakpoints which will cause it to lose its
* NMI context with the CPU when the breakpoint does an iret.
*/
#ifdef CONFIG_X86_32
/*
* For i386, NMIs use the same stack as the kernel, and we can
* add a workaround to the iret problem in C (preventing nested
* NMIs if an NMI takes a trap). Simply have 3 states the NMI
* can be in:
* NMIs can page fault or hit breakpoints which will cause it to lose
* its NMI context with the CPU when the breakpoint or page fault does an IRET.
*
* As a result, NMIs can nest if NMIs get unmasked due an IRET during
* NMI processing. On x86_64, the asm glue protects us from nested NMIs
* if the outer NMI came from kernel mode, but we can still nest if the
* outer NMI came from user mode.
*
* To handle these nested NMIs, we have three states:
*
* 1) not running
* 2) executing
@@ -430,15 +430,14 @@ NOKPROBE_SYMBOL(default_do_nmi);
* (Note, the latch is binary, thus multiple NMIs triggering,
* when one is running, are ignored. Only one NMI is restarted.)
*
* If an NMI hits a breakpoint that executes an iret, another
* NMI can preempt it. We do not want to allow this new NMI
* to run, but we want to execute it when the first one finishes.
* We set the state to "latched", and the exit of the first NMI will
* perform a dec_return, if the result is zero (NOT_RUNNING), then
* it will simply exit the NMI handler. If not, the dec_return
* would have set the state to NMI_EXECUTING (what we want it to
* be when we are running). In this case, we simply jump back
* to rerun the NMI handler again, and restart the 'latched' NMI.
* If an NMI executes an iret, another NMI can preempt it. We do not
* want to allow this new NMI to run, but we want to execute it when the
* first one finishes. We set the state to "latched", and the exit of
* the first NMI will perform a dec_return, if the result is zero
* (NOT_RUNNING), then it will simply exit the NMI handler. If not, the
* dec_return would have set the state to NMI_EXECUTING (what we want it
* to be when we are running). In this case, we simply jump back to
* rerun the NMI handler again, and restart the 'latched' NMI.
*
* No trap (breakpoint or page fault) should be hit before nmi_restart,
* thus there is no race between the first check of state for NOT_RUNNING
@@ -461,49 +460,36 @@ enum nmi_states {
static DEFINE_PER_CPU(enum nmi_states, nmi_state);
static DEFINE_PER_CPU(unsigned long, nmi_cr2);
#define nmi_nesting_preprocess(regs) \
do { \
if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) { \
this_cpu_write(nmi_state, NMI_LATCHED); \
return; \
} \
this_cpu_write(nmi_state, NMI_EXECUTING); \
this_cpu_write(nmi_cr2, read_cr2()); \
} while (0); \
nmi_restart:
#define nmi_nesting_postprocess() \
do { \
if (unlikely(this_cpu_read(nmi_cr2) != read_cr2())) \
write_cr2(this_cpu_read(nmi_cr2)); \
if (this_cpu_dec_return(nmi_state)) \
goto nmi_restart; \
} while (0)
#else /* x86_64 */
#ifdef CONFIG_X86_64
/*
* In x86_64 things are a bit more difficult. This has the same problem
* where an NMI hitting a breakpoint that calls iret will remove the
* NMI context, allowing a nested NMI to enter. What makes this more
* difficult is that both NMIs and breakpoints have their own stack.
* When a new NMI or breakpoint is executed, the stack is set to a fixed
* point. If an NMI is nested, it will have its stack set at that same
* fixed address that the first NMI had, and will start corrupting the
* stack. This is handled in entry_64.S, but the same problem exists with
* the breakpoint stack.
* In x86_64, we need to handle breakpoint -> NMI -> breakpoint. Without
* some care, the inner breakpoint will clobber the outer breakpoint's
* stack.
*
* If a breakpoint is being processed, and the debug stack is being used,
* if an NMI comes in and also hits a breakpoint, the stack pointer
* will be set to the same fixed address as the breakpoint that was
* interrupted, causing that stack to be corrupted. To handle this case,
* check if the stack that was interrupted is the debug stack, and if
* so, change the IDT so that new breakpoints will use the current stack
* and not switch to the fixed address. On return of the NMI, switch back
* to the original IDT.
* If a breakpoint is being processed, and the debug stack is being
* used, if an NMI comes in and also hits a breakpoint, the stack
* pointer will be set to the same fixed address as the breakpoint that
* was interrupted, causing that stack to be corrupted. To handle this
* case, check if the stack that was interrupted is the debug stack, and
* if so, change the IDT so that new breakpoints will use the current
* stack and not switch to the fixed address. On return of the NMI,
* switch back to the original IDT.
*/
static DEFINE_PER_CPU(int, update_debug_stack);
#endif
static inline void nmi_nesting_preprocess(struct pt_regs *regs)
dotraplinkage notrace void
do_nmi(struct pt_regs *regs, long error_code)
{
if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
this_cpu_write(nmi_state, NMI_LATCHED);
return;
}
this_cpu_write(nmi_state, NMI_EXECUTING);
this_cpu_write(nmi_cr2, read_cr2());
nmi_restart:
#ifdef CONFIG_X86_64
/*
* If we interrupted a breakpoint, it is possible that
* the nmi handler will have breakpoints too. We need to
@@ -514,22 +500,8 @@ static inline void nmi_nesting_preprocess(struct pt_regs *regs)
debug_stack_set_zero();
this_cpu_write(update_debug_stack, 1);
}
}
static inline void nmi_nesting_postprocess(void)
{
if (unlikely(this_cpu_read(update_debug_stack))) {
debug_stack_reset();
this_cpu_write(update_debug_stack, 0);
}
}
#endif
dotraplinkage notrace void
do_nmi(struct pt_regs *regs, long error_code)
{
nmi_nesting_preprocess(regs);
nmi_enter();
inc_irq_stat(__nmi_count);
@@ -539,8 +511,17 @@ do_nmi(struct pt_regs *regs, long error_code)
nmi_exit();
/* On i386, may loop back to preprocess */
nmi_nesting_postprocess();
#ifdef CONFIG_X86_64
if (unlikely(this_cpu_read(update_debug_stack))) {
debug_stack_reset();
this_cpu_write(update_debug_stack, 0);
}
#endif
if (unlikely(this_cpu_read(nmi_cr2) != read_cr2()))
write_cr2(this_cpu_read(nmi_cr2));
if (this_cpu_dec_return(nmi_state))
goto nmi_restart;
}
NOKPROBE_SYMBOL(do_nmi);

View File

@@ -81,7 +81,7 @@ EXPORT_SYMBOL_GPL(idle_notifier_unregister);
*/
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
*dst = *src;
memcpy(dst, src, arch_task_struct_size);
return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
}

View File

@@ -170,11 +170,6 @@ static void smp_callin(void)
*/
apic_ap_setup();
/*
* Need to setup vector mappings before we enable interrupts.
*/
setup_vector_irq(smp_processor_id());
/*
* Save our processor parameters. Note: this information
* is needed for clock calibration.
@@ -239,18 +234,13 @@ static void notrace start_secondary(void *unused)
check_tsc_sync_target();
/*
* Enable the espfix hack for this CPU
*/
#ifdef CONFIG_X86_ESPFIX64
init_espfix_ap();
#endif
/*
* We need to hold vector_lock so there the set of online cpus
* does not change while we are assigning vectors to cpus. Holding
* this lock ensures we don't half assign or remove an irq from a cpu.
* Lock vector_lock and initialize the vectors on this cpu
* before setting the cpu online. We must set it online with
* vector_lock held to prevent a concurrent setup/teardown
* from seeing a half valid vector space.
*/
lock_vector_lock();
setup_vector_irq(smp_processor_id());
set_cpu_online(smp_processor_id(), true);
unlock_vector_lock();
cpu_set_state_online(smp_processor_id());
@@ -854,6 +844,13 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
initial_code = (unsigned long)start_secondary;
stack_start = idle->thread.sp;
/*
* Enable the espfix hack for this CPU
*/
#ifdef CONFIG_X86_ESPFIX64
init_espfix_ap(cpu);
#endif
/* So we see what's up */
announce_cpu(cpu, apicid);
@@ -995,8 +992,17 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
common_cpu_up(cpu, tidle);
/*
* We have to walk the irq descriptors to setup the vector
* space for the cpu which comes online. Prevent irq
* alloc/free across the bringup.
*/
irq_lock_sparse();
err = do_boot_cpu(apicid, cpu, tidle);
if (err) {
irq_unlock_sparse();
pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
return -EIO;
}
@@ -1014,6 +1020,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
touch_nmi_watchdog();
}
irq_unlock_sparse();
return 0;
}

View File

@@ -598,10 +598,19 @@ static unsigned long quick_pit_calibrate(void)
if (!pit_expect_msb(0xff-i, &delta, &d2))
break;
delta -= tsc;
/*
* Extrapolate the error and fail fast if the error will
* never be below 500 ppm.
*/
if (i == 1 &&
d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11)
return 0;
/*
* Iterate until the error is less than 500 ppm
*/
delta -= tsc;
if (d1+d2 >= delta >> 11)
continue;