linux/arch/x86/kernel/io_apic_32.c

2901 lines
71 KiB
C
Raw Normal View History

/*
* Intel IO-APIC support for multi-Pentium hosts.
*
* Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
*
* Many thanks to Stig Venaas for trying out countless experimental
* patches and reporting/debugging problems patiently!
*
* (c) 1999, Multiple IO-APIC support, developed by
* Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
* Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
* further tested and cleaned up by Zach Brown <zab@redhat.com>
* and Ingo Molnar <mingo@redhat.com>
*
* Fixes
* Maciej W. Rozycki : Bits for genuine 82489DX APICs;
* thanks to Eric Gilmore
* and Rolf G. Tews
* for testing these extensively
* Paul Diefenbaugh : Added full ACPI support
*/
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/delay.h>
#include <linux/sched.h>
#include <linux/bootmem.h>
#include <linux/mc146818rtc.h>
#include <linux/compiler.h>
#include <linux/acpi.h>
#include <linux/module.h>
#include <linux/sysdev.h>
#include <linux/pci.h>
#include <linux/msi.h>
#include <linux/htirq.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/jiffies.h> /* time_after() */
[PATCH] x86/x86_64: deferred handling of writes to /proc/irqxx/smp_affinity When handling writes to /proc/irq, current code is re-programming rte entries directly. This is not recommended and could potentially cause chipset's to lockup, or cause missing interrupts. CONFIG_IRQ_BALANCE does this correctly, where it re-programs only when the interrupt is pending. The same needs to be done for /proc/irq handling as well. Otherwise user space irq balancers are really not doing the right thing. - Changed pending_irq_balance_cpumask to pending_irq_migrate_cpumask for lack of a generic name. - added move_irq out of IRQ_BALANCE, and added this same to X86_64 - Added new proc handler for write, so we can do deferred write at irq handling time. - Display of /proc/irq/XX/smp_affinity used to display CPU_MASKALL, instead it now shows only active cpu masks, or exactly what was set. - Provided a common move_irq implementation, instead of duplicating when using generic irq framework. Tested on i386/x86_64 and ia64 with CONFIG_PCI_MSI turned on and off. Tested UP builds as well. MSI testing: tbd: I have cards, need to look for a x-over cable, although I did test an earlier version of this patch. Will test in a couple days. Signed-off-by: Ashok Raj <ashok.raj@intel.com> Acked-by: Zwane Mwaikambo <zwane@holomorphy.com> Grudgingly-acked-by: Andi Kleen <ak@muc.de> Signed-off-by: Coywolf Qi Hunt <coywolf@lovecn.org> Signed-off-by: Ashok Raj <ashok.raj@intel.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-06 22:16:15 +00:00
#include <asm/io.h>
#include <asm/smp.h>
#include <asm/desc.h>
#include <asm/timer.h>
#include <asm/i8259.h>
#include <asm/nmi.h>
#include <asm/msidef.h>
#include <asm/hypertransport.h>
#include <mach_apic.h>
#include <mach_apicdef.h>
int (*ioapic_renumber_irq)(int ioapic, int irq);
atomic_t irq_mis_count;
/* Where if anywhere is the i8259 connect in external int mode */
static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
static DEFINE_SPINLOCK(ioapic_lock);
static DEFINE_SPINLOCK(vector_lock);
int timer_through_8259 __initdata;
/*
* Is the SiS APIC rmw bug present ?
* -1 = don't know, 0 = no, 1 = yes
*/
int sis_apic_bug = -1;
/*
* # of IRQ routing registers
*/
int nr_ioapic_registers[MAX_IO_APICS];
/* I/O APIC entries */
struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
int nr_ioapics;
/* MP IRQ source entries */
struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
/* # of MP IRQ source entries */
int mp_irq_entries;
#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
int mp_bus_id_to_type[MAX_MP_BUSSES];
#endif
DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
static int disable_timer_pin_1 __initdata;
/*
* Rough estimation of how many shared IRQs there are, can
* be changed anytime.
*/
#define MAX_PLUS_SHARED_IRQS NR_IRQS
#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
/*
* This is performance-critical, we want to do it O(1)
*
* the indexing order of this array favors 1:1 mappings
* between pins and IRQs.
*/
static struct irq_pin_list {
int apic, pin, next;
} irq_2_pin[PIN_MAP_SIZE];
struct io_apic {
unsigned int index;
unsigned int unused[3];
unsigned int data;
};
static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
{
return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
+ (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
}
static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
writel(reg, &io_apic->index);
return readl(&io_apic->data);
}
static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
{
struct io_apic __iomem *io_apic = io_apic_base(apic);
writel(reg, &io_apic->index);
writel(value, &io_apic->data);
}
/*
* Re-write a value: to be used for read-modify-write
* cycles where the read already set up the index register.
*
* Older SiS APIC requires we rewrite the index register
*/
static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
{
volatile struct io_apic __iomem *io_apic = io_apic_base(apic);
if (sis_apic_bug)
writel(reg, &io_apic->index);
writel(value, &io_apic->data);
}
union entry_union {
struct { u32 w1, w2; };
struct IO_APIC_route_entry entry;
};
static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
{
union entry_union eu;
unsigned long flags;
spin_lock_irqsave(&ioapic_lock, flags);
eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
spin_unlock_irqrestore(&ioapic_lock, flags);
return eu.entry;
}
/*
* When we write a new IO APIC routing entry, we need to write the high
* word first! If the mask bit in the low word is clear, we will enable
* the interrupt, and we need to make sure the entry is fully populated
* before that happens.
*/
static void
__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
union entry_union eu;
eu.entry = e;
io_apic_write(apic, 0x11 + 2*pin, eu.w2);
io_apic_write(apic, 0x10 + 2*pin, eu.w1);
}
static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
{
unsigned long flags;
spin_lock_irqsave(&ioapic_lock, flags);
__ioapic_write_entry(apic, pin, e);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
/*
* When we mask an IO APIC routing entry, we need to write the low
* word first, in order to set the mask bit before we change the
* high bits!
*/
static void ioapic_mask_entry(int apic, int pin)
{
unsigned long flags;
union entry_union eu = { .entry.mask = 1 };
spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic, 0x10 + 2*pin, eu.w1);
io_apic_write(apic, 0x11 + 2*pin, eu.w2);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
/*
* The common case is 1:1 IRQ<->pin mappings. Sometimes there are
* shared ISA-space IRQs, so we have to support them. We are super
* fast in the common case, and fast for shared ISA-space IRQs.
*/
static void add_pin_to_irq(unsigned int irq, int apic, int pin)
{
static int first_free_entry = NR_IRQS;
struct irq_pin_list *entry = irq_2_pin + irq;
while (entry->next)
entry = irq_2_pin + entry->next;
if (entry->pin != -1) {
entry->next = first_free_entry;
entry = irq_2_pin + entry->next;
if (++first_free_entry >= PIN_MAP_SIZE)
panic("io_apic.c: whoops");
}
entry->apic = apic;
entry->pin = pin;
}
/*
* Reroute an IRQ to a different pin.
*/
static void __init replace_pin_at_irq(unsigned int irq,
int oldapic, int oldpin,
int newapic, int newpin)
{
struct irq_pin_list *entry = irq_2_pin + irq;
while (1) {
if (entry->apic == oldapic && entry->pin == oldpin) {
entry->apic = newapic;
entry->pin = newpin;
}
if (!entry->next)
break;
entry = irq_2_pin + entry->next;
}
}
static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
{
struct irq_pin_list *entry = irq_2_pin + irq;
unsigned int pin, reg;
for (;;) {
pin = entry->pin;
if (pin == -1)
break;
reg = io_apic_read(entry->apic, 0x10 + pin*2);
reg &= ~disable;
reg |= enable;
io_apic_modify(entry->apic, 0x10 + pin*2, reg);
if (!entry->next)
break;
entry = irq_2_pin + entry->next;
}
}
/* mask = 1 */
static void __mask_IO_APIC_irq(unsigned int irq)
{
__modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
}
/* mask = 0 */
static void __unmask_IO_APIC_irq(unsigned int irq)
{
__modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
}
/* mask = 1, trigger = 0 */
static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
{
__modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
IO_APIC_REDIR_LEVEL_TRIGGER);
}
/* mask = 0, trigger = 1 */
static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
{
__modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
IO_APIC_REDIR_MASKED);
}
static void mask_IO_APIC_irq(unsigned int irq)
{
unsigned long flags;
spin_lock_irqsave(&ioapic_lock, flags);
__mask_IO_APIC_irq(irq);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
static void unmask_IO_APIC_irq(unsigned int irq)
{
unsigned long flags;
spin_lock_irqsave(&ioapic_lock, flags);
__unmask_IO_APIC_irq(irq);
spin_unlock_irqrestore(&ioapic_lock, flags);
}
static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
{
struct IO_APIC_route_entry entry;
/* Check delivery_mode to be sure we're not clearing an SMI pin */
entry = ioapic_read_entry(apic, pin);
if (entry.delivery_mode == dest_SMI)
return;
/*
* Disable it in the IO-APIC irq-routing table:
*/
ioapic_mask_entry(apic, pin);
}
static void clear_IO_APIC(void)
{
int apic, pin;
for (apic = 0; apic < nr_ioapics; apic++)
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
clear_IO_APIC_pin(apic, pin);
}
[PATCH] x86/x86_64: deferred handling of writes to /proc/irqxx/smp_affinity When handling writes to /proc/irq, current code is re-programming rte entries directly. This is not recommended and could potentially cause chipset's to lockup, or cause missing interrupts. CONFIG_IRQ_BALANCE does this correctly, where it re-programs only when the interrupt is pending. The same needs to be done for /proc/irq handling as well. Otherwise user space irq balancers are really not doing the right thing. - Changed pending_irq_balance_cpumask to pending_irq_migrate_cpumask for lack of a generic name. - added move_irq out of IRQ_BALANCE, and added this same to X86_64 - Added new proc handler for write, so we can do deferred write at irq handling time. - Display of /proc/irq/XX/smp_affinity used to display CPU_MASKALL, instead it now shows only active cpu masks, or exactly what was set. - Provided a common move_irq implementation, instead of duplicating when using generic irq framework. Tested on i386/x86_64 and ia64 with CONFIG_PCI_MSI turned on and off. Tested UP builds as well. MSI testing: tbd: I have cards, need to look for a x-over cable, although I did test an earlier version of this patch. Will test in a couple days. Signed-off-by: Ashok Raj <ashok.raj@intel.com> Acked-by: Zwane Mwaikambo <zwane@holomorphy.com> Grudgingly-acked-by: Andi Kleen <ak@muc.de> Signed-off-by: Coywolf Qi Hunt <coywolf@lovecn.org> Signed-off-by: Ashok Raj <ashok.raj@intel.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-06 22:16:15 +00:00
#ifdef CONFIG_SMP
static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
{
unsigned long flags;
int pin;
struct irq_pin_list *entry = irq_2_pin + irq;
unsigned int apicid_value;
[PATCH] x86/x86_64: deferred handling of writes to /proc/irqxx/smp_affinity When handling writes to /proc/irq, current code is re-programming rte entries directly. This is not recommended and could potentially cause chipset's to lockup, or cause missing interrupts. CONFIG_IRQ_BALANCE does this correctly, where it re-programs only when the interrupt is pending. The same needs to be done for /proc/irq handling as well. Otherwise user space irq balancers are really not doing the right thing. - Changed pending_irq_balance_cpumask to pending_irq_migrate_cpumask for lack of a generic name. - added move_irq out of IRQ_BALANCE, and added this same to X86_64 - Added new proc handler for write, so we can do deferred write at irq handling time. - Display of /proc/irq/XX/smp_affinity used to display CPU_MASKALL, instead it now shows only active cpu masks, or exactly what was set. - Provided a common move_irq implementation, instead of duplicating when using generic irq framework. Tested on i386/x86_64 and ia64 with CONFIG_PCI_MSI turned on and off. Tested UP builds as well. MSI testing: tbd: I have cards, need to look for a x-over cable, although I did test an earlier version of this patch. Will test in a couple days. Signed-off-by: Ashok Raj <ashok.raj@intel.com> Acked-by: Zwane Mwaikambo <zwane@holomorphy.com> Grudgingly-acked-by: Andi Kleen <ak@muc.de> Signed-off-by: Coywolf Qi Hunt <coywolf@lovecn.org> Signed-off-by: Ashok Raj <ashok.raj@intel.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-06 22:16:15 +00:00
cpumask_t tmp;
[PATCH] x86/x86_64: deferred handling of writes to /proc/irqxx/smp_affinity When handling writes to /proc/irq, current code is re-programming rte entries directly. This is not recommended and could potentially cause chipset's to lockup, or cause missing interrupts. CONFIG_IRQ_BALANCE does this correctly, where it re-programs only when the interrupt is pending. The same needs to be done for /proc/irq handling as well. Otherwise user space irq balancers are really not doing the right thing. - Changed pending_irq_balance_cpumask to pending_irq_migrate_cpumask for lack of a generic name. - added move_irq out of IRQ_BALANCE, and added this same to X86_64 - Added new proc handler for write, so we can do deferred write at irq handling time. - Display of /proc/irq/XX/smp_affinity used to display CPU_MASKALL, instead it now shows only active cpu masks, or exactly what was set. - Provided a common move_irq implementation, instead of duplicating when using generic irq framework. Tested on i386/x86_64 and ia64 with CONFIG_PCI_MSI turned on and off. Tested UP builds as well. MSI testing: tbd: I have cards, need to look for a x-over cable, although I did test an earlier version of this patch. Will test in a couple days. Signed-off-by: Ashok Raj <ashok.raj@intel.com> Acked-by: Zwane Mwaikambo <zwane@holomorphy.com> Grudgingly-acked-by: Andi Kleen <ak@muc.de> Signed-off-by: Coywolf Qi Hunt <coywolf@lovecn.org> Signed-off-by: Ashok Raj <ashok.raj@intel.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-06 22:16:15 +00:00
cpus_and(tmp, cpumask, cpu_online_map);
if (cpus_empty(tmp))
tmp = TARGET_CPUS;
cpus_and(cpumask, tmp, CPU_MASK_ALL);
apicid_value = cpu_mask_to_apicid(cpumask);
/* Prepare to do the io_apic_write */
apicid_value = apicid_value << 24;
spin_lock_irqsave(&ioapic_lock, flags);
for (;;) {
pin = entry->pin;
if (pin == -1)
break;
io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
if (!entry->next)
break;
entry = irq_2_pin + entry->next;
}
irq_desc[irq].affinity = cpumask;
spin_unlock_irqrestore(&ioapic_lock, flags);
}
#if defined(CONFIG_IRQBALANCE)
# include <asm/processor.h> /* kernel_thread() */
# include <linux/kernel_stat.h> /* kstat */
# include <linux/slab.h> /* kmalloc() */
# include <linux/timer.h>
#define IRQBALANCE_CHECK_ARCH -999
#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
#define BALANCED_IRQ_MORE_DELTA (HZ/10)
#define BALANCED_IRQ_LESS_DELTA (HZ)
static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
static int physical_balance __read_mostly;
static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
static struct irq_cpu_info {
unsigned long *last_irq;
unsigned long *irq_delta;
unsigned long irq;
} irq_cpu_data[NR_CPUS];
#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
#define IDLE_ENOUGH(cpu,now) \
(idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i)))
static cpumask_t balance_irq_affinity[NR_IRQS] = {
[0 ... NR_IRQS-1] = CPU_MASK_ALL
};
void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
{
balance_irq_affinity[irq] = mask;
}
static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
unsigned long now, int direction)
{
int search_idle = 1;
int cpu = curr_cpu;
goto inside;
do {
if (unlikely(cpu == curr_cpu))
search_idle = 0;
inside:
if (direction == 1) {
cpu++;
if (cpu >= NR_CPUS)
cpu = 0;
} else {
cpu--;
if (cpu == -1)
cpu = NR_CPUS-1;
}
} while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
(search_idle && !IDLE_ENOUGH(cpu, now)));
return cpu;
}
static inline void balance_irq(int cpu, int irq)
{
unsigned long now = jiffies;
cpumask_t allowed_mask;
unsigned int new_cpu;
if (irqbalance_disabled)
return;
cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
new_cpu = move(cpu, allowed_mask, now, 1);
if (cpu != new_cpu)
[PATCH] x86/x86_64: deferred handling of writes to /proc/irqxx/smp_affinity When handling writes to /proc/irq, current code is re-programming rte entries directly. This is not recommended and could potentially cause chipset's to lockup, or cause missing interrupts. CONFIG_IRQ_BALANCE does this correctly, where it re-programs only when the interrupt is pending. The same needs to be done for /proc/irq handling as well. Otherwise user space irq balancers are really not doing the right thing. - Changed pending_irq_balance_cpumask to pending_irq_migrate_cpumask for lack of a generic name. - added move_irq out of IRQ_BALANCE, and added this same to X86_64 - Added new proc handler for write, so we can do deferred write at irq handling time. - Display of /proc/irq/XX/smp_affinity used to display CPU_MASKALL, instead it now shows only active cpu masks, or exactly what was set. - Provided a common move_irq implementation, instead of duplicating when using generic irq framework. Tested on i386/x86_64 and ia64 with CONFIG_PCI_MSI turned on and off. Tested UP builds as well. MSI testing: tbd: I have cards, need to look for a x-over cable, although I did test an earlier version of this patch. Will test in a couple days. Signed-off-by: Ashok Raj <ashok.raj@intel.com> Acked-by: Zwane Mwaikambo <zwane@holomorphy.com> Grudgingly-acked-by: Andi Kleen <ak@muc.de> Signed-off-by: Coywolf Qi Hunt <coywolf@lovecn.org> Signed-off-by: Ashok Raj <ashok.raj@intel.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-06 22:16:15 +00:00
set_pending_irq(irq, cpumask_of_cpu(new_cpu));
}
static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
{
int i, j;
for_each_online_cpu(i) {
for (j = 0; j < NR_IRQS; j++) {
if (!irq_desc[j].action)
continue;
/* Is it a significant load ? */
if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
useful_load_threshold)
continue;
balance_irq(i, j);
}
}
balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
return;
}
static void do_irq_balance(void)
{
int i, j;
unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
unsigned long move_this_load = 0;
int max_loaded = 0, min_loaded = 0;
int load;
unsigned long useful_load_threshold = balanced_irq_interval + 10;
int selected_irq;
int tmp_loaded, first_attempt = 1;
unsigned long tmp_cpu_irq;
unsigned long imbalance = 0;
cpumask_t allowed_mask, target_cpu_mask, tmp;
for_each_possible_cpu(i) {
int package_index;
CPU_IRQ(i) = 0;
if (!cpu_online(i))
continue;
package_index = CPU_TO_PACKAGEINDEX(i);
for (j = 0; j < NR_IRQS; j++) {
unsigned long value_now, delta;
/* Is this an active IRQ or balancing disabled ? */
if (!irq_desc[j].action || irq_balancing_disabled(j))
continue;
if (package_index == i)
IRQ_DELTA(package_index, j) = 0;
/* Determine the total count per processor per IRQ */
value_now = (unsigned long) kstat_cpu(i).irqs[j];
/* Determine the activity per processor per IRQ */
delta = value_now - LAST_CPU_IRQ(i, j);
/* Update last_cpu_irq[][] for the next time */
LAST_CPU_IRQ(i, j) = value_now;
/* Ignore IRQs whose rate is less than the clock */
if (delta < useful_load_threshold)
continue;
/* update the load for the processor or package total */
IRQ_DELTA(package_index, j) += delta;
/* Keep track of the higher numbered sibling as well */
if (i != package_index)
CPU_IRQ(i) += delta;
/*
* We have sibling A and sibling B in the package
*
* cpu_irq[A] = load for cpu A + load for cpu B
* cpu_irq[B] = load for cpu B
*/
CPU_IRQ(package_index) += delta;
}
}
/* Find the least loaded processor package */
for_each_online_cpu(i) {
if (i != CPU_TO_PACKAGEINDEX(i))
continue;
if (min_cpu_irq > CPU_IRQ(i)) {
min_cpu_irq = CPU_IRQ(i);
min_loaded = i;
}
}
max_cpu_irq = ULONG_MAX;
tryanothercpu:
/*
* Look for heaviest loaded processor.
* We may come back to get the next heaviest loaded processor.
* Skip processors with trivial loads.
*/
tmp_cpu_irq = 0;
tmp_loaded = -1;
for_each_online_cpu(i) {
if (i != CPU_TO_PACKAGEINDEX(i))
continue;
if (max_cpu_irq <= CPU_IRQ(i))
continue;
if (tmp_cpu_irq < CPU_IRQ(i)) {
tmp_cpu_irq = CPU_IRQ(i);
tmp_loaded = i;
}
}
if (tmp_loaded == -1) {
/*
* In the case of small number of heavy interrupt sources,
* loading some of the cpus too much. We use Ingo's original
* approach to rotate them around.
*/
if (!first_attempt && imbalance >= useful_load_threshold) {
rotate_irqs_among_cpus(useful_load_threshold);
return;
}
goto not_worth_the_effort;
}
first_attempt = 0; /* heaviest search */
max_cpu_irq = tmp_cpu_irq; /* load */
max_loaded = tmp_loaded; /* processor */
imbalance = (max_cpu_irq - min_cpu_irq) / 2;
/*
* if imbalance is less than approx 10% of max load, then
* observe diminishing returns action. - quit
*/
if (imbalance < (max_cpu_irq >> 3))
goto not_worth_the_effort;
tryanotherirq:
/* if we select an IRQ to move that can't go where we want, then
* see if there is another one to try.
*/
move_this_load = 0;
selected_irq = -1;
for (j = 0; j < NR_IRQS; j++) {
/* Is this an active IRQ? */
if (!irq_desc[j].action)
continue;
if (imbalance <= IRQ_DELTA(max_loaded, j))
continue;
/* Try to find the IRQ that is closest to the imbalance
* without going over.
*/
if (move_this_load < IRQ_DELTA(max_loaded, j)) {
move_this_load = IRQ_DELTA(max_loaded, j);
selected_irq = j;
}
}
if (selected_irq == -1)
goto tryanothercpu;
imbalance = move_this_load;
/* For physical_balance case, we accumulated both load
* values in the one of the siblings cpu_irq[],
* to use the same code for physical and logical processors
* as much as possible.
*
* NOTE: the cpu_irq[] array holds the sum of the load for
* sibling A and sibling B in the slot for the lowest numbered
* sibling (A), _AND_ the load for sibling B in the slot for
* the higher numbered sibling.
*
* We seek the least loaded sibling by making the comparison
* (A+B)/2 vs B
*/
load = CPU_IRQ(min_loaded) >> 1;
for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) {
if (load > CPU_IRQ(j)) {
/* This won't change cpu_sibling_map[min_loaded] */
load = CPU_IRQ(j);
min_loaded = j;
}
}
cpus_and(allowed_mask,
cpu_online_map,
balance_irq_affinity[selected_irq]);
target_cpu_mask = cpumask_of_cpu(min_loaded);
cpus_and(tmp, target_cpu_mask, allowed_mask);
if (!cpus_empty(tmp)) {
/* mark for change destination */
[PATCH] x86/x86_64: deferred handling of writes to /proc/irqxx/smp_affinity When handling writes to /proc/irq, current code is re-programming rte entries directly. This is not recommended and could potentially cause chipset's to lockup, or cause missing interrupts. CONFIG_IRQ_BALANCE does this correctly, where it re-programs only when the interrupt is pending. The same needs to be done for /proc/irq handling as well. Otherwise user space irq balancers are really not doing the right thing. - Changed pending_irq_balance_cpumask to pending_irq_migrate_cpumask for lack of a generic name. - added move_irq out of IRQ_BALANCE, and added this same to X86_64 - Added new proc handler for write, so we can do deferred write at irq handling time. - Display of /proc/irq/XX/smp_affinity used to display CPU_MASKALL, instead it now shows only active cpu masks, or exactly what was set. - Provided a common move_irq implementation, instead of duplicating when using generic irq framework. Tested on i386/x86_64 and ia64 with CONFIG_PCI_MSI turned on and off. Tested UP builds as well. MSI testing: tbd: I have cards, need to look for a x-over cable, although I did test an earlier version of this patch. Will test in a couple days. Signed-off-by: Ashok Raj <ashok.raj@intel.com> Acked-by: Zwane Mwaikambo <zwane@holomorphy.com> Grudgingly-acked-by: Andi Kleen <ak@muc.de> Signed-off-by: Coywolf Qi Hunt <coywolf@lovecn.org> Signed-off-by: Ashok Raj <ashok.raj@intel.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-06 22:16:15 +00:00
set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
/* Since we made a change, come back sooner to
* check for more variation.
*/
balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
return;
}
goto tryanotherirq;
not_worth_the_effort:
/*
* if we did not find an IRQ to move, then adjust the time interval
* upward
*/
balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
return;
}
static int balanced_irq(void *unused)
{
int i;
unsigned long prev_balance_time = jiffies;
long time_remaining = balanced_irq_interval;
/* push everything to CPU 0 to give us a starting point. */
for (i = 0 ; i < NR_IRQS ; i++) {
irq_desc[i].pending_mask = cpumask_of_cpu(0);
[PATCH] x86/x86_64: deferred handling of writes to /proc/irqxx/smp_affinity When handling writes to /proc/irq, current code is re-programming rte entries directly. This is not recommended and could potentially cause chipset's to lockup, or cause missing interrupts. CONFIG_IRQ_BALANCE does this correctly, where it re-programs only when the interrupt is pending. The same needs to be done for /proc/irq handling as well. Otherwise user space irq balancers are really not doing the right thing. - Changed pending_irq_balance_cpumask to pending_irq_migrate_cpumask for lack of a generic name. - added move_irq out of IRQ_BALANCE, and added this same to X86_64 - Added new proc handler for write, so we can do deferred write at irq handling time. - Display of /proc/irq/XX/smp_affinity used to display CPU_MASKALL, instead it now shows only active cpu masks, or exactly what was set. - Provided a common move_irq implementation, instead of duplicating when using generic irq framework. Tested on i386/x86_64 and ia64 with CONFIG_PCI_MSI turned on and off. Tested UP builds as well. MSI testing: tbd: I have cards, need to look for a x-over cable, although I did test an earlier version of this patch. Will test in a couple days. Signed-off-by: Ashok Raj <ashok.raj@intel.com> Acked-by: Zwane Mwaikambo <zwane@holomorphy.com> Grudgingly-acked-by: Andi Kleen <ak@muc.de> Signed-off-by: Coywolf Qi Hunt <coywolf@lovecn.org> Signed-off-by: Ashok Raj <ashok.raj@intel.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-06 22:16:15 +00:00
set_pending_irq(i, cpumask_of_cpu(0));
}
set_freezable();
for ( ; ; ) {
time_remaining = schedule_timeout_interruptible(time_remaining);
try_to_freeze();
if (time_after(jiffies,
prev_balance_time+balanced_irq_interval)) {
[PATCH] i386 CPU hotplug (The i386 CPU hotplug patch provides infrastructure for some work which Pavel is doing as well as for ACPI S3 (suspend-to-RAM) work which Li Shaohua <shaohua.li@intel.com> is doing) The following provides i386 architecture support for safely unregistering and registering processors during runtime, updated for the current -mm tree. In order to avoid dumping cpu hotplug code into kernel/irq/* i dropped the cpu_online check in do_IRQ() by modifying fixup_irqs(). The difference being that on cpu offline, fixup_irqs() is called before we clear the cpu from cpu_online_map and a long delay in order to ensure that we never have any queued external interrupts on the APICs. There are additional changes to s390 and ppc64 to account for this change. 1) Add CONFIG_HOTPLUG_CPU 2) disable local APIC timer on dead cpus. 3) Disable preempt around irq balancing to prevent CPUs going down. 4) Print irq stats for all possible cpus. 5) Debugging check for interrupts on offline cpus. 6) Hacky fixup_irqs() to redirect irqs when cpus go off/online. 7) play_dead() for offline cpus to spin inside. 8) Handle offline cpus set in flush_tlb_others(). 9) Grab lock earlier in smp_call_function() to prevent CPUs going down. 10) Implement __cpu_disable() and __cpu_die(). 11) Enable local interrupts in cpu_enable() after fixup_irqs() 12) Don't fiddle with NMI on dead cpu, but leave intact on other cpus. 13) Program IRQ affinity whilst cpu is still in cpu_online_map on offline. Signed-off-by: Zwane Mwaikambo <zwane@linuxpower.ca> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-25 21:54:50 +00:00
preempt_disable();
do_irq_balance();
prev_balance_time = jiffies;
time_remaining = balanced_irq_interval;
[PATCH] i386 CPU hotplug (The i386 CPU hotplug patch provides infrastructure for some work which Pavel is doing as well as for ACPI S3 (suspend-to-RAM) work which Li Shaohua <shaohua.li@intel.com> is doing) The following provides i386 architecture support for safely unregistering and registering processors during runtime, updated for the current -mm tree. In order to avoid dumping cpu hotplug code into kernel/irq/* i dropped the cpu_online check in do_IRQ() by modifying fixup_irqs(). The difference being that on cpu offline, fixup_irqs() is called before we clear the cpu from cpu_online_map and a long delay in order to ensure that we never have any queued external interrupts on the APICs. There are additional changes to s390 and ppc64 to account for this change. 1) Add CONFIG_HOTPLUG_CPU 2) disable local APIC timer on dead cpus. 3) Disable preempt around irq balancing to prevent CPUs going down. 4) Print irq stats for all possible cpus. 5) Debugging check for interrupts on offline cpus. 6) Hacky fixup_irqs() to redirect irqs when cpus go off/online. 7) play_dead() for offline cpus to spin inside. 8) Handle offline cpus set in flush_tlb_others(). 9) Grab lock earlier in smp_call_function() to prevent CPUs going down. 10) Implement __cpu_disable() and __cpu_die(). 11) Enable local interrupts in cpu_enable() after fixup_irqs() 12) Don't fiddle with NMI on dead cpu, but leave intact on other cpus. 13) Program IRQ affinity whilst cpu is still in cpu_online_map on offline. Signed-off-by: Zwane Mwaikambo <zwane@linuxpower.ca> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-25 21:54:50 +00:00
preempt_enable();
}
}
return 0;
}
static int __init balanced_irq_init(void)
{
int i;
struct cpuinfo_x86 *c;
cpumask_t tmp;
cpus_shift_right(tmp, cpu_online_map, 2);
c = &boot_cpu_data;
/* When not overwritten by the command line ask subarchitecture. */
if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
irqbalance_disabled = NO_BALANCE_IRQ;
if (irqbalance_disabled)
return 0;
/* disable irqbalance completely if there is only one processor online */
if (num_online_cpus() < 2) {
irqbalance_disabled = 1;
return 0;
}
/*
* Enable physical balance only if more than 1 physical processor
* is present
*/
if (smp_num_siblings > 1 && !cpus_empty(tmp))
physical_balance = 1;
for_each_online_cpu(i) {
irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
printk(KERN_ERR "balanced_irq_init: out of memory");
goto failed;
}
}
printk(KERN_INFO "Starting balanced_irq\n");
if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
return 0;
printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
failed:
for_each_possible_cpu(i) {
kfree(irq_cpu_data[i].irq_delta);
irq_cpu_data[i].irq_delta = NULL;
kfree(irq_cpu_data[i].last_irq);
irq_cpu_data[i].last_irq = NULL;
}
return 0;
}
int __devinit irqbalance_disable(char *str)
{
irqbalance_disabled = 1;
return 1;
}
__setup("noirqbalance", irqbalance_disable);
late_initcall(balanced_irq_init);
#endif /* CONFIG_IRQBALANCE */
[PATCH] x86/x86_64: deferred handling of writes to /proc/irqxx/smp_affinity When handling writes to /proc/irq, current code is re-programming rte entries directly. This is not recommended and could potentially cause chipset's to lockup, or cause missing interrupts. CONFIG_IRQ_BALANCE does this correctly, where it re-programs only when the interrupt is pending. The same needs to be done for /proc/irq handling as well. Otherwise user space irq balancers are really not doing the right thing. - Changed pending_irq_balance_cpumask to pending_irq_migrate_cpumask for lack of a generic name. - added move_irq out of IRQ_BALANCE, and added this same to X86_64 - Added new proc handler for write, so we can do deferred write at irq handling time. - Display of /proc/irq/XX/smp_affinity used to display CPU_MASKALL, instead it now shows only active cpu masks, or exactly what was set. - Provided a common move_irq implementation, instead of duplicating when using generic irq framework. Tested on i386/x86_64 and ia64 with CONFIG_PCI_MSI turned on and off. Tested UP builds as well. MSI testing: tbd: I have cards, need to look for a x-over cable, although I did test an earlier version of this patch. Will test in a couple days. Signed-off-by: Ashok Raj <ashok.raj@intel.com> Acked-by: Zwane Mwaikambo <zwane@holomorphy.com> Grudgingly-acked-by: Andi Kleen <ak@muc.de> Signed-off-by: Coywolf Qi Hunt <coywolf@lovecn.org> Signed-off-by: Ashok Raj <ashok.raj@intel.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-06 22:16:15 +00:00
#endif /* CONFIG_SMP */
#ifndef CONFIG_SMP
void send_IPI_self(int vector)
{
unsigned int cfg;
/*
* Wait for idle.
*/
apic_wait_icr_idle();
cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
/*
* Send the IPI. The write to APIC_ICR fires this off.
*/
apic_write_around(APIC_ICR, cfg);
}
#endif /* !CONFIG_SMP */
/*
* support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
* specific CPU-side IRQs.
*/
#define MAX_PIRQS 8
static int pirq_entries [MAX_PIRQS];
static int pirqs_enabled;
int skip_ioapic_setup;
static int __init ioapic_pirq_setup(char *str)
{
int i, max;
int ints[MAX_PIRQS+1];
get_options(str, ARRAY_SIZE(ints), ints);
for (i = 0; i < MAX_PIRQS; i++)
pirq_entries[i] = -1;
pirqs_enabled = 1;
apic_printk(APIC_VERBOSE, KERN_INFO
"PIRQ redirection, working around broken MP-BIOS.\n");
max = MAX_PIRQS;
if (ints[0] < MAX_PIRQS)
max = ints[0];
for (i = 0; i < max; i++) {
apic_printk(APIC_VERBOSE, KERN_DEBUG
"... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
/*
* PIRQs are mapped upside down, usually.
*/
pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
}
return 1;
}
__setup("pirq=", ioapic_pirq_setup);
/*
* Find the IRQ entry number of a certain pin.
*/
static int find_irq_entry(int apic, int pin, int type)
{
int i;
for (i = 0; i < mp_irq_entries; i++)
if (mp_irqs[i].mp_irqtype == type &&
(mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
mp_irqs[i].mp_dstirq == pin)
return i;
return -1;
}
/*
* Find the pin to which IRQ[irq] (ISA) is connected
*/
static int __init find_isa_irq_pin(int irq, int type)
{
int i;
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
(mp_irqs[i].mp_irqtype == type) &&
(mp_irqs[i].mp_srcbusirq == irq))
return mp_irqs[i].mp_dstirq;
}
return -1;
}
static int __init find_isa_irq_apic(int irq, int type)
{
int i;
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].mp_srcbus;
if (test_bit(lbus, mp_bus_not_pci) &&
(mp_irqs[i].mp_irqtype == type) &&
(mp_irqs[i].mp_srcbusirq == irq))
break;
}
if (i < mp_irq_entries) {
int apic;
for (apic = 0; apic < nr_ioapics; apic++) {
if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
return apic;
}
}
return -1;
}
/*
* Find a specific PCI IRQ entry.
* Not an __init, possibly needed by modules
*/
static int pin_2_irq(int idx, int apic, int pin);
int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
{
int apic, i, best_guess = -1;
apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
"slot:%d, pin:%d.\n", bus, slot, pin);
if (test_bit(bus, mp_bus_not_pci)) {
printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
return -1;
}
for (i = 0; i < mp_irq_entries; i++) {
int lbus = mp_irqs[i].mp_srcbus;
for (apic = 0; apic < nr_ioapics; apic++)
if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
mp_irqs[i].mp_dstapic == MP_APIC_ALL)
break;
if (!test_bit(lbus, mp_bus_not_pci) &&
!mp_irqs[i].mp_irqtype &&
(bus == lbus) &&
(slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
if (!(apic || IO_APIC_IRQ(irq)))
continue;
if (pin == (mp_irqs[i].mp_srcbusirq & 3))
return irq;
/*
* Use the first all-but-pin matching entry as a
* best-guess fuzzy result for broken mptables.
*/
if (best_guess < 0)
best_guess = irq;
}
}
return best_guess;
}
EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
/*
* This function currently is only a helper for the i386 smp boot process where
* we need to reprogram the ioredtbls to cater for the cpus which have come online
* so mask in all cases should simply be TARGET_CPUS
*/
[PATCH] x86/x86_64: deferred handling of writes to /proc/irqxx/smp_affinity When handling writes to /proc/irq, current code is re-programming rte entries directly. This is not recommended and could potentially cause chipset's to lockup, or cause missing interrupts. CONFIG_IRQ_BALANCE does this correctly, where it re-programs only when the interrupt is pending. The same needs to be done for /proc/irq handling as well. Otherwise user space irq balancers are really not doing the right thing. - Changed pending_irq_balance_cpumask to pending_irq_migrate_cpumask for lack of a generic name. - added move_irq out of IRQ_BALANCE, and added this same to X86_64 - Added new proc handler for write, so we can do deferred write at irq handling time. - Display of /proc/irq/XX/smp_affinity used to display CPU_MASKALL, instead it now shows only active cpu masks, or exactly what was set. - Provided a common move_irq implementation, instead of duplicating when using generic irq framework. Tested on i386/x86_64 and ia64 with CONFIG_PCI_MSI turned on and off. Tested UP builds as well. MSI testing: tbd: I have cards, need to look for a x-over cable, although I did test an earlier version of this patch. Will test in a couple days. Signed-off-by: Ashok Raj <ashok.raj@intel.com> Acked-by: Zwane Mwaikambo <zwane@holomorphy.com> Grudgingly-acked-by: Andi Kleen <ak@muc.de> Signed-off-by: Coywolf Qi Hunt <coywolf@lovecn.org> Signed-off-by: Ashok Raj <ashok.raj@intel.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-06 22:16:15 +00:00
#ifdef CONFIG_SMP
void __init setup_ioapic_dest(void)
{
int pin, ioapic, irq, irq_entry;
if (skip_ioapic_setup == 1)
return;
for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
irq_entry = find_irq_entry(ioapic, pin, mp_INT);
if (irq_entry == -1)
continue;
irq = pin_2_irq(irq_entry, ioapic, pin);
set_ioapic_affinity_irq(irq, TARGET_CPUS);
}
}
}
[PATCH] x86/x86_64: deferred handling of writes to /proc/irqxx/smp_affinity When handling writes to /proc/irq, current code is re-programming rte entries directly. This is not recommended and could potentially cause chipset's to lockup, or cause missing interrupts. CONFIG_IRQ_BALANCE does this correctly, where it re-programs only when the interrupt is pending. The same needs to be done for /proc/irq handling as well. Otherwise user space irq balancers are really not doing the right thing. - Changed pending_irq_balance_cpumask to pending_irq_migrate_cpumask for lack of a generic name. - added move_irq out of IRQ_BALANCE, and added this same to X86_64 - Added new proc handler for write, so we can do deferred write at irq handling time. - Display of /proc/irq/XX/smp_affinity used to display CPU_MASKALL, instead it now shows only active cpu masks, or exactly what was set. - Provided a common move_irq implementation, instead of duplicating when using generic irq framework. Tested on i386/x86_64 and ia64 with CONFIG_PCI_MSI turned on and off. Tested UP builds as well. MSI testing: tbd: I have cards, need to look for a x-over cable, although I did test an earlier version of this patch. Will test in a couple days. Signed-off-by: Ashok Raj <ashok.raj@intel.com> Acked-by: Zwane Mwaikambo <zwane@holomorphy.com> Grudgingly-acked-by: Andi Kleen <ak@muc.de> Signed-off-by: Coywolf Qi Hunt <coywolf@lovecn.org> Signed-off-by: Ashok Raj <ashok.raj@intel.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-06 22:16:15 +00:00
#endif
#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
/*
* EISA Edge/Level control register, ELCR
*/
static int EISA_ELCR(unsigned int irq)
{
if (irq < 16) {
unsigned int port = 0x4d0 + (irq >> 3);
return (inb(port) >> (irq & 7)) & 1;
}
apic_printk(APIC_VERBOSE, KERN_INFO
"Broken MPtable reports ISA irq %d\n", irq);
return 0;
}
#endif
/* ISA interrupts are always polarity zero edge triggered,
* when listed as conforming in the MP table. */
#define default_ISA_trigger(idx) (0)
#define default_ISA_polarity(idx) (0)
/* EISA interrupts are always polarity zero and can be edge or level
* trigger depending on the ELCR value. If an interrupt is listed as
* EISA conforming in the MP table, that means its trigger type must
* be read in from the ELCR */
#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
#define default_EISA_polarity(idx) default_ISA_polarity(idx)
/* PCI interrupts are always polarity one level triggered,
* when listed as conforming in the MP table. */
#define default_PCI_trigger(idx) (1)
#define default_PCI_polarity(idx) (1)
/* MCA interrupts are always polarity zero level triggered,
* when listed as conforming in the MP table. */
#define default_MCA_trigger(idx) (1)
#define default_MCA_polarity(idx) default_ISA_polarity(idx)
static int MPBIOS_polarity(int idx)
{
int bus = mp_irqs[idx].mp_srcbus;
int polarity;
/*
* Determine IRQ line polarity (high active or low active):
*/
switch (mp_irqs[idx].mp_irqflag & 3) {
case 0: /* conforms, ie. bus-type dependent polarity */
{
polarity = test_bit(bus, mp_bus_not_pci)?
default_ISA_polarity(idx):
default_PCI_polarity(idx);
break;
}
case 1: /* high active */
{
polarity = 0;
break;
}
case 2: /* reserved */
{
printk(KERN_WARNING "broken BIOS!!\n");
polarity = 1;
break;
}
case 3: /* low active */
{
polarity = 1;
break;
}
default: /* invalid */
{
printk(KERN_WARNING "broken BIOS!!\n");
polarity = 1;
break;
}
}
return polarity;
}
static int MPBIOS_trigger(int idx)
{
int bus = mp_irqs[idx].mp_srcbus;
int trigger;
/*
* Determine IRQ trigger mode (edge or level sensitive):
*/
switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
case 0: /* conforms, ie. bus-type dependent */
{
trigger = test_bit(bus, mp_bus_not_pci)?
default_ISA_trigger(idx):
default_PCI_trigger(idx);
#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
switch (mp_bus_id_to_type[bus]) {
case MP_BUS_ISA: /* ISA pin */
{
/* set before the switch */
break;
}
case MP_BUS_EISA: /* EISA pin */
{
trigger = default_EISA_trigger(idx);
break;
}
case MP_BUS_PCI: /* PCI pin */
{
/* set before the switch */
break;
}
case MP_BUS_MCA: /* MCA pin */
{
trigger = default_MCA_trigger(idx);
break;
}
default:
{
printk(KERN_WARNING "broken BIOS!!\n");
trigger = 1;
break;
}
}
#endif
break;
}
case 1: /* edge */
{
trigger = 0;
break;
}
case 2: /* reserved */
{
printk(KERN_WARNING "broken BIOS!!\n");
trigger = 1;
break;
}
case 3: /* level */
{
trigger = 1;
break;
}
default: /* invalid */
{
printk(KERN_WARNING "broken BIOS!!\n");
trigger = 0;
break;
}
}
return trigger;
}
static inline int irq_polarity(int idx)
{
return MPBIOS_polarity(idx);
}
static inline int irq_trigger(int idx)
{
return MPBIOS_trigger(idx);
}
static int pin_2_irq(int idx, int apic, int pin)
{
int irq, i;
int bus = mp_irqs[idx].mp_srcbus;
/*
* Debugging check, we are in big trouble if this message pops up!
*/
if (mp_irqs[idx].mp_dstirq != pin)
printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
if (test_bit(bus, mp_bus_not_pci))
irq = mp_irqs[idx].mp_srcbusirq;
else {
/*
* PCI IRQs are mapped in order
*/
i = irq = 0;
while (i < apic)
irq += nr_ioapic_registers[i++];
irq += pin;
/*
* For MPS mode, so far only needed by ES7000 platform
*/
if (ioapic_renumber_irq)
irq = ioapic_renumber_irq(apic, irq);
}
/*
* PCI IRQ command line redirection. Yes, limits are hardcoded.
*/
if ((pin >= 16) && (pin <= 23)) {
if (pirq_entries[pin-16] != -1) {
if (!pirq_entries[pin-16]) {
apic_printk(APIC_VERBOSE, KERN_DEBUG
"disabling PIRQ%d\n", pin-16);
} else {
irq = pirq_entries[pin-16];
apic_printk(APIC_VERBOSE, KERN_DEBUG
"using PIRQ%d -> IRQ %d\n",
pin-16, irq);
}
}
}
return irq;
}
static inline int IO_APIC_irq_trigger(int irq)
{
int apic, idx, pin;
for (apic = 0; apic < nr_ioapics; apic++) {
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
idx = find_irq_entry(apic, pin, mp_INT);
if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
return irq_trigger(idx);
}
}
/*
* nonexistent IRQs are edge default
*/
return 0;
}
/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */
static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
static int __assign_irq_vector(int irq)
{
static int current_vector = FIRST_DEVICE_VECTOR, current_offset;
int vector, offset;
BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
if (irq_vector[irq] > 0)
return irq_vector[irq];
vector = current_vector;
[PATCH] i386: In assign_irq_vector look at all vectors before giving up When the world was a simple and static place setting up irqs was easy. It sufficed to allocate a linux irq number and a find a free cpu vector we could receive that linux irq on. In those days it was a safe assumption that any allocated vector was actually in use so after one global pass through all of the vectors we would have none left. These days things are much more dynamic with interrupt controllers (in the form of MSI or MSI-X) appearing on plug in cards and linux irqs appearing and disappearing. As these irqs come and go vectors are allocated and freed, invalidating the ancient assumption that all allocated vectors stayed in use forever. So this patch modifies the vector allocator to walk through every possible vector before giving up, and to check to see if a vector is in use before assigning it. With these changes we stop leaking freed vectors and it becomes possible to allocate and free irq vectors all day long. This changed was modeled after the vector allocator on x86_64 where this limitation has already been removed. In essence we don't update the static variables that hold the position of the last vector we allocated until have successfully allocated another vector. This allows us to detect if we have completed one complete scan through all of the possible vectors. Acked-by: Auke Kok <auke-jan.h.kok@intel.com> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-01-29 20:19:05 +00:00
offset = current_offset;
next:
vector += 8;
if (vector >= first_system_vector) {
[PATCH] i386: In assign_irq_vector look at all vectors before giving up When the world was a simple and static place setting up irqs was easy. It sufficed to allocate a linux irq number and a find a free cpu vector we could receive that linux irq on. In those days it was a safe assumption that any allocated vector was actually in use so after one global pass through all of the vectors we would have none left. These days things are much more dynamic with interrupt controllers (in the form of MSI or MSI-X) appearing on plug in cards and linux irqs appearing and disappearing. As these irqs come and go vectors are allocated and freed, invalidating the ancient assumption that all allocated vectors stayed in use forever. So this patch modifies the vector allocator to walk through every possible vector before giving up, and to check to see if a vector is in use before assigning it. With these changes we stop leaking freed vectors and it becomes possible to allocate and free irq vectors all day long. This changed was modeled after the vector allocator on x86_64 where this limitation has already been removed. In essence we don't update the static variables that hold the position of the last vector we allocated until have successfully allocated another vector. This allows us to detect if we have completed one complete scan through all of the possible vectors. Acked-by: Auke Kok <auke-jan.h.kok@intel.com> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-01-29 20:19:05 +00:00
offset = (offset + 1) % 8;
vector = FIRST_DEVICE_VECTOR + offset;
}
if (vector == current_vector)
return -ENOSPC;
if (test_and_set_bit(vector, used_vectors))
[PATCH] i386: In assign_irq_vector look at all vectors before giving up When the world was a simple and static place setting up irqs was easy. It sufficed to allocate a linux irq number and a find a free cpu vector we could receive that linux irq on. In those days it was a safe assumption that any allocated vector was actually in use so after one global pass through all of the vectors we would have none left. These days things are much more dynamic with interrupt controllers (in the form of MSI or MSI-X) appearing on plug in cards and linux irqs appearing and disappearing. As these irqs come and go vectors are allocated and freed, invalidating the ancient assumption that all allocated vectors stayed in use forever. So this patch modifies the vector allocator to walk through every possible vector before giving up, and to check to see if a vector is in use before assigning it. With these changes we stop leaking freed vectors and it becomes possible to allocate and free irq vectors all day long. This changed was modeled after the vector allocator on x86_64 where this limitation has already been removed. In essence we don't update the static variables that hold the position of the last vector we allocated until have successfully allocated another vector. This allows us to detect if we have completed one complete scan through all of the possible vectors. Acked-by: Auke Kok <auke-jan.h.kok@intel.com> Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-01-29 20:19:05 +00:00
goto next;
current_vector = vector;
current_offset = offset;
irq_vector[irq] = vector;
return vector;
}
static int assign_irq_vector(int irq)
{
unsigned long flags;
int vector;
spin_lock_irqsave(&vector_lock, flags);
vector = __assign_irq_vector(irq);
[PATCH] x86_64: fix vector_lock deadlock in io_apic.c Fix a potential deadlock scenario introduced by io_apic.c's new vector_lock on i386 and x86_64. Found by the locking correctness validator. The patch was boot-tested on x86. For details of the deadlock scenario, see the validator output: ====================================================== [ BUG: hard-safe -> hard-unsafe lock order detected! ] ------------------------------------------------------ idle/1 [HC0[0]:SC0[0]:HE0:SE1] is trying to acquire: (msi_lock){....}, at: [<c04ff8d2>] startup_msi_irq_wo_maskbit+0x10/0x35 and this task is already holding: (&irq_desc[i].lock){++..}, at: [<c015b924>] probe_irq_on+0x36/0x107 which would create a new lock dependency: (&irq_desc[i].lock){++..} -> (msi_lock){....} but this new dependency connects a hard-irq-safe lock: (&irq_desc[i].lock){++..} ... which became hard-irq-safe at: [<c01468c4>] lockdep_acquire+0x68/0x84 [<c10485e9>] _spin_lock+0x21/0x2f [<c015aff5>] __do_IRQ+0x3d/0x113 [<c01062d3>] do_IRQ+0x8c/0xad to a hard-irq-unsafe lock: (vector_lock){--..} ... which became hard-irq-unsafe at: ... [<c01468c4>] lockdep_acquire+0x68/0x84 [<c10485e9>] _spin_lock+0x21/0x2f [<c011b5e8>] assign_irq_vector+0x34/0xc8 [<c1aa82fa>] setup_IO_APIC+0x45a/0xcff [<c1aa56e3>] smp_prepare_cpus+0x5ea/0x8aa [<c010033f>] init+0x32/0x2cb [<c0102005>] kernel_thread_helper+0x5/0xb which could potentially lead to deadlocks! other info that might help us debug this: 3 locks held by idle/1: #0: (port_mutex){--..}, at: [<c067070d>] uart_add_one_port+0x61/0x289 #1: (&state->mutex){--..}, at: [<c067071f>] uart_add_one_port+0x73/0x289 #2: (&irq_desc[i].lock){++..}, at: [<c015b924>] probe_irq_on+0x36/0x107 the hard-irq-safe lock's dependencies: -> (&irq_desc[i].lock){++..} ops: 9861 { initial-use at: [<c01468c4>] lockdep_acquire+0x68/0x84 [<c10487f4>] _spin_lock_irqsave+0x2a/0x3a [<c015b415>] setup_irq+0x9b/0x14d [<c1aaa4c4>] time_init_hook+0xf/0x11 [<c1a9f320>] time_init+0x44/0x46 [<c1a9955f>] start_kernel+0x191/0x38f [<c0100210>] 0xc0100210 in-hardirq-W at: [<c01468c4>] lockdep_acquire+0x68/0x84 [<c10485e9>] _spin_lock+0x21/0x2f [<c015aff5>] __do_IRQ+0x3d/0x113 [<c01062d3>] do_IRQ+0x8c/0xad in-softirq-W at: [<c01468c4>] lockdep_acquire+0x68/0x84 [<c10485e9>] _spin_lock+0x21/0x2f [<c015aff5>] __do_IRQ+0x3d/0x113 [<c01062d3>] do_IRQ+0x8c/0xad } ... key at: [<c1ea31e0>] irq_desc_lock_type+0x0/0x20 -> (i8259A_lock){++..} ops: 5149 { initial-use at: [<c01468c4>] lockdep_acquire+0x68/0x84 [<c10487f4>] _spin_lock_irqsave+0x2a/0x3a [<c0108090>] init_8259A+0x11/0x8f [<c1aa0d22>] init_ISA_irqs+0x12/0x4d [<c1aaa4f0>] pre_intr_init_hook+0x8/0xa [<c1aa0cb9>] init_IRQ+0xe/0x65 [<c1a99546>] start_kernel+0x178/0x38f [<c0100210>] 0xc0100210 in-hardirq-W at: [<c01468c4>] lockdep_acquire+0x68/0x84 [<c10487f4>] _spin_lock_irqsave+0x2a/0x3a [<c0107fb0>] mask_and_ack_8259A+0x1b/0xcc [<c015b007>] __do_IRQ+0x4f/0x113 [<c01062d3>] do_IRQ+0x8c/0xad in-softirq-W at: [<c01468c4>] lockdep_acquire+0x68/0x84 [<c10487f4>] _spin_lock_irqsave+0x2a/0x3a [<c0107fb0>] mask_and_ack_8259A+0x1b/0xcc [<c015b007>] __do_IRQ+0x4f/0x113 [<c01062d3>] do_IRQ+0x8c/0xad } ... key at: [<c142f174>] i8259A_lock+0x14/0x40 ... acquired at: [<c01468c4>] lockdep_acquire+0x68/0x84 [<c10487f4>] _spin_lock_irqsave+0x2a/0x3a [<c0107eb2>] enable_8259A_irq+0x10/0x47 [<c0107f12>] startup_8259A_irq+0x8/0xc [<c015b45e>] setup_irq+0xe4/0x14d [<c1aaa4c4>] time_init_hook+0xf/0x11 [<c1a9f320>] time_init+0x44/0x46 [<c1a9955f>] start_kernel+0x191/0x38f [<c0100210>] 0xc0100210 -> (ioapic_lock){+...} ops: 122 { initial-use at: [<c01468c4>] lockdep_acquire+0x68/0x84 [<c10487f4>] _spin_lock_irqsave+0x2a/0x3a [<c1aa71db>] io_apic_get_version+0x16/0x55 [<c1aa5c73>] mp_register_ioapic+0xc6/0x127 [<c1aa382e>] acpi_parse_ioapic+0x2d/0x39 [<c1abe031>] acpi_table_parse_madt_family+0xb4/0x100 [<c1abe093>] acpi_table_parse_madt+0x16/0x18 [<c1aa3c8a>] acpi_boot_init+0x132/0x251 [<c1aa08ea>] setup_arch+0xd36/0xe37 [<c1a99434>] start_kernel+0x66/0x38f [<c0100210>] 0xc0100210 in-hardirq-W at: [<c01468c4>] lockdep_acquire+0x68/0x84 [<c10487f4>] _spin_lock_irqsave+0x2a/0x3a [<c011bce1>] mask_IO_APIC_irq+0x11/0x31 [<c011c5cc>] ack_edge_ioapic_vector+0x31/0x41 [<c015b007>] __do_IRQ+0x4f/0x113 [<c01062d3>] do_IRQ+0x8c/0xad } ... key at: [<c1432514>] ioapic_lock+0x14/0x3c -> (i8259A_lock){++..} ops: 5149 { initial-use at: [<c01468c4>] lockdep_acquire+0x68/0x84 [<c10487f4>] _spin_lock_irqsave+0x2a/0x3a [<c0108090>] init_8259A+0x11/0x8f [<c1aa0d22>] init_ISA_irqs+0x12/0x4d [<c1aaa4f0>] pre_intr_init_hook+0x8/0xa [<c1aa0cb9>] init_IRQ+0xe/0x65 [<c1a99546>] start_kernel+0x178/0x38f [<c0100210>] 0xc0100210 in-hardirq-W at: [<c01468c4>] lockdep_acquire+0x68/0x84 [<c10487f4>] _spin_lock_irqsave+0x2a/0x3a [<c0107fb0>] mask_and_ack_8259A+0x1b/0xcc [<c015b007>] __do_IRQ+0x4f/0x113 [<c01062d3>] do_IRQ+0x8c/0xad in-softirq-W at: [<c01468c4>] lockdep_acquire+0x68/0x84 [<c10487f4>] _spin_lock_irqsave+0x2a/0x3a [<c0107fb0>] mask_and_ack_8259A+0x1b/0xcc [<c015b007>] __do_IRQ+0x4f/0x113 [<c01062d3>] do_IRQ+0x8c/0xad } ... key at: [<c142f174>] i8259A_lock+0x14/0x40 ... acquired at: [<c01468c4>] lockdep_acquire+0x68/0x84 [<c10487f4>] _spin_lock_irqsave+0x2a/0x3a [<c0107e6b>] disable_8259A_irq+0x10/0x47 [<c011bdbd>] startup_edge_ioapic_vector+0x31/0x58 [<c015b45e>] setup_irq+0xe4/0x14d [<c015b5a1>] request_irq+0xda/0xf9 [<c1ac983a>] rtc_init+0x6a/0x1a7 [<c0100457>] init+0x14a/0x2cb [<c0102005>] kernel_thread_helper+0x5/0xb ... acquired at: [<c01468c4>] lockdep_acquire+0x68/0x84 [<c10487f4>] _spin_lock_irqsave+0x2a/0x3a [<c011bce1>] mask_IO_APIC_irq+0x11/0x31 [<c011c5cc>] ack_edge_ioapic_vector+0x31/0x41 [<c015b007>] __do_IRQ+0x4f/0x113 [<c01062d3>] do_IRQ+0x8c/0xad the hard-irq-unsafe lock's dependencies: -> (vector_lock){--..} ops: 31 { initial-use at: [<c01468c4>] lockdep_acquire+0x68/0x84 [<c10485e9>] _spin_lock+0x21/0x2f [<c011b5e8>] assign_irq_vector+0x34/0xc8 [<c1aa82fa>] setup_IO_APIC+0x45a/0xcff [<c1aa56e3>] smp_prepare_cpus+0x5ea/0x8aa [<c010033f>] init+0x32/0x2cb [<c0102005>] kernel_thread_helper+0x5/0xb softirq-on-W at: [<c01468c4>] lockdep_acquire+0x68/0x84 [<c10485e9>] _spin_lock+0x21/0x2f [<c011b5e8>] assign_irq_vector+0x34/0xc8 [<c1aa82fa>] setup_IO_APIC+0x45a/0xcff [<c1aa56e3>] smp_prepare_cpus+0x5ea/0x8aa [<c010033f>] init+0x32/0x2cb [<c0102005>] kernel_thread_helper+0x5/0xb hardirq-on-W at: [<c01468c4>] lockdep_acquire+0x68/0x84 [<c10485e9>] _spin_lock+0x21/0x2f [<c011b5e8>] assign_irq_vector+0x34/0xc8 [<c1aa82fa>] setup_IO_APIC+0x45a/0xcff [<c1aa56e3>] smp_prepare_cpus+0x5ea/0x8aa [<c010033f>] init+0x32/0x2cb [<c0102005>] kernel_thread_helper+0x5/0xb } ... key at: [<c1432574>] vector_lock+0x14/0x3c stack backtrace: [<c0104f36>] show_trace+0xd/0xf [<c010543e>] dump_stack+0x17/0x19 [<c0144e34>] check_usage+0x1f6/0x203 [<c0146395>] __lockdep_acquire+0x8c2/0xaa5 [<c01468c4>] lockdep_acquire+0x68/0x84 [<c10487f4>] _spin_lock_irqsave+0x2a/0x3a [<c04ff8d2>] startup_msi_irq_wo_maskbit+0x10/0x35 [<c015b932>] probe_irq_on+0x44/0x107 [<c0673d58>] serial8250_config_port+0x84b/0x986 [<c06707b1>] uart_add_one_port+0x105/0x289 [<c1ace54b>] serial8250_init+0xc3/0x10a [<c0100457>] init+0x14a/0x2cb [<c0102005>] kernel_thread_helper+0x5/0xb Signed-off-by: Ingo Molnar <mingo@elte.hu> Cc: Jan Beulich <jbeulich@novell.com> Cc: Andi Kleen <ak@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Andi Kleen <ak@suse.de> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-26 11:57:16 +00:00
spin_unlock_irqrestore(&vector_lock, flags);
return vector;
}
void setup_vector_irq(int cpu)
{
}
static struct irq_chip ioapic_chip;
#define IOAPIC_AUTO -1
#define IOAPIC_EDGE 0
#define IOAPIC_LEVEL 1
[PATCH] genirq: rename desc->handler to desc->chip This patch-queue improves the generic IRQ layer to be truly generic, by adding various abstractions and features to it, without impacting existing functionality. While the queue can be best described as "fix and improve everything in the generic IRQ layer that we could think of", and thus it consists of many smaller features and lots of cleanups, the one feature that stands out most is the new 'irq chip' abstraction. The irq-chip abstraction is about describing and coding and IRQ controller driver by mapping its raw hardware capabilities [and quirks, if needed] in a straightforward way, without having to think about "IRQ flow" (level/edge/etc.) type of details. This stands in contrast with the current 'irq-type' model of genirq architectures, which 'mixes' raw hardware capabilities with 'flow' details. The patchset supports both types of irq controller designs at once, and converts i386 and x86_64 to the new irq-chip design. As a bonus side-effect of the irq-chip approach, chained interrupt controllers (master/slave PIC constructs, etc.) are now supported by design as well. The end result of this patchset intends to be simpler architecture-level code and more consolidation between architectures. We reused many bits of code and many concepts from Russell King's ARM IRQ layer, the merging of which was one of the motivations for this patchset. This patch: rename desc->handler to desc->chip. Originally i did not want to do this, because it's a big patch. But having both "desc->handler", "desc->handle_irq" and "action->handler" caused a large degree of confusion and made the code appear alot less clean than it truly is. I have also attempted a dual approach as well by introducing a desc->chip alias - but that just wasnt robust enough and broke frequently. So lets get over with this quickly. The conversion was done automatically via scripts and converts all the code in the kernel. This renaming patch is the first one amongst the patches, so that the remaining patches can stay flexible and can be merged and split up without having some big monolithic patch act as a merge barrier. [akpm@osdl.org: build fix] [akpm@osdl.org: another build fix] Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-29 09:24:36 +00:00
static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
{
if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
trigger == IOAPIC_LEVEL) {
irq_desc[irq].status |= IRQ_LEVEL;
set_irq_chip_and_handler_name(irq, &ioapic_chip,
handle_fasteoi_irq, "fasteoi");
} else {
irq_desc[irq].status &= ~IRQ_LEVEL;
set_irq_chip_and_handler_name(irq, &ioapic_chip,
handle_edge_irq, "edge");
}
set_intr_gate(vector, interrupt[irq]);
}
static void __init setup_IO_APIC_irqs(void)
{
struct IO_APIC_route_entry entry;
int apic, pin, idx, irq, first_notcon = 1, vector;
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
for (apic = 0; apic < nr_ioapics; apic++) {
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
/*
* add it to the IO-APIC irq-routing table:
*/
memset(&entry, 0, sizeof(entry));
entry.delivery_mode = INT_DELIVERY_MODE;
entry.dest_mode = INT_DEST_MODE;
entry.mask = 0; /* enable IRQ */
entry.dest.logical.logical_dest =
cpu_mask_to_apicid(TARGET_CPUS);
idx = find_irq_entry(apic, pin, mp_INT);
if (idx == -1) {
if (first_notcon) {
apic_printk(APIC_VERBOSE, KERN_DEBUG
" IO-APIC (apicid-pin) %d-%d",
mp_ioapics[apic].mp_apicid,
pin);
first_notcon = 0;
} else
apic_printk(APIC_VERBOSE, ", %d-%d",
mp_ioapics[apic].mp_apicid, pin);
continue;
}
if (!first_notcon) {
apic_printk(APIC_VERBOSE, " not connected.\n");
first_notcon = 1;
}
entry.trigger = irq_trigger(idx);
entry.polarity = irq_polarity(idx);
if (irq_trigger(idx)) {
entry.trigger = 1;
entry.mask = 1;
}
irq = pin_2_irq(idx, apic, pin);
/*
* skip adding the timer int on secondary nodes, which causes
* a small but painful rift in the time-space continuum
*/
if (multi_timer_check(apic, irq))
continue;
else
add_pin_to_irq(irq, apic, pin);
if (!apic && !IO_APIC_IRQ(irq))
continue;
if (IO_APIC_IRQ(irq)) {
vector = assign_irq_vector(irq);
entry.vector = vector;
ioapic_register_intr(irq, vector, IOAPIC_AUTO);
if (!apic && (irq < 16))
disable_8259A_irq(irq);
}
ioapic_write_entry(apic, pin, entry);
}
}
if (!first_notcon)
apic_printk(APIC_VERBOSE, " not connected.\n");
}
/*
* Set up the timer pin, possibly with the 8259A-master behind.
*/
static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
int vector)
{
struct IO_APIC_route_entry entry;
memset(&entry, 0, sizeof(entry));
/*
* We use logical delivery to get the timer IRQ
* to the first CPU.
*/
entry.dest_mode = INT_DEST_MODE;
entry.mask = 1; /* mask IRQ now */
entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
entry.delivery_mode = INT_DELIVERY_MODE;
entry.polarity = 0;
entry.trigger = 0;
entry.vector = vector;
/*
* The timer IRQ doesn't have to know that behind the
* scene we may have a 8259A-master in AEOI mode ...
*/
ioapic_register_intr(0, vector, IOAPIC_EDGE);
/*
* Add it to the IO-APIC irq-routing table:
*/
ioapic_write_entry(apic, pin, entry);
}
void __init print_IO_APIC(void)
{
int apic, i;
union IO_APIC_reg_00 reg_00;
union IO_APIC_reg_01 reg_01;
union IO_APIC_reg_02 reg_02;
union IO_APIC_reg_03 reg_03;
unsigned long flags;
if (apic_verbosity == APIC_QUIET)
return;
printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
for (i = 0; i < nr_ioapics; i++)
printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
/*
* We are a bit conservative about what we expect. We have to
* know about every hardware change ASAP.
*/
printk(KERN_INFO "testing the IO APIC.......................\n");
for (apic = 0; apic < nr_ioapics; apic++) {
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic, 0);
reg_01.raw = io_apic_read(apic, 1);
if (reg_01.bits.version >= 0x10)
reg_02.raw = io_apic_read(apic, 2);
if (reg_01.bits.version >= 0x20)
reg_03.raw = io_apic_read(apic, 3);
spin_unlock_irqrestore(&ioapic_lock, flags);
printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
/*
* Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
* but the value of reg_02 is read as the previous read register
* value, so ignore it if reg_02 == reg_01.
*/
if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
}
/*
* Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
* or reg_03, but the value of reg_0[23] is read as the previous read
* register value, so ignore it if reg_03 == reg_0[12].
*/
if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
reg_03.raw != reg_01.raw) {
printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
}
printk(KERN_DEBUG ".... IRQ redirection table:\n");
printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
" Stat Dest Deli Vect: \n");
for (i = 0; i <= reg_01.bits.entries; i++) {
struct IO_APIC_route_entry entry;
entry = ioapic_read_entry(apic, i);
printk(KERN_DEBUG " %02x %03X %02X ",
i,
entry.dest.logical.logical_dest,
entry.dest.physical.physical_dest
);
printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
entry.mask,
entry.trigger,
entry.irr,
entry.polarity,
entry.delivery_status,
entry.dest_mode,
entry.delivery_mode,
entry.vector
);
}
}
printk(KERN_DEBUG "IRQ to pin mappings:\n");
for (i = 0; i < NR_IRQS; i++) {
struct irq_pin_list *entry = irq_2_pin + i;
if (entry->pin < 0)
continue;
printk(KERN_DEBUG "IRQ%d ", i);
for (;;) {
printk("-> %d:%d", entry->apic, entry->pin);
if (!entry->next)
break;
entry = irq_2_pin + entry->next;
}
printk("\n");
}
printk(KERN_INFO ".................................... done.\n");
return;
}
#if 0
static void print_APIC_bitfield(int base)
{
unsigned int v;
int i, j;
if (apic_verbosity == APIC_QUIET)
return;
printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
for (i = 0; i < 8; i++) {
v = apic_read(base + i*0x10);
for (j = 0; j < 32; j++) {
if (v & (1<<j))
printk("1");
else
printk("0");
}
printk("\n");
}
}
void /*__init*/ print_local_APIC(void *dummy)
{
unsigned int v, ver, maxlvt;
if (apic_verbosity == APIC_QUIET)
return;
printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
smp_processor_id(), hard_smp_processor_id());
v = apic_read(APIC_ID);
printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
GET_APIC_ID(read_apic_id()));
v = apic_read(APIC_LVR);
printk(KERN_INFO "... APIC VERSION: %08x\n", v);
ver = GET_APIC_VERSION(v);
maxlvt = lapic_get_maxlvt();
v = apic_read(APIC_TASKPRI);
printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
if (APIC_INTEGRATED(ver)) { /* !82489DX */
v = apic_read(APIC_ARBPRI);
printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
v & APIC_ARBPRI_MASK);
v = apic_read(APIC_PROCPRI);
printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
}
v = apic_read(APIC_EOI);
printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
v = apic_read(APIC_RRR);
printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
v = apic_read(APIC_LDR);
printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
v = apic_read(APIC_DFR);
printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
v = apic_read(APIC_SPIV);
printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
printk(KERN_DEBUG "... APIC ISR field:\n");
print_APIC_bitfield(APIC_ISR);
printk(KERN_DEBUG "... APIC TMR field:\n");
print_APIC_bitfield(APIC_TMR);
printk(KERN_DEBUG "... APIC IRR field:\n");
print_APIC_bitfield(APIC_IRR);
if (APIC_INTEGRATED(ver)) { /* !82489DX */
if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
apic_write(APIC_ESR, 0);
v = apic_read(APIC_ESR);
printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
}
v = apic_read(APIC_ICR);
printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
v = apic_read(APIC_ICR2);
printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
v = apic_read(APIC_LVTT);
printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
if (maxlvt > 3) { /* PC is LVT#4. */
v = apic_read(APIC_LVTPC);
printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
}
v = apic_read(APIC_LVT0);
printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
v = apic_read(APIC_LVT1);
printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
if (maxlvt > 2) { /* ERR is LVT#3. */
v = apic_read(APIC_LVTERR);
printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
}
v = apic_read(APIC_TMICT);
printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
v = apic_read(APIC_TMCCT);
printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
v = apic_read(APIC_TDCR);
printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
printk("\n");
}
void print_all_local_APICs(void)
{
on_each_cpu(print_local_APIC, NULL, 1);
}
void /*__init*/ print_PIC(void)
{
unsigned int v;
unsigned long flags;
if (apic_verbosity == APIC_QUIET)
return;
printk(KERN_DEBUG "\nprinting PIC contents\n");
spin_lock_irqsave(&i8259A_lock, flags);
v = inb(0xa1) << 8 | inb(0x21);
printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
v = inb(0xa0) << 8 | inb(0x20);
printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
outb(0x0b, 0xa0);
outb(0x0b, 0x20);
v = inb(0xa0) << 8 | inb(0x20);
outb(0x0a, 0xa0);
outb(0x0a, 0x20);
spin_unlock_irqrestore(&i8259A_lock, flags);
printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
v = inb(0x4d1) << 8 | inb(0x4d0);
printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
}
#endif /* 0 */
static void __init enable_IO_APIC(void)
{
union IO_APIC_reg_01 reg_01;
int i8259_apic, i8259_pin;
int i, apic;
unsigned long flags;
for (i = 0; i < PIN_MAP_SIZE; i++) {
irq_2_pin[i].pin = -1;
irq_2_pin[i].next = 0;
}
if (!pirqs_enabled)
for (i = 0; i < MAX_PIRQS; i++)
pirq_entries[i] = -1;
/*
* The number of IO-APIC IRQ registers (== #pins):
*/
for (apic = 0; apic < nr_ioapics; apic++) {
spin_lock_irqsave(&ioapic_lock, flags);
reg_01.raw = io_apic_read(apic, 1);
spin_unlock_irqrestore(&ioapic_lock, flags);
nr_ioapic_registers[apic] = reg_01.bits.entries+1;
}
for (apic = 0; apic < nr_ioapics; apic++) {
int pin;
/* See if any of the pins is in ExtINT mode */
for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
struct IO_APIC_route_entry entry;
entry = ioapic_read_entry(apic, pin);
/* If the interrupt line is enabled and in ExtInt mode
* I have found the pin where the i8259 is connected.
*/
if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
ioapic_i8259.apic = apic;
ioapic_i8259.pin = pin;
goto found_i8259;
}
}
}
found_i8259:
/* Look to see what if the MP table has reported the ExtINT */
/* If we could not find the appropriate pin by looking at the ioapic
* the i8259 probably is not connected the ioapic but give the
* mptable a chance anyway.
*/
i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
/* Trust the MP table if nothing is setup in the hardware */
if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
ioapic_i8259.pin = i8259_pin;
ioapic_i8259.apic = i8259_apic;
}
/* Complain if the MP table and the hardware disagree */
if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
(i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
{
printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
}
/*
* Do not trust the IO-APIC being empty at bootup
*/
clear_IO_APIC();
}
/*
* Not an __init, needed by the reboot code
*/
void disable_IO_APIC(void)
{
/*
* Clear the IO-APIC before rebooting:
*/
clear_IO_APIC();
/*
* If the i8259 is routed through an IOAPIC
* Put that IOAPIC in virtual wire mode
* so legacy interrupts can be delivered.
*/
if (ioapic_i8259.pin != -1) {
struct IO_APIC_route_entry entry;
memset(&entry, 0, sizeof(entry));
entry.mask = 0; /* Enabled */
entry.trigger = 0; /* Edge */
entry.irr = 0;
entry.polarity = 0; /* High */
entry.delivery_status = 0;
entry.dest_mode = 0; /* Physical */
entry.delivery_mode = dest_ExtINT; /* ExtInt */
entry.vector = 0;
entry.dest.physical.physical_dest =
GET_APIC_ID(read_apic_id());
/*
* Add it to the IO-APIC irq-routing table:
*/
ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
}
disconnect_bsp_APIC(ioapic_i8259.pin != -1);
}
/*
* function to set the IO-APIC physical IDs based on the
* values stored in the MPC table.
*
* by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
*/
static void __init setup_ioapic_ids_from_mpc(void)
{
union IO_APIC_reg_00 reg_00;
physid_mask_t phys_id_present_map;
int apic;
int i;
unsigned char old_id;
unsigned long flags;
#ifdef CONFIG_X86_NUMAQ
if (found_numaq)
return;
#endif
/*
* Don't check I/O APIC IDs for xAPIC systems. They have
* no meaning without the serial APIC bus.
*/
if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
|| APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
return;
/*
* This is broken; anything with a real cpu count has to
* circumvent this idiocy regardless.
*/
phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
/*
* Set the IOAPIC ID to the value stored in the MPC table.
*/
for (apic = 0; apic < nr_ioapics; apic++) {
/* Read the register 0 value */
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
old_id = mp_ioapics[apic].mp_apicid;
if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
apic, mp_ioapics[apic].mp_apicid);
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
reg_00.bits.ID);
mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
}
/*
* Sanity check, is the ID really free? Every APIC in a
* system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
if (check_apicid_used(phys_id_present_map,
mp_ioapics[apic].mp_apicid)) {
printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
apic, mp_ioapics[apic].mp_apicid);
for (i = 0; i < get_physical_broadcast(); i++)
if (!physid_isset(i, phys_id_present_map))
break;
if (i >= get_physical_broadcast())
panic("Max APIC ID exceeded!\n");
printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
i);
physid_set(i, phys_id_present_map);
mp_ioapics[apic].mp_apicid = i;
} else {
physid_mask_t tmp;
tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
apic_printk(APIC_VERBOSE, "Setting %d in the "
"phys_id_present_map\n",
mp_ioapics[apic].mp_apicid);
physids_or(phys_id_present_map, phys_id_present_map, tmp);
}
/*
* We need to adjust the IRQ routing table
* if the ID changed.
*/
if (old_id != mp_ioapics[apic].mp_apicid)
for (i = 0; i < mp_irq_entries; i++)
if (mp_irqs[i].mp_dstapic == old_id)
mp_irqs[i].mp_dstapic
= mp_ioapics[apic].mp_apicid;
/*
* Read the right value from the MPC table and
* write it into the ID register.
*/
apic_printk(APIC_VERBOSE, KERN_INFO
"...changing IO-APIC physical APIC ID to %d ...",
mp_ioapics[apic].mp_apicid);
reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(apic, 0, reg_00.raw);
spin_unlock_irqrestore(&ioapic_lock, flags);
/*
* Sanity check
*/
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(apic, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
printk("could not set ID!\n");
else
apic_printk(APIC_VERBOSE, " ok.\n");
}
}
int no_timer_check __initdata;
static int __init notimercheck(char *s)
{
no_timer_check = 1;
return 1;
}
__setup("no_timer_check", notimercheck);
/*
* There is a nasty bug in some older SMP boards, their mptable lies
* about the timer IRQ. We do the following to work around the situation:
*
* - timer IRQ defaults to IO-APIC IRQ
* - if this function detects that timer IRQs are defunct, then we fall
* back to ISA timer IRQs
*/
static int __init timer_irq_works(void)
{
unsigned long t1 = jiffies;
x86: fix "Kernel panic - not syncing: IO-APIC + timer doesn't work!" this is the tale of a full day spent debugging an ancient but elusive bug. after booting up thousands of random .config kernels, i finally happened to generate a .config that produced the following rare bootup failure on 32-bit x86: | ..TIMER: vector=0x31 apic1=0 pin1=2 apic2=-1 pin2=-1 | ..MP-BIOS bug: 8254 timer not connected to IO-APIC | ...trying to set up timer (IRQ0) through the 8259A ... failed. | ...trying to set up timer as Virtual Wire IRQ... failed. | ...trying to set up timer as ExtINT IRQ... failed :(. | Kernel panic - not syncing: IO-APIC + timer doesn't work! Boot with apic=debug | and send a report. Then try booting with the 'noapic' option this bug has been reported many times during the years, but it was never reproduced nor fixed. the bug that i hit was extremely sensitive to .config details. First i did a .config-bisection - suspecting some .config detail. That led to CONFIG_X86_MCE: enabling X86_MCE magically made the bug disappear and the system would boot up just fine. Debugging my way through the MCE code ended up identifying two unlikely candidates: the thing that made a real difference to the hang was that X86_MCE did two printks: Intel machine check architecture supported. Intel machine check reporting enabled on CPU#1. Adding the same printks to a !CONFIG_X86_MCE kernel made the bug go away! this left timing as the main suspect: i experimented with adding various udelay()s to the arch/x86/kernel/io_apic_32.c:check_timer() function, and the race window turned out to be narrower than 30 microseconds (!). That made debugging especially funny, debugging without having printk ability before the bug hits is ... interesting ;-) eventually i started suspecting IRQ activities - those are pretty much the only thing that happen this early during bootup and have the timescale of a few dozen microseconds. Also, check_timer() changes the IRQ hardware in various creative ways, so the main candidate became IRQ0 interaction. i've added a counter to track timer irqs (on which core they arrived, at what exact time, etc.) and found that no timer IRQ would arrive after the bug condition hits - even if we re-enable IRQ0 and re-initialize the i8259A, but that we'd get a small number of timer irqs right around the time when we call the check_timer() function. Eventually i got the following backtrace triggered from debug code in the timer interrupt: ...trying to set up timer as Virtual Wire IRQ... failed. ...trying to set up timer as ExtINT IRQ... Pid: 1, comm: swapper Not tainted (2.6.24-rc5 #57) EIP: 0060:[<c044d57e>] EFLAGS: 00000246 CPU: 0 EIP is at _spin_unlock_irqrestore+0x5/0x1c EAX: c0634178 EBX: 00000000 ECX: c4947d63 EDX: 00000246 ESI: 00000002 EDI: 00010031 EBP: c04e0f2e ESP: f7c41df4 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 8005003b CR2: ffe04000 CR3: 00630000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c05f5784>] setup_IO_APIC+0x9c3/0xc5c the spin_unlock() was called from init_8259A(). Wait ... we have an IRQ0 entry while we are in the middle of setting up the local APIC, the i8259A and the PIT?? That is certainly not how it's supposed to work! check_timer() was supposed to be called with irqs turned off - but this eroded away sometime in the past. This code would still work most of the time because this code runs very quickly, but just the right timing conditions are present and IRQ0 hits in this small, ~30 usecs window, timer irqs stop and the system does not boot up. Also, given how early this is during bootup, the hang is very deterministic - but it would only occur on certain machines (and certain configs). The fix was quite simple: disable/restore interrupts properly in this function. With that in place the test-system now boots up just fine. (64-bit x86 io_apic_64.c had the same bug.) Phew! One down, only 1500 other kernel bugs are left ;-) Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2007-12-18 17:05:58 +00:00
unsigned long flags;
if (no_timer_check)
return 1;
x86: fix "Kernel panic - not syncing: IO-APIC + timer doesn't work!" this is the tale of a full day spent debugging an ancient but elusive bug. after booting up thousands of random .config kernels, i finally happened to generate a .config that produced the following rare bootup failure on 32-bit x86: | ..TIMER: vector=0x31 apic1=0 pin1=2 apic2=-1 pin2=-1 | ..MP-BIOS bug: 8254 timer not connected to IO-APIC | ...trying to set up timer (IRQ0) through the 8259A ... failed. | ...trying to set up timer as Virtual Wire IRQ... failed. | ...trying to set up timer as ExtINT IRQ... failed :(. | Kernel panic - not syncing: IO-APIC + timer doesn't work! Boot with apic=debug | and send a report. Then try booting with the 'noapic' option this bug has been reported many times during the years, but it was never reproduced nor fixed. the bug that i hit was extremely sensitive to .config details. First i did a .config-bisection - suspecting some .config detail. That led to CONFIG_X86_MCE: enabling X86_MCE magically made the bug disappear and the system would boot up just fine. Debugging my way through the MCE code ended up identifying two unlikely candidates: the thing that made a real difference to the hang was that X86_MCE did two printks: Intel machine check architecture supported. Intel machine check reporting enabled on CPU#1. Adding the same printks to a !CONFIG_X86_MCE kernel made the bug go away! this left timing as the main suspect: i experimented with adding various udelay()s to the arch/x86/kernel/io_apic_32.c:check_timer() function, and the race window turned out to be narrower than 30 microseconds (!). That made debugging especially funny, debugging without having printk ability before the bug hits is ... interesting ;-) eventually i started suspecting IRQ activities - those are pretty much the only thing that happen this early during bootup and have the timescale of a few dozen microseconds. Also, check_timer() changes the IRQ hardware in various creative ways, so the main candidate became IRQ0 interaction. i've added a counter to track timer irqs (on which core they arrived, at what exact time, etc.) and found that no timer IRQ would arrive after the bug condition hits - even if we re-enable IRQ0 and re-initialize the i8259A, but that we'd get a small number of timer irqs right around the time when we call the check_timer() function. Eventually i got the following backtrace triggered from debug code in the timer interrupt: ...trying to set up timer as Virtual Wire IRQ... failed. ...trying to set up timer as ExtINT IRQ... Pid: 1, comm: swapper Not tainted (2.6.24-rc5 #57) EIP: 0060:[<c044d57e>] EFLAGS: 00000246 CPU: 0 EIP is at _spin_unlock_irqrestore+0x5/0x1c EAX: c0634178 EBX: 00000000 ECX: c4947d63 EDX: 00000246 ESI: 00000002 EDI: 00010031 EBP: c04e0f2e ESP: f7c41df4 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 8005003b CR2: ffe04000 CR3: 00630000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c05f5784>] setup_IO_APIC+0x9c3/0xc5c the spin_unlock() was called from init_8259A(). Wait ... we have an IRQ0 entry while we are in the middle of setting up the local APIC, the i8259A and the PIT?? That is certainly not how it's supposed to work! check_timer() was supposed to be called with irqs turned off - but this eroded away sometime in the past. This code would still work most of the time because this code runs very quickly, but just the right timing conditions are present and IRQ0 hits in this small, ~30 usecs window, timer irqs stop and the system does not boot up. Also, given how early this is during bootup, the hang is very deterministic - but it would only occur on certain machines (and certain configs). The fix was quite simple: disable/restore interrupts properly in this function. With that in place the test-system now boots up just fine. (64-bit x86 io_apic_64.c had the same bug.) Phew! One down, only 1500 other kernel bugs are left ;-) Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2007-12-18 17:05:58 +00:00
local_save_flags(flags);
local_irq_enable();
/* Let ten ticks pass... */
mdelay((10 * 1000) / HZ);
x86: fix "Kernel panic - not syncing: IO-APIC + timer doesn't work!" this is the tale of a full day spent debugging an ancient but elusive bug. after booting up thousands of random .config kernels, i finally happened to generate a .config that produced the following rare bootup failure on 32-bit x86: | ..TIMER: vector=0x31 apic1=0 pin1=2 apic2=-1 pin2=-1 | ..MP-BIOS bug: 8254 timer not connected to IO-APIC | ...trying to set up timer (IRQ0) through the 8259A ... failed. | ...trying to set up timer as Virtual Wire IRQ... failed. | ...trying to set up timer as ExtINT IRQ... failed :(. | Kernel panic - not syncing: IO-APIC + timer doesn't work! Boot with apic=debug | and send a report. Then try booting with the 'noapic' option this bug has been reported many times during the years, but it was never reproduced nor fixed. the bug that i hit was extremely sensitive to .config details. First i did a .config-bisection - suspecting some .config detail. That led to CONFIG_X86_MCE: enabling X86_MCE magically made the bug disappear and the system would boot up just fine. Debugging my way through the MCE code ended up identifying two unlikely candidates: the thing that made a real difference to the hang was that X86_MCE did two printks: Intel machine check architecture supported. Intel machine check reporting enabled on CPU#1. Adding the same printks to a !CONFIG_X86_MCE kernel made the bug go away! this left timing as the main suspect: i experimented with adding various udelay()s to the arch/x86/kernel/io_apic_32.c:check_timer() function, and the race window turned out to be narrower than 30 microseconds (!). That made debugging especially funny, debugging without having printk ability before the bug hits is ... interesting ;-) eventually i started suspecting IRQ activities - those are pretty much the only thing that happen this early during bootup and have the timescale of a few dozen microseconds. Also, check_timer() changes the IRQ hardware in various creative ways, so the main candidate became IRQ0 interaction. i've added a counter to track timer irqs (on which core they arrived, at what exact time, etc.) and found that no timer IRQ would arrive after the bug condition hits - even if we re-enable IRQ0 and re-initialize the i8259A, but that we'd get a small number of timer irqs right around the time when we call the check_timer() function. Eventually i got the following backtrace triggered from debug code in the timer interrupt: ...trying to set up timer as Virtual Wire IRQ... failed. ...trying to set up timer as ExtINT IRQ... Pid: 1, comm: swapper Not tainted (2.6.24-rc5 #57) EIP: 0060:[<c044d57e>] EFLAGS: 00000246 CPU: 0 EIP is at _spin_unlock_irqrestore+0x5/0x1c EAX: c0634178 EBX: 00000000 ECX: c4947d63 EDX: 00000246 ESI: 00000002 EDI: 00010031 EBP: c04e0f2e ESP: f7c41df4 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 8005003b CR2: ffe04000 CR3: 00630000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c05f5784>] setup_IO_APIC+0x9c3/0xc5c the spin_unlock() was called from init_8259A(). Wait ... we have an IRQ0 entry while we are in the middle of setting up the local APIC, the i8259A and the PIT?? That is certainly not how it's supposed to work! check_timer() was supposed to be called with irqs turned off - but this eroded away sometime in the past. This code would still work most of the time because this code runs very quickly, but just the right timing conditions are present and IRQ0 hits in this small, ~30 usecs window, timer irqs stop and the system does not boot up. Also, given how early this is during bootup, the hang is very deterministic - but it would only occur on certain machines (and certain configs). The fix was quite simple: disable/restore interrupts properly in this function. With that in place the test-system now boots up just fine. (64-bit x86 io_apic_64.c had the same bug.) Phew! One down, only 1500 other kernel bugs are left ;-) Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2007-12-18 17:05:58 +00:00
local_irq_restore(flags);
/*
* Expect a few ticks at least, to be sure some possible
* glue logic does not lock up after one or two first
* ticks in a non-ExtINT mode. Also the local APIC
* might have cached one ExtINT interrupt. Finally, at
* least one tick may be lost due to delays.
*/
if (time_after(jiffies, t1 + 4))
return 1;
return 0;
}
/*
* In the SMP+IOAPIC case it might happen that there are an unspecified
* number of pending IRQ events unhandled. These cases are very rare,
* so we 'resend' these IRQs via IPIs, to the same CPU. It's much
* better to do it this way as thus we do not have to be aware of
* 'pending' interrupts in the IRQ path, except at this point.
*/
/*
* Edge triggered needs to resend any interrupt
* that was delayed but this is now handled in the device
* independent code.
*/
/*
* Startup quirk:
*
* Starting up a edge-triggered IO-APIC interrupt is
* nasty - we need to make sure that we get the edge.
* If it is already asserted for some reason, we need
* return 1 to indicate that is was pending.
*
* This is not complete - we should be able to fake
* an edge even if it isn't on the 8259A...
*
* (We do this for level-triggered IRQs too - it cannot hurt.)
*/
static unsigned int startup_ioapic_irq(unsigned int irq)
{
int was_pending = 0;
unsigned long flags;
spin_lock_irqsave(&ioapic_lock, flags);
if (irq < 16) {
disable_8259A_irq(irq);
if (i8259A_irq_pending(irq))
was_pending = 1;
}
__unmask_IO_APIC_irq(irq);
spin_unlock_irqrestore(&ioapic_lock, flags);
return was_pending;
}
static void ack_ioapic_irq(unsigned int irq)
{
move_native_irq(irq);
ack_APIC_irq();
}
static void ack_ioapic_quirk_irq(unsigned int irq)
{
unsigned long v;
int i;
move_native_irq(irq);
/*
* It appears there is an erratum which affects at least version 0x11
* of I/O APIC (that's the 82093AA and cores integrated into various
* chipsets). Under certain conditions a level-triggered interrupt is
* erroneously delivered as edge-triggered one but the respective IRR
* bit gets set nevertheless. As a result the I/O unit expects an EOI
* message but it will never arrive and further interrupts are blocked
* from the source. The exact reason is so far unknown, but the
* phenomenon was observed when two consecutive interrupt requests
* from a given source get delivered to the same CPU and the source is
* temporarily disabled in between.
*
* A workaround is to simulate an EOI message manually. We achieve it
* by setting the trigger mode to edge and then to level when the edge
* trigger mode gets detected in the TMR of a local APIC for a
* level-triggered interrupt. We mask the source for the time of the
* operation to prevent an edge-triggered interrupt escaping meanwhile.
* The idea is from Manfred Spraul. --macro
*/
i = irq_vector[irq];
v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
ack_APIC_irq();
if (!(v & (1 << (i & 0x1f)))) {
atomic_inc(&irq_mis_count);
spin_lock(&ioapic_lock);
__mask_and_edge_IO_APIC_irq(irq);
__unmask_and_level_IO_APIC_irq(irq);
spin_unlock(&ioapic_lock);
}
}
static int ioapic_retrigger_irq(unsigned int irq)
{
send_IPI_self(irq_vector[irq]);
return 1;
}
static struct irq_chip ioapic_chip __read_mostly = {
.name = "IO-APIC",
.startup = startup_ioapic_irq,
.mask = mask_IO_APIC_irq,
.unmask = unmask_IO_APIC_irq,
.ack = ack_ioapic_irq,
.eoi = ack_ioapic_quirk_irq,
[PATCH] x86/x86_64: deferred handling of writes to /proc/irqxx/smp_affinity When handling writes to /proc/irq, current code is re-programming rte entries directly. This is not recommended and could potentially cause chipset's to lockup, or cause missing interrupts. CONFIG_IRQ_BALANCE does this correctly, where it re-programs only when the interrupt is pending. The same needs to be done for /proc/irq handling as well. Otherwise user space irq balancers are really not doing the right thing. - Changed pending_irq_balance_cpumask to pending_irq_migrate_cpumask for lack of a generic name. - added move_irq out of IRQ_BALANCE, and added this same to X86_64 - Added new proc handler for write, so we can do deferred write at irq handling time. - Display of /proc/irq/XX/smp_affinity used to display CPU_MASKALL, instead it now shows only active cpu masks, or exactly what was set. - Provided a common move_irq implementation, instead of duplicating when using generic irq framework. Tested on i386/x86_64 and ia64 with CONFIG_PCI_MSI turned on and off. Tested UP builds as well. MSI testing: tbd: I have cards, need to look for a x-over cable, although I did test an earlier version of this patch. Will test in a couple days. Signed-off-by: Ashok Raj <ashok.raj@intel.com> Acked-by: Zwane Mwaikambo <zwane@holomorphy.com> Grudgingly-acked-by: Andi Kleen <ak@muc.de> Signed-off-by: Coywolf Qi Hunt <coywolf@lovecn.org> Signed-off-by: Ashok Raj <ashok.raj@intel.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-06 22:16:15 +00:00
#ifdef CONFIG_SMP
.set_affinity = set_ioapic_affinity_irq,
[PATCH] x86/x86_64: deferred handling of writes to /proc/irqxx/smp_affinity When handling writes to /proc/irq, current code is re-programming rte entries directly. This is not recommended and could potentially cause chipset's to lockup, or cause missing interrupts. CONFIG_IRQ_BALANCE does this correctly, where it re-programs only when the interrupt is pending. The same needs to be done for /proc/irq handling as well. Otherwise user space irq balancers are really not doing the right thing. - Changed pending_irq_balance_cpumask to pending_irq_migrate_cpumask for lack of a generic name. - added move_irq out of IRQ_BALANCE, and added this same to X86_64 - Added new proc handler for write, so we can do deferred write at irq handling time. - Display of /proc/irq/XX/smp_affinity used to display CPU_MASKALL, instead it now shows only active cpu masks, or exactly what was set. - Provided a common move_irq implementation, instead of duplicating when using generic irq framework. Tested on i386/x86_64 and ia64 with CONFIG_PCI_MSI turned on and off. Tested UP builds as well. MSI testing: tbd: I have cards, need to look for a x-over cable, although I did test an earlier version of this patch. Will test in a couple days. Signed-off-by: Ashok Raj <ashok.raj@intel.com> Acked-by: Zwane Mwaikambo <zwane@holomorphy.com> Grudgingly-acked-by: Andi Kleen <ak@muc.de> Signed-off-by: Coywolf Qi Hunt <coywolf@lovecn.org> Signed-off-by: Ashok Raj <ashok.raj@intel.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-06 22:16:15 +00:00
#endif
.retrigger = ioapic_retrigger_irq,
};
static inline void init_IO_APIC_traps(void)
{
int irq;
/*
* NOTE! The local APIC isn't very good at handling
* multiple interrupts at the same interrupt level.
* As the interrupt level is determined by taking the
* vector number and shifting that right by 4, we
* want to spread these out a bit so that they don't
* all fall in the same interrupt level.
*
* Also, we've got to be careful not to trash gate
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
for (irq = 0; irq < NR_IRQS ; irq++) {
if (IO_APIC_IRQ(irq) && !irq_vector[irq]) {
/*
* Hmm.. We don't have an entry for this,
* so default to an old-fashioned 8259
* interrupt if we can..
*/
if (irq < 16)
make_8259A_irq(irq);
else
/* Strange. Oh, well.. */
irq_desc[irq].chip = &no_irq_chip;
}
}
}
/*
* The local APIC irq-chip implementation:
*/
static void ack_lapic_irq(unsigned int irq)
{
ack_APIC_irq();
}
static void mask_lapic_irq(unsigned int irq)
{
unsigned long v;
v = apic_read(APIC_LVT0);
apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
}
static void unmask_lapic_irq(unsigned int irq)
{
unsigned long v;
v = apic_read(APIC_LVT0);
apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
}
static struct irq_chip lapic_chip __read_mostly = {
.name = "local-APIC",
.mask = mask_lapic_irq,
.unmask = unmask_lapic_irq,
.ack = ack_lapic_irq,
};
static void lapic_register_intr(int irq, int vector)
{
irq_desc[irq].status &= ~IRQ_LEVEL;
set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
"edge");
set_intr_gate(vector, interrupt[irq]);
}
static void __init setup_nmi(void)
{
/*
* Dirty trick to enable the NMI watchdog ...
* We put the 8259A master into AEOI mode and
* unmask on all local APICs LVT0 as NMI.
*
* The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
* is from Maciej W. Rozycki - so we do not have to EOI from
* the NMI handler or the timer interrupt.
*/
apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
enable_NMI_through_LVT0();
apic_printk(APIC_VERBOSE, " done.\n");
}
/*
* This looks a bit hackish but it's about the only one way of sending
* a few INTA cycles to 8259As and any associated glue logic. ICR does
* not support the ExtINT mode, unfortunately. We need to send these
* cycles as some i82489DX-based boards have glue logic that keeps the
* 8259A interrupt line asserted until INTA. --macro
*/
static inline void __init unlock_ExtINT_logic(void)
{
int apic, pin, i;
struct IO_APIC_route_entry entry0, entry1;
unsigned char save_control, save_freq_select;
pin = find_isa_irq_pin(8, mp_INT);
if (pin == -1) {
WARN_ON_ONCE(1);
return;
}
apic = find_isa_irq_apic(8, mp_INT);
if (apic == -1) {
WARN_ON_ONCE(1);
return;
}
entry0 = ioapic_read_entry(apic, pin);
clear_IO_APIC_pin(apic, pin);
memset(&entry1, 0, sizeof(entry1));
entry1.dest_mode = 0; /* physical delivery */
entry1.mask = 0; /* unmask IRQ now */
entry1.dest.physical.physical_dest = hard_smp_processor_id();
entry1.delivery_mode = dest_ExtINT;
entry1.polarity = entry0.polarity;
entry1.trigger = 0;
entry1.vector = 0;
ioapic_write_entry(apic, pin, entry1);
save_control = CMOS_READ(RTC_CONTROL);
save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
RTC_FREQ_SELECT);
CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
i = 100;
while (i-- > 0) {
mdelay(10);
if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
i -= 10;
}
CMOS_WRITE(save_control, RTC_CONTROL);
CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
clear_IO_APIC_pin(apic, pin);
ioapic_write_entry(apic, pin, entry0);
}
/*
* This code may look a bit paranoid, but it's supposed to cooperate with
* a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
* is so screwy. Thanks to Brian Perkins for testing/hacking this beast
* fanatically on his truly buggy board.
*/
static inline void __init check_timer(void)
{
int apic1, pin1, apic2, pin2;
int no_pin1 = 0;
int vector;
unsigned int ver;
x86: fix "Kernel panic - not syncing: IO-APIC + timer doesn't work!" this is the tale of a full day spent debugging an ancient but elusive bug. after booting up thousands of random .config kernels, i finally happened to generate a .config that produced the following rare bootup failure on 32-bit x86: | ..TIMER: vector=0x31 apic1=0 pin1=2 apic2=-1 pin2=-1 | ..MP-BIOS bug: 8254 timer not connected to IO-APIC | ...trying to set up timer (IRQ0) through the 8259A ... failed. | ...trying to set up timer as Virtual Wire IRQ... failed. | ...trying to set up timer as ExtINT IRQ... failed :(. | Kernel panic - not syncing: IO-APIC + timer doesn't work! Boot with apic=debug | and send a report. Then try booting with the 'noapic' option this bug has been reported many times during the years, but it was never reproduced nor fixed. the bug that i hit was extremely sensitive to .config details. First i did a .config-bisection - suspecting some .config detail. That led to CONFIG_X86_MCE: enabling X86_MCE magically made the bug disappear and the system would boot up just fine. Debugging my way through the MCE code ended up identifying two unlikely candidates: the thing that made a real difference to the hang was that X86_MCE did two printks: Intel machine check architecture supported. Intel machine check reporting enabled on CPU#1. Adding the same printks to a !CONFIG_X86_MCE kernel made the bug go away! this left timing as the main suspect: i experimented with adding various udelay()s to the arch/x86/kernel/io_apic_32.c:check_timer() function, and the race window turned out to be narrower than 30 microseconds (!). That made debugging especially funny, debugging without having printk ability before the bug hits is ... interesting ;-) eventually i started suspecting IRQ activities - those are pretty much the only thing that happen this early during bootup and have the timescale of a few dozen microseconds. Also, check_timer() changes the IRQ hardware in various creative ways, so the main candidate became IRQ0 interaction. i've added a counter to track timer irqs (on which core they arrived, at what exact time, etc.) and found that no timer IRQ would arrive after the bug condition hits - even if we re-enable IRQ0 and re-initialize the i8259A, but that we'd get a small number of timer irqs right around the time when we call the check_timer() function. Eventually i got the following backtrace triggered from debug code in the timer interrupt: ...trying to set up timer as Virtual Wire IRQ... failed. ...trying to set up timer as ExtINT IRQ... Pid: 1, comm: swapper Not tainted (2.6.24-rc5 #57) EIP: 0060:[<c044d57e>] EFLAGS: 00000246 CPU: 0 EIP is at _spin_unlock_irqrestore+0x5/0x1c EAX: c0634178 EBX: 00000000 ECX: c4947d63 EDX: 00000246 ESI: 00000002 EDI: 00010031 EBP: c04e0f2e ESP: f7c41df4 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 8005003b CR2: ffe04000 CR3: 00630000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c05f5784>] setup_IO_APIC+0x9c3/0xc5c the spin_unlock() was called from init_8259A(). Wait ... we have an IRQ0 entry while we are in the middle of setting up the local APIC, the i8259A and the PIT?? That is certainly not how it's supposed to work! check_timer() was supposed to be called with irqs turned off - but this eroded away sometime in the past. This code would still work most of the time because this code runs very quickly, but just the right timing conditions are present and IRQ0 hits in this small, ~30 usecs window, timer irqs stop and the system does not boot up. Also, given how early this is during bootup, the hang is very deterministic - but it would only occur on certain machines (and certain configs). The fix was quite simple: disable/restore interrupts properly in this function. With that in place the test-system now boots up just fine. (64-bit x86 io_apic_64.c had the same bug.) Phew! One down, only 1500 other kernel bugs are left ;-) Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2007-12-18 17:05:58 +00:00
unsigned long flags;
local_irq_save(flags);
ver = apic_read(APIC_LVR);
ver = GET_APIC_VERSION(ver);
/*
* get/set the timer IRQ vector:
*/
disable_8259A_irq(0);
vector = assign_irq_vector(0);
set_intr_gate(vector, interrupt[0]);
/*
* As IRQ0 is to be enabled in the 8259A, the virtual
* wire has to be disabled in the local APIC. Also
* timer interrupts need to be acknowledged manually in
* the 8259A for the i82489DX when using the NMI
* watchdog as that APIC treats NMIs as level-triggered.
* The AEOI mode will finish them in the 8259A
* automatically.
*/
apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
init_8259A(1);
timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
pin1 = find_isa_irq_pin(0, mp_INT);
apic1 = find_isa_irq_apic(0, mp_INT);
pin2 = ioapic_i8259.pin;
apic2 = ioapic_i8259.apic;
printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
vector, apic1, pin1, apic2, pin2);
/*
* Some BIOS writers are clueless and report the ExtINTA
* I/O APIC input from the cascaded 8259A as the timer
* interrupt input. So just in case, if only one pin
* was found above, try it both directly and through the
* 8259A.
*/
if (pin1 == -1) {
pin1 = pin2;
apic1 = apic2;
no_pin1 = 1;
} else if (pin2 == -1) {
pin2 = pin1;
apic2 = apic1;
}
if (pin1 != -1) {
/*
* Ok, does IRQ0 through the IOAPIC work?
*/
if (no_pin1) {
add_pin_to_irq(0, apic1, pin1);
setup_timer_IRQ0_pin(apic1, pin1, vector);
}
unmask_IO_APIC_irq(0);
if (timer_irq_works()) {
if (nmi_watchdog == NMI_IO_APIC) {
setup_nmi();
enable_8259A_irq(0);
}
if (disable_timer_pin_1 > 0)
clear_IO_APIC_pin(0, pin1);
x86: fix "Kernel panic - not syncing: IO-APIC + timer doesn't work!" this is the tale of a full day spent debugging an ancient but elusive bug. after booting up thousands of random .config kernels, i finally happened to generate a .config that produced the following rare bootup failure on 32-bit x86: | ..TIMER: vector=0x31 apic1=0 pin1=2 apic2=-1 pin2=-1 | ..MP-BIOS bug: 8254 timer not connected to IO-APIC | ...trying to set up timer (IRQ0) through the 8259A ... failed. | ...trying to set up timer as Virtual Wire IRQ... failed. | ...trying to set up timer as ExtINT IRQ... failed :(. | Kernel panic - not syncing: IO-APIC + timer doesn't work! Boot with apic=debug | and send a report. Then try booting with the 'noapic' option this bug has been reported many times during the years, but it was never reproduced nor fixed. the bug that i hit was extremely sensitive to .config details. First i did a .config-bisection - suspecting some .config detail. That led to CONFIG_X86_MCE: enabling X86_MCE magically made the bug disappear and the system would boot up just fine. Debugging my way through the MCE code ended up identifying two unlikely candidates: the thing that made a real difference to the hang was that X86_MCE did two printks: Intel machine check architecture supported. Intel machine check reporting enabled on CPU#1. Adding the same printks to a !CONFIG_X86_MCE kernel made the bug go away! this left timing as the main suspect: i experimented with adding various udelay()s to the arch/x86/kernel/io_apic_32.c:check_timer() function, and the race window turned out to be narrower than 30 microseconds (!). That made debugging especially funny, debugging without having printk ability before the bug hits is ... interesting ;-) eventually i started suspecting IRQ activities - those are pretty much the only thing that happen this early during bootup and have the timescale of a few dozen microseconds. Also, check_timer() changes the IRQ hardware in various creative ways, so the main candidate became IRQ0 interaction. i've added a counter to track timer irqs (on which core they arrived, at what exact time, etc.) and found that no timer IRQ would arrive after the bug condition hits - even if we re-enable IRQ0 and re-initialize the i8259A, but that we'd get a small number of timer irqs right around the time when we call the check_timer() function. Eventually i got the following backtrace triggered from debug code in the timer interrupt: ...trying to set up timer as Virtual Wire IRQ... failed. ...trying to set up timer as ExtINT IRQ... Pid: 1, comm: swapper Not tainted (2.6.24-rc5 #57) EIP: 0060:[<c044d57e>] EFLAGS: 00000246 CPU: 0 EIP is at _spin_unlock_irqrestore+0x5/0x1c EAX: c0634178 EBX: 00000000 ECX: c4947d63 EDX: 00000246 ESI: 00000002 EDI: 00010031 EBP: c04e0f2e ESP: f7c41df4 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 8005003b CR2: ffe04000 CR3: 00630000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c05f5784>] setup_IO_APIC+0x9c3/0xc5c the spin_unlock() was called from init_8259A(). Wait ... we have an IRQ0 entry while we are in the middle of setting up the local APIC, the i8259A and the PIT?? That is certainly not how it's supposed to work! check_timer() was supposed to be called with irqs turned off - but this eroded away sometime in the past. This code would still work most of the time because this code runs very quickly, but just the right timing conditions are present and IRQ0 hits in this small, ~30 usecs window, timer irqs stop and the system does not boot up. Also, given how early this is during bootup, the hang is very deterministic - but it would only occur on certain machines (and certain configs). The fix was quite simple: disable/restore interrupts properly in this function. With that in place the test-system now boots up just fine. (64-bit x86 io_apic_64.c had the same bug.) Phew! One down, only 1500 other kernel bugs are left ;-) Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2007-12-18 17:05:58 +00:00
goto out;
}
clear_IO_APIC_pin(apic1, pin1);
if (!no_pin1)
printk(KERN_ERR "..MP-BIOS bug: "
"8254 timer not connected to IO-APIC\n");
printk(KERN_INFO "...trying to set up timer (IRQ0) "
"through the 8259A ... ");
printk("\n..... (found pin %d) ...", pin2);
/*
* legacy devices should be connected to IO APIC #0
*/
replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
setup_timer_IRQ0_pin(apic2, pin2, vector);
unmask_IO_APIC_irq(0);
enable_8259A_irq(0);
if (timer_irq_works()) {
printk("works.\n");
timer_through_8259 = 1;
if (nmi_watchdog == NMI_IO_APIC) {
disable_8259A_irq(0);
setup_nmi();
enable_8259A_irq(0);
}
x86: fix "Kernel panic - not syncing: IO-APIC + timer doesn't work!" this is the tale of a full day spent debugging an ancient but elusive bug. after booting up thousands of random .config kernels, i finally happened to generate a .config that produced the following rare bootup failure on 32-bit x86: | ..TIMER: vector=0x31 apic1=0 pin1=2 apic2=-1 pin2=-1 | ..MP-BIOS bug: 8254 timer not connected to IO-APIC | ...trying to set up timer (IRQ0) through the 8259A ... failed. | ...trying to set up timer as Virtual Wire IRQ... failed. | ...trying to set up timer as ExtINT IRQ... failed :(. | Kernel panic - not syncing: IO-APIC + timer doesn't work! Boot with apic=debug | and send a report. Then try booting with the 'noapic' option this bug has been reported many times during the years, but it was never reproduced nor fixed. the bug that i hit was extremely sensitive to .config details. First i did a .config-bisection - suspecting some .config detail. That led to CONFIG_X86_MCE: enabling X86_MCE magically made the bug disappear and the system would boot up just fine. Debugging my way through the MCE code ended up identifying two unlikely candidates: the thing that made a real difference to the hang was that X86_MCE did two printks: Intel machine check architecture supported. Intel machine check reporting enabled on CPU#1. Adding the same printks to a !CONFIG_X86_MCE kernel made the bug go away! this left timing as the main suspect: i experimented with adding various udelay()s to the arch/x86/kernel/io_apic_32.c:check_timer() function, and the race window turned out to be narrower than 30 microseconds (!). That made debugging especially funny, debugging without having printk ability before the bug hits is ... interesting ;-) eventually i started suspecting IRQ activities - those are pretty much the only thing that happen this early during bootup and have the timescale of a few dozen microseconds. Also, check_timer() changes the IRQ hardware in various creative ways, so the main candidate became IRQ0 interaction. i've added a counter to track timer irqs (on which core they arrived, at what exact time, etc.) and found that no timer IRQ would arrive after the bug condition hits - even if we re-enable IRQ0 and re-initialize the i8259A, but that we'd get a small number of timer irqs right around the time when we call the check_timer() function. Eventually i got the following backtrace triggered from debug code in the timer interrupt: ...trying to set up timer as Virtual Wire IRQ... failed. ...trying to set up timer as ExtINT IRQ... Pid: 1, comm: swapper Not tainted (2.6.24-rc5 #57) EIP: 0060:[<c044d57e>] EFLAGS: 00000246 CPU: 0 EIP is at _spin_unlock_irqrestore+0x5/0x1c EAX: c0634178 EBX: 00000000 ECX: c4947d63 EDX: 00000246 ESI: 00000002 EDI: 00010031 EBP: c04e0f2e ESP: f7c41df4 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 8005003b CR2: ffe04000 CR3: 00630000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c05f5784>] setup_IO_APIC+0x9c3/0xc5c the spin_unlock() was called from init_8259A(). Wait ... we have an IRQ0 entry while we are in the middle of setting up the local APIC, the i8259A and the PIT?? That is certainly not how it's supposed to work! check_timer() was supposed to be called with irqs turned off - but this eroded away sometime in the past. This code would still work most of the time because this code runs very quickly, but just the right timing conditions are present and IRQ0 hits in this small, ~30 usecs window, timer irqs stop and the system does not boot up. Also, given how early this is during bootup, the hang is very deterministic - but it would only occur on certain machines (and certain configs). The fix was quite simple: disable/restore interrupts properly in this function. With that in place the test-system now boots up just fine. (64-bit x86 io_apic_64.c had the same bug.) Phew! One down, only 1500 other kernel bugs are left ;-) Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2007-12-18 17:05:58 +00:00
goto out;
}
/*
* Cleanup, just in case ...
*/
disable_8259A_irq(0);
clear_IO_APIC_pin(apic2, pin2);
printk(" failed.\n");
}
if (nmi_watchdog == NMI_IO_APIC) {
printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
nmi_watchdog = NMI_NONE;
}
timer_ack = 0;
printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
lapic_register_intr(0, vector);
apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
enable_8259A_irq(0);
if (timer_irq_works()) {
printk(" works.\n");
x86: fix "Kernel panic - not syncing: IO-APIC + timer doesn't work!" this is the tale of a full day spent debugging an ancient but elusive bug. after booting up thousands of random .config kernels, i finally happened to generate a .config that produced the following rare bootup failure on 32-bit x86: | ..TIMER: vector=0x31 apic1=0 pin1=2 apic2=-1 pin2=-1 | ..MP-BIOS bug: 8254 timer not connected to IO-APIC | ...trying to set up timer (IRQ0) through the 8259A ... failed. | ...trying to set up timer as Virtual Wire IRQ... failed. | ...trying to set up timer as ExtINT IRQ... failed :(. | Kernel panic - not syncing: IO-APIC + timer doesn't work! Boot with apic=debug | and send a report. Then try booting with the 'noapic' option this bug has been reported many times during the years, but it was never reproduced nor fixed. the bug that i hit was extremely sensitive to .config details. First i did a .config-bisection - suspecting some .config detail. That led to CONFIG_X86_MCE: enabling X86_MCE magically made the bug disappear and the system would boot up just fine. Debugging my way through the MCE code ended up identifying two unlikely candidates: the thing that made a real difference to the hang was that X86_MCE did two printks: Intel machine check architecture supported. Intel machine check reporting enabled on CPU#1. Adding the same printks to a !CONFIG_X86_MCE kernel made the bug go away! this left timing as the main suspect: i experimented with adding various udelay()s to the arch/x86/kernel/io_apic_32.c:check_timer() function, and the race window turned out to be narrower than 30 microseconds (!). That made debugging especially funny, debugging without having printk ability before the bug hits is ... interesting ;-) eventually i started suspecting IRQ activities - those are pretty much the only thing that happen this early during bootup and have the timescale of a few dozen microseconds. Also, check_timer() changes the IRQ hardware in various creative ways, so the main candidate became IRQ0 interaction. i've added a counter to track timer irqs (on which core they arrived, at what exact time, etc.) and found that no timer IRQ would arrive after the bug condition hits - even if we re-enable IRQ0 and re-initialize the i8259A, but that we'd get a small number of timer irqs right around the time when we call the check_timer() function. Eventually i got the following backtrace triggered from debug code in the timer interrupt: ...trying to set up timer as Virtual Wire IRQ... failed. ...trying to set up timer as ExtINT IRQ... Pid: 1, comm: swapper Not tainted (2.6.24-rc5 #57) EIP: 0060:[<c044d57e>] EFLAGS: 00000246 CPU: 0 EIP is at _spin_unlock_irqrestore+0x5/0x1c EAX: c0634178 EBX: 00000000 ECX: c4947d63 EDX: 00000246 ESI: 00000002 EDI: 00010031 EBP: c04e0f2e ESP: f7c41df4 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 8005003b CR2: ffe04000 CR3: 00630000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c05f5784>] setup_IO_APIC+0x9c3/0xc5c the spin_unlock() was called from init_8259A(). Wait ... we have an IRQ0 entry while we are in the middle of setting up the local APIC, the i8259A and the PIT?? That is certainly not how it's supposed to work! check_timer() was supposed to be called with irqs turned off - but this eroded away sometime in the past. This code would still work most of the time because this code runs very quickly, but just the right timing conditions are present and IRQ0 hits in this small, ~30 usecs window, timer irqs stop and the system does not boot up. Also, given how early this is during bootup, the hang is very deterministic - but it would only occur on certain machines (and certain configs). The fix was quite simple: disable/restore interrupts properly in this function. With that in place the test-system now boots up just fine. (64-bit x86 io_apic_64.c had the same bug.) Phew! One down, only 1500 other kernel bugs are left ;-) Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2007-12-18 17:05:58 +00:00
goto out;
}
disable_8259A_irq(0);
apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
printk(" failed.\n");
printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
init_8259A(0);
make_8259A_irq(0);
apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
unlock_ExtINT_logic();
if (timer_irq_works()) {
printk(" works.\n");
x86: fix "Kernel panic - not syncing: IO-APIC + timer doesn't work!" this is the tale of a full day spent debugging an ancient but elusive bug. after booting up thousands of random .config kernels, i finally happened to generate a .config that produced the following rare bootup failure on 32-bit x86: | ..TIMER: vector=0x31 apic1=0 pin1=2 apic2=-1 pin2=-1 | ..MP-BIOS bug: 8254 timer not connected to IO-APIC | ...trying to set up timer (IRQ0) through the 8259A ... failed. | ...trying to set up timer as Virtual Wire IRQ... failed. | ...trying to set up timer as ExtINT IRQ... failed :(. | Kernel panic - not syncing: IO-APIC + timer doesn't work! Boot with apic=debug | and send a report. Then try booting with the 'noapic' option this bug has been reported many times during the years, but it was never reproduced nor fixed. the bug that i hit was extremely sensitive to .config details. First i did a .config-bisection - suspecting some .config detail. That led to CONFIG_X86_MCE: enabling X86_MCE magically made the bug disappear and the system would boot up just fine. Debugging my way through the MCE code ended up identifying two unlikely candidates: the thing that made a real difference to the hang was that X86_MCE did two printks: Intel machine check architecture supported. Intel machine check reporting enabled on CPU#1. Adding the same printks to a !CONFIG_X86_MCE kernel made the bug go away! this left timing as the main suspect: i experimented with adding various udelay()s to the arch/x86/kernel/io_apic_32.c:check_timer() function, and the race window turned out to be narrower than 30 microseconds (!). That made debugging especially funny, debugging without having printk ability before the bug hits is ... interesting ;-) eventually i started suspecting IRQ activities - those are pretty much the only thing that happen this early during bootup and have the timescale of a few dozen microseconds. Also, check_timer() changes the IRQ hardware in various creative ways, so the main candidate became IRQ0 interaction. i've added a counter to track timer irqs (on which core they arrived, at what exact time, etc.) and found that no timer IRQ would arrive after the bug condition hits - even if we re-enable IRQ0 and re-initialize the i8259A, but that we'd get a small number of timer irqs right around the time when we call the check_timer() function. Eventually i got the following backtrace triggered from debug code in the timer interrupt: ...trying to set up timer as Virtual Wire IRQ... failed. ...trying to set up timer as ExtINT IRQ... Pid: 1, comm: swapper Not tainted (2.6.24-rc5 #57) EIP: 0060:[<c044d57e>] EFLAGS: 00000246 CPU: 0 EIP is at _spin_unlock_irqrestore+0x5/0x1c EAX: c0634178 EBX: 00000000 ECX: c4947d63 EDX: 00000246 ESI: 00000002 EDI: 00010031 EBP: c04e0f2e ESP: f7c41df4 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 8005003b CR2: ffe04000 CR3: 00630000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c05f5784>] setup_IO_APIC+0x9c3/0xc5c the spin_unlock() was called from init_8259A(). Wait ... we have an IRQ0 entry while we are in the middle of setting up the local APIC, the i8259A and the PIT?? That is certainly not how it's supposed to work! check_timer() was supposed to be called with irqs turned off - but this eroded away sometime in the past. This code would still work most of the time because this code runs very quickly, but just the right timing conditions are present and IRQ0 hits in this small, ~30 usecs window, timer irqs stop and the system does not boot up. Also, given how early this is during bootup, the hang is very deterministic - but it would only occur on certain machines (and certain configs). The fix was quite simple: disable/restore interrupts properly in this function. With that in place the test-system now boots up just fine. (64-bit x86 io_apic_64.c had the same bug.) Phew! One down, only 1500 other kernel bugs are left ;-) Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2007-12-18 17:05:58 +00:00
goto out;
}
printk(" failed :(.\n");
panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
"report. Then try booting with the 'noapic' option");
x86: fix "Kernel panic - not syncing: IO-APIC + timer doesn't work!" this is the tale of a full day spent debugging an ancient but elusive bug. after booting up thousands of random .config kernels, i finally happened to generate a .config that produced the following rare bootup failure on 32-bit x86: | ..TIMER: vector=0x31 apic1=0 pin1=2 apic2=-1 pin2=-1 | ..MP-BIOS bug: 8254 timer not connected to IO-APIC | ...trying to set up timer (IRQ0) through the 8259A ... failed. | ...trying to set up timer as Virtual Wire IRQ... failed. | ...trying to set up timer as ExtINT IRQ... failed :(. | Kernel panic - not syncing: IO-APIC + timer doesn't work! Boot with apic=debug | and send a report. Then try booting with the 'noapic' option this bug has been reported many times during the years, but it was never reproduced nor fixed. the bug that i hit was extremely sensitive to .config details. First i did a .config-bisection - suspecting some .config detail. That led to CONFIG_X86_MCE: enabling X86_MCE magically made the bug disappear and the system would boot up just fine. Debugging my way through the MCE code ended up identifying two unlikely candidates: the thing that made a real difference to the hang was that X86_MCE did two printks: Intel machine check architecture supported. Intel machine check reporting enabled on CPU#1. Adding the same printks to a !CONFIG_X86_MCE kernel made the bug go away! this left timing as the main suspect: i experimented with adding various udelay()s to the arch/x86/kernel/io_apic_32.c:check_timer() function, and the race window turned out to be narrower than 30 microseconds (!). That made debugging especially funny, debugging without having printk ability before the bug hits is ... interesting ;-) eventually i started suspecting IRQ activities - those are pretty much the only thing that happen this early during bootup and have the timescale of a few dozen microseconds. Also, check_timer() changes the IRQ hardware in various creative ways, so the main candidate became IRQ0 interaction. i've added a counter to track timer irqs (on which core they arrived, at what exact time, etc.) and found that no timer IRQ would arrive after the bug condition hits - even if we re-enable IRQ0 and re-initialize the i8259A, but that we'd get a small number of timer irqs right around the time when we call the check_timer() function. Eventually i got the following backtrace triggered from debug code in the timer interrupt: ...trying to set up timer as Virtual Wire IRQ... failed. ...trying to set up timer as ExtINT IRQ... Pid: 1, comm: swapper Not tainted (2.6.24-rc5 #57) EIP: 0060:[<c044d57e>] EFLAGS: 00000246 CPU: 0 EIP is at _spin_unlock_irqrestore+0x5/0x1c EAX: c0634178 EBX: 00000000 ECX: c4947d63 EDX: 00000246 ESI: 00000002 EDI: 00010031 EBP: c04e0f2e ESP: f7c41df4 DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068 CR0: 8005003b CR2: ffe04000 CR3: 00630000 CR4: 000006d0 DR0: 00000000 DR1: 00000000 DR2: 00000000 DR3: 00000000 DR6: ffff0ff0 DR7: 00000400 [<c05f5784>] setup_IO_APIC+0x9c3/0xc5c the spin_unlock() was called from init_8259A(). Wait ... we have an IRQ0 entry while we are in the middle of setting up the local APIC, the i8259A and the PIT?? That is certainly not how it's supposed to work! check_timer() was supposed to be called with irqs turned off - but this eroded away sometime in the past. This code would still work most of the time because this code runs very quickly, but just the right timing conditions are present and IRQ0 hits in this small, ~30 usecs window, timer irqs stop and the system does not boot up. Also, given how early this is during bootup, the hang is very deterministic - but it would only occur on certain machines (and certain configs). The fix was quite simple: disable/restore interrupts properly in this function. With that in place the test-system now boots up just fine. (64-bit x86 io_apic_64.c had the same bug.) Phew! One down, only 1500 other kernel bugs are left ;-) Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2007-12-18 17:05:58 +00:00
out:
local_irq_restore(flags);
}
/*
x86: I/O APIC: Never configure IRQ2 There is no such entity as ISA IRQ2. The ACPI spec does not make it explicitly clear, but does not preclude it either -- all it says is ISA legacy interrupts are identity mapped by default (subject to overrides), but it does not state whether IRQ2 exists or not. As a result if there is no IRQ0 override, then IRQ2 is normally initialised as an ISA interrupt, which implies an edge-triggered line, which is unmasked by default as this is what we do for edge-triggered I/O APIC interrupts so as not to miss an edge. To the best of my knowledge it is useless, as IRQ2 has not been in use since the PC/AT as back then it was taken by the 8259A cascade interrupt to the slave, with the line position in the slot rerouted to newly-created IRQ9. No device could thus make use of this line with the pair of 8259A chips. Now in theory INTIN2 of the I/O APIC may be usable, but the interrupt of the device wired to it would not be available in the PIC mode at all, so I seriously doubt if anybody decided to reuse it for a regular device. However there are two common uses of INTIN2. One is for IRQ0, with an ACPI interrupt override (or its equivalent in the MP table). But in this case IRQ2 is gone entirely with INTIN0 left vacant. The other one is for an 8959A ExtINTA cascade. In this case IRQ0 goes to INTIN0 and if ACPI is used INTIN2 is assumed to be IRQ2 (there is no override and ACPI has no way to report ExtINTA interrupts). This is where a problem happens. The problem is INTIN2 is configured as a native APIC interrupt, with a vector assigned and the mask cleared. And the line may indeed get active and inject interrupts if the master 8959A has its timer interrupt enabled (it might happen for other interrupts too, but they are normally masked in the process of rerouting them to the I/O APIC). There are two cases where it will happen: * When the I/O APIC NMI watchdog is enabled. This is actually a misnomer as the watchdog pulses are delivered through the 8259A to the LINT0 inputs of all the local APICs in the system. The implication is the output of the master 8259A goes high and low repeatedly, signalling interrupts to INTIN2 which is enabled too! [The origin of the name is I think for a brief period during the development we had a capability in our code to configure the watchdog to use an I/O APIC input; that would be INTIN2 in this scenario.] * When the native route of IRQ0 via INTIN0 fails for whatever reason -- as it happens with the system considered here. In this scenario the timer pulse is delivered through the 8259A to LINT0 input of the local APIC of the bootstrap processor, quite similarly to how is done for the watchdog described above. The result is, again, INTIN2 receives these pulses too. Rafael's system used to escape this scenario, because an incorrect IRQ0 override would occupy INTIN2 and prevent it from being unmasked. My conclusion is IRQ2 should be excluded from configuration in all the cases and the current exception for ACPI systems should be lifted. The reason being the exception not only being useless, but harmful as well. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Cc: Matthew Garrett <mjg59@srcf.ucam.org> Cc: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-07-11 18:35:23 +00:00
* Traditionally ISA IRQ2 is the cascade IRQ, and is not available
* to devices. However there may be an I/O APIC pin available for
* this interrupt regardless. The pin may be left unconnected, but
* typically it will be reused as an ExtINT cascade interrupt for
* the master 8259A. In the MPS case such a pin will normally be
* reported as an ExtINT interrupt in the MP table. With ACPI
* there is no provision for ExtINT interrupts, and in the absence
* of an override it would be treated as an ordinary ISA I/O APIC
* interrupt, that is edge-triggered and unmasked by default. We
* used to do this, but it caused problems on some systems because
* of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
* the same ExtINT cascade interrupt to drive the local APIC of the
* bootstrap processor. Therefore we refrain from routing IRQ2 to
* the I/O APIC in all cases now. No actual device should request
* it anyway. --macro
*/
#define PIC_IRQS (1 << PIC_CASCADE_IR)
void __init setup_IO_APIC(void)
{
int i;
/* Reserve all the system vectors. */
for (i = first_system_vector; i < NR_VECTORS; i++)
set_bit(i, used_vectors);
enable_IO_APIC();
x86: I/O APIC: Never configure IRQ2 There is no such entity as ISA IRQ2. The ACPI spec does not make it explicitly clear, but does not preclude it either -- all it says is ISA legacy interrupts are identity mapped by default (subject to overrides), but it does not state whether IRQ2 exists or not. As a result if there is no IRQ0 override, then IRQ2 is normally initialised as an ISA interrupt, which implies an edge-triggered line, which is unmasked by default as this is what we do for edge-triggered I/O APIC interrupts so as not to miss an edge. To the best of my knowledge it is useless, as IRQ2 has not been in use since the PC/AT as back then it was taken by the 8259A cascade interrupt to the slave, with the line position in the slot rerouted to newly-created IRQ9. No device could thus make use of this line with the pair of 8259A chips. Now in theory INTIN2 of the I/O APIC may be usable, but the interrupt of the device wired to it would not be available in the PIC mode at all, so I seriously doubt if anybody decided to reuse it for a regular device. However there are two common uses of INTIN2. One is for IRQ0, with an ACPI interrupt override (or its equivalent in the MP table). But in this case IRQ2 is gone entirely with INTIN0 left vacant. The other one is for an 8959A ExtINTA cascade. In this case IRQ0 goes to INTIN0 and if ACPI is used INTIN2 is assumed to be IRQ2 (there is no override and ACPI has no way to report ExtINTA interrupts). This is where a problem happens. The problem is INTIN2 is configured as a native APIC interrupt, with a vector assigned and the mask cleared. And the line may indeed get active and inject interrupts if the master 8959A has its timer interrupt enabled (it might happen for other interrupts too, but they are normally masked in the process of rerouting them to the I/O APIC). There are two cases where it will happen: * When the I/O APIC NMI watchdog is enabled. This is actually a misnomer as the watchdog pulses are delivered through the 8259A to the LINT0 inputs of all the local APICs in the system. The implication is the output of the master 8259A goes high and low repeatedly, signalling interrupts to INTIN2 which is enabled too! [The origin of the name is I think for a brief period during the development we had a capability in our code to configure the watchdog to use an I/O APIC input; that would be INTIN2 in this scenario.] * When the native route of IRQ0 via INTIN0 fails for whatever reason -- as it happens with the system considered here. In this scenario the timer pulse is delivered through the 8259A to LINT0 input of the local APIC of the bootstrap processor, quite similarly to how is done for the watchdog described above. The result is, again, INTIN2 receives these pulses too. Rafael's system used to escape this scenario, because an incorrect IRQ0 override would occupy INTIN2 and prevent it from being unmasked. My conclusion is IRQ2 should be excluded from configuration in all the cases and the current exception for ACPI systems should be lifted. The reason being the exception not only being useless, but harmful as well. Signed-off-by: Maciej W. Rozycki <macro@linux-mips.org> Cc: "Rafael J. Wysocki" <rjw@sisk.pl> Cc: Matthew Garrett <mjg59@srcf.ucam.org> Cc: Andreas Herrmann <andreas.herrmann3@amd.com> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-07-11 18:35:23 +00:00
io_apic_irqs = ~PIC_IRQS;
printk("ENABLING IO-APIC IRQs\n");
/*
* Set up IO-APIC IRQ routing.
*/
if (!acpi_ioapic)
setup_ioapic_ids_from_mpc();
sync_Arb_IDs();
setup_IO_APIC_irqs();
init_IO_APIC_traps();
check_timer();
if (!acpi_ioapic)
print_IO_APIC();
}
/*
* Called after all the initialization is done. If we didnt find any
* APIC bugs then we can allow the modify fast path
*/
static int __init io_apic_bug_finalize(void)
{
if (sis_apic_bug == -1)
sis_apic_bug = 0;
return 0;
}
late_initcall(io_apic_bug_finalize);
struct sysfs_ioapic_data {
struct sys_device dev;
struct IO_APIC_route_entry entry[0];
};
static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
{
struct IO_APIC_route_entry *entry;
struct sysfs_ioapic_data *data;
int i;
data = container_of(dev, struct sysfs_ioapic_data, dev);
entry = data->entry;
for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
entry[i] = ioapic_read_entry(dev->id, i);
return 0;
}
static int ioapic_resume(struct sys_device *dev)
{
struct IO_APIC_route_entry *entry;
struct sysfs_ioapic_data *data;
unsigned long flags;
union IO_APIC_reg_00 reg_00;
int i;
data = container_of(dev, struct sysfs_ioapic_data, dev);
entry = data->entry;
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(dev->id, 0);
if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
io_apic_write(dev->id, 0, reg_00.raw);
}
spin_unlock_irqrestore(&ioapic_lock, flags);
for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
ioapic_write_entry(dev->id, i, entry[i]);
return 0;
}
static struct sysdev_class ioapic_sysdev_class = {
.name = "ioapic",
.suspend = ioapic_suspend,
.resume = ioapic_resume,
};
static int __init ioapic_init_sysfs(void)
{
struct sys_device *dev;
int i, size, error = 0;
error = sysdev_class_register(&ioapic_sysdev_class);
if (error)
return error;
for (i = 0; i < nr_ioapics; i++) {
size = sizeof(struct sys_device) + nr_ioapic_registers[i]
* sizeof(struct IO_APIC_route_entry);
mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
if (!mp_ioapic_data[i]) {
printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
continue;
}
dev = &mp_ioapic_data[i]->dev;
dev->id = i;
dev->cls = &ioapic_sysdev_class;
error = sysdev_register(dev);
if (error) {
kfree(mp_ioapic_data[i]);
mp_ioapic_data[i] = NULL;
printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
continue;
}
}
return 0;
}
device_initcall(ioapic_init_sysfs);
/*
* Dynamic irq allocate and deallocation
*/
int create_irq(void)
{
/* Allocate an unused irq */
int irq, new, vector = 0;
unsigned long flags;
irq = -ENOSPC;
spin_lock_irqsave(&vector_lock, flags);
for (new = (NR_IRQS - 1); new >= 0; new--) {
if (platform_legacy_irq(new))
continue;
if (irq_vector[new] != 0)
continue;
vector = __assign_irq_vector(new);
if (likely(vector > 0))
irq = new;
break;
}
spin_unlock_irqrestore(&vector_lock, flags);
if (irq >= 0) {
set_intr_gate(vector, interrupt[irq]);
dynamic_irq_init(irq);
}
return irq;
}
void destroy_irq(unsigned int irq)
{
unsigned long flags;
dynamic_irq_cleanup(irq);
spin_lock_irqsave(&vector_lock, flags);
clear_bit(irq_vector[irq], used_vectors);
irq_vector[irq] = 0;
spin_unlock_irqrestore(&vector_lock, flags);
}
/*
* MSI message composition
*/
#ifdef CONFIG_PCI_MSI
static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
{
int vector;
unsigned dest;
vector = assign_irq_vector(irq);
if (vector >= 0) {
dest = cpu_mask_to_apicid(TARGET_CPUS);
msg->address_hi = MSI_ADDR_BASE_HI;
msg->address_lo =
MSI_ADDR_BASE_LO |
((INT_DEST_MODE == 0) ?
MSI_ADDR_DEST_MODE_PHYSICAL:
MSI_ADDR_DEST_MODE_LOGICAL) |
((INT_DELIVERY_MODE != dest_LowestPrio) ?
MSI_ADDR_REDIRECTION_CPU:
MSI_ADDR_REDIRECTION_LOWPRI) |
MSI_ADDR_DEST_ID(dest);
msg->data =
MSI_DATA_TRIGGER_EDGE |
MSI_DATA_LEVEL_ASSERT |
((INT_DELIVERY_MODE != dest_LowestPrio) ?
MSI_DATA_DELIVERY_FIXED:
MSI_DATA_DELIVERY_LOWPRI) |
MSI_DATA_VECTOR(vector);
}
return vector;
}
#ifdef CONFIG_SMP
static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
{
struct msi_msg msg;
unsigned int dest;
cpumask_t tmp;
int vector;
cpus_and(tmp, mask, cpu_online_map);
if (cpus_empty(tmp))
tmp = TARGET_CPUS;
vector = assign_irq_vector(irq);
if (vector < 0)
return;
dest = cpu_mask_to_apicid(mask);
read_msi_msg(irq, &msg);
msg.data &= ~MSI_DATA_VECTOR_MASK;
msg.data |= MSI_DATA_VECTOR(vector);
msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
msg.address_lo |= MSI_ADDR_DEST_ID(dest);
write_msi_msg(irq, &msg);
irq_desc[irq].affinity = mask;
}
#endif /* CONFIG_SMP */
/*
* IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
* which implement the MSI or MSI-X Capability Structure.
*/
static struct irq_chip msi_chip = {
.name = "PCI-MSI",
.unmask = unmask_msi_irq,
.mask = mask_msi_irq,
.ack = ack_ioapic_irq,
#ifdef CONFIG_SMP
.set_affinity = set_msi_irq_affinity,
#endif
.retrigger = ioapic_retrigger_irq,
};
int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
{
struct msi_msg msg;
int irq, ret;
irq = create_irq();
if (irq < 0)
return irq;
ret = msi_compose_msg(dev, irq, &msg);
if (ret < 0) {
destroy_irq(irq);
return ret;
}
set_irq_msi(irq, desc);
write_msi_msg(irq, &msg);
set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
"edge");
return 0;
}
void arch_teardown_msi_irq(unsigned int irq)
{
destroy_irq(irq);
}
#endif /* CONFIG_PCI_MSI */
/*
* Hypertransport interrupt support
*/
#ifdef CONFIG_HT_IRQ
#ifdef CONFIG_SMP
static void target_ht_irq(unsigned int irq, unsigned int dest)
{
struct ht_irq_msg msg;
fetch_ht_irq_msg(irq, &msg);
msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
write_ht_irq_msg(irq, &msg);
}
static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
{
unsigned int dest;
cpumask_t tmp;
cpus_and(tmp, mask, cpu_online_map);
if (cpus_empty(tmp))
tmp = TARGET_CPUS;
cpus_and(mask, tmp, CPU_MASK_ALL);
dest = cpu_mask_to_apicid(mask);
target_ht_irq(irq, dest);
irq_desc[irq].affinity = mask;
}
#endif
static struct irq_chip ht_irq_chip = {
.name = "PCI-HT",
.mask = mask_ht_irq,
.unmask = unmask_ht_irq,
.ack = ack_ioapic_irq,
#ifdef CONFIG_SMP
.set_affinity = set_ht_irq_affinity,
#endif
.retrigger = ioapic_retrigger_irq,
};
int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
{
int vector;
vector = assign_irq_vector(irq);
if (vector >= 0) {
struct ht_irq_msg msg;
unsigned dest;
cpumask_t tmp;
cpus_clear(tmp);
cpu_set(vector >> 8, tmp);
dest = cpu_mask_to_apicid(tmp);
msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
msg.address_lo =
HT_IRQ_LOW_BASE |
HT_IRQ_LOW_DEST_ID(dest) |
HT_IRQ_LOW_VECTOR(vector) |
((INT_DEST_MODE == 0) ?
HT_IRQ_LOW_DM_PHYSICAL :
HT_IRQ_LOW_DM_LOGICAL) |
HT_IRQ_LOW_RQEOI_EDGE |
((INT_DELIVERY_MODE != dest_LowestPrio) ?
HT_IRQ_LOW_MT_FIXED :
HT_IRQ_LOW_MT_ARBITRATED) |
HT_IRQ_LOW_IRQ_MASKED;
write_ht_irq_msg(irq, &msg);
set_irq_chip_and_handler_name(irq, &ht_irq_chip,
handle_edge_irq, "edge");
}
return vector;
}
#endif /* CONFIG_HT_IRQ */
/* --------------------------------------------------------------------------
ACPI-based IOAPIC Configuration
-------------------------------------------------------------------------- */
#ifdef CONFIG_ACPI
int __init io_apic_get_unique_id(int ioapic, int apic_id)
{
union IO_APIC_reg_00 reg_00;
static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
physid_mask_t tmp;
unsigned long flags;
int i = 0;
/*
* The P4 platform supports up to 256 APIC IDs on two separate APIC
* buses (one for LAPICs, one for IOAPICs), where predecessors only
* supports up to 16 on one shared APIC bus.
*
* TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
* advantage of new APIC bus architecture.
*/
if (physids_empty(apic_id_map))
apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
spin_lock_irqsave(&ioapic_lock, flags);
reg_00.raw = io_apic_read(ioapic, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
if (apic_id >= get_physical_broadcast()) {
printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
"%d\n", ioapic, apic_id, reg_00.bits.ID);
apic_id = reg_00.bits.ID;
}
/*
* Every APIC in a system must have a unique ID or we get lots of nice
* 'stuck on smp_invalidate_needed IPI wait' messages.
*/
if (check_apicid_used(apic_id_map, apic_id)) {
for (i = 0; i < get_physical_broadcast(); i++) {
if (!check_apicid_used(apic_id_map, i))
break;
}
if (i == get_physical_broadcast())
panic("Max apic_id exceeded!\n");
printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
"trying %d\n", ioapic, apic_id, i);
apic_id = i;
}
tmp = apicid_to_cpu_present(apic_id);
physids_or(apic_id_map, apic_id_map, tmp);
if (reg_00.bits.ID != apic_id) {
reg_00.bits.ID = apic_id;
spin_lock_irqsave(&ioapic_lock, flags);
io_apic_write(ioapic, 0, reg_00.raw);
reg_00.raw = io_apic_read(ioapic, 0);
spin_unlock_irqrestore(&ioapic_lock, flags);
/* Sanity check */
if (reg_00.bits.ID != apic_id) {
printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
return -1;
}
}
apic_printk(APIC_VERBOSE, KERN_INFO
"IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
return apic_id;
}
int __init io_apic_get_version(int ioapic)
{
union IO_APIC_reg_01 reg_01;
unsigned long flags;
spin_lock_irqsave(&ioapic_lock, flags);
reg_01.raw = io_apic_read(ioapic, 1);
spin_unlock_irqrestore(&ioapic_lock, flags);
return reg_01.bits.version;
}
int __init io_apic_get_redir_entries(int ioapic)
{
union IO_APIC_reg_01 reg_01;
unsigned long flags;
spin_lock_irqsave(&ioapic_lock, flags);
reg_01.raw = io_apic_read(ioapic, 1);
spin_unlock_irqrestore(&ioapic_lock, flags);
return reg_01.bits.entries;
}
int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
{
struct IO_APIC_route_entry entry;
if (!IO_APIC_IRQ(irq)) {
printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
ioapic);
return -EINVAL;
}
/*
* Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
* Note that we mask (disable) IRQs now -- these get enabled when the
* corresponding device driver registers for this IRQ.
*/
memset(&entry, 0, sizeof(entry));
entry.delivery_mode = INT_DELIVERY_MODE;
entry.dest_mode = INT_DEST_MODE;
entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
entry.trigger = edge_level;
entry.polarity = active_high_low;
entry.mask = 1;
/*
* IRQs < 16 are already in the irq_2_pin[] map
*/
if (irq >= 16)
add_pin_to_irq(irq, ioapic, pin);
entry.vector = assign_irq_vector(irq);
apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
"(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
edge_level, active_high_low);
ioapic_register_intr(irq, entry.vector, edge_level);
if (!ioapic && (irq < 16))
disable_8259A_irq(irq);
ioapic_write_entry(ioapic, pin, entry);
return 0;
}
int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
{
int i;
if (skip_ioapic_setup)
return -1;
for (i = 0; i < mp_irq_entries; i++)
if (mp_irqs[i].mp_irqtype == mp_INT &&
mp_irqs[i].mp_srcbusirq == bus_irq)
break;
if (i >= mp_irq_entries)
return -1;
*trigger = irq_trigger(i);
*polarity = irq_polarity(i);
return 0;
}
#endif /* CONFIG_ACPI */
static int __init parse_disable_timer_pin_1(char *arg)
{
disable_timer_pin_1 = 1;
return 0;
}
early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
static int __init parse_enable_timer_pin_1(char *arg)
{
disable_timer_pin_1 = -1;
return 0;
}
early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
static int __init parse_noapic(char *arg)
{
/* disable IO-APIC */
disable_ioapic_setup();
return 0;
}
early_param("noapic", parse_noapic);
void __init ioapic_init_mappings(void)
{
unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
int i;
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {
ioapic_phys = mp_ioapics[i].mp_apicaddr;
if (!ioapic_phys) {
printk(KERN_ERR
"WARNING: bogus zero IO-APIC "
"address found in MPTABLE, "
"disabling IO/APIC support!\n");
smp_found_config = 0;
skip_ioapic_setup = 1;
goto fake_ioapic_page;
}
} else {
fake_ioapic_page:
ioapic_phys = (unsigned long)
alloc_bootmem_pages(PAGE_SIZE);
ioapic_phys = __pa(ioapic_phys);
}
set_fixmap_nocache(idx, ioapic_phys);
printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
__fix_to_virt(idx), ioapic_phys);
idx++;
}
}