diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 0c0ac93f422f..a0b11fb3237e 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -53,6 +53,7 @@ struct device_node; #define EEH_PE_ISOLATED (1 << 0) /* Isolated PE */ #define EEH_PE_RECOVERING (1 << 1) /* Recovering PE */ +#define EEH_PE_PHB_DEAD (1 << 2) /* Dead PHB */ struct eeh_pe { int type; /* PE type: PHB/Bus/Device */ @@ -145,6 +146,7 @@ struct eeh_ops { int (*configure_bridge)(struct eeh_pe *pe); int (*read_config)(struct device_node *dn, int where, int size, u32 *val); int (*write_config)(struct device_node *dn, int where, int size, u32 val); + int (*next_error)(struct eeh_pe **pe); }; extern struct eeh_ops *eeh_ops; diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 678bc6cddf82..0974e1326842 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -399,24 +399,7 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) */ #define MAX_WAIT_FOR_RECOVERY 150 -/** - * eeh_handle_event - Reset a PCI device after hard lockup. - * @pe: EEH PE - * - * While PHB detects address or data parity errors on particular PCI - * slot, the associated PE will be frozen. Besides, DMA's occurring - * to wild addresses (which usually happen due to bugs in device - * drivers or in PCI adapter firmware) can cause EEH error. #SERR, - * #PERR or other misc PCI-related errors also can trigger EEH errors. - * - * Recovery process consists of unplugging the device driver (which - * generated hotplug events to userspace), then issuing a PCI #RST to - * the device, then reconfiguring the PCI config space for all bridges - * & devices under this slot, and then finally restarting the device - * drivers (which cause a second set of hotplug events to go out to - * userspace). - */ -void eeh_handle_event(struct eeh_pe *pe) +static void eeh_handle_normal_event(struct eeh_pe *pe) { struct pci_bus *frozen_bus; int rc = 0; @@ -554,3 +537,112 @@ perm_error: if (frozen_bus) pcibios_remove_pci_devices(frozen_bus); } + +static void eeh_handle_special_event(void) +{ + struct eeh_pe *pe, *phb_pe; + struct pci_bus *bus; + struct pci_controller *hose, *tmp; + unsigned long flags; + int rc = 0; + + /* + * The return value from next_error() has been classified as follows. + * It might be good to enumerate them. However, next_error() is only + * supported by PowerNV platform for now. So it would be fine to use + * integer directly: + * + * 4 - Dead IOC 3 - Dead PHB + * 2 - Fenced PHB 1 - Frozen PE + * 0 - No error found + * + */ + rc = eeh_ops->next_error(&pe); + if (rc <= 0) + return; + + switch (rc) { + case 4: + /* Mark all PHBs in dead state */ + eeh_serialize_lock(&flags); + list_for_each_entry_safe(hose, tmp, + &hose_list, list_node) { + phb_pe = eeh_phb_pe_get(hose); + if (!phb_pe) continue; + + eeh_pe_state_mark(phb_pe, + EEH_PE_ISOLATED | EEH_PE_PHB_DEAD); + } + eeh_serialize_unlock(flags); + + /* Purge all events */ + eeh_remove_event(NULL); + break; + case 3: + case 2: + case 1: + /* Mark the PE in fenced state */ + eeh_serialize_lock(&flags); + if (rc == 3) + eeh_pe_state_mark(pe, + EEH_PE_ISOLATED | EEH_PE_PHB_DEAD); + else + eeh_pe_state_mark(pe, + EEH_PE_ISOLATED | EEH_PE_RECOVERING); + eeh_serialize_unlock(flags); + + /* Purge all events of the PHB */ + eeh_remove_event(pe); + break; + default: + pr_err("%s: Invalid value %d from next_error()\n", + __func__, rc); + return; + } + + /* + * For fenced PHB and frozen PE, it's handled as normal + * event. We have to remove the affected PHBs for dead + * PHB and IOC + */ + if (rc == 2 || rc == 1) + eeh_handle_normal_event(pe); + else { + list_for_each_entry_safe(hose, tmp, + &hose_list, list_node) { + phb_pe = eeh_phb_pe_get(hose); + if (!phb_pe || !(phb_pe->state & EEH_PE_PHB_DEAD)) + continue; + + bus = eeh_pe_bus_get(phb_pe); + /* Notify all devices that they're about to go down. */ + eeh_pe_dev_traverse(pe, eeh_report_failure, NULL); + pcibios_remove_pci_devices(bus); + } + } +} + +/** + * eeh_handle_event - Reset a PCI device after hard lockup. + * @pe: EEH PE + * + * While PHB detects address or data parity errors on particular PCI + * slot, the associated PE will be frozen. Besides, DMA's occurring + * to wild addresses (which usually happen due to bugs in device + * drivers or in PCI adapter firmware) can cause EEH error. #SERR, + * #PERR or other misc PCI-related errors also can trigger EEH errors. + * + * Recovery process consists of unplugging the device driver (which + * generated hotplug events to userspace), then issuing a PCI #RST to + * the device, then reconfiguring the PCI config space for all bridges + * & devices under this slot, and then finally restarting the device + * drivers (which cause a second set of hotplug events to go out to + * userspace). + */ +void eeh_handle_event(struct eeh_pe *pe) +{ + if (pe) + eeh_handle_normal_event(pe); + else + eeh_handle_special_event(); +}