From 85b0cae89d5266e6a7abb2e83c6f716326fc494c Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 31 Jan 2019 19:38:56 +0300 Subject: [PATCH 1/5] PCI: Blacklist power management of Gigabyte X299 DESIGNARE EX PCIe ports Gigabyte X299 DESIGNARE EX motherboard has one PCIe root port that is connected to an Alpine Ridge Thunderbolt controller. This port has slot implemented bit set in the config space but other than that it is not hotplug capable in the sense we are expecting in Linux (it has dev->is_hotplug_bridge set to 0): 00:1c.4 PCI bridge: Intel Corporation 200 Series PCH PCI Express Root Port #5 Bus: primary=00, secondary=05, subordinate=46, sec-latency=0 Memory behind bridge: 78000000-8fffffff [size=384M] Prefetchable memory behind bridge: 00003800f8000000-00003800ffffffff [size=128M] ... Capabilities: [40] Express (v2) Root Port (Slot+), MSI 00 ... SltCap: AttnBtn- PwrCtrl- MRL- AttnInd- PwrInd- HotPlug- Surprise- Slot #8, PowerLimit 25.000W; Interlock- NoCompl+ SltCtl: Enable: AttnBtn- PwrFlt- MRL- PresDet- CmdCplt- HPIrq- LinkChg- Control: AttnInd Unknown, PwrInd Unknown, Power- Interlock- SltSta: Status: AttnBtn- PowerFlt- MRL- CmdCplt- PresDet- Interlock- Changed: MRL- PresDet+ LinkState+ This system is using ACPI based hotplug to notify the OS that it needs to rescan the PCI bus (ACPI hotplug). If there is nothing connected in any of the Thunderbolt ports the root port will not have any runtime PM active children and is thus automatically runtime suspended pretty soon after boot by PCI PM core. Now, when a device is connected the BIOS SMI handler responsible for enumerating newly added devices is not able to find anything because the port is in D3. Prevent this from happening by blacklisting PCI power management of this particular Gigabyte system. Link: https://bugzilla.kernel.org/show_bug.cgi?id=202031 Reported-by: Kedar A Dongre Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Reviewed-by: Rafael J. Wysocki --- drivers/pci/pci.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index c9d8e3c837de..0f29cd1d36d3 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -2501,6 +2501,25 @@ void pci_config_pm_runtime_put(struct pci_dev *pdev) pm_runtime_put_sync(parent); } +static const struct dmi_system_id bridge_d3_blacklist[] = { +#ifdef CONFIG_X86 + { + /* + * Gigabyte X299 root port is not marked as hotplug capable + * which allows Linux to power manage it. However, this + * confuses the BIOS SMI handler so don't power manage root + * ports on that system. + */ + .ident = "X299 DESIGNARE EX-CF", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Gigabyte Technology Co., Ltd."), + DMI_MATCH(DMI_BOARD_NAME, "X299 DESIGNARE EX-CF"), + }, + }, +#endif + { } +}; + /** * pci_bridge_d3_possible - Is it possible to put the bridge into D3 * @bridge: Bridge to check @@ -2546,6 +2565,9 @@ bool pci_bridge_d3_possible(struct pci_dev *bridge) if (bridge->is_hotplug_bridge) return false; + if (dmi_check_system(bridge_d3_blacklist)) + return false; + /* * It should be safe to put PCIe ports from 2015 or newer * to D3. From c528f7bd362b097eeeafa6fbbeccd9750b79c7ba Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 31 Jan 2019 20:07:45 +0300 Subject: [PATCH 2/5] Revert "PCI/PME: Implement runtime PM callbacks" This reverts commit 0e157e52860441cb26051f131dd0b5ae3187a07b. Heiner reported that the commit in question prevents his network adapter from triggering PME and waking up when network cable is plugged. The commit tried to prevent root port waking up from D3cold immediately but looks like disabing root port PME interrupt is not the right way to fix that issue so revert it now. The patch following proposes an alternative solution to that issue. Link: https://bugzilla.kernel.org/show_bug.cgi?id=202103 Fixes: 0e157e528604 ("PCI/PME: Implement runtime PM callbacks") Reported-by: Heiner Kallweit Tested-by: Heiner Kallweit Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Acked-by: Rafael J. Wysocki CC: stable@vger.kernel.org # v4.20+ --- drivers/pci/pcie/pme.c | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/drivers/pci/pcie/pme.c b/drivers/pci/pcie/pme.c index 0dbcf429089f..1a8b85051b1b 100644 --- a/drivers/pci/pcie/pme.c +++ b/drivers/pci/pcie/pme.c @@ -432,31 +432,6 @@ static void pcie_pme_remove(struct pcie_device *srv) kfree(get_service_data(srv)); } -static int pcie_pme_runtime_suspend(struct pcie_device *srv) -{ - struct pcie_pme_service_data *data = get_service_data(srv); - - spin_lock_irq(&data->lock); - pcie_pme_interrupt_enable(srv->port, false); - pcie_clear_root_pme_status(srv->port); - data->noirq = true; - spin_unlock_irq(&data->lock); - - return 0; -} - -static int pcie_pme_runtime_resume(struct pcie_device *srv) -{ - struct pcie_pme_service_data *data = get_service_data(srv); - - spin_lock_irq(&data->lock); - pcie_pme_interrupt_enable(srv->port, true); - data->noirq = false; - spin_unlock_irq(&data->lock); - - return 0; -} - static struct pcie_port_service_driver pcie_pme_driver = { .name = "pcie_pme", .port_type = PCI_EXP_TYPE_ROOT_PORT, @@ -464,8 +439,6 @@ static struct pcie_port_service_driver pcie_pme_driver = { .probe = pcie_pme_probe, .suspend = pcie_pme_suspend, - .runtime_suspend = pcie_pme_runtime_suspend, - .runtime_resume = pcie_pme_runtime_resume, .resume = pcie_pme_resume, .remove = pcie_pme_remove, }; From bbe54ea5330d828cc396d451c0e1e5c3f9764c1e Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Thu, 31 Jan 2019 20:07:46 +0300 Subject: [PATCH 3/5] PCI: pciehp: Disable Data Link Layer State Changed event on suspend Commit 0e157e528604 ("PCI/PME: Implement runtime PM callbacks") tried to solve an issue where the hierarchy immediately wakes up when it is transitioned into D3cold. However, it turns out to prevent PME propagation on some systems that do not support D3cold. I looked more closely at what might cause the immediate wakeup. It happens when the ACPI power resource of the root port is turned off. The AML code associated with the _OFF() method of the ACPI power resource starts a PCIe L2/L3 Ready transition and waits for it to complete. Right after the L2/L3 Ready transition is started the root port receives a PME from the downstream port. The simplest hierarchy where this happens looks like this: 00:1d.0 PCIe Root Port ^ | v 05:00.0 PCIe switch #1 upstream port 06:01.0 PCIe switch #1 downstream hotplug port ^ | v 08:00.0 PCIe switch #2 upstream port It seems that the PCIe link between the two switches, before PME_Turn_Off/PME_TO_Ack is complete for the whole hierarchy, goes inactive and triggers PME towards the root port bringing it back to D0. The L2/L3 Ready sequence is described in PCIe r4.0 spec sections 5.2 and 5.3.3 but unfortunately they do not state what happens if DLLSCE is enabled during the sequence. Disabling Data Link Layer State Changed event (DLLSCE) seems to prevent the issue and still allows the downstream hotplug port to notice when a device is plugged/unplugged. Link: https://bugzilla.kernel.org/show_bug.cgi?id=202593 Fixes: 0e157e528604 ("PCI/PME: Implement runtime PM callbacks") Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Reviewed-by: Rafael J. Wysocki CC: stable@vger.kernel.org # v4.20+ --- drivers/pci/hotplug/pciehp_hpc.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c index 7dd443aea5a5..c0fb64ace05a 100644 --- a/drivers/pci/hotplug/pciehp_hpc.c +++ b/drivers/pci/hotplug/pciehp_hpc.c @@ -736,12 +736,25 @@ void pcie_clear_hotplug_events(struct controller *ctrl) void pcie_enable_interrupt(struct controller *ctrl) { - pcie_write_cmd(ctrl, PCI_EXP_SLTCTL_HPIE, PCI_EXP_SLTCTL_HPIE); + u16 mask; + + mask = PCI_EXP_SLTCTL_HPIE | PCI_EXP_SLTCTL_DLLSCE; + pcie_write_cmd(ctrl, mask, mask); } void pcie_disable_interrupt(struct controller *ctrl) { - pcie_write_cmd(ctrl, 0, PCI_EXP_SLTCTL_HPIE); + u16 mask; + + /* + * Mask hot-plug interrupt to prevent it triggering immediately + * when the link goes inactive (we still get PME when any of the + * enabled events is detected). Same goes with Link Layer State + * changed event which generates PME immediately when the link goes + * inactive so mask it as well. + */ + mask = PCI_EXP_SLTCTL_HPIE | PCI_EXP_SLTCTL_DLLSCE; + pcie_write_cmd(ctrl, 0, mask); } /* From 95c80bc6952b6a5badc7b702d23e5bf14d251e7c Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 28 Feb 2019 13:56:27 -0600 Subject: [PATCH 4/5] PCI/PME: Fix hotplug/sysfs remove deadlock in pcie_pme_remove() Dongdong reported a deadlock triggered by a hotplug event during a sysfs "remove" operation: pciehp 0000:00:0c.0:pcie004: Slot(0-1): Link Up # echo 1 > 0000:00:0c.0/remove PME and hotplug share an MSI/MSI-X vector. The sysfs "remove" side is: remove_store pci_stop_and_remove_bus_device_locked pci_lock_rescan_remove pci_stop_and_remove_bus_device ... pcie_pme_remove pcie_pme_suspend synchronize_irq # wait for hotplug IRQ handler pci_unlock_rescan_remove The hotplug side is: pciehp_ist pciehp_handle_presence_or_link_change pciehp_configure_device pci_lock_rescan_remove # wait for pci_unlock_rescan_remove() INFO: task bash:10913 blocked for more than 120 seconds. # ps -ax |grep D PID TTY STAT TIME COMMAND 10913 ttyAMA0 Ds+ 0:00 -bash 14022 ? D 0:00 [irq/745-pciehp] # cat /proc/14022/stack __switch_to+0x94/0xd8 pci_lock_rescan_remove+0x20/0x28 pciehp_configure_device+0x30/0x140 pciehp_handle_presence_or_link_change+0x324/0x458 pciehp_ist+0x1dc/0x1e0 # cat /proc/10913/stack __switch_to+0x94/0xd8 synchronize_irq+0x8c/0xc0 pcie_pme_suspend+0xa4/0x118 pcie_pme_remove+0x20/0x40 pcie_port_remove_service+0x3c/0x58 ... pcie_port_device_remove+0x2c/0x48 pcie_portdrv_remove+0x68/0x78 pci_device_remove+0x48/0x120 ... pci_stop_bus_device+0x84/0xc0 pci_stop_and_remove_bus_device_locked+0x24/0x40 remove_store+0xa4/0xb8 dev_attr_store+0x44/0x60 sysfs_kf_write+0x58/0x80 It is incorrect to call pcie_pme_suspend() from pcie_pme_remove() for two reasons. First, pcie_pme_suspend() calls synchronize_irq(), which will wait for the native hotplug interrupt handler as well as for the PME one, because they share one IRQ (as per the spec). That may deadlock if hotplug is signaled while pcie_pme_remove() is running and the latter calls pci_lock_rescan_remove() before the former. Second, if pcie_pme_suspend() figures out that wakeup needs to be enabled for the port, it will return without disabling the interrupt as expected by pcie_pme_remove() which was overlooked by commit c7b5a4e6e8fb ("PCI / PM: Fix native PME handling during system suspend/resume"). To fix that, rework pcie_pme_remove() to disable the PME interrupt, clear its status and prevent the PME worker function from re-enabling it before calling free_irq() on it, which should be sufficient. Fixes: c7b5a4e6e8fb ("PCI / PM: Fix native PME handling during system suspend/resume") Link: https://lore.kernel.org/linux-pci/c7697e7c-e1af-13e4-8491-0a3996e6ab5d@huawei.com Reported-by: Dongdong Liu Signed-off-by: Rafael J. Wysocki [bhelgaas: add URL and deadlock details from Dongdong] Signed-off-by: Bjorn Helgaas --- drivers/pci/pcie/pme.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/pci/pcie/pme.c b/drivers/pci/pcie/pme.c index 1a8b85051b1b..efa5b552914b 100644 --- a/drivers/pci/pcie/pme.c +++ b/drivers/pci/pcie/pme.c @@ -363,6 +363,16 @@ static bool pcie_pme_check_wakeup(struct pci_bus *bus) return false; } +static void pcie_pme_disable_interrupt(struct pci_dev *port, + struct pcie_pme_service_data *data) +{ + spin_lock_irq(&data->lock); + pcie_pme_interrupt_enable(port, false); + pcie_clear_root_pme_status(port); + data->noirq = true; + spin_unlock_irq(&data->lock); +} + /** * pcie_pme_suspend - Suspend PCIe PME service device. * @srv: PCIe service device to suspend. @@ -387,11 +397,7 @@ static int pcie_pme_suspend(struct pcie_device *srv) return 0; } - spin_lock_irq(&data->lock); - pcie_pme_interrupt_enable(port, false); - pcie_clear_root_pme_status(port); - data->noirq = true; - spin_unlock_irq(&data->lock); + pcie_pme_disable_interrupt(port, data); synchronize_irq(srv->irq); @@ -427,9 +433,11 @@ static int pcie_pme_resume(struct pcie_device *srv) */ static void pcie_pme_remove(struct pcie_device *srv) { - pcie_pme_suspend(srv); + struct pcie_pme_service_data *data = get_service_data(srv); + + pcie_pme_disable_interrupt(srv->port, data); free_irq(srv->irq, srv); - kfree(get_service_data(srv)); + kfree(data); } static struct pcie_port_service_driver pcie_pme_driver = { From 7cf58b79b3072029af127ae865ffc6f00f34b1f8 Mon Sep 17 00:00:00 2001 From: Sven Van Asbroeck Date: Fri, 1 Mar 2019 11:54:19 -0500 Subject: [PATCH 5/5] PCI/PME: Fix possible use-after-free on remove In remove(), ensure that the PME work cannot run after kfree() is called. Otherwise, this could result in a use-after-free. This issue was detected with the help of Coccinelle. Signed-off-by: Sven Van Asbroeck Signed-off-by: Bjorn Helgaas Cc: Sinan Kaya Cc: Frederick Lawler Cc: Mika Westerberg Cc: Keith Busch Cc: Rafael J. Wysocki --- drivers/pci/pcie/pme.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/pcie/pme.c b/drivers/pci/pcie/pme.c index efa5b552914b..54d593d10396 100644 --- a/drivers/pci/pcie/pme.c +++ b/drivers/pci/pcie/pme.c @@ -437,6 +437,7 @@ static void pcie_pme_remove(struct pcie_device *srv) pcie_pme_disable_interrupt(srv->port, data); free_irq(srv->irq, srv); + cancel_work_sync(&data->work); kfree(data); }