From d1510a2e5ab6cb3a67f1c55ca5e7a6d2c6dec340 Mon Sep 17 00:00:00 2001 From: Chris Gorman Date: Wed, 12 Jul 2017 13:31:26 -0400 Subject: [PATCH 001/109] i2c: mux: pinctrl: mention correct module name in Kconfig help text Kconfig says the resulting module is pinctrl-i2cmux, but the module when built is i2c-mux-pinctrl. Fixes: ae58d1e40698 ("i2c: Add generic I2C multiplexer using pinctrl API") Signed-off-by: Chris Gorman Signed-off-by: Peter Rosin --- drivers/i2c/muxes/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/muxes/Kconfig b/drivers/i2c/muxes/Kconfig index 2c64d0e0740f..17121329bb79 100644 --- a/drivers/i2c/muxes/Kconfig +++ b/drivers/i2c/muxes/Kconfig @@ -83,7 +83,7 @@ config I2C_MUX_PINCTRL different sets of pins at run-time. This driver can also be built as a module. If so, the module will be - called pinctrl-i2cmux. + called i2c-mux-pinctrl. config I2C_MUX_REG tristate "Register-based I2C multiplexer" From 3aa0907675a38498d8f2d343e94207ad28a117cf Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Mon, 17 Jul 2017 20:20:08 +0200 Subject: [PATCH 002/109] mtd: nand: atmel: Fix DT backward compatibility in pmecc.c PMECC caps extraction from old DT bindings is broken, thus leading to erroneous EL registers offset, which in turn make HW ECC unusable on sama5d2 when old bindings are in use. Passing the NAND dev node instead of the NFC node to of_match_node() solves the problem. Signed-off-by: Boris Brezillon Fixes: f88fc122cc34 ("mtd: nand: Cleanup/rework the atmel_nand driver") Cc: Tested-by: Romain Izard --- drivers/mtd/nand/atmel/pmecc.c | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/drivers/mtd/nand/atmel/pmecc.c b/drivers/mtd/nand/atmel/pmecc.c index 55a8ee5306ea..8c210a5776bc 100644 --- a/drivers/mtd/nand/atmel/pmecc.c +++ b/drivers/mtd/nand/atmel/pmecc.c @@ -945,6 +945,7 @@ struct atmel_pmecc *devm_atmel_pmecc_get(struct device *userdev) */ struct platform_device *pdev = to_platform_device(userdev); const struct atmel_pmecc_caps *caps; + const struct of_device_id *match; /* No PMECC engine available. */ if (!of_property_read_bool(userdev->of_node, @@ -953,21 +954,11 @@ struct atmel_pmecc *devm_atmel_pmecc_get(struct device *userdev) caps = &at91sam9g45_caps; - /* - * Try to find the NFC subnode and extract the associated caps - * from there. - */ - np = of_find_compatible_node(userdev->of_node, NULL, - "atmel,sama5d3-nfc"); - if (np) { - const struct of_device_id *match; - - match = of_match_node(atmel_pmecc_legacy_match, np); - if (match && match->data) - caps = match->data; - - of_node_put(np); - } + /* Find the caps associated to the NAND dev node. */ + match = of_match_node(atmel_pmecc_legacy_match, + userdev->of_node); + if (match && match->data) + caps = match->data; pmecc = atmel_pmecc_create(pdev, caps, 1, 2); } From b82d6cb41eb54bf1c0e5d2ae73f49a4dff54c54b Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Fri, 21 Jul 2017 10:35:31 +0200 Subject: [PATCH 003/109] xtensa: remove wrapper header for asm/device.h xtensa's asm/device.h is a verbatim copy of asm-generic/device.h and does not add any arch specific extensions. Thus, use the asm-generic header directly. Acked-by: Max Filippov Signed-off-by: Tobias Klauser Signed-off-by: Max Filippov --- arch/xtensa/include/asm/Kbuild | 1 + arch/xtensa/include/asm/device.h | 15 --------------- 2 files changed, 1 insertion(+), 15 deletions(-) delete mode 100644 arch/xtensa/include/asm/device.h diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index 2d716ebc5a5e..a2fdabfce43a 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -1,5 +1,6 @@ generic-y += bug.h generic-y += clkdev.h +generic-y += device.h generic-y += div64.h generic-y += dma-contiguous.h generic-y += emergency-restart.h diff --git a/arch/xtensa/include/asm/device.h b/arch/xtensa/include/asm/device.h deleted file mode 100644 index 1deeb8ebbb1b..000000000000 --- a/arch/xtensa/include/asm/device.h +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Arch specific extensions to struct device - * - * This file is released under the GPLv2 - */ -#ifndef _ASM_XTENSA_DEVICE_H -#define _ASM_XTENSA_DEVICE_H - -struct dev_archdata { -}; - -struct pdev_archdata { -}; - -#endif /* _ASM_XTENSA_DEVICE_H */ From 536dcc9c34035778892a7e3cbf45166ce73f8d34 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Fri, 21 Jul 2017 10:44:07 +0200 Subject: [PATCH 004/109] xtensa: remove wrapper header for asm/param.h xtensa's asm/device.h is a verbatim copy of asm-generic/device.h and does not add any arch specific extensions. Thus, use the asm-generic header directly. Signed-off-by: Tobias Klauser Signed-off-by: Max Filippov --- arch/xtensa/include/asm/Kbuild | 1 + arch/xtensa/include/asm/param.h | 18 ------------------ 2 files changed, 1 insertion(+), 18 deletions(-) delete mode 100644 arch/xtensa/include/asm/param.h diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index a2fdabfce43a..dff7cc39437c 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -18,6 +18,7 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h +generic-y += param.h generic-y += percpu.h generic-y += preempt.h generic-y += rwsem.h diff --git a/arch/xtensa/include/asm/param.h b/arch/xtensa/include/asm/param.h deleted file mode 100644 index 0a70e780ef2a..000000000000 --- a/arch/xtensa/include/asm/param.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * include/asm-xtensa/param.h - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - * - * Copyright (C) 2001 - 2005 Tensilica Inc. - */ -#ifndef _XTENSA_PARAM_H -#define _XTENSA_PARAM_H - -#include - -# define HZ CONFIG_HZ /* internal timer frequency */ -# define USER_HZ 100 /* for user interfaces in "ticks" */ -# define CLOCKS_PER_SEC (USER_HZ) /* frequnzy at which times() counts */ -#endif /* _XTENSA_PARAM_H */ From edf3f301db7af7e784d06f7059dfc8a69359af13 Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Mon, 10 Jul 2017 18:45:41 +0300 Subject: [PATCH 005/109] IB/ipoib: Fix race between light events and interface restart A potential race between light_event and interface restart may attach multicast group to an already attached QP. Scenario: light_event flow goes through ipoib_mcast_dev_flush function, if a context switch occurs before calling ipoib_mcast_remove_list, then we may face a situation where the broadcast of the priv is null and the corresponding QP is not detached yet. If an "interface restart" runs during the previous context switch, the following scenario occurs: When the device goes up, ipoib_ib_dev_up function will be called, it will send a new registration request to the broadcast group and then attach the group to the QP that was not detached before. IPOIB_FLUSH_LIGHT INTERFACE RESTART __ipoib_ib_dev_flush | | | | | | | ipoib_mcast_dev_flush | Move mcast list and broadcast to remove_list | | | | | Context Switch--> | | ipoib_ib_dev_down | | | | | ipoib_ib_dev_up | | | | | ipoib_mcast_join_task | allocate new broadcast | | | | | Attach QP to multicast group | | | | | <--Context Switch ipoib_mcast_leave Detach QP from multicast group Signed-off-by: Feras Daoud Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib.h | 1 + drivers/infiniband/ulp/ipoib/ipoib_main.c | 1 + drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 2 ++ 3 files changed, 4 insertions(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index ff50a7bd66d8..7ac25059c40f 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -336,6 +336,7 @@ struct ipoib_dev_priv { unsigned long flags; struct rw_semaphore vlan_rwsem; + struct mutex mcast_mutex; struct rb_root path_tree; struct list_head path_list; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 4ce315c92b48..144187b407bd 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1877,6 +1877,7 @@ static void ipoib_build_priv(struct net_device *dev) priv->dev = dev; spin_lock_init(&priv->lock); init_rwsem(&priv->vlan_rwsem); + mutex_init(&priv->mcast_mutex); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 057f58e6afca..0a0b2ce45cbc 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -838,6 +838,7 @@ void ipoib_mcast_dev_flush(struct net_device *dev) struct ipoib_mcast *mcast, *tmcast; unsigned long flags; + mutex_lock(&priv->mcast_mutex); ipoib_dbg_mcast(priv, "flushing multicast list\n"); spin_lock_irqsave(&priv->lock, flags); @@ -865,6 +866,7 @@ void ipoib_mcast_dev_flush(struct net_device *dev) wait_for_completion(&mcast->done); ipoib_mcast_remove_list(&remove_list); + mutex_unlock(&priv->mcast_mutex); } static int ipoib_mcast_addr_is_valid(const u8 *addr, const u8 *broadcast) From 6bdc8de2e86e717124a715ecc480892a2c331ff5 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Wed, 12 Jul 2017 10:40:25 +0300 Subject: [PATCH 006/109] IB/ipoib: Use cancel_delayed_work_sync when needed The work mcast_task can re-queue itself, so instead of doing cancel && flush_workqueue, that still can leave a queued task on the air, use cancel_delayed_work_sync. Also, no need to use lock over the cancel, the original lock was due to bit assignment setting (IPOIB_MCAST_RUN) that is not in use anymore. Signed-off-by: Erez Shitrit Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 0a0b2ce45cbc..f80bf0f5d7cf 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -684,15 +684,10 @@ void ipoib_mcast_start_thread(struct net_device *dev) int ipoib_mcast_stop_thread(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); - unsigned long flags; ipoib_dbg_mcast(priv, "stopping multicast thread\n"); - spin_lock_irqsave(&priv->lock, flags); - cancel_delayed_work(&priv->mcast_task); - spin_unlock_irqrestore(&priv->lock, flags); - - flush_workqueue(priv->wq); + cancel_delayed_work_sync(&priv->mcast_task); return 0; } From a08e1120627f72e9ed7c291e3b9f8dd29c1513ab Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Wed, 12 Jul 2017 13:11:54 +0300 Subject: [PATCH 007/109] IB/ipoib: Make sure no in-flight joins while leaving that mcast While cleaning neighs and there is a send-only mcast neigh, the driver should wait to finish its join process before trying to remove it. Without this patch, we will see messages like: "ipoib_mcast_leave on an in-flight join" and unexpected results in the join_complete. Signed-off-by: Erez Shitrit Signed-off-by: Leon Romanovsky --- .../infiniband/ulp/ipoib/ipoib_multicast.c | 24 +++++++------------ 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index f80bf0f5d7cf..93e149efc1f5 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -743,6 +743,14 @@ void ipoib_mcast_remove_list(struct list_head *remove_list) { struct ipoib_mcast *mcast, *tmcast; + /* + * make sure the in-flight joins have finished before we attempt + * to leave + */ + list_for_each_entry_safe(mcast, tmcast, remove_list, list) + if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) + wait_for_completion(&mcast->done); + list_for_each_entry_safe(mcast, tmcast, remove_list, list) { ipoib_mcast_leave(mcast->dev, mcast); ipoib_mcast_free(mcast); @@ -852,14 +860,6 @@ void ipoib_mcast_dev_flush(struct net_device *dev) spin_unlock_irqrestore(&priv->lock, flags); - /* - * make sure the in-flight joins have finished before we attempt - * to leave - */ - list_for_each_entry_safe(mcast, tmcast, &remove_list, list) - if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) - wait_for_completion(&mcast->done); - ipoib_mcast_remove_list(&remove_list); mutex_unlock(&priv->mcast_mutex); } @@ -979,14 +979,6 @@ void ipoib_mcast_restart_task(struct work_struct *work) netif_addr_unlock(dev); local_irq_restore(flags); - /* - * make sure the in-flight joins have finished before we attempt - * to leave - */ - list_for_each_entry_safe(mcast, tmcast, &remove_list, list) - if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) - wait_for_completion(&mcast->done); - ipoib_mcast_remove_list(&remove_list); /* From 11f74b40359b19f760964e71d04882a6caf530cc Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Thu, 13 Jul 2017 11:27:12 +0300 Subject: [PATCH 008/109] IB/ipoib: Prevent setting negative values to max_nonsrq_conn_qp Don't allow negative values to max_nonsrq_conn_qp. There is no functional impact on a negative value but it is logicically incorrect. Fixes: 68e995a29572 ("IPoIB/cm: Add connected mode support for devices without SRQs") Signed-off-by: Alex Vesker Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 144187b407bd..8b7ec15a9d6e 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -2366,6 +2366,7 @@ static int __init ipoib_init_module(void) ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE); #ifdef CONFIG_INFINIBAND_IPOIB_CM ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP); + ipoib_max_conn_qp = max(ipoib_max_conn_qp, 0); #endif /* From d2e46fccc3e3d73a741efe433f00960331280696 Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Sun, 16 Jul 2017 11:33:01 +0300 Subject: [PATCH 009/109] IB/ipoib: Set IPOIB_NEIGH_TBL_FLUSH after flushed completion initialization Set IPOIB_NEIGH_TBL_FLUSH bit after initializing the neighbor flushed completion, otherwise the garbage collector may signal a completion while it is not initialized yet. Fixes: b63b70d87741 ("IPoIB: Use a private hash table for path lookup in xmit path") Signed-off-by: Feras Daoud Signed-off-by: Alex Vesker Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 8b7ec15a9d6e..f4403c52cd67 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1560,6 +1560,7 @@ static void ipoib_flush_neighs(struct ipoib_dev_priv *priv) int i, wait_flushed = 0; init_completion(&priv->ntbl.flushed); + set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); spin_lock_irqsave(&priv->lock, flags); @@ -1604,7 +1605,6 @@ static void ipoib_neigh_hash_uninit(struct net_device *dev) ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n"); init_completion(&priv->ntbl.deleted); - set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags); /* Stop GC if called at init fail need to cancel work */ stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags); From 4829d964dfb027558c732cfa0d13b716ab3f0838 Mon Sep 17 00:00:00 2001 From: Alex Vesker Date: Mon, 10 Jul 2017 18:12:43 +0300 Subject: [PATCH 010/109] IB/ipoib: Add multicast packets statistics Update the multicast counter when multicast packets are received and provide this information through ethtool support. Signed-off-by: Alex Vesker Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 3 ++- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 7871379342f4..184a22f48027 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -52,7 +52,8 @@ static const struct ipoib_stats ipoib_gstrings_stats[] = { IPOIB_NETDEV_STAT(tx_bytes), IPOIB_NETDEV_STAT(tx_errors), IPOIB_NETDEV_STAT(rx_dropped), - IPOIB_NETDEV_STAT(tx_dropped) + IPOIB_NETDEV_STAT(tx_dropped), + IPOIB_NETDEV_STAT(multicast), }; #define IPOIB_GLOBAL_STATS_LEN ARRAY_SIZE(ipoib_gstrings_stats) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 57a9655e844d..02eda1f53a67 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -256,6 +256,8 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) ++dev->stats.rx_packets; dev->stats.rx_bytes += skb->len; + if (skb->pkt_type == PACKET_MULTICAST) + dev->stats.multicast++; skb->dev = dev; if ((dev->features & NETIF_F_RXCSUM) && From eb54714ddcb2462d4d4b8aa78d028b61e217a835 Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Sun, 2 Jul 2017 15:05:59 +0300 Subject: [PATCH 011/109] IB/ipoib: Add get statistics support to SRIOV VF Add SRIOV VF support to get traffic statistics. Signed-off-by: Feras Daoud Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index f4403c52cd67..24fa87fe0952 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1847,6 +1847,7 @@ static const struct net_device_ops ipoib_netdev_ops_vf = { .ndo_tx_timeout = ipoib_timeout, .ndo_set_rx_mode = ipoib_set_mcast_list, .ndo_get_iflink = ipoib_get_iflink, + .ndo_get_stats64 = ipoib_get_stats, }; void ipoib_setup_common(struct net_device *dev) From dc892e17bbae670a3d7aa6ab8bd1033b15b24645 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 13 Jul 2017 13:34:19 +0300 Subject: [PATCH 012/109] IB/ipoib: Clean error paths in add port Refactor error paths in ipoib_add_port() function. The code flow ensures that the function terminates on every error flow and it makes redundant all "else" cases. The functions are called during the flow are returning "result < 0", in case of error, so there is no need to check it explicitly. Fixes: 58e9cc90cda7 ("IB/IPoIB: Fix bad error flow in ipoib_add_port()") Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 24fa87fe0952..6c77df34869d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -2175,14 +2175,14 @@ static struct net_device *ipoib_add_port(const char *format, priv->dev->dev_id = port - 1; result = ib_query_port(hca, port, &attr); - if (!result) - priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); - else { + if (result) { printk(KERN_WARNING "%s: ib_query_port %d failed\n", hca->name, port); goto device_init_failed; } + priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); + /* MTU will be reset when mcast join happens */ priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu; @@ -2213,12 +2213,14 @@ static struct net_device *ipoib_add_port(const char *format, printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n", hca->name, port, result); goto device_init_failed; - } else - memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid)); + } + + memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, + sizeof(union ib_gid)); set_bit(IPOIB_FLAG_DEV_ADDR_SET, &priv->flags); result = ipoib_dev_init(priv->dev, hca, port); - if (result < 0) { + if (result) { printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n", hca->name, port, result); goto device_init_failed; From 1b355094b308f3377c8f574ce86135ee159c6285 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sat, 15 Jul 2017 16:26:55 +0300 Subject: [PATCH 013/109] IB/ipoib: Remove double pointer assigning There is no need to assign "p" pointer twice. This patch fixes the following smatch warning: drivers/infiniband/ulp/ipoib/ipoib_cm.c:517 ipoib_cm_rx_handler() warn: missing break? reassigning 'p->id' Fixes: 839fcaba355a ("IPoIB: Connected mode experimental support") Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index f87d104837dc..d69410c2ed97 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -511,7 +511,6 @@ static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id, case IB_CM_REQ_RECEIVED: return ipoib_cm_req_handler(cm_id, event); case IB_CM_DREQ_RECEIVED: - p = cm_id->context; ib_send_cm_drep(cm_id, NULL, 0); /* Fall through */ case IB_CM_REJ_RECEIVED: From b287b76e89503ef1d403cc5cc8bd74b035d25bfa Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Sun, 23 Jul 2017 10:46:14 +0300 Subject: [PATCH 014/109] Revert "IB/core: Allow QP state transition from reset to error" The commit ebc9ca43e1d5 ("IB/core: Allow QP state transition from reset to error") allowed transition from Reset to Error state for the QPs. This behavior doesn't follow the IBTA specification 1.3, which in 10.3.1 QUEUE PAIR AND EE CONTEXT STATES section. The quote from the spec: "An error can be forced from any state, except Reset, with the Modify QP/EE Verb." Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/verbs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index fb98ed67d5bc..7f8fe443df46 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -895,7 +895,6 @@ static const struct { } qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = { [IB_QPS_RESET] = { [IB_QPS_RESET] = { .valid = 1 }, - [IB_QPS_ERR] = { .valid = 1 }, [IB_QPS_INIT] = { .valid = 1, .req_param = { From 5dc78ad1904db597bdb4427f3ead437aae86f54c Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Thu, 13 Jul 2017 14:29:08 +0300 Subject: [PATCH 015/109] IB/ipoib: Notify on modify QP failure only when relevant Modify QP can fail and it can be acceptable, like when moving from RST to ERR state, all the rest are not acceptable and a message to the log should be printed. The current code prints on all failures and many messages like: "Failed to modify QP to ERROR state" appear, even when supported by the state machine of the QP object. Signed-off-by: Erez Shitrit Signed-off-by: Leon Romanovsky --- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 02eda1f53a67..2e075377242e 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -711,6 +711,27 @@ static int recvs_pending(struct net_device *dev) return pending; } +static void check_qp_movement_and_print(struct ipoib_dev_priv *priv, + struct ib_qp *qp, + enum ib_qp_state new_state) +{ + struct ib_qp_attr qp_attr; + struct ib_qp_init_attr query_init_attr; + int ret; + + ret = ib_query_qp(qp, &qp_attr, IB_QP_STATE, &query_init_attr); + if (ret) { + ipoib_warn(priv, "%s: Failed to query QP\n", __func__); + return; + } + /* print according to the new-state and the previous state.*/ + if (new_state == IB_QPS_ERR && qp_attr.qp_state == IB_QPS_RESET) + ipoib_dbg(priv, "Failed modify QP, IB_QPS_RESET to IB_QPS_ERR, acceptable\n"); + else + ipoib_warn(priv, "Failed to modify QP to state: %d from state: %d\n", + new_state, qp_attr.qp_state); +} + int ipoib_ib_dev_stop_default(struct net_device *dev) { struct ipoib_dev_priv *priv = ipoib_priv(dev); @@ -730,7 +751,7 @@ int ipoib_ib_dev_stop_default(struct net_device *dev) */ qp_attr.qp_state = IB_QPS_ERR; if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE)) - ipoib_warn(priv, "Failed to modify QP to ERROR state\n"); + check_qp_movement_and_print(priv, priv->qp, IB_QPS_ERR); /* Wait for all sends and receives to complete */ begin = jiffies; From 8addebc14a322fa8ca67cd57c6038069acde8ddc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 24 Jul 2017 12:52:56 +0200 Subject: [PATCH 016/109] scsi: bnx2fc: Plug CPU hotplug race bnx2fc_process_new_cqes() has protection against CPU hotplug, which relies on the per cpu thread pointer. This protection is racy because it happens only partially with the per cpu fp_work_lock held. If the CPU is unplugged after the lock is dropped, the wakeup code can dereference a NULL pointer or access freed and potentially reused memory. Restructure the code so the thread check and wakeup happens with the fp_work_lock held. Signed-off-by: Thomas Gleixner Acked-by: Chad Dupuis Signed-off-by: Martin K. Petersen --- drivers/scsi/bnx2fc/bnx2fc_hwi.c | 45 ++++++++++++++++---------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/drivers/scsi/bnx2fc/bnx2fc_hwi.c b/drivers/scsi/bnx2fc/bnx2fc_hwi.c index 913c750205ce..26de61d65a4d 100644 --- a/drivers/scsi/bnx2fc/bnx2fc_hwi.c +++ b/drivers/scsi/bnx2fc/bnx2fc_hwi.c @@ -1008,6 +1008,28 @@ static struct bnx2fc_work *bnx2fc_alloc_work(struct bnx2fc_rport *tgt, u16 wqe) return work; } +/* Pending work request completion */ +static void bnx2fc_pending_work(struct bnx2fc_rport *tgt, unsigned int wqe) +{ + unsigned int cpu = wqe % num_possible_cpus(); + struct bnx2fc_percpu_s *fps; + struct bnx2fc_work *work; + + fps = &per_cpu(bnx2fc_percpu, cpu); + spin_lock_bh(&fps->fp_work_lock); + if (fps->iothread) { + work = bnx2fc_alloc_work(tgt, wqe); + if (work) { + list_add_tail(&work->list, &fps->work_list); + wake_up_process(fps->iothread); + spin_unlock_bh(&fps->fp_work_lock); + return; + } + } + spin_unlock_bh(&fps->fp_work_lock); + bnx2fc_process_cq_compl(tgt, wqe); +} + int bnx2fc_process_new_cqes(struct bnx2fc_rport *tgt) { struct fcoe_cqe *cq; @@ -1042,28 +1064,7 @@ int bnx2fc_process_new_cqes(struct bnx2fc_rport *tgt) /* Unsolicited event notification */ bnx2fc_process_unsol_compl(tgt, wqe); } else { - /* Pending work request completion */ - struct bnx2fc_work *work = NULL; - struct bnx2fc_percpu_s *fps = NULL; - unsigned int cpu = wqe % num_possible_cpus(); - - fps = &per_cpu(bnx2fc_percpu, cpu); - spin_lock_bh(&fps->fp_work_lock); - if (unlikely(!fps->iothread)) - goto unlock; - - work = bnx2fc_alloc_work(tgt, wqe); - if (work) - list_add_tail(&work->list, - &fps->work_list); -unlock: - spin_unlock_bh(&fps->fp_work_lock); - - /* Pending work request completion */ - if (fps->iothread && work) - wake_up_process(fps->iothread); - else - bnx2fc_process_cq_compl(tgt, wqe); + bnx2fc_pending_work(tgt, wqe); num_free_sqes++; } cqe++; From 2c2b66ae9d11d99f5f6d7010f0d46401ed9031ce Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 24 Jul 2017 12:52:57 +0200 Subject: [PATCH 017/109] scsi: bnx2fc: Prevent recursive cpuhotplug locking The BNX2FC module init/exit code installs/removes the hotplug callbacks with the cpu hotplug lock held. This worked with the old CPU locking implementation which allowed recursive locking, but with the new percpu rwsem based mechanism this is not longer allowed. Use the _cpuslocked() variants to fix this. Reported-by: kernel test robot Acked-by: Chad Dupuis Signed-off-by: Thomas Gleixner Signed-off-by: Martin K. Petersen --- drivers/scsi/bnx2fc/bnx2fc_fcoe.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c index 7dfe709a7138..c4ebf8c5d884 100644 --- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c +++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c @@ -2766,15 +2766,16 @@ static int __init bnx2fc_mod_init(void) for_each_online_cpu(cpu) bnx2fc_percpu_thread_create(cpu); - rc = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "scsi/bnx2fc:online", - bnx2fc_cpu_online, NULL); + rc = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, + "scsi/bnx2fc:online", + bnx2fc_cpu_online, NULL); if (rc < 0) goto stop_threads; bnx2fc_online_state = rc; - cpuhp_setup_state_nocalls(CPUHP_SCSI_BNX2FC_DEAD, "scsi/bnx2fc:dead", - NULL, bnx2fc_cpu_dead); + cpuhp_setup_state_nocalls_cpuslocked(CPUHP_SCSI_BNX2FC_DEAD, + "scsi/bnx2fc:dead", + NULL, bnx2fc_cpu_dead); put_online_cpus(); cnic_register_driver(CNIC_ULP_FCOE, &bnx2fc_cnic_cb); @@ -2850,8 +2851,8 @@ static void __exit bnx2fc_mod_exit(void) bnx2fc_percpu_thread_destroy(cpu); } - cpuhp_remove_state_nocalls(bnx2fc_online_state); - cpuhp_remove_state_nocalls(CPUHP_SCSI_BNX2FC_DEAD); + cpuhp_remove_state_nocalls_cpuslocked(bnx2fc_online_state); + cpuhp_remove_state_nocalls_cpuslocked(CPUHP_SCSI_BNX2FC_DEAD); put_online_cpus(); From 2fa2fa1ae6b42fc4c4995fdbf8fd7df96bb25ba4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 24 Jul 2017 12:52:58 +0200 Subject: [PATCH 018/109] scsi: bnx2i: Prevent recursive cpuhotplug locking The BNX2I module init/exit code installs/removes the hotplug callbacks with the cpu hotplug lock held. This worked with the old CPU locking implementation which allowed recursive locking, but with the new percpu rwsem based mechanism this is not longer allowed. Use the _cpuslocked() variants to fix this. Reported-by: Steven Rostedt Acked-by: Chad Dupuis Signed-off-by: Thomas Gleixner Signed-off-by: Martin K. Petersen --- drivers/scsi/bnx2i/bnx2i_init.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/bnx2i/bnx2i_init.c b/drivers/scsi/bnx2i/bnx2i_init.c index 86afc002814c..7487b653e799 100644 --- a/drivers/scsi/bnx2i/bnx2i_init.c +++ b/drivers/scsi/bnx2i/bnx2i_init.c @@ -516,15 +516,16 @@ static int __init bnx2i_mod_init(void) for_each_online_cpu(cpu) bnx2i_percpu_thread_create(cpu); - err = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, - "scsi/bnx2i:online", - bnx2i_cpu_online, NULL); + err = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, + "scsi/bnx2i:online", + bnx2i_cpu_online, NULL); if (err < 0) goto remove_threads; bnx2i_online_state = err; - cpuhp_setup_state_nocalls(CPUHP_SCSI_BNX2I_DEAD, "scsi/bnx2i:dead", - NULL, bnx2i_cpu_dead); + cpuhp_setup_state_nocalls_cpuslocked(CPUHP_SCSI_BNX2I_DEAD, + "scsi/bnx2i:dead", + NULL, bnx2i_cpu_dead); put_online_cpus(); return 0; @@ -574,8 +575,8 @@ static void __exit bnx2i_mod_exit(void) for_each_online_cpu(cpu) bnx2i_percpu_thread_destroy(cpu); - cpuhp_remove_state_nocalls(bnx2i_online_state); - cpuhp_remove_state_nocalls(CPUHP_SCSI_BNX2I_DEAD); + cpuhp_remove_state_nocalls_cpuslocked(bnx2i_online_state); + cpuhp_remove_state_nocalls_cpuslocked(CPUHP_SCSI_BNX2I_DEAD); put_online_cpus(); iscsi_unregister_transport(&bnx2i_iscsi_transport); From 1937f8a29f4a650bc27e0311b43b53509a34fd22 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 24 Jul 2017 12:52:59 +0200 Subject: [PATCH 019/109] scsi: bnx2fc: Simplify CPU hotplug code The CPU hotplug related code of this driver can be simplified by: 1) Consolidating the callbacks into a single state. The CPU thread can be torn down on the CPU which goes offline. There is no point in delaying that to the CPU dead state 2) Let the core code invoke the online/offline callbacks and remove the extra for_each_online_cpu() loops. Signed-off-by: Thomas Gleixner Signed-off-by: Martin K. Petersen --- drivers/scsi/bnx2fc/bnx2fc_fcoe.c | 69 +++++++------------------------ include/linux/cpuhotplug.h | 1 - 2 files changed, 15 insertions(+), 55 deletions(-) diff --git a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c index c4ebf8c5d884..6844ba361616 100644 --- a/drivers/scsi/bnx2fc/bnx2fc_fcoe.c +++ b/drivers/scsi/bnx2fc/bnx2fc_fcoe.c @@ -2624,12 +2624,11 @@ static struct fcoe_transport bnx2fc_transport = { }; /** - * bnx2fc_percpu_thread_create - Create a receive thread for an - * online CPU + * bnx2fc_cpu_online - Create a receive thread for an online CPU * * @cpu: cpu index for the online cpu */ -static void bnx2fc_percpu_thread_create(unsigned int cpu) +static int bnx2fc_cpu_online(unsigned int cpu) { struct bnx2fc_percpu_s *p; struct task_struct *thread; @@ -2639,15 +2638,17 @@ static void bnx2fc_percpu_thread_create(unsigned int cpu) thread = kthread_create_on_node(bnx2fc_percpu_io_thread, (void *)p, cpu_to_node(cpu), "bnx2fc_thread/%d", cpu); + if (IS_ERR(thread)) + return PTR_ERR(thread); + /* bind thread to the cpu */ - if (likely(!IS_ERR(thread))) { - kthread_bind(thread, cpu); - p->iothread = thread; - wake_up_process(thread); - } + kthread_bind(thread, cpu); + p->iothread = thread; + wake_up_process(thread); + return 0; } -static void bnx2fc_percpu_thread_destroy(unsigned int cpu) +static int bnx2fc_cpu_offline(unsigned int cpu) { struct bnx2fc_percpu_s *p; struct task_struct *thread; @@ -2661,7 +2662,6 @@ static void bnx2fc_percpu_thread_destroy(unsigned int cpu) thread = p->iothread; p->iothread = NULL; - /* Free all work in the list */ list_for_each_entry_safe(work, tmp, &p->work_list, list) { list_del_init(&work->list); @@ -2673,20 +2673,6 @@ static void bnx2fc_percpu_thread_destroy(unsigned int cpu) if (thread) kthread_stop(thread); -} - - -static int bnx2fc_cpu_online(unsigned int cpu) -{ - printk(PFX "CPU %x online: Create Rx thread\n", cpu); - bnx2fc_percpu_thread_create(cpu); - return 0; -} - -static int bnx2fc_cpu_dead(unsigned int cpu) -{ - printk(PFX "CPU %x offline: Remove Rx thread\n", cpu); - bnx2fc_percpu_thread_destroy(cpu); return 0; } @@ -2761,31 +2747,16 @@ static int __init bnx2fc_mod_init(void) spin_lock_init(&p->fp_work_lock); } - get_online_cpus(); - - for_each_online_cpu(cpu) - bnx2fc_percpu_thread_create(cpu); - - rc = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, - "scsi/bnx2fc:online", - bnx2fc_cpu_online, NULL); + rc = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "scsi/bnx2fc:online", + bnx2fc_cpu_online, bnx2fc_cpu_offline); if (rc < 0) - goto stop_threads; + goto stop_thread; bnx2fc_online_state = rc; - cpuhp_setup_state_nocalls_cpuslocked(CPUHP_SCSI_BNX2FC_DEAD, - "scsi/bnx2fc:dead", - NULL, bnx2fc_cpu_dead); - put_online_cpus(); - cnic_register_driver(CNIC_ULP_FCOE, &bnx2fc_cnic_cb); - return 0; -stop_threads: - for_each_online_cpu(cpu) - bnx2fc_percpu_thread_destroy(cpu); - put_online_cpus(); +stop_thread: kthread_stop(l2_thread); free_wq: destroy_workqueue(bnx2fc_wq); @@ -2804,7 +2775,6 @@ static void __exit bnx2fc_mod_exit(void) struct fcoe_percpu_s *bg; struct task_struct *l2_thread; struct sk_buff *skb; - unsigned int cpu = 0; /* * NOTE: Since cnic calls register_driver routine rtnl_lock, @@ -2845,16 +2815,7 @@ static void __exit bnx2fc_mod_exit(void) if (l2_thread) kthread_stop(l2_thread); - get_online_cpus(); - /* Destroy per cpu threads */ - for_each_online_cpu(cpu) { - bnx2fc_percpu_thread_destroy(cpu); - } - - cpuhp_remove_state_nocalls_cpuslocked(bnx2fc_online_state); - cpuhp_remove_state_nocalls_cpuslocked(CPUHP_SCSI_BNX2FC_DEAD); - - put_online_cpus(); + cpuhp_remove_state(bnx2fc_online_state); destroy_workqueue(bnx2fc_wq); /* diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index b56573bf440d..2e7b1731ad12 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -39,7 +39,6 @@ enum cpuhp_state { CPUHP_PCI_XGENE_DEAD, CPUHP_IOMMU_INTEL_DEAD, CPUHP_LUSTRE_CFS_DEAD, - CPUHP_SCSI_BNX2FC_DEAD, CPUHP_SCSI_BNX2I_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, From f9f22a86912f9d36b50e9b3b383fabfb9f22dd46 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 24 Jul 2017 12:53:00 +0200 Subject: [PATCH 020/109] scsi: bnx2i: Simplify cpu hotplug code The CPU hotplug related code of this driver can be simplified by: 1) Consolidating the callbacks into a single state. The CPU thread can be torn down on the CPU which goes offline. There is no point in delaying that to the CPU dead state 2) Let the core code invoke the online/offline callbacks and remove the extra for_each_online_cpu() loops. Signed-off-by: Thomas Gleixner Acked-by: Chad Dupuis Signed-off-by: Martin K. Petersen --- drivers/scsi/bnx2i/bnx2i_init.c | 65 ++++++++------------------------- include/linux/cpuhotplug.h | 1 - 2 files changed, 15 insertions(+), 51 deletions(-) diff --git a/drivers/scsi/bnx2i/bnx2i_init.c b/drivers/scsi/bnx2i/bnx2i_init.c index 7487b653e799..4ebcda8d9500 100644 --- a/drivers/scsi/bnx2i/bnx2i_init.c +++ b/drivers/scsi/bnx2i/bnx2i_init.c @@ -404,12 +404,11 @@ int bnx2i_get_stats(void *handle) /** - * bnx2i_percpu_thread_create - Create a receive thread for an - * online CPU + * bnx2i_cpu_online - Create a receive thread for an online CPU * * @cpu: cpu index for the online cpu */ -static void bnx2i_percpu_thread_create(unsigned int cpu) +static int bnx2i_cpu_online(unsigned int cpu) { struct bnx2i_percpu_s *p; struct task_struct *thread; @@ -419,16 +418,17 @@ static void bnx2i_percpu_thread_create(unsigned int cpu) thread = kthread_create_on_node(bnx2i_percpu_io_thread, (void *)p, cpu_to_node(cpu), "bnx2i_thread/%d", cpu); + if (IS_ERR(thread)) + return PTR_ERR(thread); + /* bind thread to the cpu */ - if (likely(!IS_ERR(thread))) { - kthread_bind(thread, cpu); - p->iothread = thread; - wake_up_process(thread); - } + kthread_bind(thread, cpu); + p->iothread = thread; + wake_up_process(thread); + return 0; } - -static void bnx2i_percpu_thread_destroy(unsigned int cpu) +static int bnx2i_cpu_offline(unsigned int cpu) { struct bnx2i_percpu_s *p; struct task_struct *thread; @@ -451,19 +451,6 @@ static void bnx2i_percpu_thread_destroy(unsigned int cpu) spin_unlock_bh(&p->p_work_lock); if (thread) kthread_stop(thread); -} - -static int bnx2i_cpu_online(unsigned int cpu) -{ - pr_info("bnx2i: CPU %x online: Create Rx thread\n", cpu); - bnx2i_percpu_thread_create(cpu); - return 0; -} - -static int bnx2i_cpu_dead(unsigned int cpu) -{ - pr_info("CPU %x offline: Remove Rx thread\n", cpu); - bnx2i_percpu_thread_destroy(cpu); return 0; } @@ -511,28 +498,14 @@ static int __init bnx2i_mod_init(void) p->iothread = NULL; } - get_online_cpus(); - - for_each_online_cpu(cpu) - bnx2i_percpu_thread_create(cpu); - - err = cpuhp_setup_state_nocalls_cpuslocked(CPUHP_AP_ONLINE_DYN, - "scsi/bnx2i:online", - bnx2i_cpu_online, NULL); + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "scsi/bnx2i:online", + bnx2i_cpu_online, bnx2i_cpu_offline); if (err < 0) - goto remove_threads; + goto unreg_driver; bnx2i_online_state = err; - - cpuhp_setup_state_nocalls_cpuslocked(CPUHP_SCSI_BNX2I_DEAD, - "scsi/bnx2i:dead", - NULL, bnx2i_cpu_dead); - put_online_cpus(); return 0; -remove_threads: - for_each_online_cpu(cpu) - bnx2i_percpu_thread_destroy(cpu); - put_online_cpus(); +unreg_driver: cnic_unregister_driver(CNIC_ULP_ISCSI); unreg_xport: iscsi_unregister_transport(&bnx2i_iscsi_transport); @@ -552,7 +525,6 @@ out: static void __exit bnx2i_mod_exit(void) { struct bnx2i_hba *hba; - unsigned cpu = 0; mutex_lock(&bnx2i_dev_lock); while (!list_empty(&adapter_list)) { @@ -570,14 +542,7 @@ static void __exit bnx2i_mod_exit(void) } mutex_unlock(&bnx2i_dev_lock); - get_online_cpus(); - - for_each_online_cpu(cpu) - bnx2i_percpu_thread_destroy(cpu); - - cpuhp_remove_state_nocalls_cpuslocked(bnx2i_online_state); - cpuhp_remove_state_nocalls_cpuslocked(CPUHP_SCSI_BNX2I_DEAD); - put_online_cpus(); + cpuhp_remove_state(bnx2i_online_state); iscsi_unregister_transport(&bnx2i_iscsi_transport); cnic_unregister_driver(CNIC_ULP_ISCSI); diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 2e7b1731ad12..82b30e638430 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -39,7 +39,6 @@ enum cpuhp_state { CPUHP_PCI_XGENE_DEAD, CPUHP_IOMMU_INTEL_DEAD, CPUHP_LUSTRE_CFS_DEAD, - CPUHP_SCSI_BNX2I_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, From 722477c4f2c50bc8d800d293a3bc5aa843f16e8d Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Tue, 25 Jul 2017 11:19:21 +0200 Subject: [PATCH 021/109] scsi: qedf: Limit number of CQs FCOE offloading failed with: [qed_sp_fcoe_func_start:150(sp-0-3b:00.02)]Cannot satisfy CQ amount. CQs requested 8, CQs available 6. Aborting function start [qed_fcoe_start:821()]Failed to start fcoe [__qedf_probe:3041]:6: Cannot start FCoE function. The reason is a newly introduced check in the qed main part. This change also provides the information about how many CQs are available, so we simply limit the number of requested CQs.. Fixes: 3c5da9427802 ("qed: Share additional information with qedf") Signed-off-by: Thomas Bogendoerfer Reviewed-by: Johannes Thumshirn Acked-by: Chad Dupuis Signed-off-by: Martin K. Petersen --- drivers/scsi/qedf/qedf.h | 3 ++- drivers/scsi/qedf/qedf_main.c | 20 +++++++++----------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/scsi/qedf/qedf.h b/drivers/scsi/qedf/qedf.h index 4d038926a455..351f06dfc5a0 100644 --- a/drivers/scsi/qedf/qedf.h +++ b/drivers/scsi/qedf/qedf.h @@ -528,7 +528,8 @@ struct fip_vlan { #define QEDF_WRITE (1 << 0) #define MAX_FIBRE_LUNS 0xffffffff -#define QEDF_MAX_NUM_CQS 8 +#define MIN_NUM_CPUS_MSIX(x) min_t(u32, x->dev_info.num_cqs, \ + num_online_cpus()) /* * PCI function probe defines diff --git a/drivers/scsi/qedf/qedf_main.c b/drivers/scsi/qedf/qedf_main.c index 7786c97e033f..1d13c9ca517d 100644 --- a/drivers/scsi/qedf/qedf_main.c +++ b/drivers/scsi/qedf/qedf_main.c @@ -2760,11 +2760,9 @@ static int qedf_set_fcoe_pf_param(struct qedf_ctx *qedf) * we allocation is the minimum off: * * Number of CPUs - * Number of MSI-X vectors - * Max number allocated in hardware (QEDF_MAX_NUM_CQS) + * Number allocated by qed for our PCI function */ - qedf->num_queues = min((unsigned int)QEDF_MAX_NUM_CQS, - num_online_cpus()); + qedf->num_queues = MIN_NUM_CPUS_MSIX(qedf); QEDF_INFO(&(qedf->dbg_ctx), QEDF_LOG_DISC, "Number of CQs is %d.\n", qedf->num_queues); @@ -2962,6 +2960,13 @@ static int __qedf_probe(struct pci_dev *pdev, int mode) goto err1; } + /* Learn information crucial for qedf to progress */ + rc = qed_ops->fill_dev_info(qedf->cdev, &qedf->dev_info); + if (rc) { + QEDF_ERR(&(qedf->dbg_ctx), "Failed to dev info.\n"); + goto err1; + } + /* queue allocation code should come here * order should be * slowpath_start @@ -2977,13 +2982,6 @@ static int __qedf_probe(struct pci_dev *pdev, int mode) } qed_ops->common->update_pf_params(qedf->cdev, &qedf->pf_params); - /* Learn information crucial for qedf to progress */ - rc = qed_ops->fill_dev_info(qedf->cdev, &qedf->dev_info); - if (rc) { - QEDF_ERR(&(qedf->dbg_ctx), "Failed to dev info.\n"); - goto err1; - } - /* Record BDQ producer doorbell addresses */ qedf->bdq_primary_prod = qedf->dev_info.primary_dbq_rq_addr; qedf->bdq_secondary_prod = qedf->dev_info.secondary_bdq_rq_addr; From e6fd916a625110d75bd4d15b8488df44cf41c9fc Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 25 Jul 2017 22:49:55 +0300 Subject: [PATCH 022/109] scsi: aacraid: reading out of bounds "qd.id" comes directly from the copy_from_user() on the line before so we should verify that it's within bounds. Signed-off-by: Dan Carpenter Signed-off-by: Martin K. Petersen --- drivers/scsi/aacraid/aachba.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/aacraid/aachba.c b/drivers/scsi/aacraid/aachba.c index 707ee2f5954d..4591113c49de 100644 --- a/drivers/scsi/aacraid/aachba.c +++ b/drivers/scsi/aacraid/aachba.c @@ -3198,10 +3198,11 @@ static int query_disk(struct aac_dev *dev, void __user *arg) return -EBUSY; if (copy_from_user(&qd, arg, sizeof (struct aac_query_disk))) return -EFAULT; - if (qd.cnum == -1) + if (qd.cnum == -1) { + if (qd.id < 0 || qd.id >= dev->maximum_num_containers) + return -EINVAL; qd.cnum = qd.id; - else if ((qd.bus == -1) && (qd.id == -1) && (qd.lun == -1)) - { + } else if ((qd.bus == -1) && (qd.id == -1) && (qd.lun == -1)) { if (qd.cnum < 0 || qd.cnum >= dev->maximum_num_containers) return -EINVAL; qd.instance = dev->scsi_host_ptr->host_no; From f930c7043663188429cd9b254e9d761edfc101ce Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Thu, 27 Jul 2017 09:11:26 +0200 Subject: [PATCH 023/109] scsi: sg: only check for dxfer_len greater than 256M Don't make any assumptions on the sg_io_hdr_t::dxfer_direction or the sg_io_hdr_t::dxferp in order to determine if it is a valid request. The only way we can check for bad requests is by checking if the length exceeds 256M. Signed-off-by: Johannes Thumshirn Fixes: 28676d869bbb (scsi: sg: check for valid direction before starting the request) Reported-by: Jason L Tibbitts III Tested-by: Jason L Tibbitts III Suggested-by: Doug Gilbert Cc: Doug Gilbert Cc: Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen --- drivers/scsi/sg.c | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 4fe606b000b4..d7ff71e0c85c 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -751,35 +751,6 @@ sg_new_write(Sg_fd *sfp, struct file *file, const char __user *buf, return count; } -static bool sg_is_valid_dxfer(sg_io_hdr_t *hp) -{ - switch (hp->dxfer_direction) { - case SG_DXFER_NONE: - if (hp->dxferp || hp->dxfer_len > 0) - return false; - return true; - case SG_DXFER_FROM_DEV: - /* - * for SG_DXFER_FROM_DEV we always set dxfer_len to > 0. dxferp - * can either be NULL or != NULL so there's no point in checking - * it either. So just return true. - */ - return true; - case SG_DXFER_TO_DEV: - case SG_DXFER_TO_FROM_DEV: - if (!hp->dxferp || hp->dxfer_len == 0) - return false; - return true; - case SG_DXFER_UNKNOWN: - if ((!hp->dxferp && hp->dxfer_len) || - (hp->dxferp && hp->dxfer_len == 0)) - return false; - return true; - default: - return false; - } -} - static int sg_common_write(Sg_fd * sfp, Sg_request * srp, unsigned char *cmnd, int timeout, int blocking) @@ -800,7 +771,7 @@ sg_common_write(Sg_fd * sfp, Sg_request * srp, "sg_common_write: scsi opcode=0x%02x, cmd_size=%d\n", (int) cmnd[0], (int) hp->cmd_len)); - if (!sg_is_valid_dxfer(hp)) + if (hp->dxfer_len >= SZ_256M) return -EINVAL; k = sg_start_req(srp, cmnd); From 6d0f581d1768d3eaba15776e7dd1fdfec10cfe36 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Fri, 28 Jul 2017 17:42:59 -0700 Subject: [PATCH 024/109] xtensa: fix cache aliasing handling code for WT cache Currently building kernel for xtensa core with aliasing WT cache fails with the following messages: mm/memory.c:2152: undefined reference to `flush_dcache_page' mm/memory.c:2332: undefined reference to `local_flush_cache_page' mm/memory.c:1919: undefined reference to `local_flush_cache_range' mm/memory.c:4179: undefined reference to `copy_to_user_page' mm/memory.c:4183: undefined reference to `copy_from_user_page' This happens because implementation of these functions is only compiled when data cache is WB, which looks wrong: even when data cache doesn't need flushing it still needs invalidation. The functions like __flush_[invalidate_]dcache_* are correctly defined for both WB and WT caches (and even if they weren't that'd still be ok, just slower). Fix this by providing the same implementation of the above functions for both WB and WT cache. Cc: stable@vger.kernel.org Signed-off-by: Max Filippov --- arch/xtensa/mm/cache.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/xtensa/mm/cache.c b/arch/xtensa/mm/cache.c index 1a804a2f9a5b..dbb1cdef3663 100644 --- a/arch/xtensa/mm/cache.c +++ b/arch/xtensa/mm/cache.c @@ -120,10 +120,6 @@ void copy_user_highpage(struct page *dst, struct page *src, preempt_enable(); } -#endif /* DCACHE_WAY_SIZE > PAGE_SIZE */ - -#if (DCACHE_WAY_SIZE > PAGE_SIZE) && XCHAL_DCACHE_IS_WRITEBACK - /* * Any time the kernel writes to a user page cache page, or it is about to * read from a page cache page this routine is called. @@ -208,7 +204,7 @@ void local_flush_cache_page(struct vm_area_struct *vma, unsigned long address, __invalidate_icache_page_alias(virt, phys); } -#endif +#endif /* DCACHE_WAY_SIZE > PAGE_SIZE */ void update_mmu_cache(struct vm_area_struct * vma, unsigned long addr, pte_t *ptep) @@ -225,7 +221,7 @@ update_mmu_cache(struct vm_area_struct * vma, unsigned long addr, pte_t *ptep) flush_tlb_page(vma, addr); -#if (DCACHE_WAY_SIZE > PAGE_SIZE) && XCHAL_DCACHE_IS_WRITEBACK +#if (DCACHE_WAY_SIZE > PAGE_SIZE) if (!PageReserved(page) && test_bit(PG_arch_1, &page->flags)) { unsigned long phys = page_to_phys(page); @@ -256,7 +252,7 @@ update_mmu_cache(struct vm_area_struct * vma, unsigned long addr, pte_t *ptep) * flush_dcache_page() on the page. */ -#if (DCACHE_WAY_SIZE > PAGE_SIZE) && XCHAL_DCACHE_IS_WRITEBACK +#if (DCACHE_WAY_SIZE > PAGE_SIZE) void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, const void *src, From 6ab1d8da972d4c4e318607e96c5ecb32101c80f4 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Fri, 28 Jul 2017 21:41:18 +0200 Subject: [PATCH 025/109] block, bfq: reset in_service_entity if it becomes idle BFQ implements hierarchical scheduling by representing each group of queues with a generic parent entity. For each parent entity, BFQ maintains an in_service_entity pointer: if one of the child entities happens to be in service, in_service_entity points to it. The resetting of these pointers happens only on queue expirations: when the in-service queue is expired, i.e., stops to be the queue in service, BFQ resets all in_service_entity pointers along the parent-entity path from this queue to the root entity. Functions handling the scheduling of entities assume, naturally, that in-service entities are active, i.e., have pending I/O requests (or, as a special case, even if they have no pending requests, they are expected to receive a new request very soon, with the scheduler idling the storage device while waiting for such an event). Unfortunately, the above resetting scheme of the in_service_entity pointers may cause this assumption to be violated. For example, the in-service queue may happen to remain without requests because of a request merge. In this case the queue does become idle, and all related data structures are updated accordingly. But in_service_entity still points to the queue in the parent entity. This inconsistency may even propagate to higher-level parent entities, if they happen to become idle as well, as a consequence of the leaf queue becoming idle. For this queue and parent entities, scheduling functions have an undefined behaviour, and, as reported, may easily lead to kernel crashes or hangs. This commit addresses this issue by simply resetting the in_service_entity field also when it is detected to point to an entity becoming idle (regardless of why the entity becomes idle). Reported-by: Laurentiu Nicola Signed-off-by: Paolo Valente Tested-by: Laurentiu Nicola Signed-off-by: Jens Axboe --- block/bfq-wf2q.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 979f8f21b7e2..881bbe5e1827 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -1158,8 +1158,10 @@ bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree) st = bfq_entity_service_tree(entity); is_in_service = entity == sd->in_service_entity; - if (is_in_service) + if (is_in_service) { bfq_calc_finish(entity, entity->service); + sd->in_service_entity = NULL; + } if (entity->tree == &st->active) bfq_active_extract(st, entity); From 46d556e6aaa0ec4dc83648ab1ca3d01dd2fa3ea3 Mon Sep 17 00:00:00 2001 From: Paolo Valente Date: Sat, 29 Jul 2017 12:42:56 +0200 Subject: [PATCH 026/109] block, bfq: consider also in_service_entity to state whether an entity is active Groups of BFQ queues are represented by generic entities in BFQ. When a queue belonging to a parent entity is deactivated, the parent entity may need to be deactivated too, in case the deactivated queue was the only active queue for the parent entity. This deactivation may need to be propagated upwards if the entity belongs, in its turn, to a further higher-level entity, and so on. In particular, the upward propagation of deactivation stops at the first parent entity that remains active even if one of its child entities has been deactivated. To decide whether the last non-deactivation condition holds for a parent entity, BFQ checks whether the field next_in_service is still not NULL for the parent entity, after the deactivation of one of its child entity. If it is not NULL, then there are certainly other active entities in the parent entity, and deactivations can stop. Unfortunately, this check misses a corner case: if in_service_entity is not NULL, then next_in_service may happen to be NULL, although the parent entity is evidently active. This happens if: 1) the entity pointed by in_service_entity is the only active entity in the parent entity, and 2) according to the definition of next_in_service, the in_service_entity cannot be considered as next_in_service. See the comments on the definition of next_in_service for details on this second point. Hitting the above corner case causes crashes. To address this issue, this commit: 1) Extends the above check on only next_in_service to controlling both next_in_service and in_service_entity (if any of them is not NULL, then no further deactivation is performed) 2) Improves the (important) comments on how next_in_service is defined and updated; in particular it fixes a few rather obscure paragraphs Reported-by: Eric Wheeler Reported-by: Rick Yiu Reported-by: Tom X Nguyen Signed-off-by: Paolo Valente Tested-by: Eric Wheeler Tested-by: Rick Yiu Tested-by: Laurentiu Nicola Tested-by: Tom X Nguyen Signed-off-by: Jens Axboe --- block/bfq-iosched.h | 22 +++++-- block/bfq-wf2q.c | 142 ++++++++++++++++++++++++-------------------- 2 files changed, 95 insertions(+), 69 deletions(-) diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 63e771ab56d8..859f0a8c97c8 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -71,17 +71,29 @@ struct bfq_service_tree { * * bfq_sched_data is the basic scheduler queue. It supports three * ioprio_classes, and can be used either as a toplevel queue or as an - * intermediate queue on a hierarchical setup. @next_in_service - * points to the active entity of the sched_data service trees that - * will be scheduled next. It is used to reduce the number of steps - * needed for each hierarchical-schedule update. + * intermediate queue in a hierarchical setup. * * The supported ioprio_classes are the same as in CFQ, in descending * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. * Requests from higher priority queues are served before all the * requests from lower priority queues; among requests of the same * queue requests are served according to B-WF2Q+. - * All the fields are protected by the queue lock of the containing bfqd. + * + * The schedule is implemented by the service trees, plus the field + * @next_in_service, which points to the entity on the active trees + * that will be served next, if 1) no changes in the schedule occurs + * before the current in-service entity is expired, 2) the in-service + * queue becomes idle when it expires, and 3) if the entity pointed by + * in_service_entity is not a queue, then the in-service child entity + * of the entity pointed by in_service_entity becomes idle on + * expiration. This peculiar definition allows for the following + * optimization, not yet exploited: while a given entity is still in + * service, we already know which is the best candidate for next + * service among the other active entitities in the same parent + * entity. We can then quickly compare the timestamps of the + * in-service entity with those of such best candidate. + * + * All fields are protected by the lock of the containing bfqd. */ struct bfq_sched_data { /* entity in service */ diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 881bbe5e1827..911aa7431dbe 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -188,21 +188,23 @@ static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) /* * This function tells whether entity stops being a candidate for next - * service, according to the following logic. + * service, according to the restrictive definition of the field + * next_in_service. In particular, this function is invoked for an + * entity that is about to be set in service. * - * This function is invoked for an entity that is about to be set in - * service. If such an entity is a queue, then the entity is no longer - * a candidate for next service (i.e, a candidate entity to serve - * after the in-service entity is expired). The function then returns - * true. + * If entity is a queue, then the entity is no longer a candidate for + * next service according to the that definition, because entity is + * about to become the in-service queue. This function then returns + * true if entity is a queue. * - * In contrast, the entity could stil be a candidate for next service - * if it is not a queue, and has more than one child. In fact, even if - * one of its children is about to be set in service, other children - * may still be the next to serve. As a consequence, a non-queue - * entity is not a candidate for next-service only if it has only one - * child. And only if this condition holds, then the function returns - * true for a non-queue entity. + * In contrast, entity could still be a candidate for next service if + * it is not a queue, and has more than one active child. In fact, + * even if one of its children is about to be set in service, other + * active children may still be the next to serve, for the parent + * entity, even according to the above definition. As a consequence, a + * non-queue entity is not a candidate for next-service only if it has + * only one active child. And only if this condition holds, then this + * function returns true for a non-queue entity. */ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) { @@ -213,6 +215,18 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) bfqg = container_of(entity, struct bfq_group, entity); + /* + * The field active_entities does not always contain the + * actual number of active children entities: it happens to + * not account for the in-service entity in case the latter is + * removed from its active tree (which may get done after + * invoking the function bfq_no_longer_next_in_service in + * bfq_get_next_queue). Fortunately, here, i.e., while + * bfq_no_longer_next_in_service is not yet completed in + * bfq_get_next_queue, bfq_active_extract has not yet been + * invoked, and thus active_entities still coincides with the + * actual number of active entities. + */ if (bfqg->active_entities == 1) return true; @@ -954,7 +968,7 @@ static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, * one of its children receives a new request. * * Basically, this function updates the timestamps of entity and - * inserts entity into its active tree, ater possible extracting it + * inserts entity into its active tree, ater possibly extracting it * from its idle tree. */ static void __bfq_activate_entity(struct bfq_entity *entity, @@ -1048,7 +1062,7 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) entity->start = entity->finish; /* * In addition, if the entity had more than one child - * when set in service, then was not extracted from + * when set in service, then it was not extracted from * the active tree. This implies that the position of * the entity in the active tree may need to be * changed now, because we have just updated the start @@ -1056,9 +1070,8 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) * time in a moment (the requeueing is then, more * precisely, a repositioning in this case). To * implement this repositioning, we: 1) dequeue the - * entity here, 2) update the finish time and - * requeue the entity according to the new - * timestamps below. + * entity here, 2) update the finish time and requeue + * the entity according to the new timestamps below. */ if (entity->tree) bfq_active_extract(st, entity); @@ -1105,9 +1118,10 @@ static void __bfq_activate_requeue_entity(struct bfq_entity *entity, /** - * bfq_activate_entity - activate or requeue an entity representing a bfq_queue, - * and activate, requeue or reposition all ancestors - * for which such an update becomes necessary. + * bfq_activate_requeue_entity - activate or requeue an entity representing a + * bfq_queue, and activate, requeue or reposition + * all ancestors for which such an update becomes + * necessary. * @entity: the entity to activate. * @non_blocking_wait_rq: true if this entity was waiting for a request * @requeue: true if this is a requeue, which implies that bfqq is @@ -1135,9 +1149,9 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity, * @ins_into_idle_tree: if false, the entity will not be put into the * idle tree. * - * Deactivates an entity, independently from its previous state. Must + * Deactivates an entity, independently of its previous state. Must * be invoked only if entity is on a service tree. Extracts the entity - * from that tree, and if necessary and allowed, puts it on the idle + * from that tree, and if necessary and allowed, puts it into the idle * tree. */ bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree) @@ -1179,7 +1193,7 @@ bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree) /** * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. * @entity: the entity to deactivate. - * @ins_into_idle_tree: true if the entity can be put on the idle tree + * @ins_into_idle_tree: true if the entity can be put into the idle tree */ static void bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree, @@ -1210,16 +1224,29 @@ static void bfq_deactivate_entity(struct bfq_entity *entity, */ bfq_update_next_in_service(sd, NULL); - if (sd->next_in_service) + if (sd->next_in_service || sd->in_service_entity) { /* - * The parent entity is still backlogged, - * because next_in_service is not NULL. So, no - * further upwards deactivation must be - * performed. Yet, next_in_service has - * changed. Then the schedule does need to be - * updated upwards. + * The parent entity is still active, because + * either next_in_service or in_service_entity + * is not NULL. So, no further upwards + * deactivation must be performed. Yet, + * next_in_service has changed. Then the + * schedule does need to be updated upwards. + * + * NOTE If in_service_entity is not NULL, then + * next_in_service may happen to be NULL, + * although the parent entity is evidently + * active. This happens if 1) the entity + * pointed by in_service_entity is the only + * active entity in the parent entity, and 2) + * according to the definition of + * next_in_service, the in_service_entity + * cannot be considered as + * next_in_service. See the comments on the + * definition of next_in_service for details. */ break; + } /* * If we get here, then the parent is no more @@ -1496,47 +1523,34 @@ struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) /* * If entity is no longer a candidate for next - * service, then we extract it from its active tree, - * for the following reason. To further boost the - * throughput in some special case, BFQ needs to know - * which is the next candidate entity to serve, while - * there is already an entity in service. In this - * respect, to make it easy to compute/update the next - * candidate entity to serve after the current - * candidate has been set in service, there is a case - * where it is necessary to extract the current - * candidate from its service tree. Such a case is - * when the entity just set in service cannot be also - * a candidate for next service. Details about when - * this conditions holds are reported in the comments - * on the function bfq_no_longer_next_in_service() - * invoked below. + * service, then it must be extracted from its active + * tree, so as to make sure that it won't be + * considered when computing next_in_service. See the + * comments on the function + * bfq_no_longer_next_in_service() for details. */ if (bfq_no_longer_next_in_service(entity)) bfq_active_extract(bfq_entity_service_tree(entity), entity); /* - * For the same reason why we may have just extracted - * entity from its active tree, we may need to update - * next_in_service for the sched_data of entity too, - * regardless of whether entity has been extracted. - * In fact, even if entity has not been extracted, a - * descendant entity may get extracted. Such an event - * would cause a change in next_in_service for the - * level of the descendant entity, and thus possibly - * back to upper levels. + * Even if entity is not to be extracted according to + * the above check, a descendant entity may get + * extracted in one of the next iterations of this + * loop. Such an event could cause a change in + * next_in_service for the level of the descendant + * entity, and thus possibly back to this level. * - * We cannot perform the resulting needed update - * before the end of this loop, because, to know which - * is the correct next-to-serve candidate entity for - * each level, we need first to find the leaf entity - * to set in service. In fact, only after we know - * which is the next-to-serve leaf entity, we can - * discover whether the parent entity of the leaf - * entity becomes the next-to-serve, and so on. + * However, we cannot perform the resulting needed + * update of next_in_service for this level before the + * end of the whole loop, because, to know which is + * the correct next-to-serve candidate entity for each + * level, we need first to find the leaf entity to set + * in service. In fact, only after we know which is + * the next-to-serve leaf entity, we can discover + * whether the parent entity of the leaf entity + * becomes the next-to-serve, and so on. */ - } bfqq = bfq_entity_to_bfqq(entity); From 54e22f265e872ae140755b3318521d400a094605 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Linus=20L=C3=BCssing?= Date: Thu, 6 Jul 2017 07:02:25 +0200 Subject: [PATCH 027/109] batman-adv: fix TT sync flag inconsistencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes an issue in the translation table code potentially leading to a TT Request + Response storm. The issue may occur for nodes involving BLA and an inconsistent configuration of the batman-adv AP isolation feature. However, since the new multicast optimizations, a single, malformed packet may lead to a mesh-wide, persistent Denial-of-Service, too. The issue occurs because nodes are currently OR-ing the TT sync flags of all originators announcing a specific MAC address via the translation table. When an intermediate node now receives a TT Request and wants to answer this on behalf of the destination node, then this intermediate node now responds with an altered flag field and broken CRC. The next OGM of the real destination will lead to a CRC mismatch and triggering a TT Request and Response again. Furthermore, the OR-ing is currently never undone as long as at least one originator announcing the according MAC address remains, leading to the potential persistency of this issue. This patch fixes this issue by storing the flags used in the CRC calculation on a a per TT orig entry basis to be able to respond with the correct, original flags in an intermediate TT Response for one thing. And to be able to correctly unset sync flags once all nodes announcing a sync flag vanish for another. Fixes: e9c00136a475 ("batman-adv: fix tt_global_entries flags update") Signed-off-by: Linus Lüssing Acked-by: Antonio Quartulli [sw: typo in commit message] Signed-off-by: Simon Wunderlich --- net/batman-adv/translation-table.c | 60 +++++++++++++++++++++++++----- net/batman-adv/types.h | 2 + 2 files changed, 53 insertions(+), 9 deletions(-) diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index e1133bc634b5..8a3ce79b1307 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1549,9 +1549,41 @@ batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry, return found; } +/** + * batadv_tt_global_sync_flags - update TT sync flags + * @tt_global: the TT global entry to update sync flags in + * + * Updates the sync flag bits in the tt_global flag attribute with a logical + * OR of all sync flags from any of its TT orig entries. + */ +static void +batadv_tt_global_sync_flags(struct batadv_tt_global_entry *tt_global) +{ + struct batadv_tt_orig_list_entry *orig_entry; + const struct hlist_head *head; + u16 flags = BATADV_NO_FLAGS; + + rcu_read_lock(); + head = &tt_global->orig_list; + hlist_for_each_entry_rcu(orig_entry, head, list) + flags |= orig_entry->flags; + rcu_read_unlock(); + + flags |= tt_global->common.flags & (~BATADV_TT_SYNC_MASK); + tt_global->common.flags = flags; +} + +/** + * batadv_tt_global_orig_entry_add - add or update a TT orig entry + * @tt_global: the TT global entry to add an orig entry in + * @orig_node: the originator to add an orig entry for + * @ttvn: translation table version number of this changeset + * @flags: TT sync flags + */ static void batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, - struct batadv_orig_node *orig_node, int ttvn) + struct batadv_orig_node *orig_node, int ttvn, + u8 flags) { struct batadv_tt_orig_list_entry *orig_entry; @@ -1561,7 +1593,8 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, * was added during a "temporary client detection" */ orig_entry->ttvn = ttvn; - goto out; + orig_entry->flags = flags; + goto sync_flags; } orig_entry = kmem_cache_zalloc(batadv_tt_orig_cache, GFP_ATOMIC); @@ -1573,6 +1606,7 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, batadv_tt_global_size_inc(orig_node, tt_global->common.vid); orig_entry->orig_node = orig_node; orig_entry->ttvn = ttvn; + orig_entry->flags = flags; kref_init(&orig_entry->refcount); spin_lock_bh(&tt_global->list_lock); @@ -1582,6 +1616,8 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global, spin_unlock_bh(&tt_global->list_lock); atomic_inc(&tt_global->orig_list_count); +sync_flags: + batadv_tt_global_sync_flags(tt_global); out: if (orig_entry) batadv_tt_orig_list_entry_put(orig_entry); @@ -1703,10 +1739,10 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, } /* the change can carry possible "attribute" flags like the - * TT_CLIENT_WIFI, therefore they have to be copied in the + * TT_CLIENT_TEMP, therefore they have to be copied in the * client entry */ - common->flags |= flags; + common->flags |= flags & (~BATADV_TT_SYNC_MASK); /* If there is the BATADV_TT_CLIENT_ROAM flag set, there is only * one originator left in the list and we previously received a @@ -1723,7 +1759,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, } add_orig_entry: /* add the new orig_entry (if needed) or update it */ - batadv_tt_global_orig_entry_add(tt_global_entry, orig_node, ttvn); + batadv_tt_global_orig_entry_add(tt_global_entry, orig_node, ttvn, + flags & BATADV_TT_SYNC_MASK); batadv_dbg(BATADV_DBG_TT, bat_priv, "Creating new global tt entry: %pM (vid: %d, via %pM)\n", @@ -1946,6 +1983,7 @@ batadv_tt_global_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, struct batadv_tt_orig_list_entry *orig, bool best) { + u16 flags = (common->flags & (~BATADV_TT_SYNC_MASK)) | orig->flags; void *hdr; struct batadv_orig_node_vlan *vlan; u8 last_ttvn; @@ -1975,7 +2013,7 @@ batadv_tt_global_dump_subentry(struct sk_buff *msg, u32 portid, u32 seq, nla_put_u8(msg, BATADV_ATTR_TT_LAST_TTVN, last_ttvn) || nla_put_u32(msg, BATADV_ATTR_TT_CRC32, crc) || nla_put_u16(msg, BATADV_ATTR_TT_VID, common->vid) || - nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, common->flags)) + nla_put_u32(msg, BATADV_ATTR_TT_FLAGS, flags)) goto nla_put_failure; if (best && nla_put_flag(msg, BATADV_ATTR_FLAG_BEST)) @@ -2589,6 +2627,7 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, unsigned short vid) { struct batadv_hashtable *hash = bat_priv->tt.global_hash; + struct batadv_tt_orig_list_entry *tt_orig; struct batadv_tt_common_entry *tt_common; struct batadv_tt_global_entry *tt_global; struct hlist_head *head; @@ -2627,8 +2666,9 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, /* find out if this global entry is announced by this * originator */ - if (!batadv_tt_global_entry_has_orig(tt_global, - orig_node)) + tt_orig = batadv_tt_global_orig_entry_find(tt_global, + orig_node); + if (!tt_orig) continue; /* use network order to read the VID: this ensures that @@ -2640,10 +2680,12 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv, /* compute the CRC on flags that have to be kept in sync * among nodes */ - flags = tt_common->flags & BATADV_TT_SYNC_MASK; + flags = tt_orig->flags; crc_tmp = crc32c(crc_tmp, &flags, sizeof(flags)); crc ^= crc32c(crc_tmp, tt_common->addr, ETH_ALEN); + + batadv_tt_orig_list_entry_put(tt_orig); } rcu_read_unlock(); } diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index ea43a6449247..a62795868794 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1260,6 +1260,7 @@ struct batadv_tt_global_entry { * struct batadv_tt_orig_list_entry - orig node announcing a non-mesh client * @orig_node: pointer to orig node announcing this non-mesh client * @ttvn: translation table version number which added the non-mesh client + * @flags: per orig entry TT sync flags * @list: list node for batadv_tt_global_entry::orig_list * @refcount: number of contexts the object is used * @rcu: struct used for freeing in an RCU-safe manner @@ -1267,6 +1268,7 @@ struct batadv_tt_global_entry { struct batadv_tt_orig_list_entry { struct batadv_orig_node *orig_node; u8 ttvn; + u8 flags; struct hlist_node list; struct kref refcount; struct rcu_head rcu; From c64ffff7a9d1eee6624d3eaab36968fe6df31a9f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 17 Jul 2017 17:13:28 +0300 Subject: [PATCH 028/109] i2c: core: Allow empty id_table in ACPI case as well For now empty ID table is not allowed with ACPI and prevents driver to be probed. Add a check to allow empty ID table. This introduces a helper i2c_acpi_match_device(). Note, we rename some static function in i2c-core-acpi.c to distinguish with public API. Fixes: da10c06a044b ("i2c: Make I2C ID tables non-mandatory for DT'ed devices") Signed-off-by: Andy Shevchenko Tested-by: Rajmohan Mani Acked-by: Mika Westerberg [wsa: needed to get some drivers probed again] Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-core-acpi.c | 19 +++++++++++++++---- drivers/i2c/i2c-core-base.c | 1 + drivers/i2c/i2c-core.h | 9 +++++++++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/drivers/i2c/i2c-core-acpi.c b/drivers/i2c/i2c-core-acpi.c index 4842ec3a5451..a9126b3cda61 100644 --- a/drivers/i2c/i2c-core-acpi.c +++ b/drivers/i2c/i2c-core-acpi.c @@ -230,6 +230,16 @@ void i2c_acpi_register_devices(struct i2c_adapter *adap) dev_warn(&adap->dev, "failed to enumerate I2C slaves\n"); } +const struct acpi_device_id * +i2c_acpi_match_device(const struct acpi_device_id *matches, + struct i2c_client *client) +{ + if (!(client && matches)) + return NULL; + + return acpi_match_device(matches, &client->dev); +} + static acpi_status i2c_acpi_lookup_speed(acpi_handle handle, u32 level, void *data, void **return_value) { @@ -289,7 +299,7 @@ u32 i2c_acpi_find_bus_speed(struct device *dev) } EXPORT_SYMBOL_GPL(i2c_acpi_find_bus_speed); -static int i2c_acpi_match_adapter(struct device *dev, void *data) +static int i2c_acpi_find_match_adapter(struct device *dev, void *data) { struct i2c_adapter *adapter = i2c_verify_adapter(dev); @@ -299,7 +309,7 @@ static int i2c_acpi_match_adapter(struct device *dev, void *data) return ACPI_HANDLE(dev) == (acpi_handle)data; } -static int i2c_acpi_match_device(struct device *dev, void *data) +static int i2c_acpi_find_match_device(struct device *dev, void *data) { return ACPI_COMPANION(dev) == data; } @@ -309,7 +319,7 @@ static struct i2c_adapter *i2c_acpi_find_adapter_by_handle(acpi_handle handle) struct device *dev; dev = bus_find_device(&i2c_bus_type, NULL, handle, - i2c_acpi_match_adapter); + i2c_acpi_find_match_adapter); return dev ? i2c_verify_adapter(dev) : NULL; } @@ -317,7 +327,8 @@ static struct i2c_client *i2c_acpi_find_client_by_adev(struct acpi_device *adev) { struct device *dev; - dev = bus_find_device(&i2c_bus_type, NULL, adev, i2c_acpi_match_device); + dev = bus_find_device(&i2c_bus_type, NULL, adev, + i2c_acpi_find_match_device); return dev ? i2c_verify_client(dev) : NULL; } diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c index c89dac7fd2e7..12822a4b8f8f 100644 --- a/drivers/i2c/i2c-core-base.c +++ b/drivers/i2c/i2c-core-base.c @@ -357,6 +357,7 @@ static int i2c_device_probe(struct device *dev) * Tree match table entry is supplied for the probing device. */ if (!driver->id_table && + !i2c_acpi_match_device(dev->driver->acpi_match_table, client) && !i2c_of_match_device(dev->driver->of_match_table, client)) return -ENODEV; diff --git a/drivers/i2c/i2c-core.h b/drivers/i2c/i2c-core.h index 3b63f5e5b89c..3d3d9bf02101 100644 --- a/drivers/i2c/i2c-core.h +++ b/drivers/i2c/i2c-core.h @@ -31,9 +31,18 @@ int i2c_check_addr_validity(unsigned addr, unsigned short flags); int i2c_check_7bit_addr_validity_strict(unsigned short addr); #ifdef CONFIG_ACPI +const struct acpi_device_id * +i2c_acpi_match_device(const struct acpi_device_id *matches, + struct i2c_client *client); void i2c_acpi_register_devices(struct i2c_adapter *adap); #else /* CONFIG_ACPI */ static inline void i2c_acpi_register_devices(struct i2c_adapter *adap) { } +static inline const struct acpi_device_id * +i2c_acpi_match_device(const struct acpi_device_id *matches, + struct i2c_client *client) +{ + return NULL; +} #endif /* CONFIG_ACPI */ extern struct notifier_block i2c_acpi_notifier; From 22acc37b8681461707a3fc73505af808f9af8260 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 13 Jul 2017 15:45:01 +0200 Subject: [PATCH 029/109] i2c: designware: Print clock freq on invalid clock freq error When we refuse to probe due to an invalid clock frequency, log the frequency which is causing this error. Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-platdrv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index 2ea6d0d25a01..d139b156f9c9 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -319,7 +319,8 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) if (dev->clk_freq != 100000 && dev->clk_freq != 400000 && dev->clk_freq != 1000000 && dev->clk_freq != 3400000) { dev_err(&pdev->dev, - "Only 100kHz, 400kHz, 1MHz and 3.4MHz supported"); + "%d Hz is unsupported, only 100kHz, 400kHz, 1MHz and 3.4MHz are supported\n", + dev->clk_freq); ret = -EINVAL; goto exit_reset; } From 682c6c2188f39d13548ccdc89c9888fbcb547889 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 13 Jul 2017 15:45:02 +0200 Subject: [PATCH 030/109] i2c: designware: Some broken DSTDs use 1MiHz instead of 1MHz At least the Acer Iconia Tab8 / aka W1-810 uses 1MiHz instead of 1MHz for one of its busses, fix this up to 1MHz instead of failing the probe of that bus. This fixes the accelerometer on the Acer Iconia Tab8 not working. Cc: stable@vger.kernel.org Signed-off-by: Hans de Goede Reviewed-by: Andy Shevchenko Signed-off-by: Wolfram Sang --- drivers/i2c/busses/i2c-designware-platdrv.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index d139b156f9c9..143a8fd582b4 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -298,6 +298,9 @@ static int dw_i2c_plat_probe(struct platform_device *pdev) } acpi_speed = i2c_acpi_find_bus_speed(&pdev->dev); + /* Some broken DSTDs use 1MiHz instead of 1MHz */ + if (acpi_speed == 1048576) + acpi_speed = 1000000; /* * Find bus speed from the "clock-frequency" device property, ACPI * or by using fast mode if neither is set. From 5871a1538cd2d7a4a18e5e21adc7e9e6dab702e2 Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 26 Jul 2017 11:03:49 +0100 Subject: [PATCH 031/109] i2c: allow i2c-versatile for ARM MPS platforms Allow i2c-versatile to be enabled for ARM MPS platforms. Signed-off-by: Russell King Signed-off-by: Wolfram Sang --- drivers/i2c/busses/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 1006b230b236..65fa29591d21 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -983,7 +983,7 @@ config I2C_UNIPHIER_F config I2C_VERSATILE tristate "ARM Versatile/Realview I2C bus support" - depends on ARCH_VERSATILE || ARCH_REALVIEW || ARCH_VEXPRESS || COMPILE_TEST + depends on ARCH_MPS2 || ARCH_VERSATILE || ARCH_REALVIEW || ARCH_VEXPRESS || COMPILE_TEST select I2C_ALGOBIT help Say yes if you want to support the I2C serial bus on ARMs Versatile From 9c80034921d1ece5c0e3241ba1645bce32387684 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sat, 29 Jul 2017 14:11:43 +0200 Subject: [PATCH 032/109] i2c: rephrase explanation of I2C_CLASS_DEPRECATED Hopefully making clear that it is not needed for new drivers. Signed-off-by: Wolfram Sang --- include/linux/i2c.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 00ca5b86a753..d501d3956f13 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -689,7 +689,8 @@ i2c_unlock_adapter(struct i2c_adapter *adapter) #define I2C_CLASS_HWMON (1<<0) /* lm_sensors, ... */ #define I2C_CLASS_DDC (1<<3) /* DDC bus on graphics adapters */ #define I2C_CLASS_SPD (1<<7) /* Memory modules */ -#define I2C_CLASS_DEPRECATED (1<<8) /* Warn users that adapter will stop using classes */ +/* Warn users that the adapter doesn't support classes anymore */ +#define I2C_CLASS_DEPRECATED (1<<8) /* Internal numbers to terminate lists */ #define I2C_CLIENT_END 0xfffeU From 7f81e55c737a8fa82c71f290945d729a4902f8d2 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Tue, 1 Aug 2017 11:02:46 -0700 Subject: [PATCH 033/109] xtensa: don't limit csum_partial export by CONFIG_NET csum_partial and csum_partial_copy_generic are defined unconditionally and are available even when CONFIG_NET is disabled. They are used not only by the network drivers, but also by scsi and media. Don't limit these functions export by CONFIG_NET. Cc: stable@vger.kernel.org Signed-off-by: Max Filippov --- arch/xtensa/kernel/xtensa_ksyms.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/xtensa/kernel/xtensa_ksyms.c b/arch/xtensa/kernel/xtensa_ksyms.c index d159e9b9c018..672391003e40 100644 --- a/arch/xtensa/kernel/xtensa_ksyms.c +++ b/arch/xtensa/kernel/xtensa_ksyms.c @@ -94,13 +94,11 @@ unsigned long __sync_fetch_and_or_4(unsigned long *p, unsigned long v) } EXPORT_SYMBOL(__sync_fetch_and_or_4); -#ifdef CONFIG_NET /* * Networking support */ EXPORT_SYMBOL(csum_partial); EXPORT_SYMBOL(csum_partial_copy_generic); -#endif /* CONFIG_NET */ /* * Architecture-specific symbols From bc652eb6a0d5cffaea7dc8e8ad488aab2a1bf1ed Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Tue, 1 Aug 2017 11:15:15 -0700 Subject: [PATCH 034/109] xtensa: mm/cache: add missing EXPORT_SYMBOLs Functions clear_user_highpage, copy_user_highpage, flush_dcache_page, local_flush_cache_range and local_flush_cache_page may be used from modules. Export them. Cc: stable@vger.kernel.org Signed-off-by: Max Filippov --- arch/xtensa/mm/cache.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/xtensa/mm/cache.c b/arch/xtensa/mm/cache.c index dbb1cdef3663..3c75c4e597da 100644 --- a/arch/xtensa/mm/cache.c +++ b/arch/xtensa/mm/cache.c @@ -103,6 +103,7 @@ void clear_user_highpage(struct page *page, unsigned long vaddr) clear_page_alias(kvaddr, paddr); preempt_enable(); } +EXPORT_SYMBOL(clear_user_highpage); void copy_user_highpage(struct page *dst, struct page *src, unsigned long vaddr, struct vm_area_struct *vma) @@ -119,6 +120,7 @@ void copy_user_highpage(struct page *dst, struct page *src, copy_page_alias(dst_vaddr, src_vaddr, dst_paddr, src_paddr); preempt_enable(); } +EXPORT_SYMBOL(copy_user_highpage); /* * Any time the kernel writes to a user page cache page, or it is about to @@ -172,7 +174,7 @@ void flush_dcache_page(struct page *page) /* There shouldn't be an entry in the cache for this page anymore. */ } - +EXPORT_SYMBOL(flush_dcache_page); /* * For now, flush the whole cache. FIXME?? @@ -184,6 +186,7 @@ void local_flush_cache_range(struct vm_area_struct *vma, __flush_invalidate_dcache_all(); __invalidate_icache_all(); } +EXPORT_SYMBOL(local_flush_cache_range); /* * Remove any entry in the cache for this page. @@ -203,6 +206,7 @@ void local_flush_cache_page(struct vm_area_struct *vma, unsigned long address, __flush_invalidate_dcache_page_alias(virt, phys); __invalidate_icache_page_alias(virt, phys); } +EXPORT_SYMBOL(local_flush_cache_page); #endif /* DCACHE_WAY_SIZE > PAGE_SIZE */ From d9535cb7b7603aeb549c697ecdf92024e4d0a650 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Fri, 28 Jul 2017 17:30:02 -0500 Subject: [PATCH 035/109] ptp: introduce ptp auxiliary worker Many PTP drivers required to perform some asynchronous or periodic work, like periodically handling PHC counter overflow or handle delayed timestamp for RX/TX network packets. In most of the cases, such work is implemented using workqueues. Unfortunately, Kernel workqueues might introduce significant delay in work scheduling under high system load and on -RT, which could cause misbehavior of PTP drivers due to internal counter overflow, for example, and there is no way to tune its execution policy and priority manuallly. Hence, The kthread_worker can be used insted of workqueues, as it create separte named kthread for each worker and its its execution policy and priority can be configured using chrt tool. This prblem was reported for two drivers TI CPSW CPTS and dp83640, so instead of modifying each of these driver it was proposed to add PTP auxiliary worker to the PHC subsystem. The patch adds PTP auxiliary worker in PHC subsystem using kthread_worker and kthread_delayed_work and introduces two new PHC subsystem APIs: - long (*do_aux_work)(struct ptp_clock_info *ptp) callback in ptp_clock_info structure, which driver should assign if it require to perform asynchronous or periodic work. Driver should return the delay of the PTP next auxiliary work scheduling time (>=0) or negative value in case further scheduling is not required. - int ptp_schedule_worker(struct ptp_clock *ptp, unsigned long delay) which allows schedule PTP auxiliary work. The name of kthread_worker thread corresponds PTP PHC device name "ptp%d". Signed-off-by: Grygorii Strashko Signed-off-by: David S. Miller --- drivers/ptp/ptp_clock.c | 42 ++++++++++++++++++++++++++++++++ drivers/ptp/ptp_private.h | 3 +++ include/linux/ptp_clock_kernel.h | 20 +++++++++++++++ 3 files changed, 65 insertions(+) diff --git a/drivers/ptp/ptp_clock.c b/drivers/ptp/ptp_clock.c index b77435783ef3..7eacc1c4b3b1 100644 --- a/drivers/ptp/ptp_clock.c +++ b/drivers/ptp/ptp_clock.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "ptp_private.h" @@ -184,6 +185,19 @@ static void delete_ptp_clock(struct posix_clock *pc) kfree(ptp); } +static void ptp_aux_kworker(struct kthread_work *work) +{ + struct ptp_clock *ptp = container_of(work, struct ptp_clock, + aux_work.work); + struct ptp_clock_info *info = ptp->info; + long delay; + + delay = info->do_aux_work(info); + + if (delay >= 0) + kthread_queue_delayed_work(ptp->kworker, &ptp->aux_work, delay); +} + /* public interface */ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, @@ -217,6 +231,20 @@ struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, mutex_init(&ptp->pincfg_mux); init_waitqueue_head(&ptp->tsev_wq); + if (ptp->info->do_aux_work) { + char *worker_name = kasprintf(GFP_KERNEL, "ptp%d", ptp->index); + + kthread_init_delayed_work(&ptp->aux_work, ptp_aux_kworker); + ptp->kworker = kthread_create_worker(0, worker_name ? + worker_name : info->name); + kfree(worker_name); + if (IS_ERR(ptp->kworker)) { + err = PTR_ERR(ptp->kworker); + pr_err("failed to create ptp aux_worker %d\n", err); + goto kworker_err; + } + } + err = ptp_populate_pin_groups(ptp); if (err) goto no_pin_groups; @@ -259,6 +287,9 @@ no_pps: no_device: ptp_cleanup_pin_groups(ptp); no_pin_groups: + if (ptp->kworker) + kthread_destroy_worker(ptp->kworker); +kworker_err: mutex_destroy(&ptp->tsevq_mux); mutex_destroy(&ptp->pincfg_mux); ida_simple_remove(&ptp_clocks_map, index); @@ -274,6 +305,11 @@ int ptp_clock_unregister(struct ptp_clock *ptp) ptp->defunct = 1; wake_up_interruptible(&ptp->tsev_wq); + if (ptp->kworker) { + kthread_cancel_delayed_work_sync(&ptp->aux_work); + kthread_destroy_worker(ptp->kworker); + } + /* Release the clock's resources. */ if (ptp->pps_source) pps_unregister_source(ptp->pps_source); @@ -339,6 +375,12 @@ int ptp_find_pin(struct ptp_clock *ptp, } EXPORT_SYMBOL(ptp_find_pin); +int ptp_schedule_worker(struct ptp_clock *ptp, unsigned long delay) +{ + return kthread_mod_delayed_work(ptp->kworker, &ptp->aux_work, delay); +} +EXPORT_SYMBOL(ptp_schedule_worker); + /* module operations */ static void __exit ptp_exit(void) diff --git a/drivers/ptp/ptp_private.h b/drivers/ptp/ptp_private.h index d95888974d0c..b86f1bfecd6f 100644 --- a/drivers/ptp/ptp_private.h +++ b/drivers/ptp/ptp_private.h @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -56,6 +57,8 @@ struct ptp_clock { struct attribute_group pin_attr_group; /* 1st entry is a pointer to the real group, 2nd is NULL terminator */ const struct attribute_group *pin_attr_groups[2]; + struct kthread_worker *kworker; + struct kthread_delayed_work aux_work; }; /* diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index a026bfd089db..51349d124ee5 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -99,6 +99,11 @@ struct system_device_crosststamp; * parameter func: the desired function to use. * parameter chan: the function channel index to use. * + * @do_work: Request driver to perform auxiliary (periodic) operations + * Driver should return delay of the next auxiliary work scheduling + * time (>=0) or negative value in case further scheduling + * is not required. + * * Drivers should embed their ptp_clock_info within a private * structure, obtaining a reference to it using container_of(). * @@ -126,6 +131,7 @@ struct ptp_clock_info { struct ptp_clock_request *request, int on); int (*verify)(struct ptp_clock_info *ptp, unsigned int pin, enum ptp_pin_function func, unsigned int chan); + long (*do_aux_work)(struct ptp_clock_info *ptp); }; struct ptp_clock; @@ -211,6 +217,16 @@ extern int ptp_clock_index(struct ptp_clock *ptp); int ptp_find_pin(struct ptp_clock *ptp, enum ptp_pin_function func, unsigned int chan); +/** + * ptp_schedule_worker() - schedule ptp auxiliary work + * + * @ptp: The clock obtained from ptp_clock_register(). + * @delay: number of jiffies to wait before queuing + * See kthread_queue_delayed_work() for more info. + */ + +int ptp_schedule_worker(struct ptp_clock *ptp, unsigned long delay); + #else static inline struct ptp_clock *ptp_clock_register(struct ptp_clock_info *info, struct device *parent) @@ -225,6 +241,10 @@ static inline int ptp_clock_index(struct ptp_clock *ptp) static inline int ptp_find_pin(struct ptp_clock *ptp, enum ptp_pin_function func, unsigned int chan) { return -1; } +static inline int ptp_schedule_worker(struct ptp_clock *ptp, + unsigned long delay) +{ return -EOPNOTSUPP; } + #endif #endif From 999f129289ab1599e7cdc08804a9b5ec4e1418f4 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Fri, 28 Jul 2017 17:30:03 -0500 Subject: [PATCH 036/109] net: ethernet: ti: cpts: convert to use ptp auxiliary worker There could be significant delay in CPTS work schedule under high system load and on -RT which could cause CPTS misbehavior due to internal counter overflow. Usage of own kthread_worker allows to avoid such kind of issues and makes it possible to tune priority of CPTS kthread_worker thread on -RT (thread name "cpts"). Hence, the CPTS driver is converted to use PTP auxiliary worker as PHC subsystem implements such functionality in a generic way now. Signed-off-by: Grygorii Strashko Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpts.c | 27 +++++++++++++-------------- drivers/net/ethernet/ti/cpts.h | 1 - 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index 32279d21c836..3ed438ef9e1e 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -224,6 +224,17 @@ static int cpts_ptp_enable(struct ptp_clock_info *ptp, return -EOPNOTSUPP; } +static long cpts_overflow_check(struct ptp_clock_info *ptp) +{ + struct cpts *cpts = container_of(ptp, struct cpts, info); + unsigned long delay = cpts->ov_check_period; + struct timespec64 ts; + + cpts_ptp_gettime(&cpts->info, &ts); + pr_debug("cpts overflow check at %lld.%09lu\n", ts.tv_sec, ts.tv_nsec); + return (long)delay; +} + static struct ptp_clock_info cpts_info = { .owner = THIS_MODULE, .name = "CTPS timer", @@ -236,18 +247,9 @@ static struct ptp_clock_info cpts_info = { .gettime64 = cpts_ptp_gettime, .settime64 = cpts_ptp_settime, .enable = cpts_ptp_enable, + .do_aux_work = cpts_overflow_check, }; -static void cpts_overflow_check(struct work_struct *work) -{ - struct timespec64 ts; - struct cpts *cpts = container_of(work, struct cpts, overflow_work.work); - - cpts_ptp_gettime(&cpts->info, &ts); - pr_debug("cpts overflow check at %lld.%09lu\n", ts.tv_sec, ts.tv_nsec); - schedule_delayed_work(&cpts->overflow_work, cpts->ov_check_period); -} - static int cpts_match(struct sk_buff *skb, unsigned int ptp_class, u16 ts_seqid, u8 ts_msgtype) { @@ -378,7 +380,7 @@ int cpts_register(struct cpts *cpts) } cpts->phc_index = ptp_clock_index(cpts->clock); - schedule_delayed_work(&cpts->overflow_work, cpts->ov_check_period); + ptp_schedule_worker(cpts->clock, cpts->ov_check_period); return 0; err_ptp: @@ -392,8 +394,6 @@ void cpts_unregister(struct cpts *cpts) if (WARN_ON(!cpts->clock)) return; - cancel_delayed_work_sync(&cpts->overflow_work); - ptp_clock_unregister(cpts->clock); cpts->clock = NULL; @@ -476,7 +476,6 @@ struct cpts *cpts_create(struct device *dev, void __iomem *regs, cpts->dev = dev; cpts->reg = (struct cpsw_cpts __iomem *)regs; spin_lock_init(&cpts->lock); - INIT_DELAYED_WORK(&cpts->overflow_work, cpts_overflow_check); ret = cpts_of_parse(cpts, node); if (ret) diff --git a/drivers/net/ethernet/ti/cpts.h b/drivers/net/ethernet/ti/cpts.h index 01ea82ba9cdc..586edd9c7de0 100644 --- a/drivers/net/ethernet/ti/cpts.h +++ b/drivers/net/ethernet/ti/cpts.h @@ -119,7 +119,6 @@ struct cpts { u32 cc_mult; /* for the nominal frequency */ struct cyclecounter cc; struct timecounter tc; - struct delayed_work overflow_work; int phc_index; struct clk *refclk; struct list_head events; From 0d5f54fec0a18eea5e4ac005646fd2bc2118636c Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Fri, 28 Jul 2017 17:30:04 -0500 Subject: [PATCH 037/109] net: ethernet: ti: cpts: fix tx timestamping timeout With the low speed Ethernet connection CPDMA notification about packet processing can be received before CPTS TX timestamp event, which is set when packet actually left CPSW while cpdma notification is sent when packet pushed in CPSW fifo. As result, when connection is slow and CPU is fast enough TX timestamping is not working properly. Fix it, by introducing TX SKB queue to store PTP SKBs for which Ethernet Transmit Event hasn't been received yet and then re-check this queue with new Ethernet Transmit Events by scheduling CPTS overflow work more often (every 1 jiffies) until TX SKB queue is not empty. Side effect of this change is: - User space tools require to take into account possible delay in TX timestamp processing (for example ptp4l works with tx_timestamp_timeout=400 under net traffic and tx_timestamp_timeout=25 in idle). Signed-off-by: Grygorii Strashko Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpts.c | 84 +++++++++++++++++++++++++++++++++- drivers/net/ethernet/ti/cpts.h | 1 + 2 files changed, 83 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index 3ed438ef9e1e..95a0076773b3 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -31,9 +31,18 @@ #include "cpts.h" +#define CPTS_SKB_TX_WORK_TIMEOUT 1 /* jiffies */ + +struct cpts_skb_cb_data { + unsigned long tmo; +}; + #define cpts_read32(c, r) readl_relaxed(&c->reg->r) #define cpts_write32(c, v, r) writel_relaxed(v, &c->reg->r) +static int cpts_match(struct sk_buff *skb, unsigned int ptp_class, + u16 ts_seqid, u8 ts_msgtype); + static int event_expired(struct cpts_event *event) { return time_after(jiffies, event->tmo); @@ -77,6 +86,47 @@ static int cpts_purge_events(struct cpts *cpts) return removed ? 0 : -1; } +static bool cpts_match_tx_ts(struct cpts *cpts, struct cpts_event *event) +{ + struct sk_buff *skb, *tmp; + u16 seqid; + u8 mtype; + bool found = false; + + mtype = (event->high >> MESSAGE_TYPE_SHIFT) & MESSAGE_TYPE_MASK; + seqid = (event->high >> SEQUENCE_ID_SHIFT) & SEQUENCE_ID_MASK; + + /* no need to grab txq.lock as access is always done under cpts->lock */ + skb_queue_walk_safe(&cpts->txq, skb, tmp) { + struct skb_shared_hwtstamps ssh; + unsigned int class = ptp_classify_raw(skb); + struct cpts_skb_cb_data *skb_cb = + (struct cpts_skb_cb_data *)skb->cb; + + if (cpts_match(skb, class, seqid, mtype)) { + u64 ns = timecounter_cyc2time(&cpts->tc, event->low); + + memset(&ssh, 0, sizeof(ssh)); + ssh.hwtstamp = ns_to_ktime(ns); + skb_tstamp_tx(skb, &ssh); + found = true; + __skb_unlink(skb, &cpts->txq); + dev_consume_skb_any(skb); + dev_dbg(cpts->dev, "match tx timestamp mtype %u seqid %04x\n", + mtype, seqid); + } else if (time_after(jiffies, skb_cb->tmo)) { + /* timeout any expired skbs over 1s */ + dev_dbg(cpts->dev, + "expiring tx timestamp mtype %u seqid %04x\n", + mtype, seqid); + __skb_unlink(skb, &cpts->txq); + dev_consume_skb_any(skb); + } + } + + return found; +} + /* * Returns zero if matching event type was found. */ @@ -101,9 +151,15 @@ static int cpts_fifo_read(struct cpts *cpts, int match) event->low = lo; type = event_type(event); switch (type) { + case CPTS_EV_TX: + if (cpts_match_tx_ts(cpts, event)) { + /* if the new event matches an existing skb, + * then don't queue it + */ + break; + } case CPTS_EV_PUSH: case CPTS_EV_RX: - case CPTS_EV_TX: list_del_init(&event->list); list_add_tail(&event->list, &cpts->events); break; @@ -229,8 +285,15 @@ static long cpts_overflow_check(struct ptp_clock_info *ptp) struct cpts *cpts = container_of(ptp, struct cpts, info); unsigned long delay = cpts->ov_check_period; struct timespec64 ts; + unsigned long flags; + + spin_lock_irqsave(&cpts->lock, flags); + ts = ns_to_timespec64(timecounter_read(&cpts->tc)); + + if (!skb_queue_empty(&cpts->txq)) + delay = CPTS_SKB_TX_WORK_TIMEOUT; + spin_unlock_irqrestore(&cpts->lock, flags); - cpts_ptp_gettime(&cpts->info, &ts); pr_debug("cpts overflow check at %lld.%09lu\n", ts.tv_sec, ts.tv_nsec); return (long)delay; } @@ -319,6 +382,19 @@ static u64 cpts_find_ts(struct cpts *cpts, struct sk_buff *skb, int ev_type) break; } } + + if (ev_type == CPTS_EV_TX && !ns) { + struct cpts_skb_cb_data *skb_cb = + (struct cpts_skb_cb_data *)skb->cb; + /* Not found, add frame to queue for processing later. + * The periodic FIFO check will handle this. + */ + skb_get(skb); + /* get the timestamp for timeouts */ + skb_cb->tmo = jiffies + msecs_to_jiffies(100); + __skb_queue_tail(&cpts->txq, skb); + ptp_schedule_worker(cpts->clock, 0); + } spin_unlock_irqrestore(&cpts->lock, flags); return ns; @@ -360,6 +436,7 @@ int cpts_register(struct cpts *cpts) { int err, i; + skb_queue_head_init(&cpts->txq); INIT_LIST_HEAD(&cpts->events); INIT_LIST_HEAD(&cpts->pool); for (i = 0; i < CPTS_MAX_EVENTS; i++) @@ -400,6 +477,9 @@ void cpts_unregister(struct cpts *cpts) cpts_write32(cpts, 0, int_enable); cpts_write32(cpts, 0, control); + /* Drop all packet */ + skb_queue_purge(&cpts->txq); + clk_disable(cpts->refclk); } EXPORT_SYMBOL_GPL(cpts_unregister); diff --git a/drivers/net/ethernet/ti/cpts.h b/drivers/net/ethernet/ti/cpts.h index 586edd9c7de0..73d73faf0f38 100644 --- a/drivers/net/ethernet/ti/cpts.h +++ b/drivers/net/ethernet/ti/cpts.h @@ -125,6 +125,7 @@ struct cpts { struct list_head pool; struct cpts_event pool_data[CPTS_MAX_EVENTS]; unsigned long ov_check_period; + struct sk_buff_head txq; }; void cpts_rx_timestamp(struct cpts *cpts, struct sk_buff *skb); From a93439cce2c1d31469e8245c504dbb6d1bed8749 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko Date: Fri, 28 Jul 2017 17:30:05 -0500 Subject: [PATCH 038/109] net: ethernet: ti: cpts: fix fifo read in cpts_find_ts Now the call chain cpts_find_ts() |- cpts_fifo_read(cpts, CPTS_EV_PUSH) will stop reading CPTS FIFO if PUSH event is found. But this is not expected and CPTS FIFI should be completely drained here. This is most probably copy-paste error and it has no negative impact as CPTS_EV_PUSH should not be present in FIFO without TS_PUSH request and cpts_systim_read() and cpts_find_ts() synchronized by spin_lock. Correct above by calling cpts_fifo_read() with -1 parameter, so it will read all CPTS event from FIFO. Signed-off-by: Grygorii Strashko Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpts.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index 95a0076773b3..c2121d214f08 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -364,7 +364,7 @@ static u64 cpts_find_ts(struct cpts *cpts, struct sk_buff *skb, int ev_type) return 0; spin_lock_irqsave(&cpts->lock, flags); - cpts_fifo_read(cpts, CPTS_EV_PUSH); + cpts_fifo_read(cpts, -1); list_for_each_safe(this, next, &cpts->events) { event = list_entry(this, struct cpts_event, list); if (event_expired(event)) { From 40413955ee265a5e42f710940ec78f5450d49149 Mon Sep 17 00:00:00 2001 From: "yujuan.qi" Date: Mon, 31 Jul 2017 11:23:01 +0800 Subject: [PATCH 039/109] Cipso: cipso_v4_optptr enter infinite loop in for(),if((optlen > 0) && (optptr[1] == 0)), enter infinite loop. Test: receive a packet which the ip length > 20 and the first byte of ip option is 0, produce this issue Signed-off-by: yujuan.qi Acked-by: Paul Moore Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index c4c6e1969ed0..2ae8f54cb321 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1523,9 +1523,17 @@ unsigned char *cipso_v4_optptr(const struct sk_buff *skb) int taglen; for (optlen = iph->ihl*4 - sizeof(struct iphdr); optlen > 0; ) { - if (optptr[0] == IPOPT_CIPSO) + switch (optptr[0]) { + case IPOPT_CIPSO: return optptr; - taglen = optptr[1]; + case IPOPT_END: + return NULL; + case IPOPT_NOOP: + taglen = 1; + break; + default: + taglen = optptr[1]; + } optlen -= taglen; optptr += taglen; } From be73b3043bf465455d4c9b88f68e03b6447bcfb0 Mon Sep 17 00:00:00 2001 From: "K. Den" Date: Tue, 1 Aug 2017 01:05:20 +0900 Subject: [PATCH 040/109] vxlan: fix remcsum when GRO on and CHECKSUM_PARTIAL boundary is outer UDP In the case that GRO is turned on and the original received packet is CHECKSUM_PARTIAL, if the outer UDP header is exactly at the last csum-unnecessary point, which for instance could occur if the packet comes from another Linux guest on the same Linux host, we have to do either remcsum_adjust or set up CHECKSUM_PARTIAL again with its csum_start properly reset considering RCO. However, since b7fe10e5ebac("gro: Fix remcsum offload to deal with frags in GRO") that barrier in such case could be skipped if GRO turned on, hence we pass over it and the inner L4 validation mistakenly reckons it as a bad csum. This patch makes remcsum_offload being reset at the same time of GRO remcsum cleanup, so as to make it work in such case as before. Fixes: b7fe10e5ebac ("gro: Fix remcsum offload to deal with frags in GRO") Signed-off-by: Koichiro Den Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 96aa7e6cf214..e17baac70f43 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -623,6 +623,7 @@ static struct sk_buff **vxlan_gro_receive(struct sock *sk, out: skb_gro_remcsum_cleanup(skb, &grc); + skb->remcsum_offload = 0; NAPI_GRO_CB(skb)->flush |= flush; return pp; From 1bff8a0c1f8c236209ee369b7952751c04eaa71a Mon Sep 17 00:00:00 2001 From: "K. Den" Date: Tue, 1 Aug 2017 01:05:39 +0900 Subject: [PATCH 041/109] gue: fix remcsum when GRO on and CHECKSUM_PARTIAL boundary is outer UDP In the case that GRO is turned on and the original received packet is CHECKSUM_PARTIAL, if the outer UDP header is exactly at the last csum-unnecessary point, which for instance could occur if the packet comes from another Linux guest on the same Linux host, we have to do either remcsum_adjust or set up CHECKSUM_PARTIAL again with its csum_start properly reset considering RCO. However, since b7fe10e5ebac ("gro: Fix remcsum offload to deal with frags in GRO") that barrier in such case could be skipped if GRO turned on, hence we pass over it and the inner L4 validation mistakenly reckons it as a bad csum. This patch makes remcsum_offload being reset at the same time of GRO remcsum cleanup, so as to make it work in such case as before. Fixes: b7fe10e5ebac ("gro: Fix remcsum offload to deal with frags in GRO") Signed-off-by: Koichiro Den Signed-off-by: David S. Miller --- net/ipv4/fou.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index 8e0257d01200..1540db65241a 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -450,6 +450,7 @@ out_unlock: out: NAPI_GRO_CB(skb)->flush |= flush; skb_gro_remcsum_cleanup(skb, &grc); + skb->remcsum_offload = 0; return pp; } From e43c9f23efadade684773a855675c99da278c862 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 1 Aug 2017 12:11:06 -0700 Subject: [PATCH 042/109] b44: Initialize 64-bit stats seqcount On 32-bit hosts and with CONFIG_DEBUG_LOCK_ALLOC we should be seeing a lockdep splat indicating this seqcount is not correctly initialized, fix that. Fixes: eeda8585522b ("b44: add 64 bit stats") Signed-off-by: Florian Fainelli Acked-by: Michael Chan Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/b44.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/b44.c b/drivers/net/ethernet/broadcom/b44.c index f411936b744c..a1125d10c825 100644 --- a/drivers/net/ethernet/broadcom/b44.c +++ b/drivers/net/ethernet/broadcom/b44.c @@ -2368,6 +2368,7 @@ static int b44_init_one(struct ssb_device *sdev, bp->msg_enable = netif_msg_init(b44_debug, B44_DEF_MSG_ENABLE); spin_lock_init(&bp->lock); + u64_stats_init(&bp->hw_stats.syncp); bp->rx_pending = B44_DEF_RX_RING_PENDING; bp->tx_pending = B44_DEF_TX_RING_PENDING; From 7d6d067790289e4f61f59fa60550ca5918aa25bd Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 1 Aug 2017 12:11:07 -0700 Subject: [PATCH 043/109] i40e: Initialize 64-bit statistics TX ring seqcount On 32-bit hosts and with CONFIG_DEBUG_LOCK_ALLOC we should be seeing a lockdep splat indicating this seqcount is not correctly initialized, fix that. Fixes: 980e9b118642 ("i40e: Add support for 64 bit netstats") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index b936febc315a..2194960d5855 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1113,6 +1113,8 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring) if (!tx_ring->tx_bi) goto err; + u64_stats_init(&tx_ring->syncp); + /* round up to nearest 4K */ tx_ring->size = tx_ring->count * sizeof(struct i40e_tx_desc); /* add u32 for head writeback, align after this takes care of From 7c3a4626eb65e78ebe208f48ffa21a5002f7f38e Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 1 Aug 2017 12:11:08 -0700 Subject: [PATCH 044/109] ixgbe: Initialize 64-bit stats seqcounts On 32-bit hosts and with CONFIG_DEBUG_LOCK_ALLOC we should be seeing a lockdep splat indicating this seqcount is not correctly initialized, fix that. Fixes: 4197aa7bb818 ("ixgbevf: provide 64 bit statistics") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 084c53582793..032f8ac06357 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -2988,6 +2988,8 @@ int ixgbevf_setup_tx_resources(struct ixgbevf_ring *tx_ring) if (!tx_ring->tx_buffer_info) goto err; + u64_stats_init(&tx_ring->syncp); + /* round up to nearest 4K */ tx_ring->size = tx_ring->count * sizeof(union ixgbe_adv_tx_desc); tx_ring->size = ALIGN(tx_ring->size, 4096); @@ -3046,6 +3048,8 @@ int ixgbevf_setup_rx_resources(struct ixgbevf_ring *rx_ring) if (!rx_ring->rx_buffer_info) goto err; + u64_stats_init(&rx_ring->syncp); + /* Round up to nearest 4K */ rx_ring->size = rx_ring->count * sizeof(union ixgbe_adv_rx_desc); rx_ring->size = ALIGN(rx_ring->size, 4096); From 7ceb781a1aa0356086e2bdc76bcd953fcb7ac377 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 1 Aug 2017 12:11:09 -0700 Subject: [PATCH 045/109] nfp: Initialize RX and TX ring 64-bit stats seqcounts On 32-bit hosts and with CONFIG_DEBUG_LOCK_ALLOC we should be seeing a lockdep splat indicating this seqcount is not correctly initialized, fix that. Fixes: 4c3523623dc0 ("net: add driver for Netronome NFP4000/NFP6000 NIC VFs") Signed-off-by: Florian Fainelli Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/netronome/nfp/nfp_net_common.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 18750ff0ede6..4631ca8b8eb2 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -513,6 +513,7 @@ nfp_net_tx_ring_init(struct nfp_net_tx_ring *tx_ring, tx_ring->idx = idx; tx_ring->r_vec = r_vec; tx_ring->is_xdp = is_xdp; + u64_stats_init(&tx_ring->r_vec->tx_sync); tx_ring->qcidx = tx_ring->idx * nn->stride_tx; tx_ring->qcp_q = nn->tx_bar + NFP_QCP_QUEUE_OFF(tx_ring->qcidx); @@ -532,6 +533,7 @@ nfp_net_rx_ring_init(struct nfp_net_rx_ring *rx_ring, rx_ring->idx = idx; rx_ring->r_vec = r_vec; + u64_stats_init(&rx_ring->r_vec->rx_sync); rx_ring->fl_qcidx = rx_ring->idx * nn->stride_rx; rx_ring->qcp_fl = nn->rx_bar + NFP_QCP_QUEUE_OFF(rx_ring->fl_qcidx); From 790cb2ebb3f9c5d26a320117d5d13cafe479484d Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 1 Aug 2017 12:11:10 -0700 Subject: [PATCH 046/109] gtp: Initialize 64-bit per-cpu stats correctly On 32-bit hosts and with CONFIG_DEBUG_LOCK_ALLOC we should be seeing a lockdep splat indicating this seqcount is not correctly initialized, fix that by using netdev_alloc_pcpu_stats() instead of an open coded allocation. Fixes: 459aa660eb1d ("gtp: add initial driver for datapath of GPRS Tunneling Protocol (GTP-U)") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/gtp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 1542e837fdfa..f38e32a7ec9c 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -364,7 +364,7 @@ static int gtp_dev_init(struct net_device *dev) gtp->dev = dev; - dev->tstats = alloc_percpu(struct pcpu_sw_netstats); + dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); if (!dev->tstats) return -ENOMEM; From 4a0dee1ffe0e8f4101e704a325e97f8997b0abcc Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 1 Aug 2017 12:11:12 -0700 Subject: [PATCH 047/109] netvsc: Initialize 64-bit stats seqcount On 32-bit hosts and with CONFIG_DEBUG_LOCK_ALLOC we should be seeing a lockdep splat indicating this seqcount is not correctly initialized, fix that. In commit 6c80f3fc2398 ("netvsc: report per-channel stats in ethtool statistics") netdev_alloc_pcpu_stats() was removed in favor of open-coding the 64-bits statistics, except that u64_stats_init() was missed. Fixes: 6c80f3fc2398 ("netvsc: report per-channel stats in ethtool statistics") Signed-off-by: Florian Fainelli Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/hyperv/netvsc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 0a9167dd72fb..96f90c75d1b7 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1302,6 +1302,8 @@ int netvsc_device_add(struct hv_device *device, struct netvsc_channel *nvchan = &net_device->chan_table[i]; nvchan->channel = device->channel; + u64_stats_init(&nvchan->tx_stats.syncp); + u64_stats_init(&nvchan->rx_stats.syncp); } /* Enable NAPI handler before init callbacks */ From 87173cd6cf242f2340db187d82bd47a48fe5e5fe Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Tue, 1 Aug 2017 12:11:13 -0700 Subject: [PATCH 048/109] ipvlan: Fix 64-bit statistics seqcount initialization On 32-bit hosts and with CONFIG_DEBUG_LOCK_ALLOC we should be seeing a lockdep splat indicating this seqcount is not correctly initialized, fix that by using the proper helper function: netdev_alloc_pcpu_stats(). Fixes: 2ad7bf363841 ("ipvlan: Initial check-in of the IPVLAN driver.") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ipvlan/ipvlan_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index f37e3c1fd4e7..8dab74a81303 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -192,7 +192,7 @@ static int ipvlan_init(struct net_device *dev) netdev_lockdep_set_classes(dev); - ipvlan->pcpu_stats = alloc_percpu(struct ipvl_pcpu_stats); + ipvlan->pcpu_stats = netdev_alloc_pcpu_stats(struct ipvl_pcpu_stats); if (!ipvlan->pcpu_stats) return -ENOMEM; From f7f8c1756e9a5f1258a7cc6b663f8451b724900f Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Wed, 5 Jul 2017 08:51:09 +0200 Subject: [PATCH 049/109] nand: fix wrong default oob layout for small pages using soft ecc When using soft ecc, if no ooblayout is given, the core automatically uses one of the nand_ooblayout_{sp,lp}*() functions to determine the layout inside the out of band data. Until kernel version 4.6, struct nand_ecclayout was used for that purpose. During the migration from 4.6 to 4.7, an error shown up in the small page layout, in the case oob section is only 8 bytes long. The layout was using three bytes (0, 1, 2) for ecc, two bytes (3, 4) as free bytes, one byte (5) for bad block marker and finally two bytes (6, 7) as free bytes, as shown there: [linux-4.6] drivers/mtd/nand/nand_base.c:52 static struct nand_ecclayout nand_oob_8 = { .eccbytes = 3, .eccpos = {0, 1, 2}, .oobfree = { {.offset = 3, .length = 2}, {.offset = 6, .length = 2} } }; This fixes the current implementation which is incoherent. It references bit 3 at the same time as an ecc byte and a free byte. Furthermore, it is clear with the previous implementation that there is only one ecc section with 8 bytes oob sections. We shall return -ERANGE in the nand_ooblayout_ecc_sp() function when asked for the second section. Signed-off-by: Miquel Raynal Fixes: 41b207a70d3a ("mtd: nand: implement the default mtd_ooblayout_ops") Cc: Signed-off-by: Boris Brezillon --- drivers/mtd/nand/nand_base.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 5fa5ddc94834..7b3826b42447 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -65,8 +65,14 @@ static int nand_ooblayout_ecc_sp(struct mtd_info *mtd, int section, if (!section) { oobregion->offset = 0; - oobregion->length = 4; + if (mtd->oobsize == 16) + oobregion->length = 4; + else + oobregion->length = 3; } else { + if (mtd->oobsize == 8) + return -ERANGE; + oobregion->offset = 6; oobregion->length = ecc->total - 4; } From 791eccd94965a8029ae09c5530bcb9a76794e408 Mon Sep 17 00:00:00 2001 From: Bryan O'Donoghue Date: Fri, 28 Jul 2017 14:22:57 +0100 Subject: [PATCH 050/109] mtd: nand: sunxi: fix potential divide-by-zero error clk_round_rate() can return <= 0. Currently the value returned by clk_round_rate() is used directly for a division. This patch introduces a guard to ensure a divide-by-zero or a divide by a negative number for that matter can't happen by bugging out returning -EINVAL if clk_round_rate() returns <= 0. Fixes: 2d43457f79e4 ("mtd: nand: sunxi: fix EDO mode selection") Signed-off-by: Bryan O'Donoghue Signed-off-by: Boris Brezillon --- drivers/mtd/nand/sunxi_nand.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/mtd/nand/sunxi_nand.c b/drivers/mtd/nand/sunxi_nand.c index d0b6f8f9f297..6abd142b1324 100644 --- a/drivers/mtd/nand/sunxi_nand.c +++ b/drivers/mtd/nand/sunxi_nand.c @@ -1728,6 +1728,10 @@ static int sunxi_nfc_setup_data_interface(struct mtd_info *mtd, int csline, */ chip->clk_rate = NSEC_PER_SEC / min_clk_period; real_clk_rate = clk_round_rate(nfc->mod_clk, chip->clk_rate); + if (real_clk_rate <= 0) { + dev_err(nfc->dev, "Unable to round clk %lu\n", chip->clk_rate); + return -EINVAL; + } /* * ONFI specification 3.1, paragraph 4.15.2 dictates that EDO data From cb25fae18207330caac848d850e11f9cdb500dbf Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Sun, 30 Jul 2017 16:18:03 -0600 Subject: [PATCH 051/109] mtd: nand: Fix a docs build warning Commit 0b4773fd1649 (mtd: nand: Drop unused cached programming support) removed the "cached" parameter from nand_write_page(), but did not update the kerneldoc comments, creating this docs build warning: ./drivers/mtd/nand/nand_base.c:2751: warning: Excess function parameter 'cached' description in 'nand_write_page' Remove the offending line so we can have a little peace and quiet. Signed-off-by: Jonathan Corbet Signed-off-by: Boris Brezillon --- drivers/mtd/nand/nand_base.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 7b3826b42447..5b7c8d05360f 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -2747,7 +2747,6 @@ static int nand_write_page_syndrome(struct mtd_info *mtd, * @buf: the data to write * @oob_required: must write chip->oob_poi to OOB * @page: page number to write - * @cached: cached programming * @raw: use _raw version of write_page */ static int nand_write_page(struct mtd_info *mtd, struct nand_chip *chip, From a11bf5ed951f8900d244d09eb03a888b59c7fc82 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Mon, 31 Jul 2017 10:29:56 +0200 Subject: [PATCH 052/109] mtd: nand: Fix timing setup for NANDs that do not support SET FEATURES Some ONFI NANDs do not support the SET/GET FEATURES commands, which, according to the spec, is perfectly valid. On these NANDs we can't set a specific timing mode using the "timing mode" feature, and we should assume the NAND does not require any setup to enter a specific timing mode. Signed-off-by: Boris Brezillon Fixes: d8e725dd8311 ("mtd: nand: automate NAND timings selection") Reported-by: Alexander Dahl Cc: Tested-by: Alexander Dahl Signed-off-by: Boris Brezillon --- drivers/mtd/nand/nand_base.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 5b7c8d05360f..c6c18b82f8f4 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -1131,7 +1131,9 @@ static int nand_setup_data_interface(struct nand_chip *chip, int chipnr) * Ensure the timing mode has been changed on the chip side * before changing timings on the controller side. */ - if (chip->onfi_version) { + if (chip->onfi_version && + (le16_to_cpu(chip->onfi_params.opt_cmd) & + ONFI_OPT_CMD_SET_GET_FEATURES)) { u8 tmode_param[ONFI_SUBFEATURE_PARAM_LEN] = { chip->onfi_timing_mode_default, }; From 6d29231000bbe0fb9e4893a9c68151ffdd3b5469 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Mon, 31 Jul 2017 10:31:27 +0200 Subject: [PATCH 053/109] mtd: nand: Declare tBERS, tR and tPROG as u64 to avoid integer overflow All timings in nand_sdr_timings are expressed in picoseconds but some of them may not fit in an u32. Signed-off-by: Boris Brezillon Fixes: 204e7ecd47e2 ("mtd: nand: Add a few more timings to nand_sdr_timings") Reported-by: Alexander Dahl Cc: Reviewed-by: Alexander Dahl Tested-by: Alexander Dahl Signed-off-by: Boris Brezillon --- drivers/mtd/nand/nand_timings.c | 6 +++--- include/linux/mtd/nand.h | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/mtd/nand/nand_timings.c b/drivers/mtd/nand/nand_timings.c index f06312df3669..7e36d7d13c26 100644 --- a/drivers/mtd/nand/nand_timings.c +++ b/drivers/mtd/nand/nand_timings.c @@ -311,9 +311,9 @@ int onfi_init_data_interface(struct nand_chip *chip, struct nand_sdr_timings *timings = &iface->timings.sdr; /* microseconds -> picoseconds */ - timings->tPROG_max = 1000000UL * le16_to_cpu(params->t_prog); - timings->tBERS_max = 1000000UL * le16_to_cpu(params->t_bers); - timings->tR_max = 1000000UL * le16_to_cpu(params->t_r); + timings->tPROG_max = 1000000ULL * le16_to_cpu(params->t_prog); + timings->tBERS_max = 1000000ULL * le16_to_cpu(params->t_bers); + timings->tR_max = 1000000ULL * le16_to_cpu(params->t_r); /* nanoseconds -> picoseconds */ timings->tCCS_min = 1000UL * le16_to_cpu(params->t_ccs); diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 892148c448cc..5216d2eb2289 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -681,10 +681,10 @@ struct nand_buffers { * @tWW_min: WP# transition to WE# low */ struct nand_sdr_timings { - u32 tBERS_max; + u64 tBERS_max; u32 tCCS_min; - u32 tPROG_max; - u32 tR_max; + u64 tPROG_max; + u64 tR_max; u32 tALH_min; u32 tADL_min; u32 tALS_min; From ee02f73e04c0e690600f621a3a1d2245834af7fe Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Mon, 31 Jul 2017 10:32:21 +0200 Subject: [PATCH 054/109] mtd: nand: atmel: Fix EDO mode check EDO mode should be used when tRC is less than 30ns, but timings are expressed in picoseconds in the nand_sdr_timings struct. Signed-off-by: Boris Brezillon Fixes: f9ce2eddf176 ("mtd: nand: atmel: Add ->setup_data_interface() hooks") Reported-by: Alexander Dahl Tested-by: Alexander Dahl Signed-off-by: Boris Brezillon --- drivers/mtd/nand/atmel/nand-controller.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/atmel/nand-controller.c b/drivers/mtd/nand/atmel/nand-controller.c index d922a88e407f..2c8baa0c2c4e 100644 --- a/drivers/mtd/nand/atmel/nand-controller.c +++ b/drivers/mtd/nand/atmel/nand-controller.c @@ -1201,7 +1201,7 @@ static int atmel_smc_nand_prepare_smcconf(struct atmel_nand *nand, * tRC < 30ns implies EDO mode. This controller does not support this * mode. */ - if (conf->timings.sdr.tRC_min < 30) + if (conf->timings.sdr.tRC_min < 30000) return -ENOTSUPP; atmel_smc_cs_conf_init(smcconf); From 1ad43c0078b79a76accd0fe64062e47b3430dc6b Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 2 Aug 2017 08:01:45 +0800 Subject: [PATCH 055/109] blk-mq: don't leak preempt counter/q_usage_counter when allocating rq failed When blk_mq_get_request() failed, preempt counter isn't released, and blk_mq_make_request() doesn't release the counter too. This patch fixes the issue, and makes sure that preempt counter is only held if rq is allocated successfully. The same policy is applied on .q_usage_counter too. Signed-off-by: Ming Lei Signed-off-by: Jens Axboe --- block/blk-mq.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 041f7b7fa0d6..211ef367345f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -301,11 +301,12 @@ static struct request *blk_mq_get_request(struct request_queue *q, struct elevator_queue *e = q->elevator; struct request *rq; unsigned int tag; + struct blk_mq_ctx *local_ctx = NULL; blk_queue_enter_live(q); data->q = q; if (likely(!data->ctx)) - data->ctx = blk_mq_get_ctx(q); + data->ctx = local_ctx = blk_mq_get_ctx(q); if (likely(!data->hctx)) data->hctx = blk_mq_map_queue(q, data->ctx->cpu); if (op & REQ_NOWAIT) @@ -324,6 +325,10 @@ static struct request *blk_mq_get_request(struct request_queue *q, tag = blk_mq_get_tag(data); if (tag == BLK_MQ_TAG_FAIL) { + if (local_ctx) { + blk_mq_put_ctx(local_ctx); + data->ctx = NULL; + } blk_queue_exit(q); return NULL; } @@ -356,12 +361,12 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, rq = blk_mq_get_request(q, NULL, op, &alloc_data); - blk_mq_put_ctx(alloc_data.ctx); - blk_queue_exit(q); - if (!rq) return ERR_PTR(-EWOULDBLOCK); + blk_mq_put_ctx(alloc_data.ctx); + blk_queue_exit(q); + rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; @@ -407,11 +412,11 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, rq = blk_mq_get_request(q, NULL, op, &alloc_data); - blk_queue_exit(q); - if (!rq) return ERR_PTR(-EWOULDBLOCK); + blk_queue_exit(q); + return rq; } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); From fb52c3b597f0b44e893ded8c253845561f9f57da Mon Sep 17 00:00:00 2001 From: Nisar Sayed Date: Tue, 1 Aug 2017 10:24:17 +0000 Subject: [PATCH 056/109] lan78xx: USB fast connect/disconnect crash fix USB fast connect/disconnect crash fix When USB plugged/unplugged at fast rate, lan78xx_mdio_init() in lan78xx_bind() failing case is not handled. Whenever lan78xx_mdio_init() failed, dev->mdiobus will be freed, however since lan78xx_bind() not consider as error and try to proceed for further initialization in lan78xx_probe() which leads system hung/crash. Also when register_netdev() failed, netdev is freed without calling lan78xx_unbind(). Hence halting the failed cases right manner fixes the system crash/hung issue. Signed-off-by: Nisar Sayed Signed-off-by: David S. Miller --- drivers/net/usb/lan78xx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 5833f7e2a127..8ef3639649a0 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2858,13 +2858,13 @@ static int lan78xx_bind(struct lan78xx_net *dev, struct usb_interface *intf) /* Init all registers */ ret = lan78xx_reset(dev); - lan78xx_mdio_init(dev); + ret = lan78xx_mdio_init(dev); dev->net->flags |= IFF_MULTICAST; pdata->wol = WAKE_MAGIC; - return 0; + return ret; } static void lan78xx_unbind(struct lan78xx_net *dev, struct usb_interface *intf) @@ -3525,11 +3525,11 @@ static int lan78xx_probe(struct usb_interface *intf, udev = interface_to_usbdev(intf); udev = usb_get_dev(udev); - ret = -ENOMEM; netdev = alloc_etherdev(sizeof(struct lan78xx_net)); if (!netdev) { - dev_err(&intf->dev, "Error: OOM\n"); - goto out1; + dev_err(&intf->dev, "Error: OOM\n"); + ret = -ENOMEM; + goto out1; } /* netdev_printk() needs this */ @@ -3610,7 +3610,7 @@ static int lan78xx_probe(struct usb_interface *intf, ret = register_netdev(netdev); if (ret != 0) { netif_err(dev, probe, netdev, "couldn't register the device\n"); - goto out2; + goto out3; } usb_set_intfdata(intf, dev); From 0573f94b1abfb42952275ca67f665dbd720b00d7 Mon Sep 17 00:00:00 2001 From: Nisar Sayed Date: Tue, 1 Aug 2017 10:24:33 +0000 Subject: [PATCH 057/109] lan78xx: Fix to handle hard_header_len update Fix to handle hard_header_len update When ifconfig up/down sequence is initiated hard_header_len get updated incrementally for each ifconfig up /down sequence, this leads invalid hard_header_len, moving to lan78xx_bind to have one time update of hard_header_len addresses the issue. Signed-off-by: Nisar Sayed Signed-off-by: David S. Miller --- drivers/net/usb/lan78xx.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c index 8ef3639649a0..b99a7fb09f8e 100644 --- a/drivers/net/usb/lan78xx.c +++ b/drivers/net/usb/lan78xx.c @@ -2367,9 +2367,6 @@ static int lan78xx_reset(struct lan78xx_net *dev) /* Init LTM */ lan78xx_init_ltm(dev); - dev->net->hard_header_len += TX_OVERHEAD; - dev->hard_mtu = dev->net->mtu + dev->net->hard_header_len; - if (dev->udev->speed == USB_SPEED_SUPER) { buf = DEFAULT_BURST_CAP_SIZE / SS_USB_PKT_SIZE; dev->rx_urb_size = DEFAULT_BURST_CAP_SIZE; @@ -2855,6 +2852,9 @@ static int lan78xx_bind(struct lan78xx_net *dev, struct usb_interface *intf) return ret; } + dev->net->hard_header_len += TX_OVERHEAD; + dev->hard_mtu = dev->net->mtu + dev->net->hard_header_len; + /* Init all registers */ ret = lan78xx_reset(dev); From c994f778bb1cca8ebe7a4e528cefec233e93b5cc Mon Sep 17 00:00:00 2001 From: Inbar Karmy Date: Tue, 1 Aug 2017 16:43:43 +0300 Subject: [PATCH 058/109] net/mlx4_en: Fix wrong indication of Wake-on-LAN (WoL) support Currently when WoL is supported but disabled, ethtool reports: "Supports Wake-on: d". Fix the indication of Wol support, so that the indication remains "g" all the time if the NIC supports WoL. Tested: As accepted, when NIC supports WoL- ethtool reports: Supports Wake-on: g Wake-on: d when NIC doesn't support WoL- ethtool reports: Supports Wake-on: d Wake-on: d Fixes: 14c07b1358ed ("mlx4: Wake on LAN support") Signed-off-by: Inbar Karmy Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 15 ++++++++------- drivers/net/ethernet/mellanox/mlx4/fw.c | 4 ++++ drivers/net/ethernet/mellanox/mlx4/fw.h | 1 + drivers/net/ethernet/mellanox/mlx4/main.c | 2 ++ include/linux/mlx4/device.h | 1 + 5 files changed, 16 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index c751a1d434ad..3d4e4a5d00d1 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -223,6 +223,7 @@ static void mlx4_en_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) { struct mlx4_en_priv *priv = netdev_priv(netdev); + struct mlx4_caps *caps = &priv->mdev->dev->caps; int err = 0; u64 config = 0; u64 mask; @@ -235,24 +236,24 @@ static void mlx4_en_get_wol(struct net_device *netdev, mask = (priv->port == 1) ? MLX4_DEV_CAP_FLAG_WOL_PORT1 : MLX4_DEV_CAP_FLAG_WOL_PORT2; - if (!(priv->mdev->dev->caps.flags & mask)) { + if (!(caps->flags & mask)) { wol->supported = 0; wol->wolopts = 0; return; } + if (caps->wol_port[priv->port]) + wol->supported = WAKE_MAGIC; + else + wol->supported = 0; + err = mlx4_wol_read(priv->mdev->dev, &config, priv->port); if (err) { en_err(priv, "Failed to get WoL information\n"); return; } - if (config & MLX4_EN_WOL_MAGIC) - wol->supported = WAKE_MAGIC; - else - wol->supported = 0; - - if (config & MLX4_EN_WOL_ENABLED) + if ((config & MLX4_EN_WOL_ENABLED) && (config & MLX4_EN_WOL_MAGIC)) wol->wolopts = WAKE_MAGIC; else wol->wolopts = 0; diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 37e84a59e751..c165f16623a9 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -764,6 +764,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_CQ_TS_SUPPORT_OFFSET 0x3e #define QUERY_DEV_CAP_MAX_PKEY_OFFSET 0x3f #define QUERY_DEV_CAP_EXT_FLAGS_OFFSET 0x40 +#define QUERY_DEV_CAP_WOL_OFFSET 0x43 #define QUERY_DEV_CAP_FLAGS_OFFSET 0x44 #define QUERY_DEV_CAP_RSVD_UAR_OFFSET 0x48 #define QUERY_DEV_CAP_UAR_SZ_OFFSET 0x49 @@ -920,6 +921,9 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_GET(ext_flags, outbox, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); MLX4_GET(flags, outbox, QUERY_DEV_CAP_FLAGS_OFFSET); dev_cap->flags = flags | (u64)ext_flags << 32; + MLX4_GET(field, outbox, QUERY_DEV_CAP_WOL_OFFSET); + dev_cap->wol_port[1] = !!(field & 0x20); + dev_cap->wol_port[2] = !!(field & 0x40); MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_UAR_OFFSET); dev_cap->reserved_uars = field >> 4; MLX4_GET(field, outbox, QUERY_DEV_CAP_UAR_SZ_OFFSET); diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.h b/drivers/net/ethernet/mellanox/mlx4/fw.h index 5343a0599253..b52ba01aa486 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.h +++ b/drivers/net/ethernet/mellanox/mlx4/fw.h @@ -129,6 +129,7 @@ struct mlx4_dev_cap { u32 dmfs_high_rate_qpn_range; struct mlx4_rate_limit_caps rl_caps; struct mlx4_port_cap port_cap[MLX4_MAX_PORTS + 1]; + bool wol_port[MLX4_MAX_PORTS + 1]; }; struct mlx4_func_cap { diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index a27c9c13a36e..09b9bc17bce9 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -424,6 +424,8 @@ static int mlx4_dev_cap(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev->caps.stat_rate_support = dev_cap->stat_rate_support; dev->caps.max_gso_sz = dev_cap->max_gso_sz; dev->caps.max_rss_tbl_sz = dev_cap->max_rss_tbl_sz; + dev->caps.wol_port[1] = dev_cap->wol_port[1]; + dev->caps.wol_port[2] = dev_cap->wol_port[2]; /* Save uar page shift */ if (!mlx4_is_slave(dev)) { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index aad5d81dfb44..b54517c05e9a 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -620,6 +620,7 @@ struct mlx4_caps { u32 dmfs_high_rate_qpn_base; u32 dmfs_high_rate_qpn_range; u32 vf_caps; + bool wol_port[MLX4_MAX_PORTS + 1]; struct mlx4_rate_limit_caps rl_caps; }; From 5886259c127026ee4a6a7093d6c320440d833fda Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 1 Aug 2017 16:43:44 +0300 Subject: [PATCH 059/109] net/mlx4_core: Fix sl_to_vl_change bit offset in flags2 dump The index value in function dump_dev_cap_flags2() for outputting "sl to vl mapping table change event support" needs to be consistent with the value of the enumerated constant MLX4_DEV_CAP_FLAG2_SL_TO_VL_CHANGE_EVENT defined in file include/linux/mlx4_device.h The change here restores that consistency. Fixes: fd10ed8e6f42 ("IB/mlx4: Fix possible vl/sl field mismatch in LRH header in QP1 packets") Reported-by: Mukesh Kacker Signed-off-by: Jack Morgenstein Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/fw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index c165f16623a9..009dd03466d6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -160,7 +160,7 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [33] = "RoCEv2 support", [34] = "DMFS Sniffer support (UC & MC)", [35] = "QinQ VST mode support", - [36] = "sl to vl mapping table change event support" + [37] = "sl to vl mapping table change event support", }; int i; From f9fb9d0b8519a795eeaca5108205593b66068cce Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 1 Aug 2017 16:43:45 +0300 Subject: [PATCH 060/109] net/mlx4_core: Fix namespace misalignment in QinQ VST support commit The cited commit introduced the following new enum value in file include/linux/mlx4/device.h: MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP However the value of MLX4_DEV_CAP_FLAG2_SVLAN_BY_QP needs to stay consistent with the value used in another namespace in function dump_dev_cap_flags2(), which is manually kept in sync. The change here restores that consistency. Fixes: 7c3d21c8153c ("net/mlx4_core: Preparation for VF vlan protocol 802.1ad") Reported-by: Mukesh Kacker Signed-off-by: Jack Morgenstein Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/fw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 009dd03466d6..7c9502bae1cc 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -159,7 +159,7 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [32] = "Loopback source checks support", [33] = "RoCEv2 support", [34] = "DMFS Sniffer support (UC & MC)", - [35] = "QinQ VST mode support", + [36] = "QinQ VST mode support", [37] = "sl to vl mapping table change event support", }; int i; From bff0c6840c135c14675404ba697aae2638c37cb6 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Tue, 1 Aug 2017 16:43:46 +0300 Subject: [PATCH 061/109] net/mlx4_core: Fixes missing capability bit in flags2 capability dump The cited commit introduced the following new enum value in file include/linux/mlx4/device.h: QUERY_DEV_CAP_DIAG_RPRT_PER_PORT However, it failed to introduce a corresponding entry in function dump_dev_cap_flags2() for outputting a line in the message log when this capability bit is set. The change here fixes that omission. Fixes: c7c122ed67e4 ("net/mlx4: Add diagnostic counters capability bit") Reported-by: Mukesh Kacker Signed-off-by: Jack Morgenstein Signed-off-by: Tariq Toukan Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/fw.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 7c9502bae1cc..041c0ed65929 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -159,6 +159,7 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [32] = "Loopback source checks support", [33] = "RoCEv2 support", [34] = "DMFS Sniffer support (UC & MC)", + [35] = "Diag counters per port", [36] = "QinQ VST mode support", [37] = "sl to vl mapping table change event support", }; From bed9ff165960921303a100228585f2d1691b42eb Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Wed, 2 Aug 2017 00:45:44 +0900 Subject: [PATCH 062/109] usb: qmi_wwan: add D-Link DWM-222 device ID Signed-off-by: Hector Martin Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 5894e3c9468f..ff6f39fe6c00 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1175,6 +1175,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x19d2, 0x1428, 2)}, /* Telewell TW-LTE 4G v2 */ {QMI_FIXED_INTF(0x19d2, 0x2002, 4)}, /* ZTE (Vodafone) K3765-Z */ {QMI_FIXED_INTF(0x2001, 0x7e19, 4)}, /* D-Link DWM-221 B1 */ + {QMI_FIXED_INTF(0x2001, 0x7e35, 4)}, /* D-Link DWM-222 */ {QMI_FIXED_INTF(0x0f3d, 0x68a2, 8)}, /* Sierra Wireless MC7700 */ {QMI_FIXED_INTF(0x114f, 0x68a2, 8)}, /* Sierra Wireless MC7750 */ {QMI_FIXED_INTF(0x1199, 0x68a2, 8)}, /* Sierra Wireless MC7710 in QMI mode */ From 4d96f12a072c669d48dc3a2c6b539a9faeca138d Mon Sep 17 00:00:00 2001 From: Thomas Falcon Date: Tue, 1 Aug 2017 15:04:36 -0500 Subject: [PATCH 063/109] ibmvnic: Initialize SCRQ's during login renegotiation SCRQ resources are freed during renegotiation, but they are not re-allocated afterwards due to some changes in the initialization process. Fix that by re-allocating the memory after renegotation. SCRQ's can also be freed if a server capabilities request fails. If this were encountered during a device reset for example, SCRQ's may not be re-allocated. This operation is not necessary anymore so remove it. Signed-off-by: Thomas Falcon Signed-off-by: David S. Miller --- drivers/net/ethernet/ibm/ibmvnic.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index a3e694679635..c45e8e3b82d3 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -111,6 +111,7 @@ static void send_request_map(struct ibmvnic_adapter *, dma_addr_t, __be32, u8); static void send_request_unmap(struct ibmvnic_adapter *, u8); static void send_login(struct ibmvnic_adapter *adapter); static void send_cap_queries(struct ibmvnic_adapter *adapter); +static int init_sub_crqs(struct ibmvnic_adapter *); static int init_sub_crq_irqs(struct ibmvnic_adapter *adapter); static int ibmvnic_init(struct ibmvnic_adapter *); static void release_crq_queue(struct ibmvnic_adapter *); @@ -651,6 +652,7 @@ static int ibmvnic_login(struct net_device *netdev) struct ibmvnic_adapter *adapter = netdev_priv(netdev); unsigned long timeout = msecs_to_jiffies(30000); struct device *dev = &adapter->vdev->dev; + int rc; do { if (adapter->renegotiate) { @@ -664,6 +666,18 @@ static int ibmvnic_login(struct net_device *netdev) dev_err(dev, "Capabilities query timeout\n"); return -1; } + rc = init_sub_crqs(adapter); + if (rc) { + dev_err(dev, + "Initialization of SCRQ's failed\n"); + return -1; + } + rc = init_sub_crq_irqs(adapter); + if (rc) { + dev_err(dev, + "Initialization of SCRQ's irqs failed\n"); + return -1; + } } reinit_completion(&adapter->init_done); @@ -3004,7 +3018,6 @@ static void handle_request_cap_rsp(union ibmvnic_crq *crq, *req_value, (long int)be64_to_cpu(crq->request_capability_rsp. number), name); - release_sub_crqs(adapter); *req_value = be64_to_cpu(crq->request_capability_rsp.number); ibmvnic_send_req_caps(adapter, 1); return; From ed254971edea92c3ac5c67c6a05247a92aa6075e Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 1 Aug 2017 13:22:32 -0700 Subject: [PATCH 064/109] tcp: avoid setting cwnd to invalid ssthresh after cwnd reduction states If the sender switches the congestion control during ECN-triggered cwnd-reduction state (CA_CWR), upon exiting recovery cwnd is set to the ssthresh value calculated by the previous congestion control. If the previous congestion control is BBR that always keep ssthresh to TCP_INIFINITE_SSTHRESH, cwnd ends up being infinite. The safe step is to avoid assigning invalid ssthresh value when recovery ends. Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2920e0cb09f8..dad026fcfd09 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2520,8 +2520,8 @@ static inline void tcp_end_cwnd_reduction(struct sock *sk) return; /* Reset cwnd to ssthresh in CWR or Recovery (unless it's undone) */ - if (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || - (tp->undo_marker && tp->snd_ssthresh < TCP_INFINITE_SSTHRESH)) { + if (tp->snd_ssthresh < TCP_INFINITE_SSTHRESH && + (inet_csk(sk)->icsk_ca_state == TCP_CA_CWR || tp->undo_marker)) { tp->snd_cwnd = tp->snd_ssthresh; tp->snd_cwnd_stamp = tcp_jiffies32; } From 42ef3bedf30020a3b5298c2b918ccd7b6e1794d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antoine=20T=C3=A9nart?= Date: Wed, 19 Jul 2017 11:02:30 +0200 Subject: [PATCH 065/109] crypto: inside-secure - fix invalidation check in hmac_sha1_setkey The safexcel_hmac_sha1_setkey function checks if an invalidation command should be issued, i.e. when the context ipad/opad change. This checks is done after filling the ipad/opad which and it can't be true. The patch fixes this by moving the check before the ipad/opad memcpy operations. Fixes: 1b44c5a60c13 ("crypto: inside-secure - add SafeXcel EIP197 crypto engine driver") Signed-off-by: Antoine Tenart Signed-off-by: Herbert Xu --- drivers/crypto/inside-secure/safexcel_hash.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/crypto/inside-secure/safexcel_hash.c b/drivers/crypto/inside-secure/safexcel_hash.c index 8527a5899a2f..a11b2edb41b9 100644 --- a/drivers/crypto/inside-secure/safexcel_hash.c +++ b/drivers/crypto/inside-secure/safexcel_hash.c @@ -883,9 +883,6 @@ static int safexcel_hmac_sha1_setkey(struct crypto_ahash *tfm, const u8 *key, if (ret) return ret; - memcpy(ctx->ipad, &istate.state, SHA1_DIGEST_SIZE); - memcpy(ctx->opad, &ostate.state, SHA1_DIGEST_SIZE); - for (i = 0; i < ARRAY_SIZE(istate.state); i++) { if (ctx->ipad[i] != le32_to_cpu(istate.state[i]) || ctx->opad[i] != le32_to_cpu(ostate.state[i])) { @@ -894,6 +891,9 @@ static int safexcel_hmac_sha1_setkey(struct crypto_ahash *tfm, const u8 *key, } } + memcpy(ctx->ipad, &istate.state, SHA1_DIGEST_SIZE); + memcpy(ctx->opad, &ostate.state, SHA1_DIGEST_SIZE); + return 0; } From 60c4081ec4195389f4fa1fce83eaad5e4b948748 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Antoine=20T=C3=A9nart?= Date: Wed, 19 Jul 2017 11:02:31 +0200 Subject: [PATCH 066/109] crypto: inside-secure - fix the sha state length in hmac_sha1_setkey A check is performed on the ipad/opad in the safexcel_hmac_sha1_setkey function, but the index used by the loop doing it is wrong. It is currently the size of the state array while it should be the size of a sha1 state. This patch fixes it. Fixes: 1b44c5a60c13 ("crypto: inside-secure - add SafeXcel EIP197 crypto engine driver") Reported-by: Dan Carpenter Signed-off-by: Antoine Tenart Signed-off-by: Herbert Xu --- drivers/crypto/inside-secure/safexcel_hash.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/crypto/inside-secure/safexcel_hash.c b/drivers/crypto/inside-secure/safexcel_hash.c index a11b2edb41b9..3f819399cd95 100644 --- a/drivers/crypto/inside-secure/safexcel_hash.c +++ b/drivers/crypto/inside-secure/safexcel_hash.c @@ -883,7 +883,7 @@ static int safexcel_hmac_sha1_setkey(struct crypto_ahash *tfm, const u8 *key, if (ret) return ret; - for (i = 0; i < ARRAY_SIZE(istate.state); i++) { + for (i = 0; i < SHA1_DIGEST_SIZE / sizeof(u32); i++) { if (ctx->ipad[i] != le32_to_cpu(istate.state[i]) || ctx->opad[i] != le32_to_cpu(ostate.state[i])) { ctx->base.needs_inv = true; From 2dda640040876cd8ae646408b69eea40c24f9ae9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 2 Aug 2017 23:10:46 -0700 Subject: [PATCH 067/109] net: fix keepalive code vs TCP_FASTOPEN_CONNECT syzkaller was able to trigger a divide by 0 in TCP stack [1] Issue here is that keepalive timer needs to be updated to not attempt to send a probe if the connection setup was deferred using TCP_FASTOPEN_CONNECT socket option added in linux-4.11 [1] divide error: 0000 [#1] SMP CPU: 18 PID: 0 Comm: swapper/18 Not tainted task: ffff986f62f4b040 ti: ffff986f62fa2000 task.ti: ffff986f62fa2000 RIP: 0010:[] [] __tcp_select_window+0x8d/0x160 Call Trace: [] tcp_transmit_skb+0x11/0x20 [] tcp_xmit_probe_skb+0xc1/0xe0 [] tcp_write_wakeup+0x68/0x160 [] tcp_keepalive_timer+0x17b/0x230 [] call_timer_fn+0x39/0xf0 [] run_timer_softirq+0x1d7/0x280 [] __do_softirq+0xcb/0x257 [] irq_exit+0x9c/0xb0 [] smp_apic_timer_interrupt+0x6a/0x80 [] apic_timer_interrupt+0x7f/0x90 [] ? cpuidle_enter_state+0x13a/0x3b0 [] ? cpuidle_enter_state+0x11d/0x3b0 Tested: Following packetdrill no longer crashes the kernel `echo 0 >/proc/sys/net/ipv4/tcp_timestamps` // Cache warmup: send a Fast Open cookie request 0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0 fcntl(3, F_SETFL, O_RDWR|O_NONBLOCK) = 0 +0 setsockopt(3, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0 +0 connect(3, ..., ...) = -1 EINPROGRESS (Operation is now in progress) +0 > S 0:0(0) +.01 < S. 123:123(0) ack 1 win 14600 +0 > . 1:1(0) ack 1 +0 close(3) = 0 +0 > F. 1:1(0) ack 1 +0 < F. 1:1(0) ack 2 win 92 +0 > . 2:2(0) ack 2 +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 +0 setsockopt(4, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0 +0 setsockopt(4, SOL_SOCKET, SO_KEEPALIVE, [1], 4) = 0 +.01 connect(4, ..., ...) = 0 +0 setsockopt(4, SOL_TCP, TCP_KEEPIDLE, [5], 4) = 0 +10 close(4) = 0 `echo 1 >/proc/sys/net/ipv4/tcp_timestamps` Fixes: 19f6d3f3c842 ("net/tcp-fastopen: Add new API support") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Wei Wang Cc: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c0feeeef962a..e906014890b6 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -652,7 +652,8 @@ static void tcp_keepalive_timer (unsigned long data) goto death; } - if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) + if (!sock_flag(sk, SOCK_KEEPOPEN) || + ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT))) goto out; elapsed = keepalive_time_when(tp); From b91d532928dff2141ea9c107c3e73104d9843767 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 3 Aug 2017 14:13:46 +0800 Subject: [PATCH 068/109] ipv6: set rt6i_protocol properly in the route when it is installed After commit c2ed1880fd61 ("net: ipv6: check route protocol when deleting routes"), ipv6 route checks rt protocol when trying to remove a rt entry. It introduced a side effect causing 'ip -6 route flush cache' not to work well. When flushing caches with iproute, all route caches get dumped from kernel then removed one by one by sending DELROUTE requests to kernel for each cache. The thing is iproute sends the request with the cache whose proto is set with RTPROT_REDIRECT by rt6_fill_node() when kernel dumps it. But in kernel the rt_cache protocol is still 0, which causes the cache not to be matched and removed. So the real reason is rt6i_protocol in the route is not set when it is allocated. As David Ahern's suggestion, this patch is to set rt6i_protocol properly in the route when it is installed and remove the codes setting rtm_protocol according to rt6i_flags in rt6_fill_node. This is also an improvement to keep rt6i_protocol consistent with rtm_protocol. Fixes: c2ed1880fd61 ("net: ipv6: check route protocol when deleting routes") Reported-by: Jianlin Shi Suggested-by: David Ahern Signed-off-by: Xin Long Signed-off-by: David S. Miller --- net/ipv6/route.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4d30c96a819d..a640fbcba15d 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2351,6 +2351,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu if (on_link) nrt->rt6i_flags &= ~RTF_GATEWAY; + nrt->rt6i_protocol = RTPROT_REDIRECT; nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key; if (ip6_ins_rt(nrt)) @@ -2461,6 +2462,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net, .fc_dst_len = prefixlen, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO | RTF_UP | RTF_PREF(pref), + .fc_protocol = RTPROT_RA, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = net, @@ -2513,6 +2515,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr, .fc_ifindex = dev->ifindex, .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES | RTF_PREF(pref), + .fc_protocol = RTPROT_RA, .fc_nlinfo.portid = 0, .fc_nlinfo.nlh = NULL, .fc_nlinfo.nl_net = dev_net(dev), @@ -3424,14 +3427,6 @@ static int rt6_fill_node(struct net *net, rtm->rtm_flags = 0; rtm->rtm_scope = RT_SCOPE_UNIVERSE; rtm->rtm_protocol = rt->rt6i_protocol; - if (rt->rt6i_flags & RTF_DYNAMIC) - rtm->rtm_protocol = RTPROT_REDIRECT; - else if (rt->rt6i_flags & RTF_ADDRCONF) { - if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO)) - rtm->rtm_protocol = RTPROT_RA; - else - rtm->rtm_protocol = RTPROT_KERNEL; - } if (rt->rt6i_flags & RTF_CACHE) rtm->rtm_flags |= RTM_F_CLONED; From e1a10ef7fa876f8510aaec36ea5c0cf34baba410 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 3 Aug 2017 09:19:52 -0400 Subject: [PATCH 069/109] tcp: introduce tcp_rto_delta_us() helper for xmit timer fix Pure refactor. This helper will be required in the xmit timer fix later in the patch series. (Because the TLP logic will want to make this calculation.) Fixes: 6ba8a3b19e76 ("tcp: Tail loss probe (TLP)") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 10 ++++++++++ net/ipv4/tcp_input.c | 5 +---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 70483296157f..ada65e767b28 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1916,6 +1916,16 @@ extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, u64 xmit_time); extern void tcp_rack_reo_timeout(struct sock *sk); +/* At how many usecs into the future should the RTO fire? */ +static inline s64 tcp_rto_delta_us(const struct sock *sk) +{ + const struct sk_buff *skb = tcp_write_queue_head(sk); + u32 rto = inet_csk(sk)->icsk_rto; + u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto); + + return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; +} + /* * Save and compile IPv4 options, return a pointer to it */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index dad026fcfd09..b9ba8d55d8b8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3004,10 +3004,7 @@ void tcp_rearm_rto(struct sock *sk) /* Offset the time elapsed after installing regular RTO */ if (icsk->icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) { - struct sk_buff *skb = tcp_write_queue_head(sk); - u64 rto_time_stamp = skb->skb_mstamp + - jiffies_to_usecs(rto); - s64 delta_us = rto_time_stamp - tp->tcp_mstamp; + s64 delta_us = tcp_rto_delta_us(sk); /* delta_us may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. */ From a2815817ffa68c7933a43eb55836d6e789bd4389 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 3 Aug 2017 09:19:53 -0400 Subject: [PATCH 070/109] tcp: enable xmit timer fix by having TLP use time when RTO should fire Have tcp_schedule_loss_probe() base the TLP scheduling decision based on when the RTO *should* fire. This is to enable the upcoming xmit timer fix in this series, where tcp_schedule_loss_probe() cannot assume that the last timer installed was an RTO timer (because we are no longer doing the "rearm RTO, rearm RTO, rearm TLP" dance on every ACK). So tcp_schedule_loss_probe() must independently figure out when an RTO would want to fire. In the new TLP implementation following in this series, we cannot assume that icsk_timeout was set based on an RTO; after processing a cumulative ACK the icsk_timeout we see can be from a previous TLP or RTO. So we need to independently recalculate the RTO time (instead of reading it out of icsk_timeout). Removing this dependency on the nature of icsk_timeout makes things a little easier to reason about anyway. Note that the old and new code should be equivalent, since they are both saying: "if the RTO is in the future, but at an earlier time than the normal TLP time, then set the TLP timer to fire when the RTO would have fired". Fixes: 6ba8a3b19e76 ("tcp: Tail loss probe (TLP)") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2f1588bf73da..cd8e257492c4 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2377,8 +2377,8 @@ bool tcp_schedule_loss_probe(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - u32 timeout, tlp_time_stamp, rto_time_stamp; u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); + u32 timeout, rto_delta_us; /* No consecutive loss probes. */ if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { @@ -2417,14 +2417,10 @@ bool tcp_schedule_loss_probe(struct sock *sk) (rtt + (rtt >> 1) + TCP_DELACK_MAX)); timeout = max_t(u32, timeout, msecs_to_jiffies(10)); - /* If RTO is shorter, just schedule TLP in its place. */ - tlp_time_stamp = tcp_jiffies32 + timeout; - rto_time_stamp = (u32)inet_csk(sk)->icsk_timeout; - if ((s32)(tlp_time_stamp - rto_time_stamp) > 0) { - s32 delta = rto_time_stamp - tcp_jiffies32; - if (delta > 0) - timeout = delta; - } + /* If the RTO formula yields an earlier time, then use that time. */ + rto_delta_us = tcp_rto_delta_us(sk); /* How far in future is RTO? */ + if (rto_delta_us > 0) + timeout = min_t(u32, timeout, usecs_to_jiffies(rto_delta_us)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_LOSS_PROBE, timeout, TCP_RTO_MAX); From df92c8394e6ea0469e8056946ef8add740ab8046 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Thu, 3 Aug 2017 09:19:54 -0400 Subject: [PATCH 071/109] tcp: fix xmit timer to only be reset if data ACKed/SACKed Fix a TCP loss recovery performance bug raised recently on the netdev list, in two threads: (i) July 26, 2017: netdev thread "TCP fast retransmit issues" (ii) July 26, 2017: netdev thread: "[PATCH V2 net-next] TLP: Don't reschedule PTO when there's one outstanding TLP retransmission" The basic problem is that incoming TCP packets that did not indicate forward progress could cause the xmit timer (TLP or RTO) to be rearmed and pushed back in time. In certain corner cases this could result in the following problems noted in these threads: - Repeated ACKs coming in with bogus SACKs corrupted by middleboxes could cause TCP to repeatedly schedule TLPs forever. We kept sending TLPs after every ~200ms, which elicited bogus SACKs, which caused more TLPs, ad infinitum; we never fired an RTO to fill in the holes. - Incoming data segments could, in some cases, cause us to reschedule our RTO or TLP timer further out in time, for no good reason. This could cause repeated inbound data to result in stalls in outbound data, in the presence of packet loss. This commit fixes these bugs by changing the TLP and RTO ACK processing to: (a) Only reschedule the xmit timer once per ACK. (b) Only reschedule the xmit timer if tcp_clean_rtx_queue() deems the ACK indicates sufficient forward progress (a packet was cumulatively ACKed, or we got a SACK for a packet that was sent before the most recent retransmit of the write queue head). This brings us back into closer compliance with the RFCs, since, as the comment for tcp_rearm_rto() notes, we should only restart the RTO timer after forward progress on the connection. Previously we were restarting the xmit timer even in these cases where there was no forward progress. As a side benefit, this commit simplifies and speeds up the TCP timer arming logic. We had been calling inet_csk_reset_xmit_timer() three times on normal ACKs that cumulatively acknowledged some data: 1) Once near the top of tcp_ack() to switch from TLP timer to RTO: if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); 2) Once in tcp_clean_rtx_queue(), to update the RTO: if (flag & FLAG_ACKED) { tcp_rearm_rto(sk); 3) Once in tcp_ack() after tcp_fastretrans_alert() to switch from RTO to TLP: if (icsk->icsk_pending == ICSK_TIME_RETRANS) tcp_schedule_loss_probe(sk); This commit, by only rescheduling the xmit timer once per ACK, simplifies the code and reduces CPU overhead. This commit was tested in an A/B test with Google web server traffic. SNMP stats and request latency metrics were within noise levels, substantiating that for normal web traffic patterns this is a rare issue. This commit was also tested with packetdrill tests to verify that it fixes the timer behavior in the corner cases discussed in the netdev threads mentioned above. This patch is a bug fix patch intended to be queued for -stable relases. Fixes: 6ba8a3b19e76 ("tcp: Tail loss probe (TLP)") Reported-by: Klavs Klavsen Reported-by: Mao Wenan Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Nandita Dukkipati Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 25 ++++++++++++++++--------- net/ipv4/tcp_output.c | 9 --------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b9ba8d55d8b8..53de1424c13c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -107,6 +107,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ +#define FLAG_SET_XMIT_TIMER 0x1000 /* Set TLP or RTO timer */ #define FLAG_SACK_RENEGING 0x2000 /* snd_una advanced to a sacked seq */ #define FLAG_UPDATE_TS_RECENT 0x4000 /* tcp_replace_ts_recent() */ #define FLAG_NO_CHALLENGE_ACK 0x8000 /* do not call tcp_send_challenge_ack() */ @@ -3016,6 +3017,13 @@ void tcp_rearm_rto(struct sock *sk) } } +/* Try to schedule a loss probe; if that doesn't work, then schedule an RTO. */ +static void tcp_set_xmit_timer(struct sock *sk) +{ + if (!tcp_schedule_loss_probe(sk)) + tcp_rearm_rto(sk); +} + /* If we get here, the whole TSO packet has not been acked. */ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { @@ -3177,7 +3185,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, ca_rtt_us, sack->rate); if (flag & FLAG_ACKED) { - tcp_rearm_rto(sk); + flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */ if (unlikely(icsk->icsk_mtup.probe_size && !after(tp->mtu_probe.probe_seq_end, tp->snd_una))) { tcp_mtup_probe_success(sk); @@ -3205,7 +3213,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, * after when the head was last (re)transmitted. Otherwise the * timeout may continue to extend in loss recovery. */ - tcp_rearm_rto(sk); + flag |= FLAG_SET_XMIT_TIMER; /* set TLP or RTO timer */ } if (icsk->icsk_ca_ops->pkts_acked) { @@ -3577,9 +3585,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; - if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) - tcp_rearm_rto(sk); - if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; icsk->icsk_retransmits = 0; @@ -3644,18 +3649,20 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked, &sack_state); + if (tp->tlp_high_seq) + tcp_process_tlp_ack(sk, ack, flag); + /* If needed, reset TLP/RTO timer; RACK may later override this. */ + if (flag & FLAG_SET_XMIT_TIMER) + tcp_set_xmit_timer(sk); + if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); } - if (tp->tlp_high_seq) - tcp_process_tlp_ack(sk, ack, flag); if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); - if (icsk->icsk_pending == ICSK_TIME_RETRANS) - tcp_schedule_loss_probe(sk); delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ tcp_rate_gen(sk, delivered, lost, sack_state.rate); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index cd8e257492c4..276406a83a37 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2380,21 +2380,12 @@ bool tcp_schedule_loss_probe(struct sock *sk) u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3); u32 timeout, rto_delta_us; - /* No consecutive loss probes. */ - if (WARN_ON(icsk->icsk_pending == ICSK_TIME_LOSS_PROBE)) { - tcp_rearm_rto(sk); - return false; - } /* Don't do any loss probe on a Fast Open connection before 3WHS * finishes. */ if (tp->fastopen_rsk) return false; - /* TLP is only scheduled when next timer event is RTO. */ - if (icsk->icsk_pending != ICSK_TIME_RETRANS) - return false; - /* Schedule a loss probe in 2*RTT for SACK capable connections * in Open state, that are either limited by cwnd or application. */ From 17b334a876fe121abbd6634ace83ca58deea4bc5 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 4 Aug 2017 14:12:29 +0200 Subject: [PATCH 072/109] mlxsw: spectrum_switchdev: Don't warn about valid situations Some operations in the bridge driver such as MDB deletion are preformed in an atomic context and thus deferred to a process context by the switchdev infrastructure. Therefore, by the time the operation is performed by the underlying device driver it's possible the bridge port context is already gone. This is especially true for removal flows, but theoretically can also be invoked during addition. Remove the warnings in such situations and return normally. Fixes: c57529e1d5d8 ("mlxsw: spectrum: Replace vPorts with Port-VLAN") Fixes: 3922285d96e7 ("net: bridge: Add support for offloading port attributes") Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../mellanox/mlxsw/spectrum_switchdev.c | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 656b2d3f1bee..73d54df89578 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -626,8 +626,8 @@ static int mlxsw_sp_port_attr_br_flags_set(struct mlxsw_sp_port *mlxsw_sp_port, bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp_port->mlxsw_sp->bridge, orig_dev); - if (WARN_ON(!bridge_port)) - return -EINVAL; + if (!bridge_port) + return 0; err = mlxsw_sp_bridge_port_flood_table_set(mlxsw_sp_port, bridge_port, MLXSW_SP_FLOOD_TYPE_UC, @@ -711,8 +711,8 @@ static int mlxsw_sp_port_attr_mc_router_set(struct mlxsw_sp_port *mlxsw_sp_port, bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp_port->mlxsw_sp->bridge, orig_dev); - if (WARN_ON(!bridge_port)) - return -EINVAL; + if (!bridge_port) + return 0; if (!bridge_port->bridge_device->multicast_enabled) return 0; @@ -1283,15 +1283,15 @@ static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port, return 0; bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp->bridge, orig_dev); - if (WARN_ON(!bridge_port)) - return -EINVAL; + if (!bridge_port) + return 0; bridge_device = bridge_port->bridge_device; mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_bridge(mlxsw_sp_port, bridge_device, mdb->vid); - if (WARN_ON(!mlxsw_sp_port_vlan)) - return -EINVAL; + if (!mlxsw_sp_port_vlan) + return 0; fid_index = mlxsw_sp_fid_index(mlxsw_sp_port_vlan->fid); @@ -1407,15 +1407,15 @@ static int mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port, int err = 0; bridge_port = mlxsw_sp_bridge_port_find(mlxsw_sp->bridge, orig_dev); - if (WARN_ON(!bridge_port)) - return -EINVAL; + if (!bridge_port) + return 0; bridge_device = bridge_port->bridge_device; mlxsw_sp_port_vlan = mlxsw_sp_port_vlan_find_by_bridge(mlxsw_sp_port, bridge_device, mdb->vid); - if (WARN_ON(!mlxsw_sp_port_vlan)) - return -EINVAL; + if (!mlxsw_sp_port_vlan) + return 0; fid_index = mlxsw_sp_fid_index(mlxsw_sp_port_vlan->fid); From 852cfeed0ebe93a1c1e0c153c04a57221f72bd7e Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 4 Aug 2017 14:12:30 +0200 Subject: [PATCH 073/109] mlxsw: spectrum_switchdev: Release multicast groups during fini Each multicast group (MID) stores a bitmap of ports to which a packet should be forwarded to in case an MDB entry associated with the MID is hit. Since the initial introduction of IGMP snooping in commit 3a49b4fde2a1 ("mlxsw: Adding layer 2 multicast support") the driver didn't correctly free these multicast groups upon ungraceful situations such as the removal of the upper bridge device or module removal. The correct way to fix this is to associate each MID with the bridge ports member in it and then drop the reference in case the bridge port is destroyed, but this will result in a lot more code and will be fixed in net-next. For now, upon module removal, traverse the MID list and release each one. Fixes: 3a49b4fde2a1 ("mlxsw: Adding layer 2 multicast support") Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- .../ethernet/mellanox/mlxsw/spectrum_switchdev.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 73d54df89578..5eb1606765c5 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -1974,6 +1974,17 @@ static void mlxsw_sp_fdb_fini(struct mlxsw_sp *mlxsw_sp) } +static void mlxsw_sp_mids_fini(struct mlxsw_sp *mlxsw_sp) +{ + struct mlxsw_sp_mid *mid, *tmp; + + list_for_each_entry_safe(mid, tmp, &mlxsw_sp->bridge->mids_list, list) { + list_del(&mid->list); + clear_bit(mid->mid, mlxsw_sp->bridge->mids_bitmap); + kfree(mid); + } +} + int mlxsw_sp_switchdev_init(struct mlxsw_sp *mlxsw_sp) { struct mlxsw_sp_bridge *bridge; @@ -1996,7 +2007,7 @@ int mlxsw_sp_switchdev_init(struct mlxsw_sp *mlxsw_sp) void mlxsw_sp_switchdev_fini(struct mlxsw_sp *mlxsw_sp) { mlxsw_sp_fdb_fini(mlxsw_sp); - WARN_ON(!list_empty(&mlxsw_sp->bridge->mids_list)); + mlxsw_sp_mids_fini(mlxsw_sp); WARN_ON(!list_empty(&mlxsw_sp->bridge->bridges_list)); kfree(mlxsw_sp->bridge); } From b0a0c2566f28e71e5e32121992ac8060cec75510 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Aug 2017 14:20:54 +0200 Subject: [PATCH 074/109] bpf, s390: fix jit branch offset related to ldimm64 While testing some other work that required JIT modifications, I run into test_bpf causing a hang when JIT enabled on s390. The problematic test case was the one from ddc665a4bb4b (bpf, arm64: fix jit branch offset related to ldimm64), and turns out that we do have a similar issue on s390 as well. In bpf_jit_prog() we update next instruction address after returning from bpf_jit_insn() with an insn_count. bpf_jit_insn() returns either -1 in case of error (e.g. unsupported insn), 1 or 2. The latter is only the case for ldimm64 due to spanning 2 insns, however, next address is only set to i + 1 not taking actual insn_count into account, thus fix is to use insn_count instead of 1. bpf_jit_enable in mode 2 provides also disasm on s390: Before fix: 000003ff800349b6: a7f40003 brc 15,3ff800349bc ; target 000003ff800349ba: 0000 unknown 000003ff800349bc: e3b0f0700024 stg %r11,112(%r15) 000003ff800349c2: e3e0f0880024 stg %r14,136(%r15) 000003ff800349c8: 0db0 basr %r11,%r0 000003ff800349ca: c0ef00000000 llilf %r14,0 000003ff800349d0: e320b0360004 lg %r2,54(%r11) 000003ff800349d6: e330b03e0004 lg %r3,62(%r11) 000003ff800349dc: ec23ffeda065 clgrj %r2,%r3,10,3ff800349b6 ; jmp 000003ff800349e2: e3e0b0460004 lg %r14,70(%r11) 000003ff800349e8: e3e0b04e0004 lg %r14,78(%r11) 000003ff800349ee: b904002e lgr %r2,%r14 000003ff800349f2: e3b0f0700004 lg %r11,112(%r15) 000003ff800349f8: e3e0f0880004 lg %r14,136(%r15) 000003ff800349fe: 07fe bcr 15,%r14 After fix: 000003ff80ef3db4: a7f40003 brc 15,3ff80ef3dba 000003ff80ef3db8: 0000 unknown 000003ff80ef3dba: e3b0f0700024 stg %r11,112(%r15) 000003ff80ef3dc0: e3e0f0880024 stg %r14,136(%r15) 000003ff80ef3dc6: 0db0 basr %r11,%r0 000003ff80ef3dc8: c0ef00000000 llilf %r14,0 000003ff80ef3dce: e320b0360004 lg %r2,54(%r11) 000003ff80ef3dd4: e330b03e0004 lg %r3,62(%r11) 000003ff80ef3dda: ec230006a065 clgrj %r2,%r3,10,3ff80ef3de6 ; jmp 000003ff80ef3de0: e3e0b0460004 lg %r14,70(%r11) 000003ff80ef3de6: e3e0b04e0004 lg %r14,78(%r11) ; target 000003ff80ef3dec: b904002e lgr %r2,%r14 000003ff80ef3df0: e3b0f0700004 lg %r11,112(%r15) 000003ff80ef3df6: e3e0f0880004 lg %r14,136(%r15) 000003ff80ef3dfc: 07fe bcr 15,%r14 test_bpf.ko suite runs fine after the fix. Fixes: 054623105728 ("s390/bpf: Add s390x eBPF JIT compiler backend") Signed-off-by: Daniel Borkmann Tested-by: Michael Holzheu Signed-off-by: David S. Miller --- arch/s390/net/bpf_jit_comp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 01c6fbc3e85b..1803797fc885 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1253,7 +1253,8 @@ static int bpf_jit_prog(struct bpf_jit *jit, struct bpf_prog *fp) insn_count = bpf_jit_insn(jit, fp, i); if (insn_count < 0) return -1; - jit->addrs[i + 1] = jit->prg; /* Next instruction address */ + /* Next instruction address */ + jit->addrs[i + insn_count] = jit->prg; } bpf_jit_epilogue(jit); From bad1926dd2f60bbb4af5223ace3a7b34a0219a66 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Aug 2017 14:20:55 +0200 Subject: [PATCH 075/109] bpf, s390: fix build for libbpf and selftest suite The BPF feature test as well as libbpf is missing the __NR_bpf define for s390 and currently refuses to compile (selftest suite depends on libbpf as well). Similar issue was fixed some time ago via b0c47807d31d ("bpf: Add sparc support to tools and samples."), just do the same and add definitions. Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- tools/build/feature/test-bpf.c | 2 ++ tools/lib/bpf/bpf.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tools/build/feature/test-bpf.c b/tools/build/feature/test-bpf.c index 7598361ef1f1..da2172ff9662 100644 --- a/tools/build/feature/test-bpf.c +++ b/tools/build/feature/test-bpf.c @@ -11,6 +11,8 @@ # define __NR_bpf 280 # elif defined(__sparc__) # define __NR_bpf 349 +# elif defined(__s390__) +# define __NR_bpf 351 # else # error __NR_bpf not defined. libbpf does not support your arch. # endif diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 256f571f2ab5..e5bbb090bf88 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -39,6 +39,8 @@ # define __NR_bpf 280 # elif defined(__sparc__) # define __NR_bpf 349 +# elif defined(__s390__) +# define __NR_bpf 351 # else # error __NR_bpf not defined. libbpf does not support your arch. # endif From b6bd53f9c4e825fca82fe1392157c78443c814ab Mon Sep 17 00:00:00 2001 From: David Daney Date: Thu, 3 Aug 2017 17:10:12 -0700 Subject: [PATCH 076/109] MIPS: Add missing file for eBPF JIT. Inexplicably, commit f381bf6d82f0 ("MIPS: Add support for eBPF JIT.") lost a file somewhere on its path to Linus' tree. Add back the missing ebpf_jit.c so that we can build with CONFIG_BPF_JIT selected. This version of ebpf_jit.c is identical to the original except for two minor change need to resolve conflicts with changes merged from the BPF branch: A) Set prog->jited_len = image_size; B) Use BPF_TAIL_CALL instead of BPF_CALL | BPF_X Fixes: f381bf6d82f0 ("MIPS: Add support for eBPF JIT.") Signed-off-by: David Daney Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- arch/mips/net/ebpf_jit.c | 1950 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 1950 insertions(+) create mode 100644 arch/mips/net/ebpf_jit.c diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c new file mode 100644 index 000000000000..3f87b96da5c4 --- /dev/null +++ b/arch/mips/net/ebpf_jit.c @@ -0,0 +1,1950 @@ +/* + * Just-In-Time compiler for eBPF filters on MIPS + * + * Copyright (c) 2017 Cavium, Inc. + * + * Based on code from: + * + * Copyright (c) 2014 Imagination Technologies Ltd. + * Author: Markos Chandras + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; version 2 of the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Registers used by JIT */ +#define MIPS_R_ZERO 0 +#define MIPS_R_AT 1 +#define MIPS_R_V0 2 /* BPF_R0 */ +#define MIPS_R_V1 3 +#define MIPS_R_A0 4 /* BPF_R1 */ +#define MIPS_R_A1 5 /* BPF_R2 */ +#define MIPS_R_A2 6 /* BPF_R3 */ +#define MIPS_R_A3 7 /* BPF_R4 */ +#define MIPS_R_A4 8 /* BPF_R5 */ +#define MIPS_R_T4 12 /* BPF_AX */ +#define MIPS_R_T5 13 +#define MIPS_R_T6 14 +#define MIPS_R_T7 15 +#define MIPS_R_S0 16 /* BPF_R6 */ +#define MIPS_R_S1 17 /* BPF_R7 */ +#define MIPS_R_S2 18 /* BPF_R8 */ +#define MIPS_R_S3 19 /* BPF_R9 */ +#define MIPS_R_S4 20 /* BPF_TCC */ +#define MIPS_R_S5 21 +#define MIPS_R_S6 22 +#define MIPS_R_S7 23 +#define MIPS_R_T8 24 +#define MIPS_R_T9 25 +#define MIPS_R_SP 29 +#define MIPS_R_RA 31 + +/* eBPF flags */ +#define EBPF_SAVE_S0 BIT(0) +#define EBPF_SAVE_S1 BIT(1) +#define EBPF_SAVE_S2 BIT(2) +#define EBPF_SAVE_S3 BIT(3) +#define EBPF_SAVE_S4 BIT(4) +#define EBPF_SAVE_RA BIT(5) +#define EBPF_SEEN_FP BIT(6) +#define EBPF_SEEN_TC BIT(7) +#define EBPF_TCC_IN_V1 BIT(8) + +/* + * For the mips64 ISA, we need to track the value range or type for + * each JIT register. The BPF machine requires zero extended 32-bit + * values, but the mips64 ISA requires sign extended 32-bit values. + * At each point in the BPF program we track the state of every + * register so that we can zero extend or sign extend as the BPF + * semantics require. + */ +enum reg_val_type { + /* uninitialized */ + REG_UNKNOWN, + /* not known to be 32-bit compatible. */ + REG_64BIT, + /* 32-bit compatible, no truncation needed for 64-bit ops. */ + REG_64BIT_32BIT, + /* 32-bit compatible, need truncation for 64-bit ops. */ + REG_32BIT, + /* 32-bit zero extended. */ + REG_32BIT_ZERO_EX, + /* 32-bit no sign/zero extension needed. */ + REG_32BIT_POS +}; + +/* + * high bit of offsets indicates if long branch conversion done at + * this insn. + */ +#define OFFSETS_B_CONV BIT(31) + +/** + * struct jit_ctx - JIT context + * @skf: The sk_filter + * @stack_size: eBPF stack size + * @tmp_offset: eBPF $sp offset to 8-byte temporary memory + * @idx: Instruction index + * @flags: JIT flags + * @offsets: Instruction offsets + * @target: Memory location for the compiled filter + * @reg_val_types Packed enum reg_val_type for each register. + */ +struct jit_ctx { + const struct bpf_prog *skf; + int stack_size; + int tmp_offset; + u32 idx; + u32 flags; + u32 *offsets; + u32 *target; + u64 *reg_val_types; + unsigned int long_b_conversion:1; + unsigned int gen_b_offsets:1; +}; + +static void set_reg_val_type(u64 *rvt, int reg, enum reg_val_type type) +{ + *rvt &= ~(7ull << (reg * 3)); + *rvt |= ((u64)type << (reg * 3)); +} + +static enum reg_val_type get_reg_val_type(const struct jit_ctx *ctx, + int index, int reg) +{ + return (ctx->reg_val_types[index] >> (reg * 3)) & 7; +} + +/* Simply emit the instruction if the JIT memory space has been allocated */ +#define emit_instr(ctx, func, ...) \ +do { \ + if ((ctx)->target != NULL) { \ + u32 *p = &(ctx)->target[ctx->idx]; \ + uasm_i_##func(&p, ##__VA_ARGS__); \ + } \ + (ctx)->idx++; \ +} while (0) + +static unsigned int j_target(struct jit_ctx *ctx, int target_idx) +{ + unsigned long target_va, base_va; + unsigned int r; + + if (!ctx->target) + return 0; + + base_va = (unsigned long)ctx->target; + target_va = base_va + (ctx->offsets[target_idx] & ~OFFSETS_B_CONV); + + if ((base_va & ~0x0ffffffful) != (target_va & ~0x0ffffffful)) + return (unsigned int)-1; + r = target_va & 0x0ffffffful; + return r; +} + +/* Compute the immediate value for PC-relative branches. */ +static u32 b_imm(unsigned int tgt, struct jit_ctx *ctx) +{ + if (!ctx->gen_b_offsets) + return 0; + + /* + * We want a pc-relative branch. tgt is the instruction offset + * we want to jump to. + + * Branch on MIPS: + * I: target_offset <- sign_extend(offset) + * I+1: PC += target_offset (delay slot) + * + * ctx->idx currently points to the branch instruction + * but the offset is added to the delay slot so we need + * to subtract 4. + */ + return (ctx->offsets[tgt] & ~OFFSETS_B_CONV) - + (ctx->idx * 4) - 4; +} + +int bpf_jit_enable __read_mostly; + +enum which_ebpf_reg { + src_reg, + src_reg_no_fp, + dst_reg, + dst_reg_fp_ok +}; + +/* + * For eBPF, the register mapping naturally falls out of the + * requirements of eBPF and the MIPS n64 ABI. We don't maintain a + * separate frame pointer, so BPF_REG_10 relative accesses are + * adjusted to be $sp relative. + */ +int ebpf_to_mips_reg(struct jit_ctx *ctx, const struct bpf_insn *insn, + enum which_ebpf_reg w) +{ + int ebpf_reg = (w == src_reg || w == src_reg_no_fp) ? + insn->src_reg : insn->dst_reg; + + switch (ebpf_reg) { + case BPF_REG_0: + return MIPS_R_V0; + case BPF_REG_1: + return MIPS_R_A0; + case BPF_REG_2: + return MIPS_R_A1; + case BPF_REG_3: + return MIPS_R_A2; + case BPF_REG_4: + return MIPS_R_A3; + case BPF_REG_5: + return MIPS_R_A4; + case BPF_REG_6: + ctx->flags |= EBPF_SAVE_S0; + return MIPS_R_S0; + case BPF_REG_7: + ctx->flags |= EBPF_SAVE_S1; + return MIPS_R_S1; + case BPF_REG_8: + ctx->flags |= EBPF_SAVE_S2; + return MIPS_R_S2; + case BPF_REG_9: + ctx->flags |= EBPF_SAVE_S3; + return MIPS_R_S3; + case BPF_REG_10: + if (w == dst_reg || w == src_reg_no_fp) + goto bad_reg; + ctx->flags |= EBPF_SEEN_FP; + /* + * Needs special handling, return something that + * cannot be clobbered just in case. + */ + return MIPS_R_ZERO; + case BPF_REG_AX: + return MIPS_R_T4; + default: +bad_reg: + WARN(1, "Illegal bpf reg: %d\n", ebpf_reg); + return -EINVAL; + } +} +/* + * eBPF stack frame will be something like: + * + * Entry $sp ------> +--------------------------------+ + * | $ra (optional) | + * +--------------------------------+ + * | $s0 (optional) | + * +--------------------------------+ + * | $s1 (optional) | + * +--------------------------------+ + * | $s2 (optional) | + * +--------------------------------+ + * | $s3 (optional) | + * +--------------------------------+ + * | $s4 (optional) | + * +--------------------------------+ + * | tmp-storage (if $ra saved) | + * $sp + tmp_offset --> +--------------------------------+ <--BPF_REG_10 + * | BPF_REG_10 relative storage | + * | MAX_BPF_STACK (optional) | + * | . | + * | . | + * | . | + * $sp --------> +--------------------------------+ + * + * If BPF_REG_10 is never referenced, then the MAX_BPF_STACK sized + * area is not allocated. + */ +static int gen_int_prologue(struct jit_ctx *ctx) +{ + int stack_adjust = 0; + int store_offset; + int locals_size; + + if (ctx->flags & EBPF_SAVE_RA) + /* + * If RA we are doing a function call and may need + * extra 8-byte tmp area. + */ + stack_adjust += 16; + if (ctx->flags & EBPF_SAVE_S0) + stack_adjust += 8; + if (ctx->flags & EBPF_SAVE_S1) + stack_adjust += 8; + if (ctx->flags & EBPF_SAVE_S2) + stack_adjust += 8; + if (ctx->flags & EBPF_SAVE_S3) + stack_adjust += 8; + if (ctx->flags & EBPF_SAVE_S4) + stack_adjust += 8; + + BUILD_BUG_ON(MAX_BPF_STACK & 7); + locals_size = (ctx->flags & EBPF_SEEN_FP) ? MAX_BPF_STACK : 0; + + stack_adjust += locals_size; + ctx->tmp_offset = locals_size; + + ctx->stack_size = stack_adjust; + + /* + * First instruction initializes the tail call count (TCC). + * On tail call we skip this instruction, and the TCC is + * passed in $v1 from the caller. + */ + emit_instr(ctx, daddiu, MIPS_R_V1, MIPS_R_ZERO, MAX_TAIL_CALL_CNT); + if (stack_adjust) + emit_instr(ctx, daddiu, MIPS_R_SP, MIPS_R_SP, -stack_adjust); + else + return 0; + + store_offset = stack_adjust - 8; + + if (ctx->flags & EBPF_SAVE_RA) { + emit_instr(ctx, sd, MIPS_R_RA, store_offset, MIPS_R_SP); + store_offset -= 8; + } + if (ctx->flags & EBPF_SAVE_S0) { + emit_instr(ctx, sd, MIPS_R_S0, store_offset, MIPS_R_SP); + store_offset -= 8; + } + if (ctx->flags & EBPF_SAVE_S1) { + emit_instr(ctx, sd, MIPS_R_S1, store_offset, MIPS_R_SP); + store_offset -= 8; + } + if (ctx->flags & EBPF_SAVE_S2) { + emit_instr(ctx, sd, MIPS_R_S2, store_offset, MIPS_R_SP); + store_offset -= 8; + } + if (ctx->flags & EBPF_SAVE_S3) { + emit_instr(ctx, sd, MIPS_R_S3, store_offset, MIPS_R_SP); + store_offset -= 8; + } + if (ctx->flags & EBPF_SAVE_S4) { + emit_instr(ctx, sd, MIPS_R_S4, store_offset, MIPS_R_SP); + store_offset -= 8; + } + + if ((ctx->flags & EBPF_SEEN_TC) && !(ctx->flags & EBPF_TCC_IN_V1)) + emit_instr(ctx, daddu, MIPS_R_S4, MIPS_R_V1, MIPS_R_ZERO); + + return 0; +} + +static int build_int_epilogue(struct jit_ctx *ctx, int dest_reg) +{ + const struct bpf_prog *prog = ctx->skf; + int stack_adjust = ctx->stack_size; + int store_offset = stack_adjust - 8; + int r0 = MIPS_R_V0; + + if (dest_reg == MIPS_R_RA && + get_reg_val_type(ctx, prog->len, BPF_REG_0) == REG_32BIT_ZERO_EX) + /* Don't let zero extended value escape. */ + emit_instr(ctx, sll, r0, r0, 0); + + if (ctx->flags & EBPF_SAVE_RA) { + emit_instr(ctx, ld, MIPS_R_RA, store_offset, MIPS_R_SP); + store_offset -= 8; + } + if (ctx->flags & EBPF_SAVE_S0) { + emit_instr(ctx, ld, MIPS_R_S0, store_offset, MIPS_R_SP); + store_offset -= 8; + } + if (ctx->flags & EBPF_SAVE_S1) { + emit_instr(ctx, ld, MIPS_R_S1, store_offset, MIPS_R_SP); + store_offset -= 8; + } + if (ctx->flags & EBPF_SAVE_S2) { + emit_instr(ctx, ld, MIPS_R_S2, store_offset, MIPS_R_SP); + store_offset -= 8; + } + if (ctx->flags & EBPF_SAVE_S3) { + emit_instr(ctx, ld, MIPS_R_S3, store_offset, MIPS_R_SP); + store_offset -= 8; + } + if (ctx->flags & EBPF_SAVE_S4) { + emit_instr(ctx, ld, MIPS_R_S4, store_offset, MIPS_R_SP); + store_offset -= 8; + } + emit_instr(ctx, jr, dest_reg); + + if (stack_adjust) + emit_instr(ctx, daddiu, MIPS_R_SP, MIPS_R_SP, stack_adjust); + else + emit_instr(ctx, nop); + + return 0; +} + +static void gen_imm_to_reg(const struct bpf_insn *insn, int reg, + struct jit_ctx *ctx) +{ + if (insn->imm >= S16_MIN && insn->imm <= S16_MAX) { + emit_instr(ctx, addiu, reg, MIPS_R_ZERO, insn->imm); + } else { + int lower = (s16)(insn->imm & 0xffff); + int upper = insn->imm - lower; + + emit_instr(ctx, lui, reg, upper >> 16); + emit_instr(ctx, addiu, reg, reg, lower); + } + +} + +static int gen_imm_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, + int idx) +{ + int upper_bound, lower_bound; + int dst = ebpf_to_mips_reg(ctx, insn, dst_reg); + + if (dst < 0) + return dst; + + switch (BPF_OP(insn->code)) { + case BPF_MOV: + case BPF_ADD: + upper_bound = S16_MAX; + lower_bound = S16_MIN; + break; + case BPF_SUB: + upper_bound = -(int)S16_MIN; + lower_bound = -(int)S16_MAX; + break; + case BPF_AND: + case BPF_OR: + case BPF_XOR: + upper_bound = 0xffff; + lower_bound = 0; + break; + case BPF_RSH: + case BPF_LSH: + case BPF_ARSH: + /* Shift amounts are truncated, no need for bounds */ + upper_bound = S32_MAX; + lower_bound = S32_MIN; + break; + default: + return -EINVAL; + } + + /* + * Immediate move clobbers the register, so no sign/zero + * extension needed. + */ + if (BPF_CLASS(insn->code) == BPF_ALU64 && + BPF_OP(insn->code) != BPF_MOV && + get_reg_val_type(ctx, idx, insn->dst_reg) == REG_32BIT) + emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32); + /* BPF_ALU | BPF_LSH doesn't need separate sign extension */ + if (BPF_CLASS(insn->code) == BPF_ALU && + BPF_OP(insn->code) != BPF_LSH && + BPF_OP(insn->code) != BPF_MOV && + get_reg_val_type(ctx, idx, insn->dst_reg) != REG_32BIT) + emit_instr(ctx, sll, dst, dst, 0); + + if (insn->imm >= lower_bound && insn->imm <= upper_bound) { + /* single insn immediate case */ + switch (BPF_OP(insn->code) | BPF_CLASS(insn->code)) { + case BPF_ALU64 | BPF_MOV: + emit_instr(ctx, daddiu, dst, MIPS_R_ZERO, insn->imm); + break; + case BPF_ALU64 | BPF_AND: + case BPF_ALU | BPF_AND: + emit_instr(ctx, andi, dst, dst, insn->imm); + break; + case BPF_ALU64 | BPF_OR: + case BPF_ALU | BPF_OR: + emit_instr(ctx, ori, dst, dst, insn->imm); + break; + case BPF_ALU64 | BPF_XOR: + case BPF_ALU | BPF_XOR: + emit_instr(ctx, xori, dst, dst, insn->imm); + break; + case BPF_ALU64 | BPF_ADD: + emit_instr(ctx, daddiu, dst, dst, insn->imm); + break; + case BPF_ALU64 | BPF_SUB: + emit_instr(ctx, daddiu, dst, dst, -insn->imm); + break; + case BPF_ALU64 | BPF_RSH: + emit_instr(ctx, dsrl_safe, dst, dst, insn->imm & 0x3f); + break; + case BPF_ALU | BPF_RSH: + emit_instr(ctx, srl, dst, dst, insn->imm & 0x1f); + break; + case BPF_ALU64 | BPF_LSH: + emit_instr(ctx, dsll_safe, dst, dst, insn->imm & 0x3f); + break; + case BPF_ALU | BPF_LSH: + emit_instr(ctx, sll, dst, dst, insn->imm & 0x1f); + break; + case BPF_ALU64 | BPF_ARSH: + emit_instr(ctx, dsra_safe, dst, dst, insn->imm & 0x3f); + break; + case BPF_ALU | BPF_ARSH: + emit_instr(ctx, sra, dst, dst, insn->imm & 0x1f); + break; + case BPF_ALU | BPF_MOV: + emit_instr(ctx, addiu, dst, MIPS_R_ZERO, insn->imm); + break; + case BPF_ALU | BPF_ADD: + emit_instr(ctx, addiu, dst, dst, insn->imm); + break; + case BPF_ALU | BPF_SUB: + emit_instr(ctx, addiu, dst, dst, -insn->imm); + break; + default: + return -EINVAL; + } + } else { + /* multi insn immediate case */ + if (BPF_OP(insn->code) == BPF_MOV) { + gen_imm_to_reg(insn, dst, ctx); + } else { + gen_imm_to_reg(insn, MIPS_R_AT, ctx); + switch (BPF_OP(insn->code) | BPF_CLASS(insn->code)) { + case BPF_ALU64 | BPF_AND: + case BPF_ALU | BPF_AND: + emit_instr(ctx, and, dst, dst, MIPS_R_AT); + break; + case BPF_ALU64 | BPF_OR: + case BPF_ALU | BPF_OR: + emit_instr(ctx, or, dst, dst, MIPS_R_AT); + break; + case BPF_ALU64 | BPF_XOR: + case BPF_ALU | BPF_XOR: + emit_instr(ctx, xor, dst, dst, MIPS_R_AT); + break; + case BPF_ALU64 | BPF_ADD: + emit_instr(ctx, daddu, dst, dst, MIPS_R_AT); + break; + case BPF_ALU64 | BPF_SUB: + emit_instr(ctx, dsubu, dst, dst, MIPS_R_AT); + break; + case BPF_ALU | BPF_ADD: + emit_instr(ctx, addu, dst, dst, MIPS_R_AT); + break; + case BPF_ALU | BPF_SUB: + emit_instr(ctx, subu, dst, dst, MIPS_R_AT); + break; + default: + return -EINVAL; + } + } + } + + return 0; +} + +static void * __must_check +ool_skb_header_pointer(const struct sk_buff *skb, int offset, + int len, void *buffer) +{ + return skb_header_pointer(skb, offset, len, buffer); +} + +static int size_to_len(const struct bpf_insn *insn) +{ + switch (BPF_SIZE(insn->code)) { + case BPF_B: + return 1; + case BPF_H: + return 2; + case BPF_W: + return 4; + case BPF_DW: + return 8; + } + return 0; +} + +static void emit_const_to_reg(struct jit_ctx *ctx, int dst, u64 value) +{ + if (value >= 0xffffffffffff8000ull || value < 0x8000ull) { + emit_instr(ctx, daddiu, dst, MIPS_R_ZERO, (int)value); + } else if (value >= 0xffffffff80000000ull || + (value < 0x80000000 && value > 0xffff)) { + emit_instr(ctx, lui, dst, (s32)(s16)(value >> 16)); + emit_instr(ctx, ori, dst, dst, (unsigned int)(value & 0xffff)); + } else { + int i; + bool seen_part = false; + int needed_shift = 0; + + for (i = 0; i < 4; i++) { + u64 part = (value >> (16 * (3 - i))) & 0xffff; + + if (seen_part && needed_shift > 0 && (part || i == 3)) { + emit_instr(ctx, dsll_safe, dst, dst, needed_shift); + needed_shift = 0; + } + if (part) { + if (i == 0 || (!seen_part && i < 3 && part < 0x8000)) { + emit_instr(ctx, lui, dst, (s32)(s16)part); + needed_shift = -16; + } else { + emit_instr(ctx, ori, dst, + seen_part ? dst : MIPS_R_ZERO, + (unsigned int)part); + } + seen_part = true; + } + if (seen_part) + needed_shift += 16; + } + } +} + +static int emit_bpf_tail_call(struct jit_ctx *ctx, int this_idx) +{ + int off, b_off; + + ctx->flags |= EBPF_SEEN_TC; + /* + * if (index >= array->map.max_entries) + * goto out; + */ + off = offsetof(struct bpf_array, map.max_entries); + emit_instr(ctx, lwu, MIPS_R_T5, off, MIPS_R_A1); + emit_instr(ctx, sltu, MIPS_R_AT, MIPS_R_T5, MIPS_R_A2); + b_off = b_imm(this_idx + 1, ctx); + emit_instr(ctx, bne, MIPS_R_AT, MIPS_R_ZERO, b_off); + /* + * if (--TCC < 0) + * goto out; + */ + /* Delay slot */ + emit_instr(ctx, daddiu, MIPS_R_T5, + (ctx->flags & EBPF_TCC_IN_V1) ? MIPS_R_V1 : MIPS_R_S4, -1); + b_off = b_imm(this_idx + 1, ctx); + emit_instr(ctx, bltz, MIPS_R_T5, b_off); + /* + * prog = array->ptrs[index]; + * if (prog == NULL) + * goto out; + */ + /* Delay slot */ + emit_instr(ctx, dsll, MIPS_R_T8, MIPS_R_A2, 3); + emit_instr(ctx, daddu, MIPS_R_T8, MIPS_R_T8, MIPS_R_A1); + off = offsetof(struct bpf_array, ptrs); + emit_instr(ctx, ld, MIPS_R_AT, off, MIPS_R_T8); + b_off = b_imm(this_idx + 1, ctx); + emit_instr(ctx, beq, MIPS_R_AT, MIPS_R_ZERO, b_off); + /* Delay slot */ + emit_instr(ctx, nop); + + /* goto *(prog->bpf_func + 4); */ + off = offsetof(struct bpf_prog, bpf_func); + emit_instr(ctx, ld, MIPS_R_T9, off, MIPS_R_AT); + /* All systems are go... propagate TCC */ + emit_instr(ctx, daddu, MIPS_R_V1, MIPS_R_T5, MIPS_R_ZERO); + /* Skip first instruction (TCC initialization) */ + emit_instr(ctx, daddiu, MIPS_R_T9, MIPS_R_T9, 4); + return build_int_epilogue(ctx, MIPS_R_T9); +} + +static bool use_bbit_insns(void) +{ + switch (current_cpu_type()) { + case CPU_CAVIUM_OCTEON: + case CPU_CAVIUM_OCTEON_PLUS: + case CPU_CAVIUM_OCTEON2: + case CPU_CAVIUM_OCTEON3: + return true; + default: + return false; + } +} + +static bool is_bad_offset(int b_off) +{ + return b_off > 0x1ffff || b_off < -0x20000; +} + +/* Returns the number of insn slots consumed. */ +static int build_one_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, + int this_idx, int exit_idx) +{ + int src, dst, r, td, ts, mem_off, b_off; + bool need_swap, did_move, cmp_eq; + unsigned int target; + u64 t64; + s64 t64s; + + switch (insn->code) { + case BPF_ALU64 | BPF_ADD | BPF_K: /* ALU64_IMM */ + case BPF_ALU64 | BPF_SUB | BPF_K: /* ALU64_IMM */ + case BPF_ALU64 | BPF_OR | BPF_K: /* ALU64_IMM */ + case BPF_ALU64 | BPF_AND | BPF_K: /* ALU64_IMM */ + case BPF_ALU64 | BPF_LSH | BPF_K: /* ALU64_IMM */ + case BPF_ALU64 | BPF_RSH | BPF_K: /* ALU64_IMM */ + case BPF_ALU64 | BPF_XOR | BPF_K: /* ALU64_IMM */ + case BPF_ALU64 | BPF_ARSH | BPF_K: /* ALU64_IMM */ + case BPF_ALU64 | BPF_MOV | BPF_K: /* ALU64_IMM */ + case BPF_ALU | BPF_MOV | BPF_K: /* ALU32_IMM */ + case BPF_ALU | BPF_ADD | BPF_K: /* ALU32_IMM */ + case BPF_ALU | BPF_SUB | BPF_K: /* ALU32_IMM */ + case BPF_ALU | BPF_OR | BPF_K: /* ALU64_IMM */ + case BPF_ALU | BPF_AND | BPF_K: /* ALU64_IMM */ + case BPF_ALU | BPF_LSH | BPF_K: /* ALU64_IMM */ + case BPF_ALU | BPF_RSH | BPF_K: /* ALU64_IMM */ + case BPF_ALU | BPF_XOR | BPF_K: /* ALU64_IMM */ + case BPF_ALU | BPF_ARSH | BPF_K: /* ALU64_IMM */ + r = gen_imm_insn(insn, ctx, this_idx); + if (r < 0) + return r; + break; + case BPF_ALU64 | BPF_MUL | BPF_K: /* ALU64_IMM */ + dst = ebpf_to_mips_reg(ctx, insn, dst_reg); + if (dst < 0) + return dst; + if (get_reg_val_type(ctx, this_idx, insn->dst_reg) == REG_32BIT) + emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32); + if (insn->imm == 1) /* Mult by 1 is a nop */ + break; + gen_imm_to_reg(insn, MIPS_R_AT, ctx); + emit_instr(ctx, dmultu, MIPS_R_AT, dst); + emit_instr(ctx, mflo, dst); + break; + case BPF_ALU64 | BPF_NEG | BPF_K: /* ALU64_IMM */ + dst = ebpf_to_mips_reg(ctx, insn, dst_reg); + if (dst < 0) + return dst; + if (get_reg_val_type(ctx, this_idx, insn->dst_reg) == REG_32BIT) + emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32); + emit_instr(ctx, dsubu, dst, MIPS_R_ZERO, dst); + break; + case BPF_ALU | BPF_MUL | BPF_K: /* ALU_IMM */ + dst = ebpf_to_mips_reg(ctx, insn, dst_reg); + if (dst < 0) + return dst; + td = get_reg_val_type(ctx, this_idx, insn->dst_reg); + if (td == REG_64BIT || td == REG_32BIT_ZERO_EX) { + /* sign extend */ + emit_instr(ctx, sll, dst, dst, 0); + } + if (insn->imm == 1) /* Mult by 1 is a nop */ + break; + gen_imm_to_reg(insn, MIPS_R_AT, ctx); + emit_instr(ctx, multu, dst, MIPS_R_AT); + emit_instr(ctx, mflo, dst); + break; + case BPF_ALU | BPF_NEG | BPF_K: /* ALU_IMM */ + dst = ebpf_to_mips_reg(ctx, insn, dst_reg); + if (dst < 0) + return dst; + td = get_reg_val_type(ctx, this_idx, insn->dst_reg); + if (td == REG_64BIT || td == REG_32BIT_ZERO_EX) { + /* sign extend */ + emit_instr(ctx, sll, dst, dst, 0); + } + emit_instr(ctx, subu, dst, MIPS_R_ZERO, dst); + break; + case BPF_ALU | BPF_DIV | BPF_K: /* ALU_IMM */ + case BPF_ALU | BPF_MOD | BPF_K: /* ALU_IMM */ + dst = ebpf_to_mips_reg(ctx, insn, dst_reg); + if (dst < 0) + return dst; + if (insn->imm == 0) { /* Div by zero */ + b_off = b_imm(exit_idx, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off); + emit_instr(ctx, addu, MIPS_R_V0, MIPS_R_ZERO, MIPS_R_ZERO); + } + td = get_reg_val_type(ctx, this_idx, insn->dst_reg); + if (td == REG_64BIT || td == REG_32BIT_ZERO_EX) + /* sign extend */ + emit_instr(ctx, sll, dst, dst, 0); + if (insn->imm == 1) { + /* div by 1 is a nop, mod by 1 is zero */ + if (BPF_OP(insn->code) == BPF_MOD) + emit_instr(ctx, addu, dst, MIPS_R_ZERO, MIPS_R_ZERO); + break; + } + gen_imm_to_reg(insn, MIPS_R_AT, ctx); + emit_instr(ctx, divu, dst, MIPS_R_AT); + if (BPF_OP(insn->code) == BPF_DIV) + emit_instr(ctx, mflo, dst); + else + emit_instr(ctx, mfhi, dst); + break; + case BPF_ALU64 | BPF_DIV | BPF_K: /* ALU_IMM */ + case BPF_ALU64 | BPF_MOD | BPF_K: /* ALU_IMM */ + dst = ebpf_to_mips_reg(ctx, insn, dst_reg); + if (dst < 0) + return dst; + if (insn->imm == 0) { /* Div by zero */ + b_off = b_imm(exit_idx, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off); + emit_instr(ctx, addu, MIPS_R_V0, MIPS_R_ZERO, MIPS_R_ZERO); + } + if (get_reg_val_type(ctx, this_idx, insn->dst_reg) == REG_32BIT) + emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32); + + if (insn->imm == 1) { + /* div by 1 is a nop, mod by 1 is zero */ + if (BPF_OP(insn->code) == BPF_MOD) + emit_instr(ctx, addu, dst, MIPS_R_ZERO, MIPS_R_ZERO); + break; + } + gen_imm_to_reg(insn, MIPS_R_AT, ctx); + emit_instr(ctx, ddivu, dst, MIPS_R_AT); + if (BPF_OP(insn->code) == BPF_DIV) + emit_instr(ctx, mflo, dst); + else + emit_instr(ctx, mfhi, dst); + break; + case BPF_ALU64 | BPF_MOV | BPF_X: /* ALU64_REG */ + case BPF_ALU64 | BPF_ADD | BPF_X: /* ALU64_REG */ + case BPF_ALU64 | BPF_SUB | BPF_X: /* ALU64_REG */ + case BPF_ALU64 | BPF_XOR | BPF_X: /* ALU64_REG */ + case BPF_ALU64 | BPF_OR | BPF_X: /* ALU64_REG */ + case BPF_ALU64 | BPF_AND | BPF_X: /* ALU64_REG */ + case BPF_ALU64 | BPF_MUL | BPF_X: /* ALU64_REG */ + case BPF_ALU64 | BPF_DIV | BPF_X: /* ALU64_REG */ + case BPF_ALU64 | BPF_MOD | BPF_X: /* ALU64_REG */ + case BPF_ALU64 | BPF_LSH | BPF_X: /* ALU64_REG */ + case BPF_ALU64 | BPF_RSH | BPF_X: /* ALU64_REG */ + case BPF_ALU64 | BPF_ARSH | BPF_X: /* ALU64_REG */ + src = ebpf_to_mips_reg(ctx, insn, src_reg); + dst = ebpf_to_mips_reg(ctx, insn, dst_reg); + if (src < 0 || dst < 0) + return -EINVAL; + if (get_reg_val_type(ctx, this_idx, insn->dst_reg) == REG_32BIT) + emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32); + did_move = false; + if (insn->src_reg == BPF_REG_10) { + if (BPF_OP(insn->code) == BPF_MOV) { + emit_instr(ctx, daddiu, dst, MIPS_R_SP, MAX_BPF_STACK); + did_move = true; + } else { + emit_instr(ctx, daddiu, MIPS_R_AT, MIPS_R_SP, MAX_BPF_STACK); + src = MIPS_R_AT; + } + } else if (get_reg_val_type(ctx, this_idx, insn->src_reg) == REG_32BIT) { + int tmp_reg = MIPS_R_AT; + + if (BPF_OP(insn->code) == BPF_MOV) { + tmp_reg = dst; + did_move = true; + } + emit_instr(ctx, daddu, tmp_reg, src, MIPS_R_ZERO); + emit_instr(ctx, dinsu, tmp_reg, MIPS_R_ZERO, 32, 32); + src = MIPS_R_AT; + } + switch (BPF_OP(insn->code)) { + case BPF_MOV: + if (!did_move) + emit_instr(ctx, daddu, dst, src, MIPS_R_ZERO); + break; + case BPF_ADD: + emit_instr(ctx, daddu, dst, dst, src); + break; + case BPF_SUB: + emit_instr(ctx, dsubu, dst, dst, src); + break; + case BPF_XOR: + emit_instr(ctx, xor, dst, dst, src); + break; + case BPF_OR: + emit_instr(ctx, or, dst, dst, src); + break; + case BPF_AND: + emit_instr(ctx, and, dst, dst, src); + break; + case BPF_MUL: + emit_instr(ctx, dmultu, dst, src); + emit_instr(ctx, mflo, dst); + break; + case BPF_DIV: + case BPF_MOD: + b_off = b_imm(exit_idx, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off); + emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src); + emit_instr(ctx, ddivu, dst, src); + if (BPF_OP(insn->code) == BPF_DIV) + emit_instr(ctx, mflo, dst); + else + emit_instr(ctx, mfhi, dst); + break; + case BPF_LSH: + emit_instr(ctx, dsllv, dst, dst, src); + break; + case BPF_RSH: + emit_instr(ctx, dsrlv, dst, dst, src); + break; + case BPF_ARSH: + emit_instr(ctx, dsrav, dst, dst, src); + break; + default: + pr_err("ALU64_REG NOT HANDLED\n"); + return -EINVAL; + } + break; + case BPF_ALU | BPF_MOV | BPF_X: /* ALU_REG */ + case BPF_ALU | BPF_ADD | BPF_X: /* ALU_REG */ + case BPF_ALU | BPF_SUB | BPF_X: /* ALU_REG */ + case BPF_ALU | BPF_XOR | BPF_X: /* ALU_REG */ + case BPF_ALU | BPF_OR | BPF_X: /* ALU_REG */ + case BPF_ALU | BPF_AND | BPF_X: /* ALU_REG */ + case BPF_ALU | BPF_MUL | BPF_X: /* ALU_REG */ + case BPF_ALU | BPF_DIV | BPF_X: /* ALU_REG */ + case BPF_ALU | BPF_MOD | BPF_X: /* ALU_REG */ + case BPF_ALU | BPF_LSH | BPF_X: /* ALU_REG */ + case BPF_ALU | BPF_RSH | BPF_X: /* ALU_REG */ + src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp); + dst = ebpf_to_mips_reg(ctx, insn, dst_reg); + if (src < 0 || dst < 0) + return -EINVAL; + td = get_reg_val_type(ctx, this_idx, insn->dst_reg); + if (td == REG_64BIT || td == REG_32BIT_ZERO_EX) { + /* sign extend */ + emit_instr(ctx, sll, dst, dst, 0); + } + did_move = false; + ts = get_reg_val_type(ctx, this_idx, insn->src_reg); + if (ts == REG_64BIT || ts == REG_32BIT_ZERO_EX) { + int tmp_reg = MIPS_R_AT; + + if (BPF_OP(insn->code) == BPF_MOV) { + tmp_reg = dst; + did_move = true; + } + /* sign extend */ + emit_instr(ctx, sll, tmp_reg, src, 0); + src = MIPS_R_AT; + } + switch (BPF_OP(insn->code)) { + case BPF_MOV: + if (!did_move) + emit_instr(ctx, addu, dst, src, MIPS_R_ZERO); + break; + case BPF_ADD: + emit_instr(ctx, addu, dst, dst, src); + break; + case BPF_SUB: + emit_instr(ctx, subu, dst, dst, src); + break; + case BPF_XOR: + emit_instr(ctx, xor, dst, dst, src); + break; + case BPF_OR: + emit_instr(ctx, or, dst, dst, src); + break; + case BPF_AND: + emit_instr(ctx, and, dst, dst, src); + break; + case BPF_MUL: + emit_instr(ctx, mul, dst, dst, src); + break; + case BPF_DIV: + case BPF_MOD: + b_off = b_imm(exit_idx, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_instr(ctx, beq, src, MIPS_R_ZERO, b_off); + emit_instr(ctx, movz, MIPS_R_V0, MIPS_R_ZERO, src); + emit_instr(ctx, divu, dst, src); + if (BPF_OP(insn->code) == BPF_DIV) + emit_instr(ctx, mflo, dst); + else + emit_instr(ctx, mfhi, dst); + break; + case BPF_LSH: + emit_instr(ctx, sllv, dst, dst, src); + break; + case BPF_RSH: + emit_instr(ctx, srlv, dst, dst, src); + break; + default: + pr_err("ALU_REG NOT HANDLED\n"); + return -EINVAL; + } + break; + case BPF_JMP | BPF_EXIT: + if (this_idx + 1 < exit_idx) { + b_off = b_imm(exit_idx, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_instr(ctx, beq, MIPS_R_ZERO, MIPS_R_ZERO, b_off); + emit_instr(ctx, nop); + } + break; + case BPF_JMP | BPF_JEQ | BPF_K: /* JMP_IMM */ + case BPF_JMP | BPF_JNE | BPF_K: /* JMP_IMM */ + cmp_eq = (BPF_OP(insn->code) == BPF_JEQ); + dst = ebpf_to_mips_reg(ctx, insn, dst_reg_fp_ok); + if (dst < 0) + return dst; + if (insn->imm == 0) { + src = MIPS_R_ZERO; + } else { + gen_imm_to_reg(insn, MIPS_R_AT, ctx); + src = MIPS_R_AT; + } + goto jeq_common; + case BPF_JMP | BPF_JEQ | BPF_X: /* JMP_REG */ + case BPF_JMP | BPF_JNE | BPF_X: + case BPF_JMP | BPF_JSGT | BPF_X: + case BPF_JMP | BPF_JSGE | BPF_X: + case BPF_JMP | BPF_JGT | BPF_X: + case BPF_JMP | BPF_JGE | BPF_X: + case BPF_JMP | BPF_JSET | BPF_X: + src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp); + dst = ebpf_to_mips_reg(ctx, insn, dst_reg); + if (src < 0 || dst < 0) + return -EINVAL; + td = get_reg_val_type(ctx, this_idx, insn->dst_reg); + ts = get_reg_val_type(ctx, this_idx, insn->src_reg); + if (td == REG_32BIT && ts != REG_32BIT) { + emit_instr(ctx, sll, MIPS_R_AT, src, 0); + src = MIPS_R_AT; + } else if (ts == REG_32BIT && td != REG_32BIT) { + emit_instr(ctx, sll, MIPS_R_AT, dst, 0); + dst = MIPS_R_AT; + } + if (BPF_OP(insn->code) == BPF_JSET) { + emit_instr(ctx, and, MIPS_R_AT, dst, src); + cmp_eq = false; + dst = MIPS_R_AT; + src = MIPS_R_ZERO; + } else if (BPF_OP(insn->code) == BPF_JSGT) { + emit_instr(ctx, dsubu, MIPS_R_AT, dst, src); + if ((insn + 1)->code == (BPF_JMP | BPF_EXIT) && insn->off == 1) { + b_off = b_imm(exit_idx, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_instr(ctx, blez, MIPS_R_AT, b_off); + emit_instr(ctx, nop); + return 2; /* We consumed the exit. */ + } + b_off = b_imm(this_idx + insn->off + 1, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_instr(ctx, bgtz, MIPS_R_AT, b_off); + emit_instr(ctx, nop); + break; + } else if (BPF_OP(insn->code) == BPF_JSGE) { + emit_instr(ctx, slt, MIPS_R_AT, dst, src); + cmp_eq = true; + dst = MIPS_R_AT; + src = MIPS_R_ZERO; + } else if (BPF_OP(insn->code) == BPF_JGT) { + /* dst or src could be AT */ + emit_instr(ctx, dsubu, MIPS_R_T8, dst, src); + emit_instr(ctx, sltu, MIPS_R_AT, dst, src); + /* SP known to be non-zero, movz becomes boolean not */ + emit_instr(ctx, movz, MIPS_R_T9, MIPS_R_SP, MIPS_R_T8); + emit_instr(ctx, movn, MIPS_R_T9, MIPS_R_ZERO, MIPS_R_T8); + emit_instr(ctx, or, MIPS_R_AT, MIPS_R_T9, MIPS_R_AT); + cmp_eq = true; + dst = MIPS_R_AT; + src = MIPS_R_ZERO; + } else if (BPF_OP(insn->code) == BPF_JGE) { + emit_instr(ctx, sltu, MIPS_R_AT, dst, src); + cmp_eq = true; + dst = MIPS_R_AT; + src = MIPS_R_ZERO; + } else { /* JNE/JEQ case */ + cmp_eq = (BPF_OP(insn->code) == BPF_JEQ); + } +jeq_common: + /* + * If the next insn is EXIT and we are jumping arround + * only it, invert the sense of the compare and + * conditionally jump to the exit. Poor man's branch + * chaining. + */ + if ((insn + 1)->code == (BPF_JMP | BPF_EXIT) && insn->off == 1) { + b_off = b_imm(exit_idx, ctx); + if (is_bad_offset(b_off)) { + target = j_target(ctx, exit_idx); + if (target == (unsigned int)-1) + return -E2BIG; + cmp_eq = !cmp_eq; + b_off = 4 * 3; + if (!(ctx->offsets[this_idx] & OFFSETS_B_CONV)) { + ctx->offsets[this_idx] |= OFFSETS_B_CONV; + ctx->long_b_conversion = 1; + } + } + + if (cmp_eq) + emit_instr(ctx, bne, dst, src, b_off); + else + emit_instr(ctx, beq, dst, src, b_off); + emit_instr(ctx, nop); + if (ctx->offsets[this_idx] & OFFSETS_B_CONV) { + emit_instr(ctx, j, target); + emit_instr(ctx, nop); + } + return 2; /* We consumed the exit. */ + } + b_off = b_imm(this_idx + insn->off + 1, ctx); + if (is_bad_offset(b_off)) { + target = j_target(ctx, this_idx + insn->off + 1); + if (target == (unsigned int)-1) + return -E2BIG; + cmp_eq = !cmp_eq; + b_off = 4 * 3; + if (!(ctx->offsets[this_idx] & OFFSETS_B_CONV)) { + ctx->offsets[this_idx] |= OFFSETS_B_CONV; + ctx->long_b_conversion = 1; + } + } + + if (cmp_eq) + emit_instr(ctx, beq, dst, src, b_off); + else + emit_instr(ctx, bne, dst, src, b_off); + emit_instr(ctx, nop); + if (ctx->offsets[this_idx] & OFFSETS_B_CONV) { + emit_instr(ctx, j, target); + emit_instr(ctx, nop); + } + break; + case BPF_JMP | BPF_JSGT | BPF_K: /* JMP_IMM */ + case BPF_JMP | BPF_JSGE | BPF_K: /* JMP_IMM */ + cmp_eq = (BPF_OP(insn->code) == BPF_JSGE); + dst = ebpf_to_mips_reg(ctx, insn, dst_reg_fp_ok); + if (dst < 0) + return dst; + + if (insn->imm == 0) { + if ((insn + 1)->code == (BPF_JMP | BPF_EXIT) && insn->off == 1) { + b_off = b_imm(exit_idx, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + if (cmp_eq) + emit_instr(ctx, bltz, dst, b_off); + else + emit_instr(ctx, blez, dst, b_off); + emit_instr(ctx, nop); + return 2; /* We consumed the exit. */ + } + b_off = b_imm(this_idx + insn->off + 1, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + if (cmp_eq) + emit_instr(ctx, bgez, dst, b_off); + else + emit_instr(ctx, bgtz, dst, b_off); + emit_instr(ctx, nop); + break; + } + /* + * only "LT" compare available, so we must use imm + 1 + * to generate "GT" + */ + t64s = insn->imm + (cmp_eq ? 0 : 1); + if (t64s >= S16_MIN && t64s <= S16_MAX) { + emit_instr(ctx, slti, MIPS_R_AT, dst, (int)t64s); + src = MIPS_R_AT; + dst = MIPS_R_ZERO; + cmp_eq = true; + goto jeq_common; + } + emit_const_to_reg(ctx, MIPS_R_AT, (u64)t64s); + emit_instr(ctx, slt, MIPS_R_AT, dst, MIPS_R_AT); + src = MIPS_R_AT; + dst = MIPS_R_ZERO; + cmp_eq = true; + goto jeq_common; + + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: + cmp_eq = (BPF_OP(insn->code) == BPF_JGE); + dst = ebpf_to_mips_reg(ctx, insn, dst_reg_fp_ok); + if (dst < 0) + return dst; + /* + * only "LT" compare available, so we must use imm + 1 + * to generate "GT" + */ + t64s = (u64)(u32)(insn->imm) + (cmp_eq ? 0 : 1); + if (t64s >= 0 && t64s <= S16_MAX) { + emit_instr(ctx, sltiu, MIPS_R_AT, dst, (int)t64s); + src = MIPS_R_AT; + dst = MIPS_R_ZERO; + cmp_eq = true; + goto jeq_common; + } + emit_const_to_reg(ctx, MIPS_R_AT, (u64)t64s); + emit_instr(ctx, sltu, MIPS_R_AT, dst, MIPS_R_AT); + src = MIPS_R_AT; + dst = MIPS_R_ZERO; + cmp_eq = true; + goto jeq_common; + + case BPF_JMP | BPF_JSET | BPF_K: /* JMP_IMM */ + dst = ebpf_to_mips_reg(ctx, insn, dst_reg_fp_ok); + if (dst < 0) + return dst; + + if (use_bbit_insns() && hweight32((u32)insn->imm) == 1) { + if ((insn + 1)->code == (BPF_JMP | BPF_EXIT) && insn->off == 1) { + b_off = b_imm(exit_idx, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_instr(ctx, bbit0, dst, ffs((u32)insn->imm) - 1, b_off); + emit_instr(ctx, nop); + return 2; /* We consumed the exit. */ + } + b_off = b_imm(this_idx + insn->off + 1, ctx); + if (is_bad_offset(b_off)) + return -E2BIG; + emit_instr(ctx, bbit1, dst, ffs((u32)insn->imm) - 1, b_off); + emit_instr(ctx, nop); + break; + } + t64 = (u32)insn->imm; + emit_const_to_reg(ctx, MIPS_R_AT, t64); + emit_instr(ctx, and, MIPS_R_AT, dst, MIPS_R_AT); + src = MIPS_R_AT; + dst = MIPS_R_ZERO; + cmp_eq = false; + goto jeq_common; + + case BPF_JMP | BPF_JA: + /* + * Prefer relative branch for easier debugging, but + * fall back if needed. + */ + b_off = b_imm(this_idx + insn->off + 1, ctx); + if (is_bad_offset(b_off)) { + target = j_target(ctx, this_idx + insn->off + 1); + if (target == (unsigned int)-1) + return -E2BIG; + emit_instr(ctx, j, target); + } else { + emit_instr(ctx, b, b_off); + } + emit_instr(ctx, nop); + break; + case BPF_LD | BPF_DW | BPF_IMM: + if (insn->src_reg != 0) + return -EINVAL; + dst = ebpf_to_mips_reg(ctx, insn, dst_reg); + if (dst < 0) + return dst; + t64 = ((u64)(u32)insn->imm) | ((u64)(insn + 1)->imm << 32); + emit_const_to_reg(ctx, dst, t64); + return 2; /* Double slot insn */ + + case BPF_JMP | BPF_CALL: + ctx->flags |= EBPF_SAVE_RA; + t64s = (s64)insn->imm + (s64)__bpf_call_base; + emit_const_to_reg(ctx, MIPS_R_T9, (u64)t64s); + emit_instr(ctx, jalr, MIPS_R_RA, MIPS_R_T9); + /* delay slot */ + emit_instr(ctx, nop); + break; + + case BPF_JMP | BPF_TAIL_CALL: + if (emit_bpf_tail_call(ctx, this_idx)) + return -EINVAL; + break; + + case BPF_LD | BPF_B | BPF_ABS: + case BPF_LD | BPF_H | BPF_ABS: + case BPF_LD | BPF_W | BPF_ABS: + case BPF_LD | BPF_DW | BPF_ABS: + ctx->flags |= EBPF_SAVE_RA; + + gen_imm_to_reg(insn, MIPS_R_A1, ctx); + emit_instr(ctx, addiu, MIPS_R_A2, MIPS_R_ZERO, size_to_len(insn)); + + if (insn->imm < 0) { + emit_const_to_reg(ctx, MIPS_R_T9, (u64)bpf_internal_load_pointer_neg_helper); + } else { + emit_const_to_reg(ctx, MIPS_R_T9, (u64)ool_skb_header_pointer); + emit_instr(ctx, daddiu, MIPS_R_A3, MIPS_R_SP, ctx->tmp_offset); + } + goto ld_skb_common; + + case BPF_LD | BPF_B | BPF_IND: + case BPF_LD | BPF_H | BPF_IND: + case BPF_LD | BPF_W | BPF_IND: + case BPF_LD | BPF_DW | BPF_IND: + ctx->flags |= EBPF_SAVE_RA; + src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp); + if (src < 0) + return src; + ts = get_reg_val_type(ctx, this_idx, insn->src_reg); + if (ts == REG_32BIT_ZERO_EX) { + /* sign extend */ + emit_instr(ctx, sll, MIPS_R_A1, src, 0); + src = MIPS_R_A1; + } + if (insn->imm >= S16_MIN && insn->imm <= S16_MAX) { + emit_instr(ctx, daddiu, MIPS_R_A1, src, insn->imm); + } else { + gen_imm_to_reg(insn, MIPS_R_AT, ctx); + emit_instr(ctx, daddu, MIPS_R_A1, MIPS_R_AT, src); + } + /* truncate to 32-bit int */ + emit_instr(ctx, sll, MIPS_R_A1, MIPS_R_A1, 0); + emit_instr(ctx, daddiu, MIPS_R_A3, MIPS_R_SP, ctx->tmp_offset); + emit_instr(ctx, slt, MIPS_R_AT, MIPS_R_A1, MIPS_R_ZERO); + + emit_const_to_reg(ctx, MIPS_R_T8, (u64)bpf_internal_load_pointer_neg_helper); + emit_const_to_reg(ctx, MIPS_R_T9, (u64)ool_skb_header_pointer); + emit_instr(ctx, addiu, MIPS_R_A2, MIPS_R_ZERO, size_to_len(insn)); + emit_instr(ctx, movn, MIPS_R_T9, MIPS_R_T8, MIPS_R_AT); + +ld_skb_common: + emit_instr(ctx, jalr, MIPS_R_RA, MIPS_R_T9); + /* delay slot move */ + emit_instr(ctx, daddu, MIPS_R_A0, MIPS_R_S0, MIPS_R_ZERO); + + /* Check the error value */ + b_off = b_imm(exit_idx, ctx); + if (is_bad_offset(b_off)) { + target = j_target(ctx, exit_idx); + if (target == (unsigned int)-1) + return -E2BIG; + + if (!(ctx->offsets[this_idx] & OFFSETS_B_CONV)) { + ctx->offsets[this_idx] |= OFFSETS_B_CONV; + ctx->long_b_conversion = 1; + } + emit_instr(ctx, bne, MIPS_R_V0, MIPS_R_ZERO, 4 * 3); + emit_instr(ctx, nop); + emit_instr(ctx, j, target); + emit_instr(ctx, nop); + } else { + emit_instr(ctx, beq, MIPS_R_V0, MIPS_R_ZERO, b_off); + emit_instr(ctx, nop); + } + +#ifdef __BIG_ENDIAN + need_swap = false; +#else + need_swap = true; +#endif + dst = MIPS_R_V0; + switch (BPF_SIZE(insn->code)) { + case BPF_B: + emit_instr(ctx, lbu, dst, 0, MIPS_R_V0); + break; + case BPF_H: + emit_instr(ctx, lhu, dst, 0, MIPS_R_V0); + if (need_swap) + emit_instr(ctx, wsbh, dst, dst); + break; + case BPF_W: + emit_instr(ctx, lw, dst, 0, MIPS_R_V0); + if (need_swap) { + emit_instr(ctx, wsbh, dst, dst); + emit_instr(ctx, rotr, dst, dst, 16); + } + break; + case BPF_DW: + emit_instr(ctx, ld, dst, 0, MIPS_R_V0); + if (need_swap) { + emit_instr(ctx, dsbh, dst, dst); + emit_instr(ctx, dshd, dst, dst); + } + break; + } + + break; + case BPF_ALU | BPF_END | BPF_FROM_BE: + case BPF_ALU | BPF_END | BPF_FROM_LE: + dst = ebpf_to_mips_reg(ctx, insn, dst_reg); + if (dst < 0) + return dst; + td = get_reg_val_type(ctx, this_idx, insn->dst_reg); + if (insn->imm == 64 && td == REG_32BIT) + emit_instr(ctx, dinsu, dst, MIPS_R_ZERO, 32, 32); + + if (insn->imm != 64 && + (td == REG_64BIT || td == REG_32BIT_ZERO_EX)) { + /* sign extend */ + emit_instr(ctx, sll, dst, dst, 0); + } + +#ifdef __BIG_ENDIAN + need_swap = (BPF_SRC(insn->code) == BPF_FROM_LE); +#else + need_swap = (BPF_SRC(insn->code) == BPF_FROM_BE); +#endif + if (insn->imm == 16) { + if (need_swap) + emit_instr(ctx, wsbh, dst, dst); + emit_instr(ctx, andi, dst, dst, 0xffff); + } else if (insn->imm == 32) { + if (need_swap) { + emit_instr(ctx, wsbh, dst, dst); + emit_instr(ctx, rotr, dst, dst, 16); + } + } else { /* 64-bit*/ + if (need_swap) { + emit_instr(ctx, dsbh, dst, dst); + emit_instr(ctx, dshd, dst, dst); + } + } + break; + + case BPF_ST | BPF_B | BPF_MEM: + case BPF_ST | BPF_H | BPF_MEM: + case BPF_ST | BPF_W | BPF_MEM: + case BPF_ST | BPF_DW | BPF_MEM: + if (insn->dst_reg == BPF_REG_10) { + ctx->flags |= EBPF_SEEN_FP; + dst = MIPS_R_SP; + mem_off = insn->off + MAX_BPF_STACK; + } else { + dst = ebpf_to_mips_reg(ctx, insn, dst_reg); + if (dst < 0) + return dst; + mem_off = insn->off; + } + gen_imm_to_reg(insn, MIPS_R_AT, ctx); + switch (BPF_SIZE(insn->code)) { + case BPF_B: + emit_instr(ctx, sb, MIPS_R_AT, mem_off, dst); + break; + case BPF_H: + emit_instr(ctx, sh, MIPS_R_AT, mem_off, dst); + break; + case BPF_W: + emit_instr(ctx, sw, MIPS_R_AT, mem_off, dst); + break; + case BPF_DW: + emit_instr(ctx, sd, MIPS_R_AT, mem_off, dst); + break; + } + break; + + case BPF_LDX | BPF_B | BPF_MEM: + case BPF_LDX | BPF_H | BPF_MEM: + case BPF_LDX | BPF_W | BPF_MEM: + case BPF_LDX | BPF_DW | BPF_MEM: + if (insn->src_reg == BPF_REG_10) { + ctx->flags |= EBPF_SEEN_FP; + src = MIPS_R_SP; + mem_off = insn->off + MAX_BPF_STACK; + } else { + src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp); + if (src < 0) + return src; + mem_off = insn->off; + } + dst = ebpf_to_mips_reg(ctx, insn, dst_reg); + if (dst < 0) + return dst; + switch (BPF_SIZE(insn->code)) { + case BPF_B: + emit_instr(ctx, lbu, dst, mem_off, src); + break; + case BPF_H: + emit_instr(ctx, lhu, dst, mem_off, src); + break; + case BPF_W: + emit_instr(ctx, lw, dst, mem_off, src); + break; + case BPF_DW: + emit_instr(ctx, ld, dst, mem_off, src); + break; + } + break; + + case BPF_STX | BPF_B | BPF_MEM: + case BPF_STX | BPF_H | BPF_MEM: + case BPF_STX | BPF_W | BPF_MEM: + case BPF_STX | BPF_DW | BPF_MEM: + case BPF_STX | BPF_W | BPF_XADD: + case BPF_STX | BPF_DW | BPF_XADD: + if (insn->dst_reg == BPF_REG_10) { + ctx->flags |= EBPF_SEEN_FP; + dst = MIPS_R_SP; + mem_off = insn->off + MAX_BPF_STACK; + } else { + dst = ebpf_to_mips_reg(ctx, insn, dst_reg); + if (dst < 0) + return dst; + mem_off = insn->off; + } + src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp); + if (src < 0) + return dst; + if (BPF_MODE(insn->code) == BPF_XADD) { + switch (BPF_SIZE(insn->code)) { + case BPF_W: + if (get_reg_val_type(ctx, this_idx, insn->src_reg) == REG_32BIT) { + emit_instr(ctx, sll, MIPS_R_AT, src, 0); + src = MIPS_R_AT; + } + emit_instr(ctx, ll, MIPS_R_T8, mem_off, dst); + emit_instr(ctx, addu, MIPS_R_T8, MIPS_R_T8, src); + emit_instr(ctx, sc, MIPS_R_T8, mem_off, dst); + /* + * On failure back up to LL (-4 + * instructions of 4 bytes each + */ + emit_instr(ctx, beq, MIPS_R_T8, MIPS_R_ZERO, -4 * 4); + emit_instr(ctx, nop); + break; + case BPF_DW: + if (get_reg_val_type(ctx, this_idx, insn->src_reg) == REG_32BIT) { + emit_instr(ctx, daddu, MIPS_R_AT, src, MIPS_R_ZERO); + emit_instr(ctx, dinsu, MIPS_R_AT, MIPS_R_ZERO, 32, 32); + src = MIPS_R_AT; + } + emit_instr(ctx, lld, MIPS_R_T8, mem_off, dst); + emit_instr(ctx, daddu, MIPS_R_T8, MIPS_R_T8, src); + emit_instr(ctx, scd, MIPS_R_T8, mem_off, dst); + emit_instr(ctx, beq, MIPS_R_T8, MIPS_R_ZERO, -4 * 4); + emit_instr(ctx, nop); + break; + } + } else { /* BPF_MEM */ + switch (BPF_SIZE(insn->code)) { + case BPF_B: + emit_instr(ctx, sb, src, mem_off, dst); + break; + case BPF_H: + emit_instr(ctx, sh, src, mem_off, dst); + break; + case BPF_W: + emit_instr(ctx, sw, src, mem_off, dst); + break; + case BPF_DW: + if (get_reg_val_type(ctx, this_idx, insn->src_reg) == REG_32BIT) { + emit_instr(ctx, daddu, MIPS_R_AT, src, MIPS_R_ZERO); + emit_instr(ctx, dinsu, MIPS_R_AT, MIPS_R_ZERO, 32, 32); + src = MIPS_R_AT; + } + emit_instr(ctx, sd, src, mem_off, dst); + break; + } + } + break; + + default: + pr_err("NOT HANDLED %d - (%02x)\n", + this_idx, (unsigned int)insn->code); + return -EINVAL; + } + return 1; +} + +#define RVT_VISITED_MASK 0xc000000000000000ull +#define RVT_FALL_THROUGH 0x4000000000000000ull +#define RVT_BRANCH_TAKEN 0x8000000000000000ull +#define RVT_DONE (RVT_FALL_THROUGH | RVT_BRANCH_TAKEN) + +static int build_int_body(struct jit_ctx *ctx) +{ + const struct bpf_prog *prog = ctx->skf; + const struct bpf_insn *insn; + int i, r; + + for (i = 0; i < prog->len; ) { + insn = prog->insnsi + i; + if ((ctx->reg_val_types[i] & RVT_VISITED_MASK) == 0) { + /* dead instruction, don't emit it. */ + i++; + continue; + } + + if (ctx->target == NULL) + ctx->offsets[i] = (ctx->offsets[i] & OFFSETS_B_CONV) | (ctx->idx * 4); + + r = build_one_insn(insn, ctx, i, prog->len); + if (r < 0) + return r; + i += r; + } + /* epilogue offset */ + if (ctx->target == NULL) + ctx->offsets[i] = ctx->idx * 4; + + /* + * All exits have an offset of the epilogue, some offsets may + * not have been set due to banch-around threading, so set + * them now. + */ + if (ctx->target == NULL) + for (i = 0; i < prog->len; i++) { + insn = prog->insnsi + i; + if (insn->code == (BPF_JMP | BPF_EXIT)) + ctx->offsets[i] = ctx->idx * 4; + } + return 0; +} + +/* return the last idx processed, or negative for error */ +static int reg_val_propagate_range(struct jit_ctx *ctx, u64 initial_rvt, + int start_idx, bool follow_taken) +{ + const struct bpf_prog *prog = ctx->skf; + const struct bpf_insn *insn; + u64 exit_rvt = initial_rvt; + u64 *rvt = ctx->reg_val_types; + int idx; + int reg; + + for (idx = start_idx; idx < prog->len; idx++) { + rvt[idx] = (rvt[idx] & RVT_VISITED_MASK) | exit_rvt; + insn = prog->insnsi + idx; + switch (BPF_CLASS(insn->code)) { + case BPF_ALU: + switch (BPF_OP(insn->code)) { + case BPF_ADD: + case BPF_SUB: + case BPF_MUL: + case BPF_DIV: + case BPF_OR: + case BPF_AND: + case BPF_LSH: + case BPF_RSH: + case BPF_NEG: + case BPF_MOD: + case BPF_XOR: + set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT); + break; + case BPF_MOV: + if (BPF_SRC(insn->code)) { + set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT); + } else { + /* IMM to REG move*/ + if (insn->imm >= 0) + set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT_POS); + else + set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT); + } + break; + case BPF_END: + if (insn->imm == 64) + set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT); + else if (insn->imm == 32) + set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT); + else /* insn->imm == 16 */ + set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT_POS); + break; + } + rvt[idx] |= RVT_DONE; + break; + case BPF_ALU64: + switch (BPF_OP(insn->code)) { + case BPF_MOV: + if (BPF_SRC(insn->code)) { + /* REG to REG move*/ + set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT); + } else { + /* IMM to REG move*/ + if (insn->imm >= 0) + set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT_POS); + else + set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT_32BIT); + } + break; + default: + set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT); + } + rvt[idx] |= RVT_DONE; + break; + case BPF_LD: + switch (BPF_SIZE(insn->code)) { + case BPF_DW: + if (BPF_MODE(insn->code) == BPF_IMM) { + s64 val; + + val = (s64)((u32)insn->imm | ((u64)(insn + 1)->imm << 32)); + if (val > 0 && val <= S32_MAX) + set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT_POS); + else if (val >= S32_MIN && val <= S32_MAX) + set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT_32BIT); + else + set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT); + rvt[idx] |= RVT_DONE; + idx++; + } else { + set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT); + } + break; + case BPF_B: + case BPF_H: + set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT_POS); + break; + case BPF_W: + if (BPF_MODE(insn->code) == BPF_IMM) + set_reg_val_type(&exit_rvt, insn->dst_reg, + insn->imm >= 0 ? REG_32BIT_POS : REG_32BIT); + else + set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT); + break; + } + rvt[idx] |= RVT_DONE; + break; + case BPF_LDX: + switch (BPF_SIZE(insn->code)) { + case BPF_DW: + set_reg_val_type(&exit_rvt, insn->dst_reg, REG_64BIT); + break; + case BPF_B: + case BPF_H: + set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT_POS); + break; + case BPF_W: + set_reg_val_type(&exit_rvt, insn->dst_reg, REG_32BIT); + break; + } + rvt[idx] |= RVT_DONE; + break; + case BPF_JMP: + switch (BPF_OP(insn->code)) { + case BPF_EXIT: + rvt[idx] = RVT_DONE | exit_rvt; + rvt[prog->len] = exit_rvt; + return idx; + case BPF_JA: + rvt[idx] |= RVT_DONE; + idx += insn->off; + break; + case BPF_JEQ: + case BPF_JGT: + case BPF_JGE: + case BPF_JSET: + case BPF_JNE: + case BPF_JSGT: + case BPF_JSGE: + if (follow_taken) { + rvt[idx] |= RVT_BRANCH_TAKEN; + idx += insn->off; + follow_taken = false; + } else { + rvt[idx] |= RVT_FALL_THROUGH; + } + break; + case BPF_CALL: + set_reg_val_type(&exit_rvt, BPF_REG_0, REG_64BIT); + /* Upon call return, argument registers are clobbered. */ + for (reg = BPF_REG_0; reg <= BPF_REG_5; reg++) + set_reg_val_type(&exit_rvt, reg, REG_64BIT); + + rvt[idx] |= RVT_DONE; + break; + default: + WARN(1, "Unhandled BPF_JMP case.\n"); + rvt[idx] |= RVT_DONE; + break; + } + break; + default: + rvt[idx] |= RVT_DONE; + break; + } + } + return idx; +} + +/* + * Track the value range (i.e. 32-bit vs. 64-bit) of each register at + * each eBPF insn. This allows unneeded sign and zero extension + * operations to be omitted. + * + * Doesn't handle yet confluence of control paths with conflicting + * ranges, but it is good enough for most sane code. + */ +static int reg_val_propagate(struct jit_ctx *ctx) +{ + const struct bpf_prog *prog = ctx->skf; + u64 exit_rvt; + int reg; + int i; + + /* + * 11 registers * 3 bits/reg leaves top bits free for other + * uses. Bit-62..63 used to see if we have visited an insn. + */ + exit_rvt = 0; + + /* Upon entry, argument registers are 64-bit. */ + for (reg = BPF_REG_1; reg <= BPF_REG_5; reg++) + set_reg_val_type(&exit_rvt, reg, REG_64BIT); + + /* + * First follow all conditional branches on the fall-through + * edge of control flow.. + */ + reg_val_propagate_range(ctx, exit_rvt, 0, false); +restart_search: + /* + * Then repeatedly find the first conditional branch where + * both edges of control flow have not been taken, and follow + * the branch taken edge. We will end up restarting the + * search once per conditional branch insn. + */ + for (i = 0; i < prog->len; i++) { + u64 rvt = ctx->reg_val_types[i]; + + if ((rvt & RVT_VISITED_MASK) == RVT_DONE || + (rvt & RVT_VISITED_MASK) == 0) + continue; + if ((rvt & RVT_VISITED_MASK) == RVT_FALL_THROUGH) { + reg_val_propagate_range(ctx, rvt & ~RVT_VISITED_MASK, i, true); + } else { /* RVT_BRANCH_TAKEN */ + WARN(1, "Unexpected RVT_BRANCH_TAKEN case.\n"); + reg_val_propagate_range(ctx, rvt & ~RVT_VISITED_MASK, i, false); + } + goto restart_search; + } + /* + * Eventually all conditional branches have been followed on + * both branches and we are done. Any insn that has not been + * visited at this point is dead. + */ + + return 0; +} + +static void jit_fill_hole(void *area, unsigned int size) +{ + u32 *p; + + /* We are guaranteed to have aligned memory. */ + for (p = area; size >= sizeof(u32); size -= sizeof(u32)) + uasm_i_break(&p, BRK_BUG); /* Increments p */ +} + +struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) +{ + struct bpf_prog *orig_prog = prog; + bool tmp_blinded = false; + struct bpf_prog *tmp; + struct bpf_binary_header *header = NULL; + struct jit_ctx ctx; + unsigned int image_size; + u8 *image_ptr; + + if (!bpf_jit_enable || !cpu_has_mips64r2) + return prog; + + tmp = bpf_jit_blind_constants(prog); + /* If blinding was requested and we failed during blinding, + * we must fall back to the interpreter. + */ + if (IS_ERR(tmp)) + return orig_prog; + if (tmp != prog) { + tmp_blinded = true; + prog = tmp; + } + + memset(&ctx, 0, sizeof(ctx)); + + ctx.offsets = kcalloc(prog->len + 1, sizeof(*ctx.offsets), GFP_KERNEL); + if (ctx.offsets == NULL) + goto out_err; + + ctx.reg_val_types = kcalloc(prog->len + 1, sizeof(*ctx.reg_val_types), GFP_KERNEL); + if (ctx.reg_val_types == NULL) + goto out_err; + + ctx.skf = prog; + + if (reg_val_propagate(&ctx)) + goto out_err; + + /* + * First pass discovers used resources and instruction offsets + * assuming short branches are used. + */ + if (build_int_body(&ctx)) + goto out_err; + + /* + * If no calls are made (EBPF_SAVE_RA), then tail call count + * in $v1, else we must save in n$s4. + */ + if (ctx.flags & EBPF_SEEN_TC) { + if (ctx.flags & EBPF_SAVE_RA) + ctx.flags |= EBPF_SAVE_S4; + else + ctx.flags |= EBPF_TCC_IN_V1; + } + + /* + * Second pass generates offsets, if any branches are out of + * range a jump-around long sequence is generated, and we have + * to try again from the beginning to generate the new + * offsets. This is done until no additional conversions are + * necessary. + */ + do { + ctx.idx = 0; + ctx.gen_b_offsets = 1; + ctx.long_b_conversion = 0; + if (gen_int_prologue(&ctx)) + goto out_err; + if (build_int_body(&ctx)) + goto out_err; + if (build_int_epilogue(&ctx, MIPS_R_RA)) + goto out_err; + } while (ctx.long_b_conversion); + + image_size = 4 * ctx.idx; + + header = bpf_jit_binary_alloc(image_size, &image_ptr, + sizeof(u32), jit_fill_hole); + if (header == NULL) + goto out_err; + + ctx.target = (u32 *)image_ptr; + + /* Third pass generates the code */ + ctx.idx = 0; + if (gen_int_prologue(&ctx)) + goto out_err; + if (build_int_body(&ctx)) + goto out_err; + if (build_int_epilogue(&ctx, MIPS_R_RA)) + goto out_err; + + /* Update the icache */ + flush_icache_range((unsigned long)ctx.target, + (unsigned long)(ctx.target + ctx.idx * sizeof(u32))); + + if (bpf_jit_enable > 1) + /* Dump JIT code */ + bpf_jit_dump(prog->len, image_size, 2, ctx.target); + + bpf_jit_binary_lock_ro(header); + prog->bpf_func = (void *)ctx.target; + prog->jited = 1; + prog->jited_len = image_size; +out_normal: + if (tmp_blinded) + bpf_jit_prog_release_other(prog, prog == orig_prog ? + tmp : orig_prog); + kfree(ctx.offsets); + kfree(ctx.reg_val_types); + + return prog; + +out_err: + prog = orig_prog; + if (header) + bpf_jit_binary_free(header); + goto out_normal; +} From 5fff41e1f89d93feef9833c49a415dc337af5a99 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 1 Aug 2017 09:41:34 +0300 Subject: [PATCH 077/109] IB/core: Fix race condition in resolving IP to MAC Currently while resolving IP address to MAC address single delayed work is used for resolving multiple such resolve requests. This singled work is essentially performs two tasks. (a) any retry needed to resolve and (b) it executes the callback function for all completed requests While work is executing callbacks, any new work scheduled on for this workqueue is lost because workqueue has completed looking at all pending requests and now looking at callbacks, but work is still under execution. Any further retry to look at pending requests in process_req() after executing callbacks would lead to similar race condition (may be reduce the probably further but doesn't eliminate it). Retrying to enqueue work that from queue_req() context is not something rest of the kernel modules have followed. Therefore fix in this patch utilizes kernel facility to enqueue multiple work items to a workqueue. This ensures that no such requests gets lost in synchronization. Request list is still maintained so that rdma_cancel_addr() can unlink the request and get the completion with error sooner. Neighbour update event handling continues to be handled in same way as before. Additionally process_req() work entry cancels any pending work for a request that gets completed while processing those requests. Originally ib_addr was ST workqueue, but it became MT work queue with patch of [1]. This patch again makes it similar to ST so that neighbour update events handler work item doesn't race with other work items. In one such below trace, (though on 4.5 based kernel) it can be seen that process_req() never executed the callback, which is likely for an event that was schedule by queue_req() when previous callback was getting executed by workqueue. [] schedule+0x3e/0x90 [] schedule_timeout+0x1b5/0x210 [] ? ip_route_output_flow+0x27/0x70 [] ? addr_resolve+0x149/0x1b0 [ib_addr] [] wait_for_completion+0x10f/0x170 [] ? try_to_wake_up+0x210/0x210 [] ? rdma_copy_addr+0xa0/0xa0 [ib_addr] [] rdma_addr_find_l2_eth_by_grh+0x1d0/0x278 [ib_addr] [] ? sub_alloc+0x77/0x1c0 [] ib_init_ah_from_wc+0x3a7/0x5a0 [ib_core] [] cm_req_handler+0xea/0x580 [ib_cm] [] ? __switch_to+0x212/0x5e0 [] cm_work_handler+0x6d/0x150 [ib_cm] [] process_one_work+0x151/0x4b0 [] worker_thread+0x120/0x480 [] ? __schedule+0x30b/0x890 [] ? process_one_work+0x4b0/0x4b0 [] ? process_one_work+0x4b0/0x4b0 [] kthread+0xce/0xf0 [] ? kthread_freezable_should_stop+0x70/0x70 [] ret_from_fork+0x42/0x70 [] ? kthread_freezable_should_stop+0x70/0x70 INFO: task kworker/u144:1:156520 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. kworker/u144:1 D ffff883ffe1d7600 0 156520 2 0x00000080 Workqueue: ib_addr process_req [ib_addr] ffff883f446fbbd8 0000000000000046 ffff881f95280000 ffff881ff24de200 ffff883f66120000 ffff883f446f8008 ffff881f95280000 ffff883f6f9208c4 ffff883f6f9208c8 00000000ffffffff ffff883f446fbbf8 ffffffff816b0dde [1] http://lkml.iu.edu/hypermail/linux/kernel/1608.1/05834.html Signed-off-by: Parav Pandit Reviewed-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/addr.c | 62 ++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 01236cef7bfb..437522ca97b4 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -61,6 +61,7 @@ struct addr_req { void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context); unsigned long timeout; + struct delayed_work work; int status; u32 seq; }; @@ -295,7 +296,7 @@ int rdma_translate_ip(const struct sockaddr *addr, } EXPORT_SYMBOL(rdma_translate_ip); -static void set_timeout(unsigned long time) +static void set_timeout(struct delayed_work *delayed_work, unsigned long time) { unsigned long delay; @@ -303,7 +304,7 @@ static void set_timeout(unsigned long time) if ((long)delay < 0) delay = 0; - mod_delayed_work(addr_wq, &work, delay); + mod_delayed_work(addr_wq, delayed_work, delay); } static void queue_req(struct addr_req *req) @@ -318,8 +319,7 @@ static void queue_req(struct addr_req *req) list_add(&req->list, &temp_req->list); - if (req_list.next == &req->list) - set_timeout(req->timeout); + set_timeout(&req->work, req->timeout); mutex_unlock(&lock); } @@ -574,6 +574,37 @@ static int addr_resolve(struct sockaddr *src_in, return ret; } +static void process_one_req(struct work_struct *_work) +{ + struct addr_req *req; + struct sockaddr *src_in, *dst_in; + + mutex_lock(&lock); + req = container_of(_work, struct addr_req, work.work); + + if (req->status == -ENODATA) { + src_in = (struct sockaddr *)&req->src_addr; + dst_in = (struct sockaddr *)&req->dst_addr; + req->status = addr_resolve(src_in, dst_in, req->addr, + true, req->seq); + if (req->status && time_after_eq(jiffies, req->timeout)) { + req->status = -ETIMEDOUT; + } else if (req->status == -ENODATA) { + /* requeue the work for retrying again */ + set_timeout(&req->work, req->timeout); + mutex_unlock(&lock); + return; + } + } + list_del(&req->list); + mutex_unlock(&lock); + + req->callback(req->status, (struct sockaddr *)&req->src_addr, + req->addr, req->context); + put_client(req->client); + kfree(req); +} + static void process_req(struct work_struct *work) { struct addr_req *req, *temp_req; @@ -591,20 +622,23 @@ static void process_req(struct work_struct *work) true, req->seq); if (req->status && time_after_eq(jiffies, req->timeout)) req->status = -ETIMEDOUT; - else if (req->status == -ENODATA) + else if (req->status == -ENODATA) { + set_timeout(&req->work, req->timeout); continue; + } } list_move_tail(&req->list, &done_list); } - if (!list_empty(&req_list)) { - req = list_entry(req_list.next, struct addr_req, list); - set_timeout(req->timeout); - } mutex_unlock(&lock); list_for_each_entry_safe(req, temp_req, &done_list, list) { list_del(&req->list); + /* It is safe to cancel other work items from this work item + * because at a time there can be only one work item running + * with this single threaded work queue. + */ + cancel_delayed_work(&req->work); req->callback(req->status, (struct sockaddr *) &req->src_addr, req->addr, req->context); put_client(req->client); @@ -647,6 +681,7 @@ int rdma_resolve_ip(struct rdma_addr_client *client, req->context = context; req->client = client; atomic_inc(&client->refcount); + INIT_DELAYED_WORK(&req->work, process_one_req); req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq); req->status = addr_resolve(src_in, dst_in, addr, true, req->seq); @@ -701,7 +736,7 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) req->status = -ECANCELED; req->timeout = jiffies; list_move(&req->list, &req_list); - set_timeout(req->timeout); + set_timeout(&req->work, req->timeout); break; } } @@ -807,9 +842,8 @@ static int netevent_callback(struct notifier_block *self, unsigned long event, if (event == NETEVENT_NEIGH_UPDATE) { struct neighbour *neigh = ctx; - if (neigh->nud_state & NUD_VALID) { - set_timeout(jiffies); - } + if (neigh->nud_state & NUD_VALID) + set_timeout(&work, jiffies); } return 0; } @@ -820,7 +854,7 @@ static struct notifier_block nb = { int addr_init(void) { - addr_wq = alloc_workqueue("ib_addr", WQ_MEM_RECLAIM, 0); + addr_wq = alloc_ordered_workqueue("ib_addr", WQ_MEM_RECLAIM); if (!addr_wq) return -ENOMEM; From f7a6cb7b38c6845b26aaa8bbdf519ff6e3090831 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 1 Aug 2017 09:41:35 +0300 Subject: [PATCH 078/109] RDMA/uverbs: Prevent leak of reserved field initialize to zero the response structure to prevent the leakage of "resp.reserved" field. drivers/infiniband/core/uverbs_cmd.c:1178 ib_uverbs_resize_cq() warn: check that 'resp.reserved' doesn't leak information Fixes: 33b9b3ee9709 ("IB: Add userspace support for resizing CQs") Signed-off-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_cmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 2c98533a0203..c551d2b275fd 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1153,7 +1153,7 @@ ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file, int out_len) { struct ib_uverbs_resize_cq cmd; - struct ib_uverbs_resize_cq_resp resp; + struct ib_uverbs_resize_cq_resp resp = {}; struct ib_udata udata; struct ib_cq *cq; int ret = -EINVAL; From efdd6f53b10aead0f5cf19a93dd3eb268ac0d991 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Tue, 1 Aug 2017 09:41:36 +0300 Subject: [PATCH 079/109] IB/uverbs: Fix device cleanup Uverbs device should be cleaned up only when there is no potential usage of. As part of ib_uverbs_remove_one which might be triggered upon reset flow the device reference count is decreased as expected and leave the final cleanup to the FDs that were opened. Current code increases reference count upon opening a new command FD and decreases it upon closing the file. The event FD is opened internally and rely on the command FD by taking on it a reference count. In case that the command FD was closed and just later the event FD we may ensure that the device resources as of srcu are still alive as they are still in use. Fixing the above by moving the reference count decreasing to the place where the command FD is really freed instead of doing that when it was just closed. fixes: 036b10635739 ("IB/uverbs: Enable device removal when there are active user space applications") Signed-off-by: Yishai Hadas Reviewed-by: Matan Barak Reviewed-by: Jason Gunthorpe Tested-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/uverbs_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 3d2609608f58..c023e2c81b8f 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -250,6 +250,7 @@ void ib_uverbs_release_file(struct kref *ref) if (atomic_dec_and_test(&file->device->refcount)) ib_uverbs_comp_dev(file->device); + kobject_put(&file->device->kobj); kfree(file); } @@ -917,7 +918,6 @@ err: static int ib_uverbs_close(struct inode *inode, struct file *filp) { struct ib_uverbs_file *file = filp->private_data; - struct ib_uverbs_device *dev = file->device; mutex_lock(&file->cleanup_mutex); if (file->ucontext) { @@ -939,7 +939,6 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp) ib_uverbs_release_async_event_file); kref_put(&file->ref, ib_uverbs_release_file); - kobject_put(&dev->kobj); return 0; } From 931b3c1a832621b4bdcbaf783096fc267eb36fbe Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 1 Aug 2017 09:41:37 +0300 Subject: [PATCH 080/109] RDMA/mlx5: Fix existence check for extended address vector The extended address vector is the highest bit in be32 variable, but it was compared with the lowest. This patch fixes the endianness of that check and removes already declared define. Fixes: 17d2f88f92ce ("IB/mlx5: Add ODP atomics support") Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/odp.c | 2 +- include/linux/mlx5/qp.h | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index ae0746754008..3d701c7a4c91 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -939,7 +939,7 @@ static int mlx5_ib_mr_initiator_pfault_handler( if (qp->ibqp.qp_type != IB_QPT_RC) { av = *wqe; - if (av->dqp_dct & be32_to_cpu(MLX5_WQE_AV_EXT)) + if (av->dqp_dct & cpu_to_be32(MLX5_EXTENDED_UD_AV)) *wqe += sizeof(struct mlx5_av); else *wqe += sizeof(struct mlx5_base_av); diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 6f41270d80c0..f378dc0e7eaf 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -212,7 +212,6 @@ struct mlx5_wqe_ctrl_seg { #define MLX5_WQE_CTRL_OPCODE_MASK 0xff #define MLX5_WQE_CTRL_WQE_INDEX_MASK 0x00ffff00 #define MLX5_WQE_CTRL_WQE_INDEX_SHIFT 8 -#define MLX5_WQE_AV_EXT 0x80000000 enum { MLX5_ETH_WQE_L3_INNER_CSUM = 1 << 4, From aaf83aecd294fd61db6b34051f718f3e7ea34c22 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Thu, 3 Aug 2017 15:43:14 +0200 Subject: [PATCH 081/109] xgene: Always get clk source, but ignore if it's missing for SGMII ports Even the driver doesn't do anything with the clk source for SGMII ports it needs to be enabled by doing a devm_clk_get(), if there is a clk source in DT. Fixes: 0db01097cabd ('xgene: Don't fail probe, if there is no clk resource for SGMII interfaces') Signed-off-by: Thomas Bogendoerfer Tested-by: Laura Abbott Acked-by: Iyappan Subramanian Tested-by: Will Deacon Signed-off-by: David S. Miller --- drivers/net/ethernet/apm/xgene/xgene_enet_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c index 86058a9f3417..1d307f2def2d 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c @@ -1785,9 +1785,9 @@ static int xgene_enet_get_resources(struct xgene_enet_pdata *pdata) xgene_enet_gpiod_get(pdata); - if (pdata->phy_mode != PHY_INTERFACE_MODE_SGMII) { - pdata->clk = devm_clk_get(&pdev->dev, NULL); - if (IS_ERR(pdata->clk)) { + pdata->clk = devm_clk_get(&pdev->dev, NULL); + if (IS_ERR(pdata->clk)) { + if (pdata->phy_mode != PHY_INTERFACE_MODE_SGMII) { /* Abort if the clock is defined but couldn't be * retrived. Always abort if the clock is missing on * DT system as the driver can't cope with this case. From 5db465f235e74293e285e1fa924a55e52ba52a98 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 4 Aug 2017 11:12:08 +0300 Subject: [PATCH 082/109] IB/hns: checking for IS_ERR() instead of NULL The hns_roce_v1_create_lp_qp() returns NULL on error, not error pointers. Fixes: bfcc681bd09d ("IB/hns: Fix the bug when free mr") Signed-off-by: Dan Carpenter Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c index 23fad6d96944..2540b65e242c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c @@ -733,7 +733,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev) continue; free_mr->mr_free_qp[i] = hns_roce_v1_create_lp_qp(hr_dev, pd); - if (IS_ERR(free_mr->mr_free_qp[i])) { + if (!free_mr->mr_free_qp[i]) { dev_err(dev, "Create loop qp failed!\n"); goto create_lp_qp_failed; } From ea7bd56fa309d10a41b1041827a63c0746c47554 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 2 Aug 2017 12:37:16 -0700 Subject: [PATCH 083/109] xfs: Fix leak of discard bio The bio describing discard operation is allocated by __blkdev_issue_discard() which returns us a reference to it. That reference is never released and thus we leak this bio. Drop the bio reference once it completes in xlog_discard_endio(). CC: stable@vger.kernel.org Fixes: 4560e78f40cb55bd2ea8f1ef4001c5baa88531c7 Signed-off-by: Jan Kara Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_log_cil.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index fbe72b134bef..43aa42a3a5d3 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -539,6 +539,7 @@ xlog_discard_endio( INIT_WORK(&ctx->discard_endio_work, xlog_discard_endio_work); queue_work(xfs_discard_wq, &ctx->discard_endio_work); + bio_put(bio); } static void From 56bdf855e676f1f2ed7033f288f57dfd315725ba Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Thu, 3 Aug 2017 13:19:13 -0700 Subject: [PATCH 084/109] xfs: Fix per-inode DAX flag inheritance According to the commit that implemented per-inode DAX flag: commit 58f88ca2df72 ("xfs: introduce per-inode DAX enablement") the flag is supposed to act as "inherit flag". Currently this only works in the situations where parent directory already has a flag in di_flags set, otherwise inheritance does not work. This is because setting the XFS_DIFLAG2_DAX flag is done in a wrong branch designated for di_flags, not di_flags2. Fix this by moving the code to branch designated for setting di_flags2, which does test for flags in di_flags2. Fixes: 58f88ca2df72 ("xfs: introduce per-inode DAX enablement") Signed-off-by: Lukas Czerner Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong --- fs/xfs/xfs_inode.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ceef77c0416a..ff48f0096810 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -874,7 +874,6 @@ xfs_ialloc( case S_IFREG: case S_IFDIR: if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { - uint64_t di_flags2 = 0; uint di_flags = 0; if (S_ISDIR(mode)) { @@ -911,20 +910,23 @@ xfs_ialloc( di_flags |= XFS_DIFLAG_NODEFRAG; if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; - if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) - di_flags2 |= XFS_DIFLAG2_DAX; ip->i_d.di_flags |= di_flags; - ip->i_d.di_flags2 |= di_flags2; } if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) && pip->i_d.di_version == 3 && ip->i_d.di_version == 3) { + uint64_t di_flags2 = 0; + if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) { - ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; + di_flags2 |= XFS_DIFLAG2_COWEXTSIZE; ip->i_d.di_cowextsize = pip->i_d.di_cowextsize; } + if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + + ip->i_d.di_flags2 |= di_flags2; } /* FALLTHROUGH */ case S_IFLNK: From 2c460621bb2e6baf8a475c407cdb29029b2497ac Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 4 Aug 2017 22:24:41 +0200 Subject: [PATCH 085/109] bpf: fix byte order test in test_verifier We really must check with #if __BYTE_ORDER == XYZ instead of just presence of #ifdef __LITTLE_ENDIAN. I noticed that when actually running this on big endian machine, the latter test resolves to true for user space, same for #ifdef __BIG_ENDIAN. E.g., looking at endian.h from libc, both are also defined there, so we really must test this against __BYTE_ORDER instead for proper insns selection. For the kernel, such checks are fine though e.g. see 13da9e200fe4 ("Revert "endian: #define __BYTE_ORDER"") and 415586c9e6d3 ("UAPI: fix endianness conditionals in M32R's asm/stat.h") for some more context, but not for user space. Lets also make sure to properly include endian.h. After that, suite passes for me: ./test_verifier: ELF 64-bit MSB executable, [...] Linux foo 4.13.0-rc3+ #4 SMP Fri Aug 4 06:59:30 EDT 2017 s390x s390x s390x GNU/Linux Before fix: Summary: 505 PASSED, 11 FAILED After fix: Summary: 516 PASSED, 0 FAILED Fixes: 18f3d6be6be1 ("selftests/bpf: Add test cases to test narrower ctx field loads") Signed-off-by: Daniel Borkmann Acked-by: Yonghong Signed-off-by: David S. Miller --- tools/testing/selftests/bpf/test_verifier.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index addea82f76c9..d3ed7324105e 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -8,6 +8,7 @@ * License as published by the Free Software Foundation. */ +#include #include #include #include @@ -1098,7 +1099,7 @@ static struct bpf_test tests[] = { "check skb->hash byte load permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), -#ifdef __LITTLE_ENDIAN +#if __BYTE_ORDER == __LITTLE_ENDIAN BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, offsetof(struct __sk_buff, hash)), #else @@ -1135,7 +1136,7 @@ static struct bpf_test tests[] = { "check skb->hash byte load not permitted 3", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), -#ifdef __LITTLE_ENDIAN +#if __BYTE_ORDER == __LITTLE_ENDIAN BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, offsetof(struct __sk_buff, hash) + 3), #else @@ -1244,7 +1245,7 @@ static struct bpf_test tests[] = { "check skb->hash half load permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), -#ifdef __LITTLE_ENDIAN +#if __BYTE_ORDER == __LITTLE_ENDIAN BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, offsetof(struct __sk_buff, hash)), #else @@ -1259,7 +1260,7 @@ static struct bpf_test tests[] = { "check skb->hash half load not permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), -#ifdef __LITTLE_ENDIAN +#if __BYTE_ORDER == __LITTLE_ENDIAN BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, offsetof(struct __sk_buff, hash) + 2), #else @@ -5422,7 +5423,7 @@ static struct bpf_test tests[] = { "check bpf_perf_event_data->sample_period byte load permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), -#ifdef __LITTLE_ENDIAN +#if __BYTE_ORDER == __LITTLE_ENDIAN BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_1, offsetof(struct bpf_perf_event_data, sample_period)), #else @@ -5438,7 +5439,7 @@ static struct bpf_test tests[] = { "check bpf_perf_event_data->sample_period half load permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), -#ifdef __LITTLE_ENDIAN +#if __BYTE_ORDER == __LITTLE_ENDIAN BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, offsetof(struct bpf_perf_event_data, sample_period)), #else @@ -5454,7 +5455,7 @@ static struct bpf_test tests[] = { "check bpf_perf_event_data->sample_period word load permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), -#ifdef __LITTLE_ENDIAN +#if __BYTE_ORDER == __LITTLE_ENDIAN BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, offsetof(struct bpf_perf_event_data, sample_period)), #else @@ -5481,7 +5482,7 @@ static struct bpf_test tests[] = { "check skb->data half load not permitted", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), -#ifdef __LITTLE_ENDIAN +#if __BYTE_ORDER == __LITTLE_ENDIAN BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, offsetof(struct __sk_buff, data)), #else @@ -5497,7 +5498,7 @@ static struct bpf_test tests[] = { "check skb->tc_classid half load not permitted for lwt prog", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), -#ifdef __LITTLE_ENDIAN +#if __BYTE_ORDER == __LITTLE_ENDIAN BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_1, offsetof(struct __sk_buff, tc_classid)), #else From 732e49850c5e15231e11a0a464748bcbade5e3c2 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Thu, 3 Aug 2017 17:13:54 -0700 Subject: [PATCH 086/109] netvsc: fix race on sub channel creation The existing sub channel code did not wait for all the sub-channels to completely initialize. This could lead to race causing crash in napi_netif_del() from bad list. The existing code would send an init message, then wait only for the initial response that the init message was received. It thought it was waiting for sub channels but really the init response did the wakeup. The new code keeps track of the number of open channels and waits until that many are open. Other issues here were: * host might return less sub-channels than was requested. * the new init status is not valid until after init was completed. Fixes: b3e6b82a0099 ("hv_netvsc: Wait for sub-channels to be processed during probe") Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- drivers/net/hyperv/hyperv_net.h | 3 ++- drivers/net/hyperv/netvsc.c | 1 + drivers/net/hyperv/rndis_filter.c | 14 ++++++++------ 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index d6c25580f8dd..12cc64bfcff8 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -765,7 +765,8 @@ struct netvsc_device { u32 max_chn; u32 num_chn; - refcount_t sc_offered; + atomic_t open_chn; + wait_queue_head_t subchan_open; struct rndis_device *extension; diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 96f90c75d1b7..d18c3326a1f7 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -78,6 +78,7 @@ static struct netvsc_device *alloc_net_device(void) net_device->max_pkt = RNDIS_MAX_PKT_DEFAULT; net_device->pkt_align = RNDIS_PKT_ALIGN_DEFAULT; init_completion(&net_device->channel_init_wait); + init_waitqueue_head(&net_device->subchan_open); return net_device; } diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index 85c00e1c52b6..d6308ffda53e 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1048,8 +1048,8 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc) else netif_napi_del(&nvchan->napi); - if (refcount_dec_and_test(&nvscdev->sc_offered)) - complete(&nvscdev->channel_init_wait); + atomic_inc(&nvscdev->open_chn); + wake_up(&nvscdev->subchan_open); } int rndis_filter_device_add(struct hv_device *dev, @@ -1090,8 +1090,6 @@ int rndis_filter_device_add(struct hv_device *dev, net_device->max_chn = 1; net_device->num_chn = 1; - refcount_set(&net_device->sc_offered, 0); - net_device->extension = rndis_device; rndis_device->ndev = net; @@ -1221,11 +1219,11 @@ int rndis_filter_device_add(struct hv_device *dev, rndis_device->ind_table[i] = ethtool_rxfh_indir_default(i, net_device->num_chn); + atomic_set(&net_device->open_chn, 1); num_rss_qs = net_device->num_chn - 1; if (num_rss_qs == 0) return 0; - refcount_set(&net_device->sc_offered, num_rss_qs); vmbus_set_sc_create_callback(dev->channel, netvsc_sc_open); init_packet = &net_device->channel_init_pkt; @@ -1242,15 +1240,19 @@ int rndis_filter_device_add(struct hv_device *dev, if (ret) goto out; + wait_for_completion(&net_device->channel_init_wait); if (init_packet->msg.v5_msg.subchn_comp.status != NVSP_STAT_SUCCESS) { ret = -ENODEV; goto out; } - wait_for_completion(&net_device->channel_init_wait); net_device->num_chn = 1 + init_packet->msg.v5_msg.subchn_comp.num_subchannels; + /* wait for all sub channels to open */ + wait_event(net_device->subchan_open, + atomic_read(&net_device->open_chn) == net_device->num_chn); + /* ignore failues from setting rss parameters, still have channels */ rndis_filter_set_rss_param(rndis_device, netvsc_hash_key, net_device->num_chn); From f9ea3225ddafa269cf1f6b495d132c26fde93903 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Mon, 7 Aug 2017 10:16:36 +0200 Subject: [PATCH 087/109] bpf: fix selftest/bpf/test_pkt_md_access on s390x Commit 18f3d6be6be1 ("selftests/bpf: Add test cases to test narrower ctx field loads") introduced new eBPF test cases. One of them (test_pkt_md_access.c) fails on s390x. The BPF verifier error message is: [root@s8360046 bpf]# ./test_progs test_pkt_access:PASS:ipv4 349 nsec test_pkt_access:PASS:ipv6 212 nsec [....] libbpf: load bpf program failed: Permission denied libbpf: -- BEGIN DUMP LOG --- libbpf: 0: (71) r2 = *(u8 *)(r1 +0) invalid bpf_context access off=0 size=1 libbpf: -- END LOG -- libbpf: failed to load program 'test1' libbpf: failed to load object './test_pkt_md_access.o' Summary: 29 PASSED, 1 FAILED [root@s8360046 bpf]# This is caused by a byte endianness issue. S390x is a big endian architecture. Pointer access to the lowest byte or halfword of a four byte value need to add an offset. On little endian architectures this offset is not needed. Fix this and use the same approach as the originator used for other files (for example test_verifier.c) in his original commit. With this fix the test program test_progs succeeds on s390x: [root@s8360046 bpf]# ./test_progs test_pkt_access:PASS:ipv4 236 nsec test_pkt_access:PASS:ipv6 217 nsec test_xdp:PASS:ipv4 3624 nsec test_xdp:PASS:ipv6 1722 nsec test_l4lb:PASS:ipv4 926 nsec test_l4lb:PASS:ipv6 1322 nsec test_tcp_estats:PASS: 0 nsec test_bpf_obj_id:PASS:get-fd-by-notexist-prog-id 0 nsec test_bpf_obj_id:PASS:get-fd-by-notexist-map-id 0 nsec test_bpf_obj_id:PASS:get-prog-info(fd) 0 nsec test_bpf_obj_id:PASS:get-map-info(fd) 0 nsec test_bpf_obj_id:PASS:get-prog-info(fd) 0 nsec test_bpf_obj_id:PASS:get-map-info(fd) 0 nsec test_bpf_obj_id:PASS:get-prog-fd(next_id) 0 nsec test_bpf_obj_id:PASS:get-prog-info(next_id->fd) 0 nsec test_bpf_obj_id:PASS:get-prog-fd(next_id) 0 nsec test_bpf_obj_id:PASS:get-prog-info(next_id->fd) 0 nsec test_bpf_obj_id:PASS:check total prog id found by get_next_id 0 nsec test_bpf_obj_id:PASS:get-map-fd(next_id) 0 nsec test_bpf_obj_id:PASS:get-map-fd(next_id) 0 nsec test_bpf_obj_id:PASS:get-map-fd(next_id) 0 nsec test_bpf_obj_id:PASS:get-map-fd(next_id) 0 nsec test_bpf_obj_id:PASS:get-map-fd(next_id) 0 nsec test_bpf_obj_id:PASS:get-map-fd(next_id) 0 nsec test_bpf_obj_id:PASS:get-map-fd(next_id) 0 nsec test_bpf_obj_id:PASS:check get-map-info(next_id->fd) 0 nsec test_bpf_obj_id:PASS:get-map-fd(next_id) 0 nsec test_bpf_obj_id:PASS:check get-map-info(next_id->fd) 0 nsec test_bpf_obj_id:PASS:check total map id found by get_next_id 0 nsec test_pkt_md_access:PASS: 277 nsec Summary: 30 PASSED, 0 FAILED [root@s8360046 bpf]# Fixes: 18f3d6be6be1 ("selftests/bpf: Add test cases to test narrower ctx field loads") Signed-off-by: Thomas Richter Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- tools/testing/selftests/bpf/test_pkt_md_access.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/testing/selftests/bpf/test_pkt_md_access.c b/tools/testing/selftests/bpf/test_pkt_md_access.c index 71729d47eb85..7956302ecdf2 100644 --- a/tools/testing/selftests/bpf/test_pkt_md_access.c +++ b/tools/testing/selftests/bpf/test_pkt_md_access.c @@ -12,12 +12,23 @@ int _version SEC("version") = 1; +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #define TEST_FIELD(TYPE, FIELD, MASK) \ { \ TYPE tmp = *(volatile TYPE *)&skb->FIELD; \ if (tmp != ((*(volatile __u32 *)&skb->FIELD) & MASK)) \ return TC_ACT_SHOT; \ } +#else +#define TEST_FIELD_OFFSET(a, b) ((sizeof(a) - sizeof(b)) / sizeof(b)) +#define TEST_FIELD(TYPE, FIELD, MASK) \ + { \ + TYPE tmp = *((volatile TYPE *)&skb->FIELD + \ + TEST_FIELD_OFFSET(skb->FIELD, TYPE)); \ + if (tmp != ((*(volatile __u32 *)&skb->FIELD) & MASK)) \ + return TC_ACT_SHOT; \ + } +#endif SEC("test1") int process(struct __sk_buff *skb) From 22889dbbd98a0e3390e9120074c39c6e5a3fea5e Mon Sep 17 00:00:00 2001 From: Dean Jenkins Date: Mon, 7 Aug 2017 09:50:14 +0100 Subject: [PATCH 088/109] asix: Add rx->ax_skb = NULL after usbnet_skb_return() In asix_rx_fixup_internal() there is a risk that rx->ax_skb gets reused after passing the Ethernet frame into the network stack via usbnet_skb_return(). The risks include: a) asynchronously freeing rx->ax_skb after passing the netdev buffer to the NAPI layer which might corrupt the backlog queue. b) erroneously reusing rx->ax_skb such as calling skb_put_data() multiple times which causes writing off the end of the netdev buffer. Therefore add a defensive rx->ax_skb = NULL after usbnet_skb_return() so that it is not possible to free rx->ax_skb or to apply skb_put_data() too many times. Signed-off-by: Dean Jenkins Signed-off-by: David S. Miller --- drivers/net/usb/asix_common.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/asix_common.c b/drivers/net/usb/asix_common.c index 7847436c441e..6983b6bd8cf9 100644 --- a/drivers/net/usb/asix_common.c +++ b/drivers/net/usb/asix_common.c @@ -168,8 +168,10 @@ int asix_rx_fixup_internal(struct usbnet *dev, struct sk_buff *skb, if (rx->ax_skb) { skb_put_data(rx->ax_skb, skb->data + offset, copy_length); - if (!rx->remaining) + if (!rx->remaining) { usbnet_skb_return(dev, rx->ax_skb); + rx->ax_skb = NULL; + } } offset += (copy_length + 1) & 0xfffe; From 960eb4eeaa47a3a5061a4e47e28411e85840ab2c Mon Sep 17 00:00:00 2001 From: Dean Jenkins Date: Mon, 7 Aug 2017 09:50:15 +0100 Subject: [PATCH 089/109] asix: Ensure asix_rx_fixup_info members are all reset There is a risk that the members of the structure asix_rx_fixup_info become unsynchronised leading to the possibility of a malfunction. For example, rx->split_head was not being set to false after an error was detected so potentially could cause a malformed 32-bit Data header word to be formed. Therefore add function reset_asix_rx_fixup_info() to reset all the members of asix_rx_fixup_info so that future processing will start with known initial conditions. Also, if (skb->len != offset) becomes true then call reset_asix_rx_fixup_info() so that the processing of the next URB starts with known initial conditions. Without the call, the check does nothing which potentially could lead to a malfunction when the next URB is processed. In addition, for robustness, call reset_asix_rx_fixup_info() before every error path's "return 0". This ensures that the next URB is processed from known initial conditions. Signed-off-by: Dean Jenkins Signed-off-by: David S. Miller --- drivers/net/usb/asix_common.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/drivers/net/usb/asix_common.c b/drivers/net/usb/asix_common.c index 6983b6bd8cf9..fda74f33e3ec 100644 --- a/drivers/net/usb/asix_common.c +++ b/drivers/net/usb/asix_common.c @@ -75,6 +75,27 @@ void asix_write_cmd_async(struct usbnet *dev, u8 cmd, u16 value, u16 index, value, index, data, size); } +static void reset_asix_rx_fixup_info(struct asix_rx_fixup_info *rx) +{ + /* Reset the variables that have a lifetime outside of + * asix_rx_fixup_internal() so that future processing starts from a + * known set of initial conditions. + */ + + if (rx->ax_skb) { + /* Discard any incomplete Ethernet frame in the netdev buffer */ + kfree_skb(rx->ax_skb); + rx->ax_skb = NULL; + } + + /* Assume the Data header 32-bit word is at the start of the current + * or next URB socket buffer so reset all the state variables. + */ + rx->remaining = 0; + rx->split_head = false; + rx->header = 0; +} + int asix_rx_fixup_internal(struct usbnet *dev, struct sk_buff *skb, struct asix_rx_fixup_info *rx) { @@ -99,15 +120,7 @@ int asix_rx_fixup_internal(struct usbnet *dev, struct sk_buff *skb, if (size != ((~rx->header >> 16) & 0x7ff)) { netdev_err(dev->net, "asix_rx_fixup() Data Header synchronisation was lost, remaining %d\n", rx->remaining); - if (rx->ax_skb) { - kfree_skb(rx->ax_skb); - rx->ax_skb = NULL; - /* Discard the incomplete netdev Ethernet frame - * and assume the Data header is at the start of - * the current URB socket buffer. - */ - } - rx->remaining = 0; + reset_asix_rx_fixup_info(rx); } } @@ -139,11 +152,13 @@ int asix_rx_fixup_internal(struct usbnet *dev, struct sk_buff *skb, if (size != ((~rx->header >> 16) & 0x7ff)) { netdev_err(dev->net, "asix_rx_fixup() Bad Header Length 0x%x, offset %d\n", rx->header, offset); + reset_asix_rx_fixup_info(rx); return 0; } if (size > dev->net->mtu + ETH_HLEN + VLAN_HLEN) { netdev_dbg(dev->net, "asix_rx_fixup() Bad RX Length %d\n", size); + reset_asix_rx_fixup_info(rx); return 0; } @@ -180,6 +195,7 @@ int asix_rx_fixup_internal(struct usbnet *dev, struct sk_buff *skb, if (skb->len != offset) { netdev_err(dev->net, "asix_rx_fixup() Bad SKB Length %d, %d\n", skb->len, offset); + reset_asix_rx_fixup_info(rx); return 0; } From d0c8f338ab41438bdf8472cb4209d4ab54d439d5 Mon Sep 17 00:00:00 2001 From: Dean Jenkins Date: Mon, 7 Aug 2017 09:50:16 +0100 Subject: [PATCH 090/109] asix: Fix small memory leak in ax88772_unbind() When Ethernet frames span mulitple URBs, the netdev buffer memory pointed to by the asix_rx_fixup_info structure remains allocated during the time gap between the 2 executions of asix_rx_fixup_internal(). This means that if ax88772_unbind() is called within this time gap to free the memory of the parent private data structure then a memory leak of the part filled netdev buffer memory will occur. Therefore, create a new function asix_rx_fixup_common_free() to free the memory of the netdev buffer and add a call to asix_rx_fixup_common_free() from inside ax88772_unbind(). Consequently when an unbind occurs part way through receiving an Ethernet frame, the netdev buffer memory that is holding part of the received Ethernet frame will now be freed. Signed-off-by: Dean Jenkins Signed-off-by: David S. Miller --- drivers/net/usb/asix.h | 1 + drivers/net/usb/asix_common.c | 15 +++++++++++++++ drivers/net/usb/asix_devices.c | 1 + 3 files changed, 17 insertions(+) diff --git a/drivers/net/usb/asix.h b/drivers/net/usb/asix.h index d1092421aaa7..9a4171b90947 100644 --- a/drivers/net/usb/asix.h +++ b/drivers/net/usb/asix.h @@ -209,6 +209,7 @@ void asix_write_cmd_async(struct usbnet *dev, u8 cmd, u16 value, int asix_rx_fixup_internal(struct usbnet *dev, struct sk_buff *skb, struct asix_rx_fixup_info *rx); int asix_rx_fixup_common(struct usbnet *dev, struct sk_buff *skb); +void asix_rx_fixup_common_free(struct asix_common_private *dp); struct sk_buff *asix_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags); diff --git a/drivers/net/usb/asix_common.c b/drivers/net/usb/asix_common.c index fda74f33e3ec..522d2900cd1d 100644 --- a/drivers/net/usb/asix_common.c +++ b/drivers/net/usb/asix_common.c @@ -210,6 +210,21 @@ int asix_rx_fixup_common(struct usbnet *dev, struct sk_buff *skb) return asix_rx_fixup_internal(dev, skb, rx); } +void asix_rx_fixup_common_free(struct asix_common_private *dp) +{ + struct asix_rx_fixup_info *rx; + + if (!dp) + return; + + rx = &dp->rx_fixup_info; + + if (rx->ax_skb) { + kfree_skb(rx->ax_skb); + rx->ax_skb = NULL; + } +} + struct sk_buff *asix_tx_fixup(struct usbnet *dev, struct sk_buff *skb, gfp_t flags) { diff --git a/drivers/net/usb/asix_devices.c b/drivers/net/usb/asix_devices.c index a3aa0a27dfe5..b2ff88e69a81 100644 --- a/drivers/net/usb/asix_devices.c +++ b/drivers/net/usb/asix_devices.c @@ -764,6 +764,7 @@ static int ax88772_bind(struct usbnet *dev, struct usb_interface *intf) static void ax88772_unbind(struct usbnet *dev, struct usb_interface *intf) { + asix_rx_fixup_common_free(dev->driver_priv); kfree(dev->driver_priv); } From ec2c6726322f0d270bab477e4904bf9496f70ee5 Mon Sep 17 00:00:00 2001 From: Julian Wiedmann Date: Mon, 7 Aug 2017 13:28:39 +0200 Subject: [PATCH 091/109] s390/qeth: fix L3 next-hop in xmit qeth hdr On L3, the qeth_hdr struct needs to be filled with the next-hop IP address. The current code accesses rtable->rt_gateway without checking that rtable is a valid address. The accidental access to a lowcore area results in a random next-hop address in the qeth_hdr. rtable (or more precisely, skb_dst(skb)) can be NULL in rare cases (for instance together with AF_PACKET sockets). This patch adds the missing NULL-ptr checks. Signed-off-by: Julian Wiedmann Signed-off-by: Ursula Braun Fixes: 87e7597b5a3 qeth: Move away from using neighbour entries in qeth_l3_fill_header() Signed-off-by: David S. Miller --- drivers/s390/net/qeth_l3_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c index 8975cd321390..d42e758518ed 100644 --- a/drivers/s390/net/qeth_l3_main.c +++ b/drivers/s390/net/qeth_l3_main.c @@ -2512,7 +2512,7 @@ static void qeth_l3_fill_header(struct qeth_card *card, struct qeth_hdr *hdr, struct rtable *rt = (struct rtable *) dst; __be32 *pkey = &ip_hdr(skb)->daddr; - if (rt->rt_gateway) + if (rt && rt->rt_gateway) pkey = &rt->rt_gateway; /* IPv4 */ @@ -2523,7 +2523,7 @@ static void qeth_l3_fill_header(struct qeth_card *card, struct qeth_hdr *hdr, struct rt6_info *rt = (struct rt6_info *) dst; struct in6_addr *pkey = &ipv6_hdr(skb)->daddr; - if (!ipv6_addr_any(&rt->rt6i_gateway)) + if (rt && !ipv6_addr_any(&rt->rt6i_gateway)) pkey = &rt->rt6i_gateway; /* IPv6 */ From b925ef37b0a152b0c06aa43bc9204d0116f676d7 Mon Sep 17 00:00:00 2001 From: Anton Volkov Date: Mon, 7 Aug 2017 15:54:14 +0300 Subject: [PATCH 092/109] hysdn: fix to a race condition in put_log_buffer The synchronization type that was used earlier to guard the loop that deletes unused log buffers may lead to a situation that prevents any thread from going through the loop. The patch deletes previously used synchronization mechanism and moves the loop under the spin_lock so the similar cases won't be feasible in the future. Found by by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Anton Volkov Signed-off-by: David S. Miller --- drivers/isdn/hysdn/hysdn_proclog.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/isdn/hysdn/hysdn_proclog.c b/drivers/isdn/hysdn/hysdn_proclog.c index 7b5fd8fb1761..aaca0b3d662e 100644 --- a/drivers/isdn/hysdn/hysdn_proclog.c +++ b/drivers/isdn/hysdn/hysdn_proclog.c @@ -44,7 +44,6 @@ struct procdata { char log_name[15]; /* log filename */ struct log_data *log_head, *log_tail; /* head and tail for queue */ int if_used; /* open count for interface */ - int volatile del_lock; /* lock for delete operations */ unsigned char logtmp[LOG_MAX_LINELEN]; wait_queue_head_t rd_queue; }; @@ -102,7 +101,6 @@ put_log_buffer(hysdn_card *card, char *cp) { struct log_data *ib; struct procdata *pd = card->proclog; - int i; unsigned long flags; if (!pd) @@ -126,21 +124,21 @@ put_log_buffer(hysdn_card *card, char *cp) else pd->log_tail->next = ib; /* follows existing messages */ pd->log_tail = ib; /* new tail */ - i = pd->del_lock++; /* get lock state */ - spin_unlock_irqrestore(&card->hysdn_lock, flags); /* delete old entrys */ - if (!i) - while (pd->log_head->next) { - if ((pd->log_head->usage_cnt <= 0) && - (pd->log_head->next->usage_cnt <= 0)) { - ib = pd->log_head; - pd->log_head = pd->log_head->next; - kfree(ib); - } else - break; - } /* pd->log_head->next */ - pd->del_lock--; /* release lock level */ + while (pd->log_head->next) { + if ((pd->log_head->usage_cnt <= 0) && + (pd->log_head->next->usage_cnt <= 0)) { + ib = pd->log_head; + pd->log_head = pd->log_head->next; + kfree(ib); + } else { + break; + } + } /* pd->log_head->next */ + + spin_unlock_irqrestore(&card->hysdn_lock, flags); + wake_up_interruptible(&(pd->rd_queue)); /* announce new entry */ } /* put_log_buffer */ From eb2a6b800c2d1336cce6709d48c42753a611c07b Mon Sep 17 00:00:00 2001 From: Christophe Jaillet Date: Mon, 7 Aug 2017 00:00:17 +0200 Subject: [PATCH 093/109] qed: Fix a memory allocation failure test in 'qed_mcp_cmd_init()' We allocate 'p_info->mfw_mb_cur' and 'p_info->mfw_mb_shadow' but we check 'p_info->mfw_mb_addr' instead of 'p_info->mfw_mb_cur'. 'p_info->mfw_mb_addr' is never 0, because it is initiliazed a few lines above in 'qed_load_mcp_offsets()'. Update the test and check the result of the 2 'kzalloc()' instead. Signed-off-by: Christophe JAILLET Acked-by: Tomer Tayar Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qed/qed_mcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.c b/drivers/net/ethernet/qlogic/qed/qed_mcp.c index 9da91045d167..3eb241657368 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.c +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.c @@ -253,7 +253,7 @@ int qed_mcp_cmd_init(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) size = MFW_DRV_MSG_MAX_DWORDS(p_info->mfw_mb_length) * sizeof(u32); p_info->mfw_mb_cur = kzalloc(size, GFP_KERNEL); p_info->mfw_mb_shadow = kzalloc(size, GFP_KERNEL); - if (!p_info->mfw_mb_shadow || !p_info->mfw_mb_addr) + if (!p_info->mfw_mb_cur || !p_info->mfw_mb_shadow) goto err; return 0; From 51d96dc2e2dc2cf9b81cf976cc93c51ba3ac2f92 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 8 Aug 2017 18:28:41 +0200 Subject: [PATCH 094/109] random: fix warning message on ia64 and parisc Fix the warning message on the parisc and IA64 architectures to show the correct function name of the caller by using %pS instead of %pF. The message is printed with the value of _RET_IP_ which calls __builtin_return_address(0) and as such returns the IP address caller instead of pointer to a function descriptor of the caller. The effect of this patch is visible on the parisc and ia64 architectures only since those are the ones which use function descriptors while on all others %pS and %pF will behave the same. Cc: Theodore Ts'o Cc: Jason A. Donenfeld Signed-off-by: Helge Deller Fixes: eecabf567422 ("random: suppress spammy warnings about unseeded randomness") Fixes: d06bfd1989fe ("random: warn when kernel uses unseeded randomness") Signed-off-by: Linus Torvalds --- drivers/char/random.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index afa3ce7d3e72..8ad92707e45f 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1492,7 +1492,7 @@ static void _warn_unseeded_randomness(const char *func_name, void *caller, #ifndef CONFIG_WARN_ALL_UNSEEDED_RANDOM print_once = true; #endif - pr_notice("random: %s called from %pF with crng_init=%d\n", + pr_notice("random: %s called from %pS with crng_init=%d\n", func_name, caller, crng_init); } From a1ffc2d25ae98878f37445d223b3344686b5c822 Mon Sep 17 00:00:00 2001 From: Sedat Dilek Date: Tue, 25 Jul 2017 14:53:42 +0200 Subject: [PATCH 095/109] MAINTAINERS: greybus: Fix typo s/LOOBACK/LOOPBACK Fixes: f47e07bc5f1a5c48 ("Fix up MAINTAINERS file problems") Cc: Joe Perches Signed-off-by: Sedat Dilek Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 44cb004c765d..01609e368bc7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5834,7 +5834,7 @@ F: drivers/staging/greybus/spi.c F: drivers/staging/greybus/spilib.c F: drivers/staging/greybus/spilib.h -GREYBUS LOOBACK/TIME PROTOCOLS DRIVERS +GREYBUS LOOPBACK/TIME PROTOCOLS DRIVERS M: Bryan O'Donoghue S: Maintained F: drivers/staging/greybus/loopback.c From 6209ef67883ae6407cceb557b9ec4b2117d91697 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 2 Aug 2017 10:57:45 -0700 Subject: [PATCH 096/109] MAINTAINERS: openbmc mailing list is moderated The openbmc mailing list is moderated for non-subscribers. Signed-off-by: Randy Dunlap Acked-by: Brendan Higgins Cc: Benjamin Herrenschmidt Cc: Joel Stanley Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 01609e368bc7..3c419022ed93 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1161,7 +1161,7 @@ M: Brendan Higgins R: Benjamin Herrenschmidt R: Joel Stanley L: linux-i2c@vger.kernel.org -L: openbmc@lists.ozlabs.org +L: openbmc@lists.ozlabs.org (moderated for non-subscribers) S: Maintained F: drivers/irqchip/irq-aspeed-i2c-ic.c F: drivers/i2c/busses/i2c-aspeed.c From 6f7d98ec445b61f8f416fedfe607cb4f1653a8bc Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 4 Aug 2017 21:45:48 -0700 Subject: [PATCH 097/109] get_maintainer: Prepare for separate MAINTAINERS files Allow for MAINTAINERS to become a directory and if it is, read all the files in the directory for maintained sections. Optionally look for all files named MAINTAINERS in directories excluding the .git directory by using --find-maintainer-files. This optional feature adds ~.3 seconds of CPU on an Intel i5-6200 with an SSD. Miscellanea: - Create a read_maintainer_file subroutine from the existing code - Test only the existence of MAINTAINERS, not whether it's a file Signed-off-by: Joe Perches Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 87 ++++++++++++++++++++++++++++----------- 1 file changed, 64 insertions(+), 23 deletions(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 3bd5f4f30235..bc443201d3ef 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -18,6 +18,7 @@ my $V = '0.26'; use Getopt::Long qw(:config no_auto_abbrev); use Cwd; +use File::Find; my $cur_path = fastgetcwd() . '/'; my $lk_path = "./"; @@ -58,6 +59,7 @@ my $from_filename = 0; my $pattern_depth = 0; my $version = 0; my $help = 0; +my $find_maintainer_files = 0; my $vcs_used = 0; @@ -249,6 +251,7 @@ if (!GetOptions( 'sections!' => \$sections, 'fe|file-emails!' => \$file_emails, 'f|file' => \$from_filename, + 'find-maintainer-files' => \$find_maintainer_files, 'v|version' => \$version, 'h|help|usage' => \$help, )) { @@ -307,36 +310,74 @@ if (!top_of_kernel_tree($lk_path)) { my @typevalue = (); my %keyword_hash; +my @mfiles = (); -open (my $maint, '<', "${lk_path}MAINTAINERS") - or die "$P: Can't open MAINTAINERS: $!\n"; -while (<$maint>) { - my $line = $_; +sub read_maintainer_file { + my ($file) = @_; - if ($line =~ m/^([A-Z]):\s*(.*)/) { - my $type = $1; - my $value = $2; + open (my $maint, '<', "$file") + or die "$P: Can't open MAINTAINERS file '$file': $!\n"; + while (<$maint>) { + my $line = $_; - ##Filename pattern matching - if ($type eq "F" || $type eq "X") { - $value =~ s@\.@\\\.@g; ##Convert . to \. - $value =~ s/\*/\.\*/g; ##Convert * to .* - $value =~ s/\?/\./g; ##Convert ? to . - ##if pattern is a directory and it lacks a trailing slash, add one - if ((-d $value)) { - $value =~ s@([^/])$@$1/@; + if ($line =~ m/^([A-Z]):\s*(.*)/) { + my $type = $1; + my $value = $2; + + ##Filename pattern matching + if ($type eq "F" || $type eq "X") { + $value =~ s@\.@\\\.@g; ##Convert . to \. + $value =~ s/\*/\.\*/g; ##Convert * to .* + $value =~ s/\?/\./g; ##Convert ? to . + ##if pattern is a directory and it lacks a trailing slash, add one + if ((-d $value)) { + $value =~ s@([^/])$@$1/@; + } + } elsif ($type eq "K") { + $keyword_hash{@typevalue} = $value; } - } elsif ($type eq "K") { - $keyword_hash{@typevalue} = $value; + push(@typevalue, "$type:$value"); + } elsif (!(/^\s*$/ || /^\s*\#/)) { + $line =~ s/\n$//g; + push(@typevalue, $line); } - push(@typevalue, "$type:$value"); - } elsif (!/^(\s)*$/) { - $line =~ s/\n$//g; - push(@typevalue, $line); + } + close($maint); +} + +sub find_is_maintainer_file { + my ($file) = $_; + return if ($file !~ m@/MAINTAINERS$@); + $file = $File::Find::name; + return if (! -f $file); + push(@mfiles, $file); +} + +sub find_ignore_git { + return grep { $_ !~ /^\.git$/; } @_; +} + +if (-d "${lk_path}MAINTAINERS") { + opendir(DIR, "${lk_path}MAINTAINERS") or die $!; + my @files = readdir(DIR); + closedir(DIR); + foreach my $file (@files) { + push(@mfiles, "${lk_path}MAINTAINERS/$file") if ($file !~ /^\./); } } -close($maint); +if ($find_maintainer_files) { + find( { wanted => \&find_is_maintainer_file, + preprocess => \&find_ignore_git, + no_chdir => 1, + }, "${lk_path}"); +} else { + push(@mfiles, "${lk_path}MAINTAINERS") if -f "${lk_path}MAINTAINERS"; +} + +foreach my $file (@mfiles) { + read_maintainer_file("$file"); +} # # Read mail address map @@ -873,7 +914,7 @@ sub top_of_kernel_tree { if ( (-f "${lk_path}COPYING") && (-f "${lk_path}CREDITS") && (-f "${lk_path}Kbuild") - && (-f "${lk_path}MAINTAINERS") + && (-e "${lk_path}MAINTAINERS") && (-f "${lk_path}Makefile") && (-f "${lk_path}README") && (-d "${lk_path}Documentation") From 61f741645a354d91fece7b9cbb2f3f3587db8b8a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 5 Aug 2017 18:45:47 -0700 Subject: [PATCH 098/109] parse-maintainers: Add section pattern sorting Section [A-Z]: patterns are not currently in any required sorting order. Add a specific sorting sequence to MAINTAINERS entries. Sort F: and X: patterns in alphabetic order. The preferred section ordering is: SECTION HEADER M: Maintainers R: Reviewers P: Named persons without email addresses L: Mailing list addresses S: Status of this section (Supported, Maintained, Orphan, etc...) W: Any relevant URLs T: Source code control type (git, quilt, etc) Q: Patchwork patch acceptance queue site B: Bug tracking URIs C: Chat URIs F: Files with wildcard patterns (alphabetic ordered) X: Excluded files with wildcard patterns (alphabetic ordered) N: Files with regex patterns K: Keyword regexes in source code for maintainership identification Miscellaneous perl neatening: - Rename %map to %hash, map has a different meaning in perl - Avoid using \& and local variables for function indirection - Use return for a little c like clarity - Use c-like function call style instead of &function Signed-off-by: Joe Perches Signed-off-by: Linus Torvalds --- scripts/parse-maintainers.pl | 70 +++++++++++++++++++++++++----------- 1 file changed, 49 insertions(+), 21 deletions(-) diff --git a/scripts/parse-maintainers.pl b/scripts/parse-maintainers.pl index a0fe34349b24..5c8e0504c67e 100644 --- a/scripts/parse-maintainers.pl +++ b/scripts/parse-maintainers.pl @@ -2,9 +2,9 @@ use strict; -my %map; +my %hash; -# sort comparison function +# sort comparison functions sub by_category($$) { my ($a, $b) = @_; @@ -15,20 +15,47 @@ sub by_category($$) { $a =~ s/THE REST/ZZZZZZ/g; $b =~ s/THE REST/ZZZZZZ/g; - $a cmp $b; + return $a cmp $b; +} + +sub by_pattern($$) { + my ($a, $b) = @_; + my $preferred_order = 'MRPLSWTQBCFXNK'; + + my $a1 = uc(substr($a, 0, 1)); + my $b1 = uc(substr($b, 0, 1)); + + my $a_index = index($preferred_order, $a1); + my $b_index = index($preferred_order, $b1); + + $a_index = 1000 if ($a_index == -1); + $b_index = 1000 if ($b_index == -1); + + if (($a1 =~ /^F$/ && $b1 =~ /^F$/) || + ($a1 =~ /^X$/ && $b1 =~ /^X$/)) { + return $a cmp $b; + } + + if ($a_index < $b_index) { + return -1; + } elsif ($a_index == $b_index) { + return 0; + } else { + return 1; + } } sub alpha_output { - my $key; - my $sort_method = \&by_category; - my $sep = ""; - - foreach $key (sort $sort_method keys %map) { - if ($key ne " ") { - print $sep . $key . "\n"; - $sep = "\n"; - } - print $map{$key}; + foreach my $key (sort by_category keys %hash) { + if ($key eq " ") { + chomp $hash{$key}; + print $hash{$key}; + } else { + print "\n" . $key . "\n"; + foreach my $pattern (sort by_pattern split('\n', $hash{$key})) { + print($pattern . "\n"); + } + } } } @@ -42,7 +69,7 @@ sub trim { sub file_input { my $lastline = ""; my $case = " "; - $map{$case} = ""; + $hash{$case} = ""; while (<>) { my $line = $_; @@ -51,27 +78,28 @@ sub file_input { if ($line =~ m/^([A-Z]):\s*(.*)/) { $line = $1 . ":\t" . trim($2) . "\n"; if ($lastline eq "") { - $map{$case} = $map{$case} . $line; + $hash{$case} = $hash{$case} . $line; next; } $case = trim($lastline); - exists $map{$case} and die "Header '$case' already exists"; - $map{$case} = $line; + exists $hash{$case} and die "Header '$case' already exists"; + $hash{$case} = $line; $lastline = ""; next; } if ($case eq " ") { - $map{$case} = $map{$case} . $lastline; + $hash{$case} = $hash{$case} . $lastline; $lastline = $line; next; } trim($lastline) eq "" or die ("Odd non-pattern line '$lastline' for '$case'"); $lastline = $line; } - $map{$case} = $map{$case} . $lastline; + $hash{$case} = $hash{$case} . $lastline; } -&file_input; -&alpha_output; +file_input(); +alpha_output(); + exit(0); From fe9090301fed202a80a6113534efaa0d9184543b Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 5 Aug 2017 18:45:48 -0700 Subject: [PATCH 099/109] parse-maintainers: Use perl hash references and specific filenames Instead of reading STDIN and writing STDOUT, use specific filenames of MAINTAINERS and MAINTAINERS.new. Use hash references instead of global hash %hash so future modifications can read and write specific hashes to split up MAINTAINERS into multiple files using a script. Signed-off-by: Joe Perches Signed-off-by: Linus Torvalds --- scripts/parse-maintainers.pl | 59 +++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 24 deletions(-) diff --git a/scripts/parse-maintainers.pl b/scripts/parse-maintainers.pl index 5c8e0504c67e..c286154a2b68 100644 --- a/scripts/parse-maintainers.pl +++ b/scripts/parse-maintainers.pl @@ -2,7 +2,7 @@ use strict; -my %hash; +my $P = $0; # sort comparison functions sub by_category($$) { @@ -45,20 +45,6 @@ sub by_pattern($$) { } } -sub alpha_output { - foreach my $key (sort by_category keys %hash) { - if ($key eq " ") { - chomp $hash{$key}; - print $hash{$key}; - } else { - print "\n" . $key . "\n"; - foreach my $pattern (sort by_pattern split('\n', $hash{$key})) { - print($pattern . "\n"); - } - } - } -} - sub trim { my $s = shift; $s =~ s/\s+$//; @@ -66,40 +52,65 @@ sub trim { return $s; } +sub alpha_output { + my ($hashref, $filename) = (@_); + + open(my $file, '>', "$filename") or die "$P: $filename: open failed - $!\n"; + foreach my $key (sort by_category keys %$hashref) { + if ($key eq " ") { + chomp $$hashref{$key}; + print $file $$hashref{$key}; + } else { + print $file "\n" . $key . "\n"; + foreach my $pattern (sort by_pattern split('\n', %$hashref{$key})) { + print $file ($pattern . "\n"); + } + } + } + close($file); +} + sub file_input { + my ($hashref, $filename) = (@_); + my $lastline = ""; my $case = " "; - $hash{$case} = ""; + $$hashref{$case} = ""; - while (<>) { + open(my $file, '<', "$filename") or die "$P: $filename: open failed - $!\n"; + + while (<$file>) { my $line = $_; # Pattern line? if ($line =~ m/^([A-Z]):\s*(.*)/) { $line = $1 . ":\t" . trim($2) . "\n"; if ($lastline eq "") { - $hash{$case} = $hash{$case} . $line; + $$hashref{$case} = $$hashref{$case} . $line; next; } $case = trim($lastline); - exists $hash{$case} and die "Header '$case' already exists"; - $hash{$case} = $line; + exists $$hashref{$case} and die "Header '$case' already exists"; + $$hashref{$case} = $line; $lastline = ""; next; } if ($case eq " ") { - $hash{$case} = $hash{$case} . $lastline; + $$hashref{$case} = $$hashref{$case} . $lastline; $lastline = $line; next; } trim($lastline) eq "" or die ("Odd non-pattern line '$lastline' for '$case'"); $lastline = $line; } - $hash{$case} = $hash{$case} . $lastline; + $$hashref{$case} = $$hashref{$case} . $lastline; + close($file); } -file_input(); -alpha_output(); +my %hash; + +file_input(\%hash, "MAINTAINERS"); +alpha_output(\%hash, "MAINTAINERS.new"); exit(0); From b95c29a20f070babfea92ab8f741ec94695617d3 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 5 Aug 2017 18:45:49 -0700 Subject: [PATCH 100/109] parse-maintainers: Move matching sections from MAINTAINERS Allow any number of command line arguments to match either the section header or the section contents and create new files. Create MAINTAINERS.new and SECTION.new. This allows scripting of the movement of various sections from MAINTAINERS. Signed-off-by: Joe Perches Signed-off-by: Linus Torvalds --- scripts/parse-maintainers.pl | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/parse-maintainers.pl b/scripts/parse-maintainers.pl index c286154a2b68..e40b53db7f9f 100644 --- a/scripts/parse-maintainers.pl +++ b/scripts/parse-maintainers.pl @@ -109,8 +109,20 @@ sub file_input { } my %hash; +my %new_hash; file_input(\%hash, "MAINTAINERS"); + +foreach my $type (@ARGV) { + foreach my $key (keys %hash) { + if ($key =~ /$type/ || $hash{$key} =~ /$type/) { + $new_hash{$key} = $hash{$key}; + delete $hash{$key}; + } + } +} + alpha_output(\%hash, "MAINTAINERS.new"); +alpha_output(\%new_hash, "SECTION.new"); exit(0); From e718fe450e616227b74d27a233cdf37b4df0c82b Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Thu, 3 Aug 2017 22:54:48 +0200 Subject: [PATCH 101/109] net/mlx4_en: don't set CHECKSUM_COMPLETE on SCTP packets if the NIC fails to validate the checksum on TCP/UDP, and validation of IP checksum is successful, the driver subtracts the pseudo-header checksum from the value obtained by the hardware and sets CHECKSUM_COMPLETE. Don't do that if protocol is IPPROTO_SCTP, otherwise CRC32c validation fails. V2: don't test MLX4_CQE_STATUS_IPV6 if MLX4_CQE_STATUS_IPV4 is set Reported-by: Shuang Li Fixes: f8c6455bb04b ("net/mlx4_en: Extend checksum offloading by CHECKSUM COMPLETE") Signed-off-by: Davide Caratti Acked-by: Saeed Mahameed Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 29 ++++++++++++++-------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 436f7689a032..bf1638044a7a 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -574,16 +574,21 @@ static inline __wsum get_fixed_vlan_csum(__wsum hw_checksum, * header, the HW adds it. To address that, we are subtracting the pseudo * header checksum from the checksum value provided by the HW. */ -static void get_fixed_ipv4_csum(__wsum hw_checksum, struct sk_buff *skb, - struct iphdr *iph) +static int get_fixed_ipv4_csum(__wsum hw_checksum, struct sk_buff *skb, + struct iphdr *iph) { __u16 length_for_csum = 0; __wsum csum_pseudo_header = 0; + __u8 ipproto = iph->protocol; + + if (unlikely(ipproto == IPPROTO_SCTP)) + return -1; length_for_csum = (be16_to_cpu(iph->tot_len) - (iph->ihl << 2)); csum_pseudo_header = csum_tcpudp_nofold(iph->saddr, iph->daddr, - length_for_csum, iph->protocol, 0); + length_for_csum, ipproto, 0); skb->csum = csum_sub(hw_checksum, csum_pseudo_header); + return 0; } #if IS_ENABLED(CONFIG_IPV6) @@ -594,17 +599,20 @@ static void get_fixed_ipv4_csum(__wsum hw_checksum, struct sk_buff *skb, static int get_fixed_ipv6_csum(__wsum hw_checksum, struct sk_buff *skb, struct ipv6hdr *ipv6h) { + __u8 nexthdr = ipv6h->nexthdr; __wsum csum_pseudo_hdr = 0; - if (unlikely(ipv6h->nexthdr == IPPROTO_FRAGMENT || - ipv6h->nexthdr == IPPROTO_HOPOPTS)) + if (unlikely(nexthdr == IPPROTO_FRAGMENT || + nexthdr == IPPROTO_HOPOPTS || + nexthdr == IPPROTO_SCTP)) return -1; - hw_checksum = csum_add(hw_checksum, (__force __wsum)htons(ipv6h->nexthdr)); + hw_checksum = csum_add(hw_checksum, (__force __wsum)htons(nexthdr)); csum_pseudo_hdr = csum_partial(&ipv6h->saddr, sizeof(ipv6h->saddr) + sizeof(ipv6h->daddr), 0); csum_pseudo_hdr = csum_add(csum_pseudo_hdr, (__force __wsum)ipv6h->payload_len); - csum_pseudo_hdr = csum_add(csum_pseudo_hdr, (__force __wsum)ntohs(ipv6h->nexthdr)); + csum_pseudo_hdr = csum_add(csum_pseudo_hdr, + (__force __wsum)htons(nexthdr)); skb->csum = csum_sub(hw_checksum, csum_pseudo_hdr); skb->csum = csum_add(skb->csum, csum_partial(ipv6h, sizeof(struct ipv6hdr), 0)); @@ -627,11 +635,10 @@ static int check_csum(struct mlx4_cqe *cqe, struct sk_buff *skb, void *va, } if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4)) - get_fixed_ipv4_csum(hw_checksum, skb, hdr); + return get_fixed_ipv4_csum(hw_checksum, skb, hdr); #if IS_ENABLED(CONFIG_IPV6) - else if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV6)) - if (unlikely(get_fixed_ipv6_csum(hw_checksum, skb, hdr))) - return -1; + if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV6)) + return get_fixed_ipv6_csum(hw_checksum, skb, hdr); #endif return 0; } From 8e6f1521ec431dbade73f57e21e5dc46eaae50ba Mon Sep 17 00:00:00 2001 From: John Crispin Date: Mon, 7 Aug 2017 16:20:49 +0200 Subject: [PATCH 102/109] net: dsa: mediatek: add adjust link support for user ports Manually adjust the port settings of user ports once PHY polling has completed. This patch extends the adjust_link callback to configure the per port PMCR register, applying the proper values polled from the PHY. Without this patch flow control was not always getting setup properly. Signed-off-by: Shashidhar Lakkavalli Signed-off-by: Muciri Gatimu Signed-off-by: John Crispin Signed-off-by: David S. Miller --- drivers/net/dsa/mt7530.c | 38 ++++++++++++++++++++++++++++++++++++++ drivers/net/dsa/mt7530.h | 1 + 2 files changed, 39 insertions(+) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 1e46418a3b74..264b281eb86b 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -625,6 +625,44 @@ static void mt7530_adjust_link(struct dsa_switch *ds, int port, * all finished. */ mt7623_pad_clk_setup(ds); + } else { + u16 lcl_adv = 0, rmt_adv = 0; + u8 flowctrl; + u32 mcr = PMCR_USERP_LINK | PMCR_FORCE_MODE; + + switch (phydev->speed) { + case SPEED_1000: + mcr |= PMCR_FORCE_SPEED_1000; + break; + case SPEED_100: + mcr |= PMCR_FORCE_SPEED_100; + break; + }; + + if (phydev->link) + mcr |= PMCR_FORCE_LNK; + + if (phydev->duplex) { + mcr |= PMCR_FORCE_FDX; + + if (phydev->pause) + rmt_adv = LPA_PAUSE_CAP; + if (phydev->asym_pause) + rmt_adv |= LPA_PAUSE_ASYM; + + if (phydev->advertising & ADVERTISED_Pause) + lcl_adv |= ADVERTISE_PAUSE_CAP; + if (phydev->advertising & ADVERTISED_Asym_Pause) + lcl_adv |= ADVERTISE_PAUSE_ASYM; + + flowctrl = mii_resolve_flowctrl_fdx(lcl_adv, rmt_adv); + + if (flowctrl & FLOW_CTRL_TX) + mcr |= PMCR_TX_FC_EN; + if (flowctrl & FLOW_CTRL_RX) + mcr |= PMCR_RX_FC_EN; + } + mt7530_write(priv, MT7530_PMCR_P(port), mcr); } } diff --git a/drivers/net/dsa/mt7530.h b/drivers/net/dsa/mt7530.h index b83d76b99802..74db9822eb40 100644 --- a/drivers/net/dsa/mt7530.h +++ b/drivers/net/dsa/mt7530.h @@ -151,6 +151,7 @@ enum mt7530_stp_state { #define PMCR_TX_FC_EN BIT(5) #define PMCR_RX_FC_EN BIT(4) #define PMCR_FORCE_SPEED_1000 BIT(3) +#define PMCR_FORCE_SPEED_100 BIT(2) #define PMCR_FORCE_FDX BIT(1) #define PMCR_FORCE_LNK BIT(0) #define PMCR_COMMON_LINK (PMCR_IFG_XMIT(1) | PMCR_MAC_MODE | \ From ec0acb09313074ba1a4976945791d9c6815f39fb Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 8 Aug 2017 15:25:25 +0800 Subject: [PATCH 103/109] net: sched: set xt_tgchk_param par.net properly in ipt_init_target Now xt_tgchk_param par in ipt_init_target is a local varibale, par.net is not initialized there. Later when xt_check_target calls target's checkentry in which it may access par.net, it would cause kernel panic. Jaroslav found this panic when running: # ip link add TestIface type dummy # tc qd add dev TestIface ingress handle ffff: # tc filter add dev TestIface parent ffff: u32 match u32 0 0 \ action xt -j CONNMARK --set-mark 4 This patch is to pass net param into ipt_init_target and set par.net with it properly in there. v1->v2: As Wang Cong pointed, I missed ipt_net_id != xt_net_id, so fix it by also passing net_id to __tcf_ipt_init. v2->v3: Missed the fixes tag, so add it. Fixes: ecb2421b5ddf ("netfilter: add and use nf_ct_netns_get/put") Reported-by: Jaroslav Aster Signed-off-by: Xin Long Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- net/sched/act_ipt.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 36f0ced9e60c..94ba5cfab860 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -36,8 +36,8 @@ static struct tc_action_ops act_ipt_ops; static unsigned int xt_net_id; static struct tc_action_ops act_xt_ops; -static int ipt_init_target(struct xt_entry_target *t, char *table, - unsigned int hook) +static int ipt_init_target(struct net *net, struct xt_entry_target *t, + char *table, unsigned int hook) { struct xt_tgchk_param par; struct xt_target *target; @@ -49,6 +49,7 @@ static int ipt_init_target(struct xt_entry_target *t, char *table, return PTR_ERR(target); t->u.kernel.target = target; + par.net = net; par.table = table; par.entryinfo = NULL; par.target = target; @@ -91,10 +92,11 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) }, }; -static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, +static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, struct nlattr *est, struct tc_action **a, const struct tc_action_ops *ops, int ovr, int bind) { + struct tc_action_net *tn = net_generic(net, id); struct nlattr *tb[TCA_IPT_MAX + 1]; struct tcf_ipt *ipt; struct xt_entry_target *td, *t; @@ -159,7 +161,7 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, if (unlikely(!t)) goto err2; - err = ipt_init_target(t, tname, hook); + err = ipt_init_target(net, t, tname, hook); if (err < 0) goto err3; @@ -193,18 +195,16 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind) { - struct tc_action_net *tn = net_generic(net, ipt_net_id); - - return __tcf_ipt_init(tn, nla, est, a, &act_ipt_ops, ovr, bind); + return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, ovr, + bind); } static int tcf_xt_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, int ovr, int bind) { - struct tc_action_net *tn = net_generic(net, xt_net_id); - - return __tcf_ipt_init(tn, nla, est, a, &act_xt_ops, ovr, bind); + return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, ovr, + bind); } static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a, From 8ba60924710cde564a3905588b6219741d6356d0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 8 Aug 2017 01:41:58 -0700 Subject: [PATCH 104/109] tcp: fastopen: tcp_connect() must refresh the route With new TCP_FASTOPEN_CONNECT socket option, there is a possibility to call tcp_connect() while socket sk_dst_cache is either NULL or invalid. +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 4 +0 fcntl(4, F_SETFL, O_RDWR|O_NONBLOCK) = 0 +0 setsockopt(4, SOL_TCP, TCP_FASTOPEN_CONNECT, [1], 4) = 0 +0 connect(4, ..., ...) = 0 << sk->sk_dst_cache becomes obsolete, or even set to NULL >> +1 sendto(4, ..., 1000, MSG_FASTOPEN, ..., ...) = 1000 We need to refresh the route otherwise bad things can happen, especially when syzkaller is running on the host :/ Fixes: 19f6d3f3c8422 ("net/tcp-fastopen: Add new API support") Reported-by: Dmitry Vyukov Signed-off-by: Eric Dumazet Cc: Wei Wang Cc: Yuchung Cheng Acked-by: Wei Wang Acked-by: Yuchung Cheng Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 276406a83a37..b7661a68d498 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3436,6 +3436,10 @@ int tcp_connect(struct sock *sk) int err; tcp_call_bpf(sk, BPF_SOCK_OPS_TCP_CONNECT_CB); + + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) + return -EHOSTUNREACH; /* Routing failure or similar. */ + tcp_connect_init(sk); if (unlikely(tp->repair)) { From 05bfd7dbb53a10be4a3e7aebaeec04b558198d49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5kon=20Bugge?= Date: Tue, 8 Aug 2017 11:13:32 +0200 Subject: [PATCH 105/109] rds: Reintroduce statistics counting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In commit 7e3f2952eeb1 ("rds: don't let RDS shutdown a connection while senders are present"), refilling the receive queue was removed from rds_ib_recv(), along with the increment of s_ib_rx_refill_from_thread. Commit 73ce4317bf98 ("RDS: make sure we post recv buffers") re-introduces filling the receive queue from rds_ib_recv(), but does not add the statistics counter. rds_ib_recv() was later renamed to rds_ib_recv_path(). This commit reintroduces the statistics counting of s_ib_rx_refill_from_thread and s_ib_rx_refill_from_cq. Signed-off-by: Håkon Bugge Reviewed-by: Knut Omang Reviewed-by: Wei Lin Guay Reviewed-by: Shamir Rabinovitch Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_recv.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index e10624aa6959..9722bf839d9d 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -1015,8 +1015,10 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, if (rds_ib_ring_empty(&ic->i_recv_ring)) rds_ib_stats_inc(s_ib_rx_ring_empty); - if (rds_ib_ring_low(&ic->i_recv_ring)) + if (rds_ib_ring_low(&ic->i_recv_ring)) { rds_ib_recv_refill(conn, 0, GFP_NOWAIT); + rds_ib_stats_inc(s_ib_rx_refill_from_cq); + } } int rds_ib_recv_path(struct rds_conn_path *cp) @@ -1029,6 +1031,7 @@ int rds_ib_recv_path(struct rds_conn_path *cp) if (rds_conn_up(conn)) { rds_ib_attempt_ack(ic); rds_ib_recv_refill(conn, 0, GFP_KERNEL); + rds_ib_stats_inc(s_ib_rx_refill_from_thread); } return ret; From 0a0e1a85c83775a648041be2b15de6d0a2f2b8eb Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 8 Aug 2017 11:43:24 +0200 Subject: [PATCH 106/109] ppp: fix xmit recursion detection on ppp channels Commit e5dadc65f9e0 ("ppp: Fix false xmit recursion detect with two ppp devices") dropped the xmit_recursion counter incrementation in ppp_channel_push() and relied on ppp_xmit_process() for this task. But __ppp_channel_push() can also send packets directly (using the .start_xmit() channel callback), in which case the xmit_recursion counter isn't incremented anymore. If such packets get routed back to the parent ppp unit, ppp_xmit_process() won't notice the recursion and will call ppp_channel_push() on the same channel, effectively creating the deadlock situation that the xmit_recursion mechanism was supposed to prevent. This patch re-introduces the xmit_recursion counter incrementation in ppp_channel_push(). Since the xmit_recursion variable is now part of the parent ppp unit, incrementation is skipped if the channel doesn't have any. This is fine because only packets routed through the parent unit may enter the channel recursively. Finally, we have to ensure that pch->ppp is not going to be modified while executing ppp_channel_push(). Instead of taking this lock only while calling ppp_xmit_process(), we now have to hold it for the full ppp_channel_push() execution. This respects the ppp locks ordering which requires locking ->upl before ->downl. Fixes: e5dadc65f9e0 ("ppp: Fix false xmit recursion detect with two ppp devices") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- drivers/net/ppp/ppp_generic.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index bd4303944e44..a404552555d4 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1915,21 +1915,23 @@ static void __ppp_channel_push(struct channel *pch) spin_unlock(&pch->downl); /* see if there is anything from the attached unit to be sent */ if (skb_queue_empty(&pch->file.xq)) { - read_lock(&pch->upl); ppp = pch->ppp; if (ppp) - ppp_xmit_process(ppp); - read_unlock(&pch->upl); + __ppp_xmit_process(ppp); } } static void ppp_channel_push(struct channel *pch) { - local_bh_disable(); - - __ppp_channel_push(pch); - - local_bh_enable(); + read_lock_bh(&pch->upl); + if (pch->ppp) { + (*this_cpu_ptr(pch->ppp->xmit_recursion))++; + __ppp_channel_push(pch); + (*this_cpu_ptr(pch->ppp->xmit_recursion))--; + } else { + __ppp_channel_push(pch); + } + read_unlock_bh(&pch->upl); } /* From bbae08e592706dc32e5c7c97827b13c1c178668b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Tue, 8 Aug 2017 18:02:11 +0200 Subject: [PATCH 107/109] qmi_wwan: fix NULL deref on disconnect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit qmi_wwan_disconnect is called twice when disconnecting devices with separate control and data interfaces. The first invocation will set the interface data to NULL for both interfaces to flag that the disconnect has been handled. But the matching NULL check was left out when qmi_wwan_disconnect was added, resulting in this oops: usb 2-1.4: USB disconnect, device number 4 qmi_wwan 2-1.4:1.6 wwp0s29u1u4i6: unregister 'qmi_wwan' usb-0000:00:1d.0-1.4, WWAN/QMI device BUG: unable to handle kernel NULL pointer dereference at 00000000000000e0 IP: qmi_wwan_disconnect+0x25/0xc0 [qmi_wwan] PGD 0 P4D 0 Oops: 0000 [#1] SMP Modules linked in: CPU: 2 PID: 33 Comm: kworker/2:1 Tainted: G E 4.12.3-nr44-normandy-r1500619820+ #1 Hardware name: LENOVO 4291LR7/4291LR7, BIOS CBET4000 4.6-810-g50522254fb 07/21/2017 Workqueue: usb_hub_wq hub_event [usbcore] task: ffff8c882b716040 task.stack: ffffb8e800d84000 RIP: 0010:qmi_wwan_disconnect+0x25/0xc0 [qmi_wwan] RSP: 0018:ffffb8e800d87b38 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: ffff8c8824f3f1d0 RDI: ffff8c8824ef6400 RBP: ffff8c8824ef6400 R08: 0000000000000000 R09: 0000000000000000 R10: ffffb8e800d87780 R11: 0000000000000011 R12: ffffffffc07ea0e8 R13: ffff8c8824e2e000 R14: ffff8c8824e2e098 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8c8835300000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000000e0 CR3: 0000000229ca5000 CR4: 00000000000406e0 Call Trace: ? usb_unbind_interface+0x71/0x270 [usbcore] ? device_release_driver_internal+0x154/0x210 ? qmi_wwan_unbind+0x6d/0xc0 [qmi_wwan] ? usbnet_disconnect+0x6c/0xf0 [usbnet] ? qmi_wwan_disconnect+0x87/0xc0 [qmi_wwan] ? usb_unbind_interface+0x71/0x270 [usbcore] ? device_release_driver_internal+0x154/0x210 Reported-and-tested-by: Nathaniel Roach Fixes: c6adf77953bc ("net: usb: qmi_wwan: add qmap mux protocol support") Cc: Daniele Palmas Signed-off-by: Bjørn Mork Signed-off-by: David S. Miller --- drivers/net/usb/qmi_wwan.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index ff6f39fe6c00..8c3733608271 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1341,10 +1341,14 @@ static int qmi_wwan_probe(struct usb_interface *intf, static void qmi_wwan_disconnect(struct usb_interface *intf) { struct usbnet *dev = usb_get_intfdata(intf); - struct qmi_wwan_state *info = (void *)&dev->data; + struct qmi_wwan_state *info; struct list_head *iter; struct net_device *ldev; + /* called twice if separate control and data intf */ + if (!dev) + return; + info = (void *)&dev->data; if (info->flags & QMI_WWAN_FLAG_MUX) { if (!rtnl_trylock()) { restart_syscall(); From 8d63bee643f1fb53e472f0e135cae4eb99d62d19 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 8 Aug 2017 14:22:55 -0400 Subject: [PATCH 108/109] net: avoid skb_warn_bad_offload false positives on UFO skb_warn_bad_offload triggers a warning when an skb enters the GSO stack at __skb_gso_segment that does not have CHECKSUM_PARTIAL checksum offload set. Commit b2504a5dbef3 ("net: reduce skb_warn_bad_offload() noise") observed that SKB_GSO_DODGY producers can trigger the check and that passing those packets through the GSO handlers will fix it up. But, the software UFO handler will set ip_summed to CHECKSUM_NONE. When __skb_gso_segment is called from the receive path, this triggers the warning again. Make UFO set CHECKSUM_UNNECESSARY instead of CHECKSUM_NONE. On Tx these two are equivalent. On Rx, this better matches the skb state (checksum computed), as CHECKSUM_NONE here means no checksum computed. See also this thread for context: http://patchwork.ozlabs.org/patch/799015/ Fixes: b2504a5dbef3 ("net: reduce skb_warn_bad_offload() noise") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- net/ipv4/udp_offload.c | 2 +- net/ipv6/udp_offload.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 8515f8fe0460..ce15a06d5558 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2739,7 +2739,7 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path) { if (tx_path) return skb->ip_summed != CHECKSUM_PARTIAL && - skb->ip_summed != CHECKSUM_NONE; + skb->ip_summed != CHECKSUM_UNNECESSARY; return skb->ip_summed == CHECKSUM_NONE; } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 781250151d40..0932c85b42af 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -235,7 +235,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (uh->check == 0) uh->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = CHECKSUM_UNNECESSARY; /* If there is no outer header we can fake a checksum offload * due to the fact that we have already done the checksum in diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index a2267f80febb..e7d378c032cb 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -72,7 +72,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (uh->check == 0) uh->check = CSUM_MANGLED_0; - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = CHECKSUM_UNNECESSARY; /* If there is no outer header we can fake a checksum offload * due to the fact that we have already done the checksum in From 48fb6f4db940e92cfb16cd878cddd59ea6120d06 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 9 Aug 2017 08:27:11 +0100 Subject: [PATCH 109/109] futex: Remove unnecessary warning from get_futex_key Commit 65d8fc777f6d ("futex: Remove requirement for lock_page() in get_futex_key()") removed an unnecessary lock_page() with the side-effect that page->mapping needed to be treated very carefully. Two defensive warnings were added in case any assumption was missed and the first warning assumed a correct application would not alter a mapping backing a futex key. Since merging, it has not triggered for any unexpected case but Mark Rutland reported the following bug triggering due to the first warning. kernel BUG at kernel/futex.c:679! Internal error: Oops - BUG: 0 [#1] PREEMPT SMP Modules linked in: CPU: 0 PID: 3695 Comm: syz-executor1 Not tainted 4.13.0-rc3-00020-g307fec773ba3 #3 Hardware name: linux,dummy-virt (DT) task: ffff80001e271780 task.stack: ffff000010908000 PC is at get_futex_key+0x6a4/0xcf0 kernel/futex.c:679 LR is at get_futex_key+0x6a4/0xcf0 kernel/futex.c:679 pc : [] lr : [] pstate: 80000145 The fact that it's a bug instead of a warning was due to an unrelated arm64 problem, but the warning itself triggered because the underlying mapping changed. This is an application issue but from a kernel perspective it's a recoverable situation and the warning is unnecessary so this patch removes the warning. The warning may potentially be triggered with the following test program from Mark although it may be necessary to adjust NR_FUTEX_THREADS to be a value smaller than the number of CPUs in the system. #include #include #include #include #include #include #include #include #define NR_FUTEX_THREADS 16 pthread_t threads[NR_FUTEX_THREADS]; void *mem; #define MEM_PROT (PROT_READ | PROT_WRITE) #define MEM_SIZE 65536 static int futex_wrapper(int *uaddr, int op, int val, const struct timespec *timeout, int *uaddr2, int val3) { syscall(SYS_futex, uaddr, op, val, timeout, uaddr2, val3); } void *poll_futex(void *unused) { for (;;) { futex_wrapper(mem, FUTEX_CMP_REQUEUE_PI, 1, NULL, mem + 4, 1); } } int main(int argc, char *argv[]) { int i; mem = mmap(NULL, MEM_SIZE, MEM_PROT, MAP_SHARED | MAP_ANONYMOUS, -1, 0); printf("Mapping @ %p\n", mem); printf("Creating futex threads...\n"); for (i = 0; i < NR_FUTEX_THREADS; i++) pthread_create(&threads[i], NULL, poll_futex, NULL); printf("Flipping mapping...\n"); for (;;) { mmap(mem, MEM_SIZE, MEM_PROT, MAP_FIXED | MAP_SHARED | MAP_ANONYMOUS, -1, 0); } return 0; } Reported-and-tested-by: Mark Rutland Signed-off-by: Mel Gorman Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org # 4.7+ Signed-off-by: Linus Torvalds --- kernel/futex.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 16dbe4c93895..f50b434756c1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -670,13 +670,14 @@ again: * this reference was taken by ihold under the page lock * pinning the inode in place so i_lock was unnecessary. The * only way for this check to fail is if the inode was - * truncated in parallel so warn for now if this happens. + * truncated in parallel which is almost certainly an + * application bug. In such a case, just retry. * * We are not calling into get_futex_key_refs() in file-backed * cases, therefore a successful atomic_inc return below will * guarantee that get_futex_key() will still imply smp_mb(); (B). */ - if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) { + if (!atomic_inc_not_zero(&inode->i_count)) { rcu_read_unlock(); put_page(page);