From f02ced62ec35e847ebadb1c6759320345c30fcb5 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 9 Sep 2020 20:25:36 +0200 Subject: [PATCH 01/46] selftests: netfilter: add cpu counter check run task on first CPU with netfilter counters reset and check cpu meta after another ping Signed-off-by: Fabian Frederick Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nft_meta.sh | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tools/testing/selftests/netfilter/nft_meta.sh b/tools/testing/selftests/netfilter/nft_meta.sh index d250b84dd5bc..17b2d6eaa204 100755 --- a/tools/testing/selftests/netfilter/nft_meta.sh +++ b/tools/testing/selftests/netfilter/nft_meta.sh @@ -33,6 +33,7 @@ table inet filter { counter infproto4count {} counter il4protocounter {} counter imarkcounter {} + counter icpu0counter {} counter oifcount {} counter oifnamecount {} @@ -54,6 +55,7 @@ table inet filter { meta nfproto ipv4 counter name "infproto4count" meta l4proto icmp counter name "il4protocounter" meta mark 42 counter name "imarkcounter" + meta cpu 0 counter name "icpu0counter" } chain output { @@ -119,6 +121,18 @@ check_one_counter omarkcounter "1" true if [ $ret -eq 0 ];then echo "OK: nftables meta iif/oif counters at expected values" +else + exit $ret +fi + +#First CPU execution and counter +taskset -p 01 $$ > /dev/null +ip netns exec "$ns0" nft reset counters > /dev/null +ip netns exec "$ns0" ping -q -c 1 127.0.0.1 > /dev/null +check_one_counter icpu0counter "2" true + +if [ $ret -eq 0 ];then + echo "OK: nftables meta cpu counter at expected values" fi exit $ret From 5b1a995bfa93667762badbe1d8f0f7ad680bdf29 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 9 Sep 2020 20:26:13 +0200 Subject: [PATCH 02/46] selftests: netfilter: fix nft_meta.sh error reporting When some test directly done with check_one_counter() fails, counter variable is undefined. This patch calls ip with cname which avoids errors like: FAIL: oskuidcounter, want "packets 2", got Error: syntax error, unexpected newline, expecting string list counter inet filter ^ Error is now correctly rendered: FAIL: oskuidcounter, want "packets 2", got table inet filter { counter oskuidcounter { packets 1 bytes 84 } } Signed-off-by: Fabian Frederick Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nft_meta.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/netfilter/nft_meta.sh b/tools/testing/selftests/netfilter/nft_meta.sh index 17b2d6eaa204..1f5b46542c14 100755 --- a/tools/testing/selftests/netfilter/nft_meta.sh +++ b/tools/testing/selftests/netfilter/nft_meta.sh @@ -90,7 +90,7 @@ check_one_counter() if [ $? -ne 0 ];then echo "FAIL: $cname, want \"$want\", got" ret=1 - ip netns exec "$ns0" nft list counter inet filter $counter + ip netns exec "$ns0" nft list counter inet filter $cname fi } From d30a7d54e8481acbb5cc42016c817e598c3d697b Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 9 Sep 2020 20:26:24 +0200 Subject: [PATCH 03/46] selftests: netfilter: remove unused cnt and simplify command testing cnt was not used in nft_meta.sh This patch also fixes 2 shellcheck SC2181 warnings: "check exit code directly with e.g. 'if mycmd;', not indirectly with $?." Signed-off-by: Fabian Frederick Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nft_meta.sh | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/netfilter/nft_meta.sh b/tools/testing/selftests/netfilter/nft_meta.sh index 1f5b46542c14..18a1abca3262 100755 --- a/tools/testing/selftests/netfilter/nft_meta.sh +++ b/tools/testing/selftests/netfilter/nft_meta.sh @@ -7,8 +7,7 @@ ksft_skip=4 sfx=$(mktemp -u "XXXXXXXX") ns0="ns0-$sfx" -nft --version > /dev/null 2>&1 -if [ $? -ne 0 ];then +if ! nft --version > /dev/null 2>&1; then echo "SKIP: Could not run test without nft tool" exit $ksft_skip fi @@ -86,8 +85,7 @@ check_one_counter() local want="packets $2" local verbose="$3" - cnt=$(ip netns exec "$ns0" nft list counter inet filter $cname | grep -q "$want") - if [ $? -ne 0 ];then + if ! ip netns exec "$ns0" nft list counter inet filter $cname | grep -q "$want"; then echo "FAIL: $cname, want \"$want\", got" ret=1 ip netns exec "$ns0" nft list counter inet filter $cname From 48d072c4e8cdb542ade06727c31d7851bcc40a89 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Thu, 24 Sep 2020 12:17:33 +0200 Subject: [PATCH 04/46] selftests: netfilter: add time counter check Check packets are correctly placed in current year. Also do a NULL check for another one. Signed-off-by: Fabian Frederick Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nft_meta.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/netfilter/nft_meta.sh b/tools/testing/selftests/netfilter/nft_meta.sh index 18a1abca3262..087f0e6e71ce 100755 --- a/tools/testing/selftests/netfilter/nft_meta.sh +++ b/tools/testing/selftests/netfilter/nft_meta.sh @@ -23,6 +23,8 @@ ip -net "$ns0" addr add 127.0.0.1 dev lo trap cleanup EXIT +currentyear=$(date +%G) +lastyear=$((currentyear-1)) ip netns exec "$ns0" nft -f /dev/stdin < Date: Tue, 11 Aug 2020 10:15:44 +0200 Subject: [PATCH 05/46] can: m_can_platform: don't call m_can_class_suspend in runtime suspend 0704c5743694 can: m_can_platform: remove unnecessary m_can_class_resume() call removed the m_can_class_resume() call in the runtime resume path to get rid of a infinite recursion, so the runtime resume now only handles the device clocks. Unfortunately it did not remove the complementary m_can_class_suspend() call in the runtime suspend function, so those paths are now unbalanced, which causes the pinctrl state to get stuck on the "sleep" state, which breaks all CAN functionality on SoCs where this state is defined. Remove the m_can_class_suspend() call to fix this. Fixes: 0704c5743694 can: m_can_platform: remove unnecessary m_can_class_resume() call Signed-off-by: Lucas Stach Link: https://lore.kernel.org/r/20200811081545.19921-1-l.stach@pengutronix.de Acked-by: Dan Murphy Signed-off-by: Marc Kleine-Budde --- drivers/net/can/m_can/m_can_platform.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/can/m_can/m_can_platform.c b/drivers/net/can/m_can/m_can_platform.c index 38ea5e600fb8..e6d0cb9ee02f 100644 --- a/drivers/net/can/m_can/m_can_platform.c +++ b/drivers/net/can/m_can/m_can_platform.c @@ -144,8 +144,6 @@ static int __maybe_unused m_can_runtime_suspend(struct device *dev) struct net_device *ndev = dev_get_drvdata(dev); struct m_can_classdev *mcan_class = netdev_priv(ndev); - m_can_class_suspend(dev); - clk_disable_unprepare(mcan_class->cclk); clk_disable_unprepare(mcan_class->hclk); From e009f95b1543e26606dca2f7e6e9f0f9174538e5 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 7 Oct 2020 23:18:21 -0700 Subject: [PATCH 06/46] can: j1935: j1939_tp_tx_dat_new(): fix missing initialization of skbcnt This fixes an uninit-value warning: BUG: KMSAN: uninit-value in can_receive+0x26b/0x630 net/can/af_can.c:650 Reported-and-tested-by: syzbot+3f3837e61a48d32b495f@syzkaller.appspotmail.com Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Cc: Robin van der Gracht Cc: Oleksij Rempel Cc: Pengutronix Kernel Team Cc: Oliver Hartkopp Cc: Marc Kleine-Budde Signed-off-by: Cong Wang Link: https://lore.kernel.org/r/20201008061821.24663-1-xiyou.wangcong@gmail.com Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 0cec4152f979..88cf1062e1e9 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -580,6 +580,7 @@ sk_buff *j1939_tp_tx_dat_new(struct j1939_priv *priv, skb->dev = priv->ndev; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = priv->ndev->ifindex; + can_skb_prv(skb)->skbcnt = 0; /* reserve CAN header */ skb_reserve(skb, offsetof(struct can_frame, data)); From 13ba4c434422837d7c8c163f9c8d854e67bf3c99 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Thu, 8 Oct 2020 23:23:10 +0200 Subject: [PATCH 07/46] net: j1939: j1939_session_fresh_new(): fix missing initialization of skbcnt This patch add the initialization of skbcnt, similar to: e009f95b1543 can: j1935: j1939_tp_tx_dat_new(): fix missing initialization of skbcnt Let's play save and initialize this skbcnt as well. Suggested-by: Jakub Kicinski Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Signed-off-by: Marc Kleine-Budde --- net/can/j1939/transport.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 88cf1062e1e9..e09d087ba240 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -1488,6 +1488,7 @@ j1939_session *j1939_session_fresh_new(struct j1939_priv *priv, skb->dev = priv->ndev; can_skb_reserve(skb); can_skb_prv(skb)->ifindex = priv->ndev->ifindex; + can_skb_prv(skb)->skbcnt = 0; skcb = j1939_skb_to_cb(skb); memcpy(skcb, rel_skcb, sizeof(*skcb)); From 0da1ccbbefb662915228bc17e1c7d4ad28b3ddab Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Tue, 6 Oct 2020 15:52:53 +0200 Subject: [PATCH 08/46] net: fec: Fix PHY init after phy_reset_after_clk_enable() The phy_reset_after_clk_enable() does a PHY reset, which means the PHY loses its register settings. The fec_enet_mii_probe() starts the PHY and does the necessary calls to configure the PHY via PHY framework, and loads the correct register settings into the PHY. Therefore, fec_enet_mii_probe() should be called only after the PHY has been reset, not before as it is now. Fixes: 1b0a83ac04e3 ("net: fec: add phy_reset_after_clk_enable() support") Reviewed-by: Andrew Lunn Tested-by: Richard Leitner Signed-off-by: Marek Vasut Cc: Christoph Niedermaier Cc: David S. Miller Cc: NXP Linux Team Cc: Shawn Guo Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/fec_main.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index fb37816a74db..23abe7f6cad0 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -2984,17 +2984,17 @@ fec_enet_open(struct net_device *ndev) /* Init MAC prior to mii bus probe */ fec_restart(ndev); - /* Probe and connect to PHY when open the interface */ - ret = fec_enet_mii_probe(ndev); - if (ret) - goto err_enet_mii_probe; - /* Call phy_reset_after_clk_enable() again if it failed during * phy_reset_after_clk_enable() before because the PHY wasn't probed. */ if (reset_again) phy_reset_after_clk_enable(ndev->phydev); + /* Probe and connect to PHY when open the interface */ + ret = fec_enet_mii_probe(ndev); + if (ret) + goto err_enet_mii_probe; + if (fep->quirks & FEC_QUIRK_ERR006687) imx6q_cpuidle_fec_irqs_used(); From 37198e93ced70733f0b993dff28b7c33857e254f Mon Sep 17 00:00:00 2001 From: Davide Caratti Date: Tue, 6 Oct 2020 18:26:17 +0200 Subject: [PATCH 09/46] net: mptcp: make DACK4/DACK8 usage consistent among all subflows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit using packetdrill it's possible to observe the same MPTCP DSN being acked by different subflows with DACK4 and DACK8. This is in contrast with what specified in RFC8684 ยง3.3.2: if an MPTCP endpoint transmits a 64-bit wide DSN, it MUST be acknowledged with a 64-bit wide DACK. Fix 'use_64bit_ack' variable to make it a property of MPTCP sockets, not TCP subflows. Fixes: a0c1d0eafd1e ("mptcp: Use 32-bit DATA_ACK when possible") Acked-by: Paolo Abeni Signed-off-by: Davide Caratti Reviewed-by: Mat Martineau Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 2 +- net/mptcp/protocol.h | 2 +- net/mptcp/subflow.c | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 888bbbbb3e8a..9d7fa93fe0cf 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -516,7 +516,7 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, return ret; } - if (subflow->use_64bit_ack) { + if (READ_ONCE(msk->use_64bit_ack)) { ack_size = TCPOLEN_MPTCP_DSS_ACK64; opts->ext_copy.data_ack = READ_ONCE(msk->ack_seq); opts->ext_copy.ack64 = 1; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 20f04ac85409..285dd8b2b43a 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -202,6 +202,7 @@ struct mptcp_sock { bool fully_established; bool rcv_data_fin; bool snd_data_fin_enable; + bool use_64bit_ack; /* Set when we received a 64-bit DSN */ spinlock_t join_list_lock; struct work_struct work; struct list_head conn_list; @@ -294,7 +295,6 @@ struct mptcp_subflow_context { backup : 1, data_avail : 1, rx_eof : 1, - use_64bit_ack : 1, /* Set when we received a 64-bit DSN */ can_ack : 1; /* only after processing the remote a key */ u32 remote_nonce; u64 thmac; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 6f035af1c9d2..91bef7bfffa6 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -769,12 +769,11 @@ static enum mapping_status get_mapping_status(struct sock *ssk, if (!mpext->dsn64) { map_seq = expand_seq(subflow->map_seq, subflow->map_data_len, mpext->data_seq); - subflow->use_64bit_ack = 0; pr_debug("expanded seq=%llu", subflow->map_seq); } else { map_seq = mpext->data_seq; - subflow->use_64bit_ack = 1; } + WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64); if (subflow->map_valid) { /* Allow replacing only with an identical map */ From d1704382821032fede445b816f4296fd379baacf Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Fri, 9 Oct 2020 15:28:48 -0500 Subject: [PATCH 10/46] net: ipa: skip suspend/resume activities if not set up When processing a system suspend request we suspend modem endpoints if they are enabled, and call ipa_cmd_tag_process() (which issues IPA commands) to ensure the IPA pipeline is cleared. It is an error to attempt to issue an IPA command before setup is complete, so this is clearly a bug. But we also shouldn't suspend or resume any endpoints that have not been set up. Have ipa_endpoint_suspend() and ipa_endpoint_resume() immediately return if setup hasn't completed, to avoid any attempt to configure endpoints or issue IPA commands in that case. Fixes: 84f9bd12d46d ("soc: qcom: ipa: IPA endpoints") Tested-by: Matthias Kaehlcke Signed-off-by: Alex Elder Signed-off-by: Jakub Kicinski --- drivers/net/ipa/ipa_endpoint.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ipa/ipa_endpoint.c b/drivers/net/ipa/ipa_endpoint.c index b7efd7c95e9c..ed60fa5bcdac 100644 --- a/drivers/net/ipa/ipa_endpoint.c +++ b/drivers/net/ipa/ipa_endpoint.c @@ -1471,6 +1471,9 @@ void ipa_endpoint_resume_one(struct ipa_endpoint *endpoint) void ipa_endpoint_suspend(struct ipa *ipa) { + if (!ipa->setup_complete) + return; + if (ipa->modem_netdev) ipa_modem_suspend(ipa->modem_netdev); @@ -1482,6 +1485,9 @@ void ipa_endpoint_suspend(struct ipa *ipa) void ipa_endpoint_resume(struct ipa *ipa) { + if (!ipa->setup_complete) + return; + ipa_endpoint_resume_one(ipa->name_map[IPA_ENDPOINT_AP_COMMAND_TX]); ipa_endpoint_resume_one(ipa->name_map[IPA_ENDPOINT_AP_LAN_RX]); From 4a65dff81a04f874fa6915c7f069b4dc2c4010e4 Mon Sep 17 00:00:00 2001 From: Georg Kohmann Date: Wed, 7 Oct 2020 14:53:02 +0200 Subject: [PATCH 11/46] net: ipv6: Discard next-hop MTU less than minimum link MTU When a ICMPV6_PKT_TOOBIG report a next-hop MTU that is less than the IPv6 minimum link MTU, the estimated path MTU is reduced to the minimum link MTU. This behaviour breaks TAHI IPv6 Core Conformance Test v6LC4.1.6: Packet Too Big Less than IPv6 MTU. Referring to RFC 8201 section 4: "If a node receives a Packet Too Big message reporting a next-hop MTU that is less than the IPv6 minimum link MTU, it must discard it. A node must not reduce its estimate of the Path MTU below the IPv6 minimum link MTU on receipt of a Packet Too Big message." Drop the path MTU update if reported MTU is less than the minimum link MTU. Signed-off-by: Georg Kohmann Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index fb075d9545b9..27430d6e1a99 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2745,7 +2745,8 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (confirm_neigh) dst_confirm_neigh(dst, daddr); - mtu = max_t(u32, mtu, IPV6_MIN_MTU); + if (mtu < IPV6_MIN_MTU) + return; if (mtu >= dst_mtu(dst)) return; From 394039fe2cfd5fb15851da19e2981649585fc245 Mon Sep 17 00:00:00 2001 From: Naoki Hayama Date: Thu, 8 Oct 2020 17:47:31 +0900 Subject: [PATCH 12/46] net: tlan: Fix typo abitrary Fix comment typo. s/abitrary/arbitrary/ Signed-off-by: Naoki Hayama Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ti/tlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/tlan.c b/drivers/net/ethernet/ti/tlan.c index 76a342ea3797..e922258cae3f 100644 --- a/drivers/net/ethernet/ti/tlan.c +++ b/drivers/net/ethernet/ti/tlan.c @@ -2511,7 +2511,7 @@ static void tlan_phy_power_down(struct net_device *dev) } /* Wait for 50 ms and powerup - * This is abitrary. It is intended to make sure the + * This is arbitrary. It is intended to make sure the * transceiver settles. */ tlan_set_timer(dev, msecs_to_jiffies(50), TLAN_TIMER_PHY_PUP); From ea1dd3e9d080c961b9a451130b61c72dc9a5397b Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Thu, 8 Oct 2020 00:10:21 +0530 Subject: [PATCH 13/46] net/tls: sendfile fails with ktls offload At first when sendpage gets called, if there is more data, 'more' in tls_push_data() gets set which later sets pending_open_record_frags, but when there is no more data in file left, and last time tls_push_data() gets called, pending_open_record_frags doesn't get reset. And later when 2 bytes of encrypted alert comes as sendmsg, it first checks for pending_open_record_frags, and since this is set, it creates a record with 0 data bytes to encrypt, meaning record length is prepend_size + tag_size only, which causes problem. We should set/reset pending_open_record_frags based on more bit. Fixes: e8f69799810c ("net/tls: Add generic NIC offload infrastructure") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- net/tls/tls_device.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index b74e2741f74f..cec86229a6a0 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -418,14 +418,14 @@ static int tls_push_data(struct sock *sk, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); - int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE); struct tls_record_info *record = ctx->open_record; int tls_push_record_flags; struct page_frag *pfrag; size_t orig_size = size; u32 max_open_record_len; - int copy, rc = 0; + bool more = false; bool done = false; + int copy, rc = 0; long timeo; if (flags & @@ -492,9 +492,8 @@ handle_error: if (!size) { last_record: tls_push_record_flags = flags; - if (more) { - tls_ctx->pending_open_record_frags = - !!record->num_frags; + if (flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE)) { + more = true; break; } @@ -526,6 +525,8 @@ last_record: } } while (!done); + tls_ctx->pending_open_record_frags = more; + if (orig_size - size > 0) rc = orig_size - size; From 923527dcb4d164925a2fed0b53c6a1625a60a472 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 8 Oct 2020 22:49:00 -0700 Subject: [PATCH 14/46] net/tls: remove a duplicate function prototype Remove one of the two instances of the function prototype for tls_validate_xmit_skb(). Signed-off-by: Randy Dunlap Cc: Boris Pismenny Cc: Aviad Yehezkel Cc: John Fastabend Cc: Daniel Borkmann Signed-off-by: Jakub Kicinski --- include/net/tls.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/net/tls.h b/include/net/tls.h index e5dac7e74e79..baf1e99d8193 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -679,10 +679,6 @@ int decrypt_skb(struct sock *sk, struct sk_buff *skb, struct scatterlist *sgout); struct sk_buff *tls_encrypt_skb(struct sk_buff *skb); -struct sk_buff *tls_validate_xmit_skb(struct sock *sk, - struct net_device *dev, - struct sk_buff *skb); - int tls_sw_fallback_init(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info); From ed42989eab57d619667d7e87dfbd8fe207db54fe Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 7 Oct 2020 21:12:50 -0700 Subject: [PATCH 15/46] tipc: fix the skb_unshare() in tipc_buf_append() skb_unshare() drops a reference count on the old skb unconditionally, so in the failure case, we end up freeing the skb twice here. And because the skb is allocated in fclone and cloned by caller tipc_msg_reassemble(), the consequence is actually freeing the original skb too, thus triggered the UAF by syzbot. Fix this by replacing this skb_unshare() with skb_cloned()+skb_copy(). Fixes: ff48b6222e65 ("tipc: use skb_unshare() instead in tipc_buf_append()") Reported-and-tested-by: syzbot+e96a7ba46281824cc46a@syzkaller.appspotmail.com Cc: Jon Maloy Cc: Ying Xue Signed-off-by: Cong Wang Reviewed-by: Xin Long Signed-off-by: Jakub Kicinski --- net/tipc/msg.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 52e93ba4d8e2..681224401871 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -150,7 +150,8 @@ int tipc_buf_append(struct sk_buff **headbuf, struct sk_buff **buf) if (fragid == FIRST_FRAGMENT) { if (unlikely(head)) goto err; - frag = skb_unshare(frag, GFP_ATOMIC); + if (skb_cloned(frag)) + frag = skb_copy(frag, GFP_ATOMIC); if (unlikely(!frag)) goto err; head = *headbuf = frag; From 7b50ee3dad2581dc022b4e32e55964d4fcdccf20 Mon Sep 17 00:00:00 2001 From: Hoang Huu Le Date: Thu, 8 Oct 2020 14:31:56 +0700 Subject: [PATCH 16/46] tipc: fix NULL pointer dereference in tipc_named_rcv In the function node_lost_contact(), we call __skb_queue_purge() without grabbing the list->lock. This can cause to a race-condition why processing the list 'namedq' in calling path tipc_named_rcv()->tipc_named_dequeue(). [] BUG: kernel NULL pointer dereference, address: 0000000000000000 [] #PF: supervisor read access in kernel mode [] #PF: error_code(0x0000) - not-present page [] PGD 7ca63067 P4D 7ca63067 PUD 6c553067 PMD 0 [] Oops: 0000 [#1] SMP NOPTI [] CPU: 1 PID: 15 Comm: ksoftirqd/1 Tainted: G O 5.9.0-rc6+ #2 [] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS [...] [] RIP: 0010:tipc_named_rcv+0x103/0x320 [tipc] [] Code: 41 89 44 24 10 49 8b 16 49 8b 46 08 49 c7 06 00 00 00 [...] [] RSP: 0018:ffffc900000a7c58 EFLAGS: 00000282 [] RAX: 00000000000012ec RBX: 0000000000000000 RCX: ffff88807bde1270 [] RDX: 0000000000002c7c RSI: 0000000000002c7c RDI: ffff88807b38f1a8 [] RBP: ffff88807b006288 R08: ffff88806a367800 R09: ffff88806a367900 [] R10: ffff88806a367a00 R11: ffff88806a367b00 R12: ffff88807b006258 [] R13: ffff88807b00628a R14: ffff888069334d00 R15: ffff88806a434600 [] FS: 0000000000000000(0000) GS:ffff888079480000(0000) knlGS:0[...] [] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [] CR2: 0000000000000000 CR3: 0000000077320000 CR4: 00000000000006e0 [] Call Trace: [] ? tipc_bcast_rcv+0x9a/0x1a0 [tipc] [] tipc_rcv+0x40d/0x670 [tipc] [] ? _raw_spin_unlock+0xa/0x20 [] tipc_l2_rcv_msg+0x55/0x80 [tipc] [] __netif_receive_skb_one_core+0x8c/0xa0 [] process_backlog+0x98/0x140 [] net_rx_action+0x13a/0x420 [] __do_softirq+0xdb/0x316 [] ? smpboot_thread_fn+0x2f/0x1e0 [] ? smpboot_thread_fn+0x74/0x1e0 [] ? smpboot_thread_fn+0x14e/0x1e0 [] run_ksoftirqd+0x1a/0x40 [] smpboot_thread_fn+0x149/0x1e0 [] ? sort_range+0x20/0x20 [] kthread+0x131/0x150 [] ? kthread_unuse_mm+0xa0/0xa0 [] ret_from_fork+0x22/0x30 [] Modules linked in: veth tipc(O) ip6_udp_tunnel udp_tunnel [...] [] CR2: 0000000000000000 [] ---[ end trace 65c276a8e2e2f310 ]--- To fix this, we need to grab the lock of the 'namedq' list on both path calling. Fixes: cad2929dc432 ("tipc: update a binding service via broadcast") Acked-by: Jon Maloy Signed-off-by: Hoang Huu Le Signed-off-by: Jakub Kicinski --- net/tipc/name_distr.c | 10 +++++++++- net/tipc/node.c | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 2f9c148f17e2..fe4edce459ad 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -327,8 +327,13 @@ static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq, struct tipc_msg *hdr; u16 seqno; + spin_lock_bh(&namedq->lock); skb_queue_walk_safe(namedq, skb, tmp) { - skb_linearize(skb); + if (unlikely(skb_linearize(skb))) { + __skb_unlink(skb, namedq); + kfree_skb(skb); + continue; + } hdr = buf_msg(skb); seqno = msg_named_seqno(hdr); if (msg_is_last_bulk(hdr)) { @@ -338,12 +343,14 @@ static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq, if (msg_is_bulk(hdr) || msg_is_legacy(hdr)) { __skb_unlink(skb, namedq); + spin_unlock_bh(&namedq->lock); return skb; } if (*open && (*rcv_nxt == seqno)) { (*rcv_nxt)++; __skb_unlink(skb, namedq); + spin_unlock_bh(&namedq->lock); return skb; } @@ -353,6 +360,7 @@ static struct sk_buff *tipc_named_dequeue(struct sk_buff_head *namedq, continue; } } + spin_unlock_bh(&namedq->lock); return NULL; } diff --git a/net/tipc/node.c b/net/tipc/node.c index 4edcee3088da..e4cf515e323f 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1485,7 +1485,7 @@ static void node_lost_contact(struct tipc_node *n, /* Clean up broadcast state */ tipc_bcast_remove_peer(n->net, n->bc_entry.link); - __skb_queue_purge(&n->bc_entry.namedq); + skb_queue_purge(&n->bc_entry.namedq); /* Abort any ongoing link failover */ for (i = 0; i < MAX_BEARERS; i++) { From 7e94e46c16d060fe272b658ad5e7c2374eeec1bb Mon Sep 17 00:00:00 2001 From: Pujin Shi Date: Thu, 8 Oct 2020 20:19:28 +0800 Subject: [PATCH 17/46] net: smc: fix missing brace warning for old compilers For older versions of gcc, the array = {0}; will cause warnings: net/smc/smc_llc.c: In function 'smc_llc_send_link_delete_all': net/smc/smc_llc.c:1317:9: warning: missing braces around initializer [-Wmissing-braces] struct smc_llc_msg_del_link delllc = {0}; ^ net/smc/smc_llc.c:1317:9: warning: (near initialization for 'delllc.hd') [-Wmissing-braces] 1 warnings generated Fixes: f3811fd7bc97 ("net/smc: send DELETE_LINK, ALL message and wait for send to complete") Signed-off-by: Pujin Shi Acked-by: Karsten Graul Signed-off-by: Jakub Kicinski --- net/smc/smc_llc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 3ea33466ebe9..19c8ff7a3609 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1314,7 +1314,7 @@ out: */ void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn) { - struct smc_llc_msg_del_link delllc = {0}; + struct smc_llc_msg_del_link delllc = {}; int i; delllc.hd.common.type = SMC_LLC_DELETE_LINK; From 16cb3653803dee243547f95cb51f01bec1323cdf Mon Sep 17 00:00:00 2001 From: Pujin Shi Date: Thu, 8 Oct 2020 20:19:29 +0800 Subject: [PATCH 18/46] net: smc: fix missing brace warning for old compilers For older versions of gcc, the array = {0}; will cause warnings: net/smc/smc_llc.c: In function 'smc_llc_add_link_local': net/smc/smc_llc.c:1212:9: warning: missing braces around initializer [-Wmissing-braces] struct smc_llc_msg_add_link add_llc = {0}; ^ net/smc/smc_llc.c:1212:9: warning: (near initialization for 'add_llc.hd') [-Wmissing-braces] net/smc/smc_llc.c: In function 'smc_llc_srv_delete_link_local': net/smc/smc_llc.c:1245:9: warning: missing braces around initializer [-Wmissing-braces] struct smc_llc_msg_del_link del_llc = {0}; ^ net/smc/smc_llc.c:1245:9: warning: (near initialization for 'del_llc.hd') [-Wmissing-braces] 2 warnings generated Fixes: 4dadd151b265 ("net/smc: enqueue local LLC messages") Signed-off-by: Pujin Shi Acked-by: Karsten Graul Signed-off-by: Jakub Kicinski --- net/smc/smc_llc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 19c8ff7a3609..f5f6487bb847 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -1209,7 +1209,7 @@ static void smc_llc_process_srv_add_link(struct smc_link_group *lgr) /* enqueue a local add_link req to trigger a new add_link flow */ void smc_llc_add_link_local(struct smc_link *link) { - struct smc_llc_msg_add_link add_llc = {0}; + struct smc_llc_msg_add_link add_llc = {}; add_llc.hd.length = sizeof(add_llc); add_llc.hd.common.type = SMC_LLC_ADD_LINK; @@ -1242,7 +1242,7 @@ out: */ void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id) { - struct smc_llc_msg_del_link del_llc = {0}; + struct smc_llc_msg_del_link del_llc = {}; del_llc.hd.length = sizeof(del_llc); del_llc.hd.common.type = SMC_LLC_DELETE_LINK; From d582484726c4c46c8580923e855665fb91e3463e Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 9 Oct 2020 19:00:00 +0200 Subject: [PATCH 19/46] mptcp: fix fallback for MP_JOIN subflows Additional/MP_JOIN subflows that do not pass some initial handshake tests currently causes fallback to TCP. That is an RFC violation: we should instead reset the subflow and leave the the msk untouched. Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/91 Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests") Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/mptcp/options.c | 32 +++++++++++++++++++++++++------- net/mptcp/protocol.h | 1 + net/mptcp/subflow.c | 10 ++++++++-- 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/net/mptcp/options.c b/net/mptcp/options.c index 9d7fa93fe0cf..b2f018d32840 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -626,6 +626,12 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, if (unlikely(mptcp_check_fallback(sk))) return false; + /* prevent adding of any MPTCP related options on reset packet + * until we support MP_TCPRST/MP_FASTCLOSE + */ + if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST)) + return false; + if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) ret = true; else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, @@ -676,7 +682,7 @@ bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, return false; } -static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, +static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk, struct mptcp_subflow_context *subflow, struct sk_buff *skb, struct mptcp_options_received *mp_opt) @@ -693,15 +699,20 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq && subflow->mp_join && mp_opt->mp_join && READ_ONCE(msk->pm.server_side)) - tcp_send_ack(sk); + tcp_send_ack(ssk); goto fully_established; } - /* we should process OoO packets before the first subflow is fully - * established, but not expected for MP_JOIN subflows + /* we must process OoO packets before the first subflow is fully + * established. OoO packets are instead a protocol violation + * for MP_JOIN subflows as the peer must not send any data + * before receiving the forth ack - cfr. RFC 8684 section 3.2. */ - if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) + if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) { + if (subflow->mp_join) + goto reset; return subflow->mp_capable; + } if (mp_opt->dss && mp_opt->use_ack) { /* subflows are fully established as soon as we get any @@ -713,9 +724,12 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, } /* If the first established packet does not contain MP_CAPABLE + data - * then fallback to TCP + * then fallback to TCP. Fallback scenarios requires a reset for + * MP_JOIN subflows. */ if (!mp_opt->mp_capable) { + if (subflow->mp_join) + goto reset; subflow->mp_capable = 0; pr_fallback(msk); __mptcp_do_fallback(msk); @@ -732,12 +746,16 @@ fully_established: subflow->pm_notified = 1; if (subflow->mp_join) { - clear_3rdack_retransmission(sk); + clear_3rdack_retransmission(ssk); mptcp_pm_subflow_established(msk, subflow); } else { mptcp_pm_fully_established(msk); } return true; + +reset: + mptcp_subflow_reset(ssk); + return false; } static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 285dd8b2b43a..a26b33556e9b 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -348,6 +348,7 @@ void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, struct mptcp_options_received *mp_opt); bool mptcp_subflow_data_available(struct sock *sk); void __init mptcp_subflow_init(void); +void mptcp_subflow_reset(struct sock *ssk); /* called with sk socket lock held */ int __mptcp_subflow_connect(struct sock *sk, int ifindex, diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 91bef7bfffa6..298bcfdd05f1 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -270,6 +270,13 @@ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow) return thmac == subflow->thmac; } +void mptcp_subflow_reset(struct sock *ssk) +{ + tcp_set_state(ssk, TCP_CLOSE); + tcp_send_active_reset(ssk, GFP_ATOMIC); + tcp_done(ssk); +} + static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@ -342,8 +349,7 @@ fallback: return; do_reset: - tcp_send_active_reset(sk, GFP_ATOMIC); - tcp_done(sk); + mptcp_subflow_reset(sk); } struct request_sock_ops mptcp_subflow_request_sock_ops; From 0e4f35d7880157ceccf0a58377d778b02762af82 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 9 Oct 2020 19:00:01 +0200 Subject: [PATCH 20/46] mptcp: subflows garbage collection The msk can close MP_JOIN subflows if the initial handshake fails. Currently such subflows are kept alive in the conn_list until the msk itself is closed. Beyond the wasted memory, we could end-up sending the DATA_FIN and the DATA_FIN ack on such socket, even after a reset. Fixes: 43b54c6ee382 ("mptcp: Use full MPTCP-level disconnect state machine") Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 17 +++++++++++++++++ net/mptcp/protocol.h | 1 + net/mptcp/subflow.c | 6 ++++++ 3 files changed, 24 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5d747c6a610e..b295eb6e9580 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1383,6 +1383,20 @@ static void pm_work(struct mptcp_sock *msk) spin_unlock_bh(&msk->pm.lock); } +static void __mptcp_close_subflow(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow, *tmp; + + list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if (inet_sk_state_load(ssk) != TCP_CLOSE) + continue; + + __mptcp_close_ssk((struct sock *)msk, ssk, subflow, 0); + } +} + static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); @@ -1400,6 +1414,9 @@ static void mptcp_worker(struct work_struct *work) mptcp_clean_una(sk); mptcp_check_data_fin_ack(sk); __mptcp_flush_join_list(msk); + if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) + __mptcp_close_subflow(msk); + __mptcp_move_skbs(msk); if (msk->pm.status) diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index a26b33556e9b..972463642690 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -90,6 +90,7 @@ #define MPTCP_WORK_RTX 2 #define MPTCP_WORK_EOF 3 #define MPTCP_FALLBACK_DONE 4 +#define MPTCP_WORK_CLOSE_SUBFLOW 5 struct mptcp_options_received { u64 sndr_key; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 298bcfdd05f1..559f5bbd9622 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -272,9 +272,15 @@ static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow) void mptcp_subflow_reset(struct sock *ssk) { + struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + struct sock *sk = subflow->conn; + tcp_set_state(ssk, TCP_CLOSE); tcp_send_active_reset(ssk, GFP_ATOMIC); tcp_done(ssk); + if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) && + schedule_work(&mptcp_sk(sk)->work)) + sock_hold(sk); } static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb) From 874fb9e2ca949b443cc419a4f2227cafd4381d39 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 9 Oct 2020 11:01:01 -0700 Subject: [PATCH 21/46] ipv4: Restore flowi4_oif update before call to xfrm_lookup_route Tobias reported regressions in IPsec tests following the patch referenced by the Fixes tag below. The root cause is dropping the reset of the flowi4_oif after the fib_lookup. Apparently it is needed for xfrm cases, so restore the oif update to ip_route_output_flow right before the call to xfrm_lookup_route. Fixes: 2fbc6e89b2f1 ("ipv4: Update exception handling for multipath routes via same device") Reported-by: Tobias Brunner Signed-off-by: David Ahern Signed-off-by: Jakub Kicinski --- net/ipv4/route.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 58642b29a499..9bd30fd4de4b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2769,10 +2769,12 @@ struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, if (IS_ERR(rt)) return rt; - if (flp4->flowi4_proto) + if (flp4->flowi4_proto) { + flp4->flowi4_oif = rt->dst.dev->ifindex; rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, flowi4_to_flowi(flp4), sk, 0); + } return rt; } From ea2f7da1799bfc5a98b8e0c9e9cf1a7fedc32549 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 9 Oct 2020 13:58:34 +0200 Subject: [PATCH 22/46] selftests: netfilter: extend nfqueue test case add a test with re-queueing: usespace doesn't pass accept verdict, but tells to re-queue to another nf_queue instance. Also, make the second nf-queue program use non-gso mode, kernel will have to perform software segmentation. Lastly, do not queue every packet, just one per second, and add delay when re-injecting the packet to the kernel. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- tools/testing/selftests/netfilter/nf-queue.c | 61 +++++++++++++--- .../testing/selftests/netfilter/nft_queue.sh | 70 +++++++++++++++---- 2 files changed, 109 insertions(+), 22 deletions(-) diff --git a/tools/testing/selftests/netfilter/nf-queue.c b/tools/testing/selftests/netfilter/nf-queue.c index 29c73bce38fa..9e56b9d47037 100644 --- a/tools/testing/selftests/netfilter/nf-queue.c +++ b/tools/testing/selftests/netfilter/nf-queue.c @@ -17,9 +17,12 @@ struct options { bool count_packets; + bool gso_enabled; int verbose; unsigned int queue_num; unsigned int timeout; + uint32_t verdict; + uint32_t delay_ms; }; static unsigned int queue_stats[5]; @@ -27,7 +30,7 @@ static struct options opts; static void help(const char *p) { - printf("Usage: %s [-c|-v [-vv] ] [-t timeout] [-q queue_num]\n", p); + printf("Usage: %s [-c|-v [-vv] ] [-t timeout] [-q queue_num] [-Qdst_queue ] [ -d ms_delay ] [-G]\n", p); } static int parse_attr_cb(const struct nlattr *attr, void *data) @@ -162,7 +165,7 @@ nfq_build_cfg_params(char *buf, uint8_t mode, int range, int queue_num) } static struct nlmsghdr * -nfq_build_verdict(char *buf, int id, int queue_num, int verd) +nfq_build_verdict(char *buf, int id, int queue_num, uint32_t verd) { struct nfqnl_msg_verdict_hdr vh = { .verdict = htonl(verd), @@ -189,9 +192,6 @@ static void print_stats(void) unsigned int last, total; int i; - if (!opts.count_packets) - return; - total = 0; last = queue_stats[0]; @@ -234,7 +234,8 @@ struct mnl_socket *open_queue(void) nlh = nfq_build_cfg_params(buf, NFQNL_COPY_PACKET, 0xFFFF, queue_num); - flags = NFQA_CFG_F_GSO | NFQA_CFG_F_UID_GID; + flags = opts.gso_enabled ? NFQA_CFG_F_GSO : 0; + flags |= NFQA_CFG_F_UID_GID; mnl_attr_put_u32(nlh, NFQA_CFG_FLAGS, htonl(flags)); mnl_attr_put_u32(nlh, NFQA_CFG_MASK, htonl(flags)); @@ -255,6 +256,17 @@ struct mnl_socket *open_queue(void) return nl; } +static void sleep_ms(uint32_t delay) +{ + struct timespec ts = { .tv_sec = delay / 1000 }; + + delay %= 1000; + + ts.tv_nsec = delay * 1000llu * 1000llu; + + nanosleep(&ts, NULL); +} + static int mainloop(void) { unsigned int buflen = 64 * 1024 + MNL_SOCKET_BUFFER_SIZE; @@ -278,7 +290,7 @@ static int mainloop(void) ret = mnl_socket_recvfrom(nl, buf, buflen); if (ret == -1) { - if (errno == ENOBUFS) + if (errno == ENOBUFS || errno == EINTR) continue; if (errno == EAGAIN) { @@ -298,7 +310,10 @@ static int mainloop(void) } id = ret - MNL_CB_OK; - nlh = nfq_build_verdict(buf, id, opts.queue_num, NF_ACCEPT); + if (opts.delay_ms) + sleep_ms(opts.delay_ms); + + nlh = nfq_build_verdict(buf, id, opts.queue_num, opts.verdict); if (mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0) { perror("mnl_socket_sendto"); exit(EXIT_FAILURE); @@ -314,7 +329,7 @@ static void parse_opts(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "chvt:q:")) != -1) { + while ((c = getopt(argc, argv, "chvt:q:Q:d:G")) != -1) { switch (c) { case 'c': opts.count_packets = true; @@ -328,20 +343,48 @@ static void parse_opts(int argc, char **argv) if (opts.queue_num > 0xffff) opts.queue_num = 0; break; + case 'Q': + opts.verdict = atoi(optarg); + if (opts.verdict > 0xffff) { + fprintf(stderr, "Expected destination queue number\n"); + exit(1); + } + + opts.verdict <<= 16; + opts.verdict |= NF_QUEUE; + break; + case 'd': + opts.delay_ms = atoi(optarg); + if (opts.delay_ms == 0) { + fprintf(stderr, "Expected nonzero delay (in milliseconds)\n"); + exit(1); + } + break; case 't': opts.timeout = atoi(optarg); break; + case 'G': + opts.gso_enabled = false; + break; case 'v': opts.verbose++; break; } } + + if (opts.verdict != NF_ACCEPT && (opts.verdict >> 16 == opts.queue_num)) { + fprintf(stderr, "Cannot use same destination and source queue\n"); + exit(1); + } } int main(int argc, char *argv[]) { int ret; + opts.verdict = NF_ACCEPT; + opts.gso_enabled = true; + parse_opts(argc, argv); ret = mainloop(); diff --git a/tools/testing/selftests/netfilter/nft_queue.sh b/tools/testing/selftests/netfilter/nft_queue.sh index 6898448b4266..3d202b90b33d 100755 --- a/tools/testing/selftests/netfilter/nft_queue.sh +++ b/tools/testing/selftests/netfilter/nft_queue.sh @@ -12,6 +12,7 @@ sfx=$(mktemp -u "XXXXXXXX") ns1="ns1-$sfx" ns2="ns2-$sfx" nsrouter="nsrouter-$sfx" +timeout=4 cleanup() { @@ -20,6 +21,7 @@ cleanup() ip netns del ${nsrouter} rm -f "$TMPFILE0" rm -f "$TMPFILE1" + rm -f "$TMPFILE2" "$TMPFILE3" } nft --version > /dev/null 2>&1 @@ -42,6 +44,8 @@ fi TMPFILE0=$(mktemp) TMPFILE1=$(mktemp) +TMPFILE2=$(mktemp) +TMPFILE3=$(mktemp) trap cleanup EXIT ip netns add ${ns1} @@ -83,7 +87,7 @@ load_ruleset() { local name=$1 local prio=$2 -ip netns exec ${nsrouter} nft -f - < /dev/null + ip netns exec ${ns1} ping -W 2 -c 1 -q 10.0.2.99 > /dev/null lret=$? elif [ $proto = "ip6" ]; then - ip netns exec ${ns1} ping -c 1 -q dead:2::99 > /dev/null + ip netns exec ${ns1} ping -W 2 -c 1 -q dead:2::99 > /dev/null lret=$? else lret=111 @@ -214,8 +218,8 @@ test_queue() local last="" # spawn nf-queue listeners - ip netns exec ${nsrouter} ./nf-queue -c -q 0 -t 3 > "$TMPFILE0" & - ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t 3 > "$TMPFILE1" & + ip netns exec ${nsrouter} ./nf-queue -c -q 0 -t $timeout > "$TMPFILE0" & + ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t $timeout > "$TMPFILE1" & sleep 1 test_ping ret=$? @@ -250,11 +254,11 @@ test_queue() test_tcp_forward() { - ip netns exec ${nsrouter} ./nf-queue -q 2 -t 10 & + ip netns exec ${nsrouter} ./nf-queue -q 2 -t $timeout & local nfqpid=$! tmpfile=$(mktemp) || exit 1 - dd conv=sparse status=none if=/dev/zero bs=1M count=100 of=$tmpfile + dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile ip netns exec ${ns2} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & local rpid=$! @@ -270,15 +274,13 @@ test_tcp_forward() test_tcp_localhost() { - tc -net "${nsrouter}" qdisc add dev lo root netem loss random 1% - tmpfile=$(mktemp) || exit 1 - dd conv=sparse status=none if=/dev/zero bs=1M count=900 of=$tmpfile + dd conv=sparse status=none if=/dev/zero bs=1M count=200 of=$tmpfile ip netns exec ${nsrouter} nc -w 5 -l -p 12345 <"$tmpfile" >/dev/null & local rpid=$! - ip netns exec ${nsrouter} ./nf-queue -q 3 -t 30 & + ip netns exec ${nsrouter} ./nf-queue -q 3 -t $timeout & local nfqpid=$! sleep 1 @@ -287,6 +289,47 @@ test_tcp_localhost() wait $rpid [ $? -eq 0 ] && echo "PASS: tcp via loopback" + wait 2>/dev/null +} + +test_tcp_localhost_requeue() +{ +ip netns exec ${nsrouter} nft -f /dev/stdin </dev/null & + local rpid=$! + + ip netns exec ${nsrouter} ./nf-queue -c -q 1 -t $timeout > "$TMPFILE2" & + + # nfqueue 1 will be called via output hook. But this time, + # re-queue the packet to nfqueue program on queue 2. + ip netns exec ${nsrouter} ./nf-queue -G -d 150 -c -q 0 -Q 1 -t $timeout > "$TMPFILE3" & + + sleep 1 + ip netns exec ${nsrouter} nc -w 5 127.0.0.1 12345 <"$tmpfile" > /dev/null + rm -f "$tmpfile" + + wait + + if ! diff -u "$TMPFILE2" "$TMPFILE3" ; then + echo "FAIL: lost packets during requeue?!" 1>&2 + return + fi + + echo "PASS: tcp via loopback and re-queueing" } ip netns exec ${nsrouter} sysctl net.ipv6.conf.all.forwarding=1 > /dev/null @@ -328,5 +371,6 @@ test_queue 20 test_tcp_forward test_tcp_localhost +test_tcp_localhost_requeue exit $ret From 7980d2eabde82be86c5be18aa3d07e88ec13c6a1 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Fri, 9 Oct 2020 21:24:25 +0300 Subject: [PATCH 23/46] ipvs: clear skb->tstamp in forwarding path fq qdisc requires tstamp to be cleared in forwarding path Reported-by: Evgeny B Link: https://bugzilla.kernel.org/show_bug.cgi?id=209427 Suggested-by: Eric Dumazet Fixes: 8203e2d844d3 ("net: clear skb->tstamp in forwarding paths") Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC") Fixes: 80b14dee2bea ("net: Add a new socket option for a future transmit time.") Signed-off-by: Julian Anastasov Reviewed-by: Simon Horman Signed-off-by: Pablo Neira Ayuso --- net/netfilter/ipvs/ip_vs_xmit.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index b00866d777fe..d2e5a8f644b8 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -609,6 +609,8 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb, if (ret == NF_ACCEPT) { nf_reset_ct(skb); skb_forward_csum(skb); + if (skb->dev) + skb->tstamp = 0; } return ret; } @@ -649,6 +651,8 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb, if (!local) { skb_forward_csum(skb); + if (skb->dev) + skb->tstamp = 0; NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, NULL, skb_dst(skb)->dev, dst_output); } else @@ -669,6 +673,8 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb, if (!local) { ip_vs_drop_early_demux_sk(skb); skb_forward_csum(skb); + if (skb->dev) + skb->tstamp = 0; NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb, NULL, skb_dst(skb)->dev, dst_output); } else From 98a381a7d4892dd9969d24433a4bca2f45092643 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 12 Oct 2020 16:54:30 +0200 Subject: [PATCH 24/46] netfilter: nftables: extend error reporting for chain updates The initial support for netlink extended ACK is missing the chain update path, which results in misleading error reporting in case of EEXIST. Fixes 36dd1bcc07e5 ("netfilter: nf_tables: initial support for extended ACK reporting") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 4603b667973a..0e43063767d6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2103,7 +2103,8 @@ static bool nft_hook_list_equal(struct list_head *hook_list1, } static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, - u32 flags) + u32 flags, const struct nlattr *attr, + struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; struct nft_table *table = ctx->table; @@ -2119,9 +2120,10 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, return -EOPNOTSUPP; if (nla[NFTA_CHAIN_HOOK]) { - if (!nft_is_base_chain(chain)) + if (!nft_is_base_chain(chain)) { + NL_SET_BAD_ATTR(extack, attr); return -EEXIST; - + } err = nft_chain_parse_hook(ctx->net, nla, &hook, ctx->family, false); if (err < 0) @@ -2130,6 +2132,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, basechain = nft_base_chain(chain); if (basechain->type != hook.type) { nft_chain_release_hook(&hook); + NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } @@ -2137,6 +2140,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (!nft_hook_list_equal(&basechain->hook_list, &hook.list)) { nft_chain_release_hook(&hook); + NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } } else { @@ -2144,6 +2148,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, if (ops->hooknum != hook.num || ops->priority != hook.priority) { nft_chain_release_hook(&hook); + NL_SET_BAD_ATTR(extack, attr); return -EEXIST; } } @@ -2156,8 +2161,10 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, chain2 = nft_chain_lookup(ctx->net, table, nla[NFTA_CHAIN_NAME], genmask); - if (!IS_ERR(chain2)) + if (!IS_ERR(chain2)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); return -EEXIST; + } } if (nla[NFTA_CHAIN_COUNTERS]) { @@ -2200,6 +2207,7 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, nft_trans_chain_update(tmp) && nft_trans_chain_name(tmp) && strcmp(name, nft_trans_chain_name(tmp)) == 0) { + NL_SET_BAD_ATTR(extack, nla[NFTA_CHAIN_NAME]); kfree(name); goto err; } @@ -2322,7 +2330,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, return -EOPNOTSUPP; flags |= chain->flags & NFT_CHAIN_BASE; - return nf_tables_updchain(&ctx, genmask, policy, flags); + return nf_tables_updchain(&ctx, genmask, policy, flags, attr, + extack); } return nf_tables_addchain(&ctx, family, genmask, policy, flags); From 8098bd69bc4e925070313b1b95d03510f4f24738 Mon Sep 17 00:00:00 2001 From: Christian Eggers Date: Mon, 12 Oct 2020 10:39:42 +0200 Subject: [PATCH 25/46] net: dsa: microchip: fix race condition Between queuing the delayed work and finishing the setup of the dsa ports, the process may sleep in request_module() (via phy_device_create()) and the queued work may be executed prior to the switch net devices being registered. In ksz_mib_read_work(), a NULL dereference will happen within netof_carrier_ok(dp->slave). Not queuing the delayed work in ksz_init_mib_timer() makes things even worse because the work will now be queued for immediate execution (instead of 2000 ms) in ksz_mac_link_down() via dsa_port_link_register_of(). Call tree: ksz9477_i2c_probe() \--ksz9477_switch_register() \--ksz_switch_register() +--dsa_register_switch() | \--dsa_switch_probe() | \--dsa_tree_setup() | \--dsa_tree_setup_switches() | +--dsa_switch_setup() | | +--ksz9477_setup() | | | \--ksz_init_mib_timer() | | | |--/* Start the timer 2 seconds later. */ | | | \--schedule_delayed_work(&dev->mib_read, msecs_to_jiffies(2000)); | | \--__mdiobus_register() | | \--mdiobus_scan() | | \--get_phy_device() | | +--get_phy_id() | | \--phy_device_create() | | |--/* sleeping, ksz_mib_read_work() can be called meanwhile */ | | \--request_module() | | | \--dsa_port_setup() | +--/* Called for non-CPU ports */ | +--dsa_slave_create() | | +--/* Too late, ksz_mib_read_work() may be called beforehand */ | | \--port->slave = ... | ... | +--Called for CPU port */ | \--dsa_port_link_register_of() | \--ksz_mac_link_down() | +--/* mib_read must be initialized here */ | +--/* work is already scheduled, so it will be executed after 2000 ms */ | \--schedule_delayed_work(&dev->mib_read, 0); \-- /* here port->slave is setup properly, scheduling the delayed work should be safe */ Solution: 1. Do not queue (only initialize) delayed work in ksz_init_mib_timer(). 2. Only queue delayed work in ksz_mac_link_down() if init is completed. 3. Queue work once in ksz_switch_register(), after dsa_register_switch() has completed. Fixes: 7c6ff470aa86 ("net: dsa: microchip: add MIB counter reading support") Signed-off-by: Christian Eggers Reviewed-by: Florian Fainelli Reviewed-by: Vladimir Oltean Signed-off-by: Jakub Kicinski --- drivers/net/dsa/microchip/ksz_common.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index c796d42730ba..e5f047129b15 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -103,14 +103,8 @@ void ksz_init_mib_timer(struct ksz_device *dev) INIT_DELAYED_WORK(&dev->mib_read, ksz_mib_read_work); - /* Read MIB counters every 30 seconds to avoid overflow. */ - dev->mib_read_interval = msecs_to_jiffies(30000); - for (i = 0; i < dev->mib_port_cnt; i++) dev->dev_ops->port_init_cnt(dev, i); - - /* Start the timer 2 seconds later. */ - schedule_delayed_work(&dev->mib_read, msecs_to_jiffies(2000)); } EXPORT_SYMBOL_GPL(ksz_init_mib_timer); @@ -143,7 +137,9 @@ void ksz_mac_link_down(struct dsa_switch *ds, int port, unsigned int mode, /* Read all MIB counters when the link is going down. */ p->read = true; - schedule_delayed_work(&dev->mib_read, 0); + /* timer started */ + if (dev->mib_read_interval) + schedule_delayed_work(&dev->mib_read, 0); } EXPORT_SYMBOL_GPL(ksz_mac_link_down); @@ -450,6 +446,12 @@ int ksz_switch_register(struct ksz_device *dev, return ret; } + /* Read MIB counters every 30 seconds to avoid overflow. */ + dev->mib_read_interval = msecs_to_jiffies(30000); + + /* Start the MIB timer. */ + schedule_delayed_work(&dev->mib_read, 0); + return 0; } EXPORT_SYMBOL(ksz_switch_register); From 3af5f0f5c74ecbaf757ef06c3f80d56751277637 Mon Sep 17 00:00:00 2001 From: Valentin Vidic Date: Mon, 12 Oct 2020 00:03:29 +0200 Subject: [PATCH 26/46] net: korina: fix kfree of rx/tx descriptor array kmalloc returns KSEG0 addresses so convert back from KSEG1 in kfree. Also make sure array is freed when the driver is unloaded from the kernel. Fixes: ef11291bcd5f ("Add support the Korina (IDT RC32434) Ethernet MAC") Signed-off-by: Valentin Vidic Acked-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/korina.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/korina.c b/drivers/net/ethernet/korina.c index 03e034918d14..af441d699a57 100644 --- a/drivers/net/ethernet/korina.c +++ b/drivers/net/ethernet/korina.c @@ -1113,7 +1113,7 @@ out: return rc; probe_err_register: - kfree(lp->td_ring); + kfree(KSEG0ADDR(lp->td_ring)); probe_err_td_ring: iounmap(lp->tx_dma_regs); probe_err_dma_tx: @@ -1133,6 +1133,7 @@ static int korina_remove(struct platform_device *pdev) iounmap(lp->eth_regs); iounmap(lp->rx_dma_regs); iounmap(lp->tx_dma_regs); + kfree(KSEG0ADDR(lp->td_ring)); unregister_netdev(bif->dev); free_netdev(bif->dev); From b2b8a92733b288128feb57ffa694758cf475106c Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Thu, 8 Oct 2020 11:45:26 -0700 Subject: [PATCH 27/46] mlx4: handle non-napi callers to napi_poll netcons calls napi_poll with a budget of 0 to transmit packets. Handle this by: - skipping RX processing - do not try to recycle TX packets to the RX cache Signed-off-by: Jonathan Lemon Reviewed-by: Tariq Toukan Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 3 +++ drivers/net/ethernet/mellanox/mlx4/en_tx.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index b50c567ef508..24006440e86e 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -943,6 +943,9 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget) bool clean_complete = true; int done; + if (!budget) + return 0; + if (priv->tx_ring_num[TX_XDP]) { xdp_tx_cq = priv->tx_cq[TX_XDP][cq->ring]; if (xdp_tx_cq->xdp_busy) { diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 9dff7b086c9f..1f11379ad5b6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -350,7 +350,7 @@ u32 mlx4_en_recycle_tx_desc(struct mlx4_en_priv *priv, .dma = tx_info->map0_dma, }; - if (!mlx4_en_rx_recycle(ring->recycle_ring, &frame)) { + if (!napi_mode || !mlx4_en_rx_recycle(ring->recycle_ring, &frame)) { dma_unmap_page(priv->ddev, tx_info->map0_dma, PAGE_SIZE, priv->dma_dir); put_page(tx_info->page); From 64a632da538a6827fad0ea461925cedb9899ebe2 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Sat, 10 Oct 2020 11:10:00 +0200 Subject: [PATCH 28/46] net: fec: Fix phy_device lookup for phy_reset_after_clk_enable() The phy_reset_after_clk_enable() is always called with ndev->phydev, however that pointer may be NULL even though the PHY device instance already exists and is sufficient to perform the PHY reset. This condition happens in fec_open(), where the clock must be enabled first, then the PHY must be reset, and then the PHY IDs can be read out of the PHY. If the PHY still is not bound to the MAC, but there is OF PHY node and a matching PHY device instance already, use the OF PHY node to obtain the PHY device instance, and then use that PHY device instance when triggering the PHY reset. Fixes: 1b0a83ac04e3 ("net: fec: add phy_reset_after_clk_enable() support") Signed-off-by: Marek Vasut Cc: Christoph Niedermaier Cc: David S. Miller Cc: NXP Linux Team Cc: Richard Leitner Cc: Shawn Guo Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/fec_main.c | 25 +++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 23abe7f6cad0..31f60b542feb 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -1912,6 +1912,27 @@ out: return ret; } +static void fec_enet_phy_reset_after_clk_enable(struct net_device *ndev) +{ + struct fec_enet_private *fep = netdev_priv(ndev); + struct phy_device *phy_dev = ndev->phydev; + + if (phy_dev) { + phy_reset_after_clk_enable(phy_dev); + } else if (fep->phy_node) { + /* + * If the PHY still is not bound to the MAC, but there is + * OF PHY node and a matching PHY device instance already, + * use the OF PHY node to obtain the PHY device instance, + * and then use that PHY device instance when triggering + * the PHY reset. + */ + phy_dev = of_phy_find_device(fep->phy_node); + phy_reset_after_clk_enable(phy_dev); + put_device(&phy_dev->mdio.dev); + } +} + static int fec_enet_clk_enable(struct net_device *ndev, bool enable) { struct fec_enet_private *fep = netdev_priv(ndev); @@ -1938,7 +1959,7 @@ static int fec_enet_clk_enable(struct net_device *ndev, bool enable) if (ret) goto failed_clk_ref; - phy_reset_after_clk_enable(ndev->phydev); + fec_enet_phy_reset_after_clk_enable(ndev); } else { clk_disable_unprepare(fep->clk_enet_out); if (fep->clk_ptp) { @@ -2988,7 +3009,7 @@ fec_enet_open(struct net_device *ndev) * phy_reset_after_clk_enable() before because the PHY wasn't probed. */ if (reset_again) - phy_reset_after_clk_enable(ndev->phydev); + fec_enet_phy_reset_after_clk_enable(ndev); /* Probe and connect to PHY when open the interface */ ret = fec_enet_mii_probe(ndev); From 254941f323702dbebe3f9145fdb1ab24c5bd23c1 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 13 Oct 2020 15:45:08 -0400 Subject: [PATCH 29/46] docs: networking: update XPS to account for netif_set_xps_queue With the introduction of netif_set_xps_queue, XPS can be enabled by the driver at initialization. Update the documentation to reflect this, as otherwise users may incorrectly believe that the feature is off by default. Fixes: 537c00de1c9b ("net: Add functions netif_reset_xps_queue and netif_set_xps_queue") Signed-off-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- Documentation/networking/scaling.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/scaling.rst b/Documentation/networking/scaling.rst index 8f0347b9fb3d..3d435caa3ef2 100644 --- a/Documentation/networking/scaling.rst +++ b/Documentation/networking/scaling.rst @@ -465,9 +465,9 @@ XPS Configuration ----------------- XPS is only available if the kconfig symbol CONFIG_XPS is enabled (on by -default for SMP). The functionality remains disabled until explicitly -configured. To enable XPS, the bitmap of CPUs/receive-queues that may -use a transmit queue is configured using the sysfs file entry: +default for SMP). If compiled in, it is driver dependent whether, and +how, XPS is configured at device init. The mapping of CPUs/receive-queues +to transmit queue can be inspected and configured using sysfs: For selection based on CPUs map:: From 0d9826bc18ce356e8909919ad681ad65d0a6061e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 12 Oct 2020 17:06:06 +0200 Subject: [PATCH 30/46] netfilter: nf_log: missing vlan offload tag and proto Dump vlan tag and proto for the usual vlan offload case if the NF_LOG_MACDECODE flag is set on. Without this information the logging is misleading as there is no reference to the VLAN header. [12716.993704] test: IN=veth0 OUT= MACSRC=86:6c:92:ea:d6:73 MACDST=0e:3b:eb:86:73:76 VPROTO=8100 VID=10 MACPROTO=0800 SRC=192.168.10.2 DST=172.217.168.163 LEN=52 TOS=0x00 PREC=0x00 TTL=64 ID=2548 DF PROTO=TCP SPT=55848 DPT=80 WINDOW=501 RES=0x00 ACK FIN URGP=0 [12721.157643] test: IN=veth0 OUT= MACSRC=86:6c:92:ea:d6:73 MACDST=0e:3b:eb:86:73:76 VPROTO=8100 VID=10 MACPROTO=0806 ARP HTYPE=1 PTYPE=0x0800 OPCODE=2 MACSRC=86:6c:92:ea:d6:73 IPSRC=192.168.10.2 MACDST=0e:3b:eb:86:73:76 IPDST=192.168.10.1 Fixes: 83e96d443b37 ("netfilter: log: split family specific code to nf_log_{ip,ip6,common}.c files") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_log.h | 1 + net/ipv4/netfilter/nf_log_arp.c | 19 +++++++++++++++++-- net/ipv4/netfilter/nf_log_ipv4.c | 6 ++++-- net/ipv6/netfilter/nf_log_ipv6.c | 8 +++++--- net/netfilter/nf_log_common.c | 12 ++++++++++++ 5 files changed, 39 insertions(+), 7 deletions(-) diff --git a/include/net/netfilter/nf_log.h b/include/net/netfilter/nf_log.h index 0d3920896d50..716db4a0fed8 100644 --- a/include/net/netfilter/nf_log.h +++ b/include/net/netfilter/nf_log.h @@ -108,6 +108,7 @@ int nf_log_dump_tcp_header(struct nf_log_buf *m, const struct sk_buff *skb, unsigned int logflags); void nf_log_dump_sk_uid_gid(struct net *net, struct nf_log_buf *m, struct sock *sk); +void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb); void nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, unsigned int hooknum, const struct sk_buff *skb, const struct net_device *in, diff --git a/net/ipv4/netfilter/nf_log_arp.c b/net/ipv4/netfilter/nf_log_arp.c index 7a83f881efa9..136030ad2e54 100644 --- a/net/ipv4/netfilter/nf_log_arp.c +++ b/net/ipv4/netfilter/nf_log_arp.c @@ -43,16 +43,31 @@ static void dump_arp_packet(struct nf_log_buf *m, const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int nhoff) { - const struct arphdr *ah; - struct arphdr _arph; const struct arppayload *ap; struct arppayload _arpp; + const struct arphdr *ah; + unsigned int logflags; + struct arphdr _arph; ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); if (ah == NULL) { nf_log_buf_add(m, "TRUNCATED"); return; } + + if (info->type == NF_LOG_TYPE_LOG) + logflags = info->u.log.logflags; + else + logflags = NF_LOG_DEFAULT_MASK; + + if (logflags & NF_LOG_MACDECODE) { + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); + nf_log_dump_vlan(m, skb); + nf_log_buf_add(m, "MACPROTO=%04x ", + ntohs(eth_hdr(skb)->h_proto)); + } + nf_log_buf_add(m, "ARP HTYPE=%d PTYPE=0x%04x OPCODE=%d", ntohs(ah->ar_hrd), ntohs(ah->ar_pro), ntohs(ah->ar_op)); diff --git a/net/ipv4/netfilter/nf_log_ipv4.c b/net/ipv4/netfilter/nf_log_ipv4.c index 0c72156130b6..d07583fac8f8 100644 --- a/net/ipv4/netfilter/nf_log_ipv4.c +++ b/net/ipv4/netfilter/nf_log_ipv4.c @@ -284,8 +284,10 @@ static void dump_ipv4_mac_header(struct nf_log_buf *m, switch (dev->type) { case ARPHRD_ETHER: - nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", - eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); + nf_log_dump_vlan(m, skb); + nf_log_buf_add(m, "MACPROTO=%04x ", ntohs(eth_hdr(skb)->h_proto)); return; default: diff --git a/net/ipv6/netfilter/nf_log_ipv6.c b/net/ipv6/netfilter/nf_log_ipv6.c index da64550a5707..8210ff34ed9b 100644 --- a/net/ipv6/netfilter/nf_log_ipv6.c +++ b/net/ipv6/netfilter/nf_log_ipv6.c @@ -297,9 +297,11 @@ static void dump_ipv6_mac_header(struct nf_log_buf *m, switch (dev->type) { case ARPHRD_ETHER: - nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", - eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, - ntohs(eth_hdr(skb)->h_proto)); + nf_log_buf_add(m, "MACSRC=%pM MACDST=%pM ", + eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest); + nf_log_dump_vlan(m, skb); + nf_log_buf_add(m, "MACPROTO=%04x ", + ntohs(eth_hdr(skb)->h_proto)); return; default: break; diff --git a/net/netfilter/nf_log_common.c b/net/netfilter/nf_log_common.c index ae5628ddbe6d..fd7c5f0f5c25 100644 --- a/net/netfilter/nf_log_common.c +++ b/net/netfilter/nf_log_common.c @@ -171,6 +171,18 @@ nf_log_dump_packet_common(struct nf_log_buf *m, u_int8_t pf, } EXPORT_SYMBOL_GPL(nf_log_dump_packet_common); +void nf_log_dump_vlan(struct nf_log_buf *m, const struct sk_buff *skb) +{ + u16 vid; + + if (!skb_vlan_tag_present(skb)) + return; + + vid = skb_vlan_tag_get(skb); + nf_log_buf_add(m, "VPROTO=%04x VID=%u ", ntohs(skb->vlan_proto), vid); +} +EXPORT_SYMBOL_GPL(nf_log_dump_vlan); + /* bridge and netdev logging families share this code. */ void nf_log_l2packet(struct net *net, u_int8_t pf, __be16 protocol, From 59e611a566e7cd48cf54b6777a11fe3f9c2f9db5 Mon Sep 17 00:00:00 2001 From: Christian Eggers Date: Mon, 12 Oct 2020 11:35:41 +0200 Subject: [PATCH 31/46] socket: fix option SO_TIMESTAMPING_NEW The comparison of optname with SO_TIMESTAMPING_NEW is wrong way around, so SOCK_TSTAMP_NEW will first be set and than reset again. Additionally move it out of the test for SOF_TIMESTAMPING_RX_SOFTWARE as this seems unrelated. This problem happens on 32 bit platforms were the libc has already switched to struct timespec64 (from SO_TIMExxx_OLD to SO_TIMExxx_NEW socket options). ptp4l complains with "missing timestamp on transmitted peer delay request" because the wrong format is received (and discarded). Fixes: 9718475e6908 ("socket: Add SO_TIMESTAMPING_NEW") Signed-off-by: Christian Eggers Reviewed-by: Willem de Bruijn Reviewed-by: Deepa Dinamani Acked-by: Willem de Bruijn Acked-by: Deepa Dinamani Signed-off-by: Jakub Kicinski --- net/core/sock.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index 6c5c6b18eff4..dfe5d8047850 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1007,8 +1007,6 @@ set_sndbuf: __sock_set_timestamps(sk, valbool, true, true); break; case SO_TIMESTAMPING_NEW: - sock_set_flag(sk, SOCK_TSTAMP_NEW); - fallthrough; case SO_TIMESTAMPING_OLD: if (val & ~SOF_TIMESTAMPING_MASK) { ret = -EINVAL; @@ -1037,16 +1035,14 @@ set_sndbuf: } sk->sk_tsflags = val; + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); + if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); - else { - if (optname == SO_TIMESTAMPING_NEW) - sock_reset_flag(sk, SOCK_TSTAMP_NEW); - + else sock_disable_timestamp(sk, (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); - } break; case SO_RCVLOWAT: From 4e3bbb33e6f36e4b05be1b1b9b02e3dd5aaa3e69 Mon Sep 17 00:00:00 2001 From: Christian Eggers Date: Mon, 12 Oct 2020 11:35:42 +0200 Subject: [PATCH 32/46] socket: don't clear SOCK_TSTAMP_NEW when SO_TIMESTAMPNS is disabled SOCK_TSTAMP_NEW (timespec64 instead of timespec) is also used for hardware time stamps (configured via SO_TIMESTAMPING_NEW). User space (ptp4l) first configures hardware time stamping via SO_TIMESTAMPING_NEW which sets SOCK_TSTAMP_NEW. In the next step, ptp4l disables SO_TIMESTAMPNS(_NEW) (software time stamps), but this must not switch hardware time stamps back to "32 bit mode". This problem happens on 32 bit platforms were the libc has already switched to struct timespec64 (from SO_TIMExxx_OLD to SO_TIMExxx_NEW socket options). ptp4l complains with "missing timestamp on transmitted peer delay request" because the wrong format is received (and discarded). Fixes: 887feae36aee ("socket: Add SO_TIMESTAMP[NS]_NEW") Fixes: 783da70e8396 ("net: add sock_enable_timestamps") Signed-off-by: Christian Eggers Acked-by: Willem de Bruijn Acked-by: Deepa Dinamani Signed-off-by: Jakub Kicinski --- net/core/sock.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/core/sock.c b/net/core/sock.c index dfe5d8047850..40a95e0c4bad 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -769,7 +769,6 @@ static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) } else { sock_reset_flag(sk, SOCK_RCVTSTAMP); sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - sock_reset_flag(sk, SOCK_TSTAMP_NEW); } } From fdafed459998e2be0e877e6189b24cb7a0183224 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 12 Oct 2020 16:17:21 -0700 Subject: [PATCH 33/46] ip_gre: set dev->hard_header_len and dev->needed_headroom properly GRE tunnel has its own header_ops, ipgre_header_ops, and sets it conditionally. When it is set, it assumes the outer IP header is already created before ipgre_xmit(). This is not true when we send packets through a raw packet socket, where L2 headers are supposed to be constructed by user. Packet socket calls dev_validate_header() to validate the header. But GRE tunnel does not set dev->hard_header_len, so that check can be simply bypassed, therefore uninit memory could be passed down to ipgre_xmit(). Similar for dev->needed_headroom. dev->hard_header_len is supposed to be the length of the header created by dev->header_ops->create(), so it should be used whenever header_ops is set, and dev->needed_headroom should be used when it is not set. Reported-and-tested-by: syzbot+4a2c52677a8a1aa283cb@syzkaller.appspotmail.com Cc: William Tu Acked-by: Willem de Bruijn Signed-off-by: Cong Wang Acked-by: Xie He Signed-off-by: Jakub Kicinski --- net/ipv4/ip_gre.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 4e31f23e4117..e70291748889 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -625,9 +625,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, } if (dev->header_ops) { - /* Need space for new headers */ - if (skb_cow_head(skb, dev->needed_headroom - - (tunnel->hlen + sizeof(struct iphdr)))) + if (skb_cow_head(skb, 0)) goto free_skb; tnl_params = (const struct iphdr *)skb->data; @@ -748,7 +746,11 @@ static void ipgre_link_update(struct net_device *dev, bool set_mtu) len = tunnel->tun_hlen - len; tunnel->hlen = tunnel->hlen + len; - dev->needed_headroom = dev->needed_headroom + len; + if (dev->header_ops) + dev->hard_header_len += len; + else + dev->needed_headroom += len; + if (set_mtu) dev->mtu = max_t(int, dev->mtu - len, 68); @@ -944,6 +946,7 @@ static void __gre_tunnel_init(struct net_device *dev) tunnel->parms.iph.protocol = IPPROTO_GRE; tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; + dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph); dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; @@ -987,10 +990,14 @@ static int ipgre_tunnel_init(struct net_device *dev) return -EINVAL; dev->flags = IFF_BROADCAST; dev->header_ops = &ipgre_header_ops; + dev->hard_header_len = tunnel->hlen + sizeof(*iph); + dev->needed_headroom = 0; } #endif } else if (!tunnel->collect_md) { dev->header_ops = &ipgre_header_ops; + dev->hard_header_len = tunnel->hlen + sizeof(*iph); + dev->needed_headroom = 0; } return ip_tunnel_init(dev); From e1e84eb58eb494b77c8389fc6308b5042dcce791 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 12 Oct 2020 10:50:14 -0400 Subject: [PATCH 34/46] ipv4/icmp: l3mdev: Perform icmp error route lookup on source device routing table (v2) As per RFC792, ICMP errors should be sent to the source host. However, in configurations with Virtual Routing and Forwarding tables, looking up which routing table to use is currently done by using the destination net_device. commit 9d1a6c4ea43e ("net: icmp_route_lookup should use rt dev to determine L3 domain") changes the interface passed to l3mdev_master_ifindex() and inet_addr_type_dev_table() from skb_in->dev to skb_dst(skb_in)->dev. This effectively uses the destination device rather than the source device for choosing which routing table should be used to lookup where to send the ICMP error. Therefore, if the source and destination interfaces are within separate VRFs, or one in the global routing table and the other in a VRF, looking up the source host in the destination interface's routing table will fail if the destination interface's routing table contains no route to the source host. One observable effect of this issue is that traceroute does not work in the following cases: - Route leaking between global routing table and VRF - Route leaking between VRFs Preferably use the source device routing table when sending ICMP error messages. If no source device is set, fall-back on the destination device routing table. Else, use the main routing table (index 0). [ It has been pointed out that a similar issue may exist with ICMP errors triggered when forwarding between network namespaces. It would be worthwhile to investigate, but is outside of the scope of this investigation. ] [ It has also been pointed out that a similar issue exists with unreachable / fragmentation needed messages, which can be triggered by changing the MTU of eth1 in r1 to 1400 and running: ip netns exec h1 ping -s 1450 -Mdo -c1 172.16.2.2 Some investigation points to raw_icmp_error() and raw_err() as being involved in this last scenario. The focus of this patch is TTL expired ICMP messages, which go through icmp_route_lookup. Investigation of failure modes related to raw_icmp_error() is beyond this investigation's scope. ] Fixes: 9d1a6c4ea43e ("net: icmp_route_lookup should use rt dev to determine L3 domain") Link: https://tools.ietf.org/html/rfc792 Signed-off-by: Mathieu Desnoyers Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- net/ipv4/icmp.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index cf36f955bfe6..9ea66d903c41 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -457,6 +457,23 @@ out_bh_enable: local_bh_enable(); } +/* + * The device used for looking up which routing table to use for sending an ICMP + * error is preferably the source whenever it is set, which should ensure the + * icmp error can be sent to the source host, else lookup using the routing + * table of the destination device, else use the main routing table (index 0). + */ +static struct net_device *icmp_get_route_lookup_dev(struct sk_buff *skb) +{ + struct net_device *route_lookup_dev = NULL; + + if (skb->dev) + route_lookup_dev = skb->dev; + else if (skb_dst(skb)) + route_lookup_dev = skb_dst(skb)->dev; + return route_lookup_dev; +} + static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, @@ -465,6 +482,7 @@ static struct rtable *icmp_route_lookup(struct net *net, int type, int code, struct icmp_bxm *param) { + struct net_device *route_lookup_dev; struct rtable *rt, *rt2; struct flowi4 fl4_dec; int err; @@ -479,7 +497,8 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; - fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev); + route_lookup_dev = icmp_get_route_lookup_dev(skb_in); + fl4->flowi4_oif = l3mdev_master_ifindex(route_lookup_dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); rt = ip_route_output_key_hash(net, fl4, skb_in); @@ -503,7 +522,7 @@ static struct rtable *icmp_route_lookup(struct net *net, if (err) goto relookup_failed; - if (inet_addr_type_dev_table(net, skb_dst(skb_in)->dev, + if (inet_addr_type_dev_table(net, route_lookup_dev, fl4_dec.saddr) == RTN_LOCAL) { rt2 = __ip_route_output_key(net, &fl4_dec); if (IS_ERR(rt2)) From 272928d1cdacfc3b55f605cb0e9115832ecfb20c Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Mon, 12 Oct 2020 10:50:15 -0400 Subject: [PATCH 35/46] ipv6/icmp: l3mdev: Perform icmp error route lookup on source device routing table (v2) As per RFC4443, the destination address field for ICMPv6 error messages is copied from the source address field of the invoking packet. In configurations with Virtual Routing and Forwarding tables, looking up which routing table to use for sending ICMPv6 error messages is currently done by using the destination net_device. If the source and destination interfaces are within separate VRFs, or one in the global routing table and the other in a VRF, looking up the source address of the invoking packet in the destination interface's routing table will fail if the destination interface's routing table contains no route to the invoking packet's source address. One observable effect of this issue is that traceroute6 does not work in the following cases: - Route leaking between global routing table and VRF - Route leaking between VRFs Use the source device routing table when sending ICMPv6 error messages. [ In the context of ipv4, it has been pointed out that a similar issue may exist with ICMP errors triggered when forwarding between network namespaces. It would be worthwhile to investigate whether ipv6 has similar issues, but is outside of the scope of this investigation. ] [ Testing shows that similar issues exist with ipv6 unreachable / fragmentation needed messages. However, investigation of this additional failure mode is beyond this investigation's scope. ] Link: https://tools.ietf.org/html/rfc4443 Signed-off-by: Mathieu Desnoyers Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- net/ipv6/icmp.c | 7 +++++-- net/ipv6/ip6_output.c | 2 -- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index a4e4912ad607..91209a2760aa 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -501,8 +501,11 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (__ipv6_addr_needs_scope_id(addr_type)) { iif = icmp6_iif(skb); } else { - dst = skb_dst(skb); - iif = l3mdev_master_ifindex(dst ? dst->dev : skb->dev); + /* + * The source device is used for looking up which routing table + * to use for sending an ICMP error. + */ + iif = l3mdev_master_ifindex(skb->dev); } /* diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index c78e67d7747f..cd623068de53 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -468,8 +468,6 @@ int ip6_forward(struct sk_buff *skb) * check and decrement ttl */ if (hdr->hop_limit <= 1) { - /* Force OUTPUT device used as source address */ - skb->dev = dst->dev; icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); From 1a01727676a87945bd6b9796fc5ee894c24b4fe2 Mon Sep 17 00:00:00 2001 From: Michael Jeanson Date: Mon, 12 Oct 2020 10:50:16 -0400 Subject: [PATCH 36/46] selftests: Add VRF route leaking tests The objective of the tests is to check that ICMP errors generated while crossing between VRFs are properly routed back to the source host. The first ttl test sends a ping with a ttl of 1 from h1 to h2 and parses the output of the command to check that a ttl expired error is received. The second ttl test runs traceroute from h1 to h2 and parses the output to check for a hop on r1. The mtu test sends a ping with a payload of 1450 from h1 to h2, through r1 which has an interface with a mtu of 1400 and parses the output of the command to check that a fragmentation needed error is received. [ The IPv6 MTU test still fails with the symmetric routing setup. It appears to be caused by source address selection picking ::1. Fixing this is beyond the scope of this series. ] Signed-off-by: Michael Jeanson Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 1 + .../selftests/net/vrf_route_leaking.sh | 626 ++++++++++++++++++ 2 files changed, 627 insertions(+) create mode 100755 tools/testing/selftests/net/vrf_route_leaking.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 9491bbaa0831..3e7fb1e70c77 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -19,6 +19,7 @@ TEST_PROGS += txtimestamp.sh TEST_PROGS += vrf-xfrm-tests.sh TEST_PROGS += rxtimestamp.sh TEST_PROGS += devlink_port_split.py +TEST_PROGS += vrf_route_leaking.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any diff --git a/tools/testing/selftests/net/vrf_route_leaking.sh b/tools/testing/selftests/net/vrf_route_leaking.sh new file mode 100755 index 000000000000..23cf924754a5 --- /dev/null +++ b/tools/testing/selftests/net/vrf_route_leaking.sh @@ -0,0 +1,626 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Copyright (c) 2019 David Ahern . All rights reserved. +# Copyright (c) 2020 Michael Jeanson . All rights reserved. +# +# Requires CONFIG_NET_VRF, CONFIG_VETH, CONFIG_BRIDGE and CONFIG_NET_NS. +# +# +# Symmetric routing topology +# +# blue red +# +----+ .253 +----+ .253 +----+ +# | h1 |-------------------| r1 |-------------------| h2 | +# +----+ .1 +----+ .2 +----+ +# 172.16.1/24 172.16.2/24 +# 2001:db8:16:1/64 2001:db8:16:2/64 +# +# +# Route from h1 to h2 and back goes through r1, incoming vrf blue has a route +# to the outgoing vrf red for the n2 network and red has a route back to n1. +# The red VRF interface has a MTU of 1400. +# +# The first test sends a ping with a ttl of 1 from h1 to h2 and parses the +# output of the command to check that a ttl expired error is received. +# +# The second test runs traceroute from h1 to h2 and parses the output to check +# for a hop on r1. +# +# The third test sends a ping with a packet size of 1450 from h1 to h2 and +# parses the output of the command to check that a fragmentation error is +# received. +# +# +# Asymmetric routing topology +# +# This topology represents a customer setup where the issue with icmp errors +# and VRF route leaking was initialy reported. The MTU test isn't done here +# because of the lack of a return route in the red VRF. +# +# blue red +# .253 +----+ .253 +# +----| r1 |----+ +# | +----+ | +# +----+ | | +----+ +# | h1 |--------------+ +--------------| h2 | +# +----+ .1 | | .2 +----+ +# 172.16.1/24 | +----+ | 172.16.2/24 +# 2001:db8:16:1/64 +----| r2 |----+ 2001:db8:16:2/64 +# .254 +----+ .254 +# +# +# Route from h1 to h2 goes through r1, incoming vrf blue has a route to the +# outgoing vrf red for the n2 network but red doesn't have a route back to n1. +# Route from h2 to h1 goes through r2. +# +# The objective is to check that the incoming vrf routing table is selected +# to send an ICMP error back to the source when the ttl of a packet reaches 1 +# while it is forwarded between different vrfs. + +VERBOSE=0 +PAUSE_ON_FAIL=no +DEFAULT_TTYPE=sym + +H1_N1=172.16.1.0/24 +H1_N1_6=2001:db8:16:1::/64 + +H1_N1_IP=172.16.1.1 +R1_N1_IP=172.16.1.253 +R2_N1_IP=172.16.1.254 + +H1_N1_IP6=2001:db8:16:1::1 +R1_N1_IP6=2001:db8:16:1::253 +R2_N1_IP6=2001:db8:16:1::254 + +H2_N2=172.16.2.0/24 +H2_N2_6=2001:db8:16:2::/64 + +H2_N2_IP=172.16.2.2 +R1_N2_IP=172.16.2.253 +R2_N2_IP=172.16.2.254 + +H2_N2_IP6=2001:db8:16:2::2 +R1_N2_IP6=2001:db8:16:2::253 +R2_N2_IP6=2001:db8:16:2::254 + +################################################################################ +# helpers + +log_section() +{ + echo + echo "###########################################################################" + echo "$*" + echo "###########################################################################" + echo +} + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ "${rc}" -eq "${expected}" ]; then + printf "TEST: %-60s [ OK ]\n" "${msg}" + nsuccess=$((nsuccess+1)) + else + ret=1 + nfail=$((nfail+1)) + printf "TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read -r a + [ "$a" = "q" ] && exit 1 + fi + fi +} + +run_cmd() +{ + local cmd="$*" + local out + local rc + + if [ "$VERBOSE" = "1" ]; then + echo "COMMAND: $cmd" + fi + + # shellcheck disable=SC2086 + out=$(eval $cmd 2>&1) + rc=$? + if [ "$VERBOSE" = "1" ] && [ -n "$out" ]; then + echo "$out" + fi + + [ "$VERBOSE" = "1" ] && echo + + return $rc +} + +run_cmd_grep() +{ + local grep_pattern="$1" + shift + local cmd="$*" + local out + local rc + + if [ "$VERBOSE" = "1" ]; then + echo "COMMAND: $cmd" + fi + + # shellcheck disable=SC2086 + out=$(eval $cmd 2>&1) + if [ "$VERBOSE" = "1" ] && [ -n "$out" ]; then + echo "$out" + fi + + echo "$out" | grep -q "$grep_pattern" + rc=$? + + [ "$VERBOSE" = "1" ] && echo + + return $rc +} + +################################################################################ +# setup and teardown + +cleanup() +{ + local ns + + for ns in h1 h2 r1 r2; do + ip netns del $ns 2>/dev/null + done +} + +setup_vrf() +{ + local ns=$1 + + ip -netns "${ns}" rule del pref 0 + ip -netns "${ns}" rule add pref 32765 from all lookup local + ip -netns "${ns}" -6 rule del pref 0 + ip -netns "${ns}" -6 rule add pref 32765 from all lookup local +} + +create_vrf() +{ + local ns=$1 + local vrf=$2 + local table=$3 + + ip -netns "${ns}" link add "${vrf}" type vrf table "${table}" + ip -netns "${ns}" link set "${vrf}" up + ip -netns "${ns}" route add vrf "${vrf}" unreachable default metric 8192 + ip -netns "${ns}" -6 route add vrf "${vrf}" unreachable default metric 8192 + + ip -netns "${ns}" addr add 127.0.0.1/8 dev "${vrf}" + ip -netns "${ns}" -6 addr add ::1 dev "${vrf}" nodad +} + +setup_sym() +{ + local ns + + # make sure we are starting with a clean slate + cleanup + + # + # create nodes as namespaces + # + for ns in h1 h2 r1; do + ip netns add $ns + ip -netns $ns link set lo up + + case "${ns}" in + h[12]) ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=0 + ip netns exec $ns sysctl -q -w net.ipv6.conf.all.keep_addr_on_down=1 + ;; + r1) ip netns exec $ns sysctl -q -w net.ipv4.ip_forward=1 + ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=1 + esac + done + + # + # create interconnects + # + ip -netns h1 link add eth0 type veth peer name r1h1 + ip -netns h1 link set r1h1 netns r1 name eth0 up + + ip -netns h2 link add eth0 type veth peer name r1h2 + ip -netns h2 link set r1h2 netns r1 name eth1 up + + # + # h1 + # + ip -netns h1 addr add dev eth0 ${H1_N1_IP}/24 + ip -netns h1 -6 addr add dev eth0 ${H1_N1_IP6}/64 nodad + ip -netns h1 link set eth0 up + + # h1 to h2 via r1 + ip -netns h1 route add ${H2_N2} via ${R1_N1_IP} dev eth0 + ip -netns h1 -6 route add ${H2_N2_6} via "${R1_N1_IP6}" dev eth0 + + # + # h2 + # + ip -netns h2 addr add dev eth0 ${H2_N2_IP}/24 + ip -netns h2 -6 addr add dev eth0 ${H2_N2_IP6}/64 nodad + ip -netns h2 link set eth0 up + + # h2 to h1 via r1 + ip -netns h2 route add default via ${R1_N2_IP} dev eth0 + ip -netns h2 -6 route add default via ${R1_N2_IP6} dev eth0 + + # + # r1 + # + setup_vrf r1 + create_vrf r1 blue 1101 + create_vrf r1 red 1102 + ip -netns r1 link set mtu 1400 dev eth1 + ip -netns r1 link set eth0 vrf blue up + ip -netns r1 link set eth1 vrf red up + ip -netns r1 addr add dev eth0 ${R1_N1_IP}/24 + ip -netns r1 -6 addr add dev eth0 ${R1_N1_IP6}/64 nodad + ip -netns r1 addr add dev eth1 ${R1_N2_IP}/24 + ip -netns r1 -6 addr add dev eth1 ${R1_N2_IP6}/64 nodad + + # Route leak from blue to red + ip -netns r1 route add vrf blue ${H2_N2} dev red + ip -netns r1 -6 route add vrf blue ${H2_N2_6} dev red + + # Route leak from red to blue + ip -netns r1 route add vrf red ${H1_N1} dev blue + ip -netns r1 -6 route add vrf red ${H1_N1_6} dev blue + + + # Wait for ip config to settle + sleep 2 +} + +setup_asym() +{ + local ns + + # make sure we are starting with a clean slate + cleanup + + # + # create nodes as namespaces + # + for ns in h1 h2 r1 r2; do + ip netns add $ns + ip -netns $ns link set lo up + + case "${ns}" in + h[12]) ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=0 + ip netns exec $ns sysctl -q -w net.ipv6.conf.all.keep_addr_on_down=1 + ;; + r[12]) ip netns exec $ns sysctl -q -w net.ipv4.ip_forward=1 + ip netns exec $ns sysctl -q -w net.ipv6.conf.all.forwarding=1 + esac + done + + # + # create interconnects + # + ip -netns h1 link add eth0 type veth peer name r1h1 + ip -netns h1 link set r1h1 netns r1 name eth0 up + + ip -netns h1 link add eth1 type veth peer name r2h1 + ip -netns h1 link set r2h1 netns r2 name eth0 up + + ip -netns h2 link add eth0 type veth peer name r1h2 + ip -netns h2 link set r1h2 netns r1 name eth1 up + + ip -netns h2 link add eth1 type veth peer name r2h2 + ip -netns h2 link set r2h2 netns r2 name eth1 up + + # + # h1 + # + ip -netns h1 link add br0 type bridge + ip -netns h1 link set br0 up + ip -netns h1 addr add dev br0 ${H1_N1_IP}/24 + ip -netns h1 -6 addr add dev br0 ${H1_N1_IP6}/64 nodad + ip -netns h1 link set eth0 master br0 up + ip -netns h1 link set eth1 master br0 up + + # h1 to h2 via r1 + ip -netns h1 route add ${H2_N2} via ${R1_N1_IP} dev br0 + ip -netns h1 -6 route add ${H2_N2_6} via "${R1_N1_IP6}" dev br0 + + # + # h2 + # + ip -netns h2 link add br0 type bridge + ip -netns h2 link set br0 up + ip -netns h2 addr add dev br0 ${H2_N2_IP}/24 + ip -netns h2 -6 addr add dev br0 ${H2_N2_IP6}/64 nodad + ip -netns h2 link set eth0 master br0 up + ip -netns h2 link set eth1 master br0 up + + # h2 to h1 via r2 + ip -netns h2 route add default via ${R2_N2_IP} dev br0 + ip -netns h2 -6 route add default via ${R2_N2_IP6} dev br0 + + # + # r1 + # + setup_vrf r1 + create_vrf r1 blue 1101 + create_vrf r1 red 1102 + ip -netns r1 link set mtu 1400 dev eth1 + ip -netns r1 link set eth0 vrf blue up + ip -netns r1 link set eth1 vrf red up + ip -netns r1 addr add dev eth0 ${R1_N1_IP}/24 + ip -netns r1 -6 addr add dev eth0 ${R1_N1_IP6}/64 nodad + ip -netns r1 addr add dev eth1 ${R1_N2_IP}/24 + ip -netns r1 -6 addr add dev eth1 ${R1_N2_IP6}/64 nodad + + # Route leak from blue to red + ip -netns r1 route add vrf blue ${H2_N2} dev red + ip -netns r1 -6 route add vrf blue ${H2_N2_6} dev red + + # No route leak from red to blue + + # + # r2 + # + ip -netns r2 addr add dev eth0 ${R2_N1_IP}/24 + ip -netns r2 -6 addr add dev eth0 ${R2_N1_IP6}/64 nodad + ip -netns r2 addr add dev eth1 ${R2_N2_IP}/24 + ip -netns r2 -6 addr add dev eth1 ${R2_N2_IP6}/64 nodad + + # Wait for ip config to settle + sleep 2 +} + +check_connectivity() +{ + ip netns exec h1 ping -c1 -w1 ${H2_N2_IP} >/dev/null 2>&1 + log_test $? 0 "Basic IPv4 connectivity" + return $? +} + +check_connectivity6() +{ + ip netns exec h1 "${ping6}" -c1 -w1 ${H2_N2_IP6} >/dev/null 2>&1 + log_test $? 0 "Basic IPv6 connectivity" + return $? +} + +check_traceroute() +{ + if [ ! -x "$(command -v traceroute)" ]; then + echo "SKIP: Could not run IPV4 test without traceroute" + return 1 + fi +} + +check_traceroute6() +{ + if [ ! -x "$(command -v traceroute6)" ]; then + echo "SKIP: Could not run IPV6 test without traceroute6" + return 1 + fi +} + +ipv4_traceroute() +{ + local ttype="$1" + + [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE" + + log_section "IPv4 ($ttype route): VRF ICMP error route lookup traceroute" + + check_traceroute || return + + setup_"$ttype" + + check_connectivity || return + + run_cmd_grep "${R1_N1_IP}" ip netns exec h1 traceroute ${H2_N2_IP} + log_test $? 0 "Traceroute reports a hop on r1" +} + +ipv4_traceroute_asym() +{ + ipv4_traceroute asym +} + +ipv6_traceroute() +{ + local ttype="$1" + + [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE" + + log_section "IPv6 ($ttype route): VRF ICMP error route lookup traceroute" + + check_traceroute6 || return + + setup_"$ttype" + + check_connectivity6 || return + + run_cmd_grep "${R1_N1_IP6}" ip netns exec h1 traceroute6 ${H2_N2_IP6} + log_test $? 0 "Traceroute6 reports a hop on r1" +} + +ipv6_traceroute_asym() +{ + ipv6_traceroute asym +} + +ipv4_ping_ttl() +{ + local ttype="$1" + + [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE" + + log_section "IPv4 ($ttype route): VRF ICMP ttl error route lookup ping" + + setup_"$ttype" + + check_connectivity || return + + run_cmd_grep "Time to live exceeded" ip netns exec h1 ping -t1 -c1 -W2 ${H2_N2_IP} + log_test $? 0 "Ping received ICMP ttl exceeded" +} + +ipv4_ping_ttl_asym() +{ + ipv4_ping_ttl asym +} + +ipv4_ping_frag() +{ + local ttype="$1" + + [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE" + + log_section "IPv4 ($ttype route): VRF ICMP fragmentation error route lookup ping" + + setup_"$ttype" + + check_connectivity || return + + run_cmd_grep "Frag needed" ip netns exec h1 ping -s 1450 -Mdo -c1 -W2 ${H2_N2_IP} + log_test $? 0 "Ping received ICMP Frag needed" +} + +ipv4_ping_frag_asym() +{ + ipv4_ping_frag asym +} + +ipv6_ping_ttl() +{ + local ttype="$1" + + [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE" + + log_section "IPv6 ($ttype route): VRF ICMP ttl error route lookup ping" + + setup_"$ttype" + + check_connectivity6 || return + + run_cmd_grep "Time exceeded: Hop limit" ip netns exec h1 "${ping6}" -t1 -c1 -W2 ${H2_N2_IP6} + log_test $? 0 "Ping received ICMP Hop limit" +} + +ipv6_ping_ttl_asym() +{ + ipv6_ping_ttl asym +} + +ipv6_ping_frag() +{ + local ttype="$1" + + [ "x$ttype" = "x" ] && ttype="$DEFAULT_TTYPE" + + log_section "IPv6 ($ttype route): VRF ICMP fragmentation error route lookup ping" + + setup_"$ttype" + + check_connectivity6 || return + + run_cmd_grep "Packet too big" ip netns exec h1 "${ping6}" -s 1450 -Mdo -c1 -W2 ${H2_N2_IP6} + log_test $? 0 "Ping received ICMP Packet too big" +} + +ipv6_ping_frag_asym() +{ + ipv6_ping_frag asym +} + +################################################################################ +# usage + +usage() +{ + cat < /dev/null 2>&1 && ping6=$(command -v ping6) || ping6=$(command -v ping) + +TESTS_IPV4="ipv4_ping_ttl ipv4_traceroute ipv4_ping_frag ipv4_ping_ttl_asym ipv4_traceroute_asym" +TESTS_IPV6="ipv6_ping_ttl ipv6_traceroute ipv6_ping_frag ipv6_ping_ttl_asym ipv6_traceroute_asym" + +ret=0 +nsuccess=0 +nfail=0 + +while getopts :46t:pvh o +do + case $o in + 4) TESTS=ipv4;; + 6) TESTS=ipv6;; + t) TESTS=$OPTARG;; + p) PAUSE_ON_FAIL=yes;; + v) VERBOSE=1;; + h) usage; exit 0;; + *) usage; exit 1;; + esac +done + +# +# show user test config +# +if [ -z "$TESTS" ]; then + TESTS="$TESTS_IPV4 $TESTS_IPV6" +elif [ "$TESTS" = "ipv4" ]; then + TESTS="$TESTS_IPV4" +elif [ "$TESTS" = "ipv6" ]; then + TESTS="$TESTS_IPV6" +fi + +for t in $TESTS +do + case $t in + ipv4_ping_ttl|ping) ipv4_ping_ttl;;& + ipv4_ping_ttl_asym|ping) ipv4_ping_ttl_asym;;& + ipv4_traceroute|traceroute) ipv4_traceroute;;& + ipv4_traceroute_asym|traceroute) ipv4_traceroute_asym;;& + ipv4_ping_frag|ping) ipv4_ping_frag;;& + + ipv6_ping_ttl|ping) ipv6_ping_ttl;;& + ipv6_ping_ttl_asym|ping) ipv6_ping_ttl_asym;;& + ipv6_traceroute|traceroute) ipv6_traceroute;;& + ipv6_traceroute_asym|traceroute) ipv6_traceroute_asym;;& + ipv6_ping_frag|ping) ipv6_ping_frag;;& + + # setup namespaces and config, but do not run any tests + setup_sym|setup) setup_sym; exit 0;; + setup_asym) setup_asym; exit 0;; + + help) echo "Test names: $TESTS"; exit 0;; + esac +done + +cleanup + +printf "\nTests passed: %3d\n" ${nsuccess} +printf "Tests failed: %3d\n" ${nfail} + +exit $ret From 2ef813b8f405db3f72202b6fcae40a628ab80a53 Mon Sep 17 00:00:00 2001 From: Herat Ramani Date: Tue, 13 Oct 2020 15:01:29 +0530 Subject: [PATCH 37/46] cxgb4: handle 4-tuple PEDIT to NAT mode translation The 4-tuple NAT offload via PEDIT always overwrites all the 4-tuple fields even if they had not been explicitly enabled. If any fields in the 4-tuple are not enabled, then the hardware overwrites the disabled fields with zeros, instead of ignoring them. So, add a parser that can translate the enabled 4-tuple PEDIT fields to one of the NAT mode combinations supported by the hardware and hence avoid overwriting disabled fields to 0. Any rule with unsupported NAT mode combination is rejected. Signed-off-by: Herat Ramani Signed-off-by: Jakub Kicinski --- .../ethernet/chelsio/cxgb4/cxgb4_tc_flower.c | 175 ++++++++++++++++-- .../ethernet/chelsio/cxgb4/cxgb4_tc_flower.h | 15 ++ 2 files changed, 177 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c index f642c1b475c4..1b88bd1c2dbe 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.c @@ -60,6 +60,89 @@ static struct ch_tc_pedit_fields pedits[] = { PEDIT_FIELDS(IP6_, DST_127_96, 4, nat_lip, 12), }; +static const struct cxgb4_natmode_config cxgb4_natmode_config_array[] = { + /* Default supported NAT modes */ + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_NONE, + .natmode = NAT_MODE_NONE, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_DIP, + .natmode = NAT_MODE_DIP, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_DPORT, + .natmode = NAT_MODE_DIP_DP, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_DPORT | + CXGB4_ACTION_NATMODE_SIP, + .natmode = NAT_MODE_DIP_DP_SIP, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_DPORT | + CXGB4_ACTION_NATMODE_SPORT, + .natmode = NAT_MODE_DIP_DP_SP, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_SIP | CXGB4_ACTION_NATMODE_SPORT, + .natmode = NAT_MODE_SIP_SP, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_SIP | + CXGB4_ACTION_NATMODE_SPORT, + .natmode = NAT_MODE_DIP_SIP_SP, + }, + { + .chip = CHELSIO_T5, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_SIP | + CXGB4_ACTION_NATMODE_DPORT | + CXGB4_ACTION_NATMODE_SPORT, + .natmode = NAT_MODE_ALL, + }, + /* T6+ can ignore L4 ports when they're disabled. */ + { + .chip = CHELSIO_T6, + .flags = CXGB4_ACTION_NATMODE_SIP, + .natmode = NAT_MODE_SIP_SP, + }, + { + .chip = CHELSIO_T6, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_SPORT, + .natmode = NAT_MODE_DIP_DP_SP, + }, + { + .chip = CHELSIO_T6, + .flags = CXGB4_ACTION_NATMODE_DIP | CXGB4_ACTION_NATMODE_SIP, + .natmode = NAT_MODE_ALL, + }, +}; + +static void cxgb4_action_natmode_tweak(struct ch_filter_specification *fs, + u8 natmode_flags) +{ + u8 i = 0; + + /* Translate the enabled NAT 4-tuple fields to one of the + * hardware supported NAT mode configurations. This ensures + * that we pick a valid combination, where the disabled fields + * do not get overwritten to 0. + */ + for (i = 0; i < ARRAY_SIZE(cxgb4_natmode_config_array); i++) { + if (cxgb4_natmode_config_array[i].flags == natmode_flags) { + fs->nat_mode = cxgb4_natmode_config_array[i].natmode; + return; + } + } +} + static struct ch_tc_flower_entry *allocate_flower_entry(void) { struct ch_tc_flower_entry *new = kzalloc(sizeof(*new), GFP_KERNEL); @@ -289,7 +372,8 @@ static void offload_pedit(struct ch_filter_specification *fs, u32 val, u32 mask, } static void process_pedit_field(struct ch_filter_specification *fs, u32 val, - u32 mask, u32 offset, u8 htype) + u32 mask, u32 offset, u8 htype, + u8 *natmode_flags) { switch (htype) { case FLOW_ACT_MANGLE_HDR_TYPE_ETH: @@ -314,67 +398,102 @@ static void process_pedit_field(struct ch_filter_specification *fs, u32 val, switch (offset) { case PEDIT_IP4_SRC: offload_pedit(fs, val, mask, IP4_SRC); + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; break; case PEDIT_IP4_DST: offload_pedit(fs, val, mask, IP4_DST); + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; } - fs->nat_mode = NAT_MODE_ALL; break; case FLOW_ACT_MANGLE_HDR_TYPE_IP6: switch (offset) { case PEDIT_IP6_SRC_31_0: offload_pedit(fs, val, mask, IP6_SRC_31_0); + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; break; case PEDIT_IP6_SRC_63_32: offload_pedit(fs, val, mask, IP6_SRC_63_32); + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; break; case PEDIT_IP6_SRC_95_64: offload_pedit(fs, val, mask, IP6_SRC_95_64); + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; break; case PEDIT_IP6_SRC_127_96: offload_pedit(fs, val, mask, IP6_SRC_127_96); + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; break; case PEDIT_IP6_DST_31_0: offload_pedit(fs, val, mask, IP6_DST_31_0); + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; break; case PEDIT_IP6_DST_63_32: offload_pedit(fs, val, mask, IP6_DST_63_32); + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; break; case PEDIT_IP6_DST_95_64: offload_pedit(fs, val, mask, IP6_DST_95_64); + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; break; case PEDIT_IP6_DST_127_96: offload_pedit(fs, val, mask, IP6_DST_127_96); + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; } - fs->nat_mode = NAT_MODE_ALL; break; case FLOW_ACT_MANGLE_HDR_TYPE_TCP: switch (offset) { case PEDIT_TCP_SPORT_DPORT: - if (~mask & PEDIT_TCP_UDP_SPORT_MASK) + if (~mask & PEDIT_TCP_UDP_SPORT_MASK) { fs->nat_fport = val; - else + *natmode_flags |= CXGB4_ACTION_NATMODE_SPORT; + } else { fs->nat_lport = val >> 16; + *natmode_flags |= CXGB4_ACTION_NATMODE_DPORT; + } } - fs->nat_mode = NAT_MODE_ALL; break; case FLOW_ACT_MANGLE_HDR_TYPE_UDP: switch (offset) { case PEDIT_UDP_SPORT_DPORT: - if (~mask & PEDIT_TCP_UDP_SPORT_MASK) + if (~mask & PEDIT_TCP_UDP_SPORT_MASK) { fs->nat_fport = val; - else + *natmode_flags |= CXGB4_ACTION_NATMODE_SPORT; + } else { fs->nat_lport = val >> 16; + *natmode_flags |= CXGB4_ACTION_NATMODE_DPORT; + } } - fs->nat_mode = NAT_MODE_ALL; + break; } } +static int cxgb4_action_natmode_validate(struct adapter *adap, u8 natmode_flags, + struct netlink_ext_ack *extack) +{ + u8 i = 0; + + /* Extract the NAT mode to enable based on what 4-tuple fields + * are enabled to be overwritten. This ensures that the + * disabled fields don't get overwritten to 0. + */ + for (i = 0; i < ARRAY_SIZE(cxgb4_natmode_config_array); i++) { + const struct cxgb4_natmode_config *c; + + c = &cxgb4_natmode_config_array[i]; + if (CHELSIO_CHIP_VERSION(adap->params.chip) >= c->chip && + natmode_flags == c->flags) + return 0; + } + NL_SET_ERR_MSG_MOD(extack, "Unsupported NAT mode 4-tuple combination"); + return -EOPNOTSUPP; +} + void cxgb4_process_flow_actions(struct net_device *in, struct flow_action *actions, struct ch_filter_specification *fs) { struct flow_action_entry *act; + u8 natmode_flags = 0; int i; flow_action_for_each(i, act, actions) { @@ -426,7 +545,8 @@ void cxgb4_process_flow_actions(struct net_device *in, val = act->mangle.val; offset = act->mangle.offset; - process_pedit_field(fs, val, mask, offset, htype); + process_pedit_field(fs, val, mask, offset, htype, + &natmode_flags); } break; case FLOW_ACTION_QUEUE: @@ -438,6 +558,9 @@ void cxgb4_process_flow_actions(struct net_device *in, break; } } + if (natmode_flags) + cxgb4_action_natmode_tweak(fs, natmode_flags); + } static bool valid_l4_mask(u32 mask) @@ -454,7 +577,8 @@ static bool valid_l4_mask(u32 mask) } static bool valid_pedit_action(struct net_device *dev, - const struct flow_action_entry *act) + const struct flow_action_entry *act, + u8 *natmode_flags) { u32 mask, offset; u8 htype; @@ -479,7 +603,10 @@ static bool valid_pedit_action(struct net_device *dev, case FLOW_ACT_MANGLE_HDR_TYPE_IP4: switch (offset) { case PEDIT_IP4_SRC: + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; + break; case PEDIT_IP4_DST: + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; break; default: netdev_err(dev, "%s: Unsupported pedit field\n", @@ -493,10 +620,13 @@ static bool valid_pedit_action(struct net_device *dev, case PEDIT_IP6_SRC_63_32: case PEDIT_IP6_SRC_95_64: case PEDIT_IP6_SRC_127_96: + *natmode_flags |= CXGB4_ACTION_NATMODE_SIP; + break; case PEDIT_IP6_DST_31_0: case PEDIT_IP6_DST_63_32: case PEDIT_IP6_DST_95_64: case PEDIT_IP6_DST_127_96: + *natmode_flags |= CXGB4_ACTION_NATMODE_DIP; break; default: netdev_err(dev, "%s: Unsupported pedit field\n", @@ -512,6 +642,10 @@ static bool valid_pedit_action(struct net_device *dev, __func__); return false; } + if (~mask & PEDIT_TCP_UDP_SPORT_MASK) + *natmode_flags |= CXGB4_ACTION_NATMODE_SPORT; + else + *natmode_flags |= CXGB4_ACTION_NATMODE_DPORT; break; default: netdev_err(dev, "%s: Unsupported pedit field\n", @@ -527,6 +661,10 @@ static bool valid_pedit_action(struct net_device *dev, __func__); return false; } + if (~mask & PEDIT_TCP_UDP_SPORT_MASK) + *natmode_flags |= CXGB4_ACTION_NATMODE_SPORT; + else + *natmode_flags |= CXGB4_ACTION_NATMODE_DPORT; break; default: netdev_err(dev, "%s: Unsupported pedit field\n", @@ -546,10 +684,12 @@ int cxgb4_validate_flow_actions(struct net_device *dev, struct netlink_ext_ack *extack, u8 matchall_filter) { + struct adapter *adap = netdev2adap(dev); struct flow_action_entry *act; bool act_redir = false; bool act_pedit = false; bool act_vlan = false; + u8 natmode_flags = 0; int i; if (!flow_action_basic_hw_stats_check(actions, extack)) @@ -563,7 +703,6 @@ int cxgb4_validate_flow_actions(struct net_device *dev, break; case FLOW_ACTION_MIRRED: case FLOW_ACTION_REDIRECT: { - struct adapter *adap = netdev2adap(dev); struct net_device *n_dev, *target_dev; bool found = false; unsigned int i; @@ -620,7 +759,8 @@ int cxgb4_validate_flow_actions(struct net_device *dev, } break; case FLOW_ACTION_MANGLE: { - bool pedit_valid = valid_pedit_action(dev, act); + bool pedit_valid = valid_pedit_action(dev, act, + &natmode_flags); if (!pedit_valid) return -EOPNOTSUPP; @@ -642,6 +782,15 @@ int cxgb4_validate_flow_actions(struct net_device *dev, return -EINVAL; } + if (act_pedit) { + int ret; + + ret = cxgb4_action_natmode_validate(adap, natmode_flags, + extack); + if (ret) + return ret; + } + return 0; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h index 6296e1d5a12b..3a2fa00c8cde 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_flower.h @@ -108,6 +108,21 @@ struct ch_tc_pedit_fields { #define PEDIT_TCP_SPORT_DPORT 0x0 #define PEDIT_UDP_SPORT_DPORT 0x0 +enum cxgb4_action_natmode_flags { + CXGB4_ACTION_NATMODE_NONE = 0, + CXGB4_ACTION_NATMODE_DIP = (1 << 0), + CXGB4_ACTION_NATMODE_SIP = (1 << 1), + CXGB4_ACTION_NATMODE_DPORT = (1 << 2), + CXGB4_ACTION_NATMODE_SPORT = (1 << 3), +}; + +/* TC PEDIT action to NATMODE translation entry */ +struct cxgb4_natmode_config { + enum chip_type chip; + u8 flags; + u8 natmode; +}; + void cxgb4_process_flow_actions(struct net_device *in, struct flow_action *actions, struct ch_filter_specification *fs); From 5ce9ad815a296374ca21f43f3b1ab5083d202ee1 Mon Sep 17 00:00:00 2001 From: David Wilder Date: Tue, 13 Oct 2020 16:20:13 -0700 Subject: [PATCH 38/46] ibmveth: Switch order of ibmveth_helper calls. ibmveth_rx_csum_helper() must be called after ibmveth_rx_mss_helper() as ibmveth_rx_csum_helper() may alter ip and tcp checksum values. Fixes: 66aa0678efc2 ("ibmveth: Support to enable LSO/CSO for Trunk VEA.") Signed-off-by: David Wilder Reviewed-by: Thomas Falcon Reviewed-by: Cristobal Forno Reviewed-by: Pradeep Satyanarayana Acked-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmveth.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c index c5c732601e35..3935a7e22ce5 100644 --- a/drivers/net/ethernet/ibm/ibmveth.c +++ b/drivers/net/ethernet/ibm/ibmveth.c @@ -1385,16 +1385,16 @@ static int ibmveth_poll(struct napi_struct *napi, int budget) skb_put(skb, length); skb->protocol = eth_type_trans(skb, netdev); - if (csum_good) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - ibmveth_rx_csum_helper(skb, adapter); - } - if (length > netdev->mtu + ETH_HLEN) { ibmveth_rx_mss_helper(skb, mss, lrg_pkt); adapter->rx_large_packets++; } + if (csum_good) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + ibmveth_rx_csum_helper(skb, adapter); + } + napi_gro_receive(napi, skb); /* send it up */ netdev->stats.rx_packets++; From 413f142cc05cb03f2d1ea83388e40c1ddc0d74e9 Mon Sep 17 00:00:00 2001 From: David Wilder Date: Tue, 13 Oct 2020 16:20:14 -0700 Subject: [PATCH 39/46] ibmveth: Identify ingress large send packets. Ingress large send packets are identified by either: The IBMVETH_RXQ_LRG_PKT flag in the receive buffer or with a -1 placed in the ip header checksum. The method used depends on firmware version. Frame geometry and sufficient header validation is performed by the hypervisor eliminating the need for further header checks here. Fixes: 7b5967389f5a ("ibmveth: set correct gso_size and gso_type") Signed-off-by: David Wilder Reviewed-by: Thomas Falcon Reviewed-by: Cristobal Forno Reviewed-by: Pradeep Satyanarayana Acked-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/ibm/ibmveth.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c index 3935a7e22ce5..7ef3369953b6 100644 --- a/drivers/net/ethernet/ibm/ibmveth.c +++ b/drivers/net/ethernet/ibm/ibmveth.c @@ -1349,6 +1349,7 @@ static int ibmveth_poll(struct napi_struct *napi, int budget) int offset = ibmveth_rxq_frame_offset(adapter); int csum_good = ibmveth_rxq_csum_good(adapter); int lrg_pkt = ibmveth_rxq_large_packet(adapter); + __sum16 iph_check = 0; skb = ibmveth_rxq_get_buffer(adapter); @@ -1385,7 +1386,17 @@ static int ibmveth_poll(struct napi_struct *napi, int budget) skb_put(skb, length); skb->protocol = eth_type_trans(skb, netdev); - if (length > netdev->mtu + ETH_HLEN) { + /* PHYP without PLSO support places a -1 in the ip + * checksum for large send frames. + */ + if (skb->protocol == cpu_to_be16(ETH_P_IP)) { + struct iphdr *iph = (struct iphdr *)skb->data; + + iph_check = iph->check; + } + + if ((length > netdev->mtu + ETH_HLEN) || + lrg_pkt || iph_check == 0xffff) { ibmveth_rx_mss_helper(skb, mss, lrg_pkt); adapter->rx_large_packets++; } From d086a1c65aabb5a4e1edc580ca583e2964c62b44 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 14 Oct 2020 11:56:42 +0300 Subject: [PATCH 40/46] net: sched: Fix suspicious RCU usage while accessing tcf_tunnel_info The access of tcf_tunnel_info() produces the following splat, so fix it by dereferencing the tcf_tunnel_key_params pointer with marker that internal tcfa_liock is held. ============================= WARNING: suspicious RCU usage 5.9.0+ #1 Not tainted ----------------------------- include/net/tc_act/tc_tunnel_key.h:59 suspicious rcu_dereference_protected() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by tc/34839: #0: ffff88828572c2a0 (&p->tcfa_lock){+...}-{2:2}, at: tc_setup_flow_action+0xb3/0x48b5 stack backtrace: CPU: 1 PID: 34839 Comm: tc Not tainted 5.9.0+ #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack+0x9a/0xd0 tc_setup_flow_action+0x14cb/0x48b5 fl_hw_replace_filter+0x347/0x690 [cls_flower] fl_change+0x2bad/0x4875 [cls_flower] tc_new_tfilter+0xf6f/0x1ba0 rtnetlink_rcv_msg+0x5f2/0x870 netlink_rcv_skb+0x124/0x350 netlink_unicast+0x433/0x700 netlink_sendmsg+0x6f1/0xbd0 sock_sendmsg+0xb0/0xe0 ____sys_sendmsg+0x4fa/0x6d0 ___sys_sendmsg+0x12e/0x1b0 __sys_sendmsg+0xa4/0x120 do_syscall_64+0x2d/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f1f8cd4fe57 Code: 0c 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10 RSP: 002b:00007ffdc1e193b8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f1f8cd4fe57 RDX: 0000000000000000 RSI: 00007ffdc1e19420 RDI: 0000000000000003 RBP: 000000005f85aafa R08: 0000000000000001 R09: 00007ffdc1e1936c R10: 000000000040522d R11: 0000000000000246 R12: 0000000000000001 R13: 0000000000000000 R14: 00007ffdc1e1d6f0 R15: 0000000000482420 Fixes: 3ebaf6da0716 ("net: sched: Do not assume RTNL is held in tunnel key action helpers") Fixes: 7a47281439ba ("net: sched: lock action when translating it to flow_action infra") Signed-off-by: Leon Romanovsky Acked-by: Cong Wang Signed-off-by: Jakub Kicinski --- include/net/tc_act/tc_tunnel_key.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h index e1057b255f69..879fe8cff581 100644 --- a/include/net/tc_act/tc_tunnel_key.h +++ b/include/net/tc_act/tc_tunnel_key.h @@ -56,7 +56,10 @@ static inline struct ip_tunnel_info *tcf_tunnel_info(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT struct tcf_tunnel_key *t = to_tunnel_key(a); - struct tcf_tunnel_key_params *params = rtnl_dereference(t->params); + struct tcf_tunnel_key_params *params; + + params = rcu_dereference_protected(t->params, + lockdep_is_held(&a->tcfa_lock)); return ¶ms->tcft_enc_metadata->u.tun_info; #else From 1d273fcc2c29343e59658276b77b02e5897a3123 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 14 Oct 2020 17:17:49 +0800 Subject: [PATCH 41/46] bpfilter: Fix build error with CONFIG_BPFILTER_UMH IF CONFIG_BPFILTER_UMH is set, building fails: In file included from /usr/include/sys/socket.h:33:0, from net/bpfilter/main.c:6: /usr/include/bits/socket.h:390:10: fatal error: asm/socket.h: No such file or directory #include ^~~~~~~~~~~~~~ compilation terminated. scripts/Makefile.userprogs:43: recipe for target 'net/bpfilter/main.o' failed make[2]: *** [net/bpfilter/main.o] Error 1 Add missing include path to fix this. Signed-off-by: YueHaibing Signed-off-by: Jakub Kicinski --- net/bpfilter/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile index cdac82b8c53a..389ea76ccc0b 100644 --- a/net/bpfilter/Makefile +++ b/net/bpfilter/Makefile @@ -5,7 +5,7 @@ userprogs := bpfilter_umh bpfilter_umh-objs := main.o -userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi +userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi -I usr/include/ ifeq ($(CONFIG_BPFILTER_UMH), y) # builtin bpfilter_umh should be linked with -static From d535ca1367787ddc8bff22d679a11f864c8228bc Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 14 Oct 2020 19:43:27 +0200 Subject: [PATCH 42/46] net/smc: fix use-after-free of delayed events When a delayed event is enqueued then the event worker will send this event the next time it is running and no other flow is currently active. The event handler is called for the delayed event, and the pointer to the event keeps set in lgr->delayed_event. This pointer is cleared later in the processing by smc_llc_flow_start(). This can lead to a use-after-free condition when the processing does not reach smc_llc_flow_start(), but frees the event because of an error situation. Then the delayed_event pointer is still set but the event is freed. Fix this by always clearing the delayed event pointer when the event is provided to the event handler for processing, and remove the code to clear it in smc_llc_flow_start(). Fixes: 555da9af827d ("net/smc: add event-based llc_flow framework") Signed-off-by: Karsten Graul Signed-off-by: Jakub Kicinski --- net/smc/smc_llc.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index f5f6487bb847..5e86926c83a1 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -233,8 +233,6 @@ static bool smc_llc_flow_start(struct smc_llc_flow *flow, default: flow->type = SMC_LLC_FLOW_NONE; } - if (qentry == lgr->delayed_event) - lgr->delayed_event = NULL; smc_llc_flow_qentry_set(flow, qentry); spin_unlock_bh(&lgr->llc_flow_lock); return true; @@ -1603,13 +1601,12 @@ static void smc_llc_event_work(struct work_struct *work) struct smc_llc_qentry *qentry; if (!lgr->llc_flow_lcl.type && lgr->delayed_event) { - if (smc_link_usable(lgr->delayed_event->link)) { - smc_llc_event_handler(lgr->delayed_event); - } else { - qentry = lgr->delayed_event; - lgr->delayed_event = NULL; + qentry = lgr->delayed_event; + lgr->delayed_event = NULL; + if (smc_link_usable(qentry->link)) + smc_llc_event_handler(qentry); + else kfree(qentry); - } } again: From ef12ad45880b696eb993d86c481ca891836ab593 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 14 Oct 2020 19:43:28 +0200 Subject: [PATCH 43/46] net/smc: fix valid DMBE buffer sizes The SMCD_DMBE_SIZES should include all valid DMBE buffer sizes, so the correct value is 6 which means 1MB. With 7 the registration of an ISM buffer would always fail because of the invalid size requested. Fix that and set the value to 6. Fixes: c6ba7c9ba43d ("net/smc: add base infrastructure for SMC-D and ISM") Signed-off-by: Karsten Graul Signed-off-by: Jakub Kicinski --- net/smc/smc_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index a406627b1d55..7c0e4fac9748 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1597,7 +1597,7 @@ out: return rc; } -#define SMCD_DMBE_SIZES 7 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ +#define SMCD_DMBE_SIZES 6 /* 0 -> 16KB, 1 -> 32KB, .. 6 -> 1MB */ static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, bool is_dmb, int bufsize) From 6b1bbf94ab369d97ed3bdaa561521a52c27ef619 Mon Sep 17 00:00:00 2001 From: Karsten Graul Date: Wed, 14 Oct 2020 19:43:29 +0200 Subject: [PATCH 44/46] net/smc: fix invalid return code in smcd_new_buf_create() smc_ism_register_dmb() returns error codes set by the ISM driver which are not guaranteed to be negative or in the errno range. Such values would not be handled by ERR_PTR() and finally the return code will be used as a memory address. Fix that by using a valid negative errno value with ERR_PTR(). Fixes: 72b7f6c48708 ("net/smc: unique reason code for exceeded max dmb count") Signed-off-by: Karsten Graul Signed-off-by: Jakub Kicinski --- net/smc/smc_core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 7c0e4fac9748..59cc99fec2a2 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -1616,7 +1616,8 @@ static struct smc_buf_desc *smcd_new_buf_create(struct smc_link_group *lgr, rc = smc_ism_register_dmb(lgr, bufsize, buf_desc); if (rc) { kfree(buf_desc); - return (rc == -ENOMEM) ? ERR_PTR(-EAGAIN) : ERR_PTR(rc); + return (rc == -ENOMEM) ? ERR_PTR(-EAGAIN) : + ERR_PTR(-EIO); } buf_desc->pages = virt_to_page(buf_desc->cpu_addr); /* CDC header stored in buf. So, pretend it was smaller */ From 6617dfd440149e42ce4d2be615eb31a4755f4d30 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 14 Oct 2020 07:46:12 -0700 Subject: [PATCH 45/46] net: fix pos incrementment in ipv6_route_seq_next Commit 4fc427e05158 ("ipv6_route_seq_next should increase position index") tried to fix the issue where seq_file pos is not increased if a NULL element is returned with seq_ops->next(). See bug https://bugzilla.kernel.org/show_bug.cgi?id=206283 The commit effectively does: - increase pos for all seq_ops->start() - increase pos for all seq_ops->next() For ipv6_route, increasing pos for all seq_ops->next() is correct. But increasing pos for seq_ops->start() is not correct since pos is used to determine how many items to skip during seq_ops->start(): iter->skip = *pos; seq_ops->start() just fetches the *current* pos item. The item can be skipped only after seq_ops->show() which essentially is the beginning of seq_ops->next(). For example, I have 7 ipv6 route entries, root@arch-fb-vm1:~/net-next dd if=/proc/net/ipv6_route bs=4096 00000000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000400 00000001 00000000 00000001 eth0 fe800000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000001 00000000 00000001 eth0 00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200 lo 00000000000000000000000000000001 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000003 00000000 80200001 lo fe800000000000002050e3fffebd3be8 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001 eth0 ff000000000000000000000000000000 08 00000000000000000000000000000000 00 00000000000000000000000000000000 00000100 00000004 00000000 00000001 eth0 00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200 lo 0+1 records in 0+1 records out 1050 bytes (1.0 kB, 1.0 KiB) copied, 0.00707908 s, 148 kB/s root@arch-fb-vm1:~/net-next In the above, I specify buffer size 4096, so all records can be returned to user space with a single trip to the kernel. If I use buffer size 128, since each record size is 149, internally kernel seq_read() will read 149 into its internal buffer and return the data to user space in two read() syscalls. Then user read() syscall will trigger next seq_ops->start(). Since the current implementation increased pos even for seq_ops->start(), it will skip record #2, #4 and #6, assuming the first record is #1. root@arch-fb-vm1:~/net-next dd if=/proc/net/ipv6_route bs=128 00000000000000000000000000000000 40 00000000000000000000000000000000 00 00000000000000000000000000000000 00000400 00000001 00000000 00000001 eth0 00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200 lo fe800000000000002050e3fffebd3be8 80 00000000000000000000000000000000 00 00000000000000000000000000000000 00000000 00000002 00000000 80200001 eth0 00000000000000000000000000000000 00 00000000000000000000000000000000 00 00000000000000000000000000000000 ffffffff 00000001 00000000 00200200 lo 4+1 records in 4+1 records out 600 bytes copied, 0.00127758 s, 470 kB/s To fix the problem, create a fake pos pointer so seq_ops->start() won't actually increase seq_file pos. With this fix, the above `dd` command with `bs=128` will show correct result. Fixes: 4fc427e05158 ("ipv6_route_seq_next should increase position index") Cc: Alexei Starovoitov Suggested-by: Vasily Averin Reviewed-by: Vasily Averin Signed-off-by: Yonghong Song Acked-by: Martin KaFai Lau Acked-by: Andrii Nakryiko Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_fib.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 4a664ad4f4d4..f88693929e8d 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -2618,8 +2618,10 @@ static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos) iter->skip = *pos; if (iter->tbl) { + loff_t p = 0; + ipv6_route_seq_setup_walk(iter, net); - return ipv6_route_seq_next(seq, NULL, pos); + return ipv6_route_seq_next(seq, NULL, &p); } else { return NULL; } From 2ecbc1f684482b4ed52447a39903bd9b0f222898 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 15 Oct 2020 12:33:24 -0700 Subject: [PATCH 46/46] Revert "bpfilter: Fix build error with CONFIG_BPFILTER_UMH" This reverts commit 1d273fcc2c29343e59658276b77b02e5897a3123. Alexei points out there's nothing implying headers will be built and therefore exist under usr/include, so this fix doesn't make much sense. Signed-off-by: Jakub Kicinski --- net/bpfilter/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bpfilter/Makefile b/net/bpfilter/Makefile index 389ea76ccc0b..cdac82b8c53a 100644 --- a/net/bpfilter/Makefile +++ b/net/bpfilter/Makefile @@ -5,7 +5,7 @@ userprogs := bpfilter_umh bpfilter_umh-objs := main.o -userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi -I usr/include/ +userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi ifeq ($(CONFIG_BPFILTER_UMH), y) # builtin bpfilter_umh should be linked with -static