From 18f02ad19e2c2a1d9e1d55a4e1c0cbf51419151c Mon Sep 17 00:00:00 2001 From: Xiyu Yang Date: Sun, 26 Apr 2020 11:35:15 +0800 Subject: [PATCH 01/68] bpf: Fix sk_psock refcnt leak when receiving message tcp_bpf_recvmsg() invokes sk_psock_get(), which returns a reference of the specified sk_psock object to "psock" with increased refcnt. When tcp_bpf_recvmsg() returns, local variable "psock" becomes invalid, so the refcount should be decreased to keep refcount balanced. The reference counting issue happens in several exception handling paths of tcp_bpf_recvmsg(). When those error scenarios occur such as "flags" includes MSG_ERRQUEUE, the function forgets to decrease the refcnt increased by sk_psock_get(), causing a refcnt leak. Fix this issue by calling sk_psock_put() or pulling up the error queue read handling when those error scenarios occur. Fixes: e7a5f1f1cd000 ("bpf/sockmap: Read psock ingress_msg before sk_receive_queue") Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/1587872115-42805-1-git-send-email-xiyuyang19@fudan.edu.cn --- net/ipv4/tcp_bpf.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 5a05327f97c1..ff96466ea6da 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -262,14 +262,17 @@ static int tcp_bpf_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, struct sk_psock *psock; int copied, ret; + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + psock = sk_psock_get(sk); if (unlikely(!psock)) return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); - if (unlikely(flags & MSG_ERRQUEUE)) - return inet_recv_error(sk, msg, len, addr_len); if (!skb_queue_empty(&sk->sk_receive_queue) && - sk_psock_queue_empty(psock)) + sk_psock_queue_empty(psock)) { + sk_psock_put(sk, psock); return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + } lock_sock(sk); msg_bytes_ready: copied = __tcp_bpf_recvmsg(sk, psock, msg, len, flags); From 7f645462ca01d01abb94d75e6768c8b3ed3a188b Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 30 Apr 2020 08:18:51 +0000 Subject: [PATCH 02/68] bpf: Fix error return code in map_lookup_and_delete_elem() Fix to return negative error code -EFAULT from the copy_to_user() error handling case instead of 0, as done elsewhere in this function. Fixes: bd513cd08f10 ("bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall") Signed-off-by: Wei Yongjun Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200430081851.166996-1-weiyongjun1@huawei.com --- kernel/bpf/syscall.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7626b8024471..2843bbba9ca1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1485,8 +1485,10 @@ static int map_lookup_and_delete_elem(union bpf_attr *attr) if (err) goto free_value; - if (copy_to_user(uvalue, value, value_size) != 0) + if (copy_to_user(uvalue, value, value_size) != 0) { + err = -EFAULT; goto free_value; + } err = 0; From 3e104c23816220919ea1b3fd93fabe363c67c484 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 4 May 2020 10:21:23 -0700 Subject: [PATCH 03/68] bpf, sockmap: msg_pop_data can incorrecty set an sge length When sk_msg_pop() is called where the pop operation is working on the end of a sge element and there is no additional trailing data and there _is_ data in front of pop, like the following case, |____________a_____________|__pop__| We have out of order operations where we incorrectly set the pop variable so that instead of zero'ing pop we incorrectly leave it untouched, effectively. This can cause later logic to shift the buffers around believing it should pop extra space. The result is we have 'popped' more data then we expected potentially breaking program logic. It took us a while to hit this case because typically we pop headers which seem to rarely be at the end of a scatterlist elements but we can't rely on this. Fixes: 7246d8ed4dcce ("bpf: helper to pop data from messages") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/158861288359.14306.7654891716919968144.stgit@john-Precision-5820-Tower --- net/core/filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/filter.c b/net/core/filter.c index 7d6ceaa54d21..5cc9276f1023 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2590,8 +2590,8 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start, } pop = 0; } else if (pop >= sge->length - a) { - sge->length = a; pop -= (sge->length - a); + sge->length = a; } } From 81aabbb9fb7b4b1efd073b62f0505d3adad442f3 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 4 May 2020 10:21:44 -0700 Subject: [PATCH 04/68] bpf, sockmap: bpf_tcp_ingress needs to subtract bytes from sg.size In bpf_tcp_ingress we used apply_bytes to subtract bytes from sg.size which is used to track total bytes in a message. But this is not correct because apply_bytes is itself modified in the main loop doing the mem_charge. Then at the end of this we have sg.size incorrectly set and out of sync with actual sk values. Then we can get a splat if we try to cork the data later and again try to redirect the msg to ingress. To fix instead of trying to track msg.size do the easy thing and include it as part of the sk_msg_xfer logic so that when the msg is moved the sg.size is always correct. To reproduce the below users will need ingress + cork and hit an error path that will then try to 'free' the skmsg. [ 173.699981] BUG: KASAN: null-ptr-deref in sk_msg_free_elem+0xdd/0x120 [ 173.699987] Read of size 8 at addr 0000000000000008 by task test_sockmap/5317 [ 173.700000] CPU: 2 PID: 5317 Comm: test_sockmap Tainted: G I 5.7.0-rc1+ #43 [ 173.700005] Hardware name: Dell Inc. Precision 5820 Tower/002KVM, BIOS 1.9.2 01/24/2019 [ 173.700009] Call Trace: [ 173.700021] dump_stack+0x8e/0xcb [ 173.700029] ? sk_msg_free_elem+0xdd/0x120 [ 173.700034] ? sk_msg_free_elem+0xdd/0x120 [ 173.700042] __kasan_report+0x102/0x15f [ 173.700052] ? sk_msg_free_elem+0xdd/0x120 [ 173.700060] kasan_report+0x32/0x50 [ 173.700070] sk_msg_free_elem+0xdd/0x120 [ 173.700080] __sk_msg_free+0x87/0x150 [ 173.700094] tcp_bpf_send_verdict+0x179/0x4f0 [ 173.700109] tcp_bpf_sendpage+0x3ce/0x5d0 Fixes: 604326b41a6fb ("bpf, sockmap: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Reviewed-by: Jakub Sitnicki Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/158861290407.14306.5327773422227552482.stgit@john-Precision-5820-Tower --- include/linux/skmsg.h | 1 + net/ipv4/tcp_bpf.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 8a709f63c5e5..ad31c9fb7158 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -187,6 +187,7 @@ static inline void sk_msg_xfer(struct sk_msg *dst, struct sk_msg *src, dst->sg.data[which] = src->sg.data[which]; dst->sg.data[which].length = size; dst->sg.size += size; + src->sg.size -= size; src->sg.data[which].length -= size; src->sg.data[which].offset += size; } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index ff96466ea6da..629aaa9a1eb9 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -125,7 +125,6 @@ static int bpf_tcp_ingress(struct sock *sk, struct sk_psock *psock, if (!ret) { msg->sg.start = i; - msg->sg.size -= apply_bytes; sk_psock_queue_msg(psock, tmp); sk_psock_data_ready(sk, psock); } else { From 01c3259818a11f3cc3cd767adbae6b45849c03c1 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 7 May 2020 03:25:56 -0400 Subject: [PATCH 05/68] virtio_net: fix lockdep warning on 32 bit When we fill up a receive VQ, try_fill_recv currently tries to count kicks using a 64 bit stats counter. Turns out, on a 32 bit kernel that uses a seqcount. sequence counts are "lock" constructs where you need to make sure that writers are serialized. In turn, this means that we mustn't run two try_fill_recv concurrently. Which of course we don't. We do run try_fill_recv sometimes from a softirq napi context, and sometimes from a fully preemptible context, but the later always runs with napi disabled. However, when it comes to the seqcount, lockdep is trying to enforce the rule that the same lock isn't accessed from preemptible and softirq context - it doesn't know about napi being enabled/disabled. This causes a false-positive warning: WARNING: inconsistent lock state ... inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage. As a work around, shut down the warning by switching to u64_stats_update_begin_irqsave - that works by disabling interrupts on 32 bit only, is a NOP on 64 bit. Reported-by: Thomas Gleixner Suggested-by: Eric Dumazet Signed-off-by: Michael S. Tsirkin Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 11f722460513..ce07f52d89e7 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1243,9 +1243,11 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq, break; } while (rq->vq->num_free); if (virtqueue_kick_prepare(rq->vq) && virtqueue_notify(rq->vq)) { - u64_stats_update_begin(&rq->stats.syncp); + unsigned long flags; + + flags = u64_stats_update_begin_irqsave(&rq->stats.syncp); rq->stats.kicks++; - u64_stats_update_end(&rq->stats.syncp); + u64_stats_update_end_irqrestore(&rq->stats.syncp, flags); } return !oom; From 64082b67ba724c5a5acfff38d352068c4992d089 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Thu, 7 May 2020 00:58:05 -0700 Subject: [PATCH 06/68] net: remove spurious declaration of tcp_default_init_rwnd() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit it doesn't actually exist... Test: builds and 'git grep tcp_default_init_rwnd' comes up empty Signed-off-by: Maciej Żenczykowski Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index dcf9a72eeaa6..64f84683feae 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1376,7 +1376,6 @@ static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) rx_opt->num_sacks = 0; } -u32 tcp_default_init_rwnd(u32 mss); void tcp_cwnd_restart(struct sock *sk, s32 delta); static inline void tcp_slow_start_after_idle_check(struct sock *sk) From 1a10186e598a5be5bc1cd5e55d9e9598a7c079ac Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Thu, 7 May 2020 11:49:49 +0200 Subject: [PATCH 07/68] usb: hso: correct debug message If you do not find the OUT endpoint, you should say so, rather than copy the error message for the IN endpoint. Presumably a copy and paste error. Signed-off-by: Oliver Neukum Signed-off-by: David S. Miller --- drivers/net/usb/hso.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c index 417e42c9fd03..bb8c34d746ab 100644 --- a/drivers/net/usb/hso.c +++ b/drivers/net/usb/hso.c @@ -2659,7 +2659,7 @@ static struct hso_device *hso_create_bulk_serial_device( if (! (serial->out_endp = hso_get_ep(interface, USB_ENDPOINT_XFER_BULK, USB_DIR_OUT))) { - dev_err(&interface->dev, "Failed to find BULK IN ep\n"); + dev_err(&interface->dev, "Failed to find BULK OUT ep\n"); goto exit2; } From 8aef199481df0d2b6d110b8be321d5358511fe01 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 7 May 2020 13:45:11 +0200 Subject: [PATCH 08/68] net: hisilicon: Make CONFIG_HNS invisible The HNS config symbol enables the framework support for the Hisilicon Network Subsystem. It is already selected by all of its users, so there is no reason to make it visible. Signed-off-by: Geert Uytterhoeven Signed-off-by: David S. Miller --- drivers/net/ethernet/hisilicon/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/Kconfig b/drivers/net/ethernet/hisilicon/Kconfig index 3892a2062404..2fff43509098 100644 --- a/drivers/net/ethernet/hisilicon/Kconfig +++ b/drivers/net/ethernet/hisilicon/Kconfig @@ -64,7 +64,7 @@ config HNS_MDIO the PHY config HNS - tristate "Hisilicon Network Subsystem Support (Framework)" + tristate ---help--- This selects the framework support for Hisilicon Network Subsystem. It is needed by any driver which provides HNS acceleration engine or make From ee2875566868687a95b8ec23913c0d6bce220efd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 May 2020 19:22:14 +0200 Subject: [PATCH 09/68] net: bareudp: avoid uninitialized variable warning clang points out that building without IPv6 would lead to returning an uninitialized variable if a packet with family!=AF_INET is passed into bareudp_udp_encap_recv(): drivers/net/bareudp.c:139:6: error: variable 'err' is used uninitialized whenever 'if' condition is false [-Werror,-Wsometimes-uninitialized] if (family == AF_INET) ^~~~~~~~~~~~~~~~~ drivers/net/bareudp.c:146:15: note: uninitialized use occurs here if (unlikely(err)) { ^~~ include/linux/compiler.h:78:42: note: expanded from macro 'unlikely' # define unlikely(x) __builtin_expect(!!(x), 0) ^ drivers/net/bareudp.c:139:2: note: remove the 'if' if its condition is always true if (family == AF_INET) ^~~~~~~~~~~~~~~~~~~~~~ This cannot happen in practice, so change the condition in a way that gcc sees the IPv4 case as unconditionally true here. For consistency, change all the similar constructs in this file the same way, using "if(IS_ENABLED())" instead of #if IS_ENABLED()". Fixes: 571912c69f0e ("net: UDP tunnel encapsulation module for tunnelling different protocols like MPLS, IP, NSH etc.") Signed-off-by: Arnd Bergmann Reviewed-by: Nathan Chancellor Signed-off-by: David S. Miller --- drivers/net/bareudp.c | 18 ++++-------------- include/net/udp_tunnel.h | 2 -- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index cc0703c3d57f..efd1a1d1f35e 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -136,25 +136,21 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) oiph = skb_network_header(skb); skb_reset_network_header(skb); - if (family == AF_INET) + if (!IS_ENABLED(CONFIG_IPV6) || family == AF_INET) err = IP_ECN_decapsulate(oiph, skb); -#if IS_ENABLED(CONFIG_IPV6) else err = IP6_ECN_decapsulate(oiph, skb); -#endif if (unlikely(err)) { if (log_ecn_error) { - if (family == AF_INET) + if (!IS_ENABLED(CONFIG_IPV6) || family == AF_INET) net_info_ratelimited("non-ECT from %pI4 " "with TOS=%#x\n", &((struct iphdr *)oiph)->saddr, ((struct iphdr *)oiph)->tos); -#if IS_ENABLED(CONFIG_IPV6) else net_info_ratelimited("non-ECT from %pI6\n", &((struct ipv6hdr *)oiph)->saddr); -#endif } if (err > 1) { ++bareudp->dev->stats.rx_frame_errors; @@ -350,7 +346,6 @@ free_dst: return err; } -#if IS_ENABLED(CONFIG_IPV6) static int bareudp6_xmit_skb(struct sk_buff *skb, struct net_device *dev, struct bareudp_dev *bareudp, const struct ip_tunnel_info *info) @@ -411,7 +406,6 @@ free_dst: dst_release(dst); return err; } -#endif static netdev_tx_t bareudp_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -435,11 +429,9 @@ static netdev_tx_t bareudp_xmit(struct sk_buff *skb, struct net_device *dev) } rcu_read_lock(); -#if IS_ENABLED(CONFIG_IPV6) - if (info->mode & IP_TUNNEL_INFO_IPV6) + if (IS_ENABLED(CONFIG_IPV6) && info->mode & IP_TUNNEL_INFO_IPV6) err = bareudp6_xmit_skb(skb, dev, bareudp, info); else -#endif err = bareudp_xmit_skb(skb, dev, bareudp, info); rcu_read_unlock(); @@ -467,7 +459,7 @@ static int bareudp_fill_metadata_dst(struct net_device *dev, use_cache = ip_tunnel_dst_cache_usable(skb, info); - if (ip_tunnel_info_af(info) == AF_INET) { + if (!IS_ENABLED(CONFIG_IPV6) || ip_tunnel_info_af(info) == AF_INET) { struct rtable *rt; __be32 saddr; @@ -478,7 +470,6 @@ static int bareudp_fill_metadata_dst(struct net_device *dev, ip_rt_put(rt); info->key.u.ipv4.src = saddr; -#if IS_ENABLED(CONFIG_IPV6) } else if (ip_tunnel_info_af(info) == AF_INET6) { struct dst_entry *dst; struct in6_addr saddr; @@ -492,7 +483,6 @@ static int bareudp_fill_metadata_dst(struct net_device *dev, dst_release(dst); info->key.u.ipv6.src = saddr; -#endif } else { return -EINVAL; } diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 4b1f95e08307..e7312ceb2794 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -143,14 +143,12 @@ void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb __be16 df, __be16 src_port, __be16 dst_port, bool xnet, bool nocheck); -#if IS_ENABLED(CONFIG_IPV6) int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb, struct net_device *dev, struct in6_addr *saddr, struct in6_addr *daddr, __u8 prio, __u8 ttl, __be32 label, __be16 src_port, __be16 dst_port, bool nocheck); -#endif void udp_tunnel_sock_release(struct socket *sock); From 09454fd0a4ce23cb3d8af65066c91a1bf27120dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Tue, 5 May 2020 11:57:23 -0700 Subject: [PATCH 10/68] Revert "ipv6: add mtu lock check in __ip6_rt_update_pmtu" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit 19bda36c4299ce3d7e5bce10bebe01764a655a6d: | ipv6: add mtu lock check in __ip6_rt_update_pmtu | | Prior to this patch, ipv6 didn't do mtu lock check in ip6_update_pmtu. | It leaded to that mtu lock doesn't really work when receiving the pkt | of ICMPV6_PKT_TOOBIG. | | This patch is to add mtu lock check in __ip6_rt_update_pmtu just as ipv4 | did in __ip_rt_update_pmtu. The above reasoning is incorrect. IPv6 *requires* icmp based pmtu to work. There's already a comment to this effect elsewhere in the kernel: $ git grep -p -B1 -A3 'RTAX_MTU lock' net/ipv6/route.c=4813= static int rt6_mtu_change_route(struct fib6_info *f6i, void *p_arg) ... /* In IPv6 pmtu discovery is not optional, so that RTAX_MTU lock cannot disable it. We still use this lock to block changes caused by addrconf/ndisc. */ This reverts to the pre-4.9 behaviour. Cc: Eric Dumazet Cc: Willem de Bruijn Cc: Xin Long Cc: Hannes Frederic Sowa Signed-off-by: Maciej Żenczykowski Fixes: 19bda36c4299 ("ipv6: add mtu lock check in __ip6_rt_update_pmtu") Signed-off-by: David S. Miller --- net/ipv6/route.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8d418038fe32..ff847a324220 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2722,8 +2722,10 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, const struct in6_addr *daddr, *saddr; struct rt6_info *rt6 = (struct rt6_info *)dst; - if (dst_metric_locked(dst, RTAX_MTU)) - return; + /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU) + * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it. + * [see also comment in rt6_mtu_change_route()] + */ if (iph) { daddr = &iph->daddr; From ff8ce319e9c25e920d994cc35236f0bb32dfc8f3 Mon Sep 17 00:00:00 2001 From: Chuhong Yuan Date: Thu, 7 May 2020 23:13:20 +0800 Subject: [PATCH 11/68] net: microchip: encx24j600: add missed kthread_stop This driver calls kthread_run() in probe, but forgets to call kthread_stop() in probe failure and remove. Add the missed kthread_stop() to fix it. Signed-off-by: Chuhong Yuan Signed-off-by: David S. Miller --- drivers/net/ethernet/microchip/encx24j600.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/encx24j600.c b/drivers/net/ethernet/microchip/encx24j600.c index 39925e4bf2ec..b25a13da900a 100644 --- a/drivers/net/ethernet/microchip/encx24j600.c +++ b/drivers/net/ethernet/microchip/encx24j600.c @@ -1070,7 +1070,7 @@ static int encx24j600_spi_probe(struct spi_device *spi) if (unlikely(ret)) { netif_err(priv, probe, ndev, "Error %d initializing card encx24j600 card\n", ret); - goto out_free; + goto out_stop; } eidled = encx24j600_read_reg(priv, EIDLED); @@ -1088,6 +1088,8 @@ static int encx24j600_spi_probe(struct spi_device *spi) out_unregister: unregister_netdev(priv->ndev); +out_stop: + kthread_stop(priv->kworker_task); out_free: free_netdev(ndev); @@ -1100,6 +1102,7 @@ static int encx24j600_spi_remove(struct spi_device *spi) struct encx24j600_priv *priv = dev_get_drvdata(&spi->dev); unregister_netdev(priv->ndev); + kthread_stop(priv->kworker_task); free_netdev(priv->ndev); From 7d14b0d2b9b317cfc14161143e2006b95a5da9b1 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 7 May 2020 18:53:24 +0200 Subject: [PATCH 12/68] mptcp: set correct vfs info for subflows When a subflow is created via mptcp_subflow_create_socket(), a new 'struct socket' is allocated, with a new i_ino value. When inspecting TCP sockets via the procfs and or the diag interface, the above ones are not related to the process owning the MPTCP master socket, even if they are a logical part of it ('ss -p' shows an empty process field) Additionally, subflows created by the path manager get the uid/gid from the running workqueue. Subflows are part of the owning MPTCP master socket, let's adjust the vfs info to reflect this. After this patch, 'ss' correctly displays subflows as belonging to the msk socket creator. Fixes: 2303f994b3e1 ("mptcp: Associate MPTCP context with TCP socket") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/subflow.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 67a4e35d4838..4931a29a6f08 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1012,6 +1012,16 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) if (err) return err; + /* the newly created socket really belongs to the owning MPTCP master + * socket, even if for additional subflows the allocation is performed + * by a kernel workqueue. Adjust inode references, so that the + * procfs/diag interaces really show this one belonging to the correct + * user. + */ + SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino; + SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid; + SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid; + subflow = mptcp_subflow_ctx(sf->sk); pr_debug("subflow=%p", subflow); From dd912306ff008891c82cd9f63e8181e47a9cb2fb Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 7 May 2020 12:19:03 -0700 Subject: [PATCH 13/68] net: fix a potential recursive NETDEV_FEAT_CHANGE syzbot managed to trigger a recursive NETDEV_FEAT_CHANGE event between bonding master and slave. I managed to find a reproducer for this: ip li set bond0 up ifenslave bond0 eth0 brctl addbr br0 ethtool -K eth0 lro off brctl addif br0 bond0 ip li set br0 up When a NETDEV_FEAT_CHANGE event is triggered on a bonding slave, it captures this and calls bond_compute_features() to fixup its master's and other slaves' features. However, when syncing with its lower devices by netdev_sync_lower_features() this event is triggered again on slaves when the LRO feature fails to change, so it goes back and forth recursively until the kernel stack is exhausted. Commit 17b85d29e82c intentionally lets __netdev_update_features() return -1 for such a failure case, so we have to just rely on the existing check inside netdev_sync_lower_features() and skip NETDEV_FEAT_CHANGE event only for this specific failure case. Fixes: fd867d51f889 ("net/core: generic support for disabling netdev features down stack") Reported-by: syzbot+e73ceacfd8560cc8a3ca@syzkaller.appspotmail.com Reported-by: syzbot+c2fb6f9ddcea95ba49b5@syzkaller.appspotmail.com Cc: Jarod Wilson Cc: Nikolay Aleksandrov Cc: Josh Poimboeuf Cc: Jann Horn Reviewed-by: Jay Vosburgh Signed-off-by: Cong Wang Acked-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- net/core/dev.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 522288177bbd..6d327b7aa813 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8907,11 +8907,13 @@ static void netdev_sync_lower_features(struct net_device *upper, netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n", &feature, lower->name); lower->wanted_features &= ~feature; - netdev_update_features(lower); + __netdev_update_features(lower); if (unlikely(lower->features & feature)) netdev_WARN(upper, "failed to disable %pNF on %s!\n", &feature, lower->name); + else + netdev_features_change(lower); } } } From cc4de047b33be247f9c8150d3e496743a49642b8 Mon Sep 17 00:00:00 2001 From: Kelly Littlepage Date: Fri, 8 May 2020 19:58:46 +0000 Subject: [PATCH 14/68] net: tcp: fix rx timestamp behavior for tcp_recvmsg The stated intent of the original commit is to is to "return the timestamp corresponding to the highest sequence number data returned." The current implementation returns the timestamp for the last byte of the last fully read skb, which is not necessarily the last byte in the recv buffer. This patch converts behavior to the original definition, and to the behavior of the previous draft versions of commit 98aaa913b4ed ("tcp: Extend SOF_TIMESTAMPING_RX_SOFTWARE to TCP recvmsg") which also match this behavior. Fixes: 98aaa913b4ed ("tcp: Extend SOF_TIMESTAMPING_RX_SOFTWARE to TCP recvmsg") Co-developed-by: Iris Liu Signed-off-by: Iris Liu Signed-off-by: Kelly Littlepage Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6d87de434377..e72bd651d21a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2154,13 +2154,15 @@ skip_copy: tp->urg_data = 0; tcp_fast_path_check(sk); } - if (used + offset < skb->len) - continue; if (TCP_SKB_CB(skb)->has_rxtstamp) { tcp_update_recv_tstamps(skb, &tss); cmsg_flags |= 2; } + + if (used + offset < skb->len) + continue; + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) goto found_fin_ok; if (!(flags & MSG_PEEK)) From 1f8492df081bd66255764f3ce82ba1b2c37def49 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 8 May 2020 08:24:14 +0200 Subject: [PATCH 15/68] r8169: re-establish support for RTL8401 chip version MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit r8169 never had native support for the RTL8401, however it reportedly worked with the fallback to RTL8101e [0]. Therefore let's add this as an explicit assignment. [0] https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=956868 Fixes: b4cc2dcc9c7c ("r8169: remove default chip versions") Reported-by: Camaleón Signed-off-by: Heiner Kallweit Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index bf5bf05970a2..78e15cc00e0a 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -2127,6 +2127,8 @@ static enum mac_version rtl8169_get_mac_version(u16 xid, bool gmii) { 0x7cf, 0x348, RTL_GIGA_MAC_VER_07 }, { 0x7cf, 0x248, RTL_GIGA_MAC_VER_07 }, { 0x7cf, 0x340, RTL_GIGA_MAC_VER_13 }, + /* RTL8401, reportedly works if treated as RTL8101e */ + { 0x7cf, 0x240, RTL_GIGA_MAC_VER_13 }, { 0x7cf, 0x343, RTL_GIGA_MAC_VER_10 }, { 0x7cf, 0x342, RTL_GIGA_MAC_VER_16 }, { 0x7c8, 0x348, RTL_GIGA_MAC_VER_09 }, From 5099dea0a59f1c89525bb0ceac36689178a4c125 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 8 May 2020 07:27:35 +0000 Subject: [PATCH 16/68] nfp: abm: fix error return code in nfp_abm_vnic_alloc() Fix to return negative error code -ENOMEM from the kzalloc() error handling case instead of 0, as done elsewhere in this function. Fixes: 174ab544e3bc ("nfp: abm: add cls_u32 offload for simple band classification") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/netronome/nfp/abm/main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/netronome/nfp/abm/main.c b/drivers/net/ethernet/netronome/nfp/abm/main.c index 354efffac0f9..bdbf0726145e 100644 --- a/drivers/net/ethernet/netronome/nfp/abm/main.c +++ b/drivers/net/ethernet/netronome/nfp/abm/main.c @@ -333,8 +333,10 @@ nfp_abm_vnic_alloc(struct nfp_app *app, struct nfp_net *nn, unsigned int id) goto err_free_alink; alink->prio_map = kzalloc(abm->prio_map_len, GFP_KERNEL); - if (!alink->prio_map) + if (!alink->prio_map) { + err = -ENOMEM; goto err_free_alink; + } /* This is a multi-host app, make sure MAC/PHY is up, but don't * make the MAC/PHY state follow the state of any of the ports. From 6d32a5119811d2e9b5caa284181944c6f1f192ed Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 8 May 2020 17:37:20 +0300 Subject: [PATCH 17/68] dpaa2-eth: prevent array underflow in update_cls_rule() The "location" is controlled by the user via the ethtool_set_rxnfc() function. This update_cls_rule() function checks for array overflows but it doesn't check if the value is negative. I have changed the type to unsigned to prevent array underflows. Fixes: afb90dbb5f78 ("dpaa2-eth: Add ethtool support for flow classification") Signed-off-by: Dan Carpenter Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c index 94347c695233..b7141fdc279e 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-ethtool.c @@ -635,7 +635,7 @@ static int num_rules(struct dpaa2_eth_priv *priv) static int update_cls_rule(struct net_device *net_dev, struct ethtool_rx_flow_spec *new_fs, - int location) + unsigned int location) { struct dpaa2_eth_priv *priv = netdev_priv(net_dev); struct dpaa2_eth_cls_rule *rule; From db803036ada7d61d096783726f9771b3fc540370 Mon Sep 17 00:00:00 2001 From: Vincent Minet Date: Fri, 8 May 2020 00:14:22 +0200 Subject: [PATCH 18/68] umh: fix memory leak on execve failure If a UMH process created by fork_usermode_blob() fails to execute, a pair of struct file allocated by umh_pipe_setup() will leak. Under normal conditions, the caller (like bpfilter) needs to manage the lifetime of the UMH and its two pipes. But when fork_usermode_blob() fails, the caller doesn't really have a way to know what needs to be done. It seems better to do the cleanup ourselves in this case. Fixes: 449325b52b7a ("umh: introduce fork_usermode_blob() helper") Signed-off-by: Vincent Minet Signed-off-by: Jakub Kicinski --- kernel/umh.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/umh.c b/kernel/umh.c index 7f255b5a8845..20d51e0957e0 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -475,6 +475,12 @@ static void umh_clean_and_save_pid(struct subprocess_info *info) { struct umh_info *umh_info = info->data; + /* cleanup if umh_pipe_setup() was successful but exec failed */ + if (info->pid && info->retval) { + fput(umh_info->pipe_to_umh); + fput(umh_info->pipe_from_umh); + } + argv_free(info->argv); umh_info->pid = info->pid; } From 9302bead664f49ef73040fa06fa64552d02fee42 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 8 May 2020 02:15:19 +0000 Subject: [PATCH 19/68] octeontx2-vf: Fix error return code in otx2vf_probe() Fix to return negative error code -ENOMEM from the alloc failed error handling case instead of 0, as done elsewhere in this function. Fixes: 3184fb5ba96e ("octeontx2-vf: Virtual function driver support") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c index 187c633a7af5..f4227517dc8e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c @@ -497,13 +497,17 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id) hw->irq_name = devm_kmalloc_array(&hw->pdev->dev, num_vec, NAME_SIZE, GFP_KERNEL); - if (!hw->irq_name) + if (!hw->irq_name) { + err = -ENOMEM; goto err_free_netdev; + } hw->affinity_mask = devm_kcalloc(&hw->pdev->dev, num_vec, sizeof(cpumask_var_t), GFP_KERNEL); - if (!hw->affinity_mask) + if (!hw->affinity_mask) { + err = -ENOMEM; goto err_free_netdev; + } err = pci_alloc_irq_vectors(hw->pdev, num_vec, num_vec, PCI_IRQ_MSIX); if (err < 0) { From 57644431a6c2faac5d754ebd35780cf43a531b1a Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 8 May 2020 19:28:34 +0200 Subject: [PATCH 20/68] net: ipv4: really enforce backoff for redirects In commit b406472b5ad7 ("net: ipv4: avoid mixed n_redirects and rate_tokens usage") I missed the fact that a 0 'rate_tokens' will bypass the backoff algorithm. Since rate_tokens is cleared after a redirect silence, and never incremented on redirects, if the host keeps receiving packets requiring redirect it will reply ignoring the backoff. Additionally, the 'rate_last' field will be updated with the cadence of the ingress packet requiring redirect. If that rate is high enough, that will prevent the host from generating any other kind of ICMP messages The check for a zero 'rate_tokens' value was likely a shortcut to avoid the more complex backoff algorithm after a redirect silence period. Address the issue checking for 'n_redirects' instead, which is incremented on successful redirect, and does not interfere with other ICMP replies. Fixes: b406472b5ad7 ("net: ipv4: avoid mixed n_redirects and rate_tokens usage") Reported-and-tested-by: Colin Walters Signed-off-by: Paolo Abeni Signed-off-by: Jakub Kicinski --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 788c69d9bfe0..fa829f31a3f5 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -915,7 +915,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) /* Check for load limit; set rate_last to the latest sent * redirect. */ - if (peer->rate_tokens == 0 || + if (peer->n_redirects == 0 || time_after(jiffies, (peer->rate_last + (ip_rt_redirect_load << peer->n_redirects)))) { From 99352c79af3e5f2e4724abf37fa5a2a3299b1c81 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sat, 9 May 2020 14:04:52 +0200 Subject: [PATCH 21/68] net: freescale: select CONFIG_FIXED_PHY where needed I ran into a randconfig build failure with CONFIG_FIXED_PHY=m and CONFIG_GIANFAR=y: x86_64-linux-ld: drivers/net/ethernet/freescale/gianfar.o:(.rodata+0x418): undefined reference to `fixed_phy_change_carrier' It seems the same thing can happen with dpaa and ucc_geth, so change all three to do an explicit 'select FIXED_PHY'. The fixed-phy driver actually has an alternative stub function that theoretically allows building network drivers when fixed-phy is disabled, but I don't see how that would help here, as the drivers presumably would not work then. Signed-off-by: Arnd Bergmann Acked-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/Kconfig | 2 ++ drivers/net/ethernet/freescale/dpaa/Kconfig | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/net/ethernet/freescale/Kconfig b/drivers/net/ethernet/freescale/Kconfig index 2bd7ace0a953..bfc6bfe94d0a 100644 --- a/drivers/net/ethernet/freescale/Kconfig +++ b/drivers/net/ethernet/freescale/Kconfig @@ -77,6 +77,7 @@ config UCC_GETH depends on QUICC_ENGINE && PPC32 select FSL_PQ_MDIO select PHYLIB + select FIXED_PHY ---help--- This driver supports the Gigabit Ethernet mode of the QUICC Engine, which is available on some Freescale SOCs. @@ -90,6 +91,7 @@ config GIANFAR depends on HAS_DMA select FSL_PQ_MDIO select PHYLIB + select FIXED_PHY select CRC32 ---help--- This driver supports the Gigabit TSEC on the MPC83xx, MPC85xx, diff --git a/drivers/net/ethernet/freescale/dpaa/Kconfig b/drivers/net/ethernet/freescale/dpaa/Kconfig index 3b325733a4f8..0a54c7e0e4ae 100644 --- a/drivers/net/ethernet/freescale/dpaa/Kconfig +++ b/drivers/net/ethernet/freescale/dpaa/Kconfig @@ -3,6 +3,7 @@ menuconfig FSL_DPAA_ETH tristate "DPAA Ethernet" depends on FSL_DPAA && FSL_FMAN select PHYLIB + select FIXED_PHY select FSL_FMAN_MAC ---help--- Data Path Acceleration Architecture Ethernet driver, From 090e28b229af92dc5b40786ca673999d59e73056 Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Sat, 9 May 2020 11:32:10 +0800 Subject: [PATCH 22/68] netprio_cgroup: Fix unlimited memory leak of v2 cgroups If systemd is configured to use hybrid mode which enables the use of both cgroup v1 and v2, systemd will create new cgroup on both the default root (v2) and netprio_cgroup hierarchy (v1) for a new session and attach task to the two cgroups. If the task does some network thing then the v2 cgroup can never be freed after the session exited. One of our machines ran into OOM due to this memory leak. In the scenario described above when sk_alloc() is called cgroup_sk_alloc() thought it's in v2 mode, so it stores the cgroup pointer in sk->sk_cgrp_data and increments the cgroup refcnt, but then sock_update_netprioidx() thought it's in v1 mode, so it stores netprioidx value in sk->sk_cgrp_data, so the cgroup refcnt will never be freed. Currently we do the mode switch when someone writes to the ifpriomap cgroup control file. The easiest fix is to also do the switch when a task is attached to a new cgroup. Fixes: bd1060a1d671 ("sock, cgroup: add sock->sk_cgroup") Reported-by: Yang Yingliang Tested-by: Yang Yingliang Signed-off-by: Zefan Li Acked-by: Tejun Heo Signed-off-by: Jakub Kicinski --- net/core/netprio_cgroup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 8881dd943dd0..9bd4cab7d510 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -236,6 +236,8 @@ static void net_prio_attach(struct cgroup_taskset *tset) struct task_struct *p; struct cgroup_subsys_state *css; + cgroup_sk_alloc_disable(); + cgroup_taskset_for_each(p, css, tset) { void *v = (void *)(unsigned long)css->id; From 3047211ca11bf77b3ecbce045c0aa544d934b945 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Sat, 9 May 2020 16:45:44 -0700 Subject: [PATCH 23/68] net: dsa: loop: Add module soft dependency There is a soft dependency against dsa_loop_bdinfo.ko which sets up the MDIO device registration, since there are no symbols referenced by dsa_loop.ko, there is no automatic loading of dsa_loop_bdinfo.ko which is needed. Fixes: 98cd1552ea27 ("net: dsa: Mock-up driver") Signed-off-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/dsa/dsa_loop.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/dsa_loop.c b/drivers/net/dsa/dsa_loop.c index fdcb70b9f0e4..400207c5c7de 100644 --- a/drivers/net/dsa/dsa_loop.c +++ b/drivers/net/dsa/dsa_loop.c @@ -360,6 +360,7 @@ static void __exit dsa_loop_exit(void) } module_exit(dsa_loop_exit); +MODULE_SOFTDEP("pre: dsa_loop_bdinfo"); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Florian Fainelli"); MODULE_DESCRIPTION("DSA loopback driver"); From 2c407aca64977ede9b9f35158e919773cae2082f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 30 Apr 2020 23:30:48 +0200 Subject: [PATCH 24/68] netfilter: conntrack: avoid gcc-10 zero-length-bounds warning gcc-10 warns around a suspicious access to an empty struct member: net/netfilter/nf_conntrack_core.c: In function '__nf_conntrack_alloc': net/netfilter/nf_conntrack_core.c:1522:9: warning: array subscript 0 is outside the bounds of an interior zero-length array 'u8[0]' {aka 'unsigned char[0]'} [-Wzero-length-bounds] 1522 | memset(&ct->__nfct_init_offset[0], 0, | ^~~~~~~~~~~~~~~~~~~~~~~~~~ In file included from net/netfilter/nf_conntrack_core.c:37: include/net/netfilter/nf_conntrack.h:90:5: note: while referencing '__nfct_init_offset' 90 | u8 __nfct_init_offset[0]; | ^~~~~~~~~~~~~~~~~~ The code is correct but a bit unusual. Rework it slightly in a way that does not trigger the warning, using an empty struct instead of an empty array. There are probably more elegant ways to do this, but this is the smallest change. Fixes: c41884ce0562 ("netfilter: conntrack: avoid zeroing timer") Signed-off-by: Arnd Bergmann Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack.h | 2 +- net/netfilter/nf_conntrack_core.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 9f551f3b69c6..90690e37a56f 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -87,7 +87,7 @@ struct nf_conn { struct hlist_node nat_bysource; #endif /* all members below initialized via memset */ - u8 __nfct_init_offset[0]; + struct { } __nfct_init_offset; /* If we were expected by an expectation, this will be it */ struct nf_conn *master; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index c4582eb71766..0173398f4ced 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1519,9 +1519,9 @@ __nf_conntrack_alloc(struct net *net, ct->status = 0; ct->timeout = 0; write_pnet(&ct->ct_net, net); - memset(&ct->__nfct_init_offset[0], 0, + memset(&ct->__nfct_init_offset, 0, offsetof(struct nf_conn, proto) - - offsetof(struct nf_conn, __nfct_init_offset[0])); + offsetof(struct nf_conn, __nfct_init_offset)); nf_ct_zone_add(ct, zone); From e8a1b0efd632d1c9db7d4e93da66377c7b524862 Mon Sep 17 00:00:00 2001 From: Luo bin Date: Sun, 10 May 2020 19:01:08 +0000 Subject: [PATCH 25/68] hinic: fix a bug of ndo_stop if some function in ndo_stop interface returns failure because of hardware fault, must go on excuting rest steps rather than return failure directly, otherwise will cause memory leak.And bump the timeout for SET_FUNC_STATE to ensure that cmd won't return failure when hw is busy. Otherwise hw may stomp host memory if we free memory regardless of the return value of SET_FUNC_STATE. Fixes: 51ba902a16e6 ("net-next/hinic: Initialize hw interface") Signed-off-by: Luo bin Signed-off-by: Jakub Kicinski --- .../net/ethernet/huawei/hinic/hinic_hw_mgmt.c | 16 ++++++++++++---- drivers/net/ethernet/huawei/hinic/hinic_main.c | 16 ++-------------- 2 files changed, 14 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c b/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c index 8995e32dd1c0..992908e6eebf 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_hw_mgmt.c @@ -45,6 +45,8 @@ #define MGMT_MSG_TIMEOUT 5000 +#define SET_FUNC_PORT_MGMT_TIMEOUT 25000 + #define mgmt_to_pfhwdev(pf_mgmt) \ container_of(pf_mgmt, struct hinic_pfhwdev, pf_to_mgmt) @@ -238,12 +240,13 @@ static int msg_to_mgmt_sync(struct hinic_pf_to_mgmt *pf_to_mgmt, u8 *buf_in, u16 in_size, u8 *buf_out, u16 *out_size, enum mgmt_direction_type direction, - u16 resp_msg_id) + u16 resp_msg_id, u32 timeout) { struct hinic_hwif *hwif = pf_to_mgmt->hwif; struct pci_dev *pdev = hwif->pdev; struct hinic_recv_msg *recv_msg; struct completion *recv_done; + unsigned long timeo; u16 msg_id; int err; @@ -267,8 +270,9 @@ static int msg_to_mgmt_sync(struct hinic_pf_to_mgmt *pf_to_mgmt, goto unlock_sync_msg; } - if (!wait_for_completion_timeout(recv_done, - msecs_to_jiffies(MGMT_MSG_TIMEOUT))) { + timeo = msecs_to_jiffies(timeout ? timeout : MGMT_MSG_TIMEOUT); + + if (!wait_for_completion_timeout(recv_done, timeo)) { dev_err(&pdev->dev, "MGMT timeout, MSG id = %d\n", msg_id); err = -ETIMEDOUT; goto unlock_sync_msg; @@ -342,6 +346,7 @@ int hinic_msg_to_mgmt(struct hinic_pf_to_mgmt *pf_to_mgmt, { struct hinic_hwif *hwif = pf_to_mgmt->hwif; struct pci_dev *pdev = hwif->pdev; + u32 timeout = 0; if (sync != HINIC_MGMT_MSG_SYNC) { dev_err(&pdev->dev, "Invalid MGMT msg type\n"); @@ -353,9 +358,12 @@ int hinic_msg_to_mgmt(struct hinic_pf_to_mgmt *pf_to_mgmt, return -EINVAL; } + if (cmd == HINIC_PORT_CMD_SET_FUNC_STATE) + timeout = SET_FUNC_PORT_MGMT_TIMEOUT; + return msg_to_mgmt_sync(pf_to_mgmt, mod, cmd, buf_in, in_size, buf_out, out_size, MGMT_DIRECT_SEND, - MSG_NOT_RESP); + MSG_NOT_RESP, timeout); } /** diff --git a/drivers/net/ethernet/huawei/hinic/hinic_main.c b/drivers/net/ethernet/huawei/hinic/hinic_main.c index 13560975c103..63b92f6cc856 100644 --- a/drivers/net/ethernet/huawei/hinic/hinic_main.c +++ b/drivers/net/ethernet/huawei/hinic/hinic_main.c @@ -483,7 +483,6 @@ static int hinic_close(struct net_device *netdev) { struct hinic_dev *nic_dev = netdev_priv(netdev); unsigned int flags; - int err; down(&nic_dev->mgmt_lock); @@ -497,20 +496,9 @@ static int hinic_close(struct net_device *netdev) up(&nic_dev->mgmt_lock); - err = hinic_port_set_func_state(nic_dev, HINIC_FUNC_PORT_DISABLE); - if (err) { - netif_err(nic_dev, drv, netdev, - "Failed to set func port state\n"); - nic_dev->flags |= (flags & HINIC_INTF_UP); - return err; - } + hinic_port_set_state(nic_dev, HINIC_PORT_DISABLE); - err = hinic_port_set_state(nic_dev, HINIC_PORT_DISABLE); - if (err) { - netif_err(nic_dev, drv, netdev, "Failed to set port state\n"); - nic_dev->flags |= (flags & HINIC_INTF_UP); - return err; - } + hinic_port_set_func_state(nic_dev, HINIC_FUNC_PORT_DISABLE); if (nic_dev->flags & HINIC_RSS_ENABLE) { hinic_rss_deinit(nic_dev); From 2c8897953f3b2ff5498f3f275708a742bfcdbc24 Mon Sep 17 00:00:00 2001 From: Paul Blakey Date: Wed, 6 May 2020 14:24:39 +0300 Subject: [PATCH 26/68] netfilter: flowtable: Add pending bit for offload work Gc step can queue offloaded flow del work or stats work. Those work items can race each other and a flow could be freed before the stats work is executed and querying it. To avoid that, add a pending bit that if a work exists for a flow don't queue another work for it. This will also avoid adding multiple stats works in case stats work didn't complete but gc step started again. Signed-off-by: Paul Blakey Reviewed-by: Roi Dayan Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_flow_table.h | 1 + net/netfilter/nf_flow_table_offload.c | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_flow_table.h b/include/net/netfilter/nf_flow_table.h index 6bf69652f57d..c54a7f707e50 100644 --- a/include/net/netfilter/nf_flow_table.h +++ b/include/net/netfilter/nf_flow_table.h @@ -127,6 +127,7 @@ enum nf_flow_flags { NF_FLOW_HW_DYING, NF_FLOW_HW_DEAD, NF_FLOW_HW_REFRESH, + NF_FLOW_HW_PENDING, }; enum flow_offload_type { diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index e3b099c14eff..3d4ca62c81f9 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -817,6 +817,7 @@ static void flow_offload_work_handler(struct work_struct *work) WARN_ON_ONCE(1); } + clear_bit(NF_FLOW_HW_PENDING, &offload->flow->flags); kfree(offload); } @@ -831,10 +832,15 @@ nf_flow_offload_work_alloc(struct nf_flowtable *flowtable, { struct flow_offload_work *offload; - offload = kmalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); - if (!offload) + if (test_and_set_bit(NF_FLOW_HW_PENDING, &flow->flags)) return NULL; + offload = kmalloc(sizeof(struct flow_offload_work), GFP_ATOMIC); + if (!offload) { + clear_bit(NF_FLOW_HW_PENDING, &flow->flags); + return NULL; + } + offload->cmd = cmd; offload->flow = flow; offload->priority = flowtable->priority; From 1d10da0eb09484ae087836da28258316ef4a02be Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Sun, 10 May 2020 13:55:43 +0300 Subject: [PATCH 27/68] netfilter: flowtable: Remove WQ_MEM_RECLAIM from workqueue This workqueue is in charge of handling offloaded flow tasks like add/del/stats we should not use WQ_MEM_RECLAIM flag. The flag can result in the following warning. [ 485.557189] ------------[ cut here ]------------ [ 485.562976] workqueue: WQ_MEM_RECLAIM nf_flow_table_offload:flow_offload_worr [ 485.562985] WARNING: CPU: 7 PID: 3731 at kernel/workqueue.c:2610 check_flush0 [ 485.590191] Kernel panic - not syncing: panic_on_warn set ... [ 485.597100] CPU: 7 PID: 3731 Comm: kworker/u112:8 Not tainted 5.7.0-rc1.21802 [ 485.606629] Hardware name: Dell Inc. PowerEdge R730/072T6D, BIOS 2.4.3 01/177 [ 485.615487] Workqueue: nf_flow_table_offload flow_offload_work_handler [nf_f] [ 485.624834] Call Trace: [ 485.628077] dump_stack+0x50/0x70 [ 485.632280] panic+0xfb/0x2d7 [ 485.636083] ? check_flush_dependency+0x110/0x130 [ 485.641830] __warn.cold.12+0x20/0x2a [ 485.646405] ? check_flush_dependency+0x110/0x130 [ 485.652154] ? check_flush_dependency+0x110/0x130 [ 485.657900] report_bug+0xb8/0x100 [ 485.662187] ? sched_clock_cpu+0xc/0xb0 [ 485.666974] do_error_trap+0x9f/0xc0 [ 485.671464] do_invalid_op+0x36/0x40 [ 485.675950] ? check_flush_dependency+0x110/0x130 [ 485.681699] invalid_op+0x28/0x30 Fixes: 7da182a998d6 ("netfilter: flowtable: Use work entry per offload command") Reported-by: Marcelo Ricardo Leitner Signed-off-by: Roi Dayan Reviewed-by: Paul Blakey Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 3d4ca62c81f9..2276a73ccba2 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -1062,7 +1062,7 @@ static struct flow_indr_block_entry block_ing_entry = { int nf_flow_table_offload_init(void) { nf_flow_offload_wq = alloc_workqueue("nf_flow_table_offload", - WQ_UNBOUND | WQ_MEM_RECLAIM, 0); + WQ_UNBOUND, 0); if (!nf_flow_offload_wq) return -ENOMEM; From 54ab49fde95605a1077f759ce454d94e84b5ca45 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 10 May 2020 14:28:07 +0200 Subject: [PATCH 28/68] netfilter: conntrack: fix infinite loop on rmmod 'rmmod nf_conntrack' can hang forever, because the netns exit gets stuck in nf_conntrack_cleanup_net_list(): i_see_dead_people: busy = 0; list_for_each_entry(net, net_exit_list, exit_list) { nf_ct_iterate_cleanup(kill_all, net, 0, 0); if (atomic_read(&net->ct.count) != 0) busy = 1; } if (busy) { schedule(); goto i_see_dead_people; } When nf_ct_iterate_cleanup iterates the conntrack table, all nf_conn structures can be found twice: once for the original tuple and once for the conntracks reply tuple. get_next_corpse() only calls the iterator when the entry is in original direction -- the idea was to avoid unneeded invocations of the iterator callback. When support for clashing entries was added, the assumption that all nf_conn objects are added twice, once in original, once for reply tuple no longer holds -- NF_CLASH_BIT entries are only added in the non-clashing reply direction. Thus, if at least one NF_CLASH entry is in the list then nf_conntrack_cleanup_net_list() always skips it completely. During normal netns destruction, this causes a hang of several seconds, until the gc worker removes the entry (NF_CLASH entries always have a 1 second timeout). But in the rmmod case, the gc worker has already been stopped, so ct.count never becomes 0. We can fix this in two ways: 1. Add a second test for CLASH_BIT and call iterator for those entries as well, or: 2. Skip the original tuple direction and use the reply tuple. 2) is simpler, so do that. Fixes: 6a757c07e51f80ac ("netfilter: conntrack: allow insertion of clashing entries") Reported-by: Chen Yi Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 0173398f4ced..1d57b95d3481 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -2139,8 +2139,19 @@ get_next_corpse(int (*iter)(struct nf_conn *i, void *data), nf_conntrack_lock(lockp); if (*bucket < nf_conntrack_htable_size) { hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[*bucket], hnnode) { - if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL) + if (NF_CT_DIRECTION(h) != IP_CT_DIR_REPLY) continue; + /* All nf_conn objects are added to hash table twice, one + * for original direction tuple, once for the reply tuple. + * + * Exception: In the IPS_NAT_CLASH case, only the reply + * tuple is added (the original tuple already existed for + * a different object). + * + * We only need to call the iterator once for each + * conntrack, so we just use the 'reply' direction + * tuple while iterating. + */ ct = nf_ct_tuplehash_to_ctrack(h); if (iter(ct, data)) goto found; From c781e1d4f3e23d5fc138dad2fd98ca0e0dd9c592 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 7 May 2020 14:14:03 -0500 Subject: [PATCH 29/68] net: ipa: set DMA length in gsi_trans_cmd_add() When a command gets added to a transaction for the AP->command channel we set the DMA address of its scatterlist entry, but not its DMA length. Fix this bug. Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/gsi_trans.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ipa/gsi_trans.c b/drivers/net/ipa/gsi_trans.c index 2fd21d75367d..bdbfeed359db 100644 --- a/drivers/net/ipa/gsi_trans.c +++ b/drivers/net/ipa/gsi_trans.c @@ -399,13 +399,14 @@ void gsi_trans_cmd_add(struct gsi_trans *trans, void *buf, u32 size, /* assert(which < trans->tre_count); */ /* Set the page information for the buffer. We also need to fill in - * the DMA address for the buffer (something dma_map_sg() normally - * does). + * the DMA address and length for the buffer (something dma_map_sg() + * normally does). */ sg = &trans->sgl[which]; sg_set_buf(sg, buf, size); sg_dma_address(sg) = addr; + sg_dma_len(sg) = sg->length; info = &trans->info[which]; info->opcode = opcode; From 2c4bb8093c3bfebc8113c683a5de83c64d43514c Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Thu, 7 May 2020 14:14:04 -0500 Subject: [PATCH 30/68] net: ipa: use tag process on modem crash One part of recovering from a modem crash is performing a "tag sequence" of several IPA immediate commands, to clear the hardware pipeline. The sequence ends with a data transfer request on the command endpoint (which is not otherwise done). Unfortunately, attempting to do the data transfer led to a hang, so that request plus two other commands were commented out. The previous commit fixes the bug that was causing that hang. And with that bug fixed we can properly issue the tag sequence when the modem crashes, to return the hardware to a known state. Signed-off-by: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/ipa_cmd.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/net/ipa/ipa_cmd.c b/drivers/net/ipa/ipa_cmd.c index d226b858742d..cee417181f98 100644 --- a/drivers/net/ipa/ipa_cmd.c +++ b/drivers/net/ipa/ipa_cmd.c @@ -628,23 +628,15 @@ static void ipa_cmd_transfer_add(struct gsi_trans *trans, u16 size) void ipa_cmd_tag_process_add(struct gsi_trans *trans) { - ipa_cmd_register_write_add(trans, 0, 0, 0, true); -#if 1 - /* Reference these functions to avoid a compile error */ - (void)ipa_cmd_ip_packet_init_add; - (void)ipa_cmd_ip_tag_status_add; - (void) ipa_cmd_transfer_add; -#else struct ipa *ipa = container_of(trans->gsi, struct ipa, gsi); - struct gsi_endpoint *endpoint; + struct ipa_endpoint *endpoint; endpoint = ipa->name_map[IPA_ENDPOINT_AP_LAN_RX]; + + ipa_cmd_register_write_add(trans, 0, 0, 0, true); ipa_cmd_ip_packet_init_add(trans, endpoint->endpoint_id); - ipa_cmd_ip_tag_status_add(trans, 0xcba987654321); - ipa_cmd_transfer_add(trans, 4); -#endif } /* Returns the number of commands required for the tag process */ From 9ed81c8e0deb7bd2aa0d69371e4a0f9a7b31205d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 11 May 2020 11:54:31 +0200 Subject: [PATCH 31/68] netfilter: flowtable: set NF_FLOW_TEARDOWN flag on entry expiration If the flow timer expires, the gc sets on the NF_FLOW_TEARDOWN flag. Otherwise, the flowtable software path might race to refresh the timeout, leaving the state machine in inconsistent state. Fixes: c29f74e0df7a ("netfilter: nf_flow_table: hardware offload support") Reported-by: Paul Blakey Reviewed-by: Roi Dayan Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_core.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 4344e572b7f9..42da6e337276 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -284,7 +284,7 @@ static void flow_offload_del(struct nf_flowtable *flow_table, if (nf_flow_has_expired(flow)) flow_offload_fixup_ct(flow->ct); - else if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) + else flow_offload_fixup_ct_timeout(flow->ct); flow_offload_free(flow); @@ -361,8 +361,10 @@ static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) { struct nf_flowtable *flow_table = data; - if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct) || - test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { + if (nf_flow_has_expired(flow) || nf_ct_is_dying(flow->ct)) + set_bit(NF_FLOW_TEARDOWN, &flow->flags); + + if (test_bit(NF_FLOW_TEARDOWN, &flow->flags)) { if (test_bit(NF_FLOW_HW, &flow->flags)) { if (!test_bit(NF_FLOW_HW_DYING, &flow->flags)) nf_flow_offload_del(flow_table, flow); From 340eaff651160234bdbce07ef34b92a8e45cd540 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Mon, 11 May 2020 15:31:41 +0200 Subject: [PATCH 32/68] netfilter: nft_set_rbtree: Add missing expired checks Expired intervals would still match and be dumped to user space until garbage collection wiped them out. Make sure they stop matching and disappear (from users' perspective) as soon as they expire. Fixes: 8d8540c4f5e03 ("netfilter: nft_set_rbtree: add timeout support") Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_set_rbtree.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 3ffef454d469..62f416bc0579 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -79,6 +79,10 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set parent = rcu_dereference_raw(parent->rb_left); continue; } + + if (nft_set_elem_expired(&rbe->ext)) + return false; + if (nft_rbtree_interval_end(rbe)) { if (nft_set_is_anonymous(set)) return false; @@ -94,6 +98,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && + !nft_set_elem_expired(&interval->ext) && nft_rbtree_interval_start(interval)) { *ext = &interval->ext; return true; @@ -154,6 +159,9 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, continue; } + if (nft_set_elem_expired(&rbe->ext)) + return false; + if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) || (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) == (flags & NFT_SET_ELEM_INTERVAL_END)) { @@ -170,6 +178,7 @@ static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && + !nft_set_elem_expired(&interval->ext) && ((!nft_rbtree_interval_end(interval) && !(flags & NFT_SET_ELEM_INTERVAL_END)) || (nft_rbtree_interval_end(interval) && @@ -418,6 +427,8 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, if (iter->count < iter->skip) goto cont; + if (nft_set_elem_expired(&rbe->ext)) + goto cont; if (!nft_set_elem_active(&rbe->ext, iter->genmask)) goto cont; From 64d950ae0b01eae96eb668b789c6d145c38ac41c Mon Sep 17 00:00:00 2001 From: Christoph Paasch Date: Mon, 11 May 2020 09:24:42 -0700 Subject: [PATCH 33/68] mptcp: Initialize map_seq upon subflow establishment When the other MPTCP-peer uses 32-bit data-sequence numbers, we rely on map_seq to indicate how to expand to a 64-bit data-sequence number in expand_seq() when receiving data. For new subflows, this field is not initialized, thus results in an "invalid" mapping being discarded. Fix this by initializing map_seq upon subflow establishment time. Fixes: f296234c98a8 ("mptcp: Add handling of incoming MP_JOIN requests") Signed-off-by: Christoph Paasch Reviewed-by: Paolo Abeni Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e1f23016ed3f..32ea8d35489a 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -1629,6 +1629,8 @@ bool mptcp_finish_join(struct sock *sk) ret = mptcp_pm_allow_new_subflow(msk); if (ret) { + subflow->map_seq = msk->ack_seq; + /* active connections are already on conn_list */ spin_lock_bh(&msk->join_list_lock); if (!WARN_ON_ONCE(!list_empty(&subflow->node))) From 2c864c78c2386ada7433268cdfa8cb77cfe31bf3 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 11 May 2020 14:02:15 -0700 Subject: [PATCH 34/68] ptp: fix struct member comment for do_aux_work The do_aux_work callback had documentation in the structure comment which referred to it as "do_work". Signed-off-by: Jacob Keller Cc: Richard Cochran Acked-by: Richard Cochran Signed-off-by: David S. Miller --- include/linux/ptp_clock_kernel.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/ptp_clock_kernel.h b/include/linux/ptp_clock_kernel.h index 121a7eda4593..c602670bbffb 100644 --- a/include/linux/ptp_clock_kernel.h +++ b/include/linux/ptp_clock_kernel.h @@ -105,10 +105,10 @@ struct ptp_system_timestamp { * parameter func: the desired function to use. * parameter chan: the function channel index to use. * - * @do_work: Request driver to perform auxiliary (periodic) operations - * Driver should return delay of the next auxiliary work scheduling - * time (>=0) or negative value in case further scheduling - * is not required. + * @do_aux_work: Request driver to perform auxiliary (periodic) operations + * Driver should return delay of the next auxiliary work + * scheduling time (>=0) or negative value in case further + * scheduling is not required. * * Drivers should embed their ptp_clock_info within a private * structure, obtaining a reference to it using container_of(). From f20a4d404122b183b7d66cadea917fcd1340c05b Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 11 May 2020 14:04:44 -0700 Subject: [PATCH 35/68] ionic: leave netdev mac alone after fw-upgrade When running in a bond setup, or some other potential configurations, the netdev mac may have been changed from the default device mac. Since the userland doesn't know about the changes going on under the covers in a fw-upgrade it doesn't know the re-push the mac filter. The driver needs to leave the netdev mac filter alone when rebuilding after the fw-upgrade. Fixes: c672412f6172 ("ionic: remove lifs on fw reset") Signed-off-by: Shannon Nelson Signed-off-by: David S. Miller --- .../net/ethernet/pensando/ionic/ionic_lif.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index d5293bfded29..f8c626444da0 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -2348,7 +2348,17 @@ static int ionic_station_set(struct ionic_lif *lif) if (is_zero_ether_addr(ctx.comp.lif_getattr.mac)) return 0; - if (!ether_addr_equal(ctx.comp.lif_getattr.mac, netdev->dev_addr)) { + if (!is_zero_ether_addr(netdev->dev_addr)) { + /* If the netdev mac is non-zero and doesn't match the default + * device address, it was set by something earlier and we're + * likely here again after a fw-upgrade reset. We need to be + * sure the netdev mac is in our filter list. + */ + if (!ether_addr_equal(ctx.comp.lif_getattr.mac, + netdev->dev_addr)) + ionic_lif_addr(lif, netdev->dev_addr, true); + } else { + /* Update the netdev mac with the device's mac */ memcpy(addr.sa_data, ctx.comp.lif_getattr.mac, netdev->addr_len); addr.sa_family = AF_INET; err = eth_prepare_mac_addr_change(netdev, &addr); @@ -2358,12 +2368,6 @@ static int ionic_station_set(struct ionic_lif *lif) return 0; } - if (!is_zero_ether_addr(netdev->dev_addr)) { - netdev_dbg(lif->netdev, "deleting station MAC addr %pM\n", - netdev->dev_addr); - ionic_lif_addr(lif, netdev->dev_addr, false); - } - eth_commit_mac_addr_change(netdev, &addr); } From ddc5911b9bd228dd69de976a4545292420f45e73 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 11 May 2020 14:04:45 -0700 Subject: [PATCH 36/68] ionic: call ionic_port_init after fw-upgrade Since the fw has been re-inited, we need to refresh the port information dma address so we can see fresh port information. Let's call ionic_port_init again, and tweak it to allow for a call to simply refresh the existing dma address. Fixes: c672412f6172 ("ionic: remove lifs on fw reset") Signed-off-by: Shannon Nelson Signed-off-by: David S. Miller --- .../net/ethernet/pensando/ionic/ionic_lif.c | 1 + .../net/ethernet/pensando/ionic/ionic_main.c | 18 +++++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index f8c626444da0..f8a9c1bcffc9 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -2118,6 +2118,7 @@ static void ionic_lif_handle_fw_up(struct ionic_lif *lif) dev_info(ionic->dev, "FW Up: restarting LIFs\n"); ionic_init_devinfo(ionic); + ionic_port_init(ionic); err = ionic_qcqs_alloc(lif); if (err) goto err_out; diff --git a/drivers/net/ethernet/pensando/ionic/ionic_main.c b/drivers/net/ethernet/pensando/ionic/ionic_main.c index 588c62e9add7..3344bc1f7671 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_main.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_main.c @@ -509,16 +509,16 @@ int ionic_port_init(struct ionic *ionic) size_t sz; int err; - if (idev->port_info) - return 0; - - idev->port_info_sz = ALIGN(sizeof(*idev->port_info), PAGE_SIZE); - idev->port_info = dma_alloc_coherent(ionic->dev, idev->port_info_sz, - &idev->port_info_pa, - GFP_KERNEL); if (!idev->port_info) { - dev_err(ionic->dev, "Failed to allocate port info, aborting\n"); - return -ENOMEM; + idev->port_info_sz = ALIGN(sizeof(*idev->port_info), PAGE_SIZE); + idev->port_info = dma_alloc_coherent(ionic->dev, + idev->port_info_sz, + &idev->port_info_pa, + GFP_KERNEL); + if (!idev->port_info) { + dev_err(ionic->dev, "Failed to allocate port info\n"); + return -ENOMEM; + } } sz = min(sizeof(ident->port.config), sizeof(idev->dev_cmd_regs->data)); From 92db978f0d686468e527d49268e7c7e8d97d334b Mon Sep 17 00:00:00 2001 From: Clay McClure Date: Tue, 12 May 2020 13:02:30 +0300 Subject: [PATCH 37/68] net: ethernet: ti: Remove TI_CPTS_MOD workaround My recent commit b6d49cab44b5 ("net: Make PTP-specific drivers depend on PTP_1588_CLOCK") exposes a missing dependency in defconfigs that select TI_CPTS without selecting PTP_1588_CLOCK, leading to linker errors of the form: drivers/net/ethernet/ti/cpsw.o: in function `cpsw_ndo_stop': cpsw.c:(.text+0x680): undefined reference to `cpts_unregister' ... That's because TI_CPTS_MOD (which is the symbol gating the _compilation_ of cpts.c) now depends on PTP_1588_CLOCK, and so is not enabled in these configurations, but TI_CPTS (which is the symbol gating _calls_ to the cpts functions) _is_ enabled. So we end up compiling calls to functions that don't exist, resulting in the linker errors. This patch fixes build errors and restores previous behavior by: - ensure PTP_1588_CLOCK=y in TI specific configs and CPTS will be built - remove TI_CPTS_MOD and, instead, add dependencies from CPTS in TI_CPSW/TI_KEYSTONE_NETCP/TI_CPSW_SWITCHDEV as below: config TI_CPSW_SWITCHDEV ... depends on TI_CPTS || !TI_CPTS which will ensure proper dependencies PTP_1588_CLOCK -> TI_CPTS -> TI_CPSW/TI_KEYSTONE_NETCP/TI_CPSW_SWITCHDEV and build type selection. Note. For NFS boot + CPTS all of above configs have to be built-in. Cc: Arnd Bergmann Cc: Dan Murphy Cc: Tony Lindgren Fixes: b6d49cab44b5 ("net: Make PTP-specific drivers depend on PTP_1588_CLOCK") Reported-by: kbuild test robot Signed-off-by: Clay McClure [grygorii.strashko@ti.com: rewording, add deps cpsw/netcp from cpts, drop IS_REACHABLE] Signed-off-by: Grygorii Strashko Reviewed-by: Arnd Bergmann Tested-by: Tony Lindgren Signed-off-by: David S. Miller --- arch/arm/configs/keystone_defconfig | 1 + arch/arm/configs/omap2plus_defconfig | 1 + drivers/net/ethernet/ti/Kconfig | 16 ++++++---------- drivers/net/ethernet/ti/Makefile | 2 +- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/arm/configs/keystone_defconfig b/arch/arm/configs/keystone_defconfig index 11e2211f9007..84a3b055f253 100644 --- a/arch/arm/configs/keystone_defconfig +++ b/arch/arm/configs/keystone_defconfig @@ -147,6 +147,7 @@ CONFIG_I2C_DAVINCI=y CONFIG_SPI=y CONFIG_SPI_DAVINCI=y CONFIG_SPI_SPIDEV=y +CONFIG_PTP_1588_CLOCK=y CONFIG_PINCTRL_SINGLE=y CONFIG_GPIOLIB=y CONFIG_GPIO_SYSFS=y diff --git a/arch/arm/configs/omap2plus_defconfig b/arch/arm/configs/omap2plus_defconfig index 3cc3ca5fa027..8b83d4a5d309 100644 --- a/arch/arm/configs/omap2plus_defconfig +++ b/arch/arm/configs/omap2plus_defconfig @@ -274,6 +274,7 @@ CONFIG_SPI_TI_QSPI=m CONFIG_HSI=m CONFIG_OMAP_SSI=m CONFIG_SSI_PROTOCOL=m +CONFIG_PTP_1588_CLOCK=y CONFIG_PINCTRL_SINGLE=y CONFIG_DEBUG_GPIO=y CONFIG_GPIO_SYSFS=y diff --git a/drivers/net/ethernet/ti/Kconfig b/drivers/net/ethernet/ti/Kconfig index 8e348780efb6..62f809b67469 100644 --- a/drivers/net/ethernet/ti/Kconfig +++ b/drivers/net/ethernet/ti/Kconfig @@ -49,6 +49,7 @@ config TI_CPSW_PHY_SEL config TI_CPSW tristate "TI CPSW Switch Support" depends on ARCH_DAVINCI || ARCH_OMAP2PLUS || COMPILE_TEST + depends on TI_CPTS || !TI_CPTS select TI_DAVINCI_MDIO select MFD_SYSCON select PAGE_POOL @@ -64,6 +65,7 @@ config TI_CPSW_SWITCHDEV tristate "TI CPSW Switch Support with switchdev" depends on ARCH_DAVINCI || ARCH_OMAP2PLUS || COMPILE_TEST depends on NET_SWITCHDEV + depends on TI_CPTS || !TI_CPTS select PAGE_POOL select TI_DAVINCI_MDIO select MFD_SYSCON @@ -77,23 +79,16 @@ config TI_CPSW_SWITCHDEV will be called cpsw_new. config TI_CPTS - bool "TI Common Platform Time Sync (CPTS) Support" - depends on TI_CPSW || TI_KEYSTONE_NETCP || TI_CPSW_SWITCHDEV || COMPILE_TEST + tristate "TI Common Platform Time Sync (CPTS) Support" + depends on ARCH_OMAP2PLUS || ARCH_KEYSTONE || COMPILE_TEST depends on COMMON_CLK - depends on POSIX_TIMERS + depends on PTP_1588_CLOCK ---help--- This driver supports the Common Platform Time Sync unit of the CPSW Ethernet Switch and Keystone 2 1g/10g Switch Subsystem. The unit can time stamp PTP UDP/IPv4 and Layer 2 packets, and the driver offers a PTP Hardware Clock. -config TI_CPTS_MOD - tristate - depends on TI_CPTS - depends on PTP_1588_CLOCK - default y if TI_CPSW=y || TI_KEYSTONE_NETCP=y || TI_CPSW_SWITCHDEV=y - default m - config TI_K3_AM65_CPSW_NUSS tristate "TI K3 AM654x/J721E CPSW Ethernet driver" depends on ARCH_K3 && OF && TI_K3_UDMA_GLUE_LAYER @@ -114,6 +109,7 @@ config TI_KEYSTONE_NETCP select TI_DAVINCI_MDIO depends on OF depends on KEYSTONE_NAVIGATOR_DMA && KEYSTONE_NAVIGATOR_QMSS + depends on TI_CPTS || !TI_CPTS ---help--- This driver supports TI's Keystone NETCP Core. diff --git a/drivers/net/ethernet/ti/Makefile b/drivers/net/ethernet/ti/Makefile index 53792190e9c2..cb26a9d21869 100644 --- a/drivers/net/ethernet/ti/Makefile +++ b/drivers/net/ethernet/ti/Makefile @@ -13,7 +13,7 @@ obj-$(CONFIG_TI_DAVINCI_EMAC) += ti_davinci_emac.o ti_davinci_emac-y := davinci_emac.o davinci_cpdma.o obj-$(CONFIG_TI_DAVINCI_MDIO) += davinci_mdio.o obj-$(CONFIG_TI_CPSW_PHY_SEL) += cpsw-phy-sel.o -obj-$(CONFIG_TI_CPTS_MOD) += cpts.o +obj-$(CONFIG_TI_CPTS) += cpts.o obj-$(CONFIG_TI_CPSW) += ti_cpsw.o ti_cpsw-y := cpsw.o davinci_cpdma.o cpsw_ale.o cpsw_priv.o cpsw_sl.o cpsw_ethtool.o obj-$(CONFIG_TI_CPSW_SWITCHDEV) += ti_cpsw_new.o From 24adbc1676af4e134e709ddc7f34cf2adc2131e4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 May 2020 06:54:30 -0700 Subject: [PATCH 38/68] tcp: fix SO_RCVLOWAT hangs with fat skbs We autotune rcvbuf whenever SO_RCVLOWAT is set to account for 100% overhead in tcp_set_rcvlowat() This works well when skb->len/skb->truesize ratio is bigger than 0.5 But if we receive packets with small MSS, we can end up in a situation where not enough bytes are available in the receive queue to satisfy RCVLOWAT setting. As our sk_rcvbuf limit is hit, we send zero windows in ACK packets, preventing remote peer from sending more data. Even autotuning does not help, because it only triggers at the time user process drains the queue. If no EPOLLIN is generated, this can not happen. Note poll() has a similar issue, after commit c7004482e8dc ("tcp: Respect SO_RCVLOWAT in tcp_poll().") Fixes: 03f45c883c6f ("tcp: avoid extra wakeups for SO_RCVLOWAT users") Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/net/tcp.h | 13 +++++++++++++ net/ipv4/tcp.c | 14 +++++++++++--- net/ipv4/tcp_input.c | 3 ++- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 64f84683feae..6f8e60c6fbc7 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1420,6 +1420,19 @@ static inline int tcp_full_space(const struct sock *sk) return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } +/* We provision sk_rcvbuf around 200% of sk_rcvlowat. + * If 87.5 % (7/8) of the space has been consumed, we want to override + * SO_RCVLOWAT constraint, since we are receiving skbs with too small + * len/truesize ratio. + */ +static inline bool tcp_rmem_pressure(const struct sock *sk) +{ + int rcvbuf = READ_ONCE(sk->sk_rcvbuf); + int threshold = rcvbuf - (rcvbuf >> 3); + + return atomic_read(&sk->sk_rmem_alloc) > threshold; +} + extern void tcp_openreq_init_rwin(struct request_sock *req, const struct sock *sk_listener, const struct dst_entry *dst); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e72bd651d21a..a385fcaaa03b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -476,9 +476,17 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, int target, struct sock *sk) { - return (READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq) >= target) || - (sk->sk_prot->stream_memory_read ? - sk->sk_prot->stream_memory_read(sk) : false); + int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq); + + if (avail > 0) { + if (avail >= target) + return true; + if (tcp_rmem_pressure(sk)) + return true; + } + if (sk->sk_prot->stream_memory_read) + return sk->sk_prot->stream_memory_read(sk); + return false; } /* diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b996dc1069c5..29c6fc8c7716 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4757,7 +4757,8 @@ void tcp_data_ready(struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); int avail = tp->rcv_nxt - tp->copied_seq; - if (avail < sk->sk_rcvlowat && !sock_flag(sk, SOCK_DONE)) + if (avail < sk->sk_rcvlowat && !tcp_rmem_pressure(sk) && + !sock_flag(sk, SOCK_DONE)) return; sk->sk_data_ready(sk); From eead1c2ea2509fd754c6da893a94f0e69e83ebe4 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 12 May 2020 14:43:14 +0200 Subject: [PATCH 39/68] netlabel: cope with NULL catmap The cipso and calipso code can set the MLS_CAT attribute on successful parsing, even if the corresponding catmap has not been allocated, as per current configuration and external input. Later, selinux code tries to access the catmap if the MLS_CAT flag is present via netlbl_catmap_getlong(). That may cause null ptr dereference while processing incoming network traffic. Address the issue setting the MLS_CAT flag only if the catmap is really allocated. Additionally let netlbl_catmap_getlong() cope with NULL catmap. Reported-by: Matthew Sheets Fixes: 4b8feff251da ("netlabel: fix the horribly broken catmap functions") Fixes: ceba1832b1b2 ("calipso: Set the calipso socket label to match the secattr.") Signed-off-by: Paolo Abeni Acked-by: Paul Moore Signed-off-by: David S. Miller --- net/ipv4/cipso_ipv4.c | 6 ++++-- net/ipv6/calipso.c | 3 ++- net/netlabel/netlabel_kapi.c | 6 ++++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 0bd10a1f477f..a23094b050f8 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1258,7 +1258,8 @@ static int cipso_v4_parsetag_rbm(const struct cipso_v4_doi *doi_def, return ret_val; } - secattr->flags |= NETLBL_SECATTR_MLS_CAT; + if (secattr->attr.mls.cat) + secattr->flags |= NETLBL_SECATTR_MLS_CAT; } return 0; @@ -1439,7 +1440,8 @@ static int cipso_v4_parsetag_rng(const struct cipso_v4_doi *doi_def, return ret_val; } - secattr->flags |= NETLBL_SECATTR_MLS_CAT; + if (secattr->attr.mls.cat) + secattr->flags |= NETLBL_SECATTR_MLS_CAT; } return 0; diff --git a/net/ipv6/calipso.c b/net/ipv6/calipso.c index 221c81f85cbf..8d3f66c310db 100644 --- a/net/ipv6/calipso.c +++ b/net/ipv6/calipso.c @@ -1047,7 +1047,8 @@ static int calipso_opt_getattr(const unsigned char *calipso, goto getattr_return; } - secattr->flags |= NETLBL_SECATTR_MLS_CAT; + if (secattr->attr.mls.cat) + secattr->flags |= NETLBL_SECATTR_MLS_CAT; } secattr->type = NETLBL_NLTYPE_CALIPSO; diff --git a/net/netlabel/netlabel_kapi.c b/net/netlabel/netlabel_kapi.c index 409a3ae47ce2..5e1239cef000 100644 --- a/net/netlabel/netlabel_kapi.c +++ b/net/netlabel/netlabel_kapi.c @@ -734,6 +734,12 @@ int netlbl_catmap_getlong(struct netlbl_lsm_catmap *catmap, if ((off & (BITS_PER_LONG - 1)) != 0) return -EINVAL; + /* a null catmap is equivalent to an empty one */ + if (!catmap) { + *offset = (u32)-1; + return 0; + } + if (off < catmap->startbit) { off = catmap->startbit; *offset = off; From 29b74cb75e3572d83708745e81e24d37837415f9 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Wed, 13 May 2020 09:42:29 +0200 Subject: [PATCH 40/68] s390/ism: fix error return code in ism_probe() Fix to return negative error code -ENOMEM from the smcd_alloc_dev() error handling case instead of 0, as done elsewhere in this function. Fixes: 684b89bc39ce ("s390/ism: add device driver for internal shared memory") Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- drivers/s390/net/ism_drv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/s390/net/ism_drv.c b/drivers/s390/net/ism_drv.c index c75112ee7b97..c7fade836d83 100644 --- a/drivers/s390/net/ism_drv.c +++ b/drivers/s390/net/ism_drv.c @@ -521,8 +521,10 @@ static int ism_probe(struct pci_dev *pdev, const struct pci_device_id *id) ism->smcd = smcd_alloc_dev(&pdev->dev, dev_name(&pdev->dev), &ism_ops, ISM_NR_DMBS); - if (!ism->smcd) + if (!ism->smcd) { + ret = -ENOMEM; goto err_resource; + } ism->smcd->priv = ism; ret = ism_dev_init(ism); From be7fa20f057e05b31fc08d1999394c43cf2c1a9a Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Wed, 13 May 2020 09:42:30 +0200 Subject: [PATCH 41/68] MAINTAINERS: add Karsten Graul as S390 NETWORK DRIVERS maintainer Add Karsten as additional maintainer for drivers/s390/net . One of his focal points is the ism driver. Cc: Julian Wiedmann Acked-by: Julian Wiedmann Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 88bf36ab2b22..85894787825e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14644,6 +14644,7 @@ F: drivers/iommu/s390-iommu.c S390 IUCV NETWORK LAYER M: Julian Wiedmann +M: Karsten Graul M: Ursula Braun L: linux-s390@vger.kernel.org S: Supported From c72685894506a3fec5978b9b11b94069a7d1c995 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Wed, 13 May 2020 19:33:16 +0700 Subject: [PATCH 42/68] tipc: fix large latency in smart Nagle streaming Currently when a connection is in Nagle mode, we set the 'ack_required' bit in the last sending buffer and wait for the corresponding ACK prior to pushing more data. However, on the receiving side, the ACK is issued only when application really reads the whole data. Even if part of the last buffer is received, we will not do the ACK as required. This might cause an unnecessary delay since the receiver does not always fetch the message as fast as the sender, resulting in a large latency in the user message sending, which is: [one RTT + the receiver processing time]. The commit makes Nagle ACK as soon as possible i.e. when a message with the 'ack_required' arrives in the receiving side's stack even before it is processed or put in the socket receive queue... This way, we can limit the streaming latency to one RTT as committed in Nagle mode. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/socket.c | 42 +++++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 87466607097f..e370ad0edd76 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1739,22 +1739,21 @@ static int tipc_sk_anc_data_recv(struct msghdr *m, struct sk_buff *skb, return 0; } -static void tipc_sk_send_ack(struct tipc_sock *tsk) +static struct sk_buff *tipc_sk_build_ack(struct tipc_sock *tsk) { struct sock *sk = &tsk->sk; - struct net *net = sock_net(sk); struct sk_buff *skb = NULL; struct tipc_msg *msg; u32 peer_port = tsk_peer_port(tsk); u32 dnode = tsk_peer_node(tsk); if (!tipc_sk_connected(sk)) - return; + return NULL; skb = tipc_msg_create(CONN_MANAGER, CONN_ACK, INT_H_SIZE, 0, dnode, tsk_own_node(tsk), peer_port, tsk->portid, TIPC_OK); if (!skb) - return; + return NULL; msg = buf_msg(skb); msg_set_conn_ack(msg, tsk->rcv_unacked); tsk->rcv_unacked = 0; @@ -1764,7 +1763,19 @@ static void tipc_sk_send_ack(struct tipc_sock *tsk) tsk->rcv_win = tsk_adv_blocks(tsk->sk.sk_rcvbuf); msg_set_adv_win(msg, tsk->rcv_win); } - tipc_node_xmit_skb(net, skb, dnode, msg_link_selector(msg)); + return skb; +} + +static void tipc_sk_send_ack(struct tipc_sock *tsk) +{ + struct sk_buff *skb; + + skb = tipc_sk_build_ack(tsk); + if (!skb) + return; + + tipc_node_xmit_skb(sock_net(&tsk->sk), skb, tsk_peer_node(tsk), + msg_link_selector(buf_msg(skb))); } static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) @@ -1938,7 +1949,6 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, bool peek = flags & MSG_PEEK; int offset, required, copy, copied = 0; int hlen, dlen, err, rc; - bool ack = false; long timeout; /* Catch invalid receive attempts */ @@ -1983,7 +1993,6 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, /* Copy data if msg ok, otherwise return error/partial data */ if (likely(!err)) { - ack = msg_ack_required(hdr); offset = skb_cb->bytes_read; copy = min_t(int, dlen - offset, buflen - copied); rc = skb_copy_datagram_msg(skb, hlen + offset, m, copy); @@ -2011,7 +2020,7 @@ static int tipc_recvstream(struct socket *sock, struct msghdr *m, /* Send connection flow control advertisement when applicable */ tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen); - if (ack || tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE) + if (tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE) tipc_sk_send_ack(tsk); /* Exit if all requested data or FIN/error received */ @@ -2105,9 +2114,11 @@ static void tipc_sk_proto_rcv(struct sock *sk, * tipc_sk_filter_connect - check incoming message for a connection-based socket * @tsk: TIPC socket * @skb: pointer to message buffer. + * @xmitq: for Nagle ACK if any * Returns true if message should be added to receive queue, false otherwise */ -static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) +static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb, + struct sk_buff_head *xmitq) { struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); @@ -2171,8 +2182,17 @@ static bool tipc_sk_filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) if (!skb_queue_empty(&sk->sk_write_queue)) tipc_sk_push_backlog(tsk); /* Accept only connection-based messages sent by peer */ - if (likely(con_msg && !err && pport == oport && pnode == onode)) + if (likely(con_msg && !err && pport == oport && + pnode == onode)) { + if (msg_ack_required(hdr)) { + struct sk_buff *skb; + + skb = tipc_sk_build_ack(tsk); + if (skb) + __skb_queue_tail(xmitq, skb); + } return true; + } if (!tsk_peer_msg(tsk, hdr)) return false; if (!err) @@ -2267,7 +2287,7 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, while ((skb = __skb_dequeue(&inputq))) { hdr = buf_msg(skb); limit = rcvbuf_limit(sk, skb); - if ((sk_conn && !tipc_sk_filter_connect(tsk, skb)) || + if ((sk_conn && !tipc_sk_filter_connect(tsk, skb, xmitq)) || (!sk_conn && msg_connected(hdr)) || (!grp && msg_in_group(hdr))) err = TIPC_ERR_NO_PORT; From 0771d7df819284d46cf5cfb57698621b503ec17f Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Wed, 13 May 2020 19:33:17 +0700 Subject: [PATCH 43/68] tipc: fix memory leak in service subscripting Upon receipt of a service subscription request from user via a topology connection, one 'sub' object will be allocated in kernel, so it will be able to send an event of the service if any to the user correspondingly then. Also, in case of any failure, the connection will be shutdown and all the pertaining 'sub' objects will be freed. However, there is a race condition as follows resulting in memory leak: receive-work connection send-work | | | sub-1 |<------//-------| | sub-2 |<------//-------| | | |<---------------| evt for sub-x sub-3 |<------//-------| | : : : : : : | /--------| | | | * peer closed | | | | | | | |<-------X-------| evt for sub-y | | |<===============| sub-n |<------/ X shutdown | -> orphan | | That is, the 'receive-work' may get the last subscription request while the 'send-work' is shutting down the connection due to peer close. We had a 'lock' on the connection, so the two actions cannot be carried out simultaneously. If the last subscription is allocated e.g. 'sub-n', before the 'send-work' closes the connection, there will be no issue at all, the 'sub' objects will be freed. In contrast the last subscription will become orphan since the connection was closed, and we released all references. This commit fixes the issue by simply adding one test if the connection remains in 'connected' state right after we obtain the connection lock, then a subscription object can be created as usual, otherwise we ignore it. Acked-by: Ying Xue Acked-by: Jon Maloy Reported-by: Thang Ngo Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/topsrv.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 73dbed0c4b6b..931c426673c0 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -400,7 +400,9 @@ static int tipc_conn_rcv_from_sock(struct tipc_conn *con) return -EWOULDBLOCK; if (ret == sizeof(s)) { read_lock_bh(&sk->sk_callback_lock); - ret = tipc_conn_rcv_sub(srv, con, &s); + /* RACE: the connection can be closed in the meantime */ + if (likely(connected(con))) + ret = tipc_conn_rcv_sub(srv, con, &s); read_unlock_bh(&sk->sk_callback_lock); if (!ret) return 0; From 88690b1079d473a44bf4183dfee9b03d4afae866 Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Wed, 13 May 2020 19:33:18 +0700 Subject: [PATCH 44/68] tipc: fix failed service subscription deletion When a service subscription is expired or canceled by user, it needs to be deleted from the subscription list, so that new subscriptions can be registered (max = 65535 per net). However, there are two issues in code that can cause such an unused subscription to persist: 1) The 'tipc_conn_delete_sub()' has a loop on the subscription list but it makes a break shortly when the 1st subscription differs from the one specified, so the subscription will not be deleted. 2) In case a subscription is canceled, the code to remove the 'TIPC_SUB_CANCEL' flag from the subscription filter does not work if it is a local subscription (i.e. the little endian isn't involved). So, it will be no matches when looking for the subscription to delete later. The subscription(s) will be removed eventually when the user terminates its topology connection but that could be a long time later. Meanwhile, the number of available subscriptions may be exhausted. This commit fixes the two issues above, so as needed a subscription can be deleted correctly. Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/subscr.h | 10 ++++++++++ net/tipc/topsrv.c | 9 +++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index aa015c233898..6ebbec1bedd1 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -96,6 +96,16 @@ void tipc_sub_get(struct tipc_subscription *subscription); (swap_ ? swab32(val__) : val__); \ }) +/* tipc_sub_write - write val_ to field_ of struct sub_ in user endian format + */ +#define tipc_sub_write(sub_, field_, val_) \ + ({ \ + struct tipc_subscr *sub__ = sub_; \ + u32 val__ = val_; \ + int swap_ = !((sub__)->filter & TIPC_FILTER_MASK); \ + (sub__)->field_ = swap_ ? swab32(val__) : val__; \ + }) + /* tipc_evt_write - write val_ to field_ of struct evt_ in user endian format */ #define tipc_evt_write(evt_, field_, val_) \ diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 931c426673c0..446af7bbd13e 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -237,8 +237,8 @@ static void tipc_conn_delete_sub(struct tipc_conn *con, struct tipc_subscr *s) if (!s || !memcmp(s, &sub->evt.s, sizeof(*s))) { tipc_sub_unsubscribe(sub); atomic_dec(&tn->subscription_count); - } else if (s) { - break; + if (s) + break; } } spin_unlock_bh(&con->sub_lock); @@ -362,9 +362,10 @@ static int tipc_conn_rcv_sub(struct tipc_topsrv *srv, { struct tipc_net *tn = tipc_net(srv->net); struct tipc_subscription *sub; + u32 s_filter = tipc_sub_read(s, filter); - if (tipc_sub_read(s, filter) & TIPC_SUB_CANCEL) { - s->filter &= __constant_ntohl(~TIPC_SUB_CANCEL); + if (s_filter & TIPC_SUB_CANCEL) { + tipc_sub_write(s, filter, s_filter & ~TIPC_SUB_CANCEL); tipc_conn_delete_sub(con, s); return 0; } From 99addbe31f5524494f4d7077bcb3f6fa64c5d160 Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 13 May 2020 08:51:51 -0700 Subject: [PATCH 45/68] net: broadcom: Select BROADCOM_PHY for BCMGENET The GENET controller on the Raspberry Pi 4 (2711) is typically interfaced with an external Broadcom PHY via a RGMII electrical interface. To make sure that delays are properly configured at the PHY side, ensure that we the dedicated Broadcom PHY driver (CONFIG_BROADCOM_PHY) is enabled for this to happen. Fixes: 402482a6a78e ("net: bcmgenet: Clear ID_MODE_DIS in EXT_RGMII_OOB_CTRL when not needed") Reported-by: Marek Szyprowski Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig index 53055ce5dfd6..2a69c0d06f3c 100644 --- a/drivers/net/ethernet/broadcom/Kconfig +++ b/drivers/net/ethernet/broadcom/Kconfig @@ -69,6 +69,7 @@ config BCMGENET select BCM7XXX_PHY select MDIO_BCM_UNIMAC select DIMLIB + select BROADCOM_PHY if ARCH_BCM2835 help This driver supports the built-in Ethernet MACs found in the Broadcom BCM7xxx Set Top Box family chipset. From 9de5d235b60a7cdfcdd5461e70c5663e713fde87 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 12 May 2020 21:45:53 +0200 Subject: [PATCH 46/68] net: phy: fix aneg restart in phy_ethtool_set_eee phy_restart_aneg() enables aneg in the PHY. That's not what we want if phydev->autoneg is disabled. In this case still update EEE advertisement register, but don't enable aneg and don't trigger an aneg restart. Fixes: f75abeb8338e ("net: phy: restart phy autonegotiation after EEE advertisment change") Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 72c69a9c8a98..20ca6418f7bc 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -1132,9 +1132,11 @@ int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data) /* Restart autonegotiation so the new modes get sent to the * link partner. */ - ret = phy_restart_aneg(phydev); - if (ret < 0) - return ret; + if (phydev->autoneg == AUTONEG_ENABLE) { + ret = phy_restart_aneg(phydev); + if (ret < 0) + return ret; + } } return 0; From 23ad04669f81f958e9a4121b0266228d2eb3c357 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Mon, 11 May 2020 13:32:34 +0200 Subject: [PATCH 47/68] samples: bpf: Fix build error GCC 10 is very strict about symbol clash, and lwt_len_hist_user contains a symbol which clashes with libbpf: /usr/bin/ld: samples/bpf/lwt_len_hist_user.o:(.bss+0x0): multiple definition of `bpf_log_buf'; samples/bpf/bpf_load.o:(.bss+0x8c0): first defined here collect2: error: ld returned 1 exit status bpf_log_buf here seems to be a leftover, so removing it. Signed-off-by: Matteo Croce Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200511113234.80722-1-mcroce@redhat.com --- samples/bpf/lwt_len_hist_user.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/samples/bpf/lwt_len_hist_user.c b/samples/bpf/lwt_len_hist_user.c index 587b68b1f8dd..430a4b7e353e 100644 --- a/samples/bpf/lwt_len_hist_user.c +++ b/samples/bpf/lwt_len_hist_user.c @@ -15,8 +15,6 @@ #define MAX_INDEX 64 #define MAX_STARS 38 -char bpf_log_buf[BPF_LOG_BUF_SIZE]; - static void stars(char *str, long val, long max, int width) { int i; From 333291ce5055f2039afc907badaf5b66bc1adfdc Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 12 May 2020 16:59:25 -0700 Subject: [PATCH 48/68] bpf: Fix bug in mmap() implementation for BPF array map mmap() subsystem allows user-space application to memory-map region with initial page offset. This wasn't taken into account in initial implementation of BPF array memory-mapping. This would result in wrong pages, not taking into account requested page shift, being memory-mmaped into user-space. This patch fixes this gap and adds a test for such scenario. Fixes: fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200512235925.3817805-1-andriin@fb.com --- kernel/bpf/arraymap.c | 7 ++++++- tools/testing/selftests/bpf/prog_tests/mmap.c | 8 ++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 95d77770353c..1d6120fd5ba6 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -486,7 +486,12 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) if (!(map->map_flags & BPF_F_MMAPABLE)) return -EINVAL; - return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), pgoff); + if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > + PAGE_ALIGN((u64)array->map.max_entries * array->elem_size)) + return -EINVAL; + + return remap_vmalloc_range(vma, array_map_vmalloc_addr(array), + vma->vm_pgoff + pgoff); } const struct bpf_map_ops array_map_ops = { diff --git a/tools/testing/selftests/bpf/prog_tests/mmap.c b/tools/testing/selftests/bpf/prog_tests/mmap.c index 56d80adcf4bd..6b9dce431d41 100644 --- a/tools/testing/selftests/bpf/prog_tests/mmap.c +++ b/tools/testing/selftests/bpf/prog_tests/mmap.c @@ -217,6 +217,14 @@ void test_mmap(void) munmap(tmp2, 4 * page_size); + /* map all 4 pages, but with pg_off=1 page, should fail */ + tmp1 = mmap(NULL, 4 * page_size, PROT_READ, MAP_SHARED | MAP_FIXED, + data_map_fd, page_size /* initial page shift */); + if (CHECK(tmp1 != MAP_FAILED, "adv_mmap7", "unexpected success")) { + munmap(tmp1, 4 * page_size); + goto cleanup; + } + tmp1 = mmap(NULL, map_sz, PROT_READ, MAP_SHARED, data_map_fd, 0); if (CHECK(tmp1 == MAP_FAILED, "last_mmap", "failed %d\n", errno)) goto cleanup; From 516d8d497c017a6820019c76acbb7912a24ec57b Mon Sep 17 00:00:00 2001 From: Sumanth Korikkar Date: Wed, 13 May 2020 17:44:14 +0200 Subject: [PATCH 49/68] libbpf: Fix register naming in PT_REGS s390 macros Fix register naming in PT_REGS s390 macros Fixes: b8ebce86ffe6 ("libbpf: Provide CO-RE variants of PT_REGS macros") Signed-off-by: Sumanth Korikkar Signed-off-by: Alexei Starovoitov Reviewed-by: Julian Wiedmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200513154414.29972-1-sumanthk@linux.ibm.com --- tools/lib/bpf/bpf_tracing.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index f3f3c3fb98cb..48a9c7c69ef1 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -148,11 +148,11 @@ struct pt_regs; #define PT_REGS_PARM3_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[4]) #define PT_REGS_PARM4_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[5]) #define PT_REGS_PARM5_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[6]) -#define PT_REGS_RET_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), grps[14]) +#define PT_REGS_RET_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[14]) #define PT_REGS_FP_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[11]) #define PT_REGS_RC_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[2]) #define PT_REGS_SP_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), gprs[15]) -#define PT_REGS_IP_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), pdw.addr) +#define PT_REGS_IP_CORE(x) BPF_CORE_READ((PT_REGS_S390 *)(x), psw.addr) #elif defined(bpf_target_arm) From 625236ba3832ae947cb3ebb7acc1f30788b274ef Mon Sep 17 00:00:00 2001 From: Anders Roxell Date: Tue, 12 May 2020 19:46:07 +0200 Subject: [PATCH 50/68] security: Fix the default value of secid_to_secctx hook security_secid_to_secctx is called by the bpf_lsm hook and a successful return value (i.e 0) implies that the parameter will be consumed by the LSM framework. The current behaviour return success when the pointer isn't initialized when CONFIG_BPF_LSM is enabled, with the default return from kernel/bpf/bpf_lsm.c. This is the internal error: [ 1229.341488][ T2659] usercopy: Kernel memory exposure attempt detected from null address (offset 0, size 280)! [ 1229.374977][ T2659] ------------[ cut here ]------------ [ 1229.376813][ T2659] kernel BUG at mm/usercopy.c:99! [ 1229.378398][ T2659] Internal error: Oops - BUG: 0 [#1] PREEMPT SMP [ 1229.380348][ T2659] Modules linked in: [ 1229.381654][ T2659] CPU: 0 PID: 2659 Comm: systemd-journal Tainted: G B W 5.7.0-rc5-next-20200511-00019-g864e0c6319b8-dirty #13 [ 1229.385429][ T2659] Hardware name: linux,dummy-virt (DT) [ 1229.387143][ T2659] pstate: 80400005 (Nzcv daif +PAN -UAO BTYPE=--) [ 1229.389165][ T2659] pc : usercopy_abort+0xc8/0xcc [ 1229.390705][ T2659] lr : usercopy_abort+0xc8/0xcc [ 1229.392225][ T2659] sp : ffff000064247450 [ 1229.393533][ T2659] x29: ffff000064247460 x28: 0000000000000000 [ 1229.395449][ T2659] x27: 0000000000000118 x26: 0000000000000000 [ 1229.397384][ T2659] x25: ffffa000127049e0 x24: ffffa000127049e0 [ 1229.399306][ T2659] x23: ffffa000127048e0 x22: ffffa000127048a0 [ 1229.401241][ T2659] x21: ffffa00012704b80 x20: ffffa000127049e0 [ 1229.403163][ T2659] x19: ffffa00012704820 x18: 0000000000000000 [ 1229.405094][ T2659] x17: 0000000000000000 x16: 0000000000000000 [ 1229.407008][ T2659] x15: 0000000000000000 x14: 003d090000000000 [ 1229.408942][ T2659] x13: ffff80000d5b25b2 x12: 1fffe0000d5b25b1 [ 1229.410859][ T2659] x11: 1fffe0000d5b25b1 x10: ffff80000d5b25b1 [ 1229.412791][ T2659] x9 : ffffa0001034bee0 x8 : ffff00006ad92d8f [ 1229.414707][ T2659] x7 : 0000000000000000 x6 : ffffa00015eacb20 [ 1229.416642][ T2659] x5 : ffff0000693c8040 x4 : 0000000000000000 [ 1229.418558][ T2659] x3 : ffffa0001034befc x2 : d57a7483a01c6300 [ 1229.420610][ T2659] x1 : 0000000000000000 x0 : 0000000000000059 [ 1229.422526][ T2659] Call trace: [ 1229.423631][ T2659] usercopy_abort+0xc8/0xcc [ 1229.425091][ T2659] __check_object_size+0xdc/0x7d4 [ 1229.426729][ T2659] put_cmsg+0xa30/0xa90 [ 1229.428132][ T2659] unix_dgram_recvmsg+0x80c/0x930 [ 1229.429731][ T2659] sock_recvmsg+0x9c/0xc0 [ 1229.431123][ T2659] ____sys_recvmsg+0x1cc/0x5f8 [ 1229.432663][ T2659] ___sys_recvmsg+0x100/0x160 [ 1229.434151][ T2659] __sys_recvmsg+0x110/0x1a8 [ 1229.435623][ T2659] __arm64_sys_recvmsg+0x58/0x70 [ 1229.437218][ T2659] el0_svc_common.constprop.1+0x29c/0x340 [ 1229.438994][ T2659] do_el0_svc+0xe8/0x108 [ 1229.440587][ T2659] el0_svc+0x74/0x88 [ 1229.441917][ T2659] el0_sync_handler+0xe4/0x8b4 [ 1229.443464][ T2659] el0_sync+0x17c/0x180 [ 1229.444920][ T2659] Code: aa1703e2 aa1603e1 910a8260 97ecc860 (d4210000) [ 1229.447070][ T2659] ---[ end trace 400497d91baeaf51 ]--- [ 1229.448791][ T2659] Kernel panic - not syncing: Fatal exception [ 1229.450692][ T2659] Kernel Offset: disabled [ 1229.452061][ T2659] CPU features: 0x240002,20002004 [ 1229.453647][ T2659] Memory Limit: none [ 1229.455015][ T2659] ---[ end Kernel panic - not syncing: Fatal exception ]--- Rework the so the default return value is -EOPNOTSUPP. There are likely other callbacks such as security_inode_getsecctx() that may have the same problem, and that someone that understand the code better needs to audit them. Thank you Arnd for helping me figure out what went wrong. Fixes: 98e828a0650f ("security: Refactor declaration of LSM hooks") Signed-off-by: Anders Roxell Signed-off-by: Alexei Starovoitov Acked-by: James Morris Cc: Arnd Bergmann Link: https://lore.kernel.org/bpf/20200512174607.9630-1-anders.roxell@linaro.org --- include/linux/lsm_hook_defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 9cd4455528e5..21f4fff9e4cd 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -243,7 +243,7 @@ LSM_HOOK(int, -EINVAL, getprocattr, struct task_struct *p, char *name, char **value) LSM_HOOK(int, -EINVAL, setprocattr, const char *name, void *value, size_t size) LSM_HOOK(int, 0, ismaclabel, const char *name) -LSM_HOOK(int, 0, secid_to_secctx, u32 secid, char **secdata, +LSM_HOOK(int, -EOPNOTSUPP, secid_to_secctx, u32 secid, char **secdata, u32 *seclen) LSM_HOOK(int, 0, secctx_to_secid, const char *secdata, u32 seclen, u32 *secid) LSM_HOOK(void, LSM_RET_VOID, release_secctx, char *secdata, u32 seclen) From fd4a5177382230d39e0d95632d98103fb2938383 Mon Sep 17 00:00:00 2001 From: Vinod Koul Date: Thu, 14 May 2020 11:58:36 +0530 Subject: [PATCH 51/68] net: stmmac: fix num_por initialization Driver missed initializing num_por which is one of the por values that driver configures to hardware. In order to get these values, add a new structure ethqos_emac_driver_data which holds por and num_por values and populate that in driver probe. Fixes: a7c30e62d4b8 ("net: stmmac: Add driver for Qualcomm ethqos") Reported-by: Rahul Ankushrao Kawadgave Signed-off-by: Vinod Koul Reviewed-by: Amit Kucheria Signed-off-by: David S. Miller --- .../ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c index e0a5fe83d8e0..bfc4a92f1d92 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-qcom-ethqos.c @@ -75,6 +75,11 @@ struct ethqos_emac_por { unsigned int value; }; +struct ethqos_emac_driver_data { + const struct ethqos_emac_por *por; + unsigned int num_por; +}; + struct qcom_ethqos { struct platform_device *pdev; void __iomem *rgmii_base; @@ -171,6 +176,11 @@ static const struct ethqos_emac_por emac_v2_3_0_por[] = { { .offset = RGMII_IO_MACRO_CONFIG2, .value = 0x00002060 }, }; +static const struct ethqos_emac_driver_data emac_v2_3_0_data = { + .por = emac_v2_3_0_por, + .num_por = ARRAY_SIZE(emac_v2_3_0_por), +}; + static int ethqos_dll_configure(struct qcom_ethqos *ethqos) { unsigned int val; @@ -442,6 +452,7 @@ static int qcom_ethqos_probe(struct platform_device *pdev) struct device_node *np = pdev->dev.of_node; struct plat_stmmacenet_data *plat_dat; struct stmmac_resources stmmac_res; + const struct ethqos_emac_driver_data *data; struct qcom_ethqos *ethqos; struct resource *res; int ret; @@ -471,7 +482,9 @@ static int qcom_ethqos_probe(struct platform_device *pdev) goto err_mem; } - ethqos->por = of_device_get_match_data(&pdev->dev); + data = of_device_get_match_data(&pdev->dev); + ethqos->por = data->por; + ethqos->num_por = data->num_por; ethqos->rgmii_clk = devm_clk_get(&pdev->dev, "rgmii"); if (IS_ERR(ethqos->rgmii_clk)) { @@ -526,7 +539,7 @@ static int qcom_ethqos_remove(struct platform_device *pdev) } static const struct of_device_id qcom_ethqos_match[] = { - { .compatible = "qcom,qcs404-ethqos", .data = &emac_v2_3_0_por}, + { .compatible = "qcom,qcs404-ethqos", .data = &emac_v2_3_0_data}, { } }; MODULE_DEVICE_TABLE(of, qcom_ethqos_match); From e92888c72fbdc6f9d07b3b0604c012e81d7c0da7 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 22:32:05 -0700 Subject: [PATCH 52/68] bpf: Enforce returning 0 for fentry/fexit progs Currently, tracing/fentry and tracing/fexit prog return values are not enforced. In trampoline codes, the fentry/fexit prog return values are ignored. Let us enforce it to be 0 to avoid confusion and allows potential future extension. This patch also explicitly added return value checking for tracing/raw_tp, tracing/fmod_ret, and freplace programs such that these program return values can be anything. The purpose are two folds: 1. to make it explicit about return value expectations for these programs in verifier. 2. for tracing prog_type, if a future attach type is added, the default is -ENOTSUPP which will enforce to specify return value ranges explicitly. Fixes: fec56f5890d9 ("bpf: Introduce BPF trampoline") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200514053206.1298415-1-yhs@fb.com --- kernel/bpf/verifier.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fa1d8245b925..a44ba6672688 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7059,6 +7059,23 @@ static int check_return_code(struct bpf_verifier_env *env) return 0; range = tnum_const(0); break; + case BPF_PROG_TYPE_TRACING: + switch (env->prog->expected_attach_type) { + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + range = tnum_const(0); + break; + case BPF_TRACE_RAW_TP: + case BPF_MODIFY_RETURN: + return 0; + default: + return -ENOTSUPP; + } + break; + case BPF_PROG_TYPE_EXT: + /* freplace program can return anything as its return value + * depends on the to-be-replaced kernel func or bpf program. + */ default: return 0; } From 6d74f64b922b8394dccc52576659cb0dc0a1da7b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 May 2020 22:32:07 -0700 Subject: [PATCH 53/68] selftests/bpf: Enforce returning 0 for fentry/fexit programs There are a few fentry/fexit programs returning non-0. The tests with these programs will break with the previous patch which enfoced return-0 rules. Fix them properly. Fixes: ac065870d928 ("selftests/bpf: Add BPF_PROG, BPF_KPROBE, and BPF_KRETPROBE macros") Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200514053207.1298479-1-yhs@fb.com --- tools/testing/selftests/bpf/progs/test_overhead.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_overhead.c b/tools/testing/selftests/bpf/progs/test_overhead.c index 56a50b25cd33..abb7344b531f 100644 --- a/tools/testing/selftests/bpf/progs/test_overhead.c +++ b/tools/testing/selftests/bpf/progs/test_overhead.c @@ -30,13 +30,13 @@ int prog3(struct bpf_raw_tracepoint_args *ctx) SEC("fentry/__set_task_comm") int BPF_PROG(prog4, struct task_struct *tsk, const char *buf, bool exec) { - return !tsk; + return 0; } SEC("fexit/__set_task_comm") int BPF_PROG(prog5, struct task_struct *tsk, const char *buf, bool exec) { - return !tsk; + return 0; } char _license[] SEC("license") = "GPL"; From b8c158395119be62294da73646a3953c29ac974b Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 14 May 2020 12:15:39 +0200 Subject: [PATCH 54/68] pppoe: only process PADT targeted at local interfaces MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We don't want to disconnect a session because of a stray PADT arriving while the interface is in promiscuous mode. Furthermore, multicast and broadcast packets make no sense here, so only PACKET_HOST is accepted. Reported-by: David Balažic Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- drivers/net/ppp/pppoe.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index d760a36db28c..beedaad08255 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -490,6 +490,9 @@ static int pppoe_disc_rcv(struct sk_buff *skb, struct net_device *dev, if (!skb) goto out; + if (skb->pkt_type != PACKET_HOST) + goto abort; + if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr))) goto abort; From 16bb1b505c3c6ec6c0ec5943cef67b23f228979a Mon Sep 17 00:00:00 2001 From: Wang Wenhu Date: Thu, 14 May 2020 04:02:22 -0700 Subject: [PATCH 55/68] drivers: ipa: fix typos for ipa_smp2p structure doc Remove the duplicate "mutex", and change "Motex" to "Mutex". Also I recommend it's easier for understanding to make the "ready-interrupt" a bundle for it is a parallel description as "shutdown" which is appended after the slash. Signed-off-by: Wang Wenhu Cc: Alex Elder Signed-off-by: David S. Miller --- drivers/net/ipa/ipa_smp2p.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ipa/ipa_smp2p.c b/drivers/net/ipa/ipa_smp2p.c index 4d33aa7ebfbb..a5f7a79a1923 100644 --- a/drivers/net/ipa/ipa_smp2p.c +++ b/drivers/net/ipa/ipa_smp2p.c @@ -53,7 +53,7 @@ * @clock_on: Whether IPA clock is on * @notified: Whether modem has been notified of clock state * @disabled: Whether setup ready interrupt handling is disabled - * @mutex mutex: Motex protecting ready interrupt/shutdown interlock + * @mutex: Mutex protecting ready-interrupt/shutdown interlock * @panic_notifier: Panic notifier structure */ struct ipa_smp2p { From 865e525db666767eb7bbcfc7d4ce7326b2b19271 Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Thu, 14 May 2020 13:45:12 +0200 Subject: [PATCH 56/68] MAINTAINERS: another add of Karsten Graul for S390 networking Complete adding of Karsten as maintainer for all S390 networking parts in the kernel. Cc: Julian Wiedmann Acked-by: Julian Wiedmann Signed-off-by: Ursula Braun Signed-off-by: David S. Miller --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 85894787825e..391e7eea6a3e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14655,6 +14655,7 @@ F: net/iucv/ S390 NETWORK DRIVERS M: Julian Wiedmann +M: Karsten Graul M: Ursula Braun L: linux-s390@vger.kernel.org S: Supported From c9e2053d4b1c634064bd3160689fe4ed52978d31 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 14 May 2020 13:13:07 -0700 Subject: [PATCH 57/68] MAINTAINERS: Add Jakub to networking drivers. Signed-off-by: David S. Miller --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 391e7eea6a3e..4b270dbdf09b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11718,6 +11718,7 @@ F: net/core/drop_monitor.c NETWORKING DRIVERS M: "David S. Miller" +M: Jakub Kicinski L: netdev@vger.kernel.org S: Odd Fixes W: http://www.linuxfoundation.org/en/Net From e776af608f692a7a647455106295fa34469e7475 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 May 2020 13:58:13 -0700 Subject: [PATCH 58/68] tcp: fix error recovery in tcp_zerocopy_receive() If user provides wrong virtual address in TCP_ZEROCOPY_RECEIVE operation we want to return -EINVAL error. But depending on zc->recv_skip_hint content, we might return -EIO error if the socket has SOCK_DONE set. Make sure to return -EINVAL in this case. BUG: KMSAN: uninit-value in tcp_zerocopy_receive net/ipv4/tcp.c:1833 [inline] BUG: KMSAN: uninit-value in do_tcp_getsockopt+0x4494/0x6320 net/ipv4/tcp.c:3685 CPU: 1 PID: 625 Comm: syz-executor.0 Not tainted 5.7.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x1c9/0x220 lib/dump_stack.c:118 kmsan_report+0xf7/0x1e0 mm/kmsan/kmsan_report.c:121 __msan_warning+0x58/0xa0 mm/kmsan/kmsan_instr.c:215 tcp_zerocopy_receive net/ipv4/tcp.c:1833 [inline] do_tcp_getsockopt+0x4494/0x6320 net/ipv4/tcp.c:3685 tcp_getsockopt+0xf8/0x1f0 net/ipv4/tcp.c:3728 sock_common_getsockopt+0x13f/0x180 net/core/sock.c:3131 __sys_getsockopt+0x533/0x7b0 net/socket.c:2177 __do_sys_getsockopt net/socket.c:2192 [inline] __se_sys_getsockopt+0xe1/0x100 net/socket.c:2189 __x64_sys_getsockopt+0x62/0x80 net/socket.c:2189 do_syscall_64+0xb8/0x160 arch/x86/entry/common.c:297 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x45c829 Code: 0d b7 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 db b6 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f1deeb72c78 EFLAGS: 00000246 ORIG_RAX: 0000000000000037 RAX: ffffffffffffffda RBX: 00000000004e01e0 RCX: 000000000045c829 RDX: 0000000000000023 RSI: 0000000000000006 RDI: 0000000000000009 RBP: 000000000078bf00 R08: 0000000020000200 R09: 0000000000000000 R10: 00000000200001c0 R11: 0000000000000246 R12: 00000000ffffffff R13: 00000000000001d8 R14: 00000000004d3038 R15: 00007f1deeb736d4 Local variable ----zc@do_tcp_getsockopt created at: do_tcp_getsockopt+0x1a74/0x6320 net/ipv4/tcp.c:3670 do_tcp_getsockopt+0x1a74/0x6320 net/ipv4/tcp.c:3670 Fixes: 05255b823a61 ("tcp: add TCP_ZEROCOPY_RECEIVE support for zerocopy receive") Signed-off-by: Eric Dumazet Reported-by: syzbot Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index a385fcaaa03b..dd401757eea1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1764,10 +1764,11 @@ static int tcp_zerocopy_receive(struct sock *sk, down_read(¤t->mm->mmap_sem); - ret = -EINVAL; vma = find_vma(current->mm, address); - if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) - goto out; + if (!vma || vma->vm_start > address || vma->vm_ops != &tcp_vm_ops) { + up_read(¤t->mm->mmap_sem); + return -EINVAL; + } zc->length = min_t(unsigned long, zc->length, vma->vm_end - address); tp = tcp_sk(sk); From cc8a677a76f419016b5e231207d09b073f9b1d3f Mon Sep 17 00:00:00 2001 From: Kevin Lo Date: Thu, 14 May 2020 08:57:33 +0800 Subject: [PATCH 59/68] net: phy: broadcom: fix BCM54XX_SHD_SCR3_TRDDAPD value for BCM54810 Set the correct bit when checking for PHY_BRCM_DIS_TXCRXC_NOENRGY on the BCM54810 PHY. Fixes: 0ececcfc9267 ("net: phy: broadcom: Allow BCM54810 to use bcm54xx_adjust_rxrefclk()") Signed-off-by: Kevin Lo Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- drivers/net/phy/broadcom.c | 8 ++++++-- include/linux/brcmphy.h | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c index ae4873f2f86e..d14d91b759b7 100644 --- a/drivers/net/phy/broadcom.c +++ b/drivers/net/phy/broadcom.c @@ -225,8 +225,12 @@ static void bcm54xx_adjust_rxrefclk(struct phy_device *phydev) else val |= BCM54XX_SHD_SCR3_DLLAPD_DIS; - if (phydev->dev_flags & PHY_BRCM_DIS_TXCRXC_NOENRGY) - val |= BCM54XX_SHD_SCR3_TRDDAPD; + if (phydev->dev_flags & PHY_BRCM_DIS_TXCRXC_NOENRGY) { + if (BRCM_PHY_MODEL(phydev) == PHY_ID_BCM54810) + val |= BCM54810_SHD_SCR3_TRDDAPD; + else + val |= BCM54XX_SHD_SCR3_TRDDAPD; + } if (orig != val) bcm_phy_write_shadow(phydev, BCM54XX_SHD_SCR3, val); diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 6462c5447872..f4b77018c625 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -245,6 +245,7 @@ #define BCM54810_EXP_BROADREACH_LRE_MISC_CTL_EN (1 << 0) #define BCM54810_SHD_CLK_CTL 0x3 #define BCM54810_SHD_CLK_CTL_GTXCLK_EN (1 << 9) +#define BCM54810_SHD_SCR3_TRDDAPD 0x0100 /* BCM54612E Registers */ #define BCM54612E_EXP_SPARE0 (MII_BCM54XX_EXP_SEL_ETC + 0x34) From 95f59bf88bb75281cc626e283ecefdd5d5641427 Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Thu, 14 May 2020 19:41:15 +0530 Subject: [PATCH 60/68] drivers: net: hamradio: Fix suspicious RCU usage warning in bpqether.c This patch fixes the following warning: ============================= WARNING: suspicious RCU usage 5.7.0-rc5-next-20200514-syzkaller #0 Not tainted ----------------------------- drivers/net/hamradio/bpqether.c:149 RCU-list traversed in non-reader section!! Since rtnl lock is held, pass this cond in list_for_each_entry_rcu(). Reported-by: syzbot+bb82cafc737c002d11ca@syzkaller.appspotmail.com Signed-off-by: Madhuparna Bhowmik Signed-off-by: David S. Miller --- drivers/net/hamradio/bpqether.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c index fbea6f232819..e2ad3c2e8df5 100644 --- a/drivers/net/hamradio/bpqether.c +++ b/drivers/net/hamradio/bpqether.c @@ -127,7 +127,8 @@ static inline struct net_device *bpq_get_ax25_dev(struct net_device *dev) { struct bpqdev *bpq; - list_for_each_entry_rcu(bpq, &bpq_devices, bpq_list) { + list_for_each_entry_rcu(bpq, &bpq_devices, bpq_list, + lockdep_rtnl_is_held()) { if (bpq->ethdev == dev) return bpq->axdev; } From a14fbcd4f157a5a97b6013289205559d246c5021 Mon Sep 17 00:00:00 2001 From: Amol Grover Date: Thu, 14 May 2020 23:31:02 +0530 Subject: [PATCH 61/68] ipmr: Fix RCU list debugging warning ipmr_for_each_table() macro uses list_for_each_entry_rcu() for traversing outside of an RCU read side critical section but under the protection of rtnl_mutex. Hence, add the corresponding lockdep expression to silence the following false-positive warning at boot: [ 4.319347] ============================= [ 4.319349] WARNING: suspicious RCU usage [ 4.319351] 5.5.4-stable #17 Tainted: G E [ 4.319352] ----------------------------- [ 4.319354] net/ipv4/ipmr.c:1757 RCU-list traversed in non-reader section!! Fixes: f0ad0860d01e ("ipv4: ipmr: support multiple tables") Signed-off-by: Amol Grover Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 9cf83cc85e4a..4897f7420c8f 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -110,7 +110,8 @@ static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES #define ipmr_for_each_table(mrt, net) \ - list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list) + list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list, \ + lockdep_rtnl_is_held()) static struct mr_table *ipmr_mr_table_iter(struct net *net, struct mr_table *mrt) From 7013908c2db285cd6b48fcd427a56354beac2233 Mon Sep 17 00:00:00 2001 From: Amol Grover Date: Thu, 14 May 2020 23:31:03 +0530 Subject: [PATCH 62/68] ipmr: Add lockdep expression to ipmr_for_each_table macro During the initialization process, ipmr_new_table() is called to create new tables which in turn calls ipmr_get_table() which traverses net->ipv4.mr_tables without holding the writer lock. However, this is safe to do so as no tables exist at this time. Hence add a suitable lockdep expression to silence the following false-positive warning: ============================= WARNING: suspicious RCU usage 5.7.0-rc3-next-20200428-syzkaller #0 Not tainted ----------------------------- net/ipv4/ipmr.c:136 RCU-list traversed in non-reader section!! ipmr_get_table+0x130/0x160 net/ipv4/ipmr.c:136 ipmr_new_table net/ipv4/ipmr.c:403 [inline] ipmr_rules_init net/ipv4/ipmr.c:248 [inline] ipmr_net_init+0x133/0x430 net/ipv4/ipmr.c:3089 Fixes: f0ad0860d01e ("ipv4: ipmr: support multiple tables") Reported-by: syzbot+1519f497f2f9f08183c6@syzkaller.appspotmail.com Suggested-by: Jakub Kicinski Signed-off-by: Amol Grover Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 4897f7420c8f..5c218db2dede 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -109,9 +109,10 @@ static void mroute_clean_tables(struct mr_table *mrt, int flags); static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES -#define ipmr_for_each_table(mrt, net) \ - list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list, \ - lockdep_rtnl_is_held()) +#define ipmr_for_each_table(mrt, net) \ + list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list, \ + lockdep_rtnl_is_held() || \ + list_empty(&net->ipv4.mr_tables)) static struct mr_table *ipmr_mr_table_iter(struct net *net, struct mr_table *mrt) From 207b584d0ab87e492412fcd69030e34873c7ed5e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 14 May 2020 18:04:41 -0700 Subject: [PATCH 63/68] MAINTAINERS: Mark networking drivers as Maintained. Suggested-by: Andrew Lunn Signed-off-by: David S. Miller --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 4b270dbdf09b..2c59cc557f2b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11720,7 +11720,7 @@ NETWORKING DRIVERS M: "David S. Miller" M: Jakub Kicinski L: netdev@vger.kernel.org -S: Odd Fixes +S: Maintained W: http://www.linuxfoundation.org/en/Net Q: http://patchwork.ozlabs.org/project/netdev/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git From 0ebeea8ca8a4d1d453ad299aef0507dab04f6e8d Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 15 May 2020 12:11:16 +0200 Subject: [PATCH 64/68] bpf: Restrict bpf_probe_read{, str}() only to archs where they work Given the legacy bpf_probe_read{,str}() BPF helpers are broken on archs with overlapping address ranges, we should really take the next step to disable them from BPF use there. To generally fix the situation, we've recently added new helper variants bpf_probe_read_{user,kernel}() and bpf_probe_read_{user,kernel}_str(). For details on them, see 6ae08ae3dea2 ("bpf: Add probe_read_{user, kernel} and probe_read_{user,kernel}_str helpers"). Given bpf_probe_read{,str}() have been around for ~5 years by now, there are plenty of users at least on x86 still relying on them today, so we cannot remove them entirely w/o breaking the BPF tracing ecosystem. However, their use should be restricted to archs with non-overlapping address ranges where they are working in their current form. Therefore, move this behind a CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE and have x86, arm64, arm select it (other archs supporting it can follow-up on it as well). For the remaining archs, they can workaround easily by relying on the feature probe from bpftool which spills out defines that can be used out of BPF C code to implement the drop-in replacement for old/new kernels via: bpftool feature probe macro Suggested-by: Linus Torvalds Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: Masami Hiramatsu Acked-by: Linus Torvalds Cc: Brendan Gregg Cc: Christoph Hellwig Link: https://lore.kernel.org/bpf/20200515101118.6508-2-daniel@iogearbox.net --- arch/arm/Kconfig | 1 + arch/arm64/Kconfig | 1 + arch/x86/Kconfig | 1 + init/Kconfig | 3 +++ kernel/trace/bpf_trace.c | 6 ++++-- 5 files changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 66a04f6f4775..c77c93c485a0 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -12,6 +12,7 @@ config ARM select ARCH_HAS_KEEPINITRD select ARCH_HAS_KCOV select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_SPECIAL if ARM_LPAE select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_SETUP_DMA_OPS diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 40fb05d96c60..5d513f461957 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -20,6 +20,7 @@ config ARM64 select ARCH_HAS_KCOV select ARCH_HAS_KEEPINITRD select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PTE_DEVMAP select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SETUP_DMA_OPS diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1197b5596d5a..2d3f963fd6f1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -68,6 +68,7 @@ config X86 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_PTE_DEVMAP if X86_64 select ARCH_HAS_PTE_SPECIAL diff --git a/init/Kconfig b/init/Kconfig index 9e22ee8fbd75..6fd13a051342 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -2279,6 +2279,9 @@ config ASN1 source "kernel/Kconfig.locks" +config ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + bool + config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE bool diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index ca1796747a77..b83bdaa31c7b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -825,14 +825,16 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: return &bpf_probe_read_kernel_proto; - case BPF_FUNC_probe_read: - return &bpf_probe_read_compat_proto; case BPF_FUNC_probe_read_user_str: return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: return &bpf_probe_read_kernel_str_proto; +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + case BPF_FUNC_probe_read: + return &bpf_probe_read_compat_proto; case BPF_FUNC_probe_read_str: return &bpf_probe_read_compat_str_proto; +#endif #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; From 47cc0ed574abcbbde0cf143ddb21a0baed1aa2df Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 15 May 2020 12:11:17 +0200 Subject: [PATCH 65/68] bpf: Add bpf_probe_read_{user, kernel}_str() to do_refine_retval_range Given bpf_probe_read{,str}() BPF helpers are now only available under CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE, we need to add the drop-in replacements of bpf_probe_read_{kernel,user}_str() to do_refine_retval_range() as well to avoid hitting the same issue as in 849fa50662fbc ("bpf/verifier: refine retval R0 state for bpf_get_stack helper"). Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200515101118.6508-3-daniel@iogearbox.net --- kernel/bpf/verifier.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a44ba6672688..8d7ee40e2748 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4340,7 +4340,9 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, if (ret_type != RET_INTEGER || (func_id != BPF_FUNC_get_stack && - func_id != BPF_FUNC_probe_read_str)) + func_id != BPF_FUNC_probe_read_str && + func_id != BPF_FUNC_probe_read_kernel_str && + func_id != BPF_FUNC_probe_read_user_str)) return; ret_reg->smax_value = meta->msize_max_value; From b2a5212fb634561bb734c6356904e37f6665b955 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 15 May 2020 12:11:18 +0200 Subject: [PATCH 66/68] bpf: Restrict bpf_trace_printk()'s %s usage and add %pks, %pus specifier Usage of plain %s conversion specifier in bpf_trace_printk() suffers from the very same issue as bpf_probe_read{,str}() helpers, that is, it is broken on archs with overlapping address ranges. While the helpers have been addressed through work in 6ae08ae3dea2 ("bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers"), we need an option for bpf_trace_printk() as well to fix it. Similarly as with the helpers, force users to make an explicit choice by adding %pks and %pus specifier to bpf_trace_printk() which will then pick the corresponding strncpy_from_unsafe*() variant to perform the access under KERNEL_DS or USER_DS. The %pk* (kernel specifier) and %pu* (user specifier) can later also be extended for other objects aside strings that are probed and printed under tracing, and reused out of other facilities like bpf_seq_printf() or BTF based type printing. Existing behavior of %s for current users is still kept working for archs where it is not broken and therefore gated through CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE. For archs not having this property we fall-back to pick probing under KERNEL_DS as a sensible default. Fixes: 8d3b7dce8622 ("bpf: add support for %s specifier to bpf_trace_printk()") Reported-by: Linus Torvalds Reported-by: Christoph Hellwig Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Cc: Masami Hiramatsu Cc: Brendan Gregg Link: https://lore.kernel.org/bpf/20200515101118.6508-4-daniel@iogearbox.net --- Documentation/core-api/printk-formats.rst | 14 ++++ kernel/trace/bpf_trace.c | 94 +++++++++++++++-------- lib/vsprintf.c | 12 +++ 3 files changed, 88 insertions(+), 32 deletions(-) diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 8ebe46b1af39..5dfcc4592b23 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -112,6 +112,20 @@ used when printing stack backtraces. The specifier takes into consideration the effect of compiler optimisations which may occur when tail-calls are used and marked with the noreturn GCC attribute. +Probed Pointers from BPF / tracing +---------------------------------- + +:: + + %pks kernel string + %pus user string + +The ``k`` and ``u`` specifiers are used for printing prior probed memory from +either kernel memory (k) or user memory (u). The subsequent ``s`` specifier +results in printing a string. For direct use in regular vsnprintf() the (k) +and (u) annotation is ignored, however, when used out of BPF's bpf_trace_printk(), +for example, it reads the memory it is pointing to without faulting. + Kernel Pointers --------------- diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b83bdaa31c7b..a010edc37ee0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -323,17 +323,15 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void) /* * Only limited trace_printk() conversion specifiers allowed: - * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s + * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %pks %pus %s */ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, u64, arg2, u64, arg3) { + int i, mod[3] = {}, fmt_cnt = 0; + char buf[64], fmt_ptype; + void *unsafe_ptr = NULL; bool str_seen = false; - int mod[3] = {}; - int fmt_cnt = 0; - u64 unsafe_addr; - char buf[64]; - int i; /* * bpf_check()->check_func_arg()->check_stack_boundary() @@ -359,40 +357,71 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, if (fmt[i] == 'l') { mod[fmt_cnt]++; i++; - } else if (fmt[i] == 'p' || fmt[i] == 's') { + } else if (fmt[i] == 'p') { mod[fmt_cnt]++; + if ((fmt[i + 1] == 'k' || + fmt[i + 1] == 'u') && + fmt[i + 2] == 's') { + fmt_ptype = fmt[i + 1]; + i += 2; + goto fmt_str; + } + /* disallow any further format extensions */ if (fmt[i + 1] != 0 && !isspace(fmt[i + 1]) && !ispunct(fmt[i + 1])) return -EINVAL; - fmt_cnt++; - if (fmt[i] == 's') { - if (str_seen) - /* allow only one '%s' per fmt string */ - return -EINVAL; - str_seen = true; - switch (fmt_cnt) { - case 1: - unsafe_addr = arg1; - arg1 = (long) buf; - break; - case 2: - unsafe_addr = arg2; - arg2 = (long) buf; - break; - case 3: - unsafe_addr = arg3; - arg3 = (long) buf; - break; - } - buf[0] = 0; - strncpy_from_unsafe(buf, - (void *) (long) unsafe_addr, - sizeof(buf)); + goto fmt_next; + } else if (fmt[i] == 's') { + mod[fmt_cnt]++; + fmt_ptype = fmt[i]; +fmt_str: + if (str_seen) + /* allow only one '%s' per fmt string */ + return -EINVAL; + str_seen = true; + + if (fmt[i + 1] != 0 && + !isspace(fmt[i + 1]) && + !ispunct(fmt[i + 1])) + return -EINVAL; + + switch (fmt_cnt) { + case 0: + unsafe_ptr = (void *)(long)arg1; + arg1 = (long)buf; + break; + case 1: + unsafe_ptr = (void *)(long)arg2; + arg2 = (long)buf; + break; + case 2: + unsafe_ptr = (void *)(long)arg3; + arg3 = (long)buf; + break; } - continue; + + buf[0] = 0; + switch (fmt_ptype) { + case 's': +#ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + strncpy_from_unsafe(buf, unsafe_ptr, + sizeof(buf)); + break; +#endif + case 'k': + strncpy_from_unsafe_strict(buf, unsafe_ptr, + sizeof(buf)); + break; + case 'u': + strncpy_from_unsafe_user(buf, + (__force void __user *)unsafe_ptr, + sizeof(buf)); + break; + } + goto fmt_next; } if (fmt[i] == 'l') { @@ -403,6 +432,7 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, if (fmt[i] != 'i' && fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') return -EINVAL; +fmt_next: fmt_cnt++; } diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 7c488a1ce318..532b6606a18a 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -2168,6 +2168,10 @@ char *fwnode_string(char *buf, char *end, struct fwnode_handle *fwnode, * f full name * P node name, including a possible unit address * - 'x' For printing the address. Equivalent to "%lx". + * - '[ku]s' For a BPF/tracing related format specifier, e.g. used out of + * bpf_trace_printk() where [ku] prefix specifies either kernel (k) + * or user (u) memory to probe, and: + * s a string, equivalent to "%s" on direct vsnprintf() use * * ** When making changes please also update: * Documentation/core-api/printk-formats.rst @@ -2251,6 +2255,14 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, if (!IS_ERR(ptr)) break; return err_ptr(buf, end, ptr, spec); + case 'u': + case 'k': + switch (fmt[1]) { + case 's': + return string(buf, end, ptr, spec); + default: + return error_string(buf, end, "(einval)", spec); + } } /* default is to _not_ leak addresses, hash before printing */ From efa6a7d07523ffbbf6503c1a7eeb52201c15c0e3 Mon Sep 17 00:00:00 2001 From: Ioana Ciornei Date: Fri, 15 May 2020 15:30:22 +0300 Subject: [PATCH 67/68] dpaa2-eth: properly handle buffer size restrictions Depending on the WRIOP version, the buffer size on the RX path must by a multiple of 64 or 256. Handle this restriction properly by aligning down the buffer size to the necessary value. Also, use the new buffer size dynamically computed instead of the compile time one. Fixes: 27c874867c4e ("dpaa2-eth: Use a single page per Rx buffer") Signed-off-by: Ioana Ciornei Signed-off-by: David S. Miller --- .../net/ethernet/freescale/dpaa2/dpaa2-eth.c | 29 +++++++++++-------- .../net/ethernet/freescale/dpaa2/dpaa2-eth.h | 1 + 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index b6c46639aa4c..d97c320a2dc0 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -86,7 +86,7 @@ static void free_rx_fd(struct dpaa2_eth_priv *priv, for (i = 1; i < DPAA2_ETH_MAX_SG_ENTRIES; i++) { addr = dpaa2_sg_get_addr(&sgt[i]); sg_vaddr = dpaa2_iova_to_virt(priv->iommu_domain, addr); - dma_unmap_page(dev, addr, DPAA2_ETH_RX_BUF_SIZE, + dma_unmap_page(dev, addr, priv->rx_buf_size, DMA_BIDIRECTIONAL); free_pages((unsigned long)sg_vaddr, 0); @@ -144,7 +144,7 @@ static struct sk_buff *build_frag_skb(struct dpaa2_eth_priv *priv, /* Get the address and length from the S/G entry */ sg_addr = dpaa2_sg_get_addr(sge); sg_vaddr = dpaa2_iova_to_virt(priv->iommu_domain, sg_addr); - dma_unmap_page(dev, sg_addr, DPAA2_ETH_RX_BUF_SIZE, + dma_unmap_page(dev, sg_addr, priv->rx_buf_size, DMA_BIDIRECTIONAL); sg_length = dpaa2_sg_get_len(sge); @@ -185,7 +185,7 @@ static struct sk_buff *build_frag_skb(struct dpaa2_eth_priv *priv, (page_address(page) - page_address(head_page)); skb_add_rx_frag(skb, i - 1, head_page, page_offset, - sg_length, DPAA2_ETH_RX_BUF_SIZE); + sg_length, priv->rx_buf_size); } if (dpaa2_sg_is_final(sge)) @@ -211,7 +211,7 @@ static void free_bufs(struct dpaa2_eth_priv *priv, u64 *buf_array, int count) for (i = 0; i < count; i++) { vaddr = dpaa2_iova_to_virt(priv->iommu_domain, buf_array[i]); - dma_unmap_page(dev, buf_array[i], DPAA2_ETH_RX_BUF_SIZE, + dma_unmap_page(dev, buf_array[i], priv->rx_buf_size, DMA_BIDIRECTIONAL); free_pages((unsigned long)vaddr, 0); } @@ -335,7 +335,7 @@ static u32 run_xdp(struct dpaa2_eth_priv *priv, break; case XDP_REDIRECT: dma_unmap_page(priv->net_dev->dev.parent, addr, - DPAA2_ETH_RX_BUF_SIZE, DMA_BIDIRECTIONAL); + priv->rx_buf_size, DMA_BIDIRECTIONAL); ch->buf_count--; xdp.data_hard_start = vaddr; err = xdp_do_redirect(priv->net_dev, &xdp, xdp_prog); @@ -374,7 +374,7 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv, trace_dpaa2_rx_fd(priv->net_dev, fd); vaddr = dpaa2_iova_to_virt(priv->iommu_domain, addr); - dma_sync_single_for_cpu(dev, addr, DPAA2_ETH_RX_BUF_SIZE, + dma_sync_single_for_cpu(dev, addr, priv->rx_buf_size, DMA_BIDIRECTIONAL); fas = dpaa2_get_fas(vaddr, false); @@ -393,13 +393,13 @@ static void dpaa2_eth_rx(struct dpaa2_eth_priv *priv, return; } - dma_unmap_page(dev, addr, DPAA2_ETH_RX_BUF_SIZE, + dma_unmap_page(dev, addr, priv->rx_buf_size, DMA_BIDIRECTIONAL); skb = build_linear_skb(ch, fd, vaddr); } else if (fd_format == dpaa2_fd_sg) { WARN_ON(priv->xdp_prog); - dma_unmap_page(dev, addr, DPAA2_ETH_RX_BUF_SIZE, + dma_unmap_page(dev, addr, priv->rx_buf_size, DMA_BIDIRECTIONAL); skb = build_frag_skb(priv, ch, buf_data); free_pages((unsigned long)vaddr, 0); @@ -974,7 +974,7 @@ static int add_bufs(struct dpaa2_eth_priv *priv, if (!page) goto err_alloc; - addr = dma_map_page(dev, page, 0, DPAA2_ETH_RX_BUF_SIZE, + addr = dma_map_page(dev, page, 0, priv->rx_buf_size, DMA_BIDIRECTIONAL); if (unlikely(dma_mapping_error(dev, addr))) goto err_map; @@ -984,7 +984,7 @@ static int add_bufs(struct dpaa2_eth_priv *priv, /* tracing point */ trace_dpaa2_eth_buf_seed(priv->net_dev, page, DPAA2_ETH_RX_BUF_RAW_SIZE, - addr, DPAA2_ETH_RX_BUF_SIZE, + addr, priv->rx_buf_size, bpid); } @@ -1720,7 +1720,7 @@ static bool xdp_mtu_valid(struct dpaa2_eth_priv *priv, int mtu) int mfl, linear_mfl; mfl = DPAA2_ETH_L2_MAX_FRM(mtu); - linear_mfl = DPAA2_ETH_RX_BUF_SIZE - DPAA2_ETH_RX_HWA_SIZE - + linear_mfl = priv->rx_buf_size - DPAA2_ETH_RX_HWA_SIZE - dpaa2_eth_rx_head_room(priv) - XDP_PACKET_HEADROOM; if (mfl > linear_mfl) { @@ -2462,6 +2462,11 @@ static int set_buffer_layout(struct dpaa2_eth_priv *priv) else rx_buf_align = DPAA2_ETH_RX_BUF_ALIGN; + /* We need to ensure that the buffer size seen by WRIOP is a multiple + * of 64 or 256 bytes depending on the WRIOP version. + */ + priv->rx_buf_size = ALIGN_DOWN(DPAA2_ETH_RX_BUF_SIZE, rx_buf_align); + /* tx buffer */ buf_layout.private_data_size = DPAA2_ETH_SWA_SIZE; buf_layout.pass_timestamp = true; @@ -3126,7 +3131,7 @@ static int bind_dpni(struct dpaa2_eth_priv *priv) pools_params.num_dpbp = 1; pools_params.pools[0].dpbp_id = priv->dpbp_dev->obj_desc.id; pools_params.pools[0].backup_pool = 0; - pools_params.pools[0].buffer_size = DPAA2_ETH_RX_BUF_SIZE; + pools_params.pools[0].buffer_size = priv->rx_buf_size; err = dpni_set_pools(priv->mc_io, 0, priv->mc_token, &pools_params); if (err) { dev_err(dev, "dpni_set_pools() failed\n"); diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h index 7635db3ef903..13242bf5b427 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.h @@ -382,6 +382,7 @@ struct dpaa2_eth_priv { u16 tx_data_offset; struct fsl_mc_device *dpbp_dev; + u16 rx_buf_size; u16 bpid; struct iommu_domain *iommu_domain; From 9a2dbb59ebd17a91f99a20b0286dfb6445981ecf Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Fri, 15 May 2020 17:54:41 +0200 Subject: [PATCH 68/68] selftests: mptcp: pm: rm the right tmp file "$err" is a variable pointing to a temp file. "$out" is not: only used as a local variable in "check()" and representing the output of a command line. Fixes: eedbc685321b (selftests: add PM netlink functional tests) Signed-off-by: Matthieu Baerts Signed-off-by: David S. Miller --- tools/testing/selftests/net/mptcp/pm_netlink.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/mptcp/pm_netlink.sh b/tools/testing/selftests/net/mptcp/pm_netlink.sh index 9172746b6cf0..15f4f46ca3a9 100755 --- a/tools/testing/selftests/net/mptcp/pm_netlink.sh +++ b/tools/testing/selftests/net/mptcp/pm_netlink.sh @@ -30,7 +30,7 @@ ret=0 cleanup() { - rm -f $out + rm -f $err ip netns del $ns1 }