From 1904fb9ebf911441f90a68e96b22aa73e4410505 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 5 Nov 2024 17:52:34 -0800 Subject: [PATCH 01/38] netlink: terminate outstanding dump on socket close Netlink supports iterative dumping of data. It provides the families the following ops: - start - (optional) kicks off the dumping process - dump - actual dump helper, keeps getting called until it returns 0 - done - (optional) pairs with .start, can be used for cleanup The whole process is asynchronous and the repeated calls to .dump don't actually happen in a tight loop, but rather are triggered in response to recvmsg() on the socket. This gives the user full control over the dump, but also means that the user can close the socket without getting to the end of the dump. To make sure .start is always paired with .done we check if there is an ongoing dump before freeing the socket, and if so call .done. The complication is that sockets can get freed from BH and .done is allowed to sleep. So we use a workqueue to defer the call, when needed. Unfortunately this does not work correctly. What we defer is not the cleanup but rather releasing a reference on the socket. We have no guarantee that we own the last reference, if someone else holds the socket they may release it in BH and we're back to square one. The whole dance, however, appears to be unnecessary. Only the user can interact with dumps, so we can clean up when socket is closed. And close always happens in process context. Some async code may still access the socket after close, queue notification skbs to it etc. but no dumps can start, end or otherwise make progress. Delete the workqueue and flush the dump state directly from the release handler. Note that further cleanup is possible in -next, for instance we now always call .done before releasing the main module reference, so dump doesn't have to take a reference of its own. Reported-by: syzkaller Fixes: ed5d7788a934 ("netlink: Do not schedule work from sk_destruct") Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241106015235.2458807-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/netlink/af_netlink.c | 31 ++++++++----------------------- net/netlink/af_netlink.h | 2 -- 2 files changed, 8 insertions(+), 25 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 0a9287fadb47..f84aad420d44 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -393,15 +393,6 @@ static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) static void netlink_sock_destruct(struct sock *sk) { - struct netlink_sock *nlk = nlk_sk(sk); - - if (nlk->cb_running) { - if (nlk->cb.done) - nlk->cb.done(&nlk->cb); - module_put(nlk->cb.module); - kfree_skb(nlk->cb.skb); - } - skb_queue_purge(&sk->sk_receive_queue); if (!sock_flag(sk, SOCK_DEAD)) { @@ -414,14 +405,6 @@ static void netlink_sock_destruct(struct sock *sk) WARN_ON(nlk_sk(sk)->groups); } -static void netlink_sock_destruct_work(struct work_struct *work) -{ - struct netlink_sock *nlk = container_of(work, struct netlink_sock, - work); - - sk_free(&nlk->sk); -} - /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on * SMP. Look, when several writers sleep and reader wakes them up, all but one * immediately hit write lock and grab all the cpus. Exclusive sleep solves @@ -731,12 +714,6 @@ static void deferred_put_nlk_sk(struct rcu_head *head) if (!refcount_dec_and_test(&sk->sk_refcnt)) return; - if (nlk->cb_running && nlk->cb.done) { - INIT_WORK(&nlk->work, netlink_sock_destruct_work); - schedule_work(&nlk->work); - return; - } - sk_free(sk); } @@ -788,6 +765,14 @@ static int netlink_release(struct socket *sock) NETLINK_URELEASE, &n); } + /* Terminate any outstanding dump */ + if (nlk->cb_running) { + if (nlk->cb.done) + nlk->cb.done(&nlk->cb); + module_put(nlk->cb.module); + kfree_skb(nlk->cb.skb); + } + module_put(nlk->module); if (netlink_is_kernel(sk)) { diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 5b0e4e62ab8b..778a3809361f 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -4,7 +4,6 @@ #include #include -#include #include /* flags */ @@ -50,7 +49,6 @@ struct netlink_sock { struct rhash_head node; struct rcu_head rcu; - struct work_struct work; }; static inline struct netlink_sock *nlk_sk(struct sock *sk) From 55d42a0c3f9ccd07c199e0ddbe1ba87572d30074 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 5 Nov 2024 17:52:35 -0800 Subject: [PATCH 02/38] selftests: net: add a test for closing a netlink socket ith dump in progress Close a socket with dump in progress. We need a dump which generates enough info not to fit into a single skb. Policy dump fits the bill. Use the trick discovered by syzbot for keeping a ref on the socket longer than just close, with mqueue. TAP version 13 1..3 # Starting 3 tests from 1 test cases. # RUN global.test_sanity ... # OK global.test_sanity ok 1 global.test_sanity # RUN global.close_in_progress ... # OK global.close_in_progress ok 2 global.close_in_progress # RUN global.close_with_ref ... # OK global.close_with_ref ok 3 global.close_with_ref # PASSED: 3 / 3 tests passed. # Totals: pass:3 fail:0 xfail:0 xpass:0 skip:0 error:0 Note that this test is not expected to fail but rather crash the kernel if we get the cleanup wrong. Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20241106015235.2458807-2-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/netlink-dumps.c | 110 ++++++++++++++++++++ 2 files changed, 111 insertions(+) create mode 100644 tools/testing/selftests/net/netlink-dumps.c diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 649f1fe0dc46..5e86f7a51b43 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -78,6 +78,7 @@ TEST_PROGS += test_vxlan_vnifiltering.sh TEST_GEN_FILES += io_uring_zerocopy_tx TEST_PROGS += io_uring_zerocopy_tx.sh TEST_GEN_FILES += bind_bhash +TEST_GEN_PROGS += netlink-dumps TEST_GEN_PROGS += sk_bind_sendto_listen TEST_GEN_PROGS += sk_connect_zero_addr TEST_GEN_PROGS += sk_so_peek_off diff --git a/tools/testing/selftests/net/netlink-dumps.c b/tools/testing/selftests/net/netlink-dumps.c new file mode 100644 index 000000000000..7ee6dcd334df --- /dev/null +++ b/tools/testing/selftests/net/netlink-dumps.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0 + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "../kselftest_harness.h" + +static const struct { + struct nlmsghdr nlhdr; + struct genlmsghdr genlhdr; + struct nlattr ahdr; + __u16 val; + __u16 pad; +} dump_policies = { + .nlhdr = { + .nlmsg_len = sizeof(dump_policies), + .nlmsg_type = GENL_ID_CTRL, + .nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP, + .nlmsg_seq = 1, + }, + .genlhdr = { + .cmd = CTRL_CMD_GETPOLICY, + .version = 2, + }, + .ahdr = { + .nla_len = 6, + .nla_type = CTRL_ATTR_FAMILY_ID, + }, + .val = GENL_ID_CTRL, + .pad = 0, +}; + +// Sanity check for the test itself, make sure the dump doesn't fit in one msg +TEST(test_sanity) +{ + int netlink_sock; + char buf[8192]; + ssize_t n; + + netlink_sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); + ASSERT_GE(netlink_sock, 0); + + n = send(netlink_sock, &dump_policies, sizeof(dump_policies), 0); + ASSERT_EQ(n, sizeof(dump_policies)); + + n = recv(netlink_sock, buf, sizeof(buf), MSG_DONTWAIT); + ASSERT_GE(n, sizeof(struct nlmsghdr)); + + n = recv(netlink_sock, buf, sizeof(buf), MSG_DONTWAIT); + ASSERT_GE(n, sizeof(struct nlmsghdr)); + + close(netlink_sock); +} + +TEST(close_in_progress) +{ + int netlink_sock; + ssize_t n; + + netlink_sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); + ASSERT_GE(netlink_sock, 0); + + n = send(netlink_sock, &dump_policies, sizeof(dump_policies), 0); + ASSERT_EQ(n, sizeof(dump_policies)); + + close(netlink_sock); +} + +TEST(close_with_ref) +{ + char cookie[NOTIFY_COOKIE_LEN] = {}; + int netlink_sock, mq_fd; + struct sigevent sigev; + ssize_t n; + + netlink_sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC); + ASSERT_GE(netlink_sock, 0); + + n = send(netlink_sock, &dump_policies, sizeof(dump_policies), 0); + ASSERT_EQ(n, sizeof(dump_policies)); + + mq_fd = syscall(__NR_mq_open, "sed", O_CREAT | O_WRONLY, 0600, 0); + ASSERT_GE(mq_fd, 0); + + memset(&sigev, 0, sizeof(sigev)); + sigev.sigev_notify = SIGEV_THREAD; + sigev.sigev_value.sival_ptr = cookie; + sigev.sigev_signo = netlink_sock; + + syscall(__NR_mq_notify, mq_fd, &sigev); + + close(netlink_sock); + + // give mqueue time to fire + usleep(100 * 1000); +} + +TEST_HARNESS_MAIN From eb72e7fcc83987d5d5595b43222f23b295d5de7f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Nov 2024 19:20:21 +0000 Subject: [PATCH 03/38] sctp: fix possible UAF in sctp_v6_available() A lockdep report [1] with CONFIG_PROVE_RCU_LIST=y hints that sctp_v6_available() is calling dev_get_by_index_rcu() and ipv6_chk_addr() without holding rcu. [1] ============================= WARNING: suspicious RCU usage 6.12.0-rc5-virtme #1216 Tainted: G W ----------------------------- net/core/dev.c:876 RCU-list traversed in non-reader section!! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by sctp_hello/31495: #0: ffff9f1ebbdb7418 (sk_lock-AF_INET6){+.+.}-{0:0}, at: sctp_bind (./arch/x86/include/asm/jump_label.h:27 net/sctp/socket.c:315) sctp stack backtrace: CPU: 7 UID: 0 PID: 31495 Comm: sctp_hello Tainted: G W 6.12.0-rc5-virtme #1216 Tainted: [W]=WARN Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:123) lockdep_rcu_suspicious (kernel/locking/lockdep.c:6822) dev_get_by_index_rcu (net/core/dev.c:876 (discriminator 7)) sctp_v6_available (net/sctp/ipv6.c:701) sctp sctp_do_bind (net/sctp/socket.c:400 (discriminator 1)) sctp sctp_bind (net/sctp/socket.c:320) sctp inet6_bind_sk (net/ipv6/af_inet6.c:465) ? security_socket_bind (security/security.c:4581 (discriminator 1)) __sys_bind (net/socket.c:1848 net/socket.c:1869) ? do_user_addr_fault (./include/linux/rcupdate.h:347 ./include/linux/rcupdate.h:880 ./include/linux/mm.h:729 arch/x86/mm/fault.c:1340) ? do_user_addr_fault (./arch/x86/include/asm/preempt.h:84 (discriminator 13) ./include/linux/rcupdate.h:98 (discriminator 13) ./include/linux/rcupdate.h:882 (discriminator 13) ./include/linux/mm.h:729 (discriminator 13) arch/x86/mm/fault.c:1340 (discriminator 13)) __x64_sys_bind (net/socket.c:1877 (discriminator 1) net/socket.c:1875 (discriminator 1) net/socket.c:1875 (discriminator 1)) do_syscall_64 (arch/x86/entry/common.c:52 (discriminator 1) arch/x86/entry/common.c:83 (discriminator 1)) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) RIP: 0033:0x7f59b934a1e7 Code: 44 00 00 48 8b 15 39 8c 0c 00 f7 d8 64 89 02 b8 ff ff ff ff eb bd 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 b8 31 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 09 8c 0c 00 f7 d8 64 89 01 48 All code ======== 0: 44 00 00 add %r8b,(%rax) 3: 48 8b 15 39 8c 0c 00 mov 0xc8c39(%rip),%rdx # 0xc8c43 a: f7 d8 neg %eax c: 64 89 02 mov %eax,%fs:(%rdx) f: b8 ff ff ff ff mov $0xffffffff,%eax 14: eb bd jmp 0xffffffffffffffd3 16: 66 2e 0f 1f 84 00 00 cs nopw 0x0(%rax,%rax,1) 1d: 00 00 00 20: 0f 1f 00 nopl (%rax) 23: b8 31 00 00 00 mov $0x31,%eax 28: 0f 05 syscall 2a:* 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax <-- trapping instruction 30: 73 01 jae 0x33 32: c3 ret 33: 48 8b 0d 09 8c 0c 00 mov 0xc8c09(%rip),%rcx # 0xc8c43 3a: f7 d8 neg %eax 3c: 64 89 01 mov %eax,%fs:(%rcx) 3f: 48 rex.W Code starting with the faulting instruction =========================================== 0: 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax 6: 73 01 jae 0x9 8: c3 ret 9: 48 8b 0d 09 8c 0c 00 mov 0xc8c09(%rip),%rcx # 0xc8c19 10: f7 d8 neg %eax 12: 64 89 01 mov %eax,%fs:(%rcx) 15: 48 rex.W RSP: 002b:00007ffe2d0ad398 EFLAGS: 00000202 ORIG_RAX: 0000000000000031 RAX: ffffffffffffffda RBX: 00007ffe2d0ad3d0 RCX: 00007f59b934a1e7 RDX: 000000000000001c RSI: 00007ffe2d0ad3d0 RDI: 0000000000000005 RBP: 0000000000000005 R08: 1999999999999999 R09: 0000000000000000 R10: 00007f59b9253298 R11: 0000000000000202 R12: 00007ffe2d0ada61 R13: 0000000000000000 R14: 0000562926516dd8 R15: 00007f59b9479000 Fixes: 6fe1e52490a9 ("sctp: check ipv6 addr with sk_bound_dev if set") Signed-off-by: Eric Dumazet Cc: Marcelo Ricardo Leitner Acked-by: Xin Long Link: https://patch.msgid.link/20241107192021.2579789-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/sctp/ipv6.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index f7b809c0d142..38e2fbdcbeac 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -683,7 +683,7 @@ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) struct sock *sk = &sp->inet.sk; struct net *net = sock_net(sk); struct net_device *dev = NULL; - int type; + int type, res, bound_dev_if; type = ipv6_addr_type(in6); if (IPV6_ADDR_ANY == type) @@ -697,14 +697,21 @@ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) if (!(type & IPV6_ADDR_UNICAST)) return 0; - if (sk->sk_bound_dev_if) { - dev = dev_get_by_index_rcu(net, sk->sk_bound_dev_if); + rcu_read_lock(); + bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); + if (bound_dev_if) { + res = 0; + dev = dev_get_by_index_rcu(net, bound_dev_if); if (!dev) - return 0; + goto out; } - return ipv6_can_nonlocal_bind(net, &sp->inet) || - ipv6_chk_addr(net, in6, dev, 0); + res = ipv6_can_nonlocal_bind(net, &sp->inet) || + ipv6_chk_addr(net, in6, dev, 0); + +out: + rcu_read_unlock(); + return res; } /* This function checks if the address is a valid address to be used for From e68da664d379f352d41d7955712c44e0a738e4ab Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Fri, 8 Nov 2024 12:43:43 +0100 Subject: [PATCH 04/38] net: vertexcom: mse102x: Fix tx_bytes calculation The tx_bytes should consider the actual size of the Ethernet frames without the SPI encapsulation. But we still need to take care of Ethernet padding. Fixes: 2f207cbf0dd4 ("net: vertexcom: Add MSE102x SPI support") Signed-off-by: Stefan Wahren Link: https://patch.msgid.link/20241108114343.6174-3-wahrenst@gmx.net Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/vertexcom/mse102x.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/vertexcom/mse102x.c b/drivers/net/ethernet/vertexcom/mse102x.c index 2c37957478fb..89dc4c401a8d 100644 --- a/drivers/net/ethernet/vertexcom/mse102x.c +++ b/drivers/net/ethernet/vertexcom/mse102x.c @@ -437,13 +437,15 @@ static void mse102x_tx_work(struct work_struct *work) mse = &mses->mse102x; while ((txb = skb_dequeue(&mse->txq))) { + unsigned int len = max_t(unsigned int, txb->len, ETH_ZLEN); + mutex_lock(&mses->lock); ret = mse102x_tx_pkt_spi(mse, txb, work_timeout); mutex_unlock(&mses->lock); if (ret) { mse->ndev->stats.tx_dropped++; } else { - mse->ndev->stats.tx_bytes += txb->len; + mse->ndev->stats.tx_bytes += len; mse->ndev->stats.tx_packets++; } From 252e01e68241d33bfe0ed1fc333220d9bd8b06df Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 7 Nov 2024 16:47:31 -0800 Subject: [PATCH 05/38] selftests: net: add netlink-dumps to .gitignore Commit 55d42a0c3f9c ("selftests: net: add a test for closing a netlink socket ith dump in progress") added a new test but did not add it to gitignore. Reviewed-by: Joe Damato Link: https://patch.msgid.link/20241108004731.2979878-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/.gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore index 217d8b7a7365..59fe07ee2df9 100644 --- a/tools/testing/selftests/net/.gitignore +++ b/tools/testing/selftests/net/.gitignore @@ -19,6 +19,7 @@ log.txt msg_oob msg_zerocopy ncdevmem +netlink-dumps nettest psock_fanout psock_snd From 073d89808c065ac4c672c0a613a71b27a80691cb Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Thu, 7 Nov 2024 10:34:05 +0800 Subject: [PATCH 06/38] net: fix data-races around sk->sk_forward_alloc Syzkaller reported this warning: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 16 at net/ipv4/af_inet.c:156 inet_sock_destruct+0x1c5/0x1e0 Modules linked in: CPU: 0 UID: 0 PID: 16 Comm: ksoftirqd/0 Not tainted 6.12.0-rc5 #26 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 RIP: 0010:inet_sock_destruct+0x1c5/0x1e0 Code: 24 12 4c 89 e2 5b 48 c7 c7 98 ec bb 82 41 5c e9 d1 18 17 ff 4c 89 e6 5b 48 c7 c7 d0 ec bb 82 41 5c e9 bf 18 17 ff 0f 0b eb 83 <0f> 0b eb 97 0f 0b eb 87 0f 0b e9 68 ff ff ff 66 66 2e 0f 1f 84 00 RSP: 0018:ffffc9000008bd90 EFLAGS: 00010206 RAX: 0000000000000300 RBX: ffff88810b172a90 RCX: 0000000000000007 RDX: 0000000000000002 RSI: 0000000000000300 RDI: ffff88810b172a00 RBP: ffff88810b172a00 R08: ffff888104273c00 R09: 0000000000100007 R10: 0000000000020000 R11: 0000000000000006 R12: ffff88810b172a00 R13: 0000000000000004 R14: 0000000000000000 R15: ffff888237c31f78 FS: 0000000000000000(0000) GS:ffff888237c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007ffc63fecac8 CR3: 000000000342e000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: ? __warn+0x88/0x130 ? inet_sock_destruct+0x1c5/0x1e0 ? report_bug+0x18e/0x1a0 ? handle_bug+0x53/0x90 ? exc_invalid_op+0x18/0x70 ? asm_exc_invalid_op+0x1a/0x20 ? inet_sock_destruct+0x1c5/0x1e0 __sk_destruct+0x2a/0x200 rcu_do_batch+0x1aa/0x530 ? rcu_do_batch+0x13b/0x530 rcu_core+0x159/0x2f0 handle_softirqs+0xd3/0x2b0 ? __pfx_smpboot_thread_fn+0x10/0x10 run_ksoftirqd+0x25/0x30 smpboot_thread_fn+0xdd/0x1d0 kthread+0xd3/0x100 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x34/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 ---[ end trace 0000000000000000 ]--- Its possible that two threads call tcp_v6_do_rcv()/sk_forward_alloc_add() concurrently when sk->sk_state == TCP_LISTEN with sk->sk_lock unlocked, which triggers a data-race around sk->sk_forward_alloc: tcp_v6_rcv tcp_v6_do_rcv skb_clone_and_charge_r sk_rmem_schedule __sk_mem_schedule sk_forward_alloc_add() skb_set_owner_r sk_mem_charge sk_forward_alloc_add() __kfree_skb skb_release_all skb_release_head_state sock_rfree sk_mem_uncharge sk_forward_alloc_add() sk_mem_reclaim // set local var reclaimable __sk_mem_reclaim sk_forward_alloc_add() In this syzkaller testcase, two threads call tcp_v6_do_rcv() with skb->truesize=768, the sk_forward_alloc changes like this: (cpu 1) | (cpu 2) | sk_forward_alloc ... | ... | 0 __sk_mem_schedule() | | +4096 = 4096 | __sk_mem_schedule() | +4096 = 8192 sk_mem_charge() | | -768 = 7424 | sk_mem_charge() | -768 = 6656 ... | ... | sk_mem_uncharge() | | +768 = 7424 reclaimable=7424 | | | sk_mem_uncharge() | +768 = 8192 | reclaimable=8192 | __sk_mem_reclaim() | | -4096 = 4096 | __sk_mem_reclaim() | -8192 = -4096 != 0 The skb_clone_and_charge_r() should not be called in tcp_v6_do_rcv() when sk->sk_state is TCP_LISTEN, it happens later in tcp_v6_syn_recv_sock(). Fix the same issue in dccp_v6_do_rcv(). Suggested-by: Eric Dumazet Reviewed-by: Eric Dumazet Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets") Signed-off-by: Wang Liang Link: https://patch.msgid.link/20241107023405.889239-1-wangliang74@huawei.com Signed-off-by: Jakub Kicinski --- net/dccp/ipv6.c | 2 +- net/ipv6/tcp_ipv6.c | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index da5dba120bc9..d6649246188d 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -618,7 +618,7 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) by tcp. Feel free to propose better solution. --ANK (980728) */ - if (np->rxopt.all) + if (np->rxopt.all && sk->sk_state != DCCP_LISTEN) opt_skb = skb_clone_and_charge_r(skb, sk); if (sk->sk_state == DCCP_OPEN) { /* Fast path */ diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index d71ab4e1efe1..c9de5ef8f267 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1618,7 +1618,7 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) by tcp. Feel free to propose better solution. --ANK (980728) */ - if (np->rxopt.all) + if (np->rxopt.all && sk->sk_state != TCP_LISTEN) opt_skb = skb_clone_and_charge_r(skb, sk); if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ @@ -1656,8 +1656,6 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (reason) goto reset; } - if (opt_skb) - __kfree_skb(opt_skb); return 0; } } else From f2685c00c3222305f5b6740a8b16ea044640283a Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Thu, 7 Nov 2024 21:03:30 +0000 Subject: [PATCH 07/38] net: fix SO_DEVMEM_DONTNEED looping too long Exit early if we're freeing more than 1024 frags, to prevent looping too long. Also minor code cleanups: - Flip checks to reduce indentation. - Use sizeof(*tokens) everywhere for consistentcy. Cc: Yi Lai Signed-off-by: Mina Almasry Acked-by: Stanislav Fomichev Link: https://patch.msgid.link/20241107210331.3044434-1-almasrymina@google.com Signed-off-by: Jakub Kicinski --- net/core/sock.c | 42 ++++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/net/core/sock.c b/net/core/sock.c index 039be95c40cf..da50df485090 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1052,32 +1052,34 @@ static int sock_reserve_memory(struct sock *sk, int bytes) #ifdef CONFIG_PAGE_POOL -/* This is the number of tokens that the user can SO_DEVMEM_DONTNEED in - * 1 syscall. The limit exists to limit the amount of memory the kernel - * allocates to copy these tokens. +/* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED + * in 1 syscall. The limit exists to limit the amount of memory the kernel + * allocates to copy these tokens, and to prevent looping over the frags for + * too long. */ #define MAX_DONTNEED_TOKENS 128 +#define MAX_DONTNEED_FRAGS 1024 static noinline_for_stack int sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) { unsigned int num_tokens, i, j, k, netmem_num = 0; struct dmabuf_token *tokens; + int ret = 0, num_frags = 0; netmem_ref netmems[16]; - int ret = 0; if (!sk_is_tcp(sk)) return -EBADF; - if (optlen % sizeof(struct dmabuf_token) || + if (optlen % sizeof(*tokens) || optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS) return -EINVAL; - tokens = kvmalloc_array(optlen, sizeof(*tokens), GFP_KERNEL); + num_tokens = optlen / sizeof(*tokens); + tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL); if (!tokens) return -ENOMEM; - num_tokens = optlen / sizeof(struct dmabuf_token); if (copy_from_sockptr(tokens, optval, optlen)) { kvfree(tokens); return -EFAULT; @@ -1086,24 +1088,28 @@ sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) xa_lock_bh(&sk->sk_user_frags); for (i = 0; i < num_tokens; i++) { for (j = 0; j < tokens[i].token_count; j++) { + if (++num_frags > MAX_DONTNEED_FRAGS) + goto frag_limit_reached; + netmem_ref netmem = (__force netmem_ref)__xa_erase( &sk->sk_user_frags, tokens[i].token_start + j); - if (netmem && - !WARN_ON_ONCE(!netmem_is_net_iov(netmem))) { - netmems[netmem_num++] = netmem; - if (netmem_num == ARRAY_SIZE(netmems)) { - xa_unlock_bh(&sk->sk_user_frags); - for (k = 0; k < netmem_num; k++) - WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); - netmem_num = 0; - xa_lock_bh(&sk->sk_user_frags); - } - ret++; + if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem))) + continue; + + netmems[netmem_num++] = netmem; + if (netmem_num == ARRAY_SIZE(netmems)) { + xa_unlock_bh(&sk->sk_user_frags); + for (k = 0; k < netmem_num; k++) + WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); + netmem_num = 0; + xa_lock_bh(&sk->sk_user_frags); } + ret++; } } +frag_limit_reached: xa_unlock_bh(&sk->sk_user_frags); for (k = 0; k < netmem_num; k++) WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); From 102d1404c385611c574498b1e0d1f3762e253359 Mon Sep 17 00:00:00 2001 From: Mina Almasry Date: Thu, 7 Nov 2024 21:03:31 +0000 Subject: [PATCH 08/38] net: clarify SO_DEVMEM_DONTNEED behavior in documentation Document new behavior when the number of frags passed is too big. Signed-off-by: Mina Almasry Link: https://patch.msgid.link/20241107210331.3044434-2-almasrymina@google.com Signed-off-by: Jakub Kicinski --- Documentation/networking/devmem.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/networking/devmem.rst b/Documentation/networking/devmem.rst index a55bf21f671c..d95363645331 100644 --- a/Documentation/networking/devmem.rst +++ b/Documentation/networking/devmem.rst @@ -225,6 +225,15 @@ The user must ensure the tokens are returned to the kernel in a timely manner. Failure to do so will exhaust the limited dmabuf that is bound to the RX queue and will lead to packet drops. +The user must pass no more than 128 tokens, with no more than 1024 total frags +among the token->token_count across all the tokens. If the user provides more +than 1024 frags, the kernel will free up to 1024 frags and return early. + +The kernel returns the number of actual frags freed. The number of frags freed +can be less than the tokens provided by the user in case of: + +(a) an internal kernel leak bug. +(b) the user passed more than 1024 frags. Implementation & Caveats ======================== From 581302298524e9d77c4c44ff5156a6cd112227ae Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 8 Nov 2024 11:58:16 +0100 Subject: [PATCH 09/38] mptcp: error out earlier on disconnect Eric reported a division by zero splat in the MPTCP protocol: Oops: divide error: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 1 UID: 0 PID: 6094 Comm: syz-executor317 Not tainted 6.12.0-rc5-syzkaller-00291-g05b92660cdfe #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:__tcp_select_window+0x5b4/0x1310 net/ipv4/tcp_output.c:3163 Code: f6 44 01 e3 89 df e8 9b 75 09 f8 44 39 f3 0f 8d 11 ff ff ff e8 0d 74 09 f8 45 89 f4 e9 04 ff ff ff e8 00 74 09 f8 44 89 f0 99 7c 24 14 41 29 d6 45 89 f4 e9 ec fe ff ff e8 e8 73 09 f8 48 89 RSP: 0018:ffffc900041f7930 EFLAGS: 00010293 RAX: 0000000000017e67 RBX: 0000000000017e67 RCX: ffffffff8983314b RDX: 0000000000000000 RSI: ffffffff898331b0 RDI: 0000000000000004 RBP: 00000000005d6000 R08: 0000000000000004 R09: 0000000000017e67 R10: 0000000000003e80 R11: 0000000000000000 R12: 0000000000003e80 R13: ffff888031d9b440 R14: 0000000000017e67 R15: 00000000002eb000 FS: 00007feb5d7f16c0(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007feb5d8adbb8 CR3: 0000000074e4c000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __tcp_cleanup_rbuf+0x3e7/0x4b0 net/ipv4/tcp.c:1493 mptcp_rcv_space_adjust net/mptcp/protocol.c:2085 [inline] mptcp_recvmsg+0x2156/0x2600 net/mptcp/protocol.c:2289 inet_recvmsg+0x469/0x6a0 net/ipv4/af_inet.c:885 sock_recvmsg_nosec net/socket.c:1051 [inline] sock_recvmsg+0x1b2/0x250 net/socket.c:1073 __sys_recvfrom+0x1a5/0x2e0 net/socket.c:2265 __do_sys_recvfrom net/socket.c:2283 [inline] __se_sys_recvfrom net/socket.c:2279 [inline] __x64_sys_recvfrom+0xe0/0x1c0 net/socket.c:2279 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7feb5d857559 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 51 18 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007feb5d7f1208 EFLAGS: 00000246 ORIG_RAX: 000000000000002d RAX: ffffffffffffffda RBX: 00007feb5d8e1318 RCX: 00007feb5d857559 RDX: 000000800000000e RSI: 0000000000000000 RDI: 0000000000000003 RBP: 00007feb5d8e1310 R08: 0000000000000000 R09: ffffffff81000000 R10: 0000000000000100 R11: 0000000000000246 R12: 00007feb5d8e131c R13: 00007feb5d8ae074 R14: 000000800000000e R15: 00000000fffffdef and provided a nice reproducer. The root cause is the current bad handling of racing disconnect. After the blamed commit below, sk_wait_data() can return (with error) with the underlying socket disconnected and a zero rcv_mss. Catch the error and return without performing any additional operations on the current socket. Reported-by: Eric Dumazet Fixes: 419ce133ab92 ("tcp: allow again tcp_disconnect() when threads are waiting") Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/8c82ecf71662ecbc47bf390f9905de70884c9f2d.1731060874.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index d263091659e0..95a5a3da3944 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2205,7 +2205,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, cmsg_flags = MPTCP_CMSG_INQ; while (copied < len) { - int bytes_read; + int err, bytes_read; bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags); if (unlikely(bytes_read < 0)) { @@ -2267,9 +2267,16 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } pr_debug("block timeout %ld\n", timeo); - sk_wait_data(sk, &timeo, NULL); + mptcp_rcv_space_adjust(msk, copied); + err = sk_wait_data(sk, &timeo, NULL); + if (err < 0) { + err = copied ? : err; + goto out_err; + } } + mptcp_rcv_space_adjust(msk, copied); + out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) @@ -2285,8 +2292,6 @@ out_err: pr_debug("msk=%p rx queue empty=%d:%d copied=%d\n", msk, skb_queue_empty_lockless(&sk->sk_receive_queue), skb_queue_empty(&msk->receive_queue), copied); - if (!(flags & MSG_PEEK)) - mptcp_rcv_space_adjust(msk, copied); release_sock(sk); return copied; From ce7356ae35943cc6494cc692e62d51a734062b7d Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 8 Nov 2024 11:58:17 +0100 Subject: [PATCH 10/38] mptcp: cope racing subflow creation in mptcp_rcv_space_adjust Additional active subflows - i.e. created by the in kernel path manager - are included into the subflow list before starting the 3whs. A racing recvmsg() spooling data received on an already established subflow would unconditionally call tcp_cleanup_rbuf() on all the current subflows, potentially hitting a divide by zero error on the newly created ones. Explicitly check that the subflow is in a suitable state before invoking tcp_cleanup_rbuf(). Fixes: c76c6956566f ("mptcp: call tcp_cleanup_rbuf on subflows") Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/02374660836e1b52afc91966b7535c8c5f7bafb0.1731060874.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 95a5a3da3944..48d480982b78 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2082,7 +2082,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) slow = lock_sock_fast(ssk); WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp); - tcp_cleanup_rbuf(ssk, 1); + if (tcp_can_send_ack(ssk)) + tcp_cleanup_rbuf(ssk, 1); unlock_sock_fast(ssk, slow); } } From 1220965d619178713844ef365beb9d9b88267e13 Mon Sep 17 00:00:00 2001 From: Chiara Meiohas Date: Thu, 7 Nov 2024 20:35:21 +0200 Subject: [PATCH 11/38] net/mlx5: E-switch, unload IB representors when unloading ETH representors IB representors depend on ETH representors, so the IB representors should not exist without the ETH ones. When unloading the ETH representors, the corresponding IB representors should be also unloaded. The commit 8d159eb2117b ("RDMA/mlx5: Use IB set_netdev and get_netdev functions") introduced the use of the ib_device_set_netdev API in IB repsresentors. ib_device_set_netdev() increments the refcount of the representor's netdev when loading an IB representor and decrements it when unloading. Without the unloading of the IB representor, the refcount of the representor's netdev remains greater than 0, preventing it from being unregistered. The patch uncovered an underlying bug where the eth representor is unloaded, without unloading the IB representor. This issue happened when using multiport E-switch and rebooting, causing the shutdown to hang when unloading the ETH representor because the refcount of the representor's netdevice was greater than 0. Call trace: unregister_netdevice: waiting for eth3 to become free. Usage count = 2 ref_tracker: eth%d@00000000661d60f7 has 1/1 users at ib_device_set_netdev+0x160/0x2d0 [ib_core] mlx5_ib_vport_rep_load+0x104/0x3f0 [mlx5_ib] mlx5_eswitch_reload_ib_reps+0xfc/0x110 [mlx5_core] mlx5_mpesw_work+0x236/0x330 [mlx5_core] process_one_work+0x169/0x320 worker_thread+0x288/0x3a0 kthread+0xb8/0xe0 ret_from_fork+0x2d/0x50 ret_from_fork_asm+0x11/0x20 Fixes: 8d159eb2117b ("RDMA/mlx5: Use IB set_netdev and get_netdev functions") Signed-off-by: Chiara Meiohas Reviewed-by: Mark Bloch Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241107183527.676877-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index f24f91d213f2..8cf61ae8b89d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -2527,8 +2527,11 @@ static void __esw_offloads_unload_rep(struct mlx5_eswitch *esw, struct mlx5_eswitch_rep *rep, u8 rep_type) { if (atomic_cmpxchg(&rep->rep_data[rep_type].state, - REP_LOADED, REP_REGISTERED) == REP_LOADED) + REP_LOADED, REP_REGISTERED) == REP_LOADED) { + if (rep_type == REP_ETH) + __esw_offloads_unload_rep(esw, rep, REP_IB); esw->offloads.rep_ops[rep_type]->unload(rep); + } } static void __unload_reps_all_vport(struct mlx5_eswitch *esw, u8 rep_type) From d0989c9d2b3a89ae5e4ad45fe6d7bbe449fc49fe Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 7 Nov 2024 20:35:22 +0200 Subject: [PATCH 12/38] net/mlx5: Fix msix vectors to respect platform limit The number of PCI vectors allocated by the platform (which may be fewer than requested) is currently not honored when creating the SF pool; only the PCI MSI-X capability is considered. As a result, when a platform allocates fewer vectors (in non-dynamic mode) than requested, the PF and SF pools end up with an invalid vector range. This causes incorrect SF vector accounting, which leads to the following call trace when an invalid IRQ vector is allocated. This issue is resolved by ensuring that the platform's vector limit is respected for both the SF and PF pools. Workqueue: mlx5_vhca_event0 mlx5_sf_dev_add_active_work [mlx5_core] RIP: 0010:pci_irq_vector+0x23/0x80 RSP: 0018:ffffabd5cebd7248 EFLAGS: 00010246 RAX: ffff980880e7f308 RBX: ffff9808932fb880 RCX: 0000000000000001 RDX: 00000000000001ff RSI: 0000000000000200 RDI: ffff980880e7f308 RBP: 0000000000000200 R08: 0000000000000010 R09: ffff97a9116f0860 R10: 0000000000000002 R11: 0000000000000228 R12: ffff980897cd0160 R13: 0000000000000000 R14: ffff97a920fec0c0 R15: ffffabd5cebd72d0 FS: 0000000000000000(0000) GS:ffff97c7ff9c0000(0000) knlGS:0000000000000000 ? rescuer_thread+0x350/0x350 kthread+0x11b/0x140 ? __kthread_bind_mask+0x60/0x60 ret_from_fork+0x22/0x30 mlx5_core 0000:a1:00.0: mlx5_irq_alloc:321:(pid 6781): Failed to request irq. err = -22 mlx5_core 0000:a1:00.0: mlx5_irq_alloc:321:(pid 6781): Failed to request irq. err = -22 mlx5_core.sf mlx5_core.sf.6: MLX5E: StrdRq(1) RqSz(8) StrdSz(2048) RxCqeCmprss(0 enhanced) mlx5_core.sf mlx5_core.sf.7: firmware version: 32.43.356 mlx5_core.sf mlx5_core.sf.6 enpa1s0f0s4: renamed from eth0 mlx5_core.sf mlx5_core.sf.7: Rate limit: 127 rates are supported, range: 0Mbps to 195312Mbps mlx5_core 0000:a1:00.0: mlx5_irq_alloc:321:(pid 6781): Failed to request irq. err = -22 mlx5_core 0000:a1:00.0: mlx5_irq_alloc:321:(pid 6781): Failed to request irq. err = -22 mlx5_core 0000:a1:00.0: mlx5_irq_alloc:321:(pid 6781): Failed to request irq. err = -22 Fixes: 3354822cde5a ("net/mlx5: Use dynamic msix vectors allocation") Signed-off-by: Parav Pandit Signed-off-by: Amir Tzin Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241107183527.676877-3-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/pci_irq.c | 32 ++++++++++++++++--- 1 file changed, 27 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 81a9232a03e1..7db9cab9bedf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -593,9 +593,11 @@ static void irq_pool_free(struct mlx5_irq_pool *pool) kvfree(pool); } -static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pcif_vec) +static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pcif_vec, + bool dynamic_vec) { struct mlx5_irq_table *table = dev->priv.irq_table; + int sf_vec_available = sf_vec; int num_sf_ctrl; int err; @@ -616,6 +618,13 @@ static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pcif_vec) num_sf_ctrl = DIV_ROUND_UP(mlx5_sf_max_functions(dev), MLX5_SFS_PER_CTRL_IRQ); num_sf_ctrl = min_t(int, MLX5_IRQ_CTRL_SF_MAX, num_sf_ctrl); + if (!dynamic_vec && (num_sf_ctrl + 1) > sf_vec_available) { + mlx5_core_dbg(dev, + "Not enough IRQs for SFs control and completion pool, required=%d avail=%d\n", + num_sf_ctrl + 1, sf_vec_available); + return 0; + } + table->sf_ctrl_pool = irq_pool_alloc(dev, pcif_vec, num_sf_ctrl, "mlx5_sf_ctrl", MLX5_EQ_SHARE_IRQ_MIN_CTRL, @@ -624,9 +633,11 @@ static int irq_pools_init(struct mlx5_core_dev *dev, int sf_vec, int pcif_vec) err = PTR_ERR(table->sf_ctrl_pool); goto err_pf; } - /* init sf_comp_pool */ + sf_vec_available -= num_sf_ctrl; + + /* init sf_comp_pool, remaining vectors are for the SF completions */ table->sf_comp_pool = irq_pool_alloc(dev, pcif_vec + num_sf_ctrl, - sf_vec - num_sf_ctrl, "mlx5_sf_comp", + sf_vec_available, "mlx5_sf_comp", MLX5_EQ_SHARE_IRQ_MIN_COMP, MLX5_EQ_SHARE_IRQ_MAX_COMP); if (IS_ERR(table->sf_comp_pool)) { @@ -715,6 +726,7 @@ int mlx5_irq_table_get_num_comp(struct mlx5_irq_table *table) int mlx5_irq_table_create(struct mlx5_core_dev *dev) { int num_eqs = mlx5_max_eq_cap_get(dev); + bool dynamic_vec; int total_vec; int pcif_vec; int req_vec; @@ -724,21 +736,31 @@ int mlx5_irq_table_create(struct mlx5_core_dev *dev) if (mlx5_core_is_sf(dev)) return 0; + /* PCI PF vectors usage is limited by online cpus, device EQs and + * PCI MSI-X capability. + */ pcif_vec = MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() + 1; pcif_vec = min_t(int, pcif_vec, num_eqs); + pcif_vec = min_t(int, pcif_vec, pci_msix_vec_count(dev->pdev)); total_vec = pcif_vec; if (mlx5_sf_max_functions(dev)) total_vec += MLX5_MAX_MSIX_PER_SF * mlx5_sf_max_functions(dev); total_vec = min_t(int, total_vec, pci_msix_vec_count(dev->pdev)); - pcif_vec = min_t(int, pcif_vec, pci_msix_vec_count(dev->pdev)); req_vec = pci_msix_can_alloc_dyn(dev->pdev) ? 1 : total_vec; n = pci_alloc_irq_vectors(dev->pdev, 1, req_vec, PCI_IRQ_MSIX); if (n < 0) return n; - err = irq_pools_init(dev, total_vec - pcif_vec, pcif_vec); + /* Further limit vectors of the pools based on platform for non dynamic case */ + dynamic_vec = pci_msix_can_alloc_dyn(dev->pdev); + if (!dynamic_vec) { + pcif_vec = min_t(int, n, pcif_vec); + total_vec = min_t(int, n, total_vec); + } + + err = irq_pools_init(dev, total_vec - pcif_vec, pcif_vec, dynamic_vec); if (err) pci_free_irq_vectors(dev->pdev); From 9ca314419930f9135727e39d77e66262d5f7bef6 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 7 Nov 2024 20:35:23 +0200 Subject: [PATCH 13/38] net/mlx5: fs, lock FTE when checking if active The referenced commits introduced a two-step process for deleting FTEs: - Lock the FTE, delete it from hardware, set the hardware deletion function to NULL and unlock the FTE. - Lock the parent flow group, delete the software copy of the FTE, and remove it from the xarray. However, this approach encounters a race condition if a rule with the same match value is added simultaneously. In this scenario, fs_core may set the hardware deletion function to NULL prematurely, causing a panic during subsequent rule deletions. To prevent this, ensure the active flag of the FTE is checked under a lock, which will prevent the fs_core layer from attaching a new steering rule to an FTE that is in the process of deletion. [ 438.967589] MOSHE: 2496 mlx5_del_flow_rules del_hw_func [ 438.968205] ------------[ cut here ]------------ [ 438.968654] refcount_t: decrement hit 0; leaking memory. [ 438.969249] WARNING: CPU: 0 PID: 8957 at lib/refcount.c:31 refcount_warn_saturate+0xfb/0x110 [ 438.970054] Modules linked in: act_mirred cls_flower act_gact sch_ingress openvswitch nsh mlx5_vdpa vringh vhost_iotlb vdpa mlx5_ib mlx5_core xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm ib_uverbs ib_core zram zsmalloc fuse [last unloaded: cls_flower] [ 438.973288] CPU: 0 UID: 0 PID: 8957 Comm: tc Not tainted 6.12.0-rc1+ #8 [ 438.973888] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [ 438.974874] RIP: 0010:refcount_warn_saturate+0xfb/0x110 [ 438.975363] Code: 40 66 3b 82 c6 05 16 e9 4d 01 01 e8 1f 7c a0 ff 0f 0b c3 cc cc cc cc 48 c7 c7 10 66 3b 82 c6 05 fd e8 4d 01 01 e8 05 7c a0 ff <0f> 0b c3 cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 90 [ 438.976947] RSP: 0018:ffff888124a53610 EFLAGS: 00010286 [ 438.977446] RAX: 0000000000000000 RBX: ffff888119d56de0 RCX: 0000000000000000 [ 438.978090] RDX: ffff88852c828700 RSI: ffff88852c81b3c0 RDI: ffff88852c81b3c0 [ 438.978721] RBP: ffff888120fa0e88 R08: 0000000000000000 R09: ffff888124a534b0 [ 438.979353] R10: 0000000000000001 R11: 0000000000000001 R12: ffff888119d56de0 [ 438.979979] R13: ffff888120fa0ec0 R14: ffff888120fa0ee8 R15: ffff888119d56de0 [ 438.980607] FS: 00007fe6dcc0f800(0000) GS:ffff88852c800000(0000) knlGS:0000000000000000 [ 438.983984] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 438.984544] CR2: 00000000004275e0 CR3: 0000000186982001 CR4: 0000000000372eb0 [ 438.985205] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 438.985842] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 438.986507] Call Trace: [ 438.986799] [ 438.987070] ? __warn+0x7d/0x110 [ 438.987426] ? refcount_warn_saturate+0xfb/0x110 [ 438.987877] ? report_bug+0x17d/0x190 [ 438.988261] ? prb_read_valid+0x17/0x20 [ 438.988659] ? handle_bug+0x53/0x90 [ 438.989054] ? exc_invalid_op+0x14/0x70 [ 438.989458] ? asm_exc_invalid_op+0x16/0x20 [ 438.989883] ? refcount_warn_saturate+0xfb/0x110 [ 438.990348] mlx5_del_flow_rules+0x2f7/0x340 [mlx5_core] [ 438.990932] __mlx5_eswitch_del_rule+0x49/0x170 [mlx5_core] [ 438.991519] ? mlx5_lag_is_sriov+0x3c/0x50 [mlx5_core] [ 438.992054] ? xas_load+0x9/0xb0 [ 438.992407] mlx5e_tc_rule_unoffload+0x45/0xe0 [mlx5_core] [ 438.993037] mlx5e_tc_del_fdb_flow+0x2a6/0x2e0 [mlx5_core] [ 438.993623] mlx5e_flow_put+0x29/0x60 [mlx5_core] [ 438.994161] mlx5e_delete_flower+0x261/0x390 [mlx5_core] [ 438.994728] tc_setup_cb_destroy+0xb9/0x190 [ 438.995150] fl_hw_destroy_filter+0x94/0xc0 [cls_flower] [ 438.995650] fl_change+0x11a4/0x13c0 [cls_flower] [ 438.996105] tc_new_tfilter+0x347/0xbc0 [ 438.996503] ? ___slab_alloc+0x70/0x8c0 [ 438.996929] rtnetlink_rcv_msg+0xf9/0x3e0 [ 438.997339] ? __netlink_sendskb+0x4c/0x70 [ 438.997751] ? netlink_unicast+0x286/0x2d0 [ 438.998171] ? __pfx_rtnetlink_rcv_msg+0x10/0x10 [ 438.998625] netlink_rcv_skb+0x54/0x100 [ 438.999020] netlink_unicast+0x203/0x2d0 [ 438.999421] netlink_sendmsg+0x1e4/0x420 [ 438.999820] __sock_sendmsg+0xa1/0xb0 [ 439.000203] ____sys_sendmsg+0x207/0x2a0 [ 439.000600] ? copy_msghdr_from_user+0x6d/0xa0 [ 439.001072] ___sys_sendmsg+0x80/0xc0 [ 439.001459] ? ___sys_recvmsg+0x8b/0xc0 [ 439.001848] ? generic_update_time+0x4d/0x60 [ 439.002282] __sys_sendmsg+0x51/0x90 [ 439.002658] do_syscall_64+0x50/0x110 [ 439.003040] entry_SYSCALL_64_after_hwframe+0x76/0x7e Fixes: 718ce4d601db ("net/mlx5: Consolidate update FTE for all removal changes") Fixes: cefc23554fc2 ("net/mlx5: Fix FTE cleanup") Signed-off-by: Mark Bloch Reviewed-by: Maor Gottlieb Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241107183527.676877-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/fs_core.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 8505d5e241e1..6e4f8aaf8d2f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -2105,13 +2105,22 @@ lookup_fte_locked(struct mlx5_flow_group *g, fte_tmp = NULL; goto out; } - if (!fte_tmp->node.active) { - tree_put_node(&fte_tmp->node, false); - fte_tmp = NULL; - goto out; - } nested_down_write_ref_node(&fte_tmp->node, FS_LOCK_CHILD); + + if (!fte_tmp->node.active) { + up_write_ref_node(&fte_tmp->node, false); + + if (take_write) + up_write_ref_node(&g->node, false); + else + up_read_ref_node(&g->node); + + tree_put_node(&fte_tmp->node, false); + + return NULL; + } + out: if (take_write) up_write_ref_node(&g->node, false); From dd6e972cc5890d91d6749bb48e3912721c4e4b25 Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Thu, 7 Nov 2024 20:35:24 +0200 Subject: [PATCH 14/38] net/mlx5e: kTLS, Fix incorrect page refcounting The kTLS tx handling code is using a mix of get_page() and page_ref_inc() APIs to increment the page reference. But on the release path (mlx5e_ktls_tx_handle_resync_dump_comp()), only put_page() is used. This is an issue when using pages from large folios: the get_page() references are stored on the folio page while the page_ref_inc() references are stored directly in the given page. On release the folio page will be dereferenced too many times. This was found while doing kTLS testing with sendfile() + ZC when the served file was read from NFS on a kernel with NFS large folios support (commit 49b29a573da8 ("nfs: add support for large folios")). Fixes: 84d1bb2b139e ("net/mlx5e: kTLS, Limit DUMP wqe size") Signed-off-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241107183527.676877-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- .../net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c index d61be26a4df1..3db31cc10719 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c @@ -660,7 +660,7 @@ tx_sync_info_get(struct mlx5e_ktls_offload_context_tx *priv_tx, while (remaining > 0) { skb_frag_t *frag = &record->frags[i]; - get_page(skb_frag_page(frag)); + page_ref_inc(skb_frag_page(frag)); remaining -= skb_frag_size(frag); info->frags[i++] = *frag; } @@ -763,7 +763,7 @@ void mlx5e_ktls_tx_handle_resync_dump_comp(struct mlx5e_txqsq *sq, stats = sq->stats; mlx5e_tx_dma_unmap(sq->pdev, dma); - put_page(wi->resync_dump_frag_page); + page_ref_dec(wi->resync_dump_frag_page); stats->tls_dump_packets++; stats->tls_dump_bytes += wi->num_bytes; } @@ -816,12 +816,12 @@ mlx5e_ktls_tx_handle_ooo(struct mlx5e_ktls_offload_context_tx *priv_tx, err_out: for (; i < info.nr_frags; i++) - /* The put_page() here undoes the page ref obtained in tx_sync_info_get(). + /* The page_ref_dec() here undoes the page ref obtained in tx_sync_info_get(). * Page refs obtained for the DUMP WQEs above (by page_ref_add) will be * released only upon their completions (or in mlx5e_free_txqsq_descs, * if channel closes). */ - put_page(skb_frag_page(&info.frags[i])); + page_ref_dec(skb_frag_page(&info.frags[i])); return MLX5E_KTLS_SYNC_FAIL; } From c079389878debf767dc4e52fe877b9117258dfe2 Mon Sep 17 00:00:00 2001 From: William Tu Date: Thu, 7 Nov 2024 20:35:25 +0200 Subject: [PATCH 15/38] net/mlx5e: clear xdp features on non-uplink representors Non-uplink representor port does not support XDP. The patch clears the xdp feature by checking the net_device_ops.ndo_bpf is set or not. Verify using the netlink tool: $ tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml --dump dev-get Representor netdev before the patch: {'ifindex': 8, 'xdp-features': {'basic', 'ndo-xmit', 'ndo-xmit-sg', 'redirect', 'rx-sg', 'xsk-zerocopy'}, 'xdp-rx-metadata-features': set(), 'xdp-zc-max-segs': 1, 'xsk-features': set()}, With the patch: {'ifindex': 8, 'xdp-features': set(), 'xdp-rx-metadata-features': set(), 'xsk-features': set()}, Fixes: 4d5ab0ad964d ("net/mlx5e: take into account device reconfiguration for xdp_features flag") Signed-off-by: William Tu Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241107183527.676877-6-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index e601324a690a..13a3fa8dc0cb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4267,7 +4267,8 @@ void mlx5e_set_xdp_feature(struct net_device *netdev) struct mlx5e_params *params = &priv->channels.params; xdp_features_t val; - if (params->packet_merge.type != MLX5E_PACKET_MERGE_NONE) { + if (!netdev->netdev_ops->ndo_bpf || + params->packet_merge.type != MLX5E_PACKET_MERGE_NONE) { xdp_clear_features_flag(netdev); return; } From e99c6873229fe0482e7ceb7d5600e32d623ed9d9 Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 7 Nov 2024 20:35:26 +0200 Subject: [PATCH 16/38] net/mlx5e: CT: Fix null-ptr-deref in add rule err flow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In error flow of mlx5_tc_ct_entry_add_rule(), in case ct_rule_add() callback returns error, zone_rule->attr is used uninitiated. Fix it to use attr which has the needed pointer value. Kernel log: BUG: kernel NULL pointer dereference, address: 0000000000000110 RIP: 0010:mlx5_tc_ct_entry_add_rule+0x2b1/0x2f0 [mlx5_core] … Call Trace: ? __die+0x20/0x70 ? page_fault_oops+0x150/0x3e0 ? exc_page_fault+0x74/0x140 ? asm_exc_page_fault+0x22/0x30 ? mlx5_tc_ct_entry_add_rule+0x2b1/0x2f0 [mlx5_core] ? mlx5_tc_ct_entry_add_rule+0x1d5/0x2f0 [mlx5_core] mlx5_tc_ct_block_flow_offload+0xc6a/0xf90 [mlx5_core] ? nf_flow_offload_tuple+0xd8/0x190 [nf_flow_table] nf_flow_offload_tuple+0xd8/0x190 [nf_flow_table] flow_offload_work_handler+0x142/0x320 [nf_flow_table] ? finish_task_switch.isra.0+0x15b/0x2b0 process_one_work+0x16c/0x320 worker_thread+0x28c/0x3a0 ? __pfx_worker_thread+0x10/0x10 kthread+0xb8/0xf0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2d/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Fixes: 7fac5c2eced3 ("net/mlx5: CT: Avoid reusing modify header context for natted entries") Signed-off-by: Moshe Shemesh Reviewed-by: Cosmin Ratiu Reviewed-by: Yevgeny Kliteynik Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241107183527.676877-7-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index dcfccaaa8d91..92d5cfec3dc0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -866,7 +866,7 @@ mlx5_tc_ct_entry_add_rule(struct mlx5_tc_ct_priv *ct_priv, return 0; err_rule: - mlx5_tc_ct_entry_destroy_mod_hdr(ct_priv, zone_rule->attr, zone_rule->mh); + mlx5_tc_ct_entry_destroy_mod_hdr(ct_priv, attr, zone_rule->mh); mlx5_put_label_mapping(ct_priv, attr->ct_attr.ct_labels_id); err_mod_hdr: kfree(attr); From d1ac33934a66e8d58a52668999bf9e8f59e56c81 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Thu, 7 Nov 2024 20:35:27 +0200 Subject: [PATCH 17/38] net/mlx5e: Disable loopback self-test on multi-PF netdev In Multi-PF (Socket Direct) configurations, when a loopback packet is sent through one of the secondary devices, it will always be received on the primary device. This causes the loopback layer to fail in identifying the loopback packet as the devices are different. To avoid false test failures, disable the loopback self-test in Multi-PF configurations. Fixes: ed29705e4ed1 ("net/mlx5: Enable SD feature") Signed-off-by: Carolina Jubran Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241107183527.676877-8-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c index 5bf8318cc48b..1d60465cc2ca 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c @@ -36,6 +36,7 @@ #include "en.h" #include "en/port.h" #include "eswitch.h" +#include "lib/mlx5.h" static int mlx5e_test_health_info(struct mlx5e_priv *priv) { @@ -247,6 +248,9 @@ static int mlx5e_cond_loopback(struct mlx5e_priv *priv) if (is_mdev_switchdev_mode(priv->mdev)) return -EOPNOTSUPP; + if (mlx5_get_sd(priv->mdev)) + return -EOPNOTSUPP; + return 0; } From d7b0ff5a866724c3ad21f2628c22a63336deec3f Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 7 Nov 2024 21:46:12 +0100 Subject: [PATCH 18/38] virtio/vsock: Fix accept_queue memory leak As the final stages of socket destruction may be delayed, it is possible that virtio_transport_recv_listen() will be called after the accept_queue has been flushed, but before the SOCK_DONE flag has been set. As a result, sockets enqueued after the flush would remain unremoved, leading to a memory leak. vsock_release __vsock_release lock virtio_transport_release virtio_transport_close schedule_delayed_work(close_work) sk_shutdown = SHUTDOWN_MASK (!) flush accept_queue release virtio_transport_recv_pkt vsock_find_bound_socket lock if flag(SOCK_DONE) return virtio_transport_recv_listen child = vsock_create_connected (!) vsock_enqueue_accept(child) release close_work lock virtio_transport_do_close set_flag(SOCK_DONE) virtio_transport_remove_sock vsock_remove_sock vsock_remove_bound release Introduce a sk_shutdown check to disallow vsock_enqueue_accept() during socket destruction. unreferenced object 0xffff888109e3f800 (size 2040): comm "kworker/5:2", pid 371, jiffies 4294940105 hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 28 00 0b 40 00 00 00 00 00 00 00 00 00 00 00 00 (..@............ backtrace (crc 9e5f4e84): [] kmem_cache_alloc_noprof+0x2c1/0x360 [] sk_prot_alloc+0x30/0x120 [] sk_alloc+0x2c/0x4b0 [] __vsock_create.constprop.0+0x2a/0x310 [] virtio_transport_recv_pkt+0x4dc/0x9a0 [] vsock_loopback_work+0xfd/0x140 [] process_one_work+0x20c/0x570 [] worker_thread+0x1bf/0x3a0 [] kthread+0xdd/0x110 [] ret_from_fork+0x2d/0x50 [] ret_from_fork_asm+0x1a/0x30 Fixes: 3fe356d58efa ("vsock/virtio: discard packets only when socket is really closed") Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Signed-off-by: Paolo Abeni --- net/vmw_vsock/virtio_transport_common.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index ccbd2bc0d210..cd075f608d4f 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1512,6 +1512,14 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb, return -ENOMEM; } + /* __vsock_release() might have already flushed accept_queue. + * Subsequent enqueues would lead to a memory leak. + */ + if (sk->sk_shutdown == SHUTDOWN_MASK) { + virtio_transport_reset_no_sock(t, skb); + return -ESHUTDOWN; + } + child = vsock_create_connected(sk); if (!child) { virtio_transport_reset_no_sock(t, skb); From fbf7085b3ad1c7cc0677834c90f985f1b4f77a33 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 7 Nov 2024 21:46:13 +0100 Subject: [PATCH 19/38] vsock: Fix sk_error_queue memory leak Kernel queues MSG_ZEROCOPY completion notifications on the error queue. Where they remain, until explicitly recv()ed. To prevent memory leaks, clean up the queue when the socket is destroyed. unreferenced object 0xffff8881028beb00 (size 224): comm "vsock_test", pid 1218, jiffies 4294694897 hex dump (first 32 bytes): 90 b0 21 17 81 88 ff ff 90 b0 21 17 81 88 ff ff ..!.......!..... 00 00 00 00 00 00 00 00 00 b0 21 17 81 88 ff ff ..........!..... backtrace (crc 6c7031ca): [] kmem_cache_alloc_node_noprof+0x2f7/0x370 [] __alloc_skb+0x132/0x180 [] sock_omalloc+0x4b/0x80 [] msg_zerocopy_realloc+0x9e/0x240 [] virtio_transport_send_pkt_info+0x412/0x4c0 [] virtio_transport_stream_enqueue+0x43/0x50 [] vsock_connectible_sendmsg+0x373/0x450 [] ____sys_sendmsg+0x365/0x3a0 [] ___sys_sendmsg+0x84/0xd0 [] __sys_sendmsg+0x47/0x80 [] do_syscall_64+0x93/0x180 [] entry_SYSCALL_64_after_hwframe+0x76/0x7e Fixes: 581512a6dc93 ("vsock/virtio: MSG_ZEROCOPY flag support") Signed-off-by: Michal Luczaj Reviewed-by: Stefano Garzarella Acked-by: Arseniy Krasnov Signed-off-by: Paolo Abeni --- net/vmw_vsock/af_vsock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index 35681adedd9a..dfd29160fe11 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -836,6 +836,9 @@ static void vsock_sk_destruct(struct sock *sk) { struct vsock_sock *vsk = vsock_sk(sk); + /* Flush MSG_ZEROCOPY leftovers. */ + __skb_queue_purge(&sk->sk_error_queue); + vsock_deassign_transport(vsk); /* When clearing these addresses, there's no need to set the family and From 60cf6206a1f513512f5d73fa4d3dbbcad2e7dcd6 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 7 Nov 2024 21:46:14 +0100 Subject: [PATCH 20/38] virtio/vsock: Improve MSG_ZEROCOPY error handling Add a missing kfree_skb() to prevent memory leaks. Fixes: 581512a6dc93 ("vsock/virtio: MSG_ZEROCOPY flag support") Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Acked-by: Arseniy Krasnov Signed-off-by: Paolo Abeni --- net/vmw_vsock/virtio_transport_common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index cd075f608d4f..e2e6a30b759b 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -400,6 +400,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, if (virtio_transport_init_zcopy_skb(vsk, skb, info->msg, can_zcopy)) { + kfree_skb(skb); ret = -ENOMEM; break; } From 7967dc8f797f454d4f4acec15c7df0cdf4801617 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 8 Nov 2024 11:19:54 -0500 Subject: [PATCH 21/38] Bluetooth: hci_core: Fix calling mgmt_device_connected Since 61a939c68ee0 ("Bluetooth: Queue incoming ACL data until BT_CONNECTED state is reached") there is no long the need to call mgmt_device_connected as ACL data will be queued until BT_CONNECTED state. Link: https://bugzilla.kernel.org/show_bug.cgi?id=219458 Link: https://github.com/bluez/bluez/issues/1014 Fixes: 333b4fd11e89 ("Bluetooth: L2CAP: Fix uaf in l2cap_connect") Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 96d097b21d13..0ac354db8177 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3788,8 +3788,6 @@ static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_lock(hdev); conn = hci_conn_hash_lookup_handle(hdev, handle); - if (conn && hci_dev_test_flag(hdev, HCI_MGMT)) - mgmt_device_connected(hdev, conn, NULL, 0); hci_dev_unlock(hdev); if (conn) { From d5359a7f583ab9b7706915213b54deac065bcb81 Mon Sep 17 00:00:00 2001 From: Kiran K Date: Tue, 22 Oct 2024 14:41:34 +0530 Subject: [PATCH 22/38] Bluetooth: btintel: Direct exception event to bluetooth stack Have exception event part of HCI traces which helps for debug. snoop traces: > HCI Event: Vendor (0xff) plen 79 Vendor Prefix (0x8780) Intel Extended Telemetry (0x03) Unknown extended telemetry event type (0xde) 01 01 de Unknown extended subevent 0x07 01 01 de 07 01 de 06 1c ef be ad de ef be ad de ef be ad de ef be ad de ef be ad de ef be ad de ef be ad de 05 14 ef be ad de ef be ad de ef be ad de ef be ad de ef be ad de 43 10 ef be ad de ef be ad de ef be ad de ef be ad de Fixes: af395330abed ("Bluetooth: btintel: Add Intel devcoredump support") Signed-off-by: Kiran K Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btintel.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/bluetooth/btintel.c b/drivers/bluetooth/btintel.c index 438b92967bc3..30a32ebbcc68 100644 --- a/drivers/bluetooth/btintel.c +++ b/drivers/bluetooth/btintel.c @@ -3288,13 +3288,12 @@ static int btintel_diagnostics(struct hci_dev *hdev, struct sk_buff *skb) case INTEL_TLV_TEST_EXCEPTION: /* Generate devcoredump from exception */ if (!hci_devcd_init(hdev, skb->len)) { - hci_devcd_append(hdev, skb); + hci_devcd_append(hdev, skb_clone(skb, GFP_ATOMIC)); hci_devcd_complete(hdev); } else { bt_dev_err(hdev, "Failed to generate devcoredump"); - kfree_skb(skb); } - return 0; + break; default: bt_dev_err(hdev, "Invalid exception type %02X", tlv->val[0]); } From 50d325bb05cef24a2105e40e7cace5e2b237236d Mon Sep 17 00:00:00 2001 From: Wander Lairson Costa Date: Wed, 6 Nov 2024 08:14:26 -0300 Subject: [PATCH 23/38] Revert "igb: Disable threaded IRQ for igb_msix_other" This reverts commit 338c4d3902feb5be49bfda530a72c7ab860e2c9f. Sebastian noticed the ISR indirectly acquires spin_locks, which are sleeping locks under PREEMPT_RT, which leads to kernel splats. Fixes: 338c4d3902feb ("igb: Disable threaded IRQ for igb_msix_other") Reported-by: Sebastian Andrzej Siewior Signed-off-by: Wander Lairson Costa Reviewed-by: Sebastian Andrzej Siewior Acked-by: Przemek Kitszel Link: https://patch.msgid.link/20241106111427.7272-1-wander@redhat.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/igb/igb_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index b83df5f94b1f..f1d088168723 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -907,7 +907,7 @@ static int igb_request_msix(struct igb_adapter *adapter) int i, err = 0, vector = 0, free_vector = 0; err = request_irq(adapter->msix_entries[vector].vector, - igb_msix_other, IRQF_NO_THREAD, netdev->name, adapter); + igb_msix_other, 0, netdev->name, adapter); if (err) goto err_out; From 2b99b2532593b5a4c7dc6bff2486e98d211a8596 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 11 Nov 2024 11:03:21 +0100 Subject: [PATCH 24/38] MAINTAINERS: Re-add cancelled Renesas driver sections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Removing full driver sections also removed mailing list entries, causing submitters of future patches to forget CCing these mailing lists. Hence re-add the sections for the Renesas Ethernet AVB, R-Car SATA, and SuperH Ethernet drivers. Add people who volunteered to maintain these drivers (thanks a lot!), and mark all of them as supported. Fixes: 6e90b675cf942e50 ("MAINTAINERS: Remove some entries due to various compliance requirements.") Signed-off-by: Geert Uytterhoeven Acked-by: Greg Kroah-Hartman Reviewed-by: Simon Horman Acked-by: Niklas Cassel Acked-by: Niklas Söderlund Reviewed-by: Paul Barker Link: https://patch.msgid.link/4b2105332edca277f07ffa195796975e9ddce994.1731319098.git.geert+renesas@glider.be Signed-off-by: Jakub Kicinski --- MAINTAINERS | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 1d30b808d221..2bc7cc4769f5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19578,6 +19578,17 @@ S: Supported F: Documentation/devicetree/bindings/i2c/renesas,iic-emev2.yaml F: drivers/i2c/busses/i2c-emev2.c +RENESAS ETHERNET AVB DRIVER +M: Paul Barker +M: Niklas Söderlund +L: netdev@vger.kernel.org +L: linux-renesas-soc@vger.kernel.org +S: Supported +F: Documentation/devicetree/bindings/net/renesas,etheravb.yaml +F: drivers/net/ethernet/renesas/Kconfig +F: drivers/net/ethernet/renesas/Makefile +F: drivers/net/ethernet/renesas/ravb* + RENESAS ETHERNET SWITCH DRIVER R: Yoshihiro Shimoda L: netdev@vger.kernel.org @@ -19627,6 +19638,14 @@ F: Documentation/devicetree/bindings/i2c/renesas,rmobile-iic.yaml F: drivers/i2c/busses/i2c-rcar.c F: drivers/i2c/busses/i2c-sh_mobile.c +RENESAS R-CAR SATA DRIVER +M: Geert Uytterhoeven +L: linux-ide@vger.kernel.org +L: linux-renesas-soc@vger.kernel.org +S: Supported +F: Documentation/devicetree/bindings/ata/renesas,rcar-sata.yaml +F: drivers/ata/sata_rcar.c + RENESAS R-CAR THERMAL DRIVERS M: Niklas Söderlund L: linux-renesas-soc@vger.kernel.org @@ -19702,6 +19721,17 @@ S: Supported F: Documentation/devicetree/bindings/i2c/renesas,rzv2m.yaml F: drivers/i2c/busses/i2c-rzv2m.c +RENESAS SUPERH ETHERNET DRIVER +M: Niklas Söderlund +L: netdev@vger.kernel.org +L: linux-renesas-soc@vger.kernel.org +S: Supported +F: Documentation/devicetree/bindings/net/renesas,ether.yaml +F: drivers/net/ethernet/renesas/Kconfig +F: drivers/net/ethernet/renesas/Makefile +F: drivers/net/ethernet/renesas/sh_eth* +F: include/linux/sh_eth.h + RENESAS USB PHY DRIVER M: Yoshihiro Shimoda L: linux-renesas-soc@vger.kernel.org From 73af53d82076bbe184d9ece9e14b0dc8599e6055 Mon Sep 17 00:00:00 2001 From: Alexandre Ferrieux Date: Sun, 10 Nov 2024 18:28:36 +0100 Subject: [PATCH 25/38] net: sched: cls_u32: Fix u32's systematic failure to free IDR entries for hnodes. To generate hnode handles (in gen_new_htid()), u32 uses IDR and encodes the returned small integer into a structured 32-bit word. Unfortunately, at disposal time, the needed decoding is not done. As a result, idr_remove() fails, and the IDR fills up. Since its size is 2048, the following script ends up with "Filter already exists": tc filter add dev myve $FILTER1 tc filter add dev myve $FILTER2 for i in {1..2048} do echo $i tc filter del dev myve $FILTER2 tc filter add dev myve $FILTER2 done This patch adds the missing decoding logic for handles that deserve it. Fixes: e7614370d6f0 ("net_sched: use idr to allocate u32 filter handles") Reviewed-by: Eric Dumazet Acked-by: Jamal Hadi Salim Signed-off-by: Alexandre Ferrieux Tested-by: Victor Nogueira Link: https://patch.msgid.link/20241110172836.331319-1-alexandre.ferrieux@orange.com Signed-off-by: Jakub Kicinski --- net/sched/cls_u32.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 9412d88a99bc..d3a03c57545b 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -92,6 +92,16 @@ struct tc_u_common { long knodes; }; +static u32 handle2id(u32 h) +{ + return ((h & 0x80000000) ? ((h >> 20) & 0x7FF) : h); +} + +static u32 id2handle(u32 id) +{ + return (id | 0x800U) << 20; +} + static inline unsigned int u32_hash_fold(__be32 key, const struct tc_u32_sel *sel, u8 fshift) @@ -310,7 +320,7 @@ static u32 gen_new_htid(struct tc_u_common *tp_c, struct tc_u_hnode *ptr) int id = idr_alloc_cyclic(&tp_c->handle_idr, ptr, 1, 0x7FF, GFP_KERNEL); if (id < 0) return 0; - return (id | 0x800U) << 20; + return id2handle(id); } static struct hlist_head *tc_u_common_hash; @@ -360,7 +370,7 @@ static int u32_init(struct tcf_proto *tp) return -ENOBUFS; refcount_set(&root_ht->refcnt, 1); - root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000; + root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : id2handle(0); root_ht->prio = tp->prio; root_ht->is_root = true; idr_init(&root_ht->handle_idr); @@ -612,7 +622,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, if (phn == ht) { u32_clear_hw_hnode(tp, ht, extack); idr_destroy(&ht->handle_idr); - idr_remove(&tp_c->handle_idr, ht->handle); + idr_remove(&tp_c->handle_idr, handle2id(ht->handle)); RCU_INIT_POINTER(*hn, ht->next); kfree_rcu(ht, rcu); return 0; @@ -989,7 +999,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, err = u32_replace_hw_hnode(tp, ht, userflags, extack); if (err) { - idr_remove(&tp_c->handle_idr, handle); + idr_remove(&tp_c->handle_idr, handle2id(handle)); kfree(ht); return err; } From e0266319413d5d687ba7b6df7ca99e4b9724a4f2 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 12 Nov 2024 20:18:33 +0100 Subject: [PATCH 26/38] mptcp: update local address flags when setting it Just like in-kernel pm, when userspace pm does set_flags, it needs to send out MP_PRIO signal, and also modify the flags of the corresponding address entry in the local address list. This patch implements the missing logic. Traverse all address entries on userspace_pm_local_addr_list to find the local address entry, if bkup is true, set the flags of this entry with FLAG_BACKUP, otherwise, clear FLAG_BACKUP. Fixes: 892f396c8e68 ("mptcp: netlink: issue MP_PRIO signals from userspace PMs") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241112-net-mptcp-misc-6-12-pm-v1-1-b835580cefa8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_userspace.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 56dfea9862b7..3f888bfe1462 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -560,6 +560,7 @@ int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info) struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; struct net *net = sock_net(skb->sk); + struct mptcp_pm_addr_entry *entry; struct mptcp_sock *msk; int ret = -EINVAL; struct sock *sk; @@ -601,6 +602,17 @@ int mptcp_userspace_pm_set_flags(struct sk_buff *skb, struct genl_info *info) if (loc.flags & MPTCP_PM_ADDR_FLAG_BACKUP) bkup = 1; + spin_lock_bh(&msk->pm.lock); + list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + if (mptcp_addresses_equal(&entry->addr, &loc.addr, false)) { + if (bkup) + entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; + else + entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; + } + } + spin_unlock_bh(&msk->pm.lock); + lock_sock(sk); ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc.addr, &rem.addr, bkup); release_sock(sk); From f642c5c4d528d11bd78b6c6f84f541cd3c0bea86 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 12 Nov 2024 20:18:34 +0100 Subject: [PATCH 27/38] mptcp: hold pm lock when deleting entry When traversing userspace_pm_local_addr_list and deleting an entry from it in mptcp_pm_nl_remove_doit(), msk->pm.lock should be held. This patch holds this lock before mptcp_userspace_pm_lookup_addr_by_id() and releases it after list_move() in mptcp_pm_nl_remove_doit(). Fixes: d9a4594edabf ("mptcp: netlink: Add MPTCP_PM_CMD_REMOVE") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241112-net-mptcp-misc-6-12-pm-v1-2-b835580cefa8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_userspace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 3f888bfe1462..e35178f5205f 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -308,14 +308,17 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) lock_sock(sk); + spin_lock_bh(&msk->pm.lock); match = mptcp_userspace_pm_lookup_addr_by_id(msk, id_val); if (!match) { GENL_SET_ERR_MSG(info, "address with specified id not found"); + spin_unlock_bh(&msk->pm.lock); release_sock(sk); goto out; } list_move(&match->list, &free_list); + spin_unlock_bh(&msk->pm.lock); mptcp_pm_remove_addrs(msk, &free_list); From db3eab8110bc0520416101b6a5b52f44a43fb4cf Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 12 Nov 2024 20:18:35 +0100 Subject: [PATCH 28/38] mptcp: pm: use _rcu variant under rcu_read_lock In mptcp_pm_create_subflow_or_signal_addr(), rcu_read_(un)lock() are used as expected to iterate over the list of local addresses, but list_for_each_entry() was used instead of list_for_each_entry_rcu() in __lookup_addr(). It is important to use this variant which adds the required READ_ONCE() (and diagnostic checks if enabled). Because __lookup_addr() is also used in mptcp_pm_nl_set_flags() where it is called under the pernet->lock and not rcu_read_lock(), an extra condition is then passed to help the diagnostic checks making sure either the associated spin lock or the RCU lock is held. Fixes: 86e39e04482b ("mptcp: keep track of local endpoint still available for each msk") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241112-net-mptcp-misc-6-12-pm-v1-3-b835580cefa8@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_netlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index db586a5b3866..45a2b5f05d38 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -524,7 +524,8 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) { struct mptcp_pm_addr_entry *entry; - list_for_each_entry(entry, &pernet->local_addr_list, list) { + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, + lockdep_is_held(&pernet->lock)) { if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) return entry; } From 671154f174e0e7f242507cd074497661deb41bfd Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Tue, 12 Nov 2024 16:20:00 +0000 Subject: [PATCH 29/38] net: phylink: ensure PHY momentary link-fails are handled Normally, phylib won't notify changes in quick succession. However, as a result of commit 3e43b903da04 ("net: phy: Immediately call adjust_link if only tx_lpi_enabled changes") this is no longer true - it is now possible that phy_link_down() and phy_link_up() will both complete before phylink's resolver has run, which means it'll miss that pl->phy_state.link momentarily became false. Rename "mac_link_dropped" to be more generic "link_failed" since it will cover more than the MAC/PCS end of the link failing, and arrange to set this in phylink_phy_change() if we notice that the PHY reports that the link is down. This will ensure that we capture an EEE reconfiguration event. Fixes: 3e43b903da04 ("net: phy: Immediately call adjust_link if only tx_lpi_enabled changes") Signed-off-by: Russell King (Oracle) Reviewed-by: Oleksij Rempel Link: https://patch.msgid.link/E1tAtcW-002RBS-LB@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phylink.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 4309317de3d1..3e9957b6aa14 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -78,7 +78,7 @@ struct phylink { unsigned int pcs_neg_mode; unsigned int pcs_state; - bool mac_link_dropped; + bool link_failed; bool using_mac_select_pcs; struct sfp_bus *sfp_bus; @@ -1475,9 +1475,9 @@ static void phylink_resolve(struct work_struct *w) cur_link_state = pl->old_link_state; if (pl->phylink_disable_state) { - pl->mac_link_dropped = false; + pl->link_failed = false; link_state.link = false; - } else if (pl->mac_link_dropped) { + } else if (pl->link_failed) { link_state.link = false; retrigger = true; } else { @@ -1572,7 +1572,7 @@ static void phylink_resolve(struct work_struct *w) phylink_link_up(pl, link_state); } if (!link_state.link && retrigger) { - pl->mac_link_dropped = false; + pl->link_failed = false; queue_work(system_power_efficient_wq, &pl->resolve); } mutex_unlock(&pl->state_mutex); @@ -1835,6 +1835,8 @@ static void phylink_phy_change(struct phy_device *phydev, bool up) pl->phy_state.pause |= MLO_PAUSE_RX; pl->phy_state.interface = phydev->interface; pl->phy_state.link = up; + if (!up) + pl->link_failed = true; mutex_unlock(&pl->state_mutex); phylink_run_resolve(pl); @@ -2158,7 +2160,7 @@ EXPORT_SYMBOL_GPL(phylink_disconnect_phy); static void phylink_link_changed(struct phylink *pl, bool up, const char *what) { if (!up) - pl->mac_link_dropped = true; + pl->link_failed = true; phylink_run_resolve(pl); phylink_dbg(pl, "%s link %s\n", what, up ? "up" : "down"); } @@ -2792,7 +2794,7 @@ int phylink_ethtool_set_pauseparam(struct phylink *pl, * link will cycle. */ if (manual_changed) { - pl->mac_link_dropped = true; + pl->link_failed = true; phylink_run_resolve(pl); } From 3342dc8b4623d835e7dd76a15cec2e5a94fe2f93 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Tue, 12 Nov 2024 11:03:47 +0800 Subject: [PATCH 30/38] samples: pktgen: correct dev to DEV In the pktgen_sample01_simple.sh script, the device variable is uppercase 'DEV' instead of lowercase 'dev'. Because of this typo, the script cannot enable UDP tx checksum. Fixes: 460a9aa23de6 ("samples: pktgen: add UDP tx checksum support") Signed-off-by: Wei Fang Reviewed-by: Simon Horman Acked-by: Jesper Dangaard Brouer Link: https://patch.msgid.link/20241112030347.1849335-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski --- samples/pktgen/pktgen_sample01_simple.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/pktgen/pktgen_sample01_simple.sh b/samples/pktgen/pktgen_sample01_simple.sh index cdb9f497f87d..66cb707479e6 100755 --- a/samples/pktgen/pktgen_sample01_simple.sh +++ b/samples/pktgen/pktgen_sample01_simple.sh @@ -76,7 +76,7 @@ if [ -n "$DST_PORT" ]; then pg_set $DEV "udp_dst_max $UDP_DST_MAX" fi -[ ! -z "$UDP_CSUM" ] && pg_set $dev "flag UDPCSUM" +[ ! -z "$UDP_CSUM" ] && pg_set $DEV "flag UDPCSUM" # Setup random UDP port src range pg_set $DEV "flag UDPSRC_RND" From e28acc9c1ccfcb24c08e020828f69d0a915b06ae Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 8 Nov 2024 06:08:36 -0800 Subject: [PATCH 31/38] ipmr: Fix access to mfc_cache_list without lock held Accessing `mr_table->mfc_cache_list` is protected by an RCU lock. In the following code flow, the RCU read lock is not held, causing the following error when `RCU_PROVE` is not held. The same problem might show up in the IPv6 code path. 6.12.0-rc5-kbuilder-01145-gbac17284bdcb #33 Tainted: G E N ----------------------------- net/ipv4/ipmr_base.c:313 RCU-list traversed in non-reader section!! rcu_scheduler_active = 2, debug_locks = 1 2 locks held by RetransmitAggre/3519: #0: ffff88816188c6c0 (nlk_cb_mutex-ROUTE){+.+.}-{3:3}, at: __netlink_dump_start+0x8a/0x290 #1: ffffffff83fcf7a8 (rtnl_mutex){+.+.}-{3:3}, at: rtnl_dumpit+0x6b/0x90 stack backtrace: lockdep_rcu_suspicious mr_table_dump ipmr_rtm_dumproute rtnl_dump_all rtnl_dumpit netlink_dump __netlink_dump_start rtnetlink_rcv_msg netlink_rcv_skb netlink_unicast netlink_sendmsg This is not a problem per see, since the RTNL lock is held here, so, it is safe to iterate in the list without the RCU read lock, as suggested by Eric. To alleviate the concern, modify the code to use list_for_each_entry_rcu() with the RTNL-held argument. The annotation will raise an error only if RTNL or RCU read lock are missing during iteration, signaling a legitimate problem, otherwise it will avoid this false positive. This will solve the IPv6 case as well, since ip6mr_rtm_dumproute() calls this function as well. Signed-off-by: Breno Leitao Reviewed-by: David Ahern Link: https://patch.msgid.link/20241108-ipmr_rcu-v2-1-c718998e209b@debian.org Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr_base.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ipmr_base.c b/net/ipv4/ipmr_base.c index 271dc03fc6db..f0af12a2f70b 100644 --- a/net/ipv4/ipmr_base.c +++ b/net/ipv4/ipmr_base.c @@ -310,7 +310,8 @@ int mr_table_dump(struct mr_table *mrt, struct sk_buff *skb, if (filter->filter_set) flags |= NLM_F_DUMP_FILTERED; - list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) { + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list, + lockdep_rtnl_is_held()) { if (e < s_e) goto next_entry; if (filter->dev && From a03b18a71c128846360cc81ac6fdb0e7d41597b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Sat, 9 Nov 2024 10:16:32 -0500 Subject: [PATCH 32/38] net: stmmac: dwmac-mediatek: Fix inverted handling of mediatek,mac-wol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The mediatek,mac-wol property is being handled backwards to what is described in the binding: it currently enables PHY WOL when the property is present and vice versa. Invert the driver logic so it matches the binding description. Fixes: fd1d62d80ebc ("net: stmmac: replace the use_phy_wol field with a flag") Signed-off-by: Nícolas F. R. A. Prado Link: https://patch.msgid.link/20241109-mediatek-mac-wol-noninverted-v2-1-0e264e213878@collabora.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c index 2a9132d6d743..001857c294fb 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-mediatek.c @@ -589,9 +589,9 @@ static int mediatek_dwmac_common_data(struct platform_device *pdev, plat->mac_interface = priv_plat->phy_mode; if (priv_plat->mac_wol) - plat->flags |= STMMAC_FLAG_USE_PHY_WOL; - else plat->flags &= ~STMMAC_FLAG_USE_PHY_WOL; + else + plat->flags |= STMMAC_FLAG_USE_PHY_WOL; plat->riwt_off = 1; plat->maxmtu = ETH_DATA_LEN; plat->host_dma_width = priv_plat->variant->dma_bit_mask; From eb94b7bb10109a14a5431a67e5d8e31cfa06b395 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 11 Nov 2024 00:17:34 +0100 Subject: [PATCH 33/38] net: Make copy_safe_from_sockptr() match documentation copy_safe_from_sockptr() return copy_from_sockptr() return copy_from_sockptr_offset() return copy_from_user() copy_from_user() does not return an error on fault. Instead, it returns a number of bytes that were not copied. Have it handled. Patch has a side effect: it un-breaks garbage input handling of nfc_llcp_setsockopt() and mISDN's data_sock_setsockopt(). Fixes: 6309863b31dd ("net: add copy_safe_from_sockptr() helper") Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20241111-sockptr-copy-ret-fix-v1-1-a520083a93fb@rbox.co Signed-off-by: Jakub Kicinski --- include/linux/sockptr.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index fc5a206c4043..195debe2b1db 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -77,7 +77,9 @@ static inline int copy_safe_from_sockptr(void *dst, size_t ksize, { if (optlen < ksize) return -EINVAL; - return copy_from_sockptr(dst, optval, ksize); + if (copy_from_sockptr(dst, optval, ksize)) + return -EFAULT; + return 0; } static inline int copy_struct_from_sockptr(void *dst, size_t ksize, From 5b366eae71937ae7412365340b431064625f9617 Mon Sep 17 00:00:00 2001 From: Vitalii Mordan Date: Fri, 8 Nov 2024 20:33:34 +0300 Subject: [PATCH 34/38] stmmac: dwmac-intel-plat: fix call balance of tx_clk handling routines If the clock dwmac->tx_clk was not enabled in intel_eth_plat_probe, it should not be disabled in any path. Conversely, if it was enabled in intel_eth_plat_probe, it must be disabled in all error paths to ensure proper cleanup. Found by Linux Verification Center (linuxtesting.org) with Klever. Fixes: 9efc9b2b04c7 ("net: stmmac: Add dwmac-intel-plat for GBE driver") Signed-off-by: Vitalii Mordan Link: https://patch.msgid.link/20241108173334.2973603-1-mordan@ispras.ru Signed-off-by: Jakub Kicinski --- .../stmicro/stmmac/dwmac-intel-plat.c | 25 +++++++++++++------ 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c index d68f0c4e7835..9739bc9867c5 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-intel-plat.c @@ -108,7 +108,12 @@ static int intel_eth_plat_probe(struct platform_device *pdev) if (IS_ERR(dwmac->tx_clk)) return PTR_ERR(dwmac->tx_clk); - clk_prepare_enable(dwmac->tx_clk); + ret = clk_prepare_enable(dwmac->tx_clk); + if (ret) { + dev_err(&pdev->dev, + "Failed to enable tx_clk\n"); + return ret; + } /* Check and configure TX clock rate */ rate = clk_get_rate(dwmac->tx_clk); @@ -119,7 +124,7 @@ static int intel_eth_plat_probe(struct platform_device *pdev) if (ret) { dev_err(&pdev->dev, "Failed to set tx_clk\n"); - return ret; + goto err_tx_clk_disable; } } } @@ -133,7 +138,7 @@ static int intel_eth_plat_probe(struct platform_device *pdev) if (ret) { dev_err(&pdev->dev, "Failed to set clk_ptp_ref\n"); - return ret; + goto err_tx_clk_disable; } } } @@ -149,12 +154,15 @@ static int intel_eth_plat_probe(struct platform_device *pdev) } ret = stmmac_dvr_probe(&pdev->dev, plat_dat, &stmmac_res); - if (ret) { - clk_disable_unprepare(dwmac->tx_clk); - return ret; - } + if (ret) + goto err_tx_clk_disable; return 0; + +err_tx_clk_disable: + if (dwmac->data->tx_clk_en) + clk_disable_unprepare(dwmac->tx_clk); + return ret; } static void intel_eth_plat_remove(struct platform_device *pdev) @@ -162,7 +170,8 @@ static void intel_eth_plat_remove(struct platform_device *pdev) struct intel_dwmac *dwmac = get_stmmac_bsp_priv(&pdev->dev); stmmac_pltfr_remove(pdev); - clk_disable_unprepare(dwmac->tx_clk); + if (dwmac->data->tx_clk_en) + clk_disable_unprepare(dwmac->tx_clk); } static struct platform_driver intel_eth_plat_driver = { From dc065076ee7768377d7c16af7d1b0767782d8c98 Mon Sep 17 00:00:00 2001 From: Meghana Malladi Date: Mon, 11 Nov 2024 15:28:42 +0530 Subject: [PATCH 35/38] net: ti: icssg-prueth: Fix 1 PPS sync The first PPS latch time needs to be calculated by the driver (in rounded off seconds) and configured as the start time offset for the cycle. After synchronizing two PTP clocks running as master/slave, missing this would cause master and slave to start immediately with some milliseconds drift which causes the PPS signal to never synchronize with the PTP master. Fixes: 186734c15886 ("net: ti: icssg-prueth: add packet timestamping and ptp support") Signed-off-by: Meghana Malladi Reviewed-by: Vadim Fedorenko Reviewed-by: MD Danish Anwar Link: https://patch.msgid.link/20241111095842.478833-1-m-malladi@ti.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ti/icssg/icssg_prueth.c | 13 +++++++++++-- drivers/net/ethernet/ti/icssg/icssg_prueth.h | 12 ++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.c b/drivers/net/ethernet/ti/icssg/icssg_prueth.c index 5c20ceb164df..fe2fd1bfc904 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.c +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -411,6 +412,8 @@ static int prueth_perout_enable(void *clockops_data, struct prueth_emac *emac = clockops_data; u32 reduction_factor = 0, offset = 0; struct timespec64 ts; + u64 current_cycle; + u64 start_offset; u64 ns_period; if (!on) @@ -449,8 +452,14 @@ static int prueth_perout_enable(void *clockops_data, writel(reduction_factor, emac->prueth->shram.va + TIMESYNC_FW_WC_SYNCOUT_REDUCTION_FACTOR_OFFSET); - writel(0, emac->prueth->shram.va + - TIMESYNC_FW_WC_SYNCOUT_START_TIME_CYCLECOUNT_OFFSET); + current_cycle = icssg_read_time(emac->prueth->shram.va + + TIMESYNC_FW_WC_CYCLECOUNT_OFFSET); + + /* Rounding of current_cycle count to next second */ + start_offset = roundup(current_cycle, MSEC_PER_SEC); + + hi_lo_writeq(start_offset, emac->prueth->shram.va + + TIMESYNC_FW_WC_SYNCOUT_START_TIME_CYCLECOUNT_OFFSET); return 0; } diff --git a/drivers/net/ethernet/ti/icssg/icssg_prueth.h b/drivers/net/ethernet/ti/icssg/icssg_prueth.h index 8722bb4a268a..f5c1d473e9f9 100644 --- a/drivers/net/ethernet/ti/icssg/icssg_prueth.h +++ b/drivers/net/ethernet/ti/icssg/icssg_prueth.h @@ -330,6 +330,18 @@ static inline int prueth_emac_slice(struct prueth_emac *emac) extern const struct ethtool_ops icssg_ethtool_ops; extern const struct dev_pm_ops prueth_dev_pm_ops; +static inline u64 icssg_read_time(const void __iomem *addr) +{ + u32 low, high; + + do { + high = readl(addr + 4); + low = readl(addr); + } while (high != readl(addr + 4)); + + return low + ((u64)high << 32); +} + /* Classifier helpers */ void icssg_class_set_mac_addr(struct regmap *miig_rt, int slice, u8 *mac); void icssg_class_set_host_mac_addr(struct regmap *miig_rt, const u8 *mac); From 8eb36164d1a6769a20ed43033510067ff3dab9ee Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 11 Nov 2024 10:16:49 +0000 Subject: [PATCH 36/38] bonding: add ns target multicast address to slave device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 4598380f9c54 ("bonding: fix ns validation on backup slaves") tried to resolve the issue where backup slaves couldn't be brought up when receiving IPv6 Neighbor Solicitation (NS) messages. However, this fix only worked for drivers that receive all multicast messages, such as the veth interface. For standard drivers, the NS multicast message is silently dropped because the slave device is not a member of the NS target multicast group. To address this, we need to make the slave device join the NS target multicast group, ensuring it can receive these IPv6 NS messages to validate the slave’s status properly. There are three policies before joining the multicast group: 1. All settings must be under active-backup mode (alb and tlb do not support arp_validate), with backup slaves and slaves supporting multicast. 2. We can add or remove multicast groups when arp_validate changes. 3. Other operations, such as enslaving, releasing, or setting NS targets, need to be guarded by arp_validate. Fixes: 4e24be018eb9 ("bonding: add new parameter ns_targets") Signed-off-by: Hangbin Liu Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_main.c | 16 +++++- drivers/net/bonding/bond_options.c | 82 +++++++++++++++++++++++++++++- include/net/bond_options.h | 2 + 3 files changed, 98 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index b1bffd8e9a95..15e0f14d0d49 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1008,6 +1008,8 @@ static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active, if (bond->dev->flags & IFF_UP) bond_hw_addr_flush(bond->dev, old_active->dev); + + bond_slave_ns_maddrs_add(bond, old_active); } if (new_active) { @@ -1024,6 +1026,8 @@ static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active, dev_mc_sync(new_active->dev, bond->dev); netif_addr_unlock_bh(bond->dev); } + + bond_slave_ns_maddrs_del(bond, new_active); } } @@ -2341,6 +2345,11 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, bond_compute_features(bond); bond_set_carrier(bond); + /* Needs to be called before bond_select_active_slave(), which will + * remove the maddrs if the slave is selected as active slave. + */ + bond_slave_ns_maddrs_add(bond, new_slave); + if (bond_uses_primary(bond)) { block_netpoll_tx(); bond_select_active_slave(bond); @@ -2350,7 +2359,6 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, if (bond_mode_can_use_xmit_hash(bond)) bond_update_slave_arr(bond, NULL); - if (!slave_dev->netdev_ops->ndo_bpf || !slave_dev->netdev_ops->ndo_xdp_xmit) { if (bond->xdp_prog) { @@ -2548,6 +2556,12 @@ static int __bond_release_one(struct net_device *bond_dev, if (oldcurrent == slave) bond_change_active_slave(bond, NULL); + /* Must be called after bond_change_active_slave () as the slave + * might change from an active slave to a backup slave. Then it is + * necessary to clear the maddrs on the backup slave. + */ + bond_slave_ns_maddrs_del(bond, slave); + if (bond_is_lb(bond)) { /* Must be called only after the slave has been * detached from the list and the curr_active_slave diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 95d59a18c022..327b6ecdc77e 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -15,6 +15,7 @@ #include #include +#include static int bond_option_active_slave_set(struct bonding *bond, const struct bond_opt_value *newval); @@ -1234,6 +1235,68 @@ static int bond_option_arp_ip_targets_set(struct bonding *bond, } #if IS_ENABLED(CONFIG_IPV6) +static bool slave_can_set_ns_maddr(const struct bonding *bond, struct slave *slave) +{ + return BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && + !bond_is_active_slave(slave) && + slave->dev->flags & IFF_MULTICAST; +} + +static void slave_set_ns_maddrs(struct bonding *bond, struct slave *slave, bool add) +{ + struct in6_addr *targets = bond->params.ns_targets; + char slot_maddr[MAX_ADDR_LEN]; + int i; + + if (!slave_can_set_ns_maddr(bond, slave)) + return; + + for (i = 0; i < BOND_MAX_NS_TARGETS; i++) { + if (ipv6_addr_any(&targets[i])) + break; + + if (!ndisc_mc_map(&targets[i], slot_maddr, slave->dev, 0)) { + if (add) + dev_mc_add(slave->dev, slot_maddr); + else + dev_mc_del(slave->dev, slot_maddr); + } + } +} + +void bond_slave_ns_maddrs_add(struct bonding *bond, struct slave *slave) +{ + if (!bond->params.arp_validate) + return; + slave_set_ns_maddrs(bond, slave, true); +} + +void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave) +{ + if (!bond->params.arp_validate) + return; + slave_set_ns_maddrs(bond, slave, false); +} + +static void slave_set_ns_maddr(struct bonding *bond, struct slave *slave, + struct in6_addr *target, struct in6_addr *slot) +{ + char target_maddr[MAX_ADDR_LEN], slot_maddr[MAX_ADDR_LEN]; + + if (!bond->params.arp_validate || !slave_can_set_ns_maddr(bond, slave)) + return; + + /* remove the previous maddr from slave */ + if (!ipv6_addr_any(slot) && + !ndisc_mc_map(slot, slot_maddr, slave->dev, 0)) + dev_mc_del(slave->dev, slot_maddr); + + /* add new maddr on slave if target is set */ + if (!ipv6_addr_any(target) && + !ndisc_mc_map(target, target_maddr, slave->dev, 0)) + dev_mc_add(slave->dev, target_maddr); +} + static void _bond_options_ns_ip6_target_set(struct bonding *bond, int slot, struct in6_addr *target, unsigned long last_rx) @@ -1243,8 +1306,10 @@ static void _bond_options_ns_ip6_target_set(struct bonding *bond, int slot, struct slave *slave; if (slot >= 0 && slot < BOND_MAX_NS_TARGETS) { - bond_for_each_slave(bond, slave, iter) + bond_for_each_slave(bond, slave, iter) { slave->target_last_arp_rx[slot] = last_rx; + slave_set_ns_maddr(bond, slave, target, &targets[slot]); + } targets[slot] = *target; } } @@ -1296,15 +1361,30 @@ static int bond_option_ns_ip6_targets_set(struct bonding *bond, { return -EPERM; } + +static void slave_set_ns_maddrs(struct bonding *bond, struct slave *slave, bool add) {} + +void bond_slave_ns_maddrs_add(struct bonding *bond, struct slave *slave) {} + +void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave) {} #endif static int bond_option_arp_validate_set(struct bonding *bond, const struct bond_opt_value *newval) { + bool changed = !!bond->params.arp_validate != !!newval->value; + struct list_head *iter; + struct slave *slave; + netdev_dbg(bond->dev, "Setting arp_validate to %s (%llu)\n", newval->string, newval->value); bond->params.arp_validate = newval->value; + if (changed) { + bond_for_each_slave(bond, slave, iter) + slave_set_ns_maddrs(bond, slave, !!bond->params.arp_validate); + } + return 0; } diff --git a/include/net/bond_options.h b/include/net/bond_options.h index 473a0147769e..18687ccf0638 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -161,5 +161,7 @@ void bond_option_arp_ip_targets_clear(struct bonding *bond); #if IS_ENABLED(CONFIG_IPV6) void bond_option_ns_ip6_targets_clear(struct bonding *bond); #endif +void bond_slave_ns_maddrs_add(struct bonding *bond, struct slave *slave); +void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave); #endif /* _NET_BOND_OPTIONS_H */ From 86fb6173d11e773a00a5b6d1b7bd17caff8692b8 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 11 Nov 2024 10:16:50 +0000 Subject: [PATCH 37/38] selftests: bonding: add ns multicast group testing Add a test to make sure the backup slaves join correct multicast group when arp_validate enabled and ns_ip6_target is set. Here is the result: TEST: arp_validate (active-backup ns_ip6_target arp_validate 0) [ OK ] TEST: arp_validate (join mcast group) [ OK ] TEST: arp_validate (active-backup ns_ip6_target arp_validate 1) [ OK ] TEST: arp_validate (join mcast group) [ OK ] TEST: arp_validate (active-backup ns_ip6_target arp_validate 2) [ OK ] TEST: arp_validate (join mcast group) [ OK ] TEST: arp_validate (active-backup ns_ip6_target arp_validate 3) [ OK ] TEST: arp_validate (join mcast group) [ OK ] TEST: arp_validate (active-backup ns_ip6_target arp_validate 4) [ OK ] TEST: arp_validate (join mcast group) [ OK ] TEST: arp_validate (active-backup ns_ip6_target arp_validate 5) [ OK ] TEST: arp_validate (join mcast group) [ OK ] TEST: arp_validate (active-backup ns_ip6_target arp_validate 6) [ OK ] TEST: arp_validate (join mcast group) [ OK ] Signed-off-by: Hangbin Liu Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni --- .../drivers/net/bonding/bond_options.sh | 54 ++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh index 41d0859feb7d..edc56e2cc606 100755 --- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh +++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh @@ -11,6 +11,8 @@ ALL_TESTS=" lib_dir=$(dirname "$0") source ${lib_dir}/bond_topo_3d1c.sh +c_maddr="33:33:00:00:00:10" +g_maddr="33:33:00:00:02:54" skip_prio() { @@ -240,6 +242,54 @@ arp_validate_test() done } +# Testing correct multicast groups are added to slaves for ns targets +arp_validate_mcast() +{ + RET=0 + local arp_valid=$(cmd_jq "ip -n ${s_ns} -j -d link show bond0" ".[].linkinfo.info_data.arp_validate") + local active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave") + + for i in $(seq 0 2); do + maddr_list=$(ip -n ${s_ns} maddr show dev eth${i}) + + # arp_valid == 0 or active_slave should not join any maddrs + if { [ "$arp_valid" == "null" ] || [ "eth${i}" == ${active_slave} ]; } && \ + echo "$maddr_list" | grep -qE "${c_maddr}|${g_maddr}"; then + RET=1 + check_err 1 "arp_valid $arp_valid active_slave $active_slave, eth$i has mcast group" + # arp_valid != 0 and backup_slave should join both maddrs + elif [ "$arp_valid" != "null" ] && [ "eth${i}" != ${active_slave} ] && \ + ( ! echo "$maddr_list" | grep -q "${c_maddr}" || \ + ! echo "$maddr_list" | grep -q "${m_maddr}"); then + RET=1 + check_err 1 "arp_valid $arp_valid active_slave $active_slave, eth$i has mcast group" + fi + done + + # Do failover + ip -n ${s_ns} link set ${active_slave} down + # wait for active link change + slowwait 2 active_slave_changed $active_slave + active_slave=$(cmd_jq "ip -n ${s_ns} -d -j link show bond0" ".[].linkinfo.info_data.active_slave") + + for i in $(seq 0 2); do + maddr_list=$(ip -n ${s_ns} maddr show dev eth${i}) + + # arp_valid == 0 or active_slave should not join any maddrs + if { [ "$arp_valid" == "null" ] || [ "eth${i}" == ${active_slave} ]; } && \ + echo "$maddr_list" | grep -qE "${c_maddr}|${g_maddr}"; then + RET=1 + check_err 1 "arp_valid $arp_valid active_slave $active_slave, eth$i has mcast group" + # arp_valid != 0 and backup_slave should join both maddrs + elif [ "$arp_valid" != "null" ] && [ "eth${i}" != ${active_slave} ] && \ + ( ! echo "$maddr_list" | grep -q "${c_maddr}" || \ + ! echo "$maddr_list" | grep -q "${m_maddr}"); then + RET=1 + check_err 1 "arp_valid $arp_valid active_slave $active_slave, eth$i has mcast group" + fi + done +} + arp_validate_arp() { local mode=$1 @@ -261,8 +311,10 @@ arp_validate_ns() fi for val in $(seq 0 6); do - arp_validate_test "mode $mode arp_interval 100 ns_ip6_target ${g_ip6} arp_validate $val" + arp_validate_test "mode $mode arp_interval 100 ns_ip6_target ${g_ip6},${c_ip6} arp_validate $val" log_test "arp_validate" "$mode ns_ip6_target arp_validate $val" + arp_validate_mcast + log_test "arp_validate" "join mcast group" done } From ca34aceb322bfcd6ab498884f1805ee12f983259 Mon Sep 17 00:00:00 2001 From: Alexandre Ferrieux Date: Wed, 13 Nov 2024 11:04:28 +0100 Subject: [PATCH 38/38] net: sched: u32: Add test case for systematic hnode IDR leaks Add a tdc test case to exercise the just-fixed systematic leak of IDR entries in u32 hnode disposal. Given the IDR in question is confined to the range [1..0x7FF], it is sufficient to create/delete the same filter 2048 times to fill it up and get a nonzero exit status from "tc filter add". Signed-off-by: Alexandre Ferrieux Acked-by: Jamal Hadi Salim Reviewed-by: Victor Nogueira Link: https://patch.msgid.link/20241113100428.360460-1-alexandre.ferrieux@orange.com Signed-off-by: Paolo Abeni --- .../tc-testing/tc-tests/filters/u32.json | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/u32.json b/tools/testing/selftests/tc-testing/tc-tests/filters/u32.json index 24bd0c2a3014..b2ca9d4e991b 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/filters/u32.json +++ b/tools/testing/selftests/tc-testing/tc-tests/filters/u32.json @@ -329,5 +329,29 @@ "teardown": [ "$TC qdisc del dev $DEV1 parent root drr" ] + }, + { + "id": "1234", + "name": "Exercise IDR leaks by creating/deleting a filter many (2048) times", + "category": [ + "filter", + "u32" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$TC qdisc add dev $DEV1 parent root handle 10: drr", + "$TC filter add dev $DEV1 parent 10:0 protocol ip prio 2 u32 match ip src 0.0.0.2/32 action drop", + "$TC filter add dev $DEV1 parent 10:0 protocol ip prio 3 u32 match ip src 0.0.0.3/32 action drop" + ], + "cmdUnderTest": "bash -c 'for i in {1..2048} ;do echo filter delete dev $DEV1 pref 3;echo filter add dev $DEV1 parent 10:0 protocol ip prio 3 u32 match ip src 0.0.0.3/32 action drop;done | $TC -b -'", + "expExitCode": "0", + "verifyCmd": "$TC filter show dev $DEV1", + "matchPattern": "protocol ip pref 3 u32", + "matchCount": "3", + "teardown": [ + "$TC qdisc del dev $DEV1 parent root drr" + ] } ]