From c9fd9c5f4b935516ba8be31f2e7590caf0bc2c6f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 29 Jan 2020 15:54:43 +0100 Subject: [PATCH 1/4] mptcp: defer freeing of cached ext until last moment access to msk->cached_ext is only legal if the msk is locked or all concurrent accesses are impossible. Furthermore, once we start to tear down, we must make sure nothing else can step in and allocate a new cached ext. So place this code in the destroy callback where it belongs. Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 39fdca79ce90..f1b1160dbbb4 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -654,8 +654,6 @@ static void __mptcp_close(struct sock *sk, long timeout) __mptcp_close_ssk(sk, ssk, subflow, timeout); } - if (msk->cached_ext) - __skb_ext_put(msk->cached_ext); release_sock(sk); sk_common_release(sk); } @@ -776,6 +774,10 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, static void mptcp_destroy(struct sock *sk) { + struct mptcp_sock *msk = mptcp_sk(sk); + + if (msk->cached_ext) + __skb_ext_put(msk->cached_ext); } static int mptcp_setsockopt(struct sock *sk, int level, int optname, From 50e741bb3b15fd5410bac379bdb4bc795d62b45c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 29 Jan 2020 15:54:44 +0100 Subject: [PATCH 2/4] mptcp: fix panic on user pointer access Its not possible to call the kernel_(s|g)etsockopt functions here, the address points to user memory: General protection fault in user access. Non-canonical address? WARNING: CPU: 1 PID: 5352 at arch/x86/mm/extable.c:77 ex_handler_uaccess+0xba/0xe0 arch/x86/mm/extable.c:77 Kernel panic - not syncing: panic_on_warn set ... [..] Call Trace: fixup_exception+0x9d/0xcd arch/x86/mm/extable.c:178 general_protection+0x2d/0x40 arch/x86/entry/entry_64.S:1202 do_ip_getsockopt+0x1f6/0x1860 net/ipv4/ip_sockglue.c:1323 ip_getsockopt+0x87/0x1c0 net/ipv4/ip_sockglue.c:1561 tcp_getsockopt net/ipv4/tcp.c:3691 [inline] tcp_getsockopt+0x8c/0xd0 net/ipv4/tcp.c:3685 kernel_getsockopt+0x121/0x1f0 net/socket.c:3736 mptcp_getsockopt+0x69/0x90 net/mptcp/protocol.c:830 __sys_getsockopt+0x13a/0x220 net/socket.c:2175 We can call tcp_get/setsockopt functions instead. Doing so fixes crashing, but still leaves rtnl related lockdep splat: WARNING: possible circular locking dependency detected 5.5.0-rc6 #2 Not tainted ------------------------------------------------------ syz-executor.0/16334 is trying to acquire lock: ffffffff84f7a080 (rtnl_mutex){+.+.}, at: do_ip_setsockopt.isra.0+0x277/0x3820 net/ipv4/ip_sockglue.c:644 but task is already holding lock: ffff888116503b90 (sk_lock-AF_INET){+.+.}, at: lock_sock include/net/sock.h:1516 [inline] ffff888116503b90 (sk_lock-AF_INET){+.+.}, at: mptcp_setsockopt+0x28/0x90 net/mptcp/protocol.c:1284 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (sk_lock-AF_INET){+.+.}: lock_sock_nested+0xca/0x120 net/core/sock.c:2944 lock_sock include/net/sock.h:1516 [inline] do_ip_setsockopt.isra.0+0x281/0x3820 net/ipv4/ip_sockglue.c:645 ip_setsockopt+0x44/0xf0 net/ipv4/ip_sockglue.c:1248 udp_setsockopt+0x5d/0xa0 net/ipv4/udp.c:2639 __sys_setsockopt+0x152/0x240 net/socket.c:2130 __do_sys_setsockopt net/socket.c:2146 [inline] __se_sys_setsockopt net/socket.c:2143 [inline] __x64_sys_setsockopt+0xba/0x150 net/socket.c:2143 do_syscall_64+0xbd/0x5b0 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #0 (rtnl_mutex){+.+.}: check_prev_add kernel/locking/lockdep.c:2475 [inline] check_prevs_add kernel/locking/lockdep.c:2580 [inline] validate_chain kernel/locking/lockdep.c:2970 [inline] __lock_acquire+0x1fb2/0x4680 kernel/locking/lockdep.c:3954 lock_acquire+0x127/0x330 kernel/locking/lockdep.c:4484 __mutex_lock_common kernel/locking/mutex.c:956 [inline] __mutex_lock+0x158/0x1340 kernel/locking/mutex.c:1103 do_ip_setsockopt.isra.0+0x277/0x3820 net/ipv4/ip_sockglue.c:644 ip_setsockopt+0x44/0xf0 net/ipv4/ip_sockglue.c:1248 tcp_setsockopt net/ipv4/tcp.c:3159 [inline] tcp_setsockopt+0x8c/0xd0 net/ipv4/tcp.c:3153 kernel_setsockopt+0x121/0x1f0 net/socket.c:3767 mptcp_setsockopt+0x69/0x90 net/mptcp/protocol.c:1288 __sys_setsockopt+0x152/0x240 net/socket.c:2130 __do_sys_setsockopt net/socket.c:2146 [inline] __se_sys_setsockopt net/socket.c:2143 [inline] __x64_sys_setsockopt+0xba/0x150 net/socket.c:2143 do_syscall_64+0xbd/0x5b0 arch/x86/entry/common.c:294 entry_SYSCALL_64_after_hwframe+0x49/0xbe other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sk_lock-AF_INET); lock(rtnl_mutex); lock(sk_lock-AF_INET); lock(rtnl_mutex); The lockdep complaint is because we hold mptcp socket lock when calling the sk_prot get/setsockopt handler, and those might need to acquire the rtnl mutex. Normally, order is: rtnl_lock(sk) -> lock_sock Whereas for mptcp the order is lock_sock(mptcp_sk) rtnl_lock -> lock_sock(subflow_sk) We can avoid this by releasing the mptcp socket lock early, but, as Paolo points out, we need to get/put the subflow socket refcount before doing so to avoid race with concurrent close(). Fixes: 717e79c867ca5 ("mptcp: Add setsockopt()/getsockopt() socket operations") Reported-by: Christoph Paasch Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 40 ++++++++++++++++++++++------------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index f1b1160dbbb4..07ebff6396cd 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -781,15 +781,12 @@ static void mptcp_destroy(struct sock *sk) } static int mptcp_setsockopt(struct sock *sk, int level, int optname, - char __user *uoptval, unsigned int optlen) + char __user *optval, unsigned int optlen) { struct mptcp_sock *msk = mptcp_sk(sk); - char __kernel *optval; int ret = -EOPNOTSUPP; struct socket *ssock; - - /* will be treated as __user in tcp_setsockopt */ - optval = (char __kernel __force *)uoptval; + struct sock *ssk; pr_debug("msk=%p", msk); @@ -798,27 +795,28 @@ static int mptcp_setsockopt(struct sock *sk, int level, int optname, */ lock_sock(sk); ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); - if (!IS_ERR(ssock)) { - pr_debug("subflow=%p", ssock->sk); - ret = kernel_setsockopt(ssock, level, optname, optval, optlen); + if (IS_ERR(ssock)) { + release_sock(sk); + return ret; } + + ssk = ssock->sk; + sock_hold(ssk); release_sock(sk); + ret = tcp_setsockopt(ssk, level, optname, optval, optlen); + sock_put(ssk); + return ret; } static int mptcp_getsockopt(struct sock *sk, int level, int optname, - char __user *uoptval, int __user *uoption) + char __user *optval, int __user *option) { struct mptcp_sock *msk = mptcp_sk(sk); - char __kernel *optval; int ret = -EOPNOTSUPP; - int __kernel *option; struct socket *ssock; - - /* will be treated as __user in tcp_getsockopt */ - optval = (char __kernel __force *)uoptval; - option = (int __kernel __force *)uoption; + struct sock *ssk; pr_debug("msk=%p", msk); @@ -827,12 +825,18 @@ static int mptcp_getsockopt(struct sock *sk, int level, int optname, */ lock_sock(sk); ssock = __mptcp_socket_create(msk, MPTCP_SAME_STATE); - if (!IS_ERR(ssock)) { - pr_debug("subflow=%p", ssock->sk); - ret = kernel_getsockopt(ssock, level, optname, optval, option); + if (IS_ERR(ssock)) { + release_sock(sk); + return ret; } + + ssk = ssock->sk; + sock_hold(ssk); release_sock(sk); + ret = tcp_getsockopt(ssk, level, optname, optval, option); + sock_put(ssk); + return ret; } From b2c5b614ca6ed460144788e7f9634569cc0c7b51 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 29 Jan 2020 15:54:45 +0100 Subject: [PATCH 3/4] mptcp: avoid a lockdep splat when mcast group was joined syzbot triggered following lockdep splat: ffffffff82d2cd40 (rtnl_mutex){+.+.}, at: ip_mc_drop_socket+0x52/0x180 but task is already holding lock: ffff8881187a2310 (sk_lock-AF_INET){+.+.}, at: mptcp_close+0x18/0x30 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (sk_lock-AF_INET){+.+.}: lock_acquire+0xee/0x230 lock_sock_nested+0x89/0xc0 do_ip_setsockopt.isra.0+0x335/0x22f0 ip_setsockopt+0x35/0x60 tcp_setsockopt+0x5d/0x90 __sys_setsockopt+0xf3/0x190 __x64_sys_setsockopt+0x61/0x70 do_syscall_64+0x72/0x300 entry_SYSCALL_64_after_hwframe+0x49/0xbe -> #0 (rtnl_mutex){+.+.}: check_prevs_add+0x2b7/0x1210 __lock_acquire+0x10b6/0x1400 lock_acquire+0xee/0x230 __mutex_lock+0x120/0xc70 ip_mc_drop_socket+0x52/0x180 inet_release+0x36/0xe0 __sock_release+0xfd/0x130 __mptcp_close+0xa8/0x1f0 inet_release+0x7f/0xe0 __sock_release+0x69/0x130 sock_close+0x18/0x20 __fput+0x179/0x400 task_work_run+0xd5/0x110 do_exit+0x685/0x1510 do_group_exit+0x7e/0x170 __x64_sys_exit_group+0x28/0x30 do_syscall_64+0x72/0x300 entry_SYSCALL_64_after_hwframe+0x49/0xbe The trigger is: socket(AF_INET, SOCK_STREAM, 0x106 /* IPPROTO_MPTCP */) = 4 setsockopt(4, SOL_IP, MCAST_JOIN_GROUP, {gr_interface=7, gr_group={sa_family=AF_INET, sin_port=htons(20003), sin_addr=inet_addr("224.0.0.2")}}, 136) = 0 exit(0) Which results in a call to rtnl_lock while we are holding the parent mptcp socket lock via mptcp_close -> lock_sock(msk) -> inet_release -> ip_mc_drop_socket -> rtnl_lock(). >From lockdep point of view we thus have both 'rtnl_lock; lock_sock' and 'lock_sock; rtnl_lock'. Fix this by stealing the msk conn_list and doing the subflow close without holding the msk lock. Fixes: cec37a6e41aae7bf ("mptcp: Handle MP_CAPABLE options for outgoing connections") Reported-by: Christoph Paasch Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- net/mptcp/protocol.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 07ebff6396cd..73c192d8c158 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -644,17 +644,21 @@ static void __mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow, *tmp; struct mptcp_sock *msk = mptcp_sk(sk); + LIST_HEAD(conn_list); mptcp_token_destroy(msk->token); inet_sk_state_store(sk, TCP_CLOSE); - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + list_splice_init(&msk->conn_list, &conn_list); + + release_sock(sk); + + list_for_each_entry_safe(subflow, tmp, &conn_list, node) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); __mptcp_close_ssk(sk, ssk, subflow, timeout); } - release_sock(sk); sk_common_release(sk); } From ae2dd7164943e03644293af92802550d052632e6 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 29 Jan 2020 15:54:46 +0100 Subject: [PATCH 4/4] mptcp: handle tcp fallback when using syn cookies We can't deal with syncookie mode yet, the syncookie rx path will create tcp reqsk, i.e. we get OOB access because we treat tcp reqsk as mptcp reqsk one: TCP: SYN flooding on port 20002. Sending cookies. BUG: KASAN: slab-out-of-bounds in subflow_syn_recv_sock+0x451/0x4d0 net/mptcp/subflow.c:191 Read of size 1 at addr ffff8881167bc148 by task syz-executor099/2120 subflow_syn_recv_sock+0x451/0x4d0 net/mptcp/subflow.c:191 tcp_get_cookie_sock+0xcf/0x520 net/ipv4/syncookies.c:209 cookie_v6_check+0x15a5/0x1e90 net/ipv6/syncookies.c:252 tcp_v6_cookie_check net/ipv6/tcp_ipv6.c:1123 [inline] [..] Bug can be reproduced via "sysctl net.ipv4.tcp_syncookies=2". Note that MPTCP should work with syncookies (4th ack would carry needed state), but it appears better to sort that out in -next so do tcp fallback for now. I removed the MPTCP ifdef for tcp_rsk "is_mptcp" member because if (IS_ENABLED()) is easier to read than "#ifdef IS_ENABLED()/#endif" pair. Cc: Eric Dumazet Fixes: cec37a6e41aae7bf ("mptcp: Handle MP_CAPABLE options for outgoing connections") Reported-by: Christoph Paasch Tested-by: Christoph Paasch Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 -- net/ipv4/syncookies.c | 4 ++++ net/ipv4/tcp_input.c | 3 +++ net/ipv6/syncookies.c | 3 +++ net/mptcp/subflow.c | 5 ++++- 5 files changed, 14 insertions(+), 3 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1cf73e6f85ca..3dc964010fef 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -148,9 +148,7 @@ struct tcp_request_sock { const struct tcp_request_sock_ops *af_specific; u64 snt_synack; /* first SYNACK sent time */ bool tfo_listener; -#if IS_ENABLED(CONFIG_MPTCP) bool is_mptcp; -#endif u32 txhash; u32 rcv_isn; u32 snt_isn; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 345b2b0ff618..9a4f6b16c9bc 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -349,6 +349,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) req->ts_recent = tcp_opt.saw_tstamp ? tcp_opt.rcv_tsval : 0; treq->snt_synack = 0; treq->tfo_listener = false; + + if (IS_ENABLED(CONFIG_MPTCP)) + treq->is_mptcp = 0; + if (IS_ENABLED(CONFIG_SMC)) ireq->smc_ok = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e8b840a4767e..e325b4506e25 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6637,6 +6637,9 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, af_ops->init_req(req, sk, skb); + if (IS_ENABLED(CONFIG_MPTCP) && want_cookie) + tcp_rsk(req)->is_mptcp = 0; + if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 30915f6f31e3..13235a012388 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -178,6 +178,9 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) treq = tcp_rsk(req); treq->tfo_listener = false; + if (IS_ENABLED(CONFIG_MPTCP)) + treq->is_mptcp = 0; + if (security_inet_conn_request(sk, skb, req)) goto out_free; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 205dca1c30b7..c90c0e6ffb82 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -186,6 +186,9 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn); + if (tcp_rsk(req)->is_mptcp == 0) + goto create_child; + /* if the sk is MP_CAPABLE, we try to fetch the client key */ subflow_req = mptcp_subflow_rsk(req); if (subflow_req->mp_capable) { @@ -769,7 +772,7 @@ static void subflow_ulp_clone(const struct request_sock *req, struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk); struct mptcp_subflow_context *new_ctx; - if (!subflow_req->mp_capable) { + if (!tcp_rsk(req)->is_mptcp || !subflow_req->mp_capable) { subflow_ulp_fallback(newsk, old_ctx); return; }