From be20037725d17935ec669044bd2b15bc40c3b5ab Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 8 May 2021 10:01:32 -0400 Subject: [PATCH 01/46] NFSv4: Fix delegation return in cases where we have to retry If we're unable to immediately recover all locks because the server is unable to immediately service our reclaim calls, then we want to retry after we've finished servicing all the other asynchronous delegation returns on our queue. Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 71 +++++++++++++++++++++++++++++++++++---------- fs/nfs/delegation.h | 1 + fs/nfs/nfs4_fs.h | 1 + 3 files changed, 58 insertions(+), 15 deletions(-) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index e6ec6f09ac6e..7c45ac3c3b0b 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -75,6 +75,13 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation) set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags); } +static void nfs_mark_return_delegation(struct nfs_server *server, + struct nfs_delegation *delegation) +{ + set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); +} + static bool nfs4_is_valid_delegation(const struct nfs_delegation *delegation, fmode_t flags) @@ -293,6 +300,7 @@ nfs_start_delegation_return_locked(struct nfs_inode *nfsi) goto out; spin_lock(&delegation->lock); if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + clear_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags); /* Refcount matched in nfs_end_delegation_return() */ ret = nfs_get_delegation(delegation); } @@ -314,16 +322,17 @@ nfs_start_delegation_return(struct nfs_inode *nfsi) return delegation; } -static void -nfs_abort_delegation_return(struct nfs_delegation *delegation, - struct nfs_client *clp) +static void nfs_abort_delegation_return(struct nfs_delegation *delegation, + struct nfs_client *clp, int err) { spin_lock(&delegation->lock); clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); + if (err == -EAGAIN) { + set_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags); + set_bit(NFS4CLNT_DELEGRETURN_DELAYED, &clp->cl_state); + } spin_unlock(&delegation->lock); - set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); } static struct nfs_delegation * @@ -539,7 +548,7 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation } while (err == 0); if (err) { - nfs_abort_delegation_return(delegation, clp); + nfs_abort_delegation_return(delegation, clp, err); goto out; } @@ -568,6 +577,7 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation) if (ret) clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags); if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags) || + test_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags) || test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) ret = false; @@ -647,6 +657,38 @@ out: return err; } +static bool nfs_server_clear_delayed_delegations(struct nfs_server *server) +{ + struct nfs_delegation *d; + bool ret = false; + + list_for_each_entry_rcu (d, &server->delegations, super_list) { + if (!test_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags)) + continue; + nfs_mark_return_delegation(server, d); + clear_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags); + ret = true; + } + return ret; +} + +static bool nfs_client_clear_delayed_delegations(struct nfs_client *clp) +{ + struct nfs_server *server; + bool ret = false; + + if (!test_and_clear_bit(NFS4CLNT_DELEGRETURN_DELAYED, &clp->cl_state)) + goto out; + rcu_read_lock(); + list_for_each_entry_rcu (server, &clp->cl_superblocks, client_link) { + if (nfs_server_clear_delayed_delegations(server)) + ret = true; + } + rcu_read_unlock(); +out: + return ret; +} + /** * nfs_client_return_marked_delegations - return previously marked delegations * @clp: nfs_client to process @@ -659,8 +701,14 @@ out: */ int nfs_client_return_marked_delegations(struct nfs_client *clp) { - return nfs_client_for_each_server(clp, - nfs_server_return_marked_delegations, NULL); + int err = nfs_client_for_each_server( + clp, nfs_server_return_marked_delegations, NULL); + if (err) + return err; + /* If a return was delayed, sleep to prevent hard looping */ + if (nfs_client_clear_delayed_delegations(clp)) + ssleep(1); + return 0; } /** @@ -775,13 +823,6 @@ static void nfs_mark_return_if_closed_delegation(struct nfs_server *server, set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); } -static void nfs_mark_return_delegation(struct nfs_server *server, - struct nfs_delegation *delegation) -{ - set_bit(NFS_DELEGATION_RETURN, &delegation->flags); - set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state); -} - static bool nfs_server_mark_return_all_delegations(struct nfs_server *server) { struct nfs_delegation *delegation; diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index c19b4fd20781..1c378992b7c0 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -36,6 +36,7 @@ enum { NFS_DELEGATION_REVOKED, NFS_DELEGATION_TEST_EXPIRED, NFS_DELEGATION_INODE_FREEING, + NFS_DELEGATION_RETURN_DELAYED, }; int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 543d916f79ab..3e344bec3647 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -45,6 +45,7 @@ enum nfs4_client_state { NFS4CLNT_RECALL_RUNNING, NFS4CLNT_RECALL_ANY_LAYOUT_READ, NFS4CLNT_RECALL_ANY_LAYOUT_RW, + NFS4CLNT_DELEGRETURN_DELAYED, }; #define NFS4_RENEW_TIMEOUT 0x01 From 6b4befc0a06bc412f5b5a17fdad473aaed943170 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 7 May 2021 09:14:37 -0400 Subject: [PATCH 02/46] NFSv4: Add lease breakpoints in case of a delegation recall or return When we add support for application level leases and knfsd delegations to the NFS client, we we want to have them safely underpinned by a "real" delegation to provide the caching guarantees. If that real delegation is recalled, then we need to ensure that the application leases/delegations are recalled too. Signed-off-by: Trond Myklebust --- fs/nfs/delegation.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 7c45ac3c3b0b..11118398f495 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -530,11 +530,18 @@ out: static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync) { struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; + unsigned int mode = O_WRONLY | O_RDWR; int err = 0; if (delegation == NULL) return 0; - do { + + if (!issync) + mode |= O_NONBLOCK; + /* Recall of any remaining application leases */ + err = break_lease(inode, mode); + + while (err == 0) { if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) break; err = nfs_delegation_claim_opens(inode, &delegation->stateid, @@ -545,7 +552,7 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation * Guard against state recovery */ err = nfs4_wait_clnt_recover(clp); - } while (err == 0); + } if (err) { nfs_abort_delegation_return(delegation, clp, err); @@ -746,13 +753,14 @@ int nfs4_inode_return_delegation(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_delegation *delegation; - int err = 0; - nfs_wb_all(inode); delegation = nfs_start_delegation_return(nfsi); + /* Synchronous recall of any application leases */ + break_lease(inode, O_WRONLY | O_RDWR); + nfs_wb_all(inode); if (delegation != NULL) - err = nfs_end_delegation_return(inode, delegation, 1); - return err; + return nfs_end_delegation_return(inode, delegation, 1); + return 0; } /** @@ -1051,6 +1059,9 @@ int nfs_async_inode_return_delegation(struct inode *inode, nfs_mark_return_delegation(server, delegation); rcu_read_unlock(); + /* If there are any application leases or delegations, recall them */ + break_lease(inode, O_WRONLY | O_RDWR | O_NONBLOCK); + nfs_delegation_run_state_manager(clp); return 0; out_enoent: From e93a5e9306a576011f03011b492d4fbaa274477b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 7 May 2021 10:06:13 -0400 Subject: [PATCH 03/46] NFSv4: Add support for application leases underpinned by a delegation If the NFSv4 client already holds a delegation for a file, then we can support application leases (i.e. fcntl(fd, F_SETLEASE,...)) because the underlying delegation guarantees that the file is not being modified on the server by another client in a way that might conflict with the lease guarantees. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 3 ++- fs/nfs/nfs4file.c | 8 +++++++- fs/nfs/nfs4proc.c | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 3e344bec3647..ba78df4b13d9 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -323,7 +323,8 @@ extern int update_open_stateid(struct nfs4_state *state, const nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode); - +extern int nfs4_proc_setlease(struct file *file, long arg, + struct file_lock **lease, void **priv); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); extern void nfs4_update_changeattr(struct inode *dir, diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index a1e5c6b85ded..c820de58a661 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -435,6 +435,12 @@ void nfs42_ssc_unregister_ops(void) } #endif /* CONFIG_NFS_V4_2 */ +static int nfs4_setlease(struct file *file, long arg, struct file_lock **lease, + void **priv) +{ + return nfs4_proc_setlease(file, arg, lease, priv); +} + const struct file_operations nfs4_file_operations = { .read_iter = nfs_file_read, .write_iter = nfs_file_write, @@ -448,7 +454,7 @@ const struct file_operations nfs4_file_operations = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, - .setlease = simple_nosetlease, + .setlease = nfs4_setlease, #ifdef CONFIG_NFS_V4_2 .copy_file_range = nfs4_copy_file_range, .llseek = nfs4_file_llseek, diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e653654c10bc..35df44ef2c35 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7438,6 +7438,43 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) return nfs4_retry_setlk(state, cmd, request); } +static int nfs4_delete_lease(struct file *file, void **priv) +{ + return generic_setlease(file, F_UNLCK, NULL, priv); +} + +static int nfs4_add_lease(struct file *file, long arg, struct file_lock **lease, + void **priv) +{ + struct inode *inode = file_inode(file); + fmode_t type = arg == F_RDLCK ? FMODE_READ : FMODE_WRITE; + int ret; + + /* No delegation, no lease */ + if (!nfs4_have_delegation(inode, type)) + return -ENOLCK; + ret = generic_setlease(file, arg, lease, priv); + if (ret || nfs4_have_delegation(inode, type)) + return ret; + /* We raced with a delegation return */ + nfs4_delete_lease(file, priv); + return -ENOLCK; +} + +int nfs4_proc_setlease(struct file *file, long arg, struct file_lock **lease, + void **priv) +{ + switch (arg) { + case F_RDLCK: + case F_WRLCK: + return nfs4_add_lease(file, arg, lease, priv); + case F_UNLCK: + return nfs4_delete_lease(file, priv); + default: + return -EINVAL; + } +} + int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid) { struct nfs_server *server = NFS_SERVER(state->inode); From dd99e9f98fbf423ff6d365b37a98e8879170f17c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 9 Jun 2021 10:04:46 -0400 Subject: [PATCH 04/46] NFSv4: Initialise connection to the server in nfs4_alloc_client() Set up the connection to the NFSv4 server in nfs4_alloc_client(), before we've added the struct nfs_client to the net-namespace's nfs_client_list so that a downed server won't cause other mounts to hang in the trunking detection code. Reported-by: Michael Wakabayashi Fixes: 5c6e5b60aae4 ("NFS: Fix an Oops in the pNFS files and flexfiles connection setup to the DS") Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 82 +++++++++++++++++++++++---------------------- 1 file changed, 42 insertions(+), 40 deletions(-) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 42719384e25f..28431acd1230 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -197,8 +197,11 @@ void nfs40_shutdown_client(struct nfs_client *clp) struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) { - int err; + char buf[INET6_ADDRSTRLEN + 1]; + const char *ip_addr = cl_init->ip_addr; struct nfs_client *clp = nfs_alloc_client(cl_init); + int err; + if (IS_ERR(clp)) return clp; @@ -222,6 +225,44 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) init_waitqueue_head(&clp->cl_lock_waitq); #endif INIT_LIST_HEAD(&clp->pending_cb_stateids); + + if (cl_init->minorversion != 0) + __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); + __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); + __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); + + /* + * Set up the connection to the server before we add add to the + * global list. + */ + err = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I); + if (err == -EINVAL) + err = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); + if (err < 0) + goto error; + + /* If no clientaddr= option was specified, find a usable cb address */ + if (ip_addr == NULL) { + struct sockaddr_storage cb_addr; + struct sockaddr *sap = (struct sockaddr *)&cb_addr; + + err = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr)); + if (err < 0) + goto error; + err = rpc_ntop(sap, buf, sizeof(buf)); + if (err < 0) + goto error; + ip_addr = (const char *)buf; + } + strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); + + err = nfs_idmap_new(clp); + if (err < 0) { + dprintk("%s: failed to create idmapper. Error = %d\n", + __func__, err); + goto error; + } + __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); return clp; error: @@ -372,8 +413,6 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct nfs_client_initdata *cl_init) { - char buf[INET6_ADDRSTRLEN + 1]; - const char *ip_addr = cl_init->ip_addr; struct nfs_client *old; int error; @@ -381,43 +420,6 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, /* the client is initialised already */ return clp; - /* Check NFS protocol revision and initialize RPC op vector */ - clp->rpc_ops = &nfs_v4_clientops; - - if (clp->cl_minorversion != 0) - __set_bit(NFS_CS_INFINITE_SLOTS, &clp->cl_flags); - __set_bit(NFS_CS_DISCRTRY, &clp->cl_flags); - __set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags); - - error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_GSS_KRB5I); - if (error == -EINVAL) - error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); - if (error < 0) - goto error; - - /* If no clientaddr= option was specified, find a usable cb address */ - if (ip_addr == NULL) { - struct sockaddr_storage cb_addr; - struct sockaddr *sap = (struct sockaddr *)&cb_addr; - - error = rpc_localaddr(clp->cl_rpcclient, sap, sizeof(cb_addr)); - if (error < 0) - goto error; - error = rpc_ntop(sap, buf, sizeof(buf)); - if (error < 0) - goto error; - ip_addr = (const char *)buf; - } - strlcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr)); - - error = nfs_idmap_new(clp); - if (error < 0) { - dprintk("%s: failed to create idmapper. Error = %d\n", - __func__, error); - goto error; - } - __set_bit(NFS_CS_IDMAP, &clp->cl_res_state); - error = nfs4_init_client_minor_version(clp); if (error < 0) goto error; From 3731d44bba8e0116b052b1b374476c5f6dd9a456 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 11 Jun 2021 13:40:55 -0400 Subject: [PATCH 05/46] NFSv4: Fix an Oops in pnfs_mark_request_commit() when doing O_DIRECT Fix an Oopsable condition in pnfs_mark_request_commit() when we're putting a set of writes on the commit list to reschedule them after a failed pNFS attempt. Fixes: 9c455a8c1e14 ("NFS/pNFS: Clean up pNFS commit operations") Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 2d30a4da49fa..2e894fec036b 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -700,8 +700,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; struct nfs_commit_info cinfo; - bool request_commit = false; struct nfs_page *req = nfs_list_entry(hdr->pages.next); + int flags = NFS_ODIRECT_DONE; nfs_init_cinfo_from_dreq(&cinfo, dreq); @@ -713,15 +713,9 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) nfs_direct_count_bytes(dreq, hdr); if (hdr->good_bytes != 0 && nfs_write_need_commit(hdr)) { - switch (dreq->flags) { - case 0: + if (!dreq->flags) dreq->flags = NFS_ODIRECT_DO_COMMIT; - request_commit = true; - break; - case NFS_ODIRECT_RESCHED_WRITES: - case NFS_ODIRECT_DO_COMMIT: - request_commit = true; - } + flags = dreq->flags; } spin_unlock(&dreq->lock); @@ -729,12 +723,15 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) req = nfs_list_entry(hdr->pages.next); nfs_list_remove_request(req); - if (request_commit) { + if (flags == NFS_ODIRECT_DO_COMMIT) { kref_get(&req->wb_kref); memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo, hdr->ds_commit_idx); + } else if (flags == NFS_ODIRECT_RESCHED_WRITES) { + kref_get(&req->wb_kref); + nfs_mark_request_commit(req, NULL, &cinfo, 0); } nfs_unlock_and_release_request(req); } From 6d1c0f3d28f98ea2736128ed3e46821496dc3a8c Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Wed, 9 Jun 2021 17:07:29 -0400 Subject: [PATCH 06/46] sunrpc: Avoid a KASAN slab-out-of-bounds bug in xdr_set_page_base() This seems to happen fairly easily during READ_PLUS testing on NFS v4.2. I found that we could end up accessing xdr->buf->pages[pgnr] with a pgnr greater than the number of pages in the array. So let's just return early if we're setting base to a point at the end of the page data and let xdr_set_tail_base() handle setting up the buffer pointers instead. Signed-off-by: Anna Schumaker Fixes: 8d86e373b0ef ("SUNRPC: Clean up helpers xdr_set_iov() and xdr_set_page_base()") Signed-off-by: Trond Myklebust --- net/sunrpc/xdr.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 3964ff74ee51..ca10ba2626f2 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -1230,10 +1230,9 @@ static unsigned int xdr_set_page_base(struct xdr_stream *xdr, void *kaddr; maxlen = xdr->buf->page_len; - if (base >= maxlen) { - base = maxlen; - maxlen = 0; - } else + if (base >= maxlen) + return 0; + else maxlen -= base; if (len > maxlen) len = maxlen; From bb24cc0f37a2d12f780ab2a57df046274a0bec38 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 13 Jun 2021 15:06:52 +0100 Subject: [PATCH 07/46] rpc: remove redundant initialization of variable status The variable status is being initialized with a value that is never read, the assignment is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: Trond Myklebust --- net/sunrpc/auth_gss/svcauth_gss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 6dff64374bfe..a81be45f40d9 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -1275,7 +1275,7 @@ static int gss_proxy_save_rsc(struct cache_detail *cd, long long ctxh; struct gss_api_mech *gm = NULL; time64_t expiry; - int status = -EINVAL; + int status; memset(&rsci, 0, sizeof(rsci)); /* context handle */ From bc1c56e9bbe92766d017efb5f0a0c71f80da5570 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 15 Jun 2021 11:18:38 +1000 Subject: [PATCH 08/46] SUNRPC: prevent port reuse on transports which don't request it. If an RPC client is created without RPC_CLNT_CREATE_REUSEPORT, it should not reuse the source port when a TCP connection is re-established. This is currently implemented by preventing the source port being recorded after a successful connection (the call to xs_set_srcport()). However the source port is also recorded after a successful bind in xs_bind(). This may not be needed at all and certainly is not wanted when RPC_CLNT_CREATE_REUSEPORT wasn't requested. So avoid that assignment when xprt.reuseport is not set. With this change, NFSv4.1 and later mounts use a different port number on each connection. This is helpful with some firewalls which don't cope well with port reuse. Signed-off-by: NeilBrown Fixes: e6237b6feb37 ("NFSv4.1: Don't rebind to the same source port when reconnecting to the server") Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 316d04945587..3228b7a1836a 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1689,7 +1689,8 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) err = kernel_bind(sock, (struct sockaddr *)&myaddr, transport->xprt.addrlen); if (err == 0) { - transport->srcport = port; + if (transport->xprt.reuseport) + transport->srcport = port; break; } last = port; From 1fcb6fcd74a222d9ead54d405842fc763bb86262 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Fri, 18 Jun 2021 12:20:55 +0800 Subject: [PATCH 09/46] nfs: fix acl memory leak of posix_acl_create() When looking into another nfs xfstests report, I found acl and default_acl in nfs3_proc_create() and nfs3_proc_mknod() error paths are possibly leaked. Fix them in advance. Fixes: 013cdf1088d7 ("nfs: use generic posix ACL infrastructure for v3 Posix ACLs") Cc: Trond Myklebust Cc: Anna Schumaker Cc: Christoph Hellwig Cc: Joseph Qi Signed-off-by: Gao Xiang Signed-off-by: Trond Myklebust --- fs/nfs/nfs3proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 5c4e23abc345..2299446b3b89 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -385,7 +385,7 @@ nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, break; case NFS3_CREATE_UNCHECKED: - goto out; + goto out_release_acls; } nfs_fattr_init(data->res.dir_attr); nfs_fattr_init(data->res.fattr); @@ -751,7 +751,7 @@ nfs3_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, break; default: status = -EINVAL; - goto out; + goto out_release_acls; } d_alias = nfs3_do_create(dir, dentry, data); From 213bb58475b57786e4336bc8bfd5029e16257c49 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 26 Jun 2021 09:23:51 -0400 Subject: [PATCH 10/46] NFS: Fix up inode attribute revalidation timeouts The inode is considered revalidated when we've checked the value of the change attribute against our cached value since that suffices to establish whether or not the other cached values are valid. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 50 ++++++++++++++++---------------------------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 529c4099f482..327f9ae4dd3f 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2066,24 +2066,22 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } else { nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_CHANGE; - cache_revalidated = false; + if (!have_delegation || + (nfsi->cache_validity & NFS_INO_INVALID_CHANGE) != 0) + cache_revalidated = false; } - if (fattr->valid & NFS_ATTR_FATTR_MTIME) { + if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode->i_mtime = fattr->mtime; - } else if (fattr_supported & NFS_ATTR_FATTR_MTIME) { + else if (fattr_supported & NFS_ATTR_FATTR_MTIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_MTIME; - cache_revalidated = false; - } - if (fattr->valid & NFS_ATTR_FATTR_CTIME) { + if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; - } else if (fattr_supported & NFS_ATTR_FATTR_CTIME) { + else if (fattr_supported & NFS_ATTR_FATTR_CTIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_CTIME; - cache_revalidated = false; - } /* Check if our cached file size is stale */ if (fattr->valid & NFS_ATTR_FATTR_SIZE) { @@ -2111,19 +2109,15 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) fattr->du.nfs3.used = 0; fattr->valid |= NFS_ATTR_FATTR_SPACE_USED; } - } else { + } else nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_SIZE; - cache_revalidated = false; - } if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; - else if (fattr_supported & NFS_ATTR_FATTR_ATIME) { + else if (fattr_supported & NFS_ATTR_FATTR_ATIME) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATIME; - cache_revalidated = false; - } if (fattr->valid & NFS_ATTR_FATTR_MODE) { if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { @@ -2134,11 +2128,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | NFS_INO_INVALID_ACL; attr_changed = true; } - } else if (fattr_supported & NFS_ATTR_FATTR_MODE) { + } else if (fattr_supported & NFS_ATTR_FATTR_MODE) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_MODE; - cache_revalidated = false; - } if (fattr->valid & NFS_ATTR_FATTR_OWNER) { if (!uid_eq(inode->i_uid, fattr->uid)) { @@ -2147,11 +2139,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_uid = fattr->uid; attr_changed = true; } - } else if (fattr_supported & NFS_ATTR_FATTR_OWNER) { + } else if (fattr_supported & NFS_ATTR_FATTR_OWNER) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_OTHER; - cache_revalidated = false; - } if (fattr->valid & NFS_ATTR_FATTR_GROUP) { if (!gid_eq(inode->i_gid, fattr->gid)) { @@ -2160,11 +2150,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_gid = fattr->gid; attr_changed = true; } - } else if (fattr_supported & NFS_ATTR_FATTR_GROUP) { + } else if (fattr_supported & NFS_ATTR_FATTR_GROUP) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_OTHER; - cache_revalidated = false; - } if (fattr->valid & NFS_ATTR_FATTR_NLINK) { if (inode->i_nlink != fattr->nlink) { @@ -2173,30 +2161,24 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) set_nlink(inode, fattr->nlink); attr_changed = true; } - } else if (fattr_supported & NFS_ATTR_FATTR_NLINK) { + } else if (fattr_supported & NFS_ATTR_FATTR_NLINK) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_NLINK; - cache_revalidated = false; - } if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED) { + } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_BLOCKS; - cache_revalidated = false; - } - if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) { + if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; - } else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED) { + else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED) nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_BLOCKS; - cache_revalidated = false; - } /* Update attrtimeo value if we're out of the unstable period */ if (attr_changed) { From 20cf7d4ea4ad7d9830b01ff7444f6ac64a727a23 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 25 Jun 2021 15:49:31 -0400 Subject: [PATCH 11/46] NFSv4: Fix handling of non-atomic change attrbute updates If the change attribute update is declared to be non-atomic by the server, or our cached value does not match the server's value before the operation was performed, then we should declare the inode cache invalid. On the other hand, if the change to the directory raced with a lookup or getattr which already updated the change attribute, then optimise away the revalidation. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e653654c10bc..451d3d56d80c 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1205,12 +1205,12 @@ nfs4_update_changeattr_locked(struct inode *inode, u64 change_attr = inode_peek_iversion_raw(inode); cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; + if (S_ISDIR(inode->i_mode)) + cache_validity |= NFS_INO_INVALID_DATA; switch (NFS_SERVER(inode)->change_attr_type) { case NFS4_CHANGE_TYPE_IS_UNDEFINED: - break; - case NFS4_CHANGE_TYPE_IS_TIME_METADATA: - if ((s64)(change_attr - cinfo->after) > 0) + if (cinfo->after == change_attr) goto out; break; default: @@ -1218,24 +1218,21 @@ nfs4_update_changeattr_locked(struct inode *inode, goto out; } - if (cinfo->atomic && cinfo->before == change_attr) { - nfsi->attrtimeo_timestamp = jiffies; - } else { - if (S_ISDIR(inode->i_mode)) { - cache_validity |= NFS_INO_INVALID_DATA; - nfs_force_lookup_revalidate(inode); - } else { - if (!NFS_PROTO(inode)->have_delegation(inode, - FMODE_READ)) - cache_validity |= NFS_INO_REVAL_PAGECACHE; - } - - if (cinfo->before != change_attr) - cache_validity |= NFS_INO_INVALID_ACCESS | - NFS_INO_INVALID_ACL | - NFS_INO_INVALID_XATTR; - } inode_set_iversion_raw(inode, cinfo->after); + if (!cinfo->atomic || cinfo->before != change_attr) { + if (S_ISDIR(inode->i_mode)) + nfs_force_lookup_revalidate(inode); + + if (!NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + cache_validity |= + NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | + NFS_INO_INVALID_SIZE | NFS_INO_INVALID_OTHER | + NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK | + NFS_INO_INVALID_MODE | NFS_INO_INVALID_XATTR | + NFS_INO_REVAL_PAGECACHE; + nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); + } + nfsi->attrtimeo_timestamp = jiffies; nfsi->read_cache_jiffies = timestamp; nfsi->attr_gencount = nfs_inc_attr_generation_counter(); nfsi->cache_validity &= ~NFS_INO_INVALID_CHANGE; From a9601ac5e9160a3f96348ebc5d0751397a501701 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 26 Jun 2021 11:07:59 -0400 Subject: [PATCH 12/46] NFS: Avoid duplicate resets of attribute cache timeouts We know that the attributes changed on the server if and only if the change attribute is different. Otherwise, we're just refreshing our cache with values that were already known to be stale. Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 327f9ae4dd3f..2824acfe4ff9 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -2055,13 +2055,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | NFS_INO_INVALID_OTHER; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); + attr_changed = true; dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); } else if (!have_delegation) nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER; inode_set_iversion_raw(inode, fattr->change_attr); - attr_changed = true; } } else { nfsi->cache_validity |= @@ -2094,7 +2094,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) i_size_write(inode, new_isize); if (!have_writers) invalid |= NFS_INO_INVALID_DATA; - attr_changed = true; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", @@ -2126,7 +2125,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_mode = newmode; invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; - attr_changed = true; } } else if (fattr_supported & NFS_ATTR_FATTR_MODE) nfsi->cache_validity |= @@ -2137,7 +2135,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; inode->i_uid = fattr->uid; - attr_changed = true; } } else if (fattr_supported & NFS_ATTR_FATTR_OWNER) nfsi->cache_validity |= @@ -2148,7 +2145,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; inode->i_gid = fattr->gid; - attr_changed = true; } } else if (fattr_supported & NFS_ATTR_FATTR_GROUP) nfsi->cache_validity |= @@ -2159,7 +2155,6 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; set_nlink(inode, fattr->nlink); - attr_changed = true; } } else if (fattr_supported & NFS_ATTR_FATTR_NLINK) nfsi->cache_validity |= From eae00c5d6e48ccb2d78ae5873743d7d1a572951b Mon Sep 17 00:00:00 2001 From: Scott Mayhew Date: Tue, 22 Jun 2021 08:11:59 -0400 Subject: [PATCH 13/46] nfs: update has_sec_mnt_opts after cloning lsm options from parent After calling security_sb_clone_mnt_opts() in nfs_get_root(), it's necessary to copy the value of has_sec_mnt_opts from the cloned super_block's nfs_server. Otherwise, calls to nfs_compare_super() using this super_block may not return the correct result, leading to mount failures. For example, mounting an nfs server with the following in /etc/exports: /export *(rw,insecure,crossmnt,no_root_squash,security_label) and having /export/scratch on a separate block device. mount -o v4.2,context=system_u:object_r:root_t:s0 server:/export/test /mnt/test mount -o v4.2,context=system_u:object_r:swapfile_t:s0 server:/export/scratch /mnt/scratch The second mount would fail with "mount.nfs: /mnt/scratch is busy or already mounted or sharecache fail" and "SELinux: mount invalid. Same superblock, different security settings for..." would appear in the syslog. Also while we're in there, replace several instances of "NFS_SB(s)" with "server", which was already declared at the top of the nfs_get_root(). Fixes: ec1ade6a0448 ("nfs: account for selinux security context when deciding to share superblock") Signed-off-by: Scott Mayhew Signed-off-by: Trond Myklebust --- fs/nfs/getroot.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index aaeeb4659bff..59355c106ece 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -67,7 +67,7 @@ static int nfs_superblock_set_dummy_root(struct super_block *sb, struct inode *i int nfs_get_root(struct super_block *s, struct fs_context *fc) { struct nfs_fs_context *ctx = nfs_fc2context(fc); - struct nfs_server *server = NFS_SB(s); + struct nfs_server *server = NFS_SB(s), *clone_server; struct nfs_fsinfo fsinfo; struct dentry *root; struct inode *inode; @@ -127,7 +127,7 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) } spin_unlock(&root->d_lock); fc->root = root; - if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL) + if (server->caps & NFS_CAP_SECURITY_LABEL) kflags |= SECURITY_LSM_NATIVE_LABELS; if (ctx->clone_data.sb) { if (d_inode(fc->root)->i_fop != &nfs_dir_operations) { @@ -137,15 +137,19 @@ int nfs_get_root(struct super_block *s, struct fs_context *fc) /* clone lsm security options from the parent to the new sb */ error = security_sb_clone_mnt_opts(ctx->clone_data.sb, s, kflags, &kflags_out); + if (error) + goto error_splat_root; + clone_server = NFS_SB(ctx->clone_data.sb); + server->has_sec_mnt_opts = clone_server->has_sec_mnt_opts; } else { error = security_sb_set_mnt_opts(s, fc->security, kflags, &kflags_out); } if (error) goto error_splat_root; - if (NFS_SB(s)->caps & NFS_CAP_SECURITY_LABEL && + if (server->caps & NFS_CAP_SECURITY_LABEL && !(kflags_out & SECURITY_LSM_NATIVE_LABELS)) - NFS_SB(s)->caps &= ~NFS_CAP_SECURITY_LABEL; + server->caps &= ~NFS_CAP_SECURITY_LABEL; nfs_setsecurity(inode, fsinfo.fattr, fsinfo.fattr->label); error = 0; From b42ad64f5f216db05310783cbded56176c3a09df Mon Sep 17 00:00:00 2001 From: Dave Wysochanski Date: Thu, 24 Jun 2021 11:34:18 -0400 Subject: [PATCH 14/46] NFS: Remove unnecessary inode parameter from nfs_pageio_complete_read() Simplify nfs_pageio_complete_read() by using the inode pointer saved inside nfs_pageio_descriptor. Signed-off-by: Dave Wysochanski Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/nfs/read.c b/fs/nfs/read.c index d2b6dce1f99f..684a730f6670 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -74,8 +74,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); -static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio, - struct inode *inode) +static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio) { struct nfs_pgio_mirror *pgm; unsigned long npages; @@ -86,9 +85,9 @@ static void nfs_pageio_complete_read(struct nfs_pageio_descriptor *pgio, WARN_ON_ONCE(pgio->pg_mirror_count != 1); pgm = &pgio->pg_mirrors[0]; - NFS_I(inode)->read_io += pgm->pg_bytes_written; + NFS_I(pgio->pg_inode)->read_io += pgm->pg_bytes_written; npages = (pgm->pg_bytes_written + PAGE_SIZE - 1) >> PAGE_SHIFT; - nfs_add_stats(inode, NFSIOS_READPAGES, npages); + nfs_add_stats(pgio->pg_inode, NFSIOS_READPAGES, npages); } @@ -376,7 +375,7 @@ int nfs_readpage(struct file *file, struct page *page) ret = readpage_async_filler(&desc, page); if (!ret) - nfs_pageio_complete_read(&desc.pgio, inode); + nfs_pageio_complete_read(&desc.pgio); ret = desc.pgio.pg_error < 0 ? desc.pgio.pg_error : 0; if (!ret) { @@ -430,7 +429,7 @@ int nfs_readpages(struct file *file, struct address_space *mapping, ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); - nfs_pageio_complete_read(&desc.pgio, inode); + nfs_pageio_complete_read(&desc.pgio); read_complete: put_nfs_open_context(desc.ctx); From fcb170a9d825d7db4a3fb870b0300f5a40a8d096 Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu Date: Sat, 26 Jun 2021 15:50:41 +0800 Subject: [PATCH 15/46] SUNRPC: Fix the batch tasks count wraparound. The 'queue->nr' will wraparound from 0 to 255 when only current priority queue has tasks. This maybe lead a deadlock same as commit dfe1fe75e00e ("NFSv4: Fix deadlock between nfs4_evict_inode() and nfs4_opendata_get_inode()"): Privileged delegreturn task is queued to privileged list because all the slots are assigned. When non-privileged task complete and release the slot, a non-privileged maybe picked out. It maybe allocate slot failed when the session on draining. If the 'queue->nr' has wraparound to 255, and no enough slot to service it, then the privileged delegreturn will lost to wake up. So we should avoid the wraparound on 'queue->nr'. Reported-by: Hulk Robot Fixes: 5fcdfacc01f3 ("NFSv4: Return delegations synchronously in evict_inode") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Cc: stable@vger.kernel.org Signed-off-by: Zhang Xiaoxu Signed-off-by: Trond Myklebust --- net/sunrpc/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 39ed0e0afe6d..71993adb0762 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -595,7 +595,8 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q * Service a batch of tasks from a single owner. */ q = &queue->tasks[queue->priority]; - if (!list_empty(q) && --queue->nr) { + if (!list_empty(q) && queue->nr) { + queue->nr--; task = list_first_entry(q, struct rpc_task, u.tk_wait.list); goto out; } From 5483b904bf336948826594610af4c9bbb0d9e3aa Mon Sep 17 00:00:00 2001 From: Zhang Xiaoxu Date: Sat, 26 Jun 2021 15:50:42 +0800 Subject: [PATCH 16/46] SUNRPC: Should wake up the privileged task firstly. When find a task from wait queue to wake up, a non-privileged task may be found out, rather than the privileged. This maybe lead a deadlock same as commit dfe1fe75e00e ("NFSv4: Fix deadlock between nfs4_evict_inode() and nfs4_opendata_get_inode()"): Privileged delegreturn task is queued to privileged list because all the slots are assigned. If there has no enough slot to wake up the non-privileged batch tasks(session less than 8 slot), then the privileged delegreturn task maybe lost waked up because the found out task can't get slot since the session is on draining. So we should treate the privileged task as the emergency task, and execute it as for as we can. Reported-by: Hulk Robot Fixes: 5fcdfacc01f3 ("NFSv4: Return delegations synchronously in evict_inode") Cc: stable@vger.kernel.org Signed-off-by: Zhang Xiaoxu Signed-off-by: Trond Myklebust --- net/sunrpc/sched.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 71993adb0762..c045f63d11fa 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -591,6 +591,15 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q struct list_head *q; struct rpc_task *task; + /* + * Service the privileged queue. + */ + q = &queue->tasks[RPC_NR_PRIORITY - 1]; + if (queue->maxpriority > RPC_PRIORITY_PRIVILEGED && !list_empty(q)) { + task = list_first_entry(q, struct rpc_task, u.tk_wait.list); + goto out; + } + /* * Service a batch of tasks from a single owner. */ From e97bc66377bca097e1f3349ca18ca17f202ff659 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 11 May 2021 23:41:10 -0400 Subject: [PATCH 17/46] NFS: nfs_find_open_context() may only select open files If a file has already been closed, then it should not be selected to support further I/O. Signed-off-by: Trond Myklebust [Trond: Fix an invalid pointer deref reported by Colin Ian King] --- fs/nfs/inode.c | 4 ++++ include/linux/nfs_fs.h | 1 + 2 files changed, 5 insertions(+) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 529c4099f482..d5d022f92c37 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1101,6 +1101,7 @@ EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) { filp->private_data = get_nfs_open_context(ctx); + set_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags); if (list_empty(&ctx->list)) nfs_inode_attach_open_context(ctx); } @@ -1120,6 +1121,8 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, const struct continue; if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode) continue; + if (!test_bit(NFS_CONTEXT_FILE_OPEN, &pos->flags)) + continue; ctx = get_nfs_open_context(pos); if (ctx) break; @@ -1135,6 +1138,7 @@ void nfs_file_clear_open_context(struct file *filp) if (ctx) { struct inode *inode = d_inode(ctx->dentry); + clear_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags); /* * We fatal error on write before. Try to writeback * every page again. diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index ffba254d2098..ce6474594872 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -84,6 +84,7 @@ struct nfs_open_context { #define NFS_CONTEXT_RESEND_WRITES (1) #define NFS_CONTEXT_BAD (2) #define NFS_CONTEXT_UNLOCK (3) +#define NFS_CONTEXT_FILE_OPEN (4) int error; struct list_head list; From df2c7b951f439a0342495a4a049d808f679c474c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 25 Jun 2021 15:08:39 -0400 Subject: [PATCH 18/46] NFSv4: setlease should return EAGAIN if locks are not available Instead of returning ENOLCK when we can't hand out a lease, we should be returning EAGAIN. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 35df44ef2c35..e4efb7bccd7e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -7452,13 +7452,13 @@ static int nfs4_add_lease(struct file *file, long arg, struct file_lock **lease, /* No delegation, no lease */ if (!nfs4_have_delegation(inode, type)) - return -ENOLCK; + return -EAGAIN; ret = generic_setlease(file, arg, lease, priv); if (ret || nfs4_have_delegation(inode, type)) return ret; /* We raced with a delegation return */ nfs4_delete_lease(file, priv); - return -ENOLCK; + return -EAGAIN; } int nfs4_proc_setlease(struct file *file, long arg, struct file_lock **lease, From 746787489b0c3a879ddc671ce1e0d15e71b0d881 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 8 Jun 2021 15:59:10 -0400 Subject: [PATCH 19/46] sunrpc: Create a sunrpc directory under /sys/kernel/ This is where we'll put per-rpc_client related files Signed-off-by: Anna Schumaker Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- net/sunrpc/Makefile | 2 +- net/sunrpc/sunrpc_syms.c | 8 ++++++++ net/sunrpc/sysfs.c | 20 ++++++++++++++++++++ net/sunrpc/sysfs.h | 11 +++++++++++ 4 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 net/sunrpc/sysfs.c create mode 100644 net/sunrpc/sysfs.h diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile index 9488600451e8..1c8de397d6ad 100644 --- a/net/sunrpc/Makefile +++ b/net/sunrpc/Makefile @@ -12,7 +12,7 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \ auth.o auth_null.o auth_unix.o \ svc.o svcsock.o svcauth.o svcauth_unix.o \ addr.o rpcb_clnt.o timer.o xdr.o \ - sunrpc_syms.o cache.o rpc_pipe.o \ + sunrpc_syms.o cache.o rpc_pipe.o sysfs.o \ svc_xprt.o \ xprtmultipath.o sunrpc-$(CONFIG_SUNRPC_DEBUG) += debugfs.o diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 236fadc4a439..3b57efc692ec 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -24,6 +24,7 @@ #include #include "sunrpc.h" +#include "sysfs.h" #include "netns.h" unsigned int sunrpc_net_id; @@ -103,6 +104,10 @@ init_sunrpc(void) if (err) goto out4; + err = rpc_sysfs_init(); + if (err) + goto out5; + sunrpc_debugfs_init(); #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) rpc_register_sysctl(); @@ -111,6 +116,8 @@ init_sunrpc(void) init_socket_xprt(); /* clnt sock transport */ return 0; +out5: + unregister_rpc_pipefs(); out4: unregister_pernet_subsys(&sunrpc_net_ops); out3: @@ -124,6 +131,7 @@ out: static void __exit cleanup_sunrpc(void) { + rpc_sysfs_exit(); rpc_cleanup_clids(); rpcauth_remove_module(); cleanup_socket_xprt(); diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c new file mode 100644 index 000000000000..27eda180ac5e --- /dev/null +++ b/net/sunrpc/sysfs.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Anna Schumaker + */ +#include + +static struct kset *rpc_sunrpc_kset; + +int rpc_sysfs_init(void) +{ + rpc_sunrpc_kset = kset_create_and_add("sunrpc", NULL, kernel_kobj); + if (!rpc_sunrpc_kset) + return -ENOMEM; + return 0; +} + +void rpc_sysfs_exit(void) +{ + kset_unregister(rpc_sunrpc_kset); +} diff --git a/net/sunrpc/sysfs.h b/net/sunrpc/sysfs.h new file mode 100644 index 000000000000..f181c650aab8 --- /dev/null +++ b/net/sunrpc/sysfs.h @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Anna Schumaker + */ +#ifndef __SUNRPC_SYSFS_H +#define __SUNRPC_SYSFS_H + +int rpc_sysfs_init(void); +void rpc_sysfs_exit(void); + +#endif From c441f125de79121b97f1eb08dbfec85c8100a01e Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 8 Jun 2021 15:59:11 -0400 Subject: [PATCH 20/46] sunrpc: Create a client/ subdirectory in the sunrpc sysfs For network namespace separation. Signed-off-by: Anna Schumaker Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- net/sunrpc/sysfs.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 27eda180ac5e..fa03e2ef836a 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -2,19 +2,62 @@ /* * Copyright (c) 2020 Anna Schumaker */ +#include #include static struct kset *rpc_sunrpc_kset; +static struct kobject *rpc_sunrpc_client_kobj; + +static void rpc_sysfs_object_release(struct kobject *kobj) +{ + kfree(kobj); +} + +static const struct kobj_ns_type_operations * +rpc_sysfs_object_child_ns_type(struct kobject *kobj) +{ + return &net_ns_type_operations; +} + +static struct kobj_type rpc_sysfs_object_type = { + .release = rpc_sysfs_object_release, + .sysfs_ops = &kobj_sysfs_ops, + .child_ns_type = rpc_sysfs_object_child_ns_type, +}; + +static struct kobject *rpc_sysfs_object_alloc(const char *name, + struct kset *kset, + struct kobject *parent) +{ + struct kobject *kobj; + + kobj = kzalloc(sizeof(*kobj), GFP_KERNEL); + if (kobj) { + kobj->kset = kset; + if (kobject_init_and_add(kobj, &rpc_sysfs_object_type, + parent, "%s", name) == 0) + return kobj; + kobject_put(kobj); + } + return NULL; +} int rpc_sysfs_init(void) { rpc_sunrpc_kset = kset_create_and_add("sunrpc", NULL, kernel_kobj); if (!rpc_sunrpc_kset) return -ENOMEM; + rpc_sunrpc_client_kobj = rpc_sysfs_object_alloc("client", rpc_sunrpc_kset, NULL); + if (!rpc_sunrpc_client_kobj) { + kset_unregister(rpc_sunrpc_kset); + rpc_sunrpc_client_kobj = NULL; + return -ENOMEM; + } return 0; } void rpc_sysfs_exit(void) { + kobject_put(rpc_sunrpc_client_kobj); kset_unregister(rpc_sunrpc_kset); } From c5a382ebdbdaac27ec109993e29f9045d70297f2 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 8 Jun 2021 15:59:12 -0400 Subject: [PATCH 21/46] sunrpc: Create per-rpc_clnt sysfs kobjects These will eventually have files placed under them for sysfs operations. Signed-off-by: Anna Schumaker Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 2 ++ net/sunrpc/clnt.c | 5 +++ net/sunrpc/sysfs.c | 61 +++++++++++++++++++++++++++++++++++++ net/sunrpc/sysfs.h | 8 +++++ 4 files changed, 76 insertions(+) diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 02e7a5863d28..8b5d5c97553e 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -29,6 +29,7 @@ #include struct rpc_inode; +struct rpc_sysfs_client; /* * The high-level client handle @@ -71,6 +72,7 @@ struct rpc_clnt { #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) struct dentry *cl_debugfs; /* debugfs directory */ #endif + struct rpc_sysfs_client *cl_sysfs; /* sysfs directory */ /* cl_work is only needed after cl_xpi is no longer used, * and that are of similar size */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 42623d6b8f0e..6f3f840a2ea3 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -41,6 +41,7 @@ #include #include "sunrpc.h" +#include "sysfs.h" #include "netns.h" #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -300,6 +301,7 @@ static int rpc_client_register(struct rpc_clnt *clnt, int err; rpc_clnt_debugfs_register(clnt); + rpc_sysfs_client_setup(clnt, net); pipefs_sb = rpc_get_sb_net(net); if (pipefs_sb) { @@ -327,6 +329,7 @@ err_auth: out: if (pipefs_sb) rpc_put_sb_net(net); + rpc_sysfs_client_destroy(clnt); rpc_clnt_debugfs_unregister(clnt); return err; } @@ -733,6 +736,7 @@ int rpc_switch_client_transport(struct rpc_clnt *clnt, rpc_unregister_client(clnt); __rpc_clnt_remove_pipedir(clnt); + rpc_sysfs_client_destroy(clnt); rpc_clnt_debugfs_unregister(clnt); /* @@ -879,6 +883,7 @@ static void rpc_free_client_work(struct work_struct *work) * so they cannot be called in rpciod, so they are handled separately * here. */ + rpc_sysfs_client_destroy(clnt); rpc_clnt_debugfs_unregister(clnt); rpc_free_clid(clnt); rpc_clnt_remove_pipedir(clnt); diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index fa03e2ef836a..f3b7547ee218 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -4,6 +4,7 @@ */ #include #include +#include "sysfs.h" static struct kset *rpc_sunrpc_kset; static struct kobject *rpc_sunrpc_client_kobj; @@ -56,8 +57,68 @@ int rpc_sysfs_init(void) return 0; } +static void rpc_sysfs_client_release(struct kobject *kobj) +{ + struct rpc_sysfs_client *c; + + c = container_of(kobj, struct rpc_sysfs_client, kobject); + kfree(c); +} + +static const void *rpc_sysfs_client_namespace(struct kobject *kobj) +{ + return container_of(kobj, struct rpc_sysfs_client, kobject)->net; +} + +static struct kobj_type rpc_sysfs_client_type = { + .release = rpc_sysfs_client_release, + .sysfs_ops = &kobj_sysfs_ops, + .namespace = rpc_sysfs_client_namespace, +}; + void rpc_sysfs_exit(void) { kobject_put(rpc_sunrpc_client_kobj); kset_unregister(rpc_sunrpc_kset); } + +static struct rpc_sysfs_client *rpc_sysfs_client_alloc(struct kobject *parent, + struct net *net, + int clid) +{ + struct rpc_sysfs_client *p; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (p) { + p->net = net; + p->kobject.kset = rpc_sunrpc_kset; + if (kobject_init_and_add(&p->kobject, &rpc_sysfs_client_type, + parent, "clnt-%d", clid) == 0) + return p; + kobject_put(&p->kobject); + } + return NULL; +} + +void rpc_sysfs_client_setup(struct rpc_clnt *clnt, struct net *net) +{ + struct rpc_sysfs_client *rpc_client; + + rpc_client = rpc_sysfs_client_alloc(rpc_sunrpc_client_kobj, net, clnt->cl_clid); + if (rpc_client) { + clnt->cl_sysfs = rpc_client; + kobject_uevent(&rpc_client->kobject, KOBJ_ADD); + } +} + +void rpc_sysfs_client_destroy(struct rpc_clnt *clnt) +{ + struct rpc_sysfs_client *rpc_client = clnt->cl_sysfs; + + if (rpc_client) { + kobject_uevent(&rpc_client->kobject, KOBJ_REMOVE); + kobject_del(&rpc_client->kobject); + kobject_put(&rpc_client->kobject); + clnt->cl_sysfs = NULL; + } +} diff --git a/net/sunrpc/sysfs.h b/net/sunrpc/sysfs.h index f181c650aab8..c46afc848993 100644 --- a/net/sunrpc/sysfs.h +++ b/net/sunrpc/sysfs.h @@ -5,7 +5,15 @@ #ifndef __SUNRPC_SYSFS_H #define __SUNRPC_SYSFS_H +struct rpc_sysfs_client { + struct kobject kobject; + struct net *net; +}; + int rpc_sysfs_init(void); void rpc_sysfs_exit(void); +void rpc_sysfs_client_setup(struct rpc_clnt *clnt, struct net *net); +void rpc_sysfs_client_destroy(struct rpc_clnt *clnt); + #endif From 572caba402e10b35a080d1b43c0193da364f3a17 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 8 Jun 2021 15:59:13 -0400 Subject: [PATCH 22/46] sunrpc: add xprt id This adds a unique identifier for a sunrpc transport in sysfs, which is similarly managed to the unique IDs of clients. Signed-off-by: Dan Aloni Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 2 ++ net/sunrpc/sunrpc_syms.c | 1 + net/sunrpc/xprt.c | 26 ++++++++++++++++++++++++++ 3 files changed, 29 insertions(+) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 61b622e334ee..1fbc470ce205 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -185,6 +185,7 @@ enum xprt_transports { struct rpc_xprt { struct kref kref; /* Reference count */ const struct rpc_xprt_ops *ops; /* transport methods */ + unsigned int id; /* transport id */ const struct rpc_timeout *timeout; /* timeout parms */ struct sockaddr_storage addr; /* server address */ @@ -370,6 +371,7 @@ struct rpc_xprt * xprt_alloc(struct net *net, size_t size, void xprt_free(struct rpc_xprt *); void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task); bool xprt_wake_up_backlog(struct rpc_xprt *xprt, struct rpc_rqst *req); +void xprt_cleanup_ids(void); static inline int xprt_enable_swap(struct rpc_xprt *xprt) diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 3b57efc692ec..b61b74c00483 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -133,6 +133,7 @@ cleanup_sunrpc(void) { rpc_sysfs_exit(); rpc_cleanup_clids(); + xprt_cleanup_ids(); rpcauth_remove_module(); cleanup_socket_xprt(); svc_cleanup_xprt_sock(); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 3509a7f139b9..20b9bd705014 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1746,6 +1746,30 @@ static void xprt_free_all_slots(struct rpc_xprt *xprt) } } +static DEFINE_IDA(rpc_xprt_ids); + +void xprt_cleanup_ids(void) +{ + ida_destroy(&rpc_xprt_ids); +} + +static int xprt_alloc_id(struct rpc_xprt *xprt) +{ + int id; + + id = ida_simple_get(&rpc_xprt_ids, 0, 0, GFP_KERNEL); + if (id < 0) + return id; + + xprt->id = id; + return 0; +} + +static void xprt_free_id(struct rpc_xprt *xprt) +{ + ida_simple_remove(&rpc_xprt_ids, xprt->id); +} + struct rpc_xprt *xprt_alloc(struct net *net, size_t size, unsigned int num_prealloc, unsigned int max_alloc) @@ -1758,6 +1782,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size, if (xprt == NULL) goto out; + xprt_alloc_id(xprt); xprt_init(xprt, net); for (i = 0; i < num_prealloc; i++) { @@ -1786,6 +1811,7 @@ void xprt_free(struct rpc_xprt *xprt) { put_net(xprt->xprt_net); xprt_free_all_slots(xprt); + xprt_free_id(xprt); kfree_rcu(xprt, rcu); } EXPORT_SYMBOL_GPL(xprt_free); From 5b9268727f299f87432e8b035e9e8bec8ba13e8d Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 8 Jun 2021 15:59:14 -0400 Subject: [PATCH 23/46] sunrpc: add IDs to multipath This is used to uniquely identify sunrpc multipath objects in /sys. Signed-off-by: Dan Aloni Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprtmultipath.h | 4 ++++ net/sunrpc/sunrpc_syms.c | 1 + net/sunrpc/xprtmultipath.c | 26 ++++++++++++++++++++++++++ 3 files changed, 31 insertions(+) diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h index c6cce3fbf29d..ef95a6f18ccf 100644 --- a/include/linux/sunrpc/xprtmultipath.h +++ b/include/linux/sunrpc/xprtmultipath.h @@ -14,6 +14,7 @@ struct rpc_xprt_switch { spinlock_t xps_lock; struct kref xps_kref; + unsigned int xps_id; unsigned int xps_nxprts; unsigned int xps_nactive; atomic_long_t xps_queuelen; @@ -71,4 +72,7 @@ extern struct rpc_xprt *xprt_iter_get_next(struct rpc_xprt_iter *xpi); extern bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps, const struct sockaddr *sap); + +extern void xprt_multipath_cleanup_ids(void); + #endif diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index b61b74c00483..691c0000e9ea 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -134,6 +134,7 @@ cleanup_sunrpc(void) rpc_sysfs_exit(); rpc_cleanup_clids(); xprt_cleanup_ids(); + xprt_multipath_cleanup_ids(); rpcauth_remove_module(); cleanup_socket_xprt(); svc_cleanup_xprt_sock(); diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 78c075a68c04..4969a4c216f7 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -86,6 +86,30 @@ void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps, xprt_put(xprt); } +static DEFINE_IDA(rpc_xprtswitch_ids); + +void xprt_multipath_cleanup_ids(void) +{ + ida_destroy(&rpc_xprtswitch_ids); +} + +static int xprt_switch_alloc_id(struct rpc_xprt_switch *xps, gfp_t gfp_flags) +{ + int id; + + id = ida_simple_get(&rpc_xprtswitch_ids, 0, 0, gfp_flags); + if (id < 0) + return id; + + xps->xps_id = id; + return 0; +} + +static void xprt_switch_free_id(struct rpc_xprt_switch *xps) +{ + ida_simple_remove(&rpc_xprtswitch_ids, xps->xps_id); +} + /** * xprt_switch_alloc - Allocate a new struct rpc_xprt_switch * @xprt: pointer to struct rpc_xprt @@ -103,6 +127,7 @@ struct rpc_xprt_switch *xprt_switch_alloc(struct rpc_xprt *xprt, if (xps != NULL) { spin_lock_init(&xps->xps_lock); kref_init(&xps->xps_kref); + xprt_switch_alloc_id(xps, gfp_flags); xps->xps_nxprts = xps->xps_nactive = 0; atomic_long_set(&xps->xps_queuelen, 0); xps->xps_net = NULL; @@ -136,6 +161,7 @@ static void xprt_switch_free(struct kref *kref) struct rpc_xprt_switch, xps_kref); xprt_switch_free_entries(xps); + xprt_switch_free_id(xps); kfree_rcu(xps, xps_rcu); } From d3abc73987fd2a5992a9bdae9f44fa43d1b4db70 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 8 Jun 2021 15:59:15 -0400 Subject: [PATCH 24/46] sunrpc: keep track of the xprt_class in rpc_xprt structure We need to keep track of the type for a given transport. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 2 ++ net/sunrpc/xprtrdma/transport.c | 2 ++ net/sunrpc/xprtsock.c | 9 +++++++++ 3 files changed, 13 insertions(+) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 1fbc470ce205..7efc6c0a5455 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -53,6 +53,7 @@ enum rpc_display_format_t { struct rpc_task; struct rpc_xprt; +struct xprt_class; struct seq_file; struct svc_serv; struct net; @@ -289,6 +290,7 @@ struct rpc_xprt { atomic_t inject_disconnect; #endif struct rcu_head rcu; + const struct xprt_class *xprt_class; }; #if defined(CONFIG_SUNRPC_BACKCHANNEL) diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 19a49d26b1e4..9c2ffc67c0fd 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -73,6 +73,7 @@ unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR; int xprt_rdma_pad_optimize; +static struct xprt_class xprt_rdma; #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) @@ -349,6 +350,7 @@ xprt_setup_rdma(struct xprt_create *args) /* Ensure xprt->addr holds valid server TCP (not RDMA) * address, for any side protocols which peek at it */ xprt->prot = IPPROTO_TCP; + xprt->xprt_class = &xprt_rdma; xprt->addrlen = args->addrlen; memcpy(&xprt->addr, sap, xprt->addrlen); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 316d04945587..2ad4d0df45fe 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -91,6 +91,11 @@ static unsigned int xprt_max_resvport_limit = RPC_MAX_RESVPORT; static struct ctl_table_header *sunrpc_table_header; +static struct xprt_class xs_local_transport; +static struct xprt_class xs_udp_transport; +static struct xprt_class xs_tcp_transport; +static struct xprt_class xs_bc_tcp_transport; + /* * FIXME: changing the UDP slot table size should also resize the UDP * socket buffers for existing UDP transports @@ -2779,6 +2784,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args) transport = container_of(xprt, struct sock_xprt, xprt); xprt->prot = 0; + xprt->xprt_class = &xs_local_transport; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; xprt->bind_timeout = XS_BIND_TO; @@ -2848,6 +2854,7 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args) transport = container_of(xprt, struct sock_xprt, xprt); xprt->prot = IPPROTO_UDP; + xprt->xprt_class = &xs_udp_transport; /* XXX: header size can vary due to auth type, IPv6, etc. */ xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); @@ -2928,6 +2935,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) transport = container_of(xprt, struct sock_xprt, xprt); xprt->prot = IPPROTO_TCP; + xprt->xprt_class = &xs_tcp_transport; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; xprt->bind_timeout = XS_BIND_TO; @@ -3001,6 +3009,7 @@ static struct rpc_xprt *xs_setup_bc_tcp(struct xprt_create *args) transport = container_of(xprt, struct sock_xprt, xprt); xprt->prot = IPPROTO_TCP; + xprt->xprt_class = &xs_bc_tcp_transport; xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; xprt->timeout = &xs_tcp_default_timeout; From baea99445dd4675a834e8a5987d2f368adb62e6c Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 8 Jun 2021 15:59:16 -0400 Subject: [PATCH 25/46] sunrpc: add xprt_switch direcotry to sunrpc's sysfs Add xprt_switch directory to the sysfs and create individual xprt_swith subdirectories for multipath transport group. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprtmultipath.h | 2 + net/sunrpc/sysfs.c | 99 ++++++++++++++++++++++++++-- net/sunrpc/sysfs.h | 10 +++ net/sunrpc/xprtmultipath.c | 4 ++ 4 files changed, 108 insertions(+), 7 deletions(-) diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h index ef95a6f18ccf..b19addc8b715 100644 --- a/include/linux/sunrpc/xprtmultipath.h +++ b/include/linux/sunrpc/xprtmultipath.h @@ -10,6 +10,7 @@ #define _NET_SUNRPC_XPRTMULTIPATH_H struct rpc_xprt_iter_ops; +struct rpc_sysfs_xprt_switch; struct rpc_xprt_switch { spinlock_t xps_lock; struct kref xps_kref; @@ -24,6 +25,7 @@ struct rpc_xprt_switch { const struct rpc_xprt_iter_ops *xps_iter_ops; + struct rpc_sysfs_xprt_switch *xps_sysfs; struct rcu_head xps_rcu; }; diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index f3b7547ee218..ed9f7131543f 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -7,7 +7,7 @@ #include "sysfs.h" static struct kset *rpc_sunrpc_kset; -static struct kobject *rpc_sunrpc_client_kobj; +static struct kobject *rpc_sunrpc_client_kobj, *rpc_sunrpc_xprt_switch_kobj; static void rpc_sysfs_object_release(struct kobject *kobj) { @@ -48,13 +48,22 @@ int rpc_sysfs_init(void) rpc_sunrpc_kset = kset_create_and_add("sunrpc", NULL, kernel_kobj); if (!rpc_sunrpc_kset) return -ENOMEM; - rpc_sunrpc_client_kobj = rpc_sysfs_object_alloc("client", rpc_sunrpc_kset, NULL); - if (!rpc_sunrpc_client_kobj) { - kset_unregister(rpc_sunrpc_kset); - rpc_sunrpc_client_kobj = NULL; - return -ENOMEM; - } + rpc_sunrpc_client_kobj = + rpc_sysfs_object_alloc("rpc-clients", rpc_sunrpc_kset, NULL); + if (!rpc_sunrpc_client_kobj) + goto err_client; + rpc_sunrpc_xprt_switch_kobj = + rpc_sysfs_object_alloc("xprt-switches", rpc_sunrpc_kset, NULL); + if (!rpc_sunrpc_xprt_switch_kobj) + goto err_switch; return 0; +err_switch: + kobject_put(rpc_sunrpc_client_kobj); + rpc_sunrpc_client_kobj = NULL; +err_client: + kset_unregister(rpc_sunrpc_kset); + rpc_sunrpc_kset = NULL; + return -ENOMEM; } static void rpc_sysfs_client_release(struct kobject *kobj) @@ -65,20 +74,40 @@ static void rpc_sysfs_client_release(struct kobject *kobj) kfree(c); } +static void rpc_sysfs_xprt_switch_release(struct kobject *kobj) +{ + struct rpc_sysfs_xprt_switch *xprt_switch; + + xprt_switch = container_of(kobj, struct rpc_sysfs_xprt_switch, kobject); + kfree(xprt_switch); +} + static const void *rpc_sysfs_client_namespace(struct kobject *kobj) { return container_of(kobj, struct rpc_sysfs_client, kobject)->net; } +static const void *rpc_sysfs_xprt_switch_namespace(struct kobject *kobj) +{ + return container_of(kobj, struct rpc_sysfs_xprt_switch, kobject)->net; +} + static struct kobj_type rpc_sysfs_client_type = { .release = rpc_sysfs_client_release, .sysfs_ops = &kobj_sysfs_ops, .namespace = rpc_sysfs_client_namespace, }; +static struct kobj_type rpc_sysfs_xprt_switch_type = { + .release = rpc_sysfs_xprt_switch_release, + .sysfs_ops = &kobj_sysfs_ops, + .namespace = rpc_sysfs_xprt_switch_namespace, +}; + void rpc_sysfs_exit(void) { kobject_put(rpc_sunrpc_client_kobj); + kobject_put(rpc_sunrpc_xprt_switch_kobj); kset_unregister(rpc_sunrpc_kset); } @@ -100,6 +129,28 @@ static struct rpc_sysfs_client *rpc_sysfs_client_alloc(struct kobject *parent, return NULL; } +static struct rpc_sysfs_xprt_switch * +rpc_sysfs_xprt_switch_alloc(struct kobject *parent, + struct rpc_xprt_switch *xprt_switch, + struct net *net, + gfp_t gfp_flags) +{ + struct rpc_sysfs_xprt_switch *p; + + p = kzalloc(sizeof(*p), gfp_flags); + if (p) { + p->net = net; + p->kobject.kset = rpc_sunrpc_kset; + if (kobject_init_and_add(&p->kobject, + &rpc_sysfs_xprt_switch_type, + parent, "switch-%d", + xprt_switch->xps_id) == 0) + return p; + kobject_put(&p->kobject); + } + return NULL; +} + void rpc_sysfs_client_setup(struct rpc_clnt *clnt, struct net *net) { struct rpc_sysfs_client *rpc_client; @@ -111,6 +162,28 @@ void rpc_sysfs_client_setup(struct rpc_clnt *clnt, struct net *net) } } +void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch, + struct rpc_xprt *xprt, + gfp_t gfp_flags) +{ + struct rpc_sysfs_xprt_switch *rpc_xprt_switch; + struct net *net; + + if (xprt_switch->xps_net) + net = xprt_switch->xps_net; + else + net = xprt->xprt_net; + rpc_xprt_switch = + rpc_sysfs_xprt_switch_alloc(rpc_sunrpc_xprt_switch_kobj, + xprt_switch, net, gfp_flags); + if (rpc_xprt_switch) { + xprt_switch->xps_sysfs = rpc_xprt_switch; + rpc_xprt_switch->xprt_switch = xprt_switch; + rpc_xprt_switch->xprt = xprt; + kobject_uevent(&rpc_xprt_switch->kobject, KOBJ_ADD); + } +} + void rpc_sysfs_client_destroy(struct rpc_clnt *clnt) { struct rpc_sysfs_client *rpc_client = clnt->cl_sysfs; @@ -122,3 +195,15 @@ void rpc_sysfs_client_destroy(struct rpc_clnt *clnt) clnt->cl_sysfs = NULL; } } + +void rpc_sysfs_xprt_switch_destroy(struct rpc_xprt_switch *xprt_switch) +{ + struct rpc_sysfs_xprt_switch *rpc_xprt_switch = xprt_switch->xps_sysfs; + + if (rpc_xprt_switch) { + kobject_uevent(&rpc_xprt_switch->kobject, KOBJ_REMOVE); + kobject_del(&rpc_xprt_switch->kobject); + kobject_put(&rpc_xprt_switch->kobject); + xprt_switch->xps_sysfs = NULL; + } +} diff --git a/net/sunrpc/sysfs.h b/net/sunrpc/sysfs.h index c46afc848993..52ec472bd4a9 100644 --- a/net/sunrpc/sysfs.h +++ b/net/sunrpc/sysfs.h @@ -10,10 +10,20 @@ struct rpc_sysfs_client { struct net *net; }; +struct rpc_sysfs_xprt_switch { + struct kobject kobject; + struct net *net; + struct rpc_xprt_switch *xprt_switch; + struct rpc_xprt *xprt; +}; + int rpc_sysfs_init(void); void rpc_sysfs_exit(void); void rpc_sysfs_client_setup(struct rpc_clnt *clnt, struct net *net); void rpc_sysfs_client_destroy(struct rpc_clnt *clnt); +void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch, + struct rpc_xprt *xprt, gfp_t gfp_flags); +void rpc_sysfs_xprt_switch_destroy(struct rpc_xprt_switch *xprt); #endif diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 4969a4c216f7..2d73a35df9ee 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -19,6 +19,8 @@ #include #include +#include "sysfs.h" + typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct rpc_xprt_switch *xps, const struct rpc_xprt *cur); @@ -133,6 +135,7 @@ struct rpc_xprt_switch *xprt_switch_alloc(struct rpc_xprt *xprt, xps->xps_net = NULL; INIT_LIST_HEAD(&xps->xps_xprt_list); xps->xps_iter_ops = &rpc_xprt_iter_singular; + rpc_sysfs_xprt_switch_setup(xps, xprt, gfp_flags); xprt_switch_add_xprt_locked(xps, xprt); } @@ -161,6 +164,7 @@ static void xprt_switch_free(struct kref *kref) struct rpc_xprt_switch, xps_kref); xprt_switch_free_entries(xps); + rpc_sysfs_xprt_switch_destroy(xps); xprt_switch_free_id(xps); kfree_rcu(xps, xps_rcu); } From 2a338a543163ad6b42f4732396249cea6d3a33c8 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 8 Jun 2021 15:59:17 -0400 Subject: [PATCH 26/46] sunrpc: add a symlink from rpc-client directory to the xprt_switch An rpc client uses a transport switch and one ore more transports associated with that switch. Since transports are shared among rpc clients, create a symlink into the xprt_switch directory instead of duplicating entries under each rpc client. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 2 +- net/sunrpc/sysfs.c | 22 ++++++++++++++++++++-- net/sunrpc/sysfs.h | 6 +++++- 3 files changed, 26 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 6f3f840a2ea3..9bf820bad84c 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -301,7 +301,6 @@ static int rpc_client_register(struct rpc_clnt *clnt, int err; rpc_clnt_debugfs_register(clnt); - rpc_sysfs_client_setup(clnt, net); pipefs_sb = rpc_get_sb_net(net); if (pipefs_sb) { @@ -426,6 +425,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, /* save the nodename */ rpc_clnt_set_nodename(clnt, nodename); + rpc_sysfs_client_setup(clnt, xps, rpc_net_ns(clnt)); err = rpc_client_register(clnt, args->authflavor, args->client_name); if (err) goto out_no_path; diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index ed9f7131543f..0aa63747f496 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -151,14 +151,29 @@ rpc_sysfs_xprt_switch_alloc(struct kobject *parent, return NULL; } -void rpc_sysfs_client_setup(struct rpc_clnt *clnt, struct net *net) +void rpc_sysfs_client_setup(struct rpc_clnt *clnt, + struct rpc_xprt_switch *xprt_switch, + struct net *net) { struct rpc_sysfs_client *rpc_client; - rpc_client = rpc_sysfs_client_alloc(rpc_sunrpc_client_kobj, net, clnt->cl_clid); + rpc_client = rpc_sysfs_client_alloc(rpc_sunrpc_client_kobj, + net, clnt->cl_clid); if (rpc_client) { + char name[] = "switch"; + struct rpc_sysfs_xprt_switch *xswitch = + (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs; + int ret; + clnt->cl_sysfs = rpc_client; + rpc_client->clnt = clnt; + rpc_client->xprt_switch = xprt_switch; kobject_uevent(&rpc_client->kobject, KOBJ_ADD); + ret = sysfs_create_link_nowarn(&rpc_client->kobject, + &xswitch->kobject, name); + if (ret) + pr_warn("can't create link to %s in sysfs (%d)\n", + name, ret); } } @@ -189,6 +204,9 @@ void rpc_sysfs_client_destroy(struct rpc_clnt *clnt) struct rpc_sysfs_client *rpc_client = clnt->cl_sysfs; if (rpc_client) { + char name[] = "switch"; + + sysfs_remove_link(&rpc_client->kobject, name); kobject_uevent(&rpc_client->kobject, KOBJ_REMOVE); kobject_del(&rpc_client->kobject); kobject_put(&rpc_client->kobject); diff --git a/net/sunrpc/sysfs.h b/net/sunrpc/sysfs.h index 52ec472bd4a9..760f0582aee5 100644 --- a/net/sunrpc/sysfs.h +++ b/net/sunrpc/sysfs.h @@ -8,6 +8,8 @@ struct rpc_sysfs_client { struct kobject kobject; struct net *net; + struct rpc_clnt *clnt; + struct rpc_xprt_switch *xprt_switch; }; struct rpc_sysfs_xprt_switch { @@ -20,7 +22,9 @@ struct rpc_sysfs_xprt_switch { int rpc_sysfs_init(void); void rpc_sysfs_exit(void); -void rpc_sysfs_client_setup(struct rpc_clnt *clnt, struct net *net); +void rpc_sysfs_client_setup(struct rpc_clnt *clnt, + struct rpc_xprt_switch *xprt_switch, + struct net *net); void rpc_sysfs_client_destroy(struct rpc_clnt *clnt); void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch, struct rpc_xprt *xprt, gfp_t gfp_flags); From d408ebe04ac58eb370e2d264e88edbab746adda6 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 8 Jun 2021 15:59:18 -0400 Subject: [PATCH 27/46] sunrpc: add add sysfs directory per xprt under each xprt_switch Add individual transport directories under each transport switch group. For instance, for each nconnect=X connections there will be a transport directory. Naming conventions also identifies transport type -- xprt-- where type is udp, tcp, rdma, local, bc. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 2 ++ net/sunrpc/sysfs.c | 67 +++++++++++++++++++++++++++++++++++++ net/sunrpc/sysfs.h | 8 +++++ net/sunrpc/xprtmultipath.c | 4 +++ 4 files changed, 81 insertions(+) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 7efc6c0a5455..8360db664e5f 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -183,6 +183,7 @@ enum xprt_transports { XPRT_TRANSPORT_LOCAL = 257, }; +struct rpc_sysfs_xprt; struct rpc_xprt { struct kref kref; /* Reference count */ const struct rpc_xprt_ops *ops; /* transport methods */ @@ -291,6 +292,7 @@ struct rpc_xprt { #endif struct rcu_head rcu; const struct xprt_class *xprt_class; + struct rpc_sysfs_xprt *xprt_sysfs; }; #if defined(CONFIG_SUNRPC_BACKCHANNEL) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 0aa63747f496..20f75708594f 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -82,6 +82,14 @@ static void rpc_sysfs_xprt_switch_release(struct kobject *kobj) kfree(xprt_switch); } +static void rpc_sysfs_xprt_release(struct kobject *kobj) +{ + struct rpc_sysfs_xprt *xprt; + + xprt = container_of(kobj, struct rpc_sysfs_xprt, kobject); + kfree(xprt); +} + static const void *rpc_sysfs_client_namespace(struct kobject *kobj) { return container_of(kobj, struct rpc_sysfs_client, kobject)->net; @@ -92,6 +100,12 @@ static const void *rpc_sysfs_xprt_switch_namespace(struct kobject *kobj) return container_of(kobj, struct rpc_sysfs_xprt_switch, kobject)->net; } +static const void *rpc_sysfs_xprt_namespace(struct kobject *kobj) +{ + return container_of(kobj, struct rpc_sysfs_xprt, + kobject)->xprt->xprt_net; +} + static struct kobj_type rpc_sysfs_client_type = { .release = rpc_sysfs_client_release, .sysfs_ops = &kobj_sysfs_ops, @@ -104,6 +118,12 @@ static struct kobj_type rpc_sysfs_xprt_switch_type = { .namespace = rpc_sysfs_xprt_switch_namespace, }; +static struct kobj_type rpc_sysfs_xprt_type = { + .release = rpc_sysfs_xprt_release, + .sysfs_ops = &kobj_sysfs_ops, + .namespace = rpc_sysfs_xprt_namespace, +}; + void rpc_sysfs_exit(void) { kobject_put(rpc_sunrpc_client_kobj); @@ -151,6 +171,25 @@ rpc_sysfs_xprt_switch_alloc(struct kobject *parent, return NULL; } +static struct rpc_sysfs_xprt *rpc_sysfs_xprt_alloc(struct kobject *parent, + struct rpc_xprt *xprt, + gfp_t gfp_flags) +{ + struct rpc_sysfs_xprt *p; + + p = kzalloc(sizeof(*p), gfp_flags); + if (!p) + goto out; + p->kobject.kset = rpc_sunrpc_kset; + if (kobject_init_and_add(&p->kobject, &rpc_sysfs_xprt_type, + parent, "xprt-%d-%s", xprt->id, + xprt->address_strings[RPC_DISPLAY_PROTO]) == 0) + return p; + kobject_put(&p->kobject); +out: + return NULL; +} + void rpc_sysfs_client_setup(struct rpc_clnt *clnt, struct rpc_xprt_switch *xprt_switch, struct net *net) @@ -199,6 +238,22 @@ void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch, } } +void rpc_sysfs_xprt_setup(struct rpc_xprt_switch *xprt_switch, + struct rpc_xprt *xprt, + gfp_t gfp_flags) +{ + struct rpc_sysfs_xprt *rpc_xprt; + struct rpc_sysfs_xprt_switch *switch_obj = + (struct rpc_sysfs_xprt_switch *)xprt_switch->xps_sysfs; + + rpc_xprt = rpc_sysfs_xprt_alloc(&switch_obj->kobject, xprt, gfp_flags); + if (rpc_xprt) { + xprt->xprt_sysfs = rpc_xprt; + rpc_xprt->xprt = xprt; + kobject_uevent(&rpc_xprt->kobject, KOBJ_ADD); + } +} + void rpc_sysfs_client_destroy(struct rpc_clnt *clnt) { struct rpc_sysfs_client *rpc_client = clnt->cl_sysfs; @@ -225,3 +280,15 @@ void rpc_sysfs_xprt_switch_destroy(struct rpc_xprt_switch *xprt_switch) xprt_switch->xps_sysfs = NULL; } } + +void rpc_sysfs_xprt_destroy(struct rpc_xprt *xprt) +{ + struct rpc_sysfs_xprt *rpc_xprt = xprt->xprt_sysfs; + + if (rpc_xprt) { + kobject_uevent(&rpc_xprt->kobject, KOBJ_REMOVE); + kobject_del(&rpc_xprt->kobject); + kobject_put(&rpc_xprt->kobject); + xprt->xprt_sysfs = NULL; + } +} diff --git a/net/sunrpc/sysfs.h b/net/sunrpc/sysfs.h index 760f0582aee5..ff10451de6fa 100644 --- a/net/sunrpc/sysfs.h +++ b/net/sunrpc/sysfs.h @@ -19,6 +19,11 @@ struct rpc_sysfs_xprt_switch { struct rpc_xprt *xprt; }; +struct rpc_sysfs_xprt { + struct kobject kobject; + struct rpc_xprt *xprt; +}; + int rpc_sysfs_init(void); void rpc_sysfs_exit(void); @@ -29,5 +34,8 @@ void rpc_sysfs_client_destroy(struct rpc_clnt *clnt); void rpc_sysfs_xprt_switch_setup(struct rpc_xprt_switch *xprt_switch, struct rpc_xprt *xprt, gfp_t gfp_flags); void rpc_sysfs_xprt_switch_destroy(struct rpc_xprt_switch *xprt); +void rpc_sysfs_xprt_setup(struct rpc_xprt_switch *xprt_switch, + struct rpc_xprt *xprt, gfp_t gfp_flags); +void rpc_sysfs_xprt_destroy(struct rpc_xprt *xprt); #endif diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 2d73a35df9ee..e7973c1ff70c 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -57,6 +57,7 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps, if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) xprt_switch_add_xprt_locked(xps, xprt); spin_unlock(&xps->xps_lock); + rpc_sysfs_xprt_setup(xps, xprt, GFP_KERNEL); } static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps, @@ -85,6 +86,7 @@ void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps, spin_lock(&xps->xps_lock); xprt_switch_remove_xprt_locked(xps, xprt); spin_unlock(&xps->xps_lock); + rpc_sysfs_xprt_destroy(xprt); xprt_put(xprt); } @@ -137,6 +139,7 @@ struct rpc_xprt_switch *xprt_switch_alloc(struct rpc_xprt *xprt, xps->xps_iter_ops = &rpc_xprt_iter_singular; rpc_sysfs_xprt_switch_setup(xps, xprt, gfp_flags); xprt_switch_add_xprt_locked(xps, xprt); + rpc_sysfs_xprt_setup(xps, xprt, gfp_flags); } return xps; @@ -152,6 +155,7 @@ static void xprt_switch_free_entries(struct rpc_xprt_switch *xps) struct rpc_xprt, xprt_switch); xprt_switch_remove_xprt_locked(xps, xprt); spin_unlock(&xps->xps_lock); + rpc_sysfs_xprt_destroy(xprt); xprt_put(xprt); spin_lock(&xps->xps_lock); } From e091853ebdb486fd8bde86b87178fdf3850914fc Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 23 Jun 2021 23:28:46 -0400 Subject: [PATCH 28/46] SUNRPC mark the first transport When an RPC client gets created it's first transport is special and should be marked a main transport. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/clnt.c | 1 + 2 files changed, 2 insertions(+) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 13a4eaf385cf..692e5946c029 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -293,6 +293,7 @@ struct rpc_xprt { struct rcu_head rcu; const struct xprt_class *xprt_class; struct rpc_sysfs_xprt *xprt_sysfs; + bool main; /*mark if this is the 1st transport */ }; #if defined(CONFIG_SUNRPC_BACKCHANNEL) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 9bf820bad84c..408618765aa5 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -412,6 +412,7 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, } rpc_clnt_set_transport(clnt, xprt, timeout); + xprt->main = true; xprt_iter_init(&clnt->cl_xpi, xps); xprt_switch_put(xps); From 0e65ea43d9c7c038e167b20165a0e9ed1e9cca83 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 23 Jun 2021 23:28:47 -0400 Subject: [PATCH 29/46] SUNRPC display xprt's main value in sysfs's xprt_info Display in sysfs in the information about the xprt if this is a main transport or not. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- net/sunrpc/sysfs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 2fbaba27d5c6..124f2e1e3409 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -103,10 +103,10 @@ static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, ret = sprintf(buf, "last_used=%lu\ncur_cong=%lu\ncong_win=%lu\n" "max_num_slots=%u\nmin_num_slots=%u\nnum_reqs=%u\n" "binding_q_len=%u\nsending_q_len=%u\npending_q_len=%u\n" - "backlog_q_len=%u\n", xprt->last_used, xprt->cong, - xprt->cwnd, xprt->max_reqs, xprt->min_reqs, + "backlog_q_len=%u\nmain_xprt=%d\n", xprt->last_used, + xprt->cong, xprt->cwnd, xprt->max_reqs, xprt->min_reqs, xprt->num_reqs, xprt->binding.qlen, xprt->sending.qlen, - xprt->pending.qlen, xprt->backlog.qlen); + xprt->pending.qlen, xprt->backlog.qlen, xprt->main); xprt_put(xprt); return ret + 1; } From a8482488a7d6d320f63a9ee1912dbb5ae5b80a61 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 23 Jun 2021 23:28:48 -0400 Subject: [PATCH 30/46] SUNRPC query transport's source port Provide ability to query transport's source port. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprtsock.h | 1 + net/sunrpc/xprtsock.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/include/linux/sunrpc/xprtsock.h b/include/linux/sunrpc/xprtsock.h index 3c1423ee74b4..8c2a712cb242 100644 --- a/include/linux/sunrpc/xprtsock.h +++ b/include/linux/sunrpc/xprtsock.h @@ -10,6 +10,7 @@ int init_socket_xprt(void); void cleanup_socket_xprt(void); +unsigned short get_srcport(struct rpc_xprt *); #define RPC_MIN_RESVPORT (1U) #define RPC_MAX_RESVPORT (65535U) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 2ad4d0df45fe..4611845ec1eb 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1653,6 +1653,13 @@ static int xs_get_srcport(struct sock_xprt *transport) return port; } +unsigned short get_srcport(struct rpc_xprt *xprt) +{ + struct sock_xprt *sock = container_of(xprt, struct sock_xprt, xprt); + return sock->srcport; +} +EXPORT_SYMBOL(get_srcport); + static unsigned short xs_next_srcport(struct sock_xprt *transport, unsigned short port) { if (transport->srcport != 0) From c1830a63c79aa90f725ed6feaad097473f2b990d Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 23 Jun 2021 23:28:49 -0400 Subject: [PATCH 31/46] SUNRPC for TCP display xprt's source port in sysfs xprt_info Using TCP connection's source port it is useful to match connections seen on the network traces to the xprts used by the linux nfs client. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- net/sunrpc/sysfs.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 124f2e1e3409..08aa503295b7 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "sysfs.h" @@ -103,10 +104,13 @@ static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, ret = sprintf(buf, "last_used=%lu\ncur_cong=%lu\ncong_win=%lu\n" "max_num_slots=%u\nmin_num_slots=%u\nnum_reqs=%u\n" "binding_q_len=%u\nsending_q_len=%u\npending_q_len=%u\n" - "backlog_q_len=%u\nmain_xprt=%d\n", xprt->last_used, - xprt->cong, xprt->cwnd, xprt->max_reqs, xprt->min_reqs, - xprt->num_reqs, xprt->binding.qlen, xprt->sending.qlen, - xprt->pending.qlen, xprt->backlog.qlen, xprt->main); + "backlog_q_len=%u\nmain_xprt=%d\nsrc_port=%u\n", + xprt->last_used, xprt->cong, xprt->cwnd, xprt->max_reqs, + xprt->min_reqs, xprt->num_reqs, xprt->binding.qlen, + xprt->sending.qlen, xprt->pending.qlen, + xprt->backlog.qlen, xprt->main, + (xprt->xprt_class->ident == XPRT_TRANSPORT_TCP) ? + get_srcport(xprt) : 0); xprt_put(xprt); return ret + 1; } From 587bc7255d26ca80b58026881db5fb3bf770cc43 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 8 Jun 2021 15:59:19 -0400 Subject: [PATCH 32/46] sunrpc: add dst_attr attributes to the sysfs xprt directory Allow to query and set the destination's address of a transport. Setting of the destination address is allowed only for TCP or RDMA based connections. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/sysfs.c | 105 ++++++++++++++++++++++++++++++++++++ net/sunrpc/xprt.c | 4 +- net/sunrpc/xprtmultipath.c | 2 - 4 files changed, 109 insertions(+), 3 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 8360db664e5f..13a4eaf385cf 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -414,6 +414,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); bool xprt_lock_connect(struct rpc_xprt *, struct rpc_task *, void *); void xprt_unlock_connect(struct rpc_xprt *, void *); +void xprt_release_write(struct rpc_xprt *, struct rpc_task *); /* * Reserved bit positions in xprt->state diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 20f75708594f..402924bbd743 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -4,8 +4,23 @@ */ #include #include +#include + #include "sysfs.h" +struct xprt_addr { + const char *addr; + struct rcu_head rcu; +}; + +static void free_xprt_addr(struct rcu_head *head) +{ + struct xprt_addr *addr = container_of(head, struct xprt_addr, rcu); + + kfree(addr->addr); + kfree(addr); +} + static struct kset *rpc_sunrpc_kset; static struct kobject *rpc_sunrpc_client_kobj, *rpc_sunrpc_xprt_switch_kobj; @@ -43,6 +58,87 @@ static struct kobject *rpc_sysfs_object_alloc(const char *name, return NULL; } +static inline struct rpc_xprt * +rpc_sysfs_xprt_kobj_get_xprt(struct kobject *kobj) +{ + struct rpc_sysfs_xprt *x = container_of(kobj, + struct rpc_sysfs_xprt, kobject); + + return xprt_get(x->xprt); +} + +static ssize_t rpc_sysfs_xprt_dstaddr_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + ssize_t ret; + + if (!xprt) + return 0; + ret = sprintf(buf, "%s\n", xprt->address_strings[RPC_DISPLAY_ADDR]); + xprt_put(xprt); + return ret + 1; +} + +static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + struct sockaddr *saddr; + char *dst_addr; + int port; + struct xprt_addr *saved_addr; + size_t buf_len; + + if (!xprt) + return 0; + if (!(xprt->xprt_class->ident == XPRT_TRANSPORT_TCP || + xprt->xprt_class->ident == XPRT_TRANSPORT_RDMA)) { + xprt_put(xprt); + return -EOPNOTSUPP; + } + + if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { + count = -EINTR; + goto out_put; + } + saddr = (struct sockaddr *)&xprt->addr; + port = rpc_get_port(saddr); + + /* buf_len is the len until the first occurence of either + * '\n' or '\0' + */ + buf_len = strcspn(buf, "\n"); + + dst_addr = kstrndup(buf, buf_len, GFP_KERNEL); + if (!dst_addr) + goto out_err; + saved_addr = kzalloc(sizeof(*saved_addr), GFP_KERNEL); + if (!saved_addr) + goto out_err_free; + saved_addr->addr = + rcu_dereference_raw(xprt->address_strings[RPC_DISPLAY_ADDR]); + rcu_assign_pointer(xprt->address_strings[RPC_DISPLAY_ADDR], dst_addr); + call_rcu(&saved_addr->rcu, free_xprt_addr); + xprt->addrlen = rpc_pton(xprt->xprt_net, buf, buf_len, saddr, + sizeof(*saddr)); + rpc_set_port(saddr, port); + + xprt_force_disconnect(xprt); +out: + xprt_release_write(xprt, NULL); +out_put: + xprt_put(xprt); + return count; +out_err_free: + kfree(dst_addr); +out_err: + count = -ENOMEM; + goto out; +} + int rpc_sysfs_init(void) { rpc_sunrpc_kset = kset_create_and_add("sunrpc", NULL, kernel_kobj); @@ -106,6 +202,14 @@ static const void *rpc_sysfs_xprt_namespace(struct kobject *kobj) kobject)->xprt->xprt_net; } +static struct kobj_attribute rpc_sysfs_xprt_dstaddr = __ATTR(dstaddr, + 0644, rpc_sysfs_xprt_dstaddr_show, rpc_sysfs_xprt_dstaddr_store); + +static struct attribute *rpc_sysfs_xprt_attrs[] = { + &rpc_sysfs_xprt_dstaddr.attr, + NULL, +}; + static struct kobj_type rpc_sysfs_client_type = { .release = rpc_sysfs_client_release, .sysfs_ops = &kobj_sysfs_ops, @@ -120,6 +224,7 @@ static struct kobj_type rpc_sysfs_xprt_switch_type = { static struct kobj_type rpc_sysfs_xprt_type = { .release = rpc_sysfs_xprt_release, + .default_attrs = rpc_sysfs_xprt_attrs, .sysfs_ops = &kobj_sysfs_ops, .namespace = rpc_sysfs_xprt_namespace, }; diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 20b9bd705014..fb6db09725c7 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -55,6 +55,7 @@ #include #include "sunrpc.h" +#include "sysfs.h" /* * Local variables @@ -443,7 +444,7 @@ void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) } EXPORT_SYMBOL_GPL(xprt_release_xprt_cong); -static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) +void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task != task) return; @@ -1812,6 +1813,7 @@ void xprt_free(struct rpc_xprt *xprt) put_net(xprt->xprt_net); xprt_free_all_slots(xprt); xprt_free_id(xprt); + rpc_sysfs_xprt_destroy(xprt); kfree_rcu(xprt, rcu); } EXPORT_SYMBOL_GPL(xprt_free); diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index e7973c1ff70c..07e76ae1028a 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -86,7 +86,6 @@ void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps, spin_lock(&xps->xps_lock); xprt_switch_remove_xprt_locked(xps, xprt); spin_unlock(&xps->xps_lock); - rpc_sysfs_xprt_destroy(xprt); xprt_put(xprt); } @@ -155,7 +154,6 @@ static void xprt_switch_free_entries(struct rpc_xprt_switch *xps) struct rpc_xprt, xprt_switch); xprt_switch_remove_xprt_locked(xps, xprt); spin_unlock(&xps->xps_lock); - rpc_sysfs_xprt_destroy(xprt); xprt_put(xprt); spin_lock(&xps->xps_lock); } From 5b7eb78486cd9ac58bfbd6d84ea0fe2d9fead03b Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 23 Jun 2021 23:28:50 -0400 Subject: [PATCH 33/46] SUNRPC: take a xprt offline using sysfs Using sysfs's xprt_state attribute, mark a particular transport offline. It will not be picked during the round-robin selection. It's not allowed to take the main (1st created transport associated with the rpc_client) offline. Also bring a transport back online via sysfs by writing "online" and that would allow for this transport to be picked during the round- robin selection. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/sysfs.c | 66 ++++++++++++++++++++++++++++++++++--- net/sunrpc/sysfs.h | 1 + net/sunrpc/xprtmultipath.c | 6 ++-- 4 files changed, 68 insertions(+), 6 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 692e5946c029..b8ed7fa1b4ca 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -427,6 +427,7 @@ void xprt_release_write(struct rpc_xprt *, struct rpc_task *); #define XPRT_BOUND (4) #define XPRT_BINDING (5) #define XPRT_CLOSING (6) +#define XPRT_OFFLINE (7) #define XPRT_CONGESTED (9) #define XPRT_CWND_WAIT (10) #define XPRT_WRITE_SPACE (11) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 08aa503295b7..a30ad18aa7dc 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -68,6 +68,15 @@ rpc_sysfs_xprt_kobj_get_xprt(struct kobject *kobj) return xprt_get(x->xprt); } +static inline struct rpc_xprt_switch * +rpc_sysfs_xprt_kobj_get_xprt_switch(struct kobject *kobj) +{ + struct rpc_sysfs_xprt *x = container_of(kobj, + struct rpc_sysfs_xprt, kobject); + + return xprt_switch_get(x->xprt_switch); +} + static inline struct rpc_xprt_switch * rpc_sysfs_xprt_switch_kobj_get_xprt(struct kobject *kobj) { @@ -122,7 +131,7 @@ static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj, struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); ssize_t ret; int locked, connected, connecting, close_wait, bound, binding, - closing, congested, cwnd_wait, write_space; + closing, congested, cwnd_wait, write_space, offline; if (!xprt) return 0; @@ -140,8 +149,9 @@ static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj, congested = test_bit(XPRT_CONGESTED, &xprt->state); cwnd_wait = test_bit(XPRT_CWND_WAIT, &xprt->state); write_space = test_bit(XPRT_WRITE_SPACE, &xprt->state); + offline = test_bit(XPRT_OFFLINE, &xprt->state); - ret = sprintf(buf, "state=%s %s %s %s %s %s %s %s %s %s\n", + ret = sprintf(buf, "state=%s %s %s %s %s %s %s %s %s %s %s\n", locked ? "LOCKED" : "", connected ? "CONNECTED" : "", connecting ? "CONNECTING" : "", @@ -151,7 +161,8 @@ static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj, closing ? "CLOSING" : "", congested ? "CONGESTED" : "", cwnd_wait ? "CWND_WAIT" : "", - write_space ? "WRITE_SPACE" : ""); + write_space ? "WRITE_SPACE" : "", + offline ? "OFFLINE" : ""); } xprt_put(xprt); @@ -233,6 +244,52 @@ out_err: goto out; } +static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + int offline = 0, online = 0; + struct rpc_xprt_switch *xps = rpc_sysfs_xprt_kobj_get_xprt_switch(kobj); + + if (!xprt) + return 0; + + if (!strncmp(buf, "offline", 7)) + offline = 1; + else if (!strncmp(buf, "online", 6)) + online = 1; + else + return -EINVAL; + + if (wait_on_bit_lock(&xprt->state, XPRT_LOCKED, TASK_KILLABLE)) { + count = -EINTR; + goto out_put; + } + if (xprt->main) { + count = -EINVAL; + goto release_tasks; + } + if (offline) { + set_bit(XPRT_OFFLINE, &xprt->state); + spin_lock(&xps->xps_lock); + xps->xps_nactive--; + spin_unlock(&xps->xps_lock); + } else if (online) { + clear_bit(XPRT_OFFLINE, &xprt->state); + spin_lock(&xps->xps_lock); + xps->xps_nactive++; + spin_unlock(&xps->xps_lock); + } + +release_tasks: + xprt_release_write(xprt, NULL); +out_put: + xprt_put(xprt); + xprt_switch_put(xps); + return count; +} + int rpc_sysfs_init(void) { rpc_sunrpc_kset = kset_create_and_add("sunrpc", NULL, kernel_kobj); @@ -303,7 +360,7 @@ static struct kobj_attribute rpc_sysfs_xprt_info = __ATTR(xprt_info, 0444, rpc_sysfs_xprt_info_show, NULL); static struct kobj_attribute rpc_sysfs_xprt_change_state = __ATTR(xprt_state, - 0644, rpc_sysfs_xprt_state_show, NULL); + 0644, rpc_sysfs_xprt_state_show, rpc_sysfs_xprt_state_change); static struct attribute *rpc_sysfs_xprt_attrs[] = { &rpc_sysfs_xprt_dstaddr.attr, @@ -466,6 +523,7 @@ void rpc_sysfs_xprt_setup(struct rpc_xprt_switch *xprt_switch, if (rpc_xprt) { xprt->xprt_sysfs = rpc_xprt; rpc_xprt->xprt = xprt; + rpc_xprt->xprt_switch = xprt_switch; kobject_uevent(&rpc_xprt->kobject, KOBJ_ADD); } } diff --git a/net/sunrpc/sysfs.h b/net/sunrpc/sysfs.h index ff10451de6fa..6620cebd1037 100644 --- a/net/sunrpc/sysfs.h +++ b/net/sunrpc/sysfs.h @@ -22,6 +22,7 @@ struct rpc_sysfs_xprt_switch { struct rpc_sysfs_xprt { struct kobject kobject; struct rpc_xprt *xprt; + struct rpc_xprt_switch *xprt_switch; }; int rpc_sysfs_init(void); diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 07e76ae1028a..5f4845d1e922 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -65,7 +65,8 @@ static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps, { if (unlikely(xprt == NULL)) return; - xps->xps_nactive--; + if (!test_bit(XPRT_OFFLINE, &xprt->state)) + xps->xps_nactive--; xps->xps_nxprts--; if (xps->xps_nxprts == 0) xps->xps_net = NULL; @@ -230,7 +231,8 @@ void xprt_iter_default_rewind(struct rpc_xprt_iter *xpi) static bool xprt_is_active(const struct rpc_xprt *xprt) { - return kref_read(&xprt->kref) != 0; + return (kref_read(&xprt->kref) != 0 && + !test_bit(XPRT_OFFLINE, &xprt->state)); } static From 4a09651a6b28748af401a1dd5cf9cea06c3aa329 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 8 Jun 2021 15:59:20 -0400 Subject: [PATCH 34/46] sunrpc: provide transport info in the sysfs directory Allow to query transport's attributes. Currently showing following fields of the rpc_xprt structure: state, last_used, cong, cwnd, max_reqs, min_reqs, num_reqs, sizes of queues binding, sending, pending, backlog. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- net/sunrpc/sysfs.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index 402924bbd743..f770eb7301b5 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -81,6 +81,27 @@ static ssize_t rpc_sysfs_xprt_dstaddr_show(struct kobject *kobj, return ret + 1; } +static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + ssize_t ret; + + if (!xprt) + return 0; + + ret = sprintf(buf, "last_used=%lu\ncur_cong=%lu\ncong_win=%lu\n" + "max_num_slots=%u\nmin_num_slots=%u\nnum_reqs=%u\n" + "binding_q_len=%u\nsending_q_len=%u\npending_q_len=%u\n" + "backlog_q_len=%u\n", xprt->last_used, xprt->cong, + xprt->cwnd, xprt->max_reqs, xprt->min_reqs, + xprt->num_reqs, xprt->binding.qlen, xprt->sending.qlen, + xprt->pending.qlen, xprt->backlog.qlen); + xprt_put(xprt); + return ret + 1; +} + static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) @@ -205,8 +226,12 @@ static const void *rpc_sysfs_xprt_namespace(struct kobject *kobj) static struct kobj_attribute rpc_sysfs_xprt_dstaddr = __ATTR(dstaddr, 0644, rpc_sysfs_xprt_dstaddr_show, rpc_sysfs_xprt_dstaddr_store); +static struct kobj_attribute rpc_sysfs_xprt_info = __ATTR(xprt_info, + 0444, rpc_sysfs_xprt_info_show, NULL); + static struct attribute *rpc_sysfs_xprt_attrs[] = { &rpc_sysfs_xprt_dstaddr.attr, + &rpc_sysfs_xprt_info.attr, NULL, }; From 85e39feead948bdf8322c961d7a9bebc20d629f3 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 23 Jun 2021 23:28:51 -0400 Subject: [PATCH 35/46] NFSv4.1 identify and mark RPC tasks that can move between transports In preparation for when we can re-try a task on a different transport, identify and mark such RPC tasks as moveable. Only 4.1+ operarations can be re-tried on a different transport. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 38 +++++++++++++++++++++++++++++++----- fs/nfs/pagelist.c | 8 ++++++-- fs/nfs/write.c | 6 +++++- include/linux/sunrpc/sched.h | 2 ++ 4 files changed, 46 insertions(+), 8 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index e653654c10bc..d3ee3700c9dd 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1155,7 +1155,11 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res) { - return nfs4_do_call_sync(clnt, server, msg, args, res, 0); + unsigned short task_flags = 0; + + if (server->nfs_client->cl_minorversion) + task_flags = RPC_TASK_MOVEABLE; + return nfs4_do_call_sync(clnt, server, msg, args, res, task_flags); } @@ -2569,6 +2573,9 @@ static int nfs4_run_open_task(struct nfs4_opendata *data, }; int status; + if (server->nfs_client->cl_minorversion) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + kref_get(&data->kref); data->rpc_done = false; data->rpc_status = 0; @@ -3749,6 +3756,9 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) }; int status = -ENOMEM; + if (server->nfs_client->cl_minorversion) + task_setup_data.flags |= RPC_TASK_MOVEABLE; + nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_CLEANUP, &task_setup_data.rpc_client, &msg); @@ -4188,6 +4198,9 @@ static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, }; unsigned short task_flags = 0; + if (nfs4_has_session(server->nfs_client)) + task_flags = RPC_TASK_MOVEABLE; + /* Is this is an attribute revalidation, subject to softreval? */ if (inode && (server->flags & NFS_MOUNT_SOFTREVAL)) task_flags |= RPC_TASK_TIMEOUT; @@ -4307,6 +4320,9 @@ static int _nfs4_proc_lookup(struct rpc_clnt *clnt, struct inode *dir, }; unsigned short task_flags = 0; + if (server->nfs_client->cl_minorversion) + task_flags = RPC_TASK_MOVEABLE; + /* Is this is an attribute revalidation, subject to softreval? */ if (nfs_lookup_is_soft_revalidate(dentry)) task_flags |= RPC_TASK_TIMEOUT; @@ -6538,7 +6554,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred, .rpc_client = server->client, .rpc_message = &msg, .callback_ops = &nfs4_delegreturn_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT | RPC_TASK_MOVEABLE, }; int status = 0; @@ -6856,6 +6872,11 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; + struct nfs_client *client = + NFS_SERVER(lsp->ls_state->inode)->nfs_client; + + if (client->cl_minorversion) + task_setup_data.flags |= RPC_TASK_MOVEABLE; nfs4_state_protect(NFS_SERVER(lsp->ls_state->inode)->nfs_client, NFS_SP4_MACH_CRED_CLEANUP, &task_setup_data.rpc_client, &msg); @@ -7130,6 +7151,10 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, }; int ret; + struct nfs_client *client = NFS_SERVER(state->inode)->nfs_client; + + if (client->cl_minorversion) + task_setup_data.flags |= RPC_TASK_MOVEABLE; dprintk("%s: begin!\n", __func__); data = nfs4_alloc_lockdata(fl, nfs_file_open_context(fl->fl_file), @@ -9186,7 +9211,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, .rpc_client = clp->cl_rpcclient, .rpc_message = &msg, .callback_ops = &nfs41_sequence_ops, - .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT, + .flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT | RPC_TASK_MOVEABLE, }; struct rpc_task *ret; @@ -9509,7 +9534,8 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) .rpc_message = &msg, .callback_ops = &nfs4_layoutget_call_ops, .callback_data = lgp, - .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF, + .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | + RPC_TASK_MOVEABLE, }; struct pnfs_layout_segment *lseg = NULL; struct nfs4_exception exception = { @@ -9650,6 +9676,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) .rpc_message = &msg, .callback_ops = &nfs4_layoutreturn_call_ops, .callback_data = lrp, + .flags = RPC_TASK_MOVEABLE, }; int status = 0; @@ -9804,6 +9831,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) .rpc_message = &msg, .callback_ops = &nfs4_layoutcommit_ops, .callback_data = data, + .flags = RPC_TASK_MOVEABLE, }; struct rpc_task *task; int status = 0; @@ -10131,7 +10159,7 @@ static int nfs41_free_stateid(struct nfs_server *server, .rpc_client = server->client, .rpc_message = &msg, .callback_ops = &nfs41_free_stateid_ops, - .flags = RPC_TASK_ASYNC, + .flags = RPC_TASK_ASYNC | RPC_TASK_MOVEABLE, }; struct nfs_free_stateid_data *data; struct rpc_task *task; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index cf9cc62ec48e..cc232d1f16f2 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -954,6 +954,7 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) { struct nfs_pgio_header *hdr; int ret; + unsigned short task_flags = 0; hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { @@ -962,14 +963,17 @@ static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) } nfs_pgheader_init(desc, hdr, nfs_pgio_header_free); ret = nfs_generic_pgio(desc, hdr); - if (ret == 0) + if (ret == 0) { + if (NFS_SERVER(hdr->inode)->nfs_client->cl_minorversion) + task_flags = RPC_TASK_MOVEABLE; ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), hdr, hdr->cred, NFS_PROTO(hdr->inode), desc->pg_rpc_callops, desc->pg_ioflags, - RPC_TASK_CRED_NOREF); + RPC_TASK_CRED_NOREF | task_flags); + } return ret; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3bf82178166a..eae9bf114041 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1810,6 +1810,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo) { struct nfs_commit_data *data; + unsigned short task_flags = 0; /* another commit raced with us */ if (list_empty(head)) @@ -1820,8 +1821,11 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, /* Set up the argument struct */ nfs_init_commit(data, head, NULL, cinfo); atomic_inc(&cinfo->mds->rpcs_out); + if (NFS_SERVER(inode)->nfs_client->cl_minorversion) + task_flags = RPC_TASK_MOVEABLE; return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode), - data->mds_ops, how, RPC_TASK_CRED_NOREF); + data->mds_ops, how, + RPC_TASK_CRED_NOREF | task_flags); } /* diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h index df696efdd675..a237b8dbf608 100644 --- a/include/linux/sunrpc/sched.h +++ b/include/linux/sunrpc/sched.h @@ -121,6 +121,7 @@ struct rpc_task_setup { */ #define RPC_TASK_ASYNC 0x0001 /* is an async task */ #define RPC_TASK_SWAPPER 0x0002 /* is swapping in/out */ +#define RPC_TASK_MOVEABLE 0x0004 /* nfs4.1+ rpc tasks */ #define RPC_TASK_NULLCREDS 0x0010 /* Use AUTH_NULL credential */ #define RPC_CALL_MAJORSEEN 0x0020 /* major timeout seen */ #define RPC_TASK_ROOTCREDS 0x0040 /* force root creds */ @@ -139,6 +140,7 @@ struct rpc_task_setup { #define RPC_IS_SOFT(t) ((t)->tk_flags & (RPC_TASK_SOFT|RPC_TASK_TIMEOUT)) #define RPC_IS_SOFTCONN(t) ((t)->tk_flags & RPC_TASK_SOFTCONN) #define RPC_WAS_SENT(t) ((t)->tk_flags & RPC_TASK_SENT) +#define RPC_IS_MOVEABLE(t) ((t)->tk_flags & RPC_TASK_MOVEABLE) #define RPC_TASK_RUNNING 0 #define RPC_TASK_QUEUED 1 From 0e5590358770ae779f3a8f5c36a3fbde40e344dc Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 8 Jun 2021 15:59:21 -0400 Subject: [PATCH 36/46] sunrpc: provide multipath info in the sysfs directory Allow to query xrpt_switch attributes. Currently showing the following fields of the rpc_xprt_switch structure: xps_nxprts, xps_nactive, xps_queuelen. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- net/sunrpc/sysfs.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index f770eb7301b5..b3bc76bbc2a0 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -67,6 +67,15 @@ rpc_sysfs_xprt_kobj_get_xprt(struct kobject *kobj) return xprt_get(x->xprt); } +static inline struct rpc_xprt_switch * +rpc_sysfs_xprt_switch_kobj_get_xprt(struct kobject *kobj) +{ + struct rpc_sysfs_xprt_switch *x = container_of(kobj, + struct rpc_sysfs_xprt_switch, kobject); + + return xprt_switch_get(x->xprt_switch); +} + static ssize_t rpc_sysfs_xprt_dstaddr_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -102,6 +111,23 @@ static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, return ret + 1; } +static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_xprt_switch *xprt_switch = + rpc_sysfs_xprt_switch_kobj_get_xprt(kobj); + ssize_t ret; + + if (!xprt_switch) + return 0; + ret = sprintf(buf, "num_xprts=%u\nnum_active=%u\nqueue_len=%ld\n", + xprt_switch->xps_nxprts, xprt_switch->xps_nactive, + atomic_long_read(&xprt_switch->xps_queuelen)); + xprt_switch_put(xprt_switch); + return ret + 1; +} + static ssize_t rpc_sysfs_xprt_dstaddr_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) @@ -235,6 +261,14 @@ static struct attribute *rpc_sysfs_xprt_attrs[] = { NULL, }; +static struct kobj_attribute rpc_sysfs_xprt_switch_info = + __ATTR(xprt_switch_info, 0444, rpc_sysfs_xprt_switch_info_show, NULL); + +static struct attribute *rpc_sysfs_xprt_switch_attrs[] = { + &rpc_sysfs_xprt_switch_info.attr, + NULL, +}; + static struct kobj_type rpc_sysfs_client_type = { .release = rpc_sysfs_client_release, .sysfs_ops = &kobj_sysfs_ops, @@ -243,6 +277,7 @@ static struct kobj_type rpc_sysfs_client_type = { static struct kobj_type rpc_sysfs_xprt_switch_type = { .release = rpc_sysfs_xprt_switch_release, + .default_attrs = rpc_sysfs_xprt_switch_attrs, .sysfs_ops = &kobj_sysfs_ops, .namespace = rpc_sysfs_xprt_switch_namespace, }; From 6a2840590b66e4914d583be61e40445386bb5835 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 23 Jun 2021 23:28:52 -0400 Subject: [PATCH 37/46] sunrpc: display xprt's queuelen of assigned tasks via sysfs Once a task grabs a trasnport it's reflected in the queuelen of the rpc_xprt structure. Add display of that value in the xprt's info file in sysfs. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- net/sunrpc/sysfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index a30ad18aa7dc..b576c7f06829 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -113,13 +113,15 @@ static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, ret = sprintf(buf, "last_used=%lu\ncur_cong=%lu\ncong_win=%lu\n" "max_num_slots=%u\nmin_num_slots=%u\nnum_reqs=%u\n" "binding_q_len=%u\nsending_q_len=%u\npending_q_len=%u\n" - "backlog_q_len=%u\nmain_xprt=%d\nsrc_port=%u\n", + "backlog_q_len=%u\nmain_xprt=%d\nsrc_port=%u\n" + "tasks_queuelen=%ld\n", xprt->last_used, xprt->cong, xprt->cwnd, xprt->max_reqs, xprt->min_reqs, xprt->num_reqs, xprt->binding.qlen, xprt->sending.qlen, xprt->pending.qlen, xprt->backlog.qlen, xprt->main, (xprt->xprt_class->ident == XPRT_TRANSPORT_TCP) ? - get_srcport(xprt) : 0); + get_srcport(xprt) : 0, + atomic_long_read(&xprt->queuelen)); xprt_put(xprt); return ret + 1; } From 681d5699cbe734031c125cd5ca91826268af4568 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 8 Jun 2021 15:59:22 -0400 Subject: [PATCH 38/46] sunrpc: provide showing transport's state info in the sysfs directory In preparation of being able to change the xprt's state, add a way to show currect state of the transport. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- net/sunrpc/sysfs.c | 47 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index b3bc76bbc2a0..2fbaba27d5c6 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -111,6 +111,49 @@ static ssize_t rpc_sysfs_xprt_info_show(struct kobject *kobj, return ret + 1; } +static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); + ssize_t ret; + int locked, connected, connecting, close_wait, bound, binding, + closing, congested, cwnd_wait, write_space; + + if (!xprt) + return 0; + + if (!xprt->state) { + ret = sprintf(buf, "state=CLOSED\n"); + } else { + locked = test_bit(XPRT_LOCKED, &xprt->state); + connected = test_bit(XPRT_CONNECTED, &xprt->state); + connecting = test_bit(XPRT_CONNECTING, &xprt->state); + close_wait = test_bit(XPRT_CLOSE_WAIT, &xprt->state); + bound = test_bit(XPRT_BOUND, &xprt->state); + binding = test_bit(XPRT_BINDING, &xprt->state); + closing = test_bit(XPRT_CLOSING, &xprt->state); + congested = test_bit(XPRT_CONGESTED, &xprt->state); + cwnd_wait = test_bit(XPRT_CWND_WAIT, &xprt->state); + write_space = test_bit(XPRT_WRITE_SPACE, &xprt->state); + + ret = sprintf(buf, "state=%s %s %s %s %s %s %s %s %s %s\n", + locked ? "LOCKED" : "", + connected ? "CONNECTED" : "", + connecting ? "CONNECTING" : "", + close_wait ? "CLOSE_WAIT" : "", + bound ? "BOUND" : "", + binding ? "BOUNDING" : "", + closing ? "CLOSING" : "", + congested ? "CONGESTED" : "", + cwnd_wait ? "CWND_WAIT" : "", + write_space ? "WRITE_SPACE" : ""); + } + + xprt_put(xprt); + return ret + 1; +} + static ssize_t rpc_sysfs_xprt_switch_info_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -255,9 +298,13 @@ static struct kobj_attribute rpc_sysfs_xprt_dstaddr = __ATTR(dstaddr, static struct kobj_attribute rpc_sysfs_xprt_info = __ATTR(xprt_info, 0444, rpc_sysfs_xprt_info_show, NULL); +static struct kobj_attribute rpc_sysfs_xprt_change_state = __ATTR(xprt_state, + 0644, rpc_sysfs_xprt_state_show, NULL); + static struct attribute *rpc_sysfs_xprt_attrs[] = { &rpc_sysfs_xprt_dstaddr.attr, &rpc_sysfs_xprt_info.attr, + &rpc_sysfs_xprt_change_state.attr, NULL, }; From 6f081693e7b2ba63422b735684b05a850a6351ba Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Wed, 23 Jun 2021 23:28:53 -0400 Subject: [PATCH 39/46] sunrpc: remove an offlined xprt using sysfs Once a transport has been put offline, this transport can be also removed from the list of transports. Any tasks that have been stuck on this transport would find the next available active transport and be re-tried. This transport would be removed from the xprt_switch list and freed. Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 1 + net/sunrpc/clnt.c | 24 ++++++++++++++++++++++++ net/sunrpc/sysfs.c | 26 ++++++++++++++++++++++---- 3 files changed, 47 insertions(+), 4 deletions(-) diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index b8ed7fa1b4ca..c8c39f22d3b1 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -428,6 +428,7 @@ void xprt_release_write(struct rpc_xprt *, struct rpc_task *); #define XPRT_BINDING (5) #define XPRT_CLOSING (6) #define XPRT_OFFLINE (7) +#define XPRT_REMOVE (8) #define XPRT_CONGESTED (9) #define XPRT_CWND_WAIT (10) #define XPRT_WRITE_SPACE (11) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 408618765aa5..8b4de70e8ead 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2106,6 +2106,30 @@ call_connect_status(struct rpc_task *task) case -ENOTCONN: case -EAGAIN: case -ETIMEDOUT: + if (!(task->tk_flags & RPC_TASK_NO_ROUND_ROBIN) && + (task->tk_flags & RPC_TASK_MOVEABLE) && + test_bit(XPRT_REMOVE, &xprt->state)) { + struct rpc_xprt *saved = task->tk_xprt; + struct rpc_xprt_switch *xps; + + rcu_read_lock(); + xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); + rcu_read_unlock(); + if (xps->xps_nxprts > 1) { + long value; + + xprt_release(task); + value = atomic_long_dec_return(&xprt->queuelen); + if (value == 0) + rpc_xprt_switch_remove_xprt(xps, saved); + xprt_put(saved); + task->tk_xprt = NULL; + task->tk_action = call_start; + } + xprt_switch_put(xps); + if (!task->tk_xprt) + return; + } goto out_retry; case -ENOBUFS: rpc_delay(task, HZ >> 2); diff --git a/net/sunrpc/sysfs.c b/net/sunrpc/sysfs.c index b576c7f06829..64da3bfd28e6 100644 --- a/net/sunrpc/sysfs.c +++ b/net/sunrpc/sysfs.c @@ -133,7 +133,7 @@ static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj, struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); ssize_t ret; int locked, connected, connecting, close_wait, bound, binding, - closing, congested, cwnd_wait, write_space, offline; + closing, congested, cwnd_wait, write_space, offline, remove; if (!xprt) return 0; @@ -152,8 +152,9 @@ static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj, cwnd_wait = test_bit(XPRT_CWND_WAIT, &xprt->state); write_space = test_bit(XPRT_WRITE_SPACE, &xprt->state); offline = test_bit(XPRT_OFFLINE, &xprt->state); + remove = test_bit(XPRT_REMOVE, &xprt->state); - ret = sprintf(buf, "state=%s %s %s %s %s %s %s %s %s %s %s\n", + ret = sprintf(buf, "state=%s %s %s %s %s %s %s %s %s %s %s %s\n", locked ? "LOCKED" : "", connected ? "CONNECTED" : "", connecting ? "CONNECTING" : "", @@ -164,7 +165,8 @@ static ssize_t rpc_sysfs_xprt_state_show(struct kobject *kobj, congested ? "CONGESTED" : "", cwnd_wait ? "CWND_WAIT" : "", write_space ? "WRITE_SPACE" : "", - offline ? "OFFLINE" : ""); + offline ? "OFFLINE" : "", + remove ? "REMOVE" : ""); } xprt_put(xprt); @@ -251,7 +253,7 @@ static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj, const char *buf, size_t count) { struct rpc_xprt *xprt = rpc_sysfs_xprt_kobj_get_xprt(kobj); - int offline = 0, online = 0; + int offline = 0, online = 0, remove = 0; struct rpc_xprt_switch *xps = rpc_sysfs_xprt_kobj_get_xprt_switch(kobj); if (!xprt) @@ -261,6 +263,8 @@ static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj, offline = 1; else if (!strncmp(buf, "online", 6)) online = 1; + else if (!strncmp(buf, "remove", 6)) + remove = 1; else return -EINVAL; @@ -282,6 +286,20 @@ static ssize_t rpc_sysfs_xprt_state_change(struct kobject *kobj, spin_lock(&xps->xps_lock); xps->xps_nactive++; spin_unlock(&xps->xps_lock); + } else if (remove) { + if (test_bit(XPRT_OFFLINE, &xprt->state)) { + set_bit(XPRT_REMOVE, &xprt->state); + xprt_force_disconnect(xprt); + if (test_bit(XPRT_CONNECTED, &xprt->state)) { + if (!xprt->sending.qlen && + !xprt->pending.qlen && + !xprt->backlog.qlen && + !atomic_long_read(&xprt->queuelen)) + rpc_xprt_switch_remove_xprt(xps, xprt); + } + } else { + count = -EINVAL; + } } release_tasks: From e0340f16a08d031de54ed91d26f57c9a966a776a Mon Sep 17 00:00:00 2001 From: Dave Wysochanski Date: Tue, 29 Jun 2021 05:11:28 -0400 Subject: [PATCH 40/46] NFS: Ensure nfs_readpage returns promptly when internal error occurs A previous refactoring of nfs_readpage() might end up calling wait_on_page_locked_killable() even if readpage_async_filler() failed with an internal error and pg_error was non-zero (for example, if nfs_create_request() failed). In the case of an internal error, skip over wait_on_page_locked_killable() as this is only needed when the read is sent and an error occurs during completion handling. Signed-off-by: Dave Wysochanski Signed-off-by: Trond Myklebust --- fs/nfs/read.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 684a730f6670..eb390eb618b3 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -373,10 +373,10 @@ int nfs_readpage(struct file *file, struct page *page) &nfs_async_read_completion_ops); ret = readpage_async_filler(&desc, page); + if (ret) + goto out; - if (!ret) - nfs_pageio_complete_read(&desc.pgio); - + nfs_pageio_complete_read(&desc.pgio); ret = desc.pgio.pg_error < 0 ? desc.pgio.pg_error : 0; if (!ret) { ret = wait_on_page_locked_killable(page); From ba512c1bc3232124567a59a3995c773dc79716e8 Mon Sep 17 00:00:00 2001 From: Dave Wysochanski Date: Tue, 29 Jun 2021 13:13:57 -0400 Subject: [PATCH 41/46] NFS: Fix fscache read from NFS after cache error Earlier commits refactored some NFS read code and removed nfs_readpage_async(), but neglected to properly fixup nfs_readpage_from_fscache_complete(). The code path is only hit when something unusual occurs with the cachefiles backing filesystem, such as an IO error or while a cookie is being invalidated. Mark page with PG_checked if fscache IO completes in error, unlock the page, and let the VM decide to re-issue based on PG_uptodate. When the VM reissues the readpage, PG_checked allows us to skip over fscache and read from the server. Link: https://marc.info/?l=linux-nfs&m=162498209518739 Fixes: 1e83b173b266 ("NFS: Add nfs_pageio_complete_read() and remove nfs_readpage_async()") Signed-off-by: Dave Wysochanski Signed-off-by: Trond Myklebust --- fs/nfs/fscache.c | 18 +++++++++++++----- fs/nfs/read.c | 5 +++-- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index c4c021c6ebbd..d743629e05e1 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -385,12 +385,15 @@ static void nfs_readpage_from_fscache_complete(struct page *page, "NFS: readpage_from_fscache_complete (0x%p/0x%p/%d)\n", page, context, error); - /* if the read completes with an error, we just unlock the page and let - * the VM reissue the readpage */ - if (!error) { + /* + * If the read completes with an error, mark the page with PG_checked, + * unlock the page, and let the VM reissue the readpage. + */ + if (!error) SetPageUptodate(page); - unlock_page(page); - } + else + SetPageChecked(page); + unlock_page(page); } /* @@ -405,6 +408,11 @@ int __nfs_readpage_from_fscache(struct nfs_open_context *ctx, "NFS: readpage_from_fscache(fsc:%p/p:%p(i:%lx f:%lx)/0x%p)\n", nfs_i_fscache(inode), page, page->index, page->flags, inode); + if (PageChecked(page)) { + ClearPageChecked(page); + return 1; + } + ret = fscache_read_or_alloc_page(nfs_i_fscache(inode), page, nfs_readpage_from_fscache_complete, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index eb390eb618b3..9f39e0a1a38b 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -362,13 +362,13 @@ int nfs_readpage(struct file *file, struct page *page) } else desc.ctx = get_nfs_open_context(nfs_file_open_context(file)); + xchg(&desc.ctx->error, 0); if (!IS_SYNC(inode)) { ret = nfs_readpage_from_fscache(desc.ctx, inode, page); if (ret == 0) - goto out; + goto out_wait; } - xchg(&desc.ctx->error, 0); nfs_pageio_init_read(&desc.pgio, inode, false, &nfs_async_read_completion_ops); @@ -378,6 +378,7 @@ int nfs_readpage(struct file *file, struct page *page) nfs_pageio_complete_read(&desc.pgio); ret = desc.pgio.pg_error < 0 ? desc.pgio.pg_error : 0; +out_wait: if (!ret) { ret = wait_on_page_locked_killable(page); if (!PageUptodate(page) && !ret) From aa95edf309ef31e2df4a37ebf0e5c2ca2a6772ab Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 2 Jul 2021 16:37:15 -0400 Subject: [PATCH 42/46] NFSv4/pnfs: Fix the layout barrier update If we have multiple outstanding layoutget requests, the current code to update the layout barrier assumes that the outstanding layout stateids are updated in order. That's not necessarily the case. Instead of using the value of lo->plh_outstanding as a guesstimate for the window of values we need to accept, just wait to update the window until we're processing the last one. The intention here is just to ensure that we don't process 2^31 seqid updates without also updating the barrier. Fixes: 1bcf34fdac5f ("pNFS/NFSv4: Update the layout barrier when we schedule a layoutreturn") Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 2c01ee805306..ffe02e43f8c0 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -966,10 +966,8 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, const struct cred *cred, bool update_barrier) { - u32 oldseq, newseq, new_barrier = 0; - - oldseq = be32_to_cpu(lo->plh_stateid.seqid); - newseq = be32_to_cpu(new->seqid); + u32 oldseq = be32_to_cpu(lo->plh_stateid.seqid); + u32 newseq = be32_to_cpu(new->seqid); if (!pnfs_layout_is_valid(lo)) { pnfs_set_layout_cred(lo, cred); @@ -979,19 +977,21 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); return; } - if (pnfs_seqid_is_newer(newseq, oldseq)) { + + if (pnfs_seqid_is_newer(newseq, oldseq)) nfs4_stateid_copy(&lo->plh_stateid, new); - /* - * Because of wraparound, we want to keep the barrier - * "close" to the current seqids. - */ - new_barrier = newseq - atomic_read(&lo->plh_outstanding); - } - if (update_barrier) - new_barrier = be32_to_cpu(new->seqid); - else if (new_barrier == 0) + + if (update_barrier) { + pnfs_barrier_update(lo, newseq); return; - pnfs_barrier_update(lo, new_barrier); + } + /* + * Because of wraparound, we want to keep the barrier + * "close" to the current seqids. We really only want to + * get here from a layoutget call. + */ + if (atomic_read(&lo->plh_outstanding) == 1) + pnfs_barrier_update(lo, be32_to_cpu(lo->plh_stateid.seqid)); } static bool From 0b77f97a7e42adc72bd566ff8cb733ea426f74f6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 2 Jul 2021 19:48:41 -0400 Subject: [PATCH 43/46] NFSv4/pnfs: Fix layoutget behaviour after invalidation If the layout gets invalidated, we should wait for any outstanding layoutget requests for that layout to complete, and we should resend them only after re-establishing the layout stateid. Fixes: d29b468da4f9 ("pNFS/NFSv4: Improve rejection of out-of-order layouts") Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index ffe02e43f8c0..be960e47d7f6 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2014,7 +2014,7 @@ lookup_again: * If the layout segment list is empty, but there are outstanding * layoutget calls, then they might be subject to a layoutrecall. */ - if (list_empty(&lo->plh_segs) && + if ((list_empty(&lo->plh_segs) || !pnfs_layout_is_valid(lo)) && atomic_read(&lo->plh_outstanding) != 0) { spin_unlock(&ino->i_lock); lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding, @@ -2390,11 +2390,13 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget; } + if (!pnfs_layout_is_valid(lo) && !pnfs_is_first_layoutget(lo)) + goto out_forget; + if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { /* existing state ID, make sure the sequence number matches. */ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { - if (!pnfs_layout_is_valid(lo) && - pnfs_is_first_layoutget(lo)) + if (!pnfs_layout_is_valid(lo)) lo->plh_barrier = 0; dprintk("%s forget reply due to sequence\n", __func__); goto out_forget; @@ -2413,8 +2415,6 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget; } else { /* We have a completely new layout */ - if (!pnfs_is_first_layoutget(lo)) - goto out_forget; pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true); } From b4e89bcba2b3a966e043107cb52c682bb860cee7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 2 Jul 2021 17:24:22 -0400 Subject: [PATCH 44/46] NFSv4/pnfs: Clean up layout get on open Cache the layout in the arguments so we don't have to keep looking it up from the inode. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 6 +----- fs/nfs/pnfs.c | 28 ++++++++++++++++------------ include/linux/nfs_xdr.h | 1 + 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 2031d2b9b6e3..78d5f15ee852 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -9419,7 +9419,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, { struct inode *inode = lgp->args.inode; struct nfs_server *server = NFS_SERVER(inode); - struct pnfs_layout_hdr *lo; + struct pnfs_layout_hdr *lo = lgp->lo; int nfs4err = task->tk_status; int err, status = 0; LIST_HEAD(head); @@ -9471,7 +9471,6 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, case -NFS4ERR_BAD_STATEID: exception->timeout = 0; spin_lock(&inode->i_lock); - lo = NFS_I(inode)->layout; /* If the open stateid was bad, then recover it. */ if (!lo || test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags) || !nfs4_stateid_match_other(&lgp->args.stateid, &lo->plh_stateid)) { @@ -9554,9 +9553,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout) dprintk("--> %s\n", __func__); - /* nfs4_layoutget_release calls pnfs_put_layout_hdr */ - pnfs_get_layout_hdr(NFS_I(inode)->layout); - nfs4_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0, 0); task = rpc_run_task(&task_setup_data); diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index be960e47d7f6..ef14ea0b6ab8 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1128,8 +1128,7 @@ void pnfs_layoutget_free(struct nfs4_layoutget *lgp) size_t max_pages = lgp->args.layout.pglen / PAGE_SIZE; nfs4_free_pages(lgp->args.layout.pages, max_pages); - if (lgp->args.inode) - pnfs_put_layout_hdr(NFS_I(lgp->args.inode)->layout); + pnfs_put_layout_hdr(lgp->lo); put_nfs_open_context(lgp->args.ctx); kfree(lgp); } @@ -2124,6 +2123,9 @@ lookup_again: goto out_put_layout_hdr; } + lgp->lo = lo; + pnfs_get_layout_hdr(lo); + lseg = nfs4_proc_layoutget(lgp, &timeout); trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg, PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); @@ -2255,6 +2257,7 @@ static void _lgopen_prepare_attached(struct nfs4_opendata *data, pnfs_put_layout_hdr(lo); return; } + lgp->lo = lo; data->lgp = lgp; data->o_arg.lg_args = &lgp->args; data->o_res.lg_res = &lgp->res; @@ -2263,6 +2266,7 @@ static void _lgopen_prepare_attached(struct nfs4_opendata *data, static void _lgopen_prepare_floating(struct nfs4_opendata *data, struct nfs_open_context *ctx) { + struct inode *ino = data->dentry->d_inode; struct pnfs_layout_range rng = { .iomode = (data->o_arg.fmode & FMODE_WRITE) ? IOMODE_RW: IOMODE_READ, @@ -2271,7 +2275,7 @@ static void _lgopen_prepare_floating(struct nfs4_opendata *data, }; struct nfs4_layoutget *lgp; - lgp = pnfs_alloc_init_layoutget_args(NULL, ctx, ¤t_stateid, + lgp = pnfs_alloc_init_layoutget_args(ino, ctx, ¤t_stateid, &rng, GFP_KERNEL); if (!lgp) return; @@ -2291,6 +2295,8 @@ void pnfs_lgopen_prepare(struct nfs4_opendata *data, /* Could check on max_ops, but currently hardcoded high enough */ if (!nfs_server_capable(data->dir->d_inode, NFS_CAP_LGOPEN)) return; + if (data->lgp) + return; if (data->state) _lgopen_prepare_attached(data, ctx); else @@ -2330,13 +2336,13 @@ void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp, } return; } - if (!lgp->args.inode) { + if (!lgp->lo) { lo = _pnfs_grab_empty_layout(ino, ctx); if (!lo) return; - lgp->args.inode = ino; + lgp->lo = lo; } else - lo = NFS_I(lgp->args.inode)->layout; + lo = lgp->lo; lseg = pnfs_layout_process(lgp); if (!IS_ERR(lseg)) { @@ -2349,11 +2355,9 @@ void pnfs_parse_lgopen(struct inode *ino, struct nfs4_layoutget *lgp, void nfs4_lgopen_release(struct nfs4_layoutget *lgp) { if (lgp != NULL) { - struct inode *inode = lgp->args.inode; - if (inode) { - struct pnfs_layout_hdr *lo = NFS_I(inode)->layout; - pnfs_clear_first_layoutget(lo); - nfs_layoutget_end(lo); + if (lgp->lo) { + pnfs_clear_first_layoutget(lgp->lo); + nfs_layoutget_end(lgp->lo); } pnfs_layoutget_free(lgp); } @@ -2362,7 +2366,7 @@ void nfs4_lgopen_release(struct nfs4_layoutget *lgp) struct pnfs_layout_segment * pnfs_layout_process(struct nfs4_layoutget *lgp) { - struct pnfs_layout_hdr *lo = NFS_I(lgp->args.inode)->layout; + struct pnfs_layout_hdr *lo = lgp->lo; struct nfs4_layoutget_res *res = &lgp->res; struct pnfs_layout_segment *lseg; struct inode *ino = lo->plh_inode; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 717ecc87c9e7..e9698b6278a5 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -277,6 +277,7 @@ struct nfs4_layoutget { struct nfs4_layoutget_args args; struct nfs4_layoutget_res res; const struct cred *cred; + struct pnfs_layout_hdr *lo; gfp_t gfp_flags; }; From f46f84931a0aa344678efe412d4b071d84d8a805 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 3 Jul 2021 14:34:20 -0400 Subject: [PATCH 45/46] NFSv4/pNFS: Don't call _nfs4_pnfs_v3_ds_connect multiple times After we grab the lock in nfs4_pnfs_ds_connect(), there is no check for whether or not ds->ds_clp has already been initialised, so we can end up adding the same transports multiple times. Fixes: fc821d59209d ("pnfs/NFSv4.1: Add multipath capabilities to pNFS flexfiles servers over NFSv3") Signed-off-by: Trond Myklebust --- fs/nfs/pnfs_nfs.c | 52 +++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 49d3389bd813..1c2c0d08614e 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -805,19 +805,16 @@ out: } EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_add); -static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) +static int nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) { might_sleep(); - wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, - TASK_KILLABLE); + return wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING, TASK_KILLABLE); } static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds) { smp_mb__before_atomic(); - clear_bit(NFS4DS_CONNECTING, &ds->ds_state); - smp_mb__after_atomic(); - wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING); + clear_and_wake_up_bit(NFS4DS_CONNECTING, &ds->ds_state); } static struct nfs_client *(*get_v3_ds_connect)( @@ -993,30 +990,33 @@ int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, { int err; -again: - err = 0; - if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { - if (version == 3) { - err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo, - retrans); - } else if (version == 4) { - err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo, - retrans, minor_version); - } else { - dprintk("%s: unsupported DS version %d\n", __func__, - version); - err = -EPROTONOSUPPORT; - } + do { + err = nfs4_wait_ds_connect(ds); + if (err || ds->ds_clp) + goto out; + if (nfs4_test_deviceid_unavailable(devid)) + return -ENODEV; + } while (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) != 0); - nfs4_clear_ds_conn_bit(ds); - } else { - nfs4_wait_ds_connect(ds); + if (ds->ds_clp) + goto connect_done; - /* what was waited on didn't connect AND didn't mark unavail */ - if (!ds->ds_clp && !nfs4_test_deviceid_unavailable(devid)) - goto again; + switch (version) { + case 3: + err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo, retrans); + break; + case 4: + err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo, retrans, + minor_version); + break; + default: + dprintk("%s: unsupported DS version %d\n", __func__, version); + err = -EPROTONOSUPPORT; } +connect_done: + nfs4_clear_ds_conn_bit(ds); +out: /* * At this point the ds->ds_clp should be ready, but it might have * hit an error. From dd5c153ed7839e1e7c131dae7fa4d8eaaafb3eac Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 3 Jul 2021 14:31:49 -0400 Subject: [PATCH 46/46] NFSv4/pNFS: Return an error if _nfs4_pnfs_v3_ds_connect can't load NFSv3 Currently we fail to return an error if the NFSv3 module failed to load when we're trying to connect to a pNFS data server. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs_nfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 1c2c0d08614e..cf19914fec81 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -855,7 +855,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); if (!load_v3_ds_connect()) - goto out; + return -EPROTONOSUPPORT; list_for_each_entry(da, &ds->ds_addrs, da_node) { dprintk("%s: DS %s: trying address %s\n",