linux/net/sunrpc/xprtrdma/transport.c
Chuck Lever 6b1eb3b222 SUNRPC: Replace the use of the xprtiod WQ in rpcrdma
While setting up a new lab, I accidentally misconfigured the
Ethernet port for a system that tried an NFS mount using RoCE.
This made the NFS server unreachable. The following WARNING
popped on the NFS client while waiting for the mount attempt to
time out:

kernel: workqueue: WQ_MEM_RECLAIM xprtiod:xprt_rdma_connect_worker [rpcrdma] is flushing !WQ_MEM_RECLAI>
kernel: WARNING: CPU: 0 PID: 100 at kernel/workqueue.c:2628 check_flush_dependency+0xbf/0xca
kernel: Modules linked in: rpcsec_gss_krb5 nfsv4 dns_resolver nfs 8021q garp stp mrp llc rfkill rpcrdma>
kernel: CPU: 0 PID: 100 Comm: kworker/u8:8 Not tainted 6.0.0-rc1-00002-g6229f8c054e5 #13
kernel: Hardware name: Supermicro X10SRA-F/X10SRA-F, BIOS 2.0b 06/12/2017
kernel: Workqueue: xprtiod xprt_rdma_connect_worker [rpcrdma]
kernel: RIP: 0010:check_flush_dependency+0xbf/0xca
kernel: Code: 75 2a 48 8b 55 18 48 8d 8b b0 00 00 00 4d 89 e0 48 81 c6 b0 00 00 00 48 c7 c7 65 33 2e be>
kernel: RSP: 0018:ffffb562806cfcf8 EFLAGS: 00010092
kernel: RAX: 0000000000000082 RBX: ffff97894f8c3c00 RCX: 0000000000000027
kernel: RDX: 0000000000000002 RSI: ffffffffbe3447d1 RDI: 00000000ffffffff
kernel: RBP: ffff978941315840 R08: 0000000000000000 R09: 0000000000000000
kernel: R10: 00000000000008b0 R11: 0000000000000001 R12: ffffffffc0ce3731
kernel: R13: ffff978950c00500 R14: ffff97894341f0c0 R15: ffff978951112eb0
kernel: FS:  0000000000000000(0000) GS:ffff97987fc00000(0000) knlGS:0000000000000000
kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
kernel: CR2: 00007f807535eae8 CR3: 000000010b8e4002 CR4: 00000000003706f0
kernel: DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
kernel: DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
kernel: Call Trace:
kernel:  <TASK>
kernel:  __flush_work.isra.0+0xaf/0x188
kernel:  ? _raw_spin_lock_irqsave+0x2c/0x37
kernel:  ? lock_timer_base+0x38/0x5f
kernel:  __cancel_work_timer+0xea/0x13d
kernel:  ? preempt_latency_start+0x2b/0x46
kernel:  rdma_addr_cancel+0x70/0x81 [ib_core]
kernel:  _destroy_id+0x1a/0x246 [rdma_cm]
kernel:  rpcrdma_xprt_connect+0x115/0x5ae [rpcrdma]
kernel:  ? _raw_spin_unlock+0x14/0x29
kernel:  ? raw_spin_rq_unlock_irq+0x5/0x10
kernel:  ? finish_task_switch.isra.0+0x171/0x249
kernel:  xprt_rdma_connect_worker+0x3b/0xc7 [rpcrdma]
kernel:  process_one_work+0x1d8/0x2d4
kernel:  worker_thread+0x18b/0x24f
kernel:  ? rescuer_thread+0x280/0x280
kernel:  kthread+0xf4/0xfc
kernel:  ? kthread_complete_and_exit+0x1b/0x1b
kernel:  ret_from_fork+0x22/0x30
kernel:  </TASK>

SUNRPC's xprtiod workqueue is WQ_MEM_RECLAIM, so any workqueue that
one of its work items tries to cancel has to be WQ_MEM_RECLAIM to
prevent a priority inversion. The internal workqueues in the
RDMA/core are currently non-MEM_RECLAIM.

Jason Gunthorpe says this about the current state of RDMA/core:
> If you attempt to do a reconnection/etc from within a RECLAIM
> context it will deadlock on one of the many allocations that are
> made to support opening the connection.
>
> The general idea of reclaim is that the entire task context
> working under the reclaim is marked with an override of the gfp
> flags to make all allocations under that call chain reclaim safe.
>
> But rdmacm does allocations outside this, eg in the WQs processing
> the CM packets. So this doesn't work and we will deadlock.
>
> Fixing it is a big deal and needs more than poking WQ_MEM_RECLAIM
> here and there.

So we will change the ULP in this case to avoid the use of
WQ_MEM_RECLAIM where possible. Deadlocks that were possible before
are not fixed, but at least we no longer have a false sense of
confidence that the stack won't allocate memory during memory
reclaim.

Suggested-by: Leon Romanovsky <leon@kernel.org>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
Signed-off-by: Anna Schumaker <Anna.Schumaker@Netapp.com>
2022-10-05 15:47:16 -04:00

806 lines
22 KiB
C

// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
* Copyright (c) 2014-2017 Oracle. All rights reserved.
* Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the BSD-type
* license below:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials provided
* with the distribution.
*
* Neither the name of the Network Appliance, Inc. nor the names of
* its contributors may be used to endorse or promote products
* derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* transport.c
*
* This file contains the top-level implementation of an RPC RDMA
* transport.
*
* Naming convention: functions beginning with xprt_ are part of the
* transport switch. All others are RPC RDMA internal.
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/smp.h>
#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/svc_rdma.h>
#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>
/*
* tunables
*/
static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRWR;
int xprt_rdma_pad_optimize;
static struct xprt_class xprt_rdma;
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
static unsigned int min_slot_table_size = RPCRDMA_MIN_SLOT_TABLE;
static unsigned int max_slot_table_size = RPCRDMA_MAX_SLOT_TABLE;
static unsigned int min_inline_size = RPCRDMA_MIN_INLINE;
static unsigned int max_inline_size = RPCRDMA_MAX_INLINE;
static unsigned int max_padding = PAGE_SIZE;
static unsigned int min_memreg = RPCRDMA_BOUNCEBUFFERS;
static unsigned int max_memreg = RPCRDMA_LAST - 1;
static unsigned int dummy;
static struct ctl_table_header *sunrpc_table_header;
static struct ctl_table xr_tunables_table[] = {
{
.procname = "rdma_slot_table_entries",
.data = &xprt_rdma_slot_table_entries,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &min_slot_table_size,
.extra2 = &max_slot_table_size
},
{
.procname = "rdma_max_inline_read",
.data = &xprt_rdma_max_inline_read,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &min_inline_size,
.extra2 = &max_inline_size,
},
{
.procname = "rdma_max_inline_write",
.data = &xprt_rdma_max_inline_write,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &min_inline_size,
.extra2 = &max_inline_size,
},
{
.procname = "rdma_inline_write_padding",
.data = &dummy,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = &max_padding,
},
{
.procname = "rdma_memreg_strategy",
.data = &xprt_rdma_memreg_strategy,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &min_memreg,
.extra2 = &max_memreg,
},
{
.procname = "rdma_pad_optimize",
.data = &xprt_rdma_pad_optimize,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{ },
};
static struct ctl_table sunrpc_table[] = {
{
.procname = "sunrpc",
.mode = 0555,
.child = xr_tunables_table
},
{ },
};
#endif
static const struct rpc_xprt_ops xprt_rdma_procs;
static void
xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap)
{
struct sockaddr_in *sin = (struct sockaddr_in *)sap;
char buf[20];
snprintf(buf, sizeof(buf), "%08x", ntohl(sin->sin_addr.s_addr));
xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA;
}
static void
xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
{
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap;
char buf[40];
snprintf(buf, sizeof(buf), "%pi6", &sin6->sin6_addr);
xprt->address_strings[RPC_DISPLAY_HEX_ADDR] = kstrdup(buf, GFP_KERNEL);
xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6;
}
void
xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap)
{
char buf[128];
switch (sap->sa_family) {
case AF_INET:
xprt_rdma_format_addresses4(xprt, sap);
break;
case AF_INET6:
xprt_rdma_format_addresses6(xprt, sap);
break;
default:
pr_err("rpcrdma: Unrecognized address family\n");
return;
}
(void)rpc_ntop(sap, buf, sizeof(buf));
xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL);
snprintf(buf, sizeof(buf), "%u", rpc_get_port(sap));
xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL);
snprintf(buf, sizeof(buf), "%4hx", rpc_get_port(sap));
xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
}
void
xprt_rdma_free_addresses(struct rpc_xprt *xprt)
{
unsigned int i;
for (i = 0; i < RPC_DISPLAY_MAX; i++)
switch (i) {
case RPC_DISPLAY_PROTO:
case RPC_DISPLAY_NETID:
continue;
default:
kfree(xprt->address_strings[i]);
}
}
/**
* xprt_rdma_connect_worker - establish connection in the background
* @work: worker thread context
*
* Requester holds the xprt's send lock to prevent activity on this
* transport while a fresh connection is being established. RPC tasks
* sleep on the xprt's pending queue waiting for connect to complete.
*/
static void
xprt_rdma_connect_worker(struct work_struct *work)
{
struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt,
rx_connect_worker.work);
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
unsigned int pflags = current->flags;
int rc;
if (atomic_read(&xprt->swapper))
current->flags |= PF_MEMALLOC;
rc = rpcrdma_xprt_connect(r_xprt);
xprt_clear_connecting(xprt);
if (!rc) {
xprt->connect_cookie++;
xprt->stat.connect_count++;
xprt->stat.connect_time += (long)jiffies -
xprt->stat.connect_start;
xprt_set_connected(xprt);
rc = -EAGAIN;
} else
rpcrdma_xprt_disconnect(r_xprt);
xprt_unlock_connect(xprt, r_xprt);
xprt_wake_pending_tasks(xprt, rc);
current_restore_flags(pflags, PF_MEMALLOC);
}
/**
* xprt_rdma_inject_disconnect - inject a connection fault
* @xprt: transport context
*
* If @xprt is connected, disconnect it to simulate spurious
* connection loss. Caller must hold @xprt's send lock to
* ensure that data structures and hardware resources are
* stable during the rdma_disconnect() call.
*/
static void
xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
trace_xprtrdma_op_inject_dsc(r_xprt);
rdma_disconnect(r_xprt->rx_ep->re_id);
}
/**
* xprt_rdma_destroy - Full tear down of transport
* @xprt: doomed transport context
*
* Caller guarantees there will be no more calls to us with
* this @xprt.
*/
static void
xprt_rdma_destroy(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
cancel_delayed_work_sync(&r_xprt->rx_connect_worker);
rpcrdma_xprt_disconnect(r_xprt);
rpcrdma_buffer_destroy(&r_xprt->rx_buf);
xprt_rdma_free_addresses(xprt);
xprt_free(xprt);
module_put(THIS_MODULE);
}
/* 60 second timeout, no retries */
static const struct rpc_timeout xprt_rdma_default_timeout = {
.to_initval = 60 * HZ,
.to_maxval = 60 * HZ,
};
/**
* xprt_setup_rdma - Set up transport to use RDMA
*
* @args: rpc transport arguments
*/
static struct rpc_xprt *
xprt_setup_rdma(struct xprt_create *args)
{
struct rpc_xprt *xprt;
struct rpcrdma_xprt *new_xprt;
struct sockaddr *sap;
int rc;
if (args->addrlen > sizeof(xprt->addr))
return ERR_PTR(-EBADF);
if (!try_module_get(THIS_MODULE))
return ERR_PTR(-EIO);
xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0,
xprt_rdma_slot_table_entries);
if (!xprt) {
module_put(THIS_MODULE);
return ERR_PTR(-ENOMEM);
}
xprt->timeout = &xprt_rdma_default_timeout;
xprt->connect_timeout = xprt->timeout->to_initval;
xprt->max_reconnect_timeout = xprt->timeout->to_maxval;
xprt->bind_timeout = RPCRDMA_BIND_TO;
xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
xprt->resvport = 0; /* privileged port not needed */
xprt->ops = &xprt_rdma_procs;
/*
* Set up RDMA-specific connect data.
*/
sap = args->dstaddr;
/* Ensure xprt->addr holds valid server TCP (not RDMA)
* address, for any side protocols which peek at it */
xprt->prot = IPPROTO_TCP;
xprt->xprt_class = &xprt_rdma;
xprt->addrlen = args->addrlen;
memcpy(&xprt->addr, sap, xprt->addrlen);
if (rpc_get_port(sap))
xprt_set_bound(xprt);
xprt_rdma_format_addresses(xprt, sap);
new_xprt = rpcx_to_rdmax(xprt);
rc = rpcrdma_buffer_create(new_xprt);
if (rc) {
xprt_rdma_free_addresses(xprt);
xprt_free(xprt);
module_put(THIS_MODULE);
return ERR_PTR(rc);
}
INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
xprt_rdma_connect_worker);
xprt->max_payload = RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
return xprt;
}
/**
* xprt_rdma_close - close a transport connection
* @xprt: transport context
*
* Called during autoclose or device removal.
*
* Caller holds @xprt's send lock to prevent activity on this
* transport while the connection is torn down.
*/
void xprt_rdma_close(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
rpcrdma_xprt_disconnect(r_xprt);
xprt->reestablish_timeout = 0;
++xprt->connect_cookie;
xprt_disconnect_done(xprt);
}
/**
* xprt_rdma_set_port - update server port with rpcbind result
* @xprt: controlling RPC transport
* @port: new port value
*
* Transport connect status is unchanged.
*/
static void
xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port)
{
struct sockaddr *sap = (struct sockaddr *)&xprt->addr;
char buf[8];
rpc_set_port(sap, port);
kfree(xprt->address_strings[RPC_DISPLAY_PORT]);
snprintf(buf, sizeof(buf), "%u", port);
xprt->address_strings[RPC_DISPLAY_PORT] = kstrdup(buf, GFP_KERNEL);
kfree(xprt->address_strings[RPC_DISPLAY_HEX_PORT]);
snprintf(buf, sizeof(buf), "%4hx", port);
xprt->address_strings[RPC_DISPLAY_HEX_PORT] = kstrdup(buf, GFP_KERNEL);
}
/**
* xprt_rdma_timer - invoked when an RPC times out
* @xprt: controlling RPC transport
* @task: RPC task that timed out
*
* Invoked when the transport is still connected, but an RPC
* retransmit timeout occurs.
*
* Since RDMA connections don't have a keep-alive, forcibly
* disconnect and retry to connect. This drives full
* detection of the network path, and retransmissions of
* all pending RPCs.
*/
static void
xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task)
{
xprt_force_disconnect(xprt);
}
/**
* xprt_rdma_set_connect_timeout - set timeouts for establishing a connection
* @xprt: controlling transport instance
* @connect_timeout: reconnect timeout after client disconnects
* @reconnect_timeout: reconnect timeout after server disconnects
*
*/
static void xprt_rdma_set_connect_timeout(struct rpc_xprt *xprt,
unsigned long connect_timeout,
unsigned long reconnect_timeout)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
trace_xprtrdma_op_set_cto(r_xprt, connect_timeout, reconnect_timeout);
spin_lock(&xprt->transport_lock);
if (connect_timeout < xprt->connect_timeout) {
struct rpc_timeout to;
unsigned long initval;
to = *xprt->timeout;
initval = connect_timeout;
if (initval < RPCRDMA_INIT_REEST_TO << 1)
initval = RPCRDMA_INIT_REEST_TO << 1;
to.to_initval = initval;
to.to_maxval = initval;
r_xprt->rx_timeout = to;
xprt->timeout = &r_xprt->rx_timeout;
xprt->connect_timeout = connect_timeout;
}
if (reconnect_timeout < xprt->max_reconnect_timeout)
xprt->max_reconnect_timeout = reconnect_timeout;
spin_unlock(&xprt->transport_lock);
}
/**
* xprt_rdma_connect - schedule an attempt to reconnect
* @xprt: transport state
* @task: RPC scheduler context (unused)
*
*/
static void
xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_ep *ep = r_xprt->rx_ep;
unsigned long delay;
WARN_ON_ONCE(!xprt_lock_connect(xprt, task, r_xprt));
delay = 0;
if (ep && ep->re_connect_status != 0) {
delay = xprt_reconnect_delay(xprt);
xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO);
}
trace_xprtrdma_op_connect(r_xprt, delay);
queue_delayed_work(system_long_wq, &r_xprt->rx_connect_worker, delay);
}
/**
* xprt_rdma_alloc_slot - allocate an rpc_rqst
* @xprt: controlling RPC transport
* @task: RPC task requesting a fresh rpc_rqst
*
* tk_status values:
* %0 if task->tk_rqstp points to a fresh rpc_rqst
* %-EAGAIN if no rpc_rqst is available; queued on backlog
*/
static void
xprt_rdma_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_req *req;
req = rpcrdma_buffer_get(&r_xprt->rx_buf);
if (!req)
goto out_sleep;
task->tk_rqstp = &req->rl_slot;
task->tk_status = 0;
return;
out_sleep:
task->tk_status = -ENOMEM;
xprt_add_backlog(xprt, task);
}
/**
* xprt_rdma_free_slot - release an rpc_rqst
* @xprt: controlling RPC transport
* @rqst: rpc_rqst to release
*
*/
static void
xprt_rdma_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *rqst)
{
struct rpcrdma_xprt *r_xprt =
container_of(xprt, struct rpcrdma_xprt, rx_xprt);
rpcrdma_reply_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst));
if (!xprt_wake_up_backlog(xprt, rqst)) {
memset(rqst, 0, sizeof(*rqst));
rpcrdma_buffer_put(&r_xprt->rx_buf, rpcr_to_rdmar(rqst));
}
}
static bool rpcrdma_check_regbuf(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_regbuf *rb, size_t size,
gfp_t flags)
{
if (unlikely(rdmab_length(rb) < size)) {
if (!rpcrdma_regbuf_realloc(rb, size, flags))
return false;
r_xprt->rx_stats.hardway_register_count += size;
}
return true;
}
/**
* xprt_rdma_allocate - allocate transport resources for an RPC
* @task: RPC task
*
* Return values:
* 0: Success; rq_buffer points to RPC buffer to use
* ENOMEM: Out of memory, call again later
* EIO: A permanent error occurred, do not retry
*/
static int
xprt_rdma_allocate(struct rpc_task *task)
{
struct rpc_rqst *rqst = task->tk_rqstp;
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
gfp_t flags = rpc_task_gfp_mask();
if (!rpcrdma_check_regbuf(r_xprt, req->rl_sendbuf, rqst->rq_callsize,
flags))
goto out_fail;
if (!rpcrdma_check_regbuf(r_xprt, req->rl_recvbuf, rqst->rq_rcvsize,
flags))
goto out_fail;
rqst->rq_buffer = rdmab_data(req->rl_sendbuf);
rqst->rq_rbuffer = rdmab_data(req->rl_recvbuf);
return 0;
out_fail:
return -ENOMEM;
}
/**
* xprt_rdma_free - release resources allocated by xprt_rdma_allocate
* @task: RPC task
*
* Caller guarantees rqst->rq_buffer is non-NULL.
*/
static void
xprt_rdma_free(struct rpc_task *task)
{
struct rpc_rqst *rqst = task->tk_rqstp;
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
if (unlikely(!list_empty(&req->rl_registered))) {
trace_xprtrdma_mrs_zap(task);
frwr_unmap_sync(rpcx_to_rdmax(rqst->rq_xprt), req);
}
/* XXX: If the RPC is completing because of a signal and
* not because a reply was received, we ought to ensure
* that the Send completion has fired, so that memory
* involved with the Send is not still visible to the NIC.
*/
}
/**
* xprt_rdma_send_request - marshal and send an RPC request
* @rqst: RPC message in rq_snd_buf
*
* Caller holds the transport's write lock.
*
* Returns:
* %0 if the RPC message has been sent
* %-ENOTCONN if the caller should reconnect and call again
* %-EAGAIN if the caller should call again
* %-ENOBUFS if the caller should call again after a delay
* %-EMSGSIZE if encoding ran out of buffer space. The request
* was not sent. Do not try to send this message again.
* %-EIO if an I/O error occurred. The request was not sent.
* Do not try to send this message again.
*/
static int
xprt_rdma_send_request(struct rpc_rqst *rqst)
{
struct rpc_xprt *xprt = rqst->rq_xprt;
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
int rc = 0;
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
if (unlikely(!rqst->rq_buffer))
return xprt_rdma_bc_send_reply(rqst);
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
if (!xprt_connected(xprt))
return -ENOTCONN;
if (!xprt_request_get_cong(xprt, rqst))
return -EBADSLT;
rc = rpcrdma_marshal_req(r_xprt, rqst);
if (rc < 0)
goto failed_marshal;
/* Must suppress retransmit to maintain credits */
if (rqst->rq_connect_cookie == xprt->connect_cookie)
goto drop_connection;
rqst->rq_xtime = ktime_get();
if (frwr_send(r_xprt, req))
goto drop_connection;
rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len;
/* An RPC with no reply will throw off credit accounting,
* so drop the connection to reset the credit grant.
*/
if (!rpc_reply_expected(rqst->rq_task))
goto drop_connection;
return 0;
failed_marshal:
if (rc != -ENOTCONN)
return rc;
drop_connection:
xprt_rdma_close(xprt);
return -ENOTCONN;
}
void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
long idle_time = 0;
if (xprt_connected(xprt))
idle_time = (long)(jiffies - xprt->last_used) / HZ;
seq_puts(seq, "\txprt:\trdma ");
seq_printf(seq, "%u %lu %lu %lu %ld %lu %lu %lu %llu %llu ",
0, /* need a local port? */
xprt->stat.bind_count,
xprt->stat.connect_count,
xprt->stat.connect_time / HZ,
idle_time,
xprt->stat.sends,
xprt->stat.recvs,
xprt->stat.bad_xids,
xprt->stat.req_u,
xprt->stat.bklog_u);
seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu ",
r_xprt->rx_stats.read_chunk_count,
r_xprt->rx_stats.write_chunk_count,
r_xprt->rx_stats.reply_chunk_count,
r_xprt->rx_stats.total_rdma_request,
r_xprt->rx_stats.total_rdma_reply,
r_xprt->rx_stats.pullup_copy_count,
r_xprt->rx_stats.fixup_copy_count,
r_xprt->rx_stats.hardway_register_count,
r_xprt->rx_stats.failed_marshal_count,
r_xprt->rx_stats.bad_reply_count,
r_xprt->rx_stats.nomsg_call_count);
seq_printf(seq, "%lu %lu %lu %lu %lu %lu\n",
r_xprt->rx_stats.mrs_recycled,
r_xprt->rx_stats.mrs_orphaned,
r_xprt->rx_stats.mrs_allocated,
r_xprt->rx_stats.local_inv_needed,
r_xprt->rx_stats.empty_sendctx_q,
r_xprt->rx_stats.reply_waits_for_send);
}
static int
xprt_rdma_enable_swap(struct rpc_xprt *xprt)
{
return 0;
}
static void
xprt_rdma_disable_swap(struct rpc_xprt *xprt)
{
}
/*
* Plumbing for rpc transport switch and kernel module
*/
static const struct rpc_xprt_ops xprt_rdma_procs = {
.reserve_xprt = xprt_reserve_xprt_cong,
.release_xprt = xprt_release_xprt_cong, /* sunrpc/xprt.c */
.alloc_slot = xprt_rdma_alloc_slot,
.free_slot = xprt_rdma_free_slot,
.release_request = xprt_release_rqst_cong, /* ditto */
.wait_for_reply_request = xprt_wait_for_reply_request_def, /* ditto */
.timer = xprt_rdma_timer,
.rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */
.set_port = xprt_rdma_set_port,
.connect = xprt_rdma_connect,
.buf_alloc = xprt_rdma_allocate,
.buf_free = xprt_rdma_free,
.send_request = xprt_rdma_send_request,
.close = xprt_rdma_close,
.destroy = xprt_rdma_destroy,
.set_connect_timeout = xprt_rdma_set_connect_timeout,
.print_stats = xprt_rdma_print_stats,
.enable_swap = xprt_rdma_enable_swap,
.disable_swap = xprt_rdma_disable_swap,
.inject_disconnect = xprt_rdma_inject_disconnect,
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
.bc_setup = xprt_rdma_bc_setup,
.bc_maxpayload = xprt_rdma_bc_maxpayload,
.bc_num_slots = xprt_rdma_bc_max_slots,
.bc_free_rqst = xprt_rdma_bc_free_rqst,
.bc_destroy = xprt_rdma_bc_destroy,
#endif
};
static struct xprt_class xprt_rdma = {
.list = LIST_HEAD_INIT(xprt_rdma.list),
.name = "rdma",
.owner = THIS_MODULE,
.ident = XPRT_TRANSPORT_RDMA,
.setup = xprt_setup_rdma,
.netid = { "rdma", "rdma6", "" },
};
void xprt_rdma_cleanup(void)
{
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
if (sunrpc_table_header) {
unregister_sysctl_table(sunrpc_table_header);
sunrpc_table_header = NULL;
}
#endif
xprt_unregister_transport(&xprt_rdma);
xprt_unregister_transport(&xprt_rdma_bc);
}
int xprt_rdma_init(void)
{
int rc;
rc = xprt_register_transport(&xprt_rdma);
if (rc)
return rc;
rc = xprt_register_transport(&xprt_rdma_bc);
if (rc) {
xprt_unregister_transport(&xprt_rdma);
return rc;
}
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
if (!sunrpc_table_header)
sunrpc_table_header = register_sysctl_table(sunrpc_table);
#endif
return 0;
}