linux/net/sunrpc/xprtrdma/fmr_ops.c
Linus Torvalds 89e255678f A relatively quiet cycle for nfsd. The largest piece is an RDMA update
from Chuck Lever with new trace points, miscellaneous cleanups, and
 streamlining of the send and receive paths.  Other than that, some
 miscellaneous bugfixes.
 -----BEGIN PGP SIGNATURE-----
 
 iQIcBAABAgAGBQJbHtKUAAoJECebzXlCjuG+dfgP/2Z9PiJXlxKC2iISgkfMGmBd
 MmWZYekYMtCe5raoiI720W5cGL7uBLoKnc+r57+n7bEGxV9OFwtspmKGn17P/zrY
 YcBIdN7gjpqn8wrflLR4D09bGpnmaZG26jIt/v0TS+N1aFKO3gNXb0ZVSjUadlI0
 UsKRbYxr8qucIENVtXhfA0eRivddadsKopAEwflUrxf+8oEaYszPFUfNXcGDpdHK
 +6D2lFjr/Fn+z97Rbz/G3fMfldpYhUOpH28DOiCuKEpgamK3dYjx1WoGUANxcj3o
 RsbHGZnMR6842Nj5aHus0k6Ao9bgqt6lx+jKlkvWYK+G2EfMfV9Z1gAipPY+IMbd
 Zk5A4pnFpI1UG3sUlcnpaxAM/pHBs7heYGqj0hyocG8rB4V7SDZxp21Lv1fjTH/A
 XHAkdiT4iSgI11J8YbmDBR1S7bAnfNm7GT24DsAkZLzh2f5Miq5m/ZMxDxQLAFCJ
 3YKo2aNVjKvA/aOKDe5RMLZUhnmuhb8aMIDuQY2Ir1EK4S+7EYOiYAvqlbJrM3Ro
 aLmb9BUzRRWmRydMKOeGkWiMj49lHRW6oJxvb33PDZEEqW/AlvmYEyMGfjhXzPDE
 OZkvbdYrni4n5YboplxNnJyL0NJ6l5YAikV94SBWBknrnNv1psSZbDKoIgp2ghhQ
 rdP842qSmDiZiXVlTr3e
 =PuEk
 -----END PGP SIGNATURE-----

Merge tag 'nfsd-4.18' of git://linux-nfs.org/~bfields/linux

Pull nfsd updates from Bruce Fields:
 "A relatively quiet cycle for nfsd.

  The largest piece is an RDMA update from Chuck Lever with new trace
  points, miscellaneous cleanups, and streamlining of the send and
  receive paths.

  Other than that, some miscellaneous bugfixes"

* tag 'nfsd-4.18' of git://linux-nfs.org/~bfields/linux: (26 commits)
  nfsd: fix error handling in nfs4_set_delegation()
  nfsd: fix potential use-after-free in nfsd4_decode_getdeviceinfo
  Fix 16-byte memory leak in gssp_accept_sec_context_upcall
  svcrdma: Fix incorrect return value/type in svc_rdma_post_recvs
  svcrdma: Remove unused svc_rdma_op_ctxt
  svcrdma: Persistently allocate and DMA-map Send buffers
  svcrdma: Simplify svc_rdma_send()
  svcrdma: Remove post_send_wr
  svcrdma: Don't overrun the SGE array in svc_rdma_send_ctxt
  svcrdma: Introduce svc_rdma_send_ctxt
  svcrdma: Clean up Send SGE accounting
  svcrdma: Refactor svc_rdma_dma_map_buf
  svcrdma: Allocate recv_ctxt's on CPU handling Receives
  svcrdma: Persistently allocate and DMA-map Receive buffers
  svcrdma: Preserve Receive buffer until svc_rdma_sendto
  svcrdma: Simplify svc_rdma_recv_ctxt_put
  svcrdma: Remove sc_rq_depth
  svcrdma: Introduce svc_rdma_recv_ctxt
  svcrdma: Trace key RDMA API events
  svcrdma: Trace key RPC/RDMA protocol events
  ...
2018-06-12 09:49:33 -07:00

328 lines
7.7 KiB
C

// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (c) 2015, 2017 Oracle. All rights reserved.
* Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
*/
/* Lightweight memory registration using Fast Memory Regions (FMR).
* Referred to sometimes as MTHCAFMR mode.
*
* FMR uses synchronous memory registration and deregistration.
* FMR registration is known to be fast, but FMR deregistration
* can take tens of usecs to complete.
*/
/* Normal operation
*
* A Memory Region is prepared for RDMA READ or WRITE using the
* ib_map_phys_fmr verb (fmr_op_map). When the RDMA operation is
* finished, the Memory Region is unmapped using the ib_unmap_fmr
* verb (fmr_op_unmap).
*/
#include <linux/sunrpc/svc_rdma.h>
#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY RPCDBG_TRANS
#endif
/* Maximum scatter/gather per FMR */
#define RPCRDMA_MAX_FMR_SGES (64)
/* Access mode of externally registered pages */
enum {
RPCRDMA_FMR_ACCESS_FLAGS = IB_ACCESS_REMOTE_WRITE |
IB_ACCESS_REMOTE_READ,
};
bool
fmr_is_supported(struct rpcrdma_ia *ia)
{
if (!ia->ri_device->alloc_fmr) {
pr_info("rpcrdma: 'fmr' mode is not supported by device %s\n",
ia->ri_device->name);
return false;
}
return true;
}
static int
fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
{
static struct ib_fmr_attr fmr_attr = {
.max_pages = RPCRDMA_MAX_FMR_SGES,
.max_maps = 1,
.page_shift = PAGE_SHIFT
};
mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES,
sizeof(u64), GFP_KERNEL);
if (!mr->fmr.fm_physaddrs)
goto out_free;
mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES,
sizeof(*mr->mr_sg), GFP_KERNEL);
if (!mr->mr_sg)
goto out_free;
sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES);
mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS,
&fmr_attr);
if (IS_ERR(mr->fmr.fm_mr))
goto out_fmr_err;
INIT_LIST_HEAD(&mr->mr_list);
return 0;
out_fmr_err:
dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__,
PTR_ERR(mr->fmr.fm_mr));
out_free:
kfree(mr->mr_sg);
kfree(mr->fmr.fm_physaddrs);
return -ENOMEM;
}
static int
__fmr_unmap(struct rpcrdma_mr *mr)
{
LIST_HEAD(l);
int rc;
list_add(&mr->fmr.fm_mr->list, &l);
rc = ib_unmap_fmr(&l);
list_del(&mr->fmr.fm_mr->list);
return rc;
}
static void
fmr_op_release_mr(struct rpcrdma_mr *mr)
{
LIST_HEAD(unmap_list);
int rc;
kfree(mr->fmr.fm_physaddrs);
kfree(mr->mr_sg);
/* In case this one was left mapped, try to unmap it
* to prevent dealloc_fmr from failing with EBUSY
*/
rc = __fmr_unmap(mr);
if (rc)
pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n",
mr, rc);
rc = ib_dealloc_fmr(mr->fmr.fm_mr);
if (rc)
pr_err("rpcrdma: final ib_dealloc_fmr for %p returned %i\n",
mr, rc);
kfree(mr);
}
/* Reset of a single FMR.
*/
static void
fmr_op_recover_mr(struct rpcrdma_mr *mr)
{
struct rpcrdma_xprt *r_xprt = mr->mr_xprt;
int rc;
/* ORDER: invalidate first */
rc = __fmr_unmap(mr);
if (rc)
goto out_release;
/* ORDER: then DMA unmap */
rpcrdma_mr_unmap_and_put(mr);
r_xprt->rx_stats.mrs_recovered++;
return;
out_release:
pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mr);
r_xprt->rx_stats.mrs_orphaned++;
trace_xprtrdma_dma_unmap(mr);
ib_dma_unmap_sg(r_xprt->rx_ia.ri_device,
mr->mr_sg, mr->mr_nents, mr->mr_dir);
spin_lock(&r_xprt->rx_buf.rb_mrlock);
list_del(&mr->mr_all);
spin_unlock(&r_xprt->rx_buf.rb_mrlock);
fmr_op_release_mr(mr);
}
static int
fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct rpcrdma_create_data_internal *cdata)
{
ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
RPCRDMA_MAX_FMR_SGES);
return 0;
}
/* FMR mode conveys up to 64 pages of payload per chunk segment.
*/
static size_t
fmr_op_maxpages(struct rpcrdma_xprt *r_xprt)
{
return min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
RPCRDMA_MAX_HDR_SEGS * RPCRDMA_MAX_FMR_SGES);
}
/* Use the ib_map_phys_fmr() verb to register a memory region
* for remote access via RDMA READ or RDMA WRITE.
*/
static struct rpcrdma_mr_seg *
fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
int nsegs, bool writing, struct rpcrdma_mr **out)
{
struct rpcrdma_mr_seg *seg1 = seg;
int len, pageoff, i, rc;
struct rpcrdma_mr *mr;
u64 *dma_pages;
mr = rpcrdma_mr_get(r_xprt);
if (!mr)
return ERR_PTR(-EAGAIN);
pageoff = offset_in_page(seg1->mr_offset);
seg1->mr_offset -= pageoff; /* start of page */
seg1->mr_len += pageoff;
len = -pageoff;
if (nsegs > RPCRDMA_MAX_FMR_SGES)
nsegs = RPCRDMA_MAX_FMR_SGES;
for (i = 0; i < nsegs;) {
if (seg->mr_page)
sg_set_page(&mr->mr_sg[i],
seg->mr_page,
seg->mr_len,
offset_in_page(seg->mr_offset));
else
sg_set_buf(&mr->mr_sg[i], seg->mr_offset,
seg->mr_len);
len += seg->mr_len;
++seg;
++i;
/* Check for holes */
if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
}
mr->mr_dir = rpcrdma_data_dir(writing);
mr->mr_nents = ib_dma_map_sg(r_xprt->rx_ia.ri_device,
mr->mr_sg, i, mr->mr_dir);
if (!mr->mr_nents)
goto out_dmamap_err;
for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++)
dma_pages[i] = sg_dma_address(&mr->mr_sg[i]);
rc = ib_map_phys_fmr(mr->fmr.fm_mr, dma_pages, mr->mr_nents,
dma_pages[0]);
if (rc)
goto out_maperr;
mr->mr_handle = mr->fmr.fm_mr->rkey;
mr->mr_length = len;
mr->mr_offset = dma_pages[0] + pageoff;
*out = mr;
return seg;
out_dmamap_err:
pr_err("rpcrdma: failed to DMA map sg %p sg_nents %d\n",
mr->mr_sg, i);
rpcrdma_mr_put(mr);
return ERR_PTR(-EIO);
out_maperr:
pr_err("rpcrdma: ib_map_phys_fmr %u@0x%llx+%i (%d) status %i\n",
len, (unsigned long long)dma_pages[0],
pageoff, mr->mr_nents, rc);
rpcrdma_mr_unmap_and_put(mr);
return ERR_PTR(-EIO);
}
/* Post Send WR containing the RPC Call message.
*/
static int
fmr_op_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
{
struct ib_send_wr *bad_wr;
return ib_post_send(ia->ri_id->qp, &req->rl_sendctx->sc_wr, &bad_wr);
}
/* Invalidate all memory regions that were registered for "req".
*
* Sleeps until it is safe for the host CPU to access the
* previously mapped memory regions.
*
* Caller ensures that @mrs is not empty before the call. This
* function empties the list.
*/
static void
fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs)
{
struct rpcrdma_mr *mr;
LIST_HEAD(unmap_list);
int rc;
/* ORDER: Invalidate all of the req's MRs first
*
* ib_unmap_fmr() is slow, so use a single call instead
* of one call per mapped FMR.
*/
list_for_each_entry(mr, mrs, mr_list) {
dprintk("RPC: %s: unmapping fmr %p\n",
__func__, &mr->fmr);
trace_xprtrdma_localinv(mr);
list_add_tail(&mr->fmr.fm_mr->list, &unmap_list);
}
r_xprt->rx_stats.local_inv_needed++;
rc = ib_unmap_fmr(&unmap_list);
if (rc)
goto out_reset;
/* ORDER: Now DMA unmap all of the req's MRs, and return
* them to the free MW list.
*/
while (!list_empty(mrs)) {
mr = rpcrdma_mr_pop(mrs);
list_del(&mr->fmr.fm_mr->list);
rpcrdma_mr_unmap_and_put(mr);
}
return;
out_reset:
pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc);
while (!list_empty(mrs)) {
mr = rpcrdma_mr_pop(mrs);
list_del(&mr->fmr.fm_mr->list);
fmr_op_recover_mr(mr);
}
}
const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
.ro_map = fmr_op_map,
.ro_send = fmr_op_send,
.ro_unmap_sync = fmr_op_unmap_sync,
.ro_recover_mr = fmr_op_recover_mr,
.ro_open = fmr_op_open,
.ro_maxpages = fmr_op_maxpages,
.ro_init_mr = fmr_op_init_mr,
.ro_release_mr = fmr_op_release_mr,
.ro_displayname = "fmr",
.ro_send_w_inv_ok = 0,
};