diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c
new file mode 100644
index 000000000000..682a290bc11e
--- /dev/null
+++ b/drivers/infiniband/sw/siw/siw_qp_rx.c
@@ -0,0 +1,1456 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+#include "siw_mem.h"
+
+/*
+ * siw_rx_umem()
+ *
+ * Receive data of @len into target referenced by @dest_addr.
+ *
+ * @srx:	Receive Context
+ * @umem:	siw representation of target memory
+ * @dest_addr:	user virtual address
+ * @len:	number of bytes to place
+ */
+static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
+		       u64 dest_addr, int len)
+{
+	int copied = 0;
+
+	while (len) {
+		struct page *p;
+		int pg_off, bytes, rv;
+		void *dest;
+
+		p = siw_get_upage(umem, dest_addr);
+		if (unlikely(!p)) {
+			pr_warn("siw: %s: [QP %u]: bogus addr: %p, %p\n",
+				__func__, qp_id(rx_qp(srx)),
+				(void *)dest_addr, (void *)umem->fp_addr);
+			/* siw internal error */
+			srx->skb_copied += copied;
+			srx->skb_new -= copied;
+
+			return -EFAULT;
+		}
+		pg_off = dest_addr & ~PAGE_MASK;
+		bytes = min(len, (int)PAGE_SIZE - pg_off);
+
+		siw_dbg_qp(rx_qp(srx), "page %p, bytes=%u\n", p, bytes);
+
+		dest = kmap_atomic(p);
+		rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off,
+				   bytes);
+
+		if (unlikely(rv)) {
+			kunmap_atomic(dest);
+			srx->skb_copied += copied;
+			srx->skb_new -= copied;
+
+			pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n",
+				qp_id(rx_qp(srx)), __func__, len, p, rv);
+
+			return -EFAULT;
+		}
+		if (srx->mpa_crc_hd) {
+			if (rx_qp(srx)->kernel_verbs) {
+				crypto_shash_update(srx->mpa_crc_hd,
+					(u8 *)(dest + pg_off), bytes);
+				kunmap_atomic(dest);
+			} else {
+				kunmap_atomic(dest);
+				/*
+				 * Do CRC on original, not target buffer.
+				 * Some user land applications may
+				 * concurrently write the target buffer,
+				 * which would yield a broken CRC.
+				 * Walking the skb twice is very inefficient.
+				 * Folding the CRC into skb_copy_bits()
+				 * would be much better, but is currently
+				 * not supported.
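+				 * siw_crc_skb() below therefore checksums
+				 * the received skb data directly, at the
+				 * cost of touching the payload a second
+				 * time.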
+				 */
+				siw_crc_skb(srx, bytes);
+			}
+		} else {
+			kunmap_atomic(dest);
+		}
+		srx->skb_offset += bytes;
+		copied += bytes;
+		len -= bytes;
+		dest_addr += bytes;
+		pg_off = 0;
+	}
+	srx->skb_copied += copied;
+	srx->skb_new -= copied;
+
+	return copied;
+}
+
+static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
+{
+	int rv;
+
+	siw_dbg_qp(rx_qp(srx), "kva: 0x%p, len: %u\n", kva, len);
+
+	rv = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
+	if (unlikely(rv)) {
+		pr_warn("siw: [QP %u]: %s, len %d, kva 0x%p, rv %d\n",
+			qp_id(rx_qp(srx)), __func__, len, kva, rv);
+
+		return rv;
+	}
+	if (srx->mpa_crc_hd)
+		crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len);
+
+	srx->skb_offset += len;
+	srx->skb_copied += len;
+	srx->skb_new -= len;
+
+	return len;
+}
+
+static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx,
+		      struct siw_mem *mem, u64 addr, int len)
+{
+	struct siw_pbl *pbl = mem->pbl;
+	u64 offset = addr - mem->va;
+	int copied = 0;
+
+	while (len) {
+		int bytes;
+		u64 buf_addr =
+			siw_pbl_get_buffer(pbl, offset, &bytes, pbl_idx);
+		if (!buf_addr)
+			break;
+
+		bytes = min(bytes, len);
+		if (siw_rx_kva(srx, (void *)buf_addr, bytes) == bytes) {
+			copied += bytes;
+			offset += bytes;
+			len -= bytes;
+		} else {
+			break;
+		}
+	}
+	return copied;
+}
+
+/*
+ * siw_rresp_check_ntoh()
+ *
+ * Check incoming RRESP fragment header against expected
+ * header values and update expected values for potential next
+ * fragment.
+ *
+ * NOTE: This function must be called only if a RRESP DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segment.
+ */
+static int siw_rresp_check_ntoh(struct siw_rx_stream *srx,
+				struct siw_rx_fpdu *frx)
+{
+	struct iwarp_rdma_rresp *rresp = &srx->hdr.rresp;
+	struct siw_wqe *wqe = &frx->wqe_active;
+	enum ddp_ecode ecode;
+
+	u32 sink_stag = be32_to_cpu(rresp->sink_stag);
+	u64 sink_to = be64_to_cpu(rresp->sink_to);
+
+	if (frx->first_ddp_seg) {
+		srx->ddp_stag = wqe->sqe.sge[0].lkey;
+		srx->ddp_to = wqe->sqe.sge[0].laddr;
+		frx->pbl_idx = 0;
+	}
+	/* Below checks extend beyond the semantics of DDP, and
+	 * into RDMAP:
+	 * We check if the read response matches exactly the
+	 * read request which was sent to the remote peer to
+	 * trigger this read response. RFC5040/5041 do not
+	 * always have a proper error code for the detected
+	 * error cases. We choose 'base or bounds error' for
+	 * cases where the inbound STag is valid, but offset
+	 * or length do not match our response receive state.
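+	 * A mismatching inbound STag, by contrast, maps to the
+	 * dedicated 'invalid STag' error code below.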
+	 */
+	if (unlikely(srx->ddp_stag != sink_stag)) {
+		pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
+			qp_id(rx_qp(srx)), sink_stag, srx->ddp_stag);
+		ecode = DDP_ECODE_T_INVALID_STAG;
+		goto error;
+	}
+	if (unlikely(srx->ddp_to != sink_to)) {
+		pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
+			qp_id(rx_qp(srx)), (unsigned long long)sink_to,
+			(unsigned long long)srx->ddp_to);
+		ecode = DDP_ECODE_T_BASE_BOUNDS;
+		goto error;
+	}
+	if (unlikely(!frx->more_ddp_segs &&
+		     (wqe->processed + srx->fpdu_part_rem != wqe->bytes))) {
+		pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
+			qp_id(rx_qp(srx)),
+			wqe->processed + srx->fpdu_part_rem, wqe->bytes);
+		ecode = DDP_ECODE_T_BASE_BOUNDS;
+		goto error;
+	}
+	return 0;
+error:
+	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
+	return -EINVAL;
+}
+
+/*
+ * siw_write_check_ntoh()
+ *
+ * Check incoming WRITE fragment header against expected
+ * header values and update expected values for potential next
+ * fragment
+ *
+ * NOTE: This function must be called only if a WRITE DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segment.
+ */
+static int siw_write_check_ntoh(struct siw_rx_stream *srx,
+				struct siw_rx_fpdu *frx)
+{
+	struct iwarp_rdma_write *write = &srx->hdr.rwrite;
+	enum ddp_ecode ecode;
+
+	u32 sink_stag = be32_to_cpu(write->sink_stag);
+	u64 sink_to = be64_to_cpu(write->sink_to);
+
+	if (frx->first_ddp_seg) {
+		srx->ddp_stag = sink_stag;
+		srx->ddp_to = sink_to;
+		frx->pbl_idx = 0;
+	} else {
+		if (unlikely(srx->ddp_stag != sink_stag)) {
+			pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
+				qp_id(rx_qp(srx)), sink_stag,
+				srx->ddp_stag);
+			ecode = DDP_ECODE_T_INVALID_STAG;
+			goto error;
+		}
+		if (unlikely(srx->ddp_to != sink_to)) {
+			pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
+				qp_id(rx_qp(srx)),
+				(unsigned long long)sink_to,
+				(unsigned long long)srx->ddp_to);
+			ecode = DDP_ECODE_T_BASE_BOUNDS;
+			goto error;
+		}
+	}
+	return 0;
+error:
+	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+			   DDP_ETYPE_TAGGED_BUF, ecode, 0);
+	return -EINVAL;
+}
+
+/*
+ * siw_send_check_ntoh()
+ *
+ * Check incoming SEND fragment header against expected
+ * header values and update expected MSN if no next
+ * fragment expected
+ *
+ * NOTE: This function must be called only if a SEND DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segment.
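+ *
+ * Unlike the tagged WRITE/RRESP checks above, SEND placement
+ * is validated against untagged queue state: QN, MSN and MO
+ * must match what this receive side expects.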
+ */
+static int siw_send_check_ntoh(struct siw_rx_stream *srx,
+			       struct siw_rx_fpdu *frx)
+{
+	struct iwarp_send_inv *send = &srx->hdr.send_inv;
+	struct siw_wqe *wqe = &frx->wqe_active;
+	enum ddp_ecode ecode;
+
+	u32 ddp_msn = be32_to_cpu(send->ddp_msn);
+	u32 ddp_mo = be32_to_cpu(send->ddp_mo);
+	u32 ddp_qn = be32_to_cpu(send->ddp_qn);
+
+	if (unlikely(ddp_qn != RDMAP_UNTAGGED_QN_SEND)) {
+		pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
+			qp_id(rx_qp(srx)), ddp_qn);
+		ecode = DDP_ECODE_UT_INVALID_QN;
+		goto error;
+	}
+	if (unlikely(ddp_msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) {
+		pr_warn("siw: [QP %u]: send msn: %u != %u\n",
+			qp_id(rx_qp(srx)), ddp_msn,
+			srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
+		ecode = DDP_ECODE_UT_INVALID_MSN_RANGE;
+		goto error;
+	}
+	if (unlikely(ddp_mo != wqe->processed)) {
+		pr_warn("siw: [QP %u], send mo: %u != %u\n",
+			qp_id(rx_qp(srx)), ddp_mo, wqe->processed);
+		ecode = DDP_ECODE_UT_INVALID_MO;
+		goto error;
+	}
+	if (frx->first_ddp_seg) {
+		/* initialize user memory write position */
+		frx->sge_idx = 0;
+		frx->sge_off = 0;
+		frx->pbl_idx = 0;
+
+		/* only valid for SEND_INV and SEND_SE_INV operations */
+		srx->inval_stag = be32_to_cpu(send->inval_stag);
+	}
+	if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) {
+		siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n",
+			   wqe->bytes, wqe->processed, srx->fpdu_part_rem);
+		wqe->wc_status = SIW_WC_LOC_LEN_ERR;
+		ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF;
+		goto error;
+	}
+	return 0;
+error:
+	siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+			   DDP_ETYPE_UNTAGGED_BUF, ecode, 0);
+	return -EINVAL;
+}
+
+static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
+{
+	struct siw_rqe *rqe;
+	struct siw_srq *srq;
+	struct siw_wqe *wqe = NULL;
+	bool srq_event = false;
+	unsigned long flags;
+
+	srq = qp->srq;
+	if (srq) {
+		spin_lock_irqsave(&srq->lock, flags);
+		if (unlikely(!srq->num_rqe))
+			goto out;
+
+		rqe = &srq->recvq[srq->rq_get % srq->num_rqe];
+	} else {
+		if (unlikely(!qp->recvq))
+			goto out;
+
+		rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
+	}
+	if (likely(rqe->flags == SIW_WQE_VALID)) {
+		int num_sge = rqe->num_sge;
+
+		if (likely(num_sge <= SIW_MAX_SGE)) {
+			int i = 0;
+
+			wqe = rx_wqe(&qp->rx_untagged);
+			rx_type(wqe) = SIW_OP_RECEIVE;
+			wqe->wr_status = SIW_WR_INPROGRESS;
+			wqe->bytes = 0;
+			wqe->processed = 0;
+
+			wqe->rqe.id = rqe->id;
+			wqe->rqe.num_sge = num_sge;
+
+			while (i < num_sge) {
+				wqe->rqe.sge[i].laddr = rqe->sge[i].laddr;
+				wqe->rqe.sge[i].lkey = rqe->sge[i].lkey;
+				wqe->rqe.sge[i].length = rqe->sge[i].length;
+				wqe->bytes += wqe->rqe.sge[i].length;
+				wqe->mem[i] = NULL;
+				i++;
+			}
+			/* can be re-used by appl */
+			smp_store_mb(rqe->flags, 0);
+		} else {
+			siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge);
+			if (srq)
+				spin_unlock_irqrestore(&srq->lock, flags);
+			return NULL;
+		}
+		if (!srq) {
+			qp->rq_get++;
+		} else {
+			if (srq->armed) {
+				/* Test SRQ limit */
+				u32 off = (srq->rq_get + srq->limit) %
+					  srq->num_rqe;
+				struct siw_rqe *rqe2 = &srq->recvq[off];
+
+				if (!(rqe2->flags & SIW_WQE_VALID)) {
+					srq->armed = 0;
+					srq_event = true;
+				}
+			}
+			srq->rq_get++;
+		}
+	}
+out:
+	if (srq) {
+		spin_unlock_irqrestore(&srq->lock, flags);
+		if (srq_event)
+			siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED);
+	}
+	return wqe;
+}
+
+/*
+ * siw_proc_send:
+ *
+ * Process one incoming SEND and place data into memory referenced by
+ * receive wqe.
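+ * Received data may spread over multiple SGEs of that wqe.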
+ *
+ * Function supports partially received sends (suspending/resuming
+ * current receive wqe processing)
+ *
+ * return value:
+ *	0:       reached the end of a DDP segment
+ *	-EAGAIN: to be called again to finish the DDP segment
+ */
+int siw_proc_send(struct siw_qp *qp)
+{
+	struct siw_rx_stream *srx = &qp->rx_stream;
+	struct siw_rx_fpdu *frx = &qp->rx_untagged;
+	struct siw_wqe *wqe;
+	u32 data_bytes; /* all data bytes available */
+	u32 rcvd_bytes; /* sum of data bytes rcvd */
+	int rv = 0;
+
+	if (frx->first_ddp_seg) {
+		wqe = siw_rqe_get(qp);
+		if (unlikely(!wqe)) {
+			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+					   DDP_ETYPE_UNTAGGED_BUF,
+					   DDP_ECODE_UT_INVALID_MSN_NOBUF, 0);
+			return -ENOENT;
+		}
+	} else {
+		wqe = rx_wqe(frx);
+	}
+	if (srx->state == SIW_GET_DATA_START) {
+		rv = siw_send_check_ntoh(srx, frx);
+		if (unlikely(rv)) {
+			siw_qp_event(qp, IB_EVENT_QP_FATAL);
+			return rv;
+		}
+		if (!srx->fpdu_part_rem) /* zero length SEND */
+			return 0;
+	}
+	data_bytes = min(srx->fpdu_part_rem, srx->skb_new);
+	rcvd_bytes = 0;
+
+	/* A zero length SEND will skip below loop */
+	while (data_bytes) {
+		struct ib_pd *pd;
+		struct siw_mem **mem, *mem_p;
+		struct siw_sge *sge;
+		u32 sge_bytes; /* data bytes avail for SGE */
+
+		sge = &wqe->rqe.sge[frx->sge_idx];
+
+		if (!sge->length) {
+			/* just skip empty sges */
+			frx->sge_idx++;
+			frx->sge_off = 0;
+			frx->pbl_idx = 0;
+			continue;
+		}
+		sge_bytes = min(data_bytes, sge->length - frx->sge_off);
+		mem = &wqe->mem[frx->sge_idx];
+
+		/*
+		 * check with QP's PD if no SRQ present, SRQ's PD otherwise
+		 */
+		pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd;
+
+		rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE,
+				   frx->sge_off, sge_bytes);
+		if (unlikely(rv)) {
+			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+					   DDP_ETYPE_CATASTROPHIC,
+					   DDP_ECODE_CATASTROPHIC, 0);
+
+			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
+			break;
+		}
+		mem_p = *mem;
+		if (mem_p->mem_obj == NULL)
+			rv = siw_rx_kva(srx,
+					(void *)(sge->laddr + frx->sge_off),
+					sge_bytes);
+		else if (!mem_p->is_pbl)
+			rv = siw_rx_umem(srx, mem_p->umem,
+					 sge->laddr + frx->sge_off, sge_bytes);
+		else
+			rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
+					sge->laddr + frx->sge_off, sge_bytes);
+
+		if (unlikely(rv != sge_bytes)) {
+			wqe->processed += rcvd_bytes;
+
+			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+					   DDP_ETYPE_CATASTROPHIC,
+					   DDP_ECODE_CATASTROPHIC, 0);
+			return -EINVAL;
+		}
+		frx->sge_off += rv;
+
+		if (frx->sge_off == sge->length) {
+			frx->sge_idx++;
+			frx->sge_off = 0;
+			frx->pbl_idx = 0;
+		}
+		data_bytes -= rv;
+		rcvd_bytes += rv;
+
+		srx->fpdu_part_rem -= rv;
+		srx->fpdu_part_rcvd += rv;
+	}
+	wqe->processed += rcvd_bytes;
+
+	if (!srx->fpdu_part_rem)
+		return 0;
+
+	return (rv < 0) ? rv : -EAGAIN;
+}
+
+/*
+ * siw_proc_write:
+ *
+ * Place incoming WRITE after referencing and checking target buffer
+ *
+ * Function supports partially received WRITEs (suspending/resuming
+ * current receive processing)
+ *
+ * return value:
+ *	0:       reached the end of a DDP segment
+ *	-EAGAIN: to be called again to finish the DDP segment
+ */
+int siw_proc_write(struct siw_qp *qp)
+{
+	struct siw_rx_stream *srx = &qp->rx_stream;
+	struct siw_rx_fpdu *frx = &qp->rx_tagged;
+	struct siw_mem *mem;
+	int bytes, rv;
+
+	if (srx->state == SIW_GET_DATA_START) {
+		if (!srx->fpdu_part_rem) /* zero length WRITE */
+			return 0;
+
+		rv = siw_write_check_ntoh(srx, frx);
+		if (unlikely(rv)) {
+			siw_qp_event(qp, IB_EVENT_QP_FATAL);
+			return rv;
+		}
+	}
+	bytes = min(srx->fpdu_part_rem, srx->skb_new);
+
+	if (frx->first_ddp_seg) {
+		struct siw_wqe *wqe = rx_wqe(frx);
+
+		rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8);
+		if (unlikely(!rx_mem(frx))) {
+			siw_dbg_qp(qp,
+				   "sink stag not found/invalid, stag 0x%08x\n",
+				   srx->ddp_stag);
+
+			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+					   DDP_ETYPE_TAGGED_BUF,
+					   DDP_ECODE_T_INVALID_STAG, 0);
+			return -EINVAL;
+		}
+		wqe->rqe.num_sge = 1;
+		rx_type(wqe) = SIW_OP_WRITE;
+		wqe->wr_status = SIW_WR_INPROGRESS;
+	}
+	mem = rx_mem(frx);
+
+	/*
+	 * Check if application re-registered memory with different
+	 * key field of STag.
+	 */
+	if (unlikely(mem->stag != srx->ddp_stag)) {
+		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+				   DDP_ETYPE_TAGGED_BUF,
+				   DDP_ECODE_T_INVALID_STAG, 0);
+		return -EINVAL;
+	}
+	rv = siw_check_mem(qp->pd, mem, srx->ddp_to + srx->fpdu_part_rcvd,
+			   IB_ACCESS_REMOTE_WRITE, bytes);
+	if (unlikely(rv)) {
+		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+				   DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv),
+				   0);
+
+		siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
+
+		return -EINVAL;
+	}
+
+	if (mem->mem_obj == NULL)
+		rv = siw_rx_kva(srx,
+				(void *)(srx->ddp_to + srx->fpdu_part_rcvd),
+				bytes);
+	else if (!mem->is_pbl)
+		rv = siw_rx_umem(srx, mem->umem,
+				 srx->ddp_to + srx->fpdu_part_rcvd, bytes);
+	else
+		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem,
+				srx->ddp_to + srx->fpdu_part_rcvd, bytes);
+
+	if (unlikely(rv != bytes)) {
+		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+				   DDP_ETYPE_CATASTROPHIC,
+				   DDP_ECODE_CATASTROPHIC, 0);
+		return -EINVAL;
+	}
+	srx->fpdu_part_rem -= rv;
+	srx->fpdu_part_rcvd += rv;
+
+	if (!srx->fpdu_part_rem) {
+		srx->ddp_to += srx->fpdu_part_rcvd;
+		return 0;
+	}
+	return -EAGAIN;
+}
+
+/*
+ * Inbound RREQs cannot carry user data.
+ */
+int siw_proc_rreq(struct siw_qp *qp)
+{
+	struct siw_rx_stream *srx = &qp->rx_stream;
+
+	if (!srx->fpdu_part_rem)
+		return 0;
+
+	pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp),
+		be16_to_cpu(srx->hdr.ctrl.mpa_len));
+
+	return -EPROTO;
+}
+
+/*
+ * siw_init_rresp:
+ *
+ * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
+ * Put it at the tail of the IRQ, if there is another WQE currently in
+ * transmit processing. If not, make it the current WQE to be processed
+ * and schedule transmit processing.
+ *
+ * Can be called from softirq context and from process
+ * context (RREAD socket loopback case!)
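+ * The SQ lock is hence taken irqsave.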
+ *
+ * return value:
+ *	0:      success,
+ *		failure code otherwise
+ */
+
+static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
+{
+	struct siw_wqe *tx_work = tx_wqe(qp);
+	struct siw_sqe *resp;
+
+	uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to),
+		 laddr = be64_to_cpu(srx->hdr.rreq.source_to);
+	uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size),
+		 lkey = be32_to_cpu(srx->hdr.rreq.source_stag),
+		 rkey = be32_to_cpu(srx->hdr.rreq.sink_stag),
+		 msn = be32_to_cpu(srx->hdr.rreq.ddp_msn);
+
+	int run_sq = 1, rv = 0;
+	unsigned long flags;
+
+	if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) {
+		siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+				   DDP_ETYPE_UNTAGGED_BUF,
+				   DDP_ECODE_UT_INVALID_MSN_RANGE, 0);
+		return -EPROTO;
+	}
+	spin_lock_irqsave(&qp->sq_lock, flags);
+
+	if (tx_work->wr_status == SIW_WR_IDLE) {
+		/*
+		 * immediately schedule READ response w/o
+		 * consuming IRQ entry: IRQ must be empty.
+		 */
+		tx_work->processed = 0;
+		tx_work->mem[0] = NULL;
+		tx_work->wr_status = SIW_WR_QUEUED;
+		resp = &tx_work->sqe;
+	} else {
+		resp = irq_alloc_free(qp);
+		run_sq = 0;
+	}
+	if (likely(resp)) {
+		resp->opcode = SIW_OP_READ_RESPONSE;
+
+		resp->sge[0].length = length;
+		resp->sge[0].laddr = laddr;
+		resp->sge[0].lkey = lkey;
+
+		/* Keep aside message sequence number for potential
+		 * error reporting during Read Response generation.
+		 */
+		resp->sge[1].length = msn;
+
+		resp->raddr = raddr;
+		resp->rkey = rkey;
+		resp->num_sge = length ? 1 : 0;
+
+		/* RRESP now valid as current TX wqe or placed into IRQ */
+		smp_store_mb(resp->flags, SIW_WQE_VALID);
+	} else {
+		pr_warn("siw: [QP %u]: irq %d exceeded %d\n", qp_id(qp),
+			qp->irq_put % qp->attrs.irq_size, qp->attrs.irq_size);
+
+		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+				   RDMAP_ETYPE_REMOTE_OPERATION,
+				   RDMAP_ECODE_CATASTROPHIC_STREAM, 0);
+		rv = -EPROTO;
+	}
+
+	spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+	if (run_sq)
+		rv = siw_sq_start(qp);
+
+	return rv;
+}
+
+/*
+ * Only called at start of Read.Response processing.
+ * Transfer pending Read from tip of ORQ into current rx wqe,
+ * but keep ORQ entry valid until Read.Response processing done.
+ * No Queue locking needed.
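+ * The ORQ entry is freed later: by siw_check_tx_fence() on
+ * success, or by siw_rdmap_complete() in the error case.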
+ */
+static int siw_orqe_start_rx(struct siw_qp *qp)
+{
+	struct siw_sqe *orqe;
+	struct siw_wqe *wqe = NULL;
+
+	/* make sure ORQ indices are current */
+	smp_mb();
+
+	orqe = orq_get_current(qp);
+	if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) {
+		/* RRESP is a TAGGED RDMAP operation */
+		wqe = rx_wqe(&qp->rx_tagged);
+		wqe->sqe.id = orqe->id;
+		wqe->sqe.opcode = orqe->opcode;
+		wqe->sqe.sge[0].laddr = orqe->sge[0].laddr;
+		wqe->sqe.sge[0].lkey = orqe->sge[0].lkey;
+		wqe->sqe.sge[0].length = orqe->sge[0].length;
+		wqe->sqe.flags = orqe->flags;
+		wqe->sqe.num_sge = 1;
+		wqe->bytes = orqe->sge[0].length;
+		wqe->processed = 0;
+		wqe->mem[0] = NULL;
+		/* make sure WQE is completely written before valid */
+		smp_wmb();
+		wqe->wr_status = SIW_WR_INPROGRESS;
+
+		return 0;
+	}
+	return -EPROTO;
+}
+
+/*
+ * siw_proc_rresp:
+ *
+ * Place incoming RRESP data into memory referenced by RREQ WQE
+ * which is at the tip of the ORQ
+ *
+ * Function supports partially received RRESPs (suspending/resuming
+ * current receive processing)
+ */
+int siw_proc_rresp(struct siw_qp *qp)
+{
+	struct siw_rx_stream *srx = &qp->rx_stream;
+	struct siw_rx_fpdu *frx = &qp->rx_tagged;
+	struct siw_wqe *wqe = rx_wqe(frx);
+	struct siw_mem **mem, *mem_p;
+	struct siw_sge *sge;
+	int bytes, rv;
+
+	if (frx->first_ddp_seg) {
+		if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
+			pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
+				qp_id(qp), wqe->wr_status, wqe->sqe.opcode);
+			rv = -EPROTO;
+			goto error_term;
+		}
+		/*
+		 * fetch pending RREQ from orq
+		 */
+		rv = siw_orqe_start_rx(qp);
+		if (rv) {
+			pr_warn("siw: [QP %u]: ORQ empty at idx %d\n",
+				qp_id(qp), qp->orq_get % qp->attrs.orq_size);
+			goto error_term;
+		}
+		rv = siw_rresp_check_ntoh(srx, frx);
+		if (unlikely(rv)) {
+			siw_qp_event(qp, IB_EVENT_QP_FATAL);
+			return rv;
+		}
+	} else {
+		if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) {
+			pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
+				qp_id(qp), wqe->wr_status);
+			rv = -EPROTO;
+			goto error_term;
+		}
+	}
+	if (!srx->fpdu_part_rem) /* zero length RRESPONSE */
+		return 0;
+
+	sge = wqe->sqe.sge; /* there is only one */
+	mem = &wqe->mem[0];
+
+	if (!(*mem)) {
+		/*
+		 * check target memory which resolves memory on first fragment
+		 */
+		rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0,
+				   wqe->bytes);
+		if (unlikely(rv)) {
+			siw_dbg_qp(qp, "target mem check: %d\n", rv);
+			wqe->wc_status = SIW_WC_LOC_PROT_ERR;
+
+			siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+					   DDP_ETYPE_TAGGED_BUF,
+					   siw_tagged_error(-rv), 0);
+
+			siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
+
+			return -EINVAL;
+		}
+	}
+	mem_p = *mem;
+
+	bytes = min(srx->fpdu_part_rem, srx->skb_new);
+
+	if (mem_p->mem_obj == NULL)
+		rv = siw_rx_kva(srx, (void *)(sge->laddr + wqe->processed),
+				bytes);
+	else if (!mem_p->is_pbl)
+		rv = siw_rx_umem(srx, mem_p->umem, sge->laddr + wqe->processed,
+				 bytes);
+	else
+		rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p,
+				sge->laddr + wqe->processed, bytes);
+	if (rv != bytes) {
+		wqe->wc_status = SIW_WC_GENERAL_ERR;
+		rv = -EINVAL;
+		goto error_term;
+	}
+	srx->fpdu_part_rem -= rv;
+	srx->fpdu_part_rcvd += rv;
+	wqe->processed += rv;
+
+	if (!srx->fpdu_part_rem) {
+		srx->ddp_to += srx->fpdu_part_rcvd;
+		return 0;
+	}
+	return -EAGAIN;
+
+error_term:
+	siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC,
+			   DDP_ECODE_CATASTROPHIC, 0);
+	return rv;
+}
+
+int siw_proc_terminate(struct siw_qp *qp)
+{
+	struct siw_rx_stream *srx = &qp->rx_stream;
+	struct sk_buff *skb = srx->skb;
+	struct iwarp_terminate *term = &srx->hdr.terminate;
+	union iwarp_hdr term_info;
+	u8 *infop = (u8 *)&term_info;
+	enum rdma_opcode op;
+	u16 to_copy = sizeof(struct iwarp_ctrl);
+
+	pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n",
+		__rdmap_term_layer(term), __rdmap_term_etype(term),
+		__rdmap_term_ecode(term));
+
+	if (be32_to_cpu(term->ddp_qn) != RDMAP_UNTAGGED_QN_TERMINATE ||
+	    be32_to_cpu(term->ddp_msn) !=
+		    qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] ||
+	    be32_to_cpu(term->ddp_mo) != 0) {
+		pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
+			be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn),
+			be32_to_cpu(term->ddp_mo));
+		return -ECONNRESET;
+	}
+	/*
+	 * Receive remaining pieces of TERM if indicated
+	 */
+	if (!term->flag_m)
+		return -ECONNRESET;
+
+	/* Do not take the effort to reassemble a network fragmented
+	 * TERM message
+	 */
+	if (srx->skb_new < sizeof(struct iwarp_ctrl_tagged))
+		return -ECONNRESET;
+
+	memset(infop, 0, sizeof(term_info));
+
+	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
+
+	op = __rdmap_get_opcode(&term_info.ctrl);
+	if (op >= RDMAP_TERMINATE)
+		goto out;
+
+	infop += to_copy;
+	srx->skb_offset += to_copy;
+	srx->skb_new -= to_copy;
+	srx->skb_copied += to_copy;
+	srx->fpdu_part_rcvd += to_copy;
+	srx->fpdu_part_rem -= to_copy;
+
+	to_copy = iwarp_pktinfo[op].hdr_len - to_copy;
+
+	/* Again, no network fragmented TERMs */
+	if (to_copy + MPA_CRC_SIZE > srx->skb_new)
+		return -ECONNRESET;
+
+	skb_copy_bits(skb, srx->skb_offset, infop, to_copy);
+
+	if (term->flag_r) {
+		siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n",
+			   op, be16_to_cpu(term_info.ctrl.mpa_len),
+			   term->flag_m ? "valid" : "invalid");
+	} else if (term->flag_d) {
+		siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n",
+			   op, be16_to_cpu(term_info.ctrl.mpa_len),
+			   term->flag_m ? "valid" : "invalid");
+	}
+out:
+	srx->skb_new -= to_copy;
+	srx->skb_offset += to_copy;
+	srx->skb_copied += to_copy;
+	srx->fpdu_part_rcvd += to_copy;
+	srx->fpdu_part_rem -= to_copy;
+
+	return -ECONNRESET;
+}
+
+static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx)
+{
+	struct sk_buff *skb = srx->skb;
+	u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad;
+	__wsum crc_in, crc_own = 0;
+
+	siw_dbg_qp(qp, "expected %d, available %d, pad %u\n",
+		   srx->fpdu_part_rem, srx->skb_new, srx->pad);
+
+	if (srx->skb_new < srx->fpdu_part_rem)
+		return -EAGAIN;
+
+	skb_copy_bits(skb, srx->skb_offset, tbuf, srx->fpdu_part_rem);
+
+	if (srx->mpa_crc_hd && srx->pad)
+		crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad);
+
+	srx->skb_new -= srx->fpdu_part_rem;
+	srx->skb_offset += srx->fpdu_part_rem;
+	srx->skb_copied += srx->fpdu_part_rem;
+
+	if (!srx->mpa_crc_hd)
+		return 0;
+
+	/*
+	 * CRC32 is computed, transmitted and received directly in NBO,
+	 * so there's never a reason to convert byte order.
+	 */
+	crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own);
+	crc_in = (__force __wsum)srx->trailer.crc;
+
+	if (unlikely(crc_in != crc_own)) {
+		pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
+			crc_in, crc_own, qp->rx_stream.rdmap_op);
+
+		siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
+				   LLP_ETYPE_MPA,
+				   LLP_ECODE_RECEIVED_CRC, 0);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+#define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged)
+
+static int siw_get_hdr(struct siw_rx_stream *srx)
+{
+	struct sk_buff *skb = srx->skb;
+	struct siw_qp *qp = rx_qp(srx);
+	struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl;
+	struct siw_rx_fpdu *frx;
+	u8 opcode;
+	int bytes;
+
+	if (srx->fpdu_part_rcvd < MIN_DDP_HDR) {
+		/*
+		 * copy a minimum sized (tagged) DDP frame control part
+		 */
+		bytes = min_t(int, srx->skb_new,
+			      MIN_DDP_HDR - srx->fpdu_part_rcvd);
+
+		skb_copy_bits(skb, srx->skb_offset,
+			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
+
+		srx->fpdu_part_rcvd += bytes;
+
+		srx->skb_new -= bytes;
+		srx->skb_offset += bytes;
+		srx->skb_copied += bytes;
+
+		if (srx->fpdu_part_rcvd < MIN_DDP_HDR)
+			return -EAGAIN;
+
+		if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) {
+			enum ddp_etype etype;
+			enum ddp_ecode ecode;
+
+			pr_warn("siw: received ddp version unsupported %d\n",
+				__ddp_get_version(c_hdr));
+
+			if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) {
+				etype = DDP_ETYPE_TAGGED_BUF;
+				ecode = DDP_ECODE_T_VERSION;
+			} else {
+				etype = DDP_ETYPE_UNTAGGED_BUF;
+				ecode = DDP_ECODE_UT_VERSION;
+			}
+			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+					   etype, ecode, 0);
+			return -EINVAL;
+		}
+		if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) {
+			pr_warn("siw: received rdmap version unsupported %d\n",
+				__rdmap_get_version(c_hdr));
+
+			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
+					   RDMAP_ETYPE_REMOTE_OPERATION,
+					   RDMAP_ECODE_VERSION, 0);
+			return -EINVAL;
+		}
+		opcode = __rdmap_get_opcode(c_hdr);
+
+		if (opcode > RDMAP_TERMINATE) {
+			pr_warn("siw: received unknown packet type %u\n",
+				opcode);
+
+			siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
+					   RDMAP_ETYPE_REMOTE_OPERATION,
+					   RDMAP_ECODE_OPCODE, 0);
+			return -EINVAL;
+		}
+		siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode);
+	} else {
+		opcode = __rdmap_get_opcode(c_hdr);
+	}
+	set_rx_fpdu_context(qp, opcode);
+	frx = qp->rx_fpdu;
+
+	/*
+	 * Figure out len of current hdr: variable length of
+	 * iwarp hdr may force us to copy hdr information in
+	 * two steps. Only tagged DDP messages are already
+	 * completely received.
+	 */
+	if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) {
+		bytes = iwarp_pktinfo[opcode].hdr_len - MIN_DDP_HDR;
+
+		if (srx->skb_new < bytes)
+			return -EAGAIN;
+
+		skb_copy_bits(skb, srx->skb_offset,
+			      (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
+
+		srx->fpdu_part_rcvd += bytes;
+
+		srx->skb_new -= bytes;
+		srx->skb_offset += bytes;
+		srx->skb_copied += bytes;
+	}
+
+	/*
+	 * DDP/RDMAP header receive completed. Check if the current
+	 * DDP segment starts a new RDMAP message or continues a previously
+	 * started RDMAP message.
+	 *
+	 * Alternating reception of DDP segments (or FPDUs) from incomplete
+	 * tagged and untagged RDMAP messages is supported, as long as
+	 * the current tagged or untagged message gets eventually completed
+	 * w/o intersection from another message of the same type
+	 * (tagged/untagged). E.g., a WRITE can get intersected by a SEND,
+	 * but not by a READ RESPONSE etc.
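+	 *
+	 * This is enforced below: a new message of one type, while
+	 * a message of that same type is still incomplete, stops
+	 * stream processing with -EPROTO.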
+	 */
+	if (srx->mpa_crc_hd) {
+		/*
+		 * Restart CRC computation
+		 */
+		crypto_shash_init(srx->mpa_crc_hd);
+		crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr,
+				    srx->fpdu_part_rcvd);
+	}
+	if (frx->more_ddp_segs) {
+		frx->first_ddp_seg = 0;
+		if (frx->prev_rdmap_op != opcode) {
+			pr_warn("siw: packet intersection: %u : %u\n",
+				frx->prev_rdmap_op, opcode);
+			/*
+			 * The last inbound RDMA operation of same type
+			 * (tagged or untagged) is left unfinished.
+			 * To complete it in error, make it the current
+			 * operation again, even with the header already
+			 * overwritten. For error handling, only the opcode
+			 * and current rx context are relevant.
+			 */
+			set_rx_fpdu_context(qp, frx->prev_rdmap_op);
+			__rdmap_set_opcode(c_hdr, frx->prev_rdmap_op);
+			return -EPROTO;
+		}
+	} else {
+		frx->prev_rdmap_op = opcode;
+		frx->first_ddp_seg = 1;
+	}
+	frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1;
+
+	return 0;
+}
+
+static int siw_check_tx_fence(struct siw_qp *qp)
+{
+	struct siw_wqe *tx_waiting = tx_wqe(qp);
+	struct siw_sqe *rreq;
+	int resume_tx = 0, rv = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->orq_lock, flags);
+
+	rreq = orq_get_current(qp);
+
+	/* free current orq entry */
+	WRITE_ONCE(rreq->flags, 0);
+
+	if (qp->tx_ctx.orq_fence) {
+		if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) {
+			pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
+				qp_id(qp), tx_waiting->wr_status);
+			rv = -EPROTO;
+			goto out;
+		}
+		/* resume SQ processing */
+		if (tx_waiting->sqe.opcode == SIW_OP_READ ||
+		    tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
+			rreq = orq_get_tail(qp);
+			if (unlikely(!rreq)) {
+				pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp));
+				rv = -EPROTO;
+				goto out;
+			}
+			siw_read_to_orq(rreq, &tx_waiting->sqe);
+
+			qp->orq_put++;
+			qp->tx_ctx.orq_fence = 0;
+			resume_tx = 1;
+
+		} else if (siw_orq_empty(qp)) {
+			qp->tx_ctx.orq_fence = 0;
+			resume_tx = 1;
+		} else {
+			pr_warn("siw: [QP %u]: fence resume: orq idx: %d:%d\n",
+				qp_id(qp), qp->orq_get, qp->orq_put);
+			rv = -EPROTO;
+		}
+	}
+	qp->orq_get++;
+out:
+	spin_unlock_irqrestore(&qp->orq_lock, flags);
+
+	if (resume_tx)
+		rv = siw_sq_start(qp);
+
+	return rv;
+}
+
+/*
+ * siw_rdmap_complete()
+ *
+ * Complete processing of an RDMA message after receiving all
+ * DDP segments, or abort processing after encountering an error case.
+ *
+ * o SENDs + RRESPs need receive completion,
+ * o RREQs need READ RESPONSE initialization
+ * o WRITEs need memory dereferencing
+ *
+ * TODO: Failed WRITEs need local error to be surfaced.
+ */
+static int siw_rdmap_complete(struct siw_qp *qp, int error)
+{
+	struct siw_rx_stream *srx = &qp->rx_stream;
+	struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu);
+	enum siw_wc_status wc_status = wqe->wc_status;
+	u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl);
+	int rv = 0;
+
+	switch (opcode) {
+	case RDMAP_SEND_SE:
+	case RDMAP_SEND_SE_INVAL:
+		wqe->rqe.flags |= SIW_WQE_SOLICITED;
+		/* Fall through */
+	case RDMAP_SEND:
+	case RDMAP_SEND_INVAL:
+		if (wqe->wr_status == SIW_WR_IDLE)
+			break;
+
+		srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
+
+		if (error != 0 && wc_status == SIW_WC_SUCCESS)
+			wc_status = SIW_WC_GENERAL_ERR;
+		/*
+		 * Handle STag invalidation request
+		 */
+		if (wc_status == SIW_WC_SUCCESS &&
+		    (opcode == RDMAP_SEND_INVAL ||
+		     opcode == RDMAP_SEND_SE_INVAL)) {
+			rv = siw_invalidate_stag(qp->pd, srx->inval_stag);
+			if (rv) {
+				siw_init_terminate(
+					qp, TERM_ERROR_LAYER_RDMAP,
+					rv == -EACCES ?
+						RDMAP_ETYPE_REMOTE_PROTECTION :
+						RDMAP_ETYPE_REMOTE_OPERATION,
+					RDMAP_ECODE_CANNOT_INVALIDATE, 0);
+
+				wc_status = SIW_WC_REM_INV_REQ_ERR;
+			}
+			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
+					      rv ? 0 : srx->inval_stag,
+					      wc_status);
+		} else {
+			rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
+					      0, wc_status);
+		}
+		siw_wqe_put_mem(wqe, SIW_OP_RECEIVE);
+		break;
+
+	case RDMAP_RDMA_READ_RESP:
+		if (wqe->wr_status == SIW_WR_IDLE)
+			break;
+
+		if (error != 0) {
+			if ((srx->state == SIW_GET_HDR &&
+			     qp->rx_fpdu->first_ddp_seg) || error == -ENODATA)
+				/* possible RREQ in ORQ left untouched */
+				break;
+
+			if (wc_status == SIW_WC_SUCCESS)
+				wc_status = SIW_WC_GENERAL_ERR;
+		} else if (qp->kernel_verbs &&
+			   rx_type(wqe) == SIW_OP_READ_LOCAL_INV) {
+			/*
+			 * Handle any STag invalidation request
+			 */
+			rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey);
+			if (rv) {
+				siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+						   RDMAP_ETYPE_CATASTROPHIC,
+						   RDMAP_ECODE_UNSPECIFIED, 0);
+
+				if (wc_status == SIW_WC_SUCCESS) {
+					wc_status = SIW_WC_GENERAL_ERR;
+					error = rv;
+				}
+			}
+		}
+		/*
+		 * All errors turn the wqe into signalled.
+		 */
+		if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
+			rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
+					      wc_status);
+		siw_wqe_put_mem(wqe, SIW_OP_READ);
+
+		if (!error)
+			rv = siw_check_tx_fence(qp);
+		else
+			/* Disable current ORQ element */
+			WRITE_ONCE(orq_get_current(qp)->flags, 0);
+		break;
+
+	case RDMAP_RDMA_READ_REQ:
+		if (!error) {
+			rv = siw_init_rresp(qp, srx);
+			srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
+		}
+		break;
+
+	case RDMAP_RDMA_WRITE:
+		if (wqe->wr_status == SIW_WR_IDLE)
+			break;
+
+		/*
+		 * Free References from memory object if
+		 * attached to receive context (inbound WRITE).
+		 * While a zero-length WRITE is allowed,
+		 * no memory reference got created.
+		 */
+		if (rx_mem(&qp->rx_tagged)) {
+			siw_mem_put(rx_mem(&qp->rx_tagged));
+			rx_mem(&qp->rx_tagged) = NULL;
+		}
+		break;
+
+	default:
+		break;
+	}
+	wqe->wr_status = SIW_WR_IDLE;
+
+	return rv;
+}
+
+/*
+ * siw_tcp_rx_data()
+ *
+ * Main routine to consume inbound TCP payload
+ *
+ * @rd_desc:	read descriptor
+ * @skb:	socket buffer
+ * @off:	offset in skb
+ * @len:	skb->len - offset : payload in skb
+ */
+int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
+		    unsigned int off, size_t len)
+{
+	struct siw_qp *qp = rd_desc->arg.data;
+	struct siw_rx_stream *srx = &qp->rx_stream;
+	int rv;
+
+	srx->skb = skb;
+	srx->skb_new = skb->len - off;
+	srx->skb_offset = off;
+	srx->skb_copied = 0;
+
+	siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);
+
+	while (srx->skb_new) {
+		int run_completion = 1;
+
+		if (unlikely(srx->rx_suspend)) {
+			/* Do not process any more data */
+			srx->skb_copied += srx->skb_new;
+			break;
+		}
+		switch (srx->state) {
+		case SIW_GET_HDR:
+			rv = siw_get_hdr(srx);
+			if (!rv) {
+				srx->fpdu_part_rem =
+					be16_to_cpu(srx->hdr.ctrl.mpa_len) -
+					srx->fpdu_part_rcvd + MPA_HDR_SIZE;
+
+				if (srx->fpdu_part_rem)
+					srx->pad = -srx->fpdu_part_rem & 0x3;
+				else
+					srx->pad = 0;
+
+				srx->state = SIW_GET_DATA_START;
+				srx->fpdu_part_rcvd = 0;
+			}
+			break;
+
+		case SIW_GET_DATA_MORE:
+			/*
+			 * Another data fragment of the same DDP segment.
+			 * Setting first_ddp_seg = 0 avoids repeating
+			 * initializations that shall occur only once per
+			 * DDP segment.
+			 */
+			qp->rx_fpdu->first_ddp_seg = 0;
+			/* Fall through */
+
+		case SIW_GET_DATA_START:
+			/*
+			 * Headers will be checked by the opcode-specific
+			 * data receive function below.
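+			 * On success, any MPA padding plus the CRC
+			 * trailer remain to be consumed next.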
+			 */
+			rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp);
+			if (!rv) {
+				int mpa_len =
+					be16_to_cpu(srx->hdr.ctrl.mpa_len) +
+					MPA_HDR_SIZE;
+
+				srx->fpdu_part_rem = (-mpa_len & 0x3) +
+						     MPA_CRC_SIZE;
+				srx->fpdu_part_rcvd = 0;
+				srx->state = SIW_GET_TRAILER;
+			} else {
+				if (unlikely(rv == -ECONNRESET))
+					run_completion = 0;
+				else
+					srx->state = SIW_GET_DATA_MORE;
+			}
+			break;
+
+		case SIW_GET_TRAILER:
+			/*
+			 * read CRC + any padding
+			 */
+			rv = siw_get_trailer(qp, srx);
+			if (likely(!rv)) {
+				/*
+				 * FPDU completed.
+				 * complete RDMAP message if last fragment
+				 */
+				srx->state = SIW_GET_HDR;
+				srx->fpdu_part_rcvd = 0;
+
+				if (!(srx->hdr.ctrl.ddp_rdmap_ctrl &
+				      DDP_FLAG_LAST))
+					/* more frags */
+					break;
+
+				rv = siw_rdmap_complete(qp, 0);
+				run_completion = 0;
+			}
+			break;
+
+		default:
+			pr_warn("QP[%u]: RX out of state\n", qp_id(qp));
+			rv = -EPROTO;
+			run_completion = 0;
+		}
+		if (unlikely(rv != 0 && rv != -EAGAIN)) {
+			if ((srx->state > SIW_GET_HDR ||
+			     qp->rx_fpdu->more_ddp_segs) && run_completion)
+				siw_rdmap_complete(qp, rv);
+
+			siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
+				   srx->state);
+
+			siw_qp_cm_drop(qp, 1);
+
+			break;
+		}
+		if (rv) {
+			siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n",
+				   srx->state, srx->fpdu_part_rem);
+			break;
+		}
+	}
+	return srx->skb_copied;
+}