From 678ea9b5baab6800692b249bdba77c3c07261d61 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Thu, 31 Jul 2014 14:35:43 -0500 Subject: [PATCH 01/52] RDMA/cxgb4: Only call CQ completion handler if it is armed The function __flush_qp() always calls the ULP's CQ completion handler functions even if the CQ was not armed. This can crash the system if the function pointer is NULL. The iSER ULP behaves this way: no completion handler and never arm the CQ for notification. So now we track whether the CQ is armed at flush time and only call the completion handlers if their CQs were armed. Also, if the RCQ and SCQ are the same CQ, the completion handler is getting called twice. It should only be called once after all SQ and RQ WRs are flushed from the QP. So rearrange the logic to fix this. Signed-off-by: Steve Wise Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb4/ev.c | 1 + drivers/infiniband/hw/cxgb4/qp.c | 37 +++++++++++++++++++++----------- drivers/infiniband/hw/cxgb4/t4.h | 11 ++++++++++ 3 files changed, 37 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/ev.c b/drivers/infiniband/hw/cxgb4/ev.c index d61d0a18f784..a98426fed9ee 100644 --- a/drivers/infiniband/hw/cxgb4/ev.c +++ b/drivers/infiniband/hw/cxgb4/ev.c @@ -182,6 +182,7 @@ int c4iw_ev_handler(struct c4iw_dev *dev, u32 qid) chp = get_chp(dev, qid); if (chp) { + t4_clear_cq_armed(&chp->cq); spin_lock_irqsave(&chp->comp_handler_lock, flag); (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context); spin_unlock_irqrestore(&chp->comp_handler_lock, flag); diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index 086f62f5dc9e..60cfc11a66e4 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -1066,7 +1066,7 @@ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, struct c4iw_cq *schp) { int count; - int flushed; + int rq_flushed, sq_flushed; unsigned long flag; PDBG("%s qhp %p rchp %p schp %p\n", __func__, qhp, rchp, schp); @@ -1084,27 +1084,40 @@ static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp, c4iw_flush_hw_cq(rchp); c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count); - flushed = c4iw_flush_rq(&qhp->wq, &rchp->cq, count); + rq_flushed = c4iw_flush_rq(&qhp->wq, &rchp->cq, count); spin_unlock(&qhp->lock); spin_unlock_irqrestore(&rchp->lock, flag); - if (flushed) { - spin_lock_irqsave(&rchp->comp_handler_lock, flag); - (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context); - spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); - } /* locking hierarchy: cq lock first, then qp lock. */ spin_lock_irqsave(&schp->lock, flag); spin_lock(&qhp->lock); if (schp != rchp) c4iw_flush_hw_cq(schp); - flushed = c4iw_flush_sq(qhp); + sq_flushed = c4iw_flush_sq(qhp); spin_unlock(&qhp->lock); spin_unlock_irqrestore(&schp->lock, flag); - if (flushed) { - spin_lock_irqsave(&schp->comp_handler_lock, flag); - (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context); - spin_unlock_irqrestore(&schp->comp_handler_lock, flag); + + if (schp == rchp) { + if (t4_clear_cq_armed(&rchp->cq) && + (rq_flushed || sq_flushed)) { + spin_lock_irqsave(&rchp->comp_handler_lock, flag); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, + rchp->ibcq.cq_context); + spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); + } + } else { + if (t4_clear_cq_armed(&rchp->cq) && rq_flushed) { + spin_lock_irqsave(&rchp->comp_handler_lock, flag); + (*rchp->ibcq.comp_handler)(&rchp->ibcq, + rchp->ibcq.cq_context); + spin_unlock_irqrestore(&rchp->comp_handler_lock, flag); + } + if (t4_clear_cq_armed(&schp->cq) && sq_flushed) { + spin_lock_irqsave(&schp->comp_handler_lock, flag); + (*schp->ibcq.comp_handler)(&schp->ibcq, + schp->ibcq.cq_context); + spin_unlock_irqrestore(&schp->comp_handler_lock, flag); + } } } diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h index 68b0a6bf4eb0..d8d7fa3e446d 100644 --- a/drivers/infiniband/hw/cxgb4/t4.h +++ b/drivers/infiniband/hw/cxgb4/t4.h @@ -531,6 +531,10 @@ static inline int t4_wq_db_enabled(struct t4_wq *wq) return !wq->rq.queue[wq->rq.size].status.db_off; } +enum t4_cq_flags { + CQ_ARMED = 1, +}; + struct t4_cq { struct t4_cqe *queue; dma_addr_t dma_addr; @@ -551,12 +555,19 @@ struct t4_cq { u16 cidx_inc; u8 gen; u8 error; + unsigned long flags; }; +static inline int t4_clear_cq_armed(struct t4_cq *cq) +{ + return test_and_clear_bit(CQ_ARMED, &cq->flags); +} + static inline int t4_arm_cq(struct t4_cq *cq, int se) { u32 val; + set_bit(CQ_ARMED, &cq->flags); while (cq->cidx_inc > CIDXINC_MASK) { val = SEINTARM(0) | CIDXINC(CIDXINC_MASK) | TIMERREG(7) | INGRESSQID(cq->cqid); From f50f31e42fb772b62e209cf6ff5254b1bc02b756 Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Tue, 10 Jun 2014 19:32:12 +0530 Subject: [PATCH 02/52] RDMA/ocrdma: Avoid posting DPP requests for RDMA READ Rollback the patch which allows DPP posting of RDMA READ since this path is not yet stable in FW. Signed-off-by: Devesh Sharma Signed-off-by: Selvin Xavier Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 3bbf2010a821..bce4adf342ce 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -2206,7 +2206,8 @@ int ocrdma_mbx_create_qp(struct ocrdma_qp *qp, struct ib_qp_init_attr *attrs, OCRDMA_CREATE_QP_REQ_RQ_CQID_MASK; qp->rq_cq = cq; - if (pd->dpp_enabled && pd->num_dpp_qp) { + if (pd->dpp_enabled && attrs->cap.max_inline_data && pd->num_dpp_qp && + (attrs->cap.max_inline_data <= dev->attr.max_inline_data)) { ocrdma_set_create_qp_dpp_cmd(cmd, pd, qp, enable_dpp_cq, dpp_cq_id); } From 31dbdd9af58c63c7f8376a0fa680f5fc1b6cce98 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 10 Jun 2014 19:32:13 +0530 Subject: [PATCH 03/52] RDMA/ocrdma: Query and initalize the PFC SL This patch implements routine to query the PFC priority from the adapter port. Following are the changes implemented: * A new FW command is implemented to query the operational/admin DCBX configuration from the FW and obtain active priority(service level). * Adds support for the async event reported by FW when the PFC priority changes. Service level is re-initialized during modify_qp or create_ah, based on this event. * Maintain SL value in ocrdma_dev structure and refer that as and when needed. Signed-off-by: Devesh Sharma Signed-off-by: Selvin Xavier Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma.h | 21 +++ drivers/infiniband/hw/ocrdma/ocrdma_ah.c | 2 + drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 172 +++++++++++++++++++++ drivers/infiniband/hw/ocrdma/ocrdma_hw.h | 2 + drivers/infiniband/hw/ocrdma/ocrdma_main.c | 1 + drivers/infiniband/hw/ocrdma/ocrdma_sli.h | 81 +++++++++- 6 files changed, 278 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h index 19011dbb930f..5cd65c244191 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -236,6 +236,9 @@ struct ocrdma_dev { struct rcu_head rcu; int id; u64 stag_arr[OCRDMA_MAX_STAG]; + u8 sl; /* service level */ + bool pfc_state; + atomic_t update_sl; u16 pvid; u32 asic_id; @@ -518,4 +521,22 @@ static inline u8 ocrdma_get_asic_type(struct ocrdma_dev *dev) OCRDMA_SLI_ASIC_GEN_NUM_SHIFT; } +static inline u8 ocrdma_get_pfc_prio(u8 *pfc, u8 prio) +{ + return *(pfc + prio); +} + +static inline u8 ocrdma_get_app_prio(u8 *app_prio, u8 prio) +{ + return *(app_prio + prio); +} + +static inline u8 ocrdma_is_enabled_and_synced(u32 state) +{ /* May also be used to interpret TC-state, QCN-state + * Appl-state and Logical-link-state in future. + */ + return (state & OCRDMA_STATE_FLAG_ENABLED) && + (state & OCRDMA_STATE_FLAG_SYNC); +} + #endif diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index d4cc01f10c01..a023234d4b92 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -100,6 +100,8 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) if (!(attr->ah_flags & IB_AH_GRH)) return ERR_PTR(-EINVAL); + if (atomic_cmpxchg(&dev->update_sl, 1, 0)) + ocrdma_init_service_level(dev); ah = kzalloc(sizeof(*ah), GFP_ATOMIC); if (!ah) return ERR_PTR(-ENOMEM); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index bce4adf342ce..e6463cb0f988 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -771,6 +771,10 @@ static void ocrdma_process_grp5_aync(struct ocrdma_dev *dev, OCRDMA_AE_PVID_MCQE_TAG_MASK) >> OCRDMA_AE_PVID_MCQE_TAG_SHIFT); break; + + case OCRDMA_ASYNC_EVENT_COS_VALUE: + atomic_set(&dev->update_sl, 1); + break; default: /* Not interested evts. */ break; @@ -2265,6 +2269,8 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, if ((ah_attr->ah_flags & IB_AH_GRH) == 0) return -EINVAL; + if (atomic_cmpxchg(&qp->dev->update_sl, 1, 0)) + ocrdma_init_service_level(qp->dev); cmd->params.tclass_sq_psn |= (ah_attr->grh.traffic_class << OCRDMA_QP_PARAMS_TCLASS_SHIFT); cmd->params.rnt_rc_sl_fl |= @@ -2298,6 +2304,10 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, cmd->params.vlan_dmac_b4_to_b5 |= vlan_id << OCRDMA_QP_PARAMS_VLAN_SHIFT; cmd->flags |= OCRDMA_QP_PARA_VLAN_EN_VALID; + /* override the sl with default priority if 0 */ + cmd->params.rnt_rc_sl_fl |= + (ah_attr->sl ? ah_attr->sl : + qp->dev->sl) << OCRDMA_QP_PARAMS_SL_SHIFT; } return 0; } @@ -2605,6 +2615,168 @@ int ocrdma_mbx_destroy_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq) return status; } +static int ocrdma_mbx_get_dcbx_config(struct ocrdma_dev *dev, u32 ptype, + struct ocrdma_dcbx_cfg *dcbxcfg) +{ + int status = 0; + dma_addr_t pa; + struct ocrdma_mqe cmd; + + struct ocrdma_get_dcbx_cfg_req *req = NULL; + struct ocrdma_get_dcbx_cfg_rsp *rsp = NULL; + struct pci_dev *pdev = dev->nic_info.pdev; + struct ocrdma_mqe_sge *mqe_sge = cmd.u.nonemb_req.sge; + + memset(&cmd, 0, sizeof(struct ocrdma_mqe)); + cmd.hdr.pyld_len = max_t (u32, sizeof(struct ocrdma_get_dcbx_cfg_rsp), + sizeof(struct ocrdma_get_dcbx_cfg_req)); + req = dma_alloc_coherent(&pdev->dev, cmd.hdr.pyld_len, &pa, GFP_KERNEL); + if (!req) { + status = -ENOMEM; + goto mem_err; + } + + cmd.hdr.spcl_sge_cnt_emb |= (1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) & + OCRDMA_MQE_HDR_SGE_CNT_MASK; + mqe_sge->pa_lo = (u32) (pa & 0xFFFFFFFFUL); + mqe_sge->pa_hi = (u32) upper_32_bits(pa); + mqe_sge->len = cmd.hdr.pyld_len; + + memset(req, 0, sizeof(struct ocrdma_get_dcbx_cfg_req)); + ocrdma_init_mch(&req->hdr, OCRDMA_CMD_GET_DCBX_CONFIG, + OCRDMA_SUBSYS_DCBX, cmd.hdr.pyld_len); + req->param_type = ptype; + + status = ocrdma_mbx_cmd(dev, &cmd); + if (status) + goto mbx_err; + + rsp = (struct ocrdma_get_dcbx_cfg_rsp *)req; + ocrdma_le32_to_cpu(rsp, sizeof(struct ocrdma_get_dcbx_cfg_rsp)); + memcpy(dcbxcfg, &rsp->cfg, sizeof(struct ocrdma_dcbx_cfg)); + +mbx_err: + dma_free_coherent(&pdev->dev, cmd.hdr.pyld_len, req, pa); +mem_err: + return status; +} + +#define OCRDMA_MAX_SERVICE_LEVEL_INDEX 0x08 +#define OCRDMA_DEFAULT_SERVICE_LEVEL 0x05 + +static int ocrdma_parse_dcbxcfg_rsp(struct ocrdma_dev *dev, int ptype, + struct ocrdma_dcbx_cfg *dcbxcfg, + u8 *srvc_lvl) +{ + int status = -EINVAL, indx, slindx; + int ventry_cnt; + struct ocrdma_app_parameter *app_param; + u8 valid, proto_sel; + u8 app_prio, pfc_prio; + u16 proto; + + if (!(dcbxcfg->tcv_aev_opv_st & OCRDMA_DCBX_STATE_MASK)) { + pr_info("%s ocrdma%d DCBX is disabled\n", + dev_name(&dev->nic_info.pdev->dev), dev->id); + goto out; + } + + if (!ocrdma_is_enabled_and_synced(dcbxcfg->pfc_state)) { + pr_info("%s ocrdma%d priority flow control(%s) is %s%s\n", + dev_name(&dev->nic_info.pdev->dev), dev->id, + (ptype > 0 ? "operational" : "admin"), + (dcbxcfg->pfc_state & OCRDMA_STATE_FLAG_ENABLED) ? + "enabled" : "disabled", + (dcbxcfg->pfc_state & OCRDMA_STATE_FLAG_SYNC) ? + "" : ", not sync'ed"); + goto out; + } else { + pr_info("%s ocrdma%d priority flow control is enabled and sync'ed\n", + dev_name(&dev->nic_info.pdev->dev), dev->id); + } + + ventry_cnt = (dcbxcfg->tcv_aev_opv_st >> + OCRDMA_DCBX_APP_ENTRY_SHIFT) + & OCRDMA_DCBX_STATE_MASK; + + for (indx = 0; indx < ventry_cnt; indx++) { + app_param = &dcbxcfg->app_param[indx]; + valid = (app_param->valid_proto_app >> + OCRDMA_APP_PARAM_VALID_SHIFT) + & OCRDMA_APP_PARAM_VALID_MASK; + proto_sel = (app_param->valid_proto_app + >> OCRDMA_APP_PARAM_PROTO_SEL_SHIFT) + & OCRDMA_APP_PARAM_PROTO_SEL_MASK; + proto = app_param->valid_proto_app & + OCRDMA_APP_PARAM_APP_PROTO_MASK; + + if ( + valid && proto == OCRDMA_APP_PROTO_ROCE && + proto_sel == OCRDMA_PROTO_SELECT_L2) { + for (slindx = 0; slindx < + OCRDMA_MAX_SERVICE_LEVEL_INDEX; slindx++) { + app_prio = ocrdma_get_app_prio( + (u8 *)app_param->app_prio, + slindx); + pfc_prio = ocrdma_get_pfc_prio( + (u8 *)dcbxcfg->pfc_prio, + slindx); + + if (app_prio && pfc_prio) { + *srvc_lvl = slindx; + status = 0; + goto out; + } + } + if (slindx == OCRDMA_MAX_SERVICE_LEVEL_INDEX) { + pr_info("%s ocrdma%d application priority not set for 0x%x protocol\n", + dev_name(&dev->nic_info.pdev->dev), + dev->id, proto); + } + } + } + +out: + return status; +} + +void ocrdma_init_service_level(struct ocrdma_dev *dev) +{ + int status = 0, indx; + struct ocrdma_dcbx_cfg dcbxcfg; + u8 srvc_lvl = OCRDMA_DEFAULT_SERVICE_LEVEL; + int ptype = OCRDMA_PARAMETER_TYPE_OPER; + + for (indx = 0; indx < 2; indx++) { + status = ocrdma_mbx_get_dcbx_config(dev, ptype, &dcbxcfg); + if (status) { + pr_err("%s(): status=%d\n", __func__, status); + ptype = OCRDMA_PARAMETER_TYPE_ADMIN; + continue; + } + + status = ocrdma_parse_dcbxcfg_rsp(dev, ptype, + &dcbxcfg, &srvc_lvl); + if (status) { + ptype = OCRDMA_PARAMETER_TYPE_ADMIN; + continue; + } + + break; + } + + if (status) + pr_info("%s ocrdma%d service level default\n", + dev_name(&dev->nic_info.pdev->dev), dev->id); + else + pr_info("%s ocrdma%d service level %d\n", + dev_name(&dev->nic_info.pdev->dev), dev->id, + srvc_lvl); + + dev->pfc_state = ocrdma_is_enabled_and_synced(dcbxcfg.pfc_state); + dev->sl = srvc_lvl; +} + int ocrdma_alloc_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah) { int i; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h index e513f7293142..6eed8f191322 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h @@ -135,4 +135,6 @@ int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq); int ocrdma_mbx_rdma_stats(struct ocrdma_dev *, bool reset); char *port_speed_string(struct ocrdma_dev *dev); +void ocrdma_init_service_level(struct ocrdma_dev *); + #endif /* __OCRDMA_HW_H__ */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 7c504e079744..9368d52888bb 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -399,6 +399,7 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) if (status) goto alloc_err; + ocrdma_init_service_level(dev); status = ocrdma_register_device(dev); if (status) goto alloc_err; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h index 96c9ee602ba4..4defae8f63df 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h @@ -422,7 +422,12 @@ struct ocrdma_ae_qp_mcqe { #define OCRDMA_ASYNC_RDMA_EVE_CODE 0x14 #define OCRDMA_ASYNC_GRP5_EVE_CODE 0x5 -#define OCRDMA_ASYNC_EVENT_PVID_STATE 0x3 + +enum ocrdma_async_grp5_events { + OCRDMA_ASYNC_EVENT_QOS_VALUE = 0x01, + OCRDMA_ASYNC_EVENT_COS_VALUE = 0x02, + OCRDMA_ASYNC_EVENT_PVID_STATE = 0x03 +}; enum OCRDMA_ASYNC_EVENT_TYPE { OCRDMA_CQ_ERROR = 0x00, @@ -1949,5 +1954,79 @@ struct ocrdma_get_ctrl_attribs_rsp { struct mgmt_controller_attrib ctrl_attribs; }; +#define OCRDMA_SUBSYS_DCBX 0x10 + +enum OCRDMA_DCBX_OPCODE { + OCRDMA_CMD_GET_DCBX_CONFIG = 0x01 +}; + +enum OCRDMA_DCBX_PARAM_TYPE { + OCRDMA_PARAMETER_TYPE_ADMIN = 0x00, + OCRDMA_PARAMETER_TYPE_OPER = 0x01, + OCRDMA_PARAMETER_TYPE_PEER = 0x02 +}; + +enum OCRDMA_DCBX_APP_PROTO { + OCRDMA_APP_PROTO_ROCE = 0x8915 +}; + +enum OCRDMA_DCBX_PROTO { + OCRDMA_PROTO_SELECT_L2 = 0x00, + OCRDMA_PROTO_SELECT_L4 = 0x01 +}; + +enum OCRDMA_DCBX_APP_PARAM { + OCRDMA_APP_PARAM_APP_PROTO_MASK = 0xFFFF, + OCRDMA_APP_PARAM_PROTO_SEL_MASK = 0xFF, + OCRDMA_APP_PARAM_PROTO_SEL_SHIFT = 0x10, + OCRDMA_APP_PARAM_VALID_MASK = 0xFF, + OCRDMA_APP_PARAM_VALID_SHIFT = 0x18 +}; + +enum OCRDMA_DCBX_STATE_FLAGS { + OCRDMA_STATE_FLAG_ENABLED = 0x01, + OCRDMA_STATE_FLAG_ADDVERTISED = 0x02, + OCRDMA_STATE_FLAG_WILLING = 0x04, + OCRDMA_STATE_FLAG_SYNC = 0x08, + OCRDMA_STATE_FLAG_UNSUPPORTED = 0x40000000, + OCRDMA_STATE_FLAG_NEG_FAILD = 0x80000000 +}; + +enum OCRDMA_TCV_AEV_OPV_ST { + OCRDMA_DCBX_TC_SUPPORT_MASK = 0xFF, + OCRDMA_DCBX_TC_SUPPORT_SHIFT = 0x18, + OCRDMA_DCBX_APP_ENTRY_SHIFT = 0x10, + OCRDMA_DCBX_OP_PARAM_SHIFT = 0x08, + OCRDMA_DCBX_STATE_MASK = 0xFF +}; + +struct ocrdma_app_parameter { + u32 valid_proto_app; + u32 oui; + u32 app_prio[2]; +}; + +struct ocrdma_dcbx_cfg { + u32 tcv_aev_opv_st; + u32 tc_state; + u32 pfc_state; + u32 qcn_state; + u32 appl_state; + u32 ll_state; + u32 tc_bw[2]; + u32 tc_prio[8]; + u32 pfc_prio[2]; + struct ocrdma_app_parameter app_param[15]; +}; + +struct ocrdma_get_dcbx_cfg_req { + struct ocrdma_mbx_hdr hdr; + u32 param_type; +} __packed; + +struct ocrdma_get_dcbx_cfg_rsp { + struct ocrdma_mbx_rsp hdr; + struct ocrdma_dcbx_cfg cfg; +} __packed; #endif /* __OCRDMA_SLI_H__ */ From 4808b184fd54d64995046b83864809536a058e7a Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 10 Jun 2014 19:32:14 +0530 Subject: [PATCH 04/52] RDMA/ocrdma: Add hca_type and fixing fw_version string in device atrributes Add a new entry under sysfs for getting the HW type. Add a new-line character for the FW version string Signed-off-by: Selvin Xavier Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 9368d52888bb..227a5422db0b 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -353,15 +353,25 @@ static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, { struct ocrdma_dev *dev = dev_get_drvdata(device); - return scnprintf(buf, PAGE_SIZE, "%s", &dev->attr.fw_ver[0]); + return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->attr.fw_ver[0]); +} + +static ssize_t show_hca_type(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ocrdma_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->model_number[0]); } static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL); static struct device_attribute *ocrdma_attributes[] = { &dev_attr_hw_rev, - &dev_attr_fw_ver + &dev_attr_fw_ver, + &dev_attr_hca_type }; static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev) From d114f99a29b7f9c7a5e001ca1ef9c93db70d8668 Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Tue, 10 Jun 2014 19:32:15 +0530 Subject: [PATCH 05/52] be2net: Issue shutdown event to ocrdma driver In the shutdown path, when be2net calls pci_disable_msix(), it complains (BUG_ON) that irqs requested by ocrdma driver are still in use. This patch fixes this problem by issuing shutdown event to ocrdma from be2net shutdown path. As part of shutdown event processing, ocrdma driver will free up all the resources and free irqs. Once this completes be2net completes pci_disable_msix successfully. Signed-off-by: Devesh Sharma Signed-off-by: Selvin Xavier Signed-off-by: Roland Dreier --- drivers/net/ethernet/emulex/benet/be.h | 1 + drivers/net/ethernet/emulex/benet/be_main.c | 1 + drivers/net/ethernet/emulex/benet/be_roce.c | 18 ++++++++++++++++-- drivers/net/ethernet/emulex/benet/be_roce.h | 3 ++- 4 files changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/emulex/benet/be.h b/drivers/net/ethernet/emulex/benet/be.h index c2f5d2d3b932..56d4d100e024 100644 --- a/drivers/net/ethernet/emulex/benet/be.h +++ b/drivers/net/ethernet/emulex/benet/be.h @@ -890,5 +890,6 @@ void be_roce_dev_remove(struct be_adapter *); */ void be_roce_dev_open(struct be_adapter *); void be_roce_dev_close(struct be_adapter *); +void be_roce_dev_shutdown(struct be_adapter *); #endif /* BE_H */ diff --git a/drivers/net/ethernet/emulex/benet/be_main.c b/drivers/net/ethernet/emulex/benet/be_main.c index 1e187fb760f8..36ce69ae9a2f 100644 --- a/drivers/net/ethernet/emulex/benet/be_main.c +++ b/drivers/net/ethernet/emulex/benet/be_main.c @@ -4958,6 +4958,7 @@ static void be_shutdown(struct pci_dev *pdev) if (!adapter) return; + be_roce_dev_shutdown(adapter); cancel_delayed_work_sync(&adapter->work); cancel_delayed_work_sync(&adapter->func_recovery_work); diff --git a/drivers/net/ethernet/emulex/benet/be_roce.c b/drivers/net/ethernet/emulex/benet/be_roce.c index 5bf16603a3e9..ef4672dc7357 100644 --- a/drivers/net/ethernet/emulex/benet/be_roce.c +++ b/drivers/net/ethernet/emulex/benet/be_roce.c @@ -120,7 +120,8 @@ static void _be_roce_dev_open(struct be_adapter *adapter) { if (ocrdma_drv && adapter->ocrdma_dev && ocrdma_drv->state_change_handler) - ocrdma_drv->state_change_handler(adapter->ocrdma_dev, 0); + ocrdma_drv->state_change_handler(adapter->ocrdma_dev, + BE_DEV_UP); } void be_roce_dev_open(struct be_adapter *adapter) @@ -136,7 +137,8 @@ static void _be_roce_dev_close(struct be_adapter *adapter) { if (ocrdma_drv && adapter->ocrdma_dev && ocrdma_drv->state_change_handler) - ocrdma_drv->state_change_handler(adapter->ocrdma_dev, 1); + ocrdma_drv->state_change_handler(adapter->ocrdma_dev, + BE_DEV_DOWN); } void be_roce_dev_close(struct be_adapter *adapter) @@ -148,6 +150,18 @@ void be_roce_dev_close(struct be_adapter *adapter) } } +void be_roce_dev_shutdown(struct be_adapter *adapter) +{ + if (be_roce_supported(adapter)) { + mutex_lock(&be_adapter_list_lock); + if (ocrdma_drv && adapter->ocrdma_dev && + ocrdma_drv->state_change_handler) + ocrdma_drv->state_change_handler(adapter->ocrdma_dev, + BE_DEV_SHUTDOWN); + mutex_unlock(&be_adapter_list_lock); + } +} + int be_roce_register_driver(struct ocrdma_driver *drv) { struct be_adapter *dev; diff --git a/drivers/net/ethernet/emulex/benet/be_roce.h b/drivers/net/ethernet/emulex/benet/be_roce.h index a3d9e96c18eb..e6f7eb1a7d87 100644 --- a/drivers/net/ethernet/emulex/benet/be_roce.h +++ b/drivers/net/ethernet/emulex/benet/be_roce.h @@ -62,7 +62,8 @@ struct ocrdma_driver { enum { BE_DEV_UP = 0, - BE_DEV_DOWN = 1 + BE_DEV_DOWN = 1, + BE_DEV_SHUTDOWN = 2 }; /* APIs for RoCE driver to register callback handlers, From efe4593720829066667397f540d50baffc706435 Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Tue, 10 Jun 2014 19:32:16 +0530 Subject: [PATCH 06/52] RDMA/ocrdma: Handle shutdown event from be2net driver be2net driver sends a shutdown event to ocrdma during shutdown/reboot. As part of event processing, ocrdma calls close() and remove() to free all the resources associated with ocrdma. This also frees irqs used by ocrdma. Signed-off-by: Devesh Sharma Signed-off-by: Selvin Xavier Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 227a5422db0b..3cb20c6b91ce 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -519,6 +519,12 @@ static int ocrdma_close(struct ocrdma_dev *dev) return 0; } +static void ocrdma_shutdown(struct ocrdma_dev *dev) +{ + ocrdma_close(dev); + ocrdma_remove(dev); +} + /* event handling via NIC driver ensures that all the NIC specific * initialization done before RoCE driver notifies * event to stack. @@ -532,6 +538,9 @@ static void ocrdma_event_handler(struct ocrdma_dev *dev, u32 event) case BE_DEV_DOWN: ocrdma_close(dev); break; + case BE_DEV_SHUTDOWN: + ocrdma_shutdown(dev); + break; } } From a53d77a33494a3d1d5c797df311c975a05d69ef3 Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Tue, 10 Jun 2014 19:32:17 +0530 Subject: [PATCH 07/52] RDMA/ocrdma: Remove hardcoding of the max DPP QPs supported Removing hardcoded value of max dpp qps and calculate the same from doorbell page size and WQE size. Signed-off-by: Devesh Sharma Signed-off-by: Selvin Xavier Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_sli.h | 1 - drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h index 4defae8f63df..14a84b269cc0 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h @@ -1236,7 +1236,6 @@ struct ocrdma_destroy_srq { enum { OCRDMA_ALLOC_PD_ENABLE_DPP = BIT(16), - OCRDMA_PD_MAX_DPP_ENABLED_QP = 8, OCRDMA_DPP_PAGE_SIZE = 4096 }; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index edf6211d84b8..0d7d8083287f 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -268,7 +268,8 @@ static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev, pd->dpp_enabled = ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R; pd->num_dpp_qp = - pd->dpp_enabled ? OCRDMA_PD_MAX_DPP_ENABLED_QP : 0; + pd->dpp_enabled ? (dev->nic_info.db_page_size / + dev->attr.wqe_size) : 0; } retry: From daac96815e969bd70ed5ad21231be2fc5d99506d Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Tue, 10 Jun 2014 19:32:18 +0530 Subject: [PATCH 08/52] RDMA/ocrdma: Delete AH table if ocrdma_init_hw fails after AH table creation Cleanup the AH table in error path, if HW initialization fails after AH table creation. Signed-off-by: Devesh Sharma Signed-off-by: Selvin Xavier Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index e6463cb0f988..55308b667649 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -1505,6 +1505,7 @@ static void ocrdma_mbx_delete_ah_tbl(struct ocrdma_dev *dev) ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); dma_free_coherent(&pdev->dev, dev->av_tbl.size, dev->av_tbl.va, dev->av_tbl.pa); + dev->av_tbl.va = NULL; dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->av_tbl.pbl.va, dev->av_tbl.pbl.pa); kfree(cmd); @@ -2882,13 +2883,15 @@ int ocrdma_init_hw(struct ocrdma_dev *dev) goto conf_err; status = ocrdma_mbx_get_phy_info(dev); if (status) - goto conf_err; + goto info_attrb_err; status = ocrdma_mbx_get_ctrl_attribs(dev); if (status) - goto conf_err; + goto info_attrb_err; return 0; +info_attrb_err: + ocrdma_mbx_delete_ah_tbl(dev); conf_err: ocrdma_destroy_mq(dev); mq_err: From a96ffb1de9d656ce7083277a8badaa1082813498 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 10 Jun 2014 19:32:19 +0530 Subject: [PATCH 09/52] RDMA/ocrdma: Avoid reporting wrong completions in case of error CQEs During cable pull test with a mount over NFS/RDMA, the driver was reporting error completions when there were no pending requests in the SQ and RQ. This was triggering a host crash because of reporting wrong work req id. Avoid this crash by adding a check for SQ and RQ empty condition and prevent reporting completions if queues are empty. Signed-off-by: Selvin Xavier Signed-off-by: Devesh Sharma Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 0d7d8083287f..2b68235973d7 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -2489,6 +2489,11 @@ static bool ocrdma_poll_err_scqe(struct ocrdma_qp *qp, *stop = true; expand = false; } + } else if (is_hw_sq_empty(qp)) { + /* Do nothing */ + expand = false; + *polled = false; + *stop = false; } else { *polled = true; expand = ocrdma_update_err_scqe(ibwc, cqe, qp, status); @@ -2594,6 +2599,11 @@ static bool ocrdma_poll_err_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe, *stop = true; expand = false; } + } else if (is_hw_rq_empty(qp)) { + /* Do nothing */ + expand = false; + *polled = false; + *stop = false; } else { *polled = true; expand = ocrdma_update_err_rcqe(ibwc, cqe, qp, status); From f252b5dc36e26368c7161f32ef304c30cd2d1f6c Mon Sep 17 00:00:00 2001 From: Mitesh Ahuja Date: Tue, 10 Jun 2014 19:32:20 +0530 Subject: [PATCH 10/52] RDMA/ocrdma: Allow only SEND opcode in case of UD QPs Prevent posting opcodes other than send and send immediate on the UD QPs. Signed-off-by: Mitesh Ahuja Signed-off-by: Devesh Sharma Signed-off-by: Selvin Xavier Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 2b68235973d7..7f54d2478738 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -2055,6 +2055,13 @@ int ocrdma_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, } while (wr) { + if (qp->qp_type == IB_QPT_UD && + (wr->opcode != IB_WR_SEND && + wr->opcode != IB_WR_SEND_WITH_IMM)) { + *bad_wr = wr; + status = -EINVAL; + break; + } if (ocrdma_hwq_free_cnt(&qp->sq) == 0 || wr->num_sge > qp->sq.max_sges) { *bad_wr = wr; From 6dab02648c4c8bb58b35efccf29291d7970aeb68 Mon Sep 17 00:00:00 2001 From: Mitesh Ahuja Date: Tue, 10 Jun 2014 19:32:21 +0530 Subject: [PATCH 11/52] RDMA/ocrdma: Do proper cleanup even if FW is in error state If any mailbox command reports timeout, save the state in the driver, to prevent issuing any more commands to the HW. Do proper cleanup even if FW is in error state. Signed-off-by: Mitesh Ahuja Signed-off-by: Selvin Xavier Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma.h | 1 + drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 8 +++++++- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 12 +++++++++++- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h index 5cd65c244191..fc273782986e 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -137,6 +137,7 @@ struct mqe_ctx { u16 cqe_status; u16 ext_status; bool cmd_done; + bool fw_error_state; }; struct ocrdma_hw_mr { diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 55308b667649..5b6e9d9c779e 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -966,8 +966,12 @@ static int ocrdma_wait_mqe_cmpl(struct ocrdma_dev *dev) msecs_to_jiffies(30000)); if (status) return 0; - else + else { + dev->mqe_ctx.fw_error_state = true; + pr_err("%s(%d) mailbox timeout: fw not responding\n", + __func__, dev->id); return -1; + } } /* issue a mailbox command on the MQ */ @@ -979,6 +983,8 @@ static int ocrdma_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe) struct ocrdma_mbx_rsp *rsp = NULL; mutex_lock(&dev->mqe_ctx.lock); + if (dev->mqe_ctx.fw_error_state) + goto mbx_err; ocrdma_post_mqe(dev, mqe); status = ocrdma_wait_mqe_cmpl(dev); if (status) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 7f54d2478738..8cd16a182475 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -329,7 +329,10 @@ static int ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx) struct ocrdma_pd *pd = uctx->cntxt_pd; struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device); - BUG_ON(uctx->pd_in_use); + if (uctx->pd_in_use) { + pr_err("%s(%d) Freeing in use pdid=0x%x.\n", + __func__, dev->id, pd->id); + } uctx->cntxt_pd = NULL; status = _ocrdma_dealloc_pd(dev, pd); return status; @@ -844,6 +847,13 @@ int ocrdma_dereg_mr(struct ib_mr *ib_mr) if (mr->umem) ib_umem_release(mr->umem); kfree(mr); + + /* Don't stop cleanup, in case FW is unresponsive */ + if (dev->mqe_ctx.fw_error_state) { + status = 0; + pr_err("%s(%d) fw not responding.\n", + __func__, dev->id); + } return status; } From 033edd4dff23708c35120d9297fc898ef284bc7d Mon Sep 17 00:00:00 2001 From: Mitesh Ahuja Date: Tue, 10 Jun 2014 19:32:22 +0530 Subject: [PATCH 12/52] RDMA/ocrdma: Return proper value for max_mr_size Update the max_mr_size with proper value. Corrected the response structure of query config mailbox command. Signed-off-by: Mitesh Ahuja Signed-off-by: Selvin Xavier Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 3 ++- drivers/infiniband/hw/ocrdma/ocrdma_sli.h | 2 +- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 5b6e9d9c779e..105659d7ef79 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -1088,7 +1088,8 @@ static void ocrdma_get_attr(struct ocrdma_dev *dev, OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT; attr->max_mw = rsp->max_mw; attr->max_mr = rsp->max_mr; - attr->max_mr_size = ~0ull; + attr->max_mr_size = ((u64)rsp->max_mr_size_hi << 32) | + rsp->max_mr_size_lo; attr->max_fmr = 0; attr->max_pages_per_frmr = rsp->max_pages_per_frmr; attr->max_num_mr_pbl = rsp->max_num_mr_pbl; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h index 14a84b269cc0..3cb88f040ca1 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h @@ -530,8 +530,8 @@ struct ocrdma_mbx_query_config { u32 max_ird_ord_per_qp; u32 max_shared_ird_ord; u32 max_mr; - u32 max_mr_size_lo; u32 max_mr_size_hi; + u32 max_mr_size_lo; u32 max_num_mr_pbl; u32 max_mw; u32 max_fmr; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 8cd16a182475..90152dec6447 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -69,7 +69,7 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr) memcpy(&attr->fw_ver, &dev->attr.fw_ver[0], min(sizeof(dev->attr.fw_ver), sizeof(attr->fw_ver))); ocrdma_get_guid(dev, (u8 *)&attr->sys_image_guid); - attr->max_mr_size = ~0ull; + attr->max_mr_size = dev->attr.max_mr_size; attr->page_size_cap = 0xffff000; attr->vendor_id = dev->nic_info.pdev->vendor; attr->vendor_part_id = dev->nic_info.pdev->device; From 920de55d40df30131c2b32850a5417d81b0efc7e Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 10 Jun 2014 19:32:23 +0530 Subject: [PATCH 13/52] RDMA/ocrdma: Add missing adapter mailbox opcodes Fix the Statistics command opcode. Also specify the opcode of each command for better readablilty. Signed-off-by: Selvin Xavier Signed-off-by: Devesh Sharma Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_sli.h | 52 ++++++++++++----------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h index 3cb88f040ca1..a20d3485ed60 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h @@ -44,35 +44,39 @@ enum { #define OCRDMA_SUBSYS_ROCE 10 enum { OCRDMA_CMD_QUERY_CONFIG = 1, - OCRDMA_CMD_ALLOC_PD, - OCRDMA_CMD_DEALLOC_PD, + OCRDMA_CMD_ALLOC_PD = 2, + OCRDMA_CMD_DEALLOC_PD = 3, - OCRDMA_CMD_CREATE_AH_TBL, - OCRDMA_CMD_DELETE_AH_TBL, + OCRDMA_CMD_CREATE_AH_TBL = 4, + OCRDMA_CMD_DELETE_AH_TBL = 5, - OCRDMA_CMD_CREATE_QP, - OCRDMA_CMD_QUERY_QP, - OCRDMA_CMD_MODIFY_QP, - OCRDMA_CMD_DELETE_QP, + OCRDMA_CMD_CREATE_QP = 6, + OCRDMA_CMD_QUERY_QP = 7, + OCRDMA_CMD_MODIFY_QP = 8 , + OCRDMA_CMD_DELETE_QP = 9, - OCRDMA_CMD_RSVD1, - OCRDMA_CMD_ALLOC_LKEY, - OCRDMA_CMD_DEALLOC_LKEY, - OCRDMA_CMD_REGISTER_NSMR, - OCRDMA_CMD_REREGISTER_NSMR, - OCRDMA_CMD_REGISTER_NSMR_CONT, - OCRDMA_CMD_QUERY_NSMR, - OCRDMA_CMD_ALLOC_MW, - OCRDMA_CMD_QUERY_MW, + OCRDMA_CMD_RSVD1 = 10, + OCRDMA_CMD_ALLOC_LKEY = 11, + OCRDMA_CMD_DEALLOC_LKEY = 12, + OCRDMA_CMD_REGISTER_NSMR = 13, + OCRDMA_CMD_REREGISTER_NSMR = 14, + OCRDMA_CMD_REGISTER_NSMR_CONT = 15, + OCRDMA_CMD_QUERY_NSMR = 16, + OCRDMA_CMD_ALLOC_MW = 17, + OCRDMA_CMD_QUERY_MW = 18, - OCRDMA_CMD_CREATE_SRQ, - OCRDMA_CMD_QUERY_SRQ, - OCRDMA_CMD_MODIFY_SRQ, - OCRDMA_CMD_DELETE_SRQ, + OCRDMA_CMD_CREATE_SRQ = 19, + OCRDMA_CMD_QUERY_SRQ = 20, + OCRDMA_CMD_MODIFY_SRQ = 21, + OCRDMA_CMD_DELETE_SRQ = 22, - OCRDMA_CMD_ATTACH_MCAST, - OCRDMA_CMD_DETACH_MCAST, - OCRDMA_CMD_GET_RDMA_STATS, + OCRDMA_CMD_ATTACH_MCAST = 23, + OCRDMA_CMD_DETACH_MCAST = 24, + + OCRDMA_CMD_CREATE_RBQ = 25, + OCRDMA_CMD_DESTROY_RBQ = 26, + + OCRDMA_CMD_GET_RDMA_STATS = 27, OCRDMA_CMD_MAX }; From 4f1df8440d26bafe0be6ef4dbf17162a1263d3fc Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 10 Jun 2014 19:32:24 +0530 Subject: [PATCH 14/52] RDMA/ocrdma: Increase the size of STAG array in dev structure to 16K HW can support 16K STAG entries. Change this max limit. Also, move this array out of ocrdma_dev to reduce the size of this structure. Signed-off-by: Devesh Sharma Signed-off-by: Selvin Xavier Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma.h | 2 +- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 6 ++++++ drivers/infiniband/hw/ocrdma/ocrdma_sli.h | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h index fc273782986e..57165137a7a2 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -236,7 +236,7 @@ struct ocrdma_dev { struct list_head entry; struct rcu_head rcu; int id; - u64 stag_arr[OCRDMA_MAX_STAG]; + u64 *stag_arr; u8 sl; /* service level */ bool pfc_state; atomic_t update_sl; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 3cb20c6b91ce..0d90c7f75a6f 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -324,6 +324,11 @@ static int ocrdma_alloc_resources(struct ocrdma_dev *dev) if (!dev->qp_tbl) goto alloc_err; } + + dev->stag_arr = kzalloc(sizeof(u64) * OCRDMA_MAX_STAG, GFP_KERNEL); + if (dev->stag_arr == NULL) + goto alloc_err; + spin_lock_init(&dev->av_tbl.lock); spin_lock_init(&dev->flush_q_lock); return 0; @@ -334,6 +339,7 @@ alloc_err: static void ocrdma_free_resources(struct ocrdma_dev *dev) { + kfree(dev->stag_arr); kfree(dev->qp_tbl); kfree(dev->cq_tbl); kfree(dev->sgid_tbl); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h index a20d3485ed60..3d08e665be26 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h @@ -107,7 +107,7 @@ enum { #define OCRDMA_MAX_QP 2048 #define OCRDMA_MAX_CQ 2048 -#define OCRDMA_MAX_STAG 8192 +#define OCRDMA_MAX_STAG 16384 enum { OCRDMA_DB_RQ_OFFSET = 0xE0, From b8806324ea0b242bcf0cf5740f2ceb1e56381c36 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 10 Jun 2014 19:32:25 +0530 Subject: [PATCH 15/52] RDMA/ocrdma: Initialize the GID table while registering the device Current GID table gets updated only at the time of inet notification. Fix this by initializing the table at the time of device registration. Signed-off-by: Selvin Xavier Signed-off-by: Devesh Sharma Signed-off-by: Somnath Kotur Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 53 ++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 0d90c7f75a6f..256a06bc0b68 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -388,6 +388,58 @@ static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev) device_remove_file(&dev->ibdev.dev, ocrdma_attributes[i]); } +static void ocrdma_init_ipv4_gids(struct ocrdma_dev *dev, + struct net_device *net) +{ + struct in_device *in_dev; + union ib_gid gid; + in_dev = in_dev_get(net); + if (in_dev) { + for_ifa(in_dev) { + ipv6_addr_set_v4mapped(ifa->ifa_address, + (struct in6_addr *)&gid); + ocrdma_add_sgid(dev, &gid); + } + endfor_ifa(in_dev); + in_dev_put(in_dev); + } +} + +static void ocrdma_init_ipv6_gids(struct ocrdma_dev *dev, + struct net_device *net) +{ +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_dev *in6_dev; + union ib_gid *pgid; + struct inet6_ifaddr *ifp; + in6_dev = in6_dev_get(net); + if (in6_dev) { + read_lock_bh(&in6_dev->lock); + list_for_each_entry(ifp, &in6_dev->addr_list, if_list) { + pgid = (union ib_gid *)&ifp->addr; + ocrdma_add_sgid(dev, pgid); + } + read_unlock_bh(&in6_dev->lock); + in6_dev_put(in6_dev); + } +#endif +} + +static void ocrdma_init_gid_table(struct ocrdma_dev *dev) +{ + struct net_device *net_dev; + + for_each_netdev(&init_net, net_dev) { + struct net_device *real_dev = rdma_vlan_dev_real_dev(net_dev) ? + rdma_vlan_dev_real_dev(net_dev) : net_dev; + + if (real_dev == dev->nic_info.netdev) { + ocrdma_init_ipv4_gids(dev, net_dev); + ocrdma_init_ipv6_gids(dev, net_dev); + } + } +} + static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) { int status = 0, i; @@ -416,6 +468,7 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) goto alloc_err; ocrdma_init_service_level(dev); + ocrdma_init_gid_table(dev); status = ocrdma_register_device(dev); if (status) goto alloc_err; From 1b09a0c29034eaf68bad9f8821531ebd194af3f7 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 10 Jun 2014 19:32:26 +0530 Subject: [PATCH 16/52] RDMA/ocrdma: Fix a sparse warning Fix the warning about the usage of plain integer as NULL pointer. Signed-off-by: Selvin Xavier Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 105659d7ef79..a4d27c70528f 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -661,7 +661,7 @@ static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev, { struct ocrdma_qp *qp = NULL; struct ocrdma_cq *cq = NULL; - struct ib_event ib_evt = { 0 }; + struct ib_event ib_evt; int cq_event = 0; int qp_event = 1; int srq_event = 0; @@ -674,6 +674,8 @@ static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev, if (cqe->cqvalid_cqid & OCRDMA_AE_MCQE_CQVALID) cq = dev->cq_tbl[cqe->cqvalid_cqid & OCRDMA_AE_MCQE_CQID_MASK]; + memset(&ib_evt, 0, sizeof(ib_evt)); + ib_evt.device = &dev->ibdev; switch (type) { From 741742ed71c2cc12573ea6cf41ef1fbb3b2efe97 Mon Sep 17 00:00:00 2001 From: Selvin Xavier Date: Tue, 10 Jun 2014 19:32:27 +0530 Subject: [PATCH 17/52] RDMA/ocrdma: Update the ocrdma module version string Signed-off-by: Selvin Xavier Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h index 57165137a7a2..b43456ae124b 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -40,7 +40,7 @@ #include #include "ocrdma_sli.h" -#define OCRDMA_ROCE_DRV_VERSION "10.2.145.0u" +#define OCRDMA_ROCE_DRV_VERSION "10.2.287.0u" #define OCRDMA_ROCE_DRV_DESC "Emulex OneConnect RoCE Driver" #define OCRDMA_NODE_DESC "Emulex OneConnect RoCE HCA" From 96ed02d4be3c68527130decd3a7d18240da4bee5 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Thu, 31 Jul 2014 13:27:44 +0300 Subject: [PATCH 18/52] IB/iser: Support IPv6 address family Replace struct sockaddr_in with struct sockaddr which supports both IPv4 and IPv6, and print using the %pIS format directive. Signed-off-by: Roi Dayan Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iscsi_iser.c | 3 +-- drivers/infiniband/ulp/iser/iscsi_iser.h | 4 ++-- drivers/infiniband/ulp/iser/iser_verbs.c | 17 ++++++----------- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index eb7973957a6e..1a53fd22aedf 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -604,8 +604,7 @@ iscsi_iser_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, ib_conn->ep = ep; iser_conn_init(ib_conn); - err = iser_connect(ib_conn, NULL, (struct sockaddr_in *)dst_addr, - non_blocking); + err = iser_connect(ib_conn, NULL, dst_addr, non_blocking); if (err) return ERR_PTR(err); diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 97cd385bf7f7..37e928477b08 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -448,8 +448,8 @@ int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *task, enum iser_data_dir cmd_dir); int iser_connect(struct iser_conn *ib_conn, - struct sockaddr_in *src_addr, - struct sockaddr_in *dst_addr, + struct sockaddr *src_addr, + struct sockaddr *dst_addr, int non_blocking); int iser_reg_page_vec(struct iser_conn *ib_conn, diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index ea01075f9f9b..dc0c90f4c312 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -810,22 +810,19 @@ void iser_conn_init(struct iser_conn *ib_conn) * sleeps until the connection is established or rejected */ int iser_connect(struct iser_conn *ib_conn, - struct sockaddr_in *src_addr, - struct sockaddr_in *dst_addr, + struct sockaddr *src_addr, + struct sockaddr *dst_addr, int non_blocking) { - struct sockaddr *src, *dst; int err = 0; - sprintf(ib_conn->name, "%pI4:%d", - &dst_addr->sin_addr.s_addr, dst_addr->sin_port); + sprintf(ib_conn->name, "%pISp", dst_addr); + + iser_info("connecting to: %s\n", ib_conn->name); /* the device is known only --after-- address resolution */ ib_conn->device = NULL; - iser_info("connecting to: %pI4, port 0x%x\n", - &dst_addr->sin_addr, dst_addr->sin_port); - ib_conn->state = ISER_CONN_PENDING; ib_conn->cma_id = rdma_create_id(iser_cma_handler, @@ -837,9 +834,7 @@ int iser_connect(struct iser_conn *ib_conn, goto id_failure; } - src = (struct sockaddr *)src_addr; - dst = (struct sockaddr *)dst_addr; - err = rdma_resolve_addr(ib_conn->cma_id, src, dst, 1000); + err = rdma_resolve_addr(ib_conn->cma_id, src_addr, dst_addr, 1000); if (err) { iser_err("rdma_resolve_addr failed: %d\n", err); goto addr_failure; From 9579d603502d0f24272c4dd70451d97c8d306b54 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Thu, 31 Jul 2014 13:27:45 +0300 Subject: [PATCH 19/52] IB/iser: Add TIMEWAIT_EXIT event handling In case the DISCONNECTED event is not delivered after rdma_disconnect is called, the CM waits TIMEWAIT seconds and delivers the TIMEWAIT_EXIT local event. We use this as the notification needed to continue in the teardown and release sequence. Signed-off-by: Ariel Nahum Signed-off-by: Roi Dayan Reviewed-by: Sagi Grimberg Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iser_verbs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index dc0c90f4c312..da6f3ddbb0ab 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -785,6 +785,7 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve case RDMA_CM_EVENT_DISCONNECTED: case RDMA_CM_EVENT_DEVICE_REMOVAL: case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_TIMEWAIT_EXIT: iser_disconnected_handler(cma_id); break; default: From 2ea32938f3a702d08c5cc2cc9cb8b11235eaad8c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 31 Jul 2014 13:27:46 +0300 Subject: [PATCH 20/52] IB/iser: Fix responder resources advertisement The iser initiator is the RDMA responder so it should publish to the target the max inflight rdma read requests its local HCA can handle in responder_resources (max_qp_rd_atom). The iser target should take the min of that and its local HCA max inflight oustanding rdma read requests (max_qp_init_rd_atom). We keep initiator_depth set to 1 in order to compat with old targets. Signed-off-by: Sagi Grimberg Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iser_verbs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index da6f3ddbb0ab..6c7d8ce7b016 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -694,13 +694,15 @@ static void iser_route_handler(struct rdma_cm_id *cma_id) struct rdma_conn_param conn_param; int ret; struct iser_cm_hdr req_hdr; + struct iser_conn *ib_conn = (struct iser_conn *)cma_id->context; + struct iser_device *device = ib_conn->device; ret = iser_create_ib_conn_res((struct iser_conn *)cma_id->context); if (ret) goto failure; memset(&conn_param, 0, sizeof conn_param); - conn_param.responder_resources = 4; + conn_param.responder_resources = device->dev_attr.max_qp_rd_atom; conn_param.initiator_depth = 1; conn_param.retry_count = 7; conn_param.rnr_retry_count = 6; From 0a6907588a8b9bf1aa9ee84b809a1d49caea594a Mon Sep 17 00:00:00 2001 From: Ariel Nahum Date: Thu, 31 Jul 2014 13:27:47 +0300 Subject: [PATCH 21/52] IB/iser: Seperate iser_conn and iscsi_endpoint storage space iser connection needs asynchronous cleanup completions which are triggered in ep_disconnect. As a result we are keeping the corresponding iscsi_endpoint structure hanging for no good reason. In order to avoid that, we seperate iser_conn from iscsi_endpoint storage space to have their destruction being independent. iscsi_endpoint will be destroyed at ep_disconnect stage, while the iser connection will wait for asynchronous completions to be released in an orderly fashion. Signed-off-by: Ariel Nahum Signed-off-by: Roi Dayan Signed-off-by: Sagi Grimberg Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iscsi_iser.c | 16 +++++++++++++--- drivers/infiniband/ulp/iser/iser_verbs.c | 2 +- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 1a53fd22aedf..d7acd4b62d6e 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -596,19 +596,28 @@ iscsi_iser_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr, struct iser_conn *ib_conn; struct iscsi_endpoint *ep; - ep = iscsi_create_endpoint(sizeof(*ib_conn)); + ep = iscsi_create_endpoint(0); if (!ep) return ERR_PTR(-ENOMEM); - ib_conn = ep->dd_data; + ib_conn = kzalloc(sizeof(*ib_conn), GFP_KERNEL); + if (!ib_conn) { + err = -ENOMEM; + goto failure; + } + + ep->dd_data = ib_conn; ib_conn->ep = ep; iser_conn_init(ib_conn); err = iser_connect(ib_conn, NULL, dst_addr, non_blocking); if (err) - return ERR_PTR(err); + goto failure; return ep; +failure: + iscsi_destroy_endpoint(ep); + return ERR_PTR(err); } static int @@ -658,6 +667,7 @@ iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep) } else { iser_conn_release(ib_conn); } + iscsi_destroy_endpoint(ep); } static umode_t iser_attr_is_visible(int param_type, int param) diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 6c7d8ce7b016..fffb4ac4c6ac 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -620,7 +620,7 @@ void iser_conn_release(struct iser_conn *ib_conn) rdma_destroy_id(ib_conn->cma_id); ib_conn->cma_id = NULL; } - iscsi_destroy_endpoint(ib_conn->ep); + kfree(ib_conn); } /** From f1a8bf0983207bebebd13c0507cb341fbffc5ed7 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Thu, 31 Jul 2014 13:27:48 +0300 Subject: [PATCH 22/52] IB/iser: Remove redundant return code in iser_free_ib_conn_res() Make it void. Signed-off-by: Sagi Grimberg Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iser_verbs.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index fffb4ac4c6ac..a5372c6cf9c6 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -491,10 +491,9 @@ out_err: } /** - * releases the QP objects, returns 0 on success, - * -1 on failure + * releases the QP object */ -static int iser_free_ib_conn_res(struct iser_conn *ib_conn) +static void iser_free_ib_conn_res(struct iser_conn *ib_conn) { int cq_index; BUG_ON(ib_conn == NULL); @@ -513,8 +512,6 @@ static int iser_free_ib_conn_res(struct iser_conn *ib_conn) } ib_conn->qp = NULL; - - return 0; } /** From 504130c039f917aba8b145fe8ea99be95e662fca Mon Sep 17 00:00:00 2001 From: Ariel Nahum Date: Thu, 31 Jul 2014 13:27:49 +0300 Subject: [PATCH 23/52] IB/iser: Protect iser state machine with a mutex The iser connection state lookups and transitions are not fully protected. Some transitions are protected with a spinlock, and in some cases the state is accessed unprotected due to specific assumptions of the flow. Introduce a new mutex to protect the connection state access. We use a mutex since we need to also include a scheduling operations executed under the state lock. Each state transition/condition and its corresponding action will be protected with the state mutex. The rdma_cm events handler acquires the mutex when handling connection events. Since iser connection state can transition to DOWN concurrently during connection establishment, we bailout from addr/route resolution events when the state is not PENDING. This addresses a scenario where ep_poll retries expire during CMA connection establishment. In this case ep_disconnect is invoked while CMA events keep coming (address/route resolution, connected, etc...). Signed-off-by: Ariel Nahum Signed-off-by: Sagi Grimberg Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iscsi_iser.c | 15 ++++-- drivers/infiniband/ulp/iser/iscsi_iser.h | 1 + drivers/infiniband/ulp/iser/iser_verbs.c | 63 ++++++++++++++++++++---- 3 files changed, 65 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index d7acd4b62d6e..3dc853c9e4bf 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -632,10 +632,13 @@ iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms) msecs_to_jiffies(timeout_ms)); /* if conn establishment failed, return error code to iscsi */ - if (!rc && - (ib_conn->state == ISER_CONN_TERMINATING || - ib_conn->state == ISER_CONN_DOWN)) - rc = -1; + if (rc == 0) { + mutex_lock(&ib_conn->state_mutex); + if (ib_conn->state == ISER_CONN_TERMINATING || + ib_conn->state == ISER_CONN_DOWN) + rc = -1; + mutex_unlock(&ib_conn->state_mutex); + } iser_info("ib conn %p rc = %d\n", ib_conn, rc); @@ -654,6 +657,7 @@ iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep) ib_conn = ep->dd_data; iser_info("ep %p ib conn %p state %d\n", ep, ib_conn, ib_conn->state); + mutex_lock(&ib_conn->state_mutex); iser_conn_terminate(ib_conn); /* @@ -664,7 +668,10 @@ iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep) if (ib_conn->iscsi_conn) { INIT_WORK(&ib_conn->release_work, iser_release_work); queue_work(release_wq, &ib_conn->release_work); + mutex_unlock(&ib_conn->state_mutex); } else { + ib_conn->state = ISER_CONN_DOWN; + mutex_unlock(&ib_conn->state_mutex); iser_conn_release(ib_conn); } iscsi_destroy_endpoint(ep); diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 37e928477b08..c7efc5a91604 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -335,6 +335,7 @@ struct iser_conn { char name[ISER_OBJECT_NAME_SIZE]; struct work_struct release_work; struct completion stop_completion; + struct mutex state_mutex; struct list_head conn_list; /* entry in ig conn list */ char *login_buf; diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index a5372c6cf9c6..6e7e54d883ab 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -565,16 +565,17 @@ static void iser_device_try_release(struct iser_device *device) mutex_unlock(&ig.device_list_mutex); } +/** + * Called with state mutex held + **/ static int iser_conn_state_comp_exch(struct iser_conn *ib_conn, enum iser_ib_conn_state comp, enum iser_ib_conn_state exch) { int ret; - spin_lock_bh(&ib_conn->lock); if ((ret = (ib_conn->state == comp))) ib_conn->state = exch; - spin_unlock_bh(&ib_conn->lock); return ret; } @@ -591,6 +592,10 @@ void iser_release_work(struct work_struct *work) wait_event_interruptible(ib_conn->wait, ib_conn->state == ISER_CONN_DOWN); + mutex_lock(&ib_conn->state_mutex); + ib_conn->state = ISER_CONN_DOWN; + mutex_unlock(&ib_conn->state_mutex); + iser_conn_release(ib_conn); } @@ -601,17 +606,21 @@ void iser_conn_release(struct iser_conn *ib_conn) { struct iser_device *device = ib_conn->device; - BUG_ON(ib_conn->state == ISER_CONN_UP); - mutex_lock(&ig.connlist_mutex); list_del(&ib_conn->conn_list); mutex_unlock(&ig.connlist_mutex); + + mutex_lock(&ib_conn->state_mutex); + BUG_ON(ib_conn->state != ISER_CONN_DOWN); + iser_free_rx_descriptors(ib_conn); iser_free_ib_conn_res(ib_conn); ib_conn->device = NULL; /* on EVENT_ADDR_ERROR there's no device yet for this conn */ if (device != NULL) iser_device_try_release(device); + mutex_unlock(&ib_conn->state_mutex); + /* if cma handler context, the caller actually destroy the id */ if (ib_conn->cma_id != NULL) { rdma_destroy_id(ib_conn->cma_id); @@ -639,6 +648,9 @@ void iser_conn_terminate(struct iser_conn *ib_conn) ib_conn,err); } +/** + * Called with state mutex held + **/ static void iser_connect_error(struct rdma_cm_id *cma_id) { struct iser_conn *ib_conn; @@ -649,12 +661,20 @@ static void iser_connect_error(struct rdma_cm_id *cma_id) wake_up_interruptible(&ib_conn->wait); } +/** + * Called with state mutex held + **/ static void iser_addr_handler(struct rdma_cm_id *cma_id) { struct iser_device *device; struct iser_conn *ib_conn; int ret; + ib_conn = (struct iser_conn *)cma_id->context; + if (ib_conn->state != ISER_CONN_PENDING) + /* bailout */ + return; + device = iser_device_find_by_ib_device(cma_id); if (!device) { iser_err("device lookup/creation failed\n"); @@ -662,7 +682,6 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id) return; } - ib_conn = (struct iser_conn *)cma_id->context; ib_conn->device = device; /* connection T10-PI support */ @@ -686,6 +705,9 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id) } } +/** + * Called with state mutex held + **/ static void iser_route_handler(struct rdma_cm_id *cma_id) { struct rdma_conn_param conn_param; @@ -694,6 +716,10 @@ static void iser_route_handler(struct rdma_cm_id *cma_id) struct iser_conn *ib_conn = (struct iser_conn *)cma_id->context; struct iser_device *device = ib_conn->device; + if (ib_conn->state != ISER_CONN_PENDING) + /* bailout */ + return; + ret = iser_create_ib_conn_res((struct iser_conn *)cma_id->context); if (ret) goto failure; @@ -727,6 +753,11 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id) struct ib_qp_attr attr; struct ib_qp_init_attr init_attr; + ib_conn = (struct iser_conn *)cma_id->context; + if (ib_conn->state != ISER_CONN_PENDING) + /* bailout */ + return; + (void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr); iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, cma_id->qp->qp_num); @@ -761,9 +792,13 @@ static void iser_disconnected_handler(struct rdma_cm_id *cma_id) static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { + struct iser_conn *ib_conn; + + ib_conn = (struct iser_conn *)cma_id->context; iser_info("event %d status %d conn %p id %p\n", event->event, event->status, cma_id->context, cma_id); + mutex_lock(&ib_conn->state_mutex); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: iser_addr_handler(cma_id); @@ -791,6 +826,7 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve iser_err("Unexpected RDMA CM event (%d)\n", event->event); break; } + mutex_unlock(&ib_conn->state_mutex); return 0; } @@ -803,6 +839,7 @@ void iser_conn_init(struct iser_conn *ib_conn) init_completion(&ib_conn->stop_completion); INIT_LIST_HEAD(&ib_conn->conn_list); spin_lock_init(&ib_conn->lock); + mutex_init(&ib_conn->state_mutex); } /** @@ -816,6 +853,8 @@ int iser_connect(struct iser_conn *ib_conn, { int err = 0; + mutex_lock(&ib_conn->state_mutex); + sprintf(ib_conn->name, "%pISp", dst_addr); iser_info("connecting to: %s\n", ib_conn->name); @@ -849,6 +888,7 @@ int iser_connect(struct iser_conn *ib_conn, goto connect_failure; } } + mutex_unlock(&ib_conn->state_mutex); mutex_lock(&ig.connlist_mutex); list_add(&ib_conn->conn_list, &ig.connlist); @@ -860,6 +900,7 @@ id_failure: addr_failure: ib_conn->state = ISER_CONN_DOWN; connect_failure: + mutex_unlock(&ib_conn->state_mutex); iser_conn_release(ib_conn); return err; } @@ -1044,11 +1085,13 @@ static void iser_handle_comp_error(struct iser_tx_desc *desc, if (ib_conn->post_recv_buf_count == 0 && atomic_read(&ib_conn->post_send_buf_count) == 0) { - /* getting here when the state is UP means that the conn is * - * being terminated asynchronously from the iSCSI layer's * - * perspective. */ - if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP, - ISER_CONN_TERMINATING)) + /** + * getting here when the state is UP means that the conn is + * being terminated asynchronously from the iSCSI layer's + * perspective. It is safe to peek at the connection state + * since iscsi_conn_failure is allowed to be called twice. + **/ + if (ib_conn->state == ISER_CONN_UP) iscsi_conn_failure(ib_conn->iscsi_conn, ISCSI_ERR_CONN_FAILED); From 9a6d3234a192d4a3a51df1042c13af13f996242a Mon Sep 17 00:00:00 2001 From: Ariel Nahum Date: Thu, 31 Jul 2014 13:27:50 +0300 Subject: [PATCH 24/52] IB/iser: Replace connection waitqueue with completion object Instead of waiting for events and condition changes of the iser connection state, we wait for explicit completion of connection establishment and teardown. Separate connection establishment wait object from the teardown object to avoid a situation where racing connection establishment and teardown may concurrently wakeup each other. ep_poll will wait for up_completion invoked by iser_connected_handler() and iser release worker will wait for flush_completion before releasing the connection. Bound the completion wait with a 30 seconds timeout for cases where iscsid (the user space iscsi daemon) is too slow or gone. Signed-off-by: Ariel Nahum Signed-off-by: Sagi Grimberg Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iscsi_iser.c | 13 +++++------ drivers/infiniband/ulp/iser/iscsi_iser.h | 3 ++- drivers/infiniband/ulp/iser/iser_verbs.c | 29 ++++++++++++------------ 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 3dc853c9e4bf..61ee91d88380 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -627,10 +627,8 @@ iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms) int rc; ib_conn = ep->dd_data; - rc = wait_event_interruptible_timeout(ib_conn->wait, - ib_conn->state == ISER_CONN_UP, - msecs_to_jiffies(timeout_ms)); - + rc = wait_for_completion_interruptible_timeout(&ib_conn->up_completion, + msecs_to_jiffies(timeout_ms)); /* if conn establishment failed, return error code to iscsi */ if (rc == 0) { mutex_lock(&ib_conn->state_mutex); @@ -661,9 +659,10 @@ iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep) iser_conn_terminate(ib_conn); /* - * if iser_conn and iscsi_conn are bound, we must wait iscsi_conn_stop - * call and ISER_CONN_DOWN state before freeing the iser resources. - * otherwise we are safe to free resources immediately. + * if iser_conn and iscsi_conn are bound, we must wait for + * iscsi_conn_stop and flush errors completion before freeing + * the iser resources. Otherwise we are safe to free resources + * immediately. */ if (ib_conn->iscsi_conn) { INIT_WORK(&ib_conn->release_work, iser_release_work); diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index c7efc5a91604..c877dad381cb 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -326,7 +326,6 @@ struct iser_conn { struct iser_device *device; /* device context */ struct rdma_cm_id *cma_id; /* CMA ID */ struct ib_qp *qp; /* QP */ - wait_queue_head_t wait; /* waitq for conn/disconn */ unsigned qp_max_recv_dtos; /* num of rx buffers */ unsigned qp_max_recv_dtos_mask; /* above minus 1 */ unsigned min_posted_rx; /* qp_max_recv_dtos >> 2 */ @@ -336,6 +335,8 @@ struct iser_conn { struct work_struct release_work; struct completion stop_completion; struct mutex state_mutex; + struct completion flush_completion; + struct completion up_completion; struct list_head conn_list; /* entry in ig conn list */ char *login_buf; diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 6e7e54d883ab..06a49b3df3fd 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -582,15 +582,19 @@ static int iser_conn_state_comp_exch(struct iser_conn *ib_conn, void iser_release_work(struct work_struct *work) { struct iser_conn *ib_conn; + int rc; ib_conn = container_of(work, struct iser_conn, release_work); /* wait for .conn_stop callback */ - wait_for_completion(&ib_conn->stop_completion); + rc = wait_for_completion_timeout(&ib_conn->stop_completion, 30 * HZ); + WARN_ON(rc == 0); /* wait for the qp`s post send and post receive buffers to empty */ - wait_event_interruptible(ib_conn->wait, - ib_conn->state == ISER_CONN_DOWN); + rc = wait_for_completion_timeout(&ib_conn->flush_completion, 30 * HZ); + WARN_ON(rc == 0); + + ib_conn->state = ISER_CONN_DOWN; mutex_lock(&ib_conn->state_mutex); ib_conn->state = ISER_CONN_DOWN; @@ -656,9 +660,7 @@ static void iser_connect_error(struct rdma_cm_id *cma_id) struct iser_conn *ib_conn; ib_conn = (struct iser_conn *)cma_id->context; - ib_conn->state = ISER_CONN_DOWN; - wake_up_interruptible(&ib_conn->wait); } /** @@ -761,9 +763,8 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id) (void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr); iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, cma_id->qp->qp_num); - ib_conn = (struct iser_conn *)cma_id->context; - if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_PENDING, ISER_CONN_UP)) - wake_up_interruptible(&ib_conn->wait); + ib_conn->state = ISER_CONN_UP; + complete(&ib_conn->up_completion); } static void iser_disconnected_handler(struct rdma_cm_id *cma_id) @@ -785,8 +786,7 @@ static void iser_disconnected_handler(struct rdma_cm_id *cma_id) /* Complete the termination process if no posts are pending */ if (ib_conn->post_recv_buf_count == 0 && (atomic_read(&ib_conn->post_send_buf_count) == 0)) { - ib_conn->state = ISER_CONN_DOWN; - wake_up_interruptible(&ib_conn->wait); + complete(&ib_conn->flush_completion); } } @@ -833,10 +833,11 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve void iser_conn_init(struct iser_conn *ib_conn) { ib_conn->state = ISER_CONN_INIT; - init_waitqueue_head(&ib_conn->wait); ib_conn->post_recv_buf_count = 0; atomic_set(&ib_conn->post_send_buf_count, 0); init_completion(&ib_conn->stop_completion); + init_completion(&ib_conn->flush_completion); + init_completion(&ib_conn->up_completion); INIT_LIST_HEAD(&ib_conn->conn_list); spin_lock_init(&ib_conn->lock); mutex_init(&ib_conn->state_mutex); @@ -880,8 +881,7 @@ int iser_connect(struct iser_conn *ib_conn, } if (!non_blocking) { - wait_event_interruptible(ib_conn->wait, - (ib_conn->state != ISER_CONN_PENDING)); + wait_for_completion_interruptible(&ib_conn->up_completion); if (ib_conn->state != ISER_CONN_UP) { err = -EIO; @@ -1097,8 +1097,7 @@ static void iser_handle_comp_error(struct iser_tx_desc *desc, /* no more non completed posts to the QP, complete the * termination process w.o worrying on disconnect event */ - ib_conn->state = ISER_CONN_DOWN; - wake_up_interruptible(&ib_conn->wait); + complete(&ib_conn->flush_completion); } } From 8d4aca7f04a098086c6545df158e5a022cbbfcd1 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Thu, 31 Jul 2014 13:27:51 +0300 Subject: [PATCH 25/52] IB/iser: Clarify a duplicate counters check This is to prevent someone from thinking that this code section is redundant. Signed-off-by: Ariel Nahum Signed-off-by: Roi Dayan Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/iser/iser_verbs.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 06a49b3df3fd..3ef167f97d6f 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -783,7 +783,10 @@ static void iser_disconnected_handler(struct rdma_cm_id *cma_id) iser_err("iscsi_iser connection isn't bound\n"); } - /* Complete the termination process if no posts are pending */ + /* Complete the termination process if no posts are pending. This code + * block also exists in iser_handle_comp_error(), but it is needed here + * for cases of no flushes at all, e.g. discovery over rdma. + */ if (ib_conn->post_recv_buf_count == 0 && (atomic_read(&ib_conn->post_send_buf_count) == 0)) { complete(&ib_conn->flush_completion); From 7e6edb9b2e0bcfb2a588db390c44d120213c57ae Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 31 Jul 2014 11:01:28 +0300 Subject: [PATCH 26/52] IB/core: Add user MR re-registration support Memory re-registration is a feature that enables changing the attributes of a memory region registered by user-space, including PD, translation (address and length) and access flags. Add the required support in uverbs and the kernel verbs API. Signed-off-by: Matan Barak Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs.h | 1 + drivers/infiniband/core/uverbs_cmd.c | 93 +++++++++++++++++++++++++++ drivers/infiniband/core/uverbs_main.c | 1 + include/rdma/ib_verbs.h | 10 ++- include/uapi/rdma/ib_user_verbs.h | 16 +++++ 5 files changed, 120 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index a283274a5a09..643c08a025a5 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -221,6 +221,7 @@ IB_UVERBS_DECLARE_CMD(query_port); IB_UVERBS_DECLARE_CMD(alloc_pd); IB_UVERBS_DECLARE_CMD(dealloc_pd); IB_UVERBS_DECLARE_CMD(reg_mr); +IB_UVERBS_DECLARE_CMD(rereg_mr); IB_UVERBS_DECLARE_CMD(dereg_mr); IB_UVERBS_DECLARE_CMD(alloc_mw); IB_UVERBS_DECLARE_CMD(dealloc_mw); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index ea6203ee7bcc..0600c50e6215 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -1002,6 +1002,99 @@ err_free: return ret; } +ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, + const char __user *buf, int in_len, + int out_len) +{ + struct ib_uverbs_rereg_mr cmd; + struct ib_uverbs_rereg_mr_resp resp; + struct ib_udata udata; + struct ib_pd *pd = NULL; + struct ib_mr *mr; + struct ib_pd *old_pd; + int ret; + struct ib_uobject *uobj; + + if (out_len < sizeof(resp)) + return -ENOSPC; + + if (copy_from_user(&cmd, buf, sizeof(cmd))) + return -EFAULT; + + INIT_UDATA(&udata, buf + sizeof(cmd), + (unsigned long) cmd.response + sizeof(resp), + in_len - sizeof(cmd), out_len - sizeof(resp)); + + if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags) + return -EINVAL; + + if ((cmd.flags & IB_MR_REREG_TRANS) && + (!cmd.start || !cmd.hca_va || 0 >= cmd.length || + (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))) + return -EINVAL; + + uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, + file->ucontext); + + if (!uobj) + return -EINVAL; + + mr = uobj->object; + + if (cmd.flags & IB_MR_REREG_ACCESS) { + ret = ib_check_mr_access(cmd.access_flags); + if (ret) + goto put_uobjs; + } + + if (cmd.flags & IB_MR_REREG_PD) { + pd = idr_read_pd(cmd.pd_handle, file->ucontext); + if (!pd) { + ret = -EINVAL; + goto put_uobjs; + } + } + + if (atomic_read(&mr->usecnt)) { + ret = -EBUSY; + goto put_uobj_pd; + } + + old_pd = mr->pd; + ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start, + cmd.length, cmd.hca_va, + cmd.access_flags, pd, &udata); + if (!ret) { + if (cmd.flags & IB_MR_REREG_PD) { + atomic_inc(&pd->usecnt); + mr->pd = pd; + atomic_dec(&old_pd->usecnt); + } + } else { + goto put_uobj_pd; + } + + memset(&resp, 0, sizeof(resp)); + resp.lkey = mr->lkey; + resp.rkey = mr->rkey; + + if (copy_to_user((void __user *)(unsigned long)cmd.response, + &resp, sizeof(resp))) + ret = -EFAULT; + else + ret = in_len; + +put_uobj_pd: + if (cmd.flags & IB_MR_REREG_PD) + put_pd_read(pd); + +put_uobjs: + + put_uobj_write(mr->uobject); + + return ret; +} + ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file, const char __user *buf, int in_len, int out_len) diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 08219fb3338b..c73b22a257fe 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -87,6 +87,7 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_ALLOC_PD] = ib_uverbs_alloc_pd, [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd, [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr, + [IB_USER_VERBS_CMD_REREG_MR] = ib_uverbs_rereg_mr, [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr, [IB_USER_VERBS_CMD_ALLOC_MW] = ib_uverbs_alloc_mw, [IB_USER_VERBS_CMD_DEALLOC_MW] = ib_uverbs_dealloc_mw, diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 7ccef342f724..ed44cc07a7b3 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1097,7 +1097,8 @@ struct ib_mr_attr { enum ib_mr_rereg_flags { IB_MR_REREG_TRANS = 1, IB_MR_REREG_PD = (1<<1), - IB_MR_REREG_ACCESS = (1<<2) + IB_MR_REREG_ACCESS = (1<<2), + IB_MR_REREG_SUPPORTED = ((IB_MR_REREG_ACCESS << 1) - 1) }; /** @@ -1547,6 +1548,13 @@ struct ib_device { u64 virt_addr, int mr_access_flags, struct ib_udata *udata); + int (*rereg_user_mr)(struct ib_mr *mr, + int flags, + u64 start, u64 length, + u64 virt_addr, + int mr_access_flags, + struct ib_pd *pd, + struct ib_udata *udata); int (*query_mr)(struct ib_mr *mr, struct ib_mr_attr *mr_attr); int (*dereg_mr)(struct ib_mr *mr); diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index cbfdd4ca9510..26daf55ff76e 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -276,6 +276,22 @@ struct ib_uverbs_reg_mr_resp { __u32 rkey; }; +struct ib_uverbs_rereg_mr { + __u64 response; + __u32 mr_handle; + __u32 flags; + __u64 start; + __u64 length; + __u64 hca_va; + __u32 pd_handle; + __u32 access_flags; +}; + +struct ib_uverbs_rereg_mr_resp { + __u32 lkey; + __u32 rkey; +}; + struct ib_uverbs_dereg_mr { __u32 mr_handle; }; From e630664c8383f300c4146d7613d61e5a8eb1f8e3 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 31 Jul 2014 11:01:29 +0300 Subject: [PATCH 27/52] mlx4_core: Add helper functions to support MR re-registration Add few helper functions to support a mechanism of getting an MPT, modifying it and updating the HCA with the modified object. The code takes 2 paths, one for directly changing the MPT (and sometimes its related MTTs) and another one which queries the MPT and updates the HCA via fw command SW2HW_MPT. The first path is used in native mode; the second path is slower and is used only in SRIOV. Signed-off-by: Jack Morgenstein Signed-off-by: Matan Barak Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/net/ethernet/mellanox/mlx4/mlx4.h | 2 + drivers/net/ethernet/mellanox/mlx4/mr.c | 160 ++++++++++++++++++ .../ethernet/mellanox/mlx4/resource_tracker.c | 26 ++- include/linux/mlx4/device.h | 16 ++ 4 files changed, 202 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index 1d8af7336807..b40d587974fa 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -279,6 +279,8 @@ struct mlx4_icm_table { #define MLX4_MPT_FLAG_PHYSICAL (1 << 9) #define MLX4_MPT_FLAG_REGION (1 << 8) +#define MLX4_MPT_PD_MASK (0x1FFFFUL) +#define MLX4_MPT_PD_VF_MASK (0xFE0000UL) #define MLX4_MPT_PD_FLAG_FAST_REG (1 << 27) #define MLX4_MPT_PD_FLAG_RAE (1 << 28) #define MLX4_MPT_PD_FLAG_EN_INV (3 << 24) diff --git a/drivers/net/ethernet/mellanox/mlx4/mr.c b/drivers/net/ethernet/mellanox/mlx4/mr.c index 2839abb878a6..7d717eccb7b0 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mr.c +++ b/drivers/net/ethernet/mellanox/mlx4/mr.c @@ -298,6 +298,131 @@ static int mlx4_HW2SW_MPT(struct mlx4_dev *dev, struct mlx4_cmd_mailbox *mailbox MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED); } +int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, + struct mlx4_mpt_entry ***mpt_entry) +{ + int err; + int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1); + struct mlx4_cmd_mailbox *mailbox = NULL; + + /* Make sure that at this point we have single-threaded access only */ + + if (mmr->enabled != MLX4_MPT_EN_HW) + return -EINVAL; + + err = mlx4_HW2SW_MPT(dev, NULL, key); + + if (err) { + mlx4_warn(dev, "HW2SW_MPT failed (%d).", err); + mlx4_warn(dev, "Most likely the MR has MWs bound to it.\n"); + return err; + } + + mmr->enabled = MLX4_MPT_EN_SW; + + if (!mlx4_is_mfunc(dev)) { + **mpt_entry = mlx4_table_find( + &mlx4_priv(dev)->mr_table.dmpt_table, + key, NULL); + } else { + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR_OR_NULL(mailbox)) + return PTR_ERR(mailbox); + + err = mlx4_cmd_box(dev, 0, mailbox->dma, key, + 0, MLX4_CMD_QUERY_MPT, + MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + + if (err) + goto free_mailbox; + + *mpt_entry = (struct mlx4_mpt_entry **)&mailbox->buf; + } + + if (!(*mpt_entry) || !(**mpt_entry)) { + err = -ENOMEM; + goto free_mailbox; + } + + return 0; + +free_mailbox: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_get_mpt); + +int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, + struct mlx4_mpt_entry **mpt_entry) +{ + int err; + + if (!mlx4_is_mfunc(dev)) { + /* Make sure any changes to this entry are flushed */ + wmb(); + + *(u8 *)(*mpt_entry) = MLX4_MPT_STATUS_HW; + + /* Make sure the new status is written */ + wmb(); + + err = mlx4_SYNC_TPT(dev); + } else { + int key = key_to_hw_index(mmr->key) & (dev->caps.num_mpts - 1); + + struct mlx4_cmd_mailbox *mailbox = + container_of((void *)mpt_entry, struct mlx4_cmd_mailbox, + buf); + + err = mlx4_SW2HW_MPT(dev, mailbox, key); + } + + mmr->pd = be32_to_cpu((*mpt_entry)->pd_flags) & MLX4_MPT_PD_MASK; + if (!err) + mmr->enabled = MLX4_MPT_EN_HW; + return err; +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_write_mpt); + +void mlx4_mr_hw_put_mpt(struct mlx4_dev *dev, + struct mlx4_mpt_entry **mpt_entry) +{ + if (mlx4_is_mfunc(dev)) { + struct mlx4_cmd_mailbox *mailbox = + container_of((void *)mpt_entry, struct mlx4_cmd_mailbox, + buf); + mlx4_free_cmd_mailbox(dev, mailbox); + } +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_put_mpt); + +int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry *mpt_entry, + u32 pdn) +{ + u32 pd_flags = be32_to_cpu(mpt_entry->pd_flags); + /* The wrapper function will put the slave's id here */ + if (mlx4_is_mfunc(dev)) + pd_flags &= ~MLX4_MPT_PD_VF_MASK; + mpt_entry->pd_flags = cpu_to_be32((pd_flags & ~MLX4_MPT_PD_MASK) | + (pdn & MLX4_MPT_PD_MASK) + | MLX4_MPT_PD_FLAG_EN_INV); + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_pd); + +int mlx4_mr_hw_change_access(struct mlx4_dev *dev, + struct mlx4_mpt_entry *mpt_entry, + u32 access) +{ + u32 flags = (be32_to_cpu(mpt_entry->flags) & ~MLX4_PERM_MASK) | + (access & MLX4_PERM_MASK); + + mpt_entry->flags = cpu_to_be32(flags); + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_mr_hw_change_access); + static int mlx4_mr_alloc_reserved(struct mlx4_dev *dev, u32 mridx, u32 pd, u64 iova, u64 size, u32 access, int npages, int page_shift, struct mlx4_mr *mr) @@ -463,6 +588,41 @@ int mlx4_mr_free(struct mlx4_dev *dev, struct mlx4_mr *mr) } EXPORT_SYMBOL_GPL(mlx4_mr_free); +void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct mlx4_mr *mr) +{ + mlx4_mtt_cleanup(dev, &mr->mtt); +} +EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_cleanup); + +int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr, + u64 iova, u64 size, int npages, + int page_shift, struct mlx4_mpt_entry *mpt_entry) +{ + int err; + + mpt_entry->start = cpu_to_be64(mr->iova); + mpt_entry->length = cpu_to_be64(mr->size); + mpt_entry->entity_size = cpu_to_be32(mr->mtt.page_shift); + + err = mlx4_mtt_init(dev, npages, page_shift, &mr->mtt); + if (err) + return err; + + if (mr->mtt.order < 0) { + mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL); + mpt_entry->mtt_addr = 0; + } else { + mpt_entry->mtt_addr = cpu_to_be64(mlx4_mtt_addr(dev, + &mr->mtt)); + if (mr->mtt.page_shift == 0) + mpt_entry->mtt_sz = cpu_to_be32(1 << mr->mtt.order); + } + mr->enabled = MLX4_MPT_EN_SW; + + return 0; +} +EXPORT_SYMBOL_GPL(mlx4_mr_rereg_mem_write); + int mlx4_mr_enable(struct mlx4_dev *dev, struct mlx4_mr *mr) { struct mlx4_cmd_mailbox *mailbox; diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c index 0efc1368e5a8..1089367fed22 100644 --- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c @@ -2613,12 +2613,34 @@ int mlx4_QUERY_MPT_wrapper(struct mlx4_dev *dev, int slave, if (err) return err; - if (mpt->com.from_state != RES_MPT_HW) { + if (mpt->com.from_state == RES_MPT_MAPPED) { + /* In order to allow rereg in SRIOV, we need to alter the MPT entry. To do + * that, the VF must read the MPT. But since the MPT entry memory is not + * in the VF's virtual memory space, it must use QUERY_MPT to obtain the + * entry contents. To guarantee that the MPT cannot be changed, the driver + * must perform HW2SW_MPT before this query and return the MPT entry to HW + * ownership fofollowing the change. The change here allows the VF to + * perform QUERY_MPT also when the entry is in SW ownership. + */ + struct mlx4_mpt_entry *mpt_entry = mlx4_table_find( + &mlx4_priv(dev)->mr_table.dmpt_table, + mpt->key, NULL); + + if (NULL == mpt_entry || NULL == outbox->buf) { + err = -EINVAL; + goto out; + } + + memcpy(outbox->buf, mpt_entry, sizeof(*mpt_entry)); + + err = 0; + } else if (mpt->com.from_state == RES_MPT_HW) { + err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); + } else { err = -EBUSY; goto out; } - err = mlx4_DMA_wrapper(dev, slave, vhcr, inbox, outbox, cmd); out: put_res(dev, slave, id, RES_MPT); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 35b51e7af886..bac002167ace 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -262,6 +262,7 @@ enum { MLX4_PERM_REMOTE_WRITE = 1 << 13, MLX4_PERM_ATOMIC = 1 << 14, MLX4_PERM_BIND_MW = 1 << 15, + MLX4_PERM_MASK = 0xFC00 }; enum { @@ -1243,4 +1244,19 @@ int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port); int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port); int mlx4_vf_set_enable_smi_admin(struct mlx4_dev *dev, int slave, int port, int enable); +int mlx4_mr_hw_get_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, + struct mlx4_mpt_entry ***mpt_entry); +int mlx4_mr_hw_write_mpt(struct mlx4_dev *dev, struct mlx4_mr *mmr, + struct mlx4_mpt_entry **mpt_entry); +int mlx4_mr_hw_change_pd(struct mlx4_dev *dev, struct mlx4_mpt_entry *mpt_entry, + u32 pdn); +int mlx4_mr_hw_change_access(struct mlx4_dev *dev, + struct mlx4_mpt_entry *mpt_entry, + u32 access); +void mlx4_mr_hw_put_mpt(struct mlx4_dev *dev, + struct mlx4_mpt_entry **mpt_entry); +void mlx4_mr_rereg_mem_cleanup(struct mlx4_dev *dev, struct mlx4_mr *mr); +int mlx4_mr_rereg_mem_write(struct mlx4_dev *dev, struct mlx4_mr *mr, + u64 iova, u64 size, int npages, + int page_shift, struct mlx4_mpt_entry *mpt_entry); #endif /* MLX4_DEVICE_H */ From 9376932d0c26d5f5f89c95d5bd45123bba96d3a9 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Thu, 31 Jul 2014 11:01:30 +0300 Subject: [PATCH 28/52] IB/mlx4_ib: Add support for user MR re-registration This enables the user to change the protection domain, access flags and translation (address and length) of the MR. Use basic mlx4_core helper functions to get, update and set MPT and MTT objects according to the required modifications. Signed-off-by: Matan Barak Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 2 + drivers/infiniband/hw/mlx4/mlx4_ib.h | 4 ++ drivers/infiniband/hw/mlx4/mr.c | 88 +++++++++++++++++++++++++++- 3 files changed, 93 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 0f7027e7db13..828a37b24816 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2007,6 +2007,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) (1ull << IB_USER_VERBS_CMD_ALLOC_PD) | (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) | (1ull << IB_USER_VERBS_CMD_REG_MR) | + (1ull << IB_USER_VERBS_CMD_REREG_MR) | (1ull << IB_USER_VERBS_CMD_DEREG_MR) | (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) | (1ull << IB_USER_VERBS_CMD_CREATE_CQ) | @@ -2059,6 +2060,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.req_notify_cq = mlx4_ib_arm_cq; ibdev->ib_dev.get_dma_mr = mlx4_ib_get_dma_mr; ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr; + ibdev->ib_dev.rereg_user_mr = mlx4_ib_rereg_user_mr; ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr; ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr; ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list; diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 369da3ca5d64..e8cad3926bfc 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -788,5 +788,9 @@ int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn); void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count); int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp, int is_attach); +int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, + u64 start, u64 length, u64 virt_addr, + int mr_access_flags, struct ib_pd *pd, + struct ib_udata *udata); #endif /* MLX4_IB_H */ diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index cb2a8727f3fb..9b0e80e59b08 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -144,8 +144,10 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, if (!mr) return ERR_PTR(-ENOMEM); + /* Force registering the memory as writable. */ + /* Used for memory re-registeration. HCA protects the access */ mr->umem = ib_umem_get(pd->uobject->context, start, length, - access_flags, 0); + access_flags | IB_ACCESS_LOCAL_WRITE, 0); if (IS_ERR(mr->umem)) { err = PTR_ERR(mr->umem); goto err_free; @@ -183,6 +185,90 @@ err_free: return ERR_PTR(err); } +int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags, + u64 start, u64 length, u64 virt_addr, + int mr_access_flags, struct ib_pd *pd, + struct ib_udata *udata) +{ + struct mlx4_ib_dev *dev = to_mdev(mr->device); + struct mlx4_ib_mr *mmr = to_mmr(mr); + struct mlx4_mpt_entry *mpt_entry; + struct mlx4_mpt_entry **pmpt_entry = &mpt_entry; + int err; + + /* Since we synchronize this call and mlx4_ib_dereg_mr via uverbs, + * we assume that the calls can't run concurrently. Otherwise, a + * race exists. + */ + err = mlx4_mr_hw_get_mpt(dev->dev, &mmr->mmr, &pmpt_entry); + + if (err) + return err; + + if (flags & IB_MR_REREG_PD) { + err = mlx4_mr_hw_change_pd(dev->dev, *pmpt_entry, + to_mpd(pd)->pdn); + + if (err) + goto release_mpt_entry; + } + + if (flags & IB_MR_REREG_ACCESS) { + err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry, + convert_access(mr_access_flags)); + + if (err) + goto release_mpt_entry; + } + + if (flags & IB_MR_REREG_TRANS) { + int shift; + int err; + int n; + + mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); + ib_umem_release(mmr->umem); + mmr->umem = ib_umem_get(mr->uobject->context, start, length, + mr_access_flags | + IB_ACCESS_LOCAL_WRITE, + 0); + if (IS_ERR(mmr->umem)) { + err = PTR_ERR(mmr->umem); + mmr->umem = NULL; + goto release_mpt_entry; + } + n = ib_umem_page_count(mmr->umem); + shift = ilog2(mmr->umem->page_size); + + mmr->mmr.iova = virt_addr; + mmr->mmr.size = length; + err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr, + virt_addr, length, n, shift, + *pmpt_entry); + if (err) { + ib_umem_release(mmr->umem); + goto release_mpt_entry; + } + + err = mlx4_ib_umem_write_mtt(dev, &mmr->mmr.mtt, mmr->umem); + if (err) { + mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr); + ib_umem_release(mmr->umem); + goto release_mpt_entry; + } + } + + /* If we couldn't transfer the MR to the HCA, just remember to + * return a failure. But dereg_mr will free the resources. + */ + err = mlx4_mr_hw_write_mpt(dev->dev, &mmr->mmr, pmpt_entry); + +release_mpt_entry: + mlx4_mr_hw_put_mpt(dev->dev, pmpt_entry); + + return err; +} + int mlx4_ib_dereg_mr(struct ib_mr *ibmr) { struct mlx4_ib_mr *mr = to_mmr(ibmr); From cd53eb686d2418eda938aad3c9da42b7dfa9351f Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 9 Jul 2014 15:56:43 +0200 Subject: [PATCH 29/52] scsi_transport_srp: Fix fast_io_fail_tmo=dev_loss_tmo=off behavior If scsi_remove_host() is called while an rport is in the blocked state then scsi_remove_host() will only finish if the rport is unblocked from inside a timer function. Make sure that an rport only enters the blocked state if a timer will be started that will unblock it. This avoids that unloading the ib_srp kernel module after having disconnected the initiator from the target system results in a deadlock if both the fast_io_fail_tmo and dev_loss_tmo parameters have been set to "off". Signed-off-by: Bart Van Assche Reviewed-by: Sagi Grimberg Reviewed-by: David Dillow Cc: Signed-off-by: Roland Dreier --- drivers/scsi/scsi_transport_srp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c index 13e898332e45..a0c5bfdc5366 100644 --- a/drivers/scsi/scsi_transport_srp.c +++ b/drivers/scsi/scsi_transport_srp.c @@ -473,7 +473,8 @@ static void __srp_start_tl_fail_timers(struct srp_rport *rport) if (delay > 0) queue_delayed_work(system_long_wq, &rport->reconnect_work, 1UL * delay * HZ); - if (srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) { + if ((fast_io_fail_tmo >= 0 || dev_loss_tmo >= 0) && + srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) { pr_debug("%s new state: %d\n", dev_name(&shost->shost_gendev), rport->state); scsi_target_block(&shost->shost_gendev); From bcc05910359183b431da92713e98eed478edf83a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 9 Jul 2014 15:57:26 +0200 Subject: [PATCH 30/52] IB/srp: Fix deadlock between host removal and multipathd If scsi_remove_host() is invoked after a SCSI device has been blocked, if the fast_io_fail_tmo or dev_loss_tmo work gets scheduled on the workqueue executing srp_remove_work() and if an I/O request is scheduled after the SCSI device had been blocked by e.g. multipathd then the following deadlock can occur: kworker/6:1 D ffff880831f3c460 0 195 2 0x00000000 Call Trace: [] schedule+0x29/0x70 [] schedule_timeout+0x10f/0x2a0 [] msleep+0x2f/0x40 [] __blk_drain_queue+0x4e/0x180 [] blk_cleanup_queue+0x225/0x230 [] __scsi_remove_device+0x62/0xe0 [scsi_mod] [] scsi_forget_host+0x6f/0x80 [scsi_mod] [] scsi_remove_host+0x7a/0x130 [scsi_mod] [] srp_remove_work+0x95/0x180 [ib_srp] [] process_one_work+0x1ea/0x6c0 [] worker_thread+0x11b/0x3a0 [] kthread+0xed/0x110 [] ret_from_fork+0x7c/0xb0 multipathd D ffff880096acc460 0 5340 1 0x00000000 Call Trace: [] schedule+0x29/0x70 [] schedule_timeout+0x10f/0x2a0 [] io_schedule_timeout+0x9b/0xf0 [] wait_for_completion_io_timeout+0xdc/0x110 [] blk_execute_rq+0x9b/0x100 [] sg_io+0x1a5/0x450 [] scsi_cmd_ioctl+0x2a1/0x430 [] scsi_cmd_blk_ioctl+0x42/0x50 [] sd_ioctl+0xbe/0x140 [sd_mod] [] blkdev_ioctl+0x234/0x840 [] block_ioctl+0x41/0x50 [] do_vfs_ioctl+0x300/0x520 [] SyS_ioctl+0x41/0x80 [] tracesys+0xd0/0xd5 Fix this by scheduling removal work on another workqueue than the transport layer timers. Signed-off-by: Bart Van Assche Reviewed-by: Sagi Grimberg Reviewed-by: David Dillow Cc: Sebastian Parschauer Cc: Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 38 +++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index e3c2c5b4297f..767000811cf9 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -130,6 +130,7 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr); static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); static struct scsi_transport_template *ib_srp_transport_template; +static struct workqueue_struct *srp_remove_wq; static struct ib_client srp_client = { .name = "srp", @@ -731,7 +732,7 @@ static bool srp_queue_remove_work(struct srp_target_port *target) spin_unlock_irq(&target->lock); if (changed) - queue_work(system_long_wq, &target->remove_work); + queue_work(srp_remove_wq, &target->remove_work); return changed; } @@ -3261,9 +3262,10 @@ static void srp_remove_one(struct ib_device *device) spin_unlock(&host->target_lock); /* - * Wait for target port removal tasks. + * Wait for tl_err and target port removal tasks. */ flush_workqueue(system_long_wq); + flush_workqueue(srp_remove_wq); kfree(host); } @@ -3313,16 +3315,22 @@ static int __init srp_init_module(void) indirect_sg_entries = cmd_sg_entries; } + srp_remove_wq = create_workqueue("srp_remove"); + if (IS_ERR(srp_remove_wq)) { + ret = PTR_ERR(srp_remove_wq); + goto out; + } + + ret = -ENOMEM; ib_srp_transport_template = srp_attach_transport(&ib_srp_transport_functions); if (!ib_srp_transport_template) - return -ENOMEM; + goto destroy_wq; ret = class_register(&srp_class); if (ret) { pr_err("couldn't register class infiniband_srp\n"); - srp_release_transport(ib_srp_transport_template); - return ret; + goto release_tr; } ib_sa_register_client(&srp_sa_client); @@ -3330,13 +3338,22 @@ static int __init srp_init_module(void) ret = ib_register_client(&srp_client); if (ret) { pr_err("couldn't register IB client\n"); - srp_release_transport(ib_srp_transport_template); - ib_sa_unregister_client(&srp_sa_client); - class_unregister(&srp_class); - return ret; + goto unreg_sa; } - return 0; +out: + return ret; + +unreg_sa: + ib_sa_unregister_client(&srp_sa_client); + class_unregister(&srp_class); + +release_tr: + srp_release_transport(ib_srp_transport_template); + +destroy_wq: + destroy_workqueue(srp_remove_wq); + goto out; } static void __exit srp_cleanup_module(void) @@ -3345,6 +3362,7 @@ static void __exit srp_cleanup_module(void) ib_sa_unregister_client(&srp_sa_client); class_unregister(&srp_class); srp_release_transport(ib_srp_transport_template); + destroy_workqueue(srp_remove_wq); } module_init(srp_init_module); From e714531a349f614885ca11f68c38270940c5e915 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 9 Jul 2014 15:57:51 +0200 Subject: [PATCH 31/52] IB/srp: Fix residual handling From Documentation/scsi/scsi_mid_low_api.txt: "resid - an LLD should set this signed integer to the requested transfer length (i.e. 'request_bufflen') less the number of bytes that are actually transferred." This means that resid > 0 in case of an underrun and also that resid < 0 in case of an overrun. Modify the SRP initiator code such that it matches this requirement. Signed-off-by: Bart Van Assche Reviewed-by: Sagi Grimberg Reviewed-by: David Dillow Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 767000811cf9..7f5ee7fc02a6 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1644,10 +1644,14 @@ static void srp_process_rsp(struct srp_target_port *target, struct srp_rsp *rsp) SCSI_SENSE_BUFFERSIZE)); } - if (rsp->flags & (SRP_RSP_FLAG_DOOVER | SRP_RSP_FLAG_DOUNDER)) - scsi_set_resid(scmnd, be32_to_cpu(rsp->data_out_res_cnt)); - else if (rsp->flags & (SRP_RSP_FLAG_DIOVER | SRP_RSP_FLAG_DIUNDER)) + if (unlikely(rsp->flags & SRP_RSP_FLAG_DIUNDER)) scsi_set_resid(scmnd, be32_to_cpu(rsp->data_in_res_cnt)); + else if (unlikely(rsp->flags & SRP_RSP_FLAG_DIOVER)) + scsi_set_resid(scmnd, -be32_to_cpu(rsp->data_in_res_cnt)); + else if (unlikely(rsp->flags & SRP_RSP_FLAG_DOUNDER)) + scsi_set_resid(scmnd, be32_to_cpu(rsp->data_out_res_cnt)); + else if (unlikely(rsp->flags & SRP_RSP_FLAG_DOOVER)) + scsi_set_resid(scmnd, -be32_to_cpu(rsp->data_out_res_cnt)); srp_free_req(target, req, scmnd, be32_to_cpu(rsp->req_lim_delta)); From 2f0304d21867476394cd51a54e97f7273d112261 Mon Sep 17 00:00:00 2001 From: Steve Wise Date: Fri, 25 Jul 2014 09:11:33 -0500 Subject: [PATCH 32/52] RDMA/iwcm: Use a default listen backlog if needed If the user creates a listening cm_id with backlog of 0 the IWCM ends up not allowing any connection requests at all. The correct behavior is for the IWCM to pick a default value if the user backlog parameter is zero. Lustre from version 1.8.8 onward uses a backlog of 0, which breaks iwarp support without this fix. Signed-off-by: Steve Wise Cc: Signed-off-by: Roland Dreier --- drivers/infiniband/core/iwcm.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 3d2e489ab732..ff9163dc1596 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -65,6 +66,20 @@ struct iwcm_work { struct list_head free_list; }; +static unsigned int default_backlog = 256; + +static struct ctl_table_header *iwcm_ctl_table_hdr; +static struct ctl_table iwcm_ctl_table[] = { + { + .procname = "default_backlog", + .data = &default_backlog, + .maxlen = sizeof(default_backlog), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { } +}; + /* * The following services provide a mechanism for pre-allocating iwcm_work * elements. The design pre-allocates them based on the cm_id type: @@ -425,6 +440,9 @@ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog) cm_id_priv = container_of(cm_id, struct iwcm_id_private, id); + if (!backlog) + backlog = default_backlog; + ret = alloc_work_entries(cm_id_priv, backlog); if (ret) return ret; @@ -1030,11 +1048,20 @@ static int __init iw_cm_init(void) if (!iwcm_wq) return -ENOMEM; + iwcm_ctl_table_hdr = register_net_sysctl(&init_net, "net/iw_cm", + iwcm_ctl_table); + if (!iwcm_ctl_table_hdr) { + pr_err("iw_cm: couldn't register sysctl paths\n"); + destroy_workqueue(iwcm_wq); + return -ENOMEM; + } + return 0; } static void __exit iw_cm_cleanup(void) { + unregister_net_sysctl_table(iwcm_ctl_table_hdr); destroy_workqueue(iwcm_wq); } From 114840c3d29b9cbd867faa69595a2aee6f6b54a2 Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Sun, 1 Jun 2014 11:53:50 +0300 Subject: [PATCH 33/52] mlx4_core: Add support for secure-host and SMP firewall Secure-host is the general term for the capability of a device to protect itself and the subnet from malicious host software. This is achieved by: 1. Not allowing un-trusted entities to access device configuration registers, directly (through pci_cr or pci_conf) and indirectly (through MADs). 2. Hiding M_Key from untrusted entities. 3. Preventing the modification of GUID0 by un-trusted entities 4. Not allowing drivers on untrusted hosts to receive nor to transmit packets over QP0 (SMP Firewall). The secure-host capability depends on firmware handling all QP0 packets, and not passing these packets up to the driver. Any information required by the driver for proper operation (e.g., SM lid) is passed via events generated by the firmware while processing QP0 MADs. Driver support mainly requires using the MAD_DEMUX FW command at startup, where the feature is enabled/disabled through a procedure described in the Mellanox HCA tools package. Signed-off-by: Jack Morgenstein Signed-off-by: Or Gerlitz [ Fix error path in mlx4_setup_hca to go to err_mcg_table_free. - Roland ] Signed-off-by: Roland Dreier --- drivers/net/ethernet/mellanox/mlx4/cmd.c | 9 +++ drivers/net/ethernet/mellanox/mlx4/fw.c | 91 ++++++++++++++++++++++- drivers/net/ethernet/mellanox/mlx4/main.c | 5 ++ drivers/net/ethernet/mellanox/mlx4/mlx4.h | 1 + include/linux/mlx4/cmd.h | 7 ++ include/linux/mlx4/device.h | 1 + 6 files changed, 113 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c b/drivers/net/ethernet/mellanox/mlx4/cmd.c index 5d940a26055c..65a4a0f88ea0 100644 --- a/drivers/net/ethernet/mellanox/mlx4/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c @@ -1310,6 +1310,15 @@ static struct mlx4_cmd_info cmd_info[] = { .verify = NULL, .wrapper = mlx4_MAD_IFC_wrapper }, + { + .opcode = MLX4_CMD_MAD_DEMUX, + .has_inbox = false, + .has_outbox = false, + .out_is_imm = false, + .encode_slave_id = false, + .verify = NULL, + .wrapper = mlx4_CMD_EPERM_wrapper + }, { .opcode = MLX4_CMD_QUERY_IF_STAT, .has_inbox = false, diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 688e1eabab29..494753e44ae3 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -136,7 +136,8 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [7] = "FSM (MAC anti-spoofing) support", [8] = "Dynamic QP updates support", [9] = "Device managed flow steering IPoIB support", - [10] = "TCP/IP offloads/flow-steering for VXLAN support" + [10] = "TCP/IP offloads/flow-steering for VXLAN support", + [11] = "MAD DEMUX (Secure-Host) support" }; int i; @@ -571,6 +572,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET 0xa0 #define QUERY_DEV_CAP_FW_REASSIGN_MAC 0x9d #define QUERY_DEV_CAP_VXLAN 0x9e +#define QUERY_DEV_CAP_MAD_DEMUX_OFFSET 0xb0 dev_cap->flags2 = 0; mailbox = mlx4_alloc_cmd_mailbox(dev); @@ -748,6 +750,11 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) MLX4_GET(dev_cap->max_counters, outbox, QUERY_DEV_CAP_MAX_COUNTERS_OFFSET); + MLX4_GET(field32, outbox, + QUERY_DEV_CAP_MAD_DEMUX_OFFSET); + if (field32 & (1 << 0)) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_MAD_DEMUX; + MLX4_GET(field32, outbox, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET); if (field32 & (1 << 16)) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_UPDATE_QP; @@ -2016,3 +2023,85 @@ void mlx4_opreq_action(struct work_struct *work) out: mlx4_free_cmd_mailbox(dev, mailbox); } + +static int mlx4_check_smp_firewall_active(struct mlx4_dev *dev, + struct mlx4_cmd_mailbox *mailbox) +{ +#define MLX4_CMD_MAD_DEMUX_SET_ATTR_OFFSET 0x10 +#define MLX4_CMD_MAD_DEMUX_GETRESP_ATTR_OFFSET 0x20 +#define MLX4_CMD_MAD_DEMUX_TRAP_ATTR_OFFSET 0x40 +#define MLX4_CMD_MAD_DEMUX_TRAP_REPRESS_ATTR_OFFSET 0x70 + + u32 set_attr_mask, getresp_attr_mask; + u32 trap_attr_mask, traprepress_attr_mask; + + MLX4_GET(set_attr_mask, mailbox->buf, + MLX4_CMD_MAD_DEMUX_SET_ATTR_OFFSET); + mlx4_dbg(dev, "SMP firewall set_attribute_mask = 0x%x\n", + set_attr_mask); + + MLX4_GET(getresp_attr_mask, mailbox->buf, + MLX4_CMD_MAD_DEMUX_GETRESP_ATTR_OFFSET); + mlx4_dbg(dev, "SMP firewall getresp_attribute_mask = 0x%x\n", + getresp_attr_mask); + + MLX4_GET(trap_attr_mask, mailbox->buf, + MLX4_CMD_MAD_DEMUX_TRAP_ATTR_OFFSET); + mlx4_dbg(dev, "SMP firewall trap_attribute_mask = 0x%x\n", + trap_attr_mask); + + MLX4_GET(traprepress_attr_mask, mailbox->buf, + MLX4_CMD_MAD_DEMUX_TRAP_REPRESS_ATTR_OFFSET); + mlx4_dbg(dev, "SMP firewall traprepress_attribute_mask = 0x%x\n", + traprepress_attr_mask); + + if (set_attr_mask && getresp_attr_mask && trap_attr_mask && + traprepress_attr_mask) + return 1; + + return 0; +} + +int mlx4_config_mad_demux(struct mlx4_dev *dev) +{ + struct mlx4_cmd_mailbox *mailbox; + int secure_host_active; + int err; + + /* Check if mad_demux is supported */ + if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_MAD_DEMUX)) + return 0; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) { + mlx4_warn(dev, "Failed to allocate mailbox for cmd MAD_DEMUX"); + return -ENOMEM; + } + + /* Query mad_demux to find out which MADs are handled by internal sma */ + err = mlx4_cmd_box(dev, 0, mailbox->dma, 0x01 /* subn mgmt class */, + MLX4_CMD_MAD_DEMUX_QUERY_RESTR, MLX4_CMD_MAD_DEMUX, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) { + mlx4_warn(dev, "MLX4_CMD_MAD_DEMUX: query restrictions failed (%d)\n", + err); + goto out; + } + + secure_host_active = mlx4_check_smp_firewall_active(dev, mailbox); + + /* Config mad_demux to handle all MADs returned by the query above */ + err = mlx4_cmd(dev, mailbox->dma, 0x01 /* subn mgmt class */, + MLX4_CMD_MAD_DEMUX_CONFIG, MLX4_CMD_MAD_DEMUX, + MLX4_CMD_TIME_CLASS_B, MLX4_CMD_NATIVE); + if (err) { + mlx4_warn(dev, "MLX4_CMD_MAD_DEMUX: configure failed (%d)\n", err); + goto out; + } + + if (secure_host_active) + mlx4_warn(dev, "HCA operating in secure-host mode. SMP firewall activated.\n"); +out: + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index 82ab427290c3..f2c8e8ba23fe 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -1831,6 +1831,11 @@ static int mlx4_setup_hca(struct mlx4_dev *dev) mlx4_err(dev, "Failed to initialize multicast group table, aborting\n"); goto err_mr_table_free; } + err = mlx4_config_mad_demux(dev); + if (err) { + mlx4_err(dev, "Failed in config_mad_demux, aborting\n"); + goto err_mcg_table_free; + } } err = mlx4_init_eq_table(dev); diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index 1d8af7336807..310b30b4682c 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -1311,5 +1311,6 @@ void mlx4_init_quotas(struct mlx4_dev *dev); int mlx4_get_slave_num_gids(struct mlx4_dev *dev, int slave, int port); /* Returns the VF index of slave */ int mlx4_get_vf_indx(struct mlx4_dev *dev, int slave); +int mlx4_config_mad_demux(struct mlx4_dev *dev); #endif /* MLX4_H */ diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index c8450366c130..379c02648ab3 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -116,6 +116,7 @@ enum { /* special QP and management commands */ MLX4_CMD_CONF_SPECIAL_QP = 0x23, MLX4_CMD_MAD_IFC = 0x24, + MLX4_CMD_MAD_DEMUX = 0x203, /* multicast commands */ MLX4_CMD_READ_MCG = 0x25, @@ -185,6 +186,12 @@ enum { MLX4_SET_PORT_VXLAN = 0xB }; +enum { + MLX4_CMD_MAD_DEMUX_CONFIG = 0, + MLX4_CMD_MAD_DEMUX_QUERY_STATE = 1, + MLX4_CMD_MAD_DEMUX_QUERY_RESTR = 2, /* Query mad demux restrictions */ +}; + enum { MLX4_CMD_WRAPPED, MLX4_CMD_NATIVE diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 35b51e7af886..cee9561e8ef6 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -172,6 +172,7 @@ enum { MLX4_DEV_CAP_FLAG2_UPDATE_QP = 1LL << 8, MLX4_DEV_CAP_FLAG2_DMFS_IPOIB = 1LL << 9, MLX4_DEV_CAP_FLAG2_VXLAN_OFFLOADS = 1LL << 10, + MLX4_DEV_CAP_FLAG2_MAD_DEMUX = 1LL << 11, }; enum { From e316453301f02bfcaabcb86e628f3dbef2e96c7e Mon Sep 17 00:00:00 2001 From: Or Gerlitz Date: Tue, 8 Jul 2014 12:45:10 +0300 Subject: [PATCH 34/52] IB/ipath: Add P_Key change event support Deliver P_Key_CHANGE event through the relevant IB device when the local pkey table changes. Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_mad.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c index 43f2d0424d4f..e890e5ba0e01 100644 --- a/drivers/infiniband/hw/ipath/ipath_mad.c +++ b/drivers/infiniband/hw/ipath/ipath_mad.c @@ -726,7 +726,7 @@ bail: * @dd: the infinipath device * @pkeys: the PKEY table */ -static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys) +static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys, u8 port) { struct ipath_portdata *pd; int i; @@ -759,6 +759,7 @@ static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys) } if (changed) { u64 pkey; + struct ib_event event; pkey = (u64) dd->ipath_pkeys[0] | ((u64) dd->ipath_pkeys[1] << 16) | @@ -768,12 +769,17 @@ static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys) (unsigned long long) pkey); ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey, pkey); + + event.event = IB_EVENT_PKEY_CHANGE; + event.device = &dd->verbs_dev->ibdev; + event.element.port_num = port; + ib_dispatch_event(&event); } return 0; } static int recv_subn_set_pkeytable(struct ib_smp *smp, - struct ib_device *ibdev) + struct ib_device *ibdev, u8 port) { u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff); __be16 *p = (__be16 *) smp->data; @@ -784,7 +790,7 @@ static int recv_subn_set_pkeytable(struct ib_smp *smp, for (i = 0; i < n; i++) q[i] = be16_to_cpu(p[i]); - if (startpx != 0 || set_pkeys(dev->dd, q) != 0) + if (startpx != 0 || set_pkeys(dev->dd, q, port) != 0) smp->status |= IB_SMP_INVALID_FIELD; return recv_subn_get_pkeytable(smp, ibdev); @@ -1342,7 +1348,7 @@ static int process_subn(struct ib_device *ibdev, int mad_flags, ret = recv_subn_set_portinfo(smp, ibdev, port_num); goto bail; case IB_SMP_ATTR_PKEY_TABLE: - ret = recv_subn_set_pkeytable(smp, ibdev); + ret = recv_subn_set_pkeytable(smp, ibdev, port_num); goto bail; case IB_SMP_ATTR_SM_INFO: if (dev->port_cap_flags & IB_PORT_SM_DISABLED) { From db84f88037592f069f5c1fc7f6b00ae6f5eb2e84 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Tue, 8 Jul 2014 12:45:11 +0300 Subject: [PATCH 35/52] IB/ipoib: Use P_Key change event instead of P_Key polling mechanism The current code use a dedicated polling logic to determine when the P_Key assigned to the ipoib device is present in HCA port table and act accordingly. Move to use the code which acts upon getting PKEY_CHANGE event to handle this task and remove the P_Key polling logic/thread as they add extra complexity which isn't needed. Signed-off-by: Erez Shitrit Signed-off-by: Or Gerlitz Acked-by: Alex Estrin Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib.h | 6 +- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 73 +++++------------------ drivers/infiniband/ulp/ipoib/ipoib_main.c | 6 +- 3 files changed, 20 insertions(+), 65 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index c639f90cfda4..683d23ac7cbf 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -86,7 +86,6 @@ enum { IPOIB_FLAG_INITIALIZED = 1, IPOIB_FLAG_ADMIN_UP = 2, IPOIB_PKEY_ASSIGNED = 3, - IPOIB_PKEY_STOP = 4, IPOIB_FLAG_SUBINTERFACE = 5, IPOIB_MCAST_RUN = 6, IPOIB_STOP_REAPER = 7, @@ -312,7 +311,6 @@ struct ipoib_dev_priv { struct list_head multicast_list; struct rb_root multicast_tree; - struct delayed_work pkey_poll_task; struct delayed_work mcast_task; struct work_struct carrier_on_task; struct work_struct flush_light; @@ -477,6 +475,7 @@ int ipoib_ib_dev_open(struct net_device *dev); int ipoib_ib_dev_up(struct net_device *dev); int ipoib_ib_dev_down(struct net_device *dev, int flush); int ipoib_ib_dev_stop(struct net_device *dev, int flush); +void ipoib_pkey_dev_check_presence(struct net_device *dev); int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port); void ipoib_dev_cleanup(struct net_device *dev); @@ -532,8 +531,7 @@ int ipoib_set_mode(struct net_device *dev, const char *buf); void ipoib_setup(struct net_device *dev); -void ipoib_pkey_poll(struct work_struct *work); -int ipoib_pkey_dev_delay_open(struct net_device *dev); +void ipoib_pkey_open(struct ipoib_dev_priv *priv); void ipoib_drain_cq(struct net_device *dev); void ipoib_set_ethtool_ops(struct net_device *dev); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 6a7003ddb0be..be8f971edbec 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -709,7 +709,7 @@ dev_stop: return -1; } -static void ipoib_pkey_dev_check_presence(struct net_device *dev) +void ipoib_pkey_dev_check_presence(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); u16 pkey_index = 0; @@ -745,14 +745,6 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush) clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags); netif_carrier_off(dev); - /* Shutdown the P_Key thread if still active */ - if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { - mutex_lock(&pkey_mutex); - set_bit(IPOIB_PKEY_STOP, &priv->flags); - cancel_delayed_work_sync(&priv->pkey_poll_task); - mutex_unlock(&pkey_mutex); - } - ipoib_mcast_stop_thread(dev, flush); ipoib_mcast_dev_flush(dev); @@ -988,9 +980,12 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) { /* for non-child devices must check/update the pkey value here */ - if (level == IPOIB_FLUSH_HEAVY && - !test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) - update_parent_pkey(priv); + if (level == IPOIB_FLUSH_HEAVY) { + if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) + ipoib_pkey_open(priv); + else + update_parent_pkey(priv); + } ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n"); return; } @@ -1009,8 +1004,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); ipoib_ib_dev_down(dev, 0); ipoib_ib_dev_stop(dev, 0); - if (ipoib_pkey_dev_delay_open(dev)) - return; + return; } /* restart QP only if P_Key index is changed */ if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && @@ -1094,54 +1088,15 @@ void ipoib_ib_dev_cleanup(struct net_device *dev) ipoib_transport_dev_cleanup(dev); } -/* - * Delayed P_Key Assigment Interim Support - * - * The following is initial implementation of delayed P_Key assigment - * mechanism. It is using the same approach implemented for the multicast - * group join. The single goal of this implementation is to quickly address - * Bug #2507. This implementation will probably be removed when the P_Key - * change async notification is available. - */ - -void ipoib_pkey_poll(struct work_struct *work) +void ipoib_pkey_open(struct ipoib_dev_priv *priv) { - struct ipoib_dev_priv *priv = - container_of(work, struct ipoib_dev_priv, pkey_poll_task.work); - struct net_device *dev = priv->dev; - ipoib_pkey_dev_check_presence(dev); + if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + return; + + ipoib_pkey_dev_check_presence(priv->dev); if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) - ipoib_open(dev); - else { - mutex_lock(&pkey_mutex); - if (!test_bit(IPOIB_PKEY_STOP, &priv->flags)) - queue_delayed_work(ipoib_workqueue, - &priv->pkey_poll_task, - HZ); - mutex_unlock(&pkey_mutex); - } + ipoib_open(priv->dev); } -int ipoib_pkey_dev_delay_open(struct net_device *dev) -{ - struct ipoib_dev_priv *priv = netdev_priv(dev); - - /* Look for the interface pkey value in the IB Port P_Key table and */ - /* set the interface pkey assigment flag */ - ipoib_pkey_dev_check_presence(dev); - - /* P_Key value not assigned yet - start polling */ - if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { - mutex_lock(&pkey_mutex); - clear_bit(IPOIB_PKEY_STOP, &priv->flags); - queue_delayed_work(ipoib_workqueue, - &priv->pkey_poll_task, - HZ); - mutex_unlock(&pkey_mutex); - return 1; - } - - return 0; -} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 5786a78ff8bc..35acbd41a5f7 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -108,7 +108,10 @@ int ipoib_open(struct net_device *dev) set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); - if (ipoib_pkey_dev_delay_open(dev)) + + ipoib_pkey_dev_check_presence(dev); + + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) return 0; if (ipoib_ib_dev_open(dev)) @@ -1379,7 +1382,6 @@ void ipoib_setup(struct net_device *dev) INIT_LIST_HEAD(&priv->dead_ahs); INIT_LIST_HEAD(&priv->multicast_list); - INIT_DELAYED_WORK(&priv->pkey_poll_task, ipoib_pkey_poll); INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task); INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task); INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light); From 4eae374845affc3ebf8ccfc38d554a86adc95003 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Tue, 8 Jul 2014 12:45:12 +0300 Subject: [PATCH 36/52] IB/ipoib: Avoid flushing the workqueue from worker context The error flow of ipoib_ib_dev_open() invokes ipoib_ib_dev_stop() with workqueue flushing enabled, which deadlocks if the open procedure itself was called by a worker thread. Fix this by adding a flush enabled flag to ipoib_ib_dev_open() and set it accordingly from the locations where such a call is made. The call trace was the following: [] ? flush_workqueue+0x54/0x80 [] ? ipoib_ib_dev_stop+0x447/0x650 [ib_ipoib] [] ? ipoib_ib_dev_open+0x284/0x430 [ib_ipoib] [] ? ipoib_open+0x78/0x1d0 [ib_ipoib] [] ? ipoib_pkey_open+0x38/0x40 [ib_ipoib] [] ? __ipoib_ib_dev_flush+0x15c/0x2c0 [ib_ipoib] [] ? __ipoib_ib_dev_flush+0x76/0x2c0 [ib_ipoib] [] ? ipoib_ib_dev_flush_heavy+0x0/0x20 [ib_ipoib] [] ? ipoib_ib_dev_flush_heavy+0x1a/0x20 [ib_ipoib] [] ? worker_thread+0x170/0x2a0 [] ? autoremove_wake_function+0x0/0x40 Signed-off-by: Erez Shitrit Signed-off-by: Or Gerlitz Acked-by: Alex Estrin Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib.h | 2 +- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 8 ++++---- drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 683d23ac7cbf..3edce617c31b 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -471,7 +471,7 @@ void ipoib_ib_dev_flush_heavy(struct work_struct *work); void ipoib_pkey_event(struct work_struct *work); void ipoib_ib_dev_cleanup(struct net_device *dev); -int ipoib_ib_dev_open(struct net_device *dev); +int ipoib_ib_dev_open(struct net_device *dev, int flush); int ipoib_ib_dev_up(struct net_device *dev); int ipoib_ib_dev_down(struct net_device *dev, int flush); int ipoib_ib_dev_stop(struct net_device *dev, int flush); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index be8f971edbec..9dcb2c9e758a 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -664,7 +664,7 @@ static void ipoib_ib_tx_timer_func(unsigned long ctx) drain_tx_cq((struct net_device *)ctx); } -int ipoib_ib_dev_open(struct net_device *dev) +int ipoib_ib_dev_open(struct net_device *dev, int flush) { struct ipoib_dev_priv *priv = netdev_priv(dev); int ret; @@ -705,7 +705,7 @@ int ipoib_ib_dev_open(struct net_device *dev) dev_stop: if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) napi_enable(&priv->napi); - ipoib_ib_dev_stop(dev, 1); + ipoib_ib_dev_stop(dev, flush); return -1; } @@ -916,7 +916,7 @@ int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port) (unsigned long) dev); if (dev->flags & IFF_UP) { - if (ipoib_ib_dev_open(dev)) { + if (ipoib_ib_dev_open(dev, 1)) { ipoib_transport_dev_cleanup(dev); return -ENODEV; } @@ -1033,7 +1033,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, if (level == IPOIB_FLUSH_HEAVY) { ipoib_ib_dev_stop(dev, 0); - ipoib_ib_dev_open(dev); + ipoib_ib_dev_open(dev, 0); } /* diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 35acbd41a5f7..1bf994a96860 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -114,7 +114,7 @@ int ipoib_open(struct net_device *dev) if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) return 0; - if (ipoib_ib_dev_open(dev)) + if (ipoib_ib_dev_open(dev, 1)) goto err_disable; if (ipoib_ib_dev_up(dev)) From f426a40eb695d315466f130618db30cafb27db90 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Fri, 8 Aug 2014 19:00:52 -0400 Subject: [PATCH 37/52] IB/umad: Update module to [pr|dev]_* style print messages Use dev_* style print when struct device is available. Also combine previously line broken user-visible strings as per Documentation/CodingStyle: "However, never break user-visible strings such as printk messages, because that breaks the ability to grep for them." Signed-off-by: Ira Weiny Signed-off-by: Roland Dreier --- drivers/infiniband/core/user_mad.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 1acb99100556..6be596dc360a 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -33,6 +33,8 @@ * SOFTWARE. */ +#define pr_fmt(fmt) "user_mad: " fmt + #include #include #include @@ -673,10 +675,11 @@ found: if (!file->already_used) { file->already_used = 1; if (!file->use_pkey_index) { - printk(KERN_WARNING "user_mad: process %s did not enable " - "P_Key index support.\n", current->comm); - printk(KERN_WARNING "user_mad: Documentation/infiniband/user_mad.txt " - "has info on the new ABI.\n"); + dev_warn(file->port->dev, + "process %s did not enable P_Key index support.\n", + current->comm); + dev_warn(file->port->dev, + " Documentation/infiniband/user_mad.txt has info on the new ABI.\n"); } } @@ -983,7 +986,7 @@ static CLASS_ATTR_STRING(abi_version, S_IRUGO, static dev_t overflow_maj; static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS); -static int find_overflow_devnum(void) +static int find_overflow_devnum(struct ib_device *device) { int ret; @@ -991,7 +994,8 @@ static int find_overflow_devnum(void) ret = alloc_chrdev_region(&overflow_maj, 0, IB_UMAD_MAX_PORTS * 2, "infiniband_mad"); if (ret) { - printk(KERN_ERR "user_mad: couldn't register dynamic device number\n"); + dev_err(&device->dev, + "couldn't register dynamic device number\n"); return ret; } } @@ -1014,7 +1018,7 @@ static int ib_umad_init_port(struct ib_device *device, int port_num, devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS); if (devnum >= IB_UMAD_MAX_PORTS) { spin_unlock(&port_lock); - devnum = find_overflow_devnum(); + devnum = find_overflow_devnum(device); if (devnum < 0) return -1; @@ -1200,14 +1204,14 @@ static int __init ib_umad_init(void) ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2, "infiniband_mad"); if (ret) { - printk(KERN_ERR "user_mad: couldn't register device number\n"); + pr_err("couldn't register device number\n"); goto out; } umad_class = class_create(THIS_MODULE, "infiniband_mad"); if (IS_ERR(umad_class)) { ret = PTR_ERR(umad_class); - printk(KERN_ERR "user_mad: couldn't create class infiniband_mad\n"); + pr_err("couldn't create class infiniband_mad\n"); goto out_chrdev; } @@ -1215,13 +1219,13 @@ static int __init ib_umad_init(void) ret = class_create_file(umad_class, &class_attr_abi_version.attr); if (ret) { - printk(KERN_ERR "user_mad: couldn't create abi_version attribute\n"); + pr_err("couldn't create abi_version attribute\n"); goto out_class; } ret = ib_register_client(&umad_client); if (ret) { - printk(KERN_ERR "user_mad: couldn't register ib_umad client\n"); + pr_err("couldn't register ib_umad client\n"); goto out_class; } From dd57c9308afff61e4c157d0a7260695fe2f9a98c Mon Sep 17 00:00:00 2001 From: Alex Estrin Date: Wed, 6 Aug 2014 14:40:32 -0400 Subject: [PATCH 38/52] IB/ipoib: Avoid multicast join attempts with invalid P_key Currently, the parent interface keeps sending broadcast group join requests even if p_key index 0 is invalid, which is possible/common in virtualized environments where a VF has been probed to VM but the actual P_key configuration has not yet been assigned by the management software. This creates unnecessary noise on the fabric and in the kernel logs: ib0: multicast join failed for ff12:401b:8000:0000:0000:0000:ffff:ffff, status -22 The original code run the multicast task regardless of the actual P_key value, which can be avoided. The fix is to re-init resources and bring interface up only if P_key index 0 is valid either when starting up or on PKEY_CHANGE event. Fixes: c290414169 ("IPoIB: Fix pkey change flow for virtualization environments") Reviewed-by: Ira Weiny Signed-off-by: Alex Estrin Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 80 ++++++++++++----------- drivers/infiniband/ulp/ipoib/ipoib_main.c | 11 ++-- 2 files changed, 46 insertions(+), 45 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 9dcb2c9e758a..72626c348174 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -669,12 +669,13 @@ int ipoib_ib_dev_open(struct net_device *dev, int flush) struct ipoib_dev_priv *priv = netdev_priv(dev); int ret; - if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &priv->pkey_index)) { - ipoib_warn(priv, "P_Key 0x%04x not found\n", priv->pkey); - clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); + ipoib_pkey_dev_check_presence(dev); + + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { + ipoib_warn(priv, "P_Key 0x%04x is %s\n", priv->pkey, + (!(priv->pkey & 0x7fff) ? "Invalid" : "not found")); return -1; } - set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); ret = ipoib_init_qp(dev); if (ret) { @@ -712,9 +713,10 @@ dev_stop: void ipoib_pkey_dev_check_presence(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); - u16 pkey_index = 0; - if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) + if (!(priv->pkey & 0x7fff) || + ib_find_pkey(priv->ca, priv->port, priv->pkey, + &priv->pkey_index)) clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); else set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); @@ -958,13 +960,27 @@ static inline int update_parent_pkey(struct ipoib_dev_priv *priv) return 1; } +/* + * returns 0 if pkey value was found in a different slot. + */ +static inline int update_child_pkey(struct ipoib_dev_priv *priv) +{ + u16 old_index = priv->pkey_index; + + priv->pkey_index = 0; + ipoib_pkey_dev_check_presence(priv->dev); + + if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && + (old_index == priv->pkey_index)) + return 1; + return 0; +} static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, enum ipoib_flush_level level) { struct ipoib_dev_priv *cpriv; struct net_device *dev = priv->dev; - u16 new_index; int result; down_read(&priv->vlan_rwsem); @@ -978,19 +994,20 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, up_read(&priv->vlan_rwsem); - if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) { - /* for non-child devices must check/update the pkey value here */ - if (level == IPOIB_FLUSH_HEAVY) { - if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) - ipoib_pkey_open(priv); - else - update_parent_pkey(priv); - } + if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) && + level != IPOIB_FLUSH_HEAVY) { ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n"); return; } if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) { + /* interface is down. update pkey and leave. */ + if (level == IPOIB_FLUSH_HEAVY) { + if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) + update_parent_pkey(priv); + else + update_child_pkey(priv); + } ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n"); return; } @@ -1000,19 +1017,13 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, * (parent) devices should always takes what present in pkey index 0 */ if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) { - if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &new_index)) { - clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags); - ipoib_ib_dev_down(dev, 0); - ipoib_ib_dev_stop(dev, 0); - return; - } - /* restart QP only if P_Key index is changed */ - if (test_and_set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) && - new_index == priv->pkey_index) { + result = update_child_pkey(priv); + if (result) { + /* restart QP only if P_Key index is changed */ ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n"); return; } - priv->pkey_index = new_index; + } else { result = update_parent_pkey(priv); /* restart QP only if P_Key value changed */ @@ -1032,8 +1043,12 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, ipoib_ib_dev_down(dev, 0); if (level == IPOIB_FLUSH_HEAVY) { - ipoib_ib_dev_stop(dev, 0); - ipoib_ib_dev_open(dev, 0); + if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + ipoib_ib_dev_stop(dev, 0); + if (ipoib_ib_dev_open(dev, 0) != 0) + return; + if (netif_queue_stopped(dev)) + netif_start_queue(dev); } /* @@ -1088,15 +1103,4 @@ void ipoib_ib_dev_cleanup(struct net_device *dev) ipoib_transport_dev_cleanup(dev); } -void ipoib_pkey_open(struct ipoib_dev_priv *priv) -{ - - if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) - return; - - ipoib_pkey_dev_check_presence(priv->dev); - - if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) - ipoib_open(priv->dev); -} diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 1bf994a96860..217cb77157d8 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -108,14 +108,11 @@ int ipoib_open(struct net_device *dev) set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags); - - ipoib_pkey_dev_check_presence(dev); - - if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) - return 0; - - if (ipoib_ib_dev_open(dev, 1)) + if (ipoib_ib_dev_open(dev, 1)) { + if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) + return 0; goto err_disable; + } if (ipoib_ib_dev_up(dev)) goto err_stop; From 7ef5d4b0463c095a994890131918d3301d8404ee Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Fri, 8 Aug 2014 19:00:53 -0400 Subject: [PATCH 39/52] IB/mad: Update module to [pr|dev]_* style print messages Use dev_* style print when struct device is available. Also combine previously line broken user-visible strings as per Documentation/CodingStyle: "However, never break user-visible strings such as printk messages, because that breaks the ability to grep for them." Signed-off-by: Ira Weiny [ Remove PFX so the patch actually builds. - Roland ] Signed-off-by: Roland Dreier --- drivers/infiniband/core/agent.c | 12 +-- drivers/infiniband/core/mad.c | 147 ++++++++++++++++------------- drivers/infiniband/core/mad_priv.h | 3 - 3 files changed, 85 insertions(+), 77 deletions(-) diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index 2bc7f5af64f4..8e32c5abd09d 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -94,14 +94,14 @@ void agent_send_response(struct ib_mad *mad, struct ib_grh *grh, port_priv = ib_get_agent_port(device, port_num); if (!port_priv) { - printk(KERN_ERR SPFX "Unable to find port agent\n"); + dev_err(&device->dev, "Unable to find port agent\n"); return; } agent = port_priv->agent[qpn]; ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num); if (IS_ERR(ah)) { - printk(KERN_ERR SPFX "ib_create_ah_from_wc error %ld\n", + dev_err(&device->dev, "ib_create_ah_from_wc error %ld\n", PTR_ERR(ah)); return; } @@ -110,7 +110,7 @@ void agent_send_response(struct ib_mad *mad, struct ib_grh *grh, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA, GFP_KERNEL); if (IS_ERR(send_buf)) { - printk(KERN_ERR SPFX "ib_create_send_mad error\n"); + dev_err(&device->dev, "ib_create_send_mad error\n"); goto err1; } @@ -125,7 +125,7 @@ void agent_send_response(struct ib_mad *mad, struct ib_grh *grh, } if (ib_post_send_mad(send_buf, NULL)) { - printk(KERN_ERR SPFX "ib_post_send_mad error\n"); + dev_err(&device->dev, "ib_post_send_mad error\n"); goto err2; } return; @@ -151,7 +151,7 @@ int ib_agent_port_open(struct ib_device *device, int port_num) /* Create new device info */ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); if (!port_priv) { - printk(KERN_ERR SPFX "No memory for ib_agent_port_private\n"); + dev_err(&device->dev, "No memory for ib_agent_port_private\n"); ret = -ENOMEM; goto error1; } @@ -202,7 +202,7 @@ int ib_agent_port_close(struct ib_device *device, int port_num) port_priv = __ib_get_agent_port(device, port_num); if (port_priv == NULL) { spin_unlock_irqrestore(&ib_agent_port_list_lock, flags); - printk(KERN_ERR SPFX "Port %d not found\n", port_num); + dev_err(&device->dev, "Port %d not found\n", port_num); return -ENODEV; } list_del(&port_priv->port_list); diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index ab31f136d04b..08f430319fc8 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -33,6 +33,9 @@ * SOFTWARE. * */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -706,7 +709,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, smi_handle_dr_smp_send(smp, device->node_type, port_num) == IB_SMI_DISCARD) { ret = -EINVAL; - printk(KERN_ERR PFX "Invalid directed route\n"); + dev_err(&device->dev, "Invalid directed route\n"); goto out; } @@ -718,7 +721,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, local = kmalloc(sizeof *local, GFP_ATOMIC); if (!local) { ret = -ENOMEM; - printk(KERN_ERR PFX "No memory for ib_mad_local_private\n"); + dev_err(&device->dev, "No memory for ib_mad_local_private\n"); goto out; } local->mad_priv = NULL; @@ -726,7 +729,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_ATOMIC); if (!mad_priv) { ret = -ENOMEM; - printk(KERN_ERR PFX "No memory for local response MAD\n"); + dev_err(&device->dev, "No memory for local response MAD\n"); kfree(local); goto out; } @@ -837,9 +840,9 @@ static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr, for (left = send_buf->data_len + pad; left > 0; left -= seg_size) { seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask); if (!seg) { - printk(KERN_ERR "alloc_send_rmpp_segs: RMPP mem " - "alloc failed for len %zd, gfp %#x\n", - sizeof (*seg) + seg_size, gfp_mask); + dev_err(&send_buf->mad_agent->device->dev, + "alloc_send_rmpp_segs: RMPP mem alloc failed for len %zd, gfp %#x\n", + sizeof (*seg) + seg_size, gfp_mask); free_send_rmpp_list(send_wr); return -ENOMEM; } @@ -1199,7 +1202,8 @@ EXPORT_SYMBOL(ib_redirect_mad_qp); int ib_process_mad_wc(struct ib_mad_agent *mad_agent, struct ib_wc *wc) { - printk(KERN_ERR PFX "ib_process_mad_wc() not implemented yet\n"); + dev_err(&mad_agent->device->dev, + "ib_process_mad_wc() not implemented yet\n"); return 0; } EXPORT_SYMBOL(ib_process_mad_wc); @@ -1211,7 +1215,7 @@ static int method_in_use(struct ib_mad_mgmt_method_table **method, for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) { if ((*method)->agent[i]) { - printk(KERN_ERR PFX "Method %d already in use\n", i); + pr_err("Method %d already in use\n", i); return -EINVAL; } } @@ -1223,8 +1227,7 @@ static int allocate_method_table(struct ib_mad_mgmt_method_table **method) /* Allocate management method table */ *method = kzalloc(sizeof **method, GFP_ATOMIC); if (!*method) { - printk(KERN_ERR PFX "No memory for " - "ib_mad_mgmt_method_table\n"); + pr_err("No memory for ib_mad_mgmt_method_table\n"); return -ENOMEM; } @@ -1319,8 +1322,8 @@ static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, /* Allocate management class table for "new" class version */ *class = kzalloc(sizeof **class, GFP_ATOMIC); if (!*class) { - printk(KERN_ERR PFX "No memory for " - "ib_mad_mgmt_class_table\n"); + dev_err(&agent_priv->agent.device->dev, + "No memory for ib_mad_mgmt_class_table\n"); ret = -ENOMEM; goto error1; } @@ -1386,8 +1389,8 @@ static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, /* Allocate mgmt vendor class table for "new" class version */ vendor = kzalloc(sizeof *vendor, GFP_ATOMIC); if (!vendor) { - printk(KERN_ERR PFX "No memory for " - "ib_mad_mgmt_vendor_class_table\n"); + dev_err(&agent_priv->agent.device->dev, + "No memory for ib_mad_mgmt_vendor_class_table\n"); goto error1; } @@ -1397,8 +1400,8 @@ static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, /* Allocate table for this management vendor class */ vendor_class = kzalloc(sizeof *vendor_class, GFP_ATOMIC); if (!vendor_class) { - printk(KERN_ERR PFX "No memory for " - "ib_mad_mgmt_vendor_class\n"); + dev_err(&agent_priv->agent.device->dev, + "No memory for ib_mad_mgmt_vendor_class\n"); goto error2; } @@ -1429,7 +1432,7 @@ static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, goto check_in_use; } } - printk(KERN_ERR PFX "All OUI slots in use\n"); + dev_err(&agent_priv->agent.device->dev, "All OUI slots in use\n"); goto error3; check_in_use: @@ -1640,9 +1643,9 @@ find_mad_agent(struct ib_mad_port_private *port_priv, if (mad_agent->agent.recv_handler) atomic_inc(&mad_agent->refcount); else { - printk(KERN_NOTICE PFX "No receive handler for client " - "%p on port %d\n", - &mad_agent->agent, port_priv->port_num); + dev_notice(&port_priv->device->dev, + "No receive handler for client %p on port %d\n", + &mad_agent->agent, port_priv->port_num); mad_agent = NULL; } } @@ -1658,8 +1661,8 @@ static int validate_mad(struct ib_mad *mad, u32 qp_num) /* Make sure MAD base version is understood */ if (mad->mad_hdr.base_version != IB_MGMT_BASE_VERSION) { - printk(KERN_ERR PFX "MAD received with unsupported base " - "version %d\n", mad->mad_hdr.base_version); + pr_err("MAD received with unsupported base version %d\n", + mad->mad_hdr.base_version); goto out; } @@ -1911,8 +1914,8 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, response = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL); if (!response) { - printk(KERN_ERR PFX "ib_mad_recv_done_handler no memory " - "for response buffer\n"); + dev_err(&port_priv->device->dev, + "ib_mad_recv_done_handler no memory for response buffer\n"); goto out; } @@ -2176,7 +2179,8 @@ retry: ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr, &bad_send_wr); if (ret) { - printk(KERN_ERR PFX "ib_post_send failed: %d\n", ret); + dev_err(&port_priv->device->dev, + "ib_post_send failed: %d\n", ret); mad_send_wr = queued_send_wr; wc->status = IB_WC_LOC_QP_OP_ERR; goto retry; @@ -2248,8 +2252,9 @@ static void mad_error_handler(struct ib_mad_port_private *port_priv, IB_QP_STATE | IB_QP_CUR_STATE); kfree(attr); if (ret) - printk(KERN_ERR PFX "mad_error_handler - " - "ib_modify_qp to RTS : %d\n", ret); + dev_err(&port_priv->device->dev, + "mad_error_handler - ib_modify_qp to RTS : %d\n", + ret); else mark_sends_for_retry(qp_info); } @@ -2408,7 +2413,8 @@ static void local_completions(struct work_struct *work) if (local->mad_priv) { recv_mad_agent = local->recv_mad_agent; if (!recv_mad_agent) { - printk(KERN_ERR PFX "No receive MAD agent for local completion\n"); + dev_err(&mad_agent_priv->agent.device->dev, + "No receive MAD agent for local completion\n"); free_mad = 1; goto local_send_completion; } @@ -2589,7 +2595,8 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, } else { mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL); if (!mad_priv) { - printk(KERN_ERR PFX "No memory for receive buffer\n"); + dev_err(&qp_info->port_priv->device->dev, + "No memory for receive buffer\n"); ret = -ENOMEM; break; } @@ -2625,7 +2632,8 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, sizeof mad_priv->header, DMA_FROM_DEVICE); kmem_cache_free(ib_mad_cache, mad_priv); - printk(KERN_ERR PFX "ib_post_recv failed: %d\n", ret); + dev_err(&qp_info->port_priv->device->dev, + "ib_post_recv failed: %d\n", ret); break; } } while (post); @@ -2681,7 +2689,8 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) attr = kmalloc(sizeof *attr, GFP_KERNEL); if (!attr) { - printk(KERN_ERR PFX "Couldn't kmalloc ib_qp_attr\n"); + dev_err(&port_priv->device->dev, + "Couldn't kmalloc ib_qp_attr\n"); return -ENOMEM; } @@ -2705,16 +2714,18 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY); if (ret) { - printk(KERN_ERR PFX "Couldn't change QP%d state to " - "INIT: %d\n", i, ret); + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to INIT: %d\n", + i, ret); goto out; } attr->qp_state = IB_QPS_RTR; ret = ib_modify_qp(qp, attr, IB_QP_STATE); if (ret) { - printk(KERN_ERR PFX "Couldn't change QP%d state to " - "RTR: %d\n", i, ret); + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to RTR: %d\n", + i, ret); goto out; } @@ -2722,16 +2733,18 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) attr->sq_psn = IB_MAD_SEND_Q_PSN; ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN); if (ret) { - printk(KERN_ERR PFX "Couldn't change QP%d state to " - "RTS: %d\n", i, ret); + dev_err(&port_priv->device->dev, + "Couldn't change QP%d state to RTS: %d\n", + i, ret); goto out; } } ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); if (ret) { - printk(KERN_ERR PFX "Failed to request completion " - "notification: %d\n", ret); + dev_err(&port_priv->device->dev, + "Failed to request completion notification: %d\n", + ret); goto out; } @@ -2741,7 +2754,8 @@ static int ib_mad_port_start(struct ib_mad_port_private *port_priv) ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL); if (ret) { - printk(KERN_ERR PFX "Couldn't post receive WRs\n"); + dev_err(&port_priv->device->dev, + "Couldn't post receive WRs\n"); goto out; } } @@ -2755,7 +2769,8 @@ static void qp_event_handler(struct ib_event *event, void *qp_context) struct ib_mad_qp_info *qp_info = qp_context; /* It's worse than that! He's dead, Jim! */ - printk(KERN_ERR PFX "Fatal error (%d) on MAD QP (%d)\n", + dev_err(&qp_info->port_priv->device->dev, + "Fatal error (%d) on MAD QP (%d)\n", event->event, qp_info->qp->qp_num); } @@ -2801,8 +2816,9 @@ static int create_mad_qp(struct ib_mad_qp_info *qp_info, qp_init_attr.event_handler = qp_event_handler; qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr); if (IS_ERR(qp_info->qp)) { - printk(KERN_ERR PFX "Couldn't create ib_mad QP%d\n", - get_spl_qp_index(qp_type)); + dev_err(&qp_info->port_priv->device->dev, + "Couldn't create ib_mad QP%d\n", + get_spl_qp_index(qp_type)); ret = PTR_ERR(qp_info->qp); goto error; } @@ -2840,7 +2856,7 @@ static int ib_mad_port_open(struct ib_device *device, /* Create new device info */ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL); if (!port_priv) { - printk(KERN_ERR PFX "No memory for ib_mad_port_private\n"); + dev_err(&device->dev, "No memory for ib_mad_port_private\n"); return -ENOMEM; } @@ -2860,21 +2876,21 @@ static int ib_mad_port_open(struct ib_device *device, ib_mad_thread_completion_handler, NULL, port_priv, cq_size, 0); if (IS_ERR(port_priv->cq)) { - printk(KERN_ERR PFX "Couldn't create ib_mad CQ\n"); + dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); ret = PTR_ERR(port_priv->cq); goto error3; } port_priv->pd = ib_alloc_pd(device); if (IS_ERR(port_priv->pd)) { - printk(KERN_ERR PFX "Couldn't create ib_mad PD\n"); + dev_err(&device->dev, "Couldn't create ib_mad PD\n"); ret = PTR_ERR(port_priv->pd); goto error4; } port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE); if (IS_ERR(port_priv->mr)) { - printk(KERN_ERR PFX "Couldn't get ib_mad DMA MR\n"); + dev_err(&device->dev, "Couldn't get ib_mad DMA MR\n"); ret = PTR_ERR(port_priv->mr); goto error5; } @@ -2902,7 +2918,7 @@ static int ib_mad_port_open(struct ib_device *device, ret = ib_mad_port_start(port_priv); if (ret) { - printk(KERN_ERR PFX "Couldn't start port\n"); + dev_err(&device->dev, "Couldn't start port\n"); goto error9; } @@ -2946,7 +2962,7 @@ static int ib_mad_port_close(struct ib_device *device, int port_num) port_priv = __ib_get_mad_port(device, port_num); if (port_priv == NULL) { spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); - printk(KERN_ERR PFX "Port %d not found\n", port_num); + dev_err(&device->dev, "Port %d not found\n", port_num); return -ENODEV; } list_del_init(&port_priv->port_list); @@ -2984,14 +3000,12 @@ static void ib_mad_init_device(struct ib_device *device) for (i = start; i <= end; i++) { if (ib_mad_port_open(device, i)) { - printk(KERN_ERR PFX "Couldn't open %s port %d\n", - device->name, i); + dev_err(&device->dev, "Couldn't open port %d\n", i); goto error; } if (ib_agent_port_open(device, i)) { - printk(KERN_ERR PFX "Couldn't open %s port %d " - "for agents\n", - device->name, i); + dev_err(&device->dev, + "Couldn't open port %d for agents\n", i); goto error_agent; } } @@ -2999,20 +3013,17 @@ static void ib_mad_init_device(struct ib_device *device) error_agent: if (ib_mad_port_close(device, i)) - printk(KERN_ERR PFX "Couldn't close %s port %d\n", - device->name, i); + dev_err(&device->dev, "Couldn't close port %d\n", i); error: i--; while (i >= start) { if (ib_agent_port_close(device, i)) - printk(KERN_ERR PFX "Couldn't close %s port %d " - "for agents\n", - device->name, i); + dev_err(&device->dev, + "Couldn't close port %d for agents\n", i); if (ib_mad_port_close(device, i)) - printk(KERN_ERR PFX "Couldn't close %s port %d\n", - device->name, i); + dev_err(&device->dev, "Couldn't close port %d\n", i); i--; } } @@ -3033,12 +3044,12 @@ static void ib_mad_remove_device(struct ib_device *device) } for (i = 0; i < num_ports; i++, cur_port++) { if (ib_agent_port_close(device, cur_port)) - printk(KERN_ERR PFX "Couldn't close %s port %d " - "for agents\n", - device->name, cur_port); + dev_err(&device->dev, + "Couldn't close port %d for agents\n", + cur_port); if (ib_mad_port_close(device, cur_port)) - printk(KERN_ERR PFX "Couldn't close %s port %d\n", - device->name, cur_port); + dev_err(&device->dev, "Couldn't close port %d\n", + cur_port); } } @@ -3064,7 +3075,7 @@ static int __init ib_mad_init_module(void) SLAB_HWCACHE_ALIGN, NULL); if (!ib_mad_cache) { - printk(KERN_ERR PFX "Couldn't create ib_mad cache\n"); + pr_err("Couldn't create ib_mad cache\n"); ret = -ENOMEM; goto error1; } @@ -3072,7 +3083,7 @@ static int __init ib_mad_init_module(void) INIT_LIST_HEAD(&ib_mad_port_list); if (ib_register_client(&mad_client)) { - printk(KERN_ERR PFX "Couldn't register ib_mad client\n"); + pr_err("Couldn't register ib_mad client\n"); ret = -EINVAL; goto error2; } diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index 9430ab4969c5..d1a0b0ee9444 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -42,9 +42,6 @@ #include #include - -#define PFX "ib_mad: " - #define IB_MAD_QPS_CORE 2 /* Always QP0 and QP1 as a minimum */ /* QP and CQ parameters */ From 9ad13a423484725324fb2c5c5ab527d6bf9d84cc Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Fri, 8 Aug 2014 19:00:54 -0400 Subject: [PATCH 40/52] IB/mad: Add dev_notice messages for various umad/mad registration failures Registration failures can be difficult to debug from userspace. This gives more visibility. Signed-off-by: Ira Weiny Signed-off-by: Roland Dreier --- drivers/infiniband/core/mad.c | 58 +++++++++++++++++++++++++----- drivers/infiniband/core/user_mad.c | 8 +++++ 2 files changed, 57 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 08f430319fc8..f46995d4f54b 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -214,58 +214,95 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, /* Validate parameters */ qpn = get_spl_qp_index(qp_type); - if (qpn == -1) + if (qpn == -1) { + dev_notice(&device->dev, + "ib_register_mad_agent: invalid QP Type %d\n", + qp_type); goto error1; + } - if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) + if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) { + dev_notice(&device->dev, + "ib_register_mad_agent: invalid RMPP Version %u\n", + rmpp_version); goto error1; + } /* Validate MAD registration request if supplied */ if (mad_reg_req) { - if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) + if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) { + dev_notice(&device->dev, + "ib_register_mad_agent: invalid Class Version %u\n", + mad_reg_req->mgmt_class_version); goto error1; - if (!recv_handler) + } + if (!recv_handler) { + dev_notice(&device->dev, + "ib_register_mad_agent: no recv_handler\n"); goto error1; + } if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) { /* * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only * one in this range currently allowed */ if (mad_reg_req->mgmt_class != - IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid Mgmt Class 0x%x\n", + mad_reg_req->mgmt_class); goto error1; + } } else if (mad_reg_req->mgmt_class == 0) { /* * Class 0 is reserved in IBA and is used for * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE */ + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid Mgmt Class 0\n"); goto error1; } else if (is_vendor_class(mad_reg_req->mgmt_class)) { /* * If class is in "new" vendor range, * ensure supplied OUI is not zero */ - if (!is_vendor_oui(mad_reg_req->oui)) + if (!is_vendor_oui(mad_reg_req->oui)) { + dev_notice(&device->dev, + "ib_register_mad_agent: No OUI specified for class 0x%x\n", + mad_reg_req->mgmt_class); goto error1; + } } /* Make sure class supplied is consistent with RMPP */ if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) { - if (rmpp_version) + if (rmpp_version) { + dev_notice(&device->dev, + "ib_register_mad_agent: RMPP version for non-RMPP class 0x%x\n", + mad_reg_req->mgmt_class); goto error1; + } } /* Make sure class supplied is consistent with QP type */ if (qp_type == IB_QPT_SMI) { if ((mad_reg_req->mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED) && (mad_reg_req->mgmt_class != - IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid SM QP type: class 0x%x\n", + mad_reg_req->mgmt_class); goto error1; + } } else { if ((mad_reg_req->mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) || (mad_reg_req->mgmt_class == - IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) + IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { + dev_notice(&device->dev, + "ib_register_mad_agent: Invalid GS QP type: class 0x%x\n", + mad_reg_req->mgmt_class); goto error1; + } } } else { /* No registration request supplied */ @@ -276,6 +313,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, /* Validate device and port */ port_priv = ib_get_mad_port(device, port_num); if (!port_priv) { + dev_notice(&device->dev, "ib_register_mad_agent: Invalid port\n"); ret = ERR_PTR(-ENODEV); goto error1; } @@ -283,6 +321,8 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, /* Verify the QP requested is supported. For example, Ethernet devices * will not have QP0 */ if (!port_priv->qp_info[qpn].qp) { + dev_notice(&device->dev, + "ib_register_mad_agent: QP %d not supported\n", qpn); ret = ERR_PTR(-EPROTONOSUPPORT); goto error1; } diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 6be596dc360a..ea90a3ebb9c2 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -616,6 +616,8 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, mutex_lock(&file->mutex); if (!file->port->ib_dev) { + dev_notice(file->port->dev, + "ib_umad_reg_agent: invalid device\n"); ret = -EPIPE; goto out; } @@ -626,6 +628,9 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, } if (ureq.qpn != 0 && ureq.qpn != 1) { + dev_notice(file->port->dev, + "ib_umad_reg_agent: invalid QPN %d specified\n", + ureq.qpn); ret = -EINVAL; goto out; } @@ -634,6 +639,9 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, if (!__get_agent(file, agent_id)) goto found; + dev_notice(file->port->dev, + "ib_umad_reg_agent: Max Agents (%u) reached\n", + IB_UMAD_MAX_AGENTS); ret = -ENOMEM; goto out; From 0f29b46d49b0ca50536632c6a33986c3171f5ea1 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Fri, 8 Aug 2014 19:00:55 -0400 Subject: [PATCH 41/52] IB/mad: add new ioctl to ABI to support new registration options Registrations options are specified through flags. Definitions of flags will be in subsequent patches. Signed-off-by: Ira Weiny Signed-off-by: Roland Dreier --- Documentation/infiniband/user_mad.txt | 13 ++- drivers/infiniband/core/agent.c | 4 +- drivers/infiniband/core/cm.c | 5 +- drivers/infiniband/core/mad.c | 4 +- drivers/infiniband/core/sa_query.c | 2 +- drivers/infiniband/core/user_mad.c | 120 +++++++++++++++++++++++- drivers/infiniband/hw/mlx4/mad.c | 2 +- drivers/infiniband/hw/mthca/mthca_mad.c | 2 +- drivers/infiniband/hw/qib/qib_mad.c | 2 +- drivers/infiniband/ulp/srpt/ib_srpt.c | 2 +- include/rdma/ib_mad.h | 7 +- include/uapi/rdma/ib_user_mad.h | 39 ++++++++ 12 files changed, 186 insertions(+), 16 deletions(-) diff --git a/Documentation/infiniband/user_mad.txt b/Documentation/infiniband/user_mad.txt index 8a366959f5cc..7aca13a54a3a 100644 --- a/Documentation/infiniband/user_mad.txt +++ b/Documentation/infiniband/user_mad.txt @@ -26,6 +26,11 @@ Creating MAD agents ioctl. Also, all agents registered through a file descriptor will be unregistered when the descriptor is closed. + 2014 -- a new registration ioctl is now provided which allows additional + fields to be provided during registration. + Users of this registration call are implicitly setting the use of + pkey_index (see below). + Receiving MADs MADs are received using read(). The receive side now supports @@ -104,10 +109,10 @@ P_Key Index Handling The old ib_umad interface did not allow setting the P_Key index for MADs that are sent and did not provide a way for obtaining the P_Key index of received MADs. A new layout for struct ib_user_mad_hdr - with a pkey_index member has been defined; however, to preserve - binary compatibility with older applications, this new layout will - not be used unless the IB_USER_MAD_ENABLE_PKEY ioctl is called - before a file descriptor is used for anything else. + with a pkey_index member has been defined; however, to preserve binary + compatibility with older applications, this new layout will not be used + unless one of IB_USER_MAD_ENABLE_PKEY or IB_USER_MAD_REGISTER_AGENT2 ioctl's + are called before a file descriptor is used for anything else. In September 2008, the IB_USER_MAD_ABI_VERSION will be incremented to 6, the new layout of struct ib_user_mad_hdr will be used by diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c index 8e32c5abd09d..f6d29614cb01 100644 --- a/drivers/infiniband/core/agent.c +++ b/drivers/infiniband/core/agent.c @@ -161,7 +161,7 @@ int ib_agent_port_open(struct ib_device *device, int port_num) port_priv->agent[0] = ib_register_mad_agent(device, port_num, IB_QPT_SMI, NULL, 0, &agent_send_handler, - NULL, NULL); + NULL, NULL, 0); if (IS_ERR(port_priv->agent[0])) { ret = PTR_ERR(port_priv->agent[0]); goto error2; @@ -172,7 +172,7 @@ int ib_agent_port_open(struct ib_device *device, int port_num) port_priv->agent[1] = ib_register_mad_agent(device, port_num, IB_QPT_GSI, NULL, 0, &agent_send_handler, - NULL, NULL); + NULL, NULL, 0); if (IS_ERR(port_priv->agent[1])) { ret = PTR_ERR(port_priv->agent[1]); goto error3; diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index c3239170d8b7..e28a494e2a3a 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3753,7 +3753,7 @@ static void cm_add_one(struct ib_device *ib_device) struct cm_port *port; struct ib_mad_reg_req reg_req = { .mgmt_class = IB_MGMT_CLASS_CM, - .mgmt_class_version = IB_CM_CLASS_VERSION + .mgmt_class_version = IB_CM_CLASS_VERSION, }; struct ib_port_modify port_modify = { .set_port_cap_mask = IB_PORT_CM_SUP @@ -3801,7 +3801,8 @@ static void cm_add_one(struct ib_device *ib_device) 0, cm_send_handler, cm_recv_handler, - port); + port, + 0); if (IS_ERR(port->mad_agent)) goto error2; diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index f46995d4f54b..988bbda67952 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -198,7 +198,8 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, u8 rmpp_version, ib_mad_send_handler send_handler, ib_mad_recv_handler recv_handler, - void *context) + void *context, + u32 registration_flags) { struct ib_mad_port_private *port_priv; struct ib_mad_agent *ret = ERR_PTR(-EINVAL); @@ -359,6 +360,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, mad_agent_priv->agent.context = context; mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp; mad_agent_priv->agent.port_num = port_num; + mad_agent_priv->agent.flags = registration_flags; spin_lock_init(&mad_agent_priv->lock); INIT_LIST_HEAD(&mad_agent_priv->send_list); INIT_LIST_HEAD(&mad_agent_priv->wait_list); diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 233eaf541f55..c38f030f0dc9 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1184,7 +1184,7 @@ static void ib_sa_add_one(struct ib_device *device) sa_dev->port[i].agent = ib_register_mad_agent(device, i + s, IB_QPT_GSI, NULL, 0, send_handler, - recv_handler, sa_dev); + recv_handler, sa_dev, 0); if (IS_ERR(sa_dev->port[i].agent)) goto err; diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index ea90a3ebb9c2..11af1c61c135 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -647,6 +647,7 @@ static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg, found: if (ureq.mgmt_class) { + memset(&req, 0, sizeof(req)); req.mgmt_class = ureq.mgmt_class; req.mgmt_class_version = ureq.mgmt_class_version; memcpy(req.oui, ureq.oui, sizeof req.oui); @@ -667,7 +668,7 @@ found: ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI, ureq.mgmt_class ? &req : NULL, ureq.rmpp_version, - send_handler, recv_handler, file); + send_handler, recv_handler, file, 0); if (IS_ERR(agent)) { ret = PTR_ERR(agent); agent = NULL; @@ -705,6 +706,119 @@ out: return ret; } +static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg) +{ + struct ib_user_mad_reg_req2 ureq; + struct ib_mad_reg_req req; + struct ib_mad_agent *agent = NULL; + int agent_id; + int ret; + + mutex_lock(&file->port->file_mutex); + mutex_lock(&file->mutex); + + if (!file->port->ib_dev) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2: invalid device\n"); + ret = -EPIPE; + goto out; + } + + if (copy_from_user(&ureq, arg, sizeof(ureq))) { + ret = -EFAULT; + goto out; + } + + if (ureq.qpn != 0 && ureq.qpn != 1) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2: invalid QPN %d specified\n", + ureq.qpn); + ret = -EINVAL; + goto out; + } + + if (ureq.flags & ~IB_USER_MAD_REG_FLAGS_CAP) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2 failed: invalid registration flags specified 0x%x; supported 0x%x\n", + ureq.flags, IB_USER_MAD_REG_FLAGS_CAP); + ret = -EINVAL; + + if (put_user((u32)IB_USER_MAD_REG_FLAGS_CAP, + (u32 __user *) (arg + offsetof(struct + ib_user_mad_reg_req2, flags)))) + ret = -EFAULT; + + goto out; + } + + for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id) + if (!__get_agent(file, agent_id)) + goto found; + + dev_notice(file->port->dev, + "ib_umad_reg_agent2: Max Agents (%u) reached\n", + IB_UMAD_MAX_AGENTS); + ret = -ENOMEM; + goto out; + +found: + if (ureq.mgmt_class) { + memset(&req, 0, sizeof(req)); + req.mgmt_class = ureq.mgmt_class; + req.mgmt_class_version = ureq.mgmt_class_version; + if (ureq.oui & 0xff000000) { + dev_notice(file->port->dev, + "ib_umad_reg_agent2 failed: oui invalid 0x%08x\n", + ureq.oui); + ret = -EINVAL; + goto out; + } + req.oui[2] = ureq.oui & 0x0000ff; + req.oui[1] = (ureq.oui & 0x00ff00) >> 8; + req.oui[0] = (ureq.oui & 0xff0000) >> 16; + memcpy(req.method_mask, ureq.method_mask, + sizeof(req.method_mask)); + } + + agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num, + ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI, + ureq.mgmt_class ? &req : NULL, + ureq.rmpp_version, + send_handler, recv_handler, file, + ureq.flags); + if (IS_ERR(agent)) { + ret = PTR_ERR(agent); + agent = NULL; + goto out; + } + + if (put_user(agent_id, + (u32 __user *)(arg + + offsetof(struct ib_user_mad_reg_req2, id)))) { + ret = -EFAULT; + goto out; + } + + if (!file->already_used) { + file->already_used = 1; + file->use_pkey_index = 1; + } + + file->agent[agent_id] = agent; + ret = 0; + +out: + mutex_unlock(&file->mutex); + + if (ret && agent) + ib_unregister_mad_agent(agent); + + mutex_unlock(&file->port->file_mutex); + + return ret; +} + + static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg) { struct ib_mad_agent *agent = NULL; @@ -760,6 +874,8 @@ static long ib_umad_ioctl(struct file *filp, unsigned int cmd, return ib_umad_unreg_agent(filp->private_data, (__u32 __user *) arg); case IB_USER_MAD_ENABLE_PKEY: return ib_umad_enable_pkey(filp->private_data); + case IB_USER_MAD_REGISTER_AGENT2: + return ib_umad_reg_agent2(filp->private_data, (void __user *) arg); default: return -ENOIOCTLCMD; } @@ -776,6 +892,8 @@ static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd, return ib_umad_unreg_agent(filp->private_data, compat_ptr(arg)); case IB_USER_MAD_ENABLE_PKEY: return ib_umad_enable_pkey(filp->private_data); + case IB_USER_MAD_REGISTER_AGENT2: + return ib_umad_reg_agent2(filp->private_data, compat_ptr(arg)); default: return -ENOIOCTLCMD; } diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index 287ad0564acd..82a7dd87089b 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -891,7 +891,7 @@ int mlx4_ib_mad_init(struct mlx4_ib_dev *dev) agent = ib_register_mad_agent(&dev->ib_dev, p + 1, q ? IB_QPT_GSI : IB_QPT_SMI, NULL, 0, send_handler, - NULL, NULL); + NULL, NULL, 0); if (IS_ERR(agent)) { ret = PTR_ERR(agent); goto err; diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c index b6f7f457fc55..8881fa376e06 100644 --- a/drivers/infiniband/hw/mthca/mthca_mad.c +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -294,7 +294,7 @@ int mthca_create_agents(struct mthca_dev *dev) agent = ib_register_mad_agent(&dev->ib_dev, p + 1, q ? IB_QPT_GSI : IB_QPT_SMI, NULL, 0, send_handler, - NULL, NULL); + NULL, NULL, 0); if (IS_ERR(agent)) { ret = PTR_ERR(agent); goto err; diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c index 22c720e5740d..636be117b578 100644 --- a/drivers/infiniband/hw/qib/qib_mad.c +++ b/drivers/infiniband/hw/qib/qib_mad.c @@ -2476,7 +2476,7 @@ int qib_create_agents(struct qib_ibdev *dev) ibp = &dd->pport[p].ibport_data; agent = ib_register_mad_agent(&dev->ibdev, p + 1, IB_QPT_SMI, NULL, 0, send_handler, - NULL, NULL); + NULL, NULL, 0); if (IS_ERR(agent)) { ret = PTR_ERR(agent); goto err; diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index fe09f2788b15..8a8311e35cbd 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -563,7 +563,7 @@ static int srpt_refresh_port(struct srpt_port *sport) ®_req, 0, srpt_mad_send_handler, srpt_mad_recv_handler, - sport); + sport, 0); if (IS_ERR(sport->mad_agent)) { ret = PTR_ERR(sport->mad_agent); sport->mad_agent = NULL; diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index 3d81b90cc315..876f497f8b0c 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -355,6 +355,7 @@ typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent, * @hi_tid: Access layer assigned transaction ID for this client. * Unsolicited MADs sent by this client will have the upper 32-bits * of their TID set to this value. + * @flags: registration flags * @port_num: Port number on which QP is registered * @rmpp_version: If set, indicates the RMPP version used by this agent. */ @@ -367,6 +368,7 @@ struct ib_mad_agent { ib_mad_snoop_handler snoop_handler; void *context; u32 hi_tid; + u32 flags; u8 port_num; u8 rmpp_version; }; @@ -426,6 +428,7 @@ struct ib_mad_recv_wc { * in the range from 0x30 to 0x4f. Otherwise not used. * @method_mask: The caller will receive unsolicited MADs for any method * where @method_mask = 1. + * */ struct ib_mad_reg_req { u8 mgmt_class; @@ -451,6 +454,7 @@ struct ib_mad_reg_req { * @recv_handler: The completion callback routine invoked for a received * MAD. * @context: User specified context associated with the registration. + * @registration_flags: Registration flags to set for this agent */ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, u8 port_num, @@ -459,7 +463,8 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, u8 rmpp_version, ib_mad_send_handler send_handler, ib_mad_recv_handler recv_handler, - void *context); + void *context, + u32 registration_flags); enum ib_mad_snoop_flags { /*IB_MAD_SNOOP_POSTED_SENDS = 1,*/ diff --git a/include/uapi/rdma/ib_user_mad.h b/include/uapi/rdma/ib_user_mad.h index d6fce1cbdb90..c00b081dc10e 100644 --- a/include/uapi/rdma/ib_user_mad.h +++ b/include/uapi/rdma/ib_user_mad.h @@ -191,6 +191,42 @@ struct ib_user_mad_reg_req { __u8 rmpp_version; }; +/** + * ib_user_mad_reg_req2 - MAD registration request + * + * @id - Set by the _kernel_; used by userspace to identify the + * registered agent in future requests. + * @qpn - Queue pair number; must be 0 or 1. + * @mgmt_class - Indicates which management class of MADs should be + * receive by the caller. This field is only required if + * the user wishes to receive unsolicited MADs, otherwise + * it should be 0. + * @mgmt_class_version - Indicates which version of MADs for the given + * management class to receive. + * @res - Ignored. + * @flags - additional registration flags; Must be in the set of + * flags defined in IB_USER_MAD_REG_FLAGS_CAP + * @method_mask - The caller wishes to receive unsolicited MADs for the + * methods whose bit(s) is(are) set. + * @oui - Indicates IEEE OUI to use when mgmt_class is a vendor + * class in the range from 0x30 to 0x4f. Otherwise not + * used. + * @rmpp_version - If set, indicates the RMPP version to use. + */ +#define IB_USER_MAD_REG_FLAGS_CAP (0) +struct ib_user_mad_reg_req2 { + __u32 id; + __u32 qpn; + __u8 mgmt_class; + __u8 mgmt_class_version; + __u16 res; + __u32 flags; + __u64 method_mask[2]; + __u32 oui; + __u8 rmpp_version; + __u8 reserved[3]; +}; + #define IB_IOCTL_MAGIC 0x1b #define IB_USER_MAD_REGISTER_AGENT _IOWR(IB_IOCTL_MAGIC, 1, \ @@ -200,4 +236,7 @@ struct ib_user_mad_reg_req { #define IB_USER_MAD_ENABLE_PKEY _IO(IB_IOCTL_MAGIC, 3) +#define IB_USER_MAD_REGISTER_AGENT2 _IOWR(IB_IOCTL_MAGIC, 4, \ + struct ib_user_mad_reg_req2) + #endif /* IB_USER_MAD_H */ From 1471cb6ca67990a306500e69e52ffb28c93ccbbc Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Fri, 8 Aug 2014 19:00:56 -0400 Subject: [PATCH 42/52] IB/mad: Add user space RMPP support Using the new registration mechanism, define a flag that indicates the user wishes to process RMPP messages in user space rather than have the kernel process them. Signed-off-by: Ira Weiny Signed-off-by: Roland Dreier --- drivers/infiniband/core/mad.c | 76 ++++++++++++++++++++---------- drivers/infiniband/core/user_mad.c | 34 ++++++++----- include/rdma/ib_mad.h | 11 +++++ include/uapi/rdma/ib_user_mad.h | 5 +- 4 files changed, 89 insertions(+), 37 deletions(-) diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 988bbda67952..74c30f4c557e 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -283,6 +283,7 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, goto error1; } } + /* Make sure class supplied is consistent with QP type */ if (qp_type == IB_QPT_SMI) { if ((mad_reg_req->mgmt_class != @@ -309,6 +310,8 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, /* No registration request supplied */ if (!send_handler) goto error1; + if (registration_flags & IB_MAD_USER_RMPP) + goto error1; } /* Validate device and port */ @@ -907,6 +910,12 @@ static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr, return 0; } +int ib_mad_kernel_rmpp_agent(struct ib_mad_agent *agent) +{ + return agent->rmpp_version && !(agent->flags & IB_MAD_USER_RMPP); +} +EXPORT_SYMBOL(ib_mad_kernel_rmpp_agent); + struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, u32 remote_qpn, u16 pkey_index, int rmpp_active, @@ -923,10 +932,12 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, pad = get_pad_size(hdr_len, data_len); message_size = hdr_len + data_len + pad; - if ((!mad_agent->rmpp_version && - (rmpp_active || message_size > sizeof(struct ib_mad))) || - (!rmpp_active && message_size > sizeof(struct ib_mad))) - return ERR_PTR(-EINVAL); + if (ib_mad_kernel_rmpp_agent(mad_agent)) { + if (!rmpp_active && message_size > sizeof(struct ib_mad)) + return ERR_PTR(-EINVAL); + } else + if (rmpp_active || message_size > sizeof(struct ib_mad)) + return ERR_PTR(-EINVAL); size = rmpp_active ? hdr_len : sizeof(struct ib_mad); buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask); @@ -1180,7 +1191,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf, &mad_agent_priv->send_list); spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - if (mad_agent_priv->agent.rmpp_version) { + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { ret = ib_send_rmpp_mad(mad_send_wr); if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED) ret = ib_send_mad(mad_send_wr); @@ -1730,6 +1741,7 @@ static int is_data_mad(struct ib_mad_agent_private *mad_agent_priv, rmpp_mad = (struct ib_rmpp_mad *)mad_hdr; return !mad_agent_priv->agent.rmpp_version || + !ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) || !(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE) || (rmpp_mad->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA); @@ -1857,7 +1869,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, INIT_LIST_HEAD(&mad_recv_wc->rmpp_list); list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list); - if (mad_agent_priv->agent.rmpp_version) { + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv, mad_recv_wc); if (!mad_recv_wc) { @@ -1872,23 +1884,39 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc); if (!mad_send_wr) { spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - ib_free_recv_mad(mad_recv_wc); - deref_mad_agent(mad_agent_priv); - return; + if (!ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) + && ib_is_mad_class_rmpp(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class) + && (ib_get_rmpp_flags(&((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr) + & IB_MGMT_RMPP_FLAG_ACTIVE)) { + /* user rmpp is in effect + * and this is an active RMPP MAD + */ + mad_recv_wc->wc->wr_id = 0; + mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, + mad_recv_wc); + atomic_dec(&mad_agent_priv->refcount); + } else { + /* not user rmpp, revert to normal behavior and + * drop the mad */ + ib_free_recv_mad(mad_recv_wc); + deref_mad_agent(mad_agent_priv); + return; + } + } else { + ib_mark_mad_done(mad_send_wr); + spin_unlock_irqrestore(&mad_agent_priv->lock, flags); + + /* Defined behavior is to complete response before request */ + mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf; + mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, + mad_recv_wc); + atomic_dec(&mad_agent_priv->refcount); + + mad_send_wc.status = IB_WC_SUCCESS; + mad_send_wc.vendor_err = 0; + mad_send_wc.send_buf = &mad_send_wr->send_buf; + ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); } - ib_mark_mad_done(mad_send_wr); - spin_unlock_irqrestore(&mad_agent_priv->lock, flags); - - /* Defined behavior is to complete response before request */ - mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf; - mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, - mad_recv_wc); - atomic_dec(&mad_agent_priv->refcount); - - mad_send_wc.status = IB_WC_SUCCESS; - mad_send_wc.vendor_err = 0; - mad_send_wc.send_buf = &mad_send_wr->send_buf; - ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); } else { mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, mad_recv_wc); @@ -2128,7 +2156,7 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, mad_agent_priv = mad_send_wr->mad_agent_priv; spin_lock_irqsave(&mad_agent_priv->lock, flags); - if (mad_agent_priv->agent.rmpp_version) { + if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) { ret = ib_process_rmpp_send_wc(mad_send_wr, mad_send_wc); if (ret == IB_RMPP_RESULT_CONSUMED) goto done; @@ -2524,7 +2552,7 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr) mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms); - if (mad_send_wr->mad_agent_priv->agent.rmpp_version) { + if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) { ret = ib_retry_rmpp(mad_send_wr); switch (ret) { case IB_RMPP_RESULT_UNHANDLED: diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 11af1c61c135..928cdd20e2d1 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -506,13 +506,15 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data; hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class); - if (!ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)) { - copy_offset = IB_MGMT_MAD_HDR; - rmpp_active = 0; - } else { + + if (ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class) + && ib_mad_kernel_rmpp_agent(agent)) { copy_offset = IB_MGMT_RMPP_HDR; rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & - IB_MGMT_RMPP_FLAG_ACTIVE; + IB_MGMT_RMPP_FLAG_ACTIVE; + } else { + copy_offset = IB_MGMT_MAD_HDR; + rmpp_active = 0; } data_len = count - hdr_size(file) - hdr_len; @@ -558,14 +560,22 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf, rmpp_mad->mad_hdr.tid = *tid; } - spin_lock_irq(&file->send_lock); - ret = is_duplicate(file, packet); - if (!ret) + if (!ib_mad_kernel_rmpp_agent(agent) + && ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class) + && (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) { + spin_lock_irq(&file->send_lock); list_add_tail(&packet->list, &file->send_list); - spin_unlock_irq(&file->send_lock); - if (ret) { - ret = -EINVAL; - goto err_msg; + spin_unlock_irq(&file->send_lock); + } else { + spin_lock_irq(&file->send_lock); + ret = is_duplicate(file, packet); + if (!ret) + list_add_tail(&packet->list, &file->send_list); + spin_unlock_irq(&file->send_lock); + if (ret) { + ret = -EINVAL; + goto err_msg; + } } ret = ib_post_send_mad(packet->msg, NULL); diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index 876f497f8b0c..9bb99e983f58 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -40,6 +40,7 @@ #include #include +#include /* Management base version */ #define IB_MGMT_BASE_VERSION 1 @@ -359,6 +360,9 @@ typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent, * @port_num: Port number on which QP is registered * @rmpp_version: If set, indicates the RMPP version used by this agent. */ +enum { + IB_MAD_USER_RMPP = IB_USER_MAD_USER_RMPP, +}; struct ib_mad_agent { struct ib_device *device; struct ib_qp *qp; @@ -666,4 +670,11 @@ void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num); */ void ib_free_send_mad(struct ib_mad_send_buf *send_buf); +/** + * ib_mad_kernel_rmpp_agent - Returns if the agent is performing RMPP. + * @agent: the agent in question + * @return: true if agent is performing rmpp, false otherwise. + */ +int ib_mad_kernel_rmpp_agent(struct ib_mad_agent *agent); + #endif /* IB_MAD_H */ diff --git a/include/uapi/rdma/ib_user_mad.h b/include/uapi/rdma/ib_user_mad.h index c00b081dc10e..09f809f323ea 100644 --- a/include/uapi/rdma/ib_user_mad.h +++ b/include/uapi/rdma/ib_user_mad.h @@ -213,7 +213,10 @@ struct ib_user_mad_reg_req { * used. * @rmpp_version - If set, indicates the RMPP version to use. */ -#define IB_USER_MAD_REG_FLAGS_CAP (0) +enum { + IB_USER_MAD_USER_RMPP = (1 << 0), +}; +#define IB_USER_MAD_REG_FLAGS_CAP (IB_USER_MAD_USER_RMPP) struct ib_user_mad_reg_req2 { __u32 id; __u32 qpn; From e42fa2092c1049ac9c0e38aaac39ef3c40e91a36 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 12 Aug 2014 19:20:06 -0400 Subject: [PATCH 43/52] IPoIB: Remove unnecessary test for NULL before debugfs_remove() Fix checkpatch warning: WARNING: debugfs_remove(NULL) is safe this check is probably not required Signed-off-by: Fabian Frederick Signed-off-by: Doug Ledford Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_fs.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c index 50061854616e..6bd5740e2691 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_fs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c @@ -281,10 +281,8 @@ void ipoib_delete_debug_files(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); - if (priv->mcg_dentry) - debugfs_remove(priv->mcg_dentry); - if (priv->path_dentry) - debugfs_remove(priv->path_dentry); + debugfs_remove(priv->mcg_dentry); + debugfs_remove(priv->path_dentry); } int ipoib_register_debugfs(void) From 859976da0307618d1169616f9cb03936716106eb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 12 Aug 2014 19:20:04 -0400 Subject: [PATCH 44/52] RDMA/amso1100: Check for integer overflow in c2_alloc_cq_buf() This is a static checker fix. The static checker says that q_size comes from the user and can be any 32 bit value. The call tree is: --> ib_uverbs_create_cq() --> c2_create_cq() --> c2_init_cq() Signed-off-by: Dan Carpenter Signed-off-by: Doug Ledford Signed-off-by: Roland Dreier --- drivers/infiniband/hw/amso1100/c2_cq.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/amso1100/c2_cq.c b/drivers/infiniband/hw/amso1100/c2_cq.c index 49e0e8533f74..1b63185b4ad4 100644 --- a/drivers/infiniband/hw/amso1100/c2_cq.c +++ b/drivers/infiniband/hw/amso1100/c2_cq.c @@ -260,11 +260,14 @@ static void c2_free_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq) mq->msg_pool.host, dma_unmap_addr(mq, mapping)); } -static int c2_alloc_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq, int q_size, - int msg_size) +static int c2_alloc_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq, + size_t q_size, size_t msg_size) { u8 *pool_start; + if (q_size > SIZE_MAX / msg_size) + return -EINVAL; + pool_start = dma_alloc_coherent(&c2dev->pcidev->dev, q_size * msg_size, &mq->host_dma, GFP_KERNEL); if (!pool_start) From a57f23f675bf679b2be093571ae09fe2199c5e76 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 12 Aug 2014 19:20:07 -0400 Subject: [PATCH 45/52] IB/mlx4: Use ARRAY_SIZE instead of sizeof/sizeof[0] Signed-off-by: Fabian Frederick Signed-off-by: Doug Ledford Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/main.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 0f7027e7db13..0dc807f7b069 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -910,8 +910,7 @@ static int __mlx4_ib_default_rules_match(struct ib_qp *qp, const struct default_rules *pdefault_rules = default_table; u8 link_layer = rdma_port_get_link_layer(qp->device, flow_attr->port); - for (i = 0; i < sizeof(default_table)/sizeof(default_table[0]); i++, - pdefault_rules++) { + for (i = 0; i < ARRAY_SIZE(default_table); i++, pdefault_rules++) { __u32 field_types[IB_FLOW_SPEC_SUPPORT_LAYERS]; memset(&field_types, 0, sizeof(field_types)); @@ -965,8 +964,7 @@ static int __mlx4_ib_create_default_rules( int size = 0; int i; - for (i = 0; i < sizeof(pdefault_rules->rules_create_list)/ - sizeof(pdefault_rules->rules_create_list[0]); i++) { + for (i = 0; i < ARRAY_SIZE(pdefault_rules->rules_create_list); i++) { int ret; union ib_flow_spec ib_spec; switch (pdefault_rules->rules_create_list[i]) { From a8f731ebd1b4f94cf52ff07fe524414b4fbf9793 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Tue, 12 Aug 2014 19:20:08 -0400 Subject: [PATCH 46/52] IB/mlx5: Use ARRAY_SIZE instead of sizeof/sizeof[0] Acked-by: Eli Cohen Signed-off-by: Fabian Frederick Signed-off-by: Doug Ledford Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index bbbcf389272c..416cb7244224 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2501,7 +2501,7 @@ int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, spin_lock_irqsave(&qp->sq.lock, flags); for (nreq = 0; wr; nreq++, wr = wr->next) { - if (unlikely(wr->opcode >= sizeof(mlx5_ib_opcode) / sizeof(mlx5_ib_opcode[0]))) { + if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) { mlx5_ib_warn(dev, "\n"); err = -EINVAL; *bad_wr = wr; From 2aa1cf64aa576671b3593d73a6cb0a28b90ee444 Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Tue, 12 Aug 2014 19:20:10 -0400 Subject: [PATCH 47/52] IB/srpt: Handle GID change events GID change events need a refresh just like LID change events and several others. Handle this the same as the others. Signed-off-by: Doug Ledford Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srpt/ib_srpt.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index fe09f2788b15..fd23203c25a9 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -198,6 +198,7 @@ static void srpt_event_handler(struct ib_event_handler *handler, case IB_EVENT_PKEY_CHANGE: case IB_EVENT_SM_CHANGE: case IB_EVENT_CLIENT_REREGISTER: + case IB_EVENT_GID_CHANGE: /* Refresh port data asynchronously. */ if (event->element.port_num <= sdev->device->phys_port_cnt) { sport = &sdev->port[event->element.port_num - 1]; From db1044d458a287c18c4d413adc4ad12e92e253b5 Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Tue, 12 Aug 2014 19:20:11 -0400 Subject: [PATCH 48/52] RDMA/uapi: Include socket.h in rdma_user_cm.h added struct sockaddr_storage to rdma_user_cm.h without also adding an include for linux/socket.h to make sure it is defined. Systemtap needs the header files to build standalone and cannot rely on other files to pre-include other headers, so add linux/socket.h to the list of includes in this file. Fixes: ee7aed4528f ("RDMA/ucma: Support querying for AF_IB addresses") Signed-off-by: Doug Ledford Signed-off-by: Roland Dreier --- include/uapi/rdma/rdma_user_cm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/rdma/rdma_user_cm.h b/include/uapi/rdma/rdma_user_cm.h index 99b80abf360a..3066718eb120 100644 --- a/include/uapi/rdma/rdma_user_cm.h +++ b/include/uapi/rdma/rdma_user_cm.h @@ -34,6 +34,7 @@ #define RDMA_USER_CM_H #include +#include #include #include #include From 0ea8726250cae3defa0c1065801017f26a269c93 Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Wed, 2 Jul 2014 11:36:04 +0530 Subject: [PATCH 49/52] RDMA/ocrdma: Obtain SL from device structure Currently, driver obtains service level value from ah_attr->sl field. However, this field is set to zero all the times from rdma-cm. This patch allows create_ah to obtain service level from dev->sl. Signed-off-by: Devesh Sharma Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_ah.c | 4 +++- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 4 +--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index a023234d4b92..40f8536c10b0 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -35,6 +35,8 @@ #include "ocrdma_ah.h" #include "ocrdma_hw.h" +#define OCRDMA_VID_PCP_SHIFT 0xD + static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, struct ib_ah_attr *attr, int pdid) { @@ -55,7 +57,7 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah, if (vlan_tag && (vlan_tag < 0x1000)) { eth.eth_type = cpu_to_be16(0x8100); eth.roce_eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE); - vlan_tag |= (attr->sl & 7) << 13; + vlan_tag |= (dev->sl & 0x07) << OCRDMA_VID_PCP_SHIFT; eth.vlan_tag = cpu_to_be16(vlan_tag); eth_sz = sizeof(struct ocrdma_eth_vlan); vlan_enabled = true; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index a4d27c70528f..df68782b32d1 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -2314,10 +2314,8 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp, cmd->params.vlan_dmac_b4_to_b5 |= vlan_id << OCRDMA_QP_PARAMS_VLAN_SHIFT; cmd->flags |= OCRDMA_QP_PARA_VLAN_EN_VALID; - /* override the sl with default priority if 0 */ cmd->params.rnt_rc_sl_fl |= - (ah_attr->sl ? ah_attr->sl : - qp->dev->sl) << OCRDMA_QP_PARAMS_SL_SHIFT; + (qp->dev->sl & 0x07) << OCRDMA_QP_PARAMS_SL_SHIFT; } return 0; } From 8ac0c7c7a13b16e7c49fea812819a7f807a202fe Mon Sep 17 00:00:00 2001 From: Devesh Sharma Date: Wed, 2 Jul 2014 11:36:05 +0530 Subject: [PATCH 50/52] RDMA/ocrdma: Update sli data structure for endianness Update the sli specific mailbox command request/response data sturcures to fix endianness issues. Signed-off-by: Devesh Sharma Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 32 +++-- drivers/infiniband/hw/ocrdma/ocrdma_sli.h | 147 ++++++++++++++++------ 2 files changed, 129 insertions(+), 50 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index df68782b32d1..dd35ae558ae1 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -525,7 +525,7 @@ static int ocrdma_mbx_mq_cq_create(struct ocrdma_dev *dev, cmd->ev_cnt_flags = OCRDMA_CREATE_CQ_DEF_FLAGS; cmd->eqn = eq->id; - cmd->cqe_count = cq->size / sizeof(struct ocrdma_mcqe); + cmd->pdid_cqecnt = cq->size / sizeof(struct ocrdma_mcqe); ocrdma_build_q_pages(&cmd->pa[0], cq->size / OCRDMA_MIN_Q_PAGE_SIZE, cq->dma, PAGE_SIZE_4K); @@ -1265,7 +1265,9 @@ static int ocrdma_mbx_get_ctrl_attribs(struct ocrdma_dev *dev) ctrl_attr_rsp = (struct ocrdma_get_ctrl_attribs_rsp *)dma.va; hba_attribs = &ctrl_attr_rsp->ctrl_attribs.hba_attribs; - dev->hba_port_num = hba_attribs->phy_port; + dev->hba_port_num = (hba_attribs->ptpnum_maxdoms_hbast_cv & + OCRDMA_HBA_ATTRB_PTNUM_MASK) + >> OCRDMA_HBA_ATTRB_PTNUM_SHIFT; strncpy(dev->model_number, hba_attribs->controller_model_number, 31); } @@ -1315,7 +1317,8 @@ int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed) goto mbx_err; rsp = (struct ocrdma_get_link_speed_rsp *)cmd; - *lnk_speed = rsp->phys_port_speed; + *lnk_speed = (rsp->pflt_pps_ld_pnum & OCRDMA_PHY_PS_MASK) + >> OCRDMA_PHY_PS_SHIFT; mbx_err: kfree(cmd); @@ -1341,11 +1344,16 @@ static int ocrdma_mbx_get_phy_info(struct ocrdma_dev *dev) goto mbx_err; rsp = (struct ocrdma_get_phy_info_rsp *)cmd; - dev->phy.phy_type = le16_to_cpu(rsp->phy_type); + dev->phy.phy_type = + (rsp->ityp_ptyp & OCRDMA_PHY_TYPE_MASK); + dev->phy.interface_type = + (rsp->ityp_ptyp & OCRDMA_IF_TYPE_MASK) + >> OCRDMA_IF_TYPE_SHIFT; dev->phy.auto_speeds_supported = - le16_to_cpu(rsp->auto_speeds_supported); + (rsp->fspeed_aspeed & OCRDMA_ASPEED_SUPP_MASK); dev->phy.fixed_speeds_supported = - le16_to_cpu(rsp->fixed_speeds_supported); + (rsp->fspeed_aspeed & OCRDMA_FSPEED_SUPP_MASK) + >> OCRDMA_FSPEED_SUPP_SHIFT; mbx_err: kfree(cmd); return status; @@ -1470,8 +1478,8 @@ static int ocrdma_mbx_create_ah_tbl(struct ocrdma_dev *dev) pbes = (struct ocrdma_pbe *)dev->av_tbl.pbl.va; for (i = 0; i < dev->av_tbl.size / OCRDMA_MIN_Q_PAGE_SIZE; i++) { - pbes[i].pa_lo = (u32) (pa & 0xffffffff); - pbes[i].pa_hi = (u32) upper_32_bits(pa); + pbes[i].pa_lo = (u32)cpu_to_le32(pa & 0xffffffff); + pbes[i].pa_hi = (u32)cpu_to_le32(upper_32_bits(pa)); pa += PAGE_SIZE; } cmd->tbl_addr[0].lo = (u32)(dev->av_tbl.pbl.pa & 0xFFFFFFFF); @@ -1638,14 +1646,16 @@ int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq, cmd->cmd.pgsz_pgcnt |= OCRDMA_CREATE_CQ_DPP << OCRDMA_CREATE_CQ_TYPE_SHIFT; cq->phase_change = false; - cmd->cmd.cqe_count = (cq->len / cqe_size); + cmd->cmd.pdid_cqecnt = (cq->len / cqe_size); } else { - cmd->cmd.cqe_count = (cq->len / cqe_size) - 1; + cmd->cmd.pdid_cqecnt = (cq->len / cqe_size) - 1; cmd->cmd.ev_cnt_flags |= OCRDMA_CREATE_CQ_FLAGS_AUTO_VALID; cq->phase_change = true; } - cmd->cmd.pd_id = pd_id; /* valid only for v3 */ + /* pd_id valid only for v3 */ + cmd->cmd.pdid_cqecnt |= (pd_id << + OCRDMA_CREATE_CQ_CMD_PDID_SHIFT); ocrdma_build_q_pages(&cmd->cmd.pa[0], hw_pages, cq->pa, page_size); status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd); if (status) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h index 3d08e665be26..904989ec5eaa 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h @@ -589,17 +589,26 @@ enum { OCRDMA_FN_MODE_RDMA = 0x4 }; +enum { + OCRDMA_IF_TYPE_MASK = 0xFFFF0000, + OCRDMA_IF_TYPE_SHIFT = 0x10, + OCRDMA_PHY_TYPE_MASK = 0x0000FFFF, + OCRDMA_FUTURE_DETAILS_MASK = 0xFFFF0000, + OCRDMA_FUTURE_DETAILS_SHIFT = 0x10, + OCRDMA_EX_PHY_DETAILS_MASK = 0x0000FFFF, + OCRDMA_FSPEED_SUPP_MASK = 0xFFFF0000, + OCRDMA_FSPEED_SUPP_SHIFT = 0x10, + OCRDMA_ASPEED_SUPP_MASK = 0x0000FFFF +}; + struct ocrdma_get_phy_info_rsp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_rsp rsp; - u16 phy_type; - u16 interface_type; + u32 ityp_ptyp; u32 misc_params; - u16 ext_phy_details; - u16 rsvd; - u16 auto_speeds_supported; - u16 fixed_speeds_supported; + u32 ftrdtl_exphydtl; + u32 fspeed_aspeed; u32 future_use[2]; }; @@ -612,19 +621,34 @@ enum { OCRDMA_PHY_SPEED_40GBPS = 0x20 }; +enum { + OCRDMA_PORT_NUM_MASK = 0x3F, + OCRDMA_PT_MASK = 0xC0, + OCRDMA_PT_SHIFT = 0x6, + OCRDMA_LINK_DUP_MASK = 0x0000FF00, + OCRDMA_LINK_DUP_SHIFT = 0x8, + OCRDMA_PHY_PS_MASK = 0x00FF0000, + OCRDMA_PHY_PS_SHIFT = 0x10, + OCRDMA_PHY_PFLT_MASK = 0xFF000000, + OCRDMA_PHY_PFLT_SHIFT = 0x18, + OCRDMA_QOS_LNKSP_MASK = 0xFFFF0000, + OCRDMA_QOS_LNKSP_SHIFT = 0x10, + OCRDMA_LLST_MASK = 0xFF, + OCRDMA_PLFC_MASK = 0x00000400, + OCRDMA_PLFC_SHIFT = 0x8, + OCRDMA_PLRFC_MASK = 0x00000200, + OCRDMA_PLRFC_SHIFT = 0x8, + OCRDMA_PLTFC_MASK = 0x00000100, + OCRDMA_PLTFC_SHIFT = 0x8 +}; struct ocrdma_get_link_speed_rsp { struct ocrdma_mqe_hdr hdr; struct ocrdma_mbx_rsp rsp; - u8 pt_port_num; - u8 link_duplex; - u8 phys_port_speed; - u8 phys_port_fault; - u16 rsvd1; - u16 qos_lnk_speed; - u8 logical_lnk_status; - u8 rsvd2[3]; + u32 pflt_pps_ld_pnum; + u32 qos_lsp; + u32 res_lls; }; enum { @@ -675,8 +699,7 @@ struct ocrdma_create_cq_cmd { u32 pgsz_pgcnt; u32 ev_cnt_flags; u32 eqn; - u16 cqe_count; - u16 pd_id; + u32 pdid_cqecnt; u32 rsvd6; struct ocrdma_pa pa[OCRDMA_CREATE_CQ_MAX_PAGES]; }; @@ -686,6 +709,10 @@ struct ocrdma_create_cq { struct ocrdma_create_cq_cmd cmd; }; +enum { + OCRDMA_CREATE_CQ_CMD_PDID_SHIFT = 0x10 +}; + enum { OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK = 0xFFFF }; @@ -1904,12 +1931,62 @@ struct ocrdma_rdma_stats_resp { struct ocrdma_rx_dbg_stats rx_dbg_stats; } __packed; +enum { + OCRDMA_HBA_ATTRB_EPROM_VER_LO_MASK = 0xFF, + OCRDMA_HBA_ATTRB_EPROM_VER_HI_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_EPROM_VER_HI_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_CDBLEN_MASK = 0xFFFF, + OCRDMA_HBA_ATTRB_ASIC_REV_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_ASIC_REV_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_GUID0_MASK = 0xFF000000, + OCRDMA_HBA_ATTRB_GUID0_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_GUID13_MASK = 0xFF, + OCRDMA_HBA_ATTRB_GUID14_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_GUID14_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_GUID15_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_GUID15_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_PCNT_MASK = 0xFF000000, + OCRDMA_HBA_ATTRB_PCNT_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_LDTOUT_MASK = 0xFFFF, + OCRDMA_HBA_ATTRB_ISCSI_VER_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_ISCSI_VER_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_MFUNC_DEV_MASK = 0xFF000000, + OCRDMA_HBA_ATTRB_MFUNC_DEV_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_CV_MASK = 0xFF, + OCRDMA_HBA_ATTRB_HBA_ST_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_HBA_ST_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_MAX_DOMS_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_MAX_DOMS_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_PTNUM_MASK = 0x3F000000, + OCRDMA_HBA_ATTRB_PTNUM_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_PT_MASK = 0xC0000000, + OCRDMA_HBA_ATTRB_PT_SHIFT = 0x1E, + OCRDMA_HBA_ATTRB_ISCSI_FET_MASK = 0xFF, + OCRDMA_HBA_ATTRB_ASIC_GEN_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_ASIC_GEN_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_PCI_VID_MASK = 0xFFFF, + OCRDMA_HBA_ATTRB_PCI_DID_MASK = 0xFFFF0000, + OCRDMA_HBA_ATTRB_PCI_DID_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_PCI_SVID_MASK = 0xFFFF, + OCRDMA_HBA_ATTRB_PCI_SSID_MASK = 0xFFFF0000, + OCRDMA_HBA_ATTRB_PCI_SSID_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_PCI_BUSNUM_MASK = 0xFF, + OCRDMA_HBA_ATTRB_PCI_DEVNUM_MASK = 0xFF00, + OCRDMA_HBA_ATTRB_PCI_DEVNUM_SHIFT = 0x08, + OCRDMA_HBA_ATTRB_PCI_FUNCNUM_MASK = 0xFF0000, + OCRDMA_HBA_ATTRB_PCI_FUNCNUM_SHIFT = 0x10, + OCRDMA_HBA_ATTRB_IF_TYPE_MASK = 0xFF000000, + OCRDMA_HBA_ATTRB_IF_TYPE_SHIFT = 0x18, + OCRDMA_HBA_ATTRB_NETFIL_MASK =0xFF +}; struct mgmt_hba_attribs { u8 flashrom_version_string[32]; u8 manufacturer_name[32]; u32 supported_modes; - u32 rsvd0[3]; + u32 rsvd_eprom_verhi_verlo; + u32 mbx_ds_ver; + u32 epfw_ds_ver; u8 ncsi_ver_string[12]; u32 default_extended_timeout; u8 controller_model_number[32]; @@ -1922,34 +1999,26 @@ struct mgmt_hba_attribs { u8 driver_version_string[32]; u8 fw_on_flash_version_string[32]; u32 functionalities_supported; - u16 max_cdblength; - u8 asic_revision; - u8 generational_guid[16]; - u8 hba_port_count; - u16 default_link_down_timeout; - u8 iscsi_ver_min_max; - u8 multifunction_device; - u8 cache_valid; - u8 hba_status; - u8 max_domains_supported; - u8 phy_port; + u32 guid0_asicrev_cdblen; + u8 generational_guid[12]; + u32 portcnt_guid15; + u32 mfuncdev_iscsi_ldtout; + u32 ptpnum_maxdoms_hbast_cv; u32 firmware_post_status; u32 hba_mtu[8]; - u32 rsvd1[4]; + u32 res_asicgen_iscsi_feaures; + u32 rsvd1[3]; }; struct mgmt_controller_attrib { struct mgmt_hba_attribs hba_attribs; - u16 pci_vendor_id; - u16 pci_device_id; - u16 pci_sub_vendor_id; - u16 pci_sub_system_id; - u8 pci_bus_number; - u8 pci_device_number; - u8 pci_function_number; - u8 interface_type; - u64 unique_identifier; - u32 rsvd0[5]; + u32 pci_did_vid; + u32 pci_ssid_svid; + u32 ityp_fnum_devnum_bnum; + u32 uid_hi; + u32 uid_lo; + u32 res_nnetfil; + u32 rsvd0[4]; }; struct ocrdma_get_ctrl_attribs_rsp { From 96c51abecc4379273fb751ff44eb57e4b243b5f0 Mon Sep 17 00:00:00 2001 From: Mitesh Ahuja Date: Wed, 2 Jul 2014 11:36:06 +0530 Subject: [PATCH 51/52] RDMA/ocrdma: report asic-id in query device Ocrdma does not report hw_ver when query_device is issued. This patch adds a meaningful value to this field. Signed-off-by: Devesh Sharma Signed-off-by: Mitesh Ahuja Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 90152dec6447..acb434d16903 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -73,7 +73,7 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr) attr->page_size_cap = 0xffff000; attr->vendor_id = dev->nic_info.pdev->vendor; attr->vendor_part_id = dev->nic_info.pdev->device; - attr->hw_ver = 0; + attr->hw_ver = dev->asic_id; attr->max_qp = dev->attr.max_qp; attr->max_ah = OCRDMA_MAX_AH; attr->max_qp_wr = dev->attr.max_wqe; From da05be290f5040e488ea4d13edc4da889f6f6929 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 14 Aug 2014 08:56:22 +0800 Subject: [PATCH 52/52] IB/srp: Fix return value check in srp_init_module() In case of error, the function create_workqueue() returns NULL pointer not ERR_PTR(). The IS_ERR() test in the return value check should be replaced with NULL test. Signed-off-by: Wei Yongjun Acked-by: Bart Van Assche Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 7f5ee7fc02a6..62d2a18e1b41 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -3320,8 +3320,8 @@ static int __init srp_init_module(void) } srp_remove_wq = create_workqueue("srp_remove"); - if (IS_ERR(srp_remove_wq)) { - ret = PTR_ERR(srp_remove_wq); + if (!srp_remove_wq) { + ret = -ENOMEM; goto out; }