From 649fb5ec0e5291da4f5e3b6c8eb4fc8c4efbb24f Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sun, 27 Oct 2013 21:50:45 +0000 Subject: [PATCH 01/62] IB/cxgb4: Fix formatting of physical address Physical addresses may be wider than virtual addresses (e.g. on i386 with PAE) and must not be formatted with %p. Compile-tested only. Signed-off-by: Ben Hutchings Signed-off-by: Roland Dreier --- drivers/infiniband/hw/cxgb4/device.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 33d2cc6ab562..4a033853312e 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ b/drivers/infiniband/hw/cxgb4/device.c @@ -602,10 +602,10 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) rdev->lldi.vr->qp.size, rdev->lldi.vr->cq.start, rdev->lldi.vr->cq.size); - PDBG("udb len 0x%x udb base %p db_reg %p gts_reg %p qpshift %lu " + PDBG("udb len 0x%x udb base %llx db_reg %p gts_reg %p qpshift %lu " "qpmask 0x%x cqshift %lu cqmask 0x%x\n", (unsigned)pci_resource_len(rdev->lldi.pdev, 2), - (void *)(unsigned long)pci_resource_start(rdev->lldi.pdev, 2), + (u64)pci_resource_start(rdev->lldi.pdev, 2), rdev->lldi.db_reg, rdev->lldi.gts_reg, rdev->qpshift, rdev->qpmask, From c2bb5628db44735bc0095a8ef8efc0278ba245df Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Wed, 16 Oct 2013 17:37:47 +0300 Subject: [PATCH 02/62] IPoIB: Fix crash in dev_open error flow If napi has never been enabled when calling ipoib_ib_dev_stop, a kernel crash occurs, because the verbs layer completion handler (ipoib_ib_completion) calls napi_schedule unconditionally. If the napi structure passed in the napi_schedule call has not been initialized, napi will crash. The cleanest solution is to simply enable napi before calling ipoib_ib_dev_stop in the dev_open error flow. (dev_stop then immediately disables napi). Signed-off-by: Jack Morgenstein Signed-off-by: Erez Shitrit Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 196b1d13cbcb..c14f949edcb7 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -685,15 +685,13 @@ int ipoib_ib_dev_open(struct net_device *dev) ret = ipoib_ib_post_receives(dev); if (ret) { ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret); - ipoib_ib_dev_stop(dev, 1); - return -1; + goto dev_stop; } ret = ipoib_cm_dev_open(dev); if (ret) { ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret); - ipoib_ib_dev_stop(dev, 1); - return -1; + goto dev_stop; } clear_bit(IPOIB_STOP_REAPER, &priv->flags); @@ -704,6 +702,11 @@ int ipoib_ib_dev_open(struct net_device *dev) napi_enable(&priv->napi); return 0; +dev_stop: + if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) + napi_enable(&priv->napi); + ipoib_ib_dev_stop(dev, 1); + return -1; } static void ipoib_pkey_dev_check_presence(struct net_device *dev) From 22252b4e09f19431f82dc26428e0edf697d19a04 Mon Sep 17 00:00:00 2001 From: Tal Alon Date: Wed, 16 Oct 2013 17:37:48 +0300 Subject: [PATCH 03/62] IPoIB: Change CM skb memory allocation to be non-atomic during init Change CM skb memory allocation to use GFP_KERNEL when possible. During device init there's no need to use GFP_ATOMIC when allocating memory for the CM skbs -- use GFP_KERNEL instead. Signed-off-by: Tal Alon Signed-off-by: Erez Shitrit Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 7a3175400b2a..1377f85911c2 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -140,7 +140,8 @@ static int ipoib_cm_post_receive_nonsrq(struct net_device *dev, static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, struct ipoib_cm_rx_buf *rx_ring, int id, int frags, - u64 mapping[IPOIB_CM_RX_SG]) + u64 mapping[IPOIB_CM_RX_SG], + gfp_t gfp) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct sk_buff *skb; @@ -164,7 +165,7 @@ static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev, } for (i = 0; i < frags; i++) { - struct page *page = alloc_page(GFP_ATOMIC); + struct page *page = alloc_page(gfp); if (!page) goto partial_error; @@ -382,7 +383,8 @@ static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_i for (i = 0; i < ipoib_recvq_size; ++i) { if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1, - rx->rx_ring[i].mapping)) { + rx->rx_ring[i].mapping, + GFP_KERNEL)) { ipoib_warn(priv, "failed to allocate receive buffer %d\n", i); ret = -ENOMEM; goto err_count; @@ -639,7 +641,8 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc) frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len, (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE; - newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, mapping); + newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags, + mapping, GFP_ATOMIC); if (unlikely(!newskb)) { /* * If we can't allocate a new RX buffer, dump @@ -1556,7 +1559,8 @@ int ipoib_cm_dev_init(struct net_device *dev) for (i = 0; i < ipoib_recvq_size; ++i) { if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i, priv->cm.num_frags - 1, - priv->cm.srq_ring[i].mapping)) { + priv->cm.srq_ring[i].mapping, + GFP_KERNEL)) { ipoib_warn(priv, "failed to allocate " "receive buffer %d\n", i); ipoib_cm_dev_cleanup(dev); From f47944cc2dba3c7e6f753b81e9f713f4d12bdd5a Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Wed, 16 Oct 2013 17:37:49 +0300 Subject: [PATCH 04/62] IPoIB: Fix deadlock between dev_change_flags() and __ipoib_dev_flush() When ipoib interface is going down it takes all of its children with it, under mutex. For each child, dev_change_flags() is called. That function calls ipoib_stop() via the ndo, and causes flush of the workqueue. Sometimes in the workqueue an __ipoib_dev_flush work() is waiting and when invoked tries to get the same mutex, which leads to a deadlock, as seen below. The solution is to switch to rw-sem instead of mutex. The deadlock: [11028.165303] [] ? vgacon_scroll+0x107/0x2e0 [11028.171844] [] schedule_timeout+0x215/0x2e0 [11028.178465] [] ? perf_event_task_sched_out+0x33/0x80 [11028.185962] [] wait_for_common+0x123/0x180 [11028.192491] [] ? default_wake_function+0x0/0x20 [11028.199504] [] wait_for_completion+0x1d/0x20 [11028.206224] [] flush_cpu_workqueue+0x61/0x90 [11028.212948] [] ? wq_barrier_func+0x0/0x20 [11028.219375] [] flush_workqueue+0x54/0x80 [11028.225712] [] ipoib_mcast_stop_thread+0x66/0x90 [ib_ipoib] [11028.233988] [] ipoib_ib_dev_down+0x6a/0x100 [ib_ipoib] [11028.241678] [] ipoib_stop+0x8a/0x140 [ib_ipoib] [11028.248692] [] dev_close+0x71/0xc0 [11028.254447] [] dev_change_flags+0xa1/0x1d0 [11028.261062] [] ipoib_stop+0x10b/0x140 [ib_ipoib] [11028.268172] [] dev_close+0x71/0xc0 [11028.273922] [] dev_change_flags+0xa1/0x1d0 [11028.280452] [] devinet_ioctl+0x5eb/0x6a0 [11028.286786] [] inet_ioctl+0x88/0xa0 [11028.292633] [] sock_ioctl+0x7a/0x280 [11028.298576] [] vfs_ioctl+0x22/0xa0 [11028.304326] [] ? unmap_region+0x110/0x130 [11028.310756] [] do_vfs_ioctl+0x84/0x580 [11028.316897] [] sys_ioctl+0x81/0xa0 and 11028.017533] [] ? perf_event_task_sched_out+0x33/0x80 [11028.025030] [] ? apic_timer_interrupt+0xe/0x20 [11028.031945] [] __mutex_lock_slowpath+0x13e/0x180 [11028.039053] [] mutex_lock+0x2b/0x50 [11028.044910] [] __ipoib_ib_dev_flush+0x37/0x210 [ib_ipoib] [11028.052894] [] ? ipoib_ib_dev_flush_light+0x0/0x20 [ib_ipoib] [11028.061363] [] ipoib_ib_dev_flush_light+0x17/0x20 [ib_ipoib] [11028.069738] [] worker_thread+0x170/0x2a0 [11028.076068] [] ? autoremove_wake_function+0x0/0x40 [11028.083374] [] ? worker_thread+0x0/0x2a0 [11028.089709] [] kthread+0x96/0xa0 [11028.095266] [] child_rip+0xa/0x20 [11028.100921] [] ? kthread+0x0/0xa0 [11028.106573] [] ? child_rip+0x0/0x20 [11028.112423] INFO: task ifconfig:23640 blocked for more than 120 seconds. Signed-off-by: Erez Shitrit Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib.h | 2 +- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 4 ++-- drivers/infiniband/ulp/ipoib/ipoib_main.c | 10 +++++----- drivers/infiniband/ulp/ipoib/ipoib_netlink.c | 4 ++-- drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 10 ++++++---- 5 files changed, 16 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index eb71aaa26a9a..ec9190eff09c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -299,7 +299,7 @@ struct ipoib_dev_priv { unsigned long flags; - struct mutex vlan_mutex; + struct rw_semaphore vlan_rwsem; struct rb_root path_tree; struct list_head path_list; diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index c14f949edcb7..01594de9b11d 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -977,7 +977,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, u16 new_index; int result; - mutex_lock(&priv->vlan_mutex); + down_read(&priv->vlan_rwsem); /* * Flush any child interfaces too -- they might be up even if @@ -986,7 +986,7 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv, list_for_each_entry(cpriv, &priv->child_intfs, list) __ipoib_ib_dev_flush(cpriv, level); - mutex_unlock(&priv->vlan_mutex); + up_read(&priv->vlan_rwsem); if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags)) { /* for non-child devices must check/update the pkey value here */ diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 82cec1af902c..dcb50cef0bda 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -119,7 +119,7 @@ int ipoib_open(struct net_device *dev) struct ipoib_dev_priv *cpriv; /* Bring up any child interfaces too */ - mutex_lock(&priv->vlan_mutex); + down_read(&priv->vlan_rwsem); list_for_each_entry(cpriv, &priv->child_intfs, list) { int flags; @@ -129,7 +129,7 @@ int ipoib_open(struct net_device *dev) dev_change_flags(cpriv->dev, flags | IFF_UP); } - mutex_unlock(&priv->vlan_mutex); + up_read(&priv->vlan_rwsem); } netif_start_queue(dev); @@ -162,7 +162,7 @@ static int ipoib_stop(struct net_device *dev) struct ipoib_dev_priv *cpriv; /* Bring down any child interfaces too */ - mutex_lock(&priv->vlan_mutex); + down_read(&priv->vlan_rwsem); list_for_each_entry(cpriv, &priv->child_intfs, list) { int flags; @@ -172,7 +172,7 @@ static int ipoib_stop(struct net_device *dev) dev_change_flags(cpriv->dev, flags & ~IFF_UP); } - mutex_unlock(&priv->vlan_mutex); + up_read(&priv->vlan_rwsem); } return 0; @@ -1372,7 +1372,7 @@ void ipoib_setup(struct net_device *dev) spin_lock_init(&priv->lock); - mutex_init(&priv->vlan_mutex); + init_rwsem(&priv->vlan_rwsem); INIT_LIST_HEAD(&priv->path_list); INIT_LIST_HEAD(&priv->child_intfs); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c index f81abe16cf09..c29b5c838833 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c @@ -142,10 +142,10 @@ static void ipoib_unregister_child_dev(struct net_device *dev, struct list_head priv = netdev_priv(dev); ppriv = netdev_priv(priv->parent); - mutex_lock(&ppriv->vlan_mutex); + down_write(&ppriv->vlan_rwsem); unregister_netdevice_queue(dev, head); list_del(&priv->list); - mutex_unlock(&ppriv->vlan_mutex); + up_write(&ppriv->vlan_rwsem); } static size_t ipoib_get_size(const struct net_device *dev) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 8292554bccb5..9fad7b5ac8b9 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -140,7 +140,7 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) if (!rtnl_trylock()) return restart_syscall(); - mutex_lock(&ppriv->vlan_mutex); + down_write(&ppriv->vlan_rwsem); /* * First ensure this isn't a duplicate. We check the parent device and @@ -163,7 +163,7 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD); out: - mutex_unlock(&ppriv->vlan_mutex); + up_write(&ppriv->vlan_rwsem); if (result) free_netdev(priv->dev); @@ -185,7 +185,8 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) if (!rtnl_trylock()) return restart_syscall(); - mutex_lock(&ppriv->vlan_mutex); + + down_write(&ppriv->vlan_rwsem); list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { if (priv->pkey == pkey && priv->child_type == IPOIB_LEGACY_CHILD) { @@ -195,7 +196,8 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) break; } } - mutex_unlock(&ppriv->vlan_mutex); + up_write(&ppriv->vlan_rwsem); + rtnl_unlock(); if (dev) { From aede25011fddf559dcf216d86975187e3f64b109 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Wed, 16 Oct 2013 17:37:50 +0300 Subject: [PATCH 05/62] IPoIB: Avoid flushing the driver workqueue on dev_down The driver should not flush the whole workqueue when only one work (the pkey poll one) needs to be cancelled. Use cancel_delayed_work_sync() instead. Signed-off-by: Erez Shitrit Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index 01594de9b11d..ff64b1629dbf 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -749,10 +749,8 @@ int ipoib_ib_dev_down(struct net_device *dev, int flush) if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) { mutex_lock(&pkey_mutex); set_bit(IPOIB_PKEY_STOP, &priv->flags); - cancel_delayed_work(&priv->pkey_poll_task); + cancel_delayed_work_sync(&priv->pkey_poll_task); mutex_unlock(&pkey_mutex); - if (flush) - flush_workqueue(ipoib_workqueue); } ipoib_mcast_stop_thread(dev, flush); From a9c8ba588495547d1598f1b83d5eb086bef65e4b Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Wed, 16 Oct 2013 17:37:51 +0300 Subject: [PATCH 06/62] IPoIB: Fix usage of uninitialized multicast objects The driver should avoid calling ib_sa_free_multicast on the mcast->mc object until it finishes its initialization state. Otherwise we can crash when ipoib_mcast_dev_flush() attempts to use the uninitialized multicast object. Instead, only call wait_for_completion() for multicast entries that started the join process, meaning that ib_sa_join_multicast() finished. Signed-off-by: Erez Shitrit Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib.h | 2 ++ .../infiniband/ulp/ipoib/ipoib_multicast.c | 21 +++++++++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index ec9190eff09c..c639f90cfda4 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -101,6 +101,7 @@ enum { IPOIB_MCAST_FLAG_SENDONLY = 1, IPOIB_MCAST_FLAG_BUSY = 2, /* joining or already joined */ IPOIB_MCAST_FLAG_ATTACHED = 3, + IPOIB_MCAST_JOIN_STARTED = 4, MAX_SEND_CQE = 16, IPOIB_CM_COPYBREAK = 256, @@ -151,6 +152,7 @@ struct ipoib_mcast { struct sk_buff_head pkt_queue; struct net_device *dev; + struct completion done; }; struct ipoib_rx_buf { diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index cecb98a4c662..780a2a0df41f 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -386,8 +386,10 @@ static int ipoib_mcast_join_complete(int status, mcast->mcmember.mgid.raw, status); /* We trap for port events ourselves. */ - if (status == -ENETRESET) - return 0; + if (status == -ENETRESET) { + status = 0; + goto out; + } if (!status) status = ipoib_mcast_join_finish(mcast, &multicast->rec); @@ -407,7 +409,8 @@ static int ipoib_mcast_join_complete(int status, if (mcast == priv->broadcast) queue_work(ipoib_workqueue, &priv->carrier_on_task); - return 0; + status = 0; + goto out; } if (mcast->logcount++ < 20) { @@ -434,7 +437,8 @@ static int ipoib_mcast_join_complete(int status, mcast->backoff * HZ); spin_unlock_irq(&priv->lock); mutex_unlock(&mcast_mutex); - +out: + complete(&mcast->done); return status; } @@ -484,11 +488,15 @@ static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast, } set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + init_completion(&mcast->done); + set_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags); + mcast->mc = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port, &rec, comp_mask, GFP_KERNEL, ipoib_mcast_join_complete, mcast); if (IS_ERR(mcast->mc)) { clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags); + complete(&mcast->done); ret = PTR_ERR(mcast->mc); ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret); @@ -751,6 +759,11 @@ void ipoib_mcast_dev_flush(struct net_device *dev) spin_unlock_irqrestore(&priv->lock, flags); + /* seperate between the wait to the leave*/ + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) + if (test_bit(IPOIB_MCAST_JOIN_STARTED, &mcast->flags)) + wait_for_completion(&mcast->done); + list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { ipoib_mcast_leave(dev, mcast); ipoib_mcast_free(mcast); From a39c52ab887fdcefae1d7f467fb0621f30833c84 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Wed, 16 Oct 2013 17:37:52 +0300 Subject: [PATCH 07/62] IPoIB: Add path query flushing in ipoib_ib_dev_cleanup The path_rec_completion() callback may be invoked asynchronously even at the middle of "driver uninit" process. This can lead to scheduling a task that tries to touch members of the priv object that are no longer valid. For example the function cm_create_tx_qp can attempt to create qp with no valid priv->pd object. The following crash is one of the results: RIP: 0010:[] [] ipoib_cm_create_tx_qp+0x57/0x90 [ib_ipoib] Process ipoib (pid: 5916, threadinfo ffff8803786e4000, task ffff8804150e1500) Stack: Call Trace: [] ? get_random_bytes+0x20/0x30 [] ipoib_cm_tx_init+0xca/0x340 [ib_ipoib] [] ipoib_cm_tx_start+0x215/0x3f0 [ib_ipoib] [] ? ipoib_cm_tx_start+0x0/0x3f0 [ib_ipoib] [] worker_thread+0x170/0x2a0 [] ? autoremove_wake_function+0x0/0x40 [] ? worker_thread+0x0/0x2a0 [] kthread+0x96/0xa0 [] child_rip+0xa/0x20 [] ? kthread+0x0/0xa0 [] ? child_rip+0x0/0x20 Fix that by flushing all pending path queries at this point. Signed-off-by: Alex Markuze Signed-off-by: Erez Shitrit Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_ib.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c index ff64b1629dbf..6a7003ddb0be 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c @@ -1082,6 +1082,11 @@ void ipoib_ib_dev_cleanup(struct net_device *dev) struct ipoib_dev_priv *priv = netdev_priv(dev); ipoib_dbg(priv, "cleaning up ib_dev\n"); + /* + * We must make sure there are no more (path) completions + * that may wish to touch priv fields that are no longer valid + */ + ipoib_flush_paths(dev); ipoib_mcast_stop_thread(dev, 1); ipoib_mcast_dev_flush(dev); From 94232d9ce81755c4b0c1536648442383442b27e0 Mon Sep 17 00:00:00 2001 From: Erez Shitrit Date: Wed, 16 Oct 2013 17:37:53 +0300 Subject: [PATCH 08/62] IPoIB: Start multicast join process only on active ports The driver starts the mcast_join task whenever the netdev interface is UP without relation to the underlying IB port state. Until the port state is ACTIVE all the join requests are irrelevant, and the IB core returns -EINVAL. So the user will see errors such as: "multicast join failed for ff12:401b:... , status -22". Instead, have ipoib_mcast_join_task() return when the port is not active. It will be called again when the port state is changed and the low-level driver triggers the IB_EVENT_PORT_ACTIVE event or the IB_EVENT_CLIENT_REREGISTER event. Signed-off-by: Erez Shitrit Signed-off-by: Or Gerlitz Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 780a2a0df41f..d4e005720d01 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -518,10 +518,18 @@ void ipoib_mcast_join_task(struct work_struct *work) struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv, mcast_task.work); struct net_device *dev = priv->dev; + struct ib_port_attr port_attr; if (!test_bit(IPOIB_MCAST_RUN, &priv->flags)) return; + if (ib_query_port(priv->ca, priv->port, &port_attr) || + port_attr.state != IB_PORT_ACTIVE) { + ipoib_dbg(priv, "port state is not ACTIVE (state = %d) suspending join task\n", + port_attr.state); + return; + } + if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid)) ipoib_warn(priv, "ib_query_gid() failed\n"); else From 7f1a38671c55b5cbda77dbbda8b4651224c50cd7 Mon Sep 17 00:00:00 2001 From: Michal Schmidt Date: Wed, 21 Aug 2013 18:50:22 +0200 Subject: [PATCH 09/62] IPoIB: lower NAPI weight Since commit 82dc3c63c692 ("net: introduce NAPI_POLL_WEIGHT") netif_napi_add() produces an error message if a NAPI poll weight greater than 64 is requested. Use the standard NAPI weight. Signed-off-by: Michal Schmidt Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index dcb50cef0bda..d64ed05fb082 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1350,7 +1350,7 @@ void ipoib_setup(struct net_device *dev) ipoib_set_ethtool_ops(dev); - netif_napi_add(dev, &priv->napi, ipoib_poll, 100); + netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT); dev->watchdog_timeo = HZ; From 6b7d103c1b11346499063d9167832596de754ea4 Mon Sep 17 00:00:00 2001 From: Latchesar Ionkov Date: Sat, 19 Oct 2013 15:55:41 -0600 Subject: [PATCH 10/62] IB/core: Pass imm_data from ib_uverbs_send_wr to ib_send_wr correctly Currently, we don't copy the immediate data from the userspace struct to the kernel one when UD messages are being sent. This patch makes sure that the immediate data is set correctly. Signed-off-by: Latchesar Ionkov Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 2f0f01b70e3b..5bb2a82d52e7 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2128,6 +2128,9 @@ ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file, } next->wr.ud.remote_qpn = user_wr->wr.ud.remote_qpn; next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey; + if (next->opcode == IB_WR_SEND_WITH_IMM) + next->ex.imm_data = + (__be32 __force) user_wr->ex.imm_data; } else { switch (next->opcode) { case IB_WR_RDMA_WRITE_WITH_IMM: From 5476781bb9e273d81a4192be47101811e987114f Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Mon, 30 Sep 2013 22:03:54 +0200 Subject: [PATCH 11/62] IB/netlink: Remove superfluous RDMA_NL_GET_OP() masking 'op' is the already RDMA_NL_GET_OP() masked 'type'. No need to mask it again. Signed-off-by: Mathias Krause Reviewed-by: Yann Droneaud Acked-by: Sean Hefty Signed-off-by: Roland Dreier --- drivers/infiniband/core/netlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index da06abde9e0d..a1e9cba84944 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -148,7 +148,7 @@ static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) list_for_each_entry(client, &client_list, list) { if (client->index == index) { if (op < 0 || op >= client->nops || - !client->cb_table[RDMA_NL_GET_OP(op)].dump) + !client->cb_table[op].dump) return -EINVAL; { From 51ee86a4af639e4ee8953dd02ad8a766c40f46a1 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 23 Oct 2013 09:53:13 +0300 Subject: [PATCH 12/62] IB/mlx5: Fix check of number of entries in create CQ Verify that the value is non negative before rounding up to power of 2. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/cq.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 344ab03948a3..e7c64c57d699 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -653,8 +653,11 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries, int eqn; int err; + if (entries < 0) + return ERR_PTR(-EINVAL); + entries = roundup_pow_of_two(entries + 1); - if (entries < 1 || entries > dev->mdev.caps.max_cqes) + if (entries > dev->mdev.caps.max_cqes) return ERR_PTR(-EINVAL); cq = kzalloc(sizeof(*cq), GFP_KERNEL); From 746b5583c1a48a837f4891adaff5e09d61b204a6 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 23 Oct 2013 09:53:14 +0300 Subject: [PATCH 13/62] IB/mlx5: Multithreaded create MR Use asynchronous commands to execute up to eight concurrent create MR commands. This is to fill memory caches faster so we keep consuming from there. Also, increase timeout for shrinking caches to five minutes. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/main.c | 3 +- drivers/infiniband/hw/mlx5/mlx5_ib.h | 6 + drivers/infiniband/hw/mlx5/mr.c | 163 ++++++++++++++---- drivers/infiniband/hw/mlx5/qp.c | 4 +- drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 106 +++++++++--- .../net/ethernet/mellanox/mlx5/core/debugfs.c | 8 +- drivers/net/ethernet/mellanox/mlx5/core/mr.c | 32 ++-- include/linux/mlx5/driver.h | 17 +- 8 files changed, 255 insertions(+), 84 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index b1a6cb3a2809..306534109627 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -745,7 +745,8 @@ static int alloc_pa_mkey(struct mlx5_ib_dev *dev, u32 *key, u32 pdn) seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); seg->start_addr = 0; - err = mlx5_core_create_mkey(&dev->mdev, &mr, in, sizeof(*in)); + err = mlx5_core_create_mkey(&dev->mdev, &mr, in, sizeof(*in), + NULL, NULL, NULL); if (err) { mlx5_ib_warn(dev, "failed to create mkey, %d\n", err); goto err_in; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 836be9157242..4c134d93d4fc 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -262,6 +262,9 @@ struct mlx5_ib_mr { int npages; struct completion done; enum ib_wc_status status; + struct mlx5_ib_dev *dev; + struct mlx5_create_mkey_mbox_out out; + unsigned long start; }; struct mlx5_ib_fast_reg_page_list { @@ -323,6 +326,7 @@ struct mlx5_cache_ent { struct mlx5_ib_dev *dev; struct work_struct work; struct delayed_work dwork; + int pending; }; struct mlx5_mr_cache { @@ -358,6 +362,8 @@ struct mlx5_ib_dev { spinlock_t mr_lock; struct mlx5_ib_resources devr; struct mlx5_mr_cache cache; + struct timer_list delay_timer; + int fill_delay; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 3453580b1eb2..d7f202290747 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -35,11 +35,13 @@ #include #include #include +#include #include #include "mlx5_ib.h" enum { DEF_CACHE_SIZE = 10, + MAX_PENDING_REG_MR = 8, }; enum { @@ -63,6 +65,57 @@ static int order2idx(struct mlx5_ib_dev *dev, int order) return order - cache->ent[0].order; } +static void reg_mr_callback(int status, void *context) +{ + struct mlx5_ib_mr *mr = context; + struct mlx5_ib_dev *dev = mr->dev; + struct mlx5_mr_cache *cache = &dev->cache; + int c = order2idx(dev, mr->order); + struct mlx5_cache_ent *ent = &cache->ent[c]; + u8 key; + unsigned long delta = jiffies - mr->start; + unsigned long index; + unsigned long flags; + + index = find_last_bit(&delta, 8 * sizeof(delta)); + if (index == 64) + index = 0; + + spin_lock_irqsave(&ent->lock, flags); + ent->pending--; + spin_unlock_irqrestore(&ent->lock, flags); + if (status) { + mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status); + kfree(mr); + dev->fill_delay = 1; + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + if (mr->out.hdr.status) { + mlx5_ib_warn(dev, "failed - status %d, syndorme 0x%x\n", + mr->out.hdr.status, + be32_to_cpu(mr->out.hdr.syndrome)); + kfree(mr); + dev->fill_delay = 1; + mod_timer(&dev->delay_timer, jiffies + HZ); + return; + } + + spin_lock_irqsave(&dev->mdev.priv.mkey_lock, flags); + key = dev->mdev.priv.mkey_key++; + spin_unlock_irqrestore(&dev->mdev.priv.mkey_lock, flags); + mr->mmr.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key; + + cache->last_add = jiffies; + + spin_lock_irqsave(&ent->lock, flags); + list_add_tail(&mr->list, &ent->head); + ent->cur++; + ent->size++; + spin_unlock_irqrestore(&ent->lock, flags); +} + static int add_keys(struct mlx5_ib_dev *dev, int c, int num) { struct mlx5_mr_cache *cache = &dev->cache; @@ -78,36 +131,39 @@ static int add_keys(struct mlx5_ib_dev *dev, int c, int num) return -ENOMEM; for (i = 0; i < num; i++) { + if (ent->pending >= MAX_PENDING_REG_MR) { + err = -EAGAIN; + break; + } + mr = kzalloc(sizeof(*mr), GFP_KERNEL); if (!mr) { err = -ENOMEM; - goto out; + break; } mr->order = ent->order; mr->umred = 1; + mr->dev = dev; in->seg.status = 1 << 6; in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2); in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN; in->seg.log2_page_size = 12; + spin_lock_irq(&ent->lock); + ent->pending++; + spin_unlock_irq(&ent->lock); + mr->start = jiffies; err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, - sizeof(*in)); + sizeof(*in), reg_mr_callback, + mr, &mr->out); if (err) { mlx5_ib_warn(dev, "create mkey failed %d\n", err); kfree(mr); - goto out; + break; } - cache->last_add = jiffies; - - spin_lock(&ent->lock); - list_add_tail(&mr->list, &ent->head); - ent->cur++; - ent->size++; - spin_unlock(&ent->lock); } -out: kfree(in); return err; } @@ -121,16 +177,16 @@ static void remove_keys(struct mlx5_ib_dev *dev, int c, int num) int i; for (i = 0; i < num; i++) { - spin_lock(&ent->lock); + spin_lock_irq(&ent->lock); if (list_empty(&ent->head)) { - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); return; } mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); list_del(&mr->list); ent->cur--; ent->size--; - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr); if (err) mlx5_ib_warn(dev, "failed destroy mkey\n"); @@ -162,9 +218,13 @@ static ssize_t size_write(struct file *filp, const char __user *buf, return -EINVAL; if (var > ent->size) { - err = add_keys(dev, c, var - ent->size); - if (err) - return err; + do { + err = add_keys(dev, c, var - ent->size); + if (err && err != -EAGAIN) + return err; + + usleep_range(3000, 5000); + } while (err); } else if (var < ent->size) { remove_keys(dev, c, ent->size - var); } @@ -280,23 +340,37 @@ static void __cache_work_func(struct mlx5_cache_ent *ent) struct mlx5_ib_dev *dev = ent->dev; struct mlx5_mr_cache *cache = &dev->cache; int i = order2idx(dev, ent->order); + int err; if (cache->stopped) return; ent = &dev->cache.ent[i]; - if (ent->cur < 2 * ent->limit) { - add_keys(dev, i, 1); - if (ent->cur < 2 * ent->limit) - queue_work(cache->wq, &ent->work); + if (ent->cur < 2 * ent->limit && !dev->fill_delay) { + err = add_keys(dev, i, 1); + if (ent->cur < 2 * ent->limit) { + if (err == -EAGAIN) { + mlx5_ib_dbg(dev, "returned eagain, order %d\n", + i + 2); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(3)); + } else if (err) { + mlx5_ib_warn(dev, "command failed order %d, err %d\n", + i + 2, err); + queue_delayed_work(cache->wq, &ent->dwork, + msecs_to_jiffies(1000)); + } else { + queue_work(cache->wq, &ent->work); + } + } } else if (ent->cur > 2 * ent->limit) { if (!someone_adding(cache) && - time_after(jiffies, cache->last_add + 60 * HZ)) { + time_after(jiffies, cache->last_add + 300 * HZ)) { remove_keys(dev, i, 1); if (ent->cur > ent->limit) queue_work(cache->wq, &ent->work); } else { - queue_delayed_work(cache->wq, &ent->dwork, 60 * HZ); + queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ); } } } @@ -336,18 +410,18 @@ static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order) mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i); - spin_lock(&ent->lock); + spin_lock_irq(&ent->lock); if (!list_empty(&ent->head)) { mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); list_del(&mr->list); ent->cur--; - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); if (ent->cur < ent->limit) queue_work(cache->wq, &ent->work); break; } - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); queue_work(cache->wq, &ent->work); @@ -374,12 +448,12 @@ static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) return; } ent = &cache->ent[c]; - spin_lock(&ent->lock); + spin_lock_irq(&ent->lock); list_add_tail(&mr->list, &ent->head); ent->cur++; if (ent->cur > 2 * ent->limit) shrink = 1; - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); if (shrink) queue_work(cache->wq, &ent->work); @@ -394,16 +468,16 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c) cancel_delayed_work(&ent->dwork); while (1) { - spin_lock(&ent->lock); + spin_lock_irq(&ent->lock); if (list_empty(&ent->head)) { - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); return; } mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list); list_del(&mr->list); ent->cur--; ent->size--; - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); err = mlx5_core_destroy_mkey(&dev->mdev, &mr->mmr); if (err) mlx5_ib_warn(dev, "failed destroy mkey\n"); @@ -464,6 +538,13 @@ static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev) debugfs_remove_recursive(dev->cache.root); } +static void delay_time_func(unsigned long ctx) +{ + struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx; + + dev->fill_delay = 0; +} + int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) { struct mlx5_mr_cache *cache = &dev->cache; @@ -479,6 +560,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) return -ENOMEM; } + setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev); for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) { INIT_LIST_HEAD(&cache->ent[i].head); spin_lock_init(&cache->ent[i].lock); @@ -522,6 +604,7 @@ int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev) clean_keys(dev, i); destroy_workqueue(dev->cache.wq); + del_timer_sync(&dev->delay_timer); return 0; } @@ -551,7 +634,8 @@ struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc) seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); seg->start_addr = 0; - err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in)); + err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL, + NULL); if (err) goto err_in; @@ -660,14 +744,14 @@ static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem, int err; int i; - for (i = 0; i < 10; i++) { + for (i = 0; i < 1; i++) { mr = alloc_cached_mr(dev, order); if (mr) break; err = add_keys(dev, order2idx(dev, order), 1); - if (err) { - mlx5_ib_warn(dev, "add_keys failed\n"); + if (err && err != -EAGAIN) { + mlx5_ib_warn(dev, "add_keys failed, err %d\n", err); break; } } @@ -759,8 +843,10 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr, in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift)); in->seg.log2_page_size = page_shift; in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); - in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift)); - err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, inlen); + in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length, + 1 << page_shift)); + err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, inlen, NULL, + NULL, NULL); if (err) { mlx5_ib_warn(dev, "create mkey failed\n"); goto err_2; @@ -944,7 +1030,8 @@ struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd, * TBD not needed - issue 197292 */ in->seg.log2_page_size = PAGE_SHIFT; - err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in)); + err = mlx5_core_create_mkey(&dev->mdev, &mr->mmr, in, sizeof(*in), NULL, + NULL, NULL); kfree(in); if (err) goto err_free; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 5659ea880741..e3881433f5d7 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1744,6 +1744,7 @@ static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr, MLX5_MKEY_MASK_PD | MLX5_MKEY_MASK_LR | MLX5_MKEY_MASK_LW | + MLX5_MKEY_MASK_KEY | MLX5_MKEY_MASK_RR | MLX5_MKEY_MASK_RW | MLX5_MKEY_MASK_A | @@ -1800,7 +1801,8 @@ static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *w seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start); seg->len = cpu_to_be64(wr->wr.fast_reg.length); seg->log2_page_size = wr->wr.fast_reg.page_shift; - seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8); + seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 | + mlx5_mkey_variant(wr->wr.fast_reg.rkey)); } static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c index 6ca30739625f..8675d26a678b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c @@ -98,6 +98,7 @@ enum { static struct mlx5_cmd_work_ent *alloc_cmd(struct mlx5_cmd *cmd, struct mlx5_cmd_msg *in, struct mlx5_cmd_msg *out, + void *uout, int uout_size, mlx5_cmd_cbk_t cbk, void *context, int page_queue) { @@ -110,6 +111,8 @@ static struct mlx5_cmd_work_ent *alloc_cmd(struct mlx5_cmd *cmd, ent->in = in; ent->out = out; + ent->uout = uout; + ent->uout_size = uout_size; ent->callback = cbk; ent->context = context; ent->cmd = cmd; @@ -534,6 +537,7 @@ static void cmd_work_handler(struct work_struct *work) ent->lay = lay; memset(lay, 0, sizeof(*lay)); memcpy(lay->in, ent->in->first.data, sizeof(lay->in)); + ent->op = be32_to_cpu(lay->in[0]) >> 16; if (ent->in->next) lay->in_ptr = cpu_to_be64(ent->in->next->dma); lay->inlen = cpu_to_be32(ent->in->len); @@ -628,7 +632,8 @@ static int wait_func(struct mlx5_core_dev *dev, struct mlx5_cmd_work_ent *ent) * 2. page queue commands do not support asynchrous completion */ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, - struct mlx5_cmd_msg *out, mlx5_cmd_cbk_t callback, + struct mlx5_cmd_msg *out, void *uout, int uout_size, + mlx5_cmd_cbk_t callback, void *context, int page_queue, u8 *status) { struct mlx5_cmd *cmd = &dev->cmd; @@ -642,7 +647,8 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, if (callback && page_queue) return -EINVAL; - ent = alloc_cmd(cmd, in, out, callback, context, page_queue); + ent = alloc_cmd(cmd, in, out, uout, uout_size, callback, context, + page_queue); if (IS_ERR(ent)) return PTR_ERR(ent); @@ -670,10 +676,10 @@ static int mlx5_cmd_invoke(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *in, op = be16_to_cpu(((struct mlx5_inbox_hdr *)in->first.data)->opcode); if (op < ARRAY_SIZE(cmd->stats)) { stats = &cmd->stats[op]; - spin_lock(&stats->lock); + spin_lock_irq(&stats->lock); stats->sum += ds; ++stats->n; - spin_unlock(&stats->lock); + spin_unlock_irq(&stats->lock); } mlx5_core_dbg_mask(dev, 1 << MLX5_CMD_TIME, "fw exec time for %s is %lld nsec\n", @@ -826,7 +832,7 @@ static struct mlx5_cmd_msg *mlx5_alloc_cmd_msg(struct mlx5_core_dev *dev, int n; int i; - msg = kzalloc(sizeof(*msg), GFP_KERNEL); + msg = kzalloc(sizeof(*msg), flags); if (!msg) return ERR_PTR(-ENOMEM); @@ -1109,6 +1115,19 @@ void mlx5_cmd_use_polling(struct mlx5_core_dev *dev) up(&cmd->sem); } +static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg) +{ + unsigned long flags; + + if (msg->cache) { + spin_lock_irqsave(&msg->cache->lock, flags); + list_add_tail(&msg->list, &msg->cache->head); + spin_unlock_irqrestore(&msg->cache->lock, flags); + } else { + mlx5_free_cmd_msg(dev, msg); + } +} + void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector) { struct mlx5_cmd *cmd = &dev->cmd; @@ -1117,6 +1136,10 @@ void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector) void *context; int err; int i; + ktime_t t1, t2, delta; + s64 ds; + struct mlx5_cmd_stats *stats; + unsigned long flags; for (i = 0; i < (1 << cmd->log_sz); i++) { if (test_bit(i, &vector)) { @@ -1141,9 +1164,29 @@ void mlx5_cmd_comp_handler(struct mlx5_core_dev *dev, unsigned long vector) } free_ent(cmd, ent->idx); if (ent->callback) { + t1 = timespec_to_ktime(ent->ts1); + t2 = timespec_to_ktime(ent->ts2); + delta = ktime_sub(t2, t1); + ds = ktime_to_ns(delta); + if (ent->op < ARRAY_SIZE(cmd->stats)) { + stats = &cmd->stats[ent->op]; + spin_lock_irqsave(&stats->lock, flags); + stats->sum += ds; + ++stats->n; + spin_unlock_irqrestore(&stats->lock, flags); + } + callback = ent->callback; context = ent->context; err = ent->ret; + if (!err) + err = mlx5_copy_from_msg(ent->uout, + ent->out, + ent->uout_size); + + mlx5_free_cmd_msg(dev, ent->out); + free_msg(dev, ent->in); + free_cmd(ent); callback(err, context); } else { @@ -1160,7 +1203,8 @@ static int status_to_err(u8 status) return status ? -1 : 0; /* TBD more meaningful codes */ } -static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size) +static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size, + gfp_t gfp) { struct mlx5_cmd_msg *msg = ERR_PTR(-ENOMEM); struct mlx5_cmd *cmd = &dev->cmd; @@ -1172,7 +1216,7 @@ static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size) ent = &cmd->cache.med; if (ent) { - spin_lock(&ent->lock); + spin_lock_irq(&ent->lock); if (!list_empty(&ent->head)) { msg = list_entry(ent->head.next, typeof(*msg), list); /* For cached lists, we must explicitly state what is @@ -1181,43 +1225,34 @@ static struct mlx5_cmd_msg *alloc_msg(struct mlx5_core_dev *dev, int in_size) msg->len = in_size; list_del(&msg->list); } - spin_unlock(&ent->lock); + spin_unlock_irq(&ent->lock); } if (IS_ERR(msg)) - msg = mlx5_alloc_cmd_msg(dev, GFP_KERNEL, in_size); + msg = mlx5_alloc_cmd_msg(dev, gfp, in_size); return msg; } -static void free_msg(struct mlx5_core_dev *dev, struct mlx5_cmd_msg *msg) -{ - if (msg->cache) { - spin_lock(&msg->cache->lock); - list_add_tail(&msg->list, &msg->cache->head); - spin_unlock(&msg->cache->lock); - } else { - mlx5_free_cmd_msg(dev, msg); - } -} - static int is_manage_pages(struct mlx5_inbox_hdr *in) { return be16_to_cpu(in->opcode) == MLX5_CMD_OP_MANAGE_PAGES; } -int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, - int out_size) +static int cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, + int out_size, mlx5_cmd_cbk_t callback, void *context) { struct mlx5_cmd_msg *inb; struct mlx5_cmd_msg *outb; int pages_queue; + gfp_t gfp; int err; u8 status = 0; pages_queue = is_manage_pages(in); + gfp = callback ? GFP_ATOMIC : GFP_KERNEL; - inb = alloc_msg(dev, in_size); + inb = alloc_msg(dev, in_size, gfp); if (IS_ERR(inb)) { err = PTR_ERR(inb); return err; @@ -1229,13 +1264,14 @@ int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, goto out_in; } - outb = mlx5_alloc_cmd_msg(dev, GFP_KERNEL, out_size); + outb = mlx5_alloc_cmd_msg(dev, gfp, out_size); if (IS_ERR(outb)) { err = PTR_ERR(outb); goto out_in; } - err = mlx5_cmd_invoke(dev, inb, outb, NULL, NULL, pages_queue, &status); + err = mlx5_cmd_invoke(dev, inb, outb, out, out_size, callback, context, + pages_queue, &status); if (err) goto out_out; @@ -1248,14 +1284,30 @@ int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, err = mlx5_copy_from_msg(out, outb, out_size); out_out: - mlx5_free_cmd_msg(dev, outb); + if (!callback) + mlx5_free_cmd_msg(dev, outb); out_in: - free_msg(dev, inb); + if (!callback) + free_msg(dev, inb); return err; } + +int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, + int out_size) +{ + return cmd_exec(dev, in, in_size, out, out_size, NULL, NULL); +} EXPORT_SYMBOL(mlx5_cmd_exec); +int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size, + void *out, int out_size, mlx5_cmd_cbk_t callback, + void *context) +{ + return cmd_exec(dev, in, in_size, out, out_size, callback, context); +} +EXPORT_SYMBOL(mlx5_cmd_exec_cb); + static void destroy_msg_cache(struct mlx5_core_dev *dev) { struct mlx5_cmd *cmd = &dev->cmd; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c index 9c7194b26ee2..80f6d127257a 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/debugfs.c @@ -154,10 +154,10 @@ static ssize_t average_read(struct file *filp, char __user *buf, size_t count, return 0; stats = filp->private_data; - spin_lock(&stats->lock); + spin_lock_irq(&stats->lock); if (stats->n) field = div64_u64(stats->sum, stats->n); - spin_unlock(&stats->lock); + spin_unlock_irq(&stats->lock); ret = snprintf(tbuf, sizeof(tbuf), "%llu\n", field); if (ret > 0) { if (copy_to_user(buf, tbuf, ret)) @@ -175,10 +175,10 @@ static ssize_t average_write(struct file *filp, const char __user *buf, struct mlx5_cmd_stats *stats; stats = filp->private_data; - spin_lock(&stats->lock); + spin_lock_irq(&stats->lock); stats->sum = 0; stats->n = 0; - spin_unlock(&stats->lock); + spin_unlock_irq(&stats->lock); *pos += count; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mr.c b/drivers/net/ethernet/mellanox/mlx5/core/mr.c index 5b44e2e46daf..35e514dc7b7d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mr.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/mr.c @@ -37,31 +37,41 @@ #include "mlx5_core.h" int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, - struct mlx5_create_mkey_mbox_in *in, int inlen) + struct mlx5_create_mkey_mbox_in *in, int inlen, + mlx5_cmd_cbk_t callback, void *context, + struct mlx5_create_mkey_mbox_out *out) { - struct mlx5_create_mkey_mbox_out out; + struct mlx5_create_mkey_mbox_out lout; int err; u8 key; - memset(&out, 0, sizeof(out)); - spin_lock(&dev->priv.mkey_lock); + memset(&lout, 0, sizeof(lout)); + spin_lock_irq(&dev->priv.mkey_lock); key = dev->priv.mkey_key++; - spin_unlock(&dev->priv.mkey_lock); + spin_unlock_irq(&dev->priv.mkey_lock); in->seg.qpn_mkey7_0 |= cpu_to_be32(key); in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_MKEY); - err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); + if (callback) { + err = mlx5_cmd_exec_cb(dev, in, inlen, out, sizeof(*out), + callback, context); + return err; + } else { + err = mlx5_cmd_exec(dev, in, inlen, &lout, sizeof(lout)); + } + if (err) { mlx5_core_dbg(dev, "cmd exec faile %d\n", err); return err; } - if (out.hdr.status) { - mlx5_core_dbg(dev, "status %d\n", out.hdr.status); - return mlx5_cmd_status_to_err(&out.hdr); + if (lout.hdr.status) { + mlx5_core_dbg(dev, "status %d\n", lout.hdr.status); + return mlx5_cmd_status_to_err(&lout.hdr); } - mr->key = mlx5_idx_to_mkey(be32_to_cpu(out.mkey) & 0xffffff) | key; - mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n", be32_to_cpu(out.mkey), key, mr->key); + mr->key = mlx5_idx_to_mkey(be32_to_cpu(lout.mkey) & 0xffffff) | key; + mlx5_core_dbg(dev, "out 0x%x, key 0x%x, mkey 0x%x\n", + be32_to_cpu(lout.mkey), key, mr->key); return err; } diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 6b8c496572c8..513619a75695 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -557,9 +557,11 @@ typedef void (*mlx5_cmd_cbk_t)(int status, void *context); struct mlx5_cmd_work_ent { struct mlx5_cmd_msg *in; struct mlx5_cmd_msg *out; + void *uout; + int uout_size; mlx5_cmd_cbk_t callback; void *context; - int idx; + int idx; struct completion done; struct mlx5_cmd *cmd; struct work_struct work; @@ -570,6 +572,7 @@ struct mlx5_cmd_work_ent { u8 token; struct timespec ts1; struct timespec ts2; + u16 op; }; struct mlx5_pas { @@ -653,6 +656,9 @@ void mlx5_cmd_use_polling(struct mlx5_core_dev *dev); int mlx5_cmd_status_to_err(struct mlx5_outbox_hdr *hdr); int mlx5_cmd_exec(struct mlx5_core_dev *dev, void *in, int in_size, void *out, int out_size); +int mlx5_cmd_exec_cb(struct mlx5_core_dev *dev, void *in, int in_size, + void *out, int out_size, mlx5_cmd_cbk_t callback, + void *context); int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn); int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn); int mlx5_alloc_uuars(struct mlx5_core_dev *dev, struct mlx5_uuar_info *uuari); @@ -676,7 +682,9 @@ int mlx5_core_query_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, int mlx5_core_arm_srq(struct mlx5_core_dev *dev, struct mlx5_core_srq *srq, u16 lwm, int is_srq); int mlx5_core_create_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, - struct mlx5_create_mkey_mbox_in *in, int inlen); + struct mlx5_create_mkey_mbox_in *in, int inlen, + mlx5_cmd_cbk_t callback, void *context, + struct mlx5_create_mkey_mbox_out *out); int mlx5_core_destroy_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr); int mlx5_core_query_mkey(struct mlx5_core_dev *dev, struct mlx5_core_mr *mr, struct mlx5_query_mkey_mbox_out *out, int outlen); @@ -745,6 +753,11 @@ static inline u32 mlx5_idx_to_mkey(u32 mkey_idx) return mkey_idx << 8; } +static inline u8 mlx5_mkey_variant(u32 mkey) +{ + return mkey & 0xff; +} + enum { MLX5_PROF_MASK_QP_SIZE = (u64)1 << 0, MLX5_PROF_MASK_MR_CACHE = (u64)1 << 1, From 9641b74ebee65320fa52172995d6df12b641caa5 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 23 Oct 2013 09:53:15 +0300 Subject: [PATCH 14/62] IB/mlx5: Fix overflow check in IB_WR_FAST_REG_MR Make sure not to overflow when reading the page list from struct ib_fast_reg_page_list. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/qp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index e3881433f5d7..8a36fd78c89f 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1915,6 +1915,10 @@ static int set_frwr_li_wr(void **seg, struct ib_send_wr *wr, int *size, if (unlikely((*seg == qp->sq.qend))) *seg = mlx5_get_send_wqe(qp, 0); if (!li) { + if (unlikely(wr->wr.fast_reg.page_list_len > + wr->wr.fast_reg.page_list->max_page_list_len)) + return -ENOMEM; + set_frwr_pages(*seg, wr, mdev, pd, writ); *seg += sizeof(struct mlx5_wqe_data_seg); *size += (sizeof(struct mlx5_wqe_data_seg) / 16); From 1faacf82dfb3e0027087ff7e6aae5e0643b98a4d Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 23 Oct 2013 09:53:16 +0300 Subject: [PATCH 15/62] IB/mlx5: Simplify mlx5_ib_destroy_srq Make use of destroy_srq_kernel() to clear SRQ resouces. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/srq.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index 0aa478bc291a..f1e5845b42b0 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -390,9 +390,7 @@ int mlx5_ib_destroy_srq(struct ib_srq *srq) mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db); ib_umem_release(msrq->umem); } else { - kfree(msrq->wrid); - mlx5_buf_free(&dev->mdev, &msrq->buf); - mlx5_db_free(&dev->mdev, &msrq->db); + destroy_srq_kernel(dev, msrq); } kfree(srq); From cfd8f1d49b61b20aab77d5af5ec907dc99bb0064 Mon Sep 17 00:00:00 2001 From: Moshe Lazer Date: Wed, 23 Oct 2013 09:53:17 +0300 Subject: [PATCH 16/62] IB/mlx5: Fix srq free in destroy qp On destroy QP the driver walks over the relevant CQ and removes CQEs reported for the destroyed QP. It also frees the related SRQ entry without checking that this is actually an SRQ-related CQE. In case of a CQ used for both send and receive QP, we could free SRQ entries for send CQEs. This patch resolves this issue by verifying that this is a SRQ related CQE by checking the SRQ number in the CQE is not zero. Signed-off-by: Moshe Lazer Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/cq.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index e7c64c57d699..84591865e39d 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -750,17 +750,9 @@ int mlx5_ib_destroy_cq(struct ib_cq *cq) return 0; } -static int is_equal_rsn(struct mlx5_cqe64 *cqe64, struct mlx5_ib_srq *srq, - u32 rsn) +static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn) { - u32 lrsn; - - if (srq) - lrsn = be32_to_cpu(cqe64->srqn) & 0xffffff; - else - lrsn = be32_to_cpu(cqe64->sop_drop_qpn) & 0xffffff; - - return rsn == lrsn; + return rsn == (ntohl(cqe64->sop_drop_qpn) & 0xffffff); } void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq) @@ -790,8 +782,8 @@ void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq) while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) { cqe = get_cqe(cq, prod_index & cq->ibcq.cqe); cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64; - if (is_equal_rsn(cqe64, srq, rsn)) { - if (srq) + if (is_equal_rsn(cqe64, rsn)) { + if (srq && (ntohl(cqe64->srqn) & 0xffffff)) mlx5_ib_free_srq_wqe(srq, be16_to_cpu(cqe64->wqe_counter)); ++nfreed; } else if (nfreed) { From 952f5f6e807ba82e1b82fcfcf7f73db022342aa7 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 23 Oct 2013 09:53:18 +0300 Subject: [PATCH 17/62] mlx5: Fix cleanup flow when DMA mapping fails If DMA mapping fails, the driver cleared the object that holds the previously DMA mapped pages. Fix this by allocating a new object for the command that reports back to firmware that pages can't be supplied. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- .../ethernet/mellanox/mlx5/core/pagealloc.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index 7b12acf210f8..a0d0da35578c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -181,6 +181,7 @@ static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, { struct mlx5_manage_pages_inbox *in; struct mlx5_manage_pages_outbox out; + struct mlx5_manage_pages_inbox *nin; struct page *page; int inlen; u64 addr; @@ -247,13 +248,20 @@ static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, out_alloc: if (notify_fail) { - memset(in, 0, inlen); + nin = kzalloc(sizeof(*nin), GFP_KERNEL); + if (!nin) { + mlx5_core_warn(dev, "allocation failed\n"); + goto unmap; + } memset(&out, 0, sizeof(out)); - in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MANAGE_PAGES); - in->hdr.opmod = cpu_to_be16(MLX5_PAGES_CANT_GIVE); - if (mlx5_cmd_exec(dev, in, sizeof(*in), &out, sizeof(out))) - mlx5_core_warn(dev, "\n"); + nin->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MANAGE_PAGES); + nin->hdr.opmod = cpu_to_be16(MLX5_PAGES_CANT_GIVE); + if (mlx5_cmd_exec(dev, nin, sizeof(*nin), &out, sizeof(out))) + mlx5_core_warn(dev, "page notify failed\n"); + kfree(nin); } + +unmap: for (i--; i >= 0; i--) { addr = be64_to_cpu(in->pas[i]); page = remove_page(dev, addr); From bf0bf77f6519e5dcd57a77b47e1d151c1e81b7ec Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 23 Oct 2013 09:53:19 +0300 Subject: [PATCH 18/62] mlx5: Support communicating arbitrary host page size to firmware Connect-IB firmware requires 4K pages to be communicated with the driver. This patch breaks larger pages to 4K units to enable support for architectures utilizing larger page size, such as PowerPC. This patch also fixes several places that referred to PAGE_SHIFT instead of explicit 12 which is the inherent page shift on Connect-IB. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/cq.c | 2 +- drivers/infiniband/hw/mlx5/qp.c | 4 +- drivers/infiniband/hw/mlx5/srq.c | 4 +- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 2 +- .../ethernet/mellanox/mlx5/core/pagealloc.c | 174 ++++++++++++------ include/linux/mlx5/driver.h | 1 + 6 files changed, 127 insertions(+), 60 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 84591865e39d..eecac7b52bcb 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -620,7 +620,7 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, } mlx5_fill_page_array(&cq->buf.buf, (*cqb)->pas); - (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - PAGE_SHIFT; + (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - 12; *index = dev->mdev.priv.uuari.uars[0].index; return 0; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 8a36fd78c89f..ce14008a5702 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -551,7 +551,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, } mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0); (*in)->ctx.log_pg_sz_remote_qpn = - cpu_to_be32((page_shift - PAGE_SHIFT) << 24); + cpu_to_be32((page_shift - 12) << 24); (*in)->ctx.params2 = cpu_to_be32(offset << 6); (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); @@ -648,7 +648,7 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, goto err_buf; } (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); - (*in)->ctx.log_pg_sz_remote_qpn = cpu_to_be32((qp->buf.page_shift - PAGE_SHIFT) << 24); + (*in)->ctx.log_pg_sz_remote_qpn = cpu_to_be32((qp->buf.page_shift - 12) << 24); /* Set "fast registration enabled" for all kernel QPs */ (*in)->ctx.params1 |= cpu_to_be32(1 << 11); (*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4); diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index f1e5845b42b0..dbc2c7188c5e 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -123,7 +123,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, goto err_in; } - (*in)->ctx.log_pg_sz = page_shift - PAGE_SHIFT; + (*in)->ctx.log_pg_sz = page_shift - 12; (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26); return 0; @@ -192,7 +192,7 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, } srq->wq_sig = !!srq_signature; - (*in)->ctx.log_pg_sz = page_shift - PAGE_SHIFT; + (*in)->ctx.log_pg_sz = page_shift - 12; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 2231d93cc7ad..6b4b436840bd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -354,7 +354,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_EQ); in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(eq->nent) << 24 | uar->index); in->ctx.intr = vecidx; - in->ctx.log_page_size = PAGE_SHIFT - 12; + in->ctx.log_page_size = eq->buf.page_shift - 12; in->events_mask = cpu_to_be64(mask); err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index a0d0da35578c..013aa422adee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -57,10 +57,13 @@ struct mlx5_pages_req { }; struct fw_page { - struct rb_node rb_node; - u64 addr; - struct page *page; - u16 func_id; + struct rb_node rb_node; + u64 addr; + struct page *page; + u16 func_id; + unsigned long bitmask; + struct list_head list; + unsigned free_count; }; struct mlx5_query_pages_inbox { @@ -94,6 +97,11 @@ enum { MAX_RECLAIM_TIME_MSECS = 5000, }; +enum { + MLX5_MAX_RECLAIM_TIME_MILI = 5000, + MLX5_NUM_4K_IN_PAGE = PAGE_SIZE / 4096, +}; + static int insert_page(struct mlx5_core_dev *dev, u64 addr, struct page *page, u16 func_id) { struct rb_root *root = &dev->priv.page_root; @@ -101,6 +109,7 @@ static int insert_page(struct mlx5_core_dev *dev, u64 addr, struct page *page, u struct rb_node *parent = NULL; struct fw_page *nfp; struct fw_page *tfp; + int i; while (*new) { parent = *new; @@ -113,25 +122,29 @@ static int insert_page(struct mlx5_core_dev *dev, u64 addr, struct page *page, u return -EEXIST; } - nfp = kmalloc(sizeof(*nfp), GFP_KERNEL); + nfp = kzalloc(sizeof(*nfp), GFP_KERNEL); if (!nfp) return -ENOMEM; nfp->addr = addr; nfp->page = page; nfp->func_id = func_id; + nfp->free_count = MLX5_NUM_4K_IN_PAGE; + for (i = 0; i < MLX5_NUM_4K_IN_PAGE; i++) + set_bit(i, &nfp->bitmask); rb_link_node(&nfp->rb_node, parent, new); rb_insert_color(&nfp->rb_node, root); + list_add(&nfp->list, &dev->priv.free_list); return 0; } -static struct page *remove_page(struct mlx5_core_dev *dev, u64 addr) +static struct fw_page *find_fw_page(struct mlx5_core_dev *dev, u64 addr) { struct rb_root *root = &dev->priv.page_root; struct rb_node *tmp = root->rb_node; - struct page *result = NULL; + struct fw_page *result = NULL; struct fw_page *tfp; while (tmp) { @@ -141,9 +154,7 @@ static struct page *remove_page(struct mlx5_core_dev *dev, u64 addr) } else if (tfp->addr > addr) { tmp = tmp->rb_right; } else { - rb_erase(&tfp->rb_node, root); - result = tfp->page; - kfree(tfp); + result = tfp; break; } } @@ -176,13 +187,97 @@ static int mlx5_cmd_query_pages(struct mlx5_core_dev *dev, u16 *func_id, return err; } +static int alloc_4k(struct mlx5_core_dev *dev, u64 *addr) +{ + struct fw_page *fp; + unsigned n; + + if (list_empty(&dev->priv.free_list)) { + return -ENOMEM; + mlx5_core_warn(dev, "\n"); + } + + fp = list_entry(dev->priv.free_list.next, struct fw_page, list); + n = find_first_bit(&fp->bitmask, 8 * sizeof(fp->bitmask)); + if (n >= MLX5_NUM_4K_IN_PAGE) { + mlx5_core_warn(dev, "alloc 4k bug\n"); + return -ENOENT; + } + clear_bit(n, &fp->bitmask); + fp->free_count--; + if (!fp->free_count) + list_del(&fp->list); + + *addr = fp->addr + n * 4096; + + return 0; +} + +static void free_4k(struct mlx5_core_dev *dev, u64 addr) +{ + struct fw_page *fwp; + int n; + + fwp = find_fw_page(dev, addr & PAGE_MASK); + if (!fwp) { + mlx5_core_warn(dev, "page not found\n"); + return; + } + + n = (addr & ~PAGE_MASK) % 4096; + fwp->free_count++; + set_bit(n, &fwp->bitmask); + if (fwp->free_count == MLX5_NUM_4K_IN_PAGE) { + rb_erase(&fwp->rb_node, &dev->priv.page_root); + list_del(&fwp->list); + dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL); + __free_page(fwp->page); + kfree(fwp); + } else if (fwp->free_count == 1) { + list_add(&fwp->list, &dev->priv.free_list); + } +} + +static int alloc_system_page(struct mlx5_core_dev *dev, u16 func_id) +{ + struct page *page; + u64 addr; + int err; + + page = alloc_page(GFP_HIGHUSER); + if (!page) { + mlx5_core_warn(dev, "failed to allocate page\n"); + return -ENOMEM; + } + addr = dma_map_page(&dev->pdev->dev, page, 0, + PAGE_SIZE, DMA_BIDIRECTIONAL); + if (dma_mapping_error(&dev->pdev->dev, addr)) { + mlx5_core_warn(dev, "failed dma mapping page\n"); + err = -ENOMEM; + goto out_alloc; + } + err = insert_page(dev, addr, page, func_id); + if (err) { + mlx5_core_err(dev, "failed to track allocated page\n"); + goto out_mapping; + } + + return 0; + +out_mapping: + dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL); + +out_alloc: + __free_page(page); + + return err; +} static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, int notify_fail) { struct mlx5_manage_pages_inbox *in; struct mlx5_manage_pages_outbox out; struct mlx5_manage_pages_inbox *nin; - struct page *page; int inlen; u64 addr; int err; @@ -197,27 +292,15 @@ static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, memset(&out, 0, sizeof(out)); for (i = 0; i < npages; i++) { - page = alloc_page(GFP_HIGHUSER); - if (!page) { - err = -ENOMEM; - mlx5_core_warn(dev, "failed to allocate page\n"); - goto out_alloc; - } - addr = dma_map_page(&dev->pdev->dev, page, 0, - PAGE_SIZE, DMA_BIDIRECTIONAL); - if (dma_mapping_error(&dev->pdev->dev, addr)) { - mlx5_core_warn(dev, "failed dma mapping page\n"); - __free_page(page); - err = -ENOMEM; - goto out_alloc; - } - err = insert_page(dev, addr, page, func_id); +retry: + err = alloc_4k(dev, &addr); if (err) { - mlx5_core_err(dev, "failed to track allocated page\n"); - dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL); - __free_page(page); - err = -ENOMEM; - goto out_alloc; + if (err == -ENOMEM) + err = alloc_system_page(dev, func_id); + if (err) + goto out_4k; + + goto retry; } in->pas[i] = cpu_to_be64(addr); } @@ -227,7 +310,6 @@ static int give_pages(struct mlx5_core_dev *dev, u16 func_id, int npages, in->func_id = cpu_to_be16(func_id); in->num_entries = cpu_to_be32(npages); err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); - mlx5_core_dbg(dev, "err %d\n", err); if (err) { mlx5_core_warn(dev, "func_id 0x%x, npages %d, err %d\n", func_id, npages, err); goto out_alloc; @@ -251,7 +333,7 @@ out_alloc: nin = kzalloc(sizeof(*nin), GFP_KERNEL); if (!nin) { mlx5_core_warn(dev, "allocation failed\n"); - goto unmap; + goto out_4k; } memset(&out, 0, sizeof(out)); nin->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_MANAGE_PAGES); @@ -261,19 +343,9 @@ out_alloc: kfree(nin); } -unmap: - for (i--; i >= 0; i--) { - addr = be64_to_cpu(in->pas[i]); - page = remove_page(dev, addr); - if (!page) { - mlx5_core_err(dev, "BUG: can't remove page at addr 0x%llx\n", - addr); - continue; - } - dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL); - __free_page(page); - } - +out_4k: + for (i--; i >= 0; i--) + free_4k(dev, be64_to_cpu(in->pas[i])); out_free: mlx5_vfree(in); return err; @@ -284,7 +356,6 @@ static int reclaim_pages(struct mlx5_core_dev *dev, u32 func_id, int npages, { struct mlx5_manage_pages_inbox in; struct mlx5_manage_pages_outbox *out; - struct page *page; int num_claimed; int outlen; u64 addr; @@ -323,13 +394,7 @@ static int reclaim_pages(struct mlx5_core_dev *dev, u32 func_id, int npages, for (i = 0; i < num_claimed; i++) { addr = be64_to_cpu(out->pas[i]); - page = remove_page(dev, addr); - if (!page) { - mlx5_core_warn(dev, "FW reported unknown DMA address 0x%llx\n", addr); - } else { - dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL); - __free_page(page); - } + free_4k(dev, addr); } out_free: @@ -435,6 +500,7 @@ int mlx5_reclaim_startup_pages(struct mlx5_core_dev *dev) void mlx5_pagealloc_init(struct mlx5_core_dev *dev) { dev->priv.page_root = RB_ROOT; + INIT_LIST_HEAD(&dev->priv.free_list); } void mlx5_pagealloc_cleanup(struct mlx5_core_dev *dev) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 513619a75695..554548cd3dd4 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -483,6 +483,7 @@ struct mlx5_priv { struct rb_root page_root; int fw_pages; int reg_pages; + struct list_head free_list; struct mlx5_core_health health; From 87b8de492da34942fc554f2958a570ce0642296a Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Wed, 23 Oct 2013 09:53:20 +0300 Subject: [PATCH 19/62] mlx5: Clear reserved area in set_hca_cap() Firmware spec requires reserved fields to be cleared when calling set_hca_cap. Current code queries and copy to the set area, possibly resulting in reserved bits not cleared. This patch copies only writable fields to the set area. Fix also typo - msx => max Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- .../net/ethernet/mellanox/mlx5/core/main.c | 35 +++++++++++++++++-- include/linux/mlx5/device.h | 9 +++-- 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index bc0f5fb66e24..40a9f5ed814d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -159,6 +159,36 @@ struct mlx5_reg_host_endianess { u8 rsvd[15]; }; + +#define CAP_MASK(pos, size) ((u64)((1 << (size)) - 1) << (pos)) + +enum { + MLX5_CAP_BITS_RW_MASK = CAP_MASK(MLX5_CAP_OFF_CMDIF_CSUM, 2) | + CAP_MASK(MLX5_CAP_OFF_DCT, 1), +}; + +/* selectively copy writable fields clearing any reserved area + */ +static void copy_rw_fields(struct mlx5_hca_cap *to, struct mlx5_hca_cap *from) +{ + u64 v64; + + to->log_max_qp = from->log_max_qp & 0x1f; + to->log_max_ra_req_dc = from->log_max_ra_req_dc & 0x3f; + to->log_max_ra_res_dc = from->log_max_ra_res_dc & 0x3f; + to->log_max_ra_req_qp = from->log_max_ra_req_qp & 0x3f; + to->log_max_ra_res_qp = from->log_max_ra_res_qp & 0x3f; + to->log_max_atomic_size_qp = from->log_max_atomic_size_qp; + to->log_max_atomic_size_dc = from->log_max_atomic_size_dc; + v64 = be64_to_cpu(from->flags) & MLX5_CAP_BITS_RW_MASK; + to->flags = cpu_to_be64(v64); +} + +enum { + HCA_CAP_OPMOD_GET_MAX = 0, + HCA_CAP_OPMOD_GET_CUR = 1, +}; + static int handle_hca_cap(struct mlx5_core_dev *dev) { struct mlx5_cmd_query_hca_cap_mbox_out *query_out = NULL; @@ -180,7 +210,7 @@ static int handle_hca_cap(struct mlx5_core_dev *dev) } query_ctx.hdr.opcode = cpu_to_be16(MLX5_CMD_OP_QUERY_HCA_CAP); - query_ctx.hdr.opmod = cpu_to_be16(0x1); + query_ctx.hdr.opmod = cpu_to_be16(HCA_CAP_OPMOD_GET_CUR); err = mlx5_cmd_exec(dev, &query_ctx, sizeof(query_ctx), query_out, sizeof(*query_out)); if (err) @@ -192,8 +222,7 @@ static int handle_hca_cap(struct mlx5_core_dev *dev) goto query_ex; } - memcpy(&set_ctx->hca_cap, &query_out->hca_cap, - sizeof(set_ctx->hca_cap)); + copy_rw_fields(&set_ctx->hca_cap, &query_out->hca_cap); if (dev->profile->mask & MLX5_PROF_MASK_QP_SIZE) set_ctx->hca_cap.log_max_qp = dev->profile->log_max_qp; diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 5eb4e31af22b..3d789f43e34b 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -230,6 +230,11 @@ enum { MLX5_MAX_PAGE_SHIFT = 31 }; +enum { + MLX5_CAP_OFF_DCT = 41, + MLX5_CAP_OFF_CMDIF_CSUM = 46, +}; + struct mlx5_inbox_hdr { __be16 opcode; u8 rsvd[4]; @@ -319,9 +324,9 @@ struct mlx5_hca_cap { u8 rsvd25[42]; __be16 log_uar_page_sz; u8 rsvd26[28]; - u8 log_msx_atomic_size_qp; + u8 log_max_atomic_size_qp; u8 rsvd27[2]; - u8 log_msx_atomic_size_dc; + u8 log_max_atomic_size_dc; u8 rsvd28[76]; }; From 4e3d677ba986d5c8e76ee1742c1d4d79bc197d5c Mon Sep 17 00:00:00 2001 From: Moshe Lazer Date: Wed, 23 Oct 2013 09:53:21 +0300 Subject: [PATCH 20/62] mlx5_core: Change optimal_reclaimed_pages for better performance Change optimal_reclaimed_pages() to increase the output size of each reclaim pages command. This change reduces significantly the amount of reclaim pages commands issued to FW when the driver is unloaded which reduces the overall driver unload time. Signed-off-by: Moshe Lazer Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index 013aa422adee..ba816c25c5c1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -454,14 +454,19 @@ int mlx5_satisfy_startup_pages(struct mlx5_core_dev *dev, int boot) return give_pages(dev, func_id, npages, 0); } +enum { + MLX5_BLKS_FOR_RECLAIM_PAGES = 12 +}; + static int optimal_reclaimed_pages(void) { struct mlx5_cmd_prot_block *block; struct mlx5_cmd_layout *lay; int ret; - ret = (sizeof(lay->in) + sizeof(block->data) - - sizeof(struct mlx5_manage_pages_outbox)) / 8; + ret = (sizeof(lay->out) + MLX5_BLKS_FOR_RECLAIM_PAGES * sizeof(block->data) - + sizeof(struct mlx5_manage_pages_outbox)) / + FIELD_SIZEOF(struct mlx5_manage_pages_outbox, pas[0]); return ret; } From 2d036fad949080f178e02b12c93a61367e9f562f Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 24 Oct 2013 12:01:00 +0300 Subject: [PATCH 21/62] IB/mlx5: Remove dead code in mr.c In mlx5_mr_cache_init() the size variable is not used so remove it to avoid compiler warnings when running with make W=1. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/mr.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index d7f202290747..2c4626f7b1b8 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -40,7 +40,6 @@ #include "mlx5_ib.h" enum { - DEF_CACHE_SIZE = 10, MAX_PENDING_REG_MR = 8, }; @@ -550,7 +549,6 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) struct mlx5_mr_cache *cache = &dev->cache; struct mlx5_cache_ent *ent; int limit; - int size; int err; int i; @@ -571,13 +569,11 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev) ent->order = i + 2; ent->dev = dev; - if (dev->mdev.profile->mask & MLX5_PROF_MASK_MR_CACHE) { - size = dev->mdev.profile->mr_cache[i].size; + if (dev->mdev.profile->mask & MLX5_PROF_MASK_MR_CACHE) limit = dev->mdev.profile->mr_cache[i].limit; - } else { - size = DEF_CACHE_SIZE; + else limit = 0; - } + INIT_WORK(&ent->work, cache_work_func); INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func); ent->limit = limit; From 07c9113fe8c67e28707b0a4b1e8580abe0327145 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 24 Oct 2013 12:01:01 +0300 Subject: [PATCH 22/62] IB/mlx5: Remove "Always false" comparison mlx5_cur and mlx5_new cannot have negative values so remove the redundant condition. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index ce14008a5702..2e4c4656e047 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1550,7 +1550,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, mlx5_cur = to_mlx5_state(cur_state); mlx5_new = to_mlx5_state(new_state); mlx5_st = to_mlx5_st(ibqp->qp_type); - if (mlx5_cur < 0 || mlx5_new < 0 || mlx5_st < 0) + if (mlx5_st < 0) goto out; optpar = ib_mask_to_mlx5_opt(attr_mask); From c2a3431e6153ed90911704356bc1e869624e118d Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 24 Oct 2013 12:01:02 +0300 Subject: [PATCH 23/62] IB/mlx5: Update opt param mask for RTS2RTS RTS to RTS transition should allow update of alternate path. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/qp.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 2e4c4656e047..5bcf57943b84 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1317,9 +1317,11 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q MLX5_QP_OPTPAR_RAE | MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RNR_TIMEOUT | - MLX5_QP_OPTPAR_PM_STATE, + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE | - MLX5_QP_OPTPAR_PM_STATE, + MLX5_QP_OPTPAR_PM_STATE | + MLX5_QP_OPTPAR_ALT_ADDR_PATH, [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY | MLX5_QP_OPTPAR_SRQN | MLX5_QP_OPTPAR_CQN_RCV, From 1b77d2bd753d119eedcbc08fda58934307676554 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 24 Oct 2013 12:01:03 +0300 Subject: [PATCH 24/62] mlx5: Use enum to indicate adapter page size The Connect-IB adapter has an inherent page size which equals 4K. Define an new enum that equals the page shift and use it instead of using the value 12 throughout the code. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/cq.c | 2 +- drivers/infiniband/hw/mlx5/qp.c | 5 +++-- drivers/infiniband/hw/mlx5/srq.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 2 +- include/linux/mlx5/device.h | 4 ++++ 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index eecac7b52bcb..28344773f640 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -620,7 +620,7 @@ static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq, } mlx5_fill_page_array(&cq->buf.buf, (*cqb)->pas); - (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - 12; + (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; *index = dev->mdev.priv.uuari.uars[0].index; return 0; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 5bcf57943b84..7c6b4ba49bec 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -551,7 +551,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, } mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0); (*in)->ctx.log_pg_sz_remote_qpn = - cpu_to_be32((page_shift - 12) << 24); + cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); (*in)->ctx.params2 = cpu_to_be32(offset << 6); (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); @@ -648,7 +648,8 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, goto err_buf; } (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index); - (*in)->ctx.log_pg_sz_remote_qpn = cpu_to_be32((qp->buf.page_shift - 12) << 24); + (*in)->ctx.log_pg_sz_remote_qpn = + cpu_to_be32((qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); /* Set "fast registration enabled" for all kernel QPs */ (*in)->ctx.params1 |= cpu_to_be32(1 << 11); (*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4); diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index dbc2c7188c5e..210b3eaf188a 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -123,7 +123,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, goto err_in; } - (*in)->ctx.log_pg_sz = page_shift - 12; + (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26); return 0; @@ -192,7 +192,7 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, } srq->wq_sig = !!srq_signature; - (*in)->ctx.log_pg_sz = page_shift - 12; + (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 6b4b436840bd..64a61b286b2c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -354,7 +354,7 @@ int mlx5_create_map_eq(struct mlx5_core_dev *dev, struct mlx5_eq *eq, u8 vecidx, in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_EQ); in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(eq->nent) << 24 | uar->index); in->ctx.intr = vecidx; - in->ctx.log_page_size = eq->buf.page_shift - 12; + in->ctx.log_page_size = eq->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT; in->events_mask = cpu_to_be64(mask); err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 3d789f43e34b..da78875807fc 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -230,6 +230,10 @@ enum { MLX5_MAX_PAGE_SHIFT = 31 }; +enum { + MLX5_ADAPTER_PAGE_SHIFT = 12 +}; + enum { MLX5_CAP_OFF_DCT = 41, MLX5_CAP_OFF_CMDIF_CSUM = 46, From 6ebacdfc07ca61ba258693c9b4c88f1ffbe8ccd7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 6 Sep 2013 11:50:46 +0300 Subject: [PATCH 25/62] RDMA/ocrdma: Silence an integer underflow warning We recently added a cap on "max_wqe_allocated" in 43a6b4025c ('RDMA/ocrdma: Create IRD queue fix'). My static checker complains that the cap has a problem because it casts large values to negative. "attrs->cap.max_send_wr" is a u32. It comes from the user, but it's capped in ocrdma_check_qp_params() so it can't wrap here. Signed-off-by: Dan Carpenter Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index 50219ab2279d..56bf32fcb62c 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -1783,7 +1783,7 @@ static int ocrdma_set_create_qp_sq_cmd(struct ocrdma_create_qp_req *cmd, u32 max_sges = attrs->cap.max_send_sge; /* QP1 may exceed 127 */ - max_wqe_allocated = min_t(int, attrs->cap.max_send_wr + 1, + max_wqe_allocated = min_t(u32, attrs->cap.max_send_wr + 1, dev->attr.max_wqe); status = ocrdma_build_q_conf(&max_wqe_allocated, From 1852d1da3b4723d7db0d490f54c07442b3e1c452 Mon Sep 17 00:00:00 2001 From: Naresh Gottumukkala Date: Fri, 6 Sep 2013 15:02:47 +0530 Subject: [PATCH 26/62] RDMA/ocrdma: Fix a crash in rmmod 1) ocrdma_remove_free() is called from a call_rcu callback funtion context, which can be a bottom-half context. So the code in ocrdma_remove_free should not sleep. But ocrdma_cleanup_hw() can sleep, So move it ocrdma_remove() instead of ocrdma_remove_free. 2) Fix a couple of kbuild test robot warnings. Signed-off-by: Naresh Gottumukkala Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma.h | 53 +++++++++++---------- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 7 +-- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 2 +- 3 files changed, 32 insertions(+), 30 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h index adc11d14f878..294dd27b601e 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma.h @@ -122,6 +122,32 @@ struct mqe_ctx { bool cmd_done; }; +struct ocrdma_hw_mr { + u32 lkey; + u8 fr_mr; + u8 remote_atomic; + u8 remote_rd; + u8 remote_wr; + u8 local_rd; + u8 local_wr; + u8 mw_bind; + u8 rsvd; + u64 len; + struct ocrdma_pbl *pbl_table; + u32 num_pbls; + u32 num_pbes; + u32 pbl_size; + u32 pbe_size; + u64 fbo; + u64 va; +}; + +struct ocrdma_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + struct ocrdma_hw_mr hwmr; +}; + struct ocrdma_dev { struct ib_device ibdev; struct ocrdma_dev_attr attr; @@ -169,7 +195,7 @@ struct ocrdma_dev { struct list_head entry; struct rcu_head rcu; int id; - u64 stag_arr[OCRDMA_MAX_STAG]; + struct ocrdma_mr *stag_arr[OCRDMA_MAX_STAG]; u16 pvid; }; @@ -294,31 +320,6 @@ struct ocrdma_qp { u16 db_cache; }; -struct ocrdma_hw_mr { - u32 lkey; - u8 fr_mr; - u8 remote_atomic; - u8 remote_rd; - u8 remote_wr; - u8 local_rd; - u8 local_wr; - u8 mw_bind; - u8 rsvd; - u64 len; - struct ocrdma_pbl *pbl_table; - u32 num_pbls; - u32 num_pbes; - u32 pbl_size; - u32 pbe_size; - u64 fbo; - u64 va; -}; - -struct ocrdma_mr { - struct ib_mr ibmr; - struct ib_umem *umem; - struct ocrdma_hw_mr hwmr; -}; struct ocrdma_ucontext { struct ib_ucontext ibucontext; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 0ce7674621ea..91443bcb9e0e 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -452,9 +452,6 @@ static void ocrdma_remove_free(struct rcu_head *rcu) { struct ocrdma_dev *dev = container_of(rcu, struct ocrdma_dev, rcu); - ocrdma_free_resources(dev); - ocrdma_cleanup_hw(dev); - idr_remove(&ocrdma_dev_id, dev->id); kfree(dev->mbx_cmd); ib_dealloc_device(&dev->ibdev); @@ -470,6 +467,10 @@ static void ocrdma_remove(struct ocrdma_dev *dev) spin_lock(&ocrdma_devlist_lock); list_del_rcu(&dev->entry); spin_unlock(&ocrdma_devlist_lock); + + ocrdma_free_resources(dev); + ocrdma_cleanup_hw(dev); + call_rcu(&dev->rcu, ocrdma_remove_free); } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 69f1d1221a6b..d4fbf916d3cc 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -2839,7 +2839,7 @@ struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *ibpd, int max_page_list_len) goto mbx_err; mr->ibmr.rkey = mr->hwmr.lkey; mr->ibmr.lkey = mr->hwmr.lkey; - dev->stag_arr[(mr->hwmr.lkey >> 8) & (OCRDMA_MAX_STAG - 1)] = (unsigned long) mr; + dev->stag_arr[(mr->hwmr.lkey >> 8) & (OCRDMA_MAX_STAG - 1)] = mr; return &mr->ibmr; mbx_err: ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); From d5e3f378337581af86a8b86a4db8fd6e164eae3c Mon Sep 17 00:00:00 2001 From: Naresh Gottumukkala Date: Mon, 28 Oct 2013 17:29:34 +0530 Subject: [PATCH 27/62] RDMA/ocrdma: Remove redundant check in ocrdma_build_fr() Remove the redundant check of comparing if a 32-bit value is greater than 0xffffffffULL. Reported by Dan Carpenter. Signed-off-by: Naresh Gottumukkala Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index d4fbf916d3cc..7686dceadd29 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -1981,9 +1981,7 @@ static int ocrdma_build_fr(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr, wqe_size = roundup(wqe_size, OCRDMA_WQE_ALIGN_BYTES); - if ((wr->wr.fast_reg.page_list_len > - qp->dev->attr.max_pages_per_frmr) || - (wr->wr.fast_reg.length > 0xffffffffULL)) + if (wr->wr.fast_reg.page_list_len > qp->dev->attr.max_pages_per_frmr) return -EINVAL; hdr->cw |= (OCRDMA_FR_MR << OCRDMA_WQE_OPCODE_SHIFT); From 4adcf7fb6783e354aab38824d803fa8c4f8e8a27 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 4 Oct 2013 09:29:06 -0400 Subject: [PATCH 28/62] IB/ipath: Convert ipath_user_sdma_pin_pages() to use get_user_pages_fast() ipath_user_sdma_queue_pkts() gets called with mmap_sem held for writing. Except for get_user_pages() deep down in ipath_user_sdma_pin_pages() we don't seem to need mmap_sem at all. Even more interestingly the function ipath_user_sdma_queue_pkts() (and also ipath_user_sdma_coalesce() called somewhat later) call copy_from_user() which can hit a page fault and we deadlock on trying to get mmap_sem when handling that fault. So just make ipath_user_sdma_pin_pages() use get_user_pages_fast() and leave mmap_sem locking for mm. This deadlock has actually been observed in the wild when the node is under memory pressure. Cc: Signed-off-by: Jan Kara Signed-off-by: Mike Marciniszyn [ Merged in fix for call to get_user_pages_fast from Tetsuo Handa . - Roland ] Signed-off-by: Roland Dreier --- drivers/infiniband/hw/ipath/ipath_user_sdma.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/drivers/infiniband/hw/ipath/ipath_user_sdma.c index f5cb13b21445..cc04b7ba3488 100644 --- a/drivers/infiniband/hw/ipath/ipath_user_sdma.c +++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.c @@ -280,9 +280,7 @@ static int ipath_user_sdma_pin_pages(const struct ipath_devdata *dd, int j; int ret; - ret = get_user_pages(current, current->mm, addr, - npages, 0, 1, pages, NULL); - + ret = get_user_pages_fast(addr, npages, 0, pages); if (ret != npages) { int i; @@ -811,10 +809,7 @@ int ipath_user_sdma_writev(struct ipath_devdata *dd, while (dim) { const int mxp = 8; - down_write(¤t->mm->mmap_sem); ret = ipath_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp); - up_write(¤t->mm->mmap_sem); - if (ret <= 0) goto done_unlock; else { From 603e7729920e42b3c2f4dbfab9eef4878cb6e8fa Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 4 Oct 2013 09:29:12 -0400 Subject: [PATCH 29/62] IB/qib: Convert qib_user_sdma_pin_pages() to use get_user_pages_fast() qib_user_sdma_queue_pkts() gets called with mmap_sem held for writing. Except for get_user_pages() deep down in qib_user_sdma_pin_pages() we don't seem to need mmap_sem at all. Even more interestingly the function qib_user_sdma_queue_pkts() (and also qib_user_sdma_coalesce() called somewhat later) call copy_from_user() which can hit a page fault and we deadlock on trying to get mmap_sem when handling that fault. So just make qib_user_sdma_pin_pages() use get_user_pages_fast() and leave mmap_sem locking for mm. This deadlock has actually been observed in the wild when the node is under memory pressure. Cc: Reviewed-by: Mike Marciniszyn Signed-off-by: Jan Kara Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_user_sdma.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c index d0a0ea0c14d6..165aee2ca8a0 100644 --- a/drivers/infiniband/hw/qib/qib_user_sdma.c +++ b/drivers/infiniband/hw/qib/qib_user_sdma.c @@ -594,8 +594,7 @@ static int qib_user_sdma_pin_pages(const struct qib_devdata *dd, else j = npages; - ret = get_user_pages(current, current->mm, addr, - j, 0, 1, pages, NULL); + ret = get_user_pages_fast(addr, j, 0, pages); if (ret != j) { i = 0; j = ret; @@ -1294,11 +1293,8 @@ int qib_user_sdma_writev(struct qib_ctxtdata *rcd, int mxp = 8; int ndesc = 0; - down_write(¤t->mm->mmap_sem); ret = qib_user_sdma_queue_pkts(dd, ppd, pq, iov, dim, &list, &mxp, &ndesc); - up_write(¤t->mm->mmap_sem); - if (ret < 0) goto done_unlock; else { From 78a5886472085a6e458824858a8c8338113aec4e Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Thu, 24 Oct 2013 11:40:37 -0400 Subject: [PATCH 30/62] IB/qib: Fix checkpatch __packed warnings Convert __attribute__ ((packed)) to __packed. Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_mad.h | 14 +++++++------- drivers/infiniband/hw/qib/qib_verbs.h | 10 +++++----- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_mad.h b/drivers/infiniband/hw/qib/qib_mad.h index 28874f8606f8..941d4d50d8e7 100644 --- a/drivers/infiniband/hw/qib/qib_mad.h +++ b/drivers/infiniband/hw/qib/qib_mad.h @@ -54,7 +54,7 @@ struct ib_node_info { __be32 revision; u8 local_port_num; u8 vendor_id[3]; -} __attribute__ ((packed)); +} __packed; struct ib_mad_notice_attr { u8 generic_type; @@ -73,7 +73,7 @@ struct ib_mad_notice_attr { __be16 reserved; __be16 lid; /* where violation happened */ u8 port_num; /* where violation happened */ - } __attribute__ ((packed)) ntc_129_131; + } __packed ntc_129_131; struct { __be16 reserved; @@ -83,14 +83,14 @@ struct ib_mad_notice_attr { __be32 new_cap_mask; /* new capability mask */ u8 reserved3; u8 change_flags; /* low 3 bits only */ - } __attribute__ ((packed)) ntc_144; + } __packed ntc_144; struct { __be16 reserved; __be16 lid; /* lid where sys guid changed */ __be16 reserved2; __be64 new_sys_guid; - } __attribute__ ((packed)) ntc_145; + } __packed ntc_145; struct { __be16 reserved; @@ -104,7 +104,7 @@ struct ib_mad_notice_attr { u8 reserved3; u8 dr_trunc_hop; u8 dr_rtn_path[30]; - } __attribute__ ((packed)) ntc_256; + } __packed ntc_256; struct { __be16 reserved; @@ -115,7 +115,7 @@ struct ib_mad_notice_attr { __be32 qp2; /* high 8 bits reserved */ union ib_gid gid1; union ib_gid gid2; - } __attribute__ ((packed)) ntc_257_258; + } __packed ntc_257_258; } details; }; @@ -209,7 +209,7 @@ struct ib_pma_portcounters_cong { __be64 port_rcv_packets; __be64 port_xmit_wait; __be64 port_adr_events; -} __attribute__ ((packed)); +} __packed; #define IB_PMA_CONG_HW_CONTROL_TIMER 0x00 #define IB_PMA_CONG_HW_CONTROL_SAMPLE 0x01 diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 012e2c7575ad..a01c7d2cf541 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -150,14 +150,14 @@ struct ib_reth { __be64 vaddr; __be32 rkey; __be32 length; -} __attribute__ ((packed)); +} __packed; struct ib_atomic_eth { __be32 vaddr[2]; /* unaligned so access as 2 32-bit words */ __be32 rkey; __be64 swap_data; __be64 compare_data; -} __attribute__ ((packed)); +} __packed; struct qib_other_headers { __be32 bth[3]; @@ -178,7 +178,7 @@ struct qib_other_headers { __be32 aeth; struct ib_atomic_eth atomic_eth; } u; -} __attribute__ ((packed)); +} __packed; /* * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes @@ -195,12 +195,12 @@ struct qib_ib_header { } l; struct qib_other_headers oth; } u; -} __attribute__ ((packed)); +} __packed; struct qib_pio_header { __le32 pbc[2]; struct qib_ib_header hdr; -} __attribute__ ((packed)); +} __packed; /* * There is one struct qib_mcast for each multicast GID. From 2fadd83184d58701f1116ca578465b5a75f9417c Mon Sep 17 00:00:00 2001 From: Mike Marciniszyn Date: Fri, 25 Oct 2013 11:17:59 -0400 Subject: [PATCH 31/62] IB/qib: Fix txselect regression Commit 7fac33014f54("IB/qib: checkpatch fixes") was overzealous in removing a simple_strtoul for a parse routine, setup_txselect(). That routine is required to handle a multi-value string. Unwind that aspect of the fix. Cc: Signed-off-by: Mike Marciniszyn Signed-off-by: Roland Dreier --- drivers/infiniband/hw/qib/qib_iba7322.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 016e7429adf6..5bfc02f450e6 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -6190,21 +6190,20 @@ static int setup_txselect(const char *str, struct kernel_param *kp) { struct qib_devdata *dd; unsigned long val; - int ret; - + char *n; if (strlen(str) >= MAX_ATTEN_LEN) { pr_info("txselect_values string too long\n"); return -ENOSPC; } - ret = kstrtoul(str, 0, &val); - if (ret || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + + val = simple_strtoul(str, &n, 0); + if (n == str || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + TXDDS_MFG_SZ)) { pr_info("txselect_values must start with a number < %d\n", TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + TXDDS_MFG_SZ); - return ret ? ret : -EINVAL; + return -EINVAL; } - strcpy(txselect_list, str); + list_for_each_entry(dd, &qib_dev_list, list) if (dd->deviceid == PCI_DEVICE_ID_QLOGIC_IB_7322) set_no_qsfp_atten(dd, 1); From 7bb312e4a2f323fa460bbf9f33eeb00b5dabdb6b Mon Sep 17 00:00:00 2001 From: Vu Pham Date: Sat, 26 Oct 2013 14:31:27 +0200 Subject: [PATCH 32/62] IB/srp: Make transport layer retry count configurable Allow the InfiniBand RC retry count to be configured by the user as an option in the target login string. Reducing this retry count allows to reduce the path failover time. Signed-off-by: Vu Pham [ bvanassche: Rewrote patch description / changed default retry count ] Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- Documentation/ABI/stable/sysfs-driver-ib_srp | 2 ++ drivers/infiniband/ulp/srp/ib_srp.c | 24 +++++++++++++++++++- drivers/infiniband/ulp/srp/ib_srp.h | 1 + 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/Documentation/ABI/stable/sysfs-driver-ib_srp b/Documentation/ABI/stable/sysfs-driver-ib_srp index 5c53d28f775c..18e9b279ebb1 100644 --- a/Documentation/ABI/stable/sysfs-driver-ib_srp +++ b/Documentation/ABI/stable/sysfs-driver-ib_srp @@ -61,6 +61,8 @@ Description: Interface for making ib_srp connect to a new target. interrupt is handled by a different CPU then the comp_vector parameter can be used to spread the SRP completion workload over multiple CPU's. + * tl_retry_count, a number in the range 2..7 specifying the + IB RC retry count. What: /sys/class/infiniband_srp/srp--/ibdev Date: January 2, 2006 diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index f93baf8254c4..11ebf3e7646a 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -390,7 +390,7 @@ static int srp_send_req(struct srp_target_port *target) req->param.responder_resources = 4; req->param.remote_cm_response_timeout = 20; req->param.local_cm_response_timeout = 20; - req->param.retry_count = 7; + req->param.retry_count = target->tl_retry_count; req->param.rnr_retry_count = 7; req->param.max_cm_retries = 15; @@ -1907,6 +1907,14 @@ static ssize_t show_comp_vector(struct device *dev, return sprintf(buf, "%d\n", target->comp_vector); } +static ssize_t show_tl_retry_count(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%d\n", target->tl_retry_count); +} + static ssize_t show_cmd_sg_entries(struct device *dev, struct device_attribute *attr, char *buf) { @@ -1934,6 +1942,7 @@ static DEVICE_ATTR(zero_req_lim, S_IRUGO, show_zero_req_lim, NULL); static DEVICE_ATTR(local_ib_port, S_IRUGO, show_local_ib_port, NULL); static DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL); static DEVICE_ATTR(comp_vector, S_IRUGO, show_comp_vector, NULL); +static DEVICE_ATTR(tl_retry_count, S_IRUGO, show_tl_retry_count, NULL); static DEVICE_ATTR(cmd_sg_entries, S_IRUGO, show_cmd_sg_entries, NULL); static DEVICE_ATTR(allow_ext_sg, S_IRUGO, show_allow_ext_sg, NULL); @@ -1949,6 +1958,7 @@ static struct device_attribute *srp_host_attrs[] = { &dev_attr_local_ib_port, &dev_attr_local_ib_device, &dev_attr_comp_vector, + &dev_attr_tl_retry_count, &dev_attr_cmd_sg_entries, &dev_attr_allow_ext_sg, NULL @@ -2073,6 +2083,7 @@ enum { SRP_OPT_ALLOW_EXT_SG = 1 << 10, SRP_OPT_SG_TABLESIZE = 1 << 11, SRP_OPT_COMP_VECTOR = 1 << 12, + SRP_OPT_TL_RETRY_COUNT = 1 << 13, SRP_OPT_ALL = (SRP_OPT_ID_EXT | SRP_OPT_IOC_GUID | SRP_OPT_DGID | @@ -2094,6 +2105,7 @@ static const match_table_t srp_opt_tokens = { { SRP_OPT_ALLOW_EXT_SG, "allow_ext_sg=%u" }, { SRP_OPT_SG_TABLESIZE, "sg_tablesize=%u" }, { SRP_OPT_COMP_VECTOR, "comp_vector=%u" }, + { SRP_OPT_TL_RETRY_COUNT, "tl_retry_count=%u" }, { SRP_OPT_ERR, NULL } }; @@ -2257,6 +2269,15 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) target->comp_vector = token; break; + case SRP_OPT_TL_RETRY_COUNT: + if (match_int(args, &token) || token < 2 || token > 7) { + pr_warn("bad tl_retry_count parameter '%s' (must be a number between 2 and 7)\n", + p); + goto out; + } + target->tl_retry_count = token; + break; + default: pr_warn("unknown parameter or missing value '%s' in target creation request\n", p); @@ -2311,6 +2332,7 @@ static ssize_t srp_create_target(struct device *dev, target->cmd_sg_cnt = cmd_sg_entries; target->sg_tablesize = indirect_sg_entries ? : cmd_sg_entries; target->allow_ext_sg = allow_ext_sg; + target->tl_retry_count = 7; ret = srp_parse_options(buf, target); if (ret) diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index e641088c14dc..84d821b24ec7 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -157,6 +157,7 @@ struct srp_target_port { unsigned int scsi_id; unsigned int sg_tablesize; int comp_vector; + int tl_retry_count; struct ib_sa_path_rec path; __be16 orig_dgid[8]; From 9dd69a600a680fab1c9235a644c886d8d6a2da2a Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 26 Oct 2013 14:32:30 +0200 Subject: [PATCH 33/62] IB/srp: Keep rport as long as the IB transport layer Keep the rport data structure around after srp_remove_host() has finished until cleanup of the IB transport layer has finished completely. This is necessary because later patches use the rport pointer inside the queuecommand callback. Without this patch accessing the rport from inside a queuecommand callback is racy because srp_remove_host() must be invoked before scsi_remove_host() and because the queuecommand callback could get invoked after srp_remove_host() has finished. In other words, without this patch the queuecommand callback can get invoked after the rport data structure has been freed. Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 3 +++ drivers/infiniband/ulp/srp/ib_srp.h | 1 + drivers/scsi/scsi_transport_srp.c | 18 ++++++++++++++++++ include/scsi/scsi_transport_srp.h | 2 ++ 4 files changed, 24 insertions(+) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 11ebf3e7646a..6edab7855f5e 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -528,11 +528,13 @@ static void srp_remove_target(struct srp_target_port *target) WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); srp_del_scsi_host_attr(target->scsi_host); + srp_rport_get(target->rport); srp_remove_host(target->scsi_host); scsi_remove_host(target->scsi_host); srp_disconnect_target(target); ib_destroy_cm_id(target->cm_id); srp_free_target_ib(target); + srp_rport_put(target->rport); srp_free_req_data(target); scsi_host_put(target->scsi_host); } @@ -2004,6 +2006,7 @@ static int srp_add_target(struct srp_host *host, struct srp_target_port *target) } rport->lld_data = target; + target->rport = rport; spin_lock(&host->target_lock); list_add_tail(&target->list, &host->target_list); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 84d821b24ec7..2a1768fcc57d 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -153,6 +153,7 @@ struct srp_target_port { u16 io_class; struct srp_host *srp_host; struct Scsi_Host *scsi_host; + struct srp_rport *rport; char target_name[32]; unsigned int scsi_id; unsigned int sg_tablesize; diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c index f379c7f3034c..f7ba94aa52cb 100644 --- a/drivers/scsi/scsi_transport_srp.c +++ b/drivers/scsi/scsi_transport_srp.c @@ -184,6 +184,24 @@ static int srp_host_match(struct attribute_container *cont, struct device *dev) return &i->t.host_attrs.ac == cont; } +/** + * srp_rport_get() - increment rport reference count + */ +void srp_rport_get(struct srp_rport *rport) +{ + get_device(&rport->dev); +} +EXPORT_SYMBOL(srp_rport_get); + +/** + * srp_rport_put() - decrement rport reference count + */ +void srp_rport_put(struct srp_rport *rport) +{ + put_device(&rport->dev); +} +EXPORT_SYMBOL(srp_rport_put); + /** * srp_rport_add - add a SRP remote port to the device hierarchy * @shost: scsi host the remote port is connected to. diff --git a/include/scsi/scsi_transport_srp.h b/include/scsi/scsi_transport_srp.h index ff0f04ac91aa..5a2d2d1081c1 100644 --- a/include/scsi/scsi_transport_srp.h +++ b/include/scsi/scsi_transport_srp.h @@ -38,6 +38,8 @@ extern struct scsi_transport_template * srp_attach_transport(struct srp_function_template *); extern void srp_release_transport(struct scsi_transport_template *); +extern void srp_rport_get(struct srp_rport *rport); +extern void srp_rport_put(struct srp_rport *rport); extern struct srp_rport *srp_rport_add(struct Scsi_Host *, struct srp_rport_identifiers *); extern void srp_rport_del(struct srp_rport *); From 29c17324803c8a3bb5b2b69309e43571164cc4de Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 26 Oct 2013 14:33:30 +0200 Subject: [PATCH 34/62] scsi_transport_srp: Add transport layer error handling Add the necessary functions in the SRP transport module to allow an SRP initiator driver to implement transport layer error handling similar to the functionality already provided by the FC transport layer. This includes: - Support for implementing fast_io_fail_tmo, the time that should elapse after having detected a transport layer problem and before failing I/O. - Support for implementing dev_loss_tmo, the time that should elapse after having detected a transport layer problem and before removing a remote port. Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- Documentation/ABI/stable/sysfs-transport-srp | 31 ++ drivers/scsi/scsi_transport_srp.c | 430 ++++++++++++++++++- include/scsi/scsi_transport_srp.h | 74 +++- 3 files changed, 532 insertions(+), 3 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-transport-srp b/Documentation/ABI/stable/sysfs-transport-srp index b36fb0dc13c8..8b6acc775b71 100644 --- a/Documentation/ABI/stable/sysfs-transport-srp +++ b/Documentation/ABI/stable/sysfs-transport-srp @@ -5,6 +5,24 @@ Contact: linux-scsi@vger.kernel.org, linux-rdma@vger.kernel.org Description: Instructs an SRP initiator to disconnect from a target and to remove all LUNs imported from that target. +What: /sys/class/srp_remote_ports/port-:/dev_loss_tmo +Date: February 1, 2014 +KernelVersion: 3.13 +Contact: linux-scsi@vger.kernel.org, linux-rdma@vger.kernel.org +Description: Number of seconds the SCSI layer will wait after a transport + layer error has been observed before removing a target port. + Zero means immediate removal. Setting this attribute to "off" + will disable the dev_loss timer. + +What: /sys/class/srp_remote_ports/port-:/fast_io_fail_tmo +Date: February 1, 2014 +KernelVersion: 3.13 +Contact: linux-scsi@vger.kernel.org, linux-rdma@vger.kernel.org +Description: Number of seconds the SCSI layer will wait after a transport + layer error has been observed before failing I/O. Zero means + failing I/O immediately. Setting this attribute to "off" will + disable the fast_io_fail timer. + What: /sys/class/srp_remote_ports/port-:/port_id Date: June 27, 2007 KernelVersion: 2.6.24 @@ -17,3 +35,16 @@ Date: June 27, 2007 KernelVersion: 2.6.24 Contact: linux-scsi@vger.kernel.org Description: Role of the remote port. Either "SRP Initiator" or "SRP Target". + +What: /sys/class/srp_remote_ports/port-:/state +Date: February 1, 2014 +KernelVersion: 3.13 +Contact: linux-scsi@vger.kernel.org, linux-rdma@vger.kernel.org +Description: State of the transport layer used for communication with the + remote port. "running" if the transport layer is operational; + "blocked" if a transport layer error has been encountered but + the fast_io_fail_tmo timer has not yet fired; "fail-fast" + after the fast_io_fail_tmo timer has fired and before the + "dev_loss_tmo" timer has fired; "lost" after the + "dev_loss_tmo" timer has fired and before the port is finally + removed. diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c index f7ba94aa52cb..2696e26b3423 100644 --- a/drivers/scsi/scsi_transport_srp.c +++ b/drivers/scsi/scsi_transport_srp.c @@ -24,12 +24,15 @@ #include #include #include +#include #include +#include #include #include #include #include +#include "scsi_priv.h" #include "scsi_transport_srp_internal.h" struct srp_host_attrs { @@ -38,7 +41,7 @@ struct srp_host_attrs { #define to_srp_host_attrs(host) ((struct srp_host_attrs *)(host)->shost_data) #define SRP_HOST_ATTRS 0 -#define SRP_RPORT_ATTRS 3 +#define SRP_RPORT_ATTRS 6 struct srp_internal { struct scsi_transport_template t; @@ -54,6 +57,34 @@ struct srp_internal { #define dev_to_rport(d) container_of(d, struct srp_rport, dev) #define transport_class_to_srp_rport(dev) dev_to_rport((dev)->parent) +static inline struct Scsi_Host *rport_to_shost(struct srp_rport *r) +{ + return dev_to_shost(r->dev.parent); +} + +/** + * srp_tmo_valid() - check timeout combination validity + * + * The combination of the timeout parameters must be such that SCSI commands + * are finished in a reasonable time. Hence do not allow the fast I/O fail + * timeout to exceed SCSI_DEVICE_BLOCK_MAX_TIMEOUT. Furthermore, these + * parameters must be such that multipath can detect failed paths timely. + * Hence do not allow both parameters to be disabled simultaneously. + */ +int srp_tmo_valid(int fast_io_fail_tmo, int dev_loss_tmo) +{ + if (fast_io_fail_tmo < 0 && dev_loss_tmo < 0) + return -EINVAL; + if (fast_io_fail_tmo > SCSI_DEVICE_BLOCK_MAX_TIMEOUT) + return -EINVAL; + if (dev_loss_tmo >= LONG_MAX / HZ) + return -EINVAL; + if (fast_io_fail_tmo >= 0 && dev_loss_tmo >= 0 && + fast_io_fail_tmo >= dev_loss_tmo) + return -EINVAL; + return 0; +} +EXPORT_SYMBOL_GPL(srp_tmo_valid); static int srp_host_setup(struct transport_container *tc, struct device *dev, struct device *cdev) @@ -134,10 +165,383 @@ static ssize_t store_srp_rport_delete(struct device *dev, static DEVICE_ATTR(delete, S_IWUSR, NULL, store_srp_rport_delete); +static ssize_t show_srp_rport_state(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + static const char *const state_name[] = { + [SRP_RPORT_RUNNING] = "running", + [SRP_RPORT_BLOCKED] = "blocked", + [SRP_RPORT_FAIL_FAST] = "fail-fast", + [SRP_RPORT_LOST] = "lost", + }; + struct srp_rport *rport = transport_class_to_srp_rport(dev); + enum srp_rport_state state = rport->state; + + return sprintf(buf, "%s\n", + (unsigned)state < ARRAY_SIZE(state_name) ? + state_name[state] : "???"); +} + +static DEVICE_ATTR(state, S_IRUGO, show_srp_rport_state, NULL); + +static ssize_t srp_show_tmo(char *buf, int tmo) +{ + return tmo >= 0 ? sprintf(buf, "%d\n", tmo) : sprintf(buf, "off\n"); +} + +static int srp_parse_tmo(int *tmo, const char *buf) +{ + int res = 0; + + if (strncmp(buf, "off", 3) != 0) + res = kstrtoint(buf, 0, tmo); + else + *tmo = -1; + + return res; +} + +static ssize_t show_srp_rport_fast_io_fail_tmo(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + + return srp_show_tmo(buf, rport->fast_io_fail_tmo); +} + +static ssize_t store_srp_rport_fast_io_fail_tmo(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + int res; + int fast_io_fail_tmo; + + res = srp_parse_tmo(&fast_io_fail_tmo, buf); + if (res) + goto out; + res = srp_tmo_valid(fast_io_fail_tmo, rport->dev_loss_tmo); + if (res) + goto out; + rport->fast_io_fail_tmo = fast_io_fail_tmo; + res = count; + +out: + return res; +} + +static DEVICE_ATTR(fast_io_fail_tmo, S_IRUGO | S_IWUSR, + show_srp_rport_fast_io_fail_tmo, + store_srp_rport_fast_io_fail_tmo); + +static ssize_t show_srp_rport_dev_loss_tmo(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + + return srp_show_tmo(buf, rport->dev_loss_tmo); +} + +static ssize_t store_srp_rport_dev_loss_tmo(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + int res; + int dev_loss_tmo; + + res = srp_parse_tmo(&dev_loss_tmo, buf); + if (res) + goto out; + res = srp_tmo_valid(rport->fast_io_fail_tmo, dev_loss_tmo); + if (res) + goto out; + rport->dev_loss_tmo = dev_loss_tmo; + res = count; + +out: + return res; +} + +static DEVICE_ATTR(dev_loss_tmo, S_IRUGO | S_IWUSR, + show_srp_rport_dev_loss_tmo, + store_srp_rport_dev_loss_tmo); + +static int srp_rport_set_state(struct srp_rport *rport, + enum srp_rport_state new_state) +{ + enum srp_rport_state old_state = rport->state; + + lockdep_assert_held(&rport->mutex); + + switch (new_state) { + case SRP_RPORT_RUNNING: + switch (old_state) { + case SRP_RPORT_LOST: + goto invalid; + default: + break; + } + break; + case SRP_RPORT_BLOCKED: + switch (old_state) { + case SRP_RPORT_RUNNING: + break; + default: + goto invalid; + } + break; + case SRP_RPORT_FAIL_FAST: + switch (old_state) { + case SRP_RPORT_LOST: + goto invalid; + default: + break; + } + break; + case SRP_RPORT_LOST: + break; + } + rport->state = new_state; + return 0; + +invalid: + return -EINVAL; +} + +static void __rport_fail_io_fast(struct srp_rport *rport) +{ + struct Scsi_Host *shost = rport_to_shost(rport); + struct srp_internal *i; + + lockdep_assert_held(&rport->mutex); + + if (srp_rport_set_state(rport, SRP_RPORT_FAIL_FAST)) + return; + scsi_target_unblock(rport->dev.parent, SDEV_TRANSPORT_OFFLINE); + + /* Involve the LLD if possible to terminate all I/O on the rport. */ + i = to_srp_internal(shost->transportt); + if (i->f->terminate_rport_io) + i->f->terminate_rport_io(rport); +} + +/** + * rport_fast_io_fail_timedout() - fast I/O failure timeout handler + */ +static void rport_fast_io_fail_timedout(struct work_struct *work) +{ + struct srp_rport *rport = container_of(to_delayed_work(work), + struct srp_rport, fast_io_fail_work); + struct Scsi_Host *shost = rport_to_shost(rport); + + pr_info("fast_io_fail_tmo expired for SRP %s / %s.\n", + dev_name(&rport->dev), dev_name(&shost->shost_gendev)); + + mutex_lock(&rport->mutex); + if (rport->state == SRP_RPORT_BLOCKED) + __rport_fail_io_fast(rport); + mutex_unlock(&rport->mutex); +} + +/** + * rport_dev_loss_timedout() - device loss timeout handler + */ +static void rport_dev_loss_timedout(struct work_struct *work) +{ + struct srp_rport *rport = container_of(to_delayed_work(work), + struct srp_rport, dev_loss_work); + struct Scsi_Host *shost = rport_to_shost(rport); + struct srp_internal *i = to_srp_internal(shost->transportt); + + pr_info("dev_loss_tmo expired for SRP %s / %s.\n", + dev_name(&rport->dev), dev_name(&shost->shost_gendev)); + + mutex_lock(&rport->mutex); + WARN_ON(srp_rport_set_state(rport, SRP_RPORT_LOST) != 0); + scsi_target_unblock(rport->dev.parent, SDEV_TRANSPORT_OFFLINE); + mutex_unlock(&rport->mutex); + + i->f->rport_delete(rport); +} + +static void __srp_start_tl_fail_timers(struct srp_rport *rport) +{ + struct Scsi_Host *shost = rport_to_shost(rport); + int fast_io_fail_tmo, dev_loss_tmo; + + lockdep_assert_held(&rport->mutex); + + if (!rport->deleted) { + fast_io_fail_tmo = rport->fast_io_fail_tmo; + dev_loss_tmo = rport->dev_loss_tmo; + pr_debug("%s current state: %d\n", + dev_name(&shost->shost_gendev), rport->state); + + if (fast_io_fail_tmo >= 0 && + srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) { + pr_debug("%s new state: %d\n", + dev_name(&shost->shost_gendev), + rport->state); + scsi_target_block(&shost->shost_gendev); + queue_delayed_work(system_long_wq, + &rport->fast_io_fail_work, + 1UL * fast_io_fail_tmo * HZ); + } + if (dev_loss_tmo >= 0) + queue_delayed_work(system_long_wq, + &rport->dev_loss_work, + 1UL * dev_loss_tmo * HZ); + } else { + pr_debug("%s has already been deleted\n", + dev_name(&shost->shost_gendev)); + srp_rport_set_state(rport, SRP_RPORT_FAIL_FAST); + scsi_target_unblock(&shost->shost_gendev, + SDEV_TRANSPORT_OFFLINE); + } +} + +/** + * srp_start_tl_fail_timers() - start the transport layer failure timers + * + * Start the transport layer fast I/O failure and device loss timers. Do not + * modify a timer that was already started. + */ +void srp_start_tl_fail_timers(struct srp_rport *rport) +{ + mutex_lock(&rport->mutex); + __srp_start_tl_fail_timers(rport); + mutex_unlock(&rport->mutex); +} +EXPORT_SYMBOL(srp_start_tl_fail_timers); + +/** + * scsi_request_fn_active() - number of kernel threads inside scsi_request_fn() + */ +static int scsi_request_fn_active(struct Scsi_Host *shost) +{ + struct scsi_device *sdev; + struct request_queue *q; + int request_fn_active = 0; + + shost_for_each_device(sdev, shost) { + q = sdev->request_queue; + + spin_lock_irq(q->queue_lock); + request_fn_active += q->request_fn_active; + spin_unlock_irq(q->queue_lock); + } + + return request_fn_active; +} + +/** + * srp_reconnect_rport() - reconnect to an SRP target port + * + * Blocks SCSI command queueing before invoking reconnect() such that + * queuecommand() won't be invoked concurrently with reconnect() from outside + * the SCSI EH. This is important since a reconnect() implementation may + * reallocate resources needed by queuecommand(). + * + * Notes: + * - This function neither waits until outstanding requests have finished nor + * tries to abort these. It is the responsibility of the reconnect() + * function to finish outstanding commands before reconnecting to the target + * port. + * - It is the responsibility of the caller to ensure that the resources + * reallocated by the reconnect() function won't be used while this function + * is in progress. One possible strategy is to invoke this function from + * the context of the SCSI EH thread only. Another possible strategy is to + * lock the rport mutex inside each SCSI LLD callback that can be invoked by + * the SCSI EH (the scsi_host_template.eh_*() functions and also the + * scsi_host_template.queuecommand() function). + */ +int srp_reconnect_rport(struct srp_rport *rport) +{ + struct Scsi_Host *shost = rport_to_shost(rport); + struct srp_internal *i = to_srp_internal(shost->transportt); + struct scsi_device *sdev; + int res; + + pr_debug("SCSI host %s\n", dev_name(&shost->shost_gendev)); + + res = mutex_lock_interruptible(&rport->mutex); + if (res) + goto out; + scsi_target_block(&shost->shost_gendev); + while (scsi_request_fn_active(shost)) + msleep(20); + res = i->f->reconnect(rport); + pr_debug("%s (state %d): transport.reconnect() returned %d\n", + dev_name(&shost->shost_gendev), rport->state, res); + if (res == 0) { + cancel_delayed_work(&rport->fast_io_fail_work); + cancel_delayed_work(&rport->dev_loss_work); + + srp_rport_set_state(rport, SRP_RPORT_RUNNING); + scsi_target_unblock(&shost->shost_gendev, SDEV_RUNNING); + /* + * If the SCSI error handler has offlined one or more devices, + * invoking scsi_target_unblock() won't change the state of + * these devices into running so do that explicitly. + */ + spin_lock_irq(shost->host_lock); + __shost_for_each_device(sdev, shost) + if (sdev->sdev_state == SDEV_OFFLINE) + sdev->sdev_state = SDEV_RUNNING; + spin_unlock_irq(shost->host_lock); + } else if (rport->state == SRP_RPORT_RUNNING) { + /* + * srp_reconnect_rport() was invoked with fast_io_fail + * off. Mark the port as failed and start the TL failure + * timers if these had not yet been started. + */ + __rport_fail_io_fast(rport); + scsi_target_unblock(&shost->shost_gendev, + SDEV_TRANSPORT_OFFLINE); + __srp_start_tl_fail_timers(rport); + } else if (rport->state != SRP_RPORT_BLOCKED) { + scsi_target_unblock(&shost->shost_gendev, + SDEV_TRANSPORT_OFFLINE); + } + mutex_unlock(&rport->mutex); + +out: + return res; +} +EXPORT_SYMBOL(srp_reconnect_rport); + +/** + * srp_timed_out() - SRP transport intercept of the SCSI timeout EH + * + * If a timeout occurs while an rport is in the blocked state, ask the SCSI + * EH to continue waiting (BLK_EH_RESET_TIMER). Otherwise let the SCSI core + * handle the timeout (BLK_EH_NOT_HANDLED). + * + * Note: This function is called from soft-IRQ context and with the request + * queue lock held. + */ +static enum blk_eh_timer_return srp_timed_out(struct scsi_cmnd *scmd) +{ + struct scsi_device *sdev = scmd->device; + struct Scsi_Host *shost = sdev->host; + struct srp_internal *i = to_srp_internal(shost->transportt); + + pr_debug("timeout for sdev %s\n", dev_name(&sdev->sdev_gendev)); + return i->f->reset_timer_if_blocked && scsi_device_blocked(sdev) ? + BLK_EH_RESET_TIMER : BLK_EH_NOT_HANDLED; +} + static void srp_rport_release(struct device *dev) { struct srp_rport *rport = dev_to_rport(dev); + cancel_delayed_work_sync(&rport->fast_io_fail_work); + cancel_delayed_work_sync(&rport->dev_loss_work); + put_device(dev->parent); kfree(rport); } @@ -214,12 +618,15 @@ struct srp_rport *srp_rport_add(struct Scsi_Host *shost, { struct srp_rport *rport; struct device *parent = &shost->shost_gendev; + struct srp_internal *i = to_srp_internal(shost->transportt); int id, ret; rport = kzalloc(sizeof(*rport), GFP_KERNEL); if (!rport) return ERR_PTR(-ENOMEM); + mutex_init(&rport->mutex); + device_initialize(&rport->dev); rport->dev.parent = get_device(parent); @@ -228,6 +635,13 @@ struct srp_rport *srp_rport_add(struct Scsi_Host *shost, memcpy(rport->port_id, ids->port_id, sizeof(rport->port_id)); rport->roles = ids->roles; + rport->fast_io_fail_tmo = i->f->fast_io_fail_tmo ? + *i->f->fast_io_fail_tmo : 15; + rport->dev_loss_tmo = i->f->dev_loss_tmo ? *i->f->dev_loss_tmo : 60; + INIT_DELAYED_WORK(&rport->fast_io_fail_work, + rport_fast_io_fail_timedout); + INIT_DELAYED_WORK(&rport->dev_loss_work, rport_dev_loss_timedout); + id = atomic_inc_return(&to_srp_host_attrs(shost)->next_port_id); dev_set_name(&rport->dev, "port-%d:%d", shost->host_no, id); @@ -277,6 +691,13 @@ void srp_rport_del(struct srp_rport *rport) transport_remove_device(dev); device_del(dev); transport_destroy_device(dev); + + mutex_lock(&rport->mutex); + if (rport->state == SRP_RPORT_BLOCKED) + __rport_fail_io_fast(rport); + rport->deleted = true; + mutex_unlock(&rport->mutex); + put_device(dev); } EXPORT_SYMBOL_GPL(srp_rport_del); @@ -328,6 +749,8 @@ srp_attach_transport(struct srp_function_template *ft) if (!i) return NULL; + i->t.eh_timed_out = srp_timed_out; + i->t.tsk_mgmt_response = srp_tsk_mgmt_response; i->t.it_nexus_response = srp_it_nexus_response; @@ -345,6 +768,11 @@ srp_attach_transport(struct srp_function_template *ft) count = 0; i->rport_attrs[count++] = &dev_attr_port_id; i->rport_attrs[count++] = &dev_attr_roles; + if (ft->has_rport_state) { + i->rport_attrs[count++] = &dev_attr_state; + i->rport_attrs[count++] = &dev_attr_fast_io_fail_tmo; + i->rport_attrs[count++] = &dev_attr_dev_loss_tmo; + } if (ft->rport_delete) i->rport_attrs[count++] = &dev_attr_delete; i->rport_attrs[count++] = NULL; diff --git a/include/scsi/scsi_transport_srp.h b/include/scsi/scsi_transport_srp.h index 5a2d2d1081c1..ee7001677f64 100644 --- a/include/scsi/scsi_transport_srp.h +++ b/include/scsi/scsi_transport_srp.h @@ -13,6 +13,26 @@ struct srp_rport_identifiers { u8 roles; }; +/** + * enum srp_rport_state - SRP transport layer state + * @SRP_RPORT_RUNNING: Transport layer operational. + * @SRP_RPORT_BLOCKED: Transport layer not operational; fast I/O fail timer + * is running and I/O has been blocked. + * @SRP_RPORT_FAIL_FAST: Fast I/O fail timer has expired; fail I/O fast. + * @SRP_RPORT_LOST: Device loss timer has expired; port is being removed. + */ +enum srp_rport_state { + SRP_RPORT_RUNNING, + SRP_RPORT_BLOCKED, + SRP_RPORT_FAIL_FAST, + SRP_RPORT_LOST, +}; + +/** + * struct srp_rport + * @lld_data: LLD private data. + * @mutex: Protects against concurrent rport fast_io_fail / dev_loss_tmo. + */ struct srp_rport { /* for initiator and target drivers */ @@ -23,11 +43,38 @@ struct srp_rport { /* for initiator drivers */ - void *lld_data; /* LLD private data */ + void *lld_data; + + struct mutex mutex; + enum srp_rport_state state; + bool deleted; + int fast_io_fail_tmo; + int dev_loss_tmo; + struct delayed_work fast_io_fail_work; + struct delayed_work dev_loss_work; }; +/** + * struct srp_function_template + * @has_rport_state: Whether or not to create the state, fast_io_fail_tmo and + * dev_loss_tmo sysfs attribute for an rport. + * @reset_timer_if_blocked: Whether or srp_timed_out() should reset the command + * timer if the device on which it has been queued is blocked. + * @fast_io_fail_tmo: If not NULL, points to the default fast_io_fail_tmo value. + * @dev_loss_tmo: If not NULL, points to the default dev_loss_tmo value. + * @reconnect: Callback function for reconnecting to the target. See also + * srp_reconnect_rport(). + * @terminate_rport_io: Callback function for terminating all outstanding I/O + * requests for an rport. + */ struct srp_function_template { /* for initiator drivers */ + bool has_rport_state; + bool reset_timer_if_blocked; + int *fast_io_fail_tmo; + int *dev_loss_tmo; + int (*reconnect)(struct srp_rport *rport); + void (*terminate_rport_io)(struct srp_rport *rport); void (*rport_delete)(struct srp_rport *rport); /* for target drivers */ int (* tsk_mgmt_response)(struct Scsi_Host *, u64, u64, int); @@ -43,7 +90,30 @@ extern void srp_rport_put(struct srp_rport *rport); extern struct srp_rport *srp_rport_add(struct Scsi_Host *, struct srp_rport_identifiers *); extern void srp_rport_del(struct srp_rport *); - +extern int srp_tmo_valid(int fast_io_fail_tmo, int dev_loss_tmo); +extern int srp_reconnect_rport(struct srp_rport *rport); +extern void srp_start_tl_fail_timers(struct srp_rport *rport); extern void srp_remove_host(struct Scsi_Host *); +/** + * srp_chkready() - evaluate the transport layer state before I/O + * + * Returns a SCSI result code that can be returned by the LLD queuecommand() + * implementation. The role of this function is similar to that of + * fc_remote_port_chkready(). + */ +static inline int srp_chkready(struct srp_rport *rport) +{ + switch (rport->state) { + case SRP_RPORT_RUNNING: + case SRP_RPORT_BLOCKED: + default: + return 0; + case SRP_RPORT_FAIL_FAST: + return DID_TRANSPORT_FAILFAST << 16; + case SRP_RPORT_LOST: + return DID_NO_CONNECT << 16; + } +} + #endif From ed9b2264fb393327a6c8a4229d8df55df596188e Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 26 Oct 2013 14:34:27 +0200 Subject: [PATCH 35/62] IB/srp: Use SRP transport layer error recovery Enable fast_io_fail_tmo and dev_loss_tmo functionality for the IB SRP initiator. Add kernel module parameters that allow to specify default values for these parameters. Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 141 ++++++++++++++++++++-------- drivers/infiniband/ulp/srp/ib_srp.h | 1 - 2 files changed, 101 insertions(+), 41 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 6edab7855f5e..15b4d2ce4989 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -86,6 +86,27 @@ module_param(topspin_workarounds, int, 0444); MODULE_PARM_DESC(topspin_workarounds, "Enable workarounds for Topspin/Cisco SRP target bugs if != 0"); +static struct kernel_param_ops srp_tmo_ops; + +static int srp_fast_io_fail_tmo = 15; +module_param_cb(fast_io_fail_tmo, &srp_tmo_ops, &srp_fast_io_fail_tmo, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(fast_io_fail_tmo, + "Number of seconds between the observation of a transport" + " layer error and failing all I/O. \"off\" means that this" + " functionality is disabled."); + +static int srp_dev_loss_tmo = 60; +module_param_cb(dev_loss_tmo, &srp_tmo_ops, &srp_dev_loss_tmo, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dev_loss_tmo, + "Maximum number of seconds that the SRP transport should" + " insulate transport layer errors. After this time has been" + " exceeded the SCSI host is removed. Should be" + " between 1 and " __stringify(SCSI_DEVICE_BLOCK_MAX_TIMEOUT) + " if fast_io_fail_tmo has not been set. \"off\" means that" + " this functionality is disabled."); + static void srp_add_one(struct ib_device *device); static void srp_remove_one(struct ib_device *device); static void srp_recv_completion(struct ib_cq *cq, void *target_ptr); @@ -102,6 +123,44 @@ static struct ib_client srp_client = { static struct ib_sa_client srp_sa_client; +static int srp_tmo_get(char *buffer, const struct kernel_param *kp) +{ + int tmo = *(int *)kp->arg; + + if (tmo >= 0) + return sprintf(buffer, "%d", tmo); + else + return sprintf(buffer, "off"); +} + +static int srp_tmo_set(const char *val, const struct kernel_param *kp) +{ + int tmo, res; + + if (strncmp(val, "off", 3) != 0) { + res = kstrtoint(val, 0, &tmo); + if (res) + goto out; + } else { + tmo = -1; + } + if (kp->arg == &srp_fast_io_fail_tmo) + res = srp_tmo_valid(tmo, srp_dev_loss_tmo); + else + res = srp_tmo_valid(srp_fast_io_fail_tmo, tmo); + if (res) + goto out; + *(int *)kp->arg = tmo; + +out: + return res; +} + +static struct kernel_param_ops srp_tmo_ops = { + .get = srp_tmo_get, + .set = srp_tmo_set, +}; + static inline struct srp_target_port *host_to_target(struct Scsi_Host *host) { return (struct srp_target_port *) host->hostdata; @@ -688,23 +747,42 @@ static void srp_free_req(struct srp_target_port *target, spin_unlock_irqrestore(&target->lock, flags); } -static void srp_reset_req(struct srp_target_port *target, struct srp_request *req) +static void srp_finish_req(struct srp_target_port *target, + struct srp_request *req, int result) { struct scsi_cmnd *scmnd = srp_claim_req(target, req, NULL); if (scmnd) { srp_free_req(target, req, scmnd, 0); - scmnd->result = DID_RESET << 16; + scmnd->result = result; scmnd->scsi_done(scmnd); } } -static int srp_reconnect_target(struct srp_target_port *target) +static void srp_terminate_io(struct srp_rport *rport) { - struct Scsi_Host *shost = target->scsi_host; - int i, ret; + struct srp_target_port *target = rport->lld_data; + int i; - scsi_target_block(&shost->shost_gendev); + for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { + struct srp_request *req = &target->req_ring[i]; + srp_finish_req(target, req, DID_TRANSPORT_FAILFAST << 16); + } +} + +/* + * It is up to the caller to ensure that srp_rport_reconnect() calls are + * serialized and that no concurrent srp_queuecommand(), srp_abort(), + * srp_reset_device() or srp_reset_host() calls will occur while this function + * is in progress. One way to realize that is not to call this function + * directly but to call srp_reconnect_rport() instead since that last function + * serializes calls of this function via rport->mutex and also blocks + * srp_queuecommand() calls before invoking this function. + */ +static int srp_rport_reconnect(struct srp_rport *rport) +{ + struct srp_target_port *target = rport->lld_data; + int i, ret; srp_disconnect_target(target); /* @@ -725,8 +803,7 @@ static int srp_reconnect_target(struct srp_target_port *target) for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { struct srp_request *req = &target->req_ring[i]; - if (req->scmnd) - srp_reset_req(target, req); + srp_finish_req(target, req, DID_RESET << 16); } INIT_LIST_HEAD(&target->free_tx); @@ -736,28 +813,9 @@ static int srp_reconnect_target(struct srp_target_port *target) if (ret == 0) ret = srp_connect_target(target); - scsi_target_unblock(&shost->shost_gendev, ret == 0 ? SDEV_RUNNING : - SDEV_TRANSPORT_OFFLINE); - target->transport_offline = !!ret; - - if (ret) - goto err; - - shost_printk(KERN_INFO, target->scsi_host, PFX "reconnect succeeded\n"); - - return ret; - -err: - shost_printk(KERN_ERR, target->scsi_host, - PFX "reconnect failed (%d), removing target port.\n", ret); - - /* - * We couldn't reconnect, so kill our target port off. - * However, we have to defer the real removal because we - * are in the context of the SCSI error handler now, which - * will deadlock if we call scsi_remove_host(). - */ - srp_queue_remove_work(target); + if (ret == 0) + shost_printk(KERN_INFO, target->scsi_host, + PFX "reconnect succeeded\n"); return ret; } @@ -1356,10 +1414,11 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) struct srp_cmd *cmd; struct ib_device *dev; unsigned long flags; - int len; + int len, result; - if (unlikely(target->transport_offline)) { - scmnd->result = DID_NO_CONNECT << 16; + result = srp_chkready(target->rport); + if (unlikely(result)) { + scmnd->result = result; scmnd->scsi_done(scmnd); return 0; } @@ -1757,7 +1816,7 @@ static int srp_abort(struct scsi_cmnd *scmnd) if (srp_send_tsk_mgmt(target, req->index, scmnd->device->lun, SRP_TSK_ABORT_TASK) == 0) ret = SUCCESS; - else if (target->transport_offline) + else if (target->rport->state == SRP_RPORT_LOST) ret = FAST_IO_FAIL; else ret = FAILED; @@ -1784,7 +1843,7 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { struct srp_request *req = &target->req_ring[i]; if (req->scmnd && req->scmnd->device == scmnd->device) - srp_reset_req(target, req); + srp_finish_req(target, req, DID_RESET << 16); } return SUCCESS; @@ -1793,14 +1852,10 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) static int srp_reset_host(struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(scmnd->device->host); - int ret = FAILED; shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n"); - if (!srp_reconnect_target(target)) - ret = SUCCESS; - - return ret; + return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED; } static int srp_slave_configure(struct scsi_device *sdev) @@ -2637,7 +2692,13 @@ static void srp_remove_one(struct ib_device *device) } static struct srp_function_template ib_srp_transport_functions = { + .has_rport_state = true, + .reset_timer_if_blocked = true, + .fast_io_fail_tmo = &srp_fast_io_fail_tmo, + .dev_loss_tmo = &srp_dev_loss_tmo, + .reconnect = srp_rport_reconnect, .rport_delete = srp_rport_delete, + .terminate_rport_io = srp_terminate_io, }; static int __init srp_init_module(void) diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 2a1768fcc57d..fd1817e48ad4 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -140,7 +140,6 @@ struct srp_target_port { unsigned int cmd_sg_cnt; unsigned int indirect_size; bool allow_ext_sg; - bool transport_offline; /* Everything above this point is used in the hot path of * command processing. Try to keep them packed into cachelines. From c1120f8981fe8ac8dd21092afaf664ba030a76cd Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 26 Oct 2013 14:35:08 +0200 Subject: [PATCH 36/62] IB/srp: Start timers if a transport layer error occurs Start the reconnect timer, fast_io_fail timer and dev_loss timers if a transport layer error occurs. Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 19 +++++++++++++++++++ drivers/infiniband/ulp/srp/ib_srp.h | 1 + 2 files changed, 20 insertions(+) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 15b4d2ce4989..076a3ab7f779 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -593,6 +593,7 @@ static void srp_remove_target(struct srp_target_port *target) srp_disconnect_target(target); ib_destroy_cm_id(target->cm_id); srp_free_target_ib(target); + cancel_work_sync(&target->tl_err_work); srp_rport_put(target->rport); srp_free_req_data(target); scsi_host_put(target->scsi_host); @@ -1362,6 +1363,21 @@ static void srp_handle_recv(struct srp_target_port *target, struct ib_wc *wc) PFX "Recv failed with error code %d\n", res); } +/** + * srp_tl_err_work() - handle a transport layer error + * + * Note: This function may get invoked before the rport has been created, + * hence the target->rport test. + */ +static void srp_tl_err_work(struct work_struct *work) +{ + struct srp_target_port *target; + + target = container_of(work, struct srp_target_port, tl_err_work); + if (target->rport) + srp_start_tl_fail_timers(target->rport); +} + static void srp_handle_qp_err(enum ib_wc_status wc_status, enum ib_wc_opcode wc_opcode, struct srp_target_port *target) @@ -1371,6 +1387,7 @@ static void srp_handle_qp_err(enum ib_wc_status wc_status, PFX "failed %s status %d\n", wc_opcode & IB_WC_RECV ? "receive" : "send", wc_status); + queue_work(system_long_wq, &target->tl_err_work); } target->qp_in_error = true; } @@ -1733,6 +1750,7 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) if (ib_send_cm_drep(cm_id, NULL, 0)) shost_printk(KERN_ERR, target->scsi_host, PFX "Sending CM DREP failed\n"); + queue_work(system_long_wq, &target->tl_err_work); break; case IB_CM_TIMEWAIT_EXIT: @@ -2419,6 +2437,7 @@ static ssize_t srp_create_target(struct device *dev, sizeof (struct srp_indirect_buf) + target->cmd_sg_cnt * sizeof (struct srp_direct_buf); + INIT_WORK(&target->tl_err_work, srp_tl_err_work); INIT_WORK(&target->remove_work, srp_remove_work); spin_lock_init(&target->lock); INIT_LIST_HEAD(&target->free_tx); diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index fd1817e48ad4..446b0452107b 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -177,6 +177,7 @@ struct srp_target_port { struct srp_iu *rx_ring[SRP_RQ_SIZE]; struct srp_request req_ring[SRP_CMD_SQ_SIZE]; + struct work_struct tl_err_work; struct work_struct remove_work; struct list_head list; From 8c64e4531c3c3bedf11d723196270d4a7553db45 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 26 Oct 2013 14:35:59 +0200 Subject: [PATCH 37/62] scsi_transport_srp: Add periodic reconnect support Add support for periodically reconnecting to an SRP target until the dev_loss timer expires. After the tenth reconnection attempt, gradually slow down subsequent reconnect attempts. Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- Documentation/ABI/stable/sysfs-transport-srp | 8 ++ drivers/infiniband/ulp/srp/ib_srp.c | 4 +- drivers/scsi/scsi_transport_srp.c | 106 +++++++++++++++++-- include/scsi/scsi_transport_srp.h | 11 +- 4 files changed, 118 insertions(+), 11 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-transport-srp b/Documentation/ABI/stable/sysfs-transport-srp index 8b6acc775b71..ec7af69fea0a 100644 --- a/Documentation/ABI/stable/sysfs-transport-srp +++ b/Documentation/ABI/stable/sysfs-transport-srp @@ -30,6 +30,14 @@ Contact: linux-scsi@vger.kernel.org Description: 16-byte local SRP port identifier in hexadecimal format. An example: 4c:49:4e:55:58:20:56:49:4f:00:00:00:00:00:00:00. +What: /sys/class/srp_remote_ports/port-:/reconnect_delay +Date: February 1, 2014 +KernelVersion: 3.13 +Contact: linux-scsi@vger.kernel.org, linux-rdma@vger.kernel.org +Description: Number of seconds the SCSI layer will wait after a reconnect + attempt failed before retrying. Setting this attribute to + "off" will disable time-based reconnecting. + What: /sys/class/srp_remote_ports/port-:/roles Date: June 27, 2007 KernelVersion: 2.6.24 diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 076a3ab7f779..99c893d1c2ac 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -145,9 +145,9 @@ static int srp_tmo_set(const char *val, const struct kernel_param *kp) tmo = -1; } if (kp->arg == &srp_fast_io_fail_tmo) - res = srp_tmo_valid(tmo, srp_dev_loss_tmo); + res = srp_tmo_valid(-1, tmo, srp_dev_loss_tmo); else - res = srp_tmo_valid(srp_fast_io_fail_tmo, tmo); + res = srp_tmo_valid(-1, srp_fast_io_fail_tmo, tmo); if (res) goto out; *(int *)kp->arg = tmo; diff --git a/drivers/scsi/scsi_transport_srp.c b/drivers/scsi/scsi_transport_srp.c index 2696e26b3423..2700a5a09bd4 100644 --- a/drivers/scsi/scsi_transport_srp.c +++ b/drivers/scsi/scsi_transport_srp.c @@ -41,7 +41,7 @@ struct srp_host_attrs { #define to_srp_host_attrs(host) ((struct srp_host_attrs *)(host)->shost_data) #define SRP_HOST_ATTRS 0 -#define SRP_RPORT_ATTRS 6 +#define SRP_RPORT_ATTRS 8 struct srp_internal { struct scsi_transport_template t; @@ -69,11 +69,13 @@ static inline struct Scsi_Host *rport_to_shost(struct srp_rport *r) * are finished in a reasonable time. Hence do not allow the fast I/O fail * timeout to exceed SCSI_DEVICE_BLOCK_MAX_TIMEOUT. Furthermore, these * parameters must be such that multipath can detect failed paths timely. - * Hence do not allow both parameters to be disabled simultaneously. + * Hence do not allow all three parameters to be disabled simultaneously. */ -int srp_tmo_valid(int fast_io_fail_tmo, int dev_loss_tmo) +int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo, int dev_loss_tmo) { - if (fast_io_fail_tmo < 0 && dev_loss_tmo < 0) + if (reconnect_delay < 0 && fast_io_fail_tmo < 0 && dev_loss_tmo < 0) + return -EINVAL; + if (reconnect_delay == 0) return -EINVAL; if (fast_io_fail_tmo > SCSI_DEVICE_BLOCK_MAX_TIMEOUT) return -EINVAL; @@ -202,6 +204,56 @@ static int srp_parse_tmo(int *tmo, const char *buf) return res; } +static ssize_t show_reconnect_delay(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + + return srp_show_tmo(buf, rport->reconnect_delay); +} + +static ssize_t store_reconnect_delay(struct device *dev, + struct device_attribute *attr, + const char *buf, const size_t count) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + int res, delay; + + res = srp_parse_tmo(&delay, buf); + if (res) + goto out; + res = srp_tmo_valid(delay, rport->fast_io_fail_tmo, + rport->dev_loss_tmo); + if (res) + goto out; + + if (rport->reconnect_delay <= 0 && delay > 0 && + rport->state != SRP_RPORT_RUNNING) { + queue_delayed_work(system_long_wq, &rport->reconnect_work, + delay * HZ); + } else if (delay <= 0) { + cancel_delayed_work(&rport->reconnect_work); + } + rport->reconnect_delay = delay; + res = count; + +out: + return res; +} + +static DEVICE_ATTR(reconnect_delay, S_IRUGO | S_IWUSR, show_reconnect_delay, + store_reconnect_delay); + +static ssize_t show_failed_reconnects(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct srp_rport *rport = transport_class_to_srp_rport(dev); + + return sprintf(buf, "%d\n", rport->failed_reconnects); +} + +static DEVICE_ATTR(failed_reconnects, S_IRUGO, show_failed_reconnects, NULL); + static ssize_t show_srp_rport_fast_io_fail_tmo(struct device *dev, struct device_attribute *attr, char *buf) @@ -222,7 +274,8 @@ static ssize_t store_srp_rport_fast_io_fail_tmo(struct device *dev, res = srp_parse_tmo(&fast_io_fail_tmo, buf); if (res) goto out; - res = srp_tmo_valid(fast_io_fail_tmo, rport->dev_loss_tmo); + res = srp_tmo_valid(rport->reconnect_delay, fast_io_fail_tmo, + rport->dev_loss_tmo); if (res) goto out; rport->fast_io_fail_tmo = fast_io_fail_tmo; @@ -256,7 +309,8 @@ static ssize_t store_srp_rport_dev_loss_tmo(struct device *dev, res = srp_parse_tmo(&dev_loss_tmo, buf); if (res) goto out; - res = srp_tmo_valid(rport->fast_io_fail_tmo, dev_loss_tmo); + res = srp_tmo_valid(rport->reconnect_delay, rport->fast_io_fail_tmo, + dev_loss_tmo); if (res) goto out; rport->dev_loss_tmo = dev_loss_tmo; @@ -312,6 +366,29 @@ invalid: return -EINVAL; } +/** + * srp_reconnect_work() - reconnect and schedule a new attempt if necessary + */ +static void srp_reconnect_work(struct work_struct *work) +{ + struct srp_rport *rport = container_of(to_delayed_work(work), + struct srp_rport, reconnect_work); + struct Scsi_Host *shost = rport_to_shost(rport); + int delay, res; + + res = srp_reconnect_rport(rport); + if (res != 0) { + shost_printk(KERN_ERR, shost, + "reconnect attempt %d failed (%d)\n", + ++rport->failed_reconnects, res); + delay = rport->reconnect_delay * + min(100, max(1, rport->failed_reconnects - 10)); + if (delay > 0) + queue_delayed_work(system_long_wq, + &rport->reconnect_work, delay * HZ); + } +} + static void __rport_fail_io_fast(struct srp_rport *rport) { struct Scsi_Host *shost = rport_to_shost(rport); @@ -371,16 +448,21 @@ static void rport_dev_loss_timedout(struct work_struct *work) static void __srp_start_tl_fail_timers(struct srp_rport *rport) { struct Scsi_Host *shost = rport_to_shost(rport); - int fast_io_fail_tmo, dev_loss_tmo; + int delay, fast_io_fail_tmo, dev_loss_tmo; lockdep_assert_held(&rport->mutex); if (!rport->deleted) { + delay = rport->reconnect_delay; fast_io_fail_tmo = rport->fast_io_fail_tmo; dev_loss_tmo = rport->dev_loss_tmo; pr_debug("%s current state: %d\n", dev_name(&shost->shost_gendev), rport->state); + if (delay > 0) + queue_delayed_work(system_long_wq, + &rport->reconnect_work, + 1UL * delay * HZ); if (fast_io_fail_tmo >= 0 && srp_rport_set_state(rport, SRP_RPORT_BLOCKED) == 0) { pr_debug("%s new state: %d\n", @@ -481,6 +563,7 @@ int srp_reconnect_rport(struct srp_rport *rport) cancel_delayed_work(&rport->fast_io_fail_work); cancel_delayed_work(&rport->dev_loss_work); + rport->failed_reconnects = 0; srp_rport_set_state(rport, SRP_RPORT_RUNNING); scsi_target_unblock(&shost->shost_gendev, SDEV_RUNNING); /* @@ -539,6 +622,7 @@ static void srp_rport_release(struct device *dev) { struct srp_rport *rport = dev_to_rport(dev); + cancel_delayed_work_sync(&rport->reconnect_work); cancel_delayed_work_sync(&rport->fast_io_fail_work); cancel_delayed_work_sync(&rport->dev_loss_work); @@ -635,6 +719,10 @@ struct srp_rport *srp_rport_add(struct Scsi_Host *shost, memcpy(rport->port_id, ids->port_id, sizeof(rport->port_id)); rport->roles = ids->roles; + if (i->f->reconnect) + rport->reconnect_delay = i->f->reconnect_delay ? + *i->f->reconnect_delay : 10; + INIT_DELAYED_WORK(&rport->reconnect_work, srp_reconnect_work); rport->fast_io_fail_tmo = i->f->fast_io_fail_tmo ? *i->f->fast_io_fail_tmo : 15; rport->dev_loss_tmo = i->f->dev_loss_tmo ? *i->f->dev_loss_tmo : 60; @@ -773,6 +861,10 @@ srp_attach_transport(struct srp_function_template *ft) i->rport_attrs[count++] = &dev_attr_fast_io_fail_tmo; i->rport_attrs[count++] = &dev_attr_dev_loss_tmo; } + if (ft->reconnect) { + i->rport_attrs[count++] = &dev_attr_reconnect_delay; + i->rport_attrs[count++] = &dev_attr_failed_reconnects; + } if (ft->rport_delete) i->rport_attrs[count++] = &dev_attr_delete; i->rport_attrs[count++] = NULL; diff --git a/include/scsi/scsi_transport_srp.h b/include/scsi/scsi_transport_srp.h index ee7001677f64..4ebf6913b7b2 100644 --- a/include/scsi/scsi_transport_srp.h +++ b/include/scsi/scsi_transport_srp.h @@ -31,7 +31,8 @@ enum srp_rport_state { /** * struct srp_rport * @lld_data: LLD private data. - * @mutex: Protects against concurrent rport fast_io_fail / dev_loss_tmo. + * @mutex: Protects against concurrent rport reconnect / fast_io_fail / + * dev_loss_tmo activity. */ struct srp_rport { /* for initiator and target drivers */ @@ -48,6 +49,9 @@ struct srp_rport { struct mutex mutex; enum srp_rport_state state; bool deleted; + int reconnect_delay; + int failed_reconnects; + struct delayed_work reconnect_work; int fast_io_fail_tmo; int dev_loss_tmo; struct delayed_work fast_io_fail_work; @@ -60,6 +64,7 @@ struct srp_rport { * dev_loss_tmo sysfs attribute for an rport. * @reset_timer_if_blocked: Whether or srp_timed_out() should reset the command * timer if the device on which it has been queued is blocked. + * @reconnect_delay: If not NULL, points to the default reconnect_delay value. * @fast_io_fail_tmo: If not NULL, points to the default fast_io_fail_tmo value. * @dev_loss_tmo: If not NULL, points to the default dev_loss_tmo value. * @reconnect: Callback function for reconnecting to the target. See also @@ -71,6 +76,7 @@ struct srp_function_template { /* for initiator drivers */ bool has_rport_state; bool reset_timer_if_blocked; + int *reconnect_delay; int *fast_io_fail_tmo; int *dev_loss_tmo; int (*reconnect)(struct srp_rport *rport); @@ -90,7 +96,8 @@ extern void srp_rport_put(struct srp_rport *rport); extern struct srp_rport *srp_rport_add(struct Scsi_Host *, struct srp_rport_identifiers *); extern void srp_rport_del(struct srp_rport *); -extern int srp_tmo_valid(int fast_io_fail_tmo, int dev_loss_tmo); +extern int srp_tmo_valid(int reconnect_delay, int fast_io_fail_tmo, + int dev_loss_tmo); extern int srp_reconnect_rport(struct srp_rport *rport); extern void srp_start_tl_fail_timers(struct srp_rport *rport); extern void srp_remove_host(struct Scsi_Host *); From a95cadb9dafef41a755b11680529c2b49e7f59bd Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 26 Oct 2013 14:37:17 +0200 Subject: [PATCH 38/62] IB/srp: Add periodic reconnect functionality After a transport layer occurred, periodically try to reconnect to the target until the dev_loss timer expires. Protect the callback functions that can be invoked from inside the SCSI EH against concurrent invocation with srp_reconnect_rport() via the rport mutex. Change the default dev_loss_tmo from 60s into 600s to give the reconnect mechanism a chance to kick in. Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 52 +++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 99c893d1c2ac..ebbe01bdd306 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -88,6 +88,11 @@ MODULE_PARM_DESC(topspin_workarounds, static struct kernel_param_ops srp_tmo_ops; +static int srp_reconnect_delay = 10; +module_param_cb(reconnect_delay, &srp_tmo_ops, &srp_reconnect_delay, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(reconnect_delay, "Time between successive reconnect attempts"); + static int srp_fast_io_fail_tmo = 15; module_param_cb(fast_io_fail_tmo, &srp_tmo_ops, &srp_fast_io_fail_tmo, S_IRUGO | S_IWUSR); @@ -96,7 +101,7 @@ MODULE_PARM_DESC(fast_io_fail_tmo, " layer error and failing all I/O. \"off\" means that this" " functionality is disabled."); -static int srp_dev_loss_tmo = 60; +static int srp_dev_loss_tmo = 600; module_param_cb(dev_loss_tmo, &srp_tmo_ops, &srp_dev_loss_tmo, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(dev_loss_tmo, @@ -144,10 +149,14 @@ static int srp_tmo_set(const char *val, const struct kernel_param *kp) } else { tmo = -1; } - if (kp->arg == &srp_fast_io_fail_tmo) - res = srp_tmo_valid(-1, tmo, srp_dev_loss_tmo); + if (kp->arg == &srp_reconnect_delay) + res = srp_tmo_valid(tmo, srp_fast_io_fail_tmo, + srp_dev_loss_tmo); + else if (kp->arg == &srp_fast_io_fail_tmo) + res = srp_tmo_valid(srp_reconnect_delay, tmo, srp_dev_loss_tmo); else - res = srp_tmo_valid(-1, srp_fast_io_fail_tmo, tmo); + res = srp_tmo_valid(srp_reconnect_delay, srp_fast_io_fail_tmo, + tmo); if (res) goto out; *(int *)kp->arg = tmo; @@ -1426,18 +1435,29 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr) static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(shost); + struct srp_rport *rport = target->rport; struct srp_request *req; struct srp_iu *iu; struct srp_cmd *cmd; struct ib_device *dev; unsigned long flags; int len, result; + const bool in_scsi_eh = !in_interrupt() && current == shost->ehandler; + + /* + * The SCSI EH thread is the only context from which srp_queuecommand() + * can get invoked for blocked devices (SDEV_BLOCK / + * SDEV_CREATED_BLOCK). Avoid racing with srp_reconnect_rport() by + * locking the rport mutex if invoked from inside the SCSI EH. + */ + if (in_scsi_eh) + mutex_lock(&rport->mutex); result = srp_chkready(target->rport); if (unlikely(result)) { scmnd->result = result; scmnd->scsi_done(scmnd); - return 0; + goto unlock_rport; } spin_lock_irqsave(&target->lock, flags); @@ -1482,6 +1502,10 @@ static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) goto err_unmap; } +unlock_rport: + if (in_scsi_eh) + mutex_unlock(&rport->mutex); + return 0; err_unmap: @@ -1496,6 +1520,9 @@ err_iu: err_unlock: spin_unlock_irqrestore(&target->lock, flags); + if (in_scsi_eh) + mutex_unlock(&rport->mutex); + return SCSI_MLQUEUE_HOST_BUSY; } @@ -1780,6 +1807,7 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) static int srp_send_tsk_mgmt(struct srp_target_port *target, u64 req_tag, unsigned int lun, u8 func) { + struct srp_rport *rport = target->rport; struct ib_device *dev = target->srp_host->srp_dev->dev; struct srp_iu *iu; struct srp_tsk_mgmt *tsk_mgmt; @@ -1789,12 +1817,20 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target, init_completion(&target->tsk_mgmt_done); + /* + * Lock the rport mutex to avoid that srp_create_target_ib() is + * invoked while a task management function is being sent. + */ + mutex_lock(&rport->mutex); spin_lock_irq(&target->lock); iu = __srp_get_tx_iu(target, SRP_IU_TSK_MGMT); spin_unlock_irq(&target->lock); - if (!iu) + if (!iu) { + mutex_unlock(&rport->mutex); + return -1; + } ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt, DMA_TO_DEVICE); @@ -1811,8 +1847,11 @@ static int srp_send_tsk_mgmt(struct srp_target_port *target, DMA_TO_DEVICE); if (srp_post_send(target, iu, sizeof *tsk_mgmt)) { srp_put_tx_iu(target, iu, SRP_IU_TSK_MGMT); + mutex_unlock(&rport->mutex); + return -1; } + mutex_unlock(&rport->mutex); if (!wait_for_completion_timeout(&target->tsk_mgmt_done, msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS))) @@ -2713,6 +2752,7 @@ static void srp_remove_one(struct ib_device *device) static struct srp_function_template ib_srp_transport_functions = { .has_rport_state = true, .reset_timer_if_blocked = true, + .reconnect_delay = &srp_reconnect_delay, .fast_io_fail_tmo = &srp_fast_io_fail_tmo, .dev_loss_tmo = &srp_dev_loss_tmo, .reconnect = srp_rport_reconnect, From 848b3082dba4215e886c8a595b61f5b70247b341 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 26 Oct 2013 14:38:12 +0200 Subject: [PATCH 39/62] IB/srp: Export sgid to sysfs On an initiator system with multiple IB ports it is not yet possible to figure out what the originating port of an SRP connection is. Hence make the source GID available in sysfs. Signed-off-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- Documentation/ABI/stable/sysfs-driver-ib_srp | 7 +++++++ drivers/infiniband/ulp/srp/ib_srp.c | 10 ++++++++++ 2 files changed, 17 insertions(+) diff --git a/Documentation/ABI/stable/sysfs-driver-ib_srp b/Documentation/ABI/stable/sysfs-driver-ib_srp index 18e9b279ebb1..b95067eb5f18 100644 --- a/Documentation/ABI/stable/sysfs-driver-ib_srp +++ b/Documentation/ABI/stable/sysfs-driver-ib_srp @@ -155,6 +155,13 @@ Contact: linux-rdma@vger.kernel.org Description: InfiniBand service ID used for establishing communication with the SRP target. +What: /sys/class/scsi_host/host/sgid +Date: February 1, 2014 +KernelVersion: 3.13 +Contact: linux-rdma@vger.kernel.org +Description: InfiniBand GID of the source port used for communication with + the SRP target. + What: /sys/class/scsi_host/host/zero_req_lim Date: September 20, 2006 KernelVersion: 2.6.18 diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index ebbe01bdd306..62e1f2a1c522 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1965,6 +1965,14 @@ static ssize_t show_pkey(struct device *dev, struct device_attribute *attr, return sprintf(buf, "0x%04x\n", be16_to_cpu(target->path.pkey)); } +static ssize_t show_sgid(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct srp_target_port *target = host_to_target(class_to_shost(dev)); + + return sprintf(buf, "%pI6\n", target->path.sgid.raw); +} + static ssize_t show_dgid(struct device *dev, struct device_attribute *attr, char *buf) { @@ -2049,6 +2057,7 @@ static DEVICE_ATTR(id_ext, S_IRUGO, show_id_ext, NULL); static DEVICE_ATTR(ioc_guid, S_IRUGO, show_ioc_guid, NULL); static DEVICE_ATTR(service_id, S_IRUGO, show_service_id, NULL); static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL); +static DEVICE_ATTR(sgid, S_IRUGO, show_sgid, NULL); static DEVICE_ATTR(dgid, S_IRUGO, show_dgid, NULL); static DEVICE_ATTR(orig_dgid, S_IRUGO, show_orig_dgid, NULL); static DEVICE_ATTR(req_lim, S_IRUGO, show_req_lim, NULL); @@ -2065,6 +2074,7 @@ static struct device_attribute *srp_host_attrs[] = { &dev_attr_ioc_guid, &dev_attr_service_id, &dev_attr_pkey, + &dev_attr_sgid, &dev_attr_dgid, &dev_attr_orig_dgid, &dev_attr_req_lim, From b81d00bddfdccf000f29a5ba5bf7caa8f26fa00b Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 26 Oct 2013 14:38:47 +0200 Subject: [PATCH 40/62] IB/srp: Introduce srp_alloc_req_data() This patch does not change any functionality. Signed-off-by: Bart Van Assche Acked-by: David Dillow Cc: Roland Dreier Cc: Vu Pham Cc: Sebastian Riemer Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 64 ++++++++++++++++++----------- 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 62e1f2a1c522..735af11d9b82 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -576,6 +576,42 @@ static void srp_free_req_data(struct srp_target_port *target) } } +static int srp_alloc_req_data(struct srp_target_port *target) +{ + struct srp_device *srp_dev = target->srp_host->srp_dev; + struct ib_device *ibdev = srp_dev->dev; + struct srp_request *req; + dma_addr_t dma_addr; + int i, ret = -ENOMEM; + + INIT_LIST_HEAD(&target->free_reqs); + + for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { + req = &target->req_ring[i]; + req->fmr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *), + GFP_KERNEL); + req->map_page = kmalloc(SRP_FMR_SIZE * sizeof(void *), + GFP_KERNEL); + req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL); + if (!req->fmr_list || !req->map_page || !req->indirect_desc) + goto out; + + dma_addr = ib_dma_map_single(ibdev, req->indirect_desc, + target->indirect_size, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(ibdev, dma_addr)) + goto out; + + req->indirect_dma_addr = dma_addr; + req->index = i; + list_add_tail(&req->list, &target->free_reqs); + } + ret = 0; + +out: + return ret; +} + /** * srp_del_scsi_host_attr() - Remove attributes defined in the host template. * @shost: SCSI host whose attributes to remove from sysfs. @@ -2433,8 +2469,7 @@ static ssize_t srp_create_target(struct device *dev, struct Scsi_Host *target_host; struct srp_target_port *target; struct ib_device *ibdev = host->srp_dev->dev; - dma_addr_t dma_addr; - int i, ret; + int ret; target_host = scsi_host_alloc(&srp_template, sizeof (struct srp_target_port)); @@ -2490,28 +2525,9 @@ static ssize_t srp_create_target(struct device *dev, INIT_WORK(&target->remove_work, srp_remove_work); spin_lock_init(&target->lock); INIT_LIST_HEAD(&target->free_tx); - INIT_LIST_HEAD(&target->free_reqs); - for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { - struct srp_request *req = &target->req_ring[i]; - - req->fmr_list = kmalloc(target->cmd_sg_cnt * sizeof (void *), - GFP_KERNEL); - req->map_page = kmalloc(SRP_FMR_SIZE * sizeof (void *), - GFP_KERNEL); - req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL); - if (!req->fmr_list || !req->map_page || !req->indirect_desc) - goto err_free_mem; - - dma_addr = ib_dma_map_single(ibdev, req->indirect_desc, - target->indirect_size, - DMA_TO_DEVICE); - if (ib_dma_mapping_error(ibdev, dma_addr)) - goto err_free_mem; - - req->indirect_dma_addr = dma_addr; - req->index = i; - list_add_tail(&req->list, &target->free_reqs); - } + ret = srp_alloc_req_data(target); + if (ret) + goto err_free_mem; ib_query_gid(ibdev, host->port, 0, &target->path.sgid); From 4d73f95f708ce540a85113b00f5363d0593d871d Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Sat, 26 Oct 2013 14:40:37 +0200 Subject: [PATCH 41/62] IB/srp: Make queue size configurable Certain storage configurations, e.g. a sufficiently large array of hard disks in a RAID configuration, need a queue depth above 64 to achieve optimal performance. Hence make the queue depth configurable. Signed-off-by: Bart Van Assche Acked-by: David Dillow Tested-by: Jack Wang Signed-off-by: Roland Dreier --- Documentation/ABI/stable/sysfs-driver-ib_srp | 4 + drivers/infiniband/ulp/srp/ib_srp.c | 125 ++++++++++++++----- drivers/infiniband/ulp/srp/ib_srp.h | 17 ++- 3 files changed, 107 insertions(+), 39 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-driver-ib_srp b/Documentation/ABI/stable/sysfs-driver-ib_srp index b95067eb5f18..b9688de8455b 100644 --- a/Documentation/ABI/stable/sysfs-driver-ib_srp +++ b/Documentation/ABI/stable/sysfs-driver-ib_srp @@ -63,6 +63,10 @@ Description: Interface for making ib_srp connect to a new target. over multiple CPU's. * tl_retry_count, a number in the range 2..7 specifying the IB RC retry count. + * queue_size, the maximum number of commands that the + initiator is allowed to queue per SCSI host. The default + value for this parameter is 62. The lowest supported value + is 2. What: /sys/class/infiniband_srp/srp--/ibdev Date: January 2, 2006 diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 735af11d9b82..6aa660d188a3 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -299,16 +299,16 @@ static int srp_create_target_ib(struct srp_target_port *target) return -ENOMEM; recv_cq = ib_create_cq(target->srp_host->srp_dev->dev, - srp_recv_completion, NULL, target, SRP_RQ_SIZE, - target->comp_vector); + srp_recv_completion, NULL, target, + target->queue_size, target->comp_vector); if (IS_ERR(recv_cq)) { ret = PTR_ERR(recv_cq); goto err; } send_cq = ib_create_cq(target->srp_host->srp_dev->dev, - srp_send_completion, NULL, target, SRP_SQ_SIZE, - target->comp_vector); + srp_send_completion, NULL, target, + target->queue_size, target->comp_vector); if (IS_ERR(send_cq)) { ret = PTR_ERR(send_cq); goto err_recv_cq; @@ -317,8 +317,8 @@ static int srp_create_target_ib(struct srp_target_port *target) ib_req_notify_cq(recv_cq, IB_CQ_NEXT_COMP); init_attr->event_handler = srp_qp_event; - init_attr->cap.max_send_wr = SRP_SQ_SIZE; - init_attr->cap.max_recv_wr = SRP_RQ_SIZE; + init_attr->cap.max_send_wr = target->queue_size; + init_attr->cap.max_recv_wr = target->queue_size; init_attr->cap.max_recv_sge = 1; init_attr->cap.max_send_sge = 1; init_attr->sq_sig_type = IB_SIGNAL_ALL_WR; @@ -364,6 +364,10 @@ err: return ret; } +/* + * Note: this function may be called without srp_alloc_iu_bufs() having been + * invoked. Hence the target->[rt]x_ring checks. + */ static void srp_free_target_ib(struct srp_target_port *target) { int i; @@ -375,10 +379,18 @@ static void srp_free_target_ib(struct srp_target_port *target) target->qp = NULL; target->send_cq = target->recv_cq = NULL; - for (i = 0; i < SRP_RQ_SIZE; ++i) - srp_free_iu(target->srp_host, target->rx_ring[i]); - for (i = 0; i < SRP_SQ_SIZE; ++i) - srp_free_iu(target->srp_host, target->tx_ring[i]); + if (target->rx_ring) { + for (i = 0; i < target->queue_size; ++i) + srp_free_iu(target->srp_host, target->rx_ring[i]); + kfree(target->rx_ring); + target->rx_ring = NULL; + } + if (target->tx_ring) { + for (i = 0; i < target->queue_size; ++i) + srp_free_iu(target->srp_host, target->tx_ring[i]); + kfree(target->tx_ring); + target->tx_ring = NULL; + } } static void srp_path_rec_completion(int status, @@ -564,7 +576,11 @@ static void srp_free_req_data(struct srp_target_port *target) struct srp_request *req; int i; - for (i = 0, req = target->req_ring; i < SRP_CMD_SQ_SIZE; ++i, ++req) { + if (!target->req_ring) + return; + + for (i = 0; i < target->req_ring_size; ++i) { + req = &target->req_ring[i]; kfree(req->fmr_list); kfree(req->map_page); if (req->indirect_dma_addr) { @@ -574,6 +590,9 @@ static void srp_free_req_data(struct srp_target_port *target) } kfree(req->indirect_desc); } + + kfree(target->req_ring); + target->req_ring = NULL; } static int srp_alloc_req_data(struct srp_target_port *target) @@ -586,7 +605,12 @@ static int srp_alloc_req_data(struct srp_target_port *target) INIT_LIST_HEAD(&target->free_reqs); - for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { + target->req_ring = kzalloc(target->req_ring_size * + sizeof(*target->req_ring), GFP_KERNEL); + if (!target->req_ring) + goto out; + + for (i = 0; i < target->req_ring_size; ++i) { req = &target->req_ring[i]; req->fmr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *), GFP_KERNEL); @@ -810,7 +834,7 @@ static void srp_terminate_io(struct srp_rport *rport) struct srp_target_port *target = rport->lld_data; int i; - for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { + for (i = 0; i < target->req_ring_size; ++i) { struct srp_request *req = &target->req_ring[i]; srp_finish_req(target, req, DID_TRANSPORT_FAILFAST << 16); } @@ -847,13 +871,13 @@ static int srp_rport_reconnect(struct srp_rport *rport) else srp_create_target_ib(target); - for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { + for (i = 0; i < target->req_ring_size; ++i) { struct srp_request *req = &target->req_ring[i]; srp_finish_req(target, req, DID_RESET << 16); } INIT_LIST_HEAD(&target->free_tx); - for (i = 0; i < SRP_SQ_SIZE; ++i) + for (i = 0; i < target->queue_size; ++i) list_add(&target->tx_ring[i]->list, &target->free_tx); if (ret == 0) @@ -1562,11 +1586,24 @@ err_unlock: return SCSI_MLQUEUE_HOST_BUSY; } +/* + * Note: the resources allocated in this function are freed in + * srp_free_target_ib(). + */ static int srp_alloc_iu_bufs(struct srp_target_port *target) { int i; - for (i = 0; i < SRP_RQ_SIZE; ++i) { + target->rx_ring = kzalloc(target->queue_size * sizeof(*target->rx_ring), + GFP_KERNEL); + if (!target->rx_ring) + goto err_no_ring; + target->tx_ring = kzalloc(target->queue_size * sizeof(*target->tx_ring), + GFP_KERNEL); + if (!target->tx_ring) + goto err_no_ring; + + for (i = 0; i < target->queue_size; ++i) { target->rx_ring[i] = srp_alloc_iu(target->srp_host, target->max_ti_iu_len, GFP_KERNEL, DMA_FROM_DEVICE); @@ -1574,7 +1611,7 @@ static int srp_alloc_iu_bufs(struct srp_target_port *target) goto err; } - for (i = 0; i < SRP_SQ_SIZE; ++i) { + for (i = 0; i < target->queue_size; ++i) { target->tx_ring[i] = srp_alloc_iu(target->srp_host, target->max_iu_len, GFP_KERNEL, DMA_TO_DEVICE); @@ -1587,15 +1624,17 @@ static int srp_alloc_iu_bufs(struct srp_target_port *target) return 0; err: - for (i = 0; i < SRP_RQ_SIZE; ++i) { + for (i = 0; i < target->queue_size; ++i) { srp_free_iu(target->srp_host, target->rx_ring[i]); - target->rx_ring[i] = NULL; + srp_free_iu(target->srp_host, target->tx_ring[i]); } - for (i = 0; i < SRP_SQ_SIZE; ++i) { - srp_free_iu(target->srp_host, target->tx_ring[i]); - target->tx_ring[i] = NULL; - } + +err_no_ring: + kfree(target->tx_ring); + target->tx_ring = NULL; + kfree(target->rx_ring); + target->rx_ring = NULL; return -ENOMEM; } @@ -1647,6 +1686,9 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id, target->scsi_host->can_queue = min(target->req_lim - SRP_TSK_MGMT_SQ_SIZE, target->scsi_host->can_queue); + target->scsi_host->cmd_per_lun + = min_t(int, target->scsi_host->can_queue, + target->scsi_host->cmd_per_lun); } else { shost_printk(KERN_WARNING, target->scsi_host, PFX "Unhandled RSP opcode %#x\n", lrsp->opcode); @@ -1654,7 +1696,7 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id, goto error; } - if (!target->rx_ring[0]) { + if (!target->rx_ring) { ret = srp_alloc_iu_bufs(target); if (ret) goto error; @@ -1674,7 +1716,7 @@ static void srp_cm_rep_handler(struct ib_cm_id *cm_id, if (ret) goto error_free; - for (i = 0; i < SRP_RQ_SIZE; i++) { + for (i = 0; i < target->queue_size; i++) { struct srp_iu *iu = target->rx_ring[i]; ret = srp_post_recv(target, iu); if (ret) @@ -1933,7 +1975,7 @@ static int srp_reset_device(struct scsi_cmnd *scmnd) if (target->tsk_mgmt_status) return FAILED; - for (i = 0; i < SRP_CMD_SQ_SIZE; ++i) { + for (i = 0; i < target->req_ring_size; ++i) { struct srp_request *req = &target->req_ring[i]; if (req->scmnd && req->scmnd->device == scmnd->device) srp_finish_req(target, req, DID_RESET << 16); @@ -2136,9 +2178,9 @@ static struct scsi_host_template srp_template = { .eh_host_reset_handler = srp_reset_host, .skip_settle_delay = true, .sg_tablesize = SRP_DEF_SG_TABLESIZE, - .can_queue = SRP_CMD_SQ_SIZE, + .can_queue = SRP_DEFAULT_CMD_SQ_SIZE, .this_id = -1, - .cmd_per_lun = SRP_CMD_SQ_SIZE, + .cmd_per_lun = SRP_DEFAULT_CMD_SQ_SIZE, .use_clustering = ENABLE_CLUSTERING, .shost_attrs = srp_host_attrs }; @@ -2245,6 +2287,7 @@ enum { SRP_OPT_SG_TABLESIZE = 1 << 11, SRP_OPT_COMP_VECTOR = 1 << 12, SRP_OPT_TL_RETRY_COUNT = 1 << 13, + SRP_OPT_QUEUE_SIZE = 1 << 14, SRP_OPT_ALL = (SRP_OPT_ID_EXT | SRP_OPT_IOC_GUID | SRP_OPT_DGID | @@ -2267,6 +2310,7 @@ static const match_table_t srp_opt_tokens = { { SRP_OPT_SG_TABLESIZE, "sg_tablesize=%u" }, { SRP_OPT_COMP_VECTOR, "comp_vector=%u" }, { SRP_OPT_TL_RETRY_COUNT, "tl_retry_count=%u" }, + { SRP_OPT_QUEUE_SIZE, "queue_size=%d" }, { SRP_OPT_ERR, NULL } }; @@ -2361,13 +2405,25 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) target->scsi_host->max_sectors = token; break; + case SRP_OPT_QUEUE_SIZE: + if (match_int(args, &token) || token < 1) { + pr_warn("bad queue_size parameter '%s'\n", p); + goto out; + } + target->scsi_host->can_queue = token; + target->queue_size = token + SRP_RSP_SQ_SIZE + + SRP_TSK_MGMT_SQ_SIZE; + if (!(opt_mask & SRP_OPT_MAX_CMD_PER_LUN)) + target->scsi_host->cmd_per_lun = token; + break; + case SRP_OPT_MAX_CMD_PER_LUN: - if (match_int(args, &token)) { + if (match_int(args, &token) || token < 1) { pr_warn("bad max cmd_per_lun parameter '%s'\n", p); goto out; } - target->scsi_host->cmd_per_lun = min(token, SRP_CMD_SQ_SIZE); + target->scsi_host->cmd_per_lun = token; break; case SRP_OPT_IO_CLASS: @@ -2455,6 +2511,12 @@ static int srp_parse_options(const char *buf, struct srp_target_port *target) pr_warn("target creation request is missing parameter '%s'\n", srp_opt_tokens[i].pattern); + if (target->scsi_host->cmd_per_lun > target->scsi_host->can_queue + && (opt_mask & SRP_OPT_MAX_CMD_PER_LUN)) + pr_warn("cmd_per_lun = %d > queue_size = %d\n", + target->scsi_host->cmd_per_lun, + target->scsi_host->can_queue); + out: kfree(options); return ret; @@ -2493,11 +2555,14 @@ static ssize_t srp_create_target(struct device *dev, target->sg_tablesize = indirect_sg_entries ? : cmd_sg_entries; target->allow_ext_sg = allow_ext_sg; target->tl_retry_count = 7; + target->queue_size = SRP_DEFAULT_QUEUE_SIZE; ret = srp_parse_options(buf, target); if (ret) goto err; + target->req_ring_size = target->queue_size - SRP_TSK_MGMT_SQ_SIZE; + if (!srp_conn_unique(target->srp_host, target)) { shost_printk(KERN_INFO, target->scsi_host, PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n", diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index 446b0452107b..575681063f38 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -57,14 +57,11 @@ enum { SRP_MAX_LUN = 512, SRP_DEF_SG_TABLESIZE = 12, - SRP_RQ_SHIFT = 6, - SRP_RQ_SIZE = 1 << SRP_RQ_SHIFT, - - SRP_SQ_SIZE = SRP_RQ_SIZE, + SRP_DEFAULT_QUEUE_SIZE = 1 << 6, SRP_RSP_SQ_SIZE = 1, - SRP_REQ_SQ_SIZE = SRP_SQ_SIZE - SRP_RSP_SQ_SIZE, SRP_TSK_MGMT_SQ_SIZE = 1, - SRP_CMD_SQ_SIZE = SRP_REQ_SQ_SIZE - SRP_TSK_MGMT_SQ_SIZE, + SRP_DEFAULT_CMD_SQ_SIZE = SRP_DEFAULT_QUEUE_SIZE - SRP_RSP_SQ_SIZE - + SRP_TSK_MGMT_SQ_SIZE, SRP_TAG_NO_REQ = ~0U, SRP_TAG_TSK_MGMT = 1U << 31, @@ -156,6 +153,8 @@ struct srp_target_port { char target_name[32]; unsigned int scsi_id; unsigned int sg_tablesize; + int queue_size; + int req_ring_size; int comp_vector; int tl_retry_count; @@ -173,9 +172,9 @@ struct srp_target_port { int zero_req_lim; - struct srp_iu *tx_ring[SRP_SQ_SIZE]; - struct srp_iu *rx_ring[SRP_RQ_SIZE]; - struct srp_request req_ring[SRP_CMD_SQ_SIZE]; + struct srp_iu **tx_ring; + struct srp_iu **rx_ring; + struct srp_request *req_ring; struct work_struct tl_err_work; struct work_struct remove_work; From 71444b97816ae08a1cc435f044f9f1f4cbd51e6a Mon Sep 17 00:00:00 2001 From: Jack Wang Date: Thu, 7 Nov 2013 11:37:37 +0100 Subject: [PATCH 42/62] IB/srp: Add change_queue_depth and change_queue_type support Currently, it's not possible to change queue depth for a device behind SRP host. Sometimes, we need to adjust queue_depth for performance reason (eg storage busy, we need lower queue_depth to avoid running into SCSI error handler), so this patch add support for SRP driver. Signed-off-by: Jack Wang Tested-by: Bart Van Assche Acked-by: David Dillow Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 54 +++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 6aa660d188a3..a8f8d0b8a777 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -1882,6 +1883,57 @@ static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) return 0; } +/** + * srp_change_queue_type - changing device queue tag type + * @sdev: scsi device struct + * @tag_type: requested tag type + * + * Returns queue tag type. + */ +static int +srp_change_queue_type(struct scsi_device *sdev, int tag_type) +{ + if (sdev->tagged_supported) { + scsi_set_tag_type(sdev, tag_type); + if (tag_type) + scsi_activate_tcq(sdev, sdev->queue_depth); + else + scsi_deactivate_tcq(sdev, sdev->queue_depth); + } else + tag_type = 0; + + return tag_type; +} + +/** + * srp_change_queue_depth - setting device queue depth + * @sdev: scsi device struct + * @qdepth: requested queue depth + * @reason: SCSI_QDEPTH_DEFAULT/SCSI_QDEPTH_QFULL/SCSI_QDEPTH_RAMP_UP + * (see include/scsi/scsi_host.h for definition) + * + * Returns queue depth. + */ +static int +srp_change_queue_depth(struct scsi_device *sdev, int qdepth, int reason) +{ + struct Scsi_Host *shost = sdev->host; + int max_depth; + if (reason == SCSI_QDEPTH_DEFAULT || reason == SCSI_QDEPTH_RAMP_UP) { + max_depth = shost->can_queue; + if (!sdev->tagged_supported) + max_depth = 1; + if (qdepth > max_depth) + qdepth = max_depth; + scsi_adjust_queue_depth(sdev, scsi_get_tag_type(sdev), qdepth); + } else if (reason == SCSI_QDEPTH_QFULL) + scsi_track_queue_full(sdev, qdepth); + else + return -EOPNOTSUPP; + + return sdev->queue_depth; +} + static int srp_send_tsk_mgmt(struct srp_target_port *target, u64 req_tag, unsigned int lun, u8 func) { @@ -2173,6 +2225,8 @@ static struct scsi_host_template srp_template = { .slave_configure = srp_slave_configure, .info = srp_target_info, .queuecommand = srp_queuecommand, + .change_queue_depth = srp_change_queue_depth, + .change_queue_type = srp_change_queue_type, .eh_abort_handler = srp_abort, .eh_device_reset_handler = srp_reset_device, .eh_host_reset_handler = srp_reset_host, From 65d7dd2f3479ef5aec1d9ddd1481cb7851c11af6 Mon Sep 17 00:00:00 2001 From: Vu Pham Date: Thu, 10 Oct 2013 13:50:29 +0200 Subject: [PATCH 43/62] IB/srp: Remove target from list before freeing Scsi_Host structure Remove an SRP target from the SRP target list before invoking the last scsi_host_put() call. This change is necessary because that last put frees the memory that holds the srp_target_port structure. This patch prevents the following kernel oops: RIP: 0010:[] __lock_acquire+0x500/0x1570 Call Trace: [] lock_acquire+0xa4/0x120 [] _spin_lock+0x36/0x70 [] srp_remove_work+0xef/0x180 [ib_srp] [] worker_thread+0x21c/0x3d0 [] kthread+0x96/0xa0 [] child_rip+0xa/0x20 Signed-off-by: Vu Pham [ bvanassche - Modified path description and CC'ed stable. ] Signed-off-by: Bart Van Assche Cc: Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index a8f8d0b8a777..a8c06c4451c0 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -666,6 +666,11 @@ static void srp_remove_target(struct srp_target_port *target) cancel_work_sync(&target->tl_err_work); srp_rport_put(target->rport); srp_free_req_data(target); + + spin_lock(&target->srp_host->target_lock); + list_del(&target->list); + spin_unlock(&target->srp_host->target_lock); + scsi_host_put(target->scsi_host); } @@ -677,10 +682,6 @@ static void srp_remove_work(struct work_struct *work) WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED); srp_remove_target(target); - - spin_lock(&target->srp_host->target_lock); - list_del(&target->list); - spin_unlock(&target->srp_host->target_lock); } static void srp_rport_delete(struct srp_rport *rport) From 99b6697a50c2acbe3ca2772d359fc9a28835dc84 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 10 Oct 2013 13:52:33 +0200 Subject: [PATCH 44/62] IB/srp: Avoid offlining operational SCSI devices If SCSI commands are submitted with a SCSI request timeout that is lower than the the IB RC timeout, it can happen that the SCSI error handler has already started device recovery before transport layer error handling starts. So it can happen that the SCSI error handler tries to abort a SCSI command after it has been reset by srp_rport_reconnect(). Tell the SCSI error handler that such commands have finished and that it is not necessary to continue its recovery strategy for commands that have been reset by srp_rport_reconnect(). Signed-off-by: Bart Van Assche Cc: Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index a8c06c4451c0..4bf2f9feb59a 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -2000,7 +2000,7 @@ static int srp_abort(struct scsi_cmnd *scmnd) shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n"); if (!req || !srp_claim_req(target, req, scmnd)) - return FAILED; + return SUCCESS; if (srp_send_tsk_mgmt(target, req->index, scmnd->device->lun, SRP_TSK_ABORT_TASK) == 0) ret = SUCCESS; From cd4e38542a5c2cab94e5410fb17c1cc004a60792 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 10 Oct 2013 13:53:25 +0200 Subject: [PATCH 45/62] IB/srp: Report receive errors correctly The IB spec does not guarantee that the opcode is available in error completions. Hence do not rely on it. See also commit 948d1e889e5b ("IB/srp: Introduce srp_handle_qp_err()"). Signed-off-by: Bart Van Assche Cc: # v3.8 Signed-off-by: Roland Dreier --- drivers/infiniband/ulp/srp/ib_srp.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 4bf2f9feb59a..a88631918e85 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -1449,14 +1449,13 @@ static void srp_tl_err_work(struct work_struct *work) srp_start_tl_fail_timers(target->rport); } -static void srp_handle_qp_err(enum ib_wc_status wc_status, - enum ib_wc_opcode wc_opcode, +static void srp_handle_qp_err(enum ib_wc_status wc_status, bool send_err, struct srp_target_port *target) { if (target->connected && !target->qp_in_error) { shost_printk(KERN_ERR, target->scsi_host, PFX "failed %s status %d\n", - wc_opcode & IB_WC_RECV ? "receive" : "send", + send_err ? "send" : "receive", wc_status); queue_work(system_long_wq, &target->tl_err_work); } @@ -1473,7 +1472,7 @@ static void srp_recv_completion(struct ib_cq *cq, void *target_ptr) if (likely(wc.status == IB_WC_SUCCESS)) { srp_handle_recv(target, &wc); } else { - srp_handle_qp_err(wc.status, wc.opcode, target); + srp_handle_qp_err(wc.status, false, target); } } } @@ -1489,7 +1488,7 @@ static void srp_send_completion(struct ib_cq *cq, void *target_ptr) iu = (struct srp_iu *) (uintptr_t) wc.wr_id; list_add(&iu->list, &target->free_tx); } else { - srp_handle_qp_err(wc.status, wc.opcode, target); + srp_handle_qp_err(wc.status, true, target); } } } From 4127c365c9dbd4412e70b1c01f6ae1e3624cafac Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 17 Sep 2013 17:45:54 -0400 Subject: [PATCH 46/62] RDMA/nes: Remove self-assignment from nes_query_qp() Assigning a value to itself is pointless. Spotted with coverity, no hardware to test. Signed-off-by: Dave Jones Signed-off-by: Roland Dreier --- drivers/infiniband/hw/nes/nes_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 5b53ca5a2284..8308e3634767 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -2834,7 +2834,7 @@ static int nes_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, init_attr->qp_context = nesqp->ibqp.qp_context; init_attr->send_cq = nesqp->ibqp.send_cq; init_attr->recv_cq = nesqp->ibqp.recv_cq; - init_attr->srq = nesqp->ibqp.srq = nesqp->ibqp.srq; + init_attr->srq = nesqp->ibqp.srq; init_attr->cap = attr->cap; return 0; From 180771a3707a4c0577cbf4f830c754dbabfdfccb Mon Sep 17 00:00:00 2001 From: "Upinder Malhi \\(umalhi\\)" Date: Tue, 10 Sep 2013 03:36:59 +0000 Subject: [PATCH 47/62] IB/core: Add Cisco usNIC rdma node and transport types This patch adds new rdma node and new rdma transport, and supporting code used by Cisco's low latency driver called usNIC. usNIC uses its own transport, distinct from IB and iWARP. Signed-off-by: Upinder Malhi Signed-off-by: Jeff Squyres Signed-off-by: Roland Dreier --- drivers/infiniband/core/sysfs.c | 1 + drivers/infiniband/core/verbs.c | 3 +++ include/rdma/ib_verbs.h | 6 ++++-- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index cde1e7b5b85d..faad2caf22b1 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -612,6 +612,7 @@ static ssize_t show_node_type(struct device *device, switch (dev->node_type) { case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type); case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type); + case RDMA_NODE_USNIC: return sprintf(buf, "%d: usNIC\n", dev->node_type); case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type); case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type); default: return sprintf(buf, "%d: \n", dev->node_type); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index a321df28bab2..84f502785f89 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -114,6 +114,8 @@ rdma_node_get_transport(enum rdma_node_type node_type) return RDMA_TRANSPORT_IB; case RDMA_NODE_RNIC: return RDMA_TRANSPORT_IWARP; + case RDMA_NODE_USNIC: + return RDMA_TRANSPORT_USNIC; default: BUG(); return 0; @@ -130,6 +132,7 @@ enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_ case RDMA_TRANSPORT_IB: return IB_LINK_LAYER_INFINIBAND; case RDMA_TRANSPORT_IWARP: + case RDMA_TRANSPORT_USNIC: return IB_LINK_LAYER_ETHERNET; default: return IB_LINK_LAYER_UNSPECIFIED; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e393171e2fac..60354d53948e 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -67,12 +67,14 @@ enum rdma_node_type { RDMA_NODE_IB_CA = 1, RDMA_NODE_IB_SWITCH, RDMA_NODE_IB_ROUTER, - RDMA_NODE_RNIC + RDMA_NODE_RNIC, + RDMA_NODE_USNIC, }; enum rdma_transport_type { RDMA_TRANSPORT_IB, - RDMA_TRANSPORT_IWARP + RDMA_TRANSPORT_IWARP, + RDMA_TRANSPORT_USNIC }; enum rdma_transport_type From 93b80ac297b1cfdf3bea771a5ce6ea4ff5d25d1d Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 31 Oct 2013 15:26:35 +0200 Subject: [PATCH 48/62] IB/mlx4: Fix endless loop in resize CQ When calling get_sw_cqe() we need pass the consumer_index and not the masked value. Failure to do so will cause incorrect result of get_sw_cqe() possibly leading to endless loop. This problem was reported and analyzed by Michael Rice from HP. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/cq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index d5e60f44ba5a..3fe1a68a500a 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -324,7 +324,7 @@ static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq) u32 i; i = cq->mcq.cons_index; - while (get_sw_cqe(cq, i & cq->ibcq.cqe)) + while (get_sw_cqe(cq, i)) ++i; return i - cq->mcq.cons_index; From 1c636f801615bdfc9b1d46904e8258c7a025670b Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 31 Oct 2013 15:26:32 +0200 Subject: [PATCH 49/62] IB/core: Encorce MR access rights rules on kernel consumers Enforce the rule that when requesting remote write or atomic permissions, local write must be indicated as well. See IB spec 11.2.8.2. Spotted by: Hagay Abramovsky Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 10 +++------- drivers/infiniband/core/verbs.c | 14 ++++++++++++++ include/rdma/ib_verbs.h | 13 +++++++++++++ 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 5bb2a82d52e7..7f671b7b8bac 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -939,13 +939,9 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)) return -EINVAL; - /* - * Local write permission is required if remote write or - * remote atomic permission is also requested. - */ - if (cmd.access_flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) && - !(cmd.access_flags & IB_ACCESS_LOCAL_WRITE)) - return -EINVAL; + ret = ib_check_mr_access(cmd.access_flags); + if (ret) + return ret; uobj = kmalloc(sizeof *uobj, GFP_KERNEL); if (!uobj) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 84f502785f89..d4f6ddf72ffa 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -961,6 +961,11 @@ EXPORT_SYMBOL(ib_resize_cq); struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) { struct ib_mr *mr; + int err; + + err = ib_check_mr_access(mr_access_flags); + if (err) + return ERR_PTR(err); mr = pd->device->get_dma_mr(pd, mr_access_flags); @@ -983,6 +988,11 @@ struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd, u64 *iova_start) { struct ib_mr *mr; + int err; + + err = ib_check_mr_access(mr_access_flags); + if (err) + return ERR_PTR(err); if (!pd->device->reg_phys_mr) return ERR_PTR(-ENOSYS); @@ -1013,6 +1023,10 @@ int ib_rereg_phys_mr(struct ib_mr *mr, struct ib_pd *old_pd; int ret; + ret = ib_check_mr_access(mr_access_flags); + if (ret) + return ret; + if (!mr->device->rereg_phys_mr) return -ENOSYS; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 60354d53948e..68c053d0d629 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2386,4 +2386,17 @@ struct ib_flow *ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr, int domain); int ib_destroy_flow(struct ib_flow *flow_id); +static inline int ib_check_mr_access(int flags) +{ + /* + * Local write permission is required if remote write or + * remote atomic permission is also requested. + */ + if (flags & (IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_REMOTE_WRITE) && + !(flags & IB_ACCESS_LOCAL_WRITE)) + return -EINVAL; + + return 0; +} + #endif /* IB_VERBS_H */ From 7e2e19210a8bbbcacd31e8ce4a0ea64e3ac37dea Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 31 Oct 2013 15:26:31 +0200 Subject: [PATCH 50/62] IB/mlx5: Remove dead code The value of the local variable index is never used in reg_mr_callback(). Signed-off-by: Eli Cohen [ Remove now-unused variable delta too. - Roland ] Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/mr.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 2c4626f7b1b8..039c3e40fcb4 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -72,14 +72,8 @@ static void reg_mr_callback(int status, void *context) int c = order2idx(dev, mr->order); struct mlx5_cache_ent *ent = &cache->ent[c]; u8 key; - unsigned long delta = jiffies - mr->start; - unsigned long index; unsigned long flags; - index = find_last_bit(&delta, 8 * sizeof(delta)); - if (index == 64) - index = 0; - spin_lock_irqsave(&ent->lock, flags); ent->pending--; spin_unlock_irqrestore(&ent->lock, flags); From 2b136d025348774633a2f6fc2a87f0cf409a3ab9 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 31 Oct 2013 15:26:33 +0200 Subject: [PATCH 51/62] IB/mlx5: Fix list_del of empty list For archs with pages size of 4K, when the chunk is freed, fwp is not in the list so avoid attempting to delete it. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c index ba816c25c5c1..37b6ad1f9a1b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c @@ -229,7 +229,8 @@ static void free_4k(struct mlx5_core_dev *dev, u64 addr) set_bit(n, &fwp->bitmask); if (fwp->free_count == MLX5_NUM_4K_IN_PAGE) { rb_erase(&fwp->rb_node, &dev->priv.page_root); - list_del(&fwp->list); + if (fwp->free_count != 1) + list_del(&fwp->list); dma_unmap_page(&dev->pdev->dev, addr, PAGE_SIZE, DMA_BIDIRECTIONAL); __free_page(fwp->page); kfree(fwp); From 79d3da9c51a5fef672b329b8d8583a089d147bef Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 31 Oct 2013 15:26:34 +0200 Subject: [PATCH 52/62] IB/mlx4: Fix device max capabilities check Move the check on max supported CQEs after the final number of entries is evaluated. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx4/cq.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index d5e60f44ba5a..ea299515ecb2 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -365,7 +365,7 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) mutex_lock(&cq->resize_mutex); - if (entries < 1 || entries > dev->dev->caps.max_cqes) { + if (entries < 1) { err = -EINVAL; goto out; } @@ -376,6 +376,11 @@ int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata) goto out; } + if (entries > dev->dev->caps.max_cqes) { + err = -EINVAL; + goto out; + } + if (ibcq->uobject) { err = mlx4_alloc_resize_umem(dev, cq, entries, udata); if (err) From cf1c5e1f1c965cf44e680127b2e9564fc472676c Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Thu, 31 Oct 2013 15:26:36 +0200 Subject: [PATCH 53/62] IB/mlx5: Fix page shift in create CQ for userspace When creating a CQ, we must use mlx5 adapter page shift. Signed-off-by: Eli Cohen Signed-off-by: Roland Dreier --- drivers/infiniband/hw/mlx5/cq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 28344773f640..b72627429745 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -556,7 +556,7 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, goto err_db; } mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, (*cqb)->pas, 0); - (*cqb)->ctx.log_pg_sz = page_shift - PAGE_SHIFT; + (*cqb)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; *index = to_mucontext(context)->uuari.uars[0].index; From ab626d1a68d1aee19a5c7d889b9fbeace6a92b41 Mon Sep 17 00:00:00 2001 From: Zhao Hongjiang Date: Fri, 15 Nov 2013 14:16:37 -0800 Subject: [PATCH 54/62] IB/cm: Convert to using idr_alloc_cyclic() Commit 3e6628c4b347 ("idr: introduce idr_alloc_cyclic()") adds a new idr_alloc_cyclic() routine and converts several of these users to it. This is just a missed one - add it. Signed-off-by: Zhao Hongjiang Signed-off-by: Andrew Morton Signed-off-by: Roland Dreier --- drivers/infiniband/core/cm.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 784b97cb05b0..f2ef7ef0f36f 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -383,14 +383,11 @@ static int cm_alloc_id(struct cm_id_private *cm_id_priv) { unsigned long flags; int id; - static int next_id; idr_preload(GFP_KERNEL); spin_lock_irqsave(&cm.lock, flags); - id = idr_alloc(&cm.local_id_table, cm_id_priv, next_id, 0, GFP_NOWAIT); - if (id >= 0) - next_id = max(id + 1, 0); + id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT); spin_unlock_irqrestore(&cm.lock, flags); idr_preload_end(); From f3a5e3e37e221d29ce611238a22ac61cf76c6614 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 22 Oct 2013 15:29:50 -0700 Subject: [PATCH 55/62] IB/ucma: Convert use of typedef ctl_table to struct ctl_table This typedef is unnecessary and should just be removed. Signed-off-by: Joe Perches Signed-off-by: Roland Dreier --- drivers/infiniband/core/ucma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index b0f189be543b..3f93093632c2 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -57,7 +57,7 @@ MODULE_LICENSE("Dual BSD/GPL"); static unsigned int max_backlog = 1024; static struct ctl_table_header *ucma_ctl_table_hdr; -static ctl_table ucma_ctl_table[] = { +static struct ctl_table ucma_ctl_table[] = { { .procname = "max_backlog", .data = &max_backlog, From f88482743872230f5899b8344a057d6e2fd011e2 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 6 Nov 2013 23:21:44 +0100 Subject: [PATCH 56/62] IB/core: clarify overflow/underflow checks on ib_create/destroy_flow This patch fixes the following issues: 1. Unneeded checks were removed 2. Removed the fixed size out of flow_attr.size, thus simplifying the checks. 3. Remove a 32bit hole on 64bit systems with strict alignment in struct ib_kern_flow_att by adding a reserved field. Signed-off-by: Matan Barak Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 32 +++++++++++++--------------- include/uapi/rdma/ib_user_verbs.h | 1 + 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 2f0f01b70e3b..26ee2d2a8c52 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2657,7 +2657,6 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, void *kern_spec; void *ib_spec; int i; - int kern_attr_size; if (out_len < sizeof(resp)) return -ENOSPC; @@ -2672,32 +2671,28 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW)) return -EPERM; - if (cmd.flow_attr.num_of_specs < 0 || - cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) + if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) return -EINVAL; - kern_attr_size = cmd.flow_attr.size - sizeof(cmd) - - sizeof(struct ib_uverbs_cmd_hdr_ex); - - if (cmd.flow_attr.size < 0 || cmd.flow_attr.size > in_len || - kern_attr_size < 0 || kern_attr_size > + if (cmd.flow_attr.size > (in_len - sizeof(cmd)) || + cmd.flow_attr.size > (cmd.flow_attr.num_of_specs * sizeof(struct ib_kern_spec))) return -EINVAL; if (cmd.flow_attr.num_of_specs) { - kern_flow_attr = kmalloc(cmd.flow_attr.size, GFP_KERNEL); + kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size, + GFP_KERNEL); if (!kern_flow_attr) return -ENOMEM; memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr)); if (copy_from_user(kern_flow_attr + 1, buf + sizeof(cmd), - kern_attr_size)) { + cmd.flow_attr.size)) { err = -EFAULT; goto err_free_attr; } } else { kern_flow_attr = &cmd.flow_attr; - kern_attr_size = sizeof(cmd.flow_attr); } uobj = kmalloc(sizeof(*uobj), GFP_KERNEL); @@ -2714,7 +2709,7 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, goto err_uobj; } - flow_attr = kmalloc(cmd.flow_attr.size, GFP_KERNEL); + flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size, GFP_KERNEL); if (!flow_attr) { err = -ENOMEM; goto err_put; @@ -2729,19 +2724,22 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, kern_spec = kern_flow_attr + 1; ib_spec = flow_attr + 1; - for (i = 0; i < flow_attr->num_of_specs && kern_attr_size > 0; i++) { + for (i = 0; i < flow_attr->num_of_specs && + cmd.flow_attr.size > offsetof(struct ib_kern_spec, reserved) && + cmd.flow_attr.size >= + ((struct ib_kern_spec *)kern_spec)->size; i++) { err = kern_spec_to_ib_spec(kern_spec, ib_spec); if (err) goto err_free; flow_attr->size += ((union ib_flow_spec *) ib_spec)->size; - kern_attr_size -= ((struct ib_kern_spec *) kern_spec)->size; + cmd.flow_attr.size -= ((struct ib_kern_spec *)kern_spec)->size; kern_spec += ((struct ib_kern_spec *) kern_spec)->size; ib_spec += ((union ib_flow_spec *) ib_spec)->size; } - if (kern_attr_size) { - pr_warn("create flow failed, %d bytes left from uverb cmd\n", - kern_attr_size); + if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) { + pr_warn("create flow failed, flow %d: %d bytes left from uverb cmd\n", + i, cmd.flow_attr.size); goto err_free; } flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER); diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index e3ddd86c90a6..99a58bdc1c64 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -771,6 +771,7 @@ struct ib_kern_flow_attr { struct ib_uverbs_create_flow { __u32 comp_mask; + __u32 reserved; __u64 response; __u32 qp_handle; struct ib_kern_flow_attr flow_attr; From d82693dad09b49232cd727ee9e15bd027710edac Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Wed, 6 Nov 2013 23:21:45 +0100 Subject: [PATCH 57/62] IB/core: Rename 'flow' structs to match other uverbs structs Commit 436f2ad05a0b ("IB/core: Export ib_create/destroy_flow through uverbs") added public data structures to support receive flow steering. The new structs are not following the 'uverbs' pattern: they're lacking the common prefix 'ib_uverbs'. This patch replaces ib_kern prefix by ib_uverbs. Signed-off-by: Yann Droneaud Link: http://marc.info/?i=cover.1383773832.git.ydroneaud@opteya.com Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 14 +++++------ include/uapi/rdma/ib_user_verbs.h | 36 ++++++++++++++-------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 26ee2d2a8c52..f83c1dc08481 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2602,7 +2602,7 @@ out_put: } #ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING -static int kern_spec_to_ib_spec(struct ib_kern_spec *kern_spec, +static int kern_spec_to_ib_spec(struct ib_uverbs_spec *kern_spec, union ib_flow_spec *ib_spec) { ib_spec->type = kern_spec->type; @@ -2650,7 +2650,7 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, struct ib_uverbs_create_flow_resp resp; struct ib_uobject *uobj; struct ib_flow *flow_id; - struct ib_kern_flow_attr *kern_flow_attr; + struct ib_uverbs_flow_attr *kern_flow_attr; struct ib_flow_attr *flow_attr; struct ib_qp *qp; int err = 0; @@ -2676,7 +2676,7 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, if (cmd.flow_attr.size > (in_len - sizeof(cmd)) || cmd.flow_attr.size > - (cmd.flow_attr.num_of_specs * sizeof(struct ib_kern_spec))) + (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_spec))) return -EINVAL; if (cmd.flow_attr.num_of_specs) { @@ -2725,16 +2725,16 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, kern_spec = kern_flow_attr + 1; ib_spec = flow_attr + 1; for (i = 0; i < flow_attr->num_of_specs && - cmd.flow_attr.size > offsetof(struct ib_kern_spec, reserved) && + cmd.flow_attr.size > offsetof(struct ib_uverbs_spec, reserved) && cmd.flow_attr.size >= - ((struct ib_kern_spec *)kern_spec)->size; i++) { + ((struct ib_uverbs_spec *)kern_spec)->size; i++) { err = kern_spec_to_ib_spec(kern_spec, ib_spec); if (err) goto err_free; flow_attr->size += ((union ib_flow_spec *) ib_spec)->size; - cmd.flow_attr.size -= ((struct ib_kern_spec *)kern_spec)->size; - kern_spec += ((struct ib_kern_spec *) kern_spec)->size; + cmd.flow_attr.size -= ((struct ib_uverbs_spec *)kern_spec)->size; + kern_spec += ((struct ib_uverbs_spec *) kern_spec)->size; ib_spec += ((union ib_flow_spec *) ib_spec)->size; } if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) { diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 99a58bdc1c64..96a660533236 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -701,61 +701,61 @@ struct ib_uverbs_detach_mcast { }; #ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING -struct ib_kern_eth_filter { +struct ib_uverbs_eth_filter { __u8 dst_mac[6]; __u8 src_mac[6]; __be16 ether_type; __be16 vlan_tag; }; -struct ib_kern_spec_eth { +struct ib_uverbs_spec_eth { __u32 type; __u16 size; __u16 reserved; - struct ib_kern_eth_filter val; - struct ib_kern_eth_filter mask; + struct ib_uverbs_eth_filter val; + struct ib_uverbs_eth_filter mask; }; -struct ib_kern_ipv4_filter { +struct ib_uverbs_ipv4_filter { __be32 src_ip; __be32 dst_ip; }; -struct ib_kern_spec_ipv4 { +struct ib_uverbs_spec_ipv4 { __u32 type; __u16 size; __u16 reserved; - struct ib_kern_ipv4_filter val; - struct ib_kern_ipv4_filter mask; + struct ib_uverbs_ipv4_filter val; + struct ib_uverbs_ipv4_filter mask; }; -struct ib_kern_tcp_udp_filter { +struct ib_uverbs_tcp_udp_filter { __be16 dst_port; __be16 src_port; }; -struct ib_kern_spec_tcp_udp { +struct ib_uverbs_spec_tcp_udp { __u32 type; __u16 size; __u16 reserved; - struct ib_kern_tcp_udp_filter val; - struct ib_kern_tcp_udp_filter mask; + struct ib_uverbs_tcp_udp_filter val; + struct ib_uverbs_tcp_udp_filter mask; }; -struct ib_kern_spec { +struct ib_uverbs_spec { union { struct { __u32 type; __u16 size; __u16 reserved; }; - struct ib_kern_spec_eth eth; - struct ib_kern_spec_ipv4 ipv4; - struct ib_kern_spec_tcp_udp tcp_udp; + struct ib_uverbs_spec_eth eth; + struct ib_uverbs_spec_ipv4 ipv4; + struct ib_uverbs_spec_tcp_udp tcp_udp; }; }; -struct ib_kern_flow_attr { +struct ib_uverbs_flow_attr { __u32 type; __u16 size; __u16 priority; @@ -774,7 +774,7 @@ struct ib_uverbs_create_flow { __u32 reserved; __u64 response; __u32 qp_handle; - struct ib_kern_flow_attr flow_attr; + struct ib_uverbs_flow_attr flow_attr; }; struct ib_uverbs_create_flow_resp { From b68c956021386eead6b8b28e445f33c8c985d7d2 Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Wed, 6 Nov 2013 23:21:46 +0100 Subject: [PATCH 58/62] IB/core: Make uverbs flow structure use names like verbs ones This patch adds "flow" prefix to most of data structure added as part of commit 436f2ad05a0b ("IB/core: Export ib_create/destroy_flow through uverbs") to keep those names in sync with the data structures added in commit 319a441d1361 ("IB/core: Add receive flow steering support"). It's just a matter of translating 'ib_flow' to 'ib_uverbs_flow'. Signed-off-by: Yann Droneaud Link: http://marc.info/?i=cover.1383773832.git.ydroneaud@opteya.com Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs_cmd.c | 12 +++++------ include/uapi/rdma/ib_user_verbs.h | 32 ++++++++++++++-------------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index f83c1dc08481..19f14d82c988 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2602,7 +2602,7 @@ out_put: } #ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING -static int kern_spec_to_ib_spec(struct ib_uverbs_spec *kern_spec, +static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, union ib_flow_spec *ib_spec) { ib_spec->type = kern_spec->type; @@ -2676,7 +2676,7 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, if (cmd.flow_attr.size > (in_len - sizeof(cmd)) || cmd.flow_attr.size > - (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_spec))) + (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec))) return -EINVAL; if (cmd.flow_attr.num_of_specs) { @@ -2725,16 +2725,16 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, kern_spec = kern_flow_attr + 1; ib_spec = flow_attr + 1; for (i = 0; i < flow_attr->num_of_specs && - cmd.flow_attr.size > offsetof(struct ib_uverbs_spec, reserved) && + cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec, reserved) && cmd.flow_attr.size >= - ((struct ib_uverbs_spec *)kern_spec)->size; i++) { + ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) { err = kern_spec_to_ib_spec(kern_spec, ib_spec); if (err) goto err_free; flow_attr->size += ((union ib_flow_spec *) ib_spec)->size; - cmd.flow_attr.size -= ((struct ib_uverbs_spec *)kern_spec)->size; - kern_spec += ((struct ib_uverbs_spec *) kern_spec)->size; + cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec *)kern_spec)->size; + kern_spec += ((struct ib_uverbs_flow_spec *) kern_spec)->size; ib_spec += ((union ib_flow_spec *) ib_spec)->size; } if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) { diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 96a660533236..ef2be64bf4c3 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -701,57 +701,57 @@ struct ib_uverbs_detach_mcast { }; #ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING -struct ib_uverbs_eth_filter { +struct ib_uverbs_flow_eth_filter { __u8 dst_mac[6]; __u8 src_mac[6]; __be16 ether_type; __be16 vlan_tag; }; -struct ib_uverbs_spec_eth { +struct ib_uverbs_flow_spec_eth { __u32 type; __u16 size; __u16 reserved; - struct ib_uverbs_eth_filter val; - struct ib_uverbs_eth_filter mask; + struct ib_uverbs_flow_eth_filter val; + struct ib_uverbs_flow_eth_filter mask; }; -struct ib_uverbs_ipv4_filter { +struct ib_uverbs_flow_ipv4_filter { __be32 src_ip; __be32 dst_ip; }; -struct ib_uverbs_spec_ipv4 { +struct ib_uverbs_flow_spec_ipv4 { __u32 type; __u16 size; __u16 reserved; - struct ib_uverbs_ipv4_filter val; - struct ib_uverbs_ipv4_filter mask; + struct ib_uverbs_flow_ipv4_filter val; + struct ib_uverbs_flow_ipv4_filter mask; }; -struct ib_uverbs_tcp_udp_filter { +struct ib_uverbs_flow_tcp_udp_filter { __be16 dst_port; __be16 src_port; }; -struct ib_uverbs_spec_tcp_udp { +struct ib_uverbs_flow_spec_tcp_udp { __u32 type; __u16 size; __u16 reserved; - struct ib_uverbs_tcp_udp_filter val; - struct ib_uverbs_tcp_udp_filter mask; + struct ib_uverbs_flow_tcp_udp_filter val; + struct ib_uverbs_flow_tcp_udp_filter mask; }; -struct ib_uverbs_spec { +struct ib_uverbs_flow_spec { union { struct { __u32 type; __u16 size; __u16 reserved; }; - struct ib_uverbs_spec_eth eth; - struct ib_uverbs_spec_ipv4 ipv4; - struct ib_uverbs_spec_tcp_udp tcp_udp; + struct ib_uverbs_flow_spec_eth eth; + struct ib_uverbs_flow_spec_ipv4 ipv4; + struct ib_uverbs_flow_spec_tcp_udp tcp_udp; }; }; From 58913efba9c3aa7992f2a4d630135ded833d988e Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Wed, 6 Nov 2013 23:21:47 +0100 Subject: [PATCH 59/62] IB/core: Use a common header for uverbs flow_specs A common header will allows better checking of flow specs size, while ensuring strict alignment to 64 bits. Signed-off-by: Yann Droneaud Link: http://marc.info/?i=cover.1383773832.git.ydroneaud@opteya.com Signed-off-by: Roland Dreier --- include/uapi/rdma/ib_user_verbs.h | 53 +++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index ef2be64bf4c3..43014981550a 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -701,6 +701,14 @@ struct ib_uverbs_detach_mcast { }; #ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING +struct ib_uverbs_flow_spec_hdr { + __u32 type; + __u16 size; + __u16 reserved; + /* followed by flow_spec */ + __u64 flow_spec_data[0]; +}; + struct ib_uverbs_flow_eth_filter { __u8 dst_mac[6]; __u8 src_mac[6]; @@ -709,9 +717,14 @@ struct ib_uverbs_flow_eth_filter { }; struct ib_uverbs_flow_spec_eth { - __u32 type; - __u16 size; - __u16 reserved; + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; struct ib_uverbs_flow_eth_filter val; struct ib_uverbs_flow_eth_filter mask; }; @@ -722,9 +735,14 @@ struct ib_uverbs_flow_ipv4_filter { }; struct ib_uverbs_flow_spec_ipv4 { - __u32 type; - __u16 size; - __u16 reserved; + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; struct ib_uverbs_flow_ipv4_filter val; struct ib_uverbs_flow_ipv4_filter mask; }; @@ -735,19 +753,27 @@ struct ib_uverbs_flow_tcp_udp_filter { }; struct ib_uverbs_flow_spec_tcp_udp { - __u32 type; - __u16 size; - __u16 reserved; + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; struct ib_uverbs_flow_tcp_udp_filter val; struct ib_uverbs_flow_tcp_udp_filter mask; }; struct ib_uverbs_flow_spec { union { - struct { - __u32 type; - __u16 size; - __u16 reserved; + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; }; struct ib_uverbs_flow_spec_eth eth; struct ib_uverbs_flow_spec_ipv4 ipv4; @@ -767,6 +793,7 @@ struct ib_uverbs_flow_attr { * struct ib_flow_spec_xxx * struct ib_flow_spec_yyy */ + struct ib_uverbs_flow_spec_hdr flow_specs[0]; }; struct ib_uverbs_create_flow { From 2490f20be496c2da14ae4632a8c60e0633e97fd0 Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Wed, 6 Nov 2013 23:21:48 +0100 Subject: [PATCH 60/62] IB/core: Remove ib_uverbs_flow_spec structure from userspace The structure holding any types of flow_spec is of no use to userspace. It would be wrong for userspace to do: struct ib_uverbs_flow_spec flow_spec; flow_spec.type = IB_FLOW_SPEC_TCP; flow_spec.size = sizeof(flow_spec); Instead, userspace should use the dedicated flow_spec structure for - Ethernet : struct ib_uverbs_flow_spec_eth, - IPv4 : struct ib_uverbs_flow_spec_ipv4, - TCP/UDP : struct ib_uverbs_flow_spec_tcp_udp. In other words, struct ib_uverbs_flow_spec is a "virtual" data structure that can only be use by the kernel as an alias to the other. Signed-off-by: Yann Droneaud Link: http://marc.info/?i=cover.1383773832.git.ydroneaud@opteya.com Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs.h | 16 ++++++++++++++++ include/uapi/rdma/ib_user_verbs.h | 16 ---------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index d8f9c6c272d7..777954f67270 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -178,6 +178,22 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event); void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd); +struct ib_uverbs_flow_spec { + union { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; + struct ib_uverbs_flow_spec_eth eth; + struct ib_uverbs_flow_spec_ipv4 ipv4; + struct ib_uverbs_flow_spec_tcp_udp tcp_udp; + }; +}; + #define IB_UVERBS_DECLARE_CMD(name) \ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \ const char __user *buf, int in_len, \ diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 43014981550a..fc9bbe37cfce 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -765,22 +765,6 @@ struct ib_uverbs_flow_spec_tcp_udp { struct ib_uverbs_flow_tcp_udp_filter mask; }; -struct ib_uverbs_flow_spec { - union { - union { - struct ib_uverbs_flow_spec_hdr hdr; - struct { - __u32 type; - __u16 size; - __u16 reserved; - }; - }; - struct ib_uverbs_flow_spec_eth eth; - struct ib_uverbs_flow_spec_ipv4 ipv4; - struct ib_uverbs_flow_spec_tcp_udp tcp_udp; - }; -}; - struct ib_uverbs_flow_attr { __u32 type; __u16 size; From f21519b23c1b6fa25366be4114ccf7fcf1c190f9 Mon Sep 17 00:00:00 2001 From: Yann Droneaud Date: Wed, 6 Nov 2013 23:21:49 +0100 Subject: [PATCH 61/62] IB/core: extended command: an improved infrastructure for uverbs commands Commit 400dbc96583f ("IB/core: Infrastructure for extensible uverbs commands") added an infrastructure for extensible uverbs commands while later commit 436f2ad05a0b ("IB/core: Export ib_create/destroy_flow through uverbs") exported ib_create_flow()/ib_destroy_flow() functions using this new infrastructure. According to the commit 400dbc96583f, the purpose of this infrastructure is to support passing around provider (eg. hardware) specific buffers when userspace issue commands to the kernel, so that it would be possible to extend uverbs (eg. core) buffers independently from the provider buffers. But the new kernel command function prototypes were not modified to take advantage of this extension. This issue was exposed by Roland Dreier in a previous review[1]. So the following patch is an attempt to a revised extensible command infrastructure. This improved extensible command infrastructure distinguish between core (eg. legacy)'s command/response buffers from provider (eg. hardware)'s command/response buffers: each extended command implementing function is given a struct ib_udata to hold core (eg. uverbs) input and output buffers, and another struct ib_udata to hold the hw (eg. provider) input and output buffers. Having those buffers identified separately make it easier to increase one buffer to support extension without having to add some code to guess the exact size of each command/response parts: This should make the extended functions more reliable. Additionally, instead of relying on command identifier being greater than IB_USER_VERBS_CMD_THRESHOLD, the proposed infrastructure rely on unused bits in command field: on the 32 bits provided by command field, only 6 bits are really needed to encode the identifier of commands currently supported by the kernel. (Even using only 6 bits leaves room for about 23 new commands). So this patch makes use of some high order bits in command field to store flags, leaving enough room for more command identifiers than one will ever need (eg. 256). The new flags are used to specify if the command should be processed as an extended one or a legacy one. While designing the new command format, care was taken to make usage of flags itself extensible. Using high order bits of the commands field ensure that newer libibverbs on older kernel will properly fail when trying to call extended commands. On the other hand, older libibverbs on newer kernel will never be able to issue calls to extended commands. The extended command header includes the optional response pointer so that output buffer length and output buffer pointer are located together in the command, allowing proper parameters checking. This should make implementing functions easier and safer. Additionally the extended header ensure 64bits alignment, while making all sizes multiple of 8 bytes, extending the maximum buffer size: legacy extended Maximum command buffer: 256KBytes 1024KBytes (512KBytes + 512KBytes) Maximum response buffer: 256KBytes 1024KBytes (512KBytes + 512KBytes) For the purpose of doing proper buffer size accounting, the headers size are no more taken in account in "in_words". One of the odds of the current extensible infrastructure, reading twice the "legacy" command header, is fixed by removing the "legacy" command header from the extended command header: they are processed as two different parts of the command: memory is read once and information are not duplicated: it's making clear that's an extended command scheme and not a different command scheme. The proposed scheme will format input (command) and output (response) buffers this way: - command: legacy header + extended header + command data (core + hw): +----------------------------------------+ | flags | 00 00 | command | | in_words | out_words | +----------------------------------------+ | response | | response | | provider_in_words | provider_out_words | | padding | +----------------------------------------+ | | . . . (in_words * 8) . | | +----------------------------------------+ | | . . . (provider_in_words * 8) . | | +----------------------------------------+ - response, if present: +----------------------------------------+ | | . . . (out_words * 8) . | | +----------------------------------------+ | | . . . (provider_out_words * 8) . | | +----------------------------------------+ The overall design is to ensure that the extensible infrastructure is itself extensible while begin more reliable with more input and bound checking. Note: The unused field in the extended header would be perfect candidate to hold the command "comp_mask" (eg. bit field used to handle compatibility). This was suggested by Roland Dreier in a previous review[2]. But "comp_mask" field is likely to be present in the uverb input and/or provider input, likewise for the response, as noted by Matan Barak[3], so it doesn't make sense to put "comp_mask" in the header. [1]: http://marc.info/?i=CAL1RGDWxmM17W2o_era24A-TTDeKyoL6u3NRu_=t_dhV_ZA9MA@mail.gmail.com [2]: http://marc.info/?i=CAL1RGDXJtrc849M6_XNZT5xO1+ybKtLWGq6yg6LhoSsKpsmkYA@mail.gmail.com [3]: http://marc.info/?i=525C1149.6000701@mellanox.com Signed-off-by: Yann Droneaud Link: http://marc.info/?i=cover.1383773832.git.ydroneaud@opteya.com [ Convert "ret ? ret : 0" to the equivalent "ret". - Roland ] Signed-off-by: Roland Dreier --- drivers/infiniband/core/uverbs.h | 20 +++- drivers/infiniband/core/uverbs_cmd.c | 56 +++++------ drivers/infiniband/core/uverbs_main.c | 135 +++++++++++++++++++------- drivers/infiniband/hw/mlx4/main.c | 6 +- include/rdma/ib_verbs.h | 1 + include/uapi/rdma/ib_user_verbs.h | 25 +++-- 6 files changed, 165 insertions(+), 78 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 777954f67270..24adccbf92f9 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -47,6 +47,14 @@ #include #include +#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ + do { \ + (udata)->inbuf = (void __user *) (ibuf); \ + (udata)->outbuf = (void __user *) (obuf); \ + (udata)->inlen = (ilen); \ + (udata)->outlen = (olen); \ + } while (0) + /* * Our lifetime rules for these structs are the following: * @@ -233,9 +241,17 @@ IB_UVERBS_DECLARE_CMD(destroy_srq); IB_UVERBS_DECLARE_CMD(create_xsrq); IB_UVERBS_DECLARE_CMD(open_xrcd); IB_UVERBS_DECLARE_CMD(close_xrcd); + #ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING -IB_UVERBS_DECLARE_CMD(create_flow); -IB_UVERBS_DECLARE_CMD(destroy_flow); + +#define IB_UVERBS_DECLARE_EX_CMD(name) \ + int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \ + struct ib_udata *ucore, \ + struct ib_udata *uhw) + +IB_UVERBS_DECLARE_EX_CMD(create_flow); +IB_UVERBS_DECLARE_EX_CMD(destroy_flow); + #endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 19f14d82c988..33a52785f812 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -58,14 +58,6 @@ static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" }; static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; #endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ -#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \ - do { \ - (udata)->inbuf = (void __user *) (ibuf); \ - (udata)->outbuf = (void __user *) (obuf); \ - (udata)->inlen = (ilen); \ - (udata)->outlen = (olen); \ - } while (0) - /* * The ib_uobject locking scheme is as follows: * @@ -2642,9 +2634,9 @@ static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, return 0; } -ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) +int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) { struct ib_uverbs_create_flow cmd; struct ib_uverbs_create_flow_resp resp; @@ -2658,11 +2650,15 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, void *ib_spec; int i; - if (out_len < sizeof(resp)) + if (ucore->outlen < sizeof(resp)) return -ENOSPC; - if (copy_from_user(&cmd, buf, sizeof(cmd))) - return -EFAULT; + err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (err) + return err; + + ucore->inbuf += sizeof(cmd); + ucore->inlen -= sizeof(cmd); if (cmd.comp_mask) return -EINVAL; @@ -2674,7 +2670,7 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS) return -EINVAL; - if (cmd.flow_attr.size > (in_len - sizeof(cmd)) || + if (cmd.flow_attr.size > ucore->inlen || cmd.flow_attr.size > (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec))) return -EINVAL; @@ -2686,11 +2682,10 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, return -ENOMEM; memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr)); - if (copy_from_user(kern_flow_attr + 1, buf + sizeof(cmd), - cmd.flow_attr.size)) { - err = -EFAULT; + err = ib_copy_from_udata(kern_flow_attr + 1, ucore, + cmd.flow_attr.size); + if (err) goto err_free_attr; - } } else { kern_flow_attr = &cmd.flow_attr; } @@ -2758,11 +2753,10 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, memset(&resp, 0, sizeof(resp)); resp.flow_handle = uobj->id; - if (copy_to_user((void __user *)(unsigned long) cmd.response, - &resp, sizeof(resp))) { - err = -EFAULT; + err = ib_copy_to_udata(ucore, + &resp, sizeof(resp)); + if (err) goto err_copy; - } put_qp_read(qp); mutex_lock(&file->mutex); @@ -2775,7 +2769,7 @@ ssize_t ib_uverbs_create_flow(struct ib_uverbs_file *file, kfree(flow_attr); if (cmd.flow_attr.num_of_specs) kfree(kern_flow_attr); - return in_len; + return 0; err_copy: idr_remove_uobj(&ib_uverbs_rule_idr, uobj); destroy_flow: @@ -2792,16 +2786,18 @@ err_free_attr: return err; } -ssize_t ib_uverbs_destroy_flow(struct ib_uverbs_file *file, - const char __user *buf, int in_len, - int out_len) { +int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) +{ struct ib_uverbs_destroy_flow cmd; struct ib_flow *flow_id; struct ib_uobject *uobj; int ret; - if (copy_from_user(&cmd, buf, sizeof(cmd))) - return -EFAULT; + ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd)); + if (ret) + return ret; uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle, file->ucontext); @@ -2823,7 +2819,7 @@ ssize_t ib_uverbs_destroy_flow(struct ib_uverbs_file *file, put_uobj(uobj); - return ret ? ret : in_len; + return ret; } #endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 2df31f68ea09..189d99e76d9f 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -115,12 +115,17 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd, [IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq, [IB_USER_VERBS_CMD_OPEN_QP] = ib_uverbs_open_qp, -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING - [IB_USER_VERBS_CMD_CREATE_FLOW] = ib_uverbs_create_flow, - [IB_USER_VERBS_CMD_DESTROY_FLOW] = ib_uverbs_destroy_flow -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ }; +#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING +static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, + struct ib_udata *ucore, + struct ib_udata *uhw) = { + [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, + [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow +}; +#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ + static void ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device); @@ -589,6 +594,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, { struct ib_uverbs_file *file = filp->private_data; struct ib_uverbs_cmd_hdr hdr; + __u32 flags; if (count < sizeof hdr) return -EINVAL; @@ -596,45 +602,108 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, if (copy_from_user(&hdr, buf, sizeof hdr)) return -EFAULT; - if (hdr.command >= ARRAY_SIZE(uverbs_cmd_table) || - !uverbs_cmd_table[hdr.command]) - return -EINVAL; + flags = (hdr.command & + IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT; - if (!file->ucontext && - hdr.command != IB_USER_VERBS_CMD_GET_CONTEXT) - return -EINVAL; + if (!flags) { + __u32 command; - if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << hdr.command))) - return -ENOSYS; - -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING - if (hdr.command >= IB_USER_VERBS_CMD_THRESHOLD) { - struct ib_uverbs_cmd_hdr_ex hdr_ex; - - if (copy_from_user(&hdr_ex, buf, sizeof(hdr_ex))) - return -EFAULT; - - if (((hdr_ex.in_words + hdr_ex.provider_in_words) * 4) != count) + if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | + IB_USER_VERBS_CMD_COMMAND_MASK)) return -EINVAL; - return uverbs_cmd_table[hdr.command](file, - buf + sizeof(hdr_ex), - (hdr_ex.in_words + - hdr_ex.provider_in_words) * 4, - (hdr_ex.out_words + - hdr_ex.provider_out_words) * 4); - } else { -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ + command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; + + if (command >= ARRAY_SIZE(uverbs_cmd_table) || + !uverbs_cmd_table[command]) + return -EINVAL; + + if (!file->ucontext && + command != IB_USER_VERBS_CMD_GET_CONTEXT) + return -EINVAL; + + if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command))) + return -ENOSYS; + if (hdr.in_words * 4 != count) return -EINVAL; - return uverbs_cmd_table[hdr.command](file, - buf + sizeof(hdr), - hdr.in_words * 4, - hdr.out_words * 4); + return uverbs_cmd_table[command](file, + buf + sizeof(hdr), + hdr.in_words * 4, + hdr.out_words * 4); + #ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING + + } else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) { + __u32 command; + + struct ib_uverbs_ex_cmd_hdr ex_hdr; + struct ib_udata ucore; + struct ib_udata uhw; + int err; + size_t written_count = count; + + if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK | + IB_USER_VERBS_CMD_COMMAND_MASK)) + return -EINVAL; + + command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK; + + if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) || + !uverbs_ex_cmd_table[command]) + return -ENOSYS; + + if (!file->ucontext) + return -EINVAL; + + if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command))) + return -ENOSYS; + + if (count < (sizeof(hdr) + sizeof(ex_hdr))) + return -EINVAL; + + if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr))) + return -EFAULT; + + count -= sizeof(hdr) + sizeof(ex_hdr); + buf += sizeof(hdr) + sizeof(ex_hdr); + + if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count) + return -EINVAL; + + if (ex_hdr.response) { + if (!hdr.out_words && !ex_hdr.provider_out_words) + return -EINVAL; + } else { + if (hdr.out_words || ex_hdr.provider_out_words) + return -EINVAL; + } + + INIT_UDATA(&ucore, + (hdr.in_words) ? buf : 0, + (unsigned long)ex_hdr.response, + hdr.in_words * 8, + hdr.out_words * 8); + + INIT_UDATA(&uhw, + (ex_hdr.provider_in_words) ? buf + ucore.inlen : 0, + (ex_hdr.provider_out_words) ? (unsigned long)ex_hdr.response + ucore.outlen : 0, + ex_hdr.provider_in_words * 8, + ex_hdr.provider_out_words * 8); + + err = uverbs_ex_cmd_table[command](file, + &ucore, + &uhw); + + if (err) + return err; + + return written_count; } #endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ + + return -ENOSYS; } static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index f0612645de99..4be29fef66d6 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1692,9 +1692,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow; #ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING - ibdev->ib_dev.uverbs_cmd_mask |= - (1ull << IB_USER_VERBS_CMD_CREATE_FLOW) | - (1ull << IB_USER_VERBS_CMD_DESTROY_FLOW); + ibdev->ib_dev.uverbs_ex_cmd_mask |= + (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | + (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); #endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e393171e2fac..a06fc122f803 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1436,6 +1436,7 @@ struct ib_device { int uverbs_abi_ver; u64 uverbs_cmd_mask; + u64 uverbs_ex_cmd_mask; char node_desc[64]; __be64 node_guid; diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index fc9bbe37cfce..6ace125e1af6 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -87,12 +87,15 @@ enum { IB_USER_VERBS_CMD_CLOSE_XRCD, IB_USER_VERBS_CMD_CREATE_XSRQ, IB_USER_VERBS_CMD_OPEN_QP, -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING - IB_USER_VERBS_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, - IB_USER_VERBS_CMD_DESTROY_FLOW -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ }; +#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING +enum { + IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, + IB_USER_VERBS_EX_CMD_DESTROY_FLOW +}; +#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ + /* * Make sure that all structs defined in this file remain laid out so * that they pack the same way on 32-bit and 64-bit architectures (to @@ -122,6 +125,12 @@ struct ib_uverbs_comp_event_desc { * the rest of the command struct based on these value. */ +#define IB_USER_VERBS_CMD_COMMAND_MASK 0xff +#define IB_USER_VERBS_CMD_FLAGS_MASK 0xff000000u +#define IB_USER_VERBS_CMD_FLAGS_SHIFT 24 + +#define IB_USER_VERBS_CMD_FLAG_EXTENDED 0x80 + struct ib_uverbs_cmd_hdr { __u32 command; __u16 in_words; @@ -129,10 +138,8 @@ struct ib_uverbs_cmd_hdr { }; #ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING -struct ib_uverbs_cmd_hdr_ex { - __u32 command; - __u16 in_words; - __u16 out_words; +struct ib_uverbs_ex_cmd_hdr { + __u64 response; __u16 provider_in_words; __u16 provider_out_words; __u32 cmd_hdr_reserved; @@ -782,8 +789,6 @@ struct ib_uverbs_flow_attr { struct ib_uverbs_create_flow { __u32 comp_mask; - __u32 reserved; - __u64 response; __u32 qp_handle; struct ib_uverbs_flow_attr flow_attr; }; From 69ad5da41b4ed94aef31d4111a3442cfd73ce570 Mon Sep 17 00:00:00 2001 From: Matan Barak Date: Wed, 6 Nov 2013 23:21:50 +0100 Subject: [PATCH 62/62] IB/core: Re-enable create_flow/destroy_flow uverbs This commit reverts commit 7afbddfae993 ("IB/core: Temporarily disable create_flow/destroy_flow uverbs"). Since the uverbs extensions functionality was experimental for v3.12, this patch re-enables the support for them and flow-steering for v3.13. Signed-off-by: Matan Barak Signed-off-by: Roland Dreier --- drivers/infiniband/Kconfig | 11 ----------- drivers/infiniband/core/uverbs.h | 4 ---- drivers/infiniband/core/uverbs_cmd.c | 4 ---- drivers/infiniband/core/uverbs_main.c | 5 ----- drivers/infiniband/hw/mlx4/main.c | 2 -- include/uapi/rdma/ib_user_verbs.h | 6 ------ 6 files changed, 32 deletions(-) diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index b84791f03a27..5ceda710f516 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -31,17 +31,6 @@ config INFINIBAND_USER_ACCESS libibverbs, libibcm and a hardware driver library from . -config INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING - bool "Experimental and unstable ABI for userspace access to flow steering verbs" - depends on INFINIBAND_USER_ACCESS - depends on STAGING - ---help--- - The final ABI for userspace access to flow steering verbs - has not been defined. To use the current ABI, *WHICH WILL - CHANGE IN THE FUTURE*, say Y here. - - If unsure, say N. - config INFINIBAND_USER_MEM bool depends on INFINIBAND_USER_ACCESS != n diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 24adccbf92f9..bdc842e9faef 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -242,8 +242,6 @@ IB_UVERBS_DECLARE_CMD(create_xsrq); IB_UVERBS_DECLARE_CMD(open_xrcd); IB_UVERBS_DECLARE_CMD(close_xrcd); -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING - #define IB_UVERBS_DECLARE_EX_CMD(name) \ int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \ struct ib_udata *ucore, \ @@ -252,6 +250,4 @@ IB_UVERBS_DECLARE_CMD(close_xrcd); IB_UVERBS_DECLARE_EX_CMD(create_flow); IB_UVERBS_DECLARE_EX_CMD(destroy_flow); -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ - #endif /* UVERBS_H */ diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 33a52785f812..8233744b4140 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -54,9 +54,7 @@ static struct uverbs_lock_class qp_lock_class = { .name = "QP-uobj" }; static struct uverbs_lock_class ah_lock_class = { .name = "AH-uobj" }; static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" }; static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" }; -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" }; -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ /* * The ib_uobject locking scheme is as follows: @@ -2593,7 +2591,6 @@ out_put: return ret ? ret : in_len; } -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec, union ib_flow_spec *ib_spec) { @@ -2821,7 +2818,6 @@ int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file, return ret; } -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ static int __uverbs_create_xsrq(struct ib_uverbs_file *file, struct ib_uverbs_create_xsrq *cmd, diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 189d99e76d9f..34386943ebcf 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -117,14 +117,12 @@ static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file, [IB_USER_VERBS_CMD_OPEN_QP] = ib_uverbs_open_qp, }; -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, struct ib_udata *ucore, struct ib_udata *uhw) = { [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow, [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow }; -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ static void ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device); @@ -633,8 +631,6 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, hdr.in_words * 4, hdr.out_words * 4); -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING - } else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) { __u32 command; @@ -701,7 +697,6 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf, return written_count; } -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ return -ENOSYS; } diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 4be29fef66d6..1aad9b3e6bdd 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1691,11 +1691,9 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->ib_dev.create_flow = mlx4_ib_create_flow; ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow; -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING ibdev->ib_dev.uverbs_ex_cmd_mask |= (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) | (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW); -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ } mlx4_ib_alloc_eqs(dev, ibdev); diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 6ace125e1af6..cbfdd4ca9510 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -89,12 +89,10 @@ enum { IB_USER_VERBS_CMD_OPEN_QP, }; -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING enum { IB_USER_VERBS_EX_CMD_CREATE_FLOW = IB_USER_VERBS_CMD_THRESHOLD, IB_USER_VERBS_EX_CMD_DESTROY_FLOW }; -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ /* * Make sure that all structs defined in this file remain laid out so @@ -137,14 +135,12 @@ struct ib_uverbs_cmd_hdr { __u16 out_words; }; -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING struct ib_uverbs_ex_cmd_hdr { __u64 response; __u16 provider_in_words; __u16 provider_out_words; __u32 cmd_hdr_reserved; }; -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ struct ib_uverbs_get_context { __u64 response; @@ -707,7 +703,6 @@ struct ib_uverbs_detach_mcast { __u64 driver_data[0]; }; -#ifdef CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING struct ib_uverbs_flow_spec_hdr { __u32 type; __u16 size; @@ -802,7 +797,6 @@ struct ib_uverbs_destroy_flow { __u32 comp_mask; __u32 flow_handle; }; -#endif /* CONFIG_INFINIBAND_EXPERIMENTAL_UVERBS_FLOW_STEERING */ struct ib_uverbs_create_srq { __u64 response;