From e4103312d7b7afb8a3a7a842a33ef2b1856b2c0f Mon Sep 17 00:00:00 2001
From: Parav Pandit <parav@mellanox.com>
Date: Wed, 12 Feb 2020 09:26:29 +0200
Subject: [PATCH 01/12] Revert "RDMA/cma: Simplify rdma_resolve_addr() error
 flow"

This reverts commit 219d2e9dfda9431b808c28d5efc74b404b95b638.

The call chain below requires the cm_id_priv's destination address to be
setup before performing rdma_bind_addr(). Otherwise source port allocation
fails as cma_port_is_unique() no longer sees the correct tuple to allow
duplicate users of the source port.

rdma_resolve_addr()
  cma_bind_addr()
    rdma_bind_addr()
      cma_get_port()
        cma_alloc_any_port()
          cma_port_is_unique() <- compared with zero daddr

This can result in false failures to connect, particularly if the source
port range is restricted.

Fixes: 219d2e9dfda9 ("RDMA/cma: Simplify rdma_resolve_addr() error flow")
Link: https://lore.kernel.org/r/20200212072635.682689-4-leon@kernel.org
Signed-off-by: Parav Pandit <parav@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cma.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 72f032160c4b..2dec3a02ab9f 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -3212,19 +3212,26 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
 	int ret;
 
 	id_priv = container_of(id, struct rdma_id_private, id);
+	memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr));
 	if (id_priv->state == RDMA_CM_IDLE) {
 		ret = cma_bind_addr(id, src_addr, dst_addr);
-		if (ret)
+		if (ret) {
+			memset(cma_dst_addr(id_priv), 0,
+			       rdma_addr_size(dst_addr));
 			return ret;
+		}
 	}
 
-	if (cma_family(id_priv) != dst_addr->sa_family)
+	if (cma_family(id_priv) != dst_addr->sa_family) {
+		memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr));
 		return -EINVAL;
+	}
 
-	if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY))
+	if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY)) {
+		memset(cma_dst_addr(id_priv), 0, rdma_addr_size(dst_addr));
 		return -EINVAL;
+	}
 
-	memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr));
 	if (cma_any_addr(dst_addr)) {
 		ret = cma_resolve_loopback(id_priv);
 	} else {

From 4ca501d6aaf21de31541deac35128bbea8427aa6 Mon Sep 17 00:00:00 2001
From: Nathan Chancellor <natechancellor@gmail.com>
Date: Mon, 17 Feb 2020 13:43:18 -0700
Subject: [PATCH 02/12] RDMA/core: Fix use of logical OR in get_new_pps

Clang warns:

../drivers/infiniband/core/security.c:351:41: warning: converting the
enum constant to a boolean [-Wint-in-bool-context]
        if (!(qp_attr_mask & (IB_QP_PKEY_INDEX || IB_QP_PORT)) && qp_pps) {
                                               ^
1 warning generated.

A bitwise OR should have been used instead.

Fixes: 1dd017882e01 ("RDMA/core: Fix protection fault in get_pkey_idx_qp_list")
Link: https://lore.kernel.org/r/20200217204318.13609-1-natechancellor@gmail.com
Link: https://github.com/ClangBuiltLinux/linux/issues/889
Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Nathan Chancellor <natechancellor@gmail.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/security.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c
index 2b4d80393bd0..b9a36ea244d4 100644
--- a/drivers/infiniband/core/security.c
+++ b/drivers/infiniband/core/security.c
@@ -348,7 +348,7 @@ static struct ib_ports_pkeys *get_new_pps(const struct ib_qp *qp,
 	if ((qp_attr_mask & IB_QP_PKEY_INDEX) && (qp_attr_mask & IB_QP_PORT))
 		new_pps->main.state = IB_PORT_PKEY_VALID;
 
-	if (!(qp_attr_mask & (IB_QP_PKEY_INDEX || IB_QP_PORT)) && qp_pps) {
+	if (!(qp_attr_mask & (IB_QP_PKEY_INDEX | IB_QP_PORT)) && qp_pps) {
 		new_pps->main.port_num = qp_pps->main.port_num;
 		new_pps->main.pkey_index = qp_pps->main.pkey_index;
 		if (qp_pps->main.state != IB_PORT_PKEY_NOT_VALID)

From 6affca140cbea01f497c4f4e16f1e2be7f74bd04 Mon Sep 17 00:00:00 2001
From: Max Gurtovoy <maxg@mellanox.com>
Date: Thu, 20 Feb 2020 12:08:18 +0200
Subject: [PATCH 03/12] RDMA/rw: Fix error flow during RDMA context
 initialization

In case the SGL was mapped for P2P DMA operation, we must unmap it using
pci_p2pdma_unmap_sg during the error unwind of rdma_rw_ctx_init()

Fixes: 7f73eac3a713 ("PCI/P2PDMA: Introduce pci_p2pdma_unmap_sg()")
Link: https://lore.kernel.org/r/20200220100819.41860-1-maxg@mellanox.com
Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Logan Gunthorpe <logang@deltatee.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/rw.c | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 4fad732f9b3c..06e5b6787443 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -273,6 +273,23 @@ static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, struct ib_qp *qp,
 	return 1;
 }
 
+static void rdma_rw_unmap_sg(struct ib_device *dev, struct scatterlist *sg,
+			     u32 sg_cnt, enum dma_data_direction dir)
+{
+	if (is_pci_p2pdma_page(sg_page(sg)))
+		pci_p2pdma_unmap_sg(dev->dma_device, sg, sg_cnt, dir);
+	else
+		ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
+}
+
+static int rdma_rw_map_sg(struct ib_device *dev, struct scatterlist *sg,
+			  u32 sg_cnt, enum dma_data_direction dir)
+{
+	if (is_pci_p2pdma_page(sg_page(sg)))
+		return pci_p2pdma_map_sg(dev->dma_device, sg, sg_cnt, dir);
+	return ib_dma_map_sg(dev, sg, sg_cnt, dir);
+}
+
 /**
  * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
  * @ctx:	context to initialize
@@ -295,11 +312,7 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
 	struct ib_device *dev = qp->pd->device;
 	int ret;
 
-	if (is_pci_p2pdma_page(sg_page(sg)))
-		ret = pci_p2pdma_map_sg(dev->dma_device, sg, sg_cnt, dir);
-	else
-		ret = ib_dma_map_sg(dev, sg, sg_cnt, dir);
-
+	ret = rdma_rw_map_sg(dev, sg, sg_cnt, dir);
 	if (!ret)
 		return -ENOMEM;
 	sg_cnt = ret;
@@ -338,7 +351,7 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
 	return ret;
 
 out_unmap_sg:
-	ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
+	rdma_rw_unmap_sg(dev, sg, sg_cnt, dir);
 	return ret;
 }
 EXPORT_SYMBOL(rdma_rw_ctx_init);
@@ -588,11 +601,7 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct ib_qp *qp, u8 port_num,
 		break;
 	}
 
-	if (is_pci_p2pdma_page(sg_page(sg)))
-		pci_p2pdma_unmap_sg(qp->pd->device->dma_device, sg,
-				    sg_cnt, dir);
-	else
-		ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+	rdma_rw_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
 }
 EXPORT_SYMBOL(rdma_rw_ctx_destroy);
 

From c14dfddbd869bf0c2bafb7ef260c41d9cebbcfec Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Fri, 21 Feb 2020 15:20:26 +0000
Subject: [PATCH 04/12] RMDA/cm: Fix missing ib_cm_destroy_id() in
 ib_cm_insert_listen()

The algorithm pre-allocates a cm_id since allocation cannot be done while
holding the cm.lock spinlock, however it doesn't free it on one error
path, leading to a memory leak.

Fixes: 067b171b8679 ("IB/cm: Share listening CM IDs")
Link: https://lore.kernel.org/r/20200221152023.GA8680@ziepe.ca
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/cm.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
index 68cc1b2d6824..15e99a888427 100644
--- a/drivers/infiniband/core/cm.c
+++ b/drivers/infiniband/core/cm.c
@@ -1191,6 +1191,7 @@ struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device,
 			/* Sharing an ib_cm_id with different handlers is not
 			 * supported */
 			spin_unlock_irqrestore(&cm.lock, flags);
+			ib_destroy_cm_id(cm_id);
 			return ERR_PTR(-EINVAL);
 		}
 		refcount_inc(&cm_id_priv->refcount);

From 801b67f3eaafd3f2ec8b65d93142d4ffedba85df Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@mellanox.com>
Date: Thu, 27 Feb 2020 14:57:28 +0200
Subject: [PATCH 05/12] RDMA/core: Fix pkey and port assignment in get_new_pps

When port is part of the modify mask, then we should take it from the
qp_attr and not from the old pps. Same for PKEY. Otherwise there are
panics in some configurations:

  RIP: 0010:get_pkey_idx_qp_list+0x50/0x80 [ib_core]
  Code: c7 18 e8 13 04 30 ef 0f b6 43 06 48 69 c0 b8 00 00 00 48 03 85 a0 04 00 00 48 8b 50 20 48 8d 48 20 48 39 ca 74 1a 0f b7 73 04 <66> 39 72 10 75 08 eb 10 66 39 72 10 74 0a 48 8b 12 48 39 ca 75 f2
  RSP: 0018:ffffafb3480932f0 EFLAGS: 00010203
  RAX: ffff98059ababa10 RBX: ffff980d926e8cc0 RCX: ffff98059ababa30
  RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff98059ababa28
  RBP: ffff98059b940000 R08: 00000000000310c0 R09: ffff97fe47c07480
  R10: 0000000000000036 R11: 0000000000000200 R12: 0000000000000071
  R13: ffff98059b940000 R14: ffff980d87f948a0 R15: 0000000000000000
  FS:  00007f88deb31740(0000) GS:ffff98059f600000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000000000000010 CR3: 0000000853e26001 CR4: 00000000001606e0
  Call Trace:
   port_pkey_list_insert+0x3d/0x1b0 [ib_core]
   ? kmem_cache_alloc_trace+0x215/0x220
   ib_security_modify_qp+0x226/0x3a0 [ib_core]
   _ib_modify_qp+0xcf/0x390 [ib_core]
   ipoib_init_qp+0x7f/0x200 [ib_ipoib]
   ? rvt_modify_port+0xd0/0xd0 [rdmavt]
   ? ib_find_pkey+0x99/0xf0 [ib_core]
   ipoib_ib_dev_open_default+0x1a/0x200 [ib_ipoib]
   ipoib_ib_dev_open+0x96/0x130 [ib_ipoib]
   ipoib_open+0x44/0x130 [ib_ipoib]
   __dev_open+0xd1/0x160
   __dev_change_flags+0x1ab/0x1f0
   dev_change_flags+0x23/0x60
   do_setlink+0x328/0xe30
   ? __nla_validate_parse+0x54/0x900
   __rtnl_newlink+0x54e/0x810
   ? __alloc_pages_nodemask+0x17d/0x320
   ? page_fault+0x30/0x50
   ? _cond_resched+0x15/0x30
   ? kmem_cache_alloc_trace+0x1c8/0x220
   rtnl_newlink+0x43/0x60
   rtnetlink_rcv_msg+0x28f/0x350
   ? kmem_cache_alloc+0x1fb/0x200
   ? _cond_resched+0x15/0x30
   ? __kmalloc_node_track_caller+0x24d/0x2d0
   ? rtnl_calcit.isra.31+0x120/0x120
   netlink_rcv_skb+0xcb/0x100
   netlink_unicast+0x1e0/0x340
   netlink_sendmsg+0x317/0x480
   ? __check_object_size+0x48/0x1d0
   sock_sendmsg+0x65/0x80
   ____sys_sendmsg+0x223/0x260
   ? copy_msghdr_from_user+0xdc/0x140
   ___sys_sendmsg+0x7c/0xc0
   ? skb_dequeue+0x57/0x70
   ? __inode_wait_for_writeback+0x75/0xe0
   ? fsnotify_grab_connector+0x45/0x80
   ? __dentry_kill+0x12c/0x180
   __sys_sendmsg+0x58/0xa0
   do_syscall_64+0x5b/0x200
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x7f88de467f10

Link: https://lore.kernel.org/r/20200227125728.100551-1-leon@kernel.org
Cc: <stable@vger.kernel.org>
Fixes: 1dd017882e01 ("RDMA/core: Fix protection fault in get_pkey_idx_qp_list")
Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Tested-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/security.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c
index b9a36ea244d4..2d5608315dc8 100644
--- a/drivers/infiniband/core/security.c
+++ b/drivers/infiniband/core/security.c
@@ -340,11 +340,15 @@ static struct ib_ports_pkeys *get_new_pps(const struct ib_qp *qp,
 		return NULL;
 
 	if (qp_attr_mask & IB_QP_PORT)
-		new_pps->main.port_num =
-			(qp_pps) ? qp_pps->main.port_num : qp_attr->port_num;
+		new_pps->main.port_num = qp_attr->port_num;
+	else if (qp_pps)
+		new_pps->main.port_num = qp_pps->main.port_num;
+
 	if (qp_attr_mask & IB_QP_PKEY_INDEX)
-		new_pps->main.pkey_index = (qp_pps) ? qp_pps->main.pkey_index :
-						      qp_attr->pkey_index;
+		new_pps->main.pkey_index = qp_attr->pkey_index;
+	else if (qp_pps)
+		new_pps->main.pkey_index = qp_pps->main.pkey_index;
+
 	if ((qp_attr_mask & IB_QP_PKEY_INDEX) && (qp_attr_mask & IB_QP_PORT))
 		new_pps->main.state = IB_PORT_PKEY_VALID;
 

From 817a68a6584aa08e323c64283fec5ded7be84759 Mon Sep 17 00:00:00 2001
From: Dennis Dalessandro <dennis.dalessandro@intel.com>
Date: Tue, 25 Feb 2020 14:54:45 -0500
Subject: [PATCH 06/12] IB/hfi1, qib: Ensure RCU is locked when accessing list

The packet handling function, specifically the iteration of the qp list
for mad packet processing misses locking RCU before running through the
list. Not only is this incorrect, but the list_for_each_entry_rcu() call
can not be called with a conditional check for lock dependency. Remedy
this by invoking the rcu lock and unlock around the critical section.

This brings MAD packet processing in line with what is done for non-MAD
packets.

Fixes: 7724105686e7 ("IB/hfi1: add driver files")
Link: https://lore.kernel.org/r/20200225195445.140896.41873.stgit@awfm-01.aw.intel.com
Reviewed-by: Mike Marciniszyn <mike.marciniszyn@intel.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/hfi1/verbs.c    | 4 +++-
 drivers/infiniband/hw/qib/qib_verbs.c | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c
index 089e201d7550..2f6323ad9c59 100644
--- a/drivers/infiniband/hw/hfi1/verbs.c
+++ b/drivers/infiniband/hw/hfi1/verbs.c
@@ -515,10 +515,11 @@ static inline void hfi1_handle_packet(struct hfi1_packet *packet,
 				       opa_get_lid(packet->dlid, 9B));
 		if (!mcast)
 			goto drop;
+		rcu_read_lock();
 		list_for_each_entry_rcu(p, &mcast->qp_list, list) {
 			packet->qp = p->qp;
 			if (hfi1_do_pkey_check(packet))
-				goto drop;
+				goto unlock_drop;
 			spin_lock_irqsave(&packet->qp->r_lock, flags);
 			packet_handler = qp_ok(packet);
 			if (likely(packet_handler))
@@ -527,6 +528,7 @@ static inline void hfi1_handle_packet(struct hfi1_packet *packet,
 				ibp->rvp.n_pkt_drops++;
 			spin_unlock_irqrestore(&packet->qp->r_lock, flags);
 		}
+		rcu_read_unlock();
 		/*
 		 * Notify rvt_multicast_detach() if it is waiting for us
 		 * to finish.
diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c
index 33778d451b82..5ef93f8f17a1 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.c
+++ b/drivers/infiniband/hw/qib/qib_verbs.c
@@ -329,8 +329,10 @@ void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen)
 		if (mcast == NULL)
 			goto drop;
 		this_cpu_inc(ibp->pmastats->n_multicast_rcv);
+		rcu_read_lock();
 		list_for_each_entry_rcu(p, &mcast->qp_list, list)
 			qib_qp_rcv(rcd, hdr, 1, data, tlen, p->qp);
+		rcu_read_unlock();
 		/*
 		 * Notify rvt_multicast_detach() if it is waiting for us
 		 * to finish.

From de5ed007a03d71daaa505f5daa4d3666530c7090 Mon Sep 17 00:00:00 2001
From: Artemy Kovalyov <artemyko@mellanox.com>
Date: Thu, 27 Feb 2020 13:39:18 +0200
Subject: [PATCH 07/12] IB/mlx5: Fix implicit ODP race

Following race may occur because of the call_srcu and the placement of
the synchronize_srcu vs the xa_erase.

CPU0				   CPU1

mlx5_ib_free_implicit_mr:	   destroy_unused_implicit_child_mr:
 xa_erase(odp_mkeys)
 synchronize_srcu()
				    xa_lock(implicit_children)
				    if (still in xarray)
				       atomic_inc()
				       call_srcu()
				    xa_unlock(implicit_children)
 xa_erase(implicit_children):
   xa_lock(implicit_children)
   __xa_erase()
   xa_unlock(implicit_children)

 flush_workqueue()
				   [..]
				    free_implicit_child_mr_rcu:
				     (via call_srcu)
				      queue_work()

 WARN_ON(atomic_read())
				   [..]
				    free_implicit_child_mr_work:
				     (via wq)
				      free_implicit_child_mr()
 mlx5_mr_cache_invalidate()
				     mlx5_ib_update_xlt() <-- UMR QP fail
				     atomic_dec()

The wait_event() solves the race because it blocks until
free_implicit_child_mr_work() completes.

Fixes: 5256edcb98a1 ("RDMA/mlx5: Rework implicit ODP destroy")
Link: https://lore.kernel.org/r/20200227113918.94432-1-leon@kernel.org
Signed-off-by: Artemy Kovalyov <artemyko@mellanox.com>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  1 +
 drivers/infiniband/hw/mlx5/odp.c     | 17 +++++++----------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index d9bffcc93587..bb78142bca5e 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -636,6 +636,7 @@ struct mlx5_ib_mr {
 
 	/* For ODP and implicit */
 	atomic_t		num_deferred_work;
+	wait_queue_head_t       q_deferred_work;
 	struct xarray		implicit_children;
 	union {
 		struct rcu_head rcu;
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
index 4216814ba871..bf50cd91f472 100644
--- a/drivers/infiniband/hw/mlx5/odp.c
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -235,7 +235,8 @@ static void free_implicit_child_mr(struct mlx5_ib_mr *mr, bool need_imr_xlt)
 	mr->parent = NULL;
 	mlx5_mr_cache_free(mr->dev, mr);
 	ib_umem_odp_release(odp);
-	atomic_dec(&imr->num_deferred_work);
+	if (atomic_dec_and_test(&imr->num_deferred_work))
+		wake_up(&imr->q_deferred_work);
 }
 
 static void free_implicit_child_mr_work(struct work_struct *work)
@@ -554,6 +555,7 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd,
 	imr->umem = &umem_odp->umem;
 	imr->is_odp_implicit = true;
 	atomic_set(&imr->num_deferred_work, 0);
+	init_waitqueue_head(&imr->q_deferred_work);
 	xa_init(&imr->implicit_children);
 
 	err = mlx5_ib_update_xlt(imr, 0,
@@ -611,10 +613,7 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr)
 	 * under xa_lock while the child is in the xarray. Thus at this point
 	 * it is only decreasing, and all work holding it is now on the wq.
 	 */
-	if (atomic_read(&imr->num_deferred_work)) {
-		flush_workqueue(system_unbound_wq);
-		WARN_ON(atomic_read(&imr->num_deferred_work));
-	}
+	wait_event(imr->q_deferred_work, !atomic_read(&imr->num_deferred_work));
 
 	/*
 	 * Fence the imr before we destroy the children. This allows us to
@@ -645,10 +644,7 @@ void mlx5_ib_fence_odp_mr(struct mlx5_ib_mr *mr)
 	/* Wait for all running page-fault handlers to finish. */
 	synchronize_srcu(&mr->dev->odp_srcu);
 
-	if (atomic_read(&mr->num_deferred_work)) {
-		flush_workqueue(system_unbound_wq);
-		WARN_ON(atomic_read(&mr->num_deferred_work));
-	}
+	wait_event(mr->q_deferred_work, !atomic_read(&mr->num_deferred_work));
 
 	dma_fence_odp_mr(mr);
 }
@@ -1720,7 +1716,8 @@ static void destroy_prefetch_work(struct prefetch_mr_work *work)
 	u32 i;
 
 	for (i = 0; i < work->num_sge; ++i)
-		atomic_dec(&work->frags[i].mr->num_deferred_work);
+		if (atomic_dec_and_test(&work->frags[i].mr->num_deferred_work))
+			wake_up(&work->frags[i].mr->q_deferred_work);
 	kvfree(work);
 }
 

From e38b55ea0443da35a50a3eb2079ad3612cf763b9 Mon Sep 17 00:00:00 2001
From: Maor Gottlieb <maorg@mellanox.com>
Date: Thu, 27 Feb 2020 13:27:08 +0200
Subject: [PATCH 08/12] RDMA/core: Fix protection fault in ib_mr_pool_destroy

Fix NULL pointer dereference in the error flow of ib_create_qp_user
when accessing to uninitialized list pointers - rdma_mrs and sig_mrs.
The following crash from syzkaller revealed it.

  kasan: GPF could be caused by NULL-ptr deref or user memory access
  general protection fault: 0000 [#1] SMP KASAN PTI
  CPU: 1 PID: 23167 Comm: syz-executor.1 Not tainted 5.5.0-rc5 #2
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
  rel-1.12.1-0-ga5cab58e9a3f-prebuilt.qemu.org 04/01/2014
  RIP: 0010:ib_mr_pool_destroy+0x81/0x1f0
  Code: 00 00 fc ff df 49 c1 ec 03 4d 01 fc e8 a8 ea 72 fe 41 80 3c 24 00
  0f 85 62 01 00 00 48 8b 13 48 89 d6 4c 8d 6a c8 48 c1 ee 03 <42> 80 3c
  3e 00 0f 85 34 01 00 00 48 8d 7a 08 4c 8b 02 48 89 fe 48
  RSP: 0018:ffffc9000951f8b0 EFLAGS: 00010046
  RAX: 0000000000040000 RBX: ffff88810f268038 RCX: ffffffff82c41628
  RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffc9000951f850
  RBP: ffff88810f268020 R08: 0000000000000004 R09: fffff520012a3f0a
  R10: 0000000000000001 R11: fffff520012a3f0a R12: ffffed1021e4d007
  R13: ffffffffffffffc8 R14: 0000000000000246 R15: dffffc0000000000
  FS:  00007f54bc788700(0000) GS:ffff88811b100000(0000)
  knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 0000000000000000 CR3: 0000000116920002 CR4: 0000000000360ee0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
   rdma_rw_cleanup_mrs+0x15/0x30
   ib_destroy_qp_user+0x674/0x7d0
   ib_create_qp_user+0xb01/0x11c0
   create_qp+0x1517/0x2130
   ib_uverbs_create_qp+0x13e/0x190
   ib_uverbs_write+0xaa5/0xdf0
   __vfs_write+0x7c/0x100
   vfs_write+0x168/0x4a0
   ksys_write+0xc8/0x200
   do_syscall_64+0x9c/0x390
   entry_SYSCALL_64_after_hwframe+0x44/0xa9
  RIP: 0033:0x465b49
  Code: f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 48 89 f8 48 89
  f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01
  f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48
  RSP: 002b:00007f54bc787c58 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
  RAX: ffffffffffffffda RBX: 000000000073bf00 RCX: 0000000000465b49
  RDX: 0000000000000040 RSI: 0000000020000540 RDI: 0000000000000003
  RBP: 00007f54bc787c70 R08: 0000000000000000 R09: 0000000000000000
  R10: 0000000000000000 R11: 0000000000000246 R12: 00007f54bc7886bc
  R13: 00000000004ca2ec R14: 000000000070ded0 R15: 0000000000000005

Fixes: a060b5629ab0 ("IB/core: generic RDMA READ/WRITE API")
Link: https://lore.kernel.org/r/20200227112708.93023-1-leon@kernel.org
Signed-off-by: Maor Gottlieb <maorg@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/core_priv.h  | 14 ++++++++++++++
 drivers/infiniband/core/uverbs_cmd.c |  9 ---------
 drivers/infiniband/core/verbs.c      | 10 ----------
 3 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index b1457b3464d3..cf42acca4a3a 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -338,6 +338,20 @@ static inline struct ib_qp *_ib_create_qp(struct ib_device *dev,
 	qp->pd = pd;
 	qp->uobject = uobj;
 	qp->real_qp = qp;
+
+	qp->qp_type = attr->qp_type;
+	qp->rwq_ind_tbl = attr->rwq_ind_tbl;
+	qp->send_cq = attr->send_cq;
+	qp->recv_cq = attr->recv_cq;
+	qp->srq = attr->srq;
+	qp->rwq_ind_tbl = attr->rwq_ind_tbl;
+	qp->event_handler = attr->event_handler;
+
+	atomic_set(&qp->usecnt, 0);
+	spin_lock_init(&qp->mr_lock);
+	INIT_LIST_HEAD(&qp->rdma_mrs);
+	INIT_LIST_HEAD(&qp->sig_mrs);
+
 	/*
 	 * We don't track XRC QPs for now, because they don't have PD
 	 * and more importantly they are created internaly by driver,
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
index 025933752e1d..060b4ebbd2ba 100644
--- a/drivers/infiniband/core/uverbs_cmd.c
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -1445,16 +1445,7 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
 		if (ret)
 			goto err_cb;
 
-		qp->pd		  = pd;
-		qp->send_cq	  = attr.send_cq;
-		qp->recv_cq	  = attr.recv_cq;
-		qp->srq		  = attr.srq;
-		qp->rwq_ind_tbl	  = ind_tbl;
-		qp->event_handler = attr.event_handler;
-		qp->qp_type	  = attr.qp_type;
-		atomic_set(&qp->usecnt, 0);
 		atomic_inc(&pd->usecnt);
-		qp->port = 0;
 		if (attr.send_cq)
 			atomic_inc(&attr.send_cq->usecnt);
 		if (attr.recv_cq)
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index 3ebae3b65c28..e62c9dfc7837 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1185,16 +1185,6 @@ struct ib_qp *ib_create_qp_user(struct ib_pd *pd,
 	if (ret)
 		goto err;
 
-	qp->qp_type    = qp_init_attr->qp_type;
-	qp->rwq_ind_tbl = qp_init_attr->rwq_ind_tbl;
-
-	atomic_set(&qp->usecnt, 0);
-	qp->mrs_used = 0;
-	spin_lock_init(&qp->mr_lock);
-	INIT_LIST_HEAD(&qp->rdma_mrs);
-	INIT_LIST_HEAD(&qp->sig_mrs);
-	qp->port = 0;
-
 	if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) {
 		struct ib_qp *xrc_qp =
 			create_xrc_qp_user(qp, qp_init_attr, udata);

From a4e63bce1414df7ab6eb82ca9feb8494ce13e554 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe <jgg@mellanox.com>
Date: Thu, 27 Feb 2020 13:41:18 +0200
Subject: [PATCH 09/12] RDMA/odp: Ensure the mm is still alive before creating
 an implicit child

Registration of a mmu_notifier requires the caller to hold a mmget() on
the mm as registration is not permitted to race with exit_mmap(). There is
a BUG_ON inside the mmu_notifier to guard against this.

Normally creating a umem is done against current which implicitly holds
the mmget(), however an implicit ODP child is created from a pagefault
work queue and is not guaranteed to have a mmget().

Call mmget() around this registration and abort faulting if the MM has
gone to exit_mmap().

Before the patch below the notifier was registered when the implicit ODP
parent was created, so there was no chance to register a notifier outside
of current.

Fixes: c571feca2dc9 ("RDMA/odp: use mmu_notifier_get/put for 'struct ib_ucontext_per_mm'")
Link: https://lore.kernel.org/r/20200227114118.94736-1-leon@kernel.org
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/umem_odp.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index b8c657b28380..cd656ad4953b 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -181,14 +181,28 @@ ib_umem_odp_alloc_child(struct ib_umem_odp *root, unsigned long addr,
 	odp_data->page_shift = PAGE_SHIFT;
 	odp_data->notifier.ops = ops;
 
+	/*
+	 * A mmget must be held when registering a notifier, the owming_mm only
+	 * has a mm_grab at this point.
+	 */
+	if (!mmget_not_zero(umem->owning_mm)) {
+		ret = -EFAULT;
+		goto out_free;
+	}
+
 	odp_data->tgid = get_pid(root->tgid);
 	ret = ib_init_umem_odp(odp_data, ops);
-	if (ret) {
-		put_pid(odp_data->tgid);
-		kfree(odp_data);
-		return ERR_PTR(ret);
-	}
+	if (ret)
+		goto out_tgid;
+	mmput(umem->owning_mm);
 	return odp_data;
+
+out_tgid:
+	put_pid(odp_data->tgid);
+	mmput(umem->owning_mm);
+out_free:
+	kfree(odp_data);
+	return ERR_PTR(ret);
 }
 EXPORT_SYMBOL(ib_umem_odp_alloc_child);
 

From 78f34a16c28654cb47791257006f90d0948f2f0c Mon Sep 17 00:00:00 2001
From: Mark Zhang <markz@mellanox.com>
Date: Thu, 27 Feb 2020 14:51:11 +0200
Subject: [PATCH 10/12] RDMA/nldev: Fix crash when set a QP to a new counter
 but QPN is missing

This fixes the kernel crash when a RDMA_NLDEV_CMD_STAT_SET command is
received, but the QP number parameter is not available.

  iwpm_register_pid: Unable to send a nlmsg (client = 2)
  infiniband syz1: RDMA CMA: cma_listen_on_dev, error -98
  general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] PREEMPT SMP KASAN
  KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007]
  CPU: 0 PID: 9754 Comm: syz-executor069 Not tainted 5.6.0-rc2-syzkaller #0
  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
  RIP: 0010:nla_get_u32 include/net/netlink.h:1474 [inline]
  RIP: 0010:nldev_stat_set_doit+0x63c/0xb70 drivers/infiniband/core/nldev.c:1760
  Code: fc 01 0f 84 58 03 00 00 e8 41 83 bf fb 4c 8b a3 58 fd ff ff 48 b8 00 00 00 00 00 fc ff df 49 8d 7c 24 04 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 6d
  RSP: 0018:ffffc900068bf350 EFLAGS: 00010247
  RAX: dffffc0000000000 RBX: ffffc900068bf728 RCX: ffffffff85b60470
  RDX: 0000000000000000 RSI: ffffffff85b6047f RDI: 0000000000000004
  RBP: ffffc900068bf750 R08: ffff88808c3ee140 R09: ffff8880a25e6010
  R10: ffffed10144bcddc R11: ffff8880a25e6ee3 R12: 0000000000000000
  R13: ffff88809acb0000 R14: ffff888092a42c80 R15: 000000009ef2e29a
  FS:  0000000001ff0880(0000) GS:ffff8880ae800000(0000) knlGS:0000000000000000
  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
  CR2: 00007f4733e34000 CR3: 00000000a9b27000 CR4: 00000000001406f0
  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
  Call Trace:
    rdma_nl_rcv_msg drivers/infiniband/core/netlink.c:195 [inline]
    rdma_nl_rcv_skb drivers/infiniband/core/netlink.c:239 [inline]
    rdma_nl_rcv+0x5d9/0x980 drivers/infiniband/core/netlink.c:259
    netlink_unicast_kernel net/netlink/af_netlink.c:1303 [inline]
    netlink_unicast+0x59e/0x7e0 net/netlink/af_netlink.c:1329
    netlink_sendmsg+0x91c/0xea0 net/netlink/af_netlink.c:1918
    sock_sendmsg_nosec net/socket.c:652 [inline]
    sock_sendmsg+0xd7/0x130 net/socket.c:672
    ____sys_sendmsg+0x753/0x880 net/socket.c:2343
    ___sys_sendmsg+0x100/0x170 net/socket.c:2397
    __sys_sendmsg+0x105/0x1d0 net/socket.c:2430
    __do_sys_sendmsg net/socket.c:2439 [inline]
    __se_sys_sendmsg net/socket.c:2437 [inline]
    __x64_sys_sendmsg+0x78/0xb0 net/socket.c:2437
    do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294
    entry_SYSCALL_64_after_hwframe+0x49/0xbe
  RIP: 0033:0x4403d9
  Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 fb 13 fc ff c3 66 2e 0f 1f 84 00 00 00 00
  RSP: 002b:00007ffc0efbc5c8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
  RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 00000000004403d9
  RDX: 0000000000000000 RSI: 0000000020000240 RDI: 0000000000000004
  RBP: 00000000006ca018 R08: 0000000000000008 R09: 00000000004002c8
  R10: 000000000000004a R11: 0000000000000246 R12: 0000000000401c60
  R13: 0000000000401cf0 R14: 0000000000000000 R15: 0000000000000000

Fixes: b389327df905 ("RDMA/nldev: Allow counter manual mode configration through RDMA netlink")
Link: https://lore.kernel.org/r/20200227125111.99142-1-leon@kernel.org
Reported-by: syzbot+bd4af81bc51ee0283445@syzkaller.appspotmail.com
Signed-off-by: Mark Zhang <markz@mellanox.com>
Signed-off-by: Leon Romanovsky <leonro@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/nldev.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index 37b433aa7306..e0b0a91da696 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -1757,6 +1757,8 @@ static int nldev_stat_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
 		if (ret)
 			goto err_msg;
 	} else {
+		if (!tb[RDMA_NLDEV_ATTR_RES_LQPN])
+			goto err_msg;
 		qpn = nla_get_u32(tb[RDMA_NLDEV_ATTR_RES_LQPN]);
 		if (tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]) {
 			cntn = nla_get_u32(tb[RDMA_NLDEV_ATTR_STAT_COUNTER_ID]);

From 12e5eef0f4d8087ea7b559f6630be08ffea2d851 Mon Sep 17 00:00:00 2001
From: Bernard Metzler <bmt@zurich.ibm.com>
Date: Mon, 2 Mar 2020 16:58:14 +0100
Subject: [PATCH 11/12] RDMA/siw: Fix failure handling during device creation

A failing call to ib_device_set_netdev() during device creation caused
system crash due to xa_destroy of uninitialized xarray hit by device
deallocation. Fixed by moving xarray initialization before potential
device deallocation.

Fixes: bdcf26bf9b3a ("rdma/siw: network and RDMA core interface")
Link: https://lore.kernel.org/r/20200302155814.9896-1-bmt@zurich.ibm.com
Reported-by: syzbot+2e80962bedd9559fe0b3@syzkaller.appspotmail.com
Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/sw/siw/siw_main.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/sw/siw/siw_main.c b/drivers/infiniband/sw/siw/siw_main.c
index 96ed349c0939..5cd40fb9e20c 100644
--- a/drivers/infiniband/sw/siw/siw_main.c
+++ b/drivers/infiniband/sw/siw/siw_main.c
@@ -388,6 +388,9 @@ static struct siw_device *siw_device_create(struct net_device *netdev)
 		{ .max_segment_size = SZ_2G };
 	base_dev->num_comp_vectors = num_possible_cpus();
 
+	xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1);
+	xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1);
+
 	ib_set_device_ops(base_dev, &siw_device_ops);
 	rv = ib_device_set_netdev(base_dev, netdev, 1);
 	if (rv)
@@ -415,9 +418,6 @@ static struct siw_device *siw_device_create(struct net_device *netdev)
 	sdev->attrs.max_srq_wr = SIW_MAX_SRQ_WR;
 	sdev->attrs.max_srq_sge = SIW_MAX_SGE;
 
-	xa_init_flags(&sdev->qp_xa, XA_FLAGS_ALLOC1);
-	xa_init_flags(&sdev->mem_xa, XA_FLAGS_ALLOC1);
-
 	INIT_LIST_HEAD(&sdev->cep_list);
 	INIT_LIST_HEAD(&sdev->qp_list);
 

From 810dbc69087b08fd53e1cdd6c709f385bc2921ad Mon Sep 17 00:00:00 2001
From: Bernard Metzler <bmt@zurich.ibm.com>
Date: Mon, 2 Mar 2020 19:16:14 +0100
Subject: [PATCH 12/12] RDMA/iwcm: Fix iwcm work deallocation

The dealloc_work_entries() function must update the work_free_list pointer
while freeing its entries, since potentially called again on same list. A
second iteration of the work list caused system crash. This happens, if
work allocation fails during cma_iw_listen() and free_cm_id() tries to
free the list again during cleanup.

Fixes: 922a8e9fb2e0 ("RDMA: iWARP Connection Manager.")
Link: https://lore.kernel.org/r/20200302181614.17042-1-bmt@zurich.ibm.com
Reported-by: syzbot+cb0c054eabfba4342146@syzkaller.appspotmail.com
Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
Reviewed-by: Jason Gunthorpe <jgg@mellanox.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
---
 drivers/infiniband/core/iwcm.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index ade71823370f..da8adadf4755 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -159,8 +159,10 @@ static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv)
 {
 	struct list_head *e, *tmp;
 
-	list_for_each_safe(e, tmp, &cm_id_priv->work_free_list)
+	list_for_each_safe(e, tmp, &cm_id_priv->work_free_list) {
+		list_del(e);
 		kfree(list_entry(e, struct iwcm_work, free_list));
+	}
 }
 
 static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count)