From d3c04a3a6870a4b96ee213d99b4f4ca723e64025 Mon Sep 17 00:00:00 2001 From: Vijay Immanuel Date: Thu, 5 Jul 2018 18:43:47 -0700 Subject: [PATCH 001/242] IB/rxe: vary the source udp port for receive scaling Select the source UDP port number for a QP based on the source QPN. This provides a better spread of traffic across NIC RX queues for RC/UC QPs. Signed-off-by: Vijay Immanuel Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_net.c | 8 ++++---- drivers/infiniband/sw/rxe/rxe_qp.c | 10 ++++++++++ drivers/infiniband/sw/rxe/rxe_verbs.h | 1 + 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 8094cbaa54a9..4ddb6b4c6a46 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -396,8 +396,8 @@ static int prepare4(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, if (!memcmp(saddr, daddr, sizeof(*daddr))) pkt->mask |= RXE_LOOPBACK_MASK; - prepare_udp_hdr(skb, htons(RXE_ROCE_V2_SPORT), - htons(ROCE_V2_UDP_DPORT)); + prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), + cpu_to_be16(ROCE_V2_UDP_DPORT)); prepare_ipv4_hdr(dst, skb, saddr->s_addr, daddr->s_addr, IPPROTO_UDP, av->grh.traffic_class, av->grh.hop_limit, df, xnet); @@ -423,8 +423,8 @@ static int prepare6(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, if (!memcmp(saddr, daddr, sizeof(*daddr))) pkt->mask |= RXE_LOOPBACK_MASK; - prepare_udp_hdr(skb, htons(RXE_ROCE_V2_SPORT), - htons(ROCE_V2_UDP_DPORT)); + prepare_udp_hdr(skb, cpu_to_be16(qp->src_port), + cpu_to_be16(ROCE_V2_UDP_DPORT)); prepare_ipv6_hdr(dst, skb, saddr, daddr, IPPROTO_UDP, av->grh.traffic_class, diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index c58452daffc7..6ff88c8250f6 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -227,6 +227,16 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, return err; qp->sk->sk->sk_user_data = qp; + /* pick a source UDP port number for this QP based on + * the source QPN. this spreads traffic for different QPs + * across different NIC RX queues (while using a single + * flow for a given QP to maintain packet order). + * the port number must be in the Dynamic Ports range + * (0xc000 - 0xffff). + */ + qp->src_port = RXE_ROCE_V2_SPORT + + (hash_32_generic(qp_num(qp), 14) & 0x3fff); + qp->sq.max_wr = init->cap.max_send_wr; qp->sq.max_sge = init->cap.max_send_sge; qp->sq.max_inline = init->cap.max_inline_data; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index af1470d29391..0d920087811a 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -248,6 +248,7 @@ struct rxe_qp { struct socket *sk; u32 dst_cookie; + u16 src_port; struct rxe_av pri_av; struct rxe_av alt_av; From 8b7b59d030cc0d34b83b0c556bb6365c9ae6ba77 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Sun, 19 Aug 2018 15:04:01 +0800 Subject: [PATCH 002/242] IB/rxe: remove redundant qpn check In commit 536ca245c512 ("IB/rxe: Drop QP0 silently"), packets whose qpn is zero are dropped before check_keys() is reached, so it is no longer necessary to check qpn there. Remove the now-redundant qpn check from check_keys().
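For illustration, the receive-path flow that makes the check dead code looks roughly like this (a simplified sketch, not the literal driver code; the QP0 drop lives in the caller, per the commit cited below):

        /* rxe_rcv(), since commit 536ca245c512: QP0 is discarded up front */
        qpn = bth_qpn(pkt);
        if (qpn == 0)
                goto drop;      /* silently drop QP0 packets */
        ...
        /* by the time check_keys() runs, qpn != 0 always holds, so its
         * "else if (qpn != 0)" arm can become a plain "else" */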
Fixes: 536ca245c512 ("IB/rxe: Drop QP0 silently") CC: Srinivas Eeda CC: Junxiao Bi Signed-off-by: Zhu Yanjun Reviewed-by: Yuval Shaia Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_recv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_recv.c b/drivers/infiniband/sw/rxe/rxe_recv.c index d30dbac24583..5c29a1bb575a 100644 --- a/drivers/infiniband/sw/rxe/rxe_recv.c +++ b/drivers/infiniband/sw/rxe/rxe_recv.c @@ -122,7 +122,7 @@ static int check_keys(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, set_bad_pkey_cntr(port); goto err1; } - } else if (qpn != 0) { + } else { if (unlikely(!pkey_match(pkey, port->pkey_tbl[qp->attr.pkey_index] ))) { @@ -134,7 +134,7 @@ static int check_keys(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, } if ((qp_type(qp) == IB_QPT_UD || qp_type(qp) == IB_QPT_GSI) && - qpn != 0 && pkt->mask) { + pkt->mask) { u32 qkey = (qpn == 1) ? GSI_QKEY : qp->attr.qkey; if (unlikely(deth_qkey(pkt) != qkey)) { From 3db2bceb29fd9a2f31d5df573f337dbba577f8e9 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 28 Aug 2018 13:51:37 +0300 Subject: [PATCH 003/242] IB/rxe: Simplify rxe_find_route() to avoid GID query for netdev rxe_prepare() is called on an skb which has ndev already initialized by rxe_init_packet(). Therefore avoid querying the GID attribute again and use the available netdevice from the skb->dev. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Reviewed-by: Yuval Shaia Tested-by: Yuval Shaia Signed-off-by: Zhu Yanjun Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_loc.h | 3 +-- drivers/infiniband/sw/rxe/rxe_net.c | 30 ++++++++++------------------ drivers/infiniband/sw/rxe/rxe_req.c | 2 +- drivers/infiniband/sw/rxe/rxe_resp.c | 2 +- 4 files changed, 13 insertions(+), 24 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h index 87d14f7ef21b..8e305422adbb 100644 --- a/drivers/infiniband/sw/rxe/rxe_loc.h +++ b/drivers/infiniband/sw/rxe/rxe_loc.h @@ -144,8 +144,7 @@ void rxe_loopback(struct sk_buff *skb); int rxe_send(struct rxe_pkt_info *pkt, struct sk_buff *skb); struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, int paylen, struct rxe_pkt_info *pkt); -int rxe_prepare(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, - struct sk_buff *skb, u32 *crc); +int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb, u32 *crc); enum rdma_link_layer rxe_link_layer(struct rxe_dev *rxe, unsigned int port_num); const char *rxe_parent_name(struct rxe_dev *rxe, unsigned int port_num); struct device *rxe_dma_device(struct rxe_dev *rxe); diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 4ddb6b4c6a46..ed823cccca48 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -182,19 +182,11 @@ static struct dst_entry *rxe_find_route6(struct net_device *ndev, #endif -static struct dst_entry *rxe_find_route(struct rxe_dev *rxe, +static struct dst_entry *rxe_find_route(struct net_device *ndev, struct rxe_qp *qp, struct rxe_av *av) { - const struct ib_gid_attr *attr; struct dst_entry *dst = NULL; - struct net_device *ndev; - - attr = rdma_get_gid_attr(&rxe->ib_dev, qp->attr.port_num, - av->grh.sgid_index); - if (IS_ERR(attr)) - return NULL; - ndev = attr->ndev; if (qp_type(qp) == IB_QPT_RC) dst = sk_dst_get(qp->sk->sk); @@ -229,7 +221,6 @@ static struct dst_entry *rxe_find_route(struct rxe_dev *rxe, sk_dst_set(qp->sk->sk, dst); } } - 
rdma_put_gid_attr(attr); return dst; } @@ -377,8 +368,8 @@ static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb, ip6h->payload_len = htons(skb->len - sizeof(*ip6h)); } -static int prepare4(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, - struct sk_buff *skb, struct rxe_av *av) +static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb, + struct rxe_av *av) { struct rxe_qp *qp = pkt->qp; struct dst_entry *dst; @@ -387,7 +378,7 @@ static int prepare4(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr; struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr; - dst = rxe_find_route(rxe, qp, av); + dst = rxe_find_route(skb->dev, qp, av); if (!dst) { pr_err("Host not reachable\n"); return -EHOSTUNREACH; } @@ -406,15 +397,15 @@ static int prepare4(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, return 0; } -static int prepare6(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, - struct sk_buff *skb, struct rxe_av *av) +static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb, + struct rxe_av *av) { struct rxe_qp *qp = pkt->qp; struct dst_entry *dst; struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr; struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr; - dst = rxe_find_route(rxe, qp, av); + dst = rxe_find_route(skb->dev, qp, av); if (!dst) { pr_err("Host not reachable\n"); return -EHOSTUNREACH; } @@ -434,16 +425,15 @@ static int prepare6(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, return 0; } -int rxe_prepare(struct rxe_dev *rxe, struct rxe_pkt_info *pkt, - struct sk_buff *skb, u32 *crc) +int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb, u32 *crc) { int err = 0; struct rxe_av *av = rxe_get_av(pkt); if (av->network_type == RDMA_NETWORK_IPV4) - err = prepare4(rxe, pkt, skb, av); + err = prepare4(pkt, skb, av); else if (av->network_type == RDMA_NETWORK_IPV6) - err = prepare6(rxe, pkt, skb, av); + err = prepare6(pkt, skb, av); *crc = rxe_icrc_hdr(pkt, skb); diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 8be27238a86e..84d35520781c 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -476,7 +476,7 @@ static int fill_packet(struct rxe_qp *qp, struct rxe_send_wqe *wqe, u32 *p; int err; - err = rxe_prepare(rxe, pkt, skb, &crc); + err = rxe_prepare(pkt, skb, &crc); if (err) return err; diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index aa5833318372..c1948d406528 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -637,7 +637,7 @@ static struct sk_buff *prepare_ack_packet(struct rxe_qp *qp, if (ack->mask & RXE_ATMACK_MASK) atmack_set_orig(ack, qp->resp.atomic_orig); - err = rxe_prepare(rxe, ack, skb, &crc); + err = rxe_prepare(ack, skb, &crc); if (err) { kfree_skb(skb); return NULL; From 66d0f207dbf0e983c8006eb2fc5060163646210f Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 27 Aug 2018 08:44:14 +0300 Subject: [PATCH 004/242] IB/rxe: Replace spinlock with rwlock Concurrent readers, which only read the rb tree, are protected by a read lock. Concurrent writers, which add elements to the pool, are protected by a write lock.
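The resulting locking pattern is the standard one from linux/rwlock.h (a minimal sketch; the hunks below show the actual conversion):

        /* lookups: multiple readers may walk the rb tree concurrently */
        read_lock_irqsave(&pool->pool_lock, flags);
        /* ... rb-tree search ... */
        read_unlock_irqrestore(&pool->pool_lock, flags);

        /* insert/erase: writers take the lock exclusively */
        write_lock_irqsave(&pool->pool_lock, flags);
        /* ... rb-tree modification ... */
        write_unlock_irqrestore(&pool->pool_lock, flags);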
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Reviewed-by: Yuval Shaia Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_pool.c | 36 ++++++++++++++-------------- drivers/infiniband/sw/rxe/rxe_pool.h | 2 +- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index b4a8acc7bb7d..be076d818891 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -207,7 +207,7 @@ int rxe_pool_init( kref_init(&pool->ref_cnt); - spin_lock_init(&pool->pool_lock); + rwlock_init(&pool->pool_lock); if (rxe_type_info[type].flags & RXE_POOL_INDEX) { err = rxe_pool_init_index(pool, @@ -245,12 +245,12 @@ int rxe_pool_cleanup(struct rxe_pool *pool) { unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); + write_lock_irqsave(&pool->pool_lock, flags); pool->state = rxe_pool_invalid; if (atomic_read(&pool->num_elem) > 0) pr_warn("%s pool destroyed with unfree'd elem\n", pool_name(pool)); - spin_unlock_irqrestore(&pool->pool_lock, flags); + write_unlock_irqrestore(&pool->pool_lock, flags); rxe_pool_put(pool); @@ -336,10 +336,10 @@ void rxe_add_key(void *arg, void *key) struct rxe_pool *pool = elem->pool; unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); + write_lock_irqsave(&pool->pool_lock, flags); memcpy((u8 *)elem + pool->key_offset, key, pool->key_size); insert_key(pool, elem); - spin_unlock_irqrestore(&pool->pool_lock, flags); + write_unlock_irqrestore(&pool->pool_lock, flags); } void rxe_drop_key(void *arg) @@ -348,9 +348,9 @@ void rxe_drop_key(void *arg) struct rxe_pool *pool = elem->pool; unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); + write_lock_irqsave(&pool->pool_lock, flags); rb_erase(&elem->node, &pool->tree); - spin_unlock_irqrestore(&pool->pool_lock, flags); + write_unlock_irqrestore(&pool->pool_lock, flags); } void rxe_add_index(void *arg) @@ -359,10 +359,10 @@ void rxe_add_index(void *arg) struct rxe_pool *pool = elem->pool; unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); + write_lock_irqsave(&pool->pool_lock, flags); elem->index = alloc_index(pool); insert_index(pool, elem); - spin_unlock_irqrestore(&pool->pool_lock, flags); + write_unlock_irqrestore(&pool->pool_lock, flags); } void rxe_drop_index(void *arg) @@ -371,10 +371,10 @@ void rxe_drop_index(void *arg) struct rxe_pool *pool = elem->pool; unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); + write_lock_irqsave(&pool->pool_lock, flags); clear_bit(elem->index - pool->min_index, pool->table); rb_erase(&elem->node, &pool->tree); - spin_unlock_irqrestore(&pool->pool_lock, flags); + write_unlock_irqrestore(&pool->pool_lock, flags); } void *rxe_alloc(struct rxe_pool *pool) @@ -384,13 +384,13 @@ void *rxe_alloc(struct rxe_pool *pool) might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC)); - spin_lock_irqsave(&pool->pool_lock, flags); + read_lock_irqsave(&pool->pool_lock, flags); if (pool->state != rxe_pool_valid) { - spin_unlock_irqrestore(&pool->pool_lock, flags); + read_unlock_irqrestore(&pool->pool_lock, flags); return NULL; } kref_get(&pool->ref_cnt); - spin_unlock_irqrestore(&pool->pool_lock, flags); + read_unlock_irqrestore(&pool->pool_lock, flags); kref_get(&pool->rxe->ref_cnt); @@ -436,7 +436,7 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) struct rxe_pool_entry *elem = NULL; unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); + read_lock_irqsave(&pool->pool_lock, 
flags); if (pool->state != rxe_pool_valid) goto out; @@ -458,7 +458,7 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) kref_get(&elem->ref_cnt); out: - spin_unlock_irqrestore(&pool->pool_lock, flags); + read_unlock_irqrestore(&pool->pool_lock, flags); return node ? elem : NULL; } @@ -469,7 +469,7 @@ void *rxe_pool_get_key(struct rxe_pool *pool, void *key) int cmp; unsigned long flags; - spin_lock_irqsave(&pool->pool_lock, flags); + read_lock_irqsave(&pool->pool_lock, flags); if (pool->state != rxe_pool_valid) goto out; @@ -494,6 +494,6 @@ void *rxe_pool_get_key(struct rxe_pool *pool, void *key) kref_get(&elem->ref_cnt); out: - spin_unlock_irqrestore(&pool->pool_lock, flags); + read_unlock_irqrestore(&pool->pool_lock, flags); return node ? elem : NULL; } diff --git a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h index 47df28e43acf..6eb344236afc 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -90,7 +90,7 @@ struct rxe_pool_entry { struct rxe_pool { struct rxe_dev *rxe; - spinlock_t pool_lock; /* pool spinlock */ + rwlock_t pool_lock; /* protects pool add/del/search */ size_t elem_size; struct kref ref_cnt; void (*cleanup)(struct rxe_pool_entry *obj); From 3ccf19e25adf2d3b9c5f1e65b950b6d223e1c4c0 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 27 Aug 2018 08:44:15 +0300 Subject: [PATCH 005/242] IB/rxe: Change pool state enums to capital letters Normal practice is to have enum defines in capital letters. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Reviewed-by: Yuval Shaia Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_pool.c | 12 ++++++------ drivers/infiniband/sw/rxe/rxe_pool.h | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index be076d818891..c8598b91e454 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -222,7 +222,7 @@ int rxe_pool_init( pool->key_size = rxe_type_info[type].key_size; } - pool->state = rxe_pool_valid; + pool->state = RXE_POOL_STATE_VALID; out: return err; @@ -232,7 +232,7 @@ static void rxe_pool_release(struct kref *kref) { struct rxe_pool *pool = container_of(kref, struct rxe_pool, ref_cnt); - pool->state = rxe_pool_invalid; + pool->state = RXE_POOL_STATE_INVALID; kfree(pool->table); } @@ -246,7 +246,7 @@ int rxe_pool_cleanup(struct rxe_pool *pool) unsigned long flags; write_lock_irqsave(&pool->pool_lock, flags); - pool->state = rxe_pool_invalid; + pool->state = RXE_POOL_STATE_INVALID; if (atomic_read(&pool->num_elem) > 0) pr_warn("%s pool destroyed with unfree'd elem\n", pool_name(pool)); @@ -385,7 +385,7 @@ void *rxe_alloc(struct rxe_pool *pool) might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC)); read_lock_irqsave(&pool->pool_lock, flags); - if (pool->state != rxe_pool_valid) { + if (pool->state != RXE_POOL_STATE_VALID) { read_unlock_irqrestore(&pool->pool_lock, flags); return NULL; } @@ -438,7 +438,7 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) read_lock_irqsave(&pool->pool_lock, flags); - if (pool->state != rxe_pool_valid) + if (pool->state != RXE_POOL_STATE_VALID) goto out; node = pool->tree.rb_node; @@ -471,7 +471,7 @@ void *rxe_pool_get_key(struct rxe_pool *pool, void *key) read_lock_irqsave(&pool->pool_lock, flags); - if (pool->state != rxe_pool_valid) + if (pool->state != RXE_POOL_STATE_VALID) goto out; node = pool->tree.rb_node; diff --git 
a/drivers/infiniband/sw/rxe/rxe_pool.h b/drivers/infiniband/sw/rxe/rxe_pool.h index 6eb344236afc..aa4ba307097b 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.h +++ b/drivers/infiniband/sw/rxe/rxe_pool.h @@ -74,8 +74,8 @@ struct rxe_type_info { extern struct rxe_type_info rxe_type_info[]; enum rxe_pool_state { - rxe_pool_invalid, - rxe_pool_valid, + RXE_POOL_STATE_INVALID, + RXE_POOL_STATE_VALID, }; struct rxe_pool_entry { From 536a631d1ec1de6ed51453954407068c10473eea Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 27 Aug 2018 08:44:16 +0300 Subject: [PATCH 006/242] IB/rxe: Avoid NULL check when search is successful While performing a lookup in a pool, if the entry is found, take the reference right there instead of checking again outside the loop; this saves one branch. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Reviewed-by: Yuval Shaia Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_pool.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_pool.c b/drivers/infiniband/sw/rxe/rxe_pool.c index c8598b91e454..36b53fb94a49 100644 --- a/drivers/infiniband/sw/rxe/rxe_pool.c +++ b/drivers/infiniband/sw/rxe/rxe_pool.c @@ -450,13 +450,12 @@ void *rxe_pool_get_index(struct rxe_pool *pool, u32 index) node = node->rb_left; else if (elem->index < index) node = node->rb_right; - else + else { + kref_get(&elem->ref_cnt); break; + } } - if (node) - kref_get(&elem->ref_cnt); - out: read_unlock_irqrestore(&pool->pool_lock, flags); return node ? elem : NULL; From 1703129ed2c12c112889f471bddf242012791b47 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 27 Aug 2018 08:44:17 +0300 Subject: [PATCH 007/242] IB/rxe: Refactor lookup memory function Consolidate all error checks under a single if() condition and use the unlikely() helper macro for them; in addition, drop the unneeded goto labels. rxe_pool_get_index() already provides an efficient RB tree based lookup. Avoid doing extra checks for error cases which are rare and already covered by rxe_pool_get_index().
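Condensed, the refactored lookup reduces to one unlikely() branch for all of the rare failure modes (a sketch; unlikely() expands to __builtin_expect(!!(x), 0), letting the compiler keep the error path out of the hot path):

        mem = rxe_pool_get_index(&rxe->mr_pool, key >> 8);
        if (!mem)
                return NULL;
        /* placeholder condition names; the hunk below has the real ones */
        if (unlikely(key_mismatch || wrong_pd || no_access || bad_state)) {
                rxe_drop_ref(mem);
                mem = NULL;
        }
        return mem;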
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Reviewed-by: Yuval Shaia Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_mr.c | 35 ++++++++++-------------------- 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c index dff605fdf60f..9d3916b93f23 100644 --- a/drivers/infiniband/sw/rxe/rxe_mr.c +++ b/drivers/infiniband/sw/rxe/rxe_mr.c @@ -573,33 +573,20 @@ struct rxe_mem *lookup_mem(struct rxe_pd *pd, int access, u32 key, struct rxe_dev *rxe = to_rdev(pd->ibpd.device); int index = key >> 8; - if (index >= RXE_MIN_MR_INDEX && index <= RXE_MAX_MR_INDEX) { - mem = rxe_pool_get_index(&rxe->mr_pool, index); - if (!mem) - goto err1; - } else { - goto err1; + mem = rxe_pool_get_index(&rxe->mr_pool, index); + if (!mem) + return NULL; + + if (unlikely((type == lookup_local && mem->lkey != key) || + (type == lookup_remote && mem->rkey != key) || + mem->pd != pd || + (access && !(access & mem->access)) || + mem->state != RXE_MEM_STATE_VALID)) { + rxe_drop_ref(mem); + mem = NULL; } - if ((type == lookup_local && mem->lkey != key) || - (type == lookup_remote && mem->rkey != key)) - goto err2; - - if (mem->pd != pd) - goto err2; - - if (access && !(access & mem->access)) - goto err2; - - if (mem->state != RXE_MEM_STATE_VALID) - goto err2; - return mem; - -err2: - rxe_drop_ref(mem); -err1: - return NULL; } int rxe_mem_map_pages(struct rxe_dev *rxe, struct rxe_mem *mem, From b97db58557f4aa6d9903f8e1deea6b3d1ed0ba43 Mon Sep 17 00:00:00 2001 From: Vijay Immanuel Date: Tue, 12 Jun 2018 18:20:49 -0700 Subject: [PATCH 008/242] IB/rxe: fix for duplicate request processing and ack psns Don't reset the resp opcode for a replayed read response. The resp opcode could be in the middle of a write or send sequence, when the duplicate read request was received. An example sequence is as follows: - Receive read request for 12KB PSN 20. Transmit read response first, middle and last with PSNs 20,21,22. - Receive write first PSN 23. At this point the resp psn is 24 and resp opcode is write first. - The sender notices that PSN 20 is dropped and retransmits. Receive read request for 12KB PSN 20. Transmit read response first, middle and last with PSNs 20,21,22. The resp opcode is set to -1, the resp psn remains 24. - Receive write first PSN 23. This is processed by duplicate_request(). The resp opcode remains -1 and resp psn remains 24. - Receive write middle PSN 24. check_op_seq() reports a missing first error since the resp opcode is -1. When sending an ack for a duplicate send or write request, use the psn of the previous ack sent. Do not use the psn of a read response for the ack. An example sequence is as follows: - Receive write PSN 30. Transmit ACK for PSN 30. - Receive read request 4KB PSN 31. Transmit read response with PSN 31. The resp psn is now 32. - The sender notices that PSN 30 is dropped and retransmits. Receive write PSN 30. duplicate_request() sends an ACK with PSN 31. That is incorrect since PSN 31 was a read request. 
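In code terms, the responder now tracks the last acked request PSN separately from resp.psn and flags replayed reads (a condensed sketch of the hunks below):

        /* execute(): resp.psn also advances across read response data,
         * but resp.ack_psn only follows acked requests */
        qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
        qp->resp.ack_psn = qp->resp.psn;
        ...
        /* duplicate_request(): ack with the previous *acked* psn, which
         * can never point into a read response sequence */
        u32 prev_psn = (qp->resp.ack_psn - 1) & BTH_PSN_MASK;
        ...
        /* read_reply(): keep resp.opcode intact when finishing a replay */
        if (!res->replay)
                qp->resp.opcode = -1;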
Signed-off-by: Vijay Immanuel Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_resp.c | 8 ++++++-- drivers/infiniband/sw/rxe/rxe_verbs.h | 2 ++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_resp.c b/drivers/infiniband/sw/rxe/rxe_resp.c index c1948d406528..c962160292f4 100644 --- a/drivers/infiniband/sw/rxe/rxe_resp.c +++ b/drivers/infiniband/sw/rxe/rxe_resp.c @@ -682,6 +682,7 @@ static enum resp_states read_reply(struct rxe_qp *qp, rxe_advance_resp_resource(qp); res->type = RXE_READ_MASK; + res->replay = 0; res->read.va = qp->resp.va; res->read.va_org = qp->resp.va; @@ -752,7 +753,8 @@ static enum resp_states read_reply(struct rxe_qp *qp, state = RESPST_DONE; } else { qp->resp.res = NULL; - qp->resp.opcode = -1; + if (!res->replay) + qp->resp.opcode = -1; if (psn_compare(res->cur_psn, qp->resp.psn) >= 0) qp->resp.psn = res->cur_psn; state = RESPST_CLEANUP; @@ -814,6 +816,7 @@ static enum resp_states execute(struct rxe_qp *qp, struct rxe_pkt_info *pkt) /* next expected psn, read handles this separately */ qp->resp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + qp->resp.ack_psn = qp->resp.psn; qp->resp.opcode = pkt->opcode; qp->resp.status = IB_WC_SUCCESS; @@ -1065,7 +1068,7 @@ static enum resp_states duplicate_request(struct rxe_qp *qp, struct rxe_pkt_info *pkt) { enum resp_states rc; - u32 prev_psn = (qp->resp.psn - 1) & BTH_PSN_MASK; + u32 prev_psn = (qp->resp.ack_psn - 1) & BTH_PSN_MASK; if (pkt->mask & RXE_SEND_MASK || pkt->mask & RXE_WRITE_MASK) { @@ -1108,6 +1111,7 @@ static enum resp_states duplicate_request(struct rxe_qp *qp, res->state = (pkt->psn == res->first_psn) ? rdatm_res_state_new : rdatm_res_state_replay; + res->replay = 1; /* Reset the resource, except length. */ res->read.va_org = iova; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 0d920087811a..979e987e3c46 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -171,6 +171,7 @@ enum rdatm_res_state { struct resp_res { int type; + int replay; u32 first_psn; u32 last_psn; u32 cur_psn; @@ -195,6 +196,7 @@ struct rxe_resp_info { enum rxe_qp_state state; u32 msn; u32 psn; + u32 ack_psn; int opcode; int drop_msg; int goto_error; From 4e4c53df567714b3d08b2b5d8ccb1d175fc9be01 Mon Sep 17 00:00:00 2001 From: Vijay Immanuel Date: Wed, 13 Jun 2018 18:47:30 -0700 Subject: [PATCH 009/242] IB/rxe: avoid back-to-back retries Error retries can occur due to timeouts, NAKs or receiving packets beyond the current read request. Avoid back-to-back retries due to packet processing, by only retrying the initial attempt immediately. Subsequent retries must be due to timeouts. Continue to process completion packets after scheduling a retry. 
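Condensed, the guard added to rxe_completer() is (sketch; skb cleanup elided):

        /* a (non-timeout) retry is already in flight; don't start another */
        if (qp->comp.started_retry && !qp->comp.timeout_retry)
                goto done;      /* keep consuming completion packets */
        ...
        qp->req.need_retry = 1;
        qp->comp.started_retry = 1;     /* cleared in reset_retry_counters() */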
Signed-off-by: Vijay Immanuel Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_comp.c | 18 +++++++++++++++++- drivers/infiniband/sw/rxe/rxe_verbs.h | 1 + 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index 83311dd07019..ed96441595d8 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -191,6 +191,7 @@ static inline void reset_retry_counters(struct rxe_qp *qp) { qp->comp.retry_cnt = qp->attr.retry_cnt; qp->comp.rnr_retry = qp->attr.rnr_retry; + qp->comp.started_retry = 0; } static inline enum comp_state check_psn(struct rxe_qp *qp, @@ -676,6 +677,20 @@ int rxe_completer(void *arg) goto exit; } + /* if we've started a retry, don't start another + * retry sequence, unless this is a timeout. + */ + if (qp->comp.started_retry && + !qp->comp.timeout_retry) { + if (pkt) { + rxe_drop_ref(pkt->qp); + kfree_skb(skb); + skb = NULL; + } + + goto done; + } + if (qp->comp.retry_cnt > 0) { if (qp->comp.retry_cnt != 7) qp->comp.retry_cnt--; @@ -692,6 +707,7 @@ int rxe_completer(void *arg) rxe_counter_inc(rxe, RXE_CNT_COMP_RETRY); qp->req.need_retry = 1; + qp->comp.started_retry = 1; rxe_run_task(&qp->req.task, 1); } @@ -701,7 +717,7 @@ int rxe_completer(void *arg) skb = NULL; } - goto exit; + goto done; } else { rxe_counter_inc(rxe, RXE_CNT_RETRY_EXCEEDED); diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h index 979e987e3c46..82e670d6eeea 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.h +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h @@ -158,6 +158,7 @@ struct rxe_comp_info { int opcode; int timeout; int timeout_retry; + int started_retry; u32 retry_cnt; u32 rnr_retry; struct rxe_task task; From 030e46e495af855a13964a0aab9753ea82a96edc Mon Sep 17 00:00:00 2001 From: Vijay Immanuel Date: Wed, 13 Jun 2018 18:48:07 -0700 Subject: [PATCH 010/242] IB/rxe: fixes for rdma read retry When a read request is retried for the remaining partial data, the response may restart from read response first or read response only. So support those cases. Do not advance the comp psn beyond the current wqe's last_psn as that could skip over an entire read wqe and will cause the req_retry() logic to set an incorrect req psn. An example sequence is as follows: Write PSN 40 -- this is the current WQE. Read request PSN 41 Write PSN 42 Receive ACK PSN 42 -- this will complete the current WQE for PSN 40, and set the comp psn to 42 which is a problem because the read request at PSN 41 has been skipped over. So when req_retry() tries to retransmit the read request, it sets the req psn to 42 which is incorrect. When retrying a read request, calculate the number of psns completed based on the dma resid instead of the wqe first_psn. The wqe first_psn could have moved if the read request was retried multiple times. Set the reth length to the dma resid to handle read retries for the remaining partial data. 
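Condensed, the retry math becomes (sketch of the req_retry() and init_req_packet() hunks below):

        if (mask & WR_READ_MASK) {
                /* psns already completed, derived from the dma state rather
                 * than first_psn, which may have moved on earlier retries */
                npsn = (wqe->dma.length - wqe->dma.resid) / qp->mtu;
                wqe->iova += npsn * qp->mtu;
        }
        ...
        reth_set_len(pkt, wqe->dma.resid);      /* ask only for the remainder */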
Signed-off-by: Vijay Immanuel Signed-off-by: Doug Ledford --- drivers/infiniband/sw/rxe/rxe_comp.c | 21 ++++++++++++++++----- drivers/infiniband/sw/rxe/rxe_req.c | 15 +++++++++------ 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c index ed96441595d8..ea089cb091ad 100644 --- a/drivers/infiniband/sw/rxe/rxe_comp.c +++ b/drivers/infiniband/sw/rxe/rxe_comp.c @@ -254,6 +254,17 @@ static inline enum comp_state check_ack(struct rxe_qp *qp, case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE: if (pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE && pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST) { + /* read retries of partial data may restart from + * read response first or response only. + */ + if ((pkt->psn == wqe->first_psn && + pkt->opcode == + IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) || + (wqe->first_psn == wqe->last_psn && + pkt->opcode == + IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY)) + break; + return COMPST_ERROR; } break; @@ -500,11 +511,11 @@ static inline enum comp_state complete_wqe(struct rxe_qp *qp, struct rxe_pkt_info *pkt, struct rxe_send_wqe *wqe) { - qp->comp.opcode = -1; - - if (pkt) { - if (psn_compare(pkt->psn, qp->comp.psn) >= 0) - qp->comp.psn = (pkt->psn + 1) & BTH_PSN_MASK; + if (pkt && wqe->state == wqe_state_pending) { + if (psn_compare(wqe->last_psn, qp->comp.psn) >= 0) { + qp->comp.psn = (wqe->last_psn + 1) & BTH_PSN_MASK; + qp->comp.opcode = -1; + } if (qp->req.wait_psn) { qp->req.wait_psn = 0; diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c index 84d35520781c..6c361d70d7cd 100644 --- a/drivers/infiniband/sw/rxe/rxe_req.c +++ b/drivers/infiniband/sw/rxe/rxe_req.c @@ -73,9 +73,6 @@ static void req_retry(struct rxe_qp *qp) int npsn; int first = 1; - wqe = queue_head(qp->sq.queue); - npsn = (qp->comp.psn - wqe->first_psn) & BTH_PSN_MASK; - qp->req.wqe_index = consumer_index(qp->sq.queue); qp->req.psn = qp->comp.psn; qp->req.opcode = -1; @@ -107,11 +104,17 @@ static void req_retry(struct rxe_qp *qp) if (first) { first = 0; - if (mask & WR_WRITE_OR_SEND_MASK) + if (mask & WR_WRITE_OR_SEND_MASK) { + npsn = (qp->comp.psn - wqe->first_psn) & + BTH_PSN_MASK; retry_first_write_send(qp, wqe, mask, npsn); + } - if (mask & WR_READ_MASK) + if (mask & WR_READ_MASK) { + npsn = (wqe->dma.length - wqe->dma.resid) / + qp->mtu; wqe->iova += npsn * qp->mtu; + } } wqe->state = wqe_state_posted; @@ -435,7 +438,7 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp, if (pkt->mask & RXE_RETH_MASK) { reth_set_rkey(pkt, ibwr->wr.rdma.rkey); reth_set_va(pkt, wqe->iova); - reth_set_len(pkt, wqe->dma.length); + reth_set_len(pkt, wqe->dma.resid); } if (pkt->mask & RXE_IMMDT_MASK) From c54a73d8202a3f072f6635ec1046edb34f770a81 Mon Sep 17 00:00:00 2001 From: Dennis Dalessandro Date: Wed, 15 Aug 2018 22:58:31 -0700 Subject: [PATCH 011/242] IB/hfi1: Rework file list in Makefile We want to keep files in alphabetical order in our makefile, however this just makes for messy diffs when adding (or removing) files. Let's just clean this up and make it line by line. Reviewed-by: Michael J. 
Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/Makefile | 40 ++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile index f451ba912f47..05b680426e37 100644 --- a/drivers/infiniband/hw/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -8,12 +8,40 @@ # obj-$(CONFIG_INFINIBAND_HFI1) += hfi1.o -hfi1-y := affinity.o chip.o device.o driver.o efivar.o \ - eprom.o exp_rcv.o file_ops.o firmware.o \ - init.o intr.o mad.o mmu_rb.o pcie.o pio.o pio_copy.o platform.o \ - qp.o qsfp.o rc.o ruc.o sdma.o sysfs.o trace.o \ - uc.o ud.o user_exp_rcv.o user_pages.o user_sdma.o verbs.o \ - verbs_txreq.o vnic_main.o vnic_sdma.o +hfi1-y := \ + affinity.o \ + chip.o \ + device.o \ + driver.o \ + efivar.o \ + eprom.o \ + exp_rcv.o \ + file_ops.o \ + firmware.o \ + init.o \ + intr.o \ + mad.o \ + mmu_rb.o \ + pcie.o \ + pio.o \ + pio_copy.o \ + platform.o \ + qp.o \ + qsfp.o \ + rc.o \ + ruc.o \ + sdma.o \ + sysfs.o \ + trace.o \ + uc.o \ + ud.o \ + user_exp_rcv.o \ + user_pages.o \ + user_sdma.o \ + verbs.o \ + verbs_txreq.o \ + vnic_main.o \ + vnic_sdma.o ifdef CONFIG_DEBUG_FS hfi1-y += debugfs.o From 22c21438aad76022b6e41fea02e7e382d94ab361 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Wed, 15 Aug 2018 22:58:40 -0700 Subject: [PATCH 012/242] IB/hfi1: Remove duplicated defines TXREQ defines are duplicated, incompletely, in the sdma header file. Remove duplicate defines. Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/sdma.h | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h index 46c775f255d1..f2d83294271e 100644 --- a/drivers/infiniband/hw/hfi1/sdma.h +++ b/drivers/infiniband/hw/hfi1/sdma.h @@ -62,16 +62,6 @@ /* Hardware limit for SDMA packet size */ #define MAX_SDMA_PKT_SIZE ((16 * 1024) - 1) -#define SDMA_TXREQ_S_OK 0 -#define SDMA_TXREQ_S_SENDERROR 1 -#define SDMA_TXREQ_S_ABORTED 2 -#define SDMA_TXREQ_S_SHUTDOWN 3 - -/* flags bits */ -#define SDMA_TXREQ_F_URGENT 0x0001 -#define SDMA_TXREQ_F_AHG_COPY 0x0002 -#define SDMA_TXREQ_F_USE_AHG 0x0004 - #define SDMA_MAP_NONE 0 #define SDMA_MAP_SINGLE 1 #define SDMA_MAP_PAGE 2 From 6a516bc9d70b291d3271c6c66b89c25e3c2b5795 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Wed, 15 Aug 2018 22:58:49 -0700 Subject: [PATCH 013/242] IB/hfi1: tune_pcie_caps is arbitrarily placed, poorly The tune_pcie_caps needs to occur sometime after PCI is enabled, but before the HFI is enabled. Currently it is placed in the MSIx allocation code which doesn't really fit. Moving it to just after the gen3 bump. Clean up the associated code (modules, etc.). Reviewed-by: Mike Marciniszyn Reviewed-by: Sadanand Warrier Signed-off-by: Michael J. 
Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 6 ++++++ drivers/infiniband/hw/hfi1/hfi.h | 1 + drivers/infiniband/hw/hfi1/pcie.c | 18 ++++++++---------- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 2c19bf772451..db6b095e9d15 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -15123,6 +15123,12 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, if (ret) goto bail_cleanup; + /* + * This should probably occur in hfi1_pcie_init(), but historically + * occurs after the do_pcie_gen3_transition() code. + */ + tune_pcie_caps(dd); + /* start setting dd values and adjusting CSRs */ init_early_variables(dd); diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index d9470317983f..f5e88d7f844a 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1984,6 +1984,7 @@ int request_msix(struct hfi1_devdata *dd, u32 msireq); int restore_pci_variables(struct hfi1_devdata *dd); int save_pci_variables(struct hfi1_devdata *dd); int do_pcie_gen3_transition(struct hfi1_devdata *dd); +void tune_pcie_caps(struct hfi1_devdata *dd); int parse_platform_config(struct hfi1_devdata *dd); int get_platform_config_field(struct hfi1_devdata *dd, enum platform_config_table_type_encoding diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index eec83757d55f..b7473399ec6f 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -60,11 +60,6 @@ * This file contains PCIe utility routines. */ -/* - * Code to adjust PCIe capabilities. - */ -static void tune_pcie_caps(struct hfi1_devdata *); - /* * Do all the common PCIe setup and initialization. * devdata is not yet allocated, and is not allocated until after this @@ -359,8 +354,6 @@ int request_msix(struct hfi1_devdata *dd, u32 msireq) return nvec; } - tune_pcie_caps(dd); - return nvec; } @@ -479,14 +472,19 @@ error: * Check and optionally adjust them to maximize our throughput. */ static int hfi1_pcie_caps; -module_param_named(pcie_caps, hfi1_pcie_caps, int, S_IRUGO); +module_param_named(pcie_caps, hfi1_pcie_caps, int, 0444); MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)"); uint aspm_mode = ASPM_MODE_DISABLED; -module_param_named(aspm, aspm_mode, uint, S_IRUGO); +module_param_named(aspm, aspm_mode, uint, 0444); MODULE_PARM_DESC(aspm, "PCIe ASPM: 0: disable, 1: enable, 2: dynamic"); -static void tune_pcie_caps(struct hfi1_devdata *dd) +/** + * tune_pcie_caps() - Code to adjust PCIe capabilities. + * @dd: Valid device data structure + * + */ +void tune_pcie_caps(struct hfi1_devdata *dd) { struct pci_dev *parent; u16 rc_mpss, rc_mps, ep_mpss, ep_mps; From 57f97e96625fe89246827ebb4d129741a4f83c40 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Wed, 15 Aug 2018 23:03:46 -0700 Subject: [PATCH 014/242] IB/hfi1: Get the hfi1_devdata structure as early as possible Currently several things occur before the hfi1_devdata structure is allocated. This leads to an inconsistent logging ability and makes it more difficult to restructure some code paths. Allocate (and do a minimal init) the structure as soon as possible. Reviewed-by: Mike Marciniszyn Reviewed-by: Sadanand Warrier Signed-off-by: Michael J. 
Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 22 ++------- drivers/infiniband/hw/hfi1/hfi.h | 19 +------- drivers/infiniband/hw/hfi1/init.c | 77 +++++++++++++++++-------------- drivers/infiniband/hw/hfi1/pcie.c | 27 ++++------- 4 files changed, 59 insertions(+), 86 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index db6b095e9d15..eab25d54e136 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -67,8 +67,6 @@ #include "debugfs.h" #include "fault.h" -#define NUM_IB_PORTS 1 - uint kdeth_qp; module_param_named(kdeth_qp, kdeth_qp, uint, S_IRUGO); MODULE_PARM_DESC(kdeth_qp, "Set the KDETH queue pair prefix"); @@ -14914,20 +14912,16 @@ err_exit: } /** - * Allocate and initialize the device structure for the hfi. + * hfi1_init_dd() - Initialize most of the dd structure. * @dev: the pci_dev for hfi1_ib device * @ent: pci_device_id struct for this dev * - * Also allocates, initializes, and returns the devdata struct for this - * device instance - * * This is global, and is called directly at init to set up the * chip-specific function pointers for later use. */ -struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, - const struct pci_device_id *ent) +int hfi1_init_dd(struct hfi1_devdata *dd) { - struct hfi1_devdata *dd; + struct pci_dev *pdev = dd->pcidev; struct hfi1_pportdata *ppd; u64 reg; int i, ret; @@ -14938,13 +14932,8 @@ struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, "Functional simulator" }; struct pci_dev *parent = pdev->bus->self; - u32 sdma_engines; + u32 sdma_engines = chip_sdma_engines(dd); - dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS * - sizeof(struct hfi1_pportdata)); - if (IS_ERR(dd)) - goto bail; - sdma_engines = chip_sdma_engines(dd); ppd = dd->pport; for (i = 0; i < dd->num_pports; i++, ppd++) { int vl; @@ -15246,9 +15235,8 @@ bail_cleanup: hfi1_pcie_ddcleanup(dd); bail_free: hfi1_free_devdata(dd); - dd = ERR_PTR(ret); bail: - return dd; + return ret; } static u16 delay_cycles(struct hfi1_pportdata *ppd, u32 desired_egress_rate, diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index f5e88d7f844a..5e4ad14455fb 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1887,10 +1887,8 @@ struct cc_state *get_cc_state_protected(struct hfi1_pportdata *ppd) #define HFI1_CTXT_WAITING_URG 4 /* free up any allocated data at closes */ -struct hfi1_devdata *hfi1_init_dd(struct pci_dev *pdev, - const struct pci_device_id *ent); +int hfi1_init_dd(struct hfi1_devdata *dd); void hfi1_free_devdata(struct hfi1_devdata *dd); -struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra); /* LED beaconing functions */ void hfi1_start_led_override(struct hfi1_pportdata *ppd, unsigned int timeon, @@ -1974,7 +1972,7 @@ void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd); /* Hook for sysfs read of QSFP */ int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len); -int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent); +int hfi1_pcie_init(struct hfi1_devdata *dd); void hfi1_clean_up_interrupts(struct hfi1_devdata *dd); void hfi1_pcie_cleanup(struct pci_dev *pdev); int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev); @@ -2125,19 +2123,6 @@ static inline u64 hfi1_pkt_base_sdma_integrity(struct hfi1_devdata *dd) return base_sdma_integrity; } -/* - * hfi1_early_err is used (only!) 
to print early errors before devdata is - * allocated, or when dd->pcidev may not be valid, and at the tail end of - * cleanup when devdata may have been freed, etc. hfi1_dev_porterr is - * the same as dd_dev_err, but is used when the message really needs - * the IB port# to be definitive as to what's happening.. - */ -#define hfi1_early_err(dev, fmt, ...) \ - dev_err(dev, fmt, ##__VA_ARGS__) - -#define hfi1_early_info(dev, fmt, ...) \ - dev_info(dev, fmt, ##__VA_ARGS__) - #define dd_dev_emerg(dd, fmt, ...) \ dev_emerg(&(dd)->pcidev->dev, "%s: " fmt, \ rvt_get_ibdev_name(&(dd)->verbs_dev.rdi), ##__VA_ARGS__) diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 758d273c32cf..0965ec69f4dd 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -83,6 +83,8 @@ #define HFI1_MIN_EAGER_BUFFER_SIZE (4 * 1024) /* 4KB */ #define HFI1_MAX_EAGER_BUFFER_SIZE (256 * 1024) /* 256KB */ +#define NUM_IB_PORTS 1 + /* * Number of user receive contexts we are configured to use (to allow for more * pio buffers per ctxt, etc.) Zero means use one user context per CPU. @@ -654,9 +656,8 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, ppd->part_enforce |= HFI1_PART_ENFORCE_IN; if (loopback) { - hfi1_early_err(&pdev->dev, - "Faking data partition 0x8001 in idx %u\n", - !default_pkey_idx); + dd_dev_err(dd, "Faking data partition 0x8001 in idx %u\n", + !default_pkey_idx); ppd->pkeys[!default_pkey_idx] = 0x8001; } @@ -702,9 +703,7 @@ void hfi1_init_pportdata(struct pci_dev *pdev, struct hfi1_pportdata *ppd, return; bail: - - hfi1_early_err(&pdev->dev, - "Congestion Control Agent disabled for port %d\n", port); + dd_dev_err(dd, "Congestion Control Agent disabled for port %d\n", port); } /* @@ -1246,15 +1245,19 @@ void hfi1_free_devdata(struct hfi1_devdata *dd) kobject_put(&dd->kobj); } -/* - * Allocate our primary per-unit data structure. Must be done via verbs - * allocator, because the verbs cleanup process both does cleanup and - * free of the data structure. +/** + * hfi1_alloc_devdata - Allocate our primary per-unit data structure. + * @pdev: Valid PCI device + * @extra: How many bytes to alloc past the default + * + * Must be done via verbs allocator, because the verbs cleanup process + * both does cleanup and free of the data structure. * "extra" is for chip-specific data. * * Use the idr mechanism to get a unit number for this unit. 
*/ -struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) +static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, + size_t extra) { unsigned long flags; struct hfi1_devdata *dd; @@ -1287,8 +1290,8 @@ struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, size_t extra) idr_preload_end(); if (ret < 0) { - hfi1_early_err(&pdev->dev, - "Could not allocate unit ID: error %d\n", -ret); + dev_err(&pdev->dev, + "Could not allocate unit ID: error %d\n", -ret); goto bail; } rvt_set_ibdev_name(&dd->verbs_dev.rdi, "%s_%d", class_name(), dd->unit); @@ -1604,23 +1607,23 @@ static void postinit_cleanup(struct hfi1_devdata *dd) hfi1_free_devdata(dd); } -static int init_validate_rcvhdrcnt(struct device *dev, uint thecnt) +static int init_validate_rcvhdrcnt(struct hfi1_devdata *dd, uint thecnt) { if (thecnt <= HFI1_MIN_HDRQ_EGRBUF_CNT) { - hfi1_early_err(dev, "Receive header queue count too small\n"); + dd_dev_err(dd, "Receive header queue count too small\n"); return -EINVAL; } if (thecnt > HFI1_MAX_HDRQ_EGRBUF_CNT) { - hfi1_early_err(dev, - "Receive header queue count cannot be greater than %u\n", - HFI1_MAX_HDRQ_EGRBUF_CNT); + dd_dev_err(dd, + "Receive header queue count cannot be greater than %u\n", + HFI1_MAX_HDRQ_EGRBUF_CNT); return -EINVAL; } if (thecnt % HDRQ_INCREMENT) { - hfi1_early_err(dev, "Receive header queue count %d must be divisible by %lu\n", - thecnt, HDRQ_INCREMENT); + dd_dev_err(dd, "Receive header queue count %d must be divisible by %lu\n", + thecnt, HDRQ_INCREMENT); return -EINVAL; } @@ -1639,22 +1642,29 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* Validate dev ids */ if (!(ent->device == PCI_DEVICE_ID_INTEL0 || ent->device == PCI_DEVICE_ID_INTEL1)) { - hfi1_early_err(&pdev->dev, - "Failing on unknown Intel deviceid 0x%x\n", - ent->device); + dev_err(&pdev->dev, "Failing on unknown Intel deviceid 0x%x\n", + ent->device); ret = -ENODEV; goto bail; } + /* Allocate the dd so we can get to work */ + dd = hfi1_alloc_devdata(pdev, NUM_IB_PORTS * + sizeof(struct hfi1_pportdata)); + if (IS_ERR(dd)) { + ret = PTR_ERR(dd); + goto bail; + } + /* Validate some global module parameters */ - ret = init_validate_rcvhdrcnt(&pdev->dev, rcvhdrcnt); + ret = init_validate_rcvhdrcnt(dd, rcvhdrcnt); if (ret) goto bail; /* use the encoding function as a sanitization check */ if (!encode_rcv_header_entry_size(hfi1_hdrq_entsize)) { - hfi1_early_err(&pdev->dev, "Invalid HdrQ Entry size %u\n", - hfi1_hdrq_entsize); + dd_dev_err(dd, "Invalid HdrQ Entry size %u\n", + hfi1_hdrq_entsize); ret = -EINVAL; goto bail; } @@ -1676,10 +1686,10 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) clamp_val(eager_buffer_size, MIN_EAGER_BUFFER * 8, MAX_EAGER_BUFFER_TOTAL); - hfi1_early_info(&pdev->dev, "Eager buffer size %u\n", - eager_buffer_size); + dd_dev_info(dd, "Eager buffer size %u\n", + eager_buffer_size); } else { - hfi1_early_err(&pdev->dev, "Invalid Eager buffer size of 0\n"); + dd_dev_err(dd, "Invalid Eager buffer size of 0\n"); ret = -EINVAL; goto bail; } @@ -1687,7 +1697,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) /* restrict value of hfi1_rcvarr_split */ hfi1_rcvarr_split = clamp_val(hfi1_rcvarr_split, 0, 100); - ret = hfi1_pcie_init(pdev, ent); + ret = hfi1_pcie_init(dd); if (ret) goto bail; @@ -1695,12 +1705,9 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) * Do device-specific initialization, function table setup, dd * allocation, etc. 
*/ - dd = hfi1_init_dd(pdev, ent); - - if (IS_ERR(dd)) { - ret = PTR_ERR(dd); + ret = hfi1_init_dd(dd); + if (ret) goto clean_bail; /* error already printed */ - } ret = create_workqueues(dd); if (ret) diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index b7473399ec6f..00f83d8a67ce 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -62,13 +62,11 @@ /* * Do all the common PCIe setup and initialization. - * devdata is not yet allocated, and is not allocated until after this - * routine returns success. Therefore dd_dev_err() can't be used for error - * printing. */ -int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) +int hfi1_pcie_init(struct hfi1_devdata *dd) { int ret; + struct pci_dev *pdev = dd->pcidev; ret = pci_enable_device(pdev); if (ret) { @@ -84,15 +82,13 @@ int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) * about that, it appears. If the original BAR was retained * in the kernel data structures, this may be OK. */ - hfi1_early_err(&pdev->dev, "pci enable failed: error %d\n", - -ret); - goto done; + dd_dev_err(dd, "pci enable failed: error %d\n", -ret); + return ret; } ret = pci_request_regions(pdev, DRIVER_NAME); if (ret) { - hfi1_early_err(&pdev->dev, - "pci_request_regions fails: err %d\n", -ret); + dd_dev_err(dd, "pci_request_regions fails: err %d\n", -ret); goto bail; } @@ -105,8 +101,7 @@ int hfi1_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent) */ ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); if (ret) { - hfi1_early_err(&pdev->dev, - "Unable to set DMA mask: %d\n", ret); + dd_dev_err(dd, "Unable to set DMA mask: %d\n", ret); goto bail; } ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); if (ret) { ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); } if (ret) { - hfi1_early_err(&pdev->dev, - "Unable to set DMA consistent mask: %d\n", ret); + dd_dev_err(dd, "Unable to set DMA consistent mask: %d\n", ret); goto bail; } pci_set_master(pdev); (void)pci_enable_pcie_error_reporting(pdev); - goto done; + return 0; bail: hfi1_pcie_cleanup(pdev); -done: return ret; } @@ -201,7 +194,7 @@ int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev) dd_dev_err(dd, "WC mapping of send buffers failed\n"); goto nomem; } - dd_dev_info(dd, "WC piobase: %p\n for %x", dd->piobase, TXE_PIO_SIZE); + dd_dev_info(dd, "WC piobase: %p for %x\n", dd->piobase, TXE_PIO_SIZE); dd->physaddr = addr; /* used for io_remap, etc. */ From 09e71899b9cf5ed7495118b3023db41575013fe2 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Thu, 16 Aug 2018 06:28:40 -0700 Subject: [PATCH 015/242] IB/hfi1: Prepare for new HFI1 MSIx API The current HFI1 MSIx API is difficult to follow, change, or add to. In anticipation of moving to a more flexible API, move the current MSIx functionality to the new msix.c module. Reviewed-by: Mike Marciniszyn Reviewed-by: Sadanand Warrier Signed-off-by: Michael J.
Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/Makefile | 1 + drivers/infiniband/hw/hfi1/chip.c | 280 +---------------------- drivers/infiniband/hw/hfi1/chip.h | 10 + drivers/infiniband/hw/hfi1/hfi.h | 6 +- drivers/infiniband/hw/hfi1/msix.c | 330 ++++++++++++++++++++++++++++ drivers/infiniband/hw/hfi1/msix.h | 64 ++++++ drivers/infiniband/hw/hfi1/pcie.c | 18 -- 7 files changed, 414 insertions(+), 295 deletions(-) create mode 100644 drivers/infiniband/hw/hfi1/msix.c create mode 100644 drivers/infiniband/hw/hfi1/msix.h diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile index 05b680426e37..a8dcf82ab7cb 100644 --- a/drivers/infiniband/hw/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -22,6 +22,7 @@ hfi1-y := \ intr.o \ mad.o \ mmu_rb.o \ + msix.o \ pcie.o \ pio.o \ pio_copy.o \ diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index eab25d54e136..6d81a955ef8e 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -8271,7 +8271,7 @@ static void is_interrupt(struct hfi1_devdata *dd, unsigned int source) * context DATA IRQs are threaded and are not supported by this handler. * */ -static irqreturn_t general_interrupt(int irq, void *data) +irqreturn_t general_interrupt(int irq, void *data) { struct hfi1_devdata *dd = data; u64 regs[CCE_NUM_INT_CSRS]; @@ -8304,7 +8304,7 @@ static irqreturn_t general_interrupt(int irq, void *data) return handled; } -static irqreturn_t sdma_interrupt(int irq, void *data) +irqreturn_t sdma_interrupt(int irq, void *data) { struct sdma_engine *sde = data; struct hfi1_devdata *dd = sde->dd; @@ -8396,7 +8396,7 @@ static inline int check_packet_present(struct hfi1_ctxtdata *rcd) * invoked) is finished. The intent is to avoid extra interrupts while we * are processing packets anyway. */ -static irqreturn_t receive_context_interrupt(int irq, void *data) +irqreturn_t receive_context_interrupt(int irq, void *data) { struct hfi1_ctxtdata *rcd = data; struct hfi1_devdata *dd = rcd->dd; @@ -8436,7 +8436,7 @@ static irqreturn_t receive_context_interrupt(int irq, void *data) * Receive packet thread handler. This expects to be invoked with the * receive interrupt still blocked. */ -static irqreturn_t receive_context_thread(int irq, void *data) +irqreturn_t receive_context_thread(int irq, void *data) { struct hfi1_ctxtdata *rcd = data; int present; @@ -13013,7 +13013,7 @@ void set_intr_state(struct hfi1_devdata *dd, u32 enable) /* * Clear all interrupt sources on the chip. */ -static void clear_all_interrupts(struct hfi1_devdata *dd) +void clear_all_interrupts(struct hfi1_devdata *dd) { int i; @@ -13037,38 +13037,11 @@ static void clear_all_interrupts(struct hfi1_devdata *dd) write_csr(dd, DC_DC8051_ERR_CLR, ~(u64)0); } -/** - * hfi1_clean_up_interrupts() - Free all IRQ resources - * @dd: valid device data data structure - * - * Free the MSIx and assoicated PCI resources, if they have been allocated. 
- */ -void hfi1_clean_up_interrupts(struct hfi1_devdata *dd) -{ - int i; - struct hfi1_msix_entry *me = dd->msix_entries; - - /* remove irqs - must happen before disabling/turning off */ - for (i = 0; i < dd->num_msix_entries; i++, me++) { - if (!me->arg) /* => no irq, no affinity */ - continue; - hfi1_put_irq_affinity(dd, me); - pci_free_irq(dd->pcidev, i, me->arg); - } - - /* clean structures */ - kfree(dd->msix_entries); - dd->msix_entries = NULL; - dd->num_msix_entries = 0; - - pci_free_irq_vectors(dd->pcidev); -} - /* * Remap the interrupt source from the general handler to the given MSI-X * interrupt. */ -static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr) +void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr) { u64 reg; int m, n; @@ -13092,8 +13065,7 @@ static void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr) write_csr(dd, CCE_INT_MAP + (8 * m), reg); } -static void remap_sdma_interrupts(struct hfi1_devdata *dd, - int engine, int msix_intr) +void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr) { /* * SDMA engine interrupt sources grouped by type, rather than @@ -13110,196 +13082,11 @@ static void remap_sdma_interrupts(struct hfi1_devdata *dd, msix_intr); } -static int request_msix_irqs(struct hfi1_devdata *dd) -{ - int first_general, last_general; - int first_sdma, last_sdma; - int first_rx, last_rx; - int i, ret = 0; - - /* calculate the ranges we are going to use */ - first_general = 0; - last_general = first_general + 1; - first_sdma = last_general; - last_sdma = first_sdma + dd->num_sdma; - first_rx = last_sdma; - last_rx = first_rx + dd->n_krcv_queues + dd->num_vnic_contexts; - - /* VNIC MSIx interrupts get mapped when VNIC contexts are created */ - dd->first_dyn_msix_idx = first_rx + dd->n_krcv_queues; - - /* - * Sanity check - the code expects all SDMA chip source - * interrupts to be in the same CSR, starting at bit 0. Verify - * that this is true by checking the bit location of the start. - */ - BUILD_BUG_ON(IS_SDMA_START % 64); - - for (i = 0; i < dd->num_msix_entries; i++) { - struct hfi1_msix_entry *me = &dd->msix_entries[i]; - const char *err_info; - irq_handler_t handler; - irq_handler_t thread = NULL; - void *arg = NULL; - int idx; - struct hfi1_ctxtdata *rcd = NULL; - struct sdma_engine *sde = NULL; - char name[MAX_NAME_SIZE]; - - /* obtain the arguments to pci_request_irq */ - if (first_general <= i && i < last_general) { - idx = i - first_general; - handler = general_interrupt; - arg = dd; - snprintf(name, sizeof(name), - DRIVER_NAME "_%d", dd->unit); - err_info = "general"; - me->type = IRQ_GENERAL; - } else if (first_sdma <= i && i < last_sdma) { - idx = i - first_sdma; - sde = &dd->per_sdma[idx]; - handler = sdma_interrupt; - arg = sde; - snprintf(name, sizeof(name), - DRIVER_NAME "_%d sdma%d", dd->unit, idx); - err_info = "sdma"; - remap_sdma_interrupts(dd, idx, i); - me->type = IRQ_SDMA; - } else if (first_rx <= i && i < last_rx) { - idx = i - first_rx; - rcd = hfi1_rcd_get_by_index_safe(dd, idx); - if (rcd) { - /* - * Set the interrupt register and mask for this - * context's interrupt. 
- */ - rcd->ireg = (IS_RCVAVAIL_START + idx) / 64; - rcd->imask = ((u64)1) << - ((IS_RCVAVAIL_START + idx) % 64); - handler = receive_context_interrupt; - thread = receive_context_thread; - arg = rcd; - snprintf(name, sizeof(name), - DRIVER_NAME "_%d kctxt%d", - dd->unit, idx); - err_info = "receive context"; - remap_intr(dd, IS_RCVAVAIL_START + idx, i); - me->type = IRQ_RCVCTXT; - rcd->msix_intr = i; - hfi1_rcd_put(rcd); - } - } else { - /* not in our expected range - complain, then - * ignore it - */ - dd_dev_err(dd, - "Unexpected extra MSI-X interrupt %d\n", i); - continue; - } - /* no argument, no interrupt */ - if (!arg) - continue; - /* make sure the name is terminated */ - name[sizeof(name) - 1] = 0; - me->irq = pci_irq_vector(dd->pcidev, i); - ret = pci_request_irq(dd->pcidev, i, handler, thread, arg, - name); - if (ret) { - dd_dev_err(dd, - "unable to allocate %s interrupt, irq %d, index %d, err %d\n", - err_info, me->irq, idx, ret); - return ret; - } - /* - * assign arg after pci_request_irq call, so it will be - * cleaned up - */ - me->arg = arg; - - ret = hfi1_get_irq_affinity(dd, me); - if (ret) - dd_dev_err(dd, "unable to pin IRQ %d\n", ret); - } - - return ret; -} - -void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd) -{ - int i; - - for (i = 0; i < dd->vnic.num_ctxt; i++) { - struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i]; - struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr]; - - synchronize_irq(me->irq); - } -} - -void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd) -{ - struct hfi1_devdata *dd = rcd->dd; - struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr]; - - if (!me->arg) /* => no irq, no affinity */ - return; - - hfi1_put_irq_affinity(dd, me); - pci_free_irq(dd->pcidev, rcd->msix_intr, me->arg); - - me->arg = NULL; -} - -void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd) -{ - struct hfi1_devdata *dd = rcd->dd; - struct hfi1_msix_entry *me; - int idx = rcd->ctxt; - void *arg = rcd; - int ret; - - rcd->msix_intr = dd->vnic.msix_idx++; - me = &dd->msix_entries[rcd->msix_intr]; - - /* - * Set the interrupt register and mask for this - * context's interrupt. - */ - rcd->ireg = (IS_RCVAVAIL_START + idx) / 64; - rcd->imask = ((u64)1) << - ((IS_RCVAVAIL_START + idx) % 64); - me->type = IRQ_RCVCTXT; - me->irq = pci_irq_vector(dd->pcidev, rcd->msix_intr); - remap_intr(dd, IS_RCVAVAIL_START + idx, rcd->msix_intr); - - ret = pci_request_irq(dd->pcidev, rcd->msix_intr, - receive_context_interrupt, - receive_context_thread, arg, - DRIVER_NAME "_%d kctxt%d", dd->unit, idx); - if (ret) { - dd_dev_err(dd, "vnic irq request (irq %d, idx %d) fail %d\n", - me->irq, idx, ret); - return; - } - /* - * assign arg after pci_request_irq call, so it will be - * cleaned up - */ - me->arg = arg; - - ret = hfi1_get_irq_affinity(dd, me); - if (ret) { - dd_dev_err(dd, - "unable to pin IRQ %d\n", ret); - pci_free_irq(dd->pcidev, rcd->msix_intr, me->arg); - } -} - /* * Set the general handler to accept all interrupts, remap all * chip interrupts back to MSI-X 0. 
*/ -static void reset_interrupts(struct hfi1_devdata *dd) +void reset_interrupts(struct hfi1_devdata *dd) { int i; @@ -13312,57 +13099,6 @@ static void reset_interrupts(struct hfi1_devdata *dd) write_csr(dd, CCE_INT_MAP + (8 * i), 0); } -static int set_up_interrupts(struct hfi1_devdata *dd) -{ - u32 total; - int ret, request; - - /* - * Interrupt count: - * 1 general, "slow path" interrupt (includes the SDMA engines - * slow source, SDMACleanupDone) - * N interrupts - one per used SDMA engine - * M interrupt - one per kernel receive context - * V interrupt - one for each VNIC context - */ - total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_vnic_contexts; - - /* ask for MSI-X interrupts */ - request = request_msix(dd, total); - if (request < 0) { - ret = request; - goto fail; - } else { - dd->msix_entries = kcalloc(total, sizeof(*dd->msix_entries), - GFP_KERNEL); - if (!dd->msix_entries) { - ret = -ENOMEM; - goto fail; - } - /* using MSI-X */ - dd->num_msix_entries = total; - dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total); - } - - /* mask all interrupts */ - set_intr_state(dd, 0); - /* clear all pending interrupts */ - clear_all_interrupts(dd); - - /* reset general handler mask, chip MSI-X mappings */ - reset_interrupts(dd); - - ret = request_msix_irqs(dd); - if (ret) - goto fail; - - return 0; - -fail: - hfi1_clean_up_interrupts(dd); - return ret; -} - /* * Set up context values in dd. Sets: * diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index 36b04d6300e5..3cc8fc982d24 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -1416,6 +1416,16 @@ void hfi1_read_link_quality(struct hfi1_devdata *dd, u8 *link_quality); void hfi1_init_vnic_rsm(struct hfi1_devdata *dd); void hfi1_deinit_vnic_rsm(struct hfi1_devdata *dd); +irqreturn_t general_interrupt(int irq, void *data); +irqreturn_t sdma_interrupt(int irq, void *data); +irqreturn_t receive_context_interrupt(int irq, void *data); +irqreturn_t receive_context_thread(int irq, void *data); + +void clear_all_interrupts(struct hfi1_devdata *dd); +void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr); +void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr); +void reset_interrupts(struct hfi1_devdata *dd); + /* * Interrupt source table. 
* diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 5e4ad14455fb..6c5f2c2b7e7e 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -80,6 +80,7 @@ #include "qsfp.h" #include "platform.h" #include "affinity.h" +#include "msix.h" /* bumped 1 from s/w major version of TrueScale */ #define HFI1_CHIP_VERS_MAJ 3U @@ -1431,9 +1432,6 @@ int handle_receive_interrupt(struct hfi1_ctxtdata *rcd, int thread); int handle_receive_interrupt_nodma_rtail(struct hfi1_ctxtdata *rcd, int thread); int handle_receive_interrupt_dma_rtail(struct hfi1_ctxtdata *rcd, int thread); void set_all_slowpath(struct hfi1_devdata *dd); -void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd); -void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd); -void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd); extern const struct pci_device_id hfi1_pci_tbl[]; void hfi1_make_ud_req_9B(struct rvt_qp *qp, @@ -1973,12 +1971,10 @@ void hfi1_verbs_unregister_sysfs(struct hfi1_devdata *dd); int qsfp_dump(struct hfi1_pportdata *ppd, char *buf, int len); int hfi1_pcie_init(struct hfi1_devdata *dd); -void hfi1_clean_up_interrupts(struct hfi1_devdata *dd); void hfi1_pcie_cleanup(struct pci_dev *pdev); int hfi1_pcie_ddinit(struct hfi1_devdata *dd, struct pci_dev *pdev); void hfi1_pcie_ddcleanup(struct hfi1_devdata *); int pcie_speeds(struct hfi1_devdata *dd); -int request_msix(struct hfi1_devdata *dd, u32 msireq); int restore_pci_variables(struct hfi1_devdata *dd); int save_pci_variables(struct hfi1_devdata *dd); int do_pcie_gen3_transition(struct hfi1_devdata *dd); diff --git a/drivers/infiniband/hw/hfi1/msix.c b/drivers/infiniband/hw/hfi1/msix.c new file mode 100644 index 000000000000..bea6e657ae17 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/msix.c @@ -0,0 +1,330 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2018 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "hfi.h" +#include "sdma.h" + +/* + * Returns: + * - actual number of interrupts allocated or + * - error + */ +int request_msix(struct hfi1_devdata *dd, u32 msireq) +{ + int nvec; + + nvec = pci_alloc_irq_vectors(dd->pcidev, msireq, msireq, PCI_IRQ_MSIX); + if (nvec < 0) { + dd_dev_err(dd, "pci_alloc_irq_vectors() failed: %d\n", nvec); + return nvec; + } + + return nvec; +} + +int set_up_interrupts(struct hfi1_devdata *dd) +{ + u32 total; + int ret, request; + + /* + * Interrupt count: + * 1 general, "slow path" interrupt (includes the SDMA engines + * slow source, SDMACleanupDone) + * N interrupts - one per used SDMA engine + * M interrupt - one per kernel receive context + * V interrupt - one for each VNIC context + */ + total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_vnic_contexts; + + /* ask for MSI-X interrupts */ + request = request_msix(dd, total); + if (request < 0) { + ret = request; + goto fail; + } else { + dd->msix_entries = kcalloc(total, sizeof(*dd->msix_entries), + GFP_KERNEL); + if (!dd->msix_entries) { + ret = -ENOMEM; + goto fail; + } + /* using MSI-X */ + dd->num_msix_entries = total; + dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total); + } + + /* mask all interrupts */ set_intr_state(dd, 0); + /* clear all pending interrupts */ + clear_all_interrupts(dd); + + /* reset general handler mask, chip MSI-X mappings */ + reset_interrupts(dd); + + ret = request_msix_irqs(dd); + if (ret) + goto fail; + + return 0; + +fail: + hfi1_clean_up_interrupts(dd); + return ret; +} + +int request_msix_irqs(struct hfi1_devdata *dd) +{ + int first_general, last_general; + int first_sdma, last_sdma; + int first_rx, last_rx; + int i, ret = 0; + + /* calculate the ranges we are going to use */ + first_general = 0; + last_general = first_general + 1; + first_sdma = last_general; + last_sdma = first_sdma + dd->num_sdma; + first_rx = last_sdma; + last_rx = first_rx + dd->n_krcv_queues + dd->num_vnic_contexts; + + /* VNIC MSIx interrupts get mapped when VNIC contexts are created */ + dd->first_dyn_msix_idx = first_rx + dd->n_krcv_queues; + + /* + * Sanity check - the code expects all SDMA chip source + * interrupts to be in the same CSR, starting at bit 0. Verify + * that this is true by checking the bit location of the start. 
+ */ + BUILD_BUG_ON(IS_SDMA_START % 64); + + for (i = 0; i < dd->num_msix_entries; i++) { + struct hfi1_msix_entry *me = &dd->msix_entries[i]; + const char *err_info; + irq_handler_t handler; + irq_handler_t thread = NULL; + void *arg = NULL; + int idx; + struct hfi1_ctxtdata *rcd = NULL; + struct sdma_engine *sde = NULL; + char name[MAX_NAME_SIZE]; + + /* obtain the arguments to pci_request_irq */ + if (first_general <= i && i < last_general) { + idx = i - first_general; + handler = general_interrupt; + arg = dd; + snprintf(name, sizeof(name), + DRIVER_NAME "_%d", dd->unit); + err_info = "general"; + me->type = IRQ_GENERAL; + } else if (first_sdma <= i && i < last_sdma) { + idx = i - first_sdma; + sde = &dd->per_sdma[idx]; + handler = sdma_interrupt; + arg = sde; + snprintf(name, sizeof(name), + DRIVER_NAME "_%d sdma%d", dd->unit, idx); + err_info = "sdma"; + remap_sdma_interrupts(dd, idx, i); + me->type = IRQ_SDMA; + } else if (first_rx <= i && i < last_rx) { + idx = i - first_rx; + rcd = hfi1_rcd_get_by_index_safe(dd, idx); + if (rcd) { + /* + * Set the interrupt register and mask for this + * context's interrupt. + */ + rcd->ireg = (IS_RCVAVAIL_START + idx) / 64; + rcd->imask = ((u64)1) << + ((IS_RCVAVAIL_START + idx) % 64); + handler = receive_context_interrupt; + thread = receive_context_thread; + arg = rcd; + snprintf(name, sizeof(name), + DRIVER_NAME "_%d kctxt%d", + dd->unit, idx); + err_info = "receive context"; + remap_intr(dd, IS_RCVAVAIL_START + idx, i); + me->type = IRQ_RCVCTXT; + rcd->msix_intr = i; + hfi1_rcd_put(rcd); + } + } else { + /* not in our expected range - complain, then + * ignore it + */ + dd_dev_err(dd, + "Unexpected extra MSI-X interrupt %d\n", i); + continue; + } + /* no argument, no interrupt */ + if (!arg) + continue; + /* make sure the name is terminated */ + name[sizeof(name) - 1] = 0; + me->irq = pci_irq_vector(dd->pcidev, i); + ret = pci_request_irq(dd->pcidev, i, handler, thread, arg, + name); + if (ret) { + dd_dev_err(dd, + "unable to allocate %s interrupt, irq %d, index %d, err %d\n", + err_info, me->irq, idx, ret); + return ret; + } + /* + * assign arg after pci_request_irq call, so it will be + * cleaned up + */ + me->arg = arg; + + ret = hfi1_get_irq_affinity(dd, me); + if (ret) + dd_dev_err(dd, "unable to pin IRQ %d\n", ret); + } + + return ret; +} + +void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd) +{ + int i; + + for (i = 0; i < dd->vnic.num_ctxt; i++) { + struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i]; + struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr]; + + synchronize_irq(me->irq); + } +} + +void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr]; + + if (!me->arg) /* => no irq, no affinity */ + return; + + hfi1_put_irq_affinity(dd, me); + pci_free_irq(dd->pcidev, rcd->msix_intr, me->arg); + + me->arg = NULL; +} + +void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd) +{ + struct hfi1_devdata *dd = rcd->dd; + struct hfi1_msix_entry *me; + int idx = rcd->ctxt; + void *arg = rcd; + int ret; + + rcd->msix_intr = dd->vnic.msix_idx++; + me = &dd->msix_entries[rcd->msix_intr]; + + /* + * Set the interrupt register and mask for this + * context's interrupt. 
+ */ + rcd->ireg = (IS_RCVAVAIL_START + idx) / 64; + rcd->imask = ((u64)1) << + ((IS_RCVAVAIL_START + idx) % 64); + me->type = IRQ_RCVCTXT; + me->irq = pci_irq_vector(dd->pcidev, rcd->msix_intr); + remap_intr(dd, IS_RCVAVAIL_START + idx, rcd->msix_intr); + + ret = pci_request_irq(dd->pcidev, rcd->msix_intr, + receive_context_interrupt, + receive_context_thread, arg, + DRIVER_NAME "_%d kctxt%d", dd->unit, idx); + if (ret) { + dd_dev_err(dd, "vnic irq request (irq %d, idx %d) fail %d\n", + me->irq, idx, ret); + return; + } + /* + * assign arg after pci_request_irq call, so it will be + * cleaned up + */ + me->arg = arg; + + ret = hfi1_get_irq_affinity(dd, me); + if (ret) { + dd_dev_err(dd, + "unable to pin IRQ %d\n", ret); + pci_free_irq(dd->pcidev, rcd->msix_intr, me->arg); + } +} + +/** + * hfi1_clean_up_interrupts() - Free all IRQ resources + * @dd: valid device data data structure + * + * Free the MSIx and associated PCI resources, if they have been allocated. + */ +void hfi1_clean_up_interrupts(struct hfi1_devdata *dd) +{ + int i; + struct hfi1_msix_entry *me = dd->msix_entries; + + /* remove irqs - must happen before disabling/turning off */ + for (i = 0; i < dd->num_msix_entries; i++, me++) { + if (!me->arg) /* => no irq, no affinity */ + continue; + hfi1_put_irq_affinity(dd, me); + pci_free_irq(dd->pcidev, i, me->arg); + } + + /* clean structures */ + kfree(dd->msix_entries); + dd->msix_entries = NULL; + dd->num_msix_entries = 0; + + pci_free_irq_vectors(dd->pcidev); +} diff --git a/drivers/infiniband/hw/hfi1/msix.h b/drivers/infiniband/hw/hfi1/msix.h new file mode 100644 index 000000000000..45cadd56fef9 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/msix.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ +#ifndef _HFI1_MSIX_H +#define _HFI1_MSIX_H + +#include "hfi.h" + +/* MSIx interface */ +int request_msix(struct hfi1_devdata *dd, u32 msireq); +int set_up_interrupts(struct hfi1_devdata *dd); +int request_msix_irqs(struct hfi1_devdata *dd); +void hfi1_clean_up_interrupts(struct hfi1_devdata *dd); + +/* VNIC interface */ +void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd); +void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd); +void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd); + +#endif diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index 00f83d8a67ce..577d7479a845 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -332,24 +332,6 @@ int pcie_speeds(struct hfi1_devdata *dd) return 0; } -/* - * Returns: - * - actual number of interrupts allocated or - * - error - */ -int request_msix(struct hfi1_devdata *dd, u32 msireq) -{ - int nvec; - - nvec = pci_alloc_irq_vectors(dd->pcidev, msireq, msireq, PCI_IRQ_MSIX); - if (nvec < 0) { - dd_dev_err(dd, "pci_alloc_irq_vectors() failed: %d\n", nvec); - return nvec; - } - - return nvec; -} - /* restore command and BARs after a reset has wiped them out */ int restore_pci_variables(struct hfi1_devdata *dd) { From 6eb4eb10fb0d14340956c05281b7e09d80902788 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Wed, 15 Aug 2018 23:04:04 -0700 Subject: [PATCH 016/242] IB/hfi1: Make the MSIx resource allocation a bit more flexible The current method of allocating MSIx resources is a bit cumbersome, and not very easily added to. Refactor and re-order the code paths into a more consistent interface. Update the interface so that allocations are not order dependent. Reviewed-by: Mike Marciniszyn Reviewed-by: Sadanand Warrier Signed-off-by: Michael J. 
Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/affinity.c | 4 +- drivers/infiniband/hw/hfi1/chip.c | 31 +- drivers/infiniband/hw/hfi1/hfi.h | 13 +- drivers/infiniband/hw/hfi1/init.c | 4 +- drivers/infiniband/hw/hfi1/msix.c | 477 +++++++++++++------------ drivers/infiniband/hw/hfi1/msix.h | 14 +- drivers/infiniband/hw/hfi1/sdma.h | 1 + drivers/infiniband/hw/hfi1/vnic_main.c | 12 +- 8 files changed, 304 insertions(+), 252 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/affinity.c b/drivers/infiniband/hw/hfi1/affinity.c index bedd5fba33b0..2baf38cc1e23 100644 --- a/drivers/infiniband/hw/hfi1/affinity.c +++ b/drivers/infiniband/hw/hfi1/affinity.c @@ -817,10 +817,10 @@ static void hfi1_update_sdma_affinity(struct hfi1_msix_entry *msix, int cpu) set = &entry->def_intr; cpumask_set_cpu(cpu, &set->mask); cpumask_set_cpu(cpu, &set->used); - for (i = 0; i < dd->num_msix_entries; i++) { + for (i = 0; i < dd->msix_info.max_requested; i++) { struct hfi1_msix_entry *other_msix; - other_msix = &dd->msix_entries[i]; + other_msix = &dd->msix_info.msix_entries[i]; if (other_msix->type != IRQ_SDMA || other_msix == msix) continue; diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 6d81a955ef8e..8acbf8b0304f 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -13099,6 +13099,35 @@ void reset_interrupts(struct hfi1_devdata *dd) write_csr(dd, CCE_INT_MAP + (8 * i), 0); } +/** + * set_up_interrupts() - Initialize the IRQ resources and state + * @dd: valid devdata + * + */ +static int set_up_interrupts(struct hfi1_devdata *dd) +{ + int ret; + + /* mask all interrupts */ + set_intr_state(dd, 0); + /* clear all pending interrupts */ + clear_all_interrupts(dd); + + /* reset general handler mask, chip MSI-X mappings */ + reset_interrupts(dd); + + /* ask for MSI-X interrupts */ + ret = msix_initialize(dd); + if (ret) + return ret; + + ret = msix_request_irqs(dd); + if (ret) + msix_clean_up_interrupts(dd); + + return ret; +} + /* * Set up context values in dd. 
Sets: * @@ -14966,7 +14995,7 @@ bail_free_cntrs: free_cntrs(dd); bail_clear_intr: hfi1_comp_vectors_clean_up(dd); - hfi1_clean_up_interrupts(dd); + msix_clean_up_interrupts(dd); bail_cleanup: hfi1_pcie_ddcleanup(dd); bail_free: diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 6c5f2c2b7e7e..e2a5c2dac109 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -668,6 +668,14 @@ struct hfi1_msix_entry { struct irq_affinity_notify notify; }; +struct hfi1_msix_info { + /* lock to synchronize in_use_msix access */ + spinlock_t msix_lock; + DECLARE_BITMAP(in_use_msix, CCE_NUM_MSIX_VECTORS); + struct hfi1_msix_entry *msix_entries; + u16 max_requested; +}; + /* per-SL CCA information */ struct cca_timer { struct hrtimer hrtimer; @@ -993,7 +1001,6 @@ struct hfi1_vnic_data { struct idr vesw_idr; u8 rmt_start; u8 num_ctxt; - u32 msix_idx; }; struct hfi1_vnic_vport_info; @@ -1207,9 +1214,7 @@ struct hfi1_devdata { struct diag_client *diag_client; /* MSI-X information */ - struct hfi1_msix_entry *msix_entries; - u32 num_msix_entries; - u32 first_dyn_msix_idx; + struct hfi1_msix_info msix_info; /* general interrupt: mask of handled interrupts */ u64 gi_mask[CCE_NUM_INT_CSRS]; diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 0965ec69f4dd..3ab7e2316523 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1052,7 +1052,7 @@ static void shutdown_device(struct hfi1_devdata *dd) /* mask and clean up interrupts, but not errors */ set_intr_state(dd, 0); - hfi1_clean_up_interrupts(dd); + msix_clean_up_interrupts(dd); for (pidx = 0; pidx < dd->num_pports; ++pidx) { ppd = dd->pport + pidx; @@ -1738,7 +1738,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) dd_dev_err(dd, "Failed to create /dev devices: %d\n", -j); if (initfail || ret) { - hfi1_clean_up_interrupts(dd); + msix_clean_up_interrupts(dd); stop_timers(dd); flush_workqueue(ib_wq); for (pidx = 0; pidx < dd->num_pports; ++pidx) { diff --git a/drivers/infiniband/hw/hfi1/msix.c b/drivers/infiniband/hw/hfi1/msix.c index bea6e657ae17..72cd0269cb2e 100644 --- a/drivers/infiniband/hw/hfi1/msix.c +++ b/drivers/infiniband/hw/hfi1/msix.c @@ -47,284 +47,301 @@ */ #include "hfi.h" +#include "affinity.h" #include "sdma.h" -/* - * Returns: - * - actual number of interrupts allocated or - * - error +/** + * msix_initialize() - Calculate, request and configure MSIx IRQs + * @dd: valid hfi1 devdata + * */ -int request_msix(struct hfi1_devdata *dd, u32 msireq) -{ - int nvec; - - nvec = pci_alloc_irq_vectors(dd->pcidev, msireq, msireq, PCI_IRQ_MSIX); - if (nvec < 0) { - dd_dev_err(dd, "pci_alloc_irq_vectors() failed: %d\n", nvec); - return nvec; - } - - return nvec; -} - -int set_up_interrupts(struct hfi1_devdata *dd) +int msix_initialize(struct hfi1_devdata *dd) { u32 total; - int ret, request; + int ret; + struct hfi1_msix_entry *entries; /* - * Interrupt count: - * 1 general, "slow path" interrupt (includes the SDMA engines - * slow source, SDMACleanupDone) - * N interrupts - one per used SDMA engine - * M interrupt - one per kernel receive context - * V interrupt - one for each VNIC context + * MSIx interrupt count: + * one for the general, "slow path" interrupt + * one per used SDMA engine + * one per kernel receive context + * one for each VNIC context + * ...any new IRQs should be added here. 
 	 */
 	total = 1 + dd->num_sdma + dd->n_krcv_queues + dd->num_vnic_contexts;
 
-	/* ask for MSI-X interrupts */
-	request = request_msix(dd, total);
-	if (request < 0) {
-		ret = request;
-		goto fail;
-	} else {
-		dd->msix_entries = kcalloc(total, sizeof(*dd->msix_entries),
-					   GFP_KERNEL);
-		if (!dd->msix_entries) {
-			ret = -ENOMEM;
-			goto fail;
-		}
-		/* using MSI-X */
-		dd->num_msix_entries = total;
-		dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total);
+	if (total >= CCE_NUM_MSIX_VECTORS)
+		return -EINVAL;
+
+	ret = pci_alloc_irq_vectors(dd->pcidev, total, total, PCI_IRQ_MSIX);
+	if (ret < 0) {
+		dd_dev_err(dd, "pci_alloc_irq_vectors() failed: %d\n", ret);
+		return ret;
 	}
 
-	/* mask all interrupts */
-	set_intr_state(dd, 0);
-	/* clear all pending interrupts */
-	clear_all_interrupts(dd);
+	entries = kcalloc(total, sizeof(*dd->msix_info.msix_entries),
+			  GFP_KERNEL);
+	if (!entries) {
+		pci_free_irq_vectors(dd->pcidev);
+		return -ENOMEM;
+	}
 
-	/* reset general handler mask, chip MSI-X mappings */
-	reset_interrupts(dd);
-
-	ret = request_msix_irqs(dd);
-	if (ret)
-		goto fail;
+	dd->msix_info.msix_entries = entries;
+	spin_lock_init(&dd->msix_info.msix_lock);
+	bitmap_zero(dd->msix_info.in_use_msix, total);
+	dd->msix_info.max_requested = total;
+	dd_dev_info(dd, "%u MSI-X interrupts allocated\n", total);
 
 	return 0;
-
-fail:
-	hfi1_clean_up_interrupts(dd);
-	return ret;
 }
 
-int request_msix_irqs(struct hfi1_devdata *dd)
+/**
+ * msix_request_irq() - Allocate a free MSIx IRQ
+ * @dd: valid devdata
+ * @arg: context information for the IRQ
+ * @handler: IRQ handler
+ * @thread: IRQ thread handler (could be NULL)
+ * @idx: zero-based idx if multiple devices are needed
+ * @type: affinity IRQ type
+ *
+ * Allocates an MSIx vector if available, and then creates the appropriate
+ * metadata needed to keep track of the pci IRQ request.
+ * + * Return: + * < 0 Error + * >= 0 MSIx vector + * + */ +static int msix_request_irq(struct hfi1_devdata *dd, void *arg, + irq_handler_t handler, irq_handler_t thread, + u32 idx, enum irq_type type) { - int first_general, last_general; - int first_sdma, last_sdma; - int first_rx, last_rx; - int i, ret = 0; + unsigned long nr; + int irq; + int ret; + const char *err_info; + char name[MAX_NAME_SIZE]; + struct hfi1_msix_entry *me; - /* calculate the ranges we are going to use */ - first_general = 0; - last_general = first_general + 1; - first_sdma = last_general; - last_sdma = first_sdma + dd->num_sdma; - first_rx = last_sdma; - last_rx = first_rx + dd->n_krcv_queues + dd->num_vnic_contexts; + /* Allocate an MSIx vector */ + spin_lock(&dd->msix_info.msix_lock); + nr = find_first_zero_bit(dd->msix_info.in_use_msix, + dd->msix_info.max_requested); + if (nr < dd->msix_info.max_requested) + __set_bit(nr, dd->msix_info.in_use_msix); + spin_unlock(&dd->msix_info.msix_lock); - /* VNIC MSIx interrupts get mapped when VNIC contexts are created */ - dd->first_dyn_msix_idx = first_rx + dd->n_krcv_queues; + if (nr == dd->msix_info.max_requested) + return -ENOSPC; + + /* Specific verification and determine the name */ + switch (type) { + case IRQ_GENERAL: + /* general interrupt must be MSIx vector 0 */ + if (nr) { + spin_lock(&dd->msix_info.msix_lock); + __clear_bit(nr, dd->msix_info.in_use_msix); + spin_unlock(&dd->msix_info.msix_lock); + dd_dev_err(dd, "Invalid index %lu for GENERAL IRQ\n", + nr); + return -EINVAL; + } + snprintf(name, sizeof(name), DRIVER_NAME "_%d", dd->unit); + err_info = "general"; + break; + case IRQ_SDMA: + snprintf(name, sizeof(name), DRIVER_NAME "_%d sdma%d", + dd->unit, idx); + err_info = "sdma"; + break; + case IRQ_RCVCTXT: + snprintf(name, sizeof(name), DRIVER_NAME "_%d kctxt%d", + dd->unit, idx); + err_info = "receive context"; + break; + case IRQ_OTHER: + default: + return -EINVAL; + } + name[sizeof(name) - 1] = 0; + + irq = pci_irq_vector(dd->pcidev, nr); + ret = pci_request_irq(dd->pcidev, nr, handler, thread, arg, name); + if (ret) { + dd_dev_err(dd, + "%s: request for IRQ %d failed, MSIx %d, err %d\n", + err_info, irq, idx, ret); + spin_lock(&dd->msix_info.msix_lock); + __clear_bit(nr, dd->msix_info.in_use_msix); + spin_unlock(&dd->msix_info.msix_lock); + return ret; + } /* - * Sanity check - the code expects all SDMA chip source - * interrupts to be in the same CSR, starting at bit 0. Verify - * that this is true by checking the bit location of the start. 
+ * assign arg after pci_request_irq call, so it will be + * cleaned up */ - BUILD_BUG_ON(IS_SDMA_START % 64); + me = &dd->msix_info.msix_entries[nr]; + me->irq = irq; + me->arg = arg; + me->type = type; - for (i = 0; i < dd->num_msix_entries; i++) { - struct hfi1_msix_entry *me = &dd->msix_entries[i]; - const char *err_info; - irq_handler_t handler; - irq_handler_t thread = NULL; - void *arg = NULL; - int idx; - struct hfi1_ctxtdata *rcd = NULL; - struct sdma_engine *sde = NULL; - char name[MAX_NAME_SIZE]; + /* This is a request, so a failure is not fatal */ + ret = hfi1_get_irq_affinity(dd, me); + if (ret) + dd_dev_err(dd, "unable to pin IRQ %d\n", ret); - /* obtain the arguments to pci_request_irq */ - if (first_general <= i && i < last_general) { - idx = i - first_general; - handler = general_interrupt; - arg = dd; - snprintf(name, sizeof(name), - DRIVER_NAME "_%d", dd->unit); - err_info = "general"; - me->type = IRQ_GENERAL; - } else if (first_sdma <= i && i < last_sdma) { - idx = i - first_sdma; - sde = &dd->per_sdma[idx]; - handler = sdma_interrupt; - arg = sde; - snprintf(name, sizeof(name), - DRIVER_NAME "_%d sdma%d", dd->unit, idx); - err_info = "sdma"; - remap_sdma_interrupts(dd, idx, i); - me->type = IRQ_SDMA; - } else if (first_rx <= i && i < last_rx) { - idx = i - first_rx; - rcd = hfi1_rcd_get_by_index_safe(dd, idx); - if (rcd) { - /* - * Set the interrupt register and mask for this - * context's interrupt. - */ - rcd->ireg = (IS_RCVAVAIL_START + idx) / 64; - rcd->imask = ((u64)1) << - ((IS_RCVAVAIL_START + idx) % 64); - handler = receive_context_interrupt; - thread = receive_context_thread; - arg = rcd; - snprintf(name, sizeof(name), - DRIVER_NAME "_%d kctxt%d", - dd->unit, idx); - err_info = "receive context"; - remap_intr(dd, IS_RCVAVAIL_START + idx, i); - me->type = IRQ_RCVCTXT; - rcd->msix_intr = i; - hfi1_rcd_put(rcd); - } - } else { - /* not in our expected range - complain, then - * ignore it - */ - dd_dev_err(dd, - "Unexpected extra MSI-X interrupt %d\n", i); - continue; - } - /* no argument, no interrupt */ - if (!arg) - continue; - /* make sure the name is terminated */ - name[sizeof(name) - 1] = 0; - me->irq = pci_irq_vector(dd->pcidev, i); - ret = pci_request_irq(dd->pcidev, i, handler, thread, arg, - name); - if (ret) { - dd_dev_err(dd, - "unable to allocate %s interrupt, irq %d, index %d, err %d\n", - err_info, me->irq, idx, ret); - return ret; - } - /* - * assign arg after pci_request_irq call, so it will be - * cleaned up - */ - me->arg = arg; - - ret = hfi1_get_irq_affinity(dd, me); - if (ret) - dd_dev_err(dd, "unable to pin IRQ %d\n", ret); - } - - return ret; + return nr; } -void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd) +/** + * msix_request_rcd_irq() - Helper function for RCVAVAIL IRQs + * @rcd: valid rcd context + * + */ +int msix_request_rcd_irq(struct hfi1_ctxtdata *rcd) { - int i; + int nr; - for (i = 0; i < dd->vnic.num_ctxt; i++) { - struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i]; - struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr]; - - synchronize_irq(me->irq); - } -} - -void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd) -{ - struct hfi1_devdata *dd = rcd->dd; - struct hfi1_msix_entry *me = &dd->msix_entries[rcd->msix_intr]; - - if (!me->arg) /* => no irq, no affinity */ - return; - - hfi1_put_irq_affinity(dd, me); - pci_free_irq(dd->pcidev, rcd->msix_intr, me->arg); - - me->arg = NULL; -} - -void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd) -{ - struct hfi1_devdata *dd = rcd->dd; - struct hfi1_msix_entry *me; - 
int idx = rcd->ctxt;
-	void *arg = rcd;
-	int ret;
-
-	rcd->msix_intr = dd->vnic.msix_idx++;
-	me = &dd->msix_entries[rcd->msix_intr];
+	nr = msix_request_irq(rcd->dd, rcd, receive_context_interrupt,
+			      receive_context_thread, rcd->ctxt, IRQ_RCVCTXT);
+	if (nr < 0)
+		return nr;
 
 	/*
 	 * Set the interrupt register and mask for this
 	 * context's interrupt.
 	 */
-	rcd->ireg = (IS_RCVAVAIL_START + idx) / 64;
-	rcd->imask = ((u64)1) <<
-		((IS_RCVAVAIL_START + idx) % 64);
-	me->type = IRQ_RCVCTXT;
-	me->irq = pci_irq_vector(dd->pcidev, rcd->msix_intr);
-	remap_intr(dd, IS_RCVAVAIL_START + idx, rcd->msix_intr);
+	rcd->ireg = (IS_RCVAVAIL_START + rcd->ctxt) / 64;
+	rcd->imask = ((u64)1) << ((IS_RCVAVAIL_START + rcd->ctxt) % 64);
+	rcd->msix_intr = nr;
+	remap_intr(rcd->dd, IS_RCVAVAIL_START + rcd->ctxt, nr);
 
-	ret = pci_request_irq(dd->pcidev, rcd->msix_intr,
-			      receive_context_interrupt,
-			      receive_context_thread, arg,
-			      DRIVER_NAME "_%d kctxt%d", dd->unit, idx);
-	if (ret) {
-		dd_dev_err(dd, "vnic irq request (irq %d, idx %d) fail %d\n",
-			   me->irq, idx, ret);
-		return;
-	}
-	/*
-	 * assign arg after pci_request_irq call, so it will be
-	 * cleaned up
-	 */
-	me->arg = arg;
-
-	ret = hfi1_get_irq_affinity(dd, me);
-	if (ret) {
-		dd_dev_err(dd,
-			   "unable to pin IRQ %d\n", ret);
-		pci_free_irq(dd->pcidev, rcd->msix_intr, me->arg);
-	}
+	return 0;
 }
 
 /**
- * hfi1_clean_up_interrupts() - Free all IRQ resources
+ * msix_request_sdma_irq() - Helper for getting SDMA IRQ resources
+ * @sde: valid sdma engine
+ *
+ */
+int msix_request_sdma_irq(struct sdma_engine *sde)
+{
+	int nr;
+
+	nr = msix_request_irq(sde->dd, sde, sdma_interrupt, NULL,
+			      sde->this_idx, IRQ_SDMA);
+	if (nr < 0)
+		return nr;
+	sde->msix_intr = nr;
+	remap_sdma_interrupts(sde->dd, sde->this_idx, nr);
+
+	return 0;
+}
+
+/**
+ * msix_request_irqs() - Allocate all MSIx IRQs
+ * @dd: valid devdata structure
+ *
+ * Helper function to request the used MSIx IRQs.
+ *
+ */
+int msix_request_irqs(struct hfi1_devdata *dd)
+{
+	int i;
+	int ret;
+
+	ret = msix_request_irq(dd, dd, general_interrupt, NULL, 0, IRQ_GENERAL);
+	if (ret < 0)
+		return ret;
+
+	for (i = 0; i < dd->num_sdma; i++) {
+		struct sdma_engine *sde = &dd->per_sdma[i];
+
+		ret = msix_request_sdma_irq(sde);
+		if (ret)
+			return ret;
+	}
+
+	for (i = 0; i < dd->n_krcv_queues; i++) {
+		struct hfi1_ctxtdata *rcd = hfi1_rcd_get_by_index_safe(dd, i);
+
+		if (rcd)
+			ret = msix_request_rcd_irq(rcd);
+		hfi1_rcd_put(rcd);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/**
+ * msix_free_irq() - Free the specified MSIx resources and IRQ
+ * @dd: valid devdata
+ * @msix_intr: MSIx vector to free.
+ *
+ */
+void msix_free_irq(struct hfi1_devdata *dd, u8 msix_intr)
+{
+	struct hfi1_msix_entry *me;
+
+	if (msix_intr >= dd->msix_info.max_requested)
+		return;
+
+	me = &dd->msix_info.msix_entries[msix_intr];
+
+	if (!me->arg) /* => no irq, no affinity */
+		return;
+
+	hfi1_put_irq_affinity(dd, me);
+	pci_free_irq(dd->pcidev, msix_intr, me->arg);
+
+	me->arg = NULL;
+
+	spin_lock(&dd->msix_info.msix_lock);
+	__clear_bit(msix_intr, dd->msix_info.in_use_msix);
+	spin_unlock(&dd->msix_info.msix_lock);
+}
+
+/**
+ * hfi1_clean_up_msix_interrupts() - Free all MSIx IRQ resources
  * @dd: valid device data data structure
  *
  * Free the MSIx and associated PCI resources, if they have been allocated.
 */
-void hfi1_clean_up_interrupts(struct hfi1_devdata *dd)
+void msix_clean_up_interrupts(struct hfi1_devdata *dd)
 {
 	int i;
-	struct hfi1_msix_entry *me = dd->msix_entries;
+	struct hfi1_msix_entry *me = dd->msix_info.msix_entries;
 
 	/* remove irqs - must happen before disabling/turning off */
-	for (i = 0; i < dd->num_msix_entries; i++, me++) {
-		if (!me->arg) /* => no irq, no affinity */
-			continue;
-		hfi1_put_irq_affinity(dd, me);
-		pci_free_irq(dd->pcidev, i, me->arg);
-	}
+	for (i = 0; i < dd->msix_info.max_requested; i++, me++)
+		msix_free_irq(dd, i);
 
 	/* clean structures */
-	kfree(dd->msix_entries);
-	dd->msix_entries = NULL;
-	dd->num_msix_entries = 0;
+	kfree(dd->msix_info.msix_entries);
+	dd->msix_info.msix_entries = NULL;
+	dd->msix_info.max_requested = 0;
 
 	pci_free_irq_vectors(dd->pcidev);
 }
+
+/**
+ * msix_vnic_synchronize_irq() - Vnic IRQ synchronize
+ * @dd: valid devdata
+ */
+void msix_vnic_synchronize_irq(struct hfi1_devdata *dd)
+{
+	int i;
+
+	for (i = 0; i < dd->vnic.num_ctxt; i++) {
+		struct hfi1_ctxtdata *rcd = dd->vnic.ctxt[i];
+		struct hfi1_msix_entry *me;
+
+		me = &dd->msix_info.msix_entries[rcd->msix_intr];
+
+		synchronize_irq(me->irq);
+	}
+}
diff --git a/drivers/infiniband/hw/hfi1/msix.h b/drivers/infiniband/hw/hfi1/msix.h
index 45cadd56fef9..a514881632a4 100644
--- a/drivers/infiniband/hw/hfi1/msix.h
+++ b/drivers/infiniband/hw/hfi1/msix.h
@@ -51,14 +51,14 @@
 #include "hfi.h"
 
 /* MSIx interface */
-int request_msix(struct hfi1_devdata *dd, u32 msireq);
-int set_up_interrupts(struct hfi1_devdata *dd);
-int request_msix_irqs(struct hfi1_devdata *dd);
-void hfi1_clean_up_interrupts(struct hfi1_devdata *dd);
+int msix_initialize(struct hfi1_devdata *dd);
+int msix_request_irqs(struct hfi1_devdata *dd);
+void msix_clean_up_interrupts(struct hfi1_devdata *dd);
+int msix_request_rcd_irq(struct hfi1_ctxtdata *rcd);
+int msix_request_sdma_irq(struct sdma_engine *sde);
+void msix_free_irq(struct hfi1_devdata *dd, u8 msix_intr);
 
 /* VNIC interface */
-void hfi1_vnic_synchronize_irq(struct hfi1_devdata *dd);
-void hfi1_set_vnic_msix_info(struct hfi1_ctxtdata *rcd);
-void hfi1_reset_vnic_msix_info(struct hfi1_ctxtdata *rcd);
+void msix_vnic_synchronize_irq(struct hfi1_devdata *dd);
 
 #endif
diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h
index f2d83294271e..d2da2e651600 100644
--- a/drivers/infiniband/hw/hfi1/sdma.h
+++ b/drivers/infiniband/hw/hfi1/sdma.h
@@ -405,6 +405,7 @@ struct sdma_engine {
 	struct list_head flushlist;
 	struct cpumask cpu_mask;
 	struct kobject kobj;
+	u32 msix_intr;
 };
 
 int sdma_init(struct hfi1_devdata *dd, u8 port);
diff --git a/drivers/infiniband/hw/hfi1/vnic_main.c b/drivers/infiniband/hw/hfi1/vnic_main.c
index c643d80c5a53..c9876d9e3cb9 100644
--- a/drivers/infiniband/hw/hfi1/vnic_main.c
+++ b/drivers/infiniband/hw/hfi1/vnic_main.c
@@ -120,7 +120,7 @@ static int allocate_vnic_ctxt(struct hfi1_devdata *dd,
 	uctxt->seq_cnt = 1;
 	uctxt->is_vnic = true;
 
-	hfi1_set_vnic_msix_info(uctxt);
+	msix_request_rcd_irq(uctxt);
 
 	hfi1_stats.sps_ctxts++;
 	dd_dev_dbg(dd, "created vnic context %d\n", uctxt->ctxt);
@@ -135,8 +133,6 @@ static void deallocate_vnic_ctxt(struct hfi1_devdata *dd,
 	dd_dev_dbg(dd, "closing vnic context %d\n", uctxt->ctxt);
 	flush_wc();
 
-	hfi1_reset_vnic_msix_info(uctxt);
-
 	/*
 	 * Disable receive context and interrupt available, reset all
 	 * RcvCtxtCtrl bits to default values.
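The heart of the rework above is the per-device vector bookkeeping: msix_request_irq() and msix_free_irq() share a bitmap of in-use vectors guarded by a spinlock, so callers no longer need to agree on a fixed allocation order. Below is a condensed, stand-alone sketch of that pattern; the vector_alloc()/vector_free() names and struct msix_pool are illustrative only, and the real functions additionally handle naming, affinity, and pci_request_irq().

#include <linux/bitmap.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/types.h>

/* Condensed sketch of the in_use_msix bookkeeping; illustrative only. */
struct msix_pool {
	spinlock_t lock;	/* guards in_use */
	unsigned long *in_use;	/* one bit per MSI-X vector */
	u16 max;		/* vectors obtained from the PCI core */
};

static int vector_alloc(struct msix_pool *pool)
{
	unsigned long nr;

	spin_lock(&pool->lock);
	nr = find_first_zero_bit(pool->in_use, pool->max);
	if (nr < pool->max)
		__set_bit(nr, pool->in_use);
	spin_unlock(&pool->lock);

	/* all vectors busy */
	return nr == pool->max ? -ENOSPC : (int)nr;
}

static void vector_free(struct msix_pool *pool, u16 nr)
{
	spin_lock(&pool->lock);
	__clear_bit(nr, pool->in_use);
	spin_unlock(&pool->lock);
}

Allocation is O(max) and free is O(1); because the bitmap is consulted at request time, SDMA, receive-context, and VNIC vectors can be granted and returned in any order, which is exactly what lets the VNIC hunks below drop the old msix_idx counter.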
@@ -148,6 +146,10 @@ static void deallocate_vnic_ctxt(struct hfi1_devdata *dd, HFI1_RCVCTRL_NO_RHQ_DROP_DIS | HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt); + /* msix_intr will always be > 0, only clean up if this is true */ + if (uctxt->msix_intr) + msix_free_irq(dd, uctxt->msix_intr); + uctxt->event_flags = 0; hfi1_clear_tids(uctxt); @@ -626,7 +628,7 @@ static void hfi1_vnic_down(struct hfi1_vnic_vport_info *vinfo) idr_remove(&dd->vnic.vesw_idr, vinfo->vesw_id); /* ensure irqs see the change */ - hfi1_vnic_synchronize_irq(dd); + msix_vnic_synchronize_irq(dd); /* remove unread skbs */ for (i = 0; i < vinfo->num_rx_q; i++) { @@ -690,8 +692,6 @@ static int hfi1_vnic_init(struct hfi1_vnic_vport_info *vinfo) rc = hfi1_vnic_txreq_init(dd); if (rc) goto txreq_fail; - - dd->vnic.msix_idx = dd->first_dyn_msix_idx; } for (i = dd->vnic.num_ctxt; i < vinfo->num_rx_q; i++) { From e63bb50d1994b71f97b40077ddec46a7354197a0 Mon Sep 17 00:00:00 2001 From: Kamenee Arumugam Date: Wed, 15 Aug 2018 23:04:13 -0700 Subject: [PATCH 017/242] IB/hfi1: PCIe bus width retry Retry the PCIe link training up to 'pcie_retry' times if the PCIe link width is narrower than the previous width. Reviewed-by: Michael J. Ruhl Reviewed-by: Mitko Haralanov Signed-off-by: Kamenee Arumugam Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/pcie.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/pcie.c b/drivers/infiniband/hw/hfi1/pcie.c index 577d7479a845..cb7fc9fb3c9e 100644 --- a/drivers/infiniband/hw/hfi1/pcie.c +++ b/drivers/infiniband/hw/hfi1/pcie.c @@ -1005,6 +1005,7 @@ int do_pcie_gen3_transition(struct hfi1_devdata *dd) const u8 (*ctle_tunings)[4]; uint static_ctle_mode; int return_error = 0; + u32 target_width; /* PCIe Gen3 is for the ASIC only */ if (dd->icode != ICODE_RTL_SILICON) @@ -1044,6 +1045,9 @@ int do_pcie_gen3_transition(struct hfi1_devdata *dd) return 0; } + /* Previous Gen1/Gen2 bus width */ + target_width = dd->lbus_width; + /* * Do the Gen3 transition. Steps are those of the PCIe Gen3 * recipe. @@ -1412,11 +1416,12 @@ retry: dd_dev_info(dd, "%s: new speed and width: %s\n", __func__, dd->lbus_info); - if (dd->lbus_speed != target_speed) { /* not target */ + if (dd->lbus_speed != target_speed || + dd->lbus_width < target_width) { /* not target */ /* maybe retry */ do_retry = retry_count < pcie_retry; - dd_dev_err(dd, "PCIe link speed did not switch to Gen%d%s\n", - pcie_target, do_retry ? ", retrying" : ""); + dd_dev_err(dd, "PCIe link speed or width did not match target%s\n", + do_retry ? ", retrying" : ""); retry_count++; if (do_retry) { msleep(100); /* allow time to settle */ From a2f7bbdc2dba0e4c82a9243a64931aa81c0c28cf Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Wed, 15 Aug 2018 23:04:22 -0700 Subject: [PATCH 018/242] IB/hfi1: Rework the IRQ API to be more flexible The current IRQ API is an all or nothing interface. This has two problems: 1. All IRQs are enabled regardless of use 2. Moving from general interrupt to MSIx handling is difficult Introduce a new API to enable/disable specific IRQs or a range of IRQs. Do not enable and disable all IRQs in one step. Rework various modules to enable/disable IRQs when needed. Reviewed-by: Mike Marciniszyn Reviewed-by: Sadanand Warrier Signed-off-by: Michael J. 
Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 158 +++++++++++++------------- drivers/infiniband/hw/hfi1/chip.h | 61 ++++++---- drivers/infiniband/hw/hfi1/file_ops.c | 7 ++ drivers/infiniband/hw/hfi1/hfi.h | 8 +- drivers/infiniband/hw/hfi1/init.c | 26 ++++- drivers/infiniband/hw/hfi1/msix.c | 16 +++ 6 files changed, 167 insertions(+), 109 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 8acbf8b0304f..b9d8b0e4c5eb 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -1098,9 +1098,9 @@ struct err_reg_info { const char *desc; }; -#define NUM_MISC_ERRS (IS_GENERAL_ERR_END - IS_GENERAL_ERR_START) -#define NUM_DC_ERRS (IS_DC_END - IS_DC_START) -#define NUM_VARIOUS (IS_VARIOUS_END - IS_VARIOUS_START) +#define NUM_MISC_ERRS (IS_GENERAL_ERR_END + 1 - IS_GENERAL_ERR_START) +#define NUM_DC_ERRS (IS_DC_END + 1 - IS_DC_START) +#define NUM_VARIOUS (IS_VARIOUS_END + 1 - IS_VARIOUS_START) /* * Helpers for building HFI and DC error interrupt table entries. Different @@ -8176,7 +8176,7 @@ static void is_rcv_avail_int(struct hfi1_devdata *dd, unsigned int source) /** * is_rcv_urgent_int() - User receive context urgent IRQ handler * @dd: valid dd - * @source: logical IRQ source (ofse from IS_RCVURGENT_START) + * @source: logical IRQ source (offset from IS_RCVURGENT_START) * * RX block receive urgent interrupt. Source is < 160. * @@ -8226,7 +8226,7 @@ static const struct is_table is_table[] = { is_sdma_eng_err_name, is_sdma_eng_err_int }, { IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, is_sendctxt_err_name, is_sendctxt_err_int }, -{ IS_SDMA_START, IS_SDMA_END, +{ IS_SDMA_START, IS_SDMA_IDLE_END, is_sdma_eng_name, is_sdma_eng_int }, { IS_VARIOUS_START, IS_VARIOUS_END, is_various_name, is_various_int }, @@ -8252,7 +8252,7 @@ static void is_interrupt(struct hfi1_devdata *dd, unsigned int source) /* avoids a double compare by walking the table in-order */ for (entry = &is_table[0]; entry->is_name; entry++) { - if (source < entry->end) { + if (source <= entry->end) { trace_hfi1_interrupt(dd, entry, source); entry->is_int(dd, source - entry->start); return; @@ -9646,30 +9646,10 @@ void qsfp_event(struct work_struct *work) } } -static void init_qsfp_int(struct hfi1_devdata *dd) +void init_qsfp_int(struct hfi1_devdata *dd) { struct hfi1_pportdata *ppd = dd->pport; - u64 qsfp_mask, cce_int_mask; - const int qsfp1_int_smask = QSFP1_INT % 64; - const int qsfp2_int_smask = QSFP2_INT % 64; - - /* - * disable QSFP1 interrupts for HFI1, QSFP2 interrupts for HFI0 - * Qsfp1Int and Qsfp2Int are adjacent bits in the same CSR, - * therefore just one of QSFP1_INT/QSFP2_INT can be used to find - * the index of the appropriate CSR in the CCEIntMask CSR array - */ - cce_int_mask = read_csr(dd, CCE_INT_MASK + - (8 * (QSFP1_INT / 64))); - if (dd->hfi1_id) { - cce_int_mask &= ~((u64)1 << qsfp1_int_smask); - write_csr(dd, CCE_INT_MASK + (8 * (QSFP1_INT / 64)), - cce_int_mask); - } else { - cce_int_mask &= ~((u64)1 << qsfp2_int_smask); - write_csr(dd, CCE_INT_MASK + (8 * (QSFP2_INT / 64)), - cce_int_mask); - } + u64 qsfp_mask; qsfp_mask = (u64)(QSFP_HFI0_INT_N | QSFP_HFI0_MODPRST_N); /* Clear current status to avoid spurious interrupts */ @@ -9686,6 +9666,12 @@ static void init_qsfp_int(struct hfi1_devdata *dd) write_csr(dd, dd->hfi1_id ? 
ASIC_QSFP2_INVERT : ASIC_QSFP1_INVERT, qsfp_mask); + + /* Enable the appropriate QSFP IRQ source */ + if (!dd->hfi1_id) + set_intr_bits(dd, QSFP1_INT, QSFP1_INT, true); + else + set_intr_bits(dd, QSFP2_INT, QSFP2_INT, true); } /* @@ -11926,10 +11912,16 @@ void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, rcvctrl &= ~RCV_CTXT_CTRL_ENABLE_SMASK; } - if (op & HFI1_RCVCTRL_INTRAVAIL_ENB) + if (op & HFI1_RCVCTRL_INTRAVAIL_ENB) { + set_intr_bits(dd, IS_RCVAVAIL_START + rcd->ctxt, + IS_RCVAVAIL_START + rcd->ctxt, true); rcvctrl |= RCV_CTXT_CTRL_INTR_AVAIL_SMASK; - if (op & HFI1_RCVCTRL_INTRAVAIL_DIS) + } + if (op & HFI1_RCVCTRL_INTRAVAIL_DIS) { + set_intr_bits(dd, IS_RCVAVAIL_START + rcd->ctxt, + IS_RCVAVAIL_START + rcd->ctxt, false); rcvctrl &= ~RCV_CTXT_CTRL_INTR_AVAIL_SMASK; + } if ((op & HFI1_RCVCTRL_TAILUPD_ENB) && rcd->rcvhdrtail_kvaddr) rcvctrl |= RCV_CTXT_CTRL_TAIL_UPD_SMASK; if (op & HFI1_RCVCTRL_TAILUPD_DIS) { @@ -12957,57 +12949,65 @@ int hfi1_tempsense_rd(struct hfi1_devdata *dd, struct hfi1_temp *temp) return ret; } -/** - * get_int_mask - get 64 bit int mask - * @dd - the devdata - * @i - the csr (relative to CCE_INT_MASK) - * - * Returns the mask with the urgent interrupt mask - * bit clear for kernel receive contexts. - */ -static u64 get_int_mask(struct hfi1_devdata *dd, u32 i) -{ - u64 mask = U64_MAX; /* default to no change */ - - if (i >= (IS_RCVURGENT_START / 64) && i < (IS_RCVURGENT_END / 64)) { - int j = (i - (IS_RCVURGENT_START / 64)) * 64; - int k = !j ? IS_RCVURGENT_START % 64 : 0; - - if (j) - j -= IS_RCVURGENT_START % 64; - /* j = 0..dd->first_dyn_alloc_ctxt - 1,k = 0..63 */ - for (; j < dd->first_dyn_alloc_ctxt && k < 64; j++, k++) - /* convert to bit in mask and clear */ - mask &= ~BIT_ULL(k); - } - return mask; -} - /* ========================================================================= */ -/* - * Enable/disable chip from delivering interrupts. +/** + * read_mod_write() - Calculate the IRQ register index and set/clear the bits + * @dd: valid devdata + * @src: IRQ source to determine register index from + * @bits: the bits to set or clear + * @set: true == set the bits, false == clear the bits + * */ -void set_intr_state(struct hfi1_devdata *dd, u32 enable) +static void read_mod_write(struct hfi1_devdata *dd, u16 src, u64 bits, + bool set) { - int i; + u64 reg; + u16 idx = src / BITS_PER_REGISTER; - /* - * In HFI, the mask needs to be 1 to allow interrupts. - */ - if (enable) { - /* enable all interrupts but urgent on kernel contexts */ - for (i = 0; i < CCE_NUM_INT_CSRS; i++) { - u64 mask = get_int_mask(dd, i); + spin_lock(&dd->irq_src_lock); + reg = read_csr(dd, CCE_INT_MASK + (8 * idx)); + if (set) + reg |= bits; + else + reg &= ~bits; + write_csr(dd, CCE_INT_MASK + (8 * idx), reg); + spin_unlock(&dd->irq_src_lock); +} - write_csr(dd, CCE_INT_MASK + (8 * i), mask); +/** + * set_intr_bits() - Enable/disable a range (one or more) IRQ sources + * @dd: valid devdata + * @first: first IRQ source to set/clear + * @last: last IRQ source (inclusive) to set/clear + * @set: true == set the bits, false == clear the bits + * + * If first == last, set the exact source. + */ +int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set) +{ + u64 bits = 0; + u64 bit; + u16 src; + + if (first > NUM_INTERRUPT_SOURCES || last > NUM_INTERRUPT_SOURCES) + return -EINVAL; + + if (last < first) + return -ERANGE; + + for (src = first; src <= last; src++) { + bit = src % BITS_PER_REGISTER; + /* wrapped to next register? 
*/ + if (!bit && bits) { + read_mod_write(dd, src - 1, bits, set); + bits = 0; } - - init_qsfp_int(dd); - } else { - for (i = 0; i < CCE_NUM_INT_CSRS; i++) - write_csr(dd, CCE_INT_MASK + (8 * i), 0ull); + bits |= BIT_ULL(bit); } + read_mod_write(dd, src, bits, set); + + return 0; } /* @@ -13074,12 +13074,9 @@ void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr) * SDMAProgress * SDMAIdle */ - remap_intr(dd, IS_SDMA_START + 0 * TXE_NUM_SDMA_ENGINES + engine, - msix_intr); - remap_intr(dd, IS_SDMA_START + 1 * TXE_NUM_SDMA_ENGINES + engine, - msix_intr); - remap_intr(dd, IS_SDMA_START + 2 * TXE_NUM_SDMA_ENGINES + engine, - msix_intr); + remap_intr(dd, IS_SDMA_START + engine, msix_intr); + remap_intr(dd, IS_SDMA_PROGRESS_START + engine, msix_intr); + remap_intr(dd, IS_SDMA_IDLE_START + engine, msix_intr); } /* @@ -13109,7 +13106,8 @@ static int set_up_interrupts(struct hfi1_devdata *dd) int ret; /* mask all interrupts */ - set_intr_state(dd, 0); + set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false); + /* clear all pending interrupts */ clear_all_interrupts(dd); diff --git a/drivers/infiniband/hw/hfi1/chip.h b/drivers/infiniband/hw/hfi1/chip.h index 3cc8fc982d24..6b9c8f12dff8 100644 --- a/drivers/infiniband/hw/hfi1/chip.h +++ b/drivers/infiniband/hw/hfi1/chip.h @@ -52,9 +52,7 @@ */ /* sizes */ -#define CCE_NUM_MSIX_VECTORS 256 -#define CCE_NUM_INT_CSRS 12 -#define CCE_NUM_INT_MAP_CSRS 96 +#define BITS_PER_REGISTER (BITS_PER_BYTE * sizeof(u64)) #define NUM_INTERRUPT_SOURCES 768 #define RXE_NUM_CONTEXTS 160 #define RXE_PER_CONTEXT_SIZE 0x1000 /* 4k */ @@ -161,34 +159,49 @@ (CR_CREDIT_RETURN_DUE_TO_FORCE_MASK << \ CR_CREDIT_RETURN_DUE_TO_FORCE_SHIFT) -/* interrupt source numbers */ -#define IS_GENERAL_ERR_START 0 -#define IS_SDMAENG_ERR_START 16 -#define IS_SENDCTXT_ERR_START 32 -#define IS_SDMA_START 192 /* includes SDmaProgress,SDmaIdle */ +/* Specific IRQ sources */ +#define CCE_ERR_INT 0 +#define RXE_ERR_INT 1 +#define MISC_ERR_INT 2 +#define PIO_ERR_INT 4 +#define SDMA_ERR_INT 5 +#define EGRESS_ERR_INT 6 +#define TXE_ERR_INT 7 +#define PBC_INT 240 +#define GPIO_ASSERT_INT 241 +#define QSFP1_INT 242 +#define QSFP2_INT 243 +#define TCRIT_INT 244 + +/* interrupt source ranges */ +#define IS_FIRST_SOURCE CCE_ERR_INT +#define IS_GENERAL_ERR_START 0 +#define IS_SDMAENG_ERR_START 16 +#define IS_SENDCTXT_ERR_START 32 +#define IS_SDMA_START 192 +#define IS_SDMA_PROGRESS_START 208 +#define IS_SDMA_IDLE_START 224 #define IS_VARIOUS_START 240 #define IS_DC_START 248 #define IS_RCVAVAIL_START 256 #define IS_RCVURGENT_START 416 #define IS_SENDCREDIT_START 576 #define IS_RESERVED_START 736 -#define IS_MAX_SOURCES 768 +#define IS_LAST_SOURCE 767 /* derived interrupt source values */ -#define IS_GENERAL_ERR_END IS_SDMAENG_ERR_START -#define IS_SDMAENG_ERR_END IS_SENDCTXT_ERR_START -#define IS_SENDCTXT_ERR_END IS_SDMA_START -#define IS_SDMA_END IS_VARIOUS_START -#define IS_VARIOUS_END IS_DC_START -#define IS_DC_END IS_RCVAVAIL_START -#define IS_RCVAVAIL_END IS_RCVURGENT_START -#define IS_RCVURGENT_END IS_SENDCREDIT_START -#define IS_SENDCREDIT_END IS_RESERVED_START -#define IS_RESERVED_END IS_MAX_SOURCES - -/* absolute interrupt numbers for QSFP1Int and QSFP2Int */ -#define QSFP1_INT 242 -#define QSFP2_INT 243 +#define IS_GENERAL_ERR_END 7 +#define IS_SDMAENG_ERR_END 31 +#define IS_SENDCTXT_ERR_END 191 +#define IS_SDMA_END 207 +#define IS_SDMA_PROGRESS_END 223 +#define IS_SDMA_IDLE_END 239 +#define IS_VARIOUS_END 244 +#define IS_DC_END 255 +#define IS_RCVAVAIL_END 415 +#define 
IS_RCVURGENT_END 575 +#define IS_SENDCREDIT_END 735 +#define IS_RESERVED_END IS_LAST_SOURCE /* DCC_CFG_PORT_CONFIG logical link states */ #define LSTATE_DOWN 0x1 @@ -1421,6 +1434,8 @@ irqreturn_t sdma_interrupt(int irq, void *data); irqreturn_t receive_context_interrupt(int irq, void *data); irqreturn_t receive_context_thread(int irq, void *data); +int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set); +void init_qsfp_int(struct hfi1_devdata *dd); void clear_all_interrupts(struct hfi1_devdata *dd); void remap_intr(struct hfi1_devdata *dd, int isrc, int msix_intr); void remap_sdma_interrupts(struct hfi1_devdata *dd, int engine, int msix_intr); diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 1fc75647e47b..0b669475349e 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -639,6 +639,9 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) hfi1_cdbg(PROC, "closing ctxt %u:%u", uctxt->ctxt, fdata->subctxt); + set_intr_bits(dd, IS_RCVURGENT_START + uctxt->ctxt, + IS_RCVURGENT_START + uctxt->ctxt, false); + flush_wc(); /* drain user sdma queue */ hfi1_user_sdma_free_queues(fdata, uctxt); @@ -1217,6 +1220,10 @@ static int setup_base_ctxt(struct hfi1_filedata *fd, fd->uctxt = uctxt; hfi1_rcd_get(uctxt); + /* Enable the Urgent IRQ for this user context */ + set_intr_bits(dd, IS_RCVURGENT_START + uctxt->ctxt, + IS_RCVURGENT_START + uctxt->ctxt, true); + done: if (uctxt->subctxt_cnt) { /* diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index e2a5c2dac109..6d18784cea46 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1213,9 +1213,6 @@ struct hfi1_devdata { struct diag_client *diag_client; - /* MSI-X information */ - struct hfi1_msix_info msix_info; - /* general interrupt: mask of handled interrupts */ u64 gi_mask[CCE_NUM_INT_CSRS]; @@ -1229,6 +1226,9 @@ struct hfi1_devdata { */ struct timer_list synth_stats_timer; + /* MSI-X information */ + struct hfi1_msix_info msix_info; + /* * device counters */ @@ -1355,6 +1355,8 @@ struct hfi1_devdata { /* vnic data */ struct hfi1_vnic_data vnic; + /* Lock to protect IRQ SRC register access */ + spinlock_t irq_src_lock; }; static inline bool hfi1_vnic_is_rsm_full(struct hfi1_devdata *dd, int spare) diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 3ab7e2316523..1e770a133779 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -831,6 +831,23 @@ wq_error: return -ENOMEM; } +/** + * enable_general_intr() - Enable the IRQs that will be handled by the + * general interrupt handler. 
+ * @dd: valid devdata + * + */ +static void enable_general_intr(struct hfi1_devdata *dd) +{ + set_intr_bits(dd, CCE_ERR_INT, MISC_ERR_INT, true); + set_intr_bits(dd, PIO_ERR_INT, TXE_ERR_INT, true); + set_intr_bits(dd, IS_SENDCTXT_ERR_START, IS_SENDCTXT_ERR_END, true); + set_intr_bits(dd, PBC_INT, GPIO_ASSERT_INT, true); + set_intr_bits(dd, TCRIT_INT, TCRIT_INT, true); + set_intr_bits(dd, IS_DC_START, IS_DC_END, true); + set_intr_bits(dd, IS_SENDCREDIT_START, IS_SENDCREDIT_END, true); +} + /** * hfi1_init - do the actual initialization sequence on the chip * @dd: the hfi1_ib device @@ -915,6 +932,7 @@ int hfi1_init(struct hfi1_devdata *dd, int reinit) "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n"); ret = lastfail; } + /* enable IRQ */ hfi1_rcd_put(rcd); } @@ -953,7 +971,8 @@ done: HFI1_STATUS_INITTED; if (!ret) { /* enable all interrupts from the chip */ - set_intr_state(dd, 1); + enable_general_intr(dd); + init_qsfp_int(dd); /* chip is OK for user apps; mark it as initialized */ for (pidx = 0; pidx < dd->num_pports; ++pidx) { @@ -1050,8 +1069,8 @@ static void shutdown_device(struct hfi1_devdata *dd) } dd->flags &= ~HFI1_INITTED; - /* mask and clean up interrupts, but not errors */ - set_intr_state(dd, 0); + /* mask and clean up interrupts */ + set_intr_bits(dd, IS_FIRST_SOURCE, IS_LAST_SOURCE, false); msix_clean_up_interrupts(dd); for (pidx = 0; pidx < dd->num_pports; ++pidx) { @@ -1312,6 +1331,7 @@ static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev, spin_lock_init(&dd->pio_map_lock); mutex_init(&dd->dc8051_lock); init_waitqueue_head(&dd->event_queue); + spin_lock_init(&dd->irq_src_lock); dd->int_counter = alloc_percpu(u64); if (!dd->int_counter) { diff --git a/drivers/infiniband/hw/hfi1/msix.c b/drivers/infiniband/hw/hfi1/msix.c index 72cd0269cb2e..d920b165d696 100644 --- a/drivers/infiniband/hw/hfi1/msix.c +++ b/drivers/infiniband/hw/hfi1/msix.c @@ -240,6 +240,21 @@ int msix_request_sdma_irq(struct sdma_engine *sde) return 0; } +/** + * enable_sdma_src() - Helper to enable SDMA IRQ srcs + * @dd: valid devdata structure + * @i: index of SDMA engine + */ +static void enable_sdma_srcs(struct hfi1_devdata *dd, int i) +{ + set_intr_bits(dd, IS_SDMA_START + i, IS_SDMA_START + i, true); + set_intr_bits(dd, IS_SDMA_PROGRESS_START + i, + IS_SDMA_PROGRESS_START + i, true); + set_intr_bits(dd, IS_SDMA_IDLE_START + i, IS_SDMA_IDLE_START + i, true); + set_intr_bits(dd, IS_SDMAENG_ERR_START + i, IS_SDMAENG_ERR_START + i, + true); +} + /** * msix_request_irqs() - Allocate all MSIx IRQs * @dd: valid devdata structure @@ -262,6 +277,7 @@ int msix_request_irqs(struct hfi1_devdata *dd) ret = msix_request_sdma_irq(sde); if (ret) return ret; + enable_sdma_srcs(sde->dd, i); } for (i = 0; i < dd->n_krcv_queues; i++) { From dc9f5d0f841d604b8ca6310bd021096f804cd2a0 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Wed, 15 Aug 2018 23:04:32 -0700 Subject: [PATCH 019/242] IB/hfi1: Move URGENT IRQ enable to hfi1_rcvctrl() User contexts use the receive URGENT interrupt. However, enabling the IRQ SRC in the file_ops module is not as clean as it could be. Augment the _rcvctl() function to be able to enable/disable the IRQ source. Use the new interface from file_ops to enable/disable the IRQ. Reviewed-by: Mike Marciniszyn Reviewed-by: Sadanand Warrier Signed-off-by: Michael J. 
Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hfi1/chip.c | 7 +++++++ drivers/infiniband/hw/hfi1/file_ops.c | 11 +++-------- drivers/infiniband/hw/hfi1/hfi.h | 2 ++ 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index b9d8b0e4c5eb..ef433fdb2411 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -11951,6 +11951,13 @@ void hfi1_rcvctrl(struct hfi1_devdata *dd, unsigned int op, rcvctrl |= RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK; if (op & HFI1_RCVCTRL_NO_EGR_DROP_DIS) rcvctrl &= ~RCV_CTXT_CTRL_DONT_DROP_EGR_FULL_SMASK; + if (op & HFI1_RCVCTRL_URGENT_ENB) + set_intr_bits(dd, IS_RCVURGENT_START + rcd->ctxt, + IS_RCVURGENT_START + rcd->ctxt, true); + if (op & HFI1_RCVCTRL_URGENT_DIS) + set_intr_bits(dd, IS_RCVURGENT_START + rcd->ctxt, + IS_RCVURGENT_START + rcd->ctxt, false); + hfi1_cdbg(RCVCTRL, "ctxt %d rcvctrl 0x%llx\n", ctxt, rcvctrl); write_kctxt_csr(dd, ctxt, RCV_CTXT_CTRL, rcvctrl); diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c index 0b669475349e..c22ebc774a6a 100644 --- a/drivers/infiniband/hw/hfi1/file_ops.c +++ b/drivers/infiniband/hw/hfi1/file_ops.c @@ -639,9 +639,6 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) hfi1_cdbg(PROC, "closing ctxt %u:%u", uctxt->ctxt, fdata->subctxt); - set_intr_bits(dd, IS_RCVURGENT_START + uctxt->ctxt, - IS_RCVURGENT_START + uctxt->ctxt, false); - flush_wc(); /* drain user sdma queue */ hfi1_user_sdma_free_queues(fdata, uctxt); @@ -684,7 +681,8 @@ static int hfi1_file_close(struct inode *inode, struct file *fp) HFI1_RCVCTRL_TAILUPD_DIS | HFI1_RCVCTRL_ONE_PKT_EGR_DIS | HFI1_RCVCTRL_NO_RHQ_DROP_DIS | - HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt); + HFI1_RCVCTRL_NO_EGR_DROP_DIS | + HFI1_RCVCTRL_URGENT_DIS, uctxt); /* Clear the context's J_KEY */ hfi1_clear_ctxt_jkey(dd, uctxt); /* @@ -1099,6 +1097,7 @@ static void user_init(struct hfi1_ctxtdata *uctxt) hfi1_set_ctxt_jkey(uctxt->dd, uctxt, uctxt->jkey); rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB; + rcvctrl_ops |= HFI1_RCVCTRL_URGENT_ENB; if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP)) rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB; /* @@ -1220,10 +1219,6 @@ static int setup_base_ctxt(struct hfi1_filedata *fd, fd->uctxt = uctxt; hfi1_rcd_get(uctxt); - /* Enable the Urgent IRQ for this user context */ - set_intr_bits(dd, IS_RCVURGENT_START + uctxt->ctxt, - IS_RCVURGENT_START + uctxt->ctxt, true); - done: if (uctxt->subctxt_cnt) { /* diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index 6d18784cea46..d3d5717650e9 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -621,6 +621,8 @@ struct rvt_sge_state; #define HFI1_RCVCTRL_NO_RHQ_DROP_DIS 0x8000 #define HFI1_RCVCTRL_NO_EGR_DROP_ENB 0x10000 #define HFI1_RCVCTRL_NO_EGR_DROP_DIS 0x20000 +#define HFI1_RCVCTRL_URGENT_ENB 0x40000 +#define HFI1_RCVCTRL_URGENT_DIS 0x80000 /* partition enforcement flags */ #define HFI1_PART_ENFORCE_IN 0x1 From c6a21c3864fc7f5febae7d096cd136f397c791f2 Mon Sep 17 00:00:00 2001 From: Majd Dibbiny Date: Tue, 28 Aug 2018 14:29:05 +0300 Subject: [PATCH 020/242] IB/mlx5: Change TX affinity assignment in RoCE LAG mode In the current code, the TX affinity is per RoCE device, which can cause unfairness between different contexts. e.g. 
if we open two contexts, and each open 10 QPs concurrently, all of the QPs of the first context might end up on the first port instead of distributed on the two ports as expected To overcome this unfairness between processes, we maintain per device TX affinity, and per process TX affinity. The allocation algorithm is as follow: 1. Hold two tx_port_affinity atomic variables, one per RoCE device and one per ucontext. Both initialized to 0. 2. In mlx5_ib_alloc_ucontext do: 2.1. ucontext.tx_port_affinity = device.tx_port_affinity 2.2. device.tx_port_affinity += 1 3. In modify QP INIT2RST: 3.1. qp.tx_port_affinity = ucontext.tx_port_affinity % MLX5_PORT_NUM 3.2. ucontext.tx_port_affinity += 1 Signed-off-by: Majd Dibbiny Reviewed-by: Moni Shoua Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 8 ++++++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 4 ++- drivers/infiniband/hw/mlx5/qp.c | 37 +++++++++++++++++++++++++--- 3 files changed, 44 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index c414f3809e5c..f9bbc6a61512 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1826,6 +1826,14 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, context->lib_caps = req.lib_caps; print_lib_caps(dev, context->lib_caps); + if (mlx5_lag_is_active(dev->mdev)) { + u8 port = mlx5_core_native_port_num(dev->mdev); + + atomic_set(&context->tx_port_affinity, + atomic_add_return( + 1, &dev->roce[port].tx_port_affinity)); + } + return &context->ibucontext; out_mdev: diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 320d4dfe8c2f..387a6ec1ce3b 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -139,6 +139,8 @@ struct mlx5_ib_ucontext { u64 lib_caps; DECLARE_BITMAP(dm_pages, MLX5_MAX_MEMIC_PAGES); u16 devx_uid; + /* For RoCE LAG TX affinity */ + atomic_t tx_port_affinity; }; static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) @@ -699,7 +701,7 @@ struct mlx5_roce { rwlock_t netdev_lock; struct net_device *netdev; struct notifier_block nb; - atomic_t next_port; + atomic_t tx_port_affinity; enum ib_port_state last_port_state; struct mlx5_ib_dev *dev; u8 native_port_num; diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 6cba2a02d11b..e060499146ad 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2909,6 +2909,37 @@ static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, return 0; } +static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev, + struct mlx5_ib_pd *pd, + struct mlx5_ib_qp_base *qp_base, + u8 port_num) +{ + struct mlx5_ib_ucontext *ucontext = NULL; + unsigned int tx_port_affinity; + + if (pd && pd->ibpd.uobject && pd->ibpd.uobject->context) + ucontext = to_mucontext(pd->ibpd.uobject->context); + + if (ucontext) { + tx_port_affinity = (unsigned int)atomic_add_return( + 1, &ucontext->tx_port_affinity) % + MLX5_MAX_PORTS + + 1; + mlx5_ib_dbg(dev, "Set tx affinity 0x%x to qpn 0x%x ucontext %p\n", + tx_port_affinity, qp_base->mqp.qpn, ucontext); + } else { + tx_port_affinity = + (unsigned int)atomic_add_return( + 1, &dev->roce[port_num].tx_port_affinity) % + MLX5_MAX_PORTS + + 1; + mlx5_ib_dbg(dev, "Set tx affinity 0x%x to qpn 0x%x\n", + tx_port_affinity, qp_base->mqp.qpn); + } + + return tx_port_affinity; +} + static int 
__mlx5_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state, @@ -2974,6 +3005,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, if (!context) return -ENOMEM; + pd = get_pd(qp); context->flags = cpu_to_be32(mlx5_st << 16); if (!(attr_mask & IB_QP_PATH_MIG_STATE)) { @@ -3002,9 +3034,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, (ibqp->qp_type == IB_QPT_XRC_TGT)) { if (mlx5_lag_is_active(dev->mdev)) { u8 p = mlx5_core_native_port_num(dev->mdev); - tx_affinity = (unsigned int)atomic_add_return(1, - &dev->roce[p].next_port) % - MLX5_MAX_PORTS + 1; + tx_affinity = get_tx_affinity(dev, pd, base, p); context->flags |= cpu_to_be32(tx_affinity << 24); } } @@ -3062,7 +3092,6 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, goto out; } - pd = get_pd(qp); get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq, &send_cq, &recv_cq); From 0953fffec9ba022f63bfe01e86427530d8320d5c Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 28 Aug 2018 14:18:50 +0300 Subject: [PATCH 021/242] RDMA/uverbs: Add UVERBS_ATTR_CONST_IN to the specs language This makes it clear and safe to access constants passed in from user space. We define a consistent ABI of u64 for all constants, and verify that the data passed in can be represented by the type the user supplies. The expectation is this will always be used with an enum declaring the constant values, and the user will use the enum type as input to the accessor. To retrieve the attribute value we introduce two helper calls - one standard which may fail if attribute is not valid and one where caller can provide a default value which will be used in case the attribute is not valid (useful when attribute is optional). Signed-off-by: Jason Gunthorpe Signed-off-by: Ariel Levkovich Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs_ioctl.c | 23 +++++++++++++++ include/rdma/uverbs_ioctl.h | 40 ++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 1a6b229e3db3..4bafd4671de2 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -611,3 +611,26 @@ int uverbs_copy_to(const struct uverbs_attr_bundle *bundle, size_t idx, return 0; } EXPORT_SYMBOL(uverbs_copy_to); + +int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, s64 lower_bound, u64 upper_bound, + s64 *def_val) +{ + const struct uverbs_attr *attr; + + attr = uverbs_attr_get(attrs_bundle, idx); + if (IS_ERR(attr)) { + if ((PTR_ERR(attr) != -ENOENT) || !def_val) + return PTR_ERR(attr); + + *to = *def_val; + } else { + *to = attr->ptr_attr.data; + } + + if (*to < lower_bound || (*to > 0 && (u64)*to > upper_bound)) + return -EINVAL; + + return 0; +} +EXPORT_SYMBOL(_uverbs_get_const); diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index 9e997c3c2f04..fc2e52234a2a 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -365,6 +365,15 @@ struct uverbs_object_tree_def { __VA_ARGS__ }, \ }) +/* An input value that is a member in the enum _enum_type. */ +#define UVERBS_ATTR_CONST_IN(_attr_id, _enum_type, ...) \ + UVERBS_ATTR_PTR_IN( \ + _attr_id, \ + UVERBS_ATTR_SIZE( \ + sizeof(u64) + BUILD_BUG_ON_ZERO(!sizeof(_enum_type)), \ + sizeof(u64)), \ + __VA_ARGS__) + /* * An input value that is a bitwise combination of values of _enum_type. 
* This permits the flag value to be passed as either a u32 or u64, it must @@ -603,6 +612,9 @@ static inline __malloc void *uverbs_zalloc(struct uverbs_attr_bundle *bundle, { return _uverbs_alloc(bundle, size, GFP_KERNEL | __GFP_ZERO); } +int _uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, s64 lower_bound, u64 upper_bound, + s64 *def_val); #else static inline int uverbs_get_flags64(u64 *to, const struct uverbs_attr_bundle *attrs_bundle, @@ -631,6 +643,34 @@ static inline __malloc void *uverbs_zalloc(struct uverbs_attr_bundle *bundle, { return ERR_PTR(-EINVAL); } +static inline int +_uverbs_get_const(s64 *to, const struct uverbs_attr_bundle *attrs_bundle, + size_t idx, s64 lower_bound, u64 upper_bound, + s64 *def_val) +{ + return -EINVAL; +} #endif +#define uverbs_get_const(_to, _attrs_bundle, _idx) \ + ({ \ + s64 _val; \ + int _ret = _uverbs_get_const(&_val, _attrs_bundle, _idx, \ + type_min(typeof(*_to)), \ + type_max(typeof(*_to)), NULL); \ + (*_to) = _val; \ + _ret; \ + }) + +#define uverbs_get_const_default(_to, _attrs_bundle, _idx, _default) \ + ({ \ + s64 _val; \ + s64 _def_val = _default; \ + int _ret = \ + _uverbs_get_const(&_val, _attrs_bundle, _idx, \ + type_min(typeof(*_to)), \ + type_max(typeof(*_to)), &_def_val); \ + (*_to) = _val; \ + _ret; \ + }) #endif From b4749bf25652689d8e33827460266b78bb2ec42c Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 28 Aug 2018 14:18:51 +0300 Subject: [PATCH 022/242] RDMA/mlx5: Add a new flow action verb - modify header Expose the ability to create a flow action which changes packet headers. The data passed from userspace should be modify header actions as defined by HW specification. Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/flow.c | 134 ++++++++++++++++++++++ drivers/infiniband/hw/mlx5/main.c | 3 + drivers/infiniband/hw/mlx5/mlx5_ib.h | 17 ++- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 10 ++ include/uapi/rdma/mlx5_user_ioctl_verbs.h | 5 + 5 files changed, 168 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 1a29f47f836e..02103a4b372c 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,24 @@ #define UVERBS_MODULE_NAME mlx5_ib #include +static int +mlx5_ib_ft_type_to_namespace(enum mlx5_ib_uapi_flow_table_type table_type, + enum mlx5_flow_namespace_type *namespace) +{ + switch (table_type) { + case MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX: + *namespace = MLX5_FLOW_NAMESPACE_BYPASS; + break; + case MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX: + *namespace = MLX5_FLOW_NAMESPACE_EGRESS; + break; + default: + return -EINVAL; + } + + return 0; +} + static const struct uverbs_attr_spec mlx5_ib_flow_type[] = { [MLX5_IB_FLOW_TYPE_NORMAL] = { .type = UVERBS_ATTR_TYPE_PTR_IN, @@ -175,6 +194,100 @@ end: return err; } +void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction) +{ + switch (maction->flow_action_raw.sub_type) { + case MLX5_IB_FLOW_ACTION_MODIFY_HEADER: + mlx5_modify_header_dealloc(maction->flow_action_raw.dev->mdev, + maction->flow_action_raw.action_id); + break; + default: + break; + } +} + +static struct ib_flow_action * +mlx5_ib_create_modify_header(struct mlx5_ib_dev *dev, + enum mlx5_ib_uapi_flow_table_type ft_type, + u8 num_actions, void *in) +{ + enum mlx5_flow_namespace_type namespace; + struct mlx5_ib_flow_action 
*maction; + int ret; + + ret = mlx5_ib_ft_type_to_namespace(ft_type, &namespace); + if (ret) + return ERR_PTR(-EINVAL); + + maction = kzalloc(sizeof(*maction), GFP_KERNEL); + if (!maction) + return ERR_PTR(-ENOMEM); + + ret = mlx5_modify_header_alloc(dev->mdev, namespace, num_actions, in, + &maction->flow_action_raw.action_id); + + if (ret) { + kfree(maction); + return ERR_PTR(ret); + } + maction->flow_action_raw.sub_type = + MLX5_IB_FLOW_ACTION_MODIFY_HEADER; + maction->flow_action_raw.dev = dev; + + return &maction->ib_action; +} + +static bool mlx5_ib_modify_header_supported(struct mlx5_ib_dev *dev) +{ + return MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + max_modify_header_actions) || + MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, max_modify_header_actions); +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)( + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject( + attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE); + struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device); + enum mlx5_ib_uapi_flow_table_type ft_type; + struct ib_flow_action *action; + size_t num_actions; + void *in; + int len; + int ret; + + if (!mlx5_ib_modify_header_supported(mdev)) + return -EOPNOTSUPP; + + in = uverbs_attr_get_alloced_ptr(attrs, + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM); + len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM); + + if (len % MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto)) + return -EINVAL; + + ret = uverbs_get_const(&ft_type, attrs, + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE); + if (ret) + return ret; + + num_actions = len / MLX5_UN_SZ_BYTES(set_action_in_add_action_in_auto), + action = mlx5_ib_create_modify_header(mdev, ft_type, num_actions, in); + if (IS_ERR(action)) + return PTR_ERR(action); + + atomic_set(&action->usecnt, 0); + action->device = uobj->context->device; + action->type = IB_FLOW_ACTION_UNSPECIFIED; + action->uobject = uobj; + uobj->object = action; + + return 0; +} + DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_CREATE_FLOW, UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_HANDLE, @@ -209,6 +322,26 @@ ADD_UVERBS_METHODS(mlx5_ib_fs, &UVERBS_METHOD(MLX5_IB_METHOD_CREATE_FLOW), &UVERBS_METHOD(MLX5_IB_METHOD_DESTROY_FLOW)); +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM, + UVERBS_ATTR_MIN_SIZE(MLX5_UN_SZ_BYTES( + set_action_in_add_action_in_auto)), + UA_MANDATORY, + UA_ALLOC_AND_COPY), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE, + enum mlx5_ib_uapi_flow_table_type, + UA_MANDATORY)); + +ADD_UVERBS_METHODS( + mlx5_ib_flow_actions, + UVERBS_OBJECT_FLOW_ACTION, + &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)); + DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_FLOW_MATCHER_CREATE, UVERBS_ATTR_IDR(MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE, @@ -247,6 +380,7 @@ int mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root) root[i++] = &flow_objects; root[i++] = &mlx5_ib_fs; + root[i++] = &mlx5_ib_flow_actions; return i; } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index c414f3809e5c..d41419fb6b3e 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3995,6 +3995,9 @@ static int mlx5_ib_destroy_flow_action(struct 
ib_flow_action *action) */ mlx5_accel_esp_destroy_xfrm(maction->esp_aes_gcm.ctx); break; + case IB_FLOW_ACTION_UNSPECIFIED: + mlx5_ib_destroy_flow_action_raw(maction); + break; default: WARN_ON(true); break; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 320d4dfe8c2f..c26ea868b4f1 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -151,6 +151,10 @@ struct mlx5_ib_pd { u32 pdn; }; +enum { + MLX5_IB_FLOW_ACTION_MODIFY_HEADER, +}; + #define MLX5_IB_FLOW_MCAST_PRIO (MLX5_BY_PASS_NUM_PRIOS - 1) #define MLX5_IB_FLOW_LAST_PRIO (MLX5_BY_PASS_NUM_REGULAR_PRIOS - 1) #if (MLX5_IB_FLOW_LAST_PRIO <= 0) @@ -814,6 +818,11 @@ struct mlx5_ib_flow_action { u64 ib_flags; struct mlx5_accel_esp_xfrm *ctx; } esp_aes_gcm; + struct { + struct mlx5_ib_dev *dev; + u32 sub_type; + u32 action_id; + } flow_action_raw; }; }; @@ -860,7 +869,7 @@ to_mcounters(struct ib_counters *ibcntrs) struct mlx5_ib_dev { struct ib_device ib_dev; - const struct uverbs_object_tree_def *driver_trees[6]; + const struct uverbs_object_tree_def *driver_trees[7]; struct mlx5_core_dev *mdev; struct mlx5_roce roce[MLX5_MAX_PORTS]; int num_ports; @@ -1238,6 +1247,7 @@ struct mlx5_ib_flow_handler *mlx5_ib_raw_fs_rule_add( void *cmd_in, int inlen, int dest_id, int dest_type); bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type); int mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root); +void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction); #else static inline int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, @@ -1256,6 +1266,11 @@ mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root) { return 0; } +static inline void +mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction) +{ + return; +}; #endif static inline void init_query_mad(struct ib_smp *mad) { diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 9c51801b9e64..9c83e13c0e89 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -166,4 +166,14 @@ enum mlx5_ib_flow_methods { MLX5_IB_METHOD_DESTROY_FLOW, }; +enum mlx5_ib_flow_action_methods { + MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER = (1U << UVERBS_ID_NS_SHIFT), +}; + +enum mlx5_ib_create_flow_action_create_modify_header_attrs { + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_ACTIONS_PRM, + MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE, +}; + #endif diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index 8a2fb33f3ed4..ceb6d0d8529a 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -39,5 +39,10 @@ enum mlx5_ib_uapi_flow_action_flags { MLX5_IB_UAPI_FLOW_ACTION_FLAGS_REQUIRE_METADATA = 1 << 0, }; +enum mlx5_ib_uapi_flow_table_type { + MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX = 0x0, + MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX = 0x1, +}; + #endif From 841eefc5cb57030ad05a0c4bc285f93ffa668ad9 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 28 Aug 2018 14:18:52 +0300 Subject: [PATCH 023/242] RDMA/uverbs: Add generic function to fill in flow action object Refactor the initialization of a flow action object to a common function. 
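For illustration, the before/after call pattern looks roughly like this (a sketch distilled from the hunks below; the helper itself is added to include/rdma/uverbs_std_types.h by this patch):

	/* before: each creation handler open-coded the same stores */
	atomic_set(&action->usecnt, 0);
	action->device = ib_dev;
	action->type = IB_FLOW_ACTION_ESP;
	action->uobject = uobj;
	uobj->object = action;

	/* after: a single inline helper keeps the initialization in one place */
	uverbs_flow_action_fill_action(action, uobj, ib_dev, IB_FLOW_ACTION_ESP);

This keeps the usecnt initialization and the uobject back-pointer wiring identical across the ESP and raw flow action creation paths.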
Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- .../infiniband/core/uverbs_std_types_flow_action.c | 7 ++----- drivers/infiniband/hw/mlx5/flow.c | 8 +++----- include/rdma/uverbs_std_types.h | 12 ++++++++++++ 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/core/uverbs_std_types_flow_action.c b/drivers/infiniband/core/uverbs_std_types_flow_action.c index d8cfafe23bd9..cb9486ad5c67 100644 --- a/drivers/infiniband/core/uverbs_std_types_flow_action.c +++ b/drivers/infiniband/core/uverbs_std_types_flow_action.c @@ -326,11 +326,8 @@ static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)( if (IS_ERR(action)) return PTR_ERR(action); - atomic_set(&action->usecnt, 0); - action->device = ib_dev; - action->type = IB_FLOW_ACTION_ESP; - action->uobject = uobj; - uobj->object = action; + uverbs_flow_action_fill_action(action, uobj, ib_dev, + IB_FLOW_ACTION_ESP); return 0; } diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 02103a4b372c..0c89d5431c7e 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -279,11 +280,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)( if (IS_ERR(action)) return PTR_ERR(action); - atomic_set(&action->usecnt, 0); - action->device = uobj->context->device; - action->type = IB_FLOW_ACTION_UNSPECIFIED; - action->uobject = uobj; - uobj->object = action; + uverbs_flow_action_fill_action(action, uobj, uobj->context->device, + IB_FLOW_ACTION_UNSPECIFIED); return 0; } diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index 3b00231cc084..526d918fcd5a 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -140,5 +140,17 @@ __uobj_alloc(const struct uverbs_api_object *obj, struct ib_uverbs_file *ufile, #define uobj_alloc(_type, _ufile, _ib_dev) \ __uobj_alloc(uobj_get_type(_ufile, _type), _ufile, _ib_dev) +static inline void uverbs_flow_action_fill_action(struct ib_flow_action *action, + struct ib_uobject *uobj, + struct ib_device *ib_dev, + enum ib_flow_action_type type) +{ + atomic_set(&action->usecnt, 0); + action->device = ib_dev; + action->type = type; + action->uobject = uobj; + uobj->object = action; +} + #endif From 08aeb97cb82483192bd8ad8e60d1b73ce1b75923 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 28 Aug 2018 14:18:53 +0300 Subject: [PATCH 024/242] RDMA/mlx5: Add new flow action verb - packet reformat For now, only add L2_TUNNEL_TO_L2 option. This will allow to perform generic decap operation if the encapsulating protocol is L2 based, and the inner packet is also L2 based. For example this can be used to decap VXLAN packets. 
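Condensed, the validity rule the new handler enforces for this first reformat type is roughly the following (a sketch, not the literal code below):

	/* L2_TUNNEL_TO_L2 is a pure decap: accept it only on an RX flow
	 * table, and only when the device advertises the decap capability
	 * for the NIC RX flow table.
	 */
	if (ft_type != MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX ||
	    !MLX5_CAP_FLOWTABLE_NIC_RX(mdev->mdev, decap))
		return -EOPNOTSUPP;

No device object is allocated for this sub-type; the handler only records MLX5_IB_FLOW_ACTION_DECAP in the action, which is why the matching destroy case is empty.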
Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/flow.c | 76 ++++++++++++++++++++++- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + include/uapi/rdma/mlx5_user_ioctl_cmds.h | 7 +++ include/uapi/rdma/mlx5_user_ioctl_verbs.h | 4 ++ 4 files changed, 87 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 0c89d5431c7e..888f79d6a125 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -202,6 +202,8 @@ void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction) mlx5_modify_header_dealloc(maction->flow_action_raw.dev->mdev, maction->flow_action_raw.action_id); break; + case MLX5_IB_FLOW_ACTION_DECAP: + break; default: break; } @@ -286,6 +288,64 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)( return 0; } +static bool mlx5_ib_flow_action_packet_reformat_valid(struct mlx5_ib_dev *ibdev, + u8 packet_reformat_type, + u8 ft_type) +{ + switch (packet_reformat_type) { + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2: + if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX) + return MLX5_CAP_FLOWTABLE_NIC_RX(ibdev->mdev, decap); + break; + default: + break; + } + + return false; +} + +static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT)( + struct ib_uverbs_file *file, + struct uverbs_attr_bundle *attrs) +{ + struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE); + struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device); + enum mlx5_ib_uapi_flow_action_packet_reformat_type dv_prt; + enum mlx5_ib_uapi_flow_table_type ft_type; + struct mlx5_ib_flow_action *maction; + int ret; + + ret = uverbs_get_const(&ft_type, attrs, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE); + if (ret) + return ret; + + ret = uverbs_get_const(&dv_prt, attrs, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE); + if (ret) + return ret; + + if (!mlx5_ib_flow_action_packet_reformat_valid(mdev, dv_prt, ft_type)) + return -EOPNOTSUPP; + + maction = kzalloc(sizeof(*maction), GFP_KERNEL); + if (!maction) + return -ENOMEM; + + if (dv_prt == + MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2) { + maction->flow_action_raw.sub_type = + MLX5_IB_FLOW_ACTION_DECAP; + maction->flow_action_raw.dev = mdev; + } + + uverbs_flow_action_fill_action(&maction->ib_action, uobj, + uobj->context->device, + IB_FLOW_ACTION_UNSPECIFIED); + return 0; +} + DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_CREATE_FLOW, UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_HANDLE, @@ -335,10 +395,24 @@ DECLARE_UVERBS_NAMED_METHOD( enum mlx5_ib_uapi_flow_table_type, UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT, + UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_NEW, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE, + enum mlx5_ib_uapi_flow_action_packet_reformat_type, + UA_MANDATORY), + UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE, + enum mlx5_ib_uapi_flow_table_type, + UA_MANDATORY)); + ADD_UVERBS_METHODS( mlx5_ib_flow_actions, UVERBS_OBJECT_FLOW_ACTION, - &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)); + &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER), + &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT)); DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_FLOW_MATCHER_CREATE, 
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index c26ea868b4f1..8ac84cc00fd5 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -153,6 +153,7 @@ struct mlx5_ib_pd { enum { MLX5_IB_FLOW_ACTION_MODIFY_HEADER, + MLX5_IB_FLOW_ACTION_DECAP, }; #define MLX5_IB_FLOW_MCAST_PRIO (MLX5_BY_PASS_NUM_PRIOS - 1) diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 9c83e13c0e89..40db7fca3d0b 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -168,6 +168,7 @@ enum mlx5_ib_flow_methods { enum mlx5_ib_flow_action_methods { MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT, }; enum mlx5_ib_create_flow_action_create_modify_header_attrs { @@ -176,4 +177,10 @@ enum mlx5_ib_create_flow_action_create_modify_header_attrs { MLX5_IB_ATTR_CREATE_MODIFY_HEADER_FT_TYPE, }; +enum mlx5_ib_create_flow_action_create_packet_reformat_attrs { + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE, +}; + #endif diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index ceb6d0d8529a..b5fda0fcd484 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -44,5 +44,9 @@ enum mlx5_ib_uapi_flow_table_type { MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX = 0x1, }; +enum mlx5_ib_uapi_flow_action_packet_reformat_type { + MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2 = 0x0, +}; + #endif From a090d0d859ff88dd4c34614d01cee9b0603f4313 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Tue, 28 Aug 2018 14:18:54 +0300 Subject: [PATCH 025/242] RDMA/mlx5: Extend packet reformat verbs We expose new actions: L2_TO_L2_TUNNEL - A generic encap from L2 to L2, the data passed should be the encapsulating headers. L3_TUNNEL_TO_L2 - Will do decap where the inner packet starts from L3, the data should be mac or mac + vlan (14 or 18 bytes). L2_TO_L3_TUNNEL - Will do encap where is L2 of the original packet will not be included, the data should be the encapsulating header. 
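As an illustration of the expected data layout (hypothetical, not part of this patch): for an L2_TO_L2_TUNNEL encap of VXLAN, the data buffer would carry the complete outer headers exactly as they should precede the original L2 packet on the wire. The struct name below is made up for the example:

	/* sketch of the DATA_BUF payload for a VXLAN L2_TO_L2_TUNNEL encap;
	 * the field values (MACs, IP addresses, UDP dport 4789, VNI) are
	 * chosen by the caller.
	 */
	struct vxlan_encap_data {
		struct ethhdr	eth;	/* outer Ethernet header */
		struct iphdr	ip;	/* outer IPv4 header */
		struct udphdr	udp;	/* outer UDP header */
		struct vxlanhdr	vxlan;	/* VXLAN header carrying the VNI */
	} __packed;

Internally each uapi constant maps one-to-one onto a PRM reformat type (MLX5_REFORMAT_TYPE_*), gated on encap_general_header / reformat_l2_to_l3_tunnel for the TX-side types and on reformat_l3_tunnel_to_l2 for the RX-side decap.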
Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/flow.c | 95 +++++++++++++++++++++++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + include/uapi/rdma/mlx5_user_ioctl_cmds.h | 1 + include/uapi/rdma/mlx5_user_ioctl_verbs.h | 3 + 4 files changed, 100 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 888f79d6a125..5750a650884e 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -202,6 +202,10 @@ void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction) mlx5_modify_header_dealloc(maction->flow_action_raw.dev->mdev, maction->flow_action_raw.action_id); break; + case MLX5_IB_FLOW_ACTION_PACKET_REFORMAT: + mlx5_packet_reformat_dealloc(maction->flow_action_raw.dev->mdev, + maction->flow_action_raw.action_id); + break; case MLX5_IB_FLOW_ACTION_DECAP: break; default: @@ -293,6 +297,21 @@ static bool mlx5_ib_flow_action_packet_reformat_valid(struct mlx5_ib_dev *ibdev, u8 ft_type) { switch (packet_reformat_type) { + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL: + if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX) + return MLX5_CAP_FLOWTABLE(ibdev->mdev, + encap_general_header); + break; + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL: + if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX) + return MLX5_CAP_FLOWTABLE_NIC_TX(ibdev->mdev, + reformat_l2_to_l3_tunnel); + break; + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2: + if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX) + return MLX5_CAP_FLOWTABLE_NIC_RX(ibdev->mdev, + reformat_l3_tunnel_to_l2); + break; case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2: if (ft_type == MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX) return MLX5_CAP_FLOWTABLE_NIC_RX(ibdev->mdev, decap); @@ -304,6 +323,56 @@ static bool mlx5_ib_flow_action_packet_reformat_valid(struct mlx5_ib_dev *ibdev, return false; } +static int mlx5_ib_dv_to_prm_packet_reforamt_type(u8 dv_prt, u8 *prm_prt) +{ + switch (dv_prt) { + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL: + *prm_prt = MLX5_REFORMAT_TYPE_L2_TO_L2_TUNNEL; + break; + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2: + *prm_prt = MLX5_REFORMAT_TYPE_L3_TUNNEL_TO_L2; + break; + case MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL: + *prm_prt = MLX5_REFORMAT_TYPE_L2_TO_L3_TUNNEL; + break; + default: + return -EINVAL; + } + + return 0; +} + +static int mlx5_ib_flow_action_create_packet_reformat_ctx( + struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_action *maction, + u8 ft_type, u8 dv_prt, + void *in, size_t len) +{ + enum mlx5_flow_namespace_type namespace; + u8 prm_prt; + int ret; + + ret = mlx5_ib_ft_type_to_namespace(ft_type, &namespace); + if (ret) + return ret; + + ret = mlx5_ib_dv_to_prm_packet_reforamt_type(dv_prt, &prm_prt); + if (ret) + return ret; + + ret = mlx5_packet_reformat_alloc(dev->mdev, prm_prt, len, + in, namespace, + &maction->flow_action_raw.action_id); + if (ret) + return ret; + + maction->flow_action_raw.sub_type = + MLX5_IB_FLOW_ACTION_PACKET_REFORMAT; + maction->flow_action_raw.dev = dev; + + return 0; +} + static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT)( struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) @@ -338,12 +407,34 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT)( maction->flow_action_raw.sub_type = MLX5_IB_FLOW_ACTION_DECAP; 
maction->flow_action_raw.dev = mdev; + } else { + void *in; + int len; + + in = uverbs_attr_get_alloced_ptr(attrs, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF); + if (IS_ERR(in)) { + ret = PTR_ERR(in); + goto free_maction; + } + + len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF); + + ret = mlx5_ib_flow_action_create_packet_reformat_ctx(mdev, + maction, ft_type, dv_prt, in, len); + if (ret) + goto free_maction; } uverbs_flow_action_fill_action(&maction->ib_action, uobj, uobj->context->device, IB_FLOW_ACTION_UNSPECIFIED); return 0; + +free_maction: + kfree(maction); + return ret; } DECLARE_UVERBS_NAMED_METHOD( @@ -401,6 +492,10 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_OBJECT_FLOW_ACTION, UVERBS_ACCESS_NEW, UA_MANDATORY), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF, + UVERBS_ATTR_MIN_SIZE(1), + UA_ALLOC_AND_COPY, + UA_OPTIONAL), UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE, enum mlx5_ib_uapi_flow_action_packet_reformat_type, UA_MANDATORY), diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 8ac84cc00fd5..eb6a0ca0247f 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -153,6 +153,7 @@ struct mlx5_ib_pd { enum { MLX5_IB_FLOW_ACTION_MODIFY_HEADER, + MLX5_IB_FLOW_ACTION_PACKET_REFORMAT, MLX5_IB_FLOW_ACTION_DECAP, }; diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 40db7fca3d0b..75c7093fd95b 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -181,6 +181,7 @@ enum mlx5_ib_create_flow_action_create_packet_reformat_attrs { MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_TYPE, MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_FT_TYPE, + MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF, }; #endif diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index b5fda0fcd484..4ef62c0e8452 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -46,6 +46,9 @@ enum mlx5_ib_uapi_flow_table_type { enum mlx5_ib_uapi_flow_action_packet_reformat_type { MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2 = 0x0, + MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL = 0x1, + MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2 = 0x2, + MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x3, }; #endif From 08920b8f5d2d3b6eb8bc118923f707c769704c77 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 10 Aug 2018 11:42:46 -0700 Subject: [PATCH 026/242] RDMA/bnxt_re: QPLIB: Add and use #define dev_fmt(fmt) "QPLIB: " fmt Consistently use the "QPLIB: " prefix for dev_ logging. 
Miscellanea: o Add missing newlines to avoid possible message interleaving o Coalesce consecutive dev_ uses that emit a message header to avoid < 80 column lengths and mistakenly output on multiple lines o Reflow modified lines to use 80 columns where appropriate o Consistently use "%s: " where __func__ is output o QPLIB: is now always output immediately after the dev_ header Signed-off-by: Joe Perches Acked-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_fp.c | 130 ++++++++------------- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 51 ++++---- drivers/infiniband/hw/bnxt_re/qplib_res.c | 29 +++-- drivers/infiniband/hw/bnxt_re/qplib_sp.c | 57 +++++---- 4 files changed, 116 insertions(+), 151 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c index e426b990c1dd..f92605aa42b4 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c @@ -36,6 +36,8 @@ * Description: Fast Path Operators */ +#define dev_fmt(fmt) "QPLIB: " fmt + #include #include #include @@ -71,8 +73,7 @@ static void __bnxt_qplib_add_flush_qp(struct bnxt_qplib_qp *qp) if (!qp->sq.flushed) { dev_dbg(&scq->hwq.pdev->dev, - "QPLIB: FP: Adding to SQ Flush list = %p", - qp); + "FP: Adding to SQ Flush list = %p\n", qp); bnxt_qplib_cancel_phantom_processing(qp); list_add_tail(&qp->sq_flush, &scq->sqf_head); qp->sq.flushed = true; @@ -80,8 +81,7 @@ static void __bnxt_qplib_add_flush_qp(struct bnxt_qplib_qp *qp) if (!qp->srq) { if (!qp->rq.flushed) { dev_dbg(&rcq->hwq.pdev->dev, - "QPLIB: FP: Adding to RQ Flush list = %p", - qp); + "FP: Adding to RQ Flush list = %p\n", qp); list_add_tail(&qp->rq_flush, &rcq->rqf_head); qp->rq.flushed = true; } @@ -207,7 +207,7 @@ static int bnxt_qplib_alloc_qp_hdr_buf(struct bnxt_qplib_res *res, if (!qp->sq_hdr_buf) { rc = -ENOMEM; dev_err(&res->pdev->dev, - "QPLIB: Failed to create sq_hdr_buf"); + "Failed to create sq_hdr_buf\n"); goto fail; } } @@ -221,7 +221,7 @@ static int bnxt_qplib_alloc_qp_hdr_buf(struct bnxt_qplib_res *res, if (!qp->rq_hdr_buf) { rc = -ENOMEM; dev_err(&res->pdev->dev, - "QPLIB: Failed to create rq_hdr_buf"); + "Failed to create rq_hdr_buf\n"); goto fail; } } @@ -277,8 +277,7 @@ static void bnxt_qplib_service_nq(unsigned long data) num_cqne_processed++; else dev_warn(&nq->pdev->dev, - "QPLIB: cqn - type 0x%x not handled", - type); + "cqn - type 0x%x not handled\n", type); spin_unlock_bh(&cq->compl_lock); break; } @@ -298,7 +297,7 @@ static void bnxt_qplib_service_nq(unsigned long data) num_srqne_processed++; else dev_warn(&nq->pdev->dev, - "QPLIB: SRQ event 0x%x not handled", + "SRQ event 0x%x not handled\n", nqsrqe->event); break; } @@ -306,8 +305,7 @@ static void bnxt_qplib_service_nq(unsigned long data) break; default: dev_warn(&nq->pdev->dev, - "QPLIB: nqe with type = 0x%x not handled", - type); + "nqe with type = 0x%x not handled\n", type); break; } raw_cons++; @@ -396,7 +394,7 @@ int bnxt_qplib_nq_start_irq(struct bnxt_qplib_nq *nq, int nq_indx, rc = irq_set_affinity_hint(nq->vector, &nq->mask); if (rc) { dev_warn(&nq->pdev->dev, - "QPLIB: set affinity failed; vector: %d nq_idx: %d\n", + "set affinity failed; vector: %d nq_idx: %d\n", nq->vector, nq_indx); } nq->requested = true; @@ -443,7 +441,7 @@ int bnxt_qplib_enable_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq, rc = bnxt_qplib_nq_start_irq(nq, nq_idx, msix_vector, true); if (rc) { dev_err(&nq->pdev->dev, - "QPLIB: Failed to request irq for nq-idx %d", nq_idx); + "Failed to 
request irq for nq-idx %d\n", nq_idx); goto fail; } @@ -662,8 +660,8 @@ int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq, spin_lock(&srq_hwq->lock); if (srq->start_idx == srq->last_idx) { - dev_err(&srq_hwq->pdev->dev, "QPLIB: FP: SRQ (0x%x) is full!", - srq->id); + dev_err(&srq_hwq->pdev->dev, + "FP: SRQ (0x%x) is full!\n", srq->id); rc = -EINVAL; spin_unlock(&srq_hwq->lock); goto done; @@ -1324,7 +1322,7 @@ int bnxt_qplib_query_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp) } } if (i == res->sgid_tbl.max) - dev_warn(&res->pdev->dev, "QPLIB: SGID not found??"); + dev_warn(&res->pdev->dev, "SGID not found??\n"); qp->ah.hop_limit = sb->hop_limit; qp->ah.traffic_class = sb->traffic_class; @@ -1536,7 +1534,7 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, if (bnxt_qplib_queue_full(sq)) { dev_err(&sq->hwq.pdev->dev, - "QPLIB: prod = %#x cons = %#x qdepth = %#x delta = %#x", + "prod = %#x cons = %#x qdepth = %#x delta = %#x\n", sq->hwq.prod, sq->hwq.cons, sq->hwq.max_elements, sq->q_full_delta); rc = -ENOMEM; @@ -1561,7 +1559,7 @@ int bnxt_qplib_post_send(struct bnxt_qplib_qp *qp, /* Copy the inline data */ if (wqe->inline_len > BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH) { dev_warn(&sq->hwq.pdev->dev, - "QPLIB: Inline data length > 96 detected"); + "Inline data length > 96 detected\n"); data_len = BNXT_QPLIB_SWQE_MAX_INLINE_LENGTH; } else { data_len = wqe->inline_len; @@ -1776,7 +1774,7 @@ done: queue_work(qp->scq->nq->cqn_wq, &nq_work->work); } else { dev_err(&sq->hwq.pdev->dev, - "QPLIB: FP: Failed to allocate SQ nq_work!"); + "FP: Failed to allocate SQ nq_work!\n"); rc = -ENOMEM; } } @@ -1815,13 +1813,12 @@ int bnxt_qplib_post_recv(struct bnxt_qplib_qp *qp, if (qp->state == CMDQ_MODIFY_QP_NEW_STATE_ERR) { sch_handler = true; dev_dbg(&rq->hwq.pdev->dev, - "%s Error QP. Scheduling for poll_cq\n", - __func__); + "%s: Error QP. Scheduling for poll_cq\n", __func__); goto queue_err; } if (bnxt_qplib_queue_full(rq)) { dev_err(&rq->hwq.pdev->dev, - "QPLIB: FP: QP (0x%x) RQ is full!", qp->id); + "FP: QP (0x%x) RQ is full!\n", qp->id); rc = -EINVAL; goto done; } @@ -1870,7 +1867,7 @@ queue_err: queue_work(qp->rcq->nq->cqn_wq, &nq_work->work); } else { dev_err(&rq->hwq.pdev->dev, - "QPLIB: FP: Failed to allocate RQ nq_work!"); + "FP: Failed to allocate RQ nq_work!\n"); rc = -ENOMEM; } } @@ -1932,7 +1929,7 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq) if (!cq->dpi) { dev_err(&rcfw->pdev->dev, - "QPLIB: FP: CREATE_CQ failed due to NULL DPI"); + "FP: CREATE_CQ failed due to NULL DPI\n"); return -EINVAL; } req.dpi = cpu_to_le32(cq->dpi->dpi); @@ -2172,7 +2169,7 @@ static int do_wa9060(struct bnxt_qplib_qp *qp, struct bnxt_qplib_cq *cq, * comes back */ dev_dbg(&cq->hwq.pdev->dev, - "FP:Got Phantom CQE"); + "FP: Got Phantom CQE\n"); sq->condition = false; sq->single = true; rc = 0; @@ -2189,7 +2186,7 @@ static int do_wa9060(struct bnxt_qplib_qp *qp, struct bnxt_qplib_cq *cq, peek_raw_cq_cons++; } dev_err(&cq->hwq.pdev->dev, - "Should not have come here! cq_cons=0x%x qp=0x%x sq cons sw=0x%x hw=0x%x", + "Should not have come here! 
cq_cons=0x%x qp=0x%x sq cons sw=0x%x hw=0x%x\n", cq_cons, qp->id, sw_sq_cons, cqe_sq_cons); rc = -EINVAL; } @@ -2213,7 +2210,7 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, le64_to_cpu(hwcqe->qp_handle)); if (!qp) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: Process Req qp is NULL"); + "FP: Process Req qp is NULL\n"); return -EINVAL; } sq = &qp->sq; @@ -2221,16 +2218,14 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, cqe_sq_cons = HWQ_CMP(le16_to_cpu(hwcqe->sq_cons_idx), &sq->hwq); if (cqe_sq_cons > sq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process req reported "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: sq_cons_idx 0x%x which exceeded max 0x%x", + "FP: CQ Process req reported sq_cons_idx 0x%x which exceeded max 0x%x\n", cqe_sq_cons, sq->hwq.max_elements); return -EINVAL; } if (qp->sq.flushed) { dev_dbg(&cq->hwq.pdev->dev, - "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + "%s: QP in Flush QP = %p\n", __func__, qp); goto done; } /* Require to walk the sq's swq to fabricate CQEs for all previously @@ -2262,9 +2257,7 @@ static int bnxt_qplib_cq_process_req(struct bnxt_qplib_cq *cq, hwcqe->status != CQ_REQ_STATUS_OK) { cqe->status = hwcqe->status; dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Processed Req "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: wr_id[%d] = 0x%llx with status 0x%x", + "FP: CQ Processed Req wr_id[%d] = 0x%llx with status 0x%x\n", sw_sq_cons, cqe->wr_id, cqe->status); cqe++; (*budget)--; @@ -2330,12 +2323,12 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, qp = (struct bnxt_qplib_qp *)((unsigned long) le64_to_cpu(hwcqe->qp_handle)); if (!qp) { - dev_err(&cq->hwq.pdev->dev, "QPLIB: process_cq RC qp is NULL"); + dev_err(&cq->hwq.pdev->dev, "process_cq RC qp is NULL\n"); return -EINVAL; } if (qp->rq.flushed) { dev_dbg(&cq->hwq.pdev->dev, - "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + "%s: QP in Flush QP = %p\n", __func__, qp); goto done; } @@ -2356,9 +2349,7 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, return -EINVAL; if (wr_id_idx >= srq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process RC "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x", + "FP: CQ Process RC wr_id idx 0x%x exceeded SRQ max 0x%x\n", wr_id_idx, srq->hwq.max_elements); return -EINVAL; } @@ -2371,9 +2362,7 @@ static int bnxt_qplib_cq_process_res_rc(struct bnxt_qplib_cq *cq, rq = &qp->rq; if (wr_id_idx >= rq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process RC "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: wr_id idx 0x%x exceeded RQ max 0x%x", + "FP: CQ Process RC wr_id idx 0x%x exceeded RQ max 0x%x\n", wr_id_idx, rq->hwq.max_elements); return -EINVAL; } @@ -2409,12 +2398,12 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, qp = (struct bnxt_qplib_qp *)((unsigned long) le64_to_cpu(hwcqe->qp_handle)); if (!qp) { - dev_err(&cq->hwq.pdev->dev, "QPLIB: process_cq UD qp is NULL"); + dev_err(&cq->hwq.pdev->dev, "process_cq UD qp is NULL\n"); return -EINVAL; } if (qp->rq.flushed) { dev_dbg(&cq->hwq.pdev->dev, - "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + "%s: QP in Flush QP = %p\n", __func__, qp); goto done; } cqe = *pcqe; @@ -2439,9 +2428,7 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, if (wr_id_idx >= srq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process UD "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x", + "FP: CQ 
Process UD wr_id idx 0x%x exceeded SRQ max 0x%x\n", wr_id_idx, srq->hwq.max_elements); return -EINVAL; } @@ -2454,9 +2441,7 @@ static int bnxt_qplib_cq_process_res_ud(struct bnxt_qplib_cq *cq, rq = &qp->rq; if (wr_id_idx >= rq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process UD "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: wr_id idx 0x%x exceeded RQ max 0x%x", + "FP: CQ Process UD wr_id idx 0x%x exceeded RQ max 0x%x\n", wr_id_idx, rq->hwq.max_elements); return -EINVAL; } @@ -2508,13 +2493,12 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, qp = (struct bnxt_qplib_qp *)((unsigned long) le64_to_cpu(hwcqe->qp_handle)); if (!qp) { - dev_err(&cq->hwq.pdev->dev, - "QPLIB: process_cq Raw/QP1 qp is NULL"); + dev_err(&cq->hwq.pdev->dev, "process_cq Raw/QP1 qp is NULL\n"); return -EINVAL; } if (qp->rq.flushed) { dev_dbg(&cq->hwq.pdev->dev, - "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + "%s: QP in Flush QP = %p\n", __func__, qp); goto done; } cqe = *pcqe; @@ -2543,14 +2527,12 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, srq = qp->srq; if (!srq) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: SRQ used but not defined??"); + "FP: SRQ used but not defined??\n"); return -EINVAL; } if (wr_id_idx >= srq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process Raw/QP1 "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: wr_id idx 0x%x exceeded SRQ max 0x%x", + "FP: CQ Process Raw/QP1 wr_id idx 0x%x exceeded SRQ max 0x%x\n", wr_id_idx, srq->hwq.max_elements); return -EINVAL; } @@ -2563,9 +2545,7 @@ static int bnxt_qplib_cq_process_res_raweth_qp1(struct bnxt_qplib_cq *cq, rq = &qp->rq; if (wr_id_idx >= rq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process Raw/QP1 RQ wr_id "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: ix 0x%x exceeded RQ max 0x%x", + "FP: CQ Process Raw/QP1 RQ wr_id idx 0x%x exceeded RQ max 0x%x\n", wr_id_idx, rq->hwq.max_elements); return -EINVAL; } @@ -2600,14 +2580,14 @@ static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq, /* Check the Status */ if (hwcqe->status != CQ_TERMINAL_STATUS_OK) dev_warn(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process Terminal Error status = 0x%x", + "FP: CQ Process Terminal Error status = 0x%x\n", hwcqe->status); qp = (struct bnxt_qplib_qp *)((unsigned long) le64_to_cpu(hwcqe->qp_handle)); if (!qp) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process terminal qp is NULL"); + "FP: CQ Process terminal qp is NULL\n"); return -EINVAL; } @@ -2623,16 +2603,14 @@ static int bnxt_qplib_cq_process_terminal(struct bnxt_qplib_cq *cq, if (cqe_cons > sq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process terminal reported "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: sq_cons_idx 0x%x which exceeded max 0x%x", + "FP: CQ Process terminal reported sq_cons_idx 0x%x which exceeded max 0x%x\n", cqe_cons, sq->hwq.max_elements); goto do_rq; } if (qp->sq.flushed) { dev_dbg(&cq->hwq.pdev->dev, - "%s: QPLIB: QP in Flush QP = %p\n", __func__, qp); + "%s: QP in Flush QP = %p\n", __func__, qp); goto sq_done; } @@ -2673,16 +2651,14 @@ do_rq: goto done; } else if (cqe_cons > rq->hwq.max_elements) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Processed terminal "); - dev_err(&cq->hwq.pdev->dev, - "QPLIB: reported rq_cons_idx 0x%x exceeds max 0x%x", + "FP: CQ Processed terminal reported rq_cons_idx 0x%x exceeds max 0x%x\n", cqe_cons, rq->hwq.max_elements); goto done; } if (qp->rq.flushed) { dev_dbg(&cq->hwq.pdev->dev, - "%s: QPLIB: QP in Flush 
QP = %p\n", __func__, qp); + "%s: QP in Flush QP = %p\n", __func__, qp); rc = 0; goto done; } @@ -2704,7 +2680,7 @@ static int bnxt_qplib_cq_process_cutoff(struct bnxt_qplib_cq *cq, /* Check the Status */ if (hwcqe->status != CQ_CUTOFF_STATUS_OK) { dev_err(&cq->hwq.pdev->dev, - "QPLIB: FP: CQ Process Cutoff Error status = 0x%x", + "FP: CQ Process Cutoff Error status = 0x%x\n", hwcqe->status); return -EINVAL; } @@ -2724,16 +2700,12 @@ int bnxt_qplib_process_flush_list(struct bnxt_qplib_cq *cq, spin_lock_irqsave(&cq->flush_lock, flags); list_for_each_entry(qp, &cq->sqf_head, sq_flush) { - dev_dbg(&cq->hwq.pdev->dev, - "QPLIB: FP: Flushing SQ QP= %p", - qp); + dev_dbg(&cq->hwq.pdev->dev, "FP: Flushing SQ QP= %p\n", qp); __flush_sq(&qp->sq, qp, &cqe, &budget); } list_for_each_entry(qp, &cq->rqf_head, rq_flush) { - dev_dbg(&cq->hwq.pdev->dev, - "QPLIB: FP: Flushing RQ QP= %p", - qp); + dev_dbg(&cq->hwq.pdev->dev, "FP: Flushing RQ QP= %p\n", qp); __flush_rq(&qp->rq, qp, &cqe, &budget); } spin_unlock_irqrestore(&cq->flush_lock, flags); @@ -2801,7 +2773,7 @@ int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe, goto exit; default: dev_err(&cq->hwq.pdev->dev, - "QPLIB: process_cq unknown type 0x%lx", + "process_cq unknown type 0x%lx\n", hw_cqe->cqe_type_toggle & CQ_BASE_CQE_TYPE_MASK); rc = -EINVAL; @@ -2814,7 +2786,7 @@ int bnxt_qplib_poll_cq(struct bnxt_qplib_cq *cq, struct bnxt_qplib_cqe *cqe, * next one */ dev_err(&cq->hwq.pdev->dev, - "QPLIB: process_cqe error rc = 0x%x", rc); + "process_cqe error rc = 0x%x\n", rc); } raw_cons++; } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 2852d350ada1..14105004d258 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -35,6 +35,9 @@ * * Description: RDMA Controller HW interface */ + +#define dev_fmt(fmt) "QPLIB: " fmt + #include #include #include @@ -96,14 +99,13 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req, opcode != CMDQ_BASE_OPCODE_INITIALIZE_FW && opcode != CMDQ_BASE_OPCODE_QUERY_VERSION)) { dev_err(&rcfw->pdev->dev, - "QPLIB: RCFW not initialized, reject opcode 0x%x", - opcode); + "RCFW not initialized, reject opcode 0x%x\n", opcode); return -EINVAL; } if (test_bit(FIRMWARE_INITIALIZED_FLAG, &rcfw->flags) && opcode == CMDQ_BASE_OPCODE_INITIALIZE_FW) { - dev_err(&rcfw->pdev->dev, "QPLIB: RCFW already initialized!"); + dev_err(&rcfw->pdev->dev, "RCFW already initialized!\n"); return -EINVAL; } @@ -115,7 +117,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req, */ spin_lock_irqsave(&cmdq->lock, flags); if (req->cmd_size >= HWQ_FREE_SLOTS(cmdq)) { - dev_err(&rcfw->pdev->dev, "QPLIB: RCFW: CMDQ is full!"); + dev_err(&rcfw->pdev->dev, "RCFW: CMDQ is full!\n"); spin_unlock_irqrestore(&cmdq->lock, flags); return -EAGAIN; } @@ -154,7 +156,7 @@ static int __send_message(struct bnxt_qplib_rcfw *rcfw, struct cmdq_base *req, cmdqe = &cmdq_ptr[get_cmdq_pg(sw_prod)][get_cmdq_idx(sw_prod)]; if (!cmdqe) { dev_err(&rcfw->pdev->dev, - "QPLIB: RCFW request failed with no cmdqe!"); + "RCFW request failed with no cmdqe!\n"); goto done; } /* Copy a segment of the req cmd to the cmdq */ @@ -210,7 +212,7 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, if (!retry_cnt || (rc != -EAGAIN && rc != -EBUSY)) { /* send failed */ - dev_err(&rcfw->pdev->dev, "QPLIB: cmdq[%#x]=%#x send failed", + dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x send failed\n", cookie, 
opcode); return rc; } @@ -224,7 +226,7 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, rc = __wait_for_resp(rcfw, cookie); if (rc) { /* timed out */ - dev_err(&rcfw->pdev->dev, "QPLIB: cmdq[%#x]=%#x timedout (%d)msec", + dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x timedout (%d)msec\n", cookie, opcode, RCFW_CMD_WAIT_TIME_MS); set_bit(FIRMWARE_TIMED_OUT, &rcfw->flags); return rc; @@ -232,7 +234,7 @@ int bnxt_qplib_rcfw_send_message(struct bnxt_qplib_rcfw *rcfw, if (evnt->status) { /* failed with status */ - dev_err(&rcfw->pdev->dev, "QPLIB: cmdq[%#x]=%#x status %#x", + dev_err(&rcfw->pdev->dev, "cmdq[%#x]=%#x status %#x\n", cookie, opcode, evnt->status); rc = -EFAULT; } @@ -298,9 +300,9 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, qp_id = le32_to_cpu(err_event->xid); qp = rcfw->qp_tbl[qp_id].qp_handle; dev_dbg(&rcfw->pdev->dev, - "QPLIB: Received QP error notification"); + "Received QP error notification\n"); dev_dbg(&rcfw->pdev->dev, - "QPLIB: qpid 0x%x, req_err=0x%x, resp_err=0x%x\n", + "qpid 0x%x, req_err=0x%x, resp_err=0x%x\n", qp_id, err_event->req_err_state_reason, err_event->res_err_state_reason); if (!qp) @@ -323,13 +325,13 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, crsqe->resp = NULL; } else { dev_err(&rcfw->pdev->dev, - "QPLIB: CMD %s resp->cookie = %#x, evnt->cookie = %#x", + "CMD %s resp->cookie = %#x, evnt->cookie = %#x\n", crsqe->resp ? "mismatch" : "collision", crsqe->resp ? crsqe->resp->cookie : 0, mcookie); } if (!test_and_clear_bit(cbit, rcfw->cmdq_bitmap)) dev_warn(&rcfw->pdev->dev, - "QPLIB: CMD bit %d was not requested", cbit); + "CMD bit %d was not requested\n", cbit); cmdq->cons += crsqe->req_size; crsqe->req_size = 0; @@ -376,14 +378,13 @@ static void bnxt_qplib_service_creq(unsigned long data) (rcfw, (struct creq_func_event *)creqe)) rcfw->creq_func_event_processed++; else - dev_warn - (&rcfw->pdev->dev, "QPLIB:aeqe:%#x Not handled", - type); + dev_warn(&rcfw->pdev->dev, + "aeqe:%#x Not handled\n", type); break; default: - dev_warn(&rcfw->pdev->dev, "QPLIB: creqe with "); dev_warn(&rcfw->pdev->dev, - "QPLIB: op_event = 0x%x not handled", type); + "creqe with op_event = 0x%x not handled\n", + type); break; } raw_cons++; @@ -551,7 +552,7 @@ int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev, BNXT_QPLIB_CREQE_UNITS, 0, PAGE_SIZE, HWQ_TYPE_L2_CMPL)) { dev_err(&rcfw->pdev->dev, - "QPLIB: HW channel CREQ allocation failed"); + "HW channel CREQ allocation failed\n"); goto fail; } rcfw->cmdq.max_elements = BNXT_QPLIB_CMDQE_MAX_CNT; @@ -560,7 +561,7 @@ int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev, BNXT_QPLIB_CMDQE_UNITS, 0, PAGE_SIZE, HWQ_TYPE_CTX)) { dev_err(&rcfw->pdev->dev, - "QPLIB: HW channel CMDQ allocation failed"); + "HW channel CMDQ allocation failed\n"); goto fail; } @@ -616,7 +617,7 @@ void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw) indx = find_first_bit(rcfw->cmdq_bitmap, rcfw->bmap_size); if (indx != rcfw->bmap_size) dev_err(&rcfw->pdev->dev, - "QPLIB: disabling RCFW with pending cmd-bit %lx", indx); + "disabling RCFW with pending cmd-bit %lx\n", indx); kfree(rcfw->cmdq_bitmap); rcfw->bmap_size = 0; @@ -681,8 +682,7 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev, RCFW_COMM_BASE_OFFSET, RCFW_COMM_SIZE); if (!rcfw->cmdq_bar_reg_iomem) { - dev_err(&rcfw->pdev->dev, - "QPLIB: CMDQ BAR region %d mapping failed", + dev_err(&rcfw->pdev->dev, "CMDQ BAR region %d mapping failed\n", rcfw->cmdq_bar_reg); return -ENOMEM; } @@ -697,13 +697,12 @@ int 
bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev, res_base = pci_resource_start(pdev, rcfw->creq_bar_reg); if (!res_base) dev_err(&rcfw->pdev->dev, - "QPLIB: CREQ BAR region %d resc start is 0!", + "CREQ BAR region %d resc start is 0!\n", rcfw->creq_bar_reg); rcfw->creq_bar_reg_iomem = ioremap_nocache(res_base + cp_bar_reg_off, 4); if (!rcfw->creq_bar_reg_iomem) { - dev_err(&rcfw->pdev->dev, - "QPLIB: CREQ BAR region %d mapping failed", + dev_err(&rcfw->pdev->dev, "CREQ BAR region %d mapping failed\n", rcfw->creq_bar_reg); return -ENOMEM; } @@ -717,7 +716,7 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev, rc = bnxt_qplib_rcfw_start_irq(rcfw, msix_vector, true); if (rc) { dev_err(&rcfw->pdev->dev, - "QPLIB: Failed to request IRQ for CREQ rc = 0x%x", rc); + "Failed to request IRQ for CREQ rc = 0x%x\n", rc); bnxt_qplib_disable_rcfw_channel(rcfw); return rc; } diff --git a/drivers/infiniband/hw/bnxt_re/qplib_res.c b/drivers/infiniband/hw/bnxt_re/qplib_res.c index 539a5d44e6db..59eeac55626f 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_res.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_res.c @@ -36,6 +36,8 @@ * Description: QPLib resource manager */ +#define dev_fmt(fmt) "QPLIB: " fmt + #include #include #include @@ -68,8 +70,7 @@ static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl, pbl->pg_map_arr[i]); else dev_warn(&pdev->dev, - "QPLIB: PBL free pg_arr[%d] empty?!", - i); + "PBL free pg_arr[%d] empty?!\n", i); pbl->pg_arr[i] = NULL; } } @@ -537,7 +538,7 @@ static void bnxt_qplib_free_pkey_tbl(struct bnxt_qplib_res *res, struct bnxt_qplib_pkey_tbl *pkey_tbl) { if (!pkey_tbl->tbl) - dev_dbg(&res->pdev->dev, "QPLIB: PKEY tbl not present"); + dev_dbg(&res->pdev->dev, "PKEY tbl not present\n"); else kfree(pkey_tbl->tbl); @@ -578,7 +579,7 @@ int bnxt_qplib_dealloc_pd(struct bnxt_qplib_res *res, struct bnxt_qplib_pd *pd) { if (test_and_set_bit(pd->id, pdt->tbl)) { - dev_warn(&res->pdev->dev, "Freeing an unused PD? pdn = %d", + dev_warn(&res->pdev->dev, "Freeing an unused PD? pdn = %d\n", pd->id); return -EINVAL; } @@ -639,11 +640,11 @@ int bnxt_qplib_dealloc_dpi(struct bnxt_qplib_res *res, struct bnxt_qplib_dpi *dpi) { if (dpi->dpi >= dpit->max) { - dev_warn(&res->pdev->dev, "Invalid DPI? dpi = %d", dpi->dpi); + dev_warn(&res->pdev->dev, "Invalid DPI? dpi = %d\n", dpi->dpi); return -EINVAL; } if (test_and_set_bit(dpi->dpi, dpit->tbl)) { - dev_warn(&res->pdev->dev, "Freeing an unused DPI? dpi = %d", + dev_warn(&res->pdev->dev, "Freeing an unused DPI? 
dpi = %d\n", dpi->dpi); return -EINVAL; } @@ -673,22 +674,21 @@ static int bnxt_qplib_alloc_dpi_tbl(struct bnxt_qplib_res *res, u32 dbr_len, bytes; if (dpit->dbr_bar_reg_iomem) { - dev_err(&res->pdev->dev, - "QPLIB: DBR BAR region %d already mapped", dbr_bar_reg); + dev_err(&res->pdev->dev, "DBR BAR region %d already mapped\n", + dbr_bar_reg); return -EALREADY; } bar_reg_base = pci_resource_start(res->pdev, dbr_bar_reg); if (!bar_reg_base) { - dev_err(&res->pdev->dev, - "QPLIB: BAR region %d resc start failed", dbr_bar_reg); + dev_err(&res->pdev->dev, "BAR region %d resc start failed\n", + dbr_bar_reg); return -ENOMEM; } dbr_len = pci_resource_len(res->pdev, dbr_bar_reg) - dbr_offset; if (!dbr_len || ((dbr_len & (PAGE_SIZE - 1)) != 0)) { - dev_err(&res->pdev->dev, "QPLIB: Invalid DBR length %d", - dbr_len); + dev_err(&res->pdev->dev, "Invalid DBR length %d\n", dbr_len); return -ENOMEM; } @@ -696,8 +696,7 @@ static int bnxt_qplib_alloc_dpi_tbl(struct bnxt_qplib_res *res, dbr_len); if (!dpit->dbr_bar_reg_iomem) { dev_err(&res->pdev->dev, - "QPLIB: FP: DBR BAR region %d mapping failed", - dbr_bar_reg); + "FP: DBR BAR region %d mapping failed\n", dbr_bar_reg); return -ENOMEM; } @@ -767,7 +766,7 @@ static int bnxt_qplib_alloc_stats_ctx(struct pci_dev *pdev, stats->dma = dma_alloc_coherent(&pdev->dev, stats->size, &stats->dma_map, GFP_KERNEL); if (!stats->dma) { - dev_err(&pdev->dev, "QPLIB: Stats DMA allocation failed"); + dev_err(&pdev->dev, "Stats DMA allocation failed\n"); return -ENOMEM; } return 0; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index 4097f3fa25c5..c80f2eaaf1a3 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -36,6 +36,8 @@ * Description: Slow Path Operators */ +#define dev_fmt(fmt) "QPLIB: " fmt + #include #include #include @@ -89,7 +91,7 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw, sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb)); if (!sbuf) { dev_err(&rcfw->pdev->dev, - "QPLIB: SP: QUERY_FUNC alloc side buffer failed"); + "SP: QUERY_FUNC alloc side buffer failed\n"); return -ENOMEM; } @@ -186,8 +188,7 @@ int bnxt_qplib_set_func_resources(struct bnxt_qplib_res *res, (void *)&resp, NULL, 0); if (rc) { - dev_err(&res->pdev->dev, - "QPLIB: Failed to set function resources"); + dev_err(&res->pdev->dev, "Failed to set function resources\n"); } return rc; } @@ -199,7 +200,7 @@ int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res, { if (index >= sgid_tbl->max) { dev_err(&res->pdev->dev, - "QPLIB: Index %d exceeded SGID table max (%d)", + "Index %d exceeded SGID table max (%d)\n", index, sgid_tbl->max); return -EINVAL; } @@ -217,13 +218,12 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, int index; if (!sgid_tbl) { - dev_err(&res->pdev->dev, "QPLIB: SGID table not allocated"); + dev_err(&res->pdev->dev, "SGID table not allocated\n"); return -EINVAL; } /* Do we need a sgid_lock here? 
*/ if (!sgid_tbl->active) { - dev_err(&res->pdev->dev, - "QPLIB: SGID table has no active entries"); + dev_err(&res->pdev->dev, "SGID table has no active entries\n"); return -ENOMEM; } for (index = 0; index < sgid_tbl->max; index++) { @@ -231,7 +231,7 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, break; } if (index == sgid_tbl->max) { - dev_warn(&res->pdev->dev, "GID not found in the SGID table"); + dev_warn(&res->pdev->dev, "GID not found in the SGID table\n"); return 0; } /* Remove GID from the SGID table */ @@ -244,7 +244,7 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, RCFW_CMD_PREP(req, DELETE_GID, cmd_flags); if (sgid_tbl->hw_id[index] == 0xFFFF) { dev_err(&res->pdev->dev, - "QPLIB: GID entry contains an invalid HW id"); + "GID entry contains an invalid HW id\n"); return -EINVAL; } req.gid_index = cpu_to_le16(sgid_tbl->hw_id[index]); @@ -258,7 +258,7 @@ int bnxt_qplib_del_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, sgid_tbl->vlan[index] = 0; sgid_tbl->active--; dev_dbg(&res->pdev->dev, - "QPLIB: SGID deleted hw_id[0x%x] = 0x%x active = 0x%x", + "SGID deleted hw_id[0x%x] = 0x%x active = 0x%x\n", index, sgid_tbl->hw_id[index], sgid_tbl->active); sgid_tbl->hw_id[index] = (u16)-1; @@ -277,20 +277,19 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, int i, free_idx; if (!sgid_tbl) { - dev_err(&res->pdev->dev, "QPLIB: SGID table not allocated"); + dev_err(&res->pdev->dev, "SGID table not allocated\n"); return -EINVAL; } /* Do we need a sgid_lock here? */ if (sgid_tbl->active == sgid_tbl->max) { - dev_err(&res->pdev->dev, "QPLIB: SGID table is full"); + dev_err(&res->pdev->dev, "SGID table is full\n"); return -ENOMEM; } free_idx = sgid_tbl->max; for (i = 0; i < sgid_tbl->max; i++) { if (!memcmp(&sgid_tbl->tbl[i], gid, sizeof(*gid))) { dev_dbg(&res->pdev->dev, - "QPLIB: SGID entry already exist in entry %d!", - i); + "SGID entry already exist in entry %d!\n", i); *index = i; return -EALREADY; } else if (!memcmp(&sgid_tbl->tbl[i], &bnxt_qplib_gid_zero, @@ -301,7 +300,7 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, } if (free_idx == sgid_tbl->max) { dev_err(&res->pdev->dev, - "QPLIB: SGID table is FULL but count is not MAX??"); + "SGID table is FULL but count is not MAX??\n"); return -ENOMEM; } if (update) { @@ -348,7 +347,7 @@ int bnxt_qplib_add_sgid(struct bnxt_qplib_sgid_tbl *sgid_tbl, sgid_tbl->vlan[free_idx] = 1; dev_dbg(&res->pdev->dev, - "QPLIB: SGID added hw_id[0x%x] = 0x%x active = 0x%x", + "SGID added hw_id[0x%x] = 0x%x active = 0x%x\n", free_idx, sgid_tbl->hw_id[free_idx], sgid_tbl->active); *index = free_idx; @@ -404,7 +403,7 @@ int bnxt_qplib_get_pkey(struct bnxt_qplib_res *res, } if (index >= pkey_tbl->max) { dev_err(&res->pdev->dev, - "QPLIB: Index %d exceeded PKEY table max (%d)", + "Index %d exceeded PKEY table max (%d)\n", index, pkey_tbl->max); return -EINVAL; } @@ -419,14 +418,13 @@ int bnxt_qplib_del_pkey(struct bnxt_qplib_res *res, int i, rc = 0; if (!pkey_tbl) { - dev_err(&res->pdev->dev, "QPLIB: PKEY table not allocated"); + dev_err(&res->pdev->dev, "PKEY table not allocated\n"); return -EINVAL; } /* Do we need a pkey_lock here? 
*/ if (!pkey_tbl->active) { - dev_err(&res->pdev->dev, - "QPLIB: PKEY table has no active entries"); + dev_err(&res->pdev->dev, "PKEY table has no active entries\n"); return -ENOMEM; } for (i = 0; i < pkey_tbl->max; i++) { @@ -435,8 +433,7 @@ int bnxt_qplib_del_pkey(struct bnxt_qplib_res *res, } if (i == pkey_tbl->max) { dev_err(&res->pdev->dev, - "QPLIB: PKEY 0x%04x not found in the pkey table", - *pkey); + "PKEY 0x%04x not found in the pkey table\n", *pkey); return -ENOMEM; } memset(&pkey_tbl->tbl[i], 0, sizeof(*pkey)); @@ -453,13 +450,13 @@ int bnxt_qplib_add_pkey(struct bnxt_qplib_res *res, int i, free_idx, rc = 0; if (!pkey_tbl) { - dev_err(&res->pdev->dev, "QPLIB: PKEY table not allocated"); + dev_err(&res->pdev->dev, "PKEY table not allocated\n"); return -EINVAL; } /* Do we need a pkey_lock here? */ if (pkey_tbl->active == pkey_tbl->max) { - dev_err(&res->pdev->dev, "QPLIB: PKEY table is full"); + dev_err(&res->pdev->dev, "PKEY table is full\n"); return -ENOMEM; } free_idx = pkey_tbl->max; @@ -471,7 +468,7 @@ int bnxt_qplib_add_pkey(struct bnxt_qplib_res *res, } if (free_idx == pkey_tbl->max) { dev_err(&res->pdev->dev, - "QPLIB: PKEY table is FULL but count is not MAX??"); + "PKEY table is FULL but count is not MAX??\n"); return -ENOMEM; } /* Add PKEY to the pkey_tbl */ @@ -555,8 +552,7 @@ int bnxt_qplib_free_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw) int rc; if (mrw->lkey == 0xFFFFFFFF) { - dev_info(&res->pdev->dev, - "QPLIB: SP: Free a reserved lkey MRW"); + dev_info(&res->pdev->dev, "SP: Free a reserved lkey MRW\n"); return 0; } @@ -666,9 +662,8 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, pages++; if (pages > MAX_PBL_LVL_1_PGS) { - dev_err(&res->pdev->dev, "QPLIB: SP: Reg MR pages "); dev_err(&res->pdev->dev, - "requested (0x%x) exceeded max (0x%x)", + "SP: Reg MR pages requested (0x%x) exceeded max (0x%x)\n", pages, MAX_PBL_LVL_1_PGS); return -ENOMEM; } @@ -684,7 +679,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr, HWQ_TYPE_CTX); if (rc) { dev_err(&res->pdev->dev, - "SP: Reg MR memory allocation failed"); + "SP: Reg MR memory allocation failed\n"); return -ENOMEM; } /* Write to the hwq */ @@ -795,7 +790,7 @@ int bnxt_qplib_get_roce_stats(struct bnxt_qplib_rcfw *rcfw, sbuf = bnxt_qplib_rcfw_alloc_sbuf(rcfw, sizeof(*sb)); if (!sbuf) { dev_err(&rcfw->pdev->dev, - "QPLIB: SP: QUERY_ROCE_STATS alloc side buffer failed"); + "SP: QUERY_ROCE_STATS alloc side buffer failed\n"); return -ENOMEM; } From f794809a7259dfaa3d47d90ef5a86007cf48b1ce Mon Sep 17 00:00:00 2001 From: Jack Morgenstein Date: Mon, 27 Aug 2018 08:35:55 +0300 Subject: [PATCH 027/242] IB/core: Add an unbound WQ type to the new CQ API The upstream kernel commit cited below modified the workqueue in the new CQ API to be bound to a specific CPU (instead of being unbound). This caused ALL users of the new CQ API to use the same bound WQ. Specifically, MAD handling was severely delayed when the CPU bound to the WQ was busy handling (higher priority) interrupts. This caused a delay in the MAD "heartbeat" response handling, which resulted in ports being incorrectly classified as "down". To fix this, add a new "unbound" WQ type to the new CQ API, so that users have the option to choose either a bound WQ or an unbound WQ. For MADs, choose the new "unbound" WQ. 
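[ Note: an illustrative consumer of the new context; "ibdev", "priv" and the CQ sizing below are placeholders, not part of this patch:

	struct ib_cq *cq;

	cq = ib_alloc_cq(ibdev, priv, 128, 0, IB_POLL_UNBOUND_WORKQUEUE);
	if (IS_ERR(cq))
		return PTR_ERR(cq);

  Completion work is then queued on ib_comp_unbound_wq, so the scheduler may run it on any available CPU instead of the single CPU the bound per-CPU workqueue happens to live on. ]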
Fixes: b7363e67b23e ("IB/device: Convert ib-comp-wq to be CPU-bound") Signed-off-by: Jack Morgenstein Signed-off-by: Leon Romanovsky Reviewed-by: Sagi Grimberg Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cq.c | 8 ++++++-- drivers/infiniband/core/device.c | 15 ++++++++++++++- drivers/infiniband/core/mad.c | 2 +- include/rdma/ib_verbs.h | 9 ++++++--- 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index af5ad6a56ae4..9271f7290005 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -112,12 +112,12 @@ static void ib_cq_poll_work(struct work_struct *work) IB_POLL_BATCH); if (completed >= IB_POLL_BUDGET_WORKQUEUE || ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) - queue_work(ib_comp_wq, &cq->work); + queue_work(cq->comp_wq, &cq->work); } static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) { - queue_work(ib_comp_wq, &cq->work); + queue_work(cq->comp_wq, &cq->work); } /** @@ -175,9 +175,12 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); break; case IB_POLL_WORKQUEUE: + case IB_POLL_UNBOUND_WORKQUEUE: cq->comp_handler = ib_cq_completion_workqueue; INIT_WORK(&cq->work, ib_cq_poll_work); ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + cq->comp_wq = (cq->poll_ctx == IB_POLL_WORKQUEUE) ? + ib_comp_wq : ib_comp_unbound_wq; break; default: ret = -EINVAL; @@ -213,6 +216,7 @@ void ib_free_cq(struct ib_cq *cq) irq_poll_disable(&cq->iop); break; case IB_POLL_WORKQUEUE: + case IB_POLL_UNBOUND_WORKQUEUE: cancel_work_sync(&cq->work); break; default: diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index db3b6271f09d..6d8ac51a39cc 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -61,6 +61,7 @@ struct ib_client_data { }; struct workqueue_struct *ib_comp_wq; +struct workqueue_struct *ib_comp_unbound_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); @@ -1166,10 +1167,19 @@ static int __init ib_core_init(void) goto err; } + ib_comp_unbound_wq = + alloc_workqueue("ib-comp-unb-wq", + WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM | + WQ_SYSFS, WQ_UNBOUND_MAX_ACTIVE); + if (!ib_comp_unbound_wq) { + ret = -ENOMEM; + goto err_comp; + } + ret = class_register(&ib_class); if (ret) { pr_warn("Couldn't create InfiniBand device class\n"); - goto err_comp; + goto err_comp_unbound; } ret = rdma_nl_init(); @@ -1218,6 +1228,8 @@ err_ibnl: rdma_nl_exit(); err_sysfs: class_unregister(&ib_class); +err_comp_unbound: + destroy_workqueue(ib_comp_unbound_wq); err_comp: destroy_workqueue(ib_comp_wq); err: @@ -1236,6 +1248,7 @@ static void __exit ib_core_cleanup(void) addr_cleanup(); rdma_nl_exit(); class_unregister(&ib_class); + destroy_workqueue(ib_comp_unbound_wq); destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. 
*/ destroy_workqueue(ib_wq); diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index ef459f2f2eeb..b8977c3db5f3 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -3183,7 +3183,7 @@ static int ib_mad_port_open(struct ib_device *device, cq_size *= 2; port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0, - IB_POLL_WORKQUEUE); + IB_POLL_UNBOUND_WORKQUEUE); if (IS_ERR(port_priv->cq)) { dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); ret = PTR_ERR(port_priv->cq); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e950c2a68f06..df8d234a2b56 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -71,6 +71,7 @@ extern struct workqueue_struct *ib_wq; extern struct workqueue_struct *ib_comp_wq; +extern struct workqueue_struct *ib_comp_unbound_wq; union ib_gid { u8 raw[16]; @@ -1570,9 +1571,10 @@ struct ib_ah { typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); enum ib_poll_context { - IB_POLL_DIRECT, /* caller context, no hw completions */ - IB_POLL_SOFTIRQ, /* poll from softirq context */ - IB_POLL_WORKQUEUE, /* poll from workqueue */ + IB_POLL_DIRECT, /* caller context, no hw completions */ + IB_POLL_SOFTIRQ, /* poll from softirq context */ + IB_POLL_WORKQUEUE, /* poll from workqueue */ + IB_POLL_UNBOUND_WORKQUEUE, /* poll from unbound workqueue */ }; struct ib_cq { @@ -1589,6 +1591,7 @@ struct ib_cq { struct irq_poll iop; struct work_struct work; }; + struct workqueue_struct *comp_wq; /* * Implementation details of the RDMA core, don't use in drivers: */ From 882dff2890e1e0928803701fcad75291e5310d0d Mon Sep 17 00:00:00 2001 From: Igor Stoppa Date: Fri, 31 Aug 2018 23:16:45 +0300 Subject: [PATCH 028/242] IB/srp: Remove unnecessary unlikely() WARN_ON() already contains an unlikely(), so it's not necessary to wrap it into another. Signed-off-by: Igor Stoppa Reviewed-by: Bart Van Assche Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/srpt/ib_srpt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index f37cbad022a2..447d21ea479a 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -2708,7 +2708,7 @@ static void srpt_queue_response(struct se_cmd *cmd) break; } - if (unlikely(WARN_ON_ONCE(state == SRPT_STATE_CMD_RSP_SENT))) + if (WARN_ON_ONCE(state == SRPT_STATE_CMD_RSP_SENT)) return; /* For read commands, transfer the data to the initiator. 
*/ From b3b43483a26dbb7b05fce1c21f6807d299888617 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 1 Sep 2018 03:53:46 +0000 Subject: [PATCH 029/242] RDMA/qedr: remove set but not used variable 'ctx' Fixes gcc '-Wunused-but-set-variable' warning: drivers/infiniband/hw/qedr/verbs.c: In function 'qedr_create_srq': drivers/infiniband/hw/qedr/verbs.c:1450:24: warning: variable 'ctx' set but not used [-Wunused-but-set-variable] Signed-off-by: YueHaibing Acked-by: Rahul Verma Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qedr/verbs.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 8cc3df24e04e..9d4d165014d9 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -1447,7 +1447,6 @@ struct ib_srq *qedr_create_srq(struct ib_pd *ibpd, u64 pbl_base_addr, phy_prod_pair_addr; struct ib_ucontext *ib_ctx = NULL; struct qedr_srq_hwq_info *hw_srq; - struct qedr_ucontext *ctx = NULL; u32 page_cnt, page_size; struct qedr_srq *srq; int rc = 0; @@ -1473,7 +1472,6 @@ struct ib_srq *qedr_create_srq(struct ib_pd *ibpd, if (udata && ibpd->uobject && ibpd->uobject->context) { ib_ctx = ibpd->uobject->context; - ctx = get_qedr_ucontext(ib_ctx); if (ib_copy_from_udata(&ureq, udata, sizeof(ureq))) { DP_ERR(dev, From 12d6f669717d8f1b7de5cb7409c362fd8a4f1d2d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 5 Sep 2018 08:58:31 +0300 Subject: [PATCH 030/242] RDMA/nes: Delete impossible debug prints The pci-core and net-core logic ensure that parameters provided to nes_probe() and nes_netdev_open() are valid, hence the assert prints are not possible. Cc: Faisal Latif Signed-off-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/nes/nes.c | 3 --- drivers/infiniband/hw/nes/nes.h | 9 --------- drivers/infiniband/hw/nes/nes_nic.c | 2 -- 3 files changed, 14 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c index 42b68aa999fc..e00add6d78ec 100644 --- a/drivers/infiniband/hw/nes/nes.c +++ b/drivers/infiniband/hw/nes/nes.c @@ -456,9 +456,6 @@ static int nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent) void __iomem *mmio_regs = NULL; u8 hw_rev; - assert(pcidev != NULL); - assert(ent != NULL); - printk(KERN_INFO PFX "NetEffect RNIC driver v%s loading. (%s)\n", DRV_VERSION, pci_name(pcidev)); diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h index bedaa02749fb..a895fe980d10 100644 --- a/drivers/infiniband/hw/nes/nes.h +++ b/drivers/infiniband/hw/nes/nes.h @@ -149,18 +149,9 @@ do { \ printk(KERN_ERR PFX "%s[%u]: " fmt, __func__, __LINE__, ##args); \ } while (0) -#define assert(expr) \ -do { \ - if (!(expr)) { \ - printk(KERN_ERR PFX "Assertion failed! %s, %s, %s, line %d\n", \ - #expr, __FILE__, __func__, __LINE__); \ - } \ -} while (0) - #define NES_EVENT_TIMEOUT 1200000 #else #define nes_debug(level, fmt, args...)
no_printk(fmt, ##args) -#define assert(expr) do {} while (0) #define NES_EVENT_TIMEOUT 100000 #endif diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c index 61014e251555..16f33454c198 100644 --- a/drivers/infiniband/hw/nes/nes_nic.c +++ b/drivers/infiniband/hw/nes/nes_nic.c @@ -146,8 +146,6 @@ static int nes_netdev_open(struct net_device *netdev) struct list_head *list_pos, *list_temp; unsigned long flags; - assert(nesdev != NULL); - if (nesvnic->netdev_open == 1) return 0; From 6ceb6331b3291694fb6ceba625219f51447c3fa2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Mon, 3 Sep 2018 20:18:03 +0300 Subject: [PATCH 031/242] RDMA/uverbs: Declare closing variable as boolean The "closing" variable is used as a boolean and set to "true" in one place; update the declaration of that variable and its other assignment to the proper type. Fixes: e951747a087a ("IB/uverbs: Rework the locking for cleaning up the ucontext") Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 2 +- include/rdma/ib_verbs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index a21d5214afc3..4b72851ade24 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -120,7 +120,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, rcu_read_lock(); ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); rcu_read_unlock(); - ucontext->closing = 0; + ucontext->closing = false; ucontext->cleanup_retryable = false; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index df8d234a2b56..a4c3a09a91bc 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1486,7 +1486,7 @@ struct ib_ucontext { * it is set when we are closing the file descriptor and indicates * that mm_sem may be locked. */ - int closing; + bool closing; bool cleanup_retryable; From 627212c9d49ba2759b699450f5d8f45f73e062fa Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 3 Sep 2018 20:20:25 +0300 Subject: [PATCH 032/242] RDMA/core: Replace open-coded variant of get_device Reuse the existing get_device() API to make this symmetric with the already used put_device() in commit 924b8900a49d ("RDMA/core: Replace open-coded variant of put_device"). Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 7fd14ead7b37..62351b3fcafc 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1359,8 +1359,8 @@ void ib_device_unregister_sysfs(struct ib_device *device) { int i; - /* Hold kobject until ib_dealloc_device() */ - kobject_get(&device->dev.kobj); + /* Hold device until ib_dealloc_device() */ + get_device(&device->dev); free_port_list_attributes(device); From adee9f3f3bbb317c5469f84deba01eef4b86515b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 09:47:58 +0300 Subject: [PATCH 033/242] RDMA/core: Depend on device_add() to add device attributes Instead of adding/removing device attribute files one at a time, depend on device_add(), which creates these device files based on a NULL terminated attribute groups array.
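[ Note: for readers unfamiliar with the driver-core mechanism relied on here, a minimal sketch of the pattern; all names are illustrative only:

	static ssize_t example_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
	{
		return sprintf(buf, "example\n");
	}
	static DEVICE_ATTR_RO(example);

	static struct attribute *example_attrs[] = {
		&dev_attr_example.attr,
		NULL,		/* array must be NULL terminated */
	};
	static const struct attribute_group example_group = {
		.attrs = example_attrs,
	};

  Assigning such groups to dev->groups before device_add() makes the core create every attribute file in one step and remove them symmetrically in device_del(), which is what allows the manual device_create_file()/device_remove_file() loops in the diff below to go away. ]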
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/sysfs.c | 59 +++++++++++++++------------------ include/rdma/ib_verbs.h | 3 ++ 2 files changed, 29 insertions(+), 33 deletions(-) diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 7fd14ead7b37..185075af3ad6 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1183,7 +1183,7 @@ err_put: return ret; } -static ssize_t show_node_type(struct device *device, +static ssize_t node_type_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1198,8 +1198,9 @@ static ssize_t show_node_type(struct device *device, default: return sprintf(buf, "%d: \n", dev->node_type); } } +static DEVICE_ATTR_RO(node_type); -static ssize_t show_sys_image_guid(struct device *device, +static ssize_t sys_image_guid_show(struct device *device, struct device_attribute *dev_attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1210,8 +1211,9 @@ static ssize_t show_sys_image_guid(struct device *device, be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[2]), be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[3])); } +static DEVICE_ATTR_RO(sys_image_guid); -static ssize_t show_node_guid(struct device *device, +static ssize_t node_guid_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1222,8 +1224,9 @@ static ssize_t show_node_guid(struct device *device, be16_to_cpu(((__be16 *) &dev->node_guid)[2]), be16_to_cpu(((__be16 *) &dev->node_guid)[3])); } +static DEVICE_ATTR_RO(node_guid); -static ssize_t show_node_desc(struct device *device, +static ssize_t node_desc_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1231,9 +1234,9 @@ static ssize_t show_node_desc(struct device *device, return sprintf(buf, "%.64s\n", dev->node_desc); } -static ssize_t set_node_desc(struct device *device, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t node_desc_store(struct device *device, + struct device_attribute *attr, + const char *buf, size_t count) { struct ib_device *dev = container_of(device, struct ib_device, dev); struct ib_device_modify desc = {}; @@ -1249,8 +1252,9 @@ static ssize_t set_node_desc(struct device *device, return count; } +static DEVICE_ATTR_RW(node_desc); -static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, +static ssize_t fw_ver_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); @@ -1259,19 +1263,19 @@ static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr, strlcat(buf, "\n", IB_FW_VERSION_NAME_MAX); return strlen(buf); } +static DEVICE_ATTR_RO(fw_ver); -static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL); -static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL); -static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL); -static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc); -static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); +static struct attribute *ib_dev_attrs[] = { + &dev_attr_node_type.attr, + &dev_attr_node_guid.attr, + &dev_attr_sys_image_guid.attr, + 
&dev_attr_fw_ver.attr, + &dev_attr_node_desc.attr, + NULL, +}; -static struct device_attribute *ib_class_attributes[] = { - &dev_attr_node_type, - &dev_attr_sys_image_guid, - &dev_attr_node_guid, - &dev_attr_node_desc, - &dev_attr_fw_ver, +static const struct attribute_group dev_attr_group = { + .attrs = ib_dev_attrs, }; static void free_port_list_attributes(struct ib_device *device) @@ -1311,16 +1315,13 @@ int ib_device_register_sysfs(struct ib_device *device, if (ret) return ret; + device->groups[0] = &dev_attr_group; + class_dev->groups = device->groups; + ret = device_add(class_dev); if (ret) goto err; - for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) { - ret = device_create_file(class_dev, ib_class_attributes[i]); - if (ret) - goto err_unregister; - } - device->ports_parent = kobject_create_and_add("ports", &class_dev->kobj); if (!device->ports_parent) { @@ -1347,18 +1348,13 @@ int ib_device_register_sysfs(struct ib_device *device, err_put: free_port_list_attributes(device); - -err_unregister: device_del(class_dev); - err: return ret; } void ib_device_unregister_sysfs(struct ib_device *device) { - int i; - /* Hold kobject until ib_dealloc_device() */ kobject_get(&device->dev.kobj); @@ -1369,8 +1365,5 @@ void ib_device_unregister_sysfs(struct ib_device *device) free_hsag(&device->dev.kobj, device->hw_stats_ag); } - for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) - device_remove_file(&device->dev, ib_class_attributes[i]); - device_unregister(&device->dev); } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e950c2a68f06..cd0f935f0bc1 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2536,6 +2536,9 @@ struct ib_device { struct module *owner; struct device dev; + /* First group for device attributes, NULL terminated array */ + const struct attribute_group *groups[2]; + struct kobject *ports_parent; struct list_head port_list; From c5c4d92e70f37369b5bdca5e85f9fc55dc2c8a3b Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 09:47:59 +0300 Subject: [PATCH 034/242] RDMA/uverbs: Use cdev_device_add() instead of cdev_add() Instead of doing a two-step process to add the char device and create the underlying device, use cdev_device_add(), which does both. Currently, a kobject per uverbs_device is created to keep a reference to its holding ib_uverbs_device, in addition to its underlying device 'dev'. Instead, just use uverbs_device->dev to keep that reference. With this change there is a single reference tracker for the ib_uverbs_device structure. This allows a subsequent patch to register the group attributes as well using the single API cdev_device_add().
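[ Note: cdev_device_add() registers a struct cdev and its owning struct device together and unwinds the cdev if the device registration fails; a sketch under illustrative names ("mydev", "mydev_fops", "mydev_release"):

	device_initialize(&mydev->dev);
	mydev->dev.devt = devt;
	mydev->dev.release = mydev_release;
	dev_set_name(&mydev->dev, "mydev%d", minor);

	cdev_init(&mydev->cdev, &mydev_fops);
	mydev->cdev.owner = THIS_MODULE;

	ret = cdev_device_add(&mydev->cdev, &mydev->dev);
	if (ret)
		put_device(&mydev->dev);	/* release() frees mydev */

  Teardown is the mirror image: cdev_device_del() followed by a final put_device(), which is the flow the diff below moves to. ]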
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs.h | 3 +- drivers/infiniband/core/uverbs_main.c | 68 ++++++++++++--------------- 2 files changed, 31 insertions(+), 40 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 5df8e548cc14..0288aec432a4 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -100,13 +100,12 @@ struct ib_uverbs_device { atomic_t refcount; int num_comp_vectors; struct completion comp; - struct device *dev; + struct device dev; struct ib_device __rcu *ib_dev; int devnum; struct cdev cdev; struct rb_root xrcd_tree; struct mutex xrcd_tree_mutex; - struct kobject kobj; struct srcu_struct disassociate_srcu; struct mutex lists_mutex; /* protect lists */ struct list_head uverbs_file_list; diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 6d974e2363df..1d2650f0f24c 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -169,20 +169,16 @@ int uverbs_dealloc_mw(struct ib_mw *mw) return ret; } -static void ib_uverbs_release_dev(struct kobject *kobj) +static void ib_uverbs_release_dev(struct device *device) { struct ib_uverbs_device *dev = - container_of(kobj, struct ib_uverbs_device, kobj); + container_of(device, struct ib_uverbs_device, dev); uverbs_destroy_api(dev->uapi); cleanup_srcu_struct(&dev->disassociate_srcu); kfree(dev); } -static struct kobj_type ib_uverbs_dev_ktype = { - .release = ib_uverbs_release_dev, -}; - static void ib_uverbs_release_async_event_file(struct kref *ref) { struct ib_uverbs_async_event_file *file = @@ -265,7 +261,7 @@ void ib_uverbs_release_file(struct kref *ref) if (atomic_dec_and_test(&file->device->refcount)) ib_uverbs_comp_dev(file->device); - kobject_put(&file->device->kobj); + put_device(&file->device->dev); kfree(file); } @@ -838,6 +834,7 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) if (!atomic_inc_not_zero(&dev->refcount)) return -ENXIO; + get_device(&dev->dev); srcu_key = srcu_read_lock(&dev->disassociate_srcu); mutex_lock(&dev->lists_mutex); ib_dev = srcu_dereference(dev->ib_dev, @@ -877,7 +874,6 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) init_rwsem(&file->hw_destroy_rwsem); filp->private_data = file; - kobject_get(&dev->kobj); list_add_tail(&file->list, &dev->uverbs_file_list); mutex_unlock(&dev->lists_mutex); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); @@ -898,6 +894,7 @@ err: if (atomic_dec_and_test(&dev->refcount)) ib_uverbs_comp_dev(dev); + put_device(&dev->dev); return ret; } @@ -953,14 +950,12 @@ static struct ib_client uverbs_client = { static ssize_t show_ibdev(struct device *device, struct device_attribute *attr, char *buf) { + struct ib_uverbs_device *dev = + container_of(device, struct ib_uverbs_device, dev); int ret = -ENODEV; int srcu_key; - struct ib_uverbs_device *dev = dev_get_drvdata(device); struct ib_device *ib_dev; - if (!dev) - return -ENODEV; - srcu_key = srcu_read_lock(&dev->disassociate_srcu); ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); if (ib_dev) @@ -974,13 +969,12 @@ static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); static ssize_t show_dev_abi_version(struct device *device, struct device_attribute *attr, char *buf) { - struct ib_uverbs_device *dev = dev_get_drvdata(device); + struct ib_uverbs_device *dev = + container_of(device, struct ib_uverbs_device, 
dev); int ret = -ENODEV; int srcu_key; struct ib_device *ib_dev; - if (!dev) - return -ENODEV; srcu_key = srcu_read_lock(&dev->disassociate_srcu); ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); if (ib_dev) @@ -1031,7 +1025,6 @@ static void ib_uverbs_add_one(struct ib_device *device) init_completion(&uverbs_dev->comp); uverbs_dev->xrcd_tree = RB_ROOT; mutex_init(&uverbs_dev->xrcd_tree_mutex); - kobject_init(&uverbs_dev->kobj, &ib_uverbs_dev_ktype); mutex_init(&uverbs_dev->lists_mutex); INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list); INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list); @@ -1052,40 +1045,41 @@ static void ib_uverbs_add_one(struct ib_device *device) if (ib_uverbs_create_uapi(device, uverbs_dev)) goto err_uapi; - cdev_init(&uverbs_dev->cdev, NULL); + device_initialize(&uverbs_dev->dev); + uverbs_dev->dev.class = uverbs_class; + uverbs_dev->dev.parent = device->dev.parent; + uverbs_dev->dev.devt = base; + uverbs_dev->dev.release = ib_uverbs_release_dev; + dev_set_name(&uverbs_dev->dev, "uverbs%d", uverbs_dev->devnum); + + cdev_init(&uverbs_dev->cdev, + device->mmap ? &uverbs_mmap_fops : &uverbs_fops); uverbs_dev->cdev.owner = THIS_MODULE; - uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops; - cdev_set_parent(&uverbs_dev->cdev, &uverbs_dev->kobj); - kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum); - if (cdev_add(&uverbs_dev->cdev, base, 1)) + + ret = cdev_device_add(&uverbs_dev->cdev, &uverbs_dev->dev); + if (ret) goto err_cdev; - uverbs_dev->dev = device_create(uverbs_class, device->dev.parent, - uverbs_dev->cdev.dev, uverbs_dev, - "uverbs%d", uverbs_dev->devnum); - if (IS_ERR(uverbs_dev->dev)) - goto err_cdev; - - if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev)) - goto err_class; - if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version)) - goto err_class; + if (device_create_file(&uverbs_dev->dev, &dev_attr_ibdev)) + goto err_file; + if (device_create_file(&uverbs_dev->dev, &dev_attr_abi_version)) + goto err_file; ib_set_client_data(device, &uverbs_client, uverbs_dev); return; -err_class: - device_destroy(uverbs_class, uverbs_dev->cdev.dev); +err_file: + cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev); err_cdev: cdev_del(&uverbs_dev->cdev); + put_device(&uverbs_dev->dev); err_uapi: clear_bit(devnum, dev_map); err: if (atomic_dec_and_test(&uverbs_dev->refcount)) ib_uverbs_comp_dev(uverbs_dev); wait_for_completion(&uverbs_dev->comp); - kobject_put(&uverbs_dev->kobj); return; } @@ -1155,9 +1149,7 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) if (!uverbs_dev) return; - dev_set_drvdata(uverbs_dev->dev, NULL); - device_destroy(uverbs_class, uverbs_dev->cdev.dev); - cdev_del(&uverbs_dev->cdev); + cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev); clear_bit(uverbs_dev->devnum, dev_map); if (device->disassociate_ucontext) { @@ -1181,7 +1173,7 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data) if (wait_clients) wait_for_completion(&uverbs_dev->comp); - kobject_put(&uverbs_dev->kobj); + put_device(&uverbs_dev->dev); } static char *uverbs_devnode(struct device *dev, umode_t *mode) From b53b1c08a23eb1091982daacb2122f90a7094a77 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 09:48:00 +0300 Subject: [PATCH 035/242] RDMA/uverbs: Use device.groups to initialize device attributes Instead of explicitly adding device attribute files and handling such error conditions, depend on device core layer to create device attributes files 
based on the NULL terminated attribute group pointer array. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs.h | 2 ++ drivers/infiniband/core/uverbs_main.c | 30 +++++++++++++++------------ 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 0288aec432a4..7199c275ab79 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -101,6 +101,8 @@ struct ib_uverbs_device { int num_comp_vectors; struct completion comp; struct device dev; + /* First group for device attributes, NULL terminated array */ + const struct attribute_group *groups[2]; struct ib_device __rcu *ib_dev; int devnum; struct cdev cdev; diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 1d2650f0f24c..16e5f714ca53 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -947,7 +947,7 @@ static struct ib_client uverbs_client = { .remove = ib_uverbs_remove_one }; -static ssize_t show_ibdev(struct device *device, struct device_attribute *attr, +static ssize_t ibdev_show(struct device *device, struct device_attribute *attr, char *buf) { struct ib_uverbs_device *dev = @@ -964,10 +964,10 @@ static ssize_t show_ibdev(struct device *device, struct device_attribute *attr, return ret; } -static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); +static DEVICE_ATTR_RO(ibdev); -static ssize_t show_dev_abi_version(struct device *device, - struct device_attribute *attr, char *buf) +static ssize_t abi_version_show(struct device *device, + struct device_attribute *attr, char *buf) { struct ib_uverbs_device *dev = container_of(device, struct ib_uverbs_device, dev); @@ -983,7 +983,17 @@ static ssize_t show_dev_abi_version(struct device *device, return ret; } -static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL); +static DEVICE_ATTR_RO(abi_version); + +static struct attribute *ib_dev_attrs[] = { + &dev_attr_abi_version.attr, + &dev_attr_ibdev.attr, + NULL, +}; + +static const struct attribute_group dev_attr_group = { + .attrs = ib_dev_attrs, +}; static CLASS_ATTR_STRING(abi_version, S_IRUGO, __stringify(IB_USER_VERBS_ABI_VERSION)); @@ -1050,6 +1060,8 @@ static void ib_uverbs_add_one(struct ib_device *device) uverbs_dev->dev.parent = device->dev.parent; uverbs_dev->dev.devt = base; uverbs_dev->dev.release = ib_uverbs_release_dev; + uverbs_dev->groups[0] = &dev_attr_group; + uverbs_dev->dev.groups = uverbs_dev->groups; dev_set_name(&uverbs_dev->dev, "uverbs%d", uverbs_dev->devnum); cdev_init(&uverbs_dev->cdev, @@ -1060,17 +1072,9 @@ static void ib_uverbs_add_one(struct ib_device *device) if (ret) goto err_cdev; - if (device_create_file(&uverbs_dev->dev, &dev_attr_ibdev)) - goto err_file; - if (device_create_file(&uverbs_dev->dev, &dev_attr_abi_version)) - goto err_file; - ib_set_client_data(device, &uverbs_client, uverbs_dev); - return; -err_file: - cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev); err_cdev: cdev_del(&uverbs_dev->cdev); put_device(&uverbs_dev->dev); From 798bba01b44b0ddf8cd6e542635b37cc9a9b739c Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 28 Aug 2018 14:45:28 +0300 Subject: [PATCH 036/242] RDMA/core: Fail early if unsupported QP is provided When requested QP type is not supported for a {device, port}, return the error right away before validating all parameters during mad agent registration time.
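[ Note: the observable effect, from a hypothetical caller's point of view; every argument other than the QP type below is a placeholder. On a port without QP0 (e.g. RoCE), SMI registration now fails up front:

	agent = ib_register_mad_agent(device, port_num, IB_QPT_SMI,
				      NULL, 0, NULL, NULL, NULL, 0);
	if (IS_ERR(agent))	/* -EPROTONOSUPPORT, before any further
				 * parameter validation runs */
		return PTR_ERR(agent);
]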
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/mad.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index b8977c3db5f3..43343c4e033e 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -220,6 +220,10 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, int ret2, qpn; u8 mgmt_class, vclass; + if ((qp_type == IB_QPT_SMI && !rdma_cap_ib_smi(device, port_num)) || + (qp_type == IB_QPT_GSI && !rdma_cap_ib_cm(device, port_num))) + return ERR_PTR(-EPROTONOSUPPORT); + /* Validate parameters */ qpn = get_spl_qp_index(qp_type); if (qpn == -1) { From 6c75520f7e5a6a353f3b332509d205e213d05855 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 28 Aug 2018 14:45:29 +0300 Subject: [PATCH 037/242] IB/mlx5: Don't hold spin lock while checking device state mdev->state device state is not protected by the QP for which WRs are being processed. Therefore, there is no need to hold spin lock while checking mdev state. Given that device fatal error is unlikely situation, wrap the condition check with unlikely(). Additionally, kernel QP1 is also a kernel ULP for which soft CQEs needs to be generated. Therefore, check for device fatal error before processing QP1 work requests. Fixes: 89ea94a7b6c4 ("IB/mlx5: Reset flow support for IB kernel ULPs") Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index e060499146ad..a2073ee7a16f 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -4400,6 +4400,12 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, u8 next_fence = 0; u8 fence; + if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR && + !drain)) { + *bad_wr = wr; + return -EIO; + } + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) return mlx5_ib_gsi_post_send(ibqp, wr, bad_wr); @@ -4409,13 +4415,6 @@ static int _mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, spin_lock_irqsave(&qp->sq.lock, flags); - if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR && !drain) { - err = -EIO; - *bad_wr = wr; - nreq = 0; - goto out; - } - for (nreq = 0; wr; nreq++, wr = wr->next) { if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) { mlx5_ib_warn(dev, "\n"); @@ -4729,18 +4728,17 @@ static int _mlx5_ib_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr, int ind; int i; + if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR && + !drain)) { + *bad_wr = wr; + return -EIO; + } + if (unlikely(ibqp->qp_type == IB_QPT_GSI)) return mlx5_ib_gsi_post_recv(ibqp, wr, bad_wr); spin_lock_irqsave(&qp->rq.lock, flags); - if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR && !drain) { - err = -EIO; - *bad_wr = wr; - nreq = 0; - goto out; - } - ind = qp->rq.head & (qp->rq.wqe_cnt - 1); for (nreq = 0; wr; nreq++, wr = wr->next) { From 142a9c287613560edf5a03c8d142c8b6ebc1995b Mon Sep 17 00:00:00 2001 From: Muhammad Sammar Date: Tue, 28 Aug 2018 14:45:30 +0300 Subject: [PATCH 038/242] IB/ipoib: Ensure that MTU isn't less than minimum permitted It is illegal to change MTU to a value lower than the minimum MTU stated in ethernet spec. 
In addition to that we need to add 4 bytes for the encapsulation header (IPOIB_ENCAP_LEN). Before this change, an "ifconfig ib0 mtu 0" command succeeded, while it obviously shouldn't. Signed-off-by: Muhammad Sammar Reviewed-by: Feras Daoud Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index e3d28f9ad9c0..2d20de72e67f 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -243,7 +243,8 @@ static int ipoib_change_mtu(struct net_device *dev, int new_mtu) return 0; } - if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) + if (new_mtu < (ETH_MIN_MTU + IPOIB_ENCAP_LEN) || + new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu)) return -EINVAL; priv->admin_mtu = new_mtu; From f9d08f1e1939ad4d92e38bd3dee6842512f5bee6 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 28 Aug 2018 14:45:31 +0300 Subject: [PATCH 039/242] RDMA/core: Rate limit MAD error messages While registering a mad agent, user space can trigger various errors and flood the logs. Therefore, decrease verbosity and rate limit such error messages. While we are at it, use __func__ to print the function name. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/mad.c | 72 ++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 43343c4e033e..c355379e7534 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -227,30 +227,30 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, /* Validate parameters */ qpn = get_spl_qp_index(qp_type); if (qpn == -1) { - dev_notice(&device->dev, - "ib_register_mad_agent: invalid QP Type %d\n", - qp_type); + dev_dbg_ratelimited(&device->dev, "%s: invalid QP Type %d\n", + __func__, qp_type); goto error1; } if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) { - dev_notice(&device->dev, - "ib_register_mad_agent: invalid RMPP Version %u\n", - rmpp_version); + dev_dbg_ratelimited(&device->dev, + "%s: invalid RMPP Version %u\n", + __func__, rmpp_version); goto error1; } /* Validate MAD registration request if supplied */ if (mad_reg_req) { if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) { - dev_notice(&device->dev, - "ib_register_mad_agent: invalid Class Version %u\n", - mad_reg_req->mgmt_class_version); + dev_dbg_ratelimited(&device->dev, + "%s: invalid Class Version %u\n", + __func__, + mad_reg_req->mgmt_class_version); goto error1; } if (!recv_handler) { - dev_notice(&device->dev, - "ib_register_mad_agent: no recv_handler\n"); + dev_dbg_ratelimited(&device->dev, + "%s: no recv_handler\n", __func__); goto error1; } if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) { @@ -260,9 +260,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, */ if (mad_reg_req->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) { - dev_notice(&device->dev, - "ib_register_mad_agent: Invalid Mgmt Class 0x%x\n", - mad_reg_req->mgmt_class); + dev_dbg_ratelimited(&device->dev, + "%s: Invalid Mgmt Class 0x%x\n", + __func__, mad_reg_req->mgmt_class); goto error1; } } else if (mad_reg_req->mgmt_class == 0) { /* * Class 0 is reserved in IBA and is used for
* aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE */ - dev_notice(&device->dev, - "ib_register_mad_agent: Invalid Mgmt Class 0\n"); + dev_dbg_ratelimited(&device->dev, + "%s: Invalid Mgmt Class 0\n", + __func__); goto error1; } else if (is_vendor_class(mad_reg_req->mgmt_class)) { /* @@ -279,18 +280,19 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, * ensure supplied OUI is not zero */ if (!is_vendor_oui(mad_reg_req->oui)) { - dev_notice(&device->dev, - "ib_register_mad_agent: No OUI specified for class 0x%x\n", - mad_reg_req->mgmt_class); + dev_dbg_ratelimited(&device->dev, + "%s: No OUI specified for class 0x%x\n", + __func__, + mad_reg_req->mgmt_class); goto error1; } } /* Make sure class supplied is consistent with RMPP */ if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) { if (rmpp_version) { - dev_notice(&device->dev, - "ib_register_mad_agent: RMPP version for non-RMPP class 0x%x\n", - mad_reg_req->mgmt_class); + dev_dbg_ratelimited(&device->dev, + "%s: RMPP version for non-RMPP class 0x%x\n", + __func__, mad_reg_req->mgmt_class); goto error1; } } @@ -301,9 +303,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, IB_MGMT_CLASS_SUBN_LID_ROUTED) && (mad_reg_req->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { - dev_notice(&device->dev, - "ib_register_mad_agent: Invalid SM QP type: class 0x%x\n", - mad_reg_req->mgmt_class); + dev_dbg_ratelimited(&device->dev, + "%s: Invalid SM QP type: class 0x%x\n", + __func__, mad_reg_req->mgmt_class); goto error1; } } else { @@ -311,9 +313,9 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, IB_MGMT_CLASS_SUBN_LID_ROUTED) || (mad_reg_req->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) { - dev_notice(&device->dev, - "ib_register_mad_agent: Invalid GS QP type: class 0x%x\n", - mad_reg_req->mgmt_class); + dev_dbg_ratelimited(&device->dev, + "%s: Invalid GS QP type: class 0x%x\n", + __func__, mad_reg_req->mgmt_class); goto error1; } } @@ -328,18 +330,18 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device, /* Validate device and port */ port_priv = ib_get_mad_port(device, port_num); if (!port_priv) { - dev_notice(&device->dev, - "ib_register_mad_agent: Invalid port %d\n", - port_num); + dev_dbg_ratelimited(&device->dev, "%s: Invalid port %d\n", + __func__, port_num); ret = ERR_PTR(-ENODEV); goto error1; } - /* Verify the QP requested is supported. For example, Ethernet devices - * will not have QP0 */ + /* Verify the QP requested is supported. For example, Ethernet devices + * will not have QP0. + */ if (!port_priv->qp_info[qpn].qp) { - dev_notice(&device->dev, - "ib_register_mad_agent: QP %d not supported\n", qpn); + dev_dbg_ratelimited(&device->dev, "%s: QP %d not supported\n", + __func__, qpn); ret = ERR_PTR(-EPROTONOSUPPORT); goto error1; } From 722c7b2bfeadbae8d9aaa08552c456e09d17a7f7 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 28 Aug 2018 14:45:32 +0300 Subject: [PATCH 040/242] RDMA/{cma, core}: Avoid callback on rdma_addr_cancel() Currently rdma_addr_cancel() is an async operation, which notifies that cancel is done by executing the callback function given during rdma_resolve_ip(). If resolve_ip request is already completed than callback is not executed. Instead, now rdma_resolve_addr() and rdma_addr_cancel() simplified in following ways. 1. rdma_addr_cancel() now a synchronous method. If request was pending, after it is cancelled, no callback is notified. 2. 
rdma_resolve_addr() and respective addr_handler() callback doesn't need to hold reference to cm_id. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 12 +++++++----- drivers/infiniband/core/cma.c | 4 ---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 46b855a42884..94ff38731be8 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -660,6 +660,13 @@ int rdma_resolve_ip_route(struct sockaddr *src_addr, return addr_resolve(src_in, dst_addr, addr, false, 0); } +/** + * rdma_addr_cancel - Cancel resolve ip request + * @addr: Pointer to address structure given previously + * during rdma_resolve_ip(). + * rdma_addr_cancel() is synchronous function which cancels any pending + * request if there is any. + */ void rdma_addr_cancel(struct rdma_dev_addr *addr) { struct addr_req *req, *temp_req; @@ -687,11 +694,6 @@ void rdma_addr_cancel(struct rdma_dev_addr *addr) * guarentees no work is running and none will be started. */ cancel_delayed_work_sync(&found->work); - - if (found->callback) - found->callback(-ECANCELED, (struct sockaddr *)&found->src_addr, - found->addr, found->context); - kfree(found); } EXPORT_SYMBOL(rdma_addr_cancel); diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index f72677291b69..4ba77f4e7098 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2880,13 +2880,11 @@ static void addr_handler(int status, struct sockaddr *src_addr, if (id_priv->id.event_handler(&id_priv->id, &event)) { cma_exch(id_priv, RDMA_CM_DESTROYING); mutex_unlock(&id_priv->handler_mutex); - cma_deref_id(id_priv); rdma_destroy_id(&id_priv->id); return; } out: mutex_unlock(&id_priv->handler_mutex); - cma_deref_id(id_priv); } static int cma_resolve_loopback(struct rdma_id_private *id_priv) @@ -2983,7 +2981,6 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, return -EINVAL; memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr)); - atomic_inc(&id_priv->refcount); if (cma_any_addr(dst_addr)) { ret = cma_resolve_loopback(id_priv); } else { @@ -3001,7 +2998,6 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, return 0; err: cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND); - cma_deref_id(id_priv); return ret; } EXPORT_SYMBOL(rdma_resolve_addr); From 93688ddbe1da1ead030b210dadc5a8cfbff95849 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 28 Aug 2018 15:08:41 +0300 Subject: [PATCH 041/242] RDMA/core: No need to protect kfree with spin lock and semaphore While unregistering a client, only context removal should be protected with lock. There is no need to protect a freeing of such context which is already removed from the list. 
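[ Note: the resulting idiom, in minimal form; once the entry is unlinked under the lock, nothing else can reach it, so the free needs no protection:

	spin_lock_irqsave(&device->client_data_lock, flags);
	list_del(&found_context->list);
	spin_unlock_irqrestore(&device->client_data_lock, flags);
	kfree(found_context);	/* unreachable now; safe without the lock */
]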
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 6d8ac51a39cc..9bc5ba2f488e 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -700,9 +700,9 @@ void ib_unregister_client(struct ib_client *client) down_write(&lists_rwsem); spin_lock_irqsave(&device->client_data_lock, flags); list_del(&found_context->list); - kfree(found_context); spin_unlock_irqrestore(&device->client_data_lock, flags); up_write(&lists_rwsem); + kfree(found_context); } mutex_unlock(&device_mutex); From f7b65d9bf2db0e4b319c0676900c6c25398a449f Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 28 Aug 2018 15:08:42 +0300 Subject: [PATCH 042/242] RDMA/core: Use simplified list_for_each While traversing client_data_list in following conditions, linked list is only read, no elements of the list are removed. Therefore, use list_for_each_entry(), instead of list_for_each_safe(). Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 9bc5ba2f488e..559fbe6a97c2 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -587,13 +587,12 @@ void ib_unregister_device(struct ib_device *device) down_write(&lists_rwsem); list_del(&device->core_list); spin_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry_safe(context, tmp, &device->client_data_list, list) + list_for_each_entry(context, &device->client_data_list, list) context->going_down = true; spin_unlock_irqrestore(&device->client_data_lock, flags); downgrade_write(&lists_rwsem); - list_for_each_entry_safe(context, tmp, &device->client_data_list, - list) { + list_for_each_entry(context, &device->client_data_list, list) { if (context->client->remove) context->client->remove(device, context->data); } @@ -663,7 +662,7 @@ EXPORT_SYMBOL(ib_register_client); */ void ib_unregister_client(struct ib_client *client) { - struct ib_client_data *context, *tmp; + struct ib_client_data *context; struct ib_device *device; unsigned long flags; @@ -678,7 +677,7 @@ void ib_unregister_client(struct ib_client *client) down_write(&lists_rwsem); spin_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry_safe(context, tmp, &device->client_data_list, list) + list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { context->going_down = true; found_context = context; From 4512acd0d34cea1bc0d9c69c1a60174016e121d7 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 28 Aug 2018 15:08:43 +0300 Subject: [PATCH 043/242] RDMA/core: Remove context entries from list while unregistering device While unregistering a device, remove the context elements from the list to not have any stale entries. With that any errors/bugs can be checked when device is freed. 
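[ Note: the loop this introduces must use the _safe iterator because the body frees the node it stands on; the idiom in minimal form:

	list_for_each_entry_safe(context, tmp, &device->client_data_list,
				 list) {
		list_del(&context->list);	/* unlink first ... */
		kfree(context);			/* ... then free */
	}

  Emptying the list here is what lets the WARN_ON(!list_empty(...)) added in ib_dealloc_device() catch any stale entry later. ]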
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 559fbe6a97c2..81758477a882 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -286,6 +286,7 @@ EXPORT_SYMBOL(ib_alloc_device); */ void ib_dealloc_device(struct ib_device *device) { + WARN_ON(!list_empty(&device->client_data_list)); WARN_ON(device->reg_state != IB_DEV_UNREGISTERED && device->reg_state != IB_DEV_UNINITIALIZED); rdma_restrack_clean(&device->res); @@ -610,8 +611,11 @@ void ib_unregister_device(struct ib_device *device) down_write(&lists_rwsem); spin_lock_irqsave(&device->client_data_lock, flags); - list_for_each_entry_safe(context, tmp, &device->client_data_list, list) + list_for_each_entry_safe(context, tmp, &device->client_data_list, + list) { + list_del(&context->list); kfree(context); + } spin_unlock_irqrestore(&device->client_data_lock, flags); up_write(&lists_rwsem); From 2d65f49ff961da5e974a48e250edd24b0c6f54d6 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 28 Aug 2018 15:08:44 +0300 Subject: [PATCH 044/242] RDMA/core: Use simpler spin lock irq API from blocking context add_client_context(), ib_unregister_device() and ib_unregister_client() are designed to call from blocking context. There is no need to save and restore last interrupt state when called from such blocking context. Even though this is not a performance path, using the right spin lock API is desired for code clarity. To avoid checkpatch warning while removing flags, sizeof() is used. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 81758477a882..a51d16ab1329 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -297,9 +297,8 @@ EXPORT_SYMBOL(ib_dealloc_device); static int add_client_context(struct ib_device *device, struct ib_client *client) { struct ib_client_data *context; - unsigned long flags; - context = kmalloc(sizeof *context, GFP_KERNEL); + context = kmalloc(sizeof(*context), GFP_KERNEL); if (!context) return -ENOMEM; @@ -308,9 +307,9 @@ static int add_client_context(struct ib_device *device, struct ib_client *client context->going_down = false; down_write(&lists_rwsem); - spin_lock_irqsave(&device->client_data_lock, flags); + spin_lock_irq(&device->client_data_lock); list_add(&context->list, &device->client_data_list); - spin_unlock_irqrestore(&device->client_data_lock, flags); + spin_unlock_irq(&device->client_data_lock); up_write(&lists_rwsem); return 0; @@ -587,10 +586,10 @@ void ib_unregister_device(struct ib_device *device) down_write(&lists_rwsem); list_del(&device->core_list); - spin_lock_irqsave(&device->client_data_lock, flags); + spin_lock_irq(&device->client_data_lock); list_for_each_entry(context, &device->client_data_list, list) context->going_down = true; - spin_unlock_irqrestore(&device->client_data_lock, flags); + spin_unlock_irq(&device->client_data_lock); downgrade_write(&lists_rwsem); list_for_each_entry(context, &device->client_data_list, list) { @@ -668,7 +667,6 @@ void ib_unregister_client(struct ib_client *client) { struct ib_client_data 
*context; struct ib_device *device; - unsigned long flags; mutex_lock(&device_mutex); @@ -680,14 +678,14 @@ void ib_unregister_client(struct ib_client *client) struct ib_client_data *found_context = NULL; down_write(&lists_rwsem); - spin_lock_irqsave(&device->client_data_lock, flags); + spin_lock_irq(&device->client_data_lock); list_for_each_entry(context, &device->client_data_list, list) if (context->client == client) { context->going_down = true; found_context = context; break; } - spin_unlock_irqrestore(&device->client_data_lock, flags); + spin_unlock_irq(&device->client_data_lock); up_write(&lists_rwsem); if (client->remove) @@ -701,9 +699,9 @@ void ib_unregister_client(struct ib_client *client) } down_write(&lists_rwsem); - spin_lock_irqsave(&device->client_data_lock, flags); + spin_lock_irq(&device->client_data_lock); list_del(&found_context->list); - spin_unlock_irqrestore(&device->client_data_lock, flags); + spin_unlock_irq(&device->client_data_lock); up_write(&lists_rwsem); kfree(found_context); } From e1f540c3ed0e9634d0f8c4600f3c85df8aff4ae2 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Tue, 28 Aug 2018 15:08:45 +0300 Subject: [PATCH 045/242] RDMA/core: Define client_data_lock as rwlock instead of spinlock Even though device registration/unregistration and client registration/unregistration is not a performance path, define the client_data_lock as rwlock for code clarity. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 30 +++++++++++++++--------------- include/rdma/ib_verbs.h | 5 +++-- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index a51d16ab1329..a0939140ed3a 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -270,7 +270,7 @@ struct ib_device *ib_alloc_device(size_t size) INIT_LIST_HEAD(&device->event_handler_list); spin_lock_init(&device->event_handler_lock); - spin_lock_init(&device->client_data_lock); + rwlock_init(&device->client_data_lock); INIT_LIST_HEAD(&device->client_data_list); INIT_LIST_HEAD(&device->port_list); @@ -307,9 +307,9 @@ static int add_client_context(struct ib_device *device, struct ib_client *client context->going_down = false; down_write(&lists_rwsem); - spin_lock_irq(&device->client_data_lock); + write_lock_irq(&device->client_data_lock); list_add(&context->list, &device->client_data_list); - spin_unlock_irq(&device->client_data_lock); + write_unlock_irq(&device->client_data_lock); up_write(&lists_rwsem); return 0; @@ -586,10 +586,10 @@ void ib_unregister_device(struct ib_device *device) down_write(&lists_rwsem); list_del(&device->core_list); - spin_lock_irq(&device->client_data_lock); + write_lock_irq(&device->client_data_lock); list_for_each_entry(context, &device->client_data_list, list) context->going_down = true; - spin_unlock_irq(&device->client_data_lock); + write_unlock_irq(&device->client_data_lock); downgrade_write(&lists_rwsem); list_for_each_entry(context, &device->client_data_list, list) { @@ -609,13 +609,13 @@ void ib_unregister_device(struct ib_device *device) kfree(device->port_pkey_list); down_write(&lists_rwsem); - spin_lock_irqsave(&device->client_data_lock, flags); + write_lock_irqsave(&device->client_data_lock, flags); list_for_each_entry_safe(context, tmp, &device->client_data_list, list) { list_del(&context->list); kfree(context); } - spin_unlock_irqrestore(&device->client_data_lock, flags); + 
write_unlock_irqrestore(&device->client_data_lock, flags);
 	up_write(&lists_rwsem);

 	device->reg_state = IB_DEV_UNREGISTERED;
@@ -678,14 +678,14 @@ void ib_unregister_client(struct ib_client *client)
 		struct ib_client_data *found_context = NULL;

 		down_write(&lists_rwsem);
-		spin_lock_irq(&device->client_data_lock);
+		write_lock_irq(&device->client_data_lock);
 		list_for_each_entry(context, &device->client_data_list, list)
 			if (context->client == client) {
 				context->going_down = true;
 				found_context = context;
 				break;
 			}
-		spin_unlock_irq(&device->client_data_lock);
+		write_unlock_irq(&device->client_data_lock);
 		up_write(&lists_rwsem);

 		if (client->remove)
@@ -699,9 +699,9 @@ void ib_unregister_client(struct ib_client *client)
 		}

 		down_write(&lists_rwsem);
-		spin_lock_irq(&device->client_data_lock);
+		write_lock_irq(&device->client_data_lock);
 		list_del(&found_context->list);
-		spin_unlock_irq(&device->client_data_lock);
+		write_unlock_irq(&device->client_data_lock);
 		up_write(&lists_rwsem);
 		kfree(found_context);
 	}
@@ -724,13 +724,13 @@ void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
 	void *ret = NULL;
 	unsigned long flags;

-	spin_lock_irqsave(&device->client_data_lock, flags);
+	read_lock_irqsave(&device->client_data_lock, flags);
 	list_for_each_entry(context, &device->client_data_list, list)
 		if (context->client == client) {
 			ret = context->data;
 			break;
 		}
-	spin_unlock_irqrestore(&device->client_data_lock, flags);
+	read_unlock_irqrestore(&device->client_data_lock, flags);

 	return ret;
 }
@@ -751,7 +751,7 @@ void ib_set_client_data(struct ib_device *device, struct ib_client *client,
 	struct ib_client_data *context;
 	unsigned long flags;

-	spin_lock_irqsave(&device->client_data_lock, flags);
+	write_lock_irqsave(&device->client_data_lock, flags);
 	list_for_each_entry(context, &device->client_data_list, list)
 		if (context->client == client) {
 			context->data = data;
@@ -762,7 +762,7 @@ void ib_set_client_data(struct ib_device *device, struct ib_client *client,
 		device->name, client->name);

 out:
-	spin_unlock_irqrestore(&device->client_data_lock, flags);
+	write_unlock_irqrestore(&device->client_data_lock, flags);
 }
 EXPORT_SYMBOL(ib_set_client_data);

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index ddc7c317e136..995f176d4782 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2256,10 +2256,11 @@ struct ib_device {
 	struct list_head              event_handler_list;
 	spinlock_t                    event_handler_lock;

-	spinlock_t                    client_data_lock;
+	rwlock_t                      client_data_lock;
 	struct list_head              core_list;
 	/* Access to the client_data_list is protected by the client_data_lock
-	 * spinlock and the lists_rwsem read-write semaphore */
+	 * rwlock and the lists_rwsem read-write semaphore
+	 */
 	struct list_head              client_data_list;

 	struct ib_cache               cache;

From 50704e039ab1d7e6c035d8c27a0b314929bfbe10 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Mon, 3 Sep 2018 20:17:31 +0300
Subject: [PATCH 046/242] RDMA/umem: Restore lockdep check while downgrading
 lock

The lockdep engine handles lock downgrades correctly, so it is simply
incorrect to disable lockdep checks prior to calling the mmu_notifier
code. Remove the lockdep_off() and ensure lock correctness.
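For reference, the downgrade pattern that lockdep must now be able to
validate looks like this (illustrative sketch only, not the umem code):

	static void example_downgrade(struct rw_semaphore *sem)
	{
		down_write(sem);
		/* ... exclusive, writer-only work ... */
		downgrade_write(sem);	/* now held for read */
		/* ... read-side work; other readers may run ... */
		up_read(sem);
	}

With lockdep_off() gone, lockdep tracks this transition against mmap_sem
and can report genuine ordering violations instead of being blinded.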
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/core/umem_odp.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 6ec748eccff7..29e34e6a6420 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -431,13 +431,7 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem,
 	atomic_set(&context->notifier_count, 0);
 	INIT_HLIST_NODE(&context->mn.hlist);
 	context->mn.ops = &ib_umem_notifiers;
-	/*
-	 * Lock-dep detects a false positive for mmap_sem vs.
-	 * umem_rwsem, due to not grasping downgrade_write correctly.
-	 */
-	lockdep_off();
 	ret_val = mmu_notifier_register(&context->mn, mm);
-	lockdep_on();
 	if (ret_val) {
 		pr_err("Failed to register mmu_notifier %d\n", ret_val);
 		ret_val = -EBUSY;

From c715a39541bb399eb03d728a996b224d90ce1336 Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Thu, 6 Sep 2018 10:55:31 +0300
Subject: [PATCH 047/242] RDMA/core: Follow correct unregister order between
 sysfs and cgroup

During register_device(), the init sequence is:
(a) register with the rdma cgroup, followed by
(b) register with sysfs.

Therefore, unregister_device() should follow the reverse order.

Signed-off-by: Parav Pandit
Reviewed-by: Daniel Jurgens
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/core/device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index a0939140ed3a..e1155067954b 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -598,8 +598,8 @@ void ib_unregister_device(struct ib_device *device)
 	}
 	up_read(&lists_rwsem);

-	ib_device_unregister_rdmacg(device);
 	ib_device_unregister_sysfs(device);
+	ib_device_unregister_rdmacg(device);

 	mutex_unlock(&device_mutex);

From 273993509f05623934dda14a56237738149b2906 Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Thu, 6 Sep 2018 10:58:57 +0300
Subject: [PATCH 048/242] RDMA/core: Assign device ifindex before publishing
 the device

Even though device->index is assigned before adding the device to the
list read by the netlink flow, it is better to assign the rdma device
index before publishing the device in the system to users and clients.
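In sketch form (simplified, not the full registration function), the
desired publish-after-init ordering is:

	device->index = __dev_new_index();	/* init everything first */

	down_write(&lists_rwsem);
	list_add_tail(&device->core_list, &device_list); /* publish last */
	up_write(&lists_rwsem);

Readers that find the device through device_list then never observe an
unset index.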
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/device.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index e1155067954b..5a680a88aa87 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -525,6 +525,8 @@ int ib_register_device(struct ib_device *device, goto port_cleanup; } + device->index = __dev_new_index(); + ret = ib_device_register_rdmacg(device); if (ret) { pr_warn("Couldn't register device with rdma cgroup\n"); @@ -551,7 +553,6 @@ int ib_register_device(struct ib_device *device, if (!add_client_context(device, client) && client->add) client->add(device); - device->index = __dev_new_index(); down_write(&lists_rwsem); list_add_tail(&device->core_list, &device_list); up_write(&lists_rwsem); From 4269024639f6ff9a1967c4bfa5a2ba7d9853384a Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 4 Sep 2018 11:45:14 -0400 Subject: [PATCH 049/242] RDMA/core: Document CM @event_handler function Code audit suggests that the RDMA CM event handler callback function is _always_ invoked in a context that is safe to block. That's important for consumer implementers to know, so document that in the comment before rdma_create_id (where the handler function is set up by the consumer). Signed-off-by: Chuck Lever Signed-off-by: Jason Gunthorpe --- include/rdma/rdma_cm.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 5d71a7f51a9f..53d93c7d8e01 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -152,7 +152,11 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, * @ps: RDMA port space. * @qp_type: type of queue pair associated with the id. * - * The id holds a reference on the network namespace until it is destroyed. + * Returns a new rdma_cm_id. The id holds a reference on the network + * namespace until it is destroyed. + * + * The event handler callback serializes on the id's mutex and is + * allowed to sleep. */ #define rdma_create_id(net, event_handler, context, ps, qp_type) \ __rdma_create_id((net), (event_handler), (context), (ps), (qp_type), \ From eb93c82ed8c77f00955f2891483170194c3be92c Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 4 Sep 2018 11:45:20 -0400 Subject: [PATCH 050/242] RDMA/core: Document QP @event_handler function Add helpful warning for RDMA consumer implementers. Signed-off-by: Chuck Lever Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 995f176d4782..f687faadf33b 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1138,7 +1138,9 @@ enum ib_qp_create_flags { */ struct ib_qp_init_attr { + /* Consumer's event_handler callback must not block */ void (*event_handler)(struct ib_event *, void *); + void *qp_context; struct ib_cq *send_cq; struct ib_cq *recv_cq; From 78dd0c430f116a325eeda61416c0b36a8206fbfc Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Sun, 2 Sep 2018 12:51:31 +0300 Subject: [PATCH 051/242] RDMA/mlx5: Add NIC TX steering support Just like ingress steering, allow a user to create steering rules that match egress vport traffic. We expose the same number of priorities as the bypass (NIC RX) steering. 
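From userspace, an egress rule would be requested roughly as below.
This is a hedged sketch against libibverbs: the spec layout is up to
the application, qp is assumed to exist, and IBV_FLOW_ATTR_FLAGS_EGRESS
is assumed to be available in the installed rdma-core headers.

	#include <infiniband/verbs.h>

	struct raw_eth_flow_attr {
		struct ibv_flow_attr     attr;
		struct ibv_flow_spec_eth spec_eth;
	} flow = {
		.attr = {
			.type         = IBV_FLOW_ATTR_NORMAL,
			.size         = sizeof(flow),
			.num_of_specs = 1,
			.port         = 1,
			.flags        = IBV_FLOW_ATTR_FLAGS_EGRESS, /* TX path */
		},
		.spec_eth = {
			.type = IBV_FLOW_SPEC_ETH,
			.size = sizeof(struct ibv_flow_spec_eth),
			/* match value/mask elided */
		},
	};

	struct ibv_flow *f = ibv_create_flow(qp, &flow.attr);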
Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 29 ++++++++++++++++++---------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 640973c78fb2..ed323db1b6d2 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2891,7 +2891,7 @@ is_valid_esp_aes_gcm(struct mlx5_core_dev *mdev, * rules would be supported, always return VALID_SPEC_NA. */ if (!is_crypto) - return egress ? VALID_SPEC_INVALID : VALID_SPEC_NA; + return VALID_SPEC_NA; return is_crypto && is_ipsec && (!egress || (!is_drop && !flow_act->has_flow_tag)) ? @@ -3066,21 +3066,27 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, log_max_ft_size)); if (flow_attr->type == IB_FLOW_ATTR_NORMAL) { - if (ft_type == MLX5_IB_FT_TX) - priority = 0; - else if (flow_is_multicast_only(flow_attr) && - !dont_trap) + enum mlx5_flow_namespace_type fn_type; + + if (flow_is_multicast_only(flow_attr) && + !dont_trap) priority = MLX5_IB_FLOW_MCAST_PRIO; else priority = ib_prio_to_core_prio(flow_attr->priority, dont_trap); - ns = mlx5_get_flow_namespace(dev->mdev, - ft_type == MLX5_IB_FT_TX ? - MLX5_FLOW_NAMESPACE_EGRESS : - MLX5_FLOW_NAMESPACE_BYPASS); + if (ft_type == MLX5_IB_FT_RX) { + fn_type = MLX5_FLOW_NAMESPACE_BYPASS; + prio = &dev->flow_db->prios[priority]; + } else { + max_table_size = + BIT(MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, + log_max_ft_size)); + fn_type = MLX5_FLOW_NAMESPACE_EGRESS; + prio = &dev->flow_db->egress_prios[priority]; + } + ns = mlx5_get_flow_namespace(dev->mdev, fn_type); num_entries = MLX5_FS_MAX_ENTRIES; num_groups = MLX5_FS_MAX_TYPES; - prio = &dev->flow_db->prios[priority]; } else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT || flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) { ns = mlx5_get_flow_namespace(dev->mdev, @@ -3279,6 +3285,9 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev, if (!is_valid_attr(dev->mdev, flow_attr)) return ERR_PTR(-EINVAL); + if (dev->rep && is_egress) + return ERR_PTR(-EINVAL); + spec = kvzalloc(sizeof(*spec), GFP_KERNEL); handler = kzalloc(sizeof(*handler), GFP_KERNEL); if (!handler || !spec) { diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 4e9676dc7531..a3ade16937b4 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -196,6 +196,7 @@ struct mlx5_ib_flow_matcher { struct mlx5_ib_flow_db { struct mlx5_ib_flow_prio prios[MLX5_IB_NUM_FLOW_FT]; + struct mlx5_ib_flow_prio egress_prios[MLX5_IB_NUM_FLOW_FT]; struct mlx5_ib_flow_prio sniffer[MLX5_IB_NUM_SNIFFER_FTS]; struct mlx5_ib_flow_prio egress[MLX5_IB_NUM_EGRESS_FTS]; struct mlx5_flow_table *lag_demux_ft; From b1085be3f468a58d4a43f3ea878ecb6c10910552 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Sun, 2 Sep 2018 12:51:32 +0300 Subject: [PATCH 052/242] RDMA/mlx5: Enable attaching modify header to steering flows When creating a flow steering rule, allow the user to attach a modify header action. 
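Usage would look roughly like this hedged sketch against rdma-core's
mlx5dv API (function and enum names assumed from the mlx5dv headers;
ctx, matcher, match_value and qp are assumed to exist, and the action
blob must be in device format):

	uint64_t modify_actions[1] = { 0 /* device-format set/add/copy action */ };
	struct ibv_flow_action *mh;
	struct mlx5dv_flow_action_attr actions[2];

	/* build a modify-header flow action for the NIC RX table */
	mh = mlx5dv_create_flow_action_modify_header(ctx,
			sizeof(modify_actions), modify_actions,
			MLX5DV_FLOW_TABLE_TYPE_NIC_RX);

	/* attach it, together with a destination QP, to a raw flow */
	actions[0].type = MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION;
	actions[0].action = mh;
	actions[1].type = MLX5DV_FLOW_ACTION_DEST_IBV_QP;
	actions[1].qp = qp;

	flow = mlx5dv_create_flow(matcher, &match_value, 2, actions);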
Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index ed323db1b6d2..8a4cdba6be8c 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2474,6 +2474,14 @@ static int parse_flow_flow_action(const union ib_flow_spec *ib_spec, MLX5_FLOW_CONTEXT_ACTION_ENCRYPT : MLX5_FLOW_CONTEXT_ACTION_DECRYPT; return 0; + case IB_FLOW_ACTION_UNSPECIFIED: + if (maction->flow_action_raw.sub_type == + MLX5_IB_FLOW_ACTION_MODIFY_HEADER) { + action->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; + action->modify_id = maction->flow_action_raw.action_id; + return 0; + } + /* fall through */ default: return -EOPNOTSUPP; } From 4adda1122c490e042d4bcb920900f796fc9423e4 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Sun, 2 Sep 2018 12:51:33 +0300 Subject: [PATCH 053/242] RDMA/mlx5: Enable decap and packet reformat on flow tables If NIC RX flow tables support decap operation, enable it on creation, This allows to perform decapsulation of tunnelled packets by steering rules. If NIC TX flow tables support reformat operation, enable it on creation. We don't enable those capabilities on representors as the E-Switch should handle packet modification (can be configured via TC) and as current hardware can't handle both FDB and NIC flow tables with decap/packet reformat support. Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 8a4cdba6be8c..ce9afa0dd983 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3042,14 +3042,15 @@ enum flow_table_type { static struct mlx5_ib_flow_prio *_get_prio(struct mlx5_flow_namespace *ns, struct mlx5_ib_flow_prio *prio, int priority, - int num_entries, int num_groups) + int num_entries, int num_groups, + u32 flags) { struct mlx5_flow_table *ft; ft = mlx5_create_auto_grouped_flow_table(ns, priority, num_entries, num_groups, - 0, 0); + 0, flags); if (IS_ERR(ft)) return ERR_CAST(ft); @@ -3069,6 +3070,7 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, int max_table_size; int num_entries; int num_groups; + u32 flags = 0; int priority; max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, @@ -3085,12 +3087,18 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, if (ft_type == MLX5_IB_FT_RX) { fn_type = MLX5_FLOW_NAMESPACE_BYPASS; prio = &dev->flow_db->prios[priority]; + if (!dev->rep && + MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap)) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP; } else { max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, log_max_ft_size)); fn_type = MLX5_FLOW_NAMESPACE_EGRESS; prio = &dev->flow_db->egress_prios[priority]; + if (!dev->rep && + MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat)) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; } ns = mlx5_get_flow_namespace(dev->mdev, fn_type); num_entries = MLX5_FS_MAX_ENTRIES; @@ -3126,7 +3134,8 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev, ft = prio->flow_table; if (!ft) - return _get_prio(ns, prio, priority, num_entries, num_groups); + return _get_prio(ns, prio, priority, num_entries, num_groups, + flags); return prio; } @@ -3710,7 +3719,7 @@ 
static struct mlx5_ib_flow_prio *_get_flow_table(struct mlx5_ib_dev *dev,
 		return prio;

 	return _get_prio(ns, prio, priority, MLX5_FS_MAX_ENTRIES,
-			 MLX5_FS_MAX_TYPES);
+			 MLX5_FS_MAX_TYPES, 0);
 }

 static struct mlx5_ib_flow_handler *

From 10a308964eaf4bf7dea859dcb9f630c91b3b07be Mon Sep 17 00:00:00 2001
From: Mark Bloch
Date: Sun, 2 Sep 2018 12:51:34 +0300
Subject: [PATCH 054/242] RDMA/mlx5: Enable attaching DECAP action to steering
 flows

Any matching packet will be stripped of its VXLAN tunnel; only the
inner L2 onward is left. The user will receive the decapsulated packet.

Signed-off-by: Mark Bloch
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/mlx5/main.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index ce9afa0dd983..5ae31ad1a438 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -2481,6 +2481,11 @@ static int parse_flow_flow_action(const union ib_flow_spec *ib_spec,
 			action->modify_id = maction->flow_action_raw.action_id;
 			return 0;
 		}
+		if (maction->flow_action_raw.sub_type ==
+		    MLX5_IB_FLOW_ACTION_DECAP) {
+			action->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP;
+			return 0;
+		}
 		/* fall through */
 	default:
 		return -EOPNOTSUPP;

From 5c2db53f62633689632aee7be9659418b2bf291f Mon Sep 17 00:00:00 2001
From: Mark Bloch
Date: Sun, 2 Sep 2018 12:51:35 +0300
Subject: [PATCH 055/242] RDMA/mlx5: Enable reformat on NIC RX if supported

An L3_TUNNEL_TO_L2 decap flow action requires the encap bit to be
enabled on the flow table, so enable it if supported. This will allow
attaching those flow actions to NIC RX steering. We don't enable it
when running on a representor.

Signed-off-by: Mark Bloch
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/mlx5/main.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 5ae31ad1a438..b5f0ab88e064 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -3095,6 +3095,10 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
 			if (!dev->rep &&
 			    MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap))
 				flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
+			if (!dev->rep &&
+			    MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
+						      reformat_l3_tunnel_to_l2))
+				flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
 		} else {
 			max_table_size =
 				BIT(MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev,

From e806f9328ba424b371766899af3c3cf4616e5522 Mon Sep 17 00:00:00 2001
From: Mark Bloch
Date: Sun, 2 Sep 2018 12:51:36 +0300
Subject: [PATCH 056/242] RDMA/mlx5: Enable attaching packet reformat action
 to steering flows

Any packet matching such a rule will be mutated based on the packet
reformat context attached to that flow rule.
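The matching userspace side would create the reformat context and
attach it roughly as in this hedged sketch (mlx5dv names assumed from
the rdma-core headers; ctx, encap_data and encap_data_sz are the
caller's):

	struct ibv_flow_action *rf;

	rf = mlx5dv_create_flow_action_packet_reformat(ctx,
			encap_data_sz, encap_data,
			MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2,
			MLX5DV_FLOW_TABLE_TYPE_NIC_RX);

	/* rf is then attached through a mlx5dv_flow_action_attr of type
	 * MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION when calling mlx5dv_create_flow()
	 */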
Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index b5f0ab88e064..65fce84135f3 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2486,6 +2486,14 @@ static int parse_flow_flow_action(const union ib_flow_spec *ib_spec, action->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP; return 0; } + if (maction->flow_action_raw.sub_type == + MLX5_IB_FLOW_ACTION_PACKET_REFORMAT) { + action->action |= + MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; + action->reformat_id = + maction->flow_action_raw.action_id; + return 0; + } /* fall through */ default: return -EOPNOTSUPP; From 70cd20aed00f719f3536154df02596106e431e45 Mon Sep 17 00:00:00 2001 From: Guy Levi Date: Thu, 6 Sep 2018 17:27:01 +0300 Subject: [PATCH 057/242] IB/uverbs: Add IDRs array attribute type to ioctl() interface Methods sometimes need to get a flexible set of IDRs and not a strict set as can be achieved today by the conventional IDR attribute. Add a new IDRS_ARRAY attribute to the generic uverbs ioctl layer. IDRS_ARRAY points to array of idrs of the same object type and same access rights, only write and read are supported. Signed-off-by: Guy Levi Signed-off-by: Jason Gunthorpe `` Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_ioctl.c | 114 +++++++++++++++++++++++ drivers/infiniband/core/uverbs_uapi.c | 12 +++ include/rdma/uverbs_ioctl.h | 71 +++++++++++++- include/uapi/rdma/rdma_user_ioctl_cmds.h | 7 +- 4 files changed, 201 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 4bafd4671de2..0e95a5888274 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -57,6 +57,7 @@ struct bundle_priv { struct ib_uverbs_attr *uattrs; DECLARE_BITMAP(uobj_finalize, UVERBS_API_ATTR_BKEY_LEN); + DECLARE_BITMAP(spec_finalize, UVERBS_API_ATTR_BKEY_LEN); /* * Must be last. bundle ends in a flex array which overlaps @@ -143,6 +144,86 @@ static bool uverbs_is_attr_cleared(const struct ib_uverbs_attr *uattr, 0, uattr->len - len); } +static int uverbs_process_idrs_array(struct bundle_priv *pbundle, + const struct uverbs_api_attr *attr_uapi, + struct uverbs_objs_arr_attr *attr, + struct ib_uverbs_attr *uattr, + u32 attr_bkey) +{ + const struct uverbs_attr_spec *spec = &attr_uapi->spec; + size_t array_len; + u32 *idr_vals; + int ret = 0; + size_t i; + + if (uattr->attr_data.reserved) + return -EINVAL; + + if (uattr->len % sizeof(u32)) + return -EINVAL; + + array_len = uattr->len / sizeof(u32); + if (array_len < spec->u2.objs_arr.min_len || + array_len > spec->u2.objs_arr.max_len) + return -EINVAL; + + attr->uobjects = + uverbs_alloc(&pbundle->bundle, + array_size(array_len, sizeof(*attr->uobjects))); + if (IS_ERR(attr->uobjects)) + return PTR_ERR(attr->uobjects); + + /* + * Since idr is 4B and *uobjects is >= 4B, we can use attr->uobjects + * to store idrs array and avoid additional memory allocation. The + * idrs array is offset to the end of the uobjects array so we will be + * able to read idr and replace with a pointer. 
+ */ + idr_vals = (u32 *)(attr->uobjects + array_len) - array_len; + + if (uattr->len > sizeof(uattr->data)) { + ret = copy_from_user(idr_vals, u64_to_user_ptr(uattr->data), + uattr->len); + if (ret) + return -EFAULT; + } else { + memcpy(idr_vals, &uattr->data, uattr->len); + } + + for (i = 0; i != array_len; i++) { + attr->uobjects[i] = uverbs_get_uobject_from_file( + spec->u2.objs_arr.obj_type, pbundle->bundle.ufile, + spec->u2.objs_arr.access, idr_vals[i]); + if (IS_ERR(attr->uobjects[i])) { + ret = PTR_ERR(attr->uobjects[i]); + break; + } + } + + attr->len = i; + __set_bit(attr_bkey, pbundle->spec_finalize); + return ret; +} + +static int uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi, + struct uverbs_objs_arr_attr *attr, + bool commit) +{ + const struct uverbs_attr_spec *spec = &attr_uapi->spec; + int current_ret; + int ret = 0; + size_t i; + + for (i = 0; i != attr->len; i++) { + current_ret = uverbs_finalize_object( + attr->uobjects[i], spec->u2.objs_arr.access, commit); + if (!ret) + ret = current_ret; + } + + return ret; +} + static int uverbs_process_attr(struct bundle_priv *pbundle, const struct uverbs_api_attr *attr_uapi, struct ib_uverbs_attr *uattr, u32 attr_bkey) @@ -246,6 +327,11 @@ static int uverbs_process_attr(struct bundle_priv *pbundle, } break; + + case UVERBS_ATTR_TYPE_IDRS_ARRAY: + return uverbs_process_idrs_array(pbundle, attr_uapi, + &e->objs_arr_attr, uattr, + attr_bkey); default: return -EOPNOTSUPP; } @@ -384,6 +470,7 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit) unsigned int i; int ret = 0; + /* fast path for simple uobjects */ i = -1; while ((i = find_next_bit(pbundle->uobj_finalize, key_bitmap_len, i + 1)) < key_bitmap_len) { @@ -397,6 +484,32 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit) ret = current_ret; } + i = -1; + while ((i = find_next_bit(pbundle->spec_finalize, key_bitmap_len, + i + 1)) < key_bitmap_len) { + struct uverbs_attr *attr = &pbundle->bundle.attrs[i]; + const struct uverbs_api_attr *attr_uapi; + void __rcu **slot; + int current_ret; + + slot = uapi_get_attr_for_method( + pbundle, + pbundle->method_key | uapi_bkey_to_key_attr(i)); + if (WARN_ON(!slot)) + continue; + + attr_uapi = srcu_dereference( + *slot, + &pbundle->bundle.ufile->device->disassociate_srcu); + + if (attr_uapi->spec.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) { + current_ret = uverbs_free_idrs_array( + attr_uapi, &attr->objs_arr_attr, commit); + if (!ret) + ret = current_ret; + } + } + for (memblock = pbundle->allocated_mem; memblock;) { struct bundle_alloc_head *tmp = memblock; @@ -461,6 +574,7 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, memset(pbundle->bundle.attr_present, 0, sizeof(pbundle->bundle.attr_present)); memset(pbundle->uobj_finalize, 0, sizeof(pbundle->uobj_finalize)); + memset(pbundle->spec_finalize, 0, sizeof(pbundle->spec_finalize)); ret = ib_uverbs_run_method(pbundle, hdr->num_attrs); destroy_ret = bundle_destroy(pbundle, ret == 0); diff --git a/drivers/infiniband/core/uverbs_uapi.c b/drivers/infiniband/core/uverbs_uapi.c index 73ea6f0db88f..cdf5ced2c84f 100644 --- a/drivers/infiniband/core/uverbs_uapi.c +++ b/drivers/infiniband/core/uverbs_uapi.c @@ -73,6 +73,18 @@ static int uapi_merge_method(struct uverbs_api *uapi, if (attr->attr.type == UVERBS_ATTR_TYPE_ENUM_IN) method_elm->driver_method |= is_driver; + /* + * Like other uobject based things we only support a single + * uobject being NEW'd or DESTROY'd + */ + if (attr->attr.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) { + u8 access = 
attr->attr.u2.objs_arr.access; + + if (WARN_ON(access == UVERBS_ACCESS_NEW || + access == UVERBS_ACCESS_DESTROY)) + return -EINVAL; + } + attr_slot = uapi_add_elm(uapi, method_key | uapi_key_attr(attr->id), sizeof(*attr_slot)); diff --git a/include/rdma/uverbs_ioctl.h b/include/rdma/uverbs_ioctl.h index fc2e52234a2a..84d3d15f1f38 100644 --- a/include/rdma/uverbs_ioctl.h +++ b/include/rdma/uverbs_ioctl.h @@ -52,6 +52,7 @@ enum uverbs_attr_type { UVERBS_ATTR_TYPE_IDR, UVERBS_ATTR_TYPE_FD, UVERBS_ATTR_TYPE_ENUM_IN, + UVERBS_ATTR_TYPE_IDRS_ARRAY, }; enum uverbs_obj_access { @@ -101,7 +102,7 @@ struct uverbs_attr_spec { } enum_def; } u; - /* This weird split of the enum lets us remove some padding */ + /* This weird split lets us remove some padding */ union { struct { /* @@ -111,6 +112,17 @@ struct uverbs_attr_spec { */ const struct uverbs_attr_spec *ids; } enum_def; + + struct { + /* + * higher bits mean the namespace and lower bits mean + * the type id within the namespace. + */ + u16 obj_type; + u16 min_len; + u16 max_len; + u8 access; + } objs_arr; } u2; }; @@ -251,6 +263,11 @@ static inline __attribute_const__ u32 uapi_bkey_attr(u32 attr_key) return attr_key - 1; } +static inline __attribute_const__ u32 uapi_bkey_to_key_attr(u32 attr_bkey) +{ + return attr_bkey + 1; +} + /* * ======================================= * Verbs definitions @@ -323,6 +340,27 @@ struct uverbs_object_tree_def { #define UA_MANDATORY .mandatory = 1 #define UA_OPTIONAL .mandatory = 0 +/* + * min_len must be bigger than 0 and _max_len must be smaller than 4095. Only + * READ\WRITE accesses are supported. + */ +#define UVERBS_ATTR_IDRS_ARR(_attr_id, _idr_type, _access, _min_len, _max_len, \ + ...) \ + (&(const struct uverbs_attr_def){ \ + .id = (_attr_id) + \ + BUILD_BUG_ON_ZERO((_min_len) == 0 || \ + (_max_len) > \ + PAGE_SIZE / sizeof(void *) || \ + (_min_len) > (_max_len) || \ + (_access) == UVERBS_ACCESS_NEW || \ + (_access) == UVERBS_ACCESS_DESTROY), \ + .attr = { .type = UVERBS_ATTR_TYPE_IDRS_ARRAY, \ + .u2.objs_arr.obj_type = _idr_type, \ + .u2.objs_arr.access = _access, \ + .u2.objs_arr.min_len = _min_len, \ + .u2.objs_arr.max_len = _max_len, \ + __VA_ARGS__ } }) + #define UVERBS_ATTR_IDR(_attr_id, _idr_type, _access, ...) \ (&(const struct uverbs_attr_def){ \ .id = _attr_id, \ @@ -440,10 +478,16 @@ struct uverbs_obj_attr { const struct uverbs_api_attr *attr_elm; }; +struct uverbs_objs_arr_attr { + struct ib_uobject **uobjects; + u16 len; +}; + struct uverbs_attr { union { struct uverbs_ptr_attr ptr_attr; struct uverbs_obj_attr obj_attr; + struct uverbs_objs_arr_attr objs_arr_attr; }; }; @@ -516,6 +560,31 @@ uverbs_attr_get_len(const struct uverbs_attr_bundle *attrs_bundle, u16 idx) return attr->ptr_attr.len; } +/** + * uverbs_attr_get_uobjs_arr() - Provides array's properties for attribute for + * UVERBS_ATTR_TYPE_IDRS_ARRAY. + * @arr: Returned pointer to array of pointers for uobjects or NULL if + * the attribute isn't provided. + * + * Return: The array length or 0 if no attribute was provided. 
+ */ +static inline int uverbs_attr_get_uobjs_arr( + const struct uverbs_attr_bundle *attrs_bundle, u16 attr_idx, + struct ib_uobject ***arr) +{ + const struct uverbs_attr *attr = + uverbs_attr_get(attrs_bundle, attr_idx); + + if (IS_ERR(attr)) { + *arr = NULL; + return 0; + } + + *arr = attr->objs_arr_attr.uobjects; + + return attr->objs_arr_attr.len; +} + static inline bool uverbs_attr_ptr_is_inline(const struct uverbs_attr *attr) { return attr->ptr_attr.len <= sizeof(attr->ptr_attr.data); diff --git a/include/uapi/rdma/rdma_user_ioctl_cmds.h b/include/uapi/rdma/rdma_user_ioctl_cmds.h index 24800c6c1f32..06c34d99be85 100644 --- a/include/uapi/rdma/rdma_user_ioctl_cmds.h +++ b/include/uapi/rdma/rdma_user_ioctl_cmds.h @@ -53,7 +53,7 @@ enum { struct ib_uverbs_attr { __u16 attr_id; /* command specific type attribute */ - __u16 len; /* only for pointers */ + __u16 len; /* only for pointers and IDRs array */ __u16 flags; /* combination of UVERBS_ATTR_F_XXXX */ union { struct { @@ -63,7 +63,10 @@ struct ib_uverbs_attr { __u16 reserved; } attr_data; union { - /* Used by PTR_IN/OUT, ENUM_IN and IDR */ + /* + * ptr to command, inline data, idr/fd or + * ptr to __u32 array of IDRs + */ __aligned_u64 data; /* Used by FD_IN and FD_OUT */ __s64 data_s64; From 86e1d464a8ccd627b6ea3e9a98a0389b0d27fd1f Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 6 Sep 2018 17:27:02 +0300 Subject: [PATCH 058/242] RDMA/uverbs: Move flow resources initialization Use ib_set_flow() when initializing flow related resources. Signed-off-by: Mark Bloch Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs.h | 6 ----- drivers/infiniband/core/uverbs_cmd.c | 19 ++-------------- drivers/infiniband/hw/mlx5/flow.c | 2 +- include/rdma/ib_verbs.h | 14 ------------ include/rdma/uverbs_std_types.h | 33 ++++++++++++++++++++++++++++ 5 files changed, 36 insertions(+), 38 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 7199c275ab79..717ab35b0af9 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -219,12 +219,6 @@ struct ib_ucq_object { u32 async_events_reported; }; -struct ib_uflow_resources; -struct ib_uflow_object { - struct ib_uobject uobject; - struct ib_uflow_resources *resources; -}; - extern const struct file_operations uverbs_event_fops; void ib_uverbs_init_event_queue(struct ib_uverbs_event_queue *ev_queue); struct file *ib_uverbs_alloc_async_event_file(struct ib_uverbs_file *uverbs_file, diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 4b72851ade24..c054d65dec1b 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2747,15 +2747,6 @@ out_put: return ret ? 
ret : in_len; } -struct ib_uflow_resources { - size_t max; - size_t num; - size_t collection_num; - size_t counters_num; - struct ib_counters **counters; - struct ib_flow_action **collection; -}; - static struct ib_uflow_resources *flow_resources_alloc(size_t num_specs) { struct ib_uflow_resources *resources; @@ -3462,7 +3453,6 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, struct ib_uverbs_create_flow cmd; struct ib_uverbs_create_flow_resp resp; struct ib_uobject *uobj; - struct ib_uflow_object *uflow; struct ib_flow *flow_id; struct ib_uverbs_flow_attr *kern_flow_attr; struct ib_flow_attr *flow_attr; @@ -3601,13 +3591,8 @@ int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file, err = PTR_ERR(flow_id); goto err_free; } - atomic_inc(&qp->usecnt); - flow_id->qp = qp; - flow_id->device = qp->device; - flow_id->uobject = uobj; - uobj->object = flow_id; - uflow = container_of(uobj, typeof(*uflow), uobject); - uflow->resources = uflow_res; + + ib_set_flow(uobj, flow_id, qp, qp->device, uflow_res); memset(&resp, 0, sizeof(resp)); resp.flow_handle = uobj->id; diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 5750a650884e..12abbc02af99 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -128,7 +128,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( if (IS_ERR(flow_handler)) return PTR_ERR(flow_handler); - ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev); + ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev, NULL); return 0; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index f687faadf33b..6076c9b72ab9 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -4162,20 +4162,6 @@ ib_get_vector_affinity(struct ib_device *device, int comp_vector) } -static inline void ib_set_flow(struct ib_uobject *uobj, struct ib_flow *ibflow, - struct ib_qp *qp, struct ib_device *device) -{ - uobj->object = ibflow; - ibflow->uobject = uobj; - - if (qp) { - atomic_inc(&qp->usecnt); - ibflow->qp = qp; - } - - ibflow->device = device; -} - /** * rdma_roce_rescan_device - Rescan all of the network devices in the system * and add their gids, as needed, to the relevant RoCE devices. 
diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index 526d918fcd5a..dfd6d35f1783 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -152,5 +152,38 @@ static inline void uverbs_flow_action_fill_action(struct ib_flow_action *action, uobj->object = action; } +struct ib_uflow_resources { + size_t max; + size_t num; + size_t collection_num; + size_t counters_num; + struct ib_counters **counters; + struct ib_flow_action **collection; +}; + +struct ib_uflow_object { + struct ib_uobject uobject; + struct ib_uflow_resources *resources; +}; + +static inline void ib_set_flow(struct ib_uobject *uobj, struct ib_flow *ibflow, + struct ib_qp *qp, struct ib_device *device, + struct ib_uflow_resources *uflow_res) +{ + struct ib_uflow_object *uflow; + + uobj->object = ibflow; + ibflow->uobject = uobj; + + if (qp) { + atomic_inc(&qp->usecnt); + ibflow->qp = qp; + } + + ibflow->device = device; + uflow = container_of(uobj, typeof(*uflow), uobject); + uflow->resources = uflow_res; +} + #endif From 2ea262039015ba7e74dcaff91e70c547a45437c7 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 6 Sep 2018 17:27:03 +0300 Subject: [PATCH 059/242] RDMA/mlx5: Refactor flow action parsing to be more generic Make the parsing of flow actions more generic so it could be used by mlx5 raw create flow. Signed-off-by: Mark Bloch Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 12 ++++++------ drivers/infiniband/hw/mlx5/mlx5_ib.h | 4 ++++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 65fce84135f3..50e5ebd1614f 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2460,17 +2460,16 @@ static int check_mpls_supp_fields(u32 field_support, const __be32 *set_mask) offsetof(typeof(filter), field) -\ sizeof(filter.field)) -static int parse_flow_flow_action(const union ib_flow_spec *ib_spec, - const struct ib_flow_attr *flow_attr, - struct mlx5_flow_act *action) +int parse_flow_flow_action(struct mlx5_ib_flow_action *maction, + bool is_egress, + struct mlx5_flow_act *action) { - struct mlx5_ib_flow_action *maction = to_mflow_act(ib_spec->action.act); switch (maction->ib_action.type) { case IB_FLOW_ACTION_ESP: /* Currently only AES_GCM keymat is supported by the driver */ action->esp_id = (uintptr_t)maction->esp_aes_gcm.ctx; - action->action |= flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS ? + action->action |= is_egress ? 
MLX5_FLOW_CONTEXT_ACTION_ENCRYPT : MLX5_FLOW_CONTEXT_ACTION_DECRYPT; return 0; @@ -2831,7 +2830,8 @@ static int parse_flow_attr(struct mlx5_core_dev *mdev, u32 *match_c, action->action |= MLX5_FLOW_CONTEXT_ACTION_DROP; break; case IB_FLOW_SPEC_ACTION_HANDLE: - ret = parse_flow_flow_action(ib_spec, flow_attr, action); + ret = parse_flow_flow_action(to_mflow_act(ib_spec->action.act), + flow_attr->flags & IB_FLOW_ATTR_FLAGS_EGRESS, action); if (ret) return ret; break; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index a3ade16937b4..0e831e48047a 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -872,6 +873,9 @@ to_mcounters(struct ib_counters *ibcntrs) return container_of(ibcntrs, struct mlx5_ib_mcounters, ibcntrs); } +int parse_flow_flow_action(struct mlx5_ib_flow_action *maction, + bool is_egress, + struct mlx5_flow_act *action); struct mlx5_ib_dev { struct ib_device ib_dev; const struct uverbs_object_tree_def *driver_trees[7]; From 501f14e37ba099f9d3a530c9d3a03cc4dbad103f Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 6 Sep 2018 17:27:04 +0300 Subject: [PATCH 060/242] RDMA/mlx5: Don't overwrite action if already set We support only a single action type per flow rule, in case the user passes the same type of flow actions fail the flow creation. Signed-off-by: Mark Bloch Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 50e5ebd1614f..0ab8a336eab0 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2467,6 +2467,9 @@ int parse_flow_flow_action(struct mlx5_ib_flow_action *maction, switch (maction->ib_action.type) { case IB_FLOW_ACTION_ESP: + if (action->action & (MLX5_FLOW_CONTEXT_ACTION_ENCRYPT | + MLX5_FLOW_CONTEXT_ACTION_DECRYPT)) + return -EINVAL; /* Currently only AES_GCM keymat is supported by the driver */ action->esp_id = (uintptr_t)maction->esp_aes_gcm.ctx; action->action |= is_egress ? @@ -2476,17 +2479,24 @@ int parse_flow_flow_action(struct mlx5_ib_flow_action *maction, case IB_FLOW_ACTION_UNSPECIFIED: if (maction->flow_action_raw.sub_type == MLX5_IB_FLOW_ACTION_MODIFY_HEADER) { + if (action->action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) + return -EINVAL; action->action |= MLX5_FLOW_CONTEXT_ACTION_MOD_HDR; action->modify_id = maction->flow_action_raw.action_id; return 0; } if (maction->flow_action_raw.sub_type == MLX5_IB_FLOW_ACTION_DECAP) { + if (action->action & MLX5_FLOW_CONTEXT_ACTION_DECAP) + return -EINVAL; action->action |= MLX5_FLOW_CONTEXT_ACTION_DECAP; return 0; } if (maction->flow_action_raw.sub_type == MLX5_IB_FLOW_ACTION_PACKET_REFORMAT) { + if (action->action & + MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT) + return -EINVAL; action->action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; action->reformat_id = From b823dd6d86ce6576d229c865895d0ee5285d0363 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 6 Sep 2018 17:27:05 +0300 Subject: [PATCH 061/242] RDMA/mlx5: Refactor raw flow creation Move struct mlx5_flow_act to be passed from the method entry point, this will allow to add support for flow action for the raw create flow path. 
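After this change the call shape is, in sketch form:

	struct mlx5_flow_act flow_act = { .flow_tag = MLX5_FS_DEFAULT_FLOW_TAG };

	/* the entry point can now accumulate parsed actions into flow_act
	 * before the rule is created ...
	 */
	handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher, &flow_act,
					  cmd_in, inlen, dest_id, dest_type);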
Signed-off-by: Mark Bloch Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/flow.c | 4 +++- drivers/infiniband/hw/mlx5/main.c | 12 +++++++----- drivers/infiniband/hw/mlx5/mlx5_ib.h | 4 +++- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 12abbc02af99..0e913491d139 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -61,6 +61,7 @@ static const struct uverbs_attr_spec mlx5_ib_flow_type[] = { static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) { + struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; struct mlx5_ib_flow_handler *flow_handler; struct mlx5_ib_flow_matcher *fs_matcher; void *devx_obj; @@ -123,7 +124,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); fs_matcher = uverbs_attr_get_obj(attrs, MLX5_IB_ATTR_CREATE_FLOW_MATCHER); - flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher, cmd_in, inlen, + flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher, &flow_act, + cmd_in, inlen, dest_id, dest_type); if (IS_ERR(flow_handler)) return PTR_ERR(flow_handler); diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 0ab8a336eab0..b258a6e19c6a 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3754,10 +3754,10 @@ _create_raw_flow_rule(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_prio *ft_prio, struct mlx5_flow_destination *dst, struct mlx5_ib_flow_matcher *fs_matcher, + struct mlx5_flow_act *flow_act, void *cmd_in, int inlen) { struct mlx5_ib_flow_handler *handler; - struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; struct mlx5_flow_spec *spec; struct mlx5_flow_table *ft = ft_prio->flow_table; int err = 0; @@ -3776,9 +3776,8 @@ _create_raw_flow_rule(struct mlx5_ib_dev *dev, fs_matcher->mask_len); spec->match_criteria_enable = fs_matcher->match_criteria_enable; - flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; handler->rule = mlx5_add_flow_rules(ft, spec, - &flow_act, dst, 1); + flow_act, dst, 1); if (IS_ERR(handler->rule)) { err = PTR_ERR(handler->rule); @@ -3840,6 +3839,7 @@ static bool raw_fs_is_multicast(struct mlx5_ib_flow_matcher *fs_matcher, struct mlx5_ib_flow_handler * mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher, + struct mlx5_flow_act *flow_act, void *cmd_in, int inlen, int dest_id, int dest_type) { @@ -3872,13 +3872,15 @@ mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev, if (dest_type == MLX5_FLOW_DESTINATION_TYPE_TIR) { dst->type = dest_type; dst->tir_num = dest_id; + flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; } else { dst->type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM; dst->ft_num = dest_id; + flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; } - handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher, cmd_in, - inlen); + handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher, flow_act, + cmd_in, inlen); if (IS_ERR(handler)) { err = PTR_ERR(handler); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 0e831e48047a..c667a6c51c12 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -1253,7 +1254,8 @@ void 
mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, const struct uverbs_object_tree_def *mlx5_ib_get_devx_tree(void); struct mlx5_ib_flow_handler *mlx5_ib_raw_fs_rule_add( struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher, - void *cmd_in, int inlen, int dest_id, int dest_type); + struct mlx5_flow_act *flow_act, void *cmd_in, int inlen, + int dest_id, int dest_type); bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type); int mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root); void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction); From fa76d24ee0aa24fff3fa9ba71fc2179fb88fef6a Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 6 Sep 2018 17:27:06 +0300 Subject: [PATCH 062/242] RDMA/mlx5: Add flow actions support to raw create flow Support attaching flow actions to a flow rule via raw create flow. For now only NIC RX path is supported. This change requires to export flow resources management functions so we can maintain proper bookkeeping of flow actions. Signed-off-by: Mark Bloch Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_cmd.c | 11 ++++--- drivers/infiniband/hw/mlx5/flow.c | 40 +++++++++++++++++++++--- include/rdma/uverbs_std_types.h | 6 ++++ include/uapi/rdma/mlx5_user_ioctl_cmds.h | 1 + 4 files changed, 50 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index c054d65dec1b..9c87c98a0f19 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -2747,7 +2747,7 @@ out_put: return ret ? ret : in_len; } -static struct ib_uflow_resources *flow_resources_alloc(size_t num_specs) +struct ib_uflow_resources *flow_resources_alloc(size_t num_specs) { struct ib_uflow_resources *resources; @@ -2777,6 +2777,7 @@ err: return NULL; } +EXPORT_SYMBOL(flow_resources_alloc); void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res) { @@ -2795,10 +2796,11 @@ void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res) kfree(uflow_res->counters); kfree(uflow_res); } +EXPORT_SYMBOL(ib_uverbs_flow_resources_free); -static void flow_resources_add(struct ib_uflow_resources *uflow_res, - enum ib_flow_spec_type type, - void *ibobj) +void flow_resources_add(struct ib_uflow_resources *uflow_res, + enum ib_flow_spec_type type, + void *ibobj) { WARN_ON(uflow_res->num >= uflow_res->max); @@ -2819,6 +2821,7 @@ static void flow_resources_add(struct ib_uflow_resources *uflow_res, uflow_res->num++; } +EXPORT_SYMBOL(flow_resources_add); static int kern_spec_to_ib_spec_action(struct ib_uverbs_file *ufile, struct ib_uverbs_flow_spec *kern_spec, diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 0e913491d139..ce9276a2aaa5 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -58,12 +58,15 @@ static const struct uverbs_attr_spec mlx5_ib_flow_type[] = { }, }; +#define MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS 2 static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( struct ib_uverbs_file *file, struct uverbs_attr_bundle *attrs) { struct mlx5_flow_act flow_act = {.flow_tag = MLX5_FS_DEFAULT_FLOW_TAG}; struct mlx5_ib_flow_handler *flow_handler; struct mlx5_ib_flow_matcher *fs_matcher; + struct ib_uobject **arr_flow_actions; + struct ib_uflow_resources *uflow_res; void *devx_obj; int dest_id, dest_type; void *cmd_in; @@ -73,6 +76,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( 
struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE); struct mlx5_ib_dev *dev = to_mdev(uobj->context->device); + int len, ret, i; if (!capable(CAP_NET_RAW)) return -EPERM; @@ -124,15 +128,38 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); fs_matcher = uverbs_attr_get_obj(attrs, MLX5_IB_ATTR_CREATE_FLOW_MATCHER); + + uflow_res = flow_resources_alloc(MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS); + if (!uflow_res) + return -ENOMEM; + + len = uverbs_attr_get_uobjs_arr(attrs, + MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, &arr_flow_actions); + for (i = 0; i < len; i++) { + struct mlx5_ib_flow_action *maction = + to_mflow_act(arr_flow_actions[i]->object); + + ret = parse_flow_flow_action(maction, false, &flow_act); + if (ret) + goto err_out; + flow_resources_add(uflow_res, IB_FLOW_SPEC_ACTION_HANDLE, + arr_flow_actions[i]->object); + } + flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher, &flow_act, cmd_in, inlen, dest_id, dest_type); - if (IS_ERR(flow_handler)) - return PTR_ERR(flow_handler); + if (IS_ERR(flow_handler)) { + ret = PTR_ERR(flow_handler); + goto err_out; + } - ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev, NULL); + ib_set_flow(uobj, &flow_handler->ibflow, qp, &dev->ib_dev, uflow_res); return 0; +err_out: + ib_uverbs_flow_resources_free(uflow_res); + return ret; } static int flow_matcher_cleanup(struct ib_uobject *uobject, @@ -459,7 +486,12 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_ACCESS_READ), UVERBS_ATTR_IDR(MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX, MLX5_IB_OBJECT_DEVX_OBJ, - UVERBS_ACCESS_READ)); + UVERBS_ACCESS_READ), + UVERBS_ATTR_IDRS_ARR(MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, + UVERBS_OBJECT_FLOW_ACTION, + UVERBS_ACCESS_READ, 1, + MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS, + UA_OPTIONAL)); DECLARE_UVERBS_NAMED_METHOD_DESTROY( MLX5_IB_METHOD_DESTROY_FLOW, diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index dfd6d35f1783..3db2802fbc68 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -166,6 +166,12 @@ struct ib_uflow_object { struct ib_uflow_resources *resources; }; +struct ib_uflow_resources *flow_resources_alloc(size_t num_specs); +void flow_resources_add(struct ib_uflow_resources *uflow_res, + enum ib_flow_spec_type type, + void *ibobj); +void ib_uverbs_flow_resources_free(struct ib_uflow_resources *uflow_res); + static inline void ib_set_flow(struct ib_uobject *uobj, struct ib_flow *ibflow, struct ib_qp *qp, struct ib_device *device, struct ib_uflow_resources *uflow_res) diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 75c7093fd95b..91c3d42ebd0f 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -155,6 +155,7 @@ enum mlx5_ib_create_flow_attrs { MLX5_IB_ATTR_CREATE_FLOW_DEST_QP, MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX, MLX5_IB_ATTR_CREATE_FLOW_MATCHER, + MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, }; enum mlx5_ib_destoy_flow_attrs { From b47fd4ffe2d6422a986f19d47563d72c79ebbc21 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 6 Sep 2018 17:27:07 +0300 Subject: [PATCH 063/242] RDMA/mlx5: Add NIC TX namespace when getting a flow table Add the ability to get a NIC TX flow table when using _get_flow_table(). This will allow to create a matcher and a flow rule on the NIC TX path. 
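Once the matcher flags are wired up (see the following patch), a NIC TX
matcher would be requested from userspace roughly like this hedged
sketch (mlx5dv names assumed from the rdma-core headers; ctx and mask
assumed to exist):

	struct mlx5dv_flow_matcher_attr attr = {
		.type = IBV_FLOW_ATTR_NORMAL,
		.flags = IBV_FLOW_ATTR_FLAGS_EGRESS,	/* select NIC TX */
		.match_criteria_enable = 1 << 0,	/* outer headers */
		.match_mask = &mask,
	};

	matcher = mlx5dv_create_flow_matcher(ctx, &attr);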
Signed-off-by: Mark Bloch Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/flow.c | 1 + drivers/infiniband/hw/mlx5/main.c | 41 ++++++++++++++++++++-------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + 3 files changed, 32 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index ce9276a2aaa5..16e677c549e6 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -189,6 +189,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)( if (!obj) return -ENOMEM; + obj->ns_type = MLX5_FLOW_NAMESPACE_BYPASS; obj->mask_len = uverbs_attr_get_len( attrs, MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK); err = uverbs_copy_from(&obj->matcher_mask, diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index b258a6e19c6a..e311b6f8e1ee 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3719,34 +3719,54 @@ free_ucmd: return ERR_PTR(err); } -static struct mlx5_ib_flow_prio *_get_flow_table(struct mlx5_ib_dev *dev, - int priority, bool mcast) +static struct mlx5_ib_flow_prio * +_get_flow_table(struct mlx5_ib_dev *dev, + struct mlx5_ib_flow_matcher *fs_matcher, + bool mcast) { - int max_table_size; struct mlx5_flow_namespace *ns = NULL; struct mlx5_ib_flow_prio *prio; + int max_table_size; + u32 flags = 0; + int priority; + + if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) { + max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + log_max_ft_size)); + if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap)) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP; + if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, + reformat_l3_tunnel_to_l2)) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; + } else { /* Can only be MLX5_FLOW_NAMESPACE_EGRESS */ + max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, + log_max_ft_size)); + if (MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat)) + flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT; + } - max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, - log_max_ft_size)); if (max_table_size < MLX5_FS_MAX_ENTRIES) return ERR_PTR(-ENOMEM); if (mcast) priority = MLX5_IB_FLOW_MCAST_PRIO; else - priority = ib_prio_to_core_prio(priority, false); + priority = ib_prio_to_core_prio(fs_matcher->priority, false); - ns = mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS); + ns = mlx5_get_flow_namespace(dev->mdev, fs_matcher->ns_type); if (!ns) return ERR_PTR(-ENOTSUPP); - prio = &dev->flow_db->prios[priority]; + if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) + prio = &dev->flow_db->prios[priority]; + else + prio = &dev->flow_db->egress_prios[priority]; if (prio->flow_table) return prio; return _get_prio(ns, prio, priority, MLX5_FS_MAX_ENTRIES, - MLX5_FS_MAX_TYPES, 0); + MLX5_FS_MAX_TYPES, flags); } static struct mlx5_ib_flow_handler * @@ -3845,7 +3865,6 @@ mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev, { struct mlx5_flow_destination *dst; struct mlx5_ib_flow_prio *ft_prio; - int priority = fs_matcher->priority; struct mlx5_ib_flow_handler *handler; bool mcast; int err; @@ -3863,7 +3882,7 @@ mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev, mcast = raw_fs_is_multicast(fs_matcher, cmd_in); mutex_lock(&dev->flow_db->lock); - ft_prio = _get_flow_table(dev, priority, mcast); + ft_prio = _get_flow_table(dev, fs_matcher, mcast); if (IS_ERR(ft_prio)) { err = PTR_ERR(ft_prio); goto unlock; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h 
b/drivers/infiniband/hw/mlx5/mlx5_ib.h index c667a6c51c12..6c57872fdc4e 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -190,6 +190,7 @@ struct mlx5_ib_flow_matcher { struct mlx5_ib_match_params matcher_mask; int mask_len; enum mlx5_ib_flow_type flow_type; + enum mlx5_flow_namespace_type ns_type; u16 priority; struct mlx5_core_dev *mdev; atomic_t usecnt; From a7ee18bdee837e4703f01588993504b72074ffc6 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 6 Sep 2018 17:27:08 +0300 Subject: [PATCH 064/242] RDMA/mlx5: Allow creating a matcher for a NIC TX flow table Currently a matcher can only be created and attached to a NIC RX flow table. Extend it to allow attaching to NIC TX flow tables as well. In order to achieve that, we: 1) Expose a new attribute: MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS. enum ib_flow_flags is used as valid flags. Only IB_FLOW_ATTR_FLAGS_EGRESS is supported. 2) Remove the requirement to have a DEVX or QP destination when creating a flow. A flow added to a NIC TX flow table will forward the packet outside of the vport (Wire or E-Switch in the SR-IOV case). Signed-off-by: Mark Bloch Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/flow.c | 34 ++++++++++++++++++++---- drivers/infiniband/hw/mlx5/main.c | 5 +++- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 1 + 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 16e677c549e6..4ee4af450720 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -86,7 +86,14 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( dest_qp = uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_QP); - if ((dest_devx && dest_qp) || (!dest_devx && !dest_qp)) + fs_matcher = uverbs_attr_get_obj(attrs, + MLX5_IB_ATTR_CREATE_FLOW_MATCHER); + if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS && + ((dest_devx && dest_qp) || (!dest_devx && !dest_qp))) + return -EINVAL; + + if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS && + (dest_devx || dest_qp)) return -EINVAL; if (dest_devx) { @@ -100,7 +107,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( */ if (!mlx5_ib_devx_is_flow_dest(devx_obj, &dest_id, &dest_type)) return -EINVAL; - } else { + } else if (dest_qp) { struct mlx5_ib_qp *mqp; qp = uverbs_attr_get_obj(attrs, @@ -117,6 +124,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( else dest_id = mqp->raw_packet_qp.rq.tirn; dest_type = MLX5_FLOW_DESTINATION_TYPE_TIR; + } else { + dest_type = MLX5_FLOW_DESTINATION_TYPE_PORT; } if (dev->rep) @@ -126,8 +135,6 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( attrs, MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); inlen = uverbs_attr_get_len(attrs, MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE); - fs_matcher = uverbs_attr_get_obj(attrs, - MLX5_IB_ATTR_CREATE_FLOW_MATCHER); uflow_res = flow_resources_alloc(MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS); if (!uflow_res) @@ -183,6 +190,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)( attrs, MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE); struct mlx5_ib_dev *dev = to_mdev(uobj->context->device); struct mlx5_ib_flow_matcher *obj; + u32 flags; int err; obj = kzalloc(sizeof(struct mlx5_ib_flow_matcher), GFP_KERNEL); @@ -215,6 +223,19 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)( if (err) goto end; + err = uverbs_get_flags32(&flags, attrs, + MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS, +
IB_FLOW_ATTR_FLAGS_EGRESS); + if (err) + goto end; + + if (flags) { + err = mlx5_ib_ft_type_to_namespace( + MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX, &obj->ns_type); + if (err) + goto end; + } + uobj->object = obj; obj->mdev = dev->mdev; atomic_set(&obj->usecnt, 0); @@ -559,7 +580,10 @@ DECLARE_UVERBS_NAMED_METHOD( UA_MANDATORY), UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA, UVERBS_ATTR_TYPE(u8), - UA_MANDATORY)); + UA_MANDATORY), + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS, + enum ib_flow_flags, + UA_OPTIONAL)); DECLARE_UVERBS_NAMED_METHOD_DESTROY( MLX5_IB_METHOD_FLOW_MATCHER_DESTROY, diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index e311b6f8e1ee..2be6a4377558 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3892,10 +3892,13 @@ mlx5_ib_raw_fs_rule_add(struct mlx5_ib_dev *dev, dst->type = dest_type; dst->tir_num = dest_id; flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; - } else { + } else if (dest_type == MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE) { dst->type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE_NUM; dst->ft_num = dest_id; flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + } else { + dst->type = MLX5_FLOW_DESTINATION_TYPE_PORT; + flow_act->action |= MLX5_FLOW_CONTEXT_ACTION_ALLOW; } handler = _create_raw_flow_rule(dev, ft_prio, dst, fs_matcher, flow_act, diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 91c3d42ebd0f..fb4a8b17cca8 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -125,6 +125,7 @@ enum mlx5_ib_flow_matcher_create_attrs { MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE, MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA, + MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS, }; enum mlx5_ib_flow_matcher_destroy_attrs { From 3e5d60bcc8a42bfd0c888a0cf52a5a7e8398677d Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Sat, 8 Sep 2018 21:49:31 +0800 Subject: [PATCH 065/242] infiniband: remove redundant condition check before debugfs_remove debugfs_remove() already takes IS_ERR_OR_NULL into account. Just remove the unnecessary condition. Signed-off-by: zhong jiang Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/usnic/usnic_debugfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.c b/drivers/infiniband/hw/usnic/usnic_debugfs.c index 92dc66cc2d50..a3115709fb03 100644 --- a/drivers/infiniband/hw/usnic/usnic_debugfs.c +++ b/drivers/infiniband/hw/usnic/usnic_debugfs.c @@ -165,6 +165,5 @@ void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow) void usnic_debugfs_flow_remove(struct usnic_ib_qp_grp_flow *qp_flow) { - if (!IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) - debugfs_remove(qp_flow->dbgfs_dentry); + debugfs_remove(qp_flow->dbgfs_dentry); } From 0b79b27748cbec221e1ceabf63578198602bf01d Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Mon, 10 Sep 2018 09:49:27 -0700 Subject: [PATCH 066/242] IB/{hfi1, qib, rdmavt}: Schedule multi RC/UC packets instead of posting The post_send() path determines if it should post directly or schedule the post for later. The current logic is: if the swqe ring is empty or (for hfi1) wqe->length <= piothreshold post the send else schedule This can allow large requests to call the send engine directly.
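Expressed as code, the current decision reduces to roughly the following (a simplified sketch, not the literal driver code; ring_empty() is a hypothetical stand-in for the real swqe ring check, and piothreshold applies to hfi1 only):

	/* Sketch of the pre-patch heuristic. */
	static bool post_directly(struct rvt_qp *qp, struct rvt_swqe *wqe)
	{
		return ring_empty(qp) || wqe->length <= piothreshold;
	}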
Large requests can potentially produce a large number of packets prior to returning to the caller, blocking the caller from posting more requests; scheduling such requests instead allows better parallel processing. Allow the driver(s) more say in this logic (pass call_send to the driver, rather than examining a return value). Update hfi1/qib logic to schedule the send engine if an RC or UC message is larger than the QP MTU size. Reviewed-by: Mike Marciniszyn Reviewed-by: Ira Weiny Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/qp.c | 12 +++++------- drivers/infiniband/hw/hfi1/verbs.h | 3 ++- drivers/infiniband/hw/qib/qib_qp.c | 17 +++++++---------- drivers/infiniband/hw/qib/qib_verbs.h | 3 ++- drivers/infiniband/sw/rdmavt/qp.c | 14 ++++++++------ include/rdma/rdma_vt.h | 10 ++++++++-- 6 files changed, 32 insertions(+), 27 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 9b1e84a6b1cc..54d9ff171059 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -285,17 +285,13 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, * hfi1_check_send_wqe - validate wqe * @qp - The qp * @wqe - The built wqe - * - * validate wqe. This is called - * prior to inserting the wqe into - * the ring but after the wqe has been - * setup. + * @call_send - Determine if the send should be posted or scheduled. * * Returns 0 on success, -EINVAL on failure * */ int hfi1_check_send_wqe(struct rvt_qp *qp, - struct rvt_swqe *wqe) + struct rvt_swqe *wqe, bool *call_send) { struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct rvt_ah *ah; @@ -305,6 +301,8 @@ int hfi1_check_send_wqe(struct rvt_qp *qp, case IB_QPT_UC: if (wqe->length > 0x80000000U) return -EINVAL; + if (wqe->length > qp->pmtu) + *call_send = false; break; case IB_QPT_SMI: ah = ibah_to_rvtah(wqe->ud_wr.ah); @@ -321,7 +319,7 @@ int hfi1_check_send_wqe(struct rvt_qp *qp, default: break; } - return wqe->length <= piothreshold; + return 0; } /** diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index a4d06502f06d..269ec338581b 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -343,7 +343,8 @@ int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait); -int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe); +int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, + bool *call_send); extern const u32 rc_only_opcode; extern const u32 uc_only_opcode; diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 344e401915f7..a81905df2d0f 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -378,25 +378,22 @@ void qib_flush_qp_waiters(struct rvt_qp *qp) * qib_check_send_wqe - validate wr/wqe * @qp - The qp * @wqe - The built wqe + * @call_send - Determine if the send should be posted or scheduled * - * validate wr/wqe. This is called - * prior to inserting the wqe into - * the ring but after the wqe has been - * setup.
- * - * Returns 1 to force direct progress, 0 otherwise, -EINVAL on failure + * Returns 0 on success, -EINVAL on failure */ int qib_check_send_wqe(struct rvt_qp *qp, - struct rvt_swqe *wqe) + struct rvt_swqe *wqe, bool *call_send) { struct rvt_ah *ah; - int ret = 0; switch (qp->ibqp.qp_type) { case IB_QPT_RC: case IB_QPT_UC: if (wqe->length > 0x80000000U) return -EINVAL; + if (wqe->length > qp->pmtu) + *call_send = false; break; case IB_QPT_SMI: case IB_QPT_GSI: @@ -405,12 +402,12 @@ int qib_check_send_wqe(struct rvt_qp *qp, if (wqe->length > (1 << ah->log_pmtu)) return -EINVAL; /* progress hint */ - ret = 1; + *call_send = true; break; default: break; } - return ret; + return 0; } #ifdef CONFIG_DEBUG_FS diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 666613eef88f..3d7b744ae8fb 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -303,7 +303,8 @@ void qib_rc_rcv(struct qib_ctxtdata *rcd, struct ib_header *hdr, int qib_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr); -int qib_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe); +int qib_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, + bool *call_send); struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid); diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 5ce403c6cddb..a9b7d7ff32ee 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1718,7 +1718,7 @@ static inline int rvt_qp_is_avail( */ static int rvt_post_one_wr(struct rvt_qp *qp, const struct ib_send_wr *wr, - int *call_send) + bool *call_send) { struct rvt_swqe *wqe; u32 next; @@ -1825,11 +1825,9 @@ static int rvt_post_one_wr(struct rvt_qp *qp, /* general part of wqe valid - allow for driver checks */ if (rdi->driver_f.check_send_wqe) { - ret = rdi->driver_f.check_send_wqe(qp, wqe); + ret = rdi->driver_f.check_send_wqe(qp, wqe, call_send); if (ret < 0) goto bail_inval_free; - if (ret) - *call_send = ret; } log_pmtu = qp->log_pmtu; @@ -1897,7 +1895,7 @@ int rvt_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, struct rvt_qp *qp = ibqp_to_rvtqp(ibqp); struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device); unsigned long flags = 0; - int call_send; + bool call_send; unsigned nreq = 0; int err = 0; @@ -1930,7 +1928,11 @@ int rvt_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, bail: spin_unlock_irqrestore(&qp->s_hlock, flags); if (nreq) { - if (call_send) + /* + * Only call do_send if there is exactly one packet, and the + * driver said it was ok. + */ + if (nreq == 1 && call_send) rdi->driver_f.do_send(qp); else rdi->driver_f.schedule_send_no_lock(qp); diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index e79229a0cf01..e32facdd9fd3 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -214,8 +214,14 @@ struct rvt_driver_provided { void (*schedule_send)(struct rvt_qp *qp); void (*schedule_send_no_lock)(struct rvt_qp *qp); - /* Driver specific work request checking */ - int (*check_send_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe); + /* + * Validate the wqe. This needs to be done prior to inserting the + * wqe into the ring, but after the wqe has been set up. Allow for + * driver specific work request checking by providing a callback. + * call_send indicates if the wqe should be posted or scheduled. 
+ */ + int (*check_send_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe, + bool *call_send); /* * Sometimes rdmavt needs to kick the driver's send progress. That is From f1a315420e79fe5c077fa119db9439ffabd2cda2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 10 Sep 2018 11:35:11 +0300 Subject: [PATCH 067/242] RDMA/hns: Fix an error code in hns_roce_v2_init_eq_table() The error code isn't set on this path. Signed-off-by: Dan Carpenter Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 0218c0f8c2a7..b84bfa48371d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -5125,6 +5125,7 @@ static int hns_roce_v2_init_eq_table(struct hns_roce_dev *hr_dev) create_singlethread_workqueue("hns_roce_irq_workqueue"); if (!hr_dev->irq_workq) { dev_err(dev, "Create irq workqueue failed!\n"); + ret = -ENOMEM; goto err_request_irq_fail; } From a0e0cb82804a6a21d9067022c2dfdf80d11da429 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Mon, 10 Sep 2018 09:39:03 -0700 Subject: [PATCH 068/242] IB/hfi1: Eliminate races in the SDMA send error path pq_update() can only be called in two places: from the completion function when the complete (npkts) sequence of packets has been submitted and processed, or from the setup function if a subset of the packets were submitted (i.e. the error path). Currently both paths can call pq_update() if an error occurs. This race will cause the n_req value to go negative, hanging file_close(), or cause a crash by freeing the txlist more than once. Several variables are used to determine SDMA send state. Most of these are unnecessary, and have races, visible by code inspection, between the setup function and the completion function, in both the send path and the error path. The request 'status' value can be set by the setup or by the completion function. This is racy by inspection. Since the status is not needed in the completion code or by the caller it has been removed. The request 'done' value races between usage by the setup and the completion function. The completion function does not need this. When the number of processed packets matches npkts, it is done. The 'has_error' value races between usage of the setup and the completion function. This can cause incorrect error handling and leave n_req with an incorrect value (i.e. negative). Simplify the code by removing all of the unneeded state checks and variables. Clean up the iovs node when it is freed. Eliminate race conditions in the error path: If all packets are submitted, the completion handler will set the completion status correctly (ok or aborted). If all packets are not submitted, the caller must wait until the submitted packets have completed, and then set the completion status. These two changes eliminate the race condition in the error path. Reviewed-by: Mitko Haralanov Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J.
Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/user_sdma.c | 85 ++++++++++++-------------- drivers/infiniband/hw/hfi1/user_sdma.h | 3 - 2 files changed, 38 insertions(+), 50 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index a3a7b33196d6..9f471672e2d5 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -328,7 +328,6 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, u8 opcode, sc, vl; u16 pkey; u32 slid; - int req_queued = 0; u16 dlid; u32 selector; @@ -392,7 +391,6 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, req->data_len = 0; req->pq = pq; req->cq = cq; - req->status = -1; req->ahg_idx = -1; req->iov_idx = 0; req->sent = 0; @@ -400,12 +398,14 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, req->seqcomp = 0; req->seqsubmitted = 0; req->tids = NULL; - req->done = 0; req->has_error = 0; INIT_LIST_HEAD(&req->txps); memcpy(&req->info, &info, sizeof(info)); + /* The request is initialized, count it */ + atomic_inc(&pq->n_reqs); + if (req_opcode(info.ctrl) == EXPECTED) { /* expected must have a TID info and at least one data vector */ if (req->data_iovs < 2) { @@ -500,7 +500,6 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, ret = pin_vector_pages(req, &req->iovs[i]); if (ret) { req->data_iovs = i; - req->status = ret; goto free_req; } req->data_len += req->iovs[i].iov.iov_len; @@ -561,14 +560,10 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, req->ahg_idx = sdma_ahg_alloc(req->sde); set_comp_state(pq, cq, info.comp_idx, QUEUED, 0); - atomic_inc(&pq->n_reqs); - req_queued = 1; /* Send the first N packets in the request to buy us some time */ ret = user_sdma_send_pkts(req, pcount); - if (unlikely(ret < 0 && ret != -EBUSY)) { - req->status = ret; + if (unlikely(ret < 0 && ret != -EBUSY)) goto free_req; - } /* * It is possible that the SDMA engine would have processed all the @@ -588,14 +583,8 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, while (req->seqsubmitted != req->info.npkts) { ret = user_sdma_send_pkts(req, pcount); if (ret < 0) { - if (ret != -EBUSY) { - req->status = ret; - WRITE_ONCE(req->has_error, 1); - if (READ_ONCE(req->seqcomp) == - req->seqsubmitted - 1) - goto free_req; - return ret; - } + if (ret != -EBUSY) + goto free_req; wait_event_interruptible_timeout( pq->busy.wait_dma, (pq->state == SDMA_PKT_Q_ACTIVE), @@ -606,10 +595,19 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, *count += idx; return 0; free_req: - user_sdma_free_request(req, true); - if (req_queued) + /* + * If the submitted seqsubmitted == npkts, the completion routine + * controls the final state. If sequbmitted < npkts, wait for any + * outstanding packets to finish before cleaning up. + */ + if (req->seqsubmitted < req->info.npkts) { + if (req->seqsubmitted) + wait_event(pq->busy.wait_dma, + (req->seqcomp == req->seqsubmitted - 1)); + user_sdma_free_request(req, true); pq_update(pq); - set_comp_state(pq, cq, info.comp_idx, ERROR, req->status); + set_comp_state(pq, cq, info.comp_idx, ERROR, ret); + } return ret; } @@ -917,7 +915,6 @@ dosend: ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count); req->seqsubmitted += count; if (req->seqsubmitted == req->info.npkts) { - WRITE_ONCE(req->done, 1); /* * The txreq has already been submitted to the HW queue * so we can free the AHG entry now. 
Corruption will not @@ -1365,11 +1362,15 @@ static int set_txreq_header_ahg(struct user_sdma_request *req, return idx; } -/* - * SDMA tx request completion callback. Called when the SDMA progress - * state machine gets notification that the SDMA descriptors for this - * tx request have been processed by the DMA engine. Called in - * interrupt context. +/** + * user_sdma_txreq_cb() - SDMA tx request completion callback. + * @txreq: valid sdma tx request + * @status: success/failure of request + * + * Called when the SDMA progress state machine gets notification that + * the SDMA descriptors for this tx request have been processed by the + * DMA engine. Called in interrupt context. + * Only do work on completed sequences. */ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) { @@ -1378,7 +1379,7 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) struct user_sdma_request *req; struct hfi1_user_sdma_pkt_q *pq; struct hfi1_user_sdma_comp_q *cq; - u16 idx; + enum hfi1_sdma_comp_state state = COMPLETE; if (!tx->req) return; @@ -1391,31 +1392,19 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) SDMA_DBG(req, "SDMA completion with error %d", status); WRITE_ONCE(req->has_error, 1); + state = ERROR; } req->seqcomp = tx->seqnum; kmem_cache_free(pq->txreq_cache, tx); - tx = NULL; - idx = req->info.comp_idx; - if (req->status == -1 && status == SDMA_TXREQ_S_OK) { - if (req->seqcomp == req->info.npkts - 1) { - req->status = 0; - user_sdma_free_request(req, false); - pq_update(pq); - set_comp_state(pq, cq, idx, COMPLETE, 0); - } - } else { - if (status != SDMA_TXREQ_S_OK) - req->status = status; - if (req->seqcomp == (READ_ONCE(req->seqsubmitted) - 1) && - (READ_ONCE(req->done) || - READ_ONCE(req->has_error))) { - user_sdma_free_request(req, false); - pq_update(pq); - set_comp_state(pq, cq, idx, ERROR, req->status); - } - } + /* sequence isn't complete? We are done */ + if (req->seqcomp != req->info.npkts - 1) + return; + + user_sdma_free_request(req, false); + set_comp_state(pq, cq, req->info.comp_idx, state, status); + pq_update(pq); } static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq) @@ -1448,6 +1437,8 @@ static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) if (!node) continue; + req->iovs[i].node = NULL; + if (unpin) hfi1_mmu_rb_remove(req->pq->handler, &node->rb); diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h index d2bc77f75253..0ae06456c868 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.h +++ b/drivers/infiniband/hw/hfi1/user_sdma.h @@ -205,8 +205,6 @@ struct user_sdma_request { /* Writeable fields shared with interrupt */ u64 seqcomp ____cacheline_aligned_in_smp; u64 seqsubmitted; - /* status of the last txreq completed */ - int status; /* Send side fields */ struct list_head txps ____cacheline_aligned_in_smp; @@ -228,7 +226,6 @@ struct user_sdma_request { u16 tididx; /* progress index moving along the iovs array */ u8 iov_idx; - u8 done; u8 has_error; struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ]; From 28a9a9e83ceae2cee25b9af9ad20d53aaa9ab951 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Mon, 10 Sep 2018 09:39:11 -0700 Subject: [PATCH 069/242] IB/hfi1: Remove race conditions in user_sdma send path Packet queue state is overused to determine SDMA descriptor availability and packet queue request state.
cpu 0 ret = user_sdma_send_pkts(req, pcount); cpu 0 if (atomic_read(&pq->n_reqs)) cpu 1 IRQ user_sdma_txreq_cb calls pq_update() (state to _INACTIVE) cpu 0 xchg(&pq->state, SDMA_PKT_Q_ACTIVE); At this point pq->n_reqs == 0 and pq->state is incorrectly SDMA_PKT_Q_ACTIVE. The close path will hang waiting for the state to return to _INACTIVE. This can also change the state from _DEFERRED to _ACTIVE. However, this is a mostly benign race. Remove the racy code path. Use n_reqs to determine if a packet queue is active or not. Reviewed-by: Mitko Haralanov Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/user_sdma.c | 24 ++++++++++-------------- drivers/infiniband/hw/hfi1/user_sdma.h | 9 +++++---- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 9f471672e2d5..fd6e9f56f53e 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -187,7 +187,6 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, pq->ctxt = uctxt->ctxt; pq->subctxt = fd->subctxt; pq->n_max_reqs = hfi1_sdma_comp_ring_size; - pq->state = SDMA_PKT_Q_INACTIVE; atomic_set(&pq->n_reqs, 0); init_waitqueue_head(&pq->wait); atomic_set(&pq->n_locked, 0); @@ -276,7 +275,7 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd, /* Wait until all requests have been freed. */ wait_event_interruptible( pq->wait, - (READ_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE)); + !atomic_read(&pq->n_reqs)); kfree(pq->reqs); kfree(pq->req_in_use); kmem_cache_destroy(pq->txreq_cache); @@ -312,6 +311,13 @@ static u8 dlid_to_selector(u16 dlid) return mapping[hash]; } +/** + * hfi1_user_sdma_process_request() - Process and start a user sdma request + * @fd: valid file descriptor + * @iovec: array of io vectors to process + * @dim: overall iovec array size + * @count: number of io vector array entries processed + */ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, struct iovec *iovec, unsigned long dim, unsigned long *count) @@ -560,20 +566,12 @@ int hfi1_user_sdma_process_request(struct hfi1_filedata *fd, req->ahg_idx = sdma_ahg_alloc(req->sde); set_comp_state(pq, cq, info.comp_idx, QUEUED, 0); + pq->state = SDMA_PKT_Q_ACTIVE; /* Send the first N packets in the request to buy us some time */ ret = user_sdma_send_pkts(req, pcount); if (unlikely(ret < 0 && ret != -EBUSY)) goto free_req; - /* - * It is possible that the SDMA engine would have processed all the - * submitted packets by the time we get here. Therefore, only set - * packet queue state to ACTIVE if there are still uncompleted - * requests. - */ - if (atomic_read(&pq->n_reqs)) - xchg(&pq->state, SDMA_PKT_Q_ACTIVE); - /* * This is a somewhat blocking send implementation. 
* The driver will block the caller until all packets of the @@ -1409,10 +1407,8 @@ static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status) static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq) { - if (atomic_dec_and_test(&pq->n_reqs)) { - xchg(&pq->state, SDMA_PKT_Q_INACTIVE); + if (atomic_dec_and_test(&pq->n_reqs)) wake_up(&pq->wait); - } } static void user_sdma_free_request(struct user_sdma_request *req, bool unpin) diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h index 0ae06456c868..91c343f91776 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.h +++ b/drivers/infiniband/hw/hfi1/user_sdma.h @@ -105,9 +105,10 @@ static inline int ahg_header_set(u32 *arr, int idx, size_t array_size, #define TXREQ_FLAGS_REQ_ACK BIT(0) /* Set the ACK bit in the header */ #define TXREQ_FLAGS_REQ_DISABLE_SH BIT(1) /* Disable header suppression */ -#define SDMA_PKT_Q_INACTIVE BIT(0) -#define SDMA_PKT_Q_ACTIVE BIT(1) -#define SDMA_PKT_Q_DEFERRED BIT(2) +enum pkt_q_sdma_state { + SDMA_PKT_Q_ACTIVE, + SDMA_PKT_Q_DEFERRED, +}; /* * Maximum retry attempts to submit a TX request @@ -133,7 +134,7 @@ struct hfi1_user_sdma_pkt_q { struct user_sdma_request *reqs; unsigned long *req_in_use; struct iowait busy; - unsigned state; + enum pkt_q_sdma_state state; wait_queue_head_t wait; unsigned long unpinned; struct mmu_rb_handler *handler; From 3ca633f1ff7b1c1e8653181352485889b5636a12 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Mon, 10 Sep 2018 09:39:20 -0700 Subject: [PATCH 070/242] IB/hfi1: Right size user_sdma sequence numbers and related variables Hardware limits the maximum number of packets to a u16 count. Match that size for all relevant sequence numbers in the user_sdma engine. Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/sdma.c | 4 ++-- drivers/infiniband/hw/hfi1/sdma.h | 2 +- drivers/infiniband/hw/hfi1/user_sdma.c | 8 ++++---- drivers/infiniband/hw/hfi1/user_sdma.h | 8 ++++---- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 88e326d6cc49..7a9b67e82a96 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -2444,7 +2444,7 @@ nodesc: * @sde: sdma engine to use * @wait: wait structure to use when full (may be NULL) * @tx_list: list of sdma_txreqs to submit - * @count: pointer to a u32 which, after return will contain the total number of + * @count: pointer to a u16 which, after return will contain the total number of * sdma_txreqs removed from the tx_list.
This will include sdma_txreqs * whose SDMA descriptors are submitted to the ring and the sdma_txreqs * which are added to SDMA engine flush list if the SDMA engine state is @@ -2468,7 +2468,7 @@ nodesc: * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state */ int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait, - struct list_head *tx_list, u32 *count_out) + struct list_head *tx_list, u16 *count_out) { struct sdma_txreq *tx, *tx_next; int ret = 0; diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h index d2da2e651600..c076eef081e8 100644 --- a/drivers/infiniband/hw/hfi1/sdma.h +++ b/drivers/infiniband/hw/hfi1/sdma.h @@ -849,7 +849,7 @@ int sdma_send_txreq(struct sdma_engine *sde, int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait, struct list_head *tx_list, - u32 *count); + u16 *count_out); int sdma_ahg_alloc(struct sdma_engine *sde); void sdma_ahg_free(struct sdma_engine *sde, int ahg_index); diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index fd6e9f56f53e..bee75beda730 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -76,8 +76,7 @@ MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 12 static unsigned initial_pkt_count = 8; -static int user_sdma_send_pkts(struct user_sdma_request *req, - unsigned maxpkts); +static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts); static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status); static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq); static void user_sdma_free_request(struct user_sdma_request *req, bool unpin); @@ -756,9 +755,10 @@ static int user_sdma_txadd(struct user_sdma_request *req, return ret; } -static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts) +static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts) { - int ret = 0, count; + int ret = 0; + u16 count; unsigned npkts = 0; struct user_sdma_txreq *tx = NULL; struct hfi1_user_sdma_pkt_q *pq = NULL; diff --git a/drivers/infiniband/hw/hfi1/user_sdma.h b/drivers/infiniband/hw/hfi1/user_sdma.h index 91c343f91776..14dfd757dafd 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.h +++ b/drivers/infiniband/hw/hfi1/user_sdma.h @@ -204,12 +204,12 @@ struct user_sdma_request { s8 ahg_idx; /* Writeable fields shared with interrupt */ - u64 seqcomp ____cacheline_aligned_in_smp; - u64 seqsubmitted; + u16 seqcomp ____cacheline_aligned_in_smp; + u16 seqsubmitted; /* Send side fields */ struct list_head txps ____cacheline_aligned_in_smp; - u64 seqnum; + u16 seqnum; /* * KDETH.OFFSET (TID) field * The offset can cover multiple packets, depending on the @@ -246,7 +246,7 @@ struct user_sdma_txreq { struct user_sdma_request *req; u16 flags; unsigned int busycount; - u64 seqnum; + u16 seqnum; }; int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, From 2bf4b33f83dfe521c4c7c407b6b150aeec04d69c Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Mon, 10 Sep 2018 09:39:28 -0700 Subject: [PATCH 071/242] IB/hfi1: Missing return value in error path for user sdma If the set_txreq_header_ahg() function returns an error, the exit path is chosen. In this path, the code fails to set the return value. This will cause the caller to not realize an error has occurred. Set the return value correctly in the error path. Signed-off-by: Michael J.
Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/user_sdma.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index bee75beda730..825e475dc9fe 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -860,8 +860,10 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts) changes = set_txreq_header_ahg(req, tx, datalen); - if (changes < 0) + if (changes < 0) { + ret = changes; goto free_tx; + } } } else { ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) + From b53ae6bc7e397930ff20221d1a5976677e0dc85b Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Mon, 10 Sep 2018 07:54:04 -0700 Subject: [PATCH 072/242] IB/hfi1: set_intr_bits uses incorrect source for register modification HFI IRQ enable bits are not being set correctly. Send context error and DC IRQs are not being enabled correctly. In addition, send context error IRQs are not being delivered. Because of this, send context errors are not being handled correctly when they occur. When setting the IRQ bits, if an IRQ range is used, and the last bit is on a register boundary (bit 63), the calculated index for the final register modification is incorrect (index + 1 vs. index). The incorrect index calculation causes incorrect IRQ bits to be set. In this case the send context error IRQ is NOT enabled. Fix by using the 'last' value rather than the counted 'src' value to determine the final index to use. This satisfies all cases. Fixes: a2f7bbdc2dba ("IB/hfi1: Rework the IRQ API to be more flexible") Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index ef433fdb2411..290c3b5b22f3 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -13012,7 +13012,7 @@ int set_intr_bits(struct hfi1_devdata *dd, u16 first, u16 last, bool set) } bits |= BIT_ULL(bit); } - read_mod_write(dd, src, bits, set); + read_mod_write(dd, last, bits, set); return 0; } From 99ed748e878a99c6c7b87bbec063eefd9e47cb42 Mon Sep 17 00:00:00 2001 From: Moni Shoua Date: Wed, 12 Sep 2018 09:33:55 +0300 Subject: [PATCH 073/242] IB/mlx5: Allow transition of DCI QP to reset The transition is allowed from any state and the attribute mask must be IB_QP_STATE.
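From user space this is a plain state-only transition. A hedged sketch using the standard libibverbs API (assuming an already-created DCI QP handle qp; error handling elided):

	#include <infiniband/verbs.h>

	/* Move a DCI QP back to RESET; only IB_QP_STATE is in the mask. */
	struct ibv_qp_attr attr = { .qp_state = IBV_QPS_RESET };
	int err = ibv_modify_qp(qp, &attr, IBV_QP_STATE);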
Fixes: c32a4f296e1d ("IB/mlx5: Add support for DC Initiator QP") Signed-off-by: Moni Shoua Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index a2073ee7a16f..1f35ecbefffe 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3272,7 +3272,9 @@ static bool modify_dci_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state new int req = IB_QP_STATE; int opt = 0; - if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { + if (new_state == IB_QPS_RESET) { + return is_valid_mask(attr_mask, req, opt); + } else if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) { req |= IB_QP_PKEY_INDEX | IB_QP_PORT; return is_valid_mask(attr_mask, req, opt); } else if (cur_state == IB_QPS_INIT && new_state == IB_QPS_INIT) { From caf1e3ae9fa648d6dd38468736868d6867cab273 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:16 +0300 Subject: [PATCH 074/242] RDMA/core: Introduce and use rdma_find_ndev_for_src_ip_rcu This fixes two issues: 1. When the address family is other than IPv4 or IPv6, rdma_translate_ip() returns success, which is incorrect. 2. When the address family is AF_INET6 and the source address is not found, it returns success, which is also incorrect. Therefore, introduce and use the rdma_find_ndev_for_src_ip_rcu() helper function, which returns a correct success or error status and is also useful for the future code refactor in addr_resolve(). Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 61 +++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 94ff38731be8..50ab50f1908b 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -232,6 +232,36 @@ void rdma_copy_addr(struct rdma_dev_addr *dev_addr, } EXPORT_SYMBOL(rdma_copy_addr); +static struct net_device * +rdma_find_ndev_for_src_ip_rcu(struct net *net, const struct sockaddr *src_in) +{ + struct net_device *dev = NULL; + int ret = -EADDRNOTAVAIL; + + switch (src_in->sa_family) { + case AF_INET: + dev = __ip_dev_find(net, + ((const struct sockaddr_in *)src_in)->sin_addr.s_addr, + false); + if (dev) + ret = 0; + break; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + for_each_netdev_rcu(net, dev) { + if (ipv6_chk_addr(net, + &((const struct sockaddr_in6 *)src_in)->sin6_addr, + dev, 1)) { + ret = 0; + break; + } + } + break; +#endif + } + return ret ?
ERR_PTR(ret) : dev; +} + int rdma_translate_ip(const struct sockaddr *addr, struct rdma_dev_addr *dev_addr) { @@ -246,33 +276,12 @@ int rdma_translate_ip(const struct sockaddr *addr, return 0; } - switch (addr->sa_family) { - case AF_INET: - dev = ip_dev_find(dev_addr->net, - ((const struct sockaddr_in *)addr)->sin_addr.s_addr); - - if (!dev) - return -EADDRNOTAVAIL; - + rcu_read_lock(); + dev = rdma_find_ndev_for_src_ip_rcu(dev_addr->net, addr); + if (!IS_ERR(dev)) rdma_copy_addr(dev_addr, dev, NULL); - dev_put(dev); - break; -#if IS_ENABLED(CONFIG_IPV6) - case AF_INET6: - rcu_read_lock(); - for_each_netdev_rcu(dev_addr->net, dev) { - if (ipv6_chk_addr(dev_addr->net, - &((const struct sockaddr_in6 *)addr)->sin6_addr, - dev, 1)) { - rdma_copy_addr(dev_addr, dev, NULL); - break; - } - } - rcu_read_unlock(); - break; -#endif - } - return 0; + rcu_read_unlock(); + return PTR_ERR_OR_ZERO(dev); } EXPORT_SYMBOL(rdma_translate_ip); From f89b7dfa33537bba9ee082a17a55242fc727e9f4 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:17 +0300 Subject: [PATCH 075/242] RDMA/core: Avoid unnecessary sa_family overwrite addr4_resolve() and addr6_resolve() are called based on the value of sa_family. Both of these functions overwrite the value after typecasting; this is not necessary. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 50ab50f1908b..858ceffbeeaa 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -392,7 +392,6 @@ static int addr4_resolve(struct sockaddr_in *src_in, if (ret) return ret; - src_in->sin_family = AF_INET; src_in->sin_addr.s_addr = fl4.saddr; /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're @@ -429,10 +428,8 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, return ret; rt = (struct rt6_info *)dst; - if (ipv6_addr_any(&src_in->sin6_addr)) { - src_in->sin6_family = AF_INET6; + if (ipv6_addr_any(&src_in->sin6_addr)) src_in->sin6_addr = fl6.saddr; - } /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network From 89c5691cdd95ab39f43bd102ec3f0ff39716ae85 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:18 +0300 Subject: [PATCH 076/242] RDMA/core: Let protocol specific function typecast sockaddr structure The current code typecasts the destination address using an extra variable but uses the source address as is. Even though the compiler optimizes such code well, just let each protocol-specific function typecast both src and dest and keep the code symmetric.
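The pattern being adopted, reduced to a minimal sketch (illustrative only, not the kernel code itself; resolve6() is assumed to be the analogous IPv6 helper):

	/* Each family-specific helper performs its own casts... */
	static int resolve4(struct sockaddr *src_sock,
			    const struct sockaddr *dst_sock)
	{
		struct sockaddr_in *src = (struct sockaddr_in *)src_sock;
		const struct sockaddr_in *dst =
				(const struct sockaddr_in *)dst_sock;
		/* ... family-specific work on src/dst ... */
		return 0;
	}

	/* ...so the dispatcher stays symmetric and cast-free. */
	ret = (src_in->sa_family == AF_INET) ?
		resolve4(src_in, dst_in) : resolve6(src_in, dst_in);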
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 858ceffbeeaa..9649e5e55e9e 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -372,11 +372,15 @@ static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr, return dst_fetch_ha(dst, dev_addr, daddr); } -static int addr4_resolve(struct sockaddr_in *src_in, - const struct sockaddr_in *dst_in, +static int addr4_resolve(struct sockaddr *src_sock, + const struct sockaddr *dst_sock, struct rdma_dev_addr *addr, struct rtable **prt) { + struct sockaddr_in *src_in = (struct sockaddr_in *)src_sock; + const struct sockaddr_in *dst_in = + (const struct sockaddr_in *)dst_sock; + __be32 src_ip = src_in->sin_addr.s_addr; __be32 dst_ip = dst_in->sin_addr.s_addr; struct rtable *rt; @@ -408,11 +412,14 @@ static int addr4_resolve(struct sockaddr_in *src_in, } #if IS_ENABLED(CONFIG_IPV6) -static int addr6_resolve(struct sockaddr_in6 *src_in, - const struct sockaddr_in6 *dst_in, +static int addr6_resolve(struct sockaddr *src_sock, + const struct sockaddr *dst_sock, struct rdma_dev_addr *addr, struct dst_entry **pdst) { + struct sockaddr_in6 *src_in = (struct sockaddr_in6 *)src_sock; + const struct sockaddr_in6 *dst_in = + (const struct sockaddr_in6 *)dst_sock; struct flowi6 fl6; struct dst_entry *dst; struct rt6_info *rt; @@ -445,8 +452,8 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, return 0; } #else -static int addr6_resolve(struct sockaddr_in6 *src_in, - const struct sockaddr_in6 *dst_in, +static int addr6_resolve(struct sockaddr *src_sock, + const struct sockaddr *dst_sock, struct rdma_dev_addr *addr, struct dst_entry **pdst) { @@ -496,11 +503,8 @@ static int addr_resolve(struct sockaddr *src_in, if (src_in->sa_family == AF_INET) { struct rtable *rt = NULL; - const struct sockaddr_in *dst_in4 = - (const struct sockaddr_in *)dst_in; - ret = addr4_resolve((struct sockaddr_in *)src_in, - dst_in4, addr, &rt); + ret = addr4_resolve(src_in, dst_in, addr, &rt); if (ret) return ret; @@ -516,12 +520,7 @@ static int addr_resolve(struct sockaddr *src_in, ip_rt_put(rt); } else { - const struct sockaddr_in6 *dst_in6 = - (const struct sockaddr_in6 *)dst_in; - - ret = addr6_resolve((struct sockaddr_in6 *)src_in, - dst_in6, addr, - &dst); + ret = addr6_resolve(src_in, dst_in, addr, &dst); if (ret) return ret; From a362ea1d9e1acf674094614518f4245d17cfc01e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:19 +0300 Subject: [PATCH 077/242] RDMA/core: Introduce and use rdma_set_src_addr() between IPv4 and IPv6 rdma_translate_ip() is done while resolving the address for loopback addresses. The current flow is convoluted, with neighbor resolution being optional. This patch simplifies the code in the following ways: (a) Use common code between IPv4 and IPv6 for address translation, loopback checks and acquiring the netdevice. (b) During neigh resolve in addr_resolve_neigh(), copy only the destination address. (c) Always resolve the source address before the destination address, because doing so doesn't depend on whether neighbor resolution is requested. This reduces three calls of rdma_copy_addr() and rdma_translate_ip() to one and makes the code flow easier to follow.
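Put together, the resolve path now reads approximately as follows (a sketch; addrN_resolve() stands in for the per-family helper):

	ret = addrN_resolve(src_in, dst_in, addr, &dst);  /* route lookup */
	if (ret)
		return ret;
	ret = rdma_set_src_addr(dst, dst_in, addr);       /* source, always */
	if (!ret && resolve_neigh)                        /* destination only */
		ret = addr_resolve_neigh(dst, dst_in, addr, seq);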
Now that ib_nl_fetch_ha() doesn't depend on dst, drop dst argument from ib_nl_fetch_ha(). Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 65 ++++++++++++---------------------- 1 file changed, 23 insertions(+), 42 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 9649e5e55e9e..40f1c1563477 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -304,15 +304,12 @@ static void queue_req(struct addr_req *req) spin_unlock_bh(&lock); } -static int ib_nl_fetch_ha(const struct dst_entry *dst, - struct rdma_dev_addr *dev_addr, +static int ib_nl_fetch_ha(struct rdma_dev_addr *dev_addr, const void *daddr, u32 seq, u16 family) { if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) return -EADDRNOTAVAIL; - /* We fill in what we can, the response will fill the rest */ - rdma_copy_addr(dev_addr, dst->dev, NULL); return ib_nl_ip_send_msg(dev_addr, daddr, seq, family); } @@ -331,7 +328,7 @@ static int dst_fetch_ha(const struct dst_entry *dst, neigh_event_send(n, NULL); ret = -ENODATA; } else { - rdma_copy_addr(dev_addr, dst->dev, n->ha); + memcpy(dev_addr->dst_dev_addr, n->ha, MAX_ADDR_LEN); } neigh_release(n); @@ -367,7 +364,7 @@ static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr, /* Gateway + ARPHRD_INFINIBAND -> IB router */ if (has_gateway(dst, family) && dst->dev->type == ARPHRD_INFINIBAND) - return ib_nl_fetch_ha(dst, dev_addr, daddr, seq, family); + return ib_nl_fetch_ha(dev_addr, daddr, seq, family); else return dst_fetch_ha(dst, dev_addr, daddr); } @@ -467,32 +464,37 @@ static int addr_resolve_neigh(const struct dst_entry *dst, u32 seq) { if (dst->dev->flags & IFF_LOOPBACK) { - int ret; - - ret = rdma_translate_ip(dst_in, addr); - if (!ret) - memcpy(addr->dst_dev_addr, addr->src_dev_addr, - MAX_ADDR_LEN); - - return ret; + memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); + return 0; } /* If the device doesn't do ARP internally */ if (!(dst->dev->flags & IFF_NOARP)) return fetch_ha(dst, addr, dst_in, seq); - rdma_copy_addr(addr, dst->dev, NULL); - return 0; } +static int rdma_set_src_addr(const struct dst_entry *dst, + const struct sockaddr *dst_in, + struct rdma_dev_addr *dev_addr) +{ + int ret = 0; + + if (dst->dev->flags & IFF_LOOPBACK) + ret = rdma_translate_ip(dst_in, dev_addr); + else + rdma_copy_addr(dev_addr, dst->dev, NULL); + return ret; +} + static int addr_resolve(struct sockaddr *src_in, const struct sockaddr *dst_in, struct rdma_dev_addr *addr, bool resolve_neigh, u32 seq) { - struct net_device *ndev; + struct rtable *rt = NULL; struct dst_entry *dst; int ret; @@ -502,49 +504,28 @@ static int addr_resolve(struct sockaddr *src_in, } if (src_in->sa_family == AF_INET) { - struct rtable *rt = NULL; ret = addr4_resolve(src_in, dst_in, addr, &rt); if (ret) return ret; - if (resolve_neigh) + ret = rdma_set_src_addr(&rt->dst, dst_in, addr); + if (!ret && resolve_neigh) ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq); - if (addr->bound_dev_if) { - ndev = dev_get_by_index(addr->net, addr->bound_dev_if); - } else { - ndev = rt->dst.dev; - dev_hold(ndev); - } - ip_rt_put(rt); } else { ret = addr6_resolve(src_in, dst_in, addr, &dst); if (ret) return ret; - if (resolve_neigh) + ret = rdma_set_src_addr(dst, dst_in, addr); + if (!ret && resolve_neigh) ret = addr_resolve_neigh(dst, dst_in, addr, seq); - if (addr->bound_dev_if) { - ndev = dev_get_by_index(addr->net, 
addr->bound_dev_if); - } else { - ndev = dst->dev; - dev_hold(ndev); - } - dst_release(dst); } - if (ndev) { - if (ndev->flags & IFF_LOOPBACK) - ret = rdma_translate_ip(dst_in, addr); - else - addr->bound_dev_if = ndev->ifindex; - dev_put(ndev); - } - return ret; } From 77addc524473ee9a85d2ef5747a32173c85768d4 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:20 +0300 Subject: [PATCH 078/242] RDMA/core: Rename rdma_copy_addr to rdma_copy_src_l2_addr Now that rdma_copy_addr() only copies the source addresses and all callers are interested in copying only source addresses, simplify it to drop the destination address argument. Given that it only copies source layer2 addresses, rename it to rdma_copy_src_l2_addr for better code readability. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 24 +++++++++++++++--------- drivers/infiniband/core/cma.c | 4 ++-- drivers/infiniband/core/core_priv.h | 2 ++ include/rdma/ib_addr.h | 4 ---- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 40f1c1563477..c9d14d6996b2 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -219,18 +219,24 @@ int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr) } EXPORT_SYMBOL(rdma_addr_size_kss); -void rdma_copy_addr(struct rdma_dev_addr *dev_addr, - const struct net_device *dev, - const unsigned char *dst_dev_addr) +/** + * rdma_copy_src_l2_addr - Copy netdevice source addresses + * @dev_addr: Destination address pointer where to copy the addresses + * @dev: Netdevice whose source addresses to copy + * + * rdma_copy_src_l2_addr() copies source addresses from the specified netdevice. + * This includes unicast address, broadcast address, device type and + * interface index. 
+ */ +void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, + const struct net_device *dev) { dev_addr->dev_type = dev->type; memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN); memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN); - if (dst_dev_addr) - memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN); dev_addr->bound_dev_if = dev->ifindex; } -EXPORT_SYMBOL(rdma_copy_addr); +EXPORT_SYMBOL(rdma_copy_src_l2_addr); static struct net_device * rdma_find_ndev_for_src_ip_rcu(struct net *net, const struct sockaddr *src_in) @@ -271,7 +277,7 @@ int rdma_translate_ip(const struct sockaddr *addr, dev = dev_get_by_index(dev_addr->net, dev_addr->bound_dev_if); if (!dev) return -ENODEV; - rdma_copy_addr(dev_addr, dev, NULL); + rdma_copy_src_l2_addr(dev_addr, dev); dev_put(dev); return 0; } @@ -279,7 +285,7 @@ int rdma_translate_ip(const struct sockaddr *addr, rcu_read_lock(); dev = rdma_find_ndev_for_src_ip_rcu(dev_addr->net, addr); if (!IS_ERR(dev)) - rdma_copy_addr(dev_addr, dev, NULL); + rdma_copy_src_l2_addr(dev_addr, dev); rcu_read_unlock(); return PTR_ERR_OR_ZERO(dev); } @@ -484,7 +490,7 @@ static int rdma_set_src_addr(const struct dst_entry *dst, if (dst->dev->flags & IFF_LOOPBACK) ret = rdma_translate_ip(dst_in, dev_addr); else - rdma_copy_addr(dev_addr, dst->dev, NULL); + rdma_copy_src_l2_addr(dev_addr, dst->dev); return ret; } diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 4ba77f4e7098..ace2a4c757f6 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1900,7 +1900,7 @@ cma_ib_new_conn_id(const struct rdma_cm_id *listen_id, rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path; if (net_dev) { - rdma_copy_addr(&rt->addr.dev_addr, net_dev, NULL); + rdma_copy_src_l2_addr(&rt->addr.dev_addr, net_dev); } else { if (!cma_protocol_roce(listen_id) && cma_any_addr(cma_src_addr(id_priv))) { @@ -1950,7 +1950,7 @@ cma_ib_new_udp_id(const struct rdma_cm_id *listen_id, goto err; if (net_dev) { - rdma_copy_addr(&id->route.addr.dev_addr, net_dev, NULL); + rdma_copy_src_l2_addr(&id->route.addr.dev_addr, net_dev); } else { if (!cma_any_addr(cma_src_addr(id_priv))) { ret = cma_translate_addr(cma_src_addr(id_priv), diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 77c7005c396c..c3d93350413c 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -340,5 +340,7 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, u8 *dmac, const struct net_device *ndev, int *hoplimit); +void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, + const struct net_device *dev); #endif /* _CORE_PRIV_H */ diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 77c7908b7d73..676514a930ab 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -105,10 +105,6 @@ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, void rdma_addr_cancel(struct rdma_dev_addr *addr); -void rdma_copy_addr(struct rdma_dev_addr *dev_addr, - const struct net_device *dev, - const unsigned char *dst_dev_addr); - int rdma_addr_size(const struct sockaddr *addr); int rdma_addr_size_in6(struct sockaddr_in6 *addr); int rdma_addr_size_kss(struct __kernel_sockaddr_storage *addr); From 783793b5543d3b886f0704803198feeb058cccab Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:21 +0300 Subject: [PATCH 079/242] RDMA/core: Use common code flow for IPv4/6 for addr resolve Use common code flow for 
resolving neighbour and for finding source addresses. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index c9d14d6996b2..cbc64de2d791 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -500,8 +500,8 @@ static int addr_resolve(struct sockaddr *src_in, bool resolve_neigh, u32 seq) { + struct dst_entry *dst = NULL; struct rtable *rt = NULL; - struct dst_entry *dst; int ret; if (!addr->net) { @@ -510,28 +510,26 @@ static int addr_resolve(struct sockaddr *src_in, } if (src_in->sa_family == AF_INET) { - ret = addr4_resolve(src_in, dst_in, addr, &rt); - if (ret) - return ret; - - ret = rdma_set_src_addr(&rt->dst, dst_in, addr); - if (!ret && resolve_neigh) - ret = addr_resolve_neigh(&rt->dst, dst_in, addr, seq); - - ip_rt_put(rt); + dst = &rt->dst; } else { ret = addr6_resolve(src_in, dst_in, addr, &dst); - if (ret) - return ret; - - ret = rdma_set_src_addr(dst, dst_in, addr); - if (!ret && resolve_neigh) - ret = addr_resolve_neigh(dst, dst_in, addr, seq); - - dst_release(dst); } + if (ret) + return ret; + ret = rdma_set_src_addr(dst, dst_in, addr); + /* + * Resolve neighbor destination address if requested and + * only if src addr translation didn't fail. + */ + if (!ret && resolve_neigh) + ret = addr_resolve_neigh(dst, dst_in, addr, seq); + + if (src_in->sa_family == AF_INET) + ip_rt_put(rt); + else + dst_release(dst); return ret; } From 307edde8efb75cd39326f0f603c9693a5b2af019 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:22 +0300 Subject: [PATCH 080/242] RDMA/core: Refer to network type instead of device type Set and refer to rdma_dev_addr network type instead of dst->ndev to reduce dependency on accessing dst netdevice. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 35 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index cbc64de2d791..97d0b36b5120 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -368,8 +368,8 @@ static int fetch_ha(const struct dst_entry *dst, struct rdma_dev_addr *dev_addr, (const void *)&dst_in6->sin6_addr; sa_family_t family = dst_in->sa_family; - /* Gateway + ARPHRD_INFINIBAND -> IB router */ - if (has_gateway(dst, family) && dst->dev->type == ARPHRD_INFINIBAND) + /* If we have a gateway in IB mode then it must be an IB network */ + if (has_gateway(dst, family) && dev_addr->network == RDMA_NETWORK_IB) return ib_nl_fetch_ha(dev_addr, daddr, seq, family); else return dst_fetch_ha(dst, dev_addr, daddr); @@ -401,13 +401,6 @@ static int addr4_resolve(struct sockaddr *src_sock, src_in->sin_addr.s_addr = fl4.saddr; - /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're - * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network - * type accordingly. 
- */ - if (rt->rt_uses_gateway && rt->dst.dev->type != ARPHRD_INFINIBAND) - addr->network = RDMA_NETWORK_IPV4; - addr->hoplimit = ip4_dst_hoplimit(&rt->dst); *prt = rt; @@ -425,7 +418,6 @@ static int addr6_resolve(struct sockaddr *src_sock, (const struct sockaddr_in6 *)dst_sock; struct flowi6 fl6; struct dst_entry *dst; - struct rt6_info *rt; int ret; memset(&fl6, 0, sizeof fl6); @@ -437,18 +429,9 @@ static int addr6_resolve(struct sockaddr *src_sock, if (ret < 0) return ret; - rt = (struct rt6_info *)dst; if (ipv6_addr_any(&src_in->sin6_addr)) src_in->sin6_addr = fl6.saddr; - /* If there's a gateway and type of device not ARPHRD_INFINIBAND, we're - * definitely in RoCE v2 (as RoCE v1 isn't routable) set the network - * type accordingly. - */ - if (rt->rt6i_flags & RTF_GATEWAY && - ip6_dst_idev(dst)->dev->type != ARPHRD_INFINIBAND) - addr->network = RDMA_NETWORK_IPV6; - addr->hoplimit = ip6_dst_hoplimit(dst); *pdst = dst; @@ -491,6 +474,20 @@ static int rdma_set_src_addr(const struct dst_entry *dst, ret = rdma_translate_ip(dst_in, dev_addr); else rdma_copy_src_l2_addr(dev_addr, dst->dev); + + /* + * If there's a gateway and type of device not ARPHRD_INFINIBAND, + * we're definitely in RoCE v2 (as RoCE v1 isn't routable) set the + * network type accordingly. + */ + if (has_gateway(dst, dst_in->sa_family) && + dst->dev->type != ARPHRD_INFINIBAND) + dev_addr->network = dst_in->sa_family == AF_INET ? + RDMA_NETWORK_IPV4 : + RDMA_NETWORK_IPV6; + else + dev_addr->network = RDMA_NETWORK_IB; + return ret; } From c31d4b2ddf07ba74388cb8799517a7010e3e0c89 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:23 +0300 Subject: [PATCH 081/242] RDMA/core: Protect against changing dst->dev during destination resolve During resolving address process, during route lookup and while performing src address translation in case of loopback mode, hold the rcu lock so that if netdevice is moving to different net namespace, or being unregistered, it can be synchronized with net/core/dev.c, ie change_net_namespace() ->dev_close_many() ->rt6_uncached_list_flush_dev() who would change dst->dev to loopback device of the given net namespace. Therefore, hold the rcu lock and sync with synchronize_net() of change_net_namespace() to ensure that netdevice cannot get freed while dst->dev is being used. 
Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 61 +++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 97d0b36b5120..316a53f59ee8 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -450,23 +450,26 @@ static int addr6_resolve(struct sockaddr *src_sock, static int addr_resolve_neigh(const struct dst_entry *dst, const struct sockaddr *dst_in, struct rdma_dev_addr *addr, + unsigned int ndev_flags, u32 seq) { - if (dst->dev->flags & IFF_LOOPBACK) { + int ret = 0; + + if (ndev_flags & IFF_LOOPBACK) { memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); - return 0; + } else { + if (!(ndev_flags & IFF_NOARP)) { + /* If the device doesn't do ARP internally */ + ret = fetch_ha(dst, addr, dst_in, seq); + } } - - /* If the device doesn't do ARP internally */ - if (!(dst->dev->flags & IFF_NOARP)) - return fetch_ha(dst, addr, dst_in, seq); - - return 0; + return ret; } -static int rdma_set_src_addr(const struct dst_entry *dst, +static void copy_src_l2_addr(struct rdma_dev_addr *dev_addr, const struct sockaddr *dst_in, - struct rdma_dev_addr *dev_addr) + const struct dst_entry *dst, + const struct net_device *ndev) { int ret = 0; @@ -481,14 +484,37 @@ static int rdma_set_src_addr(const struct dst_entry *dst, * network type accordingly. */ if (has_gateway(dst, dst_in->sa_family) && - dst->dev->type != ARPHRD_INFINIBAND) + ndev->type != ARPHRD_INFINIBAND) dev_addr->network = dst_in->sa_family == AF_INET ? RDMA_NETWORK_IPV4 : RDMA_NETWORK_IPV6; else dev_addr->network = RDMA_NETWORK_IB; +} - return ret; +static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr, + unsigned int *ndev_flags, + const struct sockaddr *dst_in, + const struct dst_entry *dst) +{ + struct net_device *ndev = READ_ONCE(dst->dev); + + *ndev_flags = ndev->flags; + /* A physical device must be the RDMA device to use */ + if (ndev->flags & IFF_LOOPBACK) { + /* + * RDMA (IB/RoCE, iWarp) doesn't run on lo interface or + * loopback IP address. So if route is resolved to loopback + * interface, translate that to a real ndev based on non + * loopback IP address. + */ + ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev), dst_in); + if (!ndev) + return -ENODEV; + } + + copy_src_l2_addr(dev_addr, dst_in, dst, ndev); + return 0; } static int addr_resolve(struct sockaddr *src_in, @@ -498,6 +524,7 @@ static int addr_resolve(struct sockaddr *src_in, u32 seq) { struct dst_entry *dst = NULL; + unsigned int ndev_flags = 0; struct rtable *rt = NULL; int ret; @@ -506,22 +533,26 @@ static int addr_resolve(struct sockaddr *src_in, return -EINVAL; } + rcu_read_lock(); if (src_in->sa_family == AF_INET) { ret = addr4_resolve(src_in, dst_in, addr, &rt); dst = &rt->dst; } else { ret = addr6_resolve(src_in, dst_in, addr, &dst); } - if (ret) + if (ret) { + rcu_read_unlock(); return ret; + } + ret = rdma_set_src_addr_rcu(addr, &ndev_flags, dst_in, dst); + rcu_read_unlock(); - ret = rdma_set_src_addr(dst, dst_in, addr); /* * Resolve neighbor destination address if requested and * only if src addr translation didn't fail. 
*/ if (!ret && resolve_neigh) - ret = addr_resolve_neigh(dst, dst_in, addr, seq); + ret = addr_resolve_neigh(dst, dst_in, addr, ndev_flags, seq); if (src_in->sa_family == AF_INET) ip_rt_put(rt); From 6aaecd38568557266ff7a5c3765c58322586e4ce Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:24 +0300 Subject: [PATCH 082/242] RDMA/core: Simplify roce_resolve_route_from_path() Currently RoCE route resolve functionality is split between two functions. (a) roce_resolve_route_from_path() and its helper function rdma_resolve_ip_route(). Due to this multiple sockaddr src structures are created in both functions with rdma_dev_addr is an interface between the two for checks. Since there is only one user of rdma_resolve_ip_route() as RoCE, combine the functionality of both functions to roce_resolve_route_from_path() and further reduce the scope of rdma_dev_addr to core/addr.c This also allow to extend addr_resolve() in subsequent patch to consider netdev properties of GID in safer way under rcu lock. Additionally src and dst addresses were always provided, so skip the src addr NULL pointer check as they are present on the stack now. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 51 +++++++++++++++++++++-------- drivers/infiniband/core/core_priv.h | 3 ++ drivers/infiniband/core/sa_query.c | 40 ---------------------- 3 files changed, 41 insertions(+), 53 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 316a53f59ee8..c4c620334957 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -659,23 +660,47 @@ err: } EXPORT_SYMBOL(rdma_resolve_ip); -int rdma_resolve_ip_route(struct sockaddr *src_addr, - const struct sockaddr *dst_addr, - struct rdma_dev_addr *addr) +int roce_resolve_route_from_path(struct sa_path_rec *rec, + const struct ib_gid_attr *attr) { - struct sockaddr_storage ssrc_addr = {}; - struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid, dgid; + struct rdma_dev_addr dev_addr = {}; + int ret; - if (src_addr) { - if (src_addr->sa_family != dst_addr->sa_family) - return -EINVAL; + if (rec->roce.route_resolved) + return 0; - memcpy(src_in, src_addr, rdma_addr_size(src_addr)); - } else { - src_in->sa_family = dst_addr->sa_family; - } + rdma_gid2ip(&sgid._sockaddr, &rec->sgid); + rdma_gid2ip(&dgid._sockaddr, &rec->dgid); - return addr_resolve(src_in, dst_addr, addr, false, 0); + if (sgid._sockaddr.sa_family != dgid._sockaddr.sa_family) + return -EINVAL; + + if (!attr || !attr->ndev) + return -EINVAL; + + dev_addr.bound_dev_if = attr->ndev->ifindex; + /* TODO: Use net from the ib_gid_attr once it is added to it, + * until than, limit itself to init_net. 
+ */ + dev_addr.net = &init_net; + + ret = addr_resolve(&sgid._sockaddr, &dgid._sockaddr, + &dev_addr, false, 0); + if (ret) + return ret; + + if ((dev_addr.network == RDMA_NETWORK_IPV4 || + dev_addr.network == RDMA_NETWORK_IPV6) && + rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2) + return -EINVAL; + + rec->roce.route_resolved = true; + return 0; } /** diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index c3d93350413c..4dfcf41d83c0 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -343,4 +343,7 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, const struct net_device *dev); +struct sa_path_rec; +int roce_resolve_route_from_path(struct sa_path_rec *rec, + const struct ib_gid_attr *attr); #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index 7b794a14d6e8..d3d6275b3b7e 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1227,46 +1227,6 @@ static u8 get_src_path_mask(struct ib_device *device, u8 port_num) return src_path_mask; } -static int roce_resolve_route_from_path(struct sa_path_rec *rec, - const struct ib_gid_attr *attr) -{ - struct rdma_dev_addr dev_addr = {}; - union { - struct sockaddr _sockaddr; - struct sockaddr_in _sockaddr_in; - struct sockaddr_in6 _sockaddr_in6; - } sgid_addr, dgid_addr; - int ret; - - if (rec->roce.route_resolved) - return 0; - if (!attr || !attr->ndev) - return -EINVAL; - - dev_addr.bound_dev_if = attr->ndev->ifindex; - /* TODO: Use net from the ib_gid_attr once it is added to it, - * until than, limit itself to init_net. - */ - dev_addr.net = &init_net; - - rdma_gid2ip(&sgid_addr._sockaddr, &rec->sgid); - rdma_gid2ip(&dgid_addr._sockaddr, &rec->dgid); - - /* validate the route */ - ret = rdma_resolve_ip_route(&sgid_addr._sockaddr, - &dgid_addr._sockaddr, &dev_addr); - if (ret) - return ret; - - if ((dev_addr.network == RDMA_NETWORK_IPV4 || - dev_addr.network == RDMA_NETWORK_IPV6) && - rec->rec_type != SA_PATH_REC_TYPE_ROCE_V2) - return -EINVAL; - - rec->roce.route_resolved = true; - return 0; -} - static int init_ah_attr_grh_fields(struct ib_device *device, u8 port_num, struct sa_path_rec *rec, struct rdma_ah_attr *ah_attr, From d6b1764a8c5ac0ad3a66c6d11d24c4fe067fe933 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:25 +0300 Subject: [PATCH 083/242] RDMA/core: Introduce rdma_read_gid_attr_ndev_rcu() to check GID attribute Introduce an API rdma_read_gid_attr_ndev_rcu() to return GID attribute netdevice which is in UP state for accessing netdevice's fields such as net namespace and ifindex. This is useful for users who intent to access netdevice fields under rcu lock. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cache.c | 33 +++++++++++++++++++++++++++++ drivers/infiniband/core/core_priv.h | 2 ++ 2 files changed, 35 insertions(+) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 0bee1f4b914e..8957d31d60ca 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -1252,6 +1252,39 @@ void rdma_hold_gid_attr(const struct ib_gid_attr *attr) } EXPORT_SYMBOL(rdma_hold_gid_attr); +/** + * rdma_read_gid_attr_ndev_rcu - Read GID attribute netdevice + * which must be in UP state. 
+ * + * @attr:Pointer to the GID attribute + * + * Returns pointer to netdevice if the netdevice was attached to GID and + * netdevice is in UP state. Caller must hold RCU lock as this API + * reads the netdev flags which can change while netdevice migrates to + * different net namespace. Returns ERR_PTR with error code otherwise. + * + */ +struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr) +{ + struct ib_gid_table_entry *entry = + container_of(attr, struct ib_gid_table_entry, attr); + struct ib_device *device = entry->attr.device; + struct net_device *ndev = ERR_PTR(-ENODEV); + u8 port_num = entry->attr.port_num; + struct ib_gid_table *table; + unsigned long flags; + bool valid; + + table = rdma_gid_table(device, port_num); + + read_lock_irqsave(&table->rwlock, flags); + valid = is_gid_entry_valid(table->data_vec[attr->index]); + if (valid && attr->ndev && (READ_ONCE(attr->ndev->flags) & IFF_UP)) + ndev = attr->ndev; + read_unlock_irqrestore(&table->rwlock, flags); + return ndev; +} + static int config_non_roce_gid_cache(struct ib_device *device, u8 port, int gid_tbl_len) { diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 4dfcf41d83c0..33f50e1929e7 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -346,4 +346,6 @@ void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, struct sa_path_rec; int roce_resolve_route_from_path(struct sa_path_rec *rec, const struct ib_gid_attr *attr); + +struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr); #endif /* _CORE_PRIV_H */ From 0e9d2c19bff1d351005afb2f990a913e395ba6d4 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 5 Sep 2018 12:54:26 +0300 Subject: [PATCH 084/242] RDMA/core: Consider net ns of gid attribute for RoCE When resolving destination address or route, when net namespace is unavailable, refer to the net namespace of the netdevice of the SGID attribute. This is typically the case for requests arriving from the network for RoCE ports. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 75 ++++++++++++++++++++++++----- drivers/infiniband/core/cma.c | 7 +-- drivers/infiniband/core/core_priv.h | 2 +- drivers/infiniband/core/verbs.c | 2 +- include/rdma/ib_addr.h | 3 ++ 5 files changed, 73 insertions(+), 16 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index c4c620334957..7a0356c78f60 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -62,6 +62,7 @@ struct addr_req { struct rdma_dev_addr *addr, void *context); unsigned long timeout; struct delayed_work work; + bool resolve_by_gid_attr; /* Consider gid attr in resolve phase */ int status; u32 seq; }; @@ -518,10 +519,37 @@ static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr, return 0; } +static int set_addr_netns_by_gid_rcu(struct rdma_dev_addr *addr) +{ + struct net_device *ndev; + + ndev = rdma_read_gid_attr_ndev_rcu(addr->sgid_attr); + if (IS_ERR(ndev)) + return PTR_ERR(ndev); + + /* + * Since we are holding the rcu, reading net and ifindex + * are safe without any additional reference; because + * change_net_namespace() in net/core/dev.c does rcu sync + * after it changes the state to IFF_DOWN and before + * updating netdev fields {net, ifindex}. 
+ */ + addr->net = dev_net(ndev); + addr->bound_dev_if = ndev->ifindex; + return 0; +} + +static void rdma_addr_set_net_defaults(struct rdma_dev_addr *addr) +{ + addr->net = &init_net; + addr->bound_dev_if = 0; +} + static int addr_resolve(struct sockaddr *src_in, const struct sockaddr *dst_in, struct rdma_dev_addr *addr, bool resolve_neigh, + bool resolve_by_gid_attr, u32 seq) { struct dst_entry *dst = NULL; @@ -535,6 +563,23 @@ static int addr_resolve(struct sockaddr *src_in, } rcu_read_lock(); + if (resolve_by_gid_attr) { + if (!addr->sgid_attr) { + rcu_read_unlock(); + pr_warn_ratelimited("%s: missing gid_attr\n", __func__); + return -EINVAL; + } + /* + * If the request is for a specific gid attribute of the + * rdma_dev_addr, derive net from the netdevice of the + * GID attribute. + */ + ret = set_addr_netns_by_gid_rcu(addr); + if (ret) { + rcu_read_unlock(); + return ret; + } + } if (src_in->sa_family == AF_INET) { ret = addr4_resolve(src_in, dst_in, addr, &rt); dst = &rt->dst; @@ -543,7 +588,7 @@ static int addr_resolve(struct sockaddr *src_in, } if (ret) { rcu_read_unlock(); - return ret; + goto done; } ret = rdma_set_src_addr_rcu(addr, &ndev_flags, dst_in, dst); rcu_read_unlock(); @@ -559,6 +604,13 @@ static int addr_resolve(struct sockaddr *src_in, ip_rt_put(rt); else dst_release(dst); +done: + /* + * Clear the addr net to go back to its original state, only if it was + * derived from GID attribute in this context. + */ + if (resolve_by_gid_attr) + rdma_addr_set_net_defaults(addr); return ret; } @@ -573,7 +625,8 @@ static void process_one_req(struct work_struct *_work) src_in = (struct sockaddr *)&req->src_addr; dst_in = (struct sockaddr *)&req->dst_addr; req->status = addr_resolve(src_in, dst_in, req->addr, - true, req->seq); + true, req->resolve_by_gid_attr, + req->seq); if (req->status && time_after_eq(jiffies, req->timeout)) { req->status = -ETIMEDOUT; } else if (req->status == -ENODATA) { @@ -608,6 +661,7 @@ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), + bool resolve_by_gid_attr, void *context) { struct sockaddr *src_in, *dst_in; @@ -636,10 +690,12 @@ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, req->addr = addr; req->callback = callback; req->context = context; + req->resolve_by_gid_attr = resolve_by_gid_attr; INIT_DELAYED_WORK(&req->work, process_one_req); req->seq = (u32)atomic_inc_return(&ib_nl_addr_request_seq); - req->status = addr_resolve(src_in, dst_in, addr, true, req->seq); + req->status = addr_resolve(src_in, dst_in, addr, true, + req->resolve_by_gid_attr, req->seq); switch (req->status) { case 0: req->timeout = jiffies; @@ -683,14 +739,11 @@ int roce_resolve_route_from_path(struct sa_path_rec *rec, if (!attr || !attr->ndev) return -EINVAL; - dev_addr.bound_dev_if = attr->ndev->ifindex; - /* TODO: Use net from the ib_gid_attr once it is added to it, - * until than, limit itself to init_net. 
- */ dev_addr.net = &init_net; + dev_addr.sgid_attr = attr; ret = addr_resolve(&sgid._sockaddr, &dgid._sockaddr, - &dev_addr, false, 0); + &dev_addr, false, true, 0); if (ret) return ret; @@ -755,7 +808,7 @@ static void resolve_cb(int status, struct sockaddr *src_addr, int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *dmac, const struct net_device *ndev, + u8 *dmac, const struct ib_gid_attr *sgid_attr, int *hoplimit) { struct rdma_dev_addr dev_addr; @@ -771,12 +824,12 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, rdma_gid2ip(&dgid_addr._sockaddr, dgid); memset(&dev_addr, 0, sizeof(dev_addr)); - dev_addr.bound_dev_if = ndev->ifindex; dev_addr.net = &init_net; + dev_addr.sgid_attr = sgid_attr; init_completion(&ctx.comp); ret = rdma_resolve_ip(&sgid_addr._sockaddr, &dgid_addr._sockaddr, - &dev_addr, 1000, resolve_cb, &ctx); + &dev_addr, 1000, resolve_cb, true, &ctx); if (ret) return ret; diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index ace2a4c757f6..a57c8b823302 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2987,9 +2987,10 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, if (dst_addr->sa_family == AF_IB) { ret = cma_resolve_ib_addr(id_priv); } else { - ret = rdma_resolve_ip(cma_src_addr(id_priv), - dst_addr, &id->route.addr.dev_addr, - timeout_ms, addr_handler, id_priv); + ret = rdma_resolve_ip(cma_src_addr(id_priv), dst_addr, + &id->route.addr.dev_addr, + timeout_ms, addr_handler, + false, id_priv); } } if (ret) diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 33f50e1929e7..d7399d5b1cb6 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -338,7 +338,7 @@ int rdma_resolve_ip_route(struct sockaddr *src_addr, int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *dmac, const struct net_device *ndev, + u8 *dmac, const struct ib_gid_attr *sgid_attr, int *hoplimit); void rdma_copy_src_l2_addr(struct rdma_dev_addr *dev_addr, const struct net_device *dev); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 6ee03d6089eb..c36be384fe34 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -710,7 +710,7 @@ static int ib_resolve_unicast_gid_dmac(struct ib_device *device, ret = rdma_addr_find_l2_eth_by_grh(&sgid_attr->gid, &grh->dgid, ah_attr->roce.dmac, - sgid_attr->ndev, &hop_limit); + sgid_attr, &hop_limit); grh->hop_limit = hop_limit; return ret; diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 676514a930ab..2e33b1529015 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -95,12 +95,15 @@ int rdma_translate_ip(const struct sockaddr *addr, * @timeout_ms: Amount of time to wait for the address resolution to complete. * @callback: Call invoked once address resolution has completed, timed out, * or been canceled. A status of 0 indicates success. + * @resolve_by_gid_attr: Resolve the ip based on the GID attribute from + * rdma_dev_addr. * @context: User-specified context associated with the call. 
*/ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), + bool resolve_by_gid_attr, void *context); void rdma_addr_cancel(struct rdma_dev_addr *addr); From 4c0b6534c9100c3ad2d69e7dd3562a165a346204 Mon Sep 17 00:00:00 2001 From: Arseny Maslennikov Date: Thu, 6 Sep 2018 17:51:10 +0300 Subject: [PATCH 085/242] Documentation/ABI: document /sys/class/net/*/dev_port The sysfs field was introduced 4 years ago along with fixes to various drivers that erroneously used `dev_id' for that purpose, but it was not properly documented anywhere. See commit v3.14-rc3-739-g3f85944fe207. Signed-off-by: Arseny Maslennikov Signed-off-by: Doug Ledford --- Documentation/ABI/testing/sysfs-class-net | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net index 2f1788111cd9..ec2232f6a949 100644 --- a/Documentation/ABI/testing/sysfs-class-net +++ b/Documentation/ABI/testing/sysfs-class-net @@ -91,6 +91,24 @@ Description: stacked (e.g: VLAN interfaces) but still have the same MAC address as their parent device. +What: /sys/class/net//dev_port +Date: February 2014 +KernelVersion: 3.15 +Contact: netdev@vger.kernel.org +Description: + Indicates the port number of this network device, formatted + as a decimal value. Some NICs have multiple independent ports + on the same PCI bus, device and function. This attribute allows + userspace to distinguish the respective interfaces. + + Note: some device drivers started to use 'dev_id' for this + purpose since long before 3.15 and have not adopted the new + attribute ever since. To query the port number, some tools look + exclusively at 'dev_port', while others only consult 'dev_id'. + If a network device has multiple client adapter ports as + described in the previous paragraph and does not set this + attribute to its port number, it's a kernel bug. + What: /sys/class/net//dormant Date: March 2006 KernelVersion: 2.6.17 From 9b8b2a323008aedd39a8debb861b825707f01420 Mon Sep 17 00:00:00 2001 From: Arseny Maslennikov Date: Thu, 6 Sep 2018 17:51:11 +0300 Subject: [PATCH 086/242] IB/ipoib: Use dev_port to expose network interface port numbers Some InfiniBand network devices have multiple ports on the same PCI function. This initializes the `dev_port' sysfs field of those network interfaces with their port number. Prior to this the kernel erroneously used the `dev_id' sysfs field of those network interfaces to convey the port number to userspace. The use of `dev_id' was considered correct until Linux 3.15, when another field, `dev_port', was defined for this particular purpose and `dev_id' was reserved for distinguishing stacked ifaces (e.g: VLANs) with the same hardware address as their parent device. Similar fixes to net/mlx4_en and many other drivers, which started exporting this information through `dev_id' before 3.15, were accepted into the kernel 4 years ago. See 76a066f2a2a0 (`net/mlx4_en: Expose port number through sysfs'). 
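For illustration, a minimal userspace sketch of the lookup this
enables (read_netdev_port() is a hypothetical helper, not part of any
library; per the ABI text above, old kernels may only provide
'dev_id'):

	#include <stdio.h>

	/* Returns the decimal dev_port value for an interface, or -1. */
	static int read_netdev_port(const char *ifname)
	{
		char path[64];
		unsigned int port;
		FILE *f;
		int ret = -1;

		snprintf(path, sizeof(path),
			 "/sys/class/net/%s/dev_port", ifname);
		f = fopen(path, "r");
		if (!f)
			return -1;
		if (fscanf(f, "%u", &port) == 1)
			ret = (int)port;
		fclose(f);
		return ret;
	}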
Signed-off-by: Arseny Maslennikov Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 2d20de72e67f..a046076670e6 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1881,6 +1881,8 @@ static int ipoib_parent_init(struct net_device *ndev) sizeof(union ib_gid)); SET_NETDEV_DEV(priv->dev, priv->ca->dev.parent); + priv->dev->dev_port = priv->port - 1; + /* Let's set this one too for backwards compatibility. */ priv->dev->dev_id = priv->port - 1; return 0; From f6350da41dc728da5a27044bde6f770525ba97d6 Mon Sep 17 00:00:00 2001 From: Arseny Maslennikov Date: Thu, 6 Sep 2018 17:51:12 +0300 Subject: [PATCH 087/242] IB/ipoib: Log sysfs 'dev_id' accesses from userspace Some tools may currently be using only the deprecated attribute; let's print an elaborate and clear deprecation notice to kmsg. To do that, we have to replace the whole sysfs file, since we inherit the original one from netdev. Signed-off-by: Arseny Maslennikov Signed-off-by: Doug Ledford --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 31 +++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index a046076670e6..1df90f0d9e64 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -2387,6 +2387,35 @@ int ipoib_add_pkey_attr(struct net_device *dev) return device_create_file(&dev->dev, &dev_attr_pkey); } +/* + * We erroneously exposed the iface's port number in the dev_id + * sysfs field long after dev_port was introduced for that purpose[1], + * and we need to stop everyone from relying on that. + * Let's overload the shower routine for the dev_id file here + * to gently bring the issue up. + * + * [1] https://www.spinics.net/lists/netdev/msg272123.html + */ +static ssize_t dev_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *ndev = to_net_dev(dev); + + if (ndev->dev_id == ndev->dev_port) + netdev_info_once(ndev, + "\"%s\" wants to know my dev_id. Should it look at dev_port instead? See Documentation/ABI/testing/sysfs-class-net for more info.\n", + current->comm); + + return sprintf(buf, "%#x\n", ndev->dev_id); +} +static DEVICE_ATTR_RO(dev_id); + +int ipoib_intercept_dev_id_attr(struct net_device *dev) +{ + device_remove_file(&dev->dev, &dev_attr_dev_id); + return device_create_file(&dev->dev, &dev_attr_dev_id); +} + static struct net_device *ipoib_add_port(const char *format, struct ib_device *hca, u8 port) { @@ -2428,6 +2457,8 @@ static struct net_device *ipoib_add_port(const char *format, */ ndev->priv_destructor = ipoib_intf_free; + if (ipoib_intercept_dev_id_attr(ndev)) + goto sysfs_failed; if (ipoib_cm_add_mode_attr(ndev)) goto sysfs_failed; if (ipoib_add_pkey_attr(ndev)) From cb816cd22618b1822667a4c2c80023ffd0261777 Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Thu, 13 Sep 2018 21:47:52 +0800 Subject: [PATCH 088/242] RDMA: Remove duplicated include from ib_addr.h Remove duplicated include. 
Signed-off-by: YueHaibing
Reviewed-by: Leon Romanovsky
Signed-off-by: Doug Ledford
---
 include/rdma/ib_addr.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h
index 2e33b1529015..e09eca91eb18 100644
--- a/include/rdma/ib_addr.h
+++ b/include/rdma/ib_addr.h
@@ -46,7 +46,6 @@
 #include
 #include
 #include
-#include
 #include

 /**

From 9a59739bd01f77db6fbe2955a4fce165f0f43568 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Tue, 14 Aug 2018 15:33:02 -0700
Subject: [PATCH 089/242] IB/rxe: Revise the ib_wr_opcode enum

This enum has become part of the uABI, as both RXE and the
ib_uverbs_post_send() command expect userspace to supply values from
this enum. So it should be properly placed in include/uapi/rdma.

In userspace this enum is called 'enum ibv_wr_opcode' as part of
libibverbs.h. That enum defines different values for IB_WR_LOCAL_INV,
IB_WR_SEND_WITH_INV, and IB_WR_LSO. These were introduced (incorrectly,
it turns out) into libibverbs in 2015.

The kernel has changed its mind on the numbering for several of the
IB_WC values over the years, but has remained stable on IB_WR_LOCAL_INV
and below.

Based on this we can conclude that there is no real user space user of
the values beyond IB_WR_ATOMIC_FETCH_AND_ADD, as they have never worked
via rdma-core. This is confirmed by inspection: only rxe uses the
kernel enum and implements the latter operations. rxe has clearly never
worked with these attributes from userspace. Other drivers that support
these opcodes implement the functionality without calling out to the
kernel.

To make IB_WR_SEND_WITH_INV and related opcodes work for RXE in
userspace we choose to renumber the IB_WR enum in the kernel to match
the uABI that userspace has been using since before Soft RoCE was
merged. This is an overall simpler configuration for the whole software
stack, and obviously can't break anything existing.
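As a sketch of the invariant this creates (a hypothetical validation
helper, assuming the renumbered enums in the diff below):

	/* Userspace may only issue the shared uABI opcodes; kernel-only
	 * opcodes (IB_WR_REG_MR and up) start at 0x20 and can never be
	 * supplied through ib_uverbs_send_wr.opcode.
	 */
	static bool wr_opcode_is_uabi(u32 opcode)
	{
		return opcode <= IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD;
	}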
Reported-by: Seth Howell Tested-by: Seth Howell Fixes: 8700e3e7c485 ("Soft RoCE driver") Cc: Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 34 ++++++++++++++++++------------- include/uapi/rdma/ib_user_verbs.h | 20 +++++++++++++++++- 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6076c9b72ab9..e463d3007a35 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1281,21 +1281,27 @@ struct ib_qp_attr { }; enum ib_wr_opcode { - IB_WR_RDMA_WRITE, - IB_WR_RDMA_WRITE_WITH_IMM, - IB_WR_SEND, - IB_WR_SEND_WITH_IMM, - IB_WR_RDMA_READ, - IB_WR_ATOMIC_CMP_AND_SWP, - IB_WR_ATOMIC_FETCH_AND_ADD, - IB_WR_LSO, - IB_WR_SEND_WITH_INV, - IB_WR_RDMA_READ_WITH_INV, - IB_WR_LOCAL_INV, - IB_WR_REG_MR, - IB_WR_MASKED_ATOMIC_CMP_AND_SWP, - IB_WR_MASKED_ATOMIC_FETCH_AND_ADD, + /* These are shared with userspace */ + IB_WR_RDMA_WRITE = IB_UVERBS_WR_RDMA_WRITE, + IB_WR_RDMA_WRITE_WITH_IMM = IB_UVERBS_WR_RDMA_WRITE_WITH_IMM, + IB_WR_SEND = IB_UVERBS_WR_SEND, + IB_WR_SEND_WITH_IMM = IB_UVERBS_WR_SEND_WITH_IMM, + IB_WR_RDMA_READ = IB_UVERBS_WR_RDMA_READ, + IB_WR_ATOMIC_CMP_AND_SWP = IB_UVERBS_WR_ATOMIC_CMP_AND_SWP, + IB_WR_ATOMIC_FETCH_AND_ADD = IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD, + IB_WR_LSO = IB_UVERBS_WR_TSO, + IB_WR_SEND_WITH_INV = IB_UVERBS_WR_SEND_WITH_INV, + IB_WR_RDMA_READ_WITH_INV = IB_UVERBS_WR_RDMA_READ_WITH_INV, + IB_WR_LOCAL_INV = IB_UVERBS_WR_LOCAL_INV, + IB_WR_MASKED_ATOMIC_CMP_AND_SWP = + IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP, + IB_WR_MASKED_ATOMIC_FETCH_AND_ADD = + IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD, + + /* These are kernel only and can not be issued by userspace */ + IB_WR_REG_MR = 0x20, IB_WR_REG_SIG_MR, + /* reserve values for low level drivers' internal use. * These values will not be used at all in the ib core layer. */ diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 25a16760de2a..1254b51a551a 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -763,10 +763,28 @@ struct ib_uverbs_sge { __u32 lkey; }; +enum ib_uverbs_wr_opcode { + IB_UVERBS_WR_RDMA_WRITE = 0, + IB_UVERBS_WR_RDMA_WRITE_WITH_IMM = 1, + IB_UVERBS_WR_SEND = 2, + IB_UVERBS_WR_SEND_WITH_IMM = 3, + IB_UVERBS_WR_RDMA_READ = 4, + IB_UVERBS_WR_ATOMIC_CMP_AND_SWP = 5, + IB_UVERBS_WR_ATOMIC_FETCH_AND_ADD = 6, + IB_UVERBS_WR_LOCAL_INV = 7, + IB_UVERBS_WR_BIND_MW = 8, + IB_UVERBS_WR_SEND_WITH_INV = 9, + IB_UVERBS_WR_TSO = 10, + IB_UVERBS_WR_RDMA_READ_WITH_INV = 11, + IB_UVERBS_WR_MASKED_ATOMIC_CMP_AND_SWP = 12, + IB_UVERBS_WR_MASKED_ATOMIC_FETCH_AND_ADD = 13, + /* Review enum ib_wr_opcode before modifying this */ +}; + struct ib_uverbs_send_wr { __aligned_u64 wr_id; __u32 num_sge; - __u32 opcode; + __u32 opcode; /* see enum ib_uverbs_wr_opcode */ __u32 send_flags; union { __be32 imm_data; From 6ebce44746036c923d0c255672735399192467c3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:37:38 +0300 Subject: [PATCH 090/242] RDMA/uverbs: Remove is_closed from ib_uverbs_file This does nothing but indicate if the uverbs_file is in the device's list, use list_del_init instead. 
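The idiom, in a self-contained sketch (generic names, not driver code):

	#include <linux/list.h>
	#include <linux/mutex.h>

	struct item {
		struct list_head node;
	};

	static void detach_item(struct item *it, struct mutex *lock)
	{
		mutex_lock(lock);
		/*
		 * list_del_init() re-initializes the entry to point at
		 * itself, so a second call is harmless and
		 * list_empty(&it->node) later answers "is it still on a
		 * list?" without a separate flag.
		 */
		list_del_init(&it->node);
		mutex_unlock(lock);
	}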
Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky --- drivers/infiniband/core/uverbs.h | 1 - drivers/infiniband/core/uverbs_main.c | 8 ++------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 717ab35b0af9..24369eb66c67 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -147,7 +147,6 @@ struct ib_uverbs_file { struct ib_event_handler event_handler; struct ib_uverbs_async_event_file *async_file; struct list_head list; - int is_closed; /* * To access the uobjects list hw_destroy_rwsem must be held for write diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 16e5f714ca53..176271db9ed7 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -905,10 +905,7 @@ static int ib_uverbs_close(struct inode *inode, struct file *filp) uverbs_destroy_ufile_hw(file, RDMA_REMOVE_CLOSE); mutex_lock(&file->device->lists_mutex); - if (!file->is_closed) { - list_del(&file->list); - file->is_closed = 1; - } + list_del_init(&file->list); mutex_unlock(&file->device->lists_mutex); if (file->async_file) @@ -1104,8 +1101,7 @@ static void ib_uverbs_free_hw_resources(struct ib_uverbs_device *uverbs_dev, while (!list_empty(&uverbs_dev->uverbs_file_list)) { file = list_first_entry(&uverbs_dev->uverbs_file_list, struct ib_uverbs_file, list); - file->is_closed = 1; - list_del(&file->list); + list_del_init(&file->list); kref_get(&file->ref); /* We must release the mutex before going ahead and calling From 802fa45cd320de319e86c93bca72abec028ba059 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5kon=20Bugge?= Date: Mon, 17 Sep 2018 16:07:07 +0200 Subject: [PATCH 091/242] RDMA/i40iw: Fix incorrect iterator type MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit f27b4746f378 ("i40iw: add connection management code") uses an incorrect rcu iterator, whilst holding the rtnl_lock. Since the critical region invokes i40iw_manage_qhash(), which is a sleeping function, the rcu locking and traversal cannot be used. Signed-off-by: HÃ¥kon Bugge Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/i40iw/i40iw_cm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c b/drivers/infiniband/hw/i40iw/i40iw_cm.c index 423818a7d333..771eb6bd0785 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_cm.c +++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c @@ -1689,7 +1689,7 @@ static enum i40iw_status_code i40iw_add_mqh_6(struct i40iw_device *iwdev, unsigned long flags; rtnl_lock(); - for_each_netdev_rcu(&init_net, ip_dev) { + for_each_netdev(&init_net, ip_dev) { if ((((rdma_vlan_dev_vlan_id(ip_dev) < I40IW_NO_VLAN) && (rdma_vlan_dev_real_dev(ip_dev) == iwdev->netdev)) || (ip_dev == iwdev->netdev)) && (ip_dev->flags & IFF_UP)) { From 0965cc953a235196b8d6ef0cba45ecb5c355194f Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 19 Sep 2018 20:28:38 +0800 Subject: [PATCH 092/242] RDMA/core: Properly return the error code of rdma_set_src_addr_rcu rdma_set_src_addr_rcu should check copy_src_l2_addr fails, rather than always return 0. Also copy_src_l2_addr should return 'ret' as its return value when rdma_translate_ip fails. 
Fixes: c31d4b2ddf07 ("RDMA/core: Protect against changing dst->dev during destination resolve")
Signed-off-by: YueHaibing
Reviewed-by: Parav Pandit
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/core/addr.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 7a0356c78f60..c2ca9e4b5160 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -468,10 +468,10 @@ static int addr_resolve_neigh(const struct dst_entry *dst,
 return ret;
}

-static void copy_src_l2_addr(struct rdma_dev_addr *dev_addr,
- const struct sockaddr *dst_in,
- const struct dst_entry *dst,
- const struct net_device *ndev)
+static int copy_src_l2_addr(struct rdma_dev_addr *dev_addr,
+ const struct sockaddr *dst_in,
+ const struct dst_entry *dst,
+ const struct net_device *ndev)
{
 int ret = 0;

@@ -492,6 +492,8 @@ static void copy_src_l2_addr(struct rdma_dev_addr *dev_addr,
 RDMA_NETWORK_IPV6;
 else
 dev_addr->network = RDMA_NETWORK_IB;
+
+ return ret;
}

 static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr,
@@ -515,8 +517,7 @@ static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr,
 return -ENODEV;
 }

- copy_src_l2_addr(dev_addr, dst_in, dst, ndev);
- return 0;
+ return copy_src_l2_addr(dev_addr, dst_in, dst, ndev);
}

 static int set_addr_netns_by_gid_rcu(struct rdma_dev_addr *addr)

From 0099103926b68e6675a1be4644848f5b1c1b6f97 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Mon, 17 Sep 2018 15:44:46 -0600
Subject: [PATCH 093/242] RDMA/uverbs: Fix error unwind in ib_uverbs_add_one

The error path has several mistakes:

- cdev_del should not be called if cdev_device_add fails
- We must call put_device on all the goto exit paths as that is what
  frees the uapi, SRCU and the struct itself.

While we are here consolidate all the uverbs_dev init that cannot fail
at the top.
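In sketch form the resulting shape is (setup_uapi() is a stand-in name
for the fallible steps; this is the shape of the fix, not the exact
code):

	device_initialize(&uverbs_dev->dev);	/* refcount is now 1 */

	ret = setup_uapi(uverbs_dev);		/* any step that can fail */
	if (ret)
		goto err;

	ret = cdev_device_add(&uverbs_dev->cdev, &uverbs_dev->dev);
	if (ret)
		goto err;	/* add failed, so no cdev_del() here */

	return;
err:
	/* the release callback, not kfree(), frees the structure */
	put_device(&uverbs_dev->dev);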
Fixes: c5c4d92e70f3 ("RDMA/uverbs: Use cdev_device_add() instead of cdev_add()")
Signed-off-by: Jason Gunthorpe
Reviewed-by: Parav Pandit
---
 drivers/infiniband/core/uverbs_main.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 176271db9ed7..db6de9157668 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -1028,6 +1028,12 @@ static void ib_uverbs_add_one(struct ib_device *device)
 return;
 }

+ device_initialize(&uverbs_dev->dev);
+ uverbs_dev->dev.class = uverbs_class;
+ uverbs_dev->dev.parent = device->dev.parent;
+ uverbs_dev->dev.release = ib_uverbs_release_dev;
+ uverbs_dev->groups[0] = &dev_attr_group;
+ uverbs_dev->dev.groups = uverbs_dev->groups;
 atomic_set(&uverbs_dev->refcount, 1);
 init_completion(&uverbs_dev->comp);
 uverbs_dev->xrcd_tree = RB_ROOT;
@@ -1035,6 +1041,8 @@ static void ib_uverbs_add_one(struct ib_device *device)
 mutex_init(&uverbs_dev->lists_mutex);
 INIT_LIST_HEAD(&uverbs_dev->uverbs_file_list);
 INIT_LIST_HEAD(&uverbs_dev->uverbs_events_file_list);
+ rcu_assign_pointer(uverbs_dev->ib_dev, device);
+ uverbs_dev->num_comp_vectors = device->num_comp_vectors;

 devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
 if (devnum >= IB_UVERBS_MAX_DEVICES)
@@ -1046,19 +1054,10 @@ static void ib_uverbs_add_one(struct ib_device *device)
 else
 base = IB_UVERBS_BASE_DEV + devnum;

- rcu_assign_pointer(uverbs_dev->ib_dev, device);
- uverbs_dev->num_comp_vectors = device->num_comp_vectors;
-
 if (ib_uverbs_create_uapi(device, uverbs_dev))
 goto err_uapi;

- device_initialize(&uverbs_dev->dev);
- uverbs_dev->dev.class = uverbs_class;
- uverbs_dev->dev.parent = device->dev.parent;
 uverbs_dev->dev.devt = base;
- uverbs_dev->dev.release = ib_uverbs_release_dev;
- uverbs_dev->groups[0] = &dev_attr_group;
- uverbs_dev->dev.groups = uverbs_dev->groups;

 dev_set_name(&uverbs_dev->dev, "uverbs%d", uverbs_dev->devnum);

 cdev_init(&uverbs_dev->cdev,
@@ -1067,20 +1066,18 @@ static void ib_uverbs_add_one(struct ib_device *device)

 ret = cdev_device_add(&uverbs_dev->cdev, &uverbs_dev->dev);
 if (ret)
- goto err_cdev;
+ goto err_uapi;

 ib_set_client_data(device, &uverbs_client, uverbs_dev);
 return;

-err_cdev:
- cdev_del(&uverbs_dev->cdev);
- put_device(&uverbs_dev->dev);
 err_uapi:
 clear_bit(devnum, dev_map);
 err:
 if (atomic_dec_and_test(&uverbs_dev->refcount))
 ib_uverbs_comp_dev(uverbs_dev);
 wait_for_completion(&uverbs_dev->comp);
+ put_device(&uverbs_dev->dev);
 return;
}

From b00a92c8f2cae4ea8c84ddae9b9ba5daeb9f327f Mon Sep 17 00:00:00 2001
From: liuyixian
Date: Mon, 3 Sep 2018 17:18:14 +0800
Subject: [PATCH 094/242] RDMA/hns: Move all prints out of irq handle

Under some configurations, printing inside the aeq handler can take
long enough to trigger unnecessary timeout interrupts. Thus, move all
prints out of the aeq handler to a work queue.
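A minimal sketch of the deferral pattern (field names follow struct
hns_roce_work; report_work() and defer_report() are illustrative names,
simplified from the patch):

	static void report_work(struct work_struct *work)
	{
		struct hns_roce_work *w =
			container_of(work, struct hns_roce_work, work);

		dev_warn(w->hr_dev->dev, "event %d on QP %u\n",
			 w->event_type, w->qpn);
		kfree(w);
	}

	/* called from the AEQ interrupt handler; no printing here */
	static void defer_report(struct hns_roce_dev *hr_dev,
				 int event_type, u32 qpn)
	{
		struct hns_roce_work *w = kzalloc(sizeof(*w), GFP_ATOMIC);

		if (!w)
			return;
		w->hr_dev = hr_dev;
		w->event_type = event_type;
		w->qpn = qpn;
		INIT_WORK(&w->work, report_work);
		queue_work(hr_dev->irq_workq, &w->work);
	}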
Signed-off-by: liuyixian Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 1 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 232 +++++++++----------- 2 files changed, 99 insertions(+), 134 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 9a24fd0ee3e7..cfb88a41b28f 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -738,6 +738,7 @@ struct hns_roce_work { struct hns_roce_dev *hr_dev; struct work_struct work; u32 qpn; + u32 cqn; int event_type; int sub_type; }; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index b84bfa48371d..7ccf3774eb71 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -3995,13 +3995,103 @@ static void hns_roce_irq_work_handle(struct work_struct *work) { struct hns_roce_work *irq_work = container_of(work, struct hns_roce_work, work); + struct device *dev = irq_work->hr_dev->dev; u32 qpn = irq_work->qpn; + u32 cqn = irq_work->cqn; switch (irq_work->event_type) { + case HNS_ROCE_EVENT_TYPE_PATH_MIG: + dev_info(dev, "Path migrated succeeded.\n"); + break; + case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: + dev_warn(dev, "Path migration failed.\n"); + break; + case HNS_ROCE_EVENT_TYPE_COMM_EST: + dev_info(dev, "Communication established.\n"); + break; + case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: + dev_warn(dev, "Send queue drained.\n"); + break; case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: - case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: - case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + dev_err(dev, "Local work queue catastrophic error.\n"); hns_roce_set_qps_to_err(irq_work->hr_dev, qpn); + switch (irq_work->sub_type) { + case HNS_ROCE_LWQCE_QPC_ERROR: + dev_err(dev, "QP %d, QPC error.\n", qpn); + break; + case HNS_ROCE_LWQCE_MTU_ERROR: + dev_err(dev, "QP %d, MTU error.\n", qpn); + break; + case HNS_ROCE_LWQCE_WQE_BA_ADDR_ERROR: + dev_err(dev, "QP %d, WQE BA addr error.\n", qpn); + break; + case HNS_ROCE_LWQCE_WQE_ADDR_ERROR: + dev_err(dev, "QP %d, WQE addr error.\n", qpn); + break; + case HNS_ROCE_LWQCE_SQ_WQE_SHIFT_ERROR: + dev_err(dev, "QP %d, WQE shift error.\n", qpn); + break; + default: + dev_err(dev, "Unhandled sub_event type %d.\n", + irq_work->sub_type); + break; + } + break; + case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: + dev_err(dev, "Invalid request local work queue error.\n"); + hns_roce_set_qps_to_err(irq_work->hr_dev, qpn); + break; + case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: + dev_err(dev, "Local access violation work queue error.\n"); + hns_roce_set_qps_to_err(irq_work->hr_dev, qpn); + switch (irq_work->sub_type) { + case HNS_ROCE_LAVWQE_R_KEY_VIOLATION: + dev_err(dev, "QP %d, R_key violation.\n", qpn); + break; + case HNS_ROCE_LAVWQE_LENGTH_ERROR: + dev_err(dev, "QP %d, length error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_VA_ERROR: + dev_err(dev, "QP %d, VA error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_PD_ERROR: + dev_err(dev, "QP %d, PD error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_RW_ACC_ERROR: + dev_err(dev, "QP %d, rw acc error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_KEY_STATE_ERROR: + dev_err(dev, "QP %d, key state error.\n", qpn); + break; + case HNS_ROCE_LAVWQE_MR_OPERATION_ERROR: + dev_err(dev, "QP %d, MR operation error.\n", qpn); + break; + default: + dev_err(dev, "Unhandled sub_event type %d.\n", + irq_work->sub_type); + break; + } + break; + case 
HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: + dev_warn(dev, "SRQ limit reach.\n"); + break; + case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: + dev_warn(dev, "SRQ last wqe reach.\n"); + break; + case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR: + dev_err(dev, "SRQ catas error.\n"); + break; + case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: + dev_err(dev, "CQ 0x%x access err.\n", cqn); + break; + case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: + dev_warn(dev, "CQ 0x%x overflow\n", cqn); + break; + case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: + dev_warn(dev, "DB overflow.\n"); + break; + case HNS_ROCE_EVENT_TYPE_FLR: + dev_warn(dev, "Function level reset.\n"); break; default: break; @@ -4011,7 +4101,8 @@ static void hns_roce_irq_work_handle(struct work_struct *work) } static void hns_roce_v2_init_irq_work(struct hns_roce_dev *hr_dev, - struct hns_roce_eq *eq, u32 qpn) + struct hns_roce_eq *eq, + u32 qpn, u32 cqn) { struct hns_roce_work *irq_work; @@ -4022,6 +4113,7 @@ static void hns_roce_v2_init_irq_work(struct hns_roce_dev *hr_dev, INIT_WORK(&(irq_work->work), hns_roce_irq_work_handle); irq_work->hr_dev = hr_dev; irq_work->qpn = qpn; + irq_work->cqn = cqn; irq_work->event_type = eq->event_type; irq_work->sub_type = eq->sub_type; queue_work(hr_dev->irq_workq, &(irq_work->work)); @@ -4058,124 +4150,6 @@ static void set_eq_cons_index_v2(struct hns_roce_eq *eq) hns_roce_write64_k(doorbell, eq->doorbell); } -static void hns_roce_v2_wq_catas_err_handle(struct hns_roce_dev *hr_dev, - struct hns_roce_aeqe *aeqe, - u32 qpn) -{ - struct device *dev = hr_dev->dev; - int sub_type; - - dev_warn(dev, "Local work queue catastrophic error.\n"); - sub_type = roce_get_field(aeqe->asyn, HNS_ROCE_V2_AEQE_SUB_TYPE_M, - HNS_ROCE_V2_AEQE_SUB_TYPE_S); - switch (sub_type) { - case HNS_ROCE_LWQCE_QPC_ERROR: - dev_warn(dev, "QP %d, QPC error.\n", qpn); - break; - case HNS_ROCE_LWQCE_MTU_ERROR: - dev_warn(dev, "QP %d, MTU error.\n", qpn); - break; - case HNS_ROCE_LWQCE_WQE_BA_ADDR_ERROR: - dev_warn(dev, "QP %d, WQE BA addr error.\n", qpn); - break; - case HNS_ROCE_LWQCE_WQE_ADDR_ERROR: - dev_warn(dev, "QP %d, WQE addr error.\n", qpn); - break; - case HNS_ROCE_LWQCE_SQ_WQE_SHIFT_ERROR: - dev_warn(dev, "QP %d, WQE shift error.\n", qpn); - break; - default: - dev_err(dev, "Unhandled sub_event type %d.\n", sub_type); - break; - } -} - -static void hns_roce_v2_local_wq_access_err_handle(struct hns_roce_dev *hr_dev, - struct hns_roce_aeqe *aeqe, u32 qpn) -{ - struct device *dev = hr_dev->dev; - int sub_type; - - dev_warn(dev, "Local access violation work queue error.\n"); - sub_type = roce_get_field(aeqe->asyn, HNS_ROCE_V2_AEQE_SUB_TYPE_M, - HNS_ROCE_V2_AEQE_SUB_TYPE_S); - switch (sub_type) { - case HNS_ROCE_LAVWQE_R_KEY_VIOLATION: - dev_warn(dev, "QP %d, R_key violation.\n", qpn); - break; - case HNS_ROCE_LAVWQE_LENGTH_ERROR: - dev_warn(dev, "QP %d, length error.\n", qpn); - break; - case HNS_ROCE_LAVWQE_VA_ERROR: - dev_warn(dev, "QP %d, VA error.\n", qpn); - break; - case HNS_ROCE_LAVWQE_PD_ERROR: - dev_err(dev, "QP %d, PD error.\n", qpn); - break; - case HNS_ROCE_LAVWQE_RW_ACC_ERROR: - dev_warn(dev, "QP %d, rw acc error.\n", qpn); - break; - case HNS_ROCE_LAVWQE_KEY_STATE_ERROR: - dev_warn(dev, "QP %d, key state error.\n", qpn); - break; - case HNS_ROCE_LAVWQE_MR_OPERATION_ERROR: - dev_warn(dev, "QP %d, MR operation error.\n", qpn); - break; - default: - dev_err(dev, "Unhandled sub_event type %d.\n", sub_type); - break; - } -} - -static void hns_roce_v2_qp_err_handle(struct hns_roce_dev *hr_dev, - struct hns_roce_aeqe *aeqe, - int event_type, u32 qpn) -{ - 
struct device *dev = hr_dev->dev; - - switch (event_type) { - case HNS_ROCE_EVENT_TYPE_COMM_EST: - dev_warn(dev, "Communication established.\n"); - break; - case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: - dev_warn(dev, "Send queue drained.\n"); - break; - case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: - hns_roce_v2_wq_catas_err_handle(hr_dev, aeqe, qpn); - break; - case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: - dev_warn(dev, "Invalid request local work queue error.\n"); - break; - case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: - hns_roce_v2_local_wq_access_err_handle(hr_dev, aeqe, qpn); - break; - default: - break; - } - - hns_roce_qp_event(hr_dev, qpn, event_type); -} - -static void hns_roce_v2_cq_err_handle(struct hns_roce_dev *hr_dev, - struct hns_roce_aeqe *aeqe, - int event_type, u32 cqn) -{ - struct device *dev = hr_dev->dev; - - switch (event_type) { - case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: - dev_warn(dev, "CQ 0x%x access err.\n", cqn); - break; - case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: - dev_warn(dev, "CQ 0x%x overflow\n", cqn); - break; - default: - break; - } - - hns_roce_cq_event(hr_dev, cqn, event_type); -} - static struct hns_roce_aeqe *get_aeqe_v2(struct hns_roce_eq *eq, u32 entry) { u32 buf_chk_sz; @@ -4251,31 +4225,23 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, switch (event_type) { case HNS_ROCE_EVENT_TYPE_PATH_MIG: - dev_warn(dev, "Path migrated succeeded.\n"); - break; case HNS_ROCE_EVENT_TYPE_PATH_MIG_FAILED: - dev_warn(dev, "Path migration failed.\n"); - break; case HNS_ROCE_EVENT_TYPE_COMM_EST: case HNS_ROCE_EVENT_TYPE_SQ_DRAINED: case HNS_ROCE_EVENT_TYPE_WQ_CATAS_ERROR: case HNS_ROCE_EVENT_TYPE_INV_REQ_LOCAL_WQ_ERROR: case HNS_ROCE_EVENT_TYPE_LOCAL_WQ_ACCESS_ERROR: - hns_roce_v2_qp_err_handle(hr_dev, aeqe, event_type, - qpn); + hns_roce_qp_event(hr_dev, qpn, event_type); break; case HNS_ROCE_EVENT_TYPE_SRQ_LIMIT_REACH: case HNS_ROCE_EVENT_TYPE_SRQ_LAST_WQE_REACH: case HNS_ROCE_EVENT_TYPE_SRQ_CATAS_ERROR: - dev_warn(dev, "SRQ not support.\n"); break; case HNS_ROCE_EVENT_TYPE_CQ_ACCESS_ERROR: case HNS_ROCE_EVENT_TYPE_CQ_OVERFLOW: - hns_roce_v2_cq_err_handle(hr_dev, aeqe, event_type, - cqn); + hns_roce_cq_event(hr_dev, cqn, event_type); break; case HNS_ROCE_EVENT_TYPE_DB_OVERFLOW: - dev_warn(dev, "DB overflow.\n"); break; case HNS_ROCE_EVENT_TYPE_MB: hns_roce_cmd_event(hr_dev, @@ -4284,10 +4250,8 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, le64_to_cpu(aeqe->event.cmd.out_param)); break; case HNS_ROCE_EVENT_TYPE_CEQ_OVERFLOW: - dev_warn(dev, "CEQ overflow.\n"); break; case HNS_ROCE_EVENT_TYPE_FLR: - dev_warn(dev, "Function level reset.\n"); break; default: dev_err(dev, "Unhandled event %d on EQ %d at idx %u.\n", @@ -4304,7 +4268,7 @@ static int hns_roce_v2_aeq_int(struct hns_roce_dev *hr_dev, dev_warn(dev, "cons_index overflow, set back to 0.\n"); eq->cons_index = 0; } - hns_roce_v2_init_irq_work(hr_dev, eq, qpn); + hns_roce_v2_init_irq_work(hr_dev, eq, qpn, cqn); } set_eq_cons_index_v2(eq); From 5f9794dc94f59ad1eb821724a8ae1f8e803ea188 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:43:08 +0300 Subject: [PATCH 095/242] RDMA/ucontext: Add a core API for mmaping driver IO memory To support disassociation and PCI hot unplug, we have to track all the VMAs that refer to the device IO memory. When disassociation occurs the VMAs have to be revised to point to the zero page, not the IO memory, to allow the physical HW to be unplugged. 
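In sketch form, revising one VMA amounts to (simplified; the complete
flow, including locking, is in uverbs_user_mmap_disassociate() below):

	zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
	vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE);

after which faults on the range are served from the zero page rather
than the vanished BAR.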
The three drivers supporting this implemented three different versions of this algorithm, all leaving something to be desired. This new common implementation has a few differences from the driver versions: - Track all VMAs, including splitting/truncating/etc. Tie the lifetime of the private data allocation to the lifetime of the vma. This avoids any tricks with setting vm_ops which Linus didn't like. (see link) - Support multiple mms, and support properly tracking mmaps triggered by processes other than the one first opening the uverbs fd. This makes fork behavior of disassociation enabled drivers the same as fork support in normal drivers. - Don't use crazy get_task stuff. - Simplify the approach for to racing between vm_ops close and disassociation, fixing the related bugs most of the driver implementations had. Since we are in core code the tracking list can be placed in struct ib_uverbs_ufile, which has a lifetime strictly longer than any VMAs created by mmap on the uverbs FD. Link: https://www.spinics.net/lists/stable/msg248747.html Link: https://lkml.kernel.org/r/CA+55aFxJTV_g46AQPoPXen-UPiqR1HGMZictt7VpC-SMFbm3Cw@mail.gmail.com Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/rdma_core.c | 4 +- drivers/infiniband/core/rdma_core.h | 1 + drivers/infiniband/core/uverbs.h | 3 + drivers/infiniband/core/uverbs_main.c | 223 ++++++++++++++++++++++++++ include/rdma/ib_verbs.h | 22 +++ 5 files changed, 252 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index c4118bcd5103..06d31fe56677 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -842,8 +842,10 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, struct ib_ucontext *ucontext = ufile->ucontext; int ret; - if (reason == RDMA_REMOVE_DRIVER_REMOVE) + if (reason == RDMA_REMOVE_DRIVER_REMOVE) { + uverbs_user_mmap_disassociate(ufile); ufile_disassociate_ucontext(ucontext); + } put_pid(ucontext->tgid); ib_rdmacg_uncharge(&ucontext->cg_obj, ucontext->device, diff --git a/drivers/infiniband/core/rdma_core.h b/drivers/infiniband/core/rdma_core.h index f962f2a593ba..4886d2bba7c7 100644 --- a/drivers/infiniband/core/rdma_core.h +++ b/drivers/infiniband/core/rdma_core.h @@ -160,5 +160,6 @@ void uverbs_disassociate_api(struct uverbs_api *uapi); void uverbs_destroy_api(struct uverbs_api *uapi); void uapi_compute_bundle_size(struct uverbs_api_ioctl_method *method_elm, unsigned int num_attrs); +void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile); #endif /* RDMA_CORE_H */ diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 24369eb66c67..c97935a0c7c6 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -158,6 +158,9 @@ struct ib_uverbs_file { spinlock_t uobjects_lock; struct list_head uobjects; + struct mutex umap_lock; + struct list_head umaps; + u64 uverbs_cmd_mask; u64 uverbs_ex_cmd_mask; diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index db6de9157668..8d56773aac56 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -45,6 +45,7 @@ #include #include #include +#include #include @@ -811,6 +812,226 @@ out: return ret; } +/* + * Each time we map IO memory into user space this keeps track of the mapping. 
+ * When the device is hot-unplugged we 'zap' the mmaps in user space to point + * to the zero page and allow the hot unplug to proceed. + * + * This is necessary for cases like PCI physical hot unplug as the actual BAR + * memory may vanish after this and access to it from userspace could MCE. + * + * RDMA drivers supporting disassociation must have their user space designed + * to cope in some way with their IO pages going to the zero page. + */ +struct rdma_umap_priv { + struct vm_area_struct *vma; + struct list_head list; +}; + +static const struct vm_operations_struct rdma_umap_ops; + +static void rdma_umap_priv_init(struct rdma_umap_priv *priv, + struct vm_area_struct *vma) +{ + struct ib_uverbs_file *ufile = vma->vm_file->private_data; + + priv->vma = vma; + vma->vm_private_data = priv; + vma->vm_ops = &rdma_umap_ops; + + mutex_lock(&ufile->umap_lock); + list_add(&priv->list, &ufile->umaps); + mutex_unlock(&ufile->umap_lock); +} + +/* + * The VMA has been dup'd, initialize the vm_private_data with a new tracking + * struct + */ +static void rdma_umap_open(struct vm_area_struct *vma) +{ + struct ib_uverbs_file *ufile = vma->vm_file->private_data; + struct rdma_umap_priv *opriv = vma->vm_private_data; + struct rdma_umap_priv *priv; + + if (!opriv) + return; + + /* We are racing with disassociation */ + if (!down_read_trylock(&ufile->hw_destroy_rwsem)) + goto out_zap; + /* + * Disassociation already completed, the VMA should already be zapped. + */ + if (!ufile->ucontext) + goto out_unlock; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + goto out_unlock; + rdma_umap_priv_init(priv, vma); + + up_read(&ufile->hw_destroy_rwsem); + return; + +out_unlock: + up_read(&ufile->hw_destroy_rwsem); +out_zap: + /* + * We can't allow the VMA to be created with the actual IO pages, that + * would break our API contract, and it can't be stopped at this + * point, so zap it. + */ + vma->vm_private_data = NULL; + zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start); +} + +static void rdma_umap_close(struct vm_area_struct *vma) +{ + struct ib_uverbs_file *ufile = vma->vm_file->private_data; + struct rdma_umap_priv *priv = vma->vm_private_data; + + if (!priv) + return; + + /* + * The vma holds a reference on the struct file that created it, which + * in turn means that the ib_uverbs_file is guaranteed to exist at + * this point. + */ + mutex_lock(&ufile->umap_lock); + list_del(&priv->list); + mutex_unlock(&ufile->umap_lock); + kfree(priv); +} + +static const struct vm_operations_struct rdma_umap_ops = { + .open = rdma_umap_open, + .close = rdma_umap_close, +}; + +static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, + unsigned long size) +{ + struct ib_uverbs_file *ufile = ucontext->ufile; + struct rdma_umap_priv *priv; + + if (vma->vm_end - vma->vm_start != size) + return ERR_PTR(-EINVAL); + + /* Driver is using this wrong, must be called by ib_uverbs_mmap */ + if (WARN_ON(!vma->vm_file || + vma->vm_file->private_data != ufile)) + return ERR_PTR(-EINVAL); + lockdep_assert_held(&ufile->device->disassociate_srcu); + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return ERR_PTR(-ENOMEM); + return priv; +} + +/* + * Map IO memory into a process. This is to be called by drivers as part of + * their mmap() functions if they wish to send something like PCI-E BAR memory + * to userspace. 
+ */ +int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, + unsigned long pfn, unsigned long size, pgprot_t prot) +{ + struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size); + + if (IS_ERR(priv)) + return PTR_ERR(priv); + + vma->vm_page_prot = prot; + if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) { + kfree(priv); + return -EAGAIN; + } + + rdma_umap_priv_init(priv, vma); + return 0; +} +EXPORT_SYMBOL(rdma_user_mmap_io); + +/* + * The page case is here for a slightly different reason, the driver expects + * to be able to free the page it is sharing to user space when it destroys + * its ucontext, which means we need to zap the user space references. + * + * We could handle this differently by providing an API to allocate a shared + * page and then only freeing the shared page when the last ufile is + * destroyed. + */ +int rdma_user_mmap_page(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, struct page *page, + unsigned long size) +{ + struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size); + + if (IS_ERR(priv)) + return PTR_ERR(priv); + + if (remap_pfn_range(vma, vma->vm_start, page_to_pfn(page), size, + vma->vm_page_prot)) { + kfree(priv); + return -EAGAIN; + } + + rdma_umap_priv_init(priv, vma); + return 0; +} +EXPORT_SYMBOL(rdma_user_mmap_page); + +void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile) +{ + struct rdma_umap_priv *priv, *next_priv; + + lockdep_assert_held(&ufile->hw_destroy_rwsem); + + while (1) { + struct mm_struct *mm = NULL; + + /* Get an arbitrary mm pointer that hasn't been cleaned yet */ + mutex_lock(&ufile->umap_lock); + if (!list_empty(&ufile->umaps)) { + mm = list_first_entry(&ufile->umaps, + struct rdma_umap_priv, list) + ->vma->vm_mm; + mmget(mm); + } + mutex_unlock(&ufile->umap_lock); + if (!mm) + return; + + /* + * The umap_lock is nested under mmap_sem since it used within + * the vma_ops callbacks, so we have to clean the list one mm + * at a time to get the lock ordering right. Typically there + * will only be one mm, so no big deal. 
+ */ + down_write(&mm->mmap_sem); + mutex_lock(&ufile->umap_lock); + list_for_each_entry_safe (priv, next_priv, &ufile->umaps, + list) { + struct vm_area_struct *vma = priv->vma; + + if (vma->vm_mm != mm) + continue; + list_del_init(&priv->list); + + zap_vma_ptes(vma, vma->vm_start, + vma->vm_end - vma->vm_start); + vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); + } + mutex_unlock(&ufile->umap_lock); + up_write(&mm->mmap_sem); + mmput(mm); + } +} + /* * ib_uverbs_open() does not need the BKL: * @@ -872,6 +1093,8 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp) spin_lock_init(&file->uobjects_lock); INIT_LIST_HEAD(&file->uobjects); init_rwsem(&file->hw_destroy_rwsem); + mutex_init(&file->umap_lock); + INIT_LIST_HEAD(&file->umaps); filp->private_data = file; list_add_tail(&file->list, &dev->uverbs_file_list); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index e463d3007a35..a66238d8a2a3 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2646,6 +2646,28 @@ void *ib_get_client_data(struct ib_device *device, struct ib_client *client); void ib_set_client_data(struct ib_device *device, struct ib_client *client, void *data); +#if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) +int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma, + unsigned long pfn, unsigned long size, pgprot_t prot); +int rdma_user_mmap_page(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, struct page *page, + unsigned long size); +#else +static inline int rdma_user_mmap_io(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, + unsigned long pfn, unsigned long size, + pgprot_t prot) +{ + return -EINVAL; +} +static inline int rdma_user_mmap_page(struct ib_ucontext *ucontext, + struct vm_area_struct *vma, struct page *page, + unsigned long size) +{ + return -EINVAL; +} +#endif + static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len) { return copy_from_user(dest, udata->inbuf, len) ? -EFAULT : 0; From c282da4109e4a1238432f2cab22af6e3562f4b47 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:43:09 +0300 Subject: [PATCH 096/242] RDMA/mlx4: Use rdma_user_mmap_io Rely on the new core code helper to map BAR memory from the driver. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/main.c | 142 +++++---------------------- drivers/infiniband/hw/mlx4/mlx4_ib.h | 5 - 2 files changed, 24 insertions(+), 123 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index ca0f1ee26091..bf3cdb88aaf5 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -1138,144 +1138,50 @@ static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) return 0; } -static void mlx4_ib_vma_open(struct vm_area_struct *area) -{ - /* vma_open is called when a new VMA is created on top of our VMA. - * This is done through either mremap flow or split_vma (usually due - * to mlock, madvise, munmap, etc.). We do not support a clone of the - * vma, as this VMA is strongly hardware related. Therefore we set the - * vm_ops of the newly created/cloned VMA to NULL, to prevent it from - * calling us again and trying to do incorrect actions. We assume that - * the original vma size is exactly a single page that there will be no - * "splitting" operations on. 
- */ - area->vm_ops = NULL; -} - -static void mlx4_ib_vma_close(struct vm_area_struct *area) -{ - struct mlx4_ib_vma_private_data *mlx4_ib_vma_priv_data; - - /* It's guaranteed that all VMAs opened on a FD are closed before the - * file itself is closed, therefore no sync is needed with the regular - * closing flow. (e.g. mlx4_ib_dealloc_ucontext) However need a sync - * with accessing the vma as part of mlx4_ib_disassociate_ucontext. - * The close operation is usually called under mm->mmap_sem except when - * process is exiting. The exiting case is handled explicitly as part - * of mlx4_ib_disassociate_ucontext. - */ - mlx4_ib_vma_priv_data = (struct mlx4_ib_vma_private_data *) - area->vm_private_data; - - /* set the vma context pointer to null in the mlx4_ib driver's private - * data to protect against a race condition in mlx4_ib_dissassociate_ucontext(). - */ - mlx4_ib_vma_priv_data->vma = NULL; -} - -static const struct vm_operations_struct mlx4_ib_vm_ops = { - .open = mlx4_ib_vma_open, - .close = mlx4_ib_vma_close -}; - static void mlx4_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) { - int i; - struct vm_area_struct *vma; - struct mlx4_ib_ucontext *context = to_mucontext(ibcontext); - - /* need to protect from a race on closing the vma as part of - * mlx4_ib_vma_close(). - */ - for (i = 0; i < HW_BAR_COUNT; i++) { - vma = context->hw_bar_info[i].vma; - if (!vma) - continue; - - zap_vma_ptes(context->hw_bar_info[i].vma, - context->hw_bar_info[i].vma->vm_start, PAGE_SIZE); - - context->hw_bar_info[i].vma->vm_flags &= - ~(VM_SHARED | VM_MAYSHARE); - /* context going to be destroyed, should not access ops any more */ - context->hw_bar_info[i].vma->vm_ops = NULL; - } -} - -static void mlx4_ib_set_vma_data(struct vm_area_struct *vma, - struct mlx4_ib_vma_private_data *vma_private_data) -{ - vma_private_data->vma = vma; - vma->vm_private_data = vma_private_data; - vma->vm_ops = &mlx4_ib_vm_ops; } static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { struct mlx4_ib_dev *dev = to_mdev(context->device); - struct mlx4_ib_ucontext *mucontext = to_mucontext(context); - if (vma->vm_end - vma->vm_start != PAGE_SIZE) - return -EINVAL; + switch (vma->vm_pgoff) { + case 0: + return rdma_user_mmap_io(context, vma, + to_mucontext(context)->uar.pfn, + PAGE_SIZE, + pgprot_noncached(vma->vm_page_prot)); - if (vma->vm_pgoff == 0) { - /* We prevent double mmaping on same context */ - if (mucontext->hw_bar_info[HW_BAR_DB].vma) + case 1: + if (dev->dev->caps.bf_reg_size == 0) return -EINVAL; + return rdma_user_mmap_io( + context, vma, + to_mucontext(context)->uar.pfn + + dev->dev->caps.num_uars, + PAGE_SIZE, pgprot_writecombine(vma->vm_page_prot)); - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - if (io_remap_pfn_range(vma, vma->vm_start, - to_mucontext(context)->uar.pfn, - PAGE_SIZE, vma->vm_page_prot)) - return -EAGAIN; - - mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_DB]); - - } else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) { - /* We prevent double mmaping on same context */ - if (mucontext->hw_bar_info[HW_BAR_BF].vma) - return -EINVAL; - - vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); - - if (io_remap_pfn_range(vma, vma->vm_start, - to_mucontext(context)->uar.pfn + - dev->dev->caps.num_uars, - PAGE_SIZE, vma->vm_page_prot)) - return -EAGAIN; - - mlx4_ib_set_vma_data(vma, &mucontext->hw_bar_info[HW_BAR_BF]); - - } else if (vma->vm_pgoff == 3) { + case 3: { struct mlx4_clock_params params; int ret; - /* We prevent 
double mmaping on same context */ - if (mucontext->hw_bar_info[HW_BAR_CLOCK].vma) - return -EINVAL; - ret = mlx4_get_internal_clock_params(dev->dev, ¶ms); - if (ret) return ret; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - if (io_remap_pfn_range(vma, vma->vm_start, - (pci_resource_start(dev->dev->persist->pdev, - params.bar) + - params.offset) - >> PAGE_SHIFT, - PAGE_SIZE, vma->vm_page_prot)) - return -EAGAIN; - - mlx4_ib_set_vma_data(vma, - &mucontext->hw_bar_info[HW_BAR_CLOCK]); - } else { - return -EINVAL; + return rdma_user_mmap_io( + context, vma, + (pci_resource_start(dev->dev->persist->pdev, + params.bar) + + params.offset) >> + PAGE_SHIFT, + PAGE_SIZE, pgprot_noncached(vma->vm_page_prot)); } - return 0; + default: + return -EINVAL; + } } static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev, diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index e10dccc7958f..8850dfc3826d 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -80,16 +80,11 @@ enum hw_bar_type { HW_BAR_COUNT }; -struct mlx4_ib_vma_private_data { - struct vm_area_struct *vma; -}; - struct mlx4_ib_ucontext { struct ib_ucontext ibucontext; struct mlx4_uar uar; struct list_head db_page_list; struct mutex db_page_mutex; - struct mlx4_ib_vma_private_data hw_bar_info[HW_BAR_COUNT]; struct list_head wqn_ranges_list; struct mutex wqn_ranges_mutex; /* protect wqn_ranges_list */ }; From e2cd1d1ad204664f9ce78cb61c9307c149051f8e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:43:10 +0300 Subject: [PATCH 097/242] RDMA/mlx5: Use rdma_user_mmap_io Rely on the new core code helper to map BAR memory from the driver. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 122 ++------------------------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 10 --- 2 files changed, 7 insertions(+), 125 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 2be6a4377558..aeb328100986 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1749,8 +1749,6 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, goto out_mdev; } - INIT_LIST_HEAD(&context->vma_private_list); - mutex_init(&context->vma_private_list_mutex); INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); @@ -1908,94 +1906,9 @@ static int get_extended_index(unsigned long offset) return get_arg(offset) | ((offset >> 16) & 0xff) << 8; } -static void mlx5_ib_vma_open(struct vm_area_struct *area) -{ - /* vma_open is called when a new VMA is created on top of our VMA. This - * is done through either mremap flow or split_vma (usually due to - * mlock, madvise, munmap, etc.) We do not support a clone of the VMA, - * as this VMA is strongly hardware related. Therefore we set the - * vm_ops of the newly created/cloned VMA to NULL, to prevent it from - * calling us again and trying to do incorrect actions. We assume that - * the original VMA size is exactly a single page, and therefore all - * "splitting" operation will not happen to it. - */ - area->vm_ops = NULL; -} - -static void mlx5_ib_vma_close(struct vm_area_struct *area) -{ - struct mlx5_ib_vma_private_data *mlx5_ib_vma_priv_data; - - /* It's guaranteed that all VMAs opened on a FD are closed before the - * file itself is closed, therefore no sync is needed with the regular - * closing flow. (e.g. 
mlx5_ib_dealloc_ucontext)
- * However need a sync with accessing the vma as part of
- * mlx5_ib_disassociate_ucontext.
- * The close operation is usually called under mm->mmap_sem except when
- * process is exiting.
- * The exiting case is handled explicitly as part of
- * mlx5_ib_disassociate_ucontext.
- */
-	mlx5_ib_vma_priv_data = (struct mlx5_ib_vma_private_data *)area->vm_private_data;
-
-	/* setting the vma context pointer to null in the mlx5_ib driver's
-	 * private data, to protect a race condition in
-	 * mlx5_ib_disassociate_ucontext().
-	 */
-	mlx5_ib_vma_priv_data->vma = NULL;
-	mutex_lock(mlx5_ib_vma_priv_data->vma_private_list_mutex);
-	list_del(&mlx5_ib_vma_priv_data->list);
-	mutex_unlock(mlx5_ib_vma_priv_data->vma_private_list_mutex);
-	kfree(mlx5_ib_vma_priv_data);
-}
-
-static const struct vm_operations_struct mlx5_ib_vm_ops = {
-	.open = mlx5_ib_vma_open,
-	.close = mlx5_ib_vma_close
-};
-
-static int mlx5_ib_set_vma_data(struct vm_area_struct *vma,
-				struct mlx5_ib_ucontext *ctx)
-{
-	struct mlx5_ib_vma_private_data *vma_prv;
-	struct list_head *vma_head = &ctx->vma_private_list;
-
-	vma_prv = kzalloc(sizeof(*vma_prv), GFP_KERNEL);
-	if (!vma_prv)
-		return -ENOMEM;
-
-	vma_prv->vma = vma;
-	vma_prv->vma_private_list_mutex = &ctx->vma_private_list_mutex;
-	vma->vm_private_data = vma_prv;
-	vma->vm_ops = &mlx5_ib_vm_ops;
-
-	mutex_lock(&ctx->vma_private_list_mutex);
-	list_add(&vma_prv->list, vma_head);
-	mutex_unlock(&ctx->vma_private_list_mutex);
-
-	return 0;
-}

 static void mlx5_ib_disassociate_ucontext(struct ib_ucontext *ibcontext)
 {
-	struct vm_area_struct *vma;
-	struct mlx5_ib_vma_private_data *vma_private, *n;
-	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
-
-	mutex_lock(&context->vma_private_list_mutex);
-	list_for_each_entry_safe(vma_private, n, &context->vma_private_list,
-				 list) {
-		vma = vma_private->vma;
-		zap_vma_ptes(vma, vma->vm_start, PAGE_SIZE);
-		/* context going to be destroyed, should
-		 * not access ops any more.
- */ - vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); - vma->vm_ops = NULL; - list_del(&vma_private->list); - kfree(vma_private); - } - mutex_unlock(&context->vma_private_list_mutex); } static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd) @@ -2018,9 +1931,6 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, struct vm_area_struct *vma, struct mlx5_ib_ucontext *context) { - phys_addr_t pfn; - int err; - if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; @@ -2033,13 +1943,8 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev, if (!dev->mdev->clock_info_page) return -EOPNOTSUPP; - pfn = page_to_pfn(dev->mdev->clock_info_page); - err = remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, - vma->vm_page_prot); - if (err) - return err; - - return mlx5_ib_set_vma_data(vma, context); + return rdma_user_mmap_page(&context->ibucontext, vma, + dev->mdev->clock_info_page, PAGE_SIZE); } static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, @@ -2129,21 +2034,15 @@ static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd, pfn = uar_index2pfn(dev, uar_index); mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn); - vma->vm_page_prot = prot; - err = io_remap_pfn_range(vma, vma->vm_start, pfn, - PAGE_SIZE, vma->vm_page_prot); + err = rdma_user_mmap_io(&context->ibucontext, vma, pfn, PAGE_SIZE, + prot); if (err) { mlx5_ib_err(dev, - "io_remap_pfn_range failed with error=%d, mmap_cmd=%s\n", + "rdma_user_mmap_io failed with error=%d, mmap_cmd=%s\n", err, mmap_cmd2str(cmd)); - err = -EAGAIN; goto err; } - err = mlx5_ib_set_vma_data(vma, context); - if (err) - goto err; - if (dyn_uar) bfregi->sys_pages[idx] = uar_index; return 0; @@ -2168,7 +2067,6 @@ static int dm_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) size_t map_size = vma->vm_end - vma->vm_start; u32 npages = map_size >> PAGE_SHIFT; phys_addr_t pfn; - pgprot_t prot; if (find_next_zero_bit(mctx->dm_pages, page_idx + npages, page_idx) != page_idx + npages) @@ -2178,14 +2076,8 @@ static int dm_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) MLX5_CAP64_DEV_MEM(dev->mdev, memic_bar_start_addr)) >> PAGE_SHIFT) + page_idx; - prot = pgprot_writecombine(vma->vm_page_prot); - vma->vm_page_prot = prot; - - if (io_remap_pfn_range(vma, vma->vm_start, pfn, map_size, - vma->vm_page_prot)) - return -EAGAIN; - - return mlx5_ib_set_vma_data(vma, mctx); + return rdma_user_mmap_io(context, vma, pfn, map_size, + pgprot_writecombine(vma->vm_page_prot)); } static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 6c57872fdc4e..81154b598266 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -116,13 +116,6 @@ enum { MLX5_MEMIC_BASE_SIZE = 1 << MLX5_MEMIC_BASE_ALIGN, }; -struct mlx5_ib_vma_private_data { - struct list_head list; - struct vm_area_struct *vma; - /* protect vma_private_list add/del */ - struct mutex *vma_private_list_mutex; -}; - struct mlx5_ib_ucontext { struct ib_ucontext ibucontext; struct list_head db_page_list; @@ -134,9 +127,6 @@ struct mlx5_ib_ucontext { u8 cqe_version; /* Transport Domain number */ u32 tdn; - struct list_head vma_private_list; - /* protect vma_private_list add/del */ - struct mutex vma_private_list_mutex; u64 lib_caps; DECLARE_BITMAP(dm_pages, MLX5_MAX_MEMIC_PAGES); From 6745d356ab393b9cd1e54370054a9899c92f42af Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 
16 Sep 2018 20:43:11 +0300 Subject: [PATCH 098/242] RDMA/hns: Use rdma_user_mmap_io Rely on the new core code helper to map BAR memory from the driver. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/hns/hns_roce_device.h | 8 -- drivers/infiniband/hw/hns/hns_roce_main.c | 101 ++++---------------- 2 files changed, 21 insertions(+), 88 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index cfb88a41b28f..d443e26b67d1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -219,19 +219,11 @@ struct hns_roce_uar { unsigned long logic_idx; }; -struct hns_roce_vma_data { - struct list_head list; - struct vm_area_struct *vma; - struct mutex *vma_list_mutex; -}; - struct hns_roce_ucontext { struct ib_ucontext ibucontext; struct hns_roce_uar uar; struct list_head page_list; struct mutex page_mutex; - struct list_head vma_list; - struct mutex vma_list_mutex; }; struct hns_roce_pd { diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index c5cae9a38c04..6edb547baee8 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -344,8 +344,6 @@ static struct ib_ucontext *hns_roce_alloc_ucontext(struct ib_device *ib_dev, if (ret) goto error_fail_uar_alloc; - INIT_LIST_HEAD(&context->vma_list); - mutex_init(&context->vma_list_mutex); if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) { INIT_LIST_HEAD(&context->page_list); mutex_init(&context->page_mutex); @@ -376,76 +374,34 @@ static int hns_roce_dealloc_ucontext(struct ib_ucontext *ibcontext) return 0; } -static void hns_roce_vma_open(struct vm_area_struct *vma) -{ - vma->vm_ops = NULL; -} - -static void hns_roce_vma_close(struct vm_area_struct *vma) -{ - struct hns_roce_vma_data *vma_data; - - vma_data = (struct hns_roce_vma_data *)vma->vm_private_data; - vma_data->vma = NULL; - mutex_lock(vma_data->vma_list_mutex); - list_del(&vma_data->list); - mutex_unlock(vma_data->vma_list_mutex); - kfree(vma_data); -} - -static const struct vm_operations_struct hns_roce_vm_ops = { - .open = hns_roce_vma_open, - .close = hns_roce_vma_close, -}; - -static int hns_roce_set_vma_data(struct vm_area_struct *vma, - struct hns_roce_ucontext *context) -{ - struct list_head *vma_head = &context->vma_list; - struct hns_roce_vma_data *vma_data; - - vma_data = kzalloc(sizeof(*vma_data), GFP_KERNEL); - if (!vma_data) - return -ENOMEM; - - vma_data->vma = vma; - vma_data->vma_list_mutex = &context->vma_list_mutex; - vma->vm_private_data = vma_data; - vma->vm_ops = &hns_roce_vm_ops; - - mutex_lock(&context->vma_list_mutex); - list_add(&vma_data->list, vma_head); - mutex_unlock(&context->vma_list_mutex); - - return 0; -} - static int hns_roce_mmap(struct ib_ucontext *context, struct vm_area_struct *vma) { struct hns_roce_dev *hr_dev = to_hr_dev(context->device); - if (((vma->vm_end - vma->vm_start) % PAGE_SIZE) != 0) - return -EINVAL; + switch (vma->vm_pgoff) { + case 0: + return rdma_user_mmap_io(context, vma, + to_hr_ucontext(context)->uar.pfn, + PAGE_SIZE, + pgprot_noncached(vma->vm_page_prot)); - if (vma->vm_pgoff == 0) { - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - if (io_remap_pfn_range(vma, vma->vm_start, - to_hr_ucontext(context)->uar.pfn, - PAGE_SIZE, vma->vm_page_prot)) - return -EAGAIN; - } else if (vma->vm_pgoff == 1 && hr_dev->tptr_dma_addr && - hr_dev->tptr_size) { - /* 
vm_pgoff: 1 -- TPTR */ - if (io_remap_pfn_range(vma, vma->vm_start, - hr_dev->tptr_dma_addr >> PAGE_SHIFT, - hr_dev->tptr_size, - vma->vm_page_prot)) - return -EAGAIN; - } else - return -EINVAL; + /* vm_pgoff: 1 -- TPTR */ + case 1: + if (!hr_dev->tptr_dma_addr || !hr_dev->tptr_size) + return -EINVAL; + /* + * FIXME: using io_remap_pfn_range on the dma address returned + * by dma_alloc_coherent is totally wrong. + */ + return rdma_user_mmap_io(context, vma, + hr_dev->tptr_dma_addr >> PAGE_SHIFT, + hr_dev->tptr_size, + vma->vm_page_prot); - return hns_roce_set_vma_data(vma, to_hr_ucontext(context)); + default: + return -EINVAL; + } } static int hns_roce_port_immutable(struct ib_device *ib_dev, u8 port_num, @@ -471,21 +427,6 @@ static int hns_roce_port_immutable(struct ib_device *ib_dev, u8 port_num, static void hns_roce_disassociate_ucontext(struct ib_ucontext *ibcontext) { - struct hns_roce_ucontext *context = to_hr_ucontext(ibcontext); - struct hns_roce_vma_data *vma_data, *n; - struct vm_area_struct *vma; - - mutex_lock(&context->vma_list_mutex); - list_for_each_entry_safe(vma_data, n, &context->vma_list, list) { - vma = vma_data->vma; - zap_vma_ptes(vma, vma->vm_start, PAGE_SIZE); - - vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE); - vma->vm_ops = NULL; - list_del(&vma_data->list); - kfree(vma_data); - } - mutex_unlock(&context->vma_list_mutex); } static void hns_roce_unregister_device(struct hns_roce_dev *hr_dev) From ce92db1ca84de2ebc5be7a81a68f2e220799fcf5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:43:12 +0300 Subject: [PATCH 099/242] RDMA/ucontext: Get rid of the old disassociate flow The disassociate_ucontext function in every driver is now empty, so we don't need this ugly and wrong code that was messing with tgids. rdma_user_mmap_io does this same work in a better way. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/rdma_core.c | 51 ++++++----------------------- 1 file changed, 10 insertions(+), 41 deletions(-) diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 06d31fe56677..6a3acf4bf78a 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -794,44 +794,6 @@ void uverbs_close_fd(struct file *f) uverbs_uobject_put(uobj); } -static void ufile_disassociate_ucontext(struct ib_ucontext *ibcontext) -{ - struct ib_device *ib_dev = ibcontext->device; - struct task_struct *owning_process = NULL; - struct mm_struct *owning_mm = NULL; - - owning_process = get_pid_task(ibcontext->tgid, PIDTYPE_PID); - if (!owning_process) - return; - - owning_mm = get_task_mm(owning_process); - if (!owning_mm) { - pr_info("no mm, disassociate ucontext is pending task termination\n"); - while (1) { - put_task_struct(owning_process); - usleep_range(1000, 2000); - owning_process = get_pid_task(ibcontext->tgid, - PIDTYPE_PID); - if (!owning_process || - owning_process->state == TASK_DEAD) { - pr_info("disassociate ucontext done, task was terminated\n"); - /* in case task was dead need to release the - * task struct. 
- */
-				if (owning_process)
-					put_task_struct(owning_process);
-				return;
-			}
-		}
-	}
-
-	down_write(&owning_mm->mmap_sem);
-	ib_dev->disassociate_ucontext(ibcontext);
-	up_write(&owning_mm->mmap_sem);
-	mmput(owning_mm);
-	put_task_struct(owning_process);
-}
-
 /*
  * Drop the ucontext off the ufile and completely disconnect it from the
  * ib_device
@@ -840,22 +802,29 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile,
 				   enum rdma_remove_reason reason)
 {
 	struct ib_ucontext *ucontext = ufile->ucontext;
+	struct ib_device *ib_dev = ucontext->device;
 	int ret;

+	/*
+	 * If we are closing the FD then the user mmap VMAs must have
+	 * already been destroyed as they hold on to the filep, otherwise
+	 * they need to be zap'd.
+	 */
 	if (reason == RDMA_REMOVE_DRIVER_REMOVE) {
 		uverbs_user_mmap_disassociate(ufile);
-		ufile_disassociate_ucontext(ucontext);
+		if (ib_dev->disassociate_ucontext)
+			ib_dev->disassociate_ucontext(ucontext);
 	}

 	put_pid(ucontext->tgid);
-	ib_rdmacg_uncharge(&ucontext->cg_obj, ucontext->device,
+	ib_rdmacg_uncharge(&ucontext->cg_obj, ib_dev,
 			   RDMACG_RESOURCE_HCA_HANDLE);

 	/*
 	 * FIXME: Drivers are not permitted to fail dealloc_ucontext, remove
 	 * the error return.
 	 */
-	ret = ucontext->device->dealloc_ucontext(ucontext);
+	ret = ib_dev->dealloc_ucontext(ucontext);
 	WARN_ON(ret);

 	ufile->ucontext = NULL;

From d4b4dd1b9706e48c370f88d3adfe713e43423cc9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:44:45 +0300 Subject: [PATCH 100/242] RDMA/umem: Do not use current->tgid to track the mm_struct

This is just wrong: the process that calls into reg_mr is the process associated with the umem, and that does not have to be the same process that created the context. When this code was first written, mmgrab() didn't exist; these days we can directly hold the mm_struct pointer in the umem and have no ambiguity, when releasing the umem, as to which mm it was associated with.
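
A minimal sketch of the resulting ownership pattern, with illustrative sketch_* names (only mmgrab()/mmdrop() and the mmap_sem/pinned_vm fields are real kernel APIs here):

#include <linux/sched.h>	/* current */
#include <linux/sched/mm.h>	/* mmgrab(), mmdrop() */
#include <linux/mm_types.h>

struct sketch_umem {
	struct mm_struct *owning_mm;
	unsigned long npages;
};

/* registration: remember which mm the pages were pinned under */
static void sketch_umem_grab_mm(struct sketch_umem *umem)
{
	umem->owning_mm = current->mm;
	mmgrab(umem->owning_mm);	/* pin the mm_struct itself */
}

/* release: account against the stored mm, no tgid lookup needed */
static void sketch_umem_drop_mm(struct sketch_umem *umem)
{
	down_write(&umem->owning_mm->mmap_sem);
	umem->owning_mm->pinned_vm -= umem->npages;
	up_write(&umem->owning_mm->mmap_sem);
	mmdrop(umem->owning_mm);
}

Note that mmgrab()/mmdrop() keep only the mm_struct allocation alive, not the address space itself (that is mmget()/mmput()), which is enough for the pinned_vm bookkeeping even if the owning process has already exited.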
Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem.c | 77 ++++++++++++++++------------------ include/rdma/ib_umem.h | 3 +- 2 files changed, 37 insertions(+), 43 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index a41792dbae1f..c32a3e27a896 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -86,6 +86,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, struct vm_area_struct **vma_list; unsigned long lock_limit; unsigned long cur_base; + struct mm_struct *mm; unsigned long npages; int ret; int i; @@ -124,6 +125,8 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, return umem; } + umem->owning_mm = mm = current->mm; + mmgrab(mm); umem->odp_data = NULL; /* We assume the memory is from hugetlb until proved otherwise */ @@ -132,7 +135,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) { ret = -ENOMEM; - goto umem_kfree; + goto umem_kfree_drop; } /* @@ -147,14 +150,14 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - down_write(¤t->mm->mmap_sem); - current->mm->pinned_vm += npages; - if ((current->mm->pinned_vm > lock_limit) && !capable(CAP_IPC_LOCK)) { - up_write(¤t->mm->mmap_sem); + down_write(&mm->mmap_sem); + mm->pinned_vm += npages; + if ((mm->pinned_vm > lock_limit) && !capable(CAP_IPC_LOCK)) { + up_write(&mm->mmap_sem); ret = -ENOMEM; goto vma; } - up_write(¤t->mm->mmap_sem); + up_write(&mm->mmap_sem); cur_base = addr & PAGE_MASK; @@ -172,14 +175,14 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, sg_list_start = umem->sg_head.sgl; - down_read(¤t->mm->mmap_sem); + down_read(&mm->mmap_sem); while (npages) { ret = get_user_pages_longterm(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), gup_flags, page_list, vma_list); if (ret < 0) { - up_read(¤t->mm->mmap_sem); + up_read(&mm->mmap_sem); goto umem_release; } @@ -197,7 +200,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, /* preparing for next loop */ sg_list_start = sg; } - up_read(¤t->mm->mmap_sem); + up_read(&mm->mmap_sem); umem->nmap = ib_dma_map_sg_attrs(context->device, umem->sg_head.sgl, @@ -223,6 +226,9 @@ out: if (vma_list) free_page((unsigned long) vma_list); free_page((unsigned long) page_list); +umem_kfree_drop: + if (ret) + mmdrop(umem->owning_mm); umem_kfree: if (ret) kfree(umem); @@ -230,15 +236,21 @@ umem_kfree: } EXPORT_SYMBOL(ib_umem_get); -static void ib_umem_account(struct work_struct *work) +static void __ib_umem_release_tail(struct ib_umem *umem) +{ + mmdrop(umem->owning_mm); + kfree(umem); +} + +static void ib_umem_release_defer(struct work_struct *work) { struct ib_umem *umem = container_of(work, struct ib_umem, work); - down_write(&umem->mm->mmap_sem); - umem->mm->pinned_vm -= umem->diff; - up_write(&umem->mm->mmap_sem); - mmput(umem->mm); - kfree(umem); + down_write(&umem->owning_mm->mmap_sem); + umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem); + up_write(&umem->owning_mm->mmap_sem); + + __ib_umem_release_tail(umem); } /** @@ -248,9 +260,6 @@ static void ib_umem_account(struct work_struct *work) void ib_umem_release(struct ib_umem *umem) { struct ib_ucontext *context = umem->context; - struct mm_struct *mm; - struct task_struct *task; 
-	unsigned long diff;

 	if (umem->odp_data) {
 		ib_umem_odp_release(umem);
@@ -259,41 +268,27 @@ void ib_umem_release(struct ib_umem *umem)

 	__ib_umem_release(umem->context->device, umem, 1);

-	task = get_pid_task(umem->context->tgid, PIDTYPE_PID);
-	if (!task)
-		goto out;
-	mm = get_task_mm(task);
-	put_task_struct(task);
-	if (!mm)
-		goto out;
-
-	diff = ib_umem_num_pages(umem);
-
 	/*
 	 * We may be called with the mm's mmap_sem already held. This
 	 * can happen when a userspace munmap() is the call that drops
 	 * the last reference to our file and calls our release
 	 * method. If there are memory regions to destroy, we'll end
 	 * up here and not be able to take the mmap_sem. In that case
-	 * we defer the vm_locked accounting to the system workqueue.
+	 * we defer the vm_locked accounting to a workqueue.
 	 */
 	if (context->closing) {
-		if (!down_write_trylock(&mm->mmap_sem)) {
-			INIT_WORK(&umem->work, ib_umem_account);
-			umem->mm = mm;
-			umem->diff = diff;
-
+		if (!down_write_trylock(&umem->owning_mm->mmap_sem)) {
+			INIT_WORK(&umem->work, ib_umem_release_defer);
 			queue_work(ib_wq, &umem->work);
 			return;
 		}
-	} else
-		down_write(&mm->mmap_sem);
+	} else {
+		down_write(&umem->owning_mm->mmap_sem);
+	}
+	umem->owning_mm->pinned_vm -= ib_umem_num_pages(umem);
+	up_write(&umem->owning_mm->mmap_sem);

-	mm->pinned_vm -= diff;
-	up_write(&mm->mmap_sem);
-	mmput(mm);
-out:
-	kfree(umem);
+	__ib_umem_release_tail(umem);
 }
 EXPORT_SYMBOL(ib_umem_release);
diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index a1fd63871d17..e1c00b2ead19 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h
@@ -42,14 +42,13 @@ struct ib_umem_odp;
 struct ib_umem {
 	struct ib_ucontext *context;
+	struct mm_struct *owning_mm;
 	size_t length;
 	unsigned long address;
 	int page_shift;
 	int writable;
 	int hugetlb;
 	struct work_struct work;
-	struct mm_struct *mm;
-	unsigned long diff;
 	struct ib_umem_odp *odp_data;
 	struct sg_table sg_head;
 	int nmap;

From ece8ea7bfac053cf27c24e1a767b796a4db2fbd7 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:44:46 +0300 Subject: [PATCH 101/242] RDMA/usnic: Do not use ucontext->tgid

Update this driver to match the code it copies from umem.c, which no longer uses tgid.
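
The release path in both conversions has the same shape: take mmap_sem if it is safe to block, otherwise defer the accounting to a workqueue. A condensed sketch with hypothetical sketch_* names (the closing flag mirrors ucontext->closing):

#include <linux/sched/mm.h>	/* mmdrop() */
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/workqueue.h>

struct sketch_reg {
	struct mm_struct *owning_mm;
	unsigned long npages;
	struct work_struct work;
};

static void sketch_release_defer(struct work_struct *work)
{
	struct sketch_reg *reg = container_of(work, struct sketch_reg, work);

	down_write(&reg->owning_mm->mmap_sem);
	reg->owning_mm->pinned_vm -= reg->npages;
	up_write(&reg->owning_mm->mmap_sem);
	mmdrop(reg->owning_mm);
	kfree(reg);
}

static void sketch_release(struct sketch_reg *reg, bool closing)
{
	/* if we may already hold mmap_sem, don't block on it */
	if (closing && !down_write_trylock(&reg->owning_mm->mmap_sem)) {
		INIT_WORK(&reg->work, sketch_release_defer);
		queue_work(system_wq, &reg->work);
		return;
	}
	if (!closing)
		down_write(&reg->owning_mm->mmap_sem);
	reg->owning_mm->pinned_vm -= reg->npages;
	up_write(&reg->owning_mm->mmap_sem);
	mmdrop(reg->owning_mm);
	kfree(reg);
}

The trylock is what avoids the deadlock described in the comment above: when a munmap() drops the last file reference, the caller already holds mmap_sem for write, so blocking on it would never return.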
Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/usnic/usnic_uiom.c | 93 ++++++++++++------------ drivers/infiniband/hw/usnic/usnic_uiom.h | 3 +- 2 files changed, 47 insertions(+), 49 deletions(-) diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c index 9dd39daa602b..49275a548751 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.c +++ b/drivers/infiniband/hw/usnic/usnic_uiom.c @@ -54,18 +54,6 @@ static struct workqueue_struct *usnic_uiom_wq; ((void *) &((struct usnic_uiom_chunk *) 0)->page_list[1] - \ (void *) &((struct usnic_uiom_chunk *) 0)->page_list[0])) -static void usnic_uiom_reg_account(struct work_struct *work) -{ - struct usnic_uiom_reg *umem = container_of(work, - struct usnic_uiom_reg, work); - - down_write(&umem->mm->mmap_sem); - umem->mm->locked_vm -= umem->diff; - up_write(&umem->mm->mmap_sem); - mmput(umem->mm); - kfree(umem); -} - static int usnic_uiom_dma_fault(struct iommu_domain *domain, struct device *dev, unsigned long iova, int flags, @@ -99,8 +87,9 @@ static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty) } static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, - int dmasync, struct list_head *chunk_list) + int dmasync, struct usnic_uiom_reg *uiomr) { + struct list_head *chunk_list = &uiomr->chunk_list; struct page **page_list; struct scatterlist *sg; struct usnic_uiom_chunk *chunk; @@ -114,6 +103,7 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, int flags; dma_addr_t pa; unsigned int gup_flags; + struct mm_struct *mm; /* * If the combination of the addr and size requested for this memory @@ -136,7 +126,8 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, npages = PAGE_ALIGN(size + (addr & ~PAGE_MASK)) >> PAGE_SHIFT; - down_write(¤t->mm->mmap_sem); + uiomr->owning_mm = mm = current->mm; + down_write(&mm->mmap_sem); locked = npages + current->mm->pinned_vm; lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -196,10 +187,12 @@ static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable, out: if (ret < 0) usnic_uiom_put_pages(chunk_list, 0); - else - current->mm->pinned_vm = locked; + else { + mm->pinned_vm = locked; + mmgrab(uiomr->owning_mm); + } - up_write(¤t->mm->mmap_sem); + up_write(&mm->mmap_sem); free_page((unsigned long) page_list); return ret; } @@ -379,7 +372,7 @@ struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd, uiomr->pd = pd; err = usnic_uiom_get_pages(addr, size, writable, dmasync, - &uiomr->chunk_list); + uiomr); if (err) { usnic_err("Failed get_pages vpn [0x%lx,0x%lx] err %d\n", vpn_start, vpn_last, err); @@ -426,55 +419,61 @@ out_put_intervals: out_put_pages: usnic_uiom_put_pages(&uiomr->chunk_list, 0); spin_unlock(&pd->lock); + mmdrop(uiomr->owning_mm); out_free_uiomr: kfree(uiomr); return ERR_PTR(err); } -void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, - struct ib_ucontext *ucontext) +static void __usnic_uiom_release_tail(struct usnic_uiom_reg *uiomr) { - struct task_struct *task; - struct mm_struct *mm; - unsigned long diff; + mmdrop(uiomr->owning_mm); + kfree(uiomr); +} +static inline size_t usnic_uiom_num_pages(struct usnic_uiom_reg *uiomr) +{ + return PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT; +} + +static void usnic_uiom_release_defer(struct work_struct *work) +{ + struct usnic_uiom_reg *uiomr = + container_of(work, struct usnic_uiom_reg, work); + + 
down_write(&uiomr->owning_mm->mmap_sem);
+	uiomr->owning_mm->pinned_vm -= usnic_uiom_num_pages(uiomr);
+	up_write(&uiomr->owning_mm->mmap_sem);
+
+	__usnic_uiom_release_tail(uiomr);
+}
+
+void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr,
+			    struct ib_ucontext *context)
+{
 	__usnic_uiom_reg_release(uiomr->pd, uiomr, 1);

-	task = get_pid_task(ucontext->tgid, PIDTYPE_PID);
-	if (!task)
-		goto out;
-	mm = get_task_mm(task);
-	put_task_struct(task);
-	if (!mm)
-		goto out;
-
-	diff = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT;
-
 	/*
 	 * We may be called with the mm's mmap_sem already held. This
 	 * can happen when a userspace munmap() is the call that drops
 	 * the last reference to our file and calls our release
 	 * method. If there are memory regions to destroy, we'll end
 	 * up here and not be able to take the mmap_sem. In that case
-	 * we defer the vm_locked accounting to the system workqueue.
+	 * we defer the vm_locked accounting to a workqueue.
 	 */
-	if (ucontext->closing) {
-		if (!down_write_trylock(&mm->mmap_sem)) {
-			INIT_WORK(&uiomr->work, usnic_uiom_reg_account);
-			uiomr->mm = mm;
-			uiomr->diff = diff;
-
+	if (context->closing) {
+		if (!down_write_trylock(&uiomr->owning_mm->mmap_sem)) {
+			INIT_WORK(&uiomr->work, usnic_uiom_release_defer);
 			queue_work(usnic_uiom_wq, &uiomr->work);
 			return;
 		}
-	} else
-		down_write(&mm->mmap_sem);
+	} else {
+		down_write(&uiomr->owning_mm->mmap_sem);
+	}
+	uiomr->owning_mm->pinned_vm -= usnic_uiom_num_pages(uiomr);
+	up_write(&uiomr->owning_mm->mmap_sem);

-	mm->pinned_vm -= diff;
-	up_write(&mm->mmap_sem);
-	mmput(mm);
-out:
-	kfree(uiomr);
+	__usnic_uiom_release_tail(uiomr);
 }

 struct usnic_uiom_pd *usnic_uiom_alloc_pd(void)
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.h b/drivers/infiniband/hw/usnic/usnic_uiom.h index 8c096acff123..b86a9731071b 100644 --- a/drivers/infiniband/hw/usnic/usnic_uiom.h +++ b/drivers/infiniband/hw/usnic/usnic_uiom.h
@@ -71,8 +71,7 @@ struct usnic_uiom_reg {
 	int writable;
 	struct list_head chunk_list;
 	struct work_struct work;
-	struct mm_struct *mm;
-	unsigned long diff;
+	struct mm_struct *owning_mm;
 };

 struct usnic_uiom_chunk {

From b5231b019d76521dd8c59a54c174770ec92c767c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:04 +0300 Subject: [PATCH 102/242] RDMA/umem: Use ib_umem_odp in all function signatures connected to ODP

All of these functions already require the ODP version of the umem struct; make this very clear by having the signature require it. This paves the way to using the container_of() pattern to link umem_odp and umem together.
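
For reference, the container_of() pattern the message points at looks like this in miniature, with illustrative sketch_* types (patch 103 below applies exactly this shape to ib_umem and ib_umem_odp):

#include <linux/kernel.h>	/* container_of() */
#include <linux/types.h>

struct sketch_umem {
	unsigned long address;
	size_t length;
};

/* the ODP variant embeds the base struct instead of pointing at it */
struct sketch_umem_odp {
	struct sketch_umem umem;	/* must stay the embedded member */
	int notifiers_count;
};

static inline struct sketch_umem_odp *
to_sketch_umem_odp(struct sketch_umem *umem)
{
	/* recover the containing struct from the embedded base pointer */
	return container_of(umem, struct sketch_umem_odp, umem);
}

A single kzalloc(sizeof(struct sketch_umem_odp)) then backs both views: &odp->umem can be handed around as the base type, and the inline helper converts back without a second allocation or pointer chase.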
Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem.c | 2 +- drivers/infiniband/core/umem_odp.c | 139 ++++++++++++++------------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- drivers/infiniband/hw/mlx5/mr.c | 3 +- drivers/infiniband/hw/mlx5/odp.c | 54 ++++++----- include/rdma/ib_umem_odp.h | 39 ++++---- include/rdma/ib_verbs.h | 4 +- 7 files changed, 129 insertions(+), 114 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index c32a3e27a896..971d92ddea8f 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -262,7 +262,7 @@ void ib_umem_release(struct ib_umem *umem) struct ib_ucontext *context = umem->context; if (umem->odp_data) { - ib_umem_odp_release(umem); + ib_umem_odp_release(to_ib_umem_odp(umem)); return; } diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 29e34e6a6420..8405e9afd7dc 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -77,41 +77,41 @@ static u64 node_last(struct umem_odp_node *n) INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, node_start, node_last, static, rbt_ib_umem) -static void ib_umem_notifier_start_account(struct ib_umem *item) +static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) { - mutex_lock(&item->odp_data->umem_mutex); + mutex_lock(&umem_odp->umem_mutex); /* Only update private counters for this umem if it has them. * Otherwise skip it. All page faults will be delayed for this umem. */ - if (item->odp_data->mn_counters_active) { - int notifiers_count = item->odp_data->notifiers_count++; + if (umem_odp->mn_counters_active) { + int notifiers_count = umem_odp->notifiers_count++; if (notifiers_count == 0) /* Initialize the completion object for waiting on * notifiers. Since notifier_count is zero, no one * should be waiting right now. */ - reinit_completion(&item->odp_data->notifier_completion); + reinit_completion(&umem_odp->notifier_completion); } - mutex_unlock(&item->odp_data->umem_mutex); + mutex_unlock(&umem_odp->umem_mutex); } -static void ib_umem_notifier_end_account(struct ib_umem *item) +static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) { - mutex_lock(&item->odp_data->umem_mutex); + mutex_lock(&umem_odp->umem_mutex); /* Only update private counters for this umem if it has them. * Otherwise skip it. All page faults will be delayed for this umem. */ - if (item->odp_data->mn_counters_active) { + if (umem_odp->mn_counters_active) { /* * This sequence increase will notify the QP page fault that * the page that is going to be mapped in the spte could have * been freed. */ - ++item->odp_data->notifiers_seq; - if (--item->odp_data->notifiers_count == 0) - complete_all(&item->odp_data->notifier_completion); + ++umem_odp->notifiers_seq; + if (--umem_odp->notifiers_count == 0) + complete_all(&umem_odp->notifier_completion); } - mutex_unlock(&item->odp_data->umem_mutex); + mutex_unlock(&umem_odp->umem_mutex); } /* Account for a new mmu notifier in an ib_ucontext. 
*/ @@ -156,20 +156,23 @@ static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) } } -static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start, - u64 end, void *cookie) { +static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, + u64 start, u64 end, void *cookie) +{ + struct ib_umem *umem = umem_odp->umem; + /* * Increase the number of notifiers running, to * prevent any further fault handling on this MR. */ - ib_umem_notifier_start_account(item); - item->odp_data->dying = 1; + ib_umem_notifier_start_account(umem_odp); + umem_odp->dying = 1; /* Make sure that the fact the umem is dying is out before we release * all pending page faults. */ smp_wmb(); - complete_all(&item->odp_data->notifier_completion); - item->context->invalidate_range(item, ib_umem_start(item), - ib_umem_end(item)); + complete_all(&umem_odp->notifier_completion); + umem->context->invalidate_range(umem_odp, ib_umem_start(umem), + ib_umem_end(umem)); return 0; } @@ -191,20 +194,20 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, up_read(&context->umem_rwsem); } -static int invalidate_page_trampoline(struct ib_umem *item, u64 start, +static int invalidate_page_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { ib_umem_notifier_start_account(item); - item->context->invalidate_range(item, start, start + PAGE_SIZE); + item->umem->context->invalidate_range(item, start, start + PAGE_SIZE); ib_umem_notifier_end_account(item); return 0; } -static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start, - u64 end, void *cookie) +static int invalidate_range_start_trampoline(struct ib_umem_odp *item, + u64 start, u64 end, void *cookie) { ib_umem_notifier_start_account(item); - item->context->invalidate_range(item, start, end); + item->umem->context->invalidate_range(item, start, end); return 0; } @@ -235,7 +238,7 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, return ret; } -static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start, +static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { ib_umem_notifier_end_account(item); @@ -271,9 +274,8 @@ static const struct mmu_notifier_ops ib_umem_notifiers = { .invalidate_range_end = ib_umem_notifier_invalidate_range_end, }; -struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, - unsigned long addr, - size_t size) +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, + unsigned long addr, size_t size) { struct ib_umem *umem; struct ib_umem_odp *odp_data; @@ -326,7 +328,7 @@ struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, umem->odp_data = odp_data; - return umem; + return odp_data; out_page_list: vfree(odp_data->page_list); @@ -462,8 +464,9 @@ out_mm: return ret_val; } -void ib_umem_odp_release(struct ib_umem *umem) +void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { + struct ib_umem *umem = umem_odp->umem; struct ib_ucontext *context = umem->context; /* @@ -472,17 +475,17 @@ void ib_umem_odp_release(struct ib_umem *umem) * It is the driver's responsibility to ensure, before calling us, * that the hardware will not attempt to access the MR any more. 
*/ - ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem), + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); down_write(&context->umem_rwsem); if (likely(ib_umem_start(umem) != ib_umem_end(umem))) - rbt_ib_umem_remove(&umem->odp_data->interval_tree, + rbt_ib_umem_remove(&umem_odp->interval_tree, &context->umem_tree); context->odp_mrs_count--; - if (!umem->odp_data->mn_counters_active) { - list_del(&umem->odp_data->no_private_counters); - complete_all(&umem->odp_data->notifier_completion); + if (!umem_odp->mn_counters_active) { + list_del(&umem_odp->no_private_counters); + complete_all(&umem_odp->notifier_completion); } /* @@ -523,9 +526,9 @@ out_put_task: out: up_read(&context->umem_rwsem); - vfree(umem->odp_data->dma_list); - vfree(umem->odp_data->page_list); - kfree(umem->odp_data); + vfree(umem_odp->dma_list); + vfree(umem_odp->page_list); + kfree(umem_odp); kfree(umem); } @@ -538,7 +541,7 @@ out: * @access_mask: access permissions needed for this page. * @current_seq: sequence number for synchronization with invalidations. * the sequence number is taken from - * umem->odp_data->notifiers_seq. + * umem_odp->notifiers_seq. * * The function returns -EFAULT if the DMA mapping operation fails. It returns * -EAGAIN if a concurrent invalidation prevents us from updating the page. @@ -548,12 +551,13 @@ out: * umem. */ static int ib_umem_odp_map_dma_single_page( - struct ib_umem *umem, + struct ib_umem_odp *umem_odp, int page_index, struct page *page, u64 access_mask, unsigned long current_seq) { + struct ib_umem *umem = umem_odp->umem; struct ib_device *dev = umem->context->device; dma_addr_t dma_addr; int stored_page = 0; @@ -565,11 +569,11 @@ static int ib_umem_odp_map_dma_single_page( * handle case of a racing notifier. This check also allows us to bail * early if we have a notifier running in parallel with us. */ - if (ib_umem_mmu_notifier_retry(umem, current_seq)) { + if (ib_umem_mmu_notifier_retry(umem_odp, current_seq)) { ret = -EAGAIN; goto out; } - if (!(umem->odp_data->dma_list[page_index])) { + if (!(umem_odp->dma_list[page_index])) { dma_addr = ib_dma_map_page(dev, page, 0, BIT(umem->page_shift), @@ -578,15 +582,15 @@ static int ib_umem_odp_map_dma_single_page( ret = -EFAULT; goto out; } - umem->odp_data->dma_list[page_index] = dma_addr | access_mask; - umem->odp_data->page_list[page_index] = page; + umem_odp->dma_list[page_index] = dma_addr | access_mask; + umem_odp->page_list[page_index] = page; umem->npages++; stored_page = 1; - } else if (umem->odp_data->page_list[page_index] == page) { - umem->odp_data->dma_list[page_index] |= access_mask; + } else if (umem_odp->page_list[page_index] == page) { + umem_odp->dma_list[page_index] |= access_mask; } else { pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n", - umem->odp_data->page_list[page_index], page); + umem_odp->page_list[page_index], page); /* Better remove the mapping now, to prevent any further * damage. */ remove_existing_mapping = 1; @@ -599,7 +603,7 @@ out: if (remove_existing_mapping && umem->context->invalidate_range) { invalidate_page_trampoline( - umem, + umem_odp, ib_umem_start(umem) + (page_index >> umem->page_shift), ib_umem_start(umem) + ((page_index + 1) >> umem->page_shift), @@ -615,7 +619,7 @@ out: * * Pins the range of pages passed in the argument, and maps them to * DMA addresses. The DMA addresses of the mapped pages is updated in - * umem->odp_data->dma_list. + * umem_odp->dma_list. 
* * Returns the number of pages mapped in success, negative error code * for failure. @@ -623,7 +627,7 @@ out: * the function from completing its task. * An -ENOENT error code indicates that userspace process is being terminated * and mm was already destroyed. - * @umem: the umem to map and pin + * @umem_odp: the umem to map and pin * @user_virt: the address from which we need to map. * @bcnt: the minimal number of bytes to pin and map. The mapping might be * bigger due to alignment, and may also be smaller in case of an error @@ -633,11 +637,13 @@ out: * range. * @current_seq: the MMU notifiers sequance value for synchronization with * invalidations. the sequance number is read from - * umem->odp_data->notifiers_seq before calling this function + * umem_odp->notifiers_seq before calling this function */ -int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, - u64 access_mask, unsigned long current_seq) +int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, + u64 bcnt, u64 access_mask, + unsigned long current_seq) { + struct ib_umem *umem = umem_odp->umem; struct task_struct *owning_process = NULL; struct mm_struct *owning_mm = NULL; struct page **local_page_list = NULL; @@ -703,7 +709,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, break; bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt); - mutex_lock(&umem->odp_data->umem_mutex); + mutex_lock(&umem_odp->umem_mutex); for (j = 0; j < npages; j++, user_virt += PAGE_SIZE) { if (user_virt & ~page_mask) { p += PAGE_SIZE; @@ -716,7 +722,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, } ret = ib_umem_odp_map_dma_single_page( - umem, k, local_page_list[j], + umem_odp, k, local_page_list[j], access_mask, current_seq); if (ret < 0) break; @@ -724,7 +730,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, p = page_to_phys(local_page_list[j]); k++; } - mutex_unlock(&umem->odp_data->umem_mutex); + mutex_unlock(&umem_odp->umem_mutex); if (ret < 0) { /* Release left over pages when handling errors. */ @@ -750,9 +756,10 @@ out_no_task: } EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); -void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, +void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 bound) { + struct ib_umem *umem = umem_odp->umem; int idx; u64 addr; struct ib_device *dev = umem->context->device; @@ -764,12 +771,12 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, * faults from completion. We might be racing with other * invalidations, so we must make sure we free each page only * once. 
*/ - mutex_lock(&umem->odp_data->umem_mutex); + mutex_lock(&umem_odp->umem_mutex); for (addr = virt; addr < bound; addr += BIT(umem->page_shift)) { idx = (addr - ib_umem_start(umem)) >> umem->page_shift; - if (umem->odp_data->page_list[idx]) { - struct page *page = umem->odp_data->page_list[idx]; - dma_addr_t dma = umem->odp_data->dma_list[idx]; + if (umem_odp->page_list[idx]) { + struct page *page = umem_odp->page_list[idx]; + dma_addr_t dma = umem_odp->dma_list[idx]; dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK; WARN_ON(!dma_addr); @@ -792,12 +799,12 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt, /* on demand pinning support */ if (!umem->context->invalidate_range) put_page(page); - umem->odp_data->page_list[idx] = NULL; - umem->odp_data->dma_list[idx] = 0; + umem_odp->page_list[idx] = NULL; + umem_odp->dma_list[idx] = 0; umem->npages--; } } - mutex_unlock(&umem->odp_data->umem_mutex); + mutex_unlock(&umem_odp->umem_mutex); } EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); @@ -824,7 +831,7 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, return -EAGAIN; next = rbt_ib_umem_iter_next(node, start, last - 1); umem = container_of(node, struct ib_umem_odp, interval_tree); - ret_val = cb(umem->umem, start, last, cookie) || ret_val; + ret_val = cb(umem, start, last, cookie) || ret_val; } return ret_val; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 81154b598266..dc34ffa4c8b3 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1150,7 +1150,7 @@ void mlx5_ib_pfault(struct mlx5_core_dev *mdev, void *context, int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); int __init mlx5_ib_odp_init(void); void mlx5_ib_odp_cleanup(void); -void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, +void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end); void mlx5_odp_init_mr_cache_entry(struct mlx5_cache_ent *ent); void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index 9fb1d9cb9401..affbf2831ccd 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1631,7 +1631,8 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) synchronize_srcu(&dev->mr_srcu); /* Destroy all page mappings */ if (umem->odp_data->page_list) - mlx5_ib_invalidate_range(umem, ib_umem_start(umem), + mlx5_ib_invalidate_range(to_ib_umem_odp(umem), + ib_umem_start(umem), ib_umem_end(umem)); else mlx5_ib_free_implicit_mr(mr); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index d216e0d2921d..8f4a4a8171eb 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -170,22 +170,24 @@ static void mr_leaf_free_action(struct work_struct *work) wake_up(&imr->q_leaf_free); } -void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, +void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end) { struct mlx5_ib_mr *mr; const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(struct mlx5_mtt)) - 1; u64 idx = 0, blk_start_idx = 0; + struct ib_umem *umem; int in_block = 0; u64 addr; - if (!umem || !umem->odp_data) { + if (!umem_odp) { pr_err("invalidation called on NULL umem or non-ODP umem\n"); return; } + umem = umem_odp->umem; - mr = umem->odp_data->private; + mr = umem_odp->private; if (!mr || !mr->ibmr.pd) 
return; @@ -208,7 +210,7 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, * estimate the cost of another UMR vs. the cost of bigger * UMR. */ - if (umem->odp_data->dma_list[idx] & + if (umem_odp->dma_list[idx] & (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) { if (!in_block) { blk_start_idx = idx; @@ -237,13 +239,13 @@ void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start, * needed. */ - ib_umem_odp_unmap_dma_pages(umem, start, end); + ib_umem_odp_unmap_dma_pages(umem_odp, start, end); if (unlikely(!umem->npages && mr->parent && - !umem->odp_data->dying)) { - WRITE_ONCE(umem->odp_data->dying, 1); + !umem_odp->dying)) { + WRITE_ONCE(umem_odp->dying, 1); atomic_inc(&mr->parent->num_leaf_free); - schedule_work(&umem->odp_data->work); + schedule_work(&umem_odp->work); } } @@ -372,7 +374,6 @@ static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, u64 addr = io_virt & MLX5_IMR_MTT_MASK; int nentries = 0, start_idx = 0, ret; struct mlx5_ib_mr *mtt; - struct ib_umem *umem; mutex_lock(&mr->umem->odp_data->umem_mutex); odp = odp_lookup(ctx, addr, 1, mr); @@ -385,22 +386,22 @@ next_mr: if (nentries) nentries++; } else { - umem = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE); - if (IS_ERR(umem)) { + odp = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE); + if (IS_ERR(odp)) { mutex_unlock(&mr->umem->odp_data->umem_mutex); - return ERR_CAST(umem); + return ERR_CAST(odp); } - mtt = implicit_mr_alloc(mr->ibmr.pd, umem, 0, mr->access_flags); + mtt = implicit_mr_alloc(mr->ibmr.pd, odp->umem, 0, + mr->access_flags); if (IS_ERR(mtt)) { mutex_unlock(&mr->umem->odp_data->umem_mutex); - ib_umem_release(umem); + ib_umem_release(odp->umem); return ERR_CAST(mtt); } - odp = umem->odp_data; odp->private = mtt; - mtt->umem = umem; + mtt->umem = odp->umem; mtt->mmkey.iova = addr; mtt->parent = mr; INIT_WORK(&odp->work, mr_leaf_free_action); @@ -460,24 +461,24 @@ struct mlx5_ib_mr *mlx5_ib_alloc_implicit_mr(struct mlx5_ib_pd *pd, return imr; } -static int mr_leaf_free(struct ib_umem *umem, u64 start, - u64 end, void *cookie) +static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end, + void *cookie) { - struct mlx5_ib_mr *mr = umem->odp_data->private, *imr = cookie; + struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie; + struct ib_umem *umem = umem_odp->umem; if (mr->parent != imr) return 0; - ib_umem_odp_unmap_dma_pages(umem, - ib_umem_start(umem), + ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); - if (umem->odp_data->dying) + if (umem_odp->dying) return 0; - WRITE_ONCE(umem->odp_data->dying, 1); + WRITE_ONCE(umem_odp->dying, 1); atomic_inc(&imr->num_leaf_free); - schedule_work(&umem->odp_data->work); + schedule_work(&umem_odp->work); return 0; } @@ -533,7 +534,7 @@ next_mr: */ smp_rmb(); - ret = ib_umem_odp_map_dma_pages(mr->umem, io_virt, size, + ret = ib_umem_odp_map_dma_pages(to_ib_umem_odp(mr->umem), io_virt, size, access_mask, current_seq); if (ret < 0) @@ -542,7 +543,8 @@ next_mr: np = ret; mutex_lock(&odp->umem_mutex); - if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) { + if (!ib_umem_mmu_notifier_retry(to_ib_umem_odp(mr->umem), + current_seq)) { /* * No need to check whether the MTTs really belong to * this MR, since ib_umem_odp_map_dma_pages already diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 381cdf5a9bd1..3ef2975b5fb2 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -82,15 +82,18 @@ struct ib_umem_odp { struct work_struct work; 
}; +static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) +{ + return umem->odp_data; +} + #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, int access); -struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, - unsigned long addr, - size_t size); - -void ib_umem_odp_release(struct ib_umem *umem); +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, + unsigned long addr, size_t size); +void ib_umem_odp_release(struct ib_umem_odp *umem_odp); /* * The lower 2 bits of the DMA address signal the R/W permissions for @@ -105,13 +108,14 @@ void ib_umem_odp_release(struct ib_umem *umem); #define ODP_DMA_ADDR_MASK (~(ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) -int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 start_offset, u64 bcnt, - u64 access_mask, unsigned long current_seq); +int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, + u64 bcnt, u64 access_mask, + unsigned long current_seq); -void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 start_offset, +void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 start_offset, u64 bound); -typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end, +typedef int (*umem_call_back)(struct ib_umem_odp *item, u64 start, u64 end, void *cookie); /* * Call the callback on each ib_umem in the range. Returns the logical or of @@ -129,25 +133,25 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, struct ib_umem_odp *rbt_ib_umem_lookup(struct rb_root_cached *root, u64 addr, u64 length); -static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item, +static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, unsigned long mmu_seq) { /* * This code is strongly based on the KVM code from * mmu_notifier_retry. Should be called with - * the relevant locks taken (item->odp_data->umem_mutex + * the relevant locks taken (umem_odp->umem_mutex * and the ucontext umem_mutex semaphore locked for read). */ /* Do not allow page faults while the new ib_umem hasn't seen a state * with zero notifiers yet, and doesn't have its own valid set of * private counters. */ - if (!item->odp_data->mn_counters_active) + if (!umem_odp->mn_counters_active) return 1; - if (unlikely(item->odp_data->notifiers_count)) + if (unlikely(umem_odp->notifiers_count)) return 1; - if (item->odp_data->notifiers_seq != mmu_seq) + if (umem_odp->notifiers_seq != mmu_seq) return 1; return 0; } @@ -161,14 +165,13 @@ static inline int ib_umem_odp_get(struct ib_ucontext *context, return -EINVAL; } -static inline struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, - unsigned long addr, - size_t size) +static inline struct ib_umem_odp * +ib_alloc_odp_umem(struct ib_ucontext *context, unsigned long addr, size_t size) { return ERR_PTR(-EINVAL); } -static inline void ib_umem_odp_release(struct ib_umem *umem) {} +static inline void ib_umem_odp_release(struct ib_umem_odp *umem_odp) {} #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index a66238d8a2a3..d611ce9df7fb 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -69,6 +69,8 @@ #define IB_FW_VERSION_NAME_MAX ETHTOOL_FWVERS_LEN +struct ib_umem_odp; + extern struct workqueue_struct *ib_wq; extern struct workqueue_struct *ib_comp_wq; extern struct workqueue_struct *ib_comp_unbound_wq; @@ -1506,7 +1508,7 @@ struct ib_ucontext { * mmu notifiers registration. 
*/ struct rw_semaphore umem_rwsem; - void (*invalidate_range)(struct ib_umem *umem, + void (*invalidate_range)(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end); struct mmu_notifier mn; From 41b4deeaa123e62e1037af7a0be547af2e0e05f1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:05 +0300 Subject: [PATCH 103/242] RDMA/umem: Make ib_umem_odp into a sub structure of ib_umem These two structures are linked together; use the container_of pattern instead of a double allocation to make the code simpler and easier to follow. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem.c | 36 ++++++++------ drivers/infiniband/core/umem_odp.c | 79 ++++++++++++------------ drivers/infiniband/hw/mlx5/odp.c | 26 +++++----- include/rdma/ib_umem_odp.h | 11 ++--- 4 files changed, 69 insertions(+), 83 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 971d92ddea8f..88b9b88f90e1 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -108,34 +108,39 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, if (!can_do_mlock()) return ERR_PTR(-EPERM); - umem = kzalloc(sizeof *umem, GFP_KERNEL); - if (!umem) - return ERR_PTR(-ENOMEM); + if (access & IB_ACCESS_ON_DEMAND) { + umem = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + umem->odp_data = to_ib_umem_odp(umem); + } else { + umem = kzalloc(sizeof(*umem), GFP_KERNEL); + if (!umem) + return ERR_PTR(-ENOMEM); + } umem->context = context; umem->length = size; umem->address = addr; umem->page_shift = PAGE_SHIFT; umem->writable = ib_access_writable(access); + umem->owning_mm = mm = current->mm; + mmgrab(mm); if (access & IB_ACCESS_ON_DEMAND) { - ret = ib_umem_odp_get(context, umem, access); + ret = ib_umem_odp_get(to_ib_umem_odp(umem), access); if (ret) goto umem_kfree; return umem; } - umem->owning_mm = mm = current->mm; - mmgrab(mm); - umem->odp_data = NULL; - /* We assume the memory is from hugetlb until proved otherwise */ umem->hugetlb = 1; page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) { ret = -ENOMEM; - goto umem_kfree_drop; + goto umem_kfree; } /* @@ -226,12 +231,11 @@ out: if (vma_list) free_page((unsigned long) vma_list); free_page((unsigned long) page_list); -umem_kfree_drop: - if (ret) - mmdrop(umem->owning_mm); umem_kfree: - if (ret) + if (ret) { + mmdrop(umem->owning_mm); kfree(umem); + } return ret ?
ERR_PTR(ret) : umem; } EXPORT_SYMBOL(ib_umem_get); @@ -239,7 +243,10 @@ EXPORT_SYMBOL(ib_umem_get); static void __ib_umem_release_tail(struct ib_umem *umem) { mmdrop(umem->owning_mm); - kfree(umem); + if (umem->odp_data) + kfree(to_ib_umem_odp(umem)); + else + kfree(umem); } static void ib_umem_release_defer(struct work_struct *work) @@ -263,6 +270,7 @@ void ib_umem_release(struct ib_umem *umem) if (umem->odp_data) { ib_umem_odp_release(to_ib_umem_odp(umem)); + __ib_umem_release_tail(umem); return; } diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 8405e9afd7dc..900fdedfe910 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -58,7 +58,7 @@ static u64 node_start(struct umem_odp_node *n) struct ib_umem_odp *umem_odp = container_of(n, struct ib_umem_odp, interval_tree); - return ib_umem_start(umem_odp->umem); + return ib_umem_start(&umem_odp->umem); } /* Note that the representation of the intervals in the interval tree @@ -71,7 +71,7 @@ static u64 node_last(struct umem_odp_node *n) struct ib_umem_odp *umem_odp = container_of(n, struct ib_umem_odp, interval_tree); - return ib_umem_end(umem_odp->umem) - 1; + return ib_umem_end(&umem_odp->umem) - 1; } INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, @@ -159,7 +159,7 @@ static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, u64 start, u64 end, void *cookie) { - struct ib_umem *umem = umem_odp->umem; + struct ib_umem *umem = &umem_odp->umem; /* * Increase the number of notifiers running, to @@ -198,7 +198,7 @@ static int invalidate_page_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { ib_umem_notifier_start_account(item); - item->umem->context->invalidate_range(item, start, start + PAGE_SIZE); + item->umem.context->invalidate_range(item, start, start + PAGE_SIZE); ib_umem_notifier_end_account(item); return 0; } @@ -207,7 +207,7 @@ static int invalidate_range_start_trampoline(struct ib_umem_odp *item, u64 start, u64 end, void *cookie) { ib_umem_notifier_start_account(item); - item->umem->context->invalidate_range(item, start, end); + item->umem.context->invalidate_range(item, start, end); return 0; } @@ -277,28 +277,21 @@ static const struct mmu_notifier_ops ib_umem_notifiers = { struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, unsigned long addr, size_t size) { - struct ib_umem *umem; struct ib_umem_odp *odp_data; + struct ib_umem *umem; int pages = size >> PAGE_SHIFT; int ret; - umem = kzalloc(sizeof(*umem), GFP_KERNEL); - if (!umem) + odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); + if (!odp_data) return ERR_PTR(-ENOMEM); - + umem = &odp_data->umem; umem->context = context; umem->length = size; umem->address = addr; umem->page_shift = PAGE_SHIFT; umem->writable = 1; - odp_data = kzalloc(sizeof(*odp_data), GFP_KERNEL); - if (!odp_data) { - ret = -ENOMEM; - goto out_umem; - } - odp_data->umem = umem; - mutex_init(&odp_data->umem_mutex); init_completion(&odp_data->notifier_completion); @@ -334,15 +327,14 @@ out_page_list: vfree(odp_data->page_list); out_odp_data: kfree(odp_data); -out_umem: - kfree(umem); return ERR_PTR(ret); } EXPORT_SYMBOL(ib_alloc_odp_umem); -int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, - int access) +int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) { + struct ib_ucontext *context = umem_odp->umem.context; + struct ib_umem *umem = &umem_odp->umem; 
int ret_val; struct pid *our_pid; struct mm_struct *mm = get_task_mm(current); @@ -378,30 +370,23 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, goto out_mm; } - umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL); - if (!umem->odp_data) { - ret_val = -ENOMEM; - goto out_mm; - } - umem->odp_data->umem = umem; + mutex_init(&umem_odp->umem_mutex); - mutex_init(&umem->odp_data->umem_mutex); - - init_completion(&umem->odp_data->notifier_completion); + init_completion(&umem_odp->notifier_completion); if (ib_umem_num_pages(umem)) { - umem->odp_data->page_list = - vzalloc(array_size(sizeof(*umem->odp_data->page_list), + umem_odp->page_list = + vzalloc(array_size(sizeof(*umem_odp->page_list), ib_umem_num_pages(umem))); - if (!umem->odp_data->page_list) { + if (!umem_odp->page_list) { ret_val = -ENOMEM; - goto out_odp_data; + goto out_mm; } - umem->odp_data->dma_list = - vzalloc(array_size(sizeof(*umem->odp_data->dma_list), + umem_odp->dma_list = + vzalloc(array_size(sizeof(*umem_odp->dma_list), ib_umem_num_pages(umem))); - if (!umem->odp_data->dma_list) { + if (!umem_odp->dma_list) { ret_val = -ENOMEM; goto out_page_list; } @@ -415,13 +400,13 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, down_write(&context->umem_rwsem); context->odp_mrs_count++; if (likely(ib_umem_start(umem) != ib_umem_end(umem))) - rbt_ib_umem_insert(&umem->odp_data->interval_tree, + rbt_ib_umem_insert(&umem_odp->interval_tree, &context->umem_tree); if (likely(!atomic_read(&context->notifier_count)) || context->odp_mrs_count == 1) - umem->odp_data->mn_counters_active = true; + umem_odp->mn_counters_active = true; else - list_add(&umem->odp_data->no_private_counters, + list_add(&umem_odp->no_private_counters, &context->no_private_counters); downgrade_write(&context->umem_rwsem); @@ -454,11 +439,9 @@ int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, out_mutex: up_read(&context->umem_rwsem); - vfree(umem->odp_data->dma_list); + vfree(umem_odp->dma_list); out_page_list: - vfree(umem->odp_data->page_list); -out_odp_data: - kfree(umem->odp_data); + vfree(umem_odp->page_list); out_mm: mmput(mm); return ret_val; @@ -466,7 +449,7 @@ out_mm: void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { - struct ib_umem *umem = umem_odp->umem; + struct ib_umem *umem = &umem_odp->umem; struct ib_ucontext *context = umem->context; /* @@ -528,8 +511,6 @@ out: vfree(umem_odp->dma_list); vfree(umem_odp->page_list); - kfree(umem_odp); - kfree(umem); } /* @@ -557,7 +538,7 @@ static int ib_umem_odp_map_dma_single_page( u64 access_mask, unsigned long current_seq) { - struct ib_umem *umem = umem_odp->umem; + struct ib_umem *umem = &umem_odp->umem; struct ib_device *dev = umem->context->device; dma_addr_t dma_addr; int stored_page = 0; @@ -643,7 +624,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, u64 bcnt, u64 access_mask, unsigned long current_seq) { - struct ib_umem *umem = umem_odp->umem; + struct ib_umem *umem = &umem_odp->umem; struct task_struct *owning_process = NULL; struct mm_struct *owning_mm = NULL; struct page **local_page_list = NULL; @@ -759,7 +740,7 @@ EXPORT_SYMBOL(ib_umem_odp_map_dma_pages); void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 bound) { - struct ib_umem *umem = umem_odp->umem; + struct ib_umem *umem = &umem_odp->umem; int idx; u64 addr; struct ib_device *dev = umem->context->device; diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 
8f4a4a8171eb..5b9fd56186bd 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -64,7 +64,7 @@ static int check_parent(struct ib_umem_odp *odp, static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) { struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent; - struct ib_ucontext *ctx = odp->umem->context; + struct ib_ucontext *ctx = odp->umem.context; struct rb_node *rb; down_read(&ctx->umem_rwsem); @@ -102,7 +102,7 @@ static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx, if (!rb) goto not_found; odp = rb_entry(rb, struct ib_umem_odp, interval_tree.rb); - if (ib_umem_start(odp->umem) > start + length) + if (ib_umem_start(&odp->umem) > start + length) goto not_found; } not_found: @@ -137,7 +137,7 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, for (i = 0; i < nentries; i++, pklm++) { pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); va = (offset + i) * MLX5_IMR_MTT_SIZE; - if (odp && odp->umem->address == va) { + if (odp && odp->umem.address == va) { struct mlx5_ib_mr *mtt = odp->private; pklm->key = cpu_to_be32(mtt->ibmr.lkey); @@ -153,13 +153,13 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, static void mr_leaf_free_action(struct work_struct *work) { struct ib_umem_odp *odp = container_of(work, struct ib_umem_odp, work); - int idx = ib_umem_start(odp->umem) >> MLX5_IMR_MTT_SHIFT; + int idx = ib_umem_start(&odp->umem) >> MLX5_IMR_MTT_SHIFT; struct mlx5_ib_mr *mr = odp->private, *imr = mr->parent; mr->parent = NULL; synchronize_srcu(&mr->dev->mr_srcu); - ib_umem_release(odp->umem); + ib_umem_release(&odp->umem); if (imr->live) mlx5_ib_update_xlt(imr, idx, 1, 0, MLX5_IB_UPD_XLT_INDIRECT | @@ -185,7 +185,7 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, pr_err("invalidation called on NULL umem or non-ODP umem\n"); return; } - umem = umem_odp->umem; + umem = &umem_odp->umem; mr = umem_odp->private; @@ -392,16 +392,16 @@ next_mr: return ERR_CAST(odp); } - mtt = implicit_mr_alloc(mr->ibmr.pd, odp->umem, 0, + mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0, mr->access_flags); if (IS_ERR(mtt)) { mutex_unlock(&mr->umem->odp_data->umem_mutex); - ib_umem_release(odp->umem); + ib_umem_release(&odp->umem); return ERR_CAST(mtt); } odp->private = mtt; - mtt->umem = odp->umem; + mtt->umem = &odp->umem; mtt->mmkey.iova = addr; mtt->parent = mr; INIT_WORK(&odp->work, mr_leaf_free_action); @@ -418,7 +418,7 @@ next_mr: addr += MLX5_IMR_MTT_SIZE; if (unlikely(addr < io_virt + bcnt)) { odp = odp_next(odp); - if (odp && odp->umem->address != addr) + if (odp && odp->umem.address != addr) odp = NULL; goto next_mr; } @@ -465,7 +465,7 @@ static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end, void *cookie) { struct mlx5_ib_mr *mr = umem_odp->private, *imr = cookie; - struct ib_umem *umem = umem_odp->umem; + struct ib_umem *umem = &umem_odp->umem; if (mr->parent != imr) return 0; @@ -518,7 +518,7 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, } next_mr: - size = min_t(size_t, bcnt, ib_umem_end(odp->umem) - io_virt); + size = min_t(size_t, bcnt, ib_umem_end(&odp->umem) - io_virt); page_shift = mr->umem->page_shift; page_mask = ~(BIT(page_shift) - 1); @@ -577,7 +577,7 @@ next_mr: io_virt += size; next = odp_next(odp); - if (unlikely(!next || next->umem->address != io_virt)) { + if (unlikely(!next || next->umem.address != io_virt)) { mlx5_ib_dbg(dev, "next implicit leaf removed at 0x%llx. 
got %p\n", io_virt, next); return -EAGAIN; diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 3ef2975b5fb2..4519ea663df5 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -43,6 +43,7 @@ struct umem_odp_node { }; struct ib_umem_odp { + struct ib_umem umem; /* * An array of the pages included in the on-demand paging umem. * Indices of pages that are currently not mapped into the device will @@ -72,7 +73,6 @@ struct ib_umem_odp { /* A linked list of umems that don't have private mmu notifier * counters yet. */ struct list_head no_private_counters; - struct ib_umem *umem; /* Tree tracking */ struct umem_odp_node interval_tree; @@ -84,13 +84,12 @@ struct ib_umem_odp { static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) { - return umem->odp_data; + return container_of(umem, struct ib_umem_odp, umem); } #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING -int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, - int access); +int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access); struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, unsigned long addr, size_t size); void ib_umem_odp_release(struct ib_umem_odp *umem_odp); @@ -158,9 +157,7 @@ static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp, #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ -static inline int ib_umem_odp_get(struct ib_ucontext *context, - struct ib_umem *umem, - int access) +static inline int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) { return -EINVAL; } From 597ecc5a095406a668e53ab330495ddb65327f77 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:06 +0300 Subject: [PATCH 104/242] RDMA/umem: Get rid of struct ib_umem.odp_data This no longer has any use; we can use container_of to get to the umem_odp, and a simple flag to indicate if this is an odp MR. Remove the few remaining references to it.
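As a minimal sketch of the resulting pattern (to_ib_umem_odp() is verbatim from this series; odp_release_example() is a hypothetical caller added only for illustration):

	static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem)
	{
		/* ib_umem_odp embeds its ib_umem, so a one-bit flag test
		 * plus container_of() replaces the old umem->odp_data
		 * pointer chase (and the second allocation behind it).
		 */
		return container_of(umem, struct ib_umem_odp, umem);
	}

	static void odp_release_example(struct ib_umem *umem)
	{
		if (umem->is_odp)
			ib_umem_odp_release(to_ib_umem_odp(umem));
	}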
Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem.c | 8 ++++---- drivers/infiniband/core/umem_odp.c | 3 +-- drivers/infiniband/hw/mlx5/mem.c | 9 ++++----- drivers/infiniband/hw/mlx5/mr.c | 13 +++++++------ drivers/infiniband/hw/mlx5/odp.c | 14 ++++++++------ include/rdma/ib_umem.h | 6 +++--- 6 files changed, 27 insertions(+), 26 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 88b9b88f90e1..fec5d489e311 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -112,7 +112,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem = kzalloc(sizeof(struct ib_umem_odp), GFP_KERNEL); if (!umem) return ERR_PTR(-ENOMEM); - umem->odp_data = to_ib_umem_odp(umem); + umem->is_odp = 1; } else { umem = kzalloc(sizeof(*umem), GFP_KERNEL); if (!umem) @@ -243,7 +243,7 @@ EXPORT_SYMBOL(ib_umem_get); static void __ib_umem_release_tail(struct ib_umem *umem) { mmdrop(umem->owning_mm); - if (umem->odp_data) + if (umem->is_odp) kfree(to_ib_umem_odp(umem)); else kfree(umem); @@ -268,7 +268,7 @@ void ib_umem_release(struct ib_umem *umem) { struct ib_ucontext *context = umem->context; - if (umem->odp_data) { + if (umem->is_odp) { ib_umem_odp_release(to_ib_umem_odp(umem)); __ib_umem_release_tail(umem); return; @@ -306,7 +306,7 @@ int ib_umem_page_count(struct ib_umem *umem) int n; struct scatterlist *sg; - if (umem->odp_data) + if (umem->is_odp) return ib_umem_num_pages(umem); n = 0; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 900fdedfe910..42272b2bf595 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -291,6 +291,7 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, umem->address = addr; umem->page_shift = PAGE_SHIFT; umem->writable = 1; + umem->is_odp = 1; mutex_init(&odp_data->umem_mutex); init_completion(&odp_data->notifier_completion); @@ -319,8 +320,6 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, &context->no_private_counters); up_write(&context->umem_rwsem); - umem->odp_data = odp_data; - return odp_data; out_page_list: diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c index f3dbd75a0a96..549234988bb4 100644 --- a/drivers/infiniband/hw/mlx5/mem.c +++ b/drivers/infiniband/hw/mlx5/mem.c @@ -57,7 +57,7 @@ void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int entry; unsigned long page_shift = umem->page_shift; - if (umem->odp_data) { + if (umem->is_odp) { *ncont = ib_umem_page_count(umem); *count = *ncont << (page_shift - PAGE_SHIFT); *shift = page_shift; @@ -152,14 +152,13 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, struct scatterlist *sg; int entry; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - const bool odp = umem->odp_data != NULL; - - if (odp) { + if (umem->is_odp) { WARN_ON(shift != 0); WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)); for (i = 0; i < num_pages; ++i) { - dma_addr_t pa = umem->odp_data->dma_list[offset + i]; + dma_addr_t pa = + to_ib_umem_odp(umem)->dma_list[offset + i]; pas[i] = cpu_to_be64(umem_dma_to_mtt(pa)); } diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index affbf2831ccd..6aac3a107330 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -98,7 +98,7 @@ static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length) #ifdef 
CONFIG_INFINIBAND_ON_DEMAND_PAGING static void update_odp_mr(struct mlx5_ib_mr *mr) { - if (mr->umem->odp_data) { + if (mr->umem->is_odp) { /* * This barrier prevents the compiler from moving the * setting of umem->odp_data->private to point to our @@ -107,7 +107,7 @@ static void update_odp_mr(struct mlx5_ib_mr *mr) * handle invalidations. */ smp_wmb(); - mr->umem->odp_data->private = mr; + to_ib_umem_odp(mr->umem)->private = mr; /* * Make sure we will see the new * umem->odp_data->private value in the invalidation @@ -1624,15 +1624,16 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr) struct ib_umem *umem = mr->umem; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - if (umem && umem->odp_data) { + if (umem && umem->is_odp) { + struct ib_umem_odp *umem_odp = to_ib_umem_odp(umem); + /* Prevent new page faults from succeeding */ mr->live = 0; /* Wait for all running page-fault handlers to finish. */ synchronize_srcu(&dev->mr_srcu); /* Destroy all page mappings */ - if (umem->odp_data->page_list) - mlx5_ib_invalidate_range(to_ib_umem_odp(umem), - ib_umem_start(umem), + if (umem_odp->page_list) + mlx5_ib_invalidate_range(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); else mlx5_ib_free_implicit_mr(mr); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 5b9fd56186bd..d4780bded74a 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -371,11 +371,12 @@ static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context; struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device); struct ib_umem_odp *odp, *result = NULL; + struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); u64 addr = io_virt & MLX5_IMR_MTT_MASK; int nentries = 0, start_idx = 0, ret; struct mlx5_ib_mr *mtt; - mutex_lock(&mr->umem->odp_data->umem_mutex); + mutex_lock(&odp_mr->umem_mutex); odp = odp_lookup(ctx, addr, 1, mr); mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n", @@ -388,14 +389,14 @@ next_mr: } else { odp = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE); if (IS_ERR(odp)) { - mutex_unlock(&mr->umem->odp_data->umem_mutex); + mutex_unlock(&odp_mr->umem_mutex); return ERR_CAST(odp); } mtt = implicit_mr_alloc(mr->ibmr.pd, &odp->umem, 0, mr->access_flags); if (IS_ERR(mtt)) { - mutex_unlock(&mr->umem->odp_data->umem_mutex); + mutex_unlock(&odp_mr->umem_mutex); ib_umem_release(&odp->umem); return ERR_CAST(mtt); } @@ -433,7 +434,7 @@ next_mr: } } - mutex_unlock(&mr->umem->odp_data->umem_mutex); + mutex_unlock(&odp_mr->umem_mutex); return result; } @@ -498,6 +499,7 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt, u32 *bytes_mapped) { + struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); u64 access_mask = ODP_READ_ALLOWED_BIT; int npages = 0, page_shift, np; u64 start_idx, page_mask; @@ -506,7 +508,7 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, size_t size; int ret; - if (!mr->umem->odp_data->page_list) { + if (!odp_mr->page_list) { odp = implicit_mr_get_data(mr, io_virt, bcnt); if (IS_ERR(odp)) @@ -514,7 +516,7 @@ static int pagefault_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr, mr = odp->private; } else { - odp = mr->umem->odp_data; + odp = odp_mr; } next_mr: diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index e1c00b2ead19..5d3755ec5afa 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -46,10 +46,10 
@@ struct ib_umem { size_t length; unsigned long address; int page_shift; - int writable; - int hugetlb; + u32 writable : 1; + u32 hugetlb : 1; + u32 is_odp : 1; struct work_struct work; - struct ib_umem_odp *odp_data; struct sg_table sg_head; int nmap; int npages; From c9990ab39b6e911003bab10a6da96e98ab1503a3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:07 +0300 Subject: [PATCH 105/242] RDMA/umem: Move all the ODP related stuff out of ucontext and into per_mm This is the first step to make ODP use the owning_mm that is now part of struct ib_umem. Each ODP umem is linked to a single per_mm structure, which in turn is linked to a single mm via the embedded mmu_notifier. This first patch introduces the structure and reworks everything to use it. This also needs to introduce tgid into the ib_ucontext_per_mm, as get_user_pages_remote() requires the originating task for statistics tracking. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem_odp.c | 127 ++++++++++++++------------- drivers/infiniband/core/uverbs_cmd.c | 9 +- drivers/infiniband/hw/mlx5/odp.c | 43 +++++---- include/rdma/ib_umem_odp.h | 2 + include/rdma/ib_verbs.h | 32 ++++--- 5 files changed, 120 insertions(+), 93 deletions(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 42272b2bf595..6bf3fc0c12a1 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -115,34 +115,35 @@ static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) } /* Account for a new mmu notifier in an ib_ucontext. */ -static void ib_ucontext_notifier_start_account(struct ib_ucontext *context) +static void +ib_ucontext_notifier_start_account(struct ib_ucontext_per_mm *per_mm) { - atomic_inc(&context->notifier_count); + atomic_inc(&per_mm->notifier_count); } /* Account for a terminating mmu notifier in an ib_ucontext. * * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since * the function takes the semaphore itself. */ -static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) +static void ib_ucontext_notifier_end_account(struct ib_ucontext_per_mm *per_mm) { - int zero_notifiers = atomic_dec_and_test(&context->notifier_count); + int zero_notifiers = atomic_dec_and_test(&per_mm->notifier_count); if (zero_notifiers && - !list_empty(&context->no_private_counters)) { + !list_empty(&per_mm->no_private_counters)) { /* No currently running mmu notifiers. Now is the chance to * add private accounting to all previously added umems. */ struct ib_umem_odp *odp_data, *next; /* Prevent concurrent mmu notifiers from working on the * no_private_counters list. */ - down_write(&context->umem_rwsem); + down_write(&per_mm->umem_rwsem); /* Read the notifier_count again, with the umem_rwsem * semaphore taken for write.
*/ - if (!atomic_read(&context->notifier_count)) { + if (!atomic_read(&per_mm->notifier_count)) { list_for_each_entry_safe(odp_data, next, - &context->no_private_counters, + &per_mm->no_private_counters, no_private_counters) { mutex_lock(&odp_data->umem_mutex); odp_data->mn_counters_active = true; @@ -152,7 +153,7 @@ static void ib_ucontext_notifier_end_account(struct ib_ucontext *context) } } - up_write(&context->umem_rwsem); + up_write(&per_mm->umem_rwsem); } } @@ -179,19 +180,20 @@ static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, static void ib_umem_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { - struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + struct ib_ucontext_per_mm *per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); - if (!context->invalidate_range) + if (!per_mm->context->invalidate_range) return; - ib_ucontext_notifier_start_account(context); - down_read(&context->umem_rwsem); - rbt_ib_umem_for_each_in_range(&context->umem_tree, 0, + ib_ucontext_notifier_start_account(per_mm); + down_read(&per_mm->umem_rwsem); + rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX, ib_umem_notifier_release_trampoline, true, NULL); - up_read(&context->umem_rwsem); + up_read(&per_mm->umem_rwsem); } static int invalidate_page_trampoline(struct ib_umem_odp *item, u64 start, @@ -217,23 +219,24 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, unsigned long end, bool blockable) { - struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + struct ib_ucontext_per_mm *per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); int ret; - if (!context->invalidate_range) + if (!per_mm->context->invalidate_range) return 0; if (blockable) - down_read(&context->umem_rwsem); - else if (!down_read_trylock(&context->umem_rwsem)) + down_read(&per_mm->umem_rwsem); + else if (!down_read_trylock(&per_mm->umem_rwsem)) return -EAGAIN; - ib_ucontext_notifier_start_account(context); - ret = rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + ib_ucontext_notifier_start_account(per_mm); + ret = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, invalidate_range_start_trampoline, blockable, NULL); - up_read(&context->umem_rwsem); + up_read(&per_mm->umem_rwsem); return ret; } @@ -250,9 +253,10 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, unsigned long start, unsigned long end) { - struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + struct ib_ucontext_per_mm *per_mm = + container_of(mn, struct ib_ucontext_per_mm, mn); - if (!context->invalidate_range) + if (!per_mm->context->invalidate_range) return; /* @@ -260,12 +264,12 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, * in ib_umem_notifier_invalidate_range_start so we shouldn't really block * here. But this is ugly and fragile. 
*/ - down_read(&context->umem_rwsem); - rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + down_read(&per_mm->umem_rwsem); + rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, invalidate_range_end_trampoline, true, NULL); - up_read(&context->umem_rwsem); - ib_ucontext_notifier_end_account(context); + up_read(&per_mm->umem_rwsem); + ib_ucontext_notifier_end_account(per_mm); } static const struct mmu_notifier_ops ib_umem_notifiers = { @@ -277,6 +281,7 @@ static const struct mmu_notifier_ops ib_umem_notifiers = { struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, unsigned long addr, size_t size) { + struct ib_ucontext_per_mm *per_mm; struct ib_umem_odp *odp_data; struct ib_umem *umem; int pages = size >> PAGE_SHIFT; @@ -292,6 +297,7 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, umem->page_shift = PAGE_SHIFT; umem->writable = 1; umem->is_odp = 1; + odp_data->per_mm = per_mm = &context->per_mm; mutex_init(&odp_data->umem_mutex); init_completion(&odp_data->notifier_completion); @@ -310,15 +316,15 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, goto out_page_list; } - down_write(&context->umem_rwsem); - context->odp_mrs_count++; - rbt_ib_umem_insert(&odp_data->interval_tree, &context->umem_tree); - if (likely(!atomic_read(&context->notifier_count))) + down_write(&per_mm->umem_rwsem); + per_mm->odp_mrs_count++; + rbt_ib_umem_insert(&odp_data->interval_tree, &per_mm->umem_tree); + if (likely(!atomic_read(&per_mm->notifier_count))) odp_data->mn_counters_active = true; else list_add(&odp_data->no_private_counters, - &context->no_private_counters); - up_write(&context->umem_rwsem); + &per_mm->no_private_counters); + up_write(&per_mm->umem_rwsem); return odp_data; @@ -334,6 +340,7 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) { struct ib_ucontext *context = umem_odp->umem.context; struct ib_umem *umem = &umem_odp->umem; + struct ib_ucontext_per_mm *per_mm; int ret_val; struct pid *our_pid; struct mm_struct *mm = get_task_mm(current); @@ -396,28 +403,30 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) * notification before the "current" task (and MM) is * destroyed. We use the umem_rwsem semaphore to synchronize. */ - down_write(&context->umem_rwsem); - context->odp_mrs_count++; + umem_odp->per_mm = per_mm = &context->per_mm; + + down_write(&per_mm->umem_rwsem); + per_mm->odp_mrs_count++; if (likely(ib_umem_start(umem) != ib_umem_end(umem))) rbt_ib_umem_insert(&umem_odp->interval_tree, - &context->umem_tree); - if (likely(!atomic_read(&context->notifier_count)) || - context->odp_mrs_count == 1) + &per_mm->umem_tree); + if (likely(!atomic_read(&per_mm->notifier_count)) || + per_mm->odp_mrs_count == 1) umem_odp->mn_counters_active = true; else list_add(&umem_odp->no_private_counters, - &context->no_private_counters); - downgrade_write(&context->umem_rwsem); + &per_mm->no_private_counters); + downgrade_write(&per_mm->umem_rwsem); - if (context->odp_mrs_count == 1) { + if (per_mm->odp_mrs_count == 1) { /* * Note that at this point, no MMU notifier is running - * for this context! + * for this per_mm! 
*/ - atomic_set(&context->notifier_count, 0); - INIT_HLIST_NODE(&context->mn.hlist); - context->mn.ops = &ib_umem_notifiers; - ret_val = mmu_notifier_register(&context->mn, mm); + atomic_set(&per_mm->notifier_count, 0); + INIT_HLIST_NODE(&per_mm->mn.hlist); + per_mm->mn.ops = &ib_umem_notifiers; + ret_val = mmu_notifier_register(&per_mm->mn, mm); if (ret_val) { pr_err("Failed to register mmu_notifier %d\n", ret_val); ret_val = -EBUSY; @@ -425,7 +434,7 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) } } - up_read(&context->umem_rwsem); + up_read(&per_mm->umem_rwsem); /* * Note that doing an mmput can cause a notifier for the relevant mm. @@ -437,7 +446,7 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) return 0; out_mutex: - up_read(&context->umem_rwsem); + up_read(&per_mm->umem_rwsem); vfree(umem_odp->dma_list); out_page_list: vfree(umem_odp->page_list); @@ -449,7 +458,7 @@ out_mm: void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { struct ib_umem *umem = &umem_odp->umem; - struct ib_ucontext *context = umem->context; + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; /* * Ensure that no more pages are mapped in the umem. @@ -460,11 +469,11 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); - down_write(&context->umem_rwsem); + down_write(&per_mm->umem_rwsem); if (likely(ib_umem_start(umem) != ib_umem_end(umem))) rbt_ib_umem_remove(&umem_odp->interval_tree, - &context->umem_tree); - context->odp_mrs_count--; + &per_mm->umem_tree); + per_mm->odp_mrs_count--; if (!umem_odp->mn_counters_active) { list_del(&umem_odp->no_private_counters); complete_all(&umem_odp->notifier_completion); @@ -477,13 +486,13 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) * that since we are doing it atomically, no other user could register * and unregister while we do the check. */ - downgrade_write(&context->umem_rwsem); - if (!context->odp_mrs_count) { + downgrade_write(&per_mm->umem_rwsem); + if (!per_mm->odp_mrs_count) { struct task_struct *owning_process = NULL; struct mm_struct *owning_mm = NULL; - owning_process = get_pid_task(context->tgid, - PIDTYPE_PID); + owning_process = + get_pid_task(umem_odp->umem.context->tgid, PIDTYPE_PID); if (owning_process == NULL) /* * The process is already dead, notifier were removed @@ -498,7 +507,7 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) * removed already. 
*/ goto out_put_task; - mmu_notifier_unregister(&context->mn, owning_mm); + mmu_notifier_unregister(&per_mm->mn, owning_mm); mmput(owning_mm); @@ -506,7 +515,7 @@ out_put_task: put_task_struct(owning_process); } out: - up_read(&context->umem_rwsem); + up_read(&per_mm->umem_rwsem); vfree(umem_odp->dma_list); vfree(umem_odp->page_list); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 9c87c98a0f19..ce678e1008a4 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -124,10 +124,11 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, ucontext->cleanup_retryable = false; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - ucontext->umem_tree = RB_ROOT_CACHED; - init_rwsem(&ucontext->umem_rwsem); - ucontext->odp_mrs_count = 0; - INIT_LIST_HEAD(&ucontext->no_private_counters); + ucontext->per_mm.umem_tree = RB_ROOT_CACHED; + init_rwsem(&ucontext->per_mm.umem_rwsem); + ucontext->per_mm.odp_mrs_count = 0; + INIT_LIST_HEAD(&ucontext->per_mm.no_private_counters); + ucontext->per_mm.context = ucontext; if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) ucontext->invalidate_range = NULL; diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index d4780bded74a..9982b5f4e598 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -61,13 +61,21 @@ static int check_parent(struct ib_umem_odp *odp, return mr && mr->parent == parent && !odp->dying; } +struct ib_ucontext_per_mm *mr_to_per_mm(struct mlx5_ib_mr *mr) +{ + if (WARN_ON(!mr || !mr->umem || !mr->umem->is_odp)) + return NULL; + + return to_ib_umem_odp(mr->umem)->per_mm; +} + static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) { struct mlx5_ib_mr *mr = odp->private, *parent = mr->parent; - struct ib_ucontext *ctx = odp->umem.context; + struct ib_ucontext_per_mm *per_mm = odp->per_mm; struct rb_node *rb; - down_read(&ctx->umem_rwsem); + down_read(&per_mm->umem_rwsem); while (1) { rb = rb_next(&odp->interval_tree.rb); if (!rb) @@ -79,19 +87,19 @@ static struct ib_umem_odp *odp_next(struct ib_umem_odp *odp) not_found: odp = NULL; end: - up_read(&ctx->umem_rwsem); + up_read(&per_mm->umem_rwsem); return odp; } -static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx, - u64 start, u64 length, +static struct ib_umem_odp *odp_lookup(u64 start, u64 length, struct mlx5_ib_mr *parent) { + struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(parent); struct ib_umem_odp *odp; struct rb_node *rb; - down_read(&ctx->umem_rwsem); - odp = rbt_ib_umem_lookup(&ctx->umem_tree, start, length); + down_read(&per_mm->umem_rwsem); + odp = rbt_ib_umem_lookup(&per_mm->umem_tree, start, length); if (!odp) goto end; @@ -108,7 +116,7 @@ static struct ib_umem_odp *odp_lookup(struct ib_ucontext *ctx, not_found: odp = NULL; end: - up_read(&ctx->umem_rwsem); + up_read(&per_mm->umem_rwsem); return odp; } @@ -116,7 +124,6 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, size_t nentries, struct mlx5_ib_mr *mr, int flags) { struct ib_pd *pd = mr->ibmr.pd; - struct ib_ucontext *ctx = pd->uobject->context; struct mlx5_ib_dev *dev = to_mdev(pd->device); struct ib_umem_odp *odp; unsigned long va; @@ -131,8 +138,8 @@ void mlx5_odp_populate_klm(struct mlx5_klm *pklm, size_t offset, return; } - odp = odp_lookup(ctx, offset * MLX5_IMR_MTT_SIZE, - nentries * MLX5_IMR_MTT_SIZE, mr); + odp = odp_lookup(offset * MLX5_IMR_MTT_SIZE, + nentries * MLX5_IMR_MTT_SIZE, mr); for (i = 0; i < nentries; i++, pklm++) { 
pklm->bcount = cpu_to_be32(MLX5_IMR_MTT_SIZE); @@ -368,7 +375,6 @@ fail: static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, u64 io_virt, size_t bcnt) { - struct ib_ucontext *ctx = mr->ibmr.pd->uobject->context; struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.pd->device); struct ib_umem_odp *odp, *result = NULL; struct ib_umem_odp *odp_mr = to_ib_umem_odp(mr->umem); @@ -377,7 +383,7 @@ static struct ib_umem_odp *implicit_mr_get_data(struct mlx5_ib_mr *mr, struct mlx5_ib_mr *mtt; mutex_lock(&odp_mr->umem_mutex); - odp = odp_lookup(ctx, addr, 1, mr); + odp = odp_lookup(addr, 1, mr); mlx5_ib_dbg(dev, "io_virt:%llx bcnt:%zx addr:%llx odp:%p\n", io_virt, bcnt, addr, odp); @@ -387,7 +393,8 @@ next_mr: if (nentries) nentries++; } else { - odp = ib_alloc_odp_umem(ctx, addr, MLX5_IMR_MTT_SIZE); + odp = ib_alloc_odp_umem(odp_mr->umem.context, addr, + MLX5_IMR_MTT_SIZE); if (IS_ERR(odp)) { mutex_unlock(&odp_mr->umem_mutex); return ERR_CAST(odp); @@ -486,12 +493,12 @@ static int mr_leaf_free(struct ib_umem_odp *umem_odp, u64 start, u64 end, void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) { - struct ib_ucontext *ctx = imr->ibmr.pd->uobject->context; + struct ib_ucontext_per_mm *per_mm = mr_to_per_mm(imr); - down_read(&ctx->umem_rwsem); - rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX, + down_read(&per_mm->umem_rwsem); + rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX, mr_leaf_free, true, imr); - up_read(&ctx->umem_rwsem); + up_read(&per_mm->umem_rwsem); wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); } diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 4519ea663df5..394ea6b68db7 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -44,6 +44,8 @@ struct umem_odp_node { struct ib_umem_odp { struct ib_umem umem; + struct ib_ucontext_per_mm *per_mm; + /* * An array of the pages included in the on-demand paging umem. * Indices of pages that are currently not mapped into the device will diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index d611ce9df7fb..2cf2cee5a753 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1488,6 +1488,25 @@ struct ib_rdmacg_object { #endif }; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +struct ib_ucontext_per_mm { + struct ib_ucontext *context; + + struct rb_root_cached umem_tree; + /* + * Protects .umem_rbroot and tree, as well as odp_mrs_count and + * mmu notifiers registration. + */ + struct rw_semaphore umem_rwsem; + + struct mmu_notifier mn; + atomic_t notifier_count; + /* A list of umems that don't have private mmu notifier counters yet. */ + struct list_head no_private_counters; + unsigned int odp_mrs_count; +}; +#endif + struct ib_ucontext { struct ib_device *device; struct ib_uverbs_file *ufile; @@ -1502,20 +1521,9 @@ struct ib_ucontext { struct pid *tgid; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - struct rb_root_cached umem_tree; - /* - * Protects .umem_rbroot and tree, as well as odp_mrs_count and - * mmu notifiers registration. - */ - struct rw_semaphore umem_rwsem; void (*invalidate_range)(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end); - - struct mmu_notifier mn; - atomic_t notifier_count; - /* A list of umems that don't have private mmu notifier counters yet. 
*/ - struct list_head no_private_counters; - int odp_mrs_count; + struct ib_ucontext_per_mm per_mm; #endif struct ib_rdmacg_object cg_obj; From f27a0d50a4bc2861b472c2e3740d63a29d1ac460 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:08 +0300 Subject: [PATCH 106/242] RDMA/umem: Use umem->owning_mm inside ODP Since ODP had a single struct mmu_notifier located in the ucontext, it could only handle a single MM at a time, and this prevented it from using the new owning_mm system. With the prior rework it is now simple to let ODP track multiple MMs per ucontext, so finish the job: allocate the per_mm on a mm-by-mm basis and free it when the last umem is dropped from the ucontext. As a side effect the new saner locking removes the lockdep splat about nesting the umem_rwsem between mmu_notifier_unregister and ib_umem_odp_release. It also makes ODP work with multiple processes, across fork, etc. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem_odp.c | 301 ++++++++++++++------------- drivers/infiniband/core/uverbs_cmd.c | 8 +- drivers/infiniband/hw/mlx5/main.c | 7 + drivers/infiniband/hw/mlx5/odp.c | 2 +- include/rdma/ib_umem_odp.h | 20 +- include/rdma/ib_verbs.h | 22 +- 6 files changed, 191 insertions(+), 169 deletions(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 6bf3fc0c12a1..0577f9ff600f 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -278,10 +278,135 @@ static const struct mmu_notifier_ops ib_umem_notifiers = { .invalidate_range_end = ib_umem_notifier_invalidate_range_end, }; -struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, - unsigned long addr, size_t size) +static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) +{ + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; + struct ib_umem *umem = &umem_odp->umem; + + down_write(&per_mm->umem_rwsem); + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + rbt_ib_umem_insert(&umem_odp->interval_tree, + &per_mm->umem_tree); + + if (likely(!atomic_read(&per_mm->notifier_count))) + umem_odp->mn_counters_active = true; + else + list_add(&umem_odp->no_private_counters, + &per_mm->no_private_counters); + up_write(&per_mm->umem_rwsem); +} + +static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) +{ + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; + struct ib_umem *umem = &umem_odp->umem; + + down_write(&per_mm->umem_rwsem); + if (likely(ib_umem_start(umem) != ib_umem_end(umem))) + rbt_ib_umem_remove(&umem_odp->interval_tree, + &per_mm->umem_tree); + if (!umem_odp->mn_counters_active) { + list_del(&umem_odp->no_private_counters); + complete_all(&umem_odp->notifier_completion); + } + + up_write(&per_mm->umem_rwsem); +} + +static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx, + struct mm_struct *mm) { struct ib_ucontext_per_mm *per_mm; + int ret; + + per_mm = kzalloc(sizeof(*per_mm), GFP_KERNEL); + if (!per_mm) + return ERR_PTR(-ENOMEM); + + per_mm->context = ctx; + per_mm->mm = mm; + per_mm->umem_tree = RB_ROOT_CACHED; + init_rwsem(&per_mm->umem_rwsem); + INIT_LIST_HEAD(&per_mm->no_private_counters); + + rcu_read_lock(); + per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); + rcu_read_unlock(); + + WARN_ON(mm != current->mm); + + per_mm->mn.ops = &ib_umem_notifiers; + ret = mmu_notifier_register(&per_mm->mn, per_mm->mm); + if (ret) { + dev_err(&ctx->device->dev, + "Failed to register
mmu_notifier %d\n", ret); + goto out_pid; + } + + list_add(&per_mm->ucontext_list, &ctx->per_mm_list); + return per_mm; + +out_pid: + put_pid(per_mm->tgid); + kfree(per_mm); + return ERR_PTR(ret); +} + +static int get_per_mm(struct ib_umem_odp *umem_odp) +{ + struct ib_ucontext *ctx = umem_odp->umem.context; + struct ib_ucontext_per_mm *per_mm; + + /* + * Generally speaking we expect only one or two per_mm in this list, + * so no reason to optimize this search today. + */ + mutex_lock(&ctx->per_mm_list_lock); + list_for_each_entry(per_mm, &ctx->per_mm_list, ucontext_list) { + if (per_mm->mm == umem_odp->umem.owning_mm) + goto found; + } + + per_mm = alloc_per_mm(ctx, umem_odp->umem.owning_mm); + if (IS_ERR(per_mm)) { + mutex_unlock(&ctx->per_mm_list_lock); + return PTR_ERR(per_mm); + } + +found: + umem_odp->per_mm = per_mm; + per_mm->odp_mrs_count++; + mutex_unlock(&ctx->per_mm_list_lock); + + return 0; +} + +void put_per_mm(struct ib_umem_odp *umem_odp) +{ + struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; + struct ib_ucontext *ctx = umem_odp->umem.context; + bool need_free; + + mutex_lock(&ctx->per_mm_list_lock); + umem_odp->per_mm = NULL; + per_mm->odp_mrs_count--; + need_free = per_mm->odp_mrs_count == 0; + if (need_free) + list_del(&per_mm->ucontext_list); + mutex_unlock(&ctx->per_mm_list_lock); + + if (!need_free) + return; + + mmu_notifier_unregister(&per_mm->mn, per_mm->mm); + put_pid(per_mm->tgid); + kfree(per_mm); +} + +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, + unsigned long addr, size_t size) +{ + struct ib_ucontext *ctx = per_mm->context; struct ib_umem_odp *odp_data; struct ib_umem *umem; int pages = size >> PAGE_SHIFT; @@ -291,13 +416,13 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, if (!odp_data) return ERR_PTR(-ENOMEM); umem = &odp_data->umem; - umem->context = context; + umem->context = ctx; umem->length = size; umem->address = addr; umem->page_shift = PAGE_SHIFT; umem->writable = 1; umem->is_odp = 1; - odp_data->per_mm = per_mm = &context->per_mm; + odp_data->per_mm = per_mm; mutex_init(&odp_data->umem_mutex); init_completion(&odp_data->notifier_completion); @@ -316,15 +441,14 @@ struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, goto out_page_list; } - down_write(&per_mm->umem_rwsem); + /* + * Caller must ensure that the umem_odp that the per_mm came from + * cannot be freed during the call to ib_alloc_odp_umem. 
+ */ + mutex_lock(&ctx->per_mm_list_lock); per_mm->odp_mrs_count++; - rbt_ib_umem_insert(&odp_data->interval_tree, &per_mm->umem_tree); - if (likely(!atomic_read(&per_mm->notifier_count))) - odp_data->mn_counters_active = true; - else - list_add(&odp_data->no_private_counters, - &per_mm->no_private_counters); - up_write(&per_mm->umem_rwsem); + mutex_unlock(&ctx->per_mm_list_lock); + add_umem_to_per_mm(odp_data); return odp_data; @@ -338,15 +462,13 @@ EXPORT_SYMBOL(ib_alloc_odp_umem); int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) { - struct ib_ucontext *context = umem_odp->umem.context; struct ib_umem *umem = &umem_odp->umem; - struct ib_ucontext_per_mm *per_mm; + /* + * NOTE: This must called in a process context where umem->owning_mm + * == current->mm + */ + struct mm_struct *mm = umem->owning_mm; int ret_val; - struct pid *our_pid; - struct mm_struct *mm = get_task_mm(current); - - if (!mm) - return -EINVAL; if (access & IB_ACCESS_HUGETLB) { struct vm_area_struct *vma; @@ -366,16 +488,6 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) umem->hugetlb = 0; } - /* Prevent creating ODP MRs in child processes */ - rcu_read_lock(); - our_pid = get_task_pid(current->group_leader, PIDTYPE_PID); - rcu_read_unlock(); - put_pid(our_pid); - if (context->tgid != our_pid) { - ret_val = -EINVAL; - goto out_mm; - } - mutex_init(&umem_odp->umem_mutex); init_completion(&umem_odp->notifier_completion); @@ -384,10 +496,8 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) umem_odp->page_list = vzalloc(array_size(sizeof(*umem_odp->page_list), ib_umem_num_pages(umem))); - if (!umem_odp->page_list) { - ret_val = -ENOMEM; - goto out_mm; - } + if (!umem_odp->page_list) + return -ENOMEM; umem_odp->dma_list = vzalloc(array_size(sizeof(*umem_odp->dma_list), @@ -398,67 +508,23 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access) } } - /* - * When using MMU notifiers, we will get a - * notification before the "current" task (and MM) is - * destroyed. We use the umem_rwsem semaphore to synchronize. - */ - umem_odp->per_mm = per_mm = &context->per_mm; + ret_val = get_per_mm(umem_odp); + if (ret_val) + goto out_dma_list; + add_umem_to_per_mm(umem_odp); - down_write(&per_mm->umem_rwsem); - per_mm->odp_mrs_count++; - if (likely(ib_umem_start(umem) != ib_umem_end(umem))) - rbt_ib_umem_insert(&umem_odp->interval_tree, - &per_mm->umem_tree); - if (likely(!atomic_read(&per_mm->notifier_count)) || - per_mm->odp_mrs_count == 1) - umem_odp->mn_counters_active = true; - else - list_add(&umem_odp->no_private_counters, - &per_mm->no_private_counters); - downgrade_write(&per_mm->umem_rwsem); - - if (per_mm->odp_mrs_count == 1) { - /* - * Note that at this point, no MMU notifier is running - * for this per_mm! - */ - atomic_set(&per_mm->notifier_count, 0); - INIT_HLIST_NODE(&per_mm->mn.hlist); - per_mm->mn.ops = &ib_umem_notifiers; - ret_val = mmu_notifier_register(&per_mm->mn, mm); - if (ret_val) { - pr_err("Failed to register mmu_notifier %d\n", ret_val); - ret_val = -EBUSY; - goto out_mutex; - } - } - - up_read(&per_mm->umem_rwsem); - - /* - * Note that doing an mmput can cause a notifier for the relevant mm. - * If the notifier is called while we hold the umem_rwsem, this will - * cause a deadlock. Therefore, we release the reference only after we - * released the semaphore. 
- */ - mmput(mm); return 0; -out_mutex: - up_read(&per_mm->umem_rwsem); +out_dma_list: vfree(umem_odp->dma_list); out_page_list: vfree(umem_odp->page_list); -out_mm: - mmput(mm); return ret_val; } void ib_umem_odp_release(struct ib_umem_odp *umem_odp) { struct ib_umem *umem = &umem_odp->umem; - struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; /* * Ensure that no more pages are mapped in the umem. @@ -469,54 +535,8 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem), ib_umem_end(umem)); - down_write(&per_mm->umem_rwsem); - if (likely(ib_umem_start(umem) != ib_umem_end(umem))) - rbt_ib_umem_remove(&umem_odp->interval_tree, - &per_mm->umem_tree); - per_mm->odp_mrs_count--; - if (!umem_odp->mn_counters_active) { - list_del(&umem_odp->no_private_counters); - complete_all(&umem_odp->notifier_completion); - } - - /* - * Downgrade the lock to a read lock. This ensures that the notifiers - * (who lock the mutex for reading) will be able to finish, and we - * will be able to enventually obtain the mmu notifiers SRCU. Note - * that since we are doing it atomically, no other user could register - * and unregister while we do the check. - */ - downgrade_write(&per_mm->umem_rwsem); - if (!per_mm->odp_mrs_count) { - struct task_struct *owning_process = NULL; - struct mm_struct *owning_mm = NULL; - - owning_process = - get_pid_task(umem_odp->umem.context->tgid, PIDTYPE_PID); - if (owning_process == NULL) - /* - * The process is already dead, notifier were removed - * already. - */ - goto out; - - owning_mm = get_task_mm(owning_process); - if (owning_mm == NULL) - /* - * The process' mm is already dead, notifier were - * removed already. - */ - goto out_put_task; - mmu_notifier_unregister(&per_mm->mn, owning_mm); - - mmput(owning_mm); - -out_put_task: - put_task_struct(owning_process); - } -out: - up_read(&per_mm->umem_rwsem); - + remove_umem_from_per_mm(umem_odp); + put_per_mm(umem_odp); vfree(umem_odp->dma_list); vfree(umem_odp->page_list); } @@ -634,7 +654,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, { struct ib_umem *umem = &umem_odp->umem; struct task_struct *owning_process = NULL; - struct mm_struct *owning_mm = NULL; + struct mm_struct *owning_mm = umem_odp->umem.owning_mm; struct page **local_page_list = NULL; u64 page_mask, off; int j, k, ret = 0, start_idx, npages = 0, page_shift; @@ -658,15 +678,14 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, user_virt = user_virt & page_mask; bcnt += off; /* Charge for the first page offset as well. */ - owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID); - if (owning_process == NULL) { + /* + * owning_process is allowed to be NULL, this means somehow the mm is + * existing beyond the lifetime of the originating process.. Presumably + * mmget_not_zero will fail in this case. 
+ */ + owning_process = get_pid_task(umem_odp->per_mm->tgid, PIDTYPE_PID); + if (WARN_ON(!mmget_not_zero(umem_odp->umem.owning_mm))) { ret = -EINVAL; - goto out_no_task; - } - - owning_mm = get_task_mm(owning_process); - if (owning_mm == NULL) { - ret = -ENOENT; goto out_put_task; } @@ -738,8 +757,8 @@ int ib_umem_odp_map_dma_pages(struct ib_umem_odp *umem_odp, u64 user_virt, mmput(owning_mm); out_put_task: - put_task_struct(owning_process); -out_no_task: + if (owning_process) + put_task_struct(owning_process); free_page((unsigned long)local_page_list); return ret; } diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index ce678e1008a4..d77b0b9793c7 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -124,12 +124,8 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, ucontext->cleanup_retryable = false; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - ucontext->per_mm.umem_tree = RB_ROOT_CACHED; - init_rwsem(&ucontext->per_mm.umem_rwsem); - ucontext->per_mm.odp_mrs_count = 0; - INIT_LIST_HEAD(&ucontext->per_mm.no_private_counters); - ucontext->per_mm.context = ucontext; - + mutex_init(&ucontext->per_mm_list_lock); + INIT_LIST_HEAD(&ucontext->per_mm_list); if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) ucontext->invalidate_range = NULL; diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index aeb328100986..1348a08261a9 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1861,6 +1861,13 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) struct mlx5_ib_dev *dev = to_mdev(ibcontext->device); struct mlx5_bfreg_info *bfregi; +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* All umem's must be destroyed before destroying the ucontext. */ + mutex_lock(&ibcontext->per_mm_list_lock); + WARN_ON(!list_empty(&ibcontext->per_mm_list)); + mutex_unlock(&ibcontext->per_mm_list_lock); +#endif + if (context->devx_uid) mlx5_ib_devx_destroy(dev, context); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 9982b5f4e598..b04eb6775326 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -393,7 +393,7 @@ next_mr: if (nentries) nentries++; } else { - odp = ib_alloc_odp_umem(odp_mr->umem.context, addr, + odp = ib_alloc_odp_umem(odp_mr->per_mm, addr, MLX5_IMR_MTT_SIZE); if (IS_ERR(odp)) { mutex_unlock(&odp_mr->umem_mutex); diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 394ea6b68db7..259eb08dfc9e 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -91,8 +91,26 @@ static inline struct ib_umem_odp *to_ib_umem_odp(struct ib_umem *umem) #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING +struct ib_ucontext_per_mm { + struct ib_ucontext *context; + struct mm_struct *mm; + struct pid *tgid; + + struct rb_root_cached umem_tree; + /* Protects umem_tree */ + struct rw_semaphore umem_rwsem; + atomic_t notifier_count; + + struct mmu_notifier mn; + /* A list of umems that don't have private mmu notifier counters yet. 
*/ + struct list_head no_private_counters; + unsigned int odp_mrs_count; + + struct list_head ucontext_list; +}; + int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access); -struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext *context, +struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, unsigned long addr, size_t size); void ib_umem_odp_release(struct ib_umem_odp *umem_odp); diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 2cf2cee5a753..6437e6af758d 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1488,25 +1488,6 @@ struct ib_rdmacg_object { #endif }; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING -struct ib_ucontext_per_mm { - struct ib_ucontext *context; - - struct rb_root_cached umem_tree; - /* - * Protects .umem_rbroot and tree, as well as odp_mrs_count and - * mmu notifiers registration. - */ - struct rw_semaphore umem_rwsem; - - struct mmu_notifier mn; - atomic_t notifier_count; - /* A list of umems that don't have private mmu notifier counters yet. */ - struct list_head no_private_counters; - unsigned int odp_mrs_count; -}; -#endif - struct ib_ucontext { struct ib_device *device; struct ib_uverbs_file *ufile; @@ -1523,7 +1504,8 @@ struct ib_ucontext { #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING void (*invalidate_range)(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end); - struct ib_ucontext_per_mm per_mm; + struct mutex per_mm_list_lock; + struct list_head per_mm_list; #endif struct ib_rdmacg_object cg_obj; From ca748c39ea3f3c755295d64d69ba0b4375e34b5d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:09 +0300 Subject: [PATCH 107/242] RDMA/umem: Get rid of per_mm->notifier_count This is intrinsically racy and the scheme is simply unnecessary. New MR registration can wait for any ongoing invalidation to fully complete.

	CPU0                            CPU1
	                          if (atomic_read())
	if (atomic_dec_and_test() &&
	    !list_empty())
	   { /* not taken */ }
	                          list_add()

This puts the new UMEM into a kind of purgatory until another invalidate rolls through. Instead hold the read side of the umem_rwsem across the paired start/end and get rid of the racy 'deferred add' approach. Since all umems in the rbt are always ready to go, also get rid of the mn_counters_active stuff. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem_odp.c | 113 +++++------------------------ include/rdma/ib_umem_odp.h | 15 ---- 2 files changed, 18 insertions(+), 110 deletions(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 0577f9ff600f..1c0c4a431218 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -80,83 +80,29 @@ INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last, static void ib_umem_notifier_start_account(struct ib_umem_odp *umem_odp) { mutex_lock(&umem_odp->umem_mutex); - - /* Only update private counters for this umem if it has them. - * Otherwise skip it. All page faults will be delayed for this umem. */ - if (umem_odp->mn_counters_active) { - int notifiers_count = umem_odp->notifiers_count++; - - if (notifiers_count == 0) - /* Initialize the completion object for waiting on - * notifiers. Since notifier_count is zero, no one - * should be waiting right now. */ - reinit_completion(&umem_odp->notifier_completion); - } + if (umem_odp->notifiers_count++ == 0) + /* + * Initialize the completion object for waiting on + * notifiers.
Since notifier_count is zero, no one should be + * waiting right now. + */ + reinit_completion(&umem_odp->notifier_completion); mutex_unlock(&umem_odp->umem_mutex); } static void ib_umem_notifier_end_account(struct ib_umem_odp *umem_odp) { mutex_lock(&umem_odp->umem_mutex); - - /* Only update private counters for this umem if it has them. - * Otherwise skip it. All page faults will be delayed for this umem. */ - if (umem_odp->mn_counters_active) { - /* - * This sequence increase will notify the QP page fault that - * the page that is going to be mapped in the spte could have - * been freed. - */ - ++umem_odp->notifiers_seq; - if (--umem_odp->notifiers_count == 0) - complete_all(&umem_odp->notifier_completion); - } + /* + * This sequence increase will notify the QP page fault that the page + * that is going to be mapped in the spte could have been freed. + */ + ++umem_odp->notifiers_seq; + if (--umem_odp->notifiers_count == 0) + complete_all(&umem_odp->notifier_completion); mutex_unlock(&umem_odp->umem_mutex); } -/* Account for a new mmu notifier in an ib_ucontext. */ -static void -ib_ucontext_notifier_start_account(struct ib_ucontext_per_mm *per_mm) -{ - atomic_inc(&per_mm->notifier_count); -} - -/* Account for a terminating mmu notifier in an ib_ucontext. - * - * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since - * the function takes the semaphore itself. */ -static void ib_ucontext_notifier_end_account(struct ib_ucontext_per_mm *per_mm) -{ - int zero_notifiers = atomic_dec_and_test(&per_mm->notifier_count); - - if (zero_notifiers && - !list_empty(&per_mm->no_private_counters)) { - /* No currently running mmu notifiers. Now is the chance to - * add private accounting to all previously added umems. */ - struct ib_umem_odp *odp_data, *next; - - /* Prevent concurrent mmu notifiers from working on the - * no_private_counters list. */ - down_write(&per_mm->umem_rwsem); - - /* Read the notifier_count again, with the umem_rwsem - * semaphore taken for write. 
*/ - if (!atomic_read(&per_mm->notifier_count)) { - list_for_each_entry_safe(odp_data, next, - &per_mm->no_private_counters, - no_private_counters) { - mutex_lock(&odp_data->umem_mutex); - odp_data->mn_counters_active = true; - list_del(&odp_data->no_private_counters); - complete_all(&odp_data->notifier_completion); - mutex_unlock(&odp_data->umem_mutex); - } - } - - up_write(&per_mm->umem_rwsem); - } -} - static int ib_umem_notifier_release_trampoline(struct ib_umem_odp *umem_odp, u64 start, u64 end, void *cookie) { @@ -186,7 +132,6 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, if (!per_mm->context->invalidate_range) return; - ib_ucontext_notifier_start_account(per_mm); down_read(&per_mm->umem_rwsem); rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0, ULLONG_MAX, @@ -231,14 +176,9 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, else if (!down_read_trylock(&per_mm->umem_rwsem)) return -EAGAIN; - ib_ucontext_notifier_start_account(per_mm); - ret = rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, - end, - invalidate_range_start_trampoline, - blockable, NULL); - up_read(&per_mm->umem_rwsem); - - return ret; + return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, + invalidate_range_start_trampoline, + blockable, NULL); } static int invalidate_range_end_trampoline(struct ib_umem_odp *item, u64 start, @@ -259,17 +199,10 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, if (!per_mm->context->invalidate_range) return; - /* - * TODO: we currently bail out if there is any sleepable work to be done - * in ib_umem_notifier_invalidate_range_start so we shouldn't really block - * here. But this is ugly and fragile. - */ - down_read(&per_mm->umem_rwsem); rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end, invalidate_range_end_trampoline, true, NULL); up_read(&per_mm->umem_rwsem); - ib_ucontext_notifier_end_account(per_mm); } static const struct mmu_notifier_ops ib_umem_notifiers = { @@ -287,12 +220,6 @@ static void add_umem_to_per_mm(struct ib_umem_odp *umem_odp) if (likely(ib_umem_start(umem) != ib_umem_end(umem))) rbt_ib_umem_insert(&umem_odp->interval_tree, &per_mm->umem_tree); - - if (likely(!atomic_read(&per_mm->notifier_count))) - umem_odp->mn_counters_active = true; - else - list_add(&umem_odp->no_private_counters, - &per_mm->no_private_counters); up_write(&per_mm->umem_rwsem); } @@ -305,10 +232,7 @@ static void remove_umem_from_per_mm(struct ib_umem_odp *umem_odp) if (likely(ib_umem_start(umem) != ib_umem_end(umem))) rbt_ib_umem_remove(&umem_odp->interval_tree, &per_mm->umem_tree); - if (!umem_odp->mn_counters_active) { - list_del(&umem_odp->no_private_counters); - complete_all(&umem_odp->notifier_completion); - } + complete_all(&umem_odp->notifier_completion); up_write(&per_mm->umem_rwsem); } @@ -327,7 +251,6 @@ static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx, per_mm->mm = mm; per_mm->umem_tree = RB_ROOT_CACHED; init_rwsem(&per_mm->umem_rwsem); - INIT_LIST_HEAD(&per_mm->no_private_counters); rcu_read_lock(); per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 259eb08dfc9e..ce9502545903 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -67,15 +67,9 @@ struct ib_umem_odp { struct mutex umem_mutex; void *private; /* for the HW driver to use. */ - /* When false, use the notifier counter in the ucontext struct. 
-	 */
-	bool mn_counters_active;
 	int notifiers_seq;
 	int notifiers_count;
 
-	/* A linked list of umems that don't have private mmu notifier
-	 * counters yet. */
-	struct list_head no_private_counters;
-
 	/* Tree tracking */
 	struct umem_odp_node	interval_tree;
 
@@ -99,11 +93,8 @@ struct ib_ucontext_per_mm {
 	struct rb_root_cached umem_tree;
 	/* Protects umem_tree */
 	struct rw_semaphore umem_rwsem;
-	atomic_t notifier_count;
 
 	struct mmu_notifier mn;
-	/* A list of umems that don't have private mmu notifier counters yet. */
-	struct list_head no_private_counters;
 	unsigned int odp_mrs_count;
 
 	struct list_head ucontext_list;
@@ -162,12 +153,6 @@ static inline int ib_umem_mmu_notifier_retry(struct ib_umem_odp *umem_odp,
 	 * and the ucontext umem_mutex semaphore locked for read).
 	 */
 
-	/* Do not allow page faults while the new ib_umem hasn't seen a state
-	 * with zero notifiers yet, and doesn't have its own valid set of
-	 * private counters. */
-	if (!umem_odp->mn_counters_active)
-		return 1;
-
 	if (unlikely(umem_odp->notifiers_count))
 		return 1;
 	if (umem_odp->notifiers_seq != mmu_seq)

From be7a57b41ad824dbc59d1ffa91160ee73f2999ee Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Sun, 16 Sep 2018 20:48:10 +0300
Subject: [PATCH 108/242] RDMA/umem: Handle a half-complete start/end sequence

mmu_notifier_unregister() can race with an invalidate_start/end pair and
cause the invalidate_end to be skipped. This causes an imbalance in the
locking, which lockdep complains about.

This is not actually a bug, as we immediately kfree the memory holding
the lock, but it is simple enough to fix.

Mark when the notifier is being destroyed and abort the start callback.
This can be done under the lock we already obtained, and re-purposes the
invalidate_range test we already have.

Signed-off-by: Jason Gunthorpe
Signed-off-by: Leon Romanovsky
Signed-off-by: Doug Ledford
---
 drivers/infiniband/core/umem_odp.c | 39 ++++++++++++++++++++----------
 include/rdma/ib_umem_odp.h         |  1 +
 2 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
index 1c0c4a431218..d7b6422b9611 100644
--- a/drivers/infiniband/core/umem_odp.c
+++ b/drivers/infiniband/core/umem_odp.c
@@ -129,15 +129,11 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn,
 	struct ib_ucontext_per_mm *per_mm =
 		container_of(mn, struct ib_ucontext_per_mm, mn);
 
-	if (!per_mm->context->invalidate_range)
-		return;
-
 	down_read(&per_mm->umem_rwsem);
-	rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, 0,
-				      ULLONG_MAX,
-				      ib_umem_notifier_release_trampoline,
-				      true,
-				      NULL);
+	if (per_mm->active)
+		rbt_ib_umem_for_each_in_range(
+			&per_mm->umem_tree, 0, ULLONG_MAX,
+			ib_umem_notifier_release_trampoline, true, NULL);
 	up_read(&per_mm->umem_rwsem);
 }
 
@@ -166,16 +162,22 @@ static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
 {
 	struct ib_ucontext_per_mm *per_mm =
 		container_of(mn, struct ib_ucontext_per_mm, mn);
-	int ret;
-
-	if (!per_mm->context->invalidate_range)
-		return 0;
 
 	if (blockable)
 		down_read(&per_mm->umem_rwsem);
 	else if (!down_read_trylock(&per_mm->umem_rwsem))
 		return -EAGAIN;
 
+	if (!per_mm->active) {
+		up_read(&per_mm->umem_rwsem);
+		/*
+		 * At this point active is permanently set and visible to this
+		 * CPU without a lock, that fact is relied on to skip the unlock
+		 * in range_end.
+		 */
+		return 0;
+	}
+
 	return rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start, end,
 					     invalidate_range_start_trampoline,
 					     blockable, NULL);
 }
@@ -196,7 +198,7 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
 	struct ib_ucontext_per_mm *per_mm =
 		container_of(mn, struct ib_ucontext_per_mm, mn);
 
-	if (!per_mm->context->invalidate_range)
+	if (unlikely(!per_mm->active))
 		return;
 
 	rbt_ib_umem_for_each_in_range(&per_mm->umem_tree, start,
@@ -251,6 +253,7 @@ static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx,
 	per_mm->mm = mm;
 	per_mm->umem_tree = RB_ROOT_CACHED;
 	init_rwsem(&per_mm->umem_rwsem);
+	per_mm->active = ctx->invalidate_range;
 
 	rcu_read_lock();
 	per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
@@ -321,6 +324,16 @@ void put_per_mm(struct ib_umem_odp *umem_odp)
 	if (!need_free)
 		return;
 
+	/*
+	 * NOTE! mmu_notifier_unregister() can happen between a start/end
+	 * callback, resulting in a start without a matching end, and thus an
+	 * unbalanced lock. This doesn't really matter to us since we are about
+	 * to kfree the memory that holds the lock, however LOCKDEP doesn't
+	 * like this.
+	 */
+	down_write(&per_mm->umem_rwsem);
+	per_mm->active = false;
+	up_write(&per_mm->umem_rwsem);
+
 	mmu_notifier_unregister(&per_mm->mn, per_mm->mm);
 	put_pid(per_mm->tgid);
 	kfree(per_mm);
diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h
index ce9502545903..ec05c82ead7a 100644
--- a/include/rdma/ib_umem_odp.h
+++ b/include/rdma/ib_umem_odp.h
@@ -89,6 +89,7 @@ struct ib_ucontext_per_mm {
 	struct ib_ucontext *context;
 	struct mm_struct *mm;
 	struct pid *tgid;
+	bool active;
 
 	struct rb_root_cached umem_tree;
 	/* Protects umem_tree */

From 56ac9dd9177ce451ac8176311915b29e8b5f0ac2 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Sun, 16 Sep 2018 20:48:11 +0300
Subject: [PATCH 109/242] RDMA/umem: Avoid synchronize_srcu in the ODP MR
 destruction path

synchronize_srcu is slow enough that it should be avoided on the syscall
path when user space is destroying MRs. After all the rework we can now
trivially do this by having call_srcu kfree the per_mm.
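For illustration, a minimal sketch of the deferred-free pattern this
switches to (obj and srcu are placeholder names, not the driver's):

	struct obj {
		struct rcu_head rcu;
		/* ... payload ... */
	};

	static void free_obj(struct rcu_head *rcu)
	{
		kfree(container_of(rcu, struct obj, rcu));
	}

	/* Destroy path: instead of
	 *	synchronize_srcu(&srcu);   // blocks for a grace period
	 *	kfree(obj);
	 * queue the free and return without waiting:
	 */
	call_srcu(&srcu, &obj->rcu, free_obj);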
Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/umem_odp.c | 10 ++++++++-- include/rdma/ib_umem_odp.h | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index d7b6422b9611..2b4c5e7dd5a1 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -307,6 +307,11 @@ found: return 0; } +static void free_per_mm(struct rcu_head *rcu) +{ + kfree(container_of(rcu, struct ib_ucontext_per_mm, rcu)); +} + void put_per_mm(struct ib_umem_odp *umem_odp) { struct ib_ucontext_per_mm *per_mm = umem_odp->per_mm; @@ -334,9 +339,10 @@ void put_per_mm(struct ib_umem_odp *umem_odp) per_mm->active = false; up_write(&per_mm->umem_rwsem); - mmu_notifier_unregister(&per_mm->mn, per_mm->mm); + WARN_ON(!RB_EMPTY_ROOT(&per_mm->umem_tree.rb_root)); + mmu_notifier_unregister_no_release(&per_mm->mn, per_mm->mm); put_pid(per_mm->tgid); - kfree(per_mm); + mmu_notifier_call_srcu(&per_mm->rcu, free_per_mm); } struct ib_umem_odp *ib_alloc_odp_umem(struct ib_ucontext_per_mm *per_mm, diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index ec05c82ead7a..0b1446fe2fab 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -99,6 +99,7 @@ struct ib_ucontext_per_mm { unsigned int odp_mrs_count; struct list_head ucontext_list; + struct rcu_head rcu; }; int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access); From 2a3ccfdbeb6a5f832d7203e230799f1ffa46e0fc Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Sun, 16 Sep 2018 20:48:12 +0300 Subject: [PATCH 110/242] RDMA/uverbs: Get rid of ucontext->tgid Nothing uses this now, just delete it. Signed-off-by: Jason Gunthorpe Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/rdma_core.c | 1 - drivers/infiniband/core/uverbs_cmd.c | 4 ---- include/rdma/ib_verbs.h | 1 - 3 files changed, 6 deletions(-) diff --git a/drivers/infiniband/core/rdma_core.c b/drivers/infiniband/core/rdma_core.c index 6a3acf4bf78a..752a55c6bdce 100644 --- a/drivers/infiniband/core/rdma_core.c +++ b/drivers/infiniband/core/rdma_core.c @@ -816,7 +816,6 @@ static void ufile_destroy_ucontext(struct ib_uverbs_file *ufile, ib_dev->disassociate_ucontext(ucontext); } - put_pid(ucontext->tgid); ib_rdmacg_uncharge(&ucontext->cg_obj, ib_dev, RDMACG_RESOURCE_HCA_HANDLE); diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index d77b0b9793c7..91d3e4029cd5 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -117,9 +117,6 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, /* ufile is required when some objects are released */ ucontext->ufile = file; - rcu_read_lock(); - ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID); - rcu_read_unlock(); ucontext->closing = false; ucontext->cleanup_retryable = false; @@ -169,7 +166,6 @@ err_fd: put_unused_fd(resp.async_fd); err_free: - put_pid(ucontext->tgid); ib_dev->dealloc_ucontext(ucontext); err_alloc: diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 6437e6af758d..0d822a9db300 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1500,7 +1500,6 @@ struct ib_ucontext { bool cleanup_retryable; - struct pid *tgid; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING void (*invalidate_range)(struct ib_umem_odp *umem_odp, unsigned long start, unsigned long end); From 
b9f86e6e7b755951af3c57f7864c57eeb98303eb Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 19 Sep 2018 20:29:57 -0700 Subject: [PATCH 111/242] IB/nes: Remove unnecessary parentheses Clang warns when more than one set of parentheses are used in single conditional statements. drivers/infiniband/hw/nes/nes_hw.c:1446:27: warning: equality comparison with extraneous parentheses [-Wparentheses-equality] } while ((temp_phy_data2 == temp_phy_data)); ~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~ drivers/infiniband/hw/nes/nes_hw.c:1446:27: note: remove extraneous parentheses around the comparison to silence this warning } while ((temp_phy_data2 == temp_phy_data)); ~ ^ ~ drivers/infiniband/hw/nes/nes_hw.c:1446:27: note: use '=' to turn this equality comparison into an assignment } while ((temp_phy_data2 == temp_phy_data)); ^~ = Remove the unnecessary parentheses to silence this warning. Reported-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Signed-off-by: Doug Ledford --- drivers/infiniband/hw/nes/nes_hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c index bd0675d8f298..5517e392bc01 100644 --- a/drivers/infiniband/hw/nes/nes_hw.c +++ b/drivers/infiniband/hw/nes/nes_hw.c @@ -1443,7 +1443,7 @@ static int nes_init_2025_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_inde mdelay(1); nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee); temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL); - } while ((temp_phy_data2 == temp_phy_data)); + } while (temp_phy_data2 == temp_phy_data); /* wait for tracking */ counter = 0; From fa8f11586a963ad484d43e0728ef42f378a85768 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 19 Sep 2018 20:32:29 -0700 Subject: [PATCH 112/242] IB/mlx4: Remove unnecessary parentheses Clang warns when more than one set of parentheses are used in single conditional statements. drivers/infiniband/hw/mlx4/mcg.c:676:16: warning: equality comparison with extraneous parentheses [-Wparentheses-equality] if ((method == IB_MGMT_METHOD_GET_RESP)) { ~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~ drivers/infiniband/hw/mlx4/mcg.c:676:16: note: remove extraneous parentheses around the comparison to silence this warning if ((method == IB_MGMT_METHOD_GET_RESP)) { ~ ^ ~ drivers/infiniband/hw/mlx4/mcg.c:676:16: note: use '=' to turn this equality comparison into an assignment if ((method == IB_MGMT_METHOD_GET_RESP)) { ^~ = Remove the unnecessary parentheses to silence this warning. 
Reported-by: Nick Desaulniers
Signed-off-by: Nathan Chancellor
Reviewed-by: Leon Romanovsky
Signed-off-by: Doug Ledford
---
 drivers/infiniband/hw/mlx4/mcg.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c
index 81ffc007e0a1..d844831179cf 100644
--- a/drivers/infiniband/hw/mlx4/mcg.c
+++ b/drivers/infiniband/hw/mlx4/mcg.c
@@ -673,7 +673,7 @@ static void mlx4_ib_mcg_work_handler(struct work_struct *work)
 		if (!list_empty(&group->pending_list))
 			req = list_first_entry(&group->pending_list,
 					       struct mcast_req, group_list);
-		if ((method == IB_MGMT_METHOD_GET_RESP)) {
+		if (method == IB_MGMT_METHOD_GET_RESP) {
 			if (req) {
 				send_reply_to_slave(req->func, group, &req->sa_mad, status);
 				--group->func[req->func].num_pend_reqs;

From 26f91da29650177364564d8183907d0229e9afbc Mon Sep 17 00:00:00 2001
From: zhong jiang
Date: Thu, 20 Sep 2018 17:52:42 +0800
Subject: [PATCH 113/242] RDMA/cxgb4: remove redundant null pointer check
 before kfree_skb

kfree_skb() already takes a NULL pointer into account, hence it is safe
to remove the redundant NULL pointer check before kfree_skb().

Signed-off-by: zhong jiang
Acked-by: Steve Wise
Signed-off-by: Doug Ledford
---
 drivers/infiniband/hw/cxgb4/cm.c | 3 +--
 drivers/infiniband/hw/cxgb4/qp.c | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index 0f83cbec33f3..615413bd3e8d 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -403,8 +403,7 @@ void _c4iw_free_ep(struct kref *kref)
 					   ep->com.local_addr.ss_family);
 		dst_release(ep->dst);
 		cxgb4_l2t_release(ep->l2t);
-		if (ep->mpa_skb)
-			kfree_skb(ep->mpa_skb);
+		kfree_skb(ep->mpa_skb);
 	}
 	if (!skb_queue_empty(&ep->com.ep_skb_list))
 		skb_queue_purge(&ep->com.ep_skb_list);
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index 347fe18b1a41..e78dd9afac86 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -2813,8 +2813,7 @@ err_free_queue:
 	free_srq_queue(srq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx,
 		       srq->wr_waitp);
 err_free_skb:
-	if (srq->destroy_skb)
-		kfree_skb(srq->destroy_skb);
+	kfree_skb(srq->destroy_skb);
 err_free_srq_idx:
 	c4iw_free_srq_idx(&rhp->rdev, srq->idx);
 err_free_wr_wait:

From a560f1d9af4be84ee91d1a47382cacf620eb4a79 Mon Sep 17 00:00:00 2001
From: Mark Bloch
Date: Mon, 17 Sep 2018 13:30:47 +0300
Subject: [PATCH 114/242] RDMA/mlx5: Refactor transport domain bookkeeping
 logic

In preparation to enable loopback on a single user context, move the
logic that enables/disables loopback into separate functions and group
the variables under a single struct.
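The intended shape, condensed (a sketch of the result, mirroring the
diff below rather than adding anything new):

	struct mlx5_ib_lb_state {
		/* protect the user_td */
		struct mutex	mutex;
		u32		user_td;
	};

	/* callers stop open-coding the counter and its locking: */
	err = mlx5_ib_enable_lb(dev);
	/* ... */
	mlx5_ib_disable_lb(dev);

Grouping the state this way lets the following patches grow it (a QP
count and an 'enabled' flag) without adding more loose fields to
struct mlx5_ib_dev.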
Signed-off-by: Mark Bloch Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 45 ++++++++++++++++++---------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 10 +++++-- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index c414f3809e5c..e1b03b6908f0 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1571,6 +1571,32 @@ static void deallocate_uars(struct mlx5_ib_dev *dev, mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]); } +static int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev) +{ + int err = 0; + + mutex_lock(&dev->lb.mutex); + dev->lb.user_td++; + + if (dev->lb.user_td == 2) + err = mlx5_nic_vport_update_local_lb(dev->mdev, true); + + mutex_unlock(&dev->lb.mutex); + + return err; +} + +static void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev) +{ + mutex_lock(&dev->lb.mutex); + dev->lb.user_td--; + + if (dev->lb.user_td < 2) + mlx5_nic_vport_update_local_lb(dev->mdev, false); + + mutex_unlock(&dev->lb.mutex); +} + static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn) { int err; @@ -1587,14 +1613,7 @@ static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn) !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) return err; - mutex_lock(&dev->lb_mutex); - dev->user_td++; - - if (dev->user_td == 2) - err = mlx5_nic_vport_update_local_lb(dev->mdev, true); - - mutex_unlock(&dev->lb_mutex); - return err; + return mlx5_ib_enable_lb(dev); } static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn) @@ -1609,13 +1628,7 @@ static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn) !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) return; - mutex_lock(&dev->lb_mutex); - dev->user_td--; - - if (dev->user_td < 2) - mlx5_nic_vport_update_local_lb(dev->mdev, false); - - mutex_unlock(&dev->lb_mutex); + mlx5_ib_disable_lb(dev); } static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, @@ -5880,7 +5893,7 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev) if ((MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) && (MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) || MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) - mutex_init(&dev->lb_mutex); + mutex_init(&dev->lb.mutex); return 0; } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 320d4dfe8c2f..fde5a867a7d3 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -858,6 +858,12 @@ to_mcounters(struct ib_counters *ibcntrs) return container_of(ibcntrs, struct mlx5_ib_mcounters, ibcntrs); } +struct mlx5_ib_lb_state { + /* protect the user_td */ + struct mutex mutex; + u32 user_td; +}; + struct mlx5_ib_dev { struct ib_device ib_dev; const struct uverbs_object_tree_def *driver_trees[6]; @@ -899,9 +905,7 @@ struct mlx5_ib_dev { const struct mlx5_ib_profile *profile; struct mlx5_eswitch_rep *rep; - /* protect the user_td */ - struct mutex lb_mutex; - u32 user_td; + struct mlx5_ib_lb_state lb; u8 umr_fence; struct list_head ib_dev_list; u64 sys_image_guid; From 175edba85634a8be0ddab5ee96d0b23d9f17627e Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Mon, 17 Sep 2018 13:30:48 +0300 Subject: [PATCH 115/242] RDMA/mlx5: Allow creating RAW ethernet QP with loopback support Expose two new flags: MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC Those flags can be used at creation time 
in order to allow a QP to receive loopback traffic (unicast and
multicast). We store the state in the QP, to be used on the destroy path
to indicate which flags the QP was created with.

Signed-off-by: Mark Bloch
Reviewed-by: Yishai Hadas
Signed-off-by: Leon Romanovsky
Signed-off-by: Doug Ledford
---
 drivers/infiniband/hw/mlx5/mlx5_ib.h |  2 +-
 drivers/infiniband/hw/mlx5/qp.c      | 62 ++++++++++++++++++++++------
 include/uapi/rdma/mlx5-abi.h         |  2 +
 3 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
index fde5a867a7d3..ca435654c30b 100644
--- a/drivers/infiniband/hw/mlx5/mlx5_ib.h
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -428,7 +428,7 @@ struct mlx5_ib_qp {
 	struct list_head	cq_send_list;
 	struct mlx5_rate_limit	rl;
 	u32                     underlay_qpn;
-	bool			tunnel_offload_en;
+	u32			flags_en;
 	/* storage for qp sub type when core qp type is IB_QPT_DRIVER */
 	enum ib_qp_type		qp_sub_type;
 };
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index daf1eb84cd31..f29ae401b232 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -1258,8 +1258,9 @@ static bool tunnel_offload_supported(struct mlx5_core_dev *dev)
 
 static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
 				    struct mlx5_ib_rq *rq, u32 tdn,
-				    bool tunnel_offload_en)
+				    u32 *qp_flags_en)
 {
+	u8 lb_flag = 0;
 	u32 *in;
 	void *tirc;
 	int inlen;
@@ -1274,12 +1275,21 @@ static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT);
 	MLX5_SET(tirc, tirc, inline_rqn, rq->base.mqp.qpn);
 	MLX5_SET(tirc, tirc, transport_domain, tdn);
-	if (tunnel_offload_en)
+	if (*qp_flags_en & MLX5_QP_FLAG_TUNNEL_OFFLOADS)
 		MLX5_SET(tirc, tirc, tunneled_offload_en, 1);
 
-	if (dev->rep)
-		MLX5_SET(tirc, tirc, self_lb_block,
-			 MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST);
+	if (*qp_flags_en & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC)
+		lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST;
+
+	if (*qp_flags_en & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC)
+		lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST;
+
+	if (dev->rep) {
+		lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST;
+		*qp_flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC;
+	}
+
+	MLX5_SET(tirc, tirc, self_lb_block, lb_flag);
 
 	err = mlx5_core_create_tir(dev->mdev, in, inlen, &rq->tirn);
 
@@ -1332,8 +1342,7 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 			goto err_destroy_sq;
 
-		err = create_raw_packet_qp_tir(dev, rq, tdn,
-					       qp->tunnel_offload_en);
+		err = create_raw_packet_qp_tir(dev, rq, tdn, &qp->flags_en);
 		if (err)
 			goto err_destroy_rq;
 	}
@@ -1410,6 +1419,7 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 	u32 tdn = mucontext->tdn;
 	struct mlx5_ib_create_qp_rss ucmd = {};
 	size_t required_cmd_sz;
+	u8 lb_flag = 0;
 
 	if (init_attr->qp_type != IB_QPT_RAW_PACKET)
 		return -EOPNOTSUPP;
@@ -1444,7 +1454,9 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 		return -EOPNOTSUPP;
 	}
 
-	if (ucmd.flags & ~MLX5_QP_FLAG_TUNNEL_OFFLOADS) {
+	if (ucmd.flags & ~(MLX5_QP_FLAG_TUNNEL_OFFLOADS |
+			   MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC |
+			   MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC)) {
 		mlx5_ib_dbg(dev, "invalid flags\n");
 		return -EOPNOTSUPP;
 	}
@@ -1461,6 +1473,16 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 		return -EOPNOTSUPP;
 	}
 
+	if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC || dev->rep) {
+		lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST;
+		qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC;
+	}
+
+	if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC) {
+		lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST;
+		qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC;
+	}
+
 	err = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp)));
 	if (err) {
 		mlx5_ib_dbg(dev, "copy failed\n");
@@ -1484,6 +1506,8 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 	if (ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS)
 		MLX5_SET(tirc, tirc, tunneled_offload_en, 1);
 
+	MLX5_SET(tirc, tirc, self_lb_block, lb_flag);
+
 	if (ucmd.rx_hash_fields_mask & MLX5_RX_HASH_INNER)
 		hfso = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_inner);
 	else
@@ -1580,10 +1604,6 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 	MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields);
 
 create_tir:
-	if (dev->rep)
-		MLX5_SET(tirc, tirc, self_lb_block,
-			 MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST);
-
 	err = mlx5_core_create_tir(dev->mdev, in, inlen, &qp->rss_qp.tirn);
 
 	if (err)
@@ -1710,7 +1730,23 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 			mlx5_ib_dbg(dev, "Tunnel offload isn't supported\n");
 			return -EOPNOTSUPP;
 		}
-		qp->tunnel_offload_en = true;
+		qp->flags_en |= MLX5_QP_FLAG_TUNNEL_OFFLOADS;
+	}
+
+	if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC) {
+		if (init_attr->qp_type != IB_QPT_RAW_PACKET) {
+			mlx5_ib_dbg(dev, "Self-LB UC isn't supported\n");
+			return -EOPNOTSUPP;
+		}
+		qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC;
+	}
+
+	if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC) {
+		if (init_attr->qp_type != IB_QPT_RAW_PACKET) {
+			mlx5_ib_dbg(dev, "Self-LB UM isn't supported\n");
+			return -EOPNOTSUPP;
+		}
+		qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC;
 	}
 
 	if (init_attr->create_flags & IB_QP_CREATE_SOURCE_QPN) {
diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
index addbb9c4529e..e584ba40208e 100644
--- a/include/uapi/rdma/mlx5-abi.h
+++ b/include/uapi/rdma/mlx5-abi.h
@@ -45,6 +45,8 @@ enum {
 	MLX5_QP_FLAG_BFREG_INDEX	= 1 << 3,
 	MLX5_QP_FLAG_TYPE_DCT		= 1 << 4,
 	MLX5_QP_FLAG_TYPE_DCI		= 1 << 5,
+	MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC	= 1 << 6,
+	MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC	= 1 << 7,
 };
 
 enum {

From 0042f9e458a560e13c1da2211cf6429e0c7dd812 Mon Sep 17 00:00:00 2001
From: Mark Bloch
Date: Mon, 17 Sep 2018 13:30:49 +0300
Subject: [PATCH 116/242] RDMA/mlx5: Enable vport loopback when user context or
 QP mandate

A user can create a QP which can accept loopback traffic, but that's not
enough. We need to enable loopback on the vport as well. Currently vport
loopback is enabled only when more than one user is using the IB device;
update the logic to consider whether a QP which supports loopback was
created, and if so enable vport loopback even if there is only a single
user.
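From user space this is expected to surface through rdma-core's mlx5dv
direct-verbs API; a hedged sketch (the MLX5DV_* names are assumed to be
the userspace counterparts of the new MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_*
uapi flags):

	#include <infiniband/mlx5dv.h>

	/* Raw packet QP that may receive its own unicast traffic back. */
	struct ibv_qp_init_attr_ex attr_ex = {
		.qp_type = IBV_QPT_RAW_PACKET,
		.comp_mask = IBV_QP_INIT_ATTR_PD,
		.pd = pd,
		.send_cq = cq,
		.recv_cq = cq,
		.cap = { .max_send_wr = 1, .max_recv_wr = 64,
			 .max_send_sge = 1, .max_recv_sge = 1 },
	};
	struct mlx5dv_qp_init_attr dv_attr = {
		.comp_mask = MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS,
		.create_flags = MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_UC,
	};
	struct ibv_qp *qp = mlx5dv_create_qp(ctx, &attr_ex, &dv_attr);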
Signed-off-by: Mark Bloch Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/main.c | 36 ++++++++++++++++++++-------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 4 ++++ drivers/infiniband/hw/mlx5/qp.c | 34 ++++++++++++++++++++------ 3 files changed, 57 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index e1b03b6908f0..131a1286a767 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1571,28 +1571,44 @@ static void deallocate_uars(struct mlx5_ib_dev *dev, mlx5_cmd_free_uar(dev->mdev, bfregi->sys_pages[i]); } -static int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev) +int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp) { int err = 0; mutex_lock(&dev->lb.mutex); - dev->lb.user_td++; + if (td) + dev->lb.user_td++; + if (qp) + dev->lb.qps++; - if (dev->lb.user_td == 2) - err = mlx5_nic_vport_update_local_lb(dev->mdev, true); + if (dev->lb.user_td == 2 || + dev->lb.qps == 1) { + if (!dev->lb.enabled) { + err = mlx5_nic_vport_update_local_lb(dev->mdev, true); + dev->lb.enabled = true; + } + } mutex_unlock(&dev->lb.mutex); return err; } -static void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev) +void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp) { mutex_lock(&dev->lb.mutex); - dev->lb.user_td--; + if (td) + dev->lb.user_td--; + if (qp) + dev->lb.qps--; - if (dev->lb.user_td < 2) - mlx5_nic_vport_update_local_lb(dev->mdev, false); + if (dev->lb.user_td == 1 && + dev->lb.qps == 0) { + if (dev->lb.enabled) { + mlx5_nic_vport_update_local_lb(dev->mdev, false); + dev->lb.enabled = false; + } + } mutex_unlock(&dev->lb.mutex); } @@ -1613,7 +1629,7 @@ static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn) !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) return err; - return mlx5_ib_enable_lb(dev); + return mlx5_ib_enable_lb(dev, true, false); } static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn) @@ -1628,7 +1644,7 @@ static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn) !MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc))) return; - mlx5_ib_disable_lb(dev); + mlx5_ib_disable_lb(dev, true, false); } static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index ca435654c30b..8376408e2bc9 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -862,6 +862,8 @@ struct mlx5_ib_lb_state { /* protect the user_td */ struct mutex mutex; u32 user_td; + int qps; + bool enabled; }; struct mlx5_ib_dev { @@ -1020,6 +1022,8 @@ int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr); int mlx5_ib_destroy_srq(struct ib_srq *srq); int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr); +int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp); +void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp); struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index f29ae401b232..fcaa5c4d6feb 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1256,6 +1256,16 @@ static bool tunnel_offload_supported(struct mlx5_core_dev *dev) MLX5_CAP_ETH(dev, tunnel_stateless_geneve_rx)); } 
+static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
+				      struct mlx5_ib_rq *rq,
+				      u32 qp_flags_en)
+{
+	if (qp_flags_en & (MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC |
+			   MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC))
+		mlx5_ib_disable_lb(dev, false, true);
+	mlx5_core_destroy_tir(dev->mdev, rq->tirn);
+}
+
 static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
 				    struct mlx5_ib_rq *rq, u32 tdn,
 				    u32 *qp_flags_en)
@@ -1293,17 +1303,17 @@ static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
 
 	err = mlx5_core_create_tir(dev->mdev, in, inlen, &rq->tirn);
 
+	if (!err && MLX5_GET(tirc, tirc, self_lb_block)) {
+		err = mlx5_ib_enable_lb(dev, false, true);
+
+		if (err)
+			destroy_raw_packet_qp_tir(dev, rq, 0);
+	}
 	kvfree(in);
 
 	return err;
 }
 
-static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
-				      struct mlx5_ib_rq *rq)
-{
-	mlx5_core_destroy_tir(dev->mdev, rq->tirn);
-}
-
 static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 				u32 *in, size_t inlen,
 				struct ib_pd *pd)
@@ -1372,7 +1382,7 @@ static void destroy_raw_packet_qp(struct mlx5_ib_dev *dev,
 	struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
 
 	if (qp->rq.wqe_cnt) {
-		destroy_raw_packet_qp_tir(dev, rq);
+		destroy_raw_packet_qp_tir(dev, rq, qp->flags_en);
 		destroy_raw_packet_qp_rq(dev, rq);
 	}
 
@@ -1396,6 +1406,9 @@ static void raw_packet_qp_copy_info(struct mlx5_ib_qp *qp,
 static void destroy_rss_raw_qp_tir(struct mlx5_ib_dev *dev,
 				   struct mlx5_ib_qp *qp)
 {
+	if (qp->flags_en & (MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC |
+			    MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC))
+		mlx5_ib_disable_lb(dev, false, true);
 	mlx5_core_destroy_tir(dev->mdev, qp->rss_qp.tirn);
 }
 
@@ -1606,6 +1619,13 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
 create_tir:
 	err = mlx5_core_create_tir(dev->mdev, in, inlen, &qp->rss_qp.tirn);
 
+	if (!err && MLX5_GET(tirc, tirc, self_lb_block)) {
+		err = mlx5_ib_enable_lb(dev, false, true);
+
+		if (err)
+			mlx5_core_destroy_tir(dev->mdev, qp->rss_qp.tirn);
+	}
+
 	if (err)
 		goto err;

From a1069c1c75d5be4d7fed354a33e5590de27ae0f3 Mon Sep 17 00:00:00 2001
From: Yishai Hadas
Date: Thu, 20 Sep 2018 21:39:19 +0300
Subject: [PATCH 117/242] IB/mlx5: Use uid as part of PD commands

Use uid as part of PD commands so that the firmware can manage the PD
object in a secured way. For example, when a QP is created, its uid must
match the uid of the CQ it uses.

The next patches in this series will use the uid from the PD, and then a
patch will set the uid on the PD, so that all objects will work properly
in one change.
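The mechanical pattern, condensed from the diff below (the following
patches repeat it for QP, RQ, SQ, SRQ, DCT, MCG, TIR, TIS and RQT):

	u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {};
	u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {};

	MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD);
	MLX5_SET(alloc_pd_in, in, uid, uid);	/* 0 means kernel-owned */
	err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
	if (!err)
		pdn = MLX5_GET(alloc_pd_out, out, pd);

The matching destroy command carries the same uid, so the firmware can
reject a destroy issued on behalf of a different user context.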
Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/cmd.c | 11 +++++++++++ drivers/infiniband/hw/mlx5/cmd.h | 1 + drivers/infiniband/hw/mlx5/main.c | 14 +++++++++++--- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + 4 files changed, 24 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index c84fef9a8a08..67f16e900445 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -197,3 +197,14 @@ int mlx5_cmd_query_ext_ppcnt_counters(struct mlx5_core_dev *dev, void *out) return mlx5_core_access_reg(dev, in, sz, out, sz, MLX5_REG_PPCNT, 0, 0); } + +void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid) +{ + u32 out[MLX5_ST_SZ_DW(dealloc_pd_out)] = {}; + u32 in[MLX5_ST_SZ_DW(dealloc_pd_in)] = {}; + + MLX5_SET(dealloc_pd_in, in, opcode, MLX5_CMD_OP_DEALLOC_PD); + MLX5_SET(dealloc_pd_in, in, pd, pdn); + MLX5_SET(dealloc_pd_in, in, uid, uid); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index 88cbb1c41703..6df031d41a8f 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -47,4 +47,5 @@ int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *mdev, int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr, u64 length, u32 alignment); int mlx5_cmd_dealloc_memic(struct mlx5_memic *memic, u64 addr, u64 length); +void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid); #endif /* MLX5_IB_CMD_H */ diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 853574345d91..821dceb614a1 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2254,21 +2254,29 @@ static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, struct mlx5_ib_alloc_pd_resp resp; struct mlx5_ib_pd *pd; int err; + u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_pd_in)] = {}; + u16 uid = 0; pd = kmalloc(sizeof(*pd), GFP_KERNEL); if (!pd) return ERR_PTR(-ENOMEM); - err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn); + MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD); + MLX5_SET(alloc_pd_in, in, uid, uid); + err = mlx5_cmd_exec(to_mdev(ibdev)->mdev, in, sizeof(in), + out, sizeof(out)); if (err) { kfree(pd); return ERR_PTR(err); } + pd->pdn = MLX5_GET(alloc_pd_out, out, pd); + pd->uid = uid; if (context) { resp.pdn = pd->pdn; if (ib_copy_to_udata(udata, &resp, sizeof(resp))) { - mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn); + mlx5_cmd_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid); kfree(pd); return ERR_PTR(-EFAULT); } @@ -2282,7 +2290,7 @@ static int mlx5_ib_dealloc_pd(struct ib_pd *pd) struct mlx5_ib_dev *mdev = to_mdev(pd->device); struct mlx5_ib_pd *mpd = to_mpd(pd); - mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn); + mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid); kfree(mpd); return 0; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index a28d04d4c9df..6b0f27b05c67 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -143,6 +143,7 @@ static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibuconte struct mlx5_ib_pd { struct ib_pd ibpd; u32 pdn; + u16 uid; }; enum { From 991d219829aaf4eaeea6fdb84021fe05f0fcdae3 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 20 Sep 2018 21:39:20 +0300 Subject: [PATCH 
118/242] IB/mlx5: Set uid as part of QP creation Set uid as part of QP creation so that the firmware can manage the QP object in a secured way. The uid for the destroy and the modify commands is set by mlx5_core. This will enable using a QP that was created by verbs application to be used by the DEVX flow in case the uid is equal. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 1f318a47040c..127b386911e1 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -850,6 +850,7 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, goto err_umem; } + MLX5_SET(create_qp_in, *in, uid, to_mpd(pd)->uid); pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, *in, pas); if (ubuffer->umem) mlx5_ib_populate_pas(dev, ubuffer->umem, page_shift, pas, 0); From 34d57585f91edf8eb3b7e966684e49832d61ecca Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 20 Sep 2018 21:39:21 +0300 Subject: [PATCH 119/242] IB/mlx5: Set uid as part of RQ commands Set uid as part of RQ commands so that the firmware can manage the RQ object in a secured way. The uid for the destroy command is set by mlx5_core. This will enable using an RQ that was created by verbs application to be used by the DEVX flow in case the uid is equal. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 127b386911e1..e92a0b9fec3e 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1189,7 +1189,7 @@ static size_t get_rq_pas_size(void *qpc) static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq, void *qpin, - size_t qpinlen) + size_t qpinlen, struct ib_pd *pd) { struct mlx5_ib_qp *mqp = rq->base.container_mibqp; __be64 *pas; @@ -1210,6 +1210,7 @@ static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, if (!in) return -ENOMEM; + MLX5_SET(create_rq_in, in, uid, to_mpd(pd)->uid); rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); if (!(rq->flags & MLX5_IB_RQ_CVLAN_STRIPPING)) MLX5_SET(rqc, rqc, vsd, 1); @@ -1348,7 +1349,7 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, rq->flags |= MLX5_IB_RQ_CVLAN_STRIPPING; if (qp->flags & MLX5_IB_QP_PCI_WRITE_END_PADDING) rq->flags |= MLX5_IB_RQ_PCI_WRITE_END_PADDING; - err = create_raw_packet_qp_rq(dev, rq, in, inlen); + err = create_raw_packet_qp_rq(dev, rq, in, inlen, pd); if (err) goto err_destroy_sq; @@ -2785,9 +2786,9 @@ static int ib_mask_to_mlx5_opt(int ib_mask) return result; } -static int modify_raw_packet_qp_rq(struct mlx5_ib_dev *dev, - struct mlx5_ib_rq *rq, int new_state, - const struct mlx5_modify_raw_qp_param *raw_qp_param) +static int modify_raw_packet_qp_rq( + struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq, int new_state, + const struct mlx5_modify_raw_qp_param *raw_qp_param, struct ib_pd *pd) { void *in; void *rqc; @@ -2800,6 +2801,7 @@ static int modify_raw_packet_qp_rq(struct mlx5_ib_dev *dev, return -ENOMEM; MLX5_SET(modify_rq_in, in, rq_state, rq->state); + MLX5_SET(modify_rq_in, in, uid, to_mpd(pd)->uid); rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); MLX5_SET(rqc, rqc, state, new_state); @@ -2947,7 +2949,8 @@ static int modify_raw_packet_qp(struct 
mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, } if (modify_rq) { - err = modify_raw_packet_qp_rq(dev, rq, rq_state, raw_qp_param); + err = modify_raw_packet_qp_rq(dev, rq, rq_state, raw_qp_param, + qp->ibqp.pd); if (err) return err; } @@ -5354,6 +5357,7 @@ static int create_rq(struct mlx5_ib_rwq *rwq, struct ib_pd *pd, if (!in) return -ENOMEM; + MLX5_SET(create_rq_in, in, uid, to_mpd(pd)->uid); rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); MLX5_SET(rqc, rqc, mem_rq_type, MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE); @@ -5739,6 +5743,7 @@ int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, if (wq_state == IB_WQS_ERR) wq_state = MLX5_RQC_STATE_ERR; MLX5_SET(modify_rq_in, in, rq_state, curr_wq_state); + MLX5_SET(modify_rq_in, in, uid, to_mpd(wq->pd)->uid); MLX5_SET(rqc, rqc, state, wq_state); if (wq_attr_mask & IB_WQ_FLAGS) { From c14003f09068711c8f1ad9cb89b5520b3579e563 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 20 Sep 2018 21:39:22 +0300 Subject: [PATCH 120/242] IB/mlx5: Set uid as part of SQ commands Set uid as part of SQ commands so that the firmware can manage the SQ object in a secured way. The uid for the destroy command is set by mlx5_core. This will enable using an SQ that was created by verbs application to be used by the DEVX flow in case the uid is equal. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index e92a0b9fec3e..3e5dbdcf3a61 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1115,6 +1115,7 @@ static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev, goto err_umem; } + MLX5_SET(create_sq_in, in, uid, to_mpd(pd)->uid); sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); MLX5_SET(sqc, sqc, flush_in_error_en, 1); if (MLX5_CAP_ETH(dev->mdev, multi_pkt_send_wqe)) @@ -2827,10 +2828,9 @@ out: return err; } -static int modify_raw_packet_qp_sq(struct mlx5_core_dev *dev, - struct mlx5_ib_sq *sq, - int new_state, - const struct mlx5_modify_raw_qp_param *raw_qp_param) +static int modify_raw_packet_qp_sq( + struct mlx5_core_dev *dev, struct mlx5_ib_sq *sq, int new_state, + const struct mlx5_modify_raw_qp_param *raw_qp_param, struct ib_pd *pd) { struct mlx5_ib_qp *ibqp = sq->base.container_mibqp; struct mlx5_rate_limit old_rl = ibqp->rl; @@ -2847,6 +2847,7 @@ static int modify_raw_packet_qp_sq(struct mlx5_core_dev *dev, if (!in) return -ENOMEM; + MLX5_SET(modify_sq_in, in, uid, to_mpd(pd)->uid); MLX5_SET(modify_sq_in, in, sq_state, sq->state); sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); @@ -2963,7 +2964,8 @@ static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, return err; } - return modify_raw_packet_qp_sq(dev->mdev, sq, sq_state, raw_qp_param); + return modify_raw_packet_qp_sq(dev->mdev, sq, sq_state, + raw_qp_param, qp->ibqp.pd); } return 0; From 9f33ec03bcdaf4f98023e6bfc6020304c10bb35a Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 20 Sep 2018 21:39:23 +0300 Subject: [PATCH 121/242] IB/mlx5: Set uid as part of SRQ commands Set uid as part of SRQ create command so that the firmware can manage the SRQ object in a secured way. The uid for the destroy and modify commands are set by mlx5_core. That will enable using a SRQ that was created by verbs application to be used by the DEVX flow in case the uid is equal. 
Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/srq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index d359fecf7a5b..d012e7dbcc38 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -144,6 +144,7 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, in->log_page_size = page_shift - MLX5_ADAPTER_PAGE_SHIFT; in->page_offset = offset; + in->uid = to_mpd(pd)->uid; if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1 && in->type != IB_SRQT_BASIC) in->user_index = uidx; From a01a5860b209453b8c190fa304a63fada1bb9759 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 20 Sep 2018 21:39:24 +0300 Subject: [PATCH 122/242] IB/mlx5: Set uid as part of DCT commands Set uid as part of DCT create command so that the firmware can manage the DCT object in a secured way. The uid for the destroy and drain commands are set by mlx5_core. That will enable using a DCT that was created by verbs application to be used by the DEVX flow in case the uid is equal. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 3e5dbdcf3a61..13495816a10f 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2251,6 +2251,7 @@ static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd, goto err_free; } + MLX5_SET(create_dct_in, qp->dct.in, uid, to_mpd(pd)->uid); dctc = MLX5_ADDR_OF(create_dct_in, qp->dct.in, dct_context_entry); qp->qp_sub_type = MLX5_IB_QPT_DCT; MLX5_SET(dctc, dctc, pd, to_mpd(pd)->pdn); From 539ec982763a8f041761056a5fb9e09cdb6b3dcc Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 20 Sep 2018 21:39:25 +0300 Subject: [PATCH 123/242] IB/mlx5: Set uid as part of MCG commands Set uid as part of MCG commands so that the firmware can manage the MCG object in a secured way. 
Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/cmd.c | 30 ++++++++++++++++++++++++++++++ drivers/infiniband/hw/mlx5/cmd.h | 4 ++++ drivers/infiniband/hw/mlx5/main.c | 11 +++++++++-- 3 files changed, 43 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 67f16e900445..91cfc2856bc4 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -208,3 +208,33 @@ void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid) MLX5_SET(dealloc_pd_in, in, uid, uid); mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } + +int mlx5_cmd_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, + u32 qpn, u16 uid) +{ + u32 out[MLX5_ST_SZ_DW(attach_to_mcg_out)] = {}; + u32 in[MLX5_ST_SZ_DW(attach_to_mcg_in)] = {}; + void *gid; + + MLX5_SET(attach_to_mcg_in, in, opcode, MLX5_CMD_OP_ATTACH_TO_MCG); + MLX5_SET(attach_to_mcg_in, in, qpn, qpn); + MLX5_SET(attach_to_mcg_in, in, uid, uid); + gid = MLX5_ADDR_OF(attach_to_mcg_in, in, multicast_gid); + memcpy(gid, mgid, sizeof(*mgid)); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + +int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, + u32 qpn, u16 uid) +{ + u32 out[MLX5_ST_SZ_DW(detach_from_mcg_out)] = {}; + u32 in[MLX5_ST_SZ_DW(detach_from_mcg_in)] = {}; + void *gid; + + MLX5_SET(detach_from_mcg_in, in, opcode, MLX5_CMD_OP_DETACH_FROM_MCG); + MLX5_SET(detach_from_mcg_in, in, qpn, qpn); + MLX5_SET(detach_from_mcg_in, in, uid, uid); + gid = MLX5_ADDR_OF(detach_from_mcg_in, in, multicast_gid); + memcpy(gid, mgid, sizeof(*mgid)); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index 6df031d41a8f..24175b3d5965 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -48,4 +48,8 @@ int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr, u64 length, u32 alignment); int mlx5_cmd_dealloc_memic(struct mlx5_memic *memic, u64 addr, u64 length); void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid); +int mlx5_cmd_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, + u32 qpn, u16 uid); +int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, + u32 qpn, u16 uid); #endif /* MLX5_IB_CMD_H */ diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 821dceb614a1..2ed30e181a8b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4033,13 +4033,17 @@ static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_qp *mqp = to_mqp(ibqp); int err; + u16 uid; + + uid = ibqp->pd ? + to_mpd(ibqp->pd)->uid : 0; if (mqp->flags & MLX5_IB_QP_UNDERLAY) { mlx5_ib_dbg(dev, "Attaching a multi cast group to underlay QP is not supported\n"); return -EOPNOTSUPP; } - err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num); + err = mlx5_cmd_attach_mcg(dev->mdev, gid, ibqp->qp_num, uid); if (err) mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n", ibqp->qp_num, gid->raw); @@ -4051,8 +4055,11 @@ static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) { struct mlx5_ib_dev *dev = to_mdev(ibqp->device); int err; + u16 uid; - err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num); + uid = ibqp->pd ? 
+ to_mpd(ibqp->pd)->uid : 0; + err = mlx5_cmd_detach_mcg(dev->mdev, gid, ibqp->qp_num, uid); if (err) mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n", ibqp->qp_num, gid->raw); From 443c1cf9d6c845d0dc389469b78cefec842d5868 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 20 Sep 2018 21:39:26 +0300 Subject: [PATCH 124/242] IB/mlx5: Set uid as part of TIR commands Set uid as part of TIR commands so that the firmware can manage the TIR object in a secured way. That will enable using a TIR that was created by verbs application to be used by the DEVX flow in case the uid is equal. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/cmd.c | 11 +++++++++++ drivers/infiniband/hw/mlx5/cmd.h | 1 + drivers/infiniband/hw/mlx5/qp.c | 24 +++++++++++++++--------- 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 91cfc2856bc4..9834cd4d1e45 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -198,6 +198,17 @@ int mlx5_cmd_query_ext_ppcnt_counters(struct mlx5_core_dev *dev, void *out) 0, 0); } +void mlx5_cmd_destroy_tir(struct mlx5_core_dev *dev, u32 tirn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(destroy_tir_in)] = {}; + u32 out[MLX5_ST_SZ_DW(destroy_tir_out)] = {}; + + MLX5_SET(destroy_tir_in, in, opcode, MLX5_CMD_OP_DESTROY_TIR); + MLX5_SET(destroy_tir_in, in, tirn, tirn); + MLX5_SET(destroy_tir_in, in, uid, uid); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid) { u32 out[MLX5_ST_SZ_DW(dealloc_pd_out)] = {}; diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index 24175b3d5965..feab7682bfbd 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -48,6 +48,7 @@ int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr, u64 length, u32 alignment); int mlx5_cmd_dealloc_memic(struct mlx5_memic *memic, u64 addr, u64 length); void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid); +void mlx5_cmd_destroy_tir(struct mlx5_core_dev *dev, u32 tirn, u16 uid); int mlx5_cmd_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn, u16 uid); int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 13495816a10f..2b2d26fa071d 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -37,6 +37,7 @@ #include #include "mlx5_ib.h" #include "ib_rep.h" +#include "cmd.h" /* not supported currently */ static int wq_signature; @@ -1261,17 +1262,19 @@ static bool tunnel_offload_supported(struct mlx5_core_dev *dev) static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq, - u32 qp_flags_en) + u32 qp_flags_en, + struct ib_pd *pd) { if (qp_flags_en & (MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC | MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC)) mlx5_ib_disable_lb(dev, false, true); - mlx5_core_destroy_tir(dev->mdev, rq->tirn); + mlx5_cmd_destroy_tir(dev->mdev, rq->tirn, to_mpd(pd)->uid); } static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq, u32 tdn, - u32 *qp_flags_en) + u32 *qp_flags_en, + struct ib_pd *pd) { u8 lb_flag = 0; u32 *in; @@ -1284,6 +1287,7 @@ static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev, if (!in) return -ENOMEM; + MLX5_SET(create_tir_in, in, uid, to_mpd(pd)->uid); tirc = 
MLX5_ADDR_OF(create_tir_in, in, ctx); MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT); MLX5_SET(tirc, tirc, inline_rqn, rq->base.mqp.qpn); @@ -1310,7 +1314,7 @@ static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev, err = mlx5_ib_enable_lb(dev, false, true); if (err) - destroy_raw_packet_qp_tir(dev, rq, 0); + destroy_raw_packet_qp_tir(dev, rq, 0, pd); } kvfree(in); @@ -1354,8 +1358,7 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (err) goto err_destroy_sq; - - err = create_raw_packet_qp_tir(dev, rq, tdn, &qp->flags_en); + err = create_raw_packet_qp_tir(dev, rq, tdn, &qp->flags_en, pd); if (err) goto err_destroy_rq; } @@ -1385,7 +1388,7 @@ static void destroy_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_rq *rq = &raw_packet_qp->rq; if (qp->rq.wqe_cnt) { - destroy_raw_packet_qp_tir(dev, rq, qp->flags_en); + destroy_raw_packet_qp_tir(dev, rq, qp->flags_en, qp->ibqp.pd); destroy_raw_packet_qp_rq(dev, rq); } @@ -1412,7 +1415,8 @@ static void destroy_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *q if (qp->flags_en & (MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC | MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC)) mlx5_ib_disable_lb(dev, false, true); - mlx5_core_destroy_tir(dev->mdev, qp->rss_qp.tirn); + mlx5_cmd_destroy_tir(dev->mdev, qp->rss_qp.tirn, + to_mpd(qp->ibqp.pd)->uid); } static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, @@ -1510,6 +1514,7 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (!in) return -ENOMEM; + MLX5_SET(create_tir_in, in, uid, to_mpd(pd)->uid); tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT); @@ -1626,7 +1631,8 @@ create_tir: err = mlx5_ib_enable_lb(dev, false, true); if (err) - mlx5_core_destroy_tir(dev->mdev, qp->rss_qp.tirn); + mlx5_cmd_destroy_tir(dev->mdev, qp->rss_qp.tirn, + to_mpd(pd)->uid); } if (err) From 1cd6dbd32f986fe05fef90249996f6ba394dfc78 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 20 Sep 2018 21:39:27 +0300 Subject: [PATCH 125/242] IB/mlx5: Set uid as part of TIS commands Set uid as part of TIS commands so that the firmware can manage the TIS object in a secured way. That will enable using a TIS that was created by verbs application to be used by the DEVX flow in case the uid is equal. 
Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/cmd.c | 11 +++++++++++ drivers/infiniband/hw/mlx5/cmd.h | 1 + drivers/infiniband/hw/mlx5/qp.c | 27 +++++++++++++++++---------- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 9834cd4d1e45..40f7299dbb1d 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -209,6 +209,17 @@ void mlx5_cmd_destroy_tir(struct mlx5_core_dev *dev, u32 tirn, u16 uid) mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } +void mlx5_cmd_destroy_tis(struct mlx5_core_dev *dev, u32 tisn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(destroy_tis_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(destroy_tis_out)] = {0}; + + MLX5_SET(destroy_tis_in, in, opcode, MLX5_CMD_OP_DESTROY_TIS); + MLX5_SET(destroy_tis_in, in, tisn, tisn); + MLX5_SET(destroy_tis_in, in, uid, uid); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid) { u32 out[MLX5_ST_SZ_DW(dealloc_pd_out)] = {}; diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index feab7682bfbd..f7722d7b92a3 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -49,6 +49,7 @@ int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr, int mlx5_cmd_dealloc_memic(struct mlx5_memic *memic, u64 addr, u64 length); void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid); void mlx5_cmd_destroy_tir(struct mlx5_core_dev *dev, u32 tirn, u16 uid); +void mlx5_cmd_destroy_tis(struct mlx5_core_dev *dev, u32 tisn, u16 uid); int mlx5_cmd_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn, u16 uid); int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 2b2d26fa071d..027e81abd50c 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1061,11 +1061,13 @@ static int is_connected(enum ib_qp_type qp_type) static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, - struct mlx5_ib_sq *sq, u32 tdn) + struct mlx5_ib_sq *sq, u32 tdn, + struct ib_pd *pd) { u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {0}; void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); + MLX5_SET(create_tis_in, in, uid, to_mpd(pd)->uid); MLX5_SET(tisc, tisc, transport_domain, tdn); if (qp->flags & MLX5_IB_QP_UNDERLAY) MLX5_SET(tisc, tisc, underlay_qpn, qp->underlay_qpn); @@ -1074,9 +1076,9 @@ static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, } static void destroy_raw_packet_qp_tis(struct mlx5_ib_dev *dev, - struct mlx5_ib_sq *sq) + struct mlx5_ib_sq *sq, struct ib_pd *pd) { - mlx5_core_destroy_tis(dev->mdev, sq->tisn); + mlx5_cmd_destroy_tis(dev->mdev, sq->tisn, to_mpd(pd)->uid); } static void destroy_flow_rule_vport_sq(struct mlx5_ib_dev *dev, @@ -1335,7 +1337,7 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, u32 tdn = mucontext->tdn; if (qp->sq.wqe_cnt) { - err = create_raw_packet_qp_tis(dev, qp, sq, tdn); + err = create_raw_packet_qp_tis(dev, qp, sq, tdn, pd); if (err) return err; @@ -1375,7 +1377,7 @@ err_destroy_sq: return err; destroy_raw_packet_qp_sq(dev, sq); err_destroy_tis: - destroy_raw_packet_qp_tis(dev, sq); + destroy_raw_packet_qp_tis(dev, sq, pd); return err; } @@ -1394,7 +1396,7 @@ static void destroy_raw_packet_qp(struct 
mlx5_ib_dev *dev, if (qp->sq.wqe_cnt) { destroy_raw_packet_qp_sq(dev, sq); - destroy_raw_packet_qp_tis(dev, sq); + destroy_raw_packet_qp_tis(dev, sq, qp->ibqp.pd); } } @@ -2524,7 +2526,8 @@ static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) } static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev, - struct mlx5_ib_sq *sq, u8 sl) + struct mlx5_ib_sq *sq, u8 sl, + struct ib_pd *pd) { void *in; void *tisc; @@ -2537,6 +2540,7 @@ static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev, return -ENOMEM; MLX5_SET(modify_tis_in, in, bitmask.prio, 1); + MLX5_SET(modify_tis_in, in, uid, to_mpd(pd)->uid); tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx); MLX5_SET(tisc, tisc, prio, ((sl & 0x7) << 1)); @@ -2549,7 +2553,8 @@ static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev, } static int modify_raw_packet_tx_affinity(struct mlx5_core_dev *dev, - struct mlx5_ib_sq *sq, u8 tx_affinity) + struct mlx5_ib_sq *sq, u8 tx_affinity, + struct ib_pd *pd) { void *in; void *tisc; @@ -2562,6 +2567,7 @@ static int modify_raw_packet_tx_affinity(struct mlx5_core_dev *dev, return -ENOMEM; MLX5_SET(modify_tis_in, in, bitmask.lag_tx_port_affinity, 1); + MLX5_SET(modify_tis_in, in, uid, to_mpd(pd)->uid); tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx); MLX5_SET(tisc, tisc, lag_tx_port_affinity, tx_affinity); @@ -2646,7 +2652,7 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if ((qp->ibqp.qp_type == IB_QPT_RAW_PACKET) && qp->sq.wqe_cnt) return modify_raw_packet_eth_prio(dev->mdev, &qp->raw_packet_qp.sq, - sl & 0xf); + sl & 0xf, qp->ibqp.pd); return 0; } @@ -2966,7 +2972,8 @@ static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (modify_sq) { if (tx_affinity) { err = modify_raw_packet_tx_affinity(dev->mdev, sq, - tx_affinity); + tx_affinity, + qp->ibqp.pd); if (err) return err; } From 5deba86ee2cddaebaa1d37ad71efcda26b626592 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 20 Sep 2018 21:39:28 +0300 Subject: [PATCH 126/242] IB/mlx5: Set uid as part of RQT commands Set uid as part of RQT commands so that the firmware can manage the RQT object in a secured way. That will enable using an RQT that was created by verbs application to be used by the DEVX flow in case the uid is equal. 
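One wrinkle in the RQT case: an indirection table is not destroyed through a PD, so no uid is reachable at teardown time. The patch therefore caches the uid in the driver's mlx5_ib_rwq_ind_table when the table is created and replays it on destroy, as the hunks below show:

/* create: capture the uid from the first WQ's PD */
rwq_ind_tbl->uid = to_mpd(init_attr->ind_tbl[0]->pd)->uid;
MLX5_SET(create_rqt_in, in, uid, rwq_ind_tbl->uid);

/* destroy: replay the same uid so the firmware accepts the command */
mlx5_cmd_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn, rwq_ind_tbl->uid);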
Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/cmd.c | 11 +++++++++++ drivers/infiniband/hw/mlx5/cmd.h | 1 + drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + drivers/infiniband/hw/mlx5/qp.c | 7 +++++-- 4 files changed, 18 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 40f7299dbb1d..6862b03dbdec 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -220,6 +220,17 @@ void mlx5_cmd_destroy_tis(struct mlx5_core_dev *dev, u32 tisn, u16 uid) mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } +void mlx5_cmd_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn, u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(destroy_rqt_in)] = {}; + u32 out[MLX5_ST_SZ_DW(destroy_rqt_out)] = {}; + + MLX5_SET(destroy_rqt_in, in, opcode, MLX5_CMD_OP_DESTROY_RQT); + MLX5_SET(destroy_rqt_in, in, rqtn, rqtn); + MLX5_SET(destroy_rqt_in, in, uid, uid); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid) { u32 out[MLX5_ST_SZ_DW(dealloc_pd_out)] = {}; diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index f7722d7b92a3..742452e4be7e 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -50,6 +50,7 @@ int mlx5_cmd_dealloc_memic(struct mlx5_memic *memic, u64 addr, u64 length); void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid); void mlx5_cmd_destroy_tir(struct mlx5_core_dev *dev, u32 tirn, u16 uid); void mlx5_cmd_destroy_tis(struct mlx5_core_dev *dev, u32 tisn, u16 uid); +void mlx5_cmd_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn, u16 uid); int mlx5_cmd_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn, u16 uid); int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 6b0f27b05c67..43eddd452a68 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -325,6 +325,7 @@ enum { struct mlx5_ib_rwq_ind_table { struct ib_rwq_ind_table ib_rwq_ind_tbl; u32 rqtn; + u16 uid; }; struct mlx5_ib_ubuffer { diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 027e81abd50c..5dc5869692eb 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -5679,6 +5679,9 @@ struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, for (i = 0; i < sz; i++) MLX5_SET(rqtc, rqtc, rq_num[i], init_attr->ind_tbl[i]->wq_num); + rwq_ind_tbl->uid = to_mpd(init_attr->ind_tbl[0]->pd)->uid; + MLX5_SET(create_rqt_in, in, uid, rwq_ind_tbl->uid); + err = mlx5_core_create_rqt(dev->mdev, in, inlen, &rwq_ind_tbl->rqtn); kvfree(in); @@ -5697,7 +5700,7 @@ struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device, return &rwq_ind_tbl->ib_rwq_ind_tbl; err_copy: - mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn); + mlx5_cmd_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn, rwq_ind_tbl->uid); err: kfree(rwq_ind_tbl); return ERR_PTR(err); @@ -5708,7 +5711,7 @@ int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) struct mlx5_ib_rwq_ind_table *rwq_ind_tbl = to_mrwq_ind_table(ib_rwq_ind_tbl); struct mlx5_ib_dev *dev = to_mdev(ib_rwq_ind_tbl->device); - mlx5_core_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn); + mlx5_cmd_destroy_rqt(dev->mdev, rwq_ind_tbl->rqtn, rwq_ind_tbl->uid); 
kfree(rwq_ind_tbl); return 0; } From 58895f0d18df09a6be82e6e4141e4609dd12f84f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 20 Sep 2018 21:39:29 +0300 Subject: [PATCH 127/242] IB/mlx5: Set uid upon PD allocation Set uid as part of PD allocation; this uid is used for other mlx5 objects upon calling the firmware. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 2ed30e181a8b..0df397fee94f 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2262,6 +2262,7 @@ static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev, if (!pd) return ERR_PTR(-ENOMEM); + uid = context ? to_mucontext(context)->devx_uid : 0; MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD); MLX5_SET(alloc_pd_in, in, uid, uid); err = mlx5_cmd_exec(to_mdev(ibdev)->mdev, in, sizeof(in), From cf50a7863b424527c95ef41e2b26a03678cf009e Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 20 Sep 2018 21:39:30 +0300 Subject: [PATCH 128/242] IB/mlx5: Set uid as part of CQ creation Set uid as part of CQ creation so that the firmware can manage the CQ object in a secure way. The uid for the destroy and the modify commands is set by mlx5_core. This will enable using a CQ that was created by a verbs application to be used by the DEVX flow in case the uids are equal. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/cq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 088205d7f1a1..dae30b6478bf 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -877,6 +877,7 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata, cq->private_flags |= MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD; } + MLX5_SET(create_cq_in, *cqb, uid, to_mucontext(context)->devx_uid); return 0; err_cqb: From d00614c0570656480d6dea5c84c1caec58f40021 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 20 Sep 2018 21:39:31 +0300 Subject: [PATCH 129/242] IB/mlx5: Set uid as part of XRCD commands Set uid as part of XRCD commands so that the firmware can manage the XRCD object in a secure way. That will enable using an XRCD that was created by a verbs application to be used by the DEVX flow in case the uids are equal.
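As with the PD and CQ patches above, the uid for an XRCD is resolved from the caller's ucontext; an in-kernel caller has no ucontext and falls back to uid 0, the privileged kernel uid. The recurring resolution step, with names as in the hunks below:

u16 uid;

/* a NULL ucontext means an in-kernel caller: use the kernel uid 0 */
uid = context ? to_mucontext(context)->devx_uid : 0;
err = mlx5_cmd_xrcd_alloc(dev->mdev, &xrcd->xrcdn, uid);
if (!err)
	xrcd->uid = uid;	/* remembered for dealloc */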
Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/cmd.c | 25 +++++++++++++++++++++++++ drivers/infiniband/hw/mlx5/cmd.h | 2 ++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 + drivers/infiniband/hw/mlx5/qp.c | 8 ++++++-- 4 files changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 6862b03dbdec..2a530e9f99e6 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -271,3 +271,28 @@ int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, memcpy(gid, mgid, sizeof(*mgid)); return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } + +int mlx5_cmd_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn, u16 uid) +{ + u32 out[MLX5_ST_SZ_DW(alloc_xrcd_out)] = {}; + u32 in[MLX5_ST_SZ_DW(alloc_xrcd_in)] = {}; + int err; + + MLX5_SET(alloc_xrcd_in, in, opcode, MLX5_CMD_OP_ALLOC_XRCD); + MLX5_SET(alloc_xrcd_in, in, uid, uid); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (!err) + *xrcdn = MLX5_GET(alloc_xrcd_out, out, xrcd); + return err; +} + +int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid) +{ + u32 out[MLX5_ST_SZ_DW(dealloc_xrcd_out)] = {}; + u32 in[MLX5_ST_SZ_DW(dealloc_xrcd_in)] = {}; + + MLX5_SET(dealloc_xrcd_in, in, opcode, MLX5_CMD_OP_DEALLOC_XRCD); + MLX5_SET(dealloc_xrcd_in, in, xrcd, xrcdn); + MLX5_SET(dealloc_xrcd_in, in, uid, uid); + return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index 742452e4be7e..12fe63005d11 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -55,4 +55,6 @@ int mlx5_cmd_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn, u16 uid); int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn, u16 uid); +int mlx5_cmd_xrcd_alloc(struct mlx5_core_dev *dev, u32 *xrcdn, u16 uid); +int mlx5_cmd_xrcd_dealloc(struct mlx5_core_dev *dev, u32 xrcdn, u16 uid); #endif /* MLX5_IB_CMD_H */ diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 43eddd452a68..94cf83bd1716 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -539,6 +539,7 @@ struct mlx5_ib_srq { struct mlx5_ib_xrcd { struct ib_xrcd ibxrcd; u32 xrcdn; + u16 uid; }; enum mlx5_ib_mtt_access_flags { diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 5dc5869692eb..a0cb260a271f 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -5280,6 +5280,7 @@ struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_xrcd *xrcd; int err; + u16 uid; if (!MLX5_CAP_GEN(dev->mdev, xrc)) return ERR_PTR(-ENOSYS); @@ -5288,12 +5289,14 @@ struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev, if (!xrcd) return ERR_PTR(-ENOMEM); - err = mlx5_core_xrcd_alloc(dev->mdev, &xrcd->xrcdn); + uid = context ? 
to_mucontext(context)->devx_uid : 0; + err = mlx5_cmd_xrcd_alloc(dev->mdev, &xrcd->xrcdn, uid); if (err) { kfree(xrcd); return ERR_PTR(-ENOMEM); } + xrcd->uid = uid; return &xrcd->ibxrcd; } @@ -5301,9 +5304,10 @@ int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd) { struct mlx5_ib_dev *dev = to_mdev(xrcd->device); u32 xrcdn = to_mxrcd(xrcd)->xrcdn; + u16 uid = to_mxrcd(xrcd)->uid; int err; - err = mlx5_core_xrcd_dealloc(dev->mdev, xrcdn); + err = mlx5_cmd_xrcd_dealloc(dev->mdev, xrcdn, uid); if (err) mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn); From d2d19121ae2f4bc4e818dd770c1746deadf14093 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 20 Sep 2018 21:39:32 +0300 Subject: [PATCH 130/242] IB/mlx5: Set uid as part of TD commands Set uid as part of TD commands so that the firmware can manage the TD object in a secured way. Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/cmd.c | 30 ++++++++++++++++++++++++++++ drivers/infiniband/hw/mlx5/cmd.h | 4 ++++ drivers/infiniband/hw/mlx5/main.c | 33 +++++++++++++++++-------------- 3 files changed, 52 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 2a530e9f99e6..ca060a2e2b36 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -231,6 +231,36 @@ void mlx5_cmd_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn, u16 uid) mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); } +int mlx5_cmd_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn, + u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)] = {0}; + int err; + + MLX5_SET(alloc_transport_domain_in, in, opcode, + MLX5_CMD_OP_ALLOC_TRANSPORT_DOMAIN); + + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (!err) + *tdn = MLX5_GET(alloc_transport_domain_out, out, + transport_domain); + + return err; +} + +void mlx5_cmd_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn, + u16 uid) +{ + u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)] = {0}; + u32 out[MLX5_ST_SZ_DW(dealloc_transport_domain_out)] = {0}; + + MLX5_SET(dealloc_transport_domain_in, in, opcode, + MLX5_CMD_OP_DEALLOC_TRANSPORT_DOMAIN); + MLX5_SET(dealloc_transport_domain_in, in, transport_domain, tdn); + mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); +} + void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid) { u32 out[MLX5_ST_SZ_DW(dealloc_pd_out)] = {}; diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index 12fe63005d11..c03c56455534 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -51,6 +51,10 @@ void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid); void mlx5_cmd_destroy_tir(struct mlx5_core_dev *dev, u32 tirn, u16 uid); void mlx5_cmd_destroy_tis(struct mlx5_core_dev *dev, u32 tisn, u16 uid); void mlx5_cmd_destroy_rqt(struct mlx5_core_dev *dev, u32 rqtn, u16 uid); +int mlx5_cmd_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn, + u16 uid); +void mlx5_cmd_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn, + u16 uid); int mlx5_cmd_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn, u16 uid); int mlx5_cmd_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 0df397fee94f..91693e1a3731 100644 --- 
a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1613,14 +1613,15 @@ void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp) mutex_unlock(&dev->lb.mutex); } -static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn) +static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn, + u16 uid) { int err; if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) return 0; - err = mlx5_core_alloc_transport_domain(dev->mdev, tdn); + err = mlx5_cmd_alloc_transport_domain(dev->mdev, tdn, uid); if (err) return err; @@ -1632,12 +1633,13 @@ static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn) return mlx5_ib_enable_lb(dev, true, false); } -static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn) +static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn, + u16 uid) { if (!MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) return; - mlx5_core_dealloc_transport_domain(dev->mdev, tdn); + mlx5_cmd_dealloc_transport_domain(dev->mdev, tdn, uid); if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) || (!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) && @@ -1756,22 +1758,23 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; #endif - err = mlx5_ib_alloc_transport_domain(dev, &context->tdn); - if (err) - goto out_uars; - if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) { /* Block DEVX on Infiniband as of SELinux */ if (mlx5_ib_port_link_layer(ibdev, 1) != IB_LINK_LAYER_ETHERNET) { err = -EPERM; - goto out_td; + goto out_uars; } err = mlx5_ib_devx_create(dev, context); if (err) - goto out_td; + goto out_uars; } + err = mlx5_ib_alloc_transport_domain(dev, &context->tdn, + context->devx_uid); + if (err) + goto out_devx; + if (MLX5_CAP_GEN(dev->mdev, dump_fill_mkey)) { err = mlx5_cmd_dump_fill_mkey(dev->mdev, &dump_fill_mkey); if (err) @@ -1864,10 +1867,10 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, return &context->ibucontext; out_mdev: + mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); +out_devx: if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) mlx5_ib_devx_destroy(dev, context); -out_td: - mlx5_ib_dealloc_transport_domain(dev, context->tdn); out_uars: deallocate_uars(dev, context); @@ -1897,12 +1900,12 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) mutex_unlock(&ibcontext->per_mm_list_lock); #endif + bfregi = &context->bfregi; + mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); + if (context->devx_uid) mlx5_ib_devx_destroy(dev, context); - bfregi = &context->bfregi; - mlx5_ib_dealloc_transport_domain(dev, context->tdn); - deallocate_uars(dev, context); kfree(bfregi->sys_pages); kfree(bfregi->count); From ba1a057da2f17411009ecfbdfde4d68bc8c1765e Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 20 Sep 2018 21:39:33 +0300 Subject: [PATCH 131/242] IB/mlx5: Set valid umem bit on DEVX Set valid umem bit on DEVX commands that use umem. This will enforce the umem usage by the firmware and not the 'pas' info. 
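The new devx_set_umem_valid() keys off the opcode in the general command header and sets the object-specific valid bits; commands that cannot carry a umem fall through untouched. A skeleton of that dispatch, trimmed from the full patch below to two representative cases:

static void devx_set_umem_valid(const void *in)
{
	u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode);

	switch (opcode) {
	case MLX5_CMD_OP_CREATE_MKEY:
		MLX5_SET(create_mkey_in, in, mkey_umem_valid, 1);
		break;
	case MLX5_CMD_OP_CREATE_CQ: {
		void *cqc;

		/* both the command-level and the context-level bits are needed */
		MLX5_SET(create_cq_in, in, cq_umem_valid, 1);
		cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
		MLX5_SET(cqc, cqc, dbr_umem_valid, 1);
		break;
	}
	/* ... QP, RQ, SQ, RMP, XRQ, XRC_SRQ and MODIFY_CQ handled likewise ... */
	default:
		return;
	}
}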
Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 95 +++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 25dafa4ff6ca..562c7936bbad 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -264,6 +264,97 @@ static int devx_is_valid_obj_id(struct devx_obj *obj, const void *in) return false; } +static void devx_set_umem_valid(const void *in) +{ + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + + switch (opcode) { + case MLX5_CMD_OP_CREATE_MKEY: + MLX5_SET(create_mkey_in, in, mkey_umem_valid, 1); + break; + case MLX5_CMD_OP_CREATE_CQ: + { + void *cqc; + + MLX5_SET(create_cq_in, in, cq_umem_valid, 1); + cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context); + MLX5_SET(cqc, cqc, dbr_umem_valid, 1); + break; + } + case MLX5_CMD_OP_CREATE_QP: + { + void *qpc; + + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + MLX5_SET(qpc, qpc, dbr_umem_valid, 1); + MLX5_SET(create_qp_in, in, wq_umem_valid, 1); + break; + } + + case MLX5_CMD_OP_CREATE_RQ: + { + void *rqc, *wq; + + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + wq = MLX5_ADDR_OF(rqc, rqc, wq); + MLX5_SET(wq, wq, dbr_umem_valid, 1); + MLX5_SET(wq, wq, wq_umem_valid, 1); + break; + } + + case MLX5_CMD_OP_CREATE_SQ: + { + void *sqc, *wq; + + sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); + wq = MLX5_ADDR_OF(sqc, sqc, wq); + MLX5_SET(wq, wq, dbr_umem_valid, 1); + MLX5_SET(wq, wq, wq_umem_valid, 1); + break; + } + + case MLX5_CMD_OP_MODIFY_CQ: + MLX5_SET(modify_cq_in, in, cq_umem_valid, 1); + break; + + case MLX5_CMD_OP_CREATE_RMP: + { + void *rmpc, *wq; + + rmpc = MLX5_ADDR_OF(create_rmp_in, in, ctx); + wq = MLX5_ADDR_OF(rmpc, rmpc, wq); + MLX5_SET(wq, wq, dbr_umem_valid, 1); + MLX5_SET(wq, wq, wq_umem_valid, 1); + break; + } + + case MLX5_CMD_OP_CREATE_XRQ: + { + void *xrqc, *wq; + + xrqc = MLX5_ADDR_OF(create_xrq_in, in, xrq_context); + wq = MLX5_ADDR_OF(xrqc, xrqc, wq); + MLX5_SET(wq, wq, dbr_umem_valid, 1); + MLX5_SET(wq, wq, wq_umem_valid, 1); + break; + } + + case MLX5_CMD_OP_CREATE_XRC_SRQ: + { + void *xrc_srqc; + + MLX5_SET(create_xrc_srq_in, in, xrc_srq_umem_valid, 1); + xrc_srqc = MLX5_ADDR_OF(create_xrc_srq_in, in, + xrc_srq_context_entry); + MLX5_SET(xrc_srqc, xrc_srqc, dbr_umem_valid, 1); + break; + } + + default: + return; + } +} + static bool devx_is_obj_create_cmd(const void *in) { u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); @@ -741,6 +832,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( return -ENOMEM; MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid); + devx_set_umem_valid(cmd_in); + err = mlx5_cmd_exec(dev->mdev, cmd_in, uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_IN), cmd_out, cmd_out_len); @@ -790,6 +883,8 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_MODIFY)( return PTR_ERR(cmd_out); MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid); + devx_set_umem_valid(cmd_in); + err = mlx5_cmd_exec(obj->mdev, cmd_in, uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OBJ_MODIFY_CMD_IN), cmd_out, cmd_out_len); From e3b00e9c3051162a46dd03be0bd92fe52de703b3 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 24 Sep 2018 18:27:38 +0100 Subject: [PATCH 132/242] IB/usnic: fix spelling mistake "unvalid" -> "invalid" Trivial fix to spelling mistake in usnic_err error message. 
Signed-off-by: Colin Ian King Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/usnic/usnic_transport.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/usnic/usnic_transport.c b/drivers/infiniband/hw/usnic/usnic_transport.c index e0a95538c364..82dd810bc000 100644 --- a/drivers/infiniband/hw/usnic/usnic_transport.c +++ b/drivers/infiniband/hw/usnic/usnic_transport.c @@ -121,7 +121,7 @@ void usnic_transport_unrsrv_port(enum usnic_transport_type type, u16 port_num) if (type == USNIC_TRANSPORT_ROCE_CUSTOM) { spin_lock(&roce_bitmap_lock); if (!port_num) { - usnic_err("Unreserved unvalid port num 0 for %s\n", + usnic_err("Unreserved invalid port num 0 for %s\n", usnic_transport_to_str(type)); goto out_roce_custom; } From 0430e74f9fbe4d027a86f39fdb6f20427912088f Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Mon, 24 Sep 2018 18:58:07 +0000 Subject: [PATCH 133/242] RDMA/mlx5: Remove superfluous version print When profiles were introduced to MLX5 IB an unneeded version print when creating an MLX5 IB device was added. Remove the print, we still have a printk for driver version in mlx5_ib_add(). Fixes: 16c1975f1032 ("IB/mlx5: Create profile infrastructure to add and remove stages") Signed-off-by: Mark Bloch Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 91693e1a3731..fb1e3c546826 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -6199,8 +6199,6 @@ void *__mlx5_ib_add(struct mlx5_ib_dev *dev, int err; int i; - printk_once(KERN_INFO "%s", mlx5_version); - for (i = 0; i < MLX5_IB_STAGE_MAX; i++) { if (profile->stage[i].init) { err = profile->stage[i].init(dev); From 1b571086e869395b6a11ab24186b0104fe05c057 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 24 Sep 2018 12:29:03 -0700 Subject: [PATCH 134/242] iw_cxgb4: Use proper enumerated type in c4iw_bar2_addrs Clang warns when one enumerated type is implicitly converted to another. drivers/infiniband/hw/cxgb4/qp.c:287:8: warning: implicit conversion from enumeration type 'enum t4_bar2_qtype' to different enumeration type 'enum cxgb4_bar2_qtype' [-Wenum-conversion] T4_BAR2_QTYPE_EGRESS, ^~~~~~~~~~~~~~~~~~~~ c4iw_bar2_addrs expects a value from enum cxgb4_bar2_qtype so use the corresponding values from that type so Clang is satisfied without changing the meaning of the code. T4_BAR2_QTYPE_EGRESS = CXGB4_BAR2_QTYPE_EGRESS = 0 T4_BAR2_QTYPE_INGRESS = CXGB4_BAR2_QTYPE_INGRESS = 1 Reported-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Acked-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/cxgb4/cq.c | 2 +- drivers/infiniband/hw/cxgb4/qp.c | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index 6d3042794094..1fd8798d91a7 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -161,7 +161,7 @@ static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq, cq->gts = rdev->lldi.gts_reg; cq->rdev = rdev; - cq->bar2_va = c4iw_bar2_addrs(rdev, cq->cqid, T4_BAR2_QTYPE_INGRESS, + cq->bar2_va = c4iw_bar2_addrs(rdev, cq->cqid, CXGB4_BAR2_QTYPE_INGRESS, &cq->bar2_qid, user ? 
&cq->bar2_pa : NULL); if (user && !cq->bar2_pa) { diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index e78dd9afac86..74eb70300fdf 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -279,12 +279,13 @@ static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq, wq->db = rdev->lldi.db_reg; - wq->sq.bar2_va = c4iw_bar2_addrs(rdev, wq->sq.qid, T4_BAR2_QTYPE_EGRESS, + wq->sq.bar2_va = c4iw_bar2_addrs(rdev, wq->sq.qid, + CXGB4_BAR2_QTYPE_EGRESS, &wq->sq.bar2_qid, user ? &wq->sq.bar2_pa : NULL); if (need_rq) wq->rq.bar2_va = c4iw_bar2_addrs(rdev, wq->rq.qid, - T4_BAR2_QTYPE_EGRESS, + CXGB4_BAR2_QTYPE_EGRESS, &wq->rq.bar2_qid, user ? &wq->rq.bar2_pa : NULL); @@ -2572,7 +2573,7 @@ static int alloc_srq_queue(struct c4iw_srq *srq, struct c4iw_dev_ucontext *uctx, memset(wq->queue, 0, wq->memsize); pci_unmap_addr_set(wq, mapping, wq->dma_addr); - wq->bar2_va = c4iw_bar2_addrs(rdev, wq->qid, T4_BAR2_QTYPE_EGRESS, + wq->bar2_va = c4iw_bar2_addrs(rdev, wq->qid, CXGB4_BAR2_QTYPE_EGRESS, &wq->bar2_qid, user ? &wq->bar2_pa : NULL); From 3312d1c6bdee6aa912c099c0ac0662d197c52842 Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Fri, 21 Sep 2018 11:30:12 -0400 Subject: [PATCH 135/242] RDMA/umem: Minor optimizations Noticed while reviewing commit d4b4dd1b9706 ("RDMA/umem: Do not use current->tgid to track the mm_struct") patch. Why would we take a lock, adjust a protected variable, drop the lock, and *then* check the input into our protected variable adjustment? Then we have to take the lock again on our error unwind. Let's just check the input early and skip taking the locks needlessly if the input isn't valid. It was also noticed that we set mm = current->mm, we then never modify mm, but we still go back and reference current->mm a number of times needlessly. Be consistent in using the stored reference in mm. Signed-off-by: Doug Ledford Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index fec5d489e311..1886d7709911 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -152,6 +152,10 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem->hugetlb = 0; npages = ib_umem_num_pages(umem); + if (npages == 0 || npages > UINT_MAX) { + ret = -EINVAL; + goto out; + } lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; @@ -166,11 +170,6 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, cur_base = addr & PAGE_MASK; - if (npages == 0 || npages > UINT_MAX) { - ret = -EINVAL; - goto vma; - } - ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL); if (ret) goto vma; @@ -224,9 +223,9 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, umem_release: __ib_umem_release(context->device, umem, 0); vma: - down_write(¤t->mm->mmap_sem); - current->mm->pinned_vm -= ib_umem_num_pages(umem); - up_write(¤t->mm->mmap_sem); + down_write(&mm->mmap_sem); + mm->pinned_vm -= ib_umem_num_pages(umem); + up_write(&mm->mmap_sem); out: if (vma_list) free_page((unsigned long) vma_list); From c6ce580716372d71cd119bacf73f14a62e9af2ea Mon Sep 17 00:00:00 2001 From: Doug Ledford Date: Fri, 21 Sep 2018 11:30:13 -0400 Subject: [PATCH 136/242] RDMA/umem: Fix potential addition overflow Given a large enough memory allocation, it is possible to wrap the pinned_vm counter. 
Check for addition overflow to prevent such eventualities. Fixes: 40ddacf2dda9 ("RDMA/umem: Don't hold mmap_sem for too long") Reported-by: Jason Gunthorpe Signed-off-by: Doug Ledford Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 1886d7709911..8da1cf29a69f 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -85,6 +85,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, struct page **page_list; struct vm_area_struct **vma_list; unsigned long lock_limit; + unsigned long new_pinned; unsigned long cur_base; struct mm_struct *mm; unsigned long npages; @@ -160,12 +161,13 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; down_write(&mm->mmap_sem); - mm->pinned_vm += npages; - if ((mm->pinned_vm > lock_limit) && !capable(CAP_IPC_LOCK)) { + if (check_add_overflow(mm->pinned_vm, npages, &new_pinned) || + (new_pinned > lock_limit && !capable(CAP_IPC_LOCK))) { up_write(&mm->mmap_sem); ret = -ENOMEM; - goto vma; + goto out; } + mm->pinned_vm = new_pinned; up_write(&mm->mmap_sem); cur_base = addr & PAGE_MASK; From 46bdf777685677c1cc6b3da9220aace9da690731 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 26 Sep 2018 21:36:52 +0200 Subject: [PATCH 137/242] RDMA: Fix dependencies for rdma_user_mmap_io The mlx4 driver produces a link error when it is configured as built-in while CONFIG_INFINIBAND_USER_ACCESS is set to =m: drivers/infiniband/hw/mlx4/main.o: In function `mlx4_ib_mmap': main.c:(.text+0x1af4): undefined reference to `rdma_user_mmap_io' The same function is called from mlx5, which already has a dependency to ensure we can call it, and from hns, which appears to suffer from the same problem. This adds the same dependency that mlx5 uses to the other two. Fixes: 6745d356ab39 ("RDMA/hns: Use rdma_user_mmap_io") Fixes: c282da4109e4 ("RDMA/mlx4: Use rdma_user_mmap_io") Signed-off-by: Arnd Bergmann Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/Kconfig | 1 + drivers/infiniband/hw/mlx4/Kconfig | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/infiniband/hw/hns/Kconfig b/drivers/infiniband/hw/hns/Kconfig index fddb5fdf92de..21c2100b2ea9 100644 --- a/drivers/infiniband/hw/hns/Kconfig +++ b/drivers/infiniband/hw/hns/Kconfig @@ -1,6 +1,7 @@ config INFINIBAND_HNS tristate "HNS RoCE Driver" depends on NET_VENDOR_HISILICON + depends on INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS depends on ARM64 || (COMPILE_TEST && 64BIT) ---help--- This is a RoCE/RDMA driver for the Hisilicon RoCE engine. 
The engine diff --git a/drivers/infiniband/hw/mlx4/Kconfig b/drivers/infiniband/hw/mlx4/Kconfig index db4aa13ebae0..d1de3285fd88 100644 --- a/drivers/infiniband/hw/mlx4/Kconfig +++ b/drivers/infiniband/hw/mlx4/Kconfig @@ -1,6 +1,7 @@ config MLX4_INFINIBAND tristate "Mellanox ConnectX HCA support" depends on NETDEVICES && ETHERNET && PCI && INET + depends on INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS depends on MAY_USE_DEVLINK select NET_VENDOR_MELLANOX select MLX4_CORE From e349f858d29f300ad9ad327fd57735a1d15e147f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 25 Sep 2018 16:58:09 -0600 Subject: [PATCH 138/242] RDMA: Fully setup the device name in ib_register_device The current code has two copies of the device name, ibdev->dev and dev_name(&ibdev->dev), and they are setup at different times, which is very confusing. Set them both up at the same time and make dev_name() the lead name, which is the proper use of the driver core APIs. To make it very clear that the name is not valid until registration pass it in to the ib_register_device() call rather than messing with ibdev->name directly. Also the reorganization now checks that dev_name is unique even if it does not contain a %. Signed-off-by: Jason Gunthorpe Acked-by: Adit Ranadive Reviewed-by: Steve Wise Acked-by: Devesh Sharma Reviewed-by: Shiraz Saleem Reviewed-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Reviewed-by: Michael J. Ruhl --- drivers/infiniband/core/device.c | 35 +++++++++++-------- drivers/infiniband/core/sysfs.c | 4 --- drivers/infiniband/hw/bnxt_re/main.c | 3 +- drivers/infiniband/hw/cxgb3/iwch_provider.c | 3 +- drivers/infiniband/hw/cxgb4/provider.c | 3 +- drivers/infiniband/hw/hns/hns_roce_main.c | 3 +- drivers/infiniband/hw/i40iw/i40iw_verbs.c | 3 +- drivers/infiniband/hw/mlx4/main.c | 3 +- drivers/infiniband/hw/mlx5/main.c | 15 ++++---- drivers/infiniband/hw/mthca/mthca_provider.c | 3 +- drivers/infiniband/hw/nes/nes_verbs.c | 3 +- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 3 +- drivers/infiniband/hw/qedr/main.c | 4 +-- drivers/infiniband/hw/usnic/usnic_ib_main.c | 3 +- .../infiniband/hw/vmw_pvrdma/pvrdma_main.c | 3 +- drivers/infiniband/sw/rdmavt/vt.c | 3 +- drivers/infiniband/sw/rxe/rxe_verbs.c | 3 +- include/rdma/ib_verbs.h | 6 ++-- include/rdma/rdma_vt.h | 9 ++++- 19 files changed, 53 insertions(+), 59 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 5a680a88aa87..faacf95699d7 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -170,10 +170,9 @@ static struct ib_device *__ib_device_get_by_name(const char *name) return NULL; } -static int alloc_name(char *name) +static int alloc_name(struct ib_device *ibdev, const char *name) { unsigned long *inuse; - char buf[IB_DEVICE_NAME_MAX]; struct ib_device *device; int i; @@ -182,24 +181,21 @@ static int alloc_name(char *name) return -ENOMEM; list_for_each_entry(device, &device_list, core_list) { - if (!sscanf(device->name, name, &i)) + char buf[IB_DEVICE_NAME_MAX]; + + if (sscanf(device->name, name, &i) != 1) continue; if (i < 0 || i >= PAGE_SIZE * 8) continue; snprintf(buf, sizeof buf, name, i); - if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX)) + if (!strcmp(buf, dev_name(&device->dev))) set_bit(i, inuse); } i = find_first_zero_bit(inuse, PAGE_SIZE * 8); free_page((unsigned long) inuse); - snprintf(buf, sizeof buf, name, i); - if (__ib_device_get_by_name(buf)) - return -ENFILE; - - strlcpy(name, buf, IB_DEVICE_NAME_MAX); - return 0; + return 
dev_set_name(&ibdev->dev, name, i); } static void ib_device_release(struct device *device) @@ -454,9 +450,9 @@ static u32 __dev_new_index(void) * callback for each device that is added. @device must be allocated * with ib_alloc_device(). */ -int ib_register_device(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)) +int ib_register_device(struct ib_device *device, const char *name, + int (*port_callback)(struct ib_device *, u8, + struct kobject *)) { int ret; struct ib_client *client; @@ -495,11 +491,20 @@ int ib_register_device(struct ib_device *device, mutex_lock(&device_mutex); - if (strchr(device->name, '%')) { - ret = alloc_name(device->name); + if (strchr(name, '%')) { + ret = alloc_name(device, name); + if (ret) + goto out; + } else { + ret = dev_set_name(&device->dev, name); if (ret) goto out; } + if (__ib_device_get_by_name(dev_name(&device->dev))) { + ret = -ENFILE; + goto out; + } + strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX); if (ib_device_check_mandatory(device)) { ret = -EINVAL; diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 0b04dbff884f..bc947a863b34 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1311,10 +1311,6 @@ int ib_device_register_sysfs(struct ib_device *device, int ret; int i; - ret = dev_set_name(class_dev, "%s", device->name); - if (ret) - return ret; - device->groups[0] = &dev_attr_group; class_dev->groups = device->groups; diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 20b9f31052bf..73632e5b819f 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -579,7 +579,6 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) /* ib device init */ ibdev->owner = THIS_MODULE; ibdev->node_type = RDMA_NODE_IB_CA; - strlcpy(ibdev->name, "bnxt_re%d", IB_DEVICE_NAME_MAX); strlcpy(ibdev->node_desc, BNXT_RE_DESC " HCA", strlen(BNXT_RE_DESC) + 5); ibdev->phys_port_cnt = 1; @@ -672,7 +671,7 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) ibdev->alloc_hw_stats = bnxt_re_ib_alloc_hw_stats; ibdev->driver_id = RDMA_DRIVER_BNXT_RE; - return ib_register_device(ibdev, NULL); + return ib_register_device(ibdev, "bnxt_re%d", NULL); } static ssize_t show_rev(struct device *device, struct device_attribute *attr, diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 1b9ff21aa1d5..39530cc15f95 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1319,7 +1319,6 @@ int iwch_register_device(struct iwch_dev *dev) int i; pr_debug("%s iwch_dev %p\n", __func__, dev); - strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX); memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6); dev->ibdev.owner = THIS_MODULE; @@ -1402,7 +1401,7 @@ int iwch_register_device(struct iwch_dev *dev) sizeof(dev->ibdev.iwcm->ifname)); dev->ibdev.driver_id = RDMA_DRIVER_CXGB3; - ret = ib_register_device(&dev->ibdev, NULL); + ret = ib_register_device(&dev->ibdev, "cxgb3_%d", NULL); if (ret) goto bail1; diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 4eda6872e617..416f8d1af610 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -535,7 +535,6 @@ void c4iw_register_device(struct work_struct *work) 
struct c4iw_dev *dev = ctx->dev; pr_debug("c4iw_dev %p\n", dev); - strlcpy(dev->ibdev.name, "cxgb4_%d", IB_DEVICE_NAME_MAX); memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); memcpy(&dev->ibdev.node_guid, dev->rdev.lldi.ports[0]->dev_addr, 6); dev->ibdev.owner = THIS_MODULE; @@ -627,7 +626,7 @@ void c4iw_register_device(struct work_struct *work) sizeof(dev->ibdev.iwcm->ifname)); dev->ibdev.driver_id = RDMA_DRIVER_CXGB4; - ret = ib_register_device(&dev->ibdev, NULL); + ret = ib_register_device(&dev->ibdev, "cxgb4_%d", NULL); if (ret) goto err_kfree_iwcm; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 6edb547baee8..5a86a48cba13 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -449,7 +449,6 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) spin_lock_init(&iboe->lock); ib_dev = &hr_dev->ib_dev; - strlcpy(ib_dev->name, "hns_%d", IB_DEVICE_NAME_MAX); ib_dev->owner = THIS_MODULE; ib_dev->node_type = RDMA_NODE_IB_CA; @@ -530,7 +529,7 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) ib_dev->disassociate_ucontext = hns_roce_disassociate_ucontext; ib_dev->driver_id = RDMA_DRIVER_HNS; - ret = ib_register_device(ib_dev, NULL); + ret = ib_register_device(ib_dev, "hns_%d", NULL); if (ret) { dev_err(dev, "ib_register_device failed!\n"); return ret; diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index e2e6c74a7452..cb2aef874ca8 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2752,7 +2752,6 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev i40iw_pr_err("iwdev == NULL\n"); return NULL; } - strlcpy(iwibdev->ibdev.name, "i40iw%d", IB_DEVICE_NAME_MAX); iwibdev->ibdev.owner = THIS_MODULE; iwdev->iwibdev = iwibdev; iwibdev->iwdev = iwdev; @@ -2897,7 +2896,7 @@ int i40iw_register_rdma_device(struct i40iw_device *iwdev) iwibdev = iwdev->iwibdev; iwibdev->ibdev.driver_id = RDMA_DRIVER_I40IW; - ret = ib_register_device(&iwibdev->ibdev, NULL); + ret = ib_register_device(&iwibdev->ibdev, "i40iw%d", NULL); if (ret) goto error; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index bf3cdb88aaf5..fa5d20eccc21 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2540,7 +2540,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) ibdev->dev = dev; ibdev->bond_next_port = 0; - strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX); ibdev->ib_dev.owner = THIS_MODULE; ibdev->ib_dev.node_type = RDMA_NODE_IB_CA; ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey; @@ -2803,7 +2802,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) goto err_steer_free_bitmap; ibdev->ib_dev.driver_id = RDMA_DRIVER_MLX4; - if (ib_register_device(&ibdev->ib_dev, NULL)) + if (ib_register_device(&ibdev->ib_dev, "mlx4_%d", NULL)) goto err_diag_counters; if (mlx4_ib_mad_init(ibdev)) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index fb1e3c546826..597cd3c171c9 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -5671,7 +5671,6 @@ void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) { struct mlx5_core_dev *mdev = dev->mdev; - const char *name; int err; int i; @@ -5704,12 +5703,6 @@ int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) if 
(mlx5_use_mad_ifc(dev)) get_ext_port_caps(dev); - if (!mlx5_lag_is_active(mdev)) - name = "mlx5_%d"; - else - name = "mlx5_bond_%d"; - - strlcpy(dev->ib_dev.name, name, IB_DEVICE_NAME_MAX); dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.node_type = RDMA_NODE_IB_CA; dev->ib_dev.local_dma_lkey = 0 /* not supported for now */; @@ -6122,7 +6115,13 @@ static int mlx5_ib_stage_populate_specs(struct mlx5_ib_dev *dev) int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) { - return ib_register_device(&dev->ib_dev, NULL); + const char *name; + + if (!mlx5_lag_is_active(dev->mdev)) + name = "mlx5_%d"; + else + name = "mlx5_bond_%d"; + return ib_register_device(&dev->ib_dev, name, NULL); } void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev) diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 0d3473b4596e..7bd7e2ad17e4 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1198,7 +1198,6 @@ int mthca_register_device(struct mthca_dev *dev) if (ret) return ret; - strlcpy(dev->ib_dev.name, "mthca%d", IB_DEVICE_NAME_MAX); dev->ib_dev.owner = THIS_MODULE; dev->ib_dev.uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION; @@ -1297,7 +1296,7 @@ int mthca_register_device(struct mthca_dev *dev) mutex_init(&dev->cap_mask_mutex); dev->ib_dev.driver_id = RDMA_DRIVER_MTHCA; - ret = ib_register_device(&dev->ib_dev, NULL); + ret = ib_register_device(&dev->ib_dev, "mthca%d", NULL); if (ret) return ret; diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 6940c7215961..2127cd2f4bec 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -3640,7 +3640,6 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) if (nesibdev == NULL) { return NULL; } - strlcpy(nesibdev->ibdev.name, "nes%d", IB_DEVICE_NAME_MAX); nesibdev->ibdev.owner = THIS_MODULE; nesibdev->ibdev.node_type = RDMA_NODE_RNIC; @@ -3798,7 +3797,7 @@ int nes_register_ofa_device(struct nes_ib_device *nesibdev) int i, ret; nesvnic->nesibdev->ibdev.driver_id = RDMA_DRIVER_NES; - ret = ib_register_device(&nesvnic->nesibdev->ibdev, NULL); + ret = ib_register_device(&nesvnic->nesibdev->ibdev, "nes%d", NULL); if (ret) { return ret; } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 7832ee3e0c84..4d3c27613351 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -116,7 +116,6 @@ static void get_dev_fw_str(struct ib_device *device, char *str) static int ocrdma_register_device(struct ocrdma_dev *dev) { - strlcpy(dev->ibdev.name, "ocrdma%d", IB_DEVICE_NAME_MAX); ocrdma_get_guid(dev, (u8 *)&dev->ibdev.node_guid); BUILD_BUG_ON(sizeof(OCRDMA_NODE_DESC) > IB_DEVICE_NODE_DESC_MAX); memcpy(dev->ibdev.node_desc, OCRDMA_NODE_DESC, @@ -214,7 +213,7 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) dev->ibdev.post_srq_recv = ocrdma_post_srq_recv; } dev->ibdev.driver_id = RDMA_DRIVER_OCRDMA; - return ib_register_device(&dev->ibdev, NULL); + return ib_register_device(&dev->ibdev, "ocrdma%d", NULL); } static int ocrdma_alloc_resources(struct ocrdma_dev *dev) diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index a0af6d424aed..cd7b8b39a129 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -170,8 +170,6 @@ static int qedr_register_device(struct qedr_dev *dev) { int rc; - 
strlcpy(dev->ibdev.name, "qedr%d", IB_DEVICE_NAME_MAX); - dev->ibdev.node_guid = dev->attr.node_guid; memcpy(dev->ibdev.node_desc, QEDR_NODE_DESC, sizeof(QEDR_NODE_DESC)); dev->ibdev.owner = THIS_MODULE; @@ -264,7 +262,7 @@ static int qedr_register_device(struct qedr_dev *dev) dev->ibdev.get_dev_fw_str = qedr_get_dev_fw_str; dev->ibdev.driver_id = RDMA_DRIVER_QEDR; - return ib_register_device(&dev->ibdev, NULL); + return ib_register_device(&dev->ibdev, "qedr%d", NULL); } /* This function allocates fast-path status block memory */ diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index f0538a460328..3b9f12928314 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -364,7 +364,6 @@ static void *usnic_ib_device_add(struct pci_dev *dev) us_ibdev->ib_dev.num_comp_vectors = USNIC_IB_NUM_COMP_VECTORS; us_ibdev->ib_dev.dev.parent = &dev->dev; us_ibdev->ib_dev.uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION; - strlcpy(us_ibdev->ib_dev.name, "usnic_%d", IB_DEVICE_NAME_MAX); us_ibdev->ib_dev.uverbs_cmd_mask = (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) | @@ -416,7 +415,7 @@ static void *usnic_ib_device_add(struct pci_dev *dev) us_ibdev->ib_dev.driver_id = RDMA_DRIVER_USNIC; - if (ib_register_device(&us_ibdev->ib_dev, NULL)) + if (ib_register_device(&us_ibdev->ib_dev, "usnic_%d", NULL)) goto err_fwd_dealloc; usnic_fwd_set_mtu(us_ibdev->ufdev, us_ibdev->netdev->mtu); diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index a5719899f49a..6878107fc637 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -162,7 +162,6 @@ static int pvrdma_register_device(struct pvrdma_dev *dev) int ret = -1; int i = 0; - strlcpy(dev->ib_dev.name, "vmw_pvrdma%d", IB_DEVICE_NAME_MAX); dev->ib_dev.node_guid = dev->dsr->caps.node_guid; dev->sys_image_guid = dev->dsr->caps.sys_image_guid; dev->flags = 0; @@ -267,7 +266,7 @@ static int pvrdma_register_device(struct pvrdma_dev *dev) dev->ib_dev.driver_id = RDMA_DRIVER_VMW_PVRDMA; spin_lock_init(&dev->srq_tbl_lock); - ret = ib_register_device(&dev->ib_dev, NULL); + ret = ib_register_device(&dev->ib_dev, "vmw_pvrdma%d", NULL); if (ret) goto err_srq_free; diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index 17e4abc067af..e3249d46bcef 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -828,7 +828,8 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) rdi->ibdev.driver_id = driver_id; /* We are now good to announce we exist */ - ret = ib_register_device(&rdi->ibdev, rdi->driver_f.port_callback); + ret = ib_register_device(&rdi->ibdev, dev_name(&rdi->ibdev.dev), + rdi->driver_f.port_callback); if (ret) { rvt_pr_err(rdi, "Failed to register driver with ib core.\n"); goto bail_mr; diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index f5b1e0ad6142..e4da5b671e4a 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1159,7 +1159,6 @@ int rxe_register_device(struct rxe_dev *rxe) struct ib_device *dev = &rxe->ib_dev; struct crypto_shash *tfm; - strlcpy(dev->name, "rxe%d", IB_DEVICE_NAME_MAX); strlcpy(dev->node_desc, "rxe", sizeof(dev->node_desc)); dev->owner = THIS_MODULE; @@ -1261,7 +1260,7 @@ int rxe_register_device(struct rxe_dev *rxe) rxe->tfm = tfm; dev->driver_id = RDMA_DRIVER_RXE; - err = 
ib_register_device(dev, NULL); + err = ib_register_device(dev, "rxe%d", NULL); if (err) { pr_warn("%s failed with error %d\n", __func__, err); goto err1; diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 0d822a9db300..9897d2329f2c 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2625,9 +2625,9 @@ void ib_dealloc_device(struct ib_device *device); void ib_get_device_fw_str(struct ib_device *device, char *str); -int ib_register_device(struct ib_device *device, - int (*port_callback)(struct ib_device *, - u8, struct kobject *)); +int ib_register_device(struct ib_device *device, const char *name, + int (*port_callback)(struct ib_device *, u8, + struct kobject *)); void ib_unregister_device(struct ib_device *device); int ib_register_client (struct ib_client *client); diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index e32facdd9fd3..065c9fbe6589 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -429,7 +429,14 @@ static inline void rvt_set_ibdev_name(struct rvt_dev_info *rdi, const char *fmt, const char *name, const int unit) { - snprintf(rdi->ibdev.name, sizeof(rdi->ibdev.name), fmt, name, unit); + /* + * FIXME: rvt and its users want to touch the ibdev before + * registration and have things like the name work. We don't have the + * infrastructure in the core to support this directly today, hack it + * to work by setting the name manually here. + */ + dev_set_name(&rdi->ibdev.dev, fmt, name, unit); + strlcpy(rdi->ibdev.name, dev_name(&rdi->ibdev.dev), IB_DEVICE_NAME_MAX); } /** From 43c7c851b9bce9e6091f2c882871a3b388aa38c3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 20 Sep 2018 16:42:23 -0600 Subject: [PATCH 139/242] RDMA/core: Use dev_err/dbg/etc instead of pr_* + ibdev->name Any messages related to a device should be printed with the dev_* formatters. This provides greater consistency for the user. The core does not set pr_fmt so this has no significant change. 
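The conversion is mechanical: every pr_*() call that interpolated device->name by hand becomes the matching dev_*() call on &device->dev, which prefixes the message with the device name automatically. A representative before/after pair from the cache.c hunk below:

/* before: device name passed by hand */
pr_warn("ib_query_port failed (%d) for %s\n", ret, device->name);

/* after: the driver core prefixes the message with the device name */
dev_warn(&device->dev, "ib_query_port failed (%d)\n", ret);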
Signed-off-by: Jason Gunthorpe Reviewed-by: Steve Wise Reviewed-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro --- drivers/infiniband/core/cache.c | 46 ++++++++++++++---------------- drivers/infiniband/core/cma.c | 11 +++---- drivers/infiniband/core/device.c | 33 ++++++++++++--------- drivers/infiniband/core/fmr_pool.c | 2 +- drivers/infiniband/core/restrack.c | 3 +- drivers/infiniband/core/verbs.c | 10 ++++--- 6 files changed, 55 insertions(+), 50 deletions(-) diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 8957d31d60ca..ebc64418d809 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -212,9 +212,8 @@ static void free_gid_entry_locked(struct ib_gid_table_entry *entry) u8 port_num = entry->attr.port_num; struct ib_gid_table *table = rdma_gid_table(device, port_num); - pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, - device->name, port_num, entry->attr.index, - entry->attr.gid.raw); + dev_dbg(&device->dev, "%s port=%d index=%d gid %pI6\n", __func__, + port_num, entry->attr.index, entry->attr.gid.raw); if (rdma_cap_roce_gid_table(device, port_num) && entry->state != GID_TABLE_ENTRY_INVALID) @@ -289,9 +288,9 @@ static void store_gid_entry(struct ib_gid_table *table, { entry->state = GID_TABLE_ENTRY_VALID; - pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, - entry->attr.device->name, entry->attr.port_num, - entry->attr.index, entry->attr.gid.raw); + dev_dbg(&entry->attr.device->dev, "%s port=%d index=%d gid %pI6\n", + __func__, entry->attr.port_num, entry->attr.index, + entry->attr.gid.raw); lockdep_assert_held(&table->lock); write_lock_irq(&table->rwlock); @@ -320,17 +319,16 @@ static int add_roce_gid(struct ib_gid_table_entry *entry) int ret; if (!attr->ndev) { - pr_err("%s NULL netdev device=%s port=%d index=%d\n", - __func__, attr->device->name, attr->port_num, - attr->index); + dev_err(&attr->device->dev, "%s NULL netdev port=%d index=%d\n", + __func__, attr->port_num, attr->index); return -EINVAL; } if (rdma_cap_roce_gid_table(attr->device, attr->port_num)) { ret = attr->device->add_gid(attr, &entry->context); if (ret) { - pr_err("%s GID add failed device=%s port=%d index=%d\n", - __func__, attr->device->name, attr->port_num, - attr->index); + dev_err(&attr->device->dev, + "%s GID add failed port=%d index=%d\n", + __func__, attr->port_num, attr->index); return ret; } } @@ -402,9 +400,8 @@ static void del_gid(struct ib_device *ib_dev, u8 port, lockdep_assert_held(&table->lock); - pr_debug("%s device=%s port=%d index=%d gid %pI6\n", __func__, - ib_dev->name, port, ix, - table->data_vec[ix]->attr.gid.raw); + dev_dbg(&ib_dev->dev, "%s port=%d index=%d gid %pI6\n", __func__, port, + ix, table->data_vec[ix]->attr.gid.raw); write_lock_irq(&table->rwlock); entry = table->data_vec[ix]; @@ -782,9 +779,9 @@ static void release_gid_table(struct ib_device *device, u8 port, if (is_gid_entry_free(table->data_vec[i])) continue; if (kref_read(&table->data_vec[i]->kref) > 1) { - pr_err("GID entry ref leak for %s (index %d) ref=%d\n", - device->name, i, - kref_read(&table->data_vec[i]->kref)); + dev_err(&device->dev, + "GID entry ref leak for index %d ref=%d\n", i, + kref_read(&table->data_vec[i]->kref)); leak = true; } } @@ -1303,8 +1300,9 @@ static int config_non_roce_gid_cache(struct ib_device *device, continue; ret = device->query_gid(device, port, i, &gid_attr.gid); if (ret) { - pr_warn("query_gid failed (%d) for %s (index %d)\n", - ret, device->name, i); + dev_warn(&device->dev, + "query_gid 
failed (%d) for index %d\n", ret, + i); goto err; } gid_attr.index = i; @@ -1333,8 +1331,7 @@ static void ib_cache_update(struct ib_device *device, ret = ib_query_port(device, port, tprops); if (ret) { - pr_warn("ib_query_port failed (%d) for %s\n", - ret, device->name); + dev_warn(&device->dev, "ib_query_port failed (%d)\n", ret); goto err; } @@ -1356,8 +1353,9 @@ static void ib_cache_update(struct ib_device *device, for (i = 0; i < pkey_cache->table_len; ++i) { ret = ib_query_pkey(device, port, i, pkey_cache->table + i); if (ret) { - pr_warn("ib_query_pkey failed (%d) for %s (index %d)\n", - ret, device->name, i); + dev_warn(&device->dev, + "ib_query_pkey failed (%d) for index %d\n", + ret, i); goto err; } } diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index a57c8b823302..c650223c52bf 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2352,8 +2352,8 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, ret = rdma_listen(id, id_priv->backlog); if (ret) - pr_warn("RDMA CMA: cma_listen_on_dev, error %d, listening on device %s\n", - ret, cma_dev->device->name); + dev_warn(&cma_dev->device->dev, + "RDMA CMA: cma_listen_on_dev, error %d\n", ret); } static void cma_listen_on_all(struct rdma_id_private *id_priv) @@ -4082,9 +4082,10 @@ static int cma_join_ib_multicast(struct rdma_id_private *id_priv, (!ib_sa_sendonly_fullmem_support(&sa_client, id_priv->id.device, id_priv->id.port_num))) { - pr_warn("RDMA CM: %s port %u Unable to multicast join\n" - "RDMA CM: SM doesn't support Send Only Full Member option\n", - id_priv->id.device->name, id_priv->id.port_num); + dev_warn( + &id_priv->id.device->dev, + "RDMA CM: port %u Unable to multicast join: SM doesn't support Send Only Full Member option\n", + id_priv->id.port_num); return -EOPNOTSUPP; } diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index faacf95699d7..7c3ff43092fd 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -123,8 +123,9 @@ static int ib_device_check_mandatory(struct ib_device *device) for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) { if (!*(void **) ((void *) device + mandatory_table[i].offset)) { - pr_warn("Device %s is missing mandatory function %s\n", - device->name, mandatory_table[i].name); + dev_warn(&device->dev, + "Device is missing mandatory function %s\n", + mandatory_table[i].name); return -EINVAL; } } @@ -513,20 +514,21 @@ int ib_register_device(struct ib_device *device, const char *name, ret = read_port_immutable(device); if (ret) { - pr_warn("Couldn't create per port immutable data %s\n", - device->name); + dev_warn(&device->dev, + "Couldn't create per port immutable data\n"); goto out; } ret = setup_port_pkey_list(device); if (ret) { - pr_warn("Couldn't create per port_pkey_list\n"); + dev_warn(&device->dev, "Couldn't create per port_pkey_list\n"); goto out; } ret = ib_cache_setup_one(device); if (ret) { - pr_warn("Couldn't set up InfiniBand P_Key/GID cache\n"); + dev_warn(&device->dev, + "Couldn't set up InfiniBand P_Key/GID cache\n"); goto port_cleanup; } @@ -534,21 +536,23 @@ int ib_register_device(struct ib_device *device, const char *name, ret = ib_device_register_rdmacg(device); if (ret) { - pr_warn("Couldn't register device with rdma cgroup\n"); + dev_warn(&device->dev, + "Couldn't register device with rdma cgroup\n"); goto cache_cleanup; } memset(&device->attrs, 0, sizeof(device->attrs)); ret = device->query_device(device, &device->attrs, &uhw); if (ret) { - 
pr_warn("Couldn't query the device attributes\n"); + dev_warn(&device->dev, + "Couldn't query the device attributes\n"); goto cg_cleanup; } ret = ib_device_register_sysfs(device, port_callback); if (ret) { - pr_warn("Couldn't register device %s with driver model\n", - device->name); + dev_warn(&device->dev, + "Couldn't register device with driver model\n"); goto cg_cleanup; } @@ -699,8 +703,9 @@ void ib_unregister_client(struct ib_client *client) found_context->data : NULL); if (!found_context) { - pr_warn("No client context found for %s/%s\n", - device->name, client->name); + dev_warn(&device->dev, + "No client context found for %s\n", + client->name); continue; } @@ -764,8 +769,8 @@ void ib_set_client_data(struct ib_device *device, struct ib_client *client, goto out; } - pr_warn("No client context found for %s/%s\n", - device->name, client->name); + dev_warn(&device->dev, "No client context found for %s\n", + client->name); out: write_unlock_irqrestore(&device->client_data_lock, flags); diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index a077500f7f32..d1f2eee7d5da 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -213,7 +213,7 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, device = pd->device; if (!device->alloc_fmr || !device->dealloc_fmr || !device->map_phys_fmr || !device->unmap_fmr) { - pr_info(PFX "Device %s does not support FMRs\n", device->name); + dev_info(&device->dev, "Device does not support FMRs\n"); return ERR_PTR(-ENOSYS); } diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 3b7fa0ccaa08..bcc693fffd4c 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -50,8 +50,7 @@ void rdma_restrack_clean(struct rdma_restrack_root *res) dev = container_of(res, struct ib_device, res); pr_err("restrack: %s", CUT_HERE); - pr_err("restrack: BUG: RESTRACK detected leak of resources on %s\n", - dev->name); + dev_err(&dev->dev, "BUG: RESTRACK detected leak of resources\n"); hash_for_each(res->hash, bkt, e, node) { if (rdma_is_kernel_res(e)) { owner = e->kern_name; diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index c36be384fe34..ee5fc8408add 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1629,14 +1629,16 @@ static int _ib_modify_qp(struct ib_qp *qp, struct ib_qp_attr *attr, if (rdma_ib_or_roce(qp->device, port)) { if (attr_mask & IB_QP_RQ_PSN && attr->rq_psn & ~0xffffff) { - pr_warn("%s: %s rq_psn overflow, masking to 24 bits\n", - __func__, qp->device->name); + dev_warn(&qp->device->dev, + "%s rq_psn overflow, masking to 24 bits\n", + __func__); attr->rq_psn &= 0xffffff; } if (attr_mask & IB_QP_SQ_PSN && attr->sq_psn & ~0xffffff) { - pr_warn("%s: %s sq_psn overflow, masking to 24 bits\n", - __func__, qp->device->name); + dev_warn(&qp->device->dev, + " %s sq_psn overflow, masking to 24 bits\n", + __func__); attr->sq_psn &= 0xffffff; } } From 5a738b5d47050b77ac8aa90bd79429940533ef6a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 20 Sep 2018 16:42:24 -0600 Subject: [PATCH 140/242] RDMA/drivers: Use dev_err/dbg/etc instead of pr_* + ibdev->name Kernel convention is that a driver for a subsystem will print using dev_* on the subsystem's struct device, or with dev_* on the physical device. Drivers should rarely use a pr_* function. 
Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/mlx5_ib.h | 18 +++++++++--------- drivers/infiniband/hw/mlx5/qp.c | 10 ++++++---- drivers/infiniband/hw/mthca/mthca_mad.c | 5 +++-- drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 2 +- drivers/infiniband/sw/rxe/rxe_net.c | 4 ++-- drivers/infiniband/sw/rxe/rxe_sysfs.c | 2 +- 6 files changed, 22 insertions(+), 19 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 94cf83bd1716..5ada28d6e5e9 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -50,17 +50,17 @@ #include #include -#define mlx5_ib_dbg(dev, format, arg...) \ -pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ - __LINE__, current->pid, ##arg) +#define mlx5_ib_dbg(_dev, format, arg...) \ + dev_dbg(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__, \ + __LINE__, current->pid, ##arg) -#define mlx5_ib_err(dev, format, arg...) \ -pr_err("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ - __LINE__, current->pid, ##arg) +#define mlx5_ib_err(_dev, format, arg...) \ + dev_err(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__, \ + __LINE__, current->pid, ##arg) -#define mlx5_ib_warn(dev, format, arg...) \ -pr_warn("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ - __LINE__, current->pid, ##arg) +#define mlx5_ib_warn(_dev, format, arg...) \ + dev_warn(&(_dev)->ib_dev.dev, "%s:%d:(pid %d): " format, __func__, \ + __LINE__, current->pid, ##arg) #define field_avail(type, fld, sz) (offsetof(type, fld) + \ sizeof(((type *)0)->fld) <= (sz)) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index a0cb260a271f..3455b50705cd 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -2826,8 +2826,9 @@ static int modify_raw_packet_qp_rq( MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID); MLX5_SET(rqc, rqc, counter_set_id, raw_qp_param->rq_q_ctr_id); } else - pr_info_once("%s: RAW PACKET QP counters are not supported on current FW\n", - dev->ib_dev.name); + dev_info_once( + &dev->ib_dev.dev, + "RAW PACKET QP counters are not supported on current FW\n"); } err = mlx5_core_modify_rq(dev->mdev, rq->base.mqp.qpn, in, inlen); @@ -5798,8 +5799,9 @@ int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, MLX5_SET(rqc, rqc, counter_set_id, dev->port->cnts.set_id); } else - pr_info_once("%s: Receive WQ counters are not supported on current FW\n", - dev->ib_dev.name); + dev_info_once( + &dev->ib_dev.dev, + "Receive WQ counters are not supported on current FW\n"); } err = mlx5_core_modify_rq(dev->mdev, rwq->core_qp.qpn, in, inlen); diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c index 093f7755c843..2e5dc0a67cfc 100644 --- a/drivers/infiniband/hw/mthca/mthca_mad.c +++ b/drivers/infiniband/hw/mthca/mthca_mad.c @@ -58,8 +58,9 @@ static int mthca_update_rate(struct mthca_dev *dev, u8 port_num) ret = ib_query_port(&dev->ib_dev, port_num, tprops); if (ret) { - printk(KERN_WARNING "ib_query_port failed (%d) for %s port %d\n", - ret, dev->ib_dev.name, port_num); + dev_warn(&dev->ib_dev.dev, + "ib_query_port failed (%d) for port %d\n", ret, + port_num); goto out; } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c index e578281471af..241a57a07485 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c @@ -792,7 +792,7 @@ static void 
ocrdma_dispatch_ibevent(struct ocrdma_dev *dev, qp->srq->ibsrq. srq_context); } else if (dev_event) { - pr_err("%s: Fatal event received\n", dev->ibdev.name); + dev_err(&dev->ibdev.dev, "Fatal event received\n"); ib_dispatch_event(&ib_evt); } diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index ed823cccca48..8e658ce439b4 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -615,7 +615,7 @@ void rxe_port_up(struct rxe_dev *rxe) port->attr.phys_state = IB_PHYS_STATE_LINK_UP; rxe_port_event(rxe, IB_EVENT_PORT_ACTIVE); - pr_info("set %s active\n", rxe->ib_dev.name); + dev_info(&rxe->ib_dev.dev, "set active\n"); } /* Caller must hold net_info_lock */ @@ -628,7 +628,7 @@ void rxe_port_down(struct rxe_dev *rxe) port->attr.phys_state = IB_PHYS_STATE_LINK_DOWN; rxe_port_event(rxe, IB_EVENT_PORT_ERR); - pr_info("set %s down\n", rxe->ib_dev.name); + dev_info(&rxe->ib_dev.dev, "set down\n"); } static int rxe_notify(struct notifier_block *not_blk, diff --git a/drivers/infiniband/sw/rxe/rxe_sysfs.c b/drivers/infiniband/sw/rxe/rxe_sysfs.c index d5ed7571128f..73a19f808e1b 100644 --- a/drivers/infiniband/sw/rxe/rxe_sysfs.c +++ b/drivers/infiniband/sw/rxe/rxe_sysfs.c @@ -105,7 +105,7 @@ static int rxe_param_set_add(const char *val, const struct kernel_param *kp) } rxe_set_port_state(ndev); - pr_info("added %s to %s\n", rxe->ib_dev.name, intf); + dev_info(&rxe->ib_dev.dev, "added %s\n", intf); err: if (ndev) dev_put(ndev); From 896de0090a85f4c3a2b37fc0f46215a73c5b5429 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 20 Sep 2018 16:42:25 -0600 Subject: [PATCH 141/242] RDMA/core: Use dev_name instead of ibdev->name These return the same thing but dev_name is a more conventional use of the kernel API. 
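A minimal sketch of the preferred form (ibdev_name_matches() is a hypothetical helper, not part of the patch):

	static bool ibdev_name_matches(struct ib_device *ibdev,
				       const char *name)
	{
		/* dev_name() returns the kobject name that sysfs and
		 * uevents expose, which is what makes it the
		 * conventional accessor.
		 */
		return !strcmp(name, dev_name(&ibdev->dev));
	}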
Signed-off-by: Jason Gunthorpe Reviewed-by: Steve Wise Reviewed-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro --- drivers/infiniband/core/cm.c | 2 +- drivers/infiniband/core/cma_configfs.c | 2 +- drivers/infiniband/core/device.c | 8 +++----- drivers/infiniband/core/fmr_pool.c | 3 ++- drivers/infiniband/core/iwcm.c | 2 +- drivers/infiniband/core/nldev.c | 3 ++- drivers/infiniband/core/sa_query.c | 2 +- drivers/infiniband/core/security.c | 7 +++---- drivers/infiniband/core/user_mad.c | 2 +- drivers/infiniband/core/uverbs_main.c | 2 +- 10 files changed, 16 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 6e39c27dca8e..a6a20603ccea 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -4367,7 +4367,7 @@ static void cm_add_one(struct ib_device *ib_device) cm_dev->going_down = 0; cm_dev->device = device_create(&cm_class, &ib_device->dev, MKDEV(0, 0), NULL, - "%s", ib_device->name); + "%s", dev_name(&ib_device->dev)); if (IS_ERR(cm_dev->device)) { kfree(cm_dev); return; diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c index eee38b40be99..8c2dfb3e294e 100644 --- a/drivers/infiniband/core/cma_configfs.c +++ b/drivers/infiniband/core/cma_configfs.c @@ -65,7 +65,7 @@ static struct cma_dev_port_group *to_dev_port_group(struct config_item *item) static bool filter_by_name(struct ib_device *ib_dev, void *cookie) { - return !strcmp(ib_dev->name, cookie); + return !strcmp(dev_name(&ib_dev->dev), cookie); } static int cma_configfs_params_get(struct config_item *item, diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 7c3ff43092fd..d105b9b2d118 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -165,7 +165,7 @@ static struct ib_device *__ib_device_get_by_name(const char *name) struct ib_device *device; list_for_each_entry(device, &device_list, core_list) - if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX)) + if (!strcmp(name, dev_name(&device->dev))) return device; return NULL; @@ -184,7 +184,7 @@ static int alloc_name(struct ib_device *ibdev, const char *name) list_for_each_entry(device, &device_list, core_list) { char buf[IB_DEVICE_NAME_MAX]; - if (sscanf(device->name, name, &i) != 1) + if (sscanf(dev_name(&device->dev), name, &i) != 1) continue; if (i < 0 || i >= PAGE_SIZE * 8) continue; @@ -219,9 +219,7 @@ static void ib_device_release(struct device *device) static int ib_device_uevent(struct device *device, struct kobj_uevent_env *env) { - struct ib_device *dev = container_of(device, struct ib_device, dev); - - if (add_uevent_var(env, "NAME=%s", dev->name)) + if (add_uevent_var(env, "NAME=%s", dev_name(device))) return -ENOMEM; /* diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index d1f2eee7d5da..83ba0068e8bb 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -257,7 +257,8 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, atomic_set(&pool->flush_ser, 0); init_waitqueue_head(&pool->force_wait); - pool->worker = kthread_create_worker(0, "ib_fmr(%s)", device->name); + pool->worker = + kthread_create_worker(0, "ib_fmr(%s)", dev_name(&device->dev)); if (IS_ERR(pool->worker)) { pr_warn(PFX "couldn't start cleanup kthread worker\n"); ret = PTR_ERR(pool->worker); diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 5d676cff41f4..ba668d49c751 100644 --- a/drivers/infiniband/core/iwcm.c 
+++ b/drivers/infiniband/core/iwcm.c @@ -509,7 +509,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active) cm_id->m_local_addr = cm_id->local_addr; cm_id->m_remote_addr = cm_id->remote_addr; - memcpy(pm_reg_msg.dev_name, cm_id->device->name, + memcpy(pm_reg_msg.dev_name, dev_name(&cm_id->device->dev), sizeof(pm_reg_msg.dev_name)); memcpy(pm_reg_msg.if_name, cm_id->device->iwcm->ifname, sizeof(pm_reg_msg.if_name)); diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 0385ab438320..ba5403fbcd88 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -179,7 +179,8 @@ static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device) { if (nla_put_u32(msg, RDMA_NLDEV_ATTR_DEV_INDEX, device->index)) return -EMSGSIZE; - if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, device->name)) + if (nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_NAME, + dev_name(&device->dev))) return -EMSGSIZE; return 0; diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index d3d6275b3b7e..19e1833e13fc 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -761,7 +761,7 @@ static void ib_nl_set_path_rec_attrs(struct sk_buff *skb, /* Construct the family header first */ header = skb_put(skb, NLMSG_ALIGN(sizeof(*header))); - memcpy(header->device_name, query->port->agent->device->name, + memcpy(header->device_name, dev_name(&query->port->agent->device->dev), LS_DEVICE_NAME_MAX); header->port_num = query->port->port_num; diff --git a/drivers/infiniband/core/security.c b/drivers/infiniband/core/security.c index 9b0bea8303e0..1143c0448666 100644 --- a/drivers/infiniband/core/security.c +++ b/drivers/infiniband/core/security.c @@ -685,9 +685,8 @@ static int ib_mad_agent_security_change(struct notifier_block *nb, if (event != LSM_POLICY_CHANGE) return NOTIFY_DONE; - ag->smp_allowed = !security_ib_endport_manage_subnet(ag->security, - ag->device->name, - ag->port_num); + ag->smp_allowed = !security_ib_endport_manage_subnet( + ag->security, dev_name(&ag->device->dev), ag->port_num); return NOTIFY_OK; } @@ -708,7 +707,7 @@ int ib_mad_agent_security_setup(struct ib_mad_agent *agent, return 0; ret = security_ib_endport_manage_subnet(agent->security, - agent->device->name, + dev_name(&agent->device->dev), agent->port_num); if (ret) return ret; diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index c34a6852d691..9961859da06a 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -1132,7 +1132,7 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, if (!port) return -ENODEV; - return sprintf(buf, "%s\n", port->ib_dev->name); + return sprintf(buf, "%s\n", dev_name(&port->ib_dev->dev)); } static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index 8d56773aac56..12d8f8097574 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -1179,7 +1179,7 @@ static ssize_t ibdev_show(struct device *device, struct device_attribute *attr, srcu_key = srcu_read_lock(&dev->disassociate_srcu); ib_dev = srcu_dereference(dev->ib_dev, &dev->disassociate_srcu); if (ib_dev) - ret = sprintf(buf, "%s\n", ib_dev->name); + ret = sprintf(buf, "%s\n", dev_name(&ib_dev->dev)); srcu_read_unlock(&dev->disassociate_srcu, srcu_key); return ret; From 9de6986148360bdc954e8621cf024f4c830ddb13 Mon Sep 17 
00:00:00 2001 From: Jason Gunthorpe Date: Thu, 20 Sep 2018 16:42:26 -0600 Subject: [PATCH 142/242] RDMA/drivers: Use dev_name instead of ibdev->name These return the same thing but dev_name is a more conventional use of the kernel API. Signed-off-by: Jason Gunthorpe Reviewed-by: Leon Romanovsky --- drivers/infiniband/hw/nes/nes_verbs.c | 2 +- drivers/infiniband/hw/ocrdma/ocrdma_stats.c | 3 +- drivers/infiniband/hw/qedr/qedr.h | 2 +- drivers/infiniband/hw/usnic/usnic_ib_main.c | 34 ++++++++++--------- drivers/infiniband/hw/usnic/usnic_ib_sysfs.c | 6 ++-- drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 16 +++++---- .../infiniband/hw/vmw_pvrdma/pvrdma_main.c | 2 +- drivers/infiniband/sw/rxe/rxe_net.c | 2 +- 8 files changed, 36 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 2127cd2f4bec..94054bc611bd 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -687,7 +687,7 @@ static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev, } nes_debug(NES_DBG_PD, "Allocating PD (%p) for ib device %s\n", - nespd, nesvnic->nesibdev->ibdev.name); + nespd, dev_name(&nesvnic->nesibdev->ibdev.dev)); nespd->pd_id = (pd_num << (PAGE_SHIFT-12)) + nesadapter->base_pd; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c index 24d20a4aa262..290d776edf48 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c @@ -764,7 +764,8 @@ void ocrdma_add_port_stats(struct ocrdma_dev *dev) return; /* Create post stats base dir */ - dev->dir = debugfs_create_dir(dev->ibdev.name, ocrdma_dbgfs_dir); + dev->dir = + debugfs_create_dir(dev_name(&dev->ibdev.dev), ocrdma_dbgfs_dir); if (!dev->dir) goto err; diff --git a/drivers/infiniband/hw/qedr/qedr.h b/drivers/infiniband/hw/qedr/qedr.h index a2d708dceb8d..53bbe6b4e6e6 100644 --- a/drivers/infiniband/hw/qedr/qedr.h +++ b/drivers/infiniband/hw/qedr/qedr.h @@ -43,7 +43,7 @@ #include "qedr_hsi_rdma.h" #define QEDR_NODE_DESC "QLogic 579xx RoCE HCA" -#define DP_NAME(dev) ((dev)->ibdev.name) +#define DP_NAME(_dev) dev_name(&(_dev)->ibdev.dev) #define IS_IWARP(_dev) ((_dev)->rdma_type == QED_RDMA_TYPE_IWARP) #define IS_ROCE(_dev) ((_dev)->rdma_type == QED_RDMA_TYPE_ROCE) diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index 3b9f12928314..e3f98fe2f1e6 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -76,7 +76,7 @@ static LIST_HEAD(usnic_ib_ibdev_list); static int usnic_ib_dump_vf_hdr(void *obj, char *buf, int buf_sz) { struct usnic_ib_vf *vf = obj; - return scnprintf(buf, buf_sz, "PF: %s ", vf->pf->ib_dev.name); + return scnprintf(buf, buf_sz, "PF: %s ", dev_name(&vf->pf->ib_dev.dev)); } /* End callback dump funcs */ @@ -138,7 +138,7 @@ static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, netdev = us_ibdev->netdev; switch (event) { case NETDEV_REBOOT: - usnic_info("PF Reset on %s\n", us_ibdev->ib_dev.name); + usnic_info("PF Reset on %s\n", dev_name(&us_ibdev->ib_dev.dev)); usnic_ib_qp_grp_modify_active_to_err(us_ibdev); ib_event.event = IB_EVENT_PORT_ERR; ib_event.device = &us_ibdev->ib_dev; @@ -151,7 +151,8 @@ static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, if (!us_ibdev->ufdev->link_up && netif_carrier_ok(netdev)) { usnic_fwd_carrier_up(us_ibdev->ufdev); - usnic_info("Link UP on %s\n", us_ibdev->ib_dev.name); + 
usnic_info("Link UP on %s\n", + dev_name(&us_ibdev->ib_dev.dev)); ib_event.event = IB_EVENT_PORT_ACTIVE; ib_event.device = &us_ibdev->ib_dev; ib_event.element.port_num = 1; @@ -159,7 +160,8 @@ static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, } else if (us_ibdev->ufdev->link_up && !netif_carrier_ok(netdev)) { usnic_fwd_carrier_down(us_ibdev->ufdev); - usnic_info("Link DOWN on %s\n", us_ibdev->ib_dev.name); + usnic_info("Link DOWN on %s\n", + dev_name(&us_ibdev->ib_dev.dev)); usnic_ib_qp_grp_modify_active_to_err(us_ibdev); ib_event.event = IB_EVENT_PORT_ERR; ib_event.device = &us_ibdev->ib_dev; @@ -168,17 +170,17 @@ static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, } else { usnic_dbg("Ignoring %s on %s\n", netdev_cmd_to_name(event), - us_ibdev->ib_dev.name); + dev_name(&us_ibdev->ib_dev.dev)); } break; case NETDEV_CHANGEADDR: if (!memcmp(us_ibdev->ufdev->mac, netdev->dev_addr, sizeof(us_ibdev->ufdev->mac))) { usnic_dbg("Ignoring addr change on %s\n", - us_ibdev->ib_dev.name); + dev_name(&us_ibdev->ib_dev.dev)); } else { usnic_info(" %s old mac: %pM new mac: %pM\n", - us_ibdev->ib_dev.name, + dev_name(&us_ibdev->ib_dev.dev), us_ibdev->ufdev->mac, netdev->dev_addr); usnic_fwd_set_mac(us_ibdev->ufdev, netdev->dev_addr); @@ -193,19 +195,19 @@ static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev, case NETDEV_CHANGEMTU: if (us_ibdev->ufdev->mtu != netdev->mtu) { usnic_info("MTU Change on %s old: %u new: %u\n", - us_ibdev->ib_dev.name, + dev_name(&us_ibdev->ib_dev.dev), us_ibdev->ufdev->mtu, netdev->mtu); usnic_fwd_set_mtu(us_ibdev->ufdev, netdev->mtu); usnic_ib_qp_grp_modify_active_to_err(us_ibdev); } else { usnic_dbg("Ignoring MTU change on %s\n", - us_ibdev->ib_dev.name); + dev_name(&us_ibdev->ib_dev.dev)); } break; default: usnic_dbg("Ignoring event %s on %s", netdev_cmd_to_name(event), - us_ibdev->ib_dev.name); + dev_name(&us_ibdev->ib_dev.dev)); } mutex_unlock(&us_ibdev->usdev_lock); } @@ -267,7 +269,7 @@ static int usnic_ib_handle_inet_event(struct usnic_ib_dev *us_ibdev, default: usnic_info("Ignoring event %s on %s", netdev_cmd_to_name(event), - us_ibdev->ib_dev.name); + dev_name(&us_ibdev->ib_dev.dev)); } mutex_unlock(&us_ibdev->usdev_lock); @@ -436,9 +438,9 @@ static void *usnic_ib_device_add(struct pci_dev *dev) kref_init(&us_ibdev->vf_cnt); usnic_info("Added ibdev: %s netdev: %s with mac %pM Link: %u MTU: %u\n", - us_ibdev->ib_dev.name, netdev_name(us_ibdev->netdev), - us_ibdev->ufdev->mac, us_ibdev->ufdev->link_up, - us_ibdev->ufdev->mtu); + dev_name(&us_ibdev->ib_dev.dev), + netdev_name(us_ibdev->netdev), us_ibdev->ufdev->mac, + us_ibdev->ufdev->link_up, us_ibdev->ufdev->mtu); return us_ibdev; err_fwd_dealloc: @@ -451,7 +453,7 @@ err_dealloc: static void usnic_ib_device_remove(struct usnic_ib_dev *us_ibdev) { - usnic_info("Unregistering %s\n", us_ibdev->ib_dev.name); + usnic_info("Unregistering %s\n", dev_name(&us_ibdev->ib_dev.dev)); usnic_ib_sysfs_unregister_usdev(us_ibdev); usnic_fwd_dev_free(us_ibdev->ufdev); ib_unregister_device(&us_ibdev->ib_dev); @@ -590,7 +592,7 @@ static int usnic_ib_pci_probe(struct pci_dev *pdev, mutex_unlock(&pf->usdev_lock); usnic_info("Registering usnic VF %s into PF %s\n", pci_name(pdev), - pf->ib_dev.name); + dev_name(&pf->ib_dev.dev)); usnic_ib_log_vf(vf); return 0; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c index 4210ca14014d..fab4cb780122 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c +++ 
b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c @@ -94,7 +94,7 @@ usnic_ib_show_config(struct device *device, struct device_attribute *attr, n = scnprintf(ptr, left, "%s: %s:%d.%d, %s, %pM, %u VFs\n Per VF:", - us_ibdev->ib_dev.name, + dev_name(&us_ibdev->ib_dev.dev), busname, PCI_SLOT(us_ibdev->pdev->devfn), PCI_FUNC(us_ibdev->pdev->devfn), @@ -119,7 +119,7 @@ usnic_ib_show_config(struct device *device, struct device_attribute *attr, UPDATE_PTR_LEFT(n, ptr, left); } else { n = scnprintf(ptr, left, "%s: no VFs\n", - us_ibdev->ib_dev.name); + dev_name(&us_ibdev->ib_dev.dev)); UPDATE_PTR_LEFT(n, ptr, left); } mutex_unlock(&us_ibdev->usdev_lock); @@ -285,7 +285,7 @@ int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev) usnic_class_attributes[i]); if (err) { usnic_err("Failed to create device file %d for %s eith err %d", - i, us_ibdev->ib_dev.name, err); + i, dev_name(&us_ibdev->ib_dev.dev), err); return -EINVAL; } } diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 9973ac893635..0b91ff36768a 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -159,7 +159,8 @@ static int usnic_ib_fill_create_qp_resp(struct usnic_ib_qp_grp *qp_grp, err = ib_copy_to_udata(udata, &resp, sizeof(resp)); if (err) { - usnic_err("Failed to copy udata for %s", us_ibdev->ib_dev.name); + usnic_err("Failed to copy udata for %s", + dev_name(&us_ibdev->ib_dev.dev)); return err; } @@ -197,7 +198,7 @@ find_free_vf_and_create_qp_grp(struct usnic_ib_dev *us_ibdev, vnic = vf->vnic; if (!usnic_vnic_check_room(vnic, res_spec)) { usnic_dbg("Found used vnic %s from %s\n", - us_ibdev->ib_dev.name, + dev_name(&us_ibdev->ib_dev.dev), pci_name(usnic_vnic_get_pdev( vnic))); qp_grp = usnic_ib_qp_grp_create(us_ibdev->ufdev, @@ -230,7 +231,8 @@ find_free_vf_and_create_qp_grp(struct usnic_ib_dev *us_ibdev, spin_unlock(&vf->lock); } - usnic_info("No free qp grp found on %s\n", us_ibdev->ib_dev.name); + usnic_info("No free qp grp found on %s\n", + dev_name(&us_ibdev->ib_dev.dev)); return ERR_PTR(-ENOMEM); qp_grp_check: @@ -471,7 +473,7 @@ struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev, } usnic_info("domain 0x%p allocated for context 0x%p and device %s\n", - pd, context, ibdev->name); + pd, context, dev_name(&ibdev->dev)); return &pd->ibpd; } @@ -508,20 +510,20 @@ struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd, err = ib_copy_from_udata(&cmd, udata, sizeof(cmd)); if (err) { usnic_err("%s: cannot copy udata for create_qp\n", - us_ibdev->ib_dev.name); + dev_name(&us_ibdev->ib_dev.dev)); return ERR_PTR(-EINVAL); } err = create_qp_validate_user_data(cmd); if (err) { usnic_err("%s: Failed to validate user data\n", - us_ibdev->ib_dev.name); + dev_name(&us_ibdev->ib_dev.dev)); return ERR_PTR(-EINVAL); } if (init_attr->qp_type != IB_QPT_UD) { usnic_err("%s asked to make a non-UD QP: %d\n", - us_ibdev->ib_dev.name, init_attr->qp_type); + dev_name(&us_ibdev->ib_dev.dev), init_attr->qp_type); return ERR_PTR(-EINVAL); } diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index 6878107fc637..c1e31985b11c 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -734,7 +734,7 @@ static void pvrdma_netdevice_event_handle(struct pvrdma_dev *dev, default: dev_dbg(&dev->pdev->dev, "ignore netdevice event %ld on %s\n", - event, dev->ib_dev.name); + event, dev_name(&dev->ib_dev.dev)); break; } } diff --git 
a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index 8e658ce439b4..fb06f94f33d8 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -72,7 +72,7 @@ struct rxe_dev *get_rxe_by_name(const char *name) spin_lock_bh(&dev_list_lock); list_for_each_entry(rxe, &rxe_dev_list, list) { - if (!strcmp(name, rxe->ib_dev.name)) { + if (!strcmp(name, dev_name(&rxe->ib_dev.dev))) { found = rxe; break; } From 6c8541118bd53bc90b6c2473e289e5541de80376 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 20 Sep 2018 16:42:27 -0600 Subject: [PATCH 143/242] RDMA/ulp: Use dev_name instead of ibdev->name These return the same thing but dev_name is a more conventional use of the kernel API. Signed-off-by: Jason Gunthorpe Reviewed-by: Steve Wise Reviewed-by: Sagi Grimberg Reviewed-by: Dennis Dalessandro --- drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 2 +- drivers/infiniband/ulp/iser/iser_verbs.c | 9 ++++--- drivers/infiniband/ulp/isert/ib_isert.c | 2 +- .../infiniband/ulp/opa_vnic/opa_vnic_vema.c | 3 ++- drivers/infiniband/ulp/srp/ib_srp.c | 10 ++++--- drivers/infiniband/ulp/srpt/ib_srpt.c | 26 ++++++++++--------- include/rdma/rdma_vt.h | 2 +- 7 files changed, 30 insertions(+), 24 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c index 9f36ca786df8..1e88213459f2 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c @@ -277,7 +277,7 @@ void ipoib_event(struct ib_event_handler *handler, return; ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event, - record->device->name, record->element.port_num); + dev_name(&record->device->dev), record->element.port_num); if (record->event == IB_EVENT_SM_CHANGE || record->event == IB_EVENT_CLIENT_REREGISTER) { diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index b686a4aaffe8..946b623ba5eb 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -55,7 +55,7 @@ static void iser_event_handler(struct ib_event_handler *handler, { iser_err("async event %s (%d) on device %s port %d\n", ib_event_msg(event->event), event->event, - event->device->name, event->element.port_num); + dev_name(&event->device->dev), event->element.port_num); } /** @@ -85,7 +85,7 @@ static int iser_create_device_ib_res(struct iser_device *device) max_cqe = min(ISER_MAX_CQ_LEN, ib_dev->attrs.max_cqe); iser_info("using %d CQs, device %s supports %d vectors max_cqe %d\n", - device->comps_used, ib_dev->name, + device->comps_used, dev_name(&ib_dev->dev), ib_dev->num_comp_vectors, max_cqe); device->pd = ib_alloc_pd(ib_dev, @@ -468,7 +468,8 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) iser_conn->max_cmds = ISER_GET_MAX_XMIT_CMDS(ib_dev->attrs.max_qp_wr); iser_dbg("device %s supports max_send_wr %d\n", - device->ib_device->name, ib_dev->attrs.max_qp_wr); + dev_name(&device->ib_device->dev), + ib_dev->attrs.max_qp_wr); } } @@ -764,7 +765,7 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id) IB_DEVICE_SIGNATURE_HANDOVER)) { iser_warn("T10-PI requested but not supported on %s, " "continue without T10-PI\n", - ib_conn->device->ib_device->name); + dev_name(&ib_conn->device->ib_device->dev)); ib_conn->pi_support = false; } else { ib_conn->pi_support = true; diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index f39670c5c25c..e3dd13798d79 100644 --- 
a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -262,7 +262,7 @@ isert_alloc_comps(struct isert_device *device) isert_info("Using %d CQs, %s supports %d vectors support " "pi_capable %d\n", - device->comps_used, device->ib_device->name, + device->comps_used, dev_name(&device->ib_device->dev), device->ib_device->num_comp_vectors, device->pi_capable); diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c index 15711dcc6f58..d119d9afa845 100644 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c @@ -888,7 +888,8 @@ static void opa_vnic_event(struct ib_event_handler *handler, return; c_dbg("OPA_VNIC received event %d on device %s port %d\n", - record->event, record->device->name, record->element.port_num); + record->event, dev_name(&record->device->dev), + record->element.port_num); if (record->event == IB_EVENT_PORT_ERR) idr_for_each(&port->vport_idr, vema_disable_vport, NULL); diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 444d16520506..e2ad7c5ea296 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -3124,7 +3124,8 @@ static ssize_t show_local_ib_device(struct device *dev, { struct srp_target_port *target = host_to_target(class_to_shost(dev)); - return sprintf(buf, "%s\n", target->srp_host->srp_dev->dev->name); + return sprintf(buf, "%s\n", + dev_name(&target->srp_host->srp_dev->dev->dev)); } static ssize_t show_ch_count(struct device *dev, struct device_attribute *attr, @@ -3987,7 +3988,7 @@ static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr, { struct srp_host *host = container_of(dev, struct srp_host, dev); - return sprintf(buf, "%s\n", host->srp_dev->dev->name); + return sprintf(buf, "%s\n", dev_name(&host->srp_dev->dev->dev)); } static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL); @@ -4019,7 +4020,8 @@ static struct srp_host *srp_add_port(struct srp_device *device, u8 port) host->dev.class = &srp_class; host->dev.parent = device->dev->dev.parent; - dev_set_name(&host->dev, "srp-%s-%d", device->dev->name, port); + dev_set_name(&host->dev, "srp-%s-%d", dev_name(&device->dev->dev), + port); if (device_register(&host->dev)) goto free_host; @@ -4095,7 +4097,7 @@ static void srp_add_one(struct ib_device *device) srp_dev->mr_max_size = srp_dev->mr_page_size * srp_dev->max_pages_per_mr; pr_debug("%s: mr_page_shift = %d, device->max_mr_size = %#llx, device->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n", - device->name, mr_page_shift, attr->max_mr_size, + dev_name(&device->dev), mr_page_shift, attr->max_mr_size, attr->max_fast_reg_page_list_len, srp_dev->max_pages_per_mr, srp_dev->mr_max_size); diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index 447d21ea479a..2357aa727dcf 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -148,7 +148,7 @@ static void srpt_event_handler(struct ib_event_handler *handler, return; pr_debug("ASYNC event= %d on device= %s\n", event->event, - sdev->device->name); + dev_name(&sdev->device->dev)); switch (event->event) { case IB_EVENT_PORT_ERR: @@ -1941,7 +1941,8 @@ static void __srpt_close_all_ch(struct srpt_port *sport) if (srpt_disconnect_ch(ch) >= 0) pr_info("Closing channel %s because target %s_%d has been disabled\n", ch->sess_name, - sport->sdev->device->name, sport->port); + 
dev_name(&sport->sdev->device->dev), + sport->port); srpt_close_ch(ch); } } @@ -2127,7 +2128,7 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev, if (!sport->enabled) { rej->reason = cpu_to_be32(SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); pr_info("rejected SRP_LOGIN_REQ because target port %s_%d has not yet been enabled\n", - sport->sdev->device->name, port_num); + dev_name(&sport->sdev->device->dev), port_num); goto reject; } @@ -2267,7 +2268,7 @@ static int srpt_cm_req_recv(struct srpt_device *const sdev, rej->reason = cpu_to_be32( SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES); pr_info("rejected SRP_LOGIN_REQ because target %s_%d is not enabled\n", - sdev->device->name, port_num); + dev_name(&sdev->device->dev), port_num); mutex_unlock(&sport->mutex); goto reject; } @@ -2842,7 +2843,7 @@ static int srpt_release_sport(struct srpt_port *sport) while (wait_event_timeout(sport->ch_releaseQ, srpt_ch_list_empty(sport), 5 * HZ) <= 0) { pr_info("%s_%d: waiting for session unregistration ...\n", - sport->sdev->device->name, sport->port); + dev_name(&sport->sdev->device->dev), sport->port); rcu_read_lock(); list_for_each_entry(nexus, &sport->nexus_list, entry) { list_for_each_entry(ch, &nexus->ch_list, list) { @@ -2932,7 +2933,7 @@ static int srpt_alloc_srq(struct srpt_device *sdev) } pr_debug("create SRQ #wr= %d max_allow=%d dev= %s\n", sdev->srq_size, - sdev->device->attrs.max_srq_wr, device->name); + sdev->device->attrs.max_srq_wr, dev_name(&device->dev)); sdev->ioctx_ring = (struct srpt_recv_ioctx **) srpt_alloc_ioctx_ring(sdev, sdev->srq_size, @@ -2965,8 +2966,8 @@ static int srpt_use_srq(struct srpt_device *sdev, bool use_srq) } else if (use_srq && !sdev->srq) { ret = srpt_alloc_srq(sdev); } - pr_debug("%s(%s): use_srq = %d; ret = %d\n", __func__, device->name, - sdev->use_srq, ret); + pr_debug("%s(%s): use_srq = %d; ret = %d\n", __func__, + dev_name(&device->dev), sdev->use_srq, ret); return ret; } @@ -3052,7 +3053,7 @@ static void srpt_add_one(struct ib_device *device) if (srpt_refresh_port(sport)) { pr_err("MAD registration failed for %s-%d.\n", - sdev->device->name, i); + dev_name(&sdev->device->dev), i); goto err_event; } } @@ -3063,7 +3064,7 @@ static void srpt_add_one(struct ib_device *device) out: ib_set_client_data(device, &srpt_client, sdev); - pr_debug("added %s.\n", device->name); + pr_debug("added %s.\n", dev_name(&device->dev)); return; err_event: @@ -3078,7 +3079,7 @@ free_dev: kfree(sdev); err: sdev = NULL; - pr_info("%s(%s) failed.\n", __func__, device->name); + pr_info("%s(%s) failed.\n", __func__, dev_name(&device->dev)); goto out; } @@ -3093,7 +3094,8 @@ static void srpt_remove_one(struct ib_device *device, void *client_data) int i; if (!sdev) { - pr_info("%s(%s): nothing to do.\n", __func__, device->name); + pr_info("%s(%s): nothing to do.\n", __func__, + dev_name(&device->dev)); return; } diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 065c9fbe6589..cc08de3570b4 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -447,7 +447,7 @@ static inline void rvt_set_ibdev_name(struct rvt_dev_info *rdi, */ static inline const char *rvt_get_ibdev_name(const struct rvt_dev_info *rdi) { - return rdi->ibdev.name; + return dev_name(&rdi->ibdev.dev); } static inline struct rvt_pd *ibpd_to_rvtpd(struct ib_pd *ibpd) From b9c1ea40e8bbd2ce8bf975ec9cf42817021cadb6 Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Sat, 22 Sep 2018 16:21:05 +0800 Subject: [PATCH 144/242] RDMA/hns: Refactor the code for setting the transport opcode Currently the transport opcodes which 
come from user configuration are set by similar code. This patch simplifies it. Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 66 ++++++---------------- 1 file changed, 18 insertions(+), 48 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 7ccf3774eb71..903ed66e6b30 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -191,6 +191,7 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, int attr_mask; u32 tmp_len; int ret = 0; + u32 hr_op; u8 *smac; int nreq; int i; @@ -408,91 +409,60 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, switch (wr->opcode) { case IB_WR_RDMA_READ: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_RDMA_READ); + hr_op = HNS_ROCE_V2_WQE_OP_RDMA_READ; rc_sq_wqe->rkey = cpu_to_le32(rdma_wr(wr)->rkey); rc_sq_wqe->va = cpu_to_le64(rdma_wr(wr)->remote_addr); break; case IB_WR_RDMA_WRITE: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_RDMA_WRITE); + hr_op = HNS_ROCE_V2_WQE_OP_RDMA_WRITE; rc_sq_wqe->rkey = cpu_to_le32(rdma_wr(wr)->rkey); rc_sq_wqe->va = cpu_to_le64(rdma_wr(wr)->remote_addr); break; case IB_WR_RDMA_WRITE_WITH_IMM: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_RDMA_WRITE_WITH_IMM); + hr_op = HNS_ROCE_V2_WQE_OP_RDMA_WRITE_WITH_IMM; rc_sq_wqe->rkey = cpu_to_le32(rdma_wr(wr)->rkey); rc_sq_wqe->va = cpu_to_le64(rdma_wr(wr)->remote_addr); break; case IB_WR_SEND: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_SEND); + hr_op = HNS_ROCE_V2_WQE_OP_SEND; break; case IB_WR_SEND_WITH_INV: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_SEND_WITH_INV); + hr_op = HNS_ROCE_V2_WQE_OP_SEND_WITH_INV; break; case IB_WR_SEND_WITH_IMM: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM); + hr_op = HNS_ROCE_V2_WQE_OP_SEND_WITH_IMM; break; case IB_WR_LOCAL_INV: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_LOCAL_INV); + hr_op = HNS_ROCE_V2_WQE_OP_LOCAL_INV; break; case IB_WR_ATOMIC_CMP_AND_SWP: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP); + hr_op = HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP; break; case IB_WR_ATOMIC_FETCH_AND_ADD: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD); + hr_op = HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD; break; case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP); + hr_op = + HNS_ROCE_V2_WQE_OP_ATOM_MSK_CMP_AND_SWAP; break; case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD: - roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD); + hr_op = + HNS_ROCE_V2_WQE_OP_ATOM_MSK_FETCH_AND_ADD; break; default: - 
roce_set_field(rc_sq_wqe->byte_4, - V2_RC_SEND_WQE_BYTE_4_OPCODE_M, - V2_RC_SEND_WQE_BYTE_4_OPCODE_S, - HNS_ROCE_V2_WQE_OP_MASK); + hr_op = HNS_ROCE_V2_WQE_OP_MASK; break; } + roce_set_field(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_OPCODE_M, + V2_RC_SEND_WQE_BYTE_4_OPCODE_S, hr_op); wqe += sizeof(struct hns_roce_v2_rc_send_wqe); ret = set_rwqe_data_seg(ibqp, wr, rc_sq_wqe, wqe, From 384f881851127dd834a2733f91999b859a5ffddb Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Sat, 22 Sep 2018 16:21:06 +0800 Subject: [PATCH 145/242] RDMA/hns: Add atomic support This patch adds atomic operations for hip08, including the fetchadd and cmpswap operations. In order to enable atomic, the driver needs to do the following steps: 1. Enable the atomic caps for RoCE device 2. Post the wqe context of atomic type 3. Configure the atomic type of mtpt Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 1 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 39 +++++++++++++++++++-- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 5 +++ drivers/infiniband/hw/hns/hns_roce_main.c | 3 +- 4 files changed, 45 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index d443e26b67d1..7a2c0ca793f2 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -193,6 +193,7 @@ enum { HNS_ROCE_CAP_FLAG_RQ_INLINE = BIT(2), HNS_ROCE_CAP_FLAG_RECORD_DB = BIT(3), HNS_ROCE_CAP_FLAG_SQ_RECORD_DB = BIT(4), + HNS_ROCE_CAP_FLAG_ATOMIC = BIT(10), }; enum hns_roce_mtt_type { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 903ed66e6b30..97ff5dae9e95 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -54,6 +54,18 @@ static void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg, dseg->len = cpu_to_le32(sg->length); } +static void set_atomic_seg(struct hns_roce_wqe_atomic_seg *aseg, + const struct ib_atomic_wr *wr) +{ + if (wr->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) { + aseg->fetchadd_swap_data = cpu_to_le64(wr->swap); + aseg->cmp_data = cpu_to_le64(wr->compare_add); + } else { + aseg->fetchadd_swap_data = cpu_to_le64(wr->compare_add); + aseg->cmp_data = 0; + } +} + static void set_extend_sge(struct hns_roce_qp *qp, const struct ib_send_wr *wr, unsigned int *sge_ind) { @@ -179,6 +191,7 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct hns_roce_v2_ud_send_wqe *ud_sq_wqe; struct hns_roce_v2_rc_send_wqe *rc_sq_wqe; struct hns_roce_qp *qp = to_hr_qp(ibqp); + struct hns_roce_v2_wqe_data_seg *dseg; struct device *dev = hr_dev->dev; struct hns_roce_v2_db sq_db; struct ib_qp_attr attr; @@ -407,6 +420,7 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, roce_set_bit(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OWNER_S, owner_bit); + wqe += sizeof(struct hns_roce_v2_rc_send_wqe); switch (wr->opcode) { case IB_WR_RDMA_READ: hr_op = HNS_ROCE_V2_WQE_OP_RDMA_READ; @@ -443,9 +457,21 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, break; case IB_WR_ATOMIC_CMP_AND_SWP: hr_op = HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP; + rc_sq_wqe->rkey = + cpu_to_le32(atomic_wr(wr)->rkey); + rc_sq_wqe->va = + cpu_to_le64(atomic_wr(wr)->remote_addr); + wqe += sizeof(struct hns_roce_v2_wqe_data_seg); + set_atomic_seg(wqe, atomic_wr(wr)); break; case IB_WR_ATOMIC_FETCH_AND_ADD: hr_op = HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD; + rc_sq_wqe->rkey = + cpu_to_le32(atomic_wr(wr)->rkey); 
+ rc_sq_wqe->va = + cpu_to_le64(atomic_wr(wr)->remote_addr); + wqe += sizeof(struct hns_roce_v2_wqe_data_seg); + set_atomic_seg(wqe, atomic_wr(wr)); break; case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: hr_op = @@ -463,7 +489,12 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, roce_set_field(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OPCODE_M, V2_RC_SEND_WQE_BYTE_4_OPCODE_S, hr_op); - wqe += sizeof(struct hns_roce_v2_rc_send_wqe); + if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) + dseg = + wqe - sizeof(struct hns_roce_v2_wqe_data_seg); + else + dseg = wqe; ret = set_rwqe_data_seg(ibqp, wr, rc_sq_wqe, wqe, &sge_ind, bad_wr); @@ -1232,6 +1263,9 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) caps->local_ca_ack_delay = 0; caps->max_mtu = IB_MTU_4096; + if (hr_dev->pci_dev->revision == 0x21) + caps->flags |= HNS_ROCE_CAP_FLAG_ATOMIC; + ret = hns_roce_v2_set_bt(hr_dev); if (ret) dev_err(hr_dev->dev, "Configure bt attribute fail, ret = %d.\n", @@ -1663,7 +1697,8 @@ static int hns_roce_v2_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 0); roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_BIND_EN_S, (mr->access & IB_ACCESS_MW_BIND ? 1 : 0)); - roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_ATOMIC_EN_S, 0); + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_ATOMIC_EN_S, + mr->access & IB_ACCESS_REMOTE_ATOMIC ? 1 : 0); roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RR_EN_S, (mr->access & IB_ACCESS_REMOTE_READ ? 1 : 0)); roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RW_EN_S, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 14aa308befef..6f92722baf32 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -1564,4 +1564,9 @@ struct hns_roce_eq_context { #define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_S 0 #define HNS_ROCE_V2_AEQE_EVENT_QUEUE_NUM_M GENMASK(23, 0) +struct hns_roce_wqe_atomic_seg { + __le64 fetchadd_swap_data; + __le64 cmp_data; +}; + #endif diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 5a86a48cba13..202408874c54 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -215,7 +215,8 @@ static int hns_roce_query_device(struct ib_device *ib_dev, props->max_pd = hr_dev->caps.num_pds; props->max_qp_rd_atom = hr_dev->caps.max_qp_dest_rdma; props->max_qp_init_rd_atom = hr_dev->caps.max_qp_init_rdma; - props->atomic_cap = IB_ATOMIC_NONE; + props->atomic_cap = hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_ATOMIC ? + IB_ATOMIC_HCA : IB_ATOMIC_NONE; props->max_pkeys = 1; props->local_ca_ack_delay = hr_dev->caps.local_ca_ack_delay; From 944e64093a63bfab0a4f7ecebcd434630dff9107 Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Sat, 22 Sep 2018 16:21:07 +0800 Subject: [PATCH 146/242] RDMA/hns: Add CM of vlan device support This patch mainly sets the vlan_id field in the WC for rdma_listen() to work over vlan. This is required by ib_init_ah_attr_from_wc() which is called by the CM REQ handler. 
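The decode reduces to the following sketch (cqe_vid_valid() and cqe_vid() are hypothetical accessors standing in for the roce_get_bit()/roce_get_field() calls in the diff below):

	if (cqe_vid_valid(cqe))
		wc->vlan_id = cqe_vid(cqe);	/* 12-bit VID from the CQE */
	else
		wc->vlan_id = 0xffff;		/* no VLAN tag present */

	wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC);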
Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 9 ++++++++- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 5 +++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 97ff5dae9e95..a781f6b49782 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2292,7 +2292,14 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, wc->smac[5] = roce_get_field(cqe->byte_28, V2_CQE_BYTE_28_SMAC_5_M, V2_CQE_BYTE_28_SMAC_5_S); - wc->vlan_id = 0xffff; + if (roce_get_bit(cqe->byte_28, V2_CQE_BYTE_28_VID_VLD_S)) { + wc->vlan_id = (u16)roce_get_field(cqe->byte_28, + V2_CQE_BYTE_28_VID_M, + V2_CQE_BYTE_28_VID_S); + } else { + wc->vlan_id = 0xffff; + } + wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC); wc->network_hdr_type = roce_get_field(cqe->byte_28, V2_CQE_BYTE_28_PORT_TYPE_M, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 6f92722baf32..d04be1cef453 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -818,6 +818,11 @@ struct hns_roce_v2_cqe { #define V2_CQE_BYTE_28_PORT_TYPE_S 16 #define V2_CQE_BYTE_28_PORT_TYPE_M GENMASK(17, 16) +#define V2_CQE_BYTE_28_VID_S 18 +#define V2_CQE_BYTE_28_VID_M GENMASK(29, 18) + +#define V2_CQE_BYTE_28_VID_VLD_S 30 + #define V2_CQE_BYTE_32_RMT_QPN_S 0 #define V2_CQE_BYTE_32_RMT_QPN_M GENMASK(23, 0) From 8320deb88c03a842f8c2db92e2b4a86d2bb6df76 Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Sat, 22 Sep 2018 16:21:08 +0800 Subject: [PATCH 147/242] RDMA/hns: Add enable judgement for UD vlan According to the hardware modification, the hardware uses the ud_vlan_en field of the UD wqe to determine whether to add a vlan header to the UD packet. The ud_vlan_en field is filled in by the driver according to the net device. 
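The driver-side decision reduces to checking the GID's net device, roughly as follows (ndev here stands for gid_attr->ndev; the helpers come from <linux/if_vlan.h>):

	u16 vlan_tag = 0xffff;
	bool vlan_en = false;

	if (is_vlan_dev(ndev)) {	/* GID is bound to a VLAN device */
		vlan_tag = vlan_dev_vlan_id(ndev);
		vlan_en = true;		/* hardware inserts the header */
	}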
Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_ah.c | 6 +++++- drivers/infiniband/hw/hns/hns_roce_device.h | 1 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 3 +++ drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 2 ++ 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_ah.c b/drivers/infiniband/hw/hns/hns_roce_ah.c index 0d96c5bb38cd..9990dc9eb96a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_ah.c +++ b/drivers/infiniband/hw/hns/hns_roce_ah.c @@ -49,6 +49,7 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd, struct hns_roce_ah *ah; u16 vlan_tag = 0xffff; const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr); + bool vlan_en = false; ah = kzalloc(sizeof(*ah), GFP_ATOMIC); if (!ah) @@ -58,8 +59,10 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd, memcpy(ah->av.mac, ah_attr->roce.dmac, ETH_ALEN); gid_attr = ah_attr->grh.sgid_attr; - if (is_vlan_dev(gid_attr->ndev)) + if (is_vlan_dev(gid_attr->ndev)) { vlan_tag = vlan_dev_vlan_id(gid_attr->ndev); + vlan_en = true; + } if (vlan_tag < 0x1000) vlan_tag |= (rdma_ah_get_sl(ah_attr) & @@ -71,6 +74,7 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd, HNS_ROCE_PORT_NUM_SHIFT)); ah->av.gid_index = grh->sgid_index; ah->av.vlan = cpu_to_le16(vlan_tag); + ah->av.vlan_en = vlan_en; dev_dbg(dev, "gid_index = 0x%x,vlan = 0x%x\n", ah->av.gid_index, ah->av.vlan); diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 7a2c0ca793f2..3b06bd03b852 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -450,6 +450,7 @@ struct hns_roce_av { u8 dgid[HNS_ROCE_GID_SIZE]; u8 mac[6]; __le16 vlan; + bool vlan_en; }; struct hns_roce_ah { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index a781f6b49782..3b52d56d56f6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -370,6 +370,9 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, V2_UD_SEND_WQE_BYTE_40_PORTN_S, qp->port); + roce_set_bit(ud_sq_wqe->byte_40, + V2_UD_SEND_WQE_BYTE_40_UD_VLAN_EN_S, + ah->av.vlan_en ? 1 : 0); roce_set_field(ud_sq_wqe->byte_48, V2_UD_SEND_WQE_BYTE_48_SGID_INDX_M, V2_UD_SEND_WQE_BYTE_48_SGID_INDX_S, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index d04be1cef453..7ee6ed276fa6 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -993,6 +993,8 @@ struct hns_roce_v2_ud_send_wqe { #define V2_UD_SEND_WQE_BYTE_40_PORTN_S 24 #define V2_UD_SEND_WQE_BYTE_40_PORTN_M GENMASK(26, 24) +#define V2_UD_SEND_WQE_BYTE_40_UD_VLAN_EN_S 30 + #define V2_UD_SEND_WQE_BYTE_40_LBI_S 31 #define V2_UD_SEND_WQE_DMAC_0_S 0 From c7c28191408bf33c1d9c83de1d5b91f58f1ddaf1 Mon Sep 17 00:00:00 2001 From: Yixian Liu Date: Sun, 23 Sep 2018 17:20:46 +0800 Subject: [PATCH 148/242] RDMA/hns: Add MW support for hip08 This patch adds memory window (mw) support in the kernel space. 
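A sketch of how a kernel consumer would exercise this once the device advertises the capability (assuming the in-kernel ib_alloc_mw()/ib_dealloc_mw() helpers of this era; pd is an existing protection domain):

	struct ib_mw *mw;

	mw = ib_alloc_mw(pd, IB_MW_TYPE_1);
	if (IS_ERR(mw))
		return PTR_ERR(mw);

	/* ... bind the window to a registered MR and hand mw->rkey to
	 * the remote peer for RDMA access ...
	 */

	ib_dealloc_mw(mw);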
Signed-off-by: Yixian Liu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 21 ++++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 45 ++++++++ drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 10 ++ drivers/infiniband/hw/hns/hns_roce_main.c | 9 ++ drivers/infiniband/hw/hns/hns_roce_mr.c | 120 ++++++++++++++++++++ 5 files changed, 205 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 3b06bd03b852..ffa92550dbfa 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -193,6 +193,7 @@ enum { HNS_ROCE_CAP_FLAG_RQ_INLINE = BIT(2), HNS_ROCE_CAP_FLAG_RECORD_DB = BIT(3), HNS_ROCE_CAP_FLAG_SQ_RECORD_DB = BIT(4), + HNS_ROCE_CAP_FLAG_MW = BIT(7), HNS_ROCE_CAP_FLAG_ATOMIC = BIT(10), }; @@ -286,6 +287,16 @@ struct hns_roce_mtt { enum hns_roce_mtt_type mtt_type; }; +struct hns_roce_mw { + struct ib_mw ibmw; + u32 pdn; + u32 rkey; + int enabled; /* MW's active status */ + u32 pbl_hop_num; + u32 pbl_ba_pg_sz; + u32 pbl_buf_pg_sz; +}; + /* Only support 4K page size for mr register */ #define MR_SIZE_4K 0 @@ -759,6 +770,7 @@ struct hns_roce_hw { struct hns_roce_mr *mr, int flags, u32 pdn, int mr_access_flags, u64 iova, u64 size, void *mb_buf); + int (*mw_write_mtpt)(void *mb_buf, struct hns_roce_mw *mw); void (*write_cqc)(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, void *mb_buf, u64 *mtts, dma_addr_t dma_handle, int nent, u32 vector); @@ -858,6 +870,11 @@ static inline struct hns_roce_mr *to_hr_mr(struct ib_mr *ibmr) return container_of(ibmr, struct hns_roce_mr, ibmr); } +static inline struct hns_roce_mw *to_hr_mw(struct ib_mw *ibmw) +{ + return container_of(ibmw, struct hns_roce_mw, ibmw); +} + static inline struct hns_roce_qp *to_hr_qp(struct ib_qp *ibqp) { return container_of(ibqp, struct hns_roce_qp, ibqp); @@ -969,6 +986,10 @@ int hns_roce_hw2sw_mpt(struct hns_roce_dev *hr_dev, unsigned long mpt_index); unsigned long key_to_hw_index(u32 key); +struct ib_mw *hns_roce_alloc_mw(struct ib_pd *pd, enum ib_mw_type, + struct ib_udata *udata); +int hns_roce_dealloc_mw(struct ib_mw *ibmw); + void hns_roce_buf_free(struct hns_roce_dev *hr_dev, u32 size, struct hns_roce_buf *buf); int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 3b52d56d56f6..0d7568ee0d64 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1259,6 +1259,10 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) HNS_ROCE_CAP_FLAG_RQ_INLINE | HNS_ROCE_CAP_FLAG_RECORD_DB | HNS_ROCE_CAP_FLAG_SQ_RECORD_DB; + + if (hr_dev->pci_dev->revision == 0x21) + caps->flags |= HNS_ROCE_CAP_FLAG_MW; + caps->pkey_table_len[0] = 1; caps->gid_table_len[0] = HNS_ROCE_V2_GID_INDEX_NUM; caps->ceqe_depth = HNS_ROCE_V2_COMP_EQE_NUM; @@ -1825,6 +1829,46 @@ static int hns_roce_v2_rereg_write_mtpt(struct hns_roce_dev *hr_dev, return 0; } +static int hns_roce_v2_mw_write_mtpt(void *mb_buf, struct hns_roce_mw *mw) +{ + struct hns_roce_v2_mpt_entry *mpt_entry; + + mpt_entry = mb_buf; + memset(mpt_entry, 0, sizeof(*mpt_entry)); + + roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_MPT_ST_M, + V2_MPT_BYTE_4_MPT_ST_S, V2_MPT_ST_FREE); + roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M, + V2_MPT_BYTE_4_PD_S, mw->pdn); + roce_set_field(mpt_entry->byte_4_pd_hop_st, + V2_MPT_BYTE_4_PBL_HOP_NUM_M, + 
V2_MPT_BYTE_4_PBL_HOP_NUM_S, + mw->pbl_hop_num == HNS_ROCE_HOP_NUM_0 ? + 0 : mw->pbl_hop_num); + roce_set_field(mpt_entry->byte_4_pd_hop_st, + V2_MPT_BYTE_4_PBL_BA_PG_SZ_M, + V2_MPT_BYTE_4_PBL_BA_PG_SZ_S, + mw->pbl_ba_pg_sz + PG_SHIFT_OFFSET); + + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_R_INV_EN_S, 1); + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 1); + + roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_PA_S, 0); + roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_MR_MW_S, 1); + roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_BPD_S, 1); + roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_BQP_S, + mw->ibmw.type == IB_MW_TYPE_1 ? 0 : 1); + + roce_set_field(mpt_entry->byte_64_buf_pa1, + V2_MPT_BYTE_64_PBL_BUF_PG_SZ_M, + V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S, + mw->pbl_buf_pg_sz + PG_SHIFT_OFFSET); + + mpt_entry->lkey = cpu_to_le32(mw->rkey); + + return 0; +} + static void *get_cqe_v2(struct hns_roce_cq *hr_cq, int n) { return hns_roce_buf_offset(&hr_cq->hr_buf.hr_buf, @@ -5175,6 +5219,7 @@ static const struct hns_roce_hw hns_roce_hw_v2 = { .set_mac = hns_roce_v2_set_mac, .write_mtpt = hns_roce_v2_write_mtpt, .rereg_write_mtpt = hns_roce_v2_rereg_write_mtpt, + .mw_write_mtpt = hns_roce_v2_mw_write_mtpt, .write_cqc = hns_roce_v2_write_cqc, .set_hem = hns_roce_v2_set_hem, .clear_hem = hns_roce_v2_clear_hem, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 7ee6ed276fa6..712542657c64 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -324,6 +324,7 @@ struct hns_roce_v2_cq_context { enum{ V2_MPT_ST_VALID = 0x1, + V2_MPT_ST_FREE = 0x2, }; enum hns_roce_v2_qp_state { @@ -883,8 +884,17 @@ struct hns_roce_v2_mpt_entry { #define V2_MPT_BYTE_8_LW_EN_S 7 +#define V2_MPT_BYTE_8_MW_CNT_S 8 +#define V2_MPT_BYTE_8_MW_CNT_M GENMASK(31, 8) + #define V2_MPT_BYTE_12_PA_S 1 +#define V2_MPT_BYTE_12_MR_MW_S 4 + +#define V2_MPT_BYTE_12_BPD_S 5 + +#define V2_MPT_BYTE_12_BQP_S 6 + #define V2_MPT_BYTE_12_INNER_PA_VLD_S 7 #define V2_MPT_BYTE_12_MW_BIND_QPN_S 8 diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 202408874c54..8c5160ec3a4d 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -525,6 +525,15 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) ib_dev->uverbs_cmd_mask |= (1ULL << IB_USER_VERBS_CMD_REREG_MR); } + /* MW */ + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_MW) { + ib_dev->alloc_mw = hns_roce_alloc_mw; + ib_dev->dealloc_mw = hns_roce_dealloc_mw; + ib_dev->uverbs_cmd_mask |= + (1ULL << IB_USER_VERBS_CMD_ALLOC_MW) | + (1ULL << IB_USER_VERBS_CMD_DEALLOC_MW); + } + /* OTHERS */ ib_dev->get_port_immutable = hns_roce_port_immutable; ib_dev->disassociate_ucontext = hns_roce_disassociate_ucontext; diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index eb26a5f6fc58..0613c117a442 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -1201,3 +1201,123 @@ int hns_roce_dereg_mr(struct ib_mr *ibmr) return ret; } + +static void hns_roce_mw_free(struct hns_roce_dev *hr_dev, + struct hns_roce_mw *mw) +{ + struct device *dev = hr_dev->dev; + int ret; + + if (mw->enabled) { + ret = hns_roce_hw2sw_mpt(hr_dev, NULL, key_to_hw_index(mw->rkey) + & (hr_dev->caps.num_mtpts - 1)); + if (ret) + dev_warn(dev, "MW HW2SW_MPT failed (%d)\n", ret); + + 
hns_roce_table_put(hr_dev, &hr_dev->mr_table.mtpt_table,
+				   key_to_hw_index(mw->rkey));
+	}
+
+	hns_roce_bitmap_free(&hr_dev->mr_table.mtpt_bitmap,
+			     key_to_hw_index(mw->rkey), BITMAP_NO_RR);
+}
+
+static int hns_roce_mw_enable(struct hns_roce_dev *hr_dev,
+			      struct hns_roce_mw *mw)
+{
+	struct hns_roce_mr_table *mr_table = &hr_dev->mr_table;
+	struct hns_roce_cmd_mailbox *mailbox;
+	struct device *dev = hr_dev->dev;
+	unsigned long mtpt_idx = key_to_hw_index(mw->rkey);
+	int ret;
+
+	/* prepare HEM entry memory */
+	ret = hns_roce_table_get(hr_dev, &mr_table->mtpt_table, mtpt_idx);
+	if (ret)
+		return ret;
+
+	mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+	if (IS_ERR(mailbox)) {
+		ret = PTR_ERR(mailbox);
+		goto err_table;
+	}
+
+	ret = hr_dev->hw->mw_write_mtpt(mailbox->buf, mw);
+	if (ret) {
+		dev_err(dev, "MW write mtpt fail!\n");
+		goto err_page;
+	}
+
+	ret = hns_roce_sw2hw_mpt(hr_dev, mailbox,
+				 mtpt_idx & (hr_dev->caps.num_mtpts - 1));
+	if (ret) {
+		dev_err(dev, "MW sw2hw_mpt failed (%d)\n", ret);
+		goto err_page;
+	}
+
+	mw->enabled = 1;
+
+	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+
+	return 0;
+
+err_page:
+	hns_roce_free_cmd_mailbox(hr_dev, mailbox);
+
+err_table:
+	hns_roce_table_put(hr_dev, &mr_table->mtpt_table, mtpt_idx);
+
+	return ret;
+}
+
+struct ib_mw *hns_roce_alloc_mw(struct ib_pd *ib_pd, enum ib_mw_type type,
+				struct ib_udata *udata)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ib_pd->device);
+	struct hns_roce_mw *mw;
+	unsigned long index = 0;
+	int ret;
+
+	mw = kmalloc(sizeof(*mw), GFP_KERNEL);
+	if (!mw)
+		return ERR_PTR(-ENOMEM);
+
+	/* Allocate a key for mw from bitmap */
+	ret = hns_roce_bitmap_alloc(&hr_dev->mr_table.mtpt_bitmap, &index);
+	if (ret)
+		goto err_bitmap;
+
+	mw->rkey = hw_index_to_key(index);
+
+	mw->ibmw.rkey = mw->rkey;
+	mw->ibmw.type = type;
+	mw->pdn = to_hr_pd(ib_pd)->pdn;
+	mw->pbl_hop_num = hr_dev->caps.pbl_hop_num;
+	mw->pbl_ba_pg_sz = hr_dev->caps.pbl_ba_pg_sz;
+	mw->pbl_buf_pg_sz = hr_dev->caps.pbl_buf_pg_sz;
+
+	ret = hns_roce_mw_enable(hr_dev, mw);
+	if (ret)
+		goto err_mw;
+
+	return &mw->ibmw;
+
+err_mw:
+	hns_roce_mw_free(hr_dev, mw);
+
+err_bitmap:
+	kfree(mw);
+
+	return ERR_PTR(ret);
+}
+
+int hns_roce_dealloc_mw(struct ib_mw *ibmw)
+{
+	struct hns_roce_dev *hr_dev = to_hr_dev(ibmw->device);
+	struct hns_roce_mw *mw = to_hr_mw(ibmw);
+
+	hns_roce_mw_free(hr_dev, mw);
+	kfree(mw);
+
+	return 0;
+}

From 6e68c899e664f139cf72997dd1e3d8ebf34903bb Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Wed, 26 Sep 2018 13:26:08 +0100
Subject: [PATCH 149/242] IB/mthca: remove redundant inner check of
 mdev->mthca_flags

The inner check for mdev->mthca_flags & MTHCA_FLAG_MSI_X is redundant
as this is already true because of the previous identical check in an
outer if statement. Remove it.

Detected by cppcheck:
(warning) Identical inner 'if' condition is always true.
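A stripped-down reproduction of the pattern cppcheck flags (hypothetical names, for illustration only):

#include <errno.h>
#include <stdbool.h>

static void free_vectors(void) { }

static void example(int err, bool msi_x)
{
	if (err == -EBUSY && msi_x) {
		if (msi_x)	/* identical inner 'if': always true here */
			free_vectors();
	}
}

Hoisting the call out of the inner if, as the patch does below, leaves the behavior unchanged.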
Signed-off-by: Colin Ian King Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mthca/mthca_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index f3e80dec1334..f99c7e0b234c 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -1014,8 +1014,7 @@ static int __mthca_init_one(struct pci_dev *pdev, int hca_type) err = mthca_setup_hca(mdev); if (err == -EBUSY && (mdev->mthca_flags & MTHCA_FLAG_MSI_X)) { - if (mdev->mthca_flags & MTHCA_FLAG_MSI_X) - pci_free_irq_vectors(pdev); + pci_free_irq_vectors(pdev); mdev->mthca_flags &= ~MTHCA_FLAG_MSI_X; err = mthca_setup_hca(mdev); From e04951ebeefbad29455a6218187fe01e0f05b026 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Wed, 26 Sep 2018 10:02:22 -0700 Subject: [PATCH 150/242] IB/hfi1: Move UnsupportedVL bits definitions to the correct header The UnsupportedVL SendCtrl register bit information is defined in the module rather than the chip register header file. Move the defines to the appropriate header file. Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/chip_registers.h | 4 ++++ drivers/infiniband/hw/hfi1/pio.c | 8 -------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/chip_registers.h b/drivers/infiniband/hw/hfi1/chip_registers.h index ee6dca5e2a2f..c6163a347e93 100644 --- a/drivers/infiniband/hw/hfi1/chip_registers.h +++ b/drivers/infiniband/hw/hfi1/chip_registers.h @@ -878,6 +878,10 @@ #define SEND_CTRL (TXE + 0x000000000000) #define SEND_CTRL_CM_RESET_SMASK 0x4ull #define SEND_CTRL_SEND_ENABLE_SMASK 0x1ull +#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3 +#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xFFull +#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \ + << SEND_CTRL_UNSUPPORTED_VL_SHIFT) #define SEND_CTRL_VL_ARBITER_ENABLE_SMASK 0x2ull #define SEND_CTXT_CHECK_ENABLE (TXE + 0x000000100080) #define SEND_CTXT_CHECK_ENABLE_CHECK_BYPASS_VL_MAPPING_SMASK 0x80ull diff --git a/drivers/infiniband/hw/hfi1/pio.c b/drivers/infiniband/hw/hfi1/pio.c index c2c1cba5b23b..7b3b2350f078 100644 --- a/drivers/infiniband/hw/hfi1/pio.c +++ b/drivers/infiniband/hw/hfi1/pio.c @@ -71,14 +71,6 @@ void __cm_reset(struct hfi1_devdata *dd, u64 sendctrl) } } -/* defined in header release 48 and higher */ -#ifndef SEND_CTRL_UNSUPPORTED_VL_SHIFT -#define SEND_CTRL_UNSUPPORTED_VL_SHIFT 3 -#define SEND_CTRL_UNSUPPORTED_VL_MASK 0xffull -#define SEND_CTRL_UNSUPPORTED_VL_SMASK (SEND_CTRL_UNSUPPORTED_VL_MASK \ - << SEND_CTRL_UNSUPPORTED_VL_SHIFT) -#endif - /* global control of PIO send */ void pio_send_control(struct hfi1_devdata *dd, int op) { From c8b53d0c5eb89a5831b7a25f4bd5e742a85c293b Mon Sep 17 00:00:00 2001 From: Alex Estrin Date: Wed, 26 Sep 2018 10:02:32 -0700 Subject: [PATCH 151/242] IB/sa: simplify return code logic for ib_nl_send_msg() rdma_nl_multicast() returns either negative error code or zero if succeeded. Remove unnecessary ret code checks and reassignments. 
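The simplification leans on the kernel's 0-or-negative-errno convention: the callee's result needs no remapping before use. A minimal sketch with hypothetical names:

#include <errno.h>

static int do_send(void)
{
	return 0;	/* stub: 0 on success, -errno on failure */
}

static int make_request(void)
{
	int ret = do_send();

	/* any non-zero value already means failure, so the old
	 * "<= 0 means error, positive means byte count" dance goes away
	 */
	if (ret)
		return -EIO;
	return 0;
}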
Reviewed-by: Kaike Wan
Signed-off-by: Alex Estrin
Signed-off-by: Dennis Dalessandro
Reviewed-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/core/sa_query.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 19e1833e13fc..a5e76d432d3f 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -835,7 +835,6 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask)
 	struct sk_buff *skb = NULL;
 	struct nlmsghdr *nlh;
 	void *data;
-	int ret = 0;
 	struct ib_sa_mad *mad;
 	int len;
@@ -862,13 +861,7 @@ static int ib_nl_send_msg(struct ib_sa_query *query, gfp_t gfp_mask)
 	/* Repair the nlmsg header length */
 	nlmsg_end(skb, nlh);

-	ret = rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask);
-	if (!ret)
-		ret = len;
-	else
-		ret = 0;
-
-	return ret;
+	return rdma_nl_multicast(skb, RDMA_NL_GROUP_LS, gfp_mask);
 }

 static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask)
@@ -891,14 +884,12 @@ static int ib_nl_make_request(struct ib_sa_query *query, gfp_t gfp_mask)
 	spin_unlock_irqrestore(&ib_nl_request_lock, flags);

 	ret = ib_nl_send_msg(query, gfp_mask);
-	if (ret <= 0) {
+	if (ret) {
 		ret = -EIO;
 		/* Remove the request */
 		spin_lock_irqsave(&ib_nl_request_lock, flags);
 		list_del(&query->list);
 		spin_unlock_irqrestore(&ib_nl_request_lock, flags);
-	} else {
-		ret = 0;
 	}

 	return ret;

From b54900fce4835862ab15b4f94f7cb676e65ecf6d Mon Sep 17 00:00:00 2001
From: Colin Ian King
Date: Thu, 27 Sep 2018 14:24:30 +0100
Subject: [PATCH 152/242] RDMA/hns: fix spelling mistake "reseved" ->
 "reserved"

Trivial fix to a spelling mistake in a dev_err error message.

Signed-off-by: Colin Ian King
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/hns/hns_roce_hw_v1.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
index 081aa91fc162..ca05810c92dc 100644
--- a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
+++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
@@ -731,7 +731,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
 	cq_init_attr.comp_vector = 0;
 	cq = hns_roce_ib_create_cq(&hr_dev->ib_dev, &cq_init_attr, NULL, NULL);
 	if (IS_ERR(cq)) {
-		dev_err(dev, "Create cq for reseved loop qp failed!");
+		dev_err(dev, "Create cq for reserved loop qp failed!");
 		return -ENOMEM;
 	}
 	free_mr->mr_free_cq = to_hr_cq(cq);
@@ -744,7 +744,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)

 	pd = hns_roce_alloc_pd(&hr_dev->ib_dev, NULL, NULL);
 	if (IS_ERR(pd)) {
-		dev_err(dev, "Create pd for reseved loop qp failed!");
+		dev_err(dev, "Create pd for reserved loop qp failed!");
 		ret = -ENOMEM;
 		goto alloc_pd_failed;
 	}

From 3994586f4d7a1e8eb2a152405d0a1c9c8b947c4c Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Tue, 25 Sep 2018 12:04:04 +0300
Subject: [PATCH 153/242] RDMA/core: Acquire and release mmap_sem on page range

Currently mmap_sem is read locked while pinning the memory. In a
multi-threaded process, holding the mmap_sem lock creates contention
with other threads that might be registering memory, creating QPs, or
simply calling mmap(), since such operations also need to take the
mmap_sem write lock. None of those operations can make forward progress
until the memory pin operation completes. This gets even worse when the
memory being unpinned and/or registered is large (in the GB range).
Therefore, instead of holding mmap_sem for too long (for whole region pinning), acquire and release the lock for every few pages. For example on x86 with 4K page size, acquire and release mmap_sem for every 2Mbytes memory chunk. This allows other competing threads to make progress who might wish to hold mmap_sem for shorter duration. When memory registration latency is measured using [1] for memory sizes ranging from 4K to 48GB, <= 1% or 0.5% degradation is noticed. In many runs no difference is seen other than run-to-run variance. In other targeted tests of users with large memory, desired improvements are seen due to reduced contention of mmap_sem. [1] https://github.com/paravmellanox/rtool $ rdma_resource_lat -c 1 -s 48G -a -u L -i 500 -A It registers pinned memory from 4K to 48GB size with 500 iterations for each memory size. $ rdma_resource_lat -c 1 -s 12G -a -u L -i 500 -t 4 4 competing threads pin memory, each of 12GB size with 500 iterations. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 8da1cf29a69f..c6144df47ea4 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -181,8 +181,8 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, sg_list_start = umem->sg_head.sgl; - down_read(&mm->mmap_sem); while (npages) { + down_read(&mm->mmap_sem); ret = get_user_pages_longterm(cur_base, min_t(unsigned long, npages, PAGE_SIZE / sizeof (struct page *)), @@ -196,17 +196,20 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr, cur_base += ret * PAGE_SIZE; npages -= ret; + /* Continue to hold the mmap_sem as vma_list access + * needs to be protected. + */ for_each_sg(sg_list_start, sg, ret, i) { if (vma_list && !is_vm_hugetlb_page(vma_list[i])) umem->hugetlb = 0; sg_set_page(sg, page_list[i], PAGE_SIZE, 0); } + up_read(&mm->mmap_sem); /* preparing for next loop */ sg_list_start = sg; } - up_read(&mm->mmap_sem); umem->nmap = ib_dma_map_sg_attrs(context->device, umem->sg_head.sgl, From 7f72052cb48efb5637ed99d2f45cb33a0bf60719 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 20 Sep 2018 21:45:18 +0300 Subject: [PATCH 154/242] IB/mlx5: Expose RAW QP device handles to user space Expose RAW QP device handles to user space by extending the UHW part of mlx5_ib_create_qp_resp. This data is returned only when DEVX context is used where it may be applicable. 
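On the consuming side, the comp_mask convention means each extended response field is only meaningful when its bit is set by the kernel. A user-space sketch, assuming installed uapi headers (the function name is illustrative):

#include <stdio.h>
#include <rdma/mlx5-abi.h>

/* Fields added by this patch are valid only when the matching
 * comp_mask bit was set in the create-QP response.
 */
static void print_raw_qp_handles(const struct mlx5_ib_create_qp_resp *resp)
{
	if (resp->comp_mask & MLX5_IB_CREATE_QP_RESP_MASK_SQN)
		printf("sqn %u\n", resp->sqn);
	if (resp->comp_mask & MLX5_IB_CREATE_QP_RESP_MASK_RQN)
		printf("rqn %u\n", resp->rqn);
	if (resp->comp_mask & MLX5_IB_CREATE_QP_RESP_MASK_TIRN)
		printf("tirn %u\n", resp->tirn);
	if (resp->comp_mask & MLX5_IB_CREATE_QP_RESP_MASK_TISN)
		printf("tisn %u\n", resp->tisn);
}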
Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/qp.c | 38 +++++++++++++++++++++++++++++++-- include/uapi/rdma/mlx5-abi.h | 13 +++++++++++ 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 3455b50705cd..c49a0815a12b 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1325,7 +1325,9 @@ static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev, static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, u32 *in, size_t inlen, - struct ib_pd *pd) + struct ib_pd *pd, + struct ib_udata *udata, + struct mlx5_ib_create_qp_resp *resp) { struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; struct mlx5_ib_sq *sq = &raw_packet_qp->sq; @@ -1335,6 +1337,7 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext); int err; u32 tdn = mucontext->tdn; + u16 uid = to_mpd(pd)->uid; if (qp->sq.wqe_cnt) { err = create_raw_packet_qp_tis(dev, qp, sq, tdn, pd); @@ -1345,6 +1348,13 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, if (err) goto err_destroy_tis; + if (uid) { + resp->tisn = sq->tisn; + resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TISN; + resp->sqn = sq->base.mqp.qpn; + resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_SQN; + } + sq->base.container_mibqp = qp; sq->base.mqp.event = mlx5_ib_qp_event; } @@ -1363,13 +1373,25 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, err = create_raw_packet_qp_tir(dev, rq, tdn, &qp->flags_en, pd); if (err) goto err_destroy_rq; + + if (uid) { + resp->rqn = rq->base.mqp.qpn; + resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_RQN; + resp->tirn = rq->tirn; + resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TIRN; + } } qp->trans_qp.base.mqp.qpn = qp->sq.wqe_cnt ? 
				     sq->base.mqp.qpn : rq->base.mqp.qpn;
+	err = ib_copy_to_udata(udata, resp, min(udata->outlen, sizeof(*resp)));
+	if (err)
+		goto err_destroy_tir;

 	return 0;

+err_destroy_tir:
+	destroy_raw_packet_qp_tir(dev, rq, qp->flags_en, pd);
 err_destroy_rq:
 	destroy_raw_packet_qp_rq(dev, rq);
 err_destroy_sq:
@@ -1640,12 +1662,23 @@ create_tir:
 	if (err)
 		goto err;

+	if (mucontext->devx_uid) {
+		resp.comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TIRN;
+		resp.tirn = qp->rss_qp.tirn;
+	}
+
+	err = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp)));
+	if (err)
+		goto err_copy;
+
 	kvfree(in);
 	/* qpn is reserved for that QP */
 	qp->trans_qp.base.mqp.qpn = 0;
 	qp->flags |= MLX5_IB_QP_RSS;
 	return 0;

+err_copy:
+	mlx5_cmd_destroy_tir(dev->mdev, qp->rss_qp.tirn, mucontext->devx_uid);
 err:
 	kvfree(in);
 	return err;
@@ -1978,7 +2011,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 	    qp->flags & MLX5_IB_QP_UNDERLAY) {
 		qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr;
 		raw_packet_qp_copy_info(qp, &qp->raw_packet_qp);
-		err = create_raw_packet_qp(dev, qp, in, inlen, pd);
+		err = create_raw_packet_qp(dev, qp, in, inlen, pd, udata,
+					   &resp);
 	} else {
 		err = mlx5_core_create_qp(dev->mdev, &base->mqp, in, inlen);
 	}
diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
index e584ba40208e..6056625237cf 100644
--- a/include/uapi/rdma/mlx5-abi.h
+++ b/include/uapi/rdma/mlx5-abi.h
@@ -351,9 +351,22 @@ struct mlx5_ib_create_qp_rss {
 	__u32   flags;
 };

+enum mlx5_ib_create_qp_resp_mask {
+	MLX5_IB_CREATE_QP_RESP_MASK_TIRN = 1UL << 0,
+	MLX5_IB_CREATE_QP_RESP_MASK_TISN = 1UL << 1,
+	MLX5_IB_CREATE_QP_RESP_MASK_RQN  = 1UL << 2,
+	MLX5_IB_CREATE_QP_RESP_MASK_SQN  = 1UL << 3,
+};
+
 struct mlx5_ib_create_qp_resp {
 	__u32	bfreg_index;
 	__u32   reserved;
+	__u32	comp_mask;
+	__u32	tirn;
+	__u32	tisn;
+	__u32	rqn;
+	__u32	sqn;
+	__u32	reserved1;
 };

 struct mlx5_ib_alloc_mw {

From 76dc5a8406bffabf3f466e331a3e9515ddf93954 Mon Sep 17 00:00:00 2001
From: Yishai Hadas
Date: Thu, 20 Sep 2018 21:45:19 +0300
Subject: [PATCH 155/242] IB/mlx5: Manage device uid for DEVX white list
 commands

Manage a device uid for DEVX white list commands. The created device
uid is used on white list commands when the user didn't supply its own
uid. This enables the firmware to recognize the uid and filter out
non-privileged functionality accordingly.
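The reworked mlx5_ib_devx_create() below packs two results into one int: a valid uid is always positive, an error is a negative errno, and zero is never returned. A self-contained sketch of the idiom (all names hypothetical):

#include <errno.h>

static int create_uid(int simulate_failure)
{
	if (simulate_failure)
		return -EINVAL;
	return 17;	/* firmware-style handle, > 0 on success */
}

static int caller(void)
{
	int uid = create_uid(0);

	if (uid < 0)
		return uid;	/* propagate the errno as-is */
	/* use uid directly; no out-parameter needed */
	return 0;
}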
Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 12 ++++++------ drivers/infiniband/hw/mlx5/main.c | 16 ++++++++++++---- drivers/infiniband/hw/mlx5/mlx5_ib.h | 13 +++++-------- 3 files changed, 23 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 562c7936bbad..97cac57dcb3d 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -45,13 +45,14 @@ static struct mlx5_ib_ucontext *devx_ufile2uctx(struct ib_uverbs_file *file) return to_mucontext(ib_uverbs_get_ucontext(file)); } -int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *context) +int mlx5_ib_devx_create(struct mlx5_ib_dev *dev) { u32 in[MLX5_ST_SZ_DW(create_uctx_in)] = {0}; u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0}; u64 general_obj_types; void *hdr; int err; + u16 uid; hdr = MLX5_ADDR_OF(create_uctx_in, in, hdr); @@ -70,19 +71,18 @@ int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, struct mlx5_ib_ucontext *contex if (err) return err; - context->devx_uid = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); - return 0; + uid = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id); + return uid; } -void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, - struct mlx5_ib_ucontext *context) +void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid) { u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {0}; u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {0}; MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT); MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_UCTX); - MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, context->devx_uid); + MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, uid); mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); } diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 597cd3c171c9..10e59923e95b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -1765,9 +1765,10 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, goto out_uars; } - err = mlx5_ib_devx_create(dev, context); - if (err) + err = mlx5_ib_devx_create(dev); + if (err < 0) goto out_uars; + context->devx_uid = err; } err = mlx5_ib_alloc_transport_domain(dev, &context->tdn, @@ -1870,7 +1871,7 @@ out_mdev: mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); out_devx: if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) - mlx5_ib_devx_destroy(dev, context); + mlx5_ib_devx_destroy(dev, context->devx_uid); out_uars: deallocate_uars(dev, context); @@ -1904,7 +1905,7 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid); if (context->devx_uid) - mlx5_ib_devx_destroy(dev, context); + mlx5_ib_devx_destroy(dev, context->devx_uid); deallocate_uars(dev, context); kfree(bfregi->sys_pages); @@ -6189,6 +6190,8 @@ void __mlx5_ib_remove(struct mlx5_ib_dev *dev, profile->stage[stage].cleanup(dev); } + if (dev->devx_whitelist_uid) + mlx5_ib_devx_destroy(dev, dev->devx_whitelist_uid); ib_dealloc_device((struct ib_device *)dev); } @@ -6197,6 +6200,7 @@ void *__mlx5_ib_add(struct mlx5_ib_dev *dev, { int err; int i; + int uid; for (i = 0; i < MLX5_IB_STAGE_MAX; i++) { if (profile->stage[i].init) { @@ -6206,6 +6210,10 @@ void *__mlx5_ib_add(struct mlx5_ib_dev *dev, } } + uid = mlx5_ib_devx_create(dev); + if (uid > 0) + dev->devx_whitelist_uid = uid; 
+ dev->profile = profile; dev->ib_active = true; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 5ada28d6e5e9..e5ec3fdaa4d5 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -925,6 +925,7 @@ struct mlx5_ib_dev { struct list_head ib_dev_list; u64 sys_image_guid; struct mlx5_memic memic; + u16 devx_whitelist_uid; }; static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq) @@ -1249,10 +1250,8 @@ void mlx5_ib_put_native_port_mdev(struct mlx5_ib_dev *dev, u8 port_num); #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS) -int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, - struct mlx5_ib_ucontext *context); -void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, - struct mlx5_ib_ucontext *context); +int mlx5_ib_devx_create(struct mlx5_ib_dev *dev); +void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid); const struct uverbs_object_tree_def *mlx5_ib_get_devx_tree(void); struct mlx5_ib_flow_handler *mlx5_ib_raw_fs_rule_add( struct mlx5_ib_dev *dev, struct mlx5_ib_flow_matcher *fs_matcher, @@ -1263,10 +1262,8 @@ int mlx5_ib_get_flow_trees(const struct uverbs_object_tree_def **root); void mlx5_ib_destroy_flow_action_raw(struct mlx5_ib_flow_action *maction); #else static inline int -mlx5_ib_devx_create(struct mlx5_ib_dev *dev, - struct mlx5_ib_ucontext *context) { return -EOPNOTSUPP; }; -static inline void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, - struct mlx5_ib_ucontext *context) {} +mlx5_ib_devx_create(struct mlx5_ib_dev *dev) { return -EOPNOTSUPP; }; +static inline void mlx5_ib_devx_destroy(struct mlx5_ib_dev *dev, u16 uid) {} static inline const struct uverbs_object_tree_def * mlx5_ib_get_devx_tree(void) { return NULL; } static inline bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, From 7e1335a736969a8b5169629e6569779d42fcda2f Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 20 Sep 2018 21:45:20 +0300 Subject: [PATCH 156/242] IB/mlx5: Enable DEVX white list commands Enable DEVX white list commands without the need for CAP_NET_RAW. DEVX uid must exist from the ucontext or the device so that the firmware will mask unprivileged capabilities. 
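The access rules introduced below in devx_get_uid() can be restated as a small decision function (parameters flattened for illustration; the real helper reads them from the ucontext and device, and uses capable() for the capability check):

#include <errno.h>
#include <stdbool.h>

static int get_uid(bool whitelisted, int ctx_uid, int dev_uid,
		   bool cap_net_raw)
{
	if (whitelisted) {
		if (ctx_uid)
			return ctx_uid;		/* prefer the ucontext uid */
		if (dev_uid)
			return dev_uid;		/* fall back to device uid */
		return -EOPNOTSUPP;		/* no uid available at all */
	}
	if (!ctx_uid)
		return -EINVAL;			/* non-whitelist needs a uid */
	if (!cap_net_raw)
		return -EPERM;			/* ...and CAP_NET_RAW */
	return ctx_uid;
}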
Signed-off-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 75 ++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 97cac57dcb3d..c11640047f26 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -61,9 +61,6 @@ int mlx5_ib_devx_create(struct mlx5_ib_dev *dev) !(general_obj_types & MLX5_GENERAL_OBJ_TYPES_CAP_UMEM)) return -EINVAL; - if (!capable(CAP_NET_RAW)) - return -EPERM; - MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type, MLX5_OBJ_TYPE_UCTX); @@ -476,12 +473,49 @@ static bool devx_is_obj_query_cmd(const void *in) } } +static bool devx_is_whitelist_cmd(void *in) +{ + u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + + switch (opcode) { + case MLX5_CMD_OP_QUERY_HCA_CAP: + case MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT: + return true; + default: + return false; + } +} + +static int devx_get_uid(struct mlx5_ib_ucontext *c, void *cmd_in) +{ + if (devx_is_whitelist_cmd(cmd_in)) { + struct mlx5_ib_dev *dev; + + if (c->devx_uid) + return c->devx_uid; + + dev = to_mdev(c->ibucontext.device); + if (dev->devx_whitelist_uid) + return dev->devx_whitelist_uid; + + return -EOPNOTSUPP; + } + + if (!c->devx_uid) + return -EINVAL; + + if (!capable(CAP_NET_RAW)) + return -EPERM; + + return c->devx_uid; +} static bool devx_is_general_cmd(void *in) { u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); switch (opcode) { case MLX5_CMD_OP_QUERY_HCA_CAP: + case MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT: case MLX5_CMD_OP_QUERY_VPORT_STATE: case MLX5_CMD_OP_QUERY_ADAPTER: case MLX5_CMD_OP_QUERY_ISSI: @@ -589,14 +623,16 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OTHER)( MLX5_IB_ATTR_DEVX_OTHER_CMD_OUT); void *cmd_out; int err; + int uid; c = devx_ufile2uctx(file); if (IS_ERR(c)) return PTR_ERR(c); dev = to_mdev(c->ibucontext.device); - if (!c->devx_uid) - return -EPERM; + uid = devx_get_uid(c, cmd_in); + if (uid < 0) + return uid; /* Only white list of some general HCA commands are allowed for this method. 
*/ if (!devx_is_general_cmd(cmd_in)) @@ -606,7 +642,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OTHER)( if (IS_ERR(cmd_out)) return PTR_ERR(cmd_out); - MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid); + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid); err = mlx5_cmd_exec(dev->mdev, cmd_in, uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OTHER_CMD_IN), cmd_out, cmd_out_len); @@ -816,9 +852,11 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( struct mlx5_ib_dev *dev = to_mdev(c->ibucontext.device); struct devx_obj *obj; int err; + int uid; - if (!c->devx_uid) - return -EPERM; + uid = devx_get_uid(c, cmd_in); + if (uid < 0) + return uid; if (!devx_is_obj_create_cmd(cmd_in)) return -EINVAL; @@ -831,7 +869,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( if (!obj) return -ENOMEM; - MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid); + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid); devx_set_umem_valid(cmd_in); err = mlx5_cmd_exec(dev->mdev, cmd_in, @@ -868,9 +906,11 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_MODIFY)( struct devx_obj *obj = uobj->object; void *cmd_out; int err; + int uid; - if (!c->devx_uid) - return -EPERM; + uid = devx_get_uid(c, cmd_in); + if (uid < 0) + return uid; if (!devx_is_obj_modify_cmd(cmd_in)) return -EINVAL; @@ -882,7 +922,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_MODIFY)( if (IS_ERR(cmd_out)) return PTR_ERR(cmd_out); - MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid); + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid); devx_set_umem_valid(cmd_in); err = mlx5_cmd_exec(obj->mdev, cmd_in, @@ -907,9 +947,11 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)( struct devx_obj *obj = uobj->object; void *cmd_out; int err; + int uid; - if (!c->devx_uid) - return -EPERM; + uid = devx_get_uid(c, cmd_in); + if (uid < 0) + return uid; if (!devx_is_obj_query_cmd(cmd_in)) return -EINVAL; @@ -921,7 +963,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)( if (IS_ERR(cmd_out)) return PTR_ERR(cmd_out); - MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, c->devx_uid); + MLX5_SET(general_obj_in_cmd_hdr, cmd_in, uid, uid); err = mlx5_cmd_exec(obj->mdev, cmd_in, uverbs_attr_get_len(attrs, MLX5_IB_ATTR_DEVX_OBJ_QUERY_CMD_IN), cmd_out, cmd_out_len); @@ -1020,6 +1062,9 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_UMEM_REG)( int err; if (!c->devx_uid) + return -EINVAL; + + if (!capable(CAP_NET_RAW)) return -EPERM; obj = kzalloc(sizeof(struct devx_umem), GFP_KERNEL); From 3df6e0234aebc55888069997239fe2847d4cf152 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 20 Sep 2018 21:45:21 +0300 Subject: [PATCH 157/242] IB/mlx5: Enable DEVX on IB IB has additional protections with SELinux that cannot be extended to the DEVX domain. SELinux can restrict access to pkeys. The first version of DEVX blocked IB entirely until this could be understood. Since DEVX requires CAP_NET_RAW, it supersedes the SELinux restriction and allows userspace to form arbitrary packets with arbitrary pkeys. Thus we enable IB for DEVX when CAP_NET_RAW is given. 
Signed-off-by: Yishai Hadas
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/mlx5/main.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 10e59923e95b..b3294a7e3ff9 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -1759,12 +1759,6 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
 #endif

 	if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
-		/* Block DEVX on Infiniband as of SELinux */
-		if (mlx5_ib_port_link_layer(ibdev, 1) != IB_LINK_LAYER_ETHERNET) {
-			err = -EPERM;
-			goto out_uars;
-		}
-
 		err = mlx5_ib_devx_create(dev);
 		if (err < 0)
 			goto out_uars;

From 65f07f5a09dacf3b60619f196f096ea3671a5eda Mon Sep 17 00:00:00 2001
From: Israel Rukshin
Date: Wed, 26 Sep 2018 09:44:18 +0000
Subject: [PATCH 158/242] IB/iser: Fix possible NULL deref at iser_inv_desc()

If the target remotely invalidates a bogus rkey and signature is not in
use, pi_ctx is NULL and gets dereferenced. The commit also fails the
connection on a bogus remote invalidation.

Fixes: 59caaed7a72a ("IB/iser: Support the remote invalidation exception")
Signed-off-by: Israel Rukshin
Reviewed-by: Max Gurtovoy
Reviewed-by: Sagi Grimberg
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/ulp/iser/iser_initiator.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
index 2f6388596f88..96af06cfe0af 100644
--- a/drivers/infiniband/ulp/iser/iser_initiator.c
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -589,13 +589,19 @@ void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc)
 	ib_conn->post_recv_buf_count--;
 }

-static inline void
+static inline int
 iser_inv_desc(struct iser_fr_desc *desc, u32 rkey)
 {
-	if (likely(rkey == desc->rsc.mr->rkey))
+	if (likely(rkey == desc->rsc.mr->rkey)) {
 		desc->rsc.mr_valid = 0;
-	else if (likely(rkey == desc->pi_ctx->sig_mr->rkey))
+	} else if (likely(desc->pi_ctx && rkey == desc->pi_ctx->sig_mr->rkey)) {
 		desc->pi_ctx->sig_mr_valid = 0;
+	} else {
+		iser_err("Bogus remote invalidation for rkey %#x\n", rkey);
+		return -EINVAL;
+	}
+
+	return 0;
 }

 static int
@@ -623,12 +629,14 @@ iser_check_remote_inv(struct iser_conn *iser_conn,

 		if (iser_task->dir[ISER_DIR_IN]) {
 			desc = iser_task->rdma_reg[ISER_DIR_IN].mem_h;
-			iser_inv_desc(desc, rkey);
+			if (unlikely(iser_inv_desc(desc, rkey)))
+				return -EINVAL;
 		}

 		if (iser_task->dir[ISER_DIR_OUT]) {
 			desc = iser_task->rdma_reg[ISER_DIR_OUT].mem_h;
-			iser_inv_desc(desc, rkey);
+			if (unlikely(iser_inv_desc(desc, rkey)))
+				return -EINVAL;
 		}
 	} else {
 		iser_err("failed to get task for itt=%d\n", hdr->itt);

From 721ad7e643f7002efa398838693f90284ea216d1 Mon Sep 17 00:00:00 2001
From: Zhu Yanjun
Date: Sun, 30 Sep 2018 01:57:42 -0400
Subject: [PATCH 159/242] IB/rxe: replace kvfree with vfree

The buf is allocated by vmalloc_user() in rxe_queue_init(), so it
should be freed with vfree().
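The pairing rule behind the change, as a kernel-style sketch (the function is illustrative and mirrors the rxe_queue_init() allocation):

#include <linux/vmalloc.h>
#include <linux/errno.h>

/* vmalloc_user() buffers are always vmalloc-backed, so vfree() is the
 * exact counterpart; kvfree() is only required when an allocation may
 * have come from either kmalloc() or vmalloc() (e.g. via kvmalloc()).
 */
static int queue_buf_example(unsigned long size)
{
	void *buf = vmalloc_user(size);

	if (!buf)
		return -ENOMEM;
	/* ... map buf to user space and use it ... */
	vfree(buf);	/* matches vmalloc_user() */
	return 0;
}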
Fixes: 8700e3e7c485 ("Soft RoCE driver") Reviewed-by: Leon Romanovsky Signed-off-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_cq.c | 4 ++-- drivers/infiniband/sw/rxe/rxe_qp.c | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index 2ee4b08b00ea..a57276f2cb84 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -30,7 +30,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ - +#include #include "rxe.h" #include "rxe_loc.h" #include "rxe_queue.h" @@ -97,7 +97,7 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, context, cq->queue->buf, cq->queue->buf_size, &cq->queue->ip); if (err) { - kvfree(cq->queue->buf); + vfree(cq->queue->buf); kfree(cq->queue); return err; } diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 6ff88c8250f6..45b392b7342f 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "rxe.h" #include "rxe_loc.h" @@ -257,7 +258,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, &qp->sq.queue->ip); if (err) { - kvfree(qp->sq.queue->buf); + vfree(qp->sq.queue->buf); kfree(qp->sq.queue); return err; } @@ -310,7 +311,7 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp, qp->rq.queue->buf, qp->rq.queue->buf_size, &qp->rq.queue->ip); if (err) { - kvfree(qp->rq.queue->buf); + vfree(qp->rq.queue->buf); kfree(qp->rq.queue); return err; } From 935c84ac649a147e1aad2c48ee5c5a1a9176b2d0 Mon Sep 17 00:00:00 2001 From: "Michael J. Ruhl" Date: Fri, 28 Sep 2018 07:34:57 -0700 Subject: [PATCH 160/242] IB/hfi1: Error path MAD response size is incorrect If a MAD packet has incorrect header information, the logic uses the reply path to report the error. The reply path expects *resp_len to be set prior to return. Unfortunately, *resp_len is set to 0 for this path. This causes an incorrect response packet. Fix by ensuring that the *resp_len is defaulted to the incoming packet size (wc->bytes_len - sizeof(GRH)). Reviewed-by: Mike Marciniszyn Signed-off-by: Michael J. Ruhl Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/mad.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/mad.c b/drivers/infiniband/hw/hfi1/mad.c index 0307405491e0..88a0cf930136 100644 --- a/drivers/infiniband/hw/hfi1/mad.c +++ b/drivers/infiniband/hw/hfi1/mad.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015-2017 Intel Corporation. + * Copyright(c) 2015-2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -4836,7 +4836,7 @@ static int hfi1_process_opa_mad(struct ib_device *ibdev, int mad_flags, int ret; int pkey_idx; int local_mad = 0; - u32 resp_len = 0; + u32 resp_len = in_wc->byte_len - sizeof(*in_grh); struct hfi1_ibport *ibp = to_iport(ibdev, port); pkey_idx = hfi1_lookup_pkey_idx(ibp, LIM_MGMT_P_KEY); From aef716fa5e6da3919cca22ac2097a90d73d8177f Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Thu, 27 Sep 2018 13:55:58 -0700 Subject: [PATCH 161/242] RDMA/qedr: Remove enumerated type qed_roce_ll2_tx_dest Clang warns when one enumerated type is explicitly converted to another. 
drivers/infiniband/hw/qedr/qedr_roce_cm.c:198:28: warning: implicit conversion from enumeration type 'enum qed_roce_ll2_tx_dest' to different enumeration type 'enum qed_ll2_tx_dest' [-Wenum-conversion] ll2_tx_pkt.tx_dest = pkt->tx_dest; ~ ~~~~~^~~~~~~ 1 warning generated. Turns out that QED_ROCE_LL2_TX_DEST_NW and QED_ROCE_LL2_TX_DEST_LB are only used once in the whole tree and QED_ROCE_LL2_TX_DEST_MAX is used nowhere. Remove them and use the equivalent values from qed_ll2_tx_dest in their place. Reported-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Acked-by: Michal Kalderon Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/qedr/qedr_roce_cm.c | 4 ++-- include/linux/qed/qed_rdma_if.h | 11 +---------- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/hw/qedr/qedr_roce_cm.c b/drivers/infiniband/hw/qedr/qedr_roce_cm.c index 85578887421b..e1ac2fd60bb1 100644 --- a/drivers/infiniband/hw/qedr/qedr_roce_cm.c +++ b/drivers/infiniband/hw/qedr/qedr_roce_cm.c @@ -519,9 +519,9 @@ static inline int qedr_gsi_build_packet(struct qedr_dev *dev, } if (ether_addr_equal(udh.eth.smac_h, udh.eth.dmac_h)) - packet->tx_dest = QED_ROCE_LL2_TX_DEST_LB; + packet->tx_dest = QED_LL2_TX_DEST_LB; else - packet->tx_dest = QED_ROCE_LL2_TX_DEST_NW; + packet->tx_dest = QED_LL2_TX_DEST_NW; packet->roce_mode = roce_mode; memcpy(packet->header.vaddr, ud_header_buffer, header_size); diff --git a/include/linux/qed/qed_rdma_if.h b/include/linux/qed/qed_rdma_if.h index df4d13f7e191..d15f8e4815e3 100644 --- a/include/linux/qed/qed_rdma_if.h +++ b/include/linux/qed/qed_rdma_if.h @@ -39,15 +39,6 @@ #include #include -enum qed_roce_ll2_tx_dest { - /* Light L2 TX Destination to the Network */ - QED_ROCE_LL2_TX_DEST_NW, - - /* Light L2 TX Destination to the Loopback */ - QED_ROCE_LL2_TX_DEST_LB, - QED_ROCE_LL2_TX_DEST_MAX -}; - #define QED_RDMA_MAX_CNQ_SIZE (0xFFFF) /* rdma interface */ @@ -581,7 +572,7 @@ struct qed_roce_ll2_packet { int n_seg; struct qed_roce_ll2_buffer payload[RDMA_MAX_SGE_PER_SQ_WQE]; int roce_mode; - enum qed_roce_ll2_tx_dest tx_dest; + enum qed_ll2_tx_dest tx_dest; }; enum qed_rdma_type { From 8c31c9188b27b276b5ba3d362fa8e2adc9b7404a Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Fri, 28 Sep 2018 10:59:53 +0000 Subject: [PATCH 162/242] RDMA/hns: remove set but not used variable 'dseg' Fixes gcc '-Wunused-but-set-variable' warning: drivers/infiniband/hw/hns/hns_roce_hw_v2.c: In function 'hns_roce_v2_post_send': drivers/infiniband/hw/hns/hns_roce_hw_v2.c:194:35: warning: variable 'dseg' set but not used [-Wunused-but-set-variable] Signed-off-by: YueHaibing Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 0d7568ee0d64..8136dd85845e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -191,7 +191,6 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct hns_roce_v2_ud_send_wqe *ud_sq_wqe; struct hns_roce_v2_rc_send_wqe *rc_sq_wqe; struct hns_roce_qp *qp = to_hr_qp(ibqp); - struct hns_roce_v2_wqe_data_seg *dseg; struct device *dev = hr_dev->dev; struct hns_roce_v2_db sq_db; struct ib_qp_attr attr; @@ -492,12 +491,6 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, roce_set_field(rc_sq_wqe->byte_4, V2_RC_SEND_WQE_BYTE_4_OPCODE_M, V2_RC_SEND_WQE_BYTE_4_OPCODE_S, hr_op); - if (wr->opcode == 
IB_WR_ATOMIC_CMP_AND_SWP || - wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) - dseg = - wqe - sizeof(struct hns_roce_v2_wqe_data_seg); - else - dseg = wqe; ret = set_rwqe_data_seg(ibqp, wr, rc_sq_wqe, wqe, &sge_ind, bad_wr); From d205a06a14796a24b3447bc5d27b7dedff4479d5 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 26 Sep 2018 10:26:44 -0700 Subject: [PATCH 163/242] IB/rdmavt: Rename check_send_wqe as setup_wqe The driver-provided function check_send_wqe allows the hardware driver to check and set up the incoming send wqe before it is inserted into the swqe ring. This patch will rename it as setup_wqe to better reflect its usage. In addition, this function is only called when all setup is complete in rdmavt. Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/qp.c | 11 ++++++++--- drivers/infiniband/hw/hfi1/verbs.c | 2 +- drivers/infiniband/hw/hfi1/verbs.h | 4 ++-- drivers/infiniband/hw/qib/qib_verbs.c | 2 +- drivers/infiniband/sw/rdmavt/qp.c | 28 +++++++++++++++++++-------- include/rdma/rdma_vt.h | 13 +++++++------ 6 files changed, 39 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 54d9ff171059..b1044a205ab6 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -282,16 +282,21 @@ void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, } /** - * hfi1_check_send_wqe - validate wqe + * hfi1_setup_wqe - set up the wqe * @qp - The qp * @wqe - The built wqe * @call_send - Determine if the send should be posted or scheduled. * + * Perform setup of the wqe. This is called + * prior to inserting the wqe into the ring but after + * the wqe has been setup by RDMAVT. This function + * allows the driver the opportunity to perform + * validation and additional setup of the wqe. 
+ * * Returns 0 on success, -EINVAL on failure * */ -int hfi1_check_send_wqe(struct rvt_qp *qp, - struct rvt_swqe *wqe, bool *call_send) +int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send) { struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct rvt_ah *ah; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 13374c727b14..bbee0cb77ff8 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1937,7 +1937,7 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.driver_f.check_modify_qp = hfi1_check_modify_qp; dd->verbs_dev.rdi.driver_f.modify_qp = hfi1_modify_qp; dd->verbs_dev.rdi.driver_f.notify_restart_rc = hfi1_restart_rc; - dd->verbs_dev.rdi.driver_f.check_send_wqe = hfi1_check_send_wqe; + dd->verbs_dev.rdi.driver_f.setup_wqe = hfi1_setup_wqe; dd->verbs_dev.rdi.driver_f.comp_vect_cpu_lookup = hfi1_comp_vect_mappings_lookup; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index 269ec338581b..bc77ffec51ce 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -343,8 +343,8 @@ int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, void hfi1_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata); void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait); -int hfi1_check_send_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, - bool *call_send); +int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, + bool *call_send); extern const u32 rc_only_opcode; extern const u32 uc_only_opcode; diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 41babbc0db58..ad9093d33cb2 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1588,7 +1588,7 @@ int qib_register_ib_device(struct qib_devdata *dd) dd->verbs_dev.rdi.driver_f.port_callback = qib_create_port_files; dd->verbs_dev.rdi.driver_f.get_pci_dev = qib_get_pci_dev; dd->verbs_dev.rdi.driver_f.check_ah = qib_check_ah; - dd->verbs_dev.rdi.driver_f.check_send_wqe = qib_check_send_wqe; + dd->verbs_dev.rdi.driver_f.setup_wqe = qib_check_send_wqe; dd->verbs_dev.rdi.driver_f.notify_new_ah = qib_notify_new_ah; dd->verbs_dev.rdi.driver_f.alloc_qpn = qib_alloc_qpn; dd->verbs_dev.rdi.driver_f.qp_priv_alloc = qib_qp_priv_alloc; diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index a9b7d7ff32ee..2db71e956d02 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1823,13 +1823,11 @@ static int rvt_post_one_wr(struct rvt_qp *qp, wqe->wr.num_sge = j; } - /* general part of wqe valid - allow for driver checks */ - if (rdi->driver_f.check_send_wqe) { - ret = rdi->driver_f.check_send_wqe(qp, wqe, call_send); - if (ret < 0) - goto bail_inval_free; - } - + /* + * Calculate and set SWQE PSN values prior to handing it off + * to the driver's check routine. This give the driver the + * opportunity to adjust PSN values based on internal checks. + */ log_pmtu = qp->log_pmtu; if (qp->ibqp.qp_type != IB_QPT_UC && qp->ibqp.qp_type != IB_QPT_RC) { @@ -1854,8 +1852,18 @@ static int rvt_post_one_wr(struct rvt_qp *qp, (wqe->length ? 
((wqe->length - 1) >> log_pmtu) : 0); - qp->s_next_psn = wqe->lpsn + 1; } + + /* general part of wqe valid - allow for driver checks */ + if (rdi->driver_f.setup_wqe) { + ret = rdi->driver_f.setup_wqe(qp, wqe, call_send); + if (ret < 0) + goto bail_inval_free_ref; + } + + if (!(rdi->post_parms[wr->opcode].flags & RVT_OPERATION_LOCAL)) + qp->s_next_psn = wqe->lpsn + 1; + if (unlikely(reserved_op)) { wqe->wr.send_flags |= RVT_SEND_RESERVE_USED; rvt_qp_wqe_reserve(qp, wqe); @@ -1869,6 +1877,10 @@ static int rvt_post_one_wr(struct rvt_qp *qp, return 0; +bail_inval_free_ref: + if (qp->ibqp.qp_type != IB_QPT_UC && + qp->ibqp.qp_type != IB_QPT_RC) + atomic_dec(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount); bail_inval_free: /* release mr holds */ while (j) { diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index cc08de3570b4..52907204afcd 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -215,13 +215,14 @@ struct rvt_driver_provided { void (*schedule_send_no_lock)(struct rvt_qp *qp); /* - * Validate the wqe. This needs to be done prior to inserting the - * wqe into the ring, but after the wqe has been set up. Allow for - * driver specific work request checking by providing a callback. - * call_send indicates if the wqe should be posted or scheduled. + * Driver specific work request setup and checking. + * This function is allowed to perform any setup, checks, or + * adjustments required to the SWQE in order to be usable by + * underlying protocols. This includes private data structure + * allocations. */ - int (*check_send_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe, - bool *call_send); + int (*setup_wqe)(struct rvt_qp *qp, struct rvt_swqe *wqe, + bool *call_send); /* * Sometimes rdmavt needs to kick the driver's send progress. That is From 5da0fc9dbf891a9c9e01a634f2126b5952afb3a6 Mon Sep 17 00:00:00 2001 From: Dennis Dalessandro Date: Fri, 28 Sep 2018 07:17:09 -0700 Subject: [PATCH 164/242] IB/hfi1: Prepare resource waits for dual leg Current implementation allows each qp to have only one send engine. As such, each qp has only one list to queue prebuilt packets when send engine resources are not available. To improve performance, it is desired to support multiple send engines for each qp. This patch creates the framework to support two send engines (two legs) for each qp for the TID RDMA protocol, which can be easily extended to support more send engines. It achieves the goal by creating a leg specific struct, iowait_work in the iowait struct, to hold the work_struct and the tx_list as well as a pointer to the parent iowait struct. The hfi1_pkt_state now has an additional field to record the current legs work structure and that is now passed to all egress waiters to determine the leg that needs to wait via a new iowait helper. The APIs are adjusted to use the new leg specific struct as required. Many new and modified helpers are added to support this change. 
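To make the leg/parent relationship concrete: each leg's work function can recover its own iowait_work with container_of() and then reach the shared parent through the back-pointer. A condensed sketch (work_to_parent is a name chosen here for illustration; the layout matches the new iowait.h below):

#include <linux/workqueue.h>
#include <linux/list.h>

struct iowait;

struct iowait_work {
	struct work_struct iowork;	/* one work item per send engine */
	struct list_head tx_head;	/* per-leg prebuilt packet list */
	struct iowait *iow;		/* back-pointer to shared state */
};

static struct iowait *work_to_parent(struct work_struct *work)
{
	struct iowait_work *w = container_of(work, struct iowait_work,
					     iowork);

	return w->iow;	/* either leg reaches the same parent iowait */
}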
Reviewed-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/Makefile | 1 + drivers/infiniband/hw/hfi1/iowait.c | 91 +++++++++++ drivers/infiniband/hw/hfi1/iowait.h | 192 ++++++++++++++++------- drivers/infiniband/hw/hfi1/qp.c | 67 ++++++-- drivers/infiniband/hw/hfi1/qp.h | 31 ++-- drivers/infiniband/hw/hfi1/ruc.c | 10 +- drivers/infiniband/hw/hfi1/sdma.c | 52 +++--- drivers/infiniband/hw/hfi1/sdma.h | 8 +- drivers/infiniband/hw/hfi1/user_sdma.c | 14 +- drivers/infiniband/hw/hfi1/verbs.c | 11 +- drivers/infiniband/hw/hfi1/verbs.h | 4 +- drivers/infiniband/hw/hfi1/verbs_txreq.h | 11 +- drivers/infiniband/hw/hfi1/vnic_sdma.c | 21 +-- drivers/infiniband/hw/qib/qib_verbs.c | 9 +- drivers/infiniband/hw/qib/qib_verbs.h | 6 +- include/rdma/rdma_vt.h | 4 +- 16 files changed, 366 insertions(+), 166 deletions(-) create mode 100644 drivers/infiniband/hw/hfi1/iowait.c diff --git a/drivers/infiniband/hw/hfi1/Makefile b/drivers/infiniband/hw/hfi1/Makefile index a8dcf82ab7cb..ff790390c91a 100644 --- a/drivers/infiniband/hw/hfi1/Makefile +++ b/drivers/infiniband/hw/hfi1/Makefile @@ -20,6 +20,7 @@ hfi1-y := \ firmware.o \ init.o \ intr.o \ + iowait.o \ mad.o \ mmu_rb.o \ msix.o \ diff --git a/drivers/infiniband/hw/hfi1/iowait.c b/drivers/infiniband/hw/hfi1/iowait.c new file mode 100644 index 000000000000..59dc955f1880 --- /dev/null +++ b/drivers/infiniband/hw/hfi1/iowait.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) +/* + * Copyright(c) 2018 Intel Corporation. + * + */ +#include "iowait.h" + +void iowait_set_flag(struct iowait *wait, u32 flag) +{ + set_bit(flag, &wait->flags); +} + +bool iowait_flag_set(struct iowait *wait, u32 flag) +{ + return test_bit(flag, &wait->flags); +} + +inline void iowait_clear_flag(struct iowait *wait, u32 flag) +{ + clear_bit(flag, &wait->flags); +} + +/** + * iowait_init() - initialize wait structure + * @wait: wait struct to initialize + * @tx_limit: limit for overflow queuing + * @func: restart function for workqueue + * @sleep: sleep function for no space + * @resume: wakeup function for no space + * + * This function initializes the iowait + * structure embedded in the QP or PQ. 
+ * + */ +void iowait_init(struct iowait *wait, u32 tx_limit, + void (*func)(struct work_struct *work), + void (*tidfunc)(struct work_struct *work), + int (*sleep)(struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *tx, + uint seq, + bool pkts_sent), + void (*wakeup)(struct iowait *wait, int reason), + void (*sdma_drained)(struct iowait *wait)) +{ + int i; + + wait->count = 0; + INIT_LIST_HEAD(&wait->list); + init_waitqueue_head(&wait->wait_dma); + init_waitqueue_head(&wait->wait_pio); + atomic_set(&wait->sdma_busy, 0); + atomic_set(&wait->pio_busy, 0); + wait->tx_limit = tx_limit; + wait->sleep = sleep; + wait->wakeup = wakeup; + wait->sdma_drained = sdma_drained; + wait->flags = 0; + for (i = 0; i < IOWAIT_SES; i++) { + wait->wait[i].iow = wait; + INIT_LIST_HEAD(&wait->wait[i].tx_head); + if (i == IOWAIT_IB_SE) + INIT_WORK(&wait->wait[i].iowork, func); + else + INIT_WORK(&wait->wait[i].iowork, tidfunc); + } +} + +/** + * iowait_cancel_work - cancel all work in iowait + * @w: the iowait struct + */ +void iowait_cancel_work(struct iowait *w) +{ + cancel_work_sync(&iowait_get_ib_work(w)->iowork); + cancel_work_sync(&iowait_get_tid_work(w)->iowork); +} + +/** + * iowait_set_work_flag - set work flag based on leg + * @w - the iowait work struct + */ +int iowait_set_work_flag(struct iowait_work *w) +{ + if (w == &w->iow->wait[IOWAIT_IB_SE]) { + iowait_set_flag(w->iow, IOWAIT_PENDING_IB); + return IOWAIT_IB_SE; + } + iowait_set_flag(w->iow, IOWAIT_PENDING_TID); + return IOWAIT_TID_SE; +} diff --git a/drivers/infiniband/hw/hfi1/iowait.h b/drivers/infiniband/hw/hfi1/iowait.h index 3d9c32c7c340..23a58ac0d47c 100644 --- a/drivers/infiniband/hw/hfi1/iowait.h +++ b/drivers/infiniband/hw/hfi1/iowait.h @@ -1,7 +1,7 @@ #ifndef _HFI1_IOWAIT_H #define _HFI1_IOWAIT_H /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -49,6 +49,7 @@ #include #include +#include #include #include "sdma_txreq.h" @@ -59,16 +60,47 @@ */ typedef void (*restart_t)(struct work_struct *work); +#define IOWAIT_PENDING_IB 0x0 +#define IOWAIT_PENDING_TID 0x1 + +/* + * A QP can have multiple Send Engines (SEs). + * + * The current use case is for supporting a TID RDMA + * packet build/xmit mechanism independent from verbs. + */ +#define IOWAIT_SES 2 +#define IOWAIT_IB_SE 0 +#define IOWAIT_TID_SE 1 + struct sdma_txreq; struct sdma_engine; /** - * struct iowait - linkage for delayed progress/waiting + * @iowork: the work struct + * @tx_head: list of prebuilt packets + * @iow: the parent iowait structure + * + * This structure is the work item (process) specific + * details associated with the each of the two SEs of the + * QP. + * + * The workstruct and the queued TXs are unique to each + * SE. 
+ */ +struct iowait; +struct iowait_work { + struct work_struct iowork; + struct list_head tx_head; + struct iowait *iow; +}; + +/** * @list: used to add/insert into QP/PQ wait lists - * @lock: uses to record the list head lock * @tx_head: overflow list of sdma_txreq's * @sleep: no space callback * @wakeup: space callback wakeup * @sdma_drained: sdma count drained + * @lock: lock protected head of wait queue * @iowork: workqueue overhead * @wait_dma: wait for sdma_busy == 0 * @wait_pio: wait for pio_busy == 0 @@ -76,6 +108,8 @@ struct sdma_engine; * @count: total number of descriptors in tx_head'ed list * @tx_limit: limit for overflow queuing * @tx_count: number of tx entry's in tx_head'ed list + * @flags: wait flags (one per QP) + * @wait: SE array * * This is to be embedded in user's state structure * (QP or PQ). @@ -98,13 +132,11 @@ struct sdma_engine; * Waiters explicity know that, but the destroy * code that unwaits QPs does not. */ - struct iowait { struct list_head list; - struct list_head tx_head; int (*sleep)( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *tx, uint seq, bool pkts_sent @@ -112,7 +144,6 @@ struct iowait { void (*wakeup)(struct iowait *wait, int reason); void (*sdma_drained)(struct iowait *wait); seqlock_t *lock; - struct work_struct iowork; wait_queue_head_t wait_dma; wait_queue_head_t wait_pio; atomic_t sdma_busy; @@ -121,63 +152,37 @@ struct iowait { u32 tx_limit; u32 tx_count; u8 starved_cnt; + unsigned long flags; + struct iowait_work wait[IOWAIT_SES]; }; #define SDMA_AVAIL_REASON 0 -/** - * iowait_init() - initialize wait structure - * @wait: wait struct to initialize - * @tx_limit: limit for overflow queuing - * @func: restart function for workqueue - * @sleep: sleep function for no space - * @resume: wakeup function for no space - * - * This function initializes the iowait - * structure embedded in the QP or PQ. 
- * - */ +void iowait_set_flag(struct iowait *wait, u32 flag); +bool iowait_flag_set(struct iowait *wait, u32 flag); +void iowait_clear_flag(struct iowait *wait, u32 flag); -static inline void iowait_init( - struct iowait *wait, - u32 tx_limit, - void (*func)(struct work_struct *work), - int (*sleep)( - struct sdma_engine *sde, - struct iowait *wait, - struct sdma_txreq *tx, - uint seq, - bool pkts_sent), - void (*wakeup)(struct iowait *wait, int reason), - void (*sdma_drained)(struct iowait *wait)) -{ - wait->count = 0; - wait->lock = NULL; - INIT_LIST_HEAD(&wait->list); - INIT_LIST_HEAD(&wait->tx_head); - INIT_WORK(&wait->iowork, func); - init_waitqueue_head(&wait->wait_dma); - init_waitqueue_head(&wait->wait_pio); - atomic_set(&wait->sdma_busy, 0); - atomic_set(&wait->pio_busy, 0); - wait->tx_limit = tx_limit; - wait->sleep = sleep; - wait->wakeup = wakeup; - wait->sdma_drained = sdma_drained; -} +void iowait_init(struct iowait *wait, u32 tx_limit, + void (*func)(struct work_struct *work), + void (*tidfunc)(struct work_struct *work), + int (*sleep)(struct sdma_engine *sde, + struct iowait_work *wait, + struct sdma_txreq *tx, + uint seq, + bool pkts_sent), + void (*wakeup)(struct iowait *wait, int reason), + void (*sdma_drained)(struct iowait *wait)); /** - * iowait_schedule() - initialize wait structure + * iowait_schedule() - schedule the default send engine work * @wait: wait struct to schedule * @wq: workqueue for schedule * @cpu: cpu */ -static inline void iowait_schedule( - struct iowait *wait, - struct workqueue_struct *wq, - int cpu) +static inline bool iowait_schedule(struct iowait *wait, + struct workqueue_struct *wq, int cpu) { - queue_work_on(cpu, wq, &wait->iowork); + return !!queue_work_on(cpu, wq, &wait->wait[IOWAIT_IB_SE].iowork); } /** @@ -228,6 +233,8 @@ static inline void iowait_sdma_add(struct iowait *wait, int count) */ static inline int iowait_sdma_dec(struct iowait *wait) { + if (!wait) + return 0; return atomic_dec_and_test(&wait->sdma_busy); } @@ -267,11 +274,13 @@ static inline void iowait_pio_inc(struct iowait *wait) } /** - * iowait_sdma_dec - note pio complete + * iowait_pio_dec - note pio complete * @wait: iowait structure */ static inline int iowait_pio_dec(struct iowait *wait) { + if (!wait) + return 0; return atomic_dec_and_test(&wait->pio_busy); } @@ -293,9 +302,9 @@ static inline void iowait_drain_wakeup(struct iowait *wait) /** * iowait_get_txhead() - get packet off of iowait list * - * @wait wait struture + * @wait iowait_work struture */ -static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait) +static inline struct sdma_txreq *iowait_get_txhead(struct iowait_work *wait) { struct sdma_txreq *tx = NULL; @@ -309,6 +318,28 @@ static inline struct sdma_txreq *iowait_get_txhead(struct iowait *wait) return tx; } +static inline u16 iowait_get_desc(struct iowait_work *w) +{ + u16 num_desc = 0; + struct sdma_txreq *tx = NULL; + + if (!list_empty(&w->tx_head)) { + tx = list_first_entry(&w->tx_head, struct sdma_txreq, + list); + num_desc = tx->num_desc; + } + return num_desc; +} + +static inline u32 iowait_get_all_desc(struct iowait *w) +{ + u32 num_desc = 0; + + num_desc = iowait_get_desc(&w->wait[IOWAIT_IB_SE]); + num_desc += iowait_get_desc(&w->wait[IOWAIT_TID_SE]); + return num_desc; +} + /** * iowait_queue - Put the iowait on a wait queue * @pkts_sent: have some packets been sent before queuing? 
@@ -372,12 +403,57 @@ static inline void iowait_starve_find_max(struct iowait *w, u8 *max, } /** - * iowait_packet_queued() - determine if a packet is already built - * @wait: the wait structure + * iowait_packet_queued() - determine if a packet is queued + * @wait: the iowait_work structure */ -static inline bool iowait_packet_queued(struct iowait *wait) +static inline bool iowait_packet_queued(struct iowait_work *wait) { return !list_empty(&wait->tx_head); } +/** + * inc_wait_count - increment wait counts + * @w: the log work struct + * @n: the count + */ +static inline void iowait_inc_wait_count(struct iowait_work *w, u16 n) +{ + if (!w) + return; + w->iow->tx_count++; + w->iow->count += n; +} + +/** + * iowait_get_tid_work - return iowait_work for tid SE + * @w: the iowait struct + */ +static inline struct iowait_work *iowait_get_tid_work(struct iowait *w) +{ + return &w->wait[IOWAIT_TID_SE]; +} + +/** + * iowait_get_ib_work - return iowait_work for ib SE + * @w: the iowait struct + */ +static inline struct iowait_work *iowait_get_ib_work(struct iowait *w) +{ + return &w->wait[IOWAIT_IB_SE]; +} + +/** + * iowait_ioww_to_iow - return iowait given iowait_work + * @w: the iowait_work struct + */ +static inline struct iowait *iowait_ioww_to_iow(struct iowait_work *w) +{ + if (likely(w)) + return w->iow; + return NULL; +} + +void iowait_cancel_work(struct iowait *w); +int iowait_set_work_flag(struct iowait_work *w); + #endif diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index b1044a205ab6..126e9739e44f 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -66,7 +66,7 @@ MODULE_PARM_DESC(qp_table_size, "QP table size"); static void flush_tx_list(struct rvt_qp *qp); static int iowait_sleep( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *stx, unsigned int seq, bool pkts_sent); @@ -134,15 +134,13 @@ const struct rvt_operation_params hfi1_post_parms[RVT_OPERATION_MAX] = { }; -static void flush_tx_list(struct rvt_qp *qp) +static void flush_list_head(struct list_head *l) { - struct hfi1_qp_priv *priv = qp->priv; - - while (!list_empty(&priv->s_iowait.tx_head)) { + while (!list_empty(l)) { struct sdma_txreq *tx; tx = list_first_entry( - &priv->s_iowait.tx_head, + l, struct sdma_txreq, list); list_del_init(&tx->list); @@ -151,6 +149,14 @@ static void flush_tx_list(struct rvt_qp *qp) } } +static void flush_tx_list(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + flush_list_head(&iowait_get_ib_work(&priv->s_iowait)->tx_head); + flush_list_head(&iowait_get_tid_work(&priv->s_iowait)->tx_head); +} + static void flush_iowait(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; @@ -336,7 +342,7 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send) * It is only used in the post send, which doesn't hold * the s_lock. */ -void _hfi1_schedule_send(struct rvt_qp *qp) +bool _hfi1_schedule_send(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; struct hfi1_ibport *ibp = @@ -344,10 +350,10 @@ void _hfi1_schedule_send(struct rvt_qp *qp) struct hfi1_pportdata *ppd = ppd_from_ibp(ibp); struct hfi1_devdata *dd = dd_from_ibdev(qp->ibqp.device); - iowait_schedule(&priv->s_iowait, ppd->hfi1_wq, - priv->s_sde ? - priv->s_sde->cpu : - cpumask_first(cpumask_of_node(dd->node))); + return iowait_schedule(&priv->s_iowait, ppd->hfi1_wq, + priv->s_sde ? 
+ priv->s_sde->cpu : + cpumask_first(cpumask_of_node(dd->node))); } static void qp_pio_drain(struct rvt_qp *qp) @@ -375,12 +381,32 @@ static void qp_pio_drain(struct rvt_qp *qp) * * This schedules qp progress and caller should hold * the s_lock. + * @return true if the first leg is scheduled; + * false if the first leg is not scheduled. */ -void hfi1_schedule_send(struct rvt_qp *qp) +bool hfi1_schedule_send(struct rvt_qp *qp) { lockdep_assert_held(&qp->s_lock); - if (hfi1_send_ok(qp)) + if (hfi1_send_ok(qp)) { _hfi1_schedule_send(qp); + return true; + } + if (qp->s_flags & HFI1_S_ANY_WAIT_IO) + iowait_set_flag(&((struct hfi1_qp_priv *)qp->priv)->s_iowait, + IOWAIT_PENDING_IB); + return false; +} + +static void hfi1_qp_schedule(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + bool ret; + + if (iowait_flag_set(&priv->s_iowait, IOWAIT_PENDING_IB)) { + ret = hfi1_schedule_send(qp); + if (ret) + iowait_clear_flag(&priv->s_iowait, IOWAIT_PENDING_IB); + } } void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag) @@ -391,16 +417,22 @@ void hfi1_qp_wakeup(struct rvt_qp *qp, u32 flag) if (qp->s_flags & flag) { qp->s_flags &= ~flag; trace_hfi1_qpwakeup(qp, flag); - hfi1_schedule_send(qp); + hfi1_qp_schedule(qp); } spin_unlock_irqrestore(&qp->s_lock, flags); /* Notify hfi1_destroy_qp() if it is waiting. */ rvt_put_qp(qp); } +void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait) +{ + if (iowait_set_work_flag(wait) == IOWAIT_IB_SE) + qp->s_flags &= ~RVT_S_BUSY; +} + static int iowait_sleep( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *stx, uint seq, bool pkts_sent) @@ -441,7 +473,7 @@ static int iowait_sleep( rvt_get_qp(qp); } write_sequnlock(&dev->iowait_lock); - qp->s_flags &= ~RVT_S_BUSY; + hfi1_qp_unbusy(qp, wait); spin_unlock_irqrestore(&qp->s_lock, flags); ret = -EBUSY; } else { @@ -640,6 +672,7 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp) &priv->s_iowait, 1, _hfi1_do_send, + NULL, iowait_sleep, iowait_wakeup, iowait_sdma_drained); @@ -689,7 +722,7 @@ void stop_send_queue(struct rvt_qp *qp) { struct hfi1_qp_priv *priv = qp->priv; - cancel_work_sync(&priv->s_iowait.iowork); + iowait_cancel_work(&priv->s_iowait); } void quiesce_qp(struct rvt_qp *qp) diff --git a/drivers/infiniband/hw/hfi1/qp.h b/drivers/infiniband/hw/hfi1/qp.h index 078cff7560b6..7adb6dff6813 100644 --- a/drivers/infiniband/hw/hfi1/qp.h +++ b/drivers/infiniband/hw/hfi1/qp.h @@ -57,18 +57,6 @@ extern unsigned int hfi1_qp_table_size; extern const struct rvt_operation_params hfi1_post_parms[]; -/* - * Send if not busy or waiting for I/O and either - * a RC response is pending or we can process send work requests. - */ -static inline int hfi1_send_ok(struct rvt_qp *qp) -{ - return !(qp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT_IO)) && - (verbs_txreq_queued(qp) || - (qp->s_flags & RVT_S_RESP_PENDING) || - !(qp->s_flags & RVT_S_ANY_WAIT_SEND)); -} - /* * Driver specific s_flags starting at bit 31 down to HFI1_S_MIN_BIT_MASK * @@ -89,6 +77,20 @@ static inline int hfi1_send_ok(struct rvt_qp *qp) #define HFI1_S_ANY_WAIT_IO (RVT_S_ANY_WAIT_IO | HFI1_S_WAIT_PIO_DRAIN) #define HFI1_S_ANY_WAIT (HFI1_S_ANY_WAIT_IO | RVT_S_ANY_WAIT_SEND) +/* + * Send if not busy or waiting for I/O and either + * a RC response is pending or we can process send work requests. 
+ */ +static inline int hfi1_send_ok(struct rvt_qp *qp) +{ + struct hfi1_qp_priv *priv = qp->priv; + + return !(qp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT_IO)) && + (verbs_txreq_queued(iowait_get_ib_work(&priv->s_iowait)) || + (qp->s_flags & RVT_S_RESP_PENDING) || + !(qp->s_flags & RVT_S_ANY_WAIT_SEND)); +} + /* * free_ahg - clear ahg from QP */ @@ -129,8 +131,8 @@ struct send_context *qp_to_send_context(struct rvt_qp *qp, u8 sc5); void qp_iter_print(struct seq_file *s, struct rvt_qp_iter *iter); -void _hfi1_schedule_send(struct rvt_qp *qp); -void hfi1_schedule_send(struct rvt_qp *qp); +bool _hfi1_schedule_send(struct rvt_qp *qp); +bool hfi1_schedule_send(struct rvt_qp *qp); void hfi1_migrate_qp(struct rvt_qp *qp); @@ -150,4 +152,5 @@ void quiesce_qp(struct rvt_qp *qp); u32 mtu_from_qp(struct rvt_dev_info *rdi, struct rvt_qp *qp, u32 pmtu); int mtu_to_path_mtu(u32 mtu); void hfi1_error_port_qps(struct hfi1_ibport *ibp, u8 sl); +void hfi1_qp_unbusy(struct rvt_qp *qp, struct iowait_work *wait); #endif /* _QP_H */ diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index 5f56f3c1b4c4..17b49b4309a3 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -825,8 +825,8 @@ void hfi1_do_send_from_rvt(struct rvt_qp *qp) void _hfi1_do_send(struct work_struct *work) { - struct iowait *wait = container_of(work, struct iowait, iowork); - struct rvt_qp *qp = iowait_to_qp(wait); + struct iowait_work *w = container_of(work, struct iowait_work, iowork); + struct rvt_qp *qp = iowait_to_qp(w->iow); hfi1_do_send(qp, true); } @@ -850,6 +850,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) ps.ibp = to_iport(qp->ibqp.device, qp->port_num); ps.ppd = ppd_from_ibp(ps.ibp); ps.in_thread = in_thread; + ps.wait = iowait_get_ib_work(&priv->s_iowait); trace_hfi1_rc_do_send(qp, in_thread); @@ -883,6 +884,8 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) /* Return if we are already busy processing a work request. */ if (!hfi1_send_ok(qp)) { + if (qp->s_flags & HFI1_S_ANY_WAIT_IO) + iowait_set_flag(&priv->s_iowait, IOWAIT_PENDING_IB); spin_unlock_irqrestore(&qp->s_lock, ps.flags); return; } @@ -896,7 +899,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) ps.pkts_sent = false; /* insure a pre-built packet is handled */ - ps.s_txreq = get_waiting_verbs_txreq(qp); + ps.s_txreq = get_waiting_verbs_txreq(ps.wait); do { /* Check for a constructed packet to be sent. 
*/ if (ps.s_txreq) { @@ -907,6 +910,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) */ if (hfi1_verbs_send(qp, &ps)) return; + /* allow other tasks to run */ if (schedule_send_yield(qp, &ps)) return; diff --git a/drivers/infiniband/hw/hfi1/sdma.c b/drivers/infiniband/hw/hfi1/sdma.c index 7a9b67e82a96..891d2386d1ca 100644 --- a/drivers/infiniband/hw/hfi1/sdma.c +++ b/drivers/infiniband/hw/hfi1/sdma.c @@ -378,7 +378,7 @@ static inline void complete_tx(struct sdma_engine *sde, __sdma_txclean(sde->dd, tx); if (complete) (*complete)(tx, res); - if (wait && iowait_sdma_dec(wait)) + if (iowait_sdma_dec(wait)) iowait_drain_wakeup(wait); } @@ -1758,7 +1758,6 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail) struct iowait *wait, *nw; struct iowait *waits[SDMA_WAIT_BATCH_SIZE]; uint i, n = 0, seq, max_idx = 0; - struct sdma_txreq *stx; struct hfi1_ibdev *dev = &sde->dd->verbs_dev; u8 max_starved_cnt = 0; @@ -1779,19 +1778,13 @@ static void sdma_desc_avail(struct sdma_engine *sde, uint avail) nw, &sde->dmawait, list) { - u16 num_desc = 0; + u32 num_desc; if (!wait->wakeup) continue; if (n == ARRAY_SIZE(waits)) break; - if (!list_empty(&wait->tx_head)) { - stx = list_first_entry( - &wait->tx_head, - struct sdma_txreq, - list); - num_desc = stx->num_desc; - } + num_desc = iowait_get_all_desc(wait); if (num_desc > avail) break; avail -= num_desc; @@ -2346,7 +2339,7 @@ static inline u16 submit_tx(struct sdma_engine *sde, struct sdma_txreq *tx) */ static int sdma_check_progress( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *tx, bool pkts_sent) { @@ -2356,12 +2349,12 @@ static int sdma_check_progress( if (tx->num_desc <= sde->desc_avail) return -EAGAIN; /* pulse the head_lock */ - if (wait && wait->sleep) { + if (wait && iowait_ioww_to_iow(wait)->sleep) { unsigned seq; seq = raw_seqcount_begin( (const seqcount_t *)&sde->head_lock.seqcount); - ret = wait->sleep(sde, wait, tx, seq, pkts_sent); + ret = wait->iow->sleep(sde, wait, tx, seq, pkts_sent); if (ret == -EAGAIN) sde->desc_avail = sdma_descq_freecnt(sde); } else { @@ -2373,7 +2366,7 @@ static int sdma_check_progress( /** * sdma_send_txreq() - submit a tx req to ring * @sde: sdma engine to use - * @wait: wait structure to use when full (may be NULL) + * @wait: SE wait structure to use when full (may be NULL) * @tx: sdma_txreq to submit * @pkts_sent: has any packet been sent yet? 
* @@ -2386,7 +2379,7 @@ static int sdma_check_progress( * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state */ int sdma_send_txreq(struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *tx, bool pkts_sent) { @@ -2397,7 +2390,7 @@ int sdma_send_txreq(struct sdma_engine *sde, /* user should have supplied entire packet */ if (unlikely(tx->tlen)) return -EINVAL; - tx->wait = wait; + tx->wait = iowait_ioww_to_iow(wait); spin_lock_irqsave(&sde->tail_lock, flags); retry: if (unlikely(!__sdma_running(sde))) @@ -2406,14 +2399,14 @@ retry: goto nodesc; tail = submit_tx(sde, tx); if (wait) - iowait_sdma_inc(wait); + iowait_sdma_inc(iowait_ioww_to_iow(wait)); sdma_update_tail(sde, tail); unlock: spin_unlock_irqrestore(&sde->tail_lock, flags); return ret; unlock_noconn: if (wait) - iowait_sdma_inc(wait); + iowait_sdma_inc(iowait_ioww_to_iow(wait)); tx->next_descq_idx = 0; #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER tx->sn = sde->tail_sn++; @@ -2422,10 +2415,7 @@ unlock_noconn: spin_lock(&sde->flushlist_lock); list_add_tail(&tx->list, &sde->flushlist); spin_unlock(&sde->flushlist_lock); - if (wait) { - wait->tx_count++; - wait->count += tx->num_desc; - } + iowait_inc_wait_count(wait, tx->num_desc); schedule_work(&sde->flush_worker); ret = -ECOMM; goto unlock; @@ -2442,7 +2432,7 @@ nodesc: /** * sdma_send_txlist() - submit a list of tx req to ring * @sde: sdma engine to use - * @wait: wait structure to use when full (may be NULL) + * @wait: SE wait structure to use when full (may be NULL) * @tx_list: list of sdma_txreqs to submit * @count: pointer to a u16 which, after return will contain the total number of * sdma_txreqs removed from the tx_list. This will include sdma_txreqs @@ -2467,7 +2457,7 @@ nodesc: * -EINVAL - sdma_txreq incomplete, -EBUSY - no space in ring (wait == NULL) * -EIOCBQUEUED - tx queued to iowait, -ECOMM bad sdma state */ -int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait, +int sdma_send_txlist(struct sdma_engine *sde, struct iowait_work *wait, struct list_head *tx_list, u16 *count_out) { struct sdma_txreq *tx, *tx_next; @@ -2479,7 +2469,7 @@ int sdma_send_txlist(struct sdma_engine *sde, struct iowait *wait, spin_lock_irqsave(&sde->tail_lock, flags); retry: list_for_each_entry_safe(tx, tx_next, tx_list, list) { - tx->wait = wait; + tx->wait = iowait_ioww_to_iow(wait); if (unlikely(!__sdma_running(sde))) goto unlock_noconn; if (unlikely(tx->num_desc > sde->desc_avail)) @@ -2500,8 +2490,9 @@ retry: update_tail: total_count = submit_count + flush_count; if (wait) { - iowait_sdma_add(wait, total_count); - iowait_starve_clear(submit_count > 0, wait); + iowait_sdma_add(iowait_ioww_to_iow(wait), total_count); + iowait_starve_clear(submit_count > 0, + iowait_ioww_to_iow(wait)); } if (tail != INVALID_TAIL) sdma_update_tail(sde, tail); @@ -2511,7 +2502,7 @@ update_tail: unlock_noconn: spin_lock(&sde->flushlist_lock); list_for_each_entry_safe(tx, tx_next, tx_list, list) { - tx->wait = wait; + tx->wait = iowait_ioww_to_iow(wait); list_del_init(&tx->list); tx->next_descq_idx = 0; #ifdef CONFIG_HFI1_DEBUG_SDMA_ORDER @@ -2520,10 +2511,7 @@ unlock_noconn: #endif list_add_tail(&tx->list, &sde->flushlist); flush_count++; - if (wait) { - wait->tx_count++; - wait->count += tx->num_desc; - } + iowait_inc_wait_count(wait, tx->num_desc); } spin_unlock(&sde->flushlist_lock); schedule_work(&sde->flush_worker); diff --git a/drivers/infiniband/hw/hfi1/sdma.h b/drivers/infiniband/hw/hfi1/sdma.h index c076eef081e8..6dc63d7c5685 100644 --- 
a/drivers/infiniband/hw/hfi1/sdma.h +++ b/drivers/infiniband/hw/hfi1/sdma.h @@ -1,7 +1,7 @@ #ifndef _HFI1_SDMA_H #define _HFI1_SDMA_H /* - * Copyright(c) 2015, 2016 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -840,14 +840,14 @@ static inline int sdma_txadd_kvaddr( dd, SDMA_MAP_SINGLE, tx, addr, len); } -struct iowait; +struct iowait_work; int sdma_send_txreq(struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *tx, bool pkts_sent); int sdma_send_txlist(struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct list_head *tx_list, u16 *count_out); diff --git a/drivers/infiniband/hw/hfi1/user_sdma.c b/drivers/infiniband/hw/hfi1/user_sdma.c index 825e475dc9fe..6e2aa4480c58 100644 --- a/drivers/infiniband/hw/hfi1/user_sdma.c +++ b/drivers/infiniband/hw/hfi1/user_sdma.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -100,7 +100,7 @@ static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len); static int defer_packet_queue( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *txreq, uint seq, bool pkts_sent); @@ -123,13 +123,13 @@ static struct mmu_rb_ops sdma_rb_ops = { static int defer_packet_queue( struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *txreq, uint seq, bool pkts_sent) { struct hfi1_user_sdma_pkt_q *pq = - container_of(wait, struct hfi1_user_sdma_pkt_q, busy); + container_of(wait->iow, struct hfi1_user_sdma_pkt_q, busy); struct hfi1_ibdev *dev = &pq->dd->verbs_dev; struct user_sdma_txreq *tx = container_of(txreq, struct user_sdma_txreq, txreq); @@ -191,7 +191,7 @@ int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, atomic_set(&pq->n_locked, 0); pq->mm = fd->mm; - iowait_init(&pq->busy, 0, NULL, defer_packet_queue, + iowait_init(&pq->busy, 0, NULL, NULL, defer_packet_queue, activate_packet_queue, NULL); pq->reqidx = 0; @@ -912,7 +912,9 @@ static int user_sdma_send_pkts(struct user_sdma_request *req, u16 maxpkts) npkts++; } dosend: - ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps, &count); + ret = sdma_send_txlist(req->sde, + iowait_get_ib_work(&pq->busy), + &req->txps, &count); req->seqsubmitted += count; if (req->seqsubmitted == req->info.npkts) { /* diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index bbee0cb77ff8..16b88948383b 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -737,7 +737,7 @@ static int wait_kmem(struct hfi1_ibdev *dev, if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { write_seqlock(&dev->iowait_lock); list_add_tail(&ps->s_txreq->txreq.list, - &priv->s_iowait.tx_head); + &ps->wait->tx_head); if (list_empty(&priv->s_iowait.list)) { if (list_empty(&dev->memwait)) mod_timer(&dev->mem_timer, jiffies + 1); @@ -748,7 +748,7 @@ static int wait_kmem(struct hfi1_ibdev *dev, rvt_get_qp(qp); } write_sequnlock(&dev->iowait_lock); - qp->s_flags &= ~RVT_S_BUSY; + hfi1_qp_unbusy(qp, ps->wait); ret = -EBUSY; } spin_unlock_irqrestore(&qp->s_lock, flags); @@ -950,8 +950,7 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, if (unlikely(ret)) 
goto bail_build; } - ret = sdma_send_txreq(tx->sde, &priv->s_iowait, &tx->txreq, - ps->pkts_sent); + ret = sdma_send_txreq(tx->sde, ps->wait, &tx->txreq, ps->pkts_sent); if (unlikely(ret < 0)) { if (ret == -ECOMM) goto bail_ecomm; @@ -1001,7 +1000,7 @@ static int pio_wait(struct rvt_qp *qp, if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) { write_seqlock(&dev->iowait_lock); list_add_tail(&ps->s_txreq->txreq.list, - &priv->s_iowait.tx_head); + &ps->wait->tx_head); if (list_empty(&priv->s_iowait.list)) { struct hfi1_ibdev *dev = &dd->verbs_dev; int was_empty; @@ -1020,7 +1019,7 @@ static int pio_wait(struct rvt_qp *qp, hfi1_sc_wantpiobuf_intr(sc, 1); } write_sequnlock(&dev->iowait_lock); - qp->s_flags &= ~RVT_S_BUSY; + hfi1_qp_unbusy(qp, ps->wait); ret = -EBUSY; } spin_unlock_irqrestore(&qp->s_lock, flags); diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index bc77ffec51ce..d4164114396e 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -166,11 +166,13 @@ struct hfi1_qp_priv { * This structure is used to hold commonly lookedup and computed values during * the send engine progress. */ +struct iowait_work; struct hfi1_pkt_state { struct hfi1_ibdev *dev; struct hfi1_ibport *ibp; struct hfi1_pportdata *ppd; struct verbs_txreq *s_txreq; + struct iowait_work *wait; unsigned long flags; unsigned long timeout; unsigned long timeout_int; @@ -247,7 +249,7 @@ static inline struct hfi1_ibdev *to_idev(struct ib_device *ibdev) return container_of(rdi, struct hfi1_ibdev, rdi); } -static inline struct rvt_qp *iowait_to_qp(struct iowait *s_iowait) +static inline struct rvt_qp *iowait_to_qp(struct iowait *s_iowait) { struct hfi1_qp_priv *priv; diff --git a/drivers/infiniband/hw/hfi1/verbs_txreq.h b/drivers/infiniband/hw/hfi1/verbs_txreq.h index 1c19bbc764b2..2a77af26a231 100644 --- a/drivers/infiniband/hw/hfi1/verbs_txreq.h +++ b/drivers/infiniband/hw/hfi1/verbs_txreq.h @@ -102,22 +102,19 @@ static inline struct sdma_txreq *get_sdma_txreq(struct verbs_txreq *tx) return &tx->txreq; } -static inline struct verbs_txreq *get_waiting_verbs_txreq(struct rvt_qp *qp) +static inline struct verbs_txreq *get_waiting_verbs_txreq(struct iowait_work *w) { struct sdma_txreq *stx; - struct hfi1_qp_priv *priv = qp->priv; - stx = iowait_get_txhead(&priv->s_iowait); + stx = iowait_get_txhead(w); if (stx) return container_of(stx, struct verbs_txreq, txreq); return NULL; } -static inline bool verbs_txreq_queued(struct rvt_qp *qp) +static inline bool verbs_txreq_queued(struct iowait_work *w) { - struct hfi1_qp_priv *priv = qp->priv; - - return iowait_packet_queued(&priv->s_iowait); + return iowait_packet_queued(w); } void hfi1_put_txreq(struct verbs_txreq *tx); diff --git a/drivers/infiniband/hw/hfi1/vnic_sdma.c b/drivers/infiniband/hw/hfi1/vnic_sdma.c index c3c96c5869ed..97bd940a056a 100644 --- a/drivers/infiniband/hw/hfi1/vnic_sdma.c +++ b/drivers/infiniband/hw/hfi1/vnic_sdma.c @@ -1,5 +1,5 @@ /* - * Copyright(c) 2017 Intel Corporation. + * Copyright(c) 2017 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -198,8 +198,8 @@ int hfi1_vnic_send_dma(struct hfi1_devdata *dd, u8 q_idx, goto free_desc; tx->retry_count = 0; - ret = sdma_send_txreq(sde, &vnic_sdma->wait, &tx->txreq, - vnic_sdma->pkts_sent); + ret = sdma_send_txreq(sde, iowait_get_ib_work(&vnic_sdma->wait), + &tx->txreq, vnic_sdma->pkts_sent); /* When -ECOMM, sdma callback will be called with ABORT status */ if (unlikely(ret && unlikely(ret != -ECOMM))) goto free_desc; @@ -230,13 +230,13 @@ tx_err: * become available. */ static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde, - struct iowait *wait, + struct iowait_work *wait, struct sdma_txreq *txreq, uint seq, bool pkts_sent) { struct hfi1_vnic_sdma *vnic_sdma = - container_of(wait, struct hfi1_vnic_sdma, wait); + container_of(wait->iow, struct hfi1_vnic_sdma, wait); struct hfi1_ibdev *dev = &vnic_sdma->dd->verbs_dev; struct vnic_txreq *tx = container_of(txreq, struct vnic_txreq, txreq); @@ -247,7 +247,7 @@ static int hfi1_vnic_sdma_sleep(struct sdma_engine *sde, vnic_sdma->state = HFI1_VNIC_SDMA_Q_DEFERRED; write_seqlock(&dev->iowait_lock); if (list_empty(&vnic_sdma->wait.list)) - iowait_queue(pkts_sent, wait, &sde->dmawait); + iowait_queue(pkts_sent, wait->iow, &sde->dmawait); write_sequnlock(&dev->iowait_lock); return -EBUSY; } @@ -285,7 +285,8 @@ void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo) for (i = 0; i < vinfo->num_tx_q; i++) { struct hfi1_vnic_sdma *vnic_sdma = &vinfo->sdma[i]; - iowait_init(&vnic_sdma->wait, 0, NULL, hfi1_vnic_sdma_sleep, + iowait_init(&vnic_sdma->wait, 0, NULL, NULL, + hfi1_vnic_sdma_sleep, hfi1_vnic_sdma_wakeup, NULL); vnic_sdma->sde = &vinfo->dd->per_sdma[i]; vnic_sdma->dd = vinfo->dd; @@ -295,10 +296,12 @@ void hfi1_vnic_sdma_init(struct hfi1_vnic_vport_info *vinfo) /* Add a free descriptor watermark for wakeups */ if (vnic_sdma->sde->descq_cnt > HFI1_VNIC_SDMA_DESC_WTRMRK) { + struct iowait_work *work; + INIT_LIST_HEAD(&vnic_sdma->stx.list); vnic_sdma->stx.num_desc = HFI1_VNIC_SDMA_DESC_WTRMRK; - list_add_tail(&vnic_sdma->stx.list, - &vnic_sdma->wait.tx_head); + work = iowait_get_ib_work(&vnic_sdma->wait); + list_add_tail(&vnic_sdma->stx.list, &work->tx_head); } } } diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index ad9093d33cb2..26ab78e5aaa7 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1716,14 +1716,14 @@ void qib_unregister_ib_device(struct qib_devdata *dd) * It is only used in post send, which doesn't hold * the s_lock. */ -void _qib_schedule_send(struct rvt_qp *qp) +bool _qib_schedule_send(struct rvt_qp *qp) { struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct qib_pportdata *ppd = ppd_from_ibp(ibp); struct qib_qp_priv *priv = qp->priv; - queue_work(ppd->qib_wq, &priv->s_work); + return queue_work(ppd->qib_wq, &priv->s_work); } /** @@ -1733,8 +1733,9 @@ void _qib_schedule_send(struct rvt_qp *qp) * This schedules qp progress. The s_lock * should be held. */ -void qib_schedule_send(struct rvt_qp *qp) +bool qib_schedule_send(struct rvt_qp *qp) { if (qib_send_ok(qp)) - _qib_schedule_send(qp); + return _qib_schedule_send(qp); + return false; } diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 3d7b744ae8fb..df90a7a41534 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012 - 2017 Intel Corporation. All rights reserved. + * Copyright (c) 2012 - 2018 Intel Corporation. All rights reserved. 
* Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved. * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved. * @@ -223,8 +223,8 @@ static inline int qib_send_ok(struct rvt_qp *qp) !(qp->s_flags & RVT_S_ANY_WAIT_SEND)); } -void _qib_schedule_send(struct rvt_qp *qp); -void qib_schedule_send(struct rvt_qp *qp); +bool _qib_schedule_send(struct rvt_qp *qp); +bool qib_schedule_send(struct rvt_qp *qp); static inline int qib_pkey_ok(u16 pkey1, u16 pkey2) { diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 52907204afcd..0a888a9ecc96 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -211,8 +211,8 @@ struct rvt_driver_provided { * version requires the s_lock not to be held. The other assumes the * s_lock is held. */ - void (*schedule_send)(struct rvt_qp *qp); - void (*schedule_send_no_lock)(struct rvt_qp *qp); + bool (*schedule_send)(struct rvt_qp *qp); + bool (*schedule_send_no_lock)(struct rvt_qp *qp); /* * Driver specific work request setup and checking. From 15b796bc3d5cc8f776f5cb0770f9f3e8b282e4d1 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 26 Sep 2018 10:27:03 -0700 Subject: [PATCH 165/242] IB/hfi1: Add static trace for iowait This patch adds the static trace for resource wait. Reviewed-by: Mitko Haralanov Signed-off-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/iowait.c | 3 ++ drivers/infiniband/hw/hfi1/trace.h | 3 +- drivers/infiniband/hw/hfi1/trace_iowait.h | 54 +++++++++++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) create mode 100644 drivers/infiniband/hw/hfi1/trace_iowait.h diff --git a/drivers/infiniband/hw/hfi1/iowait.c b/drivers/infiniband/hw/hfi1/iowait.c index 59dc955f1880..582f1ba136ff 100644 --- a/drivers/infiniband/hw/hfi1/iowait.c +++ b/drivers/infiniband/hw/hfi1/iowait.c @@ -4,9 +4,11 @@ * */ #include "iowait.h" +#include "trace_iowait.h" void iowait_set_flag(struct iowait *wait, u32 flag) { + trace_hfi1_iowait_set(wait, flag); set_bit(flag, &wait->flags); } @@ -17,6 +19,7 @@ bool iowait_flag_set(struct iowait *wait, u32 flag) inline void iowait_clear_flag(struct iowait *wait, u32 flag) { + trace_hfi1_iowait_clear(wait, flag); clear_bit(flag, &wait->flags); } diff --git a/drivers/infiniband/hw/hfi1/trace.h b/drivers/infiniband/hw/hfi1/trace.h index 8540463ef3f7..84458f1325e1 100644 --- a/drivers/infiniband/hw/hfi1/trace.h +++ b/drivers/infiniband/hw/hfi1/trace.h @@ -1,5 +1,5 @@ /* - * Copyright(c) 2015 - 2017 Intel Corporation. + * Copyright(c) 2015 - 2018 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. @@ -62,3 +62,4 @@ __print_symbolic(etype, \ #include "trace_rx.h" #include "trace_tx.h" #include "trace_mmu.h" +#include "trace_iowait.h" diff --git a/drivers/infiniband/hw/hfi1/trace_iowait.h b/drivers/infiniband/hw/hfi1/trace_iowait.h new file mode 100644 index 000000000000..27f4334ece2b --- /dev/null +++ b/drivers/infiniband/hw/hfi1/trace_iowait.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) */ +/* + * Copyright(c) 2018 Intel Corporation. 
+ * + */ +#if !defined(__HFI1_TRACE_IOWAIT_H) || defined(TRACE_HEADER_MULTI_READ) +#define __HFI1_TRACE_IOWAIT_H + +#include <linux/tracepoint.h> +#include "iowait.h" +#include "verbs.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hfi1_iowait + +DECLARE_EVENT_CLASS(hfi1_iowait_template, + TP_PROTO(struct iowait *wait, u32 flag), + TP_ARGS(wait, flag), + TP_STRUCT__entry(/* entry */ + __field(unsigned long, addr) + __field(unsigned long, flags) + __field(u32, flag) + __field(u32, qpn) + ), + TP_fast_assign(/* assign */ + __entry->addr = (unsigned long)wait; + __entry->flags = wait->flags; + __entry->flag = (1 << flag); + __entry->qpn = iowait_to_qp(wait)->ibqp.qp_num; + ), + TP_printk(/* print */ + "iowait 0x%lx qp %u flags 0x%lx flag 0x%x", + __entry->addr, + __entry->qpn, + __entry->flags, + __entry->flag + ) + ); + +DEFINE_EVENT(hfi1_iowait_template, hfi1_iowait_set, + TP_PROTO(struct iowait *wait, u32 flag), + TP_ARGS(wait, flag)); + +DEFINE_EVENT(hfi1_iowait_template, hfi1_iowait_clear, + TP_PROTO(struct iowait *wait, u32 flag), + TP_ARGS(wait, flag)); + +#endif /* __HFI1_TRACE_IOWAIT_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_iowait +#include <trace/define_trace.h>

From 3144533bf667c8e53bb20656b78295960073e57b Mon Sep 17 00:00:00 2001 From: Dennis Dalessandro Date: Wed, 26 Sep 2018 10:55:53 -0700 Subject: [PATCH 166/242] IB/hfi1: Ensure ucast_dlid access doesn't exceed bounds

The dlid assignment made by looking into the u_ucast_dlid array does not do an explicit check for the size of the array. The code path that arrives at def_port, the index value, is long and complicated, so it's best to have an explicit bounds check here.

Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c index 267da8215e08..31cd361416ac 100644 --- a/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c +++ b/drivers/infiniband/ulp/opa_vnic/opa_vnic_encap.c @@ -351,7 +351,8 @@ static uint32_t opa_vnic_get_dlid(struct opa_vnic_adapter *adapter, if (unlikely(!dlid)) v_warn("Null dlid in MAC address\n"); } else if (def_port != OPA_VNIC_INVALID_PORT) { - dlid = info->vesw.u_ucast_dlid[def_port]; + if (def_port < OPA_VESW_MAX_NUM_DEF_PORT) + dlid = info->vesw.u_ucast_dlid[def_port]; } }

From eb50130964e8c1379f37c3d3bab33a411ec62e98 Mon Sep 17 00:00:00 2001 From: Alex Estrin Date: Wed, 26 Sep 2018 10:56:03 -0700 Subject: [PATCH 167/242] IB/hfi1: Add mtu check for operational data VLs

Since Virtual Lane BCT credits and MTU are set through separate MADs, we have to ensure both are valid and that data VLs are ready for transmission before we allow the port transition to the Armed state.

Fixes: 5e2d6764a729 ("IB/hfi1: Verify port data VLs credits on transition to Armed") Reviewed-by: Mike Marciniszyn Reviewed-by: Michael J. Ruhl Signed-off-by: Alex Estrin Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/chip.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/chip.c b/drivers/infiniband/hw/hfi1/chip.c index 290c3b5b22f3..8c0f88278f58 100644 --- a/drivers/infiniband/hw/hfi1/chip.c +++ b/drivers/infiniband/hw/hfi1/chip.c @@ -10558,12 +10558,29 @@ void set_link_down_reason(struct hfi1_pportdata *ppd, u8 lcl_reason, } } -/* - * Verify if BCT for data VLs is non-zero.
+/** + * data_vls_operational() - Verify if data VL BCT credits and MTU + * are both set. + * @ppd: pointer to hfi1_pportdata structure + * + * Return: true - Ok, false -otherwise. */ static inline bool data_vls_operational(struct hfi1_pportdata *ppd) { - return !!ppd->actual_vls_operational; + int i; + u64 reg; + + if (!ppd->actual_vls_operational) + return false; + + for (i = 0; i < ppd->vls_supported; i++) { + reg = read_csr(ppd->dd, SEND_CM_CREDIT_VL + (8 * i)); + if ((reg && !ppd->dd->vld[i].mtu) || + (!reg && ppd->dd->vld[i].mtu)) + return false; + } + + return true; } /* @@ -10675,7 +10692,8 @@ int set_link_state(struct hfi1_pportdata *ppd, u32 state) if (!data_vls_operational(ppd)) { dd_dev_err(dd, - "%s: data VLs not operational\n", __func__); + "%s: Invalid data VL credits or mtu\n", + __func__); ret = -EINVAL; break; }

From bfe397c387748a0110708c0aad05a12ed549a3d4 Mon Sep 17 00:00:00 2001 From: Kaike Wan Date: Wed, 26 Sep 2018 10:56:12 -0700 Subject: [PATCH 168/242] IB/hfi1: Use VL15 for SM packets

Subnet Management Packets (SMP) should exclusively use VL15 and their SL is ignored (IBTA v1.3, Section 3.5.8.2). Therefore, when an SMP is posted, the SL in the address handle can be set to 0 by a user application. Consequently, when an address handle is created by the IB core, some fields in struct rvt_ah may not be set correctly by using the SL2SC and SC2VL tables at the time. Subsequently, when the request is posted for sending, the incoming swqe may fail the validation check, resulting in the rejection of the send request. This patch fixes the problem by using VL15 for any validation, ignoring the SL in the address handle.

Reviewed-by: Mike Marciniszyn Signed-off-by: Kaike Wan Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/qp.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c index 126e9739e44f..6f3bc4dab858 100644 --- a/drivers/infiniband/hw/hfi1/qp.c +++ b/drivers/infiniband/hw/hfi1/qp.c @@ -306,6 +306,8 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send) { struct hfi1_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num); struct rvt_ah *ah; + struct hfi1_pportdata *ppd; + struct hfi1_devdata *dd; switch (qp->ibqp.qp_type) { case IB_QPT_RC: @@ -316,8 +318,16 @@ int hfi1_setup_wqe(struct rvt_qp *qp, struct rvt_swqe *wqe, bool *call_send) *call_send = false; break; case IB_QPT_SMI: - ah = ibah_to_rvtah(wqe->ud_wr.ah); - if (wqe->length > (1 << ah->log_pmtu)) + /* + * SM packets should exclusively use VL15 and their SL is + * ignored (IBTA v1.3, Section 3.5.8.2). Therefore, when ah + * is created, SL is 0 in most cases and as a result some + * fields (vl and pmtu) in ah may not be set correctly, + * depending on the SL2SC and SC2VL tables at the time. + */ + ppd = ppd_from_ibp(ibp); + dd = dd_from_ppd(ppd); + if (wqe->length > dd->vld[15].mtu) return -EINVAL; break; case IB_QPT_GSI:

From 78fb282b150c36269fcecf5d08e6de7117e9f4ab Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sat, 15 Sep 2018 12:07:55 +0300 Subject: [PATCH 169/242] RDMA/cma: Allow accepting requests for multi-port rdma device

When IP failover is used between multiple ports of a given rdma device, allow accepting CM requests from either of the ports. This applies to IPv4 and IPv6 non-link-local addressing schemes; IPv6 link-local addresses remain bound to their netdevice. IP failover requests for listen cm_ids bound to specific netdev interfaces cannot be supported.
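For context, a hedged sketch of the kind of passive-side listener this change targets, written against the standard librdmacm calls (make_listener is a made-up helper; error handling is trimmed). Binding to an IP address rather than to a netdevice is what allows the incoming request to be matched on any port of the device:

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <rdma/rdma_cma.h>

static struct rdma_cm_id *make_listener(uint16_t port)
{
	struct rdma_event_channel *ch = rdma_create_event_channel();
	struct rdma_cm_id *id;
	struct sockaddr_in addr;

	if (!ch || rdma_create_id(ch, &id, NULL, RDMA_PS_TCP))
		return NULL;

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons(port);
	addr.sin_addr.s_addr = htonl(INADDR_ANY);	/* an address, not a netdev */

	if (rdma_bind_addr(id, (struct sockaddr *)&addr) || rdma_listen(id, 8)) {
		rdma_destroy_id(id);
		return NULL;
	}
	return id;
}

A cm_id bound this way can accept a request that arrives on either port of the device after failover, while one bound to a specific netdev interface stays pinned to that interface.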
(Similar to traditional sockets). Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index c650223c52bf..47e884162ce5 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -1460,17 +1460,34 @@ static bool cma_protocol_roce(const struct rdma_cm_id *id) return rdma_protocol_roce(device, port_num); } +static bool cma_is_req_ipv6_ll(const struct cma_req_info *req) +{ + const struct sockaddr *daddr = + (const struct sockaddr *)&req->listen_addr_storage; + const struct sockaddr_in6 *daddr6 = (const struct sockaddr_in6 *)daddr; + + /* Returns true if the req is for IPv6 link local */ + return (daddr->sa_family == AF_INET6 && + (ipv6_addr_type(&daddr6->sin6_addr) & IPV6_ADDR_LINKLOCAL)); +} + static bool cma_match_net_dev(const struct rdma_cm_id *id, const struct net_device *net_dev, - u8 port_num) + const struct cma_req_info *req) { const struct rdma_addr *addr = &id->route.addr; if (!net_dev) /* This request is an AF_IB request */ - return (!id->port_num || id->port_num == port_num) && + return (!id->port_num || id->port_num == req->port) && (addr->src_addr.ss_family == AF_IB); + /* + * If the request is not for IPv6 link local, allow matching + * request to any netdevice of the one or multiport rdma device. + */ + if (!cma_is_req_ipv6_ll(req)) + return true; /* * Net namespaces must match, and if the listner is listening * on a specific netdevice than netdevice must match as well. @@ -1498,13 +1515,14 @@ static struct rdma_id_private *cma_find_listener( hlist_for_each_entry(id_priv, &bind_list->owners, node) { if (cma_match_private_data(id_priv, ib_event->private_data)) { if (id_priv->id.device == cm_id->device && - cma_match_net_dev(&id_priv->id, net_dev, req->port)) + cma_match_net_dev(&id_priv->id, net_dev, req)) return id_priv; list_for_each_entry(id_priv_dev, &id_priv->listen_list, listen_list) { if (id_priv_dev->id.device == cm_id->device && - cma_match_net_dev(&id_priv_dev->id, net_dev, req->port)) + cma_match_net_dev(&id_priv_dev->id, + net_dev, req)) return id_priv_dev; } } From ff11c6cd521f4fd859c825976e4146dfb166029c Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sat, 15 Sep 2018 12:07:56 +0300 Subject: [PATCH 170/242] RDMA/cma: Introduce and use cma_acquire_dev_by_src_ip() Light weight version of cma_acquire_dev() just for binding with rdma device based on source IP(v4/v6) address. This simplifies cma_acquire_dev() to avoid listen_id specific checks and also for subsequent simplification for IB vs iWarp. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 84 +++++++++++++++++++++++++++-------- 1 file changed, 66 insertions(+), 18 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 47e884162ce5..11bce4909f54 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -639,6 +639,58 @@ static void cma_bind_sgid_attr(struct rdma_id_private *id_priv, id_priv->id.route.addr.dev_addr.sgid_attr = sgid_attr; } +/** + * cma_acquire_dev_by_src_ip - Acquire cma device, port, gid attribute + * based on source ip address. 
+ * @id_priv: cm_id which should be bound to cma device + * + * cma_acquire_dev_by_src_ip() binds cm id to cma device, port and GID attribute + * based on source IP address. It returns 0 on success or error code otherwise. + * It is applicable to active and passive side cm_id. + */ +static int cma_acquire_dev_by_src_ip(struct rdma_id_private *id_priv) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + const struct ib_gid_attr *sgid_attr; + union ib_gid gid, iboe_gid, *gidp; + struct cma_device *cma_dev; + enum ib_gid_type gid_type; + int ret = -ENODEV; + u8 port; + + if (dev_addr->dev_type != ARPHRD_INFINIBAND && + id_priv->id.ps == RDMA_PS_IPOIB) + return -EINVAL; + + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &iboe_gid); + + memcpy(&gid, dev_addr->src_dev_addr + + rdma_addr_gid_offset(dev_addr), sizeof(gid)); + + mutex_lock(&lock); + list_for_each_entry(cma_dev, &dev_list, list) { + for (port = rdma_start_port(cma_dev->device); + port <= rdma_end_port(cma_dev->device); port++) { + gidp = rdma_protocol_roce(cma_dev->device, port) ? + &iboe_gid : &gid; + gid_type = cma_dev->default_gid_type[port - 1]; + sgid_attr = cma_validate_port(cma_dev->device, port, + gid_type, gidp, id_priv); + if (!IS_ERR(sgid_attr)) { + id_priv->id.port_num = port; + cma_bind_sgid_attr(id_priv, sgid_attr); + cma_attach_to_dev(id_priv, cma_dev); + ret = 0; + goto out; + } + } + } +out: + mutex_unlock(&lock); + return ret; +} + static int cma_acquire_dev(struct rdma_id_private *id_priv, const struct rdma_id_private *listen_id_priv) { @@ -661,26 +713,22 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, memcpy(&gid, dev_addr->src_dev_addr + rdma_addr_gid_offset(dev_addr), sizeof gid); - if (listen_id_priv) { - cma_dev = listen_id_priv->cma_dev; - port = listen_id_priv->id.port_num; - gidp = rdma_protocol_roce(cma_dev->device, port) ? - &iboe_gid : &gid; - gid_type = listen_id_priv->gid_type; - sgid_attr = cma_validate_port(cma_dev->device, port, - gid_type, gidp, id_priv); - if (!IS_ERR(sgid_attr)) { - id_priv->id.port_num = port; - cma_bind_sgid_attr(id_priv, sgid_attr); - ret = 0; - goto out; - } + cma_dev = listen_id_priv->cma_dev; + port = listen_id_priv->id.port_num; + gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; + gid_type = listen_id_priv->gid_type; + sgid_attr = cma_validate_port(cma_dev->device, port, + gid_type, gidp, id_priv); + if (!IS_ERR(sgid_attr)) { + id_priv->id.port_num = port; + cma_bind_sgid_attr(id_priv, sgid_attr); + ret = 0; + goto out; } list_for_each_entry(cma_dev, &dev_list, list) { for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) { - if (listen_id_priv && - listen_id_priv->cma_dev == cma_dev && + if (listen_id_priv->cma_dev == cma_dev && listen_id_priv->id.port_num == port) continue; @@ -2878,7 +2926,7 @@ static void addr_handler(int status, struct sockaddr *src_addr, memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr)); if (!status && !id_priv->cma_dev) { - status = cma_acquire_dev(id_priv, NULL); + status = cma_acquire_dev_by_src_ip(id_priv); if (status) pr_debug_ratelimited("RDMA CM: ADDR_ERROR: failed to acquire device. 
status %d\n", status); @@ -3427,7 +3475,7 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr) if (ret) goto err1; - ret = cma_acquire_dev(id_priv, NULL); + ret = cma_acquire_dev_by_src_ip(id_priv); if (ret) goto err1; } From 41ab1cb7d1cd5d53d68bcf5fb3fddad77af15545 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sat, 15 Sep 2018 12:07:57 +0300 Subject: [PATCH 171/242] RDMA/cma: Introduce and use cma_ib_acquire_dev() When RDMA CM connect request arrives for IB transport, it already contains device, port, netdevice (optional). Instead of traversing all the cma devices, use the cma device already found by the cma_find_listener() for which a listener id is provided. iWarp devices doesn't need to derive RoCE GIDs, therefore drop RoCE specific checks from cma_acquire_dev() and rename it to cma_iw_acquire_dev(). Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 93 ++++++++++++++++++++++++++--------- 1 file changed, 69 insertions(+), 24 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 11bce4909f54..897aac68158b 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -691,34 +691,80 @@ out: return ret; } -static int cma_acquire_dev(struct rdma_id_private *id_priv, - const struct rdma_id_private *listen_id_priv) +/** + * cma_ib_acquire_dev - Acquire cma device, port and SGID attribute + * @id_priv: cm id to bind to cma device + * @listen_id_priv: listener cm id to match against + * @req: Pointer to req structure containaining incoming + * request information + * cma_ib_acquire_dev() acquires cma device, port and SGID attribute when + * rdma device matches for listen_id and incoming request. It also verifies + * that a GID table entry is present for the source address. + * Returns 0 on success, or returns error code otherwise. + */ +static int cma_ib_acquire_dev(struct rdma_id_private *id_priv, + const struct rdma_id_private *listen_id_priv, + struct cma_req_info *req) +{ + struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; + const struct ib_gid_attr *sgid_attr; + enum ib_gid_type gid_type; + union ib_gid gid; + + if (dev_addr->dev_type != ARPHRD_INFINIBAND && + id_priv->id.ps == RDMA_PS_IPOIB) + return -EINVAL; + + if (rdma_protocol_roce(req->device, req->port)) + rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, + &gid); + else + memcpy(&gid, dev_addr->src_dev_addr + + rdma_addr_gid_offset(dev_addr), sizeof(gid)); + + gid_type = listen_id_priv->cma_dev->default_gid_type[req->port - 1]; + sgid_attr = cma_validate_port(req->device, req->port, + gid_type, &gid, id_priv); + if (IS_ERR(sgid_attr)) + return PTR_ERR(sgid_attr); + + id_priv->id.port_num = req->port; + cma_bind_sgid_attr(id_priv, sgid_attr); + /* Need to acquire lock to protect against reader + * of cma_dev->id_list such as cma_netdev_callback() and + * cma_process_remove(). 
+ */ + mutex_lock(&lock); + cma_attach_to_dev(id_priv, listen_id_priv->cma_dev); + mutex_unlock(&lock); + return 0; +} + +static int cma_iw_acquire_dev(struct rdma_id_private *id_priv, + const struct rdma_id_private *listen_id_priv) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; const struct ib_gid_attr *sgid_attr; struct cma_device *cma_dev; - union ib_gid gid, iboe_gid, *gidp; enum ib_gid_type gid_type; int ret = -ENODEV; + union ib_gid gid; u8 port; if (dev_addr->dev_type != ARPHRD_INFINIBAND && id_priv->id.ps == RDMA_PS_IPOIB) return -EINVAL; - mutex_lock(&lock); - rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, - &iboe_gid); - memcpy(&gid, dev_addr->src_dev_addr + - rdma_addr_gid_offset(dev_addr), sizeof gid); + rdma_addr_gid_offset(dev_addr), sizeof(gid)); + + mutex_lock(&lock); cma_dev = listen_id_priv->cma_dev; port = listen_id_priv->id.port_num; - gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; gid_type = listen_id_priv->gid_type; sgid_attr = cma_validate_port(cma_dev->device, port, - gid_type, gidp, id_priv); + gid_type, &gid, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); @@ -732,11 +778,9 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, listen_id_priv->id.port_num == port) continue; - gidp = rdma_protocol_roce(cma_dev->device, port) ? - &iboe_gid : &gid; gid_type = cma_dev->default_gid_type[port - 1]; sgid_attr = cma_validate_port(cma_dev->device, port, - gid_type, gidp, id_priv); + gid_type, &gid, id_priv); if (!IS_ERR(sgid_attr)) { id_priv->id.port_num = port; cma_bind_sgid_attr(id_priv, sgid_attr); @@ -1582,18 +1626,18 @@ static struct rdma_id_private *cma_find_listener( static struct rdma_id_private * cma_ib_id_from_event(struct ib_cm_id *cm_id, const struct ib_cm_event *ib_event, + struct cma_req_info *req, struct net_device **net_dev) { - struct cma_req_info req; struct rdma_bind_list *bind_list; struct rdma_id_private *id_priv; int err; - err = cma_save_req_info(ib_event, &req); + err = cma_save_req_info(ib_event, req); if (err) return ERR_PTR(err); - *net_dev = cma_get_net_dev(ib_event, &req); + *net_dev = cma_get_net_dev(ib_event, req); if (IS_ERR(*net_dev)) { if (PTR_ERR(*net_dev) == -EAFNOSUPPORT) { /* Assuming the protocol is AF_IB */ @@ -1631,17 +1675,17 @@ cma_ib_id_from_event(struct ib_cm_id *cm_id, } if (!validate_net_dev(*net_dev, - (struct sockaddr *)&req.listen_addr_storage, - (struct sockaddr *)&req.src_addr_storage)) { + (struct sockaddr *)&req->listen_addr_storage, + (struct sockaddr *)&req->src_addr_storage)) { id_priv = ERR_PTR(-EHOSTUNREACH); goto err; } } bind_list = cma_ps_find(*net_dev ? 
dev_net(*net_dev) : &init_net, - rdma_ps_from_service_id(req.service_id), - cma_port_from_service_id(req.service_id)); - id_priv = cma_find_listener(bind_list, cm_id, ib_event, &req, *net_dev); + rdma_ps_from_service_id(req->service_id), + cma_port_from_service_id(req->service_id)); + id_priv = cma_find_listener(bind_list, cm_id, ib_event, req, *net_dev); err: rcu_read_unlock(); if (IS_ERR(id_priv) && *net_dev) { @@ -2063,11 +2107,12 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id, { struct rdma_id_private *listen_id, *conn_id = NULL; struct rdma_cm_event event = {}; + struct cma_req_info req = {}; struct net_device *net_dev; u8 offset; int ret; - listen_id = cma_ib_id_from_event(cm_id, ib_event, &net_dev); + listen_id = cma_ib_id_from_event(cm_id, ib_event, &req, &net_dev); if (IS_ERR(listen_id)) return PTR_ERR(listen_id); @@ -2100,7 +2145,7 @@ static int cma_ib_req_handler(struct ib_cm_id *cm_id, } mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING); - ret = cma_acquire_dev(conn_id, listen_id); + ret = cma_ib_acquire_dev(conn_id, listen_id, &req); if (ret) goto err2; @@ -2296,7 +2341,7 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, goto out; } - ret = cma_acquire_dev(conn_id, listen_id); + ret = cma_iw_acquire_dev(conn_id, listen_id); if (ret) { mutex_unlock(&conn_id->handler_mutex); rdma_destroy_id(new_cm_id);

From 914e5d7d4697fb9815fa237a4d42bfb952bf48d6 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 28 Sep 2018 15:20:23 -0600 Subject: [PATCH 172/242] RDMA: Fix building with CONFIG_MMU=n

zap_vma_ptes() is declared but not defined on NOMMU kernels, causing a link error for the newly added uverbs code: drivers/infiniband/core/uverbs_main.o: In function `uverbs_user_mmap_disassociate': uverbs_main.c:(.text+0x114c): undefined reference to `zap_vma_ptes' drivers/infiniband/core/uverbs_main.o: In function `rdma_umap_open': uverbs_main.c:(.text+0x53c): undefined reference to `zap_vma_ptes' Since all user access for all of our drivers depends on remapping pages to user space, disable USER_ACCESS when there is no MMU.

Fixes: 5f9794dc94f5 ("RDMA/ucontext: Add a core API for mmaping driver IO memory") Reported-by: Arnd Bergmann Signed-off-by: Jason Gunthorpe --- drivers/infiniband/Kconfig | 1 + 1 file changed, 1 insertion(+)

diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index abb6660c099c..0a3ec7c726ec 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -26,6 +26,7 @@ config INFINIBAND_USER_MAD config INFINIBAND_USER_ACCESS tristate "InfiniBand userspace access (verbs and CM)" select ANON_INODES + depends on MMU ---help--- Userspace InfiniBand access support. This enables the kernel side of userspace verbs and the userspace

From e73798f20ecb35f7d6c672d48d6b9da57c8cbf64 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 28 Sep 2018 16:28:02 -0600 Subject: [PATCH 173/242] RDMA/uverbs: Fix RCU annotation for radix slot dereference

The uapi radix tree is a write-once data structure protected by kref. Once we get to the ioctl() fop it is not possible for anything else to be writing to it, so the access should use rcu_dereference_protected.
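As a minimal illustration of the annotation in question (assumed names, not the uverbs code): for a pointer that is written once before any reader can exist, and whose lifetime is pinned by a reference count, the access can assert protection instead of entering an (S)RCU read-side critical section:

#include <linux/errno.h>
#include <linux/rcupdate.h>

struct uapi_slot {
	int method;
};

/* written exactly once, before the structure becomes reachable */
static struct uapi_slot __rcu *slot;

static int lookup_method(void)
{
	/*
	 * The "true" condition asserts that the caller's context makes
	 * the plain load safe: the data is write-once and held alive by
	 * a kref, so no srcu_read_lock()/srcu_dereference() pair is
	 * needed.
	 */
	struct uapi_slot *s = rcu_dereference_protected(slot, true);

	return s ? s->method : -ENOENT;
}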
Reported-by: Matthew Wilcox Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/uverbs_ioctl.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/core/uverbs_ioctl.c b/drivers/infiniband/core/uverbs_ioctl.c index 0e95a5888274..b0e493e8d860 100644 --- a/drivers/infiniband/core/uverbs_ioctl.c +++ b/drivers/infiniband/core/uverbs_ioctl.c @@ -386,8 +386,7 @@ static int uverbs_set_attr(struct bundle_priv *pbundle, return -EPROTONOSUPPORT; return 0; } - attr = srcu_dereference( - *slot, &pbundle->bundle.ufile->device->disassociate_srcu); + attr = rcu_dereference_protected(*slot, true); /* Reject duplicate attributes from user-space */ if (test_bit(attr_bkey, pbundle->bundle.attr_present)) @@ -498,9 +497,7 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit) if (WARN_ON(!slot)) continue; - attr_uapi = srcu_dereference( - *slot, - &pbundle->bundle.ufile->device->disassociate_srcu); + attr_uapi = rcu_dereference_protected(*slot, true); if (attr_uapi->spec.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) { current_ret = uverbs_free_idrs_array( @@ -542,7 +539,7 @@ static int ib_uverbs_cmd_verbs(struct ib_uverbs_file *ufile, uapi_key_ioctl_method(hdr->method_id)); if (unlikely(!slot)) return -EPROTONOSUPPORT; - method_elm = srcu_dereference(*slot, &ufile->device->disassociate_srcu); + method_elm = rcu_dereference_protected(*slot, true); if (!method_elm->use_stack) { pbundle = kmalloc(method_elm->bundle_size, GFP_KERNEL);

From 39f2495618c5e980d2873ea3f2d1877dd253e07a Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Sat, 29 Sep 2018 03:55:16 +0000 Subject: [PATCH 174/242] IB/mthca: Fix error return code in __mthca_init_one()

Return a negative error code from the mthca_cmd_init() error handling path instead of 0, as done elsewhere in this function.

Fixes: 80fd8238734c ("[PATCH] IB/mthca: Encapsulate command interface init") Signed-off-by: Wei Yongjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mthca/mthca_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c index f99c7e0b234c..92c49bff22bc 100644 --- a/drivers/infiniband/hw/mthca/mthca_main.c +++ b/drivers/infiniband/hw/mthca/mthca_main.c @@ -986,7 +986,8 @@ static int __mthca_init_one(struct pci_dev *pdev, int hca_type) goto err_free_dev; } - if (mthca_cmd_init(mdev)) { + err = mthca_cmd_init(mdev); + if (err) { mthca_err(mdev, "Failed to init command interface, aborting.\n"); goto err_free_dev; }

From aae0484e15f062ad2c2502e68e15dfb8b8f84608 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Sun, 30 Sep 2018 02:27:16 -0400 Subject: [PATCH 175/242] IB/rxe: avoid srq memory leak

In rxe_queue_init, q and q->buf are allocated. In do_mmap_info, q->ip is allocated. When an error occurs, rxe_srq_from_init and the later error handler do not free these allocations, which leaks memory.

Signed-off-by: Zhu Yanjun Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_srq.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_srq.c b/drivers/infiniband/sw/rxe/rxe_srq.c index 0d6c04ba7fc3..c41a5fee81f7 100644 --- a/drivers/infiniband/sw/rxe/rxe_srq.c +++ b/drivers/infiniband/sw/rxe/rxe_srq.c @@ -31,6 +31,7 @@ * SOFTWARE. */ +#include <linux/vmalloc.h> #include "rxe.h" #include "rxe_loc.h" #include "rxe_queue.h" @@ -129,13 +130,18 @@ int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq, err = do_mmap_info(rxe, uresp ?
&uresp->mi : NULL, context, q->buf, q->buf_size, &q->ip); - if (err) + if (err) { + vfree(q->buf); + kfree(q); return err; + } if (uresp) { if (copy_to_user(&uresp->srq_num, &srq->srq_num, - sizeof(uresp->srq_num))) + sizeof(uresp->srq_num))) { + rxe_queue_cleanup(q); return -EFAULT; + } } return 0; From 03241627b222427917999c533f1a2f9d5beda2be Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Tue, 2 Oct 2018 11:03:10 +0300 Subject: [PATCH 176/242] RDMA/rxe: Remove unused addr_same() This function is not in use - delete it. Signed-off-by: Kamal Heib Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe_net.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_net.c b/drivers/infiniband/sw/rxe/rxe_net.c index fb06f94f33d8..40e82e0f6c2d 100644 --- a/drivers/infiniband/sw/rxe/rxe_net.c +++ b/drivers/infiniband/sw/rxe/rxe_net.c @@ -491,11 +491,6 @@ void rxe_loopback(struct sk_buff *skb) rxe_rcv(skb); } -static inline int addr_same(struct rxe_dev *rxe, struct rxe_av *av) -{ - return rxe->port.port_guid == av->grh.dgid.global.interface_id; -} - struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av, int paylen, struct rxe_pkt_info *pkt) { From d31131bba5a1630304c55ea775c48cc84912ab59 Mon Sep 17 00:00:00 2001 From: Kamal Heib Date: Tue, 2 Oct 2018 16:11:21 +0300 Subject: [PATCH 177/242] RDMA: Remove unused parameter from ib_modify_qp_is_ok() The ll parameter is not used in ib_modify_qp_is_ok(), so remove it. Signed-off-by: Kamal Heib Reviewed-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/verbs.c | 3 +-- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 3 +-- drivers/infiniband/hw/hns/hns_roce_qp.c | 4 ++-- drivers/infiniband/hw/mlx4/qp.c | 8 +------- drivers/infiniband/hw/mlx5/qp.c | 5 ++--- drivers/infiniband/hw/mthca/mthca_qp.c | 4 ++-- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 3 +-- drivers/infiniband/hw/qedr/verbs.c | 3 +-- drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c | 2 +- drivers/infiniband/sw/rdmavt/qp.c | 5 +---- drivers/infiniband/sw/rxe/rxe_qp.c | 3 +-- include/rdma/ib_verbs.h | 4 +--- 12 files changed, 15 insertions(+), 32 deletions(-) diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index ee5fc8408add..1e7ad5e0a46e 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -1509,8 +1509,7 @@ static const struct { }; bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, - enum ib_qp_type type, enum ib_qp_attr_mask mask, - enum rdma_link_layer ll) + enum ib_qp_type type, enum ib_qp_attr_mask mask) { enum ib_qp_attr_mask req_param, opt_param; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index bc2b9e038439..9d7c48466f10 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -1598,8 +1598,7 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr, curr_qp_state = __to_ib_qp_state(qp->qplib_qp.cur_qp_state); new_qp_state = qp_attr->qp_state; if (!ib_modify_qp_is_ok(curr_qp_state, new_qp_state, - ib_qp->qp_type, qp_attr_mask, - IB_LINK_LAYER_ETHERNET)) { + ib_qp->qp_type, qp_attr_mask)) { dev_err(rdev_to_dev(rdev), "Invalid attribute mask: %#x specified ", qp_attr_mask); diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index efb7e961ca65..0378fc41fcfa 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ 
b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -952,8 +952,8 @@ int hns_roce_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, } } - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, - IB_LINK_LAYER_ETHERNET)) { + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask)) { dev_err(dev, "ib_modify_qp_is_ok failed\n"); goto out; } diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 6dd3cd2c2f80..0711ca1dfb8f 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -2629,7 +2629,6 @@ enum { static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, struct ib_udata *udata) { - enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; struct mlx4_ib_dev *dev = to_mdev(ibqp->device); struct mlx4_ib_qp *qp = to_mqp(ibqp); enum ib_qp_state cur_state, new_state; @@ -2639,13 +2638,8 @@ static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; - if (cur_state != new_state || cur_state != IB_QPS_RESET) { - int port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; - ll = rdma_port_get_link_layer(&dev->ib_dev, port); - } - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, - attr_mask, ll)) { + attr_mask)) { pr_debug("qpn 0x%x: invalid attribute mask specified " "for transition %d to %d. qp_type %d," " attr_mask 0x%x\n", diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index c49a0815a12b..fa8e5dc65cb4 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -3509,7 +3509,6 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, size_t required_cmd_sz; int err = -EINVAL; int port; - enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; if (ibqp->rwq_ind_tbl) return -ENOSYS; @@ -3555,7 +3554,6 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (!(cur_state == new_state && cur_state == IB_QPS_RESET)) { port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port; - ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port); } if (qp->flags & MLX5_IB_QP_UNDERLAY) { @@ -3566,7 +3564,8 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, } } else if (qp_type != MLX5_IB_QPT_REG_UMR && qp_type != MLX5_IB_QPT_DCI && - !ib_modify_qp_is_ok(cur_state, new_state, qp_type, attr_mask, ll)) { + !ib_modify_qp_is_ok(cur_state, new_state, qp_type, + attr_mask)) { mlx5_ib_dbg(dev, "invalid QP state transition from %d to %d, qp_type %d, attr_mask 0x%x\n", cur_state, new_state, ibqp->qp_type, attr_mask); goto out; diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 3d37f2373d63..9d178ee3c96a 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -872,8 +872,8 @@ int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask, new_state = attr_mask & IB_QP_STATE ? 
attr->qp_state : cur_state; - if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, - IB_LINK_LAYER_UNSPECIFIED)) { + if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, + attr_mask)) { mthca_dbg(dev, "Bad QP transition (transport %d) " "%d->%d with attr 0x%08x\n", qp->transport, cur_state, new_state, diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index c158ca9fde6d..06d2a7f3304c 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -1480,8 +1480,7 @@ int ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, new_qps = old_qps; spin_unlock_irqrestore(&qp->q_lock, flags); - if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask, - IB_LINK_LAYER_ETHERNET)) { + if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask)) { pr_err("%s(%d) invalid attribute mask=0x%x specified for\n" "qpn=0x%x of type=0x%x old_qps=0x%x, new_qps=0x%x\n", __func__, dev->id, attr_mask, qp->id, ibqp->qp_type, diff --git a/drivers/infiniband/hw/qedr/verbs.c b/drivers/infiniband/hw/qedr/verbs.c index 9d4d165014d9..82ee4b4a7084 100644 --- a/drivers/infiniband/hw/qedr/verbs.c +++ b/drivers/infiniband/hw/qedr/verbs.c @@ -2238,8 +2238,7 @@ int qedr_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (rdma_protocol_roce(&dev->ibdev, 1)) { if (!ib_modify_qp_is_ok(old_qp_state, new_qp_state, - ibqp->qp_type, attr_mask, - IB_LINK_LAYER_ETHERNET)) { + ibqp->qp_type, attr_mask)) { DP_ERR(dev, "modify qp: invalid attribute mask=0x%x specified for\n" "qpn=0x%x of type=0x%x old_qp_state=0x%x, new_qp_state=0x%x\n", diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c index 60083c0363a5..cf22f57a9f0d 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c @@ -499,7 +499,7 @@ int pvrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, next_state = (attr_mask & IB_QP_STATE) ? attr->qp_state : cur_state; if (!ib_modify_qp_is_ok(cur_state, next_state, ibqp->qp_type, - attr_mask, IB_LINK_LAYER_ETHERNET)) { + attr_mask)) { ret = -EINVAL; goto out; } diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 2db71e956d02..a036a5368103 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -1164,11 +1164,8 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int lastwqe = 0; int mig = 0; int pmtu = 0; /* for gcc warning only */ - enum rdma_link_layer link; int opa_ah; - link = rdma_port_get_link_layer(ibqp->device, qp->port_num); - spin_lock_irq(&qp->r_lock); spin_lock(&qp->s_hlock); spin_lock(&qp->s_lock); @@ -1179,7 +1176,7 @@ int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, opa_ah = rdma_cap_opa_ah(ibqp->device, qp->port_num); if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, - attr_mask, link)) + attr_mask)) goto inval; if (rdi->driver_f.check_modify_qp && diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 45b392b7342f..b9710907dac2 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -419,8 +419,7 @@ int rxe_qp_chk_attr(struct rxe_dev *rxe, struct rxe_qp *qp, enum ib_qp_state new_state = (mask & IB_QP_STATE) ? 
attr->qp_state : cur_state; - if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask, - IB_LINK_LAYER_ETHERNET)) { + if (!ib_modify_qp_is_ok(cur_state, new_state, qp_type(qp), mask)) { pr_warn("invalid mask or state for qp\n"); goto err1; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 9897d2329f2c..f88c1071413a 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2742,7 +2742,6 @@ static inline int ib_destroy_usecnt(atomic_t *usecnt, * @next_state: Next QP state * @type: QP type * @mask: Mask of supplied QP attributes - * @ll : link layer of port * * This function is a helper function that a low-level driver's * modify_qp method can use to validate the consumer's input. It @@ -2751,8 +2750,7 @@ static inline int ib_destroy_usecnt(atomic_t *usecnt, * and that the attribute mask supplied is allowed for the transition. */ bool ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state, - enum ib_qp_type type, enum ib_qp_attr_mask mask, - enum rdma_link_layer ll); + enum ib_qp_type type, enum ib_qp_attr_mask mask); void ib_register_event_handler(struct ib_event_handler *event_handler); void ib_unregister_event_handler(struct ib_event_handler *event_handler); From 38716732f161c3d107c4cc406a287f1201bed752 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 2 Oct 2018 11:49:24 +0300 Subject: [PATCH 178/242] RDMA/netlink: Simplify netlink listener existence check All users of rdma_nl_chk_listeners() only need a boolean answer for whether the netlink socket has listeners, so convert the function to return bool and update all call sites. Signed-off-by: Leon Romanovsky Reviewed-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 2 +- drivers/infiniband/core/netlink.c | 4 ++-- drivers/infiniband/core/sa_query.c | 2 +- include/rdma/rdma_netlink.h | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index c2ca9e4b5160..1400a9d0d56d 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -315,7 +315,7 @@ static void queue_req(struct addr_req *req) static int ib_nl_fetch_ha(struct rdma_dev_addr *dev_addr, const void *daddr, u32 seq, u16 family) { - if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) + if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) return -EADDRNOTAVAIL; return ib_nl_ip_send_msg(dev_addr, daddr, seq, family); diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 3ccaae18ad75..724f5a62e82f 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -47,9 +47,9 @@ static struct { const struct rdma_nl_cbs *cb_table; } rdma_nl_types[RDMA_NL_NUM_CLIENTS]; -int rdma_nl_chk_listeners(unsigned int group) +bool rdma_nl_chk_listeners(unsigned int group) { - return (netlink_has_listeners(nls, group)) ?
0 : -1; + return netlink_has_listeners(nls, group); } EXPORT_SYMBOL(rdma_nl_chk_listeners); diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index a5e76d432d3f..f28f6fdb78cb 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1384,7 +1384,7 @@ static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) if ((query->flags & IB_SA_ENABLE_LOCAL_SERVICE) && (!(query->flags & IB_SA_QUERY_OPA))) { - if (!rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) { + if (rdma_nl_chk_listeners(RDMA_NL_GROUP_LS)) { if (!ib_nl_make_request(query, gfp_mask)) return id; } diff --git a/include/rdma/rdma_netlink.h b/include/rdma/rdma_netlink.h index c369703fcd69..70218e6b5187 100644 --- a/include/rdma/rdma_netlink.h +++ b/include/rdma/rdma_netlink.h @@ -96,7 +96,7 @@ int rdma_nl_multicast(struct sk_buff *skb, unsigned int group, gfp_t flags); /** * Check if there are any listeners to the netlink group * @group: the netlink group ID - * Returns 0 on success or a negative for no listeners. + * Returns true on success or false if no listeners. */ -int rdma_nl_chk_listeners(unsigned int group); +bool rdma_nl_chk_listeners(unsigned int group); #endif /* _RDMA_NETLINK_H */ From 06ef0ee4b569101f3a07ce08335dbf29fd1404ef Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Sun, 30 Sep 2018 17:00:28 +0800 Subject: [PATCH 179/242] RDMA/hns: Bugfix for reserved qp number Two special QPs need to be reserved for every port. The hip08 has four ports, so eight QP numbers are reserved in all. Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 1 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 1 + drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 1 + drivers/infiniband/hw/hns/hns_roce_qp.c | 10 ++++++++-- 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index ffa92550dbfa..34f8e90c8044 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -671,6 +671,7 @@ struct hns_roce_caps { u32 max_sq_inline; /* 32 */ u32 max_rq_sg; /* 2 */ int num_qps; /* 256k */ + int reserved_qps; u32 max_wqes; /* 16k */ u32 max_sq_desc_sz; /* 64 */ u32 max_rq_desc_sz; /* 64 */ diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 8136dd85845e..e4d98a1f7e82 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1219,6 +1219,7 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) caps->reserved_mrws = 1; caps->reserved_uars = 0; caps->reserved_cqs = 0; + caps->reserved_qps = HNS_ROCE_V2_RSV_QPS; caps->qpc_ba_pg_sz = 0; caps->qpc_buf_pg_sz = 0; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 712542657c64..18775813cf13 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -78,6 +78,7 @@ #define HNS_ROCE_INVALID_LKEY 0x100 #define HNS_ROCE_CMQ_TX_TIMEOUT 30000 #define HNS_ROCE_V2_UC_RC_SGE_NUM_IN_WQE 2 +#define HNS_ROCE_V2_RSV_QPS 8 #define HNS_ROCE_CONTEXT_HOP_NUM 1 #define HNS_ROCE_MTT_HOP_NUM 1 diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 0378fc41fcfa..0e0c79609b0c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -1106,14 +1106,20 @@ int
hns_roce_init_qp_table(struct hns_roce_dev *hr_dev) { struct hns_roce_qp_table *qp_table = &hr_dev->qp_table; int reserved_from_top = 0; + int reserved_from_bot; int ret; spin_lock_init(&qp_table->lock); INIT_RADIX_TREE(&hr_dev->qp_table_tree, GFP_ATOMIC); - /* A port include two SQP, six port total 12 */ + /* In hw v1, a port include two SQP, six ports total 12 */ + if (hr_dev->caps.max_sq_sg <= 2) + reserved_from_bot = SQP_NUM; + else + reserved_from_bot = hr_dev->caps.reserved_qps; + ret = hns_roce_bitmap_init(&qp_table->bitmap, hr_dev->caps.num_qps, - hr_dev->caps.num_qps - 1, SQP_NUM, + hr_dev->caps.num_qps - 1, reserved_from_bot, reserved_from_top); if (ret) { dev_err(hr_dev->dev, "qp bitmap init failed!error=%d\n", From c80e066100b5fed722c8da67c1bd2312e7bcf129 Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Sun, 30 Sep 2018 17:00:29 +0800 Subject: [PATCH 180/242] RDMA/hns: Submit bad wr when post send wr exception When a user issues an RDMA read on a QP with SQ inline enabled, the driver needs to report a bad wr back to the user. Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index e4d98a1f7e82..b543720c775e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -133,6 +133,7 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr, } if (wr->opcode == IB_WR_RDMA_READ) { + *bad_wr = wr; dev_err(hr_dev->dev, "Not support inline data!\n"); return -EINVAL; } From 15fc056fba7b17b9abfbe80a12f188403fc949fb Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Sun, 30 Sep 2018 17:00:30 +0800 Subject: [PATCH 181/242] RDMA/hns: Bugfix for CM test When testing CM, cm_req_handler() prints a warning if the MSB of the SLID is not zero. The SLID is meaningless for a RoCE device, so fix it to zero. Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index b543720c775e..20cc86620063 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2321,6 +2321,7 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *hr_cq, wc->src_qp = (u8)roce_get_field(cqe->byte_32, V2_CQE_BYTE_32_RMT_QPN_M, V2_CQE_BYTE_32_RMT_QPN_S); + wc->slid = 0; wc->wc_flags |= (roce_get_bit(cqe->byte_32, V2_CQE_BYTE_32_GRH_S) ? IB_WC_GRH : 0); From 05ad5482a5904c416bcfd74afd7193e206e563ce Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Sun, 30 Sep 2018 17:00:31 +0800 Subject: [PATCH 182/242] RDMA/hns: Limit the size of extend sge of sq The hip08 has two hardware versions; the version IDs are 0x20 and 0x21, taken from the PCI revision. The maximum size of the extended sge space of the SQ is limited to 2M for the 0x20 version and 8M for the 0x21 version. The product of the WQE count and the extended sge count per WQE may exceed 2M, but it always stays below 8M, so only the 0x20 version needs this check.
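As a rough sketch of the bound this patch enforces (an illustration only, not the driver code; the helper name is made up, and the cap corresponds to HNS_ROCE_V2_MAX_EXTEND_SGE_NUM in the diff below):

    static int check_ext_sge_cnt(u32 wqe_cnt, u32 max_gs, u8 pci_revision,
                                 u32 max_extend_sg)
    {
            /* two sges fit in the WQE itself; the rest spill into the
             * extended sge area, rounded up to a power of two (this path
             * is only taken when max_gs > 2; roundup_pow_of_two() is the
             * kernel helper from <linux/log2.h>)
             */
            u32 sge_cnt = roundup_pow_of_two(wqe_cnt * (max_gs - 2));

            /* only the 0x20 revision carries the 0x200000-entry cap */
            if (pci_revision == 0x20 && sge_cnt > max_extend_sg)
                    return -EINVAL;

            return 0;
    }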
Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 1 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 1 + drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 1 + drivers/infiniband/hw/hns/hns_roce_qp.c | 19 +++++++++++++++++++ 4 files changed, 22 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index 34f8e90c8044..b06d3e432f37 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -670,6 +670,7 @@ struct hns_roce_caps { u32 max_sq_sg; /* 2 */ u32 max_sq_inline; /* 32 */ u32 max_rq_sg; /* 2 */ + u32 max_extend_sg; int num_qps; /* 256k */ int reserved_qps; u32 max_wqes; /* 16k */ diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 20cc86620063..d563d2955597 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -1191,6 +1191,7 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) caps->num_cqs = HNS_ROCE_V2_MAX_CQ_NUM; caps->max_cqes = HNS_ROCE_V2_MAX_CQE_NUM; caps->max_sq_sg = HNS_ROCE_V2_MAX_SQ_SGE_NUM; + caps->max_extend_sg = HNS_ROCE_V2_MAX_EXTEND_SGE_NUM; caps->max_rq_sg = HNS_ROCE_V2_MAX_RQ_SGE_NUM; caps->max_sq_inline = HNS_ROCE_V2_MAX_SQ_INLINE; caps->num_uars = HNS_ROCE_V2_UAR_NUM; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 18775813cf13..b921ca6358e0 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -50,6 +50,7 @@ #define HNS_ROCE_V2_MAX_CQE_NUM 0x10000 #define HNS_ROCE_V2_MAX_RQ_SGE_NUM 0x100 #define HNS_ROCE_V2_MAX_SQ_SGE_NUM 0xff +#define HNS_ROCE_V2_MAX_EXTEND_SGE_NUM 0x200000 #define HNS_ROCE_V2_MAX_SQ_INLINE 0x20 #define HNS_ROCE_V2_UAR_NUM 256 #define HNS_ROCE_V2_PHY_UAR_NUM 1 diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 0e0c79609b0c..2805ab2ab2c5 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -31,6 +31,7 @@ * SOFTWARE. */ +#include #include #include #include @@ -372,6 +373,16 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev, if (hr_qp->sq.max_gs > 2) hr_qp->sge.sge_cnt = roundup_pow_of_two(hr_qp->sq.wqe_cnt * (hr_qp->sq.max_gs - 2)); + + if ((hr_qp->sq.max_gs > 2) && (hr_dev->pci_dev->revision == 0x20)) { + if (hr_qp->sge.sge_cnt > hr_dev->caps.max_extend_sg) { + dev_err(hr_dev->dev, + "The extended sge cnt error! sge_cnt=%d\n", + hr_qp->sge.sge_cnt); + return -EINVAL; + } + } + hr_qp->sge.sge_shift = 4; /* Get buf size, SQ and RQ are aligned to page_szie */ @@ -465,6 +476,14 @@ static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev, hr_qp->sge.sge_shift = 4; } + if ((hr_qp->sq.max_gs > 2) && hr_dev->pci_dev->revision == 0x20) { + if (hr_qp->sge.sge_cnt > hr_dev->caps.max_extend_sg) { + dev_err(dev, "The extended sge cnt error! 
sge_cnt=%d\n", + hr_qp->sge.sge_cnt); + return -EINVAL; + } + } + /* Get buf size, SQ and RQ are aligned to PAGE_SIZE */ page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); hr_qp->sq.offset = 0; From 157b52a08da68b650a6b5bcaf427d39a5d016a36 Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Sun, 30 Sep 2018 17:00:32 +0800 Subject: [PATCH 183/242] RDMA/hns: Configure ecn field of ip header In order to compatible with the third party RoCE device, The hardware modify the set method for the ecn field of ip header in new hip08 version. The high 6bit of tclass be assigned for dscp field of packet. Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index d563d2955597..3408fc59d417 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -3640,8 +3640,15 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, V2_QPC_BYTE_24_HOP_LIMIT_M, V2_QPC_BYTE_24_HOP_LIMIT_S, 0); - roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M, - V2_QPC_BYTE_24_TC_S, grh->traffic_class); + if (hr_dev->pci_dev->revision == 0x21 && + gid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + roce_set_field(context->byte_24_mtu_tc, + V2_QPC_BYTE_24_TC_M, V2_QPC_BYTE_24_TC_S, + grh->traffic_class >> 2); + else + roce_set_field(context->byte_24_mtu_tc, + V2_QPC_BYTE_24_TC_M, V2_QPC_BYTE_24_TC_S, + grh->traffic_class); roce_set_field(qpc_mask->byte_24_mtu_tc, V2_QPC_BYTE_24_TC_M, V2_QPC_BYTE_24_TC_S, 0); roce_set_field(context->byte_28_at_fl, V2_QPC_BYTE_28_FL_M, From 3a63c964eaa168c4ada1076b7c6ffd6a53a9ef86 Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Sun, 30 Sep 2018 17:00:33 +0800 Subject: [PATCH 184/242] RDMA/hns: Update some attributes of the RoCE device According to the IB protocol definition, the driver needs to show the correct device information and the information will be queryed by device attribute. 
Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 1 + drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 31 +++++++++++++++++++-- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 6 ++++ drivers/infiniband/hw/hns/hns_roce_main.c | 1 + 4 files changed, 36 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index b06d3e432f37..de9b8e391563 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -661,6 +661,7 @@ struct hns_roce_eq_table { }; struct hns_roce_caps { + u64 fw_ver; u8 num_ports; int gid_table_len[HNS_ROCE_MAX_PORTS]; int pkey_table_len[HNS_ROCE_MAX_PORTS]; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 3408fc59d417..5c9461defc23 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -933,7 +933,24 @@ static int hns_roce_cmq_query_hw_info(struct hns_roce_dev *hr_dev) resp = (struct hns_roce_query_version *)desc.data; hr_dev->hw_rev = le32_to_cpu(resp->rocee_hw_version); - hr_dev->vendor_id = le32_to_cpu(resp->rocee_vendor_id); + hr_dev->vendor_id = hr_dev->pci_dev->vendor; + + return 0; +} + +static int hns_roce_query_fw_ver(struct hns_roce_dev *hr_dev) +{ + struct hns_roce_query_fw_info *resp; + struct hns_roce_cmq_desc desc; + int ret; + + hns_roce_cmq_setup_basic_desc(&desc, HNS_QUERY_FW_VER, true); + ret = hns_roce_cmq_send(hr_dev, &desc, 1); + if (ret) + return ret; + + resp = (struct hns_roce_query_fw_info *)desc.data; + hr_dev->caps.fw_ver = (u64)(le32_to_cpu(resp->fw_ver)); return 0; } @@ -1155,6 +1172,13 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) int ret; ret = hns_roce_cmq_query_hw_info(hr_dev); + if (ret) { + dev_err(hr_dev->dev, "Query hardware version fail, ret = %d.\n", + ret); + return ret; + } + + ret = hns_roce_query_fw_ver(hr_dev); if (ret) { dev_err(hr_dev->dev, "Query firmware version fail, ret = %d.\n", ret); @@ -1183,8 +1207,9 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) return ret; } - hr_dev->vendor_part_id = 0; - hr_dev->sys_image_guid = 0; + + hr_dev->vendor_part_id = hr_dev->pci_dev->device; + hr_dev->sys_image_guid = be64_to_cpu(hr_dev->ib_dev.node_guid); caps->num_qps = HNS_ROCE_V2_MAX_QP_NUM; caps->max_wqes = HNS_ROCE_V2_MAX_WQE_NUM; diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index b921ca6358e0..7f39b9431819 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -203,6 +203,7 @@ enum { /* CMQ command */ enum hns_roce_opcode_type { + HNS_QUERY_FW_VER = 0x0001, HNS_ROCE_OPC_QUERY_HW_VER = 0x8000, HNS_ROCE_OPC_CFG_GLOBAL_PARAM = 0x8001, HNS_ROCE_OPC_ALLOC_PF_RES = 0x8004, @@ -1087,6 +1088,11 @@ struct hns_roce_query_version { __le32 rsv[5]; }; +struct hns_roce_query_fw_info { + __le32 fw_ver; + __le32 rsv[5]; +}; + struct hns_roce_cfg_llm_a { __le32 base_addr_l; __le32 base_addr_h; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 8c5160ec3a4d..7e693b11c823 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -196,6 +196,7 @@ static int hns_roce_query_device(struct ib_device *ib_dev, memset(props, 0, sizeof(*props)); + props->fw_ver = hr_dev->caps.fw_ver; props->sys_image_guid = cpu_to_be64(hr_dev->sys_image_guid); 
props->max_mr_size = (u64)(~(0ULL)); props->page_size_cap = hr_dev->caps.page_size_cap; From b28ca7cceff8b2cfadebd34a907742cd396c2c33 Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Sun, 30 Sep 2018 17:00:34 +0800 Subject: [PATCH 185/242] RDMA/hns: Limit extend sq sge num According to the hip08 limit, the buffer size of the extended sge needs to be an integer multiple of the wqe_sge_buf_page size. For example, the sge_shift field of the QP context must be greater than or equal to eight when the buffer page size is 4K. The sge_shift field of the QP context is assigned from hr_qp->sge.sge_cnt. Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_qp.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c index 2805ab2ab2c5..5ebf481a39d9 100644 --- a/drivers/infiniband/hw/hns/hns_roce_qp.c +++ b/drivers/infiniband/hw/hns/hns_roce_qp.c @@ -344,6 +344,7 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev, { u32 roundup_sq_stride = roundup_pow_of_two(hr_dev->caps.max_sq_desc_sz); u8 max_sq_stride = ilog2(roundup_sq_stride); + u32 ex_sge_num; u32 page_size; u32 max_cnt; @@ -384,6 +385,7 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev, } hr_qp->sge.sge_shift = 4; + ex_sge_num = hr_qp->sge.sge_cnt; /* Get buf size, SQ and RQ are aligned to page_szie */ if (hr_dev->caps.max_sq_sg <= 2) { @@ -397,6 +399,8 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev, hr_qp->sq.wqe_shift), PAGE_SIZE); } else { page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT); + hr_qp->sge.sge_cnt = + max(page_size / (1 << hr_qp->sge.sge_shift), ex_sge_num); hr_qp->buff_size = HNS_ROCE_ALOGN_UP((hr_qp->rq.wqe_cnt << hr_qp->rq.wqe_shift), page_size) + HNS_ROCE_ALOGN_UP((hr_qp->sge.sge_cnt << @@ -405,7 +409,7 @@ static int hns_roce_set_user_sq_size(struct hns_roce_dev *hr_dev, hr_qp->sq.wqe_shift), page_size); hr_qp->sq.offset = 0; - if (hr_qp->sge.sge_cnt) { + if (ex_sge_num) { hr_qp->sge.offset = HNS_ROCE_ALOGN_UP( (hr_qp->sq.wqe_cnt << hr_qp->sq.wqe_shift), @@ -491,6 +495,8 @@ static int hns_roce_set_kernel_sq_size(struct hns_roce_dev *hr_dev, page_size); if (hr_dev->caps.max_sq_sg > 2 && hr_qp->sge.sge_cnt) { + hr_qp->sge.sge_cnt = max(page_size/(1 << hr_qp->sge.sge_shift), + (u32)hr_qp->sge.sge_cnt); hr_qp->sge.offset = size; size += HNS_ROCE_ALOGN_UP(hr_qp->sge.sge_cnt << hr_qp->sge.sge_shift, page_size); From 2362cceef3f4cbd382f338327097f98795f6b91e Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Sun, 30 Sep 2018 17:00:35 +0800 Subject: [PATCH 186/242] RDMA/hns: Update some fields of qp context The hip08 hardware has two versions; the version IDs are 0x20 and 0x21, taken from the PCI revision. Some fields of the QP context need to be adjusted to extend new features. The specific updates include: 1. Add some fields to support new features, by enabling fields that are reserved in the 0x20 version. 2. Remove some fields which are not visible to the user, in order to support the extended features. 3. Initialize some fields to zero. These updates are compatible with the 0x20 version.
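One convention worth noting before the diff: every QP context field is programmed as a context/qpc_mask pair, where the value is written into context and the same bits are cleared in qpc_mask so the hardware applies them (mask bits left set mean "keep the old value"). A minimal sketch of that idiom, using the renamed TEMPID field from this patch (the wrapper function itself is hypothetical):

    static void qpc_write_tempid(struct hns_roce_v2_qp_context *context,
                                 struct hns_roce_v2_qp_context *qpc_mask,
                                 u32 tempid)
    {
            /* write the new value into the context image */
            roce_set_field(context->byte_60_qpst_tempid,
                           V2_QPC_BYTE_60_TEMPID_M,
                           V2_QPC_BYTE_60_TEMPID_S, tempid);
            /* clear the mask bits so the hardware takes the value */
            roce_set_field(qpc_mask->byte_60_qpst_tempid,
                           V2_QPC_BYTE_60_TEMPID_M,
                           V2_QPC_BYTE_60_TEMPID_S, 0);
    }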
Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 69 +++++++++------------- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 42 ++++++------- 2 files changed, 45 insertions(+), 66 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 5c9461defc23..c7d5353ecb81 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -2670,21 +2670,16 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, roce_set_bit(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_RQ_TX_ERR_S, 0); roce_set_bit(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_RQ_RX_ERR_S, 0); - roce_set_field(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_MAPID_M, - V2_QPC_BYTE_60_MAPID_S, 0); + roce_set_field(qpc_mask->byte_60_qpst_tempid, V2_QPC_BYTE_60_TEMPID_M, + V2_QPC_BYTE_60_TEMPID_S, 0); - roce_set_bit(qpc_mask->byte_60_qpst_mapid, - V2_QPC_BYTE_60_INNER_MAP_IND_S, 0); - roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_SQ_MAP_IND_S, - 0); - roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_RQ_MAP_IND_S, - 0); - roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_EXT_MAP_IND_S, - 0); - roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_SQ_RLS_IND_S, - 0); - roce_set_bit(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_SQ_EXT_IND_S, - 0); + roce_set_field(qpc_mask->byte_60_qpst_tempid, + V2_QPC_BYTE_60_SCC_TOKEN_M, V2_QPC_BYTE_60_SCC_TOKEN_S, + 0); + roce_set_bit(qpc_mask->byte_60_qpst_tempid, + V2_QPC_BYTE_60_SQ_DB_DOING_S, 0); + roce_set_bit(qpc_mask->byte_60_qpst_tempid, + V2_QPC_BYTE_60_RQ_DB_DOING_S, 0); roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CNP_TX_FLAG_S, 0); roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CE_FLAG_S, 0); @@ -2766,7 +2761,8 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_TAIL_MAX_M, V2_QPC_BYTE_132_TRRL_TAIL_MAX_S, 0); - roce_set_bit(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RSVD_RAQ_MAP_S, 0); + roce_set_bit(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RQ_RTY_WAIT_DO_S, + 0); roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RAQ_TRRL_HEAD_M, V2_QPC_BYTE_140_RAQ_TRRL_HEAD_S, 0); roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RAQ_TRRL_TAIL_M, @@ -2775,8 +2771,6 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_M, V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_S, 0); - roce_set_bit(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RAQ_RTY_INI_IND_S, - 0); roce_set_field(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RAQ_CREDIT_M, V2_QPC_BYTE_144_RAQ_CREDIT_S, 0); roce_set_bit(qpc_mask->byte_144_raq, V2_QPC_BYTE_144_RESP_RTY_FLG_S, 0); @@ -2802,14 +2796,12 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, V2_QPC_BYTE_160_SQ_CONSUMER_IDX_M, V2_QPC_BYTE_160_SQ_CONSUMER_IDX_S, 0); - roce_set_field(context->byte_168_irrl_idx, - V2_QPC_BYTE_168_SQ_SHIFT_BAK_M, - V2_QPC_BYTE_168_SQ_SHIFT_BAK_S, - ilog2((unsigned int)hr_qp->sq.wqe_cnt)); - roce_set_field(qpc_mask->byte_168_irrl_idx, - V2_QPC_BYTE_168_SQ_SHIFT_BAK_M, - V2_QPC_BYTE_168_SQ_SHIFT_BAK_S, 0); - + roce_set_bit(qpc_mask->byte_168_irrl_idx, + V2_QPC_BYTE_168_POLL_DB_WAIT_DO_S, 0); + roce_set_bit(qpc_mask->byte_168_irrl_idx, + V2_QPC_BYTE_168_SCC_TOKEN_FORBID_SQ_DEQ_S, 0); + roce_set_bit(qpc_mask->byte_168_irrl_idx, + V2_QPC_BYTE_168_WAIT_ACK_TIMEOUT_S, 0); roce_set_bit(qpc_mask->byte_168_irrl_idx, V2_QPC_BYTE_168_MSG_RTY_LP_FLG_S, 
0); roce_set_bit(qpc_mask->byte_168_irrl_idx, @@ -2871,6 +2863,13 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, V2_QPC_BYTE_232_IRRL_SGE_IDX_M, V2_QPC_BYTE_232_IRRL_SGE_IDX_S, 0); + roce_set_bit(qpc_mask->byte_232_irrl_sge, V2_QPC_BYTE_232_SO_LP_VLD_S, + 0); + roce_set_bit(qpc_mask->byte_232_irrl_sge, + V2_QPC_BYTE_232_FENCE_LP_VLD_S, 0); + roce_set_bit(qpc_mask->byte_232_irrl_sge, V2_QPC_BYTE_232_IRRL_LP_VLD_S, + 0); + qpc_mask->irrl_cur_sge_offset = 0; roce_set_field(qpc_mask->byte_240_irrl_tail, @@ -3036,13 +3035,6 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp, roce_set_field(qpc_mask->byte_56_dqpn_err, V2_QPC_BYTE_56_DQPN_M, V2_QPC_BYTE_56_DQPN_S, 0); } - roce_set_field(context->byte_168_irrl_idx, - V2_QPC_BYTE_168_SQ_SHIFT_BAK_M, - V2_QPC_BYTE_168_SQ_SHIFT_BAK_S, - ilog2((unsigned int)hr_qp->sq.wqe_cnt)); - roce_set_field(qpc_mask->byte_168_irrl_idx, - V2_QPC_BYTE_168_SQ_SHIFT_BAK_M, - V2_QPC_BYTE_168_SQ_SHIFT_BAK_S, 0); } static int modify_qp_init_to_rtr(struct ib_qp *ibqp, @@ -3352,13 +3344,6 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp, * we should set all bits of the relevant fields in context mask to * 0 at the same time, else set them to 0x1. */ - roce_set_field(context->byte_60_qpst_mapid, - V2_QPC_BYTE_60_RTY_NUM_INI_BAK_M, - V2_QPC_BYTE_60_RTY_NUM_INI_BAK_S, attr->retry_cnt); - roce_set_field(qpc_mask->byte_60_qpst_mapid, - V2_QPC_BYTE_60_RTY_NUM_INI_BAK_M, - V2_QPC_BYTE_60_RTY_NUM_INI_BAK_S, 0); - context->sq_cur_blk_addr = (u32)(mtts[0] >> PAGE_ADDR_SHIFT); roce_set_field(context->byte_168_irrl_idx, V2_QPC_BYTE_168_SQ_CUR_BLK_ADDR_M, @@ -3694,9 +3679,9 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, set_access_flags(hr_qp, context, qpc_mask, attr, attr_mask); /* Every status migrate must change state */ - roce_set_field(context->byte_60_qpst_mapid, V2_QPC_BYTE_60_QP_ST_M, + roce_set_field(context->byte_60_qpst_tempid, V2_QPC_BYTE_60_QP_ST_M, V2_QPC_BYTE_60_QP_ST_S, new_state); - roce_set_field(qpc_mask->byte_60_qpst_mapid, V2_QPC_BYTE_60_QP_ST_M, + roce_set_field(qpc_mask->byte_60_qpst_tempid, V2_QPC_BYTE_60_QP_ST_M, V2_QPC_BYTE_60_QP_ST_S, 0); /* SW pass context to HW */ @@ -3816,7 +3801,7 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, goto out; } - state = roce_get_field(context->byte_60_qpst_mapid, + state = roce_get_field(context->byte_60_qpst_tempid, V2_QPC_BYTE_60_QP_ST_M, V2_QPC_BYTE_60_QP_ST_S); tmp_qp_state = to_ib_qp_st((enum hns_roce_v2_qp_state)state); if (tmp_qp_state == -1) { diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index 7f39b9431819..c399ac36afa1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -354,7 +354,7 @@ struct hns_roce_v2_qp_context { __le32 dmac; __le32 byte_52_udpspn_dmac; __le32 byte_56_dqpn_err; - __le32 byte_60_qpst_mapid; + __le32 byte_60_qpst_tempid; __le32 qkey_xrcd; __le32 byte_68_rq_db; __le32 rq_db_record_addr; @@ -496,26 +496,15 @@ struct hns_roce_v2_qp_context { #define V2_QPC_BYTE_56_LP_PKTN_INI_S 28 #define V2_QPC_BYTE_56_LP_PKTN_INI_M GENMASK(31, 28) -#define V2_QPC_BYTE_60_MAPID_S 0 -#define V2_QPC_BYTE_60_MAPID_M GENMASK(12, 0) +#define V2_QPC_BYTE_60_TEMPID_S 0 +#define V2_QPC_BYTE_60_TEMPID_M GENMASK(7, 0) -#define V2_QPC_BYTE_60_INNER_MAP_IND_S 13 +#define V2_QPC_BYTE_60_SCC_TOKEN_S 8 +#define V2_QPC_BYTE_60_SCC_TOKEN_M GENMASK(26, 8) -#define V2_QPC_BYTE_60_SQ_MAP_IND_S 14 +#define V2_QPC_BYTE_60_SQ_DB_DOING_S 27 -#define 
V2_QPC_BYTE_60_RQ_MAP_IND_S 15 - -#define V2_QPC_BYTE_60_TEMPID_S 16 -#define V2_QPC_BYTE_60_TEMPID_M GENMASK(22, 16) - -#define V2_QPC_BYTE_60_EXT_MAP_IND_S 23 - -#define V2_QPC_BYTE_60_RTY_NUM_INI_BAK_S 24 -#define V2_QPC_BYTE_60_RTY_NUM_INI_BAK_M GENMASK(26, 24) - -#define V2_QPC_BYTE_60_SQ_RLS_IND_S 27 - -#define V2_QPC_BYTE_60_SQ_EXT_IND_S 28 +#define V2_QPC_BYTE_60_RQ_DB_DOING_S 28 #define V2_QPC_BYTE_60_QP_ST_S 29 #define V2_QPC_BYTE_60_QP_ST_M GENMASK(31, 29) @@ -592,7 +581,7 @@ struct hns_roce_v2_qp_context { #define V2_QPC_BYTE_140_RR_MAX_S 12 #define V2_QPC_BYTE_140_RR_MAX_M GENMASK(14, 12) -#define V2_QPC_BYTE_140_RSVD_RAQ_MAP_S 15 +#define V2_QPC_BYTE_140_RQ_RTY_WAIT_DO_S 15 #define V2_QPC_BYTE_140_RAQ_TRRL_HEAD_S 16 #define V2_QPC_BYTE_140_RAQ_TRRL_HEAD_M GENMASK(23, 16) @@ -603,8 +592,6 @@ struct hns_roce_v2_qp_context { #define V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_S 0 #define V2_QPC_BYTE_144_RAQ_RTY_INI_PSN_M GENMASK(23, 0) -#define V2_QPC_BYTE_144_RAQ_RTY_INI_IND_S 24 - #define V2_QPC_BYTE_144_RAQ_CREDIT_S 25 #define V2_QPC_BYTE_144_RAQ_CREDIT_M GENMASK(29, 25) @@ -641,9 +628,9 @@ struct hns_roce_v2_qp_context { #define V2_QPC_BYTE_168_LP_SGEN_INI_S 22 #define V2_QPC_BYTE_168_LP_SGEN_INI_M GENMASK(23, 22) -#define V2_QPC_BYTE_168_SQ_SHIFT_BAK_S 24 -#define V2_QPC_BYTE_168_SQ_SHIFT_BAK_M GENMASK(27, 24) - +#define V2_QPC_BYTE_168_POLL_DB_WAIT_DO_S 25 +#define V2_QPC_BYTE_168_SCC_TOKEN_FORBID_SQ_DEQ_S 26 +#define V2_QPC_BYTE_168_WAIT_ACK_TIMEOUT_S 27 #define V2_QPC_BYTE_168_IRRL_IDX_LSB_S 28 #define V2_QPC_BYTE_168_IRRL_IDX_LSB_M GENMASK(31, 28) @@ -729,6 +716,10 @@ struct hns_roce_v2_qp_context { #define V2_QPC_BYTE_232_IRRL_SGE_IDX_S 20 #define V2_QPC_BYTE_232_IRRL_SGE_IDX_M GENMASK(28, 20) +#define V2_QPC_BYTE_232_SO_LP_VLD_S 29 +#define V2_QPC_BYTE_232_FENCE_LP_VLD_S 30 +#define V2_QPC_BYTE_232_IRRL_LP_VLD_S 31 + #define V2_QPC_BYTE_240_IRRL_TAIL_REAL_S 0 #define V2_QPC_BYTE_240_IRRL_TAIL_REAL_M GENMASK(7, 0) @@ -747,6 +738,9 @@ struct hns_roce_v2_qp_context { #define V2_QPC_BYTE_244_RNR_CNT_S 27 #define V2_QPC_BYTE_244_RNR_CNT_M GENMASK(29, 27) +#define V2_QPC_BYTE_244_LCL_OP_FLG_S 30 +#define V2_QPC_BYTE_244_IRRL_RD_FLG_S 31 + #define V2_QPC_BYTE_248_IRRL_PSN_S 0 #define V2_QPC_BYTE_248_IRRL_PSN_M GENMASK(23, 0) From e93df01085791e5c8a079c94a15af2a677176b81 Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Sun, 30 Sep 2018 17:00:36 +0800 Subject: [PATCH 187/242] RDMA/hns: Support local invalidate for hip08 in kernel space This patch adds local invalidate Memory Region (MR) support in the kernel space driver. 
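For context, a kernel ULP requests such an invalidate through the usual post-send path; a minimal sketch of the standard ib_post_send() usage (qp and rkey are assumed to exist, and this snippet is not part of the patch):

    /* sketch: posting a local-invalidate work request from a kernel ULP */
    struct ib_send_wr wr = {};
    const struct ib_send_wr *bad_wr;
    int err;

    wr.opcode             = IB_WR_LOCAL_INV;
    wr.send_flags         = IB_SEND_SIGNALED;
    wr.ex.invalidate_rkey = rkey;   /* key of the MR to invalidate */

    err = ib_post_send(qp, &wr, &bad_wr);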
Signed-off-by: Yangyang Li Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index c7d5353ecb81..4b8266d7ade1 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -457,6 +457,10 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, break; case IB_WR_LOCAL_INV: hr_op = HNS_ROCE_V2_WQE_OP_LOCAL_INV; + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_SEND_WQE_BYTE_4_SO_S, 1); + rc_sq_wqe->inv_key = + cpu_to_le32(wr->ex.invalidate_rkey); break; case IB_WR_ATOMIC_CMP_AND_SWP: hr_op = HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP; @@ -1722,7 +1726,7 @@ static int hns_roce_v2_write_mtpt(void *mb_buf, struct hns_roce_mr *mr, roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RA_EN_S, 0); roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_R_INV_EN_S, 1); - roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 0); + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 1); roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_BIND_EN_S, (mr->access & IB_ACCESS_MW_BIND ? 1 : 0)); roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_ATOMIC_EN_S, From caf3e4064af086e2dda3fc5cf598a06708aa997c Mon Sep 17 00:00:00 2001 From: Lijun Ou Date: Sun, 30 Sep 2018 17:00:37 +0800 Subject: [PATCH 188/242] RDMA/hns: Add vlan enable bit for hip08 In order to extend the range of supported vlan devices, add two fields to the QP context that enable vlan packet checking on the send side and on the receive side. Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 11 +++++++++++ drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 2 ++ 2 files changed, 13 insertions(+) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index 4b8266d7ade1..f2bf9b5c5faf 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -3608,6 +3608,17 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp, memcpy(src_mac, gid_attr->ndev->dev_addr, ETH_ALEN); } + if (is_vlan_dev(gid_attr->ndev)) { + roce_set_bit(context->byte_76_srqn_op_en, + V2_QPC_BYTE_76_RQ_VLAN_EN_S, 1); + roce_set_bit(qpc_mask->byte_76_srqn_op_en, + V2_QPC_BYTE_76_RQ_VLAN_EN_S, 0); + roce_set_bit(context->byte_168_irrl_idx, + V2_QPC_BYTE_168_SQ_VLAN_EN_S, 1); + roce_set_bit(qpc_mask->byte_168_irrl_idx, + V2_QPC_BYTE_168_SQ_VLAN_EN_S, 0); + } + roce_set_field(context->byte_24_mtu_tc, V2_QPC_BYTE_24_VLAN_ID_M, V2_QPC_BYTE_24_VLAN_ID_S, vlan); diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index c399ac36afa1..f8abccea152c 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -527,6 +527,7 @@ struct hns_roce_v2_qp_context { #define V2_QPC_BYTE_76_RQIE_S 28 +#define V2_QPC_BYTE_76_RQ_VLAN_EN_S 30 #define V2_QPC_BYTE_80_RX_CQN_S 0 #define V2_QPC_BYTE_80_RX_CQN_M GENMASK(23, 0) @@ -628,6 +629,7 @@ struct hns_roce_v2_qp_context { #define V2_QPC_BYTE_168_LP_SGEN_INI_S 22 #define V2_QPC_BYTE_168_LP_SGEN_INI_M GENMASK(23, 22) +#define V2_QPC_BYTE_168_SQ_VLAN_EN_S 24 #define V2_QPC_BYTE_168_POLL_DB_WAIT_DO_S 25 #define V2_QPC_BYTE_168_SCC_TOKEN_FORBID_SQ_DEQ_S 26 #define V2_QPC_BYTE_168_WAIT_ACK_TIMEOUT_S 27 From d9581bf358c0578600940fc8d1f3b8f069870225 Mon Sep 17 00:00:00 2001 From: Lijun
Ou Date: Sun, 30 Sep 2018 17:00:38 +0800 Subject: [PATCH 189/242] RDMA/hns: Bugfix for atomic operation Atomic operations do not support inline data. Besides, a standard atomic operation supports only a single sge, and that sge is placed in the WQE itself. Fixes: 384f881 ("RDMA/hns: Add atomic support") Signed-off-by: Lijun Ou Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 31 +++++++++++++++------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index f2bf9b5c5faf..e3d9f1de8899 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -467,18 +467,14 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, rc_sq_wqe->rkey = cpu_to_le32(atomic_wr(wr)->rkey); rc_sq_wqe->va = - cpu_to_le32(atomic_wr(wr)->remote_addr); - wqe += sizeof(struct hns_roce_v2_wqe_data_seg); - set_atomic_seg(wqe, atomic_wr(wr)); + cpu_to_le64(atomic_wr(wr)->remote_addr); break; case IB_WR_ATOMIC_FETCH_AND_ADD: hr_op = HNS_ROCE_V2_WQE_OP_ATOM_FETCH_AND_ADD; rc_sq_wqe->rkey = cpu_to_le32(atomic_wr(wr)->rkey); rc_sq_wqe->va = - cpu_to_le32(atomic_wr(wr)->remote_addr); - wqe += sizeof(struct hns_roce_v2_wqe_data_seg); - set_atomic_seg(wqe, atomic_wr(wr)); + cpu_to_le64(atomic_wr(wr)->remote_addr); break; case IB_WR_MASKED_ATOMIC_CMP_AND_SWP: hr_op = @@ -497,10 +493,25 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, V2_RC_SEND_WQE_BYTE_4_OPCODE_M, V2_RC_SEND_WQE_BYTE_4_OPCODE_S, hr_op); - ret = set_rwqe_data_seg(ibqp, wr, rc_sq_wqe, wqe, - &sge_ind, bad_wr); - if (ret) - goto out; + if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP || + wr->opcode == IB_WR_ATOMIC_FETCH_AND_ADD) { + struct hns_roce_v2_wqe_data_seg *dseg; + + dseg = wqe; + set_data_seg_v2(dseg, wr->sg_list); + wqe += sizeof(struct hns_roce_v2_wqe_data_seg); + set_atomic_seg(wqe, atomic_wr(wr)); + roce_set_field(rc_sq_wqe->byte_16, + V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M, + V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S, + wr->num_sge); + } else { + ret = set_rwqe_data_seg(ibqp, wr, rc_sq_wqe, + wqe, &sge_ind, bad_wr); + if (ret) + goto out; + } + ind++; } else { dev_err(dev, "Illegal qp_type(0x%x)\n", ibqp->qp_type); From b56511c15713ba6c7572e77a41f7ddba9c1053ec Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Mon, 24 Sep 2018 12:57:16 -0700 Subject: [PATCH 190/242] IB/mlx4: Avoid implicit enumerated type conversion Clang warns when one enumerated type is implicitly converted to another. drivers/infiniband/hw/mlx4/mad.c:1811:41: warning: implicit conversion from enumeration type 'enum mlx4_ib_qp_flags' to different enumeration type 'enum ib_qp_create_flags' [-Wenum-conversion] qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_TUNNEL_QP; ~ ^~~~~~~~~~~~~~~~~~~~~~~ drivers/infiniband/hw/mlx4/mad.c:1819:41: warning: implicit conversion from enumeration type 'enum mlx4_ib_qp_flags' to different enumeration type 'enum ib_qp_create_flags' [-Wenum-conversion] qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_SQP; ~ ^~~~~~~~~~~~~~~~~ The type mlx4_ib_qp_flags explicitly provides supplemental values to the type ib_qp_create_flags. Make that clear to Clang by changing the create_flags type to u32.
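The warning is easy to reproduce in isolation; a minimal sketch with made-up enums (compile with clang and -Wenum-conversion):

    enum qp_create_flags { QP_CREATE_FLAG_A = 1 << 0 };  /* core flags */
    enum drv_qp_flags    { DRV_QP_FLAG_B    = 1 << 30 }; /* driver supplement */

    struct init_attr {
            unsigned int create_flags;      /* was: enum qp_create_flags */
    };

    static void set_flags(struct init_attr *attr)
    {
            /* with an enum-typed member, clang flags this implicit
             * enum-to-enum conversion; with a plain integer it is silent
             */
            attr->create_flags = DRV_QP_FLAG_B;
    }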
Reported-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Signed-off-by: Jason Gunthorpe --- include/rdma/ib_verbs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index f88c1071413a..7ce617d77f8f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -1151,7 +1151,7 @@ struct ib_qp_init_attr { struct ib_qp_cap cap; enum ib_sig_type sq_sig_type; enum ib_qp_type qp_type; - enum ib_qp_create_flags create_flags; + u32 create_flags; /* * Only needed for special QP types, or when using the RW API. From 019f118b94c895294debfaa394b267638fe2f648 Mon Sep 17 00:00:00 2001 From: Brian Welty Date: Wed, 26 Sep 2018 10:44:33 -0700 Subject: [PATCH 191/242] IB/{hfi1, qib, rdmavt}: Move copy SGE logic into rdmavt This patch moves hfi1_copy_sge() into rdmavt for sharing with qib. This patch also moves all the wss_*() functions into rdmavt as several wss_*() functions are called from hfi1_copy_sge() When SGE copy mode is adaptive, cacheless copy may be done in some cases for performance reasons. In those cases, X86 cacheless copy function is called since the drivers that use rdmavt and may set SGE copy mode to adaptive are X86 only. For this reason, this patch adds "depends on X86_64" to rdmavt/Kconfig. Reviewed-by: Ashutosh Dixit Reviewed-by: Michael J. Ruhl Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Brian Welty Signed-off-by: Harish Chegondi Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/init.c | 6 - drivers/infiniband/hw/hfi1/rc.c | 10 +- drivers/infiniband/hw/hfi1/ruc.c | 3 +- drivers/infiniband/hw/hfi1/uc.c | 10 +- drivers/infiniband/hw/hfi1/ud.c | 18 +- drivers/infiniband/hw/hfi1/verbs.c | 226 +--------------------- drivers/infiniband/hw/hfi1/verbs.h | 25 --- drivers/infiniband/hw/qib/qib_rc.c | 10 +- drivers/infiniband/hw/qib/qib_ruc.c | 2 +- drivers/infiniband/hw/qib/qib_uc.c | 10 +- drivers/infiniband/hw/qib/qib_ud.c | 13 +- drivers/infiniband/hw/qib/qib_verbs.c | 22 +-- drivers/infiniband/hw/qib/qib_verbs.h | 3 - drivers/infiniband/sw/rdmavt/Kconfig | 2 +- drivers/infiniband/sw/rdmavt/qp.c | 258 ++++++++++++++++++++++++++ drivers/infiniband/sw/rdmavt/qp.h | 2 + drivers/infiniband/sw/rdmavt/vt.c | 12 +- include/rdma/rdma_vt.h | 22 +++ include/rdma/rdmavt_qp.h | 4 + 19 files changed, 344 insertions(+), 314 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/init.c b/drivers/infiniband/hw/hfi1/init.c index 1e770a133779..09044905284f 100644 --- a/drivers/infiniband/hw/hfi1/init.c +++ b/drivers/infiniband/hw/hfi1/init.c @@ -1504,9 +1504,6 @@ static int __init hfi1_mod_init(void) idr_init(&hfi1_unit_table); hfi1_dbg_init(); - ret = hfi1_wss_init(); - if (ret < 0) - goto bail_wss; ret = pci_register_driver(&hfi1_pci_driver); if (ret < 0) { pr_err("Unable to register driver: error %d\n", -ret); @@ -1515,8 +1512,6 @@ static int __init hfi1_mod_init(void) goto bail; /* all OK */ bail_dev: - hfi1_wss_exit(); -bail_wss: hfi1_dbg_exit(); idr_destroy(&hfi1_unit_table); dev_cleanup(); @@ -1533,7 +1528,6 @@ static void __exit hfi1_mod_cleanup(void) { pci_unregister_driver(&hfi1_pci_driver); node_affinity_destroy_all(); - hfi1_wss_exit(); hfi1_dbg_exit(); idr_destroy(&hfi1_unit_table); diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 9bd63abb2dfe..673b31ebf0ac 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -1644,7 +1644,8 @@ read_middle: qp->s_rdma_read_len -= pmtu; 
update_last_psn(qp, psn); spin_unlock_irqrestore(&qp->s_lock, flags); - hfi1_copy_sge(&qp->s_rdma_read_sge, data, pmtu, false, false); + rvt_copy_sge(qp, &qp->s_rdma_read_sge, + data, pmtu, false, false); goto bail; case OP(RDMA_READ_RESPONSE_ONLY): @@ -1684,7 +1685,8 @@ read_last: if (unlikely(tlen != qp->s_rdma_read_len)) goto ack_len_err; aeth = be32_to_cpu(ohdr->u.aeth); - hfi1_copy_sge(&qp->s_rdma_read_sge, data, tlen, false, false); + rvt_copy_sge(qp, &qp->s_rdma_read_sge, + data, tlen, false, false); WARN_ON(qp->s_rdma_read_sge.num_sge); (void)do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST), 0, rcd); @@ -2144,7 +2146,7 @@ send_middle: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto nack_inv; - hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): @@ -2200,7 +2202,7 @@ send_last: wc.byte_len = tlen + qp->r_rcv_len; if (unlikely(wc.byte_len > qp->r_len)) goto nack_inv; - hfi1_copy_sge(&qp->r_sge, data, tlen, true, copy_last); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, copy_last); rvt_put_ss(&qp->r_sge); qp->r_msn++; if (!__test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index 17b49b4309a3..223eaf184934 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -361,7 +361,8 @@ do_write: if (len > sge->sge_length) len = sge->sge_length; WARN_ON_ONCE(len == 0); - hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release, copy_last); + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, + len, release, copy_last); sge->vaddr += len; sge->length -= len; sge->sge_length -= len; diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index e254dcec6f64..48a320c01552 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -426,7 +426,7 @@ send_first: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto rewind; - hfi1_copy_sge(&qp->r_sge, data, pmtu, false, false); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, false, false); break; case OP(SEND_LAST_WITH_IMMEDIATE): @@ -449,7 +449,7 @@ send_last: if (unlikely(wc.byte_len > qp->r_len)) goto rewind; wc.opcode = IB_WC_RECV; - hfi1_copy_sge(&qp->r_sge, data, tlen, false, false); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, false, false); rvt_put_ss(&qp->s_rdma_read_sge); last_imm: wc.wr_id = qp->r_wr_id; @@ -523,7 +523,7 @@ rdma_first: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto drop; - hfi1_copy_sge(&qp->r_sge, data, pmtu, true, false); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): @@ -550,7 +550,7 @@ rdma_last_imm: } wc.byte_len = qp->r_len; wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; - hfi1_copy_sge(&qp->r_sge, data, tlen, true, false); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); rvt_put_ss(&qp->r_sge); goto last_imm; @@ -564,7 +564,7 @@ rdma_last: tlen -= (hdrsize + extra_bytes); if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) goto drop; - hfi1_copy_sge(&qp->r_sge, data, tlen, true, false); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); rvt_put_ss(&qp->r_sge); break; diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index 70d39fc450a1..e55bc4280d58 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -210,8 +210,8 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) } 
hfi1_make_grh(ibp, &grh, &grd, 0, 0); - hfi1_copy_sge(&qp->r_sge, &grh, - sizeof(grh), true, false); + rvt_copy_sge(qp, &qp->r_sge, &grh, + sizeof(grh), true, false); wc.wc_flags |= IB_WC_GRH; } else { rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); @@ -228,7 +228,7 @@ static void ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (len > sge->sge_length) len = sge->sge_length; WARN_ON_ONCE(len == 0); - hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, true, false); + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, true, false); sge->vaddr += len; sge->length -= len; sge->sge_length -= len; @@ -1019,8 +1019,8 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) goto drop; } if (packet->grh) { - hfi1_copy_sge(&qp->r_sge, packet->grh, - sizeof(struct ib_grh), true, false); + rvt_copy_sge(qp, &qp->r_sge, packet->grh, + sizeof(struct ib_grh), true, false); wc.wc_flags |= IB_WC_GRH; } else if (packet->etype == RHF_RCV_TYPE_BYPASS) { struct ib_grh grh; @@ -1030,14 +1030,14 @@ void hfi1_ud_rcv(struct hfi1_packet *packet) * out when creating 16B, add back the GRH here. */ hfi1_make_ext_grh(packet, &grh, slid, dlid); - hfi1_copy_sge(&qp->r_sge, &grh, - sizeof(struct ib_grh), true, false); + rvt_copy_sge(qp, &qp->r_sge, &grh, + sizeof(struct ib_grh), true, false); wc.wc_flags |= IB_WC_GRH; } else { rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); } - hfi1_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), - true, false); + rvt_copy_sge(qp, &qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), + true, false); rvt_put_ss(&qp->r_sge); if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) return; diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 16b88948383b..0a47b46f979e 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -129,8 +129,6 @@ unsigned short piothreshold = 256; module_param(piothreshold, ushort, S_IRUGO); MODULE_PARM_DESC(piothreshold, "size used to determine sdma vs. pio"); -#define COPY_CACHELESS 1 -#define COPY_ADAPTIVE 2 static unsigned int sge_copy_mode; module_param(sge_copy_mode, uint, S_IRUGO); MODULE_PARM_DESC(sge_copy_mode, @@ -151,159 +149,13 @@ static int pio_wait(struct rvt_qp *qp, /* 16B trailing buffer */ static const u8 trail_buf[MAX_16B_PADDING]; -static uint wss_threshold; +static uint wss_threshold = 80; module_param(wss_threshold, uint, S_IRUGO); MODULE_PARM_DESC(wss_threshold, "Percentage (1-100) of LLC to use as a threshold for a cacheless copy"); static uint wss_clean_period = 256; module_param(wss_clean_period, uint, S_IRUGO); MODULE_PARM_DESC(wss_clean_period, "Count of verbs copies before an entry in the page copy table is cleaned"); -/* memory working set size */ -struct hfi1_wss { - unsigned long *entries; - atomic_t total_count; - atomic_t clean_counter; - atomic_t clean_entry; - - int threshold; - int num_entries; - long pages_mask; -}; - -static struct hfi1_wss wss; - -int hfi1_wss_init(void) -{ - long llc_size; - long llc_bits; - long table_size; - long table_bits; - - /* check for a valid percent range - default to 80 if none or invalid */ - if (wss_threshold < 1 || wss_threshold > 100) - wss_threshold = 80; - /* reject a wildly large period */ - if (wss_clean_period > 1000000) - wss_clean_period = 256; - /* reject a zero period */ - if (wss_clean_period == 0) - wss_clean_period = 1; - - /* - * Calculate the table size - the next power of 2 larger than the - * LLC size. LLC size is in KiB. 
- */ - llc_size = wss_llc_size() * 1024; - table_size = roundup_pow_of_two(llc_size); - - /* one bit per page in rounded up table */ - llc_bits = llc_size / PAGE_SIZE; - table_bits = table_size / PAGE_SIZE; - wss.pages_mask = table_bits - 1; - wss.num_entries = table_bits / BITS_PER_LONG; - - wss.threshold = (llc_bits * wss_threshold) / 100; - if (wss.threshold == 0) - wss.threshold = 1; - - atomic_set(&wss.clean_counter, wss_clean_period); - - wss.entries = kcalloc(wss.num_entries, sizeof(*wss.entries), - GFP_KERNEL); - if (!wss.entries) { - hfi1_wss_exit(); - return -ENOMEM; - } - - return 0; -} - -void hfi1_wss_exit(void) -{ - /* coded to handle partially initialized and repeat callers */ - kfree(wss.entries); - wss.entries = NULL; -} - -/* - * Advance the clean counter. When the clean period has expired, - * clean an entry. - * - * This is implemented in atomics to avoid locking. Because multiple - * variables are involved, it can be racy which can lead to slightly - * inaccurate information. Since this is only a heuristic, this is - * OK. Any innaccuracies will clean themselves out as the counter - * advances. That said, it is unlikely the entry clean operation will - * race - the next possible racer will not start until the next clean - * period. - * - * The clean counter is implemented as a decrement to zero. When zero - * is reached an entry is cleaned. - */ -static void wss_advance_clean_counter(void) -{ - int entry; - int weight; - unsigned long bits; - - /* become the cleaner if we decrement the counter to zero */ - if (atomic_dec_and_test(&wss.clean_counter)) { - /* - * Set, not add, the clean period. This avoids an issue - * where the counter could decrement below the clean period. - * Doing a set can result in lost decrements, slowing the - * clean advance. Since this a heuristic, this possible - * slowdown is OK. - * - * An alternative is to loop, advancing the counter by a - * clean period until the result is > 0. However, this could - * lead to several threads keeping another in the clean loop. - * This could be mitigated by limiting the number of times - * we stay in the loop. - */ - atomic_set(&wss.clean_counter, wss_clean_period); - - /* - * Uniquely grab the entry to clean and move to next. - * The current entry is always the lower bits of - * wss.clean_entry. The table size, wss.num_entries, - * is always a power-of-2. - */ - entry = (atomic_inc_return(&wss.clean_entry) - 1) - & (wss.num_entries - 1); - - /* clear the entry and count the bits */ - bits = xchg(&wss.entries[entry], 0); - weight = hweight64((u64)bits); - /* only adjust the contended total count if needed */ - if (weight) - atomic_sub(weight, &wss.total_count); - } -} - -/* - * Insert the given address into the working set array. - */ -static void wss_insert(void *address) -{ - u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss.pages_mask; - u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */ - u32 nr = page & (BITS_PER_LONG - 1); - - if (!test_and_set_bit(nr, &wss.entries[entry])) - atomic_inc(&wss.total_count); - - wss_advance_clean_counter(); -} - -/* - * Is the working set larger than the threshold? - */ -static inline bool wss_exceeds_threshold(void) -{ - return atomic_read(&wss.total_count) >= wss.threshold; -} - /* * Translate ib_wr_opcode into ib_wc_opcode. 
*/ @@ -438,79 +290,6 @@ static const u32 pio_opmask[BIT(3)] = { */ __be64 ib_hfi1_sys_image_guid; -/** - * hfi1_copy_sge - copy data to SGE memory - * @ss: the SGE state - * @data: the data to copy - * @length: the length of the data - * @release: boolean to release MR - * @copy_last: do a separate copy of the last 8 bytes - */ -void hfi1_copy_sge( - struct rvt_sge_state *ss, - void *data, u32 length, - bool release, - bool copy_last) -{ - struct rvt_sge *sge = &ss->sge; - int i; - bool in_last = false; - bool cacheless_copy = false; - - if (sge_copy_mode == COPY_CACHELESS) { - cacheless_copy = length >= PAGE_SIZE; - } else if (sge_copy_mode == COPY_ADAPTIVE) { - if (length >= PAGE_SIZE) { - /* - * NOTE: this *assumes*: - * o The first vaddr is the dest. - * o If multiple pages, then vaddr is sequential. - */ - wss_insert(sge->vaddr); - if (length >= (2 * PAGE_SIZE)) - wss_insert(sge->vaddr + PAGE_SIZE); - - cacheless_copy = wss_exceeds_threshold(); - } else { - wss_advance_clean_counter(); - } - } - if (copy_last) { - if (length > 8) { - length -= 8; - } else { - copy_last = false; - in_last = true; - } - } - -again: - while (length) { - u32 len = rvt_get_sge_length(sge, length); - - WARN_ON_ONCE(len == 0); - if (unlikely(in_last)) { - /* enforce byte transfer ordering */ - for (i = 0; i < len; i++) - ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i]; - } else if (cacheless_copy) { - cacheless_memcpy(sge->vaddr, data, len); - } else { - memcpy(sge->vaddr, data, len); - } - rvt_update_sge(ss, len, release); - data += len; - length -= len; - } - - if (copy_last) { - copy_last = false; - in_last = true; - length = 8; - goto again; - } -} - /* * Make sure the QP is ready and able to accept the given opcode. */ @@ -1949,6 +1728,9 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) dd->verbs_dev.rdi.dparms.lkey_table_size = hfi1_lkey_table_size; dd->verbs_dev.rdi.dparms.nports = dd->num_pports; dd->verbs_dev.rdi.dparms.npkeys = hfi1_get_npkeys(dd); + dd->verbs_dev.rdi.dparms.sge_copy_mode = sge_copy_mode; + dd->verbs_dev.rdi.dparms.wss_threshold = wss_threshold; + dd->verbs_dev.rdi.dparms.wss_clean_period = wss_clean_period; /* post send table */ dd->verbs_dev.rdi.post_parms = hfi1_post_parms; diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index d4164114396e..eb99e8df6251 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -315,9 +315,6 @@ void hfi1_put_txreq(struct verbs_txreq *tx); int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps); -void hfi1_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, - bool release, bool copy_last); - void hfi1_cnp_rcv(struct hfi1_packet *packet); void hfi1_uc_rcv(struct hfi1_packet *packet); @@ -393,28 +390,6 @@ int hfi1_verbs_send_dma(struct rvt_qp *qp, struct hfi1_pkt_state *ps, int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, u64 pbc); -int hfi1_wss_init(void); -void hfi1_wss_exit(void); - -/* platform specific: return the lowest level cache (llc) size, in KiB */ -static inline int wss_llc_size(void) -{ - /* assume that the boot CPU value is universal for all CPUs */ - return boot_cpu_data.x86_cache_size; -} - -/* platform specific: cacheless copy */ -static inline void cacheless_memcpy(void *dst, void *src, size_t n) -{ - /* - * Use the only available X64 cacheless copy. Add a __user cast - * to quiet sparse. The src agument is already in the kernel so - * there are no security issues. The extra fault recovery machinery - * is not invoked. 
- */ - __copy_user_nocache(dst, (void __user *)src, n, 0); -} - static inline bool opa_bth_is_migration(struct ib_other_headers *ohdr) { return ohdr->bth[1] & cpu_to_be32(OPA_BTH_MIG_REQ); diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index f35fdeb14347..034b9729f991 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -1425,7 +1425,8 @@ read_middle: qp->s_rdma_read_len -= pmtu; update_last_psn(qp, psn); spin_unlock_irqrestore(&qp->s_lock, flags); - qib_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0); + rvt_copy_sge(qp, &qp->s_rdma_read_sge, + data, pmtu, false, false); goto bail; case OP(RDMA_READ_RESPONSE_ONLY): @@ -1471,7 +1472,8 @@ read_last: if (unlikely(tlen != qp->s_rdma_read_len)) goto ack_len_err; aeth = be32_to_cpu(ohdr->u.aeth); - qib_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0); + rvt_copy_sge(qp, &qp->s_rdma_read_sge, + data, tlen, false, false); WARN_ON(qp->s_rdma_read_sge.num_sge); (void) do_rc_ack(qp, aeth, psn, OP(RDMA_READ_RESPONSE_LAST), 0, rcd); @@ -1844,7 +1846,7 @@ send_middle: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto nack_inv; - qib_copy_sge(&qp->r_sge, data, pmtu, 1); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): @@ -1890,7 +1892,7 @@ send_last: wc.byte_len = tlen + qp->r_rcv_len; if (unlikely(wc.byte_len > qp->r_len)) goto nack_inv; - qib_copy_sge(&qp->r_sge, data, tlen, 1); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); rvt_put_ss(&qp->r_sge); qp->r_msn++; if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index f8a7de795beb..bc2a9e208d18 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -354,7 +354,7 @@ again: if (len > sge->sge_length) len = sge->sge_length; BUG_ON(len == 0); - qib_copy_sge(&qp->r_sge, sge->vaddr, len, release); + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, release, false); sge->vaddr += len; sge->length -= len; sge->sge_length -= len; diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index 3e54bc11e0ae..0a090569148c 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -359,7 +359,7 @@ send_first: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto rewind; - qib_copy_sge(&qp->r_sge, data, pmtu, 0); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, false, false); break; case OP(SEND_LAST_WITH_IMMEDIATE): @@ -385,7 +385,7 @@ send_last: if (unlikely(wc.byte_len > qp->r_len)) goto rewind; wc.opcode = IB_WC_RECV; - qib_copy_sge(&qp->r_sge, data, tlen, 0); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, false, false); rvt_put_ss(&qp->s_rdma_read_sge); last_imm: wc.wr_id = qp->r_wr_id; @@ -449,7 +449,7 @@ rdma_first: qp->r_rcv_len += pmtu; if (unlikely(qp->r_rcv_len > qp->r_len)) goto drop; - qib_copy_sge(&qp->r_sge, data, pmtu, 1); + rvt_copy_sge(qp, &qp->r_sge, data, pmtu, true, false); break; case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE): @@ -479,7 +479,7 @@ rdma_last_imm: } wc.byte_len = qp->r_len; wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; - qib_copy_sge(&qp->r_sge, data, tlen, 1); + rvt_copy_sge(qp, &qp->r_sge, data, tlen, true, false); rvt_put_ss(&qp->r_sge); goto last_imm; @@ -495,7 +495,7 @@ rdma_last: tlen -= (hdrsize + pad + 4); if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) goto drop; - qib_copy_sge(&qp->r_sge, data, tlen, 1); + rvt_copy_sge(qp, &qp->r_sge, data, 
tlen, true, false); rvt_put_ss(&qp->r_sge); break; diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index f8d029a2390f..b12b9c3a6b5c 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -162,8 +162,8 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) const struct ib_global_route *grd = rdma_ah_read_grh(ah_attr); qib_make_grh(ibp, &grh, grd, 0, 0); - qib_copy_sge(&qp->r_sge, &grh, - sizeof(grh), 1); + rvt_copy_sge(qp, &qp->r_sge, &grh, + sizeof(grh), true, false); wc.wc_flags |= IB_WC_GRH; } else rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); @@ -179,7 +179,7 @@ static void qib_ud_loopback(struct rvt_qp *sqp, struct rvt_swqe *swqe) if (len > sge->sge_length) len = sge->sge_length; BUG_ON(len == 0); - qib_copy_sge(&qp->r_sge, sge->vaddr, len, 1); + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, true, false); sge->vaddr += len; sge->length -= len; sge->sge_length -= len; @@ -551,12 +551,13 @@ void qib_ud_rcv(struct qib_ibport *ibp, struct ib_header *hdr, goto drop; } if (has_grh) { - qib_copy_sge(&qp->r_sge, &hdr->u.l.grh, - sizeof(struct ib_grh), 1); + rvt_copy_sge(qp, &qp->r_sge, &hdr->u.l.grh, + sizeof(struct ib_grh), true, false); wc.wc_flags |= IB_WC_GRH; } else rvt_skip_sge(&qp->r_sge, sizeof(struct ib_grh), true); - qib_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1); + rvt_copy_sge(qp, &qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), + true, false); rvt_put_ss(&qp->r_sge); if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) return; diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 26ab78e5aaa7..ae6d42cc9651 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -131,27 +131,6 @@ const enum ib_wc_opcode ib_qib_wc_opcode[] = { */ __be64 ib_qib_sys_image_guid; -/** - * qib_copy_sge - copy data to SGE memory - * @ss: the SGE state - * @data: the data to copy - * @length: the length of the data - */ -void qib_copy_sge(struct rvt_sge_state *ss, void *data, u32 length, int release) -{ - struct rvt_sge *sge = &ss->sge; - - while (length) { - u32 len = rvt_get_sge_length(sge, length); - - WARN_ON_ONCE(len == 0); - memcpy(sge->vaddr, data, len); - rvt_update_sge(ss, len, release); - data += len; - length -= len; - } -} - /* * Count the number of DMA descriptors needed to send length bytes of data. * Don't modify the qib_sge_state to get the count. 
@@ -1631,6 +1610,7 @@ int qib_register_ib_device(struct qib_devdata *dd)
 	dd->verbs_dev.rdi.dparms.node = dd->assigned_node_id;
 	dd->verbs_dev.rdi.dparms.core_cap_flags = RDMA_CORE_PORT_IBA_IB;
 	dd->verbs_dev.rdi.dparms.max_mad_size = IB_MGMT_MAD_SIZE;
+	dd->verbs_dev.rdi.dparms.sge_copy_mode = RVT_SGE_COPY_MEMCPY;
 
 	qib_fill_device_attr(dd);
 
diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h
index df90a7a41534..0c5e623ec70c 100644
--- a/drivers/infiniband/hw/qib/qib_verbs.h
+++ b/drivers/infiniband/hw/qib/qib_verbs.h
@@ -292,9 +292,6 @@ void qib_put_txreq(struct qib_verbs_txreq *tx);
 int qib_verbs_send(struct rvt_qp *qp, struct ib_header *hdr,
 		   u32 hdrwords, struct rvt_sge_state *ss, u32 len);
 
-void qib_copy_sge(struct rvt_sge_state *ss, void *data, u32 length,
-		  int release);
-
 void qib_uc_rcv(struct qib_ibport *ibp, struct ib_header *hdr,
 		int has_grh, void *data, u32 tlen, struct rvt_qp *qp);
 
diff --git a/drivers/infiniband/sw/rdmavt/Kconfig b/drivers/infiniband/sw/rdmavt/Kconfig
index 98e798007f75..7df896a18d38 100644
--- a/drivers/infiniband/sw/rdmavt/Kconfig
+++ b/drivers/infiniband/sw/rdmavt/Kconfig
@@ -1,6 +1,6 @@
 config INFINIBAND_RDMAVT
 	tristate "RDMA verbs transport library"
-	depends on 64BIT && ARCH_DMA_ADDR_T_64BIT
+	depends on X86_64 && ARCH_DMA_ADDR_T_64BIT
 	depends on PCI
 	select DMA_VIRT_OPS
 	---help---
diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
index a036a5368103..d969b0803e6f 100644
--- a/drivers/infiniband/sw/rdmavt/qp.c
+++ b/drivers/infiniband/sw/rdmavt/qp.c
@@ -118,6 +118,187 @@ const int ib_rvt_state_ops[IB_QPS_ERR + 1] = {
 };
 EXPORT_SYMBOL(ib_rvt_state_ops);
 
+/* platform specific: return the last level cache (llc) size, in KiB */
+static int rvt_wss_llc_size(void)
+{
+	/* assume that the boot CPU value is universal for all CPUs */
+	return boot_cpu_data.x86_cache_size;
+}
+
+/* platform specific: cacheless copy */
+static void cacheless_memcpy(void *dst, void *src, size_t n)
+{
+	/*
+	 * Use the only available X64 cacheless copy. Add a __user cast
+	 * to quiet sparse. The src argument is already in the kernel so
+	 * there are no security issues. The extra fault recovery machinery
+	 * is not invoked.
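+	 * The store side of this copy is non-temporal, so a payload that
+	 * is large relative to the LLC streams to memory instead of
+	 * evicting the working set that the wss heuristic below is
+	 * trying to protect.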
+ */ + __copy_user_nocache(dst, (void __user *)src, n, 0); +} + +void rvt_wss_exit(struct rvt_dev_info *rdi) +{ + struct rvt_wss *wss = rdi->wss; + + if (!wss) + return; + + /* coded to handle partially initialized and repeat callers */ + kfree(wss->entries); + wss->entries = NULL; + kfree(rdi->wss); + rdi->wss = NULL; +} + +/** + * rvt_wss_init - Init wss data structures + * + * Return: 0 on success + */ +int rvt_wss_init(struct rvt_dev_info *rdi) +{ + unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode; + unsigned int wss_threshold = rdi->dparms.wss_threshold; + unsigned int wss_clean_period = rdi->dparms.wss_clean_period; + long llc_size; + long llc_bits; + long table_size; + long table_bits; + struct rvt_wss *wss; + int node = rdi->dparms.node; + + if (sge_copy_mode != RVT_SGE_COPY_ADAPTIVE) { + rdi->wss = NULL; + return 0; + } + + rdi->wss = kzalloc_node(sizeof(*rdi->wss), GFP_KERNEL, node); + if (!rdi->wss) + return -ENOMEM; + wss = rdi->wss; + + /* check for a valid percent range - default to 80 if none or invalid */ + if (wss_threshold < 1 || wss_threshold > 100) + wss_threshold = 80; + + /* reject a wildly large period */ + if (wss_clean_period > 1000000) + wss_clean_period = 256; + + /* reject a zero period */ + if (wss_clean_period == 0) + wss_clean_period = 1; + + /* + * Calculate the table size - the next power of 2 larger than the + * LLC size. LLC size is in KiB. + */ + llc_size = rvt_wss_llc_size() * 1024; + table_size = roundup_pow_of_two(llc_size); + + /* one bit per page in rounded up table */ + llc_bits = llc_size / PAGE_SIZE; + table_bits = table_size / PAGE_SIZE; + wss->pages_mask = table_bits - 1; + wss->num_entries = table_bits / BITS_PER_LONG; + + wss->threshold = (llc_bits * wss_threshold) / 100; + if (wss->threshold == 0) + wss->threshold = 1; + + wss->clean_period = wss_clean_period; + atomic_set(&wss->clean_counter, wss_clean_period); + + wss->entries = kcalloc_node(wss->num_entries, sizeof(*wss->entries), + GFP_KERNEL, node); + if (!wss->entries) { + rvt_wss_exit(rdi); + return -ENOMEM; + } + + return 0; +} + +/* + * Advance the clean counter. When the clean period has expired, + * clean an entry. + * + * This is implemented in atomics to avoid locking. Because multiple + * variables are involved, it can be racy which can lead to slightly + * inaccurate information. Since this is only a heuristic, this is + * OK. Any innaccuracies will clean themselves out as the counter + * advances. That said, it is unlikely the entry clean operation will + * race - the next possible racer will not start until the next clean + * period. + * + * The clean counter is implemented as a decrement to zero. When zero + * is reached an entry is cleaned. + */ +static void wss_advance_clean_counter(struct rvt_wss *wss) +{ + int entry; + int weight; + unsigned long bits; + + /* become the cleaner if we decrement the counter to zero */ + if (atomic_dec_and_test(&wss->clean_counter)) { + /* + * Set, not add, the clean period. This avoids an issue + * where the counter could decrement below the clean period. + * Doing a set can result in lost decrements, slowing the + * clean advance. Since this a heuristic, this possible + * slowdown is OK. + * + * An alternative is to loop, advancing the counter by a + * clean period until the result is > 0. However, this could + * lead to several threads keeping another in the clean loop. + * This could be mitigated by limiting the number of times + * we stay in the loop. 
+ */ + atomic_set(&wss->clean_counter, wss->clean_period); + + /* + * Uniquely grab the entry to clean and move to next. + * The current entry is always the lower bits of + * wss.clean_entry. The table size, wss.num_entries, + * is always a power-of-2. + */ + entry = (atomic_inc_return(&wss->clean_entry) - 1) + & (wss->num_entries - 1); + + /* clear the entry and count the bits */ + bits = xchg(&wss->entries[entry], 0); + weight = hweight64((u64)bits); + /* only adjust the contended total count if needed */ + if (weight) + atomic_sub(weight, &wss->total_count); + } +} + +/* + * Insert the given address into the working set array. + */ +static void wss_insert(struct rvt_wss *wss, void *address) +{ + u32 page = ((unsigned long)address >> PAGE_SHIFT) & wss->pages_mask; + u32 entry = page / BITS_PER_LONG; /* assumes this ends up a shift */ + u32 nr = page & (BITS_PER_LONG - 1); + + if (!test_and_set_bit(nr, &wss->entries[entry])) + atomic_inc(&wss->total_count); + + wss_advance_clean_counter(wss); +} + +/* + * Is the working set larger than the threshold? + */ +static inline bool wss_exceeds_threshold(struct rvt_wss *wss) +{ + return atomic_read(&wss->total_count) >= wss->threshold; +} + static void get_map_page(struct rvt_qpn_table *qpt, struct rvt_qpn_map *map) { @@ -2476,3 +2657,80 @@ void rvt_qp_iter(struct rvt_dev_info *rdi, rcu_read_unlock(); } EXPORT_SYMBOL(rvt_qp_iter); + +/** + * rvt_copy_sge - copy data to SGE memory + * @qp: associated QP + * @ss: the SGE state + * @data: the data to copy + * @length: the length of the data + * @release: boolean to release MR + * @copy_last: do a separate copy of the last 8 bytes + */ +void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss, + void *data, u32 length, + bool release, bool copy_last) +{ + struct rvt_sge *sge = &ss->sge; + int i; + bool in_last = false; + bool cacheless_copy = false; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + struct rvt_wss *wss = rdi->wss; + unsigned int sge_copy_mode = rdi->dparms.sge_copy_mode; + + if (sge_copy_mode == RVT_SGE_COPY_CACHELESS) { + cacheless_copy = length >= PAGE_SIZE; + } else if (sge_copy_mode == RVT_SGE_COPY_ADAPTIVE) { + if (length >= PAGE_SIZE) { + /* + * NOTE: this *assumes*: + * o The first vaddr is the dest. + * o If multiple pages, then vaddr is sequential. 
+ */ + wss_insert(wss, sge->vaddr); + if (length >= (2 * PAGE_SIZE)) + wss_insert(wss, (sge->vaddr + PAGE_SIZE)); + + cacheless_copy = wss_exceeds_threshold(wss); + } else { + wss_advance_clean_counter(wss); + } + } + + if (copy_last) { + if (length > 8) { + length -= 8; + } else { + copy_last = false; + in_last = true; + } + } + +again: + while (length) { + u32 len = rvt_get_sge_length(sge, length); + + WARN_ON_ONCE(len == 0); + if (unlikely(in_last)) { + /* enforce byte transfer ordering */ + for (i = 0; i < len; i++) + ((u8 *)sge->vaddr)[i] = ((u8 *)data)[i]; + } else if (cacheless_copy) { + cacheless_memcpy(sge->vaddr, data, len); + } else { + memcpy(sge->vaddr, data, len); + } + rvt_update_sge(ss, len, release); + data += len; + length -= len; + } + + if (copy_last) { + copy_last = false; + in_last = true; + length = 8; + goto again; + } +} +EXPORT_SYMBOL(rvt_copy_sge); diff --git a/drivers/infiniband/sw/rdmavt/qp.h b/drivers/infiniband/sw/rdmavt/qp.h index 264811fdc530..6d883972e0b8 100644 --- a/drivers/infiniband/sw/rdmavt/qp.h +++ b/drivers/infiniband/sw/rdmavt/qp.h @@ -66,4 +66,6 @@ int rvt_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr, const struct ib_send_wr **bad_wr); int rvt_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr, const struct ib_recv_wr **bad_wr); +int rvt_wss_init(struct rvt_dev_info *rdi); +void rvt_wss_exit(struct rvt_dev_info *rdi); #endif /* DEF_RVTQP_H */ diff --git a/drivers/infiniband/sw/rdmavt/vt.c b/drivers/infiniband/sw/rdmavt/vt.c index e3249d46bcef..723d3daf2eba 100644 --- a/drivers/infiniband/sw/rdmavt/vt.c +++ b/drivers/infiniband/sw/rdmavt/vt.c @@ -774,6 +774,13 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) goto bail_no_mr; } + /* Memory Working Set Size */ + ret = rvt_wss_init(rdi); + if (ret) { + rvt_pr_err(rdi, "Error in WSS init.\n"); + goto bail_mr; + } + /* Completion queues */ spin_lock_init(&rdi->n_cqs_lock); @@ -832,7 +839,7 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) rdi->driver_f.port_callback); if (ret) { rvt_pr_err(rdi, "Failed to register driver with ib core.\n"); - goto bail_mr; + goto bail_wss; } rvt_create_mad_agents(rdi); @@ -840,6 +847,8 @@ int rvt_register_device(struct rvt_dev_info *rdi, u32 driver_id) rvt_pr_info(rdi, "Registration with rdmavt done.\n"); return ret; +bail_wss: + rvt_wss_exit(rdi); bail_mr: rvt_mr_exit(rdi); @@ -863,6 +872,7 @@ void rvt_unregister_device(struct rvt_dev_info *rdi) rvt_free_mad_agents(rdi); ib_unregister_device(&rdi->ibdev); + rvt_wss_exit(rdi); rvt_mr_exit(rdi); rvt_qp_exit(rdi); } diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 0a888a9ecc96..7fa2f2d46a3c 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -149,6 +149,10 @@ struct rvt_ibport { #define RVT_CQN_MAX 16 /* maximum length of cq name */ +#define RVT_SGE_COPY_MEMCPY 0 +#define RVT_SGE_COPY_CACHELESS 1 +#define RVT_SGE_COPY_ADAPTIVE 2 + /* * Things that are driver specific, module parameters in hfi1 and qib */ @@ -161,6 +165,9 @@ struct rvt_driver_params { */ unsigned int lkey_table_size; unsigned int qp_table_size; + unsigned int sge_copy_mode; + unsigned int wss_threshold; + unsigned int wss_clean_period; int qpn_start; int qpn_inc; int qpn_res_start; @@ -193,6 +200,19 @@ struct rvt_ah { u8 log_pmtu; }; +/* memory working set size */ +struct rvt_wss { + unsigned long *entries; + atomic_t total_count; + atomic_t clean_counter; + atomic_t clean_entry; + + int threshold; + int num_entries; + long pages_mask; + unsigned int 
clean_period; +}; + struct rvt_dev_info; struct rvt_swqe; struct rvt_driver_provided { @@ -418,6 +438,8 @@ struct rvt_dev_info { u32 n_mcast_grps_allocated; /* number of mcast groups allocated */ spinlock_t n_mcast_grps_lock; + /* Memory Working Set Size */ + struct rvt_wss *wss; }; /** diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 927f6d5b6d0f..eaf2593ca822 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -678,6 +678,10 @@ void rvt_del_timers_sync(struct rvt_qp *qp); void rvt_stop_rc_timers(struct rvt_qp *qp); void rvt_add_retry_timer(struct rvt_qp *qp); +void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss, + void *data, u32 length, + bool release, bool copy_last); + /** * struct rvt_qp_iter - the iterator for QPs * @qp - the current QP From 116aa0330ec71b9872dfb3a3cc5202a72b5a1666 Mon Sep 17 00:00:00 2001 From: Venkata Sandeep Dhanalakota Date: Wed, 26 Sep 2018 10:44:42 -0700 Subject: [PATCH 192/242] IB/{hfi1, qib, rdmavt}: Move send completion logic to rdmavt Moving send completion code into rdmavt in order to have shared logic between qib and hfi1 drivers. Reviewed-by: Mike Marciniszyn Reviewed-by: Brian Welty Signed-off-by: Venkata Sandeep Dhanalakota Signed-off-by: Harish Chegondi Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/rc.c | 14 ++++---- drivers/infiniband/hw/hfi1/ruc.c | 45 ++----------------------- drivers/infiniband/hw/hfi1/uc.c | 4 +-- drivers/infiniband/hw/hfi1/ud.c | 4 +-- drivers/infiniband/hw/hfi1/verbs.c | 9 +++-- drivers/infiniband/hw/hfi1/verbs.h | 3 -- drivers/infiniband/hw/qib/qib_rc.c | 8 ++--- drivers/infiniband/hw/qib/qib_ruc.c | 43 ++--------------------- drivers/infiniband/hw/qib/qib_sdma.c | 2 +- drivers/infiniband/hw/qib/qib_uc.c | 2 +- drivers/infiniband/hw/qib/qib_ud.c | 4 +-- drivers/infiniband/hw/qib/qib_verbs.c | 7 ++-- drivers/infiniband/hw/qib/qib_verbs.h | 3 -- drivers/infiniband/sw/rdmavt/qp.c | 43 +++++++++++++++++++++++ drivers/infiniband/sw/rdmavt/trace_tx.h | 42 +++++++++++++++++++++++ include/rdma/rdma_vt.h | 3 ++ include/rdma/rdmavt_qp.h | 2 ++ 17 files changed, 124 insertions(+), 114 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/rc.c b/drivers/infiniband/hw/hfi1/rc.c index 673b31ebf0ac..188aa4f686a0 100644 --- a/drivers/infiniband/hw/hfi1/rc.c +++ b/drivers/infiniband/hw/hfi1/rc.c @@ -309,7 +309,7 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } clear_ahg(qp); wqe = rvt_get_swqe_ptr(qp, qp->s_last); - hfi1_send_complete(qp, wqe, qp->s_last != qp->s_acked ? + rvt_send_complete(qp, wqe, qp->s_last != qp->s_acked ? IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); /* will get called again */ goto done_free_tx; @@ -378,9 +378,9 @@ int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) wqe->wr.ex.invalidate_rkey); local_ops = 1; } - hfi1_send_complete(qp, wqe, - err ? IB_WC_LOC_PROT_ERR - : IB_WC_SUCCESS); + rvt_send_complete(qp, wqe, + err ? 
IB_WC_LOC_PROT_ERR + : IB_WC_SUCCESS); if (local_ops) atomic_dec(&qp->local_ops_pending); goto done_free_tx; @@ -1043,7 +1043,7 @@ void hfi1_restart_rc(struct rvt_qp *qp, u32 psn, int wait) hfi1_migrate_qp(qp); qp->s_retry = qp->s_retry_cnt; } else if (qp->s_last == qp->s_acked) { - hfi1_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); return; } else { /* need to handle delayed completion */ @@ -1468,7 +1468,7 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, ibp->rvp.n_other_naks++; class_b: if (qp->s_last == qp->s_acked) { - hfi1_send_complete(qp, wqe, status); + rvt_send_complete(qp, wqe, status); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } break; @@ -1706,7 +1706,7 @@ ack_len_err: status = IB_WC_LOC_LEN_ERR; ack_err: if (qp->s_last == qp->s_acked) { - hfi1_send_complete(qp, wqe, status); + rvt_send_complete(qp, wqe, status); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } ack_done: diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index 223eaf184934..db1d0d8a04a5 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -411,7 +411,7 @@ send_comp: ibp->rvp.n_loop_pkts++; flush_send: sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; - hfi1_send_complete(sqp, wqe, send_status); + rvt_send_complete(sqp, wqe, send_status); if (local_ops) { atomic_dec(&sqp->local_ops_pending); local_ops = 0; @@ -459,7 +459,7 @@ err: serr: spin_lock_irqsave(&sqp->s_lock, flags); - hfi1_send_complete(sqp, wqe, send_status); + rvt_send_complete(sqp, wqe, send_status); if (sqp->ibqp.qp_type == IB_QPT_RC) { int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); @@ -922,44 +922,3 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) iowait_starve_clear(ps.pkts_sent, &priv->s_iowait); spin_unlock_irqrestore(&qp->s_lock, ps.flags); } - -/* - * This should be called with s_lock held. - */ -void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, - enum ib_wc_status status) -{ - u32 old_last, last; - - if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) - return; - - last = qp->s_last; - old_last = last; - trace_hfi1_qp_send_completion(qp, wqe, last); - if (++last >= qp->s_size) - last = 0; - trace_hfi1_qp_send_completion(qp, wqe, last); - qp->s_last = last; - /* See post_send() */ - barrier(); - rvt_put_swqe(wqe); - if (qp->ibqp.qp_type == IB_QPT_UD || - qp->ibqp.qp_type == IB_QPT_SMI || - qp->ibqp.qp_type == IB_QPT_GSI) - atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount); - - rvt_qp_swqe_complete(qp, - wqe, - ib_hfi1_wc_opcode[wqe->wr.opcode], - status); - - if (qp->s_acked == old_last) - qp->s_acked = last; - if (qp->s_cur == old_last) - qp->s_cur = last; - if (qp->s_tail == old_last) - qp->s_tail = last; - if (qp->state == IB_QPS_SQD && last == qp->s_cur) - qp->s_draining = 0; -} diff --git a/drivers/infiniband/hw/hfi1/uc.c b/drivers/infiniband/hw/hfi1/uc.c index 48a320c01552..6aca0c5a7f97 100644 --- a/drivers/infiniband/hw/hfi1/uc.c +++ b/drivers/infiniband/hw/hfi1/uc.c @@ -88,7 +88,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) } clear_ahg(qp); wqe = rvt_get_swqe_ptr(qp, qp->s_last); - hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); goto done_free_tx; } @@ -140,7 +140,7 @@ int hfi1_make_uc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) qp, wqe->wr.ex.invalidate_rkey); local_ops = 1; } - hfi1_send_complete(qp, wqe, err ? 
IB_WC_LOC_PROT_ERR + rvt_send_complete(qp, wqe, err ? IB_WC_LOC_PROT_ERR : IB_WC_SUCCESS); if (local_ops) atomic_dec(&qp->local_ops_pending); diff --git a/drivers/infiniband/hw/hfi1/ud.c b/drivers/infiniband/hw/hfi1/ud.c index e55bc4280d58..4baa8f4d49de 100644 --- a/drivers/infiniband/hw/hfi1/ud.c +++ b/drivers/infiniband/hw/hfi1/ud.c @@ -518,7 +518,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) goto bail; } wqe = rvt_get_swqe_ptr(qp, qp->s_last); - hfi1_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); goto done_free_tx; } @@ -560,7 +560,7 @@ int hfi1_make_ud_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps) ud_loopback(qp, wqe); spin_lock_irqsave(&qp->s_lock, tflags); ps->flags = tflags; - hfi1_send_complete(qp, wqe, IB_WC_SUCCESS); + rvt_send_complete(qp, wqe, IB_WC_SUCCESS); goto done_free_tx; } } diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 0a47b46f979e..bc7f00ba1988 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -492,7 +492,7 @@ static void verbs_sdma_complete( spin_lock(&qp->s_lock); if (tx->wqe) { - hfi1_send_complete(qp, tx->wqe, IB_WC_SUCCESS); + rvt_send_complete(qp, tx->wqe, IB_WC_SUCCESS); } else if (qp->ibqp.qp_type == IB_QPT_RC) { struct hfi1_opa_header *hdr; @@ -938,7 +938,7 @@ int hfi1_verbs_send_pio(struct rvt_qp *qp, struct hfi1_pkt_state *ps, pio_bail: if (qp->s_wqe) { spin_lock_irqsave(&qp->s_lock, flags); - hfi1_send_complete(qp, qp->s_wqe, wc_status); + rvt_send_complete(qp, qp->s_wqe, wc_status); spin_unlock_irqrestore(&qp->s_lock, flags); } else if (qp->ibqp.qp_type == IB_QPT_RC) { spin_lock_irqsave(&qp->s_lock, flags); @@ -1145,7 +1145,7 @@ int hfi1_verbs_send(struct rvt_qp *qp, struct hfi1_pkt_state *ps) hfi1_cdbg(PIO, "%s() Failed. Completing with err", __func__); spin_lock_irqsave(&qp->s_lock, flags); - hfi1_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); + rvt_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); spin_unlock_irqrestore(&qp->s_lock, flags); } return -EINVAL; @@ -1735,6 +1735,9 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) /* post send table */ dd->verbs_dev.rdi.post_parms = hfi1_post_parms; + /* opcode translation table */ + dd->verbs_dev.rdi.wc_opcode = ib_hfi1_wc_opcode; + ppd = dd->pport; for (i = 0; i < dd->num_pports; i++, ppd++) rvt_init_port(&dd->verbs_dev.rdi, diff --git a/drivers/infiniband/hw/hfi1/verbs.h b/drivers/infiniband/hw/hfi1/verbs.h index eb99e8df6251..64c9054db5f3 100644 --- a/drivers/infiniband/hw/hfi1/verbs.h +++ b/drivers/infiniband/hw/hfi1/verbs.h @@ -363,9 +363,6 @@ void hfi1_do_send_from_rvt(struct rvt_qp *qp); void hfi1_do_send(struct rvt_qp *qp, bool in_thread); -void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, - enum ib_wc_status status); - void hfi1_send_rc_ack(struct hfi1_packet *packet, bool is_fecn); int hfi1_make_rc_req(struct rvt_qp *qp, struct hfi1_pkt_state *ps); diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c index 034b9729f991..6fa002940451 100644 --- a/drivers/infiniband/hw/qib/qib_rc.c +++ b/drivers/infiniband/hw/qib/qib_rc.c @@ -254,7 +254,7 @@ int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags) goto bail; } wqe = rvt_get_swqe_ptr(qp, qp->s_last); - qib_send_complete(qp, wqe, qp->s_last != qp->s_acked ? + rvt_send_complete(qp, wqe, qp->s_last != qp->s_acked ? 
IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR); /* will get called again */ goto done; @@ -838,7 +838,7 @@ void qib_restart_rc(struct rvt_qp *qp, u32 psn, int wait) qib_migrate_qp(qp); qp->s_retry = qp->s_retry_cnt; } else if (qp->s_last == qp->s_acked) { - qib_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); + rvt_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); return; } else /* XXX need to handle delayed completion */ @@ -1221,7 +1221,7 @@ static int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, ibp->rvp.n_other_naks++; class_b: if (qp->s_last == qp->s_acked) { - qib_send_complete(qp, wqe, status); + rvt_send_complete(qp, wqe, status); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } break; @@ -1492,7 +1492,7 @@ ack_len_err: status = IB_WC_LOC_LEN_ERR; ack_err: if (qp->s_last == qp->s_acked) { - qib_send_complete(qp, wqe, status); + rvt_send_complete(qp, wqe, status); rvt_error_qp(qp, IB_WC_WR_FLUSH_ERR); } ack_done: diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index bc2a9e208d18..c5627baf5dbf 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -403,7 +403,7 @@ send_comp: ibp->rvp.n_loop_pkts++; flush_send: sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; - qib_send_complete(sqp, wqe, send_status); + rvt_send_complete(sqp, wqe, send_status); goto again; rnr_nak: @@ -447,7 +447,7 @@ err: serr: spin_lock_irqsave(&sqp->s_lock, flags); - qib_send_complete(sqp, wqe, send_status); + rvt_send_complete(sqp, wqe, send_status); if (sqp->ibqp.qp_type == IB_QPT_RC) { int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); @@ -613,42 +613,3 @@ void qib_do_send(struct rvt_qp *qp) spin_unlock_irqrestore(&qp->s_lock, flags); } - -/* - * This should be called with s_lock held. 
- */ -void qib_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, - enum ib_wc_status status) -{ - u32 old_last, last; - - if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) - return; - - last = qp->s_last; - old_last = last; - if (++last >= qp->s_size) - last = 0; - qp->s_last = last; - /* See post_send() */ - barrier(); - rvt_put_swqe(wqe); - if (qp->ibqp.qp_type == IB_QPT_UD || - qp->ibqp.qp_type == IB_QPT_SMI || - qp->ibqp.qp_type == IB_QPT_GSI) - atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount); - - rvt_qp_swqe_complete(qp, - wqe, - ib_qib_wc_opcode[wqe->wr.opcode], - status); - - if (qp->s_acked == old_last) - qp->s_acked = last; - if (qp->s_cur == old_last) - qp->s_cur = last; - if (qp->s_tail == old_last) - qp->s_tail = last; - if (qp->state == IB_QPS_SQD && last == qp->s_cur) - qp->s_draining = 0; -} diff --git a/drivers/infiniband/hw/qib/qib_sdma.c b/drivers/infiniband/hw/qib/qib_sdma.c index d0723d4aef5c..757d4c9d713d 100644 --- a/drivers/infiniband/hw/qib/qib_sdma.c +++ b/drivers/infiniband/hw/qib/qib_sdma.c @@ -651,7 +651,7 @@ unmap: if (ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) rvt_error_qp(qp, IB_WC_GENERAL_ERR); } else if (qp->s_wqe) - qib_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); + rvt_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR); spin_unlock(&qp->s_lock); spin_unlock(&qp->r_lock); /* return zero to process the next send work request */ diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c index 0a090569148c..30c70ad0f4bf 100644 --- a/drivers/infiniband/hw/qib/qib_uc.c +++ b/drivers/infiniband/hw/qib/qib_uc.c @@ -68,7 +68,7 @@ int qib_make_uc_req(struct rvt_qp *qp, unsigned long *flags) goto bail; } wqe = rvt_get_swqe_ptr(qp, qp->s_last); - qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); goto done; } diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c index b12b9c3a6b5c..4d4c31ea4e2d 100644 --- a/drivers/infiniband/hw/qib/qib_ud.c +++ b/drivers/infiniband/hw/qib/qib_ud.c @@ -260,7 +260,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) goto bail; } wqe = rvt_get_swqe_ptr(qp, qp->s_last); - qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); + rvt_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR); goto done; } @@ -304,7 +304,7 @@ int qib_make_ud_req(struct rvt_qp *qp, unsigned long *flags) qib_ud_loopback(qp, wqe); spin_lock_irqsave(&qp->s_lock, tflags); *flags = tflags; - qib_send_complete(qp, wqe, IB_WC_SUCCESS); + rvt_send_complete(qp, wqe, IB_WC_SUCCESS); goto done; } } diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index ae6d42cc9651..8a45964c4700 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -731,7 +731,7 @@ static void sdma_complete(struct qib_sdma_txreq *cookie, int status) spin_lock(&qp->s_lock); if (tx->wqe) - qib_send_complete(qp, tx->wqe, IB_WC_SUCCESS); + rvt_send_complete(qp, tx->wqe, IB_WC_SUCCESS); else if (qp->ibqp.qp_type == IB_QPT_RC) { struct ib_header *hdr; @@ -1004,7 +1004,7 @@ done: } if (qp->s_wqe) { spin_lock_irqsave(&qp->s_lock, flags); - qib_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS); + rvt_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS); spin_unlock_irqrestore(&qp->s_lock, flags); } else if (qp->ibqp.qp_type == IB_QPT_RC) { spin_lock_irqsave(&qp->s_lock, flags); @@ -1491,6 +1491,9 @@ static void qib_fill_device_attr(struct qib_devdata *dd) rdi->dparms.props.max_mcast_grp; /* post send table */ 
dd->verbs_dev.rdi.post_parms = qib_post_parms; + + /* opcode translation table */ + dd->verbs_dev.rdi.wc_opcode = ib_qib_wc_opcode; } /** diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index 0c5e623ec70c..a4426c24b0d1 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -331,9 +331,6 @@ void _qib_do_send(struct work_struct *work); void qib_do_send(struct rvt_qp *qp); -void qib_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, - enum ib_wc_status status); - void qib_send_rc_ack(struct rvt_qp *qp); int qib_make_rc_req(struct rvt_qp *qp, unsigned long *flags); diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index d969b0803e6f..7e3ec6674cf7 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -2658,6 +2658,49 @@ void rvt_qp_iter(struct rvt_dev_info *rdi, } EXPORT_SYMBOL(rvt_qp_iter); +/* + * This should be called with s_lock held. + */ +void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, + enum ib_wc_status status) +{ + u32 old_last, last; + struct rvt_dev_info *rdi = ib_to_rvt(qp->ibqp.device); + + if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND)) + return; + + last = qp->s_last; + old_last = last; + trace_rvt_qp_send_completion(qp, wqe, last); + if (++last >= qp->s_size) + last = 0; + trace_rvt_qp_send_completion(qp, wqe, last); + qp->s_last = last; + /* See post_send() */ + barrier(); + rvt_put_swqe(wqe); + if (qp->ibqp.qp_type == IB_QPT_UD || + qp->ibqp.qp_type == IB_QPT_SMI || + qp->ibqp.qp_type == IB_QPT_GSI) + atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount); + + rvt_qp_swqe_complete(qp, + wqe, + rdi->wc_opcode[wqe->wr.opcode], + status); + + if (qp->s_acked == old_last) + qp->s_acked = last; + if (qp->s_cur == old_last) + qp->s_cur = last; + if (qp->s_tail == old_last) + qp->s_tail = last; + if (qp->state == IB_QPS_SQD && last == qp->s_cur) + qp->s_draining = 0; +} +EXPORT_SYMBOL(rvt_send_complete); + /** * rvt_copy_sge - copy data to SGE memory * @qp: associated QP diff --git a/drivers/infiniband/sw/rdmavt/trace_tx.h b/drivers/infiniband/sw/rdmavt/trace_tx.h index 0ef25fc49f25..d5df352eadb1 100644 --- a/drivers/infiniband/sw/rdmavt/trace_tx.h +++ b/drivers/infiniband/sw/rdmavt/trace_tx.h @@ -153,6 +153,48 @@ TRACE_EVENT( ) ); +TRACE_EVENT( + rvt_qp_send_completion, + TP_PROTO(struct rvt_qp *qp, struct rvt_swqe *wqe, u32 idx), + TP_ARGS(qp, wqe, idx), + TP_STRUCT__entry( + RDI_DEV_ENTRY(ib_to_rvt(qp->ibqp.device)) + __field(struct rvt_swqe *, wqe) + __field(u64, wr_id) + __field(u32, qpn) + __field(u32, qpt) + __field(u32, length) + __field(u32, idx) + __field(u32, ssn) + __field(enum ib_wr_opcode, opcode) + __field(int, send_flags) + ), + TP_fast_assign( + RDI_DEV_ASSIGN(ib_to_rvt(qp->ibqp.device)) + __entry->wqe = wqe; + __entry->wr_id = wqe->wr.wr_id; + __entry->qpn = qp->ibqp.qp_num; + __entry->qpt = qp->ibqp.qp_type; + __entry->length = wqe->length; + __entry->idx = idx; + __entry->ssn = wqe->ssn; + __entry->opcode = wqe->wr.opcode; + __entry->send_flags = wqe->wr.send_flags; + ), + TP_printk( + "[%s] qpn 0x%x qpt %u wqe %p idx %u wr_id %llx length %u ssn %u opcode %x send_flags %x", + __get_str(dev), + __entry->qpn, + __entry->qpt, + __entry->wqe, + __entry->idx, + __entry->wr_id, + __entry->length, + __entry->ssn, + __entry->opcode, + __entry->send_flags + ) +); #endif /* __RVT_TRACE_TX_H */ #undef TRACE_INCLUDE_PATH diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 
7fa2f2d46a3c..3584d0816fcd 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -398,6 +398,9 @@ struct rvt_dev_info { /* post send table */ const struct rvt_operation_params *post_parms; + /* opcode translation table */ + const enum ib_wc_opcode *wc_opcode; + /* Driver specific helper functions */ struct rvt_driver_provided driver_f; diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index eaf2593ca822..6fd6f2ad9c0f 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -681,6 +681,8 @@ void rvt_add_retry_timer(struct rvt_qp *qp); void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss, void *data, u32 length, bool release, bool copy_last); +void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, + enum ib_wc_status status); /** * struct rvt_qp_iter - the iterator for QPs From 15703461533a5ffd775722390431625daaae7618 Mon Sep 17 00:00:00 2001 From: Venkata Sandeep Dhanalakota Date: Wed, 26 Sep 2018 10:44:52 -0700 Subject: [PATCH 193/242] IB/{hfi1, qib, rdmavt}: Move ruc_loopback to rdmavt This patch moves ruc_loopback() from hfi1 into rdmavt for code sharing with the qib driver. Reviewed-by: Brian Welty Reviewed-by: Mike Marciniszyn Reviewed-by: Dennis Dalessandro Signed-off-by: Venkata Sandeep Dhanalakota Signed-off-by: Harish Chegondi Signed-off-by: Dennis Dalessandro Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hfi1/ruc.c | 332 +--------------------------- drivers/infiniband/hw/qib/qib_ruc.c | 303 +------------------------ drivers/infiniband/sw/rdmavt/qp.c | 331 +++++++++++++++++++++++++++ include/rdma/rdmavt_qp.h | 1 + 4 files changed, 335 insertions(+), 632 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/ruc.c b/drivers/infiniband/hw/hfi1/ruc.c index db1d0d8a04a5..7fb317c711df 100644 --- a/drivers/infiniband/hw/hfi1/ruc.c +++ b/drivers/infiniband/hw/hfi1/ruc.c @@ -155,334 +155,6 @@ int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct hfi1_packet *packet) return 0; } -/** - * ruc_loopback - handle UC and RC loopback requests - * @sqp: the sending QP - * - * This is called from hfi1_do_send() to - * forward a WQE addressed to the same HFI. - * Note that although we are single threaded due to the send engine, we still - * have to protect against post_send(). We don't have to worry about - * receive interrupts since this is a connected protocol and all packets - * will pass through here. - */ -static void ruc_loopback(struct rvt_qp *sqp) -{ - struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); - struct rvt_qp *qp; - struct rvt_swqe *wqe; - struct rvt_sge *sge; - unsigned long flags; - struct ib_wc wc; - u64 sdata; - atomic64_t *maddr; - enum ib_wc_status send_status; - bool release; - int ret; - bool copy_last = false; - int local_ops = 0; - - rcu_read_lock(); - - /* - * Note that we check the responder QP state after - * checking the requester's state. - */ - qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp, - sqp->remote_qpn); - - spin_lock_irqsave(&sqp->s_lock, flags); - - /* Return if we are already busy processing a work request. */ - if ((sqp->s_flags & (RVT_S_BUSY | HFI1_S_ANY_WAIT)) || - !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND)) - goto unlock; - - sqp->s_flags |= RVT_S_BUSY; - -again: - if (sqp->s_last == READ_ONCE(sqp->s_head)) - goto clr_busy; - wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); - - /* Return if it is not OK to start a new work request. 
*/ - if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) { - if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND)) - goto clr_busy; - /* We are in the error state, flush the work request. */ - send_status = IB_WC_WR_FLUSH_ERR; - goto flush_send; - } - - /* - * We can rely on the entry not changing without the s_lock - * being held until we update s_last. - * We increment s_cur to indicate s_last is in progress. - */ - if (sqp->s_last == sqp->s_cur) { - if (++sqp->s_cur >= sqp->s_size) - sqp->s_cur = 0; - } - spin_unlock_irqrestore(&sqp->s_lock, flags); - - if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) || - qp->ibqp.qp_type != sqp->ibqp.qp_type) { - ibp->rvp.n_pkt_drops++; - /* - * For RC, the requester would timeout and retry so - * shortcut the timeouts and just signal too many retries. - */ - if (sqp->ibqp.qp_type == IB_QPT_RC) - send_status = IB_WC_RETRY_EXC_ERR; - else - send_status = IB_WC_SUCCESS; - goto serr; - } - - memset(&wc, 0, sizeof(wc)); - send_status = IB_WC_SUCCESS; - - release = true; - sqp->s_sge.sge = wqe->sg_list[0]; - sqp->s_sge.sg_list = wqe->sg_list + 1; - sqp->s_sge.num_sge = wqe->wr.num_sge; - sqp->s_len = wqe->length; - switch (wqe->wr.opcode) { - case IB_WR_REG_MR: - goto send_comp; - - case IB_WR_LOCAL_INV: - if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) { - if (rvt_invalidate_rkey(sqp, - wqe->wr.ex.invalidate_rkey)) - send_status = IB_WC_LOC_PROT_ERR; - local_ops = 1; - } - goto send_comp; - - case IB_WR_SEND_WITH_INV: - if (!rvt_invalidate_rkey(qp, wqe->wr.ex.invalidate_rkey)) { - wc.wc_flags = IB_WC_WITH_INVALIDATE; - wc.ex.invalidate_rkey = wqe->wr.ex.invalidate_rkey; - } - goto send; - - case IB_WR_SEND_WITH_IMM: - wc.wc_flags = IB_WC_WITH_IMM; - wc.ex.imm_data = wqe->wr.ex.imm_data; - /* FALLTHROUGH */ - case IB_WR_SEND: -send: - ret = rvt_get_rwqe(qp, false); - if (ret < 0) - goto op_err; - if (!ret) - goto rnr_nak; - break; - - case IB_WR_RDMA_WRITE_WITH_IMM: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) - goto inv_err; - wc.wc_flags = IB_WC_WITH_IMM; - wc.ex.imm_data = wqe->wr.ex.imm_data; - ret = rvt_get_rwqe(qp, true); - if (ret < 0) - goto op_err; - if (!ret) - goto rnr_nak; - /* skip copy_last set and qp_access_flags recheck */ - goto do_write; - case IB_WR_RDMA_WRITE: - copy_last = rvt_is_user_qp(qp); - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) - goto inv_err; -do_write: - if (wqe->length == 0) - break; - if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length, - wqe->rdma_wr.remote_addr, - wqe->rdma_wr.rkey, - IB_ACCESS_REMOTE_WRITE))) - goto acc_err; - qp->r_sge.sg_list = NULL; - qp->r_sge.num_sge = 1; - qp->r_sge.total_len = wqe->length; - break; - - case IB_WR_RDMA_READ: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) - goto inv_err; - if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, - wqe->rdma_wr.remote_addr, - wqe->rdma_wr.rkey, - IB_ACCESS_REMOTE_READ))) - goto acc_err; - release = false; - sqp->s_sge.sg_list = NULL; - sqp->s_sge.num_sge = 1; - qp->r_sge.sge = wqe->sg_list[0]; - qp->r_sge.sg_list = wqe->sg_list + 1; - qp->r_sge.num_sge = wqe->wr.num_sge; - qp->r_sge.total_len = wqe->length; - break; - - case IB_WR_ATOMIC_CMP_AND_SWP: - case IB_WR_ATOMIC_FETCH_AND_ADD: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) - goto inv_err; - if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), - wqe->atomic_wr.remote_addr, - wqe->atomic_wr.rkey, - IB_ACCESS_REMOTE_ATOMIC))) - goto acc_err; - /* Perform atomic OP and save 
result. */ - maddr = (atomic64_t *)qp->r_sge.sge.vaddr; - sdata = wqe->atomic_wr.compare_add; - *(u64 *)sqp->s_sge.sge.vaddr = - (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? - (u64)atomic64_add_return(sdata, maddr) - sdata : - (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr, - sdata, wqe->atomic_wr.swap); - rvt_put_mr(qp->r_sge.sge.mr); - qp->r_sge.num_sge = 0; - goto send_comp; - - default: - send_status = IB_WC_LOC_QP_OP_ERR; - goto serr; - } - - sge = &sqp->s_sge.sge; - while (sqp->s_len) { - u32 len = sqp->s_len; - - if (len > sge->length) - len = sge->length; - if (len > sge->sge_length) - len = sge->sge_length; - WARN_ON_ONCE(len == 0); - rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, - len, release, copy_last); - sge->vaddr += len; - sge->length -= len; - sge->sge_length -= len; - if (sge->sge_length == 0) { - if (!release) - rvt_put_mr(sge->mr); - if (--sqp->s_sge.num_sge) - *sge = *sqp->s_sge.sg_list++; - } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= RVT_SEGSZ) { - if (++sge->m >= sge->mr->mapsz) - break; - sge->n = 0; - } - sge->vaddr = - sge->mr->map[sge->m]->segs[sge->n].vaddr; - sge->length = - sge->mr->map[sge->m]->segs[sge->n].length; - } - sqp->s_len -= len; - } - if (release) - rvt_put_ss(&qp->r_sge); - - if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) - goto send_comp; - - if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) - wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; - else - wc.opcode = IB_WC_RECV; - wc.wr_id = qp->r_wr_id; - wc.status = IB_WC_SUCCESS; - wc.byte_len = wqe->length; - wc.qp = &qp->ibqp; - wc.src_qp = qp->remote_qpn; - wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX; - wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); - wc.port_num = 1; - /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - wqe->wr.send_flags & IB_SEND_SOLICITED); - -send_comp: - spin_lock_irqsave(&sqp->s_lock, flags); - ibp->rvp.n_loop_pkts++; -flush_send: - sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; - rvt_send_complete(sqp, wqe, send_status); - if (local_ops) { - atomic_dec(&sqp->local_ops_pending); - local_ops = 0; - } - goto again; - -rnr_nak: - /* Handle RNR NAK */ - if (qp->ibqp.qp_type == IB_QPT_UC) - goto send_comp; - ibp->rvp.n_rnr_naks++; - /* - * Note: we don't need the s_lock held since the BUSY flag - * makes this single threaded. 
- */ - if (sqp->s_rnr_retry == 0) { - send_status = IB_WC_RNR_RETRY_EXC_ERR; - goto serr; - } - if (sqp->s_rnr_retry_cnt < 7) - sqp->s_rnr_retry--; - spin_lock_irqsave(&sqp->s_lock, flags); - if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK)) - goto clr_busy; - rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer << - IB_AETH_CREDIT_SHIFT); - goto clr_busy; - -op_err: - send_status = IB_WC_REM_OP_ERR; - wc.status = IB_WC_LOC_QP_OP_ERR; - goto err; - -inv_err: - send_status = IB_WC_REM_INV_REQ_ERR; - wc.status = IB_WC_LOC_QP_OP_ERR; - goto err; - -acc_err: - send_status = IB_WC_REM_ACCESS_ERR; - wc.status = IB_WC_LOC_PROT_ERR; -err: - /* responder goes to error state */ - rvt_rc_error(qp, wc.status); - -serr: - spin_lock_irqsave(&sqp->s_lock, flags); - rvt_send_complete(sqp, wqe, send_status); - if (sqp->ibqp.qp_type == IB_QPT_RC) { - int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); - - sqp->s_flags &= ~RVT_S_BUSY; - spin_unlock_irqrestore(&sqp->s_lock, flags); - if (lastwqe) { - struct ib_event ev; - - ev.device = sqp->ibqp.device; - ev.element.qp = &sqp->ibqp; - ev.event = IB_EVENT_QP_LAST_WQE_REACHED; - sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); - } - goto done; - } -clr_busy: - sqp->s_flags &= ~RVT_S_BUSY; -unlock: - spin_unlock_irqrestore(&sqp->s_lock, flags); -done: - rcu_read_unlock(); -} - /** * hfi1_make_grh - construct a GRH header * @ibp: a pointer to the IB port @@ -860,7 +532,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) if (!loopback && ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ps.ppd->lmc) - 1)) == ps.ppd->lid)) { - ruc_loopback(qp); + rvt_ruc_loopback(qp); return; } make_req = hfi1_make_rc_req; @@ -870,7 +542,7 @@ void hfi1_do_send(struct rvt_qp *qp, bool in_thread) if (!loopback && ((rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ps.ppd->lmc) - 1)) == ps.ppd->lid)) { - ruc_loopback(qp); + rvt_ruc_loopback(qp); return; } make_req = hfi1_make_uc_req; diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c index c5627baf5dbf..1fa21938f310 100644 --- a/drivers/infiniband/hw/qib/qib_ruc.c +++ b/drivers/infiniband/hw/qib/qib_ruc.c @@ -170,307 +170,6 @@ err: return 1; } -/** - * qib_ruc_loopback - handle UC and RC lookback requests - * @sqp: the sending QP - * - * This is called from qib_do_send() to - * forward a WQE addressed to the same HCA. - * Note that although we are single threaded due to the tasklet, we still - * have to protect against post_send(). We don't have to worry about - * receive interrupts since this is a connected protocol and all packets - * will pass through here. - */ -static void qib_ruc_loopback(struct rvt_qp *sqp) -{ - struct qib_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num); - struct qib_pportdata *ppd = ppd_from_ibp(ibp); - struct qib_devdata *dd = ppd->dd; - struct rvt_dev_info *rdi = &dd->verbs_dev.rdi; - struct rvt_qp *qp; - struct rvt_swqe *wqe; - struct rvt_sge *sge; - unsigned long flags; - struct ib_wc wc; - u64 sdata; - atomic64_t *maddr; - enum ib_wc_status send_status; - int release; - int ret; - - rcu_read_lock(); - /* - * Note that we check the responder QP state after - * checking the requester's state. - */ - qp = rvt_lookup_qpn(rdi, &ibp->rvp, sqp->remote_qpn); - if (!qp) - goto done; - - spin_lock_irqsave(&sqp->s_lock, flags); - - /* Return if we are already busy processing a work request. 
*/ - if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) || - !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND)) - goto unlock; - - sqp->s_flags |= RVT_S_BUSY; - -again: - if (sqp->s_last == READ_ONCE(sqp->s_head)) - goto clr_busy; - wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); - - /* Return if it is not OK to start a new work reqeust. */ - if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) { - if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND)) - goto clr_busy; - /* We are in the error state, flush the work request. */ - send_status = IB_WC_WR_FLUSH_ERR; - goto flush_send; - } - - /* - * We can rely on the entry not changing without the s_lock - * being held until we update s_last. - * We increment s_cur to indicate s_last is in progress. - */ - if (sqp->s_last == sqp->s_cur) { - if (++sqp->s_cur >= sqp->s_size) - sqp->s_cur = 0; - } - spin_unlock_irqrestore(&sqp->s_lock, flags); - - if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) || - qp->ibqp.qp_type != sqp->ibqp.qp_type) { - ibp->rvp.n_pkt_drops++; - /* - * For RC, the requester would timeout and retry so - * shortcut the timeouts and just signal too many retries. - */ - if (sqp->ibqp.qp_type == IB_QPT_RC) - send_status = IB_WC_RETRY_EXC_ERR; - else - send_status = IB_WC_SUCCESS; - goto serr; - } - - memset(&wc, 0, sizeof(wc)); - send_status = IB_WC_SUCCESS; - - release = 1; - sqp->s_sge.sge = wqe->sg_list[0]; - sqp->s_sge.sg_list = wqe->sg_list + 1; - sqp->s_sge.num_sge = wqe->wr.num_sge; - sqp->s_len = wqe->length; - switch (wqe->wr.opcode) { - case IB_WR_SEND_WITH_IMM: - wc.wc_flags = IB_WC_WITH_IMM; - wc.ex.imm_data = wqe->wr.ex.imm_data; - /* FALLTHROUGH */ - case IB_WR_SEND: - ret = rvt_get_rwqe(qp, false); - if (ret < 0) - goto op_err; - if (!ret) - goto rnr_nak; - break; - - case IB_WR_RDMA_WRITE_WITH_IMM: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) - goto inv_err; - wc.wc_flags = IB_WC_WITH_IMM; - wc.ex.imm_data = wqe->wr.ex.imm_data; - ret = rvt_get_rwqe(qp, true); - if (ret < 0) - goto op_err; - if (!ret) - goto rnr_nak; - /* FALLTHROUGH */ - case IB_WR_RDMA_WRITE: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) - goto inv_err; - if (wqe->length == 0) - break; - if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length, - wqe->rdma_wr.remote_addr, - wqe->rdma_wr.rkey, - IB_ACCESS_REMOTE_WRITE))) - goto acc_err; - qp->r_sge.sg_list = NULL; - qp->r_sge.num_sge = 1; - qp->r_sge.total_len = wqe->length; - break; - - case IB_WR_RDMA_READ: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) - goto inv_err; - if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, - wqe->rdma_wr.remote_addr, - wqe->rdma_wr.rkey, - IB_ACCESS_REMOTE_READ))) - goto acc_err; - release = 0; - sqp->s_sge.sg_list = NULL; - sqp->s_sge.num_sge = 1; - qp->r_sge.sge = wqe->sg_list[0]; - qp->r_sge.sg_list = wqe->sg_list + 1; - qp->r_sge.num_sge = wqe->wr.num_sge; - qp->r_sge.total_len = wqe->length; - break; - - case IB_WR_ATOMIC_CMP_AND_SWP: - case IB_WR_ATOMIC_FETCH_AND_ADD: - if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) - goto inv_err; - if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), - wqe->atomic_wr.remote_addr, - wqe->atomic_wr.rkey, - IB_ACCESS_REMOTE_ATOMIC))) - goto acc_err; - /* Perform atomic OP and save result. */ - maddr = (atomic64_t *) qp->r_sge.sge.vaddr; - sdata = wqe->atomic_wr.compare_add; - *(u64 *) sqp->s_sge.sge.vaddr = - (wqe->atomic_wr.wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? 
- (u64) atomic64_add_return(sdata, maddr) - sdata : - (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr, - sdata, wqe->atomic_wr.swap); - rvt_put_mr(qp->r_sge.sge.mr); - qp->r_sge.num_sge = 0; - goto send_comp; - - default: - send_status = IB_WC_LOC_QP_OP_ERR; - goto serr; - } - - sge = &sqp->s_sge.sge; - while (sqp->s_len) { - u32 len = sqp->s_len; - - if (len > sge->length) - len = sge->length; - if (len > sge->sge_length) - len = sge->sge_length; - BUG_ON(len == 0); - rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, len, release, false); - sge->vaddr += len; - sge->length -= len; - sge->sge_length -= len; - if (sge->sge_length == 0) { - if (!release) - rvt_put_mr(sge->mr); - if (--sqp->s_sge.num_sge) - *sge = *sqp->s_sge.sg_list++; - } else if (sge->length == 0 && sge->mr->lkey) { - if (++sge->n >= RVT_SEGSZ) { - if (++sge->m >= sge->mr->mapsz) - break; - sge->n = 0; - } - sge->vaddr = - sge->mr->map[sge->m]->segs[sge->n].vaddr; - sge->length = - sge->mr->map[sge->m]->segs[sge->n].length; - } - sqp->s_len -= len; - } - if (release) - rvt_put_ss(&qp->r_sge); - - if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) - goto send_comp; - - if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) - wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; - else - wc.opcode = IB_WC_RECV; - wc.wr_id = qp->r_wr_id; - wc.status = IB_WC_SUCCESS; - wc.byte_len = wqe->length; - wc.qp = &qp->ibqp; - wc.src_qp = qp->remote_qpn; - wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr); - wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); - wc.port_num = 1; - /* Signal completion event if the solicited bit is set. */ - rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, - wqe->wr.send_flags & IB_SEND_SOLICITED); - -send_comp: - spin_lock_irqsave(&sqp->s_lock, flags); - ibp->rvp.n_loop_pkts++; -flush_send: - sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; - rvt_send_complete(sqp, wqe, send_status); - goto again; - -rnr_nak: - /* Handle RNR NAK */ - if (qp->ibqp.qp_type == IB_QPT_UC) - goto send_comp; - ibp->rvp.n_rnr_naks++; - /* - * Note: we don't need the s_lock held since the BUSY flag - * makes this single threaded. 
- */ - if (sqp->s_rnr_retry == 0) { - send_status = IB_WC_RNR_RETRY_EXC_ERR; - goto serr; - } - if (sqp->s_rnr_retry_cnt < 7) - sqp->s_rnr_retry--; - spin_lock_irqsave(&sqp->s_lock, flags); - if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK)) - goto clr_busy; - rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer << - IB_AETH_CREDIT_SHIFT); - goto clr_busy; - -op_err: - send_status = IB_WC_REM_OP_ERR; - wc.status = IB_WC_LOC_QP_OP_ERR; - goto err; - -inv_err: - send_status = IB_WC_REM_INV_REQ_ERR; - wc.status = IB_WC_LOC_QP_OP_ERR; - goto err; - -acc_err: - send_status = IB_WC_REM_ACCESS_ERR; - wc.status = IB_WC_LOC_PROT_ERR; -err: - /* responder goes to error state */ - rvt_rc_error(qp, wc.status); - -serr: - spin_lock_irqsave(&sqp->s_lock, flags); - rvt_send_complete(sqp, wqe, send_status); - if (sqp->ibqp.qp_type == IB_QPT_RC) { - int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); - - sqp->s_flags &= ~RVT_S_BUSY; - spin_unlock_irqrestore(&sqp->s_lock, flags); - if (lastwqe) { - struct ib_event ev; - - ev.device = sqp->ibqp.device; - ev.element.qp = &sqp->ibqp; - ev.event = IB_EVENT_QP_LAST_WQE_REACHED; - sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); - } - goto done; - } -clr_busy: - sqp->s_flags &= ~RVT_S_BUSY; -unlock: - spin_unlock_irqrestore(&sqp->s_lock, flags); -done: - rcu_read_unlock(); -} - /** * qib_make_grh - construct a GRH header * @ibp: a pointer to the IB port @@ -573,7 +272,7 @@ void qib_do_send(struct rvt_qp *qp) qp->ibqp.qp_type == IB_QPT_UC) && (rdma_ah_get_dlid(&qp->remote_ah_attr) & ~((1 << ppd->lmc) - 1)) == ppd->lid) { - qib_ruc_loopback(qp); + rvt_ruc_loopback(qp); return; } diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index 7e3ec6674cf7..1735deb1a9d4 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -2777,3 +2777,334 @@ again: } } EXPORT_SYMBOL(rvt_copy_sge); + +/** + * ruc_loopback - handle UC and RC loopback requests + * @sqp: the sending QP + * + * This is called from rvt_do_send() to forward a WQE addressed to the same HFI + * Note that although we are single threaded due to the send engine, we still + * have to protect against post_send(). We don't have to worry about + * receive interrupts since this is a connected protocol and all packets + * will pass through here. + */ +void rvt_ruc_loopback(struct rvt_qp *sqp) +{ + struct rvt_ibport *rvp = NULL; + struct rvt_dev_info *rdi = ib_to_rvt(sqp->ibqp.device); + struct rvt_qp *qp; + struct rvt_swqe *wqe; + struct rvt_sge *sge; + unsigned long flags; + struct ib_wc wc; + u64 sdata; + atomic64_t *maddr; + enum ib_wc_status send_status; + bool release; + int ret; + bool copy_last = false; + int local_ops = 0; + + rcu_read_lock(); + rvp = rdi->ports[sqp->port_num - 1]; + + /* + * Note that we check the responder QP state after + * checking the requester's state. + */ + + qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), rvp, + sqp->remote_qpn); + + spin_lock_irqsave(&sqp->s_lock, flags); + + /* Return if we are already busy processing a work request. */ + if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) || + !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND)) + goto unlock; + + sqp->s_flags |= RVT_S_BUSY; + +again: + if (sqp->s_last == READ_ONCE(sqp->s_head)) + goto clr_busy; + wqe = rvt_get_swqe_ptr(sqp, sqp->s_last); + + /* Return if it is not OK to start a new work request. 
*/ + if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) { + if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND)) + goto clr_busy; + /* We are in the error state, flush the work request. */ + send_status = IB_WC_WR_FLUSH_ERR; + goto flush_send; + } + + /* + * We can rely on the entry not changing without the s_lock + * being held until we update s_last. + * We increment s_cur to indicate s_last is in progress. + */ + if (sqp->s_last == sqp->s_cur) { + if (++sqp->s_cur >= sqp->s_size) + sqp->s_cur = 0; + } + spin_unlock_irqrestore(&sqp->s_lock, flags); + + if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) || + qp->ibqp.qp_type != sqp->ibqp.qp_type) { + rvp->n_pkt_drops++; + /* + * For RC, the requester would timeout and retry so + * shortcut the timeouts and just signal too many retries. + */ + if (sqp->ibqp.qp_type == IB_QPT_RC) + send_status = IB_WC_RETRY_EXC_ERR; + else + send_status = IB_WC_SUCCESS; + goto serr; + } + + memset(&wc, 0, sizeof(wc)); + send_status = IB_WC_SUCCESS; + + release = true; + sqp->s_sge.sge = wqe->sg_list[0]; + sqp->s_sge.sg_list = wqe->sg_list + 1; + sqp->s_sge.num_sge = wqe->wr.num_sge; + sqp->s_len = wqe->length; + switch (wqe->wr.opcode) { + case IB_WR_REG_MR: + goto send_comp; + + case IB_WR_LOCAL_INV: + if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) { + if (rvt_invalidate_rkey(sqp, + wqe->wr.ex.invalidate_rkey)) + send_status = IB_WC_LOC_PROT_ERR; + local_ops = 1; + } + goto send_comp; + + case IB_WR_SEND_WITH_INV: + if (!rvt_invalidate_rkey(qp, wqe->wr.ex.invalidate_rkey)) { + wc.wc_flags = IB_WC_WITH_INVALIDATE; + wc.ex.invalidate_rkey = wqe->wr.ex.invalidate_rkey; + } + goto send; + + case IB_WR_SEND_WITH_IMM: + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + /* FALLTHROUGH */ + case IB_WR_SEND: +send: + ret = rvt_get_rwqe(qp, false); + if (ret < 0) + goto op_err; + if (!ret) + goto rnr_nak; + break; + + case IB_WR_RDMA_WRITE_WITH_IMM: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; + wc.wc_flags = IB_WC_WITH_IMM; + wc.ex.imm_data = wqe->wr.ex.imm_data; + ret = rvt_get_rwqe(qp, true); + if (ret < 0) + goto op_err; + if (!ret) + goto rnr_nak; + /* skip copy_last set and qp_access_flags recheck */ + goto do_write; + case IB_WR_RDMA_WRITE: + copy_last = rvt_is_user_qp(qp); + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE))) + goto inv_err; +do_write: + if (wqe->length == 0) + break; + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length, + wqe->rdma_wr.remote_addr, + wqe->rdma_wr.rkey, + IB_ACCESS_REMOTE_WRITE))) + goto acc_err; + qp->r_sge.sg_list = NULL; + qp->r_sge.num_sge = 1; + qp->r_sge.total_len = wqe->length; + break; + + case IB_WR_RDMA_READ: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ))) + goto inv_err; + if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length, + wqe->rdma_wr.remote_addr, + wqe->rdma_wr.rkey, + IB_ACCESS_REMOTE_READ))) + goto acc_err; + release = false; + sqp->s_sge.sg_list = NULL; + sqp->s_sge.num_sge = 1; + qp->r_sge.sge = wqe->sg_list[0]; + qp->r_sge.sg_list = wqe->sg_list + 1; + qp->r_sge.num_sge = wqe->wr.num_sge; + qp->r_sge.total_len = wqe->length; + break; + + case IB_WR_ATOMIC_CMP_AND_SWP: + case IB_WR_ATOMIC_FETCH_AND_ADD: + if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC))) + goto inv_err; + if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64), + wqe->atomic_wr.remote_addr, + wqe->atomic_wr.rkey, + IB_ACCESS_REMOTE_ATOMIC))) + goto acc_err; + /* Perform atomic OP and save 
result. */ + maddr = (atomic64_t *)qp->r_sge.sge.vaddr; + sdata = wqe->atomic_wr.compare_add; + *(u64 *)sqp->s_sge.sge.vaddr = + (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ? + (u64)atomic64_add_return(sdata, maddr) - sdata : + (u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr, + sdata, wqe->atomic_wr.swap); + rvt_put_mr(qp->r_sge.sge.mr); + qp->r_sge.num_sge = 0; + goto send_comp; + + default: + send_status = IB_WC_LOC_QP_OP_ERR; + goto serr; + } + + sge = &sqp->s_sge.sge; + while (sqp->s_len) { + u32 len = sqp->s_len; + + if (len > sge->length) + len = sge->length; + if (len > sge->sge_length) + len = sge->sge_length; + WARN_ON_ONCE(len == 0); + rvt_copy_sge(qp, &qp->r_sge, sge->vaddr, + len, release, copy_last); + sge->vaddr += len; + sge->length -= len; + sge->sge_length -= len; + if (sge->sge_length == 0) { + if (!release) + rvt_put_mr(sge->mr); + if (--sqp->s_sge.num_sge) + *sge = *sqp->s_sge.sg_list++; + } else if (sge->length == 0 && sge->mr->lkey) { + if (++sge->n >= RVT_SEGSZ) { + if (++sge->m >= sge->mr->mapsz) + break; + sge->n = 0; + } + sge->vaddr = + sge->mr->map[sge->m]->segs[sge->n].vaddr; + sge->length = + sge->mr->map[sge->m]->segs[sge->n].length; + } + sqp->s_len -= len; + } + if (release) + rvt_put_ss(&qp->r_sge); + + if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags)) + goto send_comp; + + if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM) + wc.opcode = IB_WC_RECV_RDMA_WITH_IMM; + else + wc.opcode = IB_WC_RECV; + wc.wr_id = qp->r_wr_id; + wc.status = IB_WC_SUCCESS; + wc.byte_len = wqe->length; + wc.qp = &qp->ibqp; + wc.src_qp = qp->remote_qpn; + wc.slid = rdma_ah_get_dlid(&qp->remote_ah_attr) & U16_MAX; + wc.sl = rdma_ah_get_sl(&qp->remote_ah_attr); + wc.port_num = 1; + /* Signal completion event if the solicited bit is set. */ + rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, + wqe->wr.send_flags & IB_SEND_SOLICITED); + +send_comp: + spin_lock_irqsave(&sqp->s_lock, flags); + rvp->n_loop_pkts++; +flush_send: + sqp->s_rnr_retry = sqp->s_rnr_retry_cnt; + rvt_send_complete(sqp, wqe, send_status); + if (local_ops) { + atomic_dec(&sqp->local_ops_pending); + local_ops = 0; + } + goto again; + +rnr_nak: + /* Handle RNR NAK */ + if (qp->ibqp.qp_type == IB_QPT_UC) + goto send_comp; + rvp->n_rnr_naks++; + /* + * Note: we don't need the s_lock held since the BUSY flag + * makes this single threaded. 
+ */ + if (sqp->s_rnr_retry == 0) { + send_status = IB_WC_RNR_RETRY_EXC_ERR; + goto serr; + } + if (sqp->s_rnr_retry_cnt < 7) + sqp->s_rnr_retry--; + spin_lock_irqsave(&sqp->s_lock, flags); + if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK)) + goto clr_busy; + rvt_add_rnr_timer(sqp, qp->r_min_rnr_timer << + IB_AETH_CREDIT_SHIFT); + goto clr_busy; + +op_err: + send_status = IB_WC_REM_OP_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +inv_err: + send_status = IB_WC_REM_INV_REQ_ERR; + wc.status = IB_WC_LOC_QP_OP_ERR; + goto err; + +acc_err: + send_status = IB_WC_REM_ACCESS_ERR; + wc.status = IB_WC_LOC_PROT_ERR; +err: + /* responder goes to error state */ + rvt_rc_error(qp, wc.status); + +serr: + spin_lock_irqsave(&sqp->s_lock, flags); + rvt_send_complete(sqp, wqe, send_status); + if (sqp->ibqp.qp_type == IB_QPT_RC) { + int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR); + + sqp->s_flags &= ~RVT_S_BUSY; + spin_unlock_irqrestore(&sqp->s_lock, flags); + if (lastwqe) { + struct ib_event ev; + + ev.device = sqp->ibqp.device; + ev.element.qp = &sqp->ibqp; + ev.event = IB_EVENT_QP_LAST_WQE_REACHED; + sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context); + } + goto done; + } +clr_busy: + sqp->s_flags &= ~RVT_S_BUSY; +unlock: + spin_unlock_irqrestore(&sqp->s_lock, flags); +done: + rcu_read_unlock(); +} +EXPORT_SYMBOL(rvt_ruc_loopback); diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index 6fd6f2ad9c0f..cbafb1878669 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -683,6 +683,7 @@ void rvt_copy_sge(struct rvt_qp *qp, struct rvt_sge_state *ss, bool release, bool copy_last); void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe, enum ib_wc_status status); +void rvt_ruc_loopback(struct rvt_qp *qp); /** * struct rvt_qp_iter - the iterator for QPs From fe33507ec38a8b2e8b782b83669943b7a5fefd4c Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Fri, 21 Sep 2018 09:18:24 -0500 Subject: [PATCH 194/242] RDMA/core: Check error status of rdma_find_ndev_for_src_ip_rcu rdma_find_ndev_for_src_ip_rcu() returns either valid netdev pointer or ERR_PTR(). Instead of checking for NULL, check for error. Fixes: caf1e3ae9fa6 ("RDMA/core Introduce and use rdma_find_ndev_for_src_ip_rcu") Reported-by: syzbot+20c32fa6ff84a2d28c36@syzkaller.appspotmail.com Signed-off-by: Parav Pandit Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/addr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 1400a9d0d56d..07e0ffe74a8a 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -513,7 +513,7 @@ static int rdma_set_src_addr_rcu(struct rdma_dev_addr *dev_addr, * loopback IP address. */ ndev = rdma_find_ndev_for_src_ip_rcu(dev_net(ndev), dst_in); - if (!ndev) + if (IS_ERR(ndev)) return -ENODEV; } From 363ad35577de3a73cf97006ec5f00fccaee73172 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 2 Oct 2018 11:48:01 +0300 Subject: [PATCH 195/242] RDMA/restrack: Un-inline set task implementation Prepare rdma_restrack_set_task() call to accommodate more code by moving its implementation from *.h to *.c. 
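For reference, the shape of this change, condensed from the hunks below: the header keeps only the prototype, while the definition moves out of line into restrack.c and is exported, so its body can later grow without bloating every call site.

	/* include/rdma/restrack.h: prototype only */
	void rdma_restrack_set_task(struct rdma_restrack_entry *res,
				    struct task_struct *task);

	/* drivers/infiniband/core/restrack.c: out-of-line, exported definition */
	void rdma_restrack_set_task(struct rdma_restrack_entry *res,
				    struct task_struct *task)
	{
		if (res->task)
			put_task_struct(res->task);
		get_task_struct(task);
		res->task = task;
	}
	EXPORT_SYMBOL(rdma_restrack_set_task);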
Reviewed-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/restrack.c | 10 ++++++++++ include/rdma/restrack.h | 10 ++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index bcc693fffd4c..b02d43988e16 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -155,6 +155,16 @@ static bool res_is_user(struct rdma_restrack_entry *res) } } +void rdma_restrack_set_task(struct rdma_restrack_entry *res, + struct task_struct *task) +{ + if (res->task) + put_task_struct(res->task); + get_task_struct(task); + res->task = task; +} +EXPORT_SYMBOL(rdma_restrack_set_task); + void rdma_restrack_add(struct rdma_restrack_entry *res) { struct ib_device *dev = res_to_dev(res); diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 9654d33edd98..0bddbbdbaf7c 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ -175,14 +175,8 @@ int rdma_restrack_put(struct rdma_restrack_entry *res); * @res: resource entry * @task: task struct */ -static inline void rdma_restrack_set_task(struct rdma_restrack_entry *res, - struct task_struct *task) -{ - if (res->task) - put_task_struct(res->task); - get_task_struct(task); - res->task = task; -} +void rdma_restrack_set_task(struct rdma_restrack_entry *res, + struct task_struct *task); /* * Helper functions for rdma drivers when filling out From 2165fc264079ecb7fbfa5e8b330a92eb3f0fcbe1 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 2 Oct 2018 11:48:02 +0300 Subject: [PATCH 196/242] RDMA/restrack: Consolidate task name updates in one place Unify task update and kernel name set in one place. 
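The effect of the consolidation is easiest to see at a call site. A minimal before/after sketch, condensed from the hunks below, where passing NULL now means "attribute the resource to the current task":

	/* before: each call site open-coded the kernel-name vs. task choice */
	if (caller)
		id_priv->res.kern_name = caller;
	else
		rdma_restrack_set_task(&id_priv->res, current);

	/* after: a single call; rdma_restrack_set_task() handles both cases */
	rdma_restrack_set_task(&id_priv->res, caller);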
Reviewed-by: Artemy Kovalyov Reviewed-by: Yossi Itigin Signed-off-by: Leon Romanovsky Reviewed-by: Steve Wise Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma.c | 10 ++-------- drivers/infiniband/core/cq.c | 2 +- drivers/infiniband/core/restrack.c | 13 +++++++++---- drivers/infiniband/core/verbs.c | 4 ++-- include/rdma/restrack.h | 4 ++-- 5 files changed, 16 insertions(+), 17 deletions(-) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 897aac68158b..f117b755c4c2 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -875,10 +875,7 @@ struct rdma_cm_id *__rdma_create_id(struct net *net, if (!id_priv) return ERR_PTR(-ENOMEM); - if (caller) - id_priv->res.kern_name = caller; - else - rdma_restrack_set_task(&id_priv->res, current); + rdma_restrack_set_task(&id_priv->res, caller); id_priv->res.type = RDMA_RESTRACK_CM_ID; id_priv->state = RDMA_CM_IDLE; id_priv->id.context = context; @@ -3945,10 +3942,7 @@ int __rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param, id_priv = container_of(id, struct rdma_id_private, id); - if (caller) - id_priv->res.kern_name = caller; - else - rdma_restrack_set_task(&id_priv->res, current); + rdma_restrack_set_task(&id_priv->res, caller); if (!cma_comp(id_priv, RDMA_CM_CONNECT)) return -EINVAL; diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c index 9271f7290005..b1e5365ddafa 100644 --- a/drivers/infiniband/core/cq.c +++ b/drivers/infiniband/core/cq.c @@ -161,7 +161,7 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private, goto out_destroy_cq; cq->res.type = RDMA_RESTRACK_CQ; - cq->res.kern_name = caller; + rdma_restrack_set_task(&cq->res, caller); rdma_restrack_add(&cq->res); switch (cq->poll_ctx) { diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index b02d43988e16..035af568ba64 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -156,12 +156,17 @@ static bool res_is_user(struct rdma_restrack_entry *res) } void rdma_restrack_set_task(struct rdma_restrack_entry *res, - struct task_struct *task) + const char *caller) { + if (caller) { + res->kern_name = caller; + return; + } + if (res->task) put_task_struct(res->task); - get_task_struct(task); - res->task = task; + get_task_struct(current); + res->task = current; } EXPORT_SYMBOL(rdma_restrack_set_task); @@ -177,7 +182,7 @@ void rdma_restrack_add(struct rdma_restrack_entry *res) if (res_is_user(res)) { if (!res->task) - rdma_restrack_set_task(res, current); + rdma_restrack_set_task(res, NULL); res->kern_name = NULL; } else { set_kern_name(res); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 1e7ad5e0a46e..65a7e0b44ad7 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -264,7 +264,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, } pd->res.type = RDMA_RESTRACK_PD; - pd->res.kern_name = caller; + rdma_restrack_set_task(&pd->res, caller); rdma_restrack_add(&pd->res); if (mr_access_flags) { @@ -1889,7 +1889,7 @@ struct ib_cq *__ib_create_cq(struct ib_device *device, cq->cq_context = cq_context; atomic_set(&cq->usecnt, 0); cq->res.type = RDMA_RESTRACK_CQ; - cq->res.kern_name = caller; + rdma_restrack_set_task(&cq->res, caller); rdma_restrack_add(&cq->res); } diff --git a/include/rdma/restrack.h b/include/rdma/restrack.h index 0bddbbdbaf7c..2638fa7cd702 100644 --- a/include/rdma/restrack.h +++ b/include/rdma/restrack.h @@ 
-173,10 +173,10 @@ int rdma_restrack_put(struct rdma_restrack_entry *res);
 /**
  * rdma_restrack_set_task() - set the task for this resource
  * @res: resource entry
- * @task: task struct
+ * @caller: kernel name, the current task will be used if the caller is NULL.
  */
 void rdma_restrack_set_task(struct rdma_restrack_entry *res,
-			    struct task_struct *task);
+			    const char *caller);
 
 /*
  * Helper functions for rdma drivers when filling out

From ed7a01fd3fd77f40b4ef2562b966a5decd8928d2 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Tue, 2 Oct 2018 11:48:03 +0300
Subject: [PATCH 197/242] RDMA/restrack: Release task struct which was held by CM_ID object

Tracking a CM_ID resource is performed in two stages: creation of the
cm_id and connecting it to the cma_dev. This is needed because the
rdma-cm protocol exports two separate user-visible calls, rdma_create_id
and rdma_accept. At the time of CM_ID creation, the real owner of that
object is not known yet and we need to grab the task_struct. This
task_struct is released or reassigned in the attach phase later on, but
a call to rdma_destroy_id left this task_struct unreleased.

Such separation is unique to CM_ID; other restrack objects are
initialized in one shot. It means that it is safe to use the
"res->valid" check to catch an unfinished CM_ID flow and release the
task_struct for that object.

Fixes: 00313983cda6 ("RDMA/nldev: provide detailed CM_ID information")
Reported-by: Artemy Kovalyov
Reviewed-by: Artemy Kovalyov
Reviewed-by: Yossi Itigin
Signed-off-by: Leon Romanovsky
Reviewed-by: Steve Wise
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/core/cma.c      | 7 +++----
 drivers/infiniband/core/restrack.c | 6 ++++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index f117b755c4c2..f98ddb5f4d59 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -1815,8 +1815,8 @@ void rdma_destroy_id(struct rdma_cm_id *id)
 	mutex_lock(&id_priv->handler_mutex);
 	mutex_unlock(&id_priv->handler_mutex);
 
+	rdma_restrack_del(&id_priv->res);
 	if (id_priv->cma_dev) {
-		rdma_restrack_del(&id_priv->res);
 		if (rdma_cap_ib_cm(id_priv->id.device, 1)) {
 			if (id_priv->cm_id.ib)
 				ib_destroy_cm_id(id_priv->cm_id.ib);
@@ -3542,10 +3542,9 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
 	return 0;
 err2:
-	if (id_priv->cma_dev) {
-		rdma_restrack_del(&id_priv->res);
+	rdma_restrack_del(&id_priv->res);
+	if (id_priv->cma_dev)
 		cma_release_dev(id_priv);
-	}
 err1:
 	cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE);
 	return ret;
diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c
index 035af568ba64..16b5f9949770 100644
--- a/drivers/infiniband/core/restrack.c
+++ b/drivers/infiniband/core/restrack.c
@@ -223,7 +223,7 @@ void rdma_restrack_del(struct rdma_restrack_entry *res)
 	struct ib_device *dev;
 
 	if (!res->valid)
-		return;
+		goto out;
 
 	dev = res_to_dev(res);
 	if (!dev)
@@ -236,8 +236,10 @@ void rdma_restrack_del(struct rdma_restrack_entry *res)
 	down_write(&dev->res.rwsem);
 	hash_del(&res->node);
 	res->valid = false;
+	up_write(&dev->res.rwsem);
+
+out:
 	if (res->task)
 		put_task_struct(res->task);
-	up_write(&dev->res.rwsem);
 }
 EXPORT_SYMBOL(rdma_restrack_del);

From 5a23e0b1dd51fe0efae666b03fdb15e1301f437a Mon Sep 17 00:00:00 2001
From: Selvin Xavier
Date: Mon, 8 Oct 2018 03:27:52 -0700
Subject: [PATCH 198/242] RDMA/bnxt_re: Add missing spin lock initialization

Add the missing initialization of the cq_lock and qplib.flush_lock.
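For context, a spinlock must be initialized before its first acquisition; taking an uninitialized lock is undefined behavior and trips lockdep on debug kernels. A minimal sketch of the required ordering in a create path (the allocation shown is hypothetical; the lock names come from this patch):

	cq = kzalloc(sizeof(*cq), GFP_KERNEL);	/* hypothetical allocation */
	if (!cq)
		return ERR_PTR(-ENOMEM);

	spin_lock_init(&cq->cq_lock);		/* must precede the first spin_lock_irqsave() */
	spin_lock_init(&cq->flush_lock);	/* likewise for the flush-list lock */
	/* only now publish the CQ where the completion path can reach it */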
Fixes: 942c9b6ca8de ("RDMA/bnxt_re: Avoid Hard lockup during error CQE processing")
Signed-off-by: Selvin Xavier
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/bnxt_re/ib_verbs.c | 1 +
 drivers/infiniband/hw/bnxt_re/qplib_fp.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
index 9d7c48466f10..54fdd4cf5288 100644
--- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c
+++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c
@@ -2663,6 +2663,7 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
 
 	nq->budget++;
 	atomic_inc(&rdev->cq_count);
+	spin_lock_init(&cq->cq_lock);
 
 	if (context) {
 		struct bnxt_re_cq_resp resp;
diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
index 6eaa4b2abb78..e14168923d9a 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
@@ -1966,6 +1966,7 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
 	INIT_LIST_HEAD(&cq->sqf_head);
 	INIT_LIST_HEAD(&cq->rqf_head);
 	spin_lock_init(&cq->compl_lock);
+	spin_lock_init(&cq->flush_lock);
 
 	bnxt_qplib_arm_cq_enable(cq);
 	return 0;

From d455f29f6d76a5f94881ca1289aaa1e90617ff5d Mon Sep 17 00:00:00 2001
From: Selvin Xavier
Date: Mon, 8 Oct 2018 03:27:53 -0700
Subject: [PATCH 199/242] RDMA/bnxt_re: Fix recursive lock warning in debug kernel

Fix a possible recursive lock warning. It is a false warning, as the
locks are part of two different HW queue data structures - cmdq and
creq. A debug kernel throws the following warning and stack trace.

[  783.914967] ============================================
[  783.914970] WARNING: possible recursive locking detected
[  783.914973] 4.19.0-rc2+ #33 Not tainted
[  783.914976] --------------------------------------------
[  783.914979] swapper/2/0 is trying to acquire lock:
[  783.914982] 000000002aa3949d (&(&hwq->lock)->rlock){..-.}, at: bnxt_qplib_service_creq+0x232/0x350 [bnxt_re]
[  783.914999] but task is already holding lock:
[  783.915002] 00000000be73920d (&(&hwq->lock)->rlock){..-.}, at: bnxt_qplib_service_creq+0x2a/0x350 [bnxt_re]
[  783.915013] other info that might help us debug this:
[  783.915016]  Possible unsafe locking scenario:
[  783.915019]        CPU0
[  783.915021]        ----
[  783.915034]   lock(&(&hwq->lock)->rlock);
[  783.915035]   lock(&(&hwq->lock)->rlock);
[  783.915037]  *** DEADLOCK ***
[  783.915038]  May be due to missing lock nesting notation
[  783.915039] 1 lock held by swapper/2/0:
[  783.915040]  #0: 00000000be73920d (&(&hwq->lock)->rlock){..-.}, at: bnxt_qplib_service_creq+0x2a/0x350 [bnxt_re]
[  783.915044] stack backtrace:
[  783.915046] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.19.0-rc2+ #33
[  783.915047] Hardware name: Dell Inc. PowerEdge R730/0599V5, BIOS 1.0.4 08/28/2014
[  783.915048] Call Trace:
[  783.915049]  <IRQ>
[  783.915054]  dump_stack+0x90/0xe3
[  783.915058]  __lock_acquire+0x106c/0x1080
[  783.915061]  ? sched_clock+0x5/0x10
[  783.915063]  lock_acquire+0xbd/0x1a0
[  783.915065]  ? bnxt_qplib_service_creq+0x232/0x350 [bnxt_re]
[  783.915069]  _raw_spin_lock_irqsave+0x4a/0x90
[  783.915071]  ? bnxt_qplib_service_creq+0x232/0x350 [bnxt_re]
[  783.915073]  bnxt_qplib_service_creq+0x232/0x350 [bnxt_re]
[  783.915078]  tasklet_action_common.isra.17+0x197/0x1b0
[  783.915081]  __do_softirq+0xcb/0x3a6
[  783.915084]  irq_exit+0xe9/0x100
[  783.915085]  do_IRQ+0x6a/0x120
[  783.915087]  common_interrupt+0xf/0xf
[  783.915088]  </IRQ>

Use nested notation for the spin_lock to avoid this warning.
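A minimal sketch of the annotation the fix applies: both locks belong to the same lock class (the lock embedded in struct bnxt_qplib_hwq), so lockdep assumes recursion; the _nested variant declares that acquiring the second lock while holding the first is a deliberate, fixed nesting order:

	unsigned long flags;

	/* caller (the CREQ tasklet) already holds creq->lock of the same class */
	spin_lock_irqsave_nested(&cmdq->lock, flags, SINGLE_DEPTH_NESTING);
	/* ... reap the command completion under cmdq->lock ... */
	spin_unlock_irqrestore(&cmdq->lock, flags);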
Signed-off-by: Selvin Xavier
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
index 14105004d258..0c6a3acef2db 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
@@ -311,8 +311,17 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw,
 			rcfw->aeq_handler(rcfw, qp_event, qp);
 		break;
 	default:
-		/* Command Response */
-		spin_lock_irqsave(&cmdq->lock, flags);
+		/*
+		 * Command Response
+		 * cmdq->lock needs to be acquired to synchronize
+		 * the command send and completion reaping. This function
+		 * is always called with creq->lock held. Using
+		 * the nested variant of spin_lock.
+		 *
+		 */
+
+		spin_lock_irqsave_nested(&cmdq->lock, flags,
+					 SINGLE_DEPTH_NESTING);
 		cookie = le16_to_cpu(qp_event->cookie);
 		mcookie = qp_event->cookie;
 		blocked = cookie & RCFW_CMD_IS_BLOCKING;

From 1b7042d7a586a4b8c9d257ceba6f40390bcd21ee Mon Sep 17 00:00:00 2001
From: Selvin Xavier
Date: Mon, 8 Oct 2018 03:27:54 -0700
Subject: [PATCH 200/242] RDMA/bnxt_re: Remove the unnecessary version macro definition

The version macro is not required as the driver does not maintain a
version. Remove the references to this macro too.

Signed-off-by: Selvin Xavier
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/bnxt_re/bnxt_re.h | 1 -
 drivers/infiniband/hw/bnxt_re/main.c    | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
index 96f76896488d..f34ef05a92ee 100644
--- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h
+++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h
@@ -40,7 +40,6 @@
 #ifndef __BNXT_RE_H__
 #define __BNXT_RE_H__
 #define ROCE_DRV_MODULE_NAME		"bnxt_re"
-#define ROCE_DRV_MODULE_VERSION		"1.0.0"
 
 #define BNXT_RE_DESC	"Broadcom NetXtreme-C/E RoCE Driver"
 #define BNXT_RE_PAGE_SHIFT_4K	(12)
diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 63186396ffa4..7f273255875f 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -67,7 +67,7 @@
 #include "hw_counters.h"
 
 static char version[] =
-		BNXT_RE_DESC " v" ROCE_DRV_MODULE_VERSION "\n";
+		BNXT_RE_DESC "\n";
 
 MODULE_AUTHOR("Eddie Wai ");
 MODULE_DESCRIPTION(BNXT_RE_DESC " Driver");

From eae4ad1b0c9a77ef0cbac212d58d46976eaacfc1 Mon Sep 17 00:00:00 2001
From: Selvin Xavier
Date: Mon, 8 Oct 2018 03:27:55 -0700
Subject: [PATCH 201/242] RDMA/bnxt_re: Avoid NULL check after accessing the pointer

This is reported by a smatch check. rcfw->creq_bar_reg_iomem is accessed
in bnxt_qplib_rcfw_stop_irq, so checking this variable afterwards does
not make sense. Also, rcfw->creq_bar_reg_iomem will never be NULL, so
remove this check.
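In generic form, the pattern the smatch check flags (a sketch; the real code is in the hunk below): the pointer is used first and only tested afterwards, so either the test is dead code or the earlier use can crash. Here the pointer can never be NULL, so the tests are dropped:

	bnxt_qplib_rcfw_stop_irq(rcfw, true);	/* uses rcfw->creq_bar_reg_iomem */

	if (rcfw->creq_bar_reg_iomem)		/* smatch: check after use */
		iounmap(rcfw->creq_bar_reg_iomem);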
Reported-by: Dan Carpenter
Fixes: 6e04b1035689 ("RDMA/bnxt_re: Fix broken RoCE driver due to recent L2 driver changes")
Signed-off-by: Selvin Xavier
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
index 0c6a3acef2db..5d1504e29a11 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
@@ -615,13 +615,8 @@ void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw)
 
 	bnxt_qplib_rcfw_stop_irq(rcfw, true);
 
-	if (rcfw->cmdq_bar_reg_iomem)
-		iounmap(rcfw->cmdq_bar_reg_iomem);
-	rcfw->cmdq_bar_reg_iomem = NULL;
-
-	if (rcfw->creq_bar_reg_iomem)
-		iounmap(rcfw->creq_bar_reg_iomem);
-	rcfw->creq_bar_reg_iomem = NULL;
+	iounmap(rcfw->cmdq_bar_reg_iomem);
+	iounmap(rcfw->creq_bar_reg_iomem);
 
 	indx = find_first_bit(rcfw->cmdq_bitmap, rcfw->bmap_size);
 	if (indx != rcfw->bmap_size)
@@ -630,6 +625,8 @@ void bnxt_qplib_disable_rcfw_channel(struct bnxt_qplib_rcfw *rcfw)
 	kfree(rcfw->cmdq_bitmap);
 	rcfw->bmap_size = 0;
 
+	rcfw->cmdq_bar_reg_iomem = NULL;
+	rcfw->creq_bar_reg_iomem = NULL;
 	rcfw->aeq_handler = NULL;
 	rcfw->vector = 0;
 }
@@ -713,6 +710,8 @@ int bnxt_qplib_enable_rcfw_channel(struct pci_dev *pdev,
 	if (!rcfw->creq_bar_reg_iomem) {
 		dev_err(&rcfw->pdev->dev, "CREQ BAR region %d mapping failed\n",
 			rcfw->creq_bar_reg);
+		iounmap(rcfw->cmdq_bar_reg_iomem);
+		rcfw->cmdq_bar_reg_iomem = NULL;
 		return -ENOMEM;
 	}
 	rcfw->creq_qp_event_processed = 0;

From f2bd4d096eb4e0074b476452789a3b10a31af162 Mon Sep 17 00:00:00 2001
From: Selvin Xavier
Date: Mon, 8 Oct 2018 03:27:56 -0700
Subject: [PATCH 202/242] RDMA/bnxt_re: Avoid accessing nq->bar_reg_iomem in failure case

In the failure path, nq->bar_reg_iomem gets accessed without being
initialized. Avoid this by calling bnxt_qplib_nq_stop_irq() only if the
initialization is complete.

Reported-by: Dan Carpenter
Fixes: 1ac5a4047975 ("RDMA/bnxt_re: Add bnxt_re RoCE driver")
Fixes: 6e04b1035689 ("RDMA/bnxt_re: Fix broken RoCE driver due to recent L2 driver changes")
Signed-off-by: Selvin Xavier
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/bnxt_re/qplib_fp.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/bnxt_re/qplib_fp.c b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
index e14168923d9a..b98b054148cd 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_fp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_fp.c
@@ -358,7 +358,8 @@ void bnxt_qplib_disable_nq(struct bnxt_qplib_nq *nq)
 	}
 
 	/* Make sure the HW is stopped! */
-	bnxt_qplib_nq_stop_irq(nq, true);
+	if (nq->requested)
+		bnxt_qplib_nq_stop_irq(nq, true);
 
 	if (nq->bar_reg_iomem)
 		iounmap(nq->bar_reg_iomem);

From f2bd4d096eb4e0074b476452789a3b10a31af162 the series continues.

From f2bd4d096eb4e0074b476452789a3b10a31af162 Mon Sep 17 00:00:00 2001
Signed-off-by: Devesh Sharma Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index 5d1504e29a11..a3d9447c3aee 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -391,9 +391,10 @@ static void bnxt_qplib_service_creq(unsigned long data) "aeqe:%#x Not handled\n", type); break; default: - dev_warn(&rcfw->pdev->dev, - "creqe with op_event = 0x%x not handled\n", - type); + if (type != ASYNC_EVENT_CMPL_TYPE_HWRM_ASYNC_EVENT) + dev_warn(&rcfw->pdev->dev, + "creqe with event 0x%x not handled\n", + type); break; } raw_cons++; From bb22c36cbaaeff351b8a6956d88be98497cc14aa Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Mon, 8 Oct 2018 03:27:58 -0700 Subject: [PATCH 204/242] RDMA/bnxt_re: Prevent driver crash due to NULL pointer in error message print crsqe->resp would be NULL in case the host command timed out before getting a response from HW. Check for NULL pointer to avoid a potential crash while printing the error message. Signed-off-by: Somnath Kotur Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/qplib_rcfw.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c index a3d9447c3aee..be4e33e9f962 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.c @@ -333,10 +333,12 @@ static int bnxt_qplib_process_qp_event(struct bnxt_qplib_rcfw *rcfw, memcpy(crsqe->resp, qp_event, sizeof(*qp_event)); crsqe->resp = NULL; } else { - dev_err(&rcfw->pdev->dev, - "CMD %s resp->cookie = %#x, evnt->cookie = %#x\n", - crsqe->resp ? "mismatch" : "collision", - crsqe->resp ? crsqe->resp->cookie : 0, mcookie); + if (crsqe->resp && crsqe->resp->cookie) + dev_err(&rcfw->pdev->dev, + "CMD %s cookie sent=%#x, recd=%#x\n", + crsqe->resp ? "mismatch" : "collision", + crsqe->resp ? 
crsqe->resp->cookie : 0,
+				mcookie);
 	}
 	if (!test_and_clear_bit(cbit, rcfw->cmdq_bitmap))
 		dev_warn(&rcfw->pdev->dev,

From 5c80c9138e28f600e185d0f23fca5ea7fe45bd95 Mon Sep 17 00:00:00 2001
From: Selvin Xavier
Date: Mon, 8 Oct 2018 03:27:59 -0700
Subject: [PATCH 205/242] RDMA/bnxt_re: Expose rx discards and drop counters

Expose the RoCE discard and drop counters from the HW statistics
context.

Signed-off-by: Selvin Xavier
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/bnxt_re/hw_counters.c | 6 ++++++
 drivers/infiniband/hw/bnxt_re/hw_counters.h | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.c b/drivers/infiniband/hw/bnxt_re/hw_counters.c
index 77416bc61e6e..e63adf2cae19 100644
--- a/drivers/infiniband/hw/bnxt_re/hw_counters.c
+++ b/drivers/infiniband/hw/bnxt_re/hw_counters.c
@@ -68,6 +68,8 @@ static const char * const bnxt_re_stat_name[] = {
 	[BNXT_RE_TX_PKTS]		= "tx_pkts",
 	[BNXT_RE_TX_BYTES]		= "tx_bytes",
 	[BNXT_RE_RECOVERABLE_ERRORS]	= "recoverable_errors",
+	[BNXT_RE_RX_DROPS]		= "rx_roce_drops",
+	[BNXT_RE_RX_DISCARDS]		= "rx_roce_discards",
 	[BNXT_RE_TO_RETRANSMITS]	= "to_retransmits",
 	[BNXT_RE_SEQ_ERR_NAKS_RCVD]	= "seq_err_naks_rcvd",
 	[BNXT_RE_MAX_RETRY_EXCEEDED]	= "max_retry_exceeded",
@@ -128,6 +130,10 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev,
 	if (bnxt_re_stats) {
 		stats->value[BNXT_RE_RECOVERABLE_ERRORS] =
 			le64_to_cpu(bnxt_re_stats->tx_bcast_pkts);
+		stats->value[BNXT_RE_RX_DROPS] =
+			le64_to_cpu(bnxt_re_stats->rx_drop_pkts);
+		stats->value[BNXT_RE_RX_DISCARDS] =
+			le64_to_cpu(bnxt_re_stats->rx_discard_pkts);
 		stats->value[BNXT_RE_RX_PKTS] =
 			le64_to_cpu(bnxt_re_stats->rx_ucast_pkts);
 		stats->value[BNXT_RE_RX_BYTES] =
diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.h b/drivers/infiniband/hw/bnxt_re/hw_counters.h
index a01a922717d5..2c8a40983342 100644
--- a/drivers/infiniband/hw/bnxt_re/hw_counters.h
+++ b/drivers/infiniband/hw/bnxt_re/hw_counters.h
@@ -51,6 +51,8 @@ enum bnxt_re_hw_stats {
 	BNXT_RE_TX_PKTS,
 	BNXT_RE_TX_BYTES,
 	BNXT_RE_RECOVERABLE_ERRORS,
+	BNXT_RE_RX_DROPS,
+	BNXT_RE_RX_DISCARDS,
 	BNXT_RE_TO_RETRANSMITS,
 	BNXT_RE_SEQ_ERR_NAKS_RCVD,
 	BNXT_RE_MAX_RETRY_EXCEEDED,

From 316dd2825db1396304fb3003fad6cf1fef1883a0 Mon Sep 17 00:00:00 2001
From: Selvin Xavier
Date: Mon, 8 Oct 2018 03:28:00 -0700
Subject: [PATCH 206/242] RDMA/bnxt_re: Report out of sequence hw counters

Expose out of sequence errors received from the FW. This is a 32 bit
counter that the driver has to accumulate; store the previous value to
calculate the difference at the next query. Also, update the HW
statistics structure with the new fields.
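The accumulation scheme folds a 32 bit FW snapshot into a 64 bit running total. A self-contained sketch of the arithmetic (a hypothetical helper; the driver's version lives in bnxt_qplib_get_roce_stats() below):

	#include <stdint.h>

	#define OOS_COUNT_MASK 0xFFFFFFFFULL

	/* fw_val is the latest zero-extended 32 bit counter from the FW.
	 * The masked subtraction is wrap-safe: even if the HW counter
	 * rolled over since the last query, (fw_val - *prev) & MASK is
	 * still the number of new events to add to the accumulator.
	 */
	static void accumulate_counter(uint64_t *total, uint64_t *prev,
				       uint64_t fw_val)
	{
		*total += (fw_val - *prev) & OOS_COUNT_MASK;
		*prev = fw_val;
	}

On the very first query the driver only records the previous value without adding, which is what the init_oos_stats flag in the hunks below implements.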
Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/hw_counters.c | 5 ++++- drivers/infiniband/hw/bnxt_re/hw_counters.h | 1 + drivers/infiniband/hw/bnxt_re/qplib_rcfw.h | 4 ++++ drivers/infiniband/hw/bnxt_re/qplib_sp.c | 10 ++++++++++ drivers/infiniband/hw/bnxt_re/qplib_sp.h | 10 ++++++++++ drivers/infiniband/hw/bnxt_re/roce_hsi.h | 5 +++++ 6 files changed, 34 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.c b/drivers/infiniband/hw/bnxt_re/hw_counters.c index e63adf2cae19..604b71875f5f 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.c +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.c @@ -108,7 +108,8 @@ static const char * const bnxt_re_stat_name[] = { [BNXT_RE_RES_CQ_LOAD_ERR] = "res_cq_load_err", [BNXT_RE_RES_SRQ_LOAD_ERR] = "res_srq_load_err", [BNXT_RE_RES_TX_PCI_ERR] = "res_tx_pci_err", - [BNXT_RE_RES_RX_PCI_ERR] = "res_rx_pci_err" + [BNXT_RE_RES_RX_PCI_ERR] = "res_rx_pci_err", + [BNXT_RE_OUT_OF_SEQ_ERR] = "oos_drop_count" }; int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, @@ -226,6 +227,8 @@ int bnxt_re_ib_get_hw_stats(struct ib_device *ibdev, rdev->stats.res_tx_pci_err; stats->value[BNXT_RE_RES_RX_PCI_ERR] = rdev->stats.res_rx_pci_err; + stats->value[BNXT_RE_OUT_OF_SEQ_ERR] = + rdev->stats.res_oos_drop_count; } return ARRAY_SIZE(bnxt_re_stat_name); diff --git a/drivers/infiniband/hw/bnxt_re/hw_counters.h b/drivers/infiniband/hw/bnxt_re/hw_counters.h index 2c8a40983342..76399f477e5c 100644 --- a/drivers/infiniband/hw/bnxt_re/hw_counters.h +++ b/drivers/infiniband/hw/bnxt_re/hw_counters.h @@ -92,6 +92,7 @@ enum bnxt_re_hw_stats { BNXT_RE_RES_SRQ_LOAD_ERR, BNXT_RE_RES_TX_PCI_ERR, BNXT_RE_RES_RX_PCI_ERR, + BNXT_RE_OUT_OF_SEQ_ERR, BNXT_RE_NUM_COUNTERS }; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h index 46416dfe8830..9a8687dc0a79 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_rcfw.h @@ -154,6 +154,8 @@ struct bnxt_qplib_qp_node { void *qp_handle; /* ptr to qplib_qp */ }; +#define BNXT_QPLIB_OOS_COUNT_MASK 0xFFFFFFFF + /* RCFW Communication Channels */ struct bnxt_qplib_rcfw { struct pci_dev *pdev; @@ -190,6 +192,8 @@ struct bnxt_qplib_rcfw { struct bnxt_qplib_crsq *crsqe_tbl; int qp_tbl_size; struct bnxt_qplib_qp_node *qp_tbl; + u64 oos_prev; + u32 init_oos_stats; }; void bnxt_qplib_free_rcfw_channel(struct bnxt_qplib_rcfw *rcfw); diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c index c80f2eaaf1a3..14e2b3c13e5c 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c @@ -840,6 +840,16 @@ int bnxt_qplib_get_roce_stats(struct bnxt_qplib_rcfw *rcfw, stats->res_srq_load_err = le64_to_cpu(sb->res_srq_load_err); stats->res_tx_pci_err = le64_to_cpu(sb->res_tx_pci_err); stats->res_rx_pci_err = le64_to_cpu(sb->res_rx_pci_err); + if (!rcfw->init_oos_stats) { + rcfw->oos_prev = le64_to_cpu(sb->res_oos_drop_count); + rcfw->init_oos_stats = 1; + } else { + stats->res_oos_drop_count += + (le64_to_cpu(sb->res_oos_drop_count) - + rcfw->oos_prev) & BNXT_QPLIB_OOS_COUNT_MASK; + rcfw->oos_prev = le64_to_cpu(sb->res_oos_drop_count); + } + bail: bnxt_qplib_rcfw_free_sbuf(rcfw, sbuf); return rc; diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.h b/drivers/infiniband/hw/bnxt_re/qplib_sp.h index 9d3e8b994945..8079d7f5a008 100644 --- a/drivers/infiniband/hw/bnxt_re/qplib_sp.h +++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.h @@ 
-205,6 +205,16 @@ struct bnxt_qplib_roce_stats {
 	/* res_tx_pci_err is 64 b */
 	u64 res_rx_pci_err;
 	/* res_rx_pci_err is 64 b */
+	u64 res_oos_drop_count;
+	/* res_oos_drop_count */
+	u64 active_qp_count_p0;
+	/* port 0 active qps */
+	u64 active_qp_count_p1;
+	/* port 1 active qps */
+	u64 active_qp_count_p2;
+	/* port 2 active qps */
+	u64 active_qp_count_p3;
+	/* port 3 active qps */
 };
 
 int bnxt_qplib_get_sgid(struct bnxt_qplib_res *res,
diff --git a/drivers/infiniband/hw/bnxt_re/roce_hsi.h b/drivers/infiniband/hw/bnxt_re/roce_hsi.h
index 3e5a4f760d0e..8a9ead419ac2 100644
--- a/drivers/infiniband/hw/bnxt_re/roce_hsi.h
+++ b/drivers/infiniband/hw/bnxt_re/roce_hsi.h
@@ -2929,6 +2929,11 @@ struct creq_query_roce_stats_resp_sb {
 	__le64 res_srq_load_err;
 	__le64 res_tx_pci_err;
 	__le64 res_rx_pci_err;
+	__le64 res_oos_drop_count;
+	__le64 active_qp_count_p0;
+	__le64 active_qp_count_p1;
+	__le64 active_qp_count_p2;
+	__le64 active_qp_count_p3;
 };
 
 /* QP error notification event (16 bytes) */

From 4c01f2e3a906a0d2d798be5751c331cf501bc129 Mon Sep 17 00:00:00 2001
From: Devesh Sharma
Date: Mon, 8 Oct 2018 03:28:01 -0700
Subject: [PATCH 207/242] RDMA/bnxt_re: Fix qp async event reporting

Report the affiliated async event on the QP's own event handler instead
of the global event channel.

Signed-off-by: Devesh Sharma
Signed-off-by: Selvin Xavier
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/bnxt_re/main.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 7f273255875f..2b1d00d78ad1 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -988,12 +988,17 @@ static void bnxt_re_dispatch_event(struct ib_device *ibdev, struct ib_qp *qp,
 	struct ib_event ib_event;
 
 	ib_event.device = ibdev;
-	if (qp)
+	if (qp) {
 		ib_event.element.qp = qp;
-	else
+		ib_event.event = event;
+		if (qp->event_handler)
+			qp->event_handler(&ib_event, qp->qp_context);
+
+	} else {
 		ib_event.element.port_num = port_num;
-	ib_event.event = event;
-	ib_dispatch_event(&ib_event);
+		ib_event.event = event;
+		ib_dispatch_event(&ib_event);
+	}
 }
 
 #define HWRM_QUEUE_PRI2COS_QCFG_INPUT_FLAGS_IVLAN      0x02

From 854a202001171594df8988b335410adebac641d5 Mon Sep 17 00:00:00 2001
From: Devesh Sharma
Date: Mon, 8 Oct 2018 03:28:02 -0700
Subject: [PATCH 208/242] RDMA/bnxt_re: Limit max_pkey to 16 bit value

Some FW versions return pkey values greater than 0xFFFF, but
pkey_tbl_len of ib_port_attr is a 16 bit value. So restrict max_pkeys
to 0xFFFF.

Signed-off-by: Devesh Sharma
Signed-off-by: Selvin Xavier
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/bnxt_re/qplib_sp.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/bnxt_re/qplib_sp.c b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
index 14e2b3c13e5c..5216b5f844cc 100644
--- a/drivers/infiniband/hw/bnxt_re/qplib_sp.c
+++ b/drivers/infiniband/hw/bnxt_re/qplib_sp.c
@@ -137,8 +137,16 @@ int bnxt_qplib_get_dev_attr(struct bnxt_qplib_rcfw *rcfw,
 	attr->max_srq = le16_to_cpu(sb->max_srq);
 	attr->max_srq_wqes = le32_to_cpu(sb->max_srq_wr) - 1;
 	attr->max_srq_sges = sb->max_srq_sge;
-	/* Bono only reports 1 PKEY for now, but it can support > 1 */
 	attr->max_pkey = le32_to_cpu(sb->max_pkeys);
+	/*
+	 * Some versions of FW report more than 0xFFFF.
+	 * Restrict it for now to 0xFFFF to avoid
+	 * reporting a truncated value
+	 */
+	if (attr->max_pkey > 0xFFFF) {
+		/* ib_port_attr::pkey_tbl_len is u16 */
+		attr->max_pkey = 0xFFFF;
+	}
 
 	attr->max_inline_data = le32_to_cpu(sb->max_inline_data);
 	attr->l2_db_size = (sb->l2_db_space_size + 1) *

From a08b9e9a705675ba05fa44f0342e2875fd1bda28 Mon Sep 17 00:00:00 2001
From: Selvin Xavier
Date: Mon, 8 Oct 2018 03:28:03 -0700
Subject: [PATCH 209/242] RDMA/bnxt_re: Wait for delayed work to finish before device removal

The delayed work bnxt_re_worker could still be running even after
cancel_delayed_work returns, which causes a crash as the driver proceeds
with device removal. To make sure that the work is finished before
returning, use cancel_delayed_work_sync.

Signed-off-by: Selvin Xavier
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/bnxt_re/main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c
index 2b1d00d78ad1..75a54c58fdc6 100644
--- a/drivers/infiniband/hw/bnxt_re/main.c
+++ b/drivers/infiniband/hw/bnxt_re/main.c
@@ -1203,7 +1203,7 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev)
 			bnxt_re_unregister_ib(rdev);
 	}
 	if (test_and_clear_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags))
-		cancel_delayed_work(&rdev->worker);
+		cancel_delayed_work_sync(&rdev->worker);
 
 	bnxt_re_cleanup_res(rdev);
 	bnxt_re_free_res(rdev);

From 5df950994934814a8b91f0cf9f653842d2ba082d Mon Sep 17 00:00:00 2001
From: Selvin Xavier
Date: Mon, 8 Oct 2018 03:28:04 -0700
Subject: [PATCH 210/242] RDMA/bnxt_re: Avoid resource leak in case the NQ registration fails

In case the NQ alloc/enable fails, free up the already allocated or
enabled NQs before reporting the failure. Also, track the alloc/enable
using proper state checking.
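The unwind pattern this patch adopts, sketched in isolation with hypothetical setup/teardown helpers (the driver's version is in the hunks below): count each vector that was fully set up and, on failure, tear down only those, in reverse order, so nothing is leaked and nothing is freed twice.

	static int setup_all(struct vec *vecs, int num_vecs)
	{
		int i, rc, num_created = 0;

		for (i = 0; i < num_vecs; i++) {
			rc = setup_vec(&vecs[i]);	/* hypothetical helper */
			if (rc)
				goto undo;
			num_created++;
		}
		return 0;

	undo:
		for (i = num_created - 1; i >= 0; i--)
			teardown_vec(&vecs[i]);		/* undo in reverse order */
		return rc;
	}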
Signed-off-by: Selvin Xavier Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/bnxt_re.h | 2 ++ drivers/infiniband/hw/bnxt_re/main.c | 31 ++++++++++++++++++------- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/bnxt_re.h b/drivers/infiniband/hw/bnxt_re/bnxt_re.h index f34ef05a92ee..31baa8939a4f 100644 --- a/drivers/infiniband/hw/bnxt_re/bnxt_re.h +++ b/drivers/infiniband/hw/bnxt_re/bnxt_re.h @@ -119,6 +119,8 @@ struct bnxt_re_dev { #define BNXT_RE_FLAG_HAVE_L2_REF 3 #define BNXT_RE_FLAG_RCFW_CHANNEL_EN 4 #define BNXT_RE_FLAG_QOS_WORK_REG 5 +#define BNXT_RE_FLAG_RESOURCES_ALLOCATED 7 +#define BNXT_RE_FLAG_RESOURCES_INITIALIZED 8 #define BNXT_RE_FLAG_ISSUE_ROCE_STATS 29 struct net_device *netdev; unsigned int version, major, minor; diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index 75a54c58fdc6..e5d4b12f7bf5 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -863,10 +863,8 @@ static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev) { int i; - if (rdev->nq[0].hwq.max_elements) { - for (i = 1; i < rdev->num_msix; i++) - bnxt_qplib_disable_nq(&rdev->nq[i - 1]); - } + for (i = 1; i < rdev->num_msix; i++) + bnxt_qplib_disable_nq(&rdev->nq[i - 1]); if (rdev->qplib_res.rcfw) bnxt_qplib_cleanup_res(&rdev->qplib_res); @@ -875,6 +873,7 @@ static void bnxt_re_cleanup_res(struct bnxt_re_dev *rdev) static int bnxt_re_init_res(struct bnxt_re_dev *rdev) { int rc = 0, i; + int num_vec_enabled = 0; bnxt_qplib_init_res(&rdev->qplib_res); @@ -890,9 +889,13 @@ static int bnxt_re_init_res(struct bnxt_re_dev *rdev) "Failed to enable NQ with rc = 0x%x", rc); goto fail; } + num_vec_enabled++; } return 0; fail: + for (i = num_vec_enabled; i >= 0; i--) + bnxt_qplib_disable_nq(&rdev->nq[i]); + return rc; } @@ -924,6 +927,7 @@ static void bnxt_re_free_res(struct bnxt_re_dev *rdev) static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) { int rc = 0, i; + int num_vec_created = 0; /* Configure and allocate resources for qplib */ rdev->qplib_res.rcfw = &rdev->rcfw; @@ -950,7 +954,7 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) if (rc) { dev_err(rdev_to_dev(rdev), "Alloc Failed NQ%d rc:%#x", i, rc); - goto dealloc_dpi; + goto free_nq; } rc = bnxt_re_net_ring_alloc (rdev, rdev->nq[i].hwq.pbl[PBL_LVL_0].pg_map_arr, @@ -963,14 +967,17 @@ static int bnxt_re_alloc_res(struct bnxt_re_dev *rdev) dev_err(rdev_to_dev(rdev), "Failed to allocate NQ fw id with rc = 0x%x", rc); + bnxt_qplib_free_nq(&rdev->nq[i]); goto free_nq; } + num_vec_created++; } return 0; free_nq: - for (i = 0; i < rdev->num_msix - 1; i++) + for (i = num_vec_created; i >= 0; i--) { + bnxt_re_net_ring_free(rdev, rdev->nq[i].ring_id); bnxt_qplib_free_nq(&rdev->nq[i]); -dealloc_dpi: + } bnxt_qplib_dealloc_dpi(&rdev->qplib_res, &rdev->qplib_res.dpi_tbl, &rdev->dpi_privileged); @@ -1205,8 +1212,11 @@ static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev) if (test_and_clear_bit(BNXT_RE_FLAG_QOS_WORK_REG, &rdev->flags)) cancel_delayed_work_sync(&rdev->worker); - bnxt_re_cleanup_res(rdev); - bnxt_re_free_res(rdev); + if (test_and_clear_bit(BNXT_RE_FLAG_RESOURCES_INITIALIZED, + &rdev->flags)) + bnxt_re_cleanup_res(rdev); + if (test_and_clear_bit(BNXT_RE_FLAG_RESOURCES_ALLOCATED, &rdev->flags)) + bnxt_re_free_res(rdev); if (test_and_clear_bit(BNXT_RE_FLAG_RCFW_CHANNEL_EN, &rdev->flags)) { rc = bnxt_qplib_deinit_rcfw(&rdev->rcfw); @@ -1335,12 +1345,15 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) 
pr_err("Failed to allocate resources: %#x\n", rc); goto fail; } + set_bit(BNXT_RE_FLAG_RESOURCES_ALLOCATED, &rdev->flags); rc = bnxt_re_init_res(rdev); if (rc) { pr_err("Failed to initialize resources: %#x\n", rc); goto fail; } + set_bit(BNXT_RE_FLAG_RESOURCES_INITIALIZED, &rdev->flags); + if (!rdev->is_virtfn) { rc = bnxt_re_setup_qos(rdev); if (rc) From 68a997c5d28c581ae23594eb8a1420d834c30d24 Mon Sep 17 00:00:00 2001 From: Yixian Liu Date: Fri, 5 Oct 2018 17:53:24 +0800 Subject: [PATCH 211/242] RDMA/hns: Add FRMR support for hip08 This patch adds fast register physical memory region (FRMR) support for hip08. Signed-off-by: Yixian Liu Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/hns/hns_roce_device.h | 10 +++ drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 98 ++++++++++++++++++++- drivers/infiniband/hw/hns/hns_roce_hw_v2.h | 22 +++++ drivers/infiniband/hw/hns/hns_roce_main.c | 6 ++ drivers/infiniband/hw/hns/hns_roce_mr.c | 92 +++++++++++++++++-- 5 files changed, 218 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h b/drivers/infiniband/hw/hns/hns_roce_device.h index de9b8e391563..d39bdfdb5de9 100644 --- a/drivers/infiniband/hw/hns/hns_roce_device.h +++ b/drivers/infiniband/hw/hns/hns_roce_device.h @@ -88,8 +88,11 @@ #define BITMAP_RR 1 #define MR_TYPE_MR 0x00 +#define MR_TYPE_FRMR 0x01 #define MR_TYPE_DMA 0x03 +#define HNS_ROCE_FRMR_MAX_PA 512 + #define PKEY_ID 0xffff #define GUID_LEN 8 #define NODE_DESC_SIZE 64 @@ -194,6 +197,7 @@ enum { HNS_ROCE_CAP_FLAG_RECORD_DB = BIT(3), HNS_ROCE_CAP_FLAG_SQ_RECORD_DB = BIT(4), HNS_ROCE_CAP_FLAG_MW = BIT(7), + HNS_ROCE_CAP_FLAG_FRMR = BIT(8), HNS_ROCE_CAP_FLAG_ATOMIC = BIT(10), }; @@ -308,6 +312,7 @@ struct hns_roce_mr { u32 key; /* Key of MR */ u32 pd; /* PD num of MR */ u32 access;/* Access permission of MR */ + u32 npages; int enabled; /* MR's active status */ int type; /* MR's register type */ u64 *pbl_buf;/* MR's PBL space */ @@ -773,6 +778,7 @@ struct hns_roce_hw { struct hns_roce_mr *mr, int flags, u32 pdn, int mr_access_flags, u64 iova, u64 size, void *mb_buf); + int (*frmr_write_mtpt)(void *mb_buf, struct hns_roce_mr *mr); int (*mw_write_mtpt)(void *mb_buf, struct hns_roce_mw *mw); void (*write_cqc)(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq, void *mb_buf, u64 *mtts, @@ -983,6 +989,10 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, int hns_roce_rereg_user_mr(struct ib_mr *mr, int flags, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_pd *pd, struct ib_udata *udata); +struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, + u32 max_num_sg); +int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset); int hns_roce_dereg_mr(struct ib_mr *ibmr); int hns_roce_hw2sw_mpt(struct hns_roce_dev *hr_dev, struct hns_roce_cmd_mailbox *mailbox, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c index e3d9f1de8899..a4c62ae23a9a 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c @@ -54,6 +54,47 @@ static void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg, dseg->len = cpu_to_le32(sg->length); } +static void set_frmr_seg(struct hns_roce_v2_rc_send_wqe *rc_sq_wqe, + struct hns_roce_wqe_frmr_seg *fseg, + const struct ib_reg_wr *wr) +{ + struct hns_roce_mr *mr = to_hr_mr(wr->mr); + + /* use ib_access_flags */ + roce_set_bit(rc_sq_wqe->byte_4, + 
V2_RC_FRMR_WQE_BYTE_4_BIND_EN_S, + wr->access & IB_ACCESS_MW_BIND ? 1 : 0); + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_FRMR_WQE_BYTE_4_ATOMIC_S, + wr->access & IB_ACCESS_REMOTE_ATOMIC ? 1 : 0); + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_FRMR_WQE_BYTE_4_RR_S, + wr->access & IB_ACCESS_REMOTE_READ ? 1 : 0); + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_FRMR_WQE_BYTE_4_RW_S, + wr->access & IB_ACCESS_REMOTE_WRITE ? 1 : 0); + roce_set_bit(rc_sq_wqe->byte_4, + V2_RC_FRMR_WQE_BYTE_4_LW_S, + wr->access & IB_ACCESS_LOCAL_WRITE ? 1 : 0); + + /* Data structure reuse may lead to confusion */ + rc_sq_wqe->msg_len = cpu_to_le32(mr->pbl_ba & 0xffffffff); + rc_sq_wqe->inv_key = cpu_to_le32(mr->pbl_ba >> 32); + + rc_sq_wqe->byte_16 = cpu_to_le32(wr->mr->length & 0xffffffff); + rc_sq_wqe->byte_20 = cpu_to_le32(wr->mr->length >> 32); + rc_sq_wqe->rkey = cpu_to_le32(wr->key); + rc_sq_wqe->va = cpu_to_le64(wr->mr->iova); + + fseg->pbl_size = cpu_to_le32(mr->pbl_size); + roce_set_field(fseg->mode_buf_pg_sz, + V2_RC_FRMR_WQE_BYTE_40_PBL_BUF_PG_SZ_M, + V2_RC_FRMR_WQE_BYTE_40_PBL_BUF_PG_SZ_S, + mr->pbl_buf_pg_sz + PG_SHIFT_OFFSET); + roce_set_bit(fseg->mode_buf_pg_sz, + V2_RC_FRMR_WQE_BYTE_40_BLK_MODE_S, 0); +} + static void set_atomic_seg(struct hns_roce_wqe_atomic_seg *aseg, const struct ib_atomic_wr *wr) { @@ -192,6 +233,7 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, struct hns_roce_v2_ud_send_wqe *ud_sq_wqe; struct hns_roce_v2_rc_send_wqe *rc_sq_wqe; struct hns_roce_qp *qp = to_hr_qp(ibqp); + struct hns_roce_wqe_frmr_seg *fseg; struct device *dev = hr_dev->dev; struct hns_roce_v2_db sq_db; struct ib_qp_attr attr; @@ -462,6 +504,11 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, rc_sq_wqe->inv_key = cpu_to_le32(wr->ex.invalidate_rkey); break; + case IB_WR_REG_MR: + hr_op = HNS_ROCE_V2_WQE_OP_FAST_REG_PMR; + fseg = wqe; + set_frmr_seg(rc_sq_wqe, fseg, reg_wr(wr)); + break; case IB_WR_ATOMIC_CMP_AND_SWP: hr_op = HNS_ROCE_V2_WQE_OP_ATOM_CMP_AND_SWAP; rc_sq_wqe->rkey = @@ -505,7 +552,7 @@ static int hns_roce_v2_post_send(struct ib_qp *ibqp, V2_RC_SEND_WQE_BYTE_16_SGE_NUM_M, V2_RC_SEND_WQE_BYTE_16_SGE_NUM_S, wr->num_sge); - } else { + } else if (wr->opcode != IB_WR_REG_MR) { ret = set_rwqe_data_seg(ibqp, wr, rc_sq_wqe, wqe, &sge_ind, bad_wr); if (ret) @@ -1297,7 +1344,8 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev) HNS_ROCE_CAP_FLAG_SQ_RECORD_DB; if (hr_dev->pci_dev->revision == 0x21) - caps->flags |= HNS_ROCE_CAP_FLAG_MW; + caps->flags |= HNS_ROCE_CAP_FLAG_MW | + HNS_ROCE_CAP_FLAG_FRMR; caps->pkey_table_len[0] = 1; caps->gid_table_len[0] = HNS_ROCE_V2_GID_INDEX_NUM; @@ -1865,6 +1913,48 @@ static int hns_roce_v2_rereg_write_mtpt(struct hns_roce_dev *hr_dev, return 0; } +static int hns_roce_v2_frmr_write_mtpt(void *mb_buf, struct hns_roce_mr *mr) +{ + struct hns_roce_v2_mpt_entry *mpt_entry; + + mpt_entry = mb_buf; + memset(mpt_entry, 0, sizeof(*mpt_entry)); + + roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_MPT_ST_M, + V2_MPT_BYTE_4_MPT_ST_S, V2_MPT_ST_FREE); + roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PBL_HOP_NUM_M, + V2_MPT_BYTE_4_PBL_HOP_NUM_S, 1); + roce_set_field(mpt_entry->byte_4_pd_hop_st, + V2_MPT_BYTE_4_PBL_BA_PG_SZ_M, + V2_MPT_BYTE_4_PBL_BA_PG_SZ_S, + mr->pbl_ba_pg_sz + PG_SHIFT_OFFSET); + roce_set_field(mpt_entry->byte_4_pd_hop_st, V2_MPT_BYTE_4_PD_M, + V2_MPT_BYTE_4_PD_S, mr->pd); + + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RA_EN_S, 1); + roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_R_INV_EN_S, 1); + 
roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 1); + + roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_FRE_S, 1); + roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_PA_S, 0); + roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_MR_MW_S, 0); + roce_set_bit(mpt_entry->byte_12_mw_pa, V2_MPT_BYTE_12_BPD_S, 1); + + mpt_entry->pbl_size = cpu_to_le32(mr->pbl_size); + + mpt_entry->pbl_ba_l = cpu_to_le32(lower_32_bits(mr->pbl_ba >> 3)); + roce_set_field(mpt_entry->byte_48_mode_ba, V2_MPT_BYTE_48_PBL_BA_H_M, + V2_MPT_BYTE_48_PBL_BA_H_S, + upper_32_bits(mr->pbl_ba >> 3)); + + roce_set_field(mpt_entry->byte_64_buf_pa1, + V2_MPT_BYTE_64_PBL_BUF_PG_SZ_M, + V2_MPT_BYTE_64_PBL_BUF_PG_SZ_S, + mr->pbl_buf_pg_sz + PG_SHIFT_OFFSET); + + return 0; +} + static int hns_roce_v2_mw_write_mtpt(void *mb_buf, struct hns_roce_mw *mw) { struct hns_roce_v2_mpt_entry *mpt_entry; @@ -2834,6 +2924,9 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp, roce_set_bit(qpc_mask->byte_172_sq_psn, V2_QPC_BYTE_172_MSG_RNR_FLG_S, 0); + roce_set_bit(context->byte_172_sq_psn, V2_QPC_BYTE_172_FRE_S, 1); + roce_set_bit(qpc_mask->byte_172_sq_psn, V2_QPC_BYTE_172_FRE_S, 0); + roce_set_field(qpc_mask->byte_176_msg_pktn, V2_QPC_BYTE_176_MSG_USE_PKTN_M, V2_QPC_BYTE_176_MSG_USE_PKTN_S, 0); @@ -5259,6 +5352,7 @@ static const struct hns_roce_hw hns_roce_hw_v2 = { .set_mac = hns_roce_v2_set_mac, .write_mtpt = hns_roce_v2_write_mtpt, .rereg_write_mtpt = hns_roce_v2_rereg_write_mtpt, + .frmr_write_mtpt = hns_roce_v2_frmr_write_mtpt, .mw_write_mtpt = hns_roce_v2_mw_write_mtpt, .write_cqc = hns_roce_v2_write_cqc, .set_hem = hns_roce_v2_set_hem, diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h index f8abccea152c..8bc820635bbd 100644 --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.h +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.h @@ -886,6 +886,8 @@ struct hns_roce_v2_mpt_entry { #define V2_MPT_BYTE_8_MW_CNT_S 8 #define V2_MPT_BYTE_8_MW_CNT_M GENMASK(31, 8) +#define V2_MPT_BYTE_12_FRE_S 0 + #define V2_MPT_BYTE_12_PA_S 1 #define V2_MPT_BYTE_12_MR_MW_S 4 @@ -1058,6 +1060,16 @@ struct hns_roce_v2_rc_send_wqe { #define V2_RC_SEND_WQE_BYTE_4_INLINE_S 12 +#define V2_RC_FRMR_WQE_BYTE_4_BIND_EN_S 19 + +#define V2_RC_FRMR_WQE_BYTE_4_ATOMIC_S 20 + +#define V2_RC_FRMR_WQE_BYTE_4_RR_S 21 + +#define V2_RC_FRMR_WQE_BYTE_4_RW_S 22 + +#define V2_RC_FRMR_WQE_BYTE_4_LW_S 23 + #define V2_RC_SEND_WQE_BYTE_16_XRC_SRQN_S 0 #define V2_RC_SEND_WQE_BYTE_16_XRC_SRQN_M GENMASK(23, 0) @@ -1067,6 +1079,16 @@ struct hns_roce_v2_rc_send_wqe { #define V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_S 0 #define V2_RC_SEND_WQE_BYTE_20_MSG_START_SGE_IDX_M GENMASK(23, 0) +struct hns_roce_wqe_frmr_seg { + __le32 pbl_size; + __le32 mode_buf_pg_sz; +}; + +#define V2_RC_FRMR_WQE_BYTE_40_PBL_BUF_PG_SZ_S 4 +#define V2_RC_FRMR_WQE_BYTE_40_PBL_BUF_PG_SZ_M GENMASK(7, 4) + +#define V2_RC_FRMR_WQE_BYTE_40_BLK_MODE_S 8 + struct hns_roce_v2_wqe_data_seg { __le32 len; __le32 lkey; diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c index 7e693b11c823..1b3ee514f2ef 100644 --- a/drivers/infiniband/hw/hns/hns_roce_main.c +++ b/drivers/infiniband/hw/hns/hns_roce_main.c @@ -535,6 +535,12 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev) (1ULL << IB_USER_VERBS_CMD_DEALLOC_MW); } + /* FRMR */ + if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_FRMR) { + ib_dev->alloc_mr = hns_roce_alloc_mr; + ib_dev->map_mr_sg = hns_roce_map_mr_sg; + } + /* OTHERS */ 
ib_dev->get_port_immutable = hns_roce_port_immutable; ib_dev->disassociate_ucontext = hns_roce_disassociate_ucontext; diff --git a/drivers/infiniband/hw/hns/hns_roce_mr.c b/drivers/infiniband/hw/hns/hns_roce_mr.c index 0613c117a442..521ad2aa3a4e 100644 --- a/drivers/infiniband/hw/hns/hns_roce_mr.c +++ b/drivers/infiniband/hw/hns/hns_roce_mr.c @@ -329,7 +329,7 @@ static int hns_roce_mhop_alloc(struct hns_roce_dev *hr_dev, int npages, u64 bt_idx; u64 size; - mhop_num = hr_dev->caps.pbl_hop_num; + mhop_num = (mr->type == MR_TYPE_FRMR ? 1 : hr_dev->caps.pbl_hop_num); pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT); pbl_last_bt_num = (npages + pbl_bt_sz / 8 - 1) / (pbl_bt_sz / 8); @@ -351,7 +351,7 @@ static int hns_roce_mhop_alloc(struct hns_roce_dev *hr_dev, int npages, mr->pbl_size = npages; mr->pbl_ba = mr->pbl_dma_addr; - mr->pbl_hop_num = hr_dev->caps.pbl_hop_num; + mr->pbl_hop_num = mhop_num; mr->pbl_ba_pg_sz = hr_dev->caps.pbl_ba_pg_sz; mr->pbl_buf_pg_sz = hr_dev->caps.pbl_buf_pg_sz; return 0; @@ -511,7 +511,6 @@ static int hns_roce_mr_alloc(struct hns_roce_dev *hr_dev, u32 pd, u64 iova, mr->key = hw_index_to_key(index); /* MR key */ if (size == ~0ull) { - mr->type = MR_TYPE_DMA; mr->pbl_buf = NULL; mr->pbl_dma_addr = 0; /* PBL multi-hop addressing parameters */ @@ -522,7 +521,6 @@ static int hns_roce_mr_alloc(struct hns_roce_dev *hr_dev, u32 pd, u64 iova, mr->pbl_l1_dma_addr = NULL; mr->pbl_l0_dma_addr = 0; } else { - mr->type = MR_TYPE_MR; if (!hr_dev->caps.pbl_hop_num) { mr->pbl_buf = dma_alloc_coherent(dev, npages * 8, &(mr->pbl_dma_addr), @@ -548,9 +546,9 @@ static void hns_roce_mhop_free(struct hns_roce_dev *hr_dev, u32 mhop_num; u64 bt_idx; - npages = ib_umem_page_count(mr->umem); + npages = mr->pbl_size; pbl_bt_sz = 1 << (hr_dev->caps.pbl_ba_pg_sz + PAGE_SHIFT); - mhop_num = hr_dev->caps.pbl_hop_num; + mhop_num = (mr->type == MR_TYPE_FRMR) ? 
1 : hr_dev->caps.pbl_hop_num; if (mhop_num == HNS_ROCE_HOP_NUM_0) return; @@ -636,7 +634,8 @@ static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, } if (mr->size != ~0ULL) { - npages = ib_umem_page_count(mr->umem); + if (mr->type == MR_TYPE_MR) + npages = ib_umem_page_count(mr->umem); if (!hr_dev->caps.pbl_hop_num) dma_free_coherent(dev, (unsigned int)(npages * 8), @@ -674,7 +673,10 @@ static int hns_roce_mr_enable(struct hns_roce_dev *hr_dev, goto err_table; } - ret = hr_dev->hw->write_mtpt(mailbox->buf, mr, mtpt_idx); + if (mr->type != MR_TYPE_FRMR) + ret = hr_dev->hw->write_mtpt(mailbox->buf, mr, mtpt_idx); + else + ret = hr_dev->hw->frmr_write_mtpt(mailbox->buf, mr); if (ret) { dev_err(dev, "Write mtpt fail!\n"); goto err_page; @@ -855,6 +857,8 @@ struct ib_mr *hns_roce_get_dma_mr(struct ib_pd *pd, int acc) if (mr == NULL) return ERR_PTR(-ENOMEM); + mr->type = MR_TYPE_DMA; + /* Allocate memory region key */ ret = hns_roce_mr_alloc(to_hr_dev(pd->device), to_hr_pd(pd)->pdn, 0, ~0ULL, acc, 0, mr); @@ -1031,6 +1035,8 @@ struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, } } + mr->type = MR_TYPE_MR; + ret = hns_roce_mr_alloc(hr_dev, to_hr_pd(pd)->pdn, virt_addr, length, access_flags, n, mr); if (ret) @@ -1202,6 +1208,76 @@ int hns_roce_dereg_mr(struct ib_mr *ibmr) return ret; } +struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, + u32 max_num_sg) +{ + struct hns_roce_dev *hr_dev = to_hr_dev(pd->device); + struct device *dev = hr_dev->dev; + struct hns_roce_mr *mr; + u64 length; + u32 page_size; + int ret; + + page_size = 1 << (hr_dev->caps.pbl_buf_pg_sz + PAGE_SHIFT); + length = max_num_sg * page_size; + + if (mr_type != IB_MR_TYPE_MEM_REG) + return ERR_PTR(-EINVAL); + + if (max_num_sg > HNS_ROCE_FRMR_MAX_PA) { + dev_err(dev, "max_num_sg larger than %d\n", + HNS_ROCE_FRMR_MAX_PA); + return ERR_PTR(-EINVAL); + } + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->type = MR_TYPE_FRMR; + + /* Allocate memory region key */ + ret = hns_roce_mr_alloc(hr_dev, to_hr_pd(pd)->pdn, 0, length, + 0, max_num_sg, mr); + if (ret) + goto err_free; + + ret = hns_roce_mr_enable(hr_dev, mr); + if (ret) + goto err_mr; + + mr->ibmr.rkey = mr->ibmr.lkey = mr->key; + mr->umem = NULL; + + return &mr->ibmr; + +err_mr: + hns_roce_mr_free(to_hr_dev(pd->device), mr); + +err_free: + kfree(mr); + return ERR_PTR(ret); +} + +static int hns_roce_set_page(struct ib_mr *ibmr, u64 addr) +{ + struct hns_roce_mr *mr = to_hr_mr(ibmr); + + mr->pbl_buf[mr->npages++] = cpu_to_le64(addr); + + return 0; +} + +int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents, + unsigned int *sg_offset) +{ + struct hns_roce_mr *mr = to_hr_mr(ibmr); + + mr->npages = 0; + + return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, hns_roce_set_page); +} + static void hns_roce_mw_free(struct hns_roce_dev *hr_dev, struct hns_roce_mw *mw) { From 2351776e87a18318b5f4732e7790f0c726cc37a0 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Sun, 7 Oct 2018 12:06:34 +0300 Subject: [PATCH 212/242] IB/mlx5: Verify DEVX object type Verify that the input DEVX object type matches the created object. As the obj_id in the firmware is not globally unique the object type must be considered upon checking for a valid object id. Once both the type and the id match we know that the lock was taken on the correct object by the uverbs layer. 
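The encoding can be exercised in isolation. A minimal stand-alone sketch (the opcode values below are made up for illustration and are not the real MLX5_CMD_OP_* constants): folding the creator opcode into the upper 32 bits keeps two objects of different types from aliasing even when the firmware hands out the same raw 32-bit id.

#include <assert.h>
#include <stdint.h>

/* Fold the 16-bit creator opcode into the upper 32 bits of the id. */
static uint64_t enc_obj_id(uint16_t opcode, uint32_t obj_id)
{
	return ((uint64_t)opcode << 32) | obj_id;
}

int main(void)
{
	uint32_t fw_id = 0x11;		/* same raw id handed out twice */
	uint16_t create_cq = 0x400;	/* made-up opcode values */
	uint16_t create_qp = 0x500;

	/* same raw id, different creator: the encoded ids never collide */
	assert(enc_obj_id(create_cq, fw_id) != enc_obj_id(create_qp, fw_id));
	return 0;
}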
Fixes: e662e14d801b ("IB/mlx5: Add DEVX support for modify and query commands") Signed-off-by: Yishai Hadas Reviewed-by: Artemy Kovalyov Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/devx.c | 180 +++++++++++++++++++++--------- 1 file changed, 126 insertions(+), 54 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index 31f12295aec6..61aab7c0c513 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -19,7 +19,7 @@ #define MLX5_MAX_DESTROY_INBOX_SIZE_DW MLX5_ST_SZ_DW(delete_fte_in) struct devx_obj { struct mlx5_core_dev *mdev; - u32 obj_id; + u64 obj_id; u32 dinlen; /* destroy inbox length */ u32 dinbox[MLX5_MAX_DESTROY_INBOX_SIZE_DW]; }; @@ -106,150 +106,218 @@ bool mlx5_ib_devx_is_flow_dest(void *obj, int *dest_id, int *dest_type) } } +/* + * As the obj_id in the firmware is not globally unique the object type + * must be considered upon checking for a valid object id. + * For that the opcode of the creator command is encoded as part of the obj_id. + */ +static u64 get_enc_obj_id(u16 opcode, u32 obj_id) +{ + return ((u64)opcode << 32) | obj_id; +} + static int devx_is_valid_obj_id(struct devx_obj *obj, const void *in) { u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); - u32 obj_id; + u64 obj_id; switch (opcode) { case MLX5_CMD_OP_MODIFY_GENERAL_OBJECT: case MLX5_CMD_OP_QUERY_GENERAL_OBJECT: - obj_id = MLX5_GET(general_obj_in_cmd_hdr, in, obj_id); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_GENERAL_OBJECT, + MLX5_GET(general_obj_in_cmd_hdr, in, + obj_id)); break; case MLX5_CMD_OP_QUERY_MKEY: - obj_id = MLX5_GET(query_mkey_in, in, mkey_index); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_MKEY, + MLX5_GET(query_mkey_in, in, + mkey_index)); break; case MLX5_CMD_OP_QUERY_CQ: - obj_id = MLX5_GET(query_cq_in, in, cqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_CQ, + MLX5_GET(query_cq_in, in, cqn)); break; case MLX5_CMD_OP_MODIFY_CQ: - obj_id = MLX5_GET(modify_cq_in, in, cqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_CQ, + MLX5_GET(modify_cq_in, in, cqn)); break; case MLX5_CMD_OP_QUERY_SQ: - obj_id = MLX5_GET(query_sq_in, in, sqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_SQ, + MLX5_GET(query_sq_in, in, sqn)); break; case MLX5_CMD_OP_MODIFY_SQ: - obj_id = MLX5_GET(modify_sq_in, in, sqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_SQ, + MLX5_GET(modify_sq_in, in, sqn)); break; case MLX5_CMD_OP_QUERY_RQ: - obj_id = MLX5_GET(query_rq_in, in, rqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ, + MLX5_GET(query_rq_in, in, rqn)); break; case MLX5_CMD_OP_MODIFY_RQ: - obj_id = MLX5_GET(modify_rq_in, in, rqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ, + MLX5_GET(modify_rq_in, in, rqn)); break; case MLX5_CMD_OP_QUERY_RMP: - obj_id = MLX5_GET(query_rmp_in, in, rmpn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RMP, + MLX5_GET(query_rmp_in, in, rmpn)); break; case MLX5_CMD_OP_MODIFY_RMP: - obj_id = MLX5_GET(modify_rmp_in, in, rmpn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RMP, + MLX5_GET(modify_rmp_in, in, rmpn)); break; case MLX5_CMD_OP_QUERY_RQT: - obj_id = MLX5_GET(query_rqt_in, in, rqtn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQT, + MLX5_GET(query_rqt_in, in, rqtn)); break; case MLX5_CMD_OP_MODIFY_RQT: - obj_id = MLX5_GET(modify_rqt_in, in, rqtn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQT, + MLX5_GET(modify_rqt_in, in, rqtn)); break; case MLX5_CMD_OP_QUERY_TIR: - obj_id = MLX5_GET(query_tir_in, in, tirn); + obj_id = 
get_enc_obj_id(MLX5_CMD_OP_CREATE_TIR, + MLX5_GET(query_tir_in, in, tirn)); break; case MLX5_CMD_OP_MODIFY_TIR: - obj_id = MLX5_GET(modify_tir_in, in, tirn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_TIR, + MLX5_GET(modify_tir_in, in, tirn)); break; case MLX5_CMD_OP_QUERY_TIS: - obj_id = MLX5_GET(query_tis_in, in, tisn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_TIS, + MLX5_GET(query_tis_in, in, tisn)); break; case MLX5_CMD_OP_MODIFY_TIS: - obj_id = MLX5_GET(modify_tis_in, in, tisn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_TIS, + MLX5_GET(modify_tis_in, in, tisn)); break; case MLX5_CMD_OP_QUERY_FLOW_TABLE: - obj_id = MLX5_GET(query_flow_table_in, in, table_id); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_FLOW_TABLE, + MLX5_GET(query_flow_table_in, in, + table_id)); break; case MLX5_CMD_OP_MODIFY_FLOW_TABLE: - obj_id = MLX5_GET(modify_flow_table_in, in, table_id); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_FLOW_TABLE, + MLX5_GET(modify_flow_table_in, in, + table_id)); break; case MLX5_CMD_OP_QUERY_FLOW_GROUP: - obj_id = MLX5_GET(query_flow_group_in, in, group_id); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_FLOW_GROUP, + MLX5_GET(query_flow_group_in, in, + group_id)); break; case MLX5_CMD_OP_QUERY_FLOW_TABLE_ENTRY: - obj_id = MLX5_GET(query_fte_in, in, flow_index); + obj_id = get_enc_obj_id(MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY, + MLX5_GET(query_fte_in, in, + flow_index)); break; case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: - obj_id = MLX5_GET(set_fte_in, in, flow_index); + obj_id = get_enc_obj_id(MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY, + MLX5_GET(set_fte_in, in, flow_index)); break; case MLX5_CMD_OP_QUERY_Q_COUNTER: - obj_id = MLX5_GET(query_q_counter_in, in, counter_set_id); + obj_id = get_enc_obj_id(MLX5_CMD_OP_ALLOC_Q_COUNTER, + MLX5_GET(query_q_counter_in, in, + counter_set_id)); break; case MLX5_CMD_OP_QUERY_FLOW_COUNTER: - obj_id = MLX5_GET(query_flow_counter_in, in, flow_counter_id); + obj_id = get_enc_obj_id(MLX5_CMD_OP_ALLOC_FLOW_COUNTER, + MLX5_GET(query_flow_counter_in, in, + flow_counter_id)); break; case MLX5_CMD_OP_QUERY_MODIFY_HEADER_CONTEXT: - obj_id = MLX5_GET(general_obj_in_cmd_hdr, in, obj_id); + obj_id = get_enc_obj_id(MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT, + MLX5_GET(general_obj_in_cmd_hdr, in, + obj_id)); break; case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT: - obj_id = MLX5_GET(query_scheduling_element_in, in, - scheduling_element_id); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT, + MLX5_GET(query_scheduling_element_in, + in, scheduling_element_id)); break; case MLX5_CMD_OP_MODIFY_SCHEDULING_ELEMENT: - obj_id = MLX5_GET(modify_scheduling_element_in, in, - scheduling_element_id); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT, + MLX5_GET(modify_scheduling_element_in, + in, scheduling_element_id)); break; case MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT: - obj_id = MLX5_GET(add_vxlan_udp_dport_in, in, vxlan_udp_port); + obj_id = get_enc_obj_id(MLX5_CMD_OP_ADD_VXLAN_UDP_DPORT, + MLX5_GET(add_vxlan_udp_dport_in, in, + vxlan_udp_port)); break; case MLX5_CMD_OP_QUERY_L2_TABLE_ENTRY: - obj_id = MLX5_GET(query_l2_table_entry_in, in, table_index); + obj_id = get_enc_obj_id(MLX5_CMD_OP_SET_L2_TABLE_ENTRY, + MLX5_GET(query_l2_table_entry_in, in, + table_index)); break; case MLX5_CMD_OP_SET_L2_TABLE_ENTRY: - obj_id = MLX5_GET(set_l2_table_entry_in, in, table_index); + obj_id = get_enc_obj_id(MLX5_CMD_OP_SET_L2_TABLE_ENTRY, + MLX5_GET(set_l2_table_entry_in, in, + table_index)); break; case MLX5_CMD_OP_QUERY_QP: - obj_id = MLX5_GET(query_qp_in, in, qpn); + 
obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(query_qp_in, in, qpn)); break; case MLX5_CMD_OP_RST2INIT_QP: - obj_id = MLX5_GET(rst2init_qp_in, in, qpn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(rst2init_qp_in, in, qpn)); break; case MLX5_CMD_OP_INIT2RTR_QP: - obj_id = MLX5_GET(init2rtr_qp_in, in, qpn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(init2rtr_qp_in, in, qpn)); break; case MLX5_CMD_OP_RTR2RTS_QP: - obj_id = MLX5_GET(rtr2rts_qp_in, in, qpn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(rtr2rts_qp_in, in, qpn)); break; case MLX5_CMD_OP_RTS2RTS_QP: - obj_id = MLX5_GET(rts2rts_qp_in, in, qpn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(rts2rts_qp_in, in, qpn)); break; case MLX5_CMD_OP_SQERR2RTS_QP: - obj_id = MLX5_GET(sqerr2rts_qp_in, in, qpn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(sqerr2rts_qp_in, in, qpn)); break; case MLX5_CMD_OP_2ERR_QP: - obj_id = MLX5_GET(qp_2err_in, in, qpn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(qp_2err_in, in, qpn)); break; case MLX5_CMD_OP_2RST_QP: - obj_id = MLX5_GET(qp_2rst_in, in, qpn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_QP, + MLX5_GET(qp_2rst_in, in, qpn)); break; case MLX5_CMD_OP_QUERY_DCT: - obj_id = MLX5_GET(query_dct_in, in, dctn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_DCT, + MLX5_GET(query_dct_in, in, dctn)); break; case MLX5_CMD_OP_QUERY_XRQ: - obj_id = MLX5_GET(query_xrq_in, in, xrqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_XRQ, + MLX5_GET(query_xrq_in, in, xrqn)); break; case MLX5_CMD_OP_QUERY_XRC_SRQ: - obj_id = MLX5_GET(query_xrc_srq_in, in, xrc_srqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_XRC_SRQ, + MLX5_GET(query_xrc_srq_in, in, + xrc_srqn)); break; case MLX5_CMD_OP_ARM_XRC_SRQ: - obj_id = MLX5_GET(arm_xrc_srq_in, in, xrc_srqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_XRC_SRQ, + MLX5_GET(arm_xrc_srq_in, in, xrc_srqn)); break; case MLX5_CMD_OP_QUERY_SRQ: - obj_id = MLX5_GET(query_srq_in, in, srqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_SRQ, + MLX5_GET(query_srq_in, in, srqn)); break; case MLX5_CMD_OP_ARM_RQ: - obj_id = MLX5_GET(arm_rq_in, in, srq_number); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_RQ, + MLX5_GET(arm_rq_in, in, srq_number)); break; case MLX5_CMD_OP_DRAIN_DCT: case MLX5_CMD_OP_ARM_DCT_FOR_KEY_VIOLATION: - obj_id = MLX5_GET(drain_dct_in, in, dctn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_DCT, + MLX5_GET(drain_dct_in, in, dctn)); break; case MLX5_CMD_OP_ARM_XRQ: - obj_id = MLX5_GET(arm_xrq_in, in, xrqn); + obj_id = get_enc_obj_id(MLX5_CMD_OP_CREATE_XRQ, + MLX5_GET(arm_xrq_in, in, xrqn)); break; default: return false; @@ -352,11 +420,11 @@ static void devx_set_umem_valid(const void *in) } } -static bool devx_is_obj_create_cmd(const void *in) +static bool devx_is_obj_create_cmd(const void *in, u16 *opcode) { - u16 opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); + *opcode = MLX5_GET(general_obj_in_cmd_hdr, in, opcode); - switch (opcode) { + switch (*opcode) { case MLX5_CMD_OP_CREATE_GENERAL_OBJECT: case MLX5_CMD_OP_CREATE_MKEY: case MLX5_CMD_OP_CREATE_CQ: @@ -854,12 +922,14 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)( struct devx_obj *obj; int err; int uid; + u32 obj_id; + u16 opcode; uid = devx_get_uid(c, cmd_in); if (uid < 0) return uid; - if (!devx_is_obj_create_cmd(cmd_in)) + if (!devx_is_obj_create_cmd(cmd_in, &opcode)) return -EINVAL; cmd_out = uverbs_zalloc(attrs, cmd_out_len); @@ -881,13 +951,15 @@ static int 
UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_CREATE)(
 	uobj->object = obj;
 	obj->mdev = dev->mdev;
-	devx_obj_build_destroy_cmd(cmd_in, cmd_out, obj->dinbox, &obj->dinlen, &obj->obj_id);
+	devx_obj_build_destroy_cmd(cmd_in, cmd_out, obj->dinbox, &obj->dinlen,
+				   &obj_id);
 	WARN_ON(obj->dinlen > MLX5_MAX_DESTROY_INBOX_SIZE_DW * sizeof(u32));
 
 	err = uverbs_copy_to(attrs, MLX5_IB_ATTR_DEVX_OBJ_CREATE_CMD_OUT, cmd_out, cmd_out_len);
 	if (err)
 		goto obj_destroy;
 
+	obj->obj_id = get_enc_obj_id(opcode, obj_id);
 	return 0;
 
 obj_destroy:

From 645ba5970c0766c610920fd5e8dd34e62766af5a Mon Sep 17 00:00:00 2001
From: Gal Pressman
Date: Mon, 8 Oct 2018 19:44:03 +0300
Subject: [PATCH 213/242] RDMA/mlx5: Remove extraneous error check

Remove the double error check from the create user RQ error flow.

Fixes: 79b20a6c3014 ("IB/mlx5: Add receive Work Queue verbs")
Signed-off-by: Gal Pressman
Reviewed-by: Majd Dibbiny
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/mlx5/qp.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index fa8e5dc65cb4..829aec3aba8f 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -5587,8 +5587,7 @@ static int prepare_user_rq(struct ib_pd *pd,
 	err = create_user_rq(dev, pd, rwq, &ucmd);
 	if (err) {
 		mlx5_ib_dbg(dev, "err %d\n", err);
-		if (err)
-			return err;
+		return err;
 	}
 
 	rwq->user_index = ucmd.user_index;

From ba4a41198324be2e6fbb06c270fdc8500c0e38de Mon Sep 17 00:00:00 2001
From: Mark Bloch
Date: Wed, 10 Oct 2018 09:55:10 +0300
Subject: [PATCH 214/242] RDMA/mlx5: Add support for flow tag to raw create
 flow

A user can provide a hint which will be attached to the packet and
written to the CQE on receive. This can be used as a way to offload
operations into the HW: for example, detecting that a packet is
tunneled and, if so, attaching 0x1 as the hint. The software can then
use that hint to decapsulate the packet and parse only the inner
headers, thus saving CPU cycles.
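As a rough user-space illustration (only the 24-bit width limit comes from this patch; the tag value and the receive handler below are made up), an application that programmed a rule with a tag could branch on the tag it reads back from the CQE:

#include <stdbool.h>
#include <stdint.h>

#define TUNNEL_HINT 0x1	/* hypothetical tag programmed into the rule */

/* The tag must fit in 24 bits, mirroring the flow_tag >= BIT(24) check. */
static bool flow_tag_valid(uint32_t tag)
{
	return tag < (1U << 24);
}

static void handle_rx(uint32_t cqe_flow_tag, void *pkt)
{
	(void)pkt;
	if (cqe_flow_tag == TUNNEL_HINT) {
		/* skip the outer headers, parse only the inner packet */
	}
}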
Signed-off-by: Mark Bloch Reviewed-by: Yishai Hadas Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/flow.c | 15 ++++++++++++++- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 1 + 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/flow.c b/drivers/infiniband/hw/mlx5/flow.c index 4ee4af450720..e57435cb6d96 100644 --- a/drivers/infiniband/hw/mlx5/flow.c +++ b/drivers/infiniband/hw/mlx5/flow.c @@ -153,6 +153,16 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)( arr_flow_actions[i]->object); } + ret = uverbs_copy_from(&flow_act.flow_tag, attrs, + MLX5_IB_ATTR_CREATE_FLOW_TAG); + if (!ret) { + if (flow_act.flow_tag >= BIT(24)) { + ret = -EINVAL; + goto err_out; + } + flow_act.has_flow_tag = true; + } + flow_handler = mlx5_ib_raw_fs_rule_add(dev, fs_matcher, &flow_act, cmd_in, inlen, dest_id, dest_type); @@ -513,7 +523,10 @@ DECLARE_UVERBS_NAMED_METHOD( UVERBS_OBJECT_FLOW_ACTION, UVERBS_ACCESS_READ, 1, MLX5_IB_CREATE_FLOW_MAX_FLOW_ACTIONS, - UA_OPTIONAL)); + UA_OPTIONAL), + UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_CREATE_FLOW_TAG, + UVERBS_ATTR_TYPE(u32), + UA_OPTIONAL)); DECLARE_UVERBS_NAMED_METHOD_DESTROY( MLX5_IB_METHOD_DESTROY_FLOW, diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index fb4a8b17cca8..408e220034de 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -157,6 +157,7 @@ enum mlx5_ib_create_flow_attrs { MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX, MLX5_IB_ATTR_CREATE_FLOW_MATCHER, MLX5_IB_ATTR_CREATE_FLOW_ARR_FLOW_ACTIONS, + MLX5_IB_ATTR_CREATE_FLOW_TAG, }; enum mlx5_ib_destoy_flow_attrs { From fe9bc1644918aa1d02a889b4ca788bfb67f90816 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 11 Oct 2018 22:10:10 +0300 Subject: [PATCH 215/242] RDMA/restrack: Protect from reentry to resource return path Nullify the resource task struct pointer to ensure that subsequent calls won't try to release task_struct again. ------------[ cut here ]------------ ODEBUG: free active (active state 1) object type: rcu_head hint: (null) WARNING: CPU: 0 PID: 6048 at lib/debugobjects.c:329 debug_print_object+0x16a/0x210 lib/debugobjects.c:326 Kernel panic - not syncing: panic_on_warn set ... 
CPU: 0 PID: 6048 Comm: syz-executor022 Not tainted 4.19.0-rc7-next-20181008+ #89 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x244/0x3ab lib/dump_stack.c:113 panic+0x238/0x4e7 kernel/panic.c:184 __warn.cold.8+0x163/0x1ba kernel/panic.c:536 report_bug+0x254/0x2d0 lib/bug.c:186 fixup_bug arch/x86/kernel/traps.c:178 [inline] do_error_trap+0x11b/0x200 arch/x86/kernel/traps.c:271 do_invalid_op+0x36/0x40 arch/x86/kernel/traps.c:290 invalid_op+0x14/0x20 arch/x86/entry/entry_64.S:969 RIP: 0010:debug_print_object+0x16a/0x210 lib/debugobjects.c:326 Code: 41 88 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 85 92 00 00 00 48 8b 14 dd 60 02 41 88 4c 89 fe 48 c7 c7 00 f8 40 88 e8 36 2f b4 fd <0f> 0b 83 05 a9 f4 5e 06 01 48 83 c4 18 5b 41 5c 41 5d 41 5e 41 5f RSP: 0018:ffff8801d8c3eda8 EFLAGS: 00010086 RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff8164d235 RDI: 0000000000000005 RBP: ffff8801d8c3ede8 R08: ffff8801d70aa280 R09: ffffed003b5c3eda R10: ffffed003b5c3eda R11: ffff8801dae1f6d7 R12: 0000000000000001 R13: ffffffff8939a760 R14: 0000000000000000 R15: ffffffff8840fca0 __debug_check_no_obj_freed lib/debugobjects.c:786 [inline] debug_check_no_obj_freed+0x3ae/0x58d lib/debugobjects.c:818 kmem_cache_free+0x202/0x290 mm/slab.c:3759 free_task_struct kernel/fork.c:163 [inline] free_task+0x16e/0x1f0 kernel/fork.c:457 __put_task_struct+0x2e6/0x620 kernel/fork.c:730 put_task_struct include/linux/sched/task.h:96 [inline] finish_task_switch+0x66c/0x900 kernel/sched/core.c:2715 context_switch kernel/sched/core.c:2834 [inline] __schedule+0x8d7/0x21d0 kernel/sched/core.c:3480 schedule+0xfe/0x460 kernel/sched/core.c:3524 freezable_schedule include/linux/freezer.h:172 [inline] futex_wait_queue_me+0x3f9/0x840 kernel/futex.c:2530 futex_wait+0x45c/0xa50 kernel/futex.c:2645 do_futex+0x31a/0x26d0 kernel/futex.c:3528 __do_sys_futex kernel/futex.c:3589 [inline] __se_sys_futex kernel/futex.c:3557 [inline] __x64_sys_futex+0x472/0x6a0 kernel/futex.c:3557 do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x446549 Code: e8 2c b3 02 00 48 83 c4 18 c3 0f 1f 80 00 00 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 2b 09 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f3a998f5da8 EFLAGS: 00000246 ORIG_RAX: 00000000000000ca RAX: ffffffffffffffda RBX: 00000000006dbc38 RCX: 0000000000446549 RDX: 0000000000000000 RSI: 0000000000000080 RDI: 00000000006dbc38 RBP: 00000000006dbc30 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 00000000006dbc3c R13: 2f646e6162696e69 R14: 666e692f7665642f R15: 00000000006dbd2c Kernel Offset: disabled Reported-by: syzbot+71aff6ea121ffefc280f@syzkaller.appspotmail.com Fixes: ed7a01fd3fd7 ("RDMA/restrack: Release task struct which was hold by CM_ID object") Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/restrack.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index 16b5f9949770..06d8657ce583 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -239,7 +239,9 @@ void rdma_restrack_del(struct rdma_restrack_entry *res) up_write(&dev->res.rwsem); out: - if (res->task) + if (res->task) { put_task_struct(res->task); + res->task = 
NULL; + } } EXPORT_SYMBOL(rdma_restrack_del); From 4d6e4d12da2c308f8f976d3955c45ee62539ac98 Mon Sep 17 00:00:00 2001 From: Denis Drozdov Date: Thu, 11 Oct 2018 22:33:57 +0300 Subject: [PATCH 216/242] IB/ipoib: Clear IPCB before icmp_send IPCB should be cleared before icmp_send, since it may contain data from previous layers and the data could be misinterpreted as ip header options, which later caused the ihl to be set to an invalid value and resulted in the following stack corruption: [ 1083.031512] ib0: packet len 57824 (> 2048) too long to send, dropping [ 1083.031843] ib0: packet len 37904 (> 2048) too long to send, dropping [ 1083.032004] ib0: packet len 4040 (> 2048) too long to send, dropping [ 1083.032253] ib0: packet len 63800 (> 2048) too long to send, dropping [ 1083.032481] ib0: packet len 23960 (> 2048) too long to send, dropping [ 1083.033149] ib0: packet len 63800 (> 2048) too long to send, dropping [ 1083.033439] ib0: packet len 63800 (> 2048) too long to send, dropping [ 1083.033700] ib0: packet len 63800 (> 2048) too long to send, dropping [ 1083.034124] ib0: packet len 63800 (> 2048) too long to send, dropping [ 1083.034387] ================================================================== [ 1083.034602] BUG: KASAN: stack-out-of-bounds in __ip_options_echo+0xf08/0x1310 [ 1083.034798] Write of size 4 at addr ffff880353457c5f by task kworker/u16:0/7 [ 1083.034990] [ 1083.035104] CPU: 7 PID: 7 Comm: kworker/u16:0 Tainted: G O 4.19.0-rc5+ #1 [ 1083.035316] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu2 04/01/2014 [ 1083.035573] Workqueue: ipoib_wq ipoib_cm_skb_reap [ib_ipoib] [ 1083.035750] Call Trace: [ 1083.035888] dump_stack+0x9a/0xeb [ 1083.036031] print_address_description+0xe3/0x2e0 [ 1083.036213] kasan_report+0x18a/0x2e0 [ 1083.036356] ? __ip_options_echo+0xf08/0x1310 [ 1083.036522] __ip_options_echo+0xf08/0x1310 [ 1083.036688] icmp_send+0x7b9/0x1cd0 [ 1083.036843] ? icmp_route_lookup.constprop.9+0x1070/0x1070 [ 1083.037018] ? netif_schedule_queue+0x5/0x200 [ 1083.037180] ? debug_show_all_locks+0x310/0x310 [ 1083.037341] ? rcu_dynticks_curr_cpu_in_eqs+0x85/0x120 [ 1083.037519] ? debug_locks_off+0x11/0x80 [ 1083.037673] ? debug_check_no_obj_freed+0x207/0x4c6 [ 1083.037841] ? check_flags.part.27+0x450/0x450 [ 1083.037995] ? debug_check_no_obj_freed+0xc3/0x4c6 [ 1083.038169] ? debug_locks_off+0x11/0x80 [ 1083.038318] ? skb_dequeue+0x10e/0x1a0 [ 1083.038476] ? ipoib_cm_skb_reap+0x2b5/0x650 [ib_ipoib] [ 1083.038642] ? netif_schedule_queue+0xa8/0x200 [ 1083.038820] ? ipoib_cm_skb_reap+0x544/0x650 [ib_ipoib] [ 1083.038996] ipoib_cm_skb_reap+0x544/0x650 [ib_ipoib] [ 1083.039174] process_one_work+0x912/0x1830 [ 1083.039336] ? wq_pool_ids_show+0x310/0x310 [ 1083.039491] ? lock_acquire+0x145/0x3a0 [ 1083.042312] worker_thread+0x87/0xbb0 [ 1083.045099] ? process_one_work+0x1830/0x1830 [ 1083.047865] kthread+0x322/0x3e0 [ 1083.050624] ? 
kthread_create_worker_on_cpu+0xc0/0xc0 [ 1083.053354] ret_from_fork+0x3a/0x50 For instance __ip_options_echo is failing to proceed with invalid srr and optlen passed from another layer via IPCB [ 762.139568] IPv4: __ip_options_echo rr=0 ts=0 srr=43 cipso=0 [ 762.139720] IPv4: ip_options_build: IPCB 00000000f3cd969e opt 000000002ccb3533 [ 762.139838] IPv4: __ip_options_echo in srr: optlen 197 soffset 84 [ 762.139852] IPv4: ip_options_build srr=0 is_frag=0 rr_needaddr=0 ts_needaddr=0 ts_needtime=0 rr=0 ts=0 [ 762.140269] ================================================================== [ 762.140713] IPv4: __ip_options_echo rr=0 ts=0 srr=0 cipso=0 [ 762.141078] BUG: KASAN: stack-out-of-bounds in __ip_options_echo+0x12ec/0x1680 [ 762.141087] Write of size 4 at addr ffff880353457c7f by task kworker/u16:0/7 Signed-off-by: Denis Drozdov Reviewed-by: Erez Shitrit Reviewed-by: Feras Daoud Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/ulp/ipoib/ipoib_cm.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 3d5424f335cb..0428e01e8f69 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -1438,11 +1438,15 @@ static void ipoib_cm_skb_reap(struct work_struct *work) spin_unlock_irqrestore(&priv->lock, flags); netif_tx_unlock_bh(dev); - if (skb->protocol == htons(ETH_P_IP)) + if (skb->protocol == htons(ETH_P_IP)) { + memset(IPCB(skb), 0, sizeof(*IPCB(skb))); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + } #if IS_ENABLED(CONFIG_IPV6) - else if (skb->protocol == htons(ETH_P_IPV6)) + else if (skb->protocol == htons(ETH_P_IPV6)) { + memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + } #endif dev_kfree_skb_any(skb); From e54b6a3bcd1ec972b25a164bdf495d9e7120b107 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 11 Oct 2018 22:36:10 +0300 Subject: [PATCH 217/242] RDMA/cm: Respect returned status of cm_init_av_by_path Add missing check for failure of cm_init_av_by_path Fixes: e1444b5a163e ("IB/cm: Fix automatic path migration support") Reported-by: Slava Shwartsman Reviewed-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cm.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index a6a20603ccea..edb2cb758be7 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -3292,8 +3292,11 @@ static int cm_lap_handler(struct cm_work *work) if (ret) goto unlock; - cm_init_av_by_path(param->alternate_path, NULL, &cm_id_priv->alt_av, - cm_id_priv); + ret = cm_init_av_by_path(param->alternate_path, NULL, + &cm_id_priv->alt_av, cm_id_priv); + if (ret) + goto unlock; + cm_id_priv->id.lap_state = IB_CM_LAP_RCVD; cm_id_priv->tid = lap_msg->hdr.tid; ret = atomic_inc_and_test(&cm_id_priv->work_count); From 013c2403bf32e48119aeb13126929f81352cc7ac Mon Sep 17 00:00:00 2001 From: Artemy Kovalyov Date: Mon, 15 Oct 2018 14:13:35 +0300 Subject: [PATCH 218/242] IB/mlx5: Fix MR cache initialization Schedule MR cache work only after bucket was initialized. 
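The underlying rule is the usual publish-after-init ordering for deferred work; a simplified sketch (not the driver code, with a generic handler type standing in for cache_work_func):

#include <linux/workqueue.h>

struct bucket {
	struct work_struct work;
	int limit;			/* read by the work handler */
};

static void bucket_init(struct workqueue_struct *wq, struct bucket *ent,
			int limit, work_func_t handler)
{
	INIT_WORK(&ent->work, handler);
	ent->limit = limit;		/* finish initializing first ... */
	queue_work(wq, &ent->work);	/* ... then publish; the handler may
					 * already run on another CPU */
}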
Cc: # 4.10
Fixes: 49780d42dfc9 ("IB/mlx5: Expose MR cache for mlx5_ib")
Signed-off-by: Artemy Kovalyov
Reviewed-by: Majd Dibbiny
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 drivers/infiniband/hw/mlx5/mr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
index 02346d6bcc50..9b195d65a13e 100644
--- a/drivers/infiniband/hw/mlx5/mr.c
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -691,7 +691,6 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 		init_completion(&ent->compl);
 		INIT_WORK(&ent->work, cache_work_func);
 		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
-		queue_work(cache->wq, &ent->work);
 
 		if (i > MR_CACHE_LAST_STD_ENTRY) {
 			mlx5_odp_init_mr_cache_entry(ent);
@@ -711,6 +710,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
 			ent->limit = dev->mdev->profile->mr_cache[i].limit;
 		else
 			ent->limit = 0;
+		queue_work(cache->wq, &ent->work);
 	}
 
 	err = mlx5_mr_cache_debugfs_init(dev);

From d6f9125207902ace40d36d6571cda251b43a8f95 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Thu, 11 Oct 2018 17:30:03 +0300
Subject: [PATCH 219/242] RDMA/cma: Remove unused timeout_ms parameter from
 cma_resolve_iw_route()

cma_resolve_iw_route() doesn't use the timeout_ms parameter, so let's
remove it.

Signed-off-by: Leon Romanovsky
Signed-off-by: Doug Ledford
---
 drivers/infiniband/core/cma.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 5c7e3bafdd4a..1156cb911a5c 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -2751,7 +2751,7 @@ err:
 }
 EXPORT_SYMBOL(rdma_set_ib_path);
 
-static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms)
+static int cma_resolve_iw_route(struct rdma_id_private *id_priv)
 {
 	struct cma_work *work;
 
@@ -2867,7 +2867,7 @@ int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
 	else if (rdma_protocol_roce(id->device, id->port_num))
 		ret = cma_resolve_iboe_route(id_priv);
 	else if (rdma_protocol_iwarp(id->device, id->port_num))
-		ret = cma_resolve_iw_route(id_priv, timeout_ms);
+		ret = cma_resolve_iw_route(id_priv);
 	else
 		ret = -ENOSYS;

From 9549c2bd094f0f54b8827d64886f5b1de370dff3 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Thu, 11 Oct 2018 17:30:04 +0300
Subject: [PATCH 220/242] RDMA/core: Align multiple functions to kernel coding
 style

This patch aligns a small number of functions to the kernel coding
style. It is needed to minimize the diffstat of the following patch.
It doesn't change any functionality.
Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/addr.c | 3 +-- drivers/infiniband/core/sa.h | 10 ++++------ include/rdma/ib_addr.h | 3 +-- include/rdma/ib_sa.h | 36 ++++++++++++++-------------------- 4 files changed, 21 insertions(+), 31 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 07e0ffe74a8a..b6f7cde36c2d 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -662,8 +662,7 @@ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), - bool resolve_by_gid_attr, - void *context) + bool resolve_by_gid_attr, void *context) { struct sockaddr *src_in, *dst_in; struct addr_req *req; diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h index b1d4bbf4ce5c..57d4496f6720 100644 --- a/drivers/infiniband/core/sa.h +++ b/drivers/infiniband/core/sa.h @@ -49,16 +49,14 @@ static inline void ib_sa_client_put(struct ib_sa_client *client) } int ib_sa_mcmember_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - u8 method, + struct ib_device *device, u8 port_num, u8 method, struct ib_sa_mcmember_rec *rec, - ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + ib_sa_comp_mask comp_mask, int timeout_ms, + gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_mcmember_rec *resp, void *context), - void *context, - struct ib_sa_query **sa_query); + void *context, struct ib_sa_query **sa_query); int mcast_init(void); void mcast_cleanup(void); diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index e09eca91eb18..eebbe63b530c 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -102,8 +102,7 @@ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, struct rdma_dev_addr *addr, int timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), - bool resolve_by_gid_attr, - void *context); + bool resolve_by_gid_attr, void *context); void rdma_addr_cancel(struct rdma_dev_addr *addr); diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index b6ddf2a1b9d8..95ce625a49e3 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -449,28 +449,23 @@ struct ib_sa_query; void ib_sa_cancel_query(int id, struct ib_sa_query *query); -int ib_sa_path_rec_get(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - struct sa_path_rec *rec, - ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct sa_path_rec *resp, +int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, + u8 port_num, struct sa_path_rec *rec, + ib_sa_comp_mask comp_mask, int timeout_ms, + gfp_t gfp_mask, + void (*callback)(int status, struct sa_path_rec *resp, void *context), - void *context, - struct ib_sa_query **query); + void *context, struct ib_sa_query **query); int ib_sa_service_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - u8 method, - struct ib_sa_service_rec *rec, - ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct ib_sa_service_rec *resp, - void *context), - void *context, - struct ib_sa_query **sa_query); + struct ib_device *device, u8 port_num, u8 method, + struct ib_sa_service_rec *rec, + ib_sa_comp_mask comp_mask, int timeout_ms, + 
gfp_t gfp_mask, + void (*callback)(int status, + struct ib_sa_service_rec *resp, + void *context), + void *context, struct ib_sa_query **sa_query); struct ib_sa_multicast { struct ib_sa_mcmember_rec rec; @@ -577,8 +572,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, void (*callback)(int status, struct ib_sa_guidinfo_rec *resp, void *context), - void *context, - struct ib_sa_query **sa_query); + void *context, struct ib_sa_query **sa_query); bool ib_sa_sendonly_fullmem_support(struct ib_sa_client *client, struct ib_device *device, From dbace111e5b320682eee63d7173959a2b2bd9ccb Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Thu, 11 Oct 2018 17:30:05 +0300 Subject: [PATCH 221/242] RDMA/core: Annotate timeout as unsigned long The ucma users supply timeout in u32 format, it means that any number with most significant bit set will be converted to negative value by various rdma_*, cma_* and sa_query functions, which treat timeout as int. In the lowest level, the timeout is converted back to be unsigned long. Remove this ambiguous conversion by updating all function signatures to receive unsigned long. Reported-by: Noa Osherovich Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/addr.c | 2 +- drivers/infiniband/core/cma.c | 11 ++++++----- drivers/infiniband/core/mad.c | 2 +- drivers/infiniband/core/mad_priv.h | 2 +- drivers/infiniband/core/sa.h | 4 ++-- drivers/infiniband/core/sa_query.c | 13 +++++++------ include/rdma/ib_addr.h | 2 +- include/rdma/ib_cm.h | 2 +- include/rdma/ib_sa.h | 6 +++--- include/rdma/rdma_cm.h | 5 +++-- 10 files changed, 26 insertions(+), 23 deletions(-) diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index b6f7cde36c2d..0dce94e3c495 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -659,7 +659,7 @@ static void process_one_req(struct work_struct *_work) } int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, - struct rdma_dev_addr *addr, int timeout_ms, + struct rdma_dev_addr *addr, unsigned long timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), bool resolve_by_gid_attr, void *context) diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 1156cb911a5c..15d5bb7bf6bb 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -2510,8 +2510,8 @@ static void cma_query_handler(int status, struct sa_path_rec *path_rec, queue_work(cma_wq, &work->work); } -static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms, - struct cma_work *work) +static int cma_query_ib_route(struct rdma_id_private *id_priv, + unsigned long timeout_ms, struct cma_work *work) { struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; struct sa_path_rec path_rec; @@ -2629,7 +2629,8 @@ static void cma_init_resolve_addr_work(struct cma_work *work, work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED; } -static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms) +static int cma_resolve_ib_route(struct rdma_id_private *id_priv, + unsigned long timeout_ms) { struct rdma_route *route = &id_priv->id.route; struct cma_work *work; @@ -2852,7 +2853,7 @@ err1: return ret; } -int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms) +int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms) { struct rdma_id_private *id_priv; int ret; @@ -3072,7 +3073,7 @@ static int cma_bind_addr(struct rdma_cm_id *id, 
struct sockaddr *src_addr, } int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - const struct sockaddr *dst_addr, int timeout_ms) + const struct sockaddr *dst_addr, unsigned long timeout_ms) { struct rdma_id_private *id_priv; int ret; diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index c355379e7534..d7025cd5be28 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -2414,7 +2414,7 @@ static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr) } void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, - int timeout_ms) + unsigned long timeout_ms) { mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); wait_for_response(mad_send_wr); diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index d84ae1671898..216509036aa8 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -221,6 +221,6 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr); void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr, - int timeout_ms); + unsigned long timeout_ms); #endif /* __IB_MAD_PRIV_H__ */ diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h index 57d4496f6720..cbaaaa92fff3 100644 --- a/drivers/infiniband/core/sa.h +++ b/drivers/infiniband/core/sa.h @@ -51,8 +51,8 @@ static inline void ib_sa_client_put(struct ib_sa_client *client) int ib_sa_mcmember_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, u8 method, struct ib_sa_mcmember_rec *rec, - ib_sa_comp_mask comp_mask, int timeout_ms, - gfp_t gfp_mask, + ib_sa_comp_mask comp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_mcmember_rec *resp, void *context), diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index f28f6fdb78cb..be5ba5e15496 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -1360,7 +1360,8 @@ static void init_mad(struct ib_sa_query *query, struct ib_mad_agent *agent) spin_unlock_irqrestore(&tid_lock, flags); } -static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask) +static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms, + gfp_t gfp_mask) { bool preload = gfpflags_allow_blocking(gfp_mask); unsigned long flags; @@ -1550,7 +1551,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct sa_path_rec *rec, ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct sa_path_rec *resp, void *context), @@ -1704,7 +1705,7 @@ int ib_sa_service_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, u8 method, struct ib_sa_service_rec *rec, ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_service_rec *resp, void *context), @@ -1801,7 +1802,7 @@ int ib_sa_mcmember_rec_query(struct ib_sa_client *client, u8 method, struct ib_sa_mcmember_rec *rec, ib_sa_comp_mask comp_mask, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_mcmember_rec *resp, void *context), @@ -1892,7 +1893,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 
port_num, struct ib_sa_guidinfo_rec *rec, ib_sa_comp_mask comp_mask, u8 method, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_guidinfo_rec *resp, void *context), @@ -2059,7 +2060,7 @@ static void ib_sa_classport_info_rec_release(struct ib_sa_query *sa_query) } static int ib_sa_classport_info_rec_query(struct ib_sa_port *port, - int timeout_ms, + unsigned long timeout_ms, void (*callback)(void *context), void *context, struct ib_sa_query **sa_query) diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index eebbe63b530c..2734c895c1bf 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -99,7 +99,7 @@ int rdma_translate_ip(const struct sockaddr *addr, * @context: User-specified context associated with the call. */ int rdma_resolve_ip(struct sockaddr *src_addr, const struct sockaddr *dst_addr, - struct rdma_dev_addr *addr, int timeout_ms, + struct rdma_dev_addr *addr, unsigned long timeout_ms, void (*callback)(int status, struct sockaddr *src_addr, struct rdma_dev_addr *addr, void *context), bool resolve_by_gid_attr, void *context); diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h index c10f4b5ea8ab..49f4f75499b3 100644 --- a/include/rdma/ib_cm.h +++ b/include/rdma/ib_cm.h @@ -583,7 +583,7 @@ struct ib_cm_sidr_req_param { struct sa_path_rec *path; const struct ib_gid_attr *sgid_attr; __be64 service_id; - int timeout_ms; + unsigned long timeout_ms; const void *private_data; u8 private_data_len; u8 max_cm_retries; diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index 95ce625a49e3..19520979b84c 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -451,7 +451,7 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query); int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct sa_path_rec *rec, - ib_sa_comp_mask comp_mask, int timeout_ms, + ib_sa_comp_mask comp_mask, unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct sa_path_rec *resp, void *context), @@ -460,7 +460,7 @@ int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, int ib_sa_service_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, u8 method, struct ib_sa_service_rec *rec, - ib_sa_comp_mask comp_mask, int timeout_ms, + ib_sa_comp_mask comp_mask, unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_service_rec *resp, @@ -568,7 +568,7 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, struct ib_device *device, u8 port_num, struct ib_sa_guidinfo_rec *rec, ib_sa_comp_mask comp_mask, u8 method, - int timeout_ms, gfp_t gfp_mask, + unsigned long timeout_ms, gfp_t gfp_mask, void (*callback)(int status, struct ib_sa_guidinfo_rec *resp, void *context), diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index 53d93c7d8e01..60987a5903b7 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -196,7 +196,8 @@ int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr); * @timeout_ms: Time to wait for resolution to complete. 
*/ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, - const struct sockaddr *dst_addr, int timeout_ms); + const struct sockaddr *dst_addr, + unsigned long timeout_ms); /** * rdma_resolve_route - Resolve the RDMA address bound to the RDMA identifier @@ -206,7 +207,7 @@ int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr, * Users must have first called rdma_resolve_addr to resolve a dst_addr * into an RDMA address before calling this routine. */ -int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms); +int rdma_resolve_route(struct rdma_cm_id *id, unsigned long timeout_ms); /** * rdma_create_qp - Allocate a QP and associate it with the specified RDMA From d21943dd19b5c79dc09bb0e8bf80cd5ee09c41c2 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 10 Oct 2018 09:19:11 +0300 Subject: [PATCH 222/242] RDMA/core: Implement IB device rename function Generic implementation of IB device rename function. Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/core_priv.h | 1 + drivers/infiniband/core/device.c | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index d7399d5b1cb6..c5881756b799 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -87,6 +87,7 @@ int ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)); void ib_device_unregister_sysfs(struct ib_device *device); +int ib_device_rename(struct ib_device *ibdev, const char *name); typedef void (*roce_netdev_callback)(struct ib_device *device, u8 port, struct net_device *idev, void *cookie); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index d105b9b2d118..5e70f5e1cfd9 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -171,6 +171,31 @@ static struct ib_device *__ib_device_get_by_name(const char *name) return NULL; } +int ib_device_rename(struct ib_device *ibdev, const char *name) +{ + struct ib_device *device; + int ret = 0; + + if (!strcmp(name, dev_name(&ibdev->dev))) + return ret; + + mutex_lock(&device_mutex); + list_for_each_entry(device, &device_list, core_list) { + if (!strcmp(name, dev_name(&device->dev))) { + ret = -EEXIST; + goto out; + } + } + + ret = device_rename(&ibdev->dev, name); + if (ret) + goto out; + strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX); +out: + mutex_unlock(&device_mutex); + return ret; +} + static int alloc_name(struct ib_device *ibdev, const char *name) { unsigned long *inuse; From 05d940d3a3ec4e6d5d6a726aae4d73c5c64603c6 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 10 Oct 2018 09:19:12 +0300 Subject: [PATCH 223/242] RDMA/nldev: Allow IB device rename through RDMA netlink Provide an option to rename IB device name through RDMA netlink and limit it to users with ADMIN capability only. 
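A user-space sketch of driving the new command over raw netlink (assumptions: the UAPI definitions from this series are available, the device to rename has index 0, error handling is omitted; CAP_NET_ADMIN is required because of RDMA_NL_ADMIN_PERM):

#include <linux/netlink.h>
#include <rdma/rdma_netlink.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct sockaddr_nl dst = { .nl_family = AF_NETLINK };
	struct {
		struct nlmsghdr nlh;
		char payload[64];
	} req;
	struct nlattr *na;
	const char *name = "rdma0";
	int fd, len = 0;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_RDMA);

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_type = RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
					      RDMA_NLDEV_CMD_SET);
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;

	/* u32 device index attribute */
	na = (struct nlattr *)(req.payload + len);
	na->nla_type = RDMA_NLDEV_ATTR_DEV_INDEX;
	na->nla_len = NLA_HDRLEN + sizeof(__u32);
	*(__u32 *)((char *)na + NLA_HDRLEN) = 0;
	len += NLA_ALIGN(na->nla_len);

	/* string attribute carrying the new name */
	na = (struct nlattr *)(req.payload + len);
	na->nla_type = RDMA_NLDEV_ATTR_DEV_NAME;
	na->nla_len = NLA_HDRLEN + strlen(name) + 1;
	memcpy((char *)na + NLA_HDRLEN, name, strlen(name) + 1);
	len += NLA_ALIGN(na->nla_len);

	req.nlh.nlmsg_len = NLMSG_LENGTH(len);
	sendto(fd, &req, req.nlh.nlmsg_len, 0,
	       (struct sockaddr *)&dst, sizeof(dst));
	close(fd);
	return 0;
}

Newer iproute2 releases front the same command with the rdma tool (assuming a version that knows about SET): rdma dev set mlx5_0 name rdma0.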
Signed-off-by: Leon Romanovsky
Reviewed-by: Parav Pandit
Signed-off-by: Doug Ledford
---
 drivers/infiniband/core/nldev.c  | 34 ++++++++++++++++++++++++++++++++
 include/uapi/rdma/rdma_netlink.h |  3 ++-
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c
index ba5403fbcd88..573399e3ccc1 100644
--- a/drivers/infiniband/core/nldev.c
+++ b/drivers/infiniband/core/nldev.c
@@ -646,6 +646,36 @@ err:
 	return err;
 }
 
+static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+			  struct netlink_ext_ack *extack)
+{
+	struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+	struct ib_device *device;
+	u32 index;
+	int err;
+
+	err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1, nldev_policy,
+			  extack);
+	if (err || !tb[RDMA_NLDEV_ATTR_DEV_INDEX])
+		return -EINVAL;
+
+	index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
+	device = ib_device_get_by_index(index);
+	if (!device)
+		return -EINVAL;
+
+	if (tb[RDMA_NLDEV_ATTR_DEV_NAME]) {
+		char name[IB_DEVICE_NAME_MAX] = {};
+
+		nla_strlcpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME],
+			    IB_DEVICE_NAME_MAX);
+		err = ib_device_rename(device, name);
+	}
+
+	put_device(&device->dev);
+	return err;
+}
+
 static int _nldev_get_dumpit(struct ib_device *device,
 			     struct sk_buff *skb,
 			     struct netlink_callback *cb,
@@ -1078,6 +1108,10 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
 		.doit = nldev_get_doit,
 		.dump = nldev_get_dumpit,
 	},
+	[RDMA_NLDEV_CMD_SET] = {
+		.doit = nldev_set_doit,
+		.flags = RDMA_NL_ADMIN_PERM,
+	},
 	[RDMA_NLDEV_CMD_PORT_GET] = {
 		.doit = nldev_port_get_doit,
 		.dump = nldev_port_get_dumpit,

diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index edba6351ac13..f9c41bf59efc 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -227,8 +227,9 @@ enum rdma_nldev_command {
 	RDMA_NLDEV_CMD_UNSPEC,
 
 	RDMA_NLDEV_CMD_GET, /* can dump */
+	RDMA_NLDEV_CMD_SET,
 
-	/* 2 - 4 are free to use */
+	/* 3 - 4 are free to use */
 	RDMA_NLDEV_CMD_PORT_GET = 5, /* can dump */

From 60f1fc204940fc0352394866b2af0987fd5b2563 Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Sun, 7 Oct 2018 12:12:39 +0300
Subject: [PATCH 224/242] IB/mlx4: Refer to the device kobject instead of
 ports_parent

The iov sysfs tree is created under the ib device at
/sys/class/infiniband/mlx4_0/iov, and ibdev->ports_parent->parent =
&ibdev->dev. Therefore, refer to the device's kobject directly instead
of accessing it indirectly.

Additionally, the iov entries are created under the device kobject and
deleted before the device is removed, so there is no need to hold an
additional reference to the device kobject in the provider driver.
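The lifetime rule behind this, as a simplified sketch (not the mlx4 code): a child kobject takes its own reference on its parent, so the creator needs no extra kobject_get() as long as the child is removed before the parent goes away.

#include <linux/kobject.h>

static struct kobject *add_iov_dir(struct kobject *device_kobj)
{
	/*
	 * kobject_create_and_add() grabs its own reference on the parent
	 * and drops it when the child is released, so no explicit
	 * kobject_get(device_kobj) is needed here.
	 */
	return kobject_create_and_add("iov", device_kobj);
}

static void remove_iov_dir(struct kobject *iov)
{
	kobject_put(iov);	/* must run before the parent is unregistered */
}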
Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/sysfs.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c index e219093d2764..752bdd536130 100644 --- a/drivers/infiniband/hw/mlx4/sysfs.c +++ b/drivers/infiniband/hw/mlx4/sysfs.c @@ -818,9 +818,7 @@ int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *dev) if (!mlx4_is_master(dev->dev)) return 0; - dev->iov_parent = - kobject_create_and_add("iov", - kobject_get(dev->ib_dev.ports_parent->parent)); + dev->iov_parent = kobject_create_and_add("iov", &dev->ib_dev.dev.kobj); if (!dev->iov_parent) { ret = -ENOMEM; goto err; @@ -850,7 +848,6 @@ err_add_entries: err_ports: kobject_put(dev->iov_parent); err: - kobject_put(dev->ib_dev.ports_parent->parent); pr_err("mlx4_ib_device_register_sysfs error (%d)\n", ret); return ret; } @@ -886,5 +883,4 @@ void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device) kobject_put(device->ports_parent); kobject_put(device->iov_parent); kobject_put(device->iov_parent); - kobject_put(device->ib_dev.ports_parent->parent); } From 0f6ef65d1c6ec8deb5d0f11f86631ec4cfe8f22e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 7 Oct 2018 12:12:40 +0300 Subject: [PATCH 225/242] RDMA/core: Do not expose unsupported counters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the provider driver (such as rdma_rxe) doesn't support pma counters, avoid exposing its directory similar to optional hw_counters directory. If core fails to read the PMA counter, return an error so that user can retry later if needed. Fixes: 35c4cbb17811 ("IB/core: Create get_perf_mad function in sysfs.c") Reported-by: Holger Hoffstätte Tested-by: Holger Hoffstätte Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/sysfs.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index bc947a863b34..107c8ba2046c 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -512,7 +512,7 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, ret = get_perf_mad(p->ibdev, p->port_num, tab_attr->attr_id, &data, 40 + offset / 8, sizeof(data)); if (ret < 0) - return sprintf(buf, "N/A (no PMA)\n"); + return ret; switch (width) { case 4: @@ -1057,10 +1057,12 @@ static int add_port(struct ib_device *device, int port_num, goto err_put; } - p->pma_table = get_counter_table(device, port_num); - ret = sysfs_create_group(&p->kobj, p->pma_table); - if (ret) - goto err_put_gid_attrs; + if (device->process_mad) { + p->pma_table = get_counter_table(device, port_num); + ret = sysfs_create_group(&p->kobj, p->pma_table); + if (ret) + goto err_put_gid_attrs; + } p->gid_group.name = "gids"; p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len); @@ -1173,7 +1175,8 @@ err_free_gid: p->gid_group.attrs = NULL; err_remove_pma: - sysfs_remove_group(&p->kobj, p->pma_table); + if (p->pma_table) + sysfs_remove_group(&p->kobj, p->pma_table); err_put_gid_attrs: kobject_put(&p->gid_attr_group->kobj); @@ -1289,7 +1292,9 @@ static void free_port_list_attributes(struct ib_device *device) kfree(port->hw_stats); free_hsag(&port->kobj, port->hw_stats_ag); } - sysfs_remove_group(p, port->pma_table); + + if (port->pma_table) + sysfs_remove_group(p, 
port->pma_table); sysfs_remove_group(p, &port->pkey_group); sysfs_remove_group(p, &port->gid_group); sysfs_remove_group(&port->gid_attr_group->kobj, From 1ae4cfa03902c83d1d77123e5ac8f0812c61b90e Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Sun, 7 Oct 2018 12:12:41 +0300 Subject: [PATCH 226/242] RDMA/core: Rename ports_parent to ports_kobj Normally kobj objects have kobj suffix to reflect it. Rename ports_parent to ports_kobj. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/sysfs.c | 9 ++++----- include/rdma/ib_verbs.h | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index 107c8ba2046c..f54f107ef668 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -1036,7 +1036,7 @@ static int add_port(struct ib_device *device, int port_num, p->port_num = port_num; ret = kobject_init_and_add(&p->kobj, &port_type, - device->ports_parent, + device->ports_kobj, "%d", port_num); if (ret) { kfree(p); @@ -1305,7 +1305,7 @@ static void free_port_list_attributes(struct ib_device *device) kobject_put(p); } - kobject_put(device->ports_parent); + kobject_put(device->ports_kobj); } int ib_device_register_sysfs(struct ib_device *device, @@ -1323,9 +1323,8 @@ int ib_device_register_sysfs(struct ib_device *device, if (ret) goto err; - device->ports_parent = kobject_create_and_add("ports", - &class_dev->kobj); - if (!device->ports_parent) { + device->ports_kobj = kobject_create_and_add("ports", &class_dev->kobj); + if (!device->ports_kobj) { ret = -ENOMEM; goto err_put; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 7ce617d77f8f..7d732cf87886 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2542,7 +2542,7 @@ struct ib_device { /* First group for device attributes, NULL terminated array */ const struct attribute_group *groups[2]; - struct kobject *ports_parent; + struct kobject *ports_kobj; struct list_head port_list; enum { From 0a094ff0619607a36c18d07a81d2ea62e6cdf074 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5kon=20Bugge?= Date: Tue, 9 Oct 2018 15:27:39 +0200 Subject: [PATCH 227/242] IB/mlx4: Enable debug print of SMPs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IB Subnet Management Packets (SMPs) were excluded from debug prints. Fixed by enabling print even on QP0 MADs. Signed-off-by: Håkon Bugge Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx4/mad.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c index e5466d786bb1..1718715cc094 100644 --- a/drivers/infiniband/hw/mlx4/mad.c +++ b/drivers/infiniband/hw/mlx4/mad.c @@ -807,7 +807,7 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num, int err; struct ib_port_attr pattr; - if (in_wc && in_wc->qp->qp_num) { + if (in_wc && in_wc->qp) { pr_debug("received MAD: slid:%d sqpn:%d " "dlid_bits:%d dqpn:%d wc_flags:0x%x, cls %x, mtd %x, atr %x\n", in_wc->slid, in_wc->src_qp, From b4c542df5ae83f079f867011a8f7721fb8a1f44c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?H=C3=A5kon=20Bugge?= Date: Tue, 9 Oct 2018 15:27:40 +0200 Subject: [PATCH 228/242] IB/mlx4: Add port and TID to MAD debug print MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add said information and make the debug print format consistent. 
Signed-off-by: Håkon Bugge
Acked-by: Leon Romanovsky
Signed-off-by: Doug Ledford
---
 drivers/infiniband/hw/mlx4/mad.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 1718715cc094..8942f5f7f04d 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -808,14 +808,16 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
 	struct ib_port_attr pattr;
 
 	if (in_wc && in_wc->qp) {
-		pr_debug("received MAD: slid:%d sqpn:%d "
-			 "dlid_bits:%d dqpn:%d wc_flags:0x%x, cls %x, mtd %x, atr %x\n",
-			 in_wc->slid, in_wc->src_qp,
-			 in_wc->dlid_path_bits,
-			 in_wc->qp->qp_num,
-			 in_wc->wc_flags,
-			 in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method,
-			 be16_to_cpu(in_mad->mad_hdr.attr_id));
+		pr_debug("received MAD: port:%d slid:%d sqpn:%d "
+			 "dlid_bits:%d dqpn:%d wc_flags:0x%x tid:%016llx cls:%x mtd:%x atr:%x\n",
+			 port_num,
+			 in_wc->slid, in_wc->src_qp,
+			 in_wc->dlid_path_bits,
+			 in_wc->qp->qp_num,
+			 in_wc->wc_flags,
+			 be64_to_cpu(in_mad->mad_hdr.tid),
+			 in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method,
+			 be16_to_cpu(in_mad->mad_hdr.attr_id));
 
 		if (in_wc->wc_flags & IB_WC_GRH) {
 			pr_debug("sgid_hi:0x%016llx sgid_lo:0x%016llx\n",
 				 be64_to_cpu(in_grh->sgid.global.subnet_prefix),

From 7d65cbf0b0ac7d7eebf397ff9af6645b2b3004c2 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Tue, 2 Oct 2018 11:13:28 +0300
Subject: [PATCH 229/242] RDMA/core: Increase total number of RDMA ports
 across all devices

IDA adds overhead to store the IDs bitmap; the maximal value an IDA can
hold is up to 2099202 (IDA_MAX = 0x80000000U / IDA_BITMAP_BITS - 1).
However, there is no need to support such an enormous number of
devices, and it is enough for now to limit it to 8192.

Signed-off-by: Leon Romanovsky
Signed-off-by: Doug Ledford
---
 drivers/infiniband/core/core_priv.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index c5881756b799..bb9007a0cca7 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -44,7 +44,7 @@
 #include "mad_priv.h"
 
 /* Total number of ports combined across all struct ib_devices's */
-#define RDMA_MAX_PORTS 1024
+#define RDMA_MAX_PORTS 8192
 
 struct pkey_index_qp_list {
 	struct list_head pkey_index_list;

From 90f6e41cc03a4055d56e94ad7c97df4b1add7f61 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Tue, 2 Oct 2018 11:13:29 +0300
Subject: [PATCH 230/242] RDMA/uverbs: Use kernel API to allocate uverbs
 indexes

Replace the custom index-allocation code with the generic kernel IDA
API.

Signed-off-by: Leon Romanovsky
Signed-off-by: Doug Ledford
---
 drivers/infiniband/core/uverbs_main.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
index 4258cbd55ed7..6d373f5515b7 100644
--- a/drivers/infiniband/core/uverbs_main.c
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -73,7 +73,7 @@ enum {
 static dev_t dynamic_uverbs_dev;
 static struct class *uverbs_class;
 
-static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
+static DEFINE_IDA(uverbs_ida);
 
 static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
 				     const char __user *buf, int in_len,
@@ -1268,11 +1268,11 @@ static void ib_uverbs_add_one(struct ib_device *device)
 	rcu_assign_pointer(uverbs_dev->ib_dev, device);
 	uverbs_dev->num_comp_vectors = device->num_comp_vectors;
 
-	devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
-	if (devnum >= IB_UVERBS_MAX_DEVICES)
+	devnum = ida_alloc_max(&uverbs_ida, IB_UVERBS_MAX_DEVICES - 1,
+			       GFP_KERNEL);
+	if (devnum < 0)
 		goto err;
 	uverbs_dev->devnum = devnum;
-	set_bit(devnum, dev_map);
 	if (devnum >= IB_UVERBS_NUM_FIXED_MINOR)
 		base = dynamic_uverbs_dev + devnum - IB_UVERBS_NUM_FIXED_MINOR;
 	else
@@ -1296,7 +1296,7 @@ static void ib_uverbs_add_one(struct ib_device *device)
 	return;
 
 err_uapi:
-	clear_bit(devnum, dev_map);
+	ida_free(&uverbs_ida, devnum);
 err:
 	if (atomic_dec_and_test(&uverbs_dev->refcount))
 		ib_uverbs_comp_dev(uverbs_dev);
@@ -1371,7 +1371,7 @@ static void ib_uverbs_remove_one(struct ib_device *device, void *client_data)
 		return;
 
 	cdev_device_del(&uverbs_dev->cdev, &uverbs_dev->dev);
-	clear_bit(uverbs_dev->devnum, dev_map);
+	ida_free(&uverbs_ida, uverbs_dev->devnum);
 
 	if (device->disassociate_ucontext) {
 		/* We disassociate HW resources and immediately return.

From 551d315e34a5e6961f8deaf2d6f37ad24fccaa08 Mon Sep 17 00:00:00 2001
From: Leon Romanovsky
Date: Tue, 2 Oct 2018 11:13:30 +0300
Subject: [PATCH 231/242] RDMA/umad: Use kernel API to allocate umad indexes

Replace the custom index-allocation code with the generic kernel IDA
API.
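For reference, the allocation pattern that this patch and the previous
one adopt looks roughly as follows; the names below are made up for
illustration and do not appear in the tree.

    #include <linux/idr.h>

    #define EXAMPLE_MAX_DEVICES 8192	/* arbitrary bound for the sketch */

    static DEFINE_IDA(example_ida);

    /* Allocate the lowest free index in [0, EXAMPLE_MAX_DEVICES - 1].
     * Returns the index on success or a negative errno on failure.
     */
    static int example_alloc_index(void)
    {
            return ida_alloc_max(&example_ida, EXAMPLE_MAX_DEVICES - 1,
                                 GFP_KERNEL);
    }

    /* Return a previously allocated index to the pool. */
    static void example_free_index(int index)
    {
            ida_free(&example_ida, index);
    }

Compared with the open-coded bitmap (find_first_zero_bit(), set_bit()
and clear_bit()), the IDA manages its own storage and locking and
signals exhaustion with a negative errno, which is why the converted
callers simply check for a negative return value.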
Signed-off-by: Leon Romanovsky
Signed-off-by: Doug Ledford
---
 drivers/infiniband/core/user_mad.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
index 9961859da06a..f55f48f6b272 100644
--- a/drivers/infiniband/core/user_mad.c
+++ b/drivers/infiniband/core/user_mad.c
@@ -138,7 +138,7 @@ static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) +
 static dev_t dynamic_umad_dev;
 static dev_t dynamic_issm_dev;
 
-static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
+static DEFINE_IDA(umad_ida);
 
 static void ib_umad_add_one(struct ib_device *device);
 static void ib_umad_remove_one(struct ib_device *device, void *client_data);
@@ -1159,11 +1159,10 @@ static int ib_umad_init_port(struct ib_device *device, int port_num,
 	dev_t base_umad;
 	dev_t base_issm;
 
-	devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
-	if (devnum >= IB_UMAD_MAX_PORTS)
+	devnum = ida_alloc_max(&umad_ida, IB_UMAD_MAX_PORTS - 1, GFP_KERNEL);
+	if (devnum < 0)
 		return -1;
 	port->dev_num = devnum;
-	set_bit(devnum, dev_map);
 	if (devnum >= IB_UMAD_NUM_FIXED_MINOR) {
 		base_umad = dynamic_umad_dev + devnum - IB_UMAD_NUM_FIXED_MINOR;
 		base_issm = dynamic_issm_dev + devnum - IB_UMAD_NUM_FIXED_MINOR;
@@ -1227,7 +1226,7 @@ err_dev:
 
 err_cdev:
 	cdev_del(&port->cdev);
-	clear_bit(devnum, dev_map);
+	ida_free(&umad_ida, devnum);
 
 	return -1;
 }
@@ -1261,7 +1260,7 @@ static void ib_umad_kill_port(struct ib_umad_port *port)
 	}
 	mutex_unlock(&port->file_mutex);
 
-	clear_bit(port->dev_num, dev_map);
+	ida_free(&umad_ida, port->dev_num);
 }
 
 static void ib_umad_add_one(struct ib_device *device)

From 0797e6f1a85872ab48cb4986a06ebdcae22ecf1a Mon Sep 17 00:00:00 2001
From: Nathan Chancellor
Date: Wed, 26 Sep 2018 22:12:23 -0700
Subject: [PATCH 232/242] IB/rxe: Remove unnecessary enum values

Clang warns when an enumerated type is implicitly converted to another:

drivers/infiniband/sw/rxe/rxe.c:106:27: warning: implicit conversion
from enumeration type 'enum rxe_device_param' to different enumeration
type 'enum ib_atomic_cap' [-Wenum-conversion]
        rxe->attr.atomic_cap = RXE_ATOMIC_CAP;
                             ~ ^~~~~~~~~~~~~~
drivers/infiniband/sw/rxe/rxe.c:131:22: warning: implicit conversion
from enumeration type 'enum rxe_port_param' to different enumeration
type 'enum ib_port_state' [-Wenum-conversion]
        port->attr.state = RXE_PORT_STATE;
                         ~ ^~~~~~~~~~~~~~
drivers/infiniband/sw/rxe/rxe.c:132:24: warning: implicit conversion
from enumeration type 'enum rxe_port_param' to different enumeration
type 'enum ib_mtu' [-Wenum-conversion]
        port->attr.max_mtu = RXE_PORT_MAX_MTU;
                           ~ ^~~~~~~~~~~~~~~~
drivers/infiniband/sw/rxe/rxe.c:133:27: warning: implicit conversion
from enumeration type 'enum rxe_port_param' to different enumeration
type 'enum ib_mtu' [-Wenum-conversion]
        port->attr.active_mtu = RXE_PORT_ACTIVE_MTU;
                              ~ ^~~~~~~~~~~~~~~~~~~
drivers/infiniband/sw/rxe/rxe.c:151:24: warning: implicit conversion
from enumeration type 'enum rxe_port_param' to different enumeration
type 'enum ib_mtu' [-Wenum-conversion]
                        ib_mtu_enum_to_int(RXE_PORT_ACTIVE_MTU);
                        ~~~~~~~~~~~~~~~~~~ ^~~~~~~~~~~~~~~~~~~
5 warnings generated.

Use the appropriate values from the expected enumerated type so that no
conversion needs to happen, then remove the unneeded definitions.
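To see why the warning fires, consider a contrived example (not from
the kernel tree): two unrelated enumerated types, where a constant of
one type is assigned to a variable of the other.

    enum example_mtu { EXAMPLE_MTU_256 = 1, EXAMPLE_MTU_4096 = 5 };
    enum example_port_param { EXAMPLE_PORT_MAX_MTU = EXAMPLE_MTU_4096 };

    void example_init(enum example_mtu *max_mtu)
    {
            /* -Wenum-conversion: example_port_param -> example_mtu */
            *max_mtu = EXAMPLE_PORT_MAX_MTU;

            /* clean: the constant already has the expected type */
            *max_mtu = EXAMPLE_MTU_4096;
    }

Assigning the ib_* values directly gives both sides of the assignment
the same enumerated type, so there is nothing left to convert and the
mirror constants can be dropped.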
Reported-by: Nick Desaulniers Signed-off-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Signed-off-by: Jason Gunthorpe --- drivers/infiniband/sw/rxe/rxe.c | 13 ++++++------- drivers/infiniband/sw/rxe/rxe_loc.h | 2 +- drivers/infiniband/sw/rxe/rxe_param.h | 4 ---- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c index 10999fa69281..383e65c7bbc0 100644 --- a/drivers/infiniband/sw/rxe/rxe.c +++ b/drivers/infiniband/sw/rxe/rxe.c @@ -103,7 +103,7 @@ static void rxe_init_device_param(struct rxe_dev *rxe) rxe->attr.max_res_rd_atom = RXE_MAX_RES_RD_ATOM; rxe->attr.max_qp_init_rd_atom = RXE_MAX_QP_INIT_RD_ATOM; rxe->attr.max_ee_init_rd_atom = RXE_MAX_EE_INIT_RD_ATOM; - rxe->attr.atomic_cap = RXE_ATOMIC_CAP; + rxe->attr.atomic_cap = IB_ATOMIC_HCA; rxe->attr.max_ee = RXE_MAX_EE; rxe->attr.max_rdd = RXE_MAX_RDD; rxe->attr.max_mw = RXE_MAX_MW; @@ -128,9 +128,9 @@ static void rxe_init_device_param(struct rxe_dev *rxe) /* initialize port attributes */ static int rxe_init_port_param(struct rxe_port *port) { - port->attr.state = RXE_PORT_STATE; - port->attr.max_mtu = RXE_PORT_MAX_MTU; - port->attr.active_mtu = RXE_PORT_ACTIVE_MTU; + port->attr.state = IB_PORT_DOWN; + port->attr.max_mtu = IB_MTU_4096; + port->attr.active_mtu = IB_MTU_256; port->attr.gid_tbl_len = RXE_PORT_GID_TBL_LEN; port->attr.port_cap_flags = RXE_PORT_PORT_CAP_FLAGS; port->attr.max_msg_sz = RXE_PORT_MAX_MSG_SZ; @@ -147,8 +147,7 @@ static int rxe_init_port_param(struct rxe_port *port) port->attr.active_width = RXE_PORT_ACTIVE_WIDTH; port->attr.active_speed = RXE_PORT_ACTIVE_SPEED; port->attr.phys_state = RXE_PORT_PHYS_STATE; - port->mtu_cap = - ib_mtu_enum_to_int(RXE_PORT_ACTIVE_MTU); + port->mtu_cap = ib_mtu_enum_to_int(IB_MTU_256); port->subnet_prefix = cpu_to_be64(RXE_PORT_SUBNET_PREFIX); return 0; @@ -300,7 +299,7 @@ void rxe_set_mtu(struct rxe_dev *rxe, unsigned int ndev_mtu) mtu = eth_mtu_int_to_enum(ndev_mtu); /* Make sure that new MTU in range */ - mtu = mtu ? min_t(enum ib_mtu, mtu, RXE_PORT_MAX_MTU) : IB_MTU_256; + mtu = mtu ? 
min_t(enum ib_mtu, mtu, IB_MTU_4096) : IB_MTU_256;
 
 	port->attr.active_mtu = mtu;
 	port->mtu_cap = ib_mtu_enum_to_int(mtu);
 
diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
index 8e305422adbb..afd53f57a62b 100644
--- a/drivers/infiniband/sw/rxe/rxe_loc.h
+++ b/drivers/infiniband/sw/rxe/rxe_loc.h
@@ -195,7 +195,7 @@ static inline int qp_mtu(struct rxe_qp *qp)
 	if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC)
 		return qp->attr.path_mtu;
 	else
-		return RXE_PORT_MAX_MTU;
+		return IB_MTU_4096;
 }
 
 static inline int rcv_wqe_size(int max_sge)
diff --git a/drivers/infiniband/sw/rxe/rxe_param.h b/drivers/infiniband/sw/rxe/rxe_param.h
index 4555510d86c4..bdea899a58ac 100644
--- a/drivers/infiniband/sw/rxe/rxe_param.h
+++ b/drivers/infiniband/sw/rxe/rxe_param.h
@@ -90,7 +90,6 @@ enum rxe_device_param {
 	RXE_MAX_RES_RD_ATOM = 0x3f000,
 	RXE_MAX_QP_INIT_RD_ATOM = 128,
 	RXE_MAX_EE_INIT_RD_ATOM = 0,
-	RXE_ATOMIC_CAP = 1,
 	RXE_MAX_EE = 0,
 	RXE_MAX_RDD = 0,
 	RXE_MAX_MW = 0,
@@ -139,9 +138,6 @@ enum rxe_device_param {
 
 /* default/initial rxe port parameters */
 enum rxe_port_param {
-	RXE_PORT_STATE = IB_PORT_DOWN,
-	RXE_PORT_MAX_MTU = IB_MTU_4096,
-	RXE_PORT_ACTIVE_MTU = IB_MTU_256,
 	RXE_PORT_GID_TBL_LEN = 1024,
 	RXE_PORT_PORT_CAP_FLAGS = RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP,
 	RXE_PORT_MAX_MSG_SZ = 0x800000,

From d4122f5abef844112799d2056fdc7bbedbc913f3 Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Thu, 11 Oct 2018 22:31:53 +0300
Subject: [PATCH 233/242] RDMA/core: Allow existing drivers to set one sysfs
 group per device

Currently many RDMA drivers create device attribute files using
device_create_file() with device-specific attributes. Device-specific
attributes should be exposed via well-defined netlink device attributes
in the future. Introduce an API, rdma_set_device_sysfs_group(), that
lets existing drivers set a group for their legacy sysfs attributes.
This API is only for exposing legacy attributes which have existed for
some time now. New drivers should not use this API and should instead
follow the netlink path.

Signed-off-by: Parav Pandit
Reviewed-by: Daniel Jurgens
Signed-off-by: Leon Romanovsky
Signed-off-by: Jason Gunthorpe
---
 include/rdma/ib_verbs.h | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 7d732cf87886..b17eea0373cb 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2539,8 +2539,11 @@ struct ib_device {
 	struct module *owner;
 	struct device dev;
-	/* First group for device attributes, NULL terminated array */
-	const struct attribute_group *groups[2];
+	/* First group for device attributes,
+	 * Second group for driver provided attributes (optional).
+	 * It is NULL terminated array.
+	 */
+	const struct attribute_group *groups[3];
 
 	struct kobject *ports_kobj;
 	struct list_head port_list;
@@ -4191,4 +4194,27 @@ struct ib_ucontext *ib_uverbs_get_ucontext(struct ib_uverbs_file *ufile);
 
 int uverbs_destroy_def_handler(struct ib_uverbs_file *file,
 			       struct uverbs_attr_bundle *attrs);
+
+/**
+ * rdma_set_device_sysfs_group - Set device attributes group to have
+ *				 driver specific sysfs entries at
+ *				 for infiniband class.
+ *
+ * @device: device pointer for which attributes to be created
+ * @group: Pointer to group which should be added when device
+ *	   is registered with sysfs.
+ * rdma_set_device_sysfs_group() allows existing drivers to expose one
+ * group per device to have sysfs attributes.
+ * + * NOTE: New drivers should not make use of this API; instead new device + * parameter should be exposed via netlink command. This API and mechanism + * exist only for existing drivers. + */ +static inline void +rdma_set_device_sysfs_group(struct ib_device *dev, + const struct attribute_group *group) +{ + dev->groups[1] = group; +} + #endif /* IB_VERBS_H */ From 508a523f6bc6cdfbf7031d66559d4ad24956b741 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Thu, 11 Oct 2018 22:31:54 +0300 Subject: [PATCH 234/242] RDMA/drivers: Use core provided API for registering device attributes Use rdma_set_device_sysfs_group() to register device attributes and simplify the driver. Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/bnxt_re/main.c | 74 ++++++-------- drivers/infiniband/hw/cxgb3/iwch_provider.c | 52 ++++------ drivers/infiniband/hw/cxgb4/provider.c | 47 ++++----- drivers/infiniband/hw/hfi1/hfi.h | 1 + drivers/infiniband/hw/hfi1/sysfs.c | 67 ++++++------- drivers/infiniband/hw/hfi1/verbs.c | 3 + drivers/infiniband/hw/i40iw/i40iw_verbs.c | 70 +++++-------- drivers/infiniband/hw/mlx4/main.c | 37 ++++--- drivers/infiniband/hw/mlx5/ib_rep.c | 3 - drivers/infiniband/hw/mlx5/main.c | 66 +++++-------- drivers/infiniband/hw/mlx5/mlx5_ib.h | 1 - drivers/infiniband/hw/mthca/mthca_provider.c | 41 ++++---- drivers/infiniband/hw/nes/nes_verbs.c | 58 ++++------- drivers/infiniband/hw/ocrdma/ocrdma_main.c | 71 ++++++------- drivers/infiniband/hw/qedr/main.c | 69 ++++++------- drivers/infiniband/hw/qib/qib.h | 2 +- drivers/infiniband/hw/qib/qib_sysfs.c | 99 ++++++++----------- drivers/infiniband/hw/qib/qib_verbs.c | 7 +- drivers/infiniband/hw/usnic/usnic_ib_main.c | 2 + drivers/infiniband/hw/usnic/usnic_ib_sysfs.c | 68 +++++-------- drivers/infiniband/hw/usnic/usnic_ib_sysfs.h | 2 + .../infiniband/hw/vmw_pvrdma/pvrdma_main.c | 41 ++++---- drivers/infiniband/sw/rxe/rxe_verbs.c | 26 ++--- 23 files changed, 364 insertions(+), 543 deletions(-) diff --git a/drivers/infiniband/hw/bnxt_re/main.c b/drivers/infiniband/hw/bnxt_re/main.c index e5d4b12f7bf5..cf2282654210 100644 --- a/drivers/infiniband/hw/bnxt_re/main.c +++ b/drivers/infiniband/hw/bnxt_re/main.c @@ -535,6 +535,34 @@ static struct bnxt_en_dev *bnxt_re_dev_probe(struct net_device *netdev) return en_dev; } +static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", rdev->en_dev->pdev->vendor); +} +static DEVICE_ATTR_RO(hw_rev); + +static ssize_t hca_type_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); + + return scnprintf(buf, PAGE_SIZE, "%s\n", rdev->ibdev.node_desc); +} +static DEVICE_ATTR_RO(hca_type); + +static struct attribute *bnxt_re_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + NULL +}; + +static const struct attribute_group bnxt_re_dev_attr_group = { + .attrs = bnxt_re_attributes, +}; + static void bnxt_re_unregister_ib(struct bnxt_re_dev *rdev) { ib_unregister_device(&rdev->ibdev); @@ -638,34 +666,11 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev) ibdev->get_hw_stats = bnxt_re_ib_get_hw_stats; ibdev->alloc_hw_stats = bnxt_re_ib_alloc_hw_stats; + rdma_set_device_sysfs_group(ibdev, &bnxt_re_dev_attr_group); ibdev->driver_id = RDMA_DRIVER_BNXT_RE; return ib_register_device(ibdev, "bnxt_re%d", NULL); } -static 
ssize_t show_rev(struct device *device, struct device_attribute *attr, - char *buf) -{ - struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); - - return scnprintf(buf, PAGE_SIZE, "0x%x\n", rdev->en_dev->pdev->vendor); -} - -static ssize_t show_hca(struct device *device, struct device_attribute *attr, - char *buf) -{ - struct bnxt_re_dev *rdev = to_bnxt_re_dev(device, ibdev.dev); - - return scnprintf(buf, PAGE_SIZE, "%s\n", rdev->ibdev.node_desc); -} - -static DEVICE_ATTR(hw_rev, 0444, show_rev, NULL); -static DEVICE_ATTR(hca_type, 0444, show_hca, NULL); - -static struct device_attribute *bnxt_re_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type -}; - static void bnxt_re_dev_remove(struct bnxt_re_dev *rdev) { dev_put(rdev->netdev); @@ -1200,12 +1205,9 @@ static int bnxt_re_setup_qos(struct bnxt_re_dev *rdev) static void bnxt_re_ib_unreg(struct bnxt_re_dev *rdev) { - int i, rc; + int rc; if (test_and_clear_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags)) { - for (i = 0; i < ARRAY_SIZE(bnxt_re_attributes); i++) - device_remove_file(&rdev->ibdev.dev, - bnxt_re_attributes[i]); /* Cleanup ib dev */ bnxt_re_unregister_ib(rdev); } @@ -1255,7 +1257,7 @@ static void bnxt_re_worker(struct work_struct *work) static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) { - int i, j, rc; + int rc; bool locked; @@ -1375,20 +1377,6 @@ static int bnxt_re_ib_reg(struct bnxt_re_dev *rdev) } set_bit(BNXT_RE_FLAG_IBDEV_REGISTERED, &rdev->flags); dev_info(rdev_to_dev(rdev), "Device registered successfully"); - for (i = 0; i < ARRAY_SIZE(bnxt_re_attributes); i++) { - rc = device_create_file(&rdev->ibdev.dev, - bnxt_re_attributes[i]); - if (rc) { - dev_err(rdev_to_dev(rdev), - "Failed to create IB sysfs: %#x", rc); - /* Must clean up all created device files */ - for (j = 0; j < i; j++) - device_remove_file(&rdev->ibdev.dev, - bnxt_re_attributes[j]); - bnxt_re_unregister_ib(rdev); - goto fail; - } - } ib_get_eth_speed(&rdev->ibdev, 1, &rdev->active_speed, &rdev->active_width); set_bit(BNXT_RE_FLAG_ISSUE_ROCE_STATS, &rdev->flags); diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 39530cc15f95..ebbec02cebe0 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -1127,17 +1127,18 @@ static int iwch_query_port(struct ib_device *ibdev, return 0; } -static ssize_t show_rev(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t hw_rev_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, ibdev.dev); pr_debug("%s dev 0x%p\n", __func__, dev); return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type); } +static DEVICE_ATTR_RO(hw_rev); -static ssize_t show_hca(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t hca_type_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, ibdev.dev); @@ -1148,9 +1149,10 @@ static ssize_t show_hca(struct device *dev, struct device_attribute *attr, lldev->ethtool_ops->get_drvinfo(lldev, &info); return sprintf(buf, "%s\n", info.driver); } +static DEVICE_ATTR_RO(hca_type); -static ssize_t show_board(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev, ibdev.dev); @@ -1158,6 +1160,7 @@ static 
ssize_t show_board(struct device *dev, struct device_attribute *attr, return sprintf(buf, "%x.%x\n", iwch_dev->rdev.rnic_info.pdev->vendor, iwch_dev->rdev.rnic_info.pdev->device); } +static DEVICE_ATTR_RO(board_id); enum counters { IPINRECEIVES, @@ -1274,14 +1277,15 @@ static int iwch_get_mib(struct ib_device *ibdev, struct rdma_hw_stats *stats, return stats->num_counters; } -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static struct attribute *iwch_class_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, + NULL +}; -static struct device_attribute *iwch_class_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type, - &dev_attr_board_id, +static const struct attribute_group iwch_attr_group = { + .attrs = iwch_class_attributes, }; static int iwch_port_immutable(struct ib_device *ibdev, u8 port_num, @@ -1316,7 +1320,6 @@ static void get_dev_fw_ver_str(struct ib_device *ibdev, char *str) int iwch_register_device(struct iwch_dev *dev) { int ret; - int i; pr_debug("%s iwch_dev %p\n", __func__, dev); memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid)); @@ -1401,33 +1404,16 @@ int iwch_register_device(struct iwch_dev *dev) sizeof(dev->ibdev.iwcm->ifname)); dev->ibdev.driver_id = RDMA_DRIVER_CXGB3; + rdma_set_device_sysfs_group(&dev->ibdev, &iwch_attr_group); ret = ib_register_device(&dev->ibdev, "cxgb3_%d", NULL); if (ret) - goto bail1; - - for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) { - ret = device_create_file(&dev->ibdev.dev, - iwch_class_attributes[i]); - if (ret) { - goto bail2; - } - } - return 0; -bail2: - ib_unregister_device(&dev->ibdev); -bail1: - kfree(dev->ibdev.iwcm); + kfree(dev->ibdev.iwcm); return ret; } void iwch_unregister_device(struct iwch_dev *dev) { - int i; - pr_debug("%s iwch_dev %p\n", __func__, dev); - for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) - device_remove_file(&dev->ibdev.dev, - iwch_class_attributes[i]); ib_unregister_device(&dev->ibdev); kfree(dev->ibdev.iwcm); return; diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 416f8d1af610..cbb3c0ddd990 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -373,8 +373,8 @@ static int c4iw_query_port(struct ib_device *ibdev, u8 port, return 0; } -static ssize_t show_rev(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t hw_rev_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, ibdev.dev); @@ -382,9 +382,10 @@ static ssize_t show_rev(struct device *dev, struct device_attribute *attr, return sprintf(buf, "%d\n", CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type)); } +static DEVICE_ATTR_RO(hw_rev); -static ssize_t show_hca(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t hca_type_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, ibdev.dev); @@ -395,9 +396,10 @@ static ssize_t show_hca(struct device *dev, struct device_attribute *attr, lldev->ethtool_ops->get_drvinfo(lldev, &info); return sprintf(buf, "%s\n", info.driver); } +static DEVICE_ATTR_RO(hca_type); -static ssize_t show_board(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *dev, struct 
device_attribute *attr, + char *buf) { struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, ibdev.dev); @@ -405,6 +407,7 @@ static ssize_t show_board(struct device *dev, struct device_attribute *attr, return sprintf(buf, "%x.%x\n", c4iw_dev->rdev.lldi.pdev->vendor, c4iw_dev->rdev.lldi.pdev->device); } +static DEVICE_ATTR_RO(board_id); enum counters { IP4INSEGS, @@ -461,14 +464,15 @@ static int c4iw_get_mib(struct ib_device *ibdev, return stats->num_counters; } -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static struct attribute *c4iw_class_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, + NULL +}; -static struct device_attribute *c4iw_class_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type, - &dev_attr_board_id, +static const struct attribute_group c4iw_attr_group = { + .attrs = c4iw_class_attributes, }; static int c4iw_port_immutable(struct ib_device *ibdev, u8 port_num, @@ -530,7 +534,6 @@ static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res) void c4iw_register_device(struct work_struct *work) { int ret; - int i; struct uld_ctx *ctx = container_of(work, struct uld_ctx, reg_work); struct c4iw_dev *dev = ctx->dev; @@ -625,20 +628,13 @@ void c4iw_register_device(struct work_struct *work) memcpy(dev->ibdev.iwcm->ifname, dev->rdev.lldi.ports[0]->name, sizeof(dev->ibdev.iwcm->ifname)); + rdma_set_device_sysfs_group(&dev->ibdev, &c4iw_attr_group); dev->ibdev.driver_id = RDMA_DRIVER_CXGB4; ret = ib_register_device(&dev->ibdev, "cxgb4_%d", NULL); if (ret) goto err_kfree_iwcm; - - for (i = 0; i < ARRAY_SIZE(c4iw_class_attributes); ++i) { - ret = device_create_file(&dev->ibdev.dev, - c4iw_class_attributes[i]); - if (ret) - goto err_unregister_device; - } return; -err_unregister_device: - ib_unregister_device(&dev->ibdev); + err_kfree_iwcm: kfree(dev->ibdev.iwcm); err_dealloc_ctx: @@ -650,12 +646,7 @@ err_dealloc_ctx: void c4iw_unregister_device(struct c4iw_dev *dev) { - int i; - pr_debug("c4iw_dev %p\n", dev); - for (i = 0; i < ARRAY_SIZE(c4iw_class_attributes); ++i) - device_remove_file(&dev->ibdev.dev, - c4iw_class_attributes[i]); ib_unregister_device(&dev->ibdev); kfree(dev->ibdev.iwcm); return; diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h index d3d5717650e9..1401b6ea4a28 100644 --- a/drivers/infiniband/hw/hfi1/hfi.h +++ b/drivers/infiniband/hw/hfi1/hfi.h @@ -1968,6 +1968,7 @@ static inline u32 get_rcvhdrtail(const struct hfi1_ctxtdata *rcd) */ extern const char ib_hfi1_version[]; +extern const struct attribute_group ib_hfi1_attr_group; int hfi1_device_create(struct hfi1_devdata *dd); void hfi1_device_remove(struct hfi1_devdata *dd); diff --git a/drivers/infiniband/hw/hfi1/sysfs.c b/drivers/infiniband/hw/hfi1/sysfs.c index 25e867393463..2be513d4c9da 100644 --- a/drivers/infiniband/hw/hfi1/sysfs.c +++ b/drivers/infiniband/hw/hfi1/sysfs.c @@ -494,17 +494,18 @@ static struct kobj_type hfi1_vl2mtu_ktype = { * Start of per-unit (or driver, in some cases, but replicated * per unit) functions (these get a device *) */ -static ssize_t show_rev(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, + char *buf) { struct hfi1_ibdev *dev = container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev); } +static 
DEVICE_ATTR_RO(hw_rev); -static ssize_t show_hfi(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *device, + struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = container_of(device, struct hfi1_ibdev, rdi.ibdev.dev); @@ -517,8 +518,9 @@ static ssize_t show_hfi(struct device *device, struct device_attribute *attr, ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname); return ret; } +static DEVICE_ATTR_RO(board_id); -static ssize_t show_boardversion(struct device *device, +static ssize_t boardversion_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = @@ -528,8 +530,9 @@ static ssize_t show_boardversion(struct device *device, /* The string printed here is already newline-terminated. */ return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion); } +static DEVICE_ATTR_RO(boardversion); -static ssize_t show_nctxts(struct device *device, +static ssize_t nctxts_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = @@ -546,8 +549,9 @@ static ssize_t show_nctxts(struct device *device, min(dd->num_user_contexts, (u32)dd->sc_sizes[SC_USER].count)); } +static DEVICE_ATTR_RO(nctxts); -static ssize_t show_nfreectxts(struct device *device, +static ssize_t nfreectxts_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = @@ -557,8 +561,9 @@ static ssize_t show_nfreectxts(struct device *device, /* Return the number of free user ports (contexts) available. */ return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts); } +static DEVICE_ATTR_RO(nfreectxts); -static ssize_t show_serial(struct device *device, +static ssize_t serial_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = @@ -567,8 +572,9 @@ static ssize_t show_serial(struct device *device, return scnprintf(buf, PAGE_SIZE, "%s", dd->serial); } +static DEVICE_ATTR_RO(serial); -static ssize_t store_chip_reset(struct device *device, +static ssize_t chip_reset_store(struct device *device, struct device_attribute *attr, const char *buf, size_t count) { @@ -586,6 +592,7 @@ static ssize_t store_chip_reset(struct device *device, bail: return ret < 0 ? ret : count; } +static DEVICE_ATTR_WO(chip_reset); /* * Convert the reported temperature from an integer (reported in @@ -598,7 +605,7 @@ bail: /* * Dump tempsense values, in decimal, to ease shell-scripts. 
*/ -static ssize_t show_tempsense(struct device *device, +static ssize_t tempsense_show(struct device *device, struct device_attribute *attr, char *buf) { struct hfi1_ibdev *dev = @@ -622,6 +629,7 @@ static ssize_t show_tempsense(struct device *device, } return ret; } +static DEVICE_ATTR_RO(tempsense); /* * end of per-unit (or driver, in some cases, but replicated @@ -629,24 +637,20 @@ static ssize_t show_tempsense(struct device *device, */ /* start of per-unit file structures and support code */ -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, show_hfi, NULL); -static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL); -static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL); -static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL); -static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL); -static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL); -static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset); +static struct attribute *hfi1_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_board_id.attr, + &dev_attr_nctxts.attr, + &dev_attr_nfreectxts.attr, + &dev_attr_serial.attr, + &dev_attr_boardversion.attr, + &dev_attr_tempsense.attr, + &dev_attr_chip_reset.attr, + NULL, +}; -static struct device_attribute *hfi1_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_board_id, - &dev_attr_nctxts, - &dev_attr_nfreectxts, - &dev_attr_serial, - &dev_attr_boardversion, - &dev_attr_tempsense, - &dev_attr_chip_reset, +const struct attribute_group ib_hfi1_attr_group = { + .attrs = hfi1_attributes, }; int hfi1_create_port_files(struct ib_device *ibdev, u8 port_num, @@ -832,12 +836,6 @@ int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd) struct device *class_dev = &dev->dev; int i, j, ret; - for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) { - ret = device_create_file(&dev->dev, hfi1_attributes[i]); - if (ret) - goto bail; - } - for (i = 0; i < dd->num_sdma; i++) { ret = kobject_init_and_add(&dd->per_sdma[i].kobj, &sde_ktype, &class_dev->kobj, @@ -855,9 +853,6 @@ int hfi1_verbs_register_sysfs(struct hfi1_devdata *dd) return 0; bail: - for (i = 0; i < ARRAY_SIZE(hfi1_attributes); ++i) - device_remove_file(&dev->dev, hfi1_attributes[i]); - for (i = 0; i < dd->num_sdma; i++) kobject_del(&dd->per_sdma[i].kobj); diff --git a/drivers/infiniband/hw/hfi1/verbs.c b/drivers/infiniband/hw/hfi1/verbs.c index 5fc27a94b4f0..48e11e510358 100644 --- a/drivers/infiniband/hw/hfi1/verbs.c +++ b/drivers/infiniband/hw/hfi1/verbs.c @@ -1751,6 +1751,9 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) i, ppd->pkeys); + rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev, + &ib_hfi1_attr_group); + ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_HFI1); if (ret) goto err_verbs_txreq; diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c b/drivers/infiniband/hw/i40iw/i40iw_verbs.c index cb2aef874ca8..102875872bea 100644 --- a/drivers/infiniband/hw/i40iw/i40iw_verbs.c +++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c @@ -2135,10 +2135,10 @@ static int i40iw_dereg_mr(struct ib_mr *ib_mr) } /** - * i40iw_show_rev + * hw_rev_show */ -static ssize_t i40iw_show_rev(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t hw_rev_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct i40iw_ib_device *iwibdev = container_of(dev, struct i40iw_ib_device, @@ -2147,34 +2147,37 @@ static ssize_t i40iw_show_rev(struct device *dev, return sprintf(buf, "%x\n", hw_rev); } +static DEVICE_ATTR_RO(hw_rev); /** - * 
i40iw_show_hca + * hca_type_show */ -static ssize_t i40iw_show_hca(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t hca_type_show(struct device *dev, + struct device_attribute *attr, char *buf) { return sprintf(buf, "I40IW\n"); } +static DEVICE_ATTR_RO(hca_type); /** - * i40iw_show_board + * board_id_show */ -static ssize_t i40iw_show_board(struct device *dev, - struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *dev, + struct device_attribute *attr, char *buf) { return sprintf(buf, "%.*s\n", 32, "I40IW Board ID"); } +static DEVICE_ATTR_RO(board_id); -static DEVICE_ATTR(hw_rev, S_IRUGO, i40iw_show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, i40iw_show_hca, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, i40iw_show_board, NULL); +static struct attribute *i40iw_dev_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, + NULL +}; -static struct device_attribute *i40iw_dev_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type, - &dev_attr_board_id +static const struct attribute_group i40iw_attr_group = { + .attrs = i40iw_dev_attributes, }; /** @@ -2849,20 +2852,6 @@ void i40iw_port_ibevent(struct i40iw_device *iwdev) ib_dispatch_event(&event); } -/** - * i40iw_unregister_rdma_device - unregister of iwarp from IB - * @iwibdev: rdma device ptr - */ -static void i40iw_unregister_rdma_device(struct i40iw_ib_device *iwibdev) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(i40iw_dev_attributes); ++i) - device_remove_file(&iwibdev->ibdev.dev, - i40iw_dev_attributes[i]); - ib_unregister_device(&iwibdev->ibdev); -} - /** * i40iw_destroy_rdma_device - destroy rdma device and free resources * @iwibdev: IB device ptr @@ -2872,7 +2861,7 @@ void i40iw_destroy_rdma_device(struct i40iw_ib_device *iwibdev) if (!iwibdev) return; - i40iw_unregister_rdma_device(iwibdev); + ib_unregister_device(&iwibdev->ibdev); kfree(iwibdev->ibdev.iwcm); iwibdev->ibdev.iwcm = NULL; wait_event_timeout(iwibdev->iwdev->close_wq, @@ -2887,32 +2876,19 @@ void i40iw_destroy_rdma_device(struct i40iw_ib_device *iwibdev) */ int i40iw_register_rdma_device(struct i40iw_device *iwdev) { - int i, ret; + int ret; struct i40iw_ib_device *iwibdev; iwdev->iwibdev = i40iw_init_rdma_device(iwdev); if (!iwdev->iwibdev) return -ENOMEM; iwibdev = iwdev->iwibdev; - + rdma_set_device_sysfs_group(&iwibdev->ibdev, &i40iw_attr_group); iwibdev->ibdev.driver_id = RDMA_DRIVER_I40IW; ret = ib_register_device(&iwibdev->ibdev, "i40iw%d", NULL); if (ret) goto error; - for (i = 0; i < ARRAY_SIZE(i40iw_dev_attributes); ++i) { - ret = - device_create_file(&iwibdev->ibdev.dev, - i40iw_dev_attributes[i]); - if (ret) { - while (i > 0) { - i--; - device_remove_file(&iwibdev->ibdev.dev, i40iw_dev_attributes[i]); - } - ib_unregister_device(&iwibdev->ibdev); - goto error; - } - } return 0; error: kfree(iwdev->iwibdev->ibdev.iwcm); diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 81126f18f1fd..0def2323459c 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -2039,39 +2039,43 @@ out: return err; } -static ssize_t show_hca(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t hca_type_show(struct device *device, + struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device); } +static DEVICE_ATTR_RO(hca_type); -static ssize_t 
show_rev(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t hw_rev_show(struct device *device, + struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%x\n", dev->dev->rev_id); } +static DEVICE_ATTR_RO(hw_rev); -static ssize_t show_board(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *device, + struct device_attribute *attr, char *buf) { struct mlx4_ib_dev *dev = container_of(device, struct mlx4_ib_dev, ib_dev.dev); return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN, dev->dev->board_id); } +static DEVICE_ATTR_RO(board_id); -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static struct attribute *mlx4_class_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, + NULL +}; -static struct device_attribute *mlx4_class_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type, - &dev_attr_board_id +static const struct attribute_group mlx4_attr_group = { + .attrs = mlx4_class_attributes, }; struct diag_counter { @@ -2803,6 +2807,7 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) if (mlx4_ib_alloc_diag_counters(ibdev)) goto err_steer_free_bitmap; + rdma_set_device_sysfs_group(&ibdev->ib_dev, &mlx4_attr_group); ibdev->ib_dev.driver_id = RDMA_DRIVER_MLX4; if (ib_register_device(&ibdev->ib_dev, "mlx4_%d", NULL)) goto err_diag_counters; @@ -2827,12 +2832,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) goto err_notif; } - for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { - if (device_create_file(&ibdev->ib_dev.dev, - mlx4_class_attributes[j])) - goto err_notif; - } - ibdev->ib_active = true; mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) devlink_port_type_ib_set(mlx4_get_devlink_port(dev, i), diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c index 35a0e04c38f2..584ff2ea7810 100644 --- a/drivers/infiniband/hw/mlx5/ib_rep.c +++ b/drivers/infiniband/hw/mlx5/ib_rep.c @@ -39,9 +39,6 @@ static const struct mlx5_ib_profile rep_profile = { STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, mlx5_ib_stage_post_ib_reg_umr_init, NULL), - STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR, - mlx5_ib_stage_class_attr_init, - NULL), }; static int diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index b3294a7e3ff9..be701d40289e 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4079,16 +4079,17 @@ static int init_node_data(struct mlx5_ib_dev *dev) return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid); } -static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t fw_pages_show(struct device *device, + struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages); } +static DEVICE_ATTR_RO(fw_pages); -static ssize_t show_reg_pages(struct device *device, +static ssize_t reg_pages_show(struct device *device, struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = @@ -4096,44 +4097,47 @@ static ssize_t show_reg_pages(struct device *device, return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages)); } +static DEVICE_ATTR_RO(reg_pages); -static ssize_t show_hca(struct device *device, struct 
device_attribute *attr, - char *buf) +static ssize_t hca_type_show(struct device *device, + struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "MT%d\n", dev->mdev->pdev->device); } +static DEVICE_ATTR_RO(hca_type); -static ssize_t show_rev(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t hw_rev_show(struct device *device, + struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%x\n", dev->mdev->rev_id); } +static DEVICE_ATTR_RO(hw_rev); -static ssize_t show_board(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *device, + struct device_attribute *attr, char *buf) { struct mlx5_ib_dev *dev = container_of(device, struct mlx5_ib_dev, ib_dev.dev); return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN, dev->mdev->board_id); } +static DEVICE_ATTR_RO(board_id); -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); -static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL); -static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL); +static struct attribute *mlx5_class_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, + &dev_attr_fw_pages.attr, + &dev_attr_reg_pages.attr, + NULL, +}; -static struct device_attribute *mlx5_class_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type, - &dev_attr_board_id, - &dev_attr_fw_pages, - &dev_attr_reg_pages, +static const struct attribute_group mlx5_attr_group = { + .attrs = mlx5_class_attributes, }; static void pkey_change_handler(struct work_struct *work) @@ -6112,6 +6116,7 @@ int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev) { const char *name; + rdma_set_device_sysfs_group(&dev->ib_dev, &mlx5_attr_group); if (!mlx5_lag_is_active(dev->mdev)) name = "mlx5_%d"; else @@ -6146,21 +6151,6 @@ static void mlx5_ib_stage_delay_drop_cleanup(struct mlx5_ib_dev *dev) cancel_delay_drop(dev); } -int mlx5_ib_stage_class_attr_init(struct mlx5_ib_dev *dev) -{ - int err; - int i; - - for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) { - err = device_create_file(&dev->ib_dev.dev, - mlx5_class_attributes[i]); - if (err) - return err; - } - - return 0; -} - static int mlx5_ib_stage_rep_reg_init(struct mlx5_ib_dev *dev) { mlx5_ib_register_vport_reps(dev); @@ -6268,9 +6258,6 @@ static const struct mlx5_ib_profile pf_profile = { STAGE_CREATE(MLX5_IB_STAGE_DELAY_DROP, mlx5_ib_stage_delay_drop_init, mlx5_ib_stage_delay_drop_cleanup), - STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR, - mlx5_ib_stage_class_attr_init, - NULL), }; static const struct mlx5_ib_profile nic_rep_profile = { @@ -6313,9 +6300,6 @@ static const struct mlx5_ib_profile nic_rep_profile = { STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR, mlx5_ib_stage_post_ib_reg_umr_init, NULL), - STAGE_CREATE(MLX5_IB_STAGE_CLASS_ATTR, - mlx5_ib_stage_class_attr_init, - NULL), STAGE_CREATE(MLX5_IB_STAGE_REP_REG, mlx5_ib_stage_rep_reg_init, mlx5_ib_stage_rep_reg_cleanup), diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index e5ec3fdaa4d5..8444ea78229a 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1201,7 +1201,6 @@ void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev); int 
mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev); void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev); int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev); -int mlx5_ib_stage_class_attr_init(struct mlx5_ib_dev *dev); void __mlx5_ib_remove(struct mlx5_ib_dev *dev, const struct mlx5_ib_profile *profile, int stage); diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index 7bd7e2ad17e4..691c6f048938 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -1076,16 +1076,17 @@ static int mthca_unmap_fmr(struct list_head *fmr_list) return err; } -static ssize_t show_rev(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t hw_rev_show(struct device *device, + struct device_attribute *attr, char *buf) { struct mthca_dev *dev = container_of(device, struct mthca_dev, ib_dev.dev); return sprintf(buf, "%x\n", dev->rev_id); } +static DEVICE_ATTR_RO(hw_rev); -static ssize_t show_hca(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t hca_type_show(struct device *device, + struct device_attribute *attr, char *buf) { struct mthca_dev *dev = container_of(device, struct mthca_dev, ib_dev.dev); @@ -1103,23 +1104,26 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr, return sprintf(buf, "unknown\n"); } } +static DEVICE_ATTR_RO(hca_type); -static ssize_t show_board(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *device, + struct device_attribute *attr, char *buf) { struct mthca_dev *dev = container_of(device, struct mthca_dev, ib_dev.dev); return sprintf(buf, "%.*s\n", MTHCA_BOARD_ID_LEN, dev->board_id); } +static DEVICE_ATTR_RO(board_id); -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static struct attribute *mthca_dev_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, + NULL +}; -static struct device_attribute *mthca_dev_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type, - &dev_attr_board_id +static const struct attribute_group mthca_attr_group = { + .attrs = mthca_dev_attributes, }; static int mthca_init_node_data(struct mthca_dev *dev) @@ -1192,7 +1196,6 @@ static void get_dev_fw_str(struct ib_device *device, char *str) int mthca_register_device(struct mthca_dev *dev) { int ret; - int i; ret = mthca_init_node_data(dev); if (ret) @@ -1295,20 +1298,12 @@ int mthca_register_device(struct mthca_dev *dev) mutex_init(&dev->cap_mask_mutex); + rdma_set_device_sysfs_group(&dev->ib_dev, &mthca_attr_group); dev->ib_dev.driver_id = RDMA_DRIVER_MTHCA; ret = ib_register_device(&dev->ib_dev, "mthca%d", NULL); if (ret) return ret; - for (i = 0; i < ARRAY_SIZE(mthca_dev_attributes); ++i) { - ret = device_create_file(&dev->ib_dev.dev, - mthca_dev_attributes[i]); - if (ret) { - ib_unregister_device(&dev->ib_dev); - return ret; - } - } - mthca_start_catas_poll(dev); return 0; diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 94054bc611bd..92d1cadd4cfd 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -2556,8 +2556,8 @@ static int nes_dereg_mr(struct ib_mr *ib_mr) /** * show_rev */ -static ssize_t show_rev(struct device *dev, struct device_attribute *attr, - char *buf) +static 
ssize_t hw_rev_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct nes_ib_device *nesibdev = container_of(dev, struct nes_ib_device, ibdev.dev); @@ -2566,40 +2566,40 @@ static ssize_t show_rev(struct device *dev, struct device_attribute *attr, nes_debug(NES_DBG_INIT, "\n"); return sprintf(buf, "%x\n", nesvnic->nesdev->nesadapter->hw_rev); } - +static DEVICE_ATTR_RO(hw_rev); /** * show_hca */ -static ssize_t show_hca(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t hca_type_show(struct device *dev, + struct device_attribute *attr, char *buf) { nes_debug(NES_DBG_INIT, "\n"); return sprintf(buf, "NES020\n"); } - +static DEVICE_ATTR_RO(hca_type); /** * show_board */ -static ssize_t show_board(struct device *dev, struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *dev, + struct device_attribute *attr, char *buf) { nes_debug(NES_DBG_INIT, "\n"); return sprintf(buf, "%.*s\n", 32, "NES020 Board ID"); } +static DEVICE_ATTR_RO(board_id); - -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); - -static struct device_attribute *nes_dev_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type, - &dev_attr_board_id +static struct attribute *nes_dev_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, + NULL }; +static const struct attribute_group nes_attr_group = { + .attrs = nes_dev_attributes, +}; /** * nes_query_qp @@ -3794,8 +3794,9 @@ int nes_register_ofa_device(struct nes_ib_device *nesibdev) struct nes_vnic *nesvnic = nesibdev->nesvnic; struct nes_device *nesdev = nesvnic->nesdev; struct nes_adapter *nesadapter = nesdev->nesadapter; - int i, ret; + int ret; + rdma_set_device_sysfs_group(&nesvnic->nesibdev->ibdev, &nes_attr_group); nesvnic->nesibdev->ibdev.driver_id = RDMA_DRIVER_NES; ret = ib_register_device(&nesvnic->nesibdev->ibdev, "nes%d", NULL); if (ret) { @@ -3808,19 +3809,6 @@ int nes_register_ofa_device(struct nes_ib_device *nesibdev) nesibdev->max_qp = (nesadapter->max_qp-NES_FIRST_QPN) / nesadapter->port_count; nesibdev->max_pd = nesadapter->max_pd / nesadapter->port_count; - for (i = 0; i < ARRAY_SIZE(nes_dev_attributes); ++i) { - ret = device_create_file(&nesibdev->ibdev.dev, nes_dev_attributes[i]); - if (ret) { - while (i > 0) { - i--; - device_remove_file(&nesibdev->ibdev.dev, - nes_dev_attributes[i]); - } - ib_unregister_device(&nesibdev->ibdev); - return ret; - } - } - nesvnic->of_device_registered = 1; return 0; @@ -3833,15 +3821,9 @@ int nes_register_ofa_device(struct nes_ib_device *nesibdev) static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev) { struct nes_vnic *nesvnic = nesibdev->nesvnic; - int i; - for (i = 0; i < ARRAY_SIZE(nes_dev_attributes); ++i) { - device_remove_file(&nesibdev->ibdev.dev, nes_dev_attributes[i]); - } - - if (nesvnic->of_device_registered) { + if (nesvnic->of_device_registered) ib_unregister_device(&nesibdev->ibdev); - } nesvnic->of_device_registered = 0; } diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 4d3c27613351..873cc7f6fe61 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -114,6 +114,35 @@ static void get_dev_fw_str(struct ib_device *device, char *str) snprintf(str, IB_FW_VERSION_NAME_MAX, "%s", &dev->attr.fw_ver[0]); } +/* OCRDMA sysfs interface */ +static ssize_t 
hw_rev_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ocrdma_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor); +} +static DEVICE_ATTR_RO(hw_rev); + +static ssize_t hca_type_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + struct ocrdma_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->model_number[0]); +} +static DEVICE_ATTR_RO(hca_type); + +static struct attribute *ocrdma_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + NULL +}; + +static const struct attribute_group ocrdma_attr_group = { + .attrs = ocrdma_attributes, +}; + static int ocrdma_register_device(struct ocrdma_dev *dev) { ocrdma_get_guid(dev, (u8 *)&dev->ibdev.node_guid); @@ -212,6 +241,7 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) dev->ibdev.destroy_srq = ocrdma_destroy_srq; dev->ibdev.post_srq_recv = ocrdma_post_srq_recv; } + rdma_set_device_sysfs_group(&dev->ibdev, &ocrdma_attr_group); dev->ibdev.driver_id = RDMA_DRIVER_OCRDMA; return ib_register_device(&dev->ibdev, "ocrdma%d", NULL); } @@ -259,42 +289,9 @@ static void ocrdma_free_resources(struct ocrdma_dev *dev) kfree(dev->cq_tbl); } -/* OCRDMA sysfs interface */ -static ssize_t show_rev(struct device *device, struct device_attribute *attr, - char *buf) -{ - struct ocrdma_dev *dev = dev_get_drvdata(device); - - return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor); -} - -static ssize_t show_hca_type(struct device *device, - struct device_attribute *attr, char *buf) -{ - struct ocrdma_dev *dev = dev_get_drvdata(device); - - return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->model_number[0]); -} - -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL); - -static struct device_attribute *ocrdma_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type -}; - -static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++) - device_remove_file(&dev->ibdev.dev, ocrdma_attributes[i]); -} - static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) { - int status = 0, i; + int status = 0; u8 lstate = 0; struct ocrdma_dev *dev; @@ -330,9 +327,6 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) if (!status) ocrdma_update_link_state(dev, lstate); - for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++) - if (device_create_file(&dev->ibdev.dev, ocrdma_attributes[i])) - goto sysfs_err; /* Init stats */ ocrdma_add_port_stats(dev); /* Interrupt Moderation */ @@ -347,8 +341,6 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info) dev_name(&dev->nic_info.pdev->dev), dev->id); return dev; -sysfs_err: - ocrdma_remove_sysfiles(dev); alloc_err: ocrdma_free_resources(dev); ocrdma_cleanup_hw(dev); @@ -375,7 +367,6 @@ static void ocrdma_remove(struct ocrdma_dev *dev) * of the registered clients. 
*/ cancel_delayed_work_sync(&dev->eqd_work); - ocrdma_remove_sysfiles(dev); ib_unregister_device(&dev->ibdev); ocrdma_rem_port_stats(dev); diff --git a/drivers/infiniband/hw/qedr/main.c b/drivers/infiniband/hw/qedr/main.c index cd7b8b39a129..8d6ff9df49fe 100644 --- a/drivers/infiniband/hw/qedr/main.c +++ b/drivers/infiniband/hw/qedr/main.c @@ -133,6 +133,33 @@ static int qedr_iw_port_immutable(struct ib_device *ibdev, u8 port_num, return 0; } +/* QEDR sysfs interface */ +static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, + char *buf) +{ + struct qedr_dev *dev = dev_get_drvdata(device); + + return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->pdev->vendor); +} +static DEVICE_ATTR_RO(hw_rev); + +static ssize_t hca_type_show(struct device *device, + struct device_attribute *attr, char *buf) +{ + return scnprintf(buf, PAGE_SIZE, "%s\n", "HCA_TYPE_TO_SET"); +} +static DEVICE_ATTR_RO(hca_type); + +static struct attribute *qedr_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + NULL +}; + +static const struct attribute_group qedr_attr_group = { + .attrs = qedr_attributes, +}; + static int qedr_iw_register_device(struct qedr_dev *dev) { dev->ibdev.node_type = RDMA_NODE_RNIC; @@ -260,7 +287,7 @@ static int qedr_register_device(struct qedr_dev *dev) dev->ibdev.get_link_layer = qedr_link_layer; dev->ibdev.get_dev_fw_str = qedr_get_dev_fw_str; - + rdma_set_device_sysfs_group(&dev->ibdev, &qedr_attr_group); dev->ibdev.driver_id = RDMA_DRIVER_QEDR; return ib_register_device(&dev->ibdev, "qedr%d", NULL); } @@ -402,37 +429,6 @@ err1: return rc; } -/* QEDR sysfs interface */ -static ssize_t show_rev(struct device *device, struct device_attribute *attr, - char *buf) -{ - struct qedr_dev *dev = dev_get_drvdata(device); - - return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->pdev->vendor); -} - -static ssize_t show_hca_type(struct device *device, - struct device_attribute *attr, char *buf) -{ - return scnprintf(buf, PAGE_SIZE, "%s\n", "HCA_TYPE_TO_SET"); -} - -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL); - -static struct device_attribute *qedr_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type -}; - -static void qedr_remove_sysfiles(struct qedr_dev *dev) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(qedr_attributes); i++) - device_remove_file(&dev->ibdev.dev, qedr_attributes[i]); -} - static void qedr_pci_set_atomic(struct qedr_dev *dev, struct pci_dev *pdev) { int rc = pci_enable_atomic_ops_to_root(pdev, @@ -853,7 +849,7 @@ static struct qedr_dev *qedr_add(struct qed_dev *cdev, struct pci_dev *pdev, { struct qed_dev_rdma_info dev_info; struct qedr_dev *dev; - int rc = 0, i; + int rc = 0; dev = (struct qedr_dev *)ib_alloc_device(sizeof(*dev)); if (!dev) { @@ -912,18 +908,12 @@ static struct qedr_dev *qedr_add(struct qed_dev *cdev, struct pci_dev *pdev, goto reg_err; } - for (i = 0; i < ARRAY_SIZE(qedr_attributes); i++) - if (device_create_file(&dev->ibdev.dev, qedr_attributes[i])) - goto sysfs_err; - if (!test_and_set_bit(QEDR_ENET_STATE_BIT, &dev->enet_state)) qedr_ib_dispatch_event(dev, QEDR_PORT, IB_EVENT_PORT_ACTIVE); DP_DEBUG(dev, QEDR_MSG_INIT, "qedr driver loaded successfully\n"); return dev; -sysfs_err: - ib_unregister_device(&dev->ibdev); reg_err: qedr_sync_free_irqs(dev); irq_err: @@ -942,7 +932,6 @@ static void qedr_remove(struct qedr_dev *dev) /* First unregister with stack to stop all the active traffic * of the registered clients. 
*/ - qedr_remove_sysfiles(dev); ib_unregister_device(&dev->ibdev); qedr_stop_hw(dev); diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index 3461df002f81..83d2349188db 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -1390,13 +1390,13 @@ static inline u32 qib_get_hdrqtail(const struct qib_ctxtdata *rcd) */ extern const char ib_qib_version[]; +extern const struct attribute_group qib_attr_group; int qib_device_create(struct qib_devdata *); void qib_device_remove(struct qib_devdata *); int qib_create_port_files(struct ib_device *ibdev, u8 port_num, struct kobject *kobj); -int qib_verbs_register_sysfs(struct qib_devdata *); void qib_verbs_unregister_sysfs(struct qib_devdata *); /* Hook for sysfs read of QSFP */ extern int qib_qsfp_dump(struct qib_pportdata *ppd, char *buf, int len); diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c index ca2638d8f35e..1cf4ca3f23e3 100644 --- a/drivers/infiniband/hw/qib/qib_sysfs.c +++ b/drivers/infiniband/hw/qib/qib_sysfs.c @@ -551,17 +551,18 @@ static struct kobj_type qib_diagc_ktype = { * Start of per-unit (or driver, in some cases, but replicated * per unit) functions (these get a device *) */ -static ssize_t show_rev(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t hw_rev_show(struct device *device, struct device_attribute *attr, + char *buf) { struct qib_ibdev *dev = container_of(device, struct qib_ibdev, rdi.ibdev.dev); return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev); } +static DEVICE_ATTR_RO(hw_rev); -static ssize_t show_hca(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t hca_type_show(struct device *device, + struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = container_of(device, struct qib_ibdev, rdi.ibdev.dev); @@ -574,15 +575,18 @@ static ssize_t show_hca(struct device *device, struct device_attribute *attr, ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname); return ret; } +static DEVICE_ATTR_RO(hca_type); +static DEVICE_ATTR(board_id, 0444, hca_type_show, NULL); -static ssize_t show_version(struct device *device, +static ssize_t version_show(struct device *device, struct device_attribute *attr, char *buf) { /* The string printed here is already newline-terminated. */ return scnprintf(buf, PAGE_SIZE, "%s", (char *)ib_qib_version); } +static DEVICE_ATTR_RO(version); -static ssize_t show_boardversion(struct device *device, +static ssize_t boardversion_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = @@ -592,9 +596,9 @@ static ssize_t show_boardversion(struct device *device, /* The string printed here is already newline-terminated. */ return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion); } +static DEVICE_ATTR_RO(boardversion); - -static ssize_t show_localbus_info(struct device *device, +static ssize_t localbus_info_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = @@ -604,9 +608,9 @@ static ssize_t show_localbus_info(struct device *device, /* The string printed here is already newline-terminated. 
*/ return scnprintf(buf, PAGE_SIZE, "%s", dd->lbus_info); } +static DEVICE_ATTR_RO(localbus_info); - -static ssize_t show_nctxts(struct device *device, +static ssize_t nctxts_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = @@ -620,9 +624,10 @@ static ssize_t show_nctxts(struct device *device, (dd->first_user_ctxt > dd->cfgctxts) ? 0 : (dd->cfgctxts - dd->first_user_ctxt)); } +static DEVICE_ATTR_RO(nctxts); -static ssize_t show_nfreectxts(struct device *device, - struct device_attribute *attr, char *buf) +static ssize_t nfreectxts_show(struct device *device, + struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = container_of(device, struct qib_ibdev, rdi.ibdev.dev); @@ -631,8 +636,9 @@ static ssize_t show_nfreectxts(struct device *device, /* Return the number of free user ports (contexts) available. */ return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts); } +static DEVICE_ATTR_RO(nfreectxts); -static ssize_t show_serial(struct device *device, +static ssize_t serial_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = @@ -644,8 +650,9 @@ static ssize_t show_serial(struct device *device, strcat(buf, "\n"); return strlen(buf); } +static DEVICE_ATTR_RO(serial); -static ssize_t store_chip_reset(struct device *device, +static ssize_t chip_reset_store(struct device *device, struct device_attribute *attr, const char *buf, size_t count) { @@ -663,11 +670,12 @@ static ssize_t store_chip_reset(struct device *device, bail: return ret < 0 ? ret : count; } +static DEVICE_ATTR_WO(chip_reset); /* * Dump tempsense regs. in decimal, to ease shell-scripts. */ -static ssize_t show_tempsense(struct device *device, +static ssize_t tempsense_show(struct device *device, struct device_attribute *attr, char *buf) { struct qib_ibdev *dev = @@ -695,6 +703,7 @@ static ssize_t show_tempsense(struct device *device, *(signed char *)(regvals + 7)); return ret; } +static DEVICE_ATTR_RO(tempsense); /* * end of per-unit (or driver, in some cases, but replicated @@ -702,30 +711,23 @@ static ssize_t show_tempsense(struct device *device, */ /* start of per-unit file structures and support code */ -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL); -static DEVICE_ATTR(version, S_IRUGO, show_version, NULL); -static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL); -static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL); -static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL); -static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL); -static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL); -static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL); -static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset); +static struct attribute *qib_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, + &dev_attr_version.attr, + &dev_attr_nctxts.attr, + &dev_attr_nfreectxts.attr, + &dev_attr_serial.attr, + &dev_attr_boardversion.attr, + &dev_attr_tempsense.attr, + &dev_attr_localbus_info.attr, + &dev_attr_chip_reset.attr, + NULL, +}; -static struct device_attribute *qib_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type, - &dev_attr_board_id, - &dev_attr_version, - &dev_attr_nctxts, - &dev_attr_nfreectxts, - &dev_attr_serial, - &dev_attr_boardversion, - &dev_attr_tempsense, - &dev_attr_localbus_info, - &dev_attr_chip_reset, +const 
struct attribute_group qib_attr_group = { + .attrs = qib_attributes, }; int qib_create_port_files(struct ib_device *ibdev, u8 port_num, @@ -826,27 +828,6 @@ bail: return ret; } -/* - * Register and create our files in /sys/class/infiniband. - */ -int qib_verbs_register_sysfs(struct qib_devdata *dd) -{ - struct ib_device *dev = &dd->verbs_dev.rdi.ibdev; - int i, ret; - - for (i = 0; i < ARRAY_SIZE(qib_attributes); ++i) { - ret = device_create_file(&dev->dev, qib_attributes[i]); - if (ret) - goto bail; - } - - return 0; -bail: - for (i = 0; i < ARRAY_SIZE(qib_attributes); ++i) - device_remove_file(&dev->dev, qib_attributes[i]); - return ret; -} - /* * Unregister and remove our files in /sys/class/infiniband. */ diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index 8a45964c4700..4b0f5761a646 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -1625,19 +1625,14 @@ int qib_register_ib_device(struct qib_devdata *dd) i, dd->rcd[ctxt]->pkeys); } + rdma_set_device_sysfs_group(&dd->verbs_dev.rdi.ibdev, &qib_attr_group); ret = rvt_register_device(&dd->verbs_dev.rdi, RDMA_DRIVER_QIB); if (ret) goto err_tx; - ret = qib_verbs_register_sysfs(dd); - if (ret) - goto err_class; - return ret; -err_class: - rvt_unregister_device(&dd->verbs_dev.rdi); err_tx: while (!list_empty(&dev->txreq_free)) { struct list_head *l = dev->txreq_free.next; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c index e3f98fe2f1e6..73bd00f8d2c8 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_main.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c @@ -417,6 +417,8 @@ static void *usnic_ib_device_add(struct pci_dev *dev) us_ibdev->ib_dev.driver_id = RDMA_DRIVER_USNIC; + rdma_set_device_sysfs_group(&us_ibdev->ib_dev, &usnic_attr_group); + if (ib_register_device(&us_ibdev->ib_dev, "usnic_%d", NULL)) goto err_fwd_dealloc; diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c index fab4cb780122..a7e4b2ccfaf8 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c @@ -46,9 +46,8 @@ #include "usnic_ib_sysfs.h" #include "usnic_log.h" -static ssize_t usnic_ib_show_board(struct device *device, - struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *device, + struct device_attribute *attr, char *buf) { struct usnic_ib_dev *us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev); @@ -60,13 +59,13 @@ static ssize_t usnic_ib_show_board(struct device *device, return scnprintf(buf, PAGE_SIZE, "%hu\n", subsystem_device_id); } +static DEVICE_ATTR_RO(board_id); /* * Report the configuration for this PF */ static ssize_t -usnic_ib_show_config(struct device *device, struct device_attribute *attr, - char *buf) +config_show(struct device *device, struct device_attribute *attr, char *buf) { struct usnic_ib_dev *us_ibdev; char *ptr; @@ -126,10 +125,10 @@ usnic_ib_show_config(struct device *device, struct device_attribute *attr, return ptr - buf; } +static DEVICE_ATTR_RO(config); static ssize_t -usnic_ib_show_iface(struct device *device, struct device_attribute *attr, - char *buf) +iface_show(struct device *device, struct device_attribute *attr, char *buf) { struct usnic_ib_dev *us_ibdev; @@ -138,10 +137,10 @@ usnic_ib_show_iface(struct device *device, struct device_attribute *attr, return scnprintf(buf, PAGE_SIZE, "%s\n", netdev_name(us_ibdev->netdev)); } +static 
DEVICE_ATTR_RO(iface); static ssize_t -usnic_ib_show_max_vf(struct device *device, struct device_attribute *attr, - char *buf) +max_vf_show(struct device *device, struct device_attribute *attr, char *buf) { struct usnic_ib_dev *us_ibdev; @@ -150,10 +149,10 @@ usnic_ib_show_max_vf(struct device *device, struct device_attribute *attr, return scnprintf(buf, PAGE_SIZE, "%u\n", kref_read(&us_ibdev->vf_cnt)); } +static DEVICE_ATTR_RO(max_vf); static ssize_t -usnic_ib_show_qp_per_vf(struct device *device, struct device_attribute *attr, - char *buf) +qp_per_vf_show(struct device *device, struct device_attribute *attr, char *buf) { struct usnic_ib_dev *us_ibdev; int qp_per_vf; @@ -165,10 +164,10 @@ usnic_ib_show_qp_per_vf(struct device *device, struct device_attribute *attr, return scnprintf(buf, PAGE_SIZE, "%d\n", qp_per_vf); } +static DEVICE_ATTR_RO(qp_per_vf); static ssize_t -usnic_ib_show_cq_per_vf(struct device *device, struct device_attribute *attr, - char *buf) +cq_per_vf_show(struct device *device, struct device_attribute *attr, char *buf) { struct usnic_ib_dev *us_ibdev; @@ -177,21 +176,20 @@ usnic_ib_show_cq_per_vf(struct device *device, struct device_attribute *attr, return scnprintf(buf, PAGE_SIZE, "%d\n", us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ]); } +static DEVICE_ATTR_RO(cq_per_vf); -static DEVICE_ATTR(board_id, S_IRUGO, usnic_ib_show_board, NULL); -static DEVICE_ATTR(config, S_IRUGO, usnic_ib_show_config, NULL); -static DEVICE_ATTR(iface, S_IRUGO, usnic_ib_show_iface, NULL); -static DEVICE_ATTR(max_vf, S_IRUGO, usnic_ib_show_max_vf, NULL); -static DEVICE_ATTR(qp_per_vf, S_IRUGO, usnic_ib_show_qp_per_vf, NULL); -static DEVICE_ATTR(cq_per_vf, S_IRUGO, usnic_ib_show_cq_per_vf, NULL); +static struct attribute *usnic_class_attributes[] = { + &dev_attr_board_id.attr, + &dev_attr_config.attr, + &dev_attr_iface.attr, + &dev_attr_max_vf.attr, + &dev_attr_qp_per_vf.attr, + &dev_attr_cq_per_vf.attr, + NULL +}; -static struct device_attribute *usnic_class_attributes[] = { - &dev_attr_board_id, - &dev_attr_config, - &dev_attr_iface, - &dev_attr_max_vf, - &dev_attr_qp_per_vf, - &dev_attr_cq_per_vf, +const struct attribute_group usnic_attr_group = { + .attrs = usnic_class_attributes, }; struct qpn_attribute { @@ -278,18 +276,6 @@ static struct kobj_type usnic_ib_qpn_type = { int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev) { - int i; - int err; - for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) { - err = device_create_file(&us_ibdev->ib_dev.dev, - usnic_class_attributes[i]); - if (err) { - usnic_err("Failed to create device file %d for %s eith err %d", - i, dev_name(&us_ibdev->ib_dev.dev), err); - return -EINVAL; - } - } - /* create kernel object for looking at individual QPs */ kobject_get(&us_ibdev->ib_dev.dev.kobj); us_ibdev->qpn_kobj = kobject_create_and_add("qpn", @@ -304,12 +290,6 @@ int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev) void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev) { - int i; - for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) { - device_remove_file(&us_ibdev->ib_dev.dev, - usnic_class_attributes[i]); - } - kobject_put(us_ibdev->qpn_kobj); } diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h index 3d98e16cfeaf..b1f064cec850 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h +++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h @@ -41,4 +41,6 @@ void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev); void usnic_ib_sysfs_qpn_add(struct 
usnic_ib_qp_grp *qp_grp); void usnic_ib_sysfs_qpn_remove(struct usnic_ib_qp_grp *qp_grp); +extern const struct attribute_group usnic_attr_group; + #endif /* !USNIC_IB_SYSFS_H_ */ diff --git a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c index c1e31985b11c..398443f43dc3 100644 --- a/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c +++ b/drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c @@ -65,32 +65,36 @@ static struct workqueue_struct *event_wq; static int pvrdma_add_gid(const struct ib_gid_attr *attr, void **context); static int pvrdma_del_gid(const struct ib_gid_attr *attr, void **context); -static ssize_t show_hca(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t hca_type_show(struct device *device, + struct device_attribute *attr, char *buf) { return sprintf(buf, "VMW_PVRDMA-%s\n", DRV_VERSION); } +static DEVICE_ATTR_RO(hca_type); -static ssize_t show_rev(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t hw_rev_show(struct device *device, + struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", PVRDMA_REV_ID); } +static DEVICE_ATTR_RO(hw_rev); -static ssize_t show_board(struct device *device, struct device_attribute *attr, - char *buf) +static ssize_t board_id_show(struct device *device, + struct device_attribute *attr, char *buf) { return sprintf(buf, "%d\n", PVRDMA_BOARD_ID); } +static DEVICE_ATTR_RO(board_id); -static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); -static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL); -static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL); +static struct attribute *pvrdma_class_attributes[] = { + &dev_attr_hw_rev.attr, + &dev_attr_hca_type.attr, + &dev_attr_board_id.attr, + NULL, +}; -static struct device_attribute *pvrdma_class_attributes[] = { - &dev_attr_hw_rev, - &dev_attr_hca_type, - &dev_attr_board_id +static const struct attribute_group pvrdma_attr_group = { + .attrs = pvrdma_class_attributes, }; static void pvrdma_get_fw_ver_str(struct ib_device *device, char *str) @@ -160,7 +164,6 @@ static struct net_device *pvrdma_get_netdev(struct ib_device *ibdev, static int pvrdma_register_device(struct pvrdma_dev *dev) { int ret = -1; - int i = 0; dev->ib_dev.node_guid = dev->dsr->caps.node_guid; dev->sys_image_guid = dev->dsr->caps.sys_image_guid; @@ -265,24 +268,16 @@ static int pvrdma_register_device(struct pvrdma_dev *dev) } dev->ib_dev.driver_id = RDMA_DRIVER_VMW_PVRDMA; spin_lock_init(&dev->srq_tbl_lock); + rdma_set_device_sysfs_group(&dev->ib_dev, &pvrdma_attr_group); ret = ib_register_device(&dev->ib_dev, "vmw_pvrdma%d", NULL); if (ret) goto err_srq_free; - for (i = 0; i < ARRAY_SIZE(pvrdma_class_attributes); ++i) { - ret = device_create_file(&dev->ib_dev.dev, - pvrdma_class_attributes[i]); - if (ret) - goto err_class; - } - dev->ib_active = true; return 0; -err_class: - ib_unregister_device(&dev->ib_dev); err_srq_free: kfree(dev->srq_tbl); err_qp_free: diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.c b/drivers/infiniband/sw/rxe/rxe_verbs.c index e4da5b671e4a..9c19f2027511 100644 --- a/drivers/infiniband/sw/rxe/rxe_verbs.c +++ b/drivers/infiniband/sw/rxe/rxe_verbs.c @@ -1148,14 +1148,18 @@ static ssize_t parent_show(struct device *device, static DEVICE_ATTR_RO(parent); -static struct device_attribute *rxe_dev_attributes[] = { - &dev_attr_parent, +static struct attribute *rxe_dev_attributes[] = { + &dev_attr_parent.attr, + NULL +}; + +static const struct attribute_group rxe_attr_group = { + .attrs = 
rxe_dev_attributes, }; int rxe_register_device(struct rxe_dev *rxe) { int err; - int i; struct ib_device *dev = &rxe->ib_dev; struct crypto_shash *tfm; @@ -1259,6 +1263,7 @@ int rxe_register_device(struct rxe_dev *rxe) } rxe->tfm = tfm; + rdma_set_device_sysfs_group(dev, &rxe_attr_group); dev->driver_id = RDMA_DRIVER_RXE; err = ib_register_device(dev, "rxe%d", NULL); if (err) { @@ -1266,19 +1271,8 @@ int rxe_register_device(struct rxe_dev *rxe) goto err1; } - for (i = 0; i < ARRAY_SIZE(rxe_dev_attributes); ++i) { - err = device_create_file(&dev->dev, rxe_dev_attributes[i]); - if (err) { - pr_warn("%s failed with error %d for attr number %d\n", - __func__, err, i); - goto err2; - } - } - return 0; -err2: - ib_unregister_device(dev); err1: crypto_free_shash(rxe->tfm); @@ -1287,12 +1281,8 @@ err1: int rxe_unregister_device(struct rxe_dev *rxe) { - int i; struct ib_device *dev = &rxe->ib_dev; - for (i = 0; i < ARRAY_SIZE(rxe_dev_attributes); ++i) - device_remove_file(&dev->dev, rxe_dev_attributes[i]); - ib_unregister_device(dev); return 0; From 5d6ff1babe78034f0cf8e5f7bf312a257e5574cc Mon Sep 17 00:00:00 2001 From: Yonatan Cohen Date: Tue, 9 Oct 2018 12:05:13 +0300 Subject: [PATCH 235/242] IB/mlx5: Support scatter to CQE for DC transport type Scatter to CQE is a HW offload that saves PCI writes by scattering the payload to the CQE. This patch extends already existing functionality to support DC transport type. Signed-off-by: Yonatan Cohen Reviewed-by: Guy Levi Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/cq.c | 2 +- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- drivers/infiniband/hw/mlx5/qp.c | 71 ++++++++++++++++++++-------- 3 files changed, 54 insertions(+), 21 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index dae30b6478bf..a41519dc8d3a 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -1460,7 +1460,7 @@ ex: return err; } -int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq) +int mlx5_ib_get_cqe_size(struct ib_cq *ibcq) { struct mlx5_ib_cq *cq; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 8444ea78229a..9de9397166b8 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1127,7 +1127,7 @@ void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem, int page_shift, __be64 *pas, int access_flags); void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num); -int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq); +int mlx5_ib_get_cqe_size(struct ib_cq *ibcq); int mlx5_mr_cache_init(struct mlx5_ib_dev *dev); int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 829aec3aba8f..817c391bdfc0 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1053,7 +1053,8 @@ static u32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr) static int is_connected(enum ib_qp_type qp_type) { - if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC) + if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC || + qp_type == MLX5_IB_QPT_DCI) return 1; return 0; @@ -1684,6 +1685,49 @@ err: return err; } +static void configure_responder_scat_cqe(struct ib_qp_init_attr *init_attr, + void *qpc) +{ + int rcqe_sz; + + if (init_attr->qp_type == MLX5_IB_QPT_DCI) + return; + + rcqe_sz 
= mlx5_ib_get_cqe_size(init_attr->recv_cq);
+
+	if (rcqe_sz == 128) {
+		MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE);
+		return;
+	}
+
+	if (init_attr->qp_type != MLX5_IB_QPT_DCT)
+		MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA32_CQE);
+}
+
+static void configure_requester_scat_cqe(struct mlx5_ib_dev *dev,
+					 struct ib_qp_init_attr *init_attr,
+					 void *qpc)
+{
+	enum ib_qp_type qpt = init_attr->qp_type;
+	int scqe_sz;
+
+	if (qpt == IB_QPT_UC || qpt == IB_QPT_UD)
+		return;
+
+	if (init_attr->sq_sig_type != IB_SIGNAL_ALL_WR)
+		return;
+
+	scqe_sz = mlx5_ib_get_cqe_size(init_attr->send_cq);
+	if (scqe_sz == 128) {
+		MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA64_CQE);
+		return;
+	}
+
+	if (init_attr->qp_type != MLX5_IB_QPT_DCI ||
+	    MLX5_CAP_GEN(dev->mdev, dc_req_scat_data_cqe))
+		MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA32_CQE);
+}
+
 static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 			    struct ib_qp_init_attr *init_attr,
 			    struct ib_udata *udata, struct mlx5_ib_qp *qp)
@@ -1787,7 +1831,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 			return err;
 
 		qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE);
-		qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE);
+		if (MLX5_CAP_GEN(dev->mdev, sctr_data_cqe))
+			qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE);
 		if (ucmd.flags & MLX5_QP_FLAG_TUNNEL_OFFLOADS) {
 			if (init_attr->qp_type != IB_QPT_RAW_PACKET ||
 			    !tunnel_offload_supported(mdev)) {
@@ -1911,23 +1956,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 		MLX5_SET(qpc, qpc, cd_slave_receive, 1);
 
 	if (qp->scat_cqe && is_connected(init_attr->qp_type)) {
-		int rcqe_sz;
-		int scqe_sz;
-
-		rcqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->recv_cq);
-		scqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->send_cq);
-
-		if (rcqe_sz == 128)
-			MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA64_CQE);
-		else
-			MLX5_SET(qpc, qpc, cs_res, MLX5_RES_SCAT_DATA32_CQE);
-
-		if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) {
-			if (scqe_sz == 128)
-				MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA64_CQE);
-			else
-				MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA32_CQE);
-		}
+		configure_responder_scat_cqe(init_attr, qpc);
+		configure_requester_scat_cqe(dev, init_attr, qpc);
 	}
 
 	if (qp->rq.wqe_cnt) {
@@ -2302,6 +2332,9 @@ static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd,
 	MLX5_SET64(dctc, dctc, dc_access_key, ucmd->access_key);
 	MLX5_SET(dctc, dctc, user_index, uidx);
 
+	if (ucmd->flags & MLX5_QP_FLAG_SCATTER_CQE)
+		configure_responder_scat_cqe(attr, dctc);
+
 	qp->state = IB_QPS_RESET;
 
 	return &qp->ibqp;

From 2e43bb31b8df662f591a7e80270ca3acda44bb48 Mon Sep 17 00:00:00 2001
From: Yonatan Cohen
Date: Tue, 9 Oct 2018 12:05:14 +0300
Subject: [PATCH 236/242] IB/mlx5: Verify that driver supports user flags

Flags sent down from userspace might not be supported by the running
driver. This might lead to unwanted bugs. To solve this, add a macro
that tests for unsupported flags.
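As a minimal illustration of the macro's intended semantics (the
MLX5_QP_FLAG_* names are from mlx5-abi.h in this series; the bit-63
flag is a made-up unsupported flag, not part of the ABI):

	/* Illustration only, not part of the patch. */
	static void check_flags_mask_example(void)
	{
		bool ok;

		/* every requested bit is in the supported mask -> true */
		ok = check_flags_mask(MLX5_QP_FLAG_SIGNATURE | MLX5_QP_FLAG_SCATTER_CQE,
				      MLX5_QP_FLAG_SIGNATURE | MLX5_QP_FLAG_SCATTER_CQE);

		/* an unknown bit is rejected -> false, QP creation fails
		 * with -EINVAL
		 */
		ok = check_flags_mask(MLX5_QP_FLAG_SIGNATURE | BIT_ULL(63),
				      MLX5_QP_FLAG_SIGNATURE);
	}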
Signed-off-by: Yonatan Cohen
Signed-off-by: Leon Romanovsky
Signed-off-by: Doug Ledford
---
 drivers/infiniband/hw/mlx5/qp.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 817c391bdfc0..5b1811be6677 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -1728,6 +1728,11 @@ static void configure_requester_scat_cqe(struct mlx5_ib_dev *dev,
 		MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA32_CQE);
 }
 
+static inline bool check_flags_mask(uint64_t input, uint64_t supported)
+{
+	return (input & ~supported) == 0;
+}
+
 static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 			    struct ib_qp_init_attr *init_attr,
 			    struct ib_udata *udata, struct mlx5_ib_qp *qp)
@@ -1825,6 +1830,15 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 			return -EFAULT;
 		}
 
+		if (!check_flags_mask(ucmd.flags,
+				      MLX5_QP_FLAG_SIGNATURE |
+				      MLX5_QP_FLAG_SCATTER_CQE |
+				      MLX5_QP_FLAG_TUNNEL_OFFLOADS |
+				      MLX5_QP_FLAG_BFREG_INDEX |
+				      MLX5_QP_FLAG_TYPE_DCT |
+				      MLX5_QP_FLAG_TYPE_DCI))
+			return -EINVAL;
+
 		err = get_qp_user_index(to_mucontext(pd->uobject->context),
 					&ucmd, udata->inlen, &uidx);
 		if (err)

From 6f4bc0ea682b59d7013cbc5ced2d4dd73067a33f Mon Sep 17 00:00:00 2001
From: Yonatan Cohen
Date: Tue, 9 Oct 2018 12:05:15 +0300
Subject: [PATCH 237/242] IB/mlx5: Allow scatter to CQE without global signaled WRs

Requester scatter to CQE is restricted to QPs configured to signal
all WRs. This patch adds the ability to force-enable scatter to CQE
in the requester without sig_all, for users who do not want all WRs
signaled but rather just the ones whose data is found in the CQE.

Signed-off-by: Yonatan Cohen
Reviewed-by: Guy Levi
Signed-off-by: Leon Romanovsky
Signed-off-by: Doug Ledford
---
 drivers/infiniband/hw/mlx5/qp.c | 14 +++++++++++---
 include/uapi/rdma/mlx5-abi.h    |  1 +
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
index 5b1811be6677..368728e6f980 100644
--- a/drivers/infiniband/hw/mlx5/qp.c
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -1706,15 +1706,20 @@ static void configure_responder_scat_cqe(struct ib_qp_init_attr *init_attr,
 
 static void configure_requester_scat_cqe(struct mlx5_ib_dev *dev,
 					 struct ib_qp_init_attr *init_attr,
+					 struct mlx5_ib_create_qp *ucmd,
 					 void *qpc)
 {
 	enum ib_qp_type qpt = init_attr->qp_type;
 	int scqe_sz;
+	bool allow_scat_cqe = 0;
 
 	if (qpt == IB_QPT_UC || qpt == IB_QPT_UD)
 		return;
 
-	if (init_attr->sq_sig_type != IB_SIGNAL_ALL_WR)
+	if (ucmd)
+		allow_scat_cqe = ucmd->flags & MLX5_QP_FLAG_ALLOW_SCATTER_CQE;
+
+	if (!allow_scat_cqe && init_attr->sq_sig_type != IB_SIGNAL_ALL_WR)
 		return;
 
 	scqe_sz = mlx5_ib_get_cqe_size(init_attr->send_cq);
@@ -1836,7 +1841,8 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 				      MLX5_QP_FLAG_TUNNEL_OFFLOADS |
 				      MLX5_QP_FLAG_BFREG_INDEX |
 				      MLX5_QP_FLAG_TYPE_DCT |
-				      MLX5_QP_FLAG_TYPE_DCI))
+				      MLX5_QP_FLAG_TYPE_DCI |
+				      MLX5_QP_FLAG_ALLOW_SCATTER_CQE))
 			return -EINVAL;
 
 		err = get_qp_user_index(to_mucontext(pd->uobject->context),
@@ -1971,7 +1977,9 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 
 	if (qp->scat_cqe && is_connected(init_attr->qp_type)) {
 		configure_responder_scat_cqe(init_attr, qpc);
-		configure_requester_scat_cqe(dev, init_attr, qpc);
+		configure_requester_scat_cqe(dev, init_attr,
+					     (pd && pd->uobject) ?
&ucmd : NULL,
+					     qpc);
 	}
 
 	if (qp->rq.wqe_cnt) {
diff --git a/include/uapi/rdma/mlx5-abi.h b/include/uapi/rdma/mlx5-abi.h
index 6056625237cf..8fa9f90e2bb1 100644
--- a/include/uapi/rdma/mlx5-abi.h
+++ b/include/uapi/rdma/mlx5-abi.h
@@ -47,6 +47,7 @@ enum {
 	MLX5_QP_FLAG_TYPE_DCI		= 1 << 5,
 	MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC = 1 << 6,
 	MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC = 1 << 7,
+	MLX5_QP_FLAG_ALLOW_SCATTER_CQE	= 1 << 8,
 };
 
 enum {

From 56e027a604c80b1106786f8426e7cc5dbacb53b2 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke
Date: Wed, 17 Oct 2018 09:20:04 +0200
Subject: [PATCH 238/242] ib_srp: Remove WARN_ON in srp_terminate_io()

The WARN_ON() is pointless, as the rport is placed in
SDEV_TRANSPORT_OFFLINE at that time, so no new commands can be
submitted via srp_queuecommand().

Signed-off-by: Hannes Reinecke
Reviewed-by: Jens Axboe
Reviewed-by: Johannes Thumshirn
Acked-by: Bart Van Assche
Signed-off-by: Doug Ledford
---
 drivers/infiniband/ulp/srp/ib_srp.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index 36ade9ae7dde..eed0eb3bb04c 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -1330,17 +1330,8 @@ static void srp_terminate_io(struct srp_rport *rport)
 {
 	struct srp_target_port *target = rport->lld_data;
 	struct srp_rdma_ch *ch;
-	struct Scsi_Host *shost = target->scsi_host;
-	struct scsi_device *sdev;
 	int i, j;
 
-	/*
-	 * Invoking srp_terminate_io() while srp_queuecommand() is running
-	 * is not safe. Hence the warning statement below.
-	 */
-	shost_for_each_device(sdev, shost)
-		WARN_ON_ONCE(sdev->request_queue->request_fn_active);
-
 	for (i = 0; i < target->ch_count; i++) {
 		ch = &target->ch[i];

From 67fecaf8e9cc28812042f61194ac0e0a9737f897 Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Wed, 17 Oct 2018 13:19:27 +0300
Subject: [PATCH 239/242] RDMA/core: Fix unwinding flow in case of error to register device

If port pkey list initialization fails, free the port_immutable memory
during the cleanup path; currently this step is missed.
If cache setup fails, free the pkey list during the cleanup path.

Fixes: d291f1a65 ("IB/core: Enforce PKey security on QPs")
CC: Srinivas Eeda
CC: Junxiao Bi
Signed-off-by: Parav Pandit
Reviewed-by: Daniel Jurgens
Signed-off-by: Leon Romanovsky
Signed-off-by: Doug Ledford
---
 drivers/infiniband/core/device.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 5e70f5e1cfd9..d175b94ae952 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -545,14 +545,14 @@ int ib_register_device(struct ib_device *device, const char *name,
 	ret = setup_port_pkey_list(device);
 	if (ret) {
 		dev_warn(&device->dev, "Couldn't create per port_pkey_list\n");
-		goto out;
+		goto port_cleanup;
 	}
 
 	ret = ib_cache_setup_one(device);
 	if (ret) {
 		dev_warn(&device->dev,
 			 "Couldn't set up InfiniBand P_Key/GID cache\n");
-		goto port_cleanup;
+		goto pkey_cleanup;
 	}
 
 	device->index = __dev_new_index();
@@ -596,6 +596,8 @@ cg_cleanup:
 cache_cleanup:
 	ib_cache_cleanup_one(device);
 	ib_cache_release_one(device);
+pkey_cleanup:
+	kfree(device->port_pkey_list);
 port_cleanup:
 	kfree(device->port_immutable);
 out:

From 548cb4fbe80d68b9d1b8b30aca179636e74bec36 Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Wed, 17 Oct 2018 13:20:20 +0300
Subject: [PATCH 240/242] RDMA/core: Refactor ib_register_device() function

ib_register_device() does several allocation and initialization steps.
Split it into smaller more readable functions for easy review and maintenance. Signed-off-by: Parav Pandit Reviewed-by: Daniel Jurgens Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/core/device.c | 132 ++++++++++++++++++------------- 1 file changed, 78 insertions(+), 54 deletions(-) diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index d175b94ae952..87eb4f2cdd7d 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -465,22 +465,8 @@ static u32 __dev_new_index(void) } } -/** - * ib_register_device - Register an IB device with IB core - * @device:Device to register - * - * Low-level drivers use ib_register_device() to register their - * devices with the IB core. All registered clients will receive a - * callback for each device that is added. @device must be allocated - * with ib_alloc_device(). - */ -int ib_register_device(struct ib_device *device, const char *name, - int (*port_callback)(struct ib_device *, u8, - struct kobject *)) +static void setup_dma_device(struct ib_device *device) { - int ret; - struct ib_client *client; - struct ib_udata uhw = {.outlen = 0, .inlen = 0}; struct device *parent = device->dev.parent; WARN_ON_ONCE(device->dma_device); @@ -512,6 +498,78 @@ int ib_register_device(struct ib_device *device, const char *name, WARN_ON_ONCE(!parent); device->dma_device = parent; } +} + +static void cleanup_device(struct ib_device *device) +{ + ib_cache_cleanup_one(device); + ib_cache_release_one(device); + kfree(device->port_pkey_list); + kfree(device->port_immutable); +} + +static int setup_device(struct ib_device *device) +{ + struct ib_udata uhw = {.outlen = 0, .inlen = 0}; + int ret; + + ret = ib_device_check_mandatory(device); + if (ret) + return ret; + + ret = read_port_immutable(device); + if (ret) { + dev_warn(&device->dev, + "Couldn't create per port immutable data\n"); + return ret; + } + + memset(&device->attrs, 0, sizeof(device->attrs)); + ret = device->query_device(device, &device->attrs, &uhw); + if (ret) { + dev_warn(&device->dev, + "Couldn't query the device attributes\n"); + goto port_cleanup; + } + + ret = setup_port_pkey_list(device); + if (ret) { + dev_warn(&device->dev, "Couldn't create per port_pkey_list\n"); + goto port_cleanup; + } + + ret = ib_cache_setup_one(device); + if (ret) { + dev_warn(&device->dev, + "Couldn't set up InfiniBand P_Key/GID cache\n"); + goto pkey_cleanup; + } + return 0; + +pkey_cleanup: + kfree(device->port_pkey_list); +port_cleanup: + kfree(device->port_immutable); + return ret; +} + +/** + * ib_register_device - Register an IB device with IB core + * @device:Device to register + * + * Low-level drivers use ib_register_device() to register their + * devices with the IB core. All registered clients will receive a + * callback for each device that is added. @device must be allocated + * with ib_alloc_device(). 
+ */
+int ib_register_device(struct ib_device *device, const char *name,
+		       int (*port_callback)(struct ib_device *, u8,
+					    struct kobject *))
+{
+	int ret;
+	struct ib_client *client;
+
+	setup_dma_device(device);
 
 	mutex_lock(&device_mutex);
 
@@ -530,30 +588,9 @@ int ib_register_device(struct ib_device *device, const char *name,
 	}
 	strlcpy(device->name, dev_name(&device->dev), IB_DEVICE_NAME_MAX);
 
-	if (ib_device_check_mandatory(device)) {
-		ret = -EINVAL;
+	ret = setup_device(device);
+	if (ret)
 		goto out;
-	}
-
-	ret = read_port_immutable(device);
-	if (ret) {
-		dev_warn(&device->dev,
-			 "Couldn't create per port immutable data\n");
-		goto out;
-	}
-
-	ret = setup_port_pkey_list(device);
-	if (ret) {
-		dev_warn(&device->dev, "Couldn't create per port_pkey_list\n");
-		goto port_cleanup;
-	}
-
-	ret = ib_cache_setup_one(device);
-	if (ret) {
-		dev_warn(&device->dev,
-			 "Couldn't set up InfiniBand P_Key/GID cache\n");
-		goto pkey_cleanup;
-	}
 
 	device->index = __dev_new_index();
 
@@ -561,15 +598,7 @@ int ib_register_device(struct ib_device *device, const char *name,
 	if (ret) {
 		dev_warn(&device->dev,
 			 "Couldn't register device with rdma cgroup\n");
-		goto cache_cleanup;
-	}
-
-	memset(&device->attrs, 0, sizeof(device->attrs));
-	ret = device->query_device(device, &device->attrs, &uhw);
-	if (ret) {
-		dev_warn(&device->dev,
-			 "Couldn't query the device attributes\n");
-		goto cg_cleanup;
+		goto dev_cleanup;
 	}
 
 	ret = ib_device_register_sysfs(device, port_callback);
@@ -593,13 +622,8 @@ int ib_register_device(struct ib_device *device, const char *name,
 
 cg_cleanup:
 	ib_device_unregister_rdmacg(device);
-cache_cleanup:
-	ib_cache_cleanup_one(device);
-	ib_cache_release_one(device);
-pkey_cleanup:
-	kfree(device->port_pkey_list);
-port_cleanup:
-	kfree(device->port_immutable);
+dev_cleanup:
+	cleanup_device(device);
 out:
 	mutex_unlock(&device_mutex);
 	return ret;

From 76d865b87c327b34c0e24f23e75828878022f899 Mon Sep 17 00:00:00 2001
From: Parav Pandit
Date: Wed, 17 Oct 2018 13:21:08 +0300
Subject: [PATCH 241/242] RDMA/core: Fix comment for hw stats init for port == 0

When add_port() is done for port == 0, it indicates that per-port
hardware counter initialization should be skipped. Reflect this in the
comment.

Signed-off-by: Parav Pandit
Signed-off-by: Leon Romanovsky
Signed-off-by: Doug Ledford
---
 drivers/infiniband/core/sysfs.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
index f54f107ef668..6fcce2c206c6 100644
--- a/drivers/infiniband/core/sysfs.c
+++ b/drivers/infiniband/core/sysfs.c
@@ -1120,9 +1120,9 @@ static int add_port(struct ib_device *device, int port_num,
 	}
 
 	/*
-	 * If port == 0, it means we have only one port and the parent
-	 * device, not this port device, should be the holder of the
-	 * hw_counters
+	 * If port == 0, it means hw_counters are per device and not per
+	 * port, so holder should be device. Therefore skip per port counter
+	 * initialization.
 	 */
 	if (device->alloc_hw_stats && port_num)
 		setup_hw_stats(device, p, port_num);

From a60109dc9a954ef9eddba6577e2d2e9e7952e487 Mon Sep 17 00:00:00 2001
From: Yonatan Cohen
Date: Wed, 10 Oct 2018 09:25:16 +0300
Subject: [PATCH 242/242] IB/mlx5: Add support for extended atomic operations

Extended atomic operations cmp&swp and fetch&add are a Mellanox feature
extending the standard atomic operations to use varied operand sizes,
as opposed to normal atomic operations that use an 8 byte operand only.
Extended atomics allow masking the results and arguments.
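For orientation, a small worked sketch of the size-mask handling (this
mirrors the atomic_size_to_mode() helper introduced in the diff below;
the mask values are illustrative):

	/* Bit n set in the atomic_size_qp/atomic_size_dc capability mask
	 * means 2^n-byte operands are supported; the returned mode is log2
	 * of the largest supported operand size.
	 *
	 *   atomic_size_to_mode(0x100) == 8, i.e. MLX5_ATOMIC_MODE_256B
	 *   atomic_size_to_mode(0x008) == MLX5_ATOMIC_MODE_8B (== 3)
	 *   atomic_size_to_mode(0x000) == -EOPNOTSUPP (nothing supported)
	 */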
This patch configures QP to support extended atomic operation with the maximum size possible, as exposed by HCA capabilities. Signed-off-by: Yonatan Cohen Reviewed-by: Guy Levi Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford --- drivers/infiniband/hw/mlx5/qp.c | 98 ++++++++++++++++++++++++++++----- include/linux/mlx5/driver.h | 23 ++++---- 2 files changed, 96 insertions(+), 25 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 368728e6f980..6841c0f9237f 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -1733,6 +1733,53 @@ static void configure_requester_scat_cqe(struct mlx5_ib_dev *dev, MLX5_SET(qpc, qpc, cs_req, MLX5_REQ_SCAT_DATA32_CQE); } +static int atomic_size_to_mode(int size_mask) +{ + /* driver does not support atomic_size > 256B + * and does not know how to translate bigger sizes + */ + int supported_size_mask = size_mask & 0x1ff; + int log_max_size; + + if (!supported_size_mask) + return -EOPNOTSUPP; + + log_max_size = __fls(supported_size_mask); + + if (log_max_size > 3) + return log_max_size; + + return MLX5_ATOMIC_MODE_8B; +} + +static int get_atomic_mode(struct mlx5_ib_dev *dev, + enum ib_qp_type qp_type) +{ + u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); + u8 atomic = MLX5_CAP_GEN(dev->mdev, atomic); + int atomic_mode = -EOPNOTSUPP; + int atomic_size_mask; + + if (!atomic) + return -EOPNOTSUPP; + + if (qp_type == MLX5_IB_QPT_DCT) + atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc); + else + atomic_size_mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); + + if ((atomic_operations & MLX5_ATOMIC_OPS_EXTENDED_CMP_SWAP) || + (atomic_operations & MLX5_ATOMIC_OPS_EXTENDED_FETCH_ADD)) + atomic_mode = atomic_size_to_mode(atomic_size_mask); + + if (atomic_mode <= 0 && + (atomic_operations & MLX5_ATOMIC_OPS_CMP_SWAP && + atomic_operations & MLX5_ATOMIC_OPS_FETCH_ADD)) + atomic_mode = MLX5_ATOMIC_MODE_IB_COMP; + + return atomic_mode; +} + static inline bool check_flags_mask(uint64_t input, uint64_t supported) { return (input & ~supported) == 0; @@ -2562,13 +2609,15 @@ int mlx5_ib_destroy_qp(struct ib_qp *qp) return 0; } -static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_attr *attr, - int attr_mask) +static int to_mlx5_access_flags(struct mlx5_ib_qp *qp, + const struct ib_qp_attr *attr, + int attr_mask, __be32 *hw_access_flags) { - u32 hw_access_flags = 0; u8 dest_rd_atomic; u32 access_flags; + struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); + if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) dest_rd_atomic = attr->max_dest_rd_atomic; else @@ -2583,13 +2632,25 @@ static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_att access_flags &= IB_ACCESS_REMOTE_WRITE; if (access_flags & IB_ACCESS_REMOTE_READ) - hw_access_flags |= MLX5_QP_BIT_RRE; - if (access_flags & IB_ACCESS_REMOTE_ATOMIC) - hw_access_flags |= (MLX5_QP_BIT_RAE | MLX5_ATOMIC_MODE_CX); - if (access_flags & IB_ACCESS_REMOTE_WRITE) - hw_access_flags |= MLX5_QP_BIT_RWE; + *hw_access_flags |= MLX5_QP_BIT_RRE; + if ((access_flags & IB_ACCESS_REMOTE_ATOMIC) && + qp->ibqp.qp_type == IB_QPT_RC) { + int atomic_mode; - return cpu_to_be32(hw_access_flags); + atomic_mode = get_atomic_mode(dev, qp->ibqp.qp_type); + if (atomic_mode < 0) + return -EOPNOTSUPP; + + *hw_access_flags |= MLX5_QP_BIT_RAE; + *hw_access_flags |= atomic_mode << MLX5_ATOMIC_MODE_OFFSET; + } + + if (access_flags & IB_ACCESS_REMOTE_WRITE) + *hw_access_flags |= MLX5_QP_BIT_RWE; + + *hw_access_flags = 
cpu_to_be32(*hw_access_flags); + + return 0; } enum { @@ -3287,8 +3348,15 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21); } - if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) - context->params2 |= to_mlx5_access_flags(qp, attr, attr_mask); + if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) { + __be32 access_flags = 0; + + err = to_mlx5_access_flags(qp, attr, attr_mask, &access_flags); + if (err) + goto out; + + context->params2 |= access_flags; + } if (attr_mask & IB_QP_MIN_RNR_TIMER) context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24); @@ -3504,10 +3572,14 @@ static int mlx5_ib_modify_dct(struct ib_qp *ibqp, struct ib_qp_attr *attr, if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) MLX5_SET(dctc, dctc, rwe, 1); if (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) { - if (!mlx5_ib_dc_atomic_is_supported(dev)) + int atomic_mode; + + atomic_mode = get_atomic_mode(dev, MLX5_IB_QPT_DCT); + if (atomic_mode < 0) return -EOPNOTSUPP; + + MLX5_SET(dctc, dctc, atomic_mode, atomic_mode); MLX5_SET(dctc, dctc, rae, 1); - MLX5_SET(dctc, dctc, atomic_mode, MLX5_ATOMIC_MODE_DCT_CX); } MLX5_SET(dctc, dctc, pkey_index, attr->pkey_index); MLX5_SET(dctc, dctc, port, attr->port_num); diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 8fb072aa8671..a73c701edd16 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -97,14 +97,15 @@ enum { }; enum { - MLX5_ATOMIC_MODE_IB_COMP = 1 << 16, - MLX5_ATOMIC_MODE_CX = 2 << 16, - MLX5_ATOMIC_MODE_8B = 3 << 16, - MLX5_ATOMIC_MODE_16B = 4 << 16, - MLX5_ATOMIC_MODE_32B = 5 << 16, - MLX5_ATOMIC_MODE_64B = 6 << 16, - MLX5_ATOMIC_MODE_128B = 7 << 16, - MLX5_ATOMIC_MODE_256B = 8 << 16, + MLX5_ATOMIC_MODE_OFFSET = 16, + MLX5_ATOMIC_MODE_IB_COMP = 1, + MLX5_ATOMIC_MODE_CX = 2, + MLX5_ATOMIC_MODE_8B = 3, + MLX5_ATOMIC_MODE_16B = 4, + MLX5_ATOMIC_MODE_32B = 5, + MLX5_ATOMIC_MODE_64B = 6, + MLX5_ATOMIC_MODE_128B = 7, + MLX5_ATOMIC_MODE_256B = 8, }; enum { @@ -162,13 +163,11 @@ enum mlx5_dcbx_oper_mode { MLX5E_DCBX_PARAM_VER_OPER_AUTO = 0x3, }; -enum mlx5_dct_atomic_mode { - MLX5_ATOMIC_MODE_DCT_CX = 2, -}; - enum { MLX5_ATOMIC_OPS_CMP_SWAP = 1 << 0, MLX5_ATOMIC_OPS_FETCH_ADD = 1 << 1, + MLX5_ATOMIC_OPS_EXTENDED_CMP_SWAP = 1 << 2, + MLX5_ATOMIC_OPS_EXTENDED_FETCH_ADD = 1 << 3, }; enum mlx5_page_fault_resume_flags {