// SPDX-License-Identifier: GPL-2.0 /* * Shared Memory Communications over RDMA (SMC-R) and RoCE * * Basic Transport Functions exploiting Infiniband API * * Copyright IBM Corp. 2016 * * Author(s): Ursula Braun */ #include #include #include #include #include #include #include #include "smc.h" #include "smc_clc.h" #include "smc_core.h" #include "smc_ib.h" #include "smc_wr.h" #include "smc_llc.h" #include "smc_cdc.h" #include "smc_close.h" #define SMC_LGR_NUM_INCR 256 #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) #define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10) static u32 smc_lgr_num; /* unique link group number */ /* Register connection's alert token in our lookup structure. * To use rbtrees we have to implement our own insert core. * Requires @conns_lock * @smc connection to register * Returns 0 on success, != otherwise. */ static void smc_lgr_add_alert_token(struct smc_connection *conn) { struct rb_node **link, *parent = NULL; u32 token = conn->alert_token_local; link = &conn->lgr->conns_all.rb_node; while (*link) { struct smc_connection *cur = rb_entry(*link, struct smc_connection, alert_node); parent = *link; if (cur->alert_token_local > token) link = &parent->rb_left; else link = &parent->rb_right; } /* Put the new node there */ rb_link_node(&conn->alert_node, parent, link); rb_insert_color(&conn->alert_node, &conn->lgr->conns_all); } /* Register connection in link group by assigning an alert token * registered in a search tree. * Requires @conns_lock * Note that '0' is a reserved value and not assigned. */ static void smc_lgr_register_conn(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); static atomic_t nexttoken = ATOMIC_INIT(0); /* find a new alert_token_local value not yet used by some connection * in this link group */ sock_hold(&smc->sk); /* sock_put in smc_lgr_unregister_conn() */ while (!conn->alert_token_local) { conn->alert_token_local = atomic_inc_return(&nexttoken); if (smc_lgr_find_conn(conn->alert_token_local, conn->lgr)) conn->alert_token_local = 0; } smc_lgr_add_alert_token(conn); conn->lgr->conns_num++; } /* Unregister connection and reset the alert token of the given connection< */ static void __smc_lgr_unregister_conn(struct smc_connection *conn) { struct smc_sock *smc = container_of(conn, struct smc_sock, conn); struct smc_link_group *lgr = conn->lgr; rb_erase(&conn->alert_node, &lgr->conns_all); lgr->conns_num--; conn->alert_token_local = 0; conn->lgr = NULL; sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */ } /* Unregister connection and trigger lgr freeing if applicable */ static void smc_lgr_unregister_conn(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; int reduced = 0; write_lock_bh(&lgr->conns_lock); if (conn->alert_token_local) { reduced = 1; __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); if (!reduced || lgr->conns_num) return; /* client link group creation always follows the server link group * creation. For client use a somewhat higher removal delay time, * otherwise there is a risk of out-of-sync link groups. */ mod_delayed_work(system_wq, &lgr->free_work, lgr->role == SMC_CLNT ? SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV); } static void smc_lgr_free_work(struct work_struct *work) { struct smc_link_group *lgr = container_of(to_delayed_work(work), struct smc_link_group, free_work); bool conns; spin_lock_bh(&smc_lgr_list.lock); if (list_empty(&lgr->list)) goto free; read_lock_bh(&lgr->conns_lock); conns = RB_EMPTY_ROOT(&lgr->conns_all); read_unlock_bh(&lgr->conns_lock); if (!conns) { /* number of lgr connections is no longer zero */ spin_unlock_bh(&smc_lgr_list.lock); return; } list_del_init(&lgr->list); /* remove from smc_lgr_list */ free: spin_unlock_bh(&smc_lgr_list.lock); smc_lgr_free(lgr); } /* create a new SMC link group */ static int smc_lgr_create(struct smc_sock *smc, struct smc_ib_device *smcibdev, u8 ibport, char *peer_systemid, unsigned short vlan_id) { struct smc_link_group *lgr; struct smc_link *lnk; u8 rndvec[3]; int rc = 0; int i; lgr = kzalloc(sizeof(*lgr), GFP_KERNEL); if (!lgr) { rc = -ENOMEM; goto out; } lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT; lgr->sync_err = false; memcpy(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN); lgr->vlan_id = vlan_id; rwlock_init(&lgr->sndbufs_lock); rwlock_init(&lgr->rmbs_lock); for (i = 0; i < SMC_RMBE_SIZES; i++) { INIT_LIST_HEAD(&lgr->sndbufs[i]); INIT_LIST_HEAD(&lgr->rmbs[i]); } smc_lgr_num += SMC_LGR_NUM_INCR; memcpy(&lgr->id, (u8 *)&smc_lgr_num, SMC_LGR_ID_SIZE); INIT_DELAYED_WORK(&lgr->free_work, smc_lgr_free_work); lgr->conns_all = RB_ROOT; lnk = &lgr->lnk[SMC_SINGLE_LINK]; /* initialize link */ lnk->smcibdev = smcibdev; lnk->ibport = ibport; lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu; if (!smcibdev->initialized) smc_ib_setup_per_ibdev(smcibdev); get_random_bytes(rndvec, sizeof(rndvec)); lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16); rc = smc_wr_alloc_link_mem(lnk); if (rc) goto free_lgr; rc = smc_ib_create_protection_domain(lnk); if (rc) goto free_link_mem; rc = smc_ib_create_queue_pair(lnk); if (rc) goto dealloc_pd; rc = smc_wr_create_link(lnk); if (rc) goto destroy_qp; init_completion(&lnk->llc_confirm); init_completion(&lnk->llc_confirm_resp); smc->conn.lgr = lgr; rwlock_init(&lgr->conns_lock); spin_lock_bh(&smc_lgr_list.lock); list_add(&lgr->list, &smc_lgr_list.list); spin_unlock_bh(&smc_lgr_list.lock); return 0; destroy_qp: smc_ib_destroy_queue_pair(lnk); dealloc_pd: smc_ib_dealloc_protection_domain(lnk); free_link_mem: smc_wr_free_link_mem(lnk); free_lgr: kfree(lgr); out: return rc; } static void smc_buf_unuse(struct smc_connection *conn) { if (conn->sndbuf_desc) { conn->sndbuf_desc->used = 0; conn->sndbuf_size = 0; } if (conn->rmb_desc) { conn->rmb_desc->reused = true; conn->rmb_desc->used = 0; conn->rmbe_size = 0; } } /* remove a finished connection from its link group */ void smc_conn_free(struct smc_connection *conn) { if (!conn->lgr) return; smc_cdc_tx_dismiss_slots(conn); smc_lgr_unregister_conn(conn); smc_buf_unuse(conn); } static void smc_link_clear(struct smc_link *lnk) { lnk->peer_qpn = 0; smc_ib_modify_qp_reset(lnk); smc_wr_free_link(lnk); smc_ib_destroy_queue_pair(lnk); smc_ib_dealloc_protection_domain(lnk); smc_wr_free_link_mem(lnk); } static void smc_buf_free(struct smc_buf_desc *buf_desc, struct smc_link *lnk, bool is_rmb) { if (is_rmb) { if (buf_desc->mr_rx[SMC_SINGLE_LINK]) smc_ib_put_memory_region( buf_desc->mr_rx[SMC_SINGLE_LINK]); smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc, DMA_FROM_DEVICE); } else { smc_ib_buf_unmap_sg(lnk->smcibdev, buf_desc, DMA_TO_DEVICE); } sg_free_table(&buf_desc->sgt[SMC_SINGLE_LINK]); if (buf_desc->cpu_addr) free_pages((unsigned long)buf_desc->cpu_addr, buf_desc->order); kfree(buf_desc); } static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb) { struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; struct smc_buf_desc *buf_desc, *bf_desc; struct list_head *buf_list; int i; for (i = 0; i < SMC_RMBE_SIZES; i++) { if (is_rmb) buf_list = &lgr->rmbs[i]; else buf_list = &lgr->sndbufs[i]; list_for_each_entry_safe(buf_desc, bf_desc, buf_list, list) { list_del(&buf_desc->list); smc_buf_free(buf_desc, lnk, is_rmb); } } } static void smc_lgr_free_bufs(struct smc_link_group *lgr) { /* free send buffers */ __smc_lgr_free_bufs(lgr, false); /* free rmbs */ __smc_lgr_free_bufs(lgr, true); } /* remove a link group */ void smc_lgr_free(struct smc_link_group *lgr) { smc_lgr_free_bufs(lgr); smc_link_clear(&lgr->lnk[SMC_SINGLE_LINK]); kfree(lgr); } /* terminate linkgroup abnormally */ void smc_lgr_terminate(struct smc_link_group *lgr) { struct smc_connection *conn; struct smc_sock *smc; struct rb_node *node; spin_lock_bh(&smc_lgr_list.lock); if (list_empty(&lgr->list)) { /* termination already triggered */ spin_unlock_bh(&smc_lgr_list.lock); return; } /* do not use this link group for new connections */ list_del_init(&lgr->list); spin_unlock_bh(&smc_lgr_list.lock); write_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); while (node) { conn = rb_entry(node, struct smc_connection, alert_node); smc = container_of(conn, struct smc_sock, conn); sock_hold(&smc->sk); /* sock_put in close work */ conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; __smc_lgr_unregister_conn(conn); write_unlock_bh(&lgr->conns_lock); if (!schedule_work(&conn->close_work)) sock_put(&smc->sk); write_lock_bh(&lgr->conns_lock); node = rb_first(&lgr->conns_all); } write_unlock_bh(&lgr->conns_lock); wake_up(&lgr->lnk[SMC_SINGLE_LINK].wr_reg_wait); } /* Determine vlan of internal TCP socket. * @vlan_id: address to store the determined vlan id into */ static int smc_vlan_by_tcpsk(struct socket *clcsock, unsigned short *vlan_id) { struct dst_entry *dst = sk_dst_get(clcsock->sk); int rc = 0; *vlan_id = 0; if (!dst) { rc = -ENOTCONN; goto out; } if (!dst->dev) { rc = -ENODEV; goto out_rel; } if (is_vlan_dev(dst->dev)) *vlan_id = vlan_dev_vlan_id(dst->dev); out_rel: dst_release(dst); out: return rc; } /* determine the link gid matching the vlan id of the link group */ static int smc_link_determine_gid(struct smc_link_group *lgr) { struct smc_link *lnk = &lgr->lnk[SMC_SINGLE_LINK]; struct ib_gid_attr gattr; union ib_gid gid; int i; if (!lgr->vlan_id) { lnk->gid = lnk->smcibdev->gid[lnk->ibport - 1]; return 0; } for (i = 0; i < lnk->smcibdev->pattr[lnk->ibport - 1].gid_tbl_len; i++) { if (ib_query_gid(lnk->smcibdev->ibdev, lnk->ibport, i, &gid, &gattr)) continue; if (gattr.ndev) { if (is_vlan_dev(gattr.ndev) && vlan_dev_vlan_id(gattr.ndev) == lgr->vlan_id) { lnk->gid = gid; dev_put(gattr.ndev); return 0; } dev_put(gattr.ndev); } } return -ENODEV; } /* create a new SMC connection (and a new link group if necessary) */ int smc_conn_create(struct smc_sock *smc, struct smc_ib_device *smcibdev, u8 ibport, struct smc_clc_msg_local *lcl, int srv_first_contact) { struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr; unsigned short vlan_id; enum smc_lgr_role role; int local_contact = SMC_FIRST_CONTACT; int rc = 0; role = smc->listen_smc ? SMC_SERV : SMC_CLNT; rc = smc_vlan_by_tcpsk(smc->clcsock, &vlan_id); if (rc) return rc; if ((role == SMC_CLNT) && srv_first_contact) /* create new link group as well */ goto create; /* determine if an existing link group can be reused */ spin_lock_bh(&smc_lgr_list.lock); list_for_each_entry(lgr, &smc_lgr_list.list, list) { write_lock_bh(&lgr->conns_lock); if (!memcmp(lgr->peer_systemid, lcl->id_for_peer, SMC_SYSTEMID_LEN) && !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_gid, &lcl->gid, SMC_GID_SIZE) && !memcmp(lgr->lnk[SMC_SINGLE_LINK].peer_mac, lcl->mac, sizeof(lcl->mac)) && !lgr->sync_err && (lgr->role == role) && (lgr->vlan_id == vlan_id) && ((role == SMC_CLNT) || (lgr->conns_num < SMC_RMBS_PER_LGR_MAX))) { /* link group found */ local_contact = SMC_REUSE_CONTACT; conn->lgr = lgr; smc_lgr_register_conn(conn); /* add smc conn to lgr */ write_unlock_bh(&lgr->conns_lock); break; } write_unlock_bh(&lgr->conns_lock); } spin_unlock_bh(&smc_lgr_list.lock); if (role == SMC_CLNT && !srv_first_contact && (local_contact == SMC_FIRST_CONTACT)) { /* Server reuses a link group, but Client wants to start * a new one * send out_of_sync decline, reason synchr. error */ return -ENOLINK; } create: if (local_contact == SMC_FIRST_CONTACT) { rc = smc_lgr_create(smc, smcibdev, ibport, lcl->id_for_peer, vlan_id); if (rc) goto out; smc_lgr_register_conn(conn); /* add smc conn to lgr */ rc = smc_link_determine_gid(conn->lgr); } conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE; conn->local_tx_ctrl.len = sizeof(struct smc_cdc_msg); #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&conn->acurs_lock); #endif out: return rc ? rc : local_contact; } /* try to reuse a sndbuf or rmb description slot for a certain * buffer size; if not available, return NULL */ static inline struct smc_buf_desc *smc_buf_get_slot(struct smc_link_group *lgr, int compressed_bufsize, rwlock_t *lock, struct list_head *buf_list) { struct smc_buf_desc *buf_slot; read_lock_bh(lock); list_for_each_entry(buf_slot, buf_list, list) { if (cmpxchg(&buf_slot->used, 0, 1) == 0) { read_unlock_bh(lock); return buf_slot; } } read_unlock_bh(lock); return NULL; } /* one of the conditions for announcing a receiver's current window size is * that it "results in a minimum increase in the window size of 10% of the * receive buffer space" [RFC7609] */ static inline int smc_rmb_wnd_update_limit(int rmbe_size) { return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2); } static struct smc_buf_desc *smc_new_buf_create(struct smc_link_group *lgr, bool is_rmb, int bufsize) { struct smc_buf_desc *buf_desc; struct smc_link *lnk; int rc; /* try to alloc a new buffer */ buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL); if (!buf_desc) return ERR_PTR(-ENOMEM); buf_desc->cpu_addr = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_ZERO, get_order(bufsize)); if (!buf_desc->cpu_addr) { kfree(buf_desc); return ERR_PTR(-EAGAIN); } buf_desc->order = get_order(bufsize); /* build the sg table from the pages */ lnk = &lgr->lnk[SMC_SINGLE_LINK]; rc = sg_alloc_table(&buf_desc->sgt[SMC_SINGLE_LINK], 1, GFP_KERNEL); if (rc) { smc_buf_free(buf_desc, lnk, is_rmb); return ERR_PTR(rc); } sg_set_buf(buf_desc->sgt[SMC_SINGLE_LINK].sgl, buf_desc->cpu_addr, bufsize); /* map sg table to DMA address */ rc = smc_ib_buf_map_sg(lnk->smcibdev, buf_desc, is_rmb ? DMA_FROM_DEVICE : DMA_TO_DEVICE); /* SMC protocol depends on mapping to one DMA address only */ if (rc != 1) { smc_buf_free(buf_desc, lnk, is_rmb); return ERR_PTR(-EAGAIN); } /* create a new memory region for the RMB */ if (is_rmb) { rc = smc_ib_get_memory_region(lnk->roce_pd, IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE, buf_desc); if (rc) { smc_buf_free(buf_desc, lnk, is_rmb); return ERR_PTR(rc); } } return buf_desc; } static int __smc_buf_create(struct smc_sock *smc, bool is_rmb) { struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM); struct list_head *buf_list; int bufsize, bufsize_short; int sk_buf_size; rwlock_t *lock; if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ sk_buf_size = smc->sk.sk_rcvbuf / 2; else /* use socket send buffer size (w/o overhead) as start value */ sk_buf_size = smc->sk.sk_sndbuf / 2; for (bufsize_short = smc_compress_bufsize(sk_buf_size); bufsize_short >= 0; bufsize_short--) { if (is_rmb) { lock = &lgr->rmbs_lock; buf_list = &lgr->rmbs[bufsize_short]; } else { lock = &lgr->sndbufs_lock; buf_list = &lgr->sndbufs[bufsize_short]; } bufsize = smc_uncompress_bufsize(bufsize_short); if ((1 << get_order(bufsize)) > SG_MAX_SINGLE_ALLOC) continue; /* check for reusable slot in the link group */ buf_desc = smc_buf_get_slot(lgr, bufsize_short, lock, buf_list); if (buf_desc) { memset(buf_desc->cpu_addr, 0, bufsize); break; /* found reusable slot */ } buf_desc = smc_new_buf_create(lgr, is_rmb, bufsize); if (PTR_ERR(buf_desc) == -ENOMEM) break; if (IS_ERR(buf_desc)) continue; buf_desc->used = 1; write_lock_bh(lock); list_add(&buf_desc->list, buf_list); write_unlock_bh(lock); break; /* found */ } if (IS_ERR(buf_desc)) return -ENOMEM; if (is_rmb) { conn->rmb_desc = buf_desc; conn->rmbe_size = bufsize; conn->rmbe_size_short = bufsize_short; smc->sk.sk_rcvbuf = bufsize * 2; atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(bufsize); } else { conn->sndbuf_desc = buf_desc; conn->sndbuf_size = bufsize; smc->sk.sk_sndbuf = bufsize * 2; atomic_set(&conn->sndbuf_space, bufsize); } return 0; } void smc_sndbuf_sync_sg_for_cpu(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_sndbuf_sync_sg_for_device(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->sndbuf_desc, DMA_TO_DEVICE); } void smc_rmb_sync_sg_for_cpu(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; smc_ib_sync_sg_for_cpu(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->rmb_desc, DMA_FROM_DEVICE); } void smc_rmb_sync_sg_for_device(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; smc_ib_sync_sg_for_device(lgr->lnk[SMC_SINGLE_LINK].smcibdev, conn->rmb_desc, DMA_FROM_DEVICE); } /* create the send and receive buffer for an SMC socket; * receive buffers are called RMBs; * (even though the SMC protocol allows more than one RMB-element per RMB, * the Linux implementation uses just one RMB-element per RMB, i.e. uses an * extra RMB for every connection in a link group */ int smc_buf_create(struct smc_sock *smc) { int rc; /* create send buffer */ rc = __smc_buf_create(smc, false); if (rc) return rc; /* create rmb */ rc = __smc_buf_create(smc, true); if (rc) smc_buf_free(smc->conn.sndbuf_desc, &smc->conn.lgr->lnk[SMC_SINGLE_LINK], false); return rc; } static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr) { int i; for_each_clear_bit(i, lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) { if (!test_and_set_bit(i, lgr->rtokens_used_mask)) return i; } return -ENOSPC; } /* add a new rtoken from peer */ int smc_rtoken_add(struct smc_link_group *lgr, __be64 nw_vaddr, __be32 nw_rkey) { u64 dma_addr = be64_to_cpu(nw_vaddr); u32 rkey = ntohl(nw_rkey); int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if ((lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey) && (lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr == dma_addr) && test_bit(i, lgr->rtokens_used_mask)) { /* already in list */ return i; } } i = smc_rmb_reserve_rtoken_idx(lgr); if (i < 0) return i; lgr->rtokens[i][SMC_SINGLE_LINK].rkey = rkey; lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = dma_addr; return i; } /* delete an rtoken */ int smc_rtoken_delete(struct smc_link_group *lgr, __be32 nw_rkey) { u32 rkey = ntohl(nw_rkey); int i; for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if (lgr->rtokens[i][SMC_SINGLE_LINK].rkey == rkey && test_bit(i, lgr->rtokens_used_mask)) { lgr->rtokens[i][SMC_SINGLE_LINK].rkey = 0; lgr->rtokens[i][SMC_SINGLE_LINK].dma_addr = 0; clear_bit(i, lgr->rtokens_used_mask); return 0; } } return -ENOENT; } /* save rkey and dma_addr received from peer during clc handshake */ int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_clc_msg_accept_confirm *clc) { conn->rtoken_idx = smc_rtoken_add(conn->lgr, clc->rmb_dma_addr, clc->rmb_rkey); if (conn->rtoken_idx < 0) return conn->rtoken_idx; return 0; }