Merge branch 'net-smc-send-and-write-inline-optimization-for-smc'
Guangguan Wang says:

====================
net/smc: send and write inline optimization for smc

Send CDC msgs and write data inline if the qp has sufficient inline
space, which helps reduce latency.

In my test environment, which is 2 VMs running on the same physical
host, with NICs (ConnectX-4 Lx) working in SR-IOV mode, qperf shows a
0.4us-1.3us improvement in latency.

Test command:
server: smc_run taskset -c 1 qperf
client: smc_run taskset -c 1 qperf <server ip> -oo \
		msg_size:1:2K:*2 -t 30 -vu tcp_lat

The results are shown below:
msgsize      before        after
   1B       11.9 us      10.6 us (-1.3 us)
   2B       11.7 us      10.7 us (-1.0 us)
   4B       11.7 us      10.7 us (-1.0 us)
   8B       11.6 us      10.6 us (-1.0 us)
  16B       11.7 us      10.7 us (-1.0 us)
  32B       11.7 us      10.6 us (-1.1 us)
  64B       11.7 us      11.2 us (-0.5 us)
 128B       11.6 us      11.2 us (-0.4 us)
 256B       11.8 us      11.2 us (-0.6 us)
 512B       11.8 us      11.3 us (-0.5 us)
  1KB       11.9 us      11.5 us (-0.4 us)
  2KB       12.1 us      11.5 us (-0.6 us)
====================

Link: https://lore.kernel.org/r/20220516055137.51873-1-guangguan.wang@linux.alibaba.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
commit 68a0bd6790
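As background for the diffs below, here is a minimal sketch (not part of this merge; helper name and parameters are illustrative) of what an inline send looks like with kernel verbs. With IB_SEND_INLINE, the provider copies the payload into the work queue entry at post time, so the SGE can carry a plain kernel virtual address and no DMA mapping or lkey is required, saving a DMA read on the send path:

#include <rdma/ib_verbs.h>

/* Illustrative helper, not from the patch set: post one inline send.
 * Only valid when len fits within the qp's reported max_inline_data.
 */
static int post_send_inline(struct ib_qp *qp, void *buf, u32 len)
{
	struct ib_sge sge = {
		.addr	= (uintptr_t)buf,	/* virtual address; copied at post time */
		.length	= len,
		.lkey	= 0,			/* lkey is not used for inline sends */
	};
	struct ib_send_wr wr = {
		.sg_list	= &sge,
		.num_sge	= 1,
		.opcode		= IB_WR_SEND,
		.send_flags	= IB_SEND_SIGNALED | IB_SEND_INLINE,
	};

	return ib_post_send(qp, &wr, NULL);
}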
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -671,6 +671,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
 			.max_recv_wr = SMC_WR_BUF_CNT * 3,
 			.max_send_sge = SMC_IB_MAX_SEND_SGE,
 			.max_recv_sge = sges_per_buf,
+			.max_inline_data = 0,
 		},
 		.sq_sig_type = IB_SIGNAL_REQ_WR,
 		.qp_type = IB_QPT_RC,
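Note that the QP is created with max_inline_data = 0, i.e. no inline space is demanded; the device may still grant some, and SMC picks the granted value up when it snapshots the QP attributes into lnk->qp_attr (smc_wr_remember_qp_attr() uses ib_query_qp()). A hedged sketch of such a query, with an illustrative helper name:

#include <rdma/ib_verbs.h>

/* Illustrative: return the inline capability the device actually granted. */
static u32 qp_inline_cap(struct ib_qp *qp)
{
	struct ib_qp_attr attr = {};
	struct ib_qp_init_attr init_attr = {};

	if (ib_query_qp(qp, &attr, IB_QP_CAP, &init_attr))
		return 0;	/* be conservative: treat errors as no inline support */

	return attr.cap.max_inline_data;
}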
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -391,12 +391,20 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len,
 	int rc;
 
 	for (dstchunk = 0; dstchunk < 2; dstchunk++) {
-		struct ib_sge *sge =
-			wr_rdma_buf->wr_tx_rdma[dstchunk].wr.sg_list;
+		struct ib_rdma_wr *wr = &wr_rdma_buf->wr_tx_rdma[dstchunk];
+		struct ib_sge *sge = wr->wr.sg_list;
+		u64 base_addr = dma_addr;
+
+		if (dst_len < link->qp_attr.cap.max_inline_data) {
+			base_addr = (uintptr_t)conn->sndbuf_desc->cpu_addr;
+			wr->wr.send_flags |= IB_SEND_INLINE;
+		} else {
+			wr->wr.send_flags &= ~IB_SEND_INLINE;
+		}
 
 		num_sges = 0;
 		for (srcchunk = 0; srcchunk < 2; srcchunk++) {
-			sge[srcchunk].addr = dma_addr + src_off;
+			sge[srcchunk].addr = base_addr + src_off;
 			sge[srcchunk].length = src_len;
 			num_sges++;
 
@@ -410,8 +418,7 @@ static int smcr_tx_rdma_writes(struct smc_connection *conn, size_t len,
 			src_len = dst_len - src_len; /* remainder */
 			src_len_sum += src_len;
 		}
-		rc = smc_tx_rdma_write(conn, dst_off, num_sges,
-				       &wr_rdma_buf->wr_tx_rdma[dstchunk]);
+		rc = smc_tx_rdma_write(conn, dst_off, num_sges, wr);
 		if (rc)
 			return rc;
 		if (dst_len_sum == len)
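The change above boils down to picking an addressing mode per RDMA write: payloads smaller than the granted max_inline_data are posted from the send buffer's CPU virtual address with IB_SEND_INLINE set, while larger payloads keep the DMA address and the flag cleared. As a standalone sketch (illustrative names, not the patch's code):

#include <rdma/ib_verbs.h>

/* Illustrative: select inline vs. DMA addressing for one RDMA write WR. */
static void tx_choose_inline(struct ib_rdma_wr *wr, struct ib_sge *sge,
			     void *cpu_addr, dma_addr_t dma_addr,
			     u32 len, u32 max_inline)
{
	if (len < max_inline) {
		sge->addr = (uintptr_t)cpu_addr;	/* copied into the WQE at post time */
		wr->wr.send_flags |= IB_SEND_INLINE;
	} else {
		sge->addr = dma_addr;			/* fetched by the HCA via DMA */
		wr->wr.send_flags &= ~IB_SEND_INLINE;
	}
	sge->length = len;
}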
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -554,10 +554,11 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk)
 static void smc_wr_init_sge(struct smc_link *lnk)
 {
 	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
+	bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE);
 	u32 i;
 
 	for (i = 0; i < lnk->wr_tx_cnt; i++) {
-		lnk->wr_tx_sges[i].addr =
+		lnk->wr_tx_sges[i].addr = send_inline ? (uintptr_t)(&lnk->wr_tx_bufs[i]) :
 			lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
 		lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE;
 		lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
@@ -575,6 +576,8 @@ static void smc_wr_init_sge(struct smc_link *lnk)
 		lnk->wr_tx_ibs[i].opcode = IB_WR_SEND;
 		lnk->wr_tx_ibs[i].send_flags =
 			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+		if (send_inline)
+			lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE;
 		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE;
 		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE;
 		lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list =
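Combined, the two hunks above amount to the following simplified setup (illustrative function and parameter names, not the patch's code): inline is enabled for the link's pre-built send WRs only when the QP can take a whole CDC message (SMC_WR_TX_SIZE bytes) inline, in which case each SGE points at the kernel send buffer rather than its DMA address:

#include <rdma/ib_verbs.h>
#include "smc_wr.h"	/* SMC_WR_BUF_SIZE, SMC_WR_TX_SIZE */

/* Illustrative condensation of the smc_wr.c hunks above. */
static void init_tx_sges(struct ib_send_wr *wrs, struct ib_sge *sges,
			 void *bufs, dma_addr_t dma_base, u32 lkey,
			 unsigned int cnt, bool send_inline)
{
	unsigned int i;

	for (i = 0; i < cnt; i++) {
		sges[i].addr = send_inline ?
			(uintptr_t)bufs + i * SMC_WR_BUF_SIZE :
			dma_base + i * SMC_WR_BUF_SIZE;
		sges[i].length = SMC_WR_TX_SIZE;
		sges[i].lkey = lkey;	/* ignored when sending inline */
		wrs[i].sg_list = &sges[i];
		wrs[i].num_sge = 1;
		wrs[i].opcode = IB_WR_SEND;
		wrs[i].send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
		if (send_inline)
			wrs[i].send_flags |= IB_SEND_INLINE;
	}
}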