Merge branch 'smc-datapath-opts'
Dust Li says:

====================
net/smc: some datapath performance optimizations

This series tries to improve the performance of SMC in the datapath.

- patch #1: add a sysctl interface to support tuning the behaviour of
  SMC in container environments.
- patch #2/#3: add autocorking support, which is very efficient for
  small messages without trading off latency.
- patch #4: send directly when TCP_NODELAY is set, without waking up
  the TX worker; this makes it consistent with clearing TCP_CORK.
- patch #5: correct the setting of the RMB window update limit, so we
  don't send CDC messages to update the peer's RMB window too
  frequently in some cases.
- patch #6: implement NAPI-like completion processing in SMC to
  decrease the number of hardirqs when busy.
- patch #7: move the TX work done in the BH to the user context when
  the sock lock is held by the user.

With this patchset applied, we get a good performance gain:
- The qperf tcp_bw test shows a great improvement. Other benchmarks
  like 'netperf TCP_STREAM' or 'sockperf throughput' show similar
  results.
- In my testing environment, running qperf tcp_bw and tcp_lat, SMC
  performs better than TCP at almost all message sizes.

Here are some test results with the following testing command:
client: smc_run taskset -c 1 qperf smc-server -oo msg_size:1:64K:*2 \
                -t 30 -vu tcp_{bw|lat}
server: smc_run taskset -c 1 qperf

==== Bandwidth ====
 MsgSize      Origin SMC               TCP                  SMC with patches
       1     0.578 MB/s       2.392 MB/s(313.57%)       2.561 MB/s(342.83%)
       2     1.159 MB/s       4.780 MB/s(312.53%)       5.162 MB/s(345.46%)
       4     2.283 MB/s      10.266 MB/s(349.77%)      10.122 MB/s(343.46%)
       8     4.668 MB/s      19.040 MB/s(307.86%)      20.521 MB/s(339.59%)
      16     9.147 MB/s      38.904 MB/s(325.31%)      40.823 MB/s(346.29%)
      32    18.369 MB/s      79.587 MB/s(333.25%)      80.535 MB/s(338.42%)
      64    36.562 MB/s     148.668 MB/s(306.61%)     158.170 MB/s(332.60%)
     128    72.961 MB/s     274.913 MB/s(276.80%)     316.217 MB/s(333.41%)
     256   144.705 MB/s     512.059 MB/s(253.86%)     626.019 MB/s(332.62%)
     512   288.873 MB/s     884.977 MB/s(206.35%)    1221.596 MB/s(322.88%)
    1024   574.180 MB/s    1337.736 MB/s(132.98%)    2203.156 MB/s(283.70%)
    2048  1095.192 MB/s    1865.952 MB/s( 70.38%)    3036.448 MB/s(177.25%)
    4096  2066.157 MB/s    2380.337 MB/s( 15.21%)    3834.271 MB/s( 85.58%)
    8192  3717.198 MB/s    2733.073 MB/s(-26.47%)    4904.910 MB/s( 31.95%)
   16384  4742.221 MB/s    2958.693 MB/s(-37.61%)    5220.272 MB/s( 10.08%)
   32768  5349.550 MB/s    3061.285 MB/s(-42.77%)    5321.865 MB/s( -0.52%)
   65536  5162.919 MB/s    3731.408 MB/s(-27.73%)    5245.021 MB/s(  1.59%)

==== Latency ====
 MsgSize      Origin SMC               TCP                  SMC with patches
       1      10.540 us       11.938 us( 13.26%)       10.356 us( -1.75%)
       2      10.996 us       11.992 us(  9.06%)       10.073 us( -8.39%)
       4      10.229 us       11.687 us( 14.25%)        9.996 us( -2.28%)
       8      10.203 us       11.653 us( 14.21%)       10.063 us( -1.37%)
      16      10.530 us       11.313 us(  7.44%)       10.013 us( -4.91%)
      32      10.241 us       11.586 us( 13.13%)       10.081 us( -1.56%)
      64      10.693 us       11.652 us(  8.97%)        9.986 us( -6.61%)
     128      10.597 us       11.579 us(  9.27%)       10.262 us( -3.16%)
     256      10.409 us       11.957 us( 14.87%)       10.148 us( -2.51%)
     512      11.088 us       12.505 us( 12.78%)       10.206 us( -7.95%)
    1024      11.240 us       12.255 us(  9.03%)       10.631 us( -5.42%)
    2048      11.485 us       16.970 us( 47.76%)       10.981 us( -4.39%)
    4096      12.077 us       13.948 us( 15.49%)       11.847 us( -1.90%)
    8192      13.683 us       16.693 us( 22.00%)       13.336 us( -2.54%)
   16384      16.470 us       23.615 us( 43.38%)       16.519 us(  0.30%)
   32768      22.540 us       40.966 us( 81.75%)       22.452 us( -0.39%)
   65536      34.192 us       73.003 us(113.51%)       33.916 us( -0.81%)

------------
Test environment notes:
1. Testing is run on 2 VMs within the same physical host.
2. The NIC is a ConnectX-4 Lx, using SR-IOV and passing through 2 VFs to
   the 2 VMs respectively.
3. To decrease jitter, the VMs' vCPUs are pinned to dedicated physical
   CPUs, and those physical CPUs are isolated via the boot parameter
   `isolcpus=xxx`.
4. The queue number is set to 1, and the interrupt from the queue is
   bound to CPU0 in the guest.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
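For illustration only (not part of this series), a minimal userspace sketch of the traffic pattern the cover letter targets: many consecutive small sends on an AF_SMC socket, which autocorking (patches #2/#3) coalesces, followed by TCP_NODELAY, which after patch #4 pushes pending data directly. AF_SMC may be missing from older libc headers, and the destination address is a placeholder supplied by the caller.

/* Hypothetical example, not from the patchset: small sends over AF_SMC. */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef AF_SMC
#define AF_SMC 43                       /* value from linux/socket.h */
#endif

static int blast_small_msgs(const struct sockaddr_in *dst)
{
        char buf[64] = { 0 };
        int one = 1, fd, i;

        fd = socket(AF_SMC, SOCK_STREAM, 0);    /* protocol 0 == SMCPROTO_SMC (IPv4) */
        if (fd < 0)
                return -1;
        if (connect(fd, (const struct sockaddr *)dst, sizeof(*dst)) < 0) {
                close(fd);
                return -1;
        }

        /* Consecutive small writes: with autocorking these are coalesced
         * into fewer CDC messages / RDMA writes.
         */
        for (i = 0; i < 1000; i++)
                if (send(fd, buf, sizeof(buf), 0) < 0)
                        break;

        /* Setting TCP_NODELAY now pushes any corked data directly. */
        setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &one, sizeof(one));
        close(fd);
        return 0;
}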
commit 7282c126f7

Documentation/networking/smc-sysctl.rst (new file, 23 lines)
@@ -0,0 +1,23 @@
.. SPDX-License-Identifier: GPL-2.0

==========
SMC Sysctl
==========

/proc/sys/net/smc/* Variables
=============================

autocorking_size - INTEGER
        Setting SMC auto corking size:
        SMC auto corking is like TCP auto corking from the application's
        perspective. When applications do consecutive small
        write()/sendmsg() system calls, we try to coalesce these small
        writes as much as possible, to lower the total number of CDC
        messages and RDMA Writes sent.
        autocorking_size limits the maximum corked bytes that can be sent
        to the underlying device in one single send. If set to 0, SMC
        auto corking is disabled.
        Applications can still use TCP_CORK for optimal behavior when they
        know how/when to uncork their sockets.

        Default: 64K

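For completeness, one way to read or adjust the new knob from userspace, written against the procfs path documented above. This is an illustrative helper only, not part of the patch; it assumes the file exists, i.e. CONFIG_SMC with this series applied.

/* Illustrative only: get/set /proc/sys/net/smc/autocorking_size. */
#include <stdio.h>

#define SMC_AUTOCORKING_SYSCTL "/proc/sys/net/smc/autocorking_size"

static int read_autocorking_size(unsigned int *val)
{
        FILE *f = fopen(SMC_AUTOCORKING_SYSCTL, "r");
        int ok;

        if (!f)
                return -1;
        ok = (fscanf(f, "%u", val) == 1);
        fclose(f);
        return ok ? 0 : -1;
}

static int write_autocorking_size(unsigned int val)
{
        FILE *f = fopen(SMC_AUTOCORKING_SYSCTL, "w");

        if (!f)
                return -1;
        fprintf(f, "%u\n", val);
        return fclose(f) ? -1 : 0;
}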
include/net/netns/smc.h
@@ -14,5 +14,9 @@ struct netns_smc {
        struct smc_stats_rsn *fback_rsn;

        bool limit_smc_hs; /* constraint on handshake */
#ifdef CONFIG_SYSCTL
        struct ctl_table_header *smc_hdr;
#endif
        unsigned int sysctl_autocorking_size;
};
#endif

net/smc/Makefile
@@ -4,4 +4,4 @@ obj-$(CONFIG_SMC) += smc.o
obj-$(CONFIG_SMC_DIAG) += smc_diag.o
smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o
smc-y += smc_tracepoint.o
smc-y += smc_tracepoint.o smc_sysctl.o

net/smc/af_smc.c
@@ -51,6 +51,7 @@
#include "smc_close.h"
#include "smc_stats.h"
#include "smc_tracepoint.h"
#include "smc_sysctl.h"

static DEFINE_MUTEX(smc_server_lgr_pending);    /* serialize link group
                                                 * creation on server
@@ -192,12 +193,27 @@ void smc_unhash_sk(struct sock *sk)
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);

/* This will be called before user really release sock_lock. So do the
 * work which we didn't do because of user hold the sock_lock in the
 * BH context
 */
static void smc_release_cb(struct sock *sk)
{
        struct smc_sock *smc = smc_sk(sk);

        if (smc->conn.tx_in_release_sock) {
                smc_tx_pending(&smc->conn);
                smc->conn.tx_in_release_sock = false;
        }
}

struct proto smc_proto = {
        .name = "SMC",
        .owner = THIS_MODULE,
        .keepalive = smc_set_keepalive,
        .hash = smc_hash_sk,
        .unhash = smc_unhash_sk,
        .release_cb = smc_release_cb,
        .obj_size = sizeof(struct smc_sock),
        .h.smc_hash = &smc_v4_hashinfo,
        .slab_flags = SLAB_TYPESAFE_BY_RCU,
@@ -210,6 +226,7 @@ struct proto smc_proto6 = {
        .keepalive = smc_set_keepalive,
        .hash = smc_hash_sk,
        .unhash = smc_unhash_sk,
        .release_cb = smc_release_cb,
        .obj_size = sizeof(struct smc_sock),
        .h.smc_hash = &smc_v6_hashinfo,
        .slab_flags = SLAB_TYPESAFE_BY_RCU,
@@ -2795,8 +2812,8 @@ static int smc_setsockopt(struct socket *sock, int level, int optname,
                    sk->sk_state != SMC_CLOSED) {
                        if (val) {
                                SMC_STAT_INC(smc, ndly_cnt);
                                mod_delayed_work(smc->conn.lgr->tx_wq,
                                                 &smc->conn.tx_work, 0);
                                smc_tx_pending(&smc->conn);
                                cancel_delayed_work(&smc->conn.tx_work);
                        }
                }
                break;
@@ -3273,9 +3290,17 @@ static int __init smc_init(void)
                goto out_sock;
        }

        rc = smc_sysctl_init();
        if (rc) {
                pr_err("%s: sysctl_init fails with %d\n", __func__, rc);
                goto out_ulp;
        }

        static_branch_enable(&tcp_have_smc);
        return 0;

out_ulp:
        tcp_unregister_ulp(&smc_ulp_ops);
out_sock:
        sock_unregister(PF_SMC);
out_proto6:
@@ -3303,6 +3328,7 @@ out_pernet_subsys:
static void __exit smc_exit(void)
{
        static_branch_disable(&tcp_have_smc);
        smc_sysctl_exit();
        tcp_unregister_ulp(&smc_ulp_ops);
        sock_unregister(PF_SMC);
        smc_core_exit();

net/smc/smc.h
@@ -29,6 +29,7 @@
#define SMC_MAX_ISM_DEVS 8      /* max # of proposed non-native ISM
                                 * devices
                                 */
#define SMC_AUTOCORKING_DEFAULT_SIZE 0x10000    /* 64K by default */

extern struct proto smc_proto;
extern struct proto smc_proto6;
@@ -192,6 +193,7 @@ struct smc_connection {
                                                 * - dec on polled tx cqe
                                                 */
        wait_queue_head_t cdc_pend_tx_wq;       /* wakeup on no cdc_pend_tx_wr*/
        atomic_t tx_pushing;                    /* nr_threads trying tx push */
        struct delayed_work tx_work;            /* retry of smc_cdc_msg_send */
        u32 tx_off;                             /* base offset in peer rmb */

@@ -211,6 +213,10 @@ struct smc_connection {
                                                 * data still pending
                                                 */
        char urg_rx_byte;                       /* urgent byte */
        bool tx_in_release_sock;
                                                /* flush pending tx data in
                                                 * sock release_cb()
                                                 */
        atomic_t bytes_to_rcv;                  /* arrived data,
                                                 * not yet received
                                                 */

net/smc/smc_cdc.c
@@ -48,9 +48,19 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
                conn->tx_cdc_seq_fin = cdcpend->ctrl_seq;
        }

        if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) &&
            unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq)))
                wake_up(&conn->cdc_pend_tx_wq);
        if (atomic_dec_and_test(&conn->cdc_pend_tx_wr)) {
                /* If user owns the sock_lock, mark the connection need sending.
                 * User context will later try to send when it release sock_lock
                 * in smc_release_cb()
                 */
                if (sock_owned_by_user(&smc->sk))
                        conn->tx_in_release_sock = true;
                else
                        smc_tx_pending(conn);

                if (unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq)))
                        wake_up(&conn->cdc_pend_tx_wq);
        }
        WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0);

        smc_tx_sndbuf_nonfull(smc);
@@ -350,8 +360,12 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
        /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */
        if ((diff_cons && smc_tx_prepared_sends(conn)) ||
            conn->local_rx_ctrl.prod_flags.cons_curs_upd_req ||
            conn->local_rx_ctrl.prod_flags.urg_data_pending)
                smc_tx_sndbuf_nonempty(conn);
            conn->local_rx_ctrl.prod_flags.urg_data_pending) {
                if (!sock_owned_by_user(&smc->sk))
                        smc_tx_pending(conn);
                else
                        conn->tx_in_release_sock = true;
        }

        if (diff_cons && conn->urg_tx_pend &&
            atomic_read(&conn->peer_rmbe_space) == conn->peer_rmbe_size) {

net/smc/smc_core.c
@@ -1988,7 +1988,7 @@ static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
 */
static inline int smc_rmb_wnd_update_limit(int rmbe_size)
{
        return min_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
        return max_t(int, rmbe_size / 10, SOCK_MIN_SNDBUF / 2);
}

/* map an rmb buf to a link */

net/smc/smc_sysctl.c (new file, 80 lines)
@@ -0,0 +1,80 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * smc_sysctl.c: sysctl interface to SMC subsystem.
 *
 * Copyright (c) 2022, Alibaba Inc.
 *
 * Author: Tony Lu <tonylu@linux.alibaba.com>
 *
 */

#include <linux/init.h>
#include <linux/sysctl.h>
#include <net/net_namespace.h>

#include "smc.h"
#include "smc_sysctl.h"

static struct ctl_table smc_table[] = {
        {
                .procname = "autocorking_size",
                .data = &init_net.smc.sysctl_autocorking_size,
                .maxlen = sizeof(unsigned int),
                .mode = 0644,
                .proc_handler = proc_douintvec,
        },
        { }
};

static __net_init int smc_sysctl_init_net(struct net *net)
{
        struct ctl_table *table;

        table = smc_table;
        if (!net_eq(net, &init_net)) {
                int i;

                table = kmemdup(table, sizeof(smc_table), GFP_KERNEL);
                if (!table)
                        goto err_alloc;

                for (i = 0; i < ARRAY_SIZE(smc_table) - 1; i++)
                        table[i].data += (void *)net - (void *)&init_net;
        }

        net->smc.smc_hdr = register_net_sysctl(net, "net/smc", table);
        if (!net->smc.smc_hdr)
                goto err_reg;

        net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE;

        return 0;

err_reg:
        if (!net_eq(net, &init_net))
                kfree(table);
err_alloc:
        return -ENOMEM;
}

static __net_exit void smc_sysctl_exit_net(struct net *net)
{
        unregister_net_sysctl_table(net->smc.smc_hdr);
}

static struct pernet_operations smc_sysctl_ops __net_initdata = {
        .init = smc_sysctl_init_net,
        .exit = smc_sysctl_exit_net,
};

int __init smc_sysctl_init(void)
{
        return register_pernet_subsys(&smc_sysctl_ops);
}

void smc_sysctl_exit(void)
{
        unregister_pernet_subsys(&smc_sysctl_ops);
}

net/smc/smc_sysctl.h (new file, 32 lines)
@@ -0,0 +1,32 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 * smc_sysctl.h: sysctl interface to SMC subsystem.
 *
 * Copyright (c) 2022, Alibaba Inc.
 *
 * Author: Tony Lu <tonylu@linux.alibaba.com>
 *
 */

#ifndef _SMC_SYSCTL_H
#define _SMC_SYSCTL_H

#ifdef CONFIG_SYSCTL

int smc_sysctl_init(void);
void smc_sysctl_exit(void);

#else

static inline int smc_sysctl_init(void)
{
        return 0;
}

static inline void smc_sysctl_exit(void) { }

#endif /* CONFIG_SYSCTL */

#endif /* _SMC_SYSCTL_H */
net/smc/smc_tx.c (107 lines changed)
@@ -131,6 +131,51 @@ static bool smc_tx_is_corked(struct smc_sock *smc)
        return (tp->nonagle & TCP_NAGLE_CORK) ? true : false;
}

/* If we have pending CDC messages, do not send:
 * Because CQE of this CDC message will happen shortly, it gives
 * a chance to coalesce future sendmsg() payload in to one RDMA Write,
 * without need for a timer, and with no latency trade off.
 * Algorithm here:
 *  1. First message should never cork
 *  2. If we have pending Tx CDC messages, wait for the first CDC
 *     message's completion
 *  3. Don't cork to much data in a single RDMA Write to prevent burst
 *     traffic, total corked message should not exceed sendbuf/2
 */
static bool smc_should_autocork(struct smc_sock *smc)
{
        struct smc_connection *conn = &smc->conn;
        int corking_size;

        corking_size = min(sock_net(&smc->sk)->smc.sysctl_autocorking_size,
                           conn->sndbuf_desc->len >> 1);

        if (atomic_read(&conn->cdc_pend_tx_wr) == 0 ||
            smc_tx_prepared_sends(conn) > corking_size)
                return false;
        return true;
}

static bool smc_tx_should_cork(struct smc_sock *smc, struct msghdr *msg)
{
        struct smc_connection *conn = &smc->conn;

        if (smc_should_autocork(smc))
                return true;

        /* for a corked socket defer the RDMA writes if
         * sndbuf_space is still available. The applications
         * should known how/when to uncork it.
         */
        if ((msg->msg_flags & MSG_MORE ||
             smc_tx_is_corked(smc) ||
             msg->msg_flags & MSG_SENDPAGE_NOTLAST) &&
            atomic_read(&conn->sndbuf_space))
                return true;

        return false;
}

/* sndbuf producer: main API called by socket layer.
 * called under sock lock.
 */
@@ -235,13 +280,10 @@ int smc_tx_sendmsg(struct smc_sock *smc, struct msghdr *msg, size_t len)
         */
        if ((msg->msg_flags & MSG_OOB) && !send_remaining)
                conn->urg_tx_pend = true;
        /* for a corked socket defer the RDMA writes if
         * sndbuf_space is still available. The applications
         * should known how/when to uncork it.
        /* If we need to cork, do nothing and wait for the next
         * sendmsg() call or push on tx completion
         */
        if (!((msg->msg_flags & MSG_MORE || smc_tx_is_corked(smc) ||
               msg->msg_flags & MSG_SENDPAGE_NOTLAST) &&
              atomic_read(&conn->sndbuf_space)))
        if (!smc_tx_should_cork(smc, msg))
                smc_tx_sndbuf_nonempty(conn);

        trace_smc_tx_sendmsg(smc, copylen);
@@ -589,13 +631,26 @@ static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn)
        return rc;
}

int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
static int __smc_tx_sndbuf_nonempty(struct smc_connection *conn)
{
        int rc;
        struct smc_sock *smc = container_of(conn, struct smc_sock, conn);
        int rc = 0;

        /* No data in the send queue */
        if (unlikely(smc_tx_prepared_sends(conn) <= 0))
                goto out;

        /* Peer don't have RMBE space */
        if (unlikely(atomic_read(&conn->peer_rmbe_space) <= 0)) {
                SMC_STAT_RMB_TX_PEER_FULL(smc, !conn->lnk);
                goto out;
        }

        if (conn->killed ||
            conn->local_rx_ctrl.conn_state_flags.peer_conn_abort)
                return -EPIPE;  /* connection being aborted */
            conn->local_rx_ctrl.conn_state_flags.peer_conn_abort) {
                rc = -EPIPE;    /* connection being aborted */
                goto out;
        }
        if (conn->lgr->is_smcd)
                rc = smcd_tx_sndbuf_nonempty(conn);
        else
@@ -603,10 +658,38 @@ int smc_tx_sndbuf_nonempty(struct smc_connection *conn)

        if (!rc) {
                /* trigger socket release if connection is closing */
                struct smc_sock *smc = container_of(conn, struct smc_sock,
                                                    conn);
                smc_close_wake_tx_prepared(smc);
        }

out:
        return rc;
}

int smc_tx_sndbuf_nonempty(struct smc_connection *conn)
{
        int rc;

        /* This make sure only one can send simultaneously to prevent wasting
         * of CPU and CDC slot.
         * Record whether someone has tried to push while we are pushing.
         */
        if (atomic_inc_return(&conn->tx_pushing) > 1)
                return 0;

again:
        atomic_set(&conn->tx_pushing, 1);
        smp_wmb(); /* Make sure tx_pushing is 1 before real send */
        rc = __smc_tx_sndbuf_nonempty(conn);

        /* We need to check whether someone else have added some data into
         * the send queue and tried to push but failed after the atomic_set()
         * when we are pushing.
         * If so, we need to push again to prevent those data hang in the send
         * queue.
         */
        if (unlikely(!atomic_dec_and_test(&conn->tx_pushing)))
                goto again;

        return rc;
}

net/smc/smc_wr.c
@@ -137,25 +137,28 @@ static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t)
{
        struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet);
        struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
        int i = 0, rc;
        int polled = 0;
        int i, rc;

again:
        polled++;
        do {
                memset(&wc, 0, sizeof(wc));
                rc = ib_poll_cq(dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc);
                if (polled == 1) {
                        ib_req_notify_cq(dev->roce_cq_send,
                                         IB_CQ_NEXT_COMP |
                                         IB_CQ_REPORT_MISSED_EVENTS);
                }
                if (!rc)
                        break;
                for (i = 0; i < rc; i++)
                        smc_wr_tx_process_cqe(&wc[i]);
                if (rc < SMC_WR_MAX_POLL_CQE)
                        /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been
                         * drained, no need to poll again. --Guangguan Wang
                         */
                        break;
        } while (rc > 0);
        if (polled == 1)

        /* IB_CQ_REPORT_MISSED_EVENTS make sure if ib_req_notify_cq() returns
         * 0, it is safe to wait for the next event.
         * Else we must poll the CQ again to make sure we won't miss any event
         */
        if (ib_req_notify_cq(dev->roce_cq_send,
                             IB_CQ_NEXT_COMP |
                             IB_CQ_REPORT_MISSED_EVENTS))
                goto again;
}

@@ -478,24 +481,28 @@ static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t)
{
        struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet);
        struct ib_wc wc[SMC_WR_MAX_POLL_CQE];
        int polled = 0;
        int rc;

again:
        polled++;
        do {
                memset(&wc, 0, sizeof(wc));
                rc = ib_poll_cq(dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc);
                if (polled == 1) {
                        ib_req_notify_cq(dev->roce_cq_recv,
                                         IB_CQ_SOLICITED_MASK
                                         | IB_CQ_REPORT_MISSED_EVENTS);
                }
                if (!rc)
                if (rc > 0)
                        smc_wr_rx_process_cqes(&wc[0], rc);
                if (rc < SMC_WR_MAX_POLL_CQE)
                        /* If < SMC_WR_MAX_POLL_CQE, the CQ should have been
                         * drained, no need to poll again. --Guangguan Wang
                         */
                        break;
                smc_wr_rx_process_cqes(&wc[0], rc);
        } while (rc > 0);
        if (polled == 1)

        /* IB_CQ_REPORT_MISSED_EVENTS make sure if ib_req_notify_cq() returns
         * 0, it is safe to wait for the next event.
         * Else we must poll the CQ again to make sure we won't miss any event
         */
        if (ib_req_notify_cq(dev->roce_cq_recv,
                             IB_CQ_SOLICITED_MASK |
                             IB_CQ_REPORT_MISSED_EVENTS))
                goto again;
}