From 41500c3e2a19ffcf40a7158fce1774de08e26ba2 Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 15 Jun 2017 11:28:53 -0700 Subject: [PATCH 1/3] rds: tcp: remove cp_outgoing After commit 1a0e100fb2c9 ("RDS: TCP: Force every connection to be initiated by numerically smaller IP address") we no longer need the logic associated with cp_outgoing, so clean up usage of this field. Signed-off-by: Sowmini Varadhan Tested-by: Imanti Mendez Signed-off-by: David S. Miller --- net/rds/connection.c | 5 ----- net/rds/rds.h | 2 -- net/rds/tcp_connect.c | 1 - net/rds/tcp_listen.c | 19 ++++--------------- 4 files changed, 4 insertions(+), 23 deletions(-) diff --git a/net/rds/connection.c b/net/rds/connection.c index 6a5ebdea7d2e..382443b060cb 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -124,11 +124,6 @@ static void __rds_conn_path_init(struct rds_connection *conn, cp->cp_conn = conn; atomic_set(&cp->cp_state, RDS_CONN_DOWN); cp->cp_send_gen = 0; - /* cp_outgoing is per-path. So we can only set it here - * for the single-path transports. - */ - if (!conn->c_trans->t_mp_capable) - cp->cp_outgoing = (is_outgoing ? 1 : 0); cp->cp_reconnect_jiffies = 0; INIT_DELAYED_WORK(&cp->cp_send_w, rds_send_worker); INIT_DELAYED_WORK(&cp->cp_recv_w, rds_recv_worker); diff --git a/net/rds/rds.h b/net/rds/rds.h index 82d38ccf5e8b..aa183d6adbcc 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -125,8 +125,6 @@ struct rds_conn_path { unsigned int cp_unacked_packets; unsigned int cp_unacked_bytes; - unsigned int cp_outgoing:1, - cp_pad_to_32:31; unsigned int cp_index; }; diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index d6839d96d539..97db86101ac5 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -135,7 +135,6 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) ret = sock->ops->connect(sock, (struct sockaddr *)&dest, sizeof(dest), O_NONBLOCK); - cp->cp_outgoing = 1; rdsdebug("connect to address %pI4 returned %d\n", &conn->c_faddr, ret); if (ret == -EINPROGRESS) ret = 0; diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 507678853e6c..238ff5c9a75b 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -171,21 +171,10 @@ int rds_tcp_accept_one(struct socket *sock) if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR) goto rst_nsk; if (rs_tcp->t_sock) { - /* Need to resolve a duelling SYN between peers. - * We have an outstanding SYN to this peer, which may - * potentially have transitioned to the RDS_CONN_UP state, - * so we must quiesce any send threads before resetting - * c_transport_data. - */ - if (ntohl(inet->inet_saddr) < ntohl(inet->inet_daddr) || - !cp->cp_outgoing) { - goto rst_nsk; - } else { - rds_tcp_reset_callbacks(new_sock, cp); - cp->cp_outgoing = 0; - /* rds_connect_path_complete() marks RDS_CONN_UP */ - rds_connect_path_complete(cp, RDS_CONN_RESETTING); - } + /* Duelling SYN has been handled in rds_tcp_accept_one() */ + rds_tcp_reset_callbacks(new_sock, cp); + /* rds_connect_path_complete() marks RDS_CONN_UP */ + rds_connect_path_complete(cp, RDS_CONN_RESETTING); } else { rds_tcp_set_callbacks(new_sock, cp); rds_connect_path_complete(cp, RDS_CONN_CONNECTING); From 00354de5779db4aa9c019db787ef89bd1a6b149b Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 15 Jun 2017 11:28:54 -0700 Subject: [PATCH 2/3] rds: tcp: various endian-ness fixes Found when testing between sparc and x86 machines on different subnets, so the address comparison patterns hit the corner cases and brought out some bugs fixed by this patch. Signed-off-by: Sowmini Varadhan Tested-by: Imanti Mendez Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/rds.h | 2 ++ net/rds/recv.c | 12 +++++++----- net/rds/send.c | 11 +++++++---- net/rds/tcp_connect.c | 2 +- net/rds/tcp_listen.c | 2 +- net/rds/threads.c | 5 +++-- 6 files changed, 21 insertions(+), 13 deletions(-) diff --git a/net/rds/rds.h b/net/rds/rds.h index aa183d6adbcc..d6a04a05eb79 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -92,6 +92,8 @@ enum { #define RDS_MPATH_HASH(rs, n) (jhash_1word((rs)->rs_bound_port, \ (rs)->rs_hash_initval) & ((n) - 1)) +#define IS_CANONICAL(laddr, faddr) (htonl(laddr) < htonl(faddr)) + /* Per mpath connection state */ struct rds_conn_path { struct rds_connection *cp_conn; diff --git a/net/rds/recv.c b/net/rds/recv.c index c70c32cb05f5..49493dbc43a1 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -215,10 +215,10 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr, switch (type) { case RDS_EXTHDR_NPATHS: conn->c_npaths = min_t(int, RDS_MPATH_WORKERS, - buffer.rds_npaths); + be16_to_cpu(buffer.rds_npaths)); break; case RDS_EXTHDR_GEN_NUM: - new_peer_gen_num = buffer.rds_gen_num; + new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num); break; default: pr_warn_ratelimited("ignoring unknown exthdr type " @@ -254,7 +254,8 @@ static void rds_start_mprds(struct rds_connection *conn) int i; struct rds_conn_path *cp; - if (conn->c_npaths > 1 && conn->c_laddr < conn->c_faddr) { + if (conn->c_npaths > 1 && + IS_CANONICAL(conn->c_laddr, conn->c_faddr)) { for (i = 1; i < conn->c_npaths; i++) { cp = &conn->c_path[i]; rds_conn_path_connect_if_down(cp); @@ -339,14 +340,15 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, rds_stats_inc(s_recv_ping); rds_send_pong(cp, inc->i_hdr.h_sport); /* if this is a handshake ping, start multipath if necessary */ - if (RDS_HS_PROBE(inc->i_hdr.h_sport, inc->i_hdr.h_dport)) { + if (RDS_HS_PROBE(be16_to_cpu(inc->i_hdr.h_sport), + be16_to_cpu(inc->i_hdr.h_dport))) { rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn); rds_start_mprds(cp->cp_conn); } goto out; } - if (inc->i_hdr.h_dport == RDS_FLAG_PROBE_PORT && + if (be16_to_cpu(inc->i_hdr.h_dport) == RDS_FLAG_PROBE_PORT && inc->i_hdr.h_sport == 0) { rds_recv_hs_exthdrs(&inc->i_hdr, cp->cp_conn); /* if this is a handshake pong, start multipath if necessary */ diff --git a/net/rds/send.c b/net/rds/send.c index 5cc64039caf7..3652a50397c7 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1246,15 +1246,17 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport, rm->m_inc.i_hdr.h_flags |= h_flags; cp->cp_next_tx_seq++; - if (RDS_HS_PROBE(sport, dport) && cp->cp_conn->c_trans->t_mp_capable) { - u16 npaths = RDS_MPATH_WORKERS; + if (RDS_HS_PROBE(be16_to_cpu(sport), be16_to_cpu(dport)) && + cp->cp_conn->c_trans->t_mp_capable) { + u16 npaths = cpu_to_be16(RDS_MPATH_WORKERS); + u32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_NPATHS, &npaths, sizeof(npaths)); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_GEN_NUM, - &cp->cp_conn->c_my_gen_num, + &my_gen_num, sizeof(u32)); } spin_unlock_irqrestore(&cp->cp_lock, flags); @@ -1293,5 +1295,6 @@ rds_send_ping(struct rds_connection *conn) } conn->c_ping_triggered = 1; spin_unlock_irqrestore(&cp->cp_lock, flags); - rds_send_probe(&conn->c_path[0], RDS_FLAG_PROBE_PORT, 0, 0); + rds_send_probe(&conn->c_path[0], cpu_to_be16(RDS_FLAG_PROBE_PORT), + 0, 0); } diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 97db86101ac5..5a62a083bb5a 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -66,7 +66,7 @@ void rds_tcp_state_change(struct sock *sk) * RDS connection as RDS_CONN_UP until the reconnect, * to avoid RDS datagram loss. */ - if (cp->cp_conn->c_laddr > cp->cp_conn->c_faddr && + if (!IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr) && rds_conn_path_transition(cp, RDS_CONN_CONNECTING, RDS_CONN_ERROR)) { rds_conn_path_drop(cp); diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 238ff5c9a75b..f9c6312be841 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -83,7 +83,7 @@ static struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) { int i; - bool peer_is_smaller = (conn->c_faddr < conn->c_laddr); + bool peer_is_smaller = IS_CANONICAL(conn->c_faddr, conn->c_laddr); int npaths = max_t(int, 1, conn->c_npaths); /* for mprds, all paths MUST be initiated by the peer diff --git a/net/rds/threads.c b/net/rds/threads.c index 3e447d056d09..2852bc1d37d4 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -127,7 +127,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp) /* let peer with smaller addr initiate reconnect, to avoid duels */ if (conn->c_trans->t_type == RDS_TRANS_TCP && - conn->c_laddr > conn->c_faddr) + !IS_CANONICAL(conn->c_laddr, conn->c_faddr)) return; set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); @@ -156,7 +156,8 @@ void rds_connect_worker(struct work_struct *work) struct rds_connection *conn = cp->cp_conn; int ret; - if (cp->cp_index > 0 && cp->cp_conn->c_laddr > cp->cp_conn->c_faddr) + if (cp->cp_index > 0 && + !IS_CANONICAL(cp->cp_conn->c_laddr, cp->cp_conn->c_faddr)) return; clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING); From 10beea7d7408d0b1c9208757f445c5c710239e0e Mon Sep 17 00:00:00 2001 From: Sowmini Varadhan Date: Thu, 15 Jun 2017 11:28:55 -0700 Subject: [PATCH 3/3] rds: tcp: Set linger when rejecting an incoming conn in rds_tcp_accept_one Each time we get an incoming SYN to the RDS_TCP_PORT, the TCP layer accepts the connection and then the rds_tcp_accept_one() callback is invoked to process the incoming connection. rds_tcp_accept_one() may reject the incoming syn for a number of reasons, e.g., commit 1a0e100fb2c9 ("RDS: TCP: Force every connection to be initiated by numerically smaller IP address"), or because we are getting spammed by a malicious node that is triggering a flood of connection attempts to RDS_TCP_PORT. If the incoming syn is rejected, no data would have been sent on the TCP socket, and we do not need to be in TIME_WAIT state, so we set linger on the TCP socket before closing, thereby closing the socket efficiently with a RST. Signed-off-by: Sowmini Varadhan Tested-by: Imanti Mendez Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/tcp_listen.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index f9c6312be841..df291ac245d6 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -112,6 +112,17 @@ struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) return NULL; } +static void rds_tcp_set_linger(struct socket *sock) +{ + struct linger no_linger = { + .l_onoff = 1, + .l_linger = 0, + }; + + kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, + (char *)&no_linger, sizeof(no_linger)); +} + int rds_tcp_accept_one(struct socket *sock) { struct socket *new_sock = NULL; @@ -183,7 +194,13 @@ int rds_tcp_accept_one(struct socket *sock) ret = 0; goto out; rst_nsk: - /* reset the newly returned accept sock and bail */ + /* reset the newly returned accept sock and bail. + * It is safe to set linger on new_sock because the RDS connection + * has not been brought up on new_sock, so no RDS-level data could + * be pending on it. By setting linger, we achieve the side-effect + * of avoiding TIME_WAIT state on new_sock. + */ + rds_tcp_set_linger(new_sock); kernel_sock_shutdown(new_sock, SHUT_RDWR); ret = 0; out: