From 88f8598d0a302a08380eadefd09b9f5cb1c4c428 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 16 Jan 2019 15:05:28 -0800 Subject: [PATCH 1/8] tcp: exit if nothing to retransmit on RTO timeout Previously TCP only warns if its RTO timer fires and the retransmission queue is empty, but it'll cause null pointer reference later on. It's better to avoid such catastrophic failure and simply exit with a warning. Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 71a29e9c0620..e7d09e3705b8 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -443,10 +443,8 @@ void tcp_retransmit_timer(struct sock *sk) */ return; } - if (!tp->packets_out) - goto out; - - WARN_ON(tcp_rtx_queue_empty(sk)); + if (!tp->packets_out || WARN_ON_ONCE(tcp_rtx_queue_empty(sk))) + return; tp->tlp_high_seq = 0; From 7f12422c4873e9b274bc151ea59cb0cdf9415cf1 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 16 Jan 2019 15:05:29 -0800 Subject: [PATCH 2/8] tcp: always timestamp on every skb transmission Previously TCP skbs are not always timestamped if the transmission failed due to memory or other local issues. This makes deciding when to abort a socket tricky and complicated because the first unacknowledged skb's timestamp may be 0 on TCP timeout. The straight-forward fix is to always timestamp skb on every transmission attempt. Also every skb retransmission needs to be flagged properly to avoid RTT under-estimation. This can happen upon receiving an ACK for the original packet and the a previous (spurious) retransmission has failed. It's worth noting that this reverts to the old time-stamping style before commit 8c72c65b426b ("tcp: update skb->skb_mstamp more carefully") which addresses a problem in computing the elapsed time of a stalled window-probing socket. The problem will be addressed differently in the next patches with a simpler approach. Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 730bc44dbad9..57a56e205070 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -980,7 +980,6 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - skb->skb_mstamp_ns = tp->tcp_wstamp_ns; if (sk->sk_pacing_status != SK_PACING_NONE) { unsigned long rate = sk->sk_pacing_rate; @@ -1028,7 +1027,9 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, BUG_ON(!skb || !tcp_skb_pcount(skb)); tp = tcp_sk(sk); - + prior_wstamp = tp->tcp_wstamp_ns; + tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); + skb->skb_mstamp_ns = tp->tcp_wstamp_ns; if (clone_it) { TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq - tp->snd_una; @@ -1045,11 +1046,6 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, return -ENOBUFS; } - prior_wstamp = tp->tcp_wstamp_ns; - tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); - - skb->skb_mstamp_ns = tp->tcp_wstamp_ns; - inet = inet_sk(sk); tcb = TCP_SKB_CB(skb); memset(&opts, 0, sizeof(opts)); @@ -2937,12 +2933,16 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); } + /* To avoid taking spuriously low RTT samples based on a timestamp + * for a transmit that never happened, always mark EVER_RETRANS + */ + TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; + if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RETRANS_CB_FLAG)) tcp_call_bpf_3arg(sk, BPF_SOCK_OPS_RETRANS_CB, TCP_SKB_CB(skb)->seq, segs, err); if (likely(!err)) { - TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS; trace_tcp_retransmit_skb(sk, skb); } else if (err != -EBUSY) { NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL, segs); From 7ae189759cc48cf8b54beebff566e9fd2d4e7d7c Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 16 Jan 2019 15:05:30 -0800 Subject: [PATCH 3/8] tcp: always set retrans_stamp on recovery Previously TCP socket's retrans_stamp is not set if the retransmission has failed to send. As a result if a socket is experiencing local issues to retransmit packets, determining when to abort a socket is complicated w/o knowning the starting time of the recovery since retrans_stamp may remain zero. This complication causes sub-optimal behavior that TCP may use the latest, instead of the first, retransmission time to compute the elapsed time of a stalling connection due to local issues. Then TCP may disrecard TCP retries settings and keep retrying until it finally succeed: not a good idea when the local host is already strained. The simple fix is to always timestamp the start of a recovery. It's worth noting that retrans_stamp is also used to compare echo timestamp values to detect spurious recovery. This patch does not break that because retrans_stamp is still later than when the original packet was sent. Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 9 ++++----- net/ipv4/tcp_timer.c | 23 +++-------------------- 2 files changed, 7 insertions(+), 25 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 57a56e205070..d2d494c74811 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2963,13 +2963,12 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs) #endif TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; tp->retrans_out += tcp_skb_pcount(skb); - - /* Save stamp of the first retransmit. */ - if (!tp->retrans_stamp) - tp->retrans_stamp = tcp_skb_timestamp(skb); - } + /* Save stamp of the first (attempted) retransmit. */ + if (!tp->retrans_stamp) + tp->retrans_stamp = tcp_skb_timestamp(skb); + if (tp->undo_retrans < 0) tp->undo_retrans = 0; tp->undo_retrans += tcp_skb_pcount(skb); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index e7d09e3705b8..1e61f0bd6e24 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,28 +22,14 @@ #include #include -static u32 tcp_retransmit_stamp(const struct sock *sk) -{ - u32 start_ts = tcp_sk(sk)->retrans_stamp; - - if (unlikely(!start_ts)) { - struct sk_buff *head = tcp_rtx_queue_head(sk); - - if (!head) - return 0; - start_ts = tcp_skb_timestamp(head); - } - return start_ts; -} - static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); u32 elapsed, start_ts; s32 remaining; - start_ts = tcp_retransmit_stamp(sk); - if (!icsk->icsk_user_timeout || !start_ts) + start_ts = tcp_sk(sk)->retrans_stamp; + if (!icsk->icsk_user_timeout) return icsk->icsk_rto; elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts; remaining = icsk->icsk_user_timeout - elapsed; @@ -197,10 +183,7 @@ static bool retransmits_timed_out(struct sock *sk, if (!inet_csk(sk)->icsk_retransmits) return false; - start_ts = tcp_retransmit_stamp(sk); - if (!start_ts) - return false; - + start_ts = tcp_sk(sk)->retrans_stamp; if (likely(timeout == 0)) { linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); From c7d13c8faa74f4e8ef191f88a252cefab6805b38 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 16 Jan 2019 15:05:31 -0800 Subject: [PATCH 4/8] tcp: properly track retry time on passive Fast Open This patch addresses a corner issue on timeout behavior of a passive Fast Open socket. A passive Fast Open server may write and close the socket when it is re-trying SYN-ACK to complete the handshake. After the handshake is completely, the server does not properly stamp the recovery start time (tp->retrans_stamp is 0), and the socket may abort immediately on the very first FIN timeout, instead of retying until it passes the system or user specified limit. Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 1e61f0bd6e24..074de38bafbd 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -378,6 +378,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); int max_retries = icsk->icsk_syn_retries ? : sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */ + struct tcp_sock *tp = tcp_sk(sk); struct request_sock *req; req = tcp_sk(sk)->fastopen_rsk; @@ -395,6 +396,8 @@ static void tcp_fastopen_synack_timer(struct sock *sk) inet_rtx_syn_ack(sk, req); req->num_timeout++; icsk->icsk_retransmits++; + if (!tp->retrans_stamp) + tp->retrans_stamp = tcp_time_stamp(tp); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); } From 01a523b071618abbc634d1958229fe3bd2dfa5fa Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 16 Jan 2019 15:05:32 -0800 Subject: [PATCH 5/8] tcp: create a helper to model exponential backoff Create a helper to model TCP exponential backoff for the next patch. This is pure refactor w no behavior change. Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 074de38bafbd..bcc2f5783e57 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -159,7 +159,20 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } +static unsigned int tcp_model_timeout(struct sock *sk, + unsigned int boundary, + unsigned int rto_base) +{ + unsigned int linear_backoff_thresh, timeout; + linear_backoff_thresh = ilog2(TCP_RTO_MAX / rto_base); + if (boundary <= linear_backoff_thresh) + timeout = ((2 << boundary) - 1) * rto_base; + else + timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + return jiffies_to_msecs(timeout); +} /** * retransmits_timed_out() - returns true if this connection has timed out * @sk: The current socket @@ -177,23 +190,15 @@ static bool retransmits_timed_out(struct sock *sk, unsigned int boundary, unsigned int timeout) { - const unsigned int rto_base = TCP_RTO_MIN; - unsigned int linear_backoff_thresh, start_ts; + unsigned int start_ts; if (!inet_csk(sk)->icsk_retransmits) return false; start_ts = tcp_sk(sk)->retrans_stamp; - if (likely(timeout == 0)) { - linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base); + if (likely(timeout == 0)) + timeout = tcp_model_timeout(sk, boundary, TCP_RTO_MIN); - if (boundary <= linear_backoff_thresh) - timeout = ((2 << boundary) - 1) * rto_base; - else - timeout = ((2 << linear_backoff_thresh) - 1) * rto_base + - (boundary - linear_backoff_thresh) * TCP_RTO_MAX; - timeout = jiffies_to_msecs(timeout); - } return (s32)(tcp_time_stamp(tcp_sk(sk)) - start_ts - timeout) >= 0; } From 9721e709fa68ef9b860c322b474cfbd1f8285b0f Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 16 Jan 2019 15:05:33 -0800 Subject: [PATCH 6/8] tcp: simplify window probe aborting on USER_TIMEOUT Previously we use the next unsent skb's timestamp to determine when to abort a socket stalling on window probes. This no longer works as skb timestamp reflects the last instead of the first transmission. Instead we can estimate how long the socket has been stalling with the probe count and the exponential backoff behavior. Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index bcc2f5783e57..c36089aa3515 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -333,7 +333,6 @@ static void tcp_probe_timer(struct sock *sk) struct sk_buff *skb = tcp_send_head(sk); struct tcp_sock *tp = tcp_sk(sk); int max_probes; - u32 start_ts; if (tp->packets_out || !skb) { icsk->icsk_probes_out = 0; @@ -348,12 +347,13 @@ static void tcp_probe_timer(struct sock *sk) * corresponding system limit. We also implement similar policy when * we use RTO to probe window in tcp_retransmit_timer(). */ - start_ts = tcp_skb_timestamp(skb); - if (!start_ts) - skb->skb_mstamp_ns = tp->tcp_clock_cache; - else if (icsk->icsk_user_timeout && - (s32)(tcp_time_stamp(tp) - start_ts) > icsk->icsk_user_timeout) - goto abort; + if (icsk->icsk_user_timeout) { + u32 elapsed = tcp_model_timeout(sk, icsk->icsk_probes_out, + tcp_probe0_base(sk)); + + if (elapsed >= icsk->icsk_user_timeout) + goto abort; + } max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { From 590d2026d62418bb27de9ca87526e9131c1f48af Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 16 Jan 2019 15:05:34 -0800 Subject: [PATCH 7/8] tcp: retry more conservatively on local congestion Previously when the sender fails to retransmit a data packet on timeout due to congestion in the local host (e.g. throttling in qdisc), it'll retry within an RTO up to 500ms. In low-RTT networks such as data-centers, RTO is often far below the default minimum 200ms (and the cap 500ms). Then local host congestion could trigger a retry storm pouring gas to the fire. Worse yet, the retry counter (icsk_retransmits) is not properly updated so the aggressive retry may exceed the system limit (15 rounds) until the packet finally slips through. On such rare events, it's wise to retry more conservatively (500ms) and update the stats properly to reflect these incidents and follow the system limit. Note that this is consistent with the behavior when a keep-alive probe is dropped due to local congestion. Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c36089aa3515..d7399a89469d 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -500,14 +500,13 @@ void tcp_retransmit_timer(struct sock *sk) tcp_enter_loss(sk); + icsk->icsk_retransmits++; if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) { /* Retransmission failed because of local congestion, - * do not backoff. + * Let senders fight for local resources conservatively. */ - if (!icsk->icsk_retransmits) - icsk->icsk_retransmits = 1; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL), + TCP_RESOURCE_PROBE_INTERVAL, TCP_RTO_MAX); goto out; } @@ -528,7 +527,6 @@ void tcp_retransmit_timer(struct sock *sk) * the 120 second clamps though! */ icsk->icsk_backoff++; - icsk->icsk_retransmits++; out_reset_timer: /* If stream is thin, use linear timeouts. Since 'icsk_backoff' is From c1d5674f8313b9f8e683c265f1c00a2582cf5fc5 Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Wed, 16 Jan 2019 15:05:35 -0800 Subject: [PATCH 8/8] tcp: less aggressive window probing on local congestion Previously when the sender fails to send (original) data packet or window probes due to congestion in the local host (e.g. throttling in qdisc), it'll retry within an RTO or two up to 500ms. In low-RTT networks such as data-centers, RTO is often far below the default minimum 200ms. Then local host congestion could trigger a retry storm pouring gas to the fire. Worse yet, the probe counter (icsk_probes_out) is not properly updated so the aggressive retry may exceed the system limit (15 rounds) until the packet finally slips through. On such rare events, it's wise to retry more conservatively (500ms) and update the stats properly to reflect these incidents and follow the system limit. Note that this is consistent with the behaviors when a keep-alive probe or RTO retry is dropped due to local congestion. Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Reviewed-by: Neal Cardwell Reviewed-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d2d494c74811..6527f61f59ff 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3749,7 +3749,7 @@ void tcp_send_probe0(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); - unsigned long probe_max; + unsigned long timeout; int err; err = tcp_write_wakeup(sk, LINUX_MIB_TCPWINPROBE); @@ -3761,26 +3761,18 @@ void tcp_send_probe0(struct sock *sk) return; } + icsk->icsk_probes_out++; if (err <= 0) { if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2) icsk->icsk_backoff++; - icsk->icsk_probes_out++; - probe_max = TCP_RTO_MAX; + timeout = tcp_probe0_when(sk, TCP_RTO_MAX); } else { /* If packet was not sent due to local congestion, - * do not backoff and do not remember icsk_probes_out. - * Let local senders to fight for local resources. - * - * Use accumulated backoff yet. + * Let senders fight for local resources conservatively. */ - if (!icsk->icsk_probes_out) - icsk->icsk_probes_out = 1; - probe_max = TCP_RESOURCE_PROBE_INTERVAL; + timeout = TCP_RESOURCE_PROBE_INTERVAL; } - tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, - tcp_probe0_when(sk, probe_max), - TCP_RTO_MAX, - NULL); + tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, timeout, TCP_RTO_MAX, NULL); } int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)