linux/net/ipv4/tcp_recovery.c

#include <linux/tcp.h>
#include <net/tcp.h>

int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;

static void tcp_rack_mark_skb_lost(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);

	tcp_skb_mark_lost_uncond_verify(tp, skb);
	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
		/* Account for retransmits that are lost again */
		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
		tp->retrans_out -= tcp_skb_pcount(skb);
		NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);
	}
}

/* Marks a packet lost, if some packet sent later has been (s)acked.
 * The underlying idea is similar to the traditional dupthresh and FACK
 * but they look at different metrics:
 *
 * dupthresh: 3 OOO packets delivered (packet count)
 * FACK: sequence delta to highest sacked sequence (sequence space)
 * RACK: sent time delta to the latest delivered packet (time domain)
 *
 * The advantage of RACK is it applies to both original and retransmitted
 * packet and therefore is robust against tail losses. Another advantage
 * is being more resilient to reordering by simply allowing some
 * "settling delay", instead of tweaking the dupthresh.
 *
 * The current version is only used after recovery starts but can be
 * easily extended to detect the first loss.
 */
static void tcp_rack_detect_loss(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *skb;
	u32 reo_wnd;

	/* To be more reordering resilient, allow min_rtt/4 settling delay
	 * (lower-bounded to 1000uS). We use min_rtt instead of the smoothed
	 * RTT because reordering is often a path property and less related
	 * to queuing or delayed ACKs.
	 *
	 * TODO: measure and adapt to the observed reordering delay, and
	 * use a timer to retransmit like the delayed early retransmit.
	 */
	reo_wnd = 1000;
	if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)
		reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);

	tcp_for_write_queue(skb, sk) {
		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);

		if (skb == tcp_send_head(sk))
			break;

		/* Skip ones already (s)acked */
		if (!after(scb->end_seq, tp->snd_una) ||
		    scb->sacked & TCPCB_SACKED_ACKED)
			continue;

		if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {

			if (skb_mstamp_us_delta(&tp->rack.mstamp,
						&skb->skb_mstamp) <= reo_wnd)
				continue;

			/* skb is lost if packet sent later is sacked */
			tcp_rack_mark_skb_lost(sk, skb);
		} else if (!(scb->sacked & TCPCB_RETRANS)) {
			/* Original data are sent sequentially so stop early
			 * b/c the rest are all sent after rack_sent
			 */
			break;
		}
	}
}

void tcp_rack_mark_lost(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);

	if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery || !tp->rack.advanced)
		return;
	/* Reset the advanced flag to avoid unnecessary queue scanning */
	tp->rack.advanced = 0;
	tcp_rack_detect_loss(sk);
}

/* Record the most recently (re)sent time among the (s)acked packets */
void tcp_rack_advance(struct tcp_sock *tp,
		      const struct skb_mstamp *xmit_time, u8 sacked)
{
	if (tp->rack.mstamp.v64 &&
	    !skb_mstamp_after(xmit_time, &tp->rack.mstamp))
		return;

	if (sacked & TCPCB_RETRANS) {
		struct skb_mstamp now;

		/* If the sacked packet was retransmitted, it's ambiguous
		 * whether the retransmission or the original (or the prior
		 * retransmission) was sacked.
		 *
		 * If the original is lost, there is no ambiguity. Otherwise
		 * we assume the original can be delayed up to aRTT + min_rtt.
		 * the aRTT term is bounded by the fast recovery or timeout,
		 * so it's at least one RTT (i.e., retransmission is at least
		 * an RTT later).
		 */
		skb_mstamp_get(&now);
		if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp))
			return;
	}

	tp->rack.mstamp = *xmit_time;
	tp->rack.advanced = 1;
}
tcp: track the packet timings in RACK This patch is the first half of the RACK loss recovery. RACK loss recovery uses the notion of time instead of packet sequence (FACK) or counts (dupthresh). It's inspired by the previous FACK heuristic in tcp_mark_lost_retrans(): when a limited transmit (new data packet) is sacked, then current retransmitted sequence below the newly sacked sequence must been lost, since at least one round trip time has elapsed. But it has several limitations: 1) can't detect tail drops since it depends on limited transmit 2) is disabled upon reordering (assumes no reordering) 3) only enabled in fast recovery ut not timeout recovery RACK (Recently ACK) addresses these limitations with the notion of time instead: a packet P1 is lost if a later packet P2 is s/acked, as at least one round trip has passed. Since RACK cares about the time sequence instead of the data sequence of packets, it can detect tail drops when later retransmission is s/acked while FACK or dupthresh can't. For reordering RACK uses a dynamically adjusted reordering window ("reo_wnd") to reduce false positives on ever (small) degree of reordering. This patch implements tcp_advanced_rack() which tracks the most recent transmission time among the packets that have been delivered (ACKed or SACKed) in tp->rack.mstamp. This timestamp is the key to determine which packet has been lost. Consider an example that the sender sends six packets: T1: P1 (lost) T2: P2 T3: P3 T4: P4 T100: sack of P2. rack.mstamp = T2 T101: retransmit P1 T102: sack of P2,P3,P4. rack.mstamp = T4 T205: ACK of P4 since the hole is repaired. rack.mstamp = T101 We need to be careful about spurious retransmission because it may falsely advance tp->rack.mstamp by an RTT or an RTO, causing RACK to falsely mark all packets lost, just like a spurious timeout. We identify spurious retransmission by the ACK's TS echo value. If TS option is not applicable but the retransmission is acknowledged less than min-RTT ago, it is likely to be spurious. We refrain from using the transmission time of these spurious retransmissions. The second half is implemented in the next patch that marks packet lost using RACK timestamp. Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2015-10-17 04:57:46 +00:00			`#include <linux/tcp.h>`
			`#include <net/tcp.h>`

tcp: use RACK to detect losses This patch implements the second half of RACK that uses the the most recent transmit time among all delivered packets to detect losses. tcp_rack_mark_lost() is called upon receiving a dubious ACK. It then checks if an not-yet-sacked packet was sent at least "reo_wnd" prior to the sent time of the most recently delivered. If so the packet is deemed lost. The "reo_wnd" reordering window starts with 1msec for fast loss detection and changes to min-RTT/4 when reordering is observed. We found 1msec accommodates well on tiny degree of reordering (<3 pkts) on faster links. We use min-RTT instead of SRTT because reordering is more of a path property but SRTT can be inflated by self-inflicated congestion. The factor of 4 is borrowed from the delayed early retransmit and seems to work reasonably well. Since RACK is still experimental, it is now used as a supplemental loss detection on top of existing algorithms. It is only effective after the fast recovery starts or after the timeout occurs. The fast recovery is still triggered by FACK and/or dupack threshold instead of RACK. We introduce a new sysctl net.ipv4.tcp_recovery for future experiments of loss recoveries. For now RACK can be disabled by setting it to 0. Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2015-10-17 04:57:47 +00:00			`int sysctl_tcp_recovery __read_mostly = TCP_RACK_LOST_RETRANS;`

tcp: new helper function for RACK loss detection Create a new helper tcp_rack_mark_skb_lost to prepare the upcoming RACK reordering timer support. Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-01-13 06:11:30 +00:00			`static void tcp_rack_mark_skb_lost(struct sock sk, struct sk_buff skb)`
			`{`
			`struct tcp_sock *tp = tcp_sk(sk);`

			`tcp_skb_mark_lost_uncond_verify(tp, skb);`
			`if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {`
			`/* Account for retransmits that are lost again */`
			`TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;`
			`tp->retrans_out -= tcp_skb_pcount(skb);`
			`NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSTRETRANSMIT);`
			`}`
			`}`

tcp: use RACK to detect losses This patch implements the second half of RACK that uses the the most recent transmit time among all delivered packets to detect losses. tcp_rack_mark_lost() is called upon receiving a dubious ACK. It then checks if an not-yet-sacked packet was sent at least "reo_wnd" prior to the sent time of the most recently delivered. If so the packet is deemed lost. The "reo_wnd" reordering window starts with 1msec for fast loss detection and changes to min-RTT/4 when reordering is observed. We found 1msec accommodates well on tiny degree of reordering (<3 pkts) on faster links. We use min-RTT instead of SRTT because reordering is more of a path property but SRTT can be inflated by self-inflicated congestion. The factor of 4 is borrowed from the delayed early retransmit and seems to work reasonably well. Since RACK is still experimental, it is now used as a supplemental loss detection on top of existing algorithms. It is only effective after the fast recovery starts or after the timeout occurs. The fast recovery is still triggered by FACK and/or dupack threshold instead of RACK. We introduce a new sysctl net.ipv4.tcp_recovery for future experiments of loss recoveries. For now RACK can be disabled by setting it to 0. Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2015-10-17 04:57:47 +00:00			`/* Marks a packet lost, if some packet sent later has been (s)acked.`
			`* The underlying idea is similar to the traditional dupthresh and FACK`
			`* but they look at different metrics:`
			`*`
			`* dupthresh: 3 OOO packets delivered (packet count)`
			`* FACK: sequence delta to highest sacked sequence (sequence space)`
			`* RACK: sent time delta to the latest delivered packet (time domain)`
			`*`
			`* The advantage of RACK is it applies to both original and retransmitted`
			`* packet and therefore is robust against tail losses. Another advantage`
			`* is being more resilient to reordering by simply allowing some`
			`* "settling delay", instead of tweaking the dupthresh.`
			`*`
			`* The current version is only used after recovery starts but can be`
			`* easily extended to detect the first loss.`
			`*/`
tcp: new helper for RACK to detect loss Create a new helper tcp_rack_detect_loss to prepare the upcoming RACK reordering timer patch. Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-01-13 06:11:31 +00:00			`static void tcp_rack_detect_loss(struct sock *sk)`
tcp: use RACK to detect losses This patch implements the second half of RACK that uses the the most recent transmit time among all delivered packets to detect losses. tcp_rack_mark_lost() is called upon receiving a dubious ACK. It then checks if an not-yet-sacked packet was sent at least "reo_wnd" prior to the sent time of the most recently delivered. If so the packet is deemed lost. The "reo_wnd" reordering window starts with 1msec for fast loss detection and changes to min-RTT/4 when reordering is observed. We found 1msec accommodates well on tiny degree of reordering (<3 pkts) on faster links. We use min-RTT instead of SRTT because reordering is more of a path property but SRTT can be inflated by self-inflicated congestion. The factor of 4 is borrowed from the delayed early retransmit and seems to work reasonably well. Since RACK is still experimental, it is now used as a supplemental loss detection on top of existing algorithms. It is only effective after the fast recovery starts or after the timeout occurs. The fast recovery is still triggered by FACK and/or dupack threshold instead of RACK. We introduce a new sysctl net.ipv4.tcp_recovery for future experiments of loss recoveries. For now RACK can be disabled by setting it to 0. Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2015-10-17 04:57:47 +00:00			`{`
			`struct tcp_sock *tp = tcp_sk(sk);`
			`struct sk_buff *skb;`
tcp: new helper for RACK to detect loss Create a new helper tcp_rack_detect_loss to prepare the upcoming RACK reordering timer patch. Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-01-13 06:11:31 +00:00			`u32 reo_wnd;`
tcp: use RACK to detect losses This patch implements the second half of RACK that uses the the most recent transmit time among all delivered packets to detect losses. tcp_rack_mark_lost() is called upon receiving a dubious ACK. It then checks if an not-yet-sacked packet was sent at least "reo_wnd" prior to the sent time of the most recently delivered. If so the packet is deemed lost. The "reo_wnd" reordering window starts with 1msec for fast loss detection and changes to min-RTT/4 when reordering is observed. We found 1msec accommodates well on tiny degree of reordering (<3 pkts) on faster links. We use min-RTT instead of SRTT because reordering is more of a path property but SRTT can be inflated by self-inflicated congestion. The factor of 4 is borrowed from the delayed early retransmit and seems to work reasonably well. Since RACK is still experimental, it is now used as a supplemental loss detection on top of existing algorithms. It is only effective after the fast recovery starts or after the timeout occurs. The fast recovery is still triggered by FACK and/or dupack threshold instead of RACK. We introduce a new sysctl net.ipv4.tcp_recovery for future experiments of loss recoveries. For now RACK can be disabled by setting it to 0. Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2015-10-17 04:57:47 +00:00
			`/* To be more reordering resilient, allow min_rtt/4 settling delay`
			`* (lower-bounded to 1000uS). We use min_rtt instead of the smoothed`
			`* RTT because reordering is often a path property and less related`
			`* to queuing or delayed ACKs.`
			`*`
			`* TODO: measure and adapt to the observed reordering delay, and`
			`* use a timer to retransmit like the delayed early retransmit.`
			`*/`
			`reo_wnd = 1000;`
			`if (tp->rack.reord && tcp_min_rtt(tp) != ~0U)`
			`reo_wnd = max(tcp_min_rtt(tp) >> 2, reo_wnd);`

			`tcp_for_write_queue(skb, sk) {`
			`struct tcp_skb_cb *scb = TCP_SKB_CB(skb);`

			`if (skb == tcp_send_head(sk))`
			`break;`

			`/* Skip ones already (s)acked */`
			`if (!after(scb->end_seq, tp->snd_una) \|\|`
			`scb->sacked & TCPCB_SACKED_ACKED)`
			`continue;`

			`if (skb_mstamp_after(&tp->rack.mstamp, &skb->skb_mstamp)) {`

			`if (skb_mstamp_us_delta(&tp->rack.mstamp,`
			`&skb->skb_mstamp) <= reo_wnd)`
			`continue;`

			`/* skb is lost if packet sent later is sacked */`
tcp: new helper function for RACK loss detection Create a new helper tcp_rack_mark_skb_lost to prepare the upcoming RACK reordering timer support. Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-01-13 06:11:30 +00:00			`tcp_rack_mark_skb_lost(sk, skb);`
tcp: use RACK to detect losses This patch implements the second half of RACK that uses the the most recent transmit time among all delivered packets to detect losses. tcp_rack_mark_lost() is called upon receiving a dubious ACK. It then checks if an not-yet-sacked packet was sent at least "reo_wnd" prior to the sent time of the most recently delivered. If so the packet is deemed lost. The "reo_wnd" reordering window starts with 1msec for fast loss detection and changes to min-RTT/4 when reordering is observed. We found 1msec accommodates well on tiny degree of reordering (<3 pkts) on faster links. We use min-RTT instead of SRTT because reordering is more of a path property but SRTT can be inflated by self-inflicated congestion. The factor of 4 is borrowed from the delayed early retransmit and seems to work reasonably well. Since RACK is still experimental, it is now used as a supplemental loss detection on top of existing algorithms. It is only effective after the fast recovery starts or after the timeout occurs. The fast recovery is still triggered by FACK and/or dupack threshold instead of RACK. We introduce a new sysctl net.ipv4.tcp_recovery for future experiments of loss recoveries. For now RACK can be disabled by setting it to 0. Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2015-10-17 04:57:47 +00:00			`} else if (!(scb->sacked & TCPCB_RETRANS)) {`
			`/* Original data are sent sequentially so stop early`
			`* b/c the rest are all sent after rack_sent`
			`*/`
			`break;`
			`}`
			`}`
tcp: new helper for RACK to detect loss Create a new helper tcp_rack_detect_loss to prepare the upcoming RACK reordering timer patch. Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-01-13 06:11:31 +00:00			`}`

			`void tcp_rack_mark_lost(struct sock *sk)`
			`{`
			`struct tcp_sock *tp = tcp_sk(sk);`

			`if (inet_csk(sk)->icsk_ca_state < TCP_CA_Recovery \|\| !tp->rack.advanced)`
			`return;`
			`/* Reset the advanced flag to avoid unnecessary queue scanning */`
			`tp->rack.advanced = 0;`
			`tcp_rack_detect_loss(sk);`
tcp: use RACK to detect losses This patch implements the second half of RACK that uses the the most recent transmit time among all delivered packets to detect losses. tcp_rack_mark_lost() is called upon receiving a dubious ACK. It then checks if an not-yet-sacked packet was sent at least "reo_wnd" prior to the sent time of the most recently delivered. If so the packet is deemed lost. The "reo_wnd" reordering window starts with 1msec for fast loss detection and changes to min-RTT/4 when reordering is observed. We found 1msec accommodates well on tiny degree of reordering (<3 pkts) on faster links. We use min-RTT instead of SRTT because reordering is more of a path property but SRTT can be inflated by self-inflicated congestion. The factor of 4 is borrowed from the delayed early retransmit and seems to work reasonably well. Since RACK is still experimental, it is now used as a supplemental loss detection on top of existing algorithms. It is only effective after the fast recovery starts or after the timeout occurs. The fast recovery is still triggered by FACK and/or dupack threshold instead of RACK. We introduce a new sysctl net.ipv4.tcp_recovery for future experiments of loss recoveries. For now RACK can be disabled by setting it to 0. Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2015-10-17 04:57:47 +00:00			`}`

tcp: track the packet timings in RACK This patch is the first half of the RACK loss recovery. RACK loss recovery uses the notion of time instead of packet sequence (FACK) or counts (dupthresh). It's inspired by the previous FACK heuristic in tcp_mark_lost_retrans(): when a limited transmit (new data packet) is sacked, then current retransmitted sequence below the newly sacked sequence must been lost, since at least one round trip time has elapsed. But it has several limitations: 1) can't detect tail drops since it depends on limited transmit 2) is disabled upon reordering (assumes no reordering) 3) only enabled in fast recovery ut not timeout recovery RACK (Recently ACK) addresses these limitations with the notion of time instead: a packet P1 is lost if a later packet P2 is s/acked, as at least one round trip has passed. Since RACK cares about the time sequence instead of the data sequence of packets, it can detect tail drops when later retransmission is s/acked while FACK or dupthresh can't. For reordering RACK uses a dynamically adjusted reordering window ("reo_wnd") to reduce false positives on ever (small) degree of reordering. This patch implements tcp_advanced_rack() which tracks the most recent transmission time among the packets that have been delivered (ACKed or SACKed) in tp->rack.mstamp. This timestamp is the key to determine which packet has been lost. Consider an example that the sender sends six packets: T1: P1 (lost) T2: P2 T3: P3 T4: P4 T100: sack of P2. rack.mstamp = T2 T101: retransmit P1 T102: sack of P2,P3,P4. rack.mstamp = T4 T205: ACK of P4 since the hole is repaired. rack.mstamp = T101 We need to be careful about spurious retransmission because it may falsely advance tp->rack.mstamp by an RTT or an RTO, causing RACK to falsely mark all packets lost, just like a spurious timeout. We identify spurious retransmission by the ACK's TS echo value. If TS option is not applicable but the retransmission is acknowledged less than min-RTT ago, it is likely to be spurious. We refrain from using the transmission time of these spurious retransmissions. The second half is implemented in the next patch that marks packet lost using RACK timestamp. Signed-off-by: Yuchung Cheng <ycheng@google.com> Signed-off-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2015-10-17 04:57:46 +00:00			`/* Record the most recently (re)sent time among the (s)acked packets */`
			`void tcp_rack_advance(struct tcp_sock *tp,`
			`const struct skb_mstamp *xmit_time, u8 sacked)`
			`{`
			`if (tp->rack.mstamp.v64 &&`
			`!skb_mstamp_after(xmit_time, &tp->rack.mstamp))`
			`return;`

			`if (sacked & TCPCB_RETRANS) {`
			`struct skb_mstamp now;`

			`/* If the sacked packet was retransmitted, it's ambiguous`
			`* whether the retransmission or the original (or the prior`
			`* retransmission) was sacked.`
			`*`
			`* If the original is lost, there is no ambiguity. Otherwise`
			`* we assume the original can be delayed up to aRTT + min_rtt.`
			`* the aRTT term is bounded by the fast recovery or timeout,`
			`* so it's at least one RTT (i.e., retransmission is at least`
			`* an RTT later).`
			`*/`
			`skb_mstamp_get(&now);`
			`if (skb_mstamp_us_delta(&now, xmit_time) < tcp_min_rtt(tp))`
			`return;`
			`}`

			`tp->rack.mstamp = *xmit_time;`
			`tp->rack.advanced = 1;`
			`}`