tcp-tso: do not split TSO packets at retransmit time
The Linux TCP stack painfully segments all TSO/GSO packets before retransmitting them. This was fine back in the days when TSO/GSO were emerging, with their bugs, but we believe the dark age is over.

Keeping big packets in the write queues, and in stack traversal, has a lot of benefits:
- Less memory overhead, because write queues contain fewer skbs.
- Less CPU overhead at ACK processing.
- Better SACK processing, as a lot of studies have mentioned how awful Linux was at this ;)
- Less CPU overhead to send the rtx packets (IP stack traversal, netfilter traversal, drivers...).
- Better latencies in the presence of losses.
- Smaller spikes in fq-like packet schedulers, as retransmits are not constrained by TCP Small Queues.

1% packet loss is common today, and at 100Gbit speeds this translates to ~80,000 losses per second. Losses are often correlated, and we see many retransmit events leading to 1-MSS trains of packets, at a time when hosts are already under stress.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Yuchung Cheng <ycheng@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
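A quick orientation before the metadata and diff below: tcp_xmit_retransmit_queue() now computes how many segments the congestion window still allows and passes that budget down, and __tcp_retransmit_skb() only calls tcp_fragment() when the skb exceeds cur_mss * segs, instead of always chopping a retransmit down to a single MSS. (For the figure quoted above: at ~1500-byte packets, 100Gbit/s is roughly 8 million packets per second, so 1% loss is indeed on the order of 80,000 losses per second.) The standalone C sketch below illustrates only the budgeting arithmetic; the helper name and the numbers are made up for illustration, and this is not kernel code.

/* Illustration of the retransmit budgeting introduced by this patch.
 * Old behaviour: a retransmitted skb larger than one MSS was always cut
 * down to a single MSS.  New behaviour: the caller passes how many
 * segments the congestion window still allows, and the skb is only
 * fragmented down to cur_mss * segs, so it can stay a TSO packet.
 */
#include <stdio.h>

static unsigned int rtx_len(unsigned int skb_len, unsigned int cur_mss,
			    unsigned int snd_cwnd, unsigned int in_flight)
{
	unsigned int segs, len;

	if (in_flight >= snd_cwnd)
		return 0;			/* no cwnd budget left */

	segs = snd_cwnd - in_flight;		/* as in tcp_xmit_retransmit_queue() */
	len = cur_mss * segs;

	return skb_len > len ? len : skb_len;	/* fragment only when over budget */
}

int main(void)
{
	/* 64 KB TSO packet, 1448-byte MSS, cwnd of 40 with 10 packets in flight */
	unsigned int len = rtx_len(64 * 1024, 1448, 40, 10);

	/* Before the patch only the first 1448 bytes would be retransmitted
	 * here; with the patch a single 30-segment TSO retransmit goes out.
	 */
	printf("retransmit length: %u bytes (%u segments)\n", len, len / 1448);
	return 0;
}

Callers that retransmit a single skb outside this cwnd-budgeted loop (RTO, tail loss probe, fastopen SYN data) simply pass segs = 1, as can be seen in the hunks below.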
parent 8cee83dd29
commit 10d3be5692
@@ -538,8 +538,8 @@ __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss);
 void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
 			       int nonagle);
 bool tcp_may_send_now(struct sock *sk);
-int __tcp_retransmit_skb(struct sock *, struct sk_buff *);
-int tcp_retransmit_skb(struct sock *, struct sk_buff *);
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs);
 void tcp_retransmit_timer(struct sock *sk);
 void tcp_xmit_retransmit_queue(struct sock *);
 void tcp_simple_retransmit(struct sock *);

@@ -5545,7 +5545,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
 	if (data) { /* Retransmit unacked data in SYN */
 		tcp_for_write_queue_from(data, sk) {
 			if (data == tcp_send_head(sk) ||
-			    __tcp_retransmit_skb(sk, data))
+			    __tcp_retransmit_skb(sk, data, 1))
 				break;
 		}
 		tcp_rearm_rto(sk);

@@ -2268,7 +2268,7 @@ void tcp_send_loss_probe(struct sock *sk)
 	if (WARN_ON(!skb || !tcp_skb_pcount(skb)))
 		goto rearm_timer;
 
-	if (__tcp_retransmit_skb(sk, skb))
+	if (__tcp_retransmit_skb(sk, skb, 1))
 		goto rearm_timer;
 
 	/* Record snd_nxt for loss detection. */
@@ -2571,17 +2571,17 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
  * state updates are done by the caller.  Returns non-zero if an
  * error occurred which prevented the send.
  */
-int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 {
-	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned int cur_mss;
-	int err;
+	int diff, len, err;
 
-	/* Inconslusive MTU probe */
-	if (icsk->icsk_mtup.probe_size) {
+
+	/* Inconclusive MTU probe */
+	if (icsk->icsk_mtup.probe_size)
 		icsk->icsk_mtup.probe_size = 0;
-	}
 
 	/* Do not sent more than we queued. 1/4 is reserved for possible
 	 * copying overhead: fragmentation, tunneling, mangling etc.
@@ -2614,30 +2614,27 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	    TCP_SKB_CB(skb)->seq != tp->snd_una)
 		return -EAGAIN;
 
-	if (skb->len > cur_mss) {
-		if (tcp_fragment(sk, skb, cur_mss, cur_mss, GFP_ATOMIC))
+	len = cur_mss * segs;
+	if (skb->len > len) {
+		if (tcp_fragment(sk, skb, len, cur_mss, GFP_ATOMIC))
 			return -ENOMEM; /* We'll try again later. */
 	} else {
-		int oldpcount = tcp_skb_pcount(skb);
+		if (skb_unclone(skb, GFP_ATOMIC))
+			return -ENOMEM;
 
-		if (unlikely(oldpcount > 1)) {
-			if (skb_unclone(skb, GFP_ATOMIC))
-				return -ENOMEM;
-			tcp_init_tso_segs(skb, cur_mss);
-			tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
-		}
+		diff = tcp_skb_pcount(skb);
+		tcp_set_skb_tso_segs(skb, cur_mss);
+		diff -= tcp_skb_pcount(skb);
+		if (diff)
+			tcp_adjust_pcount(sk, skb, diff);
+		if (skb->len < cur_mss)
+			tcp_retrans_try_collapse(sk, skb, cur_mss);
 	}
 
 	/* RFC3168, section 6.1.1.1. ECN fallback */
 	if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
 		tcp_ecn_clear_syn(sk, skb);
 
-	tcp_retrans_try_collapse(sk, skb, cur_mss);
-
-	/* Make a copy, if the first transmission SKB clone we made
-	 * is still in somebody's hands, else make a clone.
-	 */
-
 	/* make sure skb->data is aligned on arches that require it
 	 * and check if ack-trimming & collapsing extended the headroom
 	 * beyond what csum_start can cover.
@@ -2653,20 +2650,22 @@ int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 	}
 
 	if (likely(!err)) {
+		segs = tcp_skb_pcount(skb);
+
 		TCP_SKB_CB(skb)->sacked |= TCPCB_EVER_RETRANS;
 		/* Update global TCP statistics. */
-		TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
+		TCP_ADD_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS, segs);
 		if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
 			NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
-		tp->total_retrans++;
+		tp->total_retrans += segs;
 	}
 	return err;
 }
 
-int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
+int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	int err = __tcp_retransmit_skb(sk, skb);
+	int err = __tcp_retransmit_skb(sk, skb, segs);
 
 	if (err == 0) {
 #if FASTRETRANS_DEBUG > 0
@@ -2757,6 +2756,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 
 	tcp_for_write_queue_from(skb, sk) {
 		__u8 sacked = TCP_SKB_CB(skb)->sacked;
+		int segs;
 
 		if (skb == tcp_send_head(sk))
 			break;
@@ -2764,14 +2764,8 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
 		if (!hole)
 			tp->retransmit_skb_hint = skb;
 
-		/* Assume this retransmit will generate
-		 * only one packet for congestion window
-		 * calculation purposes.  This works because
-		 * tcp_retransmit_skb() will chop up the
-		 * packet to be MSS sized and all the
-		 * packet counting works out.
-		 */
-		if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
+		segs = tp->snd_cwnd - tcp_packets_in_flight(tp);
+		if (segs <= 0)
 			return;
 
 		if (fwd_rexmitting) {
@@ -2808,7 +2802,7 @@ begin_fwd:
 		if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))
 			continue;
 
-		if (tcp_retransmit_skb(sk, skb))
+		if (tcp_retransmit_skb(sk, skb, segs))
 			return;
 
 		NET_INC_STATS_BH(sock_net(sk), mib_idx);

@@ -404,7 +404,7 @@ void tcp_retransmit_timer(struct sock *sk)
 			goto out;
 		}
 		tcp_enter_loss(sk);
-		tcp_retransmit_skb(sk, tcp_write_queue_head(sk));
+		tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1);
 		__sk_dst_reset(sk);
 		goto out_reset_timer;
 	}
@@ -436,7 +436,7 @@ void tcp_retransmit_timer(struct sock *sk)
 
 	tcp_enter_loss(sk);
 
-	if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk)) > 0) {
+	if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) {
 		/* Retransmission failed because of local congestion,
 		 * do not backoff.
 		 */