From 69e996c58a35db9ca79b3f021a15bcd22202e1c0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Apr 2017 10:15:32 -0700 Subject: [PATCH 01/10] tcp: add tp->tcp_mstamp field We want to use precise timestamps in TCP stack, but we do not want to call possibly expensive kernel time services too often. tp->tcp_mstamp is guaranteed to be updated once per incoming packet. We will use it in the following patches, removing specific skb_mstamp_get() calls, and removing ack_time from struct tcp_sacktag_state. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 1 + net/ipv4/tcp_input.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index cbe5b602a2d3..99a22f44c32e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -240,6 +240,7 @@ struct tcp_sock { u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ /* RTT measurement */ + struct skb_mstamp tcp_mstamp; /* most recent packet received/sent */ u32 srtt_us; /* smoothed round trip time << 3 in usecs */ u32 mdev_us; /* medium deviation */ u32 mdev_max_us; /* maximal mdev for the last rtt period */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5af2f04f8859..bd18c65df4a9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5362,6 +5362,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); + skb_mstamp_get(&tp->tcp_mstamp); if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); /* @@ -5922,6 +5923,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_SYN_SENT: tp->rx_opt.saw_tstamp = 0; + skb_mstamp_get(&tp->tcp_mstamp); queued = tcp_rcv_synsent_state_process(sk, skb, th); if (queued >= 0) return queued; @@ -5933,6 +5935,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) return 0; } + skb_mstamp_get(&tp->tcp_mstamp); tp->rx_opt.saw_tstamp = 0; req = tp->fastopen_rsk; if (req) { From 7c1c7308592f0f261836a96e37b7835ffd10d85b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Apr 2017 10:15:33 -0700 Subject: [PATCH 02/10] tcp: do not pass timestamp to tcp_rack_detect_loss() We can use tp->tcp_mstamp as it contains a recent timestamp. This removes a call to skb_mstamp_get() from tcp_rack_reo_timeout() Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_recovery.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index d8acbd9f477a..fdac262e277b 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -45,8 +45,7 @@ static bool tcp_rack_sent_after(const struct skb_mstamp *t1, * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will * make us enter the CA_Recovery state. */ -static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now, - u32 *reo_timeout) +static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -79,7 +78,7 @@ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now, * A packet is lost if its elapsed time is beyond * the recent RTT plus the reordering window. */ - u32 elapsed = skb_mstamp_us_delta(now, + u32 elapsed = skb_mstamp_us_delta(&tp->tcp_mstamp, &skb->skb_mstamp); s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed; @@ -115,7 +114,7 @@ void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now) /* Reset the advanced flag to avoid unnecessary queue scanning */ tp->rack.advanced = 0; - tcp_rack_detect_loss(sk, now, &timeout); + tcp_rack_detect_loss(sk, &timeout); if (timeout) { timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN); inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT, @@ -165,12 +164,10 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, void tcp_rack_reo_timeout(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct skb_mstamp now; u32 timeout, prior_inflight; - skb_mstamp_get(&now); prior_inflight = tcp_packets_in_flight(tp); - tcp_rack_detect_loss(sk, &now, &timeout); + tcp_rack_detect_loss(sk, &timeout); if (prior_inflight != tcp_packets_in_flight(tp)) { if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) { tcp_enter_recovery(sk, false); From 128eda86bebeacefb0fcc64cab0155aa76857c92 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Apr 2017 10:15:34 -0700 Subject: [PATCH 03/10] tcp: do not pass timestamp to tcp_rack_mark_lost() This is no longer used, since tcp_rack_detect_loss() takes the timestamp from tp->tcp_mstamp Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/tcp_input.c | 2 +- net/ipv4/tcp_recovery.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index da28bef1d82b..8b4433c4aaa2 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1853,7 +1853,7 @@ void tcp_v4_init(void); void tcp_init(void); /* tcp_recovery.c */ -extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now); +extern void tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, const struct skb_mstamp *xmit_time, const struct skb_mstamp *ack_time); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bd18c65df4a9..d4885f7a6a93 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2769,7 +2769,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag, if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { u32 prior_retrans = tp->retrans_out; - tcp_rack_mark_lost(sk, ack_time); + tcp_rack_mark_lost(sk); if (prior_retrans > tp->retrans_out) *ack_flag |= FLAG_LOST_RETRANS; } diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index fdac262e277b..6ca8b5d9d803 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -104,7 +104,7 @@ static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) } } -void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now) +void tcp_rack_mark_lost(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 timeout; From efab8f85826afbbfe4b0ca9208e006aabbd2df3a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Apr 2017 10:15:35 -0700 Subject: [PATCH 04/10] tcp: do not pass timestamp to tcp_rack_identify_loss() Not used anymore now tp->tcp_mstamp holds the information. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d4885f7a6a93..99b0d65de169 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2760,8 +2760,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked) return false; } -static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag, - const struct skb_mstamp *ack_time) +static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) { struct tcp_sock *tp = tcp_sk(sk); @@ -2857,11 +2856,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, tcp_try_keep_open(sk); return; } - tcp_rack_identify_loss(sk, ack_flag, ack_time); + tcp_rack_identify_loss(sk, ack_flag); break; case TCP_CA_Loss: tcp_process_loss(sk, flag, is_dupack, rexmit); - tcp_rack_identify_loss(sk, ack_flag, ack_time); + tcp_rack_identify_loss(sk, ack_flag); if (!(icsk->icsk_ca_state == TCP_CA_Open || (*ack_flag & FLAG_LOST_RETRANS))) return; @@ -2877,7 +2876,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); - tcp_rack_identify_loss(sk, ack_flag, ack_time); + tcp_rack_identify_loss(sk, ack_flag); if (!tcp_time_to_recover(sk, flag)) { tcp_try_to_open(sk, flag); return; From 1317a9d69f5fa0f5417237772f55a4aac49f7921 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Apr 2017 10:15:36 -0700 Subject: [PATCH 05/10] tcp: do not pass timestamp to tcp_fastretrans_alert() Not used anymore now tp->tcp_mstamp holds the information. This is needed to remove sack_state.ack_time in a following patch. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 99b0d65de169..68094aa8cfb2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2787,8 +2787,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) * tcp_xmit_retransmit_queue(). */ static void tcp_fastretrans_alert(struct sock *sk, const int acked, - bool is_dupack, int *ack_flag, int *rexmit, - const struct skb_mstamp *ack_time) + bool is_dupack, int *ack_flag, int *rexmit) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -3646,8 +3645,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, - &sack_state.ack_time); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); } if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -3668,8 +3666,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, - &sack_state.ack_time); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. @@ -3693,8 +3690,7 @@ old_ack: skb_mstamp_get(&sack_state.ack_time); flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, - &sack_state.ack_time); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); tcp_xmit_recovery(sk, rexmit); } From 88d5c65098e5d15f2cea81f90bb6ecc167e1aa3b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Apr 2017 10:15:37 -0700 Subject: [PATCH 06/10] tcp: do not pass timestamp to tcp_rate_gen() No longer needed, since tp->tcp_mstamp holds the information. This is needed to remove sack_state.ack_time in a following patch. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- net/ipv4/tcp_input.c | 3 +-- net/ipv4/tcp_rate.c | 7 ++++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 8b4433c4aaa2..d7aae25efc7f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1004,7 +1004,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - struct skb_mstamp *now, struct rate_sample *rs); + struct rate_sample *rs); void tcp_rate_check_app_limited(struct sock *sk); /* These functions determine how the current flow behaves in respect of SACK diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 68094aa8cfb2..2d84483de2e1 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3657,8 +3657,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_schedule_loss_probe(sk); delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ - tcp_rate_gen(sk, delivered, lost, &sack_state.ack_time, - sack_state.rate); + tcp_rate_gen(sk, delivered, lost, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); return 1; diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 9be1581a5a08..c6a9fa894646 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -106,7 +106,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, /* Update the connection delivery information and generate a rate sample. */ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - struct skb_mstamp *now, struct rate_sample *rs) + struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); u32 snd_us, ack_us; @@ -120,7 +120,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, * to carry current time, flags, stats like "tcp_sacktag_state". */ if (delivered) - tp->delivered_mstamp = *now; + tp->delivered_mstamp = tp->tcp_mstamp; rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ rs->losses = lost; /* freshly marked lost */ @@ -138,7 +138,8 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, * longer phase. */ snd_us = rs->interval_us; /* send phase */ - ack_us = skb_mstamp_us_delta(now, &rs->prior_mstamp); /* ack phase */ + ack_us = skb_mstamp_us_delta(&tp->tcp_mstamp, + &rs->prior_mstamp); /* ack phase */ rs->interval_us = max(snd_us, ack_us); /* Normally we expect interval_us >= min-rtt. From d2329f102d846214e449941289c7009e16be01a0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Apr 2017 10:15:38 -0700 Subject: [PATCH 07/10] tcp: do not pass timestamp to tcp_rack_advance() No longer needed, since tp->tcp_mstamp holds the information. This is needed to remove sack_state.ack_time in a following patch. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 3 +-- net/ipv4/tcp_input.c | 6 ++---- net/ipv4/tcp_recovery.c | 5 ++--- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index d7aae25efc7f..270e5cc43c99 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1855,8 +1855,7 @@ void tcp_init(void); /* tcp_recovery.c */ extern void tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, - const struct skb_mstamp *xmit_time, - const struct skb_mstamp *ack_time); + const struct skb_mstamp *xmit_time); extern void tcp_rack_reo_timeout(struct sock *sk); /* diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2d84483de2e1..5485204853d3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1214,8 +1214,7 @@ static u8 tcp_sacktag_one(struct sock *sk, return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { - tcp_rack_advance(tp, sacked, end_seq, - xmit_time, &state->ack_time); + tcp_rack_advance(tp, sacked, end_seq, xmit_time); if (sacked & TCPCB_SACKED_RETRANS) { /* If the segment is not tagged as lost, @@ -3118,8 +3117,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->delivered += acked_pcount; if (!tcp_skb_spurious_retrans(tp, skb)) tcp_rack_advance(tp, sacked, scb->end_seq, - &skb->skb_mstamp, - &sack->ack_time); + &skb->skb_mstamp); } if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index 6ca8b5d9d803..cd72b3d3879e 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -127,8 +127,7 @@ void tcp_rack_mark_lost(struct sock *sk) * draft-cheng-tcpm-rack-00.txt */ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, - const struct skb_mstamp *xmit_time, - const struct skb_mstamp *ack_time) + const struct skb_mstamp *xmit_time) { u32 rtt_us; @@ -137,7 +136,7 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, end_seq, tp->rack.end_seq)) return; - rtt_us = skb_mstamp_us_delta(ack_time, xmit_time); + rtt_us = skb_mstamp_us_delta(&tp->tcp_mstamp, xmit_time); if (sacked & TCPCB_RETRANS) { /* If the sacked packet was retransmitted, it's ambiguous * whether the retransmission or the original (or the prior From 7e0ca8a4c19c5fbaa802ffd609f5f9ab9249af5d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Apr 2017 10:15:39 -0700 Subject: [PATCH 08/10] tcp: use tp->tcp_mstamp in tcp_clean_rtx_queue() Following patch will remove ack_time from struct tcp_sacktag_state Same info is now found in tp->tcp_mstamp Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5485204853d3..f4e1836c696c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3056,8 +3056,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, { const struct inet_connection_sock *icsk = inet_csk(sk); struct skb_mstamp first_ackt, last_ackt; - struct skb_mstamp *now = &sack->ack_time; struct tcp_sock *tp = tcp_sk(sk); + struct skb_mstamp *now = &tp->tcp_mstamp; u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; bool fully_acked = true; From a6db50b81e3f20b2b692bbddd35d9484057eae9d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Apr 2017 10:15:40 -0700 Subject: [PATCH 09/10] tcp: remove ack_time from struct tcp_sacktag_state It is no longer needed, everything uses tp->tcp_mstamp instead. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f4e1836c696c..f475f0b53bfe 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1131,7 +1131,6 @@ struct tcp_sacktag_state { */ struct skb_mstamp first_sackt; struct skb_mstamp last_sackt; - struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */ struct rate_sample *rate; int flag; }; @@ -3572,8 +3571,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; - skb_mstamp_get(&sack_state.ack_time); - if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); @@ -3684,7 +3681,6 @@ old_ack: * If data was DSACKed, see if we can undo a cwnd reduction. */ if (TCP_SKB_CB(skb)->sacked) { - skb_mstamp_get(&sack_state.ack_time); flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); From 645f4c6f2ebd040688cc2a5f626ffc909e66ccf2 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 25 Apr 2017 10:15:41 -0700 Subject: [PATCH 10/10] tcp: switch rcv_rtt_est and rcvq_space to high resolution timestamps Some devices or distributions use HZ=100 or HZ=250 TCP receive buffer autotuning has poor behavior caused by this choice. Since autotuning happens after 4 ms or 10 ms, short distance flows get their receive buffer tuned to a very high value, but after an initial period where it was frozen to (too small) initial value. With tp->tcp_mstamp introduction, we can switch to high resolution timestamps almost for free (at the expense of 8 additional bytes per TCP structure) Note that some TCP stacks use usec TCP timestamps where this patch makes even more sense : Many TCP flows have < 500 usec RTT. Hopefully this finer TS option can be standardized soon. Tested: HZ=100 kernel ./netperf -H lpaa24 -t TCP_RR -l 1000 -- -r 10000,10000 & Peer without patch : lpaa24:~# ss -tmi dst lpaa23 ... skmem:(r0,rb8388608,...) rcv_rtt:10 rcv_space:3210000 minrtt:0.017 Peer with the patch : lpaa23:~# ss -tmi dst lpaa24 ... skmem:(r0,rb428800,...) rcv_rtt:0.069 rcv_space:30000 minrtt:0.017 We can see saner RCVBUF, and more precise rcv_rtt information. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/linux/tcp.h | 12 ++++++------ net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 28 +++++++++++++++++----------- 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 99a22f44c32e..b6d5adcee8fc 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -333,16 +333,16 @@ struct tcp_sock { /* Receiver side RTT estimation */ struct { - u32 rtt; - u32 seq; - u32 time; + u32 rtt_us; + u32 seq; + struct skb_mstamp time; } rcv_rtt_est; /* Receiver queue space */ struct { - int space; - u32 seq; - u32 time; + int space; + u32 seq; + struct skb_mstamp time; } rcvq_space; /* TCP-specific MTU probe information. */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index efc976ae66ae..059dad7deefe 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2853,7 +2853,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_snd_ssthresh = tp->snd_ssthresh; info->tcpi_advmss = tp->advmss; - info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3; + info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3; info->tcpi_rcv_space = tp->rcvq_space.space; info->tcpi_total_retrans = tp->total_retrans; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f475f0b53bfe..9739962bfb3f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -442,7 +442,8 @@ void tcp_init_buffer_space(struct sock *sk) tcp_sndbuf_expand(sk); tp->rcvq_space.space = tp->rcv_wnd; - tp->rcvq_space.time = tcp_time_stamp; + skb_mstamp_get(&tp->tcp_mstamp); + tp->rcvq_space.time = tp->tcp_mstamp; tp->rcvq_space.seq = tp->copied_seq; maxwin = tcp_full_space(sk); @@ -518,7 +519,7 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss); */ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) { - u32 new_sample = tp->rcv_rtt_est.rtt; + u32 new_sample = tp->rcv_rtt_est.rtt_us; long m = sample; if (m == 0) @@ -548,21 +549,23 @@ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) new_sample = m << 3; } - if (tp->rcv_rtt_est.rtt != new_sample) - tp->rcv_rtt_est.rtt = new_sample; + tp->rcv_rtt_est.rtt_us = new_sample; } static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) { - if (tp->rcv_rtt_est.time == 0) + u32 delta_us; + + if (tp->rcv_rtt_est.time.v64 == 0) goto new_measure; if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; - tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1); + delta_us = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcv_rtt_est.time); + tcp_rcv_rtt_update(tp, delta_us, 1); new_measure: tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; - tp->rcv_rtt_est.time = tcp_time_stamp; + tp->rcv_rtt_est.time = tp->tcp_mstamp; } static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, @@ -572,7 +575,10 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, if (tp->rx_opt.rcv_tsecr && (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) - tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); + tcp_rcv_rtt_update(tp, + jiffies_to_usecs(tcp_time_stamp - + tp->rx_opt.rcv_tsecr), + 0); } /* @@ -585,8 +591,8 @@ void tcp_rcv_space_adjust(struct sock *sk) int time; int copied; - time = tcp_time_stamp - tp->rcvq_space.time; - if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) + time = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcvq_space.time); + if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) return; /* Number of bytes copied to user in last RTT */ @@ -642,7 +648,7 @@ void tcp_rcv_space_adjust(struct sock *sk) new_measure: tp->rcvq_space.seq = tp->copied_seq; - tp->rcvq_space.time = tcp_time_stamp; + tp->rcvq_space.time = tp->tcp_mstamp; } /* There is something which you must keep in mind when you analyze the