mirror of
https://github.com/torvalds/linux.git
synced 2024-12-03 17:41:22 +00:00
tcp: implement coalescing on backlog queue
In case GRO is not as efficient as it should be, or is disabled, we might have a user thread trapped in __release_sock() while the softirq handler floods packets up to the point where we have to drop. This patch balances the work done from the user thread and softirq, to give __release_sock() more chances to complete its work before new packets are added to the backlog. This also helps if we receive many ACK packets, since GRO does not aggregate them. This patch brings a ~60% throughput increase on a receiver without GRO, but the spectacular gain is really the 1000x release_sock() latency reduction I have measured. Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Neal Cardwell <ncardwell@google.com> Cc: Yuchung Cheng <ycheng@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
parent
85bdf7db5b
commit
4f693b55c3
@ -243,6 +243,7 @@ enum
|
|||||||
LINUX_MIB_TCPREQQFULLDROP, /* TCPReqQFullDrop */
|
LINUX_MIB_TCPREQQFULLDROP, /* TCPReqQFullDrop */
|
||||||
LINUX_MIB_TCPRETRANSFAIL, /* TCPRetransFail */
|
LINUX_MIB_TCPRETRANSFAIL, /* TCPRetransFail */
|
||||||
LINUX_MIB_TCPRCVCOALESCE, /* TCPRcvCoalesce */
|
LINUX_MIB_TCPRCVCOALESCE, /* TCPRcvCoalesce */
|
||||||
|
LINUX_MIB_TCPBACKLOGCOALESCE, /* TCPBacklogCoalesce */
|
||||||
LINUX_MIB_TCPOFOQUEUE, /* TCPOFOQueue */
|
LINUX_MIB_TCPOFOQUEUE, /* TCPOFOQueue */
|
||||||
LINUX_MIB_TCPOFODROP, /* TCPOFODrop */
|
LINUX_MIB_TCPOFODROP, /* TCPOFODrop */
|
||||||
LINUX_MIB_TCPOFOMERGE, /* TCPOFOMerge */
|
LINUX_MIB_TCPOFOMERGE, /* TCPOFOMerge */
|
||||||
|
@ -219,6 +219,7 @@ static const struct snmp_mib snmp4_net_list[] = {
|
|||||||
SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
|
SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL),
|
||||||
SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
|
SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL),
|
||||||
SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
|
SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED),
|
||||||
|
SNMP_MIB_ITEM("TCPBacklogCoalesce", LINUX_MIB_TCPBACKLOGCOALESCE),
|
||||||
SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
|
SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT),
|
||||||
SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
|
SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT),
|
||||||
SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
|
SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV),
|
||||||
|
@ -1619,12 +1619,14 @@ int tcp_v4_early_demux(struct sk_buff *skb)
|
|||||||
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
|
bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
|
||||||
{
|
{
|
||||||
u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
|
u32 limit = sk->sk_rcvbuf + sk->sk_sndbuf;
|
||||||
|
struct skb_shared_info *shinfo;
|
||||||
/* Only socket owner can try to collapse/prune rx queues
|
const struct tcphdr *th;
|
||||||
* to reduce memory overhead, so add a little headroom here.
|
struct tcphdr *thtail;
|
||||||
* Few sockets backlog are possibly concurrently non empty.
|
struct sk_buff *tail;
|
||||||
*/
|
unsigned int hdrlen;
|
||||||
limit += 64*1024;
|
bool fragstolen;
|
||||||
|
u32 gso_segs;
|
||||||
|
int delta;
|
||||||
|
|
||||||
/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
|
/* In case all data was pulled from skb frags (in __pskb_pull_tail()),
|
||||||
* we can fix skb->truesize to its real value to avoid future drops.
|
* we can fix skb->truesize to its real value to avoid future drops.
|
||||||
@ -1636,6 +1638,84 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb)
|
|||||||
|
|
||||||
skb_dst_drop(skb);
|
skb_dst_drop(skb);
|
||||||
|
|
||||||
|
if (unlikely(tcp_checksum_complete(skb))) {
|
||||||
|
bh_unlock_sock(sk);
|
||||||
|
__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
|
||||||
|
__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Attempt coalescing to last skb in backlog, even if we are
|
||||||
|
* above the limits.
|
||||||
|
* This is okay because skb capacity is limited to MAX_SKB_FRAGS.
|
||||||
|
*/
|
||||||
|
th = (const struct tcphdr *)skb->data;
|
||||||
|
hdrlen = th->doff * 4;
|
||||||
|
shinfo = skb_shinfo(skb);
|
||||||
|
|
||||||
|
if (!shinfo->gso_size)
|
||||||
|
shinfo->gso_size = skb->len - hdrlen;
|
||||||
|
|
||||||
|
if (!shinfo->gso_segs)
|
||||||
|
shinfo->gso_segs = 1;
|
||||||
|
|
||||||
|
tail = sk->sk_backlog.tail;
|
||||||
|
if (!tail)
|
||||||
|
goto no_coalesce;
|
||||||
|
thtail = (struct tcphdr *)tail->data;
|
||||||
|
|
||||||
|
if (TCP_SKB_CB(tail)->end_seq != TCP_SKB_CB(skb)->seq ||
|
||||||
|
TCP_SKB_CB(tail)->ip_dsfield != TCP_SKB_CB(skb)->ip_dsfield ||
|
||||||
|
((TCP_SKB_CB(tail)->tcp_flags |
|
||||||
|
TCP_SKB_CB(skb)->tcp_flags) & TCPHDR_URG) ||
|
||||||
|
((TCP_SKB_CB(tail)->tcp_flags ^
|
||||||
|
TCP_SKB_CB(skb)->tcp_flags) & (TCPHDR_ECE | TCPHDR_CWR)) ||
|
||||||
|
#ifdef CONFIG_TLS_DEVICE
|
||||||
|
tail->decrypted != skb->decrypted ||
|
||||||
|
#endif
|
||||||
|
thtail->doff != th->doff ||
|
||||||
|
memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
|
||||||
|
goto no_coalesce;
|
||||||
|
|
||||||
|
__skb_pull(skb, hdrlen);
|
||||||
|
if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) {
|
||||||
|
thtail->window = th->window;
|
||||||
|
|
||||||
|
TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq;
|
||||||
|
|
||||||
|
if (after(TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(tail)->ack_seq))
|
||||||
|
TCP_SKB_CB(tail)->ack_seq = TCP_SKB_CB(skb)->ack_seq;
|
||||||
|
|
||||||
|
TCP_SKB_CB(tail)->tcp_flags |= TCP_SKB_CB(skb)->tcp_flags;
|
||||||
|
|
||||||
|
if (TCP_SKB_CB(skb)->has_rxtstamp) {
|
||||||
|
TCP_SKB_CB(tail)->has_rxtstamp = true;
|
||||||
|
tail->tstamp = skb->tstamp;
|
||||||
|
skb_hwtstamps(tail)->hwtstamp = skb_hwtstamps(skb)->hwtstamp;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Not as strict as GRO. We only need to carry mss max value */
|
||||||
|
skb_shinfo(tail)->gso_size = max(shinfo->gso_size,
|
||||||
|
skb_shinfo(tail)->gso_size);
|
||||||
|
|
||||||
|
gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs;
|
||||||
|
skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
|
||||||
|
|
||||||
|
sk->sk_backlog.len += delta;
|
||||||
|
__NET_INC_STATS(sock_net(sk),
|
||||||
|
LINUX_MIB_TCPBACKLOGCOALESCE);
|
||||||
|
kfree_skb_partial(skb, fragstolen);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
__skb_push(skb, hdrlen);
|
||||||
|
|
||||||
|
no_coalesce:
|
||||||
|
/* Only socket owner can try to collapse/prune rx queues
|
||||||
|
* to reduce memory overhead, so add a little headroom here.
|
||||||
|
* Few sockets backlog are possibly concurrently non empty.
|
||||||
|
*/
|
||||||
|
limit += 64*1024;
|
||||||
|
|
||||||
if (unlikely(sk_add_backlog(sk, skb, limit))) {
|
if (unlikely(sk_add_backlog(sk, skb, limit))) {
|
||||||
bh_unlock_sock(sk);
|
bh_unlock_sock(sk);
|
||||||
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
|
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPBACKLOGDROP);
|
||||||
|
Loading…
Reference in New Issue
Block a user