From 1a24e04e4b50939daa3041682b38b82c896ca438 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 12:39:25 -0700 Subject: [PATCH 1/6] net: fix sk_mem_reclaim_partial() sk_mem_reclaim_partial() goal is to ensure each socket has one SK_MEM_QUANTUM forward allocation. This is needed both for performance and better handling of memory pressure situations in follow up patches. SK_MEM_QUANTUM is currently a page, but might be reduced to 4096 bytes as some arches have 64KB pages. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 6 +++--- net/core/sock.c | 9 +++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index d882f4c8e438..4581a60636f8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1368,7 +1368,7 @@ static inline struct inode *SOCK_INODE(struct socket *socket) * Functions for memory accounting */ int __sk_mem_schedule(struct sock *sk, int size, int kind); -void __sk_mem_reclaim(struct sock *sk); +void __sk_mem_reclaim(struct sock *sk, int amount); #define SK_MEM_QUANTUM ((int)PAGE_SIZE) #define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM) @@ -1409,7 +1409,7 @@ static inline void sk_mem_reclaim(struct sock *sk) if (!sk_has_account(sk)) return; if (sk->sk_forward_alloc >= SK_MEM_QUANTUM) - __sk_mem_reclaim(sk); + __sk_mem_reclaim(sk, sk->sk_forward_alloc); } static inline void sk_mem_reclaim_partial(struct sock *sk) @@ -1417,7 +1417,7 @@ static inline void sk_mem_reclaim_partial(struct sock *sk) if (!sk_has_account(sk)) return; if (sk->sk_forward_alloc > SK_MEM_QUANTUM) - __sk_mem_reclaim(sk); + __sk_mem_reclaim(sk, sk->sk_forward_alloc - 1); } static inline void sk_mem_charge(struct sock *sk, int size) diff --git a/net/core/sock.c b/net/core/sock.c index c18738a795b0..29124fcdc42a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2069,12 +2069,13 @@ EXPORT_SYMBOL(__sk_mem_schedule); /** * __sk_reclaim - reclaim memory_allocated * @sk: socket + * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) */ -void __sk_mem_reclaim(struct sock *sk) +void __sk_mem_reclaim(struct sock *sk, int amount) { - sk_memory_allocated_sub(sk, - sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT); - sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; + amount >>= SK_MEM_QUANTUM_SHIFT; + sk_memory_allocated_sub(sk, amount); + sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; if (sk_under_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) From a6c5ea4ccf0033591e6e476d7a273c0074c07aa7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 12:39:26 -0700 Subject: [PATCH 2/6] tcp: rename sk_forced_wmem_schedule() to sk_forced_mem_schedule() We plan to use sk_forced_wmem_schedule() in input path as well, so make it non static and rename it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/tcp_output.c | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 7ace6acbf5fd..841691a296dc 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -311,6 +311,8 @@ static inline bool tcp_out_of_memory(struct sock *sk) return false; } +void sk_forced_mem_schedule(struct sock *sk, int size); + static inline bool tcp_too_many_orphans(struct sock *sk, int shift) { struct percpu_counter *ocp = sk->sk_prot->orphan_count; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 7386d32cd670..bac1a950d087 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2816,8 +2816,10 @@ begin_fwd: * connection tear down and (memory) recovery. * Otherwise tcp_send_fin() could be tempted to either delay FIN * or even be forced to close flow without any FIN. + * In general, we want to allow one skb per socket to avoid hangs + * with edge trigger epoll() */ -static void sk_forced_wmem_schedule(struct sock *sk, int size) +void sk_forced_mem_schedule(struct sock *sk, int size) { int amt, status; @@ -2864,7 +2866,7 @@ coalesce: return; } skb_reserve(skb, MAX_TCP_HEADER); - sk_forced_wmem_schedule(sk, skb->truesize); + sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ tcp_init_nondata_skb(skb, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); From b8da51ebb1aa93908350f95efae73aecbc2e266c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 12:39:27 -0700 Subject: [PATCH 3/6] tcp: introduce tcp_under_memory_pressure() Introduce an optimized version of sk_under_memory_pressure() for TCP. Our intent is to use it in fast paths. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 8 ++++++++ net/ipv4/tcp_input.c | 8 ++++---- net/ipv4/tcp_output.c | 4 ++-- net/ipv4/tcp_timer.c | 2 +- 4 files changed, 15 insertions(+), 7 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 841691a296dc..0d85223efa4c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -286,6 +286,14 @@ extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; extern int tcp_memory_pressure; +/* optimized version of sk_under_memory_pressure() for TCP sockets */ +static inline bool tcp_under_memory_pressure(const struct sock *sk) +{ + if (mem_cgroup_sockets_enabled && sk->sk_cgrp) + return !!sk->sk_cgrp->memory_pressure; + + return tcp_memory_pressure; +} /* * The next routines deal with comparing 32 bit unsigned ints * and worry about wraparound (automatic with unsigned arithmetic). diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cf8b20ff6658..093779f7e893 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -359,7 +359,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && (int)tp->rcv_ssthresh < tcp_space(sk) && - !sk_under_memory_pressure(sk)) { + !tcp_under_memory_pressure(sk)) { int incr; /* Check #2. Increase window, if skb with such overhead @@ -446,7 +446,7 @@ static void tcp_clamp_window(struct sock *sk) if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && - !sk_under_memory_pressure(sk) && + !tcp_under_memory_pressure(sk) && sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) { sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), sysctl_tcp_rmem[2]); @@ -4781,7 +4781,7 @@ static int tcp_prune_queue(struct sock *sk) if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) tcp_clamp_window(sk); - else if (sk_under_memory_pressure(sk)) + else if (tcp_under_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); tcp_collapse_ofo_queue(sk); @@ -4825,7 +4825,7 @@ static bool tcp_should_expand_sndbuf(const struct sock *sk) return false; /* If we are under global TCP memory pressure, do not expand. */ - if (sk_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) return false; /* If we are under soft global TCP memory pressure, do not expand. */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index bac1a950d087..08c2cc40b26d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2392,7 +2392,7 @@ u32 __tcp_select_window(struct sock *sk) if (free_space < (full_space >> 1)) { icsk->icsk_ack.quick = 0; - if (sk_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); @@ -2843,7 +2843,7 @@ void tcp_send_fin(struct sock *sk) * Note: in the latter case, FIN packet will be sent after a timeout, * as TCP stack thinks it has already been transmitted. */ - if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) { + if (tskb && (tcp_send_head(sk) || tcp_under_memory_pressure(sk))) { coalesce: TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN; TCP_SKB_CB(tskb)->end_seq++; diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 65bf670e8714..5b752f58a900 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -247,7 +247,7 @@ void tcp_delack_timer_handler(struct sock *sk) } out: - if (sk_under_memory_pressure(sk)) + if (tcp_under_memory_pressure(sk)) sk_mem_reclaim(sk); } From 8e4d980ac21596a9b91d8e720c77ad081975a0a8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 12:39:28 -0700 Subject: [PATCH 4/6] tcp: fix behavior for epoll edge trigger Under memory pressure, tcp_sendmsg() can fail to queue a packet while no packet is present in write queue. If we return -EAGAIN with no packet in write queue, no ACK packet will ever come to raise EPOLLOUT. We need to allow one skb per TCP socket, and make sure that tcp sockets can release their forward allocations under pressure. This is a followup to commit 790ba4566c1a ("tcp: set SOCK_NOSPACE under memory pressure") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ecccfdc50d76..9eabfd3e0925 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -815,9 +815,20 @@ struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) /* The TCP header must be at least 32-bit aligned. */ size = ALIGN(size, 4); + if (unlikely(tcp_under_memory_pressure(sk))) + sk_mem_reclaim_partial(sk); + skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); - if (skb) { - if (sk_wmem_schedule(sk, skb->truesize)) { + if (likely(skb)) { + bool mem_schedule; + + if (skb_queue_len(&sk->sk_write_queue) == 0) { + mem_schedule = true; + sk_forced_mem_schedule(sk, skb->truesize); + } else { + mem_schedule = sk_wmem_schedule(sk, skb->truesize); + } + if (likely(mem_schedule)) { skb_reserve(skb, sk->sk_prot->max_header); /* * Make sure that we have exactly size bytes From 76dfa6082032b5c179864816fa508879421678eb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 12:39:29 -0700 Subject: [PATCH 5/6] tcp: allow one skb to be received per socket under memory pressure While testing tight tcp_mem settings, I found tcp sessions could be stuck because we do not allow even one skb to be received on them. By allowing one skb to be received, we introduce fairness and eventuallu force memory hogs to release their allocation. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 093779f7e893..40c435997e54 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4507,10 +4507,12 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (eaten <= 0) { queue_and_out: - if (eaten < 0 && - tcp_try_rmem_schedule(sk, skb, skb->truesize)) - goto drop; - + if (eaten < 0) { + if (skb_queue_len(&sk->sk_receive_queue) == 0) + sk_forced_mem_schedule(sk, skb->truesize); + else if (tcp_try_rmem_schedule(sk, skb, skb->truesize)) + goto drop; + } eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); } tcp_rcv_nxt_update(tp, TCP_SKB_CB(skb)->end_seq); From b66e91ccbc34ebd5a2f90f9e1bc1597e2924a500 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 May 2015 12:39:30 -0700 Subject: [PATCH 6/6] tcp: halves tcp_mem[] limits Allowing tcp to use ~19% of physical memory is way too much, and allowed bugs to be hidden. Add to this that some drivers use a full page per incoming frame, so real cost can be twice the advertized one. Reduce tcp_mem by 50 % as a first step to sanity. tcp_mem[0,1,2] defaults are now 4.68%, 6.25%, 9.37% of physical memory. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9eabfd3e0925..c724195e5862 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3068,11 +3068,12 @@ __setup("thash_entries=", set_thash_entries); static void __init tcp_init_mem(void) { - unsigned long limit = nr_free_buffer_pages() / 8; + unsigned long limit = nr_free_buffer_pages() / 16; + limit = max(limit, 128UL); - sysctl_tcp_mem[0] = limit / 4 * 3; - sysctl_tcp_mem[1] = limit; - sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; + sysctl_tcp_mem[0] = limit / 4 * 3; /* 4.68 % */ + sysctl_tcp_mem[1] = limit; /* 6.25 % */ + sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; /* 9.37 % */ } void __init tcp_init(void)