forked from Minki/linux
Merge branch 'tcp-ns-rmem-wmem'
Eric Dumazet says:
====================
net: Namespace-ify sysctl_tcp_rmem and sysctl_tcp_wmem

We need to get per netns sysctl for sysctl_[proto]_rmem and sysctl_[proto]_wmem.

This patch series adds the basic infrastructure allowing per proto conversion, and takes care of TCP.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
commit
c7947e4339
@ -155,6 +155,8 @@ struct netns_ipv4 {
|
|||||||
int sysctl_tcp_invalid_ratelimit;
|
int sysctl_tcp_invalid_ratelimit;
|
||||||
int sysctl_tcp_pacing_ss_ratio;
|
int sysctl_tcp_pacing_ss_ratio;
|
||||||
int sysctl_tcp_pacing_ca_ratio;
|
int sysctl_tcp_pacing_ca_ratio;
|
||||||
|
int sysctl_tcp_wmem[3];
|
||||||
|
int sysctl_tcp_rmem[3];
|
||||||
struct inet_timewait_death_row tcp_death_row;
|
struct inet_timewait_death_row tcp_death_row;
|
||||||
int sysctl_max_syn_backlog;
|
int sysctl_max_syn_backlog;
|
||||||
int sysctl_tcp_fastopen;
|
int sysctl_tcp_fastopen;
|
||||||
|
@ -1101,8 +1101,12 @@ struct proto {
|
|||||||
*/
|
*/
|
||||||
unsigned long *memory_pressure;
|
unsigned long *memory_pressure;
|
||||||
long *sysctl_mem;
|
long *sysctl_mem;
|
||||||
|
|
||||||
int *sysctl_wmem;
|
int *sysctl_wmem;
|
||||||
int *sysctl_rmem;
|
int *sysctl_rmem;
|
||||||
|
u32 sysctl_wmem_offset;
|
||||||
|
u32 sysctl_rmem_offset;
|
||||||
|
|
||||||
int max_header;
|
int max_header;
|
||||||
bool no_autobind;
|
bool no_autobind;
|
||||||
|
|
||||||
@ -2390,4 +2394,22 @@ extern int sysctl_optmem_max;
|
|||||||
extern __u32 sysctl_wmem_default;
|
extern __u32 sysctl_wmem_default;
|
||||||
extern __u32 sysctl_rmem_default;
|
extern __u32 sysctl_rmem_default;
|
||||||
|
|
||||||
|
/*
 * sk_get_wmem0 - return the first sysctl_wmem value that applies to @sk.
 * @sk:    socket; only sock_net(sk) is consulted
 * @proto: protocol descriptor supplying sysctl_wmem_offset and sysctl_wmem
 *
 * A non-zero proto->sysctl_wmem_offset is used as a byte offset added to
 * the pointer returned by sock_net(sk), and the int stored at that address
 * is returned. When the offset is zero, the int that proto->sysctl_wmem
 * points to is returned instead.
 */
static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto)
|
||||||
|
{
|
||||||
|
/* Does this proto have per netns sysctl_wmem ? */
|
||||||
|
if (proto->sysctl_wmem_offset)
|
||||||
|
return *(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset);
|
||||||
|
|
||||||
|
/* Offset is zero: fall back to the value behind proto->sysctl_wmem. */
return *proto->sysctl_wmem;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * sk_get_rmem0 - return the first sysctl_rmem value that applies to @sk.
 * @sk:    socket; only sock_net(sk) is consulted
 * @proto: protocol descriptor supplying sysctl_rmem_offset and sysctl_rmem
 *
 * A non-zero proto->sysctl_rmem_offset is used as a byte offset added to
 * the pointer returned by sock_net(sk), and the int stored at that address
 * is returned. When the offset is zero, the int that proto->sysctl_rmem
 * points to is returned instead.
 */
static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto)
|
||||||
|
{
|
||||||
|
/* Does this proto have per netns sysctl_rmem ? */
|
||||||
|
if (proto->sysctl_rmem_offset)
|
||||||
|
return *(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset);
|
||||||
|
|
||||||
|
/* Offset is zero: fall back to the value behind proto->sysctl_rmem. */
return *proto->sysctl_rmem;
|
||||||
|
}
|
||||||
|
|
||||||
#endif /* _SOCK_H */
|
#endif /* _SOCK_H */
|
||||||
|
@ -242,8 +242,6 @@ void tcp_time_wait(struct sock *sk, int state, int timeo);
|
|||||||
/* sysctl variables for tcp */
|
/* sysctl variables for tcp */
|
||||||
extern int sysctl_tcp_max_orphans;
|
extern int sysctl_tcp_max_orphans;
|
||||||
extern long sysctl_tcp_mem[3];
|
extern long sysctl_tcp_mem[3];
|
||||||
extern int sysctl_tcp_wmem[3];
|
|
||||||
extern int sysctl_tcp_rmem[3];
|
|
||||||
|
|
||||||
#define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */
|
#define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */
|
||||||
#define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */
|
#define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */
|
||||||
|
@ -48,7 +48,7 @@ TRACE_EVENT(sock_exceed_buf_limit,
|
|||||||
strncpy(__entry->name, prot->name, 32);
|
strncpy(__entry->name, prot->name, 32);
|
||||||
__entry->sysctl_mem = prot->sysctl_mem;
|
__entry->sysctl_mem = prot->sysctl_mem;
|
||||||
__entry->allocated = allocated;
|
__entry->allocated = allocated;
|
||||||
__entry->sysctl_rmem = prot->sysctl_rmem[0];
|
__entry->sysctl_rmem = sk_get_rmem0(sk, prot);
|
||||||
__entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
|
__entry->rmem_alloc = atomic_read(&sk->sk_rmem_alloc);
|
||||||
),
|
),
|
||||||
|
|
||||||
|
@ -2346,16 +2346,18 @@ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
|
|||||||
|
|
||||||
/* guarantee minimum buffer size under pressure */
|
/* guarantee minimum buffer size under pressure */
|
||||||
if (kind == SK_MEM_RECV) {
|
if (kind == SK_MEM_RECV) {
|
||||||
if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
|
if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
|
||||||
return 1;
|
return 1;
|
||||||
|
|
||||||
} else { /* SK_MEM_SEND */
|
} else { /* SK_MEM_SEND */
|
||||||
|
int wmem0 = sk_get_wmem0(sk, prot);
|
||||||
|
|
||||||
if (sk->sk_type == SOCK_STREAM) {
|
if (sk->sk_type == SOCK_STREAM) {
|
||||||
if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
|
if (sk->sk_wmem_queued < wmem0)
|
||||||
return 1;
|
return 1;
|
||||||
} else if (refcount_read(&sk->sk_wmem_alloc) <
|
} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
|
||||||
prot->sysctl_wmem[0])
|
|
||||||
return 1;
|
return 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (sk_has_memory_pressure(sk)) {
|
if (sk_has_memory_pressure(sk)) {
|
||||||
|
@ -440,22 +440,6 @@ static struct ctl_table ipv4_table[] = {
|
|||||||
.mode = 0644,
|
.mode = 0644,
|
||||||
.proc_handler = proc_doulongvec_minmax,
|
.proc_handler = proc_doulongvec_minmax,
|
||||||
},
|
},
|
||||||
{
|
|
||||||
.procname = "tcp_wmem",
|
|
||||||
.data = &sysctl_tcp_wmem,
|
|
||||||
.maxlen = sizeof(sysctl_tcp_wmem),
|
|
||||||
.mode = 0644,
|
|
||||||
.proc_handler = proc_dointvec_minmax,
|
|
||||||
.extra1 = &one,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
.procname = "tcp_rmem",
|
|
||||||
.data = &sysctl_tcp_rmem,
|
|
||||||
.maxlen = sizeof(sysctl_tcp_rmem),
|
|
||||||
.mode = 0644,
|
|
||||||
.proc_handler = proc_dointvec_minmax,
|
|
||||||
.extra1 = &one,
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
.procname = "tcp_low_latency",
|
.procname = "tcp_low_latency",
|
||||||
.data = &sysctl_tcp_low_latency,
|
.data = &sysctl_tcp_low_latency,
|
||||||
@ -1164,6 +1148,22 @@ static struct ctl_table ipv4_net_table[] = {
|
|||||||
.extra1 = &zero,
|
.extra1 = &zero,
|
||||||
.extra2 = &thousand,
|
.extra2 = &thousand,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
.procname = "tcp_wmem",
|
||||||
|
.data = &init_net.ipv4.sysctl_tcp_wmem,
|
||||||
|
.maxlen = sizeof(init_net.ipv4.sysctl_tcp_wmem),
|
||||||
|
.mode = 0644,
|
||||||
|
.proc_handler = proc_dointvec_minmax,
|
||||||
|
.extra1 = &one,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
.procname = "tcp_rmem",
|
||||||
|
.data = &init_net.ipv4.sysctl_tcp_rmem,
|
||||||
|
.maxlen = sizeof(init_net.ipv4.sysctl_tcp_rmem),
|
||||||
|
.mode = 0644,
|
||||||
|
.proc_handler = proc_dointvec_minmax,
|
||||||
|
.extra1 = &one,
|
||||||
|
},
|
||||||
{ }
|
{ }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -289,12 +289,7 @@ struct percpu_counter tcp_orphan_count;
|
|||||||
EXPORT_SYMBOL_GPL(tcp_orphan_count);
|
EXPORT_SYMBOL_GPL(tcp_orphan_count);
|
||||||
|
|
||||||
long sysctl_tcp_mem[3] __read_mostly;
|
long sysctl_tcp_mem[3] __read_mostly;
|
||||||
int sysctl_tcp_wmem[3] __read_mostly;
|
|
||||||
int sysctl_tcp_rmem[3] __read_mostly;
|
|
||||||
|
|
||||||
EXPORT_SYMBOL(sysctl_tcp_mem);
|
EXPORT_SYMBOL(sysctl_tcp_mem);
|
||||||
EXPORT_SYMBOL(sysctl_tcp_rmem);
|
|
||||||
EXPORT_SYMBOL(sysctl_tcp_wmem);
|
|
||||||
|
|
||||||
atomic_long_t tcp_memory_allocated; /* Current allocated memory. */
|
atomic_long_t tcp_memory_allocated; /* Current allocated memory. */
|
||||||
EXPORT_SYMBOL(tcp_memory_allocated);
|
EXPORT_SYMBOL(tcp_memory_allocated);
|
||||||
@ -456,8 +451,8 @@ void tcp_init_sock(struct sock *sk)
|
|||||||
|
|
||||||
icsk->icsk_sync_mss = tcp_sync_mss;
|
icsk->icsk_sync_mss = tcp_sync_mss;
|
||||||
|
|
||||||
sk->sk_sndbuf = sysctl_tcp_wmem[1];
|
sk->sk_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[1];
|
||||||
sk->sk_rcvbuf = sysctl_tcp_rmem[1];
|
sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
|
||||||
|
|
||||||
sk_sockets_allocated_inc(sk);
|
sk_sockets_allocated_inc(sk);
|
||||||
}
|
}
|
||||||
@ -3636,13 +3631,13 @@ void __init tcp_init(void)
|
|||||||
max_wshare = min(4UL*1024*1024, limit);
|
max_wshare = min(4UL*1024*1024, limit);
|
||||||
max_rshare = min(6UL*1024*1024, limit);
|
max_rshare = min(6UL*1024*1024, limit);
|
||||||
|
|
||||||
sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
|
init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
|
||||||
sysctl_tcp_wmem[1] = 16*1024;
|
init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
|
||||||
sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
|
init_net.ipv4.sysctl_tcp_wmem[2] = max(64*1024, max_wshare);
|
||||||
|
|
||||||
sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
|
init_net.ipv4.sysctl_tcp_rmem[0] = SK_MEM_QUANTUM;
|
||||||
sysctl_tcp_rmem[1] = 87380;
|
init_net.ipv4.sysctl_tcp_rmem[1] = 87380;
|
||||||
sysctl_tcp_rmem[2] = max(87380, max_rshare);
|
init_net.ipv4.sysctl_tcp_rmem[2] = max(87380, max_rshare);
|
||||||
|
|
||||||
pr_info("Hash tables configured (established %u bind %u)\n",
|
pr_info("Hash tables configured (established %u bind %u)\n",
|
||||||
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
|
tcp_hashinfo.ehash_mask + 1, tcp_hashinfo.bhash_size);
|
||||||
|
@ -320,7 +320,7 @@ static void tcp_sndbuf_expand(struct sock *sk)
|
|||||||
sndmem *= nr_segs * per_mss;
|
sndmem *= nr_segs * per_mss;
|
||||||
|
|
||||||
if (sk->sk_sndbuf < sndmem)
|
if (sk->sk_sndbuf < sndmem)
|
||||||
sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]);
|
sk->sk_sndbuf = min(sndmem, sock_net(sk)->ipv4.sysctl_tcp_wmem[2]);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
|
/* 2. Tuning advertised window (window_clamp, rcv_ssthresh)
|
||||||
@ -354,7 +354,7 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb)
|
|||||||
struct tcp_sock *tp = tcp_sk(sk);
|
struct tcp_sock *tp = tcp_sk(sk);
|
||||||
/* Optimize this! */
|
/* Optimize this! */
|
||||||
int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
|
int truesize = tcp_win_from_space(sk, skb->truesize) >> 1;
|
||||||
int window = tcp_win_from_space(sk, sysctl_tcp_rmem[2]) >> 1;
|
int window = tcp_win_from_space(sk, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]) >> 1;
|
||||||
|
|
||||||
while (tp->rcv_ssthresh <= window) {
|
while (tp->rcv_ssthresh <= window) {
|
||||||
if (truesize <= skb->len)
|
if (truesize <= skb->len)
|
||||||
@ -409,7 +409,7 @@ static void tcp_fixup_rcvbuf(struct sock *sk)
|
|||||||
rcvmem <<= 2;
|
rcvmem <<= 2;
|
||||||
|
|
||||||
if (sk->sk_rcvbuf < rcvmem)
|
if (sk->sk_rcvbuf < rcvmem)
|
||||||
sk->sk_rcvbuf = min(rcvmem, sysctl_tcp_rmem[2]);
|
sk->sk_rcvbuf = min(rcvmem, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* 4. Try to fixup all. It is made immediately after connection enters
|
/* 4. Try to fixup all. It is made immediately after connection enters
|
||||||
@ -457,15 +457,16 @@ static void tcp_clamp_window(struct sock *sk)
|
|||||||
{
|
{
|
||||||
struct tcp_sock *tp = tcp_sk(sk);
|
struct tcp_sock *tp = tcp_sk(sk);
|
||||||
struct inet_connection_sock *icsk = inet_csk(sk);
|
struct inet_connection_sock *icsk = inet_csk(sk);
|
||||||
|
struct net *net = sock_net(sk);
|
||||||
|
|
||||||
icsk->icsk_ack.quick = 0;
|
icsk->icsk_ack.quick = 0;
|
||||||
|
|
||||||
if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] &&
|
if (sk->sk_rcvbuf < net->ipv4.sysctl_tcp_rmem[2] &&
|
||||||
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
|
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
|
||||||
!tcp_under_memory_pressure(sk) &&
|
!tcp_under_memory_pressure(sk) &&
|
||||||
sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
|
sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
|
||||||
sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
|
sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc),
|
||||||
sysctl_tcp_rmem[2]);
|
net->ipv4.sysctl_tcp_rmem[2]);
|
||||||
}
|
}
|
||||||
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
|
if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
|
||||||
tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
|
tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
|
||||||
@ -623,7 +624,8 @@ void tcp_rcv_space_adjust(struct sock *sk)
|
|||||||
while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
|
while (tcp_win_from_space(sk, rcvmem) < tp->advmss)
|
||||||
rcvmem += 128;
|
rcvmem += 128;
|
||||||
|
|
||||||
rcvbuf = min(rcvwin / tp->advmss * rcvmem, sysctl_tcp_rmem[2]);
|
rcvbuf = min(rcvwin / tp->advmss * rcvmem,
|
||||||
|
sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
|
||||||
if (rcvbuf > sk->sk_rcvbuf) {
|
if (rcvbuf > sk->sk_rcvbuf) {
|
||||||
sk->sk_rcvbuf = rcvbuf;
|
sk->sk_rcvbuf = rcvbuf;
|
||||||
|
|
||||||
|
@ -2409,8 +2409,8 @@ struct proto tcp_prot = {
|
|||||||
.memory_allocated = &tcp_memory_allocated,
|
.memory_allocated = &tcp_memory_allocated,
|
||||||
.memory_pressure = &tcp_memory_pressure,
|
.memory_pressure = &tcp_memory_pressure,
|
||||||
.sysctl_mem = sysctl_tcp_mem,
|
.sysctl_mem = sysctl_tcp_mem,
|
||||||
.sysctl_wmem = sysctl_tcp_wmem,
|
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
|
||||||
.sysctl_rmem = sysctl_tcp_rmem,
|
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
|
||||||
.max_header = MAX_TCP_HEADER,
|
.max_header = MAX_TCP_HEADER,
|
||||||
.obj_size = sizeof(struct tcp_sock),
|
.obj_size = sizeof(struct tcp_sock),
|
||||||
.slab_flags = SLAB_TYPESAFE_BY_RCU,
|
.slab_flags = SLAB_TYPESAFE_BY_RCU,
|
||||||
@ -2509,7 +2509,14 @@ static int __net_init tcp_sk_init(struct net *net)
|
|||||||
net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
|
net->ipv4.sysctl_tcp_invalid_ratelimit = HZ/2;
|
||||||
net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
|
net->ipv4.sysctl_tcp_pacing_ss_ratio = 200;
|
||||||
net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
|
net->ipv4.sysctl_tcp_pacing_ca_ratio = 120;
|
||||||
|
if (net != &init_net) {
|
||||||
|
memcpy(net->ipv4.sysctl_tcp_rmem,
|
||||||
|
init_net.ipv4.sysctl_tcp_rmem,
|
||||||
|
sizeof(init_net.ipv4.sysctl_tcp_rmem));
|
||||||
|
memcpy(net->ipv4.sysctl_tcp_wmem,
|
||||||
|
init_net.ipv4.sysctl_tcp_wmem,
|
||||||
|
sizeof(init_net.ipv4.sysctl_tcp_wmem));
|
||||||
|
}
|
||||||
net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
|
net->ipv4.sysctl_tcp_fastopen = TFO_CLIENT_ENABLE;
|
||||||
spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
|
spin_lock_init(&net->ipv4.tcp_fastopen_ctx_lock);
|
||||||
net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
|
net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
|
||||||
|
@ -220,7 +220,7 @@ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss,
|
|||||||
(*rcv_wscale) = 0;
|
(*rcv_wscale) = 0;
|
||||||
if (wscale_ok) {
|
if (wscale_ok) {
|
||||||
/* Set window scaling on max possible window */
|
/* Set window scaling on max possible window */
|
||||||
space = max_t(u32, space, sysctl_tcp_rmem[2]);
|
space = max_t(u32, space, sock_net(sk)->ipv4.sysctl_tcp_rmem[2]);
|
||||||
space = max_t(u32, space, sysctl_rmem_max);
|
space = max_t(u32, space, sysctl_rmem_max);
|
||||||
space = min_t(u32, space, *window_clamp);
|
space = min_t(u32, space, *window_clamp);
|
||||||
while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
|
while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) {
|
||||||
|
@ -1940,8 +1940,8 @@ struct proto tcpv6_prot = {
|
|||||||
.memory_pressure = &tcp_memory_pressure,
|
.memory_pressure = &tcp_memory_pressure,
|
||||||
.orphan_count = &tcp_orphan_count,
|
.orphan_count = &tcp_orphan_count,
|
||||||
.sysctl_mem = sysctl_tcp_mem,
|
.sysctl_mem = sysctl_tcp_mem,
|
||||||
.sysctl_wmem = sysctl_tcp_wmem,
|
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
|
||||||
.sysctl_rmem = sysctl_tcp_rmem,
|
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
|
||||||
.max_header = MAX_TCP_HEADER,
|
.max_header = MAX_TCP_HEADER,
|
||||||
.obj_size = sizeof(struct tcp6_sock),
|
.obj_size = sizeof(struct tcp6_sock),
|
||||||
.slab_flags = SLAB_TYPESAFE_BY_RCU,
|
.slab_flags = SLAB_TYPESAFE_BY_RCU,
|
||||||
|
Loading…
Reference in New Issue
Block a user