diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index a759872a2883..e7b3fa7bb3f7 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1040,6 +1040,35 @@ tcp_challenge_ack_limit - INTEGER TCP stack implements per TCP socket limits anyway. Default: INT_MAX (unlimited) +tcp_ehash_entries - INTEGER + Show the number of hash buckets for TCP sockets in the current + networking namespace. + + A negative value means the networking namespace does not own its + hash buckets and shares the initial networking namespace's one. + +tcp_child_ehash_entries - INTEGER + Control the number of hash buckets for TCP sockets in the child + networking namespace, which must be set before clone() or unshare(). + + If the value is not 0, the kernel uses a value rounded up to 2^n + as the actual hash bucket size. 0 is a special value, meaning + the child networking namespace will share the initial networking + namespace's hash buckets. + + Note that the child will use the global one in case the kernel + fails to allocate enough memory. In addition, the global hash + buckets are spread over available NUMA nodes, but the allocation + of the child hash table depends on the current process's NUMA + policy, which could result in performance differences. + + Note also that the default value of tcp_max_tw_buckets and + tcp_max_syn_backlog depend on the hash bucket size. + + Possible values: 0, 2^n (n: 0 - 24 (16Mi)) + + Default: 0 + UDP variables ============= diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 520dd894b73d..9121ccab1fa1 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -168,6 +168,8 @@ struct inet_hashinfo { /* The 2nd listener table hashed by local port and address */ unsigned int lhash2_mask; struct inet_listen_hashbucket *lhash2; + + bool pernet; }; static inline struct inet_hashinfo *tcp_or_dccp_get_hashinfo(const struct sock *sk) @@ -214,6 +216,10 @@ static inline void inet_ehash_locks_free(struct inet_hashinfo *hashinfo) hashinfo->ehash_locks = NULL; } +struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, + unsigned int ehash_entries); +void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo); + struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 2c7df93e3403..1b8004679445 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -171,6 +171,7 @@ struct netns_ipv4 { int sysctl_tcp_pacing_ca_ratio; int sysctl_tcp_wmem[3]; int sysctl_tcp_rmem[3]; + unsigned int sysctl_tcp_child_ehash_entries; unsigned long sysctl_tcp_comp_sack_delay_ns; unsigned long sysctl_tcp_comp_sack_slack_ns; int sysctl_max_syn_backlog; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 7cd4a6cc99fc..c548ca3e9b0e 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1197,6 +1197,8 @@ static int __init dccp_init(void) INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain); } + dccp_hashinfo.pernet = false; + rc = dccp_mib_init(); if (rc) goto out_free_dccp_bhash2; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index c440de998910..74e64aad5114 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -1145,3 +1145,50 @@ int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) return 0; } EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); + +struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, + unsigned int ehash_entries) +{ + struct inet_hashinfo *new_hashinfo; + int i; + + new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL); + if (!new_hashinfo) + goto err; + + new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket), + GFP_KERNEL_ACCOUNT); + if (!new_hashinfo->ehash) + goto free_hashinfo; + + new_hashinfo->ehash_mask = ehash_entries - 1; + + if (inet_ehash_locks_alloc(new_hashinfo)) + goto free_ehash; + + for (i = 0; i < ehash_entries; i++) + INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i); + + new_hashinfo->pernet = true; + + return new_hashinfo; + +free_ehash: + vfree(new_hashinfo->ehash); +free_hashinfo: + kfree(new_hashinfo); +err: + return NULL; +} +EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc); + +void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) +{ + if (!hashinfo->pernet) + return; + + inet_ehash_locks_free(hashinfo); + vfree(hashinfo->ehash); + kfree(hashinfo); +} +EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4d7c110c772f..9b8a6db7a66b 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -39,6 +39,7 @@ static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; static u32 fib_multipath_hash_fields_all_mask __maybe_unused = FIB_MULTIPATH_HASH_FIELD_ALL_MASK; +static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -382,6 +383,29 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl, return ret; } +static int proc_tcp_ehash_entries(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_tcp_child_ehash_entries); + struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; + int tcp_ehash_entries; + struct ctl_table tbl; + + tcp_ehash_entries = hinfo->ehash_mask + 1; + + /* A negative number indicates that the child netns + * shares the global ehash. + */ + if (!net_eq(net, &init_net) && !hinfo->pernet) + tcp_ehash_entries *= -1; + + tbl.data = &tcp_ehash_entries; + tbl.maxlen = sizeof(int); + + return proc_dointvec(&tbl, write, buffer, lenp, ppos); +} + #ifdef CONFIG_IP_ROUTE_MULTIPATH static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, void *buffer, size_t *lenp, @@ -1320,6 +1344,21 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, + { + .procname = "tcp_ehash_entries", + .data = &init_net.ipv4.sysctl_tcp_child_ehash_entries, + .mode = 0444, + .proc_handler = proc_tcp_ehash_entries, + }, + { + .procname = "tcp_child_ehash_entries", + .data = &init_net.ipv4.sysctl_tcp_child_ehash_entries, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &tcp_child_ehash_entries_max, + }, { .procname = "udp_rmem_min", .data = &init_net.ipv4.sysctl_udp_rmem_min, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 8230be00ecca..829beee3fa32 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4790,6 +4790,7 @@ void __init tcp_init(void) INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain); } + tcp_hashinfo.pernet = false; cnt = tcp_hashinfo.ehash_mask + 1; sysctl_tcp_max_orphans = cnt / 2; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 73e6854ea662..6376ad915765 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3110,10 +3110,38 @@ static void __net_exit tcp_sk_exit(struct net *net) net->ipv4.tcp_congestion_control->owner); } +static void __net_init tcp_set_hashinfo(struct net *net) +{ + struct inet_hashinfo *hinfo; + unsigned int ehash_entries; + struct net *old_net; + + if (net_eq(net, &init_net)) + goto fallback; + + old_net = current->nsproxy->net_ns; + ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); + if (!ehash_entries) + goto fallback; + + ehash_entries = roundup_pow_of_two(ehash_entries); + hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); + if (!hinfo) { + pr_warn("Failed to allocate TCP ehash (entries: %u) " + "for a netns, fallback to the global one\n", + ehash_entries); +fallback: + hinfo = &tcp_hashinfo; + ehash_entries = tcp_hashinfo.ehash_mask + 1; + } + + net->ipv4.tcp_death_row.hashinfo = hinfo; + net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; + net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); +} + static int __net_init tcp_sk_init(struct net *net) { - int cnt; - net->ipv4.sysctl_tcp_ecn = 2; net->ipv4.sysctl_tcp_ecn_fallback = 1; @@ -3140,11 +3168,8 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); - cnt = tcp_hashinfo.ehash_mask + 1; - net->ipv4.tcp_death_row.sysctl_max_tw_buckets = cnt / 2; - net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; + tcp_set_hashinfo(net); - net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); net->ipv4.sysctl_tcp_sack = 1; net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; @@ -3209,6 +3234,7 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) tcp_twsk_purge(net_exit_list, AF_INET); list_for_each_entry(net, net_exit_list, exit_list) { + inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); tcp_fastopen_ctx_destroy(net); } diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 98c576d4b671..442838ab0253 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -349,6 +349,7 @@ EXPORT_SYMBOL_GPL(tcp_twsk_destructor); void tcp_twsk_purge(struct list_head *net_exit_list, int family) { + bool purged_once = false; struct net *net; list_for_each_entry(net, net_exit_list, exit_list) { @@ -356,8 +357,12 @@ void tcp_twsk_purge(struct list_head *net_exit_list, int family) if (refcount_read(&net->ipv4.tcp_death_row.tw_refcount) == 1) continue; - inet_twsk_purge(&tcp_hashinfo, family); - break; + if (net->ipv4.tcp_death_row.hashinfo->pernet) { + inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo, family); + } else if (!purged_once) { + inet_twsk_purge(&tcp_hashinfo, family); + purged_once = true; + } } } EXPORT_SYMBOL_GPL(tcp_twsk_purge);