From 21a216a8fc630161e69eaf02cbebdd1816ff1a13 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Feb 2022 20:50:28 -0800 Subject: [PATCH 01/11] ipv6/addrconf: allocate a per netns hash table Add a per netns hash table and a dedicated spinlock, a first step toward getting rid of the global inet6_addr_lst[] table. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/net/netns/ipv6.h | 4 ++++ net/ipv6/addrconf.c | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 30cdfc4e1615..755f12001c8b 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -92,6 +92,10 @@ struct netns_ipv6 { struct sock *tcp_sk; struct sock *igmp_sk; struct sock *mc_autojoin_sk; + + struct hlist_head *inet6_addr_lst; + spinlock_t addrconf_hash_lock; + #ifdef CONFIG_IPV6_MROUTE #ifndef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES struct mr_table *mrt6; #endif diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ef23e7dc538a..cda9e59cab43 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -7111,6 +7111,13 @@ static int __net_init addrconf_init_net(struct net *net) int err = -ENOMEM; struct ipv6_devconf *all, *dflt; + spin_lock_init(&net->ipv6.addrconf_hash_lock); + net->ipv6.inet6_addr_lst = kcalloc(IN6_ADDR_HSIZE, + sizeof(struct hlist_head), + GFP_KERNEL); + if (!net->ipv6.inet6_addr_lst) + goto err_alloc_addr; + all = kmemdup(&ipv6_devconf, sizeof(ipv6_devconf), GFP_KERNEL); if (!all) goto err_alloc_all; @@ -7172,11 +7179,15 @@ err_reg_all: err_alloc_dflt: kfree(all); err_alloc_all: + kfree(net->ipv6.inet6_addr_lst); +err_alloc_addr: return err; } static void __net_exit addrconf_exit_net(struct net *net) { + int i; + #ifdef CONFIG_SYSCTL __addrconf_sysctl_unregister(net, net->ipv6.devconf_dflt, NETCONFA_IFINDEX_DEFAULT); @@ -7187,6 +7198,15 @@ static void __net_exit addrconf_exit_net(struct net *net) net->ipv6.devconf_dflt = NULL; kfree(net->ipv6.devconf_all); net->ipv6.devconf_all = NULL; + + /* + * Check that the hash table is empty, then free it. + */ + for (i = 0; i < IN6_ADDR_HSIZE; i++) + WARN_ON_ONCE(!hlist_empty(&net->ipv6.inet6_addr_lst[i])); + + kfree(net->ipv6.inet6_addr_lst); + net->ipv6.inet6_addr_lst = NULL; } static struct pernet_operations addrconf_ops = { From 8805d13ff1b2bef6a7bb8a005d2441763286dd7a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Feb 2022 20:50:29 -0800 Subject: [PATCH 02/11] ipv6/addrconf: use one delayed work per netns The next step toward using the per netns inet6_addr_lst is to have a per netns work item that ultimately calls addrconf_verify_rtnl() and addrconf_verify() with a new 'struct net *' argument. Everything is still using the global inet6_addr_lst[] table.
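A note for readers on the mechanism patch 2 sets up: because the delayed work item is embedded in struct net, the work callback can recover its owning netns from the work pointer alone, with no global state. Below is a minimal standalone C sketch of that container_of() idiom; the struct net / addr_chk_work definitions are illustrative mock-ups, not the kernel's.

/*
 * Userspace sketch of recovering the outer object from a pointer to
 * an embedded member, as addrconf_verify_work() does in patch 2.
 */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct delayed_work { int pending; };
struct netns_ipv6 { struct delayed_work addr_chk_work; };
struct net { int id; struct netns_ipv6 ipv6; };

static void addrconf_verify_work(struct delayed_work *w)
{
	/* Walk back from the embedded member to the enclosing net. */
	struct net *net = container_of(w, struct net, ipv6.addr_chk_work);

	printf("verifying addresses for netns %d\n", net->id);
}

int main(void)
{
	struct net ns = { .id = 42 };

	addrconf_verify_work(&ns.ipv6.addr_chk_work);
	return 0;
}

The nested member designator (ipv6.addr_chk_work) is valid offsetof() usage, which is what lets the real callback reach struct net in a single step.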
Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- include/net/netns/ipv6.h | 1 + net/ipv6/addrconf.c | 44 ++++++++++++++++++++++------------------ 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 755f12001c8b..d145f1966682 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -95,6 +95,7 @@ struct netns_ipv6 { struct hlist_head *inet6_addr_lst; spinlock_t addrconf_hash_lock; + struct delayed_work addr_chk_work; #ifdef CONFIG_IPV6_MROUTE #ifndef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index cda9e59cab43..dab291cd39ba 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -152,12 +152,10 @@ static int ipv6_generate_stable_address(struct in6_addr *addr, static struct hlist_head inet6_addr_lst[IN6_ADDR_HSIZE]; static DEFINE_SPINLOCK(addrconf_hash_lock); -static void addrconf_verify(void); -static void addrconf_verify_rtnl(void); -static void addrconf_verify_work(struct work_struct *); +static void addrconf_verify(struct net *net); +static void addrconf_verify_rtnl(struct net *net); static struct workqueue_struct *addrconf_wq; -static DECLARE_DELAYED_WORK(addr_chk_work, addrconf_verify_work); static void addrconf_join_anycast(struct inet6_ifaddr *ifp); static void addrconf_leave_anycast(struct inet6_ifaddr *ifp); @@ -2675,7 +2673,7 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, create, now); in6_ifa_put(ifp); - addrconf_verify(); + addrconf_verify(net); } return 0; @@ -2987,7 +2985,7 @@ static int inet6_addr_add(struct net *net, int ifindex, manage_tempaddrs(idev, ifp, cfg->valid_lft, cfg->preferred_lft, true, jiffies); in6_ifa_put(ifp); - addrconf_verify_rtnl(); + addrconf_verify_rtnl(net); return 0; } else if (cfg->ifa_flags & IFA_F_MCAUTOJOIN) { ipv6_mc_config(net->ipv6.mc_autojoin_sk, false, @@ -3027,7 +3025,7 @@ static int inet6_addr_del(struct net *net, int ifindex, u32 ifa_flags, manage_tempaddrs(idev, ifp, 0, 0, false, jiffies); ipv6_del_addr(ifp); - addrconf_verify_rtnl(); + addrconf_verify_rtnl(net); if (ipv6_addr_is_multicast(pfx)) { ipv6_mc_config(net->ipv6.mc_autojoin_sk, false, pfx, dev->ifindex); @@ -4246,7 +4244,7 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp, bool bump_id, * before this temporary address becomes deprecated. 
*/ if (ifp->flags & IFA_F_TEMPORARY) - addrconf_verify_rtnl(); + addrconf_verify_rtnl(dev_net(dev)); } static void addrconf_dad_run(struct inet6_dev *idev, bool restart) @@ -4484,7 +4482,7 @@ int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs, * Periodic address status verification */ -static void addrconf_verify_rtnl(void) +static void addrconf_verify_rtnl(struct net *net) { unsigned long now, next, next_sec, next_sched; struct inet6_ifaddr *ifp; @@ -4496,7 +4494,7 @@ static void addrconf_verify_rtnl(void) now = jiffies; next = round_jiffies_up(now + ADDR_CHECK_FREQUENCY); - cancel_delayed_work(&addr_chk_work); + cancel_delayed_work(&net->ipv6.addr_chk_work); for (i = 0; i < IN6_ADDR_HSIZE; i++) { restart: @@ -4599,20 +4597,23 @@ restart: pr_debug("now = %lu, schedule = %lu, rounded schedule = %lu => %lu\n", now, next, next_sec, next_sched); - mod_delayed_work(addrconf_wq, &addr_chk_work, next_sched - now); + mod_delayed_work(addrconf_wq, &net->ipv6.addr_chk_work, next_sched - now); rcu_read_unlock_bh(); } static void addrconf_verify_work(struct work_struct *w) { + struct net *net = container_of(to_delayed_work(w), struct net, + ipv6.addr_chk_work); + rtnl_lock(); - addrconf_verify_rtnl(); + addrconf_verify_rtnl(net); rtnl_unlock(); } -static void addrconf_verify(void) +static void addrconf_verify(struct net *net) { - mod_delayed_work(addrconf_wq, &addr_chk_work, 0); + mod_delayed_work(addrconf_wq, &net->ipv6.addr_chk_work, 0); } static struct in6_addr *extract_addr(struct nlattr *addr, struct nlattr *local, @@ -4708,7 +4709,8 @@ static int modify_prefix_route(struct inet6_ifaddr *ifp, return 0; } -static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg) +static int inet6_addr_modify(struct net *net, struct inet6_ifaddr *ifp, + struct ifa6_config *cfg) { u32 flags; clock_t expires; @@ -4822,7 +4824,7 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, struct ifa6_config *cfg) jiffies); } - addrconf_verify_rtnl(); + addrconf_verify_rtnl(net); return 0; } @@ -4909,7 +4911,7 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, !(nlh->nlmsg_flags & NLM_F_REPLACE)) err = -EEXIST; else - err = inet6_addr_modify(ifa, &cfg); + err = inet6_addr_modify(net, ifa, &cfg); in6_ifa_put(ifa); @@ -5794,7 +5796,7 @@ update_lft: write_unlock_bh(&idev->lock); inet6_ifinfo_notify(RTM_NEWLINK, idev); - addrconf_verify_rtnl(); + addrconf_verify_rtnl(dev_net(dev)); return 0; } @@ -7112,6 +7114,7 @@ static int __net_init addrconf_init_net(struct net *net) struct ipv6_devconf *all, *dflt; spin_lock_init(&net->ipv6.addrconf_hash_lock); + INIT_DEFERRABLE_WORK(&net->ipv6.addr_chk_work, addrconf_verify_work); net->ipv6.inet6_addr_lst = kcalloc(IN6_ADDR_HSIZE, sizeof(struct hlist_head), GFP_KERNEL); @@ -7199,6 +7202,7 @@ static void __net_exit addrconf_exit_net(struct net *net) kfree(net->ipv6.devconf_all); net->ipv6.devconf_all = NULL; + cancel_delayed_work(&net->ipv6.addr_chk_work); /* * Check hash table, then free it. 
 */ @@ -7281,7 +7285,7 @@ int __init addrconf_init(void) register_netdevice_notifier(&ipv6_dev_notf); - addrconf_verify(); + addrconf_verify(&init_net); rtnl_af_register(&inet6_ops); @@ -7364,7 +7368,7 @@ void addrconf_cleanup(void) for (i = 0; i < IN6_ADDR_HSIZE; i++) WARN_ON(!hlist_empty(&inet6_addr_lst[i])); spin_unlock_bh(&addrconf_hash_lock); - cancel_delayed_work(&addr_chk_work); + rtnl_unlock(); destroy_workqueue(addrconf_wq); From e66d117222047ea90f92e065f929bc0e0eec3647 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Feb 2022 20:50:30 -0800 Subject: [PATCH 03/11] ipv6/addrconf: switch to per netns inet6_addr_lst hash table IPv6 address handling does not scale well with the number of addresses. It uses a global (shared by all netns) hash table with 256 buckets. Some functions like addrconf_verify_rtnl() and addrconf_ifdown() have to iterate all addresses in the hash table. I have seen addrconf_verify_rtnl() holding the CPU for 10ms or more. Switch to the per netns hash table (and spinlock) added in prior patches. This considerably speeds up netns dismantle times on hosts with thousands of netns. This also benefits regular (fast path) IPv6 processing, since lookups no longer have to skip entries belonging to other netns. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 77 ++++++++++++++------------------------------- 1 file changed, 23 insertions(+), 54 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index dab291cd39ba..4f402bc38f05 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -146,11 +146,6 @@ static int ipv6_generate_stable_address(struct in6_addr *addr, #define IN6_ADDR_HSIZE_SHIFT 8 #define IN6_ADDR_HSIZE (1 << IN6_ADDR_HSIZE_SHIFT) -/* - * Configured unicast address hash table - */ -static struct hlist_head inet6_addr_lst[IN6_ADDR_HSIZE]; -static DEFINE_SPINLOCK(addrconf_hash_lock); static void addrconf_verify(struct net *net); static void addrconf_verify_rtnl(struct net *net); @@ -1009,9 +1004,7 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, { struct inet6_ifaddr *ifp; - hlist_for_each_entry(ifp, &inet6_addr_lst[hash], addr_lst) { - if (!net_eq(dev_net(ifp->idev->dev), net)) - continue; + hlist_for_each_entry(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) { if (ipv6_addr_equal(&ifp->addr, addr)) { if (!dev || ifp->idev->dev == dev) return true; @@ -1022,20 +1015,21 @@ static bool ipv6_chk_same_addr(struct net *net, const struct in6_addr *addr, static int ipv6_add_addr_hash(struct net_device *dev, struct inet6_ifaddr *ifa) { - unsigned int hash = inet6_addr_hash(dev_net(dev), &ifa->addr); + struct net *net = dev_net(dev); + unsigned int hash = inet6_addr_hash(net, &ifa->addr); int err = 0; - spin_lock(&addrconf_hash_lock); + spin_lock(&net->ipv6.addrconf_hash_lock); /* Ignore adding duplicate addresses on an interface */ - if (ipv6_chk_same_addr(dev_net(dev), &ifa->addr, dev, hash)) { + if (ipv6_chk_same_addr(net, &ifa->addr, dev, hash)) { netdev_dbg(dev, "ipv6_add_addr: already assigned\n"); err = -EEXIST; } else { - hlist_add_head_rcu(&ifa->addr_lst, &inet6_addr_lst[hash]); + hlist_add_head_rcu(&ifa->addr_lst, &net->ipv6.inet6_addr_lst[hash]); } - spin_unlock(&addrconf_hash_lock); + spin_unlock(&net->ipv6.addrconf_hash_lock); return err; } @@ -1259,9 +1253,10 @@ cleanup_prefix_route(struct inet6_ifaddr *ifp, unsigned long expires, static void ipv6_del_addr(struct inet6_ifaddr *ifp) { - int state; enum cleanup_prefix_rt_t action = CLEANUP_PREFIX_RT_NOP; + struct net *net = dev_net(ifp->idev->dev); unsigned
long expires; + int state; ASSERT_RTNL(); @@ -1273,9 +1268,9 @@ static void ipv6_del_addr(struct inet6_ifaddr *ifp) if (state == INET6_IFADDR_STATE_DEAD) goto out; - spin_lock_bh(&addrconf_hash_lock); + spin_lock_bh(&net->ipv6.addrconf_hash_lock); hlist_del_init_rcu(&ifp->addr_lst); - spin_unlock_bh(&addrconf_hash_lock); + spin_unlock_bh(&net->ipv6.addrconf_hash_lock); write_lock_bh(&ifp->idev->lock); @@ -1918,10 +1913,8 @@ __ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, if (skip_dev_check) dev = NULL; - hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) { + hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) { ndev = ifp->idev->dev; - if (!net_eq(dev_net(ndev), net)) - continue; if (l3mdev_master_dev_rcu(ndev) != l3mdev) continue; @@ -2025,9 +2018,7 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add struct inet6_ifaddr *ifp, *result = NULL; rcu_read_lock(); - hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) { - if (!net_eq(dev_net(ifp->idev->dev), net)) - continue; + hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) { if (ipv6_addr_equal(&ifp->addr, addr)) { if (!dev || ifp->idev->dev == dev || !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) { @@ -2094,7 +2085,7 @@ static int addrconf_dad_end(struct inet6_ifaddr *ifp) void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp) { struct inet6_dev *idev = ifp->idev; - struct net *net = dev_net(ifp->idev->dev); + struct net *net = dev_net(idev->dev); if (addrconf_dad_end(ifp)) { in6_ifa_put(ifp); @@ -3770,9 +3761,9 @@ static int addrconf_ifdown(struct net_device *dev, bool unregister) /* Step 2: clear hash table */ for (i = 0; i < IN6_ADDR_HSIZE; i++) { - struct hlist_head *h = &inet6_addr_lst[i]; + struct hlist_head *h = &net->ipv6.inet6_addr_lst[i]; - spin_lock_bh(&addrconf_hash_lock); + spin_lock_bh(&net->ipv6.addrconf_hash_lock); restart: hlist_for_each_entry_rcu(ifa, h, addr_lst) { if (ifa->idev == idev) { @@ -3788,7 +3779,7 @@ restart: } } } - spin_unlock_bh(&addrconf_hash_lock); + spin_unlock_bh(&net->ipv6.addrconf_hash_lock); } write_lock_bh(&idev->lock); @@ -4286,10 +4277,8 @@ static struct inet6_ifaddr *if6_get_first(struct seq_file *seq, loff_t pos) } for (; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) { - hlist_for_each_entry_rcu(ifa, &inet6_addr_lst[state->bucket], + hlist_for_each_entry_rcu(ifa, &net->ipv6.inet6_addr_lst[state->bucket], addr_lst) { - if (!net_eq(dev_net(ifa->idev->dev), net)) - continue; /* sync with offset */ if (p < state->offset) { p++; @@ -4312,8 +4301,6 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, struct net *net = seq_file_net(seq); hlist_for_each_entry_continue_rcu(ifa, addr_lst) { - if (!net_eq(dev_net(ifa->idev->dev), net)) - continue; state->offset++; return ifa; } @@ -4321,9 +4308,7 @@ static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, state->offset = 0; while (++state->bucket < IN6_ADDR_HSIZE) { hlist_for_each_entry_rcu(ifa, - &inet6_addr_lst[state->bucket], addr_lst) { - if (!net_eq(dev_net(ifa->idev->dev), net)) - continue; + &net->ipv6.inet6_addr_lst[state->bucket], addr_lst) { return ifa; } } @@ -4411,9 +4396,7 @@ int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr) int ret = 0; rcu_read_lock(); - hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) { - if (!net_eq(dev_net(ifp->idev->dev), net)) - continue; + hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) { if 
(ipv6_addr_equal(&ifp->addr, addr) && (ifp->flags & IFA_F_HOMEADDRESS)) { ret = 1; @@ -4451,9 +4434,7 @@ int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs, hash = inet6_addr_hash(net, addr); hash_found = false; - hlist_for_each_entry_rcu(ifp, &inet6_addr_lst[hash], addr_lst) { - if (!net_eq(dev_net(ifp->idev->dev), net)) - continue; + hlist_for_each_entry_rcu(ifp, &net->ipv6.inet6_addr_lst[hash], addr_lst) { if (ipv6_addr_equal(&ifp->addr, addr)) { hash_found = true; @@ -4498,7 +4479,7 @@ static void addrconf_verify_rtnl(struct net *net) for (i = 0; i < IN6_ADDR_HSIZE; i++) { restart: - hlist_for_each_entry_rcu_bh(ifp, &inet6_addr_lst[i], addr_lst) { + hlist_for_each_entry_rcu_bh(ifp, &net->ipv6.inet6_addr_lst[i], addr_lst) { unsigned long age; /* When setting preferred_lft to a value not zero or @@ -7233,7 +7214,7 @@ static struct rtnl_af_ops inet6_ops __read_mostly = { int __init addrconf_init(void) { struct inet6_dev *idev; - int i, err; + int err; err = ipv6_addr_label_init(); if (err < 0) { @@ -7280,9 +7261,6 @@ int __init addrconf_init(void) ip6_route_init_special_entries(); - for (i = 0; i < IN6_ADDR_HSIZE; i++) - INIT_HLIST_HEAD(&inet6_addr_lst[i]); - register_netdevice_notifier(&ipv6_dev_notf); addrconf_verify(&init_net); @@ -7343,7 +7321,6 @@ out: void addrconf_cleanup(void) { struct net_device *dev; - int i; unregister_netdevice_notifier(&ipv6_dev_notf); unregister_pernet_subsys(&addrconf_ops); @@ -7361,14 +7338,6 @@ void addrconf_cleanup(void) } addrconf_ifdown(init_net.loopback_dev, true); - /* - * Check hash table. - */ - spin_lock_bh(&addrconf_hash_lock); - for (i = 0; i < IN6_ADDR_HSIZE; i++) - WARN_ON(!hlist_empty(&inet6_addr_lst[i])); - spin_unlock_bh(&addrconf_hash_lock); - rtnl_unlock(); destroy_workqueue(addrconf_wq); From fea7b201320ca222d532398685d23db2b8d55558 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Feb 2022 20:50:31 -0800 Subject: [PATCH 04/11] nexthop: change nexthop_net_exit() to nexthop_net_exit_batch() cleanup_net() is competing with other rtnl users. nexthop_net_exit() seems a good candidate for exit_batch(), as this gives cleanup_net() a chance to progress much faster, at the cost of holding rtnl a bit longer. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- net/ipv4/nexthop.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index eeafeccebb8d..e459a391e607 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -3733,12 +3733,16 @@ out: } EXPORT_SYMBOL(nexthop_res_grp_activity_update); -static void __net_exit nexthop_net_exit(struct net *net) +static void __net_exit nexthop_net_exit_batch(struct list_head *net_list) { + struct net *net; + rtnl_lock(); - flush_all_nexthops(net); + list_for_each_entry(net, net_list, exit_list) { + flush_all_nexthops(net); + kfree(net->nexthop.devhash); + } rtnl_unlock(); - kfree(net->nexthop.devhash); } static int __net_init nexthop_net_init(struct net *net) @@ -3756,7 +3760,7 @@ static int __net_init nexthop_net_init(struct net *net) static struct pernet_operations nexthop_net_ops = { .init = nexthop_net_init, - .exit = nexthop_net_exit, + .exit_batch = nexthop_net_exit_batch, }; static int __init nexthop_init(void) From 1c69576461435521b020cf0e6475b7524c1394dd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Feb 2022 20:50:32 -0800 Subject: [PATCH 05/11] ipv4: add fib_net_exit_batch() cleanup_net() is competing with other rtnl users.
Instead of acquiring rtnl at each fib_net_exit() invocation, add fib_net_exit_batch() so that rtnl is acquired once. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- net/ipv4/fib_frontend.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index c60e1d1ed2b0..54811728d906 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1556,7 +1556,7 @@ static void ip_fib_net_exit(struct net *net) { int i; - rtnl_lock(); + ASSERT_RTNL(); #ifdef CONFIG_IP_MULTIPLE_TABLES RCU_INIT_POINTER(net->ipv4.fib_main, NULL); RCU_INIT_POINTER(net->ipv4.fib_default, NULL); @@ -1581,7 +1581,7 @@ static void ip_fib_net_exit(struct net *net) #ifdef CONFIG_IP_MULTIPLE_TABLES fib4_rules_exit(net); #endif - rtnl_unlock(); + kfree(net->ipv4.fib_table_hash); fib4_notifier_exit(net); } @@ -1608,7 +1608,9 @@ out: out_proc: nl_fib_lookup_exit(net); out_nlfl: + rtnl_lock(); ip_fib_net_exit(net); + rtnl_unlock(); goto out; } @@ -1616,12 +1618,23 @@ static void __net_exit fib_net_exit(struct net *net) { fib_proc_exit(net); nl_fib_lookup_exit(net); - ip_fib_net_exit(net); +} + +static void __net_exit fib_net_exit_batch(struct list_head *net_list) +{ + struct net *net; + + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) + ip_fib_net_exit(net); + + rtnl_unlock(); } static struct pernet_operations fib_net_ops = { .init = fib_net_init, .exit = fib_net_exit, + .exit_batch = fib_net_exit_batch, }; void __init ip_fib_init(void) From ea3e91666ddd9f141632157ee601325d1d207061 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Feb 2022 20:50:33 -0800 Subject: [PATCH 06/11] ipv6: change fib6_rules_net_exit() to batch mode cleanup_net() is competing with other rtnl users. fib6_rules_net_exit() seems a good candidate for exit_batch(), as this gives cleanup_net() a chance to progress much faster, at the cost of holding rtnl a bit longer. Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- net/ipv6/fib6_rules.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index e2a7b0059669..7c2003833010 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -493,16 +493,21 @@ out_fib6_rules_ops: goto out; } -static void __net_exit fib6_rules_net_exit(struct net *net) +static void __net_exit fib6_rules_net_exit_batch(struct list_head *net_list) { + struct net *net; + rtnl_lock(); - fib_rules_unregister(net->ipv6.fib6_rules_ops); + list_for_each_entry(net, net_list, exit_list) { + fib_rules_unregister(net->ipv6.fib6_rules_ops); + cond_resched(); + } rtnl_unlock(); } static struct pernet_operations fib6_rules_net_ops = { .init = fib6_rules_net_init, - .exit = fib6_rules_net_exit, + .exit_batch = fib6_rules_net_exit_batch, }; int __init fib6_rules_init(void) From e2f736b753ecb70174f862aa040184e85c158255 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Feb 2022 20:50:34 -0800 Subject: [PATCH 07/11] ip6mr: introduce ip6mr_net_exit_batch() cleanup_net() is competing with other rtnl users. Acquiring rtnl once, instead of once per netns, before calling ip6mr_rules_exit() gives cleanup_net() a chance to progress much faster, at the cost of holding rtnl a bit longer.
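The payoff of these exit_batch() conversions is easy to show in miniature: lock round-trips drop from one per netns to one per batch. A hedged userspace analogy follows, with a pthread mutex standing in for rtnl and a plain counter instead of timing; none of the names below are kernel API.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rtnl = PTHREAD_MUTEX_INITIALIZER;
static int acquisitions;

static void lock_rtnl(void)   { pthread_mutex_lock(&rtnl); acquisitions++; }
static void unlock_rtnl(void) { pthread_mutex_unlock(&rtnl); }

/* Stand-in for the per-netns teardown done under the lock. */
static void exit_one(int ns) { (void)ns; }

int main(void)
{
	int n = 1000, i;

	/* .exit style: one lock/unlock cycle per namespace. */
	acquisitions = 0;
	for (i = 0; i < n; i++) { lock_rtnl(); exit_one(i); unlock_rtnl(); }
	printf(".exit:       %d acquisitions\n", acquisitions);

	/* .exit_batch style: a single cycle around the whole batch. */
	acquisitions = 0;
	lock_rtnl();
	for (i = 0; i < n; i++) exit_one(i);
	unlock_rtnl();
	printf(".exit_batch: %d acquisition(s)\n", acquisitions);
	return 0;
}

With contention from other rtnl users, each extra acquisition is a chance to block behind them, which is the effect the series is trimming.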
Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- net/ipv6/ip6mr.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index fd660414d482..881fe6b50307 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -253,13 +253,12 @@ static void __net_exit ip6mr_rules_exit(struct net *net) { struct mr_table *mrt, *next; - rtnl_lock(); + ASSERT_RTNL(); list_for_each_entry_safe(mrt, next, &net->ipv6.mr6_tables, list) { list_del(&mrt->list); ip6mr_free_table(mrt); } fib_rules_unregister(net->ipv6.mr6_rules_ops); - rtnl_unlock(); } static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, @@ -316,10 +315,9 @@ static int __net_init ip6mr_rules_init(struct net *net) static void __net_exit ip6mr_rules_exit(struct net *net) { - rtnl_lock(); + ASSERT_RTNL(); ip6mr_free_table(net->ipv6.mrt6); net->ipv6.mrt6 = NULL; - rtnl_unlock(); } static int ip6mr_rules_dump(struct net *net, struct notifier_block *nb, @@ -1323,7 +1321,9 @@ static int __net_init ip6mr_net_init(struct net *net) proc_cache_fail: remove_proc_entry("ip6_mr_vif", net->proc_net); proc_vif_fail: + rtnl_lock(); ip6mr_rules_exit(net); + rtnl_unlock(); #endif ip6mr_rules_fail: ip6mr_notifier_exit(net); @@ -1336,13 +1336,23 @@ static void __net_exit ip6mr_net_exit(struct net *net) remove_proc_entry("ip6_mr_cache", net->proc_net); remove_proc_entry("ip6_mr_vif", net->proc_net); #endif - ip6mr_rules_exit(net); ip6mr_notifier_exit(net); } +static void __net_exit ip6mr_net_exit_batch(struct list_head *net_list) +{ + struct net *net; + + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) + ip6mr_rules_exit(net); + rtnl_unlock(); +} + static struct pernet_operations ip6mr_net_ops = { .init = ip6mr_net_init, .exit = ip6mr_net_exit, + .exit_batch = ip6mr_net_exit_batch, }; int __init ip6_mr_init(void) From 696e595f707582cd54900d57041721b5223dfdb9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Feb 2022 20:50:35 -0800 Subject: [PATCH 08/11] ipmr: introduce ipmr_net_exit_batch() cleanup_net() is competing with other rtnl users. Acquiring rtnl once, instead of once per netns, before calling ipmr_rules_exit() gives cleanup_net() a chance to progress much faster, at the cost of holding rtnl a bit longer.
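Both multicast-routing patches share a locking-contract change worth spelling out: ip6mr_rules_exit() and ipmr_rules_exit() stop taking rtnl themselves and instead assert that the caller already holds it. A rough userspace approximation of that assert-held pattern is below, using hand-rolled owner tracking; the kernel's ASSERT_RTNL() is backed by lockdep, not by anything like this.

#include <assert.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rtnl = PTHREAD_MUTEX_INITIALIZER;
static pthread_t rtnl_owner;
static int rtnl_held;

static void rtnl_lock(void)
{
	pthread_mutex_lock(&rtnl);
	rtnl_owner = pthread_self();
	rtnl_held = 1;
}

static void rtnl_unlock(void)
{
	rtnl_held = 0;
	pthread_mutex_unlock(&rtnl);
}

/* Mock of ASSERT_RTNL(): the current thread must hold the lock. */
#define ASSERT_RTNL() \
	assert(rtnl_held && pthread_equal(rtnl_owner, pthread_self()))

static void rules_exit(int ns)
{
	ASSERT_RTNL();		/* caller provides the lock */
	printf("tearing down ns %d\n", ns);
}

int main(void)
{
	rtnl_lock();
	for (int ns = 0; ns < 3; ns++)
		rules_exit(ns);
	rtnl_unlock();
	return 0;
}

Moving the lock out to the caller is what makes the batch loop possible; the assert documents (and enforces) the new precondition at the old lock site.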
Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- net/ipv4/ipmr.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 07274619b9ea..4a55a620e526 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -266,13 +266,12 @@ static void __net_exit ipmr_rules_exit(struct net *net) { struct mr_table *mrt, *next; - rtnl_lock(); + ASSERT_RTNL(); list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) { list_del(&mrt->list); ipmr_free_table(mrt); } fib_rules_unregister(net->ipv4.mr_rules_ops); - rtnl_unlock(); } static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, @@ -328,10 +327,9 @@ static int __net_init ipmr_rules_init(struct net *net) static void __net_exit ipmr_rules_exit(struct net *net) { - rtnl_lock(); + ASSERT_RTNL(); ipmr_free_table(net->ipv4.mrt); net->ipv4.mrt = NULL; - rtnl_unlock(); } static int ipmr_rules_dump(struct net *net, struct notifier_block *nb, @@ -3075,7 +3073,9 @@ static int __net_init ipmr_net_init(struct net *net) proc_cache_fail: remove_proc_entry("ip_mr_vif", net->proc_net); proc_vif_fail: + rtnl_lock(); ipmr_rules_exit(net); + rtnl_unlock(); #endif ipmr_rules_fail: ipmr_notifier_exit(net); @@ -3090,12 +3090,22 @@ static void __net_exit ipmr_net_exit(struct net *net) remove_proc_entry("ip_mr_vif", net->proc_net); #endif ipmr_notifier_exit(net); - ipmr_rules_exit(net); +} + +static void __net_exit ipmr_net_exit_batch(struct list_head *net_list) +{ + struct net *net; + + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) + ipmr_rules_exit(net); + rtnl_unlock(); } static struct pernet_operations ipmr_net_ops = { .init = ipmr_net_init, .exit = ipmr_net_exit, + .exit_batch = ipmr_net_exit_batch, }; int __init ip_mr_init(void) From ef0de6696c38cbefba64fc1e29c18882bac1f747 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Feb 2022 20:50:36 -0800 Subject: [PATCH 09/11] can: gw: switch cangw_pernet_exit() to batch mode cleanup_net() is competing with other rtnl users. Acquiring rtnl once, instead of once per netns, before calling cgw_remove_all_jobs() gives cleanup_net() a chance to progress much faster, at the cost of holding rtnl a bit longer. Signed-off-by: Eric Dumazet Acked-by: Oliver Hartkopp Acked-by: Marc Kleine-Budde Signed-off-by: Jakub Kicinski --- net/can/gw.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/can/gw.c b/net/can/gw.c index d8861e862f15..24221352e059 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -1239,16 +1239,19 @@ static int __net_init cangw_pernet_init(struct net *net) return 0; } -static void __net_exit cangw_pernet_exit(struct net *net) +static void __net_exit cangw_pernet_exit_batch(struct list_head *net_list) { + struct net *net; + rtnl_lock(); - cgw_remove_all_jobs(net); + list_for_each_entry(net, net_list, exit_list) + cgw_remove_all_jobs(net); rtnl_unlock(); } static struct pernet_operations cangw_pernet_ops = { .init = cangw_pernet_init, - .exit = cangw_pernet_exit, + .exit_batch = cangw_pernet_exit_batch, }; static __init int cgw_module_init(void) From 16a41634accacb2b3eee3580a0aef2da0f15aabd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Feb 2022 20:50:37 -0800 Subject: [PATCH 10/11] bonding: switch bond_net_exit() to batch mode cleanup_net() is competing with other rtnl users. Batching bond_net_exit() factorizes all rtnl acquisitions into a single one, giving cleanup_net() a chance to progress much faster, at the cost of holding rtnl a bit longer.
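The bonding conversion has one more moving part than the earlier ones: three separate per-netns passes, with the middle pass queueing every namespace's devices onto one shared kill list that is torn down in a single call. The same shape reduced to plain C with a toy intrusive list; all types here are illustrative mock-ups, not the kernel's netdevice machinery.

#include <stdio.h>
#include <stdlib.h>

struct dev {
	int id;
	struct dev *next;
};

/* Queue one device onto the shared kill list
 * (kernel analogue: unregister_netdevice_queue()). */
static void queue_kill(struct dev *d, struct dev **kill_list)
{
	d->next = *kill_list;
	*kill_list = d;
}

/* Tear the whole list down in one pass
 * (kernel analogue: unregister_netdevice_many()). */
static void kill_many(struct dev *kill_list)
{
	while (kill_list) {
		struct dev *next = kill_list->next;

		printf("unregistering dev %d\n", kill_list->id);
		free(kill_list);
		kill_list = next;
	}
}

int main(void)
{
	struct dev *kill_list = NULL;

	/* One "netns" per outer iteration, two devices each; all of them
	 * land on the same kill list before anything is freed. */
	for (int ns = 0; ns < 3; ns++)
		for (int i = 0; i < 2; i++) {
			struct dev *d = malloc(sizeof(*d));

			if (!d)
				return 1;
			d->id = ns * 10 + i;
			queue_kill(d, &kill_list);
		}
	kill_many(kill_list);
	return 0;
}

Accumulate-then-free is what lets the real patch keep a single rtnl_lock()/rtnl_unlock() pair around all the unregistrations.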
Signed-off-by: Eric Dumazet Cc: Jay Vosburgh Cc: Veaceslav Falico Cc: Andy Gospodarek Signed-off-by: Jakub Kicinski --- drivers/net/bonding/bond_main.c | 27 +++++++++++++++++++-------- drivers/net/bonding/bond_procfs.c | 1 - 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 238b56d77c36..617c2bf8c5a7 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -6048,27 +6048,38 @@ static int __net_init bond_net_init(struct net *net) return 0; } -static void __net_exit bond_net_exit(struct net *net) +static void __net_exit bond_net_exit_batch(struct list_head *net_list) { - struct bond_net *bn = net_generic(net, bond_net_id); - struct bonding *bond, *tmp_bond; + struct bond_net *bn; + struct net *net; LIST_HEAD(list); - bond_destroy_sysfs(bn); + list_for_each_entry(net, net_list, exit_list) { + bn = net_generic(net, bond_net_id); + bond_destroy_sysfs(bn); + } /* Kill off any bonds created after unregistering bond rtnl ops */ rtnl_lock(); - list_for_each_entry_safe(bond, tmp_bond, &bn->dev_list, bond_list) - unregister_netdevice_queue(bond->dev, &list); + list_for_each_entry(net, net_list, exit_list) { + struct bonding *bond, *tmp_bond; + + bn = net_generic(net, bond_net_id); + list_for_each_entry_safe(bond, tmp_bond, &bn->dev_list, bond_list) + unregister_netdevice_queue(bond->dev, &list); + } unregister_netdevice_many(&list); rtnl_unlock(); - bond_destroy_proc_dir(bn); + list_for_each_entry(net, net_list, exit_list) { + bn = net_generic(net, bond_net_id); + bond_destroy_proc_dir(bn); + } } static struct pernet_operations bond_net_ops = { .init = bond_net_init, - .exit = bond_net_exit, + .exit_batch = bond_net_exit_batch, .id = &bond_net_id, .size = sizeof(struct bond_net), }; diff --git a/drivers/net/bonding/bond_procfs.c b/drivers/net/bonding/bond_procfs.c index 46b150e6289e..cfe37be42be4 100644 --- a/drivers/net/bonding/bond_procfs.c +++ b/drivers/net/bonding/bond_procfs.c @@ -307,7 +307,6 @@ void __net_init bond_create_proc_dir(struct bond_net *bn) } /* Destroy the bonding directory under /proc/net, if empty. - * Caller must hold rtnl_lock. */ void __net_exit bond_destroy_proc_dir(struct bond_net *bn) { From ee403248fa6db5ca23031fc51b06284d6855cd02 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Feb 2022 20:50:38 -0800 Subject: [PATCH 11/11] net: remove default_device_exit() For some reason default_device_ops kept two exit methods: 1) default_device_exit() is called for each netns being dismantled in a cleanup_net() round. This acquires rtnl for each invocation. 2) default_device_exit_batch() is called once with the list of all netns in the batch, allowing for a single rtnl acquisition. Get rid of the .exit() method and handle its logic from default_device_exit_batch(), decreasing the number of rtnl acquisitions to one.
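The subtle piece of this final patch is the reworked rtnl_lock_unregistering(): since default_device_exit_net() now runs under the batch's single rtnl acquisition, the wait loop must retake the lock itself after every wakeup. That is the classic condition-wait shape, which in userspace maps onto pthread_cond_wait(), as in the sketch below (mock state throughout; the kernel version uses wait_woken() and netdev_unregistering_wq rather than a condvar).

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t rtnl = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t unreg_done = PTHREAD_COND_INITIALIZER;
static int dev_unreg_count = 3;

/* Called with rtnl held; returns with rtnl held and the count at zero.
 * pthread_cond_wait() unlocks, sleeps, and relocks -- the same
 * drop/sleep/retake cycle the reworked kernel loop performs. */
static void lock_unregistering(void)
{
	while (dev_unreg_count > 0)
		pthread_cond_wait(&unreg_done, &rtnl);
}

static void *unregister_worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&rtnl);
	while (dev_unreg_count > 0) {
		dev_unreg_count--;
		printf("unregistered one, %d left\n", dev_unreg_count);
	}
	pthread_cond_broadcast(&unreg_done);
	pthread_mutex_unlock(&rtnl);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_mutex_lock(&rtnl);
	pthread_create(&t, NULL, unregister_worker, NULL);
	lock_unregistering();	/* lock held again here */
	printf("no pending unregistrations, continuing teardown\n");
	pthread_mutex_unlock(&rtnl);
	pthread_join(t, NULL);
	return 0;
}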
Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Signed-off-by: Jakub Kicinski --- net/core/dev.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 66556a21800a..f5ef51601081 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10850,14 +10850,14 @@ static struct pernet_operations __net_initdata netdev_net_ops = { .exit = netdev_exit, }; -static void __net_exit default_device_exit(struct net *net) +static void __net_exit default_device_exit_net(struct net *net) { struct net_device *dev, *aux; /* * Push all migratable network devices back to the * initial network namespace */ - rtnl_lock(); + ASSERT_RTNL(); for_each_netdev_safe(net, dev, aux) { int err; char fb_name[IFNAMSIZ]; @@ -10881,22 +10881,22 @@ static void __net_exit default_device_exit(struct net *net) BUG(); } } - rtnl_unlock(); } static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) { - /* Return with the rtnl_lock held when there are no network + /* Return (with the rtnl_lock held) when there are no network * devices unregistering in any network namespace in net_list. */ - struct net *net; - bool unregistering; DEFINE_WAIT_FUNC(wait, woken_wake_function); + bool unregistering; + struct net *net; + ASSERT_RTNL(); add_wait_queue(&netdev_unregistering_wq, &wait); for (;;) { unregistering = false; - rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { if (net->dev_unreg_count > 0) { unregistering = true; @@ -10908,6 +10908,7 @@ static void __net_exit rtnl_lock_unregistering(struct list_head *net_list) __rtnl_unlock(); wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); + rtnl_lock(); } remove_wait_queue(&netdev_unregistering_wq, &wait); } @@ -10923,6 +10924,11 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) struct net *net; LIST_HEAD(dev_kill_list); + rtnl_lock(); + list_for_each_entry(net, net_list, exit_list) { + default_device_exit_net(net); + cond_resched(); + } /* To prevent network device cleanup code from dereferencing * loopback devices or network devices that have been freed * wait here for all pending unregistrations to complete, @@ -10935,6 +10941,7 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) * default_device_exit_batch. */ rtnl_lock_unregistering(net_list); + list_for_each_entry(net, net_list, exit_list) { for_each_netdev_reverse(net, dev) { if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) @@ -10948,7 +10955,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) } static struct pernet_operations __net_initdata default_device_ops = { - .exit = default_device_exit, .exit_batch = default_device_exit_batch, };
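Stepping back, the series amounts to one data-structure move plus lock batching: hash tables and work items migrate from file-scope globals into struct net, so lookups and teardown only ever touch their own namespace, and per-netns exit work is grouped under a single rtnl acquisition. A closing standalone C sketch of the data-structure half (toy hash, toy types, nothing kernel-specific):

#include <stdio.h>
#include <string.h>

#define IN6_ADDR_HSIZE 16

struct ifaddr {
	unsigned int addr;
	struct ifaddr *next;
};

struct netns {
	/* was: a single file-scope inet6_addr_lst[] shared by all netns */
	struct ifaddr *inet6_addr_lst[IN6_ADDR_HSIZE];
};

static unsigned int hash(unsigned int addr)
{
	return addr % IN6_ADDR_HSIZE;
}

static void add_addr(struct netns *ns, struct ifaddr *ifa)
{
	unsigned int h = hash(ifa->addr);

	ifa->next = ns->inet6_addr_lst[h];
	ns->inet6_addr_lst[h] = ifa;
}

static struct ifaddr *find_addr(struct netns *ns, unsigned int addr)
{
	/* No "skip entries from other namespaces" filter needed. */
	for (struct ifaddr *ifa = ns->inet6_addr_lst[hash(addr)]; ifa;
	     ifa = ifa->next)
		if (ifa->addr == addr)
			return ifa;
	return NULL;
}

int main(void)
{
	struct netns ns;
	struct ifaddr a = { .addr = 0x2001 };

	memset(&ns, 0, sizeof(ns));
	add_addr(&ns, &a);
	printf("lookup: %s\n", find_addr(&ns, 0x2001) ? "hit" : "miss");
	return 0;
}

Note how the lookup loses the per-entry namespace filter: that is exactly the net_eq() check patch 3 deletes from every hash-table walker.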