From 898f8015ffe74118e7b461827451f2cc6e51035b Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 8 Feb 2021 11:34:08 -0800 Subject: [PATCH 1/3] net: extract napi poll functionality to __napi_poll() This commit introduces a new function __napi_poll() which does the main logic of the existing napi_poll() function, and will be called by other functions in later commits. This idea and implementation is done by Felix Fietkau and is proposed as part of the patch to move napi work to work_queue context. This commit by itself is a code restructure. Signed-off-by: Felix Fietkau Signed-off-by: Wei Wang Reviewed-by: Alexander Duyck Signed-off-by: David S. Miller --- net/core/dev.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 21d74d30f5d7..59751a22d7c3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6776,15 +6776,10 @@ void __netif_napi_del(struct napi_struct *napi) } EXPORT_SYMBOL(__netif_napi_del); -static int napi_poll(struct napi_struct *n, struct list_head *repoll) +static int __napi_poll(struct napi_struct *n, bool *repoll) { - void *have; int work, weight; - list_del_init(&n->poll_list); - - have = netpoll_poll_lock(n); - weight = n->weight; /* This NAPI_STATE_SCHED test is for avoiding a race @@ -6804,7 +6799,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) n->poll, work, weight); if (likely(work < weight)) - goto out_unlock; + return work; /* Drivers must not modify the NAPI state if they * consume the entire weight. In such cases this code @@ -6813,7 +6808,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) */ if (unlikely(napi_disable_pending(n))) { napi_complete(n); - goto out_unlock; + return work; } /* The NAPI context has more processing work, but busy-polling @@ -6826,7 +6821,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) */ napi_schedule(n); } - goto out_unlock; + return work; } if (n->gro_bitmask) { @@ -6844,12 +6839,29 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) if (unlikely(!list_empty(&n->poll_list))) { pr_warn_once("%s: Budget exhausted after napi rescheduled\n", n->dev ? n->dev->name : "backlog"); - goto out_unlock; + return work; } - list_add_tail(&n->poll_list, repoll); + *repoll = true; + + return work; +} + +static int napi_poll(struct napi_struct *n, struct list_head *repoll) +{ + bool do_repoll = false; + void *have; + int work; + + list_del_init(&n->poll_list); + + have = netpoll_poll_lock(n); + + work = __napi_poll(n, &do_repoll); + + if (do_repoll) + list_add_tail(&n->poll_list, repoll); -out_unlock: netpoll_poll_unlock(have); return work; From 29863d41bb6e1d969c62fdb15b0961806942960e Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 8 Feb 2021 11:34:09 -0800 Subject: [PATCH 2/3] net: implement threaded-able napi poll loop support This patch allows running each napi poll loop inside its own kernel thread. The kthread is created during netif_napi_add() if dev->threaded is set. And threaded mode is enabled in napi_enable(). We will provide a way to set dev->threaded and enable threaded mode without a device up/down in the following patch. Once that threaded mode is enabled and the kthread is started, napi_schedule() will wake-up such thread instead of scheduling the softirq. The threaded poll loop behaves quite likely the net_rx_action, but it does not have to manipulate local irqs and uses an explicit scheduling point based on netdev_budget. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Co-developed-by: Hannes Frederic Sowa Signed-off-by: Hannes Frederic Sowa Co-developed-by: Jakub Kicinski Signed-off-by: Jakub Kicinski Signed-off-by: Wei Wang Reviewed-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/netdevice.h | 21 +++---- net/core/dev.c | 112 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+), 14 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index e9e7ada07ea1..99fb4ec9573e 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -347,6 +347,7 @@ struct napi_struct { struct list_head dev_list; struct hlist_node napi_hash_node; unsigned int napi_id; + struct task_struct *thread; }; enum { @@ -358,6 +359,7 @@ enum { NAPI_STATE_NO_BUSY_POLL, /* Do not add in napi_hash, no busy polling */ NAPI_STATE_IN_BUSY_POLL, /* sk_busy_loop() owns this NAPI */ NAPI_STATE_PREFER_BUSY_POLL, /* prefer busy-polling over softirq processing*/ + NAPI_STATE_THREADED, /* The poll is performed inside its own thread*/ }; enum { @@ -369,6 +371,7 @@ enum { NAPIF_STATE_NO_BUSY_POLL = BIT(NAPI_STATE_NO_BUSY_POLL), NAPIF_STATE_IN_BUSY_POLL = BIT(NAPI_STATE_IN_BUSY_POLL), NAPIF_STATE_PREFER_BUSY_POLL = BIT(NAPI_STATE_PREFER_BUSY_POLL), + NAPIF_STATE_THREADED = BIT(NAPI_STATE_THREADED), }; enum gro_result { @@ -503,20 +506,7 @@ static inline bool napi_complete(struct napi_struct *n) */ void napi_disable(struct napi_struct *n); -/** - * napi_enable - enable NAPI scheduling - * @n: NAPI context - * - * Resume NAPI from being scheduled on this context. - * Must be paired with napi_disable. - */ -static inline void napi_enable(struct napi_struct *n) -{ - BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); - smp_mb__before_atomic(); - clear_bit(NAPI_STATE_SCHED, &n->state); - clear_bit(NAPI_STATE_NPSVC, &n->state); -} +void napi_enable(struct napi_struct *n); /** * napi_synchronize - wait until NAPI is not running @@ -1827,6 +1817,8 @@ enum netdev_priv_flags { * * @wol_enabled: Wake-on-LAN is enabled * + * @threaded: napi threaded mode is enabled + * * @net_notifier_list: List of per-net netdev notifier block * that follow this device when it is moved * to another network namespace. @@ -2145,6 +2137,7 @@ struct net_device { struct lock_class_key *qdisc_running_key; bool proto_down; unsigned wol_enabled:1; + unsigned threaded:1; struct list_head net_notifier_list; diff --git a/net/core/dev.c b/net/core/dev.c index 59751a22d7c3..1e35f4f44f3b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -91,6 +91,7 @@ #include #include #include +#include #include #include #include @@ -1494,6 +1495,27 @@ void netdev_notify_peers(struct net_device *dev) } EXPORT_SYMBOL(netdev_notify_peers); +static int napi_threaded_poll(void *data); + +static int napi_kthread_create(struct napi_struct *n) +{ + int err = 0; + + /* Create and wake up the kthread once to put it in + * TASK_INTERRUPTIBLE mode to avoid the blocked task + * warning and work with loadavg. + */ + n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d", + n->dev->name, n->napi_id); + if (IS_ERR(n->thread)) { + err = PTR_ERR(n->thread); + pr_err("kthread_run failed with err %d\n", err); + n->thread = NULL; + } + + return err; +} + static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; @@ -4265,6 +4287,21 @@ int gro_normal_batch __read_mostly = 8; static inline void ____napi_schedule(struct softnet_data *sd, struct napi_struct *napi) { + struct task_struct *thread; + + if (test_bit(NAPI_STATE_THREADED, &napi->state)) { + /* Paired with smp_mb__before_atomic() in + * napi_enable(). Use READ_ONCE() to guarantee + * a complete read on napi->thread. Only call + * wake_up_process() when it's not NULL. + */ + thread = READ_ONCE(napi->thread); + if (thread) { + wake_up_process(thread); + return; + } + } + list_add_tail(&napi->poll_list, &sd->poll_list); __raise_softirq_irqoff(NET_RX_SOFTIRQ); } @@ -6728,6 +6765,12 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi, set_bit(NAPI_STATE_NPSVC, &napi->state); list_add_rcu(&napi->dev_list, &dev->napi_list); napi_hash_add(napi); + /* Create kthread for this napi if dev->threaded is set. + * Clear dev->threaded if kthread creation failed so that + * threaded mode will not be enabled in napi_enable(). + */ + if (dev->threaded && napi_kthread_create(napi)) + dev->threaded = 0; } EXPORT_SYMBOL(netif_napi_add); @@ -6745,9 +6788,28 @@ void napi_disable(struct napi_struct *n) clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state); clear_bit(NAPI_STATE_DISABLE, &n->state); + clear_bit(NAPI_STATE_THREADED, &n->state); } EXPORT_SYMBOL(napi_disable); +/** + * napi_enable - enable NAPI scheduling + * @n: NAPI context + * + * Resume NAPI from being scheduled on this context. + * Must be paired with napi_disable. + */ +void napi_enable(struct napi_struct *n) +{ + BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state)); + smp_mb__before_atomic(); + clear_bit(NAPI_STATE_SCHED, &n->state); + clear_bit(NAPI_STATE_NPSVC, &n->state); + if (n->dev->threaded && n->thread) + set_bit(NAPI_STATE_THREADED, &n->state); +} +EXPORT_SYMBOL(napi_enable); + static void flush_gro_hash(struct napi_struct *napi) { int i; @@ -6773,6 +6835,11 @@ void __netif_napi_del(struct napi_struct *napi) flush_gro_hash(napi); napi->gro_bitmask = 0; + + if (napi->thread) { + kthread_stop(napi->thread); + napi->thread = NULL; + } } EXPORT_SYMBOL(__netif_napi_del); @@ -6867,6 +6934,51 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) return work; } +static int napi_thread_wait(struct napi_struct *napi) +{ + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop() && !napi_disable_pending(napi)) { + if (test_bit(NAPI_STATE_SCHED, &napi->state)) { + WARN_ON(!list_empty(&napi->poll_list)); + __set_current_state(TASK_RUNNING); + return 0; + } + + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return -1; +} + +static int napi_threaded_poll(void *data) +{ + struct napi_struct *napi = data; + void *have; + + while (!napi_thread_wait(napi)) { + for (;;) { + bool repoll = false; + + local_bh_disable(); + + have = netpoll_poll_lock(napi); + __napi_poll(napi, &repoll); + netpoll_poll_unlock(have); + + __kfree_skb_flush(); + local_bh_enable(); + + if (!repoll) + break; + + cond_resched(); + } + } + return 0; +} + static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); From 5fdd2f0e5c64846bf3066689b73fc3b8dddd1c74 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 8 Feb 2021 11:34:10 -0800 Subject: [PATCH 3/3] net: add sysfs attribute to control napi threaded mode This patch adds a new sysfs attribute to the network device class. Said attribute provides a per-device control to enable/disable the threaded mode for all the napi instances of the given network device, without the need for a device up/down. User sets it to 1 or 0 to enable or disable threaded mode. Note: when switching between threaded and the current softirq based mode for a napi instance, it will not immediately take effect if the napi is currently being polled. The mode switch will happen for the next time napi_schedule() is called. Co-developed-by: Paolo Abeni Signed-off-by: Paolo Abeni Co-developed-by: Hannes Frederic Sowa Signed-off-by: Hannes Frederic Sowa Co-developed-by: Felix Fietkau Signed-off-by: Felix Fietkau Signed-off-by: Wei Wang Reviewed-by: Alexander Duyck Signed-off-by: David S. Miller --- Documentation/ABI/testing/sysfs-class-net | 15 +++++++ include/linux/netdevice.h | 2 + net/core/dev.c | 48 ++++++++++++++++++++++- net/core/net-sysfs.c | 40 +++++++++++++++++++ 4 files changed, 103 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net index 1f2002df5ba2..1419103d11f9 100644 --- a/Documentation/ABI/testing/sysfs-class-net +++ b/Documentation/ABI/testing/sysfs-class-net @@ -337,3 +337,18 @@ Contact: netdev@vger.kernel.org Description: 32-bit unsigned integer counting the number of times the link has been down + +What: /sys/class/net//threaded +Date: Jan 2021 +KernelVersion: 5.12 +Contact: netdev@vger.kernel.org +Description: + Boolean value to control the threaded mode per device. User could + set this value to enable/disable threaded mode for all napi + belonging to this device, without the need to do device up/down. + + Possible values: + == ================================== + 0 threaded mode disabled for this dev + 1 threaded mode enabled for this dev + == ================================== diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 99fb4ec9573e..1340327f7abf 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -497,6 +497,8 @@ static inline bool napi_complete(struct napi_struct *n) return napi_complete_done(n, 0); } +int dev_set_threaded(struct net_device *dev, bool threaded); + /** * napi_disable - prevent NAPI from scheduling * @n: NAPI context diff --git a/net/core/dev.c b/net/core/dev.c index 1e35f4f44f3b..7647278e46f0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4291,8 +4291,9 @@ static inline void ____napi_schedule(struct softnet_data *sd, if (test_bit(NAPI_STATE_THREADED, &napi->state)) { /* Paired with smp_mb__before_atomic() in - * napi_enable(). Use READ_ONCE() to guarantee - * a complete read on napi->thread. Only call + * napi_enable()/dev_set_threaded(). + * Use READ_ONCE() to guarantee a complete + * read on napi->thread. Only call * wake_up_process() when it's not NULL. */ thread = READ_ONCE(napi->thread); @@ -6738,6 +6739,49 @@ static void init_gro_hash(struct napi_struct *napi) napi->gro_bitmask = 0; } +int dev_set_threaded(struct net_device *dev, bool threaded) +{ + struct napi_struct *napi; + int err = 0; + + if (dev->threaded == threaded) + return 0; + + if (threaded) { + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (!napi->thread) { + err = napi_kthread_create(napi); + if (err) { + threaded = false; + break; + } + } + } + } + + dev->threaded = threaded; + + /* Make sure kthread is created before THREADED bit + * is set. + */ + smp_mb__before_atomic(); + + /* Setting/unsetting threaded mode on a napi might not immediately + * take effect, if the current napi instance is actively being + * polled. In this case, the switch between threaded mode and + * softirq mode will happen in the next round of napi_schedule(). + * This should not cause hiccups/stalls to the live traffic. + */ + list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (threaded) + set_bit(NAPI_STATE_THREADED, &napi->state); + else + clear_bit(NAPI_STATE_THREADED, &napi->state); + } + + return err; +} + void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 91afb0b6de69..307628fdf380 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -538,6 +538,45 @@ static ssize_t phys_switch_id_show(struct device *dev, } static DEVICE_ATTR_RO(phys_switch_id); +static ssize_t threaded_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + ssize_t ret = -EINVAL; + + if (!rtnl_trylock()) + return restart_syscall(); + + if (dev_isalive(netdev)) + ret = sprintf(buf, fmt_dec, netdev->threaded); + + rtnl_unlock(); + return ret; +} + +static int modify_napi_threaded(struct net_device *dev, unsigned long val) +{ + int ret; + + if (list_empty(&dev->napi_list)) + return -EOPNOTSUPP; + + if (val != 0 && val != 1) + return -EOPNOTSUPP; + + ret = dev_set_threaded(dev, val); + + return ret; +} + +static ssize_t threaded_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return netdev_store(dev, attr, buf, len, modify_napi_threaded); +} +static DEVICE_ATTR_RW(threaded); + static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_netdev_group.attr, &dev_attr_type.attr, @@ -570,6 +609,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_proto_down.attr, &dev_attr_carrier_up_count.attr, &dev_attr_carrier_down_count.attr, + &dev_attr_threaded.attr, NULL, }; ATTRIBUTE_GROUPS(net_class);