From 97e719a82b43c6c2bb5eebdb3c5d479a332ac2ac Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sun, 15 May 2022 21:24:53 -0700
Subject: [PATCH 1/4] net: fix possible race in skb_attempt_defer_free()

A cpu can observe sd->defer_count reaching 128, and call
smp_call_function_single_async().

The problem is that the remote CPU can clear sd->defer_count before
the IPI is run/acknowledged.

Other cpus can then queue more packets and also decide to call
smp_call_function_single_async() while the pending IPI has not yet
been delivered.

This is a common issue with smp_call_function_single_async().
Callers must ensure correct synchronization and serialization.

I triggered this issue while experimenting with a smaller threshold.
Performing the call to smp_call_function_single_async() under
sd->defer_lock protection did not solve the problem.

Commit 5a18ceca6350 ("smp: Allow smp_call_function_single_async()
to insert locked csd") replaced an informative WARN_ON_ONCE()
with a return of -EBUSY, which is often ignored.
Testing for CSD_FLAG_LOCK presence is racy anyway.

Fixes: 68822bdf76f1 ("net: generalize skb freeing deferral to per-cpu lists")
Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 include/linux/netdevice.h | 1 +
 net/core/dev.c            | 7 +++++--
 net/core/skbuff.c         | 5 ++---
 3 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index d57ce248004c..cbaf312e365b 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -3136,6 +3136,7 @@ struct softnet_data {
 	/* Another possibly contended cache line */
 	spinlock_t		defer_lock ____cacheline_aligned_in_smp;
 	int			defer_count;
+	int			defer_ipi_scheduled;
 	struct sk_buff		*defer_list;
 	call_single_data_t	defer_csd;
 };
diff --git a/net/core/dev.c b/net/core/dev.c
index d93456c75b55..a5e663e1a75a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4582,9 +4582,12 @@ static void rps_trigger_softirq(void *data)
 #endif /* CONFIG_RPS */
 
 /* Called from hardirq (IPI) context */
-static void trigger_rx_softirq(void *data __always_unused)
+static void trigger_rx_softirq(void *data)
 {
+	struct softnet_data *sd = data;
+
 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	smp_store_release(&sd->defer_ipi_scheduled, 0);
 }
 
 /*
@@ -11382,7 +11385,7 @@ static int __init net_dev_init(void)
 		INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
 		sd->cpu = i;
 #endif
-		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, NULL);
+		INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
 		spin_lock_init(&sd->defer_lock);
 
 		init_gro_hash(&sd->backlog);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2fea964f09d8..b40c8cdf4785 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -6516,8 +6516,7 @@ void skb_attempt_defer_free(struct sk_buff *skb)
 	sd->defer_count++;
 
 	/* kick every time queue length reaches 128.
-	 * This should avoid blocking in smp_call_function_single_async().
-	 * This condition should hardly be bit under normal conditions,
+	 * This condition should hardly be hit under normal conditions,
 	 * unless cpu suddenly stopped to receive NIC interrupts.
 	 */
 	kick = sd->defer_count == 128;
@@ -6527,6 +6526,6 @@ void skb_attempt_defer_free(struct sk_buff *skb)
 	/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
 	 * if we are unlucky enough (this seems very unlikely).
 	 */
-	if (unlikely(kick))
+	if (unlikely(kick) && !cmpxchg(&sd->defer_ipi_scheduled, 0, 1))
 		smp_call_function_single_async(cpu, &sd->defer_csd);
 }
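The fix follows a standard one-shot guard pattern for smp_call_function_single_async(): only the winner of the cmpxchg() from 0 to 1 may schedule the IPI, and the IPI handler re-arms the flag with a release store only after raising the softirq, so a kick cannot be lost while one is in flight. Below is a minimal userspace sketch of the same pattern using C11 atomics; it is not kernel code, and schedule_ipi()/ipi_handler() are hypothetical stand-ins for smp_call_function_single_async() and trigger_rx_softirq().

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int ipi_scheduled;	/* mirrors sd->defer_ipi_scheduled */

/* Hypothetical stand-in for smp_call_function_single_async(). */
static void schedule_ipi(void)
{
	printf("IPI scheduled\n");
}

/* Producer side: only the thread that flips 0 -> 1 schedules the IPI,
 * mirroring the !cmpxchg(&sd->defer_ipi_scheduled, 0, 1) test.
 */
static void maybe_kick(bool kick)
{
	int expected = 0;

	if (kick && atomic_compare_exchange_strong(&ipi_scheduled,
						   &expected, 1))
		schedule_ipi();
}

/* Consumer side: mirrors trigger_rx_softirq(). The flag is re-armed
 * with a release store only after the work has been raised.
 */
static void ipi_handler(void)
{
	printf("raise NET_RX_SOFTIRQ\n");
	atomic_store_explicit(&ipi_scheduled, 0, memory_order_release);
}

int main(void)
{
	maybe_kick(true);	/* schedules the IPI */
	maybe_kick(true);	/* suppressed: previous IPI still pending */
	ipi_handler();		/* handler runs and re-arms the flag */
	maybe_kick(true);	/* schedules again */
	return 0;
}

The same serialization requirement applies to any caller of smp_call_function_single_async(): without such a guard, a second call can be issued while the csd is still locked by the first.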
From 2db60eed1a957423cf06ee1060fc45ed3971990d Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sun, 15 May 2022 21:24:54 -0700
Subject: [PATCH 2/4] net: use napi_consume_skb() in skb_defer_free_flush()

skb_defer_free_flush() runs from softirq context, so we have the
opportunity to refill the napi_alloc_cache, and/or use
kmem_cache_free_bulk() when this cache is full.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index a5e663e1a75a..d0b34bc50706 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6633,7 +6633,7 @@ static void skb_defer_free_flush(struct softnet_data *sd)
 
 	while (skb != NULL) {
 		next = skb->next;
-		__kfree_skb(skb);
+		napi_consume_skb(skb, 1);
 		skb = next;
 	}
 }
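A note on the one-line change above: a non-zero budget tells napi_consume_skb() it is running in softirq context, which permits recycling the skb head through the napi_alloc_cache or batching frees, rather than freeing one at a time as __kfree_skb() does. For illustration only, a self-contained sketch of the detach-and-walk shape used by skb_defer_free_flush(), with a trivial node type standing in for sk_buff (all names hypothetical):

#include <stdio.h>
#include <stdlib.h>

struct node {			/* stand-in for struct sk_buff */
	struct node *next;
	int id;
};

/* Stand-in for napi_consume_skb(skb, 1); in the kernel a non-zero
 * budget permits caching or bulk-freeing instead of immediate free.
 */
static void consume(struct node *n)
{
	printf("consuming node %d\n", n->id);
	free(n);
}

static void flush(struct node **list)
{
	struct node *n = *list;

	*list = NULL;			/* detach the whole list first */
	while (n != NULL) {
		struct node *next = n->next;	/* read before freeing */

		consume(n);
		n = next;
	}
}

int main(void)
{
	struct node *list = NULL;

	for (int id = 0; id < 3; id++) {
		struct node *n = malloc(sizeof(*n));

		if (!n)
			return 1;
		n->id = id;
		n->next = list;		/* push front, like skb->next linking */
		list = n;
	}
	flush(&list);
	return 0;
}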
From 39564c3fdc6684c6726b63e131d2a9f3809811cb Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sun, 15 May 2022 21:24:55 -0700
Subject: [PATCH 3/4] net: add skb_defer_max sysctl

commit 68822bdf76f1 ("net: generalize skb freeing deferral to per-cpu
lists") added another per-cpu cache of skbs. It was expected to be
small, and an IPI was forced whenever the list reached 128 skbs.

We might need to control the queue capacity and the added latency
more precisely, so make the limit a sysctl. An IPI is now generated
whenever the queue reaches half capacity.

The default value of the new limit is 64.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 Documentation/admin-guide/sysctl/net.rst |  8 ++++++++
 net/core/dev.c                           |  1 +
 net/core/dev.h                           |  2 +-
 net/core/skbuff.c                        | 15 +++++++++------
 net/core/sysctl_net_core.c               |  8 ++++++++
 5 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst
index f86b5e1623c6..fa4dcdb283cf 100644
--- a/Documentation/admin-guide/sysctl/net.rst
+++ b/Documentation/admin-guide/sysctl/net.rst
@@ -322,6 +322,14 @@ a leaked reference faster. A larger value may be useful to prevent false
 warnings on slow/loaded systems.
 Default value is 10, minimum 1, maximum 3600.
 
+skb_defer_max
+-------------
+
+Max size (in skbs) of the per-cpu list of skbs being freed
+by the cpu which allocated them. Used by TCP stack so far.
+
+Default: 64
+
 optmem_max
 ----------
 
diff --git a/net/core/dev.c b/net/core/dev.c
index d0b34bc50706..6359f8953269 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4330,6 +4330,7 @@ int netdev_max_backlog __read_mostly = 1000;
 EXPORT_SYMBOL(netdev_max_backlog);
 
 int netdev_tstamp_prequeue __read_mostly = 1;
+unsigned int sysctl_skb_defer_max __read_mostly = 64;
 int netdev_budget __read_mostly = 300;
 /* Must be at least 2 jiffes to guarantee 1 jiffy timeout */
 unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
diff --git a/net/core/dev.h b/net/core/dev.h
index 328b37af90ba..cbb8a925175a 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -39,7 +39,7 @@ void dev_addr_check(struct net_device *dev);
 /* sysctls not referred to from outside net/core/ */
 extern int netdev_budget;
 extern unsigned int netdev_budget_usecs;
-
+extern unsigned int sysctl_skb_defer_max;
 extern int netdev_tstamp_prequeue;
 extern int netdev_unregister_timeout_secs;
 extern int weight_p;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index b40c8cdf4785..1d10bb4adec1 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -80,6 +80,7 @@
 #include
 #include
 
+#include "dev.h"
 #include "sock_destructor.h"
 
 struct kmem_cache *skbuff_head_cache __ro_after_init;
@@ -6496,16 +6497,21 @@ void skb_attempt_defer_free(struct sk_buff *skb)
 	int cpu = skb->alloc_cpu;
 	struct softnet_data *sd;
 	unsigned long flags;
+	unsigned int defer_max;
 	bool kick;
 
 	if (WARN_ON_ONCE(cpu >= nr_cpu_ids) || !cpu_online(cpu) ||
 	    cpu == raw_smp_processor_id()) {
-		__kfree_skb(skb);
+nodefer:	__kfree_skb(skb);
 		return;
 	}
 
 	sd = &per_cpu(softnet_data, cpu);
+	defer_max = READ_ONCE(sysctl_skb_defer_max);
+	if (READ_ONCE(sd->defer_count) >= defer_max)
+		goto nodefer;
+
 	/* We do not send an IPI or any signal.
 	 * Remote cpu will eventually call skb_defer_free_flush()
 	 */
@@ -6515,11 +6521,8 @@ void skb_attempt_defer_free(struct sk_buff *skb)
 	WRITE_ONCE(sd->defer_list, skb);
 	sd->defer_count++;
 
-	/* kick every time queue length reaches 128.
-	 * This condition should hardly be hit under normal conditions,
-	 * unless cpu suddenly stopped to receive NIC interrupts.
-	 */
-	kick = sd->defer_count == 128;
+	/* Send an IPI every time queue reaches half capacity. */
+	kick = sd->defer_count == (defer_max >> 1);
 
 	spin_unlock_irqrestore(&sd->defer_lock, flags);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 195ca5c28771..ca8d38325e1e 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -578,6 +578,14 @@ static struct ctl_table net_core_table[] = {
 		.extra1		= SYSCTL_ONE,
 		.extra2		= &int_3600,
 	},
+	{
+		.procname	= "skb_defer_max",
+		.data		= &sysctl_skb_defer_max,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+	},
 	{ }
 };
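Two details of the sysctl patch deserve emphasis: the limit is read once per call with READ_ONCE(), so a concurrent sysctl write cannot be torn, and the IPI is requested exactly once, when the queue crosses half of the configured capacity. A userspace sketch of that thresholding logic (locking elided, all names hypothetical, not kernel code):

#include <stdbool.h>
#include <stdio.h>

static unsigned int skb_defer_max = 64;	/* mirrors the sysctl default */

/* Mirrors the enqueue side: refuse to queue past the limit, and
 * request an IPI exactly once, when half capacity is crossed.
 */
static bool defer_one(unsigned int *defer_count, bool *kick)
{
	unsigned int max = skb_defer_max;	/* READ_ONCE() in the kernel */

	if (*defer_count >= max)
		return false;			/* caller frees inline */

	(*defer_count)++;
	*kick = (*defer_count == (max >> 1));
	return true;
}

int main(void)
{
	unsigned int count = 0;
	bool kick;

	for (int i = 0; i < 70; i++) {
		if (!defer_one(&count, &kick))
			printf("slot %d: freed inline\n", i);
		else if (kick)
			printf("slot %d: kick IPI (count=%u)\n", i, count);
	}
	return 0;
}

Once the series is applied, the limit should be tunable at runtime as net.core.skb_defer_max; note that setting it to 0 effectively disables deferral, since every skb then takes the nodefer path.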
From 909876500251b3b48480a840bbf9053588254eee Mon Sep 17 00:00:00 2001
From: Eric Dumazet
Date: Sun, 15 May 2022 21:24:56 -0700
Subject: [PATCH 4/4] net: call skb_defer_free_flush() before each napi_poll()

skb_defer_free_flush() can consume cpu cycles, so it seems better to
call it in the inner loop:

- Potentially frees page/skb that will be reallocated while hot.
- Accounts for the cpu cycles in the @time_limit determination.
- Keeps softnet_data.defer_count small to reduce chances for
  skb_attempt_defer_free() to send an IPI.

Signed-off-by: Eric Dumazet
Signed-off-by: David S. Miller
---
 net/core/dev.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 6359f8953269..04fd056f7f74 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6655,6 +6655,8 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 	for (;;) {
 		struct napi_struct *n;
 
+		skb_defer_free_flush(sd);
+
 		if (list_empty(&list)) {
 			if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
 				goto end;
@@ -6684,8 +6686,7 @@ static __latent_entropy void net_rx_action(struct softirq_action *h)
 		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 
 	net_rps_action_and_irq_enable(sd);
-end:
-	skb_defer_free_flush(sd);
+end:;
 }
 
 struct netdev_adjacent {
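The shape of the resulting loop is the interesting part: the flush moves from the function's exit path (the old end: label) to the top of the budgeted for (;;) loop, so each pass frees memory while it is still cache-hot and the cycles spent are visible to the loop's own budget/time accounting. A toy model of that loop shape (poll and accounting faked, names hypothetical, not kernel code):

#include <stdio.h>

static int pending_defer;	/* stand-in for the per-cpu defer list */

static void flush_deferred(void)
{
	if (pending_defer) {
		printf("flushed %d deferred items\n", pending_defer);
		pending_defer = 0;
	}
}

/* Fake NAPI poll: each pass defers two items and costs one budget unit. */
static int napi_poll_once(void)
{
	pending_defer += 2;
	return 1;
}

int main(void)
{
	int budget = 4;

	for (;;) {
		/* As in the patch: flush at the top of the inner loop so
		 * freed memory is reused while hot and the work is charged
		 * against this pass's budget.
		 */
		flush_deferred();

		if (budget <= 0)
			break;
		budget -= napi_poll_once();
	}
	return 0;
}

Because the flush now runs at the top of every iteration, including the final one before the loop exits, the separate flush at the end: label becomes redundant, which is why the patch reduces the label to an empty statement.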