From 20b52a75166086a40d838397ef3db28a4f2c5998 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 29 Jun 2018 09:48:17 +0200 Subject: [PATCH 1/4] xsk: fix potential lost completion message in SKB path The code in xskq_produce_addr erroneously checked if there was up to LAZY_UPDATE_THRESHOLD amount of space in the completion queue. It only needs to check if there is one slot left in the queue. This bug could under some circumstances lead to a WARN_ON_ONCE being triggered and the completion message to user space being lost. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Magnus Karlsson Reported-by: Pavel Odintsov Signed-off-by: Alexei Starovoitov --- net/xdp/xsk_queue.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index ef6a6f0ec949..52ecaf770642 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -62,14 +62,9 @@ static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) return (entries > dcnt) ? dcnt : entries; } -static inline u32 xskq_nb_free_lazy(struct xsk_queue *q, u32 producer) -{ - return q->nentries - (producer - q->cons_tail); -} - static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) { - u32 free_entries = xskq_nb_free_lazy(q, producer); + u32 free_entries = q->nentries - (producer - q->cons_tail); if (free_entries >= dcnt) return free_entries; @@ -129,7 +124,7 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - if (xskq_nb_free(q, q->prod_tail, LAZY_UPDATE_THRESHOLD) == 0) + if (xskq_nb_free(q, q->prod_tail, 1) == 0) return -ENOSPC; ring->desc[q->prod_tail++ & q->ring_mask] = addr; From fe5886852601fb2593cbc5a7549ef9fd2ef481ba Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 29 Jun 2018 09:48:18 +0200 Subject: [PATCH 2/4] xsk: frame could be completed more than once in SKB path Fixed a bug in which a frame could be completed more than once when an error was returned from dev_direct_xmit(). The code erroneously retried sending the message leading to multiple calls to the SKB destructor and therefore multiple completions of the same buffer to user space. The error code in this case has been changed from EAGAIN to EBUSY in order to tell user space that the sending of the packet failed and the buffer has been return to user space through the completion queue. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Magnus Karlsson Reported-by: Pavel Odintsov Signed-off-by: Alexei Starovoitov --- net/xdp/xsk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 59fb7d3c36a3..15aca73805fc 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -268,15 +268,15 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, skb->destructor = xsk_destruct_skb; err = dev_direct_xmit(skb, xs->queue_id); + xskq_discard_desc(xs->tx); /* Ignore NET_XMIT_CN as packet might have been sent */ if (err == NET_XMIT_DROP || err == NETDEV_TX_BUSY) { - err = -EAGAIN; - /* SKB consumed by dev_direct_xmit() */ + /* SKB completed but not sent */ + err = -EBUSY; goto out; } sent_frame = true; - xskq_discard_desc(xs->tx); } out: From c03079c9d93d593d44524883b6b6127b21978e22 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 29 Jun 2018 09:48:19 +0200 Subject: [PATCH 3/4] samples/bpf: deal with EBUSY return code from sendmsg in xdpsock sample Sendmsg in the SKB path of AF_XDP can now return EBUSY when a packet was discarded and completed by the driver. Just ignore this message in the sample application. Fixes: b4b8faa1ded7 ("samples/bpf: sample application and documentation for AF_XDP sockets") Signed-off-by: Magnus Karlsson Reported-by: Pavel Odintsov Signed-off-by: Alexei Starovoitov --- samples/bpf/xdpsock_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/bpf/xdpsock_user.c b/samples/bpf/xdpsock_user.c index d69c8d78d3fd..5904b1543831 100644 --- a/samples/bpf/xdpsock_user.c +++ b/samples/bpf/xdpsock_user.c @@ -729,7 +729,7 @@ static void kick_tx(int fd) int ret; ret = sendto(fd, NULL, 0, MSG_DONTWAIT, NULL, 0); - if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN) + if (ret >= 0 || errno == ENOBUFS || errno == EAGAIN || errno == EBUSY) return; lassert(0); } From a9744f7ca200c756e6f8c65b633770a2da711651 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Fri, 29 Jun 2018 09:48:20 +0200 Subject: [PATCH 4/4] xsk: fix potential race in SKB TX completion code There is a potential race in the TX completion code for the SKB case. One process enters the sendmsg code of an AF_XDP socket in order to send a frame. The execution eventually trickles down to the driver that is told to send the packet. However, it decides to drop the packet due to some error condition (e.g., rings full) and frees the SKB. This will trigger the SKB destructor and a completion will be sent to the AF_XDP user space through its single-producer/single-consumer queues. At the same time a TX interrupt has fired on another core and it dispatches the TX completion code in the driver. It does its HW specific things and ends up freeing the SKB associated with the transmitted packet. This will trigger the SKB destructor and a completion will be sent to the AF_XDP user space through its single-producer/single-consumer queues. With a pseudo call stack, it would look like this: Core 1: sendmsg() being called in the application netdev_start_xmit() Driver entered through ndo_start_xmit Driver decides to free the SKB for some reason (e.g., rings full) Destructor of SKB called xskq_produce_addr() is called to signal completion to user space Core 2: TX completion irq NAPI loop Driver irq handler for TX completions Frees the SKB Destructor of SKB called xskq_produce_addr() is called to signal completion to user space We now have a violation of the single-producer/single-consumer principle for our queues as there are two threads trying to produce at the same time on the same queue. Fixed by introducing a spin_lock in the destructor. In regards to the performance, I get around 1.74 Mpps for txonly before and after the introduction of the spinlock. There is of course some impact due to the spin lock but it is in the less significant digits that are too noisy for me to measure. But let us say that the version without the spin lock got 1.745 Mpps in the best case and the version with 1.735 Mpps in the worst case, then that would mean a maximum drop in performance of 0.5%. Fixes: 35fcde7f8deb ("xsk: support for Tx") Signed-off-by: Magnus Karlsson Signed-off-by: Alexei Starovoitov --- include/net/xdp_sock.h | 4 ++++ net/xdp/xsk.c | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 9fe472f2ac95..7161856bcf9c 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -60,6 +60,10 @@ struct xdp_sock { bool zc; /* Protects multiple processes in the control path */ struct mutex mutex; + /* Mutual exclusion of NAPI TX thread and sendmsg error paths + * in the SKB destructor callback. + */ + spinlock_t tx_completion_lock; u64 rx_dropped; }; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 15aca73805fc..7d220cbd09b6 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -199,8 +199,11 @@ static void xsk_destruct_skb(struct sk_buff *skb) { u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg; struct xdp_sock *xs = xdp_sk(skb->sk); + unsigned long flags; + spin_lock_irqsave(&xs->tx_completion_lock, flags); WARN_ON_ONCE(xskq_produce_addr(xs->umem->cq, addr)); + spin_unlock_irqrestore(&xs->tx_completion_lock, flags); sock_wfree(skb); } @@ -755,6 +758,7 @@ static int xsk_create(struct net *net, struct socket *sock, int protocol, xs = xdp_sk(sk); mutex_init(&xs->mutex); + spin_lock_init(&xs->tx_completion_lock); local_bh_disable(); sock_prot_inuse_add(net, &xsk_proto, 1);