From c75fb320d482a5ce6e522378d137fd2c3bf79225 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 9 Apr 2021 13:04:37 +0200 Subject: [PATCH 1/4] veth: use skb_orphan_partial instead of skb_orphan As described by commit 9c4c325252c5 ("skbuff: preserve sock reference when scrubbing the skb."), orphaning a skb in the TX path will cause OoO. Let's use skb_orphan_partial() instead of skb_orphan(), so that we keep the sk around for queue's selection sake and we still avoid the problem fixed with commit 4bf9ffa0fb57 ("veth: Orphan skb before GRO") Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- drivers/net/veth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 9e525646df1d..ce085659d55f 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -686,7 +686,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq, int mac_len, delta, off; struct xdp_buff xdp; - skb_orphan(skb); + skb_orphan_partial(skb); rcu_read_lock(); xdp_prog = rcu_dereference(rq->xdp_prog); From d3256efd8e8b234a6251e4d4580bd2c3c31fdc4c Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 9 Apr 2021 13:04:38 +0200 Subject: [PATCH 2/4] veth: allow enabling NAPI even without XDP Currently the veth device has the GRO feature bit set, even if no GRO aggregation is possible with the default configuration, as the veth device does not hook into the GRO engine. Flipping the GRO feature bit from user-space is a no-op, unless XDP is enabled. In such scenario GRO could actually take place, but TSO is forced to off on the peer device. This change allow user-space to really control the GRO feature, with no need for an XDP program. The GRO feature bit is now cleared by default - so that there are no user-visible behavior changes with the default configuration. When the GRO bit is set, the per-queue NAPI instances are initialized and registered. On xmit, when napi instances are available, we try to use them. Some additional checks are in place to ensure we initialize/delete NAPIs only when needed in case of overlapping XDP and GRO configuration changes. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- drivers/net/veth.c | 129 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 116 insertions(+), 13 deletions(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index ce085659d55f..c95c67aa52d2 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -57,6 +57,7 @@ struct veth_rq_stats { struct veth_rq { struct napi_struct xdp_napi; + struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */ struct net_device *dev; struct bpf_prog __rcu *xdp_prog; struct xdp_mem_info xdp_mem; @@ -299,7 +300,7 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) struct veth_rq *rq = NULL; struct net_device *rcv; int length = skb->len; - bool rcv_xdp = false; + bool use_napi = false; int rxq; rcu_read_lock(); @@ -313,20 +314,24 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) rxq = skb_get_queue_mapping(skb); if (rxq < rcv->real_num_rx_queues) { rq = &rcv_priv->rq[rxq]; - rcv_xdp = rcu_access_pointer(rq->xdp_prog); + + /* The napi pointer is available when an XDP program is + * attached or when GRO is enabled + */ + use_napi = rcu_access_pointer(rq->napi); skb_record_rx_queue(skb, rxq); } skb_tx_timestamp(skb); - if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) { - if (!rcv_xdp) + if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) { + if (!use_napi) dev_lstats_add(dev, length); } else { drop: atomic64_inc(&priv->dropped); } - if (rcv_xdp) + if (use_napi) __veth_xdp_flush(rq); rcu_read_unlock(); @@ -903,7 +908,7 @@ static int veth_poll(struct napi_struct *napi, int budget) return done; } -static int veth_napi_add(struct net_device *dev) +static int __veth_napi_enable(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); int err, i; @@ -920,6 +925,7 @@ static int veth_napi_add(struct net_device *dev) struct veth_rq *rq = &priv->rq[i]; napi_enable(&rq->xdp_napi); + rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); } return 0; @@ -938,6 +944,7 @@ static void veth_napi_del(struct net_device *dev) for (i = 0; i < dev->real_num_rx_queues; i++) { struct veth_rq *rq = &priv->rq[i]; + rcu_assign_pointer(priv->rq[i].napi, NULL); napi_disable(&rq->xdp_napi); __netif_napi_del(&rq->xdp_napi); } @@ -951,8 +958,14 @@ static void veth_napi_del(struct net_device *dev) } } +static bool veth_gro_requested(const struct net_device *dev) +{ + return !!(dev->wanted_features & NETIF_F_GRO); +} + static int veth_enable_xdp(struct net_device *dev) { + bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP); struct veth_priv *priv = netdev_priv(dev); int err, i; @@ -960,7 +973,8 @@ static int veth_enable_xdp(struct net_device *dev) for (i = 0; i < dev->real_num_rx_queues; i++) { struct veth_rq *rq = &priv->rq[i]; - netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); + if (!napi_already_on) + netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id); if (err < 0) goto err_rxq_reg; @@ -975,13 +989,25 @@ static int veth_enable_xdp(struct net_device *dev) rq->xdp_mem = rq->xdp_rxq.mem; } - err = veth_napi_add(dev); - if (err) - goto err_rxq_reg; + if (!napi_already_on) { + err = __veth_napi_enable(dev); + if (err) + goto err_rxq_reg; + + if (!veth_gro_requested(dev)) { + /* user-space did not require GRO, but adding XDP + * is supposed to get GRO working + */ + dev->features |= NETIF_F_GRO; + netdev_features_change(dev); + } + } } - for (i = 0; i < dev->real_num_rx_queues; i++) + for (i = 0; i < dev->real_num_rx_queues; i++) { rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog); + rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi); + } return 0; err_reg_mem: @@ -991,7 +1017,8 @@ err_rxq_reg: struct veth_rq *rq = &priv->rq[i]; xdp_rxq_info_unreg(&rq->xdp_rxq); - netif_napi_del(&rq->xdp_napi); + if (!napi_already_on) + netif_napi_del(&rq->xdp_napi); } return err; @@ -1004,7 +1031,19 @@ static void veth_disable_xdp(struct net_device *dev) for (i = 0; i < dev->real_num_rx_queues; i++) rcu_assign_pointer(priv->rq[i].xdp_prog, NULL); - veth_napi_del(dev); + + if (!netif_running(dev) || !veth_gro_requested(dev)) { + veth_napi_del(dev); + + /* if user-space did not require GRO, since adding XDP + * enabled it, clear it now + */ + if (!veth_gro_requested(dev) && netif_running(dev)) { + dev->features &= ~NETIF_F_GRO; + netdev_features_change(dev); + } + } + for (i = 0; i < dev->real_num_rx_queues; i++) { struct veth_rq *rq = &priv->rq[i]; @@ -1013,6 +1052,29 @@ static void veth_disable_xdp(struct net_device *dev) } } +static int veth_napi_enable(struct net_device *dev) +{ + struct veth_priv *priv = netdev_priv(dev); + int err, i; + + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct veth_rq *rq = &priv->rq[i]; + + netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT); + } + + err = __veth_napi_enable(dev); + if (err) { + for (i = 0; i < dev->real_num_rx_queues; i++) { + struct veth_rq *rq = &priv->rq[i]; + + netif_napi_del(&rq->xdp_napi); + } + return err; + } + return err; +} + static int veth_open(struct net_device *dev) { struct veth_priv *priv = netdev_priv(dev); @@ -1026,6 +1088,10 @@ static int veth_open(struct net_device *dev) err = veth_enable_xdp(dev); if (err) return err; + } else if (veth_gro_requested(dev)) { + err = veth_napi_enable(dev); + if (err) + return err; } if (peer->flags & IFF_UP) { @@ -1047,6 +1113,8 @@ static int veth_close(struct net_device *dev) if (priv->_xdp_prog) veth_disable_xdp(dev); + else if (veth_gro_requested(dev)) + veth_napi_del(dev); return 0; } @@ -1145,10 +1213,32 @@ static netdev_features_t veth_fix_features(struct net_device *dev, if (peer_priv->_xdp_prog) features &= ~NETIF_F_GSO_SOFTWARE; } + if (priv->_xdp_prog) + features |= NETIF_F_GRO; return features; } +static int veth_set_features(struct net_device *dev, + netdev_features_t features) +{ + netdev_features_t changed = features ^ dev->features; + struct veth_priv *priv = netdev_priv(dev); + int err; + + if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog) + return 0; + + if (features & NETIF_F_GRO) { + err = veth_napi_enable(dev); + if (err) + return err; + } else { + veth_napi_del(dev); + } + return 0; +} + static void veth_set_rx_headroom(struct net_device *dev, int new_hr) { struct veth_priv *peer_priv, *priv = netdev_priv(dev); @@ -1267,6 +1357,7 @@ static const struct net_device_ops veth_netdev_ops = { #endif .ndo_get_iflink = veth_get_iflink, .ndo_fix_features = veth_fix_features, + .ndo_set_features = veth_set_features, .ndo_features_check = passthru_features_check, .ndo_set_rx_headroom = veth_set_rx_headroom, .ndo_bpf = veth_xdp, @@ -1329,6 +1420,13 @@ static int veth_validate(struct nlattr *tb[], struct nlattr *data[], static struct rtnl_link_ops veth_link_ops; +static void veth_disable_gro(struct net_device *dev) +{ + dev->features &= ~NETIF_F_GRO; + dev->wanted_features &= ~NETIF_F_GRO; + netdev_update_features(dev); +} + static int veth_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) @@ -1401,6 +1499,10 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, if (err < 0) goto err_register_peer; + /* keep GRO disabled by default to be consistent with the established + * veth behavior + */ + veth_disable_gro(peer); netif_carrier_off(peer); err = rtnl_configure_link(peer, ifmp); @@ -1438,6 +1540,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, priv = netdev_priv(peer); rcu_assign_pointer(priv->peer, dev); + veth_disable_gro(dev); return 0; err_register_dev: From 47e550e0105be9b716a3860545731735a67c6b3c Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 9 Apr 2021 13:04:39 +0200 Subject: [PATCH 3/4] veth: refine napi usage After the previous patch, when enabling GRO, locally generated TCP traffic experiences some measurable overhead, as it traverses the GRO engine without any chance of aggregation. This change refine the NAPI receive path admission test, to avoid unnecessary GRO overhead in most scenarios, when GRO is enabled on a veth peer. Only skbs that are eligible for aggregation enter the GRO layer, the others will go through the traditional receive path. Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- drivers/net/veth.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/drivers/net/veth.c b/drivers/net/veth.c index c95c67aa52d2..15b2e3923c47 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -294,6 +294,25 @@ static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb, netif_rx(skb); } +/* return true if the specified skb has chances of GRO aggregation + * Don't strive for accuracy, but try to avoid GRO overhead in the most + * common scenarios. + * When XDP is enabled, all traffic is considered eligible, as the xmit + * device has TSO off. + * When TSO is enabled on the xmit device, we are likely interested only + * in UDP aggregation, explicitly check for that if the skb is suspected + * - the sock_wfree destructor is used by UDP, ICMP and XDP sockets - + * to belong to locally generated UDP traffic. + */ +static bool veth_skb_is_eligible_for_gro(const struct net_device *dev, + const struct net_device *rcv, + const struct sk_buff *skb) +{ + return !(dev->features & NETIF_F_ALL_TSO) || + (skb->destructor == sock_wfree && + rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD)); +} + static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) { struct veth_priv *rcv_priv, *priv = netdev_priv(dev); @@ -317,8 +336,10 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev) /* The napi pointer is available when an XDP program is * attached or when GRO is enabled + * Don't bother with napi/GRO if the skb can't be aggregated */ - use_napi = rcu_access_pointer(rq->napi); + use_napi = rcu_access_pointer(rq->napi) && + veth_skb_is_eligible_for_gro(dev, rcv, skb); skb_record_rx_queue(skb, rxq); } From 1c3cadbe02420e6c85251c416a78a16f17761231 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 9 Apr 2021 13:04:40 +0200 Subject: [PATCH 4/4] self-tests: add veth tests Add some basic veth tests, that verify the expected flags and aggregation with different setups (default, xdp, etc...) Signed-off-by: Paolo Abeni Signed-off-by: David S. Miller --- tools/testing/selftests/net/Makefile | 1 + tools/testing/selftests/net/veth.sh | 177 +++++++++++++++++++++++++++ 2 files changed, 178 insertions(+) create mode 100755 tools/testing/selftests/net/veth.sh diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 2d71b283dde3..f4242a961088 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -24,6 +24,7 @@ TEST_PROGS += vrf_route_leaking.sh TEST_PROGS += bareudp.sh TEST_PROGS += unicast_extensions.sh TEST_PROGS += udpgro_fwd.sh +TEST_PROGS += veth.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any diff --git a/tools/testing/selftests/net/veth.sh b/tools/testing/selftests/net/veth.sh new file mode 100755 index 000000000000..2fedc0781ce8 --- /dev/null +++ b/tools/testing/selftests/net/veth.sh @@ -0,0 +1,177 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0 + +readonly STATS="$(mktemp -p /tmp ns-XXXXXX)" +readonly BASE=`basename $STATS` +readonly SRC=2 +readonly DST=1 +readonly DST_NAT=100 +readonly NS_SRC=$BASE$SRC +readonly NS_DST=$BASE$DST + +# "baremetal" network used for raw UDP traffic +readonly BM_NET_V4=192.168.1. +readonly BM_NET_V6=2001:db8:: + +readonly NPROCS=`nproc` +ret=0 + +cleanup() { + local ns + local -r jobs="$(jobs -p)" + [ -n "${jobs}" ] && kill -1 ${jobs} 2>/dev/null + rm -f $STATS + + for ns in $NS_SRC $NS_DST; do + ip netns del $ns 2>/dev/null + done +} + +trap cleanup EXIT + +create_ns() { + local ns + + for ns in $NS_SRC $NS_DST; do + ip netns add $ns + ip -n $ns link set dev lo up + done + + ip link add name veth$SRC type veth peer name veth$DST + + for ns in $SRC $DST; do + ip link set dev veth$ns netns $BASE$ns up + ip -n $BASE$ns addr add dev veth$ns $BM_NET_V4$ns/24 + ip -n $BASE$ns addr add dev veth$ns $BM_NET_V6$ns/64 nodad + done + echo "#kernel" > $BASE + chmod go-rw $BASE +} + +__chk_flag() { + local msg="$1" + local target=$2 + local expected=$3 + local flagname=$4 + + local flag=`ip netns exec $BASE$target ethtool -k veth$target |\ + grep $flagname | awk '{print $2}'` + + printf "%-60s" "$msg" + if [ "$flag" = "$expected" ]; then + echo " ok " + else + echo " fail - expected $expected found $flag" + ret=1 + fi +} + +chk_gro_flag() { + __chk_flag "$1" $2 $3 generic-receive-offload +} + +chk_tso_flag() { + __chk_flag "$1" $2 $3 tcp-segmentation-offload +} + +chk_gro() { + local msg="$1" + local expected=$2 + + ip netns exec $BASE$SRC ping -qc 1 $BM_NET_V4$DST >/dev/null + NSTAT_HISTORY=$STATS ip netns exec $NS_DST nstat -n + + printf "%-60s" "$msg" + ip netns exec $BASE$DST ./udpgso_bench_rx -C 1000 -R 10 & + local spid=$! + sleep 0.1 + + ip netns exec $NS_SRC ./udpgso_bench_tx -4 -s 13000 -S 1300 -M 1 -D $BM_NET_V4$DST + local retc=$? + wait $spid + local rets=$? + if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then + echo " fail client exit code $retc, server $rets" + ret=1 + return + fi + + local pkts=`NSTAT_HISTORY=$STATS ip netns exec $NS_DST nstat IpInReceives | \ + awk '{print $2}' | tail -n 1` + if [ "$pkts" = "$expected" ]; then + echo " ok " + else + echo " fail - got $pkts packets, expected $expected " + ret=1 + fi +} + +if [ ! -f ../bpf/xdp_dummy.o ]; then + echo "Missing xdp_dummy helper. Build bpf selftest first" + exit -1 +fi + +create_ns +chk_gro_flag "default - gro flag" $SRC off +chk_gro_flag " - peer gro flag" $DST off +chk_tso_flag " - tso flag" $SRC on +chk_tso_flag " - peer tso flag" $DST on +chk_gro " - aggregation" 1 +ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off +chk_gro " - aggregation with TSO off" 10 +cleanup + +create_ns +ip netns exec $NS_DST ethtool -K veth$DST gro on +chk_gro_flag "with gro on - gro flag" $DST on +chk_gro_flag " - peer gro flag" $SRC off +chk_tso_flag " - tso flag" $SRC on +chk_tso_flag " - peer tso flag" $DST on +ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off +ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on +chk_gro " - aggregation with TSO off" 1 +cleanup + +create_ns +ip -n $NS_DST link set dev veth$DST down +ip netns exec $NS_DST ethtool -K veth$DST gro on +chk_gro_flag "with gro enabled on link down - gro flag" $DST on +chk_gro_flag " - peer gro flag" $SRC off +chk_tso_flag " - tso flag" $SRC on +chk_tso_flag " - peer tso flag" $DST on +ip -n $NS_DST link set dev veth$DST up +ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off +ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on +chk_gro " - aggregation with TSO off" 1 +cleanup + +create_ns +ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o section xdp_dummy 2>/dev/null +chk_gro_flag "with xdp attached - gro flag" $DST on +chk_gro_flag " - peer gro flag" $SRC off +chk_tso_flag " - tso flag" $SRC off +chk_tso_flag " - peer tso flag" $DST on +ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on +chk_gro " - aggregation" 1 + + +ip -n $NS_DST link set dev veth$DST down +ip -n $NS_SRC link set dev veth$SRC down +chk_gro_flag " - after dev off, flag" $DST on +chk_gro_flag " - peer flag" $SRC off + +ip netns exec $NS_DST ethtool -K veth$DST gro on +ip -n $NS_DST link set dev veth$DST xdp off +chk_gro_flag " - after gro on xdp off, gro flag" $DST on +chk_gro_flag " - peer gro flag" $SRC off +chk_tso_flag " - tso flag" $SRC on +chk_tso_flag " - peer tso flag" $DST on +ip -n $NS_DST link set dev veth$DST up +ip -n $NS_SRC link set dev veth$SRC up +chk_gro " - aggregation" 1 + +ip netns exec $NS_DST ethtool -K veth$DST gro off +ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off +chk_gro "aggregation again with default and TSO off" 10 + +exit $ret