Merge branch 'veth-gro'

Paolo Abeni  says:

====================
veth: allow GRO even without XDP

This series allows the user-space to enable GRO/NAPI on a veth
device even without attaching an XDP program.

It does not change the default veth behavior (no NAPI, no GRO),
except that the GRO feature bit on top of this series will be
effectively off by default on veth devices. Note that currently
the GRO bit is on by default, but GRO never takes place in
absence of XDP.

On top of this series, setting the GRO feature bit enables NAPI
and allows the GRO to take place. The TSO features on the peer
device are preserved.

The main goal is improving UDP forwarding performances for
containers in a typical virtual network setup:

(container) veth -> veth peer -> bridge/ovs -> vxlan -> NIC

Enabling the NAPI threaded mode, GRO the NETIF_F_GRO_UDP_FWD
feature on the veth peer improves the UDP stream performance
with not void netfilter configuration by 2x factor with no
measurable overhead for TCP traffic: some heuristic ensures
that TCP will not go through the additional NAPI/GRO layer.

Some self-tests are added to check the expected behavior in
the default configuration, with XDP and with plain GRO enabled.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller 2021-04-11 16:39:28 -07:00
commit 23cfa4d4aa
3 changed files with 316 additions and 14 deletions

View File

@ -57,6 +57,7 @@ struct veth_rq_stats {
struct veth_rq {
struct napi_struct xdp_napi;
struct napi_struct __rcu *napi; /* points to xdp_napi when the latter is initialized */
struct net_device *dev;
struct bpf_prog __rcu *xdp_prog;
struct xdp_mem_info xdp_mem;
@ -293,13 +294,32 @@ static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
netif_rx(skb);
}
/* return true if the specified skb has chances of GRO aggregation
* Don't strive for accuracy, but try to avoid GRO overhead in the most
* common scenarios.
* When XDP is enabled, all traffic is considered eligible, as the xmit
* device has TSO off.
* When TSO is enabled on the xmit device, we are likely interested only
* in UDP aggregation, explicitly check for that if the skb is suspected
* - the sock_wfree destructor is used by UDP, ICMP and XDP sockets -
* to belong to locally generated UDP traffic.
*/
static bool veth_skb_is_eligible_for_gro(const struct net_device *dev,
const struct net_device *rcv,
const struct sk_buff *skb)
{
return !(dev->features & NETIF_F_ALL_TSO) ||
(skb->destructor == sock_wfree &&
rcv->features & (NETIF_F_GRO_FRAGLIST | NETIF_F_GRO_UDP_FWD));
}
static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
struct veth_rq *rq = NULL;
struct net_device *rcv;
int length = skb->len;
bool rcv_xdp = false;
bool use_napi = false;
int rxq;
rcu_read_lock();
@ -313,20 +333,26 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
rxq = skb_get_queue_mapping(skb);
if (rxq < rcv->real_num_rx_queues) {
rq = &rcv_priv->rq[rxq];
rcv_xdp = rcu_access_pointer(rq->xdp_prog);
/* The napi pointer is available when an XDP program is
* attached or when GRO is enabled
* Don't bother with napi/GRO if the skb can't be aggregated
*/
use_napi = rcu_access_pointer(rq->napi) &&
veth_skb_is_eligible_for_gro(dev, rcv, skb);
skb_record_rx_queue(skb, rxq);
}
skb_tx_timestamp(skb);
if (likely(veth_forward_skb(rcv, skb, rq, rcv_xdp) == NET_RX_SUCCESS)) {
if (!rcv_xdp)
if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) {
if (!use_napi)
dev_lstats_add(dev, length);
} else {
drop:
atomic64_inc(&priv->dropped);
}
if (rcv_xdp)
if (use_napi)
__veth_xdp_flush(rq);
rcu_read_unlock();
@ -686,7 +712,7 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
int mac_len, delta, off;
struct xdp_buff xdp;
skb_orphan(skb);
skb_orphan_partial(skb);
rcu_read_lock();
xdp_prog = rcu_dereference(rq->xdp_prog);
@ -903,7 +929,7 @@ static int veth_poll(struct napi_struct *napi, int budget)
return done;
}
static int veth_napi_add(struct net_device *dev)
static int __veth_napi_enable(struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
int err, i;
@ -920,6 +946,7 @@ static int veth_napi_add(struct net_device *dev)
struct veth_rq *rq = &priv->rq[i];
napi_enable(&rq->xdp_napi);
rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
}
return 0;
@ -938,6 +965,7 @@ static void veth_napi_del(struct net_device *dev)
for (i = 0; i < dev->real_num_rx_queues; i++) {
struct veth_rq *rq = &priv->rq[i];
rcu_assign_pointer(priv->rq[i].napi, NULL);
napi_disable(&rq->xdp_napi);
__netif_napi_del(&rq->xdp_napi);
}
@ -951,8 +979,14 @@ static void veth_napi_del(struct net_device *dev)
}
}
static bool veth_gro_requested(const struct net_device *dev)
{
return !!(dev->wanted_features & NETIF_F_GRO);
}
static int veth_enable_xdp(struct net_device *dev)
{
bool napi_already_on = veth_gro_requested(dev) && (dev->flags & IFF_UP);
struct veth_priv *priv = netdev_priv(dev);
int err, i;
@ -960,7 +994,8 @@ static int veth_enable_xdp(struct net_device *dev)
for (i = 0; i < dev->real_num_rx_queues; i++) {
struct veth_rq *rq = &priv->rq[i];
netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
if (!napi_already_on)
netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
err = xdp_rxq_info_reg(&rq->xdp_rxq, dev, i, rq->xdp_napi.napi_id);
if (err < 0)
goto err_rxq_reg;
@ -975,13 +1010,25 @@ static int veth_enable_xdp(struct net_device *dev)
rq->xdp_mem = rq->xdp_rxq.mem;
}
err = veth_napi_add(dev);
if (err)
goto err_rxq_reg;
if (!napi_already_on) {
err = __veth_napi_enable(dev);
if (err)
goto err_rxq_reg;
if (!veth_gro_requested(dev)) {
/* user-space did not require GRO, but adding XDP
* is supposed to get GRO working
*/
dev->features |= NETIF_F_GRO;
netdev_features_change(dev);
}
}
}
for (i = 0; i < dev->real_num_rx_queues; i++)
for (i = 0; i < dev->real_num_rx_queues; i++) {
rcu_assign_pointer(priv->rq[i].xdp_prog, priv->_xdp_prog);
rcu_assign_pointer(priv->rq[i].napi, &priv->rq[i].xdp_napi);
}
return 0;
err_reg_mem:
@ -991,7 +1038,8 @@ err_rxq_reg:
struct veth_rq *rq = &priv->rq[i];
xdp_rxq_info_unreg(&rq->xdp_rxq);
netif_napi_del(&rq->xdp_napi);
if (!napi_already_on)
netif_napi_del(&rq->xdp_napi);
}
return err;
@ -1004,7 +1052,19 @@ static void veth_disable_xdp(struct net_device *dev)
for (i = 0; i < dev->real_num_rx_queues; i++)
rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
veth_napi_del(dev);
if (!netif_running(dev) || !veth_gro_requested(dev)) {
veth_napi_del(dev);
/* if user-space did not require GRO, since adding XDP
* enabled it, clear it now
*/
if (!veth_gro_requested(dev) && netif_running(dev)) {
dev->features &= ~NETIF_F_GRO;
netdev_features_change(dev);
}
}
for (i = 0; i < dev->real_num_rx_queues; i++) {
struct veth_rq *rq = &priv->rq[i];
@ -1013,6 +1073,29 @@ static void veth_disable_xdp(struct net_device *dev)
}
}
static int veth_napi_enable(struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
int err, i;
for (i = 0; i < dev->real_num_rx_queues; i++) {
struct veth_rq *rq = &priv->rq[i];
netif_napi_add(dev, &rq->xdp_napi, veth_poll, NAPI_POLL_WEIGHT);
}
err = __veth_napi_enable(dev);
if (err) {
for (i = 0; i < dev->real_num_rx_queues; i++) {
struct veth_rq *rq = &priv->rq[i];
netif_napi_del(&rq->xdp_napi);
}
return err;
}
return err;
}
static int veth_open(struct net_device *dev)
{
struct veth_priv *priv = netdev_priv(dev);
@ -1026,6 +1109,10 @@ static int veth_open(struct net_device *dev)
err = veth_enable_xdp(dev);
if (err)
return err;
} else if (veth_gro_requested(dev)) {
err = veth_napi_enable(dev);
if (err)
return err;
}
if (peer->flags & IFF_UP) {
@ -1047,6 +1134,8 @@ static int veth_close(struct net_device *dev)
if (priv->_xdp_prog)
veth_disable_xdp(dev);
else if (veth_gro_requested(dev))
veth_napi_del(dev);
return 0;
}
@ -1145,10 +1234,32 @@ static netdev_features_t veth_fix_features(struct net_device *dev,
if (peer_priv->_xdp_prog)
features &= ~NETIF_F_GSO_SOFTWARE;
}
if (priv->_xdp_prog)
features |= NETIF_F_GRO;
return features;
}
static int veth_set_features(struct net_device *dev,
netdev_features_t features)
{
netdev_features_t changed = features ^ dev->features;
struct veth_priv *priv = netdev_priv(dev);
int err;
if (!(changed & NETIF_F_GRO) || !(dev->flags & IFF_UP) || priv->_xdp_prog)
return 0;
if (features & NETIF_F_GRO) {
err = veth_napi_enable(dev);
if (err)
return err;
} else {
veth_napi_del(dev);
}
return 0;
}
static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
{
struct veth_priv *peer_priv, *priv = netdev_priv(dev);
@ -1267,6 +1378,7 @@ static const struct net_device_ops veth_netdev_ops = {
#endif
.ndo_get_iflink = veth_get_iflink,
.ndo_fix_features = veth_fix_features,
.ndo_set_features = veth_set_features,
.ndo_features_check = passthru_features_check,
.ndo_set_rx_headroom = veth_set_rx_headroom,
.ndo_bpf = veth_xdp,
@ -1329,6 +1441,13 @@ static int veth_validate(struct nlattr *tb[], struct nlattr *data[],
static struct rtnl_link_ops veth_link_ops;
static void veth_disable_gro(struct net_device *dev)
{
dev->features &= ~NETIF_F_GRO;
dev->wanted_features &= ~NETIF_F_GRO;
netdev_update_features(dev);
}
static int veth_newlink(struct net *src_net, struct net_device *dev,
struct nlattr *tb[], struct nlattr *data[],
struct netlink_ext_ack *extack)
@ -1401,6 +1520,10 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
if (err < 0)
goto err_register_peer;
/* keep GRO disabled by default to be consistent with the established
* veth behavior
*/
veth_disable_gro(peer);
netif_carrier_off(peer);
err = rtnl_configure_link(peer, ifmp);
@ -1438,6 +1561,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
priv = netdev_priv(peer);
rcu_assign_pointer(priv->peer, dev);
veth_disable_gro(dev);
return 0;
err_register_dev:

View File

@ -24,6 +24,7 @@ TEST_PROGS += vrf_route_leaking.sh
TEST_PROGS += bareudp.sh
TEST_PROGS += unicast_extensions.sh
TEST_PROGS += udpgro_fwd.sh
TEST_PROGS += veth.sh
TEST_PROGS_EXTENDED := in_netns.sh
TEST_GEN_FILES = socket nettest
TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any

View File

@ -0,0 +1,177 @@
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0
readonly STATS="$(mktemp -p /tmp ns-XXXXXX)"
readonly BASE=`basename $STATS`
readonly SRC=2
readonly DST=1
readonly DST_NAT=100
readonly NS_SRC=$BASE$SRC
readonly NS_DST=$BASE$DST
# "baremetal" network used for raw UDP traffic
readonly BM_NET_V4=192.168.1.
readonly BM_NET_V6=2001:db8::
readonly NPROCS=`nproc`
ret=0
cleanup() {
local ns
local -r jobs="$(jobs -p)"
[ -n "${jobs}" ] && kill -1 ${jobs} 2>/dev/null
rm -f $STATS
for ns in $NS_SRC $NS_DST; do
ip netns del $ns 2>/dev/null
done
}
trap cleanup EXIT
create_ns() {
local ns
for ns in $NS_SRC $NS_DST; do
ip netns add $ns
ip -n $ns link set dev lo up
done
ip link add name veth$SRC type veth peer name veth$DST
for ns in $SRC $DST; do
ip link set dev veth$ns netns $BASE$ns up
ip -n $BASE$ns addr add dev veth$ns $BM_NET_V4$ns/24
ip -n $BASE$ns addr add dev veth$ns $BM_NET_V6$ns/64 nodad
done
echo "#kernel" > $BASE
chmod go-rw $BASE
}
__chk_flag() {
local msg="$1"
local target=$2
local expected=$3
local flagname=$4
local flag=`ip netns exec $BASE$target ethtool -k veth$target |\
grep $flagname | awk '{print $2}'`
printf "%-60s" "$msg"
if [ "$flag" = "$expected" ]; then
echo " ok "
else
echo " fail - expected $expected found $flag"
ret=1
fi
}
chk_gro_flag() {
__chk_flag "$1" $2 $3 generic-receive-offload
}
chk_tso_flag() {
__chk_flag "$1" $2 $3 tcp-segmentation-offload
}
chk_gro() {
local msg="$1"
local expected=$2
ip netns exec $BASE$SRC ping -qc 1 $BM_NET_V4$DST >/dev/null
NSTAT_HISTORY=$STATS ip netns exec $NS_DST nstat -n
printf "%-60s" "$msg"
ip netns exec $BASE$DST ./udpgso_bench_rx -C 1000 -R 10 &
local spid=$!
sleep 0.1
ip netns exec $NS_SRC ./udpgso_bench_tx -4 -s 13000 -S 1300 -M 1 -D $BM_NET_V4$DST
local retc=$?
wait $spid
local rets=$?
if [ ${rets} -ne 0 ] || [ ${retc} -ne 0 ]; then
echo " fail client exit code $retc, server $rets"
ret=1
return
fi
local pkts=`NSTAT_HISTORY=$STATS ip netns exec $NS_DST nstat IpInReceives | \
awk '{print $2}' | tail -n 1`
if [ "$pkts" = "$expected" ]; then
echo " ok "
else
echo " fail - got $pkts packets, expected $expected "
ret=1
fi
}
if [ ! -f ../bpf/xdp_dummy.o ]; then
echo "Missing xdp_dummy helper. Build bpf selftest first"
exit -1
fi
create_ns
chk_gro_flag "default - gro flag" $SRC off
chk_gro_flag " - peer gro flag" $DST off
chk_tso_flag " - tso flag" $SRC on
chk_tso_flag " - peer tso flag" $DST on
chk_gro " - aggregation" 1
ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off
chk_gro " - aggregation with TSO off" 10
cleanup
create_ns
ip netns exec $NS_DST ethtool -K veth$DST gro on
chk_gro_flag "with gro on - gro flag" $DST on
chk_gro_flag " - peer gro flag" $SRC off
chk_tso_flag " - tso flag" $SRC on
chk_tso_flag " - peer tso flag" $DST on
ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off
ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
chk_gro " - aggregation with TSO off" 1
cleanup
create_ns
ip -n $NS_DST link set dev veth$DST down
ip netns exec $NS_DST ethtool -K veth$DST gro on
chk_gro_flag "with gro enabled on link down - gro flag" $DST on
chk_gro_flag " - peer gro flag" $SRC off
chk_tso_flag " - tso flag" $SRC on
chk_tso_flag " - peer tso flag" $DST on
ip -n $NS_DST link set dev veth$DST up
ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off
ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
chk_gro " - aggregation with TSO off" 1
cleanup
create_ns
ip -n $NS_DST link set dev veth$DST xdp object ../bpf/xdp_dummy.o section xdp_dummy 2>/dev/null
chk_gro_flag "with xdp attached - gro flag" $DST on
chk_gro_flag " - peer gro flag" $SRC off
chk_tso_flag " - tso flag" $SRC off
chk_tso_flag " - peer tso flag" $DST on
ip netns exec $NS_DST ethtool -K veth$DST rx-udp-gro-forwarding on
chk_gro " - aggregation" 1
ip -n $NS_DST link set dev veth$DST down
ip -n $NS_SRC link set dev veth$SRC down
chk_gro_flag " - after dev off, flag" $DST on
chk_gro_flag " - peer flag" $SRC off
ip netns exec $NS_DST ethtool -K veth$DST gro on
ip -n $NS_DST link set dev veth$DST xdp off
chk_gro_flag " - after gro on xdp off, gro flag" $DST on
chk_gro_flag " - peer gro flag" $SRC off
chk_tso_flag " - tso flag" $SRC on
chk_tso_flag " - peer tso flag" $DST on
ip -n $NS_DST link set dev veth$DST up
ip -n $NS_SRC link set dev veth$SRC up
chk_gro " - aggregation" 1
ip netns exec $NS_DST ethtool -K veth$DST gro off
ip netns exec $NS_SRC ethtool -K veth$SRC tx-udp-segmentation off
chk_gro "aggregation again with default and TSO off" 10
exit $ret