Merge branch 'vxlan-vnifiltering'

Roopa Prabhu says:

====================
vxlan metadata device vnifiltering support

This series adds vnifiltering support to the vxlan collect metadata device.

Motivation:
Today you can use only a single vxlan collect metadata device for a given
vxlan UDP port in the system. The vxlan collect metadata device
terminates all received vxlan packets. As shown in the diagram below,
there are use cases where you need multiple such vxlan devices in
independent bridge domains. Each vxlan device must terminate only the
VNIs it is configured for.
Example use case: a service provider network typically supports multiple
bridge domains with overlapping VLANs, one bridge domain per customer.
VLANs in each bridge domain are mapped to globally unique VNI ranges
assigned to each customer.

This series adds vnifiltering support to collect metadata devices so they
terminate only their configured VNIs. This is similar to VLAN filtering in
the bridge driver. The VNI filtering capability is enabled by a new flag
on the collect metadata device.
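
To make the matching rule concrete, here is a simplified sketch of the
receive-side check this flag introduces (paraphrasing the
vxlan_vs_find_vni() change in the patches below; plain collect metadata
devices still map every received packet to VNI 0 first):

/* Does this device terminate the given VNI? (simplified sketch) */
static bool vxlan_dev_accepts_vni(struct vxlan_dev *vxlan, __be32 vni)
{
	if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
		/* vnifiltering device: terminate only VNIs that were
		 * explicitly added to its filter list
		 */
		return vxlan_vnifilter_lookup(vxlan, vni) != NULL;

	/* traditional device: terminate only its configured VNI */
	return vxlan->default_dst.remote_vni == vni;
}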

In the diagram below:
	- customer1 is mapped to the br1 bridge domain
	- customer2 is mapped to the br2 bridge domain
	- customer1 VLANs 10-11 are mapped to VNIs 1001-1002
	- customer2 VLANs 10-11 are mapped to VNIs 2001-2002
	- br1 and br2 are VLAN filtering bridges
	- vxlan1 and vxlan2 are collect metadata devices with
	  vnifiltering enabled

┌──────────────────────────────────────────────────────────────────┐
│  switch                                                          │
│                                                                  │
│         ┌───────────┐                 ┌───────────┐              │
│         │           │                 │           │              │
│         │   br1     │                 │   br2     │              │
│         └┬─────────┬┘                 └──┬───────┬┘              │
│     vlans│         │               vlans │       │               │
│     10,11│         │                10,11│       │               │
│          │     vlanvnimap:               │    vlanvnimap:        │
│          │       10-1001,11-1002         │      10-2001,11-2002  │
│          │         │                     │       │               │
│   ┌──────┴┐     ┌──┴─────────┐       ┌───┴────┐  │               │
│   │ swp1  │     │vxlan1      │       │ swp2   │ ┌┴─────────────┐ │
│   │       │     │  vnifilter:│       │        │ │vxlan2        │ │
│   └───┬───┘     │   1001,1002│       └───┬────┘ │ vnifilter:   │ │
│       │         └────────────┘           │      │  2001,2002   │ │
│       │                                  │      └──────────────┘ │
│       │                                  │                       │
└───────┼──────────────────────────────────┼───────────────────────┘
        │                                  │
        │                                  │
  ┌─────┴───────┐                          │
  │  customer1  │                    ┌─────┴──────┐
  │ host/VM     │                    │customer2   │
  └─────────────┘                    │ host/VM    │
                                     └────────────┘

v2:
  - remove stale xstats declarations, pointed out by Nikolay Aleksandrov
  - squash the selinux patch into the tunnel API patch, as pointed out by
    Benjamin Poirier
  - Fix various build issues:
	Reported-by: kernel test robot <lkp@intel.com>

v3:
  - incorporate review feedback from Jakub
	- move rhashtable declarations to c file
	- define and use netlink policy for top level vxlan filter api
	- fix unused stats function warning
	- pass vninode from vnifilter lookup into stats count function
		to avoid another lookup (only applicable to vxlan_rcv)
	- fix missing vxlan vni delete notifications in vnifilter uninit
	  function
	- misc cleanups
  - add remote dev check for multicast groups added via the vnifiltering API
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
Committer: David S. Miller <davem@davemloft.net>
Date: 2022-03-01 08:38:02 +00:00
Commit: 1e385c0824
11 changed files with 2307 additions and 265 deletions

drivers/net/Makefile

@@ -31,7 +31,7 @@ obj-$(CONFIG_TUN) += tun.o
obj-$(CONFIG_TAP) += tap.o
obj-$(CONFIG_VETH) += veth.o
obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
obj-$(CONFIG_VXLAN) += vxlan.o
obj-$(CONFIG_VXLAN) += vxlan/
obj-$(CONFIG_GENEVE) += geneve.o
obj-$(CONFIG_BAREUDP) += bareudp.o
obj-$(CONFIG_GTP) += gtp.o

drivers/net/vxlan/Makefile

@@ -0,0 +1,7 @@
#
# Makefile for the vxlan driver
#
obj-$(CONFIG_VXLAN) += vxlan.o
vxlan-objs := vxlan_core.o vxlan_multicast.o vxlan_vnifilter.o

drivers/net/vxlan/vxlan_core.c

@@ -34,10 +34,10 @@
#include <net/ip6_checksum.h>
#endif
#include "vxlan_private.h"
#define VXLAN_VERSION "0.1"
#define PORT_HASH_BITS 8
#define PORT_HASH_SIZE (1<<PORT_HASH_BITS)
#define FDB_AGE_DEFAULT 300 /* 5 min */
#define FDB_AGE_INTERVAL (10 * HZ) /* rescan interval */
@@ -53,41 +53,15 @@ static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
static unsigned int vxlan_net_id;
static struct rtnl_link_ops vxlan_link_ops;
unsigned int vxlan_net_id;
static const u8 all_zeros_mac[ETH_ALEN + 2];
const u8 all_zeros_mac[ETH_ALEN + 2];
static struct rtnl_link_ops vxlan_link_ops;
static int vxlan_sock_add(struct vxlan_dev *vxlan);
static void vxlan_vs_del_dev(struct vxlan_dev *vxlan);
/* per-network namespace private data for this module */
struct vxlan_net {
struct list_head vxlan_list;
struct hlist_head sock_list[PORT_HASH_SIZE];
spinlock_t sock_lock;
struct notifier_block nexthop_notifier_block;
};
/* Forwarding table entry */
struct vxlan_fdb {
struct hlist_node hlist; /* linked list of entries */
struct rcu_head rcu;
unsigned long updated; /* jiffies */
unsigned long used;
struct list_head remotes;
u8 eth_addr[ETH_ALEN];
u16 state; /* see ndm_state */
__be32 vni;
u16 flags; /* see ndm_flags and below */
struct list_head nh_list;
struct nexthop __rcu *nh;
struct vxlan_dev __rcu *vdev;
};
#define NTF_VXLAN_ADDED_BY_USER 0x100
/* salt for hash table */
static u32 vxlan_salt __read_mostly;
@@ -98,17 +72,6 @@ static inline bool vxlan_collect_metadata(struct vxlan_sock *vs)
}
#if IS_ENABLED(CONFIG_IPV6)
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
if (a->sa.sa_family != b->sa.sa_family)
return false;
if (a->sa.sa_family == AF_INET6)
return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
else
return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
}
static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
if (nla_len(nla) >= sizeof(struct in6_addr)) {
@@ -135,12 +98,6 @@ static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
#else /* !CONFIG_IPV6 */
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
}
static int vxlan_nla_get_addr(union vxlan_addr *ip, struct nlattr *nla)
{
if (nla_len(nla) >= sizeof(struct in6_addr)) {
@@ -161,37 +118,6 @@ static int vxlan_nla_put_addr(struct sk_buff *skb, int attr,
}
#endif
/* Virtual Network hash table head */
static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni)
{
return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)];
}
/* Socket hash table head */
static inline struct hlist_head *vs_head(struct net *net, __be16 port)
{
struct vxlan_net *vn = net_generic(net, vxlan_net_id);
return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
}
/* First remote destination for a forwarding entry.
* Guaranteed to be non-NULL because remotes are never deleted.
*/
static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
{
if (rcu_access_pointer(fdb->nh))
return NULL;
return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
}
static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
{
if (rcu_access_pointer(fdb->nh))
return NULL;
return list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
}
/* Find VXLAN socket based on network namespace, address family, UDP port,
* enabled unshareable flags and socket device binding (see l3mdev with
* non-default VRF).
@@ -213,18 +139,29 @@ static struct vxlan_sock *vxlan_find_sock(struct net *net, sa_family_t family,
return NULL;
}
static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, int ifindex,
__be32 vni)
static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs,
int ifindex, __be32 vni,
struct vxlan_vni_node **vninode)
{
struct vxlan_vni_node *vnode;
struct vxlan_dev_node *node;
/* For flow based devices, map all packets to VNI 0 */
if (vs->flags & VXLAN_F_COLLECT_METADATA)
if (vs->flags & VXLAN_F_COLLECT_METADATA &&
!(vs->flags & VXLAN_F_VNIFILTER))
vni = 0;
hlist_for_each_entry_rcu(node, vni_head(vs, vni), hlist) {
if (node->vxlan->default_dst.remote_vni != vni)
if (!node->vxlan)
continue;
vnode = NULL;
if (node->vxlan->cfg.flags & VXLAN_F_VNIFILTER) {
vnode = vxlan_vnifilter_lookup(node->vxlan, vni);
if (!vnode)
continue;
} else if (node->vxlan->default_dst.remote_vni != vni) {
continue;
}
if (IS_ENABLED(CONFIG_IPV6)) {
const struct vxlan_config *cfg = &node->vxlan->cfg;
@@ -234,6 +171,8 @@ static struct vxlan_dev *vxlan_vs_find_vni(struct vxlan_sock *vs, int ifindex,
continue;
}
if (vninode)
*vninode = vnode;
return node->vxlan;
}
@@ -251,7 +190,7 @@ static struct vxlan_dev *vxlan_find_vni(struct net *net, int ifindex,
if (!vs)
return NULL;
return vxlan_vs_find_vni(vs, ifindex, vni);
return vxlan_vs_find_vni(vs, ifindex, vni, NULL);
}
/* Fill in neighbour message in skbuff. */
@@ -493,7 +432,7 @@ static u32 eth_hash(const unsigned char *addr)
return hash_64(value, FDB_HASH_BITS);
}
static u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
{
/* use 1 byte of OUI and 3 bytes of NIC */
u32 key = get_unaligned((u32 *)(addr + 2));
@@ -501,7 +440,7 @@ static u32 eth_vni_hash(const unsigned char *addr, __be32 vni)
return jhash_2words(key, vni, vxlan_salt) & (FDB_HASH_SIZE - 1);
}
static u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni)
u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni)
{
if (vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA)
return eth_vni_hash(mac, vni);
@@ -920,12 +859,12 @@ err_inval:
return err;
}
static int vxlan_fdb_create(struct vxlan_dev *vxlan,
const u8 *mac, union vxlan_addr *ip,
__u16 state, __be16 port, __be32 src_vni,
__be32 vni, __u32 ifindex, __u16 ndm_flags,
u32 nhid, struct vxlan_fdb **fdb,
struct netlink_ext_ack *extack)
int vxlan_fdb_create(struct vxlan_dev *vxlan,
const u8 *mac, union vxlan_addr *ip,
__u16 state, __be16 port, __be32 src_vni,
__be32 vni, __u32 ifindex, __u16 ndm_flags,
u32 nhid, struct vxlan_fdb **fdb,
struct netlink_ext_ack *extack)
{
struct vxlan_rdst *rd = NULL;
struct vxlan_fdb *f;
@@ -1150,13 +1089,13 @@ err_notify:
}
/* Add new entry to forwarding table -- assumes lock held */
static int vxlan_fdb_update(struct vxlan_dev *vxlan,
const u8 *mac, union vxlan_addr *ip,
__u16 state, __u16 flags,
__be16 port, __be32 src_vni, __be32 vni,
__u32 ifindex, __u16 ndm_flags, u32 nhid,
bool swdev_notify,
struct netlink_ext_ack *extack)
int vxlan_fdb_update(struct vxlan_dev *vxlan,
const u8 *mac, union vxlan_addr *ip,
__u16 state, __u16 flags,
__be16 port, __be32 src_vni, __be32 vni,
__u32 ifindex, __u16 ndm_flags, u32 nhid,
bool swdev_notify,
struct netlink_ext_ack *extack)
{
struct vxlan_fdb *f;
@@ -1307,10 +1246,10 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
return err;
}
static int __vxlan_fdb_delete(struct vxlan_dev *vxlan,
const unsigned char *addr, union vxlan_addr ip,
__be16 port, __be32 src_vni, __be32 vni,
u32 ifindex, bool swdev_notify)
int __vxlan_fdb_delete(struct vxlan_dev *vxlan,
const unsigned char *addr, union vxlan_addr ip,
__be16 port, __be32 src_vni, __be32 vni,
u32 ifindex, bool swdev_notify)
{
struct vxlan_rdst *rd = NULL;
struct vxlan_fdb *f;
@@ -1519,56 +1458,6 @@ static bool vxlan_snoop(struct net_device *dev,
return false;
}
/* See if multicast group is already in use by other ID */
static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev)
{
struct vxlan_dev *vxlan;
struct vxlan_sock *sock4;
#if IS_ENABLED(CONFIG_IPV6)
struct vxlan_sock *sock6;
#endif
unsigned short family = dev->default_dst.remote_ip.sa.sa_family;
sock4 = rtnl_dereference(dev->vn4_sock);
/* The vxlan_sock is only used by dev, leaving group has
* no effect on other vxlan devices.
*/
if (family == AF_INET && sock4 && refcount_read(&sock4->refcnt) == 1)
return false;
#if IS_ENABLED(CONFIG_IPV6)
sock6 = rtnl_dereference(dev->vn6_sock);
if (family == AF_INET6 && sock6 && refcount_read(&sock6->refcnt) == 1)
return false;
#endif
list_for_each_entry(vxlan, &vn->vxlan_list, next) {
if (!netif_running(vxlan->dev) || vxlan == dev)
continue;
if (family == AF_INET &&
rtnl_dereference(vxlan->vn4_sock) != sock4)
continue;
#if IS_ENABLED(CONFIG_IPV6)
if (family == AF_INET6 &&
rtnl_dereference(vxlan->vn6_sock) != sock6)
continue;
#endif
if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip,
&dev->default_dst.remote_ip))
continue;
if (vxlan->default_dst.remote_ifindex !=
dev->default_dst.remote_ifindex)
continue;
return true;
}
return false;
}
static bool __vxlan_sock_release_prep(struct vxlan_sock *vs)
{
struct vxlan_net *vn;
@@ -1602,7 +1491,10 @@ static void vxlan_sock_release(struct vxlan_dev *vxlan)
RCU_INIT_POINTER(vxlan->vn4_sock, NULL);
synchronize_net();
vxlan_vs_del_dev(vxlan);
if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
vxlan_vs_del_vnigrp(vxlan);
else
vxlan_vs_del_dev(vxlan);
if (__vxlan_sock_release_prep(sock4)) {
udp_tunnel_sock_release(sock4->sock);
@@ -1617,76 +1509,6 @@ static void vxlan_sock_release(struct vxlan_dev *vxlan)
#endif
}
/* Update multicast group membership when first VNI on
* multicast address is brought up
*/
static int vxlan_igmp_join(struct vxlan_dev *vxlan)
{
struct sock *sk;
union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
int ifindex = vxlan->default_dst.remote_ifindex;
int ret = -EINVAL;
if (ip->sa.sa_family == AF_INET) {
struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
struct ip_mreqn mreq = {
.imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr,
.imr_ifindex = ifindex,
};
sk = sock4->sock->sk;
lock_sock(sk);
ret = ip_mc_join_group(sk, &mreq);
release_sock(sk);
#if IS_ENABLED(CONFIG_IPV6)
} else {
struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);
sk = sock6->sock->sk;
lock_sock(sk);
ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
&ip->sin6.sin6_addr);
release_sock(sk);
#endif
}
return ret;
}
/* Inverse of vxlan_igmp_join when last VNI is brought down */
static int vxlan_igmp_leave(struct vxlan_dev *vxlan)
{
struct sock *sk;
union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
int ifindex = vxlan->default_dst.remote_ifindex;
int ret = -EINVAL;
if (ip->sa.sa_family == AF_INET) {
struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
struct ip_mreqn mreq = {
.imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr,
.imr_ifindex = ifindex,
};
sk = sock4->sock->sk;
lock_sock(sk);
ret = ip_mc_leave_group(sk, &mreq);
release_sock(sk);
#if IS_ENABLED(CONFIG_IPV6)
} else {
struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);
sk = sock6->sock->sk;
lock_sock(sk);
ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
&ip->sin6.sin6_addr);
release_sock(sk);
#endif
}
return ret;
}
static bool vxlan_remcsum(struct vxlanhdr *unparsed,
struct sk_buff *skb, u32 vxflags)
{
@@ -1828,6 +1650,7 @@ static bool vxlan_ecn_decapsulate(struct vxlan_sock *vs, void *oiph,
/* Callback from net/ipv4/udp.c to receive packets */
static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
{
struct vxlan_vni_node *vninode = NULL;
struct vxlan_dev *vxlan;
struct vxlan_sock *vs;
struct vxlanhdr unparsed;
@@ -1860,7 +1683,7 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
vni = vxlan_vni(vxlan_hdr(skb)->vx_vni);
vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, &vninode);
if (!vxlan)
goto drop;
@@ -1930,6 +1753,8 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
if (!vxlan_ecn_decapsulate(vs, oiph, skb)) {
++vxlan->dev->stats.rx_frame_errors;
++vxlan->dev->stats.rx_errors;
vxlan_vnifilter_count(vxlan, vni, vninode,
VXLAN_VNI_STATS_RX_ERRORS, 0);
goto drop;
}
@@ -1938,10 +1763,13 @@ static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
if (unlikely(!(vxlan->dev->flags & IFF_UP))) {
rcu_read_unlock();
atomic_long_inc(&vxlan->dev->rx_dropped);
vxlan_vnifilter_count(vxlan, vni, vninode,
VXLAN_VNI_STATS_RX_DROPS, 0);
goto drop;
}
dev_sw_netstats_rx_add(vxlan->dev, skb->len);
vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX, skb->len);
gro_cells_receive(&vxlan->gro_cells, skb);
rcu_read_unlock();
@@ -1975,7 +1803,7 @@ static int vxlan_err_lookup(struct sock *sk, struct sk_buff *skb)
return -ENOENT;
vni = vxlan_vni(hdr->vx_vni);
vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni);
vxlan = vxlan_vs_find_vni(vs, skb->dev->ifindex, vni, NULL);
if (!vxlan)
return -ENOENT;
@@ -2049,8 +1877,12 @@ static int arp_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
reply->ip_summed = CHECKSUM_UNNECESSARY;
reply->pkt_type = PACKET_HOST;
if (netif_rx_ni(reply) == NET_RX_DROP)
if (netif_rx_ni(reply) == NET_RX_DROP) {
dev->stats.rx_dropped++;
vxlan_vnifilter_count(vxlan, vni, NULL,
VXLAN_VNI_STATS_RX_DROPS, 0);
}
} else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
union vxlan_addr ipa = {
.sin.sin_addr.s_addr = tip,
@@ -2204,9 +2036,11 @@ static int neigh_reduce(struct net_device *dev, struct sk_buff *skb, __be32 vni)
if (reply == NULL)
goto out;
if (netif_rx_ni(reply) == NET_RX_DROP)
if (netif_rx_ni(reply) == NET_RX_DROP) {
dev->stats.rx_dropped++;
vxlan_vnifilter_count(vxlan, vni, NULL,
VXLAN_VNI_STATS_RX_DROPS, 0);
}
} else if (vxlan->cfg.flags & VXLAN_F_L3MISS) {
union vxlan_addr ipa = {
.sin6.sin6_addr = msg->target,
@@ -2540,15 +2374,20 @@ static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
tx_stats->tx_packets++;
tx_stats->tx_bytes += len;
u64_stats_update_end(&tx_stats->syncp);
vxlan_vnifilter_count(src_vxlan, vni, NULL, VXLAN_VNI_STATS_TX, len);
if (__netif_rx(skb) == NET_RX_SUCCESS) {
u64_stats_update_begin(&rx_stats->syncp);
rx_stats->rx_packets++;
rx_stats->rx_bytes += len;
u64_stats_update_end(&rx_stats->syncp);
vxlan_vnifilter_count(dst_vxlan, vni, NULL, VXLAN_VNI_STATS_RX,
len);
} else {
drop:
dev->stats.rx_dropped++;
vxlan_vnifilter_count(dst_vxlan, vni, NULL,
VXLAN_VNI_STATS_RX_DROPS, 0);
}
rcu_read_unlock();
}
@@ -2578,6 +2417,8 @@ static int encap_bypass_if_local(struct sk_buff *skb, struct net_device *dev,
vxlan->cfg.flags);
if (!dst_vxlan) {
dev->stats.tx_errors++;
vxlan_vnifilter_count(vxlan, vni, NULL,
VXLAN_VNI_STATS_TX_ERRORS, 0);
kfree_skb(skb);
return -ENOENT;
@@ -2601,15 +2442,19 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
union vxlan_addr remote_ip, local_ip;
struct vxlan_metadata _md;
struct vxlan_metadata *md = &_md;
unsigned int pkt_len = skb->len;
__be16 src_port = 0, dst_port;
struct dst_entry *ndst = NULL;
__be32 vni, label;
__u8 tos, ttl;
int ifindex;
int err;
u32 flags = vxlan->cfg.flags;
bool udp_sum = false;
bool xnet = !net_eq(vxlan->net, dev_net(vxlan->dev));
__be32 vni = 0;
#if IS_ENABLED(CONFIG_IPV6)
__be32 label;
#endif
info = skb_tunnel_info(skb);
@@ -2647,7 +2492,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM_TX);
else
udp_sum = !(flags & VXLAN_F_UDP_ZERO_CSUM6_TX);
#if IS_ENABLED(CONFIG_IPV6)
label = vxlan->cfg.label;
#endif
} else {
if (!info) {
WARN_ONCE(1, "%s: Missing encapsulation instructions\n",
@@ -2674,7 +2521,9 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
}
ttl = info->key.ttl;
tos = info->key.tos;
#if IS_ENABLED(CONFIG_IPV6)
label = info->key.label;
#endif
udp_sum = !!(info->key.tun_flags & TUNNEL_CSUM);
}
src_port = udp_flow_src_port(dev_net(dev), skb, vxlan->cfg.port_min,
@@ -2821,12 +2670,14 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
label, src_port, dst_port, !udp_sum);
#endif
}
vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len);
out_unlock:
rcu_read_unlock();
return;
drop:
dev->stats.tx_dropped++;
vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_DROPS, 0);
dev_kfree_skb(skb);
return;
@@ -2838,6 +2689,7 @@ tx_error:
dev->stats.tx_carrier_errors++;
dst_release(ndst);
dev->stats.tx_errors++;
vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX_ERRORS, 0);
kfree_skb(skb);
}
@@ -2870,6 +2722,8 @@ static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev,
drop:
dev->stats.tx_dropped++;
vxlan_vnifilter_count(netdev_priv(dev), vni, NULL,
VXLAN_VNI_STATS_TX_DROPS, 0);
dev_kfree_skb(skb);
}
@@ -2944,6 +2798,8 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
vxlan_fdb_miss(vxlan, eth->h_dest);
dev->stats.tx_dropped++;
vxlan_vnifilter_count(vxlan, vni, NULL,
VXLAN_VNI_STATS_TX_DROPS, 0);
kfree_skb(skb);
return NETDEV_TX_OK;
}
@@ -3044,6 +2900,9 @@ static int vxlan_init(struct net_device *dev)
struct vxlan_dev *vxlan = netdev_priv(dev);
int err;
if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
vxlan_vnigroup_init(vxlan);
dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
if (!dev->tstats)
return -ENOMEM;
@@ -3073,6 +2932,9 @@ static void vxlan_uninit(struct net_device *dev)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
vxlan_vnigroup_uninit(vxlan);
gro_cells_destroy(&vxlan->gro_cells);
vxlan_fdb_delete_default(vxlan, vxlan->cfg.vni);
@@ -3090,14 +2952,10 @@ static int vxlan_open(struct net_device *dev)
if (ret < 0)
return ret;
if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) {
ret = vxlan_igmp_join(vxlan);
if (ret == -EADDRINUSE)
ret = 0;
if (ret) {
vxlan_sock_release(vxlan);
return ret;
}
ret = vxlan_multicast_join(vxlan);
if (ret) {
vxlan_sock_release(vxlan);
return ret;
}
if (vxlan->cfg.age_interval)
@@ -3134,12 +2992,9 @@ static void vxlan_flush(struct vxlan_dev *vxlan, bool do_all)
static int vxlan_stop(struct net_device *dev)
{
struct vxlan_dev *vxlan = netdev_priv(dev);
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
int ret = 0;
if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
!vxlan_group_used(vn, vxlan))
ret = vxlan_igmp_leave(vxlan);
vxlan_multicast_leave(vxlan);
del_timer_sync(&vxlan->age_timer);
@@ -3369,6 +3224,7 @@ static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
[IFLA_VXLAN_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG },
[IFLA_VXLAN_TTL_INHERIT] = { .type = NLA_FLAG },
[IFLA_VXLAN_DF] = { .type = NLA_U8 },
[IFLA_VXLAN_VNIFILTER] = { .type = NLA_U8 },
};
static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[],
@@ -3554,6 +3410,7 @@ static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
{
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
bool metadata = vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA;
struct vxlan_sock *vs = NULL;
struct vxlan_dev_node *node;
int l3mdev_index = 0;
@@ -3589,7 +3446,12 @@ static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
rcu_assign_pointer(vxlan->vn4_sock, vs);
node = &vxlan->hlist4;
}
vxlan_vs_add_dev(vs, vxlan, node);
if (metadata && (vxlan->cfg.flags & VXLAN_F_VNIFILTER))
vxlan_vs_add_vnigrp(vxlan, vs, ipv6);
else
vxlan_vs_add_dev(vs, vxlan, node);
return 0;
}
@@ -3616,13 +3478,42 @@ static int vxlan_sock_add(struct vxlan_dev *vxlan)
return ret;
}
int vxlan_vni_in_use(struct net *src_net, struct vxlan_dev *vxlan,
struct vxlan_config *conf, __be32 vni)
{
struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
struct vxlan_dev *tmp;
list_for_each_entry(tmp, &vn->vxlan_list, next) {
if (tmp == vxlan)
continue;
if (tmp->cfg.flags & VXLAN_F_VNIFILTER) {
if (!vxlan_vnifilter_lookup(tmp, vni))
continue;
} else if (tmp->cfg.vni != vni) {
continue;
}
if (tmp->cfg.dst_port != conf->dst_port)
continue;
if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) !=
(conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)))
continue;
if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) &&
tmp->cfg.remote_ifindex != conf->remote_ifindex)
continue;
return -EEXIST;
}
return 0;
}
static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf,
struct net_device **lower,
struct vxlan_dev *old,
struct netlink_ext_ack *extack)
{
struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
struct vxlan_dev *tmp;
bool use_ipv6 = false;
if (conf->flags & VXLAN_F_GPE) {
@@ -3755,22 +3646,7 @@ static int vxlan_config_validate(struct net *src_net, struct vxlan_config *conf,
if (!conf->age_interval)
conf->age_interval = FDB_AGE_DEFAULT;
list_for_each_entry(tmp, &vn->vxlan_list, next) {
if (tmp == old)
continue;
if (tmp->cfg.vni != conf->vni)
continue;
if (tmp->cfg.dst_port != conf->dst_port)
continue;
if ((tmp->cfg.flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)) !=
(conf->flags & (VXLAN_F_RCV_FLAGS | VXLAN_F_IPV6)))
continue;
if ((conf->flags & VXLAN_F_IPV6_LINKLOCAL) &&
tmp->cfg.remote_ifindex != conf->remote_ifindex)
continue;
if (vxlan_vni_in_use(src_net, old, conf, conf->vni)) {
NL_SET_ERR_MSG(extack,
"A VXLAN device with the specified VNI already exists");
return -EEXIST;
@@ -4226,6 +4102,21 @@ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
if (data[IFLA_VXLAN_DF])
conf->df = nla_get_u8(data[IFLA_VXLAN_DF]);
if (data[IFLA_VXLAN_VNIFILTER]) {
err = vxlan_nl2flag(conf, data, IFLA_VXLAN_VNIFILTER,
VXLAN_F_VNIFILTER, changelink, false,
extack);
if (err)
return err;
if ((conf->flags & VXLAN_F_VNIFILTER) &&
!(conf->flags & VXLAN_F_COLLECT_METADATA)) {
NL_SET_ERR_MSG_ATTR(extack, data[IFLA_VXLAN_VNIFILTER],
"vxlan vnifilter only valid in collect metadata mode");
return -EINVAL;
}
}
return 0;
}
@@ -4301,6 +4192,19 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[],
dst->remote_ifindex,
true);
spin_unlock_bh(&vxlan->hash_lock[hash_index]);
/* If vni filtering device, also update fdb entries of
* all vnis that were using default remote ip
*/
if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) {
err = vxlan_vnilist_update_group(vxlan, &dst->remote_ip,
&conf.remote_ip, extack);
if (err) {
netdev_adjacent_change_abort(dst->remote_dev,
lowerdev, dev);
return err;
}
}
}
if (conf.age_interval != vxlan->cfg.age_interval)
@@ -4446,6 +4350,11 @@ static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
nla_put_flag(skb, IFLA_VXLAN_REMCSUM_NOPARTIAL))
goto nla_put_failure;
if (vxlan->cfg.flags & VXLAN_F_VNIFILTER &&
nla_put_u8(skb, IFLA_VXLAN_VNIFILTER,
!!(vxlan->cfg.flags & VXLAN_F_VNIFILTER)))
goto nla_put_failure;
return 0;
nla_put_failure:
@@ -4805,6 +4714,8 @@ static int __init vxlan_init_module(void)
if (rc)
goto out4;
vxlan_vnifilter_init();
return 0;
out4:
unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
@@ -4819,6 +4730,7 @@ late_initcall(vxlan_init_module);
static void __exit vxlan_cleanup_module(void)
{
vxlan_vnifilter_uninit();
rtnl_link_unregister(&vxlan_link_ops);
unregister_switchdev_notifier(&vxlan_switchdev_notifier_block);
unregister_netdevice_notifier(&vxlan_notifier_block);

drivers/net/vxlan/vxlan_multicast.c

@@ -0,0 +1,272 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Vxlan multicast group handling
*
*/
#include <linux/kernel.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/igmp.h>
#include <net/vxlan.h>
#include "vxlan_private.h"
/* Update multicast group membership when first VNI on
* multicast address is brought up
*/
int vxlan_igmp_join(struct vxlan_dev *vxlan, union vxlan_addr *rip,
int rifindex)
{
union vxlan_addr *ip = (rip ? : &vxlan->default_dst.remote_ip);
int ifindex = (rifindex ? : vxlan->default_dst.remote_ifindex);
int ret = -EINVAL;
struct sock *sk;
if (ip->sa.sa_family == AF_INET) {
struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
struct ip_mreqn mreq = {
.imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr,
.imr_ifindex = ifindex,
};
sk = sock4->sock->sk;
lock_sock(sk);
ret = ip_mc_join_group(sk, &mreq);
release_sock(sk);
#if IS_ENABLED(CONFIG_IPV6)
} else {
struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);
sk = sock6->sock->sk;
lock_sock(sk);
ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
&ip->sin6.sin6_addr);
release_sock(sk);
#endif
}
return ret;
}
int vxlan_igmp_leave(struct vxlan_dev *vxlan, union vxlan_addr *rip,
int rifindex)
{
union vxlan_addr *ip = (rip ? : &vxlan->default_dst.remote_ip);
int ifindex = (rifindex ? : vxlan->default_dst.remote_ifindex);
int ret = -EINVAL;
struct sock *sk;
if (ip->sa.sa_family == AF_INET) {
struct vxlan_sock *sock4 = rtnl_dereference(vxlan->vn4_sock);
struct ip_mreqn mreq = {
.imr_multiaddr.s_addr = ip->sin.sin_addr.s_addr,
.imr_ifindex = ifindex,
};
sk = sock4->sock->sk;
lock_sock(sk);
ret = ip_mc_leave_group(sk, &mreq);
release_sock(sk);
#if IS_ENABLED(CONFIG_IPV6)
} else {
struct vxlan_sock *sock6 = rtnl_dereference(vxlan->vn6_sock);
sk = sock6->sock->sk;
lock_sock(sk);
ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
&ip->sin6.sin6_addr);
release_sock(sk);
#endif
}
return ret;
}
static bool vxlan_group_used_match(union vxlan_addr *ip, int ifindex,
union vxlan_addr *rip, int rifindex)
{
if (!vxlan_addr_multicast(rip))
return false;
if (!vxlan_addr_equal(rip, ip))
return false;
if (rifindex != ifindex)
return false;
return true;
}
static bool vxlan_group_used_by_vnifilter(struct vxlan_dev *vxlan,
union vxlan_addr *ip, int ifindex)
{
struct vxlan_vni_group *vg = rtnl_dereference(vxlan->vnigrp);
struct vxlan_vni_node *v, *tmp;
if (vxlan_group_used_match(ip, ifindex,
&vxlan->default_dst.remote_ip,
vxlan->default_dst.remote_ifindex))
return true;
list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) {
if (!vxlan_addr_multicast(&v->remote_ip))
continue;
if (vxlan_group_used_match(ip, ifindex,
&v->remote_ip,
vxlan->default_dst.remote_ifindex))
return true;
}
return false;
}
/* See if multicast group is already in use by other ID */
bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev,
__be32 vni, union vxlan_addr *rip, int rifindex)
{
union vxlan_addr *ip = (rip ? : &dev->default_dst.remote_ip);
int ifindex = (rifindex ? : dev->default_dst.remote_ifindex);
struct vxlan_dev *vxlan;
struct vxlan_sock *sock4;
#if IS_ENABLED(CONFIG_IPV6)
struct vxlan_sock *sock6;
#endif
unsigned short family = dev->default_dst.remote_ip.sa.sa_family;
sock4 = rtnl_dereference(dev->vn4_sock);
/* The vxlan_sock is only used by dev, leaving group has
* no effect on other vxlan devices.
*/
if (family == AF_INET && sock4 && refcount_read(&sock4->refcnt) == 1)
return false;
#if IS_ENABLED(CONFIG_IPV6)
sock6 = rtnl_dereference(dev->vn6_sock);
if (family == AF_INET6 && sock6 && refcount_read(&sock6->refcnt) == 1)
return false;
#endif
list_for_each_entry(vxlan, &vn->vxlan_list, next) {
if (!netif_running(vxlan->dev) || vxlan == dev)
continue;
if (family == AF_INET &&
rtnl_dereference(vxlan->vn4_sock) != sock4)
continue;
#if IS_ENABLED(CONFIG_IPV6)
if (family == AF_INET6 &&
rtnl_dereference(vxlan->vn6_sock) != sock6)
continue;
#endif
if (vxlan->cfg.flags & VXLAN_F_VNIFILTER) {
if (!vxlan_group_used_by_vnifilter(vxlan, ip, ifindex))
continue;
} else {
if (!vxlan_group_used_match(ip, ifindex,
&vxlan->default_dst.remote_ip,
vxlan->default_dst.remote_ifindex))
continue;
}
return true;
}
return false;
}
static int vxlan_multicast_join_vnigrp(struct vxlan_dev *vxlan)
{
struct vxlan_vni_group *vg = rtnl_dereference(vxlan->vnigrp);
struct vxlan_vni_node *v, *tmp, *vgood = NULL;
int ret = 0;
list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) {
if (!vxlan_addr_multicast(&v->remote_ip))
continue;
/* skip if address is same as default address */
if (vxlan_addr_equal(&v->remote_ip,
&vxlan->default_dst.remote_ip))
continue;
ret = vxlan_igmp_join(vxlan, &v->remote_ip, 0);
if (ret == -EADDRINUSE)
ret = 0;
if (ret)
goto out;
vgood = v;
}
out:
if (ret) {
list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) {
if (!vxlan_addr_multicast(&v->remote_ip))
continue;
if (vxlan_addr_equal(&v->remote_ip,
&vxlan->default_dst.remote_ip))
continue;
vxlan_igmp_leave(vxlan, &v->remote_ip, 0);
if (v == vgood)
break;
}
}
return ret;
}
static int vxlan_multicast_leave_vnigrp(struct vxlan_dev *vxlan)
{
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
struct vxlan_vni_group *vg = rtnl_dereference(vxlan->vnigrp);
struct vxlan_vni_node *v, *tmp;
int last_err = 0, ret;
list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) {
if (vxlan_addr_multicast(&v->remote_ip) &&
!vxlan_group_used(vn, vxlan, v->vni, &v->remote_ip,
0)) {
ret = vxlan_igmp_leave(vxlan, &v->remote_ip, 0);
if (ret)
last_err = ret;
}
}
return last_err;
}
int vxlan_multicast_join(struct vxlan_dev *vxlan)
{
int ret = 0;
if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip)) {
ret = vxlan_igmp_join(vxlan, &vxlan->default_dst.remote_ip,
vxlan->default_dst.remote_ifindex);
if (ret == -EADDRINUSE)
ret = 0;
if (ret)
return ret;
}
if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
return vxlan_multicast_join_vnigrp(vxlan);
return 0;
}
int vxlan_multicast_leave(struct vxlan_dev *vxlan)
{
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
int ret = 0;
if (vxlan_addr_multicast(&vxlan->default_dst.remote_ip) &&
!vxlan_group_used(vn, vxlan, 0, NULL, 0)) {
ret = vxlan_igmp_leave(vxlan, &vxlan->default_dst.remote_ip,
vxlan->default_dst.remote_ifindex);
if (ret)
return ret;
}
if (vxlan->cfg.flags & VXLAN_F_VNIFILTER)
return vxlan_multicast_leave_vnigrp(vxlan);
return 0;
}
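
The join/leave semantics carry over from the old single-group code: a
join that returns -EADDRINUSE is treated as success, and a group is only
left once vxlan_group_used() confirms that no other running device still
needs it. A caller-side sketch, mirroring the vxlan_open() change in
vxlan_core.c above (demo_open() is illustrative, not part of the patch):

static int demo_open(struct vxlan_dev *vxlan)
{
	int ret = vxlan_sock_add(vxlan);

	if (ret < 0)
		return ret;

	/* joins the default remote group if it is multicast and, on
	 * vnifiltering devices, every per-VNI multicast group
	 */
	ret = vxlan_multicast_join(vxlan);
	if (ret)
		vxlan_sock_release(vxlan);
	return ret;
}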

drivers/net/vxlan/vxlan_private.h

@@ -0,0 +1,162 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Vxlan private header file
*
*/
#ifndef _VXLAN_PRIVATE_H
#define _VXLAN_PRIVATE_H
#include <linux/rhashtable.h>
extern unsigned int vxlan_net_id;
extern const u8 all_zeros_mac[ETH_ALEN + 2];
extern const struct rhashtable_params vxlan_vni_rht_params;
#define PORT_HASH_BITS 8
#define PORT_HASH_SIZE (1 << PORT_HASH_BITS)
/* per-network namespace private data for this module */
struct vxlan_net {
struct list_head vxlan_list;
struct hlist_head sock_list[PORT_HASH_SIZE];
spinlock_t sock_lock;
struct notifier_block nexthop_notifier_block;
};
/* Forwarding table entry */
struct vxlan_fdb {
struct hlist_node hlist; /* linked list of entries */
struct rcu_head rcu;
unsigned long updated; /* jiffies */
unsigned long used;
struct list_head remotes;
u8 eth_addr[ETH_ALEN];
u16 state; /* see ndm_state */
__be32 vni;
u16 flags; /* see ndm_flags and below */
struct list_head nh_list;
struct nexthop __rcu *nh;
struct vxlan_dev __rcu *vdev;
};
#define NTF_VXLAN_ADDED_BY_USER 0x100
/* Virtual Network hash table head */
static inline struct hlist_head *vni_head(struct vxlan_sock *vs, __be32 vni)
{
return &vs->vni_list[hash_32((__force u32)vni, VNI_HASH_BITS)];
}
/* Socket hash table head */
static inline struct hlist_head *vs_head(struct net *net, __be16 port)
{
struct vxlan_net *vn = net_generic(net, vxlan_net_id);
return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
}
/* First remote destination for a forwarding entry.
* Guaranteed to be non-NULL because remotes are never deleted.
*/
static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb)
{
if (rcu_access_pointer(fdb->nh))
return NULL;
return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list);
}
static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb)
{
if (rcu_access_pointer(fdb->nh))
return NULL;
return list_first_entry(&fdb->remotes, struct vxlan_rdst, list);
}
#if IS_ENABLED(CONFIG_IPV6)
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
if (a->sa.sa_family != b->sa.sa_family)
return false;
if (a->sa.sa_family == AF_INET6)
return ipv6_addr_equal(&a->sin6.sin6_addr, &b->sin6.sin6_addr);
else
return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
}
#else /* !CONFIG_IPV6 */
static inline
bool vxlan_addr_equal(const union vxlan_addr *a, const union vxlan_addr *b)
{
return a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr;
}
#endif
static inline struct vxlan_vni_node *
vxlan_vnifilter_lookup(struct vxlan_dev *vxlan, __be32 vni)
{
struct vxlan_vni_group *vg;
vg = rcu_dereference_rtnl(vxlan->vnigrp);
if (!vg)
return NULL;
return rhashtable_lookup_fast(&vg->vni_hash, &vni,
vxlan_vni_rht_params);
}
/* vxlan_core.c */
int vxlan_fdb_create(struct vxlan_dev *vxlan,
const u8 *mac, union vxlan_addr *ip,
__u16 state, __be16 port, __be32 src_vni,
__be32 vni, __u32 ifindex, __u16 ndm_flags,
u32 nhid, struct vxlan_fdb **fdb,
struct netlink_ext_ack *extack);
int __vxlan_fdb_delete(struct vxlan_dev *vxlan,
const unsigned char *addr, union vxlan_addr ip,
__be16 port, __be32 src_vni, __be32 vni,
u32 ifindex, bool swdev_notify);
u32 eth_vni_hash(const unsigned char *addr, __be32 vni);
u32 fdb_head_index(struct vxlan_dev *vxlan, const u8 *mac, __be32 vni);
int vxlan_fdb_update(struct vxlan_dev *vxlan,
const u8 *mac, union vxlan_addr *ip,
__u16 state, __u16 flags,
__be16 port, __be32 src_vni, __be32 vni,
__u32 ifindex, __u16 ndm_flags, u32 nhid,
bool swdev_notify, struct netlink_ext_ack *extack);
int vxlan_vni_in_use(struct net *src_net, struct vxlan_dev *vxlan,
struct vxlan_config *conf, __be32 vni);
/* vxlan_vnifilter.c */
int vxlan_vnigroup_init(struct vxlan_dev *vxlan);
void vxlan_vnigroup_uninit(struct vxlan_dev *vxlan);
void vxlan_vnifilter_init(void);
void vxlan_vnifilter_uninit(void);
void vxlan_vnifilter_count(struct vxlan_dev *vxlan, __be32 vni,
struct vxlan_vni_node *vninode,
int type, unsigned int len);
void vxlan_vs_add_vnigrp(struct vxlan_dev *vxlan,
struct vxlan_sock *vs,
bool ipv6);
void vxlan_vs_del_vnigrp(struct vxlan_dev *vxlan);
int vxlan_vnilist_update_group(struct vxlan_dev *vxlan,
union vxlan_addr *old_remote_ip,
union vxlan_addr *new_remote_ip,
struct netlink_ext_ack *extack);
/* vxlan_multicast.c */
int vxlan_multicast_join(struct vxlan_dev *vxlan);
int vxlan_multicast_leave(struct vxlan_dev *vxlan);
bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev,
__be32 vni, union vxlan_addr *rip, int rifindex);
int vxlan_igmp_join(struct vxlan_dev *vxlan, union vxlan_addr *rip,
int rifindex);
int vxlan_igmp_leave(struct vxlan_dev *vxlan, union vxlan_addr *rip,
int rifindex);
#endif

drivers/net/vxlan/vxlan_vnifilter.c

@@ -0,0 +1,999 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Vxlan vni filter for collect metadata mode
*
* Authors: Roopa Prabhu <roopa@nvidia.com>
*
*/
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/etherdevice.h>
#include <linux/rhashtable.h>
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/vxlan.h>
#include "vxlan_private.h"
static inline int vxlan_vni_cmp(struct rhashtable_compare_arg *arg,
const void *ptr)
{
const struct vxlan_vni_node *vnode = ptr;
__be32 vni = *(__be32 *)arg->key;
return vnode->vni != vni;
}
const struct rhashtable_params vxlan_vni_rht_params = {
.head_offset = offsetof(struct vxlan_vni_node, vnode),
.key_offset = offsetof(struct vxlan_vni_node, vni),
.key_len = sizeof(__be32),
.nelem_hint = 3,
.max_size = VXLAN_N_VID,
.obj_cmpfn = vxlan_vni_cmp,
.automatic_shrinking = true,
};
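
With these parameters the table is keyed directly by the __be32 VNI and
compared via vxlan_vni_cmp() above. A condensed sketch of both sides of
the API as used later in this file (the demo_* wrappers are
illustrative):

static struct vxlan_vni_node *demo_vni_lookup(struct vxlan_vni_group *vg,
					      __be32 vni)
{
	return rhashtable_lookup_fast(&vg->vni_hash, &vni,
				      vxlan_vni_rht_params);
}

static int demo_vni_insert(struct vxlan_vni_group *vg,
			   struct vxlan_vni_node *vninode)
{
	/* fails with -EEXIST if the VNI is already present */
	return rhashtable_lookup_insert_fast(&vg->vni_hash, &vninode->vnode,
					     vxlan_vni_rht_params);
}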
static void vxlan_vs_add_del_vninode(struct vxlan_dev *vxlan,
struct vxlan_vni_node *v,
bool del)
{
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
struct vxlan_dev_node *node;
struct vxlan_sock *vs;
spin_lock(&vn->sock_lock);
if (del) {
if (!hlist_unhashed(&v->hlist4.hlist))
hlist_del_init_rcu(&v->hlist4.hlist);
#if IS_ENABLED(CONFIG_IPV6)
if (!hlist_unhashed(&v->hlist6.hlist))
hlist_del_init_rcu(&v->hlist6.hlist);
#endif
goto out;
}
#if IS_ENABLED(CONFIG_IPV6)
vs = rtnl_dereference(vxlan->vn6_sock);
if (vs && v) {
node = &v->hlist6;
hlist_add_head_rcu(&node->hlist, vni_head(vs, v->vni));
}
#endif
vs = rtnl_dereference(vxlan->vn4_sock);
if (vs && v) {
node = &v->hlist4;
hlist_add_head_rcu(&node->hlist, vni_head(vs, v->vni));
}
out:
spin_unlock(&vn->sock_lock);
}
void vxlan_vs_add_vnigrp(struct vxlan_dev *vxlan,
struct vxlan_sock *vs,
bool ipv6)
{
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
struct vxlan_vni_group *vg = rtnl_dereference(vxlan->vnigrp);
struct vxlan_vni_node *v, *tmp;
struct vxlan_dev_node *node;
if (!vg)
return;
spin_lock(&vn->sock_lock);
list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) {
#if IS_ENABLED(CONFIG_IPV6)
if (ipv6)
node = &v->hlist6;
else
#endif
node = &v->hlist4;
node->vxlan = vxlan;
hlist_add_head_rcu(&node->hlist, vni_head(vs, v->vni));
}
spin_unlock(&vn->sock_lock);
}
void vxlan_vs_del_vnigrp(struct vxlan_dev *vxlan)
{
struct vxlan_vni_group *vg = rtnl_dereference(vxlan->vnigrp);
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
struct vxlan_vni_node *v, *tmp;
if (!vg)
return;
spin_lock(&vn->sock_lock);
list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) {
hlist_del_init_rcu(&v->hlist4.hlist);
#if IS_ENABLED(CONFIG_IPV6)
hlist_del_init_rcu(&v->hlist6.hlist);
#endif
}
spin_unlock(&vn->sock_lock);
}
static void vxlan_vnifilter_stats_get(const struct vxlan_vni_node *vninode,
struct vxlan_vni_stats *dest)
{
int i;
memset(dest, 0, sizeof(*dest));
for_each_possible_cpu(i) {
struct vxlan_vni_stats_pcpu *pstats;
struct vxlan_vni_stats temp;
unsigned int start;
pstats = per_cpu_ptr(vninode->stats, i);
do {
start = u64_stats_fetch_begin_irq(&pstats->syncp);
memcpy(&temp, &pstats->stats, sizeof(temp));
} while (u64_stats_fetch_retry_irq(&pstats->syncp, start));
dest->rx_packets += temp.rx_packets;
dest->rx_bytes += temp.rx_bytes;
dest->rx_drops += temp.rx_drops;
dest->rx_errors += temp.rx_errors;
dest->tx_packets += temp.tx_packets;
dest->tx_bytes += temp.tx_bytes;
dest->tx_drops += temp.tx_drops;
dest->tx_errors += temp.tx_errors;
}
}
static void vxlan_vnifilter_stats_add(struct vxlan_vni_node *vninode,
int type, unsigned int len)
{
struct vxlan_vni_stats_pcpu *pstats = this_cpu_ptr(vninode->stats);
u64_stats_update_begin(&pstats->syncp);
switch (type) {
case VXLAN_VNI_STATS_RX:
pstats->stats.rx_bytes += len;
pstats->stats.rx_packets++;
break;
case VXLAN_VNI_STATS_RX_DROPS:
pstats->stats.rx_drops++;
break;
case VXLAN_VNI_STATS_RX_ERRORS:
pstats->stats.rx_errors++;
break;
case VXLAN_VNI_STATS_TX:
pstats->stats.tx_bytes += len;
pstats->stats.tx_packets++;
break;
case VXLAN_VNI_STATS_TX_DROPS:
pstats->stats.tx_drops++;
break;
case VXLAN_VNI_STATS_TX_ERRORS:
pstats->stats.tx_errors++;
break;
}
u64_stats_update_end(&pstats->syncp);
}
void vxlan_vnifilter_count(struct vxlan_dev *vxlan, __be32 vni,
struct vxlan_vni_node *vninode,
int type, unsigned int len)
{
struct vxlan_vni_node *vnode;
if (!(vxlan->cfg.flags & VXLAN_F_VNIFILTER))
return;
if (vninode) {
vnode = vninode;
} else {
vnode = vxlan_vnifilter_lookup(vxlan, vni);
if (!vnode)
return;
}
vxlan_vnifilter_stats_add(vnode, type, len);
}
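
Callers that already hold the vxlan_vni_node from the RX lookup pass it
in to skip the rhashtable lookup; TX paths pass NULL and let the helper
look the VNI up. Both call shapes, as they appear in vxlan_core.c above:

	/* RX fast path: vninode was returned by vxlan_vs_find_vni() */
	vxlan_vnifilter_count(vxlan, vni, vninode, VXLAN_VNI_STATS_RX, skb->len);

	/* TX path: no vni node at hand, the helper looks it up by VNI */
	vxlan_vnifilter_count(vxlan, vni, NULL, VXLAN_VNI_STATS_TX, pkt_len);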
static u32 vnirange(struct vxlan_vni_node *vbegin,
struct vxlan_vni_node *vend)
{
return (be32_to_cpu(vend->vni) - be32_to_cpu(vbegin->vni));
}
static size_t vxlan_vnifilter_entry_nlmsg_size(void)
{
return NLMSG_ALIGN(sizeof(struct tunnel_msg))
+ nla_total_size(0) /* VXLAN_VNIFILTER_ENTRY */
+ nla_total_size(sizeof(u32)) /* VXLAN_VNIFILTER_ENTRY_START */
+ nla_total_size(sizeof(u32)) /* VXLAN_VNIFILTER_ENTRY_END */
+ nla_total_size(sizeof(struct in6_addr));/* VXLAN_VNIFILTER_ENTRY_GROUP{6} */
}
static int __vnifilter_entry_fill_stats(struct sk_buff *skb,
const struct vxlan_vni_node *vbegin)
{
struct vxlan_vni_stats vstats;
struct nlattr *vstats_attr;
vstats_attr = nla_nest_start(skb, VXLAN_VNIFILTER_ENTRY_STATS);
if (!vstats_attr)
goto out_stats_err;
vxlan_vnifilter_stats_get(vbegin, &vstats);
if (nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_RX_BYTES,
vstats.rx_bytes, VNIFILTER_ENTRY_STATS_PAD) ||
nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_RX_PKTS,
vstats.rx_packets, VNIFILTER_ENTRY_STATS_PAD) ||
nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_RX_DROPS,
vstats.rx_drops, VNIFILTER_ENTRY_STATS_PAD) ||
nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_RX_ERRORS,
vstats.rx_errors, VNIFILTER_ENTRY_STATS_PAD) ||
nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_TX_BYTES,
vstats.tx_bytes, VNIFILTER_ENTRY_STATS_PAD) ||
nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_TX_PKTS,
vstats.tx_packets, VNIFILTER_ENTRY_STATS_PAD) ||
nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_TX_DROPS,
vstats.tx_drops, VNIFILTER_ENTRY_STATS_PAD) ||
nla_put_u64_64bit(skb, VNIFILTER_ENTRY_STATS_TX_ERRORS,
vstats.tx_errors, VNIFILTER_ENTRY_STATS_PAD))
goto out_stats_err;
nla_nest_end(skb, vstats_attr);
return 0;
out_stats_err:
nla_nest_cancel(skb, vstats_attr);
return -EMSGSIZE;
}
static bool vxlan_fill_vni_filter_entry(struct sk_buff *skb,
struct vxlan_vni_node *vbegin,
struct vxlan_vni_node *vend,
bool fill_stats)
{
struct nlattr *ventry;
u32 vs = be32_to_cpu(vbegin->vni);
u32 ve = 0;
if (vbegin != vend)
ve = be32_to_cpu(vend->vni);
ventry = nla_nest_start(skb, VXLAN_VNIFILTER_ENTRY);
if (!ventry)
return false;
if (nla_put_u32(skb, VXLAN_VNIFILTER_ENTRY_START, vs))
goto out_err;
if (ve && nla_put_u32(skb, VXLAN_VNIFILTER_ENTRY_END, ve))
goto out_err;
if (!vxlan_addr_any(&vbegin->remote_ip)) {
if (vbegin->remote_ip.sa.sa_family == AF_INET) {
if (nla_put_in_addr(skb, VXLAN_VNIFILTER_ENTRY_GROUP,
vbegin->remote_ip.sin.sin_addr.s_addr))
goto out_err;
#if IS_ENABLED(CONFIG_IPV6)
} else {
if (nla_put_in6_addr(skb, VXLAN_VNIFILTER_ENTRY_GROUP6,
&vbegin->remote_ip.sin6.sin6_addr))
goto out_err;
#endif
}
}
if (fill_stats && __vnifilter_entry_fill_stats(skb, vbegin))
goto out_err;
nla_nest_end(skb, ventry);
return true;
out_err:
nla_nest_cancel(skb, ventry);
return false;
}
static void vxlan_vnifilter_notify(const struct vxlan_dev *vxlan,
struct vxlan_vni_node *vninode, int cmd)
{
struct tunnel_msg *tmsg;
struct sk_buff *skb;
struct nlmsghdr *nlh;
struct net *net = dev_net(vxlan->dev);
int err = -ENOBUFS;
skb = nlmsg_new(vxlan_vnifilter_entry_nlmsg_size(), GFP_KERNEL);
if (!skb)
goto out_err;
err = -EMSGSIZE;
nlh = nlmsg_put(skb, 0, 0, cmd, sizeof(*tmsg), 0);
if (!nlh)
goto out_err;
tmsg = nlmsg_data(nlh);
memset(tmsg, 0, sizeof(*tmsg));
tmsg->family = AF_BRIDGE;
tmsg->ifindex = vxlan->dev->ifindex;
if (!vxlan_fill_vni_filter_entry(skb, vninode, vninode, false))
goto out_err;
nlmsg_end(skb, nlh);
rtnl_notify(skb, net, 0, RTNLGRP_TUNNEL, NULL, GFP_KERNEL);
return;
out_err:
rtnl_set_sk_err(net, RTNLGRP_TUNNEL, err);
kfree_skb(skb);
}
static int vxlan_vnifilter_dump_dev(const struct net_device *dev,
struct sk_buff *skb,
struct netlink_callback *cb)
{
struct vxlan_vni_node *tmp, *v, *vbegin = NULL, *vend = NULL;
struct vxlan_dev *vxlan = netdev_priv(dev);
struct tunnel_msg *new_tmsg, *tmsg;
int idx = 0, s_idx = cb->args[1];
struct vxlan_vni_group *vg;
struct nlmsghdr *nlh;
bool dump_stats;
int err = 0;
if (!(vxlan->cfg.flags & VXLAN_F_VNIFILTER))
return -EINVAL;
/* RCU needed because of the vni locking rules (rcu || rtnl) */
vg = rcu_dereference(vxlan->vnigrp);
if (!vg || !vg->num_vnis)
return 0;
tmsg = nlmsg_data(cb->nlh);
dump_stats = !!(tmsg->flags & TUNNEL_MSG_FLAG_STATS);
nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
RTM_NEWTUNNEL, sizeof(*new_tmsg), NLM_F_MULTI);
if (!nlh)
return -EMSGSIZE;
new_tmsg = nlmsg_data(nlh);
memset(new_tmsg, 0, sizeof(*new_tmsg));
new_tmsg->family = PF_BRIDGE;
new_tmsg->ifindex = dev->ifindex;
list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) {
if (idx < s_idx) {
idx++;
continue;
}
if (!vbegin) {
vbegin = v;
vend = v;
continue;
}
if (!dump_stats && vnirange(vend, v) == 1 &&
vxlan_addr_equal(&v->remote_ip, &vend->remote_ip)) {
goto update_end;
} else {
if (!vxlan_fill_vni_filter_entry(skb, vbegin, vend,
dump_stats)) {
err = -EMSGSIZE;
break;
}
idx += vnirange(vbegin, vend) + 1;
vbegin = v;
}
update_end:
vend = v;
}
if (!err && vbegin) {
if (!vxlan_fill_vni_filter_entry(skb, vbegin, vend, dump_stats))
err = -EMSGSIZE;
}
cb->args[1] = err ? idx : 0;
nlmsg_end(skb, nlh);
return err;
}
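
The dump coalesces runs of consecutive VNIs that share a remote into a
single netlink entry, unless per-VNI stats were requested. For example:

/*
 * vni list: 1001 1002 1003 2001, same remote_ip for 1001-1003
 *
 * entries emitted (no TUNNEL_MSG_FLAG_STATS):
 *   VXLAN_VNIFILTER_ENTRY { START=1001, END=1003 }
 *   VXLAN_VNIFILTER_ENTRY { START=2001 }
 *
 * vnirange(vend, v) == 1 detects adjacency; a differing remote_ip, or
 * a stats dump, flushes the pending [vbegin, vend] range first.
 */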
static int vxlan_vnifilter_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
int idx = 0, err = 0, s_idx = cb->args[0];
struct net *net = sock_net(skb->sk);
struct tunnel_msg *tmsg;
struct net_device *dev;
tmsg = nlmsg_data(cb->nlh);
if (tmsg->flags & ~TUNNEL_MSG_VALID_USER_FLAGS) {
NL_SET_ERR_MSG(cb->extack, "Invalid tunnelmsg flags in ancillary header");
return -EINVAL;
}
rcu_read_lock();
if (tmsg->ifindex) {
dev = dev_get_by_index_rcu(net, tmsg->ifindex);
if (!dev) {
err = -ENODEV;
goto out_err;
}
err = vxlan_vnifilter_dump_dev(dev, skb, cb);
/* if the dump completed without an error we return 0 here */
if (err != -EMSGSIZE)
goto out_err;
} else {
for_each_netdev_rcu(net, dev) {
if (!netif_is_vxlan(dev))
continue;
if (idx < s_idx)
goto skip;
err = vxlan_vnifilter_dump_dev(dev, skb, cb);
if (err == -EMSGSIZE)
break;
skip:
idx++;
}
}
cb->args[0] = idx;
rcu_read_unlock();
return skb->len;
out_err:
rcu_read_unlock();
return err;
}
static const struct nla_policy vni_filter_entry_policy[VXLAN_VNIFILTER_ENTRY_MAX + 1] = {
[VXLAN_VNIFILTER_ENTRY_START] = { .type = NLA_U32 },
[VXLAN_VNIFILTER_ENTRY_END] = { .type = NLA_U32 },
[VXLAN_VNIFILTER_ENTRY_GROUP] = { .type = NLA_BINARY,
.len = sizeof_field(struct iphdr, daddr) },
[VXLAN_VNIFILTER_ENTRY_GROUP6] = { .type = NLA_BINARY,
.len = sizeof(struct in6_addr) },
};
static const struct nla_policy vni_filter_policy[VXLAN_VNIFILTER_MAX + 1] = {
[VXLAN_VNIFILTER_ENTRY] = { .type = NLA_NESTED },
};
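
These policies describe the request format consumed by
vxlan_vnifilter_process() below; a sketch of the expected layout:

/*
 * RTM_NEWTUNNEL / RTM_DELTUNNEL request:
 *
 *   struct tunnel_msg { family = PF_BRIDGE, ifindex = <vxlan dev> }
 *   VXLAN_VNIFILTER_ENTRY (nested, may repeat)
 *       VXLAN_VNIFILTER_ENTRY_START   u32, first VNI of the range
 *       VXLAN_VNIFILTER_ENTRY_END     u32, optional last VNI
 *       VXLAN_VNIFILTER_ENTRY_GROUP   optional IPv4 remote/group
 *       VXLAN_VNIFILTER_ENTRY_GROUP6  optional IPv6 remote/group
 */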
static int vxlan_update_default_fdb_entry(struct vxlan_dev *vxlan, __be32 vni,
union vxlan_addr *old_remote_ip,
union vxlan_addr *remote_ip,
struct netlink_ext_ack *extack)
{
struct vxlan_rdst *dst = &vxlan->default_dst;
u32 hash_index;
int err = 0;
hash_index = fdb_head_index(vxlan, all_zeros_mac, vni);
spin_lock_bh(&vxlan->hash_lock[hash_index]);
if (remote_ip && !vxlan_addr_any(remote_ip)) {
err = vxlan_fdb_update(vxlan, all_zeros_mac,
remote_ip,
NUD_REACHABLE | NUD_PERMANENT,
NLM_F_APPEND | NLM_F_CREATE,
vxlan->cfg.dst_port,
vni,
vni,
dst->remote_ifindex,
NTF_SELF, 0, true, extack);
if (err) {
spin_unlock_bh(&vxlan->hash_lock[hash_index]);
return err;
}
}
if (old_remote_ip && !vxlan_addr_any(old_remote_ip)) {
__vxlan_fdb_delete(vxlan, all_zeros_mac,
*old_remote_ip,
vxlan->cfg.dst_port,
vni, vni,
dst->remote_ifindex,
true);
}
spin_unlock_bh(&vxlan->hash_lock[hash_index]);
return err;
}
static int vxlan_vni_update_group(struct vxlan_dev *vxlan,
struct vxlan_vni_node *vninode,
union vxlan_addr *group,
bool create, bool *changed,
struct netlink_ext_ack *extack)
{
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
struct vxlan_rdst *dst = &vxlan->default_dst;
union vxlan_addr *newrip = NULL, *oldrip = NULL;
union vxlan_addr old_remote_ip;
int ret = 0;
memcpy(&old_remote_ip, &vninode->remote_ip, sizeof(old_remote_ip));
/* if per vni remote ip is not present use vxlan dev
* default dst remote ip for fdb entry
*/
if (group && !vxlan_addr_any(group)) {
newrip = group;
} else {
if (!vxlan_addr_any(&dst->remote_ip))
newrip = &dst->remote_ip;
}
/* if old rip exists, and no newrip,
* explicitly delete old rip
*/
if (!newrip && !vxlan_addr_any(&old_remote_ip))
oldrip = &old_remote_ip;
if (!newrip && !oldrip)
return 0;
if (!create && oldrip && newrip && vxlan_addr_equal(oldrip, newrip))
return 0;
ret = vxlan_update_default_fdb_entry(vxlan, vninode->vni,
oldrip, newrip,
extack);
if (ret)
goto out;
if (group)
memcpy(&vninode->remote_ip, group, sizeof(vninode->remote_ip));
if (vxlan->dev->flags & IFF_UP) {
if (vxlan_addr_multicast(&old_remote_ip) &&
!vxlan_group_used(vn, vxlan, vninode->vni,
&old_remote_ip,
vxlan->default_dst.remote_ifindex)) {
ret = vxlan_igmp_leave(vxlan, &old_remote_ip,
0);
if (ret)
goto out;
}
if (vxlan_addr_multicast(&vninode->remote_ip)) {
ret = vxlan_igmp_join(vxlan, &vninode->remote_ip, 0);
if (ret == -EADDRINUSE)
ret = 0;
if (ret)
goto out;
}
}
*changed = true;
return 0;
out:
return ret;
}
int vxlan_vnilist_update_group(struct vxlan_dev *vxlan,
union vxlan_addr *old_remote_ip,
union vxlan_addr *new_remote_ip,
struct netlink_ext_ack *extack)
{
struct list_head *headp, *hpos;
struct vxlan_vni_group *vg;
struct vxlan_vni_node *vent;
int ret;
vg = rtnl_dereference(vxlan->vnigrp);
headp = &vg->vni_list;
list_for_each_prev(hpos, headp) {
vent = list_entry(hpos, struct vxlan_vni_node, vlist);
if (vxlan_addr_any(&vent->remote_ip)) {
ret = vxlan_update_default_fdb_entry(vxlan, vent->vni,
old_remote_ip,
new_remote_ip,
extack);
if (ret)
return ret;
}
}
return 0;
}
static void vxlan_vni_delete_group(struct vxlan_dev *vxlan,
struct vxlan_vni_node *vninode)
{
struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
struct vxlan_rdst *dst = &vxlan->default_dst;
/* if per vni remote_ip not present, delete the
* default dst remote_ip previously added for this vni
*/
if (!vxlan_addr_any(&vninode->remote_ip) ||
!vxlan_addr_any(&dst->remote_ip))
__vxlan_fdb_delete(vxlan, all_zeros_mac,
(vxlan_addr_any(&vninode->remote_ip) ?
dst->remote_ip : vninode->remote_ip),
vxlan->cfg.dst_port,
vninode->vni, vninode->vni,
dst->remote_ifindex,
true);
if (vxlan->dev->flags & IFF_UP) {
if (vxlan_addr_multicast(&vninode->remote_ip) &&
!vxlan_group_used(vn, vxlan, vninode->vni,
&vninode->remote_ip,
dst->remote_ifindex)) {
vxlan_igmp_leave(vxlan, &vninode->remote_ip, 0);
}
}
}
static int vxlan_vni_update(struct vxlan_dev *vxlan,
struct vxlan_vni_group *vg,
__be32 vni, union vxlan_addr *group,
bool *changed,
struct netlink_ext_ack *extack)
{
struct vxlan_vni_node *vninode;
int ret;
vninode = rhashtable_lookup_fast(&vg->vni_hash, &vni,
vxlan_vni_rht_params);
if (!vninode)
return 0;
ret = vxlan_vni_update_group(vxlan, vninode, group, false, changed,
extack);
if (ret)
return ret;
if (changed)
vxlan_vnifilter_notify(vxlan, vninode, RTM_NEWTUNNEL);
return 0;
}
static void __vxlan_vni_add_list(struct vxlan_vni_group *vg,
struct vxlan_vni_node *v)
{
struct list_head *headp, *hpos;
struct vxlan_vni_node *vent;
headp = &vg->vni_list;
list_for_each_prev(hpos, headp) {
vent = list_entry(hpos, struct vxlan_vni_node, vlist);
if (be32_to_cpu(v->vni) < be32_to_cpu(vent->vni))
continue;
else
break;
}
list_add_rcu(&v->vlist, hpos);
vg->num_vnis++;
}
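
__vxlan_vni_add_list() keeps the per-device VNI list sorted in ascending
VNI order by walking from the tail; this sorted order is what enables the
range coalescing in the dump path. For example:

/*
 * list: 1001 1002 2001, add 1003:
 *   walk from tail: 2001 (1003 < 2001, keep walking),
 *                   1002 (1003 > 1002, stop)
 *   list_add_rcu() inserts 1003 after 1002
 * result: 1001 1002 1003 2001
 */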
static void __vxlan_vni_del_list(struct vxlan_vni_group *vg,
struct vxlan_vni_node *v)
{
list_del_rcu(&v->vlist);
vg->num_vnis--;
}
static struct vxlan_vni_node *vxlan_vni_alloc(struct vxlan_dev *vxlan,
__be32 vni)
{
struct vxlan_vni_node *vninode;
vninode = kzalloc(sizeof(*vninode), GFP_ATOMIC);
if (!vninode)
return NULL;
vninode->stats = netdev_alloc_pcpu_stats(struct vxlan_vni_stats_pcpu);
if (!vninode->stats) {
kfree(vninode);
return NULL;
}
vninode->vni = vni;
vninode->hlist4.vxlan = vxlan;
#if IS_ENABLED(CONFIG_IPV6)
vninode->hlist6.vxlan = vxlan;
#endif
return vninode;
}
static int vxlan_vni_add(struct vxlan_dev *vxlan,
struct vxlan_vni_group *vg,
u32 vni, union vxlan_addr *group,
struct netlink_ext_ack *extack)
{
struct vxlan_vni_node *vninode;
__be32 v = cpu_to_be32(vni);
bool changed = false;
int err = 0;
if (vxlan_vnifilter_lookup(vxlan, v))
return vxlan_vni_update(vxlan, vg, v, group, &changed, extack);
err = vxlan_vni_in_use(vxlan->net, vxlan, &vxlan->cfg, v);
if (err) {
NL_SET_ERR_MSG(extack, "VNI in use");
return err;
}
vninode = vxlan_vni_alloc(vxlan, v);
if (!vninode)
return -ENOMEM;
err = rhashtable_lookup_insert_fast(&vg->vni_hash,
&vninode->vnode,
vxlan_vni_rht_params);
if (err) {
kfree(vninode);
return err;
}
__vxlan_vni_add_list(vg, vninode);
if (vxlan->dev->flags & IFF_UP)
vxlan_vs_add_del_vninode(vxlan, vninode, false);
err = vxlan_vni_update_group(vxlan, vninode, group, true, &changed,
extack);
if (changed)
vxlan_vnifilter_notify(vxlan, vninode, RTM_NEWTUNNEL);
return err;
}
static void vxlan_vni_node_rcu_free(struct rcu_head *rcu)
{
struct vxlan_vni_node *v;
v = container_of(rcu, struct vxlan_vni_node, rcu);
free_percpu(v->stats);
kfree(v);
}
static int vxlan_vni_del(struct vxlan_dev *vxlan,
struct vxlan_vni_group *vg,
u32 vni, struct netlink_ext_ack *extack)
{
struct vxlan_vni_node *vninode;
__be32 v = cpu_to_be32(vni);
int err = 0;
vg = rtnl_dereference(vxlan->vnigrp);
vninode = rhashtable_lookup_fast(&vg->vni_hash, &v,
vxlan_vni_rht_params);
if (!vninode) {
err = -ENOENT;
goto out;
}
vxlan_vni_delete_group(vxlan, vninode);
err = rhashtable_remove_fast(&vg->vni_hash,
&vninode->vnode,
vxlan_vni_rht_params);
if (err)
goto out;
__vxlan_vni_del_list(vg, vninode);
vxlan_vnifilter_notify(vxlan, vninode, RTM_DELTUNNEL);
if (vxlan->dev->flags & IFF_UP)
vxlan_vs_add_del_vninode(vxlan, vninode, true);
call_rcu(&vninode->rcu, vxlan_vni_node_rcu_free);
return 0;
out:
return err;
}

static int vxlan_vni_add_del(struct vxlan_dev *vxlan, __u32 start_vni,
			     __u32 end_vni, union vxlan_addr *group,
			     int cmd, struct netlink_ext_ack *extack)
{
	struct vxlan_vni_group *vg;
	int v, err = 0;

	vg = rtnl_dereference(vxlan->vnigrp);

	for (v = start_vni; v <= end_vni; v++) {
		switch (cmd) {
		case RTM_NEWTUNNEL:
			err = vxlan_vni_add(vxlan, vg, v, group, extack);
			break;
		case RTM_DELTUNNEL:
			err = vxlan_vni_del(vxlan, vg, v, extack);
			break;
		default:
			err = -EOPNOTSUPP;
			break;
		}
		if (err)
			goto out;
	}

	return 0;
out:
	return err;
}

static int vxlan_process_vni_filter(struct vxlan_dev *vxlan,
				    struct nlattr *nlvnifilter,
				    int cmd, struct netlink_ext_ack *extack)
{
	struct nlattr *vattrs[VXLAN_VNIFILTER_ENTRY_MAX + 1];
	u32 vni_start = 0, vni_end = 0;
	union vxlan_addr group;
	int err;

	err = nla_parse_nested(vattrs,
			       VXLAN_VNIFILTER_ENTRY_MAX,
			       nlvnifilter, vni_filter_entry_policy,
			       extack);
	if (err)
		return err;

	if (vattrs[VXLAN_VNIFILTER_ENTRY_START]) {
		vni_start = nla_get_u32(vattrs[VXLAN_VNIFILTER_ENTRY_START]);
		vni_end = vni_start;
	}

	if (vattrs[VXLAN_VNIFILTER_ENTRY_END])
		vni_end = nla_get_u32(vattrs[VXLAN_VNIFILTER_ENTRY_END]);

	if (!vni_start && !vni_end) {
		NL_SET_ERR_MSG_ATTR(extack, nlvnifilter,
				    "Neither vni start nor end found in vni entry");
		return -EINVAL;
	}

	if (vattrs[VXLAN_VNIFILTER_ENTRY_GROUP]) {
		group.sin.sin_addr.s_addr =
			nla_get_in_addr(vattrs[VXLAN_VNIFILTER_ENTRY_GROUP]);
		group.sa.sa_family = AF_INET;
	} else if (vattrs[VXLAN_VNIFILTER_ENTRY_GROUP6]) {
		group.sin6.sin6_addr =
			nla_get_in6_addr(vattrs[VXLAN_VNIFILTER_ENTRY_GROUP6]);
		group.sa.sa_family = AF_INET6;
	} else {
		memset(&group, 0, sizeof(group));
	}

	if (vxlan_addr_multicast(&group) && !vxlan->default_dst.remote_ifindex) {
		NL_SET_ERR_MSG(extack,
			       "Local interface required for multicast remote group");
		return -EINVAL;
	}

	err = vxlan_vni_add_del(vxlan, vni_start, vni_end, &group, cmd,
				extack);
	if (err)
		return err;

	return 0;
}

void vxlan_vnigroup_uninit(struct vxlan_dev *vxlan)
{
	struct vxlan_vni_node *v, *tmp;
	struct vxlan_vni_group *vg;

	vg = rtnl_dereference(vxlan->vnigrp);
	list_for_each_entry_safe(v, tmp, &vg->vni_list, vlist) {
		rhashtable_remove_fast(&vg->vni_hash, &v->vnode,
				       vxlan_vni_rht_params);
		hlist_del_init_rcu(&v->hlist4.hlist);
#if IS_ENABLED(CONFIG_IPV6)
		hlist_del_init_rcu(&v->hlist6.hlist);
#endif
		__vxlan_vni_del_list(vg, v);
		vxlan_vnifilter_notify(vxlan, v, RTM_DELTUNNEL);
		call_rcu(&v->rcu, vxlan_vni_node_rcu_free);
	}
	rhashtable_destroy(&vg->vni_hash);
	kfree(vg);
}

int vxlan_vnigroup_init(struct vxlan_dev *vxlan)
{
	struct vxlan_vni_group *vg;
	int ret;

	vg = kzalloc(sizeof(*vg), GFP_KERNEL);
	if (!vg)
		return -ENOMEM;
	ret = rhashtable_init(&vg->vni_hash, &vxlan_vni_rht_params);
	if (ret) {
		kfree(vg);
		return ret;
	}
	INIT_LIST_HEAD(&vg->vni_list);
	rcu_assign_pointer(vxlan->vnigrp, vg);

	return 0;
}

static int vxlan_vnifilter_process(struct sk_buff *skb, struct nlmsghdr *nlh,
				   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tunnel_msg *tmsg;
	struct vxlan_dev *vxlan;
	struct net_device *dev;
	struct nlattr *attr;
	int err, vnis = 0;
	int rem;

	/* this should validate the header and check for remaining bytes */
	err = nlmsg_parse(nlh, sizeof(*tmsg), NULL, VXLAN_VNIFILTER_MAX,
			  vni_filter_policy, extack);
	if (err < 0)
		return err;

	tmsg = nlmsg_data(nlh);
	dev = __dev_get_by_index(net, tmsg->ifindex);
	if (!dev)
		return -ENODEV;

	if (!netif_is_vxlan(dev)) {
		NL_SET_ERR_MSG_MOD(extack, "The device is not a vxlan device");
		return -EINVAL;
	}

	vxlan = netdev_priv(dev);
	if (!(vxlan->cfg.flags & VXLAN_F_VNIFILTER))
		return -EOPNOTSUPP;

	nlmsg_for_each_attr(attr, nlh, sizeof(*tmsg), rem) {
		switch (nla_type(attr)) {
		case VXLAN_VNIFILTER_ENTRY:
			err = vxlan_process_vni_filter(vxlan, attr,
						       nlh->nlmsg_type, extack);
			break;
		default:
			continue;
		}
		vnis++;
		if (err)
			break;
	}

	if (!vnis) {
		NL_SET_ERR_MSG_MOD(extack, "No vnis found to process");
		err = -EINVAL;
	}

	return err;
}

void vxlan_vnifilter_init(void)
{
	rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_GETTUNNEL, NULL,
			     vxlan_vnifilter_dump, 0);
	rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_NEWTUNNEL,
			     vxlan_vnifilter_process, NULL, 0);
	rtnl_register_module(THIS_MODULE, PF_BRIDGE, RTM_DELTUNNEL,
			     vxlan_vnifilter_process, NULL, 0);
}

void vxlan_vnifilter_uninit(void)
{
	rtnl_unregister(PF_BRIDGE, RTM_GETTUNNEL);
	rtnl_unregister(PF_BRIDGE, RTM_NEWTUNNEL);
	rtnl_unregister(PF_BRIDGE, RTM_DELTUNNEL);
}
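
Not shown in this excerpt are the datapath helpers that feed the per-vni
counters. As a rough sketch only (vxlan_vni_stats_rx below is hypothetical
and not part of this diff; it assumes the vxlan_vni_stats_pcpu layout from
the vxlan.h hunk further down), an rx-side update would look like:

	/* Hypothetical helper, not in this series: bump the per-cpu rx
	 * counters of one vni node; syncp serializes the 64-bit updates.
	 */
	static void vxlan_vni_stats_rx(struct vxlan_vni_node *vninode,
				       unsigned int len)
	{
		struct vxlan_vni_stats_pcpu *pstats = this_cpu_ptr(vninode->stats);

		u64_stats_update_begin(&pstats->syncp);
		pstats->stats.rx_packets++;
		pstats->stats.rx_bytes += len;
		u64_stats_update_end(&pstats->syncp);
	}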


@@ -227,11 +227,56 @@ struct vxlan_config {
	enum ifla_vxlan_df df;
};

enum {
	VXLAN_VNI_STATS_RX,
	VXLAN_VNI_STATS_RX_DROPS,
	VXLAN_VNI_STATS_RX_ERRORS,
	VXLAN_VNI_STATS_TX,
	VXLAN_VNI_STATS_TX_DROPS,
	VXLAN_VNI_STATS_TX_ERRORS,
};

struct vxlan_vni_stats {
	u64 rx_packets;
	u64 rx_bytes;
	u64 rx_drops;
	u64 rx_errors;
	u64 tx_packets;
	u64 tx_bytes;
	u64 tx_drops;
	u64 tx_errors;
};

struct vxlan_vni_stats_pcpu {
	struct vxlan_vni_stats stats;
	struct u64_stats_sync syncp;
};

struct vxlan_dev_node {
	struct hlist_node hlist;
	struct vxlan_dev *vxlan;
};

struct vxlan_vni_node {
	struct rhash_head vnode;
	struct vxlan_dev_node hlist4; /* vni hash table for IPv4 socket */
#if IS_ENABLED(CONFIG_IPV6)
	struct vxlan_dev_node hlist6; /* vni hash table for IPv6 socket */
#endif
	struct list_head vlist;
	__be32 vni;
	union vxlan_addr remote_ip; /* default remote ip for this vni */
	struct vxlan_vni_stats_pcpu __percpu *stats;

	struct rcu_head rcu;
};

struct vxlan_vni_group {
	struct rhashtable vni_hash;
	struct list_head vni_list;
	u32 num_vnis;
};

/* Pseudo network device */
struct vxlan_dev {
	struct vxlan_dev_node hlist4;	/* vni hash table for IPv4 socket */
@@ -254,6 +299,8 @@ struct vxlan_dev {
	struct vxlan_config cfg;

	struct vxlan_vni_group __rcu *vnigrp;

	struct hlist_head fdb_head[FDB_HASH_SIZE];
};
@@ -274,6 +321,7 @@ struct vxlan_dev {
#define VXLAN_F_GPE			0x4000
#define VXLAN_F_IPV6_LINKLOCAL		0x8000
#define VXLAN_F_TTL_INHERIT		0x10000
#define VXLAN_F_VNIFILTER		0x20000

/* Flags that are used in the receive path. These flags must match in
 * order for a socket to be shareable
@@ -283,7 +331,8 @@ struct vxlan_dev {
				 VXLAN_F_UDP_ZERO_CSUM6_RX |	\
				 VXLAN_F_REMCSUM_RX |		\
				 VXLAN_F_REMCSUM_NOPARTIAL |	\
				 VXLAN_F_COLLECT_METADATA)
				 VXLAN_F_COLLECT_METADATA |	\
				 VXLAN_F_VNIFILTER)

/* Flags that can be set together with VXLAN_F_GPE. */
#define VXLAN_F_ALLOWED_GPE		(VXLAN_F_GPE |			\
@@ -292,7 +341,8 @@ struct vxlan_dev {
					 VXLAN_F_UDP_ZERO_CSUM_TX |	\
					 VXLAN_F_UDP_ZERO_CSUM6_TX |	\
					 VXLAN_F_UDP_ZERO_CSUM6_RX |	\
					 VXLAN_F_COLLECT_METADATA)
					 VXLAN_F_COLLECT_METADATA |	\
					 VXLAN_F_VNIFILTER)

struct net_device *vxlan_dev_create(struct net *net, const char *name,
				    u8 name_assign_type, struct vxlan_config *conf);
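
For reference, a hedged reader-side sketch against the structures above
(example_total_rx_bytes is illustrative only, not part of the patch; it
assumes a VXLAN_F_VNIFILTER device, for which vnigrp is always set): walk
vni_list under RCU and fold one per-cpu counter with the usual u64_stats
retry loop.

	/* Illustrative only: sum rx_bytes across all vnis of a device. */
	static u64 example_total_rx_bytes(struct vxlan_dev *vxlan)
	{
		struct vxlan_vni_group *vg;
		struct vxlan_vni_node *v;
		u64 total = 0;

		rcu_read_lock();
		vg = rcu_dereference(vxlan->vnigrp);
		list_for_each_entry_rcu(v, &vg->vni_list, vlist) {
			int cpu;

			for_each_possible_cpu(cpu) {
				struct vxlan_vni_stats_pcpu *p;
				unsigned int start;
				u64 bytes;

				p = per_cpu_ptr(v->stats, cpu);
				do {
					start = u64_stats_fetch_begin(&p->syncp);
					bytes = p->stats.rx_bytes;
				} while (u64_stats_fetch_retry(&p->syncp, start));
				total += bytes;
			}
		}
		rcu_read_unlock();

		return total;
	}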


@@ -713,7 +713,55 @@ enum ipvlan_mode {
#define IPVLAN_F_PRIVATE	0x01
#define IPVLAN_F_VEPA		0x02

/* Tunnel RTM header */
struct tunnel_msg {
	__u8 family;
	__u8 flags;
	__u16 reserved2;
	__u32 ifindex;
};

/* VXLAN section */

/* include statistics in the dump */
#define TUNNEL_MSG_FLAG_STATS	0x01

#define TUNNEL_MSG_VALID_USER_FLAGS	TUNNEL_MSG_FLAG_STATS

/* Embedded inside VXLAN_VNIFILTER_ENTRY_STATS */
enum {
	VNIFILTER_ENTRY_STATS_UNSPEC,
	VNIFILTER_ENTRY_STATS_RX_BYTES,
	VNIFILTER_ENTRY_STATS_RX_PKTS,
	VNIFILTER_ENTRY_STATS_RX_DROPS,
	VNIFILTER_ENTRY_STATS_RX_ERRORS,
	VNIFILTER_ENTRY_STATS_TX_BYTES,
	VNIFILTER_ENTRY_STATS_TX_PKTS,
	VNIFILTER_ENTRY_STATS_TX_DROPS,
	VNIFILTER_ENTRY_STATS_TX_ERRORS,
	VNIFILTER_ENTRY_STATS_PAD,
	__VNIFILTER_ENTRY_STATS_MAX
};
#define VNIFILTER_ENTRY_STATS_MAX (__VNIFILTER_ENTRY_STATS_MAX - 1)

enum {
	VXLAN_VNIFILTER_ENTRY_UNSPEC,
	VXLAN_VNIFILTER_ENTRY_START,
	VXLAN_VNIFILTER_ENTRY_END,
	VXLAN_VNIFILTER_ENTRY_GROUP,
	VXLAN_VNIFILTER_ENTRY_GROUP6,
	VXLAN_VNIFILTER_ENTRY_STATS,
	__VXLAN_VNIFILTER_ENTRY_MAX
};
#define VXLAN_VNIFILTER_ENTRY_MAX	(__VXLAN_VNIFILTER_ENTRY_MAX - 1)

enum {
	VXLAN_VNIFILTER_UNSPEC,
	VXLAN_VNIFILTER_ENTRY,
	__VXLAN_VNIFILTER_MAX
};
#define VXLAN_VNIFILTER_MAX	(__VXLAN_VNIFILTER_MAX - 1)

enum {
	IFLA_VXLAN_UNSPEC,
	IFLA_VXLAN_ID,
@@ -745,6 +793,7 @@ enum {
	IFLA_VXLAN_GPE,
	IFLA_VXLAN_TTL_INHERIT,
	IFLA_VXLAN_DF,
	IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */
	__IFLA_VXLAN_MAX
};
#define IFLA_VXLAN_MAX	(__IFLA_VXLAN_MAX - 1)
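
Since struct tunnel_msg and the VXLAN_VNIFILTER_* attributes above are new
uapi, here is a hedged userspace sketch of the RTM_NEWTUNNEL request that a
tool like 'bridge vni add' would send (libmnl assumed; add_vni_range is a
hypothetical helper, and the netlink ACK is not read back for brevity):

	/* Hypothetical userspace sketch (libmnl assumed): add a vni range
	 * to a vnifiltering vxlan device via RTM_NEWTUNNEL.
	 * Build roughly with: cc -o addvni addvni.c -lmnl
	 */
	#include <libmnl/libmnl.h>
	#include <linux/if_link.h>
	#include <linux/rtnetlink.h>
	#include <net/if.h>
	#include <stdint.h>
	#include <sys/socket.h>
	#include <time.h>

	static int add_vni_range(const char *dev, uint32_t start, uint32_t end)
	{
		char buf[MNL_SOCKET_BUFFER_SIZE];
		struct mnl_socket *nl;
		struct tunnel_msg *tmsg;
		struct nlmsghdr *nlh;
		struct nlattr *entry;
		int ret = -1;

		/* header: RTM_NEWTUNNEL addresses the device by ifindex */
		nlh = mnl_nlmsg_put_header(buf);
		nlh->nlmsg_type = RTM_NEWTUNNEL;
		nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
		nlh->nlmsg_seq = time(NULL);

		tmsg = mnl_nlmsg_put_extra_header(nlh, sizeof(*tmsg));
		tmsg->family = PF_BRIDGE;
		tmsg->ifindex = if_nametoindex(dev);

		/* one VXLAN_VNIFILTER_ENTRY nest per vni range */
		entry = mnl_attr_nest_start(nlh, VXLAN_VNIFILTER_ENTRY);
		mnl_attr_put_u32(nlh, VXLAN_VNIFILTER_ENTRY_START, start);
		if (end > start)
			mnl_attr_put_u32(nlh, VXLAN_VNIFILTER_ENTRY_END, end);
		mnl_attr_nest_end(nlh, entry);

		nl = mnl_socket_open(NETLINK_ROUTE);
		if (!nl)
			return -1;
		if (mnl_socket_bind(nl, 0, MNL_SOCKET_AUTOPID) == 0)
			ret = mnl_socket_sendto(nl, nlh, nlh->nlmsg_len) < 0 ? -1 : 0;
		/* a real caller would also read back the netlink ACK here */
		mnl_socket_close(nl);
		return ret;
	}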


@@ -185,6 +185,13 @@ enum {
	RTM_GETNEXTHOPBUCKET,
#define RTM_GETNEXTHOPBUCKET	RTM_GETNEXTHOPBUCKET

	RTM_NEWTUNNEL = 120,
#define RTM_NEWTUNNEL	RTM_NEWTUNNEL
	RTM_DELTUNNEL,
#define RTM_DELTUNNEL	RTM_DELTUNNEL
	RTM_GETTUNNEL,
#define RTM_GETTUNNEL	RTM_GETTUNNEL

	__RTM_MAX,
#define RTM_MAX		(((__RTM_MAX + 3) & ~3) - 1)
};

@@ -756,6 +763,8 @@ enum rtnetlink_groups {
#define RTNLGRP_BRVLAN		RTNLGRP_BRVLAN
	RTNLGRP_MCTP_IFADDR,
#define RTNLGRP_MCTP_IFADDR	RTNLGRP_MCTP_IFADDR

	RTNLGRP_TUNNEL,
#define RTNLGRP_TUNNEL		RTNLGRP_TUNNEL

	__RTNLGRP_MAX
};
#define RTNLGRP_MAX	(__RTNLGRP_MAX - 1)


@@ -91,6 +91,9 @@ static const struct nlmsg_perm nlmsg_route_perms[] =
	{ RTM_NEWNEXTHOPBUCKET,	NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
	{ RTM_DELNEXTHOPBUCKET,	NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
	{ RTM_GETNEXTHOPBUCKET,	NETLINK_ROUTE_SOCKET__NLMSG_READ },
	{ RTM_NEWTUNNEL,	NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
	{ RTM_DELTUNNEL,	NETLINK_ROUTE_SOCKET__NLMSG_WRITE },
	{ RTM_GETTUNNEL,	NETLINK_ROUTE_SOCKET__NLMSG_READ },
};

static const struct nlmsg_perm nlmsg_tcpdiag_perms[] =

@@ -176,7 +179,7 @@ int selinux_nlmsg_lookup(u16 sclass, u16 nlmsg_type, u32 *perm)
	 * structures at the top of this file with the new mappings
	 * before updating the BUILD_BUG_ON() macro!
	 */
	BUILD_BUG_ON(RTM_MAX != (RTM_NEWNEXTHOPBUCKET + 3));
	BUILD_BUG_ON(RTM_MAX != (RTM_NEWTUNNEL + 3));

	err = nlmsg_perm(nlmsg_type, perm, nlmsg_route_perms,
			 sizeof(nlmsg_route_perms));
	break;


@@ -0,0 +1,579 @@
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0

# This test is for checking the VXLAN vni filtering api and
# datapath.
# It simulates two hypervisors running two VMs each, using six network
# namespaces: two for the HVs, four for the VMs. Each VM is
# connected to a separate bridge. The VMs use overlapping vlans and
# hence need separate bridge domains. Each vxlan device is a collect
# metadata device with vni filtering, and hence terminates only its
# configured vnis.
# +--------------------------------+ +------------------------------------+
# | vm-11 netns | | vm-21 netns |
# | | | |
# |+------------+ +-------------+ | |+-------------+ +----------------+ |
# ||veth-11.10 | |veth-11.20 | | ||veth-21.10 | | veth-21.20 | |
# ||10.0.10.11/24 |10.0.20.11/24| | ||10.0.10.21/24| | 10.0.20.21/24 | |
# |+------|-----+ +|------------+ | |+-----------|-+ +---|------------+ |
# | | | | | | | |
# | | | | | +------------+ |
# | +------------+ | | | veth-21 | |
# | | veth-11 | | | | | |
# | | | | | +-----|------+ |
# | +-----|------+ | | | |
# | | | | | |
# +------------|-------------------+ +---------------|--------------------+
# +------------|-----------------------------------------|-------------------+
# | +-----|------+ +-----|------+ |
# | |vethhv-11 | |vethhv-21 | |
# | +----|-------+ +-----|------+ |
# | +---|---+ +---|--+ |
# | | br1 | | br2 | |
# | +---|---+ +---|--+ |
# | +---|----+ +---|--+ |
# | | vxlan1| |vxlan2| |
# | +--|-----+ +--|---+ |
# | | | |
# | | +---------------------+ | |
# | | |veth0 | | |
# | +---------|172.16.0.1/24 -----------+ |
# | |2002:fee1::1/64 | |
# | hv-1 netns +--------|------------+ |
# +-----------------------------|--------------------------------------------+
# |
# +-----------------------------|--------------------------------------------+
# | hv-2 netns +--------|-------------+ |
# | | veth0 | |
# | +------| 172.16.0.2/24 |---+ |
# | | | 2002:fee1::2/64 | | |
# | | | | | |
# | | +----------------------+ | - |
# | | | |
# | +-|-------+ +--------|-+ |
# | | vxlan1 | | vxlan2 | |
# | +----|----+ +---|------+ |
# | +--|--+ +-|---+ |
# | | br1 | | br2 | |
# | +--|--+ +--|--+ |
# | +-----|-------+ +----|-------+ |
# | | vethhv-12 | |vethhv-22 | |
# | +------|------+ +-------|----+ |
# +-----------------|----------------------------|---------------------------+
# | |
# +-----------------|-----------------+ +--------|---------------------------+
# | +-------|---+ | | +--|---------+ |
# | | veth-12 | | | |veth-22 | |
# | +-|--------|+ | | +--|--------|+ |
# | | | | | | | |
# |+----------|--+ +---|-----------+ | |+-------|-----+ +|---------------+ |
# ||veth-12.10 | |veth-12.20 | | ||veth-22.10 | |veth-22.20 | |
# ||10.0.10.12/24| |10.0.20.12/24 | | ||10.0.10.22/24| |10.0.20.22/24 | |
# |+-------------+ +---------------+ | |+-------------+ +----------------+ |
# | | | |
# | | | |
# | vm-12 netns | |vm-22 netns |
# +-----------------------------------+ +------------------------------------+
#
#
# This test exercises the new vxlan vnifiltering api

ret=0
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4

# all tests in this script. Can be overridden with -t option
TESTS="
	vxlan_vnifilter_api
	vxlan_vnifilter_datapath
	vxlan_vnifilter_datapath_pervni
	vxlan_vnifilter_datapath_mgroup
	vxlan_vnifilter_datapath_mgroup_pervni
	vxlan_vnifilter_metadata_and_traditional_mix
"
VERBOSE=0
PAUSE_ON_FAIL=no
PAUSE=no

which ping6 > /dev/null 2>&1 && ping6=$(which ping6) || ping6=$(which ping)

log_test()
{
	local rc=$1
	local expected=$2
	local msg="$3"

	if [ ${rc} -eq ${expected} ]; then
		printf " TEST: %-60s [ OK ]\n" "${msg}"
		nsuccess=$((nsuccess+1))
	else
		ret=1
		nfail=$((nfail+1))
		printf " TEST: %-60s [FAIL]\n" "${msg}"
		if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
			echo
			echo "hit enter to continue, 'q' to quit"
			read a
			[ "$a" = "q" ] && exit 1
		fi
	fi

	if [ "${PAUSE}" = "yes" ]; then
		echo
		echo "hit enter to continue, 'q' to quit"
		read a
		[ "$a" = "q" ] && exit 1
	fi
}

run_cmd()
{
	local cmd="$1"
	local out
	local stderr="2>/dev/null"

	if [ "$VERBOSE" = "1" ]; then
		printf "COMMAND: $cmd\n"
		stderr=
	fi

	out=$(eval $cmd $stderr)
	rc=$?
	if [ "$VERBOSE" = "1" -a -n "$out" ]; then
		echo " $out"
	fi

	return $rc
}

check_hv_connectivity() {
	ip netns exec hv-1 ping -c 1 -W 1 $1 &>/dev/null
	sleep 1
	ip netns exec hv-1 ping -c 1 -W 1 $2 &>/dev/null

	return $?
}

check_vm_connectivity() {
	run_cmd "ip netns exec vm-11 ping -c 1 -W 1 10.0.10.12"
	log_test $? 0 "VM connectivity over $1 (ipv4 default rdst)"

	run_cmd "ip netns exec vm-21 ping -c 1 -W 1 10.0.10.22"
	log_test $? 0 "VM connectivity over $1 (ipv6 default rdst)"
}

cleanup() {
	ip link del veth-hv-1 2>/dev/null || true
	ip link del vethhv-11 vethhv-12 vethhv-21 vethhv-22 2>/dev/null || true

	for ns in hv-1 hv-2 vm-11 vm-21 vm-12 vm-22 vm-31 vm-32; do
		ip netns del $ns 2>/dev/null || true
	done
}
trap cleanup EXIT

setup-hv-networking() {
	hv=$1
	local1=$2
	mask1=$3
	local2=$4
	mask2=$5

	ip netns add hv-$hv
	ip link set veth-hv-$hv netns hv-$hv
	ip -netns hv-$hv link set veth-hv-$hv name veth0
	ip -netns hv-$hv addr add $local1/$mask1 dev veth0
	ip -netns hv-$hv addr add $local2/$mask2 dev veth0
	ip -netns hv-$hv link set veth0 up
}

# Sets up a "VM", simulated by a netns and a veth pair, and connects it
# to the given hypervisor netns.
# example: setup-vm <hvid> <vmid> <brid> <VATTRS> <mcast_for_bum>
# VATTRS = comma-separated "<vlan>-<v[46]>-<localip>-<remoteip>-<VTYPE>-<vxlandstport>"
# VTYPE = vxlan device type:
#         "default    = traditional device,
#          metadata   = metadata device,
#          vnifilter  = vnifiltering device,
#          vnifilterg = vnifiltering device with per vni group/remote"
# example:
# setup-vm 1 11 1 \
#	10-v4-172.16.0.1-239.1.1.100-vnifilterg,20-v4-172.16.0.1-239.1.1.100-vnifilterg 1
#
setup-vm() {
	hvid=$1
	vmid=$2
	brid=$3
	vattrs=$4
	mcast=$5
	lastvxlandev=""

	# create bridge
	ip -netns hv-$hvid link add br$brid type bridge vlan_filtering 1 vlan_default_pvid 0 \
		mcast_snooping 0
	ip -netns hv-$hvid link set br$brid up

	# create vm namespace and interfaces and connect to hypervisor
	# namespace
	ip netns add vm-$vmid
	hvvethif="vethhv-$vmid"
	vmvethif="veth-$vmid"
	ip link add $hvvethif type veth peer name $vmvethif
	ip link set $hvvethif netns hv-$hvid
	ip link set $vmvethif netns vm-$vmid
	ip -netns hv-$hvid link set $hvvethif up
	ip -netns vm-$vmid link set $vmvethif up
	ip -netns hv-$hvid link set $hvvethif master br$brid

	# configure VM vlan/vni filtering on hypervisor
	for vmap in $(echo $vattrs | cut -d "," -f1- --output-delimiter=' ')
	do
		local vid=$(echo $vmap | awk -F'-' '{print ($1)}')
		local family=$(echo $vmap | awk -F'-' '{print ($2)}')
		local localip=$(echo $vmap | awk -F'-' '{print ($3)}')
		local group=$(echo $vmap | awk -F'-' '{print ($4)}')
		local vtype=$(echo $vmap | awk -F'-' '{print ($5)}')
		local port=$(echo $vmap | awk -F'-' '{print ($6)}')

		ip -netns vm-$vmid link add name $vmvethif.$vid link $vmvethif type vlan id $vid
		ip -netns vm-$vmid addr add 10.0.$vid.$vmid/24 dev $vmvethif.$vid
		ip -netns vm-$vmid link set $vmvethif.$vid up

		tid=$vid
		vxlandev="vxlan$brid"
		vxlandevflags=""

		if [[ -n $vtype && $vtype == "metadata" ]]; then
			vxlandevflags="$vxlandevflags external"
		elif [[ -n $vtype && $vtype == "vnifilter" || $vtype == "vnifilterg" ]]; then
			vxlandevflags="$vxlandevflags external vnifilter"
			tid=$((vid+brid))
		else
			vxlandevflags="$vxlandevflags id $tid"
			vxlandev="vxlan$tid"
		fi

		if [[ -n $vtype && $vtype != "vnifilterg" ]]; then
			if [[ -n "$group" && "$group" != "null" ]]; then
				if [ $mcast -eq 1 ]; then
					vxlandevflags="$vxlandevflags group $group"
				else
					vxlandevflags="$vxlandevflags remote $group"
				fi
			fi
		fi

		if [[ -n "$port" && "$port" != "default" ]]; then
			vxlandevflags="$vxlandevflags dstport $port"
		fi

		# create vxlan device
		if [ "$vxlandev" != "$lastvxlandev" ]; then
			ip -netns hv-$hvid link add $vxlandev type vxlan local $localip $vxlandevflags dev veth0 2>/dev/null
			ip -netns hv-$hvid link set $vxlandev master br$brid
			ip -netns hv-$hvid link set $vxlandev up
			lastvxlandev=$vxlandev
		fi

		# add vlan
		bridge -netns hv-$hvid vlan add vid $vid dev $hvvethif
		bridge -netns hv-$hvid vlan add vid $vid pvid dev $vxlandev

		# Add bridge vni filter for tx
		if [[ -n $vtype && $vtype == "metadata" || $vtype == "vnifilter" || $vtype == "vnifilterg" ]]; then
			bridge -netns hv-$hvid link set dev $vxlandev vlan_tunnel on
			bridge -netns hv-$hvid vlan add dev $vxlandev vid $vid tunnel_info id $tid
		fi

		if [[ -n $vtype && $vtype == "metadata" ]]; then
			bridge -netns hv-$hvid fdb add 00:00:00:00:00:00 dev $vxlandev \
				src_vni $tid vni $tid dst $group self
		elif [[ -n $vtype && $vtype == "vnifilter" ]]; then
			# Add per vni rx filter with 'bridge vni' api
			bridge -netns hv-$hvid vni add dev $vxlandev vni $tid
		elif [[ -n $vtype && $vtype == "vnifilterg" ]]; then
			# Add per vni group config with 'bridge vni' api
			if [ -n "$group" ]; then
				if [ "$family" == "v4" ]; then
					if [ $mcast -eq 1 ]; then
						bridge -netns hv-$hvid vni add dev $vxlandev vni $tid group $group
					else
						bridge -netns hv-$hvid vni add dev $vxlandev vni $tid remote $group
					fi
				else
					if [ $mcast -eq 1 ]; then
						bridge -netns hv-$hvid vni add dev $vxlandev vni $tid group6 $group
					else
						bridge -netns hv-$hvid vni add dev $vxlandev vni $tid remote6 $group
					fi
				fi
			fi
		fi
	done
}

setup_vnifilter_api()
{
	ip link add veth-host type veth peer name veth-testns
	ip netns add testns
	ip link set veth-testns netns testns
}

cleanup_vnifilter_api()
{
	ip link del veth-host 2>/dev/null || true
	ip netns del testns 2>/dev/null || true
}

# tests vxlan filtering api
vxlan_vnifilter_api()
{
	hv1addr1="172.16.0.1"
	hv2addr1="172.16.0.2"
	hv1addr2="2002:fee1::1"
	hv2addr2="2002:fee1::2"
	localip="172.16.0.1"
	group="239.1.1.101"

	cleanup_vnifilter_api &>/dev/null
	setup_vnifilter_api

	# Duplicate vni test
	# create non-vnifiltering traditional vni device
	run_cmd "ip -netns testns link add vxlan100 type vxlan id 100 local $localip dev veth-testns dstport 4789"
	log_test $? 0 "Create traditional vxlan device"

	# create vni filtering device
	run_cmd "ip -netns testns link add vxlan-ext1 type vxlan vnifilter local $localip dev veth-testns dstport 4789"
	log_test $? 1 "Cannot create vnifilter device without external flag"

	run_cmd "ip -netns testns link add vxlan-ext1 type vxlan external vnifilter local $localip dev veth-testns dstport 4789"
	log_test $? 0 "Creating external vxlan device with vnifilter flag"

	run_cmd "bridge -netns testns vni add dev vxlan-ext1 vni 100"
	log_test $? 255 "Cannot set in-use vni id on vnifiltering device"

	run_cmd "bridge -netns testns vni add dev vxlan-ext1 vni 200"
	log_test $? 0 "Set new vni id on vnifiltering device"

	run_cmd "ip -netns testns link add vxlan-ext2 type vxlan external vnifilter local $localip dev veth-testns dstport 4789"
	log_test $? 0 "Create second external vxlan device with vnifilter flag"

	run_cmd "bridge -netns testns vni add dev vxlan-ext2 vni 200"
	log_test $? 255 "Cannot set in-use vni id on vnifiltering device"

	run_cmd "bridge -netns testns vni add dev vxlan-ext2 vni 300"
	log_test $? 0 "Set new vni id on vnifiltering device"

	# check in bridge vni show
	run_cmd "bridge -netns testns vni add dev vxlan-ext2 vni 300"
	log_test $? 0 "Update vni id on vnifiltering device"

	run_cmd "bridge -netns testns vni add dev vxlan-ext2 vni 400"
	log_test $? 0 "Add new vni id on vnifiltering device"

	# add multicast group per vni
	run_cmd "bridge -netns testns vni add dev vxlan-ext1 vni 200 group $group"
	log_test $? 0 "Set multicast group on existing vni"

	# add multicast group per vni
	run_cmd "bridge -netns testns vni add dev vxlan-ext2 vni 300 group $group"
	log_test $? 0 "Set multicast group on existing vni"

	# set vnifilter on an existing external vxlan device
	run_cmd "ip -netns testns link set dev vxlan-ext1 type vxlan external vnifilter"
	log_test $? 2 "Cannot set vnifilter flag on a device"

	# change vxlan vnifilter flag
	run_cmd "ip -netns testns link set dev vxlan-ext1 type vxlan external novnifilter"
	log_test $? 2 "Cannot unset vnifilter flag on a device"
}

# Sanity test vnifilter datapath
# vnifilter vnis inherit BUM group from
# vxlan device
vxlan_vnifilter_datapath()
{
	hv1addr1="172.16.0.1"
	hv2addr1="172.16.0.2"
	hv1addr2="2002:fee1::1"
	hv2addr2="2002:fee1::2"

	ip link add veth-hv-1 type veth peer name veth-hv-2
	setup-hv-networking 1 $hv1addr1 24 $hv1addr2 64 $hv2addr1 $hv2addr2
	setup-hv-networking 2 $hv2addr1 24 $hv2addr2 64 $hv1addr1 $hv1addr2

	check_hv_connectivity $hv2addr1 $hv2addr2

	setup-vm 1 11 1 10-v4-$hv1addr1-$hv2addr1-vnifilter,20-v4-$hv1addr1-$hv2addr1-vnifilter 0
	setup-vm 1 21 2 10-v6-$hv1addr2-$hv2addr2-vnifilter,20-v6-$hv1addr2-$hv2addr2-vnifilter 0

	setup-vm 2 12 1 10-v4-$hv2addr1-$hv1addr1-vnifilter,20-v4-$hv2addr1-$hv1addr1-vnifilter 0
	setup-vm 2 22 2 10-v6-$hv2addr2-$hv1addr2-vnifilter,20-v6-$hv2addr2-$hv1addr2-vnifilter 0

	check_vm_connectivity "vnifiltering vxlan"
}

# Sanity test vnifilter datapath
# with vnifilter per vni configured BUM
# group/remote
vxlan_vnifilter_datapath_pervni()
{
	hv1addr1="172.16.0.1"
	hv2addr1="172.16.0.2"
	hv1addr2="2002:fee1::1"
	hv2addr2="2002:fee1::2"

	ip link add veth-hv-1 type veth peer name veth-hv-2
	setup-hv-networking 1 $hv1addr1 24 $hv1addr2 64
	setup-hv-networking 2 $hv2addr1 24 $hv2addr2 64

	check_hv_connectivity $hv2addr1 $hv2addr2

	setup-vm 1 11 1 10-v4-$hv1addr1-$hv2addr1-vnifilterg,20-v4-$hv1addr1-$hv2addr1-vnifilterg 0
	setup-vm 1 21 2 10-v6-$hv1addr2-$hv2addr2-vnifilterg,20-v6-$hv1addr2-$hv2addr2-vnifilterg 0

	setup-vm 2 12 1 10-v4-$hv2addr1-$hv1addr1-vnifilterg,20-v4-$hv2addr1-$hv1addr1-vnifilterg 0
	setup-vm 2 22 2 10-v6-$hv2addr2-$hv1addr2-vnifilterg,20-v6-$hv2addr2-$hv1addr2-vnifilterg 0

	check_vm_connectivity "vnifiltering vxlan pervni remote"
}

vxlan_vnifilter_datapath_mgroup()
{
	hv1addr1="172.16.0.1"
	hv2addr1="172.16.0.2"
	hv1addr2="2002:fee1::1"
	hv2addr2="2002:fee1::2"
	group="239.1.1.100"
	group6="ff07::1"

	ip link add veth-hv-1 type veth peer name veth-hv-2
	setup-hv-networking 1 $hv1addr1 24 $hv1addr2 64
	setup-hv-networking 2 $hv2addr1 24 $hv2addr2 64

	check_hv_connectivity $hv2addr1 $hv2addr2

	setup-vm 1 11 1 10-v4-$hv1addr1-$group-vnifilter,20-v4-$hv1addr1-$group-vnifilter 1
	setup-vm 1 21 2 "10-v6-$hv1addr2-$group6-vnifilter,20-v6-$hv1addr2-$group6-vnifilter" 1

	setup-vm 2 12 1 10-v4-$hv2addr1-$group-vnifilter,20-v4-$hv2addr1-$group-vnifilter 1
	setup-vm 2 22 2 10-v6-$hv2addr2-$group6-vnifilter,20-v6-$hv2addr2-$group6-vnifilter 1

	check_vm_connectivity "vnifiltering vxlan mgroup"
}

vxlan_vnifilter_datapath_mgroup_pervni()
{
	hv1addr1="172.16.0.1"
	hv2addr1="172.16.0.2"
	hv1addr2="2002:fee1::1"
	hv2addr2="2002:fee1::2"
	group="239.1.1.100"
	group6="ff07::1"

	ip link add veth-hv-1 type veth peer name veth-hv-2
	setup-hv-networking 1 $hv1addr1 24 $hv1addr2 64
	setup-hv-networking 2 $hv2addr1 24 $hv2addr2 64

	check_hv_connectivity $hv2addr1 $hv2addr2

	setup-vm 1 11 1 10-v4-$hv1addr1-$group-vnifilterg,20-v4-$hv1addr1-$group-vnifilterg 1
	setup-vm 1 21 2 10-v6-$hv1addr2-$group6-vnifilterg,20-v6-$hv1addr2-$group6-vnifilterg 1

	setup-vm 2 12 1 10-v4-$hv2addr1-$group-vnifilterg,20-v4-$hv2addr1-$group-vnifilterg 1
	setup-vm 2 22 2 10-v6-$hv2addr2-$group6-vnifilterg,20-v6-$hv2addr2-$group6-vnifilterg 1

	check_vm_connectivity "vnifiltering vxlan pervni mgroup"
}

vxlan_vnifilter_metadata_and_traditional_mix()
{
	hv1addr1="172.16.0.1"
	hv2addr1="172.16.0.2"
	hv1addr2="2002:fee1::1"
	hv2addr2="2002:fee1::2"

	ip link add veth-hv-1 type veth peer name veth-hv-2
	setup-hv-networking 1 $hv1addr1 24 $hv1addr2 64
	setup-hv-networking 2 $hv2addr1 24 $hv2addr2 64

	check_hv_connectivity $hv2addr1 $hv2addr2

	setup-vm 1 11 1 10-v4-$hv1addr1-$hv2addr1-vnifilter,20-v4-$hv1addr1-$hv2addr1-vnifilter 0
	setup-vm 1 21 2 10-v6-$hv1addr2-$hv2addr2-vnifilter,20-v6-$hv1addr2-$hv2addr2-vnifilter 0
	setup-vm 1 31 3 30-v4-$hv1addr1-$hv2addr1-default-4790,40-v6-$hv1addr2-$hv2addr2-default-4790,50-v4-$hv1addr1-$hv2addr1-metadata-4791 0

	setup-vm 2 12 1 10-v4-$hv2addr1-$hv1addr1-vnifilter,20-v4-$hv2addr1-$hv1addr1-vnifilter 0
	setup-vm 2 22 2 10-v6-$hv2addr2-$hv1addr2-vnifilter,20-v6-$hv2addr2-$hv1addr2-vnifilter 0
	setup-vm 2 32 3 30-v4-$hv2addr1-$hv1addr1-default-4790,40-v6-$hv2addr2-$hv1addr2-default-4790,50-v4-$hv2addr1-$hv1addr1-metadata-4791 0

	check_vm_connectivity "vnifiltering vxlan pervni remote mix"

	# check VM connectivity over traditional/non-vxlan filtering vxlan devices
	run_cmd "ip netns exec vm-31 ping -c 1 -W 1 10.0.30.32"
	log_test $? 0 "VM connectivity over traditional vxlan (ipv4 default rdst)"

	run_cmd "ip netns exec vm-31 ping -c 1 -W 1 10.0.40.32"
	log_test $? 0 "VM connectivity over traditional vxlan (ipv6 default rdst)"

	run_cmd "ip netns exec vm-31 ping -c 1 -W 1 10.0.50.32"
	log_test $? 0 "VM connectivity over metadata nonfiltering vxlan (ipv4 default rdst)"
}

while getopts :t:pP46hv o
do
	case $o in
		t) TESTS=$OPTARG;;
		p) PAUSE_ON_FAIL=yes;;
		P) PAUSE=yes;;
		v) VERBOSE=$(($VERBOSE + 1));;
		h) usage; exit 0;;
		*) usage; exit 1;;
	esac
done

# make sure we don't pause twice
[ "${PAUSE}" = "yes" ] && PAUSE_ON_FAIL=no

if [ "$(id -u)" -ne 0 ]; then
	echo "SKIP: Need root privileges"
	exit $ksft_skip
fi

if [ ! -x "$(command -v ip)" ]; then
	echo "SKIP: Could not run test without ip tool"
	exit $ksft_skip
fi

ip link help vxlan 2>&1 | grep -q "vnifilter"
if [ $? -ne 0 ]; then
	echo "SKIP: iproute2 too old, missing vxlan dev vnifilter setting"
	sync
	exit $ksft_skip
fi

bridge vni help 2>&1 | grep -q "Usage: bridge vni"
if [ $? -ne 0 ]; then
	echo "SKIP: iproute2 bridge lacks vxlan vnifiltering support"
	exit $ksft_skip
fi

# start clean
cleanup &> /dev/null

for t in $TESTS
do
	case $t in
		none) setup; exit 0;;
		*) $t; cleanup;;
	esac
done

if [ "$TESTS" != "none" ]; then
	printf "\nTests passed: %3d\n" ${nsuccess}
	printf "Tests failed: %3d\n" ${nfail}
fi

exit $ret