linux/net/tipc/udp_media.c

838 lines
21 KiB
C
Raw Normal View History

/* net/tipc/udp_media.c: IP bearer support for TIPC
*
* Copyright (c) 2015, Ericsson AB
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the names of the copyright holders nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* Alternatively, this software may be distributed under the terms of the
* GNU General Public License ("GPL") version 2 as published by the Free
* Software Foundation.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/socket.h>
#include <linux/ip.h>
#include <linux/udp.h>
#include <linux/inet.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/udp_tunnel.h>
#include <net/ipv6_stubs.h>
#include <linux/tipc_netlink.h>
#include "core.h"
#include "addr.h"
#include "net.h"
#include "bearer.h"
#include "netlink.h"
#include "msg.h"
/* IANA assigned UDP port */
#define UDP_PORT_DEFAULT 6118
#define UDP_MIN_HEADROOM 48
/**
* struct udp_media_addr - IP/UDP addressing information
*
* This is the bearer level originating address used in neighbor discovery
* messages, and all fields should be in network byte order
*/
struct udp_media_addr {
__be16 proto;
__be16 port;
union {
struct in_addr ipv4;
struct in6_addr ipv6;
};
};
/* struct udp_replicast - container for UDP remote addresses */
struct udp_replicast {
struct udp_media_addr addr;
struct dst_cache dst_cache;
struct rcu_head rcu;
struct list_head list;
};
/**
* struct udp_bearer - ip/udp bearer data structure
* @bearer: associated generic tipc bearer
* @ubsock: bearer associated socket
* @ifindex: local address scope
* @work: used to schedule deferred work on a bearer
*/
struct udp_bearer {
struct tipc_bearer __rcu *bearer;
struct socket *ubsock;
u32 ifindex;
struct work_struct work;
struct udp_replicast rcast;
};
static int tipc_udp_is_mcast_addr(struct udp_media_addr *addr)
{
if (ntohs(addr->proto) == ETH_P_IP)
return ipv4_is_multicast(addr->ipv4.s_addr);
#if IS_ENABLED(CONFIG_IPV6)
else
return ipv6_addr_is_multicast(&addr->ipv6);
#endif
return 0;
}
/* udp_media_addr_set - convert a ip/udp address to a TIPC media address */
static void tipc_udp_media_addr_set(struct tipc_media_addr *addr,
struct udp_media_addr *ua)
{
memset(addr, 0, sizeof(struct tipc_media_addr));
addr->media_id = TIPC_MEDIA_TYPE_UDP;
memcpy(addr->value, ua, sizeof(struct udp_media_addr));
if (tipc_udp_is_mcast_addr(ua))
addr->broadcast = TIPC_BROADCAST_SUPPORT;
}
/* tipc_udp_addr2str - convert ip/udp address to string */
static int tipc_udp_addr2str(struct tipc_media_addr *a, char *buf, int size)
{
struct udp_media_addr *ua = (struct udp_media_addr *)&a->value;
if (ntohs(ua->proto) == ETH_P_IP)
snprintf(buf, size, "%pI4:%u", &ua->ipv4, ntohs(ua->port));
else if (ntohs(ua->proto) == ETH_P_IPV6)
snprintf(buf, size, "%pI6:%u", &ua->ipv6, ntohs(ua->port));
else
pr_err("Invalid UDP media address\n");
return 0;
}
/* tipc_udp_msg2addr - extract an ip/udp address from a TIPC ndisc message */
static int tipc_udp_msg2addr(struct tipc_bearer *b, struct tipc_media_addr *a,
char *msg)
{
struct udp_media_addr *ua;
ua = (struct udp_media_addr *) (msg + TIPC_MEDIA_ADDR_OFFSET);
if (msg[TIPC_MEDIA_TYPE_OFFSET] != TIPC_MEDIA_TYPE_UDP)
return -EINVAL;
tipc_udp_media_addr_set(a, ua);
return 0;
}
/* tipc_udp_addr2msg - write an ip/udp address to a TIPC ndisc message */
static int tipc_udp_addr2msg(char *msg, struct tipc_media_addr *a)
{
memset(msg, 0, TIPC_MEDIA_INFO_SIZE);
msg[TIPC_MEDIA_TYPE_OFFSET] = TIPC_MEDIA_TYPE_UDP;
memcpy(msg + TIPC_MEDIA_ADDR_OFFSET, a->value,
sizeof(struct udp_media_addr));
return 0;
}
/* tipc_send_msg - enqueue a send request */
static int tipc_udp_xmit(struct net *net, struct sk_buff *skb,
struct udp_bearer *ub, struct udp_media_addr *src,
struct udp_media_addr *dst, struct dst_cache *cache)
{
struct dst_entry *ndst = dst_cache_get(cache);
int ttl, err = 0;
if (dst->proto == htons(ETH_P_IP)) {
struct rtable *rt = (struct rtable *)ndst;
if (!rt) {
struct flowi4 fl = {
.daddr = dst->ipv4.s_addr,
.saddr = src->ipv4.s_addr,
.flowi4_mark = skb->mark,
.flowi4_proto = IPPROTO_UDP
};
rt = ip_route_output_key(net, &fl);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
goto tx_error;
}
dst_cache_set_ip4(cache, &rt->dst, fl.saddr);
}
ttl = ip4_dst_hoplimit(&rt->dst);
udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr,
dst->ipv4.s_addr, 0, ttl, 0, src->port,
dst->port, false, true);
#if IS_ENABLED(CONFIG_IPV6)
} else {
if (!ndst) {
struct flowi6 fl6 = {
.flowi6_oif = ub->ifindex,
.daddr = dst->ipv6,
.saddr = src->ipv6,
.flowi6_proto = IPPROTO_UDP
};
ndst = ipv6_stub->ipv6_dst_lookup_flow(net,
ub->ubsock->sk,
&fl6, NULL);
if (IS_ERR(ndst)) {
err = PTR_ERR(ndst);
goto tx_error;
}
dst_cache_set_ip6(cache, ndst, &fl6.saddr);
}
ttl = ip6_dst_hoplimit(ndst);
tipc: pass tunnel dev as NULL to udp_tunnel(6)_xmit_skb udp_tunnel(6)_xmit_skb() called by tipc_udp_xmit() expects a tunnel device to count packets on dev->tstats, a perpcu variable. However, TIPC is using udp tunnel with no tunnel device, and pass the lower dev, like veth device that only initializes dev->lstats(a perpcu variable) when creating it. Later iptunnel_xmit_stats() called by ip(6)tunnel_xmit() thinks the dev as a tunnel device, and uses dev->tstats instead of dev->lstats. tstats' each pointer points to a bigger struct than lstats, so when tstats->tx_bytes is increased, other percpu variable's members could be overwritten. syzbot has reported quite a few crashes due to fib_nh_common percpu member 'nhc_pcpu_rth_output' overwritten, call traces are like: BUG: KASAN: slab-out-of-bounds in rt_cache_valid+0x158/0x190 net/ipv4/route.c:1556 rt_cache_valid+0x158/0x190 net/ipv4/route.c:1556 __mkroute_output net/ipv4/route.c:2332 [inline] ip_route_output_key_hash_rcu+0x819/0x2d50 net/ipv4/route.c:2564 ip_route_output_key_hash+0x1ef/0x360 net/ipv4/route.c:2393 __ip_route_output_key include/net/route.h:125 [inline] ip_route_output_flow+0x28/0xc0 net/ipv4/route.c:2651 ip_route_output_key include/net/route.h:135 [inline] ... or: kasan: GPF could be caused by NULL-ptr deref or user memory access RIP: 0010:dst_dev_put+0x24/0x290 net/core/dst.c:168 <IRQ> rt_fibinfo_free_cpus net/ipv4/fib_semantics.c:200 [inline] free_fib_info_rcu+0x2e1/0x490 net/ipv4/fib_semantics.c:217 __rcu_reclaim kernel/rcu/rcu.h:240 [inline] rcu_do_batch kernel/rcu/tree.c:2437 [inline] invoke_rcu_callbacks kernel/rcu/tree.c:2716 [inline] rcu_process_callbacks+0x100a/0x1ac0 kernel/rcu/tree.c:2697 ... The issue exists since tunnel stats update is moved to iptunnel_xmit by Commit 039f50629b7f ("ip_tunnel: Move stats update to iptunnel_xmit()"), and here to fix it by passing a NULL tunnel dev to udp_tunnel(6)_xmit_skb so that the packets counting won't happen on dev->tstats. Reported-by: syzbot+9d4c12bfd45a58738d0a@syzkaller.appspotmail.com Reported-by: syzbot+a9e23ea2aa21044c2798@syzkaller.appspotmail.com Reported-by: syzbot+c4c4b2bb358bb936ad7e@syzkaller.appspotmail.com Reported-by: syzbot+0290d2290a607e035ba1@syzkaller.appspotmail.com Reported-by: syzbot+a43d8d4e7e8a7a9e149e@syzkaller.appspotmail.com Reported-by: syzbot+a47c5f4c6c00fc1ed16e@syzkaller.appspotmail.com Fixes: 039f50629b7f ("ip_tunnel: Move stats update to iptunnel_xmit()") Signed-off-by: Xin Long <lucien.xin@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-06-17 13:34:15 +00:00
err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb, NULL,
&src->ipv6, &dst->ipv6, 0, ttl, 0,
src->port, dst->port, false);
#endif
}
return err;
tx_error:
kfree_skb(skb);
return err;
}
static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
struct tipc_bearer *b,
struct tipc_media_addr *addr)
{
struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value;
struct udp_media_addr *dst = (struct udp_media_addr *)&addr->value;
struct udp_replicast *rcast;
struct udp_bearer *ub;
int err = 0;
if (skb_headroom(skb) < UDP_MIN_HEADROOM) {
err = pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC);
if (err)
goto out;
}
skb_set_inner_protocol(skb, htons(ETH_P_TIPC));
ub = rcu_dereference(b->media_ptr);
if (!ub) {
err = -ENODEV;
goto out;
}
if (addr->broadcast != TIPC_REPLICAST_SUPPORT)
return tipc_udp_xmit(net, skb, ub, src, dst,
&ub->rcast.dst_cache);
/* Replicast, send an skb to each configured IP address */
list_for_each_entry_rcu(rcast, &ub->rcast.list, list) {
struct sk_buff *_skb;
_skb = pskb_copy(skb, GFP_ATOMIC);
if (!_skb) {
err = -ENOMEM;
goto out;
}
err = tipc_udp_xmit(net, _skb, ub, src, &rcast->addr,
&rcast->dst_cache);
if (err)
goto out;
}
err = 0;
out:
kfree_skb(skb);
return err;
}
static bool tipc_udp_is_known_peer(struct tipc_bearer *b,
struct udp_media_addr *addr)
{
struct udp_replicast *rcast, *tmp;
struct udp_bearer *ub;
ub = rcu_dereference_rtnl(b->media_ptr);
if (!ub) {
pr_err_ratelimited("UDP bearer instance not found\n");
return false;
}
list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) {
if (!memcmp(&rcast->addr, addr, sizeof(struct udp_media_addr)))
return true;
}
return false;
}
static int tipc_udp_rcast_add(struct tipc_bearer *b,
struct udp_media_addr *addr)
{
struct udp_replicast *rcast;
struct udp_bearer *ub;
ub = rcu_dereference_rtnl(b->media_ptr);
if (!ub)
return -ENODEV;
rcast = kmalloc(sizeof(*rcast), GFP_ATOMIC);
if (!rcast)
return -ENOMEM;
if (dst_cache_init(&rcast->dst_cache, GFP_ATOMIC)) {
kfree(rcast);
return -ENOMEM;
}
memcpy(&rcast->addr, addr, sizeof(struct udp_media_addr));
if (ntohs(addr->proto) == ETH_P_IP)
pr_info("New replicast peer: %pI4\n", &rcast->addr.ipv4);
#if IS_ENABLED(CONFIG_IPV6)
else if (ntohs(addr->proto) == ETH_P_IPV6)
pr_info("New replicast peer: %pI6\n", &rcast->addr.ipv6);
#endif
b->bcast_addr.broadcast = TIPC_REPLICAST_SUPPORT;
list_add_rcu(&rcast->list, &ub->rcast.list);
return 0;
}
static int tipc_udp_rcast_disc(struct tipc_bearer *b, struct sk_buff *skb)
{
struct udp_media_addr src = {0};
struct udp_media_addr *dst;
dst = (struct udp_media_addr *)&b->bcast_addr.value;
if (tipc_udp_is_mcast_addr(dst))
return 0;
src.port = udp_hdr(skb)->source;
if (ip_hdr(skb)->version == 4) {
struct iphdr *iphdr = ip_hdr(skb);
src.proto = htons(ETH_P_IP);
src.ipv4.s_addr = iphdr->saddr;
if (ipv4_is_multicast(iphdr->daddr))
return 0;
#if IS_ENABLED(CONFIG_IPV6)
} else if (ip_hdr(skb)->version == 6) {
struct ipv6hdr *iphdr = ipv6_hdr(skb);
src.proto = htons(ETH_P_IPV6);
src.ipv6 = iphdr->saddr;
if (ipv6_addr_is_multicast(&iphdr->daddr))
return 0;
#endif
} else {
return 0;
}
if (likely(tipc_udp_is_known_peer(b, &src)))
return 0;
return tipc_udp_rcast_add(b, &src);
}
/* tipc_udp_recv - read data from bearer socket */
static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb)
{
struct udp_bearer *ub;
struct tipc_bearer *b;
struct tipc_msg *hdr;
int err;
ub = rcu_dereference_sk_user_data(sk);
if (!ub) {
pr_err_ratelimited("Failed to get UDP bearer reference");
goto out;
}
skb_pull(skb, sizeof(struct udphdr));
hdr = buf_msg(skb);
b = rcu_dereference(ub->bearer);
if (!b)
goto out;
if (b && test_bit(0, &b->up)) {
tipc: introduce TIPC encryption & authentication This commit offers an option to encrypt and authenticate all messaging, including the neighbor discovery messages. The currently most advanced algorithm supported is the AEAD AES-GCM (like IPSec or TLS). All encryption/decryption is done at the bearer layer, just before leaving or after entering TIPC. Supported features: - Encryption & authentication of all TIPC messages (header + data); - Two symmetric-key modes: Cluster and Per-node; - Automatic key switching; - Key-expired revoking (sequence number wrapped); - Lock-free encryption/decryption (RCU); - Asynchronous crypto, Intel AES-NI supported; - Multiple cipher transforms; - Logs & statistics; Two key modes: - Cluster key mode: One single key is used for both TX & RX in all nodes in the cluster. - Per-node key mode: Each nodes in the cluster has one specific TX key. For RX, a node requires its peers' TX key to be able to decrypt the messages from those peers. Key setting from user-space is performed via netlink by a user program (e.g. the iproute2 'tipc' tool). Internal key state machine: Attach Align(RX) +-+ +-+ | V | V +---------+ Attach +---------+ | IDLE |---------------->| PENDING |(user = 0) +---------+ +---------+ A A Switch| A | | | | | | Free(switch/revoked) | | (Free)| +----------------------+ | |Timeout | (TX) | | |(RX) | | | | | | v | +---------+ Switch +---------+ | PASSIVE |<----------------| ACTIVE | +---------+ (RX) +---------+ (user = 1) (user >= 1) The number of TFMs is 10 by default and can be changed via the procfs 'net/tipc/max_tfms'. At this moment, as for simplicity, this file is also used to print the crypto statistics at runtime: echo 0xfff1 > /proc/sys/net/tipc/max_tfms The patch defines a new TIPC version (v7) for the encryption message (- backward compatibility as well). The message is basically encapsulated as follows: +----------------------------------------------------------+ | TIPCv7 encryption | Original TIPCv2 | Authentication | | header | packet (encrypted) | Tag | +----------------------------------------------------------+ The throughput is about ~40% for small messages (compared with non- encryption) and ~9% for large messages. With the support from hardware crypto i.e. the Intel AES-NI CPU instructions, the throughput increases upto ~85% for small messages and ~55% for large messages. By default, the new feature is inactive (i.e. no encryption) until user sets a key for TIPC. There is however also a new option - "TIPC_CRYPTO" in the kernel configuration to enable/disable the new code when needed. MAINTAINERS | add two new files 'crypto.h' & 'crypto.c' in tipc Acked-by: Ying Xue <ying.xue@windreiver.com> Acked-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: Tuong Lien <tuong.t.lien@dektech.com.au> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-11-08 05:05:11 +00:00
TIPC_SKB_CB(skb)->flags = 0;
tipc_rcv(sock_net(sk), skb, b);
return 0;
}
if (unlikely(msg_user(hdr) == LINK_CONFIG)) {
err = tipc_udp_rcast_disc(b, skb);
if (err)
goto out;
}
out:
kfree_skb(skb);
return 0;
}
static int enable_mcast(struct udp_bearer *ub, struct udp_media_addr *remote)
{
int err = 0;
struct ip_mreqn mreqn;
struct sock *sk = ub->ubsock->sk;
if (ntohs(remote->proto) == ETH_P_IP) {
mreqn.imr_multiaddr = remote->ipv4;
mreqn.imr_ifindex = ub->ifindex;
err = ip_mc_join_group(sk, &mreqn);
#if IS_ENABLED(CONFIG_IPV6)
} else {
err = ipv6_stub->ipv6_sock_mc_join(sk, ub->ifindex,
&remote->ipv6);
#endif
}
return err;
}
static int __tipc_nl_add_udp_addr(struct sk_buff *skb,
struct udp_media_addr *addr, int nla_t)
{
if (ntohs(addr->proto) == ETH_P_IP) {
struct sockaddr_in ip4;
memset(&ip4, 0, sizeof(ip4));
ip4.sin_family = AF_INET;
ip4.sin_port = addr->port;
ip4.sin_addr.s_addr = addr->ipv4.s_addr;
if (nla_put(skb, nla_t, sizeof(ip4), &ip4))
return -EMSGSIZE;
#if IS_ENABLED(CONFIG_IPV6)
} else if (ntohs(addr->proto) == ETH_P_IPV6) {
struct sockaddr_in6 ip6;
memset(&ip6, 0, sizeof(ip6));
ip6.sin6_family = AF_INET6;
ip6.sin6_port = addr->port;
memcpy(&ip6.sin6_addr, &addr->ipv6, sizeof(struct in6_addr));
if (nla_put(skb, nla_t, sizeof(ip6), &ip6))
return -EMSGSIZE;
#endif
}
return 0;
}
int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb)
{
u32 bid = cb->args[0];
u32 skip_cnt = cb->args[1];
u32 portid = NETLINK_CB(cb->skb).portid;
struct udp_replicast *rcast, *tmp;
struct tipc_bearer *b;
struct udp_bearer *ub;
void *hdr;
int err;
int i;
if (!bid && !skip_cnt) {
struct nlattr **attrs = genl_dumpit_info(cb)->attrs;
struct net *net = sock_net(skb->sk);
struct nlattr *battrs[TIPC_NLA_BEARER_MAX + 1];
char *bname;
if (!attrs[TIPC_NLA_BEARER])
return -EINVAL;
netlink: make validation more configurable for future strictness We currently have two levels of strict validation: 1) liberal (default) - undefined (type >= max) & NLA_UNSPEC attributes accepted - attribute length >= expected accepted - garbage at end of message accepted 2) strict (opt-in) - NLA_UNSPEC attributes accepted - attribute length >= expected accepted Split out parsing strictness into four different options: * TRAILING - check that there's no trailing data after parsing attributes (in message or nested) * MAXTYPE - reject attrs > max known type * UNSPEC - reject attributes with NLA_UNSPEC policy entries * STRICT_ATTRS - strictly validate attribute size The default for future things should be *everything*. The current *_strict() is a combination of TRAILING and MAXTYPE, and is renamed to _deprecated_strict(). The current regular parsing has none of this, and is renamed to *_parse_deprecated(). Additionally it allows us to selectively set one of the new flags even on old policies. Notably, the UNSPEC flag could be useful in this case, since it can be arranged (by filling in the policy) to not be an incompatible userspace ABI change, but would then going forward prevent forgetting attribute entries. Similar can apply to the POLICY flag. We end up with the following renames: * nla_parse -> nla_parse_deprecated * nla_parse_strict -> nla_parse_deprecated_strict * nlmsg_parse -> nlmsg_parse_deprecated * nlmsg_parse_strict -> nlmsg_parse_deprecated_strict * nla_parse_nested -> nla_parse_nested_deprecated * nla_validate_nested -> nla_validate_nested_deprecated Using spatch, of course: @@ expression TB, MAX, HEAD, LEN, POL, EXT; @@ -nla_parse(TB, MAX, HEAD, LEN, POL, EXT) +nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression TB, MAX, NLA, POL, EXT; @@ -nla_parse_nested(TB, MAX, NLA, POL, EXT) +nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT) @@ expression START, MAX, POL, EXT; @@ -nla_validate_nested(START, MAX, POL, EXT) +nla_validate_nested_deprecated(START, MAX, POL, EXT) @@ expression NLH, HDRLEN, MAX, POL, EXT; @@ -nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT) +nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT) For this patch, don't actually add the strict, non-renamed versions yet so that it breaks compile if I get it wrong. Also, while at it, make nla_validate and nla_parse go down to a common __nla_validate_parse() function to avoid code duplication. Ultimately, this allows us to have very strict validation for every new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the next patch, while existing things will continue to work as is. In effect then, this adds fully strict validation for any new command. Signed-off-by: Johannes Berg <johannes.berg@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 12:07:28 +00:00
err = nla_parse_nested_deprecated(battrs, TIPC_NLA_BEARER_MAX,
attrs[TIPC_NLA_BEARER],
tipc_nl_bearer_policy, NULL);
if (err)
return err;
if (!battrs[TIPC_NLA_BEARER_NAME])
return -EINVAL;
bname = nla_data(battrs[TIPC_NLA_BEARER_NAME]);
rtnl_lock();
b = tipc_bearer_find(net, bname);
if (!b) {
rtnl_unlock();
return -EINVAL;
}
bid = b->identity;
} else {
struct net *net = sock_net(skb->sk);
struct tipc_net *tn = net_generic(net, tipc_net_id);
rtnl_lock();
b = rtnl_dereference(tn->bearer_list[bid]);
if (!b) {
rtnl_unlock();
return -EINVAL;
}
}
ub = rtnl_dereference(b->media_ptr);
if (!ub) {
rtnl_unlock();
return -EINVAL;
}
i = 0;
list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) {
if (i < skip_cnt)
goto count;
hdr = genlmsg_put(skb, portid, cb->nlh->nlmsg_seq,
&tipc_genl_family, NLM_F_MULTI,
TIPC_NL_BEARER_GET);
if (!hdr)
goto done;
err = __tipc_nl_add_udp_addr(skb, &rcast->addr,
TIPC_NLA_UDP_REMOTE);
if (err) {
genlmsg_cancel(skb, hdr);
goto done;
}
genlmsg_end(skb, hdr);
count:
i++;
}
done:
rtnl_unlock();
cb->args[0] = bid;
cb->args[1] = i;
return skb->len;
}
int tipc_udp_nl_add_bearer_data(struct tipc_nl_msg *msg, struct tipc_bearer *b)
{
struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value;
struct udp_media_addr *dst;
struct udp_bearer *ub;
struct nlattr *nest;
ub = rtnl_dereference(b->media_ptr);
if (!ub)
return -ENODEV;
nest = nla_nest_start_noflag(msg->skb, TIPC_NLA_BEARER_UDP_OPTS);
if (!nest)
goto msg_full;
if (__tipc_nl_add_udp_addr(msg->skb, src, TIPC_NLA_UDP_LOCAL))
goto msg_full;
dst = (struct udp_media_addr *)&b->bcast_addr.value;
if (__tipc_nl_add_udp_addr(msg->skb, dst, TIPC_NLA_UDP_REMOTE))
goto msg_full;
if (!list_empty(&ub->rcast.list)) {
if (nla_put_flag(msg->skb, TIPC_NLA_UDP_MULTI_REMOTEIP))
goto msg_full;
}
nla_nest_end(msg->skb, nest);
return 0;
msg_full:
nla_nest_cancel(msg->skb, nest);
return -EMSGSIZE;
}
/**
* tipc_parse_udp_addr - build udp media address from netlink data
* @nlattr: netlink attribute containing sockaddr storage aligned address
* @addr: tipc media address to fill with address, port and protocol type
* @scope_id: IPv6 scope id pointer, not NULL indicates it's required
*/
static int tipc_parse_udp_addr(struct nlattr *nla, struct udp_media_addr *addr,
u32 *scope_id)
{
struct sockaddr_storage sa;
nla_memcpy(&sa, nla, sizeof(sa));
if (sa.ss_family == AF_INET) {
struct sockaddr_in *ip4 = (struct sockaddr_in *)&sa;
addr->proto = htons(ETH_P_IP);
addr->port = ip4->sin_port;
addr->ipv4.s_addr = ip4->sin_addr.s_addr;
return 0;
#if IS_ENABLED(CONFIG_IPV6)
} else if (sa.ss_family == AF_INET6) {
struct sockaddr_in6 *ip6 = (struct sockaddr_in6 *)&sa;
addr->proto = htons(ETH_P_IPV6);
addr->port = ip6->sin6_port;
memcpy(&addr->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr));
/* Scope ID is only interesting for local addresses */
if (scope_id) {
int atype;
atype = ipv6_addr_type(&ip6->sin6_addr);
if (__ipv6_addr_needs_scope_id(atype) &&
!ip6->sin6_scope_id) {
return -EINVAL;
}
*scope_id = ip6->sin6_scope_id ? : 0;
}
return 0;
#endif
}
return -EADDRNOTAVAIL;
}
int tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr)
{
int err;
struct udp_media_addr addr = {0};
struct nlattr *opts[TIPC_NLA_UDP_MAX + 1];
struct udp_media_addr *dst;
netlink: make validation more configurable for future strictness We currently have two levels of strict validation: 1) liberal (default) - undefined (type >= max) & NLA_UNSPEC attributes accepted - attribute length >= expected accepted - garbage at end of message accepted 2) strict (opt-in) - NLA_UNSPEC attributes accepted - attribute length >= expected accepted Split out parsing strictness into four different options: * TRAILING - check that there's no trailing data after parsing attributes (in message or nested) * MAXTYPE - reject attrs > max known type * UNSPEC - reject attributes with NLA_UNSPEC policy entries * STRICT_ATTRS - strictly validate attribute size The default for future things should be *everything*. The current *_strict() is a combination of TRAILING and MAXTYPE, and is renamed to _deprecated_strict(). The current regular parsing has none of this, and is renamed to *_parse_deprecated(). Additionally it allows us to selectively set one of the new flags even on old policies. Notably, the UNSPEC flag could be useful in this case, since it can be arranged (by filling in the policy) to not be an incompatible userspace ABI change, but would then going forward prevent forgetting attribute entries. Similar can apply to the POLICY flag. We end up with the following renames: * nla_parse -> nla_parse_deprecated * nla_parse_strict -> nla_parse_deprecated_strict * nlmsg_parse -> nlmsg_parse_deprecated * nlmsg_parse_strict -> nlmsg_parse_deprecated_strict * nla_parse_nested -> nla_parse_nested_deprecated * nla_validate_nested -> nla_validate_nested_deprecated Using spatch, of course: @@ expression TB, MAX, HEAD, LEN, POL, EXT; @@ -nla_parse(TB, MAX, HEAD, LEN, POL, EXT) +nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression TB, MAX, NLA, POL, EXT; @@ -nla_parse_nested(TB, MAX, NLA, POL, EXT) +nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT) @@ expression START, MAX, POL, EXT; @@ -nla_validate_nested(START, MAX, POL, EXT) +nla_validate_nested_deprecated(START, MAX, POL, EXT) @@ expression NLH, HDRLEN, MAX, POL, EXT; @@ -nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT) +nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT) For this patch, don't actually add the strict, non-renamed versions yet so that it breaks compile if I get it wrong. Also, while at it, make nla_validate and nla_parse go down to a common __nla_validate_parse() function to avoid code duplication. Ultimately, this allows us to have very strict validation for every new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the next patch, while existing things will continue to work as is. In effect then, this adds fully strict validation for any new command. Signed-off-by: Johannes Berg <johannes.berg@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 12:07:28 +00:00
if (nla_parse_nested_deprecated(opts, TIPC_NLA_UDP_MAX, attr, tipc_nl_udp_policy, NULL))
return -EINVAL;
if (!opts[TIPC_NLA_UDP_REMOTE])
return -EINVAL;
err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], &addr, NULL);
if (err)
return err;
dst = (struct udp_media_addr *)&b->bcast_addr.value;
if (tipc_udp_is_mcast_addr(dst)) {
pr_err("Can't add remote ip to TIPC UDP multicast bearer\n");
return -EINVAL;
}
if (tipc_udp_is_known_peer(b, &addr))
return 0;
return tipc_udp_rcast_add(b, &addr);
}
/**
* tipc_udp_enable - callback to create a new udp bearer instance
* @net: network namespace
* @b: pointer to generic tipc_bearer
* @attrs: netlink bearer configuration
*
* validate the bearer parameters and initialize the udp bearer
* rtnl_lock should be held
*/
static int tipc_udp_enable(struct net *net, struct tipc_bearer *b,
struct nlattr *attrs[])
{
int err = -EINVAL;
struct udp_bearer *ub;
struct udp_media_addr remote = {0};
struct udp_media_addr local = {0};
struct udp_port_cfg udp_conf = {0};
struct udp_tunnel_sock_cfg tuncfg = {NULL};
struct nlattr *opts[TIPC_NLA_UDP_MAX + 1];
u8 node_id[NODE_ID_LEN] = {0,};
int rmcast = 0;
ub = kzalloc(sizeof(*ub), GFP_ATOMIC);
if (!ub)
return -ENOMEM;
INIT_LIST_HEAD(&ub->rcast.list);
if (!attrs[TIPC_NLA_BEARER_UDP_OPTS])
goto err;
netlink: make validation more configurable for future strictness We currently have two levels of strict validation: 1) liberal (default) - undefined (type >= max) & NLA_UNSPEC attributes accepted - attribute length >= expected accepted - garbage at end of message accepted 2) strict (opt-in) - NLA_UNSPEC attributes accepted - attribute length >= expected accepted Split out parsing strictness into four different options: * TRAILING - check that there's no trailing data after parsing attributes (in message or nested) * MAXTYPE - reject attrs > max known type * UNSPEC - reject attributes with NLA_UNSPEC policy entries * STRICT_ATTRS - strictly validate attribute size The default for future things should be *everything*. The current *_strict() is a combination of TRAILING and MAXTYPE, and is renamed to _deprecated_strict(). The current regular parsing has none of this, and is renamed to *_parse_deprecated(). Additionally it allows us to selectively set one of the new flags even on old policies. Notably, the UNSPEC flag could be useful in this case, since it can be arranged (by filling in the policy) to not be an incompatible userspace ABI change, but would then going forward prevent forgetting attribute entries. Similar can apply to the POLICY flag. We end up with the following renames: * nla_parse -> nla_parse_deprecated * nla_parse_strict -> nla_parse_deprecated_strict * nlmsg_parse -> nlmsg_parse_deprecated * nlmsg_parse_strict -> nlmsg_parse_deprecated_strict * nla_parse_nested -> nla_parse_nested_deprecated * nla_validate_nested -> nla_validate_nested_deprecated Using spatch, of course: @@ expression TB, MAX, HEAD, LEN, POL, EXT; @@ -nla_parse(TB, MAX, HEAD, LEN, POL, EXT) +nla_parse_deprecated(TB, MAX, HEAD, LEN, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression NLH, HDRLEN, TB, MAX, POL, EXT; @@ -nlmsg_parse_strict(NLH, HDRLEN, TB, MAX, POL, EXT) +nlmsg_parse_deprecated_strict(NLH, HDRLEN, TB, MAX, POL, EXT) @@ expression TB, MAX, NLA, POL, EXT; @@ -nla_parse_nested(TB, MAX, NLA, POL, EXT) +nla_parse_nested_deprecated(TB, MAX, NLA, POL, EXT) @@ expression START, MAX, POL, EXT; @@ -nla_validate_nested(START, MAX, POL, EXT) +nla_validate_nested_deprecated(START, MAX, POL, EXT) @@ expression NLH, HDRLEN, MAX, POL, EXT; @@ -nlmsg_validate(NLH, HDRLEN, MAX, POL, EXT) +nlmsg_validate_deprecated(NLH, HDRLEN, MAX, POL, EXT) For this patch, don't actually add the strict, non-renamed versions yet so that it breaks compile if I get it wrong. Also, while at it, make nla_validate and nla_parse go down to a common __nla_validate_parse() function to avoid code duplication. Ultimately, this allows us to have very strict validation for every new caller of nla_parse()/nlmsg_parse() etc as re-introduced in the next patch, while existing things will continue to work as is. In effect then, this adds fully strict validation for any new command. Signed-off-by: Johannes Berg <johannes.berg@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-04-26 12:07:28 +00:00
if (nla_parse_nested_deprecated(opts, TIPC_NLA_UDP_MAX, attrs[TIPC_NLA_BEARER_UDP_OPTS], tipc_nl_udp_policy, NULL))
goto err;
if (!opts[TIPC_NLA_UDP_LOCAL] || !opts[TIPC_NLA_UDP_REMOTE]) {
pr_err("Invalid UDP bearer configuration");
err = -EINVAL;
goto err;
}
err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_LOCAL], &local,
&ub->ifindex);
if (err)
goto err;
err = tipc_parse_udp_addr(opts[TIPC_NLA_UDP_REMOTE], &remote, NULL);
if (err)
goto err;
if (remote.proto != local.proto) {
err = -EINVAL;
goto err;
}
/* Checking remote ip address */
rmcast = tipc_udp_is_mcast_addr(&remote);
/* Autoconfigure own node identity if needed */
if (!tipc_own_id(net)) {
memcpy(node_id, local.ipv6.in6_u.u6_addr8, 16);
tipc_net_init(net, node_id, 0);
}
if (!tipc_own_id(net)) {
pr_warn("Failed to set node id, please configure manually\n");
err = -EINVAL;
goto err;
}
b->bcast_addr.media_id = TIPC_MEDIA_TYPE_UDP;
b->bcast_addr.broadcast = TIPC_BROADCAST_SUPPORT;
rcu_assign_pointer(b->media_ptr, ub);
rcu_assign_pointer(ub->bearer, b);
tipc_udp_media_addr_set(&b->addr, &local);
if (local.proto == htons(ETH_P_IP)) {
struct net_device *dev;
dev = __ip_dev_find(net, local.ipv4.s_addr, false);
if (!dev) {
err = -ENODEV;
goto err;
}
udp_conf.family = AF_INET;
/* Switch to use ANY to receive packets from group */
if (rmcast)
udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
else
udp_conf.local_ip.s_addr = local.ipv4.s_addr;
udp_conf.use_udp_checksums = false;
ub->ifindex = dev->ifindex;
if (tipc_mtu_bad(dev, sizeof(struct iphdr) +
sizeof(struct udphdr))) {
err = -EINVAL;
goto err;
}
b->mtu = b->media->mtu;
#if IS_ENABLED(CONFIG_IPV6)
} else if (local.proto == htons(ETH_P_IPV6)) {
udp_conf.family = AF_INET6;
udp_conf.use_udp6_tx_checksums = true;
udp_conf.use_udp6_rx_checksums = true;
if (rmcast)
udp_conf.local_ip6 = in6addr_any;
else
udp_conf.local_ip6 = local.ipv6;
b->mtu = 1280;
#endif
} else {
err = -EAFNOSUPPORT;
goto err;
}
udp_conf.local_udp_port = local.port;
err = udp_sock_create(net, &udp_conf, &ub->ubsock);
if (err)
goto err;
tuncfg.sk_user_data = ub;
tuncfg.encap_type = 1;
tuncfg.encap_rcv = tipc_udp_recv;
tuncfg.encap_destroy = NULL;
setup_udp_tunnel_sock(net, ub->ubsock, &tuncfg);
err = dst_cache_init(&ub->rcast.dst_cache, GFP_ATOMIC);
if (err)
goto free;
/**
* The bcast media address port is used for all peers and the ip
* is used if it's a multicast address.
*/
memcpy(&b->bcast_addr.value, &remote, sizeof(remote));
if (rmcast)
err = enable_mcast(ub, &remote);
else
err = tipc_udp_rcast_add(b, &remote);
if (err)
goto free;
return 0;
free:
dst_cache_destroy(&ub->rcast.dst_cache);
udp_tunnel_sock_release(ub->ubsock);
err:
kfree(ub);
return err;
}
/* cleanup_bearer - break the socket/bearer association */
static void cleanup_bearer(struct work_struct *work)
{
struct udp_bearer *ub = container_of(work, struct udp_bearer, work);
struct udp_replicast *rcast, *tmp;
list_for_each_entry_safe(rcast, tmp, &ub->rcast.list, list) {
dst_cache_destroy(&rcast->dst_cache);
list_del_rcu(&rcast->list);
kfree_rcu(rcast, rcu);
}
dst_cache_destroy(&ub->rcast.dst_cache);
udp_tunnel_sock_release(ub->ubsock);
synchronize_net();
kfree(ub);
}
/* tipc_udp_disable - detach bearer from socket */
static void tipc_udp_disable(struct tipc_bearer *b)
{
struct udp_bearer *ub;
ub = rtnl_dereference(b->media_ptr);
if (!ub) {
pr_err("UDP bearer instance not found\n");
return;
}
sock_set_flag(ub->ubsock->sk, SOCK_DEAD);
RCU_INIT_POINTER(ub->bearer, NULL);
/* sock_release need to be done outside of rtnl lock */
INIT_WORK(&ub->work, cleanup_bearer);
schedule_work(&ub->work);
}
struct tipc_media udp_media_info = {
.send_msg = tipc_udp_send_msg,
.enable_media = tipc_udp_enable,
.disable_media = tipc_udp_disable,
.addr2str = tipc_udp_addr2str,
.addr2msg = tipc_udp_addr2msg,
.msg2addr = tipc_udp_msg2addr,
.priority = TIPC_DEF_LINK_PRI,
.tolerance = TIPC_DEF_LINK_TOL,
tipc: introduce variable window congestion control We introduce a simple variable window congestion control for links. The algorithm is inspired by the Reno algorithm, covering both 'slow start', 'congestion avoidance', and 'fast recovery' modes. - We introduce hard lower and upper window limits per link, still different and configurable per bearer type. - We introduce a 'slow start theshold' variable, initially set to the maximum window size. - We let a link start at the minimum congestion window, i.e. in slow start mode, and then let is grow rapidly (+1 per rceived ACK) until it reaches the slow start threshold and enters congestion avoidance mode. - In congestion avoidance mode we increment the congestion window for each window-size number of acked packets, up to a possible maximum equal to the configured maximum window. - For each non-duplicate NACK received, we drop back to fast recovery mode, by setting the both the slow start threshold to and the congestion window to (current_congestion_window / 2). - If the timeout handler finds that the transmit queue has not moved since the previous timeout, it drops the link back to slow start and forces a probe containing the last sent sequence number to the sent to the peer, so that this can discover the stale situation. This change does in reality have effect only on unicast ethernet transport, as we have seen that there is no room whatsoever for increasing the window max size for the UDP bearer. For now, we also choose to keep the limits for the broadcast link unchanged and equal. This algorithm seems to give a 50-100% throughput improvement for messages larger than MTU. Suggested-by: Xin Long <lucien.xin@gmail.com> Acked-by: Ying Xue <ying.xue@windriver.com> Signed-off-by: Jon Maloy <jon.maloy@ericsson.com> Signed-off-by: David S. Miller <davem@davemloft.net>
2019-12-09 23:52:46 +00:00
.min_win = TIPC_DEF_LINK_WIN,
.max_win = TIPC_DEF_LINK_WIN,
.mtu = TIPC_DEF_LINK_UDP_MTU,
.type_id = TIPC_MEDIA_TYPE_UDP,
.hwaddr_len = 0,
.name = "udp"
};