bpf: add handling of BPF_LWT_REROUTE to lwt_bpf.c

This patch builds on top of the previous patch in the patchset,
which added BPF_LWT_ENCAP_IP mode to bpf_lwt_push_encap. As the
encapping can result in the skb needing to go via a different
interface/route/dst, bpf programs can indicate this by returning
BPF_LWT_REROUTE, which triggers a new route lookup for the skb.

v8 changes: fix kbuild errors when LWTUNNEL_BPF is builtin, but
   IPV6 is a module: as LWTUNNEL_BPF can only be either Y or N,
   call IPV6 routing functions only if they are built-in.

v9 changes:
   - fixed a kbuild test robot compiler warning;
   - call IPV6 routing functions via ipv6_stub.

v10 changes: removed unnecessary IS_ENABLED and pr_warn_once.

v11 changes: fixed a potential dst leak.

Signed-off-by: Peter Oskolkov <posk@google.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
This commit is contained in:
Peter Oskolkov 2019-02-13 11:53:39 -08:00 committed by Alexei Starovoitov
parent 9b0a6a9dba
commit 3bd0b15281

View File

@ -17,6 +17,7 @@
#include <linux/bpf.h>
#include <net/lwtunnel.h>
#include <net/gre.h>
#include <net/ip6_route.h>
struct bpf_lwt_prog {
struct bpf_prog *prog;
@ -56,6 +57,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
switch (ret) {
case BPF_OK:
case BPF_LWT_REROUTE:
break;
case BPF_REDIRECT:
@ -88,6 +90,30 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
return ret;
}
/* Redo the input route lookup for an skb after an LWT BPF program returned
 * BPF_LWT_REROUTE, then hand the skb to the input handler of the new dst.
 *
 * Only IPv4 and IPv6 are handled; any other protocol yields -EAFNOSUPPORT.
 * On failure the skb is freed here and the error is returned; otherwise the
 * skb is passed to dst_input() and its result is returned.
 */
static int bpf_lwt_input_reroute(struct sk_buff *skb)
{
	int err = -EINVAL;

	if (skb->protocol == htons(ETH_P_IP)) {
		struct net_device *dev = skb_dst(skb)->dev;
		struct iphdr *iph = ip_hdr(skb);

		/* The skb still carries the dst that led to this LWT hook.
		 * The route lookup installs a new dst on the skb, so the old
		 * one has to be dropped first or its reference is lost.
		 * The device pointer is taken from the old dst, hence it is
		 * pinned while that dst is released and the lookup runs.
		 */
		dev_hold(dev);
		skb_dst_drop(skb);
		err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
					   iph->tos, dev);
		dev_put(dev);
	} else if (skb->protocol == htons(ETH_P_IPV6)) {
		/* Same as above: release the old dst before the lookup
		 * attaches a new one.
		 */
		skb_dst_drop(skb);
		err = ipv6_stub->ipv6_route_input(skb);
	} else {
		err = -EAFNOSUPPORT;
	}

	if (err)
		goto err;
	return dst_input(skb);

err:
	kfree_skb(skb);
	return err;
}
static int bpf_input(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
@ -99,11 +125,11 @@ static int bpf_input(struct sk_buff *skb)
ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
if (ret < 0)
return ret;
if (ret == BPF_LWT_REROUTE)
return bpf_lwt_input_reroute(skb);
}
if (unlikely(!dst->lwtstate->orig_input)) {
pr_warn_once("orig_input not set on dst for prog %s\n",
bpf->out.name);
kfree_skb(skb);
return -EINVAL;
}
@ -148,6 +174,91 @@ static int xmit_check_hhlen(struct sk_buff *skb)
return 0;
}
/* Redo the output route lookup for an skb after an LWT BPF xmit program
 * returned BPF_LWT_REROUTE, attach the resulting dst to the skb and send
 * it out through dst_output().
 *
 * The flow key is built from the packet's IPv4/IPv6 header, skb->mark and,
 * when a full socket is attached, that socket's bound device and netns.
 *
 * Returns LWTUNNEL_XMIT_DONE once dst_output() succeeded, the dst_output()
 * error if it failed, or a negative errno if rerouting was not possible.
 * In the latter case the skb is freed here, matching the other error paths
 * in this file that free the skb before returning a negative errno.
 */
static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
{
	struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
	int oif = l3mdev ? l3mdev->ifindex : 0;
	struct dst_entry *dst = NULL;
	int err = -EAFNOSUPPORT;
	struct sock *sk;
	struct net *net;
	bool ipv4;

	if (skb->protocol == htons(ETH_P_IP))
		ipv4 = true;
	else if (skb->protocol == htons(ETH_P_IPV6))
		ipv4 = false;
	else
		goto drop;

	sk = sk_to_full_sk(skb->sk);
	if (sk) {
		/* A socket bound to a device overrides the l3mdev oif. */
		if (sk->sk_bound_dev_if)
			oif = sk->sk_bound_dev_if;
		net = sock_net(sk);
	} else {
		net = dev_net(skb_dst(skb)->dev);
	}

	if (ipv4) {
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {};
		struct rtable *rt;

		fl4.flowi4_oif = oif;
		fl4.flowi4_mark = skb->mark;
		fl4.flowi4_uid = sock_net_uid(net, sk);
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
		fl4.flowi4_proto = iph->protocol;
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;

		rt = ip_route_output_key(net, &fl4);
		if (IS_ERR(rt)) {
			err = -EINVAL;
			goto drop;
		}
		dst = &rt->dst;
	} else {
		struct ipv6hdr *iph6 = ipv6_hdr(skb);
		struct flowi6 fl6 = {};

		fl6.flowi6_oif = oif;
		fl6.flowi6_mark = skb->mark;
		fl6.flowi6_uid = sock_net_uid(net, sk);
		fl6.flowlabel = ip6_flowinfo(iph6);
		fl6.flowi6_proto = iph6->nexthdr;
		fl6.daddr = iph6->daddr;
		fl6.saddr = iph6->saddr;

		err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6);
		if (err || IS_ERR(dst)) {
			err = -EINVAL;
			goto drop;
		}
	}

	if (unlikely(dst->error)) {
		dst_release(dst);
		err = -EINVAL;
		goto drop;
	}

	/* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
	 * was done for the previous dst, so we are doing it here again, in
	 * case the new dst needs much more space. The call below is a noop
	 * if there is enough header space in skb.
	 */
	err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
	if (unlikely(err)) {
		/* The new dst is not attached to the skb yet, so freeing
		 * the skb would not release it; drop our reference here.
		 */
		dst_release(dst);
		goto drop;
	}

	skb_dst_drop(skb);
	skb_dst_set(skb, dst);

	/* From here on the skb belongs to dst_output(); it must not be
	 * freed again on failure.
	 * NOTE(review): dst_output() may also return positive NET_XMIT_*
	 * codes; confirm the caller does not mistake those for
	 * LWTUNNEL_XMIT_CONTINUE.
	 */
	err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
	if (unlikely(err))
		return err;

	/* ip[6]_finish_output2 understands LWTUNNEL_XMIT_DONE */
	return LWTUNNEL_XMIT_DONE;

drop:
	kfree_skb(skb);
	return err;
}
static int bpf_xmit(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
@ -155,11 +266,20 @@ static int bpf_xmit(struct sk_buff *skb)
bpf = bpf_lwt_lwtunnel(dst->lwtstate);
if (bpf->xmit.prog) {
__be16 proto = skb->protocol;
int ret;
ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
switch (ret) {
case BPF_OK:
/* If the header changed, e.g. via bpf_lwt_push_encap,
* BPF_LWT_REROUTE below should have been used if the
* protocol was also changed.
*/
if (skb->protocol != proto) {
kfree_skb(skb);
return -EINVAL;
}
/* If the header was expanded, headroom might be too
* small for L2 header to come, expand as needed.
*/
@ -170,6 +290,8 @@ static int bpf_xmit(struct sk_buff *skb)
return LWTUNNEL_XMIT_CONTINUE;
case BPF_REDIRECT:
return LWTUNNEL_XMIT_DONE;
case BPF_LWT_REROUTE:
return bpf_lwt_xmit_reroute(skb);
default:
return ret;
}