linux/net/ipv6/netfilter.c
Jason A. Donenfeld 46d6c5ae95 netfilter: use actual socket sk rather than skb sk when routing harder
If netfilter changes the packet mark when mangling, the packet is
rerouted using the route_me_harder set of functions. Prior to this
commit, there's one big difference between route_me_harder and the
ordinary initial routing functions, described in the comment above
__ip_queue_xmit():

   /* Note: skb->sk can be different from sk, in case of tunnels */
   int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,

That function goes on to correctly make use of sk->sk_bound_dev_if,
rather than skb->sk->sk_bound_dev_if. And indeed the comment is true: a
tunnel will receive a packet in ndo_start_xmit with an initial skb->sk.
It will make some transformations to that packet, and then it will send
the encapsulated packet out of a *new* socket. That new socket will
basically always have a different sk_bound_dev_if (otherwise there'd be
a routing loop). So for the purposes of routing the encapsulated packet,
the routing information as it pertains to the socket should come from
that socket's sk, rather than the packet's original skb->sk. For that
reason __ip_queue_xmit() and related functions all do the right thing.

One might argue that all tunnels should just call skb_orphan(skb) before
transmitting the encapsulated packet into the new socket. But tunnels do
*not* do this -- and this is wisely avoided in skb_scrub_packet() too --
because features like TSQ rely on skb->destructor() being called when
that buffer space is truely available again. Calling skb_orphan(skb) too
early would result in buffers filling up unnecessarily and accounting
info being all wrong. Instead, additional routing must take into account
the new sk, just as __ip_queue_xmit() notes.

So, this commit addresses the problem by fishing the correct sk out of
state->sk -- it's already set properly in the call to nf_hook() in
__ip_local_out(), which receives the sk as part of its normal
functionality. So we make sure to plumb state->sk through the various
route_me_harder functions, and then make correct use of it following the
example of __ip_queue_xmit().

Fixes: 1da177e4c3 ("Linux-2.6.12-rc2")
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
Reviewed-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
2020-10-30 12:57:39 +01:00

266 lines
6.5 KiB
C

/*
* IPv6 specific functions of netfilter core
*
* Rusty Russell (C) 2000 -- This code is GPL.
* Patrick McHardy (C) 2006-2012
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/ipv6.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>
#include <linux/export.h>
#include <net/addrconf.h>
#include <net/dst.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
#include <net/xfrm.h>
#include <net/netfilter/nf_queue.h>
#include <net/netfilter/nf_conntrack_bridge.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include "../bridge/br_private.h"
int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff *skb)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
struct sock *sk = sk_to_full_sk(sk_partial);
unsigned int hh_len;
struct dst_entry *dst;
int strict = (ipv6_addr_type(&iph->daddr) &
(IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
struct flowi6 fl6 = {
.flowi6_oif = sk && sk->sk_bound_dev_if ? sk->sk_bound_dev_if :
strict ? skb_dst(skb)->dev->ifindex : 0,
.flowi6_mark = skb->mark,
.flowi6_uid = sock_net_uid(net, sk),
.daddr = iph->daddr,
.saddr = iph->saddr,
};
int err;
dst = ip6_route_output(net, sk, &fl6);
err = dst->error;
if (err) {
IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
net_dbg_ratelimited("ip6_route_me_harder: No more route\n");
dst_release(dst);
return err;
}
/* Drop old route. */
skb_dst_drop(skb);
skb_dst_set(skb, dst);
#ifdef CONFIG_XFRM
if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
xfrm_decode_session(skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) {
skb_dst_set(skb, NULL);
dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0);
if (IS_ERR(dst))
return PTR_ERR(dst);
skb_dst_set(skb, dst);
}
#endif
/* Change in oif may mean change in hh_len. */
hh_len = skb_dst(skb)->dev->hard_header_len;
if (skb_headroom(skb) < hh_len &&
pskb_expand_head(skb, HH_DATA_ALIGN(hh_len - skb_headroom(skb)),
0, GFP_ATOMIC))
return -ENOMEM;
return 0;
}
EXPORT_SYMBOL(ip6_route_me_harder);
static int nf_ip6_reroute(struct sk_buff *skb,
const struct nf_queue_entry *entry)
{
struct ip6_rt_info *rt_info = nf_queue_entry_reroute(entry);
if (entry->state.hook == NF_INET_LOCAL_OUT) {
const struct ipv6hdr *iph = ipv6_hdr(skb);
if (!ipv6_addr_equal(&iph->daddr, &rt_info->daddr) ||
!ipv6_addr_equal(&iph->saddr, &rt_info->saddr) ||
skb->mark != rt_info->mark)
return ip6_route_me_harder(entry->state.net, entry->state.sk, skb);
}
return 0;
}
int __nf_ip6_route(struct net *net, struct dst_entry **dst,
struct flowi *fl, bool strict)
{
static const struct ipv6_pinfo fake_pinfo;
static const struct inet_sock fake_sk = {
/* makes ip6_route_output set RT6_LOOKUP_F_IFACE: */
.sk.sk_bound_dev_if = 1,
.pinet6 = (struct ipv6_pinfo *) &fake_pinfo,
};
const void *sk = strict ? &fake_sk : NULL;
struct dst_entry *result;
int err;
result = ip6_route_output(net, sk, &fl->u.ip6);
err = result->error;
if (err)
dst_release(result);
else
*dst = result;
return err;
}
EXPORT_SYMBOL_GPL(__nf_ip6_route);
int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
struct nf_bridge_frag_data *data,
int (*output)(struct net *, struct sock *sk,
const struct nf_bridge_frag_data *data,
struct sk_buff *))
{
int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
ktime_t tstamp = skb->tstamp;
struct ip6_frag_state state;
u8 *prevhdr, nexthdr = 0;
unsigned int mtu, hlen;
int hroom, err = 0;
__be32 frag_id;
err = ip6_find_1stfragopt(skb, &prevhdr);
if (err < 0)
goto blackhole;
hlen = err;
nexthdr = *prevhdr;
mtu = skb->dev->mtu;
if (frag_max_size > mtu ||
frag_max_size < IPV6_MIN_MTU)
goto blackhole;
mtu = frag_max_size;
if (mtu < hlen + sizeof(struct frag_hdr) + 8)
goto blackhole;
mtu -= hlen + sizeof(struct frag_hdr);
frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
&ipv6_hdr(skb)->saddr);
if (skb->ip_summed == CHECKSUM_PARTIAL &&
(err = skb_checksum_help(skb)))
goto blackhole;
hroom = LL_RESERVED_SPACE(skb->dev);
if (skb_has_frag_list(skb)) {
unsigned int first_len = skb_pagelen(skb);
struct ip6_fraglist_iter iter;
struct sk_buff *frag2;
if (first_len - hlen > mtu ||
skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
goto blackhole;
if (skb_cloned(skb))
goto slow_path;
skb_walk_frags(skb, frag2) {
if (frag2->len > mtu ||
skb_headroom(frag2) < (hlen + hroom + sizeof(struct frag_hdr)))
goto blackhole;
/* Partially cloned skb? */
if (skb_shared(frag2))
goto slow_path;
}
err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
&iter);
if (err < 0)
goto blackhole;
for (;;) {
/* Prepare header of the next frame,
* before previous one went down.
*/
if (iter.frag)
ip6_fraglist_prepare(skb, &iter);
skb->tstamp = tstamp;
err = output(net, sk, data, skb);
if (err || !iter.frag)
break;
skb = ip6_fraglist_next(&iter);
}
kfree(iter.tmp_hdr);
if (!err)
return 0;
kfree_skb_list(iter.frag);
return err;
}
slow_path:
/* This is a linearized skbuff, the original geometry is lost for us.
* This may also be a clone skbuff, we could preserve the geometry for
* the copies but probably not worth the effort.
*/
ip6_frag_init(skb, hlen, mtu, skb->dev->needed_tailroom,
LL_RESERVED_SPACE(skb->dev), prevhdr, nexthdr, frag_id,
&state);
while (state.left > 0) {
struct sk_buff *skb2;
skb2 = ip6_frag_next(skb, &state);
if (IS_ERR(skb2)) {
err = PTR_ERR(skb2);
goto blackhole;
}
skb2->tstamp = tstamp;
err = output(net, sk, data, skb2);
if (err)
goto blackhole;
}
consume_skb(skb);
return err;
blackhole:
kfree_skb(skb);
return 0;
}
EXPORT_SYMBOL_GPL(br_ip6_fragment);
static const struct nf_ipv6_ops ipv6ops = {
#if IS_MODULE(CONFIG_IPV6)
.chk_addr = ipv6_chk_addr,
.route_me_harder = ip6_route_me_harder,
.dev_get_saddr = ipv6_dev_get_saddr,
.route = __nf_ip6_route,
#if IS_ENABLED(CONFIG_SYN_COOKIES)
.cookie_init_sequence = __cookie_v6_init_sequence,
.cookie_v6_check = __cookie_v6_check,
#endif
#endif
.route_input = ip6_route_input,
.fragment = ip6_fragment,
.reroute = nf_ip6_reroute,
#if IS_MODULE(CONFIG_IPV6)
.br_fragment = br_ip6_fragment,
#endif
};
int __init ipv6_netfilter_init(void)
{
RCU_INIT_POINTER(nf_ipv6_ops, &ipv6ops);
return 0;
}
/* This can be called from inet6_init() on errors, so it cannot
* be marked __exit. -DaveM
*/
void ipv6_netfilter_fini(void)
{
RCU_INIT_POINTER(nf_ipv6_ops, NULL);
}