mirror of
https://github.com/torvalds/linux.git
synced 2024-11-25 05:32:00 +00:00
cf8b49fbd0
Reproduce environment: network with 3 VM linuxs is connected as below: VM1<---->VM2(latest kernel 6.5.0-rc7)<---->VM3 VM1: eth0 ip: 192.168.122.207 MTU 1800 VM2: eth0 ip: 192.168.122.208, eth1 ip: 192.168.123.224 MTU 1500 VM3: eth0 ip: 192.168.123.240 MTU 1800 Reproduce: VM1 send 1600 bytes UDP data to VM3 using tools scapy with flags='DF'. scapy command: send(IP(dst="192.168.123.240",flags='DF')/UDP()/str('0'*1600),count=1, inter=1.000000) Result: Before IP data is sent. ---------------------------------------------------------------------- root@qemux86-64:~# cat /proc/net/snmp Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqdss Ip: 1 64 6 0 2 2 0 0 2 4 0 0 0 0 0 0 0 0 0 ...... root@qemux86-64:~# ---------------------------------------------------------------------- After IP data is sent. ---------------------------------------------------------------------- root@qemux86-64:~# cat /proc/net/snmp Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqdss Ip: 1 64 7 0 2 2 0 0 2 5 0 0 0 0 0 0 0 1 0 ...... root@qemux86-64:~# ---------------------------------------------------------------------- ForwDatagrams is always keeping 2 without increment. Issue description and patch: ip_exceeds_mtu() in ip_forward() drops this IP datagram because skb len (1600 sending by scapy) is over MTU(1500 in VM2) if "DF" is set. According to RFC 4293 "3.2.3. IP Statistics Tables", +-------+------>------+----->-----+----->-----+ | InForwDatagrams (6) | OutForwDatagrams (6) | | V +->-+ OutFragReqds | InNoRoutes | | (packets) / (local packet (3) | | | IF is that of the address | +--> OutFragFails | and may not be the receiving IF) | | (packets) the IPSTATS_MIB_OUTFORWDATAGRAMS should be counted before fragment check. 
The existing implementation, instead, increases the counter only after the fragment check: ip_exceeds_mtu() in ipv4 and ip6_pkt_too_big() in ipv6. So this patch moves the IPSTATS_MIB_OUTFORWDATAGRAMS counter to ip_forward() for ipv4 and ip6_forward() for ipv6. Test result with patch: Before IP data is sent. ---------------------------------------------------------------------- root@qemux86-64:~# cat /proc/net/snmp Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqdss Ip: 1 64 6 0 2 2 0 0 2 4 0 0 0 0 0 0 0 0 0 ...... root@qemux86-64:~# ---------------------------------------------------------------------- After IP data is sent. ---------------------------------------------------------------------- root@qemux86-64:~# cat /proc/net/snmp Ip: Forwarding DefaultTTL InReceives InHdrErrors InAddrErrors ForwDatagrams InUnknownProtos InDiscards InDelivers OutRequests OutDiscards OutNoRoutes ReasmTimeout ReasmReqdss Ip: 1 64 7 0 2 3 0 0 2 5 0 0 0 0 0 0 0 1 0 ...... root@qemux86-64:~# ---------------------------------------------------------------------- ForwDatagrams is updated from 2 to 3. Reviewed-by: Filip Pudak <filip.pudak@windriver.com> Signed-off-by: Heng Guo <heng.guo@windriver.com> Reviewed-by: David Ahern <dsahern@kernel.org> Link: https://lore.kernel.org/r/20231011015137.27262-1-heng.guo@windriver.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
182 lines
4.2 KiB
C
182 lines
4.2 KiB
C
// SPDX-License-Identifier: GPL-2.0
|
|
/*
|
|
* INET An implementation of the TCP/IP protocol suite for the LINUX
|
|
* operating system. INET is implemented using the BSD Socket
|
|
* interface as the means of communication with the user level.
|
|
*
|
|
* The IP forwarding functionality.
|
|
*
|
|
* Authors: see ip.c
|
|
*
|
|
* Fixes:
|
|
* Many : Split from ip.c , see ip_input.c for
|
|
* history.
|
|
* Dave Gregorich : NULL ip_rt_put fix for multicast
|
|
* routing.
|
|
* Jos Vos : Add call_out_firewall before sending,
|
|
* use output device for accounting.
|
|
* Jos Vos : Call forward firewall after routing
|
|
* (always use output device).
|
|
* Mike McLagan : Routing by source
|
|
*/
|
|
|
|
#include <linux/types.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/skbuff.h>
|
|
#include <linux/ip.h>
|
|
#include <linux/icmp.h>
|
|
#include <linux/netdevice.h>
|
|
#include <linux/slab.h>
|
|
#include <net/sock.h>
|
|
#include <net/ip.h>
|
|
#include <net/tcp.h>
|
|
#include <net/udp.h>
|
|
#include <net/icmp.h>
|
|
#include <linux/tcp.h>
|
|
#include <linux/udp.h>
|
|
#include <linux/netfilter_ipv4.h>
|
|
#include <net/checksum.h>
|
|
#include <linux/route.h>
|
|
#include <net/route.h>
|
|
#include <net/xfrm.h>
|
|
|
|
static bool ip_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu)
|
|
{
|
|
if (skb->len <= mtu)
|
|
return false;
|
|
|
|
if (unlikely((ip_hdr(skb)->frag_off & htons(IP_DF)) == 0))
|
|
return false;
|
|
|
|
/* original fragment exceeds mtu and DF is set */
|
|
if (unlikely(IPCB(skb)->frag_max_size > mtu))
|
|
return true;
|
|
|
|
if (skb->ignore_df)
|
|
return false;
|
|
|
|
if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
|
|
return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
|
|
static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
|
|
{
|
|
struct ip_options *opt = &(IPCB(skb)->opt);
|
|
|
|
#ifdef CONFIG_NET_SWITCHDEV
|
|
if (skb->offload_l3_fwd_mark) {
|
|
consume_skb(skb);
|
|
return 0;
|
|
}
|
|
#endif
|
|
|
|
if (unlikely(opt->optlen))
|
|
ip_forward_options(skb);
|
|
|
|
skb_clear_tstamp(skb);
|
|
return dst_output(net, sk, skb);
|
|
}
|
|
|
|
int ip_forward(struct sk_buff *skb)
|
|
{
|
|
u32 mtu;
|
|
struct iphdr *iph; /* Our header */
|
|
struct rtable *rt; /* Route we use */
|
|
struct ip_options *opt = &(IPCB(skb)->opt);
|
|
struct net *net;
|
|
SKB_DR(reason);
|
|
|
|
/* that should never happen */
|
|
if (skb->pkt_type != PACKET_HOST)
|
|
goto drop;
|
|
|
|
if (unlikely(skb->sk))
|
|
goto drop;
|
|
|
|
if (skb_warn_if_lro(skb))
|
|
goto drop;
|
|
|
|
if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
|
|
SKB_DR_SET(reason, XFRM_POLICY);
|
|
goto drop;
|
|
}
|
|
|
|
if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb))
|
|
return NET_RX_SUCCESS;
|
|
|
|
skb_forward_csum(skb);
|
|
net = dev_net(skb->dev);
|
|
|
|
/*
|
|
* According to the RFC, we must first decrease the TTL field. If
|
|
* that reaches zero, we must reply an ICMP control message telling
|
|
* that the packet's lifetime expired.
|
|
*/
|
|
if (ip_hdr(skb)->ttl <= 1)
|
|
goto too_many_hops;
|
|
|
|
if (!xfrm4_route_forward(skb)) {
|
|
SKB_DR_SET(reason, XFRM_POLICY);
|
|
goto drop;
|
|
}
|
|
|
|
rt = skb_rtable(skb);
|
|
|
|
if (opt->is_strictroute && rt->rt_uses_gateway)
|
|
goto sr_failed;
|
|
|
|
__IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
|
|
|
|
IPCB(skb)->flags |= IPSKB_FORWARDED;
|
|
mtu = ip_dst_mtu_maybe_forward(&rt->dst, true);
|
|
if (ip_exceeds_mtu(skb, mtu)) {
|
|
IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
|
|
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
|
|
htonl(mtu));
|
|
SKB_DR_SET(reason, PKT_TOO_BIG);
|
|
goto drop;
|
|
}
|
|
|
|
/* We are about to mangle packet. Copy it! */
|
|
if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
|
|
goto drop;
|
|
iph = ip_hdr(skb);
|
|
|
|
/* Decrease ttl after skb cow done */
|
|
ip_decrease_ttl(iph);
|
|
|
|
/*
|
|
* We now generate an ICMP HOST REDIRECT giving the route
|
|
* we calculated.
|
|
*/
|
|
if (IPCB(skb)->flags & IPSKB_DOREDIRECT && !opt->srr &&
|
|
!skb_sec_path(skb))
|
|
ip_rt_send_redirect(skb);
|
|
|
|
if (READ_ONCE(net->ipv4.sysctl_ip_fwd_update_priority))
|
|
skb->priority = rt_tos2priority(iph->tos);
|
|
|
|
return NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
|
|
net, NULL, skb, skb->dev, rt->dst.dev,
|
|
ip_forward_finish);
|
|
|
|
sr_failed:
|
|
/*
|
|
* Strict routing permits no gatewaying
|
|
*/
|
|
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0);
|
|
goto drop;
|
|
|
|
too_many_hops:
|
|
/* Tell the sender its packet died... */
|
|
__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
|
|
icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
|
|
SKB_DR_SET(reason, IP_INHDR);
|
|
drop:
|
|
kfree_skb_reason(skb, reason);
|
|
return NET_RX_DROP;
|
|
}
|