mirror of
https://github.com/torvalds/linux.git
synced 2024-12-13 06:32:50 +00:00
Merge branch 'ipv6-avoid-atomic-fragment-on-gso-output'
Yan Zhai says: ==================== ipv6: avoid atomic fragment on GSO output When the ipv6 stack outputs a GSO packet, if its gso_size is larger than the dst MTU, then all segments would be fragmented. However, it is possible for a GSO packet to have a trailing segment whose actual size is smaller than both gso_size and the MTU, which leads to an "atomic fragment". Atomic fragments are considered harmful in RFC-8021. An existing report from APNIC also shows that atomic fragments are more likely to be dropped even though adding the fragment header is equivalent to a no-op [1]. The series contains the following changes: * drop feature RTAX_FEATURE_ALLFRAG, which has been broken. This helps simplify other changes in this set. * refactor __ip6_finish_output code to separate GSO and non-GSO packet processing, mirroring IPv4 side logic. * avoid generating atomic fragments on GSO packets. Link: https://www.potaroo.net/presentations/2022-03-01-ipv6-frag.pdf [1] V4: https://lore.kernel.org/netdev/cover.1698114636.git.yan@cloudflare.com/ V3: https://lore.kernel.org/netdev/cover.1697779681.git.yan@cloudflare.com/ V2: https://lore.kernel.org/netdev/ZS1%2Fqtr0dZJ35VII@debian.debian/ ==================== Link: https://lore.kernel.org/r/cover.1698156966.git.yan@cloudflare.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
commit
d8c4ef76d7
@ -222,13 +222,6 @@ static inline unsigned long dst_metric_rtt(const struct dst_entry *dst, int metr
|
||||
return msecs_to_jiffies(dst_metric(dst, metric));
|
||||
}
|
||||
|
||||
static inline u32
|
||||
dst_allfrag(const struct dst_entry *dst)
|
||||
{
|
||||
int ret = dst_feature(dst, RTAX_FEATURE_ALLFRAG);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline int
|
||||
dst_metric_locked(const struct dst_entry *dst, int metric)
|
||||
{
|
||||
|
@ -44,7 +44,6 @@ struct inet_connection_sock_af_ops {
|
||||
struct request_sock *req_unhash,
|
||||
bool *own_req);
|
||||
u16 net_header_len;
|
||||
u16 net_frag_header_len;
|
||||
u16 sockaddr_len;
|
||||
int (*setsockopt)(struct sock *sk, int level, int optname,
|
||||
sockptr_t optval, unsigned int optlen);
|
||||
|
@ -244,7 +244,6 @@ struct inet_sock {
|
||||
};
|
||||
|
||||
#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */
|
||||
#define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */
|
||||
|
||||
enum {
|
||||
INET_FLAGS_PKTINFO = 0,
|
||||
|
@ -505,7 +505,7 @@ enum {
|
||||
#define RTAX_FEATURE_ECN (1 << 0)
|
||||
#define RTAX_FEATURE_SACK (1 << 1) /* unused */
|
||||
#define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */
|
||||
#define RTAX_FEATURE_ALLFRAG (1 << 3)
|
||||
#define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */
|
||||
#define RTAX_FEATURE_TCP_USEC_TS (1 << 4)
|
||||
|
||||
#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \
|
||||
|
@ -1698,14 +1698,6 @@ static inline int __tcp_mtu_to_mss(struct sock *sk, int pmtu)
|
||||
*/
|
||||
mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
|
||||
|
||||
/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
|
||||
if (icsk->icsk_af_ops->net_frag_header_len) {
|
||||
const struct dst_entry *dst = __sk_dst_get(sk);
|
||||
|
||||
if (dst && dst_allfrag(dst))
|
||||
mss_now -= icsk->icsk_af_ops->net_frag_header_len;
|
||||
}
|
||||
|
||||
/* Clamp it (mss_clamp does not include tcp options) */
|
||||
if (mss_now > tp->rx_opt.mss_clamp)
|
||||
mss_now = tp->rx_opt.mss_clamp;
|
||||
@ -1733,21 +1725,11 @@ int tcp_mss_to_mtu(struct sock *sk, int mss)
|
||||
{
|
||||
const struct tcp_sock *tp = tcp_sk(sk);
|
||||
const struct inet_connection_sock *icsk = inet_csk(sk);
|
||||
int mtu;
|
||||
|
||||
mtu = mss +
|
||||
return mss +
|
||||
tp->tcp_header_len +
|
||||
icsk->icsk_ext_hdr_len +
|
||||
icsk->icsk_af_ops->net_header_len;
|
||||
|
||||
/* IPv6 adds a frag_hdr in case RTAX_FEATURE_ALLFRAG is set */
|
||||
if (icsk->icsk_af_ops->net_frag_header_len) {
|
||||
const struct dst_entry *dst = __sk_dst_get(sk);
|
||||
|
||||
if (dst && dst_allfrag(dst))
|
||||
mtu += icsk->icsk_af_ops->net_frag_header_len;
|
||||
}
|
||||
return mtu;
|
||||
}
|
||||
EXPORT_SYMBOL(tcp_mss_to_mtu);
|
||||
|
||||
|
@ -164,7 +164,13 @@ ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
|
||||
int err;
|
||||
|
||||
skb_mark_not_on_list(segs);
|
||||
err = ip6_fragment(net, sk, segs, ip6_finish_output2);
|
||||
/* Last GSO segment can be smaller than gso_size (and MTU).
|
||||
* Adding a fragment header would produce an "atomic fragment",
|
||||
* which is considered harmful (RFC-8021). Avoid that.
|
||||
*/
|
||||
err = segs->len > mtu ?
|
||||
ip6_fragment(net, sk, segs, ip6_finish_output2) :
|
||||
ip6_finish_output2(net, sk, segs);
|
||||
if (err && ret == 0)
|
||||
ret = err;
|
||||
}
|
||||
@ -172,6 +178,16 @@ ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int ip6_finish_output_gso(struct net *net, struct sock *sk,
|
||||
struct sk_buff *skb, unsigned int mtu)
|
||||
{
|
||||
if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
|
||||
!skb_gso_validate_network_len(skb, mtu))
|
||||
return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
|
||||
|
||||
return ip6_finish_output2(net, sk, skb);
|
||||
}
|
||||
|
||||
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
|
||||
{
|
||||
unsigned int mtu;
|
||||
@ -185,17 +201,14 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff
|
||||
#endif
|
||||
|
||||
mtu = ip6_skb_dst_mtu(skb);
|
||||
if (skb_is_gso(skb) &&
|
||||
!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
|
||||
!skb_gso_validate_network_len(skb, mtu))
|
||||
return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
|
||||
if (skb_is_gso(skb))
|
||||
return ip6_finish_output_gso(net, sk, skb, mtu);
|
||||
|
||||
if ((skb->len > mtu && !skb_is_gso(skb)) ||
|
||||
dst_allfrag(skb_dst(skb)) ||
|
||||
if (skb->len > mtu ||
|
||||
(IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
|
||||
return ip6_fragment(net, sk, skb, ip6_finish_output2);
|
||||
else
|
||||
return ip6_finish_output2(net, sk, skb);
|
||||
|
||||
return ip6_finish_output2(net, sk, skb);
|
||||
}
|
||||
|
||||
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
|
||||
@ -1017,9 +1030,6 @@ slow_path:
|
||||
return err;
|
||||
|
||||
fail_toobig:
|
||||
if (skb->sk && dst_allfrag(skb_dst(skb)))
|
||||
sk_gso_disable(skb->sk);
|
||||
|
||||
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
|
||||
err = -EMSGSIZE;
|
||||
|
||||
@ -1384,10 +1394,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
|
||||
cork->base.mark = ipc6->sockc.mark;
|
||||
sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
|
||||
|
||||
if (dst_allfrag(xfrm_dst_path(&rt->dst)))
|
||||
cork->base.flags |= IPCORK_ALLFRAG;
|
||||
cork->base.length = 0;
|
||||
|
||||
cork->base.transmit_time = ipc6->sockc.transmit_time;
|
||||
|
||||
return 0;
|
||||
@ -1444,8 +1451,6 @@ static int __ip6_append_data(struct sock *sk,
|
||||
|
||||
headersize = sizeof(struct ipv6hdr) +
|
||||
(opt ? opt->opt_flen + opt->opt_nflen : 0) +
|
||||
(dst_allfrag(&rt->dst) ?
|
||||
sizeof(struct frag_hdr) : 0) +
|
||||
rt->rt6i_nfheader_len;
|
||||
|
||||
if (mtu <= fragheaderlen ||
|
||||
@ -1555,7 +1560,7 @@ emsgsize:
|
||||
|
||||
while (length > 0) {
|
||||
/* Check if the remaining data fits into current packet. */
|
||||
copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
|
||||
copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
|
||||
if (copy < length)
|
||||
copy = maxfraglen - skb->len;
|
||||
|
||||
@ -1586,7 +1591,7 @@ alloc_new_skb:
|
||||
*/
|
||||
datalen = length + fraggap;
|
||||
|
||||
if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
|
||||
if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
|
||||
datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
|
||||
fraglen = datalen + fragheaderlen;
|
||||
pagedlen = 0;
|
||||
@ -1835,7 +1840,6 @@ static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
|
||||
struct dst_entry *dst = cork->base.dst;
|
||||
|
||||
cork->base.dst = NULL;
|
||||
cork->base.flags &= ~IPCORK_ALLFRAG;
|
||||
skb_dst_set(skb, dst);
|
||||
}
|
||||
|
||||
@ -1856,7 +1860,6 @@ static void ip6_cork_release(struct inet_cork_full *cork,
|
||||
if (cork->base.dst) {
|
||||
dst_release(cork->base.dst);
|
||||
cork->base.dst = NULL;
|
||||
cork->base.flags &= ~IPCORK_ALLFRAG;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1895,7 +1895,6 @@ const struct inet_connection_sock_af_ops ipv6_specific = {
|
||||
.conn_request = tcp_v6_conn_request,
|
||||
.syn_recv_sock = tcp_v6_syn_recv_sock,
|
||||
.net_header_len = sizeof(struct ipv6hdr),
|
||||
.net_frag_header_len = sizeof(struct frag_hdr),
|
||||
.setsockopt = ipv6_setsockopt,
|
||||
.getsockopt = ipv6_getsockopt,
|
||||
.addr2sockaddr = inet6_csk_addr2sockaddr,
|
||||
|
@ -95,7 +95,7 @@ static int __xfrm6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
|
||||
return -EMSGSIZE;
|
||||
}
|
||||
|
||||
if (toobig || dst_allfrag(skb_dst(skb)))
|
||||
if (toobig)
|
||||
return ip6_fragment(net, sk, skb,
|
||||
__xfrm6_output_finish);
|
||||
|
||||
|
@ -2051,7 +2051,6 @@ void __init mptcp_subflow_init(void)
|
||||
subflow_v6m_specific.send_check = ipv4_specific.send_check;
|
||||
subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
|
||||
subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
|
||||
subflow_v6m_specific.net_frag_header_len = 0;
|
||||
subflow_v6m_specific.rebuild_header = subflow_rebuild_header;
|
||||
|
||||
tcpv6_prot_override = tcpv6_prot;
|
||||
|
Loading…
Reference in New Issue
Block a user