From e9faf53c5a5d01f6f2a09ae28ec63a3bbd6f64fd Mon Sep 17 00:00:00 2001 From: Dongliang Mu <mudongliangabcd@gmail.com> Date: Mon, 5 Jul 2021 21:13:20 +0800 Subject: [PATCH 01/87] ieee802154: hwsim: fix GPF in hwsim_set_edge_lqi Both MAC802154_HWSIM_ATTR_RADIO_ID and MAC802154_HWSIM_ATTR_RADIO_EDGE, MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID and MAC802154_HWSIM_EDGE_ATTR_LQI must be present to fix GPF. Fixes: f25da51fdc38 ("ieee802154: hwsim: add replacement for fakelb") Signed-off-by: Dongliang Mu <mudongliangabcd@gmail.com> Acked-by: Alexander Aring <aahringo@redhat.com> Link: https://lore.kernel.org/r/20210705131321.217111-1-mudongliangabcd@gmail.com Signed-off-by: Stefan Schmidt <stefan@datenfreihafen.org> --- drivers/net/ieee802154/mac802154_hwsim.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ieee802154/mac802154_hwsim.c b/drivers/net/ieee802154/mac802154_hwsim.c index ebc976b7fcc2..cae52bfb871e 100644 --- a/drivers/net/ieee802154/mac802154_hwsim.c +++ b/drivers/net/ieee802154/mac802154_hwsim.c @@ -528,14 +528,14 @@ static int hwsim_set_edge_lqi(struct sk_buff *msg, struct genl_info *info) u32 v0, v1; u8 lqi; - if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID] && + if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID] || !info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE]) return -EINVAL; if (nla_parse_nested_deprecated(edge_attrs, MAC802154_HWSIM_EDGE_ATTR_MAX, info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE], hwsim_edge_policy, NULL)) return -EINVAL; - if (!edge_attrs[MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID] && + if (!edge_attrs[MAC802154_HWSIM_EDGE_ATTR_ENDPOINT_ID] || !edge_attrs[MAC802154_HWSIM_EDGE_ATTR_LQI]) return -EINVAL; From 889d0e7dc68314a273627d89cbb60c09e1cc1c25 Mon Sep 17 00:00:00 2001 From: Dongliang Mu <mudongliangabcd@gmail.com> Date: Wed, 7 Jul 2021 23:56:32 +0800 Subject: [PATCH 02/87] ieee802154: hwsim: fix GPF in hwsim_new_edge_nl Both MAC802154_HWSIM_ATTR_RADIO_ID and MAC802154_HWSIM_ATTR_RADIO_EDGE must be present to fix GPF. Fixes: f25da51fdc38 ("ieee802154: hwsim: add replacement for fakelb") Signed-off-by: Dongliang Mu <mudongliangabcd@gmail.com> Acked-by: Alexander Aring <aahringo@redhat.com> Link: https://lore.kernel.org/r/20210707155633.1486603-1-mudongliangabcd@gmail.com Signed-off-by: Stefan Schmidt <stefan@datenfreihafen.org> --- drivers/net/ieee802154/mac802154_hwsim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ieee802154/mac802154_hwsim.c b/drivers/net/ieee802154/mac802154_hwsim.c index cae52bfb871e..8caa61ec718f 100644 --- a/drivers/net/ieee802154/mac802154_hwsim.c +++ b/drivers/net/ieee802154/mac802154_hwsim.c @@ -418,7 +418,7 @@ static int hwsim_new_edge_nl(struct sk_buff *msg, struct genl_info *info) struct hwsim_edge *e; u32 v0, v1; - if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID] && + if (!info->attrs[MAC802154_HWSIM_ATTR_RADIO_ID] || !info->attrs[MAC802154_HWSIM_ATTR_RADIO_EDGE]) return -EINVAL; From 5f7b51bf09baca8e4f80cbe879536842bafb5f31 Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik <kadlec@netfilter.org> Date: Wed, 28 Jul 2021 17:01:15 +0200 Subject: [PATCH 03/87] netfilter: ipset: Limit the maximal range of consecutive elements to add/delete The range size of consecutive elements were not limited. Thus one could define a huge range which may result soft lockup errors due to the long execution time. Now the range size is limited to 2^20 entries. Reported-by: Brad Spengler <spender@grsecurity.net> Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- include/linux/netfilter/ipset/ip_set.h | 3 +++ net/netfilter/ipset/ip_set_hash_ip.c | 9 ++++++++- net/netfilter/ipset/ip_set_hash_ipmark.c | 10 +++++++++- net/netfilter/ipset/ip_set_hash_ipport.c | 3 +++ net/netfilter/ipset/ip_set_hash_ipportip.c | 3 +++ net/netfilter/ipset/ip_set_hash_ipportnet.c | 3 +++ net/netfilter/ipset/ip_set_hash_net.c | 11 ++++++++++- net/netfilter/ipset/ip_set_hash_netiface.c | 10 +++++++++- net/netfilter/ipset/ip_set_hash_netnet.c | 16 +++++++++++++++- net/netfilter/ipset/ip_set_hash_netport.c | 11 ++++++++++- net/netfilter/ipset/ip_set_hash_netportnet.c | 16 +++++++++++++++- 11 files changed, 88 insertions(+), 7 deletions(-) diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index 10279c4830ac..ada1296c87d5 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -196,6 +196,9 @@ struct ip_set_region { u32 elements; /* Number of elements vs timeout */ }; +/* Max range where every element is added/deleted in one step */ +#define IPSET_MAX_RANGE (1<<20) + /* The max revision number supported by any set type + 1 */ #define IPSET_REVISION_MAX 9 diff --git a/net/netfilter/ipset/ip_set_hash_ip.c b/net/netfilter/ipset/ip_set_hash_ip.c index d1bef23fd4f5..dd30c03d5a23 100644 --- a/net/netfilter/ipset/ip_set_hash_ip.c +++ b/net/netfilter/ipset/ip_set_hash_ip.c @@ -132,8 +132,11 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; - if (ip > ip_to) + if (ip > ip_to) { + if (ip_to == 0) + return -IPSET_ERR_HASH_ELEM; swap(ip, ip_to); + } } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); @@ -144,6 +147,10 @@ hash_ip4_uadt(struct ip_set *set, struct nlattr *tb[], hosts = h->netmask == 32 ? 1 : 2 << (32 - h->netmask - 1); + /* 64bit division is not allowed on 32bit */ + if (((u64)ip_to - ip + 1) >> (32 - h->netmask) > IPSET_MAX_RANGE) + return -ERANGE; + if (retried) { ip = ntohl(h->next.ip); e.ip = htonl(ip); diff --git a/net/netfilter/ipset/ip_set_hash_ipmark.c b/net/netfilter/ipset/ip_set_hash_ipmark.c index 18346d18aa16..153de3457423 100644 --- a/net/netfilter/ipset/ip_set_hash_ipmark.c +++ b/net/netfilter/ipset/ip_set_hash_ipmark.c @@ -121,6 +121,8 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], e.mark = ntohl(nla_get_be32(tb[IPSET_ATTR_MARK])); e.mark &= h->markmask; + if (e.mark == 0 && e.ip == 0) + return -IPSET_ERR_HASH_ELEM; if (adt == IPSET_TEST || !(tb[IPSET_ATTR_IP_TO] || tb[IPSET_ATTR_CIDR])) { @@ -133,8 +135,11 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP_TO], &ip_to); if (ret) return ret; - if (ip > ip_to) + if (ip > ip_to) { + if (e.mark == 0 && ip_to == 0) + return -IPSET_ERR_HASH_ELEM; swap(ip, ip_to); + } } else if (tb[IPSET_ATTR_CIDR]) { u8 cidr = nla_get_u8(tb[IPSET_ATTR_CIDR]); @@ -143,6 +148,9 @@ hash_ipmark4_uadt(struct ip_set *set, struct nlattr *tb[], ip_set_mask_from_to(ip, ip_to, cidr); } + if (((u64)ip_to - ip + 1) > IPSET_MAX_RANGE) + return -ERANGE; + if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { diff --git a/net/netfilter/ipset/ip_set_hash_ipport.c b/net/netfilter/ipset/ip_set_hash_ipport.c index e1ca11196515..7303138e46be 100644 --- a/net/netfilter/ipset/ip_set_hash_ipport.c +++ b/net/netfilter/ipset/ip_set_hash_ipport.c @@ -173,6 +173,9 @@ hash_ipport4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } + if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) + return -ERANGE; + if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportip.c b/net/netfilter/ipset/ip_set_hash_ipportip.c index ab179e064597..334fb1ad0e86 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportip.c +++ b/net/netfilter/ipset/ip_set_hash_ipportip.c @@ -180,6 +180,9 @@ hash_ipportip4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } + if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) + return -ERANGE; + if (retried) ip = ntohl(h->next.ip); for (; ip <= ip_to; ip++) { diff --git a/net/netfilter/ipset/ip_set_hash_ipportnet.c b/net/netfilter/ipset/ip_set_hash_ipportnet.c index 8f075b44cf64..7df94f437f60 100644 --- a/net/netfilter/ipset/ip_set_hash_ipportnet.c +++ b/net/netfilter/ipset/ip_set_hash_ipportnet.c @@ -253,6 +253,9 @@ hash_ipportnet4_uadt(struct ip_set *set, struct nlattr *tb[], swap(port, port_to); } + if (((u64)ip_to - ip + 1)*(port_to - port + 1) > IPSET_MAX_RANGE) + return -ERANGE; + ip2_to = ip2_from; if (tb[IPSET_ATTR_IP2_TO]) { ret = ip_set_get_hostipaddr4(tb[IPSET_ATTR_IP2_TO], &ip2_to); diff --git a/net/netfilter/ipset/ip_set_hash_net.c b/net/netfilter/ipset/ip_set_hash_net.c index c1a11f041ac6..1422739d9aa2 100644 --- a/net/netfilter/ipset/ip_set_hash_net.c +++ b/net/netfilter/ipset/ip_set_hash_net.c @@ -140,7 +140,7 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_net4_elem e = { .cidr = HOST_MASK }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0; + u32 ip = 0, ip_to = 0, ipn, n = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -188,6 +188,15 @@ hash_net4_uadt(struct ip_set *set, struct nlattr *tb[], if (ip + UINT_MAX == ip_to) return -IPSET_ERR_HASH_RANGE; } + ipn = ip; + do { + ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr); + n++; + } while (ipn++ < ip_to); + + if (n > IPSET_MAX_RANGE) + return -ERANGE; + if (retried) ip = ntohl(h->next.ip); do { diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index ddd51c2e1cb3..9810f5bf63f5 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -202,7 +202,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netiface4_elem e = { .cidr = HOST_MASK, .elem = 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 ip = 0, ip_to = 0; + u32 ip = 0, ip_to = 0, ipn, n = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -256,6 +256,14 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip, ip_to, e.cidr); } + ipn = ip; + do { + ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr); + n++; + } while (ipn++ < ip_to); + + if (n > IPSET_MAX_RANGE) + return -ERANGE; if (retried) ip = ntohl(h->next.ip); diff --git a/net/netfilter/ipset/ip_set_hash_netnet.c b/net/netfilter/ipset/ip_set_hash_netnet.c index 6532f0505e66..3d09eefe998a 100644 --- a/net/netfilter/ipset/ip_set_hash_netnet.c +++ b/net/netfilter/ipset/ip_set_hash_netnet.c @@ -168,7 +168,8 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_netnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0; - u32 ip2 = 0, ip2_from = 0, ip2_to = 0; + u32 ip2 = 0, ip2_from = 0, ip2_to = 0, ipn; + u64 n = 0, m = 0; int ret; if (tb[IPSET_ATTR_LINENO]) @@ -244,6 +245,19 @@ hash_netnet4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); } + ipn = ip; + do { + ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr[0]); + n++; + } while (ipn++ < ip_to); + ipn = ip2_from; + do { + ipn = ip_set_range_to_cidr(ipn, ip2_to, &e.cidr[1]); + m++; + } while (ipn++ < ip2_to); + + if (n*m > IPSET_MAX_RANGE) + return -ERANGE; if (retried) { ip = ntohl(h->next.ip[0]); diff --git a/net/netfilter/ipset/ip_set_hash_netport.c b/net/netfilter/ipset/ip_set_hash_netport.c index ec1564a1cb5a..09cf72eb37f8 100644 --- a/net/netfilter/ipset/ip_set_hash_netport.c +++ b/net/netfilter/ipset/ip_set_hash_netport.c @@ -158,7 +158,8 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], ipset_adtfn adtfn = set->variant->adt[adt]; struct hash_netport4_elem e = { .cidr = HOST_MASK - 1 }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); - u32 port, port_to, p = 0, ip = 0, ip_to = 0; + u32 port, port_to, p = 0, ip = 0, ip_to = 0, ipn; + u64 n = 0; bool with_ports = false; u8 cidr; int ret; @@ -235,6 +236,14 @@ hash_netport4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip, ip_to, e.cidr + 1); } + ipn = ip; + do { + ipn = ip_set_range_to_cidr(ipn, ip_to, &cidr); + n++; + } while (ipn++ < ip_to); + + if (n*(port_to - port + 1) > IPSET_MAX_RANGE) + return -ERANGE; if (retried) { ip = ntohl(h->next.ip); diff --git a/net/netfilter/ipset/ip_set_hash_netportnet.c b/net/netfilter/ipset/ip_set_hash_netportnet.c index 0e91d1e82f1c..19bcdb3141f6 100644 --- a/net/netfilter/ipset/ip_set_hash_netportnet.c +++ b/net/netfilter/ipset/ip_set_hash_netportnet.c @@ -182,7 +182,8 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], struct hash_netportnet4_elem e = { }; struct ip_set_ext ext = IP_SET_INIT_UEXT(set); u32 ip = 0, ip_to = 0, p = 0, port, port_to; - u32 ip2_from = 0, ip2_to = 0, ip2; + u32 ip2_from = 0, ip2_to = 0, ip2, ipn; + u64 n = 0, m = 0; bool with_ports = false; int ret; @@ -284,6 +285,19 @@ hash_netportnet4_uadt(struct ip_set *set, struct nlattr *tb[], } else { ip_set_mask_from_to(ip2_from, ip2_to, e.cidr[1]); } + ipn = ip; + do { + ipn = ip_set_range_to_cidr(ipn, ip_to, &e.cidr[0]); + n++; + } while (ipn++ < ip_to); + ipn = ip2_from; + do { + ipn = ip_set_range_to_cidr(ipn, ip2_to, &e.cidr[1]); + m++; + } while (ipn++ < ip2_to); + + if (n*m*(port_to - port + 1) > IPSET_MAX_RANGE) + return -ERANGE; if (retried) { ip = ntohl(h->next.ip[0]); From 38ea9def5b62f9193f6bad96c5d108e2830ecbde Mon Sep 17 00:00:00 2001 From: Yajun Deng <yajun.deng@linux.dev> Date: Thu, 29 Jul 2021 16:20:21 +0800 Subject: [PATCH 04/87] netfilter: nf_conntrack_bridge: Fix memory leak when error It should be added kfree_skb_list() when err is not equal to zero in nf_br_ip_fragment(). v2: keep this aligned with IPv6. v3: modify iter.frag_list to iter.frag. Fixes: 3c171f496ef5 ("netfilter: bridge: add connection tracking system") Signed-off-by: Yajun Deng <yajun.deng@linux.dev> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/bridge/netfilter/nf_conntrack_bridge.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 8d033a75a766..fdbed3158555 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -88,6 +88,12 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, skb = ip_fraglist_next(&iter); } + + if (!err) + return 0; + + kfree_skb_list(iter.frag); + return err; } slow_path: From aff51c5da3208bd164381e1488998667269c6cf4 Mon Sep 17 00:00:00 2001 From: DENG Qingfang <dqfext@gmail.com> Date: Fri, 6 Aug 2021 12:05:27 +0800 Subject: [PATCH 05/87] net: dsa: mt7530: add the missing RxUnicast MIB counter Add the missing RxUnicast counter. Fixes: b8f126a8d543 ("net-next: dsa: add dsa support for Mediatek MT7530 switch") Signed-off-by: DENG Qingfang <dqfext@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/dsa/mt7530.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index 69f21b71614c..632f0fcc5aa7 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -47,6 +47,7 @@ static const struct mt7530_mib_desc mt7530_mib[] = { MIB_DESC(2, 0x48, "TxBytes"), MIB_DESC(1, 0x60, "RxDrop"), MIB_DESC(1, 0x64, "RxFiltering"), + MIB_DESC(1, 0x68, "RxUnicast"), MIB_DESC(1, 0x6c, "RxMulticast"), MIB_DESC(1, 0x70, "RxBroadcast"), MIB_DESC(1, 0x74, "RxAlignErr"), From 704e624f7b3e8a4fc1ce43fb564746d1d07b20c0 Mon Sep 17 00:00:00 2001 From: John Hubbard <jhubbard@nvidia.com> Date: Thu, 5 Aug 2021 23:53:30 -0700 Subject: [PATCH 06/87] net: mvvp2: fix short frame size on s390 On s390, the following build warning occurs: drivers/net/ethernet/marvell/mvpp2/mvpp2.h:844:2: warning: overflow in conversion from 'long unsigned int' to 'int' changes value from '18446744073709551584' to '-32' [-Woverflow] 844 | ((total_size) - MVPP2_SKB_HEADROOM - MVPP2_SKB_SHINFO_SIZE) This happens because MVPP2_SKB_SHINFO_SIZE, which is 320 bytes (which is already 64-byte aligned) on some architectures, actually gets ALIGN'd up to 512 bytes in the s390 case. So then, when this is invoked: MVPP2_RX_MAX_PKT_SIZE(MVPP2_BM_SHORT_FRAME_SIZE) ...that turns into: 704 - 224 - 512 == -32 ...which is not a good frame size to end up with! The warning above is a bit lucky: it notices a signed/unsigned bad behavior here, which leads to the real problem of a frame that is too short for its contents. Increase MVPP2_BM_SHORT_FRAME_SIZE by 32 (from 704 to 736), which is just exactly big enough. (The other values can't readily be changed without causing a lot of other problems.) Fixes: 07dd0a7aae7f ("mvpp2: add basic XDP support") Cc: Sven Auhagen <sven.auhagen@voleatech.de> Cc: Matteo Croce <mcroce@microsoft.com> Cc: David S. Miller <davem@davemloft.net> Signed-off-by: John Hubbard <jhubbard@nvidia.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/ethernet/marvell/mvpp2/mvpp2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h index b9fbc9f000f2..cf8acabb90ac 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2.h +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2.h @@ -938,7 +938,7 @@ enum mvpp22_ptp_packet_format { #define MVPP2_BM_COOKIE_POOL_OFFS 8 #define MVPP2_BM_COOKIE_CPU_OFFS 24 -#define MVPP2_BM_SHORT_FRAME_SIZE 704 /* frame size 128 */ +#define MVPP2_BM_SHORT_FRAME_SIZE 736 /* frame size 128 */ #define MVPP2_BM_LONG_FRAME_SIZE 2240 /* frame size 1664 */ #define MVPP2_BM_JUMBO_FRAME_SIZE 10432 /* frame size 9856 */ /* BM short pool packet size From 4608fdfc07e116f9fc0895beb40abad7cdb5ee3d Mon Sep 17 00:00:00 2001 From: Florian Westphal <fw@strlen.de> Date: Tue, 27 Jul 2021 00:29:19 +0200 Subject: [PATCH 07/87] netfilter: conntrack: collect all entries in one cycle Michal Kubecek reports that conntrack gc is responsible for frequent wakeups (every 125ms) on idle systems. On busy systems, timed out entries are evicted during lookup. The gc worker is only needed to remove entries after system becomes idle after a busy period. To resolve this, always scan the entire table. If the scan is taking too long, reschedule so other work_structs can run and resume from next bucket. After a completed scan, wait for 2 minutes before the next cycle. Heuristics for faster re-schedule are removed. GC_SCAN_INTERVAL could be exposed as a sysctl in the future to allow tuning this as-needed or even turn the gc worker off. Reported-by: Michal Kubecek <mkubecek@suse.cz> Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nf_conntrack_core.c | 71 ++++++++++--------------------- 1 file changed, 22 insertions(+), 49 deletions(-) diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 5c03e5106751..d31dbccbe7bd 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -66,22 +66,17 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash); struct conntrack_gc_work { struct delayed_work dwork; - u32 last_bucket; + u32 next_bucket; bool exiting; bool early_drop; - long next_gc_run; }; static __read_mostly struct kmem_cache *nf_conntrack_cachep; static DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; -/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */ -#define GC_MAX_BUCKETS_DIV 128u -/* upper bound of full table scan */ -#define GC_MAX_SCAN_JIFFIES (16u * HZ) -/* desired ratio of entries found to be expired */ -#define GC_EVICT_RATIO 50u +#define GC_SCAN_INTERVAL (120u * HZ) +#define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) static struct conntrack_gc_work conntrack_gc_work; @@ -1363,17 +1358,13 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct) static void gc_worker(struct work_struct *work) { - unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u); - unsigned int i, goal, buckets = 0, expired_count = 0; - unsigned int nf_conntrack_max95 = 0; + unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION; + unsigned int i, hashsz, nf_conntrack_max95 = 0; + unsigned long next_run = GC_SCAN_INTERVAL; struct conntrack_gc_work *gc_work; - unsigned int ratio, scanned = 0; - unsigned long next_run; - gc_work = container_of(work, struct conntrack_gc_work, dwork.work); - goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV; - i = gc_work->last_bucket; + i = gc_work->next_bucket; if (gc_work->early_drop) nf_conntrack_max95 = nf_conntrack_max / 100u * 95u; @@ -1381,15 +1372,15 @@ static void gc_worker(struct work_struct *work) struct nf_conntrack_tuple_hash *h; struct hlist_nulls_head *ct_hash; struct hlist_nulls_node *n; - unsigned int hashsz; struct nf_conn *tmp; - i++; rcu_read_lock(); nf_conntrack_get_ht(&ct_hash, &hashsz); - if (i >= hashsz) - i = 0; + if (i >= hashsz) { + rcu_read_unlock(); + break; + } hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) { struct nf_conntrack_net *cnet; @@ -1397,7 +1388,6 @@ static void gc_worker(struct work_struct *work) tmp = nf_ct_tuplehash_to_ctrack(h); - scanned++; if (test_bit(IPS_OFFLOAD_BIT, &tmp->status)) { nf_ct_offload_timeout(tmp); continue; @@ -1405,7 +1395,6 @@ static void gc_worker(struct work_struct *work) if (nf_ct_is_expired(tmp)) { nf_ct_gc_expired(tmp); - expired_count++; continue; } @@ -1438,7 +1427,14 @@ static void gc_worker(struct work_struct *work) */ rcu_read_unlock(); cond_resched(); - } while (++buckets < goal); + i++; + + if (time_after(jiffies, end_time) && i < hashsz) { + gc_work->next_bucket = i; + next_run = 0; + break; + } + } while (i < hashsz); if (gc_work->exiting) return; @@ -1449,40 +1445,17 @@ static void gc_worker(struct work_struct *work) * * This worker is only here to reap expired entries when system went * idle after a busy period. - * - * The heuristics below are supposed to balance conflicting goals: - * - * 1. Minimize time until we notice a stale entry - * 2. Maximize scan intervals to not waste cycles - * - * Normally, expire ratio will be close to 0. - * - * As soon as a sizeable fraction of the entries have expired - * increase scan frequency. */ - ratio = scanned ? expired_count * 100 / scanned : 0; - if (ratio > GC_EVICT_RATIO) { - gc_work->next_gc_run = min_interval; - } else { - unsigned int max = GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV; - - BUILD_BUG_ON((GC_MAX_SCAN_JIFFIES / GC_MAX_BUCKETS_DIV) == 0); - - gc_work->next_gc_run += min_interval; - if (gc_work->next_gc_run > max) - gc_work->next_gc_run = max; + if (next_run) { + gc_work->early_drop = false; + gc_work->next_bucket = 0; } - - next_run = gc_work->next_gc_run; - gc_work->last_bucket = i; - gc_work->early_drop = false; queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run); } static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) { INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker); - gc_work->next_gc_run = HZ; gc_work->exiting = false; } From 61e0c2bc555a194ada2632fadac73f2bdb5df9cb Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso <pablo@netfilter.org> Date: Tue, 3 Aug 2021 00:15:53 +0200 Subject: [PATCH 08/87] netfilter: nfnetlink_hook: strip off module name from hookfn NFNLA_HOOK_FUNCTION_NAME should include the hook function name only, the module name is already provided by NFNLA_HOOK_MODULE_NAME. Fixes: e2cf17d3774c ("netfilter: add new hook nfnl subsystem") Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nfnetlink_hook.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index 202f57d17bab..ca453c61dbdf 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -135,6 +135,7 @@ static int nfnl_hook_dump_one(struct sk_buff *nlskb, if (module_name) { char *end; + *module_name = '\0'; module_name += 2; end = strchr(module_name, ']'); if (end) { From a6e57c4af12bbacf927d7321c3aa894948653688 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso <pablo@netfilter.org> Date: Tue, 3 Aug 2021 00:15:54 +0200 Subject: [PATCH 09/87] netfilter: nfnetlink_hook: missing chain family The family is relevant for pseudo-families like NFPROTO_INET otherwise the user needs to rely on the hook function name to differentiate it from NFPROTO_IPV4 and NFPROTO_IPV6 names. Add nfnl_hook_chain_desc_attributes instead of using the existing NFTA_CHAIN_* attributes, since these do not provide a family number. Fixes: e2cf17d3774c ("netfilter: add new hook nfnl subsystem") Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- include/uapi/linux/netfilter/nfnetlink_hook.h | 9 +++++++++ net/netfilter/nfnetlink_hook.c | 8 ++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/netfilter/nfnetlink_hook.h b/include/uapi/linux/netfilter/nfnetlink_hook.h index 912ec60b26b0..bbcd285b22e1 100644 --- a/include/uapi/linux/netfilter/nfnetlink_hook.h +++ b/include/uapi/linux/netfilter/nfnetlink_hook.h @@ -43,6 +43,15 @@ enum nfnl_hook_chain_info_attributes { }; #define NFNLA_HOOK_INFO_MAX (__NFNLA_HOOK_INFO_MAX - 1) +enum nfnl_hook_chain_desc_attributes { + NFNLA_CHAIN_UNSPEC, + NFNLA_CHAIN_TABLE, + NFNLA_CHAIN_FAMILY, + NFNLA_CHAIN_NAME, + __NFNLA_CHAIN_MAX, +}; +#define NFNLA_CHAIN_MAX (__NFNLA_CHAIN_MAX - 1) + /** * enum nfnl_hook_chaintype - chain type * diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index ca453c61dbdf..e0ff2973fd14 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -89,11 +89,15 @@ static int nfnl_hook_put_nft_chain_info(struct sk_buff *nlskb, if (!nest2) goto cancel_nest; - ret = nla_put_string(nlskb, NFTA_CHAIN_TABLE, chain->table->name); + ret = nla_put_string(nlskb, NFNLA_CHAIN_TABLE, chain->table->name); if (ret) goto cancel_nest; - ret = nla_put_string(nlskb, NFTA_CHAIN_NAME, chain->name); + ret = nla_put_string(nlskb, NFNLA_CHAIN_NAME, chain->name); + if (ret) + goto cancel_nest; + + ret = nla_put_u8(nlskb, NFNLA_CHAIN_FAMILY, chain->table->family); if (ret) goto cancel_nest; From 3d9bbaf6c5416bfc50f014ce5879c8c440aaa511 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso <pablo@netfilter.org> Date: Wed, 4 Aug 2021 01:27:19 +0200 Subject: [PATCH 10/87] netfilter: nfnetlink_hook: use the sequence number of the request message The sequence number allows to correlate the netlink reply message (as part of the dump) with the original request message. The cb->seq field is internally used to detect an interference (update) of the hook list during the netlink dump, do not use it as sequence number in the netlink dump header. Fixes: e2cf17d3774c ("netfilter: add new hook nfnl subsystem") Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nfnetlink_hook.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index e0ff2973fd14..7b0d4a317457 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -264,7 +264,8 @@ static int nfnl_hook_dump(struct sk_buff *nlskb, ops = nf_hook_entries_get_hook_ops(e); for (; i < e->num_hook_entries; i++) { - err = nfnl_hook_dump_one(nlskb, ctx, ops[i], cb->seq); + err = nfnl_hook_dump_one(nlskb, ctx, ops[i], + cb->nlh->nlmsg_seq); if (err) break; } From 69311e7c997451dd40942b6b27b522cc3b659cef Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso <pablo@netfilter.org> Date: Wed, 4 Aug 2021 01:27:20 +0200 Subject: [PATCH 11/87] netfilter: nfnetlink_hook: Use same family as request message Use the same family as the request message, for consistency. The netlink payload provides sufficient information to describe the hook object, including the family. This makes it easier to userspace to correlate the hooks are that visited by the packets for a certain family. Fixes: e2cf17d3774c ("netfilter: add new hook nfnl subsystem") Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nfnetlink_hook.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index 7b0d4a317457..32eea785ae25 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -113,7 +113,7 @@ cancel_nest: static int nfnl_hook_dump_one(struct sk_buff *nlskb, const struct nfnl_dump_hook_data *ctx, const struct nf_hook_ops *ops, - unsigned int seq) + int family, unsigned int seq) { u16 event = nfnl_msg_type(NFNL_SUBSYS_HOOK, NFNL_MSG_HOOK_GET); unsigned int portid = NETLINK_CB(nlskb).portid; @@ -124,7 +124,7 @@ static int nfnl_hook_dump_one(struct sk_buff *nlskb, char *module_name; #endif nlh = nfnl_msg_put(nlskb, portid, seq, event, - NLM_F_MULTI, ops->pf, NFNETLINK_V0, 0); + NLM_F_MULTI, family, NFNETLINK_V0, 0); if (!nlh) goto nla_put_failure; @@ -264,7 +264,7 @@ static int nfnl_hook_dump(struct sk_buff *nlskb, ops = nf_hook_entries_get_hook_ops(e); for (; i < e->num_hook_entries; i++) { - err = nfnl_hook_dump_one(nlskb, ctx, ops[i], + err = nfnl_hook_dump_one(nlskb, ctx, ops[i], family, cb->nlh->nlmsg_seq); if (err) break; From 4592ee7f525c4683ec9e290381601fdee50ae110 Mon Sep 17 00:00:00 2001 From: Florian Westphal <fw@strlen.de> Date: Wed, 4 Aug 2021 15:02:15 +0200 Subject: [PATCH 12/87] netfilter: conntrack: remove offload_pickup sysctl again These two sysctls were added because the hardcoded defaults (2 minutes, tcp, 30 seconds, udp) turned out to be too low for some setups. They appeared in 5.14-rc1 so it should be fine to remove it again. Marcelo convinced me that there should be no difference between a flow that was offloaded vs. a flow that was not wrt. timeout handling. Thus the default is changed to those for TCP established and UDP stream, 5 days and 120 seconds, respectively. Marcelo also suggested to account for the timeout value used for the offloading, this avoids increase beyond the value in the conntrack-sysctl and will also instantly expire the conntrack entry with altered sysctls. Example: nf_conntrack_udp_timeout_stream=60 nf_flowtable_udp_timeout=60 This will remove offloaded udp flows after one minute, rather than two. An earlier version of this patch also cleared the ASSURED bit to allow nf_conntrack to evict the entry via early_drop (i.e., table full). However, it looks like we can safely assume that connection timed out via HW is still in established state, so this isn't needed. Quoting Oz: [..] the hardware sends all packets with a set FIN flags to sw. [..] Connections that are aged in hardware are expected to be in the established state. In case it turns out that back-to-sw-path transition can occur for 'dodgy' connections too (e.g., one side disappeared while software-path would have been in RETRANS timeout), we can adjust this later. Cc: Oz Shlomo <ozsh@nvidia.com> Cc: Paul Blakey <paulb@nvidia.com> Suggested-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Signed-off-by: Florian Westphal <fw@strlen.de> Reviewed-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com> Reviewed-by: Oz Shlomo <ozsh@nvidia.com> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- Documentation/networking/nf_conntrack-sysctl.rst | 10 ---------- include/net/netns/conntrack.h | 2 -- net/netfilter/nf_conntrack_proto_tcp.c | 1 - net/netfilter/nf_conntrack_proto_udp.c | 1 - net/netfilter/nf_conntrack_standalone.c | 16 ---------------- net/netfilter/nf_flow_table_core.c | 11 ++++++++--- 6 files changed, 8 insertions(+), 33 deletions(-) diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst index d31ed6c1cb0d..024d784157c8 100644 --- a/Documentation/networking/nf_conntrack-sysctl.rst +++ b/Documentation/networking/nf_conntrack-sysctl.rst @@ -191,19 +191,9 @@ nf_flowtable_tcp_timeout - INTEGER (seconds) TCP connections may be offloaded from nf conntrack to nf flow table. Once aged, the connection is returned to nf conntrack with tcp pickup timeout. -nf_flowtable_tcp_pickup - INTEGER (seconds) - default 120 - - TCP connection timeout after being aged from nf flow table offload. - nf_flowtable_udp_timeout - INTEGER (seconds) default 30 Control offload timeout for udp connections. UDP connections may be offloaded from nf conntrack to nf flow table. Once aged, the connection is returned to nf conntrack with udp pickup timeout. - -nf_flowtable_udp_pickup - INTEGER (seconds) - default 30 - - UDP connection timeout after being aged from nf flow table offload. diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h index 37e5300c7e5a..fefd38db95b3 100644 --- a/include/net/netns/conntrack.h +++ b/include/net/netns/conntrack.h @@ -30,7 +30,6 @@ struct nf_tcp_net { u8 tcp_ignore_invalid_rst; #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) unsigned int offload_timeout; - unsigned int offload_pickup; #endif }; @@ -44,7 +43,6 @@ struct nf_udp_net { unsigned int timeouts[UDP_CT_MAX]; #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) unsigned int offload_timeout; - unsigned int offload_pickup; #endif }; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 3259416f2ea4..af5115e127cf 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -1478,7 +1478,6 @@ void nf_conntrack_tcp_init_net(struct net *net) #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) tn->offload_timeout = 30 * HZ; - tn->offload_pickup = 120 * HZ; #endif } diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index 698fee49e732..f8e3c0d2602f 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -271,7 +271,6 @@ void nf_conntrack_udp_init_net(struct net *net) #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) un->offload_timeout = 30 * HZ; - un->offload_pickup = 30 * HZ; #endif } diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 214d9f9e499b..e84b499b7bfa 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -575,7 +575,6 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_UNACK, #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD, - NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD_PICKUP, #endif NF_SYSCTL_CT_PROTO_TCP_LOOSE, NF_SYSCTL_CT_PROTO_TCP_LIBERAL, @@ -585,7 +584,6 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM, #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD, - NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD_PICKUP, #endif NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP, NF_SYSCTL_CT_PROTO_TIMEOUT_ICMPV6, @@ -776,12 +774,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD_PICKUP] = { - .procname = "nf_flowtable_tcp_pickup", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, #endif [NF_SYSCTL_CT_PROTO_TCP_LOOSE] = { .procname = "nf_conntrack_tcp_loose", @@ -832,12 +824,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, - [NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD_PICKUP] = { - .procname = "nf_flowtable_udp_pickup", - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_jiffies, - }, #endif [NF_SYSCTL_CT_PROTO_TIMEOUT_ICMP] = { .procname = "nf_conntrack_icmp_timeout", @@ -1018,7 +1004,6 @@ static void nf_conntrack_standalone_init_tcp_sysctl(struct net *net, #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD].data = &tn->offload_timeout; - table[NF_SYSCTL_CT_PROTO_TIMEOUT_TCP_OFFLOAD_PICKUP].data = &tn->offload_pickup; #endif } @@ -1111,7 +1096,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_STREAM].data = &un->timeouts[UDP_CT_REPLIED]; #if IS_ENABLED(CONFIG_NF_FLOW_TABLE) table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD].data = &un->offload_timeout; - table[NF_SYSCTL_CT_PROTO_TIMEOUT_UDP_OFFLOAD_PICKUP].data = &un->offload_pickup; #endif nf_conntrack_standalone_init_tcp_sysctl(net, table); diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 551976e4284c..8788b519255e 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -183,7 +183,7 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) const struct nf_conntrack_l4proto *l4proto; struct net *net = nf_ct_net(ct); int l4num = nf_ct_protonum(ct); - unsigned int timeout; + s32 timeout; l4proto = nf_ct_l4proto_find(l4num); if (!l4proto) @@ -192,15 +192,20 @@ static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) if (l4num == IPPROTO_TCP) { struct nf_tcp_net *tn = nf_tcp_pernet(net); - timeout = tn->offload_pickup; + timeout = tn->timeouts[TCP_CONNTRACK_ESTABLISHED]; + timeout -= tn->offload_timeout; } else if (l4num == IPPROTO_UDP) { struct nf_udp_net *tn = nf_udp_pernet(net); - timeout = tn->offload_pickup; + timeout = tn->timeouts[UDP_CT_REPLIED]; + timeout -= tn->offload_timeout; } else { return; } + if (timeout < 0) + timeout = 0; + if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout) ct->timeout = nfct_time_stamp + timeout; } From 269fc69533de73a9065c0b7971bcd109880290b3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso <pablo@netfilter.org> Date: Fri, 6 Aug 2021 12:33:57 +0200 Subject: [PATCH 13/87] netfilter: nfnetlink_hook: translate inet ingress to netdev The NFPROTO_INET pseudofamily is not exposed through this new netlink interface. The netlink dump either shows NFPROTO_IPV4 or NFPROTO_IPV6 for NFPROTO_INET prerouting/input/forward/output/postrouting hooks. The NFNLA_CHAIN_FAMILY attribute provides the family chain, which specifies if this hook applies to inet traffic only (either IPv4 or IPv6). Translate the inet/ingress hook to netdev/ingress to fully hide the NFPROTO_INET implementation details. Fixes: e2cf17d3774c ("netfilter: add new hook nfnl subsystem") Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> --- net/netfilter/nfnetlink_hook.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nfnetlink_hook.c b/net/netfilter/nfnetlink_hook.c index 32eea785ae25..f554e2ea32ee 100644 --- a/net/netfilter/nfnetlink_hook.c +++ b/net/netfilter/nfnetlink_hook.c @@ -119,6 +119,7 @@ static int nfnl_hook_dump_one(struct sk_buff *nlskb, unsigned int portid = NETLINK_CB(nlskb).portid; struct nlmsghdr *nlh; int ret = -EMSGSIZE; + u32 hooknum; #ifdef CONFIG_KALLSYMS char sym[KSYM_SYMBOL_LEN]; char *module_name; @@ -156,7 +157,12 @@ static int nfnl_hook_dump_one(struct sk_buff *nlskb, goto nla_put_failure; #endif - ret = nla_put_be32(nlskb, NFNLA_HOOK_HOOKNUM, htonl(ops->hooknum)); + if (ops->pf == NFPROTO_INET && ops->hooknum == NF_INET_INGRESS) + hooknum = NF_NETDEV_INGRESS; + else + hooknum = ops->hooknum; + + ret = nla_put_be32(nlskb, NFNLA_HOOK_HOOKNUM, htonl(hooknum)); if (ret) goto nla_put_failure; From 78d14bda861dd2729f15bb438fe355b48514bfe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Robin=20G=C3=B6gge?= <r.goegge@googlemail.com> Date: Thu, 29 Jul 2021 00:58:25 +0200 Subject: [PATCH 14/87] libbpf: Fix probe for BPF_PROG_TYPE_CGROUP_SOCKOPT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes the probe for BPF_PROG_TYPE_CGROUP_SOCKOPT, so the probe reports accurate results when used by e.g. bpftool. Fixes: 4cdbfb59c44a ("libbpf: support sockopt hooks") Signed-off-by: Robin Gögge <r.goegge@gmail.com> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Reviewed-by: Quentin Monnet <quentin@isovalent.com> Link: https://lore.kernel.org/bpf/20210728225825.2357586-1-r.goegge@gmail.com --- tools/lib/bpf/libbpf_probes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c index ecaae2927ab8..cd8c703dde71 100644 --- a/tools/lib/bpf/libbpf_probes.c +++ b/tools/lib/bpf/libbpf_probes.c @@ -75,6 +75,9 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: xattr.expected_attach_type = BPF_CGROUP_INET4_CONNECT; break; + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + xattr.expected_attach_type = BPF_CGROUP_GETSOCKOPT; + break; case BPF_PROG_TYPE_SK_LOOKUP: xattr.expected_attach_type = BPF_SK_LOOKUP; break; @@ -104,7 +107,6 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, case BPF_PROG_TYPE_SK_REUSEPORT: case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_CGROUP_SYSCTL: - case BPF_PROG_TYPE_CGROUP_SOCKOPT: case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_STRUCT_OPS: case BPF_PROG_TYPE_EXT: From c34c338a40e4f3b6f80889cd17fd9281784d1c32 Mon Sep 17 00:00:00 2001 From: Daniel Xu <dxu@dxuuu.xyz> Date: Wed, 28 Jul 2021 16:09:21 -0700 Subject: [PATCH 15/87] libbpf: Do not close un-owned FD 0 on errors Before this patch, btf_new() was liable to close an arbitrary FD 0 if BTF parsing failed. This was because: * btf->fd was initialized to 0 through the calloc() * btf__free() (in the `done` label) closed any FDs >= 0 * btf->fd is left at 0 if parsing fails This issue was discovered on a system using libbpf v0.3 (without BTF_KIND_FLOAT support) but with a kernel that had BTF_KIND_FLOAT types in BTF. Thus, parsing fails. While this patch technically doesn't fix any issues b/c upstream libbpf has BTF_KIND_FLOAT support, it'll help prevent issues in the future if more BTF types are added. It also allow the fix to be backported to older libbpf's. Fixes: 3289959b97ca ("libbpf: Support BTF loading and raw data output in both endianness") Signed-off-by: Daniel Xu <dxu@dxuuu.xyz> Signed-off-by: Andrii Nakryiko <andrii@kernel.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Yonghong Song <yhs@fb.com> Link: https://lore.kernel.org/bpf/5969bb991adedb03c6ae93e051fd2a00d293cf25.1627513670.git.dxu@dxuuu.xyz --- tools/lib/bpf/btf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index b46760b93bb4..7ff3d5ce44f9 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -804,6 +804,7 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf) btf->nr_types = 0; btf->start_id = 1; btf->start_str_off = 0; + btf->fd = -1; if (base_btf) { btf->base_btf = base_btf; @@ -832,8 +833,6 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf) if (err) goto done; - btf->fd = -1; - done: if (err) { btf__free(btf); From 7c4a22339e7ce7b6ed473a8e682da622c3a774ee Mon Sep 17 00:00:00 2001 From: Randy Dunlap <rdunlap@infradead.org> Date: Sun, 1 Aug 2021 18:50:37 -0700 Subject: [PATCH 16/87] libbpf, doc: Eliminate warnings in libbpf_naming_convention Use "code-block: none" instead of "c" for non-C-language code blocks. Removes these warnings: lnx-514-rc4/Documentation/bpf/libbpf/libbpf_naming_convention.rst:111: WARNING: Could not lex literal_block as "c". Highlighting skipped. lnx-514-rc4/Documentation/bpf/libbpf/libbpf_naming_convention.rst:124: WARNING: Could not lex literal_block as "c". Highlighting skipped. Fixes: f42cfb469f9b ("bpf: Add documentation for libbpf including API autogen") Signed-off-by: Randy Dunlap <rdunlap@infradead.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Link: https://lore.kernel.org/bpf/20210802015037.787-1-rdunlap@infradead.org --- Documentation/bpf/libbpf/libbpf_naming_convention.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/bpf/libbpf/libbpf_naming_convention.rst b/Documentation/bpf/libbpf/libbpf_naming_convention.rst index 3de1d51e41da..6bf9c5ac7576 100644 --- a/Documentation/bpf/libbpf/libbpf_naming_convention.rst +++ b/Documentation/bpf/libbpf/libbpf_naming_convention.rst @@ -108,7 +108,7 @@ This bump in ABI version is at most once per kernel development cycle. For example, if current state of ``libbpf.map`` is: -.. code-block:: c +.. code-block:: none LIBBPF_0.0.1 { global: @@ -121,7 +121,7 @@ For example, if current state of ``libbpf.map`` is: , and a new symbol ``bpf_func_c`` is being introduced, then ``libbpf.map`` should be changed like this: -.. code-block:: c +.. code-block:: none LIBBPF_0.0.1 { global: From c4eb1f403243fc7bbb7de644db8587c03de36da6 Mon Sep 17 00:00:00 2001 From: Tatsuhiko Yasumatsu <th.yasumatsu@gmail.com> Date: Sat, 7 Aug 2021 00:04:18 +0900 Subject: [PATCH 17/87] bpf: Fix integer overflow involving bucket_size In __htab_map_lookup_and_delete_batch(), hash buckets are iterated over to count the number of elements in each bucket (bucket_size). If bucket_size is large enough, the multiplication to calculate kvmalloc() size could overflow, resulting in out-of-bounds write as reported by KASAN: [...] [ 104.986052] BUG: KASAN: vmalloc-out-of-bounds in __htab_map_lookup_and_delete_batch+0x5ce/0xb60 [ 104.986489] Write of size 4194224 at addr ffffc9010503be70 by task crash/112 [ 104.986889] [ 104.987193] CPU: 0 PID: 112 Comm: crash Not tainted 5.14.0-rc4 #13 [ 104.987552] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 [ 104.988104] Call Trace: [ 104.988410] dump_stack_lvl+0x34/0x44 [ 104.988706] print_address_description.constprop.0+0x21/0x140 [ 104.988991] ? __htab_map_lookup_and_delete_batch+0x5ce/0xb60 [ 104.989327] ? __htab_map_lookup_and_delete_batch+0x5ce/0xb60 [ 104.989622] kasan_report.cold+0x7f/0x11b [ 104.989881] ? __htab_map_lookup_and_delete_batch+0x5ce/0xb60 [ 104.990239] kasan_check_range+0x17c/0x1e0 [ 104.990467] memcpy+0x39/0x60 [ 104.990670] __htab_map_lookup_and_delete_batch+0x5ce/0xb60 [ 104.990982] ? __wake_up_common+0x4d/0x230 [ 104.991256] ? htab_of_map_free+0x130/0x130 [ 104.991541] bpf_map_do_batch+0x1fb/0x220 [...] In hashtable, if the elements' keys have the same jhash() value, the elements will be put into the same bucket. By putting a lot of elements into a single bucket, the value of bucket_size can be increased to trigger the integer overflow. Triggering the overflow is possible for both callers with CAP_SYS_ADMIN and callers without CAP_SYS_ADMIN. It will be trivial for a caller with CAP_SYS_ADMIN to intentionally reach this overflow by enabling BPF_F_ZERO_SEED. As this flag will set the random seed passed to jhash() to 0, it will be easy for the caller to prepare keys which will be hashed into the same value, and thus put all the elements into the same bucket. If the caller does not have CAP_SYS_ADMIN, BPF_F_ZERO_SEED cannot be used. However, it will be still technically possible to trigger the overflow, by guessing the random seed value passed to jhash() (32bit) and repeating the attempt to trigger the overflow. In this case, the probability to trigger the overflow will be low and will take a very long time. Fix the integer overflow by calling kvmalloc_array() instead of kvmalloc() to allocate memory. Fixes: 057996380a42 ("bpf: Add batch ops to all htab bpf map") Signed-off-by: Tatsuhiko Yasumatsu <th.yasumatsu@gmail.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Link: https://lore.kernel.org/bpf/20210806150419.109658-1-th.yasumatsu@gmail.com --- kernel/bpf/hashtab.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 72c58cc516a3..9c011f3a2687 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1565,8 +1565,8 @@ alloc: /* We cannot do copy_from_user or copy_to_user inside * the rcu_read_lock. Allocate enough space here. */ - keys = kvmalloc(key_size * bucket_size, GFP_USER | __GFP_NOWARN); - values = kvmalloc(value_size * bucket_size, GFP_USER | __GFP_NOWARN); + keys = kvmalloc_array(key_size, bucket_size, GFP_USER | __GFP_NOWARN); + values = kvmalloc_array(value_size, bucket_size, GFP_USER | __GFP_NOWARN); if (!keys || !values) { ret = -ENOMEM; goto after_loop; From 2115d3d482656ea702f7cf308c0ded3500282903 Mon Sep 17 00:00:00 2001 From: Hayes Wang <hayeswang@realtek.com> Date: Fri, 6 Aug 2021 17:15:55 +0800 Subject: [PATCH 18/87] Revert "r8169: avoid link-up interrupt issue on RTL8106e if user enables ASPM" This reverts commit 1ee8856de82faec9bc8bd0f2308a7f27e30ba207. This is used to re-enable ASPM on RTL8106e, if it is possible. Signed-off-by: Hayes Wang <hayeswang@realtek.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/ethernet/realtek/r8169_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index c7af5bc3b8af..731631511dcf 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -3508,6 +3508,7 @@ static void rtl_hw_start_8106(struct rtl8169_private *tp) rtl_eri_write(tp, 0x1b0, ERIAR_MASK_0011, 0x0000); rtl_pcie_state_l2l3_disable(tp); + rtl_hw_aspm_clkreq_enable(tp, true); } DECLARE_RTL_COND(rtl_mac_ocp_e00e_cond) From 9c40186488145b57f800de120f0872168772adfe Mon Sep 17 00:00:00 2001 From: Hayes Wang <hayeswang@realtek.com> Date: Fri, 6 Aug 2021 17:15:56 +0800 Subject: [PATCH 19/87] r8169: change the L0/L1 entrance latencies for RTL8106e The original L0 and L1 entrance latencies of RTL8106e are 4us. And they cause the delay of link-up interrupt when enabling ASPM. Change the L0 entrance latency to 7us and L1 entrance latency to 32us. Then, they could avoid the issue. Tested-by: Koba Ko <koba.ko@canonical.com> Signed-off-by: Hayes Wang <hayeswang@realtek.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/ethernet/realtek/r8169_main.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 731631511dcf..4d8e337f5085 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -3502,6 +3502,9 @@ static void rtl_hw_start_8106(struct rtl8169_private *tp) RTL_W8(tp, MCU, RTL_R8(tp, MCU) | EN_NDP | EN_OOB_RESET); RTL_W8(tp, DLLPR, RTL_R8(tp, DLLPR) & ~PFM_EN); + /* The default value is 0x13. Change it to 0x2f */ + rtl_csi_access_enable(tp, 0x2f); + rtl_eri_write(tp, 0x1d0, ERIAR_MASK_0011, 0x0000); /* disable EEE */ From 47fac45600aafc5939d9620055c3c46f7135d316 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel <o.rempel@pengutronix.de> Date: Fri, 6 Aug 2021 11:47:23 +0200 Subject: [PATCH 20/87] net: dsa: qca: ar9331: make proper initial port defaults Make sure that all external port are actually isolated from each other, so no packets are leaked. Fixes: ec6698c272de ("net: dsa: add support for Atheros AR9331 built-in switch") Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/dsa/qca/ar9331.c | 73 +++++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/qca/ar9331.c b/drivers/net/dsa/qca/ar9331.c index 6686192e1883..563d8a279030 100644 --- a/drivers/net/dsa/qca/ar9331.c +++ b/drivers/net/dsa/qca/ar9331.c @@ -101,6 +101,23 @@ AR9331_SW_PORT_STATUS_RX_FLOW_EN | AR9331_SW_PORT_STATUS_TX_FLOW_EN | \ AR9331_SW_PORT_STATUS_SPEED_M) +#define AR9331_SW_REG_PORT_CTRL(_port) (0x104 + (_port) * 0x100) +#define AR9331_SW_PORT_CTRL_HEAD_EN BIT(11) +#define AR9331_SW_PORT_CTRL_PORT_STATE GENMASK(2, 0) +#define AR9331_SW_PORT_CTRL_PORT_STATE_DISABLED 0 +#define AR9331_SW_PORT_CTRL_PORT_STATE_BLOCKING 1 +#define AR9331_SW_PORT_CTRL_PORT_STATE_LISTENING 2 +#define AR9331_SW_PORT_CTRL_PORT_STATE_LEARNING 3 +#define AR9331_SW_PORT_CTRL_PORT_STATE_FORWARD 4 + +#define AR9331_SW_REG_PORT_VLAN(_port) (0x108 + (_port) * 0x100) +#define AR9331_SW_PORT_VLAN_8021Q_MODE GENMASK(31, 30) +#define AR9331_SW_8021Q_MODE_SECURE 3 +#define AR9331_SW_8021Q_MODE_CHECK 2 +#define AR9331_SW_8021Q_MODE_FALLBACK 1 +#define AR9331_SW_8021Q_MODE_NONE 0 +#define AR9331_SW_PORT_VLAN_PORT_VID_MEMBER GENMASK(25, 16) + /* MIB registers */ #define AR9331_MIB_COUNTER(x) (0x20000 + ((x) * 0x100)) @@ -371,11 +388,59 @@ static int ar9331_sw_mbus_init(struct ar9331_sw_priv *priv) return 0; } +static int ar9331_sw_setup_port(struct dsa_switch *ds, int port) +{ + struct ar9331_sw_priv *priv = (struct ar9331_sw_priv *)ds->priv; + struct regmap *regmap = priv->regmap; + u32 port_mask, port_ctrl, val; + int ret; + + /* Generate default port settings */ + port_ctrl = FIELD_PREP(AR9331_SW_PORT_CTRL_PORT_STATE, + AR9331_SW_PORT_CTRL_PORT_STATE_FORWARD); + + if (dsa_is_cpu_port(ds, port)) { + /* CPU port should be allowed to communicate with all user + * ports. + */ + port_mask = dsa_user_ports(ds); + /* Enable Atheros header on CPU port. This will allow us + * communicate with each port separately + */ + port_ctrl |= AR9331_SW_PORT_CTRL_HEAD_EN; + } else if (dsa_is_user_port(ds, port)) { + /* User ports should communicate only with the CPU port. + */ + port_mask = BIT(dsa_upstream_port(ds, port)); + } else { + /* Other ports do not need to communicate at all */ + port_mask = 0; + } + + val = FIELD_PREP(AR9331_SW_PORT_VLAN_8021Q_MODE, + AR9331_SW_8021Q_MODE_NONE) | + FIELD_PREP(AR9331_SW_PORT_VLAN_PORT_VID_MEMBER, port_mask); + + ret = regmap_write(regmap, AR9331_SW_REG_PORT_VLAN(port), val); + if (ret) + goto error; + + ret = regmap_write(regmap, AR9331_SW_REG_PORT_CTRL(port), port_ctrl); + if (ret) + goto error; + + return 0; +error: + dev_err(priv->dev, "%s: error: %i\n", __func__, ret); + + return ret; +} + static int ar9331_sw_setup(struct dsa_switch *ds) { struct ar9331_sw_priv *priv = (struct ar9331_sw_priv *)ds->priv; struct regmap *regmap = priv->regmap; - int ret; + int ret, i; ret = ar9331_sw_reset(priv); if (ret) @@ -402,6 +467,12 @@ static int ar9331_sw_setup(struct dsa_switch *ds) if (ret) goto error; + for (i = 0; i < ds->num_ports; i++) { + ret = ar9331_sw_setup_port(ds, i); + if (ret) + goto error; + } + ds->configure_vlan_while_not_filtering = false; return 0; From 34737e1320db6d51f0d140d5c684b9eb32f0da76 Mon Sep 17 00:00:00 2001 From: Loic Poulain <loic.poulain@linaro.org> Date: Fri, 6 Aug 2021 12:35:09 +0200 Subject: [PATCH 21/87] net: wwan: mhi_wwan_ctrl: Fix possible deadlock Lockdep detected possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&mhiwwan->rx_lock); local_irq_disable(); lock(&mhi_cntrl->pm_lock); lock(&mhiwwan->rx_lock); <Interrupt> lock(&mhi_cntrl->pm_lock); *** DEADLOCK *** To prevent this we need to disable the soft-interrupts when taking the rx_lock. Cc: stable@vger.kernel.org Fixes: fa588eba632d ("net: Add Qcom WWAN control driver") Reported-by: Thomas Perrot <thomas.perrot@bootlin.com> Signed-off-by: Loic Poulain <loic.poulain@linaro.org> Reviewed-by: Sergey Ryazanov <ryazanov.s.a@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/wwan/mhi_wwan_ctrl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/wwan/mhi_wwan_ctrl.c b/drivers/net/wwan/mhi_wwan_ctrl.c index 1e18420ce404..d0a98f34c54d 100644 --- a/drivers/net/wwan/mhi_wwan_ctrl.c +++ b/drivers/net/wwan/mhi_wwan_ctrl.c @@ -41,14 +41,14 @@ struct mhi_wwan_dev { /* Increment RX budget and schedule RX refill if necessary */ static void mhi_wwan_rx_budget_inc(struct mhi_wwan_dev *mhiwwan) { - spin_lock(&mhiwwan->rx_lock); + spin_lock_bh(&mhiwwan->rx_lock); mhiwwan->rx_budget++; if (test_bit(MHI_WWAN_RX_REFILL, &mhiwwan->flags)) schedule_work(&mhiwwan->rx_refill); - spin_unlock(&mhiwwan->rx_lock); + spin_unlock_bh(&mhiwwan->rx_lock); } /* Decrement RX budget if non-zero and return true on success */ @@ -56,7 +56,7 @@ static bool mhi_wwan_rx_budget_dec(struct mhi_wwan_dev *mhiwwan) { bool ret = false; - spin_lock(&mhiwwan->rx_lock); + spin_lock_bh(&mhiwwan->rx_lock); if (mhiwwan->rx_budget) { mhiwwan->rx_budget--; @@ -64,7 +64,7 @@ static bool mhi_wwan_rx_budget_dec(struct mhi_wwan_dev *mhiwwan) ret = true; } - spin_unlock(&mhiwwan->rx_lock); + spin_unlock_bh(&mhiwwan->rx_lock); return ret; } @@ -130,9 +130,9 @@ static void mhi_wwan_ctrl_stop(struct wwan_port *port) { struct mhi_wwan_dev *mhiwwan = wwan_port_get_drvdata(port); - spin_lock(&mhiwwan->rx_lock); + spin_lock_bh(&mhiwwan->rx_lock); clear_bit(MHI_WWAN_RX_REFILL, &mhiwwan->flags); - spin_unlock(&mhiwwan->rx_lock); + spin_unlock_bh(&mhiwwan->rx_lock); cancel_work_sync(&mhiwwan->rx_refill); From 2383cb9497d113360137a2be308b390faa80632d Mon Sep 17 00:00:00 2001 From: Ben Hutchings <ben.hutchings@mind.be> Date: Sat, 7 Aug 2021 02:06:18 +0200 Subject: [PATCH 22/87] net: phy: micrel: Fix link detection on ksz87xx switch" Commit a5e63c7d38d5 "net: phy: micrel: Fix detection of ksz87xx switch" broke link detection on the external ports of the KSZ8795. The previously unused phy_driver structure for these devices specifies config_aneg and read_status functions that appear to be designed for a fixed link and do not work with the embedded PHYs in the KSZ8795. Delete the use of these functions in favour of the generic PHY implementations which were used previously. Fixes: a5e63c7d38d5 ("net: phy: micrel: Fix detection of ksz87xx switch") Signed-off-by: Ben Hutchings <ben.hutchings@mind.be> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/phy/micrel.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c index 53bdd673ae56..5c928f827173 100644 --- a/drivers/net/phy/micrel.c +++ b/drivers/net/phy/micrel.c @@ -1760,8 +1760,6 @@ static struct phy_driver ksphy_driver[] = { .name = "Micrel KSZ87XX Switch", /* PHY_BASIC_FEATURES */ .config_init = kszphy_config_init, - .config_aneg = ksz8873mll_config_aneg, - .read_status = ksz8873mll_read_status, .match_phy_device = ksz8795_match_phy_device, .suspend = genphy_suspend, .resume = genphy_resume, From d329e41a08f37c478159d5c3379a17b9c07befa3 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes <vinicius.gomes@intel.com> Date: Fri, 6 Aug 2021 18:15:46 -0700 Subject: [PATCH 23/87] ptp: Fix possible memory leak caused by invalid cast Fixes possible leak of PTP virtual clocks. The number of PTP virtual clocks to be unregistered is passed as 'u32', but the function that unregister the devices handles that as 'u8'. Fixes: 73f37068d540 ("ptp: support ptp physical/virtual clocks conversion") Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/ptp/ptp_sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ptp/ptp_sysfs.c b/drivers/ptp/ptp_sysfs.c index b3d96b747292..41b92dc2f011 100644 --- a/drivers/ptp/ptp_sysfs.c +++ b/drivers/ptp/ptp_sysfs.c @@ -154,7 +154,7 @@ static int unregister_vclock(struct device *dev, void *data) struct ptp_clock *ptp = dev_get_drvdata(dev); struct ptp_clock_info *info = ptp->info; struct ptp_vclock *vclock; - u8 *num = data; + u32 *num = data; vclock = info_to_vclock(info); dev_info(dev->parent, "delete virtual clock ptp%d\n", From 1027b96ec9d34f9abab69bc1a4dc5b1ad8ab1349 Mon Sep 17 00:00:00 2001 From: Kefeng Wang <wangkefeng.wang@huawei.com> Date: Fri, 6 Aug 2021 16:21:24 +0800 Subject: [PATCH 24/87] once: Fix panic when module unload DO_ONCE DEFINE_STATIC_KEY_TRUE(___once_key); __do_once_done once_disable_jump(once_key); INIT_WORK(&w->work, once_deferred); struct once_work *w; w->key = key; schedule_work(&w->work); module unload //*the key is destroy* process_one_work once_deferred BUG_ON(!static_key_enabled(work->key)); static_key_count((struct static_key *)x) //*access key, crash* When module uses DO_ONCE mechanism, it could crash due to the above concurrency problem, we could reproduce it with link[1]. Fix it by add/put module refcount in the once work process. [1] https://lore.kernel.org/netdev/eaa6c371-465e-57eb-6be9-f4b16b9d7cbf@huawei.com/ Cc: Hannes Frederic Sowa <hannes@stressinduktion.org> Cc: Daniel Borkmann <daniel@iogearbox.net> Cc: David S. Miller <davem@davemloft.net> Cc: Eric Dumazet <edumazet@google.com> Reported-by: Minmin chen <chenmingmin@huawei.com> Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com> Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org> Signed-off-by: David S. Miller <davem@davemloft.net> --- include/linux/once.h | 4 ++-- lib/once.c | 11 ++++++++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/include/linux/once.h b/include/linux/once.h index 9225ee6d96c7..ae6f4eb41cbe 100644 --- a/include/linux/once.h +++ b/include/linux/once.h @@ -7,7 +7,7 @@ bool __do_once_start(bool *done, unsigned long *flags); void __do_once_done(bool *done, struct static_key_true *once_key, - unsigned long *flags); + unsigned long *flags, struct module *mod); /* Call a function exactly once. The idea of DO_ONCE() is to perform * a function call such as initialization of random seeds, etc, only @@ -46,7 +46,7 @@ void __do_once_done(bool *done, struct static_key_true *once_key, if (unlikely(___ret)) { \ func(__VA_ARGS__); \ __do_once_done(&___done, &___once_key, \ - &___flags); \ + &___flags, THIS_MODULE); \ } \ } \ ___ret; \ diff --git a/lib/once.c b/lib/once.c index 8b7d6235217e..59149bf3bfb4 100644 --- a/lib/once.c +++ b/lib/once.c @@ -3,10 +3,12 @@ #include <linux/spinlock.h> #include <linux/once.h> #include <linux/random.h> +#include <linux/module.h> struct once_work { struct work_struct work; struct static_key_true *key; + struct module *module; }; static void once_deferred(struct work_struct *w) @@ -16,10 +18,11 @@ static void once_deferred(struct work_struct *w) work = container_of(w, struct once_work, work); BUG_ON(!static_key_enabled(work->key)); static_branch_disable(work->key); + module_put(work->module); kfree(work); } -static void once_disable_jump(struct static_key_true *key) +static void once_disable_jump(struct static_key_true *key, struct module *mod) { struct once_work *w; @@ -29,6 +32,8 @@ static void once_disable_jump(struct static_key_true *key) INIT_WORK(&w->work, once_deferred); w->key = key; + w->module = mod; + __module_get(mod); schedule_work(&w->work); } @@ -53,11 +58,11 @@ bool __do_once_start(bool *done, unsigned long *flags) EXPORT_SYMBOL(__do_once_start); void __do_once_done(bool *done, struct static_key_true *once_key, - unsigned long *flags) + unsigned long *flags, struct module *mod) __releases(once_lock) { *done = true; spin_unlock_irqrestore(&once_lock, *flags); - once_disable_jump(once_key); + once_disable_jump(once_key, mod); } EXPORT_SYMBOL(__do_once_done); From fbfee25796e2688004d58ad4d0673279366b97dd Mon Sep 17 00:00:00 2001 From: Michael Chan <michael.chan@broadcom.com> Date: Sat, 7 Aug 2021 15:03:13 -0400 Subject: [PATCH 25/87] bnxt_en: Update firmware interface to 1.10.2.52 The key change is the firmware call to retrieve the PTP TX timestamp. The header offset for the PTP sequence number field is now added. Signed-off-by: Michael Chan <michael.chan@broadcom.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h | 76 ++++++++++++++----- 1 file changed, 55 insertions(+), 21 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h index 3fc6781c5b98..94d07a9f7034 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hsi.h @@ -368,6 +368,7 @@ struct cmd_nums { #define HWRM_FUNC_PTP_TS_QUERY 0x19fUL #define HWRM_FUNC_PTP_EXT_CFG 0x1a0UL #define HWRM_FUNC_PTP_EXT_QCFG 0x1a1UL + #define HWRM_FUNC_KEY_CTX_ALLOC 0x1a2UL #define HWRM_SELFTEST_QLIST 0x200UL #define HWRM_SELFTEST_EXEC 0x201UL #define HWRM_SELFTEST_IRQ 0x202UL @@ -531,8 +532,8 @@ struct hwrm_err_output { #define HWRM_VERSION_MAJOR 1 #define HWRM_VERSION_MINOR 10 #define HWRM_VERSION_UPDATE 2 -#define HWRM_VERSION_RSVD 47 -#define HWRM_VERSION_STR "1.10.2.47" +#define HWRM_VERSION_RSVD 52 +#define HWRM_VERSION_STR "1.10.2.52" /* hwrm_ver_get_input (size:192b/24B) */ struct hwrm_ver_get_input { @@ -585,6 +586,7 @@ struct hwrm_ver_get_output { #define VER_GET_RESP_DEV_CAPS_CFG_CFA_ADV_FLOW_MGNT_SUPPORTED 0x1000UL #define VER_GET_RESP_DEV_CAPS_CFG_CFA_TFLIB_SUPPORTED 0x2000UL #define VER_GET_RESP_DEV_CAPS_CFG_CFA_TRUFLOW_SUPPORTED 0x4000UL + #define VER_GET_RESP_DEV_CAPS_CFG_SECURE_BOOT_CAPABLE 0x8000UL u8 roce_fw_maj_8b; u8 roce_fw_min_8b; u8 roce_fw_bld_8b; @@ -886,7 +888,8 @@ struct hwrm_async_event_cmpl_reset_notify { #define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FW_EXCEPTION_FATAL (0x2UL << 8) #define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FW_EXCEPTION_NON_FATAL (0x3UL << 8) #define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FAST_RESET (0x4UL << 8) - #define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_LAST ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FAST_RESET + #define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FW_ACTIVATION (0x5UL << 8) + #define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_LAST ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_REASON_CODE_FW_ACTIVATION #define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_DELAY_IN_100MS_TICKS_MASK 0xffff0000UL #define ASYNC_EVENT_CMPL_RESET_NOTIFY_EVENT_DATA1_DELAY_IN_100MS_TICKS_SFT 16 }; @@ -1236,13 +1239,14 @@ struct hwrm_async_event_cmpl_error_report_base { u8 timestamp_lo; __le16 timestamp_hi; __le32 event_data1; - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_MASK 0xffUL - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_SFT 0 - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_RESERVED 0x0UL - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_PAUSE_STORM 0x1UL - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_INVALID_SIGNAL 0x2UL - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_NVM 0x3UL - #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_LAST ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_NVM + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_MASK 0xffUL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_SFT 0 + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_RESERVED 0x0UL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_PAUSE_STORM 0x1UL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_INVALID_SIGNAL 0x2UL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_NVM 0x3UL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DOORBELL_DROP_THRESHOLD 0x4UL + #define ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_LAST ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DOORBELL_DROP_THRESHOLD }; /* hwrm_async_event_cmpl_error_report_pause_storm (size:128b/16B) */ @@ -1446,6 +1450,8 @@ struct hwrm_func_vf_cfg_input { #define FUNC_VF_CFG_REQ_ENABLES_NUM_VNICS 0x200UL #define FUNC_VF_CFG_REQ_ENABLES_NUM_STAT_CTXS 0x400UL #define FUNC_VF_CFG_REQ_ENABLES_NUM_HW_RING_GRPS 0x800UL + #define FUNC_VF_CFG_REQ_ENABLES_NUM_TX_KEY_CTXS 0x1000UL + #define FUNC_VF_CFG_REQ_ENABLES_NUM_RX_KEY_CTXS 0x2000UL __le16 mtu; __le16 guest_vlan; __le16 async_event_cr; @@ -1469,7 +1475,8 @@ struct hwrm_func_vf_cfg_input { __le16 num_vnics; __le16 num_stat_ctxs; __le16 num_hw_ring_grps; - u8 unused_0[4]; + __le16 num_tx_key_ctxs; + __le16 num_rx_key_ctxs; }; /* hwrm_func_vf_cfg_output (size:128b/16B) */ @@ -1493,7 +1500,7 @@ struct hwrm_func_qcaps_input { u8 unused_0[6]; }; -/* hwrm_func_qcaps_output (size:704b/88B) */ +/* hwrm_func_qcaps_output (size:768b/96B) */ struct hwrm_func_qcaps_output { __le16 error_code; __le16 req_type; @@ -1587,7 +1594,8 @@ struct hwrm_func_qcaps_output { #define FUNC_QCAPS_RESP_MPC_CHNLS_CAP_TE_CFA 0x4UL #define FUNC_QCAPS_RESP_MPC_CHNLS_CAP_RE_CFA 0x8UL #define FUNC_QCAPS_RESP_MPC_CHNLS_CAP_PRIMATE 0x10UL - u8 unused_1; + __le16 max_key_ctxs_alloc; + u8 unused_1[7]; u8 valid; }; @@ -1602,7 +1610,7 @@ struct hwrm_func_qcfg_input { u8 unused_0[6]; }; -/* hwrm_func_qcfg_output (size:832b/104B) */ +/* hwrm_func_qcfg_output (size:896b/112B) */ struct hwrm_func_qcfg_output { __le16 error_code; __le16 req_type; @@ -1749,11 +1757,13 @@ struct hwrm_func_qcfg_output { #define FUNC_QCFG_RESP_PARTITION_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 (0x1UL << 29) #define FUNC_QCFG_RESP_PARTITION_MAX_BW_BW_VALUE_UNIT_LAST FUNC_QCFG_RESP_PARTITION_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 __le16 host_mtu; - u8 unused_3; + __le16 alloc_tx_key_ctxs; + __le16 alloc_rx_key_ctxs; + u8 unused_3[5]; u8 valid; }; -/* hwrm_func_cfg_input (size:832b/104B) */ +/* hwrm_func_cfg_input (size:896b/112B) */ struct hwrm_func_cfg_input { __le16 req_type; __le16 cmpl_ring; @@ -1820,6 +1830,8 @@ struct hwrm_func_cfg_input { #define FUNC_CFG_REQ_ENABLES_PARTITION_MAX_BW 0x8000000UL #define FUNC_CFG_REQ_ENABLES_TPID 0x10000000UL #define FUNC_CFG_REQ_ENABLES_HOST_MTU 0x20000000UL + #define FUNC_CFG_REQ_ENABLES_TX_KEY_CTXS 0x40000000UL + #define FUNC_CFG_REQ_ENABLES_RX_KEY_CTXS 0x80000000UL __le16 admin_mtu; __le16 mru; __le16 num_rsscos_ctxs; @@ -1929,6 +1941,9 @@ struct hwrm_func_cfg_input { #define FUNC_CFG_REQ_PARTITION_MAX_BW_BW_VALUE_UNIT_LAST FUNC_CFG_REQ_PARTITION_MAX_BW_BW_VALUE_UNIT_PERCENT1_100 __be16 tpid; __le16 host_mtu; + __le16 num_tx_key_ctxs; + __le16 num_rx_key_ctxs; + u8 unused_0[4]; }; /* hwrm_func_cfg_output (size:128b/16B) */ @@ -2099,6 +2114,7 @@ struct hwrm_func_drv_rgtr_input { #define FUNC_DRV_RGTR_REQ_FLAGS_MASTER_SUPPORT 0x40UL #define FUNC_DRV_RGTR_REQ_FLAGS_FAST_RESET_SUPPORT 0x80UL #define FUNC_DRV_RGTR_REQ_FLAGS_RSS_STRICT_HASH_TYPE_SUPPORT 0x100UL + #define FUNC_DRV_RGTR_REQ_FLAGS_NPAR_1_2_SUPPORT 0x200UL __le32 enables; #define FUNC_DRV_RGTR_REQ_ENABLES_OS_TYPE 0x1UL #define FUNC_DRV_RGTR_REQ_ENABLES_VER 0x2UL @@ -2268,7 +2284,7 @@ struct hwrm_func_resource_qcaps_input { u8 unused_0[6]; }; -/* hwrm_func_resource_qcaps_output (size:448b/56B) */ +/* hwrm_func_resource_qcaps_output (size:512b/64B) */ struct hwrm_func_resource_qcaps_output { __le16 error_code; __le16 req_type; @@ -2300,11 +2316,15 @@ struct hwrm_func_resource_qcaps_output { __le16 max_tx_scheduler_inputs; __le16 flags; #define FUNC_RESOURCE_QCAPS_RESP_FLAGS_MIN_GUARANTEED 0x1UL + __le16 min_tx_key_ctxs; + __le16 max_tx_key_ctxs; + __le16 min_rx_key_ctxs; + __le16 max_rx_key_ctxs; u8 unused_0[5]; u8 valid; }; -/* hwrm_func_vf_resource_cfg_input (size:448b/56B) */ +/* hwrm_func_vf_resource_cfg_input (size:512b/64B) */ struct hwrm_func_vf_resource_cfg_input { __le16 req_type; __le16 cmpl_ring; @@ -2331,6 +2351,10 @@ struct hwrm_func_vf_resource_cfg_input { __le16 max_hw_ring_grps; __le16 flags; #define FUNC_VF_RESOURCE_CFG_REQ_FLAGS_MIN_GUARANTEED 0x1UL + __le16 min_tx_key_ctxs; + __le16 max_tx_key_ctxs; + __le16 min_rx_key_ctxs; + __le16 max_rx_key_ctxs; u8 unused_0[2]; }; @@ -2348,7 +2372,9 @@ struct hwrm_func_vf_resource_cfg_output { __le16 reserved_vnics; __le16 reserved_stat_ctx; __le16 reserved_hw_ring_grps; - u8 unused_0[7]; + __le16 reserved_tx_key_ctxs; + __le16 reserved_rx_key_ctxs; + u8 unused_0[3]; u8 valid; }; @@ -4220,7 +4246,7 @@ struct hwrm_port_lpbk_clr_stats_output { u8 valid; }; -/* hwrm_port_ts_query_input (size:256b/32B) */ +/* hwrm_port_ts_query_input (size:320b/40B) */ struct hwrm_port_ts_query_input { __le16 req_type; __le16 cmpl_ring; @@ -4238,8 +4264,11 @@ struct hwrm_port_ts_query_input { __le16 enables; #define PORT_TS_QUERY_REQ_ENABLES_TS_REQ_TIMEOUT 0x1UL #define PORT_TS_QUERY_REQ_ENABLES_PTP_SEQ_ID 0x2UL + #define PORT_TS_QUERY_REQ_ENABLES_PTP_HDR_OFFSET 0x4UL __le16 ts_req_timeout; __le32 ptp_seq_id; + __le16 ptp_hdr_offset; + u8 unused_1[6]; }; /* hwrm_port_ts_query_output (size:192b/24B) */ @@ -8172,6 +8201,7 @@ struct hwrm_fw_reset_input { u8 host_idx; u8 flags; #define FW_RESET_REQ_FLAGS_RESET_GRACEFUL 0x1UL + #define FW_RESET_REQ_FLAGS_FW_ACTIVATION 0x2UL u8 unused_0[4]; }; @@ -8952,7 +8982,7 @@ struct hwrm_nvm_get_dir_info_output { u8 valid; }; -/* hwrm_nvm_write_input (size:384b/48B) */ +/* hwrm_nvm_write_input (size:448b/56B) */ struct hwrm_nvm_write_input { __le16 req_type; __le16 cmpl_ring; @@ -8968,7 +8998,11 @@ struct hwrm_nvm_write_input { __le16 option; __le16 flags; #define NVM_WRITE_REQ_FLAGS_KEEP_ORIG_ACTIVE_IMG 0x1UL + #define NVM_WRITE_REQ_FLAGS_BATCH_MODE 0x2UL + #define NVM_WRITE_REQ_FLAGS_BATCH_LAST 0x4UL __le32 dir_item_length; + __le32 offset; + __le32 len; __le32 unused_0; }; From 9e26680733d5c6538ba2e7a111fb49c9ac2dc16a Mon Sep 17 00:00:00 2001 From: Michael Chan <michael.chan@broadcom.com> Date: Sat, 7 Aug 2021 15:03:14 -0400 Subject: [PATCH 26/87] bnxt_en: Update firmware call to retrieve TX PTP timestamp New firmware interface requires the PTP sequence ID header offset to be passed to the firmware to properly find the matching timestamp for all protocols. Fixes: 83bb623c968e ("bnxt_en: Transmit and retrieve packet timestamps") Reviewed-by: Edwin Peer <edwin.peer@broadcom.com> Signed-off-by: Michael Chan <michael.chan@broadcom.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 ++++- drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c | 4 +++- drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h | 6 ++++-- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 89606587b156..2fe743503949 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -426,7 +426,10 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) if (ptp && ptp->tx_tstamp_en && !skb_is_gso(skb) && atomic_dec_if_positive(&ptp->tx_avail) >= 0) { - if (!bnxt_ptp_parse(skb, &ptp->tx_seqid)) { + if (!bnxt_ptp_parse(skb, &ptp->tx_seqid, + &ptp->tx_hdr_off)) { + if (vlan_tag_flags) + ptp->tx_hdr_off += VLAN_HLEN; lflags |= cpu_to_le32(TX_BD_FLAGS_STAMP); skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; } else { diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c index ec381c2423b8..81f40ab748f1 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.c @@ -20,7 +20,7 @@ #include "bnxt.h" #include "bnxt_ptp.h" -int bnxt_ptp_parse(struct sk_buff *skb, u16 *seq_id) +int bnxt_ptp_parse(struct sk_buff *skb, u16 *seq_id, u16 *hdr_off) { unsigned int ptp_class; struct ptp_header *hdr; @@ -34,6 +34,7 @@ int bnxt_ptp_parse(struct sk_buff *skb, u16 *seq_id) if (!hdr) return -EINVAL; + *hdr_off = (u8 *)hdr - skb->data; *seq_id = ntohs(hdr->sequence_id); return 0; default: @@ -91,6 +92,7 @@ static int bnxt_hwrm_port_ts_query(struct bnxt *bp, u32 flags, u64 *ts) PORT_TS_QUERY_REQ_FLAGS_PATH_TX) { req.enables = cpu_to_le16(BNXT_PTP_QTS_TX_ENABLES); req.ptp_seq_id = cpu_to_le32(bp->ptp_cfg->tx_seqid); + req.ptp_hdr_offset = cpu_to_le16(bp->ptp_cfg->tx_hdr_off); req.ts_req_timeout = cpu_to_le16(BNXT_PTP_QTS_TIMEOUT); } mutex_lock(&bp->hwrm_cmd_lock); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h index 254ba7bc0f99..57a8b9243a31 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h @@ -19,7 +19,8 @@ #define BNXT_PTP_QTS_TIMEOUT 1000 #define BNXT_PTP_QTS_TX_ENABLES (PORT_TS_QUERY_REQ_ENABLES_PTP_SEQ_ID | \ - PORT_TS_QUERY_REQ_ENABLES_TS_REQ_TIMEOUT) + PORT_TS_QUERY_REQ_ENABLES_TS_REQ_TIMEOUT | \ + PORT_TS_QUERY_REQ_ENABLES_PTP_HDR_OFFSET) struct bnxt_ptp_cfg { struct ptp_clock_info ptp_info; @@ -37,6 +38,7 @@ struct bnxt_ptp_cfg { #define BNXT_PHC_OVERFLOW_PERIOD (19 * 3600 * HZ) u16 tx_seqid; + u16 tx_hdr_off; struct bnxt *bp; atomic_t tx_avail; #define BNXT_MAX_TX_TS 1 @@ -74,7 +76,7 @@ do { \ ((dst) = READ_ONCE(src)) #endif -int bnxt_ptp_parse(struct sk_buff *skb, u16 *seq_id); +int bnxt_ptp_parse(struct sk_buff *skb, u16 *seq_id, u16 *hdr_off); int bnxt_hwtstamp_set(struct net_device *dev, struct ifreq *ifr); int bnxt_hwtstamp_get(struct net_device *dev, struct ifreq *ifr); int bnxt_get_tx_ts_p5(struct bnxt *bp, struct sk_buff *skb); From 92529df76db5ab184b82674cf7a4eef4b665b40e Mon Sep 17 00:00:00 2001 From: Michael Chan <michael.chan@broadcom.com> Date: Sat, 7 Aug 2021 15:03:15 -0400 Subject: [PATCH 27/87] bnxt_en: Use register window 6 instead of 5 to read the PHC Some older Broadcom debug tools use window 5 and may conflict, so switch to use window 6 instead. Fixes: 118612d519d8 ("bnxt_en: Add PTP clock APIs, ioctls, and ethtool methods") Reviewed-by: Andy Gospodarek <gospo@broadcom.com> Signed-off-by: Michael Chan <michael.chan@broadcom.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h index 57a8b9243a31..524f1c272054 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ptp.h @@ -10,8 +10,8 @@ #ifndef BNXT_PTP_H #define BNXT_PTP_H -#define BNXT_PTP_GRC_WIN 5 -#define BNXT_PTP_GRC_WIN_BASE 0x5000 +#define BNXT_PTP_GRC_WIN 6 +#define BNXT_PTP_GRC_WIN_BASE 0x6000 #define BNXT_MAX_PHC_DRIFT 31000000 #define BNXT_LO_TIMER_MASK 0x0000ffffffffUL From 2459dcb96bcba94c08d6861f8a050185ff301672 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <pali@kernel.org> Date: Sat, 7 Aug 2021 15:27:03 +0200 Subject: [PATCH 28/87] ppp: Fix generating ifname when empty IFLA_IFNAME is specified MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit IFLA_IFNAME is nul-term string which means that IFLA_IFNAME buffer can be larger than length of string which contains. Function __rtnl_newlink() generates new own ifname if either IFLA_IFNAME was not specified at all or userspace passed empty nul-term string. It is expected that if userspace does not specify ifname for new ppp netdev then kernel generates one in format "ppp<id>" where id matches to the ppp unit id which can be later obtained by PPPIOCGUNIT ioctl. And it works in this way if IFLA_IFNAME is not specified at all. But it does not work when IFLA_IFNAME is specified with empty string. So fix this logic also for empty IFLA_IFNAME in ppp_nl_newlink() function and correctly generates ifname based on ppp unit identifier if userspace did not provided preferred ifname. Without this patch when IFLA_IFNAME was specified with empty string then kernel created a new ppp interface in format "ppp<id>" but id did not match ppp unit id returned by PPPIOCGUNIT ioctl. In this case id was some number generated by __rtnl_newlink() function. Signed-off-by: Pali Rohár <pali@kernel.org> Fixes: bb8082f69138 ("ppp: build ifname using unit identifier for rtnl based devices") Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/ppp/ppp_generic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 930e49ef15f6..b2b35ec37702 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1306,7 +1306,7 @@ static int ppp_nl_newlink(struct net *src_net, struct net_device *dev, * the PPP unit identifer as suffix (i.e. ppp<unit_id>). This allows * userspace to infer the device name using to the PPPIOCGUNIT ioctl. */ - if (!tb[IFLA_IFNAME]) + if (!tb[IFLA_IFNAME] || !nla_len(tb[IFLA_IFNAME]) || !*(char *)nla_data(tb[IFLA_IFNAME])) conf.ifname_is_set = false; err = ppp_dev_configure(src_net, dev, &conf); From 3125f26c514826077f2a4490b75e9b1c7a644c42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= <pali@kernel.org> Date: Sat, 7 Aug 2021 18:00:50 +0200 Subject: [PATCH 29/87] ppp: Fix generating ppp unit id when ifname is not specified MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When registering new ppp interface via PPPIOCNEWUNIT ioctl then kernel has to choose interface name as this ioctl API does not support specifying it. Kernel in this case register new interface with name "ppp<id>" where <id> is the ppp unit id, which can be obtained via PPPIOCGUNIT ioctl. This applies also in the case when registering new ppp interface via rtnl without supplying IFLA_IFNAME. PPPIOCNEWUNIT ioctl allows to specify own ppp unit id which will kernel assign to ppp interface, in case this ppp id is not already used by other ppp interface. In case user does not specify ppp unit id then kernel choose the first free ppp unit id. This applies also for case when creating ppp interface via rtnl method as it does not provide a way for specifying own ppp unit id. If some network interface (does not have to be ppp) has name "ppp<id>" with this first free ppp id then PPPIOCNEWUNIT ioctl or rtnl call fails. And registering new ppp interface is not possible anymore, until interface which holds conflicting name is renamed. Or when using rtnl method with custom interface name in IFLA_IFNAME. As list of allocated / used ppp unit ids is not possible to retrieve from kernel to userspace, userspace has no idea what happens nor which interface is doing this conflict. So change the algorithm how ppp unit id is generated. And choose the first number which is not neither used as ppp unit id nor in some network interface with pattern "ppp<id>". This issue can be simply reproduced by following pppd call when there is no ppp interface registered and also no interface with name pattern "ppp<id>": pppd ifname ppp1 +ipv6 noip noauth nolock local nodetach pty "pppd +ipv6 noip noauth nolock local nodetach notty" Or by creating the one ppp interface (which gets assigned ppp unit id 0), renaming it to "ppp1" and then trying to create a new ppp interface (which will always fails as next free ppp unit id is 1, but network interface with name "ppp1" exists). This patch fixes above described issue by generating new and new ppp unit id until some non-conflicting id with network interfaces is generated. Signed-off-by: Pali Rohár <pali@kernel.org> Cc: stable@vger.kernel.org Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/ppp/ppp_generic.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index b2b35ec37702..7a099c37527f 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -284,7 +284,7 @@ static struct channel *ppp_find_channel(struct ppp_net *pn, int unit); static int ppp_connect_channel(struct channel *pch, int unit); static int ppp_disconnect_channel(struct channel *pch); static void ppp_destroy_channel(struct channel *pch); -static int unit_get(struct idr *p, void *ptr); +static int unit_get(struct idr *p, void *ptr, int min); static int unit_set(struct idr *p, void *ptr, int n); static void unit_put(struct idr *p, int n); static void *unit_find(struct idr *p, int n); @@ -1155,9 +1155,20 @@ static int ppp_unit_register(struct ppp *ppp, int unit, bool ifname_is_set) mutex_lock(&pn->all_ppp_mutex); if (unit < 0) { - ret = unit_get(&pn->units_idr, ppp); + ret = unit_get(&pn->units_idr, ppp, 0); if (ret < 0) goto err; + if (!ifname_is_set) { + while (1) { + snprintf(ppp->dev->name, IFNAMSIZ, "ppp%i", ret); + if (!__dev_get_by_name(ppp->ppp_net, ppp->dev->name)) + break; + unit_put(&pn->units_idr, ret); + ret = unit_get(&pn->units_idr, ppp, ret + 1); + if (ret < 0) + goto err; + } + } } else { /* Caller asked for a specific unit number. Fail with -EEXIST * if unavailable. For backward compatibility, return -EEXIST @@ -3552,9 +3563,9 @@ static int unit_set(struct idr *p, void *ptr, int n) } /* get new free unit number and associate pointer with it */ -static int unit_get(struct idr *p, void *ptr) +static int unit_get(struct idr *p, void *ptr, int min) { - return idr_alloc(p, ptr, 0, 0, GFP_KERNEL); + return idr_alloc(p, ptr, min, 0, GFP_KERNEL); } /* put unit number back to a pool */ From 86aab09a4870bb8346c9579864588c3d7f555299 Mon Sep 17 00:00:00 2001 From: Randy Dunlap <rdunlap@infradead.org> Date: Sun, 8 Aug 2021 16:04:40 -0700 Subject: [PATCH 30/87] dccp: add do-while-0 stubs for dccp_pr_debug macros GCC complains about empty macros in an 'if' statement, so convert them to 'do {} while (0)' macros. Fixes these build warnings: net/dccp/output.c: In function 'dccp_xmit_packet': ../net/dccp/output.c:283:71: warning: suggest braces around empty body in an 'if' statement [-Wempty-body] 283 | dccp_pr_debug("transmit_skb() returned err=%d\n", err); net/dccp/ackvec.c: In function 'dccp_ackvec_update_old': ../net/dccp/ackvec.c:163:80: warning: suggest braces around empty body in an 'else' statement [-Wempty-body] 163 | (unsigned long long)seqno, state); Fixes: dc841e30eaea ("dccp: Extend CCID packet dequeueing interface") Fixes: 380240864451 ("dccp ccid-2: Update code for the Ack Vector input/registration routine") Signed-off-by: Randy Dunlap <rdunlap@infradead.org> Cc: dccp@vger.kernel.org Cc: "David S. Miller" <davem@davemloft.net> Cc: Jakub Kicinski <kuba@kernel.org> Cc: Gerrit Renker <gerrit@erg.abdn.ac.uk> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/dccp/dccp.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 9cc9d1ee6cdb..c5c1d2b8045e 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -41,9 +41,9 @@ extern bool dccp_debug; #define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) #define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) #else -#define dccp_pr_debug(format, a...) -#define dccp_pr_debug_cat(format, a...) -#define dccp_debug(format, a...) +#define dccp_pr_debug(format, a...) do {} while (0) +#define dccp_pr_debug_cat(format, a...) do {} while (0) +#define dccp_debug(format, a...) do {} while (0) #endif extern struct inet_hashinfo dccp_hashinfo; From 0fa32ca438b42fadfb293d72690e117ab3d67489 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin <linyunsheng@huawei.com> Date: Fri, 6 Aug 2021 09:39:07 +0800 Subject: [PATCH 31/87] page_pool: mask the page->signature before the checking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As mentioned in commit c07aea3ef4d4 ("mm: add a signature in struct page"): "The page->signature field is aliased to page->lru.next and page->compound_head." And as the comment in page_is_pfmemalloc(): "lru.next has bit 1 set if the page is allocated from the pfmemalloc reserves. Callers may simply overwrite it if they do not need to preserve that information." The page->signature is OR’ed with PP_SIGNATURE when a page is allocated in page pool, see __page_pool_alloc_pages_slow(), and page->signature is checked directly with PP_SIGNATURE in page_pool_return_skb_page(), which might cause resoure leaking problem for a page from page pool if bit 1 of lru.next is set for a pfmemalloc page. What happens here is that the original pp->signature is OR'ed with PP_SIGNATURE after the allocation in order to preserve any existing bits(such as the bit 1, used to indicate a pfmemalloc page), so when those bits are present, those page is not considered to be from page pool and the DMA mapping of those pages will be left stale. As bit 0 is for page->compound_head, So mask both bit 0/1 before the checking in page_pool_return_skb_page(). And we will return those pfmemalloc pages back to the page allocator after cleaning up the DMA mapping. Fixes: 6a5bcd84e886 ("page_pool: Allow drivers to hint on SKB recycling") Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org> Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/core/page_pool.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 5e4eb45b139c..8ab7b402244c 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -634,7 +634,15 @@ bool page_pool_return_skb_page(struct page *page) struct page_pool *pp; page = compound_head(page); - if (unlikely(page->pp_magic != PP_SIGNATURE)) + + /* page->pp_magic is OR'ed with PP_SIGNATURE after the allocation + * in order to preserve any existing bits, such as bit 0 for the + * head page of compound page and bit 1 for pfmemalloc page, so + * mask those bits for freeing side when doing below checking, + * and page_is_pfmemalloc() is checked in __page_pool_put_page() + * to avoid recycling the pfmemalloc page. + */ + if (unlikely((page->pp_magic & ~0x3UL) != PP_SIGNATURE)) return false; pp = page->pp; From acc68b8d2a1196c4db806947606f162dbeed2274 Mon Sep 17 00:00:00 2001 From: Grygorii Strashko <grygorii.strashko@ti.com> Date: Thu, 5 Aug 2021 17:55:11 +0300 Subject: [PATCH 32/87] net: ethernet: ti: cpsw: fix min eth packet size for non-switch use-cases The CPSW switchdev driver inherited fix from commit 9421c9015047 ("net: ethernet: ti: cpsw: fix min eth packet size") which changes min TX packet size to 64bytes (VLAN_ETH_ZLEN, excluding ETH_FCS). It was done to fix HW packed drop issue when packets are sent from Host to the port with PVID and un-tagging enabled. Unfortunately this breaks some other non-switch specific use-cases, like: - [1] CPSW port as DSA CPU port with DSA-tag applied at the end of the packet - [2] Some industrial protocols, which expects min TX packet size 60Bytes (excluding FCS). Fix it by configuring min TX packet size depending on driver mode - 60Bytes (ETH_ZLEN) for multi mac (dual-mac) mode - 64Bytes (VLAN_ETH_ZLEN) for switch mode and update it during driver mode change and annotate with READ_ONCE()/WRITE_ONCE() as it can be read by napi while writing. [1] https://lore.kernel.org/netdev/20210531124051.GA15218@cephalopod/ [2] https://e2e.ti.com/support/arm/sitara_arm/f/791/t/701669 Cc: stable@vger.kernel.org Fixes: ed3525eda4c4 ("net: ethernet: ti: introduce cpsw switchdev based driver part 1 - dual-emac") Reported-by: Ben Hutchings <ben.hutchings@essensium.com> Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/ethernet/ti/cpsw_new.c | 7 +++++-- drivers/net/ethernet/ti/cpsw_priv.h | 4 +++- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 57d279fdcc9f..d1d02001cef6 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -920,7 +920,7 @@ static netdev_tx_t cpsw_ndo_start_xmit(struct sk_buff *skb, struct cpdma_chan *txch; int ret, q_idx; - if (skb_padto(skb, CPSW_MIN_PACKET_SIZE)) { + if (skb_put_padto(skb, READ_ONCE(priv->tx_packet_min))) { cpsw_err(priv, tx_err, "packet pad failed\n"); ndev->stats.tx_dropped++; return NET_XMIT_DROP; @@ -1100,7 +1100,7 @@ static int cpsw_ndo_xdp_xmit(struct net_device *ndev, int n, for (i = 0; i < n; i++) { xdpf = frames[i]; - if (xdpf->len < CPSW_MIN_PACKET_SIZE) + if (xdpf->len < READ_ONCE(priv->tx_packet_min)) break; if (cpsw_xdp_tx_frame(priv, xdpf, NULL, priv->emac_port)) @@ -1389,6 +1389,7 @@ static int cpsw_create_ports(struct cpsw_common *cpsw) priv->dev = dev; priv->msg_enable = netif_msg_init(debug_level, CPSW_DEBUG); priv->emac_port = i + 1; + priv->tx_packet_min = CPSW_MIN_PACKET_SIZE; if (is_valid_ether_addr(slave_data->mac_addr)) { ether_addr_copy(priv->mac_addr, slave_data->mac_addr); @@ -1686,6 +1687,7 @@ static int cpsw_dl_switch_mode_set(struct devlink *dl, u32 id, priv = netdev_priv(sl_ndev); slave->port_vlan = vlan; + WRITE_ONCE(priv->tx_packet_min, CPSW_MIN_PACKET_SIZE_VLAN); if (netif_running(sl_ndev)) cpsw_port_add_switch_def_ale_entries(priv, slave); @@ -1714,6 +1716,7 @@ static int cpsw_dl_switch_mode_set(struct devlink *dl, u32 id, priv = netdev_priv(slave->ndev); slave->port_vlan = slave->data->dual_emac_res_vlan; + WRITE_ONCE(priv->tx_packet_min, CPSW_MIN_PACKET_SIZE); cpsw_port_add_dual_emac_def_ale_entries(priv, slave); } diff --git a/drivers/net/ethernet/ti/cpsw_priv.h b/drivers/net/ethernet/ti/cpsw_priv.h index a323bea54faa..2951fb7b9dae 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.h +++ b/drivers/net/ethernet/ti/cpsw_priv.h @@ -89,7 +89,8 @@ do { \ #define CPSW_POLL_WEIGHT 64 #define CPSW_RX_VLAN_ENCAP_HDR_SIZE 4 -#define CPSW_MIN_PACKET_SIZE (VLAN_ETH_ZLEN) +#define CPSW_MIN_PACKET_SIZE_VLAN (VLAN_ETH_ZLEN) +#define CPSW_MIN_PACKET_SIZE (ETH_ZLEN) #define CPSW_MAX_PACKET_SIZE (VLAN_ETH_FRAME_LEN +\ ETH_FCS_LEN +\ CPSW_RX_VLAN_ENCAP_HDR_SIZE) @@ -380,6 +381,7 @@ struct cpsw_priv { u32 emac_port; struct cpsw_common *cpsw; int offload_fwd_mark; + u32 tx_packet_min; }; #define ndev_to_cpsw(ndev) (((struct cpsw_priv *)netdev_priv(ndev))->cpsw) From 8f3d65c166797746455553f4eaf74a5f89f996d4 Mon Sep 17 00:00:00 2001 From: Karsten Graul <kgraul@linux.ibm.com> Date: Mon, 9 Aug 2021 11:05:56 +0200 Subject: [PATCH 33/87] net/smc: fix wait on already cleared link There can be a race between the waiters for a tx work request buffer and the link down processing that finally clears the link. Although all waiters are woken up before the link is cleared there might be waiters which did not yet get back control and are still waiting. This results in an access to a cleared wait queue head. Fix this by introducing atomic reference counting around the wait calls, and wait with the link clear processing until all waiters have finished. Move the work request layer related calls into smc_wr.c and set the link state to INACTIVE before calling smcr_link_clear() in smc_llc_srv_add_link(). Fixes: 15e1b99aadfb ("net/smc: no WR buffer wait for terminating link group") Signed-off-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: Guvenc Gulce <guvenc@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/smc/smc_core.h | 2 ++ net/smc/smc_llc.c | 10 ++++------ net/smc/smc_tx.c | 18 +++++++++++++++++- net/smc/smc_wr.c | 10 ++++++++++ 4 files changed, 33 insertions(+), 7 deletions(-) diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 6d6fd1397c87..64d86298e4df 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -97,6 +97,7 @@ struct smc_link { unsigned long *wr_tx_mask; /* bit mask of used indexes */ u32 wr_tx_cnt; /* number of WR send buffers */ wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */ + atomic_t wr_tx_refcnt; /* tx refs to link */ struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */ @@ -109,6 +110,7 @@ struct smc_link { struct ib_reg_wr wr_reg; /* WR register memory region */ wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ + atomic_t wr_reg_refcnt; /* reg refs to link */ enum smc_wr_reg_state wr_reg_state; /* state of wr_reg request */ u8 gid[SMC_GID_SIZE];/* gid matching used vlan id*/ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 273eaf1bfe49..2e7560eba981 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -888,6 +888,7 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry) if (!rc) goto out; out_clear_lnk: + lnk_new->state = SMC_LNK_INACTIVE; smcr_link_clear(lnk_new, false); out_reject: smc_llc_cli_add_link_reject(qentry); @@ -1184,6 +1185,7 @@ int smc_llc_srv_add_link(struct smc_link *link) goto out_err; return 0; out_err: + link_new->state = SMC_LNK_INACTIVE; smcr_link_clear(link_new, false); return rc; } @@ -1286,10 +1288,8 @@ static void smc_llc_process_cli_delete_link(struct smc_link_group *lgr) del_llc->reason = 0; smc_llc_send_message(lnk, &qentry->msg); /* response */ - if (smc_link_downing(&lnk_del->state)) { - if (smc_switch_conns(lgr, lnk_del, false)) - smc_wr_tx_wait_no_pending_sends(lnk_del); - } + if (smc_link_downing(&lnk_del->state)) + smc_switch_conns(lgr, lnk_del, false); smcr_link_clear(lnk_del, true); active_links = smc_llc_active_link_count(lgr); @@ -1805,8 +1805,6 @@ void smc_llc_link_clear(struct smc_link *link, bool log) link->smcibdev->ibdev->name, link->ibport); complete(&link->llc_testlink_resp); cancel_delayed_work_sync(&link->llc_testlink_wrk); - smc_wr_wakeup_reg_wait(link); - smc_wr_wakeup_tx_wait(link); } /* register a new rtoken at the remote peer (for all links) */ diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 289025cd545a..c79361dfcdfb 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -496,7 +496,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn, /* Wakeup sndbuf consumers from any context (IRQ or process) * since there is more data to transmit; usable snd_wnd as max transmit */ -static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) +static int _smcr_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; struct smc_link *link = conn->lnk; @@ -550,6 +550,22 @@ out_unlock: return rc; } +static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn) +{ + struct smc_link *link = conn->lnk; + int rc = -ENOLINK; + + if (!link) + return rc; + + atomic_inc(&link->wr_tx_refcnt); + if (smc_link_usable(link)) + rc = _smcr_tx_sndbuf_nonempty(conn); + if (atomic_dec_and_test(&link->wr_tx_refcnt)) + wake_up_all(&link->wr_tx_wait); + return rc; +} + static int smcd_tx_sndbuf_nonempty(struct smc_connection *conn) { struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index cbc73a7e4d59..a419e9af36b9 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -322,9 +322,12 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) if (rc) return rc; + atomic_inc(&link->wr_reg_refcnt); rc = wait_event_interruptible_timeout(link->wr_reg_wait, (link->wr_reg_state != POSTED), SMC_WR_REG_MR_WAIT_TIME); + if (atomic_dec_and_test(&link->wr_reg_refcnt)) + wake_up_all(&link->wr_reg_wait); if (!rc) { /* timeout - terminate link */ smcr_link_down_cond_sched(link); @@ -566,10 +569,15 @@ void smc_wr_free_link(struct smc_link *lnk) return; ibdev = lnk->smcibdev->ibdev; + smc_wr_wakeup_reg_wait(lnk); + smc_wr_wakeup_tx_wait(lnk); + if (smc_wr_tx_wait_no_pending_sends(lnk)) memset(lnk->wr_tx_mask, 0, BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask)); + wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt))); + wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt))); if (lnk->wr_rx_dma_addr) { ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr, @@ -728,7 +736,9 @@ int smc_wr_create_link(struct smc_link *lnk) memset(lnk->wr_tx_mask, 0, BITS_TO_LONGS(SMC_WR_BUF_CNT) * sizeof(*lnk->wr_tx_mask)); init_waitqueue_head(&lnk->wr_tx_wait); + atomic_set(&lnk->wr_tx_refcnt, 0); init_waitqueue_head(&lnk->wr_reg_wait); + atomic_set(&lnk->wr_reg_refcnt, 0); return rc; dma_unmap: From 64513d269e8971aabb7e787955a1b320e3031306 Mon Sep 17 00:00:00 2001 From: Guvenc Gulce <guvenc@linux.ibm.com> Date: Mon, 9 Aug 2021 11:05:57 +0200 Subject: [PATCH 34/87] net/smc: Correct smc link connection counter in case of smc client SMC clients may be assigned to a different link after the initial connection between two peers was established. In such a case, the connection counter was not correctly set. Update the connection counter correctly when a smc client connection is assigned to a different smc link. Fixes: 07d51580ff65 ("net/smc: Add connection counters for links") Signed-off-by: Guvenc Gulce <guvenc@linux.ibm.com> Tested-by: Karsten Graul <kgraul@linux.ibm.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/smc/af_smc.c | 2 +- net/smc/smc_core.c | 4 ++-- net/smc/smc_core.h | 2 ++ 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 898389611ae8..c038efc23ce3 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -795,7 +795,7 @@ static int smc_connect_rdma(struct smc_sock *smc, reason_code = SMC_CLC_DECL_NOSRVLINK; goto connect_abort; } - smc->conn.lnk = link; + smc_switch_link_and_count(&smc->conn, link); } /* create send buffer and rmb */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index cd0d7c908b2a..c160ff50c053 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -917,8 +917,8 @@ static int smc_switch_cursor(struct smc_sock *smc, struct smc_cdc_tx_pend *pend, return rc; } -static void smc_switch_link_and_count(struct smc_connection *conn, - struct smc_link *to_lnk) +void smc_switch_link_and_count(struct smc_connection *conn, + struct smc_link *to_lnk) { atomic_dec(&conn->lnk->conn_cnt); conn->lnk = to_lnk; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 64d86298e4df..c043ecdca5c4 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -446,6 +446,8 @@ void smc_core_exit(void); int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, u8 link_idx, struct smc_init_info *ini); void smcr_link_clear(struct smc_link *lnk, bool log); +void smc_switch_link_and_count(struct smc_connection *conn, + struct smc_link *to_lnk); int smcr_buf_map_lgr(struct smc_link *lnk); int smcr_buf_reg_lgr(struct smc_link *lnk); void smcr_lgr_set_type(struct smc_link_group *lgr, enum smc_lgr_type new_type); From d09c548dbf3b31cb07bba562e0f452edfa01efe3 Mon Sep 17 00:00:00 2001 From: Hangbin Liu <liuhangbin@gmail.com> Date: Mon, 9 Aug 2021 15:04:55 +0800 Subject: [PATCH 35/87] net: sched: act_mirred: Reset ct info when mirror/redirect skb When mirror/redirect a skb to a different port, the ct info should be reset for reclassification. Or the pkts will match unexpected rules. For example, with following topology and commands: ----------- | veth0 -+------- | veth1 -+------- | ------------ tc qdisc add dev veth0 clsact # The same with "action mirred egress mirror dev veth1" or "action mirred ingress redirect dev veth1" tc filter add dev veth0 egress chain 1 protocol ip flower ct_state +trk action mirred ingress mirror dev veth1 tc filter add dev veth0 egress chain 0 protocol ip flower ct_state -inv action ct commit action goto chain 1 tc qdisc add dev veth1 clsact tc filter add dev veth1 ingress chain 0 protocol ip flower ct_state +trk action drop ping <remove ip via veth0> & tc -s filter show dev veth1 ingress With command 'tc -s filter show', we can find the pkts were dropped on veth1. Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct") Signed-off-by: Roi Dayan <roid@nvidia.com> Signed-off-by: Hangbin Liu <liuhangbin@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/sched/act_mirred.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index 7153c67f641e..2ef4cd2c848b 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -273,6 +273,9 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, goto out; } + /* All mirred/redirected skbs should clear previous ct info */ + nf_reset_ct(skb2); + want_ingress = tcf_mirred_act_wants_ingress(m_eaction); expects_nh = want_ingress || !m_mac_header_xmit; From 50ac7479846053ca8054be833c1594e64de496bb Mon Sep 17 00:00:00 2001 From: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com> Date: Wed, 28 Jul 2021 12:39:10 -0700 Subject: [PATCH 36/87] ice: Prevent probing virtual functions The userspace utility "driverctl" can be used to change/override the system's default driver choices. This is useful in some situations (buggy driver, old driver missing a device ID, trying a workaround, etc.) where the user needs to load a different driver. However, this is also prone to user error, where a driver is mapped to a device it's not designed to drive. For example, if the ice driver is mapped to driver iavf devices, the ice driver crashes. Add a check to return an error if the ice driver is being used to probe a virtual function. Fixes: 837f08fdecbe ("ice: Add basic driver framework for Intel(R) E800 Series") Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com> Tested-by: Gurucharan G <gurucharanx.g@intel.com> Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com> --- drivers/net/ethernet/intel/ice/ice_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index ef8d1815af56..a9506041eb42 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -4194,6 +4194,11 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent) struct ice_hw *hw; int i, err; + if (pdev->is_virtfn) { + dev_err(dev, "can't probe a virtual function\n"); + return -EINVAL; + } + /* this driver uses devres, see * Documentation/driver-api/driver-model/devres.rst */ From c503e63200c679e362afca7aca9d3dc63a0f45ed Mon Sep 17 00:00:00 2001 From: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com> Date: Wed, 4 Aug 2021 12:12:42 -0700 Subject: [PATCH 37/87] ice: Stop processing VF messages during teardown When VFs are setup and torn down in quick succession, it is possible that a VF is torn down by the PF while the VF's virtchnl requests are still in the PF's mailbox ring. Processing the VF's virtchnl request when the VF itself doesn't exist results in undefined behavior. Fix this by adding a check to stop processing virtchnl requests when VF teardown is in progress. Fixes: ddf30f7ff840 ("ice: Add handler to configure SR-IOV") Signed-off-by: Anirudh Venkataramanan <anirudh.venkataramanan@intel.com> Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com> --- drivers/net/ethernet/intel/ice/ice.h | 1 + drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/drivers/net/ethernet/intel/ice/ice.h b/drivers/net/ethernet/intel/ice/ice.h index a450343fbb92..eadcb9958346 100644 --- a/drivers/net/ethernet/intel/ice/ice.h +++ b/drivers/net/ethernet/intel/ice/ice.h @@ -234,6 +234,7 @@ enum ice_pf_state { ICE_VFLR_EVENT_PENDING, ICE_FLTR_OVERFLOW_PROMISC, ICE_VF_DIS, + ICE_VF_DEINIT_IN_PROGRESS, ICE_CFG_BUSY, ICE_SERVICE_SCHED, ICE_SERVICE_DIS, diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c index 2826570dab51..e93430ab37f1 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c @@ -615,6 +615,8 @@ void ice_free_vfs(struct ice_pf *pf) struct ice_hw *hw = &pf->hw; unsigned int tmp, i; + set_bit(ICE_VF_DEINIT_IN_PROGRESS, pf->state); + if (!pf->vf) return; @@ -680,6 +682,7 @@ void ice_free_vfs(struct ice_pf *pf) i); clear_bit(ICE_VF_DIS, pf->state); + clear_bit(ICE_VF_DEINIT_IN_PROGRESS, pf->state); clear_bit(ICE_FLAG_SRIOV_ENA, pf->flags); } @@ -4415,6 +4418,10 @@ void ice_vc_process_vf_msg(struct ice_pf *pf, struct ice_rq_event_info *event) struct device *dev; int err = 0; + /* if de-init is underway, don't process messages from VF */ + if (test_bit(ICE_VF_DEINIT_IN_PROGRESS, pf->state)) + return; + dev = ice_pf_to_dev(pf); if (ice_validate_vf_id(pf, vf_id)) { err = -EINVAL; From 3ba7f53f8bf1fb862e36c7f74434ac3aceb60158 Mon Sep 17 00:00:00 2001 From: Brett Creeley <brett.creeley@intel.com> Date: Fri, 6 Aug 2021 09:51:27 -0700 Subject: [PATCH 38/87] ice: don't remove netdev->dev_addr from uc sync list In some circumstances, such as with bridging, it's possible that the stack will add the device's own MAC address to its unicast address list. If, later, the stack deletes this address, the driver will receive a request to remove this address. The driver stores its current MAC address as part of the VSI MAC filter list instead of separately. So, this causes a problem when the device's MAC address is deleted unexpectedly, which results in traffic failure in some cases. The following configuration steps will reproduce the previously mentioned problem: > ip link set eth0 up > ip link add dev br0 type bridge > ip link set br0 up > ip addr flush dev eth0 > ip link set eth0 master br0 > echo 1 > /sys/class/net/br0/bridge/vlan_filtering > modprobe -r veth > modprobe -r bridge > ip addr add 192.168.1.100/24 dev eth0 The following ping command fails due to the netdev->dev_addr being deleted when removing the bridge module. > ping <link partner> Fix this by making sure to not delete the netdev->dev_addr during MAC address sync. After fixing this issue it was noticed that the netdev_warn() in .set_mac was overly verbose, so make it at netdev_dbg(). Also, there is a possibility of a race condition between .set_mac and .set_rx_mode. Fix this by calling netif_addr_lock_bh() and netif_addr_unlock_bh() on the device's netdev when the netdev->dev_addr is going to be updated in .set_mac. Fixes: e94d44786693 ("ice: Implement filter sync, NDO operations and bump version") Signed-off-by: Brett Creeley <brett.creeley@intel.com> Tested-by: Liang Li <liali@redhat.com> Tested-by: Gurucharan G <gurucharanx.g@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com> --- drivers/net/ethernet/intel/ice/ice_main.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index a9506041eb42..fe2ded775f25 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -191,6 +191,14 @@ static int ice_add_mac_to_unsync_list(struct net_device *netdev, const u8 *addr) struct ice_netdev_priv *np = netdev_priv(netdev); struct ice_vsi *vsi = np->vsi; + /* Under some circumstances, we might receive a request to delete our + * own device address from our uc list. Because we store the device + * address in the VSI's MAC filter list, we need to ignore such + * requests and not delete our device address from this list. + */ + if (ether_addr_equal(addr, netdev->dev_addr)) + return 0; + if (ice_fltr_add_mac_to_list(vsi, &vsi->tmp_unsync_list, addr, ICE_FWD_TO_VSI)) return -EINVAL; @@ -5124,7 +5132,7 @@ static int ice_set_mac_address(struct net_device *netdev, void *pi) return -EADDRNOTAVAIL; if (ether_addr_equal(netdev->dev_addr, mac)) { - netdev_warn(netdev, "already using mac %pM\n", mac); + netdev_dbg(netdev, "already using mac %pM\n", mac); return 0; } @@ -5135,6 +5143,7 @@ static int ice_set_mac_address(struct net_device *netdev, void *pi) return -EBUSY; } + netif_addr_lock_bh(netdev); /* Clean up old MAC filter. Not an error if old filter doesn't exist */ status = ice_fltr_remove_mac(vsi, netdev->dev_addr, ICE_FWD_TO_VSI); if (status && status != ICE_ERR_DOES_NOT_EXIST) { @@ -5144,30 +5153,28 @@ static int ice_set_mac_address(struct net_device *netdev, void *pi) /* Add filter for new MAC. If filter exists, return success */ status = ice_fltr_add_mac(vsi, mac, ICE_FWD_TO_VSI); - if (status == ICE_ERR_ALREADY_EXISTS) { + if (status == ICE_ERR_ALREADY_EXISTS) /* Although this MAC filter is already present in hardware it's * possible in some cases (e.g. bonding) that dev_addr was * modified outside of the driver and needs to be restored back * to this value. */ - memcpy(netdev->dev_addr, mac, netdev->addr_len); netdev_dbg(netdev, "filter for MAC %pM already exists\n", mac); - return 0; - } - - /* error if the new filter addition failed */ - if (status) + else if (status) + /* error if the new filter addition failed */ err = -EADDRNOTAVAIL; err_update_filters: if (err) { netdev_err(netdev, "can't set MAC %pM. filter update failed\n", mac); + netif_addr_unlock_bh(netdev); return err; } /* change the netdev's MAC address */ memcpy(netdev->dev_addr, mac, netdev->addr_len); + netif_addr_unlock_bh(netdev); netdev_dbg(vsi->netdev, "updated MAC address to %pM\n", netdev->dev_addr); From a7550f8b1c9712894f9e98d6caf5f49451ebd058 Mon Sep 17 00:00:00 2001 From: Md Fahad Iqbal Polash <md.fahad.iqbal.polash@intel.com> Date: Fri, 4 Jun 2021 09:53:33 -0700 Subject: [PATCH 39/87] iavf: Set RSS LUT and key in reset handle path iavf driver should set RSS LUT and key unconditionally in reset path. Currently, the driver does not do that. This patch fixes this issue. Fixes: 2c86ac3c7079 ("i40evf: create a generic config RSS function") Signed-off-by: Md Fahad Iqbal Polash <md.fahad.iqbal.polash@intel.com> Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com> --- drivers/net/ethernet/intel/iavf/iavf_main.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 44bafedd09f2..244ec74ceca7 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1506,11 +1506,6 @@ static int iavf_reinit_interrupt_scheme(struct iavf_adapter *adapter) set_bit(__IAVF_VSI_DOWN, adapter->vsi.state); iavf_map_rings_to_vectors(adapter); - - if (RSS_AQ(adapter)) - adapter->aq_required |= IAVF_FLAG_AQ_CONFIGURE_RSS; - else - err = iavf_init_rss(adapter); err: return err; } @@ -2200,6 +2195,14 @@ continue_reset: goto reset_err; } + if (RSS_AQ(adapter)) { + adapter->aq_required |= IAVF_FLAG_AQ_CONFIGURE_RSS; + } else { + err = iavf_init_rss(adapter); + if (err) + goto reset_err; + } + adapter->aq_required |= IAVF_FLAG_AQ_GET_CONFIG; adapter->aq_required |= IAVF_FLAG_AQ_MAP_VECTORS; From 71330842ff93ae67a066c1fa68d75672527312fa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann <daniel@iogearbox.net> Date: Mon, 9 Aug 2021 21:45:32 +0200 Subject: [PATCH 40/87] bpf: Add _kernel suffix to internal lockdown_bpf_read Rename LOCKDOWN_BPF_READ into LOCKDOWN_BPF_READ_KERNEL so we have naming more consistent with a LOCKDOWN_BPF_WRITE_USER option that we are adding. Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Andrii Nakryiko <andrii@kernel.org> --- include/linux/security.h | 2 +- kernel/bpf/helpers.c | 4 ++-- kernel/trace/bpf_trace.c | 8 ++++---- security/security.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/security.h b/include/linux/security.h index 24eda04221e9..724d7a4a0c91 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -123,7 +123,7 @@ enum lockdown_reason { LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, LOCKDOWN_KPROBES, - LOCKDOWN_BPF_READ, + LOCKDOWN_BPF_READ_KERNEL, LOCKDOWN_PERF, LOCKDOWN_TRACEFS, LOCKDOWN_XMON_RW, diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 62cf00383910..0b04553e8c44 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -1070,12 +1070,12 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: - return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_proto; case BPF_FUNC_probe_read_user_str: return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: - return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_str_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b4916ef388ad..1836591197a5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -999,19 +999,19 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: - return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_proto; case BPF_FUNC_probe_read_user_str: return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: - return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_kernel_str_proto; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE case BPF_FUNC_probe_read: - return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_compat_proto; case BPF_FUNC_probe_read_str: - return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + return security_locked_down(LOCKDOWN_BPF_READ_KERNEL) < 0 ? NULL : &bpf_probe_read_compat_str_proto; #endif #ifdef CONFIG_CGROUPS diff --git a/security/security.c b/security/security.c index 09533cbb7221..6b83ab4e9d66 100644 --- a/security/security.c +++ b/security/security.c @@ -61,7 +61,7 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", - [LOCKDOWN_BPF_READ] = "use of bpf to read kernel RAM", + [LOCKDOWN_BPF_READ_KERNEL] = "use of bpf to read kernel RAM", [LOCKDOWN_PERF] = "unsafe use of perf", [LOCKDOWN_TRACEFS] = "use of tracefs", [LOCKDOWN_XMON_RW] = "xmon read and write access", From beb7f2de5728b0bd2140a652fa51f6ad85d159f7 Mon Sep 17 00:00:00 2001 From: Roi Dayan <roid@nvidia.com> Date: Sun, 8 Aug 2021 09:52:42 +0300 Subject: [PATCH 41/87] psample: Add a fwd declaration for skbuff Without this there is a warning if source files include psample.h before skbuff.h or doesn't include it at all. Fixes: 6ae0a6286171 ("net: Introduce psample, a new genetlink channel for packet sampling") Signed-off-by: Roi Dayan <roid@nvidia.com> Link: https://lore.kernel.org/r/20210808065242.1522535-1-roid@nvidia.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- include/net/psample.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/net/psample.h b/include/net/psample.h index e328c5127757..0509d2d6be67 100644 --- a/include/net/psample.h +++ b/include/net/psample.h @@ -31,6 +31,8 @@ struct psample_group *psample_group_get(struct net *net, u32 group_num); void psample_group_take(struct psample_group *group); void psample_group_put(struct psample_group *group); +struct sk_buff; + #if IS_ENABLED(CONFIG_PSAMPLE) void psample_sample_packet(struct psample_group *group, struct sk_buff *skb, From d6e712aa7e6a3d5a9633f4bcbe2237f3edc292bd Mon Sep 17 00:00:00 2001 From: Randy Dunlap <rdunlap@infradead.org> Date: Sun, 8 Aug 2021 12:08:34 -0700 Subject: [PATCH 42/87] net: openvswitch: fix kernel-doc warnings in flow.c Repair kernel-doc notation in a few places to make it conform to the expected format. Fixes the following kernel-doc warnings: flow.c:296: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Parse vlan tag from vlan header. flow.c:296: warning: missing initial short description on line: * Parse vlan tag from vlan header. flow.c:537: warning: No description found for return value of 'key_extract_l3l4' flow.c:769: warning: No description found for return value of 'key_extract' Signed-off-by: Randy Dunlap <rdunlap@infradead.org> Cc: Pravin B Shelar <pshelar@ovn.org> Cc: dev@openvswitch.org Link: https://lore.kernel.org/r/20210808190834.23362-1-rdunlap@infradead.org Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/openvswitch/flow.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index e586424d8b04..9713035b89e3 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -293,14 +293,14 @@ static bool icmp6hdr_ok(struct sk_buff *skb) } /** - * Parse vlan tag from vlan header. + * parse_vlan_tag - Parse vlan tag from vlan header. * @skb: skb containing frame to parse * @key_vh: pointer to parsed vlan tag * @untag_vlan: should the vlan header be removed from the frame * - * Returns ERROR on memory error. - * Returns 0 if it encounters a non-vlan or incomplete packet. - * Returns 1 after successfully parsing vlan tag. + * Return: ERROR on memory error. + * %0 if it encounters a non-vlan or incomplete packet. + * %1 after successfully parsing vlan tag. */ static int parse_vlan_tag(struct sk_buff *skb, struct vlan_head *key_vh, bool untag_vlan) @@ -532,6 +532,7 @@ static int parse_nsh(struct sk_buff *skb, struct sw_flow_key *key) * L3 header * @key: output flow key * + * Return: %0 if successful, otherwise a negative errno value. */ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key) { @@ -748,8 +749,6 @@ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key) * * The caller must ensure that skb->len >= ETH_HLEN. * - * Returns 0 if successful, otherwise a negative errno value. - * * Initializes @skb header fields as follows: * * - skb->mac_header: the L2 header. @@ -764,6 +763,8 @@ static int key_extract_l3l4(struct sk_buff *skb, struct sw_flow_key *key) * * - skb->protocol: the type of the data starting at skb->network_header. * Equals to key->eth.type. + * + * Return: %0 if successful, otherwise a negative errno value. */ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) { From 143a8526ab5fd4f8a0c4fe2a9cb28c181dc5a95f Mon Sep 17 00:00:00 2001 From: Guillaume Nault <gnault@redhat.com> Date: Fri, 6 Aug 2021 17:52:06 +0200 Subject: [PATCH 43/87] bareudp: Fix invalid read beyond skb's linear data Data beyond the UDP header might not be part of the skb's linear data. Use skb_copy_bits() instead of direct access to skb->data+X, so that we read the correct bytes even on a fragmented skb. Fixes: 4b5f67232d95 ("net: Special handling for IP & MPLS.") Signed-off-by: Guillaume Nault <gnault@redhat.com> Link: https://lore.kernel.org/r/7741c46545c6ef02e70c80a9b32814b22d9616b3.1628264975.git.gnault@redhat.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- drivers/net/bareudp.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index a7ee0af1af90..54e321a695ce 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -71,12 +71,18 @@ static int bareudp_udp_encap_recv(struct sock *sk, struct sk_buff *skb) family = AF_INET6; if (bareudp->ethertype == htons(ETH_P_IP)) { - struct iphdr *iphdr; + __u8 ipversion; - iphdr = (struct iphdr *)(skb->data + BAREUDP_BASE_HLEN); - if (iphdr->version == 4) { - proto = bareudp->ethertype; - } else if (bareudp->multi_proto_mode && (iphdr->version == 6)) { + if (skb_copy_bits(skb, BAREUDP_BASE_HLEN, &ipversion, + sizeof(ipversion))) { + bareudp->dev->stats.rx_dropped++; + goto drop; + } + ipversion >>= 4; + + if (ipversion == 4) { + proto = htons(ETH_P_IP); + } else if (ipversion == 6 && bareudp->multi_proto_mode) { proto = htons(ETH_P_IPV6); } else { bareudp->dev->stats.rx_dropped++; From c633e799641cf13960bd83189b4d5b1b2adb0d4e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky <leonro@nvidia.com> Date: Mon, 2 Aug 2021 16:39:54 +0300 Subject: [PATCH 44/87] net/mlx5: Don't skip subfunction cleanup in case of error in module init Clean SF resources if mlx5 eth failed to initialize. Fixes: 1958fc2f0712 ("net/mlx5: SF, Add auxiliary device driver") Signed-off-by: Leon Romanovsky <leonro@nvidia.com> Reviewed-by: Parav Pandit <parav@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com> --- drivers/net/ethernet/mellanox/mlx5/core/main.c | 12 ++++-------- drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h | 5 +++++ 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index eb1b316560a8..c84ad87c99bb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1784,16 +1784,14 @@ static int __init init(void) if (err) goto err_sf; -#ifdef CONFIG_MLX5_CORE_EN err = mlx5e_init(); - if (err) { - pci_unregister_driver(&mlx5_core_driver); - goto err_debug; - } -#endif + if (err) + goto err_en; return 0; +err_en: + mlx5_sf_driver_unregister(); err_sf: pci_unregister_driver(&mlx5_core_driver); err_debug: @@ -1803,9 +1801,7 @@ err_debug: static void __exit cleanup(void) { -#ifdef CONFIG_MLX5_CORE_EN mlx5e_cleanup(); -#endif mlx5_sf_driver_unregister(); pci_unregister_driver(&mlx5_core_driver); mlx5_unregister_debugfs(); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h index 343807ac2036..da365b8f0141 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h @@ -206,8 +206,13 @@ int mlx5_firmware_flash(struct mlx5_core_dev *dev, const struct firmware *fw, int mlx5_fw_version_query(struct mlx5_core_dev *dev, u32 *running_ver, u32 *stored_ver); +#ifdef CONFIG_MLX5_CORE_EN int mlx5e_init(void); void mlx5e_cleanup(void); +#else +static inline int mlx5e_init(void){ return 0; } +static inline void mlx5e_cleanup(void){} +#endif static inline bool mlx5_sriov_is_enabled(struct mlx5_core_dev *dev) { From d3875924dae632d5edd908d285fffc5f07c835a3 Mon Sep 17 00:00:00 2001 From: Alex Vesker <valex@nvidia.com> Date: Tue, 22 Jun 2021 16:51:58 +0300 Subject: [PATCH 45/87] net/mlx5: DR, Add fail on error check on decap While processing encapsulated packet on RX, one of the fields that is checked is the inner packet length. If the length as specified in the header doesn't match the actual inner packet length, the packet is invalid and should be dropped. However, such packet caused the NIC to hang. This patch turns on a 'fail_on_error' HW bit which allows HW to drop such an invalid packet while processing RX packet and trying to decap it. Fixes: ad17dc8cf910 ("net/mlx5: DR, Move STEv0 action apply logic") Signed-off-by: Alex Vesker <valex@nvidia.com> Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com> --- drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c index f1950e4968da..e4dd4eed5aee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v0.c @@ -352,6 +352,7 @@ static void dr_ste_v0_set_rx_decap(u8 *hw_ste_p) { MLX5_SET(ste_rx_steering_mult, hw_ste_p, tunneling_action, DR_STE_TUNL_ACTION_DECAP); + MLX5_SET(ste_rx_steering_mult, hw_ste_p, fail_on_error, 1); } static void dr_ste_v0_set_rx_pop_vlan(u8 *hw_ste_p) @@ -365,6 +366,7 @@ static void dr_ste_v0_set_rx_decap_l3(u8 *hw_ste_p, bool vlan) MLX5_SET(ste_rx_steering_mult, hw_ste_p, tunneling_action, DR_STE_TUNL_ACTION_L3_DECAP); MLX5_SET(ste_modify_packet, hw_ste_p, action_description, vlan ? 1 : 0); + MLX5_SET(ste_rx_steering_mult, hw_ste_p, fail_on_error, 1); } static void dr_ste_v0_set_rewrite_actions(u8 *hw_ste_p, u16 num_of_actions, From c623c95afa56bf4bf64e4f58742dc94616ef83db Mon Sep 17 00:00:00 2001 From: Roi Dayan <roid@nvidia.com> Date: Sun, 1 Aug 2021 15:47:46 +0300 Subject: [PATCH 46/87] net/mlx5e: Avoid creating tunnel headers for local route It could be local and remote are on the same machine and the route result will be a local route which will result in creating encap id with src/dst mac address of 0. Fixes: a54e20b4fcae ("net/mlx5e: Add basic TC tunnel set action for SRIOV offloads") Signed-off-by: Roi Dayan <roid@nvidia.com> Reviewed-by: Maor Dickman <maord@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com> --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index 8f79f04eccd6..1e2d117082d4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -124,6 +124,11 @@ static int mlx5e_route_lookup_ipv4_get(struct mlx5e_priv *priv, if (IS_ERR(rt)) return PTR_ERR(rt); + if (rt->rt_type != RTN_UNICAST) { + ret = -ENETUNREACH; + goto err_rt_release; + } + if (mlx5_lag_is_multipath(mdev) && rt->rt_gw_family != AF_INET) { ret = -ENETUNREACH; goto err_rt_release; From 6d8680da2e98410a25fe49e0a53f28c004be6d6d Mon Sep 17 00:00:00 2001 From: Vlad Buslov <vladbu@nvidia.com> Date: Wed, 4 Aug 2021 18:33:26 +0300 Subject: [PATCH 47/87] net/mlx5: Bridge, fix ageing time Ageing time is not converted from clock_t to jiffies which results incorrect ageing timeout calculation in workqueue update task. Fix it by applying clock_t_to_jiffies() to provided value. Fixes: c636a0f0f3f0 ("net/mlx5: Bridge, dynamic entry ageing") Signed-off-by: Vlad Buslov <vladbu@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com> --- drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c index a6e1d4f78268..f3f56f32e435 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c @@ -579,7 +579,7 @@ static struct mlx5_esw_bridge *mlx5_esw_bridge_create(int ifindex, xa_init(&bridge->vports); bridge->ifindex = ifindex; bridge->refcnt = 1; - bridge->ageing_time = BR_DEFAULT_AGEING_TIME; + bridge->ageing_time = clock_t_to_jiffies(BR_DEFAULT_AGEING_TIME); list_add(&bridge->list, &br_offloads->bridges); return bridge; @@ -1006,7 +1006,7 @@ int mlx5_esw_bridge_ageing_time_set(unsigned long ageing_time, struct mlx5_eswit if (!vport->bridge) return -EINVAL; - vport->bridge->ageing_time = ageing_time; + vport->bridge->ageing_time = clock_t_to_jiffies(ageing_time); return 0; } From 8ba3e4c85825c8801a2c298dcadac650a40d7137 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy <maximmi@nvidia.com> Date: Mon, 24 May 2021 16:15:22 +0300 Subject: [PATCH 48/87] net/mlx5e: Destroy page pool after XDP SQ to fix use-after-free mlx5e_close_xdpsq does the cleanup: it calls mlx5e_free_xdpsq_descs to free the outstanding descriptors, which relies on mlx5e_page_release_dynamic and page_pool_release_page. However, page_pool_destroy is already called by this point, because mlx5e_close_rq runs before mlx5e_close_xdpsq. This commit fixes the use-after-free by swapping mlx5e_close_xdpsq and mlx5e_close_rq. The commit cited below started calling page_pool_destroy directly from the driver. Previously, the page pool was destroyed under a call_rcu from xdp_rxq_info_unreg_mem_model, which would defer the deallocation until after the XDPSQ is cleaned up. Fixes: 1da4bbeffe41 ("net: core: page_pool: add user refcnt and reintroduce page_pool_destroy") Signed-off-by: Maxim Mikityanskiy <maximmi@nvidia.com> Reviewed-by: Tariq Toukan <tariqt@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com> --- .../net/ethernet/mellanox/mlx5/core/en_main.c | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 37c440837945..fd250f7bcd88 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1891,30 +1891,30 @@ static int mlx5e_open_queues(struct mlx5e_channel *c, if (err) goto err_close_icosq; + err = mlx5e_open_rxq_rq(c, params, &cparam->rq); + if (err) + goto err_close_sqs; + if (c->xdp) { err = mlx5e_open_xdpsq(c, params, &cparam->xdp_sq, NULL, &c->rq_xdpsq, false); if (err) - goto err_close_sqs; + goto err_close_rq; } - err = mlx5e_open_rxq_rq(c, params, &cparam->rq); - if (err) - goto err_close_xdp_sq; - err = mlx5e_open_xdpsq(c, params, &cparam->xdp_sq, NULL, &c->xdpsq, true); if (err) - goto err_close_rq; + goto err_close_xdp_sq; return 0; -err_close_rq: - mlx5e_close_rq(&c->rq); - err_close_xdp_sq: if (c->xdp) mlx5e_close_xdpsq(&c->rq_xdpsq); +err_close_rq: + mlx5e_close_rq(&c->rq); + err_close_sqs: mlx5e_close_sqs(c); @@ -1949,9 +1949,9 @@ err_close_async_icosq_cq: static void mlx5e_close_queues(struct mlx5e_channel *c) { mlx5e_close_xdpsq(&c->xdpsq); - mlx5e_close_rq(&c->rq); if (c->xdp) mlx5e_close_xdpsq(&c->rq_xdpsq); + mlx5e_close_rq(&c->rq); mlx5e_close_sqs(c); mlx5e_close_icosq(&c->icosq); mlx5e_close_icosq(&c->async_icosq); From c85a6b8feb16c0cdbbc8d9f581c7861c4a9ac351 Mon Sep 17 00:00:00 2001 From: Aya Levin <ayal@nvidia.com> Date: Wed, 28 Jul 2021 18:18:59 +0300 Subject: [PATCH 49/87] net/mlx5: Block switchdev mode while devlink traps are active Since switchdev mode can't support devlink traps, verify there are no active devlink traps before moving eswitch to switchdev mode. If there are active traps, prevent the switchdev mode configuration. Fixes: eb3862a0525d ("net/mlx5e: Enable traps according to link state") Signed-off-by: Aya Levin <ayal@nvidia.com> Reviewed-by: Moshe Shemesh <moshe@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com> --- .../ethernet/mellanox/mlx5/core/eswitch_offloads.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index 011e766e4f67..3bb71a186004 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -48,6 +48,7 @@ #include "lib/fs_chains.h" #include "en_tc.h" #include "en/mapping.h" +#include "devlink.h" #define mlx5_esw_for_each_rep(esw, i, rep) \ xa_for_each(&((esw)->offloads.vport_reps), i, rep) @@ -3001,12 +3002,19 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, if (cur_mlx5_mode == mlx5_mode) goto unlock; - if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV) + if (mode == DEVLINK_ESWITCH_MODE_SWITCHDEV) { + if (mlx5_devlink_trap_get_num_active(esw->dev)) { + NL_SET_ERR_MSG_MOD(extack, + "Can't change mode while devlink traps are active"); + err = -EOPNOTSUPP; + goto unlock; + } err = esw_offloads_start(esw, extack); - else if (mode == DEVLINK_ESWITCH_MODE_LEGACY) + } else if (mode == DEVLINK_ESWITCH_MODE_LEGACY) { err = esw_offloads_stop(esw, extack); - else + } else { err = -EINVAL; + } unlock: mlx5_esw_unlock(esw); From 3c8946e0e2841aa7cbdabf6acaac6559fa8d1a49 Mon Sep 17 00:00:00 2001 From: Shay Drory <shayd@nvidia.com> Date: Wed, 16 Jun 2021 18:54:05 +0300 Subject: [PATCH 50/87] net/mlx5: Fix order of functions in mlx5_irq_detach_nb() Change order of functions in mlx5_irq_detach_nb() so it will be a mirror of mlx5_irq_attach_nb. Fixes: 71e084e26414 ("net/mlx5: Allocating a pool of MSI-X vectors for SFs") Signed-off-by: Shay Drory <shayd@nvidia.com> Reviewed-by: Parav Pandit <parav@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com> --- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index b25f764daa08..95e60da33f03 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -251,8 +251,11 @@ int mlx5_irq_attach_nb(struct mlx5_irq *irq, struct notifier_block *nb) int mlx5_irq_detach_nb(struct mlx5_irq *irq, struct notifier_block *nb) { + int err = 0; + + err = atomic_notifier_chain_unregister(&irq->nh, nb); irq_put(irq); - return atomic_notifier_chain_unregister(&irq->nh, nb); + return err; } struct cpumask *mlx5_irq_get_affinity_mask(struct mlx5_irq *irq) From 5957cc557dc5d52c3448be15c2474f33224b89b6 Mon Sep 17 00:00:00 2001 From: Shay Drory <shayd@nvidia.com> Date: Wed, 16 Jun 2021 18:59:05 +0300 Subject: [PATCH 51/87] net/mlx5: Set all field of mlx5_irq before inserting it to the xarray Currently irq->pool is set after the irq is insert to the xarray. Set irq->pool before the irq is inserted to the xarray. Fixes: 71e084e26414 ("net/mlx5: Allocating a pool of MSI-X vectors for SFs") Signed-off-by: Shay Drory <shayd@nvidia.com> Reviewed-by: Parav Pandit <parav@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com> --- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 95e60da33f03..7b923f6b5462 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -214,6 +214,7 @@ static struct mlx5_irq *irq_request(struct mlx5_irq_pool *pool, int i) err = -ENOMEM; goto err_cpumask; } + irq->pool = pool; kref_init(&irq->kref); irq->index = i; err = xa_err(xa_store(&pool->irqs, irq->index, irq, GFP_KERNEL)); @@ -222,7 +223,6 @@ static struct mlx5_irq *irq_request(struct mlx5_irq_pool *pool, int i) irq->index, err); goto err_xa; } - irq->pool = pool; return irq; err_xa: free_cpumask_var(irq->mask); From ba317e832d457bc8fcecf6a6ed289732544b87e9 Mon Sep 17 00:00:00 2001 From: Shay Drory <shayd@nvidia.com> Date: Thu, 17 Jun 2021 01:51:00 +0300 Subject: [PATCH 52/87] net/mlx5: Destroy pool->mutex Destroy pool->mutex when we destroy the pool. Fixes: c36326d38d93 ("net/mlx5: Round-Robin EQs over IRQs") Signed-off-by: Shay Drory <shayd@nvidia.com> Reviewed-by: Parav Pandit <parav@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com> --- drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c index 7b923f6b5462..3465b363fc2f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/pci_irq.c @@ -440,6 +440,7 @@ irq_pool_alloc(struct mlx5_core_dev *dev, int start, int size, char *name, if (!pool) return ERR_PTR(-ENOMEM); pool->dev = dev; + mutex_init(&pool->lock); xa_init_flags(&pool->irqs, XA_FLAGS_ALLOC); pool->xa_num_irqs.min = start; pool->xa_num_irqs.max = start + size - 1; @@ -448,7 +449,6 @@ irq_pool_alloc(struct mlx5_core_dev *dev, int start, int size, char *name, name); pool->min_threshold = min_threshold * MLX5_EQ_REFS_PER_IRQ; pool->max_threshold = max_threshold * MLX5_EQ_REFS_PER_IRQ; - mutex_init(&pool->lock); mlx5_core_dbg(dev, "pool->name = %s, pool->size = %d, pool->start = %d", name, size, start); return pool; @@ -462,6 +462,7 @@ static void irq_pool_free(struct mlx5_irq_pool *pool) xa_for_each(&pool->irqs, index, irq) irq_release(&irq->kref); xa_destroy(&pool->irqs); + mutex_destroy(&pool->lock); kvfree(pool); } From 88bbd7b2369aca4598eb8f38c5f16be98c3bb5d4 Mon Sep 17 00:00:00 2001 From: Chris Mi <cmi@nvidia.com> Date: Thu, 6 May 2021 11:40:34 +0800 Subject: [PATCH 53/87] net/mlx5e: TC, Fix error handling memory leak Free the offload sample action on error. Fixes: f94d6389f6a8 ("net/mlx5e: TC, Add support to offload sample action") Signed-off-by: Chris Mi <cmi@nvidia.com> Reviewed-by: Oz Shlomo <ozsh@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com> --- drivers/net/ethernet/mellanox/mlx5/core/esw/sample.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/sample.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/sample.c index 794012c5c476..d3ad78aa9d45 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/sample.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/sample.c @@ -501,6 +501,7 @@ err_sampler: err_offload_rule: mlx5_esw_vporttbl_put(esw, &per_vport_tbl_attr); err_default_tbl: + kfree(sample_flow); return ERR_PTR(err); } From 563476ae0c5e48a028cbfa38fa9d2fc0418eb88f Mon Sep 17 00:00:00 2001 From: Shay Drory <shayd@nvidia.com> Date: Sun, 11 Apr 2021 15:32:55 +0300 Subject: [PATCH 54/87] net/mlx5: Synchronize correct IRQ when destroying CQ The CQ destroy is performed based on the IRQ number that is stored in cq->irqn. That number wasn't set explicitly during CQ creation and as expected some of the API users of mlx5_core_create_cq() forgot to update it. This caused to wrong synchronization call of the wrong IRQ with a number 0 instead of the real one. As a fix, set the IRQ number directly in the mlx5_core_create_cq() and update all users accordingly. Fixes: 1a86b377aa21 ("vdpa/mlx5: Add VDPA driver for supported mlx5 devices") Fixes: ef1659ade359 ("IB/mlx5: Add DEVX support for CQ events") Signed-off-by: Shay Drory <shayd@nvidia.com> Reviewed-by: Tariq Toukan <tariqt@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com> --- drivers/infiniband/hw/mlx5/cq.c | 4 +--- drivers/infiniband/hw/mlx5/devx.c | 3 +-- drivers/net/ethernet/mellanox/mlx5/core/cq.c | 1 + .../net/ethernet/mellanox/mlx5/core/en_main.c | 13 ++---------- drivers/net/ethernet/mellanox/mlx5/core/eq.c | 20 +++++++++++++++---- .../ethernet/mellanox/mlx5/core/fpga/conn.c | 4 +--- .../net/ethernet/mellanox/mlx5/core/lib/eq.h | 2 ++ .../mellanox/mlx5/core/steering/dr_send.c | 4 +--- drivers/vdpa/mlx5/net/mlx5_vnet.c | 3 +-- include/linux/mlx5/driver.h | 3 +-- 10 files changed, 27 insertions(+), 30 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 7abeb576b3c5..b8e5e371bb19 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -945,7 +945,6 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, u32 *cqb = NULL; void *cqc; int cqe_size; - unsigned int irqn; int eqn; int err; @@ -984,7 +983,7 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, INIT_WORK(&cq->notify_work, notify_soft_wc_handler); } - err = mlx5_vector2eqn(dev->mdev, vector, &eqn, &irqn); + err = mlx5_vector2eqn(dev->mdev, vector, &eqn); if (err) goto err_cqb; @@ -1007,7 +1006,6 @@ int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, goto err_cqb; mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn); - cq->mcq.irqn = irqn; if (udata) cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp; else diff --git a/drivers/infiniband/hw/mlx5/devx.c b/drivers/infiniband/hw/mlx5/devx.c index eb9b0a2707f8..c869b2a91a28 100644 --- a/drivers/infiniband/hw/mlx5/devx.c +++ b/drivers/infiniband/hw/mlx5/devx.c @@ -975,7 +975,6 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_EQN)( struct mlx5_ib_dev *dev; int user_vector; int dev_eqn; - unsigned int irqn; int err; if (uverbs_copy_from(&user_vector, attrs, @@ -987,7 +986,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_QUERY_EQN)( return PTR_ERR(c); dev = to_mdev(c->ibucontext.device); - err = mlx5_vector2eqn(dev->mdev, user_vector, &dev_eqn, &irqn); + err = mlx5_vector2eqn(dev->mdev, user_vector, &dev_eqn); if (err < 0) return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cq.c b/drivers/net/ethernet/mellanox/mlx5/core/cq.c index df3e4938ecdd..360e093874d4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/cq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/cq.c @@ -134,6 +134,7 @@ int mlx5_core_create_cq(struct mlx5_core_dev *dev, struct mlx5_core_cq *cq, cq->cqn); cq->uar = dev->priv.uar; + cq->irqn = eq->core.irqn; return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index fd250f7bcd88..24f919ef9b8e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1535,15 +1535,9 @@ static int mlx5e_alloc_cq_common(struct mlx5e_priv *priv, { struct mlx5_core_dev *mdev = priv->mdev; struct mlx5_core_cq *mcq = &cq->mcq; - int eqn_not_used; - unsigned int irqn; int err; u32 i; - err = mlx5_vector2eqn(mdev, param->eq_ix, &eqn_not_used, &irqn); - if (err) - return err; - err = mlx5_cqwq_create(mdev, ¶m->wq, param->cqc, &cq->wq, &cq->wq_ctrl); if (err) @@ -1557,7 +1551,6 @@ static int mlx5e_alloc_cq_common(struct mlx5e_priv *priv, mcq->vector = param->eq_ix; mcq->comp = mlx5e_completion_event; mcq->event = mlx5e_cq_error_event; - mcq->irqn = irqn; for (i = 0; i < mlx5_cqwq_get_size(&cq->wq); i++) { struct mlx5_cqe64 *cqe = mlx5_cqwq_get_wqe(&cq->wq, i); @@ -1605,11 +1598,10 @@ static int mlx5e_create_cq(struct mlx5e_cq *cq, struct mlx5e_cq_param *param) void *in; void *cqc; int inlen; - unsigned int irqn_not_used; int eqn; int err; - err = mlx5_vector2eqn(mdev, param->eq_ix, &eqn, &irqn_not_used); + err = mlx5_vector2eqn(mdev, param->eq_ix, &eqn); if (err) return err; @@ -1983,9 +1975,8 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, struct mlx5e_channel *c; unsigned int irq; int err; - int eqn; - err = mlx5_vector2eqn(priv->mdev, ix, &eqn, &irq); + err = mlx5_vector2irqn(priv->mdev, ix, &irq); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 6e074cc457de..605c8ecc3610 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -855,8 +855,8 @@ clean: return err; } -int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, - unsigned int *irqn) +static int vector2eqnirqn(struct mlx5_core_dev *dev, int vector, int *eqn, + unsigned int *irqn) { struct mlx5_eq_table *table = dev->priv.eq_table; struct mlx5_eq_comp *eq, *n; @@ -865,8 +865,10 @@ int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, list_for_each_entry_safe(eq, n, &table->comp_eqs_list, list) { if (i++ == vector) { - *eqn = eq->core.eqn; - *irqn = eq->core.irqn; + if (irqn) + *irqn = eq->core.irqn; + if (eqn) + *eqn = eq->core.eqn; err = 0; break; } @@ -874,8 +876,18 @@ int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, return err; } + +int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn) +{ + return vector2eqnirqn(dev, vector, eqn, NULL); +} EXPORT_SYMBOL(mlx5_vector2eqn); +int mlx5_vector2irqn(struct mlx5_core_dev *dev, int vector, unsigned int *irqn) +{ + return vector2eqnirqn(dev, vector, NULL, irqn); +} + unsigned int mlx5_comp_vectors_count(struct mlx5_core_dev *dev) { return dev->priv.eq_table->num_comp_eqs; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c index bd66ab2af5b5..d5da4ab65766 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/conn.c @@ -417,7 +417,6 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) struct mlx5_wq_param wqp; struct mlx5_cqe64 *cqe; int inlen, err, eqn; - unsigned int irqn; void *cqc, *in; __be64 *pas; u32 i; @@ -446,7 +445,7 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) goto err_cqwq; } - err = mlx5_vector2eqn(mdev, smp_processor_id(), &eqn, &irqn); + err = mlx5_vector2eqn(mdev, smp_processor_id(), &eqn); if (err) { kvfree(in); goto err_cqwq; @@ -476,7 +475,6 @@ static int mlx5_fpga_conn_create_cq(struct mlx5_fpga_conn *conn, int cq_size) *conn->cq.mcq.arm_db = 0; conn->cq.mcq.vector = 0; conn->cq.mcq.comp = mlx5_fpga_conn_cq_complete; - conn->cq.mcq.irqn = irqn; conn->cq.mcq.uar = fdev->conn_res.uar; tasklet_setup(&conn->cq.tasklet, mlx5_fpga_conn_cq_tasklet); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h index 624cedebb510..d3d628b862f3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/eq.h @@ -104,4 +104,6 @@ void mlx5_core_eq_free_irqs(struct mlx5_core_dev *dev); struct cpu_rmap *mlx5_eq_table_get_rmap(struct mlx5_core_dev *dev); #endif +int mlx5_vector2irqn(struct mlx5_core_dev *dev, int vector, unsigned int *irqn); + #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c index 12cf323a5943..9df0e73d1c35 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_send.c @@ -749,7 +749,6 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, struct mlx5_cqe64 *cqe; struct mlx5dr_cq *cq; int inlen, err, eqn; - unsigned int irqn; void *cqc, *in; __be64 *pas; int vector; @@ -782,7 +781,7 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, goto err_cqwq; vector = raw_smp_processor_id() % mlx5_comp_vectors_count(mdev); - err = mlx5_vector2eqn(mdev, vector, &eqn, &irqn); + err = mlx5_vector2eqn(mdev, vector, &eqn); if (err) { kvfree(in); goto err_cqwq; @@ -818,7 +817,6 @@ static struct mlx5dr_cq *dr_create_cq(struct mlx5_core_dev *mdev, *cq->mcq.arm_db = cpu_to_be32(2 << 28); cq->mcq.vector = 0; - cq->mcq.irqn = irqn; cq->mcq.uar = uar; return cq; diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 2a31467f7ac5..379a19144a25 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -526,7 +526,6 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent) void __iomem *uar_page = ndev->mvdev.res.uar->map; u32 out[MLX5_ST_SZ_DW(create_cq_out)]; struct mlx5_vdpa_cq *vcq = &mvq->cq; - unsigned int irqn; __be64 *pas; int inlen; void *cqc; @@ -566,7 +565,7 @@ static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent) /* Use vector 0 by default. Consider adding code to choose least used * vector. */ - err = mlx5_vector2eqn(mdev, 0, &eqn, &irqn); + err = mlx5_vector2eqn(mdev, 0, &eqn); if (err) goto err_vec; diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 1efe37466969..25a8be58d289 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -1044,8 +1044,7 @@ void mlx5_unregister_debugfs(void); void mlx5_fill_page_array(struct mlx5_frag_buf *buf, __be64 *pas); void mlx5_fill_page_frag_array_perm(struct mlx5_frag_buf *buf, __be64 *pas, u8 perm); void mlx5_fill_page_frag_array(struct mlx5_frag_buf *frag_buf, __be64 *pas); -int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn, - unsigned int *irqn); +int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn); int mlx5_core_attach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); int mlx5_core_detach_mcg(struct mlx5_core_dev *dev, union ib_gid *mgid, u32 qpn); From bd37c2888ccaa5ceb9895718f6909b247cc372e0 Mon Sep 17 00:00:00 2001 From: Aya Levin <ayal@nvidia.com> Date: Tue, 8 Jun 2021 16:38:30 +0300 Subject: [PATCH 55/87] net/mlx5: Fix return value from tracer initialization Check return value of mlx5_fw_tracer_start(), set error path and fix return value of mlx5_fw_tracer_init() accordingly. Fixes: c71ad41ccb0c ("net/mlx5: FW tracer, events handling") Signed-off-by: Aya Levin <ayal@nvidia.com> Reviewed-by: Moshe Shemesh <moshe@nvidia.com> Reviewed-by: Tariq Toukan <tariqt@nvidia.com> Signed-off-by: Saeed Mahameed <saeedm@nvidia.com> --- .../net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c index 01a1d02dcf15..3f8a98093f8c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/diag/fw_tracer.c @@ -1019,12 +1019,19 @@ int mlx5_fw_tracer_init(struct mlx5_fw_tracer *tracer) MLX5_NB_INIT(&tracer->nb, fw_tracer_event, DEVICE_TRACER); mlx5_eq_notifier_register(dev, &tracer->nb); - mlx5_fw_tracer_start(tracer); - + err = mlx5_fw_tracer_start(tracer); + if (err) { + mlx5_core_warn(dev, "FWTracer: Failed to start tracer %d\n", err); + goto err_notifier_unregister; + } return 0; +err_notifier_unregister: + mlx5_eq_notifier_unregister(dev, &tracer->nb); + mlx5_core_destroy_mkey(dev, &tracer->buff.mkey); err_dealloc_pd: mlx5_core_dealloc_pd(dev, tracer->buff.pdn); + cancel_work_sync(&tracer->read_fw_strings_work); return err; } From 7b637cd52f02c6d7ff0580143a438940978fc719 Mon Sep 17 00:00:00 2001 From: Baruch Siach <baruch@tkos.co.il> Date: Mon, 2 Aug 2021 06:59:41 +0300 Subject: [PATCH 56/87] MAINTAINERS: fix Microchip CAN BUS Analyzer Tool entry typo This patch fixes the abbreviated name of the Microchip CAN BUS Analyzer Tool. Fixes: 8a7b46fa7902 ("MAINTAINERS: add Yasushi SHOJI as reviewer for the Microchip CAN BUS Analyzer Tool driver") Link: https://lore.kernel.org/r/cc4831cb1c8759c15fb32c21fd326e831183733d.1627876781.git.baruch@tkos.co.il Signed-off-by: Baruch Siach <baruch@tkos.co.il> Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de> --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index c9467d2839f5..25dc566a67c1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11327,7 +11327,7 @@ W: https://linuxtv.org T: git git://linuxtv.org/media_tree.git F: drivers/media/radio/radio-maxiradio* -MCAB MICROCHIP CAN BUS ANALYZER TOOL DRIVER +MCBA MICROCHIP CAN BUS ANALYZER TOOL DRIVER R: Yasushi SHOJI <yashi@spacecubics.com> L: linux-can@vger.kernel.org S: Maintained From aae32b784ebdbda6f6055a8021c9fb8a0ab5bcba Mon Sep 17 00:00:00 2001 From: Hussein Alasadi <alasadi@arecs.eu> Date: Mon, 9 Aug 2021 17:36:52 +0000 Subject: [PATCH 57/87] can: m_can: m_can_set_bittiming(): fix setting M_CAN_DBTP register This patch fixes the setting of the M_CAN_DBTP register contents: - use DBTP_ (the data bitrate macros) instead of NBTP_ which area used for the nominal bitrate - do not overwrite possibly-existing DBTP_TDC flag by ORing reg_btp instead of overwriting Link: https://lore.kernel.org/r/FRYP281MB06140984ABD9994C0AAF7433D1F69@FRYP281MB0614.DEUP281.PROD.OUTLOOK.COM Fixes: 20779943a080 ("can: m_can: use bits.h macros for all regmasks") Cc: Torin Cooper-Bennun <torin@maxiluxsystems.com> Cc: Chandrasekar Ramakrishnan <rcsekar@samsung.com> Signed-off-by: Hussein Alasadi <alasadi@arecs.eu> [mkl: update patch description, update indention] Signed-off-by: Marc Kleine-Budde <mkl@pengutronix.de> --- drivers/net/can/m_can/m_can.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index bba2a449ac70..43bca315a66c 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -1164,10 +1164,10 @@ static int m_can_set_bittiming(struct net_device *dev) FIELD_PREP(TDCR_TDCO_MASK, tdco)); } - reg_btp = FIELD_PREP(NBTP_NBRP_MASK, brp) | - FIELD_PREP(NBTP_NSJW_MASK, sjw) | - FIELD_PREP(NBTP_NTSEG1_MASK, tseg1) | - FIELD_PREP(NBTP_NTSEG2_MASK, tseg2); + reg_btp |= FIELD_PREP(DBTP_DBRP_MASK, brp) | + FIELD_PREP(DBTP_DSJW_MASK, sjw) | + FIELD_PREP(DBTP_DTSEG1_MASK, tseg1) | + FIELD_PREP(DBTP_DTSEG2_MASK, tseg2); m_can_write(cdev, M_CAN_DBTP, reg_btp); } From 51e1bb9eeaf7868db56e58f47848e364ab4c4129 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann <daniel@iogearbox.net> Date: Mon, 9 Aug 2021 12:43:17 +0200 Subject: [PATCH 58/87] bpf: Add lockdown check for probe_write_user helper Back then, commit 96ae52279594 ("bpf: Add bpf_probe_write_user BPF helper to be called in tracers") added the bpf_probe_write_user() helper in order to allow to override user space memory. Its original goal was to have a facility to "debug, divert, and manipulate execution of semi-cooperative processes" under CAP_SYS_ADMIN. Write to kernel was explicitly disallowed since it would otherwise tamper with its integrity. One use case was shown in cf9b1199de27 ("samples/bpf: Add test/example of using bpf_probe_write_user bpf helper") where the program DNATs traffic at the time of connect(2) syscall, meaning, it rewrites the arguments to a syscall while they're still in userspace, and before the syscall has a chance to copy the argument into kernel space. These days we have better mechanisms in BPF for achieving the same (e.g. for load-balancers), but without having to write to userspace memory. Of course the bpf_probe_write_user() helper can also be used to abuse many other things for both good or bad purpose. Outside of BPF, there is a similar mechanism for ptrace(2) such as PTRACE_PEEK{TEXT,DATA} and PTRACE_POKE{TEXT,DATA}, but would likely require some more effort. Commit 96ae52279594 explicitly dedicated the helper for experimentation purpose only. Thus, move the helper's availability behind a newly added LOCKDOWN_BPF_WRITE_USER lockdown knob so that the helper is disabled under the "integrity" mode. More fine-grained control can be implemented also from LSM side with this change. Fixes: 96ae52279594 ("bpf: Add bpf_probe_write_user BPF helper to be called in tracers") Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Andrii Nakryiko <andrii@kernel.org> --- include/linux/security.h | 1 + kernel/trace/bpf_trace.c | 5 +++-- security/security.c | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/linux/security.h b/include/linux/security.h index 724d7a4a0c91..5b7288521300 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -120,6 +120,7 @@ enum lockdown_reason { LOCKDOWN_MMIOTRACE, LOCKDOWN_DEBUGFS, LOCKDOWN_XMON_WR, + LOCKDOWN_BPF_WRITE_USER, LOCKDOWN_INTEGRITY_MAX, LOCKDOWN_KCORE, LOCKDOWN_KPROBES, diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1836591197a5..fdd14072fc3b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -990,12 +990,13 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_numa_node_id_proto; case BPF_FUNC_perf_event_read: return &bpf_perf_event_read_proto; - case BPF_FUNC_probe_write_user: - return bpf_get_probe_write_proto(); case BPF_FUNC_current_task_under_cgroup: return &bpf_current_task_under_cgroup_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; + case BPF_FUNC_probe_write_user: + return security_locked_down(LOCKDOWN_BPF_WRITE_USER) < 0 ? + NULL : bpf_get_probe_write_proto(); case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: diff --git a/security/security.c b/security/security.c index 6b83ab4e9d66..9ffa9e9c5c55 100644 --- a/security/security.c +++ b/security/security.c @@ -58,6 +58,7 @@ const char *const lockdown_reasons[LOCKDOWN_CONFIDENTIALITY_MAX+1] = { [LOCKDOWN_MMIOTRACE] = "unsafe mmio", [LOCKDOWN_DEBUGFS] = "debugfs access", [LOCKDOWN_XMON_WR] = "xmon write access", + [LOCKDOWN_BPF_WRITE_USER] = "use of bpf to write user RAM", [LOCKDOWN_INTEGRITY_MAX] = "integrity", [LOCKDOWN_KCORE] = "/proc/kcore access", [LOCKDOWN_KPROBES] = "use of kprobes", From 87b7b5335e6995a6d64fca98fc67b92b29caac9c Mon Sep 17 00:00:00 2001 From: Yonghong Song <yhs@fb.com> Date: Mon, 9 Aug 2021 16:51:51 -0700 Subject: [PATCH 59/87] bpf: Add missing bpf_read_[un]lock_trace() for syscall program Commit 79a7f8bdb159d ("bpf: Introduce bpf_sys_bpf() helper and program type.") added support for syscall program, which is a sleepable program. But the program run missed bpf_read_lock_trace()/bpf_read_unlock_trace(), which is needed to ensure proper rcu callback invocations. This patch adds bpf_read_[un]lock_trace() properly. Fixes: 79a7f8bdb159d ("bpf: Introduce bpf_sys_bpf() helper and program type.") Signed-off-by: Yonghong Song <yhs@fb.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/bpf/20210809235151.1663680-1-yhs@fb.com --- net/bpf/test_run.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 1cc75c811e24..caa16bf30fb5 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -7,6 +7,7 @@ #include <linux/vmalloc.h> #include <linux/etherdevice.h> #include <linux/filter.h> +#include <linux/rcupdate_trace.h> #include <linux/sched/signal.h> #include <net/bpf_sk_storage.h> #include <net/sock.h> @@ -951,7 +952,10 @@ int bpf_prog_test_run_syscall(struct bpf_prog *prog, goto out; } } + + rcu_read_lock_trace(); retval = bpf_prog_run_pin_on_cpu(prog, ctx); + rcu_read_unlock_trace(); if (copy_to_user(&uattr->test.retval, &retval, sizeof(u32))) { err = -EFAULT; From a2baf4e8bb0f306fbed7b5e6197c02896a638ab5 Mon Sep 17 00:00:00 2001 From: Yonghong Song <yhs@fb.com> Date: Mon, 9 Aug 2021 18:04:13 -0700 Subject: [PATCH 60/87] bpf: Fix potentially incorrect results with bpf_get_local_storage() Commit b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") fixed a bug for bpf_get_local_storage() helper so different tasks won't mess up with each other's percpu local storage. The percpu data contains 8 slots so it can hold up to 8 contexts (same or different tasks), for 8 different program runs, at the same time. This in general is sufficient. But our internal testing showed the following warning multiple times: [...] warning: WARNING: CPU: 13 PID: 41661 at include/linux/bpf-cgroup.h:193 __cgroup_bpf_run_filter_sock_ops+0x13e/0x180 RIP: 0010:__cgroup_bpf_run_filter_sock_ops+0x13e/0x180 <IRQ> tcp_call_bpf.constprop.99+0x93/0xc0 tcp_conn_request+0x41e/0xa50 ? tcp_rcv_state_process+0x203/0xe00 tcp_rcv_state_process+0x203/0xe00 ? sk_filter_trim_cap+0xbc/0x210 ? tcp_v6_inbound_md5_hash.constprop.41+0x44/0x160 tcp_v6_do_rcv+0x181/0x3e0 tcp_v6_rcv+0xc65/0xcb0 ip6_protocol_deliver_rcu+0xbd/0x450 ip6_input_finish+0x11/0x20 ip6_input+0xb5/0xc0 ip6_sublist_rcv_finish+0x37/0x50 ip6_sublist_rcv+0x1dc/0x270 ipv6_list_rcv+0x113/0x140 __netif_receive_skb_list_core+0x1a0/0x210 netif_receive_skb_list_internal+0x186/0x2a0 gro_normal_list.part.170+0x19/0x40 napi_complete_done+0x65/0x150 mlx5e_napi_poll+0x1ae/0x680 __napi_poll+0x25/0x120 net_rx_action+0x11e/0x280 __do_softirq+0xbb/0x271 irq_exit_rcu+0x97/0xa0 common_interrupt+0x7f/0xa0 </IRQ> asm_common_interrupt+0x1e/0x40 RIP: 0010:bpf_prog_1835a9241238291a_tw_egress+0x5/0xbac ? __cgroup_bpf_run_filter_skb+0x378/0x4e0 ? do_softirq+0x34/0x70 ? ip6_finish_output2+0x266/0x590 ? ip6_finish_output+0x66/0xa0 ? ip6_output+0x6c/0x130 ? ip6_xmit+0x279/0x550 ? ip6_dst_check+0x61/0xd0 [...] Using drgn [0] to dump the percpu buffer contents showed that on this CPU slot 0 is still available, but slots 1-7 are occupied and those tasks in slots 1-7 mostly don't exist any more. So we might have issues in bpf_cgroup_storage_unset(). Further debugging confirmed that there is a bug in bpf_cgroup_storage_unset(). Currently, it tries to unset "current" slot with searching from the start. So the following sequence is possible: 1. A task is running and claims slot 0 2. Running BPF program is done, and it checked slot 0 has the "task" and ready to reset it to NULL (not yet). 3. An interrupt happens, another BPF program runs and it claims slot 1 with the *same* task. 4. The unset() in interrupt context releases slot 0 since it matches "task". 5. Interrupt is done, the task in process context reset slot 0. At the end, slot 1 is not reset and the same process can continue to occupy slots 2-7 and finally, when the above step 1-5 is repeated again, step 3 BPF program won't be able to claim an empty slot and a warning will be issued. To fix the issue, for unset() function, we should traverse from the last slot to the first. This way, the above issue can be avoided. The same reverse traversal should also be done in bpf_get_local_storage() helper itself. Otherwise, incorrect local storage may be returned to BPF program. [0] https://github.com/osandov/drgn Fixes: b910eaaaa4b8 ("bpf: Fix NULL pointer dereference in bpf_get_local_storage() helper") Signed-off-by: Yonghong Song <yhs@fb.com> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Acked-by: Andrii Nakryiko <andrii@kernel.org> Link: https://lore.kernel.org/bpf/20210810010413.1976277-1-yhs@fb.com --- include/linux/bpf-cgroup.h | 4 ++-- kernel/bpf/helpers.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 8b77d08d4b47..6c9b10d82c80 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -201,8 +201,8 @@ static inline void bpf_cgroup_storage_unset(void) { int i; - for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { - if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) + for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) { + if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) continue; this_cpu_write(bpf_cgroup_storage_info[i].task, NULL); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 0b04553e8c44..7a97b2f4747d 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -397,8 +397,8 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) void *ptr; int i; - for (i = 0; i < BPF_CGROUP_STORAGE_NEST_MAX; i++) { - if (unlikely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) + for (i = BPF_CGROUP_STORAGE_NEST_MAX - 1; i >= 0; i--) { + if (likely(this_cpu_read(bpf_cgroup_storage_info[i].task) != current)) continue; storage = this_cpu_read(bpf_cgroup_storage_info[i].storage[stype]); From c34f674c8875235725c3ef86147a627f165d23b4 Mon Sep 17 00:00:00 2001 From: Ben Hutchings <ben.hutchings@mind.be> Date: Tue, 10 Aug 2021 00:59:12 +0200 Subject: [PATCH 61/87] net: dsa: microchip: Fix ksz_read64() ksz_read64() currently does some dubious byte-swapping on the two halves of a 64-bit register, and then only returns the high bits. Replace this with a straightforward expression. Fixes: e66f840c08a2 ("net: dsa: ksz: Add Microchip KSZ8795 DSA driver") Signed-off-by: Ben Hutchings <ben.hutchings@mind.be> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/dsa/microchip/ksz_common.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h index 2e6bfd333f50..6afbb41ad39e 100644 --- a/drivers/net/dsa/microchip/ksz_common.h +++ b/drivers/net/dsa/microchip/ksz_common.h @@ -205,12 +205,8 @@ static inline int ksz_read64(struct ksz_device *dev, u32 reg, u64 *val) int ret; ret = regmap_bulk_read(dev->regmap[2], reg, value, 2); - if (!ret) { - /* Ick! ToDo: Add 64bit R/W to regmap on 32bit systems */ - value[0] = swab32(value[0]); - value[1] = swab32(value[1]); - *val = swab64((u64)*value); - } + if (!ret) + *val = (u64)value[0] << 32 | value[1]; return ret; } From ef3b02a1d79b691f9a354c4903cf1e6917e315f9 Mon Sep 17 00:00:00 2001 From: Ben Hutchings <ben.hutchings@mind.be> Date: Tue, 10 Aug 2021 00:59:28 +0200 Subject: [PATCH 62/87] net: dsa: microchip: ksz8795: Fix PVID tag insertion ksz8795 has never actually enabled PVID tag insertion, and it also programmed the PVID incorrectly. To fix this: * Allow tag insertion to be controlled per ingress port. On most chips, set bit 2 in Global Control 19. On KSZ88x3 this control flag doesn't exist. * When adding a PVID: - Set the appropriate register bits to enable tag insertion on egress at every other port if this was the packet's ingress port. - Mask *out* the VID from the default tag, before or-ing in the new PVID. * When removing a PVID: - Clear the same control bits to disable tag insertion. - Don't update the default tag. This wasn't doing anything useful. Fixes: e66f840c08a2 ("net: dsa: ksz: Add Microchip KSZ8795 DSA driver") Signed-off-by: Ben Hutchings <ben.hutchings@mind.be> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/dsa/microchip/ksz8795.c | 26 ++++++++++++++++++------- drivers/net/dsa/microchip/ksz8795_reg.h | 4 ++++ 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index 560f6843bb65..b788db829d07 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -1124,6 +1124,16 @@ static int ksz8_port_vlan_filtering(struct dsa_switch *ds, int port, bool flag, return 0; } +static void ksz8_port_enable_pvid(struct ksz_device *dev, int port, bool state) +{ + if (ksz_is_ksz88x3(dev)) { + ksz_cfg(dev, REG_SW_INSERT_SRC_PVID, + 0x03 << (4 - 2 * port), state); + } else { + ksz_pwrite8(dev, port, REG_PORT_CTRL_12, state ? 0x0f : 0x00); + } +} + static int ksz8_port_vlan_add(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan, struct netlink_ext_ack *extack) @@ -1160,9 +1170,11 @@ static int ksz8_port_vlan_add(struct dsa_switch *ds, int port, u16 vid; ksz_pread16(dev, port, REG_PORT_CTRL_VID, &vid); - vid &= 0xfff; + vid &= ~VLAN_VID_MASK; vid |= new_pvid; ksz_pwrite16(dev, port, REG_PORT_CTRL_VID, vid); + + ksz8_port_enable_pvid(dev, port, true); } return 0; @@ -1173,7 +1185,7 @@ static int ksz8_port_vlan_del(struct dsa_switch *ds, int port, { bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; struct ksz_device *dev = ds->priv; - u16 data, pvid, new_pvid = 0; + u16 data, pvid; u8 fid, member, valid; if (ksz_is_ksz88x3(dev)) @@ -1195,14 +1207,11 @@ static int ksz8_port_vlan_del(struct dsa_switch *ds, int port, valid = 0; } - if (pvid == vlan->vid) - new_pvid = 1; - ksz8_to_vlan(dev, fid, member, valid, &data); ksz8_w_vlan_table(dev, vlan->vid, data); - if (new_pvid != pvid) - ksz_pwrite16(dev, port, REG_PORT_CTRL_VID, pvid); + if (pvid == vlan->vid) + ksz8_port_enable_pvid(dev, port, false); return 0; } @@ -1435,6 +1444,9 @@ static int ksz8_setup(struct dsa_switch *ds) ksz_cfg(dev, S_MIRROR_CTRL, SW_MIRROR_RX_TX, false); + if (!ksz_is_ksz88x3(dev)) + ksz_cfg(dev, REG_SW_CTRL_19, SW_INS_TAG_ENABLE, true); + /* set broadcast storm protection 10% rate */ regmap_update_bits(dev->regmap[1], S_REPLACE_VID_CTRL, BROADCAST_STORM_RATE, diff --git a/drivers/net/dsa/microchip/ksz8795_reg.h b/drivers/net/dsa/microchip/ksz8795_reg.h index a32355624f31..6b40bc25f7ff 100644 --- a/drivers/net/dsa/microchip/ksz8795_reg.h +++ b/drivers/net/dsa/microchip/ksz8795_reg.h @@ -631,6 +631,10 @@ #define REG_PORT_4_OUT_RATE_3 0xEE #define REG_PORT_5_OUT_RATE_3 0xFE +/* 88x3 specific */ + +#define REG_SW_INSERT_SRC_PVID 0xC2 + /* PME */ #define SW_PME_OUTPUT_ENABLE BIT(1) From 8f4f58f88fe0d9bd591f21f53de7dbd42baeb3fa Mon Sep 17 00:00:00 2001 From: Ben Hutchings <ben.hutchings@mind.be> Date: Tue, 10 Aug 2021 00:59:37 +0200 Subject: [PATCH 63/87] net: dsa: microchip: ksz8795: Reject unsupported VLAN configuration The switches supported by ksz8795 only have a per-port flag for Tag Removal. This means it is not possible to support both tagged and untagged VLANs on the same port. Reject attempts to add a VLAN that requires the flag to be changed, unless there are no VLANs currently configured. VID 0 is excluded from this check since it is untagged regardless of the state of the flag. On the CPU port we could support tagged and untagged VLANs at the same time. This will be enabled by a later patch. Fixes: e66f840c08a2 ("net: dsa: ksz: Add Microchip KSZ8795 DSA driver") Signed-off-by: Ben Hutchings <ben.hutchings@mind.be> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/dsa/microchip/ksz8795.c | 27 +++++++++++++++++++++++++- drivers/net/dsa/microchip/ksz_common.h | 1 + 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index b788db829d07..29a18058d1ee 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -1140,13 +1140,38 @@ static int ksz8_port_vlan_add(struct dsa_switch *ds, int port, { bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; struct ksz_device *dev = ds->priv; + struct ksz_port *p = &dev->ports[port]; u16 data, new_pvid = 0; u8 fid, member, valid; if (ksz_is_ksz88x3(dev)) return -ENOTSUPP; - ksz_port_cfg(dev, port, P_TAG_CTRL, PORT_REMOVE_TAG, untagged); + /* If a VLAN is added with untagged flag different from the + * port's Remove Tag flag, we need to change the latter. + * Ignore VID 0, which is always untagged. + */ + if (untagged != p->remove_tag && vlan->vid != 0) { + unsigned int vid; + + /* Reject attempts to add a VLAN that requires the + * Remove Tag flag to be changed, unless there are no + * other VLANs currently configured. + */ + for (vid = 1; vid < dev->num_vlans; ++vid) { + /* Skip the VID we are going to add or reconfigure */ + if (vid == vlan->vid) + continue; + + ksz8_from_vlan(dev, dev->vlan_cache[vid].table[0], + &fid, &member, &valid); + if (valid && (member & BIT(port))) + return -EINVAL; + } + + ksz_port_cfg(dev, port, P_TAG_CTRL, PORT_REMOVE_TAG, untagged); + p->remove_tag = untagged; + } ksz8_r_vlan_table(dev, vlan->vid, &data); ksz8_from_vlan(dev, data, &fid, &member, &valid); diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h index 6afbb41ad39e..1597c63988b4 100644 --- a/drivers/net/dsa/microchip/ksz_common.h +++ b/drivers/net/dsa/microchip/ksz_common.h @@ -27,6 +27,7 @@ struct ksz_port_mib { struct ksz_port { u16 member; u16 vid_member; + bool remove_tag; /* Remove Tag flag set, for ksz8795 only */ int stp_state; struct phy_device phydev; From af01754f9e3c553a2ee63b4693c79a3956e230ab Mon Sep 17 00:00:00 2001 From: Ben Hutchings <ben.hutchings@mind.be> Date: Tue, 10 Aug 2021 00:59:47 +0200 Subject: [PATCH 64/87] net: dsa: microchip: ksz8795: Fix VLAN untagged flag change on deletion When a VLAN is deleted from a port, the flags in struct switchdev_obj_port_vlan are always 0. ksz8_port_vlan_del() copies the BRIDGE_VLAN_INFO_UNTAGGED flag to the port's Tag Removal flag, and therefore always clears it. In case there are multiple VLANs configured as untagged on this port - which seems useless, but is allowed - deleting one of them changes the remaining VLANs to be tagged. It's only ever necessary to change this flag when a VLAN is added to the port, so leave it unchanged in ksz8_port_vlan_del(). Fixes: e66f840c08a2 ("net: dsa: ksz: Add Microchip KSZ8795 DSA driver") Signed-off-by: Ben Hutchings <ben.hutchings@mind.be> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/dsa/microchip/ksz8795.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index 29a18058d1ee..af3744d8b6cf 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -1208,7 +1208,6 @@ static int ksz8_port_vlan_add(struct dsa_switch *ds, int port, static int ksz8_port_vlan_del(struct dsa_switch *ds, int port, const struct switchdev_obj_port_vlan *vlan) { - bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; struct ksz_device *dev = ds->priv; u16 data, pvid; u8 fid, member, valid; @@ -1219,8 +1218,6 @@ static int ksz8_port_vlan_del(struct dsa_switch *ds, int port, ksz_pread16(dev, port, REG_PORT_CTRL_VID, &pvid); pvid = pvid & 0xFFF; - ksz_port_cfg(dev, port, P_TAG_CTRL, PORT_REMOVE_TAG, untagged); - ksz8_r_vlan_table(dev, vlan->vid, &data); ksz8_from_vlan(dev, data, &fid, &member, &valid); From 9130c2d30c17846287b803a9803106318cbe5266 Mon Sep 17 00:00:00 2001 From: Ben Hutchings <ben.hutchings@mind.be> Date: Tue, 10 Aug 2021 00:59:57 +0200 Subject: [PATCH 65/87] net: dsa: microchip: ksz8795: Use software untagging on CPU port On the CPU port, we can support both tagged and untagged VLANs at the same time by doing any necessary untagging in software rather than hardware. To enable that, keep the CPU port's Remove Tag flag cleared and set the dsa_switch::untag_bridge_pvid flag. Fixes: e66f840c08a2 ("net: dsa: ksz: Add Microchip KSZ8795 DSA driver") Signed-off-by: Ben Hutchings <ben.hutchings@mind.be> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/dsa/microchip/ksz8795.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index af3744d8b6cf..8bcef4b2dd6f 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -1150,8 +1150,10 @@ static int ksz8_port_vlan_add(struct dsa_switch *ds, int port, /* If a VLAN is added with untagged flag different from the * port's Remove Tag flag, we need to change the latter. * Ignore VID 0, which is always untagged. + * Ignore CPU port, which will always be tagged. */ - if (untagged != p->remove_tag && vlan->vid != 0) { + if (untagged != p->remove_tag && vlan->vid != 0 && + port != dev->cpu_port) { unsigned int vid; /* Reject attempts to add a VLAN that requires the @@ -1751,6 +1753,11 @@ static int ksz8_switch_init(struct ksz_device *dev) /* set the real number of ports */ dev->ds->num_ports = dev->port_cnt; + /* We rely on software untagging on the CPU port, so that we + * can support both tagged and untagged VLANs + */ + dev->ds->untag_bridge_pvid = true; + return 0; } From 164844135a3f215d3018ee9d6875336beb942413 Mon Sep 17 00:00:00 2001 From: Ben Hutchings <ben.hutchings@mind.be> Date: Tue, 10 Aug 2021 01:00:06 +0200 Subject: [PATCH 66/87] net: dsa: microchip: ksz8795: Fix VLAN filtering Currently ksz8_port_vlan_filtering() sets or clears the VLAN Enable hardware flag. That controls discarding of packets with a VID that has not been enabled for any port on the switch. Since it is a global flag, set the dsa_switch::vlan_filtering_is_global flag so that the DSA core understands this can't be controlled per port. When VLAN filtering is enabled, the switch should also discard packets with a VID that's not enabled on the ingress port. Set or clear each external port's VLAN Ingress Filter flag in ksz8_port_vlan_filtering() to make that happen. Fixes: e66f840c08a2 ("net: dsa: ksz: Add Microchip KSZ8795 DSA driver") Signed-off-by: Ben Hutchings <ben.hutchings@mind.be> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/dsa/microchip/ksz8795.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index 8bcef4b2dd6f..b1a3763c97de 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -1119,8 +1119,14 @@ static int ksz8_port_vlan_filtering(struct dsa_switch *ds, int port, bool flag, if (ksz_is_ksz88x3(dev)) return -ENOTSUPP; + /* Discard packets with VID not enabled on the switch */ ksz_cfg(dev, S_MIRROR_CTRL, SW_VLAN_ENABLE, flag); + /* Discard packets with VID not enabled on the ingress port */ + for (port = 0; port < dev->phy_port_cnt; ++port) + ksz_port_cfg(dev, port, REG_PORT_CTRL_2, PORT_INGRESS_FILTER, + flag); + return 0; } @@ -1758,6 +1764,11 @@ static int ksz8_switch_init(struct ksz_device *dev) */ dev->ds->untag_bridge_pvid = true; + /* VLAN filtering is partly controlled by the global VLAN + * Enable flag + */ + dev->ds->vlan_filtering_is_global = true; + return 0; } From 411d466d94a6b16a20c8b552e403b7e8ce2397a2 Mon Sep 17 00:00:00 2001 From: Ben Hutchings <ben.hutchings@mind.be> Date: Tue, 10 Aug 2021 01:00:15 +0200 Subject: [PATCH 67/87] net: dsa: microchip: ksz8795: Don't use phy_port_cnt in VLAN table lookup The magic number 4 in VLAN table lookup was the number of entries we can read and write at once. Using phy_port_cnt here doesn't make sense and presumably broke VLAN filtering for 3-port switches. Change it back to 4. Fixes: 4ce2a984abd8 ("net: dsa: microchip: ksz8795: use phy_port_cnt ...") Signed-off-by: Ben Hutchings <ben.hutchings@mind.be> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/dsa/microchip/ksz8795.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index b1a3763c97de..c5142f86a3c7 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -687,8 +687,8 @@ static void ksz8_r_vlan_entries(struct ksz_device *dev, u16 addr) shifts = ksz8->shifts; ksz8_r_table(dev, TABLE_VLAN, addr, &data); - addr *= dev->phy_port_cnt; - for (i = 0; i < dev->phy_port_cnt; i++) { + addr *= 4; + for (i = 0; i < 4; i++) { dev->vlan_cache[addr + i].table[0] = (u16)data; data >>= shifts[VLAN_TABLE]; } @@ -702,7 +702,7 @@ static void ksz8_r_vlan_table(struct ksz_device *dev, u16 vid, u16 *vlan) u64 buf; data = (u16 *)&buf; - addr = vid / dev->phy_port_cnt; + addr = vid / 4; index = vid & 3; ksz8_r_table(dev, TABLE_VLAN, addr, &buf); *vlan = data[index]; @@ -716,7 +716,7 @@ static void ksz8_w_vlan_table(struct ksz_device *dev, u16 vid, u16 vlan) u64 buf; data = (u16 *)&buf; - addr = vid / dev->phy_port_cnt; + addr = vid / 4; index = vid & 3; ksz8_r_table(dev, TABLE_VLAN, addr, &buf); data[index] = vlan; From 1090340f7ee53e824fd4eef66a4855d548110c5b Mon Sep 17 00:00:00 2001 From: Takeshi Misawa <jeliantsurux@gmail.com> Date: Thu, 5 Aug 2021 16:54:14 +0900 Subject: [PATCH 68/87] net: Fix memory leak in ieee802154_raw_deliver If IEEE-802.15.4-RAW is closed before receive skb, skb is leaked. Fix this, by freeing sk_receive_queue in sk->sk_destruct(). syzbot report: BUG: memory leak unreferenced object 0xffff88810f644600 (size 232): comm "softirq", pid 0, jiffies 4294967032 (age 81.270s) hex dump (first 32 bytes): 10 7d 4b 12 81 88 ff ff 10 7d 4b 12 81 88 ff ff .}K......}K..... 00 00 00 00 00 00 00 00 40 7c 4b 12 81 88 ff ff ........@|K..... backtrace: [<ffffffff83651d4a>] skb_clone+0xaa/0x2b0 net/core/skbuff.c:1496 [<ffffffff83fe1b80>] ieee802154_raw_deliver net/ieee802154/socket.c:369 [inline] [<ffffffff83fe1b80>] ieee802154_rcv+0x100/0x340 net/ieee802154/socket.c:1070 [<ffffffff8367cc7a>] __netif_receive_skb_one_core+0x6a/0xa0 net/core/dev.c:5384 [<ffffffff8367cd07>] __netif_receive_skb+0x27/0xa0 net/core/dev.c:5498 [<ffffffff8367cdd9>] netif_receive_skb_internal net/core/dev.c:5603 [inline] [<ffffffff8367cdd9>] netif_receive_skb+0x59/0x260 net/core/dev.c:5662 [<ffffffff83fe6302>] ieee802154_deliver_skb net/mac802154/rx.c:29 [inline] [<ffffffff83fe6302>] ieee802154_subif_frame net/mac802154/rx.c:102 [inline] [<ffffffff83fe6302>] __ieee802154_rx_handle_packet net/mac802154/rx.c:212 [inline] [<ffffffff83fe6302>] ieee802154_rx+0x612/0x620 net/mac802154/rx.c:284 [<ffffffff83fe59a6>] ieee802154_tasklet_handler+0x86/0xa0 net/mac802154/main.c:35 [<ffffffff81232aab>] tasklet_action_common.constprop.0+0x5b/0x100 kernel/softirq.c:557 [<ffffffff846000bf>] __do_softirq+0xbf/0x2ab kernel/softirq.c:345 [<ffffffff81232f4c>] do_softirq kernel/softirq.c:248 [inline] [<ffffffff81232f4c>] do_softirq+0x5c/0x80 kernel/softirq.c:235 [<ffffffff81232fc1>] __local_bh_enable_ip+0x51/0x60 kernel/softirq.c:198 [<ffffffff8367a9a4>] local_bh_enable include/linux/bottom_half.h:32 [inline] [<ffffffff8367a9a4>] rcu_read_unlock_bh include/linux/rcupdate.h:745 [inline] [<ffffffff8367a9a4>] __dev_queue_xmit+0x7f4/0xf60 net/core/dev.c:4221 [<ffffffff83fe2db4>] raw_sendmsg+0x1f4/0x2b0 net/ieee802154/socket.c:295 [<ffffffff8363af16>] sock_sendmsg_nosec net/socket.c:654 [inline] [<ffffffff8363af16>] sock_sendmsg+0x56/0x80 net/socket.c:674 [<ffffffff8363deec>] __sys_sendto+0x15c/0x200 net/socket.c:1977 [<ffffffff8363dfb6>] __do_sys_sendto net/socket.c:1989 [inline] [<ffffffff8363dfb6>] __se_sys_sendto net/socket.c:1985 [inline] [<ffffffff8363dfb6>] __x64_sys_sendto+0x26/0x30 net/socket.c:1985 Fixes: 9ec767160357 ("net: add IEEE 802.15.4 socket family implementation") Reported-and-tested-by: syzbot+1f68113fa907bf0695a8@syzkaller.appspotmail.com Signed-off-by: Takeshi Misawa <jeliantsurux@gmail.com> Acked-by: Alexander Aring <aahringo@redhat.com> Link: https://lore.kernel.org/r/20210805075414.GA15796@DESKTOP Signed-off-by: Stefan Schmidt <stefan@datenfreihafen.org> --- net/ieee802154/socket.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index a45a0401adc5..c25f7617770c 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -984,6 +984,11 @@ static const struct proto_ops ieee802154_dgram_ops = { .sendpage = sock_no_sendpage, }; +static void ieee802154_sock_destruct(struct sock *sk) +{ + skb_queue_purge(&sk->sk_receive_queue); +} + /* Create a socket. Initialise the socket, blank the addresses * set the state. */ @@ -1024,7 +1029,7 @@ static int ieee802154_create(struct net *net, struct socket *sock, sock->ops = ops; sock_init_data(sock, sk); - /* FIXME: sk->sk_destruct */ + sk->sk_destruct = ieee802154_sock_destruct; sk->sk_family = PF_IEEE802154; /* Checksums on by default */ From 4a2b285e7e103d4d6c6ed3e5052a0ff74a5d7f15 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Tue, 10 Aug 2021 02:45:47 -0700 Subject: [PATCH 69/87] net: igmp: fix data-race in igmp_ifc_timer_expire() Fix the data-race reported by syzbot [1] Issue here is that igmp_ifc_timer_expire() can update in_dev->mr_ifc_count while another change just occured from another context. in_dev->mr_ifc_count is only 8bit wide, so the race had little consequences. [1] BUG: KCSAN: data-race in igmp_ifc_event / igmp_ifc_timer_expire write to 0xffff8881051e3062 of 1 bytes by task 12547 on cpu 0: igmp_ifc_event+0x1d5/0x290 net/ipv4/igmp.c:821 igmp_group_added+0x462/0x490 net/ipv4/igmp.c:1356 ____ip_mc_inc_group+0x3ff/0x500 net/ipv4/igmp.c:1461 __ip_mc_join_group+0x24d/0x2c0 net/ipv4/igmp.c:2199 ip_mc_join_group_ssm+0x20/0x30 net/ipv4/igmp.c:2218 do_ip_setsockopt net/ipv4/ip_sockglue.c:1285 [inline] ip_setsockopt+0x1827/0x2a80 net/ipv4/ip_sockglue.c:1423 tcp_setsockopt+0x8c/0xa0 net/ipv4/tcp.c:3657 sock_common_setsockopt+0x5d/0x70 net/core/sock.c:3362 __sys_setsockopt+0x18f/0x200 net/socket.c:2159 __do_sys_setsockopt net/socket.c:2170 [inline] __se_sys_setsockopt net/socket.c:2167 [inline] __x64_sys_setsockopt+0x62/0x70 net/socket.c:2167 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff8881051e3062 of 1 bytes by interrupt on cpu 1: igmp_ifc_timer_expire+0x706/0xa30 net/ipv4/igmp.c:808 call_timer_fn+0x2e/0x1d0 kernel/time/timer.c:1419 expire_timers+0x135/0x250 kernel/time/timer.c:1464 __run_timers+0x358/0x420 kernel/time/timer.c:1732 run_timer_softirq+0x19/0x30 kernel/time/timer.c:1745 __do_softirq+0x12c/0x26e kernel/softirq.c:558 invoke_softirq kernel/softirq.c:432 [inline] __irq_exit_rcu+0x9a/0xb0 kernel/softirq.c:636 sysvec_apic_timer_interrupt+0x69/0x80 arch/x86/kernel/apic/apic.c:1100 asm_sysvec_apic_timer_interrupt+0x12/0x20 arch/x86/include/asm/idtentry.h:638 console_unlock+0x8e8/0xb30 kernel/printk/printk.c:2646 vprintk_emit+0x125/0x3d0 kernel/printk/printk.c:2174 vprintk_default+0x22/0x30 kernel/printk/printk.c:2185 vprintk+0x15a/0x170 kernel/printk/printk_safe.c:392 printk+0x62/0x87 kernel/printk/printk.c:2216 selinux_netlink_send+0x399/0x400 security/selinux/hooks.c:6041 security_netlink_send+0x42/0x90 security/security.c:2070 netlink_sendmsg+0x59e/0x7c0 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:703 [inline] sock_sendmsg net/socket.c:723 [inline] ____sys_sendmsg+0x360/0x4d0 net/socket.c:2392 ___sys_sendmsg net/socket.c:2446 [inline] __sys_sendmsg+0x1ed/0x270 net/socket.c:2475 __do_sys_sendmsg net/socket.c:2484 [inline] __se_sys_sendmsg net/socket.c:2482 [inline] __x64_sys_sendmsg+0x42/0x50 net/socket.c:2482 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x01 -> 0x02 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 12539 Comm: syz-executor.1 Not tainted 5.14.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: syzbot <syzkaller@googlegroups.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/ipv4/igmp.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 6b3c558a4f23..a51360087b19 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -803,10 +803,17 @@ static void igmp_gq_timer_expire(struct timer_list *t) static void igmp_ifc_timer_expire(struct timer_list *t) { struct in_device *in_dev = from_timer(in_dev, t, mr_ifc_timer); + u8 mr_ifc_count; igmpv3_send_cr(in_dev); - if (in_dev->mr_ifc_count) { - in_dev->mr_ifc_count--; +restart: + mr_ifc_count = READ_ONCE(in_dev->mr_ifc_count); + + if (mr_ifc_count) { + if (cmpxchg(&in_dev->mr_ifc_count, + mr_ifc_count, + mr_ifc_count - 1) != mr_ifc_count) + goto restart; igmp_ifc_start_timer(in_dev, unsolicited_report_interval(in_dev)); } @@ -818,7 +825,7 @@ static void igmp_ifc_event(struct in_device *in_dev) struct net *net = dev_net(in_dev->dev); if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) return; - in_dev->mr_ifc_count = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; + WRITE_ONCE(in_dev->mr_ifc_count, in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv); igmp_ifc_start_timer(in_dev, 1); } @@ -957,7 +964,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, in_dev->mr_qri; } /* cancel the interface change timer */ - in_dev->mr_ifc_count = 0; + WRITE_ONCE(in_dev->mr_ifc_count, 0); if (del_timer(&in_dev->mr_ifc_timer)) __in_dev_put(in_dev); /* clear deleted report items */ @@ -1724,7 +1731,7 @@ void ip_mc_down(struct in_device *in_dev) igmp_group_dropped(pmc); #ifdef CONFIG_IP_MULTICAST - in_dev->mr_ifc_count = 0; + WRITE_ONCE(in_dev->mr_ifc_count, 0); if (del_timer(&in_dev->mr_ifc_timer)) __in_dev_put(in_dev); in_dev->mr_gq_running = 0; @@ -1941,7 +1948,7 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode, pmc->sfmode = MCAST_INCLUDE; #ifdef CONFIG_IP_MULTICAST pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; - in_dev->mr_ifc_count = pmc->crcount; + WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(pmc->interface); @@ -2120,7 +2127,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode, /* else no filters; keep old mode for reports */ pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; - in_dev->mr_ifc_count = pmc->crcount; + WRITE_ONCE(in_dev->mr_ifc_count, pmc->crcount); for (psf = pmc->sources; psf; psf = psf->sf_next) psf->sf_crcount = 0; igmp_ifc_event(in_dev); From 019d0454c61707879cf9853c894e0a191f6b9774 Mon Sep 17 00:00:00 2001 From: Randy Dunlap <rdunlap@infradead.org> Date: Mon, 9 Aug 2021 14:52:29 -0700 Subject: [PATCH 70/87] bpf, core: Fix kernel-doc notation Fix kernel-doc warnings in kernel/bpf/core.c (found by scripts/kernel-doc and W=1 builds). That is, correct a function name in a comment and add return descriptions for 2 functions. Fixes these kernel-doc warnings: kernel/bpf/core.c:1372: warning: expecting prototype for __bpf_prog_run(). Prototype was for ___bpf_prog_run() instead kernel/bpf/core.c:1372: warning: No description found for return value of '___bpf_prog_run' kernel/bpf/core.c:1883: warning: No description found for return value of 'bpf_prog_select_runtime' Signed-off-by: Randy Dunlap <rdunlap@infradead.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net> Link: https://lore.kernel.org/bpf/20210809215229.7556-1-rdunlap@infradead.org --- kernel/bpf/core.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b1a5fc04492b..0a28a8095d3e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1362,11 +1362,13 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) } /** - * __bpf_prog_run - run eBPF program on a given context + * ___bpf_prog_run - run eBPF program on a given context * @regs: is the array of MAX_BPF_EXT_REG eBPF pseudo-registers * @insn: is the array of eBPF instructions * * Decode and execute eBPF instructions. + * + * Return: whatever value is in %BPF_R0 at program exit */ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn) { @@ -1878,6 +1880,9 @@ static void bpf_prog_select_func(struct bpf_prog *fp) * * Try to JIT eBPF program, if JIT is not available, use interpreter. * The BPF program will be executed via BPF_PROG_RUN() macro. + * + * Return: the &fp argument along with &err set to 0 for success or + * a negative errno code on failure */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { From cd391280bf4693ceddca8f19042cff42f98c1a89 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean <vladimir.oltean@nxp.com> Date: Tue, 10 Aug 2021 14:19:53 +0300 Subject: [PATCH 71/87] net: dsa: hellcreek: fix broken backpressure in .port_fdb_dump rtnl_fdb_dump() has logic to split a dump of PF_BRIDGE neighbors into multiple netlink skbs if the buffer provided by user space is too small (one buffer will typically handle a few hundred FDB entries). When the current buffer becomes full, nlmsg_put() in dsa_slave_port_fdb_do_dump() returns -EMSGSIZE and DSA saves the index of the last dumped FDB entry, returns to rtnl_fdb_dump() up to that point, and then the dump resumes on the same port with a new skb, and FDB entries up to the saved index are simply skipped. Since dsa_slave_port_fdb_do_dump() is pointed to by the "cb" passed to drivers, then drivers must check for the -EMSGSIZE error code returned by it. Otherwise, when a netlink skb becomes full, DSA will no longer save newly dumped FDB entries to it, but the driver will continue dumping. So FDB entries will be missing from the dump. Fix the broken backpressure by propagating the "cb" return code and allow rtnl_fdb_dump() to restart the FDB dump with a new skb. Fixes: e4b27ebc780f ("net: dsa: Add DSA driver for Hirschmann Hellcreek switches") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Acked-by: Kurt Kanzenbach <kurt@linutronix.de> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/dsa/hirschmann/hellcreek.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/hirschmann/hellcreek.c b/drivers/net/dsa/hirschmann/hellcreek.c index 9fdcc4bde480..5c54ae1be62c 100644 --- a/drivers/net/dsa/hirschmann/hellcreek.c +++ b/drivers/net/dsa/hirschmann/hellcreek.c @@ -912,6 +912,7 @@ static int hellcreek_fdb_dump(struct dsa_switch *ds, int port, { struct hellcreek *hellcreek = ds->priv; u16 entries; + int ret = 0; size_t i; mutex_lock(&hellcreek->reg_lock); @@ -943,12 +944,14 @@ static int hellcreek_fdb_dump(struct dsa_switch *ds, int port, if (!(entry.portmask & BIT(port))) continue; - cb(entry.mac, 0, entry.is_static, data); + ret = cb(entry.mac, 0, entry.is_static, data); + if (ret) + break; } mutex_unlock(&hellcreek->reg_lock); - return 0; + return ret; } static int hellcreek_vlan_filtering(struct dsa_switch *ds, int port, From ada2fee185d8145afb89056558bb59545b9dbdd0 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean <vladimir.oltean@nxp.com> Date: Tue, 10 Aug 2021 14:19:54 +0300 Subject: [PATCH 72/87] net: dsa: lan9303: fix broken backpressure in .port_fdb_dump rtnl_fdb_dump() has logic to split a dump of PF_BRIDGE neighbors into multiple netlink skbs if the buffer provided by user space is too small (one buffer will typically handle a few hundred FDB entries). When the current buffer becomes full, nlmsg_put() in dsa_slave_port_fdb_do_dump() returns -EMSGSIZE and DSA saves the index of the last dumped FDB entry, returns to rtnl_fdb_dump() up to that point, and then the dump resumes on the same port with a new skb, and FDB entries up to the saved index are simply skipped. Since dsa_slave_port_fdb_do_dump() is pointed to by the "cb" passed to drivers, then drivers must check for the -EMSGSIZE error code returned by it. Otherwise, when a netlink skb becomes full, DSA will no longer save newly dumped FDB entries to it, but the driver will continue dumping. So FDB entries will be missing from the dump. Fix the broken backpressure by propagating the "cb" return code and allow rtnl_fdb_dump() to restart the FDB dump with a new skb. Fixes: ab335349b852 ("net: dsa: lan9303: Add port_fast_age and port_fdb_dump methods") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/dsa/lan9303-core.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/drivers/net/dsa/lan9303-core.c b/drivers/net/dsa/lan9303-core.c index 344374025426..d7ce281570b5 100644 --- a/drivers/net/dsa/lan9303-core.c +++ b/drivers/net/dsa/lan9303-core.c @@ -557,12 +557,12 @@ static int lan9303_alr_make_entry_raw(struct lan9303 *chip, u32 dat0, u32 dat1) return 0; } -typedef void alr_loop_cb_t(struct lan9303 *chip, u32 dat0, u32 dat1, - int portmap, void *ctx); +typedef int alr_loop_cb_t(struct lan9303 *chip, u32 dat0, u32 dat1, + int portmap, void *ctx); -static void lan9303_alr_loop(struct lan9303 *chip, alr_loop_cb_t *cb, void *ctx) +static int lan9303_alr_loop(struct lan9303 *chip, alr_loop_cb_t *cb, void *ctx) { - int i; + int ret = 0, i; mutex_lock(&chip->alr_mutex); lan9303_write_switch_reg(chip, LAN9303_SWE_ALR_CMD, @@ -582,13 +582,17 @@ static void lan9303_alr_loop(struct lan9303 *chip, alr_loop_cb_t *cb, void *ctx) LAN9303_ALR_DAT1_PORT_BITOFFS; portmap = alrport_2_portmap[alrport]; - cb(chip, dat0, dat1, portmap, ctx); + ret = cb(chip, dat0, dat1, portmap, ctx); + if (ret) + break; lan9303_write_switch_reg(chip, LAN9303_SWE_ALR_CMD, LAN9303_ALR_CMD_GET_NEXT); lan9303_write_switch_reg(chip, LAN9303_SWE_ALR_CMD, 0); } mutex_unlock(&chip->alr_mutex); + + return ret; } static void alr_reg_to_mac(u32 dat0, u32 dat1, u8 mac[6]) @@ -606,18 +610,20 @@ struct del_port_learned_ctx { }; /* Clear learned (non-static) entry on given port */ -static void alr_loop_cb_del_port_learned(struct lan9303 *chip, u32 dat0, - u32 dat1, int portmap, void *ctx) +static int alr_loop_cb_del_port_learned(struct lan9303 *chip, u32 dat0, + u32 dat1, int portmap, void *ctx) { struct del_port_learned_ctx *del_ctx = ctx; int port = del_ctx->port; if (((BIT(port) & portmap) == 0) || (dat1 & LAN9303_ALR_DAT1_STATIC)) - return; + return 0; /* learned entries has only one port, we can just delete */ dat1 &= ~LAN9303_ALR_DAT1_VALID; /* delete entry */ lan9303_alr_make_entry_raw(chip, dat0, dat1); + + return 0; } struct port_fdb_dump_ctx { @@ -626,19 +632,19 @@ struct port_fdb_dump_ctx { dsa_fdb_dump_cb_t *cb; }; -static void alr_loop_cb_fdb_port_dump(struct lan9303 *chip, u32 dat0, - u32 dat1, int portmap, void *ctx) +static int alr_loop_cb_fdb_port_dump(struct lan9303 *chip, u32 dat0, + u32 dat1, int portmap, void *ctx) { struct port_fdb_dump_ctx *dump_ctx = ctx; u8 mac[ETH_ALEN]; bool is_static; if ((BIT(dump_ctx->port) & portmap) == 0) - return; + return 0; alr_reg_to_mac(dat0, dat1, mac); is_static = !!(dat1 & LAN9303_ALR_DAT1_STATIC); - dump_ctx->cb(mac, 0, is_static, dump_ctx->data); + return dump_ctx->cb(mac, 0, is_static, dump_ctx->data); } /* Set a static ALR entry. Delete entry if port_map is zero */ @@ -1210,9 +1216,7 @@ static int lan9303_port_fdb_dump(struct dsa_switch *ds, int port, }; dev_dbg(chip->dev, "%s(%d)\n", __func__, port); - lan9303_alr_loop(chip, alr_loop_cb_fdb_port_dump, &dump_ctx); - - return 0; + return lan9303_alr_loop(chip, alr_loop_cb_fdb_port_dump, &dump_ctx); } static int lan9303_port_mdb_prepare(struct dsa_switch *ds, int port, From 871a73a1c8f55da0a3db234e9dd816ea4fd546f2 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean <vladimir.oltean@nxp.com> Date: Tue, 10 Aug 2021 14:19:55 +0300 Subject: [PATCH 73/87] net: dsa: lantiq: fix broken backpressure in .port_fdb_dump rtnl_fdb_dump() has logic to split a dump of PF_BRIDGE neighbors into multiple netlink skbs if the buffer provided by user space is too small (one buffer will typically handle a few hundred FDB entries). When the current buffer becomes full, nlmsg_put() in dsa_slave_port_fdb_do_dump() returns -EMSGSIZE and DSA saves the index of the last dumped FDB entry, returns to rtnl_fdb_dump() up to that point, and then the dump resumes on the same port with a new skb, and FDB entries up to the saved index are simply skipped. Since dsa_slave_port_fdb_do_dump() is pointed to by the "cb" passed to drivers, then drivers must check for the -EMSGSIZE error code returned by it. Otherwise, when a netlink skb becomes full, DSA will no longer save newly dumped FDB entries to it, but the driver will continue dumping. So FDB entries will be missing from the dump. Fix the broken backpressure by propagating the "cb" return code and allow rtnl_fdb_dump() to restart the FDB dump with a new skb. Fixes: 58c59ef9e930 ("net: dsa: lantiq: Add Forwarding Database access") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/dsa/lantiq_gswip.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index 314ae78bbdd6..e78026ef6d8c 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -1404,11 +1404,17 @@ static int gswip_port_fdb_dump(struct dsa_switch *ds, int port, addr[1] = mac_bridge.key[2] & 0xff; addr[0] = (mac_bridge.key[2] >> 8) & 0xff; if (mac_bridge.val[1] & GSWIP_TABLE_MAC_BRIDGE_STATIC) { - if (mac_bridge.val[0] & BIT(port)) - cb(addr, 0, true, data); + if (mac_bridge.val[0] & BIT(port)) { + err = cb(addr, 0, true, data); + if (err) + return err; + } } else { - if (((mac_bridge.val[0] & GENMASK(7, 4)) >> 4) == port) - cb(addr, 0, false, data); + if (((mac_bridge.val[0] & GENMASK(7, 4)) >> 4) == port) { + err = cb(addr, 0, false, data); + if (err) + return err; + } } } return 0; From 21b52fed928e96d2f75d2f6aa9eac7a4b0b55d22 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean <vladimir.oltean@nxp.com> Date: Tue, 10 Aug 2021 14:19:56 +0300 Subject: [PATCH 74/87] net: dsa: sja1105: fix broken backpressure in .port_fdb_dump rtnl_fdb_dump() has logic to split a dump of PF_BRIDGE neighbors into multiple netlink skbs if the buffer provided by user space is too small (one buffer will typically handle a few hundred FDB entries). When the current buffer becomes full, nlmsg_put() in dsa_slave_port_fdb_do_dump() returns -EMSGSIZE and DSA saves the index of the last dumped FDB entry, returns to rtnl_fdb_dump() up to that point, and then the dump resumes on the same port with a new skb, and FDB entries up to the saved index are simply skipped. Since dsa_slave_port_fdb_do_dump() is pointed to by the "cb" passed to drivers, then drivers must check for the -EMSGSIZE error code returned by it. Otherwise, when a netlink skb becomes full, DSA will no longer save newly dumped FDB entries to it, but the driver will continue dumping. So FDB entries will be missing from the dump. Fix the broken backpressure by propagating the "cb" return code and allow rtnl_fdb_dump() to restart the FDB dump with a new skb. Fixes: 291d1e72b756 ("net: dsa: sja1105: Add support for FDB and MDB management") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/dsa/sja1105/sja1105_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index 8667c9754330..b188a80eeec6 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -1635,7 +1635,9 @@ static int sja1105_fdb_dump(struct dsa_switch *ds, int port, /* We need to hide the dsa_8021q VLANs from the user. */ if (priv->vlan_state == SJA1105_VLAN_UNAWARE) l2_lookup.vlanid = 0; - cb(macaddr, l2_lookup.vlanid, l2_lookup.lockeds, data); + rc = cb(macaddr, l2_lookup.vlanid, l2_lookup.lockeds, data); + if (rc) + return rc; } return 0; } From 45a687879b31caae4032abd1c2402e289d2b8083 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov <nikolay@nvidia.com> Date: Tue, 10 Aug 2021 14:00:10 +0300 Subject: [PATCH 75/87] net: bridge: fix flags interpretation for extern learn fdb entries Ignore fdb flags when adding port extern learn entries and always set BR_FDB_LOCAL flag when adding bridge extern learn entries. This is closest to the behaviour we had before and avoids breaking any use cases which were allowed. This patch fixes iproute2 calls which assume NUD_PERMANENT and were allowed before, example: $ bridge fdb add 00:11:22:33:44:55 dev swp1 extern_learn Extern learn entries are allowed to roam, but do not expire, so static or dynamic flags make no sense for them. Also add a comment for future reference. Fixes: eb100e0e24a2 ("net: bridge: allow to add externally learned entries from user-space") Fixes: 0541a6293298 ("net: bridge: validate the NUD_PERMANENT bit when adding an extern_learn FDB entry") Reviewed-by: Ido Schimmel <idosch@nvidia.com> Tested-by: Ido Schimmel <idosch@nvidia.com> Signed-off-by: Nikolay Aleksandrov <nikolay@nvidia.com> Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20210810110010.43859-1-razor@blackwall.org Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- include/uapi/linux/neighbour.h | 7 +++++-- net/bridge/br.c | 3 +-- net/bridge/br_fdb.c | 11 ++++------- net/bridge/br_private.h | 2 +- 4 files changed, 11 insertions(+), 12 deletions(-) diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index dc8b72201f6c..00a60695fa53 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -66,8 +66,11 @@ enum { #define NUD_NONE 0x00 /* NUD_NOARP & NUD_PERMANENT are pseudostates, they never change - and make no address resolution or NUD. - NUD_PERMANENT also cannot be deleted by garbage collectors. + * and make no address resolution or NUD. + * NUD_PERMANENT also cannot be deleted by garbage collectors. + * When NTF_EXT_LEARNED is set for a bridge fdb entry the different cache entry + * states don't make sense and thus are ignored. Such entries don't age and + * can roam. */ struct nda_cacheinfo { diff --git a/net/bridge/br.c b/net/bridge/br.c index bbab9984f24e..ef743f94254d 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -166,8 +166,7 @@ static int br_switchdev_event(struct notifier_block *unused, case SWITCHDEV_FDB_ADD_TO_BRIDGE: fdb_info = ptr; err = br_fdb_external_learn_add(br, p, fdb_info->addr, - fdb_info->vid, - fdb_info->is_local, false); + fdb_info->vid, false); if (err) { err = notifier_from_errno(err); break; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 835cec1e5a03..5dee30966ed3 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -1044,10 +1044,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, "FDB entry towards bridge must be permanent"); return -EINVAL; } - - err = br_fdb_external_learn_add(br, p, addr, vid, - ndm->ndm_state & NUD_PERMANENT, - true); + err = br_fdb_external_learn_add(br, p, addr, vid, true); } else { spin_lock_bh(&br->hash_lock); err = fdb_add_entry(br, p, addr, ndm, nlh_flags, vid, nfea_tb); @@ -1275,7 +1272,7 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p) } int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, - const unsigned char *addr, u16 vid, bool is_local, + const unsigned char *addr, u16 vid, bool swdev_notify) { struct net_bridge_fdb_entry *fdb; @@ -1293,7 +1290,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, if (swdev_notify) flags |= BIT(BR_FDB_ADDED_BY_USER); - if (is_local) + if (!p) flags |= BIT(BR_FDB_LOCAL); fdb = fdb_create(br, p, addr, vid, flags); @@ -1322,7 +1319,7 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, if (swdev_notify) set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); - if (is_local) + if (!p) set_bit(BR_FDB_LOCAL, &fdb->flags); if (modified) diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index aa64d8d63ca3..2b48b204205e 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -711,7 +711,7 @@ int br_fdb_get(struct sk_buff *skb, struct nlattr *tb[], struct net_device *dev, int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p); void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p); int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, - const unsigned char *addr, u16 vid, bool is_local, + const unsigned char *addr, u16 vid, bool swdev_notify); int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, From c35b57ceff906856dd85af2d6709dab18fbca81f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean <vladimir.oltean@nxp.com> Date: Tue, 10 Aug 2021 14:50:24 +0300 Subject: [PATCH 76/87] net: switchdev: zero-initialize struct switchdev_notifier_fdb_info emitted by drivers towards the bridge The blamed commit added a new field to struct switchdev_notifier_fdb_info, but did not make sure that all call paths set it to something valid. For example, a switchdev driver may emit a SWITCHDEV_FDB_ADD_TO_BRIDGE notifier, and since the 'is_local' flag is not set, it contains junk from the stack, so the bridge might interpret those notifications as being for local FDB entries when that was not intended. To avoid that now and in the future, zero-initialize all switchdev_notifier_fdb_info structures created by drivers such that all newly added fields to not need to touch drivers again. Fixes: 2c4eca3ef716 ("net: bridge: switchdev: include local flag in FDB notifications") Reported-by: Ido Schimmel <idosch@idosch.org> Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Ido Schimmel <idosch@nvidia.com> Tested-by: Ido Schimmel <idosch@nvidia.com> Reviewed-by: Leon Romanovsky <leonro@nvidia.com> Reviewed-by: Karsten Graul <kgraul@linux.ibm.com> Link: https://lore.kernel.org/r/20210810115024.1629983-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- drivers/net/ethernet/marvell/prestera/prestera_switchdev.c | 4 ++-- drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c | 2 +- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 4 ++-- drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c | 2 +- drivers/net/ethernet/microchip/sparx5/sparx5_mactable.c | 2 +- drivers/net/ethernet/rocker/rocker_main.c | 2 +- drivers/net/ethernet/rocker/rocker_ofdpa.c | 2 +- drivers/net/ethernet/ti/am65-cpsw-switchdev.c | 2 +- drivers/net/ethernet/ti/cpsw_switchdev.c | 2 +- drivers/s390/net/qeth_l2_main.c | 4 ++-- net/dsa/slave.c | 2 +- 11 files changed, 14 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c index 0b3e8f2db294..9a309169dbae 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_switchdev.c @@ -748,7 +748,7 @@ static void prestera_fdb_offload_notify(struct prestera_port *port, struct switchdev_notifier_fdb_info *info) { - struct switchdev_notifier_fdb_info send_info; + struct switchdev_notifier_fdb_info send_info = {}; send_info.addr = info->addr; send_info.vid = info->vid; @@ -1123,7 +1123,7 @@ static int prestera_switchdev_blk_event(struct notifier_block *unused, static void prestera_fdb_event(struct prestera_switch *sw, struct prestera_event *evt, void *arg) { - struct switchdev_notifier_fdb_info info; + struct switchdev_notifier_fdb_info info = {}; struct net_device *dev = NULL; struct prestera_port *port; struct prestera_lag *lag; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c index f3f56f32e435..69a3630818d7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge.c @@ -69,7 +69,7 @@ static void mlx5_esw_bridge_fdb_offload_notify(struct net_device *dev, const unsigned char *addr, u16 vid, unsigned long val) { - struct switchdev_notifier_fdb_info send_info; + struct switchdev_notifier_fdb_info send_info = {}; send_info.addr = addr; send_info.vid = vid; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 7e221ef01437..f69cbb3852d5 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -9079,7 +9079,7 @@ mlxsw_sp_rif_fid_fid_get(struct mlxsw_sp_rif *rif, static void mlxsw_sp_rif_fid_fdb_del(struct mlxsw_sp_rif *rif, const char *mac) { - struct switchdev_notifier_fdb_info info; + struct switchdev_notifier_fdb_info info = {}; struct net_device *dev; dev = br_fdb_find_port(rif->dev, mac, 0); @@ -9127,8 +9127,8 @@ mlxsw_sp_rif_vlan_fid_get(struct mlxsw_sp_rif *rif, static void mlxsw_sp_rif_vlan_fdb_del(struct mlxsw_sp_rif *rif, const char *mac) { + struct switchdev_notifier_fdb_info info = {}; u16 vid = mlxsw_sp_fid_8021q_vid(rif->fid); - struct switchdev_notifier_fdb_info info; struct net_device *br_dev; struct net_device *dev; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index c5ef9aa64efe..8f90cd323d5f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -2508,7 +2508,7 @@ mlxsw_sp_fdb_call_notifiers(enum switchdev_notifier_type type, const char *mac, u16 vid, struct net_device *dev, bool offloaded) { - struct switchdev_notifier_fdb_info info; + struct switchdev_notifier_fdb_info info = {}; info.addr = mac; info.vid = vid; diff --git a/drivers/net/ethernet/microchip/sparx5/sparx5_mactable.c b/drivers/net/ethernet/microchip/sparx5/sparx5_mactable.c index 0443f66b5550..9a8e4f201eb1 100644 --- a/drivers/net/ethernet/microchip/sparx5/sparx5_mactable.c +++ b/drivers/net/ethernet/microchip/sparx5/sparx5_mactable.c @@ -277,7 +277,7 @@ static void sparx5_fdb_call_notifiers(enum switchdev_notifier_type type, const char *mac, u16 vid, struct net_device *dev, bool offloaded) { - struct switchdev_notifier_fdb_info info; + struct switchdev_notifier_fdb_info info = {}; info.addr = mac; info.vid = vid; diff --git a/drivers/net/ethernet/rocker/rocker_main.c b/drivers/net/ethernet/rocker/rocker_main.c index a46633606cae..1f06b92ee5bb 100644 --- a/drivers/net/ethernet/rocker/rocker_main.c +++ b/drivers/net/ethernet/rocker/rocker_main.c @@ -2715,7 +2715,7 @@ static void rocker_fdb_offload_notify(struct rocker_port *rocker_port, struct switchdev_notifier_fdb_info *recv_info) { - struct switchdev_notifier_fdb_info info; + struct switchdev_notifier_fdb_info info = {}; info.addr = recv_info->addr; info.vid = recv_info->vid; diff --git a/drivers/net/ethernet/rocker/rocker_ofdpa.c b/drivers/net/ethernet/rocker/rocker_ofdpa.c index 967a634ee9ac..e33a9d283a4e 100644 --- a/drivers/net/ethernet/rocker/rocker_ofdpa.c +++ b/drivers/net/ethernet/rocker/rocker_ofdpa.c @@ -1822,7 +1822,7 @@ static void ofdpa_port_fdb_learn_work(struct work_struct *work) container_of(work, struct ofdpa_fdb_learn_work, work); bool removing = (lw->flags & OFDPA_OP_FLAG_REMOVE); bool learned = (lw->flags & OFDPA_OP_FLAG_LEARNED); - struct switchdev_notifier_fdb_info info; + struct switchdev_notifier_fdb_info info = {}; info.addr = lw->addr; info.vid = lw->vid; diff --git a/drivers/net/ethernet/ti/am65-cpsw-switchdev.c b/drivers/net/ethernet/ti/am65-cpsw-switchdev.c index 9c29b363e9ae..599708a3e81d 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-switchdev.c +++ b/drivers/net/ethernet/ti/am65-cpsw-switchdev.c @@ -358,7 +358,7 @@ static int am65_cpsw_port_obj_del(struct net_device *ndev, const void *ctx, static void am65_cpsw_fdb_offload_notify(struct net_device *ndev, struct switchdev_notifier_fdb_info *rcv) { - struct switchdev_notifier_fdb_info info; + struct switchdev_notifier_fdb_info info = {}; info.addr = rcv->addr; info.vid = rcv->vid; diff --git a/drivers/net/ethernet/ti/cpsw_switchdev.c b/drivers/net/ethernet/ti/cpsw_switchdev.c index f7fb6e17dadd..a7d97d429e06 100644 --- a/drivers/net/ethernet/ti/cpsw_switchdev.c +++ b/drivers/net/ethernet/ti/cpsw_switchdev.c @@ -368,7 +368,7 @@ static int cpsw_port_obj_del(struct net_device *ndev, const void *ctx, static void cpsw_fdb_offload_notify(struct net_device *ndev, struct switchdev_notifier_fdb_info *rcv) { - struct switchdev_notifier_fdb_info info; + struct switchdev_notifier_fdb_info info = {}; info.addr = rcv->addr; info.vid = rcv->vid; diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c index 2abf86c104d5..d7cdd9cfe485 100644 --- a/drivers/s390/net/qeth_l2_main.c +++ b/drivers/s390/net/qeth_l2_main.c @@ -279,7 +279,7 @@ static void qeth_l2_set_pnso_mode(struct qeth_card *card, static void qeth_l2_dev2br_fdb_flush(struct qeth_card *card) { - struct switchdev_notifier_fdb_info info; + struct switchdev_notifier_fdb_info info = {}; QETH_CARD_TEXT(card, 2, "fdbflush"); @@ -679,7 +679,7 @@ static void qeth_l2_dev2br_fdb_notify(struct qeth_card *card, u8 code, struct net_if_token *token, struct mac_addr_lnid *addr_lnid) { - struct switchdev_notifier_fdb_info info; + struct switchdev_notifier_fdb_info info = {}; u8 ntfy_mac[ETH_ALEN]; ether_addr_copy(ntfy_mac, addr_lnid->mac); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 532085da8d8f..23be8e01026b 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -2291,8 +2291,8 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, static void dsa_fdb_offload_notify(struct dsa_switchdev_event_work *switchdev_work) { + struct switchdev_notifier_fdb_info info = {}; struct dsa_switch *ds = switchdev_work->ds; - struct switchdev_notifier_fdb_info info; struct dsa_port *dp; if (!dsa_is_user_port(ds, switchdev_work->port)) From 519133debcc19f5c834e7e28480b60bdc234fe02 Mon Sep 17 00:00:00 2001 From: Yang Yingliang <yangyingliang@huawei.com> Date: Mon, 9 Aug 2021 21:20:23 +0800 Subject: [PATCH 77/87] net: bridge: fix memleak in br_add_if() I got a memleak report: BUG: memory leak unreferenced object 0x607ee521a658 (size 240): comm "syz-executor.0", pid 955, jiffies 4294780569 (age 16.449s) hex dump (first 32 bytes, cpu 1): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000d830ea5a>] br_multicast_add_port+0x1c2/0x300 net/bridge/br_multicast.c:1693 [<00000000274d9a71>] new_nbp net/bridge/br_if.c:435 [inline] [<00000000274d9a71>] br_add_if+0x670/0x1740 net/bridge/br_if.c:611 [<0000000012ce888e>] do_set_master net/core/rtnetlink.c:2513 [inline] [<0000000012ce888e>] do_set_master+0x1aa/0x210 net/core/rtnetlink.c:2487 [<0000000099d1cafc>] __rtnl_newlink+0x1095/0x13e0 net/core/rtnetlink.c:3457 [<00000000a01facc0>] rtnl_newlink+0x64/0xa0 net/core/rtnetlink.c:3488 [<00000000acc9186c>] rtnetlink_rcv_msg+0x369/0xa10 net/core/rtnetlink.c:5550 [<00000000d4aabb9c>] netlink_rcv_skb+0x134/0x3d0 net/netlink/af_netlink.c:2504 [<00000000bc2e12a3>] netlink_unicast_kernel net/netlink/af_netlink.c:1314 [inline] [<00000000bc2e12a3>] netlink_unicast+0x4a0/0x6a0 net/netlink/af_netlink.c:1340 [<00000000e4dc2d0e>] netlink_sendmsg+0x789/0xc70 net/netlink/af_netlink.c:1929 [<000000000d22c8b3>] sock_sendmsg_nosec net/socket.c:654 [inline] [<000000000d22c8b3>] sock_sendmsg+0x139/0x170 net/socket.c:674 [<00000000e281417a>] ____sys_sendmsg+0x658/0x7d0 net/socket.c:2350 [<00000000237aa2ab>] ___sys_sendmsg+0xf8/0x170 net/socket.c:2404 [<000000004f2dc381>] __sys_sendmsg+0xd3/0x190 net/socket.c:2433 [<0000000005feca6c>] do_syscall_64+0x37/0x90 arch/x86/entry/common.c:47 [<000000007304477d>] entry_SYSCALL_64_after_hwframe+0x44/0xae On error path of br_add_if(), p->mcast_stats allocated in new_nbp() need be freed, or it will be leaked. Fixes: 1080ab95e3c7 ("net: bridge: add support for IGMP/MLD stats and export them via netlink") Reported-by: Hulk Robot <hulkci@huawei.com> Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> Acked-by: Nikolay Aleksandrov <nikolay@nvidia.com> Link: https://lore.kernel.org/r/20210809132023.978546-1-yangyingliang@huawei.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/bridge/br_if.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 6e4a32354a13..14cd6ef96111 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -616,6 +616,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev, err = dev_set_allmulti(dev, 1); if (err) { + br_multicast_del_port(p); kfree(p); /* kobject not yet init'd, manually free */ goto err1; } @@ -729,6 +730,7 @@ err4: err3: sysfs_remove_link(br->ifobj, p->dev->name); err2: + br_multicast_del_port(p); kobject_put(&p->kobj); dev_set_allmulti(dev, -1); err1: From 6922110d152e56d7569616b45a1f02876cf3eb9f Mon Sep 17 00:00:00 2001 From: Willy Tarreau <w@1wt.eu> Date: Mon, 9 Aug 2021 18:06:28 +0200 Subject: [PATCH 78/87] net: linkwatch: fix failure to restore device state across suspend/resume After migrating my laptop from 4.19-LTS to 5.4-LTS a while ago I noticed that my Ethernet port to which a bond and a VLAN interface are attached appeared to remain up after resuming from suspend with the cable unplugged (and that problem still persists with 5.10-LTS). It happens that the following happens: - the network driver (e1000e here) prepares to suspend, calls e1000e_down() which calls netif_carrier_off() to signal that the link is going down. - netif_carrier_off() adds a link_watch event to the list of events for this device - the device is completely stopped. - the machine suspends - the cable is unplugged and the machine brought to another location - the machine is resumed - the queued linkwatch events are processed for the device - the device doesn't yet have the __LINK_STATE_PRESENT bit and its events are silently dropped - the device is resumed with its link down - the upper VLAN and bond interfaces are never notified that the link had been turned down and remain up - the only way to provoke a change is to physically connect the machine to a port and possibly unplug it. The state after resume looks like this: $ ip -br li | egrep 'bond|eth' bond0 UP e8:6a:64:64:64:64 <BROADCAST,MULTICAST,MASTER,UP,LOWER_UP> eth0 DOWN e8:6a:64:64:64:64 <NO-CARRIER,BROADCAST,MULTICAST,SLAVE,UP> eth0.2@eth0 UP e8:6a:64:64:64:64 <BROADCAST,MULTICAST,SLAVE,UP,LOWER_UP> Placing an explicit call to netdev_state_change() either in the suspend or the resume code in the NIC driver worked around this but the solution is not satisfying. The issue in fact really is in link_watch that loses events while it ought not to. It happens that the test for the device being present was added by commit 124eee3f6955 ("net: linkwatch: add check for netdevice being present to linkwatch_do_dev") in 4.20 to avoid an access to devices that are not present. Instead of dropping events, this patch proceeds slightly differently by postponing their handling so that they happen after the device is fully resumed. Fixes: 124eee3f6955 ("net: linkwatch: add check for netdevice being present to linkwatch_do_dev") Link: https://lists.openwall.net/netdev/2018/03/15/62 Cc: Heiner Kallweit <hkallweit1@gmail.com> Cc: Geert Uytterhoeven <geert+renesas@glider.be> Cc: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: Willy Tarreau <w@1wt.eu> Link: https://lore.kernel.org/r/20210809160628.22623-1-w@1wt.eu Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/core/link_watch.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 75431ca9300f..1a455847da54 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -158,7 +158,7 @@ static void linkwatch_do_dev(struct net_device *dev) clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); rfc2863_policy(dev); - if (dev->flags & IFF_UP && netif_device_present(dev)) { + if (dev->flags & IFF_UP) { if (netif_carrier_ok(dev)) dev_activate(dev); else @@ -204,7 +204,8 @@ static void __linkwatch_run_queue(int urgent_only) dev = list_first_entry(&wrk, struct net_device, link_watch_list); list_del_init(&dev->link_watch_list); - if (urgent_only && !linkwatch_urgent_event(dev)) { + if (!netif_device_present(dev) || + (urgent_only && !linkwatch_urgent_event(dev))) { list_add_tail(&dev->link_watch_list, &lweventlist); continue; } From 2cad5d2ed1b47eded5a2f2372c2a94bb065a8f97 Mon Sep 17 00:00:00 2001 From: Wong Vee Khee <vee.khee.wong@linux.intel.com> Date: Tue, 10 Aug 2021 16:58:12 +0800 Subject: [PATCH 79/87] net: pcs: xpcs: fix error handling on failed to allocate memory Drivers such as sja1105 and stmmac that call xpcs_create() expects an error returned by the pcs-xpcs module, but this was not the case on failed to allocate memory. Fixed this by returning an -ENOMEM instead of a NULL pointer. Fixes: 3ad1d171548e ("net: dsa: sja1105: migrate to xpcs for SGMII") Signed-off-by: Wong Vee Khee <vee.khee.wong@linux.intel.com> Reviewed-by: Vladimir Oltean <vladimir.oltean@nxp.com> Link: https://lore.kernel.org/r/20210810085812.1808466-1-vee.khee.wong@linux.intel.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- drivers/net/pcs/pcs-xpcs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/pcs/pcs-xpcs.c b/drivers/net/pcs/pcs-xpcs.c index 63fda3fc40aa..4bd61339823c 100644 --- a/drivers/net/pcs/pcs-xpcs.c +++ b/drivers/net/pcs/pcs-xpcs.c @@ -1089,7 +1089,7 @@ struct dw_xpcs *xpcs_create(struct mdio_device *mdiodev, xpcs = kzalloc(sizeof(*xpcs), GFP_KERNEL); if (!xpcs) - return NULL; + return ERR_PTR(-ENOMEM); xpcs->mdiodev = mdiodev; From 6de035fec045f8ae5ee5f3a02373a18b939e91fb Mon Sep 17 00:00:00 2001 From: Neal Cardwell <ncardwell@google.com> Date: Tue, 10 Aug 2021 22:40:56 -0400 Subject: [PATCH 80/87] tcp_bbr: fix u32 wrap bug in round logic if bbr_init() called after 2B packets Currently if BBR congestion control is initialized after more than 2B packets have been delivered, depending on the phase of the tp->delivered counter the tracking of BBR round trips can get stuck. The bug arises because if tp->delivered is between 2^31 and 2^32 at the time the BBR congestion control module is initialized, then the initialization of bbr->next_rtt_delivered to 0 will cause the logic to believe that the end of the round trip is still billions of packets in the future. More specifically, the following check will fail repeatedly: !before(rs->prior_delivered, bbr->next_rtt_delivered) and thus the connection will take up to 2B packets delivered before that check will pass and the connection will set: bbr->round_start = 1; This could cause many mechanisms in BBR to fail to trigger, for example bbr_check_full_bw_reached() would likely never exit STARTUP. This bug is 5 years old and has not been observed, and as a practical matter this would likely rarely trigger, since it would require transferring at least 2B packets, or likely more than 3 terabytes of data, before switching congestion control algorithms to BBR. This patch is a stable candidate for kernels as far back as v4.9, when tcp_bbr.c was added. Fixes: 0f8782ea1497 ("tcp_bbr: add BBR congestion control") Signed-off-by: Neal Cardwell <ncardwell@google.com> Reviewed-by: Yuchung Cheng <ycheng@google.com> Reviewed-by: Kevin Yang <yyd@google.com> Reviewed-by: Eric Dumazet <edumazet@google.com> Link: https://lore.kernel.org/r/20210811024056.235161-1-ncardwell@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/ipv4/tcp_bbr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c index 6ea3dc2e4219..6274462b86b4 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -1041,7 +1041,7 @@ static void bbr_init(struct sock *sk) bbr->prior_cwnd = 0; tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; bbr->rtt_cnt = 0; - bbr->next_rtt_delivered = 0; + bbr->next_rtt_delivered = tp->delivered; bbr->prev_ca_state = TCP_CA_Open; bbr->packet_conservation = 0; From 0271824d9ebe945a2ecefdb87e1ce0a520be704d Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov <s.shtylyov@omp.ru> Date: Tue, 10 Aug 2021 23:17:12 +0300 Subject: [PATCH 81/87] MAINTAINERS: switch to my OMP email for Renesas Ethernet drivers I'm still going to continue looking after the Renesas Ethernet drivers and device tree bindings. Now my new employer, Open Mobile Platform (OMP), will pay for all my upstream work. Let's switch to my OMP email for the reviews. Signed-off-by: Sergey Shtylyov <s.shtylyov@omp.ru> Link: https://lore.kernel.org/r/9c212711-a0d7-39cd-7840-ff7abf938da1@omp.ru Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 25dc566a67c1..f13c0d3b7d0c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15803,7 +15803,7 @@ F: Documentation/devicetree/bindings/i2c/renesas,iic-emev2.yaml F: drivers/i2c/busses/i2c-emev2.c RENESAS ETHERNET DRIVERS -R: Sergei Shtylyov <sergei.shtylyov@gmail.com> +R: Sergey Shtylyov <s.shtylyov@omp.ru> L: netdev@vger.kernel.org L: linux-renesas-soc@vger.kernel.org F: Documentation/devicetree/bindings/net/renesas,*.yaml From b69dd5b3780a7298bd893816a09da751bc0636f7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet <edumazet@google.com> Date: Wed, 11 Aug 2021 12:57:15 -0700 Subject: [PATCH 82/87] net: igmp: increase size of mr_ifc_count Some arches support cmpxchg() on 4-byte and 8-byte only. Increase mr_ifc_count width to 32bit to fix this problem. Fixes: 4a2b285e7e10 ("net: igmp: fix data-race in igmp_ifc_timer_expire()") Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: Guenter Roeck <linux@roeck-us.net> Link: https://lore.kernel.org/r/20210811195715.3684218-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- include/linux/inetdevice.h | 2 +- net/ipv4/igmp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index 53aa0343bf69..aaf4f1b4c277 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -41,7 +41,7 @@ struct in_device { unsigned long mr_qri; /* Query Response Interval */ unsigned char mr_qrv; /* Query Robustness Variable */ unsigned char mr_gq_running; - unsigned char mr_ifc_count; + u32 mr_ifc_count; struct timer_list mr_gq_timer; /* general query timer */ struct timer_list mr_ifc_timer; /* interface change timer */ diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index a51360087b19..00576bae183d 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -803,7 +803,7 @@ static void igmp_gq_timer_expire(struct timer_list *t) static void igmp_ifc_timer_expire(struct timer_list *t) { struct in_device *in_dev = from_timer(in_dev, t, mr_ifc_timer); - u8 mr_ifc_count; + u32 mr_ifc_count; igmpv3_send_cr(in_dev); restart: From 48c812e0327744b4965296f65c23fe2405692afc Mon Sep 17 00:00:00 2001 From: Mark Brown <broonie@kernel.org> Date: Tue, 10 Aug 2021 13:37:48 +0100 Subject: [PATCH 83/87] net: mscc: Fix non-GPL export of regmap APIs The ocelot driver makes use of regmap, wrapping it with driver specific operations that are thin wrappers around the core regmap APIs. These are exported with EXPORT_SYMBOL, dropping the _GPL from the core regmap exports which is frowned upon. Add _GPL suffixes to at least the APIs that are doing register I/O. Signed-off-by: Mark Brown <broonie@kernel.org> Acked-by: Alexandre Belloni <alexandre.belloni@bootlin.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/ethernet/mscc/ocelot_io.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot_io.c b/drivers/net/ethernet/mscc/ocelot_io.c index ea4e83410fe4..7390fa3980ec 100644 --- a/drivers/net/ethernet/mscc/ocelot_io.c +++ b/drivers/net/ethernet/mscc/ocelot_io.c @@ -21,7 +21,7 @@ u32 __ocelot_read_ix(struct ocelot *ocelot, u32 reg, u32 offset) ocelot->map[target][reg & REG_MASK] + offset, &val); return val; } -EXPORT_SYMBOL(__ocelot_read_ix); +EXPORT_SYMBOL_GPL(__ocelot_read_ix); void __ocelot_write_ix(struct ocelot *ocelot, u32 val, u32 reg, u32 offset) { @@ -32,7 +32,7 @@ void __ocelot_write_ix(struct ocelot *ocelot, u32 val, u32 reg, u32 offset) regmap_write(ocelot->targets[target], ocelot->map[target][reg & REG_MASK] + offset, val); } -EXPORT_SYMBOL(__ocelot_write_ix); +EXPORT_SYMBOL_GPL(__ocelot_write_ix); void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask, u32 reg, u32 offset) @@ -45,7 +45,7 @@ void __ocelot_rmw_ix(struct ocelot *ocelot, u32 val, u32 mask, u32 reg, ocelot->map[target][reg & REG_MASK] + offset, mask, val); } -EXPORT_SYMBOL(__ocelot_rmw_ix); +EXPORT_SYMBOL_GPL(__ocelot_rmw_ix); u32 ocelot_port_readl(struct ocelot_port *port, u32 reg) { @@ -58,7 +58,7 @@ u32 ocelot_port_readl(struct ocelot_port *port, u32 reg) regmap_read(port->target, ocelot->map[target][reg & REG_MASK], &val); return val; } -EXPORT_SYMBOL(ocelot_port_readl); +EXPORT_SYMBOL_GPL(ocelot_port_readl); void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg) { @@ -69,7 +69,7 @@ void ocelot_port_writel(struct ocelot_port *port, u32 val, u32 reg) regmap_write(port->target, ocelot->map[target][reg & REG_MASK], val); } -EXPORT_SYMBOL(ocelot_port_writel); +EXPORT_SYMBOL_GPL(ocelot_port_writel); void ocelot_port_rmwl(struct ocelot_port *port, u32 val, u32 mask, u32 reg) { @@ -77,7 +77,7 @@ void ocelot_port_rmwl(struct ocelot_port *port, u32 val, u32 mask, u32 reg) ocelot_port_writel(port, (cur & (~mask)) | val, reg); } -EXPORT_SYMBOL(ocelot_port_rmwl); +EXPORT_SYMBOL_GPL(ocelot_port_rmwl); u32 __ocelot_target_read_ix(struct ocelot *ocelot, enum ocelot_target target, u32 reg, u32 offset) @@ -128,7 +128,7 @@ int ocelot_regfields_init(struct ocelot *ocelot, return 0; } -EXPORT_SYMBOL(ocelot_regfields_init); +EXPORT_SYMBOL_GPL(ocelot_regfields_init); static struct regmap_config ocelot_regmap_config = { .reg_bits = 32, @@ -148,4 +148,4 @@ struct regmap *ocelot_regmap_init(struct ocelot *ocelot, struct resource *res) return devm_regmap_init_mmio(ocelot->dev, regs, &ocelot_regmap_config); } -EXPORT_SYMBOL(ocelot_regmap_init); +EXPORT_SYMBOL_GPL(ocelot_regmap_init); From 86704993e6a5989e256b4212ca03115cc2694eda Mon Sep 17 00:00:00 2001 From: Hoang Le <hoang.h.le@dektech.com.au> Date: Wed, 11 Aug 2021 08:22:09 +0700 Subject: [PATCH 84/87] Revert "tipc: Return the correct errno code" This reverts commit 0efea3c649f0 because of: - The returning -ENOBUF error is fine on socket buffer allocation. - There is side effect in the calling path tipc_node_xmit()->tipc_link_xmit() when checking error code returning. Fixes: 0efea3c649f0 ("tipc: Return the correct errno code") Acked-by: Jon Maloy <jmaloy@redhat.com> Signed-off-by: Hoang Le <hoang.h.le@dektech.com.au> Signed-off-by: David S. Miller <davem@davemloft.net> --- net/tipc/link.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/tipc/link.c b/net/tipc/link.c index cf586840caeb..1b7a487c8841 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -913,7 +913,7 @@ static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr) skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0, dnode, l->addr, dport, 0, 0); if (!skb) - return -ENOMEM; + return -ENOBUFS; msg_set_dest_droppable(buf_msg(skb), true); TIPC_SKB_CB(skb)->chain_imp = msg_importance(hdr); skb_queue_tail(&l->wakeupq, skb); @@ -1031,7 +1031,7 @@ void tipc_link_reset(struct tipc_link *l) * * Consumes the buffer chain. * Messages at TIPC_SYSTEM_IMPORTANCE are always accepted - * Return: 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS or -ENOMEM + * Return: 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS */ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, struct sk_buff_head *xmitq) @@ -1089,7 +1089,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list, if (!_skb) { kfree_skb(skb); __skb_queue_purge(list); - return -ENOMEM; + return -ENOBUFS; } __skb_queue_tail(transmq, skb); tipc_link_set_skb_retransmit_time(skb, l); From 700fa08da43edb0af3e6a513f0255443e96088e8 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean <vladimir.oltean@nxp.com> Date: Wed, 11 Aug 2021 14:59:45 +0300 Subject: [PATCH 85/87] net: dsa: sja1105: unregister the MDIO buses during teardown The call to sja1105_mdiobus_unregister is present in the error path but absent from the main driver unbind path. Fixes: 5a8f09748ee7 ("net: dsa: sja1105: register the MDIO buses for 100base-T1 and 100base-TX") Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com> Reviewed-by: Florian Fainelli <f.fainelli@gmail.com> Signed-off-by: David S. Miller <davem@davemloft.net> --- drivers/net/dsa/sja1105/sja1105_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index b188a80eeec6..49eb0ac41b7d 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -3187,6 +3187,7 @@ static void sja1105_teardown(struct dsa_switch *ds) } sja1105_devlink_teardown(ds); + sja1105_mdiobus_unregister(ds); sja1105_flower_teardown(ds); sja1105_tas_teardown(ds); sja1105_ptp_clock_unregister(ds); From d9d5b8961284b0051726e0fcda91d1e297e087f5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko <andriy.shevchenko@linux.intel.com> Date: Wed, 11 Aug 2021 15:48:45 +0300 Subject: [PATCH 86/87] wwan: core: Avoid returning NULL from wwan_create_dev() Make wwan_create_dev() to return either valid or error pointer, In some cases it may return NULL. Prevent this by converting it to the respective error pointer. Fixes: 9a44c1cc6388 ("net: Add a WWAN subsystem") Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com> Acked-by: Sergey Ryazanov <ryazanov.s.a@gmail.com> Reviewed-by: Loic Poulain <loic.poulain@linaro.org> Link: https://lore.kernel.org/r/20210811124845.10955-1-andriy.shevchenko@linux.intel.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- drivers/net/wwan/wwan_core.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c index 674a81d79db3..35ece98134c0 100644 --- a/drivers/net/wwan/wwan_core.c +++ b/drivers/net/wwan/wwan_core.c @@ -164,11 +164,14 @@ static struct wwan_device *wwan_create_dev(struct device *parent) goto done_unlock; id = ida_alloc(&wwan_dev_ids, GFP_KERNEL); - if (id < 0) + if (id < 0) { + wwandev = ERR_PTR(id); goto done_unlock; + } wwandev = kzalloc(sizeof(*wwandev), GFP_KERNEL); if (!wwandev) { + wwandev = ERR_PTR(-ENOMEM); ida_free(&wwan_dev_ids, id); goto done_unlock; } @@ -182,7 +185,8 @@ static struct wwan_device *wwan_create_dev(struct device *parent) err = device_register(&wwandev->dev); if (err) { put_device(&wwandev->dev); - wwandev = NULL; + wwandev = ERR_PTR(err); + goto done_unlock; } done_unlock: @@ -1014,8 +1018,8 @@ int wwan_register_ops(struct device *parent, const struct wwan_ops *ops, return -EINVAL; wwandev = wwan_create_dev(parent); - if (!wwandev) - return -ENOMEM; + if (IS_ERR(wwandev)) + return PTR_ERR(wwandev); if (WARN_ON(wwandev->ops)) { wwan_remove_dev(wwandev); From 49b0b6ffe20c5344f4173f3436298782a08da4f2 Mon Sep 17 00:00:00 2001 From: "Longpeng(Mike)" <longpeng2@huawei.com> Date: Thu, 12 Aug 2021 13:30:56 +0800 Subject: [PATCH 87/87] vsock/virtio: avoid potential deadlock when vsock device remove There's a potential deadlock case when remove the vsock device or process the RESET event: vsock_for_each_connected_socket: spin_lock_bh(&vsock_table_lock) ----------- (1) ... virtio_vsock_reset_sock: lock_sock(sk) --------------------- (2) ... spin_unlock_bh(&vsock_table_lock) lock_sock() may do initiative schedule when the 'sk' is owned by other thread at the same time, we would receivce a warning message that "scheduling while atomic". Even worse, if the next task (selected by the scheduler) try to release a 'sk', it need to request vsock_table_lock and the deadlock occur, cause the system into softlockup state. Call trace: queued_spin_lock_slowpath vsock_remove_bound vsock_remove_sock virtio_transport_release __vsock_release vsock_release __sock_release sock_close __fput ____fput So we should not require sk_lock in this case, just like the behavior in vhost_vsock or vmci. Fixes: 0ea9e1d3a9e3 ("VSOCK: Introduce virtio_transport.ko") Cc: Stefan Hajnoczi <stefanha@redhat.com> Signed-off-by: Longpeng(Mike) <longpeng2@huawei.com> Reviewed-by: Stefano Garzarella <sgarzare@redhat.com> Link: https://lore.kernel.org/r/20210812053056.1699-1-longpeng2@huawei.com Signed-off-by: Jakub Kicinski <kuba@kernel.org> --- net/vmw_vsock/virtio_transport.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index e0c2c992ad9c..4f7c99dfd16c 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -357,11 +357,14 @@ static void virtio_vsock_event_fill(struct virtio_vsock *vsock) static void virtio_vsock_reset_sock(struct sock *sk) { - lock_sock(sk); + /* vmci_transport.c doesn't take sk_lock here either. At least we're + * under vsock_table_lock so the sock cannot disappear while we're + * executing. + */ + sk->sk_state = TCP_CLOSE; sk->sk_err = ECONNRESET; sk_error_report(sk); - release_sock(sk); } static void virtio_vsock_update_guest_cid(struct virtio_vsock *vsock)