mirror of
https://github.com/torvalds/linux.git
synced 2024-12-29 14:21:47 +00:00
d1b4c689d4
mmapped netlink has a number of unresolved issues: - TX zerocopy support had to be disabled more than a year ago via commit4682a03586
("netlink: Always copy on mmap TX.") because the content of the mmapped area can change after netlink attribute validation but before message processing. - RX support was implemented mainly to speed up nfqueue dumping packet payload to userspace. However, since commitae08ce0021
("netfilter: nfnetlink_queue: zero copy support") we avoid one copy with the socket-based interface too (via the skb_zerocopy helper). The other problem is that skbs attached to mmaped netlink socket behave different from normal skbs: - they don't have a shinfo area, so all functions that use skb_shinfo() (e.g. skb_clone) cannot be used. - reserving headroom prevents userspace from seeing the content as it expects message to start at skb->head. See for instance commitaa3a022094
("netlink: not trim skb for mmaped socket when dump"). - skbs handed e.g. to netlink_ack must have non-NULL skb->sk, else we crash because it needs the sk to check if a tx ring is attached. Also not obvious, leads to non-intuitive bug fixes such as7c7bdf359
("netfilter: nfnetlink: use original skbuff when acking batches"). mmaped netlink also didn't play nicely with the skb_zerocopy helper used by nfqueue and openvswitch. Daniel Borkmann fixed this via commit6bb0fef489
("netlink, mmap: fix edge-case leakages in nf queue zero-copy")' but at the cost of also needing to provide remaining length to the allocation function. nfqueue also has problems when used with mmaped rx netlink: - mmaped netlink doesn't allow use of nfqueue batch verdict messages. Problem is that in the mmap case, the allocation time also determines the ordering in which the frame will be seen by userspace (A allocating before B means that A is located in earlier ring slot, but this also means that B might get a lower sequence number then A since seqno is decided later. To fix this we would need to extend the spinlocked region to also cover the allocation and message setup which isn't desirable. - nfqueue can now be configured to queue large (GSO) skbs to userspace. Queing GSO packets is faster than having to force a software segmentation in the kernel, so this is a desirable option. However, with a mmap based ring one has to use 64kb per ring slot element, else mmap has to fall back to the socket path (NL_MMAP_STATUS_COPY) for all large packets. To use the mmap interface, userspace not only has to probe for mmap netlink support, it also has to implement a recv/socket receive path in order to handle messages that exceed the size of an rx ring element. Cc: Daniel Borkmann <daniel@iogearbox.net> Cc: Ken-ichirou MATSUZAWA <chamaken@gmail.com> Cc: Pablo Neira Ayuso <pablo@netfilter.org> Cc: Patrick McHardy <kaber@trash.net> Cc: Thomas Graf <tgraf@suug.ch> Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: David S. Miller <davem@davemloft.net>
200 lines
4.2 KiB
C
200 lines
4.2 KiB
C
#include <linux/module.h>
|
|
|
|
#include <net/sock.h>
|
|
#include <linux/netlink.h>
|
|
#include <linux/sock_diag.h>
|
|
#include <linux/netlink_diag.h>
|
|
#include <linux/rhashtable.h>
|
|
|
|
#include "af_netlink.h"
|
|
|
|
static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb)
|
|
{
|
|
struct netlink_sock *nlk = nlk_sk(sk);
|
|
|
|
if (nlk->groups == NULL)
|
|
return 0;
|
|
|
|
return nla_put(nlskb, NETLINK_DIAG_GROUPS, NLGRPSZ(nlk->ngroups),
|
|
nlk->groups);
|
|
}
|
|
|
|
static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
|
|
struct netlink_diag_req *req,
|
|
u32 portid, u32 seq, u32 flags, int sk_ino)
|
|
{
|
|
struct nlmsghdr *nlh;
|
|
struct netlink_diag_msg *rep;
|
|
struct netlink_sock *nlk = nlk_sk(sk);
|
|
|
|
nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rep),
|
|
flags);
|
|
if (!nlh)
|
|
return -EMSGSIZE;
|
|
|
|
rep = nlmsg_data(nlh);
|
|
rep->ndiag_family = AF_NETLINK;
|
|
rep->ndiag_type = sk->sk_type;
|
|
rep->ndiag_protocol = sk->sk_protocol;
|
|
rep->ndiag_state = sk->sk_state;
|
|
|
|
rep->ndiag_ino = sk_ino;
|
|
rep->ndiag_portid = nlk->portid;
|
|
rep->ndiag_dst_portid = nlk->dst_portid;
|
|
rep->ndiag_dst_group = nlk->dst_group;
|
|
sock_diag_save_cookie(sk, rep->ndiag_cookie);
|
|
|
|
if ((req->ndiag_show & NDIAG_SHOW_GROUPS) &&
|
|
sk_diag_dump_groups(sk, skb))
|
|
goto out_nlmsg_trim;
|
|
|
|
if ((req->ndiag_show & NDIAG_SHOW_MEMINFO) &&
|
|
sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO))
|
|
goto out_nlmsg_trim;
|
|
|
|
nlmsg_end(skb, nlh);
|
|
return 0;
|
|
|
|
out_nlmsg_trim:
|
|
nlmsg_cancel(skb, nlh);
|
|
return -EMSGSIZE;
|
|
}
|
|
|
|
static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
|
|
int protocol, int s_num)
|
|
{
|
|
struct netlink_table *tbl = &nl_table[protocol];
|
|
struct rhashtable *ht = &tbl->hash;
|
|
const struct bucket_table *htbl = rht_dereference_rcu(ht->tbl, ht);
|
|
struct net *net = sock_net(skb->sk);
|
|
struct netlink_diag_req *req;
|
|
struct netlink_sock *nlsk;
|
|
struct sock *sk;
|
|
int ret = 0, num = 0, i;
|
|
|
|
req = nlmsg_data(cb->nlh);
|
|
|
|
for (i = 0; i < htbl->size; i++) {
|
|
struct rhash_head *pos;
|
|
|
|
rht_for_each_entry_rcu(nlsk, pos, htbl, i, node) {
|
|
sk = (struct sock *)nlsk;
|
|
|
|
if (!net_eq(sock_net(sk), net))
|
|
continue;
|
|
if (num < s_num) {
|
|
num++;
|
|
continue;
|
|
}
|
|
|
|
if (sk_diag_fill(sk, skb, req,
|
|
NETLINK_CB(cb->skb).portid,
|
|
cb->nlh->nlmsg_seq,
|
|
NLM_F_MULTI,
|
|
sock_i_ino(sk)) < 0) {
|
|
ret = 1;
|
|
goto done;
|
|
}
|
|
|
|
num++;
|
|
}
|
|
}
|
|
|
|
sk_for_each_bound(sk, &tbl->mc_list) {
|
|
if (sk_hashed(sk))
|
|
continue;
|
|
if (!net_eq(sock_net(sk), net))
|
|
continue;
|
|
if (num < s_num) {
|
|
num++;
|
|
continue;
|
|
}
|
|
|
|
if (sk_diag_fill(sk, skb, req,
|
|
NETLINK_CB(cb->skb).portid,
|
|
cb->nlh->nlmsg_seq,
|
|
NLM_F_MULTI,
|
|
sock_i_ino(sk)) < 0) {
|
|
ret = 1;
|
|
goto done;
|
|
}
|
|
num++;
|
|
}
|
|
done:
|
|
cb->args[0] = num;
|
|
cb->args[1] = protocol;
|
|
|
|
return ret;
|
|
}
|
|
|
|
static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
|
|
{
|
|
struct netlink_diag_req *req;
|
|
int s_num = cb->args[0];
|
|
|
|
req = nlmsg_data(cb->nlh);
|
|
|
|
rcu_read_lock();
|
|
read_lock(&nl_table_lock);
|
|
|
|
if (req->sdiag_protocol == NDIAG_PROTO_ALL) {
|
|
int i;
|
|
|
|
for (i = cb->args[1]; i < MAX_LINKS; i++) {
|
|
if (__netlink_diag_dump(skb, cb, i, s_num))
|
|
break;
|
|
s_num = 0;
|
|
}
|
|
} else {
|
|
if (req->sdiag_protocol >= MAX_LINKS) {
|
|
read_unlock(&nl_table_lock);
|
|
rcu_read_unlock();
|
|
return -ENOENT;
|
|
}
|
|
|
|
__netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num);
|
|
}
|
|
|
|
read_unlock(&nl_table_lock);
|
|
rcu_read_unlock();
|
|
|
|
return skb->len;
|
|
}
|
|
|
|
static int netlink_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
|
|
{
|
|
int hdrlen = sizeof(struct netlink_diag_req);
|
|
struct net *net = sock_net(skb->sk);
|
|
|
|
if (nlmsg_len(h) < hdrlen)
|
|
return -EINVAL;
|
|
|
|
if (h->nlmsg_flags & NLM_F_DUMP) {
|
|
struct netlink_dump_control c = {
|
|
.dump = netlink_diag_dump,
|
|
};
|
|
return netlink_dump_start(net->diag_nlsk, skb, h, &c);
|
|
} else
|
|
return -EOPNOTSUPP;
|
|
}
|
|
|
|
static const struct sock_diag_handler netlink_diag_handler = {
|
|
.family = AF_NETLINK,
|
|
.dump = netlink_diag_handler_dump,
|
|
};
|
|
|
|
static int __init netlink_diag_init(void)
|
|
{
|
|
return sock_diag_register(&netlink_diag_handler);
|
|
}
|
|
|
|
static void __exit netlink_diag_exit(void)
|
|
{
|
|
sock_diag_unregister(&netlink_diag_handler);
|
|
}
|
|
|
|
module_init(netlink_diag_init);
|
|
module_exit(netlink_diag_exit);
|
|
MODULE_LICENSE("GPL");
|
|
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 16 /* AF_NETLINK */);
|