linux/net/netlink/af_netlink.c
Jakub Kicinski 5fbf57a937 net: netlink: remove the cb_mutex "injection" from netlink core
Back in 2007, in commit af65bdfce9 ("[NETLINK]: Switch cb_lock spinlock
to mutex and allow to override it") netlink core was extended to allow
subsystems to replace the dump mutex lock with its own lock.

The mechanism was used by rtnetlink to take rtnl_lock but it isn't
sufficiently flexible for other users. Over the 17 years since
it was added no other user appeared. Since rtnetlink needs conditional
locking now, and doesn't use it either, axe this feature complete.

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2024-06-10 13:15:40 +01:00

2959 lines
70 KiB
C

// SPDX-License-Identifier: GPL-2.0-or-later
/*
* NETLINK Kernel-user communication protocol.
*
* Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
* Patrick McHardy <kaber@trash.net>
*
* Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith
* added netlink_proto_exit
* Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo <acme@conectiva.com.br>
* use nlk_sk, as sk->protinfo is on a diet 8)
* Fri Jul 22 19:51:12 MEST 2005 Harald Welte <laforge@gnumonks.org>
* - inc module use count of module that owns
* the kernel socket in case userspace opens
* socket of same protocol
* - remove all module support, since netlink is
* mandatory if CONFIG_NET=y these days
*/
#include <linux/module.h>
#include <linux/bpf.h>
#include <linux/capability.h>
#include <linux/kernel.h>
#include <linux/filter.h>
#include <linux/init.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/un.h>
#include <linux/fcntl.h>
#include <linux/termios.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/security.h>
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/random.h>
#include <linux/bitops.h>
#include <linux/mm.h>
#include <linux/types.h>
#include <linux/audit.h>
#include <linux/mutex.h>
#include <linux/vmalloc.h>
#include <linux/if_arp.h>
#include <linux/rhashtable.h>
#include <asm/cacheflush.h>
#include <linux/hash.h>
#include <linux/net_namespace.h>
#include <linux/nospec.h>
#include <linux/btf_ids.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/sock.h>
#include <net/scm.h>
#include <net/netlink.h>
#define CREATE_TRACE_POINTS
#include <trace/events/netlink.h>
#include "af_netlink.h"
#include "genetlink.h"
struct listeners {
struct rcu_head rcu;
unsigned long masks[];
};
/* state bits */
#define NETLINK_S_CONGESTED 0x0
static inline int netlink_is_kernel(struct sock *sk)
{
return nlk_test_bit(KERNEL_SOCKET, sk);
}
struct netlink_table *nl_table __read_mostly;
EXPORT_SYMBOL_GPL(nl_table);
static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait);
static struct lock_class_key nlk_cb_mutex_keys[MAX_LINKS];
static const char *const nlk_cb_mutex_key_strings[MAX_LINKS + 1] = {
"nlk_cb_mutex-ROUTE",
"nlk_cb_mutex-1",
"nlk_cb_mutex-USERSOCK",
"nlk_cb_mutex-FIREWALL",
"nlk_cb_mutex-SOCK_DIAG",
"nlk_cb_mutex-NFLOG",
"nlk_cb_mutex-XFRM",
"nlk_cb_mutex-SELINUX",
"nlk_cb_mutex-ISCSI",
"nlk_cb_mutex-AUDIT",
"nlk_cb_mutex-FIB_LOOKUP",
"nlk_cb_mutex-CONNECTOR",
"nlk_cb_mutex-NETFILTER",
"nlk_cb_mutex-IP6_FW",
"nlk_cb_mutex-DNRTMSG",
"nlk_cb_mutex-KOBJECT_UEVENT",
"nlk_cb_mutex-GENERIC",
"nlk_cb_mutex-17",
"nlk_cb_mutex-SCSITRANSPORT",
"nlk_cb_mutex-ECRYPTFS",
"nlk_cb_mutex-RDMA",
"nlk_cb_mutex-CRYPTO",
"nlk_cb_mutex-SMC",
"nlk_cb_mutex-23",
"nlk_cb_mutex-24",
"nlk_cb_mutex-25",
"nlk_cb_mutex-26",
"nlk_cb_mutex-27",
"nlk_cb_mutex-28",
"nlk_cb_mutex-29",
"nlk_cb_mutex-30",
"nlk_cb_mutex-31",
"nlk_cb_mutex-MAX_LINKS"
};
static int netlink_dump(struct sock *sk, bool lock_taken);
/* nl_table locking explained:
* Lookup and traversal are protected with an RCU read-side lock. Insertion
* and removal are protected with per bucket lock while using RCU list
* modification primitives and may run in parallel to RCU protected lookups.
* Destruction of the Netlink socket may only occur *after* nl_table_lock has
* been acquired * either during or after the socket has been removed from
* the list and after an RCU grace period.
*/
DEFINE_RWLOCK(nl_table_lock);
EXPORT_SYMBOL_GPL(nl_table_lock);
static atomic_t nl_table_users = ATOMIC_INIT(0);
#define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock));
static BLOCKING_NOTIFIER_HEAD(netlink_chain);
static const struct rhashtable_params netlink_rhashtable_params;
void do_trace_netlink_extack(const char *msg)
{
trace_netlink_extack(msg);
}
EXPORT_SYMBOL(do_trace_netlink_extack);
static inline u32 netlink_group_mask(u32 group)
{
if (group > 32)
return 0;
return group ? 1 << (group - 1) : 0;
}
static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb,
gfp_t gfp_mask)
{
unsigned int len = skb->len;
struct sk_buff *new;
new = alloc_skb(len, gfp_mask);
if (new == NULL)
return NULL;
NETLINK_CB(new).portid = NETLINK_CB(skb).portid;
NETLINK_CB(new).dst_group = NETLINK_CB(skb).dst_group;
NETLINK_CB(new).creds = NETLINK_CB(skb).creds;
skb_put_data(new, skb->data, len);
return new;
}
static unsigned int netlink_tap_net_id;
struct netlink_tap_net {
struct list_head netlink_tap_all;
struct mutex netlink_tap_lock;
};
int netlink_add_tap(struct netlink_tap *nt)
{
struct net *net = dev_net(nt->dev);
struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id);
if (unlikely(nt->dev->type != ARPHRD_NETLINK))
return -EINVAL;
mutex_lock(&nn->netlink_tap_lock);
list_add_rcu(&nt->list, &nn->netlink_tap_all);
mutex_unlock(&nn->netlink_tap_lock);
__module_get(nt->module);
return 0;
}
EXPORT_SYMBOL_GPL(netlink_add_tap);
static int __netlink_remove_tap(struct netlink_tap *nt)
{
struct net *net = dev_net(nt->dev);
struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id);
bool found = false;
struct netlink_tap *tmp;
mutex_lock(&nn->netlink_tap_lock);
list_for_each_entry(tmp, &nn->netlink_tap_all, list) {
if (nt == tmp) {
list_del_rcu(&nt->list);
found = true;
goto out;
}
}
pr_warn("__netlink_remove_tap: %p not found\n", nt);
out:
mutex_unlock(&nn->netlink_tap_lock);
if (found)
module_put(nt->module);
return found ? 0 : -ENODEV;
}
int netlink_remove_tap(struct netlink_tap *nt)
{
int ret;
ret = __netlink_remove_tap(nt);
synchronize_net();
return ret;
}
EXPORT_SYMBOL_GPL(netlink_remove_tap);
static __net_init int netlink_tap_init_net(struct net *net)
{
struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id);
INIT_LIST_HEAD(&nn->netlink_tap_all);
mutex_init(&nn->netlink_tap_lock);
return 0;
}
static struct pernet_operations netlink_tap_net_ops = {
.init = netlink_tap_init_net,
.id = &netlink_tap_net_id,
.size = sizeof(struct netlink_tap_net),
};
static bool netlink_filter_tap(const struct sk_buff *skb)
{
struct sock *sk = skb->sk;
/* We take the more conservative approach and
* whitelist socket protocols that may pass.
*/
switch (sk->sk_protocol) {
case NETLINK_ROUTE:
case NETLINK_USERSOCK:
case NETLINK_SOCK_DIAG:
case NETLINK_NFLOG:
case NETLINK_XFRM:
case NETLINK_FIB_LOOKUP:
case NETLINK_NETFILTER:
case NETLINK_GENERIC:
return true;
}
return false;
}
static int __netlink_deliver_tap_skb(struct sk_buff *skb,
struct net_device *dev)
{
struct sk_buff *nskb;
struct sock *sk = skb->sk;
int ret = -ENOMEM;
if (!net_eq(dev_net(dev), sock_net(sk)))
return 0;
dev_hold(dev);
if (is_vmalloc_addr(skb->head))
nskb = netlink_to_full_skb(skb, GFP_ATOMIC);
else
nskb = skb_clone(skb, GFP_ATOMIC);
if (nskb) {
nskb->dev = dev;
nskb->protocol = htons((u16) sk->sk_protocol);
nskb->pkt_type = netlink_is_kernel(sk) ?
PACKET_KERNEL : PACKET_USER;
skb_reset_network_header(nskb);
ret = dev_queue_xmit(nskb);
if (unlikely(ret > 0))
ret = net_xmit_errno(ret);
}
dev_put(dev);
return ret;
}
static void __netlink_deliver_tap(struct sk_buff *skb, struct netlink_tap_net *nn)
{
int ret;
struct netlink_tap *tmp;
if (!netlink_filter_tap(skb))
return;
list_for_each_entry_rcu(tmp, &nn->netlink_tap_all, list) {
ret = __netlink_deliver_tap_skb(skb, tmp->dev);
if (unlikely(ret))
break;
}
}
static void netlink_deliver_tap(struct net *net, struct sk_buff *skb)
{
struct netlink_tap_net *nn = net_generic(net, netlink_tap_net_id);
rcu_read_lock();
if (unlikely(!list_empty(&nn->netlink_tap_all)))
__netlink_deliver_tap(skb, nn);
rcu_read_unlock();
}
static void netlink_deliver_tap_kernel(struct sock *dst, struct sock *src,
struct sk_buff *skb)
{
if (!(netlink_is_kernel(dst) && netlink_is_kernel(src)))
netlink_deliver_tap(sock_net(dst), skb);
}
static void netlink_overrun(struct sock *sk)
{
if (!nlk_test_bit(RECV_NO_ENOBUFS, sk)) {
if (!test_and_set_bit(NETLINK_S_CONGESTED,
&nlk_sk(sk)->state)) {
WRITE_ONCE(sk->sk_err, ENOBUFS);
sk_error_report(sk);
}
}
atomic_inc(&sk->sk_drops);
}
static void netlink_rcv_wake(struct sock *sk)
{
struct netlink_sock *nlk = nlk_sk(sk);
if (skb_queue_empty_lockless(&sk->sk_receive_queue))
clear_bit(NETLINK_S_CONGESTED, &nlk->state);
if (!test_bit(NETLINK_S_CONGESTED, &nlk->state))
wake_up_interruptible(&nlk->wait);
}
static void netlink_skb_destructor(struct sk_buff *skb)
{
if (is_vmalloc_addr(skb->head)) {
if (!skb->cloned ||
!atomic_dec_return(&(skb_shinfo(skb)->dataref)))
vfree_atomic(skb->head);
skb->head = NULL;
}
if (skb->sk != NULL)
sock_rfree(skb);
}
static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk)
{
WARN_ON(skb->sk != NULL);
skb->sk = sk;
skb->destructor = netlink_skb_destructor;
atomic_add(skb->truesize, &sk->sk_rmem_alloc);
sk_mem_charge(sk, skb->truesize);
}
static void netlink_sock_destruct(struct sock *sk)
{
struct netlink_sock *nlk = nlk_sk(sk);
if (nlk->cb_running) {
if (nlk->cb.done)
nlk->cb.done(&nlk->cb);
module_put(nlk->cb.module);
kfree_skb(nlk->cb.skb);
}
skb_queue_purge(&sk->sk_receive_queue);
if (!sock_flag(sk, SOCK_DEAD)) {
printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
return;
}
WARN_ON(atomic_read(&sk->sk_rmem_alloc));
WARN_ON(refcount_read(&sk->sk_wmem_alloc));
WARN_ON(nlk_sk(sk)->groups);
}
static void netlink_sock_destruct_work(struct work_struct *work)
{
struct netlink_sock *nlk = container_of(work, struct netlink_sock,
work);
sk_free(&nlk->sk);
}
/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
* SMP. Look, when several writers sleep and reader wakes them up, all but one
* immediately hit write lock and grab all the cpus. Exclusive sleep solves
* this, _but_ remember, it adds useless work on UP machines.
*/
void netlink_table_grab(void)
__acquires(nl_table_lock)
{
might_sleep();
write_lock_irq(&nl_table_lock);
if (atomic_read(&nl_table_users)) {
DECLARE_WAITQUEUE(wait, current);
add_wait_queue_exclusive(&nl_table_wait, &wait);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (atomic_read(&nl_table_users) == 0)
break;
write_unlock_irq(&nl_table_lock);
schedule();
write_lock_irq(&nl_table_lock);
}
__set_current_state(TASK_RUNNING);
remove_wait_queue(&nl_table_wait, &wait);
}
}
void netlink_table_ungrab(void)
__releases(nl_table_lock)
{
write_unlock_irq(&nl_table_lock);
wake_up(&nl_table_wait);
}
static inline void
netlink_lock_table(void)
{
unsigned long flags;
/* read_lock() synchronizes us to netlink_table_grab */
read_lock_irqsave(&nl_table_lock, flags);
atomic_inc(&nl_table_users);
read_unlock_irqrestore(&nl_table_lock, flags);
}
static inline void
netlink_unlock_table(void)
{
if (atomic_dec_and_test(&nl_table_users))
wake_up(&nl_table_wait);
}
struct netlink_compare_arg
{
possible_net_t pnet;
u32 portid;
};
/* Doing sizeof directly may yield 4 extra bytes on 64-bit. */
#define netlink_compare_arg_len \
(offsetof(struct netlink_compare_arg, portid) + sizeof(u32))
static inline int netlink_compare(struct rhashtable_compare_arg *arg,
const void *ptr)
{
const struct netlink_compare_arg *x = arg->key;
const struct netlink_sock *nlk = ptr;
return nlk->portid != x->portid ||
!net_eq(sock_net(&nlk->sk), read_pnet(&x->pnet));
}
static void netlink_compare_arg_init(struct netlink_compare_arg *arg,
struct net *net, u32 portid)
{
memset(arg, 0, sizeof(*arg));
write_pnet(&arg->pnet, net);
arg->portid = portid;
}
static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid,
struct net *net)
{
struct netlink_compare_arg arg;
netlink_compare_arg_init(&arg, net, portid);
return rhashtable_lookup_fast(&table->hash, &arg,
netlink_rhashtable_params);
}
static int __netlink_insert(struct netlink_table *table, struct sock *sk)
{
struct netlink_compare_arg arg;
netlink_compare_arg_init(&arg, sock_net(sk), nlk_sk(sk)->portid);
return rhashtable_lookup_insert_key(&table->hash, &arg,
&nlk_sk(sk)->node,
netlink_rhashtable_params);
}
static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid)
{
struct netlink_table *table = &nl_table[protocol];
struct sock *sk;
rcu_read_lock();
sk = __netlink_lookup(table, portid, net);
if (sk)
sock_hold(sk);
rcu_read_unlock();
return sk;
}
static const struct proto_ops netlink_ops;
static void
netlink_update_listeners(struct sock *sk)
{
struct netlink_table *tbl = &nl_table[sk->sk_protocol];
unsigned long mask;
unsigned int i;
struct listeners *listeners;
listeners = nl_deref_protected(tbl->listeners);
if (!listeners)
return;
for (i = 0; i < NLGRPLONGS(tbl->groups); i++) {
mask = 0;
sk_for_each_bound(sk, &tbl->mc_list) {
if (i < NLGRPLONGS(nlk_sk(sk)->ngroups))
mask |= nlk_sk(sk)->groups[i];
}
listeners->masks[i] = mask;
}
/* this function is only called with the netlink table "grabbed", which
* makes sure updates are visible before bind or setsockopt return. */
}
static int netlink_insert(struct sock *sk, u32 portid)
{
struct netlink_table *table = &nl_table[sk->sk_protocol];
int err;
lock_sock(sk);
err = nlk_sk(sk)->portid == portid ? 0 : -EBUSY;
if (nlk_sk(sk)->bound)
goto err;
/* portid can be read locklessly from netlink_getname(). */
WRITE_ONCE(nlk_sk(sk)->portid, portid);
sock_hold(sk);
err = __netlink_insert(table, sk);
if (err) {
/* In case the hashtable backend returns with -EBUSY
* from here, it must not escape to the caller.
*/
if (unlikely(err == -EBUSY))
err = -EOVERFLOW;
if (err == -EEXIST)
err = -EADDRINUSE;
sock_put(sk);
goto err;
}
/* We need to ensure that the socket is hashed and visible. */
smp_wmb();
/* Paired with lockless reads from netlink_bind(),
* netlink_connect() and netlink_sendmsg().
*/
WRITE_ONCE(nlk_sk(sk)->bound, portid);
err:
release_sock(sk);
return err;
}
static void netlink_remove(struct sock *sk)
{
struct netlink_table *table;
table = &nl_table[sk->sk_protocol];
if (!rhashtable_remove_fast(&table->hash, &nlk_sk(sk)->node,
netlink_rhashtable_params)) {
WARN_ON(refcount_read(&sk->sk_refcnt) == 1);
__sock_put(sk);
}
netlink_table_grab();
if (nlk_sk(sk)->subscriptions) {
__sk_del_bind_node(sk);
netlink_update_listeners(sk);
}
if (sk->sk_protocol == NETLINK_GENERIC)
atomic_inc(&genl_sk_destructing_cnt);
netlink_table_ungrab();
}
static struct proto netlink_proto = {
.name = "NETLINK",
.owner = THIS_MODULE,
.obj_size = sizeof(struct netlink_sock),
};
static int __netlink_create(struct net *net, struct socket *sock,
int protocol, int kern)
{
struct sock *sk;
struct netlink_sock *nlk;
sock->ops = &netlink_ops;
sk = sk_alloc(net, PF_NETLINK, GFP_KERNEL, &netlink_proto, kern);
if (!sk)
return -ENOMEM;
sock_init_data(sock, sk);
nlk = nlk_sk(sk);
mutex_init(&nlk->nl_cb_mutex);
lockdep_set_class_and_name(&nlk->nl_cb_mutex,
nlk_cb_mutex_keys + protocol,
nlk_cb_mutex_key_strings[protocol]);
init_waitqueue_head(&nlk->wait);
sk->sk_destruct = netlink_sock_destruct;
sk->sk_protocol = protocol;
return 0;
}
static int netlink_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
struct module *module = NULL;
struct netlink_sock *nlk;
int (*bind)(struct net *net, int group);
void (*unbind)(struct net *net, int group);
void (*release)(struct sock *sock, unsigned long *groups);
int err = 0;
sock->state = SS_UNCONNECTED;
if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
return -ESOCKTNOSUPPORT;
if (protocol < 0 || protocol >= MAX_LINKS)
return -EPROTONOSUPPORT;
protocol = array_index_nospec(protocol, MAX_LINKS);
netlink_lock_table();
#ifdef CONFIG_MODULES
if (!nl_table[protocol].registered) {
netlink_unlock_table();
request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
netlink_lock_table();
}
#endif
if (nl_table[protocol].registered &&
try_module_get(nl_table[protocol].module))
module = nl_table[protocol].module;
else
err = -EPROTONOSUPPORT;
bind = nl_table[protocol].bind;
unbind = nl_table[protocol].unbind;
release = nl_table[protocol].release;
netlink_unlock_table();
if (err < 0)
goto out;
err = __netlink_create(net, sock, protocol, kern);
if (err < 0)
goto out_module;
sock_prot_inuse_add(net, &netlink_proto, 1);
nlk = nlk_sk(sock->sk);
nlk->module = module;
nlk->netlink_bind = bind;
nlk->netlink_unbind = unbind;
nlk->netlink_release = release;
out:
return err;
out_module:
module_put(module);
goto out;
}
static void deferred_put_nlk_sk(struct rcu_head *head)
{
struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
struct sock *sk = &nlk->sk;
kfree(nlk->groups);
nlk->groups = NULL;
if (!refcount_dec_and_test(&sk->sk_refcnt))
return;
if (nlk->cb_running && nlk->cb.done) {
INIT_WORK(&nlk->work, netlink_sock_destruct_work);
schedule_work(&nlk->work);
return;
}
sk_free(sk);
}
static int netlink_release(struct socket *sock)
{
struct sock *sk = sock->sk;
struct netlink_sock *nlk;
if (!sk)
return 0;
netlink_remove(sk);
sock_orphan(sk);
nlk = nlk_sk(sk);
/*
* OK. Socket is unlinked, any packets that arrive now
* will be purged.
*/
if (nlk->netlink_release)
nlk->netlink_release(sk, nlk->groups);
/* must not acquire netlink_table_lock in any way again before unbind
* and notifying genetlink is done as otherwise it might deadlock
*/
if (nlk->netlink_unbind) {
int i;
for (i = 0; i < nlk->ngroups; i++)
if (test_bit(i, nlk->groups))
nlk->netlink_unbind(sock_net(sk), i + 1);
}
if (sk->sk_protocol == NETLINK_GENERIC &&
atomic_dec_return(&genl_sk_destructing_cnt) == 0)
wake_up(&genl_sk_destructing_waitq);
sock->sk = NULL;
wake_up_interruptible_all(&nlk->wait);
skb_queue_purge(&sk->sk_write_queue);
if (nlk->portid && nlk->bound) {
struct netlink_notify n = {
.net = sock_net(sk),
.protocol = sk->sk_protocol,
.portid = nlk->portid,
};
blocking_notifier_call_chain(&netlink_chain,
NETLINK_URELEASE, &n);
}
module_put(nlk->module);
if (netlink_is_kernel(sk)) {
netlink_table_grab();
BUG_ON(nl_table[sk->sk_protocol].registered == 0);
if (--nl_table[sk->sk_protocol].registered == 0) {
struct listeners *old;
old = nl_deref_protected(nl_table[sk->sk_protocol].listeners);
RCU_INIT_POINTER(nl_table[sk->sk_protocol].listeners, NULL);
kfree_rcu(old, rcu);
nl_table[sk->sk_protocol].module = NULL;
nl_table[sk->sk_protocol].bind = NULL;
nl_table[sk->sk_protocol].unbind = NULL;
nl_table[sk->sk_protocol].flags = 0;
nl_table[sk->sk_protocol].registered = 0;
}
netlink_table_ungrab();
}
sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
/* Because struct net might disappear soon, do not keep a pointer. */
if (!sk->sk_net_refcnt && sock_net(sk) != &init_net) {
__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
/* Because of deferred_put_nlk_sk and use of work queue,
* it is possible netns will be freed before this socket.
*/
sock_net_set(sk, &init_net);
__netns_tracker_alloc(&init_net, &sk->ns_tracker,
false, GFP_KERNEL);
}
call_rcu(&nlk->rcu, deferred_put_nlk_sk);
return 0;
}
static int netlink_autobind(struct socket *sock)
{
struct sock *sk = sock->sk;
struct net *net = sock_net(sk);
struct netlink_table *table = &nl_table[sk->sk_protocol];
s32 portid = task_tgid_vnr(current);
int err;
s32 rover = -4096;
bool ok;
retry:
cond_resched();
rcu_read_lock();
ok = !__netlink_lookup(table, portid, net);
rcu_read_unlock();
if (!ok) {
/* Bind collision, search negative portid values. */
if (rover == -4096)
/* rover will be in range [S32_MIN, -4097] */
rover = S32_MIN + get_random_u32_below(-4096 - S32_MIN);
else if (rover >= -4096)
rover = -4097;
portid = rover--;
goto retry;
}
err = netlink_insert(sk, portid);
if (err == -EADDRINUSE)
goto retry;
/* If 2 threads race to autobind, that is fine. */
if (err == -EBUSY)
err = 0;
return err;
}
/**
* __netlink_ns_capable - General netlink message capability test
* @nsp: NETLINK_CB of the socket buffer holding a netlink command from userspace.
* @user_ns: The user namespace of the capability to use
* @cap: The capability to use
*
* Test to see if the opener of the socket we received the message
* from had when the netlink socket was created and the sender of the
* message has the capability @cap in the user namespace @user_ns.
*/
bool __netlink_ns_capable(const struct netlink_skb_parms *nsp,
struct user_namespace *user_ns, int cap)
{
return ((nsp->flags & NETLINK_SKB_DST) ||
file_ns_capable(nsp->sk->sk_socket->file, user_ns, cap)) &&
ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(__netlink_ns_capable);
/**
* netlink_ns_capable - General netlink message capability test
* @skb: socket buffer holding a netlink command from userspace
* @user_ns: The user namespace of the capability to use
* @cap: The capability to use
*
* Test to see if the opener of the socket we received the message
* from had when the netlink socket was created and the sender of the
* message has the capability @cap in the user namespace @user_ns.
*/
bool netlink_ns_capable(const struct sk_buff *skb,
struct user_namespace *user_ns, int cap)
{
return __netlink_ns_capable(&NETLINK_CB(skb), user_ns, cap);
}
EXPORT_SYMBOL(netlink_ns_capable);
/**
* netlink_capable - Netlink global message capability test
* @skb: socket buffer holding a netlink command from userspace
* @cap: The capability to use
*
* Test to see if the opener of the socket we received the message
* from had when the netlink socket was created and the sender of the
* message has the capability @cap in all user namespaces.
*/
bool netlink_capable(const struct sk_buff *skb, int cap)
{
return netlink_ns_capable(skb, &init_user_ns, cap);
}
EXPORT_SYMBOL(netlink_capable);
/**
* netlink_net_capable - Netlink network namespace message capability test
* @skb: socket buffer holding a netlink command from userspace
* @cap: The capability to use
*
* Test to see if the opener of the socket we received the message
* from had when the netlink socket was created and the sender of the
* message has the capability @cap over the network namespace of
* the socket we received the message from.
*/
bool netlink_net_capable(const struct sk_buff *skb, int cap)
{
return netlink_ns_capable(skb, sock_net(skb->sk)->user_ns, cap);
}
EXPORT_SYMBOL(netlink_net_capable);
static inline int netlink_allowed(const struct socket *sock, unsigned int flag)
{
return (nl_table[sock->sk->sk_protocol].flags & flag) ||
ns_capable(sock_net(sock->sk)->user_ns, CAP_NET_ADMIN);
}
static void
netlink_update_subscriptions(struct sock *sk, unsigned int subscriptions)
{
struct netlink_sock *nlk = nlk_sk(sk);
if (nlk->subscriptions && !subscriptions)
__sk_del_bind_node(sk);
else if (!nlk->subscriptions && subscriptions)
sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list);
nlk->subscriptions = subscriptions;
}
static int netlink_realloc_groups(struct sock *sk)
{
struct netlink_sock *nlk = nlk_sk(sk);
unsigned int groups;
unsigned long *new_groups;
int err = 0;
netlink_table_grab();
groups = nl_table[sk->sk_protocol].groups;
if (!nl_table[sk->sk_protocol].registered) {
err = -ENOENT;
goto out_unlock;
}
if (nlk->ngroups >= groups)
goto out_unlock;
new_groups = krealloc(nlk->groups, NLGRPSZ(groups), GFP_ATOMIC);
if (new_groups == NULL) {
err = -ENOMEM;
goto out_unlock;
}
memset((char *)new_groups + NLGRPSZ(nlk->ngroups), 0,
NLGRPSZ(groups) - NLGRPSZ(nlk->ngroups));
nlk->groups = new_groups;
nlk->ngroups = groups;
out_unlock:
netlink_table_ungrab();
return err;
}
static void netlink_undo_bind(int group, long unsigned int groups,
struct sock *sk)
{
struct netlink_sock *nlk = nlk_sk(sk);
int undo;
if (!nlk->netlink_unbind)
return;
for (undo = 0; undo < group; undo++)
if (test_bit(undo, &groups))
nlk->netlink_unbind(sock_net(sk), undo + 1);
}
static int netlink_bind(struct socket *sock, struct sockaddr *addr,
int addr_len)
{
struct sock *sk = sock->sk;
struct net *net = sock_net(sk);
struct netlink_sock *nlk = nlk_sk(sk);
struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
int err = 0;
unsigned long groups;
bool bound;
if (addr_len < sizeof(struct sockaddr_nl))
return -EINVAL;
if (nladdr->nl_family != AF_NETLINK)
return -EINVAL;
groups = nladdr->nl_groups;
/* Only superuser is allowed to listen multicasts */
if (groups) {
if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
return -EPERM;
err = netlink_realloc_groups(sk);
if (err)
return err;
}
if (nlk->ngroups < BITS_PER_LONG)
groups &= (1UL << nlk->ngroups) - 1;
/* Paired with WRITE_ONCE() in netlink_insert() */
bound = READ_ONCE(nlk->bound);
if (bound) {
/* Ensure nlk->portid is up-to-date. */
smp_rmb();
if (nladdr->nl_pid != nlk->portid)
return -EINVAL;
}
if (nlk->netlink_bind && groups) {
int group;
/* nl_groups is a u32, so cap the maximum groups we can bind */
for (group = 0; group < BITS_PER_TYPE(u32); group++) {
if (!test_bit(group, &groups))
continue;
err = nlk->netlink_bind(net, group + 1);
if (!err)
continue;
netlink_undo_bind(group, groups, sk);
return err;
}
}
/* No need for barriers here as we return to user-space without
* using any of the bound attributes.
*/
netlink_lock_table();
if (!bound) {
err = nladdr->nl_pid ?
netlink_insert(sk, nladdr->nl_pid) :
netlink_autobind(sock);
if (err) {
netlink_undo_bind(BITS_PER_TYPE(u32), groups, sk);
goto unlock;
}
}
if (!groups && (nlk->groups == NULL || !(u32)nlk->groups[0]))
goto unlock;
netlink_unlock_table();
netlink_table_grab();
netlink_update_subscriptions(sk, nlk->subscriptions +
hweight32(groups) -
hweight32(nlk->groups[0]));
nlk->groups[0] = (nlk->groups[0] & ~0xffffffffUL) | groups;
netlink_update_listeners(sk);
netlink_table_ungrab();
return 0;
unlock:
netlink_unlock_table();
return err;
}
static int netlink_connect(struct socket *sock, struct sockaddr *addr,
int alen, int flags)
{
int err = 0;
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
if (alen < sizeof(addr->sa_family))
return -EINVAL;
if (addr->sa_family == AF_UNSPEC) {
/* paired with READ_ONCE() in netlink_getsockbyportid() */
WRITE_ONCE(sk->sk_state, NETLINK_UNCONNECTED);
/* dst_portid and dst_group can be read locklessly */
WRITE_ONCE(nlk->dst_portid, 0);
WRITE_ONCE(nlk->dst_group, 0);
return 0;
}
if (addr->sa_family != AF_NETLINK)
return -EINVAL;
if (alen < sizeof(struct sockaddr_nl))
return -EINVAL;
if ((nladdr->nl_groups || nladdr->nl_pid) &&
!netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
return -EPERM;
/* No need for barriers here as we return to user-space without
* using any of the bound attributes.
* Paired with WRITE_ONCE() in netlink_insert().
*/
if (!READ_ONCE(nlk->bound))
err = netlink_autobind(sock);
if (err == 0) {
/* paired with READ_ONCE() in netlink_getsockbyportid() */
WRITE_ONCE(sk->sk_state, NETLINK_CONNECTED);
/* dst_portid and dst_group can be read locklessly */
WRITE_ONCE(nlk->dst_portid, nladdr->nl_pid);
WRITE_ONCE(nlk->dst_group, ffs(nladdr->nl_groups));
}
return err;
}
static int netlink_getname(struct socket *sock, struct sockaddr *addr,
int peer)
{
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr);
nladdr->nl_family = AF_NETLINK;
nladdr->nl_pad = 0;
if (peer) {
/* Paired with WRITE_ONCE() in netlink_connect() */
nladdr->nl_pid = READ_ONCE(nlk->dst_portid);
nladdr->nl_groups = netlink_group_mask(READ_ONCE(nlk->dst_group));
} else {
/* Paired with WRITE_ONCE() in netlink_insert() */
nladdr->nl_pid = READ_ONCE(nlk->portid);
netlink_lock_table();
nladdr->nl_groups = nlk->groups ? nlk->groups[0] : 0;
netlink_unlock_table();
}
return sizeof(*nladdr);
}
static int netlink_ioctl(struct socket *sock, unsigned int cmd,
unsigned long arg)
{
/* try to hand this ioctl down to the NIC drivers.
*/
return -ENOIOCTLCMD;
}
static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid)
{
struct sock *sock;
struct netlink_sock *nlk;
sock = netlink_lookup(sock_net(ssk), ssk->sk_protocol, portid);
if (!sock)
return ERR_PTR(-ECONNREFUSED);
/* Don't bother queuing skb if kernel socket has no input function */
nlk = nlk_sk(sock);
/* dst_portid and sk_state can be changed in netlink_connect() */
if (READ_ONCE(sock->sk_state) == NETLINK_CONNECTED &&
READ_ONCE(nlk->dst_portid) != nlk_sk(ssk)->portid) {
sock_put(sock);
return ERR_PTR(-ECONNREFUSED);
}
return sock;
}
struct sock *netlink_getsockbyfilp(struct file *filp)
{
struct inode *inode = file_inode(filp);
struct sock *sock;
if (!S_ISSOCK(inode->i_mode))
return ERR_PTR(-ENOTSOCK);
sock = SOCKET_I(inode)->sk;
if (sock->sk_family != AF_NETLINK)
return ERR_PTR(-EINVAL);
sock_hold(sock);
return sock;
}
struct sk_buff *netlink_alloc_large_skb(unsigned int size, int broadcast)
{
size_t head_size = SKB_HEAD_ALIGN(size);
struct sk_buff *skb;
void *data;
if (head_size <= PAGE_SIZE || broadcast)
return alloc_skb(size, GFP_KERNEL);
data = kvmalloc(head_size, GFP_KERNEL);
if (!data)
return NULL;
skb = __build_skb(data, head_size);
if (!skb)
kvfree(data);
else if (is_vmalloc_addr(data))
skb->destructor = netlink_skb_destructor;
return skb;
}
/*
* Attach a skb to a netlink socket.
* The caller must hold a reference to the destination socket. On error, the
* reference is dropped. The skb is not send to the destination, just all
* all error checks are performed and memory in the queue is reserved.
* Return values:
* < 0: error. skb freed, reference to sock dropped.
* 0: continue
* 1: repeat lookup - reference dropped while waiting for socket memory.
*/
int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
long *timeo, struct sock *ssk)
{
struct netlink_sock *nlk;
nlk = nlk_sk(sk);
if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
test_bit(NETLINK_S_CONGESTED, &nlk->state))) {
DECLARE_WAITQUEUE(wait, current);
if (!*timeo) {
if (!ssk || netlink_is_kernel(ssk))
netlink_overrun(sk);
sock_put(sk);
kfree_skb(skb);
return -EAGAIN;
}
__set_current_state(TASK_INTERRUPTIBLE);
add_wait_queue(&nlk->wait, &wait);
if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
!sock_flag(sk, SOCK_DEAD))
*timeo = schedule_timeout(*timeo);
__set_current_state(TASK_RUNNING);
remove_wait_queue(&nlk->wait, &wait);
sock_put(sk);
if (signal_pending(current)) {
kfree_skb(skb);
return sock_intr_errno(*timeo);
}
return 1;
}
netlink_skb_set_owner_r(skb, sk);
return 0;
}
static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
int len = skb->len;
netlink_deliver_tap(sock_net(sk), skb);
skb_queue_tail(&sk->sk_receive_queue, skb);
sk->sk_data_ready(sk);
return len;
}
int netlink_sendskb(struct sock *sk, struct sk_buff *skb)
{
int len = __netlink_sendskb(sk, skb);
sock_put(sk);
return len;
}
void netlink_detachskb(struct sock *sk, struct sk_buff *skb)
{
kfree_skb(skb);
sock_put(sk);
}
static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
{
int delta;
WARN_ON(skb->sk != NULL);
delta = skb->end - skb->tail;
if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize)
return skb;
if (skb_shared(skb)) {
struct sk_buff *nskb = skb_clone(skb, allocation);
if (!nskb)
return skb;
consume_skb(skb);
skb = nskb;
}
pskb_expand_head(skb, 0, -delta,
(allocation & ~__GFP_DIRECT_RECLAIM) |
__GFP_NOWARN | __GFP_NORETRY);
return skb;
}
static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
struct sock *ssk)
{
int ret;
struct netlink_sock *nlk = nlk_sk(sk);
ret = -ECONNREFUSED;
if (nlk->netlink_rcv != NULL) {
ret = skb->len;
netlink_skb_set_owner_r(skb, sk);
NETLINK_CB(skb).sk = ssk;
netlink_deliver_tap_kernel(sk, ssk, skb);
nlk->netlink_rcv(skb);
consume_skb(skb);
} else {
kfree_skb(skb);
}
sock_put(sk);
return ret;
}
int netlink_unicast(struct sock *ssk, struct sk_buff *skb,
u32 portid, int nonblock)
{
struct sock *sk;
int err;
long timeo;
skb = netlink_trim(skb, gfp_any());
timeo = sock_sndtimeo(ssk, nonblock);
retry:
sk = netlink_getsockbyportid(ssk, portid);
if (IS_ERR(sk)) {
kfree_skb(skb);
return PTR_ERR(sk);
}
if (netlink_is_kernel(sk))
return netlink_unicast_kernel(sk, skb, ssk);
if (sk_filter(sk, skb)) {
err = skb->len;
kfree_skb(skb);
sock_put(sk);
return err;
}
err = netlink_attachskb(sk, skb, &timeo, ssk);
if (err == 1)
goto retry;
if (err)
return err;
return netlink_sendskb(sk, skb);
}
EXPORT_SYMBOL(netlink_unicast);
int netlink_has_listeners(struct sock *sk, unsigned int group)
{
int res = 0;
struct listeners *listeners;
BUG_ON(!netlink_is_kernel(sk));
rcu_read_lock();
listeners = rcu_dereference(nl_table[sk->sk_protocol].listeners);
if (listeners && group - 1 < nl_table[sk->sk_protocol].groups)
res = test_bit(group - 1, listeners->masks);
rcu_read_unlock();
return res;
}
EXPORT_SYMBOL_GPL(netlink_has_listeners);
bool netlink_strict_get_check(struct sk_buff *skb)
{
return nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk);
}
EXPORT_SYMBOL_GPL(netlink_strict_get_check);
static int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb)
{
struct netlink_sock *nlk = nlk_sk(sk);
if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
!test_bit(NETLINK_S_CONGESTED, &nlk->state)) {
netlink_skb_set_owner_r(skb, sk);
__netlink_sendskb(sk, skb);
return atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1);
}
return -1;
}
struct netlink_broadcast_data {
struct sock *exclude_sk;
struct net *net;
u32 portid;
u32 group;
int failure;
int delivery_failure;
int congested;
int delivered;
gfp_t allocation;
struct sk_buff *skb, *skb2;
int (*tx_filter)(struct sock *dsk, struct sk_buff *skb, void *data);
void *tx_data;
};
static void do_one_broadcast(struct sock *sk,
struct netlink_broadcast_data *p)
{
struct netlink_sock *nlk = nlk_sk(sk);
int val;
if (p->exclude_sk == sk)
return;
if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
!test_bit(p->group - 1, nlk->groups))
return;
if (!net_eq(sock_net(sk), p->net)) {
if (!nlk_test_bit(LISTEN_ALL_NSID, sk))
return;
if (!peernet_has_id(sock_net(sk), p->net))
return;
if (!file_ns_capable(sk->sk_socket->file, p->net->user_ns,
CAP_NET_BROADCAST))
return;
}
if (p->failure) {
netlink_overrun(sk);
return;
}
sock_hold(sk);
if (p->skb2 == NULL) {
if (skb_shared(p->skb)) {
p->skb2 = skb_clone(p->skb, p->allocation);
} else {
p->skb2 = skb_get(p->skb);
/*
* skb ownership may have been set when
* delivered to a previous socket.
*/
skb_orphan(p->skb2);
}
}
if (p->skb2 == NULL) {
netlink_overrun(sk);
/* Clone failed. Notify ALL listeners. */
p->failure = 1;
if (nlk_test_bit(BROADCAST_SEND_ERROR, sk))
p->delivery_failure = 1;
goto out;
}
if (p->tx_filter && p->tx_filter(sk, p->skb2, p->tx_data)) {
kfree_skb(p->skb2);
p->skb2 = NULL;
goto out;
}
if (sk_filter(sk, p->skb2)) {
kfree_skb(p->skb2);
p->skb2 = NULL;
goto out;
}
NETLINK_CB(p->skb2).nsid = peernet2id(sock_net(sk), p->net);
if (NETLINK_CB(p->skb2).nsid != NETNSA_NSID_NOT_ASSIGNED)
NETLINK_CB(p->skb2).nsid_is_set = true;
val = netlink_broadcast_deliver(sk, p->skb2);
if (val < 0) {
netlink_overrun(sk);
if (nlk_test_bit(BROADCAST_SEND_ERROR, sk))
p->delivery_failure = 1;
} else {
p->congested |= val;
p->delivered = 1;
p->skb2 = NULL;
}
out:
sock_put(sk);
}
int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb,
u32 portid,
u32 group, gfp_t allocation,
netlink_filter_fn filter,
void *filter_data)
{
struct net *net = sock_net(ssk);
struct netlink_broadcast_data info;
struct sock *sk;
skb = netlink_trim(skb, allocation);
info.exclude_sk = ssk;
info.net = net;
info.portid = portid;
info.group = group;
info.failure = 0;
info.delivery_failure = 0;
info.congested = 0;
info.delivered = 0;
info.allocation = allocation;
info.skb = skb;
info.skb2 = NULL;
info.tx_filter = filter;
info.tx_data = filter_data;
/* While we sleep in clone, do not allow to change socket list */
netlink_lock_table();
sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
do_one_broadcast(sk, &info);
consume_skb(skb);
netlink_unlock_table();
if (info.delivery_failure) {
kfree_skb(info.skb2);
return -ENOBUFS;
}
consume_skb(info.skb2);
if (info.delivered) {
if (info.congested && gfpflags_allow_blocking(allocation))
yield();
return 0;
}
return -ESRCH;
}
EXPORT_SYMBOL(netlink_broadcast_filtered);
int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 portid,
u32 group, gfp_t allocation)
{
return netlink_broadcast_filtered(ssk, skb, portid, group, allocation,
NULL, NULL);
}
EXPORT_SYMBOL(netlink_broadcast);
struct netlink_set_err_data {
struct sock *exclude_sk;
u32 portid;
u32 group;
int code;
};
static int do_one_set_err(struct sock *sk, struct netlink_set_err_data *p)
{
struct netlink_sock *nlk = nlk_sk(sk);
int ret = 0;
if (sk == p->exclude_sk)
goto out;
if (!net_eq(sock_net(sk), sock_net(p->exclude_sk)))
goto out;
if (nlk->portid == p->portid || p->group - 1 >= nlk->ngroups ||
!test_bit(p->group - 1, nlk->groups))
goto out;
if (p->code == ENOBUFS && nlk_test_bit(RECV_NO_ENOBUFS, sk)) {
ret = 1;
goto out;
}
WRITE_ONCE(sk->sk_err, p->code);
sk_error_report(sk);
out:
return ret;
}
/**
* netlink_set_err - report error to broadcast listeners
* @ssk: the kernel netlink socket, as returned by netlink_kernel_create()
* @portid: the PORTID of a process that we want to skip (if any)
* @group: the broadcast group that will notice the error
* @code: error code, must be negative (as usual in kernelspace)
*
* This function returns the number of broadcast listeners that have set the
* NETLINK_NO_ENOBUFS socket option.
*/
int netlink_set_err(struct sock *ssk, u32 portid, u32 group, int code)
{
struct netlink_set_err_data info;
unsigned long flags;
struct sock *sk;
int ret = 0;
info.exclude_sk = ssk;
info.portid = portid;
info.group = group;
/* sk->sk_err wants a positive error value */
info.code = -code;
read_lock_irqsave(&nl_table_lock, flags);
sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
ret += do_one_set_err(sk, &info);
read_unlock_irqrestore(&nl_table_lock, flags);
return ret;
}
EXPORT_SYMBOL(netlink_set_err);
/* must be called with netlink table grabbed */
static void netlink_update_socket_mc(struct netlink_sock *nlk,
unsigned int group,
int is_new)
{
int old, new = !!is_new, subscriptions;
old = test_bit(group - 1, nlk->groups);
subscriptions = nlk->subscriptions - old + new;
__assign_bit(group - 1, nlk->groups, new);
netlink_update_subscriptions(&nlk->sk, subscriptions);
netlink_update_listeners(&nlk->sk);
}
static int netlink_setsockopt(struct socket *sock, int level, int optname,
sockptr_t optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
unsigned int val = 0;
int nr = -1;
if (level != SOL_NETLINK)
return -ENOPROTOOPT;
if (optlen >= sizeof(int) &&
copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;
switch (optname) {
case NETLINK_PKTINFO:
nr = NETLINK_F_RECV_PKTINFO;
break;
case NETLINK_ADD_MEMBERSHIP:
case NETLINK_DROP_MEMBERSHIP: {
int err;
if (!netlink_allowed(sock, NL_CFG_F_NONROOT_RECV))
return -EPERM;
err = netlink_realloc_groups(sk);
if (err)
return err;
if (!val || val - 1 >= nlk->ngroups)
return -EINVAL;
if (optname == NETLINK_ADD_MEMBERSHIP && nlk->netlink_bind) {
err = nlk->netlink_bind(sock_net(sk), val);
if (err)
return err;
}
netlink_table_grab();
netlink_update_socket_mc(nlk, val,
optname == NETLINK_ADD_MEMBERSHIP);
netlink_table_ungrab();
if (optname == NETLINK_DROP_MEMBERSHIP && nlk->netlink_unbind)
nlk->netlink_unbind(sock_net(sk), val);
break;
}
case NETLINK_BROADCAST_ERROR:
nr = NETLINK_F_BROADCAST_SEND_ERROR;
break;
case NETLINK_NO_ENOBUFS:
assign_bit(NETLINK_F_RECV_NO_ENOBUFS, &nlk->flags, val);
if (val) {
clear_bit(NETLINK_S_CONGESTED, &nlk->state);
wake_up_interruptible(&nlk->wait);
}
break;
case NETLINK_LISTEN_ALL_NSID:
if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST))
return -EPERM;
nr = NETLINK_F_LISTEN_ALL_NSID;
break;
case NETLINK_CAP_ACK:
nr = NETLINK_F_CAP_ACK;
break;
case NETLINK_EXT_ACK:
nr = NETLINK_F_EXT_ACK;
break;
case NETLINK_GET_STRICT_CHK:
nr = NETLINK_F_STRICT_CHK;
break;
default:
return -ENOPROTOOPT;
}
if (nr >= 0)
assign_bit(nr, &nlk->flags, val);
return 0;
}
static int netlink_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
unsigned int flag;
int len, val;
if (level != SOL_NETLINK)
return -ENOPROTOOPT;
if (get_user(len, optlen))
return -EFAULT;
if (len < 0)
return -EINVAL;
switch (optname) {
case NETLINK_PKTINFO:
flag = NETLINK_F_RECV_PKTINFO;
break;
case NETLINK_BROADCAST_ERROR:
flag = NETLINK_F_BROADCAST_SEND_ERROR;
break;
case NETLINK_NO_ENOBUFS:
flag = NETLINK_F_RECV_NO_ENOBUFS;
break;
case NETLINK_LIST_MEMBERSHIPS: {
int pos, idx, shift, err = 0;
netlink_lock_table();
for (pos = 0; pos * 8 < nlk->ngroups; pos += sizeof(u32)) {
if (len - pos < sizeof(u32))
break;
idx = pos / sizeof(unsigned long);
shift = (pos % sizeof(unsigned long)) * 8;
if (put_user((u32)(nlk->groups[idx] >> shift),
(u32 __user *)(optval + pos))) {
err = -EFAULT;
break;
}
}
if (put_user(ALIGN(BITS_TO_BYTES(nlk->ngroups), sizeof(u32)), optlen))
err = -EFAULT;
netlink_unlock_table();
return err;
}
case NETLINK_LISTEN_ALL_NSID:
flag = NETLINK_F_LISTEN_ALL_NSID;
break;
case NETLINK_CAP_ACK:
flag = NETLINK_F_CAP_ACK;
break;
case NETLINK_EXT_ACK:
flag = NETLINK_F_EXT_ACK;
break;
case NETLINK_GET_STRICT_CHK:
flag = NETLINK_F_STRICT_CHK;
break;
default:
return -ENOPROTOOPT;
}
if (len < sizeof(int))
return -EINVAL;
len = sizeof(int);
val = test_bit(flag, &nlk->flags);
if (put_user(len, optlen) ||
copy_to_user(optval, &val, len))
return -EFAULT;
return 0;
}
static void netlink_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb)
{
struct nl_pktinfo info;
info.group = NETLINK_CB(skb).dst_group;
put_cmsg(msg, SOL_NETLINK, NETLINK_PKTINFO, sizeof(info), &info);
}
static void netlink_cmsg_listen_all_nsid(struct sock *sk, struct msghdr *msg,
struct sk_buff *skb)
{
if (!NETLINK_CB(skb).nsid_is_set)
return;
put_cmsg(msg, SOL_NETLINK, NETLINK_LISTEN_ALL_NSID, sizeof(int),
&NETLINK_CB(skb).nsid);
}
static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
u32 dst_portid;
u32 dst_group;
struct sk_buff *skb;
int err;
struct scm_cookie scm;
u32 netlink_skb_flags = 0;
if (msg->msg_flags & MSG_OOB)
return -EOPNOTSUPP;
if (len == 0) {
pr_warn_once("Zero length message leads to an empty skb\n");
return -ENODATA;
}
err = scm_send(sock, msg, &scm, true);
if (err < 0)
return err;
if (msg->msg_namelen) {
err = -EINVAL;
if (msg->msg_namelen < sizeof(struct sockaddr_nl))
goto out;
if (addr->nl_family != AF_NETLINK)
goto out;
dst_portid = addr->nl_pid;
dst_group = ffs(addr->nl_groups);
err = -EPERM;
if ((dst_group || dst_portid) &&
!netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
goto out;
netlink_skb_flags |= NETLINK_SKB_DST;
} else {
/* Paired with WRITE_ONCE() in netlink_connect() */
dst_portid = READ_ONCE(nlk->dst_portid);
dst_group = READ_ONCE(nlk->dst_group);
}
/* Paired with WRITE_ONCE() in netlink_insert() */
if (!READ_ONCE(nlk->bound)) {
err = netlink_autobind(sock);
if (err)
goto out;
} else {
/* Ensure nlk is hashed and visible. */
smp_rmb();
}
err = -EMSGSIZE;
if (len > sk->sk_sndbuf - 32)
goto out;
err = -ENOBUFS;
skb = netlink_alloc_large_skb(len, dst_group);
if (skb == NULL)
goto out;
NETLINK_CB(skb).portid = nlk->portid;
NETLINK_CB(skb).dst_group = dst_group;
NETLINK_CB(skb).creds = scm.creds;
NETLINK_CB(skb).flags = netlink_skb_flags;
err = -EFAULT;
if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
kfree_skb(skb);
goto out;
}
err = security_netlink_send(sk, skb);
if (err) {
kfree_skb(skb);
goto out;
}
if (dst_group) {
refcount_inc(&skb->users);
netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL);
}
err = netlink_unicast(sk, skb, dst_portid, msg->msg_flags & MSG_DONTWAIT);
out:
scm_destroy(&scm);
return err;
}
static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
int flags)
{
struct scm_cookie scm;
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
size_t copied, max_recvmsg_len;
struct sk_buff *skb, *data_skb;
int err, ret;
if (flags & MSG_OOB)
return -EOPNOTSUPP;
copied = 0;
skb = skb_recv_datagram(sk, flags, &err);
if (skb == NULL)
goto out;
data_skb = skb;
#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
if (unlikely(skb_shinfo(skb)->frag_list)) {
/*
* If this skb has a frag_list, then here that means that we
* will have to use the frag_list skb's data for compat tasks
* and the regular skb's data for normal (non-compat) tasks.
*
* If we need to send the compat skb, assign it to the
* 'data_skb' variable so that it will be used below for data
* copying. We keep 'skb' for everything else, including
* freeing both later.
*/
if (flags & MSG_CMSG_COMPAT)
data_skb = skb_shinfo(skb)->frag_list;
}
#endif
/* Record the max length of recvmsg() calls for future allocations */
max_recvmsg_len = max(READ_ONCE(nlk->max_recvmsg_len), len);
max_recvmsg_len = min_t(size_t, max_recvmsg_len,
SKB_WITH_OVERHEAD(32768));
WRITE_ONCE(nlk->max_recvmsg_len, max_recvmsg_len);
copied = data_skb->len;
if (len < copied) {
msg->msg_flags |= MSG_TRUNC;
copied = len;
}
err = skb_copy_datagram_msg(data_skb, 0, msg, copied);
if (msg->msg_name) {
DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
addr->nl_family = AF_NETLINK;
addr->nl_pad = 0;
addr->nl_pid = NETLINK_CB(skb).portid;
addr->nl_groups = netlink_group_mask(NETLINK_CB(skb).dst_group);
msg->msg_namelen = sizeof(*addr);
}
if (nlk_test_bit(RECV_PKTINFO, sk))
netlink_cmsg_recv_pktinfo(msg, skb);
if (nlk_test_bit(LISTEN_ALL_NSID, sk))
netlink_cmsg_listen_all_nsid(sk, msg, skb);
memset(&scm, 0, sizeof(scm));
scm.creds = *NETLINK_CREDS(skb);
if (flags & MSG_TRUNC)
copied = data_skb->len;
skb_free_datagram(sk, skb);
if (READ_ONCE(nlk->cb_running) &&
atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) {
ret = netlink_dump(sk, false);
if (ret) {
WRITE_ONCE(sk->sk_err, -ret);
sk_error_report(sk);
}
}
scm_recv(sock, msg, &scm, flags);
out:
netlink_rcv_wake(sk);
return err ? : copied;
}
static void netlink_data_ready(struct sock *sk)
{
BUG();
}
/*
* We export these functions to other modules. They provide a
* complete set of kernel non-blocking support for message
* queueing.
*/
struct sock *
__netlink_kernel_create(struct net *net, int unit, struct module *module,
struct netlink_kernel_cfg *cfg)
{
struct socket *sock;
struct sock *sk;
struct netlink_sock *nlk;
struct listeners *listeners = NULL;
unsigned int groups;
BUG_ON(!nl_table);
if (unit < 0 || unit >= MAX_LINKS)
return NULL;
if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock))
return NULL;
if (__netlink_create(net, sock, unit, 1) < 0)
goto out_sock_release_nosk;
sk = sock->sk;
if (!cfg || cfg->groups < 32)
groups = 32;
else
groups = cfg->groups;
listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
if (!listeners)
goto out_sock_release;
sk->sk_data_ready = netlink_data_ready;
if (cfg && cfg->input)
nlk_sk(sk)->netlink_rcv = cfg->input;
if (netlink_insert(sk, 0))
goto out_sock_release;
nlk = nlk_sk(sk);
set_bit(NETLINK_F_KERNEL_SOCKET, &nlk->flags);
netlink_table_grab();
if (!nl_table[unit].registered) {
nl_table[unit].groups = groups;
rcu_assign_pointer(nl_table[unit].listeners, listeners);
nl_table[unit].module = module;
if (cfg) {
nl_table[unit].bind = cfg->bind;
nl_table[unit].unbind = cfg->unbind;
nl_table[unit].release = cfg->release;
nl_table[unit].flags = cfg->flags;
}
nl_table[unit].registered = 1;
} else {
kfree(listeners);
nl_table[unit].registered++;
}
netlink_table_ungrab();
return sk;
out_sock_release:
kfree(listeners);
netlink_kernel_release(sk);
return NULL;
out_sock_release_nosk:
sock_release(sock);
return NULL;
}
EXPORT_SYMBOL(__netlink_kernel_create);
void
netlink_kernel_release(struct sock *sk)
{
if (sk == NULL || sk->sk_socket == NULL)
return;
sock_release(sk->sk_socket);
}
EXPORT_SYMBOL(netlink_kernel_release);
int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
{
struct listeners *new, *old;
struct netlink_table *tbl = &nl_table[sk->sk_protocol];
if (groups < 32)
groups = 32;
if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) {
new = kzalloc(sizeof(*new) + NLGRPSZ(groups), GFP_ATOMIC);
if (!new)
return -ENOMEM;
old = nl_deref_protected(tbl->listeners);
memcpy(new->masks, old->masks, NLGRPSZ(tbl->groups));
rcu_assign_pointer(tbl->listeners, new);
kfree_rcu(old, rcu);
}
tbl->groups = groups;
return 0;
}
/**
* netlink_change_ngroups - change number of multicast groups
*
* This changes the number of multicast groups that are available
* on a certain netlink family. Note that it is not possible to
* change the number of groups to below 32. Also note that it does
* not implicitly call netlink_clear_multicast_users() when the
* number of groups is reduced.
*
* @sk: The kernel netlink socket, as returned by netlink_kernel_create().
* @groups: The new number of groups.
*/
int netlink_change_ngroups(struct sock *sk, unsigned int groups)
{
int err;
netlink_table_grab();
err = __netlink_change_ngroups(sk, groups);
netlink_table_ungrab();
return err;
}
void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
{
struct sock *sk;
struct netlink_table *tbl = &nl_table[ksk->sk_protocol];
sk_for_each_bound(sk, &tbl->mc_list)
netlink_update_socket_mc(nlk_sk(sk), group, 0);
}
struct nlmsghdr *
__nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags)
{
struct nlmsghdr *nlh;
int size = nlmsg_msg_size(len);
nlh = skb_put(skb, NLMSG_ALIGN(size));
nlh->nlmsg_type = type;
nlh->nlmsg_len = size;
nlh->nlmsg_flags = flags;
nlh->nlmsg_pid = portid;
nlh->nlmsg_seq = seq;
if (!__builtin_constant_p(size) || NLMSG_ALIGN(size) - size != 0)
memset(nlmsg_data(nlh) + len, 0, NLMSG_ALIGN(size) - size);
return nlh;
}
EXPORT_SYMBOL(__nlmsg_put);
static size_t
netlink_ack_tlv_len(struct netlink_sock *nlk, int err,
const struct netlink_ext_ack *extack)
{
size_t tlvlen;
if (!extack || !test_bit(NETLINK_F_EXT_ACK, &nlk->flags))
return 0;
tlvlen = 0;
if (extack->_msg)
tlvlen += nla_total_size(strlen(extack->_msg) + 1);
if (extack->cookie_len)
tlvlen += nla_total_size(extack->cookie_len);
/* Following attributes are only reported as error (not warning) */
if (!err)
return tlvlen;
if (extack->bad_attr)
tlvlen += nla_total_size(sizeof(u32));
if (extack->policy)
tlvlen += netlink_policy_dump_attr_size_estimate(extack->policy);
if (extack->miss_type)
tlvlen += nla_total_size(sizeof(u32));
if (extack->miss_nest)
tlvlen += nla_total_size(sizeof(u32));
return tlvlen;
}
static void
netlink_ack_tlv_fill(struct sk_buff *in_skb, struct sk_buff *skb,
const struct nlmsghdr *nlh, int err,
const struct netlink_ext_ack *extack)
{
if (extack->_msg)
WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, extack->_msg));
if (extack->cookie_len)
WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE,
extack->cookie_len, extack->cookie));
if (!err)
return;
if (extack->bad_attr &&
!WARN_ON((u8 *)extack->bad_attr < in_skb->data ||
(u8 *)extack->bad_attr >= in_skb->data + in_skb->len))
WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS,
(u8 *)extack->bad_attr - (const u8 *)nlh));
if (extack->policy)
netlink_policy_dump_write_attr(skb, extack->policy,
NLMSGERR_ATTR_POLICY);
if (extack->miss_type)
WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_TYPE,
extack->miss_type));
if (extack->miss_nest &&
!WARN_ON((u8 *)extack->miss_nest < in_skb->data ||
(u8 *)extack->miss_nest > in_skb->data + in_skb->len))
WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_MISS_NEST,
(u8 *)extack->miss_nest - (const u8 *)nlh));
}
/*
* It looks a bit ugly.
* It would be better to create kernel thread.
*/
static int netlink_dump_done(struct netlink_sock *nlk, struct sk_buff *skb,
struct netlink_callback *cb,
struct netlink_ext_ack *extack)
{
struct nlmsghdr *nlh;
size_t extack_len;
nlh = nlmsg_put_answer(skb, cb, NLMSG_DONE, sizeof(nlk->dump_done_errno),
NLM_F_MULTI | cb->answer_flags);
if (WARN_ON(!nlh))
return -ENOBUFS;
nl_dump_check_consistent(cb, nlh);
memcpy(nlmsg_data(nlh), &nlk->dump_done_errno, sizeof(nlk->dump_done_errno));
extack_len = netlink_ack_tlv_len(nlk, nlk->dump_done_errno, extack);
if (extack_len) {
nlh->nlmsg_flags |= NLM_F_ACK_TLVS;
if (skb_tailroom(skb) >= extack_len) {
netlink_ack_tlv_fill(cb->skb, skb, cb->nlh,
nlk->dump_done_errno, extack);
nlmsg_end(skb, nlh);
}
}
return 0;
}
static int netlink_dump(struct sock *sk, bool lock_taken)
{
struct netlink_sock *nlk = nlk_sk(sk);
struct netlink_ext_ack extack = {};
struct netlink_callback *cb;
struct sk_buff *skb = NULL;
size_t max_recvmsg_len;
struct module *module;
int err = -ENOBUFS;
int alloc_min_size;
int alloc_size;
if (!lock_taken)
mutex_lock(&nlk->nl_cb_mutex);
if (!nlk->cb_running) {
err = -EINVAL;
goto errout_skb;
}
if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
goto errout_skb;
/* NLMSG_GOODSIZE is small to avoid high order allocations being
* required, but it makes sense to _attempt_ a 16K bytes allocation
* to reduce number of system calls on dump operations, if user
* ever provided a big enough buffer.
*/
cb = &nlk->cb;
alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE);
max_recvmsg_len = READ_ONCE(nlk->max_recvmsg_len);
if (alloc_min_size < max_recvmsg_len) {
alloc_size = max_recvmsg_len;
skb = alloc_skb(alloc_size,
(GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) |
__GFP_NOWARN | __GFP_NORETRY);
}
if (!skb) {
alloc_size = alloc_min_size;
skb = alloc_skb(alloc_size, GFP_KERNEL);
}
if (!skb)
goto errout_skb;
/* Trim skb to allocated size. User is expected to provide buffer as
* large as max(min_dump_alloc, 16KiB (mac_recvmsg_len capped at
* netlink_recvmsg())). dump will pack as many smaller messages as
* could fit within the allocated skb. skb is typically allocated
* with larger space than required (could be as much as near 2x the
* requested size with align to next power of 2 approach). Allowing
* dump to use the excess space makes it difficult for a user to have a
* reasonable static buffer based on the expected largest dump of a
* single netdev. The outcome is MSG_TRUNC error.
*/
skb_reserve(skb, skb_tailroom(skb) - alloc_size);
/* Make sure malicious BPF programs can not read unitialized memory
* from skb->head -> skb->data
*/
skb_reset_network_header(skb);
skb_reset_mac_header(skb);
netlink_skb_set_owner_r(skb, sk);
if (nlk->dump_done_errno > 0) {
cb->extack = &extack;
nlk->dump_done_errno = cb->dump(skb, cb);
/* EMSGSIZE plus something already in the skb means
* that there's more to dump but current skb has filled up.
* If the callback really wants to return EMSGSIZE to user space
* it needs to do so again, on the next cb->dump() call,
* without putting data in the skb.
*/
if (nlk->dump_done_errno == -EMSGSIZE && skb->len)
nlk->dump_done_errno = skb->len;
cb->extack = NULL;
}
if (nlk->dump_done_errno > 0 ||
skb_tailroom(skb) < nlmsg_total_size(sizeof(nlk->dump_done_errno))) {
mutex_unlock(&nlk->nl_cb_mutex);
if (sk_filter(sk, skb))
kfree_skb(skb);
else
__netlink_sendskb(sk, skb);
return 0;
}
if (netlink_dump_done(nlk, skb, cb, &extack))
goto errout_skb;
#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
/* frag_list skb's data is used for compat tasks
* and the regular skb's data for normal (non-compat) tasks.
* See netlink_recvmsg().
*/
if (unlikely(skb_shinfo(skb)->frag_list)) {
if (netlink_dump_done(nlk, skb_shinfo(skb)->frag_list, cb, &extack))
goto errout_skb;
}
#endif
if (sk_filter(sk, skb))
kfree_skb(skb);
else
__netlink_sendskb(sk, skb);
if (cb->done)
cb->done(cb);
WRITE_ONCE(nlk->cb_running, false);
module = cb->module;
skb = cb->skb;
mutex_unlock(&nlk->nl_cb_mutex);
module_put(module);
consume_skb(skb);
return 0;
errout_skb:
mutex_unlock(&nlk->nl_cb_mutex);
kfree_skb(skb);
return err;
}
int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
const struct nlmsghdr *nlh,
struct netlink_dump_control *control)
{
struct netlink_callback *cb;
struct netlink_sock *nlk;
struct sock *sk;
int ret;
refcount_inc(&skb->users);
sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid);
if (sk == NULL) {
ret = -ECONNREFUSED;
goto error_free;
}
nlk = nlk_sk(sk);
mutex_lock(&nlk->nl_cb_mutex);
/* A dump is in progress... */
if (nlk->cb_running) {
ret = -EBUSY;
goto error_unlock;
}
/* add reference of module which cb->dump belongs to */
if (!try_module_get(control->module)) {
ret = -EPROTONOSUPPORT;
goto error_unlock;
}
cb = &nlk->cb;
memset(cb, 0, sizeof(*cb));
cb->dump = control->dump;
cb->done = control->done;
cb->nlh = nlh;
cb->data = control->data;
cb->module = control->module;
cb->min_dump_alloc = control->min_dump_alloc;
cb->flags = control->flags;
cb->skb = skb;
cb->strict_check = nlk_test_bit(STRICT_CHK, NETLINK_CB(skb).sk);
if (control->start) {
cb->extack = control->extack;
ret = control->start(cb);
cb->extack = NULL;
if (ret)
goto error_put;
}
WRITE_ONCE(nlk->cb_running, true);
nlk->dump_done_errno = INT_MAX;
ret = netlink_dump(sk, true);
sock_put(sk);
if (ret)
return ret;
/* We successfully started a dump, by returning -EINTR we
* signal not to send ACK even if it was requested.
*/
return -EINTR;
error_put:
module_put(control->module);
error_unlock:
sock_put(sk);
mutex_unlock(&nlk->nl_cb_mutex);
error_free:
kfree_skb(skb);
return ret;
}
EXPORT_SYMBOL(__netlink_dump_start);
void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err,
const struct netlink_ext_ack *extack)
{
struct sk_buff *skb;
struct nlmsghdr *rep;
struct nlmsgerr *errmsg;
size_t payload = sizeof(*errmsg);
struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk);
unsigned int flags = 0;
size_t tlvlen;
/* Error messages get the original request appened, unless the user
* requests to cap the error message, and get extra error data if
* requested.
*/
if (err && !test_bit(NETLINK_F_CAP_ACK, &nlk->flags))
payload += nlmsg_len(nlh);
else
flags |= NLM_F_CAPPED;
tlvlen = netlink_ack_tlv_len(nlk, err, extack);
if (tlvlen)
flags |= NLM_F_ACK_TLVS;
skb = nlmsg_new(payload + tlvlen, GFP_KERNEL);
if (!skb)
goto err_skb;
rep = nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
NLMSG_ERROR, sizeof(*errmsg), flags);
if (!rep)
goto err_bad_put;
errmsg = nlmsg_data(rep);
errmsg->error = err;
errmsg->msg = *nlh;
if (!(flags & NLM_F_CAPPED)) {
if (!nlmsg_append(skb, nlmsg_len(nlh)))
goto err_bad_put;
memcpy(nlmsg_data(&errmsg->msg), nlmsg_data(nlh),
nlmsg_len(nlh));
}
if (tlvlen)
netlink_ack_tlv_fill(in_skb, skb, nlh, err, extack);
nlmsg_end(skb, rep);
nlmsg_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid);
return;
err_bad_put:
nlmsg_free(skb);
err_skb:
WRITE_ONCE(NETLINK_CB(in_skb).sk->sk_err, ENOBUFS);
sk_error_report(NETLINK_CB(in_skb).sk);
}
EXPORT_SYMBOL(netlink_ack);
int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *,
struct nlmsghdr *,
struct netlink_ext_ack *))
{
struct netlink_ext_ack extack;
struct nlmsghdr *nlh;
int err;
while (skb->len >= nlmsg_total_size(0)) {
int msglen;
memset(&extack, 0, sizeof(extack));
nlh = nlmsg_hdr(skb);
err = 0;
if (nlh->nlmsg_len < NLMSG_HDRLEN || skb->len < nlh->nlmsg_len)
return 0;
/* Only requests are handled by the kernel */
if (!(nlh->nlmsg_flags & NLM_F_REQUEST))
goto ack;
/* Skip control messages */
if (nlh->nlmsg_type < NLMSG_MIN_TYPE)
goto ack;
err = cb(skb, nlh, &extack);
if (err == -EINTR)
goto skip;
ack:
if (nlh->nlmsg_flags & NLM_F_ACK || err)
netlink_ack(skb, nlh, err, &extack);
skip:
msglen = NLMSG_ALIGN(nlh->nlmsg_len);
if (msglen > skb->len)
msglen = skb->len;
skb_pull(skb, msglen);
}
return 0;
}
EXPORT_SYMBOL(netlink_rcv_skb);
/**
* nlmsg_notify - send a notification netlink message
* @sk: netlink socket to use
* @skb: notification message
* @portid: destination netlink portid for reports or 0
* @group: destination multicast group or 0
* @report: 1 to report back, 0 to disable
* @flags: allocation flags
*/
int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid,
unsigned int group, int report, gfp_t flags)
{
int err = 0;
if (group) {
int exclude_portid = 0;
if (report) {
refcount_inc(&skb->users);
exclude_portid = portid;
}
/* errors reported via destination sk->sk_err, but propagate
* delivery errors if NETLINK_BROADCAST_ERROR flag is set */
err = nlmsg_multicast(sk, skb, exclude_portid, group, flags);
if (err == -ESRCH)
err = 0;
}
if (report) {
int err2;
err2 = nlmsg_unicast(sk, skb, portid);
if (!err)
err = err2;
}
return err;
}
EXPORT_SYMBOL(nlmsg_notify);
#ifdef CONFIG_PROC_FS
struct nl_seq_iter {
struct seq_net_private p;
struct rhashtable_iter hti;
int link;
};
static void netlink_walk_start(struct nl_seq_iter *iter)
{
rhashtable_walk_enter(&nl_table[iter->link].hash, &iter->hti);
rhashtable_walk_start(&iter->hti);
}
static void netlink_walk_stop(struct nl_seq_iter *iter)
{
rhashtable_walk_stop(&iter->hti);
rhashtable_walk_exit(&iter->hti);
}
static void *__netlink_seq_next(struct seq_file *seq)
{
struct nl_seq_iter *iter = seq->private;
struct netlink_sock *nlk;
do {
for (;;) {
nlk = rhashtable_walk_next(&iter->hti);
if (IS_ERR(nlk)) {
if (PTR_ERR(nlk) == -EAGAIN)
continue;
return nlk;
}
if (nlk)
break;
netlink_walk_stop(iter);
if (++iter->link >= MAX_LINKS)
return NULL;
netlink_walk_start(iter);
}
} while (sock_net(&nlk->sk) != seq_file_net(seq));
return nlk;
}
static void *netlink_seq_start(struct seq_file *seq, loff_t *posp)
__acquires(RCU)
{
struct nl_seq_iter *iter = seq->private;
void *obj = SEQ_START_TOKEN;
loff_t pos;
iter->link = 0;
netlink_walk_start(iter);
for (pos = *posp; pos && obj && !IS_ERR(obj); pos--)
obj = __netlink_seq_next(seq);
return obj;
}
static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
++*pos;
return __netlink_seq_next(seq);
}
static void netlink_native_seq_stop(struct seq_file *seq, void *v)
{
struct nl_seq_iter *iter = seq->private;
if (iter->link >= MAX_LINKS)
return;
netlink_walk_stop(iter);
}
static int netlink_native_seq_show(struct seq_file *seq, void *v)
{
if (v == SEQ_START_TOKEN) {
seq_puts(seq,
"sk Eth Pid Groups "
"Rmem Wmem Dump Locks Drops Inode\n");
} else {
struct sock *s = v;
struct netlink_sock *nlk = nlk_sk(s);
seq_printf(seq, "%pK %-3d %-10u %08x %-8d %-8d %-5d %-8d %-8u %-8lu\n",
s,
s->sk_protocol,
nlk->portid,
nlk->groups ? (u32)nlk->groups[0] : 0,
sk_rmem_alloc_get(s),
sk_wmem_alloc_get(s),
READ_ONCE(nlk->cb_running),
refcount_read(&s->sk_refcnt),
atomic_read(&s->sk_drops),
sock_i_ino(s)
);
}
return 0;
}
#ifdef CONFIG_BPF_SYSCALL
struct bpf_iter__netlink {
__bpf_md_ptr(struct bpf_iter_meta *, meta);
__bpf_md_ptr(struct netlink_sock *, sk);
};
DEFINE_BPF_ITER_FUNC(netlink, struct bpf_iter_meta *meta, struct netlink_sock *sk)
static int netlink_prog_seq_show(struct bpf_prog *prog,
struct bpf_iter_meta *meta,
void *v)
{
struct bpf_iter__netlink ctx;
meta->seq_num--; /* skip SEQ_START_TOKEN */
ctx.meta = meta;
ctx.sk = nlk_sk((struct sock *)v);
return bpf_iter_run_prog(prog, &ctx);
}
static int netlink_seq_show(struct seq_file *seq, void *v)
{
struct bpf_iter_meta meta;
struct bpf_prog *prog;
meta.seq = seq;
prog = bpf_iter_get_info(&meta, false);
if (!prog)
return netlink_native_seq_show(seq, v);
if (v != SEQ_START_TOKEN)
return netlink_prog_seq_show(prog, &meta, v);
return 0;
}
static void netlink_seq_stop(struct seq_file *seq, void *v)
{
struct bpf_iter_meta meta;
struct bpf_prog *prog;
if (!v) {
meta.seq = seq;
prog = bpf_iter_get_info(&meta, true);
if (prog)
(void)netlink_prog_seq_show(prog, &meta, v);
}
netlink_native_seq_stop(seq, v);
}
#else
static int netlink_seq_show(struct seq_file *seq, void *v)
{
return netlink_native_seq_show(seq, v);
}
static void netlink_seq_stop(struct seq_file *seq, void *v)
{
netlink_native_seq_stop(seq, v);
}
#endif
static const struct seq_operations netlink_seq_ops = {
.start = netlink_seq_start,
.next = netlink_seq_next,
.stop = netlink_seq_stop,
.show = netlink_seq_show,
};
#endif
int netlink_register_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_register(&netlink_chain, nb);
}
EXPORT_SYMBOL(netlink_register_notifier);
int netlink_unregister_notifier(struct notifier_block *nb)
{
return blocking_notifier_chain_unregister(&netlink_chain, nb);
}
EXPORT_SYMBOL(netlink_unregister_notifier);
static const struct proto_ops netlink_ops = {
.family = PF_NETLINK,
.owner = THIS_MODULE,
.release = netlink_release,
.bind = netlink_bind,
.connect = netlink_connect,
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = netlink_getname,
.poll = datagram_poll,
.ioctl = netlink_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = netlink_setsockopt,
.getsockopt = netlink_getsockopt,
.sendmsg = netlink_sendmsg,
.recvmsg = netlink_recvmsg,
.mmap = sock_no_mmap,
};
static const struct net_proto_family netlink_family_ops = {
.family = PF_NETLINK,
.create = netlink_create,
.owner = THIS_MODULE, /* for consistency 8) */
};
static int __net_init netlink_net_init(struct net *net)
{
#ifdef CONFIG_PROC_FS
if (!proc_create_net("netlink", 0, net->proc_net, &netlink_seq_ops,
sizeof(struct nl_seq_iter)))
return -ENOMEM;
#endif
return 0;
}
static void __net_exit netlink_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
remove_proc_entry("netlink", net->proc_net);
#endif
}
static void __init netlink_add_usersock_entry(void)
{
struct listeners *listeners;
int groups = 32;
listeners = kzalloc(sizeof(*listeners) + NLGRPSZ(groups), GFP_KERNEL);
if (!listeners)
panic("netlink_add_usersock_entry: Cannot allocate listeners\n");
netlink_table_grab();
nl_table[NETLINK_USERSOCK].groups = groups;
rcu_assign_pointer(nl_table[NETLINK_USERSOCK].listeners, listeners);
nl_table[NETLINK_USERSOCK].module = THIS_MODULE;
nl_table[NETLINK_USERSOCK].registered = 1;
nl_table[NETLINK_USERSOCK].flags = NL_CFG_F_NONROOT_SEND;
netlink_table_ungrab();
}
static struct pernet_operations __net_initdata netlink_net_ops = {
.init = netlink_net_init,
.exit = netlink_net_exit,
};
static inline u32 netlink_hash(const void *data, u32 len, u32 seed)
{
const struct netlink_sock *nlk = data;
struct netlink_compare_arg arg;
netlink_compare_arg_init(&arg, sock_net(&nlk->sk), nlk->portid);
return jhash2((u32 *)&arg, netlink_compare_arg_len / sizeof(u32), seed);
}
static const struct rhashtable_params netlink_rhashtable_params = {
.head_offset = offsetof(struct netlink_sock, node),
.key_len = netlink_compare_arg_len,
.obj_hashfn = netlink_hash,
.obj_cmpfn = netlink_compare,
.automatic_shrinking = true,
};
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
BTF_ID_LIST(btf_netlink_sock_id)
BTF_ID(struct, netlink_sock)
static const struct bpf_iter_seq_info netlink_seq_info = {
.seq_ops = &netlink_seq_ops,
.init_seq_private = bpf_iter_init_seq_net,
.fini_seq_private = bpf_iter_fini_seq_net,
.seq_priv_size = sizeof(struct nl_seq_iter),
};
static struct bpf_iter_reg netlink_reg_info = {
.target = "netlink",
.ctx_arg_info_size = 1,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__netlink, sk),
PTR_TO_BTF_ID_OR_NULL },
},
.seq_info = &netlink_seq_info,
};
static int __init bpf_iter_register(void)
{
netlink_reg_info.ctx_arg_info[0].btf_id = *btf_netlink_sock_id;
return bpf_iter_reg_target(&netlink_reg_info);
}
#endif
static int __init netlink_proto_init(void)
{
int i;
int err = proto_register(&netlink_proto, 0);
if (err != 0)
goto out;
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
err = bpf_iter_register();
if (err)
goto out;
#endif
BUILD_BUG_ON(sizeof(struct netlink_skb_parms) > sizeof_field(struct sk_buff, cb));
nl_table = kcalloc(MAX_LINKS, sizeof(*nl_table), GFP_KERNEL);
if (!nl_table)
goto panic;
for (i = 0; i < MAX_LINKS; i++) {
if (rhashtable_init(&nl_table[i].hash,
&netlink_rhashtable_params) < 0) {
while (--i > 0)
rhashtable_destroy(&nl_table[i].hash);
kfree(nl_table);
goto panic;
}
}
netlink_add_usersock_entry();
sock_register(&netlink_family_ops);
register_pernet_subsys(&netlink_net_ops);
register_pernet_subsys(&netlink_tap_net_ops);
/* The netlink device handler may be needed early. */
rtnetlink_init();
out:
return err;
panic:
panic("netlink_init: Cannot allocate nl_table\n");
}
core_initcall(netlink_proto_init);