mirror of
https://github.com/torvalds/linux.git
synced 2024-12-31 23:31:29 +00:00
bd1060a1d6
In cgroup v1, dealing with cgroup membership was difficult because the number of membership associations was unbounded. As a result, cgroup v1 grew several controllers whose primary purpose is either tagging membership or pulling in configuration knobs from other subsystems so that cgroup membership tests can be avoided. net_cls and net_prio controllers are examples of the latter. They allow configuring network-specific attributes from cgroup side so that network subsystem can avoid testing cgroup membership; unfortunately, these are not only cumbersome but also problematic. Both net_cls and net_prio aren't properly hierarchical. Both inherit configuration from the parent on creation but there's no interaction afterwards. An ancestor doesn't restrict the behavior in its subtree in any way and configuration changes aren't propagated downwards. Especially when combined with cgroup delegation, this is problematic because delegatees can mess up whatever network configuration is implemented at the system level. net_prio would allow the delegatees to set whatever priority value regardless of CAP_NET_ADMIN and net_cls the same for classid. While it is possible to solve these issues from controller side by implementing hierarchical allowable ranges in both controllers, it would involve quite a bit of complexity in the controllers and further obfuscate network configuration as it becomes even more difficult to tell what's actually being configured looking from the network side. While not much can be done for v1 at this point, as membership handling is sane on cgroup v2, it'd be better to make cgroup matching behave like other network matches and classifiers than to introduce further complications. In preparation, this patch updates sock->sk_cgrp_data handling so that it points to the v2 cgroup that sock was created in until either net_prio or net_cls is used. Once either of the two is used, sock->sk_cgrp_data reverts to its previous role of carrying prioidx and classid. 
This is to avoid adding yet another cgroup related field to struct sock. As the mode switching can happen at most once per boot, the switching mechanism is aimed at lowering hot path overhead. It may leak a finite, likely small, number of cgroup refs and report spurious prioidx or classid on switching; however, dynamic updates of prioidx and classid have always been racy and lossy - socks between creation and fd installation are never updated, config changes don't update existing sockets at all, and prioidx may index with dead and recycled cgroup IDs. Non-critical inaccuracies from small race windows won't make any noticeable difference. This patch doesn't make use of the pointer yet. The following patch will implement netfilter match for cgroup2 membership. v2: Use sock_cgroup_data to avoid inflating struct sock w/ another cgroup specific field. v3: Add comments explaining why sock_data_prioidx() and sock_data_classid() use different fallback values. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Daniel Borkmann <daniel@iogearbox.net> Cc: Daniel Wagner <daniel.wagner@bmw-carit.de> CC: Neil Horman <nhorman@tuxdriver.com> Signed-off-by: David S. Miller <davem@davemloft.net>
304 lines
6.8 KiB
C
304 lines
6.8 KiB
C
/*
|
|
* net/core/netprio_cgroup.c Priority Control Group
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License
|
|
* as published by the Free Software Foundation; either version
|
|
* 2 of the License, or (at your option) any later version.
|
|
*
|
|
* Authors: Neil Horman <nhorman@tuxdriver.com>
|
|
*/
|
|
|
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
|
|
|
#include <linux/module.h>
|
|
#include <linux/slab.h>
|
|
#include <linux/types.h>
|
|
#include <linux/string.h>
|
|
#include <linux/errno.h>
|
|
#include <linux/skbuff.h>
|
|
#include <linux/cgroup.h>
|
|
#include <linux/rcupdate.h>
|
|
#include <linux/atomic.h>
|
|
#include <net/rtnetlink.h>
|
|
#include <net/pkt_cls.h>
|
|
#include <net/sock.h>
|
|
#include <net/netprio_cgroup.h>
|
|
|
|
#include <linux/fdtable.h>
|
|
|
|
/*
 * Largest css ID netprio is willing to bring online; cgrp_css_online()
 * fails with -ENOSPC for anything above it.  Capping the ID at 16 bits
 * doesn't lose anything.
 *
 * NOTE(review): the per-net_device priomap arrays in this file are
 * indexed by css->cgroup->id, not css->id - confirm that the two IDs
 * are expected to stay within the same bound.
 */
#define NETPRIO_ID_MAX		USHRT_MAX

/* Size in bytes of the smallest priomap that extend_netdev_table() allocates. */
#define PRIOMAP_MIN_SZ		128
|
|
|
|
/*
|
|
* Extend @dev->priomap so that it's large enough to accommodate
|
|
* @target_idx. @dev->priomap.priomap_len > @target_idx after successful
|
|
* return. Must be called under rtnl lock.
|
|
*/
|
|
static int extend_netdev_table(struct net_device *dev, u32 target_idx)
|
|
{
|
|
struct netprio_map *old, *new;
|
|
size_t new_sz, new_len;
|
|
|
|
/* is the existing priomap large enough? */
|
|
old = rtnl_dereference(dev->priomap);
|
|
if (old && old->priomap_len > target_idx)
|
|
return 0;
|
|
|
|
/*
|
|
* Determine the new size. Let's keep it power-of-two. We start
|
|
* from PRIOMAP_MIN_SZ and double it until it's large enough to
|
|
* accommodate @target_idx.
|
|
*/
|
|
new_sz = PRIOMAP_MIN_SZ;
|
|
while (true) {
|
|
new_len = (new_sz - offsetof(struct netprio_map, priomap)) /
|
|
sizeof(new->priomap[0]);
|
|
if (new_len > target_idx)
|
|
break;
|
|
new_sz *= 2;
|
|
/* overflowed? */
|
|
if (WARN_ON(new_sz < PRIOMAP_MIN_SZ))
|
|
return -ENOSPC;
|
|
}
|
|
|
|
/* allocate & copy */
|
|
new = kzalloc(new_sz, GFP_KERNEL);
|
|
if (!new)
|
|
return -ENOMEM;
|
|
|
|
if (old)
|
|
memcpy(new->priomap, old->priomap,
|
|
old->priomap_len * sizeof(old->priomap[0]));
|
|
|
|
new->priomap_len = new_len;
|
|
|
|
/* install the new priomap */
|
|
rcu_assign_pointer(dev->priomap, new);
|
|
if (old)
|
|
kfree_rcu(old, rcu);
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* netprio_prio - return the effective netprio of a cgroup-net_device pair
|
|
* @css: css part of the target pair
|
|
* @dev: net_device part of the target pair
|
|
*
|
|
* Should be called under RCU read or rtnl lock.
|
|
*/
|
|
static u32 netprio_prio(struct cgroup_subsys_state *css, struct net_device *dev)
|
|
{
|
|
struct netprio_map *map = rcu_dereference_rtnl(dev->priomap);
|
|
int id = css->cgroup->id;
|
|
|
|
if (map && id < map->priomap_len)
|
|
return map->priomap[id];
|
|
return 0;
|
|
}
|
|
|
|
/**
|
|
* netprio_set_prio - set netprio on a cgroup-net_device pair
|
|
* @css: css part of the target pair
|
|
* @dev: net_device part of the target pair
|
|
* @prio: prio to set
|
|
*
|
|
* Set netprio to @prio on @css-@dev pair. Should be called under rtnl
|
|
* lock and may fail under memory pressure for non-zero @prio.
|
|
*/
|
|
static int netprio_set_prio(struct cgroup_subsys_state *css,
|
|
struct net_device *dev, u32 prio)
|
|
{
|
|
struct netprio_map *map;
|
|
int id = css->cgroup->id;
|
|
int ret;
|
|
|
|
/* avoid extending priomap for zero writes */
|
|
map = rtnl_dereference(dev->priomap);
|
|
if (!prio && (!map || map->priomap_len <= id))
|
|
return 0;
|
|
|
|
ret = extend_netdev_table(dev, id);
|
|
if (ret)
|
|
return ret;
|
|
|
|
map = rtnl_dereference(dev->priomap);
|
|
map->priomap[id] = prio;
|
|
return 0;
|
|
}
|
|
|
|
static struct cgroup_subsys_state *
|
|
cgrp_css_alloc(struct cgroup_subsys_state *parent_css)
|
|
{
|
|
struct cgroup_subsys_state *css;
|
|
|
|
css = kzalloc(sizeof(*css), GFP_KERNEL);
|
|
if (!css)
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
return css;
|
|
}
|
|
|
|
static int cgrp_css_online(struct cgroup_subsys_state *css)
|
|
{
|
|
struct cgroup_subsys_state *parent_css = css->parent;
|
|
struct net_device *dev;
|
|
int ret = 0;
|
|
|
|
if (css->id > NETPRIO_ID_MAX)
|
|
return -ENOSPC;
|
|
|
|
if (!parent_css)
|
|
return 0;
|
|
|
|
rtnl_lock();
|
|
/*
|
|
* Inherit prios from the parent. As all prios are set during
|
|
* onlining, there is no need to clear them on offline.
|
|
*/
|
|
for_each_netdev(&init_net, dev) {
|
|
u32 prio = netprio_prio(parent_css, dev);
|
|
|
|
ret = netprio_set_prio(css, dev, prio);
|
|
if (ret)
|
|
break;
|
|
}
|
|
rtnl_unlock();
|
|
return ret;
|
|
}
|
|
|
|
/* css_free callback: release the css that cgrp_css_alloc() allocated. */
static void cgrp_css_free(struct cgroup_subsys_state *css)
{
	kfree(css);
}
|
|
|
|
static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft)
|
|
{
|
|
return css->cgroup->id;
|
|
}
|
|
|
|
static int read_priomap(struct seq_file *sf, void *v)
|
|
{
|
|
struct net_device *dev;
|
|
|
|
rcu_read_lock();
|
|
for_each_netdev_rcu(&init_net, dev)
|
|
seq_printf(sf, "%s %u\n", dev->name,
|
|
netprio_prio(seq_css(sf), dev));
|
|
rcu_read_unlock();
|
|
return 0;
|
|
}
|
|
|
|
/*
 * "ifpriomap" write handler.  @buf must hold "<device name> <prio>".
 *
 * Returns @nbytes on success, -EINVAL when @buf cannot be parsed,
 * -ENODEV when init_net has no device of that name, or the error from
 * netprio_set_prio().
 */
static ssize_t write_priomap(struct kernfs_open_file *of,
			     char *buf, size_t nbytes, loff_t off)
{
	char ifname[IFNAMSIZ + 1];
	struct net_device *dev;
	u32 prio;
	int err;

	/* the name field is width-limited to IFNAMSIZ characters */
	if (sscanf(buf, "%"__stringify(IFNAMSIZ)"s %u", ifname, &prio) != 2)
		return -EINVAL;

	/* takes a reference on the device; dropped by dev_put() below */
	dev = dev_get_by_name(&init_net, ifname);
	if (!dev)
		return -ENODEV;

	/*
	 * NOTE(review): presumably this is the "net_prio is used" switch
	 * described in the changelog - confirm against its definition.
	 */
	cgroup_sk_alloc_disable();

	rtnl_lock();
	err = netprio_set_prio(of_css(of), dev, prio);
	rtnl_unlock();

	dev_put(dev);

	if (err)
		return err;
	return nbytes;
}
|
|
|
|
static int update_netprio(const void *v, struct file *file, unsigned n)
|
|
{
|
|
int err;
|
|
struct socket *sock = sock_from_file(file, &err);
|
|
if (sock) {
|
|
spin_lock(&cgroup_sk_update_lock);
|
|
sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
|
|
(unsigned long)v);
|
|
spin_unlock(&cgroup_sk_update_lock);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static void net_prio_attach(struct cgroup_subsys_state *css,
|
|
struct cgroup_taskset *tset)
|
|
{
|
|
struct task_struct *p;
|
|
void *v = (void *)(unsigned long)css->cgroup->id;
|
|
|
|
cgroup_taskset_for_each(p, tset) {
|
|
task_lock(p);
|
|
iterate_fd(p->files, 0, update_netprio, v);
|
|
task_unlock(p);
|
|
}
|
|
}
|
|
|
|
static struct cftype ss_files[] = {
|
|
{
|
|
.name = "prioidx",
|
|
.read_u64 = read_prioidx,
|
|
},
|
|
{
|
|
.name = "ifpriomap",
|
|
.seq_show = read_priomap,
|
|
.write = write_priomap,
|
|
},
|
|
{ } /* terminate */
|
|
};
|
|
|
|
struct cgroup_subsys net_prio_cgrp_subsys = {
|
|
.css_alloc = cgrp_css_alloc,
|
|
.css_online = cgrp_css_online,
|
|
.css_free = cgrp_css_free,
|
|
.attach = net_prio_attach,
|
|
.legacy_cftypes = ss_files,
|
|
};
|
|
|
|
static int netprio_device_event(struct notifier_block *unused,
|
|
unsigned long event, void *ptr)
|
|
{
|
|
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
|
|
struct netprio_map *old;
|
|
|
|
/*
|
|
* Note this is called with rtnl_lock held so we have update side
|
|
* protection on our rcu assignments
|
|
*/
|
|
|
|
switch (event) {
|
|
case NETDEV_UNREGISTER:
|
|
old = rtnl_dereference(dev->priomap);
|
|
RCU_INIT_POINTER(dev->priomap, NULL);
|
|
if (old)
|
|
kfree_rcu(old, rcu);
|
|
break;
|
|
}
|
|
return NOTIFY_DONE;
|
|
}
|
|
|
|
static struct notifier_block netprio_device_notifier = {
|
|
.notifier_call = netprio_device_event
|
|
};
|
|
|
|
static int __init init_cgroup_netprio(void)
|
|
{
|
|
register_netdevice_notifier(&netprio_device_notifier);
|
|
return 0;
|
|
}
|
|
|
|
subsys_initcall(init_cgroup_netprio);
|
|
MODULE_LICENSE("GPL v2");
|