forked from Minki/linux
Merge branch 'for-4.6-ns' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup namespace support from Tejun Heo: "These are changes to implement namespace support for cgroup which has been pending for quite some time now. It is very straight-forward and only affects what part of cgroup hierarchies are visible. After unsharing, mounting a cgroup fs will be scoped to the cgroups the task belonged to at the time of unsharing and the cgroup paths exposed to userland would be adjusted accordingly" * 'for-4.6-ns' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: cgroup: fix and restructure error handling in copy_cgroup_ns() cgroup: fix alloc_cgroup_ns() error handling in copy_cgroup_ns() Add FS_USERNS_FLAG to cgroup fs cgroup: Add documentation for cgroup namespaces cgroup: mount cgroupns-root when inside non-init cgroupns kernfs: define kernfs_node_dentry cgroup: cgroup namespace setns support cgroup: introduce cgroup namespaces sched: new clone flag CLONE_NEWCGROUP for cgroup namespace kernfs: Add API to generate relative kernfs path
This commit is contained in:
commit
5518f66b5a
@ -47,6 +47,11 @@ CONTENTS
|
||||
5-3. IO
|
||||
5-3-1. IO Interface Files
|
||||
5-3-2. Writeback
|
||||
6. Namespace
|
||||
6-1. Basics
|
||||
6-2. The Root and Views
|
||||
6-3. Migration and setns(2)
|
||||
6-4. Interaction with Other Namespaces
|
||||
P. Information on Kernel Programming
|
||||
P-1. Filesystem Support for Writeback
|
||||
D. Deprecated v1 Core Features
|
||||
@ -1114,6 +1119,148 @@ writeback as follows.
|
||||
vm.dirty[_background]_ratio.
|
||||
|
||||
|
||||
6. Namespace
|
||||
|
||||
6-1. Basics
|
||||
|
||||
cgroup namespace provides a mechanism to virtualize the view of the
|
||||
"/proc/$PID/cgroup" file and cgroup mounts. The CLONE_NEWCGROUP clone
|
||||
flag can be used with clone(2) and unshare(2) to create a new cgroup
|
||||
namespace. The process running inside the cgroup namespace will have
|
||||
its "/proc/$PID/cgroup" output restricted to cgroupns root. The
|
||||
cgroupns root is the cgroup of the process at the time of creation of
|
||||
the cgroup namespace.
|
||||
|
||||
Without cgroup namespace, the "/proc/$PID/cgroup" file shows the
|
||||
complete path of the cgroup of a process. In a container setup where
|
||||
a set of cgroups and namespaces are intended to isolate processes the
|
||||
"/proc/$PID/cgroup" file may leak potential system level information
|
||||
to the isolated processes. For example:
|
||||
|
||||
# cat /proc/self/cgroup
|
||||
0::/batchjobs/container_id1
|
||||
|
||||
The path '/batchjobs/container_id1' can be considered as system-data
|
||||
and undesirable to expose to the isolated processes. cgroup namespace
|
||||
can be used to restrict visibility of this path. For example, before
|
||||
creating a cgroup namespace, one would see:
|
||||
|
||||
# ls -l /proc/self/ns/cgroup
|
||||
lrwxrwxrwx 1 root root 0 2014-07-15 10:37 /proc/self/ns/cgroup -> cgroup:[4026531835]
|
||||
# cat /proc/self/cgroup
|
||||
0::/batchjobs/container_id1
|
||||
|
||||
After unsharing a new namespace, the view changes.
|
||||
|
||||
# ls -l /proc/self/ns/cgroup
|
||||
lrwxrwxrwx 1 root root 0 2014-07-15 10:35 /proc/self/ns/cgroup -> cgroup:[4026532183]
|
||||
# cat /proc/self/cgroup
|
||||
0::/
|
||||
|
||||
When some thread from a multi-threaded process unshares its cgroup
|
||||
namespace, the new cgroupns gets applied to the entire process (all
|
||||
the threads). This is natural for the v2 hierarchy; however, for the
|
||||
legacy hierarchies, this may be unexpected.
|
||||
|
||||
A cgroup namespace is alive as long as there are processes inside or
|
||||
mounts pinning it. When the last usage goes away, the cgroup
|
||||
namespace is destroyed. The cgroupns root and the actual cgroups
|
||||
remain.
|
||||
|
||||
|
||||
6-2. The Root and Views
|
||||
|
||||
The 'cgroupns root' for a cgroup namespace is the cgroup in which the
|
||||
process calling unshare(2) is running. For example, if a process in
|
||||
/batchjobs/container_id1 cgroup calls unshare, cgroup
|
||||
/batchjobs/container_id1 becomes the cgroupns root. For the
|
||||
init_cgroup_ns, this is the real root ('/') cgroup.
|
||||
|
||||
The cgroupns root cgroup does not change even if the namespace creator
|
||||
process later moves to a different cgroup.
|
||||
|
||||
# ~/unshare -c # unshare cgroupns in some cgroup
|
||||
# cat /proc/self/cgroup
|
||||
0::/
|
||||
# mkdir sub_cgrp_1
|
||||
# echo 0 > sub_cgrp_1/cgroup.procs
|
||||
# cat /proc/self/cgroup
|
||||
0::/sub_cgrp_1
|
||||
|
||||
Each process gets its namespace-specific view of "/proc/$PID/cgroup".
|
||||
|
||||
Processes running inside the cgroup namespace will be able to see
|
||||
cgroup paths (in /proc/self/cgroup) only inside their root cgroup.
|
||||
From within an unshared cgroupns:
|
||||
|
||||
# sleep 100000 &
|
||||
[1] 7353
|
||||
# echo 7353 > sub_cgrp_1/cgroup.procs
|
||||
# cat /proc/7353/cgroup
|
||||
0::/sub_cgrp_1
|
||||
|
||||
From the initial cgroup namespace, the real cgroup path will be
|
||||
visible:
|
||||
|
||||
$ cat /proc/7353/cgroup
|
||||
0::/batchjobs/container_id1/sub_cgrp_1
|
||||
|
||||
From a sibling cgroup namespace (that is, a namespace rooted at a
|
||||
different cgroup), the cgroup path relative to its own cgroup
|
||||
namespace root will be shown. For instance, if PID 7353's cgroup
|
||||
namespace root is at '/batchjobs/container_id2', then it will see
|
||||
|
||||
# cat /proc/7353/cgroup
|
||||
0::/../container_id2/sub_cgrp_1
|
||||
|
||||
Note that the relative path always starts with '/' to indicate that
|
||||
it is relative to the cgroup namespace root of the caller.
|
||||
|
||||
|
||||
6-3. Migration and setns(2)
|
||||
|
||||
Processes inside a cgroup namespace can move into and out of the
|
||||
namespace root if they have proper access to external cgroups. For
|
||||
example, from inside a namespace with cgroupns root at
|
||||
/batchjobs/container_id1, and assuming that the global hierarchy is
|
||||
still accessible inside cgroupns:
|
||||
|
||||
# cat /proc/7353/cgroup
|
||||
0::/sub_cgrp_1
|
||||
# echo 7353 > batchjobs/container_id2/cgroup.procs
|
||||
# cat /proc/7353/cgroup
|
||||
0::/../container_id2
|
||||
|
||||
Note that this kind of setup is not encouraged. A task inside cgroup
|
||||
namespace should only be exposed to its own cgroupns hierarchy.
|
||||
|
||||
setns(2) to another cgroup namespace is allowed when:
|
||||
|
||||
(a) the process has CAP_SYS_ADMIN against its current user namespace
|
||||
(b) the process has CAP_SYS_ADMIN against the target cgroup
|
||||
namespace's userns
|
||||
|
||||
No implicit cgroup changes happen with attaching to another cgroup
|
||||
namespace. It is expected that someone moves the attaching
|
||||
process under the target cgroup namespace root.
|
||||
|
||||
|
||||
6-4. Interaction with Other Namespaces
|
||||
|
||||
Namespace specific cgroup hierarchy can be mounted by a process
|
||||
running inside a non-init cgroup namespace.
|
||||
|
||||
# mount -t cgroup2 none $MOUNT_POINT
|
||||
|
||||
This will mount the unified cgroup hierarchy with cgroupns root as the
|
||||
filesystem root. The process needs CAP_SYS_ADMIN against its user and
|
||||
mount namespaces.
|
||||
|
||||
The virtualization of /proc/self/cgroup file combined with restricting
|
||||
the view of cgroup hierarchy by namespace-private cgroupfs mount
|
||||
provides a properly isolated cgroup view inside the container.
|
||||
|
||||
|
||||
P. Information on Kernel Programming
|
||||
|
||||
This section contains kernel programming information in the areas
|
||||
|
191
fs/kernfs/dir.c
191
fs/kernfs/dir.c
@ -44,28 +44,122 @@ static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
|
||||
return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
|
||||
}
|
||||
|
||||
static char * __must_check kernfs_path_locked(struct kernfs_node *kn, char *buf,
|
||||
size_t buflen)
|
||||
/* kernfs_depth - compute depth from @from to @to */
|
||||
static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to)
|
||||
{
|
||||
char *p = buf + buflen;
|
||||
int len;
|
||||
size_t depth = 0;
|
||||
|
||||
*--p = '\0';
|
||||
while (to->parent && to != from) {
|
||||
depth++;
|
||||
to = to->parent;
|
||||
}
|
||||
return depth;
|
||||
}
|
||||
|
||||
do {
|
||||
len = strlen(kn->name);
|
||||
if (p - buf < len + 1) {
|
||||
buf[0] = '\0';
|
||||
p = NULL;
|
||||
break;
|
||||
}
|
||||
p -= len;
|
||||
memcpy(p, kn->name, len);
|
||||
*--p = '/';
|
||||
kn = kn->parent;
|
||||
} while (kn && kn->parent);
|
||||
static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
|
||||
struct kernfs_node *b)
|
||||
{
|
||||
size_t da, db;
|
||||
struct kernfs_root *ra = kernfs_root(a), *rb = kernfs_root(b);
|
||||
|
||||
return p;
|
||||
if (ra != rb)
|
||||
return NULL;
|
||||
|
||||
da = kernfs_depth(ra->kn, a);
|
||||
db = kernfs_depth(rb->kn, b);
|
||||
|
||||
while (da > db) {
|
||||
a = a->parent;
|
||||
da--;
|
||||
}
|
||||
while (db > da) {
|
||||
b = b->parent;
|
||||
db--;
|
||||
}
|
||||
|
||||
/* worst case b and a will be the same at root */
|
||||
while (b != a) {
|
||||
b = b->parent;
|
||||
a = a->parent;
|
||||
}
|
||||
|
||||
return a;
|
||||
}
|
||||
|
||||
/**
|
||||
* kernfs_path_from_node_locked - find a pseudo-absolute path to @kn_to,
|
||||
* where kn_from is treated as root of the path.
|
||||
* @kn_from: kernfs node which should be treated as root for the path
|
||||
* @kn_to: kernfs node to which path is needed
|
||||
* @buf: buffer to copy the path into
|
||||
* @buflen: size of @buf
|
||||
*
|
||||
* We need to handle couple of scenarios here:
|
||||
* [1] when @kn_from is an ancestor of @kn_to at some level
|
||||
* kn_from: /n1/n2/n3
|
||||
* kn_to: /n1/n2/n3/n4/n5
|
||||
* result: /n4/n5
|
||||
*
|
||||
* [2] when @kn_from is on a different hierarchy and we need to find common
|
||||
* ancestor between @kn_from and @kn_to.
|
||||
* kn_from: /n1/n2/n3/n4
|
||||
* kn_to: /n1/n2/n5
|
||||
* result: /../../n5
|
||||
* OR
|
||||
* kn_from: /n1/n2/n3/n4/n5 [depth=5]
|
||||
* kn_to: /n1/n2/n3 [depth=3]
|
||||
* result: /../..
|
||||
*
|
||||
* return value: length of the string. If greater than buflen,
|
||||
* then contents of buf are undefined. On error, -1 is returned.
|
||||
*/
|
||||
static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
|
||||
struct kernfs_node *kn_from,
|
||||
char *buf, size_t buflen)
|
||||
{
|
||||
struct kernfs_node *kn, *common;
|
||||
const char parent_str[] = "/..";
|
||||
size_t depth_from, depth_to, len = 0, nlen = 0;
|
||||
char *p;
|
||||
int i;
|
||||
|
||||
if (!kn_from)
|
||||
kn_from = kernfs_root(kn_to)->kn;
|
||||
|
||||
if (kn_from == kn_to)
|
||||
return strlcpy(buf, "/", buflen);
|
||||
|
||||
common = kernfs_common_ancestor(kn_from, kn_to);
|
||||
if (WARN_ON(!common))
|
||||
return -1;
|
||||
|
||||
depth_to = kernfs_depth(common, kn_to);
|
||||
depth_from = kernfs_depth(common, kn_from);
|
||||
|
||||
if (buf)
|
||||
buf[0] = '\0';
|
||||
|
||||
for (i = 0; i < depth_from; i++)
|
||||
len += strlcpy(buf + len, parent_str,
|
||||
len < buflen ? buflen - len : 0);
|
||||
|
||||
/* Calculate how many bytes we need for the rest */
|
||||
for (kn = kn_to; kn != common; kn = kn->parent)
|
||||
nlen += strlen(kn->name) + 1;
|
||||
|
||||
if (len + nlen >= buflen)
|
||||
return len + nlen;
|
||||
|
||||
p = buf + len + nlen;
|
||||
*p = '\0';
|
||||
for (kn = kn_to; kn != common; kn = kn->parent) {
|
||||
nlen = strlen(kn->name);
|
||||
p -= nlen;
|
||||
memcpy(p, kn->name, nlen);
|
||||
*(--p) = '/';
|
||||
}
|
||||
|
||||
return len + nlen;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -114,6 +208,34 @@ size_t kernfs_path_len(struct kernfs_node *kn)
|
||||
return len;
|
||||
}
|
||||
|
||||
/**
|
||||
* kernfs_path_from_node - build path of node @to relative to @from.
|
||||
* @from: parent kernfs_node relative to which we need to build the path
|
||||
* @to: kernfs_node of interest
|
||||
* @buf: buffer to copy @to's path into
|
||||
* @buflen: size of @buf
|
||||
*
|
||||
* Builds @to's path relative to @from in @buf. @from and @to must
|
||||
* be on the same kernfs-root. If @from is not parent of @to, then a relative
|
||||
* path (which includes '..'s) as needed to reach from @from to @to is
|
||||
* returned.
|
||||
*
|
||||
* If @buf isn't long enough, the return value will be greater than @buflen
|
||||
* and @buf contents are undefined.
|
||||
*/
|
||||
int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from,
|
||||
char *buf, size_t buflen)
|
||||
{
|
||||
unsigned long flags;
|
||||
int ret;
|
||||
|
||||
spin_lock_irqsave(&kernfs_rename_lock, flags);
|
||||
ret = kernfs_path_from_node_locked(to, from, buf, buflen);
|
||||
spin_unlock_irqrestore(&kernfs_rename_lock, flags);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kernfs_path_from_node);
|
||||
|
||||
/**
|
||||
* kernfs_path - build full path of a given node
|
||||
* @kn: kernfs_node of interest
|
||||
@ -127,13 +249,12 @@ size_t kernfs_path_len(struct kernfs_node *kn)
|
||||
*/
|
||||
char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen)
|
||||
{
|
||||
unsigned long flags;
|
||||
char *p;
|
||||
int ret;
|
||||
|
||||
spin_lock_irqsave(&kernfs_rename_lock, flags);
|
||||
p = kernfs_path_locked(kn, buf, buflen);
|
||||
spin_unlock_irqrestore(&kernfs_rename_lock, flags);
|
||||
return p;
|
||||
ret = kernfs_path_from_node(kn, NULL, buf, buflen);
|
||||
if (ret < 0 || ret >= buflen)
|
||||
return NULL;
|
||||
return buf;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kernfs_path);
|
||||
|
||||
@ -164,17 +285,25 @@ void pr_cont_kernfs_name(struct kernfs_node *kn)
|
||||
void pr_cont_kernfs_path(struct kernfs_node *kn)
|
||||
{
|
||||
unsigned long flags;
|
||||
char *p;
|
||||
int sz;
|
||||
|
||||
spin_lock_irqsave(&kernfs_rename_lock, flags);
|
||||
|
||||
p = kernfs_path_locked(kn, kernfs_pr_cont_buf,
|
||||
sizeof(kernfs_pr_cont_buf));
|
||||
if (p)
|
||||
pr_cont("%s", p);
|
||||
else
|
||||
pr_cont("<name too long>");
|
||||
sz = kernfs_path_from_node_locked(kn, NULL, kernfs_pr_cont_buf,
|
||||
sizeof(kernfs_pr_cont_buf));
|
||||
if (sz < 0) {
|
||||
pr_cont("(error)");
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (sz >= sizeof(kernfs_pr_cont_buf)) {
|
||||
pr_cont("(name too long)");
|
||||
goto out;
|
||||
}
|
||||
|
||||
pr_cont("%s", kernfs_pr_cont_buf);
|
||||
|
||||
out:
|
||||
spin_unlock_irqrestore(&kernfs_rename_lock, flags);
|
||||
}
|
||||
|
||||
|
@ -14,6 +14,7 @@
|
||||
#include <linux/magic.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/namei.h>
|
||||
|
||||
#include "kernfs-internal.h"
|
||||
|
||||
@ -62,6 +63,74 @@ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* find the next ancestor in the path down to @child, where @parent was the
|
||||
* ancestor whose descendant we want to find.
|
||||
*
|
||||
* Say the path is /a/b/c/d. @child is d, @parent is NULL. We return the root
|
||||
* node. If @parent is b, then we return the node for c.
|
||||
* Passing in d as @parent is not ok.
|
||||
*/
|
||||
static struct kernfs_node *find_next_ancestor(struct kernfs_node *child,
|
||||
struct kernfs_node *parent)
|
||||
{
|
||||
if (child == parent) {
|
||||
pr_crit_once("BUG in find_next_ancestor: called with parent == child");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
while (child->parent != parent) {
|
||||
if (!child->parent)
|
||||
return NULL;
|
||||
child = child->parent;
|
||||
}
|
||||
|
||||
return child;
|
||||
}
|
||||
|
||||
/**
|
||||
* kernfs_node_dentry - get a dentry for the given kernfs_node
|
||||
* @kn: kernfs_node for which a dentry is needed
|
||||
* @sb: the kernfs super_block
|
||||
*/
|
||||
struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
|
||||
struct super_block *sb)
|
||||
{
|
||||
struct dentry *dentry;
|
||||
struct kernfs_node *knparent = NULL;
|
||||
|
||||
BUG_ON(sb->s_op != &kernfs_sops);
|
||||
|
||||
dentry = dget(sb->s_root);
|
||||
|
||||
/* Check if this is the root kernfs_node */
|
||||
if (!kn->parent)
|
||||
return dentry;
|
||||
|
||||
knparent = find_next_ancestor(kn, NULL);
|
||||
if (WARN_ON(!knparent))
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
do {
|
||||
struct dentry *dtmp;
|
||||
struct kernfs_node *kntmp;
|
||||
|
||||
if (kn == knparent)
|
||||
return dentry;
|
||||
kntmp = find_next_ancestor(kn, knparent);
|
||||
if (WARN_ON(!kntmp))
|
||||
return ERR_PTR(-EINVAL);
|
||||
mutex_lock(&d_inode(dentry)->i_mutex);
|
||||
dtmp = lookup_one_len(kntmp->name, dentry, strlen(kntmp->name));
|
||||
mutex_unlock(&d_inode(dentry)->i_mutex);
|
||||
dput(dentry);
|
||||
if (IS_ERR(dtmp))
|
||||
return dtmp;
|
||||
knparent = kntmp;
|
||||
dentry = dtmp;
|
||||
} while (true);
|
||||
}
|
||||
|
||||
static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
|
||||
{
|
||||
struct kernfs_super_info *info = kernfs_info(sb);
|
||||
|
@ -28,6 +28,9 @@ static const struct proc_ns_operations *ns_entries[] = {
|
||||
&userns_operations,
|
||||
#endif
|
||||
&mntns_operations,
|
||||
#ifdef CONFIG_CGROUPS
|
||||
&cgroupns_operations,
|
||||
#endif
|
||||
};
|
||||
|
||||
static const char *proc_ns_get_link(struct dentry *dentry,
|
||||
|
@ -17,6 +17,11 @@
|
||||
#include <linux/seq_file.h>
|
||||
#include <linux/kernfs.h>
|
||||
#include <linux/jump_label.h>
|
||||
#include <linux/nsproxy.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/ns_common.h>
|
||||
#include <linux/nsproxy.h>
|
||||
#include <linux/user_namespace.h>
|
||||
|
||||
#include <linux/cgroup-defs.h>
|
||||
|
||||
@ -611,4 +616,48 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {}
|
||||
|
||||
#endif /* CONFIG_CGROUP_DATA */
|
||||
|
||||
/*
 * A cgroup namespace.  Lifetime is governed by @count via
 * get_cgroup_ns()/put_cgroup_ns(); the final put calls free_cgroup_ns().
 */
struct cgroup_namespace {
|
||||
/* reference count; incremented by get_cgroup_ns(), dropped by put_cgroup_ns() */
atomic_t count;
|
||||
/* common namespace part (ops and inode number) */
struct ns_common ns;
|
||||
/* user namespace used for CAP_SYS_ADMIN checks against this cgroup namespace */
struct user_namespace *user_ns;
|
||||
/* css_set from which the namespace's root cgroup is looked up per hierarchy */
struct css_set *root_cset;
|
||||
};
|
||||
|
||||
extern struct cgroup_namespace init_cgroup_ns;
|
||||
|
||||
#ifdef CONFIG_CGROUPS
|
||||
|
||||
void free_cgroup_ns(struct cgroup_namespace *ns);
|
||||
|
||||
struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
|
||||
struct user_namespace *user_ns,
|
||||
struct cgroup_namespace *old_ns);
|
||||
|
||||
char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
|
||||
struct cgroup_namespace *ns);
|
||||
|
||||
#else /* !CONFIG_CGROUPS */
|
||||
|
||||
static inline void free_cgroup_ns(struct cgroup_namespace *ns) { }
|
||||
/*
 * !CONFIG_CGROUPS stub: no new namespace is ever created; @old_ns is
 * handed back unchanged and @flags/@user_ns are ignored.
 */
static inline struct cgroup_namespace *
|
||||
copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns,
|
||||
struct cgroup_namespace *old_ns)
|
||||
{
|
||||
return old_ns;
|
||||
}
|
||||
|
||||
#endif /* !CONFIG_CGROUPS */
|
||||
|
||||
static inline void get_cgroup_ns(struct cgroup_namespace *ns)
|
||||
{
|
||||
if (ns)
|
||||
atomic_inc(&ns->count);
|
||||
}
|
||||
|
||||
static inline void put_cgroup_ns(struct cgroup_namespace *ns)
|
||||
{
|
||||
if (ns && atomic_dec_and_test(&ns->count))
|
||||
free_cgroup_ns(ns);
|
||||
}
|
||||
|
||||
#endif /* _LINUX_CGROUP_H */
|
||||
|
@ -267,8 +267,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
|
||||
|
||||
int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen);
|
||||
size_t kernfs_path_len(struct kernfs_node *kn);
|
||||
char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
|
||||
size_t buflen);
|
||||
/*
 * Build the path of @to relative to @from into @buf.  Parameter names
 * match the definition: the first argument is the node of interest and
 * the second is the node treated as the root of the path.
 */
int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from,
			  char *buf, size_t buflen);
|
||||
char *kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen);
|
||||
void pr_cont_kernfs_name(struct kernfs_node *kn);
|
||||
void pr_cont_kernfs_path(struct kernfs_node *kn);
|
||||
struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn);
|
||||
@ -283,6 +284,8 @@ struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry);
|
||||
struct kernfs_root *kernfs_root_from_sb(struct super_block *sb);
|
||||
struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn);
|
||||
|
||||
struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
|
||||
struct super_block *sb);
|
||||
struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
|
||||
unsigned int flags, void *priv);
|
||||
void kernfs_destroy_root(struct kernfs_root *root);
|
||||
@ -338,8 +341,8 @@ static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
|
||||
static inline size_t kernfs_path_len(struct kernfs_node *kn)
|
||||
{ return 0; }
|
||||
|
||||
static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
|
||||
size_t buflen)
|
||||
static inline char *kernfs_path(struct kernfs_node *kn, char *buf,
|
||||
size_t buflen)
|
||||
{ return NULL; }
|
||||
|
||||
static inline void pr_cont_kernfs_name(struct kernfs_node *kn) { }
|
||||
|
@ -8,6 +8,7 @@ struct mnt_namespace;
|
||||
struct uts_namespace;
|
||||
struct ipc_namespace;
|
||||
struct pid_namespace;
|
||||
struct cgroup_namespace;
|
||||
struct fs_struct;
|
||||
|
||||
/*
|
||||
@ -33,6 +34,7 @@ struct nsproxy {
|
||||
struct mnt_namespace *mnt_ns;
|
||||
struct pid_namespace *pid_ns_for_children;
|
||||
struct net *net_ns;
|
||||
struct cgroup_namespace *cgroup_ns;
|
||||
};
|
||||
extern struct nsproxy init_nsproxy;
|
||||
|
||||
|
@ -9,6 +9,8 @@
|
||||
struct pid_namespace;
|
||||
struct nsproxy;
|
||||
struct path;
|
||||
struct task_struct;
|
||||
struct inode;
|
||||
|
||||
struct proc_ns_operations {
|
||||
const char *name;
|
||||
@ -24,6 +26,7 @@ extern const struct proc_ns_operations ipcns_operations;
|
||||
extern const struct proc_ns_operations pidns_operations;
|
||||
extern const struct proc_ns_operations userns_operations;
|
||||
extern const struct proc_ns_operations mntns_operations;
|
||||
extern const struct proc_ns_operations cgroupns_operations;
|
||||
|
||||
/*
|
||||
* We always define these enumerators
|
||||
@ -34,6 +37,7 @@ enum {
|
||||
PROC_UTS_INIT_INO = 0xEFFFFFFEU,
|
||||
PROC_USER_INIT_INO = 0xEFFFFFFDU,
|
||||
PROC_PID_INIT_INO = 0xEFFFFFFCU,
|
||||
PROC_CGROUP_INIT_INO = 0xEFFFFFFBU,
|
||||
};
|
||||
|
||||
#ifdef CONFIG_PROC_FS
|
||||
|
@ -21,8 +21,7 @@
|
||||
#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
|
||||
#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
|
||||
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
|
||||
/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
|
||||
and is now available for re-use. */
|
||||
#define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
|
||||
#define CLONE_NEWUTS 0x04000000 /* New utsname namespace */
|
||||
#define CLONE_NEWIPC 0x08000000 /* New ipc namespace */
|
||||
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
|
||||
|
229
kernel/cgroup.c
229
kernel/cgroup.c
@ -59,6 +59,9 @@
|
||||
#include <linux/delay.h>
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/cpuset.h>
|
||||
#include <linux/proc_ns.h>
|
||||
#include <linux/nsproxy.h>
|
||||
#include <linux/proc_ns.h>
|
||||
#include <net/sock.h>
|
||||
|
||||
/*
|
||||
@ -215,6 +218,15 @@ static u16 have_fork_callback __read_mostly;
|
||||
static u16 have_exit_callback __read_mostly;
|
||||
static u16 have_free_callback __read_mostly;
|
||||
|
||||
/* cgroup namespace for init task */
|
||||
/*
 * Statically allocated initial cgroup namespace, rooted at init_css_set
 * and tied to init_user_ns.
 */
struct cgroup_namespace init_cgroup_ns = {
|
||||
/* NOTE(review): starts at 2, presumably so the static object is never freed by put_cgroup_ns() - confirm */
.count = { .counter = 2, },
|
||||
.user_ns = &init_user_ns,
|
||||
.ns.ops = &cgroupns_operations,
|
||||
/* fixed inode number reserved for the initial cgroup namespace */
.ns.inum = PROC_CGROUP_INIT_INO,
|
||||
.root_cset = &init_css_set,
|
||||
};
|
||||
|
||||
/* Ditto for the can_fork callback. */
|
||||
static u16 have_canfork_callback __read_mostly;
|
||||
|
||||
@ -2002,6 +2014,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
|
||||
{
|
||||
bool is_v2 = fs_type == &cgroup2_fs_type;
|
||||
struct super_block *pinned_sb = NULL;
|
||||
struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
|
||||
struct cgroup_subsys *ss;
|
||||
struct cgroup_root *root;
|
||||
struct cgroup_sb_opts opts;
|
||||
@ -2010,6 +2023,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
|
||||
int i;
|
||||
bool new_sb;
|
||||
|
||||
get_cgroup_ns(ns);
|
||||
|
||||
/* Check if the caller has permission to mount. */
|
||||
if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) {
|
||||
put_cgroup_ns(ns);
|
||||
return ERR_PTR(-EPERM);
|
||||
}
|
||||
|
||||
/*
|
||||
* The first time anyone tries to mount a cgroup, enable the list
|
||||
* linking each css_set to its tasks and fix up all existing tasks.
|
||||
@ -2020,6 +2041,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
|
||||
if (is_v2) {
|
||||
if (data) {
|
||||
pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
|
||||
put_cgroup_ns(ns);
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
cgrp_dfl_visible = true;
|
||||
@ -2125,6 +2147,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
/*
|
||||
* We know this subsystem has not yet been bound. Users in a non-init
|
||||
* user namespace may only mount hierarchies with no bound subsystems,
|
||||
* i.e. 'none,name=user1'
|
||||
*/
|
||||
if (!opts.none && !capable(CAP_SYS_ADMIN)) {
|
||||
ret = -EPERM;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
root = kzalloc(sizeof(*root), GFP_KERNEL);
|
||||
if (!root) {
|
||||
ret = -ENOMEM;
|
||||
@ -2143,12 +2175,37 @@ out_free:
|
||||
kfree(opts.release_agent);
|
||||
kfree(opts.name);
|
||||
|
||||
if (ret)
|
||||
if (ret) {
|
||||
put_cgroup_ns(ns);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
out_mount:
|
||||
dentry = kernfs_mount(fs_type, flags, root->kf_root,
|
||||
is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC,
|
||||
&new_sb);
|
||||
|
||||
/*
|
||||
* In non-init cgroup namespace, instead of root cgroup's
|
||||
* dentry, we return the dentry corresponding to the
|
||||
* cgroup namespace's root cgroup (looked up via ns->root_cset).
|
||||
*/
|
||||
if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
|
||||
struct dentry *nsdentry;
|
||||
struct cgroup *cgrp;
|
||||
|
||||
mutex_lock(&cgroup_mutex);
|
||||
spin_lock_bh(&css_set_lock);
|
||||
|
||||
cgrp = cset_cgroup_from_root(ns->root_cset, root);
|
||||
|
||||
spin_unlock_bh(&css_set_lock);
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
|
||||
nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
|
||||
dput(dentry);
|
||||
dentry = nsdentry;
|
||||
}
|
||||
|
||||
if (IS_ERR(dentry) || !new_sb)
|
||||
cgroup_put(&root->cgrp);
|
||||
|
||||
@ -2161,6 +2218,7 @@ out_mount:
|
||||
deactivate_super(pinned_sb);
|
||||
}
|
||||
|
||||
put_cgroup_ns(ns);
|
||||
return dentry;
|
||||
}
|
||||
|
||||
@ -2189,14 +2247,45 @@ static struct file_system_type cgroup_fs_type = {
|
||||
.name = "cgroup",
|
||||
.mount = cgroup_mount,
|
||||
.kill_sb = cgroup_kill_sb,
|
||||
.fs_flags = FS_USERNS_MOUNT,
|
||||
};
|
||||
|
||||
/*
 * Filesystem type registered under the name "cgroup2".  It uses the same
 * mount and kill_sb handlers as cgroup_fs_type; cgroup_mount() compares
 * the fs_type pointer against this object to detect a v2 mount.
 */
static struct file_system_type cgroup2_fs_type = {
|
||||
.name = "cgroup2",
|
||||
.mount = cgroup_mount,
|
||||
.kill_sb = cgroup_kill_sb,
|
||||
.fs_flags = FS_USERNS_MOUNT,
|
||||
};
|
||||
|
||||
/*
 * Write the path of @cgrp, as seen from cgroup namespace @ns, into @buf.
 * The namespace's root cgroup on @cgrp's hierarchy is looked up from
 * ns->root_cset and used as the origin of the path.
 *
 * Returns @buf on success, or NULL if the path could not be built or did
 * not fit into @buflen bytes.
 */
static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
				   struct cgroup_namespace *ns)
{
	struct cgroup *ns_root_cgrp;
	int len;

	ns_root_cgrp = cset_cgroup_from_root(ns->root_cset, cgrp->root);
	len = kernfs_path_from_node(cgrp->kn, ns_root_cgrp->kn, buf, buflen);

	return (len < 0 || len >= buflen) ? NULL : buf;
}
|
||||
|
||||
/*
 * Locked wrapper around cgroup_path_ns_locked(): takes cgroup_mutex and
 * css_set_lock for the duration of the lookup.
 *
 * Returns @buf on success or NULL on failure.
 */
char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen,
		     struct cgroup_namespace *ns)
{
	char *path;

	mutex_lock(&cgroup_mutex);
	spin_lock_bh(&css_set_lock);
	path = cgroup_path_ns_locked(cgrp, buf, buflen, ns);
	spin_unlock_bh(&css_set_lock);
	mutex_unlock(&cgroup_mutex);

	return path;
}
EXPORT_SYMBOL_GPL(cgroup_path_ns);
|
||||
|
||||
/**
|
||||
* task_cgroup_path - cgroup path of a task in the first cgroup hierarchy
|
||||
* @task: target task
|
||||
@ -2224,7 +2313,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
|
||||
|
||||
if (root) {
|
||||
cgrp = task_cgroup_from_root(task, root);
|
||||
path = cgroup_path(cgrp, buf, buflen);
|
||||
path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
|
||||
} else {
|
||||
/* if no hierarchy exists, everyone is in "/" */
|
||||
if (strlcpy(buf, "/", buflen) < buflen)
|
||||
@ -5450,6 +5539,8 @@ int __init cgroup_init(void)
|
||||
BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
|
||||
BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
|
||||
|
||||
get_user_ns(init_cgroup_ns.user_ns);
|
||||
|
||||
mutex_lock(&cgroup_mutex);
|
||||
|
||||
/*
|
||||
@ -5601,7 +5692,8 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
|
||||
* " (deleted)" is appended to the cgroup path.
|
||||
*/
|
||||
if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
|
||||
path = cgroup_path(cgrp, buf, PATH_MAX);
|
||||
path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX,
|
||||
current->nsproxy->cgroup_ns);
|
||||
if (!path) {
|
||||
retval = -ENAMETOOLONG;
|
||||
goto out_unlock;
|
||||
@ -5886,7 +5978,9 @@ static void cgroup_release_agent(struct work_struct *work)
|
||||
if (!pathbuf || !agentbuf)
|
||||
goto out;
|
||||
|
||||
path = cgroup_path(cgrp, pathbuf, PATH_MAX);
|
||||
spin_lock_bh(&css_set_lock);
|
||||
path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
|
||||
spin_unlock_bh(&css_set_lock);
|
||||
if (!path)
|
||||
goto out;
|
||||
|
||||
@ -6098,6 +6192,133 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
|
||||
|
||||
#endif /* CONFIG_SOCK_CGROUP_DATA */
|
||||
|
||||
/* cgroup namespaces */
|
||||
|
||||
static struct cgroup_namespace *alloc_cgroup_ns(void)
|
||||
{
|
||||
struct cgroup_namespace *new_ns;
|
||||
int ret;
|
||||
|
||||
new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
|
||||
if (!new_ns)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
ret = ns_alloc_inum(&new_ns->ns);
|
||||
if (ret) {
|
||||
kfree(new_ns);
|
||||
return ERR_PTR(ret);
|
||||
}
|
||||
atomic_set(&new_ns->count, 1);
|
||||
new_ns->ns.ops = &cgroupns_operations;
|
||||
return new_ns;
|
||||
}
|
||||
|
||||
void free_cgroup_ns(struct cgroup_namespace *ns)
|
||||
{
|
||||
put_css_set(ns->root_cset);
|
||||
put_user_ns(ns->user_ns);
|
||||
ns_free_inum(&ns->ns);
|
||||
kfree(ns);
|
||||
}
|
||||
EXPORT_SYMBOL(free_cgroup_ns);
|
||||
|
||||
struct cgroup_namespace *copy_cgroup_ns(unsigned long flags,
|
||||
struct user_namespace *user_ns,
|
||||
struct cgroup_namespace *old_ns)
|
||||
{
|
||||
struct cgroup_namespace *new_ns;
|
||||
struct css_set *cset;
|
||||
|
||||
BUG_ON(!old_ns);
|
||||
|
||||
if (!(flags & CLONE_NEWCGROUP)) {
|
||||
get_cgroup_ns(old_ns);
|
||||
return old_ns;
|
||||
}
|
||||
|
||||
/* Allow only sysadmin to create cgroup namespace. */
|
||||
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
|
||||
return ERR_PTR(-EPERM);
|
||||
|
||||
mutex_lock(&cgroup_mutex);
|
||||
spin_lock_bh(&css_set_lock);
|
||||
|
||||
cset = task_css_set(current);
|
||||
get_css_set(cset);
|
||||
|
||||
spin_unlock_bh(&css_set_lock);
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
|
||||
new_ns = alloc_cgroup_ns();
|
||||
if (IS_ERR(new_ns)) {
|
||||
put_css_set(cset);
|
||||
return new_ns;
|
||||
}
|
||||
|
||||
new_ns->user_ns = get_user_ns(user_ns);
|
||||
new_ns->root_cset = cset;
|
||||
|
||||
return new_ns;
|
||||
}
|
||||
|
||||
static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
|
||||
{
|
||||
return container_of(ns, struct cgroup_namespace, ns);
|
||||
}
|
||||
|
||||
static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
|
||||
{
|
||||
struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
|
||||
|
||||
if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
|
||||
!ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
|
||||
return -EPERM;
|
||||
|
||||
/* Don't need to do anything if we are attaching to our own cgroupns. */
|
||||
if (cgroup_ns == nsproxy->cgroup_ns)
|
||||
return 0;
|
||||
|
||||
get_cgroup_ns(cgroup_ns);
|
||||
put_cgroup_ns(nsproxy->cgroup_ns);
|
||||
nsproxy->cgroup_ns = cgroup_ns;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static struct ns_common *cgroupns_get(struct task_struct *task)
|
||||
{
|
||||
struct cgroup_namespace *ns = NULL;
|
||||
struct nsproxy *nsproxy;
|
||||
|
||||
task_lock(task);
|
||||
nsproxy = task->nsproxy;
|
||||
if (nsproxy) {
|
||||
ns = nsproxy->cgroup_ns;
|
||||
get_cgroup_ns(ns);
|
||||
}
|
||||
task_unlock(task);
|
||||
|
||||
return ns ? &ns->ns : NULL;
|
||||
}
|
||||
|
||||
/* ->put() hook: drop one reference on the cgroup namespace embedding @ns. */
static void cgroupns_put(struct ns_common *ns)
{
	struct cgroup_namespace *cgns = to_cg_ns(ns);

	put_cgroup_ns(cgns);
}
|
||||
|
||||
const struct proc_ns_operations cgroupns_operations = {
|
||||
.name = "cgroup",
|
||||
.type = CLONE_NEWCGROUP,
|
||||
.get = cgroupns_get,
|
||||
.put = cgroupns_put,
|
||||
.install = cgroupns_install,
|
||||
};
|
||||
|
||||
/*
 * Init hook for cgroup namespaces, registered through subsys_initcall().
 * There is currently nothing to set up, so it simply reports success.
 */
static __init int cgroup_namespaces_init(void)
{
	return 0;
}
subsys_initcall(cgroup_namespaces_init);
|
||||
|
||||
#ifdef CONFIG_CGROUP_DEBUG
|
||||
static struct cgroup_subsys_state *
|
||||
debug_css_alloc(struct cgroup_subsys_state *parent_css)
|
||||
|
@ -2714,10 +2714,10 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
|
||||
goto out;
|
||||
|
||||
retval = -ENAMETOOLONG;
|
||||
rcu_read_lock();
|
||||
css = task_css(tsk, cpuset_cgrp_id);
|
||||
p = cgroup_path(css->cgroup, buf, PATH_MAX);
|
||||
rcu_read_unlock();
|
||||
css = task_get_css(tsk, cpuset_cgrp_id);
|
||||
p = cgroup_path_ns(css->cgroup, buf, PATH_MAX,
|
||||
current->nsproxy->cgroup_ns);
|
||||
css_put(css);
|
||||
if (!p)
|
||||
goto out_free;
|
||||
seq_puts(m, p);
|
||||
|
@ -1892,7 +1892,7 @@ static int check_unshare_flags(unsigned long unshare_flags)
|
||||
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
|
||||
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
|
||||
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
|
||||
CLONE_NEWUSER|CLONE_NEWPID))
|
||||
CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
|
||||
return -EINVAL;
|
||||
/*
|
||||
* Not implemented, but pretend it works if there is nothing
|
||||
|
@ -25,6 +25,7 @@
|
||||
#include <linux/proc_ns.h>
|
||||
#include <linux/file.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/cgroup.h>
|
||||
|
||||
static struct kmem_cache *nsproxy_cachep;
|
||||
|
||||
@ -39,6 +40,9 @@ struct nsproxy init_nsproxy = {
|
||||
#ifdef CONFIG_NET
|
||||
.net_ns = &init_net,
|
||||
#endif
|
||||
#ifdef CONFIG_CGROUPS
|
||||
.cgroup_ns = &init_cgroup_ns,
|
||||
#endif
|
||||
};
|
||||
|
||||
static inline struct nsproxy *create_nsproxy(void)
|
||||
@ -92,6 +96,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
|
||||
goto out_pid;
|
||||
}
|
||||
|
||||
new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns,
|
||||
tsk->nsproxy->cgroup_ns);
|
||||
if (IS_ERR(new_nsp->cgroup_ns)) {
|
||||
err = PTR_ERR(new_nsp->cgroup_ns);
|
||||
goto out_cgroup;
|
||||
}
|
||||
|
||||
new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns);
|
||||
if (IS_ERR(new_nsp->net_ns)) {
|
||||
err = PTR_ERR(new_nsp->net_ns);
|
||||
@ -101,6 +112,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
|
||||
return new_nsp;
|
||||
|
||||
out_net:
|
||||
put_cgroup_ns(new_nsp->cgroup_ns);
|
||||
out_cgroup:
|
||||
if (new_nsp->pid_ns_for_children)
|
||||
put_pid_ns(new_nsp->pid_ns_for_children);
|
||||
out_pid:
|
||||
@ -128,7 +141,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
|
||||
struct nsproxy *new_ns;
|
||||
|
||||
if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
|
||||
CLONE_NEWPID | CLONE_NEWNET)))) {
|
||||
CLONE_NEWPID | CLONE_NEWNET |
|
||||
CLONE_NEWCGROUP)))) {
|
||||
get_nsproxy(old_ns);
|
||||
return 0;
|
||||
}
|
||||
@ -165,6 +179,7 @@ void free_nsproxy(struct nsproxy *ns)
|
||||
put_ipc_ns(ns->ipc_ns);
|
||||
if (ns->pid_ns_for_children)
|
||||
put_pid_ns(ns->pid_ns_for_children);
|
||||
put_cgroup_ns(ns->cgroup_ns);
|
||||
put_net(ns->net_ns);
|
||||
kmem_cache_free(nsproxy_cachep, ns);
|
||||
}
|
||||
@ -180,7 +195,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
|
||||
int err = 0;
|
||||
|
||||
if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
|
||||
CLONE_NEWNET | CLONE_NEWPID)))
|
||||
CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP)))
|
||||
return 0;
|
||||
|
||||
user_ns = new_cred ? new_cred->user_ns : current_user_ns();
|
||||
|
Loading…
Reference in New Issue
Block a user