Subsystem migration methods shouldn't be called for empty migrations.
cgroup_migrate_execute() implements this guarantee by bailing early if
there are no source css_sets. This used to be correct before
a79a908fd2 ("cgroup: introduce cgroup namespaces"), but no longer is
since that commit because css_sets can stay pinned without tasks in them.

This caused cgroup_migrate_execute() to call into cpuset migration
methods with an empty cgroup_taskset. cpuset migration methods
correctly assume that cgroup_taskset_first() never returns NULL;
however, due to the bug, it can, leading to the following oops.

  Unable to handle kernel paging request for data at address 0x00000960
  Faulting instruction address: 0xc0000000001d6868
  Oops: Kernel access of bad area, sig: 11 [#1]
  ...
  CPU: 14 PID: 16947 Comm: kworker/14:0 Tainted: G W 4.12.0-rc4-next-20170609 #2
  Workqueue: events cpuset_hotplug_workfn
  task: c00000000ca60580 task.stack: c00000000c728000
  NIP: c0000000001d6868 LR: c0000000001d6858 CTR: c0000000001d6810
  REGS: c00000000c72b720 TRAP: 0300 Tainted: GW (4.12.0-rc4-next-20170609)
  MSR: 8000000000009033 <SF,EE,ME,IR,DR,RI,LE> CR: 44722422 XER: 20000000
  CFAR: c000000000008710 DAR: 0000000000000960 DSISR: 40000000 SOFTE: 1
  GPR00: c0000000001d6858 c00000000c72b9a0 c000000001536e00 0000000000000000
  GPR04: c00000000c72b9c0 0000000000000000 c00000000c72bad0 c000000766367678
  GPR08: c000000766366d10 c00000000c72b958 c000000001736e00 0000000000000000
  GPR12: c0000000001d6810 c00000000e749300 c000000000123ef8 c000000775af4180
  GPR16: 0000000000000000 0000000000000000 c00000075480e9c0 c00000075480e9e0
  GPR20: c00000075480e8c0 0000000000000001 0000000000000000 c00000000c72ba20
  GPR24: c00000000c72baa0 c00000000c72bac0 c000000001407248 c00000000c72ba20
  GPR28: c00000000141fc80 c00000000c72bac0 c00000000c6bc790 0000000000000000
  NIP [c0000000001d6868] cpuset_can_attach+0x58/0x1b0
  LR [c0000000001d6858] cpuset_can_attach+0x48/0x1b0
  Call Trace:
  [c00000000c72b9a0] [c0000000001d6858] cpuset_can_attach+0x48/0x1b0 (unreliable)
  [c00000000c72ba00] [c0000000001cbe80] cgroup_migrate_execute+0xb0/0x450
  [c00000000c72ba80] [c0000000001d3754] cgroup_transfer_tasks+0x1c4/0x360
  [c00000000c72bba0] [c0000000001d923c] cpuset_hotplug_workfn+0x86c/0xa20
  [c00000000c72bca0] [c00000000011aa44] process_one_work+0x1e4/0x580
  [c00000000c72bd30] [c00000000011ae78] worker_thread+0x98/0x5c0
  [c00000000c72bdc0] [c000000000124058] kthread+0x168/0x1b0
  [c00000000c72be30] [c00000000000b2e8] ret_from_kernel_thread+0x5c/0x74
  Instruction dump:
  f821ffa1 7c7d1b78 60000000 60000000 38810020 7fa3eb78 3f42ffed 4bff4c25
  60000000 3b5a0448 3d420020 eb610020 <e9230960> 7f43d378 e9290000 f92af200
  ---[ end trace dcaaf98fb36d9e64 ]---

This patch fixes the bug by adding an explicit nr_tasks counter to
cgroup_taskset and skipping calling the migration methods if the
counter is zero. While at it, remove the now spurious check on no
source css_sets.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-and-tested-by: Abdul Haleem <abdhalee@linux.vnet.ibm.com>
Cc: Roman Gushchin <guro@fb.com>
Cc: stable@vger.kernel.org # v4.6+
Fixes: a79a908fd2 ("cgroup: introduce cgroup namespaces")
Link: http://lkml.kernel.org/r/1497266622.15415.39.camel@abdul.in.ibm.com
		
			
				
	
	
		
			221 lines
		
	
	
		
			6.7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			221 lines
		
	
	
		
			6.7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
| #ifndef __CGROUP_INTERNAL_H
 | |
| #define __CGROUP_INTERNAL_H
 | |
| 
 | |
| #include <linux/cgroup.h>
 | |
| #include <linux/kernfs.h>
 | |
| #include <linux/workqueue.h>
 | |
| #include <linux/list.h>
 | |
| #include <linux/refcount.h>
 | |
| 
 | |
| /*
 | |
|  * A cgroup can be associated with multiple css_sets as different tasks may
 | |
|  * belong to different cgroups on different hierarchies.  In the other
 | |
|  * direction, a css_set is naturally associated with multiple cgroups.
 | |
|  * This M:N relationship is represented by the following link structure
 | |
|  * which exists for each association and allows traversing the associations
 | |
|  * from both sides.
 | |
|  */
 | |
| struct cgrp_cset_link {
 | |
| 	/* the cgroup and css_set this link associates */
 | |
| 	struct cgroup		*cgrp;
 | |
| 	struct css_set		*cset;
 | |
| 
 | |
| 	/* list of cgrp_cset_links anchored at cgrp->cset_links */
 | |
| 	struct list_head	cset_link;
 | |
| 
 | |
| 	/* list of cgrp_cset_links anchored at css_set->cgrp_links */
 | |
| 	struct list_head	cgrp_link;
 | |
| };
 | |
| 
 | |
| /* used to track tasks and csets during migration */
 | |
| struct cgroup_taskset {
 | |
| 	/* the src and dst cset list running through cset->mg_node */
 | |
| 	struct list_head	src_csets;
 | |
| 	struct list_head	dst_csets;
 | |
| 
 | |
| 	/* the number of tasks in the set */
 | |
| 	int			nr_tasks;
 | |
| 
 | |
| 	/* the subsys currently being processed */
 | |
| 	int			ssid;
 | |
| 
 | |
| 	/*
 | |
| 	 * Fields for cgroup_taskset_*() iteration.
 | |
| 	 *
 | |
| 	 * Before migration is committed, the target migration tasks are on
 | |
| 	 * ->mg_tasks of the csets on ->src_csets.  After, on ->mg_tasks of
 | |
| 	 * the csets on ->dst_csets.  ->csets point to either ->src_csets
 | |
| 	 * or ->dst_csets depending on whether migration is committed.
 | |
| 	 *
 | |
| 	 * ->cur_csets and ->cur_task point to the current task position
 | |
| 	 * during iteration.
 | |
| 	 */
 | |
| 	struct list_head	*csets;
 | |
| 	struct css_set		*cur_cset;
 | |
| 	struct task_struct	*cur_task;
 | |
| };
 | |
| 
 | |
| /* migration context also tracks preloading */
 | |
| struct cgroup_mgctx {
 | |
| 	/*
 | |
| 	 * Preloaded source and destination csets.  Used to guarantee
 | |
| 	 * atomic success or failure on actual migration.
 | |
| 	 */
 | |
| 	struct list_head	preloaded_src_csets;
 | |
| 	struct list_head	preloaded_dst_csets;
 | |
| 
 | |
| 	/* tasks and csets to migrate */
 | |
| 	struct cgroup_taskset	tset;
 | |
| 
 | |
| 	/* subsystems affected by migration */
 | |
| 	u16			ss_mask;
 | |
| };
 | |
| 
 | |
/*
 * Static initializer for a struct cgroup_taskset.
 * @tset: lvalue naming the taskset being initialized
 *
 * Both cset lists start out empty and ->csets points at ->src_csets.
 * Every member not listed here (->nr_tasks, ->ssid, the iteration cursor)
 * is zero-initialized.  @tset is parenthesized at each use so that any
 * lvalue expression, e.g. "*ptr", expands correctly.
 */
#define CGROUP_TASKSET_INIT(tset)						\
{										\
	.src_csets		= LIST_HEAD_INIT((tset).src_csets),		\
	.dst_csets		= LIST_HEAD_INIT((tset).dst_csets),		\
	.csets			= &(tset).src_csets,				\
}
 | |
| 
 | |
/*
 * Static initializer for a struct cgroup_mgctx.
 * @name: lvalue naming the context being initialized
 *
 * The preloaded cset lists start out empty and the embedded taskset is set
 * up through CGROUP_TASKSET_INIT().  ->ss_mask is not listed and is
 * therefore zero-initialized.  Designated initializers are used so that
 * the macro does not depend on the member order of struct cgroup_mgctx.
 */
#define CGROUP_MGCTX_INIT(name)							\
{										\
	.preloaded_src_csets	= LIST_HEAD_INIT((name).preloaded_src_csets),	\
	.preloaded_dst_csets	= LIST_HEAD_INIT((name).preloaded_dst_csets),	\
	.tset			= CGROUP_TASKSET_INIT((name).tset),		\
}

/* define a struct cgroup_mgctx variable called @name, ready for use */
#define DEFINE_CGROUP_MGCTX(name)						\
	struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
 | |
| 
 | |
| struct cgroup_sb_opts {
 | |
| 	u16 subsys_mask;
 | |
| 	unsigned int flags;
 | |
| 	char *release_agent;
 | |
| 	bool cpuset_clone_children;
 | |
| 	char *name;
 | |
| 	/* User explicitly requested empty subsystem */
 | |
| 	bool none;
 | |
| };
 | |
| 
 | |
| extern struct mutex cgroup_mutex;
 | |
| extern spinlock_t css_set_lock;
 | |
| extern struct cgroup_subsys *cgroup_subsys[];
 | |
| extern struct list_head cgroup_roots;
 | |
| extern struct file_system_type cgroup_fs_type;
 | |
| 
 | |
/*
 * Walk all hierarchies: visits every entry on the cgroup_roots list,
 * linked through its root_list member, with @root as the cursor.
 */
#define for_each_root(root)						\
	list_for_each_entry((root), &cgroup_roots, root_list)
 | |
| 
 | |
/**
 * for_each_subsys - iterate all enabled cgroup subsystems
 * @ss: the iteration cursor
 * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end
 *
 * Visits indices 0 .. CGROUP_SUBSYS_COUNT - 1 of cgroup_subsys[].  The
 * assignment to @ss lives in the loop condition and is forced to evaluate
 * as true, so a NULL slot does not terminate the walk early.
 */
#define for_each_subsys(ss, ssid)					\
	for ((ssid) = 0;						\
	     (ssid) < CGROUP_SUBSYS_COUNT &&				\
	     ((ss) = cgroup_subsys[ssid], true);			\
	     (ssid)++)
 | |
| 
 | |
| static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 | |
| {
 | |
| 	return !(cgrp->self.flags & CSS_ONLINE);
 | |
| }
 | |
| 
 | |
| static inline bool notify_on_release(const struct cgroup *cgrp)
 | |
| {
 | |
| 	return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
 | |
| }
 | |
| 
 | |
| void put_css_set_locked(struct css_set *cset);
 | |
| 
 | |
| static inline void put_css_set(struct css_set *cset)
 | |
| {
 | |
| 	unsigned long flags;
 | |
| 
 | |
| 	/*
 | |
| 	 * Ensure that the refcount doesn't hit zero while any readers
 | |
| 	 * can see it. Similar to atomic_dec_and_lock(), but for an
 | |
| 	 * rwlock
 | |
| 	 */
 | |
| 	if (refcount_dec_not_one(&cset->refcount))
 | |
| 		return;
 | |
| 
 | |
| 	spin_lock_irqsave(&css_set_lock, flags);
 | |
| 	put_css_set_locked(cset);
 | |
| 	spin_unlock_irqrestore(&css_set_lock, flags);
 | |
| }
 | |
| 
 | |
| /*
 | |
|  * refcounted get/put for css_set objects
 | |
|  */
 | |
| static inline void get_css_set(struct css_set *cset)
 | |
| {
 | |
| 	refcount_inc(&cset->refcount);
 | |
| }
 | |
| 
 | |
| bool cgroup_ssid_enabled(int ssid);
 | |
| bool cgroup_on_dfl(const struct cgroup *cgrp);
 | |
| 
 | |
| struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root);
 | |
| struct cgroup *task_cgroup_from_root(struct task_struct *task,
 | |
| 				     struct cgroup_root *root);
 | |
| struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, bool drain_offline);
 | |
| void cgroup_kn_unlock(struct kernfs_node *kn);
 | |
| int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
 | |
| 			  struct cgroup_namespace *ns);
 | |
| 
 | |
| void cgroup_free_root(struct cgroup_root *root);
 | |
| void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts);
 | |
| int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags);
 | |
| int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask);
 | |
| struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
 | |
| 			       struct cgroup_root *root, unsigned long magic,
 | |
| 			       struct cgroup_namespace *ns);
 | |
| 
 | |
| bool cgroup_may_migrate_to(struct cgroup *dst_cgrp);
 | |
| void cgroup_migrate_finish(struct cgroup_mgctx *mgctx);
 | |
| void cgroup_migrate_add_src(struct css_set *src_cset, struct cgroup *dst_cgrp,
 | |
| 			    struct cgroup_mgctx *mgctx);
 | |
| int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx);
 | |
| int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 | |
| 		   struct cgroup_mgctx *mgctx);
 | |
| 
 | |
| int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
 | |
| 		       bool threadgroup);
 | |
| ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 | |
| 			     size_t nbytes, loff_t off, bool threadgroup);
 | |
| ssize_t cgroup_procs_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
 | |
| 			   loff_t off);
 | |
| 
 | |
| void cgroup_lock_and_drain_offline(struct cgroup *cgrp);
 | |
| 
 | |
| int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode);
 | |
| int cgroup_rmdir(struct kernfs_node *kn);
 | |
| int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
 | |
| 		     struct kernfs_root *kf_root);
 | |
| 
 | |
| int cgroup_task_count(const struct cgroup *cgrp);
 | |
| 
 | |
| /*
 | |
|  * namespace.c
 | |
|  */
 | |
| extern const struct proc_ns_operations cgroupns_operations;
 | |
| 
 | |
| /*
 | |
|  * cgroup-v1.c
 | |
|  */
 | |
| extern struct cftype cgroup1_base_files[];
 | |
| extern const struct file_operations proc_cgroupstats_operations;
 | |
| extern struct kernfs_syscall_ops cgroup1_kf_syscall_ops;
 | |
| 
 | |
| bool cgroup1_ssid_disabled(int ssid);
 | |
| void cgroup1_pidlist_destroy_all(struct cgroup *cgrp);
 | |
| void cgroup1_release_agent(struct work_struct *work);
 | |
| void cgroup1_check_for_release(struct cgroup *cgrp);
 | |
| struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags,
 | |
| 			     void *data, unsigned long magic,
 | |
| 			     struct cgroup_namespace *ns);
 | |
| 
 | |
| #endif /* __CGROUP_INTERNAL_H */
 |