From 1ed1328792ff46e4bb86a3d7f7be2971f4549f6c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Sep 2015 12:53:17 -0400
Subject: [PATCH 01/38] sched, cgroup: replace signal_struct->group_rwsem with
 a global percpu_rwsem

Note: This commit was originally committed as d59cfc09c32a but got
      reverted by 0c986253b939 due to the performance regression from
      the percpu_rwsem write down/up operations added to cgroup task
      migration path.  percpu_rwsem changes which alleviate the
      performance issue are pending for v4.4-rc1 merge window.
      Re-apply.

The cgroup side of threadgroup locking uses signal_struct->group_rwsem
to synchronize against threadgroup changes.  This per-process rwsem
adds small overhead to thread creation, exit and exec paths, forces
cgroup code paths to do lock-verify-unlock-retry dance in a couple
places and makes it impossible to atomically perform operations across
multiple processes.

This patch replaces signal_struct->group_rwsem with a global
percpu_rwsem cgroup_threadgroup_rwsem which is cheaper on the reader
side and contained in cgroups proper.  This patch converts one-to-one.

This does make writer side heavier and lower the granularity; however,
cgroup process migration is a fairly cold path, we do want to optimize
thread operations over it and cgroup migration operations don't take
enough time for the lower granularity to matter.

Signed-off-by: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/g/55F8097A.7000206@de.ibm.com
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 include/linux/cgroup-defs.h | 27 ++++++++++++-
 include/linux/init_task.h   |  8 ----
 include/linux/sched.h       | 12 ------
 kernel/cgroup.c             | 77 ++++++++++---------------------------
 kernel/fork.c               |  4 --
 5 files changed, 45 insertions(+), 83 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 8492721b39be..4d8fcf2187dc 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -473,8 +473,31 @@ struct cgroup_subsys {
 	unsigned int depends_on;
 };
 
-void cgroup_threadgroup_change_begin(struct task_struct *tsk);
-void cgroup_threadgroup_change_end(struct task_struct *tsk);
+extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
+/**
+ * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
+ * @tsk: target task
+ *
+ * Called from threadgroup_change_begin() and allows cgroup operations to
+ * synchronize against threadgroup changes using a percpu_rw_semaphore.
+ */
+static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
+{
+	percpu_down_read(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups
+ * @tsk: target task
+ *
+ * Called from threadgroup_change_end().  Counterpart of
+ * cgroup_threadcgroup_change_begin().
+ */
+static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
+{
+	percpu_up_read(&cgroup_threadgroup_rwsem);
+}
 
 #else	/* CONFIG_CGROUPS */
 
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e38681f4912d..d0b380ee7d67 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -25,13 +25,6 @@
 extern struct files_struct init_files;
 extern struct fs_struct init_fs;
 
-#ifdef CONFIG_CGROUPS
-#define INIT_GROUP_RWSEM(sig)						\
-	.group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem),
-#else
-#define INIT_GROUP_RWSEM(sig)
-#endif
-
 #ifdef CONFIG_CPUSETS
 #define INIT_CPUSET_SEQ(tsk)							\
 	.mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq),
@@ -64,7 +57,6 @@ extern struct fs_struct init_fs;
 	INIT_PREV_CPUTIME(sig)						\
 	.cred_guard_mutex =						\
 		 __MUTEX_INITIALIZER(sig.cred_guard_mutex),		\
-	INIT_GROUP_RWSEM(sig)						\
 }
 
 extern struct nsproxy init_nsproxy;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b7b9501b41af..a4ab9daa387c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -762,18 +762,6 @@ struct signal_struct {
 	unsigned audit_tty_log_passwd;
 	struct tty_audit_buf *tty_audit_buf;
 #endif
-#ifdef CONFIG_CGROUPS
-	/*
-	 * group_rwsem prevents new tasks from entering the threadgroup and
-	 * member tasks from exiting,a more specifically, setting of
-	 * PF_EXITING.  fork and exit paths are protected with this rwsem
-	 * using threadgroup_change_begin/end().  Users which require
-	 * threadgroup to remain stable should use threadgroup_[un]lock()
-	 * which also takes care of exec path.  Currently, cgroup is the
-	 * only user.
-	 */
-	struct rw_semaphore group_rwsem;
-#endif
 
 	oom_flags_t oom_flags;
 	short oom_score_adj;		/* OOM kill score adjustment */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2c9eae6ad970..115091efa889 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,7 @@
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/rwsem.h>
+#include <linux/percpu-rwsem.h>
 #include <linux/string.h>
 #include <linux/sort.h>
 #include <linux/kmod.h>
@@ -103,6 +104,8 @@ static DEFINE_SPINLOCK(cgroup_idr_lock);
  */
 static DEFINE_SPINLOCK(release_agent_path_lock);
 
+struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
 #define cgroup_assert_mutex_or_rcu_locked()				\
 	RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&			\
 			   !lockdep_is_held(&cgroup_mutex),		\
@@ -871,48 +874,6 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	return cset;
 }
 
-void cgroup_threadgroup_change_begin(struct task_struct *tsk)
-{
-	down_read(&tsk->signal->group_rwsem);
-}
-
-void cgroup_threadgroup_change_end(struct task_struct *tsk)
-{
-	up_read(&tsk->signal->group_rwsem);
-}
-
-/**
- * threadgroup_lock - lock threadgroup
- * @tsk: member task of the threadgroup to lock
- *
- * Lock the threadgroup @tsk belongs to.  No new task is allowed to enter
- * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
- * change ->group_leader/pid.  This is useful for cases where the threadgroup
- * needs to stay stable across blockable operations.
- *
- * fork and exit explicitly call threadgroup_change_{begin|end}() for
- * synchronization.  While held, no new task will be added to threadgroup
- * and no existing live task will have its PF_EXITING set.
- *
- * de_thread() does threadgroup_change_{begin|end}() when a non-leader
- * sub-thread becomes a new leader.
- */
-static void threadgroup_lock(struct task_struct *tsk)
-{
-	down_write(&tsk->signal->group_rwsem);
-}
-
-/**
- * threadgroup_unlock - unlock threadgroup
- * @tsk: member task of the threadgroup to unlock
- *
- * Reverse threadgroup_lock().
- */
-static inline void threadgroup_unlock(struct task_struct *tsk)
-{
-	up_write(&tsk->signal->group_rwsem);
-}
-
 static struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
 {
 	struct cgroup *root_cgrp = kf_root->kn->priv;
@@ -2113,9 +2074,9 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
 	lockdep_assert_held(&css_set_rwsem);
 
 	/*
-	 * We are synchronized through threadgroup_lock() against PF_EXITING
-	 * setting such that we can't race against cgroup_exit() changing the
-	 * css_set to init_css_set and dropping the old one.
+	 * We are synchronized through cgroup_threadgroup_rwsem against
+	 * PF_EXITING setting such that we can't race against cgroup_exit()
+	 * changing the css_set to init_css_set and dropping the old one.
 	 */
 	WARN_ON_ONCE(tsk->flags & PF_EXITING);
 	old_cset = task_css_set(tsk);
@@ -2172,10 +2133,11 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
  * @src_cset and add it to @preloaded_csets, which should later be cleaned
  * up by cgroup_migrate_finish().
  *
- * This function may be called without holding threadgroup_lock even if the
- * target is a process.  Threads may be created and destroyed but as long
- * as cgroup_mutex is not dropped, no new css_set can be put into play and
- * the preloaded css_sets are guaranteed to cover all migrations.
+ * This function may be called without holding cgroup_threadgroup_rwsem
+ * even if the target is a process.  Threads may be created and destroyed
+ * but as long as cgroup_mutex is not dropped, no new css_set can be put
+ * into play and the preloaded css_sets are guaranteed to cover all
+ * migrations.
  */
 static void cgroup_migrate_add_src(struct css_set *src_cset,
 				   struct cgroup *dst_cgrp,
@@ -2278,7 +2240,7 @@ err:
  * @threadgroup: whether @leader points to the whole process or a single task
  *
  * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
- * process, the caller must be holding threadgroup_lock of @leader.  The
+ * process, the caller must be holding cgroup_threadgroup_rwsem.  The
  * caller is also responsible for invoking cgroup_migrate_add_src() and
  * cgroup_migrate_prepare_dst() on the targets before invoking this
  * function and following up with cgroup_migrate_finish().
@@ -2406,7 +2368,7 @@ out_release_tset:
  * @leader: the task or the leader of the threadgroup to be attached
  * @threadgroup: attach the whole threadgroup?
  *
- * Call holding cgroup_mutex and threadgroup_lock of @leader.
+ * Call holding cgroup_mutex and cgroup_threadgroup_rwsem.
  */
 static int cgroup_attach_task(struct cgroup *dst_cgrp,
 			      struct task_struct *leader, bool threadgroup)
@@ -2528,7 +2490,7 @@ retry_find_task:
 	get_task_struct(tsk);
 	rcu_read_unlock();
 
-	threadgroup_lock(tsk);
+	percpu_down_write(&cgroup_threadgroup_rwsem);
 	if (threadgroup) {
 		if (!thread_group_leader(tsk)) {
 			/*
@@ -2538,7 +2500,7 @@ retry_find_task:
 			 * try again; this is
 			 * "double-double-toil-and-trouble-check locking".
 			 */
-			threadgroup_unlock(tsk);
+			percpu_up_write(&cgroup_threadgroup_rwsem);
 			put_task_struct(tsk);
 			goto retry_find_task;
 		}
@@ -2548,7 +2510,7 @@ retry_find_task:
 	if (!ret)
 		ret = cgroup_attach_task(cgrp, tsk, threadgroup);
 
-	threadgroup_unlock(tsk);
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 
 	put_task_struct(tsk);
 out_unlock_cgroup:
@@ -2751,17 +2713,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 				goto out_finish;
 			last_task = task;
 
-			threadgroup_lock(task);
+			percpu_down_write(&cgroup_threadgroup_rwsem);
 			/* raced against de_thread() from another thread? */
 			if (!thread_group_leader(task)) {
-				threadgroup_unlock(task);
+				percpu_up_write(&cgroup_threadgroup_rwsem);
 				put_task_struct(task);
 				continue;
 			}
 
 			ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
 
-			threadgroup_unlock(task);
+			percpu_up_write(&cgroup_threadgroup_rwsem);
 			put_task_struct(task);
 
 			if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
@@ -5083,6 +5045,7 @@ int __init cgroup_init(void)
 	unsigned long key;
 	int ssid, err;
 
+	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 2845623fb582..7d5f0f118a63 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1149,10 +1149,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	tty_audit_fork(sig);
 	sched_autogroup_fork(sig);
 
-#ifdef CONFIG_CGROUPS
-	init_rwsem(&sig->group_rwsem);
-#endif
-
 	sig->oom_score_adj = current->signal->oom_score_adj;
 	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
 

From 3014dde762f618fbcfe899f9ce9e929a2e5aa6dd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 16 Sep 2015 13:03:02 -0400
Subject: [PATCH 02/38] cgroup: simplify threadgroup locking

Note: This commit was originally committed as b5ba75b5fc0e but got
      reverted by f9f9e7b77614 due to the performance regression from
      the percpu_rwsem write down/up operations added to cgroup task
      migration path.  percpu_rwsem changes which alleviate the
      performance issue are pending for v4.4-rc1 merge window.
      Re-apply.

Now that threadgroup locking is made global, code paths around it can
be simplified.

* lock-verify-unlock-retry dancing removed from __cgroup_procs_write().

* Race protection against de_thread() removed from
  cgroup_update_dfl_csses().

Signed-off-by: Tejun Heo <tj@kernel.org>
Link: http://lkml.kernel.org/g/55F8097A.7000206@de.ibm.com
---
 kernel/cgroup.c | 45 ++++++++++++---------------------------------
 1 file changed, 12 insertions(+), 33 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 115091efa889..2cf0f79f1fc9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2460,14 +2460,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
 	if (!cgrp)
 		return -ENODEV;
 
-retry_find_task:
+	percpu_down_write(&cgroup_threadgroup_rwsem);
 	rcu_read_lock();
 	if (pid) {
 		tsk = find_task_by_vpid(pid);
 		if (!tsk) {
-			rcu_read_unlock();
 			ret = -ESRCH;
-			goto out_unlock_cgroup;
+			goto out_unlock_rcu;
 		}
 	} else {
 		tsk = current;
@@ -2483,37 +2482,23 @@ retry_find_task:
 	 */
 	if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
 		ret = -EINVAL;
-		rcu_read_unlock();
-		goto out_unlock_cgroup;
+		goto out_unlock_rcu;
 	}
 
 	get_task_struct(tsk);
 	rcu_read_unlock();
 
-	percpu_down_write(&cgroup_threadgroup_rwsem);
-	if (threadgroup) {
-		if (!thread_group_leader(tsk)) {
-			/*
-			 * a race with de_thread from another thread's exec()
-			 * may strip us of our leadership, if this happens,
-			 * there is no choice but to throw this task away and
-			 * try again; this is
-			 * "double-double-toil-and-trouble-check locking".
-			 */
-			percpu_up_write(&cgroup_threadgroup_rwsem);
-			put_task_struct(tsk);
-			goto retry_find_task;
-		}
-	}
-
 	ret = cgroup_procs_write_permission(tsk, cgrp, of);
 	if (!ret)
 		ret = cgroup_attach_task(cgrp, tsk, threadgroup);
 
-	percpu_up_write(&cgroup_threadgroup_rwsem);
-
 	put_task_struct(tsk);
-out_unlock_cgroup:
+	goto out_unlock_threadgroup;
+
+out_unlock_rcu:
+	rcu_read_unlock();
+out_unlock_threadgroup:
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 	cgroup_kn_unlock(of->kn);
 	return ret ?: nbytes;
 }
@@ -2658,6 +2643,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 
 	lockdep_assert_held(&cgroup_mutex);
 
+	percpu_down_write(&cgroup_threadgroup_rwsem);
+
 	/* look up all csses currently attached to @cgrp's subtree */
 	down_read(&css_set_rwsem);
 	css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
@@ -2713,17 +2700,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 				goto out_finish;
 			last_task = task;
 
-			percpu_down_write(&cgroup_threadgroup_rwsem);
-			/* raced against de_thread() from another thread? */
-			if (!thread_group_leader(task)) {
-				percpu_up_write(&cgroup_threadgroup_rwsem);
-				put_task_struct(task);
-				continue;
-			}
-
 			ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
 
-			percpu_up_write(&cgroup_threadgroup_rwsem);
 			put_task_struct(task);
 
 			if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
@@ -2733,6 +2711,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 
 out_finish:
 	cgroup_migrate_finish(&preloaded_csets);
+	percpu_up_write(&cgroup_threadgroup_rwsem);
 	return ret;
 }
 

From fa128fd735bd236b6b04d3fedfed7a784137c185 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 18 Sep 2015 11:56:28 -0400
Subject: [PATCH 03/38] jump_label: make static_key_enabled() work on
 static_key_true/false types too

static_key_enabled() can be used on struct static_key but not on its
wrapper types static_key_true and static_key_false.  The function is
useful for debugging and management of static keys.  Update it so that
it can be used for the wrapper types too.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/jump_label.h | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 7f653e8f6690..c9ca050de846 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -216,11 +216,6 @@ static inline int jump_label_apply_nops(struct module *mod)
 #define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE
 #define jump_label_enabled static_key_enabled
 
-static inline bool static_key_enabled(struct static_key *key)
-{
-	return static_key_count(key) > 0;
-}
-
 static inline void static_key_enable(struct static_key *key)
 {
 	int count = static_key_count(key);
@@ -267,6 +262,17 @@ struct static_key_false {
 #define DEFINE_STATIC_KEY_FALSE(name)	\
 	struct static_key_false name = STATIC_KEY_FALSE_INIT
 
+extern bool ____wrong_branch_error(void);
+
+#define static_key_enabled(x)							\
+({										\
+	if (!__builtin_types_compatible_p(typeof(*x), struct static_key) &&	\
+	    !__builtin_types_compatible_p(typeof(*x), struct static_key_true) &&\
+	    !__builtin_types_compatible_p(typeof(*x), struct static_key_false))	\
+		____wrong_branch_error();					\
+	static_key_count((struct static_key *)x) > 0;				\
+})
+
 #ifdef HAVE_JUMP_LABEL
 
 /*
@@ -325,8 +331,6 @@ struct static_key_false {
  * See jump_label_type() / jump_label_init_type().
  */
 
-extern bool ____wrong_branch_error(void);
-
 #define static_branch_likely(x)							\
 ({										\
 	bool branch;								\

From 49d1dc4b81797f88270832b11e9f73809e7e7209 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 18 Sep 2015 11:56:28 -0400
Subject: [PATCH 04/38] cgroup: implement static_key based
 cgroup_subsys_enabled() and cgroup_subsys_on_dfl()

Whether a subsys is enabled and attached to the default hierarchy
seldom changes and may be tested in the hot paths.  This patch
implements static_key based cgroup_subsys_enabled() and
cgroup_subsys_on_dfl() tests.

The following patches will update the users and remove duplicate
mechanisms.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Zefan Li <lizefan@huawei.com>
---
 include/linux/cgroup.h | 21 +++++++++++++++++++++
 kernel/cgroup.c        | 27 ++++++++++++++++++++++++++-
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index eb7ca55f72ef..c3a9f1eb4097 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -17,6 +17,7 @@
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/kernfs.h>
+#include <linux/jump_label.h>
 
 #include <linux/cgroup-defs.h>
 
@@ -50,6 +51,26 @@ extern struct css_set init_css_set;
 #include <linux/cgroup_subsys.h>
 #undef SUBSYS
 
+#define SUBSYS(_x)								\
+	extern struct static_key_true _x ## _cgrp_subsys_enabled_key;		\
+	extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key;
+#include <linux/cgroup_subsys.h>
+#undef SUBSYS
+
+/**
+ * cgroup_subsys_enabled - fast test on whether a subsys is enabled
+ * @ss: subsystem in question
+ */
+#define cgroup_subsys_enabled(ss)						\
+	static_branch_likely(&ss ## _enabled_key)
+
+/**
+ * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy
+ * @ss: subsystem in question
+ */
+#define cgroup_subsys_on_dfl(ss)						\
+	static_branch_likely(&ss ## _on_dfl_key)
+
 bool css_has_online_children(struct cgroup_subsys_state *css);
 struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
 struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2cf0f79f1fc9..361938911bd1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -139,6 +139,27 @@ static const char *cgroup_subsys_name[] = {
 };
 #undef SUBSYS
 
+/* array of static_keys for cgroup_subsys_enabled() and cgroup_subsys_on_dfl() */
+#define SUBSYS(_x)								\
+	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_enabled_key);			\
+	DEFINE_STATIC_KEY_TRUE(_x ## _cgrp_subsys_on_dfl_key);			\
+	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_enabled_key);			\
+	EXPORT_SYMBOL_GPL(_x ## _cgrp_subsys_on_dfl_key);
+#include <linux/cgroup_subsys.h>
+#undef SUBSYS
+
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_enabled_key,
+static struct static_key_true *cgroup_subsys_enabled_key[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
+#define SUBSYS(_x) [_x ## _cgrp_id] = &_x ## _cgrp_subsys_on_dfl_key,
+static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
+#include <linux/cgroup_subsys.h>
+};
+#undef SUBSYS
+
 /*
  * The default hierarchy, reserved for the subsystems that are otherwise
  * unattached - it never has more than a single cgroup, and all tasks are
@@ -1319,9 +1340,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
 
 		/* default hierarchy doesn't enable controllers by default */
 		dst_root->subsys_mask |= 1 << ssid;
-		if (dst_root != &cgrp_dfl_root) {
+		if (dst_root == &cgrp_dfl_root) {
+			static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
+		} else {
 			dst_root->cgrp.subtree_control |= 1 << ssid;
 			cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
+			static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
 		}
 
 		if (ss->bind)
@@ -5483,6 +5507,7 @@ static int __init cgroup_disable(char *str)
 			    strcmp(token, ss->legacy_name))
 				continue;
 
+			static_branch_disable(cgroup_subsys_enabled_key[i]);
 			ss->disabled = 1;
 			printk(KERN_INFO "Disabling %s control group subsystem\n",
 			       ss->name);

From fc5ed1e95410ad73b2ab8f33cd90eb3bcf6c98a1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 18 Sep 2015 11:56:28 -0400
Subject: [PATCH 05/38] cgroup: replace cgroup_subsys->disabled tests with
 cgroup_subsys_enabled()

Replace cgroup_subsys->disabled tests in controllers with
cgroup_subsys_enabled().  cgroup_subsys_enabled() requires literal
subsys name as its parameter and thus can't be used for cgroup core
which iterates through controllers.  For cgroup core, introduce and
use cgroup_ssid_enabled() which uses slower static_key_enabled() test
and can be indexed by subsys ID.

This leaves cgroup_subsys->disabled unused.  Removed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Zefan Li <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
---
 include/linux/cgroup-defs.h    |  1 -
 include/linux/hugetlb_cgroup.h |  4 +---
 include/linux/memcontrol.h     |  4 +---
 kernel/cgroup.c                | 28 +++++++++++++++++++++-------
 4 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 4d8fcf2187dc..c5d41c3c1f00 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -419,7 +419,6 @@ struct cgroup_subsys {
 		     struct task_struct *task);
 	void (*bind)(struct cgroup_subsys_state *root_css);
 
-	int disabled;
 	int early_init;
 
 	/*
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index bcc853eccc85..7edd30515298 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -48,9 +48,7 @@ int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg)
 
 static inline bool hugetlb_cgroup_disabled(void)
 {
-	if (hugetlb_cgrp_subsys.disabled)
-		return true;
-	return false;
+	return !cgroup_subsys_enabled(hugetlb_cgrp_subsys);
 }
 
 extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ad800e62cb7a..9aa7820c2177 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -347,9 +347,7 @@ ino_t page_cgroup_ino(struct page *page);
 
 static inline bool mem_cgroup_disabled(void)
 {
-	if (memory_cgrp_subsys.disabled)
-		return true;
-	return false;
+	return !cgroup_subsys_enabled(memory_cgrp_subsys);
 }
 
 /*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 361938911bd1..5703ba791b3d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -224,6 +224,19 @@ static void kill_css(struct cgroup_subsys_state *css);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
 
+/**
+ * cgroup_ssid_enabled - cgroup subsys enabled test by subsys ID
+ * @ssid: subsys ID of interest
+ *
+ * cgroup_subsys_enabled() can only be used with literal subsys names which
+ * is fine for individual subsystems but unsuitable for cgroup core.  This
+ * is slower static_key_enabled() based test indexed by @ssid.
+ */
+static bool cgroup_ssid_enabled(int ssid)
+{
+	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
+}
+
 /* IDR wrappers which synchronize using cgroup_idr_lock */
 static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
 			    gfp_t gfp_mask)
@@ -1482,7 +1495,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 		for_each_subsys(ss, i) {
 			if (strcmp(token, ss->legacy_name))
 				continue;
-			if (ss->disabled)
+			if (!cgroup_ssid_enabled(i))
 				continue;
 
 			/* Mutually exclusive option 'all' + subsystem name */
@@ -1513,7 +1526,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
 	 */
 	if (all_ss || (!one_ss && !opts->none && !opts->name))
 		for_each_subsys(ss, i)
-			if (!ss->disabled)
+			if (cgroup_ssid_enabled(i))
 				opts->subsys_mask |= (1 << i);
 
 	/*
@@ -2762,7 +2775,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 		if (tok[0] == '\0')
 			continue;
 		for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
-			if (ss->disabled || strcmp(tok + 1, ss->name))
+			if (!cgroup_ssid_enabled(ssid) ||
+			    strcmp(tok + 1, ss->name))
 				continue;
 
 			if (*tok == '+') {
@@ -3320,7 +3334,7 @@ static int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 {
 	int ret;
 
-	if (ss->disabled)
+	if (!cgroup_ssid_enabled(ss->id))
 		return 0;
 
 	if (!cfts || cfts[0].name[0] == '\0')
@@ -5082,7 +5096,7 @@ int __init cgroup_init(void)
 		 * disabled flag and cftype registration needs kmalloc,
 		 * both of which aren't available during early_init.
 		 */
-		if (ss->disabled)
+		if (!cgroup_ssid_enabled(ssid))
 			continue;
 
 		cgrp_dfl_root.subsys_mask |= 1 << ss->id;
@@ -5217,7 +5231,8 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
 	for_each_subsys(ss, i)
 		seq_printf(m, "%s\t%d\t%d\t%d\n",
 			   ss->legacy_name, ss->root->hierarchy_id,
-			   atomic_read(&ss->root->nr_cgrps), !ss->disabled);
+			   atomic_read(&ss->root->nr_cgrps),
+			   cgroup_ssid_enabled(i));
 
 	mutex_unlock(&cgroup_mutex);
 	return 0;
@@ -5508,7 +5523,6 @@ static int __init cgroup_disable(char *str)
 				continue;
 
 			static_branch_disable(cgroup_subsys_enabled_key[i]);
-			ss->disabled = 1;
 			printk(KERN_INFO "Disabling %s control group subsystem\n",
 			       ss->name);
 			break;

From 9e10a130d9b62af976d17d120c95f3650769312c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 18 Sep 2015 11:56:28 -0400
Subject: [PATCH 06/38] cgroup: replace cgroup_on_dfl() tests in controllers
 with cgroup_subsys_on_dfl()

cgroup_on_dfl() tests whether the cgroup's root is the default
hierarchy; however, an individual controller is only interested in
whether the controller is attached to the default hierarchy and never
tests a cgroup which doesn't belong to the hierarchy that the
controller is attached to.

This patch replaces cgroup_on_dfl() tests in controllers with faster
static_key based cgroup_subsys_on_dfl().  This leaves cgroup core as
the only user of cgroup_on_dfl() and the function is moved from the
header file to cgroup.c.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Zefan Li <lizefan@huawei.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
---
 block/blk-throttle.c   |  2 +-
 block/cfq-iosched.c    |  4 +--
 include/linux/cgroup.h | 58 ------------------------------------------
 kernel/cgroup.c        | 58 ++++++++++++++++++++++++++++++++++++++++++
 kernel/cpuset.c        | 23 +++++++++--------
 mm/memcontrol.c        |  4 +--
 6 files changed, 76 insertions(+), 73 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index c75a2636dd40..2149a1ddbacf 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -369,7 +369,7 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
 	 * regardless of the position of the group in the hierarchy.
 	 */
 	sq->parent_sq = &td->service_queue;
-	if (cgroup_on_dfl(blkg->blkcg->css.cgroup) && blkg->parent)
+	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
 		sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
 	tg->td = td;
 }
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 04de88463a98..1f9093e901da 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1581,7 +1581,7 @@ static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp)
 static void cfq_cpd_init(struct blkcg_policy_data *cpd)
 {
 	struct cfq_group_data *cgd = cpd_to_cfqgd(cpd);
-	unsigned int weight = cgroup_on_dfl(blkcg_root.css.cgroup) ?
+	unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
 			      CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
 
 	if (cpd_to_blkcg(cpd) == &blkcg_root)
@@ -1599,7 +1599,7 @@ static void cfq_cpd_free(struct blkcg_policy_data *cpd)
 static void cfq_cpd_bind(struct blkcg_policy_data *cpd)
 {
 	struct blkcg *blkcg = cpd_to_blkcg(cpd);
-	bool on_dfl = cgroup_on_dfl(blkcg_root.css.cgroup);
+	bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys);
 	unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL;
 
 	if (blkcg == &blkcg_root)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index c3a9f1eb4097..355bf2ed8867 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -433,64 +433,6 @@ static inline struct cgroup *task_cgroup(struct task_struct *task,
 	return task_css(task, subsys_id)->cgroup;
 }
 
-/**
- * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
- * @cgrp: the cgroup of interest
- *
- * The default hierarchy is the v2 interface of cgroup and this function
- * can be used to test whether a cgroup is on the default hierarchy for
- * cases where a subsystem should behave differnetly depending on the
- * interface version.
- *
- * The set of behaviors which change on the default hierarchy are still
- * being determined and the mount option is prefixed with __DEVEL__.
- *
- * List of changed behaviors:
- *
- * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
- *   and "name" are disallowed.
- *
- * - When mounting an existing superblock, mount options should match.
- *
- * - Remount is disallowed.
- *
- * - rename(2) is disallowed.
- *
- * - "tasks" is removed.  Everything should be at process granularity.  Use
- *   "cgroup.procs" instead.
- *
- * - "cgroup.procs" is not sorted.  pids will be unique unless they got
- *   recycled inbetween reads.
- *
- * - "release_agent" and "notify_on_release" are removed.  Replacement
- *   notification mechanism will be implemented.
- *
- * - "cgroup.clone_children" is removed.
- *
- * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
- *   and its descendants contain no task; otherwise, 1.  The file also
- *   generates kernfs notification which can be monitored through poll and
- *   [di]notify when the value of the file changes.
- *
- * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
- *   take masks of ancestors with non-empty cpus/mems, instead of being
- *   moved to an ancestor.
- *
- * - cpuset: a task can be moved into an empty cpuset, and again it takes
- *   masks of ancestors.
- *
- * - memcg: use_hierarchy is on by default and the cgroup file for the flag
- *   is not created.
- *
- * - blkcg: blk-throttle becomes properly hierarchical.
- *
- * - debug: disallowed on the default hierarchy.
- */
-static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
-{
-	return cgrp->root == &cgrp_dfl_root;
-}
-
 /* no synchronization, the result can only be used as a hint */
 static inline bool cgroup_has_tasks(struct cgroup *cgrp)
 {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 5703ba791b3d..c24f929fdeb7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -237,6 +237,64 @@ static bool cgroup_ssid_enabled(int ssid)
 	return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
 }
 
+/**
+ * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
+ * @cgrp: the cgroup of interest
+ *
+ * The default hierarchy is the v2 interface of cgroup and this function
+ * can be used to test whether a cgroup is on the default hierarchy for
+ * cases where a subsystem should behave differnetly depending on the
+ * interface version.
+ *
+ * The set of behaviors which change on the default hierarchy are still
+ * being determined and the mount option is prefixed with __DEVEL__.
+ *
+ * List of changed behaviors:
+ *
+ * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
+ *   and "name" are disallowed.
+ *
+ * - When mounting an existing superblock, mount options should match.
+ *
+ * - Remount is disallowed.
+ *
+ * - rename(2) is disallowed.
+ *
+ * - "tasks" is removed.  Everything should be at process granularity.  Use
+ *   "cgroup.procs" instead.
+ *
+ * - "cgroup.procs" is not sorted.  pids will be unique unless they got
+ *   recycled inbetween reads.
+ *
+ * - "release_agent" and "notify_on_release" are removed.  Replacement
+ *   notification mechanism will be implemented.
+ *
+ * - "cgroup.clone_children" is removed.
+ *
+ * - "cgroup.subtree_populated" is available.  Its value is 0 if the cgroup
+ *   and its descendants contain no task; otherwise, 1.  The file also
+ *   generates kernfs notification which can be monitored through poll and
+ *   [di]notify when the value of the file changes.
+ *
+ * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
+ *   take masks of ancestors with non-empty cpus/mems, instead of being
+ *   moved to an ancestor.
+ *
+ * - cpuset: a task can be moved into an empty cpuset, and again it takes
+ *   masks of ancestors.
+ *
+ * - memcg: use_hierarchy is on by default and the cgroup file for the flag
+ *   is not created.
+ *
+ * - blkcg: blk-throttle becomes properly hierarchical.
+ *
+ * - debug: disallowed on the default hierarchy.
+ */
+static bool cgroup_on_dfl(const struct cgroup *cgrp)
+{
+	return cgrp->root == &cgrp_dfl_root;
+}
+
 /* IDR wrappers which synchronize using cgroup_idr_lock */
 static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
 			    gfp_t gfp_mask)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f0acff0f66c9..20eedd8098c0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -473,7 +473,8 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 
 	/* On legacy hiearchy, we must be a subset of our parent cpuset. */
 	ret = -EACCES;
-	if (!cgroup_on_dfl(cur->css.cgroup) && !is_cpuset_subset(trial, par))
+	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+	    !is_cpuset_subset(trial, par))
 		goto out;
 
 	/*
@@ -879,7 +880,8 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 		 * If it becomes empty, inherit the effective mask of the
 		 * parent, which is guaranteed to have some CPUs.
 		 */
-		if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus))
+		if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+		    cpumask_empty(new_cpus))
 			cpumask_copy(new_cpus, parent->effective_cpus);
 
 		/* Skip the whole subtree if the cpumask remains the same. */
@@ -896,7 +898,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 		cpumask_copy(cp->effective_cpus, new_cpus);
 		spin_unlock_irq(&callback_lock);
 
-		WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+		WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
 			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
 
 		update_tasks_cpumask(cp);
@@ -1135,7 +1137,8 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
 		 * If it becomes empty, inherit the effective mask of the
 		 * parent, which is guaranteed to have some MEMs.
 		 */
-		if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems))
+		if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+		    nodes_empty(*new_mems))
 			*new_mems = parent->effective_mems;
 
 		/* Skip the whole subtree if the nodemask remains the same. */
@@ -1152,7 +1155,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
 		cp->effective_mems = *new_mems;
 		spin_unlock_irq(&callback_lock);
 
-		WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
+		WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
 			!nodes_equal(cp->mems_allowed, cp->effective_mems));
 
 		update_tasks_nodemask(cp);
@@ -1440,7 +1443,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css,
 
 	/* allow moving tasks into an empty cpuset if on default hierarchy */
 	ret = -ENOSPC;
-	if (!cgroup_on_dfl(css->cgroup) &&
+	if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
 	    (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
 		goto out_unlock;
 
@@ -1952,7 +1955,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	cpuset_inc();
 
 	spin_lock_irq(&callback_lock);
-	if (cgroup_on_dfl(cs->css.cgroup)) {
+	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
 		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
 		cs->effective_mems = parent->effective_mems;
 	}
@@ -2029,7 +2032,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 	mutex_lock(&cpuset_mutex);
 	spin_lock_irq(&callback_lock);
 
-	if (cgroup_on_dfl(root_css->cgroup)) {
+	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
 		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
 		top_cpuset.mems_allowed = node_possible_map;
 	} else {
@@ -2210,7 +2213,7 @@ retry:
 	cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
 	mems_updated = !nodes_equal(new_mems, cs->effective_mems);
 
-	if (cgroup_on_dfl(cs->css.cgroup))
+	if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
 		hotplug_update_tasks(cs, &new_cpus, &new_mems,
 				     cpus_updated, mems_updated);
 	else
@@ -2241,7 +2244,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 	static cpumask_t new_cpus;
 	static nodemask_t new_mems;
 	bool cpus_updated, mems_updated;
-	bool on_dfl = cgroup_on_dfl(top_cpuset.css.cgroup);
+	bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
 
 	mutex_lock(&cpuset_mutex);
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6ddaeba34e09..b35c4cc47a30 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -434,7 +434,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
 
 	memcg = page->mem_cgroup;
 
-	if (!memcg || !cgroup_on_dfl(memcg->css.cgroup))
+	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		memcg = root_mem_cgroup;
 
 	rcu_read_unlock();
@@ -5059,7 +5059,7 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
 	 * guarantees that @root doesn't have any children, so turning it
 	 * on for the root memcg is enough.
 	 */
-	if (cgroup_on_dfl(root_css->cgroup))
+	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
 		root_mem_cgroup->use_hierarchy = true;
 	else
 		root_mem_cgroup->use_hierarchy = false;

From 4a07c222d3afb00e1113834fee38d23a8e5d71dc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 18 Sep 2015 17:54:22 -0400
Subject: [PATCH 07/38] cgroup: replace "cgroup.populated" with "cgroup.events"

memcg already uses "memory.events" for event reporting and other
controllers may need event reporting too.  Let's standardize on
"$SUBSYS.events" interface file for reporting events which don't
happen too frequently and thus can share event notification.

"cgroup.populated" is replaced with "populated" field in
"cgroup.events" and documentation is updated accordingly.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---
 Documentation/cgroups/unified-hierarchy.txt | 15 ++++++++++-----
 include/linux/cgroup-defs.h                 |  2 +-
 kernel/cgroup.c                             | 17 +++++++++--------
 3 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
index e0975c2cf03d..176b940f8327 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -341,11 +341,11 @@ is riddled with issues.
   unnecessarily complicated and probably done this way because event
   delivery itself was expensive.
 
-Unified hierarchy implements an interface file "cgroup.populated"
-which can be used to monitor whether the cgroup's subhierarchy has
-tasks in it or not.  Its value is 0 if there is no task in the cgroup
-and its descendants; otherwise, 1.  poll and [id]notify events are
-triggered when the value changes.
+Unified hierarchy implements "populated" field in "cgroup.events"
+interface file which can be used to monitor whether the cgroup's
+subhierarchy has tasks in it or not.  Its value is 0 if there is no
+task in the cgroup and its descendants; otherwise, 1.  poll and
+[id]notify events are triggered when the value changes.
 
 This is significantly lighter and simpler and trivially allows
 delegating management of subhierarchy - subhierarchy monitoring can
@@ -435,6 +435,11 @@ may be specified in any order and not all pairs have to be specified.
   the first entry in the file.  Specific entries can use "default" as
   its value to indicate inheritance of the default value.
 
+- For events which are not very high frequency, an interface file
+  "events" should be created which lists event key value pairs.
+  Whenever a notifiable event happens, file modified event should be
+  generated on the file.
+
 
 5-4. Per-Controller Changes
 
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index c5d41c3c1f00..d95cc88e9dc2 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -226,7 +226,7 @@ struct cgroup {
 
 	struct kernfs_node *kn;		/* cgroup kernfs entry */
 	struct kernfs_node *procs_kn;	/* kn for "cgroup.procs" */
-	struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */
+	struct kernfs_node *events_kn;	/* kn for "cgroup.events" */
 
 	/*
 	 * The bitmask of subsystems enabled on the child cgroups.
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c24f929fdeb7..75eba25d8dfd 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -611,8 +611,8 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 		if (!trigger)
 			break;
 
-		if (cgrp->populated_kn)
-			kernfs_notify(cgrp->populated_kn);
+		if (cgrp->events_kn)
+			kernfs_notify(cgrp->events_kn);
 		cgrp = cgroup_parent(cgrp);
 	} while (cgrp);
 }
@@ -3045,9 +3045,10 @@ err_undo_css:
 	goto out_unlock;
 }
 
-static int cgroup_populated_show(struct seq_file *seq, void *v)
+static int cgroup_events_show(struct seq_file *seq, void *v)
 {
-	seq_printf(seq, "%d\n", (bool)seq_css(seq)->cgroup->populated_cnt);
+	seq_printf(seq, "populated %d\n",
+		   (bool)seq_css(seq)->cgroup->populated_cnt);
 	return 0;
 }
 
@@ -3214,8 +3215,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
 
 	if (cft->write == cgroup_procs_write)
 		cgrp->procs_kn = kn;
-	else if (cft->seq_show == cgroup_populated_show)
-		cgrp->populated_kn = kn;
+	else if (cft->seq_show == cgroup_events_show)
+		cgrp->events_kn = kn;
 	return 0;
 }
 
@@ -4388,9 +4389,9 @@ static struct cftype cgroup_dfl_base_files[] = {
 		.write = cgroup_subtree_control_write,
 	},
 	{
-		.name = "cgroup.populated",
+		.name = "cgroup.events",
 		.flags = CFTYPE_NOT_ON_ROOT,
-		.seq_show = cgroup_populated_show,
+		.seq_show = cgroup_events_show,
 	},
 	{ }	/* terminate */
 };

From 7dbdb199d3bf88f043ea17e97113eb28d5b100bc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 18 Sep 2015 17:54:23 -0400
Subject: [PATCH 08/38] cgroup: replace cftype->mode with CFTYPE_WORLD_WRITABLE

cftype->mode allows controllers to give arbitrary permissions to
interface knobs.  Except for "cgroup.event_control", the existing uses
are spurious.

* Some explicitly specify S_IRUGO | S_IWUSR even though that's the
  default.

* "cpuset.memory_pressure" specifies S_IRUGO while also setting a
  write callback which returns -EACCES.  All it needs to do is simply
  not setting a write callback.

"cgroup.event_control" uses cftype->mode to make the file
world-writable.  It's a misdesigned interface and we don't want
controllers to be tweaking interface file permissions in general.
This patch removes cftype->mode and all its spurious uses and
implements CFTYPE_WORLD_WRITABLE for "cgroup.event_control" which is
marked as compatibility-only.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/cgroup-defs.h |  6 +-----
 kernel/cgroup.c             | 19 +++++++------------
 kernel/cpuset.c             |  6 ------
 mm/memcontrol.c             |  3 +--
 4 files changed, 9 insertions(+), 25 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index d95cc88e9dc2..10d814bcd487 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -76,6 +76,7 @@ enum {
 	CFTYPE_ONLY_ON_ROOT	= (1 << 0),	/* only create on root cgrp */
 	CFTYPE_NOT_ON_ROOT	= (1 << 1),	/* don't create on root cgrp */
 	CFTYPE_NO_PREFIX	= (1 << 3),	/* (DON'T USE FOR NEW FILES) no subsys prefix */
+	CFTYPE_WORLD_WRITABLE	= (1 << 4),	/* (DON'T USE FOR NEW FILES) S_IWUGO */
 
 	/* internal flags, do not use outside cgroup core proper */
 	__CFTYPE_ONLY_ON_DFL	= (1 << 16),	/* only on default hierarchy */
@@ -324,11 +325,6 @@ struct cftype {
 	 */
 	char name[MAX_CFTYPE_NAME];
 	unsigned long private;
-	/*
-	 * If not 0, file mode is set to this value, otherwise it will
-	 * be figured out automatically
-	 */
-	umode_t mode;
 
 	/*
 	 * The maximum length of string, excluding trailing nul, that can
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 75eba25d8dfd..5031edc6f077 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1139,23 +1139,21 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
  * cgroup_file_mode - deduce file mode of a control file
  * @cft: the control file in question
  *
- * returns cft->mode if ->mode is not 0
- * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
- * returns S_IRUGO if it has only a read handler
- * returns S_IWUSR if it has only a write hander
+ * S_IRUGO for read, S_IWUSR for write.
  */
 static umode_t cgroup_file_mode(const struct cftype *cft)
 {
 	umode_t mode = 0;
 
-	if (cft->mode)
-		return cft->mode;
-
 	if (cft->read_u64 || cft->read_s64 || cft->seq_show)
 		mode |= S_IRUGO;
 
-	if (cft->write_u64 || cft->write_s64 || cft->write)
-		mode |= S_IWUSR;
+	if (cft->write_u64 || cft->write_s64 || cft->write) {
+		if (cft->flags & CFTYPE_WORLD_WRITABLE)
+			mode |= S_IWUGO;
+		else
+			mode |= S_IWUSR;
+	}
 
 	return mode;
 }
@@ -4371,7 +4369,6 @@ static struct cftype cgroup_dfl_base_files[] = {
 		.seq_show = cgroup_pidlist_show,
 		.private = CGROUP_FILE_PROCS,
 		.write = cgroup_procs_write,
-		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
 		.name = "cgroup.controllers",
@@ -4406,7 +4403,6 @@ static struct cftype cgroup_legacy_base_files[] = {
 		.seq_show = cgroup_pidlist_show,
 		.private = CGROUP_FILE_PROCS,
 		.write = cgroup_procs_write,
-		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
 		.name = "cgroup.clone_children",
@@ -4426,7 +4422,6 @@ static struct cftype cgroup_legacy_base_files[] = {
 		.seq_show = cgroup_pidlist_show,
 		.private = CGROUP_FILE_TASKS,
 		.write = cgroup_tasks_write,
-		.mode = S_IRUGO | S_IWUSR,
 	},
 	{
 		.name = "notify_on_release",
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 20eedd8098c0..312961ef3ccc 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1597,9 +1597,6 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
 	case FILE_MEMORY_PRESSURE_ENABLED:
 		cpuset_memory_pressure_enabled = !!val;
 		break;
-	case FILE_MEMORY_PRESSURE:
-		retval = -EACCES;
-		break;
 	case FILE_SPREAD_PAGE:
 		retval = update_flag(CS_SPREAD_PAGE, cs, val);
 		break;
@@ -1866,9 +1863,6 @@ static struct cftype files[] = {
 	{
 		.name = "memory_pressure",
 		.read_u64 = cpuset_read_u64,
-		.write_u64 = cpuset_write_u64,
-		.private = FILE_MEMORY_PRESSURE,
-		.mode = S_IRUGO,
 	},
 
 	{
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index b35c4cc47a30..e672f2689326 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4060,8 +4060,7 @@ static struct cftype mem_cgroup_legacy_files[] = {
 	{
 		.name = "cgroup.event_control",		/* XXX: for compat */
 		.write = memcg_write_event_control,
-		.flags = CFTYPE_NO_PREFIX,
-		.mode = S_IWUGO,
+		.flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
 	},
 	{
 		.name = "swappiness",

From ccdca2187b03decd186df13e44c57fbf1974d0a0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 18 Sep 2015 17:54:23 -0400
Subject: [PATCH 09/38] cgroup: relocate cgroup_populate_dir()

Move it upwards so that it's right below cgroup_clear_dir() and the
forward declaration is unnecessary.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---
 kernel/cgroup.c | 63 ++++++++++++++++++++++++-------------------------
 1 file changed, 31 insertions(+), 32 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 5031edc6f077..c38c3a229d14 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1116,7 +1116,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
  * update of a tasks cgroup pointer by cgroup_attach_task()
  */
 
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask);
 static struct kernfs_syscall_ops cgroup_kf_syscall_ops;
 static const struct file_operations proc_cgroupstats_operations;
 
@@ -1333,6 +1332,37 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
 	}
 }
 
+/**
+ * cgroup_populate_dir - create subsys files in a cgroup directory
+ * @cgrp: target cgroup
+ * @subsys_mask: mask of the subsystem ids whose files should be added
+ *
+ * On failure, no file is added.
+ */
+static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
+{
+	struct cgroup_subsys *ss;
+	int i, ret = 0;
+
+	/* process cftsets of each subsystem */
+	for_each_subsys(ss, i) {
+		struct cftype *cfts;
+
+		if (!(subsys_mask & (1 << i)))
+			continue;
+
+		list_for_each_entry(cfts, &ss->cfts, node) {
+			ret = cgroup_addrm_files(cgrp, cfts, true);
+			if (ret < 0)
+				goto err;
+		}
+	}
+	return 0;
+err:
+	cgroup_clear_dir(cgrp, subsys_mask);
+	return ret;
+}
+
 static int rebind_subsystems(struct cgroup_root *dst_root,
 			     unsigned long ss_mask)
 {
@@ -4438,37 +4468,6 @@ static struct cftype cgroup_legacy_base_files[] = {
 	{ }	/* terminate */
 };
 
-/**
- * cgroup_populate_dir - create subsys files in a cgroup directory
- * @cgrp: target cgroup
- * @subsys_mask: mask of the subsystem ids whose files should be added
- *
- * On failure, no file is added.
- */
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
-{
-	struct cgroup_subsys *ss;
-	int i, ret = 0;
-
-	/* process cftsets of each subsystem */
-	for_each_subsys(ss, i) {
-		struct cftype *cfts;
-
-		if (!(subsys_mask & (1 << i)))
-			continue;
-
-		list_for_each_entry(cfts, &ss->cfts, node) {
-			ret = cgroup_addrm_files(cgrp, cfts, true);
-			if (ret < 0)
-				goto err;
-		}
-	}
-	return 0;
-err:
-	cgroup_clear_dir(cgrp, subsys_mask);
-	return ret;
-}
-
 /*
  * css destruction is four-stage process.
  *

From 6732ed853af942ba20ddbd091336acc7df569119 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 18 Sep 2015 17:54:23 -0400
Subject: [PATCH 10/38] cgroup: make cgroup_addrm_files() clean up after itself
 on failures

After a file creation failure, cgroup_addrm_files() it didn't remove
the files which had already been created.  When cgroup_populate_dir()
is the caller, this is fine as the caller performs cleanup; however,
for other callers, this may leave unactivated dangling files behind.
As kernfs directory removals are recursive, this doesn't lead to
permanent memory leak but it can, for example, fail future attempts to
create those files again.

There's no point in keeping around this sort of subtlety and it gets
in the way of planned updates to file handling.  This patch makes
cgroup_addrm_files() clean up after itself on failures.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---
 kernel/cgroup.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c38c3a229d14..f09bab1cf275 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3255,19 +3255,18 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
  * @is_add: whether to add or remove
  *
  * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
- * For removals, this function never fails.  If addition fails, this
- * function doesn't remove files already added.  The caller is responsible
- * for cleaning up.
+ * For removals, this function never fails.
  */
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add)
 {
-	struct cftype *cft;
+	struct cftype *cft, *cft_end = NULL;
 	int ret;
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	for (cft = cfts; cft->name[0] != '\0'; cft++) {
+restart:
+	for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
 		/* does cft->flags tell us to skip this file on @cgrp? */
 		if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
 			continue;
@@ -3283,7 +3282,9 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
 			if (ret) {
 				pr_warn("%s: failed to add %s, err=%d\n",
 					__func__, cft->name, ret);
-				return ret;
+				cft_end = cft;
+				is_add = false;
+				goto restart;
 			}
 		} else {
 			cgroup_rm_file(cgrp, cft);

From 1ada48387a31b437074aa8973ce81de6e8c60bde Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 18 Sep 2015 17:54:23 -0400
Subject: [PATCH 11/38] cgroup: cosmetic updates to rebind_subsystems()

* Use local variables @scgrp and @dcgrp for @src_root->cgrp and
  @dst_root->cgrp respectively.

* Use initializers to set @src_root and @css in the inner bind loop.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---
 kernel/cgroup.c | 31 +++++++++++++++----------------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f09bab1cf275..a4ff4965e53b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1366,6 +1366,7 @@ err:
 static int rebind_subsystems(struct cgroup_root *dst_root,
 			     unsigned long ss_mask)
 {
+	struct cgroup *dcgrp = &dst_root->cgrp;
 	struct cgroup_subsys *ss;
 	unsigned long tmp_ss_mask;
 	int ssid, i, ret;
@@ -1387,7 +1388,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
 	if (dst_root == &cgrp_dfl_root)
 		tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
 
-	ret = cgroup_populate_dir(&dst_root->cgrp, tmp_ss_mask);
+	ret = cgroup_populate_dir(dcgrp, tmp_ss_mask);
 	if (ret) {
 		if (dst_root != &cgrp_dfl_root)
 			return ret;
@@ -1413,37 +1414,35 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
 		cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
 
 	for_each_subsys_which(ss, ssid, &ss_mask) {
-		struct cgroup_root *src_root;
-		struct cgroup_subsys_state *css;
+		struct cgroup_root *src_root = ss->root;
+		struct cgroup *scgrp = &src_root->cgrp;
+		struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
 		struct css_set *cset;
 
-		src_root = ss->root;
-		css = cgroup_css(&src_root->cgrp, ss);
+		WARN_ON(!css || cgroup_css(dcgrp, ss));
 
-		WARN_ON(!css || cgroup_css(&dst_root->cgrp, ss));
-
-		RCU_INIT_POINTER(src_root->cgrp.subsys[ssid], NULL);
-		rcu_assign_pointer(dst_root->cgrp.subsys[ssid], css);
+		RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
+		rcu_assign_pointer(dcgrp->subsys[ssid], css);
 		ss->root = dst_root;
-		css->cgroup = &dst_root->cgrp;
+		css->cgroup = dcgrp;
 
 		down_write(&css_set_rwsem);
 		hash_for_each(css_set_table, i, cset, hlist)
 			list_move_tail(&cset->e_cset_node[ss->id],
-				       &dst_root->cgrp.e_csets[ss->id]);
+				       &dcgrp->e_csets[ss->id]);
 		up_write(&css_set_rwsem);
 
 		src_root->subsys_mask &= ~(1 << ssid);
-		src_root->cgrp.subtree_control &= ~(1 << ssid);
-		cgroup_refresh_child_subsys_mask(&src_root->cgrp);
+		scgrp->subtree_control &= ~(1 << ssid);
+		cgroup_refresh_child_subsys_mask(scgrp);
 
 		/* default hierarchy doesn't enable controllers by default */
 		dst_root->subsys_mask |= 1 << ssid;
 		if (dst_root == &cgrp_dfl_root) {
 			static_branch_enable(cgroup_subsys_on_dfl_key[ssid]);
 		} else {
-			dst_root->cgrp.subtree_control |= 1 << ssid;
-			cgroup_refresh_child_subsys_mask(&dst_root->cgrp);
+			dcgrp->subtree_control |= 1 << ssid;
+			cgroup_refresh_child_subsys_mask(dcgrp);
 			static_branch_disable(cgroup_subsys_on_dfl_key[ssid]);
 		}
 
@@ -1451,7 +1450,7 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
 			ss->bind(css);
 	}
 
-	kernfs_activate(dst_root->cgrp.kn);
+	kernfs_activate(dcgrp->kn);
 	return 0;
 }
 

From 4df8dc903161c03cd9bff9077d8427fefe808e6d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 18 Sep 2015 17:54:23 -0400
Subject: [PATCH 12/38] cgroup: restructure file creation / removal handling

The file creation / removal path has always been a bit icky and the
planned notification update requires css during file creation.
Restructure as follows.

* cgroup_addrm_files() now takes both @css and @cgrp and is only
  called directly by other file handling functions.

* cgroup_populate/clear_dir() are replaced with
  css_populate/clear_dir() taking @css and @cgrp_override.
  @cgrp_override is used only when files needs to be created on /
  removed from a cgroup which isn't attached to @css which happens
  during subsystem rebinds.  Subsystem loops are moved to the callers.

* cgroup_add_file() now takes both @css and @cgrp.  @css isn't used
  yet but will be used by the planned notification update.

This patch doens't cause any behavior changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---
 kernel/cgroup.c | 143 +++++++++++++++++++++++++-----------------------
 1 file changed, 76 insertions(+), 67 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a4ff4965e53b..06c9d1aeea9d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -221,7 +221,8 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 		      bool visible);
 static void css_release(struct percpu_ref *ref);
 static void kill_css(struct cgroup_subsys_state *css);
-static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+static int cgroup_addrm_files(struct cgroup_subsys_state *css,
+			      struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add);
 
 /**
@@ -1313,53 +1314,57 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
 }
 
 /**
- * cgroup_clear_dir - remove subsys files in a cgroup directory
- * @cgrp: target cgroup
- * @subsys_mask: mask of the subsystem ids whose files should be removed
+ * css_clear_dir - remove subsys files in a cgroup directory
+ * @css: taget css
+ * @cgrp_override: specify if target cgroup is different from css->cgroup
  */
-static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask)
+static void css_clear_dir(struct cgroup_subsys_state *css,
+			  struct cgroup *cgrp_override)
 {
-	struct cgroup_subsys *ss;
-	int i;
+	struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+	struct cftype *cfts;
 
-	for_each_subsys(ss, i) {
-		struct cftype *cfts;
-
-		if (!(subsys_mask & (1 << i)))
-			continue;
-		list_for_each_entry(cfts, &ss->cfts, node)
-			cgroup_addrm_files(cgrp, cfts, false);
-	}
+	list_for_each_entry(cfts, &css->ss->cfts, node)
+		cgroup_addrm_files(css, cgrp, cfts, false);
 }
 
 /**
- * cgroup_populate_dir - create subsys files in a cgroup directory
- * @cgrp: target cgroup
- * @subsys_mask: mask of the subsystem ids whose files should be added
+ * css_populate_dir - create subsys files in a cgroup directory
+ * @css: target css
+ * @cgrp_overried: specify if target cgroup is different from css->cgroup
  *
  * On failure, no file is added.
  */
-static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask)
+static int css_populate_dir(struct cgroup_subsys_state *css,
+			    struct cgroup *cgrp_override)
 {
-	struct cgroup_subsys *ss;
-	int i, ret = 0;
+	struct cgroup *cgrp = cgrp_override ?: css->cgroup;
+	struct cftype *cfts, *failed_cfts;
+	int ret;
 
-	/* process cftsets of each subsystem */
-	for_each_subsys(ss, i) {
-		struct cftype *cfts;
+	if (!css->ss) {
+		if (cgroup_on_dfl(cgrp))
+			cfts = cgroup_dfl_base_files;
+		else
+			cfts = cgroup_legacy_base_files;
 
-		if (!(subsys_mask & (1 << i)))
-			continue;
+		return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
+	}
 
-		list_for_each_entry(cfts, &ss->cfts, node) {
-			ret = cgroup_addrm_files(cgrp, cfts, true);
-			if (ret < 0)
-				goto err;
+	list_for_each_entry(cfts, &css->ss->cfts, node) {
+		ret = cgroup_addrm_files(css, cgrp, cfts, true);
+		if (ret < 0) {
+			failed_cfts = cfts;
+			goto err;
 		}
 	}
 	return 0;
 err:
-	cgroup_clear_dir(cgrp, subsys_mask);
+	list_for_each_entry(cfts, &css->ss->cfts, node) {
+		if (cfts == failed_cfts)
+			break;
+		cgroup_addrm_files(css, cgrp, cfts, false);
+	}
 	return ret;
 }
 
@@ -1388,10 +1393,13 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
 	if (dst_root == &cgrp_dfl_root)
 		tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask;
 
-	ret = cgroup_populate_dir(dcgrp, tmp_ss_mask);
-	if (ret) {
-		if (dst_root != &cgrp_dfl_root)
-			return ret;
+	for_each_subsys_which(ss, ssid, &tmp_ss_mask) {
+		struct cgroup *scgrp = &ss->root->cgrp;
+		int tssid;
+
+		ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp);
+		if (!ret)
+			continue;
 
 		/*
 		 * Rebinding back to the default root is not allowed to
@@ -1399,20 +1407,27 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
 		 * be rare.  Moving subsystems back and forth even more so.
 		 * Just warn about it and continue.
 		 */
-		if (cgrp_dfl_root_visible) {
-			pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
-				ret, ss_mask);
-			pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
+		if (dst_root == &cgrp_dfl_root) {
+			if (cgrp_dfl_root_visible) {
+				pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n",
+					ret, ss_mask);
+				pr_warn("you may retry by moving them to a different hierarchy and unbinding\n");
+			}
+			continue;
 		}
+
+		for_each_subsys_which(ss, tssid, &tmp_ss_mask) {
+			if (tssid == ssid)
+				break;
+			css_clear_dir(cgroup_css(scgrp, ss), dcgrp);
+		}
+		return ret;
 	}
 
 	/*
 	 * Nothing can fail from this point on.  Remove files for the
 	 * removed subsystems and rebind each subsystem.
 	 */
-	for_each_subsys_which(ss, ssid, &ss_mask)
-		cgroup_clear_dir(&ss->root->cgrp, 1 << ssid);
-
 	for_each_subsys_which(ss, ssid, &ss_mask) {
 		struct cgroup_root *src_root = ss->root;
 		struct cgroup *scgrp = &src_root->cgrp;
@@ -1421,6 +1436,8 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
 
 		WARN_ON(!css || cgroup_css(dcgrp, ss));
 
+		css_clear_dir(css, NULL);
+
 		RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
 		rcu_assign_pointer(dcgrp->subsys[ssid], css);
 		ss->root = dst_root;
@@ -1791,7 +1808,6 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 {
 	LIST_HEAD(tmp_links);
 	struct cgroup *root_cgrp = &root->cgrp;
-	struct cftype *base_files;
 	struct css_set *cset;
 	int i, ret;
 
@@ -1830,12 +1846,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 	}
 	root_cgrp->kn = root->kf_root->kn;
 
-	if (root == &cgrp_dfl_root)
-		base_files = cgroup_dfl_base_files;
-	else
-		base_files = cgroup_legacy_base_files;
-
-	ret = cgroup_addrm_files(root_cgrp, base_files, true);
+	ret = css_populate_dir(&root_cgrp->self, NULL);
 	if (ret)
 		goto destroy_root;
 
@@ -2985,7 +2996,8 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 				ret = create_css(child, ss,
 					cgrp->subtree_control & (1 << ssid));
 			else
-				ret = cgroup_populate_dir(child, 1 << ssid);
+				ret = css_populate_dir(cgroup_css(child, ss),
+						       NULL);
 			if (ret)
 				goto err_undo_css;
 		}
@@ -3018,7 +3030,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 			if (css_disable & (1 << ssid)) {
 				kill_css(css);
 			} else {
-				cgroup_clear_dir(child, 1 << ssid);
+				css_clear_dir(css, NULL);
 				if (ss->css_reset)
 					ss->css_reset(css);
 			}
@@ -3066,7 +3078,7 @@ err_undo_css:
 			if (css_enable & (1 << ssid))
 				kill_css(css);
 			else
-				cgroup_clear_dir(child, 1 << ssid);
+				css_clear_dir(css, NULL);
 		}
 	}
 	goto out_unlock;
@@ -3218,7 +3230,8 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn)
 	return kernfs_setattr(kn, &iattr);
 }
 
-static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
+static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
+			   struct cftype *cft)
 {
 	char name[CGROUP_FILE_NAME_MAX];
 	struct kernfs_node *kn;
@@ -3249,14 +3262,16 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft)
 
 /**
  * cgroup_addrm_files - add or remove files to a cgroup directory
- * @cgrp: the target cgroup
+ * @css: the target css
+ * @cgrp: the target cgroup (usually css->cgroup)
  * @cfts: array of cftypes to be added
  * @is_add: whether to add or remove
  *
  * Depending on @is_add, add or remove files defined by @cfts on @cgrp.
  * For removals, this function never fails.
  */
-static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
+static int cgroup_addrm_files(struct cgroup_subsys_state *css,
+			      struct cgroup *cgrp, struct cftype cfts[],
 			      bool is_add)
 {
 	struct cftype *cft, *cft_end = NULL;
@@ -3277,7 +3292,7 @@ restart:
 			continue;
 
 		if (is_add) {
-			ret = cgroup_add_file(cgrp, cft);
+			ret = cgroup_add_file(css, cgrp, cft);
 			if (ret) {
 				pr_warn("%s: failed to add %s, err=%d\n",
 					__func__, cft->name, ret);
@@ -3309,7 +3324,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add)
 		if (cgroup_is_dead(cgrp))
 			continue;
 
-		ret = cgroup_addrm_files(cgrp, cfts, is_add);
+		ret = cgroup_addrm_files(css, cgrp, cfts, is_add);
 		if (ret)
 			break;
 	}
@@ -4685,7 +4700,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 	css->id = err;
 
 	if (visible) {
-		err = cgroup_populate_dir(cgrp, 1 << ss->id);
+		err = css_populate_dir(css, NULL);
 		if (err)
 			goto err_free_id;
 	}
@@ -4711,7 +4726,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 
 err_list_del:
 	list_del_rcu(&css->sibling);
-	cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+	css_clear_dir(css, NULL);
 err_free_id:
 	cgroup_idr_remove(&ss->css_idr, css->id);
 err_free_percpu_ref:
@@ -4728,7 +4743,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 	struct cgroup_root *root;
 	struct cgroup_subsys *ss;
 	struct kernfs_node *kn;
-	struct cftype *base_files;
 	int ssid, ret;
 
 	/* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
@@ -4804,12 +4818,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 	if (ret)
 		goto out_destroy;
 
-	if (cgroup_on_dfl(cgrp))
-		base_files = cgroup_dfl_base_files;
-	else
-		base_files = cgroup_legacy_base_files;
-
-	ret = cgroup_addrm_files(cgrp, base_files, true);
+	ret = css_populate_dir(&cgrp->self, NULL);
 	if (ret)
 		goto out_destroy;
 
@@ -4896,7 +4905,7 @@ static void kill_css(struct cgroup_subsys_state *css)
 	 * This must happen before css is disassociated with its cgroup.
 	 * See seq_css() for details.
 	 */
-	cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
+	css_clear_dir(css, NULL);
 
 	/*
 	 * Killing would put the base ref, but we need to keep it alive

From 6f60eade2433cb3a38687d5f8a4f44b92c6c51bf Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 18 Sep 2015 17:54:23 -0400
Subject: [PATCH 13/38] cgroup: generalize obtaining the handles of and
 notifying cgroup files

cgroup core handles creations and removals of cgroup interface files
as described by cftypes.  There are cases where the handle for a given
file instance is necessary, for example, to generate a file modified
event.  Currently, this is handled by explicitly matching the callback
method pointer and storing the file handle manually in
cgroup_add_file().  While this simple approach works for cgroup core
files, it can't for controller interface files.

This patch generalizes cgroup interface file handle handling.  struct
cgroup_file is defined and each cftype can optionally tell cgroup core
to store the file handle by setting ->file_offset.  A file handle
remains accessible as long as the containing css is accessible.

Both "cgroup.procs" and "cgroup.events" are converted to use the new
generic mechanism instead of hooking directly into cgroup_add_file().
Also, cgroup_file_notify() which takes a struct cgroup_file and
generates a file modified event on it is added and replaces explicit
kernfs_notify() invocations.

This generalizes cgroup file handle handling and allows controllers to
generate file modified notifications.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/cgroup-defs.h | 26 ++++++++++++++++++++++++--
 include/linux/cgroup.h      | 13 +++++++++++++
 kernel/cgroup.c             | 26 +++++++++++++++++++-------
 3 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 10d814bcd487..df589a097539 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -83,6 +83,17 @@ enum {
 	__CFTYPE_NOT_ON_DFL	= (1 << 17),	/* not on default hierarchy */
 };
 
+/*
+ * cgroup_file is the handle for a file instance created in a cgroup which
+ * is used, for example, to generate file changed notifications.  This can
+ * be obtained by setting cftype->file_offset.
+ */
+struct cgroup_file {
+	/* do not access any fields from outside cgroup core */
+	struct list_head node;			/* anchored at css->files */
+	struct kernfs_node *kn;
+};
+
 /*
  * Per-subsystem/per-cgroup state maintained by the system.  This is the
  * fundamental structural building block that controllers deal with.
@@ -123,6 +134,9 @@ struct cgroup_subsys_state {
 	 */
 	u64 serial_nr;
 
+	/* all cgroup_files associated with this css */
+	struct list_head files;
+
 	/* percpu_ref killing and RCU release */
 	struct rcu_head rcu_head;
 	struct work_struct destroy_work;
@@ -226,8 +240,8 @@ struct cgroup {
 	int populated_cnt;
 
 	struct kernfs_node *kn;		/* cgroup kernfs entry */
-	struct kernfs_node *procs_kn;	/* kn for "cgroup.procs" */
-	struct kernfs_node *events_kn;	/* kn for "cgroup.events" */
+	struct cgroup_file procs_file;	/* handle for "cgroup.procs" */
+	struct cgroup_file events_file;	/* handle for "cgroup.events" */
 
 	/*
 	 * The bitmask of subsystems enabled on the child cgroups.
@@ -335,6 +349,14 @@ struct cftype {
 	/* CFTYPE_* flags */
 	unsigned int flags;
 
+	/*
+	 * If non-zero, should contain the offset from the start of css to
+	 * a struct cgroup_file field.  cgroup will record the handle of
+	 * the created file into it.  The recorded handle can be used as
+	 * long as the containing css remains accessible.
+	 */
+	unsigned int file_offset;
+
 	/*
 	 * Fields used for internal bookkeeping.  Initialized automatically
 	 * during registration.
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 355bf2ed8867..fb717f2cba5b 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -490,6 +490,19 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
 	pr_cont_kernfs_path(cgrp->kn);
 }
 
+/**
+ * cgroup_file_notify - generate a file modified event for a cgroup_file
+ * @cfile: target cgroup_file
+ *
+ * @cfile must have been obtained by setting cftype->file_offset.
+ */
+static inline void cgroup_file_notify(struct cgroup_file *cfile)
+{
+	/* might not have been created due to one of the CFTYPE selector flags */
+	if (cfile->kn)
+		kernfs_notify(cfile->kn);
+}
+
 #else /* !CONFIG_CGROUPS */
 
 struct cgroup_subsys_state;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 06c9d1aeea9d..0be276ffe08a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -612,8 +612,8 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 		if (!trigger)
 			break;
 
-		if (cgrp->events_kn)
-			kernfs_notify(cgrp->events_kn);
+		cgroup_file_notify(&cgrp->events_file);
+
 		cgrp = cgroup_parent(cgrp);
 	} while (cgrp);
 }
@@ -1771,6 +1771,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 
 	INIT_LIST_HEAD(&cgrp->self.sibling);
 	INIT_LIST_HEAD(&cgrp->self.children);
+	INIT_LIST_HEAD(&cgrp->self.files);
 	INIT_LIST_HEAD(&cgrp->cset_links);
 	INIT_LIST_HEAD(&cgrp->pidlists);
 	mutex_init(&cgrp->pidlist_mutex);
@@ -2562,7 +2563,7 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 			cgrp = cgroup_parent(cgrp);
 
 		ret = -ENOMEM;
-		inode = kernfs_get_inode(sb, cgrp->procs_kn);
+		inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
 		if (inode) {
 			ret = inode_permission(inode, MAY_WRITE);
 			iput(inode);
@@ -3253,10 +3254,14 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp,
 		return ret;
 	}
 
-	if (cft->write == cgroup_procs_write)
-		cgrp->procs_kn = kn;
-	else if (cft->seq_show == cgroup_events_show)
-		cgrp->events_kn = kn;
+	if (cft->file_offset) {
+		struct cgroup_file *cfile = (void *)css + cft->file_offset;
+
+		kernfs_get(kn);
+		cfile->kn = kn;
+		list_add(&cfile->node, &css->files);
+	}
+
 	return 0;
 }
 
@@ -4408,6 +4413,7 @@ static int cgroup_clone_children_write(struct cgroup_subsys_state *css,
 static struct cftype cgroup_dfl_base_files[] = {
 	{
 		.name = "cgroup.procs",
+		.file_offset = offsetof(struct cgroup, procs_file),
 		.seq_start = cgroup_pidlist_start,
 		.seq_next = cgroup_pidlist_next,
 		.seq_stop = cgroup_pidlist_stop,
@@ -4433,6 +4439,7 @@ static struct cftype cgroup_dfl_base_files[] = {
 	{
 		.name = "cgroup.events",
 		.flags = CFTYPE_NOT_ON_ROOT,
+		.file_offset = offsetof(struct cgroup, events_file),
 		.seq_show = cgroup_events_show,
 	},
 	{ }	/* terminate */
@@ -4511,9 +4518,13 @@ static void css_free_work_fn(struct work_struct *work)
 		container_of(work, struct cgroup_subsys_state, destroy_work);
 	struct cgroup_subsys *ss = css->ss;
 	struct cgroup *cgrp = css->cgroup;
+	struct cgroup_file *cfile;
 
 	percpu_ref_exit(&css->refcnt);
 
+	list_for_each_entry(cfile, &css->files, node)
+		kernfs_put(cfile->kn);
+
 	if (ss) {
 		/* css free path */
 		int id = css->id;
@@ -4618,6 +4629,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
 	css->ss = ss;
 	INIT_LIST_HEAD(&css->sibling);
 	INIT_LIST_HEAD(&css->children);
+	INIT_LIST_HEAD(&css->files);
 	css->serial_nr = css_serial_nr_next++;
 
 	if (cgroup_parent(cgrp)) {

From 472912a2b5e2027efd58aa47f78acb2373675187 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 18 Sep 2015 18:01:59 -0400
Subject: [PATCH 14/38] memcg: generate file modified notifications on
 "memory.events"

cgroup core only recently grew generic notification support.  Wire up
"memory.events" so that it triggers a file modified event whenever its
content changes.

v2: Refreshed on top of mem_cgroup relocation.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Li Zefan <lizefan@huawei.com>
---
 include/linux/memcontrol.h | 4 ++++
 mm/memcontrol.c            | 1 +
 2 files changed, 5 insertions(+)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 9aa7820c2177..c83c699a6605 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -213,6 +213,9 @@ struct mem_cgroup {
 	/* OOM-Killer disable */
 	int		oom_kill_disable;
 
+	/* handle for "memory.events" */
+	struct cgroup_file events_file;
+
 	/* protect arrays of thresholds */
 	struct mutex thresholds_lock;
 
@@ -286,6 +289,7 @@ static inline void mem_cgroup_events(struct mem_cgroup *memcg,
 		       unsigned int nr)
 {
 	this_cpu_add(memcg->stat->events[idx], nr);
+	cgroup_file_notify(&memcg->events_file);
 }
 
 bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e672f2689326..9f331402e502 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5202,6 +5202,7 @@ static struct cftype memory_files[] = {
 	{
 		.name = "events",
 		.flags = CFTYPE_NOT_ON_ROOT,
+		.file_offset = offsetof(struct mem_cgroup, events_file),
 		.seq_show = memory_events_show,
 	},
 	{ }	/* terminate */

From 3df9ca0a2b8b50db5a079ae9d97c5b55435e9a6c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 11 Sep 2015 15:00:18 -0400
Subject: [PATCH 15/38] cpuset: migrate memory only for threadgroup leaders

If memory_migrate flag is set, cpuset migrates memory according to the
destnation css's nodemask.  The current implementation migrates memory
whenever any thread of a process is migrated making the behavior
somewhat arbitrary.  Let's tie memory operations to the threadgroup
leader so that memory is migrated only when the leader is migrated.

While this is a behavior change, given the inherent fuziness, this
change is not too likely to be noticed and allows us to clearly define
who owns the memory (always the leader) and helps the planned atomic
multi-process migration.

Note that we're currently migrating memory in migration path proper
while holding all the locks.  In the long term, this should be moved
out to an async work item.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Zefan Li <lizefan@huawei.com>
---
 kernel/cpuset.c | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 312961ef3ccc..0b361a0b58f6 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1487,7 +1487,6 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
 {
 	/* static buf protected by cpuset_mutex */
 	static nodemask_t cpuset_attach_nodemask_to;
-	struct mm_struct *mm;
 	struct task_struct *task;
 	struct task_struct *leader = cgroup_taskset_first(tset);
 	struct cpuset *cs = css_cs(css);
@@ -1515,26 +1514,31 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
 	}
 
 	/*
-	 * Change mm, possibly for multiple threads in a threadgroup. This is
-	 * expensive and may sleep.
+	 * Change mm, possibly for multiple threads in a threadgroup. This
+	 * is expensive and may sleep and should be moved outside migration
+	 * path proper.
 	 */
 	cpuset_attach_nodemask_to = cs->effective_mems;
-	mm = get_task_mm(leader);
-	if (mm) {
-		mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
+	if (thread_group_leader(leader)) {
+		struct mm_struct *mm = get_task_mm(leader);
 
-		/*
-		 * old_mems_allowed is the same with mems_allowed here, except
-		 * if this task is being moved automatically due to hotplug.
-		 * In that case @mems_allowed has been updated and is empty,
-		 * so @old_mems_allowed is the right nodesets that we migrate
-		 * mm from.
-		 */
-		if (is_memory_migrate(cs)) {
-			cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
-					  &cpuset_attach_nodemask_to);
+		if (mm) {
+			mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
+
+			/*
+			 * old_mems_allowed is the same with mems_allowed
+			 * here, except if this task is being moved
+			 * automatically due to hotplug.  In that case
+			 * @mems_allowed has been updated and is empty, so
+			 * @old_mems_allowed is the right nodesets that we
+			 * migrate mm from.
+			 */
+			if (is_memory_migrate(cs)) {
+				cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
+						  &cpuset_attach_nodemask_to);
+			}
+			mmput(mm);
 		}
-		mmput(mm);
 	}
 
 	cs->old_mems_allowed = cpuset_attach_nodemask_to;

From 4530eddb59494b89650d6bcd980fc7f7717ad80c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 11 Sep 2015 15:00:19 -0400
Subject: [PATCH 16/38] cgroup, memcg, cpuset: implement
 cgroup_taskset_for_each_leader()

It wasn't explicitly documented but, when a process is being migrated,
cpuset and memcg depend on cgroup_taskset_first() returning the
threadgroup leader; however, this approach is somewhat ghetto and
would no longer work for the planned multi-process migration.

This patch introduces explicit cgroup_taskset_for_each_leader() which
iterates over only the threadgroup leaders and replaces
cgroup_taskset_first() usages for accessing the leader with it.

This prepares both memcg and cpuset for multi-process migration.  This
patch also updates the documentation for cgroup_taskset_for_each() to
clarify the iteration rules and removes comments mentioning task
ordering in tasksets.

v2: A previous patch which added threadgroup leader test was dropped.
    Patch updated accordingly.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Zefan Li <lizefan@huawei.com>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---
 include/linux/cgroup.h | 22 ++++++++++++++++++++++
 kernel/cgroup.c        | 11 -----------
 kernel/cpuset.c        |  9 ++++-----
 mm/memcontrol.c        | 17 +++++++++++++++--
 4 files changed, 41 insertions(+), 18 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index fb717f2cba5b..e9c3eac074e2 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -232,11 +232,33 @@ void css_task_iter_end(struct css_task_iter *it);
  * cgroup_taskset_for_each - iterate cgroup_taskset
  * @task: the loop cursor
  * @tset: taskset to iterate
+ *
+ * @tset may contain multiple tasks and they may belong to multiple
+ * processes.  When there are multiple tasks in @tset, if a task of a
+ * process is in @tset, all tasks of the process are in @tset.  Also, all
+ * are guaranteed to share the same source and destination csses.
+ *
+ * Iteration is not in any specific order.
  */
 #define cgroup_taskset_for_each(task, tset)				\
 	for ((task) = cgroup_taskset_first((tset)); (task);		\
 	     (task) = cgroup_taskset_next((tset)))
 
+/**
+ * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
+ * @leader: the loop cursor
+ * @tset: takset to iterate
+ *
+ * Iterate threadgroup leaders of @tset.  For single-task migrations, @tset
+ * may not contain any.
+ */
+#define cgroup_taskset_for_each_leader(leader, tset)			\
+	for ((leader) = cgroup_taskset_first((tset)); (leader);		\
+	     (leader) = cgroup_taskset_next((tset)))			\
+		if ((leader) != (leader)->group_leader)			\
+			;						\
+		else
+
 /*
  * Inline functions.
  */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0be276ffe08a..7f4b85af03dc 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2217,13 +2217,6 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
 
 	get_css_set(new_cset);
 	rcu_assign_pointer(tsk->cgroups, new_cset);
-
-	/*
-	 * Use move_tail so that cgroup_taskset_first() still returns the
-	 * leader after migration.  This works because cgroup_migrate()
-	 * ensures that the dst_cset of the leader is the first on the
-	 * tset's dst_csets list.
-	 */
 	list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
 
 	/*
@@ -2419,10 +2412,6 @@ static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
 		if (!cset->mg_src_cgrp)
 			goto next;
 
-		/*
-		 * cgroup_taskset_first() must always return the leader.
-		 * Take care to avoid disturbing the ordering.
-		 */
 		list_move_tail(&task->cg_list, &cset->mg_tasks);
 		if (list_empty(&cset->mg_node))
 			list_add_tail(&cset->mg_node, &tset.src_csets);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 0b361a0b58f6..e4d999929903 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1488,7 +1488,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
 	/* static buf protected by cpuset_mutex */
 	static nodemask_t cpuset_attach_nodemask_to;
 	struct task_struct *task;
-	struct task_struct *leader = cgroup_taskset_first(tset);
+	struct task_struct *leader;
 	struct cpuset *cs = css_cs(css);
 	struct cpuset *oldcs = cpuset_attach_old_cs;
 
@@ -1514,12 +1514,11 @@ static void cpuset_attach(struct cgroup_subsys_state *css,
 	}
 
 	/*
-	 * Change mm, possibly for multiple threads in a threadgroup. This
-	 * is expensive and may sleep and should be moved outside migration
-	 * path proper.
+	 * Change mm for all threadgroup leaders. This is expensive and may
+	 * sleep and should be moved outside migration path proper.
 	 */
 	cpuset_attach_nodemask_to = cs->effective_mems;
-	if (thread_group_leader(leader)) {
+	cgroup_taskset_for_each_leader(leader, tset) {
 		struct mm_struct *mm = get_task_mm(leader);
 
 		if (mm) {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9f331402e502..33c8dad6830f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4828,7 +4828,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 	struct mem_cgroup *from;
-	struct task_struct *p;
+	struct task_struct *leader, *p;
 	struct mm_struct *mm;
 	unsigned long move_flags;
 	int ret = 0;
@@ -4842,7 +4842,20 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
 	if (!move_flags)
 		return 0;
 
-	p = cgroup_taskset_first(tset);
+	/*
+	 * Multi-process migrations only happen on the default hierarchy
+	 * where charge immigration is not used.  Perform charge
+	 * immigration if @tset contains a leader and whine if there are
+	 * multiple.
+	 */
+	p = NULL;
+	cgroup_taskset_for_each_leader(leader, tset) {
+		WARN_ON_ONCE(p);
+		p = leader;
+	}
+	if (!p)
+		return 0;
+
 	from = mem_cgroup_from_task(p);
 
 	VM_BUG_ON(from == memcg);

From 9af2ec45c2d323d5ce35b431bb150c0eb567f19a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 11 Sep 2015 15:00:20 -0400
Subject: [PATCH 17/38] cgroup: reorder cgroup_migrate()'s parameters

cgroup_migrate() has the destination cgroup as the first parameter
while cgroup_task_migrate() has the destination cset as the last.
Another migration function is scheduled to be added which can make the
discrepancy further stand out.  Let's reorder cgroup_migrate()'s
parameters so that the destination cgroup is the last.

This doesn't cause any functional difference.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Zefan Li <lizefan@huawei.com>
---
 kernel/cgroup.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7f4b85af03dc..dc079560dd53 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2362,9 +2362,9 @@ err:
 
 /**
  * cgroup_migrate - migrate a process or task to a cgroup
- * @cgrp: the destination cgroup
  * @leader: the leader of the process or the task to migrate
  * @threadgroup: whether @leader points to the whole process or a single task
+ * @cgrp: the destination cgroup
  *
  * Migrate a process or task denoted by @leader to @cgrp.  If migrating a
  * process, the caller must be holding cgroup_threadgroup_rwsem.  The
@@ -2378,8 +2378,8 @@ err:
  * decided for all targets by invoking group_migrate_prepare_dst() before
  * actually starting migrating.
  */
-static int cgroup_migrate(struct cgroup *cgrp, struct task_struct *leader,
-			  bool threadgroup)
+static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
+			  struct cgroup *cgrp)
 {
 	struct cgroup_taskset tset = {
 		.src_csets	= LIST_HEAD_INIT(tset.src_csets),
@@ -2516,7 +2516,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 	/* prepare dst csets and commit */
 	ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
 	if (!ret)
-		ret = cgroup_migrate(dst_cgrp, leader, threadgroup);
+		ret = cgroup_migrate(leader, threadgroup, dst_cgrp);
 
 	cgroup_migrate_finish(&preloaded_csets);
 	return ret;
@@ -2823,7 +2823,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 				goto out_finish;
 			last_task = task;
 
-			ret = cgroup_migrate(src_cset->dfl_cgrp, task, true);
+			ret = cgroup_migrate(task, true, src_cset->dfl_cgrp);
 
 			put_task_struct(task);
 
@@ -3905,7 +3905,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 		css_task_iter_end(&it);
 
 		if (task) {
-			ret = cgroup_migrate(to, task, false);
+			ret = cgroup_migrate(task, false, to);
 			put_task_struct(task);
 		}
 	} while (task && !ret);

From adaae5dcf8920375a2fdc6268f762a0b7b331c55 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 11 Sep 2015 15:00:21 -0400
Subject: [PATCH 18/38] cgroup: separate out taskset operations from
 cgroup_migrate()

Currently, cgroup_migreate() implements large part of the migration
logic inline including building the target taskset and actually
migrating them.  This patch separates out the following taskset
operations.

 CGROUP_TASKSET_INIT()		: taskset initializer
 cgroup_taskset_add()		: add a task to a taskset
 cgroup_taskset_migrate()	: migrate a taskset to the destination cgroup

This will be used to implement atomic multi-process migration in
cgroup_update_dfl_csses().  This is pure reorganization which doesn't
introduce any functional changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Zefan Li <lizefan@huawei.com>
---
 kernel/cgroup.c | 211 ++++++++++++++++++++++++++++--------------------
 1 file changed, 125 insertions(+), 86 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index dc079560dd53..f24d3cedd809 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2144,6 +2144,49 @@ struct cgroup_taskset {
 	struct task_struct	*cur_task;
 };
 
+#define CGROUP_TASKSET_INIT(tset)	(struct cgroup_taskset){	\
+	.src_csets		= LIST_HEAD_INIT(tset.src_csets),	\
+	.dst_csets		= LIST_HEAD_INIT(tset.dst_csets),	\
+	.csets			= &tset.src_csets,			\
+}
+
+/**
+ * cgroup_taskset_add - try to add a migration target task to a taskset
+ * @task: target task
+ * @tset: target taskset
+ *
+ * Add @task, which is a migration target, to @tset.  This function becomes
+ * noop if @task doesn't need to be migrated.  @task's css_set should have
+ * been added as a migration source and @task->cg_list will be moved from
+ * the css_set's tasks list to mg_tasks one.
+ */
+static void cgroup_taskset_add(struct task_struct *task,
+			       struct cgroup_taskset *tset)
+{
+	struct css_set *cset;
+
+	lockdep_assert_held(&css_set_rwsem);
+
+	/* @task either already exited or can't exit until the end */
+	if (task->flags & PF_EXITING)
+		return;
+
+	/* leave @task alone if post_fork() hasn't linked it yet */
+	if (list_empty(&task->cg_list))
+		return;
+
+	cset = task_css_set(task);
+	if (!cset->mg_src_cgrp)
+		return;
+
+	list_move_tail(&task->cg_list, &cset->mg_tasks);
+	if (list_empty(&cset->mg_node))
+		list_add_tail(&cset->mg_node, &tset->src_csets);
+	if (list_empty(&cset->mg_dst_cset->mg_node))
+		list_move_tail(&cset->mg_dst_cset->mg_node,
+			       &tset->dst_csets);
+}
+
 /**
  * cgroup_taskset_first - reset taskset and return the first task
  * @tset: taskset of interest
@@ -2227,6 +2270,84 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
 	put_css_set_locked(old_cset);
 }
 
+/**
+ * cgroup_taskset_migrate - migrate a taskset to a cgroup
+ * @tset: taget taskset
+ * @dst_cgrp: destination cgroup
+ *
+ * Migrate tasks in @tset to @dst_cgrp.  This function fails iff one of the
+ * ->can_attach callbacks fails and guarantees that either all or none of
+ * the tasks in @tset are migrated.  @tset is consumed regardless of
+ * success.
+ */
+static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
+				  struct cgroup *dst_cgrp)
+{
+	struct cgroup_subsys_state *css, *failed_css = NULL;
+	struct task_struct *task, *tmp_task;
+	struct css_set *cset, *tmp_cset;
+	int i, ret;
+
+	/* methods shouldn't be called if no task is actually migrating */
+	if (list_empty(&tset->src_csets))
+		return 0;
+
+	/* check that we can legitimately attach to the cgroup */
+	for_each_e_css(css, i, dst_cgrp) {
+		if (css->ss->can_attach) {
+			ret = css->ss->can_attach(css, tset);
+			if (ret) {
+				failed_css = css;
+				goto out_cancel_attach;
+			}
+		}
+	}
+
+	/*
+	 * Now that we're guaranteed success, proceed to move all tasks to
+	 * the new cgroup.  There are no failure cases after here, so this
+	 * is the commit point.
+	 */
+	down_write(&css_set_rwsem);
+	list_for_each_entry(cset, &tset->src_csets, mg_node) {
+		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
+			cgroup_task_migrate(cset->mg_src_cgrp, task,
+					    cset->mg_dst_cset);
+	}
+	up_write(&css_set_rwsem);
+
+	/*
+	 * Migration is committed, all target tasks are now on dst_csets.
+	 * Nothing is sensitive to fork() after this point.  Notify
+	 * controllers that migration is complete.
+	 */
+	tset->csets = &tset->dst_csets;
+
+	for_each_e_css(css, i, dst_cgrp)
+		if (css->ss->attach)
+			css->ss->attach(css, tset);
+
+	ret = 0;
+	goto out_release_tset;
+
+out_cancel_attach:
+	for_each_e_css(css, i, dst_cgrp) {
+		if (css == failed_css)
+			break;
+		if (css->ss->cancel_attach)
+			css->ss->cancel_attach(css, tset);
+	}
+out_release_tset:
+	down_write(&css_set_rwsem);
+	list_splice_init(&tset->dst_csets, &tset->src_csets);
+	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
+		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
+		list_del_init(&cset->mg_node);
+	}
+	up_write(&css_set_rwsem);
+	return ret;
+}
+
 /**
  * cgroup_migrate_finish - cleanup after attach
  * @preloaded_csets: list of preloaded css_sets
@@ -2381,15 +2502,8 @@ err:
 static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 			  struct cgroup *cgrp)
 {
-	struct cgroup_taskset tset = {
-		.src_csets	= LIST_HEAD_INIT(tset.src_csets),
-		.dst_csets	= LIST_HEAD_INIT(tset.dst_csets),
-		.csets		= &tset.src_csets,
-	};
-	struct cgroup_subsys_state *css, *failed_css = NULL;
-	struct css_set *cset, *tmp_cset;
-	struct task_struct *task, *tmp_task;
-	int i, ret;
+	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
+	struct task_struct *task;
 
 	/*
 	 * Prevent freeing of tasks while we take a snapshot. Tasks that are
@@ -2400,89 +2514,14 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 	rcu_read_lock();
 	task = leader;
 	do {
-		/* @task either already exited or can't exit until the end */
-		if (task->flags & PF_EXITING)
-			goto next;
-
-		/* leave @task alone if post_fork() hasn't linked it yet */
-		if (list_empty(&task->cg_list))
-			goto next;
-
-		cset = task_css_set(task);
-		if (!cset->mg_src_cgrp)
-			goto next;
-
-		list_move_tail(&task->cg_list, &cset->mg_tasks);
-		if (list_empty(&cset->mg_node))
-			list_add_tail(&cset->mg_node, &tset.src_csets);
-		if (list_empty(&cset->mg_dst_cset->mg_node))
-			list_move_tail(&cset->mg_dst_cset->mg_node,
-				       &tset.dst_csets);
-	next:
+		cgroup_taskset_add(task, &tset);
 		if (!threadgroup)
 			break;
 	} while_each_thread(leader, task);
 	rcu_read_unlock();
 	up_write(&css_set_rwsem);
 
-	/* methods shouldn't be called if no task is actually migrating */
-	if (list_empty(&tset.src_csets))
-		return 0;
-
-	/* check that we can legitimately attach to the cgroup */
-	for_each_e_css(css, i, cgrp) {
-		if (css->ss->can_attach) {
-			ret = css->ss->can_attach(css, &tset);
-			if (ret) {
-				failed_css = css;
-				goto out_cancel_attach;
-			}
-		}
-	}
-
-	/*
-	 * Now that we're guaranteed success, proceed to move all tasks to
-	 * the new cgroup.  There are no failure cases after here, so this
-	 * is the commit point.
-	 */
-	down_write(&css_set_rwsem);
-	list_for_each_entry(cset, &tset.src_csets, mg_node) {
-		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
-			cgroup_task_migrate(cset->mg_src_cgrp, task,
-					    cset->mg_dst_cset);
-	}
-	up_write(&css_set_rwsem);
-
-	/*
-	 * Migration is committed, all target tasks are now on dst_csets.
-	 * Nothing is sensitive to fork() after this point.  Notify
-	 * controllers that migration is complete.
-	 */
-	tset.csets = &tset.dst_csets;
-
-	for_each_e_css(css, i, cgrp)
-		if (css->ss->attach)
-			css->ss->attach(css, &tset);
-
-	ret = 0;
-	goto out_release_tset;
-
-out_cancel_attach:
-	for_each_e_css(css, i, cgrp) {
-		if (css == failed_css)
-			break;
-		if (css->ss->cancel_attach)
-			css->ss->cancel_attach(css, &tset);
-	}
-out_release_tset:
-	down_write(&css_set_rwsem);
-	list_splice_init(&tset.dst_csets, &tset.src_csets);
-	list_for_each_entry_safe(cset, tmp_cset, &tset.src_csets, mg_node) {
-		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
-		list_del_init(&cset->mg_node);
-	}
-	up_write(&css_set_rwsem);
-	return ret;
+	return cgroup_taskset_migrate(&tset, cgrp);
 }
 
 /**

From 10265075aa3a8629b0ccdcff4d10b17bd740defe Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 11 Sep 2015 15:00:22 -0400
Subject: [PATCH 19/38] cgroup: make cgroup_update_dfl_csses() migrate all
 target processes atomically

cgroup_update_dfl_csses() is responsible for migrating processes when
controllers are enabled or disabled on the default hierarchy.  As the
css association changes for all the processes in the affected cgroups,
this involves migrating multiple processes.

Up until now, it was implemented by migrating process-by-process until
the source css_sets are empty; however, this means that if a process
fails to migrate after some succeed before it, the recovery is very
tricky.  This was considered okay as subsystems weren't allowed to
reject process migration on the default hierarchy; unfortunately,
enforcing this policy turned out to be problematic for certain types
of resources - realtime slices for now.

As such, the default hierarchy is gonna allow restricted failures
during migration and to support that this patch makes
cgroup_update_dfl_csses() migrate all target processes atomically
rather than one-by-one.  The preceding patches made subsystems ready
for multi-process migration and factored out taskset operations making
this almost trivial.  All tasks of the target processes are put in the
same taskset and the migration operations are performed once which
either fails or succeeds for all.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Zefan Li <lizefan@huawei.com>
---
 kernel/cgroup.c | 44 ++++++++------------------------------------
 1 file changed, 8 insertions(+), 36 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f24d3cedd809..f924158a1b65 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2799,6 +2799,7 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v)
 static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 {
 	LIST_HEAD(preloaded_csets);
+	struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset);
 	struct cgroup_subsys_state *css;
 	struct css_set *src_cset;
 	int ret;
@@ -2827,50 +2828,21 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 	if (ret)
 		goto out_finish;
 
+	down_write(&css_set_rwsem);
 	list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
-		struct task_struct *last_task = NULL, *task;
+		struct task_struct *task, *ntask;
 
 		/* src_csets precede dst_csets, break on the first dst_cset */
 		if (!src_cset->mg_src_cgrp)
 			break;
 
-		/*
-		 * All tasks in src_cset need to be migrated to the
-		 * matching dst_cset.  Empty it process by process.  We
-		 * walk tasks but migrate processes.  The leader might even
-		 * belong to a different cset but such src_cset would also
-		 * be among the target src_csets because the default
-		 * hierarchy enforces per-process membership.
-		 */
-		while (true) {
-			down_read(&css_set_rwsem);
-			task = list_first_entry_or_null(&src_cset->tasks,
-						struct task_struct, cg_list);
-			if (task) {
-				task = task->group_leader;
-				WARN_ON_ONCE(!task_css_set(task)->mg_src_cgrp);
-				get_task_struct(task);
-			}
-			up_read(&css_set_rwsem);
-
-			if (!task)
-				break;
-
-			/* guard against possible infinite loop */
-			if (WARN(last_task == task,
-				 "cgroup: update_dfl_csses failed to make progress, aborting in inconsistent state\n"))
-				goto out_finish;
-			last_task = task;
-
-			ret = cgroup_migrate(task, true, src_cset->dfl_cgrp);
-
-			put_task_struct(task);
-
-			if (WARN(ret, "cgroup: failed to update controllers for the default hierarchy (%d), further operations may crash or hang\n", ret))
-				goto out_finish;
-		}
+		/* all tasks in src_csets need to be migrated */
+		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
+			cgroup_taskset_add(task, &tset);
 	}
+	up_write(&css_set_rwsem);
 
+	ret = cgroup_taskset_migrate(&tset, cgrp);
 out_finish:
 	cgroup_migrate_finish(&preloaded_csets);
 	percpu_up_write(&cgroup_threadgroup_rwsem);

From a3e72739b7a7ea225dd11c4096f97123f6427d87 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 25 Sep 2015 16:24:27 -0400
Subject: [PATCH 20/38] cgroup: fix too early usage of static_branch_disable()

49d1dc4b8179 ("cgroup: implement static_key based
cgroup_subsys_enabled() and cgroup_subsys_on_dfl()") converted cgroup
enabled test to use static_key; however, cgroup_disable() is called
before static_key subsystem itself is initialized and thus leads to
the following warning when "cgroup_disable=" parameter is specified.

 WARNING: CPU: 0 PID: 0 at kernel/jump_label.c:99 static_key_slow_dec+0x44/0x60()
 static_key_slow_dec used before call to jump_label_init
 ...
 Call Trace:
  [<ffffffff813b18c2>] dump_stack+0x44/0x62
  [<ffffffff8108dd52>] warn_slowpath_common+0x82/0xc0
  [<ffffffff8108ddec>] warn_slowpath_fmt+0x5c/0x80
  [<ffffffff8119c054>] static_key_slow_dec+0x44/0x60
  [<ffffffff81d826b6>] cgroup_disable+0xaf/0xd6
  [<ffffffff81d5f9de>] unknown_bootoption+0x8c/0x194
  [<ffffffff810b0c03>] parse_args+0x273/0x4a0
  [<ffffffff81d5fd67>] start_kernel+0x205/0x4b8
 ...

Fix it by making cgroup_disable() to record the subsystems to disable
in cgroup_disable_mask and moving the actual application to
cgroup_init() which is late enough and where the enabled state is
first used.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Andrey Wagin <avagin@gmail.com>
Link: http://lkml.kernel.org/g/CANaxB-yFuS4SA2znSvcKrO9L_CbHciHYW+o9bN8sZJ8eR9FxYA@mail.gmail.com
Fixes: 49d1dc4b81797f88270832b11e9f73809e7e7209
---
 kernel/cgroup.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f924158a1b65..ae23814b0c6e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5124,6 +5124,8 @@ int __init cgroup_init_early(void)
 	return 0;
 }
 
+static unsigned long cgroup_disable_mask __initdata;
+
 /**
  * cgroup_init - cgroup initialization
  *
@@ -5170,8 +5172,12 @@ int __init cgroup_init(void)
 		 * disabled flag and cftype registration needs kmalloc,
 		 * both of which aren't available during early_init.
 		 */
-		if (!cgroup_ssid_enabled(ssid))
+		if (cgroup_disable_mask & (1 << ssid)) {
+			static_branch_disable(cgroup_subsys_enabled_key[ssid]);
+			printk(KERN_INFO "Disabling %s control group subsystem\n",
+			       ss->name);
 			continue;
+		}
 
 		cgrp_dfl_root.subsys_mask |= 1 << ss->id;
 
@@ -5595,11 +5601,7 @@ static int __init cgroup_disable(char *str)
 			if (strcmp(token, ss->name) &&
 			    strcmp(token, ss->legacy_name))
 				continue;
-
-			static_branch_disable(cgroup_subsys_enabled_key[i]);
-			printk(KERN_INFO "Disabling %s control group subsystem\n",
-			       ss->name);
-			break;
+			cgroup_disable_mask |= 1 << i;
 		}
 	}
 	return 1;

From b309e5b743a999cdb34a3989d1800c608e656c2b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 15 Oct 2015 16:41:49 -0400
Subject: [PATCH 21/38] cgroup: remove an unused parameter from
 cgroup_task_migrate()

cgroup_task_migrate() no longer uses @old_cgrp.  Remove it.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ae23814b0c6e..49f30f1b7e0a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2235,14 +2235,12 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
 
 /**
  * cgroup_task_migrate - move a task from one cgroup to another.
- * @old_cgrp: the cgroup @tsk is being migrated from
  * @tsk: the task being migrated
  * @new_cset: the new css_set @tsk is being attached to
  *
  * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
  */
-static void cgroup_task_migrate(struct cgroup *old_cgrp,
-				struct task_struct *tsk,
+static void cgroup_task_migrate(struct task_struct *tsk,
 				struct css_set *new_cset)
 {
 	struct css_set *old_cset;
@@ -2311,8 +2309,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
 	down_write(&css_set_rwsem);
 	list_for_each_entry(cset, &tset->src_csets, mg_node) {
 		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
-			cgroup_task_migrate(cset->mg_src_cgrp, task,
-					    cset->mg_dst_cset);
+			cgroup_task_migrate(task, cset->mg_dst_cset);
 	}
 	up_write(&css_set_rwsem);
 

From 0de0942db2b36dd91c088a7950398d2e87f23b23 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 15 Oct 2015 16:41:49 -0400
Subject: [PATCH 22/38] cgroup: make cgroup->nr_populated count the number of
 populated css_sets

Currently, cgroup->nr_populated counts whether the cgroup has any
css_sets linked to it and the number of children which has non-zero
->nr_populated.  This works because a css_set's refcnt converges with
the number of tasks linked to it and thus there's no css_set linked to
a cgroup if it doesn't have any live tasks.

To help tracking resource usage of zombie tasks, putting the ref of
css_set will be separated from disassociating the task from the
css_set which means that a cgroup may have css_sets linked to it even
when it doesn't have any live tasks.

This patch updates cgroup->nr_populated so that for the cgroup itself
it counts the number of css_sets which have tasks associated with them
so that empty css_sets don't skew the populated test.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h |  8 ++---
 kernel/cgroup.c             | 65 +++++++++++++++++++++++++++++--------
 2 files changed, 56 insertions(+), 17 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index df589a097539..17444505c870 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -232,10 +232,10 @@ struct cgroup {
 	int id;
 
 	/*
-	 * If this cgroup contains any tasks, it contributes one to
-	 * populated_cnt.  All children with non-zero popuplated_cnt of
-	 * their own contribute one.  The count is zero iff there's no task
-	 * in this cgroup or its subtree.
+	 * Each non-empty css_set associated with this cgroup contributes
+	 * one to populated_cnt.  All children with non-zero popuplated_cnt
+	 * of their own contribute one.  The count is zero iff there's no
+	 * task in this cgroup or its subtree.
 	 */
 	int populated_cnt;
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 49f30f1b7e0a..e5231d045eb9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -581,15 +581,26 @@ struct css_set init_css_set = {
 
 static int css_set_count	= 1;	/* 1 for init_css_set */
 
+/**
+ * css_set_populated - does a css_set contain any tasks?
+ * @cset: target css_set
+ */
+static bool css_set_populated(struct css_set *cset)
+{
+	lockdep_assert_held(&css_set_rwsem);
+
+	return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
+}
+
 /**
  * cgroup_update_populated - updated populated count of a cgroup
  * @cgrp: the target cgroup
  * @populated: inc or dec populated count
  *
- * @cgrp is either getting the first task (css_set) or losing the last.
- * Update @cgrp->populated_cnt accordingly.  The count is propagated
- * towards root so that a given cgroup's populated_cnt is zero iff the
- * cgroup and all its descendants are empty.
+ * One of the css_sets associated with @cgrp is either getting its first
+ * task or losing the last.  Update @cgrp->populated_cnt accordingly.  The
+ * count is propagated towards root so that a given cgroup's populated_cnt
+ * is zero iff the cgroup and all its descendants don't contain any tasks.
  *
  * @cgrp's interface file "cgroup.populated" is zero if
  * @cgrp->populated_cnt is zero and 1 otherwise.  When @cgrp->populated_cnt
@@ -618,6 +629,24 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 	} while (cgrp);
 }
 
+/**
+ * css_set_update_populated - update populated state of a css_set
+ * @cset: target css_set
+ * @populated: whether @cset is populated or depopulated
+ *
+ * @cset is either getting the first task or losing the last.  Update the
+ * ->populated_cnt of all associated cgroups accordingly.
+ */
+static void css_set_update_populated(struct css_set *cset, bool populated)
+{
+	struct cgrp_cset_link *link;
+
+	lockdep_assert_held(&css_set_rwsem);
+
+	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
+		cgroup_update_populated(link->cgrp, populated);
+}
+
 /*
  * hash table for cgroup groups. This improves the performance to find
  * an existing css_set. This hash doesn't (currently) take into
@@ -663,10 +692,8 @@ static void put_css_set_locked(struct css_set *cset)
 		list_del(&link->cgrp_link);
 
 		/* @cgrp can't go away while we're holding css_set_rwsem */
-		if (list_empty(&cgrp->cset_links)) {
-			cgroup_update_populated(cgrp, false);
+		if (list_empty(&cgrp->cset_links))
 			check_for_release(cgrp);
-		}
 
 		kfree(link);
 	}
@@ -875,8 +902,6 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
 	link->cset = cset;
 	link->cgrp = cgrp;
 
-	if (list_empty(&cgrp->cset_links))
-		cgroup_update_populated(cgrp, true);
 	list_move(&link->cset_link, &cgrp->cset_links);
 
 	/*
@@ -1754,6 +1779,8 @@ static void cgroup_enable_task_cg_lists(void)
 		if (!(p->flags & PF_EXITING)) {
 			struct css_set *cset = task_css_set(p);
 
+			if (!css_set_populated(cset))
+				css_set_update_populated(cset, true);
 			list_add(&p->cg_list, &cset->tasks);
 			get_css_set(cset);
 		}
@@ -1868,8 +1895,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 	 * objects.
 	 */
 	down_write(&css_set_rwsem);
-	hash_for_each(css_set_table, i, cset, hlist)
+	hash_for_each(css_set_table, i, cset, hlist) {
 		link_css_set(&tmp_links, cset, root_cgrp);
+		if (css_set_populated(cset))
+			cgroup_update_populated(root_cgrp, true);
+	}
 	up_write(&css_set_rwsem);
 
 	BUG_ON(!list_empty(&root_cgrp->self.children));
@@ -2256,10 +2286,16 @@ static void cgroup_task_migrate(struct task_struct *tsk,
 	WARN_ON_ONCE(tsk->flags & PF_EXITING);
 	old_cset = task_css_set(tsk);
 
+	if (!css_set_populated(new_cset))
+		css_set_update_populated(new_cset, true);
+
 	get_css_set(new_cset);
 	rcu_assign_pointer(tsk->cgroups, new_cset);
 	list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
 
+	if (!css_set_populated(old_cset))
+		css_set_update_populated(old_cset, false);
+
 	/*
 	 * We just gained a reference on old_cset by taking it from the
 	 * task. As trading it for new_cset is protected by cgroup_mutex,
@@ -3774,7 +3810,7 @@ static void css_advance_task_iter(struct css_task_iter *it)
 			link = list_entry(l, struct cgrp_cset_link, cset_link);
 			cset = link->cset;
 		}
-	} while (list_empty(&cset->tasks) && list_empty(&cset->mg_tasks));
+	} while (!css_set_populated(cset));
 
 	it->cset_pos = l;
 
@@ -5492,17 +5528,20 @@ void cgroup_exit(struct task_struct *tsk)
 
 	/*
 	 * Unlink from @tsk from its css_set.  As migration path can't race
-	 * with us, we can check cg_list without grabbing css_set_rwsem.
+	 * with us, we can check css_set and cg_list without synchronization.
 	 */
+	cset = task_css_set(tsk);
+
 	if (!list_empty(&tsk->cg_list)) {
 		down_write(&css_set_rwsem);
 		list_del_init(&tsk->cg_list);
+		if (!css_set_populated(cset))
+			css_set_update_populated(cset, false);
 		up_write(&css_set_rwsem);
 		put_cset = true;
 	}
 
 	/* Reassign the task to the init_css_set. */
-	cset = task_css_set(tsk);
 	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
 
 	/* see cgroup_post_fork() for details */

From 27bd4dbb8d51c476298e62bd088225317b7853de Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 15 Oct 2015 16:41:50 -0400
Subject: [PATCH 23/38] cgroup: replace cgroup_has_tasks() with
 cgroup_is_populated()

Currently, cgroup_has_tasks() tests whether the target cgroup has any
css_set linked to it.  This works because a css_set's refcnt converges
with the number of tasks linked to it and thus there's no css_set
linked to a cgroup if it doesn't have any live tasks.

To help tracking resource usage of zombie tasks, putting the ref of
css_set will be separated from disassociating the task from the
css_set which means that a cgroup may have css_sets linked to it even
when it doesn't have any live tasks.

This patch replaces cgroup_has_tasks() with cgroup_is_populated()
which tests cgroup->nr_populated instead which locally counts the
number of populated css_sets.  Unlike cgroup_has_tasks(),
cgroup_is_populated() is recursive - if any of the descendants is
populated, the cgroup is populated too.  While this changes the
meaning of the test, all the existing users are okay with the change.

While at it, replace the open-coded ->populated_cnt test in
cgroup_events_show() with cgroup_is_populated().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
---
 include/linux/cgroup.h | 4 ++--
 kernel/cgroup.c        | 6 +++---
 kernel/cpuset.c        | 2 +-
 mm/memcontrol.c        | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e9c3eac074e2..bdfdb3a1a83c 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -456,9 +456,9 @@ static inline struct cgroup *task_cgroup(struct task_struct *task,
 }
 
 /* no synchronization, the result can only be used as a hint */
-static inline bool cgroup_has_tasks(struct cgroup *cgrp)
+static inline bool cgroup_is_populated(struct cgroup *cgrp)
 {
-	return !list_empty(&cgrp->cset_links);
+	return cgrp->populated_cnt;
 }
 
 /* returns ino associated with a cgroup */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e5231d045eb9..435aa686567b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3121,7 +3121,7 @@ err_undo_css:
 static int cgroup_events_show(struct seq_file *seq, void *v)
 {
 	seq_printf(seq, "populated %d\n",
-		   (bool)seq_css(seq)->cgroup->populated_cnt);
+		   cgroup_is_populated(seq_css(seq)->cgroup));
 	return 0;
 }
 
@@ -5558,7 +5558,7 @@ void cgroup_exit(struct task_struct *tsk)
 
 static void check_for_release(struct cgroup *cgrp)
 {
-	if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
+	if (notify_on_release(cgrp) && !cgroup_is_populated(cgrp) &&
 	    !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
 		schedule_work(&cgrp->release_agent_work);
 }
@@ -5806,7 +5806,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
 
 static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
 {
-	return (!cgroup_has_tasks(css->cgroup) &&
+	return (!cgroup_is_populated(css->cgroup) &&
 		!css_has_online_children(&css->cgroup->self));
 }
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e4d999929903..d7ccb87a6714 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -498,7 +498,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
 	 * be changed to have empty cpus_allowed or mems_allowed.
 	 */
 	ret = -ENOSPC;
-	if ((cgroup_has_tasks(cur->css.cgroup) || cur->attach_in_progress)) {
+	if ((cgroup_is_populated(cur->css.cgroup) || cur->attach_in_progress)) {
 		if (!cpumask_empty(cur->cpus_allowed) &&
 		    cpumask_empty(trial->cpus_allowed))
 			goto out;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 33c8dad6830f..0ddd0ff2b52e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2920,7 +2920,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
 	 * of course permitted.
 	 */
 	mutex_lock(&memcg_create_mutex);
-	if (cgroup_has_tasks(memcg->css.cgroup) ||
+	if (cgroup_is_populated(memcg->css.cgroup) ||
 	    (memcg->use_hierarchy && memcg_has_children(memcg)))
 		err = -EBUSY;
 	mutex_unlock(&memcg_create_mutex);

From ad2ed2b35b76f01a876230a3a632efbc81d3fcd6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 15 Oct 2015 16:41:50 -0400
Subject: [PATCH 24/38] cgroup: move check_for_release() invocation

To trigger release agent when the last task leaves the cgroup,
check_for_release() is called from put_css_set_locked(); however,
css_set being unlinked is being decoupled from task leaving the cgroup
and the correct condition to test is cgroup->nr_populated dropping to
zero which check_for_release() is already updated to test.

This patch moves check_for_release() invocation from
put_css_set_locked() to cgroup_update_populated().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 435aa686567b..855313d01e85 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -623,6 +623,7 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 		if (!trigger)
 			break;
 
+		check_for_release(cgrp);
 		cgroup_file_notify(&cgrp->events_file);
 
 		cgrp = cgroup_parent(cgrp);
@@ -686,15 +687,8 @@ static void put_css_set_locked(struct css_set *cset)
 	css_set_count--;
 
 	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
-		struct cgroup *cgrp = link->cgrp;
-
 		list_del(&link->cset_link);
 		list_del(&link->cgrp_link);
-
-		/* @cgrp can't go away while we're holding css_set_rwsem */
-		if (list_empty(&cgrp->cset_links))
-			check_for_release(cgrp);
-
 		kfree(link);
 	}
 

From 052c3f3a0b03651d94050d917ebc1df46a31c2f0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 15 Oct 2015 16:41:50 -0400
Subject: [PATCH 25/38] cgroup: relocate cgroup_[try]get/put()

Relocate cgroup_get(), cgroup_tryget() and cgroup_put() upwards.  This
is pure code reorganization to prepare for future changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 855313d01e85..ab5c9a5a0596 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -428,6 +428,22 @@ static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 	return !(cgrp->self.flags & CSS_ONLINE);
 }
 
+static void cgroup_get(struct cgroup *cgrp)
+{
+	WARN_ON_ONCE(cgroup_is_dead(cgrp));
+	css_get(&cgrp->self);
+}
+
+static bool cgroup_tryget(struct cgroup *cgrp)
+{
+	return css_tryget(&cgrp->self);
+}
+
+static void cgroup_put(struct cgroup *cgrp)
+{
+	css_put(&cgrp->self);
+}
+
 struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
 {
 	struct cgroup *cgrp = of->kn->parent->priv;
@@ -1177,22 +1193,6 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
 	return mode;
 }
 
-static void cgroup_get(struct cgroup *cgrp)
-{
-	WARN_ON_ONCE(cgroup_is_dead(cgrp));
-	css_get(&cgrp->self);
-}
-
-static bool cgroup_tryget(struct cgroup *cgrp)
-{
-	return css_tryget(&cgrp->self);
-}
-
-static void cgroup_put(struct cgroup *cgrp)
-{
-	css_put(&cgrp->self);
-}
-
 /**
  * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
  * @cgrp: the target cgroup

From 2ceb231b0ab0e3d700c5f7c839273bfeecbefe3b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 15 Oct 2015 16:41:51 -0400
Subject: [PATCH 26/38] cgroup: make css_sets pin the associated cgroups

Currently, css_sets don't pin the associated cgroups.  This is okay as
a cgroup with css_sets associated are not allowed to be removed;
however, to help resource tracking for zombie tasks, this is scheduled
to change such that a cgroup can be removed even when it has css_sets
associated as long as none of them are populated.

To ensure that a cgroup doesn't go away while css_sets are still
associated with it, make each associated css_set hold a reference on
the cgroup if non-root.

v2: Root cgroups are special and shouldn't be ref'd by css_sets.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ab5c9a5a0596..ea87340af871 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -705,6 +705,8 @@ static void put_css_set_locked(struct css_set *cset)
 	list_for_each_entry_safe(link, tmp_link, &cset->cgrp_links, cgrp_link) {
 		list_del(&link->cset_link);
 		list_del(&link->cgrp_link);
+		if (cgroup_parent(link->cgrp))
+			cgroup_put(link->cgrp);
 		kfree(link);
 	}
 
@@ -919,6 +921,9 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
 	 * is sorted by order of hierarchy creation
 	 */
 	list_add_tail(&link->cgrp_link, &cset->cgrp_links);
+
+	if (cgroup_parent(cgrp))
+		cgroup_get(cgrp);
 }
 
 /**
@@ -4998,10 +5003,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	/*
-	 * css_set_rwsem synchronizes access to ->cset_links and prevents
-	 * @cgrp from being removed while put_css_set() is in progress.
-	 */
+	/* css_set_rwsem synchronizes access to ->cset_links */
 	down_read(&css_set_rwsem);
 	empty = list_empty(&cgrp->cset_links);
 	up_read(&css_set_rwsem);

From 91486f61f486662c27ef86dd910f875832e3a5de Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 15 Oct 2015 16:41:51 -0400
Subject: [PATCH 27/38] cgroup: make cgroup_destroy_locked() test
 cgroup_is_populated()

cgroup_destroy_locked() currently tests whether any css_sets are
associated to reject removal if the cgroup contains tasks.  This works
because a css_set's refcnt converges with the number of tasks linked
to it and thus there's no css_set linked to a cgroup if it doesn't
have any live tasks.

To help tracking resource usage of zombie tasks, putting the ref of
css_set will be separated from disassociating the task from the
css_set which means that a cgroup may have css_sets linked to it even
when it doesn't have any live tasks.

This patch updates cgroup_destroy_locked() so that it tests
cgroup_is_populated(), which counts the number of populated css_sets,
instead of whether cgrp->cset_links is empty to determine whether the
cgroup is populated or not.  This ensures that rmdirs won't be
incorrectly rejected for cgroups which only contain zombie tasks.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ea87340af871..61ee85ded6ae 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4998,16 +4998,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
 	struct cgroup_subsys_state *css;
-	bool empty;
 	int ssid;
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	/* css_set_rwsem synchronizes access to ->cset_links */
-	down_read(&css_set_rwsem);
-	empty = list_empty(&cgrp->cset_links);
-	up_read(&css_set_rwsem);
-	if (!empty)
+	/*
+	 * Only migration can raise populated from zero and we're already
+	 * holding cgroup_mutex.
+	 */
+	if (cgroup_is_populated(cgrp))
 		return -EBUSY;
 
 	/*

From 389b9c1bc927c8194a49f5f0c7e069ed0ec79b9e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 15 Oct 2015 16:41:51 -0400
Subject: [PATCH 28/38] cgroup: keep css_set and task lists in chronological
 order

css task iteration will be updated to not leak cgroup internal locking
to iterator users.  In preparation, update css_set and task lists to
be in chronological order.

For tasks, as migration path is already using list_splice_tail_init(),
only cgroup_enable_task_cg_lists() and cgroup_post_fork() need
updating.  For css_sets, link_css_set() is the only place which needs
to be updated.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 61ee85ded6ae..52444673e9a2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -914,12 +914,11 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
 	link->cset = cset;
 	link->cgrp = cgrp;
 
-	list_move(&link->cset_link, &cgrp->cset_links);
-
 	/*
-	 * Always add links to the tail of the list so that the list
-	 * is sorted by order of hierarchy creation
+	 * Always add links to the tail of the lists so that the lists are
+	 * in choronological order.
 	 */
+	list_move_tail(&link->cset_link, &cgrp->cset_links);
 	list_add_tail(&link->cgrp_link, &cset->cgrp_links);
 
 	if (cgroup_parent(cgrp))
@@ -1780,7 +1779,7 @@ static void cgroup_enable_task_cg_lists(void)
 
 			if (!css_set_populated(cset))
 				css_set_update_populated(cset, true);
-			list_add(&p->cg_list, &cset->tasks);
+			list_add_tail(&p->cg_list, &cset->tasks);
 			get_css_set(cset);
 		}
 		spin_unlock_irq(&p->sighand->siglock);
@@ -5480,7 +5479,7 @@ void cgroup_post_fork(struct task_struct *child,
 		cset = task_css_set(current);
 		if (list_empty(&child->cg_list)) {
 			rcu_assign_pointer(child->cgroups, cset);
-			list_add(&child->cg_list, &cset->tasks);
+			list_add_tail(&child->cg_list, &cset->tasks);
 			get_css_set(cset);
 		}
 		up_write(&css_set_rwsem);

From f6d7d049c17a29fbc4c2723899a242d6889554aa Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 15 Oct 2015 16:41:52 -0400
Subject: [PATCH 29/38] cgroup: factor out css_set_move_task()

A task is associated and disassociated with its css_set in three
places - during migration, after a new task is created and when a task
exits.  The first is handled by cgroup_task_migrate() and the latter
two are open-coded.

These are similar operations and spreading them over multiple places
makes it harder to follow and update.  This patch collects all task
css_set [dis]association operations into css_set_move_task().

While css_set_move_task() may check whether populated state needs to
be updated when not strictly necessary, the behavior is essentially
equivalent before and after this patch.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 104 ++++++++++++++++++++++++++----------------------
 1 file changed, 56 insertions(+), 48 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 52444673e9a2..0b6e50c67b65 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -664,6 +664,52 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
 		cgroup_update_populated(link->cgrp, populated);
 }
 
+/**
+ * css_set_move_task - move a task from one css_set to another
+ * @task: task being moved
+ * @from_cset: css_set @task currently belongs to (may be NULL)
+ * @to_cset: new css_set @task is being moved to (may be NULL)
+ * @use_mg_tasks: move to @to_cset->mg_tasks instead of ->tasks
+ *
+ * Move @task from @from_cset to @to_cset.  If @task didn't belong to any
+ * css_set, @from_cset can be NULL.  If @task is being disassociated
+ * instead of moved, @to_cset can be NULL.
+ *
+ * This function automatically handles populated_cnt updates but the caller
+ * is responsible for managing @from_cset and @to_cset's reference counts.
+ */
+static void css_set_move_task(struct task_struct *task,
+			      struct css_set *from_cset, struct css_set *to_cset,
+			      bool use_mg_tasks)
+{
+	lockdep_assert_held(&css_set_rwsem);
+
+	if (from_cset) {
+		WARN_ON_ONCE(list_empty(&task->cg_list));
+		list_del_init(&task->cg_list);
+		if (!css_set_populated(from_cset))
+			css_set_update_populated(from_cset, false);
+	} else {
+		WARN_ON_ONCE(!list_empty(&task->cg_list));
+	}
+
+	if (to_cset) {
+		/*
+		 * We are synchronized through cgroup_threadgroup_rwsem
+		 * against PF_EXITING setting such that we can't race
+		 * against cgroup_exit() changing the css_set to
+		 * init_css_set and dropping the old one.
+		 */
+		WARN_ON_ONCE(task->flags & PF_EXITING);
+
+		if (!css_set_populated(to_cset))
+			css_set_update_populated(to_cset, true);
+		rcu_assign_pointer(task->cgroups, to_cset);
+		list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks :
+							     &to_cset->tasks);
+	}
+}
+
 /*
  * hash table for cgroup groups. This improves the performance to find
  * an existing css_set. This hash doesn't (currently) take into
@@ -2261,47 +2307,6 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset)
 	return NULL;
 }
 
-/**
- * cgroup_task_migrate - move a task from one cgroup to another.
- * @tsk: the task being migrated
- * @new_cset: the new css_set @tsk is being attached to
- *
- * Must be called with cgroup_mutex, threadgroup and css_set_rwsem locked.
- */
-static void cgroup_task_migrate(struct task_struct *tsk,
-				struct css_set *new_cset)
-{
-	struct css_set *old_cset;
-
-	lockdep_assert_held(&cgroup_mutex);
-	lockdep_assert_held(&css_set_rwsem);
-
-	/*
-	 * We are synchronized through cgroup_threadgroup_rwsem against
-	 * PF_EXITING setting such that we can't race against cgroup_exit()
-	 * changing the css_set to init_css_set and dropping the old one.
-	 */
-	WARN_ON_ONCE(tsk->flags & PF_EXITING);
-	old_cset = task_css_set(tsk);
-
-	if (!css_set_populated(new_cset))
-		css_set_update_populated(new_cset, true);
-
-	get_css_set(new_cset);
-	rcu_assign_pointer(tsk->cgroups, new_cset);
-	list_move_tail(&tsk->cg_list, &new_cset->mg_tasks);
-
-	if (!css_set_populated(old_cset))
-		css_set_update_populated(old_cset, false);
-
-	/*
-	 * We just gained a reference on old_cset by taking it from the
-	 * task. As trading it for new_cset is protected by cgroup_mutex,
-	 * we're safe to drop it here; it will be freed under RCU.
-	 */
-	put_css_set_locked(old_cset);
-}
-
 /**
  * cgroup_taskset_migrate - migrate a taskset to a cgroup
  * @tset: taget taskset
@@ -2342,8 +2347,14 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
 	 */
 	down_write(&css_set_rwsem);
 	list_for_each_entry(cset, &tset->src_csets, mg_node) {
-		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list)
-			cgroup_task_migrate(task, cset->mg_dst_cset);
+		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
+			struct css_set *from_cset = task_css_set(task);
+			struct css_set *to_cset = cset->mg_dst_cset;
+
+			get_css_set(to_cset);
+			css_set_move_task(task, from_cset, to_cset, true);
+			put_css_set_locked(from_cset);
+		}
 	}
 	up_write(&css_set_rwsem);
 
@@ -5478,9 +5489,8 @@ void cgroup_post_fork(struct task_struct *child,
 		down_write(&css_set_rwsem);
 		cset = task_css_set(current);
 		if (list_empty(&child->cg_list)) {
-			rcu_assign_pointer(child->cgroups, cset);
-			list_add_tail(&child->cg_list, &cset->tasks);
 			get_css_set(cset);
+			css_set_move_task(child, NULL, cset, false);
 		}
 		up_write(&css_set_rwsem);
 	}
@@ -5528,9 +5538,7 @@ void cgroup_exit(struct task_struct *tsk)
 
 	if (!list_empty(&tsk->cg_list)) {
 		down_write(&css_set_rwsem);
-		list_del_init(&tsk->cg_list);
-		if (!css_set_populated(cset))
-			css_set_update_populated(cset, false);
+		css_set_move_task(tsk, cset, NULL, false);
 		up_write(&css_set_rwsem);
 		put_cset = true;
 	}

From ecb9d535df967b3ca565535e14456f612373bf5e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 15 Oct 2015 16:41:52 -0400
Subject: [PATCH 30/38] cgroup: reorganize css_task_iter functions

* Rename css_advance_task_iter() to css_task_iter_advance_css_set()
  and make it clear it->task_pos too at the end of the iteration.

* Factor out css_task_iter_advance() from css_task_iter_next().  The
  new function whines if called on a terminated iterator.

Except for the termination check, this is pure reorganization and
doesn't introduce any behavior changes.  This will help the planned
locking update for css_task_iter.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 49 ++++++++++++++++++++++++++++---------------------
 1 file changed, 28 insertions(+), 21 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0b6e50c67b65..56e2b772b143 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3793,12 +3793,12 @@ bool css_has_online_children(struct cgroup_subsys_state *css)
 }
 
 /**
- * css_advance_task_iter - advance a task itererator to the next css_set
+ * css_task_iter_advance_css_set - advance a task itererator to the next css_set
  * @it: the iterator to advance
  *
  * Advance @it to the next css_set to walk.
  */
-static void css_advance_task_iter(struct css_task_iter *it)
+static void css_task_iter_advance_css_set(struct css_task_iter *it)
 {
 	struct list_head *l = it->cset_pos;
 	struct cgrp_cset_link *link;
@@ -3809,6 +3809,7 @@ static void css_advance_task_iter(struct css_task_iter *it)
 		l = l->next;
 		if (l == it->cset_head) {
 			it->cset_pos = NULL;
+			it->task_pos = NULL;
 			return;
 		}
 
@@ -3832,6 +3833,28 @@ static void css_advance_task_iter(struct css_task_iter *it)
 	it->mg_tasks_head = &cset->mg_tasks;
 }
 
+static void css_task_iter_advance(struct css_task_iter *it)
+{
+	struct list_head *l = it->task_pos;
+
+	WARN_ON_ONCE(!l);
+
+	/*
+	 * Advance iterator to find next entry.  cset->tasks is consumed
+	 * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
+	 * next cset.
+	 */
+	l = l->next;
+
+	if (l == it->tasks_head)
+		l = it->mg_tasks_head->next;
+
+	if (l == it->mg_tasks_head)
+		css_task_iter_advance_css_set(it);
+	else
+		it->task_pos = l;
+}
+
 /**
  * css_task_iter_start - initiate task iteration
  * @css: the css to walk tasks of
@@ -3864,7 +3887,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
 
 	it->cset_head = it->cset_pos;
 
-	css_advance_task_iter(it);
+	css_task_iter_advance_css_set(it);
 }
 
 /**
@@ -3878,28 +3901,12 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
 struct task_struct *css_task_iter_next(struct css_task_iter *it)
 {
 	struct task_struct *res;
-	struct list_head *l = it->task_pos;
 
-	/* If the iterator cg is NULL, we have no tasks */
 	if (!it->cset_pos)
 		return NULL;
-	res = list_entry(l, struct task_struct, cg_list);
-
-	/*
-	 * Advance iterator to find next entry.  cset->tasks is consumed
-	 * first and then ->mg_tasks.  After ->mg_tasks, we move onto the
-	 * next cset.
-	 */
-	l = l->next;
-
-	if (l == it->tasks_head)
-		l = it->mg_tasks_head->next;
-
-	if (l == it->mg_tasks_head)
-		css_advance_task_iter(it);
-	else
-		it->task_pos = l;
 
+	res = list_entry(it->task_pos, struct task_struct, cg_list);
+	css_task_iter_advance(it);
 	return res;
 }
 

From ed27b9f7a17ddfbc007e16d4d11f33dff4fc2de7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 15 Oct 2015 16:41:52 -0400
Subject: [PATCH 31/38] cgroup: don't hold css_set_rwsem across css task
 iteration

css_sets are synchronized through css_set_rwsem but the locking scheme
is kinda bizarre.  The hot paths - fork and exit - have to write lock
the rwsem making the rw part pointless; furthermore, many readers
already hold cgroup_mutex.

One of the readers is css task iteration.  It read locks the rwsem
over the entire duration of iteration.  This leads to silly locking
behavior.  When cpuset tries to migrate processes of a cgroup to a
different NUMA node, css_set_rwsem is held across the entire migration
attempt which can take a long time locking out forking, exiting and
other cgroup operations.

This patch updates css task iteration so that it locks css_set_rwsem
only while the iterator is being advanced.  css task iteration
involves two levels - css_set and task iteration.  As css_sets in use
are practically immutable, simply pinning the current one is enough
for resuming iteration afterwards.  Task iteration is tricky as tasks
may leave their css_set while iteration is in progress.  This is
solved by keeping track of active iterators and advancing them if
their next task leaves its css_set.

v2: put_task_struct() in css_task_iter_next() moved outside
    css_set_rwsem.  A later patch will add cgroup operations to
    task_struct free path which may grab the same lock and this avoids
    deadlock possibilities.

    css_set_move_task() updated to use list_for_each_entry_safe() when
    walking task_iters and advancing them.  This is necessary as
    advancing an iter may remove it from the list.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup-defs.h |  3 ++
 include/linux/cgroup.h      |  4 ++
 kernel/cgroup.c             | 87 +++++++++++++++++++++++++++++++------
 3 files changed, 80 insertions(+), 14 deletions(-)

diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 17444505c870..62413c3e2f4b 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -211,6 +211,9 @@ struct css_set {
 	 */
 	struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
 
+	/* all css_task_iters currently walking this cset */
+	struct list_head task_iters;
+
 	/* For RCU-protected deletion */
 	struct rcu_head rcu_head;
 };
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index bdfdb3a1a83c..a9dcf0e76865 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -42,6 +42,10 @@ struct css_task_iter {
 	struct list_head		*task_pos;
 	struct list_head		*tasks_head;
 	struct list_head		*mg_tasks_head;
+
+	struct css_set			*cur_cset;
+	struct task_struct		*cur_task;
+	struct list_head		iters_node;	/* css_set->task_iters */
 };
 
 extern struct cgroup_root cgrp_dfl_root;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 56e2b772b143..0c5b0e605c47 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -216,6 +216,7 @@ static struct cftype cgroup_legacy_base_files[];
 
 static int rebind_subsystems(struct cgroup_root *dst_root,
 			     unsigned long ss_mask);
+static void css_task_iter_advance(struct css_task_iter *it);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 		      bool visible);
@@ -593,6 +594,7 @@ struct css_set init_css_set = {
 	.mg_tasks		= LIST_HEAD_INIT(init_css_set.mg_tasks),
 	.mg_preload_node	= LIST_HEAD_INIT(init_css_set.mg_preload_node),
 	.mg_node		= LIST_HEAD_INIT(init_css_set.mg_node),
+	.task_iters		= LIST_HEAD_INIT(init_css_set.task_iters),
 };
 
 static int css_set_count	= 1;	/* 1 for init_css_set */
@@ -675,8 +677,9 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
  * css_set, @from_cset can be NULL.  If @task is being disassociated
  * instead of moved, @to_cset can be NULL.
  *
- * This function automatically handles populated_cnt updates but the caller
- * is responsible for managing @from_cset and @to_cset's reference counts.
+ * This function automatically handles populated_cnt updates and
+ * css_task_iter adjustments but the caller is responsible for managing
+ * @from_cset and @to_cset's reference counts.
  */
 static void css_set_move_task(struct task_struct *task,
 			      struct css_set *from_cset, struct css_set *to_cset,
@@ -685,7 +688,22 @@ static void css_set_move_task(struct task_struct *task,
 	lockdep_assert_held(&css_set_rwsem);
 
 	if (from_cset) {
+		struct css_task_iter *it, *pos;
+
 		WARN_ON_ONCE(list_empty(&task->cg_list));
+
+		/*
+		 * @task is leaving, advance task iterators which are
+		 * pointing to it so that they can resume at the next
+		 * position.  Advancing an iterator might remove it from
+		 * the list, use safe walk.  See css_task_iter_advance*()
+		 * for details.
+		 */
+		list_for_each_entry_safe(it, pos, &from_cset->task_iters,
+					 iters_node)
+			if (it->task_pos == &task->cg_list)
+				css_task_iter_advance(it);
+
 		list_del_init(&task->cg_list);
 		if (!css_set_populated(from_cset))
 			css_set_update_populated(from_cset, false);
@@ -1019,6 +1037,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	INIT_LIST_HEAD(&cset->mg_tasks);
 	INIT_LIST_HEAD(&cset->mg_preload_node);
 	INIT_LIST_HEAD(&cset->mg_node);
+	INIT_LIST_HEAD(&cset->task_iters);
 	INIT_HLIST_NODE(&cset->hlist);
 
 	/* Copy the set of subsystem state objects generated in
@@ -3804,6 +3823,8 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
 	struct cgrp_cset_link *link;
 	struct css_set *cset;
 
+	lockdep_assert_held(&css_set_rwsem);
+
 	/* Advance to the next non-empty css_set */
 	do {
 		l = l->next;
@@ -3831,12 +3852,36 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
 
 	it->tasks_head = &cset->tasks;
 	it->mg_tasks_head = &cset->mg_tasks;
+
+	/*
+	 * We don't keep css_sets locked across iteration steps and thus
+	 * need to take steps to ensure that iteration can be resumed after
+	 * the lock is re-acquired.  Iteration is performed at two levels -
+	 * css_sets and tasks in them.
+	 *
+	 * Once created, a css_set never leaves its cgroup lists, so a
+	 * pinned css_set is guaranteed to stay put and we can resume
+	 * iteration afterwards.
+	 *
+	 * Tasks may leave @cset across iteration steps.  This is resolved
+	 * by registering each iterator with the css_set currently being
+	 * walked and making css_set_move_task() advance iterators whose
+	 * next task is leaving.
+	 */
+	if (it->cur_cset) {
+		list_del(&it->iters_node);
+		put_css_set_locked(it->cur_cset);
+	}
+	get_css_set(cset);
+	it->cur_cset = cset;
+	list_add(&it->iters_node, &cset->task_iters);
 }
 
 static void css_task_iter_advance(struct css_task_iter *it)
 {
 	struct list_head *l = it->task_pos;
 
+	lockdep_assert_held(&css_set_rwsem);
 	WARN_ON_ONCE(!l);
 
 	/*
@@ -3864,19 +3909,16 @@ static void css_task_iter_advance(struct css_task_iter *it)
  * css_task_iter_next() to walk through the tasks until the function
  * returns NULL.  On completion of iteration, css_task_iter_end() must be
  * called.
- *
- * Note that this function acquires a lock which is released when the
- * iteration finishes.  The caller can't sleep while iteration is in
- * progress.
  */
 void css_task_iter_start(struct cgroup_subsys_state *css,
 			 struct css_task_iter *it)
-	__acquires(css_set_rwsem)
 {
 	/* no one should try to iterate before mounting cgroups */
 	WARN_ON_ONCE(!use_task_css_set_links);
 
-	down_read(&css_set_rwsem);
+	memset(it, 0, sizeof(*it));
+
+	down_write(&css_set_rwsem);
 
 	it->ss = css->ss;
 
@@ -3888,6 +3930,8 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
 	it->cset_head = it->cset_pos;
 
 	css_task_iter_advance_css_set(it);
+
+	up_write(&css_set_rwsem);
 }
 
 /**
@@ -3900,14 +3944,22 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
  */
 struct task_struct *css_task_iter_next(struct css_task_iter *it)
 {
-	struct task_struct *res;
-
 	if (!it->cset_pos)
 		return NULL;
 
-	res = list_entry(it->task_pos, struct task_struct, cg_list);
+	if (it->cur_task)
+		put_task_struct(it->cur_task);
+
+	down_write(&css_set_rwsem);
+
+	it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list);
+	get_task_struct(it->cur_task);
+
 	css_task_iter_advance(it);
-	return res;
+
+	up_write(&css_set_rwsem);
+
+	return it->cur_task;
 }
 
 /**
@@ -3917,9 +3969,16 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
  * Finish task iteration started by css_task_iter_start().
  */
 void css_task_iter_end(struct css_task_iter *it)
-	__releases(css_set_rwsem)
 {
-	up_read(&css_set_rwsem);
+	if (it->cur_cset) {
+		down_write(&css_set_rwsem);
+		list_del(&it->iters_node);
+		put_css_set_locked(it->cur_cset);
+		up_write(&css_set_rwsem);
+	}
+
+	if (it->cur_task)
+		put_task_struct(it->cur_task);
 }
 
 /**

From f0d9a5f175753a371bc7fdff0d584a8d9cd72bb0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 15 Oct 2015 16:41:53 -0400
Subject: [PATCH 32/38] cgroup: make css_set_rwsem a spinlock and rename it to
 css_set_lock

css_set_rwsem is the inner lock protecting css_sets and is accessed
from hot paths such as fork and exit.  Internally, it has no reason to
be a rwsem or even mutex.  There are no internal blocking operations
while holding it.  This was rwsem because css task iteration used to
expose it to external iterator users.  As the previous patch updated
css task iteration such that the locking is not leaked to its users,
there's no reason to keep it a rwsem.

This patch converts css_set_rwsem to a spinlock and rename it to
css_set_lock.  It uses bh-safe operations as a planned usage needs to
access it from RCU callback context.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/cgroup.h |   5 +-
 kernel/cgroup.c        | 144 ++++++++++++++++++++---------------------
 2 files changed, 74 insertions(+), 75 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index a9dcf0e76865..46020735bcbb 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -13,7 +13,6 @@
 #include <linux/nodemask.h>
 #include <linux/rculist.h>
 #include <linux/cgroupstats.h>
-#include <linux/rwsem.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
 #include <linux/kernfs.h>
@@ -367,11 +366,11 @@ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n)
  */
 #ifdef CONFIG_PROVE_RCU
 extern struct mutex cgroup_mutex;
-extern struct rw_semaphore css_set_rwsem;
+extern spinlock_t css_set_lock;
 #define task_css_set_check(task, __c)					\
 	rcu_dereference_check((task)->cgroups,				\
 		lockdep_is_held(&cgroup_mutex) ||			\
-		lockdep_is_held(&css_set_rwsem) ||			\
+		lockdep_is_held(&css_set_lock) ||			\
 		((task)->flags & PF_EXITING) || (__c))
 #else
 #define task_css_set_check(task, __c)					\
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 0c5b0e605c47..ba7b3284c2e4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -45,7 +45,6 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
-#include <linux/rwsem.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/string.h>
 #include <linux/sort.h>
@@ -76,7 +75,7 @@
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
  *
- * css_set_rwsem protects task->cgroups pointer, the list of css_set
+ * css_set_lock protects task->cgroups pointer, the list of css_set
  * objects, and the chain of tasks off each css_set.
  *
  * These locks are exported if CONFIG_PROVE_RCU so that accessors in
@@ -84,12 +83,12 @@
  */
 #ifdef CONFIG_PROVE_RCU
 DEFINE_MUTEX(cgroup_mutex);
-DECLARE_RWSEM(css_set_rwsem);
+DEFINE_SPINLOCK(css_set_lock);
 EXPORT_SYMBOL_GPL(cgroup_mutex);
-EXPORT_SYMBOL_GPL(css_set_rwsem);
+EXPORT_SYMBOL_GPL(css_set_lock);
 #else
 static DEFINE_MUTEX(cgroup_mutex);
-static DECLARE_RWSEM(css_set_rwsem);
+static DEFINE_SPINLOCK(css_set_lock);
 #endif
 
 /*
@@ -605,7 +604,7 @@ static int css_set_count	= 1;	/* 1 for init_css_set */
  */
 static bool css_set_populated(struct css_set *cset)
 {
-	lockdep_assert_held(&css_set_rwsem);
+	lockdep_assert_held(&css_set_lock);
 
 	return !list_empty(&cset->tasks) || !list_empty(&cset->mg_tasks);
 }
@@ -628,7 +627,7 @@ static bool css_set_populated(struct css_set *cset)
  */
 static void cgroup_update_populated(struct cgroup *cgrp, bool populated)
 {
-	lockdep_assert_held(&css_set_rwsem);
+	lockdep_assert_held(&css_set_lock);
 
 	do {
 		bool trigger;
@@ -660,7 +659,7 @@ static void css_set_update_populated(struct css_set *cset, bool populated)
 {
 	struct cgrp_cset_link *link;
 
-	lockdep_assert_held(&css_set_rwsem);
+	lockdep_assert_held(&css_set_lock);
 
 	list_for_each_entry(link, &cset->cgrp_links, cgrp_link)
 		cgroup_update_populated(link->cgrp, populated);
@@ -685,7 +684,7 @@ static void css_set_move_task(struct task_struct *task,
 			      struct css_set *from_cset, struct css_set *to_cset,
 			      bool use_mg_tasks)
 {
-	lockdep_assert_held(&css_set_rwsem);
+	lockdep_assert_held(&css_set_lock);
 
 	if (from_cset) {
 		struct css_task_iter *it, *pos;
@@ -755,7 +754,7 @@ static void put_css_set_locked(struct css_set *cset)
 	struct cgroup_subsys *ss;
 	int ssid;
 
-	lockdep_assert_held(&css_set_rwsem);
+	lockdep_assert_held(&css_set_lock);
 
 	if (!atomic_dec_and_test(&cset->refcount))
 		return;
@@ -787,9 +786,9 @@ static void put_css_set(struct css_set *cset)
 	if (atomic_add_unless(&cset->refcount, -1, 1))
 		return;
 
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	put_css_set_locked(cset);
-	up_write(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 }
 
 /*
@@ -1012,11 +1011,11 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 
 	/* First see if we already have a cgroup group that matches
 	 * the desired set */
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	cset = find_existing_css_set(old_cset, cgrp, template);
 	if (cset)
 		get_css_set(cset);
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	if (cset)
 		return cset;
@@ -1044,7 +1043,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 	 * find_existing_css_set() */
 	memcpy(cset->subsys, template, sizeof(cset->subsys));
 
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	/* Add reference counts and links from the new css_set. */
 	list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
 		struct cgroup *c = link->cgrp;
@@ -1066,7 +1065,7 @@ static struct css_set *find_css_set(struct css_set *old_cset,
 		list_add_tail(&cset->e_cset_node[ssid],
 			      &cset->subsys[ssid]->cgroup->e_csets[ssid]);
 
-	up_write(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	return cset;
 }
@@ -1130,14 +1129,15 @@ static void cgroup_destroy_root(struct cgroup_root *root)
 	 * Release all the links from cset_links to this hierarchy's
 	 * root cgroup
 	 */
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 
 	list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
 		list_del(&link->cset_link);
 		list_del(&link->cgrp_link);
 		kfree(link);
 	}
-	up_write(&css_set_rwsem);
+
+	spin_unlock_bh(&css_set_lock);
 
 	if (!list_empty(&root->root_list)) {
 		list_del(&root->root_list);
@@ -1159,7 +1159,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 	struct cgroup *res = NULL;
 
 	lockdep_assert_held(&cgroup_mutex);
-	lockdep_assert_held(&css_set_rwsem);
+	lockdep_assert_held(&css_set_lock);
 
 	if (cset == &init_css_set) {
 		res = &root->cgrp;
@@ -1182,7 +1182,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
 
 /*
  * Return the cgroup for "task" from the given hierarchy. Must be
- * called with cgroup_mutex and css_set_rwsem held.
+ * called with cgroup_mutex and css_set_lock held.
  */
 static struct cgroup *task_cgroup_from_root(struct task_struct *task,
 					    struct cgroup_root *root)
@@ -1531,11 +1531,11 @@ static int rebind_subsystems(struct cgroup_root *dst_root,
 		ss->root = dst_root;
 		css->cgroup = dcgrp;
 
-		down_write(&css_set_rwsem);
+		spin_lock_bh(&css_set_lock);
 		hash_for_each(css_set_table, i, cset, hlist)
 			list_move_tail(&cset->e_cset_node[ss->id],
 				       &dcgrp->e_csets[ss->id]);
-		up_write(&css_set_rwsem);
+		spin_unlock_bh(&css_set_lock);
 
 		src_root->subsys_mask &= ~(1 << ssid);
 		scgrp->subtree_control &= ~(1 << ssid);
@@ -1812,7 +1812,7 @@ static void cgroup_enable_task_cg_lists(void)
 {
 	struct task_struct *p, *g;
 
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 
 	if (use_task_css_set_links)
 		goto out_unlock;
@@ -1851,7 +1851,7 @@ static void cgroup_enable_task_cg_lists(void)
 	} while_each_thread(g, p);
 	read_unlock(&tasklist_lock);
 out_unlock:
-	up_write(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 }
 
 static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -1915,7 +1915,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 		goto out;
 
 	/*
-	 * We're accessing css_set_count without locking css_set_rwsem here,
+	 * We're accessing css_set_count without locking css_set_lock here,
 	 * but that's OK - it can only be increased by someone holding
 	 * cgroup_lock, and that's us. The worst that can happen is that we
 	 * have some link structures left over
@@ -1957,13 +1957,13 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 	 * Link the root cgroup in this hierarchy into all the css_set
 	 * objects.
 	 */
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	hash_for_each(css_set_table, i, cset, hlist) {
 		link_css_set(&tmp_links, cset, root_cgrp);
 		if (css_set_populated(cset))
 			cgroup_update_populated(root_cgrp, true);
 	}
-	up_write(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	BUG_ON(!list_empty(&root_cgrp->self.children));
 	BUG_ON(atomic_read(&root->nr_cgrps) != 1);
@@ -2196,7 +2196,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 	char *path = NULL;
 
 	mutex_lock(&cgroup_mutex);
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 
 	root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
 
@@ -2209,7 +2209,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
 			path = buf;
 	}
 
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 	mutex_unlock(&cgroup_mutex);
 	return path;
 }
@@ -2258,7 +2258,7 @@ static void cgroup_taskset_add(struct task_struct *task,
 {
 	struct css_set *cset;
 
-	lockdep_assert_held(&css_set_rwsem);
+	lockdep_assert_held(&css_set_lock);
 
 	/* @task either already exited or can't exit until the end */
 	if (task->flags & PF_EXITING)
@@ -2364,7 +2364,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
 	 * the new cgroup.  There are no failure cases after here, so this
 	 * is the commit point.
 	 */
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	list_for_each_entry(cset, &tset->src_csets, mg_node) {
 		list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
 			struct css_set *from_cset = task_css_set(task);
@@ -2375,7 +2375,7 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset,
 			put_css_set_locked(from_cset);
 		}
 	}
-	up_write(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	/*
 	 * Migration is committed, all target tasks are now on dst_csets.
@@ -2399,13 +2399,13 @@ out_cancel_attach:
 			css->ss->cancel_attach(css, tset);
 	}
 out_release_tset:
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	list_splice_init(&tset->dst_csets, &tset->src_csets);
 	list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
 		list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
 		list_del_init(&cset->mg_node);
 	}
-	up_write(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 	return ret;
 }
 
@@ -2422,14 +2422,14 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
 		cset->mg_src_cgrp = NULL;
 		cset->mg_dst_cset = NULL;
 		list_del_init(&cset->mg_preload_node);
 		put_css_set_locked(cset);
 	}
-	up_write(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 }
 
 /**
@@ -2455,7 +2455,7 @@ static void cgroup_migrate_add_src(struct css_set *src_cset,
 	struct cgroup *src_cgrp;
 
 	lockdep_assert_held(&cgroup_mutex);
-	lockdep_assert_held(&css_set_rwsem);
+	lockdep_assert_held(&css_set_lock);
 
 	src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
 
@@ -2571,7 +2571,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 	 * already PF_EXITING could be freed from underneath us unless we
 	 * take an rcu_read_lock.
 	 */
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	rcu_read_lock();
 	task = leader;
 	do {
@@ -2580,7 +2580,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup,
 			break;
 	} while_each_thread(leader, task);
 	rcu_read_unlock();
-	up_write(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	return cgroup_taskset_migrate(&tset, cgrp);
 }
@@ -2601,7 +2601,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 	int ret;
 
 	/* look up all src csets */
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	rcu_read_lock();
 	task = leader;
 	do {
@@ -2611,7 +2611,7 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp,
 			break;
 	} while_each_thread(leader, task);
 	rcu_read_unlock();
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	/* prepare dst csets and commit */
 	ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
@@ -2644,9 +2644,9 @@ static int cgroup_procs_write_permission(struct task_struct *task,
 		struct cgroup *cgrp;
 		struct inode *inode;
 
-		down_read(&css_set_rwsem);
+		spin_lock_bh(&css_set_lock);
 		cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
-		up_read(&css_set_rwsem);
+		spin_unlock_bh(&css_set_lock);
 
 		while (!cgroup_is_descendant(dst_cgrp, cgrp))
 			cgrp = cgroup_parent(cgrp);
@@ -2743,9 +2743,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
 		if (root == &cgrp_dfl_root)
 			continue;
 
-		down_read(&css_set_rwsem);
+		spin_lock_bh(&css_set_lock);
 		from_cgrp = task_cgroup_from_root(from, root);
-		up_read(&css_set_rwsem);
+		spin_unlock_bh(&css_set_lock);
 
 		retval = cgroup_attach_task(from_cgrp, tsk, false);
 		if (retval)
@@ -2870,7 +2870,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 	percpu_down_write(&cgroup_threadgroup_rwsem);
 
 	/* look up all csses currently attached to @cgrp's subtree */
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
 		struct cgrp_cset_link *link;
 
@@ -2882,14 +2882,14 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 			cgroup_migrate_add_src(link->cset, cgrp,
 					       &preloaded_csets);
 	}
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	/* NULL dst indicates self on default hierarchy */
 	ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
 	if (ret)
 		goto out_finish;
 
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
 		struct task_struct *task, *ntask;
 
@@ -2901,7 +2901,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
 		list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
 			cgroup_taskset_add(task, &tset);
 	}
-	up_write(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	ret = cgroup_taskset_migrate(&tset, cgrp);
 out_finish:
@@ -3577,10 +3577,10 @@ static int cgroup_task_count(const struct cgroup *cgrp)
 	int count = 0;
 	struct cgrp_cset_link *link;
 
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	list_for_each_entry(link, &cgrp->cset_links, cset_link)
 		count += atomic_read(&link->cset->refcount);
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 	return count;
 }
 
@@ -3823,7 +3823,7 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
 	struct cgrp_cset_link *link;
 	struct css_set *cset;
 
-	lockdep_assert_held(&css_set_rwsem);
+	lockdep_assert_held(&css_set_lock);
 
 	/* Advance to the next non-empty css_set */
 	do {
@@ -3881,7 +3881,7 @@ static void css_task_iter_advance(struct css_task_iter *it)
 {
 	struct list_head *l = it->task_pos;
 
-	lockdep_assert_held(&css_set_rwsem);
+	lockdep_assert_held(&css_set_lock);
 	WARN_ON_ONCE(!l);
 
 	/*
@@ -3918,7 +3918,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
 
 	memset(it, 0, sizeof(*it));
 
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 
 	it->ss = css->ss;
 
@@ -3931,7 +3931,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
 
 	css_task_iter_advance_css_set(it);
 
-	up_write(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 }
 
 /**
@@ -3950,14 +3950,14 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
 	if (it->cur_task)
 		put_task_struct(it->cur_task);
 
-	down_write(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 
 	it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list);
 	get_task_struct(it->cur_task);
 
 	css_task_iter_advance(it);
 
-	up_write(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	return it->cur_task;
 }
@@ -3971,10 +3971,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it)
 void css_task_iter_end(struct css_task_iter *it)
 {
 	if (it->cur_cset) {
-		down_write(&css_set_rwsem);
+		spin_lock_bh(&css_set_lock);
 		list_del(&it->iters_node);
 		put_css_set_locked(it->cur_cset);
-		up_write(&css_set_rwsem);
+		spin_unlock_bh(&css_set_lock);
 	}
 
 	if (it->cur_task)
@@ -4003,10 +4003,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 	mutex_lock(&cgroup_mutex);
 
 	/* all tasks in @from are being moved, all csets are source */
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	list_for_each_entry(link, &from->cset_links, cset_link)
 		cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 
 	ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
 	if (ret)
@@ -5359,7 +5359,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 		goto out;
 
 	mutex_lock(&cgroup_mutex);
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 
 	for_each_root(root) {
 		struct cgroup_subsys *ss;
@@ -5391,7 +5391,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 
 	retval = 0;
 out_unlock:
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 	mutex_unlock(&cgroup_mutex);
 	kfree(buf);
 out:
@@ -5537,7 +5537,7 @@ void cgroup_post_fork(struct task_struct *child,
 	 * @child during its iteration.
 	 *
 	 * If we won the race, @child is associated with %current's
-	 * css_set.  Grabbing css_set_rwsem guarantees both that the
+	 * css_set.  Grabbing css_set_lock guarantees both that the
 	 * association is stable, and, on completion of the parent's
 	 * migration, @child is visible in the source of migration or
 	 * already in the destination cgroup.  This guarantee is necessary
@@ -5552,13 +5552,13 @@ void cgroup_post_fork(struct task_struct *child,
 	if (use_task_css_set_links) {
 		struct css_set *cset;
 
-		down_write(&css_set_rwsem);
+		spin_lock_bh(&css_set_lock);
 		cset = task_css_set(current);
 		if (list_empty(&child->cg_list)) {
 			get_css_set(cset);
 			css_set_move_task(child, NULL, cset, false);
 		}
-		up_write(&css_set_rwsem);
+		spin_unlock_bh(&css_set_lock);
 	}
 
 	/*
@@ -5603,9 +5603,9 @@ void cgroup_exit(struct task_struct *tsk)
 	cset = task_css_set(tsk);
 
 	if (!list_empty(&tsk->cg_list)) {
-		down_write(&css_set_rwsem);
+		spin_lock_bh(&css_set_lock);
 		css_set_move_task(tsk, cset, NULL, false);
-		up_write(&css_set_rwsem);
+		spin_unlock_bh(&css_set_lock);
 		put_cset = true;
 	}
 
@@ -5823,7 +5823,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
 	if (!name_buf)
 		return -ENOMEM;
 
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	rcu_read_lock();
 	cset = rcu_dereference(current->cgroups);
 	list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
@@ -5834,7 +5834,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
 			   c->root->hierarchy_id, name_buf);
 	}
 	rcu_read_unlock();
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 	kfree(name_buf);
 	return 0;
 }
@@ -5845,7 +5845,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
 	struct cgroup_subsys_state *css = seq_css(seq);
 	struct cgrp_cset_link *link;
 
-	down_read(&css_set_rwsem);
+	spin_lock_bh(&css_set_lock);
 	list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
 		struct css_set *cset = link->cset;
 		struct task_struct *task;
@@ -5868,7 +5868,7 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
 	overflow:
 		seq_puts(seq, "  ...\n");
 	}
-	up_read(&css_set_rwsem);
+	spin_unlock_bh(&css_set_lock);
 	return 0;
 }
 

From 2e91fa7f6d451e3ea9fec999065d2fd199691f9d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 15 Oct 2015 16:41:53 -0400
Subject: [PATCH 33/38] cgroup: keep zombies associated with their original
 cgroups

cgroup_exit() is called when a task exits and disassociates the
exiting task from its cgroups and half-attach it to the root cgroup.
This is unnecessary and undesirable.

No controller actually needs an exiting task to be disassociated with
non-root cgroups.  Both cpu and perf_event controllers update the
association to the root cgroup from their exit callbacks just to keep
consistent with the cgroup core behavior.

Also, this disassociation makes it difficult to track resources held
by zombies or determine where the zombies came from.  Currently, pids
controller is completely broken as it uncharges on exit and zombies
always escape the resource restriction.  With cgroup association being
reset on exit, fixing it is pretty painful.

There's no reason to reset cgroup membership on exit.  The zombie can
be removed from its css_set so that it doesn't show up on
"cgroup.procs" and thus can't be migrated or interfere with cgroup
removal.  It can still pin and point to the css_set so that its cgroup
membership is maintained.  This patch makes cgroup core keep zombies
associated with their cgroups at the time of exit.

* Previous patches decoupled populated_cnt tracking from css_set
  lifetime, so a dying task can be simply unlinked from its css_set
  while pinning and pointing to the css_set.  This keeps css_set
  association from task side alive while hiding it from "cgroup.procs"
  and populated_cnt tracking.  The css_set reference is dropped when
  the task_struct is freed.

* ->exit() callback no longer needs the css arguments as the
  associated css never changes once PF_EXITING is set.  Removed.

* cpu and perf_events controllers no longer need ->exit() callbacks.
  There's no reason to explicitly switch away on exit.  The final
  schedule out is enough.  The callbacks are removed.

* On traditional hierarchies, nothing changes.  "/proc/PID/cgroup"
  still reports "/" for all zombies.  On the default hierarchy,
  "/proc/PID/cgroup" keeps reporting the cgroup that the task belonged
  to at the time of exit.  If the cgroup gets removed before the task
  is reaped, " (deleted)" is appended.

v2: Build brekage due to missing dummy cgroup_free() when
    !CONFIG_CGROUP fixed.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
---
 Documentation/cgroups/unified-hierarchy.txt |  4 ++
 include/linux/cgroup-defs.h                 |  4 +-
 include/linux/cgroup.h                      |  2 +
 kernel/cgroup.c                             | 51 ++++++++++++++-------
 kernel/cgroup_pids.c                        |  6 +--
 kernel/events/core.c                        | 16 -------
 kernel/fork.c                               |  1 +
 kernel/sched/core.c                         | 16 -------
 8 files changed, 44 insertions(+), 56 deletions(-)

diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
index 176b940f8327..6932453d37a2 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -374,6 +374,10 @@ supported and the interface files "release_agent" and
 
 - The "cgroup.clone_children" file is removed.
 
+- /proc/PID/cgroup keeps reporting the cgroup that a zombie belonged
+  to before exiting.  If the cgroup is removed before the zombie is
+  reaped, " (deleted)" is appeneded to the path.
+
 
 5-3. Controller File Conventions
 
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 62413c3e2f4b..6a1ab64ee5f9 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -435,9 +435,7 @@ struct cgroup_subsys {
 	int (*can_fork)(struct task_struct *task, void **priv_p);
 	void (*cancel_fork)(struct task_struct *task, void *priv);
 	void (*fork)(struct task_struct *task, void *priv);
-	void (*exit)(struct cgroup_subsys_state *css,
-		     struct cgroup_subsys_state *old_css,
-		     struct task_struct *task);
+	void (*exit)(struct task_struct *task);
 	void (*bind)(struct cgroup_subsys_state *root_css);
 
 	int early_init;
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 46020735bcbb..22e3754f89c5 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -102,6 +102,7 @@ extern void cgroup_cancel_fork(struct task_struct *p,
 extern void cgroup_post_fork(struct task_struct *p,
 			     void *old_ss_priv[CGROUP_CANFORK_COUNT]);
 void cgroup_exit(struct task_struct *p);
+void cgroup_free(struct task_struct *p);
 
 int cgroup_init_early(void);
 int cgroup_init(void);
@@ -547,6 +548,7 @@ static inline void cgroup_cancel_fork(struct task_struct *p,
 static inline void cgroup_post_fork(struct task_struct *p,
 				    void *ss_priv[CGROUP_CANFORK_COUNT]) {}
 static inline void cgroup_exit(struct task_struct *p) {}
+static inline void cgroup_free(struct task_struct *p) {}
 
 static inline int cgroup_init_early(void) { return 0; }
 static inline int cgroup_init(void) { return 0; }
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ba7b3284c2e4..918658497625 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5379,14 +5379,34 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 			seq_printf(m, "%sname=%s", count ? "," : "",
 				   root->name);
 		seq_putc(m, ':');
+
 		cgrp = task_cgroup_from_root(tsk, root);
-		path = cgroup_path(cgrp, buf, PATH_MAX);
-		if (!path) {
-			retval = -ENAMETOOLONG;
-			goto out_unlock;
+
+		/*
+		 * On traditional hierarchies, all zombie tasks show up as
+		 * belonging to the root cgroup.  On the default hierarchy,
+		 * while a zombie doesn't show up in "cgroup.procs" and
+		 * thus can't be migrated, its /proc/PID/cgroup keeps
+		 * reporting the cgroup it belonged to before exiting.  If
+		 * the cgroup is removed before the zombie is reaped,
+		 * " (deleted)" is appended to the cgroup path.
+		 */
+		if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) {
+			path = cgroup_path(cgrp, buf, PATH_MAX);
+			if (!path) {
+				retval = -ENAMETOOLONG;
+				goto out_unlock;
+			}
+		} else {
+			path = "/";
 		}
+
 		seq_puts(m, path);
-		seq_putc(m, '\n');
+
+		if (cgroup_on_dfl(cgrp) && cgroup_is_dead(cgrp))
+			seq_puts(m, " (deleted)\n");
+		else
+			seq_putc(m, '\n');
 	}
 
 	retval = 0;
@@ -5593,7 +5613,6 @@ void cgroup_exit(struct task_struct *tsk)
 {
 	struct cgroup_subsys *ss;
 	struct css_set *cset;
-	bool put_cset = false;
 	int i;
 
 	/*
@@ -5606,22 +5625,20 @@ void cgroup_exit(struct task_struct *tsk)
 		spin_lock_bh(&css_set_lock);
 		css_set_move_task(tsk, cset, NULL, false);
 		spin_unlock_bh(&css_set_lock);
-		put_cset = true;
+	} else {
+		get_css_set(cset);
 	}
 
-	/* Reassign the task to the init_css_set. */
-	RCU_INIT_POINTER(tsk->cgroups, &init_css_set);
-
 	/* see cgroup_post_fork() for details */
-	for_each_subsys_which(ss, i, &have_exit_callback) {
-		struct cgroup_subsys_state *old_css = cset->subsys[i];
-		struct cgroup_subsys_state *css = task_css(tsk, i);
+	for_each_subsys_which(ss, i, &have_exit_callback)
+		ss->exit(tsk);
+}
 
-		ss->exit(css, old_css, tsk);
-	}
+void cgroup_free(struct task_struct *task)
+{
+	struct css_set *cset = task_css_set(task);
 
-	if (put_cset)
-		put_css_set(cset);
+	put_css_set(cset);
 }
 
 static void check_for_release(struct cgroup *cgrp)
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index 806cd7693ac8..45f0856a61fe 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -266,11 +266,9 @@ static void pids_fork(struct task_struct *task, void *priv)
 	css_put(old_css);
 }
 
-static void pids_exit(struct cgroup_subsys_state *css,
-		      struct cgroup_subsys_state *old_css,
-		      struct task_struct *task)
+static void pids_exit(struct task_struct *task)
 {
-	struct pids_cgroup *pids = css_pids(old_css);
+	struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
 
 	pids_uncharge(pids, 1);
 }
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f548f69c4299..e9874949c787 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9293,25 +9293,9 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css,
 		task_function_call(task, __perf_cgroup_move, task);
 }
 
-static void perf_cgroup_exit(struct cgroup_subsys_state *css,
-			     struct cgroup_subsys_state *old_css,
-			     struct task_struct *task)
-{
-	/*
-	 * cgroup_exit() is called in the copy_process() failure path.
-	 * Ignore this case since the task hasn't ran yet, this avoids
-	 * trying to poke a half freed task state from generic code.
-	 */
-	if (!(task->flags & PF_EXITING))
-		return;
-
-	task_function_call(task, __perf_cgroup_move, task);
-}
-
 struct cgroup_subsys perf_event_cgrp_subsys = {
 	.css_alloc	= perf_cgroup_css_alloc,
 	.css_free	= perf_cgroup_css_free,
-	.exit		= perf_cgroup_exit,
 	.attach		= perf_cgroup_attach,
 };
 #endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/fork.c b/kernel/fork.c
index 7d5f0f118a63..118743bb5964 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -251,6 +251,7 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	cgroup_free(tsk);
 	task_numa_free(tsk);
 	security_task_free(tsk);
 	exit_creds(tsk);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3595403921bd..2cad9ba91036 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -8163,21 +8163,6 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css,
 		sched_move_task(task);
 }
 
-static void cpu_cgroup_exit(struct cgroup_subsys_state *css,
-			    struct cgroup_subsys_state *old_css,
-			    struct task_struct *task)
-{
-	/*
-	 * cgroup_exit() is called in the copy_process() failure path.
-	 * Ignore this case since the task hasn't ran yet, this avoids
-	 * trying to poke a half freed task state from generic code.
-	 */
-	if (!(task->flags & PF_EXITING))
-		return;
-
-	sched_move_task(task);
-}
-
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
 				struct cftype *cftype, u64 shareval)
@@ -8509,7 +8494,6 @@ struct cgroup_subsys cpu_cgrp_subsys = {
 	.fork		= cpu_cgroup_fork,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
-	.exit		= cpu_cgroup_exit,
 	.legacy_cftypes	= cpu_files,
 	.early_init	= 1,
 };

From afcf6c8b75444382e0f9996157207ebae34a8848 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 15 Oct 2015 16:41:53 -0400
Subject: [PATCH 34/38] cgroup: add cgroup_subsys->free() method and use it to
 fix pids controller

pids controller is completely broken in that it uncharges when a task
exits allowing zombies to escape resource control.  With the recent
updates, cgroup core now maintains cgroup association till task free
and pids controller can be fixed by uncharging on free instead of
exit.

This patch adds cgroup_subsys->free() method and update pids
controller to use it instead of ->exit() for uncharging.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Aleksa Sarai <cyphar@cyphar.com>
---
 Documentation/cgroups/cgroups.txt | 4 ++++
 include/linux/cgroup-defs.h       | 1 +
 kernel/cgroup.c                   | 7 +++++++
 kernel/cgroup_pids.c              | 4 ++--
 4 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index f935fac1e73b..c6256ae9885b 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -637,6 +637,10 @@ void exit(struct task_struct *task)
 
 Called during task exit.
 
+void free(struct task_struct *task)
+
+Called when the task_struct is freed.
+
 void bind(struct cgroup *root)
 (cgroup_mutex held by caller)
 
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 6a1ab64ee5f9..60d44b26276d 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -436,6 +436,7 @@ struct cgroup_subsys {
 	void (*cancel_fork)(struct task_struct *task, void *priv);
 	void (*fork)(struct task_struct *task, void *priv);
 	void (*exit)(struct task_struct *task);
+	void (*free)(struct task_struct *task);
 	void (*bind)(struct cgroup_subsys_state *root_css);
 
 	int early_init;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 918658497625..867384369669 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -206,6 +206,7 @@ static u64 css_serial_nr_next = 1;
  */
 static unsigned long have_fork_callback __read_mostly;
 static unsigned long have_exit_callback __read_mostly;
+static unsigned long have_free_callback __read_mostly;
 
 /* Ditto for the can_fork callback. */
 static unsigned long have_canfork_callback __read_mostly;
@@ -5180,6 +5181,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
 
 	have_fork_callback |= (bool)ss->fork << ss->id;
 	have_exit_callback |= (bool)ss->exit << ss->id;
+	have_free_callback |= (bool)ss->free << ss->id;
 	have_canfork_callback |= (bool)ss->can_fork << ss->id;
 
 	/* At system boot, before all subsystems have been
@@ -5637,6 +5639,11 @@ void cgroup_exit(struct task_struct *tsk)
 void cgroup_free(struct task_struct *task)
 {
 	struct css_set *cset = task_css_set(task);
+	struct cgroup_subsys *ss;
+	int ssid;
+
+	for_each_subsys_which(ss, ssid, &have_free_callback)
+		ss->free(task);
 
 	put_css_set(cset);
 }
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index 45f0856a61fe..cdd8df4e991c 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -266,7 +266,7 @@ static void pids_fork(struct task_struct *task, void *priv)
 	css_put(old_css);
 }
 
-static void pids_exit(struct task_struct *task)
+static void pids_free(struct task_struct *task)
 {
 	struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
 
@@ -347,7 +347,7 @@ struct cgroup_subsys pids_cgrp_subsys = {
 	.can_fork	= pids_can_fork,
 	.cancel_fork	= pids_cancel_fork,
 	.fork		= pids_fork,
-	.exit		= pids_exit,
+	.free		= pids_free,
 	.legacy_cftypes	= pids_files,
 	.dfl_cftypes	= pids_files,
 };

From 035f4f510583193949168c77dc957293df24bd77 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 15 Oct 2015 17:00:43 -0400
Subject: [PATCH 35/38] cgroup: replace error handling in cgroup_init() with
 WARN_ON()s

The init sequence shouldn't fail short of bugs and even when it does
it's better to continue with the rest of initialization and we were
silently ignoring /proc/cgroups creation failure.

Drop the explicit error handling and wrap sysfs_create_mount_point(),
register_filesystem() and proc_create() with WARN_ON()s.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---
 kernel/cgroup.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 867384369669..74daeefa2d30 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5242,7 +5242,7 @@ int __init cgroup_init(void)
 {
 	struct cgroup_subsys *ss;
 	unsigned long key;
-	int ssid, err;
+	int ssid;
 
 	BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem));
 	BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
@@ -5304,17 +5304,10 @@ int __init cgroup_init(void)
 			ss->bind(init_css_set.subsys[ssid]);
 	}
 
-	err = sysfs_create_mount_point(fs_kobj, "cgroup");
-	if (err)
-		return err;
+	WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup"));
+	WARN_ON(register_filesystem(&cgroup_fs_type));
+	WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations));
 
-	err = register_filesystem(&cgroup_fs_type);
-	if (err < 0) {
-		sysfs_remove_mount_point(fs_kobj, "cgroup");
-		return err;
-	}
-
-	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
 	return 0;
 }
 

From e4b7037c8613da41fb3f7b029414fe25370f53c0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 15 Oct 2015 17:00:43 -0400
Subject: [PATCH 36/38] cgroup: drop cgroup__DEVEL__legacy_files_on_dfl

Now that interfaces for the major three controllers - cpu, memory, io
- are shaping up, there's no reason to have an option to force legacy
files to show up on the unified hierarchy for testing.  Drop it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
---
 Documentation/cgroups/unified-hierarchy.txt |  6 -----
 kernel/cgroup.c                             | 30 ++-------------------
 2 files changed, 2 insertions(+), 34 deletions(-)

diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
index 6932453d37a2..0cd27a4e0055 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -107,12 +107,6 @@ root of unified hierarchy can be bound to other hierarchies.  This
 allows mixing unified hierarchy with the traditional multiple
 hierarchies in a fully backward compatible way.
 
-For development purposes, the following boot parameter makes all
-controllers to appear on the unified hierarchy whether supported or
-not.
-
- cgroup__DEVEL__legacy_files_on_dfl
-
 A controller can be moved across hierarchies only after the controller
 is no longer referenced in its current hierarchy.  Because per-cgroup
 controller states are destroyed asynchronously and controllers may
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 74daeefa2d30..4f4fc538cb1b 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -173,12 +173,6 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root);
  */
 static bool cgrp_dfl_root_visible;
 
-/*
- * Set by the boot param of the same name and makes subsystems with NULL
- * ->dfl_files to use ->legacy_files on the default hierarchy.
- */
-static bool cgroup_legacy_files_on_dfl;
-
 /* some controllers are not supported in the default hierarchy */
 static unsigned long cgrp_dfl_root_inhibit_ss_mask;
 
@@ -3553,17 +3547,8 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 {
 	struct cftype *cft;
 
-	/*
-	 * If legacy_flies_on_dfl, we want to show the legacy files on the
-	 * dfl hierarchy but iff the target subsystem hasn't been updated
-	 * for the dfl hierarchy yet.
-	 */
-	if (!cgroup_legacy_files_on_dfl ||
-	    ss->dfl_cftypes != ss->legacy_cftypes) {
-		for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
-			cft->flags |= __CFTYPE_NOT_ON_DFL;
-	}
-
+	for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
+		cft->flags |= __CFTYPE_NOT_ON_DFL;
 	return cgroup_add_cftypes(ss, cfts);
 }
 
@@ -5287,9 +5272,6 @@ int __init cgroup_init(void)
 
 		cgrp_dfl_root.subsys_mask |= 1 << ss->id;
 
-		if (cgroup_legacy_files_on_dfl && !ss->dfl_cftypes)
-			ss->dfl_cftypes = ss->legacy_cftypes;
-
 		if (!ss->dfl_cftypes)
 			cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id;
 
@@ -5729,14 +5711,6 @@ static int __init cgroup_disable(char *str)
 }
 __setup("cgroup_disable=", cgroup_disable);
 
-static int __init cgroup_set_legacy_files_on_dfl(char *str)
-{
-	printk("cgroup: using legacy files on the default hierarchy\n");
-	cgroup_legacy_files_on_dfl = true;
-	return 0;
-}
-__setup("cgroup__DEVEL__legacy_files_on_dfl", cgroup_set_legacy_files_on_dfl);
-
 /**
  * css_tryget_online_from_dir - get corresponding css from a cgroup dentry
  * @dentry: directory dentry of interest

From ca0752c5e3e6fad83d286a22d729390bd8004aec Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 22 Oct 2015 09:48:37 +0900
Subject: [PATCH 37/38] blkcg: don't create "io.stat" on the root cgroup

The stat files on the root cgroup shows stats for the whole system and
usually don't contain any information which isn't available through
the usual system monitoring mechanisms.  Some controllers skip
collecting these duplicate stats to optimize cases where cgroup isn't
used and later try to emulate the result on demand.

This leads to complexities and subtle differences in the information
shown through different channels.  This is entirely unnecessary and
cgroup v2 is dropping stat files which are duplicate from all
controllers.  This patch removes "io.stat" from the root hierarchy.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Jens Axboe <axboe@kernel.dk>
Cc: Vivek Goyal <vgoyal@redhat.com>
---
 block/blk-cgroup.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index ac8370cb2515..4fa54161d423 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -896,6 +896,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
 struct cftype blkcg_files[] = {
 	{
 		.name = "stat",
+		.flags = CFTYPE_NOT_ON_ROOT,
 		.seq_show = blkcg_print_stat,
 	},
 	{ }	/* terminate */

From d57456753787ab158f906f1f8eb58d54a2ccd9f4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 29 Oct 2015 11:43:05 +0900
Subject: [PATCH 38/38] cgroup: fix race condition around termination check in
 css_task_iter_next()

css_task_iter_next() checked @it->cur_task before grabbing
css_set_lock and assumed that the result won't change afterwards;
however, tasks could leave the cgroup being iterated terminating the
iterator before css_task_lock is acquired.  If this happens,
css_task_iter_next() tries to calculate the current task from NULL
cg_list pointer leading to the following oops.

 BUG: unable to handle kernel paging request at fffffffffffff7d0
 IP: [<ffffffff810d5f22>] css_task_iter_next+0x42/0x80
 ...
 CPU: 4 PID: 6391 Comm: JobQDisp2 Not tainted 4.0.9-22_fbk4_rc3_81616_ge8d9cb6 #1
 Hardware name: Quanta Freedom/Winterfell, BIOS F03_3B08 03/04/2014
 task: ffff880868e46400 ti: ffff88083404c000 task.ti: ffff88083404c000
 RIP: 0010:[<ffffffff810d5f22>]  [<ffffffff810d5f22>] css_task_iter_next+0x42/0x80
 RSP: 0018:ffff88083404fd28  EFLAGS: 00010246
 RAX: 0000000000000000 RBX: ffff88083404fd68 RCX: ffff8804697fb8b0
 RDX: fffffffffffff7c0 RSI: ffff8803b7dff800 RDI: ffffffff822c0278
 RBP: ffff88083404fd38 R08: 0000000000017160 R09: ffff88046f4070c0
 R10: ffffffff810d61f7 R11: 0000000000000293 R12: ffff880863bf8400
 R13: ffff88046b87fd80 R14: 0000000000000000 R15: ffff88083404fe58
 FS:  00007fa0567e2700(0000) GS:ffff88046f900000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: fffffffffffff7d0 CR3: 0000000469568000 CR4: 00000000001406e0
 Stack:
  0000000000000246 0000000000000000 ffff88083404fde8 ffffffff810d6248
  ffff88083404fd68 0000000000000000 ffff8803b7dff800 000001ef000001ee
  0000000000000000 0000000000000000 ffff880863bf8568 0000000000000000
 Call Trace:
  [<ffffffff810d6248>] cgroup_pidlist_start+0x258/0x550
  [<ffffffff810cf66d>] cgroup_seqfile_start+0x1d/0x20
  [<ffffffff8121f8ef>] kernfs_seq_start+0x5f/0xa0
  [<ffffffff811cab76>] seq_read+0x166/0x380
  [<ffffffff812200fd>] kernfs_fop_read+0x11d/0x180
  [<ffffffff811a7398>] __vfs_read+0x18/0x50
  [<ffffffff811a745d>] vfs_read+0x8d/0x150
  [<ffffffff811a756f>] SyS_read+0x4f/0xb0
  [<ffffffff818d4772>] system_call_fastpath+0x12/0x17

Fix it by moving the termination condition check inside css_set_lock.
@it->cur_task is now cleared after being put and @it->task_pos is
tested for termination instead of @it->cset_pos as they indicate the
same condition and @it->task_pos is what's being dereferenced.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Calvin Owens <calvinowens@fb.com>
Fixes: ed27b9f7a17d ("cgroup: don't hold css_set_rwsem across css task iteration")
Acked-by: Zefan Li <lizefan@huawei.com>
---
 kernel/cgroup.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4f4fc538cb1b..b9d0cce3f9ce 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3930,18 +3930,19 @@ void css_task_iter_start(struct cgroup_subsys_state *css,
  */
 struct task_struct *css_task_iter_next(struct css_task_iter *it)
 {
-	if (!it->cset_pos)
-		return NULL;
-
-	if (it->cur_task)
+	if (it->cur_task) {
 		put_task_struct(it->cur_task);
+		it->cur_task = NULL;
+	}
 
 	spin_lock_bh(&css_set_lock);
 
-	it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list);
-	get_task_struct(it->cur_task);
-
-	css_task_iter_advance(it);
+	if (it->task_pos) {
+		it->cur_task = list_entry(it->task_pos, struct task_struct,
+					  cg_list);
+		get_task_struct(it->cur_task);
+		css_task_iter_advance(it);
+	}
 
 	spin_unlock_bh(&css_set_lock);