Merge tag 'sched-psi-2022-10-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull PSI updates from Ingo Molnar:

 - Various performance optimizations, resulting in a 4%-9% speedup in
   the mmtests/config-scheduler-perfpipe micro-benchmark.

 - New interface to turn PSI on/off on a per cgroup level.

Signed-off-by: Ingo Molnar <mingo@kernel.org>

* tag 'sched-psi-2022-10-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/psi: Per-cgroup PSI accounting disable/re-enable interface
  sched/psi: Cache parent psi_group to speed up group iteration
  sched/psi: Consolidate cgroup_psi()
  sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure
  sched/psi: Remove NR_ONCPU task accounting
  sched/psi: Optimize task switch inside shared cgroups again
  sched/psi: Move private helpers to sched/stats.h
  sched/psi: Save percpu memory when !psi_cgroups_enabled
  sched/psi: Don't create cgroup PSI files when psi_disabled
  sched/psi: Fix periodic aggregation shut off
commit bd9a3dba18
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -976,6 +976,29 @@ All cgroup core files are prefixed with "cgroup."
         killing cgroups is a process directed operation, i.e. it affects
         the whole thread-group.
 
+  cgroup.pressure
+        A read-write single value file that allowed values are "0" and "1".
+        The default is "1".
+
+        Writing "0" to the file will disable the cgroup PSI accounting.
+        Writing "1" to the file will re-enable the cgroup PSI accounting.
+
+        This control attribute is not hierarchical, so disable or enable PSI
+        accounting in a cgroup does not affect PSI accounting in descendants
+        and doesn't need pass enablement via ancestors from root.
+
+        The reason this control attribute exists is that PSI accounts stalls for
+        each cgroup separately and aggregates it at each level of the hierarchy.
+        This may cause non-negligible overhead for some workloads when under
+        deep level of the hierarchy, in which case this control attribute can
+        be used to disable PSI accounting in the non-leaf cgroups.
+
+  irq.pressure
+        A read-write nested-keyed file.
+
+        Shows pressure stall information for IRQ/SOFTIRQ. See
+        :ref:`Documentation/accounting/psi.rst <psi>` for details.
+
 Controllers
 ===========
 
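[ Editorial illustration, not part of the commit: a minimal userspace sketch of the new interface. The cgroup path is a made-up example and error handling is abbreviated. ]

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        /* hypothetical cgroup, created beforehand with mkdir(2) */
        const char *path = "/sys/fs/cgroup/test/cgroup.pressure";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* "0" disables PSI accounting for this cgroup; "1" re-enables it */
        if (write(fd, "0", 1) != 1)
                perror("write");
        close(fd);
        return 0;
}

Because the attribute is not hierarchical, this stops aggregation only in the cgroup itself; descendants keep their own accounting.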
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -428,6 +428,9 @@ struct cgroup {
         struct cgroup_file procs_file;  /* handle for "cgroup.procs" */
         struct cgroup_file events_file; /* handle for "cgroup.events" */
 
+        /* handles for "{cpu,memory,io,irq}.pressure" */
+        struct cgroup_file psi_files[NR_PSI_RESOURCES];
+
         /*
          * The bitmask of subsystems enabled on the child cgroups.
          * ->subtree_control is the one configured through
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -682,11 +682,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
         pr_cont_kernfs_path(cgrp->kn);
 }
 
-static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
-{
-        return cgrp->psi;
-}
-
 bool cgroup_psi_enabled(void);
 
 static inline void cgroup_init_kthreadd(void)
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -7,6 +7,7 @@
 #include <linux/sched.h>
 #include <linux/poll.h>
 #include <linux/cgroup-defs.h>
+#include <linux/cgroup.h>
 
 struct seq_file;
 struct css_set;
@@ -18,10 +19,6 @@ extern struct psi_group psi_system;
 
 void psi_init(void);
 
-void psi_task_change(struct task_struct *task, int clear, int set);
-void psi_task_switch(struct task_struct *prev, struct task_struct *next,
-                     bool sleep);
-
 void psi_memstall_enter(unsigned long *flags);
 void psi_memstall_leave(unsigned long *flags);
 
@@ -34,9 +31,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
                           poll_table *wait);
 
 #ifdef CONFIG_CGROUPS
+static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
+{
+        return cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+}
+
 int psi_cgroup_alloc(struct cgroup *cgrp);
 void psi_cgroup_free(struct cgroup *cgrp);
 void cgroup_move_task(struct task_struct *p, struct css_set *to);
+void psi_cgroup_restart(struct psi_group *group);
 #endif
 
 #else /* CONFIG_PSI */
@@ -58,6 +61,7 @@ static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
 {
         rcu_assign_pointer(p->cgroups, to);
 }
+static inline void psi_cgroup_restart(struct psi_group *group) {}
 #endif
 
 #endif /* CONFIG_PSI */
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -15,13 +15,6 @@ enum psi_task_count {
         NR_IOWAIT,
         NR_MEMSTALL,
         NR_RUNNING,
-        /*
-         * This can't have values other than 0 or 1 and could be
-         * implemented as a bit flag. But for now we still have room
-         * in the first cacheline of psi_group_cpu, and this way we
-         * don't have to special case any state tracking for it.
-         */
-        NR_ONCPU,
         /*
          * For IO and CPU stalls the presence of running/oncpu tasks
          * in the domain means a partial rather than a full stall.
@@ -32,22 +25,27 @@ enum psi_task_count {
          * threads and memstall ones.
          */
         NR_MEMSTALL_RUNNING,
-        NR_PSI_TASK_COUNTS = 5,
+        NR_PSI_TASK_COUNTS = 4,
 };
 
 /* Task state bitmasks */
 #define TSK_IOWAIT      (1 << NR_IOWAIT)
 #define TSK_MEMSTALL    (1 << NR_MEMSTALL)
 #define TSK_RUNNING     (1 << NR_RUNNING)
-#define TSK_ONCPU       (1 << NR_ONCPU)
 #define TSK_MEMSTALL_RUNNING    (1 << NR_MEMSTALL_RUNNING)
 
+/* Only one task can be scheduled, no corresponding task count */
+#define TSK_ONCPU       (1 << NR_PSI_TASK_COUNTS)
+
 /* Resources that workloads could be stalled on */
 enum psi_res {
         PSI_IO,
         PSI_MEM,
         PSI_CPU,
-        NR_PSI_RESOURCES = 3,
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+        PSI_IRQ,
+#endif
+        NR_PSI_RESOURCES,
 };
 
 /*
@@ -63,11 +61,17 @@ enum psi_states {
         PSI_MEM_FULL,
         PSI_CPU_SOME,
         PSI_CPU_FULL,
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+        PSI_IRQ_FULL,
+#endif
         /* Only per-CPU, to weigh the CPU in the global average: */
         PSI_NONIDLE,
-        NR_PSI_STATES = 7,
+        NR_PSI_STATES,
 };
 
+/* Use one bit in the state mask to track TSK_ONCPU */
+#define PSI_ONCPU       (1 << NR_PSI_STATES)
+
 enum psi_aggregators {
         PSI_AVGS = 0,
         PSI_POLL,
@@ -147,6 +151,9 @@ struct psi_trigger {
 };
 
 struct psi_group {
+        struct psi_group *parent;
+        bool enabled;
+
         /* Protects data used by the aggregator */
         struct mutex avgs_lock;
 
@@ -188,6 +195,8 @@ struct psi_group {
 
 #else /* CONFIG_PSI */
 
+#define NR_PSI_RESOURCES        0
+
 struct psi_group { };
 
 #endif /* CONFIG_PSI */
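[ Editorial illustration, not part of the commit: the resulting bit layout, restated as a standalone program. Values assume CONFIG_IRQ_TIME_ACCOUNTING=y. With NR_ONCPU gone, TSK_ONCPU is the first bit above the four counted task states, and PSI_ONCPU is the first bit above the tracked time states, so neither collides with a counted entry. ]

#include <stdio.h>

/* mirror of the new enum psi_task_count */
enum { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, NR_MEMSTALL_RUNNING,
       NR_PSI_TASK_COUNTS };                    /* = 4 */
#define TSK_ONCPU  (1 << NR_PSI_TASK_COUNTS)    /* no task count for this one */

/* mirror of the new enum psi_states, PSI_IRQ_FULL included */
enum { PSI_IO_SOME, PSI_IO_FULL, PSI_MEM_SOME, PSI_MEM_FULL,
       PSI_CPU_SOME, PSI_CPU_FULL, PSI_IRQ_FULL,
       PSI_NONIDLE, NR_PSI_STATES };            /* = 8 */
#define PSI_ONCPU  (1 << NR_PSI_STATES)         /* state-mask-only bit */

int main(void)
{
        /* prints TSK_ONCPU = 0x10, PSI_ONCPU = 0x100 */
        printf("TSK_ONCPU = 0x%x, PSI_ONCPU = 0x%x\n", TSK_ONCPU, PSI_ONCPU);
        return 0;
}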
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3698,27 +3698,27 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
 static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
 {
         struct cgroup *cgrp = seq_css(seq)->cgroup;
-        struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+        struct psi_group *psi = cgroup_psi(cgrp);
 
         return psi_show(seq, psi, PSI_IO);
 }
 static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
 {
         struct cgroup *cgrp = seq_css(seq)->cgroup;
-        struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+        struct psi_group *psi = cgroup_psi(cgrp);
 
         return psi_show(seq, psi, PSI_MEM);
 }
 static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
 {
         struct cgroup *cgrp = seq_css(seq)->cgroup;
-        struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+        struct psi_group *psi = cgroup_psi(cgrp);
 
         return psi_show(seq, psi, PSI_CPU);
 }
 
-static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
-                                     size_t nbytes, enum psi_res res)
+static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
+                              size_t nbytes, enum psi_res res)
 {
         struct cgroup_file_ctx *ctx = of->priv;
         struct psi_trigger *new;
@@ -3738,7 +3738,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
                 return -EBUSY;
         }
 
-        psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+        psi = cgroup_psi(cgrp);
         new = psi_trigger_create(psi, buf, res);
         if (IS_ERR(new)) {
                 cgroup_put(cgrp);
@@ -3755,21 +3755,86 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
                                           char *buf, size_t nbytes,
                                           loff_t off)
 {
-        return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+        return pressure_write(of, buf, nbytes, PSI_IO);
 }
 
 static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
                                           char *buf, size_t nbytes,
                                           loff_t off)
 {
-        return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+        return pressure_write(of, buf, nbytes, PSI_MEM);
 }
 
 static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
                                           char *buf, size_t nbytes,
                                           loff_t off)
 {
-        return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+        return pressure_write(of, buf, nbytes, PSI_CPU);
+}
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
+{
+        struct cgroup *cgrp = seq_css(seq)->cgroup;
+        struct psi_group *psi = cgroup_psi(cgrp);
+
+        return psi_show(seq, psi, PSI_IRQ);
+}
+
+static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
+                                          char *buf, size_t nbytes,
+                                          loff_t off)
+{
+        return pressure_write(of, buf, nbytes, PSI_IRQ);
+}
+#endif
+
+static int cgroup_pressure_show(struct seq_file *seq, void *v)
+{
+        struct cgroup *cgrp = seq_css(seq)->cgroup;
+        struct psi_group *psi = cgroup_psi(cgrp);
+
+        seq_printf(seq, "%d\n", psi->enabled);
+
+        return 0;
+}
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
+                                     char *buf, size_t nbytes,
+                                     loff_t off)
+{
+        ssize_t ret;
+        int enable;
+        struct cgroup *cgrp;
+        struct psi_group *psi;
+
+        ret = kstrtoint(strstrip(buf), 0, &enable);
+        if (ret)
+                return ret;
+
+        if (enable < 0 || enable > 1)
+                return -ERANGE;
+
+        cgrp = cgroup_kn_lock_live(of->kn, false);
+        if (!cgrp)
+                return -ENOENT;
+
+        psi = cgroup_psi(cgrp);
+        if (psi->enabled != enable) {
+                int i;
+
+                /* show or hide {cpu,memory,io,irq}.pressure files */
+                for (i = 0; i < NR_PSI_RESOURCES; i++)
+                        cgroup_file_show(&cgrp->psi_files[i], enable);
+
+                psi->enabled = enable;
+                if (enable)
+                        psi_cgroup_restart(psi);
+        }
+
+        cgroup_kn_unlock(of->kn);
+
+        return nbytes;
 }
 
 static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
@@ -3789,6 +3854,9 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
 
 bool cgroup_psi_enabled(void)
 {
+        if (static_branch_likely(&psi_disabled))
+                return false;
+
         return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
 }
 
@@ -5175,6 +5243,7 @@ static struct cftype cgroup_psi_files[] = {
 #ifdef CONFIG_PSI
         {
                 .name = "io.pressure",
+                .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
                 .seq_show = cgroup_io_pressure_show,
                 .write = cgroup_io_pressure_write,
                 .poll = cgroup_pressure_poll,
@@ -5182,6 +5251,7 @@ static struct cftype cgroup_psi_files[] = {
         },
         {
                 .name = "memory.pressure",
+                .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
                 .seq_show = cgroup_memory_pressure_show,
                 .write = cgroup_memory_pressure_write,
                 .poll = cgroup_pressure_poll,
@@ -5189,11 +5259,27 @@ static struct cftype cgroup_psi_files[] = {
         },
         {
                 .name = "cpu.pressure",
+                .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
                 .seq_show = cgroup_cpu_pressure_show,
                 .write = cgroup_cpu_pressure_write,
                 .poll = cgroup_pressure_poll,
                 .release = cgroup_pressure_release,
         },
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+        {
+                .name = "irq.pressure",
+                .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+                .seq_show = cgroup_irq_pressure_show,
+                .write = cgroup_irq_pressure_write,
+                .poll = cgroup_pressure_poll,
+                .release = cgroup_pressure_release,
+        },
+#endif
+        {
+                .name = "cgroup.pressure",
+                .seq_show = cgroup_pressure_show,
+                .write = cgroup_pressure_write,
+        },
 #endif /* CONFIG_PSI */
         { }     /* terminate */
 };
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -701,6 +701,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 
         rq->prev_irq_time += irq_delta;
         delta -= irq_delta;
+        psi_account_irqtime(rq->curr, irq_delta);
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
         if (static_key_false((&paravirt_steal_rq_enabled))) {
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -181,6 +181,7 @@ static void group_init(struct psi_group *group)
 {
         int cpu;
 
+        group->enabled = true;
         for_each_possible_cpu(cpu)
                 seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
         group->avg_last_update = sched_clock();
@@ -201,6 +202,7 @@ void __init psi_init(void)
 {
         if (!psi_enable) {
                 static_branch_enable(&psi_disabled);
+                static_branch_disable(&psi_cgroups_enabled);
                 return;
         }
 
@@ -211,7 +213,7 @@ void __init psi_init(void)
         group_init(&psi_system);
 }
 
-static bool test_state(unsigned int *tasks, enum psi_states state)
+static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
 {
         switch (state) {
         case PSI_IO_SOME:
@@ -224,9 +226,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
                 return unlikely(tasks[NR_MEMSTALL] &&
                         tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
         case PSI_CPU_SOME:
-                return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
+                return unlikely(tasks[NR_RUNNING] > oncpu);
         case PSI_CPU_FULL:
-                return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
+                return unlikely(tasks[NR_RUNNING] && !oncpu);
         case PSI_NONIDLE:
                 return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
                         tasks[NR_RUNNING];
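[ Editorial illustration, not part of the commit: the new CPU checks restated outside the kernel. With two runnable tasks and one of them on the CPU, SOME is reported but not FULL; a lone running task reports neither. ]

#include <assert.h>
#include <stdbool.h>

enum { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, NR_MEMSTALL_RUNNING, NR_PSI_TASK_COUNTS };

static bool cpu_some(const unsigned int *tasks, bool oncpu)
{
        return tasks[NR_RUNNING] > oncpu;       /* runnable tasks waiting for the CPU */
}

static bool cpu_full(const unsigned int *tasks, bool oncpu)
{
        return tasks[NR_RUNNING] && !oncpu;     /* runnable, but nothing on the CPU */
}

int main(void)
{
        unsigned int tasks[NR_PSI_TASK_COUNTS] = { 0 };

        tasks[NR_RUNNING] = 2;
        assert(cpu_some(tasks, true) && !cpu_full(tasks, true));
        tasks[NR_RUNNING] = 1;
        assert(!cpu_some(tasks, true));         /* a lone running task is not stalled */
        return 0;
}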
@@ -688,35 +690,53 @@ static void psi_group_change(struct psi_group *group, int cpu,
                              bool wake_clock)
 {
         struct psi_group_cpu *groupc;
-        u32 state_mask = 0;
         unsigned int t, m;
         enum psi_states s;
+        u32 state_mask;
 
         groupc = per_cpu_ptr(group->pcpu, cpu);
 
         /*
-         * First we assess the aggregate resource states this CPU's
-         * tasks have been in since the last change, and account any
-         * SOME and FULL time these may have resulted in.
-         *
-         * Then we update the task counts according to the state
+         * First we update the task counts according to the state
          * change requested through the @clear and @set bits.
+         *
+         * Then if the cgroup PSI stats accounting enabled, we
+         * assess the aggregate resource states this CPU's tasks
+         * have been in since the last change, and account any
+         * SOME and FULL time these may have resulted in.
          */
         write_seqcount_begin(&groupc->seq);
 
-        record_times(groupc, now);
+        /*
+         * Start with TSK_ONCPU, which doesn't have a corresponding
+         * task count - it's just a boolean flag directly encoded in
+         * the state mask. Clear, set, or carry the current state if
+         * no changes are requested.
+         */
+        if (unlikely(clear & TSK_ONCPU)) {
+                state_mask = 0;
+                clear &= ~TSK_ONCPU;
+        } else if (unlikely(set & TSK_ONCPU)) {
+                state_mask = PSI_ONCPU;
+                set &= ~TSK_ONCPU;
+        } else {
+                state_mask = groupc->state_mask & PSI_ONCPU;
+        }
 
+        /*
+         * The rest of the state mask is calculated based on the task
+         * counts. Update those first, then construct the mask.
+         */
         for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
                 if (!(m & (1 << t)))
                         continue;
                 if (groupc->tasks[t]) {
                         groupc->tasks[t]--;
                 } else if (!psi_bug) {
-                        printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
+                        printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
                                         cpu, t, groupc->tasks[0],
                                         groupc->tasks[1], groupc->tasks[2],
-                                        groupc->tasks[3], groupc->tasks[4],
-                                        clear, set);
+                                        groupc->tasks[3], clear, set);
                         psi_bug = 1;
                 }
         }
@@ -725,9 +745,25 @@ static void psi_group_change(struct psi_group *group, int cpu,
                 if (set & (1 << t))
                         groupc->tasks[t]++;
 
-        /* Calculate state mask representing active states */
+        if (!group->enabled) {
+                /*
+                 * On the first group change after disabling PSI, conclude
+                 * the current state and flush its time. This is unlikely
+                 * to matter to the user, but aggregation (get_recent_times)
+                 * may have already incorporated the live state into times_prev;
+                 * avoid a delta sample underflow when PSI is later re-enabled.
+                 */
+                if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
+                        record_times(groupc, now);
+
+                groupc->state_mask = state_mask;
+
+                write_seqcount_end(&groupc->seq);
+                return;
+        }
+
         for (s = 0; s < NR_PSI_STATES; s++) {
-                if (test_state(groupc->tasks, s))
+                if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
                         state_mask |= (1 << s);
         }
 
@@ -739,9 +775,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
          * task in a cgroup is in_memstall, the corresponding groupc
          * on that cpu is in PSI_MEM_FULL state.
          */
-        if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
+        if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
                 state_mask |= (1 << PSI_MEM_FULL);
 
+        record_times(groupc, now);
+
         groupc->state_mask = state_mask;
 
         write_seqcount_end(&groupc->seq);
@@ -753,27 +791,12 @@ static void psi_group_change(struct psi_group *group, int cpu,
                 schedule_delayed_work(&group->avgs_work, PSI_FREQ);
 }
 
-static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
+static inline struct psi_group *task_psi_group(struct task_struct *task)
 {
-        if (*iter == &psi_system)
-                return NULL;
-
 #ifdef CONFIG_CGROUPS
-        if (static_branch_likely(&psi_cgroups_enabled)) {
-                struct cgroup *cgroup = NULL;
-
-                if (!*iter)
-                        cgroup = task->cgroups->dfl_cgrp;
-                else
-                        cgroup = cgroup_parent(*iter);
-
-                if (cgroup && cgroup_parent(cgroup)) {
-                        *iter = cgroup;
-                        return cgroup_psi(cgroup);
-                }
-        }
+        if (static_branch_likely(&psi_cgroups_enabled))
+                return cgroup_psi(task_dfl_cgroup(task));
 #endif
-        *iter = &psi_system;
         return &psi_system;
 }
 
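[ Editorial illustration, not part of the commit, with hypothetical struct names: the shape of the replacement iteration. With the parent psi_group cached at cgroup creation, walking the ancestry is a plain pointer chase that ends at the system group, instead of repeated cgroup_parent() lookups driven through a void * cursor. ]

#include <assert.h>
#include <stddef.h>

struct grp { struct grp *parent; int visits; };

static void walk(struct grp *g)
{
        do {
                g->visits++;    /* stand-in for psi_group_change() */
        } while ((g = g->parent));
}

int main(void)
{
        struct grp root = { NULL, 0 }, mid = { &root, 0 }, leaf = { &mid, 0 };

        walk(&leaf);    /* visits leaf, mid and root exactly once */
        assert(leaf.visits == 1 && mid.visits == 1 && root.visits == 1);
        return 0;
}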
@@ -796,8 +819,6 @@ void psi_task_change(struct task_struct *task, int clear, int set)
 {
         int cpu = task_cpu(task);
         struct psi_group *group;
-        bool wake_clock = true;
-        void *iter = NULL;
         u64 now;
 
         if (!task->pid)
@@ -806,19 +827,11 @@ void psi_task_change(struct task_struct *task, int clear, int set)
         psi_flags_change(task, clear, set);
 
         now = cpu_clock(cpu);
-        /*
-         * Periodic aggregation shuts off if there is a period of no
-         * task changes, so we wake it back up if necessary. However,
-         * don't do this if the task change is the aggregation worker
-         * itself going to sleep, or we'll ping-pong forever.
-         */
-        if (unlikely((clear & TSK_RUNNING) &&
-                     (task->flags & PF_WQ_WORKER) &&
-                     wq_worker_last_func(task) == psi_avgs_work))
-                wake_clock = false;
 
-        while ((group = iterate_groups(task, &iter)))
-                psi_group_change(group, cpu, clear, set, now, wake_clock);
+        group = task_psi_group(task);
+        do {
+                psi_group_change(group, cpu, clear, set, now, true);
+        } while ((group = group->parent));
 }
 
 void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@@ -826,34 +839,30 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
 {
         struct psi_group *group, *common = NULL;
         int cpu = task_cpu(prev);
-        void *iter;
         u64 now = cpu_clock(cpu);
 
         if (next->pid) {
-                bool identical_state;
-
                 psi_flags_change(next, 0, TSK_ONCPU);
                 /*
-                 * When switching between tasks that have an identical
-                 * runtime state, the cgroup that contains both tasks
-                 * we reach the first common ancestor. Iterate @next's
-                 * ancestors only until we encounter @prev's ONCPU.
+                 * Set TSK_ONCPU on @next's cgroups. If @next shares any
+                 * ancestors with @prev, those will already have @prev's
+                 * TSK_ONCPU bit set, and we can stop the iteration there.
                  */
-                identical_state = prev->psi_flags == next->psi_flags;
-                iter = NULL;
-                while ((group = iterate_groups(next, &iter))) {
-                        if (identical_state &&
-                            per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
+                group = task_psi_group(next);
+                do {
+                        if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
+                            PSI_ONCPU) {
                                 common = group;
                                 break;
                         }
 
                         psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
-                }
+                } while ((group = group->parent));
         }
 
         if (prev->pid) {
                 int clear = TSK_ONCPU, set = 0;
+                bool wake_clock = true;
 
                 /*
                  * When we're going to sleep, psi_dequeue() lets us
@@ -867,26 +876,74 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
                         clear |= TSK_MEMSTALL_RUNNING;
                 if (prev->in_iowait)
                         set |= TSK_IOWAIT;
+
+                /*
+                 * Periodic aggregation shuts off if there is a period of no
+                 * task changes, so we wake it back up if necessary. However,
+                 * don't do this if the task change is the aggregation worker
+                 * itself going to sleep, or we'll ping-pong forever.
+                 */
+                if (unlikely((prev->flags & PF_WQ_WORKER) &&
+                             wq_worker_last_func(prev) == psi_avgs_work))
+                        wake_clock = false;
         }
 
         psi_flags_change(prev, clear, set);
 
-        iter = NULL;
-        while ((group = iterate_groups(prev, &iter)) && group != common)
-                psi_group_change(group, cpu, clear, set, now, true);
+        group = task_psi_group(prev);
+        do {
+                if (group == common)
+                        break;
+                psi_group_change(group, cpu, clear, set, now, wake_clock);
+        } while ((group = group->parent));
 
         /*
-         * TSK_ONCPU is handled up to the common ancestor. If we're tasked
-         * with dequeuing too, finish that for the rest of the hierarchy.
+         * TSK_ONCPU is handled up to the common ancestor. If there are
+         * any other differences between the two tasks (e.g. prev goes
+         * to sleep, or only one task is memstall), finish propagating
+         * those differences all the way up to the root.
          */
-        if (sleep) {
+        if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
                 clear &= ~TSK_ONCPU;
-                for (; group; group = iterate_groups(prev, &iter))
-                        psi_group_change(group, cpu, clear, set, now, true);
+                for (; group; group = group->parent)
+                        psi_group_change(group, cpu, clear, set, now, wake_clock);
         }
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct task_struct *task, u32 delta)
+{
+        int cpu = task_cpu(task);
+        struct psi_group *group;
+        struct psi_group_cpu *groupc;
+        u64 now;
+
+        if (!task->pid)
+                return;
+
+        now = cpu_clock(cpu);
+
+        group = task_psi_group(task);
+        do {
+                if (!group->enabled)
+                        continue;
+
+                groupc = per_cpu_ptr(group->pcpu, cpu);
+
+                write_seqcount_begin(&groupc->seq);
+
+                record_times(groupc, now);
+                groupc->times[PSI_IRQ_FULL] += delta;
+
+                write_seqcount_end(&groupc->seq);
+
+                if (group->poll_states & (1 << PSI_IRQ_FULL))
+                        psi_schedule_poll_work(group, 1);
+        } while ((group = group->parent));
+}
+#endif
+
 /**
  * psi_memstall_enter - mark the beginning of a memory stall section
  * @flags: flags to handle nested sections
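[ Editorial illustration, not part of the commit: the new propagation rule at the end of psi_task_switch(), restated as a small check. If prev and next differ in any PSI flag other than TSK_ONCPU (e.g. prev goes to sleep, or only one of the two is in memstall), the remaining changes must still be pushed past the common ancestor, all the way to the root. TSK_ONCPU is hard-coded to its new bit position here. ]

#include <assert.h>
#include <stdbool.h>

#define TSK_ONCPU (1 << 4)      /* bit above the four counted task states */

static bool must_finish_hierarchy(unsigned int prev_flags, unsigned int next_flags)
{
        return (prev_flags ^ next_flags) & ~TSK_ONCPU;
}

int main(void)
{
        /* identical state apart from who owns the CPU: stop at the common ancestor */
        assert(!must_finish_hierarchy(TSK_ONCPU | 1, 1));
        /* prev carries an extra state bit: keep walking up to the root */
        assert(must_finish_hierarchy(TSK_ONCPU | 1, 0));
        return 0;
}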
@@ -952,7 +1009,7 @@ EXPORT_SYMBOL_GPL(psi_memstall_leave);
 #ifdef CONFIG_CGROUPS
 int psi_cgroup_alloc(struct cgroup *cgroup)
 {
-        if (static_branch_likely(&psi_disabled))
+        if (!static_branch_likely(&psi_cgroups_enabled))
                 return 0;
 
         cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
@@ -965,12 +1022,13 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
                 return -ENOMEM;
         }
         group_init(cgroup->psi);
+        cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup));
         return 0;
 }
 
 void psi_cgroup_free(struct cgroup *cgroup)
 {
-        if (static_branch_likely(&psi_disabled))
+        if (!static_branch_likely(&psi_cgroups_enabled))
                 return;
 
         cancel_delayed_work_sync(&cgroup->psi->avgs_work);
@@ -998,7 +1056,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
         struct rq_flags rf;
         struct rq *rq;
 
-        if (static_branch_likely(&psi_disabled)) {
+        if (!static_branch_likely(&psi_cgroups_enabled)) {
                 /*
                  * Lame to do this here, but the scheduler cannot be locked
                  * from the outside, so we move cgroups from inside sched/.
@@ -1046,10 +1104,45 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
 
         task_rq_unlock(rq, task, &rf);
 }
+
+void psi_cgroup_restart(struct psi_group *group)
+{
+        int cpu;
+
+        /*
+         * After we disable psi_group->enabled, we don't actually
+         * stop percpu tasks accounting in each psi_group_cpu,
+         * instead only stop test_state() loop, record_times()
+         * and averaging worker, see psi_group_change() for details.
+         *
+         * When disable cgroup PSI, this function has nothing to sync
+         * since cgroup pressure files are hidden and percpu psi_group_cpu
+         * would see !psi_group->enabled and only do task accounting.
+         *
+         * When re-enable cgroup PSI, this function use psi_group_change()
+         * to get correct state mask from test_state() loop on tasks[],
+         * and restart groupc->state_start from now, use .clear = .set = 0
+         * here since no task status really changed.
+         */
+        if (!group->enabled)
+                return;
+
+        for_each_possible_cpu(cpu) {
+                struct rq *rq = cpu_rq(cpu);
+                struct rq_flags rf;
+                u64 now;
+
+                rq_lock_irq(rq, &rf);
+                now = cpu_clock(cpu);
+                psi_group_change(group, cpu, 0, 0, now, true);
+                rq_unlock_irq(rq, &rf);
+        }
+}
 #endif /* CONFIG_CGROUPS */
 
 int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 {
+        bool only_full = false;
         int full;
         u64 now;
 
@@ -1064,7 +1157,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
                 group->avg_next_update = update_averages(group, now);
         mutex_unlock(&group->avgs_lock);
 
-        for (full = 0; full < 2; full++) {
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+        only_full = res == PSI_IRQ;
+#endif
+
+        for (full = 0; full < 2 - only_full; full++) {
                 unsigned long avg[3] = { 0, };
                 u64 total = 0;
                 int w;
@@ -1078,7 +1175,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
                 }
 
                 seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
-                           full ? "full" : "some",
+                           full || only_full ? "full" : "some",
                            LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
                            LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
                            LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
@@ -1106,6 +1203,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
         else
                 return ERR_PTR(-EINVAL);
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+        if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
+                return ERR_PTR(-EINVAL);
+#endif
+
         if (state >= PSI_NONIDLE)
                 return ERR_PTR(-EINVAL);
 
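[ Editorial illustration, not part of the commit: PSI triggers are registered by writing "some|full <threshold-us> <window-us>" and polling for POLLPRI; with the check above, an IRQ trigger must use "full". A userspace sketch against the new /proc/pressure/irq file, with arbitrary example thresholds: ]

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        const char trig[] = "full 150000 1000000";      /* 150ms of IRQ stall per 1s */
        int fd = open("/proc/pressure/irq", O_RDWR | O_NONBLOCK);
        struct pollfd pfd = { .fd = fd, .events = POLLPRI };

        if (fd < 0 || write(fd, trig, strlen(trig) + 1) < 0) {
                perror("trigger setup");
                return 1;
        }
        if (poll(&pfd, 1, 10000) > 0 && (pfd.revents & POLLPRI))
                puts("irq pressure threshold crossed");
        close(fd);
        return 0;
}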
@@ -1390,6 +1492,33 @@ static const struct proc_ops psi_cpu_proc_ops = {
         .proc_release   = psi_fop_release,
 };
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int psi_irq_show(struct seq_file *m, void *v)
+{
+        return psi_show(m, &psi_system, PSI_IRQ);
+}
+
+static int psi_irq_open(struct inode *inode, struct file *file)
+{
+        return psi_open(file, psi_irq_show);
+}
+
+static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
+                             size_t nbytes, loff_t *ppos)
+{
+        return psi_write(file, user_buf, nbytes, PSI_IRQ);
+}
+
+static const struct proc_ops psi_irq_proc_ops = {
+        .proc_open      = psi_irq_open,
+        .proc_read      = seq_read,
+        .proc_lseek     = seq_lseek,
+        .proc_write     = psi_irq_write,
+        .proc_poll      = psi_fop_poll,
+        .proc_release   = psi_fop_release,
+};
+#endif
+
 static int __init psi_proc_init(void)
 {
         if (psi_enable) {
@@ -1397,6 +1526,9 @@ static int __init psi_proc_init(void)
                 proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
                 proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
                 proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+                proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
+#endif
         }
         return 0;
 }
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -107,6 +107,11 @@ __schedstats_from_se(struct sched_entity *se)
 }
 
 #ifdef CONFIG_PSI
+void psi_task_change(struct task_struct *task, int clear, int set);
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+                     bool sleep);
+void psi_account_irqtime(struct task_struct *task, u32 delta);
+
 /*
  * PSI tracks state that persists across sleeps, such as iowaits and
  * memory stalls. As a result, it has to distinguish between sleeps,
@@ -201,6 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
 static inline void psi_sched_switch(struct task_struct *prev,
                                     struct task_struct *next,
                                     bool sleep) {}
+static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
 #endif /* CONFIG_PSI */
 
 #ifdef CONFIG_SCHED_INFO