PSI updates for v6.1:

- Various performance optimizations, resulting in a 4%-9% speedup
    in the mmtests/config-scheduler-perfpipe micro-benchmark.
 
  - New interface to turn PSI on/off on a per cgroup level.
 
 Signed-off-by: Ingo Molnar <mingo@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmNJKPsRHG1pbmdvQGtl
 cm5lbC5vcmcACgkQEnMQ0APhK1iPmg//aovCitAQX2lLoHJDIgdQibU40oaEpKTX
 wM549EGz3Dr6qmwF8+qT1U2Ge6af/hHQc5G/ZqDpKbuTjUIc3RmBkqX80dNKFLuH
 uyi9UtfsSriw+ks8fWuDdjr+S4oppwW9ZoIXvK8v4bisd3F31DNGvKPTayNxt73m
 lExfzJiD1oJixDxGX8MGO9QpcoywmjWjzjrB2P+J8hnTpArouHx/HOKdQOpG6wXq
 ZRr9kZvju6ucDpXCTa1HJrfVRxNAh35tx/b4cDtXbBFifVAeKaPOrHapMTVsqfel
 Z7T+2DymhidNYK0hrRJoGUwa/vkz+2Sm1ZLG9LlgUCXVco/9S1zw1ZuQakVvzPen
 wriuxRaAkR+szCP0L8js5+/DAkGa43MjKsvQHmDVnetQtlsAD4eYnn+alQ837SXv
 MP3jwFqF+e4mcWdoQcfh0OWUgGec5XZzdgRYrFkBKyTWGLB2iPivcAMNf0X/h82Q
 xxv4DQJIIJ017GOQ/ho2saq+GbtFCvX8YnGYas9T47Bjjluhjo7jgTVtPTo+mhtN
 RfwMdG718Ap/gvnAX7wMe/t+L/4AP8AIgDRi5L35dTRqETwOjH+LAvOYjleQFYgu
 kMVtLMyzU+TGwHscuzPFRh7TnvSJ4sD48Ll1BPnyZsh3SS9u0gAs1bml7Cu7JbmW
 SIZD/S/hzdI=
 =91tB
 -----END PGP SIGNATURE-----

Merge tag 'sched-psi-2022-10-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull PSI updates from Ingo Molnar:

 - Various performance optimizations, resulting in a 4%-9% speedup in
   the mmtests/config-scheduler-perfpipe micro-benchmark.

 - New interface to turn PSI on/off on a per cgroup level.

* tag 'sched-psi-2022-10-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/psi: Per-cgroup PSI accounting disable/re-enable interface
  sched/psi: Cache parent psi_group to speed up group iteration
  sched/psi: Consolidate cgroup_psi()
  sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure
  sched/psi: Remove NR_ONCPU task accounting
  sched/psi: Optimize task switch inside shared cgroups again
  sched/psi: Move private helpers to sched/stats.h
  sched/psi: Save percpu memory when !psi_cgroups_enabled
  sched/psi: Don't create cgroup PSI files when psi_disabled
  sched/psi: Fix periodic aggregation shut off
This commit is contained in:
Linus Torvalds 2022-10-14 13:03:00 -07:00
commit bd9a3dba18
9 changed files with 362 additions and 103 deletions

View File

@ -976,6 +976,29 @@ All cgroup core files are prefixed with "cgroup."
killing cgroups is a process directed operation, i.e. it affects
the whole thread-group.
cgroup.pressure
A read-write single value file that allowed values are "0" and "1".
The default is "1".
Writing "0" to the file will disable the cgroup PSI accounting.
Writing "1" to the file will re-enable the cgroup PSI accounting.
This control attribute is not hierarchical, so disable or enable PSI
accounting in a cgroup does not affect PSI accounting in descendants
and doesn't need pass enablement via ancestors from root.
The reason this control attribute exists is that PSI accounts stalls for
each cgroup separately and aggregates it at each level of the hierarchy.
This may cause non-negligible overhead for some workloads when under
deep level of the hierarchy, in which case this control attribute can
be used to disable PSI accounting in the non-leaf cgroups.
irq.pressure
A read-write nested-keyed file.
Shows pressure stall information for IRQ/SOFTIRQ. See
:ref:`Documentation/accounting/psi.rst <psi>` for details.
Controllers
===========

View File

@ -428,6 +428,9 @@ struct cgroup {
struct cgroup_file procs_file; /* handle for "cgroup.procs" */
struct cgroup_file events_file; /* handle for "cgroup.events" */
/* handles for "{cpu,memory,io,irq}.pressure" */
struct cgroup_file psi_files[NR_PSI_RESOURCES];
/*
* The bitmask of subsystems enabled on the child cgroups.
* ->subtree_control is the one configured through

View File

@ -682,11 +682,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
pr_cont_kernfs_path(cgrp->kn);
}
static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
{
return cgrp->psi;
}
bool cgroup_psi_enabled(void);
static inline void cgroup_init_kthreadd(void)

View File

@ -7,6 +7,7 @@
#include <linux/sched.h>
#include <linux/poll.h>
#include <linux/cgroup-defs.h>
#include <linux/cgroup.h>
struct seq_file;
struct css_set;
@ -18,10 +19,6 @@ extern struct psi_group psi_system;
void psi_init(void);
void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
void psi_memstall_enter(unsigned long *flags);
void psi_memstall_leave(unsigned long *flags);
@ -34,9 +31,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
poll_table *wait);
#ifdef CONFIG_CGROUPS
static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
{
return cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
}
int psi_cgroup_alloc(struct cgroup *cgrp);
void psi_cgroup_free(struct cgroup *cgrp);
void cgroup_move_task(struct task_struct *p, struct css_set *to);
void psi_cgroup_restart(struct psi_group *group);
#endif
#else /* CONFIG_PSI */
@ -58,6 +61,7 @@ static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
{
rcu_assign_pointer(p->cgroups, to);
}
static inline void psi_cgroup_restart(struct psi_group *group) {}
#endif
#endif /* CONFIG_PSI */

View File

@ -15,13 +15,6 @@ enum psi_task_count {
NR_IOWAIT,
NR_MEMSTALL,
NR_RUNNING,
/*
* This can't have values other than 0 or 1 and could be
* implemented as a bit flag. But for now we still have room
* in the first cacheline of psi_group_cpu, and this way we
* don't have to special case any state tracking for it.
*/
NR_ONCPU,
/*
* For IO and CPU stalls the presence of running/oncpu tasks
* in the domain means a partial rather than a full stall.
@ -32,22 +25,27 @@ enum psi_task_count {
* threads and memstall ones.
*/
NR_MEMSTALL_RUNNING,
NR_PSI_TASK_COUNTS = 5,
NR_PSI_TASK_COUNTS = 4,
};
/* Task state bitmasks */
#define TSK_IOWAIT (1 << NR_IOWAIT)
#define TSK_MEMSTALL (1 << NR_MEMSTALL)
#define TSK_RUNNING (1 << NR_RUNNING)
#define TSK_ONCPU (1 << NR_ONCPU)
#define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING)
/* Only one task can be scheduled, no corresponding task count */
#define TSK_ONCPU (1 << NR_PSI_TASK_COUNTS)
/* Resources that workloads could be stalled on */
enum psi_res {
PSI_IO,
PSI_MEM,
PSI_CPU,
NR_PSI_RESOURCES = 3,
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
PSI_IRQ,
#endif
NR_PSI_RESOURCES,
};
/*
@ -63,11 +61,17 @@ enum psi_states {
PSI_MEM_FULL,
PSI_CPU_SOME,
PSI_CPU_FULL,
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
PSI_IRQ_FULL,
#endif
/* Only per-CPU, to weigh the CPU in the global average: */
PSI_NONIDLE,
NR_PSI_STATES = 7,
NR_PSI_STATES,
};
/* Use one bit in the state mask to track TSK_ONCPU */
#define PSI_ONCPU (1 << NR_PSI_STATES)
enum psi_aggregators {
PSI_AVGS = 0,
PSI_POLL,
@ -147,6 +151,9 @@ struct psi_trigger {
};
struct psi_group {
struct psi_group *parent;
bool enabled;
/* Protects data used by the aggregator */
struct mutex avgs_lock;
@ -188,6 +195,8 @@ struct psi_group {
#else /* CONFIG_PSI */
#define NR_PSI_RESOURCES 0
struct psi_group { };
#endif /* CONFIG_PSI */

View File

@ -3698,27 +3698,27 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_CPU);
}
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, enum psi_res res)
static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, enum psi_res res)
{
struct cgroup_file_ctx *ctx = of->priv;
struct psi_trigger *new;
@ -3738,7 +3738,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
return -EBUSY;
}
psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
psi = cgroup_psi(cgrp);
new = psi_trigger_create(psi, buf, res);
if (IS_ERR(new)) {
cgroup_put(cgrp);
@ -3755,21 +3755,86 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
return pressure_write(of, buf, nbytes, PSI_IO);
}
static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
return pressure_write(of, buf, nbytes, PSI_MEM);
}
static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
return pressure_write(of, buf, nbytes, PSI_CPU);
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_IRQ);
}
static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
return pressure_write(of, buf, nbytes, PSI_IRQ);
}
#endif
static int cgroup_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
struct psi_group *psi = cgroup_psi(cgrp);
seq_printf(seq, "%d\n", psi->enabled);
return 0;
}
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
ssize_t ret;
int enable;
struct cgroup *cgrp;
struct psi_group *psi;
ret = kstrtoint(strstrip(buf), 0, &enable);
if (ret)
return ret;
if (enable < 0 || enable > 1)
return -ERANGE;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENOENT;
psi = cgroup_psi(cgrp);
if (psi->enabled != enable) {
int i;
/* show or hide {cpu,memory,io,irq}.pressure files */
for (i = 0; i < NR_PSI_RESOURCES; i++)
cgroup_file_show(&cgrp->psi_files[i], enable);
psi->enabled = enable;
if (enable)
psi_cgroup_restart(psi);
}
cgroup_kn_unlock(of->kn);
return nbytes;
}
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
@ -3789,6 +3854,9 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
bool cgroup_psi_enabled(void)
{
if (static_branch_likely(&psi_disabled))
return false;
return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
}
@ -5175,6 +5243,7 @@ static struct cftype cgroup_psi_files[] = {
#ifdef CONFIG_PSI
{
.name = "io.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@ -5182,6 +5251,7 @@ static struct cftype cgroup_psi_files[] = {
},
{
.name = "memory.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@ -5189,11 +5259,27 @@ static struct cftype cgroup_psi_files[] = {
},
{
.name = "cpu.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
{
.name = "irq.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
.seq_show = cgroup_irq_pressure_show,
.write = cgroup_irq_pressure_write,
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
#endif
{
.name = "cgroup.pressure",
.seq_show = cgroup_pressure_show,
.write = cgroup_pressure_write,
},
#endif /* CONFIG_PSI */
{ } /* terminate */
};

View File

@ -701,6 +701,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->prev_irq_time += irq_delta;
delta -= irq_delta;
psi_account_irqtime(rq->curr, irq_delta);
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
if (static_key_false((&paravirt_steal_rq_enabled))) {

View File

@ -181,6 +181,7 @@ static void group_init(struct psi_group *group)
{
int cpu;
group->enabled = true;
for_each_possible_cpu(cpu)
seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
group->avg_last_update = sched_clock();
@ -201,6 +202,7 @@ void __init psi_init(void)
{
if (!psi_enable) {
static_branch_enable(&psi_disabled);
static_branch_disable(&psi_cgroups_enabled);
return;
}
@ -211,7 +213,7 @@ void __init psi_init(void)
group_init(&psi_system);
}
static bool test_state(unsigned int *tasks, enum psi_states state)
static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
{
switch (state) {
case PSI_IO_SOME:
@ -224,9 +226,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
return unlikely(tasks[NR_MEMSTALL] &&
tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
case PSI_CPU_SOME:
return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
return unlikely(tasks[NR_RUNNING] > oncpu);
case PSI_CPU_FULL:
return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
return unlikely(tasks[NR_RUNNING] && !oncpu);
case PSI_NONIDLE:
return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
tasks[NR_RUNNING];
@ -688,35 +690,53 @@ static void psi_group_change(struct psi_group *group, int cpu,
bool wake_clock)
{
struct psi_group_cpu *groupc;
u32 state_mask = 0;
unsigned int t, m;
enum psi_states s;
u32 state_mask;
groupc = per_cpu_ptr(group->pcpu, cpu);
/*
* First we assess the aggregate resource states this CPU's
* tasks have been in since the last change, and account any
* SOME and FULL time these may have resulted in.
*
* Then we update the task counts according to the state
* First we update the task counts according to the state
* change requested through the @clear and @set bits.
*
* Then if the cgroup PSI stats accounting enabled, we
* assess the aggregate resource states this CPU's tasks
* have been in since the last change, and account any
* SOME and FULL time these may have resulted in.
*/
write_seqcount_begin(&groupc->seq);
record_times(groupc, now);
/*
* Start with TSK_ONCPU, which doesn't have a corresponding
* task count - it's just a boolean flag directly encoded in
* the state mask. Clear, set, or carry the current state if
* no changes are requested.
*/
if (unlikely(clear & TSK_ONCPU)) {
state_mask = 0;
clear &= ~TSK_ONCPU;
} else if (unlikely(set & TSK_ONCPU)) {
state_mask = PSI_ONCPU;
set &= ~TSK_ONCPU;
} else {
state_mask = groupc->state_mask & PSI_ONCPU;
}
/*
* The rest of the state mask is calculated based on the task
* counts. Update those first, then construct the mask.
*/
for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
if (!(m & (1 << t)))
continue;
if (groupc->tasks[t]) {
groupc->tasks[t]--;
} else if (!psi_bug) {
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2],
groupc->tasks[3], groupc->tasks[4],
clear, set);
groupc->tasks[3], clear, set);
psi_bug = 1;
}
}
@ -725,9 +745,25 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (set & (1 << t))
groupc->tasks[t]++;
/* Calculate state mask representing active states */
if (!group->enabled) {
/*
* On the first group change after disabling PSI, conclude
* the current state and flush its time. This is unlikely
* to matter to the user, but aggregation (get_recent_times)
* may have already incorporated the live state into times_prev;
* avoid a delta sample underflow when PSI is later re-enabled.
*/
if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
record_times(groupc, now);
groupc->state_mask = state_mask;
write_seqcount_end(&groupc->seq);
return;
}
for (s = 0; s < NR_PSI_STATES; s++) {
if (test_state(groupc->tasks, s))
if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
state_mask |= (1 << s);
}
@ -739,9 +775,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
* task in a cgroup is in_memstall, the corresponding groupc
* on that cpu is in PSI_MEM_FULL state.
*/
if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
state_mask |= (1 << PSI_MEM_FULL);
record_times(groupc, now);
groupc->state_mask = state_mask;
write_seqcount_end(&groupc->seq);
@ -753,27 +791,12 @@ static void psi_group_change(struct psi_group *group, int cpu,
schedule_delayed_work(&group->avgs_work, PSI_FREQ);
}
static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
static inline struct psi_group *task_psi_group(struct task_struct *task)
{
if (*iter == &psi_system)
return NULL;
#ifdef CONFIG_CGROUPS
if (static_branch_likely(&psi_cgroups_enabled)) {
struct cgroup *cgroup = NULL;
if (!*iter)
cgroup = task->cgroups->dfl_cgrp;
else
cgroup = cgroup_parent(*iter);
if (cgroup && cgroup_parent(cgroup)) {
*iter = cgroup;
return cgroup_psi(cgroup);
}
}
if (static_branch_likely(&psi_cgroups_enabled))
return cgroup_psi(task_dfl_cgroup(task));
#endif
*iter = &psi_system;
return &psi_system;
}
@ -796,8 +819,6 @@ void psi_task_change(struct task_struct *task, int clear, int set)
{
int cpu = task_cpu(task);
struct psi_group *group;
bool wake_clock = true;
void *iter = NULL;
u64 now;
if (!task->pid)
@ -806,19 +827,11 @@ void psi_task_change(struct task_struct *task, int clear, int set)
psi_flags_change(task, clear, set);
now = cpu_clock(cpu);
/*
* Periodic aggregation shuts off if there is a period of no
* task changes, so we wake it back up if necessary. However,
* don't do this if the task change is the aggregation worker
* itself going to sleep, or we'll ping-pong forever.
*/
if (unlikely((clear & TSK_RUNNING) &&
(task->flags & PF_WQ_WORKER) &&
wq_worker_last_func(task) == psi_avgs_work))
wake_clock = false;
while ((group = iterate_groups(task, &iter)))
psi_group_change(group, cpu, clear, set, now, wake_clock);
group = task_psi_group(task);
do {
psi_group_change(group, cpu, clear, set, now, true);
} while ((group = group->parent));
}
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
@ -826,34 +839,30 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
{
struct psi_group *group, *common = NULL;
int cpu = task_cpu(prev);
void *iter;
u64 now = cpu_clock(cpu);
if (next->pid) {
bool identical_state;
psi_flags_change(next, 0, TSK_ONCPU);
/*
* When switching between tasks that have an identical
* runtime state, the cgroup that contains both tasks
* we reach the first common ancestor. Iterate @next's
* ancestors only until we encounter @prev's ONCPU.
* Set TSK_ONCPU on @next's cgroups. If @next shares any
* ancestors with @prev, those will already have @prev's
* TSK_ONCPU bit set, and we can stop the iteration there.
*/
identical_state = prev->psi_flags == next->psi_flags;
iter = NULL;
while ((group = iterate_groups(next, &iter))) {
if (identical_state &&
per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
group = task_psi_group(next);
do {
if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
PSI_ONCPU) {
common = group;
break;
}
psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
}
} while ((group = group->parent));
}
if (prev->pid) {
int clear = TSK_ONCPU, set = 0;
bool wake_clock = true;
/*
* When we're going to sleep, psi_dequeue() lets us
@ -867,26 +876,74 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
clear |= TSK_MEMSTALL_RUNNING;
if (prev->in_iowait)
set |= TSK_IOWAIT;
/*
* Periodic aggregation shuts off if there is a period of no
* task changes, so we wake it back up if necessary. However,
* don't do this if the task change is the aggregation worker
* itself going to sleep, or we'll ping-pong forever.
*/
if (unlikely((prev->flags & PF_WQ_WORKER) &&
wq_worker_last_func(prev) == psi_avgs_work))
wake_clock = false;
}
psi_flags_change(prev, clear, set);
iter = NULL;
while ((group = iterate_groups(prev, &iter)) && group != common)
psi_group_change(group, cpu, clear, set, now, true);
group = task_psi_group(prev);
do {
if (group == common)
break;
psi_group_change(group, cpu, clear, set, now, wake_clock);
} while ((group = group->parent));
/*
* TSK_ONCPU is handled up to the common ancestor. If we're tasked
* with dequeuing too, finish that for the rest of the hierarchy.
* TSK_ONCPU is handled up to the common ancestor. If there are
* any other differences between the two tasks (e.g. prev goes
* to sleep, or only one task is memstall), finish propagating
* those differences all the way up to the root.
*/
if (sleep) {
if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
clear &= ~TSK_ONCPU;
for (; group; group = iterate_groups(prev, &iter))
psi_group_change(group, cpu, clear, set, now, true);
for (; group; group = group->parent)
psi_group_change(group, cpu, clear, set, now, wake_clock);
}
}
}
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
void psi_account_irqtime(struct task_struct *task, u32 delta)
{
int cpu = task_cpu(task);
struct psi_group *group;
struct psi_group_cpu *groupc;
u64 now;
if (!task->pid)
return;
now = cpu_clock(cpu);
group = task_psi_group(task);
do {
if (!group->enabled)
continue;
groupc = per_cpu_ptr(group->pcpu, cpu);
write_seqcount_begin(&groupc->seq);
record_times(groupc, now);
groupc->times[PSI_IRQ_FULL] += delta;
write_seqcount_end(&groupc->seq);
if (group->poll_states & (1 << PSI_IRQ_FULL))
psi_schedule_poll_work(group, 1);
} while ((group = group->parent));
}
#endif
/**
* psi_memstall_enter - mark the beginning of a memory stall section
* @flags: flags to handle nested sections
@ -952,7 +1009,7 @@ EXPORT_SYMBOL_GPL(psi_memstall_leave);
#ifdef CONFIG_CGROUPS
int psi_cgroup_alloc(struct cgroup *cgroup)
{
if (static_branch_likely(&psi_disabled))
if (!static_branch_likely(&psi_cgroups_enabled))
return 0;
cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
@ -965,12 +1022,13 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
return -ENOMEM;
}
group_init(cgroup->psi);
cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup));
return 0;
}
void psi_cgroup_free(struct cgroup *cgroup)
{
if (static_branch_likely(&psi_disabled))
if (!static_branch_likely(&psi_cgroups_enabled))
return;
cancel_delayed_work_sync(&cgroup->psi->avgs_work);
@ -998,7 +1056,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
struct rq_flags rf;
struct rq *rq;
if (static_branch_likely(&psi_disabled)) {
if (!static_branch_likely(&psi_cgroups_enabled)) {
/*
* Lame to do this here, but the scheduler cannot be locked
* from the outside, so we move cgroups from inside sched/.
@ -1046,10 +1104,45 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
task_rq_unlock(rq, task, &rf);
}
void psi_cgroup_restart(struct psi_group *group)
{
int cpu;
/*
* After we disable psi_group->enabled, we don't actually
* stop percpu tasks accounting in each psi_group_cpu,
* instead only stop test_state() loop, record_times()
* and averaging worker, see psi_group_change() for details.
*
* When disable cgroup PSI, this function has nothing to sync
* since cgroup pressure files are hidden and percpu psi_group_cpu
* would see !psi_group->enabled and only do task accounting.
*
* When re-enable cgroup PSI, this function use psi_group_change()
* to get correct state mask from test_state() loop on tasks[],
* and restart groupc->state_start from now, use .clear = .set = 0
* here since no task status really changed.
*/
if (!group->enabled)
return;
for_each_possible_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
struct rq_flags rf;
u64 now;
rq_lock_irq(rq, &rf);
now = cpu_clock(cpu);
psi_group_change(group, cpu, 0, 0, now, true);
rq_unlock_irq(rq, &rf);
}
}
#endif /* CONFIG_CGROUPS */
int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
{
bool only_full = false;
int full;
u64 now;
@ -1064,7 +1157,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
group->avg_next_update = update_averages(group, now);
mutex_unlock(&group->avgs_lock);
for (full = 0; full < 2; full++) {
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
only_full = res == PSI_IRQ;
#endif
for (full = 0; full < 2 - only_full; full++) {
unsigned long avg[3] = { 0, };
u64 total = 0;
int w;
@ -1078,7 +1175,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
}
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
full ? "full" : "some",
full || only_full ? "full" : "some",
LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
@ -1106,6 +1203,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
else
return ERR_PTR(-EINVAL);
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
return ERR_PTR(-EINVAL);
#endif
if (state >= PSI_NONIDLE)
return ERR_PTR(-EINVAL);
@ -1390,6 +1492,33 @@ static const struct proc_ops psi_cpu_proc_ops = {
.proc_release = psi_fop_release,
};
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
static int psi_irq_show(struct seq_file *m, void *v)
{
return psi_show(m, &psi_system, PSI_IRQ);
}
static int psi_irq_open(struct inode *inode, struct file *file)
{
return psi_open(file, psi_irq_show);
}
static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
size_t nbytes, loff_t *ppos)
{
return psi_write(file, user_buf, nbytes, PSI_IRQ);
}
static const struct proc_ops psi_irq_proc_ops = {
.proc_open = psi_irq_open,
.proc_read = seq_read,
.proc_lseek = seq_lseek,
.proc_write = psi_irq_write,
.proc_poll = psi_fop_poll,
.proc_release = psi_fop_release,
};
#endif
static int __init psi_proc_init(void)
{
if (psi_enable) {
@ -1397,6 +1526,9 @@ static int __init psi_proc_init(void)
proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
#endif
}
return 0;
}

View File

@ -107,6 +107,11 @@ __schedstats_from_se(struct sched_entity *se)
}
#ifdef CONFIG_PSI
void psi_task_change(struct task_struct *task, int clear, int set);
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
bool sleep);
void psi_account_irqtime(struct task_struct *task, u32 delta);
/*
* PSI tracks state that persists across sleeps, such as iowaits and
* memory stalls. As a result, it has to distinguish between sleeps,
@ -201,6 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
static inline void psi_sched_switch(struct task_struct *prev,
struct task_struct *next,
bool sleep) {}
static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
#endif /* CONFIG_PSI */
#ifdef CONFIG_SCHED_INFO