PSI updates for v6.1:

 - Various performance optimizations, resulting in a 4%-9% speedup in
   the mmtests/config-scheduler-perfpipe micro-benchmark.

 - New interface to turn PSI on/off on a per cgroup level.

Signed-off-by: Ingo Molnar <mingo@kernel.org>

-----BEGIN PGP SIGNATURE-----

iQJFBAABCgAvFiEEBpT5eoXrXCwVQwEKEnMQ0APhK1gFAmNJKPsRHG1pbmdvQGtl
cm5lbC5vcmcACgkQEnMQ0APhK1iPmg//aovCitAQX2lLoHJDIgdQibU40oaEpKTX
wM549EGz3Dr6qmwF8+qT1U2Ge6af/hHQc5G/ZqDpKbuTjUIc3RmBkqX80dNKFLuH
uyi9UtfsSriw+ks8fWuDdjr+S4oppwW9ZoIXvK8v4bisd3F31DNGvKPTayNxt73m
lExfzJiD1oJixDxGX8MGO9QpcoywmjWjzjrB2P+J8hnTpArouHx/HOKdQOpG6wXq
ZRr9kZvju6ucDpXCTa1HJrfVRxNAh35tx/b4cDtXbBFifVAeKaPOrHapMTVsqfel
Z7T+2DymhidNYK0hrRJoGUwa/vkz+2Sm1ZLG9LlgUCXVco/9S1zw1ZuQakVvzPen
wriuxRaAkR+szCP0L8js5+/DAkGa43MjKsvQHmDVnetQtlsAD4eYnn+alQ837SXv
MP3jwFqF+e4mcWdoQcfh0OWUgGec5XZzdgRYrFkBKyTWGLB2iPivcAMNf0X/h82Q
xxv4DQJIIJ017GOQ/ho2saq+GbtFCvX8YnGYas9T47Bjjluhjo7jgTVtPTo+mhtN
RfwMdG718Ap/gvnAX7wMe/t+L/4AP8AIgDRi5L35dTRqETwOjH+LAvOYjleQFYgu
kMVtLMyzU+TGwHscuzPFRh7TnvSJ4sD48Ll1BPnyZsh3SS9u0gAs1bml7Cu7JbmW
SIZD/S/hzdI=
=91tB
-----END PGP SIGNATURE-----

Merge tag 'sched-psi-2022-10-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull PSI updates from Ingo Molnar:

 - Various performance optimizations, resulting in a 4%-9% speedup in
   the mmtests/config-scheduler-perfpipe micro-benchmark.

 - New interface to turn PSI on/off on a per cgroup level.

* tag 'sched-psi-2022-10-14' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  sched/psi: Per-cgroup PSI accounting disable/re-enable interface
  sched/psi: Cache parent psi_group to speed up group iteration
  sched/psi: Consolidate cgroup_psi()
  sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure
  sched/psi: Remove NR_ONCPU task accounting
  sched/psi: Optimize task switch inside shared cgroups again
  sched/psi: Move private helpers to sched/stats.h
  sched/psi: Save percpu memory when !psi_cgroups_enabled
  sched/psi: Don't create cgroup PSI files when psi_disabled
  sched/psi: Fix periodic aggregation shut off
commit bd9a3dba18
Documentation/admin-guide/cgroup-v2.rst:

@@ -976,6 +976,29 @@ All cgroup core files are prefixed with "cgroup."
 	killing cgroups is a process directed operation, i.e. it affects
 	the whole thread-group.
 
+  cgroup.pressure
+	A read-write single value file. The allowed values are "0" and "1".
+	The default is "1".
+
+	Writing "0" to the file disables the cgroup PSI accounting.
+	Writing "1" to the file re-enables the cgroup PSI accounting.
+
+	This control attribute is not hierarchical, so disabling or enabling
+	PSI accounting in a cgroup does not affect PSI accounting in its
+	descendants, and enablement does not need to be passed down from
+	the root through the ancestors.
+
+	The reason this control attribute exists is that PSI accounts
+	stalls for each cgroup separately and aggregates them at each
+	level of the hierarchy. This may cause non-negligible overhead
+	for some workloads deep in the hierarchy, in which case this
+	control attribute can be used to disable PSI accounting in the
+	non-leaf cgroups.
+
+  irq.pressure
+	A read-write nested-keyed file.
+
+	Shows pressure stall information for IRQ/SOFTIRQ. See
+	:ref:`Documentation/accounting/psi.rst <psi>` for details.
+
 Controllers
 ===========
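As a usage illustration (not part of the commit): a minimal userspace sketch of the new knob, assuming cgroup2 is mounted at /sys/fs/cgroup and a cgroup named "test" already exists; both paths are hypothetical. It disables PSI accounting for the cgroup (which hides its {cpu,memory,io,irq}.pressure files), re-enables it, and reads cpu.pressure back:

/* Minimal sketch (not part of the commit): toggling the new cgroup.pressure
 * knob from userspace. The cgroup path "/sys/fs/cgroup/test" is illustrative. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return -1;
	if (write(fd, val, strlen(val)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}

int main(void)
{
	char buf[256];
	int fd;

	/* Disable PSI accounting: {cpu,memory,io,irq}.pressure get hidden. */
	if (write_str("/sys/fs/cgroup/test/cgroup.pressure", "0"))
		perror("disable");

	/* Re-enable it; the pressure files reappear with restarted state. */
	if (write_str("/sys/fs/cgroup/test/cgroup.pressure", "1"))
		perror("enable");

	fd = open("/sys/fs/cgroup/test/cpu.pressure", O_RDONLY);
	if (fd >= 0) {
		ssize_t n = read(fd, buf, sizeof(buf) - 1);

		if (n > 0) {
			buf[n] = '\0';
			fputs(buf, stdout);	/* "some ..." / "full ..." lines */
		}
		close(fd);
	}
	return 0;
}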
include/linux/cgroup-defs.h:

@@ -428,6 +428,9 @@ struct cgroup {
 	struct cgroup_file procs_file;	/* handle for "cgroup.procs" */
 	struct cgroup_file events_file;	/* handle for "cgroup.events" */
 
+	/* handles for "{cpu,memory,io,irq}.pressure" */
+	struct cgroup_file psi_files[NR_PSI_RESOURCES];
+
 	/*
 	 * The bitmask of subsystems enabled on the child cgroups.
 	 * ->subtree_control is the one configured through
include/linux/cgroup.h:

@@ -682,11 +682,6 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
 	pr_cont_kernfs_path(cgrp->kn);
 }
 
-static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
-{
-	return cgrp->psi;
-}
-
 bool cgroup_psi_enabled(void);
 
 static inline void cgroup_init_kthreadd(void)
include/linux/psi.h:

@@ -7,6 +7,7 @@
 #include <linux/sched.h>
 #include <linux/poll.h>
 #include <linux/cgroup-defs.h>
+#include <linux/cgroup.h>
 
 struct seq_file;
 struct css_set;
@@ -18,10 +19,6 @@ extern struct psi_group psi_system;
 
 void psi_init(void);
 
-void psi_task_change(struct task_struct *task, int clear, int set);
-void psi_task_switch(struct task_struct *prev, struct task_struct *next,
-		     bool sleep);
-
 void psi_memstall_enter(unsigned long *flags);
 void psi_memstall_leave(unsigned long *flags);
 
@@ -34,9 +31,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
 			  poll_table *wait);
 
 #ifdef CONFIG_CGROUPS
+static inline struct psi_group *cgroup_psi(struct cgroup *cgrp)
+{
+	return cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+}
+
 int psi_cgroup_alloc(struct cgroup *cgrp);
 void psi_cgroup_free(struct cgroup *cgrp);
 void cgroup_move_task(struct task_struct *p, struct css_set *to);
+void psi_cgroup_restart(struct psi_group *group);
 #endif
 
 #else /* CONFIG_PSI */
@@ -58,6 +61,7 @@ static inline void cgroup_move_task(struct task_struct *p, struct css_set *to)
 {
 	rcu_assign_pointer(p->cgroups, to);
 }
+static inline void psi_cgroup_restart(struct psi_group *group) {}
 #endif
 
 #endif /* CONFIG_PSI */
include/linux/psi_types.h:

@@ -15,13 +15,6 @@ enum psi_task_count {
 	NR_IOWAIT,
 	NR_MEMSTALL,
 	NR_RUNNING,
-	/*
-	 * This can't have values other than 0 or 1 and could be
-	 * implemented as a bit flag. But for now we still have room
-	 * in the first cacheline of psi_group_cpu, and this way we
-	 * don't have to special case any state tracking for it.
-	 */
-	NR_ONCPU,
 	/*
 	 * For IO and CPU stalls the presence of running/oncpu tasks
 	 * in the domain means a partial rather than a full stall.
@@ -32,22 +25,27 @@ enum psi_task_count {
 	 * threads and memstall ones.
 	 */
 	NR_MEMSTALL_RUNNING,
-	NR_PSI_TASK_COUNTS = 5,
+	NR_PSI_TASK_COUNTS = 4,
 };
 
 /* Task state bitmasks */
 #define TSK_IOWAIT	(1 << NR_IOWAIT)
 #define TSK_MEMSTALL	(1 << NR_MEMSTALL)
 #define TSK_RUNNING	(1 << NR_RUNNING)
-#define TSK_ONCPU	(1 << NR_ONCPU)
 #define TSK_MEMSTALL_RUNNING	(1 << NR_MEMSTALL_RUNNING)
 
+/* Only one task can be scheduled, no corresponding task count */
+#define TSK_ONCPU	(1 << NR_PSI_TASK_COUNTS)
+
 /* Resources that workloads could be stalled on */
 enum psi_res {
 	PSI_IO,
 	PSI_MEM,
 	PSI_CPU,
-	NR_PSI_RESOURCES = 3,
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	PSI_IRQ,
+#endif
+	NR_PSI_RESOURCES,
 };
 
 /*
@@ -63,11 +61,17 @@ enum psi_states {
 	PSI_MEM_FULL,
 	PSI_CPU_SOME,
 	PSI_CPU_FULL,
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	PSI_IRQ_FULL,
+#endif
 	/* Only per-CPU, to weigh the CPU in the global average: */
 	PSI_NONIDLE,
-	NR_PSI_STATES = 7,
+	NR_PSI_STATES,
 };
 
+/* Use one bit in the state mask to track TSK_ONCPU */
+#define PSI_ONCPU	(1 << NR_PSI_STATES)
+
 enum psi_aggregators {
 	PSI_AVGS = 0,
 	PSI_POLL,
@@ -147,6 +151,9 @@ struct psi_trigger {
 };
 
 struct psi_group {
+	struct psi_group *parent;
+	bool enabled;
+
 	/* Protects data used by the aggregator */
 	struct mutex avgs_lock;
 
@@ -188,6 +195,8 @@ struct psi_group {
 
 #else /* CONFIG_PSI */
 
+#define NR_PSI_RESOURCES	0
+
 struct psi_group { };
 
 #endif /* CONFIG_PSI */
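To make the new encoding concrete: a standalone sketch (illustrative, not part of the commit) that mirrors the enum/macro layout above, using the !CONFIG_IRQ_TIME_ACCOUNTING variant for brevity. The four counted task states occupy bits 0-3 and have tasks[] slots; TSK_ONCPU is only a flag in the next bit, exactly as PSI_ONCPU occupies the first bit past the tracked states in the per-CPU state mask:

/* Standalone illustration (not kernel code) of the new bit layout. */
#include <stdio.h>

enum psi_task_count { NR_IOWAIT, NR_MEMSTALL, NR_RUNNING, NR_MEMSTALL_RUNNING,
		      NR_PSI_TASK_COUNTS = 4 };

#define TSK_IOWAIT		(1 << NR_IOWAIT)
#define TSK_MEMSTALL		(1 << NR_MEMSTALL)
#define TSK_RUNNING		(1 << NR_RUNNING)
#define TSK_MEMSTALL_RUNNING	(1 << NR_MEMSTALL_RUNNING)
/* No corresponding task count: just a flag above the counted bits. */
#define TSK_ONCPU		(1 << NR_PSI_TASK_COUNTS)

int main(void)
{
	printf("counted state bits: 0x%x\n",
	       TSK_IOWAIT | TSK_MEMSTALL | TSK_RUNNING | TSK_MEMSTALL_RUNNING);
	printf("TSK_ONCPU flag:     0x%x\n", TSK_ONCPU);	/* 0x10 */
	return 0;
}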
kernel/cgroup/cgroup.c:

@@ -3698,27 +3698,27 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
 static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
 {
 	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+	struct psi_group *psi = cgroup_psi(cgrp);
 
 	return psi_show(seq, psi, PSI_IO);
 }
 static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
 {
 	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+	struct psi_group *psi = cgroup_psi(cgrp);
 
 	return psi_show(seq, psi, PSI_MEM);
 }
 static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
 {
 	struct cgroup *cgrp = seq_css(seq)->cgroup;
-	struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+	struct psi_group *psi = cgroup_psi(cgrp);
 
 	return psi_show(seq, psi, PSI_CPU);
 }
 
-static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
-				     size_t nbytes, enum psi_res res)
+static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
+			      size_t nbytes, enum psi_res res)
 {
 	struct cgroup_file_ctx *ctx = of->priv;
 	struct psi_trigger *new;
@@ -3738,7 +3738,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
 		return -EBUSY;
 	}
 
-	psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi;
+	psi = cgroup_psi(cgrp);
 	new = psi_trigger_create(psi, buf, res);
 	if (IS_ERR(new)) {
 		cgroup_put(cgrp);
@@ -3755,21 +3755,86 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
 					  char *buf, size_t nbytes,
 					  loff_t off)
 {
-	return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+	return pressure_write(of, buf, nbytes, PSI_IO);
 }
 
 static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
 					    char *buf, size_t nbytes,
 					    loff_t off)
 {
-	return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+	return pressure_write(of, buf, nbytes, PSI_MEM);
 }
 
 static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
 					 char *buf, size_t nbytes,
 					 loff_t off)
 {
-	return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+	return pressure_write(of, buf, nbytes, PSI_CPU);
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	struct psi_group *psi = cgroup_psi(cgrp);
+
+	return psi_show(seq, psi, PSI_IRQ);
+}
+
+static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
+					 char *buf, size_t nbytes,
+					 loff_t off)
+{
+	return pressure_write(of, buf, nbytes, PSI_IRQ);
+}
+#endif
+
+static int cgroup_pressure_show(struct seq_file *seq, void *v)
+{
+	struct cgroup *cgrp = seq_css(seq)->cgroup;
+	struct psi_group *psi = cgroup_psi(cgrp);
+
+	seq_printf(seq, "%d\n", psi->enabled);
+
+	return 0;
+}
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
+				     char *buf, size_t nbytes,
+				     loff_t off)
+{
+	ssize_t ret;
+	int enable;
+	struct cgroup *cgrp;
+	struct psi_group *psi;
+
+	ret = kstrtoint(strstrip(buf), 0, &enable);
+	if (ret)
+		return ret;
+
+	if (enable < 0 || enable > 1)
+		return -ERANGE;
+
+	cgrp = cgroup_kn_lock_live(of->kn, false);
+	if (!cgrp)
+		return -ENOENT;
+
+	psi = cgroup_psi(cgrp);
+	if (psi->enabled != enable) {
+		int i;
+
+		/* show or hide {cpu,memory,io,irq}.pressure files */
+		for (i = 0; i < NR_PSI_RESOURCES; i++)
+			cgroup_file_show(&cgrp->psi_files[i], enable);
+
+		psi->enabled = enable;
+		if (enable)
+			psi_cgroup_restart(psi);
+	}
+
+	cgroup_kn_unlock(of->kn);
+
+	return nbytes;
+}
+
 static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
@@ -3789,6 +3854,9 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
 
 bool cgroup_psi_enabled(void)
 {
+	if (static_branch_likely(&psi_disabled))
+		return false;
+
 	return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
 }
 
@@ -5175,6 +5243,7 @@ static struct cftype cgroup_psi_files[] = {
 #ifdef CONFIG_PSI
 	{
 		.name = "io.pressure",
+		.file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
 		.seq_show = cgroup_io_pressure_show,
 		.write = cgroup_io_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -5182,6 +5251,7 @@ static struct cftype cgroup_psi_files[] = {
 	},
 	{
 		.name = "memory.pressure",
+		.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
 		.seq_show = cgroup_memory_pressure_show,
 		.write = cgroup_memory_pressure_write,
 		.poll = cgroup_pressure_poll,
@@ -5189,11 +5259,27 @@ static struct cftype cgroup_psi_files[] = {
 	},
 	{
 		.name = "cpu.pressure",
+		.file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
 		.seq_show = cgroup_cpu_pressure_show,
 		.write = cgroup_cpu_pressure_write,
 		.poll = cgroup_pressure_poll,
 		.release = cgroup_pressure_release,
 	},
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	{
+		.name = "irq.pressure",
+		.file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+		.seq_show = cgroup_irq_pressure_show,
+		.write = cgroup_irq_pressure_write,
+		.poll = cgroup_pressure_poll,
+		.release = cgroup_pressure_release,
+	},
+#endif
+	{
+		.name = "cgroup.pressure",
+		.seq_show = cgroup_pressure_show,
+		.write = cgroup_pressure_write,
+	},
 #endif /* CONFIG_PSI */
 	{ }	/* terminate */
 };
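Beyond the on/off knob, the (renamed) pressure_write() path above still creates PSI triggers. A minimal sketch (not part of the commit) of registering a trigger on a cgroup's new irq.pressure file, following the trigger interface described in Documentation/accounting/psi.rst; the cgroup path is hypothetical, and note that psi_trigger_create() (see the psi.c hunks below) rejects anything but "full" triggers for PSI_IRQ:

/* Minimal sketch: a PSI trigger on irq.pressure, then waiting for an event. */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char trig[] = "full 150000 1000000";	/* 150ms stall per 1s window */
	struct pollfd fds;
	int fd;

	fd = open("/sys/fs/cgroup/test/irq.pressure", O_RDWR | O_NONBLOCK);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, trig, strlen(trig) + 1) < 0) {
		perror("write");
		return 1;
	}

	fds.fd = fd;
	fds.events = POLLPRI;
	if (poll(&fds, 1, -1) > 0 && (fds.revents & POLLPRI))
		puts("IRQ pressure threshold crossed");

	close(fd);
	return 0;
}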
kernel/sched/core.c:

@@ -701,6 +701,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 
 	rq->prev_irq_time += irq_delta;
 	delta -= irq_delta;
+	psi_account_irqtime(rq->curr, irq_delta);
 #endif
 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
 	if (static_key_false((&paravirt_steal_rq_enabled))) {
kernel/sched/psi.c:

@@ -181,6 +181,7 @@ static void group_init(struct psi_group *group)
 {
 	int cpu;
 
+	group->enabled = true;
 	for_each_possible_cpu(cpu)
 		seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
 	group->avg_last_update = sched_clock();
@@ -201,6 +202,7 @@ void __init psi_init(void)
 {
 	if (!psi_enable) {
 		static_branch_enable(&psi_disabled);
+		static_branch_disable(&psi_cgroups_enabled);
 		return;
 	}
 
@@ -211,7 +213,7 @@ void __init psi_init(void)
 	group_init(&psi_system);
 }
 
-static bool test_state(unsigned int *tasks, enum psi_states state)
+static bool test_state(unsigned int *tasks, enum psi_states state, bool oncpu)
 {
 	switch (state) {
 	case PSI_IO_SOME:
@@ -224,9 +226,9 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
 		return unlikely(tasks[NR_MEMSTALL] &&
 			tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
 	case PSI_CPU_SOME:
-		return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
+		return unlikely(tasks[NR_RUNNING] > oncpu);
 	case PSI_CPU_FULL:
-		return unlikely(tasks[NR_RUNNING] && !tasks[NR_ONCPU]);
+		return unlikely(tasks[NR_RUNNING] && !oncpu);
 	case PSI_NONIDLE:
 		return tasks[NR_IOWAIT] || tasks[NR_MEMSTALL] ||
 		       tasks[NR_RUNNING];
|
||||
bool wake_clock)
|
||||
{
|
||||
struct psi_group_cpu *groupc;
|
||||
u32 state_mask = 0;
|
||||
unsigned int t, m;
|
||||
enum psi_states s;
|
||||
u32 state_mask;
|
||||
|
||||
groupc = per_cpu_ptr(group->pcpu, cpu);
|
||||
|
||||
/*
|
||||
* First we assess the aggregate resource states this CPU's
|
||||
* tasks have been in since the last change, and account any
|
||||
* SOME and FULL time these may have resulted in.
|
||||
*
|
||||
* Then we update the task counts according to the state
|
||||
* First we update the task counts according to the state
|
||||
* change requested through the @clear and @set bits.
|
||||
*
|
||||
* Then if the cgroup PSI stats accounting enabled, we
|
||||
* assess the aggregate resource states this CPU's tasks
|
||||
* have been in since the last change, and account any
|
||||
* SOME and FULL time these may have resulted in.
|
||||
*/
|
||||
write_seqcount_begin(&groupc->seq);
|
||||
|
||||
record_times(groupc, now);
|
||||
/*
|
||||
* Start with TSK_ONCPU, which doesn't have a corresponding
|
||||
* task count - it's just a boolean flag directly encoded in
|
||||
* the state mask. Clear, set, or carry the current state if
|
||||
* no changes are requested.
|
||||
*/
|
||||
if (unlikely(clear & TSK_ONCPU)) {
|
||||
state_mask = 0;
|
||||
clear &= ~TSK_ONCPU;
|
||||
} else if (unlikely(set & TSK_ONCPU)) {
|
||||
state_mask = PSI_ONCPU;
|
||||
set &= ~TSK_ONCPU;
|
||||
} else {
|
||||
state_mask = groupc->state_mask & PSI_ONCPU;
|
||||
}
|
||||
|
||||
/*
|
||||
* The rest of the state mask is calculated based on the task
|
||||
* counts. Update those first, then construct the mask.
|
||||
*/
|
||||
for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
|
||||
if (!(m & (1 << t)))
|
||||
continue;
|
||||
if (groupc->tasks[t]) {
|
||||
groupc->tasks[t]--;
|
||||
} else if (!psi_bug) {
|
||||
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
|
||||
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
|
||||
cpu, t, groupc->tasks[0],
|
||||
groupc->tasks[1], groupc->tasks[2],
|
||||
groupc->tasks[3], groupc->tasks[4],
|
||||
clear, set);
|
||||
groupc->tasks[3], clear, set);
|
||||
psi_bug = 1;
|
||||
}
|
||||
}
|
||||
@ -725,9 +745,25 @@ static void psi_group_change(struct psi_group *group, int cpu,
|
||||
if (set & (1 << t))
|
||||
groupc->tasks[t]++;
|
||||
|
||||
/* Calculate state mask representing active states */
|
||||
if (!group->enabled) {
|
||||
/*
|
||||
* On the first group change after disabling PSI, conclude
|
||||
* the current state and flush its time. This is unlikely
|
||||
* to matter to the user, but aggregation (get_recent_times)
|
||||
* may have already incorporated the live state into times_prev;
|
||||
* avoid a delta sample underflow when PSI is later re-enabled.
|
||||
*/
|
||||
if (unlikely(groupc->state_mask & (1 << PSI_NONIDLE)))
|
||||
record_times(groupc, now);
|
||||
|
||||
groupc->state_mask = state_mask;
|
||||
|
||||
write_seqcount_end(&groupc->seq);
|
||||
return;
|
||||
}
|
||||
|
||||
for (s = 0; s < NR_PSI_STATES; s++) {
|
||||
if (test_state(groupc->tasks, s))
|
||||
if (test_state(groupc->tasks, s, state_mask & PSI_ONCPU))
|
||||
state_mask |= (1 << s);
|
||||
}
|
||||
|
||||
@ -739,9 +775,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
|
||||
* task in a cgroup is in_memstall, the corresponding groupc
|
||||
* on that cpu is in PSI_MEM_FULL state.
|
||||
*/
|
||||
if (unlikely(groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall))
|
||||
if (unlikely((state_mask & PSI_ONCPU) && cpu_curr(cpu)->in_memstall))
|
||||
state_mask |= (1 << PSI_MEM_FULL);
|
||||
|
||||
record_times(groupc, now);
|
||||
|
||||
groupc->state_mask = state_mask;
|
||||
|
||||
write_seqcount_end(&groupc->seq);
|
||||
@ -753,27 +791,12 @@ static void psi_group_change(struct psi_group *group, int cpu,
|
||||
schedule_delayed_work(&group->avgs_work, PSI_FREQ);
|
||||
}
|
||||
|
||||
static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
|
||||
static inline struct psi_group *task_psi_group(struct task_struct *task)
|
||||
{
|
||||
if (*iter == &psi_system)
|
||||
return NULL;
|
||||
|
||||
#ifdef CONFIG_CGROUPS
|
||||
if (static_branch_likely(&psi_cgroups_enabled)) {
|
||||
struct cgroup *cgroup = NULL;
|
||||
|
||||
if (!*iter)
|
||||
cgroup = task->cgroups->dfl_cgrp;
|
||||
else
|
||||
cgroup = cgroup_parent(*iter);
|
||||
|
||||
if (cgroup && cgroup_parent(cgroup)) {
|
||||
*iter = cgroup;
|
||||
return cgroup_psi(cgroup);
|
||||
}
|
||||
}
|
||||
if (static_branch_likely(&psi_cgroups_enabled))
|
||||
return cgroup_psi(task_dfl_cgroup(task));
|
||||
#endif
|
||||
*iter = &psi_system;
|
||||
return &psi_system;
|
||||
}
|
||||
|
||||
@ -796,8 +819,6 @@ void psi_task_change(struct task_struct *task, int clear, int set)
|
||||
{
|
||||
int cpu = task_cpu(task);
|
||||
struct psi_group *group;
|
||||
bool wake_clock = true;
|
||||
void *iter = NULL;
|
||||
u64 now;
|
||||
|
||||
if (!task->pid)
|
||||
@ -806,19 +827,11 @@ void psi_task_change(struct task_struct *task, int clear, int set)
|
||||
psi_flags_change(task, clear, set);
|
||||
|
||||
now = cpu_clock(cpu);
|
||||
/*
|
||||
* Periodic aggregation shuts off if there is a period of no
|
||||
* task changes, so we wake it back up if necessary. However,
|
||||
* don't do this if the task change is the aggregation worker
|
||||
* itself going to sleep, or we'll ping-pong forever.
|
||||
*/
|
||||
if (unlikely((clear & TSK_RUNNING) &&
|
||||
(task->flags & PF_WQ_WORKER) &&
|
||||
wq_worker_last_func(task) == psi_avgs_work))
|
||||
wake_clock = false;
|
||||
|
||||
while ((group = iterate_groups(task, &iter)))
|
||||
psi_group_change(group, cpu, clear, set, now, wake_clock);
|
||||
group = task_psi_group(task);
|
||||
do {
|
||||
psi_group_change(group, cpu, clear, set, now, true);
|
||||
} while ((group = group->parent));
|
||||
}
|
||||
|
||||
void psi_task_switch(struct task_struct *prev, struct task_struct *next,
|
||||
@ -826,34 +839,30 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
|
||||
{
|
||||
struct psi_group *group, *common = NULL;
|
||||
int cpu = task_cpu(prev);
|
||||
void *iter;
|
||||
u64 now = cpu_clock(cpu);
|
||||
|
||||
if (next->pid) {
|
||||
bool identical_state;
|
||||
|
||||
psi_flags_change(next, 0, TSK_ONCPU);
|
||||
/*
|
||||
* When switching between tasks that have an identical
|
||||
* runtime state, the cgroup that contains both tasks
|
||||
* we reach the first common ancestor. Iterate @next's
|
||||
* ancestors only until we encounter @prev's ONCPU.
|
||||
* Set TSK_ONCPU on @next's cgroups. If @next shares any
|
||||
* ancestors with @prev, those will already have @prev's
|
||||
* TSK_ONCPU bit set, and we can stop the iteration there.
|
||||
*/
|
||||
identical_state = prev->psi_flags == next->psi_flags;
|
||||
iter = NULL;
|
||||
while ((group = iterate_groups(next, &iter))) {
|
||||
if (identical_state &&
|
||||
per_cpu_ptr(group->pcpu, cpu)->tasks[NR_ONCPU]) {
|
||||
group = task_psi_group(next);
|
||||
do {
|
||||
if (per_cpu_ptr(group->pcpu, cpu)->state_mask &
|
||||
PSI_ONCPU) {
|
||||
common = group;
|
||||
break;
|
||||
}
|
||||
|
||||
psi_group_change(group, cpu, 0, TSK_ONCPU, now, true);
|
||||
}
|
||||
} while ((group = group->parent));
|
||||
}
|
||||
|
||||
if (prev->pid) {
|
||||
int clear = TSK_ONCPU, set = 0;
|
||||
bool wake_clock = true;
|
||||
|
||||
/*
|
||||
* When we're going to sleep, psi_dequeue() lets us
|
||||
@ -867,26 +876,74 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
|
||||
clear |= TSK_MEMSTALL_RUNNING;
|
||||
if (prev->in_iowait)
|
||||
set |= TSK_IOWAIT;
|
||||
|
||||
/*
|
||||
* Periodic aggregation shuts off if there is a period of no
|
||||
* task changes, so we wake it back up if necessary. However,
|
||||
* don't do this if the task change is the aggregation worker
|
||||
* itself going to sleep, or we'll ping-pong forever.
|
||||
*/
|
||||
if (unlikely((prev->flags & PF_WQ_WORKER) &&
|
||||
wq_worker_last_func(prev) == psi_avgs_work))
|
||||
wake_clock = false;
|
||||
}
|
||||
|
||||
psi_flags_change(prev, clear, set);
|
||||
|
||||
iter = NULL;
|
||||
while ((group = iterate_groups(prev, &iter)) && group != common)
|
||||
psi_group_change(group, cpu, clear, set, now, true);
|
||||
group = task_psi_group(prev);
|
||||
do {
|
||||
if (group == common)
|
||||
break;
|
||||
psi_group_change(group, cpu, clear, set, now, wake_clock);
|
||||
} while ((group = group->parent));
|
||||
|
||||
/*
|
||||
* TSK_ONCPU is handled up to the common ancestor. If we're tasked
|
||||
* with dequeuing too, finish that for the rest of the hierarchy.
|
||||
* TSK_ONCPU is handled up to the common ancestor. If there are
|
||||
* any other differences between the two tasks (e.g. prev goes
|
||||
* to sleep, or only one task is memstall), finish propagating
|
||||
* those differences all the way up to the root.
|
||||
*/
|
||||
if (sleep) {
|
||||
if ((prev->psi_flags ^ next->psi_flags) & ~TSK_ONCPU) {
|
||||
clear &= ~TSK_ONCPU;
|
||||
for (; group; group = iterate_groups(prev, &iter))
|
||||
psi_group_change(group, cpu, clear, set, now, true);
|
||||
for (; group; group = group->parent)
|
||||
psi_group_change(group, cpu, clear, set, now, wake_clock);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
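The last hunk above replaces the old `if (sleep)` gate with a flags comparison. A small standalone illustration (not kernel code) of how the XOR isolates state bits that still differ between @prev and @next once TSK_ONCPU is masked out; the flag values mirror the new psi_types.h layout:

/* Illustrative only: deciding whether the walk must continue past the
 * common ancestor. */
#include <stdio.h>

#define TSK_IOWAIT		(1 << 0)
#define TSK_MEMSTALL		(1 << 1)
#define TSK_RUNNING		(1 << 2)
#define TSK_MEMSTALL_RUNNING	(1 << 3)
#define TSK_ONCPU		(1 << 4)

int main(void)
{
	unsigned int prev = TSK_ONCPU | TSK_RUNNING | TSK_MEMSTALL;
	unsigned int next = TSK_ONCPU | TSK_RUNNING;

	/* TSK_ONCPU is already handled up to the common ancestor; only the
	 * remaining differences (here: TSK_MEMSTALL) force the rest of the
	 * hierarchy to be updated. */
	if ((prev ^ next) & ~TSK_ONCPU)
		printf("propagate to root, diff=0x%x\n",
		       (prev ^ next) & ~TSK_ONCPU);
	return 0;
}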
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+void psi_account_irqtime(struct task_struct *task, u32 delta)
+{
+	int cpu = task_cpu(task);
+	struct psi_group *group;
+	struct psi_group_cpu *groupc;
+	u64 now;
+
+	if (!task->pid)
+		return;
+
+	now = cpu_clock(cpu);
+
+	group = task_psi_group(task);
+	do {
+		if (!group->enabled)
+			continue;
+
+		groupc = per_cpu_ptr(group->pcpu, cpu);
+
+		write_seqcount_begin(&groupc->seq);
+
+		record_times(groupc, now);
+		groupc->times[PSI_IRQ_FULL] += delta;
+
+		write_seqcount_end(&groupc->seq);
+
+		if (group->poll_states & (1 << PSI_IRQ_FULL))
+			psi_schedule_poll_work(group, 1);
+	} while ((group = group->parent));
+}
+#endif
 
 /**
  * psi_memstall_enter - mark the beginning of a memory stall section
  * @flags: flags to handle nested sections
@@ -952,7 +1009,7 @@ EXPORT_SYMBOL_GPL(psi_memstall_leave);
 #ifdef CONFIG_CGROUPS
 int psi_cgroup_alloc(struct cgroup *cgroup)
 {
-	if (static_branch_likely(&psi_disabled))
+	if (!static_branch_likely(&psi_cgroups_enabled))
 		return 0;
 
 	cgroup->psi = kzalloc(sizeof(struct psi_group), GFP_KERNEL);
@@ -965,12 +1022,13 @@ int psi_cgroup_alloc(struct cgroup *cgroup)
 		return -ENOMEM;
 	}
 	group_init(cgroup->psi);
+	cgroup->psi->parent = cgroup_psi(cgroup_parent(cgroup));
 	return 0;
 }
 
 void psi_cgroup_free(struct cgroup *cgroup)
 {
-	if (static_branch_likely(&psi_disabled))
+	if (!static_branch_likely(&psi_cgroups_enabled))
 		return;
 
 	cancel_delayed_work_sync(&cgroup->psi->avgs_work);
@@ -998,7 +1056,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
 	struct rq_flags rf;
 	struct rq *rq;
 
-	if (static_branch_likely(&psi_disabled)) {
+	if (!static_branch_likely(&psi_cgroups_enabled)) {
 		/*
 		 * Lame to do this here, but the scheduler cannot be locked
 		 * from the outside, so we move cgroups from inside sched/.
@@ -1046,10 +1104,45 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
 
 	task_rq_unlock(rq, task, &rf);
 }
 
+void psi_cgroup_restart(struct psi_group *group)
+{
+	int cpu;
+
+	/*
+	 * After we disable psi_group->enabled, we don't actually
+	 * stop percpu tasks accounting in each psi_group_cpu,
+	 * instead we only stop the test_state() loop, record_times()
+	 * and the averaging worker, see psi_group_change() for details.
+	 *
+	 * When disabling cgroup PSI, this function has nothing to sync
+	 * since the cgroup pressure files are hidden and each percpu
+	 * psi_group_cpu would see !psi_group->enabled and only do task
+	 * accounting.
+	 *
+	 * When re-enabling cgroup PSI, this function uses psi_group_change()
+	 * to get the correct state mask from the test_state() loop on
+	 * tasks[], and restarts groupc->state_start from now, using
+	 * .clear = .set = 0 here since no task state really changed.
+	 */
+	if (!group->enabled)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+		struct rq_flags rf;
+		u64 now;
+
+		rq_lock_irq(rq, &rf);
+		now = cpu_clock(cpu);
+		psi_group_change(group, cpu, 0, 0, now, true);
+		rq_unlock_irq(rq, &rf);
+	}
+}
 #endif /* CONFIG_CGROUPS */
 
 int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
 {
+	bool only_full = false;
 	int full;
 	u64 now;
 
|
||||
group->avg_next_update = update_averages(group, now);
|
||||
mutex_unlock(&group->avgs_lock);
|
||||
|
||||
for (full = 0; full < 2; full++) {
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
only_full = res == PSI_IRQ;
|
||||
#endif
|
||||
|
||||
for (full = 0; full < 2 - only_full; full++) {
|
||||
unsigned long avg[3] = { 0, };
|
||||
u64 total = 0;
|
||||
int w;
|
||||
@ -1078,7 +1175,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
|
||||
}
|
||||
|
||||
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
|
||||
full ? "full" : "some",
|
||||
full || only_full ? "full" : "some",
|
||||
LOAD_INT(avg[0]), LOAD_FRAC(avg[0]),
|
||||
LOAD_INT(avg[1]), LOAD_FRAC(avg[1]),
|
||||
LOAD_INT(avg[2]), LOAD_FRAC(avg[2]),
|
||||
@ -1106,6 +1203,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
|
||||
else
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
if (res == PSI_IRQ && --state != PSI_IRQ_FULL)
|
||||
return ERR_PTR(-EINVAL);
|
||||
#endif
|
||||
|
||||
if (state >= PSI_NONIDLE)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
@ -1390,6 +1492,33 @@ static const struct proc_ops psi_cpu_proc_ops = {
|
||||
.proc_release = psi_fop_release,
|
||||
};
|
||||
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
static int psi_irq_show(struct seq_file *m, void *v)
|
||||
{
|
||||
return psi_show(m, &psi_system, PSI_IRQ);
|
||||
}
|
||||
|
||||
static int psi_irq_open(struct inode *inode, struct file *file)
|
||||
{
|
||||
return psi_open(file, psi_irq_show);
|
||||
}
|
||||
|
||||
static ssize_t psi_irq_write(struct file *file, const char __user *user_buf,
|
||||
size_t nbytes, loff_t *ppos)
|
||||
{
|
||||
return psi_write(file, user_buf, nbytes, PSI_IRQ);
|
||||
}
|
||||
|
||||
static const struct proc_ops psi_irq_proc_ops = {
|
||||
.proc_open = psi_irq_open,
|
||||
.proc_read = seq_read,
|
||||
.proc_lseek = seq_lseek,
|
||||
.proc_write = psi_irq_write,
|
||||
.proc_poll = psi_fop_poll,
|
||||
.proc_release = psi_fop_release,
|
||||
};
|
||||
#endif
|
||||
|
||||
static int __init psi_proc_init(void)
|
||||
{
|
||||
if (psi_enable) {
|
||||
@ -1397,6 +1526,9 @@ static int __init psi_proc_init(void)
|
||||
proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops);
|
||||
proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops);
|
||||
proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops);
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops);
|
||||
#endif
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
kernel/sched/stats.h:

@@ -107,6 +107,11 @@ __schedstats_from_se(struct sched_entity *se)
 }
 
 #ifdef CONFIG_PSI
+void psi_task_change(struct task_struct *task, int clear, int set);
+void psi_task_switch(struct task_struct *prev, struct task_struct *next,
+		     bool sleep);
+void psi_account_irqtime(struct task_struct *task, u32 delta);
+
 /*
  * PSI tracks state that persists across sleeps, such as iowaits and
  * memory stalls. As a result, it has to distinguish between sleeps,
@@ -201,6 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {}
 static inline void psi_sched_switch(struct task_struct *prev,
 				    struct task_struct *next,
 				    bool sleep) {}
+static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {}
 #endif /* CONFIG_PSI */
 
 #ifdef CONFIG_SCHED_INFO