psi: Fix PSI_MEM_FULL state when tasks are in memstall and doing reclaim

We've noticed cases where tasks in a cgroup are stalled on memory but
there is little memory FULL pressure since tasks stay on the runqueue
in reclaim.

A simple example involves a single threaded program that keeps leaking
and touching large amounts of memory. It runs in a cgroup with swap
enabled, memory.high set at 10M and cpu.max ratio set at 5%. Though
there is significant CPU pressure and memory SOME, there is barely any
memory FULL since the task enters reclaim and stays on the runqueue.
However, this memory-bound task is effectively stalled on memory and
we expect memory FULL to match memory SOME in this scenario.

The code is confused about memstall && running, thinking there is a
stalled task and a productive task when there's only one task: a
reclaimer that's counted as both. To fix this, we redefine the
condition for PSI_MEM_FULL to check that all running tasks are in an
active memstall instead of checking that there are no running tasks.

        case PSI_MEM_FULL:
-               return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
+               return unlikely(tasks[NR_MEMSTALL] &&
+                       tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);

This will capture reclaimers. It will also capture tasks that called
psi_memstall_enter() and are about to sleep, but this should be
negligible noise.

Signed-off-by: Brian Chen <brianchen118@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lore.kernel.org/r/20211110213312.310243-1-brianchen118@gmail.com
This commit is contained in:
Brian Chen 2021-11-10 21:33:12 +00:00 committed by Peter Zijlstra
parent 4feee7d126
commit cb0e52b774
3 changed files with 44 additions and 19 deletions

View File

@ -22,7 +22,17 @@ enum psi_task_count {
* don't have to special case any state tracking for it.
*/
NR_ONCPU,
NR_PSI_TASK_COUNTS = 4,
/*
* For IO and CPU stalls the presence of running/oncpu tasks
* in the domain means a partial rather than a full stall.
* For memory it's not so simple because of page reclaimers:
* they are running/oncpu while representing a stall. To tell
* whether a domain has productivity left or not, we need to
* distinguish between regular running (i.e. productive)
* threads and memstall ones.
*/
NR_MEMSTALL_RUNNING,
NR_PSI_TASK_COUNTS = 5,
};
/* Task state bitmasks */
@ -30,6 +40,7 @@ enum psi_task_count {
#define TSK_MEMSTALL (1 << NR_MEMSTALL)
#define TSK_RUNNING (1 << NR_RUNNING)
#define TSK_ONCPU (1 << NR_ONCPU)
#define TSK_MEMSTALL_RUNNING (1 << NR_MEMSTALL_RUNNING)
/* Resources that workloads could be stalled on */
enum psi_res {

View File

@ -35,13 +35,19 @@
* delayed on that resource such that nobody is advancing and the CPU
* goes idle. This leaves both workload and CPU unproductive.
*
* Naturally, the FULL state doesn't exist for the CPU resource at the
* system level, but exist at the cgroup level, means all non-idle tasks
* in a cgroup are delayed on the CPU resource which used by others outside
* of the cgroup or throttled by the cgroup cpu.max configuration.
*
* SOME = nr_delayed_tasks != 0
* FULL = nr_delayed_tasks != 0 && nr_running_tasks == 0
* FULL = nr_delayed_tasks != 0 && nr_productive_tasks == 0
*
* What it means for a task to be productive is defined differently
* for each resource. For IO, productive means a running task. For
* memory, productive means a running task that isn't a reclaimer. For
* CPU, productive means an oncpu task.
*
* Naturally, the FULL state doesn't exist for the CPU resource at the
* system level, but exist at the cgroup level. At the cgroup level,
* FULL means all non-idle tasks in the cgroup are delayed on the CPU
* resource which is being used by others outside of the cgroup or
* throttled by the cgroup cpu.max configuration.
*
* The percentage of wallclock time spent in those compound stall
* states gives pressure numbers between 0 and 100 for each resource,
@ -82,13 +88,13 @@
*
* threads = min(nr_nonidle_tasks, nr_cpus)
* SOME = min(nr_delayed_tasks / threads, 1)
* FULL = (threads - min(nr_running_tasks, threads)) / threads
* FULL = (threads - min(nr_productive_tasks, threads)) / threads
*
* For the 257 number crunchers on 256 CPUs, this yields:
*
* threads = min(257, 256)
* SOME = min(1 / 256, 1) = 0.4%
* FULL = (256 - min(257, 256)) / 256 = 0%
* FULL = (256 - min(256, 256)) / 256 = 0%
*
* For the 1 out of 4 memory-delayed tasks, this yields:
*
@ -113,7 +119,7 @@
* For each runqueue, we track:
*
* tSOME[cpu] = time(nr_delayed_tasks[cpu] != 0)
* tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_running_tasks[cpu])
* tFULL[cpu] = time(nr_delayed_tasks[cpu] && !nr_productive_tasks[cpu])
* tNONIDLE[cpu] = time(nr_nonidle_tasks[cpu] != 0)
*
* and then periodically aggregate:
@ -234,7 +240,8 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
case PSI_MEM_SOME:
return unlikely(tasks[NR_MEMSTALL]);
case PSI_MEM_FULL:
return unlikely(tasks[NR_MEMSTALL] && !tasks[NR_RUNNING]);
return unlikely(tasks[NR_MEMSTALL] &&
tasks[NR_RUNNING] == tasks[NR_MEMSTALL_RUNNING]);
case PSI_CPU_SOME:
return unlikely(tasks[NR_RUNNING] > tasks[NR_ONCPU]);
case PSI_CPU_FULL:
@ -711,10 +718,11 @@ static void psi_group_change(struct psi_group *group, int cpu,
if (groupc->tasks[t]) {
groupc->tasks[t]--;
} else if (!psi_bug) {
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u] clear=%x set=%x\n",
printk_deferred(KERN_ERR "psi: task underflow! cpu=%d t=%d tasks=[%u %u %u %u %u] clear=%x set=%x\n",
cpu, t, groupc->tasks[0],
groupc->tasks[1], groupc->tasks[2],
groupc->tasks[3], clear, set);
groupc->tasks[3], groupc->tasks[4],
clear, set);
psi_bug = 1;
}
}
@ -854,12 +862,15 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next,
int clear = TSK_ONCPU, set = 0;
/*
* When we're going to sleep, psi_dequeue() lets us handle
* TSK_RUNNING and TSK_IOWAIT here, where we can combine it
* with TSK_ONCPU and save walking common ancestors twice.
* When we're going to sleep, psi_dequeue() lets us
* handle TSK_RUNNING, TSK_MEMSTALL_RUNNING and
* TSK_IOWAIT here, where we can combine it with
* TSK_ONCPU and save walking common ancestors twice.
*/
if (sleep) {
clear |= TSK_RUNNING;
if (prev->in_memstall)
clear |= TSK_MEMSTALL_RUNNING;
if (prev->in_iowait)
set |= TSK_IOWAIT;
}
@ -908,7 +919,7 @@ void psi_memstall_enter(unsigned long *flags)
rq = this_rq_lock_irq(&rf);
current->in_memstall = 1;
psi_task_change(current, 0, TSK_MEMSTALL);
psi_task_change(current, 0, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
rq_unlock_irq(rq, &rf);
}
@ -937,7 +948,7 @@ void psi_memstall_leave(unsigned long *flags)
rq = this_rq_lock_irq(&rf);
current->in_memstall = 0;
psi_task_change(current, TSK_MEMSTALL, 0);
psi_task_change(current, TSK_MEMSTALL | TSK_MEMSTALL_RUNNING, 0);
rq_unlock_irq(rq, &rf);
}

View File

@ -118,6 +118,9 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup)
if (static_branch_likely(&psi_disabled))
return;
if (p->in_memstall)
set |= TSK_MEMSTALL_RUNNING;
if (!wakeup || p->sched_psi_wake_requeue) {
if (p->in_memstall)
set |= TSK_MEMSTALL;
@ -148,7 +151,7 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep)
return;
if (p->in_memstall)
clear |= TSK_MEMSTALL;
clear |= (TSK_MEMSTALL | TSK_MEMSTALL_RUNNING);
psi_task_change(p, clear, 0);
}