oom: move oom_adj value from task_struct to signal_struct

Currently, the OOM logic call flow is as follows.

    __out_of_memory()
        select_bad_process()            for each task
            badness()                   calculate badness of one task
                oom_kill_process()      search child
                    oom_kill_task()     kill target task and mm shared tasks with it

For example, process-A has two threads, thread-A and thread-B, and uses a
very large amount of memory.  Each thread has the following oom_adj and
oom_score:

     thread-A: oom_adj = OOM_DISABLE, oom_score = 0
     thread-B: oom_adj = 0,           oom_score = very-high

Then select_bad_process() selects thread-B, but oom_kill_task() refuses to
kill the task because thread-A has OOM_DISABLE.  Thus __out_of_memory()
calls select_bad_process() again, but select_bad_process() selects the same
task.  This means the kernel falls into a livelock.

The fact is that select_bad_process() must select a killable task;
otherwise the OOM logic goes into a livelock.

The root cause is that oom_adj shouldn't be a per-thread value.  It should
be a per-process value because the OOM killer kills a process, not a
thread.  Thus this patch moves oomkilladj (now more appropriately named
oom_adj) from struct task_struct to struct signal_struct.  This naturally
prevents select_bad_process() from choosing the wrong task.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
KOSAKI Motohiro 2009-09-21 17:03:13 -07:00 committed by Linus Torvalds
parent f168e1b639
commit 28b83c5193
4 changed files with 39 additions and 24 deletions

View File

@ -999,11 +999,17 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
char buffer[PROC_NUMBUF]; char buffer[PROC_NUMBUF];
size_t len; size_t len;
int oom_adjust; int oom_adjust = OOM_DISABLE;
unsigned long flags;
if (!task) if (!task)
return -ESRCH; return -ESRCH;
oom_adjust = task->oomkilladj;
if (lock_task_sighand(task, &flags)) {
oom_adjust = task->signal->oom_adj;
unlock_task_sighand(task, &flags);
}
put_task_struct(task); put_task_struct(task);
len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
@ -1017,6 +1023,7 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
struct task_struct *task; struct task_struct *task;
char buffer[PROC_NUMBUF], *end; char buffer[PROC_NUMBUF], *end;
int oom_adjust; int oom_adjust;
unsigned long flags;
memset(buffer, 0, sizeof(buffer)); memset(buffer, 0, sizeof(buffer));
if (count > sizeof(buffer) - 1) if (count > sizeof(buffer) - 1)
@ -1032,11 +1039,20 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
task = get_proc_task(file->f_path.dentry->d_inode); task = get_proc_task(file->f_path.dentry->d_inode);
if (!task) if (!task)
return -ESRCH; return -ESRCH;
if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { if (!lock_task_sighand(task, &flags)) {
put_task_struct(task);
return -ESRCH;
}
if (oom_adjust < task->signal->oom_adj && !capable(CAP_SYS_RESOURCE)) {
unlock_task_sighand(task, &flags);
put_task_struct(task); put_task_struct(task);
return -EACCES; return -EACCES;
} }
task->oomkilladj = oom_adjust;
task->signal->oom_adj = oom_adjust;
unlock_task_sighand(task, &flags);
put_task_struct(task); put_task_struct(task);
if (end - buffer == 0) if (end - buffer == 0)
return -EIO; return -EIO;

View File

@ -639,6 +639,8 @@ struct signal_struct {
unsigned audit_tty; unsigned audit_tty;
struct tty_audit_buf *tty_audit_buf; struct tty_audit_buf *tty_audit_buf;
#endif #endif
int oom_adj; /* OOM kill score adjustment (bit shift) */
}; };
/* Context switch must be unlocked if interrupts are to be enabled */ /* Context switch must be unlocked if interrupts are to be enabled */
@ -1221,7 +1223,6 @@ struct task_struct {
* a short time * a short time
*/ */
unsigned char fpu_counter; unsigned char fpu_counter;
s8 oomkilladj; /* OOM kill score adjustment (bit shift). */
#ifdef CONFIG_BLK_DEV_IO_TRACE #ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq; unsigned int btrace_seq;
#endif #endif

View File

@ -880,6 +880,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig); tty_audit_fork(sig);
sig->oom_adj = current->signal->oom_adj;
return 0; return 0;
} }

View File

@ -58,6 +58,10 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
unsigned long points, cpu_time, run_time; unsigned long points, cpu_time, run_time;
struct mm_struct *mm; struct mm_struct *mm;
struct task_struct *child; struct task_struct *child;
int oom_adj = p->signal->oom_adj;
if (oom_adj == OOM_DISABLE)
return 0;
task_lock(p); task_lock(p);
mm = p->mm; mm = p->mm;
@ -148,15 +152,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
points /= 8; points /= 8;
/* /*
* Adjust the score by oomkilladj. * Adjust the score by oom_adj.
*/ */
if (p->oomkilladj) { if (oom_adj) {
if (p->oomkilladj > 0) { if (oom_adj > 0) {
if (!points) if (!points)
points = 1; points = 1;
points <<= p->oomkilladj; points <<= oom_adj;
} else } else
points >>= -(p->oomkilladj); points >>= -(oom_adj);
} }
#ifdef DEBUG #ifdef DEBUG
@ -251,7 +255,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
*ppoints = ULONG_MAX; *ppoints = ULONG_MAX;
} }
if (p->oomkilladj == OOM_DISABLE) if (p->signal->oom_adj == OOM_DISABLE)
continue; continue;
points = badness(p, uptime.tv_sec); points = badness(p, uptime.tv_sec);
@ -304,7 +308,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
} }
printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm,
get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj,
p->comm); p->comm);
task_unlock(p); task_unlock(p);
} while_each_thread(g, p); } while_each_thread(g, p);
@ -359,18 +363,9 @@ static int oom_kill_task(struct task_struct *p)
* change to NULL at any time since we do not hold task_lock(p). * change to NULL at any time since we do not hold task_lock(p).
* However, this is of no concern to us. * However, this is of no concern to us.
*/ */
if (!mm || p->signal->oom_adj == OOM_DISABLE)
if (mm == NULL)
return 1; return 1;
/*
* Don't kill the process if any threads are set to OOM_DISABLE
*/
do_each_thread(g, q) {
if (q->mm == mm && q->oomkilladj == OOM_DISABLE)
return 1;
} while_each_thread(g, q);
__oom_kill_task(p, 1); __oom_kill_task(p, 1);
/* /*
@ -394,8 +389,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
if (printk_ratelimit()) { if (printk_ratelimit()) {
printk(KERN_WARNING "%s invoked oom-killer: " printk(KERN_WARNING "%s invoked oom-killer: "
"gfp_mask=0x%x, order=%d, oomkilladj=%d\n", "gfp_mask=0x%x, order=%d, oom_adj=%d\n",
current->comm, gfp_mask, order, current->oomkilladj); current->comm, gfp_mask, order,
current->signal->oom_adj);
task_lock(current); task_lock(current);
cpuset_print_task_mems_allowed(current); cpuset_print_task_mems_allowed(current);
task_unlock(current); task_unlock(current);