From 1a99ae3f00d3c7c7885ee529ac9a874b19caa0cf Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Tue, 10 May 2016 21:03:18 +0800 Subject: [PATCH 01/20] sched/fair: Fix the wrong throttled clock time for cfs_rq_clock_task() Two minor fixes for cfs_rq_clock_task(): 1) If cfs_rq is currently being throttled, we need to subtract the cfs throttled clock time. 2) Make "throttled_clock_task_time" update SMP unrelated. Now UP cases need it as well. Signed-off-by: Xunlei Pang Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1462885398-14724-1-git-send-email-xlpang@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 218f8e83db73..1e87bb633d43 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3688,7 +3688,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) { if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task; + return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; } @@ -3826,13 +3826,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; cfs_rq->throttle_count--; -#ifdef CONFIG_SMP if (!cfs_rq->throttle_count) { /* adjust cfs_rq_clock_task() */ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; } -#endif return 0; } From df55f462b905f3b2d40ec3fb865891382a6ebfb1 Mon Sep 17 00:00:00 2001 From: "Gaurav Jindal (Gaurav Jindal)" Date: Thu, 12 May 2016 10:13:33 +0000 Subject: [PATCH 02/20] sched/idle: Optimize the generic idle loop Currently, smp_processor_id() is used to fetch the current CPU in cpu_idle_loop(). Every time the idle thread runs, it fetches the current CPU using smp_processor_id(). Since the idle thread is per CPU, the current CPU is constant, so we can lift the load out of the loop, saving execution cycles/time in the loop. x86-64: Before patch (execution in loop): 148: 0f ae e8 lfence 14b: 65 8b 04 25 00 00 00 00 mov %gs:0x0,%eax 152: 00 153: 89 c0 mov %eax,%eax 155: 49 0f a3 04 24 bt %rax,(%r12) After patch (execution in loop): 150: 0f ae e8 lfence 153: 4d 0f a3 34 24 bt %r14,(%r12) ARM64: Before patch (execution in loop): 168: d5033d9f dsb ld 16c: b9405661 ldr w1,[x19,#84] 170: 1100fc20 add w0,w1,#0x3f 174: 6b1f003f cmp w1,wzr 178: 1a81b000 csel w0,w0,w1,lt 17c: 130c7000 asr w0,w0,#6 180: 937d7c00 sbfiz x0,x0,#3,#32 184: f8606aa0 ldr x0,[x21,x0] 188: 9ac12401 lsr x1,x0,x1 18c: 36000e61 tbz w1,#0,358 After patch (execution in loop): 1a8: d50339df dsb ld 1ac: f8776ac0 ldr x0,[x22,x23] ab0: ea18001f tst x0,x24 1b4: 54000ea0 b.eq 388 Further observance on ARM64 for 4 seconds shows that cpu_idle_loop is called 8672 times. Shifting the code will save instructions executed in loop and eventually time as well. Signed-off-by: Gaurav Jindal Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Sanjeev Yadav Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160512101330.GA488@gauravjindalubtnb.del.spreadtrum.com Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index bd12c6c714ec..db4ff7c100b9 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -201,6 +201,8 @@ exit_idle: */ static void cpu_idle_loop(void) { + int cpu = smp_processor_id(); + while (1) { /* * If the arch has a polling bit, we maintain an invariant: @@ -219,7 +221,7 @@ static void cpu_idle_loop(void) check_pgt_cache(); rmb(); - if (cpu_is_offline(smp_processor_id())) { + if (cpu_is_offline(cpu)) { cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } From 150593bf869393d10a79f6bd3df2585ecc20a9bb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 18 May 2016 19:02:18 +0200 Subject: [PATCH 03/20] sched/api: Introduce task_rcu_dereference() and try_get_task_struct() Generally task_struct is only protected by RCU if it was found on a RCU protected list (say, for_each_process() or find_task_by_vpid()). As Kirill pointed out rq->curr isn't protected by RCU, the scheduler drops the (potentially) last reference without RCU gp, this means that we need to fix the code which uses foreign_rq->curr under rcu_read_lock(). Add a new helper which can be used to dereference rq->curr or any other pointer to task_struct assuming that it should be cleared or updated before the final put_task_struct(). It returns non-NULL only if this task can't go away before rcu_read_unlock(). ( Also add try_get_task_struct() to make it easier to use this API correctly. ) Suggested-by: Kirill Tkhai Signed-off-by: Oleg Nesterov [ Updated comments; added try_get_task_struct()] Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vladimir Davydov Link: http://lkml.kernel.org/r/20160518170218.GY3192@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 ++ kernel/exit.c | 76 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index 6e42ada26345..dee41bf59e6b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2139,6 +2139,9 @@ static inline void put_task_struct(struct task_struct *t) __put_task_struct(t); } +struct task_struct *task_rcu_dereference(struct task_struct **ptask); +struct task_struct *try_get_task_struct(struct task_struct **ptask); + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime); diff --git a/kernel/exit.c b/kernel/exit.c index 9e6e1356e6bb..2fb4d44c51b1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -210,6 +210,82 @@ repeat: goto repeat; } +/* + * Note that if this function returns a valid task_struct pointer (!NULL) + * task->usage must remain >0 for the duration of the RCU critical section. + */ +struct task_struct *task_rcu_dereference(struct task_struct **ptask) +{ + struct sighand_struct *sighand; + struct task_struct *task; + + /* + * We need to verify that release_task() was not called and thus + * delayed_put_task_struct() can't run and drop the last reference + * before rcu_read_unlock(). We check task->sighand != NULL, + * but we can read the already freed and reused memory. + */ +retry: + task = rcu_dereference(*ptask); + if (!task) + return NULL; + + probe_kernel_address(&task->sighand, sighand); + + /* + * Pairs with atomic_dec_and_test() in put_task_struct(). If this task + * was already freed we can not miss the preceding update of this + * pointer. + */ + smp_rmb(); + if (unlikely(task != READ_ONCE(*ptask))) + goto retry; + + /* + * We've re-checked that "task == *ptask", now we have two different + * cases: + * + * 1. This is actually the same task/task_struct. In this case + * sighand != NULL tells us it is still alive. + * + * 2. This is another task which got the same memory for task_struct. + * We can't know this of course, and we can not trust + * sighand != NULL. + * + * In this case we actually return a random value, but this is + * correct. + * + * If we return NULL - we can pretend that we actually noticed that + * *ptask was updated when the previous task has exited. Or pretend + * that probe_slab_address(&sighand) reads NULL. + * + * If we return the new task (because sighand is not NULL for any + * reason) - this is fine too. This (new) task can't go away before + * another gp pass. + * + * And note: We could even eliminate the false positive if re-read + * task->sighand once again to avoid the falsely NULL. But this case + * is very unlikely so we don't care. + */ + if (!sighand) + return NULL; + + return task; +} + +struct task_struct *try_get_task_struct(struct task_struct **ptask) +{ + struct task_struct *task; + + rcu_read_lock(); + task = task_rcu_dereference(ptask); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + return task; +} + /* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected From bac7857319bcf7fed329a10bb760053e761115c0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 18 May 2016 21:57:33 +0200 Subject: [PATCH 04/20] sched/fair: Use task_rcu_dereference() Simplify task_numa_compare()'s task reference magic by using task_rcu_dereference(). Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Chris Metcalf Cc: Christoph Lameter Cc: Kirill Tkhai Cc: Kirill Tkhai Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vladimir Davydov Link: http://lkml.kernel.org/r/20160518195733.GA15914@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 34 ++++------------------------------ 1 file changed, 4 insertions(+), 30 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1e87bb633d43..c6dd8bab010c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1305,6 +1305,8 @@ static void task_numa_assign(struct task_numa_env *env, { if (env->best_task) put_task_struct(env->best_task); + if (p) + get_task_struct(p); env->best_task = p; env->best_imp = imp; @@ -1372,31 +1374,11 @@ static void task_numa_compare(struct task_numa_env *env, long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; int dist = env->dist; - bool assigned = false; rcu_read_lock(); - - raw_spin_lock_irq(&dst_rq->lock); - cur = dst_rq->curr; - /* - * No need to move the exiting task or idle task. - */ - if ((cur->flags & PF_EXITING) || is_idle_task(cur)) + cur = task_rcu_dereference(&dst_rq->curr); + if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) cur = NULL; - else { - /* - * The task_struct must be protected here to protect the - * p->numa_faults access in the task_weight since the - * numa_faults could already be freed in the following path: - * finish_task_switch() - * --> put_task_struct() - * --> __put_task_struct() - * --> task_numa_free() - */ - get_task_struct(cur); - } - - raw_spin_unlock_irq(&dst_rq->lock); /* * Because we have preemption enabled we can get migrated around and @@ -1479,7 +1461,6 @@ balance: */ if (!load_too_imbalanced(src_load, dst_load, env)) { imp = moveimp - 1; - put_task_struct(cur); cur = NULL; goto assign; } @@ -1505,16 +1486,9 @@ balance: env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); assign: - assigned = true; task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); - /* - * The dst_rq->curr isn't assigned. The protection for task_struct is - * finished. - */ - if (cur && !assigned) - put_task_struct(cur); } static void task_numa_find_cpu(struct task_numa_env *env, From 03c041c5bf6ed584dff36b7cd509e0146a124277 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 3 Jun 2016 17:58:41 -0500 Subject: [PATCH 05/20] sched/debug: Always show 'nr_migrations' The nr_migrations field is updated independently of CONFIG_SCHEDSTATS, so it can be displayed regardless. Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5b1b04057ae2b14d73c2d03f56582c1d38cfe066.1464994423.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0368c393a336..2a0a9995256d 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -879,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) nr_switches = p->nvcsw + p->nivcsw; -#ifdef CONFIG_SCHEDSTATS P(se.nr_migrations); +#ifdef CONFIG_SCHEDSTATS if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; From 2348140d58f4f4245e9635ea8f1a77e940a4d877 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 13 Jun 2016 18:32:44 +0800 Subject: [PATCH 06/20] KVM: Fix steal clock warp during guest CPU hotplug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sometimes, after CPU hotplug you can observe a spike in stolen time (100%) followed by the CPU being marked as 100% idle when it's actually busy with a CPU hog task. The trace looks like the following: cpuhp/1-12 [001] d.h1 167.461657: account_process_tick: steal = 1291385514, prev_steal_time = 0 cpuhp/1-12 [001] d.h1 167.461659: account_process_tick: steal_jiffies = 1291 -0 [001] d.h1 167.462663: account_process_tick: steal = 18732255, prev_steal_time = 1291000000 -0 [001] d.h1 167.462664: account_process_tick: steal_jiffies = 18446744072437 The sudden decrease of "steal" causes steal_jiffies to underflow. The root cause is kvm_steal_time being reset to 0 after hot-plugging back in a CPU. Instead, the preexisting value can be used, which is what the core scheduler code expects. John Stultz also reported a similar issue after guest S3. Suggested-by: Paolo Bonzini Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paolo Bonzini Cc: Frederic Weisbecker Cc: John Stultz Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1465813966-3116-2-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/kvm.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index eea2a6f72b31..1ef5e48b3a36 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -301,8 +301,6 @@ static void kvm_register_steal_time(void) if (!has_steal_clock) return; - memset(st, 0, sizeof(*st)); - wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); pr_info("kvm-stealtime: cpu %d, msr %llx\n", cpu, (unsigned long long) slow_virt_to_phys(st)); From 3d89e5478bf550a50c99e93adf659369798263b0 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 13 Jun 2016 18:32:45 +0800 Subject: [PATCH 07/20] sched/cputime: Fix prev steal time accouting during CPU hotplug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit: e9532e69b8d1 ("sched/cputime: Fix steal time accounting vs. CPU hotplug") ... set rq->prev_* to 0 after a CPU hotplug comes back, in order to fix the case where (after CPU hotplug) steal time is smaller than rq->prev_steal_time. However, this should never happen. Steal time was only smaller because of the KVM-specific bug fixed by the previous patch. Worse, the previous patch triggers a bug on CPU hot-unplug/plug operation: because rq->prev_steal_time is cleared, all of the CPU's past steal time will be accounted again on hot-plug. Since the root cause has been fixed, we can just revert commit e9532e69b8d1. Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paolo Bonzini Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Thomas Gleixner Fixes: 'commit e9532e69b8d1 ("sched/cputime: Fix steal time accounting vs. CPU hotplug")' Link: http://lkml.kernel.org/r/1465813966-3116-3-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 - kernel/sched/sched.h | 13 ------------- 2 files changed, 14 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 13d0896aff87..c1b537bc4b90 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7227,7 +7227,6 @@ static void sched_rq_cpu_starting(unsigned int cpu) struct rq *rq = cpu_rq(cpu); rq->calc_load_update = calc_load_update; - account_reset_rq(rq); update_max_interval(); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 72f1f3087b04..de607e4febd9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1809,16 +1809,3 @@ static inline void cpufreq_trigger_update(u64 time) {} #else /* arch_scale_freq_capacity */ #define arch_scale_freq_invariant() (false) #endif - -static inline void account_reset_rq(struct rq *rq) -{ -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - rq->prev_irq_time = 0; -#endif -#ifdef CONFIG_PARAVIRT - rq->prev_steal_time = 0; -#endif -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - rq->prev_steal_time_rq = 0; -#endif -} From 807e5b80687c06715d62df51a5473b231e3e8b15 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 13 Jun 2016 18:32:46 +0800 Subject: [PATCH 08/20] sched/cputime: Add steal time support to full dynticks CPU time accounting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds guest steal-time support to full dynticks CPU time accounting. After the following commit: ff9a9b4c4334 ("sched, time: Switch VIRT_CPU_ACCOUNTING_GEN to jiffy granularity") ... time sampling became jiffy based, even if we do the sampling from the context tracking code, so steal_account_process_tick() can be reused to account how many 'ticks' are stolen-time, after the last accumulation. Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paolo Bonzini Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1465813966-3116-4-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 75f98c5498d5..3d60e5d76fdb 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -257,7 +257,7 @@ void account_idle_time(cputime_t cputime) cpustat[CPUTIME_IDLE] += (__force u64) cputime; } -static __always_inline bool steal_account_process_tick(void) +static __always_inline unsigned long steal_account_process_tick(unsigned long max_jiffies) { #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { @@ -272,14 +272,14 @@ static __always_inline bool steal_account_process_tick(void) * time in jiffies. Lets cast the result to jiffies * granularity and account the rest on the next rounds. */ - steal_jiffies = nsecs_to_jiffies(steal); + steal_jiffies = min(nsecs_to_jiffies(steal), max_jiffies); this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); account_steal_time(jiffies_to_cputime(steal_jiffies)); return steal_jiffies; } #endif - return false; + return 0; } /* @@ -346,7 +346,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, u64 cputime = (__force u64) cputime_one_jiffy; u64 *cpustat = kcpustat_this_cpu->cpustat; - if (steal_account_process_tick()) + if (steal_account_process_tick(ULONG_MAX)) return; cputime *= ticks; @@ -477,7 +477,7 @@ void account_process_tick(struct task_struct *p, int user_tick) return; } - if (steal_account_process_tick()) + if (steal_account_process_tick(ULONG_MAX)) return; if (user_tick) @@ -681,12 +681,14 @@ static cputime_t vtime_delta(struct task_struct *tsk) static cputime_t get_vtime_delta(struct task_struct *tsk) { unsigned long now = READ_ONCE(jiffies); - unsigned long delta = now - tsk->vtime_snap; + unsigned long delta_jiffies, steal_jiffies; + delta_jiffies = now - tsk->vtime_snap; + steal_jiffies = steal_account_process_tick(delta_jiffies); WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); tsk->vtime_snap = now; - return jiffies_to_cputime(delta); + return jiffies_to_cputime(delta_jiffies - steal_jiffies); } static void __vtime_account_system(struct task_struct *tsk) From e210bffd39d01b649c94b820c28ff112673266dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jun 2016 18:51:48 +0200 Subject: [PATCH 09/20] sched/fair: Fix and optimize the fork() path The task_fork_fair() callback already calls __set_task_cpu() and takes rq->lock. If we move the sched_class::task_fork callback in sched_fork() under the existing p->pi_lock, right after its set_task_cpu() call, we can avoid doing two such calls and omit the IRQ disabling on the rq->lock. Change to __set_task_cpu() to skip the migration bits, this is a new task, not a migration. Similarly, make wake_up_new_task() use __set_task_cpu() for the same reason, the task hasn't actually migrated as it hasn't ever ran. This cures the problem of calling migrate_task_rq_fair(), which does remove_entity_from_load_avg() on tasks that have never been added to the load avg to begin with. This bug would result in transiently messed up load_avg values, averaged out after a few dozen milliseconds. This is probably the reason why this bug was not found for such a long time. Reported-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 16 +++++++++++----- kernel/sched/fair.c | 27 ++++++--------------------- 2 files changed, 17 insertions(+), 26 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e406ba0c6891..fa3434dffbbb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2383,9 +2383,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_class = &fair_sched_class; } - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); - /* * The child is not yet in the pid-hash so no cgroup attach races, * and the cgroup is pinned to this child due to cgroup_fork() @@ -2394,7 +2391,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * Silence PROVE_RCU. */ raw_spin_lock_irqsave(&p->pi_lock, flags); - set_task_cpu(p, cpu); + /* + * We're setting the cpu for the first time, we don't migrate, + * so use __set_task_cpu(). + */ + __set_task_cpu(p, cpu); + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); #ifdef CONFIG_SCHED_INFO @@ -2534,8 +2537,11 @@ void wake_up_new_task(struct task_struct *p) * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug + * + * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, + * as we're not fully set-up yet. */ - set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); post_init_entity_util_avg(&p->se); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 73063560b9ec..994f5493ee5b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4448,7 +4448,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * * note: in the case of encountering a throttled cfs_rq we will * post the final h_nr_running increment below. - */ + */ if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; @@ -8289,31 +8289,17 @@ static void task_fork_fair(struct task_struct *p) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; - int this_cpu = smp_processor_id(); struct rq *rq = this_rq(); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock(&rq->lock); update_rq_clock(rq); cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - - /* - * Not only the cpu but also the task_group of the parent might have - * been changed after parent->se.parent,cfs_rq were copied to - * child->se.parent,cfs_rq. So call __set_task_cpu() to make those - * of child point to valid ones. - */ - rcu_read_lock(); - __set_task_cpu(p, this_cpu); - rcu_read_unlock(); - - update_curr(cfs_rq); - - if (curr) + if (curr) { + update_curr(cfs_rq); se->vruntime = curr->vruntime; + } place_entity(cfs_rq, se, 1); if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { @@ -8326,8 +8312,7 @@ static void task_fork_fair(struct task_struct *p) } se->vruntime -= cfs_rq->min_vruntime; - - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock(&rq->lock); } /* From 010114739d294c474764c94196d32fb92e233657 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Jun 2016 11:20:46 +0200 Subject: [PATCH 10/20] sched/fair: Fix PELT integrity for new groups Vincent reported that when a new task is moved into a new cgroup it gets attached twice to the load tracking: sched_move_task() task_move_group_fair() detach_task_cfs_rq() set_task_rq() attach_task_cfs_rq() attach_entity_load_avg() se->avg.last_load_update = cfs_rq->avg.last_load_update // == 0 enqueue_entity() enqueue_entity_load_avg() update_cfs_rq_load_avg() now = clock() __update_load_avg(&cfs_rq->avg) cfs_rq->avg.last_load_update = now // ages load/util for: now - 0, load/util -> 0 if (migrated) attach_entity_load_avg() se->avg.last_load_update = cfs_rq->avg.last_load_update; // now != 0 The problem is that we don't update cfs_rq load_avg before all entity attach/detach operations. Only enqueue_task() and migrate_task() do this. By fixing this, the above will not happen, because the sched_move_task() attach will have updated cfs_rq's last_load_update time before attach, and in turn the attach will have set the entity's last_load_update stamp. Note that there is a further problem with sched_move_task() calling detach on a task that hasn't yet been attached; this will be taken care of in a subsequent patch. Reported-by: Vincent Guittot Tested-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Yuyang Du Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 994f5493ee5b..dda0ccbd0f3d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3083,6 +3083,12 @@ static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) +{ + return 0; +} + static inline void update_load_avg(struct sched_entity *se, int not_used) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -8368,6 +8374,7 @@ static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); if (!vruntime_normalized(p)) { /* @@ -8379,6 +8386,7 @@ static void detach_task_cfs_rq(struct task_struct *p) } /* Catch up with the cfs_rq and remove our load when we leave */ + update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); } @@ -8386,6 +8394,7 @@ static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -8396,6 +8405,7 @@ static void attach_task_cfs_rq(struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ + update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); if (!vruntime_normalized(p)) From ea86cb4b7621e1298a37197005bf0abcc86348d4 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 17 Jun 2016 13:38:55 +0200 Subject: [PATCH 11/20] sched/cgroup: Fix cpu_cgroup_fork() handling A new fair task is detached and attached from/to task_group with: cgroup_post_fork() ss->fork(child) := cpu_cgroup_fork() sched_move_task() task_move_group_fair() Which is wrong, because at this point in fork() the task isn't fully initialized and it cannot 'move' to another group, because its not attached to any group as yet. In fact, cpu_cgroup_fork() needs a small part of sched_move_task() so we can just call this small part directly instead sched_move_task(). And the task doesn't really migrate because it is not yet attached so we need the following sequence: do_fork() sched_fork() __set_task_cpu() cgroup_post_fork() set_task_rq() # set task group and runqueue wake_up_new_task() select_task_rq() can select a new cpu __set_task_cpu post_init_entity_util_avg attach_task_cfs_rq() activate_task enqueue_task This patch makes that happen. Signed-off-by: Vincent Guittot [ Added TASK_SET_GROUP to set depth properly. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 63 ++++++++++++++++++++++++++++---------------- kernel/sched/fair.c | 23 +++++++++++++++- kernel/sched/sched.h | 5 +++- 3 files changed, 67 insertions(+), 24 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fa3434dffbbb..3d856c46f6d8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7744,14 +7744,37 @@ void sched_offline_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -/* change task's runqueue when it moves between groups. - * The caller of this function should have put the task in its new group - * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to - * reflect its new group. +static void sched_change_group(struct task_struct *tsk, int type) +{ + struct task_group *tg; + + /* + * All callers are synchronized by task_rq_lock(); we do not use RCU + * which is pointless here. Thus, we pass "true" to task_css_check() + * to prevent lockdep warnings. + */ + tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), + struct task_group, css); + tg = autogroup_task_group(tsk, tg); + tsk->sched_task_group = tg; + +#ifdef CONFIG_FAIR_GROUP_SCHED + if (tsk->sched_class->task_change_group) + tsk->sched_class->task_change_group(tsk, type); + else +#endif + set_task_rq(tsk, task_cpu(tsk)); +} + +/* + * Change task's runqueue when it moves between groups. + * + * The caller of this function should have put the task in its new group by + * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect + * its new group. */ void sched_move_task(struct task_struct *tsk) { - struct task_group *tg; int queued, running; struct rq_flags rf; struct rq *rq; @@ -7766,22 +7789,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) put_prev_task(rq, tsk); - /* - * All callers are synchronized by task_rq_lock(); we do not use RCU - * which is pointless here. Thus, we pass "true" to task_css_check() - * to prevent lockdep warnings. - */ - tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), - struct task_group, css); - tg = autogroup_task_group(tsk, tg); - tsk->sched_task_group = tg; - -#ifdef CONFIG_FAIR_GROUP_SCHED - if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk); - else -#endif - set_task_rq(tsk, task_cpu(tsk)); + sched_change_group(tsk, TASK_MOVE_GROUP); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); @@ -8209,9 +8217,20 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_free_group(tg); } +/* + * This is called before wake_up_new_task(), therefore we really only + * have to set its group bits, all the other stuff does not apply. + */ static void cpu_cgroup_fork(struct task_struct *task) { - sched_move_task(task); + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(task, &rf); + + sched_change_group(task, TASK_SET_GROUP); + + task_rq_unlock(rq, task, &rf); } static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dda0ccbd0f3d..64f26bc436eb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8466,6 +8466,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED +static void task_set_group_fair(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + + set_task_rq(p, task_cpu(p)); + se->depth = se->parent ? se->parent->depth + 1 : 0; +} + static void task_move_group_fair(struct task_struct *p) { detach_task_cfs_rq(p); @@ -8478,6 +8486,19 @@ static void task_move_group_fair(struct task_struct *p) attach_task_cfs_rq(p); } +static void task_change_group_fair(struct task_struct *p, int type) +{ + switch (type) { + case TASK_SET_GROUP: + task_set_group_fair(p); + break; + + case TASK_MOVE_GROUP: + task_move_group_fair(p); + break; + } +} + void free_fair_sched_group(struct task_group *tg) { int i; @@ -8706,7 +8727,7 @@ const struct sched_class fair_sched_class = { .update_curr = update_curr_fair, #ifdef CONFIG_FAIR_GROUP_SCHED - .task_move_group = task_move_group_fair, + .task_change_group = task_change_group_fair, #endif }; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 71ce9862abc3..307bd0418095 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1246,8 +1246,11 @@ struct sched_class { void (*update_curr) (struct rq *rq); +#define TASK_SET_GROUP 0 +#define TASK_MOVE_GROUP 1 + #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_move_group) (struct task_struct *p); + void (*task_change_group) (struct task_struct *p, int type); #endif }; From 7dc603c9028ea5d4354e0e317e8481df99b06d7e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Jun 2016 13:29:28 +0200 Subject: [PATCH 12/20] sched/fair: Fix PELT integrity for new tasks Vincent and Yuyang found another few scenarios in which entity tracking goes wobbly. The scenarios are basically due to the fact that new tasks are not immediately attached and thereby differ from the normal situation -- a task is always attached to a cfs_rq load average (such that it includes its blocked contribution) and are explicitly detached/attached on migration to another cfs_rq. Scenario 1: switch to fair class p->sched_class = fair_class; if (queued) enqueue_task(p); ... enqueue_entity() enqueue_entity_load_avg() migrated = !sa->last_update_time (true) if (migrated) attach_entity_load_avg() check_class_changed() switched_from() (!fair) switched_to() (fair) switched_to_fair() attach_entity_load_avg() If @p is a new task that hasn't been fair before, it will have !last_update_time and, per the above, end up in attach_entity_load_avg() _twice_. Scenario 2: change between cgroups sched_move_group(p) if (queued) dequeue_task() task_move_group_fair() detach_task_cfs_rq() detach_entity_load_avg() set_task_rq() attach_task_cfs_rq() attach_entity_load_avg() if (queued) enqueue_task(); ... enqueue_entity() enqueue_entity_load_avg() migrated = !sa->last_update_time (true) if (migrated) attach_entity_load_avg() Similar as with scenario 1, if @p is a new task, it will have !load_update_time and we'll end up in attach_entity_load_avg() _twice_. Furthermore, notice how we do a detach_entity_load_avg() on something that wasn't attached to begin with. As stated above; the problem is that the new task isn't yet attached to the load tracking and thereby violates the invariant assumption. This patch remedies this by ensuring a new task is indeed properly attached to the load tracking on creation, through post_init_entity_util_avg(). Of course, this isn't entirely as straightforward as one might think, since the task is hashed before we call wake_up_new_task() and thus can be poked at. We avoid this by adding TASK_NEW and teaching cpu_cgroup_can_attach() to refuse such tasks. Reported-by: Yuyang Du Reported-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 5 +++-- kernel/sched/core.c | 28 ++++++++++++++++++++++----- kernel/sched/fair.c | 45 +++++++++++++++++++++++++++++++++++-------- 3 files changed, 63 insertions(+), 15 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b45acfd18f4e..d99218a1e043 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -219,9 +219,10 @@ extern void proc_sched_set_task(struct task_struct *p); #define TASK_WAKING 256 #define TASK_PARKED 512 #define TASK_NOLOAD 1024 -#define TASK_STATE_MAX 2048 +#define TASK_NEW 2048 +#define TASK_STATE_MAX 4096 -#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN" +#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn" extern char ___assert_task_state[1 - 2*!!( sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3d856c46f6d8..14afa518948c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2342,11 +2342,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) __sched_fork(clone_flags, p); /* - * We mark the process as running here. This guarantees that + * We mark the process as NEW here. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ - p->state = TASK_RUNNING; + p->state = TASK_NEW; /* * Make sure we do not leak PI boosting priority to the child. @@ -2383,6 +2383,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_class = &fair_sched_class; } + init_entity_runnable_average(&p->se); + /* * The child is not yet in the pid-hash so no cgroup attach races, * and the cgroup is pinned to this child due to cgroup_fork() @@ -2529,9 +2531,8 @@ void wake_up_new_task(struct task_struct *p) struct rq_flags rf; struct rq *rq; - /* Initialize new task's runnable average */ - init_entity_runnable_average(&p->se); raw_spin_lock_irqsave(&p->pi_lock, rf.flags); + p->state = TASK_RUNNING; #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: @@ -8237,6 +8238,7 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; + int ret = 0; cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED @@ -8247,8 +8249,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (task->sched_class != &fair_sched_class) return -EINVAL; #endif + /* + * Serialize against wake_up_new_task() such that if its + * running, we're sure to observe its full state. + */ + raw_spin_lock_irq(&task->pi_lock); + /* + * Avoid calling sched_move_task() before wake_up_new_task() + * has happened. This would lead to problems with PELT, due to + * move wanting to detach+attach while we're not attached yet. + */ + if (task->state == TASK_NEW) + ret = -EINVAL; + raw_spin_unlock_irq(&task->pi_lock); + + if (ret) + break; } - return 0; + return ret; } static void cpu_cgroup_attach(struct cgroup_taskset *tset) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 64f26bc436eb..0c21a12c0205 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -690,6 +690,10 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); + /* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: @@ -720,6 +724,7 @@ void post_init_entity_util_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + u64 now = cfs_rq_clock_task(cfs_rq); if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -733,16 +738,37 @@ void post_init_entity_util_avg(struct sched_entity *se) } sa->util_sum = sa->util_avg * LOAD_AVG_MAX; } + + if (entity_is_task(se)) { + struct task_struct *p = task_of(se); + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. + */ + se->avg.last_update_time = now; + return; + } + } + + update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); } -#else +#else /* !CONFIG_SMP */ void init_entity_runnable_average(struct sched_entity *se) { } void post_init_entity_util_avg(struct sched_entity *se) { } -#endif +#endif /* CONFIG_SMP */ /* * Update the current task's runtime statistics. @@ -2840,8 +2866,6 @@ void set_task_rq_fair(struct sched_entity *se, static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); - static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -2951,6 +2975,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* * If we got migrated (either between CPUs or between cgroups) we'll * have aged the average right before clearing @last_update_time. + * + * Or we're fresh through post_init_entity_util_avg(). */ if (se->avg.last_update_time) { __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), @@ -3056,11 +3082,14 @@ void remove_entity_load_avg(struct sched_entity *se) u64 last_update_time; /* - * Newly created task or never used group entity should not be removed - * from its (source) cfs_rq + * tasks cannot exit without having gone through wake_up_new_task() -> + * post_init_entity_util_avg() which will have added things to the + * cfs_rq, so we can remove unconditionally. + * + * Similarly for groups, they will have passed through + * post_init_entity_util_avg() before unregister_sched_fair_group() + * calls this. */ - if (se->avg.last_update_time == 0) - return; last_update_time = cfs_rq_last_update_time(cfs_rq); From 3d30544f02120b884bba2a9466c87dba980e3be5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jun 2016 14:27:50 +0200 Subject: [PATCH 13/20] sched/fair: Apply more PELT fixes One additional 'rule' for using update_cfs_rq_load_avg() is that one should call update_tg_load_avg() if it returns true. Add a bunch of comments to hopefully clarify some of the rules: o You need to update cfs_rq _before_ any entity attach/detach, this is important, because while for mathmatical consisency this isn't strictly needed, it is required for the physical interpretation of the model, you attach/detach _now_. o When you modify the cfs_rq avg, you have to then call update_tg_load_avg() in order to propagate changes upwards. o (Fair) entities are always attached, switched_{to,from}_fair() deal with !fair. This directly follows from the definition of the cfs_rq averages, namely that they are a direct sum of all (runnable or blocked) entities on that rq. It is the second rule that this patch enforces, but it adds comments pertaining to all of them. Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 53 +++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 49 insertions(+), 4 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0c21a12c0205..781788d54736 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -692,6 +692,7 @@ void init_entity_runnable_average(struct sched_entity *se) static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); /* @@ -725,6 +726,7 @@ void post_init_entity_util_avg(struct sched_entity *se) struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -757,8 +759,10 @@ void post_init_entity_util_avg(struct sched_entity *se) } } - update_cfs_rq_load_avg(now, cfs_rq, false); + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); } #else /* !CONFIG_SMP */ @@ -768,6 +772,9 @@ void init_entity_runnable_average(struct sched_entity *se) void post_init_entity_util_avg(struct sched_entity *se) { } +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +{ +} #endif /* CONFIG_SMP */ /* @@ -2912,7 +2919,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) WRITE_ONCE(*ptr, res); \ } while (0) -/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ +/** + * update_cfs_rq_load_avg - update the cfs_rq's load/util averages + * @now: current time, as per cfs_rq_clock_task() + * @cfs_rq: cfs_rq to update + * @update_freq: should we call cfs_rq_util_change() or will the call do so + * + * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) + * avg. The immediate corollary is that all (fair) tasks must be attached, see + * post_init_entity_util_avg(). + * + * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. + * + * Returns true if the load decayed or we removed utilization. It is expected + * that one calls update_tg_load_avg() on this condition, but after you've + * modified the cfs_rq avg (attach/detach), such that we propagate the new + * avg up. + */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) { @@ -2967,6 +2990,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) update_tg_load_avg(cfs_rq, 0); } +/** + * attach_entity_load_avg - attach this entity to its cfs_rq load avg + * @cfs_rq: cfs_rq to attach to + * @se: sched_entity to attach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (!sched_feat(ATTACH_AGE_LOAD)) @@ -2998,6 +3029,14 @@ skip_aging: cfs_rq_util_change(cfs_rq); } +/** + * detach_entity_load_avg - detach this entity from its cfs_rq load avg + * @cfs_rq: cfs_rq to detach from + * @se: sched_entity to detach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), @@ -8404,6 +8443,7 @@ static void detach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; if (!vruntime_normalized(p)) { /* @@ -8415,8 +8455,10 @@ static void detach_task_cfs_rq(struct task_struct *p) } /* Catch up with the cfs_rq and remove our load when we leave */ - update_cfs_rq_load_avg(now, cfs_rq, false); + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); } static void attach_task_cfs_rq(struct task_struct *p) @@ -8424,6 +8466,7 @@ static void attach_task_cfs_rq(struct task_struct *p) struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -8434,8 +8477,10 @@ static void attach_task_cfs_rq(struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ - update_cfs_rq_load_avg(now, cfs_rq, false); + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; From 8663e24d56dc1f093232783c23ea17f2a6f61c03 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jun 2016 14:58:02 +0200 Subject: [PATCH 14/20] sched/fair: Reorder cgroup creation code A future patch needs rq->lock held _after_ we link the task_group into the hierarchy. In order to avoid taking every rq->lock twice, reorder things a little and create online_fair_sched_group() to be called after we link the task_group. All this code is still ran from css_alloc() so css_online() isn't in fact used for this. Signed-off-by: Peter Zijlstra (Intel) Cc: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ kernel/sched/fair.c | 22 ++++++++++++++++++---- kernel/sched/sched.h | 1 + 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 14afa518948c..4ede4fc65653 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7717,6 +7717,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) INIT_LIST_HEAD(&tg->children); list_add_rcu(&tg->siblings, &parent->children); spin_unlock_irqrestore(&task_group_lock, flags); + + online_fair_sched_group(tg); } /* rcu callback to free various structures associated with a task group */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 781788d54736..62d5e7dcc7f8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8624,10 +8624,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); - - raw_spin_lock_irq(&rq->lock); - post_init_entity_util_avg(se); - raw_spin_unlock_irq(&rq->lock); } return 1; @@ -8638,6 +8634,22 @@ err: return 0; } +void online_fair_sched_group(struct task_group *tg) +{ + struct sched_entity *se; + struct rq *rq; + int i; + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + se = tg->se[i]; + + raw_spin_lock_irq(&rq->lock); + post_init_entity_util_avg(se); + raw_spin_unlock_irq(&rq->lock); + } +} + void unregister_fair_sched_group(struct task_group *tg) { unsigned long flags; @@ -8742,6 +8754,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } +void online_fair_sched_group(struct task_group *tg) { } + void unregister_fair_sched_group(struct task_group *tg) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 307bd0418095..28c42b789f70 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data); extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); +extern void online_fair_sched_group(struct task_group *tg); extern void unregister_fair_sched_group(struct task_group *tg); extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, From 599b4840b0ea453c7d11e1798dcc8f494dcfd58a Mon Sep 17 00:00:00 2001 From: Zev Weiss Date: Sun, 26 Jun 2016 16:13:23 -0500 Subject: [PATCH 15/20] sched/core: Fix sched_getaffinity() return value kerneldoc comment Previous version was probably written referencing the man page for glibc's wrapper, but the wrapper's behavior differs from that of the syscall itself in this case. Signed-off-by: Zev Weiss Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1466975603-25408-1-git-send-email-zev@bewilderbeest.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4ede4fc65653..28da50a5bc76 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4759,7 +4759,8 @@ out_unlock: * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask * - * Return: 0 on success. An error code otherwise. + * Return: size of CPU mask copied to user_mask_ptr on success. An + * error code otherwise. */ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, unsigned long __user *, user_mask_ptr) From 55e16d30bd99510900caec913c90f53bc2b35cba Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jun 2016 15:14:26 +0200 Subject: [PATCH 16/20] sched/fair: Rework throttle_count sync Since we already take rq->lock when creating a cgroup, use it to also sync the throttle_count and avoid the extra state and enqueue path branch. Signed-off-by: Peter Zijlstra (Intel) Cc: Konstantin Khlebnikov Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bsegall@google.com Cc: linux-kernel@vger.kernel.org [ Fixed build warning. ] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 39 +++++++++++++++++++-------------------- kernel/sched/sched.h | 2 +- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 62d5e7dcc7f8..4088eedea763 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4241,26 +4241,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; - /* Synchronize hierarchical throttle counter: */ - if (unlikely(!cfs_rq->throttle_uptodate)) { - struct rq *rq = rq_of(cfs_rq); - struct cfs_rq *pcfs_rq; - struct task_group *tg; - - cfs_rq->throttle_uptodate = 1; - - /* Get closest up-to-date node, because leaves go first: */ - for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) { - pcfs_rq = tg->cfs_rq[cpu_of(rq)]; - if (pcfs_rq->throttle_uptodate) - break; - } - if (tg) { - cfs_rq->throttle_count = pcfs_rq->throttle_count; - cfs_rq->throttled_clock_task = rq_clock_task(rq); - } - } - /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; @@ -4275,6 +4255,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) throttle_cfs_rq(cfs_rq); } +static void sync_throttle(struct task_group *tg, int cpu) +{ + struct cfs_rq *pcfs_rq, *cfs_rq; + + if (!cfs_bandwidth_used()) + return; + + if (!tg->parent) + return; + + cfs_rq = tg->cfs_rq[cpu]; + pcfs_rq = tg->parent->cfs_rq[cpu]; + + cfs_rq->throttle_count = pcfs_rq->throttle_count; + pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); +} + /* conditionally throttle active cfs_rq's from put_prev_entity() */ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -4414,6 +4411,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} +static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) @@ -8646,6 +8644,7 @@ void online_fair_sched_group(struct task_group *tg) raw_spin_lock_irq(&rq->lock); post_init_entity_util_avg(se); + sync_throttle(tg, i); raw_spin_unlock_irq(&rq->lock); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 28c42b789f70..f44da95c70cd 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -438,7 +438,7 @@ struct cfs_rq { u64 throttled_clock, throttled_clock_task; u64 throttled_clock_task_time; - int throttled, throttle_count, throttle_uptodate; + int throttled, throttle_count; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ From 9acacc2ac525ef1397af63b15cef7bb77a823c06 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Mon, 20 Jun 2016 17:37:18 +0800 Subject: [PATCH 17/20] sched/cpuacct: Merge cpuacct_usage_index and cpuacct_stat_index enums These two types have similar function, no need to separate them. Signed-off-by: Zhao Lei Cc: KOSAKI Motohiro Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/436748885270d64363c7dc67167507d486c2057a.1466415271.git.zhaolei@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 47 ++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 41f85c4d0938..74241eb5f3ff 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -25,15 +25,13 @@ enum cpuacct_stat_index { CPUACCT_STAT_NSTATS, }; -enum cpuacct_usage_index { - CPUACCT_USAGE_USER, /* ... user mode */ - CPUACCT_USAGE_SYSTEM, /* ... kernel mode */ - - CPUACCT_USAGE_NRUSAGE, +static const char * const cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", }; struct cpuacct_usage { - u64 usages[CPUACCT_USAGE_NRUSAGE]; + u64 usages[CPUACCT_STAT_NSTATS]; }; /* track cpu usage of a group of tasks and its child groups */ @@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) } static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, - enum cpuacct_usage_index index) + enum cpuacct_stat_index index) { struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); u64 data; /* - * We allow index == CPUACCT_USAGE_NRUSAGE here to read + * We allow index == CPUACCT_STAT_NSTATS here to read * the sum of suages. */ - BUG_ON(index > CPUACCT_USAGE_NRUSAGE); + BUG_ON(index > CPUACCT_STAT_NSTATS); #ifndef CONFIG_64BIT /* @@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, raw_spin_lock_irq(&cpu_rq(cpu)->lock); #endif - if (index == CPUACCT_USAGE_NRUSAGE) { + if (index == CPUACCT_STAT_NSTATS) { int i = 0; data = 0; - for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) data += cpuusage->usages[i]; } else { data = cpuusage->usages[index]; @@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) raw_spin_lock_irq(&cpu_rq(cpu)->lock); #endif - for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) cpuusage->usages[i] = val; #ifndef CONFIG_64BIT @@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) /* return total cpu usage (in nanoseconds) of a group */ static u64 __cpuusage_read(struct cgroup_subsys_state *css, - enum cpuacct_usage_index index) + enum cpuacct_stat_index index) { struct cpuacct *ca = css_ca(css); u64 totalcpuusage = 0; @@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css, static u64 cpuusage_user_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return __cpuusage_read(css, CPUACCT_USAGE_USER); + return __cpuusage_read(css, CPUACCT_STAT_USER); } static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM); + return __cpuusage_read(css, CPUACCT_STAT_SYSTEM); } static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE); + return __cpuusage_read(css, CPUACCT_STAT_NSTATS); } static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, @@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, } static int __cpuacct_percpu_seq_show(struct seq_file *m, - enum cpuacct_usage_index index) + enum cpuacct_stat_index index) { struct cpuacct *ca = css_ca(seq_css(m)); u64 percpu; @@ -229,24 +227,19 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m, static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) { - return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER); + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER); } static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) { - return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM); + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM); } static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) { - return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS); } -static const char * const cpuacct_stat_desc[] = { - [CPUACCT_STAT_USER] = "user", - [CPUACCT_STAT_SYSTEM] = "system", -}; - static int cpuacct_stats_show(struct seq_file *sf, void *v) { struct cpuacct *ca = css_ca(seq_css(sf)); @@ -316,11 +309,11 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; - int index = CPUACCT_USAGE_SYSTEM; + int index = CPUACCT_STAT_SYSTEM; struct pt_regs *regs = task_pt_regs(tsk); if (regs && user_mode(regs)) - index = CPUACCT_USAGE_USER; + index = CPUACCT_STAT_USER; rcu_read_lock(); From 8e546bfafb3121ed25c73a0c02311ec58459344a Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Mon, 20 Jun 2016 17:37:19 +0800 Subject: [PATCH 18/20] sched/cpuacct: Use loop to consolidate code in cpuacct_stats_show() In cpuacct_stats_show() we currently we have copies of similar code, for each cpustat(system/user) variant. Use a loop instead to consolidate the code. This will also work better if we extend the CPUACCT_STAT_NSTATS type. Signed-off-by: Zhao Lei Cc: KOSAKI Motohiro Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b0597d4224655e9f333f1a6224ed9654c7d7d36a.1466415271.git.zhaolei@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 74241eb5f3ff..677cd1ab33b7 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -243,27 +243,26 @@ static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) static int cpuacct_stats_show(struct seq_file *sf, void *v) { struct cpuacct *ca = css_ca(seq_css(sf)); + s64 val[CPUACCT_STAT_NSTATS]; int cpu; - s64 val = 0; + int stat; + memset(val, 0, sizeof(val)); for_each_possible_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_USER]; - val += kcpustat->cpustat[CPUTIME_NICE]; - } - val = cputime64_to_clock_t(val); - seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; - val = 0; - for_each_possible_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_SYSTEM]; - val += kcpustat->cpustat[CPUTIME_IRQ]; - val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; + val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; + val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; } - val = cputime64_to_clock_t(val); - seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { + seq_printf(sf, "%s %lld\n", + cpuacct_stat_desc[stat], + cputime64_to_clock_t(val[stat])); + } return 0; } From 277a13e4f0d661678a7084bf97ed96a99c7dac21 Mon Sep 17 00:00:00 2001 From: Zhao Lei Date: Mon, 20 Jun 2016 17:37:20 +0800 Subject: [PATCH 19/20] sched/cpuacct: Introduce cpuacct.usage_all to show all CPU stats together In current code, we can get cpuacct data from several files, but each file has various limitations. For example: - We can get CPU usage in user and kernel mode via cpuacct.stat, but we can't get detailed data about each CPU. - We can get each CPU's kernel mode usage in cpuacct.usage_percpu_sys, but we can't get user mode usage data at the same time. This patch introduces cpuacct.usage_all, to show all detailed CPU accounting data together: # cat cpuacct.usage_all cpu user system 0 3809760299 5807968992 1 3250329855 454612211 .. Signed-off-by: Zhao Lei Cc: KOSAKI Motohiro Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/7744460969edd7caaf0e903592ee52353ed9bdd6.1466415271.git.zhaolei@cn.fujitsu.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 677cd1ab33b7..bc0b309c3f19 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -240,6 +240,42 @@ static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS); } +static int cpuacct_all_seq_show(struct seq_file *m, void *V) +{ + struct cpuacct *ca = css_ca(seq_css(m)); + int index; + int cpu; + + seq_puts(m, "cpu"); + for (index = 0; index < CPUACCT_STAT_NSTATS; index++) + seq_printf(m, " %s", cpuacct_stat_desc[index]); + seq_puts(m, "\n"); + + for_each_possible_cpu(cpu) { + struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + + seq_printf(m, "%d", cpu); + + for (index = 0; index < CPUACCT_STAT_NSTATS; index++) { +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit + * platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); +#endif + + seq_printf(m, " %llu", cpuusage->usages[index]); + +#ifndef CONFIG_64BIT + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#endif + } + seq_puts(m, "\n"); + } + return 0; +} + static int cpuacct_stats_show(struct seq_file *sf, void *v) { struct cpuacct *ca = css_ca(seq_css(sf)); @@ -293,6 +329,10 @@ static struct cftype files[] = { .name = "usage_percpu_sys", .seq_show = cpuacct_percpu_sys_seq_show, }, + { + .name = "usage_all", + .seq_show = cpuacct_all_seq_show, + }, { .name = "stat", .seq_show = cpuacct_stats_show, From 748c7201e622d1c24abb4f85072d2e74d12f295f Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Fri, 3 Jun 2016 17:10:18 -0300 Subject: [PATCH 20/20] sched/core: Panic on scheduling while atomic bugs if kernel.panic_on_warn is set Currently, a schedule while atomic error prints the stack trace to the kernel log and the system continue running. Although it is possible to collect the kernel log messages and analyze it, often more information are needed. Furthermore, keep the system running is not always the best choice. For example, when the preempt count underflows the system will not stop to complain about scheduling while atomic, so the kernel log can wrap around overwriting the first stack trace, tuning the analysis even more challenging. This patch uses the kernel.panic_on_warn sysctl to help out on these more complex situations. When kernel.panic_on_warn is set to 1, the kernel will panic() in the schedule while atomic detection. The default value of the sysctl is 0, maintaining the current behavior. Signed-off-by: Daniel Bristot de Oliveira Reviewed-by: Luis Claudio R. Goncalves Cc: Christian Borntraeger Cc: Linus Torvalds Cc: Luis Claudio R. Goncalves Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e8f7b80f353aa22c63bd8557208163989af8493d.1464983675.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 28da50a5bc76..4e9617a7e7d9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3168,6 +3168,9 @@ static noinline void __schedule_bug(struct task_struct *prev) pr_cont("\n"); } #endif + if (panic_on_warn) + panic("scheduling while atomic\n"); + dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); }