From 1c3e826482ab698e418c7a894440e62c76aac893 Mon Sep 17 00:00:00 2001
From: Sha Zhengju
Date: Wed, 20 Feb 2013 17:14:38 +0800
Subject: [PATCH 1/6] sched/core: Remove the obsolete and unused nr_uninterruptible() function

Signed-off-by: Sha Zhengju
Cc: Peter Zijlstra
Link: http://lkml.kernel.org/r/1361351678-8065-1-git-send-email-handai.szj@taobao.com
Signed-off-by: Ingo Molnar
---
 include/linux/sched.h |  1 -
 kernel/sched/core.c   | 22 ++--------------------
 2 files changed, 2 insertions(+), 21 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 33cc42130371..f9ca237df7e8 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -98,7 +98,6 @@ extern int nr_threads;
 DECLARE_PER_CPU(unsigned long, process_counts);
 extern int nr_processes(void);
 extern unsigned long nr_running(void);
-extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_iowait(void);
 extern unsigned long nr_iowait_cpu(int cpu);
 extern unsigned long this_cpu_load(void);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 03d7784b7bd2..b7b03cd2d4cd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1969,11 +1969,10 @@ context_switch(struct rq *rq, struct task_struct *prev,
 }
 
 /*
- * nr_running, nr_uninterruptible and nr_context_switches:
+ * nr_running and nr_context_switches:
  *
  * externally visible scheduler statistics: current number of runnable
- * threads, current number of uninterruptible-sleeping threads, total
- * number of context switches performed since bootup.
+ * threads, total number of context switches performed since bootup.
  */
 unsigned long nr_running(void)
 {
@@ -1985,23 +1984,6 @@ unsigned long nr_running(void)
 	return sum;
 }
 
-unsigned long nr_uninterruptible(void)
-{
-	unsigned long i, sum = 0;
-
-	for_each_possible_cpu(i)
-		sum += cpu_rq(i)->nr_uninterruptible;
-
-	/*
-	 * Since we read the counters lockless, it might be slightly
-	 * inaccurate. Do not allow it to go below zero though:
-	 */
-	if (unlikely((long)sum < 0))
-		sum = 0;
-
-	return sum;
-}
-
 unsigned long long nr_context_switches(void)
 {
 	int i;
From cb152ff26717961b10d0888cd983ba284cb99cd1 Mon Sep 17 00:00:00 2001
From: Nathan Zimmer
Date: Thu, 21 Feb 2013 15:15:08 -0800
Subject: [PATCH 2/6] sched: Fix /proc/sched_stat failure on very very large systems

On systems with 4096 cores, doing a cat of /proc/schedstat fails because
we try to push all of the data into a single kmalloc'd buffer, and on
these very large machines all of the data will not fit in 4 MB.

A better solution is not to use the single_open() mechanism but to
provide our own seq_operations and emit one record at a time. The output
is identical to the previous version, so the version number does not
need to change.

Reported-by: Dave Jones
Signed-off-by: Nathan Zimmer
Cc: Peter Zijlstra
Cc: Wu Fengguang
[ Fix memleak]
[ Fix spello in comment]
[ Fix warnings]
Signed-off-by: Andrew Morton
Signed-off-by: Ingo Molnar
---
 kernel/sched/stats.c | 79 +++++++++++++++++++++++++++++++++-----------
 1 file changed, 59 insertions(+), 20 deletions(-)

diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 903ffa9e8872..e036eda1a9c9 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -21,14 +21,17 @@ static int show_schedstat(struct seq_file *seq, void *v)
 	if (mask_str == NULL)
 		return -ENOMEM;
 
-	seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
-	seq_printf(seq, "timestamp %lu\n", jiffies);
-	for_each_online_cpu(cpu) {
-		struct rq *rq = cpu_rq(cpu);
+	if (v == (void *)1) {
+		seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+		seq_printf(seq, "timestamp %lu\n", jiffies);
+	} else {
+		struct rq *rq;
 #ifdef CONFIG_SMP
 		struct sched_domain *sd;
 		int dcount = 0;
 #endif
+		cpu = (unsigned long)(v - 2);
+		rq = cpu_rq(cpu);
 
 		/* runqueue-specific stats */
 		seq_printf(seq,
@@ -77,30 +80,66 @@ static int show_schedstat(struct seq_file *seq, void *v)
 	return 0;
 }
 
+/*
+ * This iterator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *schedstat_start(struct seq_file *file, loff_t *offset)
+{
+	unsigned long n = *offset;
+
+	if (n == 0)
+		return (void *) 1;
+
+	n--;
+
+	if (n > 0)
+		n = cpumask_next(n - 1, cpu_online_mask);
+	else
+		n = cpumask_first(cpu_online_mask);
+
+	*offset = n + 1;
+
+	if (n < nr_cpu_ids)
+		return (void *)(unsigned long)(n + 2);
+	return NULL;
+}
+
+static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
+{
+	(*offset)++;
+	return schedstat_start(file, offset);
+}
+
+static void schedstat_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations schedstat_sops = {
+	.start = schedstat_start,
+	.next = schedstat_next,
+	.stop = schedstat_stop,
+	.show = show_schedstat,
+};
+
 static int schedstat_open(struct inode *inode, struct file *file)
 {
-	unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
-	char *buf = kmalloc(size, GFP_KERNEL);
-	struct seq_file *m;
-	int res;
-
-	if (!buf)
-		return -ENOMEM;
-	res = single_open(file, show_schedstat, NULL);
-	if (!res) {
-		m = file->private_data;
-		m->buf = buf;
-		m->size = size;
-	} else
-		kfree(buf);
-	return res;
+	return seq_open(file, &schedstat_sops);
 }
 
+static int schedstat_release(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
 static const struct file_operations proc_schedstat_operations = {
 	.open    = schedstat_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
-	.release = single_release,
+	.release = schedstat_release,
 };
 
 static int __init proc_schedstat_init(void)
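This patch and the next one lean on the same seq_file iterator trick, so a
compact illustration may help. The standalone C sketch below is illustrative
only: demo_start(), demo_show(), and the hole-free NR_CPUS cpu set are
inventions of this note, and the real iterators use cpumask_first()/
cpumask_next() to skip offline cpus. It mimics how seq_read() drives the
.start/.show/.next callbacks and how the position encoding works: offset 0
yields the header token 1, and cpu n is encoded as the pointer value n + 2.

#include <stdio.h>

#define NR_CPUS 4       /* pretend cpus 0-3 are online, with no holes */

/* Simplified mirror of schedstat_start(): 1 = header, n + 2 = cpu n. */
static void *demo_start(long long *pos)
{
        unsigned long n = *pos;

        if (n == 0)
                return (void *)1;       /* first call: emit the header */

        n--;                            /* translate position to a 0-based cpu index */
        *pos = n + 1;                   /* remember where to resume next time */
        if (n < NR_CPUS)
                return (void *)(n + 2); /* record for cpu n */
        return NULL;                    /* past the last cpu: stop iterating */
}

static void demo_show(void *v)
{
        if (v == (void *)1)
                printf("version/timestamp header\n");
        else
                printf("stats for cpu%ld\n", (long)v - 2);
}

int main(void)
{
        long long pos = 0;
        void *v;

        /* seq_read() drives .start/.show/.next in essentially this loop */
        for (v = demo_start(&pos); v != NULL; pos++, v = demo_start(&pos))
                demo_show(v);
        return 0;
}

Running this prints the header record followed by one line per cpu, which is
exactly the shape of output the reworked /proc file produces; the payoff is
that each record is formatted into the seq_file buffer on demand instead of
all at once into a single huge allocation.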
From bbbfeac92beff40eb86c7f682a7f1395f9f0ae52 Mon Sep 17 00:00:00 2001
From: Nathan Zimmer
Date: Thu, 21 Feb 2013 15:15:09 -0800
Subject: [PATCH 3/6] sched: Fix /proc/sched_debug failure on very very large systems

On systems with 4096 cores, attempting to read /proc/sched_debug fails
because we try to push all of the data into a single kmalloc'd buffer,
and on these very large machines all of the data will not fit in 4 MB.

A better solution is not to use the single_open() mechanism but to
provide our own seq_operations and treat each cpu as an individual
record. The output is identical to the previous version.

Reported-by: Dave Jones
Signed-off-by: Nathan Zimmer
Cc: Peter Zijlstra
[ Whitespace fixlet]
[ Fix spello in comment]
Signed-off-by: Andrew Morton
Signed-off-by: Ingo Molnar
---
 kernel/sched/debug.c | 92 ++++++++++++++++++++++++++++++++++++++------
 1 file changed, 80 insertions(+), 12 deletions(-)

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 7ae4c4c5420e..c496eb3c6459 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -269,11 +269,11 @@ static void print_cpu(struct seq_file *m, int cpu)
 	{
 		unsigned int freq = cpu_khz ? : 1;
 
-		SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
+		SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
 			   cpu, freq / 1000, (freq % 1000));
 	}
 #else
-	SEQ_printf(m, "\ncpu#%d\n", cpu);
+	SEQ_printf(m, "cpu#%d\n", cpu);
 #endif
 
 #define P(x) \
@@ -330,6 +330,7 @@ do { \
 	print_rq(m, rq, cpu);
 	rcu_read_unlock();
 	spin_unlock_irqrestore(&sched_debug_lock, flags);
+	SEQ_printf(m, "\n");
 }
 
 static const char *sched_tunable_scaling_names[] = {
@@ -338,11 +339,10 @@ static const char *sched_tunable_scaling_names[] = {
 	"linear"
 };
 
-static int sched_debug_show(struct seq_file *m, void *v)
+static void sched_debug_header(struct seq_file *m)
 {
 	u64 ktime, sched_clk, cpu_clk;
 	unsigned long flags;
-	int cpu;
 
 	local_irq_save(flags);
 	ktime = ktime_to_ns(ktime_get());
@@ -384,33 +384,101 @@ static int sched_debug_show(struct seq_file *m, void *v)
 #undef PN
 #undef P
 
-	SEQ_printf(m, "  .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
+	SEQ_printf(m, "  .%-40s: %d (%s)\n",
+		"sysctl_sched_tunable_scaling",
 		sysctl_sched_tunable_scaling,
 		sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
-
-	for_each_online_cpu(cpu)
-		print_cpu(m, cpu);
-
 	SEQ_printf(m, "\n");
+}
+
+static int sched_debug_show(struct seq_file *m, void *v)
+{
+	int cpu = (unsigned long)(v - 2);
+
+	if (cpu != -1)
+		print_cpu(m, cpu);
+	else
+		sched_debug_header(m);
 
 	return 0;
 }
 
 void sysrq_sched_debug_show(void)
 {
-	sched_debug_show(NULL, NULL);
+	int cpu;
+
+	sched_debug_header(NULL);
+	for_each_online_cpu(cpu)
+		print_cpu(NULL, cpu);
+
+}
+
+/*
+ * This iterator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *sched_debug_start(struct seq_file *file, loff_t *offset)
+{
+	unsigned long n = *offset;
+
+	if (n == 0)
+		return (void *) 1;
+
+	n--;
+
+	if (n > 0)
+		n = cpumask_next(n - 1, cpu_online_mask);
+	else
+		n = cpumask_first(cpu_online_mask);
+
+	*offset = n + 1;
+
+	if (n < nr_cpu_ids)
+		return (void *)(unsigned long)(n + 2);
+	return NULL;
+}
+
+static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
+{
+	(*offset)++;
+	return sched_debug_start(file, offset);
+}
+
+static void sched_debug_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations sched_debug_sops = {
+	.start = sched_debug_start,
+	.next = sched_debug_next,
+	.stop = sched_debug_stop,
+	.show = sched_debug_show,
+};
+
+static int sched_debug_release(struct inode *inode, struct file *file)
+{
+	seq_release(inode, file);
+
+	return 0;
 }
 
 static int sched_debug_open(struct inode *inode, struct file *filp)
 {
-	return single_open(filp, sched_debug_show, NULL);
+	int ret = 0;
+
+	ret = seq_open(filp, &sched_debug_sops);
+
+	return ret;
 }
 
 static const struct file_operations sched_debug_fops = {
 	.open		= sched_debug_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= single_release,
+	.release	= sched_debug_release,
 };
 
 static int __init init_sched_debug_procfs(void)
From 45ebd3945b2a3cf4eb89d5fb0090a3cb71af7973 Mon Sep 17 00:00:00 2001
From: Clark Williams
Date: Wed, 20 Feb 2013 09:19:09 -0600
Subject: [PATCH 4/6] sched: Move RR_TIMESLICE from sysctl.h to rt.h

This fixes an ia64 build bug reported by Tony Luck.

Reported-by: Tony Luck
Signed-off-by: Clark Williams
Cc: Peter Zijlstra
Cc: Steven Rostedt
Link: http://lkml.kernel.org/r/1361373550-4011-2-git-send-email-clark.williams@gmail.com
Signed-off-by: Ingo Molnar
---
 include/linux/sched/rt.h     | 6 ++++++
 include/linux/sched/sysctl.h | 6 ------
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 94e19ea28fc3..440434df3627 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -55,4 +55,10 @@ static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
 
 extern void normalize_rt_tasks(void);
 
+/*
+ * default timeslice is 100 msecs (used only for SCHED_RR tasks).
+ * Timeslices get refilled after they expire.
+ */
+#define RR_TIMESLICE (100 * HZ / 1000)
+
 #endif /* _SCHED_RT_H */
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index d2bb0ae979d0..bf8086b2506e 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -91,12 +91,6 @@ extern unsigned int sysctl_sched_cfs_bandwidth_slice;
 extern unsigned int sysctl_sched_autogroup_enabled;
 #endif
 
-/*
- * default timeslice is 100 msecs (used only for SCHED_RR tasks).
- * Timeslices get refilled after they expire.
- */
-#define RR_TIMESLICE (100 * HZ / 1000)
-
 extern int sched_rr_timeslice;
 
 extern int sched_rr_handler(struct ctl_table *table, int write,
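For anyone double-checking the macro this patch moves: RR_TIMESLICE is
100 ms expressed in jiffies, so its numeric value depends on CONFIG_HZ.
A throwaway userspace sketch (the HZ value here is an assumption picked
for the example, not anything the patch mandates):

#include <stdio.h>

#define HZ 250                          /* assume CONFIG_HZ=250 */
#define RR_TIMESLICE (100 * HZ / 1000)  /* same definition as the patch */

int main(void)
{
        /* 100 * 250 / 1000 = 25 ticks; at 4 ms per tick that is 100 ms */
        printf("RR_TIMESLICE = %d jiffies\n", RR_TIMESLICE);
        return 0;
}

At HZ=1000 the macro evaluates to 100 jiffies, at HZ=250 to 25, and at
HZ=100 to 10; in every case it represents the same 100 ms of wall time.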
From c78a4bcd1a879b39fb7646c887b0c195f1018909 Mon Sep 17 00:00:00 2001
From: Li Zhong
Date: Sat, 23 Feb 2013 17:28:44 +0100
Subject: [PATCH 5/6] cputime: Constify timeval_to_cputime(timeval) argument

Saw the following compiler warning on the linux-next tree:

  kernel/itimer.c: In function 'set_cpu_itimer':
  kernel/itimer.c:152:2: warning: passing argument 1 of 'timeval_to_cputime' discards 'const' qualifier from pointer target type [enabled by default]
  ...

timeval_to_cputime() is always passed a const timeval argument, so the
nsecs-based cputime implementation needs to be taught about that.

Signed-off-by: Li Zhong
Signed-off-by: Frederic Weisbecker
Cc: Steven Rostedt
Cc: Kevin Hilman
Link: http://lkml.kernel.org/r/1361636925-22288-2-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar
---
 include/asm-generic/cputime_nsecs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/asm-generic/cputime_nsecs.h b/include/asm-generic/cputime_nsecs.h
index b6485cafb7bd..a8ece9a33aef 100644
--- a/include/asm-generic/cputime_nsecs.h
+++ b/include/asm-generic/cputime_nsecs.h
@@ -76,7 +76,7 @@ static inline void cputime_to_timespec(const cputime_t ct, struct timespec *val)
 /*
  * Convert cputime <-> timeval (msec)
  */
-static inline cputime_t timeval_to_cputime(struct timeval *val)
+static inline cputime_t timeval_to_cputime(const struct timeval *val)
 {
 	u64 ret = val->tv_sec * NSEC_PER_SEC + val->tv_usec * NSEC_PER_USEC;
 	return (__force cputime_t) ret;

From 7f6575f1fb963d5231afbceecd3feadb6ab58cd3 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Sat, 23 Feb 2013 17:28:45 +0100
Subject: [PATCH 6/6] cputime: Use local_clock() for full dynticks cputime accounting

Running the full dynticks cputime accounting with preemptible kernel
debugging enabled triggers the following warning:

	[    4.488303] BUG: using smp_processor_id() in preemptible [00000000] code: init/1
	[    4.490971] caller is native_sched_clock+0x22/0x80
	[    4.493663] Pid: 1, comm: init Not tainted 3.8.0+ #13
	[    4.496376] Call Trace:
	[    4.498996]  [] debug_smp_processor_id+0xdb/0xf0
	[    4.501716]  [] native_sched_clock+0x22/0x80
	[    4.504434]  [] sched_clock+0x9/0x10
	[    4.507185]  [] fetch_task_cputime+0xad/0x120
	[    4.509916]  [] task_cputime+0x35/0x60
	[    4.512622]  [] acct_update_integrals+0x1e/0x40
	[    4.515372]  [] do_execve_common+0x4ff/0x5c0
	[    4.518117]  [] ? do_execve_common+0x144/0x5c0
	[    4.520844]  [] ? rest_init+0x160/0x160
	[    4.523554]  [] do_execve+0x37/0x40
	[    4.526276]  [] run_init_process+0x23/0x30
	[    4.528953]  [] kernel_init+0x9c/0xf0
	[    4.531608]  [] ret_from_fork+0x7c/0xb0

We use sched_clock() to perform and fix up the cputime accounting.
However, we are calling it with preemption enabled from the read side,
which triggers the warning above.

To fix this, use local_clock() instead. It takes care of preemption and
also provides a more reliable clock source. This is welcome for this
kind of statistic, which userspace relies on widely.

Reported-by: Thomas Gleixner
Reported-by: Ingo Molnar
Suggested-by: Thomas Gleixner
Signed-off-by: Frederic Weisbecker
Cc: Li Zhong
Cc: Peter Zijlstra
Cc: Steven Rostedt
Cc: Kevin Hilman
Link: http://lkml.kernel.org/r/1361636925-22288-3-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar
---
 kernel/sched/cputime.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 9857329ed280..ed12cbb135f4 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -604,7 +604,7 @@ static unsigned long long vtime_delta(struct task_struct *tsk)
 {
 	unsigned long long clock;
 
-	clock = sched_clock();
+	clock = local_clock();
 	if (clock < tsk->vtime_snap)
 		return 0;
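A closing note on the last patch: the reason local_clock() is safe where
a raw sched_clock() call is not comes down to preemption. The sketch
below is a rough approximation of the idea, not the actual kernel
implementation; my_local_clock() is a hypothetical name, and the real
local_clock() lives in kernel/sched/clock.c and additionally copes with
unstable sched_clock sources.

#include <linux/preempt.h>
#include <linux/sched.h>

/*
 * Illustrative only: sched_clock() may read a per-cpu counter via
 * smp_processor_id(), so the caller must not migrate to another cpu
 * between reading the clock and using the value.
 */
static inline u64 my_local_clock(void)
{
        u64 now;

        preempt_disable();      /* pin this task to the current cpu */
        now = sched_clock();    /* the per-cpu clock read is now coherent */
        preempt_enable();

        return now;
}

local_clock() performs this kind of preemption-safe read internally,
which is why vtime_delta() can call it from a preemptible read-side
context without tripping the smp_processor_id() debug check shown in
the changelog.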