linux/kernel
Stanislaw Gruszka 6e998916df sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency
Commit d670ec1317 "posix-cpu-timers: Cure SMP wobbles" fixes one glibc
test case in cost of breaking another one. After that commit, calling
clock_nanosleep(TIMER_ABSTIME, X) and then clock_gettime(&Y) can result
of Y time being smaller than X time.

Reproducer/tester can be found further below, it can be compiled and ran by:

	gcc -o tst-cpuclock2 tst-cpuclock2.c -pthread
	while ./tst-cpuclock2 ; do : ; done

This reproducer, when running on a buggy kernel, will complain
about "clock_gettime difference too small".

Issue happens because on start in thread_group_cputimer() we initialize
sum_exec_runtime of cputimer with threads runtime not yet accounted and
then add the threads runtime to running cputimer again on scheduler
tick, making it's sum_exec_runtime bigger than actual threads runtime.

KOSAKI Motohiro posted a fix for this problem, but that patch was never
applied: https://lkml.org/lkml/2013/5/26/191 .

This patch takes different approach to cure the problem. It calls
update_curr() when cputimer starts, that assure we will have updated
stats of running threads and on the next schedule tick we will account
only the runtime that elapsed from cputimer start. That also assure we
have consistent state between cpu times of individual threads and cpu
time of the process consisted by those threads.

Full reproducer (tst-cpuclock2.c):

	#define _GNU_SOURCE
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <stdio.h>
	#include <time.h>
	#include <pthread.h>
	#include <stdint.h>
	#include <inttypes.h>

	/* Parameters for the Linux kernel ABI for CPU clocks.  */
	#define CPUCLOCK_SCHED          2
	#define MAKE_PROCESS_CPUCLOCK(pid, clock) \
		((~(clockid_t) (pid) << 3) | (clockid_t) (clock))

	static pthread_barrier_t barrier;

	/* Help advance the clock.  */
	static void *chew_cpu(void *arg)
	{
		pthread_barrier_wait(&barrier);
		while (1) ;

		return NULL;
	}

	/* Don't use the glibc wrapper.  */
	static int do_nanosleep(int flags, const struct timespec *req)
	{
		clockid_t clock_id = MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED);

		return syscall(SYS_clock_nanosleep, clock_id, flags, req, NULL);
	}

	static int64_t tsdiff(const struct timespec *before, const struct timespec *after)
	{
		int64_t before_i = before->tv_sec * 1000000000ULL + before->tv_nsec;
		int64_t after_i = after->tv_sec * 1000000000ULL + after->tv_nsec;

		return after_i - before_i;
	}

	int main(void)
	{
		int result = 0;
		pthread_t th;

		pthread_barrier_init(&barrier, NULL, 2);

		if (pthread_create(&th, NULL, chew_cpu, NULL) != 0) {
			perror("pthread_create");
			return 1;
		}

		pthread_barrier_wait(&barrier);

		/* The test.  */
		struct timespec before, after, sleeptimeabs;
		int64_t sleepdiff, diffabs;
		const struct timespec sleeptime = {.tv_sec = 0,.tv_nsec = 100000000 };

		/* The relative nanosleep.  Not sure why this is needed, but its presence
		   seems to make it easier to reproduce the problem.  */
		if (do_nanosleep(0, &sleeptime) != 0) {
			perror("clock_nanosleep");
			return 1;
		}

		/* Get the current time.  */
		if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &before) < 0) {
			perror("clock_gettime[2]");
			return 1;
		}

		/* Compute the absolute sleep time based on the current time.  */
		uint64_t nsec = before.tv_nsec + sleeptime.tv_nsec;
		sleeptimeabs.tv_sec = before.tv_sec + nsec / 1000000000;
		sleeptimeabs.tv_nsec = nsec % 1000000000;

		/* Sleep for the computed time.  */
		if (do_nanosleep(TIMER_ABSTIME, &sleeptimeabs) != 0) {
			perror("absolute clock_nanosleep");
			return 1;
		}

		/* Get the time after the sleep.  */
		if (clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &after) < 0) {
			perror("clock_gettime[3]");
			return 1;
		}

		/* The time after sleep should always be equal to or after the absolute sleep
		   time passed to clock_nanosleep.  */
		sleepdiff = tsdiff(&sleeptimeabs, &after);
		if (sleepdiff < 0) {
			printf("absolute clock_nanosleep woke too early: %" PRId64 "\n", sleepdiff);
			result = 1;

			printf("Before %llu.%09llu\n", before.tv_sec, before.tv_nsec);
			printf("After  %llu.%09llu\n", after.tv_sec, after.tv_nsec);
			printf("Sleep  %llu.%09llu\n", sleeptimeabs.tv_sec, sleeptimeabs.tv_nsec);
		}

		/* The difference between the timestamps taken before and after the
		   clock_nanosleep call should be equal to or more than the duration of the
		   sleep.  */
		diffabs = tsdiff(&before, &after);
		if (diffabs < sleeptime.tv_nsec) {
			printf("clock_gettime difference too small: %" PRId64 "\n", diffabs);
			result = 1;
		}

		pthread_cancel(th);

		return result;
	}

Signed-off-by: Stanislaw Gruszka <sgruszka@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/20141112155843.GA24803@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-11-16 10:04:20 +01:00
..
bpf bpf: split eBPF out of NET 2014-10-27 19:09:59 -04:00
configs x86: Add "make tinyconfig" to configure the tiniest possible kernel 2014-08-08 16:30:24 -07:00
debug kdb: replace strnicmp with strncasecmp 2014-10-14 02:18:25 +02:00
events perf: Fix and clean up initialization of pmu::event_idx 2014-10-28 10:51:01 +01:00
gcov gcov: add ARM64 to GCOV_PROFILE_ALL 2014-10-29 16:33:14 -07:00
irq Merge branch 'for-3.18-consistent-ops' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu 2014-10-15 07:48:18 +02:00
locking Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip 2014-10-13 15:51:40 +02:00
power PM / Sleep: fix recovery during resuming from hibernation 2014-10-27 18:42:26 +01:00
printk Merge branch 'for-3.18-consistent-ops' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu 2014-10-15 07:48:18 +02:00
rcu rcu: Make rcu_barrier() understand about missing rcuo kthreads 2014-10-28 13:24:13 -07:00
sched sched/cputime: Fix clock_nanosleep()/clock_gettime() inconsistency 2014-11-16 10:04:20 +01:00
time sched/cputime: Fix cpu_timer_sample_group() double accounting 2014-11-16 10:04:18 +01:00
trace tracing/syscalls: Ignore numbers outside NR_syscalls' range 2014-10-30 20:58:38 -04:00
.gitignore
acct.c acct: eliminate compile warning 2014-10-09 22:26:04 -04:00
async.c kernel/async.c: switch to pr_foo() 2014-10-09 22:26:04 -04:00
audit_tree.c audit: rename audit_log_remove_rule to disambiguate for trees 2014-10-10 15:30:25 -04:00
audit_watch.c audit: invalid op= values for rules 2014-09-23 16:37:53 -04:00
audit.c Merge git://git.infradead.org/users/eparis/audit 2014-10-19 16:25:56 -07:00
audit.h audit: reduce scope of audit_log_fcaps 2014-09-23 16:37:51 -04:00
auditfilter.c Merge git://git.infradead.org/users/eparis/audit 2014-10-19 16:25:56 -07:00
auditsc.c Merge git://git.infradead.org/users/eparis/audit 2014-10-19 16:25:56 -07:00
backtracetest.c kernel/backtracetest.c: replace no level printk by pr_info() 2014-06-04 16:54:14 -07:00
bounds.c page-cgroup: get rid of NR_PCG_FLAGS 2014-08-08 15:57:18 -07:00
capability.c CAPABILITIES: remove undefined caps from all processes 2014-07-24 21:53:47 +10:00
cgroup_freezer.c cgroup: rename cgroup_subsys->base_cftypes to ->legacy_cftypes 2014-07-15 11:05:09 -04:00
cgroup.c Merge branch 'for-3.18' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu 2014-10-10 07:26:02 -04:00
compat.c compat: nanosleep: Clarify error handling 2014-09-06 12:58:18 +02:00
configs.c
context_tracking.c sched: stop the unbound recursion in preempt_schedule_context() 2014-10-28 10:46:05 +01:00
cpu_pm.c
cpu.c rcu: More on deadlock between CPU hotplug and expedited grace periods 2014-10-23 07:51:17 -07:00
cpuset.c Merge branch 'for-3.18' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup 2014-10-10 07:24:40 -04:00
crash_dump.c crash_dump: Make is_kdump_kernel() accessible from modules 2014-08-25 15:42:19 -07:00
cred.c
delayacct.c delayacct: Remove braindamaged type conversions 2014-07-23 10:18:06 -07:00
dma.c
elfcore.c
exec_domain.c kernel/exec_domain.c: code clean-up 2014-06-04 16:54:15 -07:00
exit.c Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip 2014-10-13 16:23:15 +02:00
extable.c
fork.c Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip 2014-10-13 16:23:15 +02:00
freezer.c freezer: remove obsolete comments in __thaw_task() 2014-10-21 23:44:20 +02:00
futex_compat.c
futex.c futex: Fix a race condition between REQUEUE_PI and task death 2014-10-26 16:16:18 +01:00
groups.c
hung_task.c kernel/hung_task.c: convert simple_strtoul to kstrtouint 2014-06-04 16:54:15 -07:00
irq_work.c Merge branch 'for-3.18-consistent-ops' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu 2014-10-15 07:48:18 +02:00
jump_label.c
kallsyms.c kernel/kallsyms.c: use __seq_open_private() 2014-10-14 02:18:16 +02:00
kcmp.c kcmp: fix standard comparison bug 2014-09-10 15:42:12 -07:00
Kconfig.freezer
Kconfig.hz
Kconfig.locks locking/rwsem: Add CONFIG_RWSEM_SPIN_ON_OWNER 2014-07-16 14:57:13 +02:00
Kconfig.preempt
kexec.c kexec: remove the unused function parameter 2014-10-14 02:18:21 +02:00
kmod.c kernel/kmod: fix use-after-free of the sub_info structure 2014-10-29 16:33:14 -07:00
kprobes.c kprobes: Skip kretprobe hit in NMI context to avoid deadlock 2014-08-08 10:38:04 +02:00
ksysfs.c
kthread.c kernel/kthread.c: partial revert of 81c98869fa ("kthread: ensure locality of task_struct allocations") 2014-10-09 22:25:51 -04:00
latencytop.c kernel/latencytop.c: convert seq_printf to seq_puts 2014-06-04 16:54:15 -07:00
Makefile bpf: split eBPF out of NET 2014-10-27 19:09:59 -04:00
module_signing.c
module-internal.h
module.c A single panic fix for a rare race, stable CC'd. 2014-10-18 10:24:26 -07:00
notifier.c
nsproxy.c namespaces: Use task_lock and not rcu to protect nsproxy 2014-07-29 18:08:50 -07:00
padata.c
panic.c panic: add TAINT_SOFTLOCKUP 2014-08-08 15:57:24 -07:00
params.c kernel/param: consolidate __{start,stop}___param[] in <linux/moduleparam.h> 2014-10-14 02:18:28 +02:00
pid_namespace.c
pid.c
profile.c kernel/profile.c: use static const char instead of static char 2014-06-06 16:08:13 -07:00
ptrace.c sched: Remove proliferation of wait_on_bit() action functions 2014-07-16 15:10:39 +02:00
range.c
reboot.c kernel: add support for kernel restart handler call chain 2014-09-26 00:00:06 -07:00
relay.c
res_counter.c kernel/res_counter.c: replace simple_strtoull by kstrtoull 2014-06-04 16:54:15 -07:00
resource.c x86: optimize resource lookups for ioremap 2014-10-14 02:18:22 +02:00
seccomp.c Merge branch 'x86-seccomp-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip 2014-10-14 02:27:06 +02:00
signal.c Merge branch 'signal-cleanup' of git://git.kernel.org/pub/scm/linux/kernel/git/rw/misc 2014-08-09 09:58:12 -07:00
smp.c Merge branch 'for-3.18-consistent-ops' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu 2014-10-15 07:48:18 +02:00
smpboot.c
smpboot.h
softirq.c Merge branch 'for-3.18-consistent-ops' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu 2014-10-15 07:48:18 +02:00
stacktrace.c
stop_machine.c kernel/stop_machine.c: kernel-doc warning fix 2014-06-04 16:54:15 -07:00
sys_ni.c Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next 2014-10-08 21:40:54 -04:00
sys.c Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip 2014-10-13 16:23:15 +02:00
sysctl_binary.c dmaengine-3.17 2014-10-07 20:39:25 -04:00
sysctl.c sched/fair: Fix division by zero sysctl_numa_balancing_scan_size 2014-10-28 10:46:04 +01:00
system_certificates.S
system_keyring.c KEYS: validate certificate trust only with builtin keys 2014-07-17 09:35:17 -04:00
task_work.c
taskstats.c scheduler: Replace __get_cpu_var with this_cpu_ptr 2014-08-26 13:45:45 -04:00
test_kprobes.c kernel/test_kprobes.c: use current logging functions 2014-08-08 15:57:18 -07:00
torture.c torture: Address race in module cleanup 2014-09-16 13:41:06 -07:00
tracepoint.c tracing: syscall_regfunc() should not skip kernel threads 2014-06-21 00:15:26 -04:00
tsacct.c sched: Make task->start_time nanoseconds based 2014-07-23 10:18:05 -07:00
uid16.c
up.c
user_namespace.c proc: constify seq_operations 2014-08-08 15:57:22 -07:00
user-return-notifier.c scheduler: Replace __get_cpu_var with this_cpu_ptr 2014-08-26 13:45:45 -04:00
user.c kernel/user.c: drop unused field 'files' from user_struct 2014-06-04 16:54:16 -07:00
utsname_sysctl.c sysctl: convert use of typedef ctl_table to struct ctl_table 2014-06-06 16:08:16 -07:00
utsname.c namespaces: Use task_lock and not rcu to protect nsproxy 2014-07-29 18:08:50 -07:00
watchdog.c Merge branch 'for-3.18-consistent-ops' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu 2014-10-15 07:48:18 +02:00
workqueue_internal.h
workqueue.c workqueue: Use cond_resched_rcu_qs macro 2014-10-06 05:58:26 -07:00