From 3588a085cd52ef080bf72df772378e1ba6bb292f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 1 Feb 2008 17:45:13 +0100
Subject: [PATCH 1/7] hrtimer: fix hrtimer_init_sleeper() users

this patch:

 commit 37bb6cb4097e29ffee970065b74499cbf10603a3
 Author: Peter Zijlstra
 Date:   Fri Jan 25 21:08:32 2008 +0100

     hrtimer: unlock hrtimer_wakeup

broke hrtimer_init_sleeper() users. It forgot to fix up the futex
caller of this function to detect the failed queueing, and it messed
up the do_nanosleep() caller in that it could leak a
TASK_INTERRUPTIBLE state.

Signed-off-by: Peter Zijlstra
Signed-off-by: Ingo Molnar
---
 kernel/futex.c   | 2 ++
 kernel/hrtimer.c | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/kernel/futex.c b/kernel/futex.c
index db9824de8bf0..0edd314798c0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1252,6 +1252,8 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 		t.timer.expires = *abs_time;
 
 		hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS);
+		if (!hrtimer_active(&t.timer))
+			t.task = NULL;
 
 		/*
 		 * the timer could have already expired, in which
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index bd5d6b5060bc..1069998fe25f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1315,6 +1315,8 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
 
 	} while (t->task && !signal_pending(current));
 
+	__set_current_state(TASK_RUNNING);
+
 	return t->task == NULL;
 }
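
A minimal user-space sketch (not part of the patch; the file name and
build line are illustrative) of the case the futex.c hunk covers: a
FUTEX_WAIT with a ~1ns timeout makes it likely that the hrtimer has
already expired by the time it is started, which is exactly why t.task
must be cleared. On a fixed kernel this returns promptly with
ETIMEDOUT instead of sleeping forever:

/* build: gcc -o waitdemo waitdemo.c */
#define _GNU_SOURCE
#include <errno.h>
#include <linux/futex.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	int uval = 0;
	/* a timeout this short has usually expired before the
	 * hrtimer is even started - the case the fix covers */
	struct timespec ts = { .tv_sec = 0, .tv_nsec = 1 };

	/* val (0) matches *uaddr, so only the timer can end the wait */
	long ret = syscall(SYS_futex, &uval, FUTEX_WAIT, 0, &ts, NULL, 0);

	printf("ret=%ld errno=%s (ETIMEDOUT expected)\n",
	       ret, strerror(errno));
	return 0;
}
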
From 1001d0a9ee74a468077dfd4da0565174e88de26b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 1 Feb 2008 17:45:13 +0100
Subject: [PATCH 2/7] timekeeping: update xtime_cache when time(zone) changes

xtime_cache needs to be updated whenever xtime and/or wall_to_monotonic
are changed. Otherwise users of xtime_cache might see a stale (and in
the case of timezone changes utterly wrong) value until the next
update happens.

Fix up the obvious places which miss this update.

Signed-off-by: Thomas Gleixner
Acked-by: John Stultz
Tested-by: Dhaval Giani
Signed-off-by: Ingo Molnar
---
 include/linux/time.h      | 1 +
 kernel/time.c             | 1 +
 kernel/time/timekeeping.c | 6 ++++--
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/linux/time.h b/include/linux/time.h
index b04136d60a2f..ceaab9fff155 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -122,6 +122,7 @@ extern void monotonic_to_bootbased(struct timespec *ts);
 extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
 extern int timekeeping_is_continuous(void);
 extern void update_wall_time(void);
+extern void update_xtime_cache(u64 nsec);
 
 /**
  * timespec_to_ns - Convert timespec to nanoseconds
diff --git a/kernel/time.c b/kernel/time.c
index 09d3c45c4da7..4064c0566e77 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -129,6 +129,7 @@ static inline void warp_clock(void)
 	write_seqlock_irq(&xtime_lock);
 	wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
 	xtime.tv_sec += sys_tz.tz_minuteswest * 60;
+	update_xtime_cache(0);
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 }
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 092a2366b5a9..cd5dbc4579c9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -47,7 +47,7 @@ struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
 static unsigned long total_sleep_time;		/* seconds */
 
 static struct timespec xtime_cache __attribute__ ((aligned (16)));
-static inline void update_xtime_cache(u64 nsec)
+void update_xtime_cache(u64 nsec)
 {
 	xtime_cache = xtime;
 	timespec_add_ns(&xtime_cache, nsec);
@@ -145,6 +145,7 @@ int do_settimeofday(struct timespec *tv)
 
 	set_normalized_timespec(&xtime, sec, nsec);
 	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
+	update_xtime_cache(0);
 
 	clock->error = 0;
 	ntp_clear();
@@ -252,8 +253,8 @@ void __init timekeeping_init(void)
 	xtime.tv_nsec = 0;
 	set_normalized_timespec(&wall_to_monotonic,
 				-xtime.tv_sec, -xtime.tv_nsec);
+	update_xtime_cache(0);
 	total_sleep_time = 0;
-
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 }
@@ -290,6 +291,7 @@ static int timekeeping_resume(struct sys_device *dev)
 	}
 	/* Make sure that we have the correct xtime reference */
 	timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
+	update_xtime_cache(0);
 	/* re-base the last cycle value */
 	clock->cycle_last = clocksource_read(clock);
 	clock->error = 0;
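
A rough user-space probe (not part of the patch; needs root, best run
in a scratch VM, and whether a stale value is actually observable
depends on which interfaces read xtime_cache on the running kernel).
The first settimeofday() call with a timezone argument goes through
warp_clock(), one of the places fixed above:

/* build: gcc -o warpdemo warpdemo.c */
#include <stdio.h>
#include <sys/time.h>
#include <time.h>

int main(void)
{
	/* a non-zero tz_minuteswest makes warp_clock() shift xtime
	 * on the first settimeofday() call with a timezone argument */
	struct timezone tz = { .tz_minuteswest = 60, .tz_dsttime = 0 };
	struct timeval tv;

	if (settimeofday(NULL, &tz)) {
		perror("settimeofday");
		return 1;
	}
	gettimeofday(&tv, NULL);
	/* on a fixed kernel the two readings agree to within a second */
	printf("gettimeofday: %ld  time(): %ld\n",
	       (long)tv.tv_sec, (long)time(NULL));
	return 0;
}
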
From 5df7fa1c62146a0933767d040d400013310dbcc7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 1 Feb 2008 17:45:14 +0100
Subject: [PATCH 3/7] tick-sched: add more debug information

To allow better diagnosis of tick-sched related problems, especially
NOHZ related ones, we need to know when the last wakeup via an irq
happened and when the CPU left the idle state.

Add two fields (idle_waketime, idle_exittime) to the tick_sched
structure and add them to the timer_list output.

Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
---
 include/linux/tick.h     | 4 ++++
 kernel/time/tick-sched.c | 2 ++
 kernel/time/timer_list.c | 2 ++
 3 files changed, 8 insertions(+)

diff --git a/include/linux/tick.h b/include/linux/tick.h
index 0fadf95debe1..a881c652f7e9 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -39,6 +39,8 @@ enum tick_nohz_mode {
  * @idle_calls:		Total number of idle calls
  * @idle_sleeps:	Number of idle calls, where the sched tick was stopped
  * @idle_entrytime:	Time when the idle call was entered
+ * @idle_waketime:	Time when the idle was interrupted
+ * @idle_exittime:	Time when the idle state was left
  * @idle_sleeptime:	Sum of the time slept in idle with sched tick stopped
  * @sleep_length:	Duration of the current idle sleep
  */
@@ -53,6 +55,8 @@ struct tick_sched {
 	unsigned long			idle_sleeps;
 	int				idle_active;
 	ktime_t				idle_entrytime;
+	ktime_t				idle_waketime;
+	ktime_t				idle_exittime;
 	ktime_t				idle_sleeptime;
 	ktime_t				idle_lastupdate;
 	ktime_t				sleep_length;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 63f24b550695..88267f0a8471 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -137,6 +137,7 @@ void tick_nohz_update_jiffies(void)
 	cpu_clear(cpu, nohz_cpu_mask);
 	now = ktime_get();
+	ts->idle_waketime = now;
 
 	local_irq_save(flags);
 	tick_do_update_jiffies64(now);
@@ -400,6 +401,7 @@ void tick_nohz_restart_sched_tick(void)
 	 * Cancel the scheduled timer and restore the tick
 	 */
 	ts->tick_stopped = 0;
+	ts->idle_exittime = now;
 	hrtimer_cancel(&ts->sched_timer);
 	ts->sched_timer.expires = ts->idle_tick;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 12c5f4cb6b8c..d3d94c1a0fd2 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -166,6 +166,8 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
 		P(idle_calls);
 		P(idle_sleeps);
 		P_ns(idle_entrytime);
+		P_ns(idle_waketime);
+		P_ns(idle_exittime);
 		P_ns(idle_sleeptime);
 		P(last_jiffies);
 		P(next_jiffies);
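
Once the patch is applied, the two new fields show up in
/proc/timer_list. A trivial reader (not part of the patch, morally
equivalent to grep) that extracts just those lines:

/* build: gcc -o tickfields tickfields.c */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/timer_list", "r");

	if (!f) {
		perror("/proc/timer_list");
		return 1;
	}
	/* print only the two fields added by this patch */
	while (fgets(line, sizeof(line), f))
		if (strstr(line, "idle_waketime") ||
		    strstr(line, "idle_exittime"))
			fputs(line, stdout);
	fclose(f);
	return 0;
}
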
From 9d55b9923a1b7ea8193b8875c57ec940dc2ff027 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 1 Feb 2008 17:45:14 +0100
Subject: [PATCH 4/7] x86: replace LOCK_PREFIX in futex.h

The exception fixup for the futex macros __futex_atomic_op1/2 and
futex_atomic_cmpxchg_inatomic() is missing an entry when the lock
prefix is replaced by a NOP via SMP alternatives.

Chuck Ebbert tracked this down from the information provided in:
https://bugzilla.redhat.com/show_bug.cgi?id=429412

A possible solution would be to add another fixup after the
LOCK_PREFIX, so both the LOCK and NOP case have their own entry in the
exception table, but it's not really worth the trouble.

Simply replace LOCK_PREFIX with "lock;" and keep these instructions
untouched by SMP alternatives.

Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
---
 include/asm-x86/futex.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/asm-x86/futex.h b/include/asm-x86/futex.h
index 62828d63f1b1..9d919264923a 100644
--- a/include/asm-x86/futex.h
+++ b/include/asm-x86/futex.h
@@ -30,7 +30,7 @@
 "1:	movl	%2, %0\n			\
 	movl	%0, %3\n"			\
 	insn "\n"				\
-"2:	" LOCK_PREFIX "cmpxchgl %3, %2\n	\
+"2:	lock; cmpxchgl %3, %2\n			\
 	jnz	1b\n				\
 3:	.section .fixup,\"ax\"\n		\
 4:	mov	%5, %1\n			\
@@ -72,7 +72,7 @@ futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
 		__futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
 		break;
 	case FUTEX_OP_ADD:
-		__futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret, oldval,
+		__futex_atomic_op1("lock; xaddl %0, %2", ret, oldval,
 				   uaddr, oparg);
 		break;
 	case FUTEX_OP_OR:
@@ -111,8 +111,8 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval)
 		return -EFAULT;
 
 	__asm__ __volatile__(
-		"1:	" LOCK_PREFIX "cmpxchgl %3, %1	\n"
+		"1:	lock; cmpxchgl %3, %1		\n"
 		"2:	.section .fixup, \"ax\"		\n"
 		"3:	mov	%2, %0			\n"
 		"	jmp	2b			\n"
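
A hypothetical user-space trigger for the fixed path (not part of the
patch): FUTEX_WAKE_OP performs a locked read-modify-write on its
second futex word, so pointing uaddr2 at a read-only mapping forces
exactly the fault that the exception table entry must catch. With the
missing entry this could oops on kernels where SMP alternatives NOPed
out the lock prefix; on a fixed kernel it fails cleanly with EFAULT:

/* build: gcc -o efaultdemo efaultdemo.c */
#define _GNU_SOURCE
#include <errno.h>
#include <linux/futex.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int uval = 0;
	/* read-only page: the kernel-side "lock; xaddl" must fault here */
	int *ro = mmap(NULL, 4096, PROT_READ,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/* FUTEX_OP_ADD is one of the ops that used LOCK_PREFIX */
	long ret = syscall(SYS_futex, &uval, FUTEX_WAKE_OP, 1,
			   (void *)1UL /* nr_wake2 */, ro,
			   FUTEX_OP(FUTEX_OP_ADD, 1, FUTEX_OP_CMP_EQ, 0));

	printf("ret=%ld errno=%s (EFAULT expected)\n",
	       ret, strerror(errno));
	return 0;
}
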
From 83e96c604e781098a2f61b8a294919bcf3abfab4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 1 Feb 2008 17:45:14 +0100
Subject: [PATCH 5/7] futex: Remove warn on in return fixup path

The WARN_ON() in the fixup return path of futex_lock_pi() can trigger
with false positives.

The following scenario happens: t1 holds the futex and t2 and t3 are
blocked on the kernel-side rt_mutex. t1 releases the futex and assigns
t2 to be the next owner of the futex. t2 is interrupted and returns
without acquiring the rt_mutex, before t1 can release the rt_mutex.
t1 then releases the rt_mutex and t3 becomes the pending owner of the
rt_mutex.

t2 notices that it is the designated owner (user space variable) and
fails to acquire the rt_mutex via trylock, because it is not allowed
to steal the rt_mutex from t3. Now it looks at the rt_mutex pending
owner (t3) and assigns the futex and the pi_state to it.

During the fixup, t4 steals the rt_mutex from t3. t2 returns from the
fixup and the owner of the rt_mutex has changed from t3 to t4.

There is no need to do another round of fixups from t2. The important
part (t2 is not returning as the user space visible owner) is done.
The remaining fixups are done before either t3 or t4 return to user
space. For user space it is not relevant which task (t3 or t4) is the
real owner, as long as both are in the kernel, which is guaranteed by
the serialization of the hash bucket lock. Both tasks (whichever
returns first to user space - t4 because it locked the rt_mutex, or t3
due to a signal) go through the futex_lock_pi() return path, where the
ownership is fixed up before the return to user space.

Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
---
 kernel/futex.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 0edd314798c0..0006d64c448e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1537,9 +1537,6 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
 			owner = rt_mutex_owner(&q.pi_state->pi_mutex);
 			res = fixup_pi_state_owner(uaddr, &q, owner);
 
-			WARN_ON(rt_mutex_owner(&q.pi_state->pi_mutex) !=
-				owner);
-
 			/* propagate -EFAULT, if the fixup failed */
 			if (res)
 				ret = res;
From cd689985cf49f6ff5c8eddc48d98b9d581d9475d Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 1 Feb 2008 17:45:14 +0100
Subject: [PATCH 6/7] futex: Add bitset conditional wait/wakeup functionality

To allow the implementation of optimized rw-locks in user space, glibc
needs a way to select waiters for wakeup depending on a bitset mask.

This requires two new futex OPs: FUTEX_WAIT_BITSET and
FUTEX_WAKE_BITSET. These OPs are basically the same as FUTEX_WAIT and
FUTEX_WAKE, plus an additional argument - a bitset. Further, the
FUTEX_WAIT_BITSET OP expects an absolute timeout value instead of the
relative one which is used for the FUTEX_WAIT OP.

FUTEX_WAIT_BITSET calls into the kernel with a bitset. The bitset is
stored in the futex_q structure, which is used to enqueue the waiter
into the hashed futex waitqueue.

FUTEX_WAKE_BITSET also calls into the kernel with a bitset. The wakeup
function logically ANDs the bitset with the bitset stored in each
waiter's futex_q structure. If the result is zero (i.e. none of the
set bits in the bitsets match), then the waiter is not woken up. If
the result is non-zero (i.e. one of the set bits in the bitsets
matches), then the waiter is woken.

The bitset provided by the caller must be non-zero. If the provided
bitset is zero, the kernel returns EINVAL.

Internally the new OPs are only extensions of the existing FUTEX_WAIT
and FUTEX_WAKE functions. The existing OPs hand a bitset with all bits
set into the futex_wait() and futex_wake() functions.

Signed-off-by: Thomas Gleixner
Signed-off-by: Ingo Molnar
---
 include/linux/futex.h       | 10 ++++++++++
 include/linux/thread_info.h |  1 +
 kernel/futex.c              | 37 ++++++++++++++++++++++++++++-------
 kernel/futex_compat.c       |  3 ++-
 4 files changed, 43 insertions(+), 8 deletions(-)

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 1a15f8e237a7..90048fb28a38 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -21,6 +21,8 @@ union ktime;
 #define FUTEX_LOCK_PI		6
 #define FUTEX_UNLOCK_PI		7
 #define FUTEX_TRYLOCK_PI	8
+#define FUTEX_WAIT_BITSET	9
+#define FUTEX_WAKE_BITSET	10
 
 #define FUTEX_PRIVATE_FLAG	128
 #define FUTEX_CMD_MASK		~FUTEX_PRIVATE_FLAG
@@ -33,6 +35,8 @@ union ktime;
 #define FUTEX_LOCK_PI_PRIVATE	(FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG)
 #define FUTEX_UNLOCK_PI_PRIVATE	(FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG)
 #define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG)
+#define FUTEX_WAIT_BITSET_PRIVATE	(FUTEX_WAIT_BITSET | FUTEX_PRIVATE_FLAG)
+#define FUTEX_WAKE_BITSET_PRIVATE	(FUTEX_WAKE_BITSET | FUTEX_PRIVATE_FLAG)
 
 /*
  * Support for robust futexes: the kernel cleans up held futexes at
@@ -111,6 +115,12 @@ struct robust_list_head {
  */
 #define ROBUST_LIST_LIMIT	2048
 
+/*
+ * bitset with all bits set for the FUTEX_xxx_BITSET OPs to request a
+ * match of any bit.
+ */
+#define FUTEX_BITSET_MATCH_ANY	0xffffffff
+
 #ifdef __KERNEL__
 long do_futex(u32 __user *uaddr, int op, u32 val, union ktime *timeout,
 	      u32 __user *uaddr2, u32 val2, u32 val3);
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index dfbdfb9836f4..421323e5a2d6 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -23,6 +23,7 @@ struct restart_block {
 			u32 *uaddr;
 			u32 val;
 			u32 flags;
+			u32 bitset;
 			u64 time;
 		} futex;
 	};
diff --git a/kernel/futex.c b/kernel/futex.c
index 0006d64c448e..a6baaec44b8f 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -109,6 +109,9 @@ struct futex_q {
 	/* Optional priority inheritance state: */
 	struct futex_pi_state *pi_state;
 	struct task_struct *task;
+
+	/* Bitset for the optional bitmasked wakeup */
+	u32 bitset;
 };
 
 /*
@@ -722,7 +725,7 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
  * to this virtual address:
  */
 static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
-		      int nr_wake)
+		      int nr_wake, u32 bitset)
 {
 	struct futex_hash_bucket *hb;
 	struct futex_q *this, *next;
@@ -730,6 +733,9 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
 	union futex_key key;
 	int ret;
 
+	if (!bitset)
+		return -EINVAL;
+
 	futex_lock_mm(fshared);
 
 	ret = get_futex_key(uaddr, fshared, &key);
@@ -746,6 +752,11 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
 				ret = -EINVAL;
 				break;
 			}
+
+			/* Check if one of the bits is set in both bitsets */
+			if (!(this->bitset & bitset))
+				continue;
+
 			wake_futex(this);
 			if (++ret >= nr_wake)
 				break;
@@ -1156,7 +1167,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 static long futex_wait_restart(struct restart_block *restart);
 
 static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
-		      u32 val, ktime_t *abs_time)
+		      u32 val, ktime_t *abs_time, u32 bitset)
 {
 	struct task_struct *curr = current;
 	DECLARE_WAITQUEUE(wait, curr);
@@ -1167,7 +1178,11 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 	struct hrtimer_sleeper t;
 	int rem = 0;
 
+	if (!bitset)
+		return -EINVAL;
+
 	q.pi_state = NULL;
+	q.bitset = bitset;
  retry:
 	futex_lock_mm(fshared);
@@ -1295,6 +1310,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
 		restart->futex.uaddr = (u32 *)uaddr;
 		restart->futex.val = val;
 		restart->futex.time = abs_time->tv64;
+		restart->futex.bitset = bitset;
 		restart->futex.flags = 0;
 
 		if (fshared)
@@ -1321,7 +1337,8 @@ static long futex_wait_restart(struct restart_block *restart)
 	restart->fn = do_no_restart_syscall;
 	if (restart->futex.flags & FLAGS_SHARED)
 		fshared = &current->mm->mmap_sem;
-	return (long)futex_wait(uaddr, fshared, restart->futex.val, &t);
+	return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
+				restart->futex.bitset);
 }
 
 
@@ -1942,7 +1959,8 @@ retry:
 		 * PI futexes happens in exit_pi_state():
 		 */
 		if (!pi && (uval & FUTEX_WAITERS))
-			futex_wake(uaddr, &curr->mm->mmap_sem, 1);
+			futex_wake(uaddr, &curr->mm->mmap_sem, 1,
+				   FUTEX_BITSET_MATCH_ANY);
 	}
 	return 0;
 }
@@ -2042,10 +2060,14 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 
 	switch (cmd) {
 	case FUTEX_WAIT:
-		ret = futex_wait(uaddr, fshared, val, timeout);
+		val3 = FUTEX_BITSET_MATCH_ANY;
+	case FUTEX_WAIT_BITSET:
+		ret = futex_wait(uaddr, fshared, val, timeout, val3);
 		break;
 	case FUTEX_WAKE:
-		ret = futex_wake(uaddr, fshared, val);
+		val3 = FUTEX_BITSET_MATCH_ANY;
+	case FUTEX_WAKE_BITSET:
+		ret = futex_wake(uaddr, fshared, val, val3);
 		break;
 	case FUTEX_FD:
 		/* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
@@ -2085,7 +2107,8 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
 	u32 val2 = 0;
 	int cmd = op & FUTEX_CMD_MASK;
 
-	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) {
+	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
+		      cmd == FUTEX_WAIT_BITSET)) {
 		if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
 			return -EFAULT;
 		if (!timespec_valid(&ts))
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 0a43def6fee7..133d558db452 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -167,7 +167,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
 	int val2 = 0;
 	int cmd = op & FUTEX_CMD_MASK;
 
-	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) {
+	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
+		      cmd == FUTEX_WAIT_BITSET)) {
 		if (get_compat_timespec(&ts, utime))
 			return -EFAULT;
 		if (!timespec_valid(&ts))
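
A user-space sketch of the bitset semantics described above (not part
of the patch; the BITSET constants are spelled out in case the
installed headers predate this change): the waiter registers with
bitset 0x1, so a wakeup with the disjoint bitset 0x2 must not wake it,
while a wakeup with the overlapping bitset 0x1 must:

/* build: gcc -o bitsetdemo bitsetdemo.c -lpthread */
#define _GNU_SOURCE
#include <linux/futex.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef FUTEX_WAIT_BITSET
#define FUTEX_WAIT_BITSET	9
#define FUTEX_WAKE_BITSET	10
#endif

static int uval;

static void *waiter(void *arg)
{
	/* enqueue with bitset 0x1; a NULL timeout waits indefinitely */
	syscall(SYS_futex, &uval, FUTEX_WAIT_BITSET, 0, NULL, NULL, 0x1);
	puts("waiter: woken");
	return NULL;
}

int main(void)
{
	pthread_t t;
	long n;

	pthread_create(&t, NULL, waiter, NULL);
	sleep(1);	/* crude, but lets the waiter enqueue itself */

	/* 0x2 & 0x1 == 0: the waiter must not be woken */
	n = syscall(SYS_futex, &uval, FUTEX_WAKE_BITSET, 1, NULL, NULL, 0x2);
	printf("wake with 0x2 woke %ld (0 expected)\n", n);

	/* 0x1 & 0x1 != 0: the waiter is woken */
	n = syscall(SYS_futex, &uval, FUTEX_WAKE_BITSET, 1, NULL, NULL, 0x1);
	printf("wake with 0x1 woke %ld (1 expected)\n", n);

	pthread_join(t, NULL);
	return 0;
}
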
From aa7d93506cc26378be6964692cd0dd34cffaee25 Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Fri, 1 Feb 2008 17:45:14 +0100
Subject: [PATCH 7/7] latencytop: change Kconfig dependency

Change the latencytop Kconfig entry so it doesn't list the
architectures that support it. Instead introduce
HAVE_LATENCYTOP_SUPPORT, which any architecture can set. This should
reduce patch conflicts.

Cc: Arjan van de Ven
Cc: Martin Schwidefsky
Cc: Holger Wolf
Signed-off-by: Heiko Carstens
Signed-off-by: Ingo Molnar
---
 arch/x86/Kconfig  | 3 +++
 lib/Kconfig.debug | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 65b449134cf7..93e66678e158 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -44,6 +44,9 @@ config LOCKDEP_SUPPORT
 config STACKTRACE_SUPPORT
 	def_bool y
 
+config HAVE_LATENCYTOP_SUPPORT
+	def_bool y
+
 config SEMAPHORE_SLEEPERS
 	def_bool y
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 89f4035b526c..0d8a5a4a789d 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -581,7 +581,7 @@ config LATENCYTOP
 	select STACKTRACE
 	select SCHEDSTATS
 	select SCHED_DEBUG
-	depends on X86 || X86_64
+	depends on HAVE_LATENCYTOP_SUPPORT
 	help
 	  Enable this option if you want to use the LatencyTOP tool
 	  to find out which userspace is blocking on what kernel operations.
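
With this in place, any other architecture opts in by defining the
symbol in its own Kconfig. A hypothetical sketch of such an entry
(mirroring the x86 hunk above; not part of this patch):

config HAVE_LATENCYTOP_SUPPORT
	def_bool y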