diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9a2849527dd7..7101ac64bb20 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -209,6 +209,7 @@ config X86
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select MMU_GATHER_RCU_TABLE_FREE	if PARAVIRT
+	select HAVE_POSIX_CPU_TIMERS_TASK_WORK
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
 	select HAVE_FUNCTION_ARG_ACCESS_API
diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h
index e3f0f8585da4..896c16d2c5fb 100644
--- a/include/linux/posix-timers.h
+++ b/include/linux/posix-timers.h
@@ -6,6 +6,7 @@
 #include <linux/list.h>
 #include <linux/alarmtimer.h>
 #include <linux/timerqueue.h>
+#include <linux/task_work.h>
 
 struct kernel_siginfo;
 struct task_struct;
@@ -125,6 +126,16 @@ struct posix_cputimers {
 	unsigned int		expiry_active;
 };
 
+/**
+ * posix_cputimers_work - Container for task work based posix CPU timer expiry
+ * @work:	The task work to be scheduled
+ * @scheduled:	@work has been scheduled already, no further processing
+ */
+struct posix_cputimers_work {
+	struct callback_head	work;
+	unsigned int		scheduled;
+};
+
 static inline void posix_cputimers_init(struct posix_cputimers *pct)
 {
 	memset(pct, 0, sizeof(*pct));
@@ -165,6 +176,12 @@ static inline void posix_cputimers_group_init(struct posix_cputimers *pct,
 					      u64 cpu_limit) { }
 #endif
 
+#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
+void posix_cputimers_init_work(void);
+#else
+static inline void posix_cputimers_init_work(void) { }
+#endif
+
 #define REQUEUE_PENDING 1
 
 /**
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 53ddc02e2e79..93ecd930efd3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -890,6 +890,10 @@ struct task_struct {
 	/* Empty if CONFIG_POSIX_CPUTIMERS=n */
 	struct posix_cputimers		posix_cputimers;
 
+#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
+	struct posix_cputimers_work	posix_cputimers_work;
+#endif
+
 	/* Process credentials: */
 
 	/* Tracer's credentials at attach: */
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index fcc42353f125..a09b1d61df6a 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -52,6 +52,15 @@ config GENERIC_CLOCKEVENTS_MIN_ADJUST
 config GENERIC_CMOS_UPDATE
 	bool
 
+# Select to handle posix CPU timers from task_work
+# and not from the timer interrupt context
+config HAVE_POSIX_CPU_TIMERS_TASK_WORK
+	bool
+
+config POSIX_CPU_TIMERS_TASK_WORK
+	bool
+	default y if POSIX_TIMERS && HAVE_POSIX_CPU_TIMERS_TASK_WORK
+
 if GENERIC_CLOCKEVENTS
 menu "Timers subsystem"
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 165117996ea0..a71758e34e45 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -377,6 +377,7 @@ static int posix_cpu_clock_get(const clockid_t clock, struct timespec64 *tp)
  */
 static int posix_cpu_timer_create(struct k_itimer *new_timer)
 {
+	static struct lock_class_key posix_cpu_timers_key;
 	struct pid *pid;
 
 	rcu_read_lock();
@@ -386,6 +387,17 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer)
 		return -EINVAL;
 	}
 
+	/*
+	 * If posix timer expiry is handled in task work context then
+	 * timer::it_lock can be taken without disabling interrupts as all
+	 * other locking happens in task context. This requires a separate
+	 * lock class key, otherwise regular posix timer expiry would record
+	 * the lock class being taken in interrupt context and generate a
+	 * false positive warning.
+	 */
+	if (IS_ENABLED(CONFIG_POSIX_CPU_TIMERS_TASK_WORK))
+		lockdep_set_class(&new_timer->it_lock, &posix_cpu_timers_key);
+
 	new_timer->kclock = &clock_posix_cpu;
 	timerqueue_init(&new_timer->it.cpu.node);
 	new_timer->it.cpu.pid = get_pid(pid);
@@ -1080,43 +1092,163 @@ static inline bool fastpath_timer_check(struct task_struct *tsk)
 	return false;
 }
 
-/*
- * This is called from the timer interrupt handler.  The irq handler has
- * already updated our counts.  We need to check if any timers fire now.
- * Interrupts are disabled.
- */
-void run_posix_cpu_timers(void)
+static void handle_posix_cpu_timers(struct task_struct *tsk);
+
+#ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK
+static void posix_cpu_timers_work(struct callback_head *work)
+{
+	handle_posix_cpu_timers(current);
+}
+
+/*
+ * Initialize posix CPU timers task work in init task. Out of line to
+ * keep the callback static and to avoid header recursion hell.
+ */
+void __init posix_cputimers_init_work(void)
+{
+	init_task_work(&current->posix_cputimers_work.work,
+		       posix_cpu_timers_work);
+}
+
+/*
+ * Note: All operations on tsk->posix_cputimers_work.scheduled happen
+ * either in hard interrupt context or in task context with interrupts
+ * disabled. Aside from that the writer/reader interaction is always in
+ * the context of the current task, which means they are strictly per CPU.
+ */
+static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
+{
+	return tsk->posix_cputimers_work.scheduled;
+}
+
+static inline void __run_posix_cpu_timers(struct task_struct *tsk)
+{
+	if (WARN_ON_ONCE(tsk->posix_cputimers_work.scheduled))
+		return;
+
+	/* Schedule task work to actually expire the timers */
+	tsk->posix_cputimers_work.scheduled = true;
+	task_work_add(tsk, &tsk->posix_cputimers_work.work, TWA_RESUME);
+}
+
+static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
+						unsigned long start)
+{
+	bool ret = true;
+
+	/*
+	 * On !RT kernels interrupts are disabled while collecting expired
+	 * timers, so no tick can happen and the fast path check can be
+	 * reenabled without further checks.
+	 */
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
+		tsk->posix_cputimers_work.scheduled = false;
+		return true;
+	}
+
+	/*
+	 * On RT enabled kernels ticks can happen while the expired timers
+	 * are collected under sighand lock. But any tick which observes
+	 * the CPUTIMERS_WORK_SCHEDULED bit set does not run the fastpath
+	 * checks. So reenabling the tick work has to be done carefully:
+	 *
+	 * Disable interrupts and run the fast path check if jiffies have
+	 * advanced since the collecting of expired timers started. If
+	 * jiffies have not advanced or the fast path check did not find
+	 * newly expired timers, reenable the fast path check in the timer
+	 * interrupt. If there are newly expired timers, return false and
+	 * let the collection loop repeat.
+	 */
+	local_irq_disable();
+	if (start != jiffies && fastpath_timer_check(tsk))
+		ret = false;
+	else
+		tsk->posix_cputimers_work.scheduled = false;
+	local_irq_enable();
+
+	return ret;
+}
+#else /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
+static inline void __run_posix_cpu_timers(struct task_struct *tsk)
+{
+	lockdep_posixtimer_enter();
+	handle_posix_cpu_timers(tsk);
+	lockdep_posixtimer_exit();
+}
+
+static inline bool posix_cpu_timers_work_scheduled(struct task_struct *tsk)
+{
+	return false;
+}
+
+static inline bool posix_cpu_timers_enable_work(struct task_struct *tsk,
+						unsigned long start)
+{
+	return true;
+}
+#endif /* CONFIG_POSIX_CPU_TIMERS_TASK_WORK */
+
+static void handle_posix_cpu_timers(struct task_struct *tsk)
 {
-	struct task_struct *tsk = current;
 	struct k_itimer *timer, *next;
-	unsigned long flags;
+	unsigned long flags, start;
 	LIST_HEAD(firing);
 
-	lockdep_assert_irqs_disabled();
-
-	/*
-	 * The fast path checks that there are no expired thread or thread
-	 * group timers.  If that's so, just return.
-	 */
-	if (!fastpath_timer_check(tsk))
+	if (!lock_task_sighand(tsk, &flags))
 		return;
 
-	lockdep_posixtimer_enter();
-	if (!lock_task_sighand(tsk, &flags)) {
-		lockdep_posixtimer_exit();
-		return;
-	}
-	/*
-	 * Here we take off tsk->signal->cpu_timers[N] and
-	 * tsk->cpu_timers[N] all the timers that are firing, and
-	 * put them on the firing list.
-	 */
-	check_thread_timers(tsk, &firing);
+	do {
+		/*
+		 * On RT locking sighand lock does not disable interrupts,
+		 * so this needs to be careful vs. ticks. Store the current
+		 * jiffies value.
+		 */
+		start = READ_ONCE(jiffies);
+		barrier();
 
-	check_process_timers(tsk, &firing);
+		/*
+		 * Here we take off tsk->signal->cpu_timers[N] and
+		 * tsk->cpu_timers[N] all the timers that are firing, and
+		 * put them on the firing list.
+		 */
+		check_thread_timers(tsk, &firing);
+
+		check_process_timers(tsk, &firing);
+
+		/*
+		 * The above timer checks have updated the expiry cache and
+		 * because nothing can have queued or modified timers after
+		 * sighand lock was taken above it is guaranteed to be
+		 * consistent. So the next timer interrupt fastpath check
+		 * will find valid data.
+		 *
+		 * If timer expiry runs in the timer interrupt context then
+		 * the loop is not relevant as timers will be directly
+		 * expired in interrupt context. The stub function below
+		 * always returns true, which allows the compiler to
+		 * optimize the loop out.
+		 *
+		 * If timer expiry is deferred to task work context then
+		 * the following rules apply:
+		 *
+		 * - On !RT kernels no tick can have happened on this CPU
+		 *   after sighand lock was acquired because interrupts are
+		 *   disabled. So reenabling task work before dropping
+		 *   sighand lock and reenabling interrupts is race free.
+		 *
+		 * - On RT kernels ticks might have happened but the tick
+		 *   work ignored posix CPU timer handling because the
+		 *   CPUTIMERS_WORK_SCHEDULED bit is set. Reenabling work
+		 *   must be done very carefully including a check whether
+		 *   ticks have happened since the start of the timer
+		 *   expiry checks. posix_cpu_timers_enable_work() takes
+		 *   care of that and eventually lets the expiry checks
+		 *   run again.
+		 */
+	} while (!posix_cpu_timers_enable_work(tsk, start));
 
 	/*
-	 * We must release these locks before taking any timer's lock.
+	 * We must release sighand lock before taking any timer's lock.
 	 * There is a potential race with timer deletion here, as the
 	 * siglock now protects our private firing list.  We have set
 	 * the firing flag in each timer, so that a deletion attempt
@@ -1134,6 +1266,13 @@ void run_posix_cpu_timers(void)
 	list_for_each_entry_safe(timer, next, &firing, it.cpu.elist) {
 		int cpu_firing;
 
+		/*
+		 * spin_lock() is sufficient here even independent of the
+		 * expiry context. If expiry happens in hard interrupt
+		 * context it's obvious. For task work context it's safe
+		 * because all other operations on timer::it_lock happen in
+		 * task context (syscall or exit).
+		 */
 		spin_lock(&timer->it_lock);
 		list_del_init(&timer->it.cpu.elist);
 		cpu_firing = timer->it.cpu.firing;
@@ -1147,7 +1286,34 @@ void run_posix_cpu_timers(void)
 		cpu_timer_fire(timer);
 		spin_unlock(&timer->it_lock);
 	}
-	lockdep_posixtimer_exit();
+}
+
+/*
+ * This is called from the timer interrupt handler. The irq handler has
+ * already updated our counts. We need to check if any timers fire now.
+ * Interrupts are disabled.
+ */
+void run_posix_cpu_timers(void)
+{
+	struct task_struct *tsk = current;
+
+	lockdep_assert_irqs_disabled();
+
+	/*
+	 * If the actual expiry is deferred to task work context and the
+	 * work is already scheduled there is no point in doing anything here.
+	 */
+	if (posix_cpu_timers_work_scheduled(tsk))
+		return;
+
+	/*
+	 * The fast path checks that there are no expired thread or thread
+	 * group timers. If that's so, just return.
+	 */
+	if (!fastpath_timer_check(tsk))
+		return;
+
+	__run_posix_cpu_timers(tsk);
 }
 
 /*
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index ae5029f984a8..a16764b0116e 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -2017,6 +2017,7 @@ static void __init init_timer_cpus(void)
 void __init init_timers(void)
 {
 	init_timer_cpus();
+	posix_cputimers_init_work();
 	open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
 }
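
For illustration only, not part of the patch above: the timers whose expiry is deferred to task_work here are ordinary POSIX CPU-time timers created with the standard timer_create()/timer_settime() API. The minimal userspace sketch below arms a thread CPU-time timer and spins until it fires; the file name, signal choice and the 100ms expiry value are arbitrary, and the behaviour seen from userspace is unchanged by this patch since only the kernel-side expiry context moves out of hard interrupt context.

/*
 * cputimer_demo.c - hypothetical userspace sketch, not part of the patch.
 * Build: gcc -O2 -o cputimer_demo cputimer_demo.c   (older glibc needs -lrt)
 */
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

static volatile sig_atomic_t fired;

static void on_timer(int sig)
{
	(void)sig;
	fired = 1;
}

int main(void)
{
	struct sigaction sa;
	struct sigevent sev;
	struct itimerspec its;
	timer_t timerid;

	/* Deliver SIGALRM when the timer expires */
	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = on_timer;
	if (sigaction(SIGALRM, &sa, NULL))
		return 1;

	memset(&sev, 0, sizeof(sev));
	sev.sigev_notify = SIGEV_SIGNAL;
	sev.sigev_signo = SIGALRM;

	/* Clock measuring CPU time consumed by this thread */
	if (timer_create(CLOCK_THREAD_CPUTIME_ID, &sev, &timerid))
		return 1;

	/* One-shot expiry after 100ms of consumed CPU time */
	memset(&its, 0, sizeof(its));
	its.it_value.tv_nsec = 100 * 1000 * 1000;
	if (timer_settime(timerid, 0, &its, NULL))
		return 1;

	/* Burn CPU until the timer fires; the kernel-side expiry path is
	 * what the patch above moves from the tick into task_work. */
	while (!fired)
		;

	printf("thread CPU timer expired\n");
	timer_delete(timerid);
	return 0;
}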