futex: Prevent inconsistent state and exit race
The recent rework of the requeue PI code introduced a possibility for
going back to user space in inconsistent state:
CPU 0 CPU 1
requeue_futex()
if (lock_pifutex_user()) {
dequeue_waiter();
wake_waiter(task);
sched_in(task);
return_from_futex_syscall();
---> Inconsistent state because PI state is not established
It becomes worse if the woken up task immediately exits:
sys_exit();
attach_pistate(vpid); <--- FAIL
Attach the pi state before dequeuing and waking the waiter. If the waiter
gets a spurious wakeup before the dequeue operation it will wait in
futex_requeue_pi_wakeup_sync() and therefore cannot return and exit.
Fixes: 07d91ef510
("futex: Prevent requeue_pi() lock nesting issue on RT")
Reported-by: syzbot+4d1bd0725ef09168e1a0@syzkaller.appspotmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20210902094414.558914045@linutronix.de
This commit is contained in:
parent
a974b54036
commit
4f07ec0d76
@ -1454,8 +1454,23 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
|
||||
newval |= FUTEX_WAITERS;
|
||||
|
||||
ret = lock_pi_update_atomic(uaddr, uval, newval);
|
||||
/* If the take over worked, return 1 */
|
||||
return ret < 0 ? ret : 1;
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* If the waiter bit was requested the caller also needs PI
|
||||
* state attached to the new owner of the user space futex.
|
||||
*
|
||||
* @task is guaranteed to be alive and it cannot be exiting
|
||||
* because it is either sleeping or waiting in
|
||||
* futex_requeue_pi_wakeup_sync().
|
||||
*/
|
||||
if (set_waiters) {
|
||||
ret = attach_to_pi_owner(uaddr, newval, key, ps,
|
||||
exiting);
|
||||
WARN_ON(ret);
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2036,17 +2051,24 @@ futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
|
||||
return -EAGAIN;
|
||||
|
||||
/*
|
||||
* Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
|
||||
* the contended case or if set_waiters is 1. The pi_state is returned
|
||||
* in ps in contended cases.
|
||||
* Try to take the lock for top_waiter and set the FUTEX_WAITERS bit
|
||||
* in the contended case or if @set_waiters is true.
|
||||
*
|
||||
* In the contended case PI state is attached to the lock owner. If
|
||||
* the user space lock can be acquired then PI state is attached to
|
||||
* the new owner (@top_waiter->task) when @set_waiters is true.
|
||||
*/
|
||||
vpid = task_pid_vnr(top_waiter->task);
|
||||
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
|
||||
exiting, set_waiters);
|
||||
if (ret == 1) {
|
||||
/* Dequeue, wake up and update top_waiter::requeue_state */
|
||||
/*
|
||||
* Lock was acquired in user space and PI state was
|
||||
* attached to @top_waiter->task. That means state is fully
|
||||
* consistent and the waiter can return to user space
|
||||
* immediately after the wakeup.
|
||||
*/
|
||||
requeue_pi_wake_futex(top_waiter, key2, hb2);
|
||||
return vpid;
|
||||
} else if (ret < 0) {
|
||||
/* Rewind top_waiter::requeue_state */
|
||||
futex_requeue_pi_complete(top_waiter, ret);
|
||||
@ -2208,19 +2230,26 @@ retry_private:
|
||||
&exiting, nr_requeue);
|
||||
|
||||
/*
|
||||
* At this point the top_waiter has either taken uaddr2 or is
|
||||
* waiting on it. If the former, then the pi_state will not
|
||||
* exist yet, look it up one more time to ensure we have a
|
||||
* reference to it. If the lock was taken, @ret contains the
|
||||
* VPID of the top waiter task.
|
||||
* If the lock was not taken, we have pi_state and an initial
|
||||
* refcount on it. In case of an error we have nothing.
|
||||
* At this point the top_waiter has either taken uaddr2 or
|
||||
* is waiting on it. In both cases pi_state has been
|
||||
* established and an initial refcount on it. In case of an
|
||||
* error there's nothing.
|
||||
*
|
||||
* The top waiter's requeue_state is up to date:
|
||||
*
|
||||
* - If the lock was acquired atomically (ret > 0), then
|
||||
* - If the lock was acquired atomically (ret == 1), then
|
||||
* the state is Q_REQUEUE_PI_LOCKED.
|
||||
*
|
||||
* The top waiter has been dequeued and woken up and can
|
||||
* return to user space immediately. The kernel/user
|
||||
* space state is consistent. In case that there must be
|
||||
* more waiters requeued the WAITERS bit in the user
|
||||
* space futex is set so the top waiter task has to go
|
||||
* into the syscall slowpath to unlock the futex. This
|
||||
* will block until this requeue operation has been
|
||||
* completed and the hash bucket locks have been
|
||||
* dropped.
|
||||
*
|
||||
* - If the trylock failed with an error (ret < 0) then
|
||||
* the state is either Q_REQUEUE_PI_NONE, i.e. "nothing
|
||||
* happened", or Q_REQUEUE_PI_IGNORE when there was an
|
||||
@ -2234,36 +2263,20 @@ retry_private:
|
||||
* the same sanity checks for requeue_pi as the loop
|
||||
* below does.
|
||||
*/
|
||||
if (ret > 0) {
|
||||
WARN_ON(pi_state);
|
||||
task_count++;
|
||||
/*
|
||||
* If futex_proxy_trylock_atomic() acquired the
|
||||
* user space futex, then the user space value
|
||||
* @uaddr2 has been set to the @hb1's top waiter
|
||||
* task VPID. This task is guaranteed to be alive
|
||||
* and cannot be exiting because it is either
|
||||
* sleeping or blocked on @hb2 lock.
|
||||
*
|
||||
* The @uaddr2 futex cannot have waiters either as
|
||||
* otherwise futex_proxy_trylock_atomic() would not
|
||||
* have succeeded.
|
||||
*
|
||||
* In order to requeue waiters to @hb2, pi state is
|
||||
* required. Hand in the VPID value (@ret) and
|
||||
* allocate PI state with an initial refcount on
|
||||
* it.
|
||||
*/
|
||||
ret = attach_to_pi_owner(uaddr2, ret, &key2, &pi_state,
|
||||
&exiting);
|
||||
WARN_ON(ret);
|
||||
}
|
||||
|
||||
switch (ret) {
|
||||
case 0:
|
||||
/* We hold a reference on the pi state. */
|
||||
break;
|
||||
|
||||
case 1:
|
||||
/*
|
||||
* futex_proxy_trylock_atomic() acquired the user space
|
||||
* futex. Adjust task_count.
|
||||
*/
|
||||
task_count++;
|
||||
ret = 0;
|
||||
break;
|
||||
|
||||
/*
|
||||
* If the above failed, then pi_state is NULL and
|
||||
* waiter::requeue_state is correct.
|
||||
@ -2395,9 +2408,8 @@ retry_private:
|
||||
}
|
||||
|
||||
/*
|
||||
* We took an extra initial reference to the pi_state either in
|
||||
* futex_proxy_trylock_atomic() or in attach_to_pi_owner(). We need
|
||||
* to drop it here again.
|
||||
* We took an extra initial reference to the pi_state in
|
||||
* futex_proxy_trylock_atomic(). We need to drop it here again.
|
||||
*/
|
||||
put_pi_state(pi_state);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user