From 2ca7be7d55ad84fb0f7c2a23fb700a28fd76b19a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 25 Mar 2020 10:00:07 -0500 Subject: [PATCH 01/14] exec: Only compute current once in flush_old_exec Make it clear that current only needs to be computed once in flush_old_exec. This may have some efficiency improvements and it makes the code easier to change. Signed-off-by: "Eric W. Biederman" Reviewed-by: Bernd Edlinger Reviewed-by: Kees Cook Acked-by: Christian Brauner Reviewed-by: Kirill Tkhai Signed-off-by: Bernd Edlinger Signed-off-by: Eric W. Biederman --- fs/exec.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index db17be51b112..c3f34791f2f0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1260,13 +1260,14 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) */ int flush_old_exec(struct linux_binprm * bprm) { + struct task_struct *me = current; int retval; /* * Make sure we have a private signal table and that * we are unassociated from the previous thread group. */ - retval = de_thread(current); + retval = de_thread(me); if (retval) goto out; @@ -1294,10 +1295,10 @@ int flush_old_exec(struct linux_binprm * bprm) bprm->mm = NULL; set_fs(USER_DS); - current->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | + me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE | PF_NO_SETAFFINITY); flush_thread(); - current->personality &= ~bprm->per_clear; + me->personality &= ~bprm->per_clear; /* * We have to apply CLOEXEC before we change whether the process is @@ -1305,7 +1306,7 @@ int flush_old_exec(struct linux_binprm * bprm) * trying to access the should-be-closed file descriptors of a process * undergoing exec(2). */ - do_close_on_exec(current->files); + do_close_on_exec(me->files); return 0; out: From 021691559245498dfa15454c9fc4351f367d0b7f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 25 Mar 2020 10:00:21 -0500 Subject: [PATCH 02/14] exec: Factor unshare_sighand out of de_thread and call it separately This makes the code clearer and makes it easier to implement a mutex that is not taken over any locations that may block indefinitely waiting for userspace. Signed-off-by: "Eric W. Biederman" Reviewed-by: Bernd Edlinger Reviewed-by: Kees Cook Acked-by: Christian Brauner Reviewed-by: Kirill Tkhai Signed-off-by: Bernd Edlinger Signed-off-by: Eric W. Biederman --- fs/exec.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index c3f34791f2f0..ff74b9a74d34 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1194,6 +1194,23 @@ no_thread_group: flush_itimer_signals(); #endif + BUG_ON(!thread_group_leader(tsk)); + return 0; + +killed: + /* protects against exit_notify() and __exit_signal() */ + read_lock(&tasklist_lock); + sig->group_exit_task = NULL; + sig->notify_count = 0; + read_unlock(&tasklist_lock); + return -EAGAIN; +} + + +static int unshare_sighand(struct task_struct *me) +{ + struct sighand_struct *oldsighand = me->sighand; + if (refcount_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; /* @@ -1210,23 +1227,13 @@ no_thread_group: write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); - rcu_assign_pointer(tsk->sighand, newsighand); + rcu_assign_pointer(me->sighand, newsighand); spin_unlock(&oldsighand->siglock); write_unlock_irq(&tasklist_lock); __cleanup_sighand(oldsighand); } - - BUG_ON(!thread_group_leader(tsk)); return 0; - -killed: - /* protects against exit_notify() and __exit_signal() */ - read_lock(&tasklist_lock); - sig->group_exit_task = NULL; - sig->notify_count = 0; - read_unlock(&tasklist_lock); - return -EAGAIN; } char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) @@ -1264,13 +1271,19 @@ int flush_old_exec(struct linux_binprm * bprm) int retval; /* - * Make sure we have a private signal table and that - * we are unassociated from the previous thread group. + * Make this the only thread in the thread group. */ retval = de_thread(me); if (retval) goto out; + /* + * Make the signal table private. + */ + retval = unshare_sighand(me); + if (retval) + goto out; + /* * Must be called _before_ exec_mmap() as bprm->mm is * not visibile until then. This also enables the update From 153ffb6ba49fd80dc607a9f230415af02b728d70 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 25 Mar 2020 10:00:38 -0500 Subject: [PATCH 03/14] exec: Move cleanup of posix timers on exec out of de_thread These functions have very little to do with de_thread move them out of de_thread an into flush_old_exec proper so it can be more clearly seen what flush_old_exec is doing. Signed-off-by: "Eric W. Biederman" Reviewed-by: Bernd Edlinger Reviewed-by: Kees Cook Acked-by: Christian Brauner Reviewed-by: Kirill Tkhai Signed-off-by: Bernd Edlinger Signed-off-by: Eric W. Biederman --- fs/exec.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index ff74b9a74d34..215d86f77b63 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1189,11 +1189,6 @@ no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; -#ifdef CONFIG_POSIX_TIMERS - exit_itimers(sig); - flush_itimer_signals(); -#endif - BUG_ON(!thread_group_leader(tsk)); return 0; @@ -1277,6 +1272,11 @@ int flush_old_exec(struct linux_binprm * bprm) if (retval) goto out; +#ifdef CONFIG_POSIX_TIMERS + exit_itimers(me->signal); + flush_itimer_signals(); +#endif + /* * Make the signal table private. */ From ccf0fa6be02687868006fa0c85af32c40cae6fb6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 25 Mar 2020 10:03:21 -0500 Subject: [PATCH 04/14] exec: Move exec_mmap right after de_thread in flush_old_exec I have read through the code in exec_mmap and I do not see anything that depends on sighand or the sighand lock, or on signals in anyway so this should be safe. This rearrangement of code has two significant benefits. It makes the determination of passing the point of no return by testing bprm->mm accurate. All failures prior to that point in flush_old_exec are either truly recoverable or they are fatal. Further this consolidates all of the possible indefinite waits for userspace together at the top of flush_old_exec. The possible wait for a ptracer on PTRACE_EVENT_EXIT, the possible wait for a page fault to be resolved in clear_child_tid, and the possible wait for a page fault in exit_robust_list. This consolidation allows the creation of a mutex to replace cred_guard_mutex that is not held over possible indefinite userspace waits. Which will allow removing deadlock scenarios from the kernel. Signed-off-by: "Eric W. Biederman" Reviewed-by: Bernd Edlinger Reviewed-by: Kees Cook Reviewed-by: Kirill Tkhai Signed-off-by: Bernd Edlinger Signed-off-by: Eric W. Biederman --- fs/exec.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 215d86f77b63..d820a7272a76 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1272,18 +1272,6 @@ int flush_old_exec(struct linux_binprm * bprm) if (retval) goto out; -#ifdef CONFIG_POSIX_TIMERS - exit_itimers(me->signal); - flush_itimer_signals(); -#endif - - /* - * Make the signal table private. - */ - retval = unshare_sighand(me); - if (retval) - goto out; - /* * Must be called _before_ exec_mmap() as bprm->mm is * not visibile until then. This also enables the update @@ -1307,6 +1295,18 @@ int flush_old_exec(struct linux_binprm * bprm) */ bprm->mm = NULL; +#ifdef CONFIG_POSIX_TIMERS + exit_itimers(me->signal); + flush_itimer_signals(); +#endif + + /* + * Make the signal table private. + */ + retval = unshare_sighand(me); + if (retval) + goto out; + set_fs(USER_DS); me->flags &= ~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE | PF_NO_SETAFFINITY); From eea9673250db4e854e9998ef9da6d4584857f0ea Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 25 Mar 2020 10:03:36 -0500 Subject: [PATCH 05/14] exec: Add exec_update_mutex to replace cred_guard_mutex The cred_guard_mutex is problematic as it is held over possibly indefinite waits for userspace. The possible indefinite waits for userspace that I have identified are: The cred_guard_mutex is held in PTRACE_EVENT_EXIT waiting for the tracer. The cred_guard_mutex is held over "put_user(0, tsk->clear_child_tid)" in exit_mm(). The cred_guard_mutex is held over "get_user(futex_offset, ...") in exit_robust_list. The cred_guard_mutex held over copy_strings. The functions get_user and put_user can trigger a page fault which can potentially wait indefinitely in the case of userfaultfd or if userspace implements part of the page fault path. In any of those cases the userspace process that the kernel is waiting for might make a different system call that winds up taking the cred_guard_mutex and result in deadlock. Holding a mutex over any of those possibly indefinite waits for userspace does not appear necessary. Add exec_update_mutex that will just cover updating the process during exec where the permissions and the objects pointed to by the task struct may be out of sync. The plan is to switch the users of cred_guard_mutex to exec_update_mutex one by one. This lets us move forward while still being careful and not introducing any regressions. Link: https://lore.kernel.org/lkml/20160921152946.GA24210@dhcp22.suse.cz/ Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@AM6PR03MB5170.eurprd03.prod.outlook.com/ Link: https://lore.kernel.org/linux-fsdevel/20161102181806.GB1112@redhat.com/ Link: https://lore.kernel.org/lkml/20160923095031.GA14923@redhat.com/ Link: https://lore.kernel.org/lkml/20170213141452.GA30203@redhat.com/ Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.") Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2") Reviewed-by: Kirill Tkhai Signed-off-by: "Eric W. Biederman" Signed-off-by: Bernd Edlinger Signed-off-by: Eric W. Biederman --- fs/exec.c | 22 +++++++++++++++++++--- include/linux/binfmts.h | 8 +++++++- include/linux/sched/signal.h | 9 ++++++++- init/init_task.c | 1 + kernel/fork.c | 1 + 5 files changed, 36 insertions(+), 5 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index d820a7272a76..0e46ec57fe0a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1010,16 +1010,26 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) } EXPORT_SYMBOL(read_code); +/* + * Maps the mm_struct mm into the current task struct. + * On success, this function returns with the mutex + * exec_update_mutex locked. + */ static int exec_mmap(struct mm_struct *mm) { struct task_struct *tsk; struct mm_struct *old_mm, *active_mm; + int ret; /* Notify parent that we're no longer interested in the old VM */ tsk = current; old_mm = current->mm; exec_mm_release(tsk, old_mm); + ret = mutex_lock_killable(&tsk->signal->exec_update_mutex); + if (ret) + return ret; + if (old_mm) { sync_mm_rss(old_mm); /* @@ -1031,9 +1041,11 @@ static int exec_mmap(struct mm_struct *mm) down_read(&old_mm->mmap_sem); if (unlikely(old_mm->core_state)) { up_read(&old_mm->mmap_sem); + mutex_unlock(&tsk->signal->exec_update_mutex); return -EINTR; } } + task_lock(tsk); active_mm = tsk->active_mm; membarrier_exec_mmap(mm); @@ -1288,11 +1300,12 @@ int flush_old_exec(struct linux_binprm * bprm) goto out; /* - * After clearing bprm->mm (to mark that current is using the - * prepared mm now), we have nothing left of the original + * After setting bprm->called_exec_mmap (to mark that current is + * using the prepared mm now), we have nothing left of the original * process. If anything from here on returns an error, the check * in search_binary_handler() will SEGV current. */ + bprm->called_exec_mmap = 1; bprm->mm = NULL; #ifdef CONFIG_POSIX_TIMERS @@ -1438,6 +1451,8 @@ static void free_bprm(struct linux_binprm *bprm) { free_arg_pages(bprm); if (bprm->cred) { + if (bprm->called_exec_mmap) + mutex_unlock(¤t->signal->exec_update_mutex); mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } @@ -1487,6 +1502,7 @@ void install_exec_creds(struct linux_binprm *bprm) * credentials; any time after this it may be unlocked. */ security_bprm_committed_creds(bprm); + mutex_unlock(¤t->signal->exec_update_mutex); mutex_unlock(¤t->signal->cred_guard_mutex); } EXPORT_SYMBOL(install_exec_creds); @@ -1678,7 +1694,7 @@ int search_binary_handler(struct linux_binprm *bprm) read_lock(&binfmt_lock); put_binfmt(fmt); - if (retval < 0 && !bprm->mm) { + if (retval < 0 && bprm->called_exec_mmap) { /* we got to flush_old_exec() and failed after it */ read_unlock(&binfmt_lock); force_sigsegv(SIGSEGV); diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index b40fc633f3be..a345d9fed3d8 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -44,7 +44,13 @@ struct linux_binprm { * exec has happened. Used to sanitize execution environment * and to set AT_SECURE auxv for glibc. */ - secureexec:1; + secureexec:1, + /* + * Set by flush_old_exec, when exec_mmap has been called. + * This is past the point of no return, when the + * exec_update_mutex has been taken. + */ + called_exec_mmap:1; #ifdef __alpha__ unsigned int taso:1; #endif diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 88050259c466..a29df79540ce 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -224,7 +224,14 @@ struct signal_struct { struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations - * (notably. ptrace) */ + * (notably. ptrace) + * Deprecated do not use in new code. + * Use exec_update_mutex instead. + */ + struct mutex exec_update_mutex; /* Held while task_struct is being + * updated during exec, and may have + * inconsistent permissions. + */ } __randomize_layout; /* diff --git a/init/init_task.c b/init/init_task.c index 9e5cbe5eab7b..bd403ed3e418 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -26,6 +26,7 @@ static struct signal_struct init_signals = { .multiprocess = HLIST_HEAD_INIT, .rlim = INIT_RLIMITS, .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), + .exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex), #ifdef CONFIG_POSIX_TIMERS .posix_timers = LIST_HEAD_INIT(init_signals.posix_timers), .cputimer = { diff --git a/kernel/fork.c b/kernel/fork.c index 60a1295f4384..12896a6ecee6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1594,6 +1594,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); + mutex_init(&sig->exec_update_mutex); return 0; } From 3e74fabd39710ee29fa25618d2c2b40cfa7d76c7 Mon Sep 17 00:00:00 2001 From: Bernd Edlinger Date: Fri, 20 Mar 2020 21:26:04 +0100 Subject: [PATCH 06/14] exec: Fix a deadlock in strace This fixes a deadlock in the tracer when tracing a multi-threaded application that calls execve while more than one thread are running. I observed that when running strace on the gcc test suite, it always blocks after a while, when expect calls execve, because other threads have to be terminated. They send ptrace events, but the strace is no longer able to respond, since it is blocked in vm_access. The deadlock is always happening when strace needs to access the tracees process mmap, while another thread in the tracee starts to execve a child process, but that cannot continue until the PTRACE_EVENT_EXIT is handled and the WIFEXITED event is received: strace D 0 30614 30584 0x00000000 Call Trace: __schedule+0x3ce/0x6e0 schedule+0x5c/0xd0 schedule_preempt_disabled+0x15/0x20 __mutex_lock.isra.13+0x1ec/0x520 __mutex_lock_killable_slowpath+0x13/0x20 mutex_lock_killable+0x28/0x30 mm_access+0x27/0xa0 process_vm_rw_core.isra.3+0xff/0x550 process_vm_rw+0xdd/0xf0 __x64_sys_process_vm_readv+0x31/0x40 do_syscall_64+0x64/0x220 entry_SYSCALL_64_after_hwframe+0x44/0xa9 expect D 0 31933 30876 0x80004003 Call Trace: __schedule+0x3ce/0x6e0 schedule+0x5c/0xd0 flush_old_exec+0xc4/0x770 load_elf_binary+0x35a/0x16c0 search_binary_handler+0x97/0x1d0 __do_execve_file.isra.40+0x5d4/0x8a0 __x64_sys_execve+0x49/0x60 do_syscall_64+0x64/0x220 entry_SYSCALL_64_after_hwframe+0x44/0xa9 This changes mm_access to use the new exec_update_mutex instead of cred_guard_mutex. This patch is based on the following patch by Eric W. Biederman: "[PATCH 0/5] Infrastructure to allow fixing exec deadlocks" Link: https://lore.kernel.org/lkml/87v9ne5y4y.fsf_-_@x220.int.ebiederm.org/ Signed-off-by: Bernd Edlinger Reviewed-by: Kees Cook Signed-off-by: Eric W. Biederman --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 12896a6ecee6..e0668a79bca1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1224,7 +1224,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) struct mm_struct *mm; int err; - err = mutex_lock_killable(&task->signal->cred_guard_mutex); + err = mutex_lock_killable(&task->signal->exec_update_mutex); if (err) return ERR_PTR(err); @@ -1234,7 +1234,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) mmput(mm); mm = ERR_PTR(-EACCES); } - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); return mm; } From 2de4e82318c7f9d34f4b08599a612cd4cd10bf0b Mon Sep 17 00:00:00 2001 From: Bernd Edlinger Date: Fri, 20 Mar 2020 21:26:19 +0100 Subject: [PATCH 07/14] selftests/ptrace: add test cases for dead-locks This adds test cases for ptrace deadlocks. Additionally fixes a compile problem in get_syscall_info.c, observed with gcc-4.8.4: get_syscall_info.c: In function 'get_syscall_info': get_syscall_info.c:93:3: error: 'for' loop initial declarations are only allowed in C99 mode for (unsigned int i = 0; i < ARRAY_SIZE(args); ++i) { ^ get_syscall_info.c:93:3: note: use option -std=c99 or -std=gnu99 to compile your code Signed-off-by: Bernd Edlinger Reviewed-by: Kees Cook Signed-off-by: Eric W. Biederman --- tools/testing/selftests/ptrace/Makefile | 4 +- tools/testing/selftests/ptrace/vmaccess.c | 86 +++++++++++++++++++++++ 2 files changed, 88 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/ptrace/vmaccess.c diff --git a/tools/testing/selftests/ptrace/Makefile b/tools/testing/selftests/ptrace/Makefile index c0b7f89f0930..2f1f532c39db 100644 --- a/tools/testing/selftests/ptrace/Makefile +++ b/tools/testing/selftests/ptrace/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -CFLAGS += -iquote../../../../include/uapi -Wall +CFLAGS += -std=c99 -pthread -iquote../../../../include/uapi -Wall -TEST_GEN_PROGS := get_syscall_info peeksiginfo +TEST_GEN_PROGS := get_syscall_info peeksiginfo vmaccess include ../lib.mk diff --git a/tools/testing/selftests/ptrace/vmaccess.c b/tools/testing/selftests/ptrace/vmaccess.c new file mode 100644 index 000000000000..4db327b44586 --- /dev/null +++ b/tools/testing/selftests/ptrace/vmaccess.c @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Copyright (c) 2020 Bernd Edlinger + * All rights reserved. + * + * Check whether /proc/$pid/mem can be accessed without causing deadlocks + * when de_thread is blocked with ->cred_guard_mutex held. + */ + +#include "../kselftest_harness.h" +#include +#include +#include +#include +#include +#include + +static void *thread(void *arg) +{ + ptrace(PTRACE_TRACEME, 0, 0L, 0L); + return NULL; +} + +TEST(vmaccess) +{ + int f, pid = fork(); + char mm[64]; + + if (!pid) { + pthread_t pt; + + pthread_create(&pt, NULL, thread, NULL); + pthread_join(pt, NULL); + execlp("true", "true", NULL); + } + + sleep(1); + sprintf(mm, "/proc/%d/mem", pid); + f = open(mm, O_RDONLY); + ASSERT_GE(f, 0); + close(f); + f = kill(pid, SIGCONT); + ASSERT_EQ(f, 0); +} + +TEST(attach) +{ + int s, k, pid = fork(); + + if (!pid) { + pthread_t pt; + + pthread_create(&pt, NULL, thread, NULL); + pthread_join(pt, NULL); + execlp("sleep", "sleep", "2", NULL); + } + + sleep(1); + k = ptrace(PTRACE_ATTACH, pid, 0L, 0L); + ASSERT_EQ(errno, EAGAIN); + ASSERT_EQ(k, -1); + k = waitpid(-1, &s, WNOHANG); + ASSERT_NE(k, -1); + ASSERT_NE(k, 0); + ASSERT_NE(k, pid); + ASSERT_EQ(WIFEXITED(s), 1); + ASSERT_EQ(WEXITSTATUS(s), 0); + sleep(1); + k = ptrace(PTRACE_ATTACH, pid, 0L, 0L); + ASSERT_EQ(k, 0); + k = waitpid(-1, &s, 0); + ASSERT_EQ(k, pid); + ASSERT_EQ(WIFSTOPPED(s), 1); + ASSERT_EQ(WSTOPSIG(s), SIGSTOP); + k = ptrace(PTRACE_DETACH, pid, 0L, 0L); + ASSERT_EQ(k, 0); + k = waitpid(-1, &s, 0); + ASSERT_EQ(k, pid); + ASSERT_EQ(WIFEXITED(s), 1); + ASSERT_EQ(WEXITSTATUS(s), 0); + k = waitpid(-1, NULL, 0); + ASSERT_EQ(k, -1); + ASSERT_EQ(errno, ECHILD); +} + +TEST_HARNESS_MAIN From 8d09db80ef404d8e2e143677e0716cfcd544e29b Mon Sep 17 00:00:00 2001 From: Bernd Edlinger Date: Fri, 20 Mar 2020 21:26:34 +0100 Subject: [PATCH 08/14] mm: docs: Fix a comment in process_vm_rw_core This removes a duplicate "a" in the comment in process_vm_rw_core. Signed-off-by: Bernd Edlinger Reviewed-by: Kees Cook Signed-off-by: Eric W. Biederman --- mm/process_vm_access.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index de41e830cdac..74e957e302fe 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -206,7 +206,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, if (!mm || IS_ERR(mm)) { rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; /* - * Explicitly map EACCES to EPERM as EPERM is a more a + * Explicitly map EACCES to EPERM as EPERM is a more * appropriate error code for process_vw_readv/writev */ if (rc == -EACCES) From aa884c11313656fee7b12972614b6333f154655c Mon Sep 17 00:00:00 2001 From: Bernd Edlinger Date: Fri, 20 Mar 2020 21:26:50 +0100 Subject: [PATCH 09/14] kernel: doc: remove outdated comment cred.c This removes an outdated comment in prepare_kernel_cred. There is no "cred_replace_mutex" any more, so the comment must go away. Signed-off-by: Bernd Edlinger Reviewed-by: Kees Cook Signed-off-by: Eric W. Biederman --- kernel/cred.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/cred.c b/kernel/cred.c index 809a985b1793..71a792616917 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -675,8 +675,6 @@ void __init cred_init(void) * The caller may change these controls afterwards if desired. * * Returns the new credentials or NULL if out of memory. - * - * Does not take, and does not return holding current->cred_replace_mutex. */ struct cred *prepare_kernel_cred(struct task_struct *daemon) { From 454e3126cb842388e22df6b3ac3da44062c00765 Mon Sep 17 00:00:00 2001 From: Bernd Edlinger Date: Fri, 20 Mar 2020 21:27:05 +0100 Subject: [PATCH 10/14] kernel/kcmp.c: Use new infrastructure to fix deadlocks in execve This changes kcmp_epoll_target to use the new exec_update_mutex instead of cred_guard_mutex. This should be safe, as the credentials are only used for reading, and furthermore ->mm and ->sighand are updated on execve, but only under the new exec_update_mutex. Signed-off-by: Bernd Edlinger Signed-off-by: Eric W. Biederman --- kernel/kcmp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/kcmp.c b/kernel/kcmp.c index a0e3d7a0e8b8..b3ff9288c6cc 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -173,8 +173,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, /* * One should have enough rights to inspect task details. */ - ret = kcmp_lock(&task1->signal->cred_guard_mutex, - &task2->signal->cred_guard_mutex); + ret = kcmp_lock(&task1->signal->exec_update_mutex, + &task2->signal->exec_update_mutex); if (ret) goto err; if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) || @@ -229,8 +229,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, } err_unlock: - kcmp_unlock(&task1->signal->cred_guard_mutex, - &task2->signal->cred_guard_mutex); + kcmp_unlock(&task1->signal->exec_update_mutex, + &task2->signal->exec_update_mutex); err: put_task_struct(task1); put_task_struct(task2); From 2db9dbf71bf98d02a0bf33e798e5bfd2a9944696 Mon Sep 17 00:00:00 2001 From: Bernd Edlinger Date: Fri, 20 Mar 2020 21:27:24 +0100 Subject: [PATCH 11/14] proc: Use new infrastructure to fix deadlocks in execve This changes lock_trace to use the new exec_update_mutex instead of cred_guard_mutex. This fixes possible deadlocks when the trace is accessing /proc/$pid/stack for instance. This should be safe, as the credentials are only used for reading, and task->mm is updated on execve under the new exec_update_mutex. Signed-off-by: Bernd Edlinger Signed-off-by: Eric W. Biederman --- fs/proc/base.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index e7efe9d6f3d6..a278fda0458b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -405,11 +405,11 @@ print0: static int lock_trace(struct task_struct *task) { - int err = mutex_lock_killable(&task->signal->cred_guard_mutex); + int err = mutex_lock_killable(&task->signal->exec_update_mutex); if (err) return err; if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); return -EPERM; } return 0; @@ -417,7 +417,7 @@ static int lock_trace(struct task_struct *task) static void unlock_trace(struct task_struct *task) { - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); } #ifdef CONFIG_STACKTRACE From 76518d3798855242817e8a8ed76b2d72f4415624 Mon Sep 17 00:00:00 2001 From: Bernd Edlinger Date: Fri, 20 Mar 2020 21:27:41 +0100 Subject: [PATCH 12/14] proc: io_accounting: Use new infrastructure to fix deadlocks in execve This changes do_io_accounting to use the new exec_update_mutex instead of cred_guard_mutex. This fixes possible deadlocks when the trace is accessing /proc/$pid/io for instance. This should be safe, as the credentials are only used for reading. Signed-off-by: Bernd Edlinger Signed-off-by: Eric W. Biederman --- fs/proc/base.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index a278fda0458b..74f948a6b621 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2883,7 +2883,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh unsigned long flags; int result; - result = mutex_lock_killable(&task->signal->cred_guard_mutex); + result = mutex_lock_killable(&task->signal->exec_update_mutex); if (result) return result; @@ -2919,7 +2919,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh result = 0; out_unlock: - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); return result; } From 6914303824bb572278568330d72fc1f8f9814e67 Mon Sep 17 00:00:00 2001 From: Bernd Edlinger Date: Fri, 20 Mar 2020 21:27:55 +0100 Subject: [PATCH 13/14] perf: Use new infrastructure to fix deadlocks in execve This changes perf_event_set_clock to use the new exec_update_mutex instead of cred_guard_mutex. This should be safe, as the credentials are only used for reading. Signed-off-by: Bernd Edlinger Signed-off-by: Eric W. Biederman --- kernel/events/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index e453589da97c..71cba8cfccbc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1249,7 +1249,7 @@ static void put_ctx(struct perf_event_context *ctx) * function. * * Lock order: - * cred_guard_mutex + * exec_update_mutex * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event::child_mutex; @@ -11263,14 +11263,14 @@ SYSCALL_DEFINE5(perf_event_open, } if (task) { - err = mutex_lock_interruptible(&task->signal->cred_guard_mutex); + err = mutex_lock_interruptible(&task->signal->exec_update_mutex); if (err) goto err_task; /* * Reuse ptrace permission checks for now. * - * We must hold cred_guard_mutex across this and any potential + * We must hold exec_update_mutex across this and any potential * perf_install_in_context() call for this new event to * serialize against exec() altering our credentials (and the * perf_event_exit_task() that could imply). @@ -11559,7 +11559,7 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(&ctx->mutex); if (task) { - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); put_task_struct(task); } @@ -11595,7 +11595,7 @@ err_alloc: free_event(event); err_cred: if (task) - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); err_task: if (task) put_task_struct(task); @@ -11900,7 +11900,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) /* * When a child task exits, feed back event values to parent events. * - * Can be called with cred_guard_mutex held when called from + * Can be called with exec_update_mutex held when called from * install_exec_creds(). */ void perf_event_exit_task(struct task_struct *child) From 501f9328bf5c6b5e4863da4b50e0e86792de3aa9 Mon Sep 17 00:00:00 2001 From: Bernd Edlinger Date: Sat, 21 Mar 2020 02:46:16 +0000 Subject: [PATCH 14/14] pidfd: Use new infrastructure to fix deadlocks in execve This changes __pidfd_fget to use the new exec_update_mutex instead of cred_guard_mutex. This should be safe, as the credentials do not change before exec_update_mutex is locked. Therefore whatever file access is possible with holding the cred_guard_mutex here is also possbile with the exec_update_mutex. Signed-off-by: Bernd Edlinger Signed-off-by: Eric W. Biederman --- kernel/pid.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/pid.c b/kernel/pid.c index 60820e72634c..efd34874b3d1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -577,7 +577,7 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd) struct file *file; int ret; - ret = mutex_lock_killable(&task->signal->cred_guard_mutex); + ret = mutex_lock_killable(&task->signal->exec_update_mutex); if (ret) return ERR_PTR(ret); @@ -586,7 +586,7 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd) else file = ERR_PTR(-EPERM); - mutex_unlock(&task->signal->cred_guard_mutex); + mutex_unlock(&task->signal->exec_update_mutex); return file ?: ERR_PTR(-EBADF); }