diff --git a/arch/um/drivers/mconsole_kern.c b/arch/um/drivers/mconsole_kern.c index b80a1d616e4e..30575bd92975 100644 --- a/arch/um/drivers/mconsole_kern.c +++ b/arch/um/drivers/mconsole_kern.c @@ -36,6 +36,8 @@ #include "mconsole_kern.h" #include +static struct vfsmount *proc_mnt = NULL; + static int do_unlink_socket(struct notifier_block *notifier, unsigned long what, void *data) { @@ -123,7 +125,7 @@ void mconsole_log(struct mc_request *req) void mconsole_proc(struct mc_request *req) { - struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt; + struct vfsmount *mnt = proc_mnt; char *buf; int len; struct file *file; @@ -134,6 +136,10 @@ void mconsole_proc(struct mc_request *req) ptr += strlen("proc"); ptr = skip_spaces(ptr); + if (!mnt) { + mconsole_reply(req, "Proc not available", 1, 0); + goto out; + } file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY, 0); if (IS_ERR(file)) { mconsole_reply(req, "Failed to open file", 1, 0); @@ -683,6 +689,24 @@ void mconsole_stack(struct mc_request *req) with_console(req, stack_proc, to); } +static int __init mount_proc(void) +{ + struct file_system_type *proc_fs_type; + struct vfsmount *mnt; + + proc_fs_type = get_fs_type("proc"); + if (!proc_fs_type) + return -ENODEV; + + mnt = kern_mount(proc_fs_type); + put_filesystem(proc_fs_type); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + proc_mnt = mnt; + return 0; +} + /* * Changed by mconsole_setup, which is __setup, and called before SMP is * active. @@ -696,6 +720,8 @@ static int __init mconsole_init(void) int err; char file[UNIX_PATH_MAX]; + mount_proc(); + if (umid_file_name("mconsole", file, sizeof(file))) return -1; snprintf(mconsole_socket_name, sizeof(file), "%s", file); diff --git a/fs/proc/root.c b/fs/proc/root.c index 608233dfd29c..2633f10446c3 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -292,39 +292,3 @@ struct proc_dir_entry proc_root = { .subdir = RB_ROOT, .name = "/proc", }; - -int pid_ns_prepare_proc(struct pid_namespace *ns) -{ - struct proc_fs_context *ctx; - struct fs_context *fc; - struct vfsmount *mnt; - - fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT); - if (IS_ERR(fc)) - return PTR_ERR(fc); - - if (fc->user_ns != ns->user_ns) { - put_user_ns(fc->user_ns); - fc->user_ns = get_user_ns(ns->user_ns); - } - - ctx = fc->fs_private; - if (ctx->pid_ns != ns) { - put_pid_ns(ctx->pid_ns); - get_pid_ns(ns); - ctx->pid_ns = ns; - } - - mnt = fc_mount(fc); - put_fs_context(fc); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); - - ns->proc_mnt = mnt; - return 0; -} - -void pid_ns_release_proc(struct pid_namespace *ns) -{ - kern_unmount(ns->proc_mnt); -} diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 2ed6af88794b..4956e362e55e 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -33,7 +33,6 @@ struct pid_namespace { unsigned int level; struct pid_namespace *parent; #ifdef CONFIG_PROC_FS - struct vfsmount *proc_mnt; struct dentry *proc_self; struct dentry *proc_thread_self; #endif @@ -42,7 +41,6 @@ struct pid_namespace { #endif struct user_namespace *user_ns; struct ucounts *ucounts; - struct work_struct proc_work; kgid_t pid_gid; int hide_pid; int reboot; /* group exit code if this pidns was rebooted */ diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 4626b1ac3b6c..e1106a077c1a 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -50,16 +50,11 @@ enum { #ifdef CONFIG_PROC_FS -extern int pid_ns_prepare_proc(struct pid_namespace *ns); -extern void pid_ns_release_proc(struct pid_namespace *ns); extern int proc_alloc_inum(unsigned int *pino); extern void proc_free_inum(unsigned int inum); #else /* CONFIG_PROC_FS */ -static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; } -static inline void pid_ns_release_proc(struct pid_namespace *ns) {} - static inline int proc_alloc_inum(unsigned int *inum) { *inum = 1; diff --git a/kernel/pid.c b/kernel/pid.c index ca08d6a3aa77..60820e72634c 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -144,9 +144,6 @@ void free_pid(struct pid *pid) /* Handle a fork failure of the first process */ WARN_ON(ns->child_reaper); ns->pid_allocated = 0; - /* fall through */ - case 0: - schedule_work(&ns->proc_work); break; } @@ -247,11 +244,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, tmp = tmp->parent; } - if (unlikely(is_child_reaper(pid))) { - if (pid_ns_prepare_proc(ns)) - goto out_free; - } - get_pid_ns(ns); refcount_set(&pid->count, 1); for (type = 0; type < PIDTYPE_MAX; ++type) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index d40017e79ebe..01f8ba32cc0c 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -57,12 +57,6 @@ static struct kmem_cache *create_pid_cachep(unsigned int level) return READ_ONCE(*pkc); } -static void proc_cleanup_work(struct work_struct *work) -{ - struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work); - pid_ns_release_proc(ns); -} - static struct ucounts *inc_pid_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES); @@ -114,7 +108,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns ns->user_ns = get_user_ns(user_ns); ns->ucounts = ucounts; ns->pid_allocated = PIDNS_ADDING; - INIT_WORK(&ns->proc_work, proc_cleanup_work); return ns; @@ -231,20 +224,27 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) } while (rc != -ECHILD); /* - * kernel_wait4() above can't reap the EXIT_DEAD children but we do not - * really care, we could reparent them to the global init. We could - * exit and reap ->child_reaper even if it is not the last thread in - * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(), - * pid_ns can not go away until proc_kill_sb() drops the reference. + * kernel_wait4() misses EXIT_DEAD children, and EXIT_ZOMBIE + * process whose parents processes are outside of the pid + * namespace. Such processes are created with setns()+fork(). * - * But this ns can also have other tasks injected by setns()+fork(). - * Again, ignoring the user visible semantics we do not really need - * to wait until they are all reaped, but they can be reparented to - * us and thus we need to ensure that pid->child_reaper stays valid - * until they all go away. See free_pid()->wake_up_process(). + * If those EXIT_ZOMBIE processes are not reaped by their + * parents before their parents exit, they will be reparented + * to pid_ns->child_reaper. Thus pidns->child_reaper needs to + * stay valid until they all go away. * - * We rely on ignored SIGCHLD, an injected zombie must be autoreaped - * if reparented. + * The code relies on the the pid_ns->child_reaper ignoring + * SIGCHILD to cause those EXIT_ZOMBIE processes to be + * autoreaped if reparented. + * + * Semantically it is also desirable to wait for EXIT_ZOMBIE + * processes before allowing the child_reaper to be reaped, as + * that gives the invariant that when the init process of a + * pid namespace is reaped all of the processes in the pid + * namespace are gone. + * + * Once all of the other tasks are gone from the pid_namespace + * free_pid() will awaken this task. */ for (;;) { set_current_state(TASK_INTERRUPTIBLE);