Proc mount option handling is broken, and it has been since I

accidentally broke it in the middle 2016. The problem is that because we perform an internal mount of proc before user space mounts proc all of the mount options that user specifies when mounting proc are ignored. You can set those mount options with a remount but that is rather surprising. This most directly affects android which is using hidpid=2 by default. Now that the sysctl system call support has been removed, and we have settled on way of flushing proc dentries when a process exits without using proc_mnt, there is an simple and easy fix. a) Give UML mconsole it's own private mount of proc to use. b) Stop creating the internal mount of proc We still need Alexey Gladkov's full patch to get proc mount options to work inside of UML, and to be generally useful. This set of changes is just enough to get them working as well as they have in the past. If anyone sees any problem with this code please let me know. Otherwise I plan to merge these set of fixes through my tree. Link: https://lore.kernel.org/lkml/87r21tuulj.fsf@x220.int.ebiederm.org/ Link: https://lore.kernel.org/lkml/871rqk2brn.fsf_-_@x220.int.ebiederm.org/ Link: https://lore.kernel.org/lkml/20200210150519.538333-1-gladkov.alexey@gmail.com/ Link: https://lore.kernel.org/lkml/20180611195744.154962-1-astrachan@google.com/ Fixes: e94591d0 ("proc: Convert proc_mount to use mount_ns.") Eric W. Biederman (4): uml: Don't consult current to find the proc_mnt in mconsole_proc uml: Create a private mount of proc for mconsole proc: Remove the now unnecessary internal mount of proc pid: Improve the comment about waiting in zap_pid_ns_processes arch/um/drivers/mconsole_kern.c | 28 +++++++++++++++++++++++++++- fs/proc/root.c | 36 ------------------------------------ include/linux/pid_namespace.h | 2 -- include/linux/proc_ns.h | 5 ----- kernel/pid.c | 8 -------- kernel/pid_namespace.c | 38 +++++++++++++++++++------------------- 6 files changed, 46 insertions(+), 71 deletions(-)

Proc mount option handling is broken, and it has been since I
accidentally broke it in the middle 2016. The problem is that because we perform an internal mount of proc before user space mounts proc all of the mount options that user specifies when mounting proc are ignored. You can set those mount options with a remount but that is rather surprising. This most directly affects android which is using hidpid=2 by default. Now that the sysctl system call support has been removed, and we have settled on way of flushing proc dentries when a process exits without using proc_mnt, there is an simple and easy fix. a) Give UML mconsole it's own private mount of proc to use. b) Stop creating the internal mount of proc We still need Alexey Gladkov's full patch to get proc mount options to work inside of UML, and to be generally useful. This set of changes is just enough to get them working as well as they have in the past. If anyone sees any problem with this code please let me know. Otherwise I plan to merge these set of fixes through my tree. Link: https://lore.kernel.org/lkml/87r21tuulj.fsf@x220.int.ebiederm.org/ Link: https://lore.kernel.org/lkml/871rqk2brn.fsf_-_@x220.int.ebiederm.org/ Link: https://lore.kernel.org/lkml/20200210150519.538333-1-gladkov.alexey@gmail.com/ Link: https://lore.kernel.org/lkml/20180611195744.154962-1-astrachan@google.com/ Fixes: e94591d0 ("proc: Convert proc_mount to use mount_ns.") Eric W. Biederman (4): uml: Don't consult current to find the proc_mnt in mconsole_proc uml: Create a private mount of proc for mconsole proc: Remove the now unnecessary internal mount of proc pid: Improve the comment about waiting in zap_pid_ns_processes arch/um/drivers/mconsole_kern.c | 28 +++++++++++++++++++++++++++- fs/proc/root.c | 36 ------------------------------------ include/linux/pid_namespace.h | 2 -- include/linux/proc_ns.h | 5 ----- kernel/pid.c | 8 -------- kernel/pid_namespace.c | 38 +++++++++++++++++++------------------- 6 files changed, 46 insertions(+), 71 deletions(-)
a0d4a141 · Eric W. Biederman · a13ae697 · af9fe6d6 · a0d4a141 · a0d4a141
Commit a0d4a141 authored Mar 02, 2020 by Eric W. Biederman
6 changed files
--- a/arch/um/drivers/mconsole_kern.c
+++ b/arch/um/drivers/mconsole_kern.c
@@ -36,6 +36,8 @@
 #include "mconsole_kern.h"
 #include <os.h>

+static struct vfsmount *proc_mnt = NULL;
+
 static int do_unlink_socket(struct notifier_block *notifier,
 			    unsigned long what, void *data)
 {
@@ -123,7 +125,7 @@ void mconsole_log(struct mc_request *req)

 void mconsole_proc(struct mc_request *req)
 {
-	struct vfsmount *mnt = task_active_pid_ns(current)->proc_mnt;
+	struct vfsmount *mnt = proc_mnt;
 	char *buf;
 	int len;
 	struct file *file;
@@ -134,6 +136,10 @@ void mconsole_proc(struct mc_request *req)
 	ptr += strlen("proc");
 	ptr = skip_spaces(ptr);

+	if (!mnt) {
+		mconsole_reply(req, "Proc not available", 1, 0);
+		goto out;
+	}
 	file = file_open_root(mnt->mnt_root, mnt, ptr, O_RDONLY, 0);
 	if (IS_ERR(file)) {
 		mconsole_reply(req, "Failed to open file", 1, 0);
@@ -683,6 +689,24 @@ void mconsole_stack(struct mc_request *req)
 	with_console(req, stack_proc, to);
 }

+static int __init mount_proc(void)
+{
+	struct file_system_type *proc_fs_type;
+	struct vfsmount *mnt;
+
+	proc_fs_type = get_fs_type("proc");
+	if (!proc_fs_type)
+		return -ENODEV;
+
+	mnt = kern_mount(proc_fs_type);
+	put_filesystem(proc_fs_type);
+	if (IS_ERR(mnt))
+		return PTR_ERR(mnt);
+
+	proc_mnt = mnt;
+	return 0;
+}
+
 /*
 * Changed by mconsole_setup, which is __setup, and called before SMP is
 * active.
@@ -696,6 +720,8 @@ static int __init mconsole_init(void)
 	int err;
 	char file[UNIX_PATH_MAX];

+	mount_proc();
+
 	if (umid_file_name("mconsole", file, sizeof(file)))
 		return -1;
 	snprintf(mconsole_socket_name, sizeof(file), "%s", file);

--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -292,39 +292,3 @@ struct proc_dir_entry proc_root = {
 	.subdir		= RB_ROOT,
 	.name		= "/proc",
 };
-
-int pid_ns_prepare_proc(struct pid_namespace *ns)
-{
-	struct proc_fs_context *ctx;
-	struct fs_context *fc;
-	struct vfsmount *mnt;
-
-	fc = fs_context_for_mount(&proc_fs_type, SB_KERNMOUNT);
-	if (IS_ERR(fc))
-		return PTR_ERR(fc);
-
-	if (fc->user_ns != ns->user_ns) {
-		put_user_ns(fc->user_ns);
-		fc->user_ns = get_user_ns(ns->user_ns);
-	}
-
-	ctx = fc->fs_private;
-	if (ctx->pid_ns != ns) {
-		put_pid_ns(ctx->pid_ns);
-		get_pid_ns(ns);
-		ctx->pid_ns = ns;
-	}
-
-	mnt = fc_mount(fc);
-	put_fs_context(fc);
-	if (IS_ERR(mnt))
-		return PTR_ERR(mnt);
-
-	ns->proc_mnt = mnt;
-	return 0;
-}
-
-void pid_ns_release_proc(struct pid_namespace *ns)
-{
-	kern_unmount(ns->proc_mnt);
-}
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -33,7 +33,6 @@ struct pid_namespace {
 	unsigned int level;
 	struct pid_namespace *parent;
 #ifdef CONFIG_PROC_FS
-	struct vfsmount *proc_mnt;
 	struct dentry *proc_self;
 	struct dentry *proc_thread_self;
 #endif
@@ -42,7 +41,6 @@ struct pid_namespace {
 #endif
 	struct user_namespace *user_ns;
 	struct ucounts *ucounts;
-	struct work_struct proc_work;
 	kgid_t pid_gid;
 	int hide_pid;
 	int reboot;	/* group exit code if this pidns was rebooted */

--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -50,16 +50,11 @@ enum {

 #ifdef CONFIG_PROC_FS

-extern int pid_ns_prepare_proc(struct pid_namespace *ns);
-extern void pid_ns_release_proc(struct pid_namespace *ns);
 extern int proc_alloc_inum(unsigned int *pino);
 extern void proc_free_inum(unsigned int inum);

 #else /* CONFIG_PROC_FS */

-static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; }
-static inline void pid_ns_release_proc(struct pid_namespace *ns) {}
-
 static inline int proc_alloc_inum(unsigned int *inum)
 {
 	*inum = 1;

--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -144,9 +144,6 @@ void free_pid(struct pid *pid)
 			/* Handle a fork failure of the first process */
 			WARN_ON(ns->child_reaper);
 			ns->pid_allocated = 0;
-			/* fall through */
-		case 0:
-			schedule_work(&ns->proc_work);
 			break;
 		}

@@ -247,11 +244,6 @@ struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
 		tmp = tmp->parent;
 	}

-	if (unlikely(is_child_reaper(pid))) {
-		if (pid_ns_prepare_proc(ns))
-			goto out_free;
-	}
-
 	get_pid_ns(ns);
 	refcount_set(&pid->count, 1);
 	for (type = 0; type < PIDTYPE_MAX; ++type)

--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -57,12 +57,6 @@ static struct kmem_cache *create_pid_cachep(unsigned int level)
 	return READ_ONCE(*pkc);
 }

-static void proc_cleanup_work(struct work_struct *work)
-{
-	struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
-	pid_ns_release_proc(ns);
-}
-
 static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
 {
 	return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
@@ -114,7 +108,6 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	ns->user_ns = get_user_ns(user_ns);
 	ns->ucounts = ucounts;
 	ns->pid_allocated = PIDNS_ADDING;
-	INIT_WORK(&ns->proc_work, proc_cleanup_work);

 	return ns;

@@ -231,20 +224,27 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	} while (rc != -ECHILD);

 	/*
-	 * kernel_wait4() above can't reap the EXIT_DEAD children but we do not
-	 * really care, we could reparent them to the global init. We could
-	 * exit and reap ->child_reaper even if it is not the last thread in
-	 * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(),
-	 * pid_ns can not go away until proc_kill_sb() drops the reference.
+	 * kernel_wait4() misses EXIT_DEAD children, and EXIT_ZOMBIE
+	 * process whose parents processes are outside of the pid
+	 * namespace.  Such processes are created with setns()+fork().
+	 *
+	 * If those EXIT_ZOMBIE processes are not reaped by their
+	 * parents before their parents exit, they will be reparented
+	 * to pid_ns->child_reaper.  Thus pidns->child_reaper needs to
+	 * stay valid until they all go away.
+	 *
+	 * The code relies on the the pid_ns->child_reaper ignoring
+	 * SIGCHILD to cause those EXIT_ZOMBIE processes to be
+	 * autoreaped if reparented.
 	 *
-	 * But this ns can also have other tasks injected by setns()+fork().
-	 * Again, ignoring the user visible semantics we do not really need
-	 * to wait until they are all reaped, but they can be reparented to
-	 * us and thus we need to ensure that pid->child_reaper stays valid
-	 * until they all go away. See free_pid()->wake_up_process().
+	 * Semantically it is also desirable to wait for EXIT_ZOMBIE
+	 * processes before allowing the child_reaper to be reaped, as
+	 * that gives the invariant that when the init process of a
+	 * pid namespace is reaped all of the processes in the pid
+	 * namespace are gone.
 	 *
-	 * We rely on ignored SIGCHLD, an injected zombie must be autoreaped
-	 * if reparented.
+	 * Once all of the other tasks are gone from the pid_namespace
+	 * free_pid() will awaken this task.
 	 */
 	for (;;) {
 		set_current_state(TASK_INTERRUPTIBLE);