UBUNTU: SAUCE: cgroup: Use a new super block when mounting in a cgroup namespace

BugLink: http://bugs.launchpad.net/bugs/1566505

Currently a new mount of an existing hierarchy always reuses the original super block, even when the new mount is in a cgroup namespace. This sometimes conflicts with the user namespace mount support, which requires a new mount of an existing super block to be in the same user namespace as the original mount. When mounting from non-init cgroup and user namespaces, sget() will fail.

To fix this we can pass a pointer to the cgroup ns to kernfs when mounting, causing kernfs_test_super() to no longer match super blocks from different cgroup namespaces.

However, we do wish to continue sharing the cgroup_root between mounts of the same hierarchy. The cgroup_root's lifetime is governed by the reference count of its cgrp member, but this is a percpu reference count and is not well suited to this new situation. Instead, a new reference count is added to the cgroup_root structure to track the number of super blocks sharing that root, and this refcnt is used to determine when to put the cgroup reference.

Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: Tim Gardner <tim.gardner@canonical.com>

UBUNTU: SAUCE: cgroup: Use a new super block when mounting in a cgroup namespace
BugLink: http://bugs.launchpad.net/bugs/1566505

Currently a new mount of an existing hierarchy always reuses the original super block, even when the new mount is in a cgroup namespace. This sometimes conflicts with the user namespace mount support, which requires a new mount of an existing super block to be in the same user namespace as the original mount. When mounting from non-init cgroup and user namespaces, sget() will fail.

To fix this we can pass a pointer to the cgroup ns to kernfs when mounting, causing kernfs_test_super() to no longer match super blocks from different cgroup namespaces.

However, we do wish to continue sharing the cgroup_root between mounts of the same hierarchy. The cgroup_root's lifetime is governed by the reference count of its cgrp member, but this is a percpu reference count and is not well suited to this new situation. Instead, a new reference count is added to the cgroup_root structure to track the number of super blocks sharing that root, and this refcnt is used to determine when to put the cgroup reference.

Signed-off-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: Tim Gardner <tim.gardner@canonical.com>
794fbce4 · Seth Forshee · Tim Gardner · 6d6a1681 · 794fbce4 · 794fbce4
Commit 794fbce4 authored Apr 04, 2016 by Seth Forshee Committed by Tim Gardner Apr 06, 2016
Hide whitespace changes
Inline Side-by-side

Showing with 34 additions and 39 deletions

include/linux/cgroup-defs.h include/linux/cgroup-defs.h +4 -0

kernel/cgroup.c kernel/cgroup.c +30 -39

No files found.
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -16,6 +16,7 @@
 #include <linux/percpu-refcount.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/workqueue.h>
+#include <linux/kref.h>

 #ifdef CONFIG_CGROUPS

@@ -301,6 +302,9 @@ struct cgroup {
 struct cgroup_root {
 	struct kernfs_root *kf_root;

+	/* Reference count for superblocks sharing this cgroup_root */
+	struct kref kref;
+
 	/* The bitmask of subsystems attached to this hierarchy */
 	unsigned int subsys_mask;


--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1940,6 +1940,7 @@ static void init_cgroup_root(struct cgroup_root *root,
 {
 	struct cgroup *cgrp = &root->cgrp;

+	kref_init(&root->kref);
 	INIT_LIST_HEAD(&root->root_list);
 	atomic_set(&root->nr_cgrps, 1);
 	cgrp->root = root;
@@ -2044,11 +2045,28 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
 	return ret;
 }

+static void cgroup_release_root(struct kref *kref)
+{
+	struct cgroup_root *root = container_of(kref, struct cgroup_root, kref);
+
+	/*
+	 * If @root doesn't have any mounts or children, start killing it.
+	 * This prevents new mounts by disabling percpu_ref_tryget_live().
+	 * cgroup_mount() may wait for @root's release.
+	 *
+	 * And don't kill the default root.
+	 */
+	if (!list_empty(&root->cgrp.self.children) ||
+	    root == &cgrp_dfl_root)
+		cgroup_put(&root->cgrp);
+	else
+		percpu_ref_kill(&root->cgrp.self.refcnt);
+}
+
 static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 			 int flags, const char *unused_dev_name,
 			 void *data)
 {
-	struct super_block *pinned_sb = NULL;
 	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
 	struct cgroup_subsys *ss;
 	struct cgroup_root *root;
@@ -2144,22 +2162,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,

 		/*
 		 * We want to reuse @root whose lifetime is governed by its
-		 * ->cgrp.  Let's check whether @root is alive and keep it
-		 * that way.  As cgroup_kill_sb() can happen anytime, we
-		 * want to block it by pinning the sb so that @root doesn't
-		 * get killed before mount is complete.
-		 *
-		 * With the sb pinned, tryget_live can reliably indicate
-		 * whether @root can be reused.  If it's being killed,
-		 * drain it.  We can use wait_queue for the wait but this
-		 * path is super cold.  Let's just sleep a bit and retry.
+		 * refcnt.  If the refcnt is already zero then it's too late;
+		 * sleep a bit and retry.  Otherwise we get a reference and
+		 * can reuse the root.
 		 */
-		pinned_sb = kernfs_pin_sb(root->kf_root, NULL);
-		if (IS_ERR(pinned_sb) ||
-		    !percpu_ref_tryget_live(&root->cgrp.self.refcnt)) {
+		if (!kref_get_unless_zero(&root->kref)) {
 			mutex_unlock(&cgroup_mutex);
-			if (!IS_ERR_OR_NULL(pinned_sb))
-				deactivate_super(pinned_sb);
 			msleep(10);
 			ret = restart_syscall();
 			goto out_free;
@@ -2212,8 +2220,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		return ERR_PTR(ret);
 	}

-	dentry = kernfs_mount(fs_type, flags, root->kf_root,
-			      CGROUP_SUPER_MAGIC, &new_sb);
+	dentry = kernfs_mount_ns(fs_type, flags, root->kf_root,
+				 CGROUP_SUPER_MAGIC, &new_sb, ns);

 	/*
 	 * In non-init cgroup namespace, instead of root cgroup's
@@ -2237,17 +2245,12 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 		dentry = nsdentry;
 	}

-	if (IS_ERR(dentry) || !new_sb)
-		cgroup_put(&root->cgrp);
-
 	/*
-	 * If @pinned_sb, we're reusing an existing root and holding an
-	 * extra ref on its sb.  Mount is complete.  Put the extra ref.
+	 * On failure put the cgroup_root. If this is the last reference
+	 * cgroup_release_root will put the cgroup.
 	 */
-	if (pinned_sb) {
-		WARN_ON(new_sb);
-		deactivate_super(pinned_sb);
-	}
+	if (IS_ERR(dentry))
+		kref_put(&root->kref, cgroup_release_root);

 	put_cgroup_ns(ns);
 	return dentry;
@@ -2258,19 +2261,7 @@ static void cgroup_kill_sb(struct super_block *sb)
 	struct kernfs_root *kf_root = kernfs_root_from_sb(sb);
 	struct cgroup_root *root = cgroup_root_from_kf(kf_root);

-	/*
-	 * If @root doesn't have any mounts or children, start killing it.
-	 * This prevents new mounts by disabling percpu_ref_tryget_live().
-	 * cgroup_mount() may wait for @root's release.
-	 *
-	 * And don't kill the default root.
-	 */
-	if (!list_empty(&root->cgrp.self.children) ||
-	    root == &cgrp_dfl_root)
-		cgroup_put(&root->cgrp);
-	else
-		percpu_ref_kill(&root->cgrp.self.refcnt);
-
+	kref_put(&root->kref, cgroup_release_root);
 	kernfs_kill_sb(sb);
 }