Commit d8836005 authored by Linus Torvalds

Merge branch 'for-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:

 - Christian extended clone3 so that processes can be spawned into
   cgroups directly.

   This is not only neat in terms of semantics but also avoids grabbing
   the global cgroup_threadgroup_rwsem for migration (see the clone3()
   sketch after this list).

 - Daniel added !root xattr support to cgroupfs.

   Userland already uses xattrs on cgroupfs for bookkeeping. This will
   allow delegated cgroups to support such usages (see the xattr sketch
   after this list).

 - Prateek tried to make cpuset hotplug handling synchronous but that
   led to possible deadlock scenarios. Reverted.

 - Other minor changes including release_agent_path handling cleanup.
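
For readers who want to try the clone3() extension, here is a minimal
userspace sketch modeled on the clone_into_cgroup() helper added to the
selftests below. The cgroup path is hypothetical, and it assumes kernel
headers new enough to carry CLONE_INTO_CGROUP and the cgroup field in
struct clone_args:

	/*
	 * Minimal sketch: spawn a child directly into an existing v2 cgroup.
	 * The path is hypothetical; error handling is abbreviated.
	 */
	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/sched.h>	/* struct clone_args, CLONE_INTO_CGROUP */
	#include <signal.h>
	#include <sys/syscall.h>
	#include <sys/wait.h>
	#include <unistd.h>

	int main(void)
	{
		int cgroup_fd = open("/sys/fs/cgroup/test",
				     O_DIRECTORY | O_CLOEXEC | O_PATH);
		if (cgroup_fd < 0)
			return 1;

		struct clone_args args = {
			.flags       = CLONE_INTO_CGROUP,
			.exit_signal = SIGCHLD,
			.cgroup      = cgroup_fd,
		};

		/* clone3() has no glibc wrapper; go through syscall(2). */
		pid_t pid = syscall(__NR_clone3, &args, sizeof(args));
		if (pid < 0)
			return 1; /* e.g. E2BIG: kernel predates CLONE_INTO_CGROUP */
		if (pid == 0)
			_exit(0); /* child already runs inside the target cgroup */

		waitpid(pid, NULL, 0);
		close(cgroup_fd);
		return 0;
	}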
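Likewise, the new cgroupfs user xattr support can be exercised with the
regular xattr(7) syscalls; a minimal sketch, again with a hypothetical
(writable, delegated) cgroup path:

	/*
	 * Minimal sketch: stash bookkeeping data in a user.* xattr on a
	 * cgroup directory (path is hypothetical).
	 */
	#include <stdio.h>
	#include <string.h>
	#include <sys/xattr.h>

	int main(void)
	{
		const char *path = "/sys/fs/cgroup/test";
		const char *val = "build-job-42";
		char buf[64];
		ssize_t n;

		/* Bounded per node by the limits added in this series:
		 * KERNFS_MAX_USER_XATTRS (128 entries) and
		 * KERNFS_USER_XATTR_SIZE_LIMIT (128K). */
		if (setxattr(path, "user.owner", val, strlen(val), 0))
			perror("setxattr");

		n = getxattr(path, "user.owner", buf, sizeof(buf));
		if (n >= 0)
			printf("user.owner=%.*s\n", (int)n, buf);
		return 0;
	}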

* 'for-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  docs: cgroup-v1: Document the cpuset_v2_mode mount option
  Revert "cpuset: Make cpuset hotplug synchronous"
  cgroupfs: Support user xattrs
  kernfs: Add option to enable user xattrs
  kernfs: Add removed_size out param for simple_xattr_set
  kernfs: kvmalloc xattr value instead of kmalloc
  cgroup: Restructure release_agent_path handling
  selftests/cgroup: add tests for cloning into cgroups
  clone3: allow spawning processes into cgroups
  cgroup: add cgroup_may_write() helper
  cgroup: refactor fork helpers
  cgroup: add cgroup_get_from_file() helper
  cgroup: unify attach permission checking
  cpuset: Make cpuset hotplug synchronous
  cgroup.c: Use built-in RCU list checking
  kselftest/cgroup: add cgroup destruction test
  cgroup: Clean up css_set task traversal
parents f2c3bec3 0c05b9bd
--- a/Documentation/admin-guide/cgroup-v1/cpusets.rst
+++ b/Documentation/admin-guide/cgroup-v1/cpusets.rst
@@ -223,6 +223,17 @@ cpu_online_mask using a CPU hotplug notifier, and the mems file
 automatically tracks the value of node_states[N_MEMORY]--i.e.,
 nodes with memory--using the cpuset_track_online_nodes() hook.
 
+The cpuset.effective_cpus and cpuset.effective_mems files are
+normally read-only copies of cpuset.cpus and cpuset.mems files
+respectively. If the cpuset cgroup filesystem is mounted with the
+special "cpuset_v2_mode" option, the behavior of these files will become
+similar to the corresponding files in cpuset v2. In other words, hotplug
+events will not change cpuset.cpus and cpuset.mems. Those events will
+only affect cpuset.effective_cpus and cpuset.effective_mems which show
+the actual cpus and memory nodes that are currently used by this cpuset.
+See Documentation/admin-guide/cgroup-v2.rst for more information about
+cpuset v2 behavior.
+
 
 1.4 What are exclusive cpusets ?
 --------------------------------
...
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -53,6 +53,8 @@ static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc)
 	kn->iattr->ia_ctime = kn->iattr->ia_atime;
 
 	simple_xattrs_init(&kn->iattr->xattrs);
+	atomic_set(&kn->iattr->nr_user_xattrs, 0);
+	atomic_set(&kn->iattr->user_xattr_size, 0);
 out_unlock:
 	ret = kn->iattr;
 	mutex_unlock(&iattr_mutex);
@@ -303,7 +305,7 @@ int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
 	if (!attrs)
 		return -ENOMEM;
 
-	return simple_xattr_set(&attrs->xattrs, name, value, size, flags);
+	return simple_xattr_set(&attrs->xattrs, name, value, size, flags, NULL);
 }
 
 static int kernfs_vfs_xattr_get(const struct xattr_handler *handler,
@@ -327,6 +329,86 @@ static int kernfs_vfs_xattr_set(const struct xattr_handler *handler,
 	return kernfs_xattr_set(kn, name, value, size, flags);
 }
 
+static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn,
+				     const char *full_name,
+				     struct simple_xattrs *xattrs,
+				     const void *value, size_t size, int flags)
+{
+	atomic_t *sz = &kn->iattr->user_xattr_size;
+	atomic_t *nr = &kn->iattr->nr_user_xattrs;
+	ssize_t removed_size;
+	int ret;
+
+	if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) {
+		ret = -ENOSPC;
+		goto dec_count_out;
+	}
+
+	if (atomic_add_return(size, sz) > KERNFS_USER_XATTR_SIZE_LIMIT) {
+		ret = -ENOSPC;
+		goto dec_size_out;
+	}
+
+	ret = simple_xattr_set(xattrs, full_name, value, size, flags,
+			       &removed_size);
+
+	if (!ret && removed_size >= 0)
+		size = removed_size;
+	else if (!ret)
+		return 0;
+dec_size_out:
+	atomic_sub(size, sz);
+dec_count_out:
+	atomic_dec(nr);
+	return ret;
+}
+
+static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn,
+				    const char *full_name,
+				    struct simple_xattrs *xattrs,
+				    const void *value, size_t size, int flags)
+{
+	atomic_t *sz = &kn->iattr->user_xattr_size;
+	atomic_t *nr = &kn->iattr->nr_user_xattrs;
+	ssize_t removed_size;
+	int ret;
+
+	ret = simple_xattr_set(xattrs, full_name, value, size, flags,
+			       &removed_size);
+
+	if (removed_size >= 0) {
+		atomic_sub(removed_size, sz);
+		atomic_dec(nr);
+	}
+
+	return ret;
+}
+
+static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler,
+				     struct dentry *unused, struct inode *inode,
+				     const char *suffix, const void *value,
+				     size_t size, int flags)
+{
+	const char *full_name = xattr_full_name(handler, suffix);
+	struct kernfs_node *kn = inode->i_private;
+	struct kernfs_iattrs *attrs;
+
+	if (!(kernfs_root(kn)->flags & KERNFS_ROOT_SUPPORT_USER_XATTR))
+		return -EOPNOTSUPP;
+
+	attrs = kernfs_iattrs(kn);
+	if (!attrs)
+		return -ENOMEM;
+
+	if (value)
+		return kernfs_vfs_user_xattr_add(kn, full_name, &attrs->xattrs,
+						 value, size, flags);
+	else
+		return kernfs_vfs_user_xattr_rm(kn, full_name, &attrs->xattrs,
+						value, size, flags);
+}
+
 static const struct xattr_handler kernfs_trusted_xattr_handler = {
 	.prefix = XATTR_TRUSTED_PREFIX,
 	.get = kernfs_vfs_xattr_get,
@@ -339,8 +421,15 @@ static const struct xattr_handler kernfs_security_xattr_handler = {
 	.set = kernfs_vfs_xattr_set,
 };
 
+static const struct xattr_handler kernfs_user_xattr_handler = {
+	.prefix = XATTR_USER_PREFIX,
+	.get = kernfs_vfs_xattr_get,
+	.set = kernfs_vfs_user_xattr_set,
+};
+
 const struct xattr_handler *kernfs_xattr_handlers[] = {
 	&kernfs_trusted_xattr_handler,
 	&kernfs_security_xattr_handler,
+	&kernfs_user_xattr_handler,
 	NULL
 };
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -26,6 +26,8 @@ struct kernfs_iattrs {
 	struct timespec64	ia_ctime;
 
 	struct simple_xattrs	xattrs;
+	atomic_t		nr_user_xattrs;
+	atomic_t		user_xattr_size;
 };
 
 /* +1 to avoid triggering overflow warning when negating it */
...
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -817,7 +817,7 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size)
 	if (len < sizeof(*new_xattr))
 		return NULL;
 
-	new_xattr = kmalloc(len, GFP_KERNEL);
+	new_xattr = kvmalloc(len, GFP_KERNEL);
 	if (!new_xattr)
 		return NULL;
 
@@ -860,6 +860,7 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
  * @value: value of the xattr. If %NULL, will remove the attribute.
  * @size: size of the new xattr
  * @flags: %XATTR_{CREATE|REPLACE}
+ * @removed_size: returns size of the removed xattr, -1 if none removed
  *
  * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
  * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist;
@@ -868,7 +869,8 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
  * Returns 0 on success, -errno on failure.
  */
 int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
-		     const void *value, size_t size, int flags)
+		     const void *value, size_t size, int flags,
+		     ssize_t *removed_size)
 {
 	struct simple_xattr *xattr;
 	struct simple_xattr *new_xattr = NULL;
@@ -882,7 +884,7 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
 		new_xattr->name = kstrdup(name, GFP_KERNEL);
 		if (!new_xattr->name) {
-			kfree(new_xattr);
+			kvfree(new_xattr);
 			return -ENOMEM;
 		}
 	}
@@ -895,8 +897,12 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
 			err = -EEXIST;
 		} else if (new_xattr) {
 			list_replace(&xattr->list, &new_xattr->list);
+			if (removed_size)
+				*removed_size = xattr->size;
 		} else {
 			list_del(&xattr->list);
+			if (removed_size)
+				*removed_size = xattr->size;
 		}
 		goto out;
 	}
@@ -908,11 +914,14 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
 		list_add(&new_xattr->list, &xattrs->head);
 		xattr = NULL;
 	}
+
+	if (removed_size)
+		*removed_size = -1;
 out:
 	spin_unlock(&xattrs->lock);
 	if (xattr) {
 		kfree(xattr->name);
-		kfree(xattr);
+		kvfree(xattr);
 	}
 	return err;
...
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -633,8 +633,9 @@ struct cgroup_subsys {
 	void (*cancel_attach)(struct cgroup_taskset *tset);
 	void (*attach)(struct cgroup_taskset *tset);
 	void (*post_attach)(void);
-	int (*can_fork)(struct task_struct *task);
-	void (*cancel_fork)(struct task_struct *task);
+	int (*can_fork)(struct task_struct *task,
+			struct css_set *cset);
+	void (*cancel_fork)(struct task_struct *task, struct css_set *cset);
 	void (*fork)(struct task_struct *task);
 	void (*exit)(struct task_struct *task);
 	void (*release)(struct task_struct *task);
...
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -27,6 +27,8 @@
 #include <linux/cgroup-defs.h>
 
+struct kernel_clone_args;
+
 #ifdef CONFIG_CGROUPS
 
 /*
@@ -58,9 +60,6 @@ struct css_task_iter {
 	struct list_head		*tcset_head;
 
 	struct list_head		*task_pos;
-	struct list_head		*tasks_head;
-	struct list_head		*mg_tasks_head;
-	struct list_head		*dying_tasks_head;
 
 	struct list_head		*cur_tasks_head;
 	struct css_set			*cur_cset;
@@ -122,9 +121,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 		     struct pid *pid, struct task_struct *tsk);
 
 void cgroup_fork(struct task_struct *p);
-extern int cgroup_can_fork(struct task_struct *p);
-extern void cgroup_cancel_fork(struct task_struct *p);
-extern void cgroup_post_fork(struct task_struct *p);
+extern int cgroup_can_fork(struct task_struct *p,
+			   struct kernel_clone_args *kargs);
+extern void cgroup_cancel_fork(struct task_struct *p,
+			       struct kernel_clone_args *kargs);
+extern void cgroup_post_fork(struct task_struct *p,
+			     struct kernel_clone_args *kargs);
 void cgroup_exit(struct task_struct *p);
 void cgroup_release(struct task_struct *p);
 void cgroup_free(struct task_struct *p);
@@ -708,9 +710,12 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
 				    struct dentry *dentry) { return -EINVAL; }
 
 static inline void cgroup_fork(struct task_struct *p) {}
-static inline int cgroup_can_fork(struct task_struct *p) { return 0; }
-static inline void cgroup_cancel_fork(struct task_struct *p) {}
-static inline void cgroup_post_fork(struct task_struct *p) {}
+static inline int cgroup_can_fork(struct task_struct *p,
+				  struct kernel_clone_args *kargs) { return 0; }
+static inline void cgroup_cancel_fork(struct task_struct *p,
+				      struct kernel_clone_args *kargs) {}
+static inline void cgroup_post_fork(struct task_struct *p,
+				    struct kernel_clone_args *kargs) {}
 static inline void cgroup_exit(struct task_struct *p) {}
 static inline void cgroup_release(struct task_struct *p) {}
 static inline void cgroup_free(struct task_struct *p) {}
...
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -37,8 +37,10 @@ enum kernfs_node_type {
 	KERNFS_LINK	= 0x0004,
 };
 
 #define KERNFS_TYPE_MASK	0x000f
 #define KERNFS_FLAG_MASK	~KERNFS_TYPE_MASK
+#define KERNFS_MAX_USER_XATTRS	128
+#define KERNFS_USER_XATTR_SIZE_LIMIT	(128 << 10)
 
 enum kernfs_node_flag {
 	KERNFS_ACTIVATED	= 0x0010,
@@ -78,6 +80,11 @@ enum kernfs_root_flag {
 	 * fhandle to access nodes of the fs.
 	 */
 	KERNFS_ROOT_SUPPORT_EXPORTOP		= 0x0004,
+
+	/*
+	 * Support user xattrs to be written to nodes rooted at this root.
+	 */
+	KERNFS_ROOT_SUPPORT_USER_XATTR		= 0x0008,
 };
 
 /* type-specific structures for kernfs_node union members */
...
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -13,6 +13,7 @@
 struct task_struct;
 struct rusage;
 union thread_union;
+struct css_set;
 
 /* All the bits taken by the old clone syscall. */
 #define CLONE_LEGACY_FLAGS 0xffffffffULL
@@ -29,6 +30,9 @@ struct kernel_clone_args {
 	pid_t *set_tid;
 	/* Number of elements in *set_tid */
 	size_t set_tid_size;
+	int cgroup;
+	struct cgroup *cgrp;
+	struct css_set *cset;
 };
 
 /*
...
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -102,7 +102,8 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size);
 int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
 		     void *buffer, size_t size);
 int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
-		     const void *value, size_t size, int flags);
+		     const void *value, size_t size, int flags,
+		     ssize_t *removed_size);
 ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer,
 			  size_t size);
 void simple_xattr_list_add(struct simple_xattrs *xattrs,
...
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -35,6 +35,7 @@
 
 /* Flags for the clone3() syscall. */
 #define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
+#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
 
 /*
  * cloning flags intersect with CSIGNAL so can be used with unshare and clone3
@@ -81,6 +82,8 @@
  * @set_tid_size: This defines the size of the array referenced
  *                in @set_tid. This cannot be larger than the
  *                kernel's limit of nested PID namespaces.
+ * @cgroup:       If CLONE_INTO_CGROUP is specified set this to
+ *                a file descriptor for the cgroup.
  *
  * The structure is versioned by size and thus extensible.
  * New struct members must go at the end of the struct and
@@ -97,11 +100,13 @@ struct clone_args {
 	__aligned_u64 tls;
 	__aligned_u64 set_tid;
 	__aligned_u64 set_tid_size;
+	__aligned_u64 cgroup;
 };
 #endif
 
 #define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
 #define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
+#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
 
 /*
  * Scheduling policies
...
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -38,10 +38,7 @@ static bool cgroup_no_v1_named;
  */
 static struct workqueue_struct *cgroup_pidlist_destroy_wq;
 
-/*
- * Protects cgroup_subsys->release_agent_path. Modifying it also requires
- * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
- */
+/* protects cgroup_subsys->release_agent_path */
 static DEFINE_SPINLOCK(release_agent_path_lock);
 
 bool cgroup1_ssid_disabled(int ssid)
@@ -775,22 +772,29 @@ void cgroup1_release_agent(struct work_struct *work)
 {
 	struct cgroup *cgrp =
 		container_of(work, struct cgroup, release_agent_work);
-	char *pathbuf = NULL, *agentbuf = NULL;
+	char *pathbuf, *agentbuf;
 	char *argv[3], *envp[3];
 	int ret;
 
-	mutex_lock(&cgroup_mutex);
+	/* snoop agent path and exit early if empty */
+	if (!cgrp->root->release_agent_path[0])
+		return;
 
+	/* prepare argument buffers */
 	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
-	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
-	if (!pathbuf || !agentbuf || !strlen(agentbuf))
-		goto out;
+	agentbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!pathbuf || !agentbuf)
+		goto out_free;
 
-	spin_lock_irq(&css_set_lock);
-	ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
-	spin_unlock_irq(&css_set_lock);
+	spin_lock(&release_agent_path_lock);
+	strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
+	spin_unlock(&release_agent_path_lock);
+	if (!agentbuf[0])
+		goto out_free;
+
+	ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
 	if (ret < 0 || ret >= PATH_MAX)
-		goto out;
+		goto out_free;
 
 	argv[0] = agentbuf;
 	argv[1] = pathbuf;
@@ -801,11 +805,7 @@ void cgroup1_release_agent(struct work_struct *work)
 	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
 	envp[2] = NULL;
 
-	mutex_unlock(&cgroup_mutex);
 	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
-	goto out_free;
-out:
-	mutex_unlock(&cgroup_mutex);
 out_free:
 	kfree(agentbuf);
 	kfree(pathbuf);
...
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1966,7 +1966,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 	root->kf_root = kernfs_create_root(kf_sops,
 					   KERNFS_ROOT_CREATE_DEACTIVATED |
-					   KERNFS_ROOT_SUPPORT_EXPORTOP,
+					   KERNFS_ROOT_SUPPORT_EXPORTOP |
+					   KERNFS_ROOT_SUPPORT_USER_XATTR,
 					   root_cgrp);
 	if (IS_ERR(root->kf_root)) {
 		ret = PTR_ERR(root->kf_root);
@@ -2726,11 +2727,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
 {
 	DEFINE_CGROUP_MGCTX(mgctx);
 	struct task_struct *task;
-	int ret;
-
-	ret = cgroup_migrate_vet_dst(dst_cgrp);
-	if (ret)
-		return ret;
+	int ret = 0;
 
 	/* look up all src csets */
 	spin_lock_irq(&css_set_lock);
@@ -4160,7 +4157,8 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
 	} else if (likely(!(pos->flags & CSS_RELEASED))) {
 		next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
 	} else {
-		list_for_each_entry_rcu(next, &parent->children, sibling)
+		list_for_each_entry_rcu(next, &parent->children, sibling,
+					lockdep_is_held(&cgroup_mutex))
 			if (next->serial_nr > pos->serial_nr)
 				break;
 	}
@@ -4403,29 +4401,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
 	lockdep_assert_held(&css_set_lock);
 
-	/* Advance to the next non-empty css_set */
-	do {
-		cset = css_task_iter_next_css_set(it);
-		if (!cset) {
-			it->task_pos = NULL;
-			return;
+	/* Advance to the next non-empty css_set and find first non-empty tasks list*/
+	while ((cset = css_task_iter_next_css_set(it))) {
+		if (!list_empty(&cset->tasks)) {
+			it->cur_tasks_head = &cset->tasks;
+			break;
+		} else if (!list_empty(&cset->mg_tasks)) {
+			it->cur_tasks_head = &cset->mg_tasks;
+			break;
+		} else if (!list_empty(&cset->dying_tasks)) {
+			it->cur_tasks_head = &cset->dying_tasks;
+			break;
 		}
-	} while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
-
-	if (!list_empty(&cset->tasks)) {
-		it->task_pos = cset->tasks.next;
-		it->cur_tasks_head = &cset->tasks;
-	} else if (!list_empty(&cset->mg_tasks)) {
-		it->task_pos = cset->mg_tasks.next;
-		it->cur_tasks_head = &cset->mg_tasks;
-	} else {
-		it->task_pos = cset->dying_tasks.next;
-		it->cur_tasks_head = &cset->dying_tasks;
 	}
-
-	it->tasks_head = &cset->tasks;
-	it->mg_tasks_head = &cset->mg_tasks;
-	it->dying_tasks_head = &cset->dying_tasks;
+	if (!cset) {
+		it->task_pos = NULL;
+		return;
+	}
+	it->task_pos = it->cur_tasks_head->next;
 
 	/*
 	 * We don't keep css_sets locked across iteration steps and thus
@@ -4470,24 +4463,24 @@ static void css_task_iter_advance(struct css_task_iter *it)
 repeat:
 	if (it->task_pos) {
 		/*
-		 * Advance iterator to find next entry. cset->tasks is
-		 * consumed first and then ->mg_tasks. After ->mg_tasks,
-		 * we move onto the next cset.
+		 * Advance iterator to find next entry. We go through cset
+		 * tasks, mg_tasks and dying_tasks, when consumed we move onto
+		 * the next cset.
 		 */
 		if (it->flags & CSS_TASK_ITER_SKIPPED)
 			it->flags &= ~CSS_TASK_ITER_SKIPPED;
 		else
 			it->task_pos = it->task_pos->next;
 
-		if (it->task_pos == it->tasks_head) {
-			it->task_pos = it->mg_tasks_head->next;
-			it->cur_tasks_head = it->mg_tasks_head;
+		if (it->task_pos == &it->cur_cset->tasks) {
+			it->cur_tasks_head = &it->cur_cset->mg_tasks;
+			it->task_pos = it->cur_tasks_head->next;
 		}
-		if (it->task_pos == it->mg_tasks_head) {
-			it->task_pos = it->dying_tasks_head->next;
-			it->cur_tasks_head = it->dying_tasks_head;
+		if (it->task_pos == &it->cur_cset->mg_tasks) {
+			it->cur_tasks_head = &it->cur_cset->dying_tasks;
+			it->task_pos = it->cur_tasks_head->next;
 		}
-		if (it->task_pos == it->dying_tasks_head)
+		if (it->task_pos == &it->cur_cset->dying_tasks)
 			css_task_iter_advance_css_set(it);
 	} else {
 		/* called from start, proceed to the first cset */
@@ -4505,12 +4498,12 @@ static void css_task_iter_advance(struct css_task_iter *it)
 			goto repeat;
 
 		/* and dying leaders w/o live member threads */
-		if (it->cur_tasks_head == it->dying_tasks_head &&
+		if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
 		    !atomic_read(&task->signal->live))
 			goto repeat;
 	} else {
 		/* skip all dying ones */
-		if (it->cur_tasks_head == it->dying_tasks_head)
+		if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
 			goto repeat;
 	}
 }
@@ -4674,13 +4667,28 @@ static int cgroup_procs_show(struct seq_file *s, void *v)
 	return 0;
 }
 
+static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
+{
+	int ret;
+	struct inode *inode;
+
+	lockdep_assert_held(&cgroup_mutex);
+
+	inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
+	if (!inode)
+		return -ENOMEM;
+
+	ret = inode_permission(inode, MAY_WRITE);
+	iput(inode);
+	return ret;
+}
+
 static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
 					 struct cgroup *dst_cgrp,
 					 struct super_block *sb)
 {
 	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
 	struct cgroup *com_cgrp = src_cgrp;
-	struct inode *inode;
 	int ret;
 
 	lockdep_assert_held(&cgroup_mutex);
@@ -4690,12 +4698,7 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
 		com_cgrp = cgroup_parent(com_cgrp);
 
 	/* %current should be authorized to migrate to the common ancestor */
-	inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
-	if (!inode)
-		return -ENOMEM;
-
-	ret = inode_permission(inode, MAY_WRITE);
-	iput(inode);
+	ret = cgroup_may_write(com_cgrp, sb);
 	if (ret)
 		return ret;
 
@@ -4711,6 +4714,26 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
 	return 0;
 }
 
+static int cgroup_attach_permissions(struct cgroup *src_cgrp,
+				     struct cgroup *dst_cgrp,
+				     struct super_block *sb, bool threadgroup)
+{
+	int ret = 0;
+
+	ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb);
+	if (ret)
+		return ret;
+
+	ret = cgroup_migrate_vet_dst(dst_cgrp);
+	if (ret)
+		return ret;
+
+	if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
+		ret = -EOPNOTSUPP;
+
+	return ret;
+}
+
 static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
 				  char *buf, size_t nbytes, loff_t off)
 {
@@ -4733,8 +4756,8 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
 	src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
 	spin_unlock_irq(&css_set_lock);
 
-	ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
-					    of->file->f_path.dentry->d_sb);
+	ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+					of->file->f_path.dentry->d_sb, true);
 	if (ret)
 		goto out_finish;
 
@@ -4778,16 +4801,11 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
 	spin_unlock_irq(&css_set_lock);
 
 	/* thread migrations follow the cgroup.procs delegation rule */
-	ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
-					    of->file->f_path.dentry->d_sb);
+	ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+					of->file->f_path.dentry->d_sb, false);
 	if (ret)
 		goto out_finish;
 
-	/* and must be contained in the same domain */
-	ret = -EOPNOTSUPP;
-	if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
-		goto out_finish;
-
 	ret = cgroup_attach_task(dst_cgrp, task, false);
 
 out_finish:
@@ -5876,8 +5894,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
  * @child: pointer to task_struct of forking parent process.
  *
  * A task is associated with the init_css_set until cgroup_post_fork()
- * attaches it to the parent's css_set. Empty cg_list indicates that
- * @child isn't holding reference to its css_set.
+ * attaches it to the target css_set.
  */
 void cgroup_fork(struct task_struct *child)
 {
@@ -5885,21 +5902,172 @@ void cgroup_fork(struct task_struct *child)
 	INIT_LIST_HEAD(&child->cg_list);
 }
 
+static struct cgroup *cgroup_get_from_file(struct file *f)
+{
+	struct cgroup_subsys_state *css;
+	struct cgroup *cgrp;
+
+	css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+	if (IS_ERR(css))
+		return ERR_CAST(css);
+
+	cgrp = css->cgroup;
+	if (!cgroup_on_dfl(cgrp)) {
+		cgroup_put(cgrp);
+		return ERR_PTR(-EBADF);
+	}
+
+	return cgrp;
+}
+
+/**
+ * cgroup_css_set_fork - find or create a css_set for a child process
+ * @kargs: the arguments passed to create the child process
+ *
+ * This function finds or creates a new css_set which the child
+ * process will be attached to in cgroup_post_fork(). By default,
+ * the child process will be given the same css_set as its parent.
+ *
+ * If CLONE_INTO_CGROUP is specified this function will try to find an
+ * existing css_set which includes the requested cgroup and if not create
+ * a new css_set that the child will be attached to later. If this function
+ * succeeds it will hold cgroup_threadgroup_rwsem on return. If
+ * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
+ * before grabbing cgroup_threadgroup_rwsem and will hold a reference
+ * to the target cgroup.
+ */
+static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
+	__acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
+{
+	int ret;
+	struct cgroup *dst_cgrp = NULL;
+	struct css_set *cset;
+	struct super_block *sb;
+	struct file *f;
+
+	if (kargs->flags & CLONE_INTO_CGROUP)
+		mutex_lock(&cgroup_mutex);
+
+	cgroup_threadgroup_change_begin(current);
+
+	spin_lock_irq(&css_set_lock);
+	cset = task_css_set(current);
+	get_css_set(cset);
+	spin_unlock_irq(&css_set_lock);
+
+	if (!(kargs->flags & CLONE_INTO_CGROUP)) {
+		kargs->cset = cset;
+		return 0;
+	}
+
+	f = fget_raw(kargs->cgroup);
+	if (!f) {
+		ret = -EBADF;
+		goto err;
+	}
+	sb = f->f_path.dentry->d_sb;
+
+	dst_cgrp = cgroup_get_from_file(f);
+	if (IS_ERR(dst_cgrp)) {
+		ret = PTR_ERR(dst_cgrp);
+		dst_cgrp = NULL;
+		goto err;
+	}
+
+	if (cgroup_is_dead(dst_cgrp)) {
+		ret = -ENODEV;
+		goto err;
+	}
+
+	/*
+	 * Verify that the target cgroup is writable for us. This is
+	 * usually done by the vfs layer but since we're not going through
+	 * the vfs layer here we need to do it "manually".
+	 */
+	ret = cgroup_may_write(dst_cgrp, sb);
+	if (ret)
+		goto err;
+
+	ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
+					!(kargs->flags & CLONE_THREAD));
+	if (ret)
+		goto err;
+
+	kargs->cset = find_css_set(cset, dst_cgrp);
+	if (!kargs->cset) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	put_css_set(cset);
+	fput(f);
+	kargs->cgrp = dst_cgrp;
+	return ret;
+
+err:
+	cgroup_threadgroup_change_end(current);
+	mutex_unlock(&cgroup_mutex);
+	if (f)
+		fput(f);
+	if (dst_cgrp)
+		cgroup_put(dst_cgrp);
+	put_css_set(cset);
+	if (kargs->cset)
+		put_css_set(kargs->cset);
+	return ret;
+}
+
+/**
+ * cgroup_css_set_put_fork - drop references we took during fork
+ * @kargs: the arguments passed to create the child process
+ *
+ * Drop references to the prepared css_set and target cgroup if
+ * CLONE_INTO_CGROUP was requested.
+ */
+static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
+	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
+{
+	cgroup_threadgroup_change_end(current);
+
+	if (kargs->flags & CLONE_INTO_CGROUP) {
+		struct cgroup *cgrp = kargs->cgrp;
+		struct css_set *cset = kargs->cset;
+
+		mutex_unlock(&cgroup_mutex);
+
+		if (cset) {
+			put_css_set(cset);
+			kargs->cset = NULL;
+		}
+
+		if (cgrp) {
+			cgroup_put(cgrp);
+			kargs->cgrp = NULL;
+		}
+	}
+}
+
 /**
  * cgroup_can_fork - called on a new task before the process is exposed
- * @child: the task in question.
+ * @child: the child process
  *
- * This calls the subsystem can_fork() callbacks. If the can_fork() callback
- * returns an error, the fork aborts with that error code. This allows for
- * a cgroup subsystem to conditionally allow or deny new forks.
+ * This prepares a new css_set for the child process which the child will
+ * be attached to in cgroup_post_fork().
+ * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()
+ * callback returns an error, the fork aborts with that error code. This
+ * allows for a cgroup subsystem to conditionally allow or deny new forks.
  */
-int cgroup_can_fork(struct task_struct *child)
+int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
 {
 	struct cgroup_subsys *ss;
 	int i, j, ret;
 
+	ret = cgroup_css_set_fork(kargs);
+	if (ret)
+		return ret;
+
 	do_each_subsys_mask(ss, i, have_canfork_callback) {
-		ret = ss->can_fork(child);
+		ret = ss->can_fork(child, kargs->cset);
 		if (ret)
 			goto out_revert;
 	} while_each_subsys_mask();
@@ -5911,54 +6079,64 @@ int cgroup_can_fork(struct task_struct *child)
 		if (j >= i)
 			break;
 		if (ss->cancel_fork)
-			ss->cancel_fork(child);
+			ss->cancel_fork(child, kargs->cset);
 	}
 
+	cgroup_css_set_put_fork(kargs);
+
 	return ret;
 }
 
 /**
  * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
- * @child: the task in question
+ * @child: the child process
+ * @kargs: the arguments passed to create the child process
  *
  * This calls the cancel_fork() callbacks if a fork failed *after*
- * cgroup_can_fork() succeeded.
+ * cgroup_can_fork() succeeded and cleans up references we took to
+ * prepare a new css_set for the child process in cgroup_can_fork().
  */
-void cgroup_cancel_fork(struct task_struct *child)
+void cgroup_cancel_fork(struct task_struct *child,
+			struct kernel_clone_args *kargs)
 {
 	struct cgroup_subsys *ss;
 	int i;
 
 	for_each_subsys(ss, i)
 		if (ss->cancel_fork)
-			ss->cancel_fork(child);
+			ss->cancel_fork(child, kargs->cset);
+
+	cgroup_css_set_put_fork(kargs);
 }
 
 /**
- * cgroup_post_fork - called on a new task after adding it to the task list
- * @child: the task in question
+ * cgroup_post_fork - finalize cgroup setup for the child process
+ * @child: the child process
  *
- * Adds the task to the list running through its css_set if necessary and
- * call the subsystem fork() callbacks. Has to be after the task is
- * visible on the task list in case we race with the first call to
- * cgroup_task_iter_start() - to guarantee that the new task ends up on its
- * list.
+ * Attach the child process to its css_set calling the subsystem fork()
+ * callbacks.
  */
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child,
+		      struct kernel_clone_args *kargs)
+	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
 {
 	struct cgroup_subsys *ss;
 	struct css_set *cset;
 	int i;
 
+	cset = kargs->cset;
+	kargs->cset = NULL;
+
 	spin_lock_irq(&css_set_lock);
 
 	/* init tasks are special, only link regular threads */
 	if (likely(child->pid)) {
 		WARN_ON_ONCE(!list_empty(&child->cg_list));
-		cset = task_css_set(current); /* current is @child's parent */
-		get_css_set(cset);
 		cset->nr_tasks++;
 		css_set_move_task(child, NULL, cset, false);
+	} else {
+		put_css_set(cset);
+		cset = NULL;
 	}
 
 	/*
@@ -5990,6 +6168,17 @@ void cgroup_post_fork(struct task_struct *child)
 	do_each_subsys_mask(ss, i, have_fork_callback) {
 		ss->fork(child);
 	} while_each_subsys_mask();
+
+	/* Make the new cset the root_cset of the new cgroup namespace. */
+	if (kargs->flags & CLONE_NEWCGROUP) {
+		struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
+
+		get_css_set(cset);
+		child->nsproxy->cgroup_ns->root_cset = cset;
+		put_css_set(rcset);
+	}
+
+	cgroup_css_set_put_fork(kargs);
 }
 
 /**
@@ -6176,7 +6365,6 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path);
  */
 struct cgroup *cgroup_get_from_fd(int fd)
 {
-	struct cgroup_subsys_state *css;
 	struct cgroup *cgrp;
 	struct file *f;
 
@@ -6184,17 +6372,8 @@ struct cgroup *cgroup_get_from_fd(int fd)
 	if (!f)
 		return ERR_PTR(-EBADF);
 
-	css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+	cgrp = cgroup_get_from_file(f);
 	fput(f);
-	if (IS_ERR(css))
-		return ERR_CAST(css);
-
-	cgrp = css->cgroup;
-	if (!cgroup_on_dfl(cgrp)) {
-		cgroup_put(cgrp);
-		return ERR_PTR(-EBADF);
-	}
-
 	return cgrp;
 }
 EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
...
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -358,8 +358,12 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
 
 /*
- * Cgroup v2 behavior is used when on default hierarchy or the
- * cgroup_v2_mode flag is set.
+ * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
+ * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
+ * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
+ * With v2 behavior, "cpus" and "mems" are always what the users have
+ * requested and won't be changed by hotplug events. Only the effective
+ * cpus or mems will be affected.
 */
 static inline bool is_in_v2_mode(void)
 {
...
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -33,6 +33,7 @@
 #include <linux/atomic.h>
 #include <linux/cgroup.h>
 #include <linux/slab.h>
+#include <linux/sched/task.h>
 
 #define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
 #define PIDS_MAX_STR "max"
@@ -214,13 +215,16 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
  * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
  * on cgroup_threadgroup_change_begin() held by the copy_process().
  */
-static int pids_can_fork(struct task_struct *task)
+static int pids_can_fork(struct task_struct *task, struct css_set *cset)
 {
 	struct cgroup_subsys_state *css;
 	struct pids_cgroup *pids;
 	int err;
 
-	css = task_css_check(current, pids_cgrp_id, true);
+	if (cset)
+		css = cset->subsys[pids_cgrp_id];
+	else
+		css = task_css_check(current, pids_cgrp_id, true);
 	pids = css_pids(css);
 	err = pids_try_charge(pids, 1);
 	if (err) {
@@ -235,12 +239,15 @@ static int pids_can_fork(struct task_struct *task)
 	return err;
 }
 
-static void pids_cancel_fork(struct task_struct *task)
+static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
 {
 	struct cgroup_subsys_state *css;
 	struct pids_cgroup *pids;
 
-	css = task_css_check(current, pids_cgrp_id, true);
+	if (cset)
+		css = cset->subsys[pids_cgrp_id];
+	else
+		css = task_css_check(current, pids_cgrp_id, true);
 	pids = css_pids(css);
 	pids_uncharge(pids, 1);
 }
...
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2176,16 +2176,15 @@ static __latent_entropy struct task_struct *copy_process(
 	INIT_LIST_HEAD(&p->thread_group);
 	p->task_works = NULL;
 
-	cgroup_threadgroup_change_begin(current);
 	/*
 	 * Ensure that the cgroup subsystem policies allow the new process to be
-	 * forked. It should be noted the the new process's css_set can be changed
+	 * forked. It should be noted that the new process's css_set can be changed
 	 * between here and cgroup_post_fork() if an organisation operation is in
 	 * progress.
 	 */
-	retval = cgroup_can_fork(p);
+	retval = cgroup_can_fork(p, args);
 	if (retval)
-		goto bad_fork_cgroup_threadgroup_change_end;
+		goto bad_fork_put_pidfd;
 
 	/*
 	 * From this point on we must avoid any synchronous user-space
@@ -2290,8 +2289,7 @@ static __latent_entropy struct task_struct *copy_process(
 	write_unlock_irq(&tasklist_lock);
 
 	proc_fork_connector(p);
-	cgroup_post_fork(p);
-	cgroup_threadgroup_change_end(current);
+	cgroup_post_fork(p, args);
 	perf_event_fork(p);
 
 	trace_task_newtask(p, clone_flags);
@@ -2302,9 +2300,7 @@ static __latent_entropy struct task_struct *copy_process(
 bad_fork_cancel_cgroup:
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
-	cgroup_cancel_fork(p);
-bad_fork_cgroup_threadgroup_change_end:
-	cgroup_threadgroup_change_end(current);
+	cgroup_cancel_fork(p, args);
 bad_fork_put_pidfd:
 	if (clone_flags & CLONE_PIDFD) {
 		fput(pidfile);
@@ -2633,6 +2629,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
 	     !valid_signal(args.exit_signal)))
 		return -EINVAL;
 
+	if ((args.flags & CLONE_INTO_CGROUP) && args.cgroup < 0)
+		return -EINVAL;
+
 	*kargs = (struct kernel_clone_args){
 		.flags		= args.flags,
 		.pidfd		= u64_to_user_ptr(args.pidfd),
@@ -2643,6 +2642,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
 		.stack_size	= args.stack_size,
 		.tls		= args.tls,
 		.set_tid_size	= args.set_tid_size,
+		.cgroup		= args.cgroup,
 	};
 
 	if (args.set_tid &&
@@ -2686,7 +2686,8 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
 static bool clone3_args_valid(struct kernel_clone_args *kargs)
 {
 	/* Verify that no unknown flags are passed along. */
-	if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND))
+	if (kargs->flags &
+	    ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
 		return false;
 
 	/*
...
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3243,7 +3243,7 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler,
 	struct shmem_inode_info *info = SHMEM_I(inode);
 
 	name = xattr_full_name(handler, name);
-	return simple_xattr_set(&info->xattrs, name, value, size, flags);
+	return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
 }
 
 static const struct xattr_handler shmem_security_xattr_handler = {
...
--- a/tools/testing/selftests/cgroup/Makefile
+++ b/tools/testing/selftests/cgroup/Makefile
@@ -11,6 +11,6 @@ TEST_GEN_PROGS += test_freezer
 
 include ../lib.mk
 
-$(OUTPUT)/test_memcontrol: cgroup_util.c
-$(OUTPUT)/test_core: cgroup_util.c
-$(OUTPUT)/test_freezer: cgroup_util.c
+$(OUTPUT)/test_memcontrol: cgroup_util.c ../clone3/clone3_selftests.h
+$(OUTPUT)/test_core: cgroup_util.c ../clone3/clone3_selftests.h
+$(OUTPUT)/test_freezer: cgroup_util.c ../clone3/clone3_selftests.h
--- a/tools/testing/selftests/cgroup/cgroup_util.c
+++ b/tools/testing/selftests/cgroup/cgroup_util.c
@@ -15,6 +15,7 @@
 #include <unistd.h>
 
 #include "cgroup_util.h"
+#include "../clone3/clone3_selftests.h"
 
 static ssize_t read_text(const char *path, char *buf, size_t max_len)
 {
@@ -331,12 +332,112 @@ int cg_run(const char *cgroup,
 	}
 }
 
+pid_t clone_into_cgroup(int cgroup_fd)
+{
+#ifdef CLONE_ARGS_SIZE_VER2
+	pid_t pid;
+
+	struct clone_args args = {
+		.flags = CLONE_INTO_CGROUP,
+		.exit_signal = SIGCHLD,
+		.cgroup = cgroup_fd,
+	};
+
+	pid = sys_clone3(&args, sizeof(struct clone_args));
+	/*
+	 * Verify that this is a genuine test failure:
+	 * ENOSYS -> clone3() not available
+	 * E2BIG  -> CLONE_INTO_CGROUP not available
+	 */
+	if (pid < 0 && (errno == ENOSYS || errno == E2BIG))
+		goto pretend_enosys;
+
+	return pid;
+
+pretend_enosys:
+#endif
+	errno = ENOSYS;
+	return -ENOSYS;
+}
+
+int clone_reap(pid_t pid, int options)
+{
+	int ret;
+	siginfo_t info = {
+		.si_signo = 0,
+	};
+
+again:
+	ret = waitid(P_PID, pid, &info, options | __WALL | __WNOTHREAD);
+	if (ret < 0) {
+		if (errno == EINTR)
+			goto again;
+		return -1;
+	}
+
+	if (options & WEXITED) {
+		if (WIFEXITED(info.si_status))
+			return WEXITSTATUS(info.si_status);
+	}
+
+	if (options & WSTOPPED) {
+		if (WIFSTOPPED(info.si_status))
+			return WSTOPSIG(info.si_status);
+	}
+
+	if (options & WCONTINUED) {
+		if (WIFCONTINUED(info.si_status))
+			return 0;
+	}
+
+	return -1;
+}
+
+int dirfd_open_opath(const char *dir)
+{
+	return open(dir, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW | O_PATH);
+}
+
+#define close_prot_errno(fd)                                                   \
+	if (fd >= 0) {                                                         \
+		int _e_ = errno;                                               \
+		close(fd);                                                     \
+		errno = _e_;                                                   \
+	}
+
+static int clone_into_cgroup_run_nowait(const char *cgroup,
+					int (*fn)(const char *cgroup, void *arg),
+					void *arg)
+{
+	int cgroup_fd;
+	pid_t pid;
+
+	cgroup_fd = dirfd_open_opath(cgroup);
+	if (cgroup_fd < 0)
+		return -1;
+
+	pid = clone_into_cgroup(cgroup_fd);
+	close_prot_errno(cgroup_fd);
+	if (pid == 0)
+		exit(fn(cgroup, arg));
+
+	return pid;
+}
+
 int cg_run_nowait(const char *cgroup,
 		  int (*fn)(const char *cgroup, void *arg),
 		  void *arg)
 {
 	int pid;
 
+	pid = clone_into_cgroup_run_nowait(cgroup, fn, arg);
+	if (pid > 0)
+		return pid;
+
+	/* Genuine test failure. */
+	if (pid < 0 && errno != ENOSYS)
+		return -1;
+
 	pid = fork();
 	if (pid == 0) {
 		char buf[64];
@@ -450,3 +551,28 @@ int proc_read_strstr(int pid, bool thread, const char *item, const char *needle)
 
 	return strstr(buf, needle) ? 0 : -1;
 }
+
+int clone_into_cgroup_run_wait(const char *cgroup)
+{
+	int cgroup_fd;
+	pid_t pid;
+
+	cgroup_fd = dirfd_open_opath(cgroup);
+	if (cgroup_fd < 0)
+		return -1;
+
+	pid = clone_into_cgroup(cgroup_fd);
+	close_prot_errno(cgroup_fd);
+	if (pid < 0)
+		return -1;
+
+	if (pid == 0)
+		exit(EXIT_SUCCESS);
+
+	/*
+	 * We don't care whether this fails. We only care whether the initial
+	 * clone succeeded.
+	 */
+	(void)clone_reap(pid, WEXITED);
+	return 0;
+}
--- a/tools/testing/selftests/cgroup/cgroup_util.h
+++ b/tools/testing/selftests/cgroup/cgroup_util.h
@@ -50,3 +50,7 @@ extern int cg_wait_for_proc_count(const char *cgroup, int count);
 extern int cg_killall(const char *cgroup);
 extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size);
 extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle);
+extern pid_t clone_into_cgroup(int cgroup_fd);
+extern int clone_reap(pid_t pid, int options);
+extern int clone_into_cgroup_run_wait(const char *cgroup);
+extern int dirfd_open_opath(const char *dir);
--- a/tools/testing/selftests/cgroup/test_core.c
+++ b/tools/testing/selftests/cgroup/test_core.c
@@ -2,7 +2,10 @@
 
 #include <linux/limits.h>
 #include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
 #include <unistd.h>
+#include <fcntl.h>
 #include <stdio.h>
 #include <errno.h>
 #include <signal.h>
@@ -12,6 +15,115 @@
 #include "../kselftest.h"
 #include "cgroup_util.h"
 
+static int touch_anon(char *buf, size_t size)
+{
+	int fd;
+	char *pos = buf;
+
+	fd = open("/dev/urandom", O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	while (size > 0) {
+		ssize_t ret = read(fd, pos, size);
+
+		if (ret < 0) {
+			if (errno != EINTR) {
+				close(fd);
+				return -1;
+			}
+		} else {
+			pos += ret;
+			size -= ret;
+		}
+	}
+	close(fd);
+
+	return 0;
+}
+
+static int alloc_and_touch_anon_noexit(const char *cgroup, void *arg)
+{
+	int ppid = getppid();
+	size_t size = (size_t)arg;
+	void *buf;
+
+	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
+		   0, 0);
+	if (buf == MAP_FAILED)
+		return -1;
+
+	if (touch_anon((char *)buf, size)) {
+		munmap(buf, size);
+		return -1;
+	}
+
+	while (getppid() == ppid)
+		sleep(1);
+
+	munmap(buf, size);
+	return 0;
+}
+
+/*
+ * Create a child process that allocates and touches 100MB, then waits to be
+ * killed. Wait until the child is attached to the cgroup, kill all processes
+ * in that cgroup and wait until "cgroup.procs" is empty. At this point try to
+ * destroy the empty cgroup. The test helps detect race conditions between
+ * dying processes leaving the cgroup and cgroup destruction path.
+ */
+static int test_cgcore_destroy(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *cg_test = NULL;
+	int child_pid;
+	char buf[PAGE_SIZE];
+
+	cg_test = cg_name(root, "cg_test");
+
+	if (!cg_test)
+		goto cleanup;
+
+	for (int i = 0; i < 10; i++) {
+		if (cg_create(cg_test))
+			goto cleanup;
+
+		child_pid = cg_run_nowait(cg_test, alloc_and_touch_anon_noexit,
+					  (void *) MB(100));
+
+		if (child_pid < 0)
+			goto cleanup;
+
+		/* wait for the child to enter cgroup */
+		if (cg_wait_for_proc_count(cg_test, 1))
+			goto cleanup;
+
+		if (cg_killall(cg_test))
+			goto cleanup;
+
+		/* wait for cgroup to be empty */
+		while (1) {
+			if (cg_read(cg_test, "cgroup.procs", buf, sizeof(buf)))
+				goto cleanup;
+			if (buf[0] == '\0')
+				break;
+			usleep(1000);
+		}
+
+		if (rmdir(cg_test))
+			goto cleanup;
+
+		if (waitpid(child_pid, NULL, 0) < 0)
+			goto cleanup;
+	}
+	ret = KSFT_PASS;
+cleanup:
+	if (cg_test)
+		cg_destroy(cg_test);
+	free(cg_test);
+	return ret;
+}
+
 /*
  * A(0) - B(0) - C(1)
  *        \ D(0)
@@ -25,8 +137,11 @@
 static int test_cgcore_populated(const char *root)
 {
 	int ret = KSFT_FAIL;
+	int err;
 	char *cg_test_a = NULL, *cg_test_b = NULL;
 	char *cg_test_c = NULL, *cg_test_d = NULL;
+	int cgroup_fd = -EBADF;
+	pid_t pid;
 
 	cg_test_a = cg_name(root, "cg_test_a");
 	cg_test_b = cg_name(root, "cg_test_a/cg_test_b");
@@ -78,6 +193,52 @@ static int test_cgcore_populated(const char *root)
 	if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n"))
 		goto cleanup;
 
+	/* Test that we can directly clone into a new cgroup. */
+	cgroup_fd = dirfd_open_opath(cg_test_d);
+	if (cgroup_fd < 0)
+		goto cleanup;
+
+	pid = clone_into_cgroup(cgroup_fd);
+	if (pid < 0) {
+		if (errno == ENOSYS)
+			goto cleanup_pass;
+		goto cleanup;
+	}
+
+	if (pid == 0) {
+		if (raise(SIGSTOP))
+			exit(EXIT_FAILURE);
+		exit(EXIT_SUCCESS);
+	}
+
+	err = cg_read_strcmp(cg_test_d, "cgroup.events", "populated 1\n");
+
+	(void)clone_reap(pid, WSTOPPED);
+	(void)kill(pid, SIGCONT);
+	(void)clone_reap(pid, WEXITED);
+
+	if (err)
+		goto cleanup;
+
+	if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n"))
+		goto cleanup;
+
+	/* Remove cgroup. */
+	if (cg_test_d) {
+		cg_destroy(cg_test_d);
+		free(cg_test_d);
+		cg_test_d = NULL;
+	}
+
+	pid = clone_into_cgroup(cgroup_fd);
+	if (pid < 0)
+		goto cleanup_pass;
+	if (pid == 0)
+		exit(EXIT_SUCCESS);
+	(void)clone_reap(pid, WEXITED);
+	goto cleanup;
+
+cleanup_pass:
 	ret = KSFT_PASS;
 
 cleanup:
@@ -93,6 +254,8 @@ static int test_cgcore_populated(const char *root)
 	free(cg_test_c);
 	free(cg_test_b);
 	free(cg_test_a);
+	if (cgroup_fd >= 0)
+		close(cgroup_fd);
 	return ret;
 }
 
@@ -136,6 +299,16 @@ static int test_cgcore_invalid_domain(const char *root)
 	if (errno != EOPNOTSUPP)
 		goto cleanup;
 
+	if (!clone_into_cgroup_run_wait(child))
+		goto cleanup;
+
+	if (errno == ENOSYS)
+		goto cleanup_pass;
+
+	if (errno != EOPNOTSUPP)
+		goto cleanup;
+
+cleanup_pass:
 	ret = KSFT_PASS;
 
 cleanup:
@@ -345,6 +518,9 @@ static int test_cgcore_internal_process_constraint(const char *root)
 	if (!cg_enter_current(parent))
 		goto cleanup;
 
+	if (!clone_into_cgroup_run_wait(parent))
+		goto cleanup;
+
 	ret = KSFT_PASS;
 
 cleanup:
@@ -512,6 +688,7 @@ struct corecg_test {
 	T(test_cgcore_populated),
 	T(test_cgcore_proc_migration),
 	T(test_cgcore_thread_migration),
+	T(test_cgcore_destroy),
 };
 #undef T
 
...
--- a/tools/testing/selftests/clone3/clone3_selftests.h
+++ b/tools/testing/selftests/clone3/clone3_selftests.h
@@ -5,12 +5,24 @@
 
 #define _GNU_SOURCE
 #include <sched.h>
+#include <linux/sched.h>
+#include <linux/types.h>
 #include <stdint.h>
 #include <syscall.h>
-#include <linux/types.h>
+#include <sys/wait.h>
+
+#include "../kselftest.h"
 
 #define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))
 
+#ifndef CLONE_INTO_CGROUP
+#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
+#endif
+
+#ifndef CLONE_ARGS_SIZE_VER0
+#define CLONE_ARGS_SIZE_VER0 64
+#endif
+
 #ifndef __NR_clone3
 #define __NR_clone3 -1
 struct clone_args {
@@ -22,10 +34,13 @@ struct clone_args {
 	__aligned_u64 stack;
 	__aligned_u64 stack_size;
 	__aligned_u64 tls;
+#define CLONE_ARGS_SIZE_VER1 80
 	__aligned_u64 set_tid;
 	__aligned_u64 set_tid_size;
+#define CLONE_ARGS_SIZE_VER2 88
+	__aligned_u64 cgroup;
 };
-#endif
+#endif /* __NR_clone3 */
 
 static pid_t sys_clone3(struct clone_args *args, size_t size)
 {
...