Commit d8836005 authored by Linus Torvalds

Merge branch 'for-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cgroup updates from Tejun Heo:

 - Christian extended clone3 so that processes can be spawned into
   cgroups directly.

   This is not only neat in terms of semantics but also avoids grabbing
   the global cgroup_threadgroup_rwsem for migration. (A hedged userspace
   sketch of clone3() usage follows this list.)

 - Daniel added !root xattr support to cgroupfs.

   Userland already uses xattrs on cgroupfs for bookkeeping. This will
   allow delegated cgroups to support such usages. (A second sketch after
   this list illustrates the userland view.)

 - Prateek tried to make cpuset hotplug handling synchronous but that
   led to possible deadlock scenarios. Reverted.

 - Other minor changes including release_agent_path handling cleanup.
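
For illustration only (not part of the pull): a minimal userspace sketch
of spawning a process directly into a cgroup with clone3(). The cgroup
path is hypothetical, error handling is trimmed, and it assumes uapi
headers new enough to carry CLONE_INTO_CGROUP and the "cgroup" member of
struct clone_args.

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <sched.h>
    #include <signal.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <linux/sched.h>	/* struct clone_args, CLONE_INTO_CGROUP */

    int main(void)
    {
    	/* O_PATH fd of the (hypothetical) target cgroup directory */
    	int cgroup_fd = open("/sys/fs/cgroup/mycgroup",
    			     O_DIRECTORY | O_CLOEXEC | O_PATH);
    	if (cgroup_fd < 0)
    		return 1;

    	struct clone_args args = {
    		.flags = CLONE_INTO_CGROUP,
    		.exit_signal = SIGCHLD,
    		.cgroup = cgroup_fd,
    	};

    	/* no glibc wrapper for clone3() yet, so use the raw syscall */
    	pid_t pid = syscall(__NR_clone3, &args, sizeof(args));
    	if (pid == 0)
    		_exit(0);	/* child starts life inside mycgroup */
    	return pid < 0 ? 1 : 0;
    }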
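Likewise, a hedged sketch of the userland xattr bookkeeping that the
cgroupfs change enables; the attribute name and cgroup path are made up
for the example.

    #include <stdio.h>
    #include <string.h>
    #include <sys/types.h>
    #include <sys/xattr.h>

    int main(void)
    {
    	/* hypothetical bookkeeping tag on a delegated cgroup */
    	const char *cg = "/sys/fs/cgroup/mycgroup";
    	const char *val = "manager-1";
    	char buf[64];
    	ssize_t len;

    	/* user.* xattrs now work for !root owners of the cgroup dir */
    	if (setxattr(cg, "user.owner", val, strlen(val), 0))
    		perror("setxattr");

    	len = getxattr(cg, "user.owner", buf, sizeof(buf));
    	if (len > 0)
    		printf("user.owner=%.*s\n", (int)len, buf);
    	return 0;
    }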

* 'for-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  docs: cgroup-v1: Document the cpuset_v2_mode mount option
  Revert "cpuset: Make cpuset hotplug synchronous"
  cgroupfs: Support user xattrs
  kernfs: Add option to enable user xattrs
  kernfs: Add removed_size out param for simple_xattr_set
  kernfs: kvmalloc xattr value instead of kmalloc
  cgroup: Restructure release_agent_path handling
  selftests/cgroup: add tests for cloning into cgroups
  clone3: allow spawning processes into cgroups
  cgroup: add cgroup_may_write() helper
  cgroup: refactor fork helpers
  cgroup: add cgroup_get_from_file() helper
  cgroup: unify attach permission checking
  cpuset: Make cpuset hotplug synchronous
  cgroup.c: Use built-in RCU list checking
  kselftest/cgroup: add cgroup destruction test
  cgroup: Clean up css_set task traversal
Parents: f2c3bec3 0c05b9bd
@@ -223,6 +223,17 @@ cpu_online_mask using a CPU hotplug notifier, and the mems file
automatically tracks the value of node_states[N_MEMORY]--i.e.,
nodes with memory--using the cpuset_track_online_nodes() hook.
The cpuset.effective_cpus and cpuset.effective_mems files are
normally read-only copies of cpuset.cpus and cpuset.mems files
respectively. If the cpuset cgroup filesystem is mounted with the
special "cpuset_v2_mode" option, the behavior of these files will become
similar to the corresponding files in cpuset v2. In other words, hotplug
events will not change cpuset.cpus and cpuset.mems. Those events will
only affect cpuset.effective_cpus and cpuset.effective_mems which show
the actual cpus and memory nodes that are currently used by this cpuset.
See Documentation/admin-guide/cgroup-v2.rst for more information about
cpuset v2 behavior.
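
(Illustration, not part of the patch: mounting the v1 cpuset hierarchy
with this option is roughly equivalent to the following mount(2) call;
the mount point is only an example.)

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
    	/* same as: mount -t cgroup -o cpuset,cpuset_v2_mode none /dev/cpuset */
    	if (mount("none", "/dev/cpuset", "cgroup", 0,
    		  "cpuset,cpuset_v2_mode"))
    		perror("mount");
    	return 0;
    }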
1.4 What are exclusive cpusets ?
--------------------------------
......
@@ -53,6 +53,8 @@ static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc)
kn->iattr->ia_ctime = kn->iattr->ia_atime;
simple_xattrs_init(&kn->iattr->xattrs);
atomic_set(&kn->iattr->nr_user_xattrs, 0);
atomic_set(&kn->iattr->user_xattr_size, 0);
out_unlock:
ret = kn->iattr;
mutex_unlock(&iattr_mutex);
@@ -303,7 +305,7 @@ int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
if (!attrs)
return -ENOMEM;
-return simple_xattr_set(&attrs->xattrs, name, value, size, flags);
+return simple_xattr_set(&attrs->xattrs, name, value, size, flags, NULL);
}
static int kernfs_vfs_xattr_get(const struct xattr_handler *handler,
@@ -327,6 +329,86 @@ static int kernfs_vfs_xattr_set(const struct xattr_handler *handler,
return kernfs_xattr_set(kn, name, value, size, flags);
}
static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn,
const char *full_name,
struct simple_xattrs *xattrs,
const void *value, size_t size, int flags)
{
atomic_t *sz = &kn->iattr->user_xattr_size;
atomic_t *nr = &kn->iattr->nr_user_xattrs;
ssize_t removed_size;
int ret;
if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) {
ret = -ENOSPC;
goto dec_count_out;
}
if (atomic_add_return(size, sz) > KERNFS_USER_XATTR_SIZE_LIMIT) {
ret = -ENOSPC;
goto dec_size_out;
}
ret = simple_xattr_set(xattrs, full_name, value, size, flags,
&removed_size);
if (!ret && removed_size >= 0)
size = removed_size;
else if (!ret)
return 0;
dec_size_out:
atomic_sub(size, sz);
dec_count_out:
atomic_dec(nr);
return ret;
}
static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn,
const char *full_name,
struct simple_xattrs *xattrs,
const void *value, size_t size, int flags)
{
atomic_t *sz = &kn->iattr->user_xattr_size;
atomic_t *nr = &kn->iattr->nr_user_xattrs;
ssize_t removed_size;
int ret;
ret = simple_xattr_set(xattrs, full_name, value, size, flags,
&removed_size);
if (removed_size >= 0) {
atomic_sub(removed_size, sz);
atomic_dec(nr);
}
return ret;
}
static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler,
struct dentry *unused, struct inode *inode,
const char *suffix, const void *value,
size_t size, int flags)
{
const char *full_name = xattr_full_name(handler, suffix);
struct kernfs_node *kn = inode->i_private;
struct kernfs_iattrs *attrs;
if (!(kernfs_root(kn)->flags & KERNFS_ROOT_SUPPORT_USER_XATTR))
return -EOPNOTSUPP;
attrs = kernfs_iattrs(kn);
if (!attrs)
return -ENOMEM;
if (value)
return kernfs_vfs_user_xattr_add(kn, full_name, &attrs->xattrs,
value, size, flags);
else
return kernfs_vfs_user_xattr_rm(kn, full_name, &attrs->xattrs,
value, size, flags);
}
static const struct xattr_handler kernfs_trusted_xattr_handler = {
.prefix = XATTR_TRUSTED_PREFIX,
.get = kernfs_vfs_xattr_get,
@@ -339,8 +421,15 @@ static const struct xattr_handler kernfs_security_xattr_handler = {
.set = kernfs_vfs_xattr_set,
};
static const struct xattr_handler kernfs_user_xattr_handler = {
.prefix = XATTR_USER_PREFIX,
.get = kernfs_vfs_xattr_get,
.set = kernfs_vfs_user_xattr_set,
};
const struct xattr_handler *kernfs_xattr_handlers[] = {
&kernfs_trusted_xattr_handler,
&kernfs_security_xattr_handler,
&kernfs_user_xattr_handler,
NULL
};
@@ -26,6 +26,8 @@ struct kernfs_iattrs {
struct timespec64 ia_ctime;
struct simple_xattrs xattrs;
atomic_t nr_user_xattrs;
atomic_t user_xattr_size;
};
/* +1 to avoid triggering overflow warning when negating it */
......
@@ -817,7 +817,7 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size)
if (len < sizeof(*new_xattr))
return NULL;
-new_xattr = kmalloc(len, GFP_KERNEL);
+new_xattr = kvmalloc(len, GFP_KERNEL);
if (!new_xattr)
return NULL;
@@ -860,6 +860,7 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
* @value: value of the xattr. If %NULL, will remove the attribute.
* @size: size of the new xattr
* @flags: %XATTR_{CREATE|REPLACE}
* @removed_size: returns size of the removed xattr, -1 if none removed
*
* If %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
* with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist;
@@ -868,7 +869,8 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
* Returns 0 on success, -errno on failure.
*/
int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
-const void *value, size_t size, int flags)
+const void *value, size_t size, int flags,
+ssize_t *removed_size)
{
struct simple_xattr *xattr;
struct simple_xattr *new_xattr = NULL;
@@ -882,7 +884,7 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
new_xattr->name = kstrdup(name, GFP_KERNEL);
if (!new_xattr->name) {
-kfree(new_xattr);
+kvfree(new_xattr);
return -ENOMEM;
}
}
@@ -895,8 +897,12 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
err = -EEXIST;
} else if (new_xattr) {
list_replace(&xattr->list, &new_xattr->list);
if (removed_size)
*removed_size = xattr->size;
} else {
list_del(&xattr->list);
if (removed_size)
*removed_size = xattr->size;
}
goto out;
}
@@ -908,11 +914,14 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
list_add(&new_xattr->list, &xattrs->head);
xattr = NULL;
}
if (removed_size)
*removed_size = -1;
out:
spin_unlock(&xattrs->lock);
if (xattr) {
kfree(xattr->name);
-kfree(xattr);
+kvfree(xattr);
}
return err;
......
@@ -633,8 +633,9 @@ struct cgroup_subsys {
void (*cancel_attach)(struct cgroup_taskset *tset);
void (*attach)(struct cgroup_taskset *tset);
void (*post_attach)(void);
-int (*can_fork)(struct task_struct *task);
-void (*cancel_fork)(struct task_struct *task);
+int (*can_fork)(struct task_struct *task,
+		struct css_set *cset);
+void (*cancel_fork)(struct task_struct *task, struct css_set *cset);
void (*fork)(struct task_struct *task);
void (*exit)(struct task_struct *task);
void (*release)(struct task_struct *task);
......
@@ -27,6 +27,8 @@
#include <linux/cgroup-defs.h>
struct kernel_clone_args;
#ifdef CONFIG_CGROUPS
/*
@@ -58,9 +60,6 @@ struct css_task_iter {
struct list_head *tcset_head;
struct list_head *task_pos;
-struct list_head *tasks_head;
-struct list_head *mg_tasks_head;
-struct list_head *dying_tasks_head;
struct list_head *cur_tasks_head;
struct css_set *cur_cset;
@@ -122,9 +121,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *tsk);
void cgroup_fork(struct task_struct *p);
-extern int cgroup_can_fork(struct task_struct *p);
-extern void cgroup_cancel_fork(struct task_struct *p);
-extern void cgroup_post_fork(struct task_struct *p);
+extern int cgroup_can_fork(struct task_struct *p,
+			   struct kernel_clone_args *kargs);
+extern void cgroup_cancel_fork(struct task_struct *p,
+			       struct kernel_clone_args *kargs);
+extern void cgroup_post_fork(struct task_struct *p,
+			     struct kernel_clone_args *kargs);
void cgroup_exit(struct task_struct *p);
void cgroup_release(struct task_struct *p);
void cgroup_free(struct task_struct *p);
@@ -708,9 +710,12 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
struct dentry *dentry) { return -EINVAL; }
static inline void cgroup_fork(struct task_struct *p) {}
-static inline int cgroup_can_fork(struct task_struct *p) { return 0; }
-static inline void cgroup_cancel_fork(struct task_struct *p) {}
-static inline void cgroup_post_fork(struct task_struct *p) {}
+static inline int cgroup_can_fork(struct task_struct *p,
+				  struct kernel_clone_args *kargs) { return 0; }
+static inline void cgroup_cancel_fork(struct task_struct *p,
+				      struct kernel_clone_args *kargs) {}
+static inline void cgroup_post_fork(struct task_struct *p,
+				    struct kernel_clone_args *kargs) {}
static inline void cgroup_exit(struct task_struct *p) {}
static inline void cgroup_release(struct task_struct *p) {}
static inline void cgroup_free(struct task_struct *p) {}
......
@@ -37,8 +37,10 @@ enum kernfs_node_type {
KERNFS_LINK = 0x0004,
};
-#define KERNFS_TYPE_MASK	0x000f
-#define KERNFS_FLAG_MASK	~KERNFS_TYPE_MASK
+#define KERNFS_TYPE_MASK		0x000f
+#define KERNFS_FLAG_MASK		~KERNFS_TYPE_MASK
+#define KERNFS_MAX_USER_XATTRS		128
+#define KERNFS_USER_XATTR_SIZE_LIMIT	(128 << 10)
enum kernfs_node_flag {
KERNFS_ACTIVATED = 0x0010,
@@ -78,6 +80,11 @@ enum kernfs_root_flag {
* fhandle to access nodes of the fs.
*/
KERNFS_ROOT_SUPPORT_EXPORTOP = 0x0004,
/*
* Support user xattrs to be written to nodes rooted at this root.
*/
KERNFS_ROOT_SUPPORT_USER_XATTR = 0x0008,
};
/* type-specific structures for kernfs_node union members */
......
@@ -13,6 +13,7 @@
struct task_struct;
struct rusage;
union thread_union;
struct css_set;
/* All the bits taken by the old clone syscall. */
#define CLONE_LEGACY_FLAGS 0xffffffffULL
@@ -29,6 +30,9 @@ struct kernel_clone_args {
pid_t *set_tid;
/* Number of elements in *set_tid */
size_t set_tid_size;
int cgroup;
struct cgroup *cgrp;
struct css_set *cset;
};
/*
......
@@ -102,7 +102,8 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size);
int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
void *buffer, size_t size);
int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
-const void *value, size_t size, int flags);
+const void *value, size_t size, int flags,
+ssize_t *removed_size);
ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer,
size_t size);
void simple_xattr_list_add(struct simple_xattrs *xattrs,
......
@@ -35,6 +35,7 @@
/* Flags for the clone3() syscall. */
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
/*
* cloning flags intersect with CSIGNAL so can be used with unshare and clone3
@@ -81,6 +82,8 @@
* @set_tid_size: This defines the size of the array referenced
* in @set_tid. This cannot be larger than the
* kernel's limit of nested PID namespaces.
* @cgroup: If CLONE_INTO_CGROUP is specified set this to
* a file descriptor for the cgroup.
*
* The structure is versioned by size and thus extensible.
* New struct members must go at the end of the struct and
@@ -97,11 +100,13 @@ struct clone_args {
__aligned_u64 tls;
__aligned_u64 set_tid;
__aligned_u64 set_tid_size;
__aligned_u64 cgroup;
};
#endif
#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
/*
* Scheduling policies
......
@@ -38,10 +38,7 @@ static bool cgroup_no_v1_named;
*/
static struct workqueue_struct *cgroup_pidlist_destroy_wq;
-/*
- * Protects cgroup_subsys->release_agent_path. Modifying it also requires
- * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
- */
+/* protects cgroup_subsys->release_agent_path */
static DEFINE_SPINLOCK(release_agent_path_lock);
bool cgroup1_ssid_disabled(int ssid)
@@ -775,22 +772,29 @@ void cgroup1_release_agent(struct work_struct *work)
{
struct cgroup *cgrp =
container_of(work, struct cgroup, release_agent_work);
-char *pathbuf = NULL, *agentbuf = NULL;
+char *pathbuf, *agentbuf;
char *argv[3], *envp[3];
int ret;
-mutex_lock(&cgroup_mutex);
+/* snoop agent path and exit early if empty */
+if (!cgrp->root->release_agent_path[0])
+return;
+/* prepare argument buffers */
pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
-agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
-if (!pathbuf || !agentbuf || !strlen(agentbuf))
-goto out;
+agentbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+if (!pathbuf || !agentbuf)
+goto out_free;
-spin_lock_irq(&css_set_lock);
-ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
-spin_unlock_irq(&css_set_lock);
+spin_lock(&release_agent_path_lock);
+strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
+spin_unlock(&release_agent_path_lock);
+if (!agentbuf[0])
+goto out_free;
+ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
if (ret < 0 || ret >= PATH_MAX)
-goto out;
+goto out_free;
argv[0] = agentbuf;
argv[1] = pathbuf;
@@ -801,11 +805,7 @@ void cgroup1_release_agent(struct work_struct *work)
envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
envp[2] = NULL;
-mutex_unlock(&cgroup_mutex);
call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
-goto out_free;
-out:
-mutex_unlock(&cgroup_mutex);
out_free:
kfree(agentbuf);
kfree(pathbuf);
......
[This diff is collapsed and not shown here.]
@@ -358,8 +358,12 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
/*
- * Cgroup v2 behavior is used when on default hierarchy or the
- * cgroup_v2_mode flag is set.
+ * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
+ * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
+ * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
+ * With v2 behavior, "cpus" and "mems" are always what the users have
+ * requested and won't be changed by hotplug events. Only the effective
+ * cpus or mems will be affected.
*/
static inline bool is_in_v2_mode(void)
{
......
@@ -33,6 +33,7 @@
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/sched/task.h>
#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max"
@@ -214,13 +215,16 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
* on cgroup_threadgroup_change_begin() held by the copy_process().
*/
-static int pids_can_fork(struct task_struct *task)
+static int pids_can_fork(struct task_struct *task, struct css_set *cset)
{
struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
int err;
-css = task_css_check(current, pids_cgrp_id, true);
+if (cset)
+	css = cset->subsys[pids_cgrp_id];
+else
+	css = task_css_check(current, pids_cgrp_id, true);
pids = css_pids(css);
err = pids_try_charge(pids, 1);
if (err) {
@@ -235,12 +239,15 @@ static int pids_can_fork(struct task_struct *task)
return err;
}
-static void pids_cancel_fork(struct task_struct *task)
+static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
{
struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
-css = task_css_check(current, pids_cgrp_id, true);
+if (cset)
+	css = cset->subsys[pids_cgrp_id];
+else
+	css = task_css_check(current, pids_cgrp_id, true);
pids = css_pids(css);
pids_uncharge(pids, 1);
}
......
@@ -2176,16 +2176,15 @@ static __latent_entropy struct task_struct *copy_process(
INIT_LIST_HEAD(&p->thread_group);
p->task_works = NULL;
-cgroup_threadgroup_change_begin(current);
/*
* Ensure that the cgroup subsystem policies allow the new process to be
* forked. It should be noted that the new process's css_set can be changed
* between here and cgroup_post_fork() if an organisation operation is in
* progress.
*/
-retval = cgroup_can_fork(p);
+retval = cgroup_can_fork(p, args);
if (retval)
-goto bad_fork_cgroup_threadgroup_change_end;
+goto bad_fork_put_pidfd;
/*
* From this point on we must avoid any synchronous user-space
@@ -2290,8 +2289,7 @@ static __latent_entropy struct task_struct *copy_process(
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
-cgroup_post_fork(p);
-cgroup_threadgroup_change_end(current);
+cgroup_post_fork(p, args);
perf_event_fork(p);
trace_task_newtask(p, clone_flags);
@@ -2302,9 +2300,7 @@ static __latent_entropy struct task_struct *copy_process(
bad_fork_cancel_cgroup:
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
-cgroup_cancel_fork(p);
-bad_fork_cgroup_threadgroup_change_end:
-cgroup_threadgroup_change_end(current);
+cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
if (clone_flags & CLONE_PIDFD) {
fput(pidfile);
@@ -2633,6 +2629,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
!valid_signal(args.exit_signal)))
return -EINVAL;
if ((args.flags & CLONE_INTO_CGROUP) && args.cgroup < 0)
return -EINVAL;
*kargs = (struct kernel_clone_args){
.flags = args.flags,
.pidfd = u64_to_user_ptr(args.pidfd),
@@ -2643,6 +2642,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
.stack_size = args.stack_size,
.tls = args.tls,
.set_tid_size = args.set_tid_size,
.cgroup = args.cgroup,
};
if (args.set_tid &&
@@ -2686,7 +2686,8 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
/* Verify that no unknown flags are passed along. */
-if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND))
+if (kargs->flags &
+    ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
return false;
/*
......
@@ -3243,7 +3243,7 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler,
struct shmem_inode_info *info = SHMEM_I(inode);
name = xattr_full_name(handler, name);
-return simple_xattr_set(&info->xattrs, name, value, size, flags);
+return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
}
static const struct xattr_handler shmem_security_xattr_handler = {
......
@@ -11,6 +11,6 @@ TEST_GEN_PROGS += test_freezer
include ../lib.mk
-$(OUTPUT)/test_memcontrol: cgroup_util.c
-$(OUTPUT)/test_core: cgroup_util.c
-$(OUTPUT)/test_freezer: cgroup_util.c
+$(OUTPUT)/test_memcontrol: cgroup_util.c ../clone3/clone3_selftests.h
+$(OUTPUT)/test_core: cgroup_util.c ../clone3/clone3_selftests.h
+$(OUTPUT)/test_freezer: cgroup_util.c ../clone3/clone3_selftests.h
@@ -15,6 +15,7 @@
#include <unistd.h>
#include "cgroup_util.h"
#include "../clone3/clone3_selftests.h"
static ssize_t read_text(const char *path, char *buf, size_t max_len)
{
@@ -331,12 +332,112 @@ int cg_run(const char *cgroup,
}
}
pid_t clone_into_cgroup(int cgroup_fd)
{
#ifdef CLONE_ARGS_SIZE_VER2
pid_t pid;
struct clone_args args = {
.flags = CLONE_INTO_CGROUP,
.exit_signal = SIGCHLD,
.cgroup = cgroup_fd,
};
pid = sys_clone3(&args, sizeof(struct clone_args));
/*
* Verify that this is a genuine test failure:
* ENOSYS -> clone3() not available
* E2BIG -> CLONE_INTO_CGROUP not available
*/
if (pid < 0 && (errno == ENOSYS || errno == E2BIG))
goto pretend_enosys;
return pid;
pretend_enosys:
#endif
errno = ENOSYS;
return -ENOSYS;
}
int clone_reap(pid_t pid, int options)
{
int ret;
siginfo_t info = {
.si_signo = 0,
};
again:
ret = waitid(P_PID, pid, &info, options | __WALL | __WNOTHREAD);
if (ret < 0) {
if (errno == EINTR)
goto again;
return -1;
}
if (options & WEXITED) {
if (WIFEXITED(info.si_status))
return WEXITSTATUS(info.si_status);
}
if (options & WSTOPPED) {
if (WIFSTOPPED(info.si_status))
return WSTOPSIG(info.si_status);
}
if (options & WCONTINUED) {
if (WIFCONTINUED(info.si_status))
return 0;
}
return -1;
}
int dirfd_open_opath(const char *dir)
{
return open(dir, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW | O_PATH);
}
#define close_prot_errno(fd) \
if (fd >= 0) { \
int _e_ = errno; \
close(fd); \
errno = _e_; \
}
static int clone_into_cgroup_run_nowait(const char *cgroup,
int (*fn)(const char *cgroup, void *arg),
void *arg)
{
int cgroup_fd;
pid_t pid;
cgroup_fd = dirfd_open_opath(cgroup);
if (cgroup_fd < 0)
return -1;
pid = clone_into_cgroup(cgroup_fd);
close_prot_errno(cgroup_fd);
if (pid == 0)
exit(fn(cgroup, arg));
return pid;
}
int cg_run_nowait(const char *cgroup,
int (*fn)(const char *cgroup, void *arg),
void *arg)
{
int pid;
pid = clone_into_cgroup_run_nowait(cgroup, fn, arg);
if (pid > 0)
return pid;
/* Genuine test failure. */
if (pid < 0 && errno != ENOSYS)
return -1;
pid = fork();
if (pid == 0) {
char buf[64];
@@ -450,3 +551,28 @@ int proc_read_strstr(int pid, bool thread, const char *item, const char *needle)
return strstr(buf, needle) ? 0 : -1;
}
int clone_into_cgroup_run_wait(const char *cgroup)
{
int cgroup_fd;
pid_t pid;
cgroup_fd = dirfd_open_opath(cgroup);
if (cgroup_fd < 0)
return -1;
pid = clone_into_cgroup(cgroup_fd);
close_prot_errno(cgroup_fd);
if (pid < 0)
return -1;
if (pid == 0)
exit(EXIT_SUCCESS);
/*
* We don't care whether this fails. We only care whether the initial
* clone succeeded.
*/
(void)clone_reap(pid, WEXITED);
return 0;
}
@@ -50,3 +50,7 @@ extern int cg_wait_for_proc_count(const char *cgroup, int count);
extern int cg_killall(const char *cgroup);
extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size);
extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle);
extern pid_t clone_into_cgroup(int cgroup_fd);
extern int clone_reap(pid_t pid, int options);
extern int clone_into_cgroup_run_wait(const char *cgroup);
extern int dirfd_open_opath(const char *dir);
@@ -2,7 +2,10 @@
#include <linux/limits.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdio.h>
#include <errno.h>
#include <signal.h>
@@ -12,6 +15,115 @@
#include "../kselftest.h"
#include "cgroup_util.h"
static int touch_anon(char *buf, size_t size)
{
int fd;
char *pos = buf;
fd = open("/dev/urandom", O_RDONLY);
if (fd < 0)
return -1;
while (size > 0) {
ssize_t ret = read(fd, pos, size);
if (ret < 0) {
if (errno != EINTR) {
close(fd);
return -1;
}
} else {
pos += ret;
size -= ret;
}
}
close(fd);
return 0;
}
static int alloc_and_touch_anon_noexit(const char *cgroup, void *arg)
{
int ppid = getppid();
size_t size = (size_t)arg;
void *buf;
buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
0, 0);
if (buf == MAP_FAILED)
return -1;
if (touch_anon((char *)buf, size)) {
munmap(buf, size);
return -1;
}
while (getppid() == ppid)
sleep(1);
munmap(buf, size);
return 0;
}
/*
* Create a child process that allocates and touches 100MB, then waits to be
* killed. Wait until the child is attached to the cgroup, kill all processes
* in that cgroup and wait until "cgroup.procs" is empty. At this point try to
* destroy the empty cgroup. The test helps detect race conditions between
* dying processes leaving the cgroup and cgroup destruction path.
*/
static int test_cgcore_destroy(const char *root)
{
int ret = KSFT_FAIL;
char *cg_test = NULL;
int child_pid;
char buf[PAGE_SIZE];
cg_test = cg_name(root, "cg_test");
if (!cg_test)
goto cleanup;
for (int i = 0; i < 10; i++) {
if (cg_create(cg_test))
goto cleanup;
child_pid = cg_run_nowait(cg_test, alloc_and_touch_anon_noexit,
(void *) MB(100));
if (child_pid < 0)
goto cleanup;
/* wait for the child to enter cgroup */
if (cg_wait_for_proc_count(cg_test, 1))
goto cleanup;
if (cg_killall(cg_test))
goto cleanup;
/* wait for cgroup to be empty */
while (1) {
if (cg_read(cg_test, "cgroup.procs", buf, sizeof(buf)))
goto cleanup;
if (buf[0] == '\0')
break;
usleep(1000);
}
if (rmdir(cg_test))
goto cleanup;
if (waitpid(child_pid, NULL, 0) < 0)
goto cleanup;
}
ret = KSFT_PASS;
cleanup:
if (cg_test)
cg_destroy(cg_test);
free(cg_test);
return ret;
}
/*
* A(0) - B(0) - C(1)
* \ D(0)
@@ -25,8 +137,11 @@
static int test_cgcore_populated(const char *root)
{
int ret = KSFT_FAIL;
int err;
char *cg_test_a = NULL, *cg_test_b = NULL;
char *cg_test_c = NULL, *cg_test_d = NULL;
int cgroup_fd = -EBADF;
pid_t pid;
cg_test_a = cg_name(root, "cg_test_a");
cg_test_b = cg_name(root, "cg_test_a/cg_test_b");
@@ -78,6 +193,52 @@ static int test_cgcore_populated(const char *root)
if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n"))
goto cleanup;
/* Test that we can directly clone into a new cgroup. */
cgroup_fd = dirfd_open_opath(cg_test_d);
if (cgroup_fd < 0)
goto cleanup;
pid = clone_into_cgroup(cgroup_fd);
if (pid < 0) {
if (errno == ENOSYS)
goto cleanup_pass;
goto cleanup;
}
if (pid == 0) {
if (raise(SIGSTOP))
exit(EXIT_FAILURE);
exit(EXIT_SUCCESS);
}
err = cg_read_strcmp(cg_test_d, "cgroup.events", "populated 1\n");
(void)clone_reap(pid, WSTOPPED);
(void)kill(pid, SIGCONT);
(void)clone_reap(pid, WEXITED);
if (err)
goto cleanup;
if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n"))
goto cleanup;
/* Remove cgroup. */
if (cg_test_d) {
cg_destroy(cg_test_d);
free(cg_test_d);
cg_test_d = NULL;
}
pid = clone_into_cgroup(cgroup_fd);
if (pid < 0)
goto cleanup_pass;
if (pid == 0)
exit(EXIT_SUCCESS);
(void)clone_reap(pid, WEXITED);
goto cleanup;
cleanup_pass:
ret = KSFT_PASS;
cleanup:
@@ -93,6 +254,8 @@ static int test_cgcore_populated(const char *root)
free(cg_test_c);
free(cg_test_b);
free(cg_test_a);
if (cgroup_fd >= 0)
close(cgroup_fd);
return ret;
}
@@ -136,6 +299,16 @@ static int test_cgcore_invalid_domain(const char *root)
if (errno != EOPNOTSUPP)
goto cleanup;
if (!clone_into_cgroup_run_wait(child))
goto cleanup;
if (errno == ENOSYS)
goto cleanup_pass;
if (errno != EOPNOTSUPP)
goto cleanup;
cleanup_pass:
ret = KSFT_PASS;
cleanup:
@@ -345,6 +518,9 @@ static int test_cgcore_internal_process_constraint(const char *root)
if (!cg_enter_current(parent))
goto cleanup;
if (!clone_into_cgroup_run_wait(parent))
goto cleanup;
ret = KSFT_PASS;
cleanup:
@@ -512,6 +688,7 @@ struct corecg_test {
T(test_cgcore_populated),
T(test_cgcore_proc_migration),
T(test_cgcore_thread_migration),
T(test_cgcore_destroy),
};
#undef T
......
@@ -5,12 +5,24 @@
#define _GNU_SOURCE
#include <sched.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <stdint.h>
#include <syscall.h>
#include <linux/types.h>
#include <sys/wait.h>
#include "../kselftest.h"
#define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))
#ifndef CLONE_INTO_CGROUP
#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
#endif
#ifndef CLONE_ARGS_SIZE_VER0
#define CLONE_ARGS_SIZE_VER0 64
#endif
#ifndef __NR_clone3
#define __NR_clone3 -1
struct clone_args {
@@ -22,10 +34,13 @@ struct clone_args {
__aligned_u64 stack;
__aligned_u64 stack_size;
__aligned_u64 tls;
#define CLONE_ARGS_SIZE_VER1 80
__aligned_u64 set_tid;
__aligned_u64 set_tid_size;
#define CLONE_ARGS_SIZE_VER2 88
__aligned_u64 cgroup;
};
-#endif
+#endif /* __NR_clone3 */
static pid_t sys_clone3(struct clone_args *args, size_t size)
{
......