Commit 9ae46e67 authored by Linus Torvalds

Merge branch 'for-3.9-cpuset' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

Pull cpuset changes from Tejun Heo:

 - Synchronization has seen a lot of changes, with a focus on decoupling
   cpuset synchronization from cgroup internal locking.

   After this change, only a couple of mostly trivial dependencies on
   cgroup_lock remain outside cgroup core proper.  cgroup_lock is
   scheduled to be unexported in this devel cycle.

   This will finally remove the fragile locking order around cgroup
   (cgroup locking wants to be, and should be, one of the outermost locks,
   yet it has been acquired from deep inside individual controllers).  A
   condensed sketch of the new locking pattern follows this list.

 - At this point, Li is the most knowledgeable about cpuset and is taking
   over its maintainership.
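
The sketch below condenses that locking change from the kernel/cpuset.c hunks
further down; it is trimmed rather than a verbatim excerpt, with declarations
reordered and function bodies elided.  cpuset now serializes on its own
cpuset_mutex, and the public rebuild_sched_domains() simply takes that mutex
around a _locked helper, so the old async_rebuild_sched_domains() workqueue
bounce (which existed only to avoid nesting cgroup_lock() inside
get_online_cpus()) goes away:

    /* Condensed from the diff below; declarations and bodies trimmed. */
    static DEFINE_MUTEX(cpuset_mutex);   /* cpuset-internal, replaces cgroup_mutex uses */
    static DEFINE_MUTEX(callback_mutex); /* may nest inside cpuset_mutex */

    /* Callers already holding cpuset_mutex rebuild synchronously. */
    static void rebuild_sched_domains_locked(void)
    {
            lockdep_assert_held(&cpuset_mutex);
            get_online_cpus();
            /* generate_sched_domains() + partition_sched_domains() run here */
            put_online_cpus();
    }

    /* The public entry point takes cpuset_mutex itself; no workqueue bounce. */
    void rebuild_sched_domains(void)
    {
            mutex_lock(&cpuset_mutex);
            rebuild_sched_domains_locked();
            mutex_unlock(&cpuset_mutex);
    }

The same pattern shows up in the ->can_attach()/->attach() and hotplug paths
in the diff, which now take cpuset_mutex directly instead of relying on
cgroup_lock().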

* 'for-3.9-cpuset' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cpuset: drop spurious retval assignment in proc_cpuset_show()
  cpuset: fix RCU lockdep splat
  cpuset: update MAINTAINERS
  cpuset: remove cpuset->parent
  cpuset: replace cpuset->stack_list with cpuset_for_each_descendant_pre()
  cpuset: replace cgroup_mutex locking with cpuset internal locking
  cpuset: schedule hotplug propagation from cpuset_attach() if the cpuset is empty
  cpuset: pin down cpus and mems while a task is being attached
  cpuset: make CPU / memory hotplug propagation asynchronous
  cpuset: drop async_rebuild_sched_domains()
  cpuset: don't nest cgroup_mutex inside get_online_cpus()
  cpuset: reorganize CPU / memory hotplug handling
  cpuset: cleanup cpuset[_can]_attach()
  cpuset: introduce cpuset_for_each_child()
  cpuset: introduce CS_ONLINE
  cpuset: introduce ->css_on/offline()
  cpuset: remove fast exit path from remove_tasks_in_empty_cpuset()
  cpuset: remove unused cpuset_unlock()
parents 502b24c2 d127027b
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2140,10 +2140,10 @@ S:	Maintained
 F:	tools/power/cpupower
 
 CPUSETS
-M:	Paul Menage <paul@paulmenage.org>
+M:	Li Zefan <lizefan@huawei.com>
 W:	http://www.bullopensource.org/cpuset/
 W:	http://oss.sgi.com/projects/cpusets/
-S:	Supported
+S:	Maintained
 F:	Documentation/cgroups/cpusets.txt
 F:	include/linux/cpuset.h
 F:	kernel/cpuset.c

--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -60,14 +60,6 @@
 #include <linux/workqueue.h>
 #include <linux/cgroup.h>
 
-/*
- * Workqueue for cpuset related tasks.
- *
- * Using kevent workqueue may cause deadlock when memory_migrate
- * is set. So we create a separate workqueue thread for cpuset.
- */
-static struct workqueue_struct *cpuset_wq;
-
 /*
  * Tracks how many cpusets are currently defined in system.
  * When there is only one cpuset (the root cpuset) we can
@@ -95,18 +87,21 @@ struct cpuset {
 	cpumask_var_t cpus_allowed;	/* CPUs allowed to tasks in cpuset */
 	nodemask_t mems_allowed;	/* Memory Nodes allowed to tasks */
 
-	struct cpuset *parent;		/* my parent */
-
 	struct fmeter fmeter;		/* memory_pressure filter */
 
+	/*
+	 * Tasks are being attached to this cpuset.  Used to prevent
+	 * zeroing cpus/mems_allowed between ->can_attach() and ->attach().
+	 */
+	int attach_in_progress;
+
 	/* partition number for rebuild_sched_domains() */
 	int pn;
 
 	/* for custom sched domain */
 	int relax_domain_level;
 
-	/* used for walking a cpuset hierarchy */
-	struct list_head stack_list;
+	struct work_struct hotplug_work;
 };
 
 /* Retrieve the cpuset for a cgroup */
@@ -123,6 +118,15 @@ static inline struct cpuset *task_cs(struct task_struct *task)
 			    struct cpuset, css);
 }
 
+static inline struct cpuset *parent_cs(const struct cpuset *cs)
+{
+	struct cgroup *pcgrp = cs->css.cgroup->parent;
+
+	if (pcgrp)
+		return cgroup_cs(pcgrp);
+	return NULL;
+}
+
 #ifdef CONFIG_NUMA
 static inline bool task_has_mempolicy(struct task_struct *task)
 {
@@ -138,6 +142,7 @@ static inline bool task_has_mempolicy(struct task_struct *task)
 
 /* bits in struct cpuset flags field */
 typedef enum {
+	CS_ONLINE,
 	CS_CPU_EXCLUSIVE,
 	CS_MEM_EXCLUSIVE,
 	CS_MEM_HARDWALL,
@@ -147,13 +152,12 @@ typedef enum {
 	CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
-/* the type of hotplug event */
-enum hotplug_event {
-	CPUSET_CPU_OFFLINE,
-	CPUSET_MEM_OFFLINE,
-};
-
 /* convenient tests for these bits */
+static inline bool is_cpuset_online(const struct cpuset *cs)
+{
+	return test_bit(CS_ONLINE, &cs->flags);
+}
+
 static inline int is_cpu_exclusive(const struct cpuset *cs)
 {
 	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
@@ -190,27 +194,52 @@ static inline int is_spread_slab(const struct cpuset *cs)
 }
 
 static struct cpuset top_cpuset = {
-	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
+	.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
+		  (1 << CS_MEM_EXCLUSIVE)),
 };
 
+/**
+ * cpuset_for_each_child - traverse online children of a cpuset
+ * @child_cs: loop cursor pointing to the current child
+ * @pos_cgrp: used for iteration
+ * @parent_cs: target cpuset to walk children of
+ *
+ * Walk @child_cs through the online children of @parent_cs.  Must be used
+ * with RCU read locked.
+ */
+#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs)		\
+	cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup)	\
+		if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp)))))
+
+/**
+ * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants
+ * @des_cs: loop cursor pointing to the current descendant
+ * @pos_cgrp: used for iteration
+ * @root_cs: target cpuset to walk ancestor of
+ *
+ * Walk @des_cs through the online descendants of @root_cs.  Must be used
+ * with RCU read locked.  The caller may modify @pos_cgrp by calling
+ * cgroup_rightmost_descendant() to skip subtree.
+ */
+#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs)	\
+	cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \
+		if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp)))))
+
 /*
- * There are two global mutexes guarding cpuset structures.  The first
- * is the main control groups cgroup_mutex, accessed via
- * cgroup_lock()/cgroup_unlock().  The second is the cpuset-specific
- * callback_mutex, below.  They can nest.  It is ok to first take
- * cgroup_mutex, then nest callback_mutex.  We also require taking
- * task_lock() when dereferencing a task's cpuset pointer.  See "The
- * task_lock() exception", at the end of this comment.
- *
- * A task must hold both mutexes to modify cpusets.  If a task
- * holds cgroup_mutex, then it blocks others wanting that mutex,
- * ensuring that it is the only task able to also acquire callback_mutex
- * and be able to modify cpusets.  It can perform various checks on
- * the cpuset structure first, knowing nothing will change.  It can
- * also allocate memory while just holding cgroup_mutex.  While it is
- * performing these checks, various callback routines can briefly
- * acquire callback_mutex to query cpusets.  Once it is ready to make
- * the changes, it takes callback_mutex, blocking everyone else.
+ * There are two global mutexes guarding cpuset structures - cpuset_mutex
+ * and callback_mutex.  The latter may nest inside the former.  We also
+ * require taking task_lock() when dereferencing a task's cpuset pointer.
+ * See "The task_lock() exception", at the end of this comment.
+ *
+ * A task must hold both mutexes to modify cpusets.  If a task holds
+ * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
+ * is the only task able to also acquire callback_mutex and be able to
+ * modify cpusets.  It can perform various checks on the cpuset structure
+ * first, knowing nothing will change.  It can also allocate memory while
+ * just holding cpuset_mutex.  While it is performing these checks, various
+ * callback routines can briefly acquire callback_mutex to query cpusets.
+ * Once it is ready to make the changes, it takes callback_mutex, blocking
+ * everyone else.
  *
  * Calls to the kernel memory allocator can not be made while holding
  * callback_mutex, as that would risk double tripping on callback_mutex
...@@ -232,6 +261,7 @@ static struct cpuset top_cpuset = { ...@@ -232,6 +261,7 @@ static struct cpuset top_cpuset = {
* guidelines for accessing subsystem state in kernel/cgroup.c * guidelines for accessing subsystem state in kernel/cgroup.c
*/ */
static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_MUTEX(callback_mutex); static DEFINE_MUTEX(callback_mutex);
/* /*
...@@ -245,6 +275,17 @@ static char cpuset_name[CPUSET_NAME_LEN]; ...@@ -245,6 +275,17 @@ static char cpuset_name[CPUSET_NAME_LEN];
static char cpuset_nodelist[CPUSET_NODELIST_LEN]; static char cpuset_nodelist[CPUSET_NODELIST_LEN];
static DEFINE_SPINLOCK(cpuset_buffer_lock); static DEFINE_SPINLOCK(cpuset_buffer_lock);
/*
* CPU / memory hotplug is handled asynchronously.
*/
static struct workqueue_struct *cpuset_propagate_hotplug_wq;
static void cpuset_hotplug_workfn(struct work_struct *work);
static void cpuset_propagate_hotplug_workfn(struct work_struct *work);
static void schedule_cpuset_propagate_hotplug(struct cpuset *cs);
static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
/* /*
* This is ugly, but preserves the userspace API for existing cpuset * This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we * users. If someone tries to mount the "cpuset" filesystem, we
...@@ -289,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, ...@@ -289,7 +330,7 @@ static void guarantee_online_cpus(const struct cpuset *cs,
struct cpumask *pmask) struct cpumask *pmask)
{ {
while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask))
cs = cs->parent; cs = parent_cs(cs);
if (cs) if (cs)
cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask);
else else
...@@ -314,7 +355,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) ...@@ -314,7 +355,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
{ {
while (cs && !nodes_intersects(cs->mems_allowed, while (cs && !nodes_intersects(cs->mems_allowed,
node_states[N_MEMORY])) node_states[N_MEMORY]))
cs = cs->parent; cs = parent_cs(cs);
if (cs) if (cs)
nodes_and(*pmask, cs->mems_allowed, nodes_and(*pmask, cs->mems_allowed,
node_states[N_MEMORY]); node_states[N_MEMORY]);
...@@ -326,7 +367,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) ...@@ -326,7 +367,7 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
/* /*
* update task's spread flag if cpuset's page/slab spread flag is set * update task's spread flag if cpuset's page/slab spread flag is set
* *
* Called with callback_mutex/cgroup_mutex held * Called with callback_mutex/cpuset_mutex held
*/ */
static void cpuset_update_task_spread_flag(struct cpuset *cs, static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk) struct task_struct *tsk)
...@@ -346,7 +387,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs, ...@@ -346,7 +387,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
* *
* One cpuset is a subset of another if all its allowed CPUs and * One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags * Memory Nodes are a subset of the other, and its exclusive flags
* are only set if the other's are set. Call holding cgroup_mutex. * are only set if the other's are set. Call holding cpuset_mutex.
*/ */
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
...@@ -395,7 +436,7 @@ static void free_trial_cpuset(struct cpuset *trial) ...@@ -395,7 +436,7 @@ static void free_trial_cpuset(struct cpuset *trial)
* If we replaced the flag and mask values of the current cpuset * If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would * (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes * our various subset and exclusive rules still be valid? Presumes
* cgroup_mutex held. * cpuset_mutex held.
* *
* 'cur' is the address of an actual, in-use cpuset. Operations * 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the * such as list traversal that depend on the actual address of the
@@ -412,48 +453,58 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 {
 	struct cgroup *cont;
 	struct cpuset *c, *par;
+	int ret;
+
+	rcu_read_lock();
 
 	/* Each of our child cpusets must be a subset of us */
-	list_for_each_entry(cont, &cur->css.cgroup->children, sibling) {
-		if (!is_cpuset_subset(cgroup_cs(cont), trial))
-			return -EBUSY;
-	}
+	ret = -EBUSY;
+	cpuset_for_each_child(c, cont, cur)
+		if (!is_cpuset_subset(c, trial))
+			goto out;
 
 	/* Remaining checks don't apply to root cpuset */
+	ret = 0;
 	if (cur == &top_cpuset)
-		return 0;
+		goto out;
 
-	par = cur->parent;
+	par = parent_cs(cur);
 
 	/* We must be a subset of our parent cpuset */
+	ret = -EACCES;
 	if (!is_cpuset_subset(trial, par))
-		return -EACCES;
+		goto out;
 
 	/*
 	 * If either I or some sibling (!= me) is exclusive, we can't
 	 * overlap
 	 */
-	list_for_each_entry(cont, &par->css.cgroup->children, sibling) {
-		c = cgroup_cs(cont);
+	ret = -EINVAL;
+	cpuset_for_each_child(c, cont, par) {
 		if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
 		    c != cur &&
 		    cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
-			return -EINVAL;
+			goto out;
 		if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
 		    c != cur &&
 		    nodes_intersects(trial->mems_allowed, c->mems_allowed))
-			return -EINVAL;
+			goto out;
 	}
 
-	/* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */
-	if (cgroup_task_count(cur->css.cgroup)) {
-		if (cpumask_empty(trial->cpus_allowed) ||
-		    nodes_empty(trial->mems_allowed)) {
-			return -ENOSPC;
-		}
-	}
+	/*
+	 * Cpusets with tasks - existing or newly being attached - can't
+	 * have empty cpus_allowed or mems_allowed.
+	 */
+	ret = -ENOSPC;
+	if ((cgroup_task_count(cur->css.cgroup) || cur->attach_in_progress) &&
+	    (cpumask_empty(trial->cpus_allowed) ||
+	     nodes_empty(trial->mems_allowed)))
+		goto out;
 
-	return 0;
+	ret = 0;
+out:
+	rcu_read_unlock();
+	return ret;
 }
 
 #ifdef CONFIG_SMP
...@@ -474,31 +525,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) ...@@ -474,31 +525,24 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
return; return;
} }
static void static void update_domain_attr_tree(struct sched_domain_attr *dattr,
update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) struct cpuset *root_cs)
{ {
LIST_HEAD(q);
list_add(&c->stack_list, &q);
while (!list_empty(&q)) {
struct cpuset *cp; struct cpuset *cp;
struct cgroup *cont; struct cgroup *pos_cgrp;
struct cpuset *child;
cp = list_first_entry(&q, struct cpuset, stack_list);
list_del(q.next);
if (cpumask_empty(cp->cpus_allowed)) rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) {
/* skip the whole subtree if @cp doesn't have any CPU */
if (cpumask_empty(cp->cpus_allowed)) {
pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
continue; continue;
}
if (is_sched_load_balance(cp)) if (is_sched_load_balance(cp))
update_domain_attr(dattr, cp); update_domain_attr(dattr, cp);
list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
child = cgroup_cs(cont);
list_add_tail(&child->stack_list, &q);
}
} }
rcu_read_unlock();
} }
/* /*
...@@ -520,7 +564,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) ...@@ -520,7 +564,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
* domains when operating in the severe memory shortage situations * domains when operating in the severe memory shortage situations
* that could cause allocation failures below. * that could cause allocation failures below.
* *
* Must be called with cgroup_lock held. * Must be called with cpuset_mutex held.
* *
* The three key local variables below are: * The three key local variables below are:
* q - a linked-list queue of cpuset pointers, used to implement a * q - a linked-list queue of cpuset pointers, used to implement a
...@@ -558,7 +602,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) ...@@ -558,7 +602,6 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
static int generate_sched_domains(cpumask_var_t **domains, static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes) struct sched_domain_attr **attributes)
{ {
LIST_HEAD(q); /* queue of cpusets to be scanned */
struct cpuset *cp; /* scans q */ struct cpuset *cp; /* scans q */
struct cpuset **csa; /* array of all cpuset ptrs */ struct cpuset **csa; /* array of all cpuset ptrs */
int csn; /* how many cpuset ptrs in csa so far */ int csn; /* how many cpuset ptrs in csa so far */
...@@ -567,6 +610,7 @@ static int generate_sched_domains(cpumask_var_t **domains, ...@@ -567,6 +610,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr *dattr; /* attributes for custom domains */ struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */ int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */ int nslot; /* next empty doms[] struct cpumask slot */
struct cgroup *pos_cgrp;
doms = NULL; doms = NULL;
dattr = NULL; dattr = NULL;
...@@ -594,33 +638,27 @@ static int generate_sched_domains(cpumask_var_t **domains, ...@@ -594,33 +638,27 @@ static int generate_sched_domains(cpumask_var_t **domains,
goto done; goto done;
csn = 0; csn = 0;
list_add(&top_cpuset.stack_list, &q); rcu_read_lock();
while (!list_empty(&q)) { cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) {
struct cgroup *cont; /*
struct cpuset *child; /* scans child cpusets of cp */ * Continue traversing beyond @cp iff @cp has some CPUs and
* isn't load balancing. The former is obvious. The
cp = list_first_entry(&q, struct cpuset, stack_list); * latter: All child cpusets contain a subset of the
list_del(q.next); * parent's cpus, so just skip them, and then we call
* update_domain_attr_tree() to calc relax_domain_level of
if (cpumask_empty(cp->cpus_allowed)) * the corresponding sched domain.
*/
if (!cpumask_empty(cp->cpus_allowed) &&
!is_sched_load_balance(cp))
continue; continue;
/* if (is_sched_load_balance(cp))
* All child cpusets contain a subset of the parent's cpus, so
* just skip them, and then we call update_domain_attr_tree()
* to calc relax_domain_level of the corresponding sched
* domain.
*/
if (is_sched_load_balance(cp)) {
csa[csn++] = cp; csa[csn++] = cp;
continue;
}
list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { /* skip @cp's subtree */
child = cgroup_cs(cont); pos_cgrp = cgroup_rightmost_descendant(pos_cgrp);
list_add_tail(&child->stack_list, &q);
}
} }
rcu_read_unlock();
for (i = 0; i < csn; i++) for (i = 0; i < csn; i++)
csa[i]->pn = i; csa[i]->pn = i;
...@@ -725,25 +763,25 @@ static int generate_sched_domains(cpumask_var_t **domains, ...@@ -725,25 +763,25 @@ static int generate_sched_domains(cpumask_var_t **domains,
/* /*
* Rebuild scheduler domains. * Rebuild scheduler domains.
* *
* Call with neither cgroup_mutex held nor within get_online_cpus(). * If the flag 'sched_load_balance' of any cpuset with non-empty
* Takes both cgroup_mutex and get_online_cpus(). * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
* which has that flag enabled, or if any cpuset with a non-empty
* 'cpus' is removed, then call this routine to rebuild the
* scheduler's dynamic sched domains.
* *
* Cannot be directly called from cpuset code handling changes * Call with cpuset_mutex held. Takes get_online_cpus().
* to the cpuset pseudo-filesystem, because it cannot be called
* from code that already holds cgroup_mutex.
*/ */
static void do_rebuild_sched_domains(struct work_struct *unused) static void rebuild_sched_domains_locked(void)
{ {
struct sched_domain_attr *attr; struct sched_domain_attr *attr;
cpumask_var_t *doms; cpumask_var_t *doms;
int ndoms; int ndoms;
lockdep_assert_held(&cpuset_mutex);
get_online_cpus(); get_online_cpus();
/* Generate domain masks and attrs */ /* Generate domain masks and attrs */
cgroup_lock();
ndoms = generate_sched_domains(&doms, &attr); ndoms = generate_sched_domains(&doms, &attr);
cgroup_unlock();
/* Have scheduler rebuild the domains */ /* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr); partition_sched_domains(ndoms, doms, attr);
...@@ -751,7 +789,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused) ...@@ -751,7 +789,7 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
put_online_cpus(); put_online_cpus();
} }
#else /* !CONFIG_SMP */ #else /* !CONFIG_SMP */
static void do_rebuild_sched_domains(struct work_struct *unused) static void rebuild_sched_domains_locked(void)
{ {
} }
...@@ -763,44 +801,11 @@ static int generate_sched_domains(cpumask_var_t **domains, ...@@ -763,44 +801,11 @@ static int generate_sched_domains(cpumask_var_t **domains,
} }
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
/*
* Rebuild scheduler domains, asynchronously via workqueue.
*
* If the flag 'sched_load_balance' of any cpuset with non-empty
* 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
* which has that flag enabled, or if any cpuset with a non-empty
* 'cpus' is removed, then call this routine to rebuild the
* scheduler's dynamic sched domains.
*
* The rebuild_sched_domains() and partition_sched_domains()
* routines must nest cgroup_lock() inside get_online_cpus(),
* but such cpuset changes as these must nest that locking the
* other way, holding cgroup_lock() for much of the code.
*
* So in order to avoid an ABBA deadlock, the cpuset code handling
* these user changes delegates the actual sched domain rebuilding
* to a separate workqueue thread, which ends up processing the
* above do_rebuild_sched_domains() function.
*/
static void async_rebuild_sched_domains(void)
{
queue_work(cpuset_wq, &rebuild_sched_domains_work);
}
/*
* Accomplishes the same scheduler domain rebuild as the above
* async_rebuild_sched_domains(), however it directly calls the
* rebuild routine synchronously rather than calling it via an
* asynchronous work thread.
*
* This can only be called from code that is not holding
* cgroup_mutex (not nested in a cgroup_lock() call.)
*/
void rebuild_sched_domains(void) void rebuild_sched_domains(void)
{ {
do_rebuild_sched_domains(NULL); mutex_lock(&cpuset_mutex);
rebuild_sched_domains_locked();
mutex_unlock(&cpuset_mutex);
} }
/** /**
...@@ -808,7 +813,7 @@ void rebuild_sched_domains(void) ...@@ -808,7 +813,7 @@ void rebuild_sched_domains(void)
* @tsk: task to test * @tsk: task to test
* @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner
* *
* Call with cgroup_mutex held. May take callback_mutex during call. * Call with cpuset_mutex held. May take callback_mutex during call.
* Called for each task in a cgroup by cgroup_scan_tasks(). * Called for each task in a cgroup by cgroup_scan_tasks().
* Return nonzero if this tasks's cpus_allowed mask should be changed (in other * Return nonzero if this tasks's cpus_allowed mask should be changed (in other
* words, if its mask is not equal to its cpuset's mask). * words, if its mask is not equal to its cpuset's mask).
...@@ -829,7 +834,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk, ...@@ -829,7 +834,7 @@ static int cpuset_test_cpumask(struct task_struct *tsk,
* cpus_allowed mask needs to be changed. * cpus_allowed mask needs to be changed.
* *
* We don't need to re-check for the cgroup/cpuset membership, since we're * We don't need to re-check for the cgroup/cpuset membership, since we're
* holding cgroup_lock() at this point. * holding cpuset_mutex at this point.
*/ */
static void cpuset_change_cpumask(struct task_struct *tsk, static void cpuset_change_cpumask(struct task_struct *tsk,
struct cgroup_scanner *scan) struct cgroup_scanner *scan)
...@@ -842,7 +847,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk, ...@@ -842,7 +847,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
* *
* Called with cgroup_mutex held * Called with cpuset_mutex held
* *
* The cgroup_scan_tasks() function will scan all the tasks in a cgroup, * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
* calling callback functions for each. * calling callback functions for each.
...@@ -920,7 +925,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, ...@@ -920,7 +925,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
heap_free(&heap); heap_free(&heap);
if (is_load_balanced) if (is_load_balanced)
async_rebuild_sched_domains(); rebuild_sched_domains_locked();
return 0; return 0;
} }
...@@ -932,7 +937,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, ...@@ -932,7 +937,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Temporarilly set tasks mems_allowed to target nodes of migration, * Temporarilly set tasks mems_allowed to target nodes of migration,
* so that the migration code can allocate pages on these nodes. * so that the migration code can allocate pages on these nodes.
* *
* Call holding cgroup_mutex, so current's cpuset won't change * Call holding cpuset_mutex, so current's cpuset won't change
* during this call, as manage_mutex holds off any cpuset_attach() * during this call, as manage_mutex holds off any cpuset_attach()
* calls. Therefore we don't need to take task_lock around the * calls. Therefore we don't need to take task_lock around the
* call to guarantee_online_mems(), as we know no one is changing * call to guarantee_online_mems(), as we know no one is changing
...@@ -1007,7 +1012,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, ...@@ -1007,7 +1012,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk,
/* /*
* Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy
* of it to cpuset's new mems_allowed, and migrate pages to new nodes if * of it to cpuset's new mems_allowed, and migrate pages to new nodes if
* memory_migrate flag is set. Called with cgroup_mutex held. * memory_migrate flag is set. Called with cpuset_mutex held.
*/ */
static void cpuset_change_nodemask(struct task_struct *p, static void cpuset_change_nodemask(struct task_struct *p,
struct cgroup_scanner *scan) struct cgroup_scanner *scan)
...@@ -1016,7 +1021,7 @@ static void cpuset_change_nodemask(struct task_struct *p, ...@@ -1016,7 +1021,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
struct cpuset *cs; struct cpuset *cs;
int migrate; int migrate;
const nodemask_t *oldmem = scan->data; const nodemask_t *oldmem = scan->data;
static nodemask_t newmems; /* protected by cgroup_mutex */ static nodemask_t newmems; /* protected by cpuset_mutex */
cs = cgroup_cs(scan->cg); cs = cgroup_cs(scan->cg);
guarantee_online_mems(cs, &newmems); guarantee_online_mems(cs, &newmems);
...@@ -1043,7 +1048,7 @@ static void *cpuset_being_rebound; ...@@ -1043,7 +1048,7 @@ static void *cpuset_being_rebound;
* @oldmem: old mems_allowed of cpuset cs * @oldmem: old mems_allowed of cpuset cs
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
* *
* Called with cgroup_mutex held * Called with cpuset_mutex held
* No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
* if @heap != NULL. * if @heap != NULL.
*/ */
...@@ -1065,7 +1070,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, ...@@ -1065,7 +1070,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
* take while holding tasklist_lock. Forks can happen - the * take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks, * mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold * and rebind their vma mempolicies too. Because we still hold
* the global cgroup_mutex, we know that no other rebind effort * the global cpuset_mutex, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound. * will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm() * It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes. * is idempotent. Also migrate pages in each mm to new nodes.
...@@ -1084,7 +1089,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, ...@@ -1084,7 +1089,7 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
* mempolicies and if the cpuset is marked 'memory_migrate', * mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory. * migrate the tasks pages to the new memory.
* *
* Call with cgroup_mutex held. May take callback_mutex during call. * Call with cpuset_mutex held. May take callback_mutex during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs, * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_sem, scan its vma's and rebind * lock each such tasks mm->mmap_sem, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed. * their mempolicies to the cpusets new mems_allowed.
...@@ -1168,7 +1173,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) ...@@ -1168,7 +1173,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
cs->relax_domain_level = val; cs->relax_domain_level = val;
if (!cpumask_empty(cs->cpus_allowed) && if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs)) is_sched_load_balance(cs))
async_rebuild_sched_domains(); rebuild_sched_domains_locked();
} }
return 0; return 0;
...@@ -1182,7 +1187,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) ...@@ -1182,7 +1187,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
* Called by cgroup_scan_tasks() for each task in a cgroup. * Called by cgroup_scan_tasks() for each task in a cgroup.
* *
* We don't need to re-check for the cgroup/cpuset membership, since we're * We don't need to re-check for the cgroup/cpuset membership, since we're
* holding cgroup_lock() at this point. * holding cpuset_mutex at this point.
*/ */
static void cpuset_change_flag(struct task_struct *tsk, static void cpuset_change_flag(struct task_struct *tsk,
struct cgroup_scanner *scan) struct cgroup_scanner *scan)
...@@ -1195,7 +1200,7 @@ static void cpuset_change_flag(struct task_struct *tsk, ...@@ -1195,7 +1200,7 @@ static void cpuset_change_flag(struct task_struct *tsk,
* @cs: the cpuset in which each task's spread flags needs to be changed * @cs: the cpuset in which each task's spread flags needs to be changed
* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
* *
* Called with cgroup_mutex held * Called with cpuset_mutex held
* *
* The cgroup_scan_tasks() function will scan all the tasks in a cgroup, * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
* calling callback functions for each. * calling callback functions for each.
...@@ -1220,7 +1225,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) ...@@ -1220,7 +1225,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap)
* cs: the cpuset to update * cs: the cpuset to update
* turning_on: whether the flag is being set or cleared * turning_on: whether the flag is being set or cleared
* *
* Call with cgroup_mutex held. * Call with cpuset_mutex held.
*/ */
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
...@@ -1260,7 +1265,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, ...@@ -1260,7 +1265,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
mutex_unlock(&callback_mutex); mutex_unlock(&callback_mutex);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
async_rebuild_sched_domains(); rebuild_sched_domains_locked();
if (spread_flag_changed) if (spread_flag_changed)
update_tasks_flags(cs, &heap); update_tasks_flags(cs, &heap);
...@@ -1368,24 +1373,18 @@ static int fmeter_getrate(struct fmeter *fmp) ...@@ -1368,24 +1373,18 @@ static int fmeter_getrate(struct fmeter *fmp)
return val; return val;
} }
/* /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
* Protected by cgroup_lock. The nodemasks must be stored globally because
* dynamically allocating them is not allowed in can_attach, and they must
* persist until attach.
*/
static cpumask_var_t cpus_attach;
static nodemask_t cpuset_attach_nodemask_from;
static nodemask_t cpuset_attach_nodemask_to;
/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{ {
struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *cs = cgroup_cs(cgrp);
struct task_struct *task; struct task_struct *task;
int ret; int ret;
mutex_lock(&cpuset_mutex);
ret = -ENOSPC;
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
return -ENOSPC; goto out_unlock;
cgroup_taskset_for_each(task, cgrp, tset) { cgroup_taskset_for_each(task, cgrp, tset) {
/* /*
...@@ -1397,25 +1396,45 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) ...@@ -1397,25 +1396,45 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
* set_cpus_allowed_ptr() on all attached tasks before * set_cpus_allowed_ptr() on all attached tasks before
* cpus_allowed may be changed. * cpus_allowed may be changed.
*/ */
ret = -EINVAL;
if (task->flags & PF_THREAD_BOUND) if (task->flags & PF_THREAD_BOUND)
return -EINVAL; goto out_unlock;
if ((ret = security_task_setscheduler(task))) ret = security_task_setscheduler(task);
return ret; if (ret)
goto out_unlock;
} }
/* prepare for attach */ /*
if (cs == &top_cpuset) * Mark attach is in progress. This makes validate_change() fail
cpumask_copy(cpus_attach, cpu_possible_mask); * changes which zero cpus/mems_allowed.
else */
guarantee_online_cpus(cs, cpus_attach); cs->attach_in_progress++;
ret = 0;
guarantee_online_mems(cs, &cpuset_attach_nodemask_to); out_unlock:
mutex_unlock(&cpuset_mutex);
return ret;
}
return 0; static void cpuset_cancel_attach(struct cgroup *cgrp,
struct cgroup_taskset *tset)
{
mutex_lock(&cpuset_mutex);
cgroup_cs(cgrp)->attach_in_progress--;
mutex_unlock(&cpuset_mutex);
} }
/*
* Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
* but we can't allocate it dynamically there. Define it global and
* allocate from cpuset_init().
*/
static cpumask_var_t cpus_attach;
static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{ {
/* static bufs protected by cpuset_mutex */
static nodemask_t cpuset_attach_nodemask_from;
static nodemask_t cpuset_attach_nodemask_to;
struct mm_struct *mm; struct mm_struct *mm;
struct task_struct *task; struct task_struct *task;
struct task_struct *leader = cgroup_taskset_first(tset); struct task_struct *leader = cgroup_taskset_first(tset);
...@@ -1423,6 +1442,16 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) ...@@ -1423,6 +1442,16 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *cs = cgroup_cs(cgrp);
struct cpuset *oldcs = cgroup_cs(oldcgrp); struct cpuset *oldcs = cgroup_cs(oldcgrp);
mutex_lock(&cpuset_mutex);
/* prepare for attach */
if (cs == &top_cpuset)
cpumask_copy(cpus_attach, cpu_possible_mask);
else
guarantee_online_cpus(cs, cpus_attach);
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
cgroup_taskset_for_each(task, cgrp, tset) { cgroup_taskset_for_each(task, cgrp, tset) {
/* /*
* can_attach beforehand should guarantee that this doesn't * can_attach beforehand should guarantee that this doesn't
...@@ -1448,6 +1477,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) ...@@ -1448,6 +1477,18 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
&cpuset_attach_nodemask_to); &cpuset_attach_nodemask_to);
mmput(mm); mmput(mm);
} }
cs->attach_in_progress--;
/*
* We may have raced with CPU/memory hotunplug. Trigger hotplug
* propagation if @cs doesn't have any CPU or memory. It will move
* the newly added tasks to the nearest parent which can execute.
*/
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
schedule_cpuset_propagate_hotplug(cs);
mutex_unlock(&cpuset_mutex);
} }
/* The various types of files and directories in a cpuset file system */ /* The various types of files and directories in a cpuset file system */
...@@ -1469,12 +1510,13 @@ typedef enum { ...@@ -1469,12 +1510,13 @@ typedef enum {
static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
{ {
int retval = 0;
struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *cs = cgroup_cs(cgrp);
cpuset_filetype_t type = cft->private; cpuset_filetype_t type = cft->private;
int retval = -ENODEV;
if (!cgroup_lock_live_group(cgrp)) mutex_lock(&cpuset_mutex);
return -ENODEV; if (!is_cpuset_online(cs))
goto out_unlock;
switch (type) { switch (type) {
case FILE_CPU_EXCLUSIVE: case FILE_CPU_EXCLUSIVE:
...@@ -1508,18 +1550,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) ...@@ -1508,18 +1550,20 @@ static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
retval = -EINVAL; retval = -EINVAL;
break; break;
} }
cgroup_unlock(); out_unlock:
mutex_unlock(&cpuset_mutex);
return retval; return retval;
} }
static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
{ {
int retval = 0;
struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *cs = cgroup_cs(cgrp);
cpuset_filetype_t type = cft->private; cpuset_filetype_t type = cft->private;
int retval = -ENODEV;
if (!cgroup_lock_live_group(cgrp)) mutex_lock(&cpuset_mutex);
return -ENODEV; if (!is_cpuset_online(cs))
goto out_unlock;
switch (type) { switch (type) {
case FILE_SCHED_RELAX_DOMAIN_LEVEL: case FILE_SCHED_RELAX_DOMAIN_LEVEL:
...@@ -1529,7 +1573,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) ...@@ -1529,7 +1573,8 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
retval = -EINVAL; retval = -EINVAL;
break; break;
} }
cgroup_unlock(); out_unlock:
mutex_unlock(&cpuset_mutex);
return retval; return retval;
} }
...@@ -1539,17 +1584,36 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) ...@@ -1539,17 +1584,36 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
const char *buf) const char *buf)
{ {
int retval = 0;
struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *cs = cgroup_cs(cgrp);
struct cpuset *trialcs; struct cpuset *trialcs;
int retval = -ENODEV;
if (!cgroup_lock_live_group(cgrp)) /*
return -ENODEV; * CPU or memory hotunplug may leave @cs w/o any execution
* resources, in which case the hotplug code asynchronously updates
* configuration and transfers all tasks to the nearest ancestor
* which can execute.
*
* As writes to "cpus" or "mems" may restore @cs's execution
* resources, wait for the previously scheduled operations before
* proceeding, so that we don't end up keep removing tasks added
* after execution capability is restored.
*
* Flushing cpuset_hotplug_work is enough to synchronize against
* hotplug hanlding; however, cpuset_attach() may schedule
* propagation work directly. Flush the workqueue too.
*/
flush_work(&cpuset_hotplug_work);
flush_workqueue(cpuset_propagate_hotplug_wq);
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
trialcs = alloc_trial_cpuset(cs); trialcs = alloc_trial_cpuset(cs);
if (!trialcs) { if (!trialcs) {
retval = -ENOMEM; retval = -ENOMEM;
goto out; goto out_unlock;
} }
switch (cft->private) { switch (cft->private) {
...@@ -1565,8 +1629,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, ...@@ -1565,8 +1629,8 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
} }
free_trial_cpuset(trialcs); free_trial_cpuset(trialcs);
out: out_unlock:
cgroup_unlock(); mutex_unlock(&cpuset_mutex);
return retval; return retval;
} }
...@@ -1790,15 +1854,12 @@ static struct cftype files[] = { ...@@ -1790,15 +1854,12 @@ static struct cftype files[] = {
static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
{ {
struct cgroup *parent_cg = cont->parent; struct cpuset *cs;
struct cgroup *tmp_cg;
struct cpuset *parent, *cs;
if (!parent_cg) if (!cont->parent)
return &top_cpuset.css; return &top_cpuset.css;
parent = cgroup_cs(parent_cg);
cs = kmalloc(sizeof(*cs), GFP_KERNEL); cs = kzalloc(sizeof(*cs), GFP_KERNEL);
if (!cs) if (!cs)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) { if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL)) {
...@@ -1806,22 +1867,38 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) ...@@ -1806,22 +1867,38 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
} }
cs->flags = 0;
if (is_spread_page(parent))
set_bit(CS_SPREAD_PAGE, &cs->flags);
if (is_spread_slab(parent))
set_bit(CS_SPREAD_SLAB, &cs->flags);
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed); cpumask_clear(cs->cpus_allowed);
nodes_clear(cs->mems_allowed); nodes_clear(cs->mems_allowed);
fmeter_init(&cs->fmeter); fmeter_init(&cs->fmeter);
INIT_WORK(&cs->hotplug_work, cpuset_propagate_hotplug_workfn);
cs->relax_domain_level = -1; cs->relax_domain_level = -1;
cs->parent = parent; return &cs->css;
}
static int cpuset_css_online(struct cgroup *cgrp)
{
struct cpuset *cs = cgroup_cs(cgrp);
struct cpuset *parent = parent_cs(cs);
struct cpuset *tmp_cs;
struct cgroup *pos_cg;
if (!parent)
return 0;
mutex_lock(&cpuset_mutex);
set_bit(CS_ONLINE, &cs->flags);
if (is_spread_page(parent))
set_bit(CS_SPREAD_PAGE, &cs->flags);
if (is_spread_slab(parent))
set_bit(CS_SPREAD_SLAB, &cs->flags);
number_of_cpusets++; number_of_cpusets++;
if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cont->flags)) if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags))
goto skip_clone; goto out_unlock;
/* /*
* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
...@@ -1836,35 +1913,49 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont) ...@@ -1836,35 +1913,49 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cont)
* changed to grant parent->cpus_allowed-sibling_cpus_exclusive * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
* (and likewise for mems) to the new cgroup. * (and likewise for mems) to the new cgroup.
*/ */
list_for_each_entry(tmp_cg, &parent_cg->children, sibling) { rcu_read_lock();
struct cpuset *tmp_cs = cgroup_cs(tmp_cg); cpuset_for_each_child(tmp_cs, pos_cg, parent) {
if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) rcu_read_unlock();
goto skip_clone; goto out_unlock;
}
} }
rcu_read_unlock();
mutex_lock(&callback_mutex); mutex_lock(&callback_mutex);
cs->mems_allowed = parent->mems_allowed; cs->mems_allowed = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
mutex_unlock(&callback_mutex); mutex_unlock(&callback_mutex);
skip_clone: out_unlock:
return &cs->css; mutex_unlock(&cpuset_mutex);
return 0;
}
static void cpuset_css_offline(struct cgroup *cgrp)
{
struct cpuset *cs = cgroup_cs(cgrp);
mutex_lock(&cpuset_mutex);
if (is_sched_load_balance(cs))
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
number_of_cpusets--;
clear_bit(CS_ONLINE, &cs->flags);
mutex_unlock(&cpuset_mutex);
} }
/* /*
* If the cpuset being removed has its flag 'sched_load_balance' * If the cpuset being removed has its flag 'sched_load_balance'
* enabled, then simulate turning sched_load_balance off, which * enabled, then simulate turning sched_load_balance off, which
* will call async_rebuild_sched_domains(). * will call rebuild_sched_domains_locked().
*/ */
static void cpuset_css_free(struct cgroup *cont) static void cpuset_css_free(struct cgroup *cont)
{ {
struct cpuset *cs = cgroup_cs(cont); struct cpuset *cs = cgroup_cs(cont);
if (is_sched_load_balance(cs))
update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
number_of_cpusets--;
free_cpumask_var(cs->cpus_allowed); free_cpumask_var(cs->cpus_allowed);
kfree(cs); kfree(cs);
} }
...@@ -1872,8 +1963,11 @@ static void cpuset_css_free(struct cgroup *cont) ...@@ -1872,8 +1963,11 @@ static void cpuset_css_free(struct cgroup *cont)
struct cgroup_subsys cpuset_subsys = { struct cgroup_subsys cpuset_subsys = {
.name = "cpuset", .name = "cpuset",
.css_alloc = cpuset_css_alloc, .css_alloc = cpuset_css_alloc,
.css_online = cpuset_css_online,
.css_offline = cpuset_css_offline,
.css_free = cpuset_css_free, .css_free = cpuset_css_free,
.can_attach = cpuset_can_attach, .can_attach = cpuset_can_attach,
.cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach, .attach = cpuset_attach,
.subsys_id = cpuset_subsys_id, .subsys_id = cpuset_subsys_id,
.base_cftypes = files, .base_cftypes = files,
...@@ -1924,7 +2018,9 @@ static void cpuset_do_move_task(struct task_struct *tsk, ...@@ -1924,7 +2018,9 @@ static void cpuset_do_move_task(struct task_struct *tsk,
{ {
struct cgroup *new_cgroup = scan->data; struct cgroup *new_cgroup = scan->data;
cgroup_lock();
cgroup_attach_task(new_cgroup, tsk); cgroup_attach_task(new_cgroup, tsk);
cgroup_unlock();
} }
/** /**
...@@ -1932,7 +2028,7 @@ static void cpuset_do_move_task(struct task_struct *tsk, ...@@ -1932,7 +2028,7 @@ static void cpuset_do_move_task(struct task_struct *tsk,
* @from: cpuset in which the tasks currently reside * @from: cpuset in which the tasks currently reside
* @to: cpuset to which the tasks will be moved * @to: cpuset to which the tasks will be moved
* *
* Called with cgroup_mutex held * Called with cpuset_mutex held
* callback_mutex must not be held, as cpuset_attach() will take it. * callback_mutex must not be held, as cpuset_attach() will take it.
* *
* The cgroup_scan_tasks() function will scan all the tasks in a cgroup, * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
...@@ -1959,169 +2055,200 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) ...@@ -1959,169 +2055,200 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
* removing that CPU or node from all cpusets. If this removes the * removing that CPU or node from all cpusets. If this removes the
* last CPU or node from a cpuset, then move the tasks in the empty * last CPU or node from a cpuset, then move the tasks in the empty
* cpuset to its next-highest non-empty parent. * cpuset to its next-highest non-empty parent.
*
* Called with cgroup_mutex held
* callback_mutex must not be held, as cpuset_attach() will take it.
*/ */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs) static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{ {
struct cpuset *parent; struct cpuset *parent;
/*
* The cgroup's css_sets list is in use if there are tasks
* in the cpuset; the list is empty if there are none;
* the cs->css.refcnt seems always 0.
*/
if (list_empty(&cs->css.cgroup->css_sets))
return;
/* /*
* Find its next-highest non-empty parent, (top cpuset * Find its next-highest non-empty parent, (top cpuset
* has online cpus, so can't be empty). * has online cpus, so can't be empty).
*/ */
parent = cs->parent; parent = parent_cs(cs);
while (cpumask_empty(parent->cpus_allowed) || while (cpumask_empty(parent->cpus_allowed) ||
nodes_empty(parent->mems_allowed)) nodes_empty(parent->mems_allowed))
parent = parent->parent; parent = parent_cs(parent);
move_member_tasks_to_cpuset(cs, parent); move_member_tasks_to_cpuset(cs, parent);
} }
/* /**
* Helper function to traverse cpusets. * cpuset_propagate_hotplug_workfn - propagate CPU/memory hotplug to a cpuset
* It can be used to walk the cpuset tree from top to bottom, completing * @cs: cpuset in interest
* one layer before dropping down to the next (thus always processing a *
* node before any of its children). * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
* offline, update @cs accordingly. If @cs ends up with no CPU or memory,
* all its tasks are moved to the nearest ancestor with both resources.
*/ */
static struct cpuset *cpuset_next(struct list_head *queue) static void cpuset_propagate_hotplug_workfn(struct work_struct *work)
{ {
struct cpuset *cp; static cpumask_t off_cpus;
struct cpuset *child; /* scans child cpusets of cp */ static nodemask_t off_mems, tmp_mems;
struct cgroup *cont; struct cpuset *cs = container_of(work, struct cpuset, hotplug_work);
bool is_empty;
if (list_empty(queue)) mutex_lock(&cpuset_mutex);
return NULL;
cp = list_first_entry(queue, struct cpuset, stack_list); cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
list_del(queue->next); nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
child = cgroup_cs(cont); /* remove offline cpus from @cs */
list_add_tail(&child->stack_list, queue); if (!cpumask_empty(&off_cpus)) {
mutex_lock(&callback_mutex);
cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
mutex_unlock(&callback_mutex);
update_tasks_cpumask(cs, NULL);
}
/* remove offline mems from @cs */
if (!nodes_empty(off_mems)) {
tmp_mems = cs->mems_allowed;
mutex_lock(&callback_mutex);
nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
mutex_unlock(&callback_mutex);
update_tasks_nodemask(cs, &tmp_mems, NULL);
} }
return cp; is_empty = cpumask_empty(cs->cpus_allowed) ||
nodes_empty(cs->mems_allowed);
mutex_unlock(&cpuset_mutex);
/*
* If @cs became empty, move tasks to the nearest ancestor with
* execution resources. This is full cgroup operation which will
* also call back into cpuset. Should be done outside any lock.
*/
if (is_empty)
remove_tasks_in_empty_cpuset(cs);
/* the following may free @cs, should be the last operation */
css_put(&cs->css);
} }
/**
* schedule_cpuset_propagate_hotplug - schedule hotplug propagation to a cpuset
* @cs: cpuset of interest
*
* Schedule cpuset_propagate_hotplug_workfn() which will update CPU and
* memory masks according to top_cpuset.
*/
static void schedule_cpuset_propagate_hotplug(struct cpuset *cs)
{
/*
* Pin @cs. The refcnt will be released when the work item
* finishes executing.
*/
if (!css_tryget(&cs->css))
return;
	/*
	 * Queue @cs->hotplug_work.  If already pending, lose the css ref.
	 * cpuset_propagate_hotplug_wq is ordered and propagation will
	 * happen in the order this function is called.
	 */
	if (!queue_work(cpuset_propagate_hotplug_wq, &cs->hotplug_work))
		css_put(&cs->css);
}

/*
 * Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory
 * online/offline) and update the cpusets accordingly.
 * For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
 * cpuset must be moved to a parent cpuset.
 *
 * Called with cgroup_mutex held.  We take callback_mutex to modify
 * cpus_allowed and mems_allowed.
 *
 * This walk processes the tree from top to bottom, completing one layer
 * before dropping down to the next.  It always processes a node before
 * any of its children.
 *
 * In the case of memory hot-unplug, it will remove nodes from N_MEMORY
 * if all present pages from a node are offlined.
 */
static void
scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
{
	LIST_HEAD(queue);
	struct cpuset *cp;		/* scans cpusets being updated */
	static nodemask_t oldmems;	/* protected by cgroup_mutex */

	list_add_tail((struct list_head *)&root->stack_list, &queue);

	switch (event) {
	case CPUSET_CPU_OFFLINE:
		while ((cp = cpuset_next(&queue)) != NULL) {

			/* Continue past cpusets with all cpus online */
			if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
				continue;

			/* Remove offline cpus from this cpuset. */
			mutex_lock(&callback_mutex);
			cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
				    cpu_active_mask);
			mutex_unlock(&callback_mutex);

			/* Move tasks from the empty cpuset to a parent */
			if (cpumask_empty(cp->cpus_allowed))
				remove_tasks_in_empty_cpuset(cp);
			else
				update_tasks_cpumask(cp, NULL);
		}
		break;

	case CPUSET_MEM_OFFLINE:
		while ((cp = cpuset_next(&queue)) != NULL) {

			/* Continue past cpusets with all mems online */
			if (nodes_subset(cp->mems_allowed,
					 node_states[N_MEMORY]))
				continue;

			oldmems = cp->mems_allowed;

			/* Remove offline mems from this cpuset. */
			mutex_lock(&callback_mutex);
			nodes_and(cp->mems_allowed, cp->mems_allowed,
				  node_states[N_MEMORY]);
			mutex_unlock(&callback_mutex);

			/* Move tasks from the empty cpuset to a parent */
			if (nodes_empty(cp->mems_allowed))
				remove_tasks_in_empty_cpuset(cp);
			else
				update_tasks_nodemask(cp, &oldmems, NULL);
		}
	}
}

/*
 * The top_cpuset tracks what CPUs and Memory Nodes are online,
 * period.  This is necessary in order to make cpusets transparent
 * (of no affect) on systems that are actively using CPU hotplug
 * but making no active use of cpusets.
 *
 * The only exception to this is suspend/resume, where we don't
 * modify cpusets at all.
 *
 * This routine ensures that top_cpuset.cpus_allowed tracks
 * cpu_active_mask on each CPU hotplug (cpuhp) event.
 *
 * Called within get_online_cpus().  Needs to call cgroup_lock()
 * before calling generate_sched_domains().
 *
 * @cpu_online: Indicates whether this is a CPU online event (true) or
 * a CPU offline event (false).
 */
void cpuset_update_active_cpus(bool cpu_online)
{
	struct sched_domain_attr *attr;
	cpumask_var_t *doms;
	int ndoms;

	cgroup_lock();
	mutex_lock(&callback_mutex);
	cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
	mutex_unlock(&callback_mutex);

	if (!cpu_online)
		scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE);

	ndoms = generate_sched_domains(&doms, &attr);
	cgroup_unlock();

	/* Have scheduler rebuild the domains */
	partition_sched_domains(ndoms, doms, attr);
}

/**
 * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
 *
 * This function is called after either CPU or memory configuration has
 * changed and updates cpuset accordingly.  The top_cpuset is always
 * synchronized to cpu_active_mask and N_MEMORY, which is necessary in
 * order to make cpusets transparent (of no affect) on systems that are
 * actively using CPU hotplug but making no active use of cpusets.
 *
 * Non-root cpusets are only affected by offlining.  If any CPUs or memory
 * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all
 * descendants.
 *
 * Note that CPU offlining during suspend is ignored.  We don't modify
 * cpusets across suspend/resume cycles at all.
 */
static void cpuset_hotplug_workfn(struct work_struct *work)
{
	static cpumask_t new_cpus, tmp_cpus;
	static nodemask_t new_mems, tmp_mems;
	bool cpus_updated, mems_updated;
	bool cpus_offlined, mems_offlined;

	mutex_lock(&cpuset_mutex);

	/* fetch the available cpus/mems and find out which changed how */
	cpumask_copy(&new_cpus, cpu_active_mask);
	new_mems = node_states[N_MEMORY];

	cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
	cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
				       &new_cpus);

	mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
	nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
	mems_offlined = !nodes_empty(tmp_mems);

	/* synchronize cpus_allowed to cpu_active_mask */
	if (cpus_updated) {
		mutex_lock(&callback_mutex);
		cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
		mutex_unlock(&callback_mutex);
		/* we don't mess with cpumasks of tasks in top_cpuset */
	}

	/* synchronize mems_allowed to N_MEMORY */
	if (mems_updated) {
		tmp_mems = top_cpuset.mems_allowed;
		mutex_lock(&callback_mutex);
		top_cpuset.mems_allowed = new_mems;
		mutex_unlock(&callback_mutex);
		update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL);
	}

	/* if cpus or mems went down, we need to propagate to descendants */
	if (cpus_offlined || mems_offlined) {
		struct cpuset *cs;
		struct cgroup *pos_cgrp;

		rcu_read_lock();
		cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset)
			schedule_cpuset_propagate_hotplug(cs);
		rcu_read_unlock();
	}

	mutex_unlock(&cpuset_mutex);

	/* wait for propagations to finish */
	flush_workqueue(cpuset_propagate_hotplug_wq);

	/* rebuild sched domains if cpus_allowed has changed */
	if (cpus_updated) {
		struct sched_domain_attr *attr;
		cpumask_var_t *doms;
		int ndoms;

		mutex_lock(&cpuset_mutex);
		ndoms = generate_sched_domains(&doms, &attr);
		mutex_unlock(&cpuset_mutex);

		partition_sched_domains(ndoms, doms, attr);
	}
}

void cpuset_update_active_cpus(bool cpu_online)
{
	/*
	 * We're inside cpu hotplug critical region which usually nests
	 * inside cgroup synchronization.  Bounce actual hotplug processing
	 * to a work item to avoid reverse locking order.
	 *
	 * We still need to do partition_sched_domains() synchronously;
	 * otherwise, the scheduler will get confused and put tasks to the
	 * dead CPU.  Fall back to the default single domain.
	 * cpuset_hotplug_workfn() will rebuild it as necessary.
	 */
	partition_sched_domains(1, NULL, NULL);
	schedule_work(&cpuset_hotplug_work);
}
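The new cpuset_update_active_cpus() shows the pattern this series leans on: do only a cheap, safe step synchronously inside the hotplug critical section and push the heavy, lock-hungry rebuild to another context. Below is a stand-alone POSIX-threads sketch of that split; the names (rebuild_worker, fake_cpu_hotplug_callback) are invented stand-ins for the workqueue machinery, not the kernel API.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t kick = PTHREAD_COND_INITIALIZER;
static bool rebuild_requested;

/* Heavy work runs here, outside the "hotplug" path. */
static void *rebuild_worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!rebuild_requested)
		pthread_cond_wait(&kick, &lock);
	rebuild_requested = false;
	pthread_mutex_unlock(&lock);

	puts("worker: rebuilding sched domains from cpuset config");
	return NULL;
}

static void fake_cpu_hotplug_callback(void)
{
	/* cheap synchronous fallback, analogous to
	 * partition_sched_domains(1, NULL, NULL) */
	puts("hotplug: falling back to a single sched domain");

	/* defer the real rebuild, analogous to schedule_work() */
	pthread_mutex_lock(&lock);
	rebuild_requested = true;
	pthread_cond_signal(&kick);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t worker;

	pthread_create(&worker, NULL, rebuild_worker, NULL);
	fake_cpu_hotplug_callback();
	pthread_join(worker, NULL);
	return 0;
}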
#ifdef CONFIG_MEMORY_HOTPLUG
@@ -2133,29 +2260,7 @@ void cpuset_update_active_cpus(bool cpu_online)
static int cpuset_track_online_nodes(struct notifier_block *self,
				unsigned long action, void *arg)
{
	static nodemask_t oldmems; /* protected by cgroup_mutex */

	cgroup_lock();
	switch (action) {
	case MEM_ONLINE:
		oldmems = top_cpuset.mems_allowed;
		mutex_lock(&callback_mutex);
		top_cpuset.mems_allowed = node_states[N_MEMORY];
		mutex_unlock(&callback_mutex);
		update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
		break;
	case MEM_OFFLINE:
		/*
		 * needn't update top_cpuset.mems_allowed explicitly because
		 * scan_cpusets_upon_hotplug() will update it.
		 */
		scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
		break;
	default:
		break;
	}
	cgroup_unlock();

	schedule_work(&cpuset_hotplug_work);
	return NOTIFY_OK;
}
#endif
@@ -2173,8 +2278,9 @@ void __init cpuset_init_smp(void)
	hotplug_memory_notifier(cpuset_track_online_nodes, 10);

	cpuset_wq = create_singlethread_workqueue("cpuset");
	BUG_ON(!cpuset_wq);
	cpuset_propagate_hotplug_wq =
		alloc_ordered_workqueue("cpuset_hotplug", 0);
	BUG_ON(!cpuset_propagate_hotplug_wq);
}
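With both the CPU and memory notifiers reduced to schedule_work(&cpuset_hotplug_work), back-to-back hotplug events naturally coalesce: queueing an already pending work item is a no-op, so one run of the handler picks up everything that changed. A stand-alone sketch of that coalescing behaviour, using a plain boolean and invented helper names in place of the real workqueue state:

#include <stdbool.h>
#include <stdio.h>

static bool hotplug_work_pending;
static int hotplug_runs;

static void fake_schedule_work(void)
{
	if (hotplug_work_pending)	/* already queued: nothing to do */
		return;
	hotplug_work_pending = true;
}

static void fake_run_pending_work(void)
{
	if (!hotplug_work_pending)
		return;
	hotplug_work_pending = false;
	hotplug_runs++;			/* one pass handles all events */
}

int main(void)
{
	fake_schedule_work();		/* CPU went offline */
	fake_schedule_work();		/* memory node went offline */
	fake_run_pending_work();
	printf("handler ran %d time(s)\n", hotplug_runs);
	return 0;
}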
/**
@@ -2273,8 +2379,8 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 */
static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
{
	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
		cs = cs->parent;
	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
		cs = parent_cs(cs);
	return cs;
}
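nearest_hardwall_ancestor() now walks up via parent_cs() instead of the removed cpuset->parent field, deriving the parent from the cgroup hierarchy on the fly. A stand-alone sketch of that accessor style, with invented fake_cgroup/fake_cpuset types in place of the kernel's structures:

#include <stdio.h>
#include <stddef.h>

struct fake_cgroup {
	struct fake_cgroup *parent;
	void *subsys_state;		/* points at the owning fake_cpuset */
};

struct fake_cpuset {
	const char *name;
	struct fake_cgroup cgrp;
};

/* Derive the parent cpuset from the cgroup tree instead of caching it. */
static struct fake_cpuset *parent_cs(struct fake_cpuset *cs)
{
	struct fake_cgroup *pcg = cs->cgrp.parent;

	return pcg ? (struct fake_cpuset *)pcg->subsys_state : NULL;
}

int main(void)
{
	struct fake_cpuset root = { .name = "root" };
	struct fake_cpuset child = { .name = "child" };

	root.cgrp.parent = NULL;
	root.cgrp.subsys_state = &root;
	child.cgrp.parent = &root.cgrp;
	child.cgrp.subsys_state = &child;

	printf("parent of %s is %s\n", child.name, parent_cs(&child)->name);
	return 0;
}

Dropping the cached pointer removes one field that had to be kept in sync with the cgroup tree; the tree itself is the single source of truth.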
@@ -2411,17 +2517,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
	return 0;
}

/**
 * cpuset_unlock - release lock on cpuset changes
 *
 * Undo the lock taken in a previous cpuset_lock() call.
 */
void cpuset_unlock(void)
{
	mutex_unlock(&callback_mutex);
}

/**
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
@@ -2568,7 +2663,7 @@ void __cpuset_memory_pressure_bump(void)
 * - Used for /proc/<pid>/cpuset.
 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *   doesn't really matter if tsk->cpuset changes after we read it,
 *   and we take cgroup_mutex, keeping cpuset_attach() from changing it
 *   and we take cpuset_mutex, keeping cpuset_attach() from changing it
 *   anyway.
 */
static int proc_cpuset_show(struct seq_file *m, void *unused_v)
@@ -2590,16 +2685,15 @@ static int proc_cpuset_show(struct seq_file *m, void *unused_v)
	if (!tsk)
		goto out_free;

	retval = -EINVAL;
	cgroup_lock();
	rcu_read_lock();
	css = task_subsys_state(tsk, cpuset_subsys_id);
	retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
	rcu_read_unlock();
	if (retval < 0)
		goto out_unlock;
		goto out_put_task;
	seq_puts(m, buf);
	seq_putc(m, '\n');
out_unlock:
out_put_task:
	cgroup_unlock();
	put_task_struct(tsk);
out_free:
	kfree(buf);
...
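For reference, the path that proc_cpuset_show() emits is what user space reads from /proc/<pid>/cpuset. A small reader for the current process looks like this; it is plain user-space C and does not depend on the kernel-internal locking changes above.

#include <stdio.h>

int main(void)
{
	char buf[4096];
	FILE *f = fopen("/proc/self/cpuset", "r");

	if (!f) {
		perror("/proc/self/cpuset");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("current cpuset: %s", buf);	/* e.g. "/" for the root cpuset */
	fclose(f);
	return 0;
}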