Commit deb7aa30 authored by Tejun Heo

cpuset: reorganize CPU / memory hotplug handling

Reorganize hotplug path to prepare for async hotplug handling.

* Both CPU and memory hotplug handlings are collected into a single
  function - cpuset_handle_hotplug().  It doesn't take any argument
  but compares the current settings of top_cpuset against what's
  actually available to determine what happened.  This function
  directly updates top_cpuset.  If there are CPUs or memory nodes
  which are taken down, cpuset_propagate_hotplug() is invoked on all
  !root cpusets.

* cpuset_propagate_hotplug() is responsible for updating the specified
  cpuset so that it doesn't include any resource which isn't available
  to top_cpuset.  If no CPU or memory is left after update, all tasks
  are moved to the nearest ancestor with both resources.

* update_tasks_cpumask() and update_tasks_nodemask() are now always
  called after cpus or mems masks are updated even if the cpuset
  doesn't have any task.  This is for brevity and not expected to have
  any measurable effect.

* cpu_active_mask and N_HIGH_MEMORY are read exactly once per
  cpuset_handle_hotplug() invocation, all cpusets share the same view
  of what resources are available, and cpuset_handle_hotplug() can
  handle multiple resources going up and down.  These properties will
  allow async operation.

The reorganization, while drastic, is equivalent and shouldn't cause
any behavior difference.  This will enable making hotplug handling
async and remove get_online_cpus() -> cgroup_mutex nesting.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
parent 4e4c9a14
...@@ -148,12 +148,6 @@ typedef enum { ...@@ -148,12 +148,6 @@ typedef enum {
CS_SPREAD_SLAB, CS_SPREAD_SLAB,
} cpuset_flagbits_t; } cpuset_flagbits_t;
/* the type of hotplug event */
enum hotplug_event {
CPUSET_CPU_OFFLINE,
CPUSET_MEM_OFFLINE,
};
/* convenient tests for these bits */ /* convenient tests for these bits */
static inline bool is_cpuset_online(const struct cpuset *cs) static inline bool is_cpuset_online(const struct cpuset *cs)
{ {
...@@ -2059,116 +2053,131 @@ static struct cpuset *cpuset_next(struct list_head *queue) ...@@ -2059,116 +2053,131 @@ static struct cpuset *cpuset_next(struct list_head *queue)
return cp; return cp;
} }
/**
/* * cpuset_propagate_hotplug - propagate CPU/memory hotplug to a cpuset
* Walk the specified cpuset subtree upon a hotplug operation (CPU/Memory * @cs: cpuset in interest
* online/offline) and update the cpusets accordingly.
* For regular CPU/Mem hotplug, look for empty cpusets; the tasks of such
* cpuset must be moved to a parent cpuset.
*
* Called with cgroup_mutex held. We take callback_mutex to modify
* cpus_allowed and mems_allowed.
* *
* This walk processes the tree from top to bottom, completing one layer * Compare @cs's cpu and mem masks against top_cpuset and if some have gone
* before dropping down to the next. It always processes a node before * offline, update @cs accordingly. If @cs ends up with no CPU or memory,
* any of its children. * all its tasks are moved to the nearest ancestor with both resources.
* *
* In the case of memory hot-unplug, it will remove nodes from N_MEMORY * Should be called with cgroup_mutex held.
* if all present pages from a node are offlined.
*/ */
static void static void cpuset_propagate_hotplug(struct cpuset *cs)
scan_cpusets_upon_hotplug(struct cpuset *root, enum hotplug_event event)
{ {
LIST_HEAD(queue); static cpumask_t off_cpus;
struct cpuset *cp; /* scans cpusets being updated */ static nodemask_t off_mems, tmp_mems;
static nodemask_t oldmems; /* protected by cgroup_mutex */
list_add_tail((struct list_head *)&root->stack_list, &queue);
switch (event) {
case CPUSET_CPU_OFFLINE:
while ((cp = cpuset_next(&queue)) != NULL) {
/* Continue past cpusets with all cpus online */ WARN_ON_ONCE(!cgroup_lock_is_held());
if (cpumask_subset(cp->cpus_allowed, cpu_active_mask))
continue;
/* Remove offline cpus from this cpuset. */ cpumask_andnot(&off_cpus, cs->cpus_allowed, top_cpuset.cpus_allowed);
mutex_lock(&callback_mutex); nodes_andnot(off_mems, cs->mems_allowed, top_cpuset.mems_allowed);
cpumask_and(cp->cpus_allowed, cp->cpus_allowed,
cpu_active_mask);
mutex_unlock(&callback_mutex);
/* Move tasks from the empty cpuset to a parent */ /* remove offline cpus from @cs */
if (cpumask_empty(cp->cpus_allowed)) if (!cpumask_empty(&off_cpus)) {
remove_tasks_in_empty_cpuset(cp); mutex_lock(&callback_mutex);
else cpumask_andnot(cs->cpus_allowed, cs->cpus_allowed, &off_cpus);
update_tasks_cpumask(cp, NULL); mutex_unlock(&callback_mutex);
} update_tasks_cpumask(cs, NULL);
break; }
case CPUSET_MEM_OFFLINE:
while ((cp = cpuset_next(&queue)) != NULL) {
/* Continue past cpusets with all mems online */
if (nodes_subset(cp->mems_allowed,
node_states[N_MEMORY]))
continue;
oldmems = cp->mems_allowed;
/* Remove offline mems from this cpuset. */
mutex_lock(&callback_mutex);
nodes_and(cp->mems_allowed, cp->mems_allowed,
node_states[N_MEMORY]);
mutex_unlock(&callback_mutex);
/* Move tasks from the empty cpuset to a parent */ /* remove offline mems from @cs */
if (nodes_empty(cp->mems_allowed)) if (!nodes_empty(off_mems)) {
remove_tasks_in_empty_cpuset(cp); tmp_mems = cs->mems_allowed;
else mutex_lock(&callback_mutex);
update_tasks_nodemask(cp, &oldmems, NULL); nodes_andnot(cs->mems_allowed, cs->mems_allowed, off_mems);
} mutex_unlock(&callback_mutex);
update_tasks_nodemask(cs, &tmp_mems, NULL);
} }
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
remove_tasks_in_empty_cpuset(cs);
} }
/* /**
* The top_cpuset tracks what CPUs and Memory Nodes are online, * cpuset_handle_hotplug - handle CPU/memory hot[un]plug
* period. This is necessary in order to make cpusets transparent
* (of no affect) on systems that are actively using CPU hotplug
* but making no active use of cpusets.
*
* The only exception to this is suspend/resume, where we don't
* modify cpusets at all.
* *
* This routine ensures that top_cpuset.cpus_allowed tracks * This function is called after either CPU or memory configuration has
* cpu_active_mask on each CPU hotplug (cpuhp) event. * changed and updates cpuset accordingly. The top_cpuset is always
* synchronized to cpu_active_mask and N_MEMORY, which is necessary in
* order to make cpusets transparent (of no affect) on systems that are
* actively using CPU hotplug but making no active use of cpusets.
* *
* Called within get_online_cpus(). Needs to call cgroup_lock() * Non-root cpusets are only affected by offlining. If any CPUs or memory
* before calling generate_sched_domains(). * nodes have been taken down, cpuset_propagate_hotplug() is invoked on all
* descendants.
* *
* @cpu_online: Indicates whether this is a CPU online event (true) or * Note that CPU offlining during suspend is ignored. We don't modify
* a CPU offline event (false). * cpusets across suspend/resume cycles at all.
*/ */
void cpuset_update_active_cpus(bool cpu_online) static void cpuset_handle_hotplug(void)
{ {
struct sched_domain_attr *attr; static cpumask_t new_cpus, tmp_cpus;
cpumask_var_t *doms; static nodemask_t new_mems, tmp_mems;
int ndoms; bool cpus_updated, mems_updated;
bool cpus_offlined, mems_offlined;
cgroup_lock(); cgroup_lock();
mutex_lock(&callback_mutex);
cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
mutex_unlock(&callback_mutex);
if (!cpu_online) /* fetch the available cpus/mems and find out which changed how */
scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_CPU_OFFLINE); cpumask_copy(&new_cpus, cpu_active_mask);
new_mems = node_states[N_MEMORY];
cpus_updated = !cpumask_equal(top_cpuset.cpus_allowed, &new_cpus);
cpus_offlined = cpumask_andnot(&tmp_cpus, top_cpuset.cpus_allowed,
&new_cpus);
mems_updated = !nodes_equal(top_cpuset.mems_allowed, new_mems);
nodes_andnot(tmp_mems, top_cpuset.mems_allowed, new_mems);
mems_offlined = !nodes_empty(tmp_mems);
/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
mutex_lock(&callback_mutex);
cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
mutex_unlock(&callback_mutex);
/* we don't mess with cpumasks of tasks in top_cpuset */
}
/* synchronize mems_allowed to N_MEMORY */
if (mems_updated) {
tmp_mems = top_cpuset.mems_allowed;
mutex_lock(&callback_mutex);
top_cpuset.mems_allowed = new_mems;
mutex_unlock(&callback_mutex);
update_tasks_nodemask(&top_cpuset, &tmp_mems, NULL);
}
/* if cpus or mems went down, we need to propagate to descendants */
if (cpus_offlined || mems_offlined) {
struct cpuset *cs;
LIST_HEAD(queue);
list_add_tail(&top_cpuset.stack_list, &queue);
while ((cs = cpuset_next(&queue)))
if (cs != &top_cpuset)
cpuset_propagate_hotplug(cs);
}
ndoms = generate_sched_domains(&doms, &attr);
cgroup_unlock(); cgroup_unlock();
/* Have scheduler rebuild the domains */ /* rebuild sched domains if cpus_allowed has changed */
partition_sched_domains(ndoms, doms, attr); if (cpus_updated) {
struct sched_domain_attr *attr;
cpumask_var_t *doms;
int ndoms;
cgroup_lock();
ndoms = generate_sched_domains(&doms, &attr);
cgroup_unlock();
partition_sched_domains(ndoms, doms, attr);
}
}
void cpuset_update_active_cpus(bool cpu_online)
{
cpuset_handle_hotplug();
} }
#ifdef CONFIG_MEMORY_HOTPLUG #ifdef CONFIG_MEMORY_HOTPLUG
...@@ -2180,29 +2189,7 @@ void cpuset_update_active_cpus(bool cpu_online) ...@@ -2180,29 +2189,7 @@ void cpuset_update_active_cpus(bool cpu_online)
static int cpuset_track_online_nodes(struct notifier_block *self, static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg) unsigned long action, void *arg)
{ {
static nodemask_t oldmems; /* protected by cgroup_mutex */ cpuset_handle_hotplug();
cgroup_lock();
switch (action) {
case MEM_ONLINE:
oldmems = top_cpuset.mems_allowed;
mutex_lock(&callback_mutex);
top_cpuset.mems_allowed = node_states[N_MEMORY];
mutex_unlock(&callback_mutex);
update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
break;
case MEM_OFFLINE:
/*
* needn't update top_cpuset.mems_allowed explicitly because
* scan_cpusets_upon_hotplug() will update it.
*/
scan_cpusets_upon_hotplug(&top_cpuset, CPUSET_MEM_OFFLINE);
break;
default:
break;
}
cgroup_unlock();
return NOTIFY_OK; return NOTIFY_OK;
} }
#endif #endif
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment