Commit ddea677b authored by Andrew Morton, committed by Linus Torvalds

[PATCH] Move migrate_all_tasks to CPU_DEAD handling

From: Srivatsa Vaddagiri <vatsa@in.ibm.com>

migrate_all_tasks is currently run with the rest of the machine stopped.
It iterates through the complete task table, turning off the cpu affinity of
any task it finds affine to the dying cpu.  Depending on the size of the task
table this can take considerable time, and all that time the machine is
stopped, doing nothing.

Stopping the machine for such extended periods can be avoided by doing the
task migration in the CPU_DEAD notification instead, and that is precisely
what this patch does.
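
(For context, CPU_DEAD is delivered through the CPU-hotplug notifier chain.
The sketch below only shows the general shape of such a handler; the names
my_cpu_callback, my_cleanup and my_cpu_notifier are invented for illustration,
but struct notifier_block, register_cpu_notifier, CPU_DEAD and NOTIFY_OK are
the interfaces this patch already relies on.)

#include <linux/init.h>
#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/notifier.h>

/* Hypothetical per-cpu cleanup, run while the rest of the machine is live. */
static void my_cleanup(int cpu)
{
        /* e.g. drain per-cpu queues, free per-cpu buffers */
}

static int my_cpu_callback(struct notifier_block *nfb,
                           unsigned long action, void *hcpu)
{
        int cpu = (long)hcpu;

        switch (action) {
        case CPU_DEAD:
                /* The cpu is gone; the other CPUs are running normally. */
                my_cleanup(cpu);
                break;
        }
        return NOTIFY_OK;
}

static struct notifier_block my_cpu_notifier = {
        .notifier_call = my_cpu_callback,
};

static int __init my_init(void)
{
        return register_cpu_notifier(&my_cpu_notifier);
}
module_init(my_init);

The patch's own CPU_DEAD handling lives in migration_call(); the hunks near
the end of the diff add a migrate_all_tasks(cpu) call at exactly this point.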

The patch puts the idle task at the _front_ of the dying CPU's runqueue at the
highest possible priority.  This causes the idle thread to run _immediately_
after the kstopmachine thread yields.  The idle thread notices that its cpu is
offline and dies quickly.  Task migration can then be done at leisure in the
CPU_DEAD notification, when the rest of the CPUs are running.
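
The scheduling effect is easy to model outside the kernel.  The following is
plain user-space C, not kernel code; the three-level priority array, the task
names and the enqueue_head/enqueue_tail/pick_next helpers are all invented for
illustration.  It only demonstrates the rule being relied on: the next task to
run is taken from the head of the highest-priority non-empty queue, so a task
inserted at the head of the top queue runs next, and anything of equal
priority woken later queues up behind it.

#include <stdio.h>

#define NPRIO 3                 /* 0 is the highest priority level */
#define QLEN  8

struct queue { const char *task[QLEN]; int head, tail, n; };
static struct queue rq[NPRIO];  /* toy "runqueue": one FIFO per priority */

static void enqueue_tail(int prio, const char *t)   /* normal wakeup */
{
        struct queue *q = &rq[prio];
        q->task[q->tail] = t;
        q->tail = (q->tail + 1) % QLEN;
        q->n++;
}

static void enqueue_head(int prio, const char *t)   /* sched_idle_next()-style insert */
{
        struct queue *q = &rq[prio];
        q->head = (q->head + QLEN - 1) % QLEN;
        q->task[q->head] = t;
        q->n++;
}

static const char *pick_next(void)   /* what the scheduler does on a yield */
{
        for (int prio = 0; prio < NPRIO; prio++) {
                struct queue *q = &rq[prio];
                if (q->n) {
                        const char *t = q->task[q->head];
                        q->head = (q->head + 1) % QLEN;
                        q->n--;
                        return t;
                }
        }
        return "(nothing runnable)";
}

int main(void)
{
        enqueue_tail(2, "some-user-task");  /* whatever was already runnable */
        enqueue_head(0, "idle");            /* idle pushed to the front, top prio */
        enqueue_tail(0, "rt-task");         /* equal priority, woken afterwards */

        /* When the running thread yields, idle is picked first even though
         * rt-task has the same priority, because idle went in at the head. */
        for (int i = 0; i < 3; i++)
                printf("next: %s\n", pick_next());
        return 0;
}

Running it prints idle, then rt-task, then some-user-task, which is the
ordering the paragraph above depends on.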

Some advantages of this approach are:

	- More scalable: the amount of time the machine is stopped is
	  predictable.
	- No changes to hot path/core code.  We are just exploiting the
	  scheduler rule that the next highest-priority task on the runqueue
	  is run.  Also, since the idle task is put at the _front_ of the
	  runqueue, there are no races when an equally high-priority task is
	  woken up and added to the runqueue: it goes in at the back of the
	  runqueue, _after_ the idle task.
	- The cpu_is_offline check that is presently required in
	  try_to_wake_up, idle_balance and rebalance_tick can be removed,
	  speeding them up a bit.
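
(One related detail, visible in the last hunk of the diff: the migration
notifier is registered with .priority = 10 so that migrate_all_tasks() runs
before any other CPU_DEAD handler.  Notifier callbacks are invoked in
descending priority order.  The toy chain below is plain user-space C with
invented names; it only models that ordering rule.)

#include <stdio.h>

#define CPU_DEAD 1
#define MAXCB    8

struct callback {
        const char *name;
        int priority;
        void (*fn)(unsigned long action, int cpu);
};

static struct callback chain[MAXCB];
static int ncb;

/* Keep the chain sorted by descending priority, so higher-priority
 * callbacks are called first. */
static void chain_register(const char *name, int priority,
                           void (*fn)(unsigned long, int))
{
        int i = ncb++;

        while (i > 0 && chain[i - 1].priority < priority) {
                chain[i] = chain[i - 1];
                i--;
        }
        chain[i] = (struct callback){ name, priority, fn };
}

static void chain_call(unsigned long action, int cpu)
{
        for (int i = 0; i < ncb; i++)
                chain[i].fn(action, cpu);
}

static void migration_cb(unsigned long action, int cpu)
{
        if (action == CPU_DEAD)
                printf("migration: move all tasks off cpu %d\n", cpu);
}

static void other_cb(unsigned long action, int cpu)
{
        if (action == CPU_DEAD)
                printf("subsystem: per-cpu cleanup for cpu %d (tasks already gone)\n", cpu);
}

int main(void)
{
        chain_register("some-subsystem", 0, other_cb);
        chain_register("migration", 10, migration_cb);  /* like .priority = 10 */

        chain_call(CPU_DEAD, 2);  /* migration runs first despite registering last */
        return 0;
}

Even though "migration" registers last, its higher priority puts it at the
front of the chain, so its CPU_DEAD work is done before everyone else's.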

From: Srivatsa Vaddagiri <vatsa@in.ibm.com>

  Rusty mentioned that the unlikely hints against cpu_is_offline are
  redundant since the macro already has that hint.  The patch below removes
  those redundant hints I added.
parent 4197ad87
@@ -671,8 +671,7 @@ extern void sched_balance_exec(void);
 #define sched_balance_exec() {}
 #endif

-/* Move tasks off this (offline) CPU onto another. */
-extern void migrate_all_tasks(void);
+extern void sched_idle_next(void);
 extern void set_user_nice(task_t *p, long nice);
 extern int task_prio(task_t *p);
 extern int task_nice(task_t *p);
@@ -43,15 +43,16 @@ void unregister_cpu_notifier(struct notifier_block *nb)
 EXPORT_SYMBOL(unregister_cpu_notifier);

 #ifdef CONFIG_HOTPLUG_CPU
-static inline void check_for_tasks(int cpu, struct task_struct *k)
+static inline void check_for_tasks(int cpu)
 {
         struct task_struct *p;

         write_lock_irq(&tasklist_lock);
         for_each_process(p) {
-                if (task_cpu(p) == cpu && p != k)
-                        printk(KERN_WARNING "Task %s is on cpu %d\n",
-                                p->comm, cpu);
+                if (task_cpu(p) == cpu && (p->utime != 0 || p->stime != 0))
+                        printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\
+                                (state = %ld, flags = %lx) \n",
+                                p->comm, p->pid, cpu, p->state, p->flags);
         }
         write_unlock_irq(&tasklist_lock);
 }
@@ -96,8 +97,9 @@ static int take_cpu_down(void *unused)
         if (err < 0)
                 cpu_set(smp_processor_id(), cpu_online_map);
         else
-                /* Everyone else gets kicked off. */
-                migrate_all_tasks();
+                /* Force idle task to run as soon as we yield: it should
+                   immediately notice cpu is offline and die quickly. */
+                sched_idle_next();

         return err;
 }
@@ -106,6 +108,7 @@ int cpu_down(unsigned int cpu)
 {
         int err;
         struct task_struct *p;
+        cpumask_t old_allowed, tmp;

         if ((err = lock_cpu_hotplug_interruptible()) != 0)
                 return err;
@@ -120,17 +123,21 @@ int cpu_down(unsigned int cpu)
                 goto out;
         }

+        /* Ensure that we are not runnable on dying cpu */
+        old_allowed = current->cpus_allowed;
+        tmp = CPU_MASK_ALL;
+        cpu_clear(cpu, tmp);
+        set_cpus_allowed(current, tmp);
+
         p = __stop_machine_run(take_cpu_down, NULL, cpu);
         if (IS_ERR(p)) {
                 err = PTR_ERR(p);
-                goto out;
+                goto out_allowed;
         }

         if (cpu_online(cpu))
                 goto out_thread;

-        check_for_tasks(cpu, p);
-
         /* Wait for it to sleep (leaving idle task). */
         while (!idle_cpu(cpu))
                 yield();
@@ -146,10 +153,14 @@ int cpu_down(unsigned int cpu)
             == NOTIFY_BAD)
                 BUG();

+        check_for_tasks(cpu);
+
         cpu_run_sbin_hotplug(cpu, "offline");

 out_thread:
         err = kthread_stop(p);
+out_allowed:
+        set_cpus_allowed(current, old_allowed);
 out:
         unlock_cpu_hotplug();
         return err;
@@ -26,6 +26,7 @@
 #include <linux/binfmts.h>
 #include <linux/mman.h>
 #include <linux/fs.h>
+#include <linux/cpu.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/jiffies.h>
@@ -1196,8 +1197,15 @@ long do_fork(unsigned long clone_flags,
                         wake_up_forked_thread(p);
                 else
                         wake_up_forked_process(p);
-        } else
+        } else {
+                int cpu = get_cpu();
+
                 p->state = TASK_STOPPED;
+                if (cpu_is_offline(task_cpu(p)))
+                        set_task_cpu(p, cpu);
+
+                put_cpu();
+        }
         ++total_forks;

         if (unlikely (trace)) {
@@ -331,7 +331,6 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array)
         p->array = array;
 }

-#ifdef CONFIG_SMP
 /*
  * Used by the migration code - we pull tasks from the head of the
  * remote queue so we want these tasks to show up at the head of the
@@ -344,7 +343,6 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
         array->nr_active++;
         p->array = array;
 }
-#endif

 /*
  * effective_prio - return the priority that is based on the static
@@ -386,6 +384,15 @@ static inline void __activate_task(task_t *p, runqueue_t *rq)
         rq->nr_running++;
 }

+/*
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
+ */
+static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
+{
+        enqueue_task_head(p, rq->active);
+        rq->nr_running++;
+}
+
 static void recalc_task_prio(task_t *p, unsigned long long now)
 {
         unsigned long long __sleep_time = now - p->timestamp;
@@ -749,7 +756,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
         this_cpu = smp_processor_id();

 #ifdef CONFIG_SMP
-        if (unlikely(task_running(rq, p) || cpu_is_offline(this_cpu)))
+        if (unlikely(task_running(rq, p)))
                 goto out_activate;

         new_cpu = cpu;
@@ -1781,9 +1788,6 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
 {
         struct sched_domain *sd;

-        if (unlikely(cpu_is_offline(this_cpu)))
-                return;
-
         for_each_domain(this_cpu, sd) {
                 if (sd->flags & SD_BALANCE_NEWIDLE) {
                         if (load_balance_newidle(this_cpu, this_rq, sd)) {
@@ -1871,9 +1875,6 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
         unsigned long j = jiffies + CPU_OFFSET(this_cpu);
         struct sched_domain *sd;

-        if (unlikely(cpu_is_offline(this_cpu)))
-                return;
-
         /* Update our load */
         old_load = this_rq->cpu_load;
         this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
@@ -3325,18 +3326,19 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
  * So we race with normal scheduler movements, but that's OK, as long
  * as the task is no longer on this CPU.
  */
-static void __migrate_task(struct task_struct *p, int dest_cpu)
+static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 {
-        runqueue_t *rq_dest;
+        runqueue_t *rq_dest, *rq_src;

         if (unlikely(cpu_is_offline(dest_cpu)))
                 return;

+        rq_src = cpu_rq(src_cpu);
         rq_dest = cpu_rq(dest_cpu);

-        double_rq_lock(this_rq(), rq_dest);
+        double_rq_lock(rq_src, rq_dest);
         /* Already moved. */
-        if (task_cpu(p) != smp_processor_id())
+        if (task_cpu(p) != src_cpu)
                 goto out;
         /* Affinity changed (again). */
         if (!cpu_isset(dest_cpu, p->cpus_allowed))
@@ -3344,7 +3346,7 @@ static void __migrate_task(struct task_struct *p, int dest_cpu)

         set_task_cpu(p, dest_cpu);
         if (p->array) {
-                deactivate_task(p, this_rq());
+                deactivate_task(p, rq_src);
                 activate_task(p, rq_dest);
                 if (TASK_PREEMPTS_CURR(p, rq_dest))
                         resched_task(rq_dest->curr);
@@ -3352,7 +3354,7 @@ static void __migrate_task(struct task_struct *p, int dest_cpu)
                 p->timestamp = rq_dest->timestamp_last_tick;

 out:
-        double_rq_unlock(this_rq(), rq_dest);
+        double_rq_unlock(rq_src, rq_dest);
 }

 /*
@@ -3376,6 +3378,12 @@ static int migration_thread(void * data)
                         refrigerator(PF_FREEZE);

                 spin_lock_irq(&rq->lock);
+
+                if (cpu_is_offline(cpu)) {
+                        spin_unlock_irq(&rq->lock);
+                        goto wait_to_die;
+                }
+
                 if (rq->active_balance) {
                         active_load_balance(rq, cpu);
                         rq->active_balance = 0;
@@ -3394,7 +3402,8 @@ static int migration_thread(void * data)

                 if (req->type == REQ_MOVE_TASK) {
                         spin_unlock(&rq->lock);
-                        __migrate_task(req->task, req->dest_cpu);
+                        __migrate_task(req->task, smp_processor_id(),
+                                        req->dest_cpu);
                         local_irq_enable();
                 } else if (req->type == REQ_SET_DOMAIN) {
                         rq->sd = req->sd;
@@ -3407,23 +3416,27 @@ static int migration_thread(void * data)
                 complete(&req->done);
         }
         return 0;
+
+wait_to_die:
+        /* Wait for kthread_stop */
+        set_current_state(TASK_INTERRUPTIBLE);
+        while (!kthread_should_stop()) {
+                schedule();
+                set_current_state(TASK_INTERRUPTIBLE);
+        }
+        __set_current_state(TASK_RUNNING);
+        return 0;
 }

 #ifdef CONFIG_HOTPLUG_CPU
-/* migrate_all_tasks - function to migrate all the tasks from the
- * current cpu caller must have already scheduled this to the target
- * cpu via set_cpus_allowed. Machine is stopped. */
-void migrate_all_tasks(void)
+/* migrate_all_tasks - function to migrate all tasks from the dead cpu. */
+static void migrate_all_tasks(int src_cpu)
 {
         struct task_struct *tsk, *t;
-        int dest_cpu, src_cpu;
+        int dest_cpu;
         unsigned int node;

-        /* We're nailed to this CPU. */
-        src_cpu = smp_processor_id();
-
-        /* Not required, but here for neatness. */
-        write_lock(&tasklist_lock);
+        write_lock_irq(&tasklist_lock);

         /* watch out for per node tasks, let's stay on this node */
         node = cpu_to_node(src_cpu);
@@ -3459,10 +3472,36 @@ void migrate_all_tasks(void)
                                 tsk->pid, tsk->comm, src_cpu);
                 }

-                __migrate_task(tsk, dest_cpu);
+                __migrate_task(tsk, src_cpu, dest_cpu);
         } while_each_thread(t, tsk);

-        write_unlock(&tasklist_lock);
+        write_unlock_irq(&tasklist_lock);
+}
+
+/* Schedules idle task to be the next runnable task on current CPU.
+ * It does so by boosting its priority to highest possible and adding it to
+ * the _front_ of runqueue. Used by CPU offline code.
+ */
+void sched_idle_next(void)
+{
+        int cpu = smp_processor_id();
+        runqueue_t *rq = this_rq();
+        struct task_struct *p = rq->idle;
+        unsigned long flags;
+
+        /* cpu has to be offline */
+        BUG_ON(cpu_online(cpu));
+
+        /* Strictly not necessary since rest of the CPUs are stopped by now
+         * and interrupts disabled on current cpu.
+         */
+        spin_lock_irqsave(&rq->lock, flags);
+
+        __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
+        /* Add idle task to _front_ of it's priority queue */
+        __activate_idle_task(p, rq);
+
+        spin_unlock_irqrestore(&rq->lock, flags);
 }

 #endif /* CONFIG_HOTPLUG_CPU */
@@ -3498,11 +3537,20 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
         case CPU_UP_CANCELED:
                 /* Unbind it from offline cpu so it can run. Fall thru. */
                 kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
+                kthread_stop(cpu_rq(cpu)->migration_thread);
+                cpu_rq(cpu)->migration_thread = NULL;
+                break;
         case CPU_DEAD:
+                migrate_all_tasks(cpu);
                 rq = cpu_rq(cpu);
                 kthread_stop(rq->migration_thread);
                 rq->migration_thread = NULL;
-                BUG_ON(rq->nr_running != 0);
+                /* Idle task back to normal (off runqueue, low prio) */
+                rq = task_rq_lock(rq->idle, &flags);
+                deactivate_task(rq->idle, rq);
+                __setscheduler(rq->idle, SCHED_NORMAL, MAX_PRIO);
+                task_rq_unlock(rq, &flags);
+                BUG_ON(rq->nr_running != 0);

                 /* No need to migrate the tasks: it was best-effort if
                  * they didn't do lock_cpu_hotplug(). Just wake up
@@ -3523,8 +3571,12 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
         return NOTIFY_OK;
 }

+/* Register at highest priority so that task migration (migrate_all_tasks)
+ * happens before everything else.
+ */
 static struct notifier_block __devinitdata migration_notifier = {
         .notifier_call = migration_call,
+        .priority = 10
 };

 int __init migration_init(void)