Commit 81026794 authored by Nick Piggin, committed by Linus Torvalds

[PATCH] sched: improve load balancing pinned tasks

John Hawkes explained the problem best:

	A large number of processes that are pinned to a single CPU results
	in every other CPU's load_balance() seeing this overloaded CPU as
	"busiest", yet move_tasks() never finds a task to pull-migrate.  This
	condition occurs during module unload, but can also occur as a
	denial-of-service using sys_sched_setaffinity().  Several hundred
	CPUs performing this fruitless load_balance() will livelock on the
	busiest CPU's runqueue lock.  A smaller number of CPUs will livelock
	if the pinned task count gets high.

Expanding slightly on John's patch, this one attempts to work out whether the
balancing failure has been due to too many tasks pinned on the runqueue.  This
allows it to be basically invisible to the regular balancing paths (ie.  when
there are no pinned tasks).  We can use this extra knowledge to shut down the
balancing faster, and ensure the migration threads don't start running, which
is another problem observed in the wild.
Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent e0f364f4
...@@ -1632,7 +1632,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, ...@@ -1632,7 +1632,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
*/ */
static inline static inline
int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
struct sched_domain *sd, enum idle_type idle) struct sched_domain *sd, enum idle_type idle, int *all_pinned)
{ {
/* /*
* We do not migrate tasks that are: * We do not migrate tasks that are:
...@@ -1640,10 +1640,12 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, ...@@ -1640,10 +1640,12 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
* 2) cannot be migrated to this CPU due to cpus_allowed, or * 2) cannot be migrated to this CPU due to cpus_allowed, or
* 3) are cache-hot on their current CPU. * 3) are cache-hot on their current CPU.
*/ */
if (task_running(rq, p))
return 0;
if (!cpu_isset(this_cpu, p->cpus_allowed)) if (!cpu_isset(this_cpu, p->cpus_allowed))
return 0; return 0;
*all_pinned = 0;
if (task_running(rq, p))
return 0;
/* /*
* Aggressive migration if: * Aggressive migration if:
...@@ -1656,7 +1658,7 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, ...@@ -1656,7 +1658,7 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
return 1; return 1;
if (task_hot(p, rq->timestamp_last_tick, sd)) if (task_hot(p, rq->timestamp_last_tick, sd))
return 0; return 0;
return 1; return 1;
} }
...@@ -1669,16 +1671,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, ...@@ -1669,16 +1671,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
*/ */
static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
unsigned long max_nr_move, struct sched_domain *sd, unsigned long max_nr_move, struct sched_domain *sd,
enum idle_type idle) enum idle_type idle, int *all_pinned)
{ {
prio_array_t *array, *dst_array; prio_array_t *array, *dst_array;
struct list_head *head, *curr; struct list_head *head, *curr;
int idx, pulled = 0; int idx, pulled = 0, pinned = 0;
task_t *tmp; task_t *tmp;
if (max_nr_move <= 0 || busiest->nr_running <= 1) if (max_nr_move == 0)
goto out; goto out;
pinned = 1;
/* /*
* We first consider expired tasks. Those will likely not be * We first consider expired tasks. Those will likely not be
* executed in the near future, and they are most likely to * executed in the near future, and they are most likely to
...@@ -1717,7 +1721,7 @@ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, ...@@ -1717,7 +1721,7 @@ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
curr = curr->prev; curr = curr->prev;
if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
if (curr != head) if (curr != head)
goto skip_queue; goto skip_queue;
idx++; idx++;
...@@ -1746,6 +1750,9 @@ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, ...@@ -1746,6 +1750,9 @@ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
* inside pull_task(). * inside pull_task().
*/ */
schedstat_add(sd, lb_gained[idle], pulled); schedstat_add(sd, lb_gained[idle], pulled);
if (all_pinned)
*all_pinned = pinned;
return pulled; return pulled;
} }
...@@ -1917,7 +1924,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, ...@@ -1917,7 +1924,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
struct sched_group *group; struct sched_group *group;
runqueue_t *busiest; runqueue_t *busiest;
unsigned long imbalance; unsigned long imbalance;
int nr_moved; int nr_moved, all_pinned;
int active_balance = 0;
spin_lock(&this_rq->lock); spin_lock(&this_rq->lock);
schedstat_inc(sd, lb_cnt[idle]); schedstat_inc(sd, lb_cnt[idle]);
...@@ -1956,9 +1964,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, ...@@ -1956,9 +1964,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
*/ */
double_lock_balance(this_rq, busiest); double_lock_balance(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest, nr_moved = move_tasks(this_rq, this_cpu, busiest,
imbalance, sd, idle); imbalance, sd, idle,
&all_pinned);
spin_unlock(&busiest->lock); spin_unlock(&busiest->lock);
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(all_pinned))
goto out_balanced;
} }
spin_unlock(&this_rq->lock); spin_unlock(&this_rq->lock);
if (!nr_moved) { if (!nr_moved) {
...@@ -1966,16 +1980,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, ...@@ -1966,16 +1980,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
sd->nr_balance_failed++; sd->nr_balance_failed++;
if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
int wake = 0;
spin_lock(&busiest->lock); spin_lock(&busiest->lock);
if (!busiest->active_balance) { if (!busiest->active_balance) {
busiest->active_balance = 1; busiest->active_balance = 1;
busiest->push_cpu = this_cpu; busiest->push_cpu = this_cpu;
wake = 1; active_balance = 1;
} }
spin_unlock(&busiest->lock); spin_unlock(&busiest->lock);
if (wake) if (active_balance)
wake_up_process(busiest->migration_thread); wake_up_process(busiest->migration_thread);
/* /*
...@@ -1984,18 +1997,21 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, ...@@ -1984,18 +1997,21 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
*/ */
sd->nr_balance_failed = sd->cache_nice_tries; sd->nr_balance_failed = sd->cache_nice_tries;
} }
} else
/*
* We were unbalanced, but unsuccessful in move_tasks(),
* so bump the balance_interval to lessen the lock contention.
*/
if (sd->balance_interval < sd->max_interval)
sd->balance_interval++;
} else {
sd->nr_balance_failed = 0; sd->nr_balance_failed = 0;
if (likely(!active_balance)) {
/* We were unbalanced, so reset the balancing interval */ /* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval; sd->balance_interval = sd->min_interval;
} else {
/*
* If we've begun active balancing, start to back off. This
* case may not be covered by the all_pinned logic if there
* is only 1 task on the busy runqueue (because we don't call
* move_tasks).
*/
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
} }
return nr_moved; return nr_moved;
...@@ -2047,7 +2063,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, ...@@ -2047,7 +2063,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
nr_moved = move_tasks(this_rq, this_cpu, busiest, nr_moved = move_tasks(this_rq, this_cpu, busiest,
imbalance, sd, NEWLY_IDLE); imbalance, sd, NEWLY_IDLE, NULL);
if (!nr_moved) if (!nr_moved)
schedstat_inc(sd, lb_failed[NEWLY_IDLE]); schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
...@@ -2126,7 +2142,7 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) ...@@ -2126,7 +2142,7 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
/* move a task from busiest_rq to target_rq */ /* move a task from busiest_rq to target_rq */
double_lock_balance(busiest_rq, target_rq); double_lock_balance(busiest_rq, target_rq);
if (move_tasks(target_rq, cpu, busiest_rq, if (move_tasks(target_rq, cpu, busiest_rq,
1, sd, SCHED_IDLE)) { 1, sd, SCHED_IDLE, NULL)) {
schedstat_inc(sd, alb_pushed); schedstat_inc(sd, alb_pushed);
} else { } else {
schedstat_inc(sd, alb_failed); schedstat_inc(sd, alb_failed);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment