Commit f01419fd authored by Martin J. Bligh, committed by Justin T. Gibbs

[PATCH] (2/3) Initial load balancing

Patch from Michael Hohnbaum

This adds a hook, sched_balance_exec(), to the exec code, so that the
exec'ed task is placed on the least loaded queue. There is less state
to move at exec time than at fork time, making this the cheapest point
for cross-node migration. Experience with Dynix/PTX and testing on Linux
have both confirmed that exec is the best time to move tasks between nodes.
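
The placement decision is easy to picture in isolation. Below is a minimal
standalone sketch of the idea only: pick the least loaded node, then the
least loaded CPU within it. NODES, CPUS_PER_NODE, node_load[] and cpu_load[]
are made-up stand-ins for the kernel's per-node counters and runqueues; this
is not the code in the patch.

	/* Illustrative sketch of exec-time placement; stand-in data, not kernel code. */
	#include <stdio.h>

	#define NODES		2
	#define CPUS_PER_NODE	4

	static int node_load[NODES];			/* running tasks per node */
	static int cpu_load[NODES * CPUS_PER_NODE];	/* runqueue length per CPU */

	static int pick_best_cpu(void)
	{
		int node = 0, cpu, best, i;

		/* Stage 1: least loaded node, by the per-node counter. */
		for (i = 1; i < NODES; i++)
			if (node_load[i] < node_load[node])
				node = i;

		/* Stage 2: least loaded CPU on that node. */
		best = node * CPUS_PER_NODE;
		for (cpu = best + 1; cpu < best + CPUS_PER_NODE; cpu++)
			if (cpu_load[cpu] < cpu_load[best])
				best = cpu;

		return best;
	}

	int main(void)
	{
		node_load[0] = 5;  cpu_load[0] = 2;  cpu_load[1] = 3;	/* busy node 0 */
		node_load[1] = 1;  cpu_load[5] = 1;			/* quiet node 1 */
		printf("exec'ing task would be placed on CPU %d\n", pick_best_cpu());
		return 0;
	}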

It also macro-wraps changes to nr_running, to allow us to keep track of
per-node nr_running as well. Again, no impact on non-NUMA machines.
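
The wrapping itself is the usual pattern of hiding optional bookkeeping
behind a config option. A rough standalone sketch of that pattern follows,
with a simplified struct rq and C11 atomics standing in for the kernel's
types; it is not the actual patch code.

	/* Sketch of the CONFIG_NUMA counter-wrapping pattern; simplified types. */
	#include <stdatomic.h>

	struct rq {
		int nr_running;			/* per-runqueue count (always kept) */
		atomic_int *node_nr_running;	/* points at this CPU's node counter */
	};

	#ifdef CONFIG_NUMA
	static inline void nr_running_inc(struct rq *rq)
	{
		atomic_fetch_add(rq->node_nr_running, 1);	/* per-node count */
		rq->nr_running++;				/* per-runqueue count */
	}
	#else
	/* Non-NUMA builds compile down to the bare increment -- no extra cost. */
	#define nr_running_inc(rq)	do { (rq)->nr_running++; } while (0)
	#endif

	int main(void)
	{
		atomic_int node0 = 0;
		struct rq rq0 = { 0, &node0 };

		nr_running_inc(&rq0);	/* bumps both counters on NUMA builds */
		return 0;
	}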
parent 5f24fe82
@@ -1031,6 +1031,8 @@ int do_execve(char * filename, char ** argv, char ** envp, struct pt_regs * regs
 	int retval;
 	int i;
 
+	sched_balance_exec();
+
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
......
@@ -447,6 +447,14 @@ extern void set_cpus_allowed(task_t *p, unsigned long new_mask);
 # define set_cpus_allowed(p, new_mask) do { } while (0)
 #endif
 
+#ifdef CONFIG_NUMA
+extern void sched_balance_exec(void);
+extern void node_nr_running_init(void);
+#else
+#define sched_balance_exec() {}
+#define node_nr_running_init() {}
+#endif
+
 extern void set_user_nice(task_t *p, long nice);
 extern int task_prio(task_t *p);
 extern int task_nice(task_t *p);
......
@@ -495,6 +495,7 @@ static void do_pre_smp_initcalls(void)
 	migration_init();
 #endif
+	node_nr_running_init();
 	spawn_ksoftirqd();
 }
......
@@ -153,7 +153,9 @@ struct runqueue {
 	task_t *curr, *idle;
 	prio_array_t *active, *expired, arrays[2];
 	int prev_nr_running[NR_CPUS];
-
+#ifdef CONFIG_NUMA
+	atomic_t *node_nr_running;
+#endif
 	task_t *migration_thread;
 	struct list_head migration_queue;
@@ -177,6 +179,48 @@ static struct runqueue runqueues[NR_CPUS] __cacheline_aligned;
 # define task_running(rq, p) ((rq)->curr == (p))
 #endif
 
+#ifdef CONFIG_NUMA
+
+/*
+ * Keep track of running tasks.
+ */
+
+static atomic_t node_nr_running[MAX_NUMNODES] ____cacheline_maxaligned_in_smp =
+	{[0 ...MAX_NUMNODES-1] = ATOMIC_INIT(0)};
+
+static inline void nr_running_init(struct runqueue *rq)
+{
+	rq->node_nr_running = &node_nr_running[0];
+}
+
+static inline void nr_running_inc(runqueue_t *rq)
+{
+	atomic_inc(rq->node_nr_running);
+	rq->nr_running++;
+}
+
+static inline void nr_running_dec(runqueue_t *rq)
+{
+	atomic_dec(rq->node_nr_running);
+	rq->nr_running--;
+}
+
+__init void node_nr_running_init(void)
+{
+	int i;
+
+	for (i = 0; i < NR_CPUS; i++)
+		cpu_rq(i)->node_nr_running = &node_nr_running[__cpu_to_node(i)];
+}
+
+#else /* !CONFIG_NUMA */
+
+# define nr_running_init(rq) do { } while (0)
+# define nr_running_inc(rq) do { (rq)->nr_running++; } while (0)
+# define nr_running_dec(rq) do { (rq)->nr_running--; } while (0)
+
+#endif /* CONFIG_NUMA */
+
 /*
  * task_rq_lock - lock the runqueue a given task resides on and disable
  * interrupts. Note the ordering: we can safely lookup the task_rq without
@@ -294,7 +338,7 @@ static inline void activate_task(task_t *p, runqueue_t *rq)
 		p->prio = effective_prio(p);
 	}
 	enqueue_task(p, array);
-	rq->nr_running++;
+	nr_running_inc(rq);
 }
 
 /*
@@ -302,7 +346,7 @@ static inline void activate_task(task_t *p, runqueue_t *rq)
  */
 static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
 {
-	rq->nr_running--;
+	nr_running_dec(rq);
 	if (p->state == TASK_UNINTERRUPTIBLE)
 		rq->nr_uninterruptible++;
 	dequeue_task(p, p->array);
@@ -624,7 +668,72 @@ static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
 		spin_unlock(&rq2->lock);
 }
 
-#ifdef CONFIG_NUMA
+#if CONFIG_NUMA
+/*
+ * If dest_cpu is allowed for this process, migrate the task to it.
+ * This is accomplished by forcing the cpu_allowed mask to only
+ * allow dest_cpu, which will force the cpu onto dest_cpu. Then
+ * the cpu_allowed mask is restored.
+ */
+static void sched_migrate_task(task_t *p, int dest_cpu)
+{
+	unsigned long old_mask;
+
+	old_mask = p->cpus_allowed;
+	if (!(old_mask & (1UL << dest_cpu)))
+		return;
+	/* force the process onto the specified CPU */
+	set_cpus_allowed(p, 1UL << dest_cpu);
+
+	/* restore the cpus allowed mask */
+	set_cpus_allowed(p, old_mask);
+}
+
+/*
+ * Find the least loaded CPU.  Slightly favor the current CPU by
+ * setting its runqueue length as the minimum to start.
+ */
+static int sched_best_cpu(struct task_struct *p)
+{
+	int i, minload, load, best_cpu, node = 0;
+	unsigned long cpumask;
+
+	best_cpu = task_cpu(p);
+	if (cpu_rq(best_cpu)->nr_running <= 2)
+		return best_cpu;
+
+	minload = 10000000;
+	for (i = 0; i < numnodes; i++) {
+		load = atomic_read(&node_nr_running[i]);
+		if (load < minload) {
+			minload = load;
+			node = i;
+		}
+	}
+
+	minload = 10000000;
+	cpumask = __node_to_cpu_mask(node);
+	for (i = 0; i < NR_CPUS; ++i) {
+		if (!(cpumask & (1UL << i)))
+			continue;
+		if (cpu_rq(i)->nr_running < minload) {
+			best_cpu = i;
+			minload = cpu_rq(i)->nr_running;
+		}
+	}
+	return best_cpu;
+}
+
+void sched_balance_exec(void)
+{
+	int new_cpu;
+
+	if (numnodes > 1) {
+		new_cpu = sched_best_cpu(current);
+		if (new_cpu != smp_processor_id())
+			sched_migrate_task(current, new_cpu);
+	}
+}
+
 static inline unsigned long cpus_to_balance(int this_cpu)
 {
@@ -752,9 +861,9 @@ static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu,
 static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu)
 {
 	dequeue_task(p, src_array);
-	src_rq->nr_running--;
+	nr_running_dec(src_rq);
 	set_task_cpu(p, this_cpu);
-	this_rq->nr_running++;
+	nr_running_inc(this_rq);
 	enqueue_task(p, this_rq->active);
 	/*
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
@@ -2248,6 +2357,7 @@ void __init sched_init(void)
 		spin_lock_init(&rq->lock);
 		INIT_LIST_HEAD(&rq->migration_queue);
 		atomic_set(&rq->nr_iowait, 0);
+		nr_running_init(rq);
 
 		for (j = 0; j < 2; j++) {
 			array = rq->arrays + j;
......