Commit 163122b7 authored by Kirill Tkhai's avatar Kirill Tkhai Committed by Ingo Molnar

sched/fair: Remove double_lock_balance() from load_balance()

Avoid double_rq_lock() and use TASK_ON_RQ_MIGRATING for
load_balance(). The advantage is (obviously) not holding two
rq->lock's at the same time and thereby increasing parallelism.

Further note that if there was no task to migrate we will not
have acquired the second rq->lock at all.

The important point to note is that because we acquire dst->lock
immediately after releasing src->lock the potential wait time of
task_rq_lock() callers on TASK_ON_RQ_MIGRATING is not longer
than it would have been in the double rq lock scenario.
Signed-off-by: default avatarKirill Tkhai <ktkhai@parallels.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Turner <pjt@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Mike Galbraith <umgwanakikbuti@gmail.com>
Cc: Kirill Tkhai <tkhai@yandex.ru>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Nicolas Pitre <nicolas.pitre@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1408528109.23412.94.camel@tkhaiSigned-off-by: default avatarIngo Molnar <mingo@kernel.org>
parent e5673f28
...@@ -4709,7 +4709,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ ...@@ -4709,7 +4709,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return; return;
/* /*
* This is possible from callers such as move_task(), in which we * This is possible from callers such as attach_tasks(), in which we
* unconditionally check_prempt_curr() after an enqueue (which may have * unconditionally check_prempt_curr() after an enqueue (which may have
* lead to a throttle). This both saves work and prevents false * lead to a throttle). This both saves work and prevents false
* next-buddy nomination below. * next-buddy nomination below.
...@@ -5117,20 +5117,9 @@ struct lb_env { ...@@ -5117,20 +5117,9 @@ struct lb_env {
unsigned int loop_max; unsigned int loop_max;
enum fbq_type fbq_type; enum fbq_type fbq_type;
struct list_head tasks;
}; };
/*
* move_task - move a task from one runqueue to another runqueue.
* Both runqueues must be locked.
*/
static void move_task(struct task_struct *p, struct lb_env *env)
{
deactivate_task(env->src_rq, p, 0);
set_task_cpu(p, env->dst_cpu);
activate_task(env->dst_rq, p, 0);
check_preempt_curr(env->dst_rq, p, 0);
}
/* /*
* Is this task likely cache-hot: * Is this task likely cache-hot:
*/ */
...@@ -5345,6 +5334,18 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) ...@@ -5345,6 +5334,18 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
return 0; return 0;
} }
/*
* detach_task() -- detach the task for the migration specified in env
*/
static void detach_task(struct task_struct *p, struct lb_env *env)
{
lockdep_assert_held(&env->src_rq->lock);
deactivate_task(env->src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, env->dst_cpu);
}
/* /*
* detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
* part of active balancing operations within "domain". * part of active balancing operations within "domain".
...@@ -5361,15 +5362,13 @@ static struct task_struct *detach_one_task(struct lb_env *env) ...@@ -5361,15 +5362,13 @@ static struct task_struct *detach_one_task(struct lb_env *env)
if (!can_migrate_task(p, env)) if (!can_migrate_task(p, env))
continue; continue;
deactivate_task(env->src_rq, p, 0); detach_task(p, env);
p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, env->dst_cpu);
/* /*
* Right now, this is only the second place where * Right now, this is only the second place where
* lb_gained[env->idle] is updated (other is move_tasks) * lb_gained[env->idle] is updated (other is detach_tasks)
* so we can safely collect stats here rather than * so we can safely collect stats here rather than
* inside move_tasks(). * inside detach_tasks().
*/ */
schedstat_inc(env->sd, lb_gained[env->idle]); schedstat_inc(env->sd, lb_gained[env->idle]);
return p; return p;
...@@ -5377,35 +5376,22 @@ static struct task_struct *detach_one_task(struct lb_env *env) ...@@ -5377,35 +5376,22 @@ static struct task_struct *detach_one_task(struct lb_env *env)
return NULL; return NULL;
} }
/*
* attach_one_task() -- attaches the task returned from detach_one_task() to
* its new rq.
*/
static void attach_one_task(struct rq *rq, struct task_struct *p)
{
raw_spin_lock(&rq->lock);
BUG_ON(task_rq(p) != rq);
p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(rq, p, 0);
check_preempt_curr(rq, p, 0);
raw_spin_unlock(&rq->lock);
}
static const unsigned int sched_nr_migrate_break = 32; static const unsigned int sched_nr_migrate_break = 32;
/* /*
* move_tasks tries to move up to imbalance weighted load from busiest to * detach_tasks() -- tries to detach up to imbalance weighted load from
* this_rq, as part of a balancing operation within domain "sd". * busiest_rq, as part of a balancing operation within domain "sd".
* Returns 1 if successful and 0 otherwise.
* *
* Called with both runqueues locked. * Returns number of detached tasks if successful and 0 otherwise.
*/ */
static int move_tasks(struct lb_env *env) static int detach_tasks(struct lb_env *env)
{ {
struct list_head *tasks = &env->src_rq->cfs_tasks; struct list_head *tasks = &env->src_rq->cfs_tasks;
struct task_struct *p; struct task_struct *p;
unsigned long load; unsigned long load;
int pulled = 0; int detached = 0;
lockdep_assert_held(&env->src_rq->lock);
if (env->imbalance <= 0) if (env->imbalance <= 0)
return 0; return 0;
...@@ -5436,14 +5422,16 @@ static int move_tasks(struct lb_env *env) ...@@ -5436,14 +5422,16 @@ static int move_tasks(struct lb_env *env)
if ((load / 2) > env->imbalance) if ((load / 2) > env->imbalance)
goto next; goto next;
move_task(p, env); detach_task(p, env);
pulled++; list_add(&p->se.group_node, &env->tasks);
detached++;
env->imbalance -= load; env->imbalance -= load;
#ifdef CONFIG_PREEMPT #ifdef CONFIG_PREEMPT
/* /*
* NEWIDLE balancing is a source of latency, so preemptible * NEWIDLE balancing is a source of latency, so preemptible
* kernels will stop after the first task is pulled to minimize * kernels will stop after the first task is detached to minimize
* the critical section. * the critical section.
*/ */
if (env->idle == CPU_NEWLY_IDLE) if (env->idle == CPU_NEWLY_IDLE)
...@@ -5463,13 +5451,58 @@ static int move_tasks(struct lb_env *env) ...@@ -5463,13 +5451,58 @@ static int move_tasks(struct lb_env *env)
} }
/* /*
* Right now, this is one of only two places move_task() is called, * Right now, this is one of only two places we collect this stat
* so we can safely collect move_task() stats here rather than * so we can safely collect detach_one_task() stats here rather
* inside move_task(). * than inside detach_one_task().
*/
schedstat_add(env->sd, lb_gained[env->idle], detached);
return detached;
}
/*
* attach_task() -- attach the task detached by detach_task() to its new rq.
*/ */
schedstat_add(env->sd, lb_gained[env->idle], pulled); static void attach_task(struct rq *rq, struct task_struct *p)
{
lockdep_assert_held(&rq->lock);
BUG_ON(task_rq(p) != rq);
p->on_rq = TASK_ON_RQ_QUEUED;
activate_task(rq, p, 0);
check_preempt_curr(rq, p, 0);
}
return pulled; /*
* attach_one_task() -- attaches the task returned from detach_one_task() to
* its new rq.
*/
static void attach_one_task(struct rq *rq, struct task_struct *p)
{
raw_spin_lock(&rq->lock);
attach_task(rq, p);
raw_spin_unlock(&rq->lock);
}
/*
* attach_tasks() -- attaches all tasks detached by detach_tasks() to their
* new rq.
*/
static void attach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->tasks;
struct task_struct *p;
raw_spin_lock(&env->dst_rq->lock);
while (!list_empty(tasks)) {
p = list_first_entry(tasks, struct task_struct, se.group_node);
list_del_init(&p->se.group_node);
attach_task(env->dst_rq, p);
}
raw_spin_unlock(&env->dst_rq->lock);
} }
#ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED
...@@ -6603,6 +6636,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, ...@@ -6603,6 +6636,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.loop_break = sched_nr_migrate_break, .loop_break = sched_nr_migrate_break,
.cpus = cpus, .cpus = cpus,
.fbq_type = all, .fbq_type = all,
.tasks = LIST_HEAD_INIT(env.tasks),
}; };
/* /*
...@@ -6652,16 +6686,29 @@ static int load_balance(int this_cpu, struct rq *this_rq, ...@@ -6652,16 +6686,29 @@ static int load_balance(int this_cpu, struct rq *this_rq,
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance: more_balance:
local_irq_save(flags); raw_spin_lock_irqsave(&busiest->lock, flags);
double_rq_lock(env.dst_rq, busiest);
/* /*
* cur_ld_moved - load moved in current iteration * cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations * ld_moved - cumulative load moved across iterations
*/ */
cur_ld_moved = move_tasks(&env); cur_ld_moved = detach_tasks(&env);
/*
* We've detached some tasks from busiest_rq. Every
* task is masked "TASK_ON_RQ_MIGRATING", so we can safely
* unlock busiest->lock, and we are able to be sure
* that nobody can manipulate the tasks in parallel.
* See task_rq_lock() family for the details.
*/
raw_spin_unlock(&busiest->lock);
if (cur_ld_moved) {
attach_tasks(&env);
ld_moved += cur_ld_moved; ld_moved += cur_ld_moved;
double_rq_unlock(env.dst_rq, busiest); }
local_irq_restore(flags); local_irq_restore(flags);
/* /*
...@@ -6797,7 +6844,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, ...@@ -6797,7 +6844,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
* If we've begun active balancing, start to back off. This * If we've begun active balancing, start to back off. This
* case may not be covered by the all_pinned logic if there * case may not be covered by the all_pinned logic if there
* is only 1 task on the busy runqueue (because we don't call * is only 1 task on the busy runqueue (because we don't call
* move_tasks). * detach_tasks).
*/ */
if (sd->balance_interval < sd->max_interval) if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2; sd->balance_interval *= 2;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment