Commit 3632d86a authored by Nick Piggin, committed by Linus Torvalds

[PATCH] sched: cleanup, improve sched <=> fork APIs

Move balancing and child-runs-first logic from fork.c into sched.c where
it belongs.

* Consolidate wake_up_forked_process and wake_up_forked_thread into
  wake_up_new_process, and pass in clone_flags as suggested by Linus.  This
  removes a lot of code duplication and allows all logic to be handled in that
  function.

* Don't do balance-on-clone balancing for vfork'ed threads.

* Don't do set_task_cpu or balance-on-clone in wake_up_new_process.
  Instead do it in sched_fork, to fix set_cpus_allowed races.

* Don't do child-runs-first for CLONE_VM processes, as there is obviously no
  COW benefit to be had (a sketch of the resulting policy follows this list).
  This is a big one: it enables Andi's workload to run well without clone
  balancing, because the OpenMP child threads can get balanced off to other
  nodes *before* they start running and allocating memory.

* Rename sched_balance_exec to sched_exec: hide the policy from the API.
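
For readers skimming the hunks below, the two clone_flags decisions described
above reduce to a pair of predicates: balance-on-clone eligibility (checked in
sched_fork) and child-runs-first eligibility (checked in wake_up_new_task).
The following stand-alone C sketch is an illustration only, not kernel code;
the CLONE_* values are the standard ones from the kernel ABI:

/* sketch.c - illustrates the fork-time policy introduced by this patch */
#include <stdio.h>

#define CLONE_VM	0x00000100	/* child shares the parent's address space */
#define CLONE_VFORK	0x00004000	/* parent blocks until child releases its mm */

/* Balance on clone only for real threads, never for vfork'ed children. */
static int balance_on_clone(unsigned long clone_flags)
{
	return (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM;
}

/* Child runs first only when the VM is copied, i.e. there is COW to avoid. */
static int child_runs_first(unsigned long clone_flags)
{
	return !(clone_flags & CLONE_VM);
}

int main(void)
{
	unsigned long fork_flags = 0;				/* plain fork() */
	unsigned long thread_flags = CLONE_VM;			/* pthread-style clone() */
	unsigned long vfork_flags = CLONE_VM | CLONE_VFORK;	/* vfork() */

	printf("fork():  balance=%d child-runs-first=%d\n",
	       balance_on_clone(fork_flags), child_runs_first(fork_flags));
	printf("thread:  balance=%d child-runs-first=%d\n",
	       balance_on_clone(thread_flags), child_runs_first(thread_flags));
	printf("vfork(): balance=%d child-runs-first=%d\n",
	       balance_on_clone(vfork_flags), child_runs_first(vfork_flags));
	return 0;
}

So a plain fork() gets child-runs-first but no clone balancing, a CLONE_VM
thread gets balanced on clone but runs after its parent, and a vfork'ed child
gets neither.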


From: Ingo Molnar <mingo@elte.hu>

  Rename wake_up_new_process -> wake_up_new_task.

  In sched.c we are gradually moving away from the overloaded 'process' or
  'thread' notion to the traditional task (or context) naming.

Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 70a0b8e7
@@ -1373,14 +1373,14 @@ int compat_do_execve(char * filename,
 	int retval;
 	int i;
-	sched_balance_exec();
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
 	if (IS_ERR(file))
 		return retval;
+	sched_exec();
 	bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
 	memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0]));
...
@@ -1102,7 +1102,7 @@ int do_execve(char * filename,
 	if (IS_ERR(file))
 		return retval;
-	sched_balance_exec();
+	sched_exec();
 	bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
 	memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0]));
...
@@ -692,10 +692,11 @@ static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask)
 extern unsigned long long sched_clock(void);
+/* sched_exec is called by processes performing an exec */
 #ifdef CONFIG_SMP
-extern void sched_balance_exec(void);
+extern void sched_exec(void);
 #else
-#define sched_balance_exec() {}
+#define sched_exec() {}
 #endif
 extern void sched_idle_next(void);
@@ -754,18 +755,14 @@ extern void do_timer(struct pt_regs *);
 extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state));
 extern int FASTCALL(wake_up_process(struct task_struct * tsk));
-extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));
+extern void FASTCALL(wake_up_new_task(struct task_struct * tsk,
+					unsigned long clone_flags));
 #ifdef CONFIG_SMP
 extern void kick_process(struct task_struct *tsk);
-extern void FASTCALL(wake_up_forked_thread(struct task_struct * tsk));
 #else
 static inline void kick_process(struct task_struct *tsk) { }
-static inline void wake_up_forked_thread(struct task_struct * tsk)
-{
-	wake_up_forked_process(tsk);
-}
 #endif
-extern void FASTCALL(sched_fork(task_t * p));
+extern void FASTCALL(sched_fork(task_t * p, unsigned long clone_flags));
 extern void FASTCALL(sched_exit(task_t * p));
 extern int in_group_p(gid_t);
...
@@ -893,6 +893,16 @@ struct task_struct *copy_process(unsigned long clone_flags,
 	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
 		return ERR_PTR(-EINVAL);
+	/*
+	 * The newly dup'ed task shares the same cpus_allowed mask as its
+	 * parent (ie. current), and it is not attached to the tasklist.
+	 * The end result is that this CPU might go down and the parent
+	 * be migrated away, leaving the task on a dead CPU. So take the
+	 * hotplug lock here and release it after the child has been attached
+	 * to the tasklist.
+	 */
+	lock_cpu_hotplug();
 	retval = security_task_create(clone_flags);
 	if (retval)
 		goto fork_out;
@@ -1020,7 +1030,7 @@ struct task_struct *copy_process(unsigned long clone_flags,
 	p->pdeath_signal = 0;
 	/* Perform scheduler related setup */
-	sched_fork(p);
+	sched_fork(p, clone_flags);
 	/*
 	 * Ok, make it visible to the rest of the system.
@@ -1098,6 +1108,7 @@ struct task_struct *copy_process(unsigned long clone_flags,
 	retval = 0;
 fork_out:
+	unlock_cpu_hotplug();
 	if (retval)
 		return ERR_PTR(retval);
 	return p;
@@ -1204,31 +1215,10 @@ long do_fork(unsigned long clone_flags,
 			set_tsk_thread_flag(p, TIF_SIGPENDING);
 		}
-		if (!(clone_flags & CLONE_STOPPED)) {
-			/*
-			 * Do the wakeup last. On SMP we treat fork() and
-			 * CLONE_VM separately, because fork() has already
-			 * created cache footprint on this CPU (due to
-			 * copying the pagetables), hence migration would
-			 * probably be costy. Threads on the other hand
-			 * have less traction to the current CPU, and if
-			 * there's an imbalance then the scheduler can
-			 * migrate this fresh thread now, before it
-			 * accumulates a larger cache footprint:
-			 */
-			if (clone_flags & CLONE_VM)
-				wake_up_forked_thread(p);
-			else
-				wake_up_forked_process(p);
-		} else {
-			int cpu = get_cpu();
+		if (!(clone_flags & CLONE_STOPPED))
+			wake_up_new_task(p, clone_flags);
+		else
 			p->state = TASK_STOPPED;
-			if (cpu_is_offline(task_cpu(p)))
-				set_task_cpu(p, cpu);
-			put_cpu();
-		}
 		++total_forks;
 		if (unlikely (trace)) {
@@ -1240,12 +1230,7 @@ long do_fork(unsigned long clone_flags,
 			wait_for_completion(&vfork);
 			if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
 				ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
-		} else
-			/*
-			 * Let the child process run first, to avoid most of the
-			 * COW overhead when the child exec()s afterwards.
-			 */
-			set_need_resched();
+		}
 	}
 	return pid;
 }
...
@@ -865,12 +865,50 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
 	return try_to_wake_up(p, state, 0);
 }
+#ifdef CONFIG_SMP
+static int find_idlest_cpu(struct task_struct *p, int this_cpu,
+			   struct sched_domain *sd);
+#endif
 /*
  * Perform scheduler related setup for a newly forked process p.
- * p is forked by current.
+ * p is forked by current. It also does runqueue balancing.
+ * The cpu hotplug lock is held.
  */
-void fastcall sched_fork(task_t *p)
+void fastcall sched_fork(task_t *p, unsigned long clone_flags)
 {
+	int cpu;
+#ifdef CONFIG_SMP
+	struct sched_domain *tmp, *sd = NULL;
+	preempt_disable();
+	cpu = smp_processor_id();
+	if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) {
+		/*
+		 * New thread that is not a vfork.
+		 * Find the largest domain that this CPU is part of that
+		 * is willing to balance on clone:
+		 */
+		for_each_domain(cpu, tmp)
+			if (tmp->flags & SD_BALANCE_CLONE)
+				sd = tmp;
+		if (sd)
+			cpu = find_idlest_cpu(p, cpu, sd);
+	}
+	preempt_enable();
+#else
+	cpu = smp_processor_id();
+#endif
+	/*
+	 * The task hasn't been attached yet, so cpus_allowed mask cannot
+	 * change. The cpus_allowed mask of the parent may have changed
+	 * after it is copied, and it may then move to a CPU that is not
+	 * allowed for the child.
+	 */
+	if (unlikely(!cpu_isset(cpu, p->cpus_allowed)))
+		cpu = any_online_cpu(p->cpus_allowed);
+	set_task_cpu(p, cpu);
 	/*
 	 * We mark the process as running here, but have not actually
 	 * inserted it onto the runqueue yet. This guarantees that
@@ -920,43 +958,81 @@ void fastcall sched_fork(task_t *p)
 }
 /*
- * wake_up_forked_process - wake up a freshly forked process.
+ * wake_up_new_task - wake up a newly created task for the first time.
  *
  * This function will do some initial scheduler statistics housekeeping
- * that must be done for every newly created process.
+ * that must be done for every newly created context, then puts the task
+ * on the runqueue and wakes it.
  */
-void fastcall wake_up_forked_process(task_t * p)
+void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
 {
 	unsigned long flags;
-	runqueue_t *rq = task_rq_lock(current, &flags);
+	int this_cpu, cpu;
+	runqueue_t *rq;
+
+	rq = task_rq_lock(p, &flags);
+	cpu = task_cpu(p);
+	this_cpu = smp_processor_id();
 	BUG_ON(p->state != TASK_RUNNING);
 	/*
 	 * We decrease the sleep average of forking parents
 	 * and children as well, to keep max-interactive tasks
-	 * from forking tasks that are max-interactive.
+	 * from forking tasks that are max-interactive. The parent
+	 * (current) is done further down, under its lock.
 	 */
-	current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
-		PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
 	p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
 		CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
 	p->interactive_credit = 0;
 	p->prio = effective_prio(p);
-	set_task_cpu(p, smp_processor_id());
-	if (unlikely(!current->array))
+	if (likely(cpu == this_cpu)) {
+		if (!(clone_flags & CLONE_VM)) {
+			/*
+			 * The VM isn't cloned, so we're in a good position to
+			 * do child-runs-first in anticipation of an exec. This
+			 * usually avoids a lot of COW overhead.
+			 */
+			if (unlikely(!current->array))
+				__activate_task(p, rq);
+			else {
+				p->prio = current->prio;
+				list_add_tail(&p->run_list, &current->run_list);
+				p->array = current->array;
+				p->array->nr_active++;
+				rq->nr_running++;
+			}
+			set_need_resched();
+		} else {
+			/* Run child last */
+			__activate_task(p, rq);
+		}
+	} else {
+		runqueue_t *this_rq = cpu_rq(this_cpu);
+
+		/*
+		 * Not the local CPU - must adjust timestamp. This should
+		 * get optimised away in the !CONFIG_SMP case.
+		 */
+		p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
+					+ rq->timestamp_last_tick;
 		__activate_task(p, rq);
-	else {
-		p->prio = current->prio;
-		list_add_tail(&p->run_list, &current->run_list);
-		p->array = current->array;
-		p->array->nr_active++;
-		rq->nr_running++;
+		if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+
+		current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
+			PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
 	}
+	if (unlikely(cpu != this_cpu)) {
+		task_rq_unlock(rq, &flags);
+		rq = task_rq_lock(current, &flags);
+	}
+	current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
+		PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
 	task_rq_unlock(rq, &flags);
 }
@@ -1225,89 +1301,6 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu,
 	return this_cpu;
 }
-/*
- * wake_up_forked_thread - wake up a freshly forked thread.
- *
- * This function will do some initial scheduler statistics housekeeping
- * that must be done for every newly created context, and it also does
- * runqueue balancing.
- */
-void fastcall wake_up_forked_thread(task_t * p)
-{
-	unsigned long flags;
-	int this_cpu = get_cpu(), cpu;
-	struct sched_domain *tmp, *sd = NULL;
-	runqueue_t *this_rq = cpu_rq(this_cpu), *rq;
-	/*
-	 * Find the largest domain that this CPU is part of that
-	 * is willing to balance on clone:
-	 */
-	for_each_domain(this_cpu, tmp)
-		if (tmp->flags & SD_BALANCE_CLONE)
-			sd = tmp;
-	if (sd)
-		cpu = find_idlest_cpu(p, this_cpu, sd);
-	else
-		cpu = this_cpu;
-	local_irq_save(flags);
-lock_again:
-	rq = cpu_rq(cpu);
-	double_rq_lock(this_rq, rq);
-	BUG_ON(p->state != TASK_RUNNING);
-	/*
-	 * We did find_idlest_cpu() unlocked, so in theory
-	 * the mask could have changed - just dont migrate
-	 * in this case:
-	 */
-	if (unlikely(!cpu_isset(cpu, p->cpus_allowed))) {
-		cpu = this_cpu;
-		double_rq_unlock(this_rq, rq);
-		goto lock_again;
-	}
-	/*
-	 * We decrease the sleep average of forking parents
-	 * and children as well, to keep max-interactive tasks
-	 * from forking tasks that are max-interactive.
-	 */
-	current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
-		PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
-	p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
-		CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
-	p->interactive_credit = 0;
-	p->prio = effective_prio(p);
-	set_task_cpu(p, cpu);
-	if (cpu == this_cpu) {
-		if (unlikely(!current->array))
-			__activate_task(p, rq);
-		else {
-			p->prio = current->prio;
-			list_add_tail(&p->run_list, &current->run_list);
-			p->array = current->array;
-			p->array->nr_active++;
-			rq->nr_running++;
-		}
-	} else {
-		/* Not the local CPU - must adjust timestamp */
-		p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
-			+ rq->timestamp_last_tick;
-		__activate_task(p, rq);
-		if (TASK_PREEMPTS_CURR(p, rq))
-			resched_task(rq->curr);
-	}
-	double_rq_unlock(this_rq, rq);
-	local_irq_restore(flags);
-	put_cpu();
-}
 /*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
@@ -1341,13 +1334,13 @@ static void sched_migrate_task(task_t *p, int dest_cpu)
 }
 /*
- * sched_balance_exec(): find the highest-level, exec-balance-capable
+ * sched_exec(): find the highest-level, exec-balance-capable
  * domain and try to migrate the task to the least loaded CPU.
  *
  * execve() is a valuable balancing opportunity, because at this point
  * the task has the smallest effective memory and cache footprint.
  */
-void sched_balance_exec(void)
+void sched_exec(void)
 {
 	struct sched_domain *tmp, *sd = NULL;
 	int new_cpu, this_cpu = get_cpu();
...