Commit 3632d86a authored by Nick Piggin, committed by Linus Torvalds

[PATCH] sched: cleanup, improve sched <=> fork APIs

Move balancing and child-runs-first logic from fork.c into sched.c where
it belongs.

* Consolidate wake_up_forked_process and wake_up_forked_thread into
  wake_up_new_process, and pass in clone_flags as suggested by Linus.  This
  removes a lot of code duplication and allows all logic to be handled in that
  function.

* Don't do balance-on-clone for vfork'ed threads.

* Don't do set_task_cpu or balance-on-clone in wake_up_new_process.  Instead
  do it in sched_fork, to fix set_cpus_allowed races (the second sketch below
  this list illustrates the resulting split of responsibilities).

* Don't do child-runs-first for CLONE_VM processes, as there is obviously no
  COW benefit to be had (the first sketch below this list shows both flag
  checks).  This is a big one: it enables Andi's workload to run well without
  clone balancing, because the OpenMP child threads can get balanced off to
  other nodes *before* they start running and allocating memory.

* Rename sched_balance_exec to sched_exec: hide the policy from the API.
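
Purely as an illustration (not part of the patch): a minimal user-space
sketch of the two clone_flags tests described above, built on glibc's
CLONE_* constants; the helper names are invented for this example.

#define _GNU_SOURCE
#include <sched.h>      /* CLONE_VM, CLONE_VFORK */
#include <stdio.h>

/* balance-on-clone is considered only for a new thread that is not a
 * vfork: CLONE_VM set, CLONE_VFORK clear (the test sched_fork performs). */
static int balances_on_clone(unsigned long clone_flags)
{
        return (clone_flags & (CLONE_VM | CLONE_VFORK)) == CLONE_VM;
}

/* child-runs-first applies only when the VM is not shared, since only a
 * fork()-style child gains a COW benefit from running before the parent. */
static int child_runs_first(unsigned long clone_flags)
{
        return !(clone_flags & CLONE_VM);
}

int main(void)
{
        unsigned long cases[] = { 0, CLONE_VM, CLONE_VM | CLONE_VFORK };

        for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
                printf("flags %#lx: balance-on-clone=%d child-runs-first=%d\n",
                       cases[i], balances_on_clone(cases[i]),
                       child_runs_first(cases[i]));
        return 0;
}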

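A second illustrative sketch (again not kernel code; the toy_* names and the
struct are stand-ins) of the ordering this series establishes: the CPU is
settled at sched_fork() time, before the child is attached anywhere, and the
later wakeup only activates the task; the fallback mask check mirrors the one
sched_fork now performs when the chosen CPU is not in the child's cpus_allowed.

#include <stdio.h>

struct toy_task {
        unsigned long cpus_allowed;     /* bitmask of permitted CPUs */
        int cpu;                        /* CPU chosen at fork time */
        int queued;
};

/* Stand-in for sched_fork(): take the balancer's suggestion, but fall back
 * to some allowed CPU if the suggestion is not in the child's own mask. */
static void toy_sched_fork(struct toy_task *p, int suggested_cpu)
{
        p->cpu = suggested_cpu;
        if (!(p->cpus_allowed & (1UL << p->cpu))) {
                for (int c = 0; c < 8 * (int)sizeof(p->cpus_allowed); c++) {
                        if (p->cpus_allowed & (1UL << c)) {
                                p->cpu = c;
                                break;
                        }
                }
        }
}

/* Stand-in for wake_up_new_task(): no placement decision left to make,
 * it only puts the task on the runqueue of the CPU chosen above. */
static void toy_wake_up_new_task(struct toy_task *p)
{
        p->queued = 1;
}

int main(void)
{
        struct toy_task child = { .cpus_allowed = 0x3, .cpu = -1, .queued = 0 };

        toy_sched_fork(&child, 5);      /* balancer suggested CPU 5, not allowed */
        /* ... the child would be attached to the tasklist here ... */
        toy_wake_up_new_task(&child);

        printf("child settled on CPU %d, queued=%d\n", child.cpu, child.queued);
        return 0;
}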

From: Ingo Molnar <mingo@elte.hu>

  rename wake_up_new_process -> wake_up_new_task.

  in sched.c we are gradually moving away from the overloaded 'process' or
  'thread' notion to the traditional task (or context) naming.
Signed-off-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 70a0b8e7
@@ -1373,14 +1373,14 @@ int compat_do_execve(char * filename,
         int retval;
         int i;
-        sched_balance_exec();
         file = open_exec(filename);
         retval = PTR_ERR(file);
         if (IS_ERR(file))
                 return retval;
+        sched_exec();
         bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
         memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0]));
@@ -1102,7 +1102,7 @@ int do_execve(char * filename,
         if (IS_ERR(file))
                 return retval;
-        sched_balance_exec();
+        sched_exec();
         bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
         memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0]));
@@ -692,10 +692,11 @@ static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask)
 extern unsigned long long sched_clock(void);
+/* sched_exec is called by processes performing an exec */
 #ifdef CONFIG_SMP
-extern void sched_balance_exec(void);
+extern void sched_exec(void);
 #else
-#define sched_balance_exec() {}
+#define sched_exec() {}
 #endif
 extern void sched_idle_next(void);
@@ -754,18 +755,14 @@ extern void do_timer(struct pt_regs *);
 extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state));
 extern int FASTCALL(wake_up_process(struct task_struct * tsk));
-extern void FASTCALL(wake_up_forked_process(struct task_struct * tsk));
+extern void FASTCALL(wake_up_new_task(struct task_struct * tsk,
+                                      unsigned long clone_flags));
 #ifdef CONFIG_SMP
 extern void kick_process(struct task_struct *tsk);
-extern void FASTCALL(wake_up_forked_thread(struct task_struct * tsk));
 #else
 static inline void kick_process(struct task_struct *tsk) { }
-static inline void wake_up_forked_thread(struct task_struct * tsk)
-{
-        wake_up_forked_process(tsk);
-}
 #endif
-extern void FASTCALL(sched_fork(task_t * p));
+extern void FASTCALL(sched_fork(task_t * p, unsigned long clone_flags));
 extern void FASTCALL(sched_exit(task_t * p));
 extern int in_group_p(gid_t);
@@ -893,6 +893,16 @@ struct task_struct *copy_process(unsigned long clone_flags,
         if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
                 return ERR_PTR(-EINVAL);
+        /*
+         * The newly dup'ed task shares the same cpus_allowed mask as its
+         * parent (ie. current), and it is not attached to the tasklist.
+         * The end result is that this CPU might go down and the parent
+         * be migrated away, leaving the task on a dead CPU. So take the
+         * hotplug lock here and release it after the child has been attached
+         * to the tasklist.
+         */
+        lock_cpu_hotplug();
         retval = security_task_create(clone_flags);
         if (retval)
                 goto fork_out;
@@ -1020,7 +1030,7 @@ struct task_struct *copy_process(unsigned long clone_flags,
         p->pdeath_signal = 0;
         /* Perform scheduler related setup */
-        sched_fork(p);
+        sched_fork(p, clone_flags);
         /*
          * Ok, make it visible to the rest of the system.
@@ -1098,6 +1108,7 @@ struct task_struct *copy_process(unsigned long clone_flags,
         retval = 0;
 fork_out:
+        unlock_cpu_hotplug();
         if (retval)
                 return ERR_PTR(retval);
         return p;
@@ -1204,31 +1215,10 @@ long do_fork(unsigned long clone_flags,
                         set_tsk_thread_flag(p, TIF_SIGPENDING);
                 }
-                if (!(clone_flags & CLONE_STOPPED)) {
-                        /*
-                         * Do the wakeup last. On SMP we treat fork() and
-                         * CLONE_VM separately, because fork() has already
-                         * created cache footprint on this CPU (due to
-                         * copying the pagetables), hence migration would
-                         * probably be costy. Threads on the other hand
-                         * have less traction to the current CPU, and if
-                         * there's an imbalance then the scheduler can
-                         * migrate this fresh thread now, before it
-                         * accumulates a larger cache footprint:
-                         */
-                        if (clone_flags & CLONE_VM)
-                                wake_up_forked_thread(p);
-                        else
-                                wake_up_forked_process(p);
-                } else {
-                        int cpu = get_cpu();
+                if (!(clone_flags & CLONE_STOPPED))
+                        wake_up_new_task(p, clone_flags);
+                else
                         p->state = TASK_STOPPED;
-                        if (cpu_is_offline(task_cpu(p)))
-                                set_task_cpu(p, cpu);
-                        put_cpu();
-                }
                 ++total_forks;
                 if (unlikely (trace)) {
@@ -1240,12 +1230,7 @@ long do_fork(unsigned long clone_flags,
                         wait_for_completion(&vfork);
                         if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE))
                                 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
-                } else
-                        /*
-                         * Let the child process run first, to avoid most of the
-                         * COW overhead when the child exec()s afterwards.
-                         */
-                        set_need_resched();
+                }
         }
         return pid;
 }
@@ -865,12 +865,50 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
         return try_to_wake_up(p, state, 0);
 }
+#ifdef CONFIG_SMP
+static int find_idlest_cpu(struct task_struct *p, int this_cpu,
+                           struct sched_domain *sd);
+#endif
 /*
  * Perform scheduler related setup for a newly forked process p.
- * p is forked by current.
+ * p is forked by current. It also does runqueue balancing.
+ * The cpu hotplug lock is held.
  */
-void fastcall sched_fork(task_t *p)
+void fastcall sched_fork(task_t *p, unsigned long clone_flags)
 {
+        int cpu;
+#ifdef CONFIG_SMP
+        struct sched_domain *tmp, *sd = NULL;
+        preempt_disable();
+        cpu = smp_processor_id();
+        if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) {
+                /*
+                 * New thread that is not a vfork.
+                 * Find the largest domain that this CPU is part of that
+                 * is willing to balance on clone:
+                 */
+                for_each_domain(cpu, tmp)
+                        if (tmp->flags & SD_BALANCE_CLONE)
+                                sd = tmp;
+                if (sd)
+                        cpu = find_idlest_cpu(p, cpu, sd);
+        }
+        preempt_enable();
+#else
+        cpu = smp_processor_id();
+#endif
+        /*
+         * The task hasn't been attached yet, so cpus_allowed mask cannot
+         * change. The cpus_allowed mask of the parent may have changed
+         * after it is copied, and it may then move to a CPU that is not
+         * allowed for the child.
+         */
+        if (unlikely(!cpu_isset(cpu, p->cpus_allowed)))
+                cpu = any_online_cpu(p->cpus_allowed);
+        set_task_cpu(p, cpu);
         /*
          * We mark the process as running here, but have not actually
          * inserted it onto the runqueue yet. This guarantees that
@@ -920,43 +958,81 @@ void fastcall sched_fork(task_t *p)
 }
 /*
- * wake_up_forked_process - wake up a freshly forked process.
+ * wake_up_new_task - wake up a newly created task for the first time.
  *
  * This function will do some initial scheduler statistics housekeeping
- * that must be done for every newly created process.
+ * that must be done for every newly created context, then puts the task
+ * on the runqueue and wakes it.
  */
-void fastcall wake_up_forked_process(task_t * p)
+void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags)
 {
         unsigned long flags;
-        runqueue_t *rq = task_rq_lock(current, &flags);
+        int this_cpu, cpu;
+        runqueue_t *rq;
+        rq = task_rq_lock(p, &flags);
+        cpu = task_cpu(p);
+        this_cpu = smp_processor_id();
         BUG_ON(p->state != TASK_RUNNING);
         /*
          * We decrease the sleep average of forking parents
          * and children as well, to keep max-interactive tasks
-         * from forking tasks that are max-interactive.
+         * from forking tasks that are max-interactive. The parent
+         * (current) is done further down, under its lock.
          */
-        current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
-                PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
         p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
                 CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
         p->interactive_credit = 0;
         p->prio = effective_prio(p);
-        set_task_cpu(p, smp_processor_id());
-        if (unlikely(!current->array))
+        if (likely(cpu == this_cpu)) {
+                if (!(clone_flags & CLONE_VM)) {
+                        /*
+                         * The VM isn't cloned, so we're in a good position to
+                         * do child-runs-first in anticipation of an exec. This
+                         * usually avoids a lot of COW overhead.
+                         */
+                        if (unlikely(!current->array))
+                                __activate_task(p, rq);
+                        else {
+                                p->prio = current->prio;
+                                list_add_tail(&p->run_list, &current->run_list);
+                                p->array = current->array;
+                                p->array->nr_active++;
+                                rq->nr_running++;
+                        }
+                        set_need_resched();
+                } else {
+                        /* Run child last */
+                        __activate_task(p, rq);
+                }
+        } else {
+                runqueue_t *this_rq = cpu_rq(this_cpu);
+                /*
+                 * Not the local CPU - must adjust timestamp. This should
+                 * get optimised away in the !CONFIG_SMP case.
+                 */
+                p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
+                                        + rq->timestamp_last_tick;
                 __activate_task(p, rq);
-        else {
-                p->prio = current->prio;
-                list_add_tail(&p->run_list, &current->run_list);
-                p->array = current->array;
-                p->array->nr_active++;
-                rq->nr_running++;
+                if (TASK_PREEMPTS_CURR(p, rq))
+                        resched_task(rq->curr);
+                current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
+                        PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
         }
+        if (unlikely(cpu != this_cpu)) {
+                task_rq_unlock(rq, &flags);
+                rq = task_rq_lock(current, &flags);
+        }
+        current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
+                PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
         task_rq_unlock(rq, &flags);
 }
@@ -1225,89 +1301,6 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu,
         return this_cpu;
 }
-/*
- * wake_up_forked_thread - wake up a freshly forked thread.
- *
- * This function will do some initial scheduler statistics housekeeping
- * that must be done for every newly created context, and it also does
- * runqueue balancing.
- */
-void fastcall wake_up_forked_thread(task_t * p)
-{
-        unsigned long flags;
-        int this_cpu = get_cpu(), cpu;
-        struct sched_domain *tmp, *sd = NULL;
-        runqueue_t *this_rq = cpu_rq(this_cpu), *rq;
-        /*
-         * Find the largest domain that this CPU is part of that
-         * is willing to balance on clone:
-         */
-        for_each_domain(this_cpu, tmp)
-                if (tmp->flags & SD_BALANCE_CLONE)
-                        sd = tmp;
-        if (sd)
-                cpu = find_idlest_cpu(p, this_cpu, sd);
-        else
-                cpu = this_cpu;
-        local_irq_save(flags);
-lock_again:
-        rq = cpu_rq(cpu);
-        double_rq_lock(this_rq, rq);
-        BUG_ON(p->state != TASK_RUNNING);
-        /*
-         * We did find_idlest_cpu() unlocked, so in theory
-         * the mask could have changed - just dont migrate
-         * in this case:
-         */
-        if (unlikely(!cpu_isset(cpu, p->cpus_allowed))) {
-                cpu = this_cpu;
-                double_rq_unlock(this_rq, rq);
-                goto lock_again;
-        }
-        /*
-         * We decrease the sleep average of forking parents
-         * and children as well, to keep max-interactive tasks
-         * from forking tasks that are max-interactive.
-         */
-        current->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(current) *
-                PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
-        p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
-                CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
-        p->interactive_credit = 0;
-        p->prio = effective_prio(p);
-        set_task_cpu(p, cpu);
-        if (cpu == this_cpu) {
-                if (unlikely(!current->array))
-                        __activate_task(p, rq);
-                else {
-                        p->prio = current->prio;
-                        list_add_tail(&p->run_list, &current->run_list);
-                        p->array = current->array;
-                        p->array->nr_active++;
-                        rq->nr_running++;
-                }
-        } else {
-                /* Not the local CPU - must adjust timestamp */
-                p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
-                                        + rq->timestamp_last_tick;
-                __activate_task(p, rq);
-                if (TASK_PREEMPTS_CURR(p, rq))
-                        resched_task(rq->curr);
-        }
-        double_rq_unlock(this_rq, rq);
-        local_irq_restore(flags);
-        put_cpu();
-}
 /*
  * If dest_cpu is allowed for this process, migrate the task to it.
  * This is accomplished by forcing the cpu_allowed mask to only
@@ -1341,13 +1334,13 @@ static void sched_migrate_task(task_t *p, int dest_cpu)
 }
 /*
- * sched_balance_exec(): find the highest-level, exec-balance-capable
+ * sched_exec(): find the highest-level, exec-balance-capable
  * domain and try to migrate the task to the least loaded CPU.
  *
  * execve() is a valuable balancing opportunity, because at this point
  * the task has the smallest effective memory and cache footprint.
  */
-void sched_balance_exec(void)
+void sched_exec(void)
 {
         struct sched_domain *tmp, *sd = NULL;
         int new_cpu, this_cpu = get_cpu();