Commit 850f7d78 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] sched: trivial fixes, cleanups

From: Ingo Molnar <mingo@elte.hu>

A batch of trivial fixes and cleanups:

- added recent trivial bits from Nick's patches and mine
- hotplug CPU fix: add the missing unlock_cpu_hotplug() on the early-return
  path in sched_migrate_task()
- early init cleanup: call sched_init() early in start_kernel(), before the
  IRQ and timer setup; init_timers() is no longer called from sched_init()
parent fa8f2c50
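One of the cleanups converts the open-coded millisecond scaling in rebalance_tick() (interval * HZ / 1000) into a MSEC_TO_JIFFIES() helper, with identity versions of JIFFIES_TO_MSEC()/MSEC_TO_JIFFIES() for the HZ == 1000 case and generic fallbacks added alongside the existing NS_TO_JIFFIES()/JIFFIES_TO_NS() definitions. Below is a minimal, stand-alone sketch of how those helpers behave. It is illustration only, not part of the patch: the HZ value and the main() driver are assumptions made purely for the example.

/*
 * Stand-alone sketch of the new jiffies <-> millisecond helpers.
 * Not kernel code: HZ and main() are chosen for illustration only.
 */
#include <stdio.h>

#define HZ 1000	/* internal timer frequency (illustrative value) */

#if HZ == 1000
/* With a 1000 Hz tick, one jiffy is one millisecond: no arithmetic needed. */
# define JIFFIES_TO_MSEC(x) (x)
# define MSEC_TO_JIFFIES(x) (x)
#else
/* Generic fallbacks for other HZ values. */
# define JIFFIES_TO_MSEC(x) ((x) * 1000 / HZ)
# define MSEC_TO_JIFFIES(x) ((x) * HZ / 1000)
#endif

int main(void)
{
	/* A 10 ms balance interval scaled to jiffies and back again. */
	unsigned long msec = 10;
	unsigned long jiffies_val = MSEC_TO_JIFFIES(msec);

	printf("%lu ms -> %lu jiffies -> %lu ms at HZ=%d\n",
	       msec, jiffies_val,
	       (unsigned long)JIFFIES_TO_MSEC(jiffies_val), HZ);
	return 0;
}

With these helpers in place, the rebalance path simply does interval = MSEC_TO_JIFFIES(interval) instead of repeating the HZ arithmetic, as the scheduler hunk further down shows.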
@@ -719,7 +719,7 @@ config X86_PAE
# Common NUMA Features # Common NUMA Features
config NUMA config NUMA
bool "Numa Memory Allocation Support" bool "Numa Memory Allocation and Scheduler Support"
depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI)) depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI))
default n if X86_PC default n if X86_PC
default y if (X86_NUMAQ || X86_SUMMIT) default y if (X86_NUMAQ || X86_SUMMIT)
...
@@ -1162,9 +1162,9 @@ __init void arch_init_sched_domains(void)
first_cpu = last_cpu = NULL; first_cpu = last_cpu = NULL;
if (i != first_cpu(cpu_domain->span)) { if (i != first_cpu(cpu_domain->span)) {
cpu_sched_domain(i)->flags |= SD_FLAG_SHARE_CPUPOWER; cpu_sched_domain(i)->flags |= SD_SHARE_CPUPOWER;
cpu_sched_domain(first_cpu(cpu_domain->span))->flags |= cpu_sched_domain(first_cpu(cpu_domain->span))->flags |=
SD_FLAG_SHARE_CPUPOWER; SD_SHARE_CPUPOWER;
continue; continue;
} }
@@ -1258,7 +1258,7 @@ __init void arch_init_sched_domains(void)
cpu_domain->groups = cpu_group; cpu_domain->groups = cpu_group;
} }
} }
#else /* CONFIG_NUMA */ #else /* !CONFIG_NUMA */
static struct sched_group sched_group_cpus[NR_CPUS]; static struct sched_group sched_group_cpus[NR_CPUS];
static struct sched_group sched_group_phys[NR_CPUS]; static struct sched_group sched_group_phys[NR_CPUS];
static DEFINE_PER_CPU(struct sched_domain, phys_domains); static DEFINE_PER_CPU(struct sched_domain, phys_domains);
@@ -1286,9 +1286,9 @@ __init void arch_init_sched_domains(void)
first_cpu = last_cpu = NULL; first_cpu = last_cpu = NULL;
if (i != first_cpu(cpu_domain->span)) { if (i != first_cpu(cpu_domain->span)) {
cpu_sched_domain(i)->flags |= SD_FLAG_SHARE_CPUPOWER; cpu_sched_domain(i)->flags |= SD_SHARE_CPUPOWER;
cpu_sched_domain(first_cpu(cpu_domain->span))->flags |= cpu_sched_domain(first_cpu(cpu_domain->span))->flags |=
SD_FLAG_SHARE_CPUPOWER; SD_SHARE_CPUPOWER;
continue; continue;
} }
...
@@ -5,6 +5,8 @@
# define HZ 1000 /* Internal kernel timer frequency */ # define HZ 1000 /* Internal kernel timer frequency */
# define USER_HZ 100 /* .. some user interfaces are in "ticks" */ # define USER_HZ 100 /* .. some user interfaces are in "ticks" */
# define CLOCKS_PER_SEC (USER_HZ) /* like times() */ # define CLOCKS_PER_SEC (USER_HZ) /* like times() */
# define JIFFIES_TO_MSEC(x) (x)
# define MSEC_TO_JIFFIES(x) (x)
#endif #endif
#ifndef HZ #ifndef HZ
...
@@ -650,7 +650,7 @@ extern void select_idle_routine(const struct cpuinfo_x86 *c);
#ifdef CONFIG_SCHED_SMT #ifdef CONFIG_SCHED_SMT
#define ARCH_HAS_SCHED_DOMAIN #define ARCH_HAS_SCHED_DOMAIN
#define ARCH_HAS_SCHED_WAKE_BALANCE #define ARCH_HAS_SCHED_WAKE_IDLE
#endif #endif
#endif /* __ASM_I386_PROCESSOR_H */ #endif /* __ASM_I386_PROCESSOR_H */
@@ -543,14 +543,13 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
#define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ #define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
#define SCHED_LOAD_SHIFT 7 /* increase resolution of load calculations */ #define SCHED_LOAD_SCALE 128UL /* increase resolution of load */
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)
#define SD_FLAG_NEWIDLE 1 /* Balance when about to become idle */ #define SD_BALANCE_NEWIDLE 1 /* Balance when about to become idle */
#define SD_FLAG_EXEC 2 /* Balance on exec */ #define SD_BALANCE_EXEC 2 /* Balance on exec */
#define SD_FLAG_WAKE 4 /* Balance on task wakeup */ #define SD_WAKE_IDLE 4 /* Wake to idle CPU on task wakeup */
#define SD_FLAG_FASTMIGRATE 8 /* Sync wakes put task on waking CPU */ #define SD_WAKE_AFFINE 8 /* Wake task to waking CPU */
#define SD_FLAG_SHARE_CPUPOWER 16 /* Domain members share cpu power */ #define SD_SHARE_CPUPOWER 16 /* Domain members share cpu power */
struct sched_group { struct sched_group {
struct sched_group *next; /* Must be a circular list */ struct sched_group *next; /* Must be a circular list */
@@ -577,7 +576,7 @@ struct sched_domain {
unsigned long long cache_hot_time; /* Task considered cache hot (ns) */ unsigned long long cache_hot_time; /* Task considered cache hot (ns) */
unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */ unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */ unsigned int per_cpu_gain; /* CPU % gained by adding domain cpus */
int flags; /* See SD_FLAG_* */ int flags; /* See SD_* */
/* Runtime fields. */ /* Runtime fields. */
unsigned long last_balance; /* init to jiffies. units in jiffies */ unsigned long last_balance; /* init to jiffies. units in jiffies */
@@ -597,7 +596,9 @@ struct sched_domain {
.cache_hot_time = 0, \ .cache_hot_time = 0, \
.cache_nice_tries = 0, \ .cache_nice_tries = 0, \
.per_cpu_gain = 15, \ .per_cpu_gain = 15, \
.flags = SD_FLAG_FASTMIGRATE | SD_FLAG_NEWIDLE | SD_FLAG_WAKE,\ .flags = SD_BALANCE_NEWIDLE \
| SD_WAKE_AFFINE \
| SD_WAKE_IDLE, \
.last_balance = jiffies, \ .last_balance = jiffies, \
.balance_interval = 1, \ .balance_interval = 1, \
.nr_balance_failed = 0, \ .nr_balance_failed = 0, \
@@ -615,7 +616,8 @@ struct sched_domain {
.cache_hot_time = (5*1000000/2), \ .cache_hot_time = (5*1000000/2), \
.cache_nice_tries = 1, \ .cache_nice_tries = 1, \
.per_cpu_gain = 100, \ .per_cpu_gain = 100, \
.flags = SD_FLAG_FASTMIGRATE | SD_FLAG_NEWIDLE,\ .flags = SD_BALANCE_NEWIDLE \
| SD_WAKE_AFFINE, \
.last_balance = jiffies, \ .last_balance = jiffies, \
.balance_interval = 1, \ .balance_interval = 1, \
.nr_balance_failed = 0, \ .nr_balance_failed = 0, \
@@ -634,7 +636,7 @@ struct sched_domain {
.cache_hot_time = (10*1000000), \ .cache_hot_time = (10*1000000), \
.cache_nice_tries = 1, \ .cache_nice_tries = 1, \
.per_cpu_gain = 100, \ .per_cpu_gain = 100, \
.flags = SD_FLAG_EXEC, \ .flags = SD_BALANCE_EXEC, \
.last_balance = jiffies, \ .last_balance = jiffies, \
.balance_interval = 1, \ .balance_interval = 1, \
.nr_balance_failed = 0, \ .nr_balance_failed = 0, \
@@ -645,6 +647,9 @@ DECLARE_PER_CPU(struct sched_domain, base_domains);
#define cpu_sched_domain(cpu) (&per_cpu(base_domains, (cpu))) #define cpu_sched_domain(cpu) (&per_cpu(base_domains, (cpu)))
#define this_sched_domain() (&__get_cpu_var(base_domains)) #define this_sched_domain() (&__get_cpu_var(base_domains))
#define for_each_domain(cpu, domain) \
for (domain = cpu_sched_domain(cpu); domain; domain = domain->parent)
extern int set_cpus_allowed(task_t *p, cpumask_t new_mask); extern int set_cpus_allowed(task_t *p, cpumask_t new_mask);
#else #else
static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask) static inline int set_cpus_allowed(task_t *p, cpumask_t new_mask)
...
@@ -417,6 +417,13 @@ asmlinkage void __init start_kernel(void)
*/ */
smp_prepare_boot_cpu(); smp_prepare_boot_cpu();
/*
* Set up the scheduler prior starting any interrupts (such as the
* timer interrupt). Full topology setup happens at smp_init()
* time - but meanwhile we still have a functioning scheduler.
*/
sched_init();
build_all_zonelists(); build_all_zonelists();
page_alloc_init(); page_alloc_init();
printk("Kernel command line: %s\n", saved_command_line); printk("Kernel command line: %s\n", saved_command_line);
@@ -428,7 +435,7 @@ asmlinkage void __init start_kernel(void)
rcu_init(); rcu_init();
init_IRQ(); init_IRQ();
pidhash_init(); pidhash_init();
sched_init(); init_timers();
softirq_init(); softirq_init();
time_init(); time_init();
...
@@ -75,6 +75,13 @@
#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
#ifndef JIFFIES_TO_MSEC
# define JIFFIES_TO_MSEC(x) ((x) * 1000 / HZ)
#endif
#ifndef MSEC_TO_JIFFIES
# define MSEC_TO_JIFFIES(x) ((x) * HZ / 1000)
#endif
/* /*
* These are the 'tuning knobs' of the scheduler: * These are the 'tuning knobs' of the scheduler:
* *
@@ -257,16 +264,6 @@ const unsigned long scheduling_functions_end_here =
# define task_running(rq, p) ((rq)->curr == (p)) # define task_running(rq, p) ((rq)->curr == (p))
#endif #endif
static inline void nr_running_inc(runqueue_t *rq)
{
rq->nr_running++;
}
static inline void nr_running_dec(runqueue_t *rq)
{
rq->nr_running--;
}
/* /*
* task_rq_lock - lock the runqueue a given task resides on and disable * task_rq_lock - lock the runqueue a given task resides on and disable
* interrupts. Note the ordering: we can safely lookup the task_rq without * interrupts. Note the ordering: we can safely lookup the task_rq without
@@ -367,7 +364,7 @@ static int effective_prio(task_t *p)
static inline void __activate_task(task_t *p, runqueue_t *rq) static inline void __activate_task(task_t *p, runqueue_t *rq)
{ {
enqueue_task(p, rq->active); enqueue_task(p, rq->active);
nr_running_inc(rq); rq->nr_running++;
} }
static void recalc_task_prio(task_t *p, unsigned long long now) static void recalc_task_prio(task_t *p, unsigned long long now)
@@ -488,7 +485,7 @@ static inline void activate_task(task_t *p, runqueue_t *rq)
*/ */
static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
{ {
nr_running_dec(rq); rq->nr_running--;
if (p->state == TASK_UNINTERRUPTIBLE) if (p->state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible++; rq->nr_uninterruptible++;
dequeue_task(p, p->array); dequeue_task(p, p->array);
@@ -502,9 +499,9 @@ static inline void deactivate_task(struct task_struct *p, runqueue_t *rq)
* might also involve a cross-CPU call to trigger the scheduler on * might also involve a cross-CPU call to trigger the scheduler on
* the target CPU. * the target CPU.
*/ */
#ifdef CONFIG_SMP
static inline void resched_task(task_t *p) static inline void resched_task(task_t *p)
{ {
#ifdef CONFIG_SMP
int need_resched, nrpolling; int need_resched, nrpolling;
preempt_disable(); preempt_disable();
@@ -516,10 +513,13 @@ static inline void resched_task(task_t *p)
if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
smp_send_reschedule(task_cpu(p)); smp_send_reschedule(task_cpu(p));
preempt_enable(); preempt_enable();
}
#else #else
static inline void resched_task(task_t *p)
{
set_tsk_need_resched(p); set_tsk_need_resched(p);
#endif
} }
#endif
/** /**
* task_curr - is this task currently executing on a CPU? * task_curr - is this task currently executing on a CPU?
@@ -611,13 +611,14 @@ void kick_process(task_t *p)
} }
EXPORT_SYMBOL_GPL(kick_process); EXPORT_SYMBOL_GPL(kick_process);
/* /*
* Return a low guess at the load of cpu. * Return a low guess at the load of cpu.
*/ */
static inline unsigned long get_low_cpu_load(int cpu) static inline unsigned long get_low_cpu_load(int cpu)
{ {
runqueue_t *rq = cpu_rq(cpu); runqueue_t *rq = cpu_rq(cpu);
unsigned long load_now = rq->nr_running << SCHED_LOAD_SHIFT; unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
return min(rq->cpu_load, load_now); return min(rq->cpu_load, load_now);
} }
@@ -625,7 +626,7 @@ static inline unsigned long get_low_cpu_load(int cpu)
static inline unsigned long get_high_cpu_load(int cpu) static inline unsigned long get_high_cpu_load(int cpu)
{ {
runqueue_t *rq = cpu_rq(cpu); runqueue_t *rq = cpu_rq(cpu);
unsigned long load_now = rq->nr_running << SCHED_LOAD_SHIFT; unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
return max(rq->cpu_load, load_now); return max(rq->cpu_load, load_now);
} }
@@ -633,26 +634,27 @@ static inline unsigned long get_high_cpu_load(int cpu)
#endif #endif
/* /*
* sched_balance_wake can be used with SMT architectures to wake a * wake_idle() is useful especially on SMT architectures to wake a
* task onto an idle sibling if cpu is not idle. Returns cpu if * task onto an idle sibling if we would otherwise wake it onto a
* cpu is idle or no siblings are idle, otherwise returns an idle * busy sibling.
* sibling. *
* Returns the CPU we should wake onto.
*/ */
#if defined(CONFIG_SMP) && defined(ARCH_HAS_SCHED_WAKE_BALANCE) #if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int sched_balance_wake(int cpu, task_t *p) static int wake_idle(int cpu, task_t *p)
{ {
cpumask_t tmp; cpumask_t tmp;
struct sched_domain *domain; struct sched_domain *sd;
int i; int i;
if (idle_cpu(cpu)) if (idle_cpu(cpu))
return cpu; return cpu;
domain = cpu_sched_domain(cpu); sd = cpu_sched_domain(cpu);
if (!(domain->flags & SD_FLAG_WAKE)) if (!(sd->flags & SD_WAKE_IDLE))
return cpu; return cpu;
cpus_and(tmp, domain->span, cpu_online_map); cpus_and(tmp, sd->span, cpu_online_map);
for_each_cpu_mask(i, tmp) { for_each_cpu_mask(i, tmp) {
if (!cpu_isset(i, p->cpus_allowed)) if (!cpu_isset(i, p->cpus_allowed))
continue; continue;
@@ -664,7 +666,7 @@ static int sched_balance_wake(int cpu, task_t *p)
return cpu; return cpu;
} }
#else #else
static inline int sched_balance_wake(int cpu, task_t *p) static inline int wake_idle(int cpu, task_t *p)
{ {
return cpu; return cpu;
} }
@@ -694,8 +696,8 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
unsigned long long now; unsigned long long now;
unsigned long load, this_load; unsigned long load, this_load;
int new_cpu;
struct sched_domain *sd; struct sched_domain *sd;
int new_cpu;
#endif #endif
rq = task_rq_lock(p, &flags); rq = task_rq_lock(p, &flags);
@@ -706,49 +708,44 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
if (p->array) if (p->array)
goto out_running; goto out_running;
this_cpu = smp_processor_id();
cpu = task_cpu(p); cpu = task_cpu(p);
this_cpu = smp_processor_id();
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
if (cpu == this_cpu || unlikely(cpu_is_offline(this_cpu))) if (unlikely(task_running(rq, p) || cpu_is_offline(this_cpu)))
goto out_activate; goto out_activate;
if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed) new_cpu = this_cpu; /* Wake to this CPU if we can */
|| task_running(rq, p)))
goto out_activate; if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
goto out_set_cpu;
/* Passive load balancing */ /* Passive load balancing */
load = get_low_cpu_load(cpu); load = get_low_cpu_load(cpu);
this_load = get_high_cpu_load(this_cpu) + SCHED_LOAD_SCALE; this_load = get_high_cpu_load(this_cpu) + SCHED_LOAD_SCALE;
if (load > this_load) { if (load > this_load)
new_cpu = sched_balance_wake(this_cpu, p); goto out_set_cpu;
set_task_cpu(p, new_cpu);
goto repeat_lock_task;
}
now = sched_clock(); now = sched_clock();
sd = cpu_sched_domain(this_cpu);
/* /*
* Fast-migrate the task if it's not running or * Migrate the task to the waking domain.
* runnable currently. Do not violate hard affinity. * Do not violate hard affinity.
*/ */
do { for_each_domain(this_cpu, sd) {
if (!(sd->flags & SD_FLAG_FASTMIGRATE)) if (!(sd->flags & SD_WAKE_AFFINE))
break; break;
if (now - p->timestamp < sd->cache_hot_time) if (now - p->timestamp < sd->cache_hot_time)
break; break;
if (cpu_isset(cpu, sd->span)) { if (cpu_isset(cpu, sd->span))
new_cpu = sched_balance_wake(this_cpu, p); goto out_set_cpu;
set_task_cpu(p, new_cpu);
goto repeat_lock_task;
} }
sd = sd->parent;
} while (sd);
new_cpu = sched_balance_wake(cpu, p); new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
if (new_cpu != cpu) { out_set_cpu:
new_cpu = wake_idle(new_cpu, p);
if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
set_task_cpu(p, new_cpu); set_task_cpu(p, new_cpu);
goto repeat_lock_task; goto repeat_lock_task;
} }
@@ -778,6 +775,14 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
p->activated = -1; p->activated = -1;
} }
/*
* Sync wakeups (i.e. those types of wakeups where the waker
* has indicated that it will leave the CPU in short order)
* don't trigger a preemption, if the woken up task will run on
* this cpu. (in this case the 'I will reschedule' promise of
* the waker guarantees that the freshly woken up task is going
* to be considered on this CPU.)
*/
if (sync && cpu == this_cpu) { if (sync && cpu == this_cpu) {
__activate_task(p, rq); __activate_task(p, rq);
} else { } else {
@@ -794,6 +799,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
return success; return success;
} }
int fastcall wake_up_process(task_t * p) int fastcall wake_up_process(task_t * p)
{ {
return try_to_wake_up(p, TASK_STOPPED | return try_to_wake_up(p, TASK_STOPPED |
@@ -897,7 +903,7 @@ void fastcall wake_up_forked_process(task_t * p)
list_add_tail(&p->run_list, &current->run_list); list_add_tail(&p->run_list, &current->run_list);
p->array = current->array; p->array = current->array;
p->array->nr_active++; p->array->nr_active++;
nr_running_inc(rq); rq->nr_running++;
} }
task_rq_unlock(rq, &flags); task_rq_unlock(rq, &flags);
} }
@@ -1114,8 +1120,8 @@ enum idle_type
*/ */
static void sched_migrate_task(task_t *p, int dest_cpu) static void sched_migrate_task(task_t *p, int dest_cpu)
{ {
runqueue_t *rq;
migration_req_t req; migration_req_t req;
runqueue_t *rq;
unsigned long flags; unsigned long flags;
lock_cpu_hotplug(); lock_cpu_hotplug();
@@ -1136,6 +1142,7 @@ static void sched_migrate_task(task_t *p, int dest_cpu)
* the migration. * the migration.
*/ */
tlb_migrate_prepare(current->mm); tlb_migrate_prepare(current->mm);
unlock_cpu_hotplug();
return; return;
} }
@@ -1146,9 +1153,9 @@ static void sched_migrate_task(task_t *p, int dest_cpu)
/* /*
* Find the least loaded CPU. Slightly favor the current CPU by * Find the least loaded CPU. Slightly favor the current CPU by
* setting its runqueue length as the minimum to start. * setting its load as the minimum to start.
*/ */
static int sched_best_cpu(struct task_struct *p, struct sched_domain *domain) static int sched_best_cpu(struct task_struct *p, struct sched_domain *sd)
{ {
cpumask_t tmp; cpumask_t tmp;
int i, min_load, this_cpu, best_cpu; int i, min_load, this_cpu, best_cpu;
@@ -1156,7 +1163,7 @@ static int sched_best_cpu(struct task_struct *p, struct sched_domain *domain)
best_cpu = this_cpu = task_cpu(p); best_cpu = this_cpu = task_cpu(p);
min_load = INT_MAX; min_load = INT_MAX;
cpus_and(tmp, domain->span, cpu_online_map); cpus_and(tmp, sd->span, cpu_online_map);
for_each_cpu_mask(i, tmp) { for_each_cpu_mask(i, tmp) {
unsigned long load; unsigned long load;
if (i == this_cpu) if (i == this_cpu)
@@ -1168,30 +1175,42 @@ static int sched_best_cpu(struct task_struct *p, struct sched_domain *domain)
best_cpu = i; best_cpu = i;
min_load = load; min_load = load;
} }
} }
return best_cpu; return best_cpu;
} }
/*
* sched_balance_exec(): find the highest-level, exec-balance-capable
* domain and try to migrate the task to the least loaded CPU.
*
* execve() is a valuable balancing opportunity, because at this point
* the task has the smallest effective memory and cache footprint.
*/
void sched_balance_exec(void) void sched_balance_exec(void)
{ {
struct sched_domain *domain = this_sched_domain(); struct sched_domain *sd, *best_sd = NULL;
int new_cpu; int new_cpu;
int this_cpu = smp_processor_id(); int this_cpu = get_cpu();
if (numnodes == 1)
return;
/* Prefer the current CPU if there's only this task running */
if (this_rq()->nr_running <= 1) if (this_rq()->nr_running <= 1)
return; goto out;
while (domain->parent && !(domain->flags & SD_FLAG_EXEC)) for_each_domain(this_cpu, sd) {
domain = domain->parent; if (sd->flags & SD_BALANCE_EXEC)
best_sd = sd;
}
if (domain->flags & SD_FLAG_EXEC) { if (best_sd) {
new_cpu = sched_best_cpu(current, domain); new_cpu = sched_best_cpu(current, best_sd);
if (new_cpu != this_cpu) if (new_cpu != this_cpu) {
put_cpu();
sched_migrate_task(current, new_cpu); sched_migrate_task(current, new_cpu);
return;
} }
}
out:
put_cpu();
} }
#endif /* CONFIG_NUMA */ #endif /* CONFIG_NUMA */
@@ -1214,14 +1233,14 @@ static inline void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
* pull_task - move a task from a remote runqueue to the local runqueue. * pull_task - move a task from a remote runqueue to the local runqueue.
* Both runqueues must be locked. * Both runqueues must be locked.
*/ */
static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, static inline
task_t *p, runqueue_t *this_rq, prio_array_t *this_array, void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
int this_cpu) runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
{ {
dequeue_task(p, src_array); dequeue_task(p, src_array);
nr_running_dec(src_rq); src_rq->nr_running--;
set_task_cpu(p, this_cpu); set_task_cpu(p, this_cpu);
nr_running_inc(this_rq); this_rq->nr_running++;
enqueue_task(p, this_array); enqueue_task(p, this_array);
p->timestamp = sched_clock() - p->timestamp = sched_clock() -
(src_rq->timestamp_last_tick - p->timestamp); (src_rq->timestamp_last_tick - p->timestamp);
@@ -1238,7 +1257,7 @@ static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array,
*/ */
static inline static inline
int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
struct sched_domain *domain, enum idle_type idle) struct sched_domain *sd, enum idle_type idle)
{ {
/* /*
* We do not migrate tasks that are: * We do not migrate tasks that are:
@@ -1253,9 +1272,9 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
/* Aggressive migration if we've failed balancing */ /* Aggressive migration if we've failed balancing */
if (idle == NEWLY_IDLE || if (idle == NEWLY_IDLE ||
domain->nr_balance_failed < domain->cache_nice_tries) { sd->nr_balance_failed < sd->cache_nice_tries) {
if ((rq->timestamp_last_tick - p->timestamp) if ((rq->timestamp_last_tick - p->timestamp)
< domain->cache_hot_time) < sd->cache_hot_time)
return 0; return 0;
} }
@@ -1270,7 +1289,7 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
* Called with both runqueues locked. * Called with both runqueues locked.
*/ */
static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
unsigned long max_nr_move, struct sched_domain *domain, unsigned long max_nr_move, struct sched_domain *sd,
enum idle_type idle) enum idle_type idle)
{ {
int idx; int idx;
@@ -1305,7 +1324,7 @@ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
else else
idx = find_next_bit(array->bitmap, MAX_PRIO, idx); idx = find_next_bit(array->bitmap, MAX_PRIO, idx);
if (idx >= MAX_PRIO) { if (idx >= MAX_PRIO) {
if (array == busiest->expired) { if (array == busiest->expired && busiest->active->nr_active) {
array = busiest->active; array = busiest->active;
dst_array = this_rq->active; dst_array = this_rq->active;
goto new_array; goto new_array;
@@ -1320,7 +1339,7 @@ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
curr = curr->prev; curr = curr->prev;
if (!can_migrate_task(tmp, busiest, this_cpu, domain, idle)) { if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
if (curr != head) if (curr != head)
goto skip_queue; goto skip_queue;
idx++; idx++;
@@ -1346,20 +1365,16 @@ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
* moved to restore balance via the imbalance parameter. * moved to restore balance via the imbalance parameter.
*/ */
static struct sched_group * static struct sched_group *
find_busiest_group(struct sched_domain *domain, int this_cpu, find_busiest_group(struct sched_domain *sd, int this_cpu,
unsigned long *imbalance, enum idle_type idle) unsigned long *imbalance, enum idle_type idle)
{ {
unsigned long max_load, avg_load, total_load, this_load; struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned int total_pwr; unsigned long max_load, avg_load, total_load, this_load, total_pwr;
struct sched_group *busiest = NULL, *this = NULL, *group = domain->groups;
max_load = 0; if (unlikely(!group))
this_load = 0; return NULL;
total_load = 0;
total_pwr = 0;
if (group == NULL) max_load = this_load = total_load = total_pwr = 0;
goto out_balanced;
do { do {
cpumask_t tmp; cpumask_t tmp;
@@ -1372,6 +1387,11 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
/* Tally up the load of all CPUs in the group */ /* Tally up the load of all CPUs in the group */
avg_load = 0; avg_load = 0;
cpus_and(tmp, group->cpumask, cpu_online_map); cpus_and(tmp, group->cpumask, cpu_online_map);
if (unlikely(cpus_empty(tmp))) {
WARN_ON(1);
return NULL;
}
for_each_cpu_mask(i, tmp) { for_each_cpu_mask(i, tmp) {
/* Bias balancing toward cpus of our domain */ /* Bias balancing toward cpus of our domain */
if (local_group) { if (local_group) {
@@ -1390,7 +1410,7 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
total_pwr += group->cpu_power; total_pwr += group->cpu_power;
/* Adjust by relative CPU power of the group */ /* Adjust by relative CPU power of the group */
avg_load = (avg_load << SCHED_LOAD_SHIFT) / group->cpu_power; avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
if (local_group) { if (local_group) {
this_load = avg_load; this_load = avg_load;
@@ -1403,7 +1423,7 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
} }
nextgroup: nextgroup:
group = group->next; group = group->next;
} while (group != domain->groups); } while (group != sd->groups);
if (!busiest || this_load >= max_load) if (!busiest || this_load >= max_load)
goto out_balanced; goto out_balanced;
@@ -1412,7 +1432,7 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
if (idle == NOT_IDLE) { if (idle == NOT_IDLE) {
if (this_load >= avg_load || if (this_load >= avg_load ||
100*max_load <= domain->imbalance_pct*this_load) 100*max_load <= sd->imbalance_pct*this_load)
goto out_balanced; goto out_balanced;
} }
@@ -1441,7 +1461,7 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
pwr_now >>= SCHED_LOAD_SHIFT; pwr_now /= SCHED_LOAD_SCALE;
/* Amount of load we'd subtract */ /* Amount of load we'd subtract */
tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
@@ -1452,7 +1472,7 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
/* Amount of load we'd add */ /* Amount of load we'd add */
tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
pwr_move += this->cpu_power*min(this->cpu_power, this_load + tmp); pwr_move += this->cpu_power*min(this->cpu_power, this_load + tmp);
pwr_move >>= SCHED_LOAD_SHIFT; pwr_move /= SCHED_LOAD_SCALE;
/* Move if we gain another 8th of a CPU worth of throughput */ /* Move if we gain another 8th of a CPU worth of throughput */
if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8) if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
@@ -1463,9 +1483,9 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
/* How many tasks to actually move to equalise the imbalance */ /* How many tasks to actually move to equalise the imbalance */
*imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
>> SCHED_LOAD_SHIFT; / SCHED_LOAD_SCALE;
/* Get rid of the scaling factor, rounding *up* as we divide */ /* Get rid of the scaling factor, rounding *up* as we divide */
*imbalance = (*imbalance + SCHED_LOAD_SCALE/2) >> SCHED_LOAD_SHIFT; *imbalance = (*imbalance + SCHED_LOAD_SCALE/2) / SCHED_LOAD_SCALE;
return busiest; return busiest;
@@ -1485,14 +1505,12 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
static runqueue_t *find_busiest_queue(struct sched_group *group) static runqueue_t *find_busiest_queue(struct sched_group *group)
{ {
cpumask_t tmp; cpumask_t tmp;
int i; unsigned long load, max_load = 0;
unsigned long max_load = 0;
runqueue_t *busiest = NULL; runqueue_t *busiest = NULL;
int i;
cpus_and(tmp, group->cpumask, cpu_online_map); cpus_and(tmp, group->cpumask, cpu_online_map);
for_each_cpu_mask(i, tmp) { for_each_cpu_mask(i, tmp) {
unsigned long load;
load = get_low_cpu_load(i); load = get_low_cpu_load(i);
if (load > max_load) { if (load > max_load) {
@@ -1511,42 +1529,38 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
* Called with this_rq unlocked. * Called with this_rq unlocked.
*/ */
static int load_balance(int this_cpu, runqueue_t *this_rq, static int load_balance(int this_cpu, runqueue_t *this_rq,
struct sched_domain *domain, enum idle_type idle) struct sched_domain *sd, enum idle_type idle)
{ {
struct sched_group *group; struct sched_group *group;
runqueue_t *busiest = NULL; runqueue_t *busiest;
unsigned long imbalance; unsigned long imbalance;
int balanced = 0, failed = 0; int nr_moved;
int nr_moved = 0;
spin_lock(&this_rq->lock); spin_lock(&this_rq->lock);
group = find_busiest_group(domain, this_cpu, &imbalance, idle); group = find_busiest_group(sd, this_cpu, &imbalance, idle);
if (!group) { if (!group)
balanced = 1; goto out_balanced;
goto out;
}
busiest = find_busiest_queue(group); busiest = find_busiest_queue(group);
if (!busiest || busiest == this_rq) { if (!busiest)
balanced = 1; goto out_balanced;
goto out; if (unlikely(busiest == this_rq)) {
WARN_ON(1);
goto out_balanced;
} }
/* Attempt to move tasks */ /* Attempt to move tasks */
double_lock_balance(this_rq, busiest); double_lock_balance(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest, nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle);
imbalance, domain, idle);
spin_unlock(&busiest->lock);
out:
spin_unlock(&this_rq->lock); spin_unlock(&this_rq->lock);
spin_unlock(&busiest->lock);
if (!balanced && nr_moved == 0) if (!nr_moved) {
failed = 1; sd->nr_balance_failed++;
if (failed && busiest && if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
domain->nr_balance_failed > domain->cache_nice_tries) {
int wake = 0; int wake = 0;
spin_lock(&busiest->lock); spin_lock(&busiest->lock);
@@ -1558,21 +1572,29 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
spin_unlock(&busiest->lock); spin_unlock(&busiest->lock);
if (wake) if (wake)
wake_up_process(busiest->migration_thread); wake_up_process(busiest->migration_thread);
}
if (failed) /*
domain->nr_balance_failed++; * We've kicked active balancing, reset the failure
else * counter.
domain->nr_balance_failed = 0; */
sd->nr_balance_failed = sd->cache_nice_tries;
if (balanced) {
if (domain->balance_interval < domain->max_interval)
domain->balance_interval *= 2;
} else {
domain->balance_interval = domain->min_interval;
} }
} else
sd->nr_balance_failed = 0;
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
return nr_moved; return nr_moved;
out_balanced:
spin_unlock(&this_rq->lock);
/* tune up the balancing interval */
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
return 0;
} }
/* /*
@@ -1583,14 +1605,14 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
* this_rq is locked. * this_rq is locked.
*/ */
static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
struct sched_domain *domain) struct sched_domain *sd)
{ {
struct sched_group *group; struct sched_group *group;
runqueue_t *busiest = NULL; runqueue_t *busiest = NULL;
unsigned long imbalance; unsigned long imbalance;
int nr_moved = 0; int nr_moved = 0;
group = find_busiest_group(domain, this_cpu, &imbalance, NEWLY_IDLE); group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
if (!group) if (!group)
goto out; goto out;
@@ -1602,7 +1624,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
double_lock_balance(this_rq, busiest); double_lock_balance(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest, nr_moved = move_tasks(this_rq, this_cpu, busiest,
imbalance, domain, NEWLY_IDLE); imbalance, sd, NEWLY_IDLE);
spin_unlock(&busiest->lock); spin_unlock(&busiest->lock);
@@ -1616,25 +1638,22 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
*/ */
static inline void idle_balance(int this_cpu, runqueue_t *this_rq) static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
{ {
struct sched_domain *domain = this_sched_domain(); struct sched_domain *sd;
if (unlikely(cpu_is_offline(this_cpu))) if (unlikely(cpu_is_offline(this_cpu)))
return; return;
do { for_each_domain(this_cpu, sd) {
if (unlikely(!domain->groups)) if (unlikely(!sd->groups))
/* hasn't been setup yet */ return;
break;
if (domain->flags & SD_FLAG_NEWIDLE) { if (sd->flags & SD_BALANCE_NEWIDLE) {
if (load_balance_newidle(this_cpu, this_rq, domain)) { if (load_balance_newidle(this_cpu, this_rq, sd)) {
/* We've pulled tasks over so stop searching */ /* We've pulled tasks over so stop searching */
break; break;
} }
} }
}
domain = domain->parent;
} while (domain);
} }
/* /*
@@ -1647,36 +1666,26 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
*/ */
static void active_load_balance(runqueue_t *busiest, int busiest_cpu) static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
{ {
int i; struct sched_domain *sd;
struct sched_domain *sd = cpu_sched_domain(busiest_cpu);
struct sched_group *group, *busy_group; struct sched_group *group, *busy_group;
int i;
if (busiest->nr_running <= 1) if (busiest->nr_running <= 1)
return; return;
/* sd->parent should never cause a NULL dereference, if it did so, for_each_domain(busiest_cpu, sd) {
* then push_cpu was set to a buggy value */ if (cpu_isset(busiest->push_cpu, sd->span))
while (!cpu_isset(busiest->push_cpu, sd->span)) { break;
sd = sd->parent;
if (!sd->parent && !cpu_isset(busiest->push_cpu, sd->span)) {
WARN_ON(1);
return;
}
} }
if (!sd->groups) { if (!sd->parent && !cpu_isset(busiest->push_cpu, sd->span)) {
WARN_ON(1); WARN_ON(1);
return; return;
} }
group = sd->groups; group = sd->groups;
while (!cpu_isset(busiest_cpu, group->cpumask)) { while (!cpu_isset(busiest_cpu, group->cpumask))
group = group->next; group = group->next;
if (group == sd->groups) {
WARN_ON(1);
return;
}
}
busy_group = group; busy_group = group;
group = sd->groups; group = sd->groups;
@@ -1719,59 +1728,60 @@ static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
/* Don't have all balancing operations going off at once */ /* Don't have all balancing operations going off at once */
#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
static void rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle) static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
enum idle_type idle)
{ {
unsigned long old_load, this_load; unsigned long old_load, this_load;
unsigned long j = jiffies + CPU_OFFSET(this_cpu); unsigned long j = jiffies + CPU_OFFSET(this_cpu);
struct sched_domain *domain = this_sched_domain(); struct sched_domain *sd;
if (unlikely(cpu_is_offline(this_cpu))) if (unlikely(cpu_is_offline(this_cpu)))
return; return;
/* Update our load */ /* Update our load */
old_load = this_rq->cpu_load; old_load = this_rq->cpu_load;
this_load = this_rq->nr_running << SCHED_LOAD_SHIFT; this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
this_rq->cpu_load = (old_load + this_load) / 2; this_rq->cpu_load = (old_load + this_load) / 2;
/* Run through all this CPU's domains */ for_each_domain(this_cpu, sd) {
do { unsigned long interval = sd->balance_interval;
unsigned long interval;
if (unlikely(!domain->groups)) if (unlikely(!sd->groups))
break; return;
interval = domain->balance_interval;
if (idle != IDLE) if (idle != IDLE)
interval *= domain->busy_factor; interval *= sd->busy_factor;
/* scale ms to jiffies */ /* scale ms to jiffies */
interval = interval * HZ / 1000; interval = MSEC_TO_JIFFIES(interval);
if (unlikely(interval == 0)) if (unlikely(interval == 0))
interval = 1; interval = 1;
if (j - domain->last_balance >= interval) { if (j - sd->last_balance >= interval) {
if (load_balance(this_cpu, this_rq, domain, idle)) { if (load_balance(this_cpu, this_rq, sd, idle)) {
/* We've pulled tasks over so no longer idle */ /* We've pulled tasks over so no longer idle */
idle = NOT_IDLE; idle = NOT_IDLE;
} }
domain->last_balance += interval; sd->last_balance += interval;
}
} }
domain = domain->parent;
} while (domain);
} }
#else #else
/* /*
* on UP we do not need to balance between CPUs: * on UP we do not need to balance between CPUs:
*/ */
static inline void rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle) static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
{
}
static inline void idle_balance(int cpu, runqueue_t *rq)
{ {
} }
#endif #endif
#ifdef CONFIG_SCHED_SMT
static inline int wake_priority_sleeper(runqueue_t *rq) static inline int wake_priority_sleeper(runqueue_t *rq)
{ /* {
#ifdef CONFIG_SCHED_SMT
/*
* If an SMT sibling task has been put to sleep for priority * If an SMT sibling task has been put to sleep for priority
* reasons reschedule the idle task to see if it can now run. * reasons reschedule the idle task to see if it can now run.
*/ */
@@ -1779,14 +1789,9 @@ static inline int wake_priority_sleeper(runqueue_t *rq)
resched_task(rq->idle); resched_task(rq->idle);
return 1; return 1;
} }
#endif
return 0; return 0;
} }
#else
static inline int wake_priority_sleeper(runqueue_t *rq)
{
return 0;
}
#endif
DEFINE_PER_CPU(struct kernel_stat, kstat); DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -1937,10 +1942,8 @@ static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
struct sched_domain *sd = cpu_sched_domain(cpu); struct sched_domain *sd = cpu_sched_domain(cpu);
cpumask_t sibling_map; cpumask_t sibling_map;
if (!(sd->flags & SD_FLAG_SHARE_CPUPOWER)) { if (!(sd->flags & SD_SHARE_CPUPOWER))
/* Not SMT */
return; return;
}
cpus_and(sibling_map, sd->span, cpu_online_map); cpus_and(sibling_map, sd->span, cpu_online_map);
cpu_clear(cpu, sibling_map); cpu_clear(cpu, sibling_map);
@@ -1960,14 +1963,12 @@ static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p) static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
{ {
int ret = 0, i;
struct sched_domain *sd = cpu_sched_domain(cpu); struct sched_domain *sd = cpu_sched_domain(cpu);
cpumask_t sibling_map; cpumask_t sibling_map;
int ret = 0, i;
if (!(sd->flags & SD_FLAG_SHARE_CPUPOWER)) { if (!(sd->flags & SD_SHARE_CPUPOWER))
/* Not SMT */
return 0; return 0;
}
cpus_and(sibling_map, sd->span, cpu_online_map); cpus_and(sibling_map, sd->span, cpu_online_map);
cpu_clear(cpu, sibling_map); cpu_clear(cpu, sibling_map);
@@ -1989,7 +1990,7 @@ static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
task_timeslice(p) || rt_task(smt_curr)) && task_timeslice(p) || rt_task(smt_curr)) &&
p->mm && smt_curr->mm && !rt_task(p)) p->mm && smt_curr->mm && !rt_task(p))
ret |= 1; ret = 1;
/* /*
* Reschedule a lower priority task on the SMT sibling, * Reschedule a lower priority task on the SMT sibling,
@@ -2079,9 +2080,7 @@ asmlinkage void __sched schedule(void)
cpu = smp_processor_id(); cpu = smp_processor_id();
if (unlikely(!rq->nr_running)) { if (unlikely(!rq->nr_running)) {
#ifdef CONFIG_SMP
idle_balance(cpu, rq); idle_balance(cpu, rq);
#endif
if (!rq->nr_running) { if (!rq->nr_running) {
next = rq->idle; next = rq->idle;
rq->expired_timestamp = 0; rq->expired_timestamp = 0;
@@ -2627,7 +2626,7 @@ static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
if (task_running(rq, p)) { if (task_running(rq, p)) {
if (p->prio > oldprio) if (p->prio > oldprio)
resched_task(rq->curr); resched_task(rq->curr);
} else if (p->prio < rq->curr->prio) } else if (TASK_PREEMPTS_CURR(p, rq))
resched_task(rq->curr); resched_task(rq->curr);
} }
@@ -3406,24 +3405,24 @@ static void __init arch_init_sched_domains(void)
for_each_cpu(i) { for_each_cpu(i) {
int node = cpu_to_node(i); int node = cpu_to_node(i);
cpumask_t nodemask = node_to_cpumask(node); cpumask_t nodemask = node_to_cpumask(node);
struct sched_domain *node_domain = &per_cpu(node_domains, i); struct sched_domain *node_sd = &per_cpu(node_domains, i);
struct sched_domain *cpu_domain = cpu_sched_domain(i); struct sched_domain *cpu_sd = cpu_sched_domain(i);
*node_domain = SD_NODE_INIT; *node_sd = SD_NODE_INIT;
node_domain->span = cpu_possible_map; node_sd->span = cpu_possible_map;
*cpu_domain = SD_CPU_INIT; *cpu_sd = SD_CPU_INIT;
cpus_and(cpu_domain->span, nodemask, cpu_possible_map); cpus_and(cpu_sd->span, nodemask, cpu_possible_map);
cpu_domain->parent = node_domain; cpu_sd->parent = node_sd;
} }
/* Set up groups */ /* Set up groups */
for (i = 0; i < MAX_NUMNODES; i++) { for (i = 0; i < MAX_NUMNODES; i++) {
struct sched_group *first_cpu = NULL, *last_cpu = NULL; cpumask_t tmp = node_to_cpumask(i);
int j;
cpumask_t nodemask; cpumask_t nodemask;
struct sched_group *first_cpu = NULL, *last_cpu = NULL;
struct sched_group *node = &sched_group_nodes[i]; struct sched_group *node = &sched_group_nodes[i];
cpumask_t tmp = node_to_cpumask(i); int j;
cpus_and(nodemask, tmp, cpu_possible_map); cpus_and(nodemask, tmp, cpu_possible_map);
@@ -3458,14 +3457,14 @@ static void __init arch_init_sched_domains(void)
mb(); mb();
for_each_cpu(i) { for_each_cpu(i) {
struct sched_domain *node_domain = &per_cpu(node_domains, i); struct sched_domain *node_sd = &per_cpu(node_domains, i);
struct sched_domain *cpu_domain = cpu_sched_domain(i); struct sched_domain *cpu_sd = cpu_sched_domain(i);
node_domain->groups = &sched_group_nodes[cpu_to_node(i)]; node_sd->groups = &sched_group_nodes[cpu_to_node(i)];
cpu_domain->groups = &sched_group_cpus[i]; cpu_sd->groups = &sched_group_cpus[i];
} }
} }
#else /* CONFIG_NUMA */ #else /* !CONFIG_NUMA */
static void __init arch_init_sched_domains(void) static void __init arch_init_sched_domains(void)
{ {
int i; int i;
@@ -3473,10 +3472,10 @@ static void __init arch_init_sched_domains(void)
/* Set up domains */ /* Set up domains */
for_each_cpu(i) { for_each_cpu(i) {
struct sched_domain *cpu_domain = cpu_sched_domain(i); struct sched_domain *cpu_sd = cpu_sched_domain(i);
*cpu_domain = SD_CPU_INIT; *cpu_sd = SD_CPU_INIT;
cpu_domain->span = cpu_possible_map; cpu_sd->span = cpu_possible_map;
} }
/* Set up CPU groups */ /* Set up CPU groups */
@@ -3497,15 +3496,15 @@ static void __init arch_init_sched_domains(void)
mb(); mb();
for_each_cpu(i) { for_each_cpu(i) {
struct sched_domain *cpu_domain = cpu_sched_domain(i); struct sched_domain *cpu_sd = cpu_sched_domain(i);
cpu_domain->groups = &sched_group_cpus[i]; cpu_sd->groups = &sched_group_cpus[i];
} }
} }
#endif /* CONFIG_NUMA */ #endif /* CONFIG_NUMA */
#endif /* ARCH_HAS_SCHED_DOMAIN */ #endif /* ARCH_HAS_SCHED_DOMAIN */
#undef SCHED_DOMAIN_DEBUG #define SCHED_DOMAIN_DEBUG
#ifdef SCHED_DOMAIN_DEBUG #ifdef SCHED_DOMAIN_DEBUG
void sched_domain_debug(void) void sched_domain_debug(void)
{ {
@@ -3513,7 +3512,7 @@ void sched_domain_debug(void)
for_each_cpu(i) { for_each_cpu(i) {
int level = 0; int level = 0;
struct sched_domain *cpu_domain = cpu_sched_domain(i); struct sched_domain *cpu_sd = cpu_sched_domain(i);
printk(KERN_DEBUG "CPU%d: %s\n", printk(KERN_DEBUG "CPU%d: %s\n",
i, (cpu_online(i) ? " online" : "offline")); i, (cpu_online(i) ? " online" : "offline"));
@@ -3521,10 +3520,10 @@ void sched_domain_debug(void)
do { do {
int j; int j;
char str[NR_CPUS]; char str[NR_CPUS];
struct sched_group *group = cpu_domain->groups; struct sched_group *group = cpu_sd->groups;
cpumask_t groupmask, tmp; cpumask_t groupmask, tmp;
cpumask_snprintf(str, NR_CPUS, cpu_domain->span); cpumask_scnprintf(str, NR_CPUS, cpu_sd->span);
cpus_clear(groupmask); cpus_clear(groupmask);
printk(KERN_DEBUG); printk(KERN_DEBUG);
@@ -3532,10 +3531,12 @@ void sched_domain_debug(void)
printk(" "); printk(" ");
printk("domain %d: span %s\n", level, str); printk("domain %d: span %s\n", level, str);
if (!cpu_isset(i, cpu_domain->span)) if (!cpu_isset(i, cpu_sd->span))
printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i); printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i);
if (!cpu_isset(i, group->cpumask)) if (!cpu_isset(i, group->cpumask))
printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i); printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i);
if (!group->cpu_power)
printk(KERN_DEBUG "ERROR domain->cpu_power not set\n");
printk(KERN_DEBUG); printk(KERN_DEBUG);
for (j = 0; j < level + 2; j++) for (j = 0; j < level + 2; j++)
@@ -3556,26 +3557,26 @@ void sched_domain_debug(void)
cpus_or(groupmask, groupmask, group->cpumask); cpus_or(groupmask, groupmask, group->cpumask);
cpumask_snprintf(str, NR_CPUS, group->cpumask); cpumask_scnprintf(str, NR_CPUS, group->cpumask);
printk(" %s", str); printk(" %s", str);
group = group->next; group = group->next;
} while (group != cpu_domain->groups); } while (group != cpu_sd->groups);
printk("\n"); printk("\n");
if (!cpus_equal(cpu_domain->span, groupmask)) if (!cpus_equal(cpu_sd->span, groupmask))
printk(KERN_DEBUG "ERROR groups don't span domain->span\n"); printk(KERN_DEBUG "ERROR groups don't span domain->span\n");
level++; level++;
cpu_domain = cpu_domain->parent; cpu_sd = cpu_sd->parent;
if (cpu_domain) { if (cpu_sd) {
cpus_and(tmp, groupmask, cpu_domain->span); cpus_and(tmp, groupmask, cpu_sd->span);
if (!cpus_equal(tmp, groupmask)) if (!cpus_equal(tmp, groupmask))
printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n"); printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n");
} }
} while (cpu_domain); } while (cpu_sd);
} }
} }
#else #else
@@ -3635,8 +3636,6 @@ void __init sched_init(void)
set_task_cpu(current, smp_processor_id()); set_task_cpu(current, smp_processor_id());
wake_up_forked_process(current); wake_up_forked_process(current);
init_timers();
/* /*
* The boot idle thread does lazy MMU switching as well: * The boot idle thread does lazy MMU switching as well:
*/ */
...