Commit 482b9933, authored by Andrew Morton, committed by Linus Torvalds

[PATCH] sched-group-power

From: Nick Piggin <piggin@cyberone.com.au>

The following patch adds a cpu_power member to struct sched_group.

This allows special casing to be removed for SMT groups in the balancing
code.  It does not take CPU hotplug into account yet, but that shouldn't be
too hard.

I have tested it on the NUMAQ by pretending it has SMT.  It works as expected:
active balancing happens across nodes.
parent 3de8a6b4
...@@ -1150,7 +1150,6 @@ __init void arch_init_sched_domains(void) ...@@ -1150,7 +1150,6 @@ __init void arch_init_sched_domains(void)
*phys_domain = SD_CPU_INIT; *phys_domain = SD_CPU_INIT;
phys_domain->span = nodemask; phys_domain->span = nodemask;
phys_domain->flags |= SD_FLAG_IDLE;
*node_domain = SD_NODE_INIT; *node_domain = SD_NODE_INIT;
node_domain->span = cpu_online_map; node_domain->span = cpu_online_map;
...@@ -1170,6 +1169,7 @@ __init void arch_init_sched_domains(void) ...@@ -1170,6 +1169,7 @@ __init void arch_init_sched_domains(void)
cpu->cpumask = CPU_MASK_NONE; cpu->cpumask = CPU_MASK_NONE;
cpu_set(j, cpu->cpumask); cpu_set(j, cpu->cpumask);
cpu->cpu_power = SCHED_LOAD_SCALE;
if (!first_cpu) if (!first_cpu)
first_cpu = cpu; first_cpu = cpu;
...@@ -1183,6 +1183,7 @@ __init void arch_init_sched_domains(void) ...@@ -1183,6 +1183,7 @@ __init void arch_init_sched_domains(void)
for (i = 0; i < MAX_NUMNODES; i++) { for (i = 0; i < MAX_NUMNODES; i++) {
int j; int j;
cpumask_t nodemask; cpumask_t nodemask;
struct sched_group *node = &sched_group_nodes[i];
cpus_and(nodemask, node_to_cpumask(i), cpu_online_map); cpus_and(nodemask, node_to_cpumask(i), cpu_online_map);
if (cpus_empty(nodemask)) if (cpus_empty(nodemask))
...@@ -1198,6 +1199,12 @@ __init void arch_init_sched_domains(void) ...@@ -1198,6 +1199,12 @@ __init void arch_init_sched_domains(void)
continue; continue;
cpu->cpumask = cpu_domain->span; cpu->cpumask = cpu_domain->span;
/*
* Make each extra sibling increase power by 10% of
* the basic CPU. This is very arbitrary.
*/
cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
node->cpu_power += cpu->cpu_power;
if (!first_cpu) if (!first_cpu)
first_cpu = cpu; first_cpu = cpu;
...@@ -1219,6 +1226,7 @@ __init void arch_init_sched_domains(void) ...@@ -1219,6 +1226,7 @@ __init void arch_init_sched_domains(void)
continue; continue;
cpu->cpumask = nodemask; cpu->cpumask = nodemask;
/* ->cpu_power already setup */
if (!first_cpu) if (!first_cpu)
first_cpu = cpu; first_cpu = cpu;
...@@ -1228,7 +1236,6 @@ __init void arch_init_sched_domains(void) ...@@ -1228,7 +1236,6 @@ __init void arch_init_sched_domains(void)
} }
last_cpu->next = first_cpu; last_cpu->next = first_cpu;
mb(); mb();
for_each_cpu_mask(i, cpu_online_map) { for_each_cpu_mask(i, cpu_online_map) {
int node = cpu_to_node(i); int node = cpu_to_node(i);
...@@ -1266,7 +1273,6 @@ __init void arch_init_sched_domains(void) ...@@ -1266,7 +1273,6 @@ __init void arch_init_sched_domains(void)
*phys_domain = SD_CPU_INIT; *phys_domain = SD_CPU_INIT;
phys_domain->span = cpu_online_map; phys_domain->span = cpu_online_map;
phys_domain->flags |= SD_FLAG_IDLE;
} }
/* Set up CPU (sibling) groups */ /* Set up CPU (sibling) groups */
...@@ -1283,6 +1289,7 @@ __init void arch_init_sched_domains(void) ...@@ -1283,6 +1289,7 @@ __init void arch_init_sched_domains(void)
cpus_clear(cpu->cpumask); cpus_clear(cpu->cpumask);
cpu_set(j, cpu->cpumask); cpu_set(j, cpu->cpumask);
cpu->cpu_power = SCHED_LOAD_SCALE;
if (!first_cpu) if (!first_cpu)
first_cpu = cpu; first_cpu = cpu;
...@@ -1303,6 +1310,8 @@ __init void arch_init_sched_domains(void) ...@@ -1303,6 +1310,8 @@ __init void arch_init_sched_domains(void)
continue; continue;
cpu->cpumask = cpu_domain->span; cpu->cpumask = cpu_domain->span;
/* See SMT+NUMA setup for comment */
cpu->cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE*(cpus_weight(cpu->cpumask)-1) / 10;
if (!first_cpu) if (!first_cpu)
first_cpu = cpu; first_cpu = cpu;
......
...@@ -543,15 +543,25 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0) ...@@ -543,15 +543,25 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
#define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */ #define PF_SYNCWRITE 0x00200000 /* I am doing a sync write */
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
#define SCHED_LOAD_SHIFT 7 /* increase resolution of load calculations */
#define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)
#define SD_FLAG_NEWIDLE 1 /* Balance when about to become idle */ #define SD_FLAG_NEWIDLE 1 /* Balance when about to become idle */
#define SD_FLAG_EXEC 2 /* Balance on exec */ #define SD_FLAG_EXEC 2 /* Balance on exec */
#define SD_FLAG_WAKE 4 /* Balance on task wakeup */ #define SD_FLAG_WAKE 4 /* Balance on task wakeup */
#define SD_FLAG_FASTMIGRATE 8 /* Sync wakes put task on waking CPU */ #define SD_FLAG_FASTMIGRATE 8 /* Sync wakes put task on waking CPU */
#define SD_FLAG_IDLE 16 /* Should not have all CPUs idle */
struct sched_group { struct sched_group {
struct sched_group *next; /* Must be a circular list */ struct sched_group *next; /* Must be a circular list */
cpumask_t cpumask; cpumask_t cpumask;
/*
* CPU power of this group, SCHED_LOAD_SCALE being max power for a
* single CPU. This should be read only (except for setup). Although
* it will need to be written to at cpu hot(un)plug time, perhaps the
* cpucontrol semaphore will provide enough exclusion?
*/
unsigned long cpu_power;
}; };
struct sched_domain { struct sched_domain {
......
...@@ -192,9 +192,6 @@ struct prio_array { ...@@ -192,9 +192,6 @@ struct prio_array {
struct list_head queue[MAX_PRIO]; struct list_head queue[MAX_PRIO];
}; };
#define SCHED_LOAD_SHIFT 7 /* increase resolution of load calculations */
#define SCHED_LOAD_SCALE (1 << SCHED_LOAD_SHIFT)
/* /*
* This is the main, per-CPU runqueue data structure. * This is the main, per-CPU runqueue data structure.
* *
...@@ -1353,16 +1350,14 @@ find_busiest_group(struct sched_domain *domain, int this_cpu, ...@@ -1353,16 +1350,14 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
unsigned long *imbalance, enum idle_type idle) unsigned long *imbalance, enum idle_type idle)
{ {
unsigned long max_load, avg_load, total_load, this_load; unsigned long max_load, avg_load, total_load, this_load;
int modify, total_nr_cpus, busiest_nr_cpus, this_nr_cpus; unsigned int total_pwr;
enum idle_type package_idle = IDLE; int modify;
struct sched_group *busiest = NULL, *group = domain->groups; struct sched_group *busiest = NULL, *this = NULL, *group = domain->groups;
max_load = 0; max_load = 0;
this_load = 0; this_load = 0;
total_load = 0; total_load = 0;
total_nr_cpus = 0; total_pwr = 0;
busiest_nr_cpus = 0;
this_nr_cpus = 0;
if (group == NULL) if (group == NULL)
goto out_balanced; goto out_balanced;
...@@ -1393,8 +1388,6 @@ find_busiest_group(struct sched_domain *domain, int this_cpu, ...@@ -1393,8 +1388,6 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
/* Bias balancing toward cpus of our domain */ /* Bias balancing toward cpus of our domain */
if (local_group) { if (local_group) {
load = get_high_cpu_load(i, modify); load = get_high_cpu_load(i, modify);
if (!idle_cpu(i))
package_idle = NOT_IDLE;
} else } else
load = get_low_cpu_load(i, modify); load = get_low_cpu_load(i, modify);
...@@ -1406,48 +1399,34 @@ find_busiest_group(struct sched_domain *domain, int this_cpu, ...@@ -1406,48 +1399,34 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
goto nextgroup; goto nextgroup;
total_load += avg_load; total_load += avg_load;
total_pwr += group->cpu_power;
/* /* Adjust by relative CPU power of the group */
* Load is cumulative over SD_FLAG_IDLE domains, but avg_load = (avg_load << SCHED_LOAD_SHIFT) / group->cpu_power;
* spread over !SD_FLAG_IDLE domains. For example, 2
* processes running on an SMT CPU puts a load of 2 on
* that CPU, however 2 processes running on 2 CPUs puts
* a load of 1 on that domain.
*
* This should be configurable so as SMT siblings become
* more powerful, they can "spread" more load - for example,
* the above case might only count as a load of 1.7.
*/
if (!(domain->flags & SD_FLAG_IDLE)) {
avg_load /= nr_cpus;
total_nr_cpus += nr_cpus;
} else
total_nr_cpus++;
if (avg_load > max_load)
max_load = avg_load;
if (local_group) { if (local_group) {
this_load = avg_load; this_load = avg_load;
this_nr_cpus = nr_cpus; this = group;
} else if (avg_load >= max_load) { goto nextgroup;
}
if (avg_load > max_load) {
max_load = avg_load;
busiest = group; busiest = group;
busiest_nr_cpus = nr_cpus;
} }
nextgroup: nextgroup:
group = group->next; group = group->next;
} while (group != domain->groups); } while (group != domain->groups);
if (!busiest) if (!busiest || this_load >= max_load)
goto out_balanced; goto out_balanced;
avg_load = total_load / total_nr_cpus; avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
if (this_load >= avg_load) if (idle == NOT_IDLE) {
goto out_balanced; if (this_load >= avg_load ||
100*max_load <= domain->imbalance_pct*this_load)
if (idle == NOT_IDLE && 100*max_load <= domain->imbalance_pct*this_load)
goto out_balanced; goto out_balanced;
}
/* /*
* We're trying to get all the cpus to the average_load, so we don't * We're trying to get all the cpus to the average_load, so we don't
...@@ -1461,15 +1440,44 @@ find_busiest_group(struct sched_domain *domain, int this_cpu, ...@@ -1461,15 +1440,44 @@ find_busiest_group(struct sched_domain *domain, int this_cpu,
* appear as very large values with unsigned longs. * appear as very large values with unsigned longs.
*/ */
*imbalance = (min(max_load - avg_load, avg_load - this_load) + 1) / 2; *imbalance = (min(max_load - avg_load, avg_load - this_load) + 1) / 2;
/* Get rid of the scaling factor, rounding *up* as we divide */
*imbalance = (*imbalance + SCHED_LOAD_SCALE/2 + 1)
>> SCHED_LOAD_SHIFT;
if (*imbalance == 0) if (*imbalance <= SCHED_LOAD_SCALE/2) {
unsigned long pwr_now = 0, pwr_move = 0;
unsigned long tmp;
/*
* OK, we don't have enough imbalance to justify moving tasks,
* however we may be able to increase total CPU power used by
* moving them.
*/
pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
pwr_now >>= SCHED_LOAD_SHIFT;
/* Amount of load we'd subtract */
tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
if (max_load > tmp)
pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
max_load - tmp);
/* Amount of load we'd add */
tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
pwr_move += this->cpu_power*min(this->cpu_power, this_load + tmp);
pwr_move >>= SCHED_LOAD_SHIFT;
/* Move if we gain another 8th of a CPU worth of throughput */
if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
goto out_balanced; goto out_balanced;
*imbalance = 1;
return busiest;
}
/* How many tasks to actually move to equalise the imbalance */ /* How many tasks to actually move to equalise the imbalance */
*imbalance *= min(busiest_nr_cpus, this_nr_cpus); *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
>> SCHED_LOAD_SHIFT;
/* Get rid of the scaling factor, rounding *up* as we divide */
*imbalance = (*imbalance + SCHED_LOAD_SCALE/2) >> SCHED_LOAD_SHIFT;
return busiest; return busiest;
...@@ -1550,16 +1558,10 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, ...@@ -1550,16 +1558,10 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
if (!balanced && nr_moved == 0) if (!balanced && nr_moved == 0)
failed = 1; failed = 1;
if (domain->flags & SD_FLAG_IDLE && failed && busiest && if (failed && busiest &&
domain->nr_balance_failed > domain->cache_nice_tries) { domain->nr_balance_failed > domain->cache_nice_tries) {
int i;
for_each_cpu_mask(i, group->cpumask) {
int wake = 0; int wake = 0;
if (!cpu_online(i))
continue;
busiest = cpu_rq(i);
spin_lock(&busiest->lock); spin_lock(&busiest->lock);
if (!busiest->active_balance) { if (!busiest->active_balance) {
busiest->active_balance = 1; busiest->active_balance = 1;
...@@ -1570,7 +1572,6 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, ...@@ -1570,7 +1572,6 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
if (wake) if (wake)
wake_up_process(busiest->migration_thread); wake_up_process(busiest->migration_thread);
} }
}
if (failed) if (failed)
domain->nr_balance_failed++; domain->nr_balance_failed++;
...@@ -3325,12 +3326,14 @@ static void __init arch_init_sched_domains(void) ...@@ -3325,12 +3326,14 @@ static void __init arch_init_sched_domains(void)
continue; continue;
node->cpumask = nodemask; node->cpumask = nodemask;
node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask);
for_each_cpu_mask(j, node->cpumask) { for_each_cpu_mask(j, node->cpumask) {
struct sched_group *cpu = &sched_group_cpus[j]; struct sched_group *cpu = &sched_group_cpus[j];
cpus_clear(cpu->cpumask); cpus_clear(cpu->cpumask);
cpu_set(j, cpu->cpumask); cpu_set(j, cpu->cpumask);
cpu->cpu_power = SCHED_LOAD_SCALE;
if (!first_cpu) if (!first_cpu)
first_cpu = cpu; first_cpu = cpu;
...@@ -3377,6 +3380,7 @@ static void __init arch_init_sched_domains(void) ...@@ -3377,6 +3380,7 @@ static void __init arch_init_sched_domains(void)
cpus_clear(cpu->cpumask); cpus_clear(cpu->cpumask);
cpu_set(i, cpu->cpumask); cpu_set(i, cpu->cpumask);
cpu->cpu_power = SCHED_LOAD_SCALE;
if (!first_cpu) if (!first_cpu)
first_cpu = cpu; first_cpu = cpu;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment