Commit 4142c3eb authored by Rik van Riel, committed by Ingo Molnar

sched/numa: Spread memory according to CPU and memory use

The pseudo-interleaving in NUMA placement has a fundamental problem:
using hard usage thresholds to spread memory equally between nodes
can prevent workloads from converging, or keep memory "trapped" on
nodes where the workload is barely running any more.

In order for workloads to properly converge, the memory migration
should not be stopped when nodes reach parity, but instead be
distributed according to how heavily memory is used from each node.
This way memory migration and task migration reinforce each other,
instead of one putting the brakes on the other.
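
To make that concrete, here is a small standalone C sketch of the comparison
this patch introduces in should_numa_migrate_memory(): a page is migrated
toward the faulting node only when that node's CPU-use/memory-use ratio,
scaled by the 3/4 hysteresis factor described below, exceeds the ratio on the
node currently holding the page. This is only an illustration, not the kernel
code; the helper name and the sample fault counts are invented.

    /* Standalone illustration; fault counts are invented example values. */
    #include <stdio.h>
    #include <stdbool.h>

    /*
     * Cross-multiplied form of:
     *   faults_cpu(dst)/faults_mem(dst) * 3/4 > faults_cpu(src)/faults_mem(src)
     */
    static bool migrate_toward_dst(unsigned long cpu_dst, unsigned long mem_dst,
                                   unsigned long cpu_src, unsigned long mem_src)
    {
        return cpu_dst * mem_src * 3 > cpu_src * mem_dst * 4;
    }

    int main(void)
    {
        /* dst runs most of the CPU work but holds little memory: migrate. */
        printf("%d\n", migrate_toward_dst(900, 100, 100, 900)); /* prints 1 */
        /* dst already holds the memory and does little CPU work: stay put. */
        printf("%d\n", migrate_toward_dst(100, 900, 900, 100)); /* prints 0 */
        return 0;
    }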

Remove the hard thresholds from the pseudo-interleaving code, and
instead use a more gradual policy on memory placement. This also
seems to improve convergence of workloads that do not run flat out,
but sleep in between bursts of activity.

We still want to slow down NUMA scanning and migration once a workload
has settled on a few actively used nodes, so keep the 3/4 hysteresis
in place. Keep track of whether a workload is actively running on
multiple nodes, so task_numa_migrate does a full scan of the system
for better task placement.
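
For reference, a minimal standalone sketch of the active-node bookkeeping
behind that hysteresis: a node counts as active when it triggers more than
1/3 of the CPU faults seen on the busiest node, and a count above 1 makes
task_numa_migrate widen its search. The helper below and the per-node fault
counts are invented for illustration; the kernel variant operates on the
numa_group fault statistics.

    /* Standalone illustration of the 1/3-of-maximum active-node rule. */
    #include <stdio.h>

    #define ACTIVE_NODE_FRACTION 3

    static int count_active_nodes(const unsigned long *faults_cpu, int nr_nodes)
    {
        unsigned long max_faults = 0;
        int nid, active_nodes = 0;

        for (nid = 0; nid < nr_nodes; nid++)
            if (faults_cpu[nid] > max_faults)
                max_faults = faults_cpu[nid];

        for (nid = 0; nid < nr_nodes; nid++)
            if (faults_cpu[nid] * ACTIVE_NODE_FRACTION > max_faults)
                active_nodes++;

        return active_nodes;
    }

    int main(void)
    {
        /* Invented per-node CPU fault counts: only nodes 0 and 1 are active. */
        unsigned long faults[4] = { 900, 450, 200, 10 };

        printf("active nodes: %d\n", count_active_nodes(faults, 4));
        return 0;
    }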

In the case of running 3 SPECjbb2005 instances on a 4 node system,
this code seems to result in fairer distribution of memory between
nodes, with more memory bandwidth for each instance.
Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: mgorman@suse.de
Link: http://lkml.kernel.org/r/20160125170739.2fc9a641@annuminas.surriel.com
[ Minor readability tweaks. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent cb251765
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -932,10 +932,11 @@ struct numa_group {
 	spinlock_t lock; /* nr_tasks, tasks */
 	int nr_tasks;
 	pid_t gid;
+	int active_nodes;
 
 	struct rcu_head rcu;
-	nodemask_t active_nodes;
 	unsigned long total_faults;
+	unsigned long max_faults_cpu;
 	/*
 	 * Faults_cpu is used to decide whether memory should move
 	 * towards the CPU. As a consequence, these stats are weighted
@@ -994,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
 		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
 }
 
+/*
+ * A node triggering more than 1/3 as many NUMA faults as the maximum is
+ * considered part of a numa group's pseudo-interleaving set. Migrations
+ * between these nodes are slowed down, to allow things to settle down.
+ */
+#define ACTIVE_NODE_FRACTION 3
+
+static bool numa_is_active_node(int nid, struct numa_group *ng)
+{
+	return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
+}
+
 /* Handle placement on systems where not all nodes are directly connected. */
 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
 					int maxdist, bool task)
@@ -1143,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 		return true;
 
 	/*
-	 * Do not migrate if the destination is not a node that
-	 * is actively used by this numa group.
+	 * Destination node is much more heavily used than the source
+	 * node? Allow migration.
 	 */
-	if (!node_isset(dst_nid, ng->active_nodes))
-		return false;
-
-	/*
-	 * Source is a node that is not actively used by this
-	 * numa group, while the destination is. Migrate.
-	 */
-	if (!node_isset(src_nid, ng->active_nodes))
+	if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
+					ACTIVE_NODE_FRACTION)
 		return true;
 
 	/*
-	 * Both source and destination are nodes in active
-	 * use by this numa group. Maximize memory bandwidth
-	 * by migrating from more heavily used groups, to less
-	 * heavily used ones, spreading the load around.
-	 * Use a 1/4 hysteresis to avoid spurious page movement.
+	 * Distribute memory according to CPU & memory use on each node,
+	 * with 3/4 hysteresis to avoid unnecessary memory migrations:
+	 *
+	 * faults_cpu(dst)   3   faults_cpu(src)
+	 * --------------- * - > ---------------
+	 * faults_mem(dst)   4   faults_mem(src)
 	 */
-	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+	return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
+	       group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
 }
 
 static unsigned long weighted_cpuload(const int cpu);
@@ -1509,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p)
 
 		.best_task = NULL,
 		.best_imp = 0,
-		.best_cpu = -1
+		.best_cpu = -1,
 	};
 	struct sched_domain *sd;
 	unsigned long taskweight, groupweight;
@@ -1561,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p)
 	 * multiple NUMA nodes; in order to better consolidate the group,
 	 * we need to check other locations.
 	 */
-	if (env.best_cpu == -1 || (p->numa_group &&
-			nodes_weight(p->numa_group->active_nodes) > 1)) {
+	if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
 		for_each_online_node(nid) {
 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
 				continue;
@@ -1597,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p)
 	 * trying for a better one later. Do not set the preferred node here.
 	 */
 	if (p->numa_group) {
+		struct numa_group *ng = p->numa_group;
+
 		if (env.best_cpu == -1)
 			nid = env.src_nid;
 		else
 			nid = env.dst_nid;
 
-		if (node_isset(nid, p->numa_group->active_nodes))
+		if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
 			sched_setnuma(p, env.dst_nid);
 	}
 
@@ -1652,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p)
 }
 
 /*
- * Find the nodes on which the workload is actively running. We do this by
+ * Find out how many nodes on the workload is actively running. Do this by
  * tracking the nodes from which NUMA hinting faults are triggered. This can
  * be different from the set of nodes where the workload's memory is currently
  * located.
- *
- * The bitmask is used to make smarter decisions on when to do NUMA page
- * migrations, To prevent flip-flopping, and excessive page migrations, nodes
- * are added when they cause over 6/16 of the maximum number of faults, but
- * only removed when they drop below 3/16.
 */
-static void update_numa_active_node_mask(struct numa_group *numa_group)
+static void numa_group_count_active_nodes(struct numa_group *numa_group)
 {
 	unsigned long faults, max_faults = 0;
-	int nid;
+	int nid, active_nodes = 0;
 
 	for_each_online_node(nid) {
 		faults = group_faults_cpu(numa_group, nid);
@@ -1675,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
 
 	for_each_online_node(nid) {
 		faults = group_faults_cpu(numa_group, nid);
-		if (!node_isset(nid, numa_group->active_nodes)) {
-			if (faults > max_faults * 6 / 16)
-				node_set(nid, numa_group->active_nodes);
-		} else if (faults < max_faults * 3 / 16)
-			node_clear(nid, numa_group->active_nodes);
+		if (faults * ACTIVE_NODE_FRACTION > max_faults)
+			active_nodes++;
 	}
+
+	numa_group->max_faults_cpu = max_faults;
+	numa_group->active_nodes = active_nodes;
 }
 
 /*
@@ -1971,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p)
 	update_task_scan_period(p, fault_types[0], fault_types[1]);
 
 	if (p->numa_group) {
-		update_numa_active_node_mask(p->numa_group);
+		numa_group_count_active_nodes(p->numa_group);
 		spin_unlock_irq(group_lock);
 		max_nid = preferred_group_nid(p, max_group_nid);
 	}
@@ -2015,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
 			return;
 
 		atomic_set(&grp->refcount, 1);
+		grp->active_nodes = 1;
+		grp->max_faults_cpu = 0;
 		spin_lock_init(&grp->lock);
 		grp->gid = p->pid;
 		/* Second half of the array tracks nids where faults happen */
 		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
 						nr_node_ids;
 
-		node_set(task_node(current), grp->active_nodes);
-
 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
 			grp->faults[i] = p->numa_faults[i];
 
@@ -2136,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	bool migrated = flags & TNF_MIGRATED;
 	int cpu_node = task_node(current);
 	int local = !!(flags & TNF_FAULT_LOCAL);
+	struct numa_group *ng;
 	int priv;
 
 	if (!static_branch_likely(&sched_numa_balancing))
@@ -2176,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
 	 * actively using should be counted as local. This allows the
 	 * scan rate to slow down when a workload has settled down.
 	 */
-	if (!priv && !local && p->numa_group &&
-	    node_isset(cpu_node, p->numa_group->active_nodes) &&
-	    node_isset(mem_node, p->numa_group->active_nodes))
+	ng = p->numa_group;
+	if (!priv && !local && ng && ng->active_nodes > 1 &&
+	    numa_is_active_node(cpu_node, ng) &&
+	    numa_is_active_node(mem_node, ng))
 		local = 1;
 
 	task_numa_placement(p);