Commit 04bb2f94 authored by Rik van Riel, committed by Ingo Molnar

sched/numa: Adjust scan rate in task_numa_placement

Adjust numa_scan_period in task_numa_placement, depending on how much
useful work the numa code can do. The more local faults there are in a
given scan window, the longer the period (and hence the slower the scan rate)
during the next window. If there are excessive shared faults then the scan
period will decrease, with the amount of scaling depending on the ratio of
shared to private faults. If the preferred node changes then the scan rate
is reset to recheck whether the task is properly placed.
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-59-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
parent 3e6a9418
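The adjustment described above is slot based: the current scan period is divided into NUMA_PERIOD_SLOTS increments, the local/remote fault ratio selects how many increments to add or subtract, and any decrease is scaled back by the private/shared ratio. The standalone C sketch below illustrates that arithmetic only; it is not the kernel code. The clamp limits and the clamp_period()/adjust_scan_period() names are invented for illustration, while the slot math follows the update_task_scan_period() added by this patch.

/*
 * Standalone sketch of the scan period adjustment (illustration only).
 * The clamp limits and function names here are invented; only the slot
 * arithmetic mirrors update_task_scan_period() in the diff below.
 */
#include <stdio.h>

#define NUMA_PERIOD_SLOTS     10
#define NUMA_PERIOD_THRESHOLD 3

#define SCAN_PERIOD_MIN_MS  1000        /* illustrative lower bound */
#define SCAN_PERIOD_MAX_MS 60000        /* illustrative upper bound */

static unsigned int clamp_period(long period)
{
    if (period < SCAN_PERIOD_MIN_MS)
        return SCAN_PERIOD_MIN_MS;
    if (period > SCAN_PERIOD_MAX_MS)
        return SCAN_PERIOD_MAX_MS;
    return (unsigned int)period;
}

/* New scan period (ms) from last window's local/remote and private/shared counts. */
static unsigned int adjust_scan_period(unsigned int period_ms,
                                       unsigned long local, unsigned long remote,
                                       unsigned long private, unsigned long shared)
{
    unsigned int period_slot = (period_ms + NUMA_PERIOD_SLOTS - 1) / NUMA_PERIOD_SLOTS;
    long diff;
    int ratio;

    /* No hinting faults recorded in the window: back off (the patch doubles the period). */
    if (local + remote == 0)
        return clamp_period((long)period_ms * 2);

    /* Map the local/remote ratio onto 0..NUMA_PERIOD_SLOTS slots. */
    ratio = (int)((local * NUMA_PERIOD_SLOTS) / (local + remote));

    if (ratio >= NUMA_PERIOD_THRESHOLD) {
        /* Mostly local: lengthen the period (scan slower). */
        int slot = ratio - NUMA_PERIOD_THRESHOLD;
        if (!slot)
            slot = 1;
        diff = (long)slot * period_slot;
    } else {
        /* Mostly remote: shorten the period (scan faster), but scale the
         * decrease back when shared faults dominate, since faster scanning
         * would mostly bounce pages between nodes. */
        diff = -(long)(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
        ratio = (int)((private * NUMA_PERIOD_SLOTS + private + shared - 1) /
                      (private + shared));
        diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
    }

    return clamp_period((long)period_ms + diff);
}

int main(void)
{
    /* 90% local faults: the 10000ms period grows to 16000ms (scan slower). */
    printf("mostly local : %u ms\n", adjust_scan_period(10000, 900, 100, 800, 200));
    /* 10% local, mostly private faults: the period shrinks to 8400ms (scan faster). */
    printf("mostly remote: %u ms\n", adjust_scan_period(10000, 100, 900, 800, 200));
    return 0;
}

Running the sketch shows the period growing when faults are mostly local and shrinking, by a damped amount, when they are mostly remote and private.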
@@ -1365,6 +1365,14 @@ struct task_struct {
      */
     unsigned long *numa_faults_buffer;
+    /*
+     * numa_faults_locality tracks if faults recorded during the last
+     * scan window were remote/local. The task scan period is adapted
+     * based on the locality of the faults with different weights
+     * depending on whether they were shared or private faults
+     */
+    unsigned long numa_faults_locality[2];
     int numa_preferred_nid;
     unsigned long numa_pages_migrated;
 #endif /* CONFIG_NUMA_BALANCING */
@@ -1455,6 +1463,7 @@ struct task_struct {
 #define TNF_MIGRATED 0x01
 #define TNF_NO_GROUP 0x02
 #define TNF_SHARED 0x04
+#define TNF_FAULT_LOCAL 0x08
 #ifdef CONFIG_NUMA_BALANCING
 extern void task_numa_fault(int last_node, int node, int pages, int flags);
......
@@ -1241,6 +1241,12 @@ static int task_numa_migrate(struct task_struct *p)
     sched_setnuma(p, env.dst_nid);
+    /*
+     * Reset the scan period if the task is being rescheduled on an
+     * alternative node to recheck if the tasks is now properly placed.
+     */
+    p->numa_scan_period = task_scan_min(p);
     if (env.best_task == NULL) {
         int ret = migrate_task_to(p, env.best_cpu);
         return ret;
@@ -1276,10 +1282,86 @@ static void numa_migrate_preferred(struct task_struct *p)
     p->numa_migrate_retry = jiffies + HZ*5;
 }
+/*
+ * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
+ * increments. The more local the fault statistics are, the higher the scan
+ * period will be for the next scan window. If local/remote ratio is below
+ * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
+ * scan period will decrease
+ */
+#define NUMA_PERIOD_SLOTS 10
+#define NUMA_PERIOD_THRESHOLD 3
+
+/*
+ * Increase the scan period (slow down scanning) if the majority of
+ * our memory is already on our local node, or if the majority of
+ * the page accesses are shared with other processes.
+ * Otherwise, decrease the scan period.
+ */
+static void update_task_scan_period(struct task_struct *p,
+            unsigned long shared, unsigned long private)
+{
+    unsigned int period_slot;
+    int ratio;
+    int diff;
+    unsigned long remote = p->numa_faults_locality[0];
+    unsigned long local = p->numa_faults_locality[1];
+
+    /*
+     * If there were no record hinting faults then either the task is
+     * completely idle or all activity is areas that are not of interest
+     * to automatic numa balancing. Scan slower
+     */
+    if (local + shared == 0) {
+        p->numa_scan_period = min(p->numa_scan_period_max,
+                p->numa_scan_period << 1);
+        p->mm->numa_next_scan = jiffies +
+                msecs_to_jiffies(p->numa_scan_period);
+        return;
+    }
+
+    /*
+     * Prepare to scale scan period relative to the current period.
+     *   == NUMA_PERIOD_THRESHOLD scan period stays the same
+     *   <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
+     *   >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
+     */
+    period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
+    ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+    if (ratio >= NUMA_PERIOD_THRESHOLD) {
+        int slot = ratio - NUMA_PERIOD_THRESHOLD;
+        if (!slot)
+            slot = 1;
+        diff = slot * period_slot;
+    } else {
+        diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
+
+        /*
+         * Scale scan rate increases based on sharing. There is an
+         * inverse relationship between the degree of sharing and
+         * the adjustment made to the scanning period. Broadly
+         * speaking the intent is that there is little point
+         * scanning faster if shared accesses dominate as it may
+         * simply bounce migrations uselessly
+         */
+        period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
+        ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
+        diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+    }
+
+    p->numa_scan_period = clamp(p->numa_scan_period + diff,
+            task_scan_min(p), task_scan_max(p));
+    memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+}
 static void task_numa_placement(struct task_struct *p)
 {
     int seq, nid, max_nid = -1, max_group_nid = -1;
     unsigned long max_faults = 0, max_group_faults = 0;
+    unsigned long fault_types[2] = { 0, 0 };
     spinlock_t *group_lock = NULL;
     seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -1309,6 +1391,7 @@ static void task_numa_placement(struct task_struct *p)
             /* Decay existing window, copy faults since last scan */
             p->numa_faults[i] >>= 1;
             p->numa_faults[i] += p->numa_faults_buffer[i];
+            fault_types[priv] += p->numa_faults_buffer[i];
             p->numa_faults_buffer[i] = 0;
             faults += p->numa_faults[i];
@@ -1333,6 +1416,8 @@ static void task_numa_placement(struct task_struct *p)
         }
     }
+    update_task_scan_period(p, fault_types[0], fault_types[1]);
     if (p->numa_group) {
         /*
          * If the preferred task and group nids are different,
@@ -1538,6 +1623,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
         BUG_ON(p->numa_faults_buffer);
         p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
         p->total_numa_faults = 0;
+        memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
     }
     /*
@@ -1552,19 +1638,6 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
         task_numa_group(p, last_cpupid, flags, &priv);
     }
-    /*
-     * If pages are properly placed (did not migrate) then scan slower.
-     * This is reset periodically in case of phase changes
-     */
-    if (!migrated) {
-        /* Initialise if necessary */
-        if (!p->numa_scan_period_max)
-            p->numa_scan_period_max = task_scan_max(p);
-        p->numa_scan_period = min(p->numa_scan_period_max,
-                p->numa_scan_period + 10);
-    }
     task_numa_placement(p);
     /* Retry task to preferred node migration if it previously failed */
@@ -1575,6 +1648,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
         p->numa_pages_migrated += pages;
     p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
+    p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
 }
 static void reset_ptenuma_scan(struct task_struct *p)
@@ -1701,18 +1775,6 @@ void task_numa_work(struct callback_head *work)
     }
 out:
-    /*
-     * If the whole process was scanned without updates then no NUMA
-     * hinting faults are being recorded and scan rate should be lower.
-     */
-    if (mm->numa_scan_offset == 0 && !nr_pte_updates) {
-        p->numa_scan_period = min(p->numa_scan_period_max,
-            p->numa_scan_period << 1);
-        next_scan = now + msecs_to_jiffies(p->numa_scan_period);
-        mm->numa_next_scan = next_scan;
-    }
     /*
      * It is possible to reach the end of the VMA list but the last few
      * VMAs are not guaranteed to the vma_migratable. If they are not, we
......
@@ -1296,8 +1296,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
     page_nid = page_to_nid(page);
     last_cpupid = page_cpupid_last(page);
     count_vm_numa_event(NUMA_HINT_FAULTS);
-    if (page_nid == this_nid)
+    if (page_nid == this_nid) {
         count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+        flags |= TNF_FAULT_LOCAL;
+    }
     /*
      * Avoid grouping on DSO/COW pages in specific and RO pages
......
@@ -3527,13 +3527,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
-                  unsigned long addr, int page_nid)
+                  unsigned long addr, int page_nid,
+                  int *flags)
 {
     get_page(page);
     count_vm_numa_event(NUMA_HINT_FAULTS);
-    if (page_nid == numa_node_id())
+    if (page_nid == numa_node_id()) {
         count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+        *flags |= TNF_FAULT_LOCAL;
+    }
     return mpol_misplaced(page, vma, addr);
 }
@@ -3593,7 +3596,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
     last_cpupid = page_cpupid_last(page);
     page_nid = page_to_nid(page);
-    target_nid = numa_migrate_prep(page, vma, addr, page_nid);
+    target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
     pte_unmap_unlock(ptep, ptl);
     if (target_nid == -1) {
         put_page(page);
......
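The other half of the patch is bookkeeping: every hinting fault is classified as local or remote (TNF_FAULT_LOCAL) and as shared or private, and task_numa_placement() hands the window's totals to update_task_scan_period(), which clears the locality counters afterwards. The sketch below is a hypothetical, simplified model of that flow; the window_stats struct and the record_numa_fault()/end_scan_window() names are invented, and in the kernel the counters live in task_struct and in locals of task_numa_placement().

/*
 * Hypothetical sketch of the per-window fault bookkeeping added by this
 * patch. Names are invented; in the kernel, numa_faults_locality[] lives
 * in task_struct and fault_types[] is local to task_numa_placement().
 */
#include <stdio.h>
#include <string.h>

#define TNF_SHARED      0x04
#define TNF_FAULT_LOCAL 0x08

struct window_stats {
    unsigned long faults_locality[2];  /* [0] remote, [1] local */
    unsigned long fault_types[2];      /* [0] shared, [1] private */
};

/* Called once per NUMA hinting fault with the TNF_* flags for that fault. */
static void record_numa_fault(struct window_stats *ws, int pages, int flags)
{
    ws->faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
    ws->fault_types[!(flags & TNF_SHARED)] += pages;
}

/* Called at the end of a scan window, after the scan period is updated:
 * the counters only describe one window, mirroring the memset() of
 * numa_faults_locality in the diff above. */
static void end_scan_window(struct window_stats *ws)
{
    memset(ws, 0, sizeof(*ws));
}

int main(void)
{
    struct window_stats ws = { {0, 0}, {0, 0} };

    record_numa_fault(&ws, 1, TNF_FAULT_LOCAL);   /* local, private fault */
    record_numa_fault(&ws, 1, TNF_SHARED);        /* remote, shared fault */
    printf("remote=%lu local=%lu shared=%lu private=%lu\n",
           ws.faults_locality[0], ws.faults_locality[1],
           ws.fault_types[0], ws.fault_types[1]);
    end_scan_window(&ws);
    return 0;
}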