Commit 6ae11b27 authored by Lee Schermerhorn, committed by Linus Torvalds

hugetlb: add nodemask arg to huge page alloc, free and surplus adjust functions

In preparation for constraining huge page allocation and freeing by the
controlling task's numa mempolicy, add a "nodes_allowed" nodemask pointer
to the allocate, free and surplus adjustment functions.  For now, pass
node_online_map to request the default behavior, i.e., consider all
on-line nodes.  A subsequent patch will derive a non-default mask from
the controlling task's numa mempolicy.

Note that this method of updating the global hstate nr_hugepages under the
constraint of a nodemask simplifies keeping the global state
consistent--especially the number of persistent and surplus pages relative
to reservations and overcommit limits.  There are undoubtedly other ways
to do this, but this approach works for both interfaces: mempolicy and
per-node attributes.
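
The round-robin walk that the new next_node_allowed()/get_valid_node_allowed()
helpers implement can be illustrated with a minimal standalone sketch (the real
kernel code is in the diff below).  This is plain C, not kernel code: the
nodemask is modeled as an unsigned long bitmask, the hstate is reduced to a
hypothetical toy_hstate with a single next_nid_to_alloc field, and
next_node()/first_node() are reimplemented locally, so the names and values
here are illustrative assumptions rather than the kernel API.

```c
/*
 * Standalone sketch of the "next allowed node" round-robin logic.
 * Plain C, not kernel code: the nodemask is an unsigned long bitmask
 * and the hstate is reduced to one hint field.
 */
#include <stdio.h>

#define MAX_NUMNODES 8

struct toy_hstate {
	int next_nid_to_alloc;
};

/* next node set in mask after nid, or MAX_NUMNODES if none */
static int next_node(int nid, unsigned long mask)
{
	for (int n = nid + 1; n < MAX_NUMNODES; n++)
		if (mask & (1UL << n))
			return n;
	return MAX_NUMNODES;
}

static int first_node(unsigned long mask)
{
	return next_node(-1, mask);
}

/* advance to the next allowed node, wrapping at the end of the mask */
static int next_node_allowed(int nid, unsigned long mask)
{
	nid = next_node(nid, mask);
	if (nid == MAX_NUMNODES)
		nid = first_node(mask);
	return nid;
}

/* make sure the saved hint is itself an allowed node */
static int get_valid_node_allowed(int nid, unsigned long mask)
{
	if (!(mask & (1UL << nid)))
		nid = next_node_allowed(nid, mask);
	return nid;
}

/* return "this node" and advance the per-hstate hint, as in the patch */
static int hstate_next_node_to_alloc(struct toy_hstate *h, unsigned long mask)
{
	int nid = get_valid_node_allowed(h->next_nid_to_alloc, mask);

	h->next_nid_to_alloc = next_node_allowed(nid, mask);
	return nid;
}

int main(void)
{
	struct toy_hstate h = { .next_nid_to_alloc = 0 };
	unsigned long allowed = (1UL << 1) | (1UL << 3) | (1UL << 6);

	/* six "allocations" cycle over the allowed nodes: 1, 3, 6, 1, 3, 6 */
	for (int i = 0; i < 6; i++)
		printf("allocate on node %d\n",
		       hstate_next_node_to_alloc(&h, allowed));
	return 0;
}
```

The key design point, carried over from the existing code, is that the saved
hint is advanced on every call whether or not the caller's allocation
ultimately succeeds, so successive allocations interleave across whichever
nodes the mask allows; this patch only generalizes the mask from
node_online_map to an arbitrary nodes_allowed.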

[rientjes@google.com: fix HIGHMEM compile error]
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Reviewed-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: David Rientjes <rientjes@google.com>
Reviewed-by: Andi Kleen <andi@firstfloor.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Adam Litke <agl@us.ibm.com>
Cc: Andy Whitcroft <apw@canonical.com>
Cc: Eric Whitney <eric.whitney@hp.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 9a76db09
@@ -622,48 +622,56 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
 }
 
 /*
- * common helper function for hstate_next_node_to_{alloc|free}.
- * return next node in node_online_map, wrapping at end.
+ * common helper functions for hstate_next_node_to_{alloc|free}.
+ * We may have allocated or freed a huge page based on a different
+ * nodes_allowed previously, so h->next_node_to_{alloc|free} might
+ * be outside of *nodes_allowed.  Ensure that we use an allowed
+ * node for alloc or free.
  */
-static int next_node_allowed(int nid)
+static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
 {
-        nid = next_node(nid, node_online_map);
+        nid = next_node(nid, *nodes_allowed);
         if (nid == MAX_NUMNODES)
-                nid = first_node(node_online_map);
+                nid = first_node(*nodes_allowed);
         VM_BUG_ON(nid >= MAX_NUMNODES);
 
         return nid;
 }
 
+static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
+{
+        if (!node_isset(nid, *nodes_allowed))
+                nid = next_node_allowed(nid, nodes_allowed);
+        return nid;
+}
+
 /*
- * Use a helper variable to find the next node and then
- * copy it back to next_nid_to_alloc afterwards:
- * otherwise there's a window in which a racer might
- * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node.
- * But we don't need to use a spin_lock here: it really
- * doesn't matter if occasionally a racer chooses the
- * same nid as we do.  Move nid forward in the mask even
- * if we just successfully allocated a hugepage so that
- * the next caller gets hugepages on the next node.
+ * returns the previously saved node ["this node"] from which to
+ * allocate a persistent huge page for the pool and advance the
+ * next node from which to allocate, handling wrap at end of node
+ * mask.
  */
-static int hstate_next_node_to_alloc(struct hstate *h)
+static int hstate_next_node_to_alloc(struct hstate *h,
+                                        nodemask_t *nodes_allowed)
 {
-        int nid, next_nid;
+        int nid;
+
+        VM_BUG_ON(!nodes_allowed);
+
+        nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed);
+        h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed);
 
-        nid = h->next_nid_to_alloc;
-        next_nid = next_node_allowed(nid);
-        h->next_nid_to_alloc = next_nid;
         return nid;
 }
 
-static int alloc_fresh_huge_page(struct hstate *h)
+static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
 {
         struct page *page;
         int start_nid;
         int next_nid;
         int ret = 0;
 
-        start_nid = hstate_next_node_to_alloc(h);
+        start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
         next_nid = start_nid;
 
         do {
@@ -672,7 +680,7 @@ static int alloc_fresh_huge_page(struct hstate *h)
                         ret = 1;
                         break;
                 }
-                next_nid = hstate_next_node_to_alloc(h);
+                next_nid = hstate_next_node_to_alloc(h, nodes_allowed);
         } while (next_nid != start_nid);
 
         if (ret)
@@ -684,18 +692,20 @@ static int alloc_fresh_huge_page(struct hstate *h)
 }
 
 /*
- * helper for free_pool_huge_page() - return the next node
- * from which to free a huge page.  Advance the next node id
- * whether or not we find a free huge page to free so that the
- * next attempt to free addresses the next node.
+ * helper for free_pool_huge_page() - return the previously saved
+ * node ["this node"] from which to free a huge page.  Advance the
+ * next node id whether or not we find a free huge page to free so
+ * that the next attempt to free addresses the next node.
  */
-static int hstate_next_node_to_free(struct hstate *h)
+static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
 {
-        int nid, next_nid;
+        int nid;
+
+        VM_BUG_ON(!nodes_allowed);
+
+        nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
+        h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
 
-        nid = h->next_nid_to_free;
-        next_nid = next_node_allowed(nid);
-        h->next_nid_to_free = next_nid;
         return nid;
 }
@@ -705,13 +715,14 @@ static int hstate_next_node_to_free(struct hstate *h)
  * balanced over allowed nodes.
  * Called with hugetlb_lock locked.
  */
-static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
+static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+                                                 bool acct_surplus)
 {
         int start_nid;
         int next_nid;
         int ret = 0;
 
-        start_nid = hstate_next_node_to_free(h);
+        start_nid = hstate_next_node_to_free(h, nodes_allowed);
         next_nid = start_nid;
 
         do {
@@ -735,7 +746,7 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
                         ret = 1;
                         break;
                 }
-                next_nid = hstate_next_node_to_free(h);
+                next_nid = hstate_next_node_to_free(h, nodes_allowed);
         } while (next_nid != start_nid);
 
         return ret;
@@ -937,7 +948,7 @@ static void return_unused_surplus_pages(struct hstate *h,
          * on-line nodes for us and will handle the hstate accounting.
          */
         while (nr_pages--) {
-                if (!free_pool_huge_page(h, 1))
+                if (!free_pool_huge_page(h, &node_online_map, 1))
                         break;
         }
 }
@@ -1047,7 +1058,8 @@ int __weak alloc_bootmem_huge_page(struct hstate *h)
                 void *addr;
 
                 addr = __alloc_bootmem_node_nopanic(
-                                NODE_DATA(hstate_next_node_to_alloc(h)),
+                                NODE_DATA(hstate_next_node_to_alloc(h,
+                                                        &node_online_map)),
                                 huge_page_size(h), huge_page_size(h), 0);
 
                 if (addr) {
@@ -1102,7 +1114,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
                 if (h->order >= MAX_ORDER) {
                         if (!alloc_bootmem_huge_page(h))
                                 break;
-                } else if (!alloc_fresh_huge_page(h))
+                } else if (!alloc_fresh_huge_page(h, &node_online_map))
                         break;
         }
         h->max_huge_pages = i;
@@ -1144,14 +1156,15 @@ static void __init report_hugepages(void)
 }
 
 #ifdef CONFIG_HIGHMEM
-static void try_to_free_low(struct hstate *h, unsigned long count)
+static void try_to_free_low(struct hstate *h, unsigned long count,
+                                                nodemask_t *nodes_allowed)
 {
         int i;
 
         if (h->order >= MAX_ORDER)
                 return;
 
-        for (i = 0; i < MAX_NUMNODES; ++i) {
+        for_each_node_mask(i, *nodes_allowed) {
                 struct page *page, *next;
                 struct list_head *freel = &h->hugepage_freelists[i];
                 list_for_each_entry_safe(page, next, freel, lru) {
@@ -1167,7 +1180,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count)
         }
 }
 #else
-static inline void try_to_free_low(struct hstate *h, unsigned long count)
+static inline void try_to_free_low(struct hstate *h, unsigned long count,
+                                                nodemask_t *nodes_allowed)
 {
 }
 #endif
@@ -1177,7 +1191,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count)
  * balanced by operating on them in a round-robin fashion.
  * Returns 1 if an adjustment was made.
  */
-static int adjust_pool_surplus(struct hstate *h, int delta)
+static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
+                                int delta)
 {
         int start_nid, next_nid;
         int ret = 0;
@@ -1185,9 +1200,9 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
         VM_BUG_ON(delta != -1 && delta != 1);
 
         if (delta < 0)
-                start_nid = hstate_next_node_to_alloc(h);
+                start_nid = hstate_next_node_to_alloc(h, nodes_allowed);
         else
-                start_nid = hstate_next_node_to_free(h);
+                start_nid = hstate_next_node_to_free(h, nodes_allowed);
         next_nid = start_nid;
 
         do {
@@ -1197,7 +1212,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
                          * To shrink on this node, there must be a surplus page
                          */
                         if (!h->surplus_huge_pages_node[nid]) {
-                                next_nid = hstate_next_node_to_alloc(h);
+                                next_nid = hstate_next_node_to_alloc(h,
+                                                                nodes_allowed);
                                 continue;
                         }
                 }
@@ -1207,7 +1223,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
                          */
                         if (h->surplus_huge_pages_node[nid] >=
                                                 h->nr_huge_pages_node[nid]) {
-                                next_nid = hstate_next_node_to_free(h);
+                                next_nid = hstate_next_node_to_free(h,
+                                                                nodes_allowed);
                                 continue;
                         }
                 }
@@ -1222,7 +1239,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta)
 }
 
 #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
-static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
+static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
+                                                nodemask_t *nodes_allowed)
 {
         unsigned long min_count, ret;
@@ -1242,7 +1260,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
          */
         spin_lock(&hugetlb_lock);
         while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
-                if (!adjust_pool_surplus(h, -1))
+                if (!adjust_pool_surplus(h, nodes_allowed, -1))
                         break;
         }
@@ -1253,7 +1271,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
                  * and reducing the surplus.
                  */
                 spin_unlock(&hugetlb_lock);
-                ret = alloc_fresh_huge_page(h);
+                ret = alloc_fresh_huge_page(h, nodes_allowed);
                 spin_lock(&hugetlb_lock);
                 if (!ret)
                         goto out;
@@ -1277,13 +1295,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count)
          */
         min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
         min_count = max(count, min_count);
-        try_to_free_low(h, min_count);
+        try_to_free_low(h, min_count, nodes_allowed);
         while (min_count < persistent_huge_pages(h)) {
-                if (!free_pool_huge_page(h, 0))
+                if (!free_pool_huge_page(h, nodes_allowed, 0))
                         break;
         }
         while (count < persistent_huge_pages(h)) {
-                if (!adjust_pool_surplus(h, 1))
+                if (!adjust_pool_surplus(h, nodes_allowed, 1))
                         break;
         }
 out:
@@ -1329,7 +1347,7 @@ static ssize_t nr_hugepages_store(struct kobject *kobj,
         if (err)
                 return 0;
 
-        h->max_huge_pages = set_max_huge_pages(h, input);
+        h->max_huge_pages = set_max_huge_pages(h, input, &node_online_map);
 
         return count;
 }
@@ -1571,7 +1589,8 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
         proc_doulongvec_minmax(table, write, buffer, length, ppos);
 
         if (write)
-                h->max_huge_pages = set_max_huge_pages(h, tmp);
+                h->max_huge_pages = set_max_huge_pages(h, tmp,
+                                                        &node_online_map);
         return 0;
 }