Commit 19770b32, authored by Mel Gorman and committed by Linus Torvalds

mm: filter based on a nodemask as well as a gfp_mask

The MPOL_BIND policy creates a zonelist that is used for allocations
controlled by that mempolicy.  As the per-node zonelist is already being
filtered based on a zone id, this patch adds a version of __alloc_pages() that
takes a nodemask for further filtering.  This eliminates the need for
MPOL_BIND to create a custom zonelist.

A positive benefit of this is that allocations using MPOL_BIND now use the
local node's distance-ordered zonelist instead of a custom node-id-ordered
zonelist.  I.e., pages will be allocated from the closest allowed node with
available memory.

[Lee.Schermerhorn@hp.com: Mempolicy: update stale documentation and comments]
[Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask]
[Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask rework]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent dd1a239f
......@@ -182,14 +182,9 @@ Components of Memory Policies
The Default mode does not use the optional set of nodes.
MPOL_BIND: This mode specifies that memory must come from the
set of nodes specified by the policy.
The memory policy APIs do not specify an order in which the nodes
will be searched. However, unlike "local allocation", the Bind
policy does not consider the distance between the nodes. Rather,
allocations will fallback to the nodes specified by the policy in
order of numeric node id. Like everything in Linux, this is subject
to change.
set of nodes specified by the policy. Memory will be allocated from
the node in the set with sufficient free memory that is closest to
the node where the allocation takes place.
MPOL_PREFERRED: This mode specifies that the allocation should be
attempted from the single node specified in the policy. If that
......
......@@ -360,16 +360,17 @@ void invalidate_bdev(struct block_device *bdev)
*/
static void free_more_memory(void)
{
struct zoneref *zrefs;
struct zone *zone;
int nid;
wakeup_pdflush(1024);
yield();
for_each_online_node(nid) {
zrefs = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
gfp_zone(GFP_NOFS));
if (zrefs->zone)
(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
gfp_zone(GFP_NOFS), NULL,
&zone);
if (zone)
try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
GFP_NOFS);
}
......
......@@ -26,7 +26,7 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
#define cpuset_current_mems_allowed (current->mems_allowed)
void cpuset_init_current_mems_allowed(void);
void cpuset_update_task_memory_state(void);
int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask);
extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask);
......@@ -103,7 +103,7 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
static inline void cpuset_init_current_mems_allowed(void) {}
static inline void cpuset_update_task_memory_state(void) {}
static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
return 1;
}
......
......@@ -182,6 +182,10 @@ static inline void arch_alloc_page(struct page *page, int order) { }
extern struct page *__alloc_pages(gfp_t, unsigned int, struct zonelist *);
extern struct page *
__alloc_pages_nodemask(gfp_t, unsigned int,
struct zonelist *, nodemask_t *nodemask);
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
unsigned int order)
{
......
......@@ -54,19 +54,20 @@ struct mm_struct;
* mmap_sem.
*
* Freeing policy:
* When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd.
* All other policies don't have any external state. mpol_free() handles this.
* Mempolicy objects are reference counted. A mempolicy will be freed when
* mpol_free() decrements the reference count to zero.
*
* Copying policy objects:
* For MPOL_BIND the zonelist must be always duplicated. mpol_clone() does this.
* mpol_copy() allocates a new mempolicy and copies the specified mempolicy
* to the new storage. The reference count of the new object is initialized
* to 1, representing the caller of mpol_copy().
*/
struct mempolicy {
atomic_t refcnt;
short policy; /* See MPOL_* above */
union {
struct zonelist *zonelist; /* bind */
short preferred_node; /* preferred */
nodemask_t nodes; /* interleave */
nodemask_t nodes; /* interleave/bind */
/* undefined for default */
} v;
nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */
......@@ -151,7 +152,8 @@ extern void mpol_fix_fork_child_flag(struct task_struct *p);
extern struct mempolicy default_policy;
extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol);
unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask);
extern unsigned slab_node(struct mempolicy *policy);
extern enum zone_type policy_zone;
......@@ -239,8 +241,11 @@ static inline void mpol_fix_fork_child_flag(struct task_struct *p)
}
static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol)
unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask)
{
*mpol = NULL;
*nodemask = NULL;
return node_zonelist(0, gfp_flags);
}
......
......@@ -749,36 +749,60 @@ static inline int zonelist_node_idx(struct zoneref *zoneref)
#endif /* CONFIG_NUMA */
}
static inline void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
zoneref->zone = zone;
zoneref->zone_idx = zone_idx(zone);
}
/**
* next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
* @z - The cursor used as a starting point for the search
* @highest_zoneidx - The zone index of the highest zone to return
* @nodes - An optional nodemask to filter the zonelist with
* @zone - The first suitable zone found is returned via this parameter
*
* This function returns the next zone at or below a given zone index that is
* within the allowed nodemask using a cursor as the starting point for the
* search. The zoneref returned is a cursor that is used as the next starting
* point for future calls to next_zones_zonelist().
*/
struct zoneref *next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx,
nodemask_t *nodes,
struct zone **zone);
/* Returns the first zone at or below highest_zoneidx in a zonelist */
/**
* first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
* @zonelist - The zonelist to search for a suitable zone
* @highest_zoneidx - The zone index of the highest zone to return
* @nodes - An optional nodemask to filter the zonelist with
* @zone - The first suitable zone found is returned via this parameter
*
* This function returns the first zone at or below a given zone index that is
* within the allowed nodemask. The zoneref returned is a cursor that can be
* used to iterate the zonelist with next_zones_zonelist. The cursor should
* not be used by the caller as it does not match the value of the zone
* returned.
*/
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
enum zone_type highest_zoneidx)
enum zone_type highest_zoneidx,
nodemask_t *nodes,
struct zone **zone)
{
struct zoneref *z;
/* Find the first suitable zone to use for the allocation */
z = zonelist->_zonerefs;
while (zonelist_zone_idx(z) > highest_zoneidx)
z++;
return z;
return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes,
zone);
}
/* Returns the next zone at or below highest_zoneidx in a zonelist */
static inline struct zoneref *next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx)
{
/* Find the next suitable zone to use for the allocation */
while (zonelist_zone_idx(z) > highest_zoneidx)
z++;
return z;
}
/**
 * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
 * @zone - The current zone in the iterator
 * @z - Cursor into zlist->_zonerefs; after each step it points one entry past @zone
 * @zlist - The zonelist being iterated
 * @highidx - The zone index of the highest zone to return
 * @nodemask - Nodemask allowed by the allocator, or NULL for no node filtering
 *
 * This iterator iterates through all zones at or below a given zone index and
 * within a given nodemask. The loop ends when @zone becomes NULL.
 *
 * Note: the last line of the definition deliberately carries no trailing
 * line-continuation, so the macro body cannot absorb the following source line.
 */
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
	for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone);	\
		zone;							\
		z = next_zones_zonelist(z, highidx, nodemask, &zone))
/**
* for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
......@@ -790,11 +814,7 @@ static inline struct zoneref *next_zones_zonelist(struct zoneref *z,
* This iterator iterates through all zones at or below a given zone index.
*/
#define for_each_zone_zonelist(zone, z, zlist, highidx) \
for (z = first_zones_zonelist(zlist, highidx), \
zone = zonelist_zone(z++); \
zone; \
z = next_zones_zonelist(z, highidx), \
zone = zonelist_zone(z++))
for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
#ifdef CONFIG_SPARSEMEM
#include <asm/sparsemem.h>
......
......@@ -1958,22 +1958,14 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
}
/**
* cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed
* @zl: the zonelist to be checked
* cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
* @nodemask: the nodemask to be checked
*
* Are any of the nodes on zonelist zl allowed in current->mems_allowed?
* Are any of the nodes in the nodemask allowed in current->mems_allowed?
*/
int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
int i;
for (i = 0; zl->_zonerefs[i].zone; i++) {
int nid = zonelist_node_idx(&zl->_zonerefs[i]);
if (node_isset(nid, current->mems_allowed))
return 1;
}
return 0;
return nodes_intersects(*nodemask, current->mems_allowed);
}
/*
......
......@@ -95,12 +95,14 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
int nid;
struct page *page = NULL;
struct mempolicy *mpol;
nodemask_t *nodemask;
struct zonelist *zonelist = huge_zonelist(vma, address,
htlb_alloc_mask, &mpol);
htlb_alloc_mask, &mpol, &nodemask);
struct zone *zone;
struct zoneref *z;
for_each_zone_zonelist(zone, z, zonelist, MAX_NR_ZONES - 1) {
for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
nid = zone_to_nid(zone);
if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
!list_empty(&hugepage_freelists[nid])) {
......
......@@ -163,42 +163,25 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
return 0;
}
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
/* Check that the nodemask contains at least one populated zone */
static int is_valid_nodemask(nodemask_t *nodemask)
{
struct zonelist *zl;
int num, max, nd;
enum zone_type k;
int nd, k;
max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
max++; /* space for zlcache_ptr (see mmzone.h) */
zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
if (!zl)
return ERR_PTR(-ENOMEM);
zl->zlcache_ptr = NULL;
num = 0;
/* First put in the highest zones from all nodes, then all the next
lower zones etc. Avoid empty zones because the memory allocator
doesn't like them. If you implement node hot removal you
have to fix that. */
k = MAX_NR_ZONES - 1;
while (1) {
for_each_node_mask(nd, *nodes) {
struct zone *z = &NODE_DATA(nd)->node_zones[k];
/* Check that there is something useful in this mask */
k = policy_zone;
for_each_node_mask(nd, *nodemask) {
struct zone *z;
for (k = 0; k <= policy_zone; k++) {
z = &NODE_DATA(nd)->node_zones[k];
if (z->present_pages > 0)
zoneref_set_zone(z, &zl->_zonerefs[num++]);
}
if (k == 0)
break;
k--;
return 1;
}
if (num == 0) {
kfree(zl);
return ERR_PTR(-EINVAL);
}
zl->_zonerefs[num].zone = NULL;
zl->_zonerefs[num].zone_idx = 0;
return zl;
return 0;
}
/* Create a new policy */
......@@ -229,12 +212,11 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
policy->v.preferred_node = -1;
break;
case MPOL_BIND:
policy->v.zonelist = bind_zonelist(nodes);
if (IS_ERR(policy->v.zonelist)) {
void *error_code = policy->v.zonelist;
if (!is_valid_nodemask(nodes)) {
kmem_cache_free(policy_cache, policy);
return error_code;
return ERR_PTR(-EINVAL);
}
policy->v.nodes = *nodes;
break;
}
policy->policy = mode;
......@@ -500,19 +482,12 @@ static long do_set_mempolicy(int mode, nodemask_t *nodes)
/* Fill a zone bitmap for a policy */
static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
{
int i;
nodes_clear(*nodes);
switch (p->policy) {
case MPOL_BIND:
for (i = 0; p->v.zonelist->_zonerefs[i].zone; i++) {
struct zoneref *zref;
zref = &p->v.zonelist->_zonerefs[i];
node_set(zonelist_node_idx(zref), *nodes);
}
break;
case MPOL_DEFAULT:
break;
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
*nodes = p->v.nodes;
break;
......@@ -1160,6 +1135,18 @@ static struct mempolicy * get_vma_policy(struct task_struct *task,
return pol;
}
/*
 * Return the nodemask that should filter allocations made under @policy,
 * or NULL when no nodemask filtering applies.
 *
 * Only MPOL_BIND yields a nodemask, and only when the zone requested by
 * @gfp is at or above policy_zone and the policy's nodes pass
 * cpuset_nodemask_valid_mems_allowed().
 */
static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
{
	if (unlikely(policy->policy == MPOL_BIND)) {
		/* Lower zones don't get a nodemask applied for MPOL_BIND */
		if (gfp_zone(gfp) < policy_zone)
			return NULL;

		/* The bind mask must still be usable under the current cpuset */
		if (!cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
			return NULL;

		return &policy->v.nodes;
	}

	return NULL;
}
/* Return a zonelist representing a mempolicy */
static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
{
......@@ -1172,12 +1159,17 @@ static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
nd = numa_node_id();
break;
case MPOL_BIND:
/* Lower zones don't get a policy applied */
/* Careful: current->mems_allowed might have moved */
if (gfp_zone(gfp) >= policy_zone)
if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
return policy->v.zonelist;
/*FALL THROUGH*/
/*
* Normally, MPOL_BIND allocations are node-local within the
* allowed nodemask. However, if __GFP_THISNODE is set and the
* current node isn't part of the mask, we use the zonelist for
* the first node in the mask instead.
*/
nd = numa_node_id();
if (unlikely(gfp & __GFP_THISNODE) &&
unlikely(!node_isset(nd, policy->v.nodes)))
nd = first_node(policy->v.nodes);
break;
case MPOL_INTERLEAVE: /* should not happen */
case MPOL_DEFAULT:
nd = numa_node_id();
......@@ -1220,7 +1212,14 @@ unsigned slab_node(struct mempolicy *policy)
* Follow bind policy behavior and start allocation at the
* first node.
*/
return zonelist_node_idx(policy->v.zonelist->_zonerefs);
struct zonelist *zonelist;
struct zone *zone;
enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
(void)first_zones_zonelist(zonelist, highest_zoneidx,
&policy->v.nodes,
&zone);
return zone->node;
}
case MPOL_PREFERRED:
......@@ -1278,25 +1277,31 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
* @vma = virtual memory area whose policy is sought
* @addr = address in @vma for shared policy lookup and interleave policy
* @gfp_flags = for requested zone
* @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
* @mpol = pointer to mempolicy pointer for reference counted mempolicy
* @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
*
* Returns a zonelist suitable for a huge page allocation.
* If the effective policy is 'BIND, returns pointer to policy's zonelist.
* If the effective policy is 'BIND, returns pointer to local node's zonelist,
* and a pointer to the mempolicy's @nodemask for filtering the zonelist.
* If it is also a policy for which get_vma_policy() returns an extra
* reference, we must hold that reference until after allocation.
* reference, we must hold that reference until after the allocation.
* In that case, return policy via @mpol so hugetlb allocation can drop
* the reference. For non-'BIND referenced policies, we can/do drop the
* reference here, so the caller doesn't need to know about the special case
* for default and current task policy.
*/
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
gfp_t gfp_flags, struct mempolicy **mpol)
gfp_t gfp_flags, struct mempolicy **mpol,
nodemask_t **nodemask)
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
struct zonelist *zl;
*mpol = NULL; /* probably no unref needed */
if (pol->policy == MPOL_INTERLEAVE) {
*nodemask = NULL; /* assume !MPOL_BIND */
if (pol->policy == MPOL_BIND) {
*nodemask = &pol->v.nodes;
} else if (pol->policy == MPOL_INTERLEAVE) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
......@@ -1376,14 +1381,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
/*
* slow path: ref counted policy -- shared or vma
*/
struct page *page = __alloc_pages(gfp, 0, zl);
struct page *page = __alloc_pages_nodemask(gfp, 0,
zl, nodemask_policy(gfp, pol));
__mpol_free(pol);
return page;
}
/*
* fast path: default or task policy
*/
return __alloc_pages(gfp, 0, zl);
return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
}
/**
......@@ -1415,7 +1421,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
pol = &default_policy;
if (pol->policy == MPOL_INTERLEAVE)
return alloc_page_interleave(gfp, order, interleave_nodes(pol));
return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
return __alloc_pages_nodemask(gfp, order,
zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
}
EXPORT_SYMBOL(alloc_pages_current);
......@@ -1440,14 +1447,6 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
}
*new = *old;
atomic_set(&new->refcnt, 1);
if (new->policy == MPOL_BIND) {
int sz = ksize(old->v.zonelist);
new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
if (!new->v.zonelist) {
kmem_cache_free(policy_cache, new);
return ERR_PTR(-ENOMEM);
}
}
return new;
}
......@@ -1461,21 +1460,12 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
switch (a->policy) {
case MPOL_DEFAULT:
return 1;
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
return nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED:
return a->v.preferred_node == b->v.preferred_node;
case MPOL_BIND: {
int i;
for (i = 0; a->v.zonelist->_zonerefs[i].zone; i++) {
struct zone *za, *zb;
za = zonelist_zone(&a->v.zonelist->_zonerefs[i]);
zb = zonelist_zone(&b->v.zonelist->_zonerefs[i]);
if (za != zb)
return 0;
}
return b->v.zonelist->_zonerefs[i].zone == NULL;
}
default:
BUG();
return 0;
......@@ -1487,8 +1477,6 @@ void __mpol_free(struct mempolicy *p)
{
if (!atomic_dec_and_test(&p->refcnt))
return;
if (p->policy == MPOL_BIND)
kfree(p->v.zonelist);
p->policy = MPOL_DEFAULT;
kmem_cache_free(policy_cache, p);
}
......@@ -1779,6 +1767,8 @@ static void mpol_rebind_policy(struct mempolicy *pol,
switch (pol->policy) {
case MPOL_DEFAULT:
break;
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
pol->v.nodes = tmp;
......@@ -1791,32 +1781,6 @@ static void mpol_rebind_policy(struct mempolicy *pol,
*mpolmask, *newmask);
*mpolmask = *newmask;
break;
case MPOL_BIND: {
nodemask_t nodes;
struct zoneref *z;
struct zonelist *zonelist;
nodes_clear(nodes);
for (z = pol->v.zonelist->_zonerefs; z->zone; z++)
node_set(zonelist_node_idx(z), nodes);
nodes_remap(tmp, nodes, *mpolmask, *newmask);
nodes = tmp;
zonelist = bind_zonelist(&nodes);
/* If no mem, then zonelist is NULL and we keep old zonelist.
* If that old zonelist has no remaining mems_allowed nodes,
* then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
*/
if (!IS_ERR(zonelist)) {
/* Good - got mem - substitute new zonelist */
kfree(pol->v.zonelist);
pol->v.zonelist = zonelist;
}
*mpolmask = *newmask;
break;
}
default:
BUG();
break;
......@@ -1879,9 +1843,7 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
break;
case MPOL_BIND:
get_zonemask(pol, &nodes);
break;
/* Fall through */
case MPOL_INTERLEAVE:
nodes = pol->v.nodes;
break;
......
......@@ -42,3 +42,33 @@ struct zone *next_zone(struct zone *zone)
return zone;
}
/*
 * Report whether the node of @zref is set in @nodes.
 * Without CONFIG_NUMA every zoneref is treated as being in the mask.
 */
static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
{
#ifndef CONFIG_NUMA
	return 1;
#else
	return node_isset(zonelist_node_idx(zref), *nodes);
#endif /* !CONFIG_NUMA */
}
/*
 * Find the next zone at or below highest_zoneidx in a zonelist, starting
 * the search at cursor @z.
 *
 * When @nodes is non-NULL, zonerefs whose node is not in @nodes are skipped
 * as well. The zone found (taken from the zoneref the search stopped on) is
 * stored in *@zone, and the returned cursor points one entry past it.
 */
struct zoneref *next_zones_zonelist(struct zoneref *z,
					enum zone_type highest_zoneidx,
					nodemask_t *nodes,
					struct zone **zone)
{
	if (likely(nodes == NULL)) {
		/* No nodemask: filter on the zone index only */
		while (zonelist_zone_idx(z) > highest_zoneidx)
			z++;
	} else {
		/*
		 * Stop on the first entry whose index is low enough and that
		 * either has no zone or sits on an allowed node.
		 */
		for (;;) {
			if (zonelist_zone_idx(z) <= highest_zoneidx &&
			    (!z->zone || zref_in_nodemask(z, nodes)))
				break;
			z++;
		}
	}

	*zone = zonelist_zone(z);
	return z + 1;
}
......@@ -1377,7 +1377,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
* a page.
*/
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
{
struct zoneref *z;
......@@ -1388,16 +1388,17 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
z = first_zones_zonelist(zonelist, high_zoneidx);
classzone_idx = zonelist_zone_idx(z);
preferred_zone = zonelist_zone(z);
(void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
&preferred_zone);
classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
/*
* Scan zonelist, looking for a zone with enough free.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {
if (NUMA_BUILD && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
......@@ -1447,9 +1448,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
/*
* This is the 'heart' of the zoned buddy allocator.
*/
struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist)
static struct page *
__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask)
{
const gfp_t wait = gfp_mask & __GFP_WAIT;
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
......@@ -1478,7 +1479,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
return NULL;
}
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
if (page)
goto got_pg;
......@@ -1523,7 +1524,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
page = get_page_from_freelist(gfp_mask, order, zonelist,
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags);
if (page)
goto got_pg;
......@@ -1536,7 +1537,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
nofail_alloc:
/* go through the zonelist yet again, ignoring mins */
page = get_page_from_freelist(gfp_mask, order,
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
if (page)
goto got_pg;
......@@ -1571,7 +1572,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
drain_all_pages();
if (likely(did_some_progress)) {
page = get_page_from_freelist(gfp_mask, order,
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx, alloc_flags);
if (page)
goto got_pg;
......@@ -1587,8 +1588,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
* a parallel oom killing, we must fail if we're still
* under heavy pressure.
*/
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
order, zonelist, high_zoneidx,
ALLOC_WMARK_HIGH|ALLOC_CPUSET);
if (page) {
clear_zonelist_oom(zonelist, gfp_mask);
goto got_pg;
......@@ -1637,6 +1639,20 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
return page;
}
/*
 * Allocate pages from @zonelist without a nodemask: forwards to
 * __alloc_pages_internal() with a NULL nodemask and returns its result.
 */
struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist)
{
return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
}
/*
 * Nodemask-aware variant of __alloc_pages(): forwards all arguments,
 * including @nodemask, to __alloc_pages_internal() and returns its result.
 * A NULL @nodemask makes this equivalent to __alloc_pages().
 */
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask)
{
return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
}
EXPORT_SYMBOL(__alloc_pages);
/*
......@@ -1880,6 +1896,12 @@ void show_free_areas(void)
show_swap_cache_info();
}
/*
 * Fill in @zoneref so that it refers to @zone: store the zone pointer and
 * record zone_idx(zone) alongside it in zoneref->zone_idx.
 */
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
zoneref->zone = zone;
zoneref->zone_idx = zone_idx(zone);
}
/*
* Builds allocation fallback zone lists.
*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment