Commit 19770b32 authored by Mel Gorman's avatar Mel Gorman Committed by Linus Torvalds

mm: filter based on a nodemask as well as a gfp_mask

The MPOL_BIND policy creates a zonelist that is used for allocations
controlled by that mempolicy.  As the per-node zonelist is already being
filtered based on a zone id, this patch adds a version of __alloc_pages() that
takes a nodemask for further filtering.  This eliminates the need for
MPOL_BIND to create a custom zonelist.

A positive benefit of this is that allocations using MPOL_BIND now use the
local node's distance-ordered zonelist instead of a custom node-id-ordered
zonelist.  I.e., pages will be allocated from the closest allowed node with
available memory.

[Lee.Schermerhorn@hp.com: Mempolicy: update stale documentation and comments]
[Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask]
[Lee.Schermerhorn@hp.com: Mempolicy: make dequeue_huge_page_vma() obey MPOL_BIND nodemask rework]
Signed-off-by: default avatarMel Gorman <mel@csn.ul.ie>
Acked-by: default avatarChristoph Lameter <clameter@sgi.com>
Signed-off-by: default avatarLee Schermerhorn <lee.schermerhorn@hp.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent dd1a239f
......@@ -182,14 +182,9 @@ Components of Memory Policies
The Default mode does not use the optional set of nodes.
MPOL_BIND: This mode specifies that memory must come from the
set of nodes specified by the policy.
The memory policy APIs do not specify an order in which the nodes
will be searched. However, unlike "local allocation", the Bind
policy does not consider the distance between the nodes. Rather,
allocations will fallback to the nodes specified by the policy in
order of numeric node id. Like everything in Linux, this is subject
to change.
set of nodes specified by the policy. Memory will be allocated from
the node in the set with sufficient free memory that is closest to
the node where the allocation takes place.
MPOL_PREFERRED: This mode specifies that the allocation should be
attempted from the single node specified in the policy. If that
......
......@@ -360,16 +360,17 @@ void invalidate_bdev(struct block_device *bdev)
*/
static void free_more_memory(void)
{
struct zoneref *zrefs;
struct zone *zone;
int nid;
wakeup_pdflush(1024);
yield();
for_each_online_node(nid) {
zrefs = first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
gfp_zone(GFP_NOFS));
if (zrefs->zone)
(void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
gfp_zone(GFP_NOFS), NULL,
&zone);
if (zone)
try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
GFP_NOFS);
}
......
......@@ -26,7 +26,7 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
#define cpuset_current_mems_allowed (current->mems_allowed)
void cpuset_init_current_mems_allowed(void);
void cpuset_update_task_memory_state(void);
int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask);
extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask);
......@@ -103,7 +103,7 @@ static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
static inline void cpuset_init_current_mems_allowed(void) {}
static inline void cpuset_update_task_memory_state(void) {}
static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
return 1;
}
......
......@@ -182,6 +182,10 @@ static inline void arch_alloc_page(struct page *page, int order) { }
extern struct page *__alloc_pages(gfp_t, unsigned int, struct zonelist *);
extern struct page *
__alloc_pages_nodemask(gfp_t, unsigned int,
struct zonelist *, nodemask_t *nodemask);
static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
unsigned int order)
{
......
......@@ -54,19 +54,20 @@ struct mm_struct;
* mmap_sem.
*
* Freeing policy:
* When policy is MPOL_BIND v.zonelist is kmalloc'ed and must be kfree'd.
* All other policies don't have any external state. mpol_free() handles this.
* Mempolicy objects are reference counted. A mempolicy will be freed when
* mpol_free() decrements the reference count to zero.
*
* Copying policy objects:
* For MPOL_BIND the zonelist must be always duplicated. mpol_clone() does this.
* mpol_copy() allocates a new mempolicy and copies the specified mempolicy
* to the new storage. The reference count of the new object is initialized
* to 1, representing the caller of mpol_copy().
*/
struct mempolicy {
atomic_t refcnt;
short policy; /* See MPOL_* above */
union {
struct zonelist *zonelist; /* bind */
short preferred_node; /* preferred */
nodemask_t nodes; /* interleave */
nodemask_t nodes; /* interleave/bind */
/* undefined for default */
} v;
nodemask_t cpuset_mems_allowed; /* mempolicy relative to these nodes */
......@@ -151,7 +152,8 @@ extern void mpol_fix_fork_child_flag(struct task_struct *p);
extern struct mempolicy default_policy;
extern struct zonelist *huge_zonelist(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol);
unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask);
extern unsigned slab_node(struct mempolicy *policy);
extern enum zone_type policy_zone;
......@@ -239,8 +241,11 @@ static inline void mpol_fix_fork_child_flag(struct task_struct *p)
}
static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol)
unsigned long addr, gfp_t gfp_flags,
struct mempolicy **mpol, nodemask_t **nodemask)
{
*mpol = NULL;
*nodemask = NULL;
return node_zonelist(0, gfp_flags);
}
......
......@@ -749,36 +749,60 @@ static inline int zonelist_node_idx(struct zoneref *zoneref)
#endif /* CONFIG_NUMA */
}
static inline void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
zoneref->zone = zone;
zoneref->zone_idx = zone_idx(zone);
}
/**
* next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
* @z - The cursor used as a starting point for the search
* @highest_zoneidx - The zone index of the highest zone to return
* @nodes - An optional nodemask to filter the zonelist with
* @zone - The first suitable zone found is returned via this parameter
*
* This function returns the next zone at or below a given zone index that is
* within the allowed nodemask using a cursor as the starting point for the
* search. The zoneref returned is a cursor that is used as the next starting
* point for future calls to next_zones_zonelist().
*/
struct zoneref *next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx,
nodemask_t *nodes,
struct zone **zone);
/* Returns the first zone at or below highest_zoneidx in a zonelist */
/**
* first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
* @zonelist - The zonelist to search for a suitable zone
* @highest_zoneidx - The zone index of the highest zone to return
* @nodes - An optional nodemask to filter the zonelist with
* @zone - The first suitable zone found is returned via this parameter
*
* This function returns the first zone at or below a given zone index that is
* within the allowed nodemask. The zoneref returned is a cursor that can be
* used to iterate the zonelist with next_zones_zonelist. The cursor should
* not be used by the caller as it does not match the value of the zone
* returned.
*/
static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
enum zone_type highest_zoneidx)
enum zone_type highest_zoneidx,
nodemask_t *nodes,
struct zone **zone)
{
struct zoneref *z;
/* Find the first suitable zone to use for the allocation */
z = zonelist->_zonerefs;
while (zonelist_zone_idx(z) > highest_zoneidx)
z++;
return z;
return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes,
zone);
}
/* Returns the next zone at or below highest_zoneidx in a zonelist */
static inline struct zoneref *next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx)
{
/* Find the next suitable zone to use for the allocation */
while (zonelist_zone_idx(z) > highest_zoneidx)
z++;
return z;
}
/**
* for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask
* @zone - The current zone in the iterator
* @z - The current pointer within zonelist->zones being iterated
* @zlist - The zonelist being iterated
* @highidx - The zone index of the highest zone to return
* @nodemask - Nodemask allowed by the allocator
*
* This iterator iterates though all zones at or below a given zone index and
* within a given nodemask
*/
#define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone); \
zone; \
z = next_zones_zonelist(z, highidx, nodemask, &zone)) \
/**
* for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
......@@ -790,11 +814,7 @@ static inline struct zoneref *next_zones_zonelist(struct zoneref *z,
* This iterator iterates though all zones at or below a given zone index.
*/
#define for_each_zone_zonelist(zone, z, zlist, highidx) \
for (z = first_zones_zonelist(zlist, highidx), \
zone = zonelist_zone(z++); \
zone; \
z = next_zones_zonelist(z, highidx), \
zone = zonelist_zone(z++))
for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL)
#ifdef CONFIG_SPARSEMEM
#include <asm/sparsemem.h>
......
......@@ -1958,22 +1958,14 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
}
/**
* cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed
* @zl: the zonelist to be checked
* cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed
* @nodemask: the nodemask to be checked
*
* Are any of the nodes on zonelist zl allowed in current->mems_allowed?
* Are any of the nodes in the nodemask allowed in current->mems_allowed?
*/
int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
int i;
for (i = 0; zl->_zonerefs[i].zone; i++) {
int nid = zonelist_node_idx(&zl->_zonerefs[i]);
if (node_isset(nid, current->mems_allowed))
return 1;
}
return 0;
return nodes_intersects(*nodemask, current->mems_allowed);
}
/*
......
......@@ -95,12 +95,14 @@ static struct page *dequeue_huge_page_vma(struct vm_area_struct *vma,
int nid;
struct page *page = NULL;
struct mempolicy *mpol;
nodemask_t *nodemask;
struct zonelist *zonelist = huge_zonelist(vma, address,
htlb_alloc_mask, &mpol);
htlb_alloc_mask, &mpol, &nodemask);
struct zone *zone;
struct zoneref *z;
for_each_zone_zonelist(zone, z, zonelist, MAX_NR_ZONES - 1) {
for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) {
nid = zone_to_nid(zone);
if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) &&
!list_empty(&hugepage_freelists[nid])) {
......
This diff is collapsed.
......@@ -42,3 +42,33 @@ struct zone *next_zone(struct zone *zone)
return zone;
}
static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
{
#ifdef CONFIG_NUMA
return node_isset(zonelist_node_idx(zref), *nodes);
#else
return 1;
#endif /* CONFIG_NUMA */
}
/* Returns the next zone at or below highest_zoneidx in a zonelist */
struct zoneref *next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx,
nodemask_t *nodes,
struct zone **zone)
{
/*
* Find the next suitable zone to use for the allocation.
* Only filter based on nodemask if it's set
*/
if (likely(nodes == NULL))
while (zonelist_zone_idx(z) > highest_zoneidx)
z++;
else
while (zonelist_zone_idx(z) > highest_zoneidx ||
(z->zone && !zref_in_nodemask(z, nodes)))
z++;
*zone = zonelist_zone(z++);
return z;
}
......@@ -1377,7 +1377,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
* a page.
*/
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
struct zonelist *zonelist, int high_zoneidx, int alloc_flags)
{
struct zoneref *z;
......@@ -1388,16 +1388,17 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
int zlc_active = 0; /* set if using zonelist_cache */
int did_zlc_setup = 0; /* just call zlc_setup() one time */
z = first_zones_zonelist(zonelist, high_zoneidx);
classzone_idx = zonelist_zone_idx(z);
preferred_zone = zonelist_zone(z);
(void)first_zones_zonelist(zonelist, high_zoneidx, nodemask,
&preferred_zone);
classzone_idx = zone_idx(preferred_zone);
zonelist_scan:
/*
* Scan zonelist, looking for a zone with enough free.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
for_each_zone_zonelist_nodemask(zone, z, zonelist,
high_zoneidx, nodemask) {
if (NUMA_BUILD && zlc_active &&
!zlc_zone_worth_trying(zonelist, z, allowednodes))
continue;
......@@ -1447,9 +1448,9 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
/*
* This is the 'heart' of the zoned buddy allocator.
*/
struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist)
static struct page *
__alloc_pages_internal(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask)
{
const gfp_t wait = gfp_mask & __GFP_WAIT;
enum zone_type high_zoneidx = gfp_zone(gfp_mask);
......@@ -1478,7 +1479,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
return NULL;
}
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET);
if (page)
goto got_pg;
......@@ -1523,7 +1524,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
page = get_page_from_freelist(gfp_mask, order, zonelist,
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags);
if (page)
goto got_pg;
......@@ -1536,7 +1537,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
nofail_alloc:
/* go through the zonelist yet again, ignoring mins */
page = get_page_from_freelist(gfp_mask, order,
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx, ALLOC_NO_WATERMARKS);
if (page)
goto got_pg;
......@@ -1571,7 +1572,7 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
drain_all_pages();
if (likely(did_some_progress)) {
page = get_page_from_freelist(gfp_mask, order,
page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx, alloc_flags);
if (page)
goto got_pg;
......@@ -1587,8 +1588,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
* a parallel oom killing, we must fail if we're still
* under heavy pressure.
*/
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET);
page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
order, zonelist, high_zoneidx,
ALLOC_WMARK_HIGH|ALLOC_CPUSET);
if (page) {
clear_zonelist_oom(zonelist, gfp_mask);
goto got_pg;
......@@ -1637,6 +1639,20 @@ __alloc_pages(gfp_t gfp_mask, unsigned int order,
return page;
}
struct page *
__alloc_pages(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist)
{
return __alloc_pages_internal(gfp_mask, order, zonelist, NULL);
}
struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask)
{
return __alloc_pages_internal(gfp_mask, order, zonelist, nodemask);
}
EXPORT_SYMBOL(__alloc_pages);
/*
......@@ -1880,6 +1896,12 @@ void show_free_areas(void)
show_swap_cache_info();
}
static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
{
zoneref->zone = zone;
zoneref->zone_idx = zone_idx(zone);
}
/*
* Builds allocation fallback zone lists.
*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment