Commit af70f767, authored by Andrew Morton, committed by Linus Torvalds

[PATCH] Fix page allocator lower zone protection for NUMA

From: Martin Hicks <mort@wildopensource.com>

This changes __alloc_pages() so it uses precalculated values for the "min".
This should prevent the problem of min incrementing from zone to zone across
many nodes on a NUMA machine.  The result of falling back to other nodes with
the old incremental min calculations was that the min value became very
large.
parent 7860b371
...@@ -54,6 +54,15 @@ struct per_cpu_pageset { ...@@ -54,6 +54,15 @@ struct per_cpu_pageset {
struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */ struct per_cpu_pages pcp[2]; /* 0: hot. 1: cold */
} ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp;
#define ZONE_DMA 0
#define ZONE_NORMAL 1
#define ZONE_HIGHMEM 2
#define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */
#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */
#define GFP_ZONEMASK 0x03
/* /*
* On machines where it is needed (eg PCs) we divide physical memory * On machines where it is needed (eg PCs) we divide physical memory
* into multiple physical zones. On a PC we have 3 zones: * into multiple physical zones. On a PC we have 3 zones:
...@@ -70,6 +79,19 @@ struct zone { ...@@ -70,6 +79,19 @@ struct zone {
spinlock_t lock; spinlock_t lock;
unsigned long free_pages; unsigned long free_pages;
unsigned long pages_min, pages_low, pages_high; unsigned long pages_min, pages_low, pages_high;
/*
* protection[] is a pre-calculated number of extra pages that must be
* available in a zone in order for __alloc_pages() to allocate memory
* from the zone. i.e., for a GFP_KERNEL alloc of "order" there must
* be "(1<<order) + protection[ZONE_NORMAL]" free pages in the zone
* for us to choose to allocate the page from that zone.
*
* It uses both min_free_kbytes and sysctl_lower_zone_protection.
* The protection values are recalculated if either of these values
* change. The array elements are in zonelist order:
* [0] == GFP_DMA, [1] == GFP_KERNEL, [2] == GFP_HIGHMEM.
*/
unsigned long protection[MAX_NR_ZONES];
ZONE_PADDING(_pad1_) ZONE_PADDING(_pad1_)
...@@ -157,14 +179,6 @@ struct zone { ...@@ -157,14 +179,6 @@ struct zone {
unsigned long present_pages; /* amount of memory (excluding holes) */ unsigned long present_pages; /* amount of memory (excluding holes) */
} ____cacheline_maxaligned_in_smp; } ____cacheline_maxaligned_in_smp;
#define ZONE_DMA 0
#define ZONE_NORMAL 1
#define ZONE_HIGHMEM 2
#define MAX_NR_ZONES 3 /* Sync this with ZONES_SHIFT */
#define ZONES_SHIFT 2 /* ceil(log2(MAX_NR_ZONES)) */
#define GFP_ZONEMASK 0x03
/* /*
* The "priority" of VM scanning is how much of the queues we will scan in one * The "priority" of VM scanning is how much of the queues we will scan in one
...@@ -228,6 +242,11 @@ void get_zone_counts(unsigned long *active, unsigned long *inactive, ...@@ -228,6 +242,11 @@ void get_zone_counts(unsigned long *active, unsigned long *inactive,
void build_all_zonelists(void); void build_all_zonelists(void);
void wakeup_kswapd(struct zone *zone); void wakeup_kswapd(struct zone *zone);
/*
* zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc.
*/
#define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones)
/** /**
* for_each_pgdat - helper macro to iterate over all nodes * for_each_pgdat - helper macro to iterate over all nodes
* @pgdat - pointer to a pg_data_t variable * @pgdat - pointer to a pg_data_t variable
...@@ -300,6 +319,8 @@ struct ctl_table; ...@@ -300,6 +319,8 @@ struct ctl_table;
struct file; struct file;
int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *); void __user *, size_t *);
int lower_zone_protection_sysctl_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *);
#include <linux/topology.h> #include <linux/topology.h>
/* Returns the number of the current Node. */ /* Returns the number of the current Node. */
......
...@@ -722,7 +722,7 @@ static ctl_table vm_table[] = { ...@@ -722,7 +722,7 @@ static ctl_table vm_table[] = {
.data = &sysctl_lower_zone_protection, .data = &sysctl_lower_zone_protection,
.maxlen = sizeof(sysctl_lower_zone_protection), .maxlen = sizeof(sysctl_lower_zone_protection),
.mode = 0644, .mode = 0644,
.proc_handler = &proc_dointvec_minmax, .proc_handler = &lower_zone_protection_sysctl_handler,
.strategy = &sysctl_intvec, .strategy = &sysctl_intvec,
.extra1 = &zero, .extra1 = &zero,
}, },
......
...@@ -552,6 +552,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, ...@@ -552,6 +552,7 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
struct task_struct *p = current; struct task_struct *p = current;
int i; int i;
int cold; int cold;
int alloc_type;
int do_retry; int do_retry;
might_sleep_if(wait); might_sleep_if(wait);
...@@ -564,20 +565,20 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, ...@@ -564,20 +565,20 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
if (zones[0] == NULL) /* no zones in the zonelist */ if (zones[0] == NULL) /* no zones in the zonelist */
return NULL; return NULL;
alloc_type = zone_idx(zones[0]);
/* Go through the zonelist once, looking for a zone with enough free */ /* Go through the zonelist once, looking for a zone with enough free */
min = 1UL << order;
for (i = 0; zones[i] != NULL; i++) { for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i]; struct zone *z = zones[i];
unsigned long local_low;
min = (1<<order) + z->protection[alloc_type];
/* /*
* This is the fabled 'incremental min'. We let real-time tasks * We let real-time tasks dip their real-time paws a little
* dip their real-time paws a little deeper into reserves. * deeper into reserves.
*/ */
local_low = z->pages_low;
if (rt_task(p)) if (rt_task(p))
local_low >>= 1; min -= z->pages_low >> 1;
min += local_low;
if (z->free_pages >= min || if (z->free_pages >= min ||
(!wait && z->free_pages >= z->pages_high)) { (!wait && z->free_pages >= z->pages_high)) {
...@@ -585,7 +586,6 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, ...@@ -585,7 +586,6 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
if (page) if (page)
goto got_pg; goto got_pg;
} }
min += z->pages_low * sysctl_lower_zone_protection;
} }
/* we're somewhat low on memory, failed to find what we needed */ /* we're somewhat low on memory, failed to find what we needed */
...@@ -593,24 +593,22 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, ...@@ -593,24 +593,22 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
wakeup_kswapd(zones[i]); wakeup_kswapd(zones[i]);
/* Go through the zonelist again, taking __GFP_HIGH into account */ /* Go through the zonelist again, taking __GFP_HIGH into account */
min = 1UL << order;
for (i = 0; zones[i] != NULL; i++) { for (i = 0; zones[i] != NULL; i++) {
unsigned long local_min;
struct zone *z = zones[i]; struct zone *z = zones[i];
local_min = z->pages_min; min = (1<<order) + z->protection[alloc_type];
if (gfp_mask & __GFP_HIGH) if (gfp_mask & __GFP_HIGH)
local_min >>= 2; min -= z->pages_low >> 2;
if (rt_task(p)) if (rt_task(p))
local_min >>= 1; min -= z->pages_low >> 1;
min += local_min;
if (z->free_pages >= min || if (z->free_pages >= min ||
(!wait && z->free_pages >= z->pages_high)) { (!wait && z->free_pages >= z->pages_high)) {
page = buffered_rmqueue(z, order, cold); page = buffered_rmqueue(z, order, cold);
if (page) if (page)
goto got_pg; goto got_pg;
} }
min += local_min * sysctl_lower_zone_protection;
} }
/* here we're in the low on memory slow path */ /* here we're in the low on memory slow path */
...@@ -642,18 +640,17 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, ...@@ -642,18 +640,17 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order,
p->flags &= ~PF_MEMALLOC; p->flags &= ~PF_MEMALLOC;
/* go through the zonelist yet one more time */ /* go through the zonelist yet one more time */
min = 1UL << order;
for (i = 0; zones[i] != NULL; i++) { for (i = 0; zones[i] != NULL; i++) {
struct zone *z = zones[i]; struct zone *z = zones[i];
min += z->pages_min; min = (1UL << order) + z->protection[alloc_type];
if (z->free_pages >= min || if (z->free_pages >= min ||
(!wait && z->free_pages >= z->pages_high)) { (!wait && z->free_pages >= z->pages_high)) {
page = buffered_rmqueue(z, order, cold); page = buffered_rmqueue(z, order, cold);
if (page) if (page)
goto got_pg; goto got_pg;
} }
min += z->pages_low * sysctl_lower_zone_protection;
} }
/* /*
...@@ -1056,6 +1053,8 @@ void show_free_areas(void) ...@@ -1056,6 +1053,8 @@ void show_free_areas(void)
ps.nr_page_table_pages); ps.nr_page_table_pages);
for_each_zone(zone) { for_each_zone(zone) {
int i;
show_node(zone); show_node(zone);
printk("%s" printk("%s"
" free:%lukB" " free:%lukB"
...@@ -1075,6 +1074,10 @@ void show_free_areas(void) ...@@ -1075,6 +1074,10 @@ void show_free_areas(void)
K(zone->nr_inactive), K(zone->nr_inactive),
K(zone->present_pages) K(zone->present_pages)
); );
printk("protections[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
printk(" %lu", zone->protection[i]);
printk("\n");
} }
for_each_zone(zone) { for_each_zone(zone) {
...@@ -1744,6 +1747,93 @@ void __init page_alloc_init(void) ...@@ -1744,6 +1747,93 @@ void __init page_alloc_init(void)
hotcpu_notifier(page_alloc_cpu_notify, 0); hotcpu_notifier(page_alloc_cpu_notify, 0);
} }
/*
 * higherzone_val - protection contribution that zone @z takes from the zone
 * directly above it in the same node's node_zones[] array, for the
 * allocation type @alloc_type.
 *
 * Returns 0 when @z occupies the last slot (MAX_NR_ZONES-1), since there is
 * no higher zone. Otherwise returns the higher zone's protection[@alloc_type],
 * plus the higher zone's pages_low scaled by sysctl_lower_zone_protection
 * when the higher zone has pages and @z sits below @alloc_type.
 *
 * @max_zone is accepted but not consulted by this function.
 */
static unsigned long higherzone_val(struct zone *z, int max_zone,
					int alloc_type)
{
	int idx = zone_idx(z);
	struct zone *above;
	unsigned long contribution;

	/* the topmost zone has nothing above it to inherit from */
	if (idx == MAX_NR_ZONES-1)
		return 0;

	above = &z->zone_pgdat->node_zones[idx+1];

	/* Baseline: whatever the zone above already requires */
	contribution = above->protection[alloc_type];

	/*
	 * The lower-zone-protection term is added only when the zone above
	 * actually holds pages and this zone is not the highest zone of the
	 * zonelist for this allocation type. e.g., it is never added for
	 * GFP_DMA; it is added only for ZONE_DMA in a GFP_KERNEL allocation,
	 * and for both ZONE_DMA and ZONE_NORMAL in a GFP_HIGHMEM allocation.
	 */
	if (idx < alloc_type && above->present_pages)
		contribution += above->pages_low * sysctl_lower_zone_protection;

	return contribution;
}
/*
 * setup_per_zone_protection - called whenever min_free_kbytes or
 * sysctl_lower_zone_protection changes. Recomputes every zone's
 * protection[] array, so that an adequate number of pages are left in the
 * zone after a successful __alloc_pages().
 *
 * This algorithm is way confusing. It tries to keep the same behavior
 * as we had with the incremental min iterative algorithm.
 *
 * For every node, and for every allocation type (the protection[] index),
 * the zones are walked from highest to lowest. The order matters: each
 * zone's value is built from the value already stored in the zone above it
 * (see higherzone_val()).
 */
static void setup_per_zone_protection(void)
{
	struct pglist_data *pgdat;

	for_each_pgdat(pgdat) {
		struct zone *node_zones = pgdat->node_zones;
		int top_populated = 0;
		int type, idx;

		/* Find the highest zone index in this node that has pages */
		for (idx = 0; idx < MAX_NR_ZONES; idx++) {
			if (node_zones[idx].present_pages)
				top_populated = idx;
		}

		/*
		 * Allocation types, in order:
		 *	GFP_DMA -> GFP_KERNEL -> GFP_HIGHMEM
		 */
		for (type = 0; type < MAX_NR_ZONES; type++) {
			/*
			 * Zones, top down:
			 *	ZONE_HIGHMEM -> ZONE_NORMAL -> ZONE_DMA
			 */
			for (idx = MAX_NR_ZONES-1; idx >= 0; idx--) {
				struct zone *zone = &node_zones[idx];

				if (idx <= top_populated && idx <= type) {
					/*
					 * Own pages_low plus the contribution
					 * of the next higher zone.
					 */
					zone->protection[type] =
						higherzone_val(zone,
							top_populated, type) +
						zone->pages_low;
				} else {
					/*
					 * Zones above the highest populated
					 * zone (idx > top_populated) and zones
					 * that are not in the zonelist for this
					 * allocation type (idx > type) are
					 * never protected. They must still be
					 * written as zero because the lower
					 * zones take contributions from the
					 * higher zones.
					 */
					zone->protection[type] = 0;
				}
			}
		}
	}
}
/* /*
* setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures
* that the pages_{min,low,high} values for each zone are set correctly * that the pages_{min,low,high} values for each zone are set correctly
...@@ -1757,9 +1847,10 @@ static void setup_per_zone_pages_min(void) ...@@ -1757,9 +1847,10 @@ static void setup_per_zone_pages_min(void)
unsigned long flags; unsigned long flags;
/* Calculate total number of !ZONE_HIGHMEM pages */ /* Calculate total number of !ZONE_HIGHMEM pages */
for_each_zone(zone) for_each_zone(zone) {
if (!is_highmem(zone)) if (!is_highmem(zone))
lowmem_pages += zone->present_pages; lowmem_pages += zone->present_pages;
}
for_each_zone(zone) { for_each_zone(zone) {
spin_lock_irqsave(&zone->lru_lock, flags); spin_lock_irqsave(&zone->lru_lock, flags);
...@@ -1827,13 +1918,14 @@ static int __init init_per_zone_pages_min(void) ...@@ -1827,13 +1918,14 @@ static int __init init_per_zone_pages_min(void)
if (min_free_kbytes > 16384) if (min_free_kbytes > 16384)
min_free_kbytes = 16384; min_free_kbytes = 16384;
setup_per_zone_pages_min(); setup_per_zone_pages_min();
setup_per_zone_protection();
return 0; return 0;
} }
module_init(init_per_zone_pages_min) module_init(init_per_zone_pages_min)
/* /*
* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
* that we can call setup_per_zone_pages_min() whenever min_free_kbytes * that we can call two helper functions whenever min_free_kbytes
* changes. * changes.
*/ */
int min_free_kbytes_sysctl_handler(ctl_table *table, int write, int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
...@@ -1841,5 +1933,19 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, ...@@ -1841,5 +1933,19 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
{ {
proc_dointvec(table, write, file, buffer, length); proc_dointvec(table, write, file, buffer, length);
setup_per_zone_pages_min(); setup_per_zone_pages_min();
setup_per_zone_protection();
return 0;
}
/*
* lower_zone_protection_sysctl_handler - just a wrapper around
* proc_dointvec() so that we can call setup_per_zone_protection()
* whenever sysctl_lower_zone_protection changes.
*/
int lower_zone_protection_sysctl_handler(ctl_table *table, int write,
struct file *file, void __user *buffer, size_t *length)
{
proc_dointvec_minmax(table, write, file, buffer, length);
setup_per_zone_protection();
return 0; return 0;
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment