Commit ccc98a67 authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] _alloc_pages cleanup

Patch from Martin Bligh.  It should only affect machines using
discontigmem.

"This patch was originally from Andrea's tree (from SGI??), and has
been tweaked since by both Christoph (who cleaned up all the code),
and myself (who just hit it until it worked).

It removes _alloc_pages, and adds all nodes to the zonelists
directly, which also changes the fallback zone order to something more
sensible ...  instead of: "foreach (node) { foreach (zone) }" we now
do something more like "foreach (zone_type) { foreach (node) }"

Christoph has a more recent version that's fancier and does a couple
more cleanups, but it seems to have a bug in it that I can't track
down easily, so I propose we do the simple thing for now, and take the
rest of the cleanups when it works ...  it seems to build nicely on
top of this separately to me.

Tested on 16-way NUMA-Q with discontigmem + NUMA support."
parent e07316f9
......@@ -1726,7 +1726,7 @@ void __init mem_init(void)
* Set up the zero page, mark it reserved, so that page count
* is not manipulated when freeing the page from user ptes.
*/
mem_map_zero = _alloc_pages(GFP_KERNEL, 0);
mem_map_zero = alloc_pages(GFP_KERNEL, 0);
if (mem_map_zero == NULL) {
prom_printf("paging_init: Cannot alloc zero page.\n");
prom_halt();
......
......@@ -36,12 +36,10 @@ extern plat_pg_data_t *plat_node_data[];
#ifdef CONFIG_ALPHA_WILDFIRE
# define ALPHA_PA_TO_NID(pa) ((pa) >> 36) /* 16 nodes max due 43bit kseg */
#define NODE_MAX_MEM_SIZE (64L * 1024L * 1024L * 1024L) /* 64 GB */
#define MAX_NUMNODES WILDFIRE_MAX_QBB
# define NODE_MAX_MEM_SIZE (64L * 1024L * 1024L * 1024L) /* 64 GB */
#else
# define ALPHA_PA_TO_NID(pa) (0)
#define NODE_MAX_MEM_SIZE (~0UL)
#define MAX_NUMNODES 1
# define NODE_MAX_MEM_SIZE (~0UL)
#endif
#define PHYSADDR_TO_NID(pa) ALPHA_PA_TO_NID(pa)
......
#ifndef _ASM_MAX_NUMNODES_H
#define _ASM_MAX_NUMNODES_H
/*
* Currently the Wildfire is the only discontigmem/NUMA capable Alpha core.
*/
#if defined(CONFIG_ALPHA_WILDFIRE) || defined(CONFIG_ALPHA_GENERIC)
# include <asm/core_wildfire.h>
# define MAX_NUMNODES WILDFIRE_MAX_QBB
#endif
#endif /* _ASM_MAX_NUMNODES_H */
......@@ -6,6 +6,8 @@
#ifndef _ASM_MMZONE_H_
#define _ASM_MMZONE_H_
#include <asm/smp.h>
#ifdef CONFIG_DISCONTIGMEM
#ifdef CONFIG_X86_NUMAQ
......
......@@ -39,18 +39,25 @@
* can allocate highmem pages, the *get*page*() variants return
* virtual kernel addresses to the allocated page(s).
*/
extern struct page * FASTCALL(_alloc_pages(unsigned int gfp_mask, unsigned int order));
extern struct page * FASTCALL(__alloc_pages(unsigned int gfp_mask, unsigned int order, struct zonelist *zonelist));
extern struct page * alloc_pages_node(int nid, unsigned int gfp_mask, unsigned int order);
/*
* We get the zone list from the current node and the gfp_mask.
* This zone list contains a maximum of MAX_NUMNODES*MAX_NR_ZONES zones.
*
* For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets
* optimized to &contig_page_data at compile-time.
*/
static inline struct page * alloc_pages(unsigned int gfp_mask, unsigned int order)
{
/*
* Gets optimized away by the compiler.
*/
if (order >= MAX_ORDER)
pg_data_t *pgdat = NODE_DATA(numa_node_id());
unsigned int idx = (gfp_mask & GFP_ZONEMASK);
if (unlikely(order >= MAX_ORDER))
return NULL;
return _alloc_pages(gfp_mask, order);
return __alloc_pages(gfp_mask, order, pgdat->node_zonelists + idx);
}
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
......
......@@ -10,11 +10,14 @@
#include <linux/wait.h>
#include <linux/cache.h>
#include <asm/atomic.h>
#ifdef CONFIG_DISCONTIGMEM
#include <asm/numnodes.h>
#endif
#ifndef MAX_NUMNODES
#define MAX_NUMNODES 1
#endif
/*
* Free memory management - zoned buddy allocator.
*/
/* Free memory management - zoned buddy allocator. */
#ifndef CONFIG_FORCE_MAX_ZONEORDER
#define MAX_ORDER 11
#else
......@@ -137,7 +140,7 @@ struct zone {
* footprint of this construct is very small.
*/
struct zonelist {
struct zone *zones[MAX_NR_ZONES+1]; // NULL delimited
struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
};
#define GFP_ZONEMASK 0x0f
......@@ -190,6 +193,7 @@ extern void calculate_totalpages (pg_data_t *pgdat, unsigned long *zones_size,
extern void free_area_init_core(pg_data_t *pgdat, unsigned long *zones_size,
unsigned long *zholes_size);
void get_zone_counts(unsigned long *active, unsigned long *inactive);
extern void build_all_zonelists(void);
extern pg_data_t contig_page_data;
......
......@@ -392,6 +392,7 @@ asmlinkage void __init start_kernel(void)
printk(linux_banner);
setup_arch(&command_line);
setup_per_cpu_areas();
build_all_zonelists();
printk("Kernel command line: %s\n", saved_command_line);
parse_options(command_line);
trap_init();
......
......@@ -90,7 +90,6 @@ EXPORT_SYMBOL(do_brk);
EXPORT_SYMBOL(exit_mm);
/* internal kernel memory management */
EXPORT_SYMBOL(_alloc_pages);
EXPORT_SYMBOL(__alloc_pages);
EXPORT_SYMBOL(alloc_pages_node);
EXPORT_SYMBOL(__get_free_pages);
......@@ -117,6 +116,7 @@ EXPORT_SYMBOL(vunmap);
EXPORT_SYMBOL(vmalloc_to_page);
EXPORT_SYMBOL(remap_page_range);
#ifndef CONFIG_DISCONTIGMEM
EXPORT_SYMBOL(contig_page_data);
EXPORT_SYMBOL(mem_map);
EXPORT_SYMBOL(max_mapnr);
#endif
......
......@@ -85,48 +85,4 @@ void __init free_area_init_node(int nid, pg_data_t *pgdat, struct page *pmap,
memset(pgdat->valid_addr_bitmap, 0, size);
}
/*
 * Attempt an allocation against a single node.
 *
 * The zone bits of gfp_mask select which of the node's zonelists is
 * handed to __alloc_pages(); the result of that call is returned as-is.
 */
static struct page * alloc_pages_pgdat(pg_data_t *pgdat, unsigned int gfp_mask,
	unsigned int order)
{
	unsigned int zone_idx = gfp_mask & GFP_ZONEMASK;

	return __alloc_pages(gfp_mask, order, &pgdat->node_zonelists[zone_idx]);
}
/*
* This can be refined. Currently, tries to do round robin, instead
* should do concentric circle search, starting from current node.
*/
struct page * _alloc_pages(unsigned int gfp_mask, unsigned int order)
{
	struct page *ret;
	pg_data_t *start, *temp;
#ifndef CONFIG_NUMA
	/*
	 * Rotating cursor: each call starts one node further along the list.
	 * NOTE(review): this static is read and advanced without any locking
	 * visible here - confirm whether callers can race on it.
	 */
	static pg_data_t *next = NULL;
#endif

	/* Requests of MAX_ORDER or above are rejected outright. */
	if (order >= MAX_ORDER)
		return NULL;

#ifdef CONFIG_NUMA
	/* Start on the node reported by numa_node_id(). */
	temp = NODE_DATA(numa_node_id());
#else
	if (!next)
		next = pgdat_list;
	temp = next;
	next = next->pgdat_next;
#endif

	/* First pass: from the starting node to the end of the node list. */
	start = temp;
	while (temp) {
		ret = alloc_pages_pgdat(temp, gfp_mask, order);
		if (ret)
			return ret;
		temp = temp->pgdat_next;
	}

	/* Second pass: wrap to the list head and stop before the start node. */
	temp = pgdat_list;
	while (temp != start) {
		ret = alloc_pages_pgdat(temp, gfp_mask, order);
		if (ret)
			return ret;
		temp = temp->pgdat_next;
	}

	/* No node could satisfy the request. */
	return NULL;
}
#endif /* CONFIG_DISCONTIGMEM */
......@@ -256,14 +256,6 @@ int is_head_of_free_region(struct page *page)
}
#endif /* CONFIG_SOFTWARE_SUSPEND */
#ifndef CONFIG_DISCONTIGMEM
/*
 * Allocation entry point when CONFIG_DISCONTIGMEM is not set: the
 * zonelist is taken from contig_page_data, indexed by the zone bits of
 * gfp_mask, and passed straight to __alloc_pages().
 */
struct page *_alloc_pages(unsigned int gfp_mask, unsigned int order)
{
	unsigned int zone_idx = gfp_mask & GFP_ZONEMASK;

	return __alloc_pages(gfp_mask, order,
			&contig_page_data.node_zonelists[zone_idx]);
}
#endif
static /* inline */ struct page *
balance_classzone(struct zone* classzone, unsigned int gfp_mask,
unsigned int order, int * freed)
......@@ -679,13 +671,41 @@ void show_free_areas(void)
/*
* Builds allocation fallback zone lists.
*/
static inline void build_zonelists(pg_data_t *pgdat)
/*
 * Append the populated zones of one node to a zonelist.
 *
 * @pgdat:    node whose zones are considered
 * @zonelist: zonelist being filled in
 * @j:        index of the next free slot in zonelist->zones[]
 * @k:        highest zone type to include (ZONE_HIGHMEM, ZONE_NORMAL or
 *            ZONE_DMA); any other value hits BUG()
 *
 * Starting at zone type @k the cases deliberately fall through, so every
 * lower zone type is considered as well.  A zone is appended only when
 * its size is non-zero.
 *
 * Returns the updated next-free index.
 */
static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k)
{
	struct zone *zone;

	switch (k) {
	default:
		BUG();
		/* fallthrough */
	case ZONE_HIGHMEM:
		zone = pgdat->node_zones + ZONE_HIGHMEM;
		if (zone->size) {
#ifndef CONFIG_HIGHMEM
			/* A populated highmem zone without CONFIG_HIGHMEM is a bug. */
			BUG();
#endif
			zonelist->zones[j++] = zone;
		}
		/* fallthrough */
	case ZONE_NORMAL:
		zone = pgdat->node_zones + ZONE_NORMAL;
		if (zone->size)
			zonelist->zones[j++] = zone;
		/* fallthrough */
	case ZONE_DMA:
		zone = pgdat->node_zones + ZONE_DMA;
		if (zone->size)
			zonelist->zones[j++] = zone;
	}

	return j;
}
static void __init build_zonelists(pg_data_t *pgdat)
{
int i, j, k;
int i, j, k, node, local_node;
local_node = pgdat->node_id;
printk("Building zonelist for node : %d\n", local_node);
for (i = 0; i <= GFP_ZONEMASK; i++) {
struct zonelist *zonelist;
struct zone *zone;
zonelist = pgdat->node_zonelists + i;
memset(zonelist, 0, sizeof(*zonelist));
......@@ -697,33 +717,32 @@ static inline void build_zonelists(pg_data_t *pgdat)
if (i & __GFP_DMA)
k = ZONE_DMA;
switch (k) {
default:
BUG();
/*
* fallthrough:
*/
case ZONE_HIGHMEM:
zone = pgdat->node_zones + ZONE_HIGHMEM;
if (zone->size) {
#ifndef CONFIG_HIGHMEM
BUG();
#endif
zonelist->zones[j++] = zone;
}
case ZONE_NORMAL:
zone = pgdat->node_zones + ZONE_NORMAL;
if (zone->size)
zonelist->zones[j++] = zone;
case ZONE_DMA:
zone = pgdat->node_zones + ZONE_DMA;
if (zone->size)
zonelist->zones[j++] = zone;
}
j = build_zonelists_node(pgdat, zonelist, j, k);
/*
* Now we build the zonelist so that it contains the zones
* of all the other nodes.
* We don't want to pressure a particular node, so when
* building the zones for node N, we make sure that the
* zones coming right after the local ones are those from
* node N+1 (modulo numnodes)
*/
for (node = local_node + 1; node < numnodes; node++)
j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
for (node = 0; node < local_node; node++)
j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
zonelist->zones[j++] = NULL;
}
}
/*
 * Build the zonelists of every node, visiting node ids 0 .. numnodes-1
 * in ascending order and calling build_zonelists() on each.
 */
void __init build_all_zonelists(void)
{
	int nid = 0;

	while (nid < numnodes) {
		build_zonelists(NODE_DATA(nid));
		nid++;
	}
}
void __init calculate_totalpages (pg_data_t *pgdat, unsigned long *zones_size,
unsigned long *zholes_size)
{
......@@ -919,7 +938,6 @@ void __init free_area_init_core(pg_data_t *pgdat,
(unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
}
}
build_zonelists(pgdat);
}
#ifndef CONFIG_DISCONTIGMEM
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment