Commit 76a0722f authored by Andrew Morton, committed by Linus Torvalds

[PATCH] node-local mem_map for ia32 discontigmem

From Martin Bligh.

This patch remaps the lmem_map (struct page) arrays for each node onto
their own nodes.  This is non-trivial, since all of ZONE_NORMAL, and
hence all permanently mapped KVA, resides on node 0.

Very early in the boot sequence, it calculates the size of the lmem_map
arrays (rounding up to the nearest large page size) and reserves a
suitable amount of permanent KVA by shifting max_low_pfn down, creating
a gap between max_low_pfn and highstart_pfn (both of which normally sit
at about 896MB).
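
As a rough worked example (hypothetical numbers: 4kB pages, 2MB PAE
large pages, and a 64-byte struct page):

	4GB node      = 2^20 pages
	lmem_map size = 2^20 * 64 bytes = 64MB
	rounded up    = 64MB / 2MB      = 32 large pages (pmds)
	KVA reserved  = 32 * 512        = 16384 4kB pages

so max_low_pfn is shifted down by 16384 pages for that node.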

It then uses the new set_pmd_pfn function to set up the pmds correctly
so that the large pages point at the physical addresses reserved from
the remote nodes.
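
Conceptually, each pmd in the reserved window ends up pointing at
node-local memory.  A minimal sketch of a single such mapping, using
the names from the patch below (the node index 1 is arbitrary):

	/* point 2MB of node 1's KVA window at node 1's own memory */
	set_pmd_pfn((unsigned long) node_remap_start_vaddr[1],
		    node_remap_start_pfn[1], PAGE_KERNEL_LARGE);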

Tested on NUMA-Q and a ratty old i386 PC kicking around under my desk
(on 2.5.36-mm1).  It was good for a 20% improvement in system time on
a kernel compile when I initially benchmarked it against 2.5.32 or
thereabouts, due to a reduction in inter-node traffic, better
interconnect cache usage, and improved locality.  It should have no
effect on any system other than i386 NUMA systems.
parent 4b4b90a7
/*
 * Written by: Patricia Gaughen <gone@us.ibm.com>, IBM Corporation
 * August 2002: added remote node KVA remap - Martin J. Bligh
 *
 * Copyright (C) 2002, IBM Corp.
 *
@@ -19,8 +20,6 @@
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * Send feedback to <gone@us.ibm.com>
 */

#include <linux/config.h>
@@ -113,35 +112,98 @@ static void __init register_bootmem_low_pages(unsigned long system_max_low_pfn)
	}
}
/* bytes of address space covered by one pmd entry */
#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)

unsigned long node_remap_start_pfn[MAX_NUMNODES];
unsigned long node_remap_size[MAX_NUMNODES];
unsigned long node_remap_offset[MAX_NUMNODES];
void *node_remap_start_vaddr[MAX_NUMNODES];

extern void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);

void __init remap_numa_kva(void)
{
	void *vaddr;
	unsigned long pfn;
	int node;

	/* node 0's lmem_map stays in normal lowmem, so start at node 1 */
	for (node = 1; node < numnodes; ++node) {
		for (pfn = 0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
			vaddr = node_remap_start_vaddr[node] + (pfn << PAGE_SHIFT);
			set_pmd_pfn((ulong) vaddr,
				node_remap_start_pfn[node] + pfn,
				PAGE_KERNEL_LARGE);
		}
	}
}
static unsigned long calculate_numa_remap_pages(void)
{
	int nid;
	unsigned long size, reserve_pages = 0;

	for (nid = 1; nid < numnodes; nid++) {
		/* calculate the size of the mem_map needed in bytes */
		size = (node_end_pfn[nid] - node_start_pfn[nid] + 1)
			* sizeof(struct page);
		/* convert size to large (pmd size) pages, rounding up */
		size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
		/* now the roundup is correct, convert to PAGE_SIZE pages */
		size = size * PTRS_PER_PTE;
		printk("Reserving %ld pages of KVA for lmem_map of node %d\n",
				size, nid);
		node_remap_size[nid] = size;
		reserve_pages += size;
		node_remap_offset[nid] = reserve_pages;
		printk("Shrinking node %d from %ld pages to %ld pages\n",
				nid, node_end_pfn[nid], node_end_pfn[nid] - size);
		node_end_pfn[nid] -= size;
		node_remap_start_pfn[nid] = node_end_pfn[nid];
	}

	printk("Reserving total of %ld pages for numa KVA remap\n",
			reserve_pages);
	return reserve_pages;
}
unsigned long __init setup_memory(void)
{
	int nid;
	unsigned long bootmap_size, system_start_pfn, system_max_low_pfn;
	unsigned long reserve_pages;

	get_memcfg_numa();
	reserve_pages = calculate_numa_remap_pages();

	/* partially used pages are not usable - thus round upwards */
	system_start_pfn = min_low_pfn = PFN_UP(__pa(&_end));

	find_max_pfn();
	system_max_low_pfn = max_low_pfn = find_max_low_pfn();
#ifdef CONFIG_HIGHMEM
	highstart_pfn = highend_pfn = max_pfn;
	if (max_pfn > system_max_low_pfn)
		highstart_pfn = system_max_low_pfn;
	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
			pages_to_mb(highend_pfn - highstart_pfn));
#endif
	system_max_low_pfn = max_low_pfn = max_low_pfn - reserve_pages;
	printk(KERN_NOTICE "%ldMB LOWMEM available.\n",
			pages_to_mb(system_max_low_pfn));
	printk("min_low_pfn = %ld, max_low_pfn = %ld, highstart_pfn = %ld\n",
			min_low_pfn, max_low_pfn, highstart_pfn);
	printk("Low memory ends at vaddr %08lx\n",
			(ulong) pfn_to_kaddr(max_low_pfn));
	for (nid = 0; nid < numnodes; nid++) {
		allocate_pgdat(nid);
		node_remap_start_vaddr[nid] = pfn_to_kaddr(
				highstart_pfn - node_remap_offset[nid]);
		printk("node %d will remap to vaddr %08lx - %08lx\n", nid,
				(ulong) node_remap_start_vaddr[nid],
				(ulong) pfn_to_kaddr(highstart_pfn
					- node_remap_offset[nid] + node_remap_size[nid]));
	}
	printk("High memory starts at vaddr %08lx\n",
			(ulong) pfn_to_kaddr(highstart_pfn));
	for (nid = 0; nid < numnodes; nid++)
		find_max_pfn_node(nid);
@@ -244,7 +306,18 @@ void __init zone_sizes_init(void)
#endif
			}
		}
		/*
		 * We let the lmem_map for node 0 be allocated from the
		 * normal bootmem allocator, but other nodes come from the
		 * remapped KVA area - mbligh
		 */
		if (nid)
			free_area_init_node(nid, NODE_DATA(nid),
				node_remap_start_vaddr[nid], zones_size,
				start, 0);
		else
			free_area_init_node(nid, NODE_DATA(nid), 0,
				zones_size, start, 0);
	}
	return;
}
@@ -245,6 +245,12 @@ extern void set_highmem_pages_init(int);
unsigned long __PAGE_KERNEL = _PAGE_KERNEL;

#ifndef CONFIG_DISCONTIGMEM
#define remap_numa_kva() do {} while (0)
#else
extern void __init remap_numa_kva(void);
#endif

static void __init pagetable_init (void)
{
	unsigned long vaddr;
@@ -269,6 +275,7 @@ static void __init pagetable_init (void)
	}

	kernel_physical_mapping_init(pgd_base);
	remap_numa_kva();

	/*
	 * Fixed mappings, only the page table structure has to be
@@ -449,7 +456,11 @@ void __init mem_init(void)
	set_max_mapnr_init();

#ifdef CONFIG_HIGHMEM
	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE);
#else
	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
#endif

	/* clear the zero-page */
	memset(empty_zero_page, 0, PAGE_SIZE);
@@ -84,6 +84,39 @@ static void set_pte_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
	__flush_tlb_one(vaddr);
}

/*
 * Associate a large virtual page frame with a given physical page frame
 * and protection flags for that frame. pfn is for the base of the page,
 * vaddr is what the page gets mapped to - both must be properly aligned.
 * The pmd must already be instantiated. Assumes PAE mode.
 */
void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags)
{
	pgd_t *pgd;
	pmd_t *pmd;

	if (vaddr & (PMD_SIZE-1)) {		/* vaddr is misaligned */
		printk ("set_pmd_pfn: vaddr misaligned\n");
		return; /* BUG(); */
	}
	if (pfn & (PTRS_PER_PTE-1)) {		/* pfn is misaligned */
		printk ("set_pmd_pfn: pfn misaligned\n");
		return; /* BUG(); */
	}
	pgd = swapper_pg_dir + __pgd_offset(vaddr);
	if (pgd_none(*pgd)) {
		printk ("set_pmd_pfn: pgd_none\n");
		return; /* BUG(); */
	}
	pmd = pmd_offset(pgd, vaddr);
	set_pmd(pmd, pfn_pmd(pfn, flags));

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

void __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t flags)
{
	unsigned long address = __fix_to_virt(idx);
@@ -142,6 +142,7 @@ static __inline__ int get_order(unsigned long size)
#define MAXMEM			((unsigned long)(-PAGE_OFFSET-VMALLOC_RESERVE))
#define __pa(x)			((unsigned long)(x)-PAGE_OFFSET)
#define __va(x)			((void *)((unsigned long)(x)+PAGE_OFFSET))
#define pfn_to_kaddr(pfn)	__va((pfn) << PAGE_SHIFT)

#ifndef CONFIG_DISCONTIGMEM
#define pfn_to_page(pfn)	(mem_map + (pfn))
#define page_to_pfn(page)	((unsigned long)((page) - mem_map))