Commit 919ee677 authored by David S. Miller

[SPARC64]: Add NUMA support.

Currently there is only code to parse NUMA attributes on
sun4v/niagara systems, but later on we will add such parsing
for older systems.
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 1f261ef5
VERSION = 2 VERSION = 2
PATCHLEVEL = 6 PATCHLEVEL = 6
SUBLEVEL = 25 SUBLEVEL = 25
EXTRAVERSION = EXTRAVERSION = -numa
NAME = Funky Weasel is Jiggy wit it NAME = Funky Weasel is Jiggy wit it
# *DOCUMENTATION* # *DOCUMENTATION*
......
...@@ -250,6 +250,26 @@ endchoice ...@@ -250,6 +250,26 @@ endchoice
endmenu endmenu
config NUMA
bool "NUMA support"
config NODES_SHIFT
int
default "4"
depends on NEED_MULTIPLE_NODES
# Some NUMA nodes have memory ranges that span
# other nodes. Even though a pfn is valid and
# between a node's start and end pfns, it may not
# reside on that node. See memmap_init_zone()
# for details.
config NODES_SPAN_OTHER_NODES
def_bool y
depends on NEED_MULTIPLE_NODES
config ARCH_POPULATES_NODE_MAP
def_bool y
config ARCH_SELECT_MEMORY_MODEL config ARCH_SELECT_MEMORY_MODEL
def_bool y def_bool y
......
# #
# Automatically generated make config: don't edit # Automatically generated make config: don't edit
# Linux kernel version: 2.6.25 # Linux kernel version: 2.6.25-numa
# Sun Apr 20 01:33:21 2008 # Wed Apr 23 04:49:08 2008
# #
CONFIG_SPARC=y CONFIG_SPARC=y
CONFIG_SPARC64=y CONFIG_SPARC64=y
...@@ -152,6 +152,8 @@ CONFIG_GENERIC_CALIBRATE_DELAY=y ...@@ -152,6 +152,8 @@ CONFIG_GENERIC_CALIBRATE_DELAY=y
CONFIG_HUGETLB_PAGE_SIZE_4MB=y CONFIG_HUGETLB_PAGE_SIZE_4MB=y
# CONFIG_HUGETLB_PAGE_SIZE_512K is not set # CONFIG_HUGETLB_PAGE_SIZE_512K is not set
# CONFIG_HUGETLB_PAGE_SIZE_64K is not set # CONFIG_HUGETLB_PAGE_SIZE_64K is not set
# CONFIG_NUMA is not set
CONFIG_ARCH_POPULATES_NODE_MAP=y
CONFIG_ARCH_SELECT_MEMORY_MODEL=y CONFIG_ARCH_SELECT_MEMORY_MODEL=y
CONFIG_ARCH_SPARSEMEM_ENABLE=y CONFIG_ARCH_SPARSEMEM_ENABLE=y
CONFIG_ARCH_SPARSEMEM_DEFAULT=y CONFIG_ARCH_SPARSEMEM_DEFAULT=y
...@@ -787,7 +789,6 @@ CONFIG_I2C_ALGOBIT=y ...@@ -787,7 +789,6 @@ CONFIG_I2C_ALGOBIT=y
# CONFIG_SENSORS_PCF8574 is not set # CONFIG_SENSORS_PCF8574 is not set
# CONFIG_PCF8575 is not set # CONFIG_PCF8575 is not set
# CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_PCF8591 is not set
# CONFIG_TPS65010 is not set
# CONFIG_SENSORS_MAX6875 is not set # CONFIG_SENSORS_MAX6875 is not set
# CONFIG_SENSORS_TSL2550 is not set # CONFIG_SENSORS_TSL2550 is not set
# CONFIG_I2C_DEBUG_CORE is not set # CONFIG_I2C_DEBUG_CORE is not set
...@@ -869,6 +870,7 @@ CONFIG_SSB_POSSIBLE=y ...@@ -869,6 +870,7 @@ CONFIG_SSB_POSSIBLE=y
# Multifunction device drivers # Multifunction device drivers
# #
# CONFIG_MFD_SM501 is not set # CONFIG_MFD_SM501 is not set
# CONFIG_HTC_PASIC3 is not set
# #
# Multimedia devices # Multimedia devices
...@@ -1219,10 +1221,6 @@ CONFIG_USB_STORAGE=m ...@@ -1219,10 +1221,6 @@ CONFIG_USB_STORAGE=m
# CONFIG_NEW_LEDS is not set # CONFIG_NEW_LEDS is not set
# CONFIG_INFINIBAND is not set # CONFIG_INFINIBAND is not set
# CONFIG_RTC_CLASS is not set # CONFIG_RTC_CLASS is not set
#
# Userspace I/O
#
# CONFIG_UIO is not set # CONFIG_UIO is not set
# #
...@@ -1399,6 +1397,7 @@ CONFIG_SCHEDSTATS=y ...@@ -1399,6 +1397,7 @@ CONFIG_SCHEDSTATS=y
CONFIG_DEBUG_BUGVERBOSE=y CONFIG_DEBUG_BUGVERBOSE=y
# CONFIG_DEBUG_INFO is not set # CONFIG_DEBUG_INFO is not set
# CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_VM is not set
# CONFIG_DEBUG_WRITECOUNT is not set
# CONFIG_DEBUG_LIST is not set # CONFIG_DEBUG_LIST is not set
# CONFIG_DEBUG_SG is not set # CONFIG_DEBUG_SG is not set
# CONFIG_BOOT_PRINTK_DELAY is not set # CONFIG_BOOT_PRINTK_DELAY is not set
...@@ -1425,53 +1424,82 @@ CONFIG_ASYNC_CORE=m ...@@ -1425,53 +1424,82 @@ CONFIG_ASYNC_CORE=m
CONFIG_ASYNC_MEMCPY=m CONFIG_ASYNC_MEMCPY=m
CONFIG_ASYNC_XOR=m CONFIG_ASYNC_XOR=m
CONFIG_CRYPTO=y CONFIG_CRYPTO=y
#
# Crypto core or helper
#
CONFIG_CRYPTO_ALGAPI=y CONFIG_CRYPTO_ALGAPI=y
CONFIG_CRYPTO_AEAD=y CONFIG_CRYPTO_AEAD=y
CONFIG_CRYPTO_BLKCIPHER=y CONFIG_CRYPTO_BLKCIPHER=y
# CONFIG_CRYPTO_SEQIV is not set
CONFIG_CRYPTO_HASH=y CONFIG_CRYPTO_HASH=y
CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_MANAGER=y
CONFIG_CRYPTO_GF128MUL=m
CONFIG_CRYPTO_NULL=m
# CONFIG_CRYPTO_CRYPTD is not set
CONFIG_CRYPTO_AUTHENC=y
CONFIG_CRYPTO_TEST=m
#
# Authenticated Encryption with Associated Data
#
# CONFIG_CRYPTO_CCM is not set
# CONFIG_CRYPTO_GCM is not set
# CONFIG_CRYPTO_SEQIV is not set
#
# Block modes
#
CONFIG_CRYPTO_CBC=y
# CONFIG_CRYPTO_CTR is not set
# CONFIG_CRYPTO_CTS is not set
CONFIG_CRYPTO_ECB=m
CONFIG_CRYPTO_LRW=m
CONFIG_CRYPTO_PCBC=m
CONFIG_CRYPTO_XTS=m
#
# Hash modes
#
CONFIG_CRYPTO_HMAC=y CONFIG_CRYPTO_HMAC=y
CONFIG_CRYPTO_XCBC=y CONFIG_CRYPTO_XCBC=y
CONFIG_CRYPTO_NULL=m
#
# Digest
#
CONFIG_CRYPTO_CRC32C=m
CONFIG_CRYPTO_MD4=y CONFIG_CRYPTO_MD4=y
CONFIG_CRYPTO_MD5=y CONFIG_CRYPTO_MD5=y
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_SHA1=y CONFIG_CRYPTO_SHA1=y
CONFIG_CRYPTO_SHA256=m CONFIG_CRYPTO_SHA256=m
CONFIG_CRYPTO_SHA512=m CONFIG_CRYPTO_SHA512=m
CONFIG_CRYPTO_WP512=m
CONFIG_CRYPTO_TGR192=m CONFIG_CRYPTO_TGR192=m
CONFIG_CRYPTO_GF128MUL=m CONFIG_CRYPTO_WP512=m
CONFIG_CRYPTO_ECB=m
CONFIG_CRYPTO_CBC=y #
CONFIG_CRYPTO_PCBC=m # Ciphers
CONFIG_CRYPTO_LRW=m #
CONFIG_CRYPTO_XTS=m
# CONFIG_CRYPTO_CTR is not set
# CONFIG_CRYPTO_GCM is not set
# CONFIG_CRYPTO_CCM is not set
# CONFIG_CRYPTO_CRYPTD is not set
CONFIG_CRYPTO_DES=y
CONFIG_CRYPTO_FCRYPT=m
CONFIG_CRYPTO_BLOWFISH=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_TWOFISH_COMMON=m
CONFIG_CRYPTO_SERPENT=m
CONFIG_CRYPTO_AES=m CONFIG_CRYPTO_AES=m
CONFIG_CRYPTO_ANUBIS=m
CONFIG_CRYPTO_ARC4=m
CONFIG_CRYPTO_BLOWFISH=m
CONFIG_CRYPTO_CAMELLIA=m
CONFIG_CRYPTO_CAST5=m CONFIG_CRYPTO_CAST5=m
CONFIG_CRYPTO_CAST6=m CONFIG_CRYPTO_CAST6=m
CONFIG_CRYPTO_TEA=m CONFIG_CRYPTO_DES=y
CONFIG_CRYPTO_ARC4=m CONFIG_CRYPTO_FCRYPT=m
CONFIG_CRYPTO_KHAZAD=m CONFIG_CRYPTO_KHAZAD=m
CONFIG_CRYPTO_ANUBIS=m
CONFIG_CRYPTO_SEED=m
# CONFIG_CRYPTO_SALSA20 is not set # CONFIG_CRYPTO_SALSA20 is not set
CONFIG_CRYPTO_SEED=m
CONFIG_CRYPTO_SERPENT=m
CONFIG_CRYPTO_TEA=m
CONFIG_CRYPTO_TWOFISH=m
CONFIG_CRYPTO_TWOFISH_COMMON=m
#
# Compression
#
CONFIG_CRYPTO_DEFLATE=y CONFIG_CRYPTO_DEFLATE=y
CONFIG_CRYPTO_MICHAEL_MIC=m
CONFIG_CRYPTO_CRC32C=m
CONFIG_CRYPTO_CAMELLIA=m
CONFIG_CRYPTO_TEST=m
CONFIG_CRYPTO_AUTHENC=y
# CONFIG_CRYPTO_LZO is not set # CONFIG_CRYPTO_LZO is not set
CONFIG_CRYPTO_HW=y CONFIG_CRYPTO_HW=y
# CONFIG_CRYPTO_DEV_HIFN_795X is not set # CONFIG_CRYPTO_DEV_HIFN_795X is not set
...@@ -1492,3 +1520,4 @@ CONFIG_PLIST=y ...@@ -1492,3 +1520,4 @@ CONFIG_PLIST=y
CONFIG_HAS_IOMEM=y CONFIG_HAS_IOMEM=y
CONFIG_HAS_IOPORT=y CONFIG_HAS_IOPORT=y
CONFIG_HAS_DMA=y CONFIG_HAS_DMA=y
CONFIG_HAVE_LMB=y
...@@ -273,10 +273,22 @@ static void __init check_mmu_stats(void) ...@@ -273,10 +273,22 @@ static void __init check_mmu_stats(void)
mmu_stats_supported = 1; mmu_stats_supported = 1;
} }
/* Register every node id in [0, MAX_NUMNODES) via register_one_node().
 * Compiles to an empty function when CONFIG_NUMA is not set.
 */
static void register_nodes(void)
{
#ifdef CONFIG_NUMA
	int nid = 0;

	while (nid < MAX_NUMNODES) {
		register_one_node(nid);
		nid++;
	}
#endif
}
static int __init topology_init(void) static int __init topology_init(void)
{ {
int cpu; int cpu;
register_nodes();
check_mmu_stats(); check_mmu_stats();
register_cpu_notifier(&sysfs_cpu_nb); register_cpu_notifier(&sysfs_cpu_nb);
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include <linux/sort.h> #include <linux/sort.h>
#include <linux/percpu.h> #include <linux/percpu.h>
#include <linux/lmb.h> #include <linux/lmb.h>
#include <linux/mmzone.h>
#include <asm/head.h> #include <asm/head.h>
#include <asm/system.h> #include <asm/system.h>
...@@ -73,9 +74,7 @@ extern struct tsb swapper_4m_tsb[KERNEL_TSB4M_NENTRIES]; ...@@ -73,9 +74,7 @@ extern struct tsb swapper_4m_tsb[KERNEL_TSB4M_NENTRIES];
#define MAX_BANKS 32 #define MAX_BANKS 32
static struct linux_prom64_registers pavail[MAX_BANKS] __initdata; static struct linux_prom64_registers pavail[MAX_BANKS] __initdata;
static struct linux_prom64_registers pavail_rescan[MAX_BANKS] __initdata;
static int pavail_ents __initdata; static int pavail_ents __initdata;
static int pavail_rescan_ents __initdata;
static int cmp_p64(const void *a, const void *b) static int cmp_p64(const void *a, const void *b)
{ {
...@@ -716,19 +715,28 @@ void get_new_mmu_context(struct mm_struct *mm) ...@@ -716,19 +715,28 @@ void get_new_mmu_context(struct mm_struct *mm)
smp_new_mmu_context_version(); smp_new_mmu_context_version();
} }
/* Find a free area for the bootmem map, avoiding the kernel image static int numa_enabled = 1;
* and the initial ramdisk. static int numa_debug;
*/
static unsigned long __init choose_bootmap_pfn(unsigned long start_pfn, static int __init early_numa(char *p)
unsigned long end_pfn)
{ {
unsigned long bootmap_size; if (!p)
return 0;
if (strstr(p, "off"))
numa_enabled = 0;
bootmap_size = bootmem_bootmap_pages(end_pfn - start_pfn); if (strstr(p, "debug"))
bootmap_size <<= PAGE_SHIFT; numa_debug = 1;
return lmb_alloc(bootmap_size, PAGE_SIZE) >> PAGE_SHIFT; return 0;
} }
early_param("numa", early_numa);
#define numadbg(f, a...) \
do { if (numa_debug) \
printk(KERN_INFO f, ## a); \
} while (0)
static void __init find_ramdisk(unsigned long phys_base) static void __init find_ramdisk(unsigned long phys_base)
{ {
...@@ -755,6 +763,9 @@ static void __init find_ramdisk(unsigned long phys_base) ...@@ -755,6 +763,9 @@ static void __init find_ramdisk(unsigned long phys_base)
ramdisk_image -= KERNBASE; ramdisk_image -= KERNBASE;
ramdisk_image += phys_base; ramdisk_image += phys_base;
numadbg("Found ramdisk at physical address 0x%lx, size %u\n",
ramdisk_image, sparc_ramdisk_size);
initrd_start = ramdisk_image; initrd_start = ramdisk_image;
initrd_end = ramdisk_image + sparc_ramdisk_size; initrd_end = ramdisk_image + sparc_ramdisk_size;
...@@ -763,60 +774,625 @@ static void __init find_ramdisk(unsigned long phys_base) ...@@ -763,60 +774,625 @@ static void __init find_ramdisk(unsigned long phys_base)
#endif #endif
} }
/* About pages_avail, this is the value we will use to calculate struct node_mem_mask {
* the zholes_size[] argument given to free_area_init_node(). The unsigned long mask;
* page allocator uses this to calculate nr_kernel_pages, unsigned long val;
* nr_all_pages and zone->present_pages. On NUMA it is used unsigned long bootmem_paddr;
* to calculate zone->min_unmapped_pages and zone->min_slab_pages. };
* static struct node_mem_mask node_masks[MAX_NUMNODES];
* So this number should really be set to what the page allocator static int num_node_masks;
* actually ends up with. This means:
* 1) It should include bootmem map pages, we'll release those. int numa_cpu_lookup_table[NR_CPUS];
* 2) It should not include the kernel image, except for the cpumask_t numa_cpumask_lookup_table[MAX_NUMNODES];
* __init sections which we will also release.
* 3) It should include the initrd image, since we'll release #ifdef CONFIG_NEED_MULTIPLE_NODES
* that too. static bootmem_data_t plat_node_bdata[MAX_NUMNODES];
/* One "mblock" node read from the machine description.  The fields
 * are filled in by grab_mblocks() from the node's "base", "size" and
 * "address-congruence-offset" properties and consumed by ra_to_pa().
 */
struct mdesc_mblock {
u64 base; /* first address covered by the block */
u64 size; /* length of the block; it covers [base, base + size) */
u64 offset; /* RA-to-PA */
};
static struct mdesc_mblock *mblocks;
static int num_mblocks;
/* Translate a real address into a physical address.
 *
 * The first mblock whose [base, base + size) range contains 'addr'
 * supplies the RA-to-PA offset that is added to it.  An address not
 * covered by any mblock is returned unchanged.
 */
static unsigned long ra_to_pa(unsigned long addr)
{
	int idx;

	for (idx = 0; idx < num_mblocks; idx++) {
		struct mdesc_mblock *mb = &mblocks[idx];

		if (addr < mb->base || addr >= (mb->base + mb->size))
			continue;

		return addr + mb->offset;
	}

	return addr;
}
/* Return the index of the first node_masks[] entry that matches the
 * physical address corresponding to 'addr' (an entry matches when
 * (pa & mask) == val), or -1 when no entry matches.
 */
static int find_node(unsigned long addr)
{
	const unsigned long pa = ra_to_pa(addr);
	int nid = 0;

	while (nid < num_node_masks) {
		struct node_mem_mask *nm = &node_masks[nid];

		if ((pa & nm->mask) == nm->val)
			return nid;
		nid++;
	}

	return -1;
}
/* Determine the node owning 'start' and how far that ownership runs.
 *
 * *nid receives find_node(start).  The return value is the address of
 * the first page, stepping by PAGE_SIZE from 'start', that either lies
 * at or beyond 'end' or maps to a different node.  At least one page
 * is always consumed, so the result is never 'start' itself.
 */
static unsigned long nid_range(unsigned long start, unsigned long end,
			       int *nid)
{
	unsigned long pos;

	*nid = find_node(start);

	for (pos = start + PAGE_SIZE; pos < end; pos += PAGE_SIZE) {
		if (find_node(pos) != *nid)
			break;
	}

	return pos;
}
#else
/* Variant used when CONFIG_NEED_MULTIPLE_NODES is not set: the whole
 * [start, end) range is reported as belonging to node 0.
 */
static unsigned long nid_range(unsigned long start, unsigned long end,
			       int *nid)
{
	const int only_node = 0;

	(void)start;	/* the range is never split, so its start is irrelevant */
	*nid = only_node;

	return end;
}
#endif
/* This must be invoked after performing all of the necessary
* add_active_range() calls for 'nid'. We need to be able to get
* correct data from get_pfn_range_for_nid().
*/ */
static unsigned long __init bootmem_init(unsigned long *pages_avail, static void __init allocate_node_data(int nid)
unsigned long phys_base) {
unsigned long paddr, num_pages, start_pfn, end_pfn;
struct pglist_data *p;
#ifdef CONFIG_NEED_MULTIPLE_NODES
paddr = lmb_alloc_nid(sizeof(struct pglist_data),
SMP_CACHE_BYTES, nid, nid_range);
if (!paddr) {
prom_printf("Cannot allocate pglist_data for nid[%d]\n", nid);
prom_halt();
}
NODE_DATA(nid) = __va(paddr);
memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
NODE_DATA(nid)->bdata = &plat_node_bdata[nid];
#endif
p = NODE_DATA(nid);
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
p->node_start_pfn = start_pfn;
p->node_spanned_pages = end_pfn - start_pfn;
if (p->node_spanned_pages) {
num_pages = bootmem_bootmap_pages(p->node_spanned_pages);
paddr = lmb_alloc_nid(num_pages << PAGE_SHIFT, PAGE_SIZE, nid,
nid_range);
if (!paddr) {
prom_printf("Cannot allocate bootmap for nid[%d]\n",
nid);
prom_halt();
}
node_masks[nid].bootmem_paddr = paddr;
}
}
static void init_node_masks_nonnuma(void)
{ {
unsigned long end_pfn;
int i; int i;
*pages_avail = lmb_phys_mem_size() >> PAGE_SHIFT; numadbg("Initializing tables for non-numa.\n");
end_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
/* Initialize the boot-time allocator. */ node_masks[0].mask = node_masks[0].val = 0;
max_pfn = max_low_pfn = end_pfn; num_node_masks = 1;
min_low_pfn = (phys_base >> PAGE_SHIFT);
init_bootmem_node(NODE_DATA(0), for (i = 0; i < NR_CPUS; i++)
choose_bootmap_pfn(min_low_pfn, end_pfn), numa_cpu_lookup_table[i] = 0;
min_low_pfn, end_pfn);
/* Now register the available physical memory with the numa_cpumask_lookup_table[0] = CPU_MASK_ALL;
* allocator. }
*/
for (i = 0; i < lmb.memory.cnt; i++) #ifdef CONFIG_NEED_MULTIPLE_NODES
free_bootmem(lmb.memory.region[i].base, struct pglist_data *node_data[MAX_NUMNODES];
lmb_size_bytes(&lmb.memory, i));
EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(numa_cpumask_lookup_table);
EXPORT_SYMBOL(node_data);
/* One "memory-latency-group" node read from the machine description
 * by grab_mlgroups().  numa_attach_mlgroup() copies 'mask' and 'match'
 * of the lowest-latency group into the node_masks[] entry of a node.
 */
struct mdesc_mlgroup {
u64 node; /* mdesc node handle, compared against arc targets by find_mlgroup() */
u64 latency; /* value of the "latency" property */
u64 match; /* value of the "address-match" property */
u64 mask; /* value of the "address-mask" property */
};
static struct mdesc_mlgroup *mlgroups;
static int num_mlgroups;
/* Walk the forward arcs of mdesc node 'pio' looking for a target whose
 * "cfg-handle" property equals 'cfg_handle'.
 *
 * Returns 0 when such a target exists, -ENODEV otherwise.
 */
static int scan_pio_for_cfg_handle(struct mdesc_handle *md, u64 pio,
				   u32 cfg_handle)
{
	u64 a;

	mdesc_for_each_arc(a, md, pio, MDESC_ARC_TYPE_FWD) {
		u64 tgt = mdesc_arc_target(md, a);
		const u64 *handle;

		handle = mdesc_get_property(md, tgt, "cfg-handle", NULL);
		if (!handle)
			continue;
		if (*handle == cfg_handle)
			return 0;
	}

	return -ENODEV;
}
/* Among the forward-arc targets of 'grp' named "pio-latency-group",
 * pick the one with the smallest "latency" property (targets without
 * that property are ignored) and search it for 'cfg_handle'.
 *
 * Returns -ENODEV when no such target exists, otherwise the result of
 * scan_pio_for_cfg_handle() on the chosen target.
 */
static int scan_arcs_for_cfg_handle(struct mdesc_handle *md, u64 grp,
				    u32 cfg_handle)
{
	u64 best = MDESC_NODE_NULL;
	u64 lowest = ~(u64)0;
	u64 a;

	mdesc_for_each_arc(a, md, grp, MDESC_ARC_TYPE_FWD) {
		u64 tgt = mdesc_arc_target(md, a);
		const u64 *lat;

		if (strcmp(mdesc_node_name(md, tgt), "pio-latency-group") != 0)
			continue;

		lat = mdesc_get_property(md, tgt, "latency", NULL);
		if (lat && *lat < lowest) {
			best = tgt;
			lowest = *lat;
		}
	}

	if (best != MDESC_NODE_NULL)
		return scan_pio_for_cfg_handle(md, best, cfg_handle);

	return -ENODEV;
}
/* Map a device tree node to a NUMA node id.
 *
 * The cfg-handle is taken from bits 32..59 of the first "reg" entry's
 * physical address.  The machine description "group" nodes are then
 * visited in order; the position of the first group for which
 * scan_arcs_for_cfg_handle() succeeds is the node id.
 *
 * Returns -1 when no mlgroups were recorded, when 'dp' has no "reg"
 * property, or when no group matches.
 */
int of_node_to_nid(struct device_node *dp)
{
	const struct linux_prom64_registers *regs;
	struct mdesc_handle *md;
	u32 cfg_handle;
	int idx = 0, found = -1;
	u64 grp;

	if (!mlgroups)
		return -1;

	regs = of_get_property(dp, "reg", NULL);
	if (!regs)
		return -1;

	cfg_handle = (regs->phys_addr >> 32UL) & 0x0fffffff;

	md = mdesc_grab();

	mdesc_for_each_node_by_name(md, grp, "group") {
		if (scan_arcs_for_cfg_handle(md, grp, cfg_handle) == 0) {
			found = idx;
			break;
		}
		idx++;
	}

	mdesc_release(md);

	return found;
}
/* Split every lmb.memory region along node boundaries, as reported by
 * nid_range(), and register each piece with add_active_range().
 */
static void add_node_ranges(void)
{
	int r;

	for (r = 0; r < lmb.memory.cnt; r++) {
		const unsigned long bytes = lmb_size_bytes(&lmb.memory, r);
		unsigned long cur = lmb.memory.region[r].base;
		const unsigned long limit = cur + bytes;

		while (cur < limit) {
			int nid;
			const unsigned long next = nid_range(cur, limit, &nid);

			numadbg("Adding active range nid[%d] "
				"start[%lx] end[%lx]\n",
				nid, cur, next);

			add_active_range(nid,
					 cur >> PAGE_SHIFT,
					 next >> PAGE_SHIFT);

			cur = next;
		}
	}
}
for (i = 0; i < lmb.reserved.cnt; i++) static int __init grab_mlgroups(struct mdesc_handle *md)
reserve_bootmem(lmb.reserved.region[i].base, {
lmb_size_bytes(&lmb.reserved, i), unsigned long paddr;
BOOTMEM_DEFAULT); int count = 0;
u64 node;
mdesc_for_each_node_by_name(md, node, "memory-latency-group")
count++;
if (!count)
return -ENOENT;
paddr = lmb_alloc(count * sizeof(struct mdesc_mlgroup),
SMP_CACHE_BYTES);
if (!paddr)
return -ENOMEM;
mlgroups = __va(paddr);
num_mlgroups = count;
count = 0;
mdesc_for_each_node_by_name(md, node, "memory-latency-group") {
struct mdesc_mlgroup *m = &mlgroups[count++];
const u64 *val;
m->node = node;
val = mdesc_get_property(md, node, "latency", NULL);
m->latency = *val;
val = mdesc_get_property(md, node, "address-match", NULL);
m->match = *val;
val = mdesc_get_property(md, node, "address-mask", NULL);
m->mask = *val;
numadbg("MLGROUP[%d]: node[%lx] latency[%lx] "
"match[%lx] mask[%lx]\n",
count - 1, m->node, m->latency, m->match, m->mask);
}
*pages_avail -= PAGE_ALIGN(kern_size) >> PAGE_SHIFT; return 0;
}
for (i = 0; i < lmb.memory.cnt; ++i) { static int __init grab_mblocks(struct mdesc_handle *md)
unsigned long start_pfn, end_pfn, pages; {
unsigned long paddr;
int count = 0;
u64 node;
mdesc_for_each_node_by_name(md, node, "mblock")
count++;
if (!count)
return -ENOENT;
paddr = lmb_alloc(count * sizeof(struct mdesc_mblock),
SMP_CACHE_BYTES);
if (!paddr)
return -ENOMEM;
mblocks = __va(paddr);
num_mblocks = count;
count = 0;
mdesc_for_each_node_by_name(md, node, "mblock") {
struct mdesc_mblock *m = &mblocks[count++];
const u64 *val;
val = mdesc_get_property(md, node, "base", NULL);
m->base = *val;
val = mdesc_get_property(md, node, "size", NULL);
m->size = *val;
val = mdesc_get_property(md, node,
"address-congruence-offset", NULL);
m->offset = *val;
numadbg("MBLOCK[%d]: base[%lx] size[%lx] offset[%lx]\n",
count - 1, m->base, m->size, m->offset);
}
return 0;
}
/* Build the set of cpus that belong to mdesc group 'grp'.
 *
 * '*mask' is cleared first.  Every back-arc target of 'grp' named
 * "cpu" whose "id" property is below NR_CPUS is then set in '*mask'.
 * A "cpu" node with no "id" property is skipped rather than
 * dereferencing the NULL returned by mdesc_get_property().
 */
static void __init numa_parse_mdesc_group_cpus(struct mdesc_handle *md,
					       u64 grp, cpumask_t *mask)
{
	u64 arc;

	cpus_clear(*mask);

	mdesc_for_each_arc(arc, md, grp, MDESC_ARC_TYPE_BACK) {
		u64 target = mdesc_arc_target(md, arc);
		const char *name = mdesc_node_name(md, target);
		const u64 *id;

		if (strcmp(name, "cpu"))
			continue;

		id = mdesc_get_property(md, target, "id", NULL);
		if (!id)
			continue;

		if (*id < NR_CPUS)
			cpu_set(*id, *mask);
	}
}
/* Look up the recorded memory-latency-group whose mdesc node handle
 * equals 'node'.  Returns NULL when there is none.
 */
static struct mdesc_mlgroup * __init find_mlgroup(u64 node)
{
	int idx = 0;

	while (idx < num_mlgroups) {
		if (mlgroups[idx].node == node)
			return &mlgroups[idx];
		idx++;
	}

	return NULL;
}
/* Attach the lowest-latency memory-latency-group reachable over a
 * forward arc from 'grp' to the next free node_masks[] slot.
 *
 * Returns -ENOENT when 'grp' reaches no known mlgroup, -EINVAL when
 * 'index' is not the next slot to be filled, and 0 on success.
 */
static int __init numa_attach_mlgroup(struct mdesc_handle *md, u64 grp,
				      int index)
{
	struct mdesc_mlgroup *best = NULL;
	struct node_mem_mask *slot;
	u64 lowest = ~(u64)0;
	u64 a;

	mdesc_for_each_arc(a, md, grp, MDESC_ARC_TYPE_FWD) {
		struct mdesc_mlgroup *m;

		m = find_mlgroup(mdesc_arc_target(md, a));
		if (m && m->latency < lowest) {
			best = m;
			lowest = m->latency;
		}
	}

	if (!best)
		return -ENOENT;

	/* Slots are filled strictly in group order. */
	if (num_node_masks != index) {
		printk(KERN_ERR "Inconsistent NUMA state, "
		       "index[%d] != num_node_masks[%d]\n",
		       index, num_node_masks);
		return -EINVAL;
	}

	slot = &node_masks[num_node_masks];
	num_node_masks++;

	slot->mask = best->mask;
	slot->val = best->match;

	numadbg("NUMA NODE[%d]: mask[%lx] val[%lx] (latency[%lx])\n",
		index, slot->mask, slot->val, best->latency);

	return 0;
}
/* Record mdesc group 'grp' as NUMA node 'index': fill in both cpu
 * lookup tables from the group's cpus, optionally log them, then
 * attach the group's memory mask via numa_attach_mlgroup(), whose
 * result is returned.
 */
static int __init numa_parse_mdesc_group(struct mdesc_handle *md, u64 grp,
					 int index)
{
	cpumask_t cpus;
	int cpu;

	numa_parse_mdesc_group_cpus(md, grp, &cpus);

	numa_cpumask_lookup_table[index] = cpus;
	for_each_cpu_mask(cpu, cpus)
		numa_cpu_lookup_table[cpu] = index;

	if (numa_debug) {
		printk(KERN_INFO "NUMA GROUP[%d]: cpus [ ", index);
		for_each_cpu_mask(cpu, cpus)
			printk("%d ", cpu);
		printk("]\n");
	}

	return numa_attach_mlgroup(md, grp, index);
}
/* Build the NUMA tables from the machine description.
 *
 * Returns -ENOENT when there is no "latency-groups" node, or the
 * negative error from grab_mblocks()/grab_mlgroups().  Once those
 * succeed the result is 0: a failing "group" node only stops the
 * group scan, and the nodes recorded up to that point are still
 * given their active ranges, node data, and online state.
 */
static int __init numa_parse_mdesc(void)
{
	struct mdesc_handle *md = mdesc_grab();
	int nid, ngroups, err;
	u64 node;

	node = mdesc_node_by_name(md, MDESC_NODE_NULL, "latency-groups");
	if (node == MDESC_NODE_NULL) {
		err = -ENOENT;
		goto out;
	}

	err = grab_mblocks(md);
	if (err >= 0)
		err = grab_mlgroups(md);
	if (err < 0)
		goto out;

	ngroups = 0;
	mdesc_for_each_node_by_name(md, node, "group") {
		if (numa_parse_mdesc_group(md, node, ngroups) < 0)
			break;
		ngroups++;
	}

	add_node_ranges();

	for (nid = 0; nid < num_node_masks; nid++) {
		allocate_node_data(nid);
		node_set_online(nid);
	}

	err = 0;
out:
	mdesc_release(md);
	return err;
}
/* Placeholder for non-hypervisor (sun4u) systems: no NUMA attributes
 * are parsed yet, so this always returns -1.  bootmem_init() treats a
 * negative result from bootmem_init_numa() as "use the non-NUMA setup".
 */
static int __init numa_parse_sun4u(void)
{
return -1;
}
/* Try to set up the NUMA tables.
 *
 * Returns -1 when NUMA was disabled via the numa_enabled flag.
 * Otherwise returns the result of the parser that matches the
 * platform: the machine description parser on hypervisor systems,
 * the sun4u one elsewhere.
 */
static int __init bootmem_init_numa(void)
{
	numadbg("bootmem_init_numa()\n");

	if (!numa_enabled)
		return -1;

	return (tlb_type == hypervisor) ? numa_parse_mdesc()
					: numa_parse_sun4u();
}
#else
/* Variant built when CONFIG_NEED_MULTIPLE_NODES is not set: there is
 * nothing to parse, so always return -1, which makes bootmem_init()
 * fall back to bootmem_init_nonnuma().
 */
static int bootmem_init_numa(void)
{
return -1;
}
#endif
static void __init bootmem_init_nonnuma(void)
{
unsigned long top_of_ram = lmb_end_of_DRAM();
unsigned long total_ram = lmb_phys_mem_size();
unsigned int i;
numadbg("bootmem_init_nonnuma()\n");
printk(KERN_INFO "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
top_of_ram, total_ram);
printk(KERN_INFO "Memory hole size: %ldMB\n",
(top_of_ram - total_ram) >> 20);
init_node_masks_nonnuma();
for (i = 0; i < lmb.memory.cnt; i++) {
unsigned long size = lmb_size_bytes(&lmb.memory, i);
unsigned long start_pfn, end_pfn;
if (!size)
continue;
pages = lmb_size_pages(&lmb.memory, i);
start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT; start_pfn = lmb.memory.region[i].base >> PAGE_SHIFT;
end_pfn = start_pfn + pages; end_pfn = start_pfn + lmb_size_pages(&lmb.memory, i);
add_active_range(0, start_pfn, end_pfn);
}
memory_present(0, start_pfn, end_pfn); allocate_node_data(0);
node_set_online(0);
}
static void __init reserve_range_in_node(int nid, unsigned long start,
unsigned long end)
{
numadbg(" reserve_range_in_node(nid[%d],start[%lx],end[%lx]\n",
nid, start, end);
while (start < end) {
unsigned long this_end;
int n;
this_end = nid_range(start, end, &n);
if (n == nid) {
numadbg(" MATCH reserving range [%lx:%lx]\n",
start, this_end);
reserve_bootmem_node(NODE_DATA(nid), start,
(this_end - start), BOOTMEM_DEFAULT);
} else
numadbg(" NO MATCH, advancing start to %lx\n",
this_end);
start = this_end;
} }
}
/* Pass every lmb.reserved region to reserve_range_in_node() so that
 * the parts of it belonging to node 'nid' are reserved in that node's
 * bootmem map.
 */
static void __init trim_reserved_in_node(int nid)
{
	int r = 0;

	numadbg(" trim_reserved_in_node(%d)\n", nid);

	while (r < lmb.reserved.cnt) {
		const unsigned long base = lmb.reserved.region[r].base;
		const unsigned long bytes = lmb_size_bytes(&lmb.reserved, r);

		reserve_range_in_node(nid, base, base + bytes);
		r++;
	}
}
/* Bring up the bootmem allocator for node 'nid'.
 *
 * Nothing is done for a node that spans no pages.  Otherwise the
 * bootmem map is initialized at the address recorded in
 * node_masks[nid].bootmem_paddr, the node's active regions are freed
 * into it, reserved lmb regions are re-reserved, and the node's
 * memory is reported to sparsemem.
 */
static void __init bootmem_init_one_node(int nid)
{
	struct pglist_data *pgdat;
	unsigned long map_pfn, last_pfn;

	numadbg("bootmem_init_one_node(%d)\n", nid);

	pgdat = NODE_DATA(nid);
	if (!pgdat->node_spanned_pages)
		return;

	map_pfn = node_masks[nid].bootmem_paddr >> PAGE_SHIFT;
	last_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;

	numadbg(" init_bootmem_node(%d, %lx, %lx, %lx)\n",
		nid, map_pfn, pgdat->node_start_pfn, last_pfn);
	init_bootmem_node(pgdat, map_pfn, pgdat->node_start_pfn, last_pfn);

	numadbg(" free_bootmem_with_active_regions(%d, %lx)\n",
		nid, last_pfn);
	free_bootmem_with_active_regions(nid, last_pfn);

	trim_reserved_in_node(nid);

	numadbg(" sparse_memory_present_with_active_regions(%d)\n",
		nid);
	sparse_memory_present_with_active_regions(nid);
}
static unsigned long __init bootmem_init(unsigned long phys_base)
{
unsigned long end_pfn;
int nid;
end_pfn = lmb_end_of_DRAM() >> PAGE_SHIFT;
max_pfn = max_low_pfn = end_pfn;
min_low_pfn = (phys_base >> PAGE_SHIFT);
if (bootmem_init_numa() < 0)
bootmem_init_nonnuma();
/* XXX cpu notifier XXX */
for_each_online_node(nid)
bootmem_init_one_node(nid);
sparse_init(); sparse_init();
...@@ -1112,7 +1688,7 @@ void __init setup_per_cpu_areas(void) ...@@ -1112,7 +1688,7 @@ void __init setup_per_cpu_areas(void)
void __init paging_init(void) void __init paging_init(void)
{ {
unsigned long end_pfn, pages_avail, shift, phys_base; unsigned long end_pfn, shift, phys_base;
unsigned long real_end, i; unsigned long real_end, i;
/* These build time checkes make sure that the dcache_dirty_cpu() /* These build time checkes make sure that the dcache_dirty_cpu()
...@@ -1220,27 +1796,21 @@ void __init paging_init(void) ...@@ -1220,27 +1796,21 @@ void __init paging_init(void)
sun4v_mdesc_init(); sun4v_mdesc_init();
/* Setup bootmem... */ /* Setup bootmem... */
pages_avail = 0; last_valid_pfn = end_pfn = bootmem_init(phys_base);
last_valid_pfn = end_pfn = bootmem_init(&pages_avail, phys_base);
#ifndef CONFIG_NEED_MULTIPLE_NODES
max_mapnr = last_valid_pfn; max_mapnr = last_valid_pfn;
#endif
kernel_physical_mapping_init(); kernel_physical_mapping_init();
{ {
unsigned long zones_size[MAX_NR_ZONES]; unsigned long max_zone_pfns[MAX_NR_ZONES];
unsigned long zholes_size[MAX_NR_ZONES];
int znum;
for (znum = 0; znum < MAX_NR_ZONES; znum++) memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
zones_size[znum] = zholes_size[znum] = 0;
zones_size[ZONE_NORMAL] = end_pfn; max_zone_pfns[ZONE_NORMAL] = end_pfn;
zholes_size[ZONE_NORMAL] = end_pfn - pages_avail;
free_area_init_node(0, &contig_page_data, zones_size, free_area_init_nodes(max_zone_pfns);
__pa(PAGE_OFFSET) >> PAGE_SHIFT,
zholes_size);
} }
printk("Booting Linux...\n"); printk("Booting Linux...\n");
...@@ -1249,21 +1819,52 @@ void __init paging_init(void) ...@@ -1249,21 +1819,52 @@ void __init paging_init(void)
cpu_probe(); cpu_probe();
} }
static void __init taint_real_pages(void) int __init page_in_phys_avail(unsigned long paddr)
{
int i;
paddr &= PAGE_MASK;
for (i = 0; i < pavail_ents; i++) {
unsigned long start, end;
start = pavail[i].phys_addr;
end = start + pavail[i].reg_size;
if (paddr >= start && paddr < end)
return 1;
}
if (paddr >= kern_base && paddr < (kern_base + kern_size))
return 1;
#ifdef CONFIG_BLK_DEV_INITRD
if (paddr >= __pa(initrd_start) &&
paddr < __pa(PAGE_ALIGN(initrd_end)))
return 1;
#endif
return 0;
}
static struct linux_prom64_registers pavail_rescan[MAX_BANKS] __initdata;
static int pavail_rescan_ents __initdata;
/* Certain OBP calls, such as fetching "available" properties, can
* claim physical memory. So, along with initializing the valid
* address bitmap, what we do here is refetch the physical available
* memory list again, and make sure it provides at least as much
* memory as 'pavail' does.
*/
static void setup_valid_addr_bitmap_from_pavail(void)
{ {
int i; int i;
read_obp_memory("available", &pavail_rescan[0], &pavail_rescan_ents); read_obp_memory("available", &pavail_rescan[0], &pavail_rescan_ents);
/* Find changes discovered in the physmem available rescan and
* reserve the lost portions in the bootmem maps.
*/
for (i = 0; i < pavail_ents; i++) { for (i = 0; i < pavail_ents; i++) {
unsigned long old_start, old_end; unsigned long old_start, old_end;
old_start = pavail[i].phys_addr; old_start = pavail[i].phys_addr;
old_end = old_start + old_end = old_start + pavail[i].reg_size;
pavail[i].reg_size;
while (old_start < old_end) { while (old_start < old_end) {
int n; int n;
...@@ -1281,7 +1882,16 @@ static void __init taint_real_pages(void) ...@@ -1281,7 +1882,16 @@ static void __init taint_real_pages(void)
goto do_next_page; goto do_next_page;
} }
} }
reserve_bootmem(old_start, PAGE_SIZE, BOOTMEM_DEFAULT);
prom_printf("mem_init: Lost memory in pavail\n");
prom_printf("mem_init: OLD start[%lx] size[%lx]\n",
pavail[i].phys_addr,
pavail[i].reg_size);
prom_printf("mem_init: NEW start[%lx] size[%lx]\n",
pavail_rescan[i].phys_addr,
pavail_rescan[i].reg_size);
prom_printf("mem_init: Cannot continue, aborting.\n");
prom_halt();
do_next_page: do_next_page:
old_start += PAGE_SIZE; old_start += PAGE_SIZE;
...@@ -1289,32 +1899,6 @@ static void __init taint_real_pages(void) ...@@ -1289,32 +1899,6 @@ static void __init taint_real_pages(void)
} }
} }
int __init page_in_phys_avail(unsigned long paddr)
{
int i;
paddr &= PAGE_MASK;
for (i = 0; i < pavail_rescan_ents; i++) {
unsigned long start, end;
start = pavail_rescan[i].phys_addr;
end = start + pavail_rescan[i].reg_size;
if (paddr >= start && paddr < end)
return 1;
}
if (paddr >= kern_base && paddr < (kern_base + kern_size))
return 1;
#ifdef CONFIG_BLK_DEV_INITRD
if (paddr >= __pa(initrd_start) &&
paddr < __pa(PAGE_ALIGN(initrd_end)))
return 1;
#endif
return 0;
}
void __init mem_init(void) void __init mem_init(void)
{ {
unsigned long codepages, datapages, initpages; unsigned long codepages, datapages, initpages;
...@@ -1337,14 +1921,26 @@ void __init mem_init(void) ...@@ -1337,14 +1921,26 @@ void __init mem_init(void)
addr += PAGE_SIZE; addr += PAGE_SIZE;
} }
taint_real_pages(); setup_valid_addr_bitmap_from_pavail();
high_memory = __va(last_valid_pfn << PAGE_SHIFT); high_memory = __va(last_valid_pfn << PAGE_SHIFT);
#ifdef CONFIG_NEED_MULTIPLE_NODES
for_each_online_node(i) {
if (NODE_DATA(i)->node_spanned_pages != 0) {
totalram_pages +=
free_all_bootmem_node(NODE_DATA(i));
}
}
#else
totalram_pages = free_all_bootmem();
#endif
/* We subtract one to account for the mem_map_zero page /* We subtract one to account for the mem_map_zero page
* allocated below. * allocated below.
*/ */
totalram_pages = num_physpages = free_all_bootmem() - 1; totalram_pages -= 1;
num_physpages = totalram_pages;
/* /*
* Set up the zero page, mark it reserved, so that page count * Set up the zero page, mark it reserved, so that page count
......
#ifndef _SPARC64_MMZONE_H
#define _SPARC64_MMZONE_H

#ifdef CONFIG_NEED_MULTIPLE_NODES

/* Per-node pglist_data pointers, indexed by node id. */
extern struct pglist_data *node_data[];

/* NODE_DATA() must remain an lvalue: the boot code assigns to it. */
#define NODE_DATA(nid)		(node_data[nid])
#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)

/* One past the last pfn spanned by the node.  This is derived from
 * node_start_pfn and node_spanned_pages, the two fields the boot code
 * fills in, because struct pglist_data has no node_end_pfn member.
 * Note that 'nid' is evaluated twice.
 */
#define node_end_pfn(nid)	\
	(NODE_DATA(nid)->node_start_pfn + NODE_DATA(nid)->node_spanned_pages)

/* cpu -> node id, and node id -> cpus on that node. */
extern int numa_cpu_lookup_table[];
extern cpumask_t numa_cpumask_lookup_table[];

#endif /* CONFIG_NEED_MULTIPLE_NODES */

#endif /* _SPARC64_MMZONE_H */
#ifndef _ASM_SPARC64_TOPOLOGY_H #ifndef _ASM_SPARC64_TOPOLOGY_H
#define _ASM_SPARC64_TOPOLOGY_H #define _ASM_SPARC64_TOPOLOGY_H
#ifdef CONFIG_NUMA
#include <asm/mmzone.h>
/* Return the node id stored for 'cpu' in numa_cpu_lookup_table[].
 * 'cpu' is used as an array index without any range check.
 */
static inline int cpu_to_node(int cpu)
{
return numa_cpu_lookup_table[cpu];
}
#define parent_node(node) (node)
/* Return, by value, the cpumask stored for 'node' in
 * numa_cpumask_lookup_table[].  'node' is used as an array index
 * without any range check.
 */
static inline cpumask_t node_to_cpumask(int node)
{
return numa_cpumask_lookup_table[node];
}
/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
#define node_to_cpumask_ptr(v, node) \
cpumask_t *v = &(numa_cpumask_lookup_table[node])
#define node_to_cpumask_ptr_next(v, node) \
v = &(numa_cpumask_lookup_table[node])
/* Return first_cpu() of the cpumask belonging to 'node'. */
static inline int node_to_first_cpu(int node)
{
	cpumask_t node_cpus = node_to_cpumask(node);

	return first_cpu(node_cpus);
}
struct pci_bus;
#ifdef CONFIG_PCI
extern int pcibus_to_node(struct pci_bus *pbus);
#else
/* Stub used when CONFIG_PCI is not set: always returns -1, which
 * pcibus_to_cpumask() maps to CPU_MASK_ALL.
 */
static inline int pcibus_to_node(struct pci_bus *pbus)
{
return -1;
}
#endif
#define pcibus_to_cpumask(bus) \
(pcibus_to_node(bus) == -1 ? \
CPU_MASK_ALL : \
node_to_cpumask(pcibus_to_node(bus)))
#define SD_NODE_INIT (struct sched_domain) { \
.min_interval = 8, \
.max_interval = 32, \
.busy_factor = 32, \
.imbalance_pct = 125, \
.cache_nice_tries = 2, \
.busy_idx = 3, \
.idle_idx = 2, \
.newidle_idx = 0, \
.wake_idx = 1, \
.forkexec_idx = 1, \
.flags = SD_LOAD_BALANCE \
| SD_BALANCE_FORK \
| SD_BALANCE_EXEC \
| SD_SERIALIZE \
| SD_WAKE_BALANCE, \
.last_balance = jiffies, \
.balance_interval = 1, \
}
#else /* CONFIG_NUMA */
#include <asm-generic/topology.h>
#endif /* !(CONFIG_NUMA) */
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
#define topology_physical_package_id(cpu) (cpu_data(cpu).proc_id) #define topology_physical_package_id(cpu) (cpu_data(cpu).proc_id)
#define topology_core_id(cpu) (cpu_data(cpu).core_id) #define topology_core_id(cpu) (cpu_data(cpu).core_id)
...@@ -10,8 +81,6 @@ ...@@ -10,8 +81,6 @@
#define smt_capable() (sparc64_multi_core) #define smt_capable() (sparc64_multi_core)
#endif /* CONFIG_SMP */ #endif /* CONFIG_SMP */
#include <asm-generic/topology.h>
#define cpu_coregroup_map(cpu) (cpu_core_map[cpu]) #define cpu_coregroup_map(cpu) (cpu_core_map[cpu])
#endif /* _ASM_SPARC64_TOPOLOGY_H */ #endif /* _ASM_SPARC64_TOPOLOGY_H */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment