Commit 0edcf8d6 authored by Ingo Molnar's avatar Ingo Molnar

Merge branch 'tj-percpu' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc into core/percpu

Conflicts:
	arch/x86/include/asm/pgtable.h
parents 87b20307 40150d37
......@@ -189,9 +189,21 @@ callback_init(void * kernel_end)
if (alpha_using_srm) {
static struct vm_struct console_remap_vm;
unsigned long vaddr = VMALLOC_START;
unsigned long nr_pages = 0;
unsigned long vaddr;
unsigned long i, j;
/* calculate needed size */
for (i = 0; i < crb->map_entries; ++i)
nr_pages += crb->map[i].count;
/* register the vm area */
console_remap_vm.flags = VM_ALLOC;
console_remap_vm.size = nr_pages << PAGE_SHIFT;
vm_area_register_early(&console_remap_vm, PAGE_SIZE);
vaddr = (unsigned long)consle_remap_vm.addr;
/* Set up the third level PTEs and update the virtual
addresses of the CRB entries. */
for (i = 0; i < crb->map_entries; ++i) {
......@@ -213,12 +225,6 @@ callback_init(void * kernel_end)
vaddr += PAGE_SIZE;
}
}
/* Let vmalloc know that we've allocated some space. */
console_remap_vm.flags = VM_ALLOC;
console_remap_vm.addr = (void *) VMALLOC_START;
console_remap_vm.size = vaddr - VMALLOC_START;
vmlist = &console_remap_vm;
}
callback_init_done = 1;
......
......@@ -181,7 +181,7 @@ source "kernel/Kconfig.preempt"
config QUICKLIST
def_bool y
config HAVE_ARCH_BOOTMEM_NODE
config HAVE_ARCH_BOOTMEM
def_bool n
config ARCH_HAVE_MEMORY_PRESENT
......
......@@ -135,6 +135,9 @@ config ARCH_HAS_CACHE_LINE_SIZE
config HAVE_SETUP_PER_CPU_AREA
def_bool y
config HAVE_DYNAMIC_PER_CPU_AREA
def_bool y
config HAVE_CPUMASK_OF_CPU_MAP
def_bool X86_64_SMP
......@@ -1122,7 +1125,7 @@ config NODES_SHIFT
Specify the maximum number of NUMA Nodes available on the target
system. Increases memory reserved to accomodate various tables.
config HAVE_ARCH_BOOTMEM_NODE
config HAVE_ARCH_BOOTMEM
def_bool y
depends on X86_32 && NUMA
......
......@@ -91,45 +91,12 @@ static inline int pfn_valid(int pfn)
#endif /* CONFIG_DISCONTIGMEM */
#ifdef CONFIG_NEED_MULTIPLE_NODES
/*
* Following are macros that are specific to this numa platform.
*/
#define reserve_bootmem(addr, size, flags) \
reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
#define alloc_bootmem(x) \
__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_nopanic(x) \
__alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
__pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_low(x) \
__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
#define alloc_bootmem_pages(x) \
__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_pages_nopanic(x) \
__alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
__pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_low_pages(x) \
__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
#define alloc_bootmem_node(pgdat, x) \
({ \
struct pglist_data __maybe_unused \
*__alloc_bootmem_node__pgdat = (pgdat); \
__alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
__pa(MAX_DMA_ADDRESS)); \
})
#define alloc_bootmem_pages_node(pgdat, x) \
({ \
struct pglist_data __maybe_unused \
*__alloc_bootmem_node__pgdat = (pgdat); \
__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \
__pa(MAX_DMA_ADDRESS)); \
})
#define alloc_bootmem_low_pages_node(pgdat, x) \
/* always use node 0 for bootmem on this numa platform */
#define alloc_bootmem_core(__bdata, size, align, goal, limit) \
({ \
struct pglist_data __maybe_unused \
*__alloc_bootmem_node__pgdat = (pgdat); \
__alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \
bootmem_data_t __maybe_unused * __abm_bdata_dummy = (__bdata); \
__alloc_bootmem_core(NODE_DATA(0)->bdata, \
(size), (align), (goal), (limit)); \
})
#endif /* CONFIG_NEED_MULTIPLE_NODES */
......
......@@ -43,6 +43,14 @@
#else /* ...!ASSEMBLY */
#include <linux/stringify.h>
#include <asm/sections.h>
#define __addr_to_pcpu_ptr(addr) \
(void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \
+ (unsigned long)__per_cpu_start)
#define __pcpu_ptr_to_addr(ptr) \
(void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \
- (unsigned long)__per_cpu_start)
#ifdef CONFIG_SMP
#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x
......
......@@ -288,6 +288,8 @@ static inline int is_new_memtype_allowed(unsigned long flags,
return 1;
}
pmd_t *populate_extra_pmd(unsigned long vaddr);
pte_t *populate_extra_pte(unsigned long vaddr);
#endif /* __ASSEMBLY__ */
#ifdef CONFIG_X86_32
......
......@@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
if (!data)
return -ENOMEM;
data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
per_cpu(drv_data, cpu) = data;
if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
......
......@@ -16,6 +16,7 @@
#include <linux/cpu.h>
#include <linux/delay.h>
#include <linux/uaccess.h>
#include <linux/percpu.h>
#include <asm/apic.h>
......@@ -55,13 +56,13 @@ static inline void print_stack_overflow(void) { }
union irq_ctx {
struct thread_info tinfo;
u32 stack[THREAD_SIZE/sizeof(u32)];
};
} __attribute__((aligned(PAGE_SIZE)));
static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
static void call_on_stack(void *func, void *stack)
{
......@@ -81,7 +82,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
u32 *isp, arg1, arg2;
curctx = (union irq_ctx *) current_thread_info();
irqctx = hardirq_ctx[smp_processor_id()];
irqctx = __get_cpu_var(hardirq_ctx);
/*
* this is where we switch to the IRQ stack. However, if we are
......@@ -125,34 +126,34 @@ void __cpuinit irq_ctx_init(int cpu)
{
union irq_ctx *irqctx;
if (hardirq_ctx[cpu])
if (per_cpu(hardirq_ctx, cpu))
return;
irqctx = (union irq_ctx*) &hardirq_stack[cpu*THREAD_SIZE];
irqctx = &per_cpu(hardirq_stack, cpu);
irqctx->tinfo.task = NULL;
irqctx->tinfo.exec_domain = NULL;
irqctx->tinfo.cpu = cpu;
irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
hardirq_ctx[cpu] = irqctx;
per_cpu(hardirq_ctx, cpu) = irqctx;
irqctx = (union irq_ctx *) &softirq_stack[cpu*THREAD_SIZE];
irqctx = &per_cpu(softirq_stack, cpu);
irqctx->tinfo.task = NULL;
irqctx->tinfo.exec_domain = NULL;
irqctx->tinfo.cpu = cpu;
irqctx->tinfo.preempt_count = 0;
irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
softirq_ctx[cpu] = irqctx;
per_cpu(softirq_ctx, cpu) = irqctx;
printk(KERN_DEBUG "CPU %u irqstacks, hard=%p soft=%p\n",
cpu, hardirq_ctx[cpu], softirq_ctx[cpu]);
cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
}
void irq_ctx_exit(int cpu)
{
hardirq_ctx[cpu] = NULL;
per_cpu(hardirq_ctx, cpu) = NULL;
}
asmlinkage void do_softirq(void)
......@@ -169,7 +170,7 @@ asmlinkage void do_softirq(void)
if (local_softirq_pending()) {
curctx = current_thread_info();
irqctx = softirq_ctx[smp_processor_id()];
irqctx = __get_cpu_var(softirq_ctx);
irqctx->tinfo.task = curctx->task;
irqctx->tinfo.previous_esp = current_stack_pointer;
......
......@@ -7,6 +7,7 @@
#include <linux/crash_dump.h>
#include <linux/smp.h>
#include <linux/topology.h>
#include <linux/pfn.h>
#include <asm/sections.h>
#include <asm/processor.h>
#include <asm/setup.h>
......@@ -41,6 +42,321 @@ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = {
};
EXPORT_SYMBOL(__per_cpu_offset);
/**
* pcpu_need_numa - determine percpu allocation needs to consider NUMA
*
* If NUMA is not configured or there is only one NUMA node available,
* there is no reason to consider NUMA. This function determines
* whether percpu allocation should consider NUMA or not.
*
* RETURNS:
* true if NUMA should be considered; otherwise, false.
*/
static bool __init pcpu_need_numa(void)
{
#ifdef CONFIG_NEED_MULTIPLE_NODES
pg_data_t *last = NULL;
unsigned int cpu;
for_each_possible_cpu(cpu) {
int node = early_cpu_to_node(cpu);
if (node_online(node) && NODE_DATA(node) &&
last && last != NODE_DATA(node))
return true;
last = NODE_DATA(node);
}
#endif
return false;
}
/**
* pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu
* @cpu: cpu to allocate for
* @size: size allocation in bytes
* @align: alignment
*
* Allocate @size bytes aligned at @align for cpu @cpu. This wrapper
* does the right thing for NUMA regardless of the current
* configuration.
*
* RETURNS:
* Pointer to the allocated area on success, NULL on failure.
*/
static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
unsigned long align)
{
const unsigned long goal = __pa(MAX_DMA_ADDRESS);
#ifdef CONFIG_NEED_MULTIPLE_NODES
int node = early_cpu_to_node(cpu);
void *ptr;
if (!node_online(node) || !NODE_DATA(node)) {
ptr = __alloc_bootmem_nopanic(size, align, goal);
pr_info("cpu %d has no node %d or node-local memory\n",
cpu, node);
pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n",
cpu, size, __pa(ptr));
} else {
ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
size, align, goal);
pr_debug("per cpu data for cpu%d %lu bytes on node%d at "
"%016lx\n", cpu, size, node, __pa(ptr));
}
return ptr;
#else
return __alloc_bootmem_nopanic(size, align, goal);
#endif
}
/*
* Remap allocator
*
* This allocator uses PMD page as unit. A PMD page is allocated for
* each cpu and each is remapped into vmalloc area using PMD mapping.
* As PMD page is quite large, only part of it is used for the first
* chunk. Unused part is returned to the bootmem allocator.
*
* So, the PMD pages are mapped twice - once to the physical mapping
* and to the vmalloc area for the first percpu chunk. The double
* mapping does add one more PMD TLB entry pressure but still is much
* better than only using 4k mappings while still being NUMA friendly.
*/
#ifdef CONFIG_NEED_MULTIPLE_NODES
static size_t pcpur_size __initdata;
static void **pcpur_ptrs __initdata;
static struct page * __init pcpur_get_page(unsigned int cpu, int pageno)
{
size_t off = (size_t)pageno << PAGE_SHIFT;
if (off >= pcpur_size)
return NULL;
return virt_to_page(pcpur_ptrs[cpu] + off);
}
static ssize_t __init setup_pcpu_remap(size_t static_size)
{
static struct vm_struct vm;
pg_data_t *last;
size_t ptrs_size;
unsigned int cpu;
ssize_t ret;
/*
* If large page isn't supported, there's no benefit in doing
* this. Also, on non-NUMA, embedding is better.
*/
if (!cpu_has_pse || pcpu_need_numa())
return -EINVAL;
last = NULL;
for_each_possible_cpu(cpu) {
int node = early_cpu_to_node(cpu);
if (node_online(node) && NODE_DATA(node) &&
last && last != NODE_DATA(node))
goto proceed;
last = NODE_DATA(node);
}
return -EINVAL;
proceed:
/*
* Currently supports only single page. Supporting multiple
* pages won't be too difficult if it ever becomes necessary.
*/
pcpur_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
if (pcpur_size > PMD_SIZE) {
pr_warning("PERCPU: static data is larger than large page, "
"can't use large page\n");
return -EINVAL;
}
/* allocate pointer array and alloc large pages */
ptrs_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpur_ptrs[0]));
pcpur_ptrs = alloc_bootmem(ptrs_size);
for_each_possible_cpu(cpu) {
pcpur_ptrs[cpu] = pcpu_alloc_bootmem(cpu, PMD_SIZE, PMD_SIZE);
if (!pcpur_ptrs[cpu])
goto enomem;
/*
* Only use pcpur_size bytes and give back the rest.
*
* Ingo: The 2MB up-rounding bootmem is needed to make
* sure the partial 2MB page is still fully RAM - it's
* not well-specified to have a PAT-incompatible area
* (unmapped RAM, device memory, etc.) in that hole.
*/
free_bootmem(__pa(pcpur_ptrs[cpu] + pcpur_size),
PMD_SIZE - pcpur_size);
memcpy(pcpur_ptrs[cpu], __per_cpu_load, static_size);
}
/* allocate address and map */
vm.flags = VM_ALLOC;
vm.size = num_possible_cpus() * PMD_SIZE;
vm_area_register_early(&vm, PMD_SIZE);
for_each_possible_cpu(cpu) {
pmd_t *pmd;
pmd = populate_extra_pmd((unsigned long)vm.addr
+ cpu * PMD_SIZE);
set_pmd(pmd, pfn_pmd(page_to_pfn(virt_to_page(pcpur_ptrs[cpu])),
PAGE_KERNEL_LARGE));
}
/* we're ready, commit */
pr_info("PERCPU: Remapped at %p with large pages, static data "
"%zu bytes\n", vm.addr, static_size);
ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE,
pcpur_size - static_size, vm.addr, NULL);
goto out_free_ar;
enomem:
for_each_possible_cpu(cpu)
if (pcpur_ptrs[cpu])
free_bootmem(__pa(pcpur_ptrs[cpu]), PMD_SIZE);
ret = -ENOMEM;
out_free_ar:
free_bootmem(__pa(pcpur_ptrs), ptrs_size);
return ret;
}
#else
static ssize_t __init setup_pcpu_remap(size_t static_size)
{
return -EINVAL;
}
#endif
/*
* Embedding allocator
*
* The first chunk is sized to just contain the static area plus
* PERCPU_DYNAMIC_RESERVE and allocated as a contiguous area using
* bootmem allocator and used as-is without being mapped into vmalloc
* area. This enables the first chunk to piggy back on the linear
* physical PMD mapping and doesn't add any additional pressure to
* TLB.
*/
static void *pcpue_ptr __initdata;
static size_t pcpue_unit_size __initdata;
static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
{
return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size
+ ((size_t)pageno << PAGE_SHIFT));
}
static ssize_t __init setup_pcpu_embed(size_t static_size)
{
unsigned int cpu;
/*
* If large page isn't supported, there's no benefit in doing
* this. Also, embedding allocation doesn't play well with
* NUMA.
*/
if (!cpu_has_pse || pcpu_need_numa())
return -EINVAL;
/* allocate and copy */
pcpue_unit_size = PFN_ALIGN(static_size + PERCPU_DYNAMIC_RESERVE);
pcpue_unit_size = max(pcpue_unit_size, PCPU_MIN_UNIT_SIZE);
pcpue_ptr = pcpu_alloc_bootmem(0, num_possible_cpus() * pcpue_unit_size,
PAGE_SIZE);
if (!pcpue_ptr)
return -ENOMEM;
for_each_possible_cpu(cpu)
memcpy(pcpue_ptr + cpu * pcpue_unit_size, __per_cpu_load,
static_size);
/* we're ready, commit */
pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
pcpue_unit_size >> PAGE_SHIFT, pcpue_ptr, static_size);
return pcpu_setup_first_chunk(pcpue_get_page, static_size,
pcpue_unit_size,
pcpue_unit_size - static_size, pcpue_ptr,
NULL);
}
/*
* 4k page allocator
*
* This is the basic allocator. Static percpu area is allocated
* page-by-page and most of initialization is done by the generic
* setup function.
*/
static struct page **pcpu4k_pages __initdata;
static int pcpu4k_nr_static_pages __initdata;
static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
{
if (pageno < pcpu4k_nr_static_pages)
return pcpu4k_pages[cpu * pcpu4k_nr_static_pages + pageno];
return NULL;
}
static void __init pcpu4k_populate_pte(unsigned long addr)
{
populate_extra_pte(addr);
}
static ssize_t __init setup_pcpu_4k(size_t static_size)
{
size_t pages_size;
unsigned int cpu;
int i, j;
ssize_t ret;
pcpu4k_nr_static_pages = PFN_UP(static_size);
/* unaligned allocations can't be freed, round up to page size */
pages_size = PFN_ALIGN(pcpu4k_nr_static_pages * num_possible_cpus()
* sizeof(pcpu4k_pages[0]));
pcpu4k_pages = alloc_bootmem(pages_size);
/* allocate and copy */
j = 0;
for_each_possible_cpu(cpu)
for (i = 0; i < pcpu4k_nr_static_pages; i++) {
void *ptr;
ptr = pcpu_alloc_bootmem(cpu, PAGE_SIZE, PAGE_SIZE);
if (!ptr)
goto enomem;
memcpy(ptr, __per_cpu_load + i * PAGE_SIZE, PAGE_SIZE);
pcpu4k_pages[j++] = virt_to_page(ptr);
}
/* we're ready, commit */
pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
pcpu4k_nr_static_pages, static_size);
ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, 0, NULL,
pcpu4k_populate_pte);
goto out_free_ar;
enomem:
while (--j >= 0)
free_bootmem(__pa(page_address(pcpu4k_pages[j])), PAGE_SIZE);
ret = -ENOMEM;
out_free_ar:
free_bootmem(__pa(pcpu4k_pages), pages_size);
return ret;
}
static inline void setup_percpu_segment(int cpu)
{
#ifdef CONFIG_X86_32
......@@ -61,38 +377,35 @@ static inline void setup_percpu_segment(int cpu)
*/
void __init setup_per_cpu_areas(void)
{
ssize_t size;
char *ptr;
int cpu;
/* Copy section for each CPU (we discard the original) */
size = roundup(PERCPU_ENOUGH_ROOM, PAGE_SIZE);
size_t static_size = __per_cpu_end - __per_cpu_start;
unsigned int cpu;
unsigned long delta;
size_t pcpu_unit_size;
ssize_t ret;
pr_info("NR_CPUS:%d nr_cpumask_bits:%d nr_cpu_ids:%d nr_node_ids:%d\n",
NR_CPUS, nr_cpumask_bits, nr_cpu_ids, nr_node_ids);
pr_info("PERCPU: Allocating %zd bytes of per cpu data\n", size);
/*
* Allocate percpu area. If PSE is supported, try to make use
* of large page mappings. Please read comments on top of
* each allocator for details.
*/
ret = setup_pcpu_remap(static_size);
if (ret < 0)
ret = setup_pcpu_embed(static_size);
if (ret < 0)
ret = setup_pcpu_4k(static_size);
if (ret < 0)
panic("cannot allocate static percpu area (%zu bytes, err=%zd)",
static_size, ret);
for_each_possible_cpu(cpu) {
#ifndef CONFIG_NEED_MULTIPLE_NODES
ptr = alloc_bootmem_pages(size);
#else
int node = early_cpu_to_node(cpu);
if (!node_online(node) || !NODE_DATA(node)) {
ptr = alloc_bootmem_pages(size);
pr_info("cpu %d has no node %d or node-local memory\n",
cpu, node);
pr_debug("per cpu data for cpu%d at %016lx\n",
cpu, __pa(ptr));
} else {
ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
pr_debug("per cpu data for cpu%d on node%d at %016lx\n",
cpu, node, __pa(ptr));
}
#endif
pcpu_unit_size = ret;
memcpy(ptr, __per_cpu_load, __per_cpu_end - __per_cpu_start);
per_cpu_offset(cpu) = ptr - __per_cpu_start;
/* alrighty, percpu areas up and running */
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
for_each_possible_cpu(cpu) {
per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
per_cpu(cpu_number, cpu) = cpu;
setup_percpu_segment(cpu);
......
......@@ -137,6 +137,23 @@ static pte_t * __init one_page_table_init(pmd_t *pmd)
return pte_offset_kernel(pmd, 0);
}
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
int pgd_idx = pgd_index(vaddr);
int pmd_idx = pmd_index(vaddr);
return one_md_table_init(swapper_pg_dir + pgd_idx) + pmd_idx;
}
pte_t * __init populate_extra_pte(unsigned long vaddr)
{
int pte_idx = pte_index(vaddr);
pmd_t *pmd;
pmd = populate_extra_pmd(vaddr);
return one_page_table_init(pmd) + pte_idx;
}
static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
unsigned long vaddr, pte_t *lastpte)
{
......
......@@ -168,34 +168,51 @@ static __ref void *spp_getpage(void)
return ptr;
}
void
set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
static pud_t * __init fill_pud(pgd_t *pgd, unsigned long vaddr)
{
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
if (pgd_none(*pgd)) {
pud_t *pud = (pud_t *)spp_getpage();
pgd_populate(&init_mm, pgd, pud);
if (pud != pud_offset(pgd, 0))
printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
pud, pud_offset(pgd, 0));
}
return pud_offset(pgd, vaddr);
}
pud = pud_page + pud_index(vaddr);
static pmd_t * __init fill_pmd(pud_t *pud, unsigned long vaddr)
{
if (pud_none(*pud)) {
pmd = (pmd_t *) spp_getpage();
pmd_t *pmd = (pmd_t *) spp_getpage();
pud_populate(&init_mm, pud, pmd);
if (pmd != pmd_offset(pud, 0)) {
if (pmd != pmd_offset(pud, 0))
printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
pmd, pmd_offset(pud, 0));
return;
}
pmd, pmd_offset(pud, 0));
}
pmd = pmd_offset(pud, vaddr);
return pmd_offset(pud, vaddr);
}
static pte_t * __init fill_pte(pmd_t *pmd, unsigned long vaddr)
{
if (pmd_none(*pmd)) {
pte = (pte_t *) spp_getpage();
pte_t *pte = (pte_t *) spp_getpage();
pmd_populate_kernel(&init_mm, pmd, pte);
if (pte != pte_offset_kernel(pmd, 0)) {
if (pte != pte_offset_kernel(pmd, 0))
printk(KERN_ERR "PAGETABLE BUG #02!\n");
return;
}
}
return pte_offset_kernel(pmd, vaddr);
}
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
pud = pud_page + pud_index(vaddr);
pmd = fill_pmd(pud, vaddr);
pte = fill_pte(pmd, vaddr);
pte = pte_offset_kernel(pmd, vaddr);
set_pte(pte, new_pte);
/*
......@@ -205,8 +222,7 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
__flush_tlb_one(vaddr);
}
void
set_pte_vaddr(unsigned long vaddr, pte_t pteval)
void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
pgd_t *pgd;
pud_t *pud_page;
......@@ -223,6 +239,24 @@ set_pte_vaddr(unsigned long vaddr, pte_t pteval)
set_pte_vaddr_pud(pud_page, vaddr, pteval);
}
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
pgd_t *pgd;
pud_t *pud;
pgd = pgd_offset_k(vaddr);
pud = fill_pud(pgd, vaddr);
return fill_pmd(pud, vaddr);
}
pte_t * __init populate_extra_pte(unsigned long vaddr)
{
pmd_t *pmd;
pmd = populate_extra_pmd(vaddr);
return fill_pte(pmd, vaddr);
}
/*
* Create large page table mappings for a range of physical addresses.
*/
......
......@@ -363,7 +363,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
if (!bt->sequence)
goto err;
bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG);
bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
if (!bt->msg_data)
goto err;
......
......@@ -516,12 +516,12 @@ int acpi_processor_preregister_performance(
continue;
}
if (!performance || !percpu_ptr(performance, i)) {
if (!performance || !per_cpu_ptr(performance, i)) {
retval = -EINVAL;
continue;
}
pr->performance = percpu_ptr(performance, i);
pr->performance = per_cpu_ptr(performance, i);
cpumask_set_cpu(i, pr->performance->shared_cpu_map);
if (acpi_processor_get_psd(pr)) {
retval = -EINVAL;
......
......@@ -65,23 +65,20 @@ extern void free_bootmem(unsigned long addr, unsigned long size);
#define BOOTMEM_DEFAULT 0
#define BOOTMEM_EXCLUSIVE (1<<0)
extern int reserve_bootmem(unsigned long addr,
unsigned long size,
int flags);
extern int reserve_bootmem_node(pg_data_t *pgdat,
unsigned long physaddr,
unsigned long size,
int flags);
#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
extern int reserve_bootmem(unsigned long addr, unsigned long size, int flags);
#endif
unsigned long physaddr,
unsigned long size,
int flags);
extern void *__alloc_bootmem_nopanic(unsigned long size,
extern void *__alloc_bootmem(unsigned long size,
unsigned long align,
unsigned long goal);
extern void *__alloc_bootmem(unsigned long size,
extern void *__alloc_bootmem_nopanic(unsigned long size,
unsigned long align,
unsigned long goal);
extern void *__alloc_bootmem_low(unsigned long size,
unsigned long align,
unsigned long goal);
extern void *__alloc_bootmem_node(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
......@@ -90,30 +87,35 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal);
extern void *__alloc_bootmem_low(unsigned long size,
unsigned long align,
unsigned long goal);
extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
unsigned long size,
unsigned long align,
unsigned long goal);
#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
#define alloc_bootmem(x) \
__alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_nopanic(x) \
__alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_low(x) \
__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
#define alloc_bootmem_pages(x) \
__alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_pages_nopanic(x) \
__alloc_bootmem_nopanic(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_low_pages(x) \
__alloc_bootmem_low(x, PAGE_SIZE, 0)
#define alloc_bootmem_node(pgdat, x) \
__alloc_bootmem_node(pgdat, x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_pages_node(pgdat, x) \
__alloc_bootmem_node(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_pages_node_nopanic(pgdat, x) \
__alloc_bootmem_node_nopanic(pgdat, x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
#define alloc_bootmem_low(x) \
__alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
#define alloc_bootmem_low_pages(x) \
__alloc_bootmem_low(x, PAGE_SIZE, 0)
#define alloc_bootmem_low_pages_node(pgdat, x) \
__alloc_bootmem_low_node(pgdat, x, PAGE_SIZE, 0)
#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
extern int reserve_bootmem_generic(unsigned long addr, unsigned long size,
int flags);
......
......@@ -76,52 +76,98 @@
#ifdef CONFIG_SMP
#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
/* minimum unit size, also is the maximum supported allocation size */
#define PCPU_MIN_UNIT_SIZE (16UL << PAGE_SHIFT)
/*
* PERCPU_DYNAMIC_RESERVE indicates the amount of free area to piggy
* back on the first chunk if arch is manually allocating and mapping
* it for faster access (as a part of large page mapping for example).
* Note that dynamic percpu allocator covers both static and dynamic
* areas, so these values are bigger than PERCPU_MODULE_RESERVE.
*
* On typical configuration with modules, the following values leave
* about 8k of free space on the first chunk after boot on both x86_32
* and 64 when module support is enabled. When module support is
* disabled, it's much tighter.
*/
#ifndef PERCPU_DYNAMIC_RESERVE
# if BITS_PER_LONG > 32
# ifdef CONFIG_MODULES
# define PERCPU_DYNAMIC_RESERVE (6 << PAGE_SHIFT)
# else
# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT)
# endif
# else
# ifdef CONFIG_MODULES
# define PERCPU_DYNAMIC_RESERVE (4 << PAGE_SHIFT)
# else
# define PERCPU_DYNAMIC_RESERVE (2 << PAGE_SHIFT)
# endif
# endif
#endif /* PERCPU_DYNAMIC_RESERVE */
extern void *pcpu_base_addr;
typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
size_t static_size, size_t unit_size,
size_t free_size, void *base_addr,
pcpu_populate_pte_fn_t populate_pte_fn);
/*
* Use this to get to a cpu's version of the per-cpu object
* dynamically allocated. Non-atomic access to the current CPU's
* version should probably be combined with get_cpu()/put_cpu().
*/
#define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
struct percpu_data {
void *ptrs[1];
};
#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
/*
* Use this to get to a cpu's version of the per-cpu object dynamically
* allocated. Non-atomic access to the current CPU's version should
* probably be combined with get_cpu()/put_cpu().
*/
#define percpu_ptr(ptr, cpu) \
({ \
struct percpu_data *__p = __percpu_disguise(ptr); \
(__typeof__(ptr))__p->ptrs[(cpu)]; \
#define per_cpu_ptr(ptr, cpu) \
({ \
struct percpu_data *__p = __percpu_disguise(ptr); \
(__typeof__(ptr))__p->ptrs[(cpu)]; \
})
extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask);
extern void percpu_free(void *__pdata);
#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
extern void *__alloc_percpu(size_t size, size_t align);
extern void free_percpu(void *__pdata);
#else /* CONFIG_SMP */
#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
static inline void *__alloc_percpu(size_t size, size_t align)
{
/*
* Can't easily make larger alignment work with kmalloc. WARN
* on it. Larger alignment should only be used for module
* percpu sections on SMP for which this path isn't used.
*/
WARN_ON_ONCE(align > __alignof__(unsigned long long));
return kzalloc(size, gfp);
}
static inline void percpu_free(void *__pdata)
static inline void free_percpu(void *p)
{
kfree(__pdata);
kfree(p);
}
#endif /* CONFIG_SMP */
#define percpu_alloc_mask(size, gfp, mask) \
__percpu_alloc_mask((size), (gfp), &(mask))
#define percpu_alloc(size, gfp) percpu_alloc_mask((size), (gfp), cpu_online_map)
/* (legacy) interface for use without CPU hotplug handling */
#define __alloc_percpu(size) percpu_alloc_mask((size), GFP_KERNEL, \
cpu_possible_map)
#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type))
#define free_percpu(ptr) percpu_free((ptr))
#define per_cpu_ptr(ptr, cpu) percpu_ptr((ptr), (cpu))
#define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type), \
__alignof__(type))
#endif /* __LINUX_PERCPU_H */
......@@ -95,6 +95,9 @@ extern struct vm_struct *remove_vm_area(const void *addr);
extern int map_vm_area(struct vm_struct *area, pgprot_t prot,
struct page ***pages);
extern int map_kernel_range_noflush(unsigned long start, unsigned long size,
pgprot_t prot, struct page **pages);
extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size);
extern void unmap_kernel_range(unsigned long addr, unsigned long size);
/* Allocate/destroy a 'vmalloc' VM area. */
......@@ -110,5 +113,6 @@ extern long vwrite(char *buf, char *addr, unsigned long count);
*/
extern rwlock_t vmlist_lock;
extern struct vm_struct *vmlist;
extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
#endif /* _LINUX_VMALLOC_H */
......@@ -51,6 +51,7 @@
#include <linux/tracepoint.h>
#include <linux/ftrace.h>
#include <linux/async.h>
#include <linux/percpu.h>
#if 0
#define DEBUGP printk
......@@ -366,6 +367,34 @@ static struct module *find_module(const char *name)
}
#ifdef CONFIG_SMP
#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
static void *percpu_modalloc(unsigned long size, unsigned long align,
const char *name)
{
void *ptr;
if (align > PAGE_SIZE) {
printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
name, align, PAGE_SIZE);
align = PAGE_SIZE;
}
ptr = __alloc_percpu(size, align);
if (!ptr)
printk(KERN_WARNING
"Could not allocate %lu bytes percpu data\n", size);
return ptr;
}
static void percpu_modfree(void *freeme)
{
free_percpu(freeme);
}
#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
/* Number of blocks used and allocated. */
static unsigned int pcpu_num_used, pcpu_num_allocated;
/* Size of each block. -ve means used. */
......@@ -480,21 +509,6 @@ static void percpu_modfree(void *freeme)
}
}
static unsigned int find_pcpusec(Elf_Ehdr *hdr,
Elf_Shdr *sechdrs,
const char *secstrings)
{
return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
}
static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
{
int cpu;
for_each_possible_cpu(cpu)
memcpy(pcpudest + per_cpu_offset(cpu), from, size);
}
static int percpu_modinit(void)
{
pcpu_num_used = 2;
......@@ -513,7 +527,26 @@ static int percpu_modinit(void)
return 0;
}
__initcall(percpu_modinit);
#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
static unsigned int find_pcpusec(Elf_Ehdr *hdr,
Elf_Shdr *sechdrs,
const char *secstrings)
{
return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
}
static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
{
int cpu;
for_each_possible_cpu(cpu)
memcpy(pcpudest + per_cpu_offset(cpu), from, size);
}
#else /* ... !CONFIG_SMP */
static inline void *percpu_modalloc(unsigned long size, unsigned long align,
const char *name)
{
......@@ -535,6 +568,7 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
/* pcpusec should be 0, and size of that section should be 0. */
BUG_ON(size != 0);
}
#endif /* CONFIG_SMP */
#define MODINFO_ATTR(field) \
......
......@@ -9476,7 +9476,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
{
u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
u64 data;
#ifndef CONFIG_64BIT
......@@ -9495,7 +9495,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
{
u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
#ifndef CONFIG_64BIT
/*
......@@ -9591,7 +9591,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
ca = task_ca(tsk);
for (; ca; ca = ca->parent) {
u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
*cpuusage += cputime;
}
}
......
......@@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
* doesn't hit this CPU until we're ready. */
get_cpu();
for_each_online_cpu(i) {
sm_work = percpu_ptr(stop_machine_work, i);
sm_work = per_cpu_ptr(stop_machine_work, i);
INIT_WORK(sm_work, stop_cpu);
queue_work_on(i, stop_machine_wq, sm_work);
}
......
......@@ -30,6 +30,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
obj-$(CONFIG_SMP) += percpu.o
else
obj-$(CONFIG_SMP) += allocpercpu.o
endif
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
......@@ -99,45 +99,51 @@ static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp,
__percpu_populate_mask((__pdata), (size), (gfp), &(mask))
/**
* percpu_alloc_mask - initial setup of per-cpu data
* alloc_percpu - initial setup of per-cpu data
* @size: size of per-cpu object
* @gfp: may sleep or not etc.
* @mask: populate per-data for cpu's selected through mask bits
* @align: alignment
*
* Populating per-cpu data for all online cpu's would be a typical use case,
* which is simplified by the percpu_alloc() wrapper.
* Per-cpu objects are populated with zeroed buffers.
* Allocate dynamic percpu area. Percpu objects are populated with
* zeroed buffers.
*/
void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
void *__alloc_percpu(size_t size, size_t align)
{
/*
* We allocate whole cache lines to avoid false sharing
*/
size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size());
void *pdata = kzalloc(sz, gfp);
void *pdata = kzalloc(sz, GFP_KERNEL);
void *__pdata = __percpu_disguise(pdata);
/*
* Can't easily make larger alignment work with kmalloc. WARN
* on it. Larger alignment should only be used for module
* percpu sections on SMP for which this path isn't used.
*/
WARN_ON_ONCE(align > __alignof__(unsigned long long));
if (unlikely(!pdata))
return NULL;
if (likely(!__percpu_populate_mask(__pdata, size, gfp, mask)))
if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL,
&cpu_possible_map)))
return __pdata;
kfree(pdata);
return NULL;
}
EXPORT_SYMBOL_GPL(__percpu_alloc_mask);
EXPORT_SYMBOL_GPL(__alloc_percpu);
/**
* percpu_free - final cleanup of per-cpu data
* free_percpu - final cleanup of per-cpu data
* @__pdata: object to clean up
*
* We simply clean up any per-cpu object left. No need for the client to
* track and specify through a bis mask which per-cpu objects are to free.
*/
void percpu_free(void *__pdata)
void free_percpu(void *__pdata)
{
if (unlikely(!__pdata))
return;
__percpu_depopulate_mask(__pdata, &cpu_possible_map);
kfree(__percpu_disguise(__pdata));
}
EXPORT_SYMBOL_GPL(percpu_free);
EXPORT_SYMBOL_GPL(free_percpu);
......@@ -37,6 +37,16 @@ static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
static int bootmem_debug;
/*
* If an arch needs to apply workarounds to bootmem allocation, it can
* set CONFIG_HAVE_ARCH_BOOTMEM and define a wrapper around
* __alloc_bootmem_core().
*/
#ifndef CONFIG_HAVE_ARCH_BOOTMEM
#define alloc_bootmem_core(bdata, size, align, goal, limit) \
__alloc_bootmem_core((bdata), (size), (align), (goal), (limit))
#endif
static int __init bootmem_debug_setup(char *buf)
{
bootmem_debug = 1;
......@@ -382,7 +392,6 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
}
#ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
/**
* reserve_bootmem - mark a page range as usable
* @addr: starting address of the range
......@@ -403,7 +412,6 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
return mark_bootmem(start, end, 1, flags);
}
#endif /* !CONFIG_HAVE_ARCH_BOOTMEM_NODE */
static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx,
unsigned long step)
......@@ -428,7 +436,7 @@ static unsigned long align_off(struct bootmem_data *bdata, unsigned long off,
return ALIGN(base + off, align) - base;
}
static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
static void * __init __alloc_bootmem_core(struct bootmem_data *bdata,
unsigned long size, unsigned long align,
unsigned long goal, unsigned long limit)
{
......
/*
* linux/mm/percpu.c - percpu memory allocator
*
* Copyright (C) 2009 SUSE Linux Products GmbH
* Copyright (C) 2009 Tejun Heo <tj@kernel.org>
*
* This file is released under the GPLv2.
*
* This is percpu allocator which can handle both static and dynamic
* areas. Percpu areas are allocated in chunks in vmalloc area. Each
* chunk is consisted of num_possible_cpus() units and the first chunk
* is used for static percpu variables in the kernel image (special
* boot time alloc/init handling necessary as these areas need to be
* brought up before allocation services are running). Unit grows as
* necessary and all units grow or shrink in unison. When a chunk is
* filled up, another chunk is allocated. ie. in vmalloc area
*
* c0 c1 c2
* ------------------- ------------------- ------------
* | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u
* ------------------- ...... ------------------- .... ------------
*
* Allocation is done in offset-size areas of single unit space. Ie,
* an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
* c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring
* percpu base registers UNIT_SIZE apart.
*
* There are usually many small percpu allocations many of them as
* small as 4 bytes. The allocator organizes chunks into lists
* according to free size and tries to allocate from the fullest one.
* Each chunk keeps the maximum contiguous area size hint which is
* guaranteed to be eqaul to or larger than the maximum contiguous
* area in the chunk. This helps the allocator not to iterate the
* chunk maps unnecessarily.
*
* Allocation state in each chunk is kept using an array of integers
* on chunk->map. A positive value in the map represents a free
* region and negative allocated. Allocation inside a chunk is done
* by scanning this map sequentially and serving the first matching
* entry. This is mostly copied from the percpu_modalloc() allocator.
* Chunks are also linked into a rb tree to ease address to chunk
* mapping during free.
*
* To use this allocator, arch code should do the followings.
*
* - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
*
* - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
* regular address to percpu pointer and back
*
* - use pcpu_setup_first_chunk() during percpu area initialization to
* setup the first chunk containing the kernel static percpu area
*/
#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
struct pcpu_chunk {
struct list_head list; /* linked to pcpu_slot lists */
struct rb_node rb_node; /* key is chunk->vm->addr */
int free_size; /* free bytes in the chunk */
int contig_hint; /* max contiguous size hint */
struct vm_struct *vm; /* mapped vmalloc region */
int map_used; /* # of map entries used */
int map_alloc; /* # of map entries allocated */
int *map; /* allocation map */
bool immutable; /* no [de]population allowed */
struct page *page[]; /* #cpus * UNIT_PAGES */
};
static int pcpu_unit_pages __read_mostly;
static int pcpu_unit_size __read_mostly;
static int pcpu_chunk_size __read_mostly;
static int pcpu_nr_slots __read_mostly;
static size_t pcpu_chunk_struct_size __read_mostly;
/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __read_mostly;
EXPORT_SYMBOL_GPL(pcpu_base_addr);
/* the size of kernel static area */
static int pcpu_static_size __read_mostly;
/*
* One mutex to rule them all.
*
* The following mutex is grabbed in the outermost public alloc/free
* interface functions and released only when the operation is
* complete. As such, every function in this file other than the
* outermost functions are called under pcpu_mutex.
*
* It can easily be switched to use spinlock such that only the area
* allocation and page population commit are protected with it doing
* actual [de]allocation without holding any lock. However, given
* what this allocator does, I think it's better to let them run
* sequentially.
*/
static DEFINE_MUTEX(pcpu_mutex);
static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
static int __pcpu_size_to_slot(int size)
{
int highbit = fls(size); /* size is in bytes */
return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}
static int pcpu_size_to_slot(int size)
{
if (size == pcpu_unit_size)
return pcpu_nr_slots - 1;
return __pcpu_size_to_slot(size);
}
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
return 0;
return pcpu_size_to_slot(chunk->free_size);
}
static int pcpu_page_idx(unsigned int cpu, int page_idx)
{
return cpu * pcpu_unit_pages + page_idx;
}
static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx)
{
return &chunk->page[pcpu_page_idx(cpu, page_idx)];
}
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx)
{
return (unsigned long)chunk->vm->addr +
(pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
}
static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
int page_idx)
{
return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
}
/**
* pcpu_realloc - versatile realloc
* @p: the current pointer (can be NULL for new allocations)
* @size: the current size in bytes (can be 0 for new allocations)
* @new_size: the wanted new size in bytes (can be 0 for free)
*
* More robust realloc which can be used to allocate, resize or free a
* memory area of arbitrary size. If the needed size goes over
* PAGE_SIZE, kernel VM is used.
*
* RETURNS:
* The new pointer on success, NULL on failure.
*/
static void *pcpu_realloc(void *p, size_t size, size_t new_size)
{
void *new;
if (new_size <= PAGE_SIZE)
new = kmalloc(new_size, GFP_KERNEL);
else
new = vmalloc(new_size);
if (new_size && !new)
return NULL;
memcpy(new, p, min(size, new_size));
if (new_size > size)
memset(new + size, 0, new_size - size);
if (size <= PAGE_SIZE)
kfree(p);
else
vfree(p);
return new;
}
/**
* pcpu_chunk_relocate - put chunk in the appropriate chunk slot
* @chunk: chunk of interest
* @oslot: the previous slot it was on
*
* This function is called after an allocation or free changed @chunk.
* New slot according to the changed state is determined and @chunk is
* moved to the slot.
*/
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
int nslot = pcpu_chunk_slot(chunk);
if (oslot != nslot) {
if (oslot < nslot)
list_move(&chunk->list, &pcpu_slot[nslot]);
else
list_move_tail(&chunk->list, &pcpu_slot[nslot]);
}
}
static struct rb_node **pcpu_chunk_rb_search(void *addr,
struct rb_node **parentp)
{
struct rb_node **p = &pcpu_addr_root.rb_node;
struct rb_node *parent = NULL;
struct pcpu_chunk *chunk;
while (*p) {
parent = *p;
chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
if (addr < chunk->vm->addr)
p = &(*p)->rb_left;
else if (addr > chunk->vm->addr)
p = &(*p)->rb_right;
else
break;
}
if (parentp)
*parentp = parent;
return p;
}
/**
* pcpu_chunk_addr_search - search for chunk containing specified address
* @addr: address to search for
*
* Look for chunk which might contain @addr. More specifically, it
* searchs for the chunk with the highest start address which isn't
* beyond @addr.
*
* RETURNS:
* The address of the found chunk.
*/
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
struct rb_node *n, *parent;
struct pcpu_chunk *chunk;
n = *pcpu_chunk_rb_search(addr, &parent);
if (!n) {
/* no exactly matching chunk, the parent is the closest */
n = parent;
BUG_ON(!n);
}
chunk = rb_entry(n, struct pcpu_chunk, rb_node);
if (addr < chunk->vm->addr) {
/* the parent was the next one, look for the previous one */
n = rb_prev(n);
BUG_ON(!n);
chunk = rb_entry(n, struct pcpu_chunk, rb_node);
}
return chunk;
}
/**
* pcpu_chunk_addr_insert - insert chunk into address rb tree
* @new: chunk to insert
*
* Insert @new into address rb tree.
*/
static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
{
struct rb_node **p, *parent;
p = pcpu_chunk_rb_search(new->vm->addr, &parent);
BUG_ON(*p);
rb_link_node(&new->rb_node, parent, p);
rb_insert_color(&new->rb_node, &pcpu_addr_root);
}
/**
* pcpu_split_block - split a map block
* @chunk: chunk of interest
* @i: index of map block to split
* @head: head size in bytes (can be 0)
* @tail: tail size in bytes (can be 0)
*
* Split the @i'th map block into two or three blocks. If @head is
* non-zero, @head bytes block is inserted before block @i moving it
* to @i+1 and reducing its size by @head bytes.
*
* If @tail is non-zero, the target block, which can be @i or @i+1
* depending on @head, is reduced by @tail bytes and @tail byte block
* is inserted after the target block.
*
* RETURNS:
* 0 on success, -errno on failure.
*/
static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
{
int nr_extra = !!head + !!tail;
int target = chunk->map_used + nr_extra;
/* reallocation required? */
if (chunk->map_alloc < target) {
int new_alloc = chunk->map_alloc;
int *new;
while (new_alloc < target)
new_alloc *= 2;
new = pcpu_realloc(chunk->map,
chunk->map_alloc * sizeof(new[0]),
new_alloc * sizeof(new[0]));
if (!new)
return -ENOMEM;
chunk->map_alloc = new_alloc;
chunk->map = new;
}
/* insert a new subblock */
memmove(&chunk->map[i + nr_extra], &chunk->map[i],
sizeof(chunk->map[0]) * (chunk->map_used - i));
chunk->map_used += nr_extra;
if (head) {
chunk->map[i + 1] = chunk->map[i] - head;
chunk->map[i++] = head;
}
if (tail) {
chunk->map[i++] -= tail;
chunk->map[i] = tail;
}
return 0;
}
/**
* pcpu_alloc_area - allocate area from a pcpu_chunk
* @chunk: chunk of interest
* @size: wanted size in bytes
* @align: wanted align
*
* Try to allocate @size bytes area aligned at @align from @chunk.
* Note that this function only allocates the offset. It doesn't
* populate or map the area.
*
* RETURNS:
* Allocated offset in @chunk on success, -errno on failure.
*/
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
{
int oslot = pcpu_chunk_slot(chunk);
int max_contig = 0;
int i, off;
/*
* The static chunk initially doesn't have map attached
* because kmalloc wasn't available during init. Give it one.
*/
if (unlikely(!chunk->map)) {
chunk->map = pcpu_realloc(NULL, 0,
PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
if (!chunk->map)
return -ENOMEM;
chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
chunk->map[chunk->map_used++] = -pcpu_static_size;
if (chunk->free_size)
chunk->map[chunk->map_used++] = chunk->free_size;
}
for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
bool is_last = i + 1 == chunk->map_used;
int head, tail;
/* extra for alignment requirement */
head = ALIGN(off, align) - off;
BUG_ON(i == 0 && head != 0);
if (chunk->map[i] < 0)
continue;
if (chunk->map[i] < head + size) {
max_contig = max(chunk->map[i], max_contig);
continue;
}
/*
* If head is small or the previous block is free,
* merge'em. Note that 'small' is defined as smaller
* than sizeof(int), which is very small but isn't too
* uncommon for percpu allocations.
*/
if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
if (chunk->map[i - 1] > 0)
chunk->map[i - 1] += head;
else {
chunk->map[i - 1] -= head;
chunk->free_size -= head;
}
chunk->map[i] -= head;
off += head;
head = 0;
}
/* if tail is small, just keep it around */
tail = chunk->map[i] - head - size;
if (tail < sizeof(int))
tail = 0;
/* split if warranted */
if (head || tail) {
if (pcpu_split_block(chunk, i, head, tail))
return -ENOMEM;
if (head) {
i++;
off += head;
max_contig = max(chunk->map[i - 1], max_contig);
}
if (tail)
max_contig = max(chunk->map[i + 1], max_contig);
}
/* update hint and mark allocated */
if (is_last)
chunk->contig_hint = max_contig; /* fully scanned */
else
chunk->contig_hint = max(chunk->contig_hint,
max_contig);
chunk->free_size -= chunk->map[i];
chunk->map[i] = -chunk->map[i];
pcpu_chunk_relocate(chunk, oslot);
return off;
}
chunk->contig_hint = max_contig; /* fully scanned */
pcpu_chunk_relocate(chunk, oslot);
/*
* Tell the upper layer that this chunk has no area left.
* Note that this is not an error condition but a notification
* to upper layer that it needs to look at other chunks.
* -ENOSPC is chosen as it isn't used in memory subsystem and
* matches the meaning in a way.
*/
return -ENOSPC;
}
/**
* pcpu_free_area - free area to a pcpu_chunk
* @chunk: chunk of interest
* @freeme: offset of area to free
*
* Free area starting from @freeme to @chunk. Note that this function
* only modifies the allocation map. It doesn't depopulate or unmap
* the area.
*/
static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
{
int oslot = pcpu_chunk_slot(chunk);
int i, off;
for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
if (off == freeme)
break;
BUG_ON(off != freeme);
BUG_ON(chunk->map[i] > 0);
chunk->map[i] = -chunk->map[i];
chunk->free_size += chunk->map[i];
/* merge with previous? */
if (i > 0 && chunk->map[i - 1] >= 0) {
chunk->map[i - 1] += chunk->map[i];
chunk->map_used--;
memmove(&chunk->map[i], &chunk->map[i + 1],
(chunk->map_used - i) * sizeof(chunk->map[0]));
i--;
}
/* merge with next? */
if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
chunk->map[i] += chunk->map[i + 1];
chunk->map_used--;
memmove(&chunk->map[i + 1], &chunk->map[i + 2],
(chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
}
chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
pcpu_chunk_relocate(chunk, oslot);
}
/**
* pcpu_unmap - unmap pages out of a pcpu_chunk
* @chunk: chunk of interest
* @page_start: page index of the first page to unmap
* @page_end: page index of the last page to unmap + 1
* @flush: whether to flush cache and tlb or not
*
* For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
* If @flush is true, vcache is flushed before unmapping and tlb
* after.
*/
static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
bool flush)
{
unsigned int last = num_possible_cpus() - 1;
unsigned int cpu;
/* unmap must not be done on immutable chunk */
WARN_ON(chunk->immutable);
/*
* Each flushing trial can be very expensive, issue flush on
* the whole region at once rather than doing it for each cpu.
* This could be an overkill but is more scalable.
*/
if (flush)
flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
pcpu_chunk_addr(chunk, last, page_end));
for_each_possible_cpu(cpu)
unmap_kernel_range_noflush(
pcpu_chunk_addr(chunk, cpu, page_start),
(page_end - page_start) << PAGE_SHIFT);
/* ditto as flush_cache_vunmap() */
if (flush)
flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
pcpu_chunk_addr(chunk, last, page_end));
}
/**
* pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
* @chunk: chunk to depopulate
* @off: offset to the area to depopulate
* @size: size of the area to depopulate in bytes
* @flush: whether to flush cache and tlb or not
*
* For each cpu, depopulate and unmap pages [@page_start,@page_end)
* from @chunk. If @flush is true, vcache is flushed before unmapping
* and tlb after.
*/
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
bool flush)
{
int page_start = PFN_DOWN(off);
int page_end = PFN_UP(off + size);
int unmap_start = -1;
int uninitialized_var(unmap_end);
unsigned int cpu;
int i;
for (i = page_start; i < page_end; i++) {
for_each_possible_cpu(cpu) {
struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
if (!*pagep)
continue;
__free_page(*pagep);
/*
* If it's partial depopulation, it might get
* populated or depopulated again. Mark the
* page gone.
*/
*pagep = NULL;
unmap_start = unmap_start < 0 ? i : unmap_start;
unmap_end = i + 1;
}
}
if (unmap_start >= 0)
pcpu_unmap(chunk, unmap_start, unmap_end, flush);
}
/**
* pcpu_map - map pages into a pcpu_chunk
* @chunk: chunk of interest
* @page_start: page index of the first page to map
* @page_end: page index of the last page to map + 1
*
* For each cpu, map pages [@page_start,@page_end) into @chunk.
* vcache is flushed afterwards.
*/
static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
{
unsigned int last = num_possible_cpus() - 1;
unsigned int cpu;
int err;
/* map must not be done on immutable chunk */
WARN_ON(chunk->immutable);
for_each_possible_cpu(cpu) {
err = map_kernel_range_noflush(
pcpu_chunk_addr(chunk, cpu, page_start),
(page_end - page_start) << PAGE_SHIFT,
PAGE_KERNEL,
pcpu_chunk_pagep(chunk, cpu, page_start));
if (err < 0)
return err;
}
/* flush at once, please read comments in pcpu_unmap() */
flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
pcpu_chunk_addr(chunk, last, page_end));
return 0;
}
/**
* pcpu_populate_chunk - populate and map an area of a pcpu_chunk
* @chunk: chunk of interest
* @off: offset to the area to populate
* @size: size of the area to populate in bytes
*
* For each cpu, populate and map pages [@page_start,@page_end) into
* @chunk. The area is cleared on return.
*/
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
{
const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
int page_start = PFN_DOWN(off);
int page_end = PFN_UP(off + size);
int map_start = -1;
int map_end;
unsigned int cpu;
int i;
for (i = page_start; i < page_end; i++) {
if (pcpu_chunk_page_occupied(chunk, i)) {
if (map_start >= 0) {
if (pcpu_map(chunk, map_start, map_end))
goto err;
map_start = -1;
}
continue;
}
map_start = map_start < 0 ? i : map_start;
map_end = i + 1;
for_each_possible_cpu(cpu) {
struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
*pagep = alloc_pages_node(cpu_to_node(cpu),
alloc_mask, 0);
if (!*pagep)
goto err;
}
}
if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
goto err;
for_each_possible_cpu(cpu)
memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0,
size);
return 0;
err:
/* likely under heavy memory pressure, give memory back */
pcpu_depopulate_chunk(chunk, off, size, true);
return -ENOMEM;
}
static void free_pcpu_chunk(struct pcpu_chunk *chunk)
{
if (!chunk)
return;
if (chunk->vm)
free_vm_area(chunk->vm);
pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0);
kfree(chunk);
}
static struct pcpu_chunk *alloc_pcpu_chunk(void)
{
struct pcpu_chunk *chunk;
chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
if (!chunk)
return NULL;
chunk->map = pcpu_realloc(NULL, 0,
PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
chunk->map[chunk->map_used++] = pcpu_unit_size;
chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
if (!chunk->vm) {
free_pcpu_chunk(chunk);
return NULL;
}
INIT_LIST_HEAD(&chunk->list);
chunk->free_size = pcpu_unit_size;
chunk->contig_hint = pcpu_unit_size;
return chunk;
}
/**
* __alloc_percpu - allocate percpu area
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
*
* Allocate percpu area of @size bytes aligned at @align. Might
* sleep. Might trigger writeouts.
*
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
void *__alloc_percpu(size_t size, size_t align)
{
void *ptr = NULL;
struct pcpu_chunk *chunk;
int slot, off;
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
WARN(true, "illegal size (%zu) or align (%zu) for "
"percpu allocation\n", size, align);
return NULL;
}
mutex_lock(&pcpu_mutex);
/* allocate area */
for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
list_for_each_entry(chunk, &pcpu_slot[slot], list) {
if (size > chunk->contig_hint)
continue;
off = pcpu_alloc_area(chunk, size, align);
if (off >= 0)
goto area_found;
if (off != -ENOSPC)
goto out_unlock;
}
}
/* hmmm... no space left, create a new chunk */
chunk = alloc_pcpu_chunk();
if (!chunk)
goto out_unlock;
pcpu_chunk_relocate(chunk, -1);
pcpu_chunk_addr_insert(chunk);
off = pcpu_alloc_area(chunk, size, align);
if (off < 0)
goto out_unlock;
area_found:
/* populate, map and clear the area */
if (pcpu_populate_chunk(chunk, off, size)) {
pcpu_free_area(chunk, off);
goto out_unlock;
}
ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
out_unlock:
mutex_unlock(&pcpu_mutex);
return ptr;
}
EXPORT_SYMBOL_GPL(__alloc_percpu);
static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
{
WARN_ON(chunk->immutable);
pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
list_del(&chunk->list);
rb_erase(&chunk->rb_node, &pcpu_addr_root);
free_pcpu_chunk(chunk);
}
/**
* free_percpu - free percpu area
* @ptr: pointer to area to free
*
* Free percpu area @ptr. Might sleep.
*/
void free_percpu(void *ptr)
{
void *addr = __pcpu_ptr_to_addr(ptr);
struct pcpu_chunk *chunk;
int off;
if (!ptr)
return;
mutex_lock(&pcpu_mutex);
chunk = pcpu_chunk_addr_search(addr);
off = addr - chunk->vm->addr;
pcpu_free_area(chunk, off);
/* the chunk became fully free, kill one if there are other free ones */
if (chunk->free_size == pcpu_unit_size) {
struct pcpu_chunk *pos;
list_for_each_entry(pos,
&pcpu_slot[pcpu_chunk_slot(chunk)], list)
if (pos != chunk) {
pcpu_kill_chunk(pos);
break;
}
}
mutex_unlock(&pcpu_mutex);
}
EXPORT_SYMBOL_GPL(free_percpu);
/**
* pcpu_setup_first_chunk - initialize the first percpu chunk
* @get_page_fn: callback to fetch page pointer
* @static_size: the size of static percpu area in bytes
* @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, 0 for auto
* @free_size: free size in bytes, 0 for auto
* @base_addr: mapped address, NULL for auto
* @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary
*
* Initialize the first percpu chunk which contains the kernel static
* perpcu area. This function is to be called from arch percpu area
* setup path. The first two parameters are mandatory. The rest are
* optional.
*
* @get_page_fn() should return pointer to percpu page given cpu
* number and page number. It should at least return enough pages to
* cover the static area. The returned pages for static area should
* have been initialized with valid data. If @unit_size is specified,
* it can also return pages after the static area. NULL return
* indicates end of pages for the cpu. Note that @get_page_fn() must
* return the same number of pages for all cpus.
*
* @unit_size, if non-zero, determines unit size and must be aligned
* to PAGE_SIZE and equal to or larger than @static_size + @free_size.
*
* @free_size determines the number of free bytes after the static
* area in the first chunk. If zero, whatever left is available.
* Specifying non-zero value make percpu leave the area after
* @static_size + @free_size alone.
*
* Non-null @base_addr means that the caller already allocated virtual
* region for the first chunk and mapped it. percpu must not mess
* with the chunk. Note that @base_addr with 0 @unit_size or non-NULL
* @populate_pte_fn doesn't make any sense.
*
* @populate_pte_fn is used to populate the pagetable. NULL means the
* caller already populated the pagetable.
*
* RETURNS:
* The determined pcpu_unit_size which can be used to initialize
* percpu access.
*/
size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
size_t static_size, size_t unit_size,
size_t free_size, void *base_addr,
pcpu_populate_pte_fn_t populate_pte_fn)
{
static struct vm_struct static_vm;
struct pcpu_chunk *static_chunk;
unsigned int cpu;
int nr_pages;
int err, i;
/* santiy checks */
BUG_ON(!static_size);
BUG_ON(!unit_size && free_size);
BUG_ON(unit_size && unit_size < static_size + free_size);
BUG_ON(unit_size & ~PAGE_MASK);
BUG_ON(base_addr && !unit_size);
BUG_ON(base_addr && populate_pte_fn);
if (unit_size)
pcpu_unit_pages = unit_size >> PAGE_SHIFT;
else
pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
PFN_UP(static_size));
pcpu_static_size = static_size;
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
/*
* Allocate chunk slots. The additional last slot is for
* empty chunks.
*/
pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_slot[i]);
/* init static_chunk */
static_chunk = alloc_bootmem(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&static_chunk->list);
static_chunk->vm = &static_vm;
if (free_size)
static_chunk->free_size = free_size;
else
static_chunk->free_size = pcpu_unit_size - pcpu_static_size;
static_chunk->contig_hint = static_chunk->free_size;
/* allocate vm address */
static_vm.flags = VM_ALLOC;
static_vm.size = pcpu_chunk_size;
if (!base_addr)
vm_area_register_early(&static_vm, PAGE_SIZE);
else {
/*
* Pages already mapped. No need to remap into
* vmalloc area. In this case the static chunk can't
* be mapped or unmapped by percpu and is marked
* immutable.
*/
static_vm.addr = base_addr;
static_chunk->immutable = true;
}
/* assign pages */
nr_pages = -1;
for_each_possible_cpu(cpu) {
for (i = 0; i < pcpu_unit_pages; i++) {
struct page *page = get_page_fn(cpu, i);
if (!page)
break;
*pcpu_chunk_pagep(static_chunk, cpu, i) = page;
}
BUG_ON(i < PFN_UP(pcpu_static_size));
if (nr_pages < 0)
nr_pages = i;
else
BUG_ON(nr_pages != i);
}
/* map them */
if (populate_pte_fn) {
for_each_possible_cpu(cpu)
for (i = 0; i < nr_pages; i++)
populate_pte_fn(pcpu_chunk_addr(static_chunk,
cpu, i));
err = pcpu_map(static_chunk, 0, nr_pages);
if (err)
panic("failed to setup static percpu area, err=%d\n",
err);
}
/* link static_chunk in */
pcpu_chunk_relocate(static_chunk, -1);
pcpu_chunk_addr_insert(static_chunk);
/* we're done */
pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0);
return pcpu_unit_size;
}
......@@ -24,6 +24,7 @@
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>
#include <linux/bootmem.h>
#include <linux/pfn.h>
#include <asm/atomic.h>
#include <asm/uaccess.h>
......@@ -152,8 +153,8 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
*
* Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
*/
static int vmap_page_range(unsigned long start, unsigned long end,
pgprot_t prot, struct page **pages)
static int vmap_page_range_noflush(unsigned long start, unsigned long end,
pgprot_t prot, struct page **pages)
{
pgd_t *pgd;
unsigned long next;
......@@ -169,13 +170,22 @@ static int vmap_page_range(unsigned long start, unsigned long end,
if (err)
break;
} while (pgd++, addr = next, addr != end);
flush_cache_vmap(start, end);
if (unlikely(err))
return err;
return nr;
}
static int vmap_page_range(unsigned long start, unsigned long end,
pgprot_t prot, struct page **pages)
{
int ret;
ret = vmap_page_range_noflush(start, end, prot, pages);
flush_cache_vmap(start, end);
return ret;
}
static inline int is_vmalloc_or_module_addr(const void *x)
{
/*
......@@ -982,6 +992,32 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
}
EXPORT_SYMBOL(vm_map_ram);
/**
* vm_area_register_early - register vmap area early during boot
* @vm: vm_struct to register
* @align: requested alignment
*
* This function is used to register kernel vm area before
* vmalloc_init() is called. @vm->size and @vm->flags should contain
* proper values on entry and other fields should be zero. On return,
* vm->addr contains the allocated address.
*
* DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
*/
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
{
static size_t vm_init_off __initdata;
unsigned long addr;
addr = ALIGN(VMALLOC_START + vm_init_off, align);
vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
vm->addr = (void *)addr;
vm->next = vmlist;
vmlist = vm;
}
void __init vmalloc_init(void)
{
struct vmap_area *va;
......@@ -1009,6 +1045,58 @@ void __init vmalloc_init(void)
vmap_initialized = true;
}
/**
* map_kernel_range_noflush - map kernel VM area with the specified pages
* @addr: start of the VM area to map
* @size: size of the VM area to map
* @prot: page protection flags to use
* @pages: pages to map
*
* Map PFN_UP(@size) pages at @addr. The VM area @addr and @size
* specify should have been allocated using get_vm_area() and its
* friends.
*
* NOTE:
* This function does NOT do any cache flushing. The caller is
* responsible for calling flush_cache_vmap() on to-be-mapped areas
* before calling this function.
*
* RETURNS:
* The number of pages mapped on success, -errno on failure.
*/
int map_kernel_range_noflush(unsigned long addr, unsigned long size,
pgprot_t prot, struct page **pages)
{
return vmap_page_range_noflush(addr, addr + size, prot, pages);
}
/**
* unmap_kernel_range_noflush - unmap kernel VM area
* @addr: start of the VM area to unmap
* @size: size of the VM area to unmap
*
* Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size
* specify should have been allocated using get_vm_area() and its
* friends.
*
* NOTE:
* This function does NOT do any cache flushing. The caller is
* responsible for calling flush_cache_vunmap() on to-be-mapped areas
* before calling this function and flush_tlb_kernel_range() after.
*/
void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
{
vunmap_page_range(addr, addr + size);
}
/**
* unmap_kernel_range - unmap kernel VM area and flush cache and TLB
* @addr: start of the VM area to unmap
* @size: size of the VM area to unmap
*
* Similar to unmap_kernel_range_noflush() but flushes vcache before
* the unmapping and tlb after.
*/
void unmap_kernel_range(unsigned long addr, unsigned long size)
{
unsigned long end = addr + size;
......
......@@ -1375,10 +1375,10 @@ EXPORT_SYMBOL_GPL(snmp_fold_field);
int snmp_mib_init(void *ptr[2], size_t mibsize)
{
BUG_ON(ptr == NULL);
ptr[0] = __alloc_percpu(mibsize);
ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
if (!ptr[0])
goto err0;
ptr[1] = __alloc_percpu(mibsize);
ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long long));
if (!ptr[1])
goto err1;
return 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment