/*
 *  linux/arch/x86_64/mm/init.c
 *
 *  Copyright (C) 1995  Linus Torvalds
 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
 *  Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
 */

#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/smp.h>
#include <linux/init.h>
#include <linux/initrd.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/proc_fs.h>
#include <linux/pci.h>
#include <linux/pfn.h>
#include <linux/poison.h>
#include <linux/dma-mapping.h>
#include <linux/module.h>
#include <linux/memory_hotplug.h>
#include <linux/nmi.h>

#include <asm/processor.h>
#include <asm/bios_ebda.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/dma.h>
#include <asm/fixmap.h>
#include <asm/e820.h>
#include <asm/apic.h>
#include <asm/tlb.h>
#include <asm/mmu_context.h>
#include <asm/proto.h>
#include <asm/smp.h>
#include <asm/sections.h>
#include <asm/kdebug.h>
#include <asm/numa.h>
#include <asm/cacheflush.h>
#include <asm/init.h>

static unsigned long dma_reserve __initdata;

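/*
 * "gbpages"/"nogbpages" boot parameters: force the use of 1GB pages for
 * the direct mapping on or off.
 */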
static int __init parse_direct_gbpages_off(char *arg)
{
	direct_gbpages = 0;
	return 0;
}
early_param("nogbpages", parse_direct_gbpages_off);

static int __init parse_direct_gbpages_on(char *arg)
{
	direct_gbpages = 1;
	return 0;
}
early_param("gbpages", parse_direct_gbpages_on);

/*
 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
 * in physical space, so we can cache the location of the first one and
 * move around without checking the pgd every time.
 */

pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
EXPORT_SYMBOL_GPL(__supported_pte_mask);

int force_personality32;

/*
 * noexec32=on|off
 * Control the non-executable heap for 32-bit processes.
 * To control the stack too, use noexec=off.
 *
 * on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
 * off	PROT_READ implies PROT_EXEC
 */
static int __init nonx32_setup(char *str)
{
	if (!strcmp(str, "on"))
		force_personality32 &= ~READ_IMPLIES_EXEC;
	else if (!strcmp(str, "off"))
		force_personality32 |= READ_IMPLIES_EXEC;
	return 1;
}
__setup("noexec32=", nonx32_setup);

/*
 * NOTE: This function is marked __ref because it calls __init function
 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
 */
static __ref void *spp_getpage(void)
{
	void *ptr;

	if (after_bootmem)
		ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
	else
		ptr = alloc_bootmem_pages(PAGE_SIZE);

	if (!ptr || ((unsigned long)ptr & ~PAGE_MASK)) {
		panic("set_pte_phys: cannot allocate page data %s\n",
			after_bootmem ? "after bootmem" : "");
	}

	pr_debug("spp_getpage %p\n", ptr);

	return ptr;
}

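/*
 * The fill_*() helpers below walk one level of the kernel page tables
 * for 'vaddr', allocating the next-level table with spp_getpage() if it
 * is not present yet, and return a pointer to the entry for 'vaddr' at
 * that level.
 */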
static pud_t *fill_pud(pgd_t *pgd, unsigned long vaddr)
{
	if (pgd_none(*pgd)) {
		pud_t *pud = (pud_t *)spp_getpage();
		pgd_populate(&init_mm, pgd, pud);
		if (pud != pud_offset(pgd, 0))
			printk(KERN_ERR "PAGETABLE BUG #00! %p <-> %p\n",
			       pud, pud_offset(pgd, 0));
	}
	return pud_offset(pgd, vaddr);
}

static pmd_t *fill_pmd(pud_t *pud, unsigned long vaddr)
{
	if (pud_none(*pud)) {
		pmd_t *pmd = (pmd_t *) spp_getpage();
		pud_populate(&init_mm, pud, pmd);
		if (pmd != pmd_offset(pud, 0))
			printk(KERN_ERR "PAGETABLE BUG #01! %p <-> %p\n",
			       pmd, pmd_offset(pud, 0));
	}
	return pmd_offset(pud, vaddr);
}

static pte_t *fill_pte(pmd_t *pmd, unsigned long vaddr)
{
	if (pmd_none(*pmd)) {
		pte_t *pte = (pte_t *) spp_getpage();
		pmd_populate_kernel(&init_mm, pmd, pte);
		if (pte != pte_offset_kernel(pmd, 0))
			printk(KERN_ERR "PAGETABLE BUG #02!\n");
	}
	return pte_offset_kernel(pmd, vaddr);
}

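/*
 * Install 'new_pte' for 'vaddr' underneath the given pud page,
 * allocating intermediate pmd/pte tables as needed, and flush the
 * single TLB entry for that address.
 */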
void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
{
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pud = pud_page + pud_index(vaddr);
	pmd = fill_pmd(pud, vaddr);
	pte = fill_pte(pmd, vaddr);

	set_pte(pte, new_pte);

	/*
	 * It's enough to flush this one mapping.
	 * (PGE mappings get flushed as well)
	 */
	__flush_tlb_one(vaddr);
}

void set_pte_vaddr(unsigned long vaddr, pte_t pteval)
{
	pgd_t *pgd;
	pud_t *pud_page;

	pr_debug("set_pte_vaddr %lx to %lx\n", vaddr, native_pte_val(pteval));

	pgd = pgd_offset_k(vaddr);
	if (pgd_none(*pgd)) {
		printk(KERN_ERR
			"PGD FIXMAP MISSING, it should be setup in head.S!\n");
		return;
	}
	pud_page = (pud_t*)pgd_page_vaddr(*pgd);
	set_pte_vaddr_pud(pud_page, vaddr, pteval);
}

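/*
 * Ensure the kernel page tables for 'vaddr' exist down to the pmd
 * (respectively pte) level, allocating any missing intermediate tables,
 * and return a pointer to the entry.
 */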
pmd_t * __init populate_extra_pmd(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;

	pgd = pgd_offset_k(vaddr);
	pud = fill_pud(pgd, vaddr);
	return fill_pmd(pud, vaddr);
}

pte_t * __init populate_extra_pte(unsigned long vaddr)
{
	pmd_t *pmd;

	pmd = populate_extra_pmd(vaddr);
	return fill_pte(pmd, vaddr);
}

/*
 * Create large page table mappings for a range of physical addresses.
 */
static void __init __init_extra_mapping(unsigned long phys, unsigned long size,
						pgprot_t prot)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	BUG_ON((phys & ~PMD_MASK) || (size & ~PMD_MASK));
	for (; size; phys += PMD_SIZE, size -= PMD_SIZE) {
		pgd = pgd_offset_k((unsigned long)__va(phys));
		if (pgd_none(*pgd)) {
			pud = (pud_t *) spp_getpage();
			set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pud = pud_offset(pgd, (unsigned long)__va(phys));
		if (pud_none(*pud)) {
			pmd = (pmd_t *) spp_getpage();
			set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE |
						_PAGE_USER));
		}
		pmd = pmd_offset(pud, phys);
		BUG_ON(!pmd_none(*pmd));
		set_pmd(pmd, __pmd(phys | pgprot_val(prot)));
	}
}

void __init init_extra_mapping_wb(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE);
}

void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
{
	__init_extra_mapping(phys, size, PAGE_KERNEL_LARGE_NOCACHE);
}

/*
 * The head.S code sets up the kernel high mapping:
 *
 *   from __START_KERNEL_map to __START_KERNEL_map + size (== _end-_text)
 *
 * phys_addr holds the negative offset to the kernel, which is added
 * to the compile time generated pmds. This results in invalid pmds up
 * to the point where we hit the physaddr 0 mapping.
 *
 * We limit the mappings to the region from _text to _end.  _end is
 * rounded up to the 2MB boundary. This catches the invalid pmds as
 * well, as they are located before _text:
 */
void __init cleanup_highmap(void)
{
	unsigned long vaddr = __START_KERNEL_map;
	unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
	pmd_t *pmd = level2_kernel_pgt;
	pmd_t *last_pmd = pmd + PTRS_PER_PMD;

	for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) {
		if (pmd_none(*pmd))
			continue;
		if (vaddr < (unsigned long) _text || vaddr > end)
			set_pmd(pmd, __pmd(0));
	}
}

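/*
 * Allocate a zeroed page to be used as a page table while building the
 * direct mapping.  Before bootmem is up, pages come from the early
 * page-table area tracked by e820_table_end/e820_table_top (reserved
 * elsewhere during boot) and are mapped with early_memremap();
 * afterwards get_zeroed_page() is used.  The physical address is
 * returned through *phys.
 */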
static __ref void *alloc_low_page(unsigned long *phys)
{
	unsigned long pfn = e820_table_end++;
	void *adr;

	if (after_bootmem) {
		adr = (void *)get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK);
		*phys = __pa(adr);

		return adr;
	}

	if (pfn >= e820_table_top)
		panic("alloc_low_page: ran out of memory");

	adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
	memset(adr, 0, PAGE_SIZE);
	*phys  = pfn * PAGE_SIZE;
	return adr;
}

static __ref void unmap_low_page(void *adr)
{
	if (after_bootmem)
		return;

	early_iounmap(adr, PAGE_SIZE);
}

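/*
 * phys_pte_init(), phys_pmd_init() and phys_pud_init() build the direct
 * mapping for the physical range [addr, end) at 4K, 2M and 1G
 * granularity respectively, re-using entries that are already present.
 * Each returns the last physical address it mapped (last_map_addr).
 */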
static unsigned long __meminit
phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
	      pgprot_t prot)
{
	unsigned pages = 0;
	unsigned long last_map_addr = end;
	int i;

	pte_t *pte = pte_page + pte_index(addr);

	for (i = pte_index(addr); i < PTRS_PER_PTE; i++, addr += PAGE_SIZE, pte++) {

		if (addr >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PTE; i++, pte++)
					set_pte(pte, __pte(0));
			}
			break;
		}

		/*
		 * We will re-use the existing mapping.
		 * Xen for example has some special requirements, like mapping
		 * pagetable pages as RO. So assume that whoever pre-set up
		 * these mappings knew what they were doing.
		 */
		if (pte_val(*pte)) {
			pages++;
			continue;
		}

		if (0)
			printk("   pte=%p addr=%lx pte=%016lx\n",
			       pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
		pages++;
		set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
		last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
	}

	update_page_count(PG_LEVEL_4K, pages);

	return last_map_addr;
}

static unsigned long __meminit
phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
		pgprot_t prot)
{
	pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);

	return phys_pte_init(pte, address, end, prot);
}

static unsigned long __meminit
phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
	      unsigned long page_size_mask, pgprot_t prot)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;

	int i = pmd_index(address);

	for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
		unsigned long pte_phys;
		pmd_t *pmd = pmd_page + pmd_index(address);
		pte_t *pte;
		pgprot_t new_prot = prot;

		if (address >= end) {
			if (!after_bootmem) {
				for (; i < PTRS_PER_PMD; i++, pmd++)
					set_pmd(pmd, __pmd(0));
			}
			break;
		}

		if (pmd_val(*pmd)) {
			if (!pmd_large(*pmd)) {
				spin_lock(&init_mm.page_table_lock);
				last_map_addr = phys_pte_update(pmd, address,
								end, prot);
				spin_unlock(&init_mm.page_table_lock);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_2M mapping, then we will
			 * use the existing mapping,
			 *
			 * Otherwise, we will split the large page mapping but
			 * use the same existing protection bits except for
			 * large page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_2M)) {
				pages++;
				continue;
			}
			new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
		}

		if (page_size_mask & (1<<PG_LEVEL_2M)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pmd,
				pfn_pte(address >> PAGE_SHIFT,
					__pgprot(pgprot_val(prot) | _PAGE_PSE)));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (address & PMD_MASK) + PMD_SIZE;
			continue;
		}

		pte = alloc_low_page(&pte_phys);
		last_map_addr = phys_pte_init(pte, address, end, new_prot);
		unmap_low_page(pte);

		spin_lock(&init_mm.page_table_lock);
		pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	update_page_count(PG_LEVEL_2M, pages);
	return last_map_addr;
}

static unsigned long __meminit
phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
		unsigned long page_size_mask, pgprot_t prot)
{
	pmd_t *pmd = pmd_offset(pud, 0);
	unsigned long last_map_addr;

	last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
	__flush_tlb_all();
	return last_map_addr;
}

static unsigned long __meminit
phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
			 unsigned long page_size_mask)
{
	unsigned long pages = 0;
	unsigned long last_map_addr = end;
	int i = pud_index(addr);

	for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE) {
		unsigned long pmd_phys;
		pud_t *pud = pud_page + pud_index(addr);
		pmd_t *pmd;
		pgprot_t prot = PAGE_KERNEL;

		if (addr >= end)
			break;

		if (!after_bootmem &&
				!e820_any_mapped(addr, addr+PUD_SIZE, 0)) {
			set_pud(pud, __pud(0));
			continue;
		}

		if (pud_val(*pud)) {
			if (!pud_large(*pud)) {
				last_map_addr = phys_pmd_update(pud, addr, end,
							 page_size_mask, prot);
				continue;
			}
			/*
			 * If we are ok with PG_LEVEL_1G mapping, then we will
			 * use the existing mapping.
			 *
			 * Otherwise, we will split the gbpage mapping but use
			 * the same existing protection  bits except for large
			 * page, so that we don't violate Intel's TLB
			 * Application note (317080) which says, while changing
			 * the page sizes, new and old translations should
			 * not differ with respect to page frame and
			 * attributes.
			 */
			if (page_size_mask & (1 << PG_LEVEL_1G)) {
				pages++;
				continue;
			}
			prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
		}

		if (page_size_mask & (1<<PG_LEVEL_1G)) {
			pages++;
			spin_lock(&init_mm.page_table_lock);
			set_pte((pte_t *)pud,
				pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
			spin_unlock(&init_mm.page_table_lock);
			last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
			continue;
		}

		pmd = alloc_low_page(&pmd_phys);
		last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
					      prot);
		unmap_low_page(pmd);

		spin_lock(&init_mm.page_table_lock);
		pud_populate(&init_mm, pud, __va(pmd_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	update_page_count(PG_LEVEL_1G, pages);

	return last_map_addr;
}

static unsigned long __meminit
phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
		 unsigned long page_size_mask)
{
	pud_t *pud;

	pud = (pud_t *)pgd_page_vaddr(*pgd);

	return phys_pud_init(pud, addr, end, page_size_mask);
}

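/*
 * Set up the kernel direct (linear) mapping for the physical range
 * [start, end), using the largest page sizes permitted by
 * 'page_size_mask', and return the last address mapped.
 */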
unsigned long __meminit
kernel_physical_mapping_init(unsigned long start,
			     unsigned long end,
			     unsigned long page_size_mask)
{

	unsigned long next, last_map_addr = end;

	start = (unsigned long)__va(start);
	end = (unsigned long)__va(end);

	for (; start < end; start = next) {
		pgd_t *pgd = pgd_offset_k(start);
		unsigned long pud_phys;
		pud_t *pud;

		next = (start + PGDIR_SIZE) & PGDIR_MASK;
		if (next > end)
			next = end;

		if (pgd_val(*pgd)) {
			last_map_addr = phys_pud_update(pgd, __pa(start),
						 __pa(end), page_size_mask);
			continue;
		}

		pud = alloc_low_page(&pud_phys);
		last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
						 page_size_mask);
		unmap_low_page(pud);

		spin_lock(&init_mm.page_table_lock);
		pgd_populate(&init_mm, pgd, __va(pud_phys));
		spin_unlock(&init_mm.page_table_lock);
	}
	__flush_tlb_all();

	return last_map_addr;
}

#ifndef CONFIG_NUMA
void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
				int acpi, int k8)
{
	unsigned long bootmap_size, bootmap;

	bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
	bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
				 PAGE_SIZE);
	if (bootmap == -1L)
		panic("Cannot find bootmem map of size %ld\n", bootmap_size);
	/* don't touch min_low_pfn */
	bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
					 0, end_pfn);
	e820_register_active_regions(0, start_pfn, end_pfn);
	free_bootmem_with_active_regions(0, end_pfn);
	early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
	reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
}
#endif

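/*
 * paging_init() fills in the zone limits, registers the memory ranges
 * with sparsemem and hands everything to the core VM via
 * free_area_init_nodes().
 */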
void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];

	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
	max_zone_pfns[ZONE_NORMAL] = max_pfn;

	sparse_memory_present_with_active_regions(MAX_NUMNODES);
	sparse_init();

	/*
	 * clear the default setting with node 0
	 * note: don't use nodes_clear here, that is really clearing when
	 *	 numa support is not compiled in, and later node_set_state
	 *	 will not set it back.
	 */
	node_clear_state(0, N_NORMAL_MEMORY);

	free_area_init_nodes(max_zone_pfns);
}

/*
 * Memory hotplug specific functions
 */
#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Memory is added always to NORMAL zone. This means you will never get
 * additional DMA/DMA32 memory.
 */
int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long last_mapped_pfn, start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	last_mapped_pfn = init_memory_mapping(start, start + size);
	if (last_mapped_pfn > max_pfn_mapped)
		max_pfn_mapped = last_mapped_pfn;

	ret = __add_pages(nid, zone, start_pfn, nr_pages);
	WARN_ON_ONCE(ret);

	return ret;
}
EXPORT_SYMBOL_GPL(arch_add_memory);

#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
int memory_add_physaddr_to_nid(u64 start)
{
	return 0;
}
EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
#endif

#endif /* CONFIG_MEMORY_HOTPLUG */

static struct kcore_list kcore_vsyscall;

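/*
 * mem_init() releases bootmem-managed memory to the buddy allocator,
 * accounts absent and reserved pages, registers the vsyscall area for
 * /proc/kcore and prints the boot-time memory banner.
 */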
void __init mem_init(void)
{
	long codesize, reservedpages, datasize, initsize;
	unsigned long absent_pages;

	pci_iommu_alloc();

	/* clear_bss() already cleared the empty_zero_page */

	reservedpages = 0;

	/* this will put all low memory onto the freelists */
#ifdef CONFIG_NUMA
	totalram_pages = numa_free_all_bootmem();
#else
	totalram_pages = free_all_bootmem();
#endif

	absent_pages = absent_pages_in_range(0, max_pfn);
	reservedpages = max_pfn - totalram_pages - absent_pages;
	after_bootmem = 1;

	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;

	/* Register memory areas for /proc/kcore */
	kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
			 VSYSCALL_END - VSYSCALL_START, KCORE_OTHER);

	printk(KERN_INFO "Memory: %luk/%luk available (%ldk kernel code, "
			 "%ldk absent, %ldk reserved, %ldk data, %ldk init)\n",
		nr_free_pages() << (PAGE_SHIFT-10),
		max_pfn << (PAGE_SHIFT-10),
		codesize >> 10,
		absent_pages << (PAGE_SHIFT-10),
		reservedpages << (PAGE_SHIFT-10),
		datasize >> 10,
		initsize >> 10);
}

#ifdef CONFIG_DEBUG_RODATA
const int rodata_test_data = 0xC3;
EXPORT_SYMBOL_GPL(rodata_test_data);

int kernel_set_to_readonly;

void set_kernel_text_rw(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end = PFN_ALIGN(__start_rodata);

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read write\n",
		 start, end);

	set_memory_rw(start, (end - start) >> PAGE_SHIFT);
}

void set_kernel_text_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long end = PFN_ALIGN(__start_rodata);

	if (!kernel_set_to_readonly)
		return;

	pr_debug("Set kernel text: %lx - %lx for read only\n",
		 start, end);

	set_memory_ro(start, (end - start) >> PAGE_SHIFT);
}

void mark_rodata_ro(void)
{
	unsigned long start = PFN_ALIGN(_text);
	unsigned long rodata_start =
		((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
	unsigned long end = (unsigned long) &__end_rodata_hpage_align;
	unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
	unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
	unsigned long data_start = (unsigned long) &_sdata;

	printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
	       (end - start) >> 10);
	set_memory_ro(start, (end - start) >> PAGE_SHIFT);

	kernel_set_to_readonly = 1;

	/*
	 * The rodata section (but not the kernel text!) should also be
	 * not-executable.
	 */
	set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT);

	rodata_test();

#ifdef CONFIG_CPA_DEBUG
	printk(KERN_INFO "Testing CPA: undo %lx-%lx\n", start, end);
	set_memory_rw(start, (end-start) >> PAGE_SHIFT);

	printk(KERN_INFO "Testing CPA: again\n");
	set_memory_ro(start, (end-start) >> PAGE_SHIFT);
#endif

	free_init_pages("unused kernel memory",
			(unsigned long) page_address(virt_to_page(text_end)),
			(unsigned long)
				 page_address(virt_to_page(rodata_start)));
	free_init_pages("unused kernel memory",
			(unsigned long) page_address(virt_to_page(rodata_end)),
			(unsigned long) page_address(virt_to_page(data_start)));
}

#endif

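/*
 * Reserve a physical range with the bootmem allocator (node-aware on
 * NUMA) and account any part below MAX_DMA_PFN via set_dma_reserve().
 */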
int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
				   int flags)
{
#ifdef CONFIG_NUMA
	int nid, next_nid;
	int ret;
#endif
	unsigned long pfn = phys >> PAGE_SHIFT;

	if (pfn >= max_pfn) {
		/*
		 * This can happen with kdump kernels when accessing
		 * firmware tables:
		 */
		if (pfn < max_pfn_mapped)
			return -EFAULT;

		printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
				phys, len);
		return -EFAULT;
	}

	/* Should check here against the e820 map to avoid double free */
#ifdef CONFIG_NUMA
	nid = phys_to_nid(phys);
	next_nid = phys_to_nid(phys + len - 1);
	if (nid == next_nid)
		ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
	else
		ret = reserve_bootmem(phys, len, flags);

	if (ret != 0)
		return ret;

#else
	reserve_bootmem(phys, len, flags);
#endif

	if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
		dma_reserve += len / PAGE_SIZE;
		set_dma_reserve(dma_reserve);
	}

	return 0;
}

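/*
 * kern_addr_valid() walks the kernel page tables by hand and returns
 * non-zero only if 'addr' is canonical and currently mapped to a valid
 * pfn.
 */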
int kern_addr_valid(unsigned long addr)
{
	unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (above != 0 && above != -1UL)
		return 0;

	pgd = pgd_offset_k(addr);
	if (pgd_none(*pgd))
		return 0;

	pud = pud_offset(pgd, addr);
	if (pud_none(*pud))
		return 0;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return 0;

	if (pmd_large(*pmd))
		return pfn_valid(pmd_pfn(*pmd));

	pte = pte_offset_kernel(pmd, addr);
	if (pte_none(*pte))
		return 0;

	return pfn_valid(pte_pfn(*pte));
}

/*
 * A pseudo VMA to allow ptrace access for the vsyscall page.  This only
 * covers the 64bit vsyscall page now. 32bit has a real VMA now and does
 * not need special handling anymore:
 */
static struct vm_area_struct gate_vma = {
	.vm_start	= VSYSCALL_START,
	.vm_end		= VSYSCALL_START + (VSYSCALL_MAPPED_PAGES * PAGE_SIZE),
	.vm_page_prot	= PAGE_READONLY_EXEC,
	.vm_flags	= VM_READ | VM_EXEC
};

struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
{
#ifdef CONFIG_IA32_EMULATION
	if (test_tsk_thread_flag(tsk, TIF_IA32))
		return NULL;
#endif
	return &gate_vma;
}

int in_gate_area(struct task_struct *task, unsigned long addr)
{
	struct vm_area_struct *vma = get_gate_vma(task);

	if (!vma)
		return 0;

	return (addr >= vma->vm_start) && (addr < vma->vm_end);
}

/*
 * Use this when you have no reliable task/vma, typically from interrupt
 * context. It is less reliable than using the task's vma and may give
 * false positives:
 */
int in_gate_area_no_task(unsigned long addr)
{
	return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
}

const char *arch_vma_name(struct vm_area_struct *vma)
{
	if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
		return "[vdso]";
	if (vma == &gate_vma)
		return "[vsyscall]";
	return NULL;
}

#ifdef CONFIG_SPARSEMEM_VMEMMAP
/*
 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
 */
static long __meminitdata addr_start, addr_end;
static void __meminitdata *p_start, *p_end;
static int __meminitdata node_start;

int __meminit
vmemmap_populate(struct page *start_page, unsigned long size, int node)
{
	unsigned long addr = (unsigned long)start_page;
	unsigned long end = (unsigned long)(start_page + size);
	unsigned long next;
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;

	for (; addr < end; addr = next) {
		void *p = NULL;

		pgd = vmemmap_pgd_populate(addr, node);
		if (!pgd)
			return -ENOMEM;

		pud = vmemmap_pud_populate(pgd, addr, node);
		if (!pud)
			return -ENOMEM;

		if (!cpu_has_pse) {
			next = (addr + PAGE_SIZE) & PAGE_MASK;
			pmd = vmemmap_pmd_populate(pud, addr, node);

			if (!pmd)
				return -ENOMEM;

			p = vmemmap_pte_populate(pmd, addr, node);

			if (!p)
				return -ENOMEM;

			addr_end = addr + PAGE_SIZE;
			p_end = p + PAGE_SIZE;
		} else {
			next = pmd_addr_end(addr, end);

			pmd = pmd_offset(pud, addr);
			if (pmd_none(*pmd)) {
				pte_t entry;

				p = vmemmap_alloc_block(PMD_SIZE, node);
				if (!p)
					return -ENOMEM;

				entry = pfn_pte(__pa(p) >> PAGE_SHIFT,
						PAGE_KERNEL_LARGE);
				set_pmd(pmd, __pmd(pte_val(entry)));

				/* check to see if we have contiguous blocks */
				if (p_end != p || node_start != node) {
					if (p_start)
						printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
						       addr_start, addr_end-1, p_start, p_end-1, node_start);
					addr_start = addr;
					node_start = node;
					p_start = p;
				}

				addr_end = addr + PMD_SIZE;
				p_end = p + PMD_SIZE;
			} else
				vmemmap_verify((pte_t *)pmd, node, addr, next);
		}

	}
	return 0;
}

void __meminit vmemmap_populate_print_last(void)
{
	if (p_start) {
		printk(KERN_DEBUG " [%lx-%lx] PMD -> [%p-%p] on node %d\n",
			addr_start, addr_end-1, p_start, p_end-1, node_start);
		p_start = NULL;
		p_end = NULL;
		node_start = 0;
	}
}
#endif