mm: reinstate ZERO_PAGE

KAMEZAWA Hiroyuki has observed customers of earlier kernels taking advantage of the ZERO_PAGE: which we stopped do_anonymous_page() from using in 2.6.24. And there were a couple of regression reports on LKML. Following suggestions from Linus, reinstate do_anonymous_page() use of the ZERO_PAGE; but this time avoid dirtying its struct page cacheline with (map)count updates - let vm_normal_page() regard it as abnormal. Use it only on arches which __HAVE_ARCH_PTE_SPECIAL (x86, s390, sh32, most powerpc): that's not essential, but minimizes additional branches (keeping them in the unlikely pte_special case); and incidentally excludes mips (some models of which needed eight colours of ZERO_PAGE to avoid costly exceptions). Don't be fanatical about avoiding ZERO_PAGE updates: get_user_pages() callers won't want to make exceptions for it, so increment its count there. Changes to mlock and migration? happily seems not needed. In most places it's quicker to check pfn than struct page address: prepare a __read_mostly zero_pfn for that. Does get_dump_page() still need its ZERO_PAGE check? probably not, but keep it anyway. Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk> Acked-by: Rik van Riel <riel@redhat.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

mm: reinstate ZERO_PAGE
KAMEZAWA Hiroyuki has observed customers of earlier kernels taking advantage of the ZERO_PAGE: which we stopped do_anonymous_page() from using in 2.6.24. And there were a couple of regression reports on LKML. Following suggestions from Linus, reinstate do_anonymous_page() use of the ZERO_PAGE; but this time avoid dirtying its struct page cacheline with (map)count updates - let vm_normal_page() regard it as abnormal. Use it only on arches which __HAVE_ARCH_PTE_SPECIAL (x86, s390, sh32, most powerpc): that's not essential, but minimizes additional branches (keeping them in the unlikely pte_special case); and incidentally excludes mips (some models of which needed eight colours of ZERO_PAGE to avoid costly exceptions). Don't be fanatical about avoiding ZERO_PAGE updates: get_user_pages() callers won't want to make exceptions for it, so increment its count there. Changes to mlock and migration? happily seems not needed. In most places it's quicker to check pfn than struct page address: prepare a __read_mostly zero_pfn for that. Does get_dump_page() still need its ZERO_PAGE check? probably not, but keep it anyway. Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk> Acked-by: Rik van Riel <riel@redhat.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Nick Piggin <npiggin@suse.de> Cc: Mel Gorman <mel@csn.ul.ie> Cc: Minchan Kim <minchan.kim@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
a13ea5b7 · Hugh Dickins · Linus Torvalds · 1ac0cb5d · a13ea5b7
Commit a13ea5b7 authored Sep 21, 2009 by Hugh Dickins Committed by Linus Torvalds Sep 22, 2009
Hide whitespace changes
Inline Side-by-side

Showing with 44 additions and 9 deletions

mm/memory.c mm/memory.c +44 -9

No files found.
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -108,6 +108,17 @@ static int __init disable_randmaps(char *s)
 }
 __setup("norandmaps", disable_randmaps);
+static unsigned long zero_pfn __read_mostly;
+/*
+ * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
+ */
+static int __init init_zero_pfn(void)
+{
+	zero_pfn = page_to_pfn(ZERO_PAGE(0));
+	return 0;
+}
+core_initcall(init_zero_pfn);
 /*
 * If a p?d_bad entry is found while walking page tables, report
@@ -499,7 +510,9 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 	if (HAVE_PTE_SPECIAL) {
 		if (likely(!pte_special(pte)))
 			goto check_pfn;
-		if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
+		if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
+			return NULL;
+		if (pfn != zero_pfn)
 			print_bad_pte(vma, addr, pte, NULL);
 		return NULL;
 	}
@@ -1144,9 +1157,14 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 		goto no_page;
 	if ((flags & FOLL_WRITE) && !pte_write(pte))
 		goto unlock;
 	page = vm_normal_page(vma, address, pte);
-	if (unlikely(!page))
+	if (unlikely(!page)) {
-		goto bad_page;
+		if ((flags & FOLL_DUMP) ||
+		    pte_pfn(pte) != zero_pfn)
+			goto bad_page;
+		page = pte_page(pte);
+	}
 	if (flags & FOLL_GET)
 		get_page(page);
@@ -2084,10 +2102,19 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
-	VM_BUG_ON(old_page == ZERO_PAGE(0));
-	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+	if (pte_pfn(orig_pte) == zero_pfn) {
-	if (!new_page)
+		new_page = alloc_zeroed_user_highpage_movable(vma, address);
-		goto oom;
+		if (!new_page)
+			goto oom;
+	} else {
+		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+		if (!new_page)
+			goto oom;
+		cow_user_page(new_page, old_page, address, vma);
+	}
+	__SetPageUptodate(new_page);
 	/*
 	 * Don't let another task, with possibly unlocked vma,
 	 * keep the mlocked page.
@@ -2097,8 +2124,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		clear_page_mlock(old_page);
 		unlock_page(old_page);
 	}
-	cow_user_page(new_page, old_page, address, vma);
-	__SetPageUptodate(new_page);
 	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
 		goto oom_free_new;
@@ -2639,6 +2664,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	pte_t entry;
+	if (HAVE_PTE_SPECIAL && !(flags & FAULT_FLAG_WRITE)) {
+		entry = pte_mkspecial(pfn_pte(zero_pfn, vma->vm_page_prot));
+		ptl = pte_lockptr(mm, pmd);
+		spin_lock(ptl);
+		if (!pte_none(*page_table))
+			goto unlock;
+		goto setpte;
+	}
 	/* Allocate our own private page. */
 	pte_unmap(page_table);
@@ -2662,6 +2696,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	inc_mm_counter(mm, anon_rss);
 	page_add_new_anon_rmap(page, vma, address);
+setpte:
 	set_pte_at(mm, address, page_table, entry);
 	/* No need to invalidate - it was non-present before */