Commit 0fe6e20b authored by Naoya Horiguchi, committed by Andi Kleen

hugetlb, rmap: add reverse mapping for hugepage

This patch adds a reverse mapping feature for hugepages by introducing
a mapcount for shared/privately mapped hugepages and an anon_vma for
privately mapped hugepages.

While hugepages are not currently swappable, reverse mapping is still
useful for the memory error handler.

Without this patch, the memory error handler can neither identify the
processes using a bad hugepage nor unmap it from them. That is:
- for a shared hugepage:
  we can collect the processes using it through the pagecache,
  but cannot unmap the hugepage because of the lack of a mapcount.
- for a privately mapped hugepage:
  we can neither collect the processes nor unmap the hugepage.
This patch solves these problems.
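
As background (not part of the patch), the two cases above correspond to
MAP_SHARED and MAP_PRIVATE hugepage mappings, which can be created from
userspace. A minimal sketch, assuming a kernel with MAP_HUGETLB support,
hugepages reserved via /proc/sys/vm/nr_hugepages, and an assumed default
hugepage size of 2 MB:

  #define _GNU_SOURCE
  #include <stdio.h>
  #include <string.h>
  #include <sys/mman.h>

  #define HPAGE_LEN (2UL * 1024 * 1024)   /* assumed default hugepage size */

  int main(void)
  {
          /* shared hugepage: discoverable through the hugetlbfs pagecache */
          void *shared = mmap(NULL, HPAGE_LEN, PROT_READ | PROT_WRITE,
                              MAP_SHARED | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
          /* private hugepage: anonymous, only reachable via reverse mapping */
          void *priv = mmap(NULL, HPAGE_LEN, PROT_READ | PROT_WRITE,
                            MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);

          if (shared == MAP_FAILED || priv == MAP_FAILED) {
                  perror("mmap");
                  return 1;
          }
          memset(shared, 0x11, HPAGE_LEN);        /* fault the pages in */
          memset(priv, 0x22, HPAGE_LEN);
          printf("shared=%p private=%p\n", shared, priv);
          munmap(shared, HPAGE_LEN);
          munmap(priv, HPAGE_LEN);
          return 0;
  }

A poisoned page in the first mapping can be found through the hugetlbfs
pagecache; one in the second is only reachable through the reverse mapping
this patch adds.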

This patch includes the bug fix given by commit 23be7468, so that commit is reverted.

Dependency:
  "hugetlb: move definition of is_vm_hugetlb_page() to hugepage_inline.h"

ChangeLog since May 24.
- create hugetlb_inline.h and move is_vm_hugetlb_page() into it.
- move functions setting up anon_vma for hugepage into mm/rmap.c.

ChangeLog since May 13.
- rebased to 2.6.34
- fix a logic error (in the case where a private mapping and a shared mapping coexist)
- move is_vm_hugetlb_page() into include/linux/mm.h so that it can be used
  from linear_page_index()
- define and use linear_hugepage_index() instead of compound_order()
- use page_move_anon_rmap() in hugetlb_cow()
- copy the exclusive switch of __set_page_anon_rmap() into its hugepage counterpart.
- revert commit 23be7468 completely
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Acked-by: Fengguang Wu <fengguang.wu@intel.com>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andi Kleen <ak@linux.intel.com>
parent 8edf344c
@@ -99,6 +99,7 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
 #define is_hugepage_only_range(mm, addr, len) 0
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
 #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; })
+#define huge_pte_offset(mm, address) 0
 #define hugetlb_change_protection(vma, address, end, newprot)
...
@@ -282,10 +282,16 @@ static inline loff_t page_offset(struct page *page)
 	return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
 }

+extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
+					unsigned long address);
+
 static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
 					unsigned long address)
 {
-	pgoff_t pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
+	pgoff_t pgoff;
+	if (unlikely(is_vm_hugetlb_page(vma)))
+		return linear_hugepage_index(vma, address);
+	pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
 	pgoff += vma->vm_pgoff;
 	return pgoff >> (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 }
...
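The pagemap.h hunk above makes linear_page_index() defer to
linear_hugepage_index() for hugetlb VMAs, which counts offsets in
hugepage-sized rather than PAGE_SIZE units. A standalone sketch of just that
arithmetic, using made-up VMA values and assuming a 4 KB base page and a
2 MB hugepage (it only mirrors the calculation and is not kernel code):

  #include <stdio.h>

  #define PAGE_SHIFT   12
  #define HPAGE_SHIFT  21                          /* assumed 2 MB hugepage */
  #define HPAGE_ORDER  (HPAGE_SHIFT - PAGE_SHIFT)

  int main(void)
  {
          unsigned long vm_start = 0x40000000UL;   /* assumed mapping start */
          unsigned long vm_pgoff = 512;            /* file offset, in 4 KB pages */
          unsigned long address  = 0x40400000UL;   /* faulting address */

          /* linear_page_index() for a normal page: offset in PAGE_SIZE units */
          unsigned long small = ((address - vm_start) >> PAGE_SHIFT) + vm_pgoff;

          /* linear_hugepage_index(): offset in hugepage-sized units, as done
           * by vma_hugecache_offset() in mm/hugetlb.c */
          unsigned long huge = ((address - vm_start) >> HPAGE_SHIFT) +
                               (vm_pgoff >> HPAGE_ORDER);

          printf("small-page index = %lu, hugepage index = %lu\n", small, huge);
          return 0;
  }

With these values the 4 KB index is 1536 and the hugepage index is 3, i.e. the
same offset expressed in 2 MB units.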
@@ -48,15 +48,6 @@
 #define POISON_FREE	0x6b	/* for use-after-free poisoning */
 #define POISON_END	0xa5	/* end-byte of poisoning */

-/********** mm/hugetlb.c **********/
-/*
- * Private mappings of hugetlb pages use this poisoned value for
- * page->mapping. The core VM should not be doing anything with this mapping
- * but futex requires the existence of some page->mapping value even though it
- * is unused if PAGE_MAPPING_ANON is set.
- */
-#define HUGETLB_POISON	((void *)(0x00300300 + POISON_POINTER_DELTA + PAGE_MAPPING_ANON))
-
 /********** arch/$ARCH/mm/init.c **********/
 #define POISON_FREE_INITMEM	0xcc
...
@@ -140,6 +140,11 @@ void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
 void page_add_file_rmap(struct page *);
 void page_remove_rmap(struct page *);

+void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
+		unsigned long);
+void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
+		unsigned long);
+
 static inline void page_dup_rmap(struct page *page)
 {
 	atomic_inc(&page->_mapcount);
...
@@ -18,6 +18,7 @@
 #include <linux/bootmem.h>
 #include <linux/sysfs.h>
 #include <linux/slab.h>
+#include <linux/rmap.h>

 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -220,6 +221,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
 			(vma->vm_pgoff >> huge_page_order(h));
 }

+pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
+			unsigned long address)
+{
+	return vma_hugecache_offset(hstate_vma(vma), vma, address);
+}
+
 /*
  * Return the size of the pages allocated when backing a VMA. In the majority
  * cases this will be same size as used by the page table entries.
@@ -552,6 +559,7 @@ static void free_huge_page(struct page *page)
 	set_page_private(page, 0);
 	page->mapping = NULL;
 	BUG_ON(page_count(page));
+	BUG_ON(page_mapcount(page));
 	INIT_LIST_HEAD(&page->lru);

 	spin_lock(&hugetlb_lock);
@@ -2129,6 +2137,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 			entry = huge_ptep_get(src_pte);
 			ptepage = pte_page(entry);
 			get_page(ptepage);
+			page_dup_rmap(ptepage);
 			set_huge_pte_at(dst, addr, dst_pte, entry);
 		}
 		spin_unlock(&src->page_table_lock);
@@ -2207,6 +2216,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
 	flush_tlb_range(vma, start, end);
 	mmu_notifier_invalidate_range_end(mm, start, end);
 	list_for_each_entry_safe(page, tmp, &page_list, lru) {
+		page_remove_rmap(page);
 		list_del(&page->lru);
 		put_page(page);
 	}
@@ -2272,6 +2282,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
 	return 1;
 }

+/*
+ * Hugetlb_cow() should be called with page lock of the original hugepage held.
+ */
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, pte_t pte,
 			struct page *pagecache_page)
@@ -2286,8 +2299,11 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 retry_avoidcopy:
 	/* If no-one else is actually using this page, avoid the copy
 	 * and just make the page writable */
-	avoidcopy = (page_count(old_page) == 1);
+	avoidcopy = (page_mapcount(old_page) == 1);
 	if (avoidcopy) {
+		if (!trylock_page(old_page))
+			if (PageAnon(old_page))
+				page_move_anon_rmap(old_page, vma, address);
 		set_huge_ptep_writable(vma, address, ptep);
 		return 0;
 	}
@@ -2338,6 +2354,13 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 		return -PTR_ERR(new_page);
 	}

+	/*
+	 * When the original hugepage is shared one, it does not have
+	 * anon_vma prepared.
+	 */
+	if (unlikely(anon_vma_prepare(vma)))
+		return VM_FAULT_OOM;
+
 	copy_huge_page(new_page, old_page, address, vma);
 	__SetPageUptodate(new_page);
@@ -2352,6 +2375,8 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 		huge_ptep_clear_flush(vma, address, ptep);
 		set_huge_pte_at(mm, address, ptep,
 				make_huge_pte(vma, new_page, 1));
+		page_remove_rmap(old_page);
+		hugepage_add_anon_rmap(new_page, vma, address);
 		/* Make the old page be freed below */
 		new_page = old_page;
 	}
@@ -2452,10 +2477,17 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			spin_lock(&inode->i_lock);
 			inode->i_blocks += blocks_per_huge_page(h);
 			spin_unlock(&inode->i_lock);
+			page_dup_rmap(page);
 		} else {
 			lock_page(page);
-			page->mapping = HUGETLB_POISON;
+			if (unlikely(anon_vma_prepare(vma))) {
+				ret = VM_FAULT_OOM;
+				goto backout_unlocked;
+			}
+			hugepage_add_new_anon_rmap(page, vma, address);
 		}
+	} else {
+		page_dup_rmap(page);
 	}

 	/*
@@ -2507,6 +2539,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t *ptep;
 	pte_t entry;
 	int ret;
+	struct page *page = NULL;
 	struct page *pagecache_page = NULL;
 	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 	struct hstate *h = hstate_vma(vma);
@@ -2548,6 +2581,11 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 							vma, address);
 	}

+	if (!pagecache_page) {
+		page = pte_page(entry);
+		lock_page(page);
+	}
+
 	spin_lock(&mm->page_table_lock);
 	/* Check for a racing update before calling hugetlb_cow */
 	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
@@ -2573,6 +2611,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (pagecache_page) {
 		unlock_page(pagecache_page);
 		put_page(pagecache_page);
+	} else {
+		unlock_page(page);
 	}

 out_mutex:
...
@@ -56,6 +56,7 @@
 #include <linux/memcontrol.h>
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
+#include <linux/hugetlb.h>

 #include <asm/tlbflush.h>
@@ -326,6 +327,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
 	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 	unsigned long address;

+	if (unlikely(is_vm_hugetlb_page(vma)))
+		pgoff = page->index << huge_page_order(page_hstate(page));
 	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
 	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
 		/* page should be within @vma mapping range */
@@ -369,6 +372,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
 	pte_t *pte;
 	spinlock_t *ptl;

+	if (unlikely(PageHuge(page))) {
+		pte = huge_pte_offset(mm, address);
+		ptl = &mm->page_table_lock;
+		goto check;
+	}
+
 	pgd = pgd_offset(mm, address);
 	if (!pgd_present(*pgd))
 		return NULL;
@@ -389,6 +398,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
 	}

 	ptl = pte_lockptr(mm, pmd);
+check:
 	spin_lock(ptl);
 	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
 		*ptlp = ptl;
@@ -873,6 +883,12 @@ void page_remove_rmap(struct page *page)
 		page_clear_dirty(page);
 		set_page_dirty(page);
 	}
+	/*
+	 * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
+	 * and not charged by memcg for now.
+	 */
+	if (unlikely(PageHuge(page)))
+		return;
 	if (PageAnon(page)) {
 		mem_cgroup_uncharge_page(page);
 		__dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1445,3 +1461,46 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
 		return rmap_walk_file(page, rmap_one, arg);
 }
 #endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_HUGETLBFS
+/*
+ * The following three functions are for anonymous (private mapped) hugepages.
+ * Unlike common anonymous pages, anonymous hugepages have no accounting code
+ * and no lru code, because we handle hugepages differently from common pages.
+ */
+static void __hugepage_set_anon_rmap(struct page *page,
+	struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+	BUG_ON(!anon_vma);
+	if (!exclusive) {
+		struct anon_vma_chain *avc;
+		avc = list_entry(vma->anon_vma_chain.prev,
+				 struct anon_vma_chain, same_vma);
+		anon_vma = avc->anon_vma;
+	}
+	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+	page->mapping = (struct address_space *) anon_vma;
+	page->index = linear_page_index(vma, address);
+}
+
+void hugepage_add_anon_rmap(struct page *page,
+			    struct vm_area_struct *vma, unsigned long address)
+{
+	struct anon_vma *anon_vma = vma->anon_vma;
+	int first;
+	BUG_ON(!anon_vma);
+	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+	first = atomic_inc_and_test(&page->_mapcount);
+	if (first)
+		__hugepage_set_anon_rmap(page, vma, address, 0);
+}
+
+void hugepage_add_new_anon_rmap(struct page *page,
+			struct vm_area_struct *vma, unsigned long address)
+{
+	BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+	atomic_set(&page->_mapcount, 0);
+	__hugepage_set_anon_rmap(page, vma, address, 1);
+}
+#endif /* CONFIG_HUGETLBFS */
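
A note on __hugepage_set_anon_rmap() above: page->mapping holds the anon_vma
pointer with PAGE_MAPPING_ANON (bit 0) set, the same tagging used for regular
anonymous pages, which is why PageAnon() works unchanged on hugepages. A
minimal standalone sketch of that low-bit tagging, using stand-in types rather
than the kernel structures:

  #include <stdio.h>
  #include <stdint.h>

  #define PAGE_MAPPING_ANON 1UL   /* matches the kernel's value of 1 */

  struct anon_vma { int dummy; }; /* stand-in, not the kernel type */

  int main(void)
  {
          static struct anon_vma av;       /* word-aligned, so bit 0 is free */
          /* tag the pointer, as __hugepage_set_anon_rmap() does for page->mapping */
          uintptr_t mapping = (uintptr_t)&av | PAGE_MAPPING_ANON;

          int is_anon = mapping & PAGE_MAPPING_ANON;     /* the PageAnon() test */
          /* mask the tag off to recover the original anon_vma pointer */
          struct anon_vma *back =
                  (struct anon_vma *)(mapping & ~PAGE_MAPPING_ANON);

          printf("anon=%d, recovered pointer matches: %d\n",
                 is_anon, back == &av);
          return 0;
  }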