Commit f1ebdd60 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6

* 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6: (22 commits)
  Add _addr_lsb field to ia64 siginfo
  Fix migration.c compilation on s390
  HWPOISON: Remove retry loop for try_to_unmap
  HWPOISON: Turn addr_valid from bitfield into char
  HWPOISON: Disable DEBUG by default
  HWPOISON: Convert pr_debugs to pr_info
  HWPOISON: Improve comments in memory-failure.c
  x86: HWPOISON: Report correct address granuality for huge hwpoison faults
  Encode huge page size for VM_FAULT_HWPOISON errors
  Fix build error with !CONFIG_MIGRATION
  hugepage: move is_hugepage_on_freelist inside ifdef to avoid warning
  Clean up __page_set_anon_rmap
  HWPOISON, hugetlb: fix unpoison for hugepage
  HWPOISON, hugetlb: soft offlining for hugepage
  HWPOSION, hugetlb: recover from free hugepage error when !MF_COUNT_INCREASED
  hugetlb: move refcounting in hugepage allocation inside hugetlb_lock
  HWPOISON, hugetlb: add free check to dequeue_hwpoison_huge_page()
  hugetlb: hugepage migration core
  hugetlb: redefine hugepage copy functions
  hugetlb: add allocate function for hugepage migration
  ...
parents f99d0553 46e387bb
...@@ -62,6 +62,7 @@ typedef struct siginfo { ...@@ -62,6 +62,7 @@ typedef struct siginfo {
int _imm; /* immediate value for "break" */ int _imm; /* immediate value for "break" */
unsigned int _flags; /* see below */ unsigned int _flags; /* see below */
unsigned long _isr; /* isr */ unsigned long _isr; /* isr */
short _addr_lsb; /* lsb of faulting address */
} _sigfault; } _sigfault;
/* SIGPOLL */ /* SIGPOLL */
......
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
#include <linux/kprobes.h> /* __kprobes, ... */ #include <linux/kprobes.h> /* __kprobes, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */ #include <linux/mmiotrace.h> /* kmmio_handler, ... */
#include <linux/perf_event.h> /* perf_sw_event */ #include <linux/perf_event.h> /* perf_sw_event */
#include <linux/hugetlb.h> /* hstate_index_to_shift */
#include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */ #include <asm/pgalloc.h> /* pgd_*(), ... */
...@@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) ...@@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
static void static void
force_sig_info_fault(int si_signo, int si_code, unsigned long address, force_sig_info_fault(int si_signo, int si_code, unsigned long address,
struct task_struct *tsk) struct task_struct *tsk, int fault)
{ {
unsigned lsb = 0;
siginfo_t info; siginfo_t info;
info.si_signo = si_signo; info.si_signo = si_signo;
info.si_errno = 0; info.si_errno = 0;
info.si_code = si_code; info.si_code = si_code;
info.si_addr = (void __user *)address; info.si_addr = (void __user *)address;
info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; if (fault & VM_FAULT_HWPOISON_LARGE)
lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
if (fault & VM_FAULT_HWPOISON)
lsb = PAGE_SHIFT;
info.si_addr_lsb = lsb;
force_sig_info(si_signo, &info, tsk); force_sig_info(si_signo, &info, tsk);
} }
...@@ -722,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, ...@@ -722,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
tsk->thread.error_code = error_code | (address >= TASK_SIZE); tsk->thread.error_code = error_code | (address >= TASK_SIZE);
tsk->thread.trap_no = 14; tsk->thread.trap_no = 14;
force_sig_info_fault(SIGSEGV, si_code, address, tsk); force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
return; return;
} }
...@@ -807,14 +813,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, ...@@ -807,14 +813,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
tsk->thread.trap_no = 14; tsk->thread.trap_no = 14;
#ifdef CONFIG_MEMORY_FAILURE #ifdef CONFIG_MEMORY_FAILURE
if (fault & VM_FAULT_HWPOISON) { if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
printk(KERN_ERR printk(KERN_ERR
"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
tsk->comm, tsk->pid, address); tsk->comm, tsk->pid, address);
code = BUS_MCEERR_AR; code = BUS_MCEERR_AR;
} }
#endif #endif
force_sig_info_fault(SIGBUS, code, address, tsk); force_sig_info_fault(SIGBUS, code, address, tsk, fault);
} }
static noinline void static noinline void
...@@ -824,7 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, ...@@ -824,7 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
if (fault & VM_FAULT_OOM) { if (fault & VM_FAULT_OOM) {
out_of_memory(regs, error_code, address); out_of_memory(regs, error_code, address);
} else { } else {
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
VM_FAULT_HWPOISON_LARGE))
do_sigbus(regs, error_code, address, fault); do_sigbus(regs, error_code, address, fault);
else else
BUG(); BUG();
......
...@@ -31,6 +31,7 @@ ...@@ -31,6 +31,7 @@
#include <linux/statfs.h> #include <linux/statfs.h>
#include <linux/security.h> #include <linux/security.h>
#include <linux/magic.h> #include <linux/magic.h>
#include <linux/migrate.h>
#include <asm/uaccess.h> #include <asm/uaccess.h>
...@@ -573,6 +574,19 @@ static int hugetlbfs_set_page_dirty(struct page *page) ...@@ -573,6 +574,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
return 0; return 0;
} }
/*
 * ->migratepage callback for hugetlbfs.
 *
 * First asks migrate_huge_page_move_mapping() to switch @mapping over from
 * @page to @newpage. Only if that succeeds (returns 0) is
 * migrate_page_copy() called for the pair.
 *
 * Returns 0 on success, otherwise the non-zero value reported by
 * migrate_huge_page_move_mapping().
 */
static int hugetlbfs_migrate_page(struct address_space *mapping,
				struct page *newpage, struct page *page)
{
	int err = migrate_huge_page_move_mapping(mapping, newpage, page);

	if (!err)
		migrate_page_copy(newpage, page);

	return err;
}
static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{ {
struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
...@@ -659,6 +673,7 @@ static const struct address_space_operations hugetlbfs_aops = { ...@@ -659,6 +673,7 @@ static const struct address_space_operations hugetlbfs_aops = {
.write_begin = hugetlbfs_write_begin, .write_begin = hugetlbfs_write_begin,
.write_end = hugetlbfs_write_end, .write_end = hugetlbfs_write_end,
.set_page_dirty = hugetlbfs_set_page_dirty, .set_page_dirty = hugetlbfs_set_page_dirty,
.migratepage = hugetlbfs_migrate_page,
}; };
......
...@@ -98,6 +98,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, ...@@ -98,6 +98,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
err |= __put_user((long) kinfo->si_addr, &uinfo->ssi_addr); err |= __put_user((long) kinfo->si_addr, &uinfo->ssi_addr);
#ifdef __ARCH_SI_TRAPNO #ifdef __ARCH_SI_TRAPNO
err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno); err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
#endif
#ifdef BUS_MCEERR_AO
/*
* Other callers might not initialize the si_lsb field,
* so check explicitly for the right codes here.
*/
if (kinfo->si_code == BUS_MCEERR_AR ||
kinfo->si_code == BUS_MCEERR_AO)
err |= __put_user((short) kinfo->si_addr_lsb,
&uinfo->ssi_addr_lsb);
#endif #endif
break; break;
case __SI_CHLD: case __SI_CHLD:
......
...@@ -43,7 +43,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to, ...@@ -43,7 +43,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to,
struct vm_area_struct *vma, struct vm_area_struct *vma,
int acctflags); int acctflags);
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
void __isolate_hwpoisoned_huge_page(struct page *page); int dequeue_hwpoisoned_huge_page(struct page *page);
void copy_huge_page(struct page *dst, struct page *src);
extern unsigned long hugepages_treat_as_movable; extern unsigned long hugepages_treat_as_movable;
extern const unsigned long hugetlb_zero, hugetlb_infinity; extern const unsigned long hugetlb_zero, hugetlb_infinity;
...@@ -101,7 +102,10 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) ...@@ -101,7 +102,10 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
#define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; })
#define huge_pte_offset(mm, address) 0 #define huge_pte_offset(mm, address) 0
#define __isolate_hwpoisoned_huge_page(page) 0 #define dequeue_hwpoisoned_huge_page(page) 0
/*
 * No-op stand-in for copy_huge_page(): accepts the same arguments and
 * copies nothing.
 * NOTE(review): presumably this is the !CONFIG_HUGETLB_PAGE branch of the
 * header -- the enclosing #ifdef is not visible here, confirm.
 */
static inline void copy_huge_page(struct page *dst, struct page *src)
{
}
#define hugetlb_change_protection(vma, address, end, newprot) #define hugetlb_change_protection(vma, address, end, newprot)
...@@ -228,6 +232,8 @@ struct huge_bootmem_page { ...@@ -228,6 +232,8 @@ struct huge_bootmem_page {
struct hstate *hstate; struct hstate *hstate;
}; };
struct page *alloc_huge_page_node(struct hstate *h, int nid);
/* arch callback */ /* arch callback */
int __init alloc_bootmem_huge_page(struct hstate *h); int __init alloc_bootmem_huge_page(struct hstate *h);
...@@ -301,8 +307,14 @@ static inline struct hstate *page_hstate(struct page *page) ...@@ -301,8 +307,14 @@ static inline struct hstate *page_hstate(struct page *page)
return size_to_hstate(PAGE_SIZE << compound_order(page)); return size_to_hstate(PAGE_SIZE << compound_order(page));
} }
static inline unsigned hstate_index_to_shift(unsigned index)
{
return hstates[index].order + PAGE_SHIFT;
}
#else #else
struct hstate {}; struct hstate {};
#define alloc_huge_page_node(h, nid) NULL
#define alloc_bootmem_huge_page(h) NULL #define alloc_bootmem_huge_page(h) NULL
#define hstate_file(f) NULL #define hstate_file(f) NULL
#define hstate_vma(v) NULL #define hstate_vma(v) NULL
...@@ -317,6 +329,7 @@ static inline unsigned int pages_per_huge_page(struct hstate *h) ...@@ -317,6 +329,7 @@ static inline unsigned int pages_per_huge_page(struct hstate *h)
{ {
return 1; return 1;
} }
#define hstate_index_to_shift(index) 0
#endif #endif
#endif /* _LINUX_HUGETLB_H */ #endif /* _LINUX_HUGETLB_H */
...@@ -14,6 +14,8 @@ extern int migrate_page(struct address_space *, ...@@ -14,6 +14,8 @@ extern int migrate_page(struct address_space *,
struct page *, struct page *); struct page *, struct page *);
extern int migrate_pages(struct list_head *l, new_page_t x, extern int migrate_pages(struct list_head *l, new_page_t x,
unsigned long private, int offlining); unsigned long private, int offlining);
extern int migrate_huge_pages(struct list_head *l, new_page_t x,
unsigned long private, int offlining);
extern int fail_migrate_page(struct address_space *, extern int fail_migrate_page(struct address_space *,
struct page *, struct page *); struct page *, struct page *);
...@@ -23,12 +25,17 @@ extern int migrate_prep_local(void); ...@@ -23,12 +25,17 @@ extern int migrate_prep_local(void);
extern int migrate_vmas(struct mm_struct *mm, extern int migrate_vmas(struct mm_struct *mm,
const nodemask_t *from, const nodemask_t *to, const nodemask_t *from, const nodemask_t *to,
unsigned long flags); unsigned long flags);
extern void migrate_page_copy(struct page *newpage, struct page *page);
extern int migrate_huge_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page);
#else #else
#define PAGE_MIGRATION 0 #define PAGE_MIGRATION 0
static inline void putback_lru_pages(struct list_head *l) {} static inline void putback_lru_pages(struct list_head *l) {}
static inline int migrate_pages(struct list_head *l, new_page_t x, static inline int migrate_pages(struct list_head *l, new_page_t x,
unsigned long private, int offlining) { return -ENOSYS; } unsigned long private, int offlining) { return -ENOSYS; }
/*
 * Stub in the #else branch (where PAGE_MIGRATION is defined as 0):
 * performs no migration and always returns -ENOSYS.
 */
static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
unsigned long private, int offlining) { return -ENOSYS; }
static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; }
static inline int migrate_prep_local(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; }
...@@ -40,6 +47,15 @@ static inline int migrate_vmas(struct mm_struct *mm, ...@@ -40,6 +47,15 @@ static inline int migrate_vmas(struct mm_struct *mm,
return -ENOSYS; return -ENOSYS;
} }
/*
 * Empty stand-in for migrate_page_copy(): does nothing, so callers still
 * compile.
 * NOTE(review): presumably part of the migration-disabled stubs -- the
 * enclosing preprocessor condition is outside this hunk, confirm.
 */
static inline void migrate_page_copy(struct page *newpage,
struct page *page) {}
/*
 * Stand-in for migrate_huge_page_move_mapping(): touches none of its
 * arguments and always fails with -ENOSYS.
 * NOTE(review): presumably part of the migration-disabled stubs -- the
 * enclosing preprocessor condition is outside this hunk, confirm.
 */
static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page)
{
return -ENOSYS;
}
/* Possible settings for the migrate_page() method in address_operations */ /* Possible settings for the migrate_page() method in address_operations */
#define migrate_page NULL #define migrate_page NULL
#define fail_migrate_page NULL #define fail_migrate_page NULL
......
...@@ -718,12 +718,20 @@ static inline int page_mapped(struct page *page) ...@@ -718,12 +718,20 @@ static inline int page_mapped(struct page *page)
#define VM_FAULT_SIGBUS 0x0002 #define VM_FAULT_SIGBUS 0x0002
#define VM_FAULT_MAJOR 0x0004 #define VM_FAULT_MAJOR 0x0004
#define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */
#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned page */ #define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */
#define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */
#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON) #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \
VM_FAULT_HWPOISON_LARGE)
/* Encode hstate index for a hwpoisoned large page */
#define VM_FAULT_SET_HINDEX(x) ((x) << 12)
#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)
/* /*
* Can be called by the pagefault handler when it gets a VM_FAULT_OOM. * Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
......
...@@ -33,6 +33,7 @@ struct signalfd_siginfo { ...@@ -33,6 +33,7 @@ struct signalfd_siginfo {
__u64 ssi_utime; __u64 ssi_utime;
__u64 ssi_stime; __u64 ssi_stime;
__u64 ssi_addr; __u64 ssi_addr;
__u16 ssi_addr_lsb;
/* /*
* Pad structure to 128 bytes. Remember to update the * Pad structure to 128 bytes. Remember to update the
...@@ -43,7 +44,7 @@ struct signalfd_siginfo { ...@@ -43,7 +44,7 @@ struct signalfd_siginfo {
* comes out of a read(2) and we really don't want to have * comes out of a read(2) and we really don't want to have
* a compat on read(2). * a compat on read(2).
*/ */
__u8 __pad[48]; __u8 __pad[46];
}; };
......
...@@ -423,14 +423,14 @@ static void clear_huge_page(struct page *page, ...@@ -423,14 +423,14 @@ static void clear_huge_page(struct page *page,
} }
} }
static void copy_gigantic_page(struct page *dst, struct page *src, static void copy_user_gigantic_page(struct page *dst, struct page *src,
unsigned long addr, struct vm_area_struct *vma) unsigned long addr, struct vm_area_struct *vma)
{ {
int i; int i;
struct hstate *h = hstate_vma(vma); struct hstate *h = hstate_vma(vma);
struct page *dst_base = dst; struct page *dst_base = dst;
struct page *src_base = src; struct page *src_base = src;
might_sleep();
for (i = 0; i < pages_per_huge_page(h); ) { for (i = 0; i < pages_per_huge_page(h); ) {
cond_resched(); cond_resched();
copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
...@@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src, ...@@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src,
src = mem_map_next(src, src_base, i); src = mem_map_next(src, src_base, i);
} }
} }
static void copy_huge_page(struct page *dst, struct page *src,
static void copy_user_huge_page(struct page *dst, struct page *src,
unsigned long addr, struct vm_area_struct *vma) unsigned long addr, struct vm_area_struct *vma)
{ {
int i; int i;
struct hstate *h = hstate_vma(vma); struct hstate *h = hstate_vma(vma);
if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
copy_gigantic_page(dst, src, addr, vma); copy_user_gigantic_page(dst, src, addr, vma);
return; return;
} }
...@@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src, ...@@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src,
} }
} }
/*
 * Copy every base page of the huge page starting at @src into the huge page
 * starting at @dst, using copy_highpage() one page at a time.
 *
 * The number of pages comes from the hstate of @src. Successive struct pages
 * are found with mem_map_next() rather than pointer arithmetic.
 * NOTE(review): presumably because the struct pages of a gigantic page need
 * not be contiguous -- confirm against mem_map_next().
 *
 * cond_resched() is called before each page is copied.
 */
static void copy_gigantic_page(struct page *dst, struct page *src)
{
	struct page *const dst_head = dst;
	struct page *const src_head = src;
	struct hstate *h = page_hstate(src);
	int idx = 0;

	while (idx < pages_per_huge_page(h)) {
		cond_resched();
		copy_highpage(dst, src);

		/* Advance both cursors to the page at the next offset. */
		idx++;
		dst = mem_map_next(dst, dst_head, idx);
		src = mem_map_next(src, src_head, idx);
	}
}
/*
 * Copy the contents of huge page @src into huge page @dst.
 *
 * The page count is taken from the hstate of @src. Huge pages with more than
 * MAX_ORDER_NR_PAGES base pages are handed to copy_gigantic_page(); all
 * others are copied page by page with copy_highpage(), indexing directly
 * from @dst and @src.
 *
 * The page-by-page path calls might_sleep() once and cond_resched() before
 * every page.
 */
void copy_huge_page(struct page *dst, struct page *src)
{
	struct hstate *h = page_hstate(src);
	int idx;

	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
		copy_gigantic_page(dst, src);
		return;
	}

	might_sleep();
	for (idx = 0; idx < pages_per_huge_page(h); idx++) {
		cond_resched();
		copy_highpage(&dst[idx], &src[idx]);
	}
}
static void enqueue_huge_page(struct hstate *h, struct page *page) static void enqueue_huge_page(struct hstate *h, struct page *page)
{ {
int nid = page_to_nid(page); int nid = page_to_nid(page);
...@@ -466,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) ...@@ -466,11 +501,24 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
h->free_huge_pages_node[nid]++; h->free_huge_pages_node[nid]++;
} }
static struct page *dequeue_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
if (list_empty(&h->hugepage_freelists[nid]))
return NULL;
page = list_entry(h->hugepage_freelists[nid].next, struct page, lru);
list_del(&page->lru);
set_page_refcounted(page);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
return page;
}
static struct page *dequeue_huge_page_vma(struct hstate *h, static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma, struct vm_area_struct *vma,
unsigned long address, int avoid_reserve) unsigned long address, int avoid_reserve)
{ {
int nid;
struct page *page = NULL; struct page *page = NULL;
struct mempolicy *mpol; struct mempolicy *mpol;
nodemask_t *nodemask; nodemask_t *nodemask;
...@@ -496,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, ...@@ -496,19 +544,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
for_each_zone_zonelist_nodemask(zone, z, zonelist, for_each_zone_zonelist_nodemask(zone, z, zonelist,
MAX_NR_ZONES - 1, nodemask) { MAX_NR_ZONES - 1, nodemask) {
nid = zone_to_nid(zone); if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) {
if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && page = dequeue_huge_page_node(h, zone_to_nid(zone));
!list_empty(&h->hugepage_freelists[nid])) { if (page) {
page = list_entry(h->hugepage_freelists[nid].next, if (!avoid_reserve)
struct page, lru); decrement_hugepage_resv_vma(h, vma);
list_del(&page->lru); break;
h->free_huge_pages--; }
h->free_huge_pages_node[nid]--;
if (!avoid_reserve)
decrement_hugepage_resv_vma(h, vma);
break;
} }
} }
err: err:
...@@ -770,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, ...@@ -770,11 +812,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
return ret; return ret;
} }
static struct page *alloc_buddy_huge_page(struct hstate *h, static struct page *alloc_buddy_huge_page(struct hstate *h, int nid)
struct vm_area_struct *vma, unsigned long address)
{ {
struct page *page; struct page *page;
unsigned int nid; unsigned int r_nid;
if (h->order >= MAX_ORDER) if (h->order >= MAX_ORDER)
return NULL; return NULL;
...@@ -812,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, ...@@ -812,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
} }
spin_unlock(&hugetlb_lock); spin_unlock(&hugetlb_lock);
page = alloc_pages(htlb_alloc_mask|__GFP_COMP| if (nid == NUMA_NO_NODE)
__GFP_REPEAT|__GFP_NOWARN, page = alloc_pages(htlb_alloc_mask|__GFP_COMP|
huge_page_order(h)); __GFP_REPEAT|__GFP_NOWARN,
huge_page_order(h));
else
page = alloc_pages_exact_node(nid,
htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
if (page && arch_prepare_hugepage(page)) { if (page && arch_prepare_hugepage(page)) {
__free_pages(page, huge_page_order(h)); __free_pages(page, huge_page_order(h));
...@@ -823,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, ...@@ -823,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
spin_lock(&hugetlb_lock); spin_lock(&hugetlb_lock);
if (page) { if (page) {
/* r_nid = page_to_nid(page);
* This page is now managed by the hugetlb allocator and has
* no users -- drop the buddy allocator's reference.
*/
put_page_testzero(page);
VM_BUG_ON(page_count(page));
nid = page_to_nid(page);
set_compound_page_dtor(page, free_huge_page); set_compound_page_dtor(page, free_huge_page);
/* /*
* We incremented the global counters already * We incremented the global counters already
*/ */
h->nr_huge_pages_node[nid]++; h->nr_huge_pages_node[r_nid]++;
h->surplus_huge_pages_node[nid]++; h->surplus_huge_pages_node[r_nid]++;
__count_vm_event(HTLB_BUDDY_PGALLOC); __count_vm_event(HTLB_BUDDY_PGALLOC);
} else { } else {
h->nr_huge_pages--; h->nr_huge_pages--;
...@@ -847,6 +887,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, ...@@ -847,6 +887,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
return page; return page;
} }
/*
 * Allocate a huge page for node @nid without reference to any vma.
 *
 * This is useful where the vma is irrelevant, e.g. soft-offlining, which
 * only cares about the physical address of the error page.
 *
 * A free huge page is first dequeued from the node's free list under
 * hugetlb_lock. If none is available, alloc_buddy_huge_page() is tried
 * with the lock dropped. Returns whatever that yields, which may be NULL.
 */
struct page *alloc_huge_page_node(struct hstate *h, int nid)
{
	struct page *page;

	spin_lock(&hugetlb_lock);
	page = dequeue_huge_page_node(h, nid);
	spin_unlock(&hugetlb_lock);

	if (page)
		return page;

	return alloc_buddy_huge_page(h, nid);
}
/* /*
* Increase the hugetlb pool such that it can accommodate a reservation * Increase the hugetlb pool such that it can accommodate a reservation
* of size 'delta'. * of size 'delta'.
...@@ -871,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta) ...@@ -871,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta)
retry: retry:
spin_unlock(&hugetlb_lock); spin_unlock(&hugetlb_lock);
for (i = 0; i < needed; i++) { for (i = 0; i < needed; i++) {
page = alloc_buddy_huge_page(h, NULL, 0); page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
if (!page) { if (!page)
/* /*
* We were not able to allocate enough pages to * We were not able to allocate enough pages to
* satisfy the entire reservation so we free what * satisfy the entire reservation so we free what
* we've allocated so far. * we've allocated so far.
*/ */
spin_lock(&hugetlb_lock);
needed = 0;
goto free; goto free;
}
list_add(&page->lru, &surplus_list); list_add(&page->lru, &surplus_list);
} }
...@@ -908,31 +964,31 @@ static int gather_surplus_pages(struct hstate *h, int delta) ...@@ -908,31 +964,31 @@ static int gather_surplus_pages(struct hstate *h, int delta)
needed += allocated; needed += allocated;
h->resv_huge_pages += delta; h->resv_huge_pages += delta;
ret = 0; ret = 0;
free:
spin_unlock(&hugetlb_lock);
/* Free the needed pages to the hugetlb pool */ /* Free the needed pages to the hugetlb pool */
list_for_each_entry_safe(page, tmp, &surplus_list, lru) { list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
if ((--needed) < 0) if ((--needed) < 0)
break; break;
list_del(&page->lru); list_del(&page->lru);
/*
* This page is now managed by the hugetlb allocator and has
* no users -- drop the buddy allocator's reference.
*/
put_page_testzero(page);
VM_BUG_ON(page_count(page));
enqueue_huge_page(h, page); enqueue_huge_page(h, page);
} }
/* Free unnecessary surplus pages to the buddy allocator */ /* Free unnecessary surplus pages to the buddy allocator */
free:
if (!list_empty(&surplus_list)) { if (!list_empty(&surplus_list)) {
spin_unlock(&hugetlb_lock);
list_for_each_entry_safe(page, tmp, &surplus_list, lru) { list_for_each_entry_safe(page, tmp, &surplus_list, lru) {
list_del(&page->lru); list_del(&page->lru);
/* put_page(page);
* The page has a reference count of zero already, so
* call free_huge_page directly instead of using
* put_page. This must be done with hugetlb_lock
* unlocked which is safe because free_huge_page takes
* hugetlb_lock before deciding how to free the page.
*/
free_huge_page(page);
} }
spin_lock(&hugetlb_lock);
} }
spin_lock(&hugetlb_lock);
return ret; return ret;
} }
...@@ -1052,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, ...@@ -1052,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
spin_unlock(&hugetlb_lock); spin_unlock(&hugetlb_lock);
if (!page) { if (!page) {
page = alloc_buddy_huge_page(h, vma, addr); page = alloc_buddy_huge_page(h, NUMA_NO_NODE);
if (!page) { if (!page) {
hugetlb_put_quota(inode->i_mapping, chg); hugetlb_put_quota(inode->i_mapping, chg);
return ERR_PTR(-VM_FAULT_SIGBUS); return ERR_PTR(-VM_FAULT_SIGBUS);
} }
} }
set_page_refcounted(page);
set_page_private(page, (unsigned long) mapping); set_page_private(page, (unsigned long) mapping);
vma_commit_reservation(h, vma, addr); vma_commit_reservation(h, vma, addr);
...@@ -2153,6 +2208,19 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, ...@@ -2153,6 +2208,19 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
return -ENOMEM; return -ENOMEM;
} }
/*
 * Tell whether huge pte @pte holds a migration entry.
 *
 * An empty pte (huge_pte_none()) or a present pte is never one. Otherwise
 * the pte is decoded as a swap entry and must be both a non-swap entry and
 * a migration entry.
 *
 * Returns 1 for a migration entry, 0 otherwise.
 */
static int is_hugetlb_entry_migration(pte_t pte)
{
	swp_entry_t entry;

	if (huge_pte_none(pte) || pte_present(pte))
		return 0;

	entry = pte_to_swp_entry(pte);
	return non_swap_entry(entry) && is_migration_entry(entry);
}
static int is_hugetlb_entry_hwpoisoned(pte_t pte) static int is_hugetlb_entry_hwpoisoned(pte_t pte)
{ {
swp_entry_t swp; swp_entry_t swp;
...@@ -2383,7 +2451,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -2383,7 +2451,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(anon_vma_prepare(vma))) if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM; return VM_FAULT_OOM;
copy_huge_page(new_page, old_page, address, vma); copy_user_huge_page(new_page, old_page, address, vma);
__SetPageUptodate(new_page); __SetPageUptodate(new_page);
/* /*
...@@ -2515,21 +2583,19 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -2515,21 +2583,19 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
hugepage_add_new_anon_rmap(page, vma, address); hugepage_add_new_anon_rmap(page, vma, address);
} }
} else { } else {
/*
* If memory error occurs between mmap() and fault, some process
* don't have hwpoisoned swap entry for errored virtual address.
* So we need to block hugepage fault by PG_hwpoison bit check.
*/
if (unlikely(PageHWPoison(page))) {
ret = VM_FAULT_HWPOISON |
VM_FAULT_SET_HINDEX(h - hstates);
goto backout_unlocked;
}
page_dup_rmap(page); page_dup_rmap(page);
} }
/*
* Since memory error handler replaces pte into hwpoison swap entry
* at the time of error handling, a process which reserved but not have
* the mapping to the error hugepage does not have hwpoison swap entry.
* So we need to block accesses from such a process by checking
* PG_hwpoison bit here.
*/
if (unlikely(PageHWPoison(page))) {
ret = VM_FAULT_HWPOISON;
goto backout_unlocked;
}
/* /*
* If we are going to COW a private mapping later, we examine the * If we are going to COW a private mapping later, we examine the
* pending reservations for this page now. This will ensure that * pending reservations for this page now. This will ensure that
...@@ -2587,8 +2653,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ...@@ -2587,8 +2653,12 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, address); ptep = huge_pte_offset(mm, address);
if (ptep) { if (ptep) {
entry = huge_ptep_get(ptep); entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) if (unlikely(is_hugetlb_entry_migration(entry))) {
return VM_FAULT_HWPOISON; migration_entry_wait(mm, (pmd_t *)ptep, address);
return 0;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(h - hstates);
} }
ptep = huge_pte_alloc(mm, address, huge_page_size(h)); ptep = huge_pte_alloc(mm, address, huge_page_size(h));
...@@ -2878,18 +2948,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) ...@@ -2878,18 +2948,41 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
hugetlb_acct_memory(h, -(chg - freed)); hugetlb_acct_memory(h, -(chg - freed));
} }
#ifdef CONFIG_MEMORY_FAILURE
/*
 * Report whether @hpage is currently linked on the free list of its hstate
 * for the node the page belongs to.
 *
 * Should be called with hugetlb_lock held.
 *
 * Returns 1 if the page is found on that free list, 0 otherwise.
 */
static int is_hugepage_on_freelist(struct page *hpage)
{
	struct hstate *h = page_hstate(hpage);
	int nid = page_to_nid(hpage);
	struct page *page;

	/*
	 * Read-only walk: nothing is unlinked while iterating, so the plain
	 * iterator suffices and no lookahead cursor is needed.
	 */
	list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
		if (page == hpage)
			return 1;

	return 0;
}
/* /*
* This function is called from memory failure code. * This function is called from memory failure code.
* Assume the caller holds page lock of the head page. * Assume the caller holds page lock of the head page.
*/ */
void __isolate_hwpoisoned_huge_page(struct page *hpage) int dequeue_hwpoisoned_huge_page(struct page *hpage)
{ {
struct hstate *h = page_hstate(hpage); struct hstate *h = page_hstate(hpage);
int nid = page_to_nid(hpage); int nid = page_to_nid(hpage);
int ret = -EBUSY;
spin_lock(&hugetlb_lock); spin_lock(&hugetlb_lock);
list_del(&hpage->lru); if (is_hugepage_on_freelist(hpage)) {
h->free_huge_pages--; list_del(&hpage->lru);
h->free_huge_pages_node[nid]--; set_page_refcounted(hpage);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
ret = 0;
}
spin_unlock(&hugetlb_lock); spin_unlock(&hugetlb_lock);
return ret;
} }
#endif
...@@ -7,21 +7,26 @@ ...@@ -7,21 +7,26 @@
* Free Software Foundation. * Free Software Foundation.
* *
* High level machine check handler. Handles pages reported by the * High level machine check handler. Handles pages reported by the
* hardware as being corrupted usually due to a 2bit ECC memory or cache * hardware as being corrupted usually due to a multi-bit ECC memory or cache
* failure. * failure.
*
* In addition there is a "soft offline" entry point that allows us to stop using
* not-yet-corrupted-but-suspicious pages without killing anything.
* *
* Handles page cache pages in various states. The tricky part * Handles page cache pages in various states. The tricky part
* here is that we can access any page asynchronous to other VM * here is that we can access any page asynchronously in respect to
* users, because memory failures could happen anytime and anywhere, * other VM users, because memory failures could happen anytime and
* possibly violating some of their assumptions. This is why this code * anywhere. This could violate some of their assumptions. This is why
* has to be extremely careful. Generally it tries to use normal locking * this code has to be extremely careful. Generally it tries to use
* rules, as in get the standard locks, even if that means the * normal locking rules, as in get the standard locks, even if that means
* error handling takes potentially a long time. * the error handling takes potentially a long time.
* *
* The operation to map back from RMAP chains to processes has to walk * There are several operations here with exponential complexity because
* the complete process list and has non linear complexity with the number * of unsuitable VM data structures. For example the operation to map back
* mappings. In short it can be quite slow. But since memory corruptions * from RMAP chains to processes has to walk the complete process list and
* are rare we hope to get away with this. * has non linear complexity with the number. But since memory corruptions
* are rare we hope to get away with this. This avoids impacting the core
* VM.
*/ */
/* /*
...@@ -30,7 +35,6 @@ ...@@ -30,7 +35,6 @@
* - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
* - pass bad pages to kdump next kernel * - pass bad pages to kdump next kernel
*/ */
#define DEBUG 1 /* remove me in 2.6.34 */
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/page-flags.h> #include <linux/page-flags.h>
...@@ -78,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p) ...@@ -78,7 +82,7 @@ static int hwpoison_filter_dev(struct page *p)
return 0; return 0;
/* /*
* page_mapping() does not accept slab page * page_mapping() does not accept slab pages.
*/ */
if (PageSlab(p)) if (PageSlab(p))
return -EINVAL; return -EINVAL;
...@@ -268,7 +272,7 @@ struct to_kill { ...@@ -268,7 +272,7 @@ struct to_kill {
struct list_head nd; struct list_head nd;
struct task_struct *tsk; struct task_struct *tsk;
unsigned long addr; unsigned long addr;
unsigned addr_valid:1; char addr_valid;
}; };
/* /*
...@@ -309,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, ...@@ -309,7 +313,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* a SIGKILL because the error is not contained anymore. * a SIGKILL because the error is not contained anymore.
*/ */
if (tk->addr == -EFAULT) { if (tk->addr == -EFAULT) {
pr_debug("MCE: Unable to find user space address %lx in %s\n", pr_info("MCE: Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm); page_to_pfn(p), tsk->comm);
tk->addr_valid = 0; tk->addr_valid = 0;
} }
...@@ -577,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) ...@@ -577,7 +581,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
pfn, err); pfn, err);
} else if (page_has_private(p) && } else if (page_has_private(p) &&
!try_to_release_page(p, GFP_NOIO)) { !try_to_release_page(p, GFP_NOIO)) {
pr_debug("MCE %#lx: failed to release buffers\n", pfn); pr_info("MCE %#lx: failed to release buffers\n", pfn);
} else { } else {
ret = RECOVERED; ret = RECOVERED;
} }
...@@ -693,11 +697,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) ...@@ -693,11 +697,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
* Issues: * Issues:
* - Error on hugepage is contained in hugepage unit (not in raw page unit.) * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
* To narrow down kill region to one page, we need to break up pmd. * To narrow down kill region to one page, we need to break up pmd.
* - To support soft-offlining for hugepage, we need to support hugepage
* migration.
*/ */
static int me_huge_page(struct page *p, unsigned long pfn) static int me_huge_page(struct page *p, unsigned long pfn)
{ {
int res = 0;
struct page *hpage = compound_head(p); struct page *hpage = compound_head(p);
/* /*
* We can safely recover from error on free or reserved (i.e. * We can safely recover from error on free or reserved (i.e.
...@@ -710,8 +713,9 @@ static int me_huge_page(struct page *p, unsigned long pfn) ...@@ -710,8 +713,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
* so there is no race between isolation and mapping/unmapping. * so there is no race between isolation and mapping/unmapping.
*/ */
if (!(page_mapping(hpage) || PageAnon(hpage))) { if (!(page_mapping(hpage) || PageAnon(hpage))) {
__isolate_hwpoisoned_huge_page(hpage); res = dequeue_hwpoisoned_huge_page(hpage);
return RECOVERED; if (!res)
return RECOVERED;
} }
return DELAYED; return DELAYED;
} }
...@@ -836,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p, ...@@ -836,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p,
return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
} }
#define N_UNMAP_TRIES 5
/* /*
* Do all that is necessary to remove user space mappings. Unmap * Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty. * the pages and send SIGBUS to the processes if the data was dirty.
...@@ -849,7 +851,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, ...@@ -849,7 +851,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
struct address_space *mapping; struct address_space *mapping;
LIST_HEAD(tokill); LIST_HEAD(tokill);
int ret; int ret;
int i;
int kill = 1; int kill = 1;
struct page *hpage = compound_head(p); struct page *hpage = compound_head(p);
...@@ -903,17 +904,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, ...@@ -903,17 +904,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (kill) if (kill)
collect_procs(hpage, &tokill); collect_procs(hpage, &tokill);
/* ret = try_to_unmap(hpage, ttu);
* try_to_unmap can fail temporarily due to races.
* Try a few times (RED-PEN better strategy?)
*/
for (i = 0; i < N_UNMAP_TRIES; i++) {
ret = try_to_unmap(hpage, ttu);
if (ret == SWAP_SUCCESS)
break;
pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
}
if (ret != SWAP_SUCCESS) if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage)); pfn, page_mapcount(hpage));
...@@ -981,7 +972,10 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) ...@@ -981,7 +972,10 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* We need/can do nothing about count=0 pages. * We need/can do nothing about count=0 pages.
* 1) it's a free page, and therefore in safe hand: * 1) it's a free page, and therefore in safe hand:
* prep_new_page() will be the gate keeper. * prep_new_page() will be the gate keeper.
* 2) it's part of a non-compound high order page. * 2) it's a free hugepage, which is also safe:
* an affected hugepage will be dequeued from hugepage freelist,
* so there's no concern about reusing it ever after.
* 3) it's part of a non-compound high order page.
* Implies some kernel user: cannot stop them from * Implies some kernel user: cannot stop them from
* R/W the page; let's pray that the page has been * R/W the page; let's pray that the page has been
* used and will be freed some time later. * used and will be freed some time later.
...@@ -993,6 +987,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) ...@@ -993,6 +987,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
if (is_free_buddy_page(p)) { if (is_free_buddy_page(p)) {
action_result(pfn, "free buddy", DELAYED); action_result(pfn, "free buddy", DELAYED);
return 0; return 0;
} else if (PageHuge(hpage)) {
/*
* Check "just unpoisoned", "filter hit", and
* "race with other subpage."
*/
lock_page_nosync(hpage);
if (!PageHWPoison(hpage)
|| (hwpoison_filter(p) && TestClearPageHWPoison(p))
|| (p != hpage && TestSetPageHWPoison(hpage))) {
atomic_long_sub(nr_pages, &mce_bad_pages);
return 0;
}
set_page_hwpoison_huge_page(hpage);
res = dequeue_hwpoisoned_huge_page(hpage);
action_result(pfn, "free huge",
res ? IGNORED : DELAYED);
unlock_page(hpage);
return res;
} else { } else {
action_result(pfn, "high order kernel", IGNORED); action_result(pfn, "high order kernel", IGNORED);
return -EBUSY; return -EBUSY;
...@@ -1147,16 +1159,26 @@ int unpoison_memory(unsigned long pfn) ...@@ -1147,16 +1159,26 @@ int unpoison_memory(unsigned long pfn)
page = compound_head(p); page = compound_head(p);
if (!PageHWPoison(p)) { if (!PageHWPoison(p)) {
pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
return 0; return 0;
} }
nr_pages = 1 << compound_order(page); nr_pages = 1 << compound_order(page);
if (!get_page_unless_zero(page)) { if (!get_page_unless_zero(page)) {
/*
* Since HWPoisoned hugepage should have non-zero refcount,
* race between memory failure and unpoison seems to happen.
* In such case unpoison fails and memory failure runs
* to the end.
*/
if (PageHuge(page)) {
pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
return 0;
}
if (TestClearPageHWPoison(p)) if (TestClearPageHWPoison(p))
atomic_long_sub(nr_pages, &mce_bad_pages); atomic_long_sub(nr_pages, &mce_bad_pages);
pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
return 0; return 0;
} }
...@@ -1168,12 +1190,12 @@ int unpoison_memory(unsigned long pfn) ...@@ -1168,12 +1190,12 @@ int unpoison_memory(unsigned long pfn)
* the free buddy page pool. * the free buddy page pool.
*/ */
if (TestClearPageHWPoison(page)) { if (TestClearPageHWPoison(page)) {
pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
atomic_long_sub(nr_pages, &mce_bad_pages); atomic_long_sub(nr_pages, &mce_bad_pages);
freeit = 1; freeit = 1;
if (PageHuge(page))
clear_page_hwpoison_huge_page(page);
} }
if (PageHuge(p))
clear_page_hwpoison_huge_page(page);
unlock_page(page); unlock_page(page);
put_page(page); put_page(page);
...@@ -1187,7 +1209,11 @@ EXPORT_SYMBOL(unpoison_memory); ...@@ -1187,7 +1209,11 @@ EXPORT_SYMBOL(unpoison_memory);
static struct page *new_page(struct page *p, unsigned long private, int **x) static struct page *new_page(struct page *p, unsigned long private, int **x)
{ {
int nid = page_to_nid(p); int nid = page_to_nid(p);
return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); if (PageHuge(p))
return alloc_huge_page_node(page_hstate(compound_head(p)),
nid);
else
return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
} }
/* /*
...@@ -1215,14 +1241,21 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) ...@@ -1215,14 +1241,21 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
* was free. * was free.
*/ */
set_migratetype_isolate(p); set_migratetype_isolate(p);
/*
* When the target page is a free hugepage, just remove it
* from free hugepage list.
*/
if (!get_page_unless_zero(compound_head(p))) { if (!get_page_unless_zero(compound_head(p))) {
if (is_free_buddy_page(p)) { if (PageHuge(p)) {
pr_debug("get_any_page: %#lx free buddy page\n", pfn); pr_info("get_any_page: %#lx free huge page\n", pfn);
ret = dequeue_hwpoisoned_huge_page(compound_head(p));
} else if (is_free_buddy_page(p)) {
pr_info("get_any_page: %#lx free buddy page\n", pfn);
/* Set hwpoison bit while page is still isolated */ /* Set hwpoison bit while page is still isolated */
SetPageHWPoison(p); SetPageHWPoison(p);
ret = 0; ret = 0;
} else { } else {
pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
pfn, p->flags); pfn, p->flags);
ret = -EIO; ret = -EIO;
} }
...@@ -1235,6 +1268,45 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) ...@@ -1235,6 +1268,45 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
return ret; return ret;
} }
/*
 * Soft offline a hugepage: move its contents elsewhere (unless there is
 * nothing to migrate) and then mark the whole hugepage as HWPoison.
 *
 * @page:  a page belonging to the hugepage; the head page is looked up
 *         with compound_head().
 * @flags: handed unchanged to get_any_page().
 *
 * Returns 0 on success, a negative value from get_any_page() if that fails,
 * -EBUSY if the hugepage is already poisoned, or the migration error
 * (a positive "pages not migrated" result is reported as -EIO).
 *
 * Diagnostics use pr_info() like the rest of the soft offline code, so
 * they stay visible when DEBUG is not defined.
 */
static int soft_offline_huge_page(struct page *page, int flags)
{
	int ret;
	unsigned long pfn = page_to_pfn(page);
	struct page *hpage = compound_head(page);
	LIST_HEAD(pagelist);

	ret = get_any_page(page, pfn, flags);
	if (ret < 0)
		return ret;
	/* 0 from get_any_page(): no migration needed, go straight to poisoning. */
	if (ret == 0)
		goto done;

	if (PageHWPoison(hpage)) {
		/* presumably drops the reference get_any_page() took - TODO confirm */
		put_page(hpage);
		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
		return -EBUSY;
	}

	/* Keep page count to indicate a given hugepage is isolated. */
	list_add(&hpage->lru, &pagelist);
	ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
	if (ret) {
		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
			pfn, ret, page->flags);
		/* A positive value is a count, not an errno; map it to -EIO. */
		if (ret > 0)
			ret = -EIO;
		return ret;
	}
done:
	/* Account the bad pages only once per hugepage. */
	if (!PageHWPoison(hpage))
		atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
	set_page_hwpoison_huge_page(hpage);
	dequeue_hwpoisoned_huge_page(hpage);
	/* keep elevated page count for bad page */
	return ret;
}
/** /**
* soft_offline_page - Soft offline a page. * soft_offline_page - Soft offline a page.
* @page: page to offline * @page: page to offline
...@@ -1262,6 +1334,9 @@ int soft_offline_page(struct page *page, int flags) ...@@ -1262,6 +1334,9 @@ int soft_offline_page(struct page *page, int flags)
int ret; int ret;
unsigned long pfn = page_to_pfn(page); unsigned long pfn = page_to_pfn(page);
if (PageHuge(page))
return soft_offline_huge_page(page, flags);
ret = get_any_page(page, pfn, flags); ret = get_any_page(page, pfn, flags);
if (ret < 0) if (ret < 0)
return ret; return ret;
...@@ -1288,7 +1363,7 @@ int soft_offline_page(struct page *page, int flags) ...@@ -1288,7 +1363,7 @@ int soft_offline_page(struct page *page, int flags)
goto done; goto done;
} }
if (!PageLRU(page)) { if (!PageLRU(page)) {
pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
pfn, page->flags); pfn, page->flags);
return -EIO; return -EIO;
} }
...@@ -1302,7 +1377,7 @@ int soft_offline_page(struct page *page, int flags) ...@@ -1302,7 +1377,7 @@ int soft_offline_page(struct page *page, int flags)
if (PageHWPoison(page)) { if (PageHWPoison(page)) {
unlock_page(page); unlock_page(page);
put_page(page); put_page(page);
pr_debug("soft offline: %#lx page already poisoned\n", pfn); pr_info("soft offline: %#lx page already poisoned\n", pfn);
return -EBUSY; return -EBUSY;
} }
...@@ -1323,7 +1398,7 @@ int soft_offline_page(struct page *page, int flags) ...@@ -1323,7 +1398,7 @@ int soft_offline_page(struct page *page, int flags)
put_page(page); put_page(page);
if (ret == 1) { if (ret == 1) {
ret = 0; ret = 0;
pr_debug("soft_offline: %#lx: invalidated\n", pfn); pr_info("soft_offline: %#lx: invalidated\n", pfn);
goto done; goto done;
} }
...@@ -1339,13 +1414,13 @@ int soft_offline_page(struct page *page, int flags) ...@@ -1339,13 +1414,13 @@ int soft_offline_page(struct page *page, int flags)
list_add(&page->lru, &pagelist); list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
if (ret) { if (ret) {
pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags); pfn, ret, page->flags);
if (ret > 0) if (ret > 0)
ret = -EIO; ret = -EIO;
} }
} else { } else {
pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
pfn, ret, page_count(page), page->flags); pfn, ret, page_count(page), page->flags);
} }
if (ret) if (ret)
......
...@@ -1450,7 +1450,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, ...@@ -1450,7 +1450,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (ret & VM_FAULT_OOM) if (ret & VM_FAULT_OOM)
return i ? i : -ENOMEM; return i ? i : -ENOMEM;
if (ret & if (ret &
(VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
VM_FAULT_SIGBUS))
return i ? i : -EFAULT; return i ? i : -EFAULT;
BUG(); BUG();
} }
......
...@@ -32,6 +32,7 @@ ...@@ -32,6 +32,7 @@
#include <linux/security.h> #include <linux/security.h>
#include <linux/memcontrol.h> #include <linux/memcontrol.h>
#include <linux/syscalls.h> #include <linux/syscalls.h>
#include <linux/hugetlb.h>
#include <linux/gfp.h> #include <linux/gfp.h>
#include "internal.h" #include "internal.h"
...@@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, ...@@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
pte_t *ptep, pte; pte_t *ptep, pte;
spinlock_t *ptl; spinlock_t *ptl;
pgd = pgd_offset(mm, addr); if (unlikely(PageHuge(new))) {
if (!pgd_present(*pgd)) ptep = huge_pte_offset(mm, addr);
goto out; if (!ptep)
goto out;
ptl = &mm->page_table_lock;
} else {
pgd = pgd_offset(mm, addr);
if (!pgd_present(*pgd))
goto out;
pud = pud_offset(pgd, addr); pud = pud_offset(pgd, addr);
if (!pud_present(*pud)) if (!pud_present(*pud))
goto out; goto out;
pmd = pmd_offset(pud, addr); pmd = pmd_offset(pud, addr);
if (!pmd_present(*pmd)) if (!pmd_present(*pmd))
goto out; goto out;
ptep = pte_offset_map(pmd, addr); ptep = pte_offset_map(pmd, addr);
if (!is_swap_pte(*ptep)) { if (!is_swap_pte(*ptep)) {
pte_unmap(ptep); pte_unmap(ptep);
goto out; goto out;
} }
ptl = pte_lockptr(mm, pmd);
}
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl); spin_lock(ptl);
pte = *ptep; pte = *ptep;
if (!is_swap_pte(pte)) if (!is_swap_pte(pte))
...@@ -130,10 +139,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, ...@@ -130,10 +139,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
if (is_write_migration_entry(entry)) if (is_write_migration_entry(entry))
pte = pte_mkwrite(pte); pte = pte_mkwrite(pte);
#ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new))
pte = pte_mkhuge(pte);
#endif
flush_cache_page(vma, addr, pte_pfn(pte)); flush_cache_page(vma, addr, pte_pfn(pte));
set_pte_at(mm, addr, ptep, pte); set_pte_at(mm, addr, ptep, pte);
if (PageAnon(new)) if (PageHuge(new)) {
if (PageAnon(new))
hugepage_add_anon_rmap(new, vma, addr);
else
page_dup_rmap(new);
} else if (PageAnon(new))
page_add_anon_rmap(new, vma, addr); page_add_anon_rmap(new, vma, addr);
else else
page_add_file_rmap(new); page_add_file_rmap(new);
...@@ -275,12 +293,60 @@ static int migrate_page_move_mapping(struct address_space *mapping, ...@@ -275,12 +293,60 @@ static int migrate_page_move_mapping(struct address_space *mapping,
return 0; return 0;
} }
/*
 * Hugepage variant of migrate_page_move_mapping(): replace @page with
 * @newpage in the radix tree of @mapping.
 *
 * The expected number of remaining references is the same as that
 * of migrate_page_move_mapping().
 *
 * Returns 0 on success and -EAGAIN when the reference count or the radix
 * tree slot does not match what is expected.
 */
int migrate_huge_page_move_mapping(struct address_space *mapping,
				   struct page *newpage, struct page *page)
{
	int nr_refs;
	void **slot;
	int rc = -EAGAIN;

	/* Without a mapping only a single reference may be left. */
	if (!mapping)
		return page_count(page) != 1 ? -EAGAIN : 0;

	spin_lock_irq(&mapping->tree_lock);

	slot = radix_tree_lookup_slot(&mapping->page_tree, page_index(page));

	nr_refs = 2 + page_has_private(page);
	if (page_count(page) != nr_refs ||
	    (struct page *)radix_tree_deref_slot(slot) != page)
		goto unlock;

	if (!page_freeze_refs(page, nr_refs))
		goto unlock;

	/* Take the reference for newpage before it becomes visible in the tree. */
	get_page(newpage);

	radix_tree_replace_slot(slot, newpage);

	page_unfreeze_refs(page, nr_refs);

	/* Drop the reference the old page held for its slot in the tree. */
	__put_page(page);

	rc = 0;
unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return rc;
}
/* /*
* Copy the page to its new location * Copy the page to its new location
*/ */
static void migrate_page_copy(struct page *newpage, struct page *page) void migrate_page_copy(struct page *newpage, struct page *page)
{ {
copy_highpage(newpage, page); if (PageHuge(page))
copy_huge_page(newpage, page);
else
copy_highpage(newpage, page);
if (PageError(page)) if (PageError(page))
SetPageError(newpage); SetPageError(newpage);
...@@ -723,6 +789,92 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, ...@@ -723,6 +789,92 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
return rc; return rc;
} }
/*
 * Counterpart of unmap_and_move() for hugepage migration.
 *
 * This function doesn't wait for the completion of hugepage I/O
 * because there is no race between I/O and migration for hugepage.
 * Note that currently hugepage I/O occurs only in direct I/O
 * where no lock is held and PG_writeback is irrelevant,
 * and writeback status of all subpages are counted in the reference
 * count of the head page (i.e. if all subpages of a 2MB hugepage are
 * under direct I/O, the reference of the head page is 512 and a bit more.)
 * This means that when we try to migrate hugepage whose subpages are
 * doing direct I/O, some references remain after try_to_unmap() and
 * hugepage migration fails without data corruption.
 *
 * There is also no race when direct I/O is issued on the page under migration,
 * because then pte is replaced with migration swap entry and direct I/O code
 * will wait in the page fault for migration to complete.
 *
 * @get_new_page: allocator for the destination hugepage; it may also hand
 *                back a per-page result slot through its third argument.
 * @private:      opaque value passed to @get_new_page.
 * @hpage:        hugepage to migrate.
 * @force:        if non-zero, sleep in lock_page() when trylock_page() fails.
 * @offlining:    not used by this function.
 *
 * Returns -ENOMEM if no destination page could be allocated, -EAGAIN if
 * the page could not be locked without @force or is still mapped after
 * try_to_unmap(), otherwise the result of move_to_new_page().
 */
static int unmap_and_move_huge_page(new_page_t get_new_page,
				unsigned long private, struct page *hpage,
				int force, int offlining)
{
	int rc = 0;
	int *result = NULL;
	struct page *new_hpage = get_new_page(hpage, private, &result);
	int rcu_locked = 0;
	struct anon_vma *anon_vma = NULL;

	if (!new_hpage)
		return -ENOMEM;

	rc = -EAGAIN;

	if (!trylock_page(hpage)) {
		/* hpage is NOT locked here, so "out" must not unlock it. */
		if (!force)
			goto out;
		lock_page(hpage);
	}

	if (PageAnon(hpage)) {
		rcu_read_lock();
		rcu_locked = 1;

		if (page_mapped(hpage)) {
			anon_vma = page_anon_vma(hpage);
			atomic_inc(&anon_vma->external_refcount);
		}
	}

	try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);

	/* rc stays -EAGAIN if some mapping survived try_to_unmap(). */
	if (!page_mapped(hpage))
		rc = move_to_new_page(new_hpage, hpage, 1);

	/* On failure point the migration entries back at the old page. */
	if (rc)
		remove_migration_ptes(hpage, hpage);

	if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
					    &anon_vma->lock)) {
		int empty = list_empty(&anon_vma->head);
		spin_unlock(&anon_vma->lock);
		if (empty)
			anon_vma_free(anon_vma);
	}

	if (rcu_locked)
		rcu_read_unlock();

	/* Only reached with the page lock held; release it before "out". */
	unlock_page(hpage);
out:
	/* Anything but -EAGAIN is final: take the page off the list. */
	if (rc != -EAGAIN) {
		list_del(&hpage->lru);
		put_page(hpage);
	}

	put_page(new_hpage);

	if (result) {
		if (rc)
			*result = rc;
		else
			*result = page_to_nid(new_hpage);
	}
	return rc;
}
/* /*
* migrate_pages * migrate_pages
* *
...@@ -788,6 +940,52 @@ int migrate_pages(struct list_head *from, ...@@ -788,6 +940,52 @@ int migrate_pages(struct list_head *from,
return nr_failed + retry; return nr_failed + retry;
} }
/*
 * migrate_huge_pages
 *
 * Try to migrate every hugepage on the list @from, allocating destination
 * pages with @get_new_page (which receives @private).  Up to 10 passes are
 * made over the list while some page still asks for a retry; from the
 * fourth pass on, unmap_and_move_huge_page() is called with force set.
 *
 * Pages still on @from when the function finishes get put_page() called
 * on them.
 *
 * Returns -ENOMEM as soon as a destination page cannot be allocated,
 * otherwise the number of pages that failed permanently plus the number
 * that still wanted a retry in the last pass (0 means full success).
 */
int migrate_huge_pages(struct list_head *from,
		new_page_t get_new_page, unsigned long private, int offlining)
{
	struct page *hpage;
	struct page *next;
	int again = 1;
	int failed = 0;
	int round;
	int rc;

	for (round = 0; round < 10 && again; round++) {
		again = 0;

		list_for_each_entry_safe(hpage, next, from, lru) {
			cond_resched();

			rc = unmap_and_move_huge_page(get_new_page,
					private, hpage, round > 2, offlining);

			if (rc == -ENOMEM)
				goto out;
			if (rc == -EAGAIN)
				again++;
			else if (rc != 0)
				failed++;	/* Permanent failure */
		}
	}
	rc = 0;
out:
	list_for_each_entry_safe(hpage, next, from, lru)
		put_page(hpage);

	return rc ? rc : failed + again;
}
#ifdef CONFIG_NUMA #ifdef CONFIG_NUMA
/* /*
* Move a list of individual pages * Move a list of individual pages
......
...@@ -780,10 +780,10 @@ void page_move_anon_rmap(struct page *page, ...@@ -780,10 +780,10 @@ void page_move_anon_rmap(struct page *page,
} }
/** /**
* __page_set_anon_rmap - setup new anonymous rmap * __page_set_anon_rmap - set up new anonymous rmap
* @page: the page to add the mapping to * @page: Page to add to rmap
* @vma: the vm area in which the mapping is added * @vma: VM area to add page to.
* @address: the user virtual address mapped * @address: User virtual address of the mapping
* @exclusive: the page is exclusively owned by the current process * @exclusive: the page is exclusively owned by the current process
*/ */
static void __page_set_anon_rmap(struct page *page, static void __page_set_anon_rmap(struct page *page,
...@@ -793,25 +793,16 @@ static void __page_set_anon_rmap(struct page *page, ...@@ -793,25 +793,16 @@ static void __page_set_anon_rmap(struct page *page,
BUG_ON(!anon_vma); BUG_ON(!anon_vma);
if (PageAnon(page))
return;
/* /*
* If the page isn't exclusively mapped into this vma, * If the page isn't exclusively mapped into this vma,
* we must use the _oldest_ possible anon_vma for the * we must use the _oldest_ possible anon_vma for the
* page mapping! * page mapping!
*/ */
if (!exclusive) { if (!exclusive)
if (PageAnon(page))
return;
anon_vma = anon_vma->root; anon_vma = anon_vma->root;
} else {
/*
* In this case, swapped-out-but-not-discarded swap-cache
* is remapped. So, no need to update page->mapping here.
* We convice anon_vma poitned by page->mapping is not obsolete
* because vma->anon_vma is necessary to be a family of it.
*/
if (PageAnon(page))
return;
}
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma; page->mapping = (struct address_space *) anon_vma;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment