Commit f1ebdd60 authored by Linus Torvalds

Merge branch 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6

* 'hwpoison' of git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6: (22 commits)
  Add _addr_lsb field to ia64 siginfo
  Fix migration.c compilation on s390
  HWPOISON: Remove retry loop for try_to_unmap
  HWPOISON: Turn addr_valid from bitfield into char
  HWPOISON: Disable DEBUG by default
  HWPOISON: Convert pr_debugs to pr_info
  HWPOISON: Improve comments in memory-failure.c
  x86: HWPOISON: Report correct address granularity for huge hwpoison faults
  Encode huge page size for VM_FAULT_HWPOISON errors
  Fix build error with !CONFIG_MIGRATION
  hugepage: move is_hugepage_on_freelist inside ifdef to avoid warning
  Clean up __page_set_anon_rmap
  HWPOISON, hugetlb: fix unpoison for hugepage
  HWPOISON, hugetlb: soft offlining for hugepage
  HWPOISON, hugetlb: recover from free hugepage error when !MF_COUNT_INCREASED
  hugetlb: move refcounting in hugepage allocation inside hugetlb_lock
  HWPOISON, hugetlb: add free check to dequeue_hwpoison_huge_page()
  hugetlb: hugepage migration core
  hugetlb: redefine hugepage copy functions
  hugetlb: add allocate function for hugepage migration
  ...
parents f99d0553 46e387bb
......@@ -62,6 +62,7 @@ typedef struct siginfo {
int _imm; /* immediate value for "break" */
unsigned int _flags; /* see below */
unsigned long _isr; /* isr */
short _addr_lsb; /* lsb of faulting address */
} _sigfault;
/* SIGPOLL */
......
......@@ -11,6 +11,7 @@
#include <linux/kprobes.h> /* __kprobes, ... */
#include <linux/mmiotrace.h> /* kmmio_handler, ... */
#include <linux/perf_event.h> /* perf_sw_event */
#include <linux/hugetlb.h> /* hstate_index_to_shift */
#include <asm/traps.h> /* dotraplinkage, ... */
#include <asm/pgalloc.h> /* pgd_*(), ... */
......@@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
static void
force_sig_info_fault(int si_signo, int si_code, unsigned long address,
struct task_struct *tsk)
struct task_struct *tsk, int fault)
{
unsigned lsb = 0;
siginfo_t info;
info.si_signo = si_signo;
info.si_errno = 0;
info.si_code = si_code;
info.si_addr = (void __user *)address;
info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0;
if (fault & VM_FAULT_HWPOISON_LARGE)
lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
if (fault & VM_FAULT_HWPOISON)
lsb = PAGE_SHIFT;
info.si_addr_lsb = lsb;
force_sig_info(si_signo, &info, tsk);
}
......@@ -722,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
tsk->thread.error_code = error_code | (address >= TASK_SIZE);
tsk->thread.trap_no = 14;
force_sig_info_fault(SIGSEGV, si_code, address, tsk);
force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
return;
}
......@@ -807,14 +813,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
tsk->thread.trap_no = 14;
#ifdef CONFIG_MEMORY_FAILURE
if (fault & VM_FAULT_HWPOISON) {
if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
printk(KERN_ERR
"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
tsk->comm, tsk->pid, address);
code = BUS_MCEERR_AR;
}
#endif
force_sig_info_fault(SIGBUS, code, address, tsk);
force_sig_info_fault(SIGBUS, code, address, tsk, fault);
}
static noinline void
......@@ -824,7 +830,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
if (fault & VM_FAULT_OOM) {
out_of_memory(regs, error_code, address);
} else {
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON))
if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
VM_FAULT_HWPOISON_LARGE))
do_sigbus(regs, error_code, address, fault);
else
BUG();
......
......@@ -31,6 +31,7 @@
#include <linux/statfs.h>
#include <linux/security.h>
#include <linux/magic.h>
#include <linux/migrate.h>
#include <asm/uaccess.h>
......@@ -573,6 +574,19 @@ static int hugetlbfs_set_page_dirty(struct page *page)
return 0;
}
/*
 * ->migratepage callback for hugetlbfs.
 *
 * First hands the mapping entry over from @page to @newpage; only if that
 * succeeds is the page copied with migrate_page_copy().
 * Returns 0 on success, or the error from migrate_huge_page_move_mapping().
 */
static int hugetlbfs_migrate_page(struct address_space *mapping,
				struct page *newpage, struct page *page)
{
	int err = migrate_huge_page_move_mapping(mapping, newpage, page);

	if (!err)
		migrate_page_copy(newpage, page);

	return err;
}
static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb);
......@@ -659,6 +673,7 @@ static const struct address_space_operations hugetlbfs_aops = {
.write_begin = hugetlbfs_write_begin,
.write_end = hugetlbfs_write_end,
.set_page_dirty = hugetlbfs_set_page_dirty,
.migratepage = hugetlbfs_migrate_page,
};
......
......@@ -98,6 +98,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
err |= __put_user((long) kinfo->si_addr, &uinfo->ssi_addr);
#ifdef __ARCH_SI_TRAPNO
err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno);
#endif
#ifdef BUS_MCEERR_AO
/*
* Other callers might not initialize the si_addr_lsb field,
* so check explicitly for the right codes here.
*/
if (kinfo->si_code == BUS_MCEERR_AR ||
kinfo->si_code == BUS_MCEERR_AO)
err |= __put_user((short) kinfo->si_addr_lsb,
&uinfo->ssi_addr_lsb);
#endif
break;
case __SI_CHLD:
......
......@@ -43,7 +43,8 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to,
struct vm_area_struct *vma,
int acctflags);
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
void __isolate_hwpoisoned_huge_page(struct page *page);
int dequeue_hwpoisoned_huge_page(struct page *page);
void copy_huge_page(struct page *dst, struct page *src);
extern unsigned long hugepages_treat_as_movable;
extern const unsigned long hugetlb_zero, hugetlb_infinity;
......@@ -101,7 +102,10 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
#define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; })
#define huge_pte_offset(mm, address) 0
#define __isolate_hwpoisoned_huge_page(page) 0
#define dequeue_hwpoisoned_huge_page(page) 0
/*
 * Empty inline stand-in for copy_huge_page(): callers compile, but nothing
 * is copied. It sits next to the other stubbed-out hugetlb helpers here
 * (e.g. dequeue_hwpoisoned_huge_page(page) defined as 0).
 * NOTE(review): presumably the !CONFIG_HUGETLB_PAGE branch - confirm.
 */
static inline void copy_huge_page(struct page *dst, struct page *src)
{
}
#define hugetlb_change_protection(vma, address, end, newprot)
......@@ -228,6 +232,8 @@ struct huge_bootmem_page {
struct hstate *hstate;
};
struct page *alloc_huge_page_node(struct hstate *h, int nid);
/* arch callback */
int __init alloc_bootmem_huge_page(struct hstate *h);
......@@ -301,8 +307,14 @@ static inline struct hstate *page_hstate(struct page *page)
return size_to_hstate(PAGE_SIZE << compound_order(page));
}
static inline unsigned hstate_index_to_shift(unsigned index)
{
return hstates[index].order + PAGE_SHIFT;
}
#else
struct hstate {};
#define alloc_huge_page_node(h, nid) NULL
#define alloc_bootmem_huge_page(h) NULL
#define hstate_file(f) NULL
#define hstate_vma(v) NULL
......@@ -317,6 +329,7 @@ static inline unsigned int pages_per_huge_page(struct hstate *h)
{
return 1;
}
#define hstate_index_to_shift(index) 0
#endif
#endif /* _LINUX_HUGETLB_H */
......@@ -14,6 +14,8 @@ extern int migrate_page(struct address_space *,
struct page *, struct page *);
extern int migrate_pages(struct list_head *l, new_page_t x,
unsigned long private, int offlining);
extern int migrate_huge_pages(struct list_head *l, new_page_t x,
unsigned long private, int offlining);
extern int fail_migrate_page(struct address_space *,
struct page *, struct page *);
......@@ -23,12 +25,17 @@ extern int migrate_prep_local(void);
extern int migrate_vmas(struct mm_struct *mm,
const nodemask_t *from, const nodemask_t *to,
unsigned long flags);
extern void migrate_page_copy(struct page *newpage, struct page *page);
extern int migrate_huge_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page);
#else
#define PAGE_MIGRATION 0
static inline void putback_lru_pages(struct list_head *l) {}
static inline int migrate_pages(struct list_head *l, new_page_t x,
unsigned long private, int offlining) { return -ENOSYS; }
static inline int migrate_huge_pages(struct list_head *l, new_page_t x,
unsigned long private, int offlining) { return -ENOSYS; }
static inline int migrate_prep(void) { return -ENOSYS; }
static inline int migrate_prep_local(void) { return -ENOSYS; }
......@@ -40,6 +47,15 @@ static inline int migrate_vmas(struct mm_struct *mm,
return -ENOSYS;
}
/* No-op stub for the branch where PAGE_MIGRATION is defined as 0. */
static inline void migrate_page_copy(struct page *newpage,
struct page *page) {}
/*
 * Stub for the branch where PAGE_MIGRATION is defined as 0: always fails
 * with -ENOSYS, like the other migrate_*() stubs in this branch.
 */
static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page)
{
return -ENOSYS;
}
/* Possible settings for the ->migratepage method in address_space_operations */
#define migrate_page NULL
#define fail_migrate_page NULL
......
......@@ -718,12 +718,20 @@ static inline int page_mapped(struct page *page)
#define VM_FAULT_SIGBUS 0x0002
#define VM_FAULT_MAJOR 0x0004
#define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */
#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned page */
#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */
#define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */
#define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */
#define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */
#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON)
#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \
VM_FAULT_HWPOISON_LARGE)
/* Encode hstate index for a hwpoisoned large page */
#define VM_FAULT_SET_HINDEX(x) ((x) << 12)
#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf)
/*
* Can be called by the pagefault handler when it gets a VM_FAULT_OOM.
......
......@@ -33,6 +33,7 @@ struct signalfd_siginfo {
__u64 ssi_utime;
__u64 ssi_stime;
__u64 ssi_addr;
__u16 ssi_addr_lsb;
/*
* Pad structure to 128 bytes. Remember to update the
......@@ -43,7 +44,7 @@ struct signalfd_siginfo {
* comes out of a read(2) and we really don't want to have
* a compat on read(2).
*/
__u8 __pad[48];
__u8 __pad[46];
};
......
This diff is collapsed.
This diff is collapsed.
......@@ -1450,7 +1450,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (ret & VM_FAULT_OOM)
return i ? i : -ENOMEM;
if (ret &
(VM_FAULT_HWPOISON|VM_FAULT_SIGBUS))
(VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE|
VM_FAULT_SIGBUS))
return i ? i : -EFAULT;
BUG();
}
......
......@@ -32,6 +32,7 @@
#include <linux/security.h>
#include <linux/memcontrol.h>
#include <linux/syscalls.h>
#include <linux/hugetlb.h>
#include <linux/gfp.h>
#include "internal.h"
......@@ -95,6 +96,12 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
pte_t *ptep, pte;
spinlock_t *ptl;
if (unlikely(PageHuge(new))) {
ptep = huge_pte_offset(mm, addr);
if (!ptep)
goto out;
ptl = &mm->page_table_lock;
} else {
pgd = pgd_offset(mm, addr);
if (!pgd_present(*pgd))
goto out;
......@@ -115,6 +122,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
}
ptl = pte_lockptr(mm, pmd);
}
spin_lock(ptl);
pte = *ptep;
if (!is_swap_pte(pte))
......@@ -130,10 +139,19 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
if (is_write_migration_entry(entry))
pte = pte_mkwrite(pte);
#ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new))
pte = pte_mkhuge(pte);
#endif
flush_cache_page(vma, addr, pte_pfn(pte));
set_pte_at(mm, addr, ptep, pte);
if (PageHuge(new)) {
if (PageAnon(new))
hugepage_add_anon_rmap(new, vma, addr);
else
page_dup_rmap(new);
} else if (PageAnon(new))
page_add_anon_rmap(new, vma, addr);
else
page_add_file_rmap(new);
......@@ -275,11 +293,59 @@ static int migrate_page_move_mapping(struct address_space *mapping,
return 0;
}
/*
 * Replace @page with @newpage in @mapping's radix tree (hugepage variant).
 *
 * The expected number of remaining references is the same as that
 * of migrate_page_move_mapping().
 *
 * With no mapping, nothing is replaced: the call succeeds only when
 * page_count(page) is exactly 1.
 *
 * Returns 0 on success, -EAGAIN if the reference count or the radix tree
 * slot does not match what is expected.
 */
int migrate_huge_page_move_mapping(struct address_space *mapping,
				   struct page *newpage, struct page *page)
{
	int expected_count;
	void **pslot;
	int rc = -EAGAIN;

	if (!mapping)
		return page_count(page) == 1 ? 0 : -EAGAIN;

	spin_lock_irq(&mapping->tree_lock);

	pslot = radix_tree_lookup_slot(&mapping->page_tree,
					page_index(page));

	expected_count = 2 + page_has_private(page);

	/* Someone else holds a reference, or the slot no longer points at us. */
	if (page_count(page) != expected_count ||
	    (struct page *)radix_tree_deref_slot(pslot) != page)
		goto unlock;

	if (!page_freeze_refs(page, expected_count))
		goto unlock;

	/* The new page takes over the slot, so it gains a reference ... */
	get_page(newpage);
	radix_tree_replace_slot(pslot, newpage);

	/* ... and the old page gives one up once its count is restored. */
	page_unfreeze_refs(page, expected_count);
	__put_page(page);

	rc = 0;
unlock:
	spin_unlock_irq(&mapping->tree_lock);
	return rc;
}
/*
* Copy the page to its new location
*/
static void migrate_page_copy(struct page *newpage, struct page *page)
void migrate_page_copy(struct page *newpage, struct page *page)
{
if (PageHuge(page))
copy_huge_page(newpage, page);
else
copy_highpage(newpage, page);
if (PageError(page))
......@@ -723,6 +789,92 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
return rc;
}
/*
 * Counterpart of unmap_and_move() for hugepage migration.
 *
 * This function does not wait for the completion of hugepage I/O
 * because there is no race between I/O and migration for hugepages.
 * Note that currently hugepage I/O occurs only in direct I/O
 * where no lock is held and PG_writeback is irrelevant,
 * and the writeback status of all subpages is counted in the reference
 * count of the head page (i.e. if all subpages of a 2MB hugepage are
 * under direct I/O, the reference of the head page is 512 and a bit more.)
 * This means that when we try to migrate a hugepage whose subpages are
 * doing direct I/O, some references remain after try_to_unmap() and
 * hugepage migration fails without data corruption.
 *
 * There is also no race when direct I/O is issued on the page under migration,
 * because then the pte is replaced with a migration swap entry and the direct
 * I/O code will wait in the page fault for migration to complete.
 *
 * @get_new_page: allocator callback; called as
 *                get_new_page(hpage, private, &result). It may set @result
 *                to a location that receives the outcome.
 * @private:      opaque value passed through to @get_new_page.
 * @hpage:        the hugepage to migrate.
 * @force:        if non-zero, sleep in lock_page() when the page lock is
 *                contended; otherwise give up with -EAGAIN.
 * @offlining:    not used by this function.
 *
 * Returns -ENOMEM if no new page could be obtained, -EAGAIN if the page
 * could not be locked or was still mapped after try_to_unmap(), otherwise
 * the return value of move_to_new_page() (0 on success).
 */
static int unmap_and_move_huge_page(new_page_t get_new_page,
				unsigned long private, struct page *hpage,
				int force, int offlining)
{
	int rc = 0;
	int *result = NULL;
	struct page *new_hpage = get_new_page(hpage, private, &result);
	int rcu_locked = 0;
	struct anon_vma *anon_vma = NULL;

	if (!new_hpage)
		return -ENOMEM;

	rc = -EAGAIN;

	if (!trylock_page(hpage)) {
		if (!force)
			goto out;	/* hpage is NOT locked on this path */
		lock_page(hpage);
	}

	if (PageAnon(hpage)) {
		rcu_read_lock();
		rcu_locked = 1;

		if (page_mapped(hpage)) {
			/* Pin the anon_vma until the ptes have been restored. */
			anon_vma = page_anon_vma(hpage);
			atomic_inc(&anon_vma->external_refcount);
		}
	}

	try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);

	if (!page_mapped(hpage))
		rc = move_to_new_page(new_hpage, hpage, 1);

	/* Migration did not happen: point the migration ptes back at hpage. */
	if (rc)
		remove_migration_ptes(hpage, hpage);

	if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount,
					    &anon_vma->lock)) {
		int empty = list_empty(&anon_vma->head);
		spin_unlock(&anon_vma->lock);
		if (empty)
			anon_vma_free(anon_vma);
	}

	if (rcu_locked)
		rcu_read_unlock();

	/*
	 * Only reached with the page lock held. The trylock failure path
	 * jumps past this so that a page we never locked is not unlocked.
	 */
	unlock_page(hpage);
out:
	if (rc != -EAGAIN) {
		/* Done with this page (success or permanent failure). */
		list_del(&hpage->lru);
		put_page(hpage);
	}

	put_page(new_hpage);

	if (result) {
		if (rc)
			*result = rc;
		else
			*result = page_to_nid(new_hpage);
	}
	return rc;
}
/*
* migrate_pages
*
......@@ -788,6 +940,52 @@ int migrate_pages(struct list_head *from,
return nr_failed + retry;
}
/*
 * migrate_huge_pages - migrate every hugepage on the @from list
 *
 * Makes up to 10 passes over @from, calling unmap_and_move_huge_page() on
 * each entry. From the fourth pass on (pass > 2) the page lock is taken
 * with force. Passes stop early once no page asked for a retry.
 *
 * Whatever is still on @from at the end has its reference dropped with
 * put_page().
 *
 * Returns -ENOMEM as soon as a new page cannot be obtained, otherwise the
 * number of pages that failed permanently plus those still wanting a retry.
 */
int migrate_huge_pages(struct list_head *from,
		new_page_t get_new_page, unsigned long private, int offlining)
{
	int retry = 1;
	int nr_failed = 0;
	int pass;
	struct page *page;
	struct page *next;
	int rc;

	for (pass = 0; pass < 10 && retry; pass++) {
		retry = 0;

		list_for_each_entry_safe(page, next, from, lru) {
			cond_resched();

			rc = unmap_and_move_huge_page(get_new_page,
					private, page, pass > 2, offlining);

			if (rc == -ENOMEM)
				goto out;

			if (rc == -EAGAIN)
				retry++;
			else if (rc != 0)
				nr_failed++;	/* Permanent failure */
		}
	}
	rc = 0;
out:
	list_for_each_entry_safe(page, next, from, lru)
		put_page(page);

	return rc ? rc : nr_failed + retry;
}
#ifdef CONFIG_NUMA
/*
* Move a list of individual pages
......
......@@ -780,10 +780,10 @@ void page_move_anon_rmap(struct page *page,
}
/**
* __page_set_anon_rmap - setup new anonymous rmap
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
* __page_set_anon_rmap - set up new anonymous rmap
* @page: Page to add to rmap
* @vma: VM area to add page to.
* @address: User virtual address of the mapping
* @exclusive: the page is exclusively owned by the current process
*/
static void __page_set_anon_rmap(struct page *page,
......@@ -793,25 +793,16 @@ static void __page_set_anon_rmap(struct page *page,
BUG_ON(!anon_vma);
if (PageAnon(page))
return;
/*
* If the page isn't exclusively mapped into this vma,
* we must use the _oldest_ possible anon_vma for the
* page mapping!
*/
if (!exclusive) {
if (PageAnon(page))
return;
if (!exclusive)
anon_vma = anon_vma->root;
} else {
/*
* In this case, swapped-out-but-not-discarded swap-cache
* is remapped. So, no need to update page->mapping here.
* We convice anon_vma poitned by page->mapping is not obsolete
* because vma->anon_vma is necessary to be a family of it.
*/
if (PageAnon(page))
return;
}
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
page->mapping = (struct address_space *) anon_vma;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment