Commit 86fbf161 authored by Linus Torvalds

Merge branch 'akpm' (incoming from Andrew)

Merge patches from Andrew Morton:
 "23 fixes and a MAINTAINERS update"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (24 commits)
  mm/hugetlb: check for pte NULL pointer in __page_check_address()
  fix build with make 3.80
  mm/mempolicy: fix !vma in new_vma_page()
  MAINTAINERS: add Davidlohr as GPT maintainer
  mm/memory-failure.c: recheck PageHuge() after hugetlb page migrate successfully
  mm/compaction: respect ignore_skip_hint in update_pageblock_skip
  mm/mempolicy: correct putback method for isolate pages if failed
  mm: add missing dependency in Kconfig
  sh: always link in helper functions extracted from libgcc
  mm: page_alloc: exclude unreclaimable allocations from zone fairness policy
  mm: numa: defer TLB flush for THP migration as long as possible
  mm: numa: guarantee that tlb_flush_pending updates are visible before page table updates
  mm: fix TLB flush race between migration, and change_protection_range
  mm: numa: avoid unnecessary disruption of NUMA hinting during migration
  mm: numa: clear numa hinting information on mprotect
  sched: numa: skip inaccessible VMAs
  mm: numa: avoid unnecessary work on the failure path
  mm: numa: ensure anon_vma is locked to prevent parallel THP splits
  mm: numa: do not clear PTE for pte_numa update
  mm: numa: do not clear PMD during PTE update scan
  ...
parents a36c160c 98398c32
@@ -3833,6 +3833,12 @@ T: git git://linuxtv.org/media_tree.git
 S: Maintained
 F: drivers/media/usb/gspca/
 
+GUID PARTITION TABLE (GPT)
+M: Davidlohr Bueso <davidlohr@hp.com>
+L: linux-efi@vger.kernel.org
+S: Maintained
+F: block/partitions/efi.*
+
 STK1160 USB VIDEO CAPTURE DRIVER
 M: Ezequiel Garcia <elezegarcia@gmail.com>
 L: linux-media@vger.kernel.org
......
@@ -732,19 +732,13 @@ export mod_strip_cmd
 # Select initial ramdisk compression format, default is gzip(1).
 # This shall be used by the dracut(8) tool while creating an initramfs image.
 #
-INITRD_COMPRESS=gzip
-ifeq ($(CONFIG_RD_BZIP2), y)
-INITRD_COMPRESS=bzip2
-else ifeq ($(CONFIG_RD_LZMA), y)
-INITRD_COMPRESS=lzma
-else ifeq ($(CONFIG_RD_XZ), y)
-INITRD_COMPRESS=xz
-else ifeq ($(CONFIG_RD_LZO), y)
-INITRD_COMPRESS=lzo
-else ifeq ($(CONFIG_RD_LZ4), y)
-INITRD_COMPRESS=lz4
-endif
-export INITRD_COMPRESS
+INITRD_COMPRESS-y := gzip
+INITRD_COMPRESS-$(CONFIG_RD_BZIP2) := bzip2
+INITRD_COMPRESS-$(CONFIG_RD_LZMA) := lzma
+INITRD_COMPRESS-$(CONFIG_RD_XZ) := xz
+INITRD_COMPRESS-$(CONFIG_RD_LZO) := lzo
+INITRD_COMPRESS-$(CONFIG_RD_LZ4) := lz4
+export INITRD_COMPRESS := $(INITRD_COMPRESS-y)
 
 ifdef CONFIG_MODULE_SIG_ALL
 MODSECKEY = ./signing_key.priv
......
@@ -6,7 +6,7 @@ lib-y = delay.o memmove.o memchr.o \
 	 checksum.o strlen.o div64.o div64-generic.o
 
 # Extracted from libgcc
-lib-y += movmem.o ashldi3.o ashrdi3.o lshrdi3.o \
+obj-y += movmem.o ashldi3.o ashrdi3.o lshrdi3.o \
 	 ashlsi3.o ashrsi3.o ashiftrt.o lshrsi3.o \
 	 udiv_qrnnd.o
......
@@ -619,7 +619,7 @@ static inline unsigned long pte_present(pte_t pte)
 }
 
 #define pte_accessible pte_accessible
-static inline unsigned long pte_accessible(pte_t a)
+static inline unsigned long pte_accessible(struct mm_struct *mm, pte_t a)
 {
 	return pte_val(a) & _PAGE_VALID;
 }
@@ -847,7 +847,7 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
 	 * SUN4V NOTE: _PAGE_VALID is the same value in both the SUN4U
 	 * and SUN4V pte layout, so this inline test is fine.
 	 */
-	if (likely(mm != &init_mm) && pte_accessible(orig))
+	if (likely(mm != &init_mm) && pte_accessible(mm, orig))
 		tlb_batch_add(mm, addr, ptep, orig, fullmm);
 }
......
@@ -452,9 +452,16 @@ static inline int pte_present(pte_t a)
 }
 
 #define pte_accessible pte_accessible
-static inline int pte_accessible(pte_t a)
+static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
 {
-	return pte_flags(a) & _PAGE_PRESENT;
+	if (pte_flags(a) & _PAGE_PRESENT)
+		return true;
+
+	if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) &&
+			mm_tlb_flush_pending(mm))
+		return true;
+
+	return false;
 }
 
 static inline int pte_hidden(pte_t pte)
......
@@ -83,6 +83,12 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 		pte_t pte = gup_get_pte(ptep);
 		struct page *page;
 
+		/* Similar to the PMD case, NUMA hinting must take slow path */
+		if (pte_numa(pte)) {
+			pte_unmap(ptep);
+			return 0;
+		}
+
 		if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
 			pte_unmap(ptep);
 			return 0;
@@ -167,6 +173,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
 		if (pmd_none(pmd) || pmd_trans_splitting(pmd))
 			return 0;
 		if (unlikely(pmd_large(pmd))) {
+			/*
+			 * NUMA hinting faults need to be handled in the GUP
+			 * slowpath for accounting purposes and so that they
+			 * can be serialised against THP migration.
+			 */
+			if (pmd_numa(pmd))
+				return 0;
 			if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
 				return 0;
 		} else {
......
@@ -217,7 +217,7 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b)
 #endif
 
 #ifndef pte_accessible
-# define pte_accessible(pte) ((void)(pte),1)
+# define pte_accessible(mm, pte) ((void)(pte), 1)
 #endif
 
 #ifndef flush_tlb_fix_spurious_fault
......
@@ -90,10 +90,19 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 #endif /* CONFIG_MIGRATION */
 
 #ifdef CONFIG_NUMA_BALANCING
+extern bool pmd_trans_migrating(pmd_t pmd);
+extern void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd);
 extern int migrate_misplaced_page(struct page *page,
 				  struct vm_area_struct *vma, int node);
 extern bool migrate_ratelimited(int node);
 #else
+static inline bool pmd_trans_migrating(pmd_t pmd)
+{
+	return false;
+}
+static inline void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
+{
+}
 static inline int migrate_misplaced_page(struct page *page,
 					 struct vm_area_struct *vma, int node)
 {
......
@@ -442,6 +442,14 @@ struct mm_struct {
 	/* numa_scan_seq prevents two threads setting pte_numa */
 	int numa_scan_seq;
+#endif
+#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
+	/*
+	 * An operation with batched TLB flushing is going on. Anything that
+	 * can move process memory needs to flush the TLB when moving a
+	 * PROT_NONE or PROT_NUMA mapped page.
+	 */
+	bool tlb_flush_pending;
 #endif
 	struct uprobes_state uprobes_state;
 };
@@ -459,4 +467,45 @@ static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
 	return mm->cpu_vm_mask_var;
 }
 
+#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
+/*
+ * Memory barriers to keep this state in sync are graciously provided by
+ * the page table locks, outside of which no page table modifications happen.
+ * The barriers below prevent the compiler from re-ordering the instructions
+ * around the memory barriers that are already present in the code.
+ */
+static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
+{
+	barrier();
+	return mm->tlb_flush_pending;
+}
+static inline void set_tlb_flush_pending(struct mm_struct *mm)
+{
+	mm->tlb_flush_pending = true;
+
+	/*
+	 * Guarantee that the tlb_flush_pending store does not leak into the
+	 * critical section updating the page tables
+	 */
+	smp_mb__before_spinlock();
+}
+/* Clearing is done after a TLB flush, which also provides a barrier. */
+static inline void clear_tlb_flush_pending(struct mm_struct *mm)
+{
+	barrier();
+	mm->tlb_flush_pending = false;
+}
+#else
+static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
+{
+	return false;
+}
+static inline void set_tlb_flush_pending(struct mm_struct *mm)
+{
+}
+static inline void clear_tlb_flush_pending(struct mm_struct *mm)
+{
+}
+#endif
+
 #endif /* _LINUX_MM_TYPES_H */
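Editor's note: a minimal user-space sketch of the protocol these helpers implement, not kernel code. The struct name mm_model, the plain compiler barrier and the fake_flush_tlb_range() helper are hypothetical stand-ins (in the kernel the ordering comes from smp_mb__before_spinlock() and the page table lock). A writer sets tlb_flush_pending before a batched PTE update and clears it only after the TLB flush; a reader that sees the flag set must flush before trusting a PROT_NONE/PROT_NUMA pte, as the mprotect.c and migrate.c hunks further down do.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the kernel's compiler barrier. */
#define barrier() __asm__ __volatile__("" ::: "memory")

struct mm_model {
	bool tlb_flush_pending;	/* models mm_struct::tlb_flush_pending */
};

static void set_tlb_flush_pending(struct mm_model *mm)
{
	mm->tlb_flush_pending = true;
	barrier();	/* the kernel uses smp_mb__before_spinlock() here */
}

static bool mm_tlb_flush_pending(struct mm_model *mm)
{
	barrier();	/* prevent the compiler from hoisting the load */
	return mm->tlb_flush_pending;
}

static void clear_tlb_flush_pending(struct mm_model *mm)
{
	barrier();	/* in the kernel the preceding TLB flush orders this */
	mm->tlb_flush_pending = false;
}

static void fake_flush_tlb_range(const char *who)
{
	printf("%s: flush TLB for the whole range\n", who);
}

int main(void)
{
	struct mm_model mm = { .tlb_flush_pending = false };

	/* Writer side, as change_protection_range() does further down. */
	set_tlb_flush_pending(&mm);
	/* ... batched PTE updates are in progress here ... */

	/*
	 * Reader side, as migrate_misplaced_transhuge_page() does below:
	 * while the writer's flush is still pending, the reader has to
	 * flush before it can trust what it read from the page tables.
	 */
	if (mm_tlb_flush_pending(&mm))
		fake_flush_tlb_range("migration");

	/* Writer finishes: flush first, then clear the flag. */
	fake_flush_tlb_range("mprotect");
	clear_tlb_flush_pending(&mm);

	return 0;
}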
@@ -43,6 +43,7 @@ extern int unregister_reboot_notifier(struct notifier_block *);
 /*
  * Architecture-specific implementations of sys_reboot commands.
  */
+extern void migrate_to_reboot_cpu(void);
 extern void machine_restart(char *cmd);
 extern void machine_halt(void);
 extern void machine_power_off(void);
......
@@ -537,6 +537,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 	spin_lock_init(&mm->page_table_lock);
 	mm_init_aio(mm);
 	mm_init_owner(mm, p);
+	clear_tlb_flush_pending(mm);
 
 	if (likely(!mm_alloc_pgd(mm))) {
 		mm->def_flags = 0;
......
@@ -1680,6 +1680,7 @@ int kernel_kexec(void)
 	{
 		kexec_in_progress = true;
 		kernel_restart_prepare(NULL);
+		migrate_to_reboot_cpu();
 		printk(KERN_EMERG "Starting new kernel\n");
 		machine_shutdown();
 	}
......
@@ -104,7 +104,7 @@ int unregister_reboot_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(unregister_reboot_notifier);
 
-static void migrate_to_reboot_cpu(void)
+void migrate_to_reboot_cpu(void)
 {
 	/* The boot cpu is always logical cpu 0 */
 	int cpu = reboot_cpu;
......
@@ -1738,6 +1738,13 @@ void task_numa_work(struct callback_head *work)
 		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
 			continue;
 
+		/*
+		 * Skip inaccessible VMAs to avoid any confusion between
+		 * PROT_NONE and NUMA hinting ptes
+		 */
+		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+			continue;
+
 		do {
 			start = max(start, vma->vm_start);
 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
......
@@ -543,7 +543,7 @@ config ZSWAP
 
 config MEM_SOFT_DIRTY
 	bool "Track memory changes"
-	depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
+	depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY && PROC_FS
 	select PROC_PAGE_MONITOR
 	help
 	  This option enables memory changes tracking by introducing a
......
@@ -134,6 +134,10 @@ static void update_pageblock_skip(struct compact_control *cc,
 			bool migrate_scanner)
 {
 	struct zone *zone = cc->zone;
 
+	if (cc->ignore_skip_hint)
+		return;
+
 	if (!page)
 		return;
......
@@ -882,6 +882,10 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		ret = 0;
 		goto out_unlock;
 	}
+
+	/* mmap_sem prevents this happening but warn if that changes */
+	WARN_ON(pmd_trans_migrating(pmd));
+
 	if (unlikely(pmd_trans_splitting(pmd))) {
 		/* split huge page running from under us */
 		spin_unlock(src_ptl);
@@ -1243,6 +1247,10 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
 		return ERR_PTR(-EFAULT);
 
+	/* Full NUMA hinting faults to serialise migration in fault paths */
+	if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+		goto out;
+
 	page = pmd_page(*pmd);
 	VM_BUG_ON(!PageHead(page));
 	if (flags & FOLL_TOUCH) {
@@ -1295,6 +1303,17 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (unlikely(!pmd_same(pmd, *pmdp)))
 		goto out_unlock;
 
+	/*
+	 * If there are potential migrations, wait for completion and retry
+	 * without disrupting NUMA hinting information. Do not relock and
+	 * check_same as the page may no longer be mapped.
+	 */
+	if (unlikely(pmd_trans_migrating(*pmdp))) {
+		spin_unlock(ptl);
+		wait_migrate_huge_page(vma->anon_vma, pmdp);
+		goto out;
+	}
+
 	page = pmd_page(pmd);
 	BUG_ON(is_huge_zero_page(page));
 	page_nid = page_to_nid(page);
@@ -1323,23 +1342,22 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* If the page was locked, there are no parallel migrations */
 		if (page_locked)
 			goto clear_pmdnuma;
+	}
 
-	/*
-	 * Otherwise wait for potential migrations and retry. We do
-	 * relock and check_same as the page may no longer be mapped.
-	 * As the fault is being retried, do not account for it.
-	 */
+	/* Migration could have started since the pmd_trans_migrating check */
+	if (!page_locked) {
 		spin_unlock(ptl);
 		wait_on_page_locked(page);
 		page_nid = -1;
 		goto out;
 	}
 
-	/* Page is misplaced, serialise migrations and parallel THP splits */
+	/*
+	 * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
+	 * to serialises splits
+	 */
 	get_page(page);
 	spin_unlock(ptl);
-	if (!page_locked)
-		lock_page(page);
 	anon_vma = page_lock_anon_vma_read(page);
 
 	/* Confirm the PMD did not change while page_table_lock was released */
@@ -1351,6 +1369,13 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out_unlock;
 	}
 
+	/* Bail if we fail to protect against THP splits for any reason */
+	if (unlikely(!anon_vma)) {
+		put_page(page);
+		page_nid = -1;
+		goto clear_pmdnuma;
+	}
+
 	/*
 	 * Migrate the THP to the requested node, returns with page unlocked
 	 * and pmd_numa cleared.
@@ -1517,6 +1542,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 	ret = 1;
 	if (!prot_numa) {
 		entry = pmdp_get_and_clear(mm, addr, pmd);
+		if (pmd_numa(entry))
+			entry = pmd_mknonnuma(entry);
 		entry = pmd_modify(entry, newprot);
 		ret = HPAGE_PMD_NR;
 		BUG_ON(pmd_write(entry));
@@ -1531,7 +1558,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 		 */
 		if (!is_huge_zero_page(page) &&
 				!pmd_numa(*pmd)) {
-			entry = pmdp_get_and_clear(mm, addr, pmd);
+			entry = *pmd;
 			entry = pmd_mknuma(entry);
 			ret = HPAGE_PMD_NR;
 		}
......
@@ -1505,10 +1505,16 @@ static int soft_offline_huge_page(struct page *page, int flags)
 		if (ret > 0)
 			ret = -EIO;
 	} else {
-		set_page_hwpoison_huge_page(hpage);
-		dequeue_hwpoisoned_huge_page(hpage);
-		atomic_long_add(1 << compound_order(hpage),
-				&num_poisoned_pages);
+		/* overcommit hugetlb page will be freed to buddy */
+		if (PageHuge(page)) {
+			set_page_hwpoison_huge_page(hpage);
+			dequeue_hwpoisoned_huge_page(hpage);
+			atomic_long_add(1 << compound_order(hpage),
+					&num_poisoned_pages);
+		} else {
+			SetPageHWPoison(page);
+			atomic_long_inc(&num_poisoned_pages);
+		}
 	}
 	return ret;
 }
......
@@ -1197,14 +1197,16 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int *
 			break;
 		vma = vma->vm_next;
 	}
 
+	if (PageHuge(page)) {
+		if (vma)
+			return alloc_huge_page_noerr(vma, address, 1);
+		else
+			return NULL;
+	}
 	/*
-	 * queue_pages_range() confirms that @page belongs to some vma,
-	 * so vma shouldn't be NULL.
+	 * if !vma, alloc_page_vma() will use task or system default policy
 	 */
-	BUG_ON(!vma);
-
-	if (PageHuge(page))
-		return alloc_huge_page_noerr(vma, address, 1);
 	return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 }
 #else
@@ -1318,7 +1320,7 @@ static long do_mbind(unsigned long start, unsigned long len,
 		if (nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
 	} else
-		putback_lru_pages(&pagelist);
+		putback_movable_pages(&pagelist);
 
 	up_write(&mm->mmap_sem);
 mpol_out:
......
@@ -36,6 +36,7 @@
 #include <linux/hugetlb_cgroup.h>
 #include <linux/gfp.h>
 #include <linux/balloon_compaction.h>
+#include <linux/mmu_notifier.h>
 
 #include <asm/tlbflush.h>
@@ -1654,6 +1655,18 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 	return 1;
 }
 
+bool pmd_trans_migrating(pmd_t pmd)
+{
+	struct page *page = pmd_page(pmd);
+	return PageLocked(page);
+}
+
+void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
+{
+	struct page *page = pmd_page(*pmd);
+	wait_on_page_locked(page);
+}
+
 /*
  * Attempt to migrate a misplaced page to the specified destination
  * node. Caller is expected to have an elevated reference count on
@@ -1716,12 +1729,14 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 				struct page *page, int node)
 {
 	spinlock_t *ptl;
-	unsigned long haddr = address & HPAGE_PMD_MASK;
 	pg_data_t *pgdat = NODE_DATA(node);
 	int isolated = 0;
 	struct page *new_page = NULL;
 	struct mem_cgroup *memcg = NULL;
 	int page_lru = page_is_file_cache(page);
+	unsigned long mmun_start = address & HPAGE_PMD_MASK;
+	unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
+	pmd_t orig_entry;
 
 	/*
 	 * Rate-limit the amount of data that is being migrated to a node.
@@ -1744,6 +1759,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 		goto out_fail;
 	}
 
+	if (mm_tlb_flush_pending(mm))
+		flush_tlb_range(vma, mmun_start, mmun_end);
+
 	/* Prepare a page as a migration target */
 	__set_page_locked(new_page);
 	SetPageSwapBacked(new_page);
@@ -1755,9 +1773,12 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	WARN_ON(PageLRU(new_page));
 
 	/* Recheck the target PMD */
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	ptl = pmd_lock(mm, pmd);
-	if (unlikely(!pmd_same(*pmd, entry))) {
+	if (unlikely(!pmd_same(*pmd, entry) || page_count(page) != 2)) {
+fail_putback:
 		spin_unlock(ptl);
+		mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
 		/* Reverse changes made by migrate_page_copy() */
 		if (TestClearPageActive(new_page))
@@ -1774,7 +1795,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 		putback_lru_page(page);
 		mod_zone_page_state(page_zone(page),
 				NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
-		goto out_fail;
+
+		goto out_unlock;
 	}
 
 	/*
@@ -1786,16 +1808,35 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	 */
 	mem_cgroup_prepare_migration(page, new_page, &memcg);
 
+	orig_entry = *pmd;
 	entry = mk_pmd(new_page, vma->vm_page_prot);
-	entry = pmd_mknonnuma(entry);
-	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 	entry = pmd_mkhuge(entry);
+	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 
-	pmdp_clear_flush(vma, haddr, pmd);
-	set_pmd_at(mm, haddr, pmd, entry);
-	page_add_new_anon_rmap(new_page, vma, haddr);
+	/*
+	 * Clear the old entry under pagetable lock and establish the new PTE.
+	 * Any parallel GUP will either observe the old page blocking on the
+	 * page lock, block on the page table lock or observe the new page.
+	 * The SetPageUptodate on the new page and page_add_new_anon_rmap
+	 * guarantee the copy is visible before the pagetable update.
+	 */
+	flush_cache_range(vma, mmun_start, mmun_end);
+	page_add_new_anon_rmap(new_page, vma, mmun_start);
+	pmdp_clear_flush(vma, mmun_start, pmd);
+	set_pmd_at(mm, mmun_start, pmd, entry);
+	flush_tlb_range(vma, mmun_start, mmun_end);
 	update_mmu_cache_pmd(vma, address, &entry);
+
+	if (page_count(page) != 2) {
+		set_pmd_at(mm, mmun_start, pmd, orig_entry);
+		flush_tlb_range(vma, mmun_start, mmun_end);
+		update_mmu_cache_pmd(vma, address, &entry);
+		page_remove_rmap(new_page);
+		goto fail_putback;
+	}
+
 	page_remove_rmap(page);
 
 	/*
 	 * Finish the charge transaction under the page table lock to
 	 * prevent split_huge_page() from dividing up the charge
@@ -1803,6 +1844,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	 */
 	mem_cgroup_end_migration(memcg, page, new_page, true);
 	spin_unlock(ptl);
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
 	unlock_page(new_page);
 	unlock_page(page);
@@ -1820,10 +1862,15 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 out_fail:
 	count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 out_dropref:
-	entry = pmd_mknonnuma(entry);
-	set_pmd_at(mm, haddr, pmd, entry);
-	update_mmu_cache_pmd(vma, address, &entry);
+	ptl = pmd_lock(mm, pmd);
+	if (pmd_same(*pmd, entry)) {
+		entry = pmd_mknonnuma(entry);
+		set_pmd_at(mm, mmun_start, pmd, entry);
+		update_mmu_cache_pmd(vma, address, &entry);
+	}
+	spin_unlock(ptl);
 
+out_unlock:
 	unlock_page(page);
 	put_page(page);
 	return 0;
......
@@ -52,17 +52,21 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			pte_t ptent;
 			bool updated = false;
 
-			ptent = ptep_modify_prot_start(mm, addr, pte);
 			if (!prot_numa) {
+				ptent = ptep_modify_prot_start(mm, addr, pte);
+				if (pte_numa(ptent))
+					ptent = pte_mknonnuma(ptent);
 				ptent = pte_modify(ptent, newprot);
 				updated = true;
 			} else {
 				struct page *page;
 
+				ptent = *pte;
 				page = vm_normal_page(vma, addr, oldpte);
 				if (page) {
 					if (!pte_numa(oldpte)) {
 						ptent = pte_mknuma(ptent);
+						set_pte_at(mm, addr, pte, ptent);
 						updated = true;
 					}
 				}
@@ -79,7 +83,10 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 
 			if (updated)
 				pages++;
-			ptep_modify_prot_commit(mm, addr, pte, ptent);
+
+			/* Only !prot_numa always clears the pte */
+			if (!prot_numa)
+				ptep_modify_prot_commit(mm, addr, pte, ptent);
 		} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -181,6 +188,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
 	BUG_ON(addr >= end);
 	pgd = pgd_offset(mm, addr);
 	flush_cache_range(vma, addr, end);
+	set_tlb_flush_pending(mm);
 	do {
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
@@ -192,6 +200,7 @@ static unsigned long change_protection_range(struct vm_area_struct *vma,
 	/* Only flush the TLB if we actually modified any entries: */
 	if (pages)
 		flush_tlb_range(vma, start, end);
+	clear_tlb_flush_pending(mm);
 
 	return pages;
 }
......
@@ -1920,7 +1920,8 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
 		 * back to remote zones that do not partake in the
 		 * fairness round-robin cycle of this zonelist.
 		 */
-		if (alloc_flags & ALLOC_WMARK_LOW) {
+		if ((alloc_flags & ALLOC_WMARK_LOW) &&
+		    (gfp_mask & GFP_MOVABLE_MASK)) {
 			if (zone_page_state(zone, NR_ALLOC_BATCH) <= 0)
 				continue;
 			if (zone_reclaim_mode &&
......
@@ -110,9 +110,10 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
 pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
 		       pte_t *ptep)
 {
+	struct mm_struct *mm = (vma)->vm_mm;
 	pte_t pte;
-	pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
-	if (pte_accessible(pte))
+	pte = ptep_get_and_clear(mm, address, ptep);
+	if (pte_accessible(mm, pte))
 		flush_tlb_page(vma, address);
 	return pte;
 }
@@ -191,6 +192,9 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 		     pmd_t *pmdp)
 {
+	pmd_t entry = *pmdp;
+	if (pmd_numa(entry))
+		entry = pmd_mknonnuma(entry);
 	set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(*pmdp));
 	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 }
......
@@ -600,7 +600,11 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
 	spinlock_t *ptl;
 
 	if (unlikely(PageHuge(page))) {
+		/* when pud is not present, pte will be NULL */
 		pte = huge_pte_offset(mm, address);
+		if (!pte)
+			return NULL;
+
 		ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
 		goto check;
 	}
......