Commit 3ecafda9 authored by Linus Torvalds

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
 "16 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  coredump: fix race condition between mmget_not_zero()/get_task_mm() and core dumping
  mm/kmemleak.c: fix unused-function warning
  init: initialize jump labels before command line option parsing
  kernel/watchdog_hld.c: hard lockup message should end with a newline
  kcov: improve CONFIG_ARCH_HAS_KCOV help text
  mm: fix inactive list balancing between NUMA nodes and cgroups
  mm/hotplug: treat CMA pages as unmovable
  proc: fixup proc-pid-vm test
  proc: fix map_files test on F29
  mm/vmstat.c: fix /proc/vmstat format for CONFIG_DEBUG_TLBFLUSH=y CONFIG_SMP=n
  mm/memory_hotplug: do not unlock after failing to take the device_hotplug_lock
  mm: swapoff: shmem_unuse() stop eviction without igrab()
  mm: swapoff: take notice of completion sooner
  mm: swapoff: remove too limiting SWAP_UNUSE_MAX_TRIES
  mm: swapoff: shmem_find_swap_entries() filter out other types
  slab: store tagged freelist for off-slab slabmgmt
parents b222e9af 04f5866e
@@ -506,7 +506,7 @@ static ssize_t probe_store(struct device *dev, struct device_attribute *attr,
 	ret = lock_device_hotplug_sysfs();
 	if (ret)
-		goto out;
+		return ret;
 
 	nid = memory_add_physaddr_to_nid(phys_addr);
 	ret = __add_memory(nid, phys_addr,
...
@@ -993,6 +993,8 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
		 * will only be one mm, so no big deal.
		 */
		down_write(&mm->mmap_sem);
+		if (!mmget_still_valid(mm))
+			goto skip_mm;
		mutex_lock(&ufile->umap_lock);
		list_for_each_entry_safe (priv, next_priv, &ufile->umaps,
					  list) {
@@ -1007,6 +1009,7 @@ void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
			vma->vm_flags &= ~(VM_SHARED | VM_MAYSHARE);
		}
		mutex_unlock(&ufile->umap_lock);
+	skip_mm:
		up_write(&mm->mmap_sem);
		mmput(mm);
	}
...
@@ -1143,6 +1143,24 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
				count = -EINTR;
				goto out_mm;
			}
+			/*
+			 * Avoid to modify vma->vm_flags
+			 * without locked ops while the
+			 * coredump reads the vm_flags.
+			 */
+			if (!mmget_still_valid(mm)) {
+				/*
+				 * Silently return "count"
+				 * like if get_task_mm()
+				 * failed. FIXME: should this
+				 * function have returned
+				 * -ESRCH if get_task_mm()
+				 * failed like if
+				 * get_proc_task() fails?
+				 */
+				up_write(&mm->mmap_sem);
+				goto out_mm;
+			}
			for (vma = mm->mmap; vma; vma = vma->vm_next) {
				vma->vm_flags &= ~VM_SOFTDIRTY;
				vma_set_page_prot(vma);
...
@@ -629,6 +629,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
		/* the various vma->vm_userfaultfd_ctx still points to it */
		down_write(&mm->mmap_sem);
+		/* no task can run (and in turn coredump) yet */
+		VM_WARN_ON(!mmget_still_valid(mm));
		for (vma = mm->mmap; vma; vma = vma->vm_next)
			if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
				vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
@@ -883,6 +885,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
	 * taking the mmap_sem for writing.
	 */
	down_write(&mm->mmap_sem);
+	if (!mmget_still_valid(mm))
+		goto skip_mm;
	prev = NULL;
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		cond_resched();
@@ -905,6 +909,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
		vma->vm_flags = new_flags;
		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
	}
+skip_mm:
	up_write(&mm->mmap_sem);
	mmput(mm);
 wakeup:
@@ -1333,6 +1338,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
		goto out;
	down_write(&mm->mmap_sem);
+	if (!mmget_still_valid(mm))
+		goto out_unlock;
	vma = find_vma_prev(mm, start, &prev);
	if (!vma)
		goto out_unlock;
@@ -1520,6 +1527,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
		goto out;
	down_write(&mm->mmap_sem);
+	if (!mmget_still_valid(mm))
+		goto out_unlock;
	vma = find_vma_prev(mm, start, &prev);
	if (!vma)
		goto out_unlock;
...
@@ -49,6 +49,27 @@ static inline void mmdrop(struct mm_struct *mm)
		__mmdrop(mm);
 }
 
+/*
+ * This has to be called after a get_task_mm()/mmget_not_zero()
+ * followed by taking the mmap_sem for writing before modifying the
+ * vmas or anything the coredump pretends not to change from under it.
+ *
+ * NOTE: find_extend_vma() called from GUP context is the only place
+ * that can modify the "mm" (notably the vm_start/end) under mmap_sem
+ * for reading and outside the context of the process, so it is also
+ * the only case that holds the mmap_sem for reading that must call
+ * this function. Generally if the mmap_sem is hold for reading
+ * there's no need of this check after get_task_mm()/mmget_not_zero().
+ *
+ * This function can be obsoleted and the check can be removed, after
+ * the coredump code will hold the mmap_sem for writing before
+ * invoking the ->core_dump methods.
+ */
+static inline bool mmget_still_valid(struct mm_struct *mm)
+{
+	return likely(!mm->core_state);
+}
+
 /**
  * mmget() - Pin the address space associated with a &struct mm_struct.
  * @mm: The address space to pin.
...
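The calling convention described in that comment is the one the uverbs, userfaultfd and clear_refs hunks in this merge all follow. Purely as an illustration (this sketch is not part of the patch), the steps look like this when strung together, reusing the soft-dirty clearing from clear_refs_write() as the example modification:

/*
 * Illustrative sketch only, not from this merge: the sequence the
 * mmget_still_valid() comment describes, on a kernel of this era
 * (mmap_sem rather than the later mmap_lock API).
 */
#include <linux/mm.h>
#include <linux/sched/mm.h>

static void clear_soft_dirty_all(struct mm_struct *mm)
{
	struct vm_area_struct *vma;

	if (!mmget_not_zero(mm))	/* the mm may already be exiting */
		return;

	down_write(&mm->mmap_sem);
	if (!mmget_still_valid(mm))	/* a coredump has started: leave the vmas alone */
		goto out_unlock;

	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		vma->vm_flags &= ~VM_SOFTDIRTY;	/* same kind of update clear_refs_write() does */
		vma_set_page_prot(vma);
	}

out_unlock:
	up_write(&mm->mmap_sem);
	mmput(mm);
}

Only code that modifies vmas (or anything else the coredump expects to stay stable) needs the check; per the comment, find_extend_vma() is the one mmap_sem-for-reading caller that must also perform it.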
@@ -21,6 +21,7 @@ struct shmem_inode_info {
	struct list_head	swaplist;	/* chain of maybes on swap */
	struct shared_policy	policy;		/* NUMA memory alloc policy */
	struct simple_xattrs	xattrs;		/* list of xattrs */
+	atomic_t		stop_eviction;	/* hold when working on inode */
	struct inode		vfs_inode;
 };
...
@@ -582,6 +582,8 @@ asmlinkage __visible void __init start_kernel(void)
	page_alloc_init();
 
	pr_notice("Kernel command line: %s\n", boot_command_line);
+	/* parameters may set static keys */
+	jump_label_init();
	parse_early_param();
	after_dashes = parse_args("Booting kernel",
				  static_command_line, __start___param,
@@ -591,8 +593,6 @@ asmlinkage __visible void __init start_kernel(void)
		parse_args("Setting init args", after_dashes, NULL, 0, -1, -1,
			   NULL, set_init_arg);
 
-	jump_label_init();
-
	/*
	 * These use large bootmem allocations and must precede
	 * kmem_cache_init()
...
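The reordering matters because an early_param handler may flip a static key, which only works once jump_label_init() has run. A minimal, hypothetical handler of that shape (not taken from this merge; "my_fast_path" is an invented option name) would be:

#include <linux/init.h>
#include <linux/jump_label.h>

/* Hypothetical example: a boot option that enables a static branch. */
DEFINE_STATIC_KEY_FALSE(my_fast_path);

static int __init my_fast_path_setup(char *str)
{
	/*
	 * Needs the jump-label machinery to be initialized; hence
	 * jump_label_init() now runs before parse_early_param()
	 * in start_kernel().
	 */
	static_branch_enable(&my_fast_path);
	return 0;
}
early_param("my_fast_path", my_fast_path_setup);

With the old ordering, parse_early_param() ran first and such a handler would trip the "used before call to jump_label_init()" warning.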
@@ -135,7 +135,8 @@ static void watchdog_overflow_callback(struct perf_event *event,
		if (__this_cpu_read(hard_watchdog_warn) == true)
			return;
 
-		pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+		pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n",
+			 this_cpu);
		print_modules();
		print_irqtrace_events(current);
		if (regs)
...
@@ -753,9 +753,9 @@ endmenu # "Memory Debugging"
 config ARCH_HAS_KCOV
	bool
	help
-	  KCOV does not have any arch-specific code, but currently it is enabled
-	  only for x86_64. KCOV requires testing on other archs, and most likely
-	  disabling of instrumentation for some early boot code.
+	  An architecture should select this when it can successfully
+	  build and run with CONFIG_KCOV. This typically requires
+	  disabling instrumentation for some early boot code.
 
 config CC_HAS_SANCOV_TRACE_PC
	def_bool $(cc-option,-fsanitize-coverage=trace-pc)
...
@@ -1401,6 +1401,7 @@ static void scan_block(void *_start, void *_end,
 /*
  * Scan a large memory block in MAX_SCAN_SIZE chunks to reduce the latency.
  */
+#ifdef CONFIG_SMP
 static void scan_large_block(void *start, void *end)
 {
	void *next;
@@ -1412,6 +1413,7 @@ static void scan_large_block(void *start, void *end)
		cond_resched();
	}
 }
+#endif
 
 /*
  * Scan a memory block corresponding to a kmemleak_object. A condition is
...
@@ -45,6 +45,7 @@
 #include <linux/moduleparam.h>
 #include <linux/pkeys.h>
 #include <linux/oom.h>
+#include <linux/sched/mm.h>
 
 #include <linux/uaccess.h>
 #include <asm/cacheflush.h>
@@ -2525,7 +2526,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
	vma = find_vma_prev(mm, addr, &prev);
	if (vma && (vma->vm_start <= addr))
		return vma;
-	if (!prev || expand_stack(prev, addr))
+	/* don't alter vm_end if the coredump is running */
+	if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr))
		return NULL;
	if (prev->vm_flags & VM_LOCKED)
		populate_vma_page_range(prev, addr, prev->vm_end, NULL);
@@ -2551,6 +2553,9 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
		return vma;
	if (!(vma->vm_flags & VM_GROWSDOWN))
		return NULL;
+	/* don't alter vm_start if the coredump is running */
+	if (!mmget_still_valid(mm))
+		return NULL;
	start = vma->vm_start;
	if (expand_stack(vma, addr))
		return NULL;
...
@@ -8005,7 +8005,10 @@ void *__init alloc_large_system_hash(const char *tablename,
 bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
			 int migratetype, int flags)
 {
-	unsigned long pfn, iter, found;
+	unsigned long found;
+	unsigned long iter = 0;
+	unsigned long pfn = page_to_pfn(page);
+	const char *reason = "unmovable page";
 
	/*
	 * TODO we could make this much more efficient by not checking every
@@ -8015,17 +8018,20 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
	 * can still lead to having bootmem allocations in zone_movable.
	 */
 
-	/*
-	 * CMA allocations (alloc_contig_range) really need to mark isolate
-	 * CMA pageblocks even when they are not movable in fact so consider
-	 * them movable here.
-	 */
-	if (is_migrate_cma(migratetype) &&
-			is_migrate_cma(get_pageblock_migratetype(page)))
-		return false;
+	if (is_migrate_cma_page(page)) {
+		/*
+		 * CMA allocations (alloc_contig_range) really need to mark
+		 * isolate CMA pageblocks even when they are not movable in fact
+		 * so consider them movable here.
+		 */
+		if (is_migrate_cma(migratetype))
+			return false;
+
+		reason = "CMA page";
+		goto unmovable;
+	}
 
-	pfn = page_to_pfn(page);
-	for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
+	for (found = 0; iter < pageblock_nr_pages; iter++) {
		unsigned long check = pfn + iter;
 
		if (!pfn_valid_within(check))
@@ -8105,7 +8111,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
 unmovable:
	WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
	if (flags & REPORT_FAILURE)
-		dump_page(pfn_to_page(pfn+iter), "unmovable page");
+		dump_page(pfn_to_page(pfn + iter), reason);
	return true;
 }
...
@@ -1081,9 +1081,14 @@ static void shmem_evict_inode(struct inode *inode)
			}
			spin_unlock(&sbinfo->shrinklist_lock);
		}
-		if (!list_empty(&info->swaplist)) {
+		while (!list_empty(&info->swaplist)) {
+			/* Wait while shmem_unuse() is scanning this inode... */
+			wait_var_event(&info->stop_eviction,
+				       !atomic_read(&info->stop_eviction));
			mutex_lock(&shmem_swaplist_mutex);
-			list_del_init(&info->swaplist);
+			/* ...but beware of the race if we peeked too early */
+			if (!atomic_read(&info->stop_eviction))
+				list_del_init(&info->swaplist);
			mutex_unlock(&shmem_swaplist_mutex);
		}
	}
@@ -1099,10 +1104,11 @@ extern struct swap_info_struct *swap_info[];
 static int shmem_find_swap_entries(struct address_space *mapping,
				   pgoff_t start, unsigned int nr_entries,
				   struct page **entries, pgoff_t *indices,
-				   bool frontswap)
+				   unsigned int type, bool frontswap)
 {
	XA_STATE(xas, &mapping->i_pages, start);
	struct page *page;
+	swp_entry_t entry;
	unsigned int ret = 0;
 
	if (!nr_entries)
@@ -1116,13 +1122,12 @@ static int shmem_find_swap_entries(struct address_space *mapping,
		if (!xa_is_value(page))
			continue;
 
-		if (frontswap) {
-			swp_entry_t entry = radix_to_swp_entry(page);
-
-			if (!frontswap_test(swap_info[swp_type(entry)],
-					    swp_offset(entry)))
-				continue;
-		}
+		entry = radix_to_swp_entry(page);
+		if (swp_type(entry) != type)
+			continue;
+		if (frontswap &&
+		    !frontswap_test(swap_info[type], swp_offset(entry)))
+			continue;
 
		indices[ret] = xas.xa_index;
		entries[ret] = page;
@@ -1194,7 +1199,7 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type,
		pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
						  pvec.pages, indices,
-						  frontswap);
+						  type, frontswap);
		if (pvec.nr == 0) {
			ret = 0;
			break;
@@ -1227,36 +1232,27 @@ int shmem_unuse(unsigned int type, bool frontswap,
		unsigned long *fs_pages_to_unuse)
 {
	struct shmem_inode_info *info, *next;
-	struct inode *inode;
-	struct inode *prev_inode = NULL;
	int error = 0;
 
	if (list_empty(&shmem_swaplist))
		return 0;
 
	mutex_lock(&shmem_swaplist_mutex);
-
-	/*
-	 * The extra refcount on the inode is necessary to safely dereference
-	 * p->next after re-acquiring the lock. New shmem inodes with swap
-	 * get added to the end of the list and we will scan them all.
-	 */
	list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
		if (!info->swapped) {
			list_del_init(&info->swaplist);
			continue;
		}
-
-		inode = igrab(&info->vfs_inode);
-		if (!inode)
-			continue;
-
+		/*
+		 * Drop the swaplist mutex while searching the inode for swap;
+		 * but before doing so, make sure shmem_evict_inode() will not
+		 * remove placeholder inode from swaplist, nor let it be freed
+		 * (igrab() would protect from unlink, but not from unmount).
+		 */
+		atomic_inc(&info->stop_eviction);
		mutex_unlock(&shmem_swaplist_mutex);
-		if (prev_inode)
-			iput(prev_inode);
-		prev_inode = inode;
 
-		error = shmem_unuse_inode(inode, type, frontswap,
+		error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
					  fs_pages_to_unuse);
		cond_resched();
@@ -1264,14 +1260,13 @@ int shmem_unuse(unsigned int type, bool frontswap,
		next = list_next_entry(info, swaplist);
		if (!info->swapped)
			list_del_init(&info->swaplist);
+		if (atomic_dec_and_test(&info->stop_eviction))
+			wake_up_var(&info->stop_eviction);
		if (error)
			break;
	}
	mutex_unlock(&shmem_swaplist_mutex);
-	if (prev_inode)
-		iput(prev_inode);
 
	return error;
 }
@@ -2238,6 +2233,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
		info = SHMEM_I(inode);
		memset(info, 0, (char *)inode - (char *)info);
		spin_lock_init(&info->lock);
+		atomic_set(&info->stop_eviction, 0);
		info->seals = F_SEAL_SEAL;
		info->flags = flags & VM_NORESERVE;
		INIT_LIST_HEAD(&info->shrinklist);
...
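The new stop_eviction counter replaces the igrab()/iput() scheme with a small hold/release protocol built on wait_var_event()/wake_up_var(). Stripped of the shmem details, the protocol amounts to the following sketch (illustrative only; guarded_res is a made-up structure standing in for shmem_inode_info):

#include <linux/atomic.h>
#include <linux/wait_bit.h>

/* Sketch of the hold/release protocol built around stop_eviction. */
struct guarded_res {
	atomic_t stop_eviction;		/* > 0 while someone is working on it */
};

static void worker_start(struct guarded_res *res)
{
	atomic_inc(&res->stop_eviction);	/* pin: eviction must wait */
}

static void worker_done(struct guarded_res *res)
{
	/* pairs with the wait_var_event() in evict() below */
	if (atomic_dec_and_test(&res->stop_eviction))
		wake_up_var(&res->stop_eviction);
}

static void evict(struct guarded_res *res)
{
	/* sleep until no worker holds the object */
	wait_var_event(&res->stop_eviction,
		       !atomic_read(&res->stop_eviction));
	/* ... now safe to tear res down ... */
}

In the patch, shmem_unuse() plays the worker role (atomic_inc before dropping shmem_swaplist_mutex, atomic_dec_and_test plus wake_up_var once it is done with the inode), while shmem_evict_inode() is the waiter, re-checking the counter under the mutex to close the race flagged by the "...but beware" comment.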
@@ -2374,7 +2374,6 @@ static void *alloc_slabmgmt(struct kmem_cache *cachep,
		/* Slab management obj is off-slab. */
		freelist = kmem_cache_alloc_node(cachep->freelist_cache,
						 local_flags, nodeid);
-		freelist = kasan_reset_tag(freelist);
		if (!freelist)
			return NULL;
	} else {
...
@@ -2023,7 +2023,6 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
  * If the boolean frontswap is true, only unuse pages_to_unuse pages;
  * pages_to_unuse==0 means all pages; ignored if frontswap is false
  */
-#define SWAP_UNUSE_MAX_TRIES 3
 int try_to_unuse(unsigned int type, bool frontswap,
		 unsigned long pages_to_unuse)
 {
@@ -2035,7 +2034,6 @@ int try_to_unuse(unsigned int type, bool frontswap,
	struct page *page;
	swp_entry_t entry;
	unsigned int i;
-	int retries = 0;
 
	if (!si->inuse_pages)
		return 0;
@@ -2053,11 +2051,9 @@ int try_to_unuse(unsigned int type, bool frontswap,
	spin_lock(&mmlist_lock);
	p = &init_mm.mmlist;
-	while ((p = p->next) != &init_mm.mmlist) {
-		if (signal_pending(current)) {
-			retval = -EINTR;
-			break;
-		}
+	while (si->inuse_pages &&
+	       !signal_pending(current) &&
+	       (p = p->next) != &init_mm.mmlist) {
 
		mm = list_entry(p, struct mm_struct, mmlist);
		if (!mmget_not_zero(mm))
@@ -2084,7 +2080,9 @@ int try_to_unuse(unsigned int type, bool frontswap,
	mmput(prev_mm);
 
	i = 0;
-	while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
+	while (si->inuse_pages &&
+	       !signal_pending(current) &&
+	       (i = find_next_to_unuse(si, i, frontswap)) != 0) {
 
		entry = swp_entry(type, i);
		page = find_get_page(swap_address_space(entry), i);
@@ -2117,14 +2115,18 @@ int try_to_unuse(unsigned int type, bool frontswap,
	 * If yes, we would need to do retry the unuse logic again.
	 * Under global memory pressure, swap entries can be reinserted back
	 * into process space after the mmlist loop above passes over them.
-	 * Its not worth continuosuly retrying to unuse the swap in this case.
-	 * So we try SWAP_UNUSE_MAX_TRIES times.
+	 *
+	 * Limit the number of retries? No: when mmget_not_zero() above fails,
+	 * that mm is likely to be freeing swap from exit_mmap(), which proceeds
+	 * at its own independent pace; and even shmem_writepage() could have
+	 * been preempted after get_swap_page(), temporarily hiding that swap.
+	 * It's easy and robust (though cpu-intensive) just to keep retrying.
	 */
-	if (++retries >= SWAP_UNUSE_MAX_TRIES)
-		retval = -EBUSY;
-	else if (si->inuse_pages)
-		goto retry;
+	if (si->inuse_pages) {
+		if (!signal_pending(current))
+			goto retry;
+		retval = -EINTR;
+	}
 out:
	return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
 }
...
@@ -2176,7 +2176,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
  *    10TB     320        32GB
  */
 static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
-				 struct mem_cgroup *memcg,
				 struct scan_control *sc, bool actual_reclaim)
 {
	enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
@@ -2197,16 +2196,12 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
	inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
	active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
 
-	if (memcg)
-		refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
-	else
-		refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
-
	/*
	 * When refaults are being observed, it means a new workingset
	 * is being established. Disable active list protection to get
	 * rid of the stale workingset quickly.
	 */
+	refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
	if (file && actual_reclaim && lruvec->refaults != refaults) {
		inactive_ratio = 0;
	} else {
@@ -2227,12 +2222,10 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
 }
 
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
-				 struct lruvec *lruvec, struct mem_cgroup *memcg,
-				 struct scan_control *sc)
+				 struct lruvec *lruvec, struct scan_control *sc)
 {
	if (is_active_lru(lru)) {
-		if (inactive_list_is_low(lruvec, is_file_lru(lru),
-					 memcg, sc, true))
+		if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
			shrink_active_list(nr_to_scan, lruvec, sc, lru);
		return 0;
	}
@@ -2332,7 +2325,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
	 * anonymous pages on the LRU in eligible zones.
	 * Otherwise, the small LRU gets thrashed.
	 */
-	if (!inactive_list_is_low(lruvec, false, memcg, sc, false) &&
+	if (!inactive_list_is_low(lruvec, false, sc, false) &&
	    lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)
			>> sc->priority) {
		scan_balance = SCAN_ANON;
@@ -2350,7 +2343,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
	 * lruvec even if it has plenty of old anonymous pages unless the
	 * system is under heavy pressure.
	 */
-	if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
+	if (!inactive_list_is_low(lruvec, true, sc, false) &&
	    lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
		scan_balance = SCAN_FILE;
		goto out;
@@ -2503,7 +2496,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
				nr[lru] -= nr_to_scan;
 
				nr_reclaimed += shrink_list(lru, nr_to_scan,
-							    lruvec, memcg, sc);
+							    lruvec, sc);
			}
		}
@@ -2570,7 +2563,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
	 * Even if we did not try to evict anon pages at all, we want to
	 * rebalance the anon lru active/inactive ratio.
	 */
-	if (inactive_list_is_low(lruvec, false, memcg, sc, true))
+	if (inactive_list_is_low(lruvec, false, sc, true))
		shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
				   sc, LRU_ACTIVE_ANON);
 }
@@ -2969,12 +2962,8 @@ static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
		unsigned long refaults;
		struct lruvec *lruvec;
 
-		if (memcg)
-			refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
-		else
-			refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
-
		lruvec = mem_cgroup_lruvec(pgdat, memcg);
+		refaults = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE);
		lruvec->refaults = refaults;
	} while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
@@ -3339,7 +3328,7 @@ static void age_active_anon(struct pglist_data *pgdat,
	do {
		struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
 
-		if (inactive_list_is_low(lruvec, false, memcg, sc, true))
+		if (inactive_list_is_low(lruvec, false, sc, true))
			shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
					   sc, LRU_ACTIVE_ANON);
...
@@ -1274,13 +1274,8 @@ const char * const vmstat_text[] = {
 #endif
 #endif /* CONFIG_MEMORY_BALLOON */
 #ifdef CONFIG_DEBUG_TLBFLUSH
-#ifdef CONFIG_SMP
	"nr_tlb_remote_flush",
	"nr_tlb_remote_flush_received",
-#else
-	"", /* nr_tlb_remote_flush */
-	"", /* nr_tlb_remote_flush_received */
-#endif /* CONFIG_SMP */
	"nr_tlb_local_flush_all",
	"nr_tlb_local_flush_one",
 #endif /* CONFIG_DEBUG_TLBFLUSH */
...
@@ -187,8 +187,8 @@ static int make_exe(const uint8_t *payload, size_t len)
	ph.p_offset = 0;
	ph.p_vaddr = VADDR;
	ph.p_paddr = 0;
-	ph.p_filesz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + sizeof(payload);
-	ph.p_memsz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + sizeof(payload);
+	ph.p_filesz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + len;
+	ph.p_memsz = sizeof(struct elf64_hdr) + sizeof(struct elf64_phdr) + len;
	ph.p_align = 4096;
	fd = openat(AT_FDCWD, "/tmp", O_WRONLY|O_EXCL|O_TMPFILE, 0700);
...
@@ -46,12 +46,9 @@ static void fail(const char *fmt, unsigned long a, unsigned long b)
 int main(void)
 {
-	const unsigned int PAGE_SIZE = sysconf(_SC_PAGESIZE);
-#ifdef __arm__
-	unsigned long va = 2 * PAGE_SIZE;
-#else
-	unsigned long va = 0;
-#endif
+	const int PAGE_SIZE = sysconf(_SC_PAGESIZE);
+	const unsigned long va_max = 1UL << 32;
+	unsigned long va;
	void *p;
	int fd;
	unsigned long a, b;
@@ -60,10 +57,13 @@ int main(void)
	if (fd == -1)
		return 1;
 
-	p = mmap((void *)va, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_FILE|MAP_FIXED, fd, 0);
-	if (p == MAP_FAILED) {
-		if (errno == EPERM)
-			return 4;
+	for (va = 0; va < va_max; va += PAGE_SIZE) {
+		p = mmap((void *)va, PAGE_SIZE, PROT_NONE, MAP_PRIVATE|MAP_FILE|MAP_FIXED, fd, 0);
+		if (p == (void *)va)
+			break;
+	}
+	if (va == va_max) {
+		fprintf(stderr, "error: mmap doesn't like you\n");
		return 1;
	}
...