Commit 5af9c2e1 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'akpm' (patches from Andrew)

Merge fixes from Andrew Morton:
 "22 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (22 commits)
  epoll: restrict EPOLLEXCLUSIVE to POLLIN and POLLOUT
  radix-tree: fix oops after radix_tree_iter_retry
  MAINTAINERS: trim the file triggers for ABI/API
  dax: dirty inode only if required
  thp: make deferred_split_scan() work again
  mm: replace vma_lock_anon_vma with anon_vma_lock_read/write
  ocfs2/dlm: clear refmap bit of recovery lock while doing local recovery cleanup
  um: asm/page.h: remove the pte_high member from struct pte_t
  mm, hugetlb: don't require CMA for runtime gigantic pages
  mm/hugetlb: fix gigantic page initialization/allocation
  mm: downgrade VM_BUG in isolate_lru_page() to warning
  mempolicy: do not try to queue pages from !vma_migratable()
  mm, vmstat: fix wrong WQ sleep when memory reclaim doesn't make any progress
  vmstat: make vmstat_update deferrable
  mm, vmstat: make quiet_vmstat lighter
  mm/Kconfig: correct description of DEFERRED_STRUCT_PAGE_INIT
  memblock: don't mark memblock_phys_mem_size() as __init
  dump_stack: avoid potential deadlocks
  mm: validate_mm browse_rb SMP race condition
  m32r: fix build failure due to SMP and MMU
  ...
parents 5d6a6a75 b6a515c8
......@@ -223,9 +223,7 @@ F: drivers/scsi/aacraid/
ABI/API
L: linux-api@vger.kernel.org
F: Documentation/ABI/
F: include/linux/syscalls.h
F: include/uapi/
F: kernel/sys_ni.c
ABIT UGURU 1,2 HARDWARE MONITOR DRIVER
......
......@@ -276,6 +276,7 @@ source "kernel/Kconfig.preempt"
config SMP
bool "Symmetric multi-processing support"
depends on MMU
---help---
This enables support for systems with more than one CPU. If you have
a system with only one CPU, say N. If you have a system with more
......
......@@ -34,21 +34,18 @@ struct page;
#if defined(CONFIG_3_LEVEL_PGTABLES) && !defined(CONFIG_64BIT)
typedef struct { unsigned long pte_low, pte_high; } pte_t;
typedef struct { unsigned long pte; } pte_t;
typedef struct { unsigned long pmd; } pmd_t;
typedef struct { unsigned long pgd; } pgd_t;
#define pte_val(x) ((x).pte_low | ((unsigned long long) (x).pte_high << 32))
#define pte_get_bits(pte, bits) ((pte).pte_low & (bits))
#define pte_set_bits(pte, bits) ((pte).pte_low |= (bits))
#define pte_clear_bits(pte, bits) ((pte).pte_low &= ~(bits))
#define pte_copy(to, from) ({ (to).pte_high = (from).pte_high; \
smp_wmb(); \
(to).pte_low = (from).pte_low; })
#define pte_is_zero(pte) (!((pte).pte_low & ~_PAGE_NEWPAGE) && !(pte).pte_high)
#define pte_set_val(pte, phys, prot) \
({ (pte).pte_high = (phys) >> 32; \
(pte).pte_low = (phys) | pgprot_val(prot); })
#define pte_val(p) ((p).pte)
#define pte_get_bits(p, bits) ((p).pte & (bits))
#define pte_set_bits(p, bits) ((p).pte |= (bits))
#define pte_clear_bits(p, bits) ((p).pte &= ~(bits))
#define pte_copy(to, from) ({ (to).pte = (from).pte; })
#define pte_is_zero(p) (!((p).pte & ~_PAGE_NEWPAGE))
#define pte_set_val(p, phys, prot) \
({ (p).pte = (phys) | pgprot_val(prot); })
#define pmd_val(x) ((x).pmd)
#define __pmd(x) ((pmd_t) { (x) } )
......
......@@ -173,10 +173,10 @@ static __init int setup_hugepagesz(char *opt)
}
__setup("hugepagesz=", setup_hugepagesz);
#ifdef CONFIG_CMA
#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
static __init int gigantic_pages_init(void)
{
/* With CMA we can allocate gigantic pages at runtime */
/* With compaction or CMA we can allocate gigantic pages at runtime */
if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT))
hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
return 0;
......
......@@ -1730,6 +1730,12 @@ static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
return __dax_fault(vma, vmf, blkdev_get_block, NULL);
}
static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
struct vm_fault *vmf)
{
return dax_pfn_mkwrite(vma, vmf);
}
static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, unsigned int flags)
{
......@@ -1739,7 +1745,7 @@ static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
static const struct vm_operations_struct blkdev_dax_vm_ops = {
.fault = blkdev_dax_fault,
.pmd_fault = blkdev_dax_pmd_fault,
.pfn_mkwrite = blkdev_dax_fault,
.pfn_mkwrite = blkdev_dax_pfn_mkwrite,
};
static const struct vm_operations_struct blkdev_default_vm_ops = {
......
......@@ -358,7 +358,8 @@ static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
void *entry;
WARN_ON_ONCE(pmd_entry && !dirty);
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
spin_lock_irq(&mapping->tree_lock);
......
......@@ -94,6 +94,11 @@
/* Epoll private bits inside the event mask */
#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
#define EPOLLINOUT_BITS (POLLIN | POLLOUT)
#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | POLLERR | POLLHUP | \
EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
/* Maximum number of nesting allowed inside epoll sets */
#define EP_MAX_NESTS 4
......@@ -1068,7 +1073,22 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
* wait list.
*/
if (waitqueue_active(&ep->wq)) {
ewake = 1;
if ((epi->event.events & EPOLLEXCLUSIVE) &&
!((unsigned long)key & POLLFREE)) {
switch ((unsigned long)key & EPOLLINOUT_BITS) {
case POLLIN:
if (epi->event.events & POLLIN)
ewake = 1;
break;
case POLLOUT:
if (epi->event.events & POLLOUT)
ewake = 1;
break;
case 0:
ewake = 1;
break;
}
}
wake_up_locked(&ep->wq);
}
if (waitqueue_active(&ep->poll_wait))
......@@ -1875,9 +1895,13 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
* so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
* Also, we do not currently supported nested exclusive wakeups.
*/
if ((epds.events & EPOLLEXCLUSIVE) && (op == EPOLL_CTL_MOD ||
(op == EPOLL_CTL_ADD && is_file_epoll(tf.file))))
goto error_tgt_fput;
if (epds.events & EPOLLEXCLUSIVE) {
if (op == EPOLL_CTL_MOD)
goto error_tgt_fput;
if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
(epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
goto error_tgt_fput;
}
/*
* At this point it is safe to assume that the "private_data" contains
......@@ -1950,8 +1974,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
break;
case EPOLL_CTL_MOD:
if (epi) {
epds.events |= POLLERR | POLLHUP;
error = ep_modify(ep, epi, &epds);
if (!(epi->event.events & EPOLLEXCLUSIVE)) {
epds.events |= POLLERR | POLLHUP;
error = ep_modify(ep, epi, &epds);
}
} else
error = -ENOENT;
break;
......
......@@ -2367,6 +2367,8 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
break;
}
}
dlm_lockres_clear_refmap_bit(dlm, res,
dead_node);
spin_unlock(&res->spinlock);
continue;
}
......
......@@ -547,16 +547,16 @@ static inline bool pm_suspended_storage(void)
}
#endif /* CONFIG_PM_SLEEP */
#ifdef CONFIG_CMA
#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
/* The below functions must be run on a range from a single zone. */
extern int alloc_contig_range(unsigned long start, unsigned long end,
unsigned migratetype);
extern void free_contig_range(unsigned long pfn, unsigned nr_pages);
#endif
#ifdef CONFIG_CMA
/* CMA stuff */
extern void init_cma_reserved_pageblock(struct page *page);
#endif
#endif /* __LINUX_GFP_H */
......@@ -400,7 +400,7 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter)
* @iter: pointer to radix tree iterator
* Returns: current chunk size
*/
static __always_inline unsigned
static __always_inline long
radix_tree_chunk_size(struct radix_tree_iter *iter)
{
return iter->next_index - iter->index;
......@@ -434,9 +434,9 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
return slot + offset + 1;
}
} else {
unsigned size = radix_tree_chunk_size(iter) - 1;
long size = radix_tree_chunk_size(iter);
while (size--) {
while (--size > 0) {
slot++;
iter->index++;
if (likely(*slot))
......
......@@ -109,20 +109,6 @@ static inline void put_anon_vma(struct anon_vma *anon_vma)
__put_anon_vma(anon_vma);
}
static inline void vma_lock_anon_vma(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
if (anon_vma)
down_write(&anon_vma->root->rwsem);
}
static inline void vma_unlock_anon_vma(struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
if (anon_vma)
up_write(&anon_vma->root->rwsem);
}
static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
down_write(&anon_vma->root->rwsem);
......
......@@ -3508,8 +3508,10 @@ static int sigsuspend(sigset_t *set)
current->saved_sigmask = current->blocked;
set_current_blocked(set);
__set_current_state(TASK_INTERRUPTIBLE);
schedule();
while (!signal_pending(current)) {
__set_current_state(TASK_INTERRUPTIBLE);
schedule();
}
set_restore_sigmask();
return -ERESTARTNOHAND;
}
......
......@@ -25,6 +25,7 @@ static atomic_t dump_lock = ATOMIC_INIT(-1);
asmlinkage __visible void dump_stack(void)
{
unsigned long flags;
int was_locked;
int old;
int cpu;
......@@ -33,9 +34,8 @@ asmlinkage __visible void dump_stack(void)
* Permit this cpu to perform nested stack dumps while serialising
* against other CPUs
*/
preempt_disable();
retry:
local_irq_save(flags);
cpu = smp_processor_id();
old = atomic_cmpxchg(&dump_lock, -1, cpu);
if (old == -1) {
......@@ -43,6 +43,7 @@ asmlinkage __visible void dump_stack(void)
} else if (old == cpu) {
was_locked = 1;
} else {
local_irq_restore(flags);
cpu_relax();
goto retry;
}
......@@ -52,7 +53,7 @@ asmlinkage __visible void dump_stack(void)
if (!was_locked)
atomic_set(&dump_lock, -1);
preempt_enable();
local_irq_restore(flags);
}
#else
asmlinkage __visible void dump_stack(void)
......
......@@ -624,7 +624,7 @@ config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
bool
config DEFERRED_STRUCT_PAGE_INIT
bool "Defer initialisation of struct pages to kswapd"
bool "Defer initialisation of struct pages to kthreads"
default n
depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
depends on MEMORY_HOTPLUG
......@@ -633,9 +633,10 @@ config DEFERRED_STRUCT_PAGE_INIT
single thread. On very large machines this can take a considerable
amount of time. If this option is set, large machines will bring up
a subset of memmap at boot and then initialise the rest in parallel
when kswapd starts. This has a potential performance impact on
processes running early in the lifetime of the systemm until kswapd
finishes the initialisation.
by starting one-off "pgdatinitX" kernel thread for each node X. This
has a potential performance impact on processes running early in the
lifetime of the system until these kthreads finish the
initialisation.
config IDLE_PAGE_TRACKING
bool "Enable idle page tracking"
......
......@@ -989,7 +989,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
* here rather than calling cond_resched().
*/
if (current->flags & PF_WQ_WORKER)
schedule_timeout(1);
schedule_timeout_uninterruptible(1);
else
cond_resched();
......
......@@ -3482,7 +3482,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
spin_lock_irqsave(&pgdata->split_queue_lock, flags);
/* Take pin on all head pages to avoid freeing them under us */
list_for_each_safe(pos, next, &list) {
list_for_each_safe(pos, next, &pgdata->split_queue) {
page = list_entry((void *)pos, struct page, mapping);
page = compound_head(page);
if (get_page_unless_zero(page)) {
......
......@@ -1001,7 +1001,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
#if defined(CONFIG_X86_64) && ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA))
static void destroy_compound_gigantic_page(struct page *page,
unsigned int order)
{
......@@ -1214,8 +1214,8 @@ void free_huge_page(struct page *page)
set_page_private(page, 0);
page->mapping = NULL;
BUG_ON(page_count(page));
BUG_ON(page_mapcount(page));
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(page_mapcount(page), page);
restore_reserve = PagePrivate(page);
ClearPagePrivate(page);
......@@ -1286,6 +1286,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
set_page_count(p, 0);
set_compound_head(p, page);
}
atomic_set(compound_mapcount_ptr(page), -1);
}
/*
......
......@@ -1448,7 +1448,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
* Remaining API functions
*/
phys_addr_t __init memblock_phys_mem_size(void)
phys_addr_t __init_memblock memblock_phys_mem_size(void)
{
return memblock.memory.total_size;
}
......
......@@ -548,8 +548,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
goto retry;
}
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
migrate_page_add(page, qp->pagelist, flags);
migrate_page_add(page, qp->pagelist, flags);
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
......@@ -625,7 +624,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
if (vma->vm_flags & VM_PFNMAP)
if (!vma_migratable(vma))
return 1;
if (endvma > end)
......@@ -644,16 +643,13 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
if (vma_migratable(vma) &&
vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
change_prot_numa(vma, start, endvma);
return 1;
}
if ((flags & MPOL_MF_STRICT) ||
((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
vma_migratable(vma)))
/* queue pages from current vma */
/* queue pages from current vma */
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
return 0;
return 1;
}
......
......@@ -390,8 +390,9 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma)
}
#ifdef CONFIG_DEBUG_VM_RB
static int browse_rb(struct rb_root *root)
static int browse_rb(struct mm_struct *mm)
{
struct rb_root *root = &mm->mm_rb;
int i = 0, j, bug = 0;
struct rb_node *nd, *pn = NULL;
unsigned long prev = 0, pend = 0;
......@@ -414,12 +415,14 @@ static int browse_rb(struct rb_root *root)
vma->vm_start, vma->vm_end);
bug = 1;
}
spin_lock(&mm->page_table_lock);
if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
pr_emerg("free gap %lx, correct %lx\n",
vma->rb_subtree_gap,
vma_compute_subtree_gap(vma));
bug = 1;
}
spin_unlock(&mm->page_table_lock);
i++;
pn = nd;
prev = vma->vm_start;
......@@ -456,12 +459,16 @@ static void validate_mm(struct mm_struct *mm)
struct vm_area_struct *vma = mm->mmap;
while (vma) {
struct anon_vma *anon_vma = vma->anon_vma;
struct anon_vma_chain *avc;
vma_lock_anon_vma(vma);
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
anon_vma_interval_tree_verify(avc);
vma_unlock_anon_vma(vma);
if (anon_vma) {
anon_vma_lock_read(anon_vma);
list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
anon_vma_interval_tree_verify(avc);
anon_vma_unlock_read(anon_vma);
}
highest_address = vma->vm_end;
vma = vma->vm_next;
i++;
......@@ -475,7 +482,7 @@ static void validate_mm(struct mm_struct *mm)
mm->highest_vm_end, highest_address);
bug = 1;
}
i = browse_rb(&mm->mm_rb);
i = browse_rb(mm);
if (i != mm->map_count) {
if (i != -1)
pr_emerg("map_count %d rb %d\n", mm->map_count, i);
......@@ -2142,32 +2149,27 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
int error;
int error = 0;
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
/*
* We must make sure the anon_vma is allocated
* so that the anon_vma locking is not a noop.
*/
/* Guard against wrapping around to address 0. */
if (address < PAGE_ALIGN(address+4))
address = PAGE_ALIGN(address+4);
else
return -ENOMEM;
/* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
vma_lock_anon_vma(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_sem in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
* Also guard against wrapping around to address 0.
*/
if (address < PAGE_ALIGN(address+4))
address = PAGE_ALIGN(address+4);
else {
vma_unlock_anon_vma(vma);
return -ENOMEM;
}
error = 0;
anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
if (address > vma->vm_end) {
......@@ -2185,7 +2187,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
* updates, but we only hold a shared mmap_sem
* lock here, so we need to protect against
* concurrent vma expansions.
* vma_lock_anon_vma() doesn't help here, as
* anon_vma_lock_write() doesn't help here, as
* we don't guarantee that all growable vmas
* in a mm share the same root anon vma.
* So, we reuse mm->page_table_lock to guard
......@@ -2208,7 +2210,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
}
}
vma_unlock_anon_vma(vma);
anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma_merge(vma, vma->vm_flags);
validate_mm(mm);
return error;
......@@ -2224,25 +2226,21 @@ int expand_downwards(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
int error;
/*
* We must make sure the anon_vma is allocated
* so that the anon_vma locking is not a noop.
*/
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
address &= PAGE_MASK;
error = security_mmap_addr(address);
if (error)
return error;
vma_lock_anon_vma(vma);
/* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
/*
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_sem in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
*/
anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
if (address < vma->vm_start) {
......@@ -2260,7 +2258,7 @@ int expand_downwards(struct vm_area_struct *vma,
* updates, but we only hold a shared mmap_sem
* lock here, so we need to protect against
* concurrent vma expansions.
* vma_lock_anon_vma() doesn't help here, as
* anon_vma_lock_write() doesn't help here, as
* we don't guarantee that all growable vmas
* in a mm share the same root anon vma.
* So, we reuse mm->page_table_lock to guard
......@@ -2281,7 +2279,7 @@ int expand_downwards(struct vm_area_struct *vma,
}
}
}
vma_unlock_anon_vma(vma);
anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma_merge(vma, vma->vm_flags);
validate_mm(mm);
return error;
......
......@@ -6620,7 +6620,7 @@ bool is_pageblock_removable_nolock(struct page *page)
return !has_unmovable_pages(zone, page, 0, true);
}
#ifdef CONFIG_CMA
#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
static unsigned long pfn_max_align_down(unsigned long pfn)
{
......
......@@ -1443,7 +1443,7 @@ int isolate_lru_page(struct page *page)
int ret = -EBUSY;
VM_BUG_ON_PAGE(!page_count(page), page);
VM_BUG_ON_PAGE(PageTail(page), page);
WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
......
......@@ -1396,10 +1396,15 @@ static void vmstat_update(struct work_struct *w)
* Counters were updated so we expect more updates
* to occur in the future. Keep on running the
* update worker thread.
* If we were marked on cpu_stat_off clear the flag
* so that vmstat_shepherd doesn't schedule us again.
*/
queue_delayed_work_on(smp_processor_id(), vmstat_wq,
this_cpu_ptr(&vmstat_work),
round_jiffies_relative(sysctl_stat_interval));
if (!cpumask_test_and_clear_cpu(smp_processor_id(),
cpu_stat_off)) {
queue_delayed_work_on(smp_processor_id(), vmstat_wq,
this_cpu_ptr(&vmstat_work),
round_jiffies_relative(sysctl_stat_interval));
}
} else {
/*
* We did not update any counters so the app may be in
......@@ -1417,18 +1422,6 @@ static void vmstat_update(struct work_struct *w)
* until the diffs stay at zero. The function is used by NOHZ and can only be
* invoked when tick processing is not active.
*/
void quiet_vmstat(void)
{
if (system_state != SYSTEM_RUNNING)
return;
do {
if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
cancel_delayed_work(this_cpu_ptr(&vmstat_work));
} while (refresh_cpu_vm_stats(false));
}
/*
* Check if the diffs for a certain cpu indicate that
* an update is needed.
......@@ -1452,6 +1445,30 @@ static bool need_update(int cpu)
return false;
}
void quiet_vmstat(void)
{
if (system_state != SYSTEM_RUNNING)
return;
/*
* If we are already in hands of the shepherd then there
* is nothing for us to do here.
*/
if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
return;
if (!need_update(smp_processor_id()))
return;
/*
* Just refresh counters and do not care about the pending delayed
* vmstat_update. It doesn't fire that often to matter and canceling
* it would be too expensive from this path.
* vmstat_shepherd will take care about that for us.
*/
refresh_cpu_vm_stats(false);
}
/*
* Shepherd worker thread that checks the
......@@ -1469,18 +1486,25 @@ static void vmstat_shepherd(struct work_struct *w)
get_online_cpus();
/* Check processors whose vmstat worker threads have been disabled */
for_each_cpu(cpu, cpu_stat_off)
if (need_update(cpu) &&
cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
queue_delayed_work_on(cpu, vmstat_wq,
&per_cpu(vmstat_work, cpu), 0);
for_each_cpu(cpu, cpu_stat_off) {
struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
if (need_update(cpu)) {
if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
} else {
/*
* Cancel the work if quiet_vmstat has put this
* cpu on cpu_stat_off because the work item might
* be still scheduled
*/
cancel_delayed_work(dw);
}
}
put_online_cpus();
schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
}
static void __init start_shepherd_timer(void)
......@@ -1488,7 +1512,7 @@ static void __init start_shepherd_timer(void)
int cpu;
for_each_possible_cpu(cpu)
INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu),
INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
vmstat_update);
if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment