Commit b3c03db6 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
 "10 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  x86/mm: split vmalloc_sync_all()
  mm, slub: prevent kmalloc_node crashes and memory leaks
  mm/mmu_notifier: silence PROVE_RCU_LIST warnings
  epoll: fix possible lost wakeup on epoll_ctl() path
  mm: do not allow MADV_PAGEOUT for CoW pages
  mm, memcg: throttle allocators based on ancestral memory.high
  mm, memcg: fix corruption on 64-bit divisor in memory.high throttling
  page-flags: fix a crash at SetPageError(THP_SWAP)
  mm/hotplug: fix hot remove failure in SPARSEMEM|!VMEMMAP case
  memcg: fix NULL pointer dereference in __mem_cgroup_usage_unregister_event
parents b74b991f 763802b5
......@@ -190,7 +190,7 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
return pmd_k;
}
void vmalloc_sync_all(void)
static void vmalloc_sync(void)
{
unsigned long address;
......@@ -217,6 +217,16 @@ void vmalloc_sync_all(void)
}
}
void vmalloc_sync_mappings(void)
{
vmalloc_sync();
}
void vmalloc_sync_unmappings(void)
{
vmalloc_sync();
}
/*
* 32-bit:
*
......@@ -319,11 +329,23 @@ static void dump_pagetable(unsigned long address)
#else /* CONFIG_X86_64: */
void vmalloc_sync_all(void)
void vmalloc_sync_mappings(void)
{
/*
* 64-bit mappings might allocate new p4d/pud pages
* that need to be propagated to all tasks' PGDs.
*/
sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
}
void vmalloc_sync_unmappings(void)
{
/*
* Unmappings never allocate or free p4d/pud pages.
* No work is required here.
*/
}
/*
* 64-bit:
*
......
......@@ -171,7 +171,7 @@ int ghes_estatus_pool_init(int num_ghes)
* New allocation must be visible in all pgd before it can be found by
* an NMI allocating from the pool.
*/
vmalloc_sync_all();
vmalloc_sync_mappings();
rc = gen_pool_add(ghes_estatus_pool, addr, PAGE_ALIGN(len), -1);
if (rc)
......
......@@ -1854,9 +1854,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
waiter = true;
init_waitqueue_entry(&wait, current);
spin_lock_irq(&ep->wq.lock);
write_lock_irq(&ep->lock);
__add_wait_queue_exclusive(&ep->wq, &wait);
spin_unlock_irq(&ep->wq.lock);
write_unlock_irq(&ep->lock);
}
for (;;) {
......@@ -1904,9 +1904,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
goto fetch_events;
if (waiter) {
spin_lock_irq(&ep->wq.lock);
write_lock_irq(&ep->lock);
__remove_wait_queue(&ep->wq, &wait);
spin_unlock_irq(&ep->wq.lock);
write_unlock_irq(&ep->lock);
}
return res;
......
......@@ -311,7 +311,7 @@ static inline int TestClearPage##uname(struct page *page) { return 0; }
__PAGEFLAG(Locked, locked, PF_NO_TAIL)
PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD)
PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND)
PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL)
PAGEFLAG(Referenced, referenced, PF_HEAD)
TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
__SETPAGEFLAG(Referenced, referenced, PF_HEAD)
......
......@@ -141,7 +141,8 @@ extern int remap_vmalloc_range_partial(struct vm_area_struct *vma,
extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
unsigned long pgoff);
void vmalloc_sync_all(void);
void vmalloc_sync_mappings(void);
void vmalloc_sync_unmappings(void);
/*
* Lowlevel-APIs (not for driver use!)
......
......@@ -519,7 +519,7 @@ NOKPROBE_SYMBOL(notify_die);
int register_die_notifier(struct notifier_block *nb)
{
vmalloc_sync_all();
vmalloc_sync_mappings();
return atomic_notifier_chain_register(&die_chain, nb);
}
EXPORT_SYMBOL_GPL(register_die_notifier);
......
......@@ -335,12 +335,14 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
}
page = pmd_page(orig_pmd);
if (next - addr != HPAGE_PMD_SIZE) {
int err;
/* Do not interfere with other mappings of this page */
if (page_mapcount(page) != 1)
goto huge_unlock;
if (next - addr != HPAGE_PMD_SIZE) {
int err;
get_page(page);
spin_unlock(ptl);
lock_page(page);
......@@ -426,6 +428,10 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
continue;
}
/* Do not interfere with other mappings of this page */
if (page_mapcount(page) != 1)
continue;
VM_BUG_ON_PAGE(PageTransCompound(page), page);
if (pte_young(ptent)) {
......
......@@ -2297,28 +2297,41 @@ static void high_work_func(struct work_struct *work)
#define MEMCG_DELAY_SCALING_SHIFT 14
/*
* Scheduled by try_charge() to be executed from the userland return path
* and reclaims memory over the high limit.
* Get the number of jiffies that we should penalise a mischievous cgroup which
* is exceeding its memory.high by checking both it and its ancestors.
*/
void mem_cgroup_handle_over_high(void)
static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
unsigned int nr_pages)
{
unsigned long usage, high, clamped_high;
unsigned long pflags;
unsigned long penalty_jiffies, overage;
unsigned int nr_pages = current->memcg_nr_pages_over_high;
struct mem_cgroup *memcg;
unsigned long penalty_jiffies;
u64 max_overage = 0;
if (likely(!nr_pages))
return;
do {
unsigned long usage, high;
u64 overage;
memcg = get_mem_cgroup_from_mm(current->mm);
reclaim_high(memcg, nr_pages, GFP_KERNEL);
current->memcg_nr_pages_over_high = 0;
usage = page_counter_read(&memcg->memory);
high = READ_ONCE(memcg->high);
/*
* Prevent division by 0 in overage calculation by acting as if
* it was a threshold of 1 page
*/
high = max(high, 1UL);
overage = usage - high;
overage <<= MEMCG_DELAY_PRECISION_SHIFT;
overage = div64_u64(overage, high);
if (overage > max_overage)
max_overage = overage;
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
if (!max_overage)
return 0;
/*
* memory.high is breached and reclaim is unable to keep up. Throttle
* allocators proactively to slow down excessive growth.
*
* We use overage compared to memory.high to calculate the number of
* jiffies to sleep (penalty_jiffies). Ideally this value should be
* fairly lenient on small overages, and increasingly harsh when the
......@@ -2326,24 +2339,9 @@ void mem_cgroup_handle_over_high(void)
* its crazy behaviour, so we exponentially increase the delay based on
* overage amount.
*/
usage = page_counter_read(&memcg->memory);
high = READ_ONCE(memcg->high);
if (usage <= high)
goto out;
/*
* Prevent division by 0 in overage calculation by acting as if it was a
* threshold of 1 page
*/
clamped_high = max(high, 1UL);
overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
clamped_high);
penalty_jiffies = ((u64)overage * overage * HZ)
>> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
penalty_jiffies = max_overage * max_overage * HZ;
penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
/*
* Factor in the task's own contribution to the overage, such that four
......@@ -2360,7 +2358,32 @@ void mem_cgroup_handle_over_high(void)
* application moving forwards and also permit diagnostics, albeit
* extremely slowly.
*/
penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
}
/*
* Scheduled by try_charge() to be executed from the userland return path
* and reclaims memory over the high limit.
*/
void mem_cgroup_handle_over_high(void)
{
unsigned long penalty_jiffies;
unsigned long pflags;
unsigned int nr_pages = current->memcg_nr_pages_over_high;
struct mem_cgroup *memcg;
if (likely(!nr_pages))
return;
memcg = get_mem_cgroup_from_mm(current->mm);
reclaim_high(memcg, nr_pages, GFP_KERNEL);
current->memcg_nr_pages_over_high = 0;
/*
* memory.high is breached and reclaim is unable to keep up. Throttle
* allocators proactively to slow down excessive growth.
*/
penalty_jiffies = calculate_high_delay(memcg, nr_pages);
/*
* Don't sleep if the amount of jiffies this memcg owes us is so low
......@@ -4027,7 +4050,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
unsigned long usage;
int i, j, size;
int i, j, size, entries;
mutex_lock(&memcg->thresholds_lock);
......@@ -4047,14 +4070,20 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
/* Calculate new number of threshold */
size = 0;
size = entries = 0;
for (i = 0; i < thresholds->primary->size; i++) {
if (thresholds->primary->entries[i].eventfd != eventfd)
size++;
else
entries++;
}
new = thresholds->spare;
/* If no items related to eventfd have been cleared, nothing to do */
if (!entries)
goto unlock;
/* Set thresholds array to NULL if we don't have thresholds */
if (!size) {
kfree(new);
......
......@@ -307,7 +307,8 @@ static void mn_hlist_release(struct mmu_notifier_subscriptions *subscriptions,
* ->release returns.
*/
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist)
hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
srcu_read_lock_held(&srcu))
/*
* If ->release runs before mmu_notifier_unregister it must be
* handled, as it's the only way for the driver to flush all
......@@ -370,7 +371,8 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(subscription,
&mm->notifier_subscriptions->list, hlist) {
&mm->notifier_subscriptions->list, hlist,
srcu_read_lock_held(&srcu)) {
if (subscription->ops->clear_flush_young)
young |= subscription->ops->clear_flush_young(
subscription, mm, start, end);
......@@ -389,7 +391,8 @@ int __mmu_notifier_clear_young(struct mm_struct *mm,
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(subscription,
&mm->notifier_subscriptions->list, hlist) {
&mm->notifier_subscriptions->list, hlist,
srcu_read_lock_held(&srcu)) {
if (subscription->ops->clear_young)
young |= subscription->ops->clear_young(subscription,
mm, start, end);
......@@ -407,7 +410,8 @@ int __mmu_notifier_test_young(struct mm_struct *mm,
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(subscription,
&mm->notifier_subscriptions->list, hlist) {
&mm->notifier_subscriptions->list, hlist,
srcu_read_lock_held(&srcu)) {
if (subscription->ops->test_young) {
young = subscription->ops->test_young(subscription, mm,
address);
......@@ -428,7 +432,8 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(subscription,
&mm->notifier_subscriptions->list, hlist) {
&mm->notifier_subscriptions->list, hlist,
srcu_read_lock_held(&srcu)) {
if (subscription->ops->change_pte)
subscription->ops->change_pte(subscription, mm, address,
pte);
......@@ -476,7 +481,8 @@ static int mn_hlist_invalidate_range_start(
int id;
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) {
hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
srcu_read_lock_held(&srcu)) {
const struct mmu_notifier_ops *ops = subscription->ops;
if (ops->invalidate_range_start) {
......@@ -528,7 +534,8 @@ mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions,
int id;
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist) {
hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
srcu_read_lock_held(&srcu)) {
/*
* Call invalidate_range here too to avoid the need for the
* subsystem of having to register an invalidate_range_end
......@@ -582,7 +589,8 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
id = srcu_read_lock(&srcu);
hlist_for_each_entry_rcu(subscription,
&mm->notifier_subscriptions->list, hlist) {
&mm->notifier_subscriptions->list, hlist,
srcu_read_lock_held(&srcu)) {
if (subscription->ops->invalidate_range)
subscription->ops->invalidate_range(subscription, mm,
start, end);
......@@ -714,7 +722,8 @@ find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops)
spin_lock(&mm->notifier_subscriptions->lock);
hlist_for_each_entry_rcu(subscription,
&mm->notifier_subscriptions->list, hlist) {
&mm->notifier_subscriptions->list, hlist,
lockdep_is_held(&mm->notifier_subscriptions->lock)) {
if (subscription->ops != ops)
continue;
......
......@@ -370,10 +370,14 @@ void vm_unmap_aliases(void)
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
/*
* Implement a stub for vmalloc_sync_all() if the architecture chose not to
* have one.
* Implement a stub for vmalloc_sync_[un]mapping() if the architecture
* chose not to have one.
*/
void __weak vmalloc_sync_all(void)
void __weak vmalloc_sync_mappings(void)
{
}
void __weak vmalloc_sync_unmappings(void)
{
}
......
......@@ -1973,8 +1973,6 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
if (node == NUMA_NO_NODE)
searchnode = numa_mem_id();
else if (!node_present_pages(node))
searchnode = node_to_mem_node(node);
object = get_partial_node(s, get_node(s, searchnode), c, flags);
if (object || node != NUMA_NO_NODE)
......@@ -2563,17 +2561,27 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
struct page *page;
page = c->page;
if (!page)
if (!page) {
/*
* if the node is not online or has no normal memory, just
* ignore the node constraint
*/
if (unlikely(node != NUMA_NO_NODE &&
!node_state(node, N_NORMAL_MEMORY)))
node = NUMA_NO_NODE;
goto new_slab;
}
redo:
if (unlikely(!node_match(page, node))) {
int searchnode = node;
if (node != NUMA_NO_NODE && !node_present_pages(node))
searchnode = node_to_mem_node(node);
if (unlikely(!node_match(page, searchnode))) {
/*
* same as above but node_match() being false already
* implies node != NUMA_NO_NODE
*/
if (!node_state(node, N_NORMAL_MEMORY)) {
node = NUMA_NO_NODE;
goto redo;
} else {
stat(s, ALLOC_NODE_MISMATCH);
deactivate_slab(s, page, c->freelist, c);
goto new_slab;
......
......@@ -734,6 +734,7 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
struct mem_section *ms = __pfn_to_section(pfn);
bool section_is_early = early_section(ms);
struct page *memmap = NULL;
bool empty;
unsigned long *subsection_map = ms->usage
? &ms->usage->subsection_map[0] : NULL;
......@@ -764,7 +765,8 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
* For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
*/
bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
empty = bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION);
if (empty) {
unsigned long section_nr = pfn_to_section_nr(pfn);
/*
......@@ -779,13 +781,15 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
ms->usage = NULL;
}
memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
ms->section_mem_map = (unsigned long)NULL;
}
if (section_is_early && memmap)
free_map_bootmem(memmap);
else
depopulate_section_memmap(pfn, nr_pages, altmap);
if (empty)
ms->section_mem_map = (unsigned long)NULL;
}
static struct page * __meminit section_activate(int nid, unsigned long pfn,
......
......@@ -1295,7 +1295,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
* First make sure the mappings are removed from all page-tables
* before they are freed.
*/
vmalloc_sync_all();
vmalloc_sync_unmappings();
/*
* TODO: to calculate a flush range without looping.
......@@ -3128,16 +3128,19 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
EXPORT_SYMBOL(remap_vmalloc_range);
/*
* Implement a stub for vmalloc_sync_all() if the architecture chose not to
* have one.
* Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose
* not to have one.
*
* The purpose of this function is to make sure the vmalloc area
* mappings are identical in all page-tables in the system.
*/
void __weak vmalloc_sync_all(void)
void __weak vmalloc_sync_mappings(void)
{
}
void __weak vmalloc_sync_unmappings(void)
{
}
static int f(pte_t *pte, unsigned long addr, void *data)
{
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment