Commit b4e98d9a authored by Kirill A. Shutemov, committed by Linus Torvalds

mm: account pud page tables

On a machine with 5-level paging support a process can allocate a
significant amount of memory and stay unnoticed by the oom-killer and
memory cgroup.  The trick is to allocate a lot of PUD page tables.  We
don't account PUD page tables, only PMD and PTE.

We already addressed the same issue for PMD page tables, see commit
dc6c9a35 ("mm: account pmd page tables to the process").
Introduction of 5-level paging brings the same issue for PUD page
tables.

The patch expands accounting to PUD level.
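
As an illustration (not part of the patch), below is a hypothetical
userspace sketch of the trick.  It assumes an x86-64 machine booted with
5-level paging (la57): there a PUD page table is a single 4 KiB page
covering 512 GiB of virtual address space, so the ~64 PiB user address
space has room for roughly 2^17 such tables, i.e. about 512 MiB of
page-table memory that the oom-killer and memcg could not see before this
patch.  The constants below and the reliance on mmap() honouring high hint
addresses are assumptions of the sketch, not part of this commit:

  /*
   * Hypothetical sketch: map one page in each 512 GiB-aligned region above
   * the 47-bit boundary.  Every fault forces the kernel to allocate a
   * separate PUD page table, which was invisible to the oom-killer and
   * memcg before this change.
   */
  #include <stdio.h>
  #include <sys/mman.h>

  #define PUD_SPAN   (1UL << 39)   /* VA covered by one PUD page table */
  #define HIGH_BASE  (1UL << 47)   /* hint above 47 bits opts into 5-level VA */
  #define NR_REGIONS 4096UL        /* ~16 MiB of PUD tables alone */

  int main(void)
  {
          unsigned long i;

          for (i = 0; i < NR_REGIONS; i++) {
                  void *hint = (void *)(HIGH_BASE + i * PUD_SPAN);
                  char *p = mmap(hint, 4096, PROT_READ | PROT_WRITE,
                                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

                  if (p == MAP_FAILED) {
                          perror("mmap");
                          return 1;
                  }
                  *p = 1;  /* fault in one page: allocates PUD + PMD + PTE tables */
          }
          /* With this patch applied, VmPUD in /proc/self/status shows the cost. */
          puts("mapped; check VmPUD in /proc/self/status");
          getchar();
          return 0;
  }

Each fault also allocates a PMD and a PTE page table, but those were
already accounted; only the PUD share went unnoticed, which is what the
hunks below address.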

[kirill.shutemov@linux.intel.com: s/pmd_t/pud_t/]
  Link: http://lkml.kernel.org/r/20171004074305.x35eh5u7ybbt5kar@black.fi.intel.com
[heiko.carstens@de.ibm.com: s390/mm: fix pud table accounting]
  Link: http://lkml.kernel.org/r/20171103090551.18231-1-heiko.carstens@de.ibm.com
Link: http://lkml.kernel.org/r/20171002080427.3320-1-kirill.shutemov@linux.intel.com
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 7d6c4dfa
@@ -629,10 +629,10 @@ oom_dump_tasks
 Enables a system-wide task dump (excluding kernel threads) to be produced
 when the kernel performs an OOM-killing and includes such information as
-pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj
-score, and name. This is helpful to determine why the OOM killer was
-invoked, to identify the rogue task that caused it, and to determine why
-the OOM killer chose the task it did to kill.
+pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, nr_puds, swapents,
+oom_score_adj score, and name. This is helpful to determine why the OOM
+killer was invoked, to identify the rogue task that caused it, and to
+determine why the OOM killer chose the task it did to kill.
 
 If this is set to zero, this information is suppressed. On very
 large systems with thousands of tasks it may not be feasible to dump
...
@@ -433,6 +433,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
         pud = pud_offset(pgd, start);
         pgd_clear(pgd);
         pud_free_tlb(tlb, pud, start);
+        mm_dec_nr_puds(tlb->mm);
 }
 
 /*
...
@@ -44,6 +44,8 @@ static inline int init_new_context(struct task_struct *tsk,
                 mm->context.asce_limit = STACK_TOP_MAX;
                 mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
                                    _ASCE_USER_BITS | _ASCE_TYPE_REGION3;
+                /* pgd_alloc() did not account this pud */
+                mm_inc_nr_puds(mm);
                 break;
         case -PAGE_SIZE:
                 /* forked 5-level task, set new asce with new_mm->pgd */
@@ -59,7 +61,7 @@ static inline int init_new_context(struct task_struct *tsk,
                 /* forked 2-level compat task, set new asce with new mm->pgd */
                 mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
                                    _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
-                /* pgd_alloc() did not increase mm->nr_pmds */
+                /* pgd_alloc() did not account this pmd */
                 mm_inc_nr_pmds(mm);
         }
         crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
...
@@ -472,6 +472,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
         pud = pud_offset(pgd, start);
         pgd_clear(pgd);
         pud_free_tlb(tlb, pud, start);
+        mm_dec_nr_puds(tlb->mm);
 }
 
 void hugetlb_free_pgd_range(struct mmu_gather *tlb,
...
@@ -26,7 +26,7 @@
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
-        unsigned long text, lib, swap, ptes, pmds, anon, file, shmem;
+        unsigned long text, lib, swap, ptes, pmds, puds, anon, file, shmem;
         unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
 
         anon = get_mm_counter(mm, MM_ANONPAGES);
@@ -52,6 +52,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
         swap = get_mm_counter(mm, MM_SWAPENTS);
         ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
         pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
+        puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm);
         seq_printf(m,
                 "VmPeak:\t%8lu kB\n"
                 "VmSize:\t%8lu kB\n"
@@ -68,6 +69,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
                 "VmLib:\t%8lu kB\n"
                 "VmPTE:\t%8lu kB\n"
                 "VmPMD:\t%8lu kB\n"
+                "VmPUD:\t%8lu kB\n"
                 "VmSwap:\t%8lu kB\n",
                 hiwater_vm << (PAGE_SHIFT-10),
                 total_vm << (PAGE_SHIFT-10),
@@ -82,6 +84,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
                 mm->stack_vm << (PAGE_SHIFT-10), text, lib,
                 ptes >> 10,
                 pmds >> 10,
+                puds >> 10,
                 swap << (PAGE_SHIFT-10));
         hugetlb_report_usage(m, mm);
 }
...
@@ -1599,14 +1599,44 @@ static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
 int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address);
 #endif
 
-#ifdef __PAGETABLE_PUD_FOLDED
+#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU)
 static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d,
                                                 unsigned long address)
 {
         return 0;
 }
+
+static inline unsigned long mm_nr_puds(const struct mm_struct *mm)
+{
+        return 0;
+}
+
+static inline void mm_nr_puds_init(struct mm_struct *mm) {}
+static inline void mm_inc_nr_puds(struct mm_struct *mm) {}
+static inline void mm_dec_nr_puds(struct mm_struct *mm) {}
+
 #else
 int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address);
+
+static inline void mm_nr_puds_init(struct mm_struct *mm)
+{
+        atomic_long_set(&mm->nr_puds, 0);
+}
+
+static inline unsigned long mm_nr_puds(const struct mm_struct *mm)
+{
+        return atomic_long_read(&mm->nr_puds);
+}
+
+static inline void mm_inc_nr_puds(struct mm_struct *mm)
+{
+        atomic_long_inc(&mm->nr_puds);
+}
+
+static inline void mm_dec_nr_puds(struct mm_struct *mm)
+{
+        atomic_long_dec(&mm->nr_puds);
+}
 #endif
 
 #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU)
@@ -1618,7 +1648,7 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
 static inline void mm_nr_pmds_init(struct mm_struct *mm) {}
 
-static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
+static inline unsigned long mm_nr_pmds(const struct mm_struct *mm)
 {
         return 0;
 }
@@ -1634,7 +1664,7 @@ static inline void mm_nr_pmds_init(struct mm_struct *mm)
         atomic_long_set(&mm->nr_pmds, 0);
 }
 
-static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
+static inline unsigned long mm_nr_pmds(const struct mm_struct *mm)
 {
         return atomic_long_read(&mm->nr_pmds);
 }
...
@@ -404,6 +404,9 @@ struct mm_struct {
         atomic_long_t nr_ptes;          /* PTE page table pages */
 #if CONFIG_PGTABLE_LEVELS > 2
         atomic_long_t nr_pmds;          /* PMD page table pages */
+#endif
+#if CONFIG_PGTABLE_LEVELS > 3
+        atomic_long_t nr_puds;          /* PUD page table pages */
 #endif
         int map_count;                  /* number of VMAs */
...
@@ -819,6 +819,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
         mm->core_state = NULL;
         atomic_long_set(&mm->nr_ptes, 0);
         mm_nr_pmds_init(mm);
+        mm_nr_puds_init(mm);
         mm->map_count = 0;
         mm->locked_vm = 0;
         mm->pinned_vm = 0;
@@ -878,6 +879,9 @@ static void check_mm(struct mm_struct *mm)
         if (mm_nr_pmds(mm))
                 pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n",
                                 mm_nr_pmds(mm));
+        if (mm_nr_puds(mm))
+                pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n",
+                                mm_nr_puds(mm));
 
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
         VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
...
@@ -105,7 +105,8 @@ void dump_mm(const struct mm_struct *mm)
                 "get_unmapped_area %p\n"
 #endif
                 "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
-                "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
+                "pgd %p mm_users %d mm_count %d\n"
+                "nr_ptes %lu nr_pmds %lu nr_puds %lu map_count %d\n"
                 "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
                 "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
                 "start_code %lx end_code %lx start_data %lx end_data %lx\n"
@@ -136,7 +137,8 @@ void dump_mm(const struct mm_struct *mm)
                 mm->pgd, atomic_read(&mm->mm_users),
                 atomic_read(&mm->mm_count),
                 atomic_long_read((atomic_long_t *)&mm->nr_ptes),
-                mm_nr_pmds((struct mm_struct *)mm),
+                mm_nr_pmds(mm),
+                mm_nr_puds(mm),
                 mm->map_count,
                 mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
                 mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
...
@@ -506,6 +506,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d,
         pud = pud_offset(p4d, start);
         p4d_clear(p4d);
         pud_free_tlb(tlb, pud, start);
+        mm_dec_nr_puds(tlb->mm);
 }
 
 static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -4149,15 +4150,17 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
         spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_5LEVEL_HACK
-        if (p4d_present(*p4d))          /* Another has populated it */
-                pud_free(mm, new);
-        else
+        if (!p4d_present(*p4d)) {
+                mm_inc_nr_puds(mm);
                 p4d_populate(mm, p4d, new);
-#else
-        if (pgd_present(*p4d))          /* Another has populated it */
-                pud_free(mm, new);
-        else
+        } else  /* Another has populated it */
+                pud_free(mm, new);
+#else
+        if (!pgd_present(*p4d)) {
+                mm_inc_nr_puds(mm);
                 pgd_populate(mm, p4d, new);
+        } else  /* Another has populated it */
+                pud_free(mm, new);
 #endif /* __ARCH_HAS_5LEVEL_HACK */
         spin_unlock(&mm->page_table_lock);
         return 0;
...
@@ -221,7 +221,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
          * task's rss, pagetable and swap space use.
          */
         points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
-                atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
+                atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm) +
+                mm_nr_puds(p->mm);
         task_unlock(p);
 
         /*
@@ -397,7 +398,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
         struct task_struct *p;
         struct task_struct *task;
 
-        pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n");
+        pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds nr_puds swapents oom_score_adj name\n");
         rcu_read_lock();
         for_each_process(p) {
                 if (oom_unkillable_task(p, memcg, nodemask))
@@ -413,11 +414,12 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
                         continue;
                 }
 
-                pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n",
+                pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu %5hd %s\n",
                         task->pid, from_kuid(&init_user_ns, task_uid(task)),
                         task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
                         atomic_long_read(&task->mm->nr_ptes),
                         mm_nr_pmds(task->mm),
+                        mm_nr_puds(task->mm),
                         get_mm_counter(task->mm, MM_SWAPENTS),
                         task->signal->oom_score_adj, task->comm);
                 task_unlock(task);
...