Commit def5efe0 authored by David Rientjes's avatar David Rientjes Committed by Linus Torvalds

mm, madvise: fail with ENOMEM when splitting vma will hit max_map_count

If madvise(2) advice will result in the underlying vma being split and
the number of areas mapped by the process will exceed
/proc/sys/vm/max_map_count as a result, return ENOMEM instead of EAGAIN.

EAGAIN is returned by madvise(2) when a kernel resource, such as slab,
is temporarily unavailable.  It indicates that userspace should retry
the advice in the near future.  This is important for advice such as
MADV_DONTNEED which is often used by malloc implementations to free
memory back to the system: we really do want to free memory back when
madvise(2) returns EAGAIN because slab allocations (for vmas, anon_vmas,
or mempolicies) cannot be allocated.

Encountering /proc/sys/vm/max_map_count is not a temporary failure,
however, so return ENOMEM to indicate this is a more serious issue.  A
followup patch to the man page will specify this behavior.

Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1701241431120.42507@chino.kir.corp.google.comSigned-off-by: default avatarDavid Rientjes <rientjes@google.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: Anshuman Khandual <khandual@linux.vnet.ibm.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 712c604d
......@@ -376,8 +376,8 @@ max_map_count:
This file contains the maximum number of memory map areas a process
may have. Memory map areas are used as a side-effect of calling
malloc, directly by mmap and mprotect, and also when loading shared
libraries.
malloc, directly by mmap, mprotect, and madvise, and also when loading
shared libraries.
While most applications need less than a thousand maps, certain
programs, particularly malloc debuggers, may consume lots of them,
......
......@@ -38,6 +38,10 @@ the range for whenever the KSM daemon is started; even if the range
cannot contain any pages which KSM could actually merge; even if
MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE.
If a region of memory must be split into at least one new MADV_MERGEABLE
or MADV_UNMERGEABLE region, the madvise may return ENOMEM if the process
will exceed vm.max_map_count (see Documentation/sysctl/vm.txt).
Like other madvise calls, they are intended for use on mapped areas of
the user address space: they will report ENOMEM if the specified range
includes unmapped gaps (though working on the intervening mapped areas),
......
......@@ -2041,8 +2041,10 @@ extern struct vm_area_struct *vma_merge(struct mm_struct *,
unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
struct mempolicy *, struct vm_userfaultfd_ctx);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int split_vma(struct mm_struct *,
struct vm_area_struct *, unsigned long addr, int new_below);
extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
unsigned long addr, int new_below);
extern int split_vma(struct mm_struct *, struct vm_area_struct *,
unsigned long addr, int new_below);
extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *);
extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *,
struct rb_node **, struct rb_node *);
......
......@@ -92,14 +92,28 @@ static long madvise_behavior(struct vm_area_struct *vma,
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
error = ksm_madvise(vma, start, end, behavior, &new_flags);
if (error)
if (error) {
/*
* madvise() returns EAGAIN if kernel resources, such as
* slab, are temporarily unavailable.
*/
if (error == -ENOMEM)
error = -EAGAIN;
goto out;
}
break;
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
error = hugepage_madvise(vma, &new_flags, behavior);
if (error)
if (error) {
/*
* madvise() returns EAGAIN if kernel resources, such as
* slab, are temporarily unavailable.
*/
if (error == -ENOMEM)
error = -EAGAIN;
goto out;
}
break;
}
......@@ -120,26 +134,45 @@ static long madvise_behavior(struct vm_area_struct *vma,
*prev = vma;
if (start != vma->vm_start) {
error = split_vma(mm, vma, start, 1);
if (error)
if (unlikely(mm->map_count >= sysctl_max_map_count)) {
error = -ENOMEM;
goto out;
}
error = __split_vma(mm, vma, start, 1);
if (error) {
/*
* madvise() returns EAGAIN if kernel resources, such as
* slab, are temporarily unavailable.
*/
if (error == -ENOMEM)
error = -EAGAIN;
goto out;
}
}
if (end != vma->vm_end) {
error = split_vma(mm, vma, end, 0);
if (error)
if (unlikely(mm->map_count >= sysctl_max_map_count)) {
error = -ENOMEM;
goto out;
}
error = __split_vma(mm, vma, end, 0);
if (error) {
/*
* madvise() returns EAGAIN if kernel resources, such as
* slab, are temporarily unavailable.
*/
if (error == -ENOMEM)
error = -EAGAIN;
goto out;
}
}
success:
/*
* vm_flags is protected by the mmap_sem held in write mode.
*/
vma->vm_flags = new_flags;
out:
if (error == -ENOMEM)
error = -EAGAIN;
return error;
}
......
......@@ -2499,10 +2499,10 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
}
/*
* __split_vma() bypasses sysctl_max_map_count checking. We use this on the
* munmap path where it doesn't make sense to fail.
* __split_vma() bypasses sysctl_max_map_count checking. We use this where it
* has already been checked or doesn't make sense to fail.
*/
static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, int new_below)
{
struct vm_area_struct *new;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment