Commit 2e7ce7d3 authored by Liam R. Howlett, committed by Andrew Morton

mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()

Avoid allocating a new VMA when a VMA modification can occur.  When a
brk() can expand or contract a VMA, then the single store operation will
only modify one index of the maple tree instead of causing a node to split
or coalesce.  This avoids unnecessary allocations/frees of maple tree
nodes and VMAs.

Move some limit & flag verifications out of the do_brk_flags() function to
use only relevant checks in the code path of brk() and vm_brk_flags().

Set the vma to check if it can expand in vm_brk_flags() if extra criteria
are met.

Drop userfaultfd from do_brk_flags() path and only use it in
vm_brk_flags() path since that is the only place a munmap will happen.

Add a wrapper for munmap for the brk case called do_brk_munmap().

Link: https://lkml.kernel.org/r/20220906194824.2110408-23-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Tested-by: Yu Zhao <yuzhao@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: SeongJae Park <sj@kernel.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 94d815b2
...@@ -147,17 +147,40 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) ...@@ -147,17 +147,40 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
return next; return next;
} }
static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, /*
* check_brk_limits() - Use platform specific check of range & verify mlock
* limits.
* @addr: The address to check
* @len: The size of increase.
*
* Return: 0 on success.
*/
static int check_brk_limits(unsigned long addr, unsigned long len)
{
unsigned long mapped_addr;
mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
if (IS_ERR_VALUE(mapped_addr))
return mapped_addr;
return mlock_future_check(current->mm, current->mm->def_flags, len);
}
static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
unsigned long newbrk, unsigned long oldbrk,
struct list_head *uf); struct list_head *uf);
static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma,
unsigned long addr, unsigned long request,
unsigned long flags);
SYSCALL_DEFINE1(brk, unsigned long, brk) SYSCALL_DEFINE1(brk, unsigned long, brk)
{ {
unsigned long newbrk, oldbrk, origbrk; unsigned long newbrk, oldbrk, origbrk;
struct mm_struct *mm = current->mm; struct mm_struct *mm = current->mm;
struct vm_area_struct *next; struct vm_area_struct *brkvma, *next = NULL;
unsigned long min_brk; unsigned long min_brk;
bool populate; bool populate;
bool downgraded = false; bool downgraded = false;
LIST_HEAD(uf); LIST_HEAD(uf);
MA_STATE(mas, &mm->mm_mt, 0, 0);
if (mmap_write_lock_killable(mm)) if (mmap_write_lock_killable(mm))
return -EINTR; return -EINTR;
...@@ -199,35 +222,52 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) ...@@ -199,35 +222,52 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
/* /*
* Always allow shrinking brk. * Always allow shrinking brk.
* __do_munmap() may downgrade mmap_lock to read. * do_brk_munmap() may downgrade mmap_lock to read.
*/ */
if (brk <= mm->brk) { if (brk <= mm->brk) {
int ret; int ret;
/* Search one past newbrk */
mas_set(&mas, newbrk);
brkvma = mas_find(&mas, oldbrk);
BUG_ON(brkvma == NULL);
if (brkvma->vm_start >= oldbrk)
goto out; /* mapping intersects with an existing non-brk vma. */
/* /*
* mm->brk must to be protected by write mmap_lock so update it * mm->brk must be protected by write mmap_lock.
* before downgrading mmap_lock. When __do_munmap() fails, * do_brk_munmap() may downgrade the lock, so update it
* mm->brk will be restored from origbrk. * before calling do_brk_munmap().
*/ */
mm->brk = brk; mm->brk = brk;
ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true); mas.last = oldbrk - 1;
if (ret < 0) { ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf);
mm->brk = origbrk; if (ret == 1) {
goto out;
} else if (ret == 1) {
downgraded = true; downgraded = true;
}
goto success; goto success;
} else if (!ret)
goto success;
mm->brk = origbrk;
goto out;
} }
/* Check against existing mmap mappings. */ if (check_brk_limits(oldbrk, newbrk - oldbrk))
next = find_vma(mm, oldbrk); goto out;
/*
* Only check if the next VMA is within the stack_guard_gap of the
* expansion area
*/
mas_set(&mas, oldbrk);
next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap);
if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
goto out; goto out;
brkvma = mas_prev(&mas, mm->start_brk);
/* Ok, looks good - let it rip. */ /* Ok, looks good - let it rip. */
if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0) if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
goto out; goto out;
mm->brk = brk; mm->brk = brk;
success: success:
...@@ -2762,38 +2802,55 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, ...@@ -2762,38 +2802,55 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
} }
/* /*
* this is really a simplified "do_mmap". it only handles * brk_munmap() - Unmap a parital vma.
* anonymous maps. eventually we may be able to do some * @mas: The maple tree state.
* brk-specific accounting here. * @vma: The vma to be modified
*/ * @newbrk: the start of the address to unmap
static int do_brk_flags(unsigned long addr, unsigned long len, * @oldbrk: The end of the address to unmap
unsigned long flags, struct list_head *uf) * @uf: The userfaultfd list_head
*
* Returns: 1 on success.
* unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if
* possible.
*/
static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
unsigned long newbrk, unsigned long oldbrk,
struct list_head *uf)
{ {
struct mm_struct *mm = current->mm; struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *vma, *prev; int ret;
pgoff_t pgoff = addr >> PAGE_SHIFT;
int error;
unsigned long mapped_addr;
validate_mm_mt(mm);
/* Until we need other flags, refuse anything except VM_EXEC. */
if ((flags & (~VM_EXEC)) != 0)
return -EINVAL;
flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
if (IS_ERR_VALUE(mapped_addr))
return mapped_addr;
error = mlock_future_check(mm, mm->def_flags, len); arch_unmap(mm, newbrk, oldbrk);
if (error) ret = __do_munmap(mm, newbrk, oldbrk - newbrk, uf, true);
return error; validate_mm_mt(mm);
return ret;
}
/* Clear old maps, set up prev and uf */ /*
if (munmap_vma_range(mm, addr, len, &prev, uf)) * do_brk_flags() - Increase the brk vma if the flags match.
return -ENOMEM; * @mas: The maple tree state.
* @addr: The start address
* @len: The length of the increase
* @vma: The vma,
* @flags: The VMA Flags
*
* Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
* do not match then create a new anonymous VMA. Eventually we may be able to
* do some brk-specific accounting here.
*/
static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
unsigned long addr, unsigned long len,
unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *prev = NULL;
/* Check against address space limits *after* clearing old maps... */ validate_mm_mt(mm);
/*
* Check against address space limits by the changed size
* Note: This happens *after* clearing old mappings in some code paths.
*/
flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
return -ENOMEM; return -ENOMEM;
...@@ -2803,30 +2860,54 @@ static int do_brk_flags(unsigned long addr, unsigned long len, ...@@ -2803,30 +2860,54 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
return -ENOMEM; return -ENOMEM;
/* Can we just expand an old private anonymous mapping? */
vma = vma_merge(mm, prev, addr, addr + len, flags,
NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
if (vma)
goto out;
/* /*
* create a vma struct for an anonymous mapping * Expand the existing vma if possible; Note that singular lists do not
* occur after forking, so the expand will only happen on new VMAs.
*/ */
vma = vm_area_alloc(mm); if (vma &&
if (!vma) { (!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) &&
vm_unacct_memory(len >> PAGE_SHIFT); ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) {
return -ENOMEM; mas->index = vma->vm_start;
mas->last = addr + len - 1;
vma_adjust_trans_huge(vma, addr, addr + len, 0);
if (vma->anon_vma) {
anon_vma_lock_write(vma->anon_vma);
anon_vma_interval_tree_pre_update_vma(vma);
} }
vma->vm_end = addr + len;
vma->vm_flags |= VM_SOFTDIRTY;
if (mas_store_gfp(mas, vma, GFP_KERNEL))
goto mas_expand_failed;
if (vma->anon_vma) {
anon_vma_interval_tree_post_update_vma(vma);
anon_vma_unlock_write(vma->anon_vma);
}
khugepaged_enter_vma(vma, flags);
goto out;
}
prev = vma;
/* create a vma struct for an anonymous mapping */
vma = vm_area_alloc(mm);
if (!vma)
goto vma_alloc_fail;
vma_set_anonymous(vma); vma_set_anonymous(vma);
vma->vm_start = addr; vma->vm_start = addr;
vma->vm_end = addr + len; vma->vm_end = addr + len;
vma->vm_pgoff = pgoff; vma->vm_pgoff = addr >> PAGE_SHIFT;
vma->vm_flags = flags; vma->vm_flags = flags;
vma->vm_page_prot = vm_get_page_prot(flags); vma->vm_page_prot = vm_get_page_prot(flags);
if (vma_link(mm, vma, prev)) mas_set_range(mas, vma->vm_start, addr + len - 1);
goto no_vma_link; if (mas_store_gfp(mas, vma, GFP_KERNEL))
goto mas_store_fail;
if (!prev)
prev = mas_prev(mas, 0);
__vma_link_list(mm, vma, prev);
mm->map_count++;
out: out:
perf_event_mmap(vma); perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT; mm->total_vm += len >> PAGE_SHIFT;
...@@ -2837,18 +2918,29 @@ static int do_brk_flags(unsigned long addr, unsigned long len, ...@@ -2837,18 +2918,29 @@ static int do_brk_flags(unsigned long addr, unsigned long len,
validate_mm_mt(mm); validate_mm_mt(mm);
return 0; return 0;
no_vma_link: mas_store_fail:
vm_area_free(vma); vm_area_free(vma);
vma_alloc_fail:
vm_unacct_memory(len >> PAGE_SHIFT);
return -ENOMEM;
mas_expand_failed:
if (vma->anon_vma) {
anon_vma_interval_tree_post_update_vma(vma);
anon_vma_unlock_write(vma->anon_vma);
}
return -ENOMEM; return -ENOMEM;
} }
int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{ {
struct mm_struct *mm = current->mm; struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
unsigned long len; unsigned long len;
int ret; int ret;
bool populate; bool populate;
LIST_HEAD(uf); LIST_HEAD(uf);
MA_STATE(mas, &mm->mm_mt, addr, addr);
len = PAGE_ALIGN(request); len = PAGE_ALIGN(request);
if (len < request) if (len < request)
...@@ -2859,13 +2951,38 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) ...@@ -2859,13 +2951,38 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
if (mmap_write_lock_killable(mm)) if (mmap_write_lock_killable(mm))
return -EINTR; return -EINTR;
ret = do_brk_flags(addr, len, flags, &uf); /* Until we need other flags, refuse anything except VM_EXEC. */
if ((flags & (~VM_EXEC)) != 0)
return -EINVAL;
ret = check_brk_limits(addr, len);
if (ret)
goto limits_failed;
if (find_vma_intersection(mm, addr, addr + len))
ret = do_munmap(mm, addr, len, &uf);
if (ret)
goto munmap_failed;
vma = mas_prev(&mas, 0);
if (!vma || vma->vm_end != addr || vma_policy(vma) ||
!can_vma_merge_after(vma, flags, NULL, NULL,
addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL))
vma = NULL;
ret = do_brk_flags(&mas, vma, addr, len, flags);
populate = ((mm->def_flags & VM_LOCKED) != 0); populate = ((mm->def_flags & VM_LOCKED) != 0);
mmap_write_unlock(mm); mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf); userfaultfd_unmap_complete(mm, &uf);
if (populate && !ret) if (populate && !ret)
mm_populate(addr, len); mm_populate(addr, len);
return ret; return ret;
munmap_failed:
limits_failed:
mmap_write_unlock(mm);
return ret;
} }
EXPORT_SYMBOL(vm_brk_flags); EXPORT_SYMBOL(vm_brk_flags);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment