Commit c7f8f31c authored by Suren Baghdasaryan, committed by Andrew Morton

mm: separate vma->lock from vm_area_struct

vma->lock being part of the vm_area_struct causes a performance regression
during page faults: under contention its count and owner fields are
constantly updated, and because other parts of vm_area_struct used during
page fault handling sit next to them, this causes constant cache line
bouncing.  Fix that by moving the lock outside of the vm_area_struct.

All attempts to keep vma->lock inside vm_area_struct in a separate cache
line still produce a performance regression, especially on NUMA machines.
The smallest regression was achieved when the lock is placed in the fourth
cache line, but that bloats vm_area_struct to 256 bytes.

Considering performance and memory impact, a separate lock looks like the
best option.  It increases the memory footprint of each VMA but that can be
optimized later if the new size causes issues.  Note that after this
change vma_init() does not allocate or initialize vma->lock anymore.  A
number of drivers allocate a pseudo VMA on the stack but they never use
the VMA's lock, therefore it does not need to be allocated.  Future
drivers that might need the VMA lock should use
vm_area_alloc()/vm_area_free() to allocate the VMA.

Link: https://lkml.kernel.org/r/20230227173632.3292573-34-surenb@google.com
Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 0d2ebf9c
...@@ -628,12 +628,6 @@ struct vm_operations_struct { ...@@ -628,12 +628,6 @@ struct vm_operations_struct {
}; };
#ifdef CONFIG_PER_VMA_LOCK #ifdef CONFIG_PER_VMA_LOCK
static inline void vma_init_lock(struct vm_area_struct *vma)
{
init_rwsem(&vma->lock);
vma->vm_lock_seq = -1;
}
/* /*
* Try to read-lock a vma. The function is allowed to occasionally yield false * Try to read-lock a vma. The function is allowed to occasionally yield false
* locked result to avoid performance overhead, in which case we fall back to * locked result to avoid performance overhead, in which case we fall back to
...@@ -645,17 +639,17 @@ static inline bool vma_start_read(struct vm_area_struct *vma) ...@@ -645,17 +639,17 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq)) if (vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))
return false; return false;
if (unlikely(down_read_trylock(&vma->lock) == 0)) if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
return false; return false;
/* /*
* Overflow might produce false locked result. * Overflow might produce false locked result.
* False unlocked result is impossible because we modify and check * False unlocked result is impossible because we modify and check
* vma->vm_lock_seq under vma->lock protection and mm->mm_lock_seq * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq
* modification invalidates all existing locks. * modification invalidates all existing locks.
*/ */
if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) { if (unlikely(vma->vm_lock_seq == READ_ONCE(vma->vm_mm->mm_lock_seq))) {
up_read(&vma->lock); up_read(&vma->vm_lock->lock);
return false; return false;
} }
return true; return true;
...@@ -664,7 +658,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) ...@@ -664,7 +658,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma)
static inline void vma_end_read(struct vm_area_struct *vma) static inline void vma_end_read(struct vm_area_struct *vma)
{ {
rcu_read_lock(); /* keeps vma alive till the end of up_read */ rcu_read_lock(); /* keeps vma alive till the end of up_read */
up_read(&vma->lock); up_read(&vma->vm_lock->lock);
rcu_read_unlock(); rcu_read_unlock();
} }
...@@ -687,9 +681,9 @@ static inline void vma_start_write(struct vm_area_struct *vma) ...@@ -687,9 +681,9 @@ static inline void vma_start_write(struct vm_area_struct *vma)
if (__is_vma_write_locked(vma, &mm_lock_seq)) if (__is_vma_write_locked(vma, &mm_lock_seq))
return; return;
down_write(&vma->lock); down_write(&vma->vm_lock->lock);
vma->vm_lock_seq = mm_lock_seq; vma->vm_lock_seq = mm_lock_seq;
up_write(&vma->lock); up_write(&vma->vm_lock->lock);
} }
static inline bool vma_try_start_write(struct vm_area_struct *vma) static inline bool vma_try_start_write(struct vm_area_struct *vma)
...@@ -740,6 +734,10 @@ static inline void vma_mark_detached(struct vm_area_struct *vma, ...@@ -740,6 +734,10 @@ static inline void vma_mark_detached(struct vm_area_struct *vma,
#endif /* CONFIG_PER_VMA_LOCK */ #endif /* CONFIG_PER_VMA_LOCK */
/*
* WARNING: vma_init does not initialize vma->vm_lock.
* Use vm_area_alloc()/vm_area_free() if vma needs locking.
*/
static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
{ {
static const struct vm_operations_struct dummy_vm_ops = {}; static const struct vm_operations_struct dummy_vm_ops = {};
...@@ -749,7 +747,6 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) ...@@ -749,7 +747,6 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
vma->vm_ops = &dummy_vm_ops; vma->vm_ops = &dummy_vm_ops;
INIT_LIST_HEAD(&vma->anon_vma_chain); INIT_LIST_HEAD(&vma->anon_vma_chain);
vma_mark_detached(vma, false); vma_mark_detached(vma, false);
vma_init_lock(vma);
} }
/* Use when VMA is not part of the VMA tree and needs no locking */ /* Use when VMA is not part of the VMA tree and needs no locking */
......
...@@ -471,6 +471,10 @@ struct anon_vma_name { ...@@ -471,6 +471,10 @@ struct anon_vma_name {
char name[]; char name[];
}; };
struct vma_lock {
struct rw_semaphore lock;
};
/* /*
* This struct describes a virtual memory area. There is one of these * This struct describes a virtual memory area. There is one of these
* per VM-area/task. A VM area is any part of the process virtual memory * per VM-area/task. A VM area is any part of the process virtual memory
...@@ -505,7 +509,7 @@ struct vm_area_struct { ...@@ -505,7 +509,7 @@ struct vm_area_struct {
#ifdef CONFIG_PER_VMA_LOCK #ifdef CONFIG_PER_VMA_LOCK
int vm_lock_seq; int vm_lock_seq;
struct rw_semaphore lock; struct vma_lock *vm_lock;
/* Flag to indicate areas detached from the mm->mm_mt tree */ /* Flag to indicate areas detached from the mm->mm_mt tree */
bool detached; bool detached;
......
...@@ -451,13 +451,49 @@ static struct kmem_cache *vm_area_cachep; ...@@ -451,13 +451,49 @@ static struct kmem_cache *vm_area_cachep;
/* SLAB cache for mm_struct structures (tsk->mm) */ /* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep; static struct kmem_cache *mm_cachep;
#ifdef CONFIG_PER_VMA_LOCK
/* SLAB cache for vm_area_struct.lock */
static struct kmem_cache *vma_lock_cachep;
static bool vma_lock_alloc(struct vm_area_struct *vma)
{
vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
if (!vma->vm_lock)
return false;
init_rwsem(&vma->vm_lock->lock);
vma->vm_lock_seq = -1;
return true;
}
static inline void vma_lock_free(struct vm_area_struct *vma)
{
kmem_cache_free(vma_lock_cachep, vma->vm_lock);
}
#else /* CONFIG_PER_VMA_LOCK */
static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
static inline void vma_lock_free(struct vm_area_struct *vma) {}
#endif /* CONFIG_PER_VMA_LOCK */
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{ {
struct vm_area_struct *vma; struct vm_area_struct *vma;
vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (vma) if (!vma)
vma_init(vma, mm); return NULL;
vma_init(vma, mm);
if (!vma_lock_alloc(vma)) {
kmem_cache_free(vm_area_cachep, vma);
return NULL;
}
return vma; return vma;
} }
...@@ -465,24 +501,30 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) ...@@ -465,24 +501,30 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{ {
struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (new) { if (!new)
ASSERT_EXCLUSIVE_WRITER(orig->vm_flags); return NULL;
ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
/* ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
* orig->shared.rb may be modified concurrently, but the clone ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
* will be reinitialized. /*
*/ * orig->shared.rb may be modified concurrently, but the clone
data_race(memcpy(new, orig, sizeof(*new))); * will be reinitialized.
INIT_LIST_HEAD(&new->anon_vma_chain); */
vma_init_lock(new); data_race(memcpy(new, orig, sizeof(*new)));
dup_anon_vma_name(orig, new); if (!vma_lock_alloc(new)) {
kmem_cache_free(vm_area_cachep, new);
return NULL;
} }
INIT_LIST_HEAD(&new->anon_vma_chain);
dup_anon_vma_name(orig, new);
return new; return new;
} }
void __vm_area_free(struct vm_area_struct *vma) void __vm_area_free(struct vm_area_struct *vma)
{ {
free_anon_vma_name(vma); free_anon_vma_name(vma);
vma_lock_free(vma);
kmem_cache_free(vm_area_cachep, vma); kmem_cache_free(vm_area_cachep, vma);
} }
...@@ -493,7 +535,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head) ...@@ -493,7 +535,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head)
vm_rcu); vm_rcu);
/* The vma should not be locked while being destroyed. */ /* The vma should not be locked while being destroyed. */
VM_BUG_ON_VMA(rwsem_is_locked(&vma->lock), vma); VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
__vm_area_free(vma); __vm_area_free(vma);
} }
#endif #endif
...@@ -3152,6 +3194,9 @@ void __init proc_caches_init(void) ...@@ -3152,6 +3194,9 @@ void __init proc_caches_init(void)
NULL); NULL);
vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
#ifdef CONFIG_PER_VMA_LOCK
vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
#endif
mmap_init(); mmap_init();
nsproxy_cache_init(); nsproxy_cache_init();
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment