Commit 1b46884a authored by Hugh Dickins, committed by Linus Torvalds

[PATCH] lighten mmlist_lock

Let's lighten the global spinlock mmlist_lock.

What's it for?
1. Its original role is to guard mmlist.
2. It later got a second role, to prevent get_task_mm from raising
   mm_users from the dead, just after it went down to 0.

Firstly consider the second: __exit_mm sets tsk->mm to NULL while holding
task_lock, before calling mmput; so mmlist_lock only guards against the
exceptional case of get_task_mm on a kernel workthread which did AIO's
use_mm (which transiently sets its tsk->mm without raising mm_users) on an
mm now exiting.
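
For context, the ordering relied on here is roughly the following (a
sketch only, not code from this patch):

	/* rough shape of __exit_mm's teardown, as described above */
	task_lock(tsk);
	tsk->mm = NULL;		/* get_task_mm sees no mm from here on */
	task_unlock(tsk);
	mmput(mm);		/* may take mm_users down to 0 */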

Well, I don't think get_task_mm should succeed at all on use_mm tasks.
It's mainly used by /proc/pid and ptrace, and it seems at best confusing
for those to present the kernel thread as having a user mm, which it
won't have a moment later.  Define PF_BORROWED_MM, set in use_mm, cleared
in unuse_mm (though we could just leave it set), and have get_task_mm
return NULL if it is set.
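
The resulting get_task_mm check is just a flag test under task_lock; see
the corresponding hunk below, which boils down to:

	if (mm) {
		if (task->flags & PF_BORROWED_MM)	/* use_mm borrower */
			mm = NULL;
		else
			atomic_inc(&mm->mm_users);
	}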

Secondly consider the first: what's mmlist for?
1. Its original role was for swap_out to scan: rmap ended that in 2.5.27.
2. In 2.4.10 it got a second role, for try_to_unuse to scan for swapoff.

So, make mmlist a list of mms which may have pages on swap: add an mm to
mmlist when its first swap entry is assigned, in try_to_unmap_one (pageout)
or in copy_page_range (fork); and have mmput remove it from mmlist as
before, except that usually the list is empty and there's no need to lock.
drain_mmlist is added to swapoff, to empty out the mmlist if no swap is
then in use.
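
The addition site uses an unlocked list_empty check, so mmlist_lock is
only taken the first time an mm gains a swap entry; the same pattern
appears in both the try_to_unmap_one and copy_page_range hunks below
(copy_page_range adds the child to its parent's list rather than to
init_mm's):

	if (list_empty(&mm->mmlist)) {
		spin_lock(&mmlist_lock);
		list_add(&mm->mmlist, &init_mm.mmlist);
		spin_unlock(&mmlist_lock);
	}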

mmput leaves the mm on mmlist until after its exit_mmap, so try_to_unmap_one
can still add the mm to mmlist without worrying about the mm_users 0 case;
but try_to_unuse must avoid the mm_users 0 case (when an mm might be removed
from mmlist, and freed, while it's down in unuse_process): use
atomic_inc_return, now that all architectures support it.
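
In try_to_unuse's walk of the mmlist, that guard looks like this (see the
corresponding hunk below): an mm whose mm_users was already 0 is being
torn down, so skip it rather than resurrect it:

	mm = list_entry(p, struct mm_struct, mmlist);
	if (atomic_inc_return(&mm->mm_users) == 1) {
		atomic_dec(&mm->mm_users);
		continue;
	}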

Some of the detailed comments in try_to_unuse have grown out of date: this
patch updates and trims a few, but leaves SWAP_MAP_MAX for another occasion.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 5dfd31d1
@@ -165,10 +165,8 @@ void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
  * against pageattr.c; it is the unique case in which a valid change
  * of kernel pagetables can't be lazily synchronized by vmalloc faults.
  * vmalloc faults work because attached pagetables are never freed.
- * If the locking proves to be non-performant, a ticketing scheme with
- * checks at dup_mmap(), exec(), and other mmlist addition points
- * could be used. The locking scheme was chosen on the basis of
- * manfred's recommendations and having no core impact whatsoever.
+ * The locking scheme was chosen on the basis of manfred's
+ * recommendations and having no core impact whatsoever.
  * -- wli
  */
 spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED;
...
@@ -572,6 +572,7 @@ static void use_mm(struct mm_struct *mm)
 	struct task_struct *tsk = current;
 
 	task_lock(tsk);
+	tsk->flags |= PF_BORROWED_MM;
 	active_mm = tsk->active_mm;
 	atomic_inc(&mm->mm_count);
 	tsk->mm = mm;
@@ -598,6 +599,7 @@ void unuse_mm(struct mm_struct *mm)
 	struct task_struct *tsk = current;
 
 	task_lock(tsk);
+	tsk->flags &= ~PF_BORROWED_MM;
 	tsk->mm = NULL;
 	/* active_mm is still 'mm' */
 	enter_lazy_tlb(mm, tsk);
...
@@ -530,12 +530,6 @@ static int exec_mmap(struct mm_struct *mm)
 	struct task_struct *tsk;
 	struct mm_struct * old_mm, *active_mm;
 
-	/* Add it to the list of mm's */
-	spin_lock(&mmlist_lock);
-	list_add(&mm->mmlist, &init_mm.mmlist);
-	mmlist_nr++;
-	spin_unlock(&mmlist_lock);
-
 	/* Notify parent that we're no longer interested in the old VM */
 	tsk = current;
 	old_mm = current->mm;
...
@@ -218,7 +218,7 @@ struct mm_struct {
 	struct rw_semaphore mmap_sem;
 	spinlock_t page_table_lock;	/* Protects task page tables and mm->rss */
 
-	struct list_head mmlist;	/* List of all active mm's. These are globally strung
+	struct list_head mmlist;	/* List of maybe swapped mm's. These are globally strung
					 * together off init_mm.mmlist, and are protected
					 * by mmlist_lock
					 */
@@ -252,8 +252,6 @@ struct mm_struct {
 	struct kioctx default_kioctx;
 };
 
-extern int mmlist_nr;
-
 struct sighand_struct {
 	atomic_t count;
 	struct k_sigaction action[_NSIG];
@@ -722,6 +720,7 @@ do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
 #define PF_SWAPOFF	0x00080000	/* I am in swapoff */
 #define PF_LESS_THROTTLE 0x00100000	/* Throttle me less: I clean memory */
 #define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
+#define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
 
 #ifdef CONFIG_SMP
 extern int set_cpus_allowed(task_t *p, cpumask_t new_mask);
...
@@ -179,17 +179,6 @@ static inline int dup_mmap(struct mm_struct * mm, struct mm_struct * oldmm)
 	rb_parent = NULL;
 	pprev = &mm->mmap;
 
-	/*
-	 * Add it to the mmlist after the parent.
-	 * Doing it this way means that we can order the list,
-	 * and fork() won't mess up the ordering significantly.
-	 * Add it first so that swapoff can see any swap entries.
-	 */
-	spin_lock(&mmlist_lock);
-	list_add(&mm->mmlist, &current->mm->mmlist);
-	mmlist_nr++;
-	spin_unlock(&mmlist_lock);
-
 	for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) {
 		struct file *file;
@@ -289,7 +278,6 @@ static inline void mm_free_pgd(struct mm_struct * mm)
 #endif /* CONFIG_MMU */
 
 spinlock_t mmlist_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
-int mmlist_nr;
 
 #define allocate_mm()	(kmem_cache_alloc(mm_cachep, SLAB_KERNEL))
 #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
@@ -301,6 +289,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm)
 	atomic_set(&mm->mm_users, 1);
 	atomic_set(&mm->mm_count, 1);
 	init_rwsem(&mm->mmap_sem);
+	INIT_LIST_HEAD(&mm->mmlist);
 	mm->core_waiters = 0;
 	mm->nr_ptes = 0;
 	mm->page_table_lock = SPIN_LOCK_UNLOCKED;
@@ -350,12 +339,14 @@ void fastcall __mmdrop(struct mm_struct *mm)
  */
 void mmput(struct mm_struct *mm)
 {
-	if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) {
-		list_del(&mm->mmlist);
-		mmlist_nr--;
-		spin_unlock(&mmlist_lock);
+	if (atomic_dec_and_test(&mm->mm_users)) {
 		exit_aio(mm);
 		exit_mmap(mm);
+		if (!list_empty(&mm->mmlist)) {
+			spin_lock(&mmlist_lock);
+			list_del(&mm->mmlist);
+			spin_unlock(&mmlist_lock);
+		}
 		put_swap_token(mm);
 		mmdrop(mm);
 	}
@@ -365,15 +356,11 @@ EXPORT_SYMBOL_GPL(mmput);
 /**
  * get_task_mm - acquire a reference to the task's mm
  *
- * Returns %NULL if the task has no mm. Checks if the use count
- * of the mm is non-zero and if so returns a reference to it, after
+ * Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning
+ * this kernel workthread has transiently adopted a user mm with use_mm,
+ * to do its AIO) is not set and if so returns a reference to it, after
  * bumping up the use count. User must release the mm via mmput()
  * after use. Typically used by /proc and ptrace.
- *
- * If the use count is zero, it means that this mm is going away,
- * so return %NULL. This only happens in the case of an AIO daemon
- * which has temporarily adopted an mm (see use_mm), in the course
- * of its final mmput, before exit_aio has completed.
  */
 struct mm_struct *get_task_mm(struct task_struct *task)
 {
@@ -382,12 +369,10 @@ struct mm_struct *get_task_mm(struct task_struct *task)
 	task_lock(task);
 	mm = task->mm;
 	if (mm) {
-		spin_lock(&mmlist_lock);
-		if (!atomic_read(&mm->mm_users))
+		if (task->flags & PF_BORROWED_MM)
 			mm = NULL;
 		else
 			atomic_inc(&mm->mm_users);
-		spin_unlock(&mmlist_lock);
 	}
 	task_unlock(task);
 	return mm;
...
@@ -289,8 +289,15 @@ skip_copy_pmd_range:	address = (address + PGDIR_SIZE) & PGDIR_MASK;
 				goto cont_copy_pte_range_noset;
 			/* pte contains position in swap, so copy. */
 			if (!pte_present(pte)) {
-				if (!pte_file(pte))
+				if (!pte_file(pte)) {
 					swap_duplicate(pte_to_swp_entry(pte));
+					if (list_empty(&dst->mmlist)) {
+						spin_lock(&mmlist_lock);
+						list_add(&dst->mmlist,
+							 &src->mmlist);
+						spin_unlock(&mmlist_lock);
+					}
+				}
 				set_pte(dst_pte, pte);
 				goto cont_copy_pte_range_noset;
 			}
...
@@ -35,6 +35,7 @@
  * mm->page_table_lock
  * zone->lru_lock (in mark_page_accessed)
  * swap_list_lock (in swap_free etc's swap_info_get)
+ * mmlist_lock (in mmput, drain_mmlist and others)
  * swap_device_lock (in swap_duplicate, swap_info_get)
  * mapping->private_lock (in __set_page_dirty_buffers)
  * inode_lock (in set_page_dirty's __mark_inode_dirty)
@@ -576,6 +577,11 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 		 */
 		BUG_ON(!PageSwapCache(page));
 		swap_duplicate(entry);
+		if (list_empty(&mm->mmlist)) {
+			spin_lock(&mmlist_lock);
+			list_add(&mm->mmlist, &init_mm.mmlist);
+			spin_unlock(&mmlist_lock);
+		}
 		set_pte(pte, swp_entry_to_pte(entry));
 		BUG_ON(pte_file(*pte));
...
@@ -648,11 +648,12 @@ static int try_to_unuse(unsigned int type)
 	 *
 	 * A simpler strategy would be to start at the last mm we
 	 * freed the previous entry from; but that would take less
-	 * advantage of mmlist ordering (now preserved by swap_out()),
-	 * which clusters forked address spaces together, most recent
-	 * child immediately after parent. If we race with dup_mmap(),
-	 * we very much want to resolve parent before child, otherwise
-	 * we may miss some entries: using last mm would invert that.
+	 * advantage of mmlist ordering, which clusters forked mms
+	 * together, child after parent. If we race with dup_mmap(), we
+	 * prefer to resolve parent before child, lest we miss entries
+	 * duplicated after we scanned child: using last mm would invert
+	 * that. Though it's only a serious concern when an overflowed
+	 * swap count is reset from SWAP_MAP_MAX, preventing a rescan.
 	 */
 	start_mm = &init_mm;
 	atomic_inc(&init_mm.mm_users);
@@ -660,15 +661,7 @@ static int try_to_unuse(unsigned int type)
 	/*
 	 * Keep on scanning until all entries have gone. Usually,
 	 * one pass through swap_map is enough, but not necessarily:
-	 * mmput() removes mm from mmlist before exit_mmap() and its
-	 * zap_page_range(). That's not too bad, those entries are
-	 * on their way out, and handled faster there than here.
-	 * do_munmap() behaves similarly, taking the range out of mm's
-	 * vma list before zap_page_range(). But unfortunately, when
-	 * unmapping a part of a vma, it takes the whole out first,
-	 * then reinserts what's left after (might even reschedule if
-	 * open() method called) - so swap entries may be invisible
-	 * to swapoff for a while, then reappear - but that is rare.
+	 * there are races when an instance of an entry might be missed.
 	 */
 	while ((i = find_next_to_unuse(si, i)) != 0) {
 		if (signal_pending(current)) {
@@ -720,7 +713,7 @@ static int try_to_unuse(unsigned int type)
 		wait_on_page_writeback(page);
 
 		/*
-		 * Remove all references to entry, without blocking.
+		 * Remove all references to entry.
 		 * Whenever we reach init_mm, there's no address space
 		 * to search, but use it as a reminder to search shmem.
 		 */
@@ -745,7 +738,10 @@ static int try_to_unuse(unsigned int type)
 			while (*swap_map > 1 && !retval &&
 					(p = p->next) != &start_mm->mmlist) {
 				mm = list_entry(p, struct mm_struct, mmlist);
-				atomic_inc(&mm->mm_users);
+				if (atomic_inc_return(&mm->mm_users) == 1) {
+					atomic_dec(&mm->mm_users);
+					continue;
+				}
 				spin_unlock(&mmlist_lock);
 				mmput(prev_mm);
 				prev_mm = mm;
@@ -858,6 +854,26 @@ static int try_to_unuse(unsigned int type)
 	return retval;
 }
 
+/*
+ * After a successful try_to_unuse, if no swap is now in use, we know we
+ * can empty the mmlist. swap_list_lock must be held on entry and exit.
+ * Note that mmlist_lock nests inside swap_list_lock, and an mm must be
+ * added to the mmlist just after page_duplicate - before would be racy.
+ */
+static void drain_mmlist(void)
+{
+	struct list_head *p, *next;
+	unsigned int i;
+
+	for (i = 0; i < nr_swapfiles; i++)
+		if (swap_info[i].inuse_pages)
+			return;
+	spin_lock(&mmlist_lock);
+	list_for_each_safe(p, next, &init_mm.mmlist)
+		list_del_init(p);
+	spin_unlock(&mmlist_lock);
+}
+
 /*
  * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
  * corresponds to page offset `offset'.
@@ -1172,6 +1188,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
 	}
 	down(&swapon_sem);
 	swap_list_lock();
+	drain_mmlist();
 	swap_device_lock(p);
 	swap_file = p->swap_file;
 	p->swap_file = NULL;
...