Commit 9c50823e authored by Andrew Morton, committed by Linus Torvalds

[PATCH] msync(): perform dirty page levelling

It seems sensible to perform dirty page throttling in msync: as the application
dirties pages we can kick off pdflush early, or even force the msync() caller
to perform writeout, or even throttle the msync() caller.

The main effect of this is to start disk writeback earlier if we've just
discovered that a large amount of pagecache has been dirtied.  (Otherwise it
wouldn't happen for up to five seconds, next time pdflush wakes up).

It will also cause the page-dirtying process to get penalised for dirtying
those pages rather than whacking someone else with the problem.

We should do this for munmap() and possibly even exit(), too.

We drop the mmap_sem while performing the dirty page balancing.  It doesn't
seem right to hold mmap_sem for that long.

Note that this patch only affects MS_ASYNC.  MS_SYNC will be syncing all the
dirty pages anyway.

We note that msync(MS_SYNC) does a full-file-sync inside mmap_sem, and always
has.  We can fix that up...

The patch also tightens up the mmap_sem coverage in sys_msync(): no point in
taking it while we perform the incoming arg checking.

Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 4741c9fd
...@@ -12,17 +12,20 @@ ...@@ -12,17 +12,20 @@
#include <linux/mm.h> #include <linux/mm.h>
#include <linux/mman.h> #include <linux/mman.h>
#include <linux/hugetlb.h> #include <linux/hugetlb.h>
#include <linux/writeback.h>
#include <linux/file.h>
#include <linux/syscalls.h> #include <linux/syscalls.h>
#include <asm/pgtable.h> #include <asm/pgtable.h>
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end) unsigned long addr, unsigned long end)
{ {
pte_t *pte; pte_t *pte;
spinlock_t *ptl; spinlock_t *ptl;
int progress = 0; int progress = 0;
unsigned long ret = 0;
again: again:
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
...@@ -44,57 +47,63 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, ...@@ -44,57 +47,63 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
continue; continue;
if (ptep_clear_flush_dirty(vma, addr, pte) || if (ptep_clear_flush_dirty(vma, addr, pte) ||
page_test_and_clear_dirty(page)) page_test_and_clear_dirty(page))
set_page_dirty(page); ret += set_page_dirty(page);
progress += 3; progress += 3;
} while (pte++, addr += PAGE_SIZE, addr != end); } while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap_unlock(pte - 1, ptl); pte_unmap_unlock(pte - 1, ptl);
cond_resched(); cond_resched();
if (addr != end) if (addr != end)
goto again; goto again;
return ret;
} }
static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud, static inline unsigned long msync_pmd_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end) pud_t *pud, unsigned long addr, unsigned long end)
{ {
pmd_t *pmd; pmd_t *pmd;
unsigned long next; unsigned long next;
unsigned long ret = 0;
pmd = pmd_offset(pud, addr); pmd = pmd_offset(pud, addr);
do { do {
next = pmd_addr_end(addr, end); next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd)) if (pmd_none_or_clear_bad(pmd))
continue; continue;
msync_pte_range(vma, pmd, addr, next); ret += msync_pte_range(vma, pmd, addr, next);
} while (pmd++, addr = next, addr != end); } while (pmd++, addr = next, addr != end);
return ret;
} }
static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd, static inline unsigned long msync_pud_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end) pgd_t *pgd, unsigned long addr, unsigned long end)
{ {
pud_t *pud; pud_t *pud;
unsigned long next; unsigned long next;
unsigned long ret = 0;
pud = pud_offset(pgd, addr); pud = pud_offset(pgd, addr);
do { do {
next = pud_addr_end(addr, end); next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud)) if (pud_none_or_clear_bad(pud))
continue; continue;
msync_pmd_range(vma, pud, addr, next); ret += msync_pmd_range(vma, pud, addr, next);
} while (pud++, addr = next, addr != end); } while (pud++, addr = next, addr != end);
return ret;
} }
static void msync_page_range(struct vm_area_struct *vma, static unsigned long msync_page_range(struct vm_area_struct *vma,
unsigned long addr, unsigned long end) unsigned long addr, unsigned long end)
{ {
pgd_t *pgd; pgd_t *pgd;
unsigned long next; unsigned long next;
unsigned long ret = 0;
/* For hugepages we can't go walking the page table normally, /* For hugepages we can't go walking the page table normally,
* but that's ok, hugetlbfs is memory based, so we don't need * but that's ok, hugetlbfs is memory based, so we don't need
* to do anything more on an msync(). * to do anything more on an msync().
*/ */
if (vma->vm_flags & VM_HUGETLB) if (vma->vm_flags & VM_HUGETLB)
return; return 0;
BUG_ON(addr >= end); BUG_ON(addr >= end);
pgd = pgd_offset(vma->vm_mm, addr); pgd = pgd_offset(vma->vm_mm, addr);
...@@ -103,8 +112,9 @@ static void msync_page_range(struct vm_area_struct *vma, ...@@ -103,8 +112,9 @@ static void msync_page_range(struct vm_area_struct *vma,
next = pgd_addr_end(addr, end); next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd)) if (pgd_none_or_clear_bad(pgd))
continue; continue;
msync_pud_range(vma, pgd, addr, next); ret += msync_pud_range(vma, pgd, addr, next);
} while (pgd++, addr = next, addr != end); } while (pgd++, addr = next, addr != end);
return ret;
} }
/* /*
...@@ -118,8 +128,9 @@ static void msync_page_range(struct vm_area_struct *vma, ...@@ -118,8 +128,9 @@ static void msync_page_range(struct vm_area_struct *vma,
* So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to * So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to
* applications. * applications.
*/ */
static int msync_interval(struct vm_area_struct *vma, static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
unsigned long addr, unsigned long end, int flags) unsigned long end, int flags,
unsigned long *nr_pages_dirtied)
{ {
int ret = 0; int ret = 0;
struct file *file = vma->vm_file; struct file *file = vma->vm_file;
...@@ -128,7 +139,7 @@ static int msync_interval(struct vm_area_struct *vma, ...@@ -128,7 +139,7 @@ static int msync_interval(struct vm_area_struct *vma,
return -EBUSY; return -EBUSY;
if (file && (vma->vm_flags & VM_SHARED)) { if (file && (vma->vm_flags & VM_SHARED)) {
msync_page_range(vma, addr, end); *nr_pages_dirtied = msync_page_range(vma, addr, end);
if (flags & MS_SYNC) { if (flags & MS_SYNC) {
struct address_space *mapping = file->f_mapping; struct address_space *mapping = file->f_mapping;
...@@ -157,11 +168,8 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) ...@@ -157,11 +168,8 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
unsigned long end; unsigned long end;
struct vm_area_struct *vma; struct vm_area_struct *vma;
int unmapped_error, error = -EINVAL; int unmapped_error, error = -EINVAL;
int done = 0;
if (flags & MS_SYNC)
current->flags |= PF_SYNCWRITE;
down_read(&current->mm->mmap_sem);
if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC)) if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
goto out; goto out;
if (start & ~PAGE_MASK) if (start & ~PAGE_MASK)
...@@ -180,13 +188,19 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) ...@@ -180,13 +188,19 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
* If the interval [start,end) covers some unmapped address ranges, * If the interval [start,end) covers some unmapped address ranges,
* just ignore them, but return -ENOMEM at the end. * just ignore them, but return -ENOMEM at the end.
*/ */
down_read(&current->mm->mmap_sem);
if (flags & MS_SYNC)
current->flags |= PF_SYNCWRITE;
vma = find_vma(current->mm, start); vma = find_vma(current->mm, start);
unmapped_error = 0; unmapped_error = 0;
for (;;) { do {
unsigned long nr_pages_dirtied = 0;
struct file *file;
/* Still start < end. */ /* Still start < end. */
error = -ENOMEM; error = -ENOMEM;
if (!vma) if (!vma)
goto out; goto out_unlock;
/* Here start < vma->vm_end. */ /* Here start < vma->vm_end. */
if (start < vma->vm_start) { if (start < vma->vm_start) {
unmapped_error = -ENOMEM; unmapped_error = -ENOMEM;
...@@ -195,22 +209,37 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) ...@@ -195,22 +209,37 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
/* Here vma->vm_start <= start < vma->vm_end. */ /* Here vma->vm_start <= start < vma->vm_end. */
if (end <= vma->vm_end) { if (end <= vma->vm_end) {
if (start < end) { if (start < end) {
error = msync_interval(vma, start, end, flags); error = msync_interval(vma, start, end, flags,
&nr_pages_dirtied);
if (error) if (error)
goto out; goto out_unlock;
} }
error = unmapped_error; error = unmapped_error;
goto out; done = 1;
} } else {
/* Here vma->vm_start <= start < vma->vm_end < end. */ /* Here vma->vm_start <= start < vma->vm_end < end. */
error = msync_interval(vma, start, vma->vm_end, flags); error = msync_interval(vma, start, vma->vm_end, flags,
&nr_pages_dirtied);
if (error) if (error)
goto out; goto out_unlock;
}
file = vma->vm_file;
start = vma->vm_end; start = vma->vm_end;
if ((flags & MS_ASYNC) && file && nr_pages_dirtied) {
get_file(file);
up_read(&current->mm->mmap_sem);
balance_dirty_pages_ratelimited_nr(file->f_mapping,
nr_pages_dirtied);
fput(file);
down_read(&current->mm->mmap_sem);
vma = find_vma(current->mm, start);
} else {
vma = vma->vm_next; vma = vma->vm_next;
} }
out: } while (!done);
up_read(&current->mm->mmap_sem); out_unlock:
current->flags &= ~PF_SYNCWRITE; current->flags &= ~PF_SYNCWRITE;
up_read(&current->mm->mmap_sem);
out:
return error; return error;
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment