Commit ef00e08e authored by Linus Torvalds

readahead: clean up and simplify the code for filemap page fault readahead

This shouldn't really change behavior all that much, but the single rather
complex function (with read-ahead inside a loop etc.) is broken up into
more manageable pieces.

The behavior is also less subtle: the read-ahead is done up-front rather
than inside some subtle loop, which avoids the now-unnecessary extra state
variables (i.e. "did_readaround" is gone).
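
In outline, the fault path now makes the read-ahead decision once, before
taking the page lock. A condensed sketch of the new flow (truncation check
and error handling omitted):

	page = find_get_page(mapping, offset);
	if (page) {
		/* Cached page: possibly extend the readahead window, then lock. */
		do_async_mmap_readahead(vma, ra, file, page, offset);
		lock_page(page);
	} else {
		/* Cache miss: read ahead up-front and account a major fault. */
		do_sync_mmap_readahead(vma, ra, file, offset);
		count_vm_event(PGMAJFAULT);
		ret = VM_FAULT_MAJOR;
		page = find_lock_page(mapping, offset);
	}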

Fengguang: the code split in fact fixed a bug reported by Pavel Levshin:
the PGMAJFAULT accounting used to be bypassed when MADV_RANDOM is set,
because in that case the original code jumped straight to the
no_cached_page read path.
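
Roughly, the old code did

	if (VM_RandomReadHint(vma))
		goto no_cached_page;	/* skips ret = VM_FAULT_MAJOR / count_vm_event(PGMAJFAULT) */

whereas the new code accounts the miss in the caller, so the helper's early
return for MADV_RANDOM no longer bypasses the accounting:

	do_sync_mmap_readahead(vma, ra, file, offset);	/* returns early for VM_RandomReadHint() */
	count_vm_event(PGMAJFAULT);
	ret = VM_FAULT_MAJOR;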

Cc: Pavel Levshin <lpk@581.spb.su>
Cc: <wli@movementarian.org>
Cc: Nick Piggin <npiggin@suse.de>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 51daa88e
@@ -1456,6 +1456,68 @@ static int page_cache_read(struct file *file, pgoff_t offset)
 #define MMAP_LOTSAMISS  (100)
 
+/*
+ * Synchronous readahead happens when we don't even find
+ * a page in the page cache at all.
+ */
+static void do_sync_mmap_readahead(struct vm_area_struct *vma,
+				   struct file_ra_state *ra,
+				   struct file *file,
+				   pgoff_t offset)
+{
+	unsigned long ra_pages;
+	struct address_space *mapping = file->f_mapping;
+
+	/* If we don't want any read-ahead, don't bother */
+	if (VM_RandomReadHint(vma))
+		return;
+
+	if (VM_SequentialReadHint(vma)) {
+		page_cache_sync_readahead(mapping, ra, file, offset, 1);
+		return;
+	}
+
+	if (ra->mmap_miss < INT_MAX)
+		ra->mmap_miss++;
+
+	/*
+	 * Do we miss much more than hit in this file? If so,
+	 * stop bothering with read-ahead. It will only hurt.
+	 */
+	if (ra->mmap_miss > MMAP_LOTSAMISS)
+		return;
+
+	ra_pages = max_sane_readahead(ra->ra_pages);
+	if (ra_pages) {
+		pgoff_t start = 0;
+
+		if (offset > ra_pages / 2)
+			start = offset - ra_pages / 2;
+		do_page_cache_readahead(mapping, file, start, ra_pages);
+	}
+}
+
+/*
+ * Asynchronous readahead happens when we find the page and PG_readahead,
+ * so we want to possibly extend the readahead further..
+ */
+static void do_async_mmap_readahead(struct vm_area_struct *vma,
+				    struct file_ra_state *ra,
+				    struct file *file,
+				    struct page *page,
+				    pgoff_t offset)
+{
+	struct address_space *mapping = file->f_mapping;
+
+	/* If we don't want any read-ahead, don't bother */
+	if (VM_RandomReadHint(vma))
+		return;
+	if (ra->mmap_miss > 0)
+		ra->mmap_miss--;
+	if (PageReadahead(page))
+		page_cache_async_readahead(mapping, ra, file, page, offset, 1);
+}
+
 /**
  * filemap_fault - read in file data for page fault handling
  * @vma:	vma in which the fault was taken
@@ -1475,78 +1537,44 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct address_space *mapping = file->f_mapping;
 	struct file_ra_state *ra = &file->f_ra;
 	struct inode *inode = mapping->host;
+	pgoff_t offset = vmf->pgoff;
 	struct page *page;
 	pgoff_t size;
-	int did_readaround = 0;
 	int ret = 0;
 
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (vmf->pgoff >= size)
+	if (offset >= size)
 		return VM_FAULT_SIGBUS;
 
-	/* If we don't want any read-ahead, don't bother */
-	if (VM_RandomReadHint(vma))
-		goto no_cached_page;
-
 	/*
 	 * Do we have something in the page cache already?
 	 */
-retry_find:
-	page = find_lock_page(mapping, vmf->pgoff);
-	/*
-	 * For sequential accesses, we use the generic readahead logic.
-	 */
-	if (VM_SequentialReadHint(vma)) {
-		if (!page) {
-			page_cache_sync_readahead(mapping, ra, file,
-							   vmf->pgoff, 1);
-			page = find_lock_page(mapping, vmf->pgoff);
-			if (!page)
-				goto no_cached_page;
-		}
-		if (PageReadahead(page)) {
-			page_cache_async_readahead(mapping, ra, file, page,
-							   vmf->pgoff, 1);
-		}
-	}
-
-	if (!page) {
-		unsigned long ra_pages;
-
-		ra->mmap_miss++;
-
+	page = find_get_page(mapping, offset);
+	if (likely(page)) {
 		/*
-		 * Do we miss much more than hit in this file? If so,
-		 * stop bothering with read-ahead. It will only hurt.
+		 * We found the page, so try async readahead before
+		 * waiting for the lock.
 		 */
-		if (ra->mmap_miss > MMAP_LOTSAMISS)
-			goto no_cached_page;
+		do_async_mmap_readahead(vma, ra, file, page, offset);
+		lock_page(page);
 
-		/*
-		 * To keep the pgmajfault counter straight, we need to
-		 * check did_readaround, as this is an inner loop.
-		 */
-		if (!did_readaround) {
-			ret = VM_FAULT_MAJOR;
-			count_vm_event(PGMAJFAULT);
-		}
-		did_readaround = 1;
-		ra_pages = max_sane_readahead(file->f_ra.ra_pages);
-		if (ra_pages) {
-			pgoff_t start = 0;
-
-			if (vmf->pgoff > ra_pages / 2)
-				start = vmf->pgoff - ra_pages / 2;
-			do_page_cache_readahead(mapping, file, start, ra_pages);
+		/* Did it get truncated? */
+		if (unlikely(page->mapping != mapping)) {
+			unlock_page(page);
+			put_page(page);
+			goto no_cached_page;
 		}
-		page = find_lock_page(mapping, vmf->pgoff);
-		if (!page)
-			goto no_cached_page;
+	} else {
+		/* No page in the page cache at all */
+		do_sync_mmap_readahead(vma, ra, file, offset);
+		count_vm_event(PGMAJFAULT);
+		ret = VM_FAULT_MAJOR;
+retry_find:
+		page = find_lock_page(mapping, offset);
+		if (!page)
+			goto no_cached_page;
 	}
 
-	if (!did_readaround)
-		ra->mmap_miss--;
-
 	/*
 	 * We have a locked page in the page cache, now we need to check
 	 * that it's up-to-date. If not, it is going to be due to an error.
@@ -1554,18 +1582,18 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	if (unlikely(!PageUptodate(page)))
 		goto page_not_uptodate;
 
-	/* Must recheck i_size under page lock */
+	/*
+	 * Found the page and have a reference on it.
+	 * We must recheck i_size under page lock.
+	 */
 	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-	if (unlikely(vmf->pgoff >= size)) {
+	if (unlikely(offset >= size)) {
 		unlock_page(page);
 		page_cache_release(page);
 		return VM_FAULT_SIGBUS;
 	}
 
-	/*
-	 * Found the page and have a reference on it.
-	 */
-	ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
+	ra->prev_pos = (loff_t)offset << PAGE_CACHE_SHIFT;
 
 	vmf->page = page;
 	return ret | VM_FAULT_LOCKED;
@@ -1574,7 +1602,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 * We're only likely to ever get here if MADV_RANDOM is in
 	 * effect.
 	 */
-	error = page_cache_read(file, vmf->pgoff);
+	error = page_cache_read(file, offset);
 
 	/*
 	 * The page we want has now been added to the page cache.
@@ -1594,12 +1622,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	return VM_FAULT_SIGBUS;
 
 page_not_uptodate:
-	/* IO error path */
-	if (!did_readaround) {
-		ret = VM_FAULT_MAJOR;
-		count_vm_event(PGMAJFAULT);
-	}
-
 	/*
 	 * Umm, take care of errors if the page isn't up-to-date.
 	 * Try to re-read it _once_. We do this synchronously,
...