Commit dd9fd0e0 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] rmap: nonlinear truncation

From: Hugh Dickins <hugh@veritas.com>

The earlier changes introducing PageAnon left truncated pages mapped into
nonlinear vmas unswappable.  Once we go to object-based rmap, it's
impossible to find where a file page is mapped once page->mapping is cleared:
switching them to anonymous is odd, and breaks strict commit accounting.

So now handle truncation of nonlinear vmas correctly.  And factor in
Daniel's cluster filesystem needs while we're there: when invalidating
local cache, we do want to unmap shared pages from all mms, but we do not
want to discard private COWed modifications of those pages (which
truncation discards to satisfy the SIGBUS semantics demanded by specs).

Drew from Daniel's patch (LKML 2 Mar 04), but didn't always follow it;
fewer name changes, but still some - "unmap" rather than "invalidate".
zap_page_range is not exported, safe to give it and all the too-many layers
an extra zap_details arg, in normal cases just NULL.

Given details, zap_pte_range checks page mapping or index to skip anon or
untruncated pages.  I didn't realize before implementing, that in nonlinear
case, it should set a file pte when truncating - otherwise linear pages
might appear in place of SIGBUS.  I suspect this implies that ->populate
functions ought to set file ptes beyond EOF instead of failing, but haven't
changed them as yet.

To avoid making yet another copy of that ugly linear pgidx test, added
inline function linear_page_index (to pagemap.h to get PAGE_CACHE_SIZE,
though as usual things don't really work if it differs from PAGE_SIZE). 
Ooh, I thought I'd removed ___add_to_page_cache last time, do so now.

unmap_page_range static, shift its hugepage check up into sole caller
unmap_vmas.  Killed "killme" debug from unmap_vmas, not seen it trigger.
unmap_mapping_range is exported without restriction: I'm one of those who
believe it should be generally available.  But I'm wrongly placed to decide
that, probably just sob quietly to myself if _GPL added later.
parent 3df9aaf3
......@@ -410,7 +410,7 @@ static inline size_t read_zero_pagealigned(char * buf, size_t size)
if (count > size)
count = size;
zap_page_range(vma, addr, count);
zap_page_range(vma, addr, count, NULL);
zeromap_page_range(vma, addr, count, PAGE_COPY);
size -= count;
......
......@@ -439,22 +439,27 @@ struct file *shmem_file_setup(char * name, loff_t size, unsigned long flags);
void shmem_lock(struct file * file, int lock);
int shmem_zero_setup(struct vm_area_struct *);
struct zap_details;
void zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size);
unsigned long size, struct zap_details *);
int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
struct vm_area_struct *start_vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted);
void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long address, unsigned long size);
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *);
void clear_page_tables(struct mmu_gather *tlb, unsigned long first, int nr);
int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
struct vm_area_struct *vma);
int zeromap_page_range(struct vm_area_struct *vma, unsigned long from,
unsigned long size, pgprot_t prot);
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows);
/*
 * Convenience wrapper around unmap_mapping_range() for callers that want
 * to invalidate cached pages without truncating the file: it passes
 * even_cows = 0, the setting documented for unmap_mapping_range() as
 * "don't throw away private data".
 *
 * @mapping:   address space whose mmaps are to be unmapped
 * @holebegin: byte offset of the start of the range, forwarded unchanged
 * @holelen:   length of the range in bytes, forwarded unchanged
 */
static inline void unmap_shared_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen)
{
unmap_mapping_range(mapping, holebegin, holelen, 0);
}
extern void invalidate_mmap_range(struct address_space *mapping,
loff_t const holebegin,
loff_t const holelen);
extern int vmtruncate(struct inode * inode, loff_t offset);
extern pmd_t *FASTCALL(__pmd_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address));
extern pte_t *FASTCALL(pte_alloc_kernel(struct mm_struct *mm, pmd_t *pmd, unsigned long address));
......
......@@ -139,14 +139,12 @@ static inline unsigned long get_page_cache_size(void)
return atomic_read(&nr_pagecache);
}
static inline void ___add_to_page_cache(struct page *page,
struct address_space *mapping, unsigned long index)
static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
unsigned long address)
{
page->mapping = mapping;
page->index = index;
mapping->nrpages++;
pagecache_acct(1);
pgoff_t pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
pgoff += vma->vm_pgoff;
return pgoff >> (PAGE_CACHE_SHIFT - PAGE_SHIFT);
}
extern void FASTCALL(__lock_page(struct page *page));
......
......@@ -62,7 +62,7 @@
* ->mapping->tree_lock
*
* ->i_sem
* ->i_shared_sem (truncate->invalidate_mmap_range)
* ->i_shared_sem (truncate->unmap_mapping_range)
*
* ->mmap_sem
* ->i_shared_sem (various places)
......@@ -1363,11 +1363,7 @@ static int filemap_populate(struct vm_area_struct *vma,
* If a nonlinear mapping then store the file page offset
* in the pte.
*/
unsigned long pgidx;
pgidx = (addr - vma->vm_start) >> PAGE_SHIFT;
pgidx += vma->vm_pgoff;
pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
if (pgoff != pgidx) {
if (pgoff != linear_page_index(vma, addr)) {
err = install_file_pte(mm, vma, addr, pgoff, prot);
if (err)
return err;
......
......@@ -95,7 +95,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
if (vma->vm_flags & VM_LOCKED)
return -EINVAL;
zap_page_range(vma, start, end - start);
zap_page_range(vma, start, end - start, NULL);
return 0;
}
......
......@@ -384,9 +384,19 @@ skip_copy_pmd_range: address = (address + PGDIR_SIZE) & PGDIR_MASK;
return -ENOMEM;
}
static void
zap_pte_range(struct mmu_gather *tlb, pmd_t * pmd,
unsigned long address, unsigned long size)
/*
 * Parameter block passed down to zap_pte_range in exceptional cases.
 *
 * Ordinary callers pass a NULL pointer and no filtering is applied.
 * unmap_mapping_range() fills one of these in so that zap_pte_range
 * can leave some present pages mapped:
 *  - check_mapping is set when private COWed pages are to be kept
 *    (even_cows == 0); a page whose page->mapping differs is skipped.
 *  - nonlinear_vma is set only while a VM_NONLINEAR vma is being zapped;
 *    page->index is then tested against the inclusive range
 *    [first_index, last_index] and pages outside it are skipped.
 * Whenever a non-NULL block is passed, zap_pte_range also leaves
 * non-present ptes (swap and file entries) untouched.
 */
struct zap_details {
struct vm_area_struct *nonlinear_vma; /* Check page->index if set */
struct address_space *check_mapping; /* Check page->mapping if set */
pgoff_t first_index; /* Lowest page->index to unmap */
pgoff_t last_index; /* Highest page->index to unmap */
};
static void zap_pte_range(struct mmu_gather *tlb,
pmd_t *pmd, unsigned long address,
unsigned long size, struct zap_details *details)
{
unsigned long offset;
pte_t *ptep;
......@@ -408,35 +418,64 @@ zap_pte_range(struct mmu_gather *tlb, pmd_t * pmd,
if (pte_none(pte))
continue;
if (pte_present(pte)) {
struct page *page = NULL;
unsigned long pfn = pte_pfn(pte);
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
if (PageReserved(page))
page = NULL;
}
if (unlikely(details) && page) {
/*
* unmap_shared_mapping_range() wants to
* invalidate cache without truncating:
* unmap shared but keep private pages.
*/
if (details->check_mapping &&
details->check_mapping != page->mapping)
continue;
/*
* Each page->index must be checked when
* invalidating or truncating nonlinear.
*/
if (details->nonlinear_vma &&
(page->index < details->first_index ||
page->index > details->last_index))
continue;
}
pte = ptep_get_and_clear(ptep);
tlb_remove_tlb_entry(tlb, ptep, address+offset);
if (pfn_valid(pfn)) {
struct page *page = pfn_to_page(pfn);
if (!PageReserved(page)) {
if (unlikely(!page))
continue;
if (unlikely(details) && details->nonlinear_vma
&& linear_page_index(details->nonlinear_vma,
address+offset) != page->index)
set_pte(ptep, pgoff_to_pte(page->index));
if (pte_dirty(pte))
set_page_dirty(page);
if (pte_young(pte) &&
page_mapping(page))
if (pte_young(pte) && page_mapping(page))
mark_page_accessed(page);
tlb->freed++;
page_remove_rmap(page, ptep);
tlb_remove_page(tlb, page);
continue;
}
}
} else {
/*
* If details->check_mapping, we leave swap entries;
* if details->nonlinear_vma, we leave file entries.
*/
if (unlikely(details))
continue;
if (!pte_file(pte))
free_swap_and_cache(pte_to_swp_entry(pte));
pte_clear(ptep);
}
}
pte_unmap(ptep-1);
}
static void
zap_pmd_range(struct mmu_gather *tlb, pgd_t * dir,
unsigned long address, unsigned long size)
static void zap_pmd_range(struct mmu_gather *tlb,
pgd_t * dir, unsigned long address,
unsigned long size, struct zap_details *details)
{
pmd_t * pmd;
unsigned long end;
......@@ -453,28 +492,23 @@ zap_pmd_range(struct mmu_gather *tlb, pgd_t * dir,
if (end > ((address + PGDIR_SIZE) & PGDIR_MASK))
end = ((address + PGDIR_SIZE) & PGDIR_MASK);
do {
zap_pte_range(tlb, pmd, address, end - address);
zap_pte_range(tlb, pmd, address, end - address, details);
address = (address + PMD_SIZE) & PMD_MASK;
pmd++;
} while (address < end);
}
void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long address, unsigned long end)
static void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long address,
unsigned long end, struct zap_details *details)
{
pgd_t * dir;
if (is_vm_hugetlb_page(vma)) {
unmap_hugepage_range(vma, address, end);
return;
}
BUG_ON(address >= end);
dir = pgd_offset(vma->vm_mm, address);
tlb_start_vma(tlb, vma);
do {
zap_pmd_range(tlb, dir, address, end - address);
zap_pmd_range(tlb, dir, address, end - address, details);
address = (address + PGDIR_SIZE) & PGDIR_MASK;
dir++;
} while (address && (address < end));
......@@ -504,6 +538,7 @@ void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
* @start_addr: virtual address at which to start unmapping
* @end_addr: virtual address at which to end unmapping
* @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
* @details: details of nonlinear truncation or shared cache invalidation
*
* Returns the number of vma's which were covered by the unmapping.
*
......@@ -524,22 +559,14 @@ void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
*/
int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted)
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
{
unsigned long zap_bytes = ZAP_BLOCK_SIZE;
unsigned long tlb_start = 0; /* For tlb_finish_mmu */
int tlb_start_valid = 0;
int ret = 0;
if (vma) { /* debug. killme. */
if (end_addr <= vma->vm_start)
printk("%s: end_addr(0x%08lx) <= vm_start(0x%08lx)\n",
__FUNCTION__, end_addr, vma->vm_start);
if (start_addr >= vma->vm_end)
printk("%s: start_addr(0x%08lx) <= vm_end(0x%08lx)\n",
__FUNCTION__, start_addr, vma->vm_end);
}
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
unsigned long start;
unsigned long end;
......@@ -558,17 +585,20 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
while (start != end) {
unsigned long block;
if (is_vm_hugetlb_page(vma))
block = end - start;
else
block = min(zap_bytes, end - start);
if (!tlb_start_valid) {
tlb_start = start;
tlb_start_valid = 1;
}
unmap_page_range(*tlbp, vma, start, start + block);
if (is_vm_hugetlb_page(vma)) {
block = end - start;
unmap_hugepage_range(vma, start, end);
} else {
block = min(zap_bytes, end - start);
unmap_page_range(*tlbp, vma, start,
start + block, details);
}
start += block;
zap_bytes -= block;
if ((long)zap_bytes > 0)
......@@ -582,9 +612,6 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
}
zap_bytes = ZAP_BLOCK_SIZE;
}
if (vma->vm_next && vma->vm_next->vm_start < vma->vm_end)
printk("%s: VMA list is not sorted correctly!\n",
__FUNCTION__);
}
return ret;
}
......@@ -594,9 +621,10 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
* @vma: vm_area_struct holding the applicable pages
* @address: starting address of pages to zap
* @size: number of bytes to zap
* @details: details of nonlinear truncation or shared cache invalidation
*/
void zap_page_range(struct vm_area_struct *vma,
unsigned long address, unsigned long size)
void zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
struct mm_struct *mm = vma->vm_mm;
struct mmu_gather *tlb;
......@@ -613,7 +641,7 @@ void zap_page_range(struct vm_area_struct *vma,
lru_add_drain();
spin_lock(&mm->page_table_lock);
tlb = tlb_gather_mmu(mm, 0);
unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted);
unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
tlb_finish_mmu(tlb, address, end);
spin_unlock(&mm->page_table_lock);
}
......@@ -1130,46 +1158,46 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
}
/*
* Helper function for invalidate_mmap_range().
* Both hba and hlen are page numbers in PAGE_SIZE units.
* An hlen of zero blows away the entire portion file after hba.
* Helper function for unmap_mapping_range().
*/
static void
invalidate_mmap_range_list(struct list_head *head,
unsigned long const hba,
unsigned long const hlen)
static void unmap_mapping_range_list(struct list_head *head,
struct zap_details *details)
{
struct list_head *curr;
unsigned long hea; /* last page of hole. */
unsigned long vba;
unsigned long vea; /* last page of corresponding uva hole. */
struct vm_area_struct *vp;
unsigned long zba;
unsigned long zea;
hea = hba + hlen - 1; /* avoid overflow. */
if (hea < hba)
hea = ULONG_MAX;
list_for_each(curr, head) {
vp = list_entry(curr, struct vm_area_struct, shared);
vba = vp->vm_pgoff;
vea = vba + ((vp->vm_end - vp->vm_start) >> PAGE_SHIFT) - 1;
if (hea < vba || vea < hba)
struct vm_area_struct *vma;
pgoff_t vba, vea, zba, zea;
list_for_each_entry(vma, head, shared) {
if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
details->nonlinear_vma = vma;
zap_page_range(vma, vma->vm_start,
vma->vm_end - vma->vm_start, details);
details->nonlinear_vma = NULL;
continue;
}
vba = vma->vm_pgoff;
vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
/* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
if (vba > details->last_index || vea < details->first_index)
continue; /* Mapping disjoint from hole. */
zba = (hba <= vba) ? vba : hba;
zea = (vea <= hea) ? vea : hea;
zap_page_range(vp,
((zba - vba) << PAGE_SHIFT) + vp->vm_start,
(zea - zba + 1) << PAGE_SHIFT);
zba = details->first_index;
if (zba < vba)
zba = vba;
zea = details->last_index;
if (zea > vea)
zea = vea;
zap_page_range(vma,
((zba - vba) << PAGE_SHIFT) + vma->vm_start,
(zea - zba + 1) << PAGE_SHIFT,
details->check_mapping? details: NULL);
}
}
/**
* invalidate_mmap_range - invalidate the portion of all mmaps
* unmap_mapping_range - unmap the portion of all mmaps
* in the specified address_space corresponding to the specified
* page range in the underlying file.
* @address_space: the address space containing mmaps to be invalidated.
* @holebegin: byte in first page to invalidate, relative to the start of
* @mapping: the address space containing mmaps to be unmapped.
* @holebegin: byte in first page to unmap, relative to the start of
* the underlying file. This will be rounded down to a PAGE_SIZE
* boundary. Note that this is different from vmtruncate(), which
* must keep the partial page. In contrast, we must get rid of
......@@ -1177,31 +1205,45 @@ invalidate_mmap_range_list(struct list_head *head,
* @holelen: size of prospective hole in bytes. This will be rounded
* up to a PAGE_SIZE boundary. A holelen of zero truncates to the
* end of the file.
* @even_cows: 1 when truncating a file, unmap even private COWed pages;
* but 0 when invalidating pagecache, don't throw away private data.
*/
void invalidate_mmap_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen)
void unmap_mapping_range(struct address_space *mapping,
loff_t const holebegin, loff_t const holelen, int even_cows)
{
unsigned long hba = holebegin >> PAGE_SHIFT;
unsigned long hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
struct zap_details details;
pgoff_t hba = holebegin >> PAGE_SHIFT;
pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
/* Check for overflow. */
if (sizeof(holelen) > sizeof(hlen)) {
long long holeend =
(holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (holeend & ~(long long)ULONG_MAX)
hlen = ULONG_MAX - hba + 1;
}
details.check_mapping = even_cows? NULL: mapping;
details.nonlinear_vma = NULL;
details.first_index = hba;
details.last_index = hba + hlen - 1;
if (details.last_index < details.first_index)
details.last_index = ULONG_MAX;
down(&mapping->i_shared_sem);
/* Protect against page fault */
atomic_inc(&mapping->truncate_count);
if (unlikely(!list_empty(&mapping->i_mmap)))
invalidate_mmap_range_list(&mapping->i_mmap, hba, hlen);
unmap_mapping_range_list(&mapping->i_mmap, &details);
/* Don't waste time to check mapping on fully shared vmas */
details.check_mapping = NULL;
if (unlikely(!list_empty(&mapping->i_mmap_shared)))
invalidate_mmap_range_list(&mapping->i_mmap_shared, hba, hlen);
unmap_mapping_range_list(&mapping->i_mmap_shared, &details);
up(&mapping->i_shared_sem);
}
EXPORT_SYMBOL_GPL(invalidate_mmap_range);
EXPORT_SYMBOL(unmap_mapping_range);
/*
* Handle all mappings that got truncated by a "truncate()"
......@@ -1219,7 +1261,7 @@ int vmtruncate(struct inode * inode, loff_t offset)
if (inode->i_size < offset)
goto do_expand;
i_size_write(inode, offset);
invalidate_mmap_range(mapping, offset + PAGE_SIZE - 1, 0);
unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
truncate_inode_pages(mapping, offset);
goto out_truncate;
......@@ -1498,7 +1540,7 @@ do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
spin_lock(&mm->page_table_lock);
/*
* For a file-backed vma, someone could have truncated or otherwise
* invalidated this page. If invalidate_mmap_range got called,
* invalidated this page. If unmap_mapping_range got called,
* retry getting the page.
*/
if (mapping &&
......
......@@ -728,7 +728,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
fput(file);
/* Undo any partial mapping done by a device driver. */
zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start);
zap_page_range(vma, vma->vm_start, vma->vm_end - vma->vm_start, NULL);
free_vma:
kmem_cache_free(vm_area_cachep, vma);
unacct_error:
......@@ -1160,7 +1160,7 @@ static void unmap_region(struct mm_struct *mm,
lru_add_drain();
tlb = tlb_gather_mmu(mm, 0);
unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted);
unmap_vmas(&tlb, mm, vma, start, end, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
if (is_hugepage_only_range(start, end - start))
......@@ -1446,7 +1446,7 @@ void exit_mmap(struct mm_struct *mm)
flush_cache_mm(mm);
/* Use ~0UL here to ensure all VMAs in the mm are unmapped */
mm->map_count -= unmap_vmas(&tlb, mm, mm->mmap, 0,
~0UL, &nr_accounted);
~0UL, &nr_accounted, NULL);
vm_unacct_memory(nr_accounted);
BUG_ON(mm->map_count); /* This is just debugging */
clear_page_tables(tlb, FIRST_USER_PGD_NR, USER_PTRS_PER_PGD);
......
......@@ -359,16 +359,12 @@ static int fastcall try_to_unmap_one(struct page * page, pte_addr_t paddr)
set_pte(ptep, swp_entry_to_pte(entry));
BUG_ON(pte_file(*ptep));
} else {
unsigned long pgidx;
/*
* If a nonlinear mapping then store the file page offset
* in the pte.
*/
BUG_ON(!page->mapping);
pgidx = (address - vma->vm_start) >> PAGE_SHIFT;
pgidx += vma->vm_pgoff;
pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
if (page->index != pgidx) {
if (page->index != linear_page_index(vma, address)) {
set_pte(ptep, pgoff_to_pte(page->index));
BUG_ON(!pte_file(*ptep));
}
......
......@@ -1055,11 +1055,7 @@ static int shmem_populate(struct vm_area_struct *vma,
* If a nonlinear mapping then store the file page
* offset in the pte.
*/
unsigned long pgidx;
pgidx = (addr - vma->vm_start) >> PAGE_SHIFT;
pgidx += vma->vm_pgoff;
pgidx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
if (pgoff != pgidx) {
if (pgoff != linear_page_index(vma, addr)) {
err = install_file_pte(mm, vma, addr, pgoff, prot);
if (err)
return err;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment