Commit 574823bf authored by Linus Torvalds's avatar Linus Torvalds

Change mincore() to count "mapped" pages rather than "cached" pages

The semantics of what "in core" means for the mincore() system call are
somewhat unclear, but Linux has always (since 2.3.52, which is when
mincore() was initially done) treated it as "page is available in page
cache" rather than "page is mapped in the mapping".

The problem with that traditional semantic is that it exposes a lot of
system cache state that it really probably shouldn't, and that users
shouldn't really even care about.

So let's try to avoid that information leak by simply changing the
semantics to be that mincore() counts actual mapped pages, not pages
that might be cheaply mapped if they were faulted (note the "might be"
part of the old semantics: being in the cache doesn't actually guarantee
that you can access them without IO anyway, since things like network
filesystems may have to revalidate the cache before use).

In many ways the old semantics were somewhat insane even aside from the
information leak issue.  From the very beginning (and that beginning is
a long time ago: 2.3.52 was released in March 2000, I think), the code
had a comment saying

  Later we can get more picky about what "in core" means precisely.

and this is that "later".  Admittedly it is much later than is really
comfortable.

NOTE! This is a real semantic change, and it is for example known to
change the output of "fincore", since that program literally does a
mmmap without populating it, and then doing "mincore()" on that mapping
that doesn't actually have any pages in it.

I'm hoping that nobody actually has any workflow that cares, and the
info leak is real.

We may have to do something different if it turns out that people have
valid reasons to want the old semantics, and if we can limit the
information leak sanely.

Cc: Kevin Easton <kevin@guarana.org>
Cc: Jiri Kosina <jikos@kernel.org>
Cc: Masatake YAMATO <yamato@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 94bd8a05
...@@ -42,72 +42,14 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, ...@@ -42,72 +42,14 @@ static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
return 0; return 0;
} }
/*
* Later we can get more picky about what "in core" means precisely.
* For now, simply check to see if the page is in the page cache,
* and is up to date; i.e. that no page-in operation would be required
* at this time if an application were to map and access this page.
*/
static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
{
unsigned char present = 0;
struct page *page;
/*
* When tmpfs swaps out a page from a file, any process mapping that
* file will not get a swp_entry_t in its pte, but rather it is like
* any other file mapping (ie. marked !present and faulted in with
* tmpfs's .fault). So swapped out tmpfs mappings are tested here.
*/
#ifdef CONFIG_SWAP
if (shmem_mapping(mapping)) {
page = find_get_entry(mapping, pgoff);
/*
* shmem/tmpfs may return swap: account for swapcache
* page too.
*/
if (xa_is_value(page)) {
swp_entry_t swp = radix_to_swp_entry(page);
page = find_get_page(swap_address_space(swp),
swp_offset(swp));
}
} else
page = find_get_page(mapping, pgoff);
#else
page = find_get_page(mapping, pgoff);
#endif
if (page) {
present = PageUptodate(page);
put_page(page);
}
return present;
}
static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
struct vm_area_struct *vma, unsigned char *vec)
{
unsigned long nr = (end - addr) >> PAGE_SHIFT;
int i;
if (vma->vm_file) {
pgoff_t pgoff;
pgoff = linear_page_index(vma, addr);
for (i = 0; i < nr; i++, pgoff++)
vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff);
} else {
for (i = 0; i < nr; i++)
vec[i] = 0;
}
return nr;
}
static int mincore_unmapped_range(unsigned long addr, unsigned long end, static int mincore_unmapped_range(unsigned long addr, unsigned long end,
struct mm_walk *walk) struct mm_walk *walk)
{ {
walk->private += __mincore_unmapped_range(addr, end, unsigned char *vec = walk->private;
walk->vma, walk->private); unsigned long nr = (end - addr) >> PAGE_SHIFT;
memset(vec, 0, nr);
walk->private += nr;
return 0; return 0;
} }
...@@ -127,8 +69,9 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, ...@@ -127,8 +69,9 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
goto out; goto out;
} }
/* We'll consider a THP page under construction to be there */
if (pmd_trans_unstable(pmd)) { if (pmd_trans_unstable(pmd)) {
__mincore_unmapped_range(addr, end, vma, vec); memset(vec, 1, nr);
goto out; goto out;
} }
...@@ -137,28 +80,17 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, ...@@ -137,28 +80,17 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
pte_t pte = *ptep; pte_t pte = *ptep;
if (pte_none(pte)) if (pte_none(pte))
__mincore_unmapped_range(addr, addr + PAGE_SIZE, *vec = 0;
vma, vec);
else if (pte_present(pte)) else if (pte_present(pte))
*vec = 1; *vec = 1;
else { /* pte is a swap entry */ else { /* pte is a swap entry */
swp_entry_t entry = pte_to_swp_entry(pte); swp_entry_t entry = pte_to_swp_entry(pte);
if (non_swap_entry(entry)) {
/* /*
* migration or hwpoison entries are always * migration or hwpoison entries are always
* uptodate * uptodate
*/ */
*vec = 1; *vec = !!non_swap_entry(entry);
} else {
#ifdef CONFIG_SWAP
*vec = mincore_page(swap_address_space(entry),
swp_offset(entry));
#else
WARN_ON(1);
*vec = 1;
#endif
}
} }
vec++; vec++;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment