Commit 6b3a7077 authored by Linus Torvalds

Merge branch 'page-refs' (page ref overflow)

Merge page ref overflow branch.

Jann Horn reported that he can overflow the page ref count with
sufficient memory (and a filesystem that is intentionally extremely
slow).

Admittedly it's not exactly easy.  To have more than four billion
references to a page requires a minimum of 32GB of kernel memory just
for the pointers to the pages, much less any metadata to keep track of
those pointers.  Jann needed a total of 140GB of memory and a specially
crafted filesystem that leaves all reads pending (in order to not ever
free the page references and just keep adding more).
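The back-of-the-envelope math is simple: a 32-bit reference count wraps after 2^32 increments, and each reference taken through get_user_pages() is anchored by at least one 8-byte page pointer. A quick standalone sketch of the arithmetic (assuming 64-bit pointers):

#include <stdio.h>

int main(void)
{
	/* A 32-bit page->_refcount wraps after 2^32 increments. */
	unsigned long long refs = 1ULL << 32;

	/* Each reference is anchored by at least one 8-byte pointer. */
	unsigned long long bytes = refs * sizeof(void *);

	printf("%llu GiB of pointers, minimum\n", bytes >> 30); /* 32 GiB */
	return 0;
}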

Still, we have a fairly straightforward way to limit the two obvious
user-controllable sources of page references: direct-IO like page
references gotten through get_user_pages(), and the splice pipe page
duplication.  So let's just do that.

* branch page-refs:
  fs: prevent page refcount overflow in pipe_buf_get
  mm: prevent get_user_pages() from overflowing page refcount
  mm: add 'try_get_page()' helper function
  mm: make page ref count overflow check tighter and more explicit
parents 4443f8e6 15fab63e
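Before the diffs, a userspace-level illustration of the second source (a minimal sketch, not Jann's reproducer): tee() duplicates pipe buffers from one pipe into another without copying the data, taking one extra page reference per duplicated buffer via pipe_buf_get().

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int src[2], dst[2];
	char buf[4096] = { 0 };

	if (pipe(src) || pipe(dst))
		return 1;
	if (write(src[1], buf, sizeof(buf)) < 0)	/* fill one pipe buffer */
		return 1;

	/*
	 * tee() duplicates the pipe_buffer into dst without copying the
	 * page: pipe_buf_get() just bumps page->_refcount. Before this
	 * merge, nothing bounded how many such references could pile up.
	 */
	if (tee(src[0], dst[1], sizeof(buf), 0) < 0)
		perror("tee");
	return 0;
}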
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -2056,10 +2056,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 		rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
 
 	ret = -EINVAL;
-	if (rem < len) {
-		pipe_unlock(pipe);
-		goto out;
-	}
+	if (rem < len)
+		goto out_free;
 
 	rem = len;
 	while (rem) {
@@ -2077,7 +2075,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 			pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
 			pipe->nrbufs--;
 		} else {
-			pipe_buf_get(pipe, ibuf);
+			if (!pipe_buf_get(pipe, ibuf))
+				goto out_free;
+
 			*obuf = *ibuf;
 			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
 			obuf->len = rem;
@@ -2100,11 +2100,11 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
 	ret = fuse_dev_do_write(fud, &cs, len);
 
 	pipe_lock(pipe);
+out_free:
 	for (idx = 0; idx < nbuf; idx++)
 		pipe_buf_release(pipe, &bufs[idx]);
 	pipe_unlock(pipe);
 
-out:
 	kvfree(bufs);
 	return ret;
 }
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -188,9 +188,9 @@ EXPORT_SYMBOL(generic_pipe_buf_steal);
  *	in the tee() system call, when we duplicate the buffers in one
  *	pipe into another.
  */
-void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
+bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 {
-	get_page(buf->page);
+	return try_get_page(buf->page);
 }
 EXPORT_SYMBOL(generic_pipe_buf_get);
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1593,7 +1593,11 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
 		 * Get a reference to this pipe buffer,
 		 * so we can copy the contents over.
 		 */
-		pipe_buf_get(ipipe, ibuf);
+		if (!pipe_buf_get(ipipe, ibuf)) {
+			if (ret == 0)
+				ret = -EFAULT;
+			break;
+		}
 		*obuf = *ibuf;
 
 		/*
@@ -1667,7 +1671,11 @@ static int link_pipe(struct pipe_inode_info *ipipe,
 		 * Get a reference to this pipe buffer,
 		 * so we can copy the contents over.
 		 */
-		pipe_buf_get(ipipe, ibuf);
+		if (!pipe_buf_get(ipipe, ibuf)) {
+			if (ret == 0)
+				ret = -EFAULT;
+			break;
+		}
 
 		obuf = opipe->bufs + nbuf;
 		*obuf = *ibuf;
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -966,6 +966,10 @@ static inline bool is_pci_p2pdma_page(const struct page *page)
 }
 #endif /* CONFIG_DEV_PAGEMAP_OPS */
 
+/* 127: arbitrary random number, small enough to assemble well */
+#define page_ref_zero_or_close_to_overflow(page) \
+	((unsigned int) page_ref_count(page) + 127u <= 127u)
+
 static inline void get_page(struct page *page)
 {
 	page = compound_head(page);
@@ -973,8 +977,17 @@ static inline void get_page(struct page *page)
 	 * Getting a normal page or the head of a compound page
 	 * requires to already have an elevated page->_refcount.
 	 */
-	VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
+	VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page);
 	page_ref_inc(page);
 }
 
+static inline __must_check bool try_get_page(struct page *page)
+{
+	page = compound_head(page);
+	if (WARN_ON_ONCE(page_ref_count(page) <= 0))
+		return false;
+	page_ref_inc(page);
+	return true;
+}
+
 static inline void put_page(struct page *page)
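The new macro's single unsigned comparison does two jobs at once: interpreted as a signed value, page_ref_count() trips the check exactly when it lies in [-127, 0], so a freed page (zero) and a count that has just wrapped past zero (small negative) are both caught before the increment. A standalone model of the check, outside the kernel:

#include <assert.h>

/* Standalone model of page_ref_zero_or_close_to_overflow():
 * true exactly for counts in [-127, 0]. */
static int zero_or_close_to_overflow(int count)
{
	return (unsigned int)count + 127u <= 127u;
}

int main(void)
{
	assert(zero_or_close_to_overflow(0));	  /* page already free */
	assert(zero_or_close_to_overflow(-1));	  /* count wrapped past zero */
	assert(zero_or_close_to_overflow(-127));  /* edge of the window */
	assert(!zero_or_close_to_overflow(1));	  /* ordinary live page */
	assert(!zero_or_close_to_overflow(-128)); /* outside the window */
	return 0;
}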
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -101,18 +101,20 @@ struct pipe_buf_operations {
 	/*
 	 * Get a reference to the pipe buffer.
 	 */
-	void (*get)(struct pipe_inode_info *, struct pipe_buffer *);
+	bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
 };
 
 /**
  * pipe_buf_get - get a reference to a pipe_buffer
  * @pipe:	the pipe that the buffer belongs to
  * @buf:	the buffer to get a reference to
+ *
+ * Return: %true if the reference was successfully obtained.
  */
-static inline void pipe_buf_get(struct pipe_inode_info *pipe,
+static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe,
 				struct pipe_buffer *buf)
 {
-	buf->ops->get(pipe, buf);
+	return buf->ops->get(pipe, buf);
 }
 
 /**
@@ -171,7 +173,7 @@ struct pipe_inode_info *alloc_pipe_info(void);
 void free_pipe_info(struct pipe_inode_info *);
 
 /* Generic pipe buffer ops functions */
-void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
+bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
 int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
 int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
 void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
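A note on the __must_check added to pipe_buf_get(): in the kernel that macro expands to GCC's warn_unused_result attribute, so any caller still treating the function as void now draws a compile-time warning instead of silently dropping a failed reference. A minimal illustration outside the kernel, using a hypothetical try_take() helper:

#define __must_check __attribute__((warn_unused_result))

static __must_check int try_take(int *ref)
{
	if (*ref <= 0)
		return 0;
	(*ref)++;
	return 1;
}

int main(void)
{
	int ref = 1;

	try_take(&ref);		/* warning: ignoring return value */
	if (!try_take(&ref))	/* checked use compiles cleanly */
		return 1;
	return 0;
}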
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -7041,12 +7041,16 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
 	buf->private = 0;
 }
 
-static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
+static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
 				struct pipe_buffer *buf)
 {
 	struct buffer_ref *ref = (struct buffer_ref *)buf->private;
 
+	if (ref->ref > INT_MAX/2)
+		return false;
+
 	ref->ref++;
+	return true;
 }
 
 /* Pipe buffer operations for a buffer. */
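The ftrace ring-buffer reference is a plain integer, not a struct page refcount, so it gets its own guard: stop handing out references once the counter passes INT_MAX/2, which leaves roughly a billion increments of headroom before signed overflow could ever be reached. A standalone model of that saturation check:

#include <assert.h>
#include <limits.h>

/* Model of the buffer_pipe_buf_get() guard: refuse new references
 * long before the signed counter could overflow. */
static int ref_get(int *ref)
{
	if (*ref > INT_MAX / 2)
		return 0;	/* surfaces as a pipe_buf_get() failure */
	(*ref)++;
	return 1;
}

int main(void)
{
	int ref = INT_MAX / 2 + 1;

	assert(!ref_get(&ref));		/* past the cutoff: refused */
	ref = 1;
	assert(ref_get(&ref) && ref == 2);
	return 0;
}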
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -160,8 +160,12 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 		goto retry;
 	}
 
-	if (flags & FOLL_GET)
-		get_page(page);
+	if (flags & FOLL_GET) {
+		if (unlikely(!try_get_page(page))) {
+			page = ERR_PTR(-ENOMEM);
+			goto out;
+		}
+	}
 	if (flags & FOLL_TOUCH) {
 		if ((flags & FOLL_WRITE) &&
 		    !pte_dirty(pte) && !PageDirty(page))
@@ -298,7 +302,10 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 			if (pmd_trans_unstable(pmd))
 				ret = -EBUSY;
 		} else {
-			get_page(page);
+			if (unlikely(!try_get_page(page))) {
+				spin_unlock(ptl);
+				return ERR_PTR(-ENOMEM);
+			}
 			spin_unlock(ptl);
 			lock_page(page);
 			ret = split_huge_page(page);
@@ -500,7 +507,10 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
 		if (is_device_public_page(*page))
 			goto unmap;
 	}
-	get_page(*page);
+	if (unlikely(!try_get_page(*page))) {
+		ret = -ENOMEM;
+		goto unmap;
+	}
 out:
 	ret = 0;
 unmap:
@@ -1545,6 +1555,20 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
 	}
 }
 
+/*
+ * Return the compound head page with ref appropriately incremented,
+ * or NULL if that failed.
+ */
+static inline struct page *try_get_compound_head(struct page *page, int refs)
+{
+	struct page *head = compound_head(page);
+	if (WARN_ON_ONCE(page_ref_count(head) < 0))
+		return NULL;
+	if (unlikely(!page_cache_add_speculative(head, refs)))
+		return NULL;
+	return head;
+}
+
 #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
 static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 			 int write, struct page **pages, int *nr)
@@ -1579,9 +1603,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 		page = pte_page(pte);
 
-		head = compound_head(page);
-		if (!page_cache_get_speculative(head))
+		head = try_get_compound_head(page, 1);
+		if (!head)
 			goto pte_unmap;
 
 		if (unlikely(pte_val(pte) != pte_val(*ptep))) {
@@ -1720,8 +1744,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
 		refs++;
 	} while (addr += PAGE_SIZE, addr != end);
 
-	head = compound_head(pmd_page(orig));
-	if (!page_cache_add_speculative(head, refs)) {
+	head = try_get_compound_head(pmd_page(orig), refs);
+	if (!head) {
 		*nr -= refs;
 		return 0;
 	}
@@ -1758,8 +1782,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
 		refs++;
 	} while (addr += PAGE_SIZE, addr != end);
 
-	head = compound_head(pud_page(orig));
-	if (!page_cache_add_speculative(head, refs)) {
+	head = try_get_compound_head(pud_page(orig), refs);
+	if (!head) {
 		*nr -= refs;
 		return 0;
 	}
@@ -1795,8 +1819,8 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
 		refs++;
 	} while (addr += PAGE_SIZE, addr != end);
 
-	head = compound_head(pgd_page(orig));
-	if (!page_cache_add_speculative(head, refs)) {
+	head = try_get_compound_head(pgd_page(orig), refs);
+	if (!head) {
 		*nr -= refs;
 		return 0;
 	}
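try_get_compound_head() packages the pattern the lockless GUP fast path needs: every reference lands on the compound head, the negative-count test catches a refcount that has already overflowed, and page_cache_add_speculative() (an atomic add-unless-zero) refuses a page that is concurrently being freed. A toy single-threaded model of that logic (hypothetical flattened struct, nothing like the kernel's struct page):

#include <stdio.h>

/* Hypothetical flattened page for illustration only. */
struct page {
	int refcount;
	struct page *head;	/* NULL if this is itself a head page */
};

static struct page *try_get_compound_head(struct page *page, int refs)
{
	struct page *head = page->head ? page->head : page;

	if (head->refcount < 0)		/* already overflowed: back off */
		return NULL;
	if (head->refcount == 0)	/* freed under us (add-unless-zero) */
		return NULL;
	head->refcount += refs;
	return head;
}

int main(void)
{
	struct page head = { .refcount = 1, .head = NULL };
	struct page tail = { .refcount = 0, .head = &head };

	/* Both references land on the head page. */
	if (try_get_compound_head(&tail, 2))
		printf("head refcount now %d\n", head.refcount);	/* 3 */
	return 0;
}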
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4299,6 +4299,19 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 		pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
 		page = pte_page(huge_ptep_get(pte));
+
+		/*
+		 * Instead of doing 'try_get_page()' below in the same_page
+		 * loop, just check the count once here.
+		 */
+		if (unlikely(page_count(page) <= 0)) {
+			if (pages) {
+				spin_unlock(ptl);
+				remainder = 0;
+				err = -ENOMEM;
+				break;
+			}
+		}
 same_page:
 		if (pages) {
 			pages[i] = mem_map_offset(page, pfn_offset);