Commit 33b3b041 authored by David Howells, committed by Steve French

splice: Add a func to do a splice from an O_DIRECT file without ITER_PIPE

Implement a function, direct_splice_read(), that does a splice read from an
O_DIRECT file using an ITER_BVEC iterator rather than an ITER_PIPE iterator,
as the former won't free its buffers when reverted.  The function bulk
allocates all the buffers it thinks it is going to use in advance, does the
read synchronously and only then trims the buffer down.  The pages that did
get used are then pushed into the pipe.
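
For illustration only, and not part of this patch: a filesystem's
->splice_read could then dispatch O_DIRECT opens to the new helper and leave
buffered opens on the existing generic path.  "myfs_splice_read" is a
hypothetical name in this sketch:

	static ssize_t myfs_splice_read(struct file *in, loff_t *ppos,
					struct pipe_inode_info *pipe,
					size_t len, unsigned int flags)
	{
		/* O_DIRECT reads bypass the pagecache, so use the
		 * bvec-based helper; otherwise fall back to the generic
		 * pipe-based path.
		 */
		if (in->f_flags & O_DIRECT)
			return direct_splice_read(in, ppos, pipe, len, flags);
		return generic_file_splice_read(in, ppos, pipe, len, flags);
	}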

This fixes a problem with the upcoming iov_iter_extract_pages() function,
whereby pages extracted from a non-user-backed iterator such as ITER_PIPE
aren't pinned.  __iomap_dio_rw(), however, calls iov_iter_revert() to
shorten the iterator to just the bufferage it is going to use - which has
the side-effect of freeing the excess pipe buffers, even though they're
attached to a bio and may get written to by DMA (thanks to Hillf Danton for
spotting this[1]).

This then causes memory corruption that is particularly noticeable when the
syzbot test[2] is run.  The test boils down to:

	out = creat(argv[1], 0666);
	ftruncate(out, 0x800);
	lseek(out, 0x200, SEEK_SET);
	in = open(argv[1], O_RDONLY | O_DIRECT | O_NOFOLLOW);
	sendfile(out, in, NULL, 0x1dd00);

run repeatedly in parallel.  What I think is happening is that ftruncate()
occasionally shortens the DIO read that's about to be made by sendfile's
splice core by reducing i_size.
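
For reference, a self-contained version of that reproducer might look like
the following; the fork loop that runs it "repeatedly in parallel" is
scaffolding added here for illustration and is not part of the original
syzbot program:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <sys/sendfile.h>
	#include <sys/wait.h>

	static void run_once(const char *path)
	{
		int out = creat(path, 0666);
		int in;

		ftruncate(out, 0x800);
		lseek(out, 0x200, SEEK_SET);
		in = open(path, O_RDONLY | O_DIRECT | O_NOFOLLOW);
		sendfile(out, in, NULL, 0x1dd00);
		close(in);
		close(out);
	}

	int main(int argc, char *argv[])
	{
		int i;

		if (argc < 2)
			exit(1);
		for (i = 0; i < 4; i++)		/* a few parallel workers */
			if (fork() == 0)
				for (;;)
					run_once(argv[1]);
		wait(NULL);
		return 0;
	}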

This should be more efficient for DIO reads by virtue of doing a bulk page
allocation, but slightly less efficient in that it ignores any partial page
already in the pipe.

Reported-by: syzbot+a440341a59e3b7142895@syzkaller.appspotmail.com
Signed-off-by: David Howells <dhowells@redhat.com>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
cc: Christoph Hellwig <hch@lst.de>
cc: Al Viro <viro@zeniv.linux.org.uk>
cc: David Hildenbrand <david@redhat.com>
cc: John Hubbard <jhubbard@nvidia.com>
cc: linux-mm@kvack.org
cc: linux-block@vger.kernel.org
cc: linux-fsdevel@vger.kernel.org
Link: https://lore.kernel.org/r/20230207094731.1390-1-hdanton@sina.com/ [1]
Link: https://lore.kernel.org/r/000000000000b0b3c005f3a09383@google.com/ [2]
Signed-off-by: Steve French <stfrench@microsoft.com>
parent 07073eb0
fs/splice.c
@@ -282,6 +282,98 @@ void splice_shrink_spd(struct splice_pipe_desc *spd)
	kfree(spd->partial);
}

/*
 * Splice data from an O_DIRECT file into pages and then add them to the output
 * pipe.
 */
ssize_t direct_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe,
			   size_t len, unsigned int flags)
{
	struct iov_iter to;
	struct bio_vec *bv;
	struct kiocb kiocb;
	struct page **pages;
	ssize_t ret;
	size_t used, npages, chunk, remain, reclaim;
	int i;

	/* Work out how much data we can actually add into the pipe */
	used = pipe_occupancy(pipe->head, pipe->tail);
	npages = max_t(ssize_t, pipe->max_usage - used, 0);
	len = min_t(size_t, len, npages * PAGE_SIZE);
	npages = DIV_ROUND_UP(len, PAGE_SIZE);

	bv = kzalloc(array_size(npages, sizeof(bv[0])) +
		     array_size(npages, sizeof(struct page *)), GFP_KERNEL);
	if (!bv)
		return -ENOMEM;

	pages = (void *)(bv + npages);
	npages = alloc_pages_bulk_array(GFP_USER, npages, pages);
	if (!npages) {
		kfree(bv);
		return -ENOMEM;
	}

	remain = len = min_t(size_t, len, npages * PAGE_SIZE);

	for (i = 0; i < npages; i++) {
		chunk = min_t(size_t, PAGE_SIZE, remain);
		bv[i].bv_page = pages[i];
		bv[i].bv_offset = 0;
		bv[i].bv_len = chunk;
		remain -= chunk;
	}

	/* Do the I/O */
	iov_iter_bvec(&to, ITER_DEST, bv, npages, len);
	init_sync_kiocb(&kiocb, in);
	kiocb.ki_pos = *ppos;
	ret = call_read_iter(in, &kiocb, &to);

	reclaim = npages * PAGE_SIZE;
	remain = 0;
	if (ret > 0) {
		reclaim -= ret;
		remain = ret;
		*ppos = kiocb.ki_pos;
		file_accessed(in);
	} else if (ret < 0) {
		/*
		 * callers of ->splice_read() expect -EAGAIN on
		 * "can't put anything in there", rather than -EFAULT.
		 */
		if (ret == -EFAULT)
			ret = -EAGAIN;
	}

	/* Free any pages that didn't get touched at all. */
	reclaim /= PAGE_SIZE;
	if (reclaim) {
		npages -= reclaim;
		release_pages(pages + npages, reclaim);
	}

	/* Push the remaining pages into the pipe. */
	for (i = 0; i < npages; i++) {
		struct pipe_buffer *buf = pipe_head_buf(pipe);

		chunk = min_t(size_t, remain, PAGE_SIZE);
		*buf = (struct pipe_buffer) {
			.ops = &default_pipe_buf_ops,
			.page = bv[i].bv_page,
			.offset = 0,
			.len = chunk,
		};
		pipe->head++;
		remain -= chunk;
	}

	kfree(bv);
	return ret;
}

/**
 * generic_file_splice_read - splice data from file to a pipe
 * @in: file to splice from
...

include/linux/fs.h
@@ -3166,6 +3166,9 @@ ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
			    struct pipe_inode_info *pipe,
			    size_t len, unsigned int flags);
ssize_t direct_splice_read(struct file *in, loff_t *ppos,
			   struct pipe_inode_info *pipe,
			   size_t len, unsigned int flags);
extern ssize_t generic_file_splice_read(struct file *, loff_t *,
		struct pipe_inode_info *, size_t, unsigned int);
extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
...