/*
  FUSE: Filesystem in Userspace
  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>

  This program can be distributed under the terms of the GNU GPL.
  See the file COPYING.
*/

#include "fuse_i.h"

#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/module.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/falloc.h>
#include <linux/uio.h>

struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
			       struct fuse_page_desc **desc)
{
	struct page **pages;

	pages = kzalloc(npages * (sizeof(struct page *) +
				  sizeof(struct fuse_page_desc)), flags);
	*desc = (void *) (pages + npages);

	return pages;
}

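/*
 * Illustrative note (added, not in the original source): the single
 * kzalloc() above returns one buffer laid out as npages page pointers
 * followed by npages fuse_page_desc structs:
 *
 *	[ page* ... page* | fuse_page_desc ... fuse_page_desc ]
 *	  ^pages            ^*desc
 *
 * so one kfree() of the returned pointer frees both arrays, which is
 * what callers such as fuse_perform_write() below rely on.
 */
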
static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
			  int opcode, struct fuse_open_out *outargp)
{
	struct fuse_open_in inarg;
	FUSE_ARGS(args);

	memset(&inarg, 0, sizeof(inarg));
	inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
	if (!fc->atomic_o_trunc)
		inarg.flags &= ~O_TRUNC;
	args.opcode = opcode;
	args.nodeid = nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(*outargp);
	args.out_args[0].value = outargp;

	return fuse_simple_request(fc, &args);
}

struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
{
	struct fuse_file *ff;

	ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL);
	if (unlikely(!ff))
		return NULL;

	ff->fc = fc;
	ff->reserved_req = fuse_request_alloc(0);
	if (unlikely(!ff->reserved_req)) {
		kfree(ff);
		return NULL;
	}

	INIT_LIST_HEAD(&ff->write_entry);
	mutex_init(&ff->readdir.lock);
	refcount_set(&ff->count, 1);
	RB_CLEAR_NODE(&ff->polled_node);
	init_waitqueue_head(&ff->poll_wait);

	ff->kh = atomic64_inc_return(&fc->khctr);

	return ff;
}

void fuse_file_free(struct fuse_file *ff)
{
	fuse_request_free(ff->reserved_req);
	mutex_destroy(&ff->readdir.lock);
	kfree(ff);
}

static struct fuse_file *fuse_file_get(struct fuse_file *ff)
{
	refcount_inc(&ff->count);
	return ff;
}

static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
{
	iput(req->misc.release.inode);
}

static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
{
	if (refcount_dec_and_test(&ff->count)) {
		struct fuse_req *req = ff->reserved_req;

		if (isdir ? ff->fc->no_opendir : ff->fc->no_open) {
			/*
			 * Drop the release request when client does not
			 * implement 'open'
			 */
			__clear_bit(FR_BACKGROUND, &req->flags);
			iput(req->misc.release.inode);
			fuse_put_request(ff->fc, req);
		} else if (sync) {
			__set_bit(FR_FORCE, &req->flags);
			__clear_bit(FR_BACKGROUND, &req->flags);
			fuse_request_send(ff->fc, req);
			iput(req->misc.release.inode);
			fuse_put_request(ff->fc, req);
		} else {
			req->end = fuse_release_end;
			__set_bit(FR_BACKGROUND, &req->flags);
			fuse_request_send_background(ff->fc, req);
		}
		kfree(ff);
	}
}

int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
		 bool isdir)
{
	struct fuse_file *ff;
	int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;

	ff = fuse_file_alloc(fc);
	if (!ff)
		return -ENOMEM;

	ff->fh = 0;
	/* Default for no-open */
	ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
	if (isdir ? !fc->no_opendir : !fc->no_open) {
		struct fuse_open_out outarg;
		int err;

		err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
		if (!err) {
			ff->fh = outarg.fh;
			ff->open_flags = outarg.open_flags;
		} else if (err != -ENOSYS) {
			fuse_file_free(ff);
			return err;
		} else {
			if (isdir)
				fc->no_opendir = 1;
			else
				fc->no_open = 1;
		}
	}

	if (isdir)
		ff->open_flags &= ~FOPEN_DIRECT_IO;

	ff->nodeid = nodeid;
	file->private_data = ff;

	return 0;
}
EXPORT_SYMBOL_GPL(fuse_do_open);

static void fuse_link_write_file(struct file *file)
{
	struct inode *inode = file_inode(file);
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_file *ff = file->private_data;
	/*
	 * file may be written through mmap, so chain it onto the
	 * inode's write_files list
	 */
	spin_lock(&fi->lock);
	if (list_empty(&ff->write_entry))
		list_add(&ff->write_entry, &fi->write_files);
	spin_unlock(&fi->lock);
}

void fuse_finish_open(struct inode *inode, struct file *file)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = get_fuse_conn(inode);

	if (!(ff->open_flags & FOPEN_KEEP_CACHE))
		invalidate_inode_pages2(inode->i_mapping);
	if (ff->open_flags & FOPEN_STREAM)
		stream_open(inode, file);
	else if (ff->open_flags & FOPEN_NONSEEKABLE)
		nonseekable_open(inode, file);
	if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
		struct fuse_inode *fi = get_fuse_inode(inode);

		spin_lock(&fi->lock);
		fi->attr_version = atomic64_inc_return(&fc->attr_version);
		i_size_write(inode, 0);
		spin_unlock(&fi->lock);
		fuse_invalidate_attr(inode);
		if (fc->writeback_cache)
			file_update_time(file);
	}
	if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
		fuse_link_write_file(file);
}

int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	int err;
	bool lock_inode = (file->f_flags & O_TRUNC) &&
			  fc->atomic_o_trunc &&
			  fc->writeback_cache;

	err = generic_file_open(inode, file);
	if (err)
		return err;

	if (lock_inode)
		inode_lock(inode);

	err = fuse_do_open(fc, get_node_id(inode), file, isdir);

	if (!err)
		fuse_finish_open(inode, file);

	if (lock_inode)
		inode_unlock(inode);

	return err;
}

static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
				 int flags, int opcode)
{
	struct fuse_conn *fc = ff->fc;
	struct fuse_req *req = ff->reserved_req;
	struct fuse_release_in *inarg = &req->misc.release.in;

	/* Inode is NULL on error path of fuse_create_open() */
	if (likely(fi)) {
		spin_lock(&fi->lock);
		list_del(&ff->write_entry);
		spin_unlock(&fi->lock);
	}
	spin_lock(&fc->lock);
	if (!RB_EMPTY_NODE(&ff->polled_node))
		rb_erase(&ff->polled_node, &fc->polled_files);
	spin_unlock(&fc->lock);

	wake_up_interruptible_all(&ff->poll_wait);

	inarg->fh = ff->fh;
	inarg->flags = flags;
	req->in.h.opcode = opcode;
	req->in.h.nodeid = ff->nodeid;
	req->in.numargs = 1;
	req->in.args[0].size = sizeof(struct fuse_release_in);
	req->in.args[0].value = inarg;
}

void fuse_release_common(struct file *file, bool isdir)
{
	struct fuse_inode *fi = get_fuse_inode(file_inode(file));
	struct fuse_file *ff = file->private_data;
	struct fuse_req *req = ff->reserved_req;
	int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;

	fuse_prepare_release(fi, ff, file->f_flags, opcode);

	if (ff->flock) {
		struct fuse_release_in *inarg = &req->misc.release.in;
		inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
		inarg->lock_owner = fuse_lock_owner_id(ff->fc,
						       (fl_owner_t) file);
	}
	/* Hold inode until release is finished */
	req->misc.release.inode = igrab(file_inode(file));

	/*
	 * Normally this will send the RELEASE request, however if
	 * some asynchronous READ or WRITE requests are outstanding,
	 * the sending will be delayed.
	 *
	 * Make the release synchronous if this is a fuseblk mount,
	 * synchronous RELEASE is allowed (and desirable) in this case
	 * because the server can be trusted not to screw up.
	 */
	fuse_file_put(ff, ff->fc->destroy, isdir);
}

static int fuse_open(struct inode *inode, struct file *file)
{
	return fuse_open_common(inode, file, false);
}

static int fuse_release(struct inode *inode, struct file *file)
{
	struct fuse_conn *fc = get_fuse_conn(inode);

	/* see fuse_vma_close() for !writeback_cache case */
	if (fc->writeback_cache)
		write_inode_now(inode, 1);

	fuse_release_common(file, false);

	/* return value is ignored by VFS */
	return 0;
}

void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags)
{
	WARN_ON(refcount_read(&ff->count) > 1);
	fuse_prepare_release(fi, ff, flags, FUSE_RELEASE);
	/*
	 * iput(NULL) is a no-op and since the refcount is 1 and everything's
	 * synchronous, we are fine with not doing igrab() here.
	 */
	fuse_file_put(ff, true, false);
}
EXPORT_SYMBOL_GPL(fuse_sync_release);

/*
 * Scramble the ID space with XTEA, so that the value of the files_struct
 * pointer is not exposed to userspace.
 */
u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
{
	u32 *k = fc->scramble_key;
	u64 v = (unsigned long) id;
	u32 v0 = v;
	u32 v1 = v >> 32;
	u32 sum = 0;
	int i;

	for (i = 0; i < 32; i++) {
		v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
		sum += 0x9E3779B9;
		v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
	}

	return (u64) v0 + ((u64) v1 << 32);
}

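/*
 * Explanatory note (added): the loop above is 32 rounds of the XTEA
 * block cipher applied to the 64-bit pointer value, keyed by the
 * 128-bit fc->scramble_key.  A typical caller, as in fuse_flush()
 * below:
 *
 *	inarg.lock_owner = fuse_lock_owner_id(fc, id);
 */
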
static struct fuse_req *fuse_find_writeback(struct fuse_inode *fi,
					    pgoff_t idx_from, pgoff_t idx_to)
{
	struct fuse_req *req;

	list_for_each_entry(req, &fi->writepages, writepages_entry) {
		pgoff_t curr_index;

		WARN_ON(get_fuse_inode(req->inode) != fi);
		curr_index = req->misc.write.in.offset >> PAGE_SHIFT;
		if (idx_from < curr_index + req->num_pages &&
		    curr_index <= idx_to) {
			return req;
		}
	}
	return NULL;
}

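/*
 * Note (added): the condition above is a standard interval-overlap test
 * between the queried range [idx_from, idx_to] and the pages covered by
 * a request, [curr_index, curr_index + req->num_pages).
 */
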
/*
 * Check if any page in a range is under writeback
 *
 * This is currently done by walking the list of writepage requests
 * for the inode, which can be pretty inefficient.
 */
static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
				   pgoff_t idx_to)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	bool found;

	spin_lock(&fi->lock);
	found = fuse_find_writeback(fi, idx_from, idx_to);
	spin_unlock(&fi->lock);

	return found;
}

static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
{
	return fuse_range_is_writeback(inode, index, index);
}

/*
 * Wait for page writeback to be completed.
 *
 * Since fuse doesn't rely on the VM writeback tracking, this has to
 * use some other means.
 */
static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
{
	struct fuse_inode *fi = get_fuse_inode(inode);

	wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
}

/*
 * Wait for all pending writepages on the inode to finish.
 *
 * This is currently done by blocking further writes with FUSE_NOWRITE
 * and waiting for all sent writes to complete.
 *
 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
 * could conflict with truncation.
 */
static void fuse_sync_writes(struct inode *inode)
{
	fuse_set_nowrite(inode);
	fuse_release_nowrite(inode);
}

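/*
 * Typical caller pattern (added note; see fuse_flush() below) -- the
 * inode lock is held around the call, as required:
 *
 *	inode_lock(inode);
 *	fuse_sync_writes(inode);
 *	inode_unlock(inode);
 */
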
static int fuse_flush(struct file *file, fl_owner_t id)
{
	struct inode *inode = file_inode(file);
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_file *ff = file->private_data;
	struct fuse_flush_in inarg;
	FUSE_ARGS(args);
	int err;

	if (is_bad_inode(inode))
		return -EIO;

	if (fc->no_flush)
		return 0;

	err = write_inode_now(inode, 1);
	if (err)
		return err;

	inode_lock(inode);
	fuse_sync_writes(inode);
	inode_unlock(inode);

	err = filemap_check_errors(file->f_mapping);
	if (err)
		return err;

	memset(&inarg, 0, sizeof(inarg));
	inarg.fh = ff->fh;
	inarg.lock_owner = fuse_lock_owner_id(fc, id);
	args.opcode = FUSE_FLUSH;
	args.nodeid = get_node_id(inode);
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.force = true;

	err = fuse_simple_request(fc, &args);
	if (err == -ENOSYS) {
		fc->no_flush = 1;
		err = 0;
	}
	return err;
}

int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
		      int datasync, int opcode)
{
	struct inode *inode = file->f_mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_file *ff = file->private_data;
	FUSE_ARGS(args);
	struct fuse_fsync_in inarg;

	memset(&inarg, 0, sizeof(inarg));
	inarg.fh = ff->fh;
	inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0;
	args.opcode = opcode;
	args.nodeid = get_node_id(inode);
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	return fuse_simple_request(fc, &args);
}

static int fuse_fsync(struct file *file, loff_t start, loff_t end,
		      int datasync)
{
	struct inode *inode = file->f_mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	int err;

	if (is_bad_inode(inode))
		return -EIO;

	inode_lock(inode);

	/*
	 * Start writeback against all dirty pages of the inode, then
	 * wait for all outstanding writes, before sending the FSYNC
	 * request.
	 */
	err = file_write_and_wait_range(file, start, end);
	if (err)
		goto out;

	fuse_sync_writes(inode);

	/*
	 * Due to the implementation of fuse writeback,
	 * file_write_and_wait_range() does not catch errors.
	 * We have to do this directly after fuse_sync_writes()
	 */
	err = file_check_and_advance_wb_err(file);
	if (err)
		goto out;

	err = sync_inode_metadata(inode, 1);
	if (err)
		goto out;

	if (fc->no_fsync)
		goto out;

	err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC);
	if (err == -ENOSYS) {
		fc->no_fsync = 1;
		err = 0;
	}
out:
	inode_unlock(inode);

	return err;
}

void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
		    size_t count, int opcode)
{
	struct fuse_read_in *inarg = &req->misc.read.in;
	struct fuse_file *ff = file->private_data;

	inarg->fh = ff->fh;
	inarg->offset = pos;
	inarg->size = count;
	inarg->flags = file->f_flags;
	req->in.h.opcode = opcode;
	req->in.h.nodeid = ff->nodeid;
	req->in.numargs = 1;
	req->in.args[0].size = sizeof(struct fuse_read_in);
	req->in.args[0].value = inarg;
	req->out.argvar = 1;
	req->out.numargs = 1;
	req->out.args[0].size = count;
}

struct fuse_io_args {
	union {
		struct {
			struct fuse_read_in in;
		} read;
		struct {
			struct fuse_write_in in;
			struct fuse_write_out out;
		} write;
	};
	struct fuse_args_pages ap;
};

void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
			 size_t count, int opcode)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_args *args = &ia->ap.args;

	ia->read.in.fh = ff->fh;
	ia->read.in.offset = pos;
	ia->read.in.size = count;
	ia->read.in.flags = file->f_flags;
	args->opcode = opcode;
	args->nodeid = ff->nodeid;
	args->in_numargs = 1;
	args->in_args[0].size = sizeof(ia->read.in);
	args->in_args[0].value = &ia->read.in;
	args->out_argvar = true;
	args->out_numargs = 1;
	args->out_args[0].size = count;
}

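/*
 * Note (added): fuse_read_args_fill() only packs the request; pairing
 * it with page setup and fuse_simple_request() is the caller's job, as
 * fuse_do_readpage() below shows.
 */
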
static void fuse_release_user_pages(struct fuse_req *req, bool should_dirty)
{
	unsigned i;

	for (i = 0; i < req->num_pages; i++) {
		struct page *page = req->pages[i];

		if (should_dirty)
			set_page_dirty_lock(page);
		put_page(page);
	}
}

static void fuse_io_release(struct kref *kref)
{
	kfree(container_of(kref, struct fuse_io_priv, refcnt));
}

static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
{
	if (io->err)
		return io->err;

	if (io->bytes >= 0 && io->write)
		return -EIO;

	return io->bytes < 0 ? io->size : io->bytes;
}

/**
 * In case of short read, the caller sets 'pos' to the position of
 * actual end of fuse request in IO request. Otherwise, if bytes_requested
 * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
 *
 * An example:
 * User requested DIO read of 64K. It was split into two 32K fuse requests,
 * both submitted asynchronously. The first of them was ACKed by userspace as
 * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
 * second request was ACKed as short, e.g. only 1K was read, resulting in
 * pos == 33K.
 *
 * Thus, when all fuse requests are completed, the minimal non-negative 'pos'
 * will be equal to the length of the longest contiguous fragment of
 * transferred data starting from the beginning of IO request.
 */
static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
{
	int left;

	spin_lock(&io->lock);
	if (err)
		io->err = io->err ? : err;
	else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
		io->bytes = pos;

	left = --io->reqs;
	if (!left && io->blocking)
		complete(io->done);
	spin_unlock(&io->lock);

	if (!left && !io->blocking) {
		ssize_t res = fuse_get_res_by_io(io);

		if (res >= 0) {
			struct inode *inode = file_inode(io->iocb->ki_filp);
			struct fuse_conn *fc = get_fuse_conn(inode);
			struct fuse_inode *fi = get_fuse_inode(inode);

			spin_lock(&fi->lock);
			fi->attr_version = atomic64_inc_return(&fc->attr_version);
			spin_unlock(&fi->lock);
		}

		io->iocb->ki_complete(io->iocb, res, 0);
	}

	kref_put(&io->refcnt, fuse_io_release);
}

static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req)
{
	struct fuse_io_priv *io = req->io;
	ssize_t pos = -1;

	fuse_release_user_pages(req, io->should_dirty);

	if (io->write) {
		if (req->misc.write.in.size != req->misc.write.out.size)
			pos = req->misc.write.in.offset - io->offset +
				req->misc.write.out.size;
	} else {
		if (req->misc.read.in.size != req->out.args[0].size)
			pos = req->misc.read.in.offset - io->offset +
				req->out.args[0].size;
	}

	fuse_aio_complete(io, req->out.h.error, pos);
}

static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req,
		size_t num_bytes, struct fuse_io_priv *io)
{
	spin_lock(&io->lock);
	kref_get(&io->refcnt);
	io->size += num_bytes;
	io->reqs++;
	spin_unlock(&io->lock);

	req->io = io;
	req->end = fuse_aio_complete_req;

	__fuse_get_request(req);
	fuse_request_send_background(fc, req);

	return num_bytes;
}

static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io,
			     loff_t pos, size_t count, fl_owner_t owner)
{
	struct file *file = io->iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;

	fuse_read_fill(req, file, pos, count, FUSE_READ);
	if (owner != NULL) {
		struct fuse_read_in *inarg = &req->misc.read.in;

		inarg->read_flags |= FUSE_READ_LOCKOWNER;
		inarg->lock_owner = fuse_lock_owner_id(fc, owner);
	}

	if (io->async)
		return fuse_async_req_send(fc, req, count, io);

	fuse_request_send(fc, req);
	return req->out.args[0].size;
}

static void fuse_read_update_size(struct inode *inode, loff_t size,
				  u64 attr_ver)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);

	spin_lock(&fi->lock);
	if (attr_ver == fi->attr_version && size < inode->i_size &&
	    !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
		fi->attr_version = atomic64_inc_return(&fc->attr_version);
		i_size_write(inode, size);
	}
	spin_unlock(&fi->lock);
}

static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
			    struct page **pages, unsigned int num_pages)
{
	struct fuse_conn *fc = get_fuse_conn(inode);

	if (fc->writeback_cache) {
		/*
		 * A hole in a file. Some data after the hole are in page cache,
		 * but have not reached the client fs yet. So, the hole is not
		 * present there.
		 */
		int i;
		int start_idx = num_read >> PAGE_SHIFT;
		size_t off = num_read & (PAGE_SIZE - 1);

		for (i = start_idx; i < num_pages; i++) {
			zero_user_segment(pages[i], off, PAGE_SIZE);
			off = 0;
		}
	} else {
		loff_t pos = page_offset(pages[0]) + num_read;
		fuse_read_update_size(inode, pos, attr_ver);
	}
}

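/*
 * Worked example (added note): a 3-page read starting at page 0 that
 * returns num_read == PAGE_SIZE + 100 leaves pages[0] intact, zeroes
 * pages[1] from offset 100 and pages[2] entirely in the
 * writeback_cache case; otherwise only the cached i_size is shrunk.
 */
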
static int fuse_do_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	loff_t pos = page_offset(page);
	struct fuse_page_desc desc = { .length = PAGE_SIZE };
	struct fuse_io_args ia = {
		.ap.args.page_zeroing = true,
		.ap.args.out_pages = true,
		.ap.num_pages = 1,
		.ap.pages = &page,
		.ap.descs = &desc,
	};
	ssize_t res;
	u64 attr_ver;

	/*
	 * Page writeback can extend beyond the lifetime of the
	 * page-cache page, so make sure we read a properly synced
	 * page.
	 */
	fuse_wait_on_page_writeback(inode, page->index);

	attr_ver = fuse_get_attr_version(fc);

	fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
	res = fuse_simple_request(fc, &ia.ap.args);
	if (res < 0)
		return res;
	/*
	 * Short read means EOF.  If file size is larger, truncate it
	 */
	if (res < desc.length)
		fuse_short_read(inode, attr_ver, res, ia.ap.pages,
				ia.ap.num_pages);

	SetPageUptodate(page);

	return 0;
}

static int fuse_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	int err;

	err = -EIO;
	if (is_bad_inode(inode))
		goto out;

	err = fuse_do_readpage(file, page);
	fuse_invalidate_atime(inode);
 out:
	unlock_page(page);
	return err;
}

static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
{
	int i;
	size_t count = req->misc.read.in.size;
	size_t num_read = req->out.args[0].size;
	struct address_space *mapping = NULL;

	for (i = 0; mapping == NULL && i < req->num_pages; i++)
		mapping = req->pages[i]->mapping;

	if (mapping) {
		struct inode *inode = mapping->host;

		/*
		 * Short read means EOF. If file size is larger, truncate it
		 */
		if (!req->out.h.error && num_read < count)
			fuse_short_read(inode, req->misc.read.attr_ver,
					num_read, req->pages, req->num_pages);

		fuse_invalidate_atime(inode);
	}

	for (i = 0; i < req->num_pages; i++) {
		struct page *page = req->pages[i];
		if (!req->out.h.error)
			SetPageUptodate(page);
		else
			SetPageError(page);
		unlock_page(page);
		put_page(page);
	}
	if (req->ff)
		fuse_file_put(req->ff, false, false);
}

static void fuse_send_readpages(struct fuse_req *req, struct file *file)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;
	loff_t pos = page_offset(req->pages[0]);
	size_t count = req->num_pages << PAGE_SHIFT;

	req->out.argpages = 1;
	req->out.page_zeroing = 1;
	req->out.page_replace = 1;
	fuse_read_fill(req, file, pos, count, FUSE_READ);
	req->misc.read.attr_ver = fuse_get_attr_version(fc);
	if (fc->async_read) {
		req->ff = fuse_file_get(ff);
		req->end = fuse_readpages_end;
		fuse_request_send_background(fc, req);
	} else {
		fuse_request_send(fc, req);
		fuse_readpages_end(fc, req);
		fuse_put_request(fc, req);
	}
}

struct fuse_fill_data {
	struct fuse_req *req;
	struct file *file;
	struct inode *inode;
	unsigned nr_pages;
};

static int fuse_readpages_fill(void *_data, struct page *page)
{
	struct fuse_fill_data *data = _data;
	struct fuse_req *req = data->req;
	struct inode *inode = data->inode;
	struct fuse_conn *fc = get_fuse_conn(inode);

	fuse_wait_on_page_writeback(inode, page->index);

	if (req->num_pages &&
	    (req->num_pages == fc->max_pages ||
	     (req->num_pages + 1) * PAGE_SIZE > fc->max_read ||
	     req->pages[req->num_pages - 1]->index + 1 != page->index)) {
		unsigned int nr_alloc = min_t(unsigned int, data->nr_pages,
					      fc->max_pages);
		fuse_send_readpages(req, data->file);
		if (fc->async_read)
			req = fuse_get_req_for_background(fc, nr_alloc);
		else
			req = fuse_get_req(fc, nr_alloc);

		data->req = req;
		if (IS_ERR(req)) {
			unlock_page(page);
			return PTR_ERR(req);
		}
	}

	if (WARN_ON(req->num_pages >= req->max_pages)) {
		unlock_page(page);
		fuse_put_request(fc, req);
		return -EIO;
	}

	get_page(page);
	req->pages[req->num_pages] = page;
	req->page_descs[req->num_pages].length = PAGE_SIZE;
	req->num_pages++;
	data->nr_pages--;
	return 0;
}

static int fuse_readpages(struct file *file, struct address_space *mapping,
			  struct list_head *pages, unsigned nr_pages)
{
	struct inode *inode = mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_fill_data data;
	int err;
	unsigned int nr_alloc = min_t(unsigned int, nr_pages, fc->max_pages);

	err = -EIO;
	if (is_bad_inode(inode))
		goto out;

	data.file = file;
	data.inode = inode;
	if (fc->async_read)
		data.req = fuse_get_req_for_background(fc, nr_alloc);
	else
		data.req = fuse_get_req(fc, nr_alloc);
	data.nr_pages = nr_pages;
	err = PTR_ERR(data.req);
	if (IS_ERR(data.req))
		goto out;

	err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
	if (!err) {
		if (data.req->num_pages)
			fuse_send_readpages(data.req, file);
		else
			fuse_put_request(fc, data.req);
	}
out:
	return err;
}

static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);

	/*
	 * In auto invalidate mode, always update attributes on read.
	 * Otherwise, only update if we attempt to read past EOF (to ensure
	 * i_size is up to date).
	 */
	if (fc->auto_inval_data ||
	    (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
		int err;
		err = fuse_update_attributes(inode, iocb->ki_filp);
		if (err)
			return err;
	}

	return generic_file_read_iter(iocb, to);
}

static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
			    loff_t pos, size_t count)
{
	struct fuse_write_in *inarg = &req->misc.write.in;
	struct fuse_write_out *outarg = &req->misc.write.out;

	inarg->fh = ff->fh;
	inarg->offset = pos;
	inarg->size = count;
	req->in.h.opcode = FUSE_WRITE;
	req->in.h.nodeid = ff->nodeid;
	req->in.numargs = 2;
	if (ff->fc->minor < 9)
		req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
	else
		req->in.args[0].size = sizeof(struct fuse_write_in);
	req->in.args[0].value = inarg;
	req->in.args[1].size = count;
	req->out.numargs = 1;
	req->out.args[0].size = sizeof(struct fuse_write_out);
	req->out.args[0].value = outarg;
}

static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
				 loff_t pos, size_t count)
{
	struct fuse_args *args = &ia->ap.args;

	ia->write.in.fh = ff->fh;
	ia->write.in.offset = pos;
	ia->write.in.size = count;
	args->opcode = FUSE_WRITE;
	args->nodeid = ff->nodeid;
	args->in_numargs = 2;
	if (ff->fc->minor < 9)
		args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
	else
		args->in_args[0].size = sizeof(ia->write.in);
	args->in_args[0].value = &ia->write.in;
	args->in_args[1].size = count;
	args->out_numargs = 1;
	args->out_args[0].size = sizeof(ia->write.out);
	args->out_args[0].value = &ia->write.out;
}

static unsigned int fuse_write_flags(struct kiocb *iocb)
{
	unsigned int flags = iocb->ki_filp->f_flags;

	if (iocb->ki_flags & IOCB_DSYNC)
		flags |= O_DSYNC;
	if (iocb->ki_flags & IOCB_SYNC)
		flags |= O_SYNC;

	return flags;
}

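/*
 * Example (added note): a write on an O_DSYNC file, or an iocb carrying
 * IOCB_DSYNC, reaches the server with O_DSYNC set in the WRITE
 * request's flags, so the filesystem can honour sync semantics per
 * write.
 */
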
static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,
			      loff_t pos, size_t count, fl_owner_t owner)
{
	struct kiocb *iocb = io->iocb;
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;
	struct fuse_write_in *inarg = &req->misc.write.in;

	fuse_write_fill(req, ff, pos, count);
	inarg->flags = fuse_write_flags(iocb);
	if (owner != NULL) {
		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
		inarg->lock_owner = fuse_lock_owner_id(fc, owner);
	}

	if (io->async)
		return fuse_async_req_send(fc, req, count, io);

	fuse_request_send(fc, req);
	return req->misc.write.out.size;
}

bool fuse_write_update_size(struct inode *inode, loff_t pos)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	bool ret = false;

	spin_lock(&fi->lock);
	fi->attr_version = atomic64_inc_return(&fc->attr_version);
	if (pos > inode->i_size) {
		i_size_write(inode, pos);
		ret = true;
	}
	spin_unlock(&fi->lock);

	return ret;
}

static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
				     struct kiocb *iocb, struct inode *inode,
				     loff_t pos, size_t count)
{
	struct fuse_args_pages *ap = &ia->ap;
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;
	unsigned int offset, i;
	int err;

	for (i = 0; i < ap->num_pages; i++)
		fuse_wait_on_page_writeback(inode, ap->pages[i]->index);

	fuse_write_args_fill(ia, ff, pos, count);
	ia->write.in.flags = fuse_write_flags(iocb);

	err = fuse_simple_request(fc, &ap->args);

	offset = ap->descs[0].offset;
	count = ia->write.out.size;
	for (i = 0; i < ap->num_pages; i++) {
		struct page *page = ap->pages[i];

		if (!err && !offset && count >= PAGE_SIZE)
			SetPageUptodate(page);

		if (count > PAGE_SIZE - offset)
			count -= PAGE_SIZE - offset;
		else
			count = 0;
		offset = 0;

		unlock_page(page);
		put_page(page);
	}

	return err;
}

static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap,
				     struct address_space *mapping,
				     struct iov_iter *ii, loff_t pos,
				     unsigned int max_pages)
{
	struct fuse_conn *fc = get_fuse_conn(mapping->host);
	unsigned offset = pos & (PAGE_SIZE - 1);
	size_t count = 0;
	int err;

	ap->args.in_pages = true;
	ap->descs[0].offset = offset;

	do {
		size_t tmp;
		struct page *page;
		pgoff_t index = pos >> PAGE_SHIFT;
		size_t bytes = min_t(size_t, PAGE_SIZE - offset,
				     iov_iter_count(ii));

		bytes = min_t(size_t, bytes, fc->max_write - count);

 again:
		err = -EFAULT;
		if (iov_iter_fault_in_readable(ii, bytes))
			break;

		err = -ENOMEM;
		page = grab_cache_page_write_begin(mapping, index, 0);
		if (!page)
			break;

		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
		flush_dcache_page(page);

		iov_iter_advance(ii, tmp);
		if (!tmp) {
			unlock_page(page);
			put_page(page);
			bytes = min(bytes, iov_iter_single_seg_count(ii));
			goto again;
		}

		err = 0;
		ap->pages[ap->num_pages] = page;
		ap->descs[ap->num_pages].length = tmp;
		ap->num_pages++;

		count += tmp;
		pos += tmp;
		offset += tmp;
		if (offset == PAGE_SIZE)
			offset = 0;

		if (!fc->big_writes)
			break;
	} while (iov_iter_count(ii) && count < fc->max_write &&
		 ap->num_pages < max_pages && offset == 0);

	return count > 0 ? count : err;
}

static inline unsigned int fuse_wr_pages(loff_t pos, size_t len,
				     unsigned int max_pages)
{
	return min_t(unsigned int,
		     ((pos + len - 1) >> PAGE_SHIFT) -
		     (pos >> PAGE_SHIFT) + 1,
		     max_pages);
}

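/*
 * Worked example (added note): with 4K pages, pos = 4094 and len = 8
 * touch bytes 4094..4101, i.e. page indexes 0 and 1, so this returns
 * min(2, max_pages).
 */
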
static ssize_t fuse_perform_write(struct kiocb *iocb,
				  struct address_space *mapping,
				  struct iov_iter *ii, loff_t pos)
{
	struct inode *inode = mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	int err = 0;
	ssize_t res = 0;

	if (inode->i_size < pos + iov_iter_count(ii))
		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	do {
		ssize_t count;
		struct fuse_io_args ia = {};
		struct fuse_args_pages *ap = &ia.ap;
		unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii),
						      fc->max_pages);

		ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs);
		if (!ap->pages) {
			err = -ENOMEM;
			break;
		}

		count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages);
		if (count <= 0) {
			err = count;
		} else {
			err = fuse_send_write_pages(&ia, iocb, inode,
						    pos, count);
			if (!err) {
				size_t num_written = ia.write.out.size;

				res += num_written;
				pos += num_written;

				/* break out of the loop on short write */
				if (num_written != count)
					err = -EIO;
			}
		}
		kfree(ap->pages);
	} while (!err && iov_iter_count(ii));

	if (res > 0)
		fuse_write_update_size(inode, pos);

	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
	fuse_invalidate_attr(inode);

	return res > 0 ? res : err;
}

static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	ssize_t written = 0;
	ssize_t written_buffered = 0;
	struct inode *inode = mapping->host;
	ssize_t err;
	loff_t endbyte = 0;

	if (get_fuse_conn(inode)->writeback_cache) {
		/* Update size (EOF optimization) and mode (SUID clearing) */
		err = fuse_update_attributes(mapping->host, file);
		if (err)
			return err;

		return generic_file_write_iter(iocb, from);
	}

	inode_lock(inode);

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);

	err = generic_write_checks(iocb, from);
	if (err <= 0)
		goto out;

	err = file_remove_privs(file);
	if (err)
		goto out;

	err = file_update_time(file);
	if (err)
		goto out;

	if (iocb->ki_flags & IOCB_DIRECT) {
		loff_t pos = iocb->ki_pos;
		written = generic_file_direct_write(iocb, from);
		if (written < 0 || !iov_iter_count(from))
			goto out;

		pos += written;

		written_buffered = fuse_perform_write(iocb, mapping, from, pos);
		if (written_buffered < 0) {
			err = written_buffered;
			goto out;
		}
		endbyte = pos + written_buffered - 1;

		err = filemap_write_and_wait_range(file->f_mapping, pos,
						   endbyte);
		if (err)
			goto out;

		invalidate_mapping_pages(file->f_mapping,
					 pos >> PAGE_SHIFT,
					 endbyte >> PAGE_SHIFT);

		written += written_buffered;
		iocb->ki_pos = pos + written_buffered;
	} else {
		written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos);
		if (written >= 0)
			iocb->ki_pos += written;
	}
out:
	current->backing_dev_info = NULL;
	inode_unlock(inode);
	if (written > 0)
		written = generic_write_sync(iocb, written);

	return written ? written : err;
}

static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs,
					       unsigned int index,
					       unsigned int nr_pages)
{
	int i;

	for (i = index; i < index + nr_pages; i++)
		descs[i].length = PAGE_SIZE - descs[i].offset;
}

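/*
 * Example (added note): if descs[index].offset is 100, that descriptor
 * gets length PAGE_SIZE - 100; later descriptors, whose offset is 0,
 * get a full PAGE_SIZE.  fuse_get_user_pages() below then trims the
 * final descriptor to the actual byte count.
 */
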
static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
{
	return (unsigned long)ii->iov->iov_base + ii->iov_offset;
}

static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
					size_t max_size)
{
	return min(iov_iter_single_seg_count(ii), max_size);
}

static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
			       size_t *nbytesp, int write)
{
	size_t nbytes = 0;  /* # bytes already packed in req */
	ssize_t ret = 0;

	/* Special case for kernel I/O: can copy directly into the buffer */
	if (iov_iter_is_kvec(ii)) {
		unsigned long user_addr = fuse_get_user_addr(ii);
		size_t frag_size = fuse_get_frag_size(ii, *nbytesp);

		if (write)
			req->in.args[1].value = (void *) user_addr;
		else
			req->out.args[0].value = (void *) user_addr;

		iov_iter_advance(ii, frag_size);
		*nbytesp = frag_size;
		return 0;
	}

	while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
		unsigned npages;
		size_t start;
		ret = iov_iter_get_pages(ii, &req->pages[req->num_pages],
					*nbytesp - nbytes,
					req->max_pages - req->num_pages,
					&start);
		if (ret < 0)
			break;

		iov_iter_advance(ii, ret);
		nbytes += ret;

		ret += start;
		npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;

		req->page_descs[req->num_pages].offset = start;
		fuse_page_descs_length_init(req->page_descs, req->num_pages,
					    npages);

		req->num_pages += npages;
		req->page_descs[req->num_pages - 1].length -=
			(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
	}

	if (write)
		req->in.argpages = 1;
	else
		req->out.argpages = 1;

	*nbytesp = nbytes;

	return ret < 0 ? ret : 0;
}

ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
		       loff_t *ppos, int flags)
{
	int write = flags & FUSE_DIO_WRITE;
	int cuse = flags & FUSE_DIO_CUSE;
	struct file *file = io->iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;
	size_t nmax = write ? fc->max_write : fc->max_read;
	loff_t pos = *ppos;
	size_t count = iov_iter_count(iter);
	pgoff_t idx_from = pos >> PAGE_SHIFT;
	pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT;
	ssize_t res = 0;
	struct fuse_req *req;
	int err = 0;

	if (io->async)
		req = fuse_get_req_for_background(fc, iov_iter_npages(iter,
								fc->max_pages));
	else
		req = fuse_get_req(fc, iov_iter_npages(iter, fc->max_pages));
	if (IS_ERR(req))
		return PTR_ERR(req);

	if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
		if (!write)
			inode_lock(inode);
		fuse_sync_writes(inode);
		if (!write)
			inode_unlock(inode);
	}

	io->should_dirty = !write && iter_is_iovec(iter);
	while (count) {
		size_t nres;
		fl_owner_t owner = current->files;
		size_t nbytes = min(count, nmax);
		err = fuse_get_user_pages(req, iter, &nbytes, write);
		if (err && !nbytes)
			break;

		if (write) {
			if (!capable(CAP_FSETID)) {
				struct fuse_write_in *inarg;

				inarg = &req->misc.write.in;
				inarg->write_flags |= FUSE_WRITE_KILL_PRIV;
			}
			nres = fuse_send_write(req, io, pos, nbytes, owner);
		} else {
			nres = fuse_send_read(req, io, pos, nbytes, owner);
		}

		if (!io->async)
			fuse_release_user_pages(req, io->should_dirty);
		if (req->out.h.error) {
			err = req->out.h.error;
			break;
		} else if (nres > nbytes) {
			res = 0;
			err = -EIO;
			break;
		}
		count -= nres;
		res += nres;
		pos += nres;
		if (nres != nbytes)
			break;
		if (count) {
			fuse_put_request(fc, req);
			if (io->async)
				req = fuse_get_req_for_background(fc,
					iov_iter_npages(iter, fc->max_pages));
			else
				req = fuse_get_req(fc, iov_iter_npages(iter,
								fc->max_pages));
			if (IS_ERR(req))
				break;
		}
	}
	if (!IS_ERR(req))
		fuse_put_request(fc, req);
	if (res > 0)
		*ppos = pos;

	return res > 0 ? res : err;
}
EXPORT_SYMBOL_GPL(fuse_direct_io);

static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
				  struct iov_iter *iter,
				  loff_t *ppos)
{
	ssize_t res;
	struct inode *inode = file_inode(io->iocb->ki_filp);

	res = fuse_direct_io(io, iter, ppos, 0);

	fuse_invalidate_atime(inode);

	return res;
}

static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter);

static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t res;

	if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
		res = fuse_direct_IO(iocb, to);
	} else {
		struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);

		res = __fuse_direct_read(&io, to, &iocb->ki_pos);
	}

	return res;
}

static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
	ssize_t res;

	/* Don't allow parallel writes to the same file */
	inode_lock(inode);
	res = generic_write_checks(iocb, from);
	if (res > 0) {
		if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
			res = fuse_direct_IO(iocb, from);
		} else {
			res = fuse_direct_io(&io, from, &iocb->ki_pos,
					     FUSE_DIO_WRITE);
		}
	}
	fuse_invalidate_attr(inode);
	if (res > 0)
		fuse_write_update_size(inode, iocb->ki_pos);
	inode_unlock(inode);

	return res;
}

static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;

	if (is_bad_inode(file_inode(file)))
		return -EIO;

	if (!(ff->open_flags & FOPEN_DIRECT_IO))
		return fuse_cache_read_iter(iocb, to);
	else
		return fuse_direct_read_iter(iocb, to);
}

static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;

	if (is_bad_inode(file_inode(file)))
		return -EIO;

	if (!(ff->open_flags & FOPEN_DIRECT_IO))
		return fuse_cache_write_iter(iocb, from);
	else
		return fuse_direct_write_iter(iocb, from);
}

static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
{
	int i;

	for (i = 0; i < req->num_pages; i++)
		__free_page(req->pages[i]);

	if (req->ff)
		fuse_file_put(req->ff, false, false);
}

static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
{
	struct inode *inode = req->inode;
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	int i;

	list_del(&req->writepages_entry);
	for (i = 0; i < req->num_pages; i++) {
		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
		dec_node_page_state(req->pages[i], NR_WRITEBACK_TEMP);
		wb_writeout_inc(&bdi->wb);
	}
	wake_up(&fi->page_waitq);
}

/* Called under fi->lock, may release and reacquire it */
static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req,
				loff_t size)
__releases(fi->lock)
__acquires(fi->lock)
{
	struct fuse_req *aux, *next;
	struct fuse_inode *fi = get_fuse_inode(req->inode);
	struct fuse_write_in *inarg = &req->misc.write.in;
	__u64 data_size = req->num_pages * PAGE_SIZE;
	bool queued;

	if (inarg->offset + data_size <= size) {
		inarg->size = data_size;
	} else if (inarg->offset < size) {
		inarg->size = size - inarg->offset;
	} else {
		/* Got truncated off completely */
		goto out_free;
	}

	req->in.args[1].size = inarg->size;
	queued = fuse_request_queue_background(fc, req);
	/* Fails on broken connection only */
	if (unlikely(!queued))
		goto out_free;

	fi->writectr++;
	return;

 out_free:
	fuse_writepage_finish(fc, req);
	spin_unlock(&fi->lock);

	/* After fuse_writepage_finish() aux request list is private */
	for (aux = req->misc.write.next; aux; aux = next) {
		next = aux->misc.write.next;
		aux->misc.write.next = NULL;
		fuse_writepage_free(fc, aux);
		fuse_put_request(fc, aux);
	}

	fuse_writepage_free(fc, req);
	fuse_put_request(fc, req);
	spin_lock(&fi->lock);
}

/*
 * If fi->writectr is positive (no truncate or fsync going on) send
 * all queued writepage requests.
 *
 * Called with fi->lock
 */
void fuse_flush_writepages(struct inode *inode)
__releases(fi->lock)
__acquires(fi->lock)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	loff_t crop = i_size_read(inode);
	struct fuse_req *req;

	while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
		req = list_entry(fi->queued_writes.next, struct fuse_req, list);
		list_del_init(&req->list);
		fuse_send_writepage(fc, req, crop);
	}
}

static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
{
	struct inode *inode = req->inode;
	struct fuse_inode *fi = get_fuse_inode(inode);

	mapping_set_error(inode->i_mapping, req->out.h.error);
1683
	spin_lock(&fi->lock);
1684
	while (req->misc.write.next) {
1685 1686
		struct fuse_conn *fc = get_fuse_conn(inode);
		struct fuse_write_in *inarg = &req->misc.write.in;
1687 1688 1689
		struct fuse_req *next = req->misc.write.next;
		req->misc.write.next = next->misc.write.next;
		next->misc.write.next = NULL;
1690
		next->ff = fuse_file_get(req->ff);
1691
		list_add(&next->writepages_entry, &fi->writepages);
1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716

		/*
		 * Skip fuse_flush_writepages() to make it easy to crop requests
		 * based on primary request size.
		 *
		 * 1st case (trivial): there are no concurrent activities using
		 * fuse_set/release_nowrite.  Then we're on safe side because
		 * fuse_flush_writepages() would call fuse_send_writepage()
		 * anyway.
		 *
		 * 2nd case: someone called fuse_set_nowrite and it is waiting
		 * now for completion of all in-flight requests.  This happens
		 * rarely and no more than once per page, so this should be
		 * okay.
		 *
		 * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
		 * of fuse_set_nowrite..fuse_release_nowrite section.  The fact
		 * that fuse_set_nowrite returned implies that all in-flight
		 * requests were completed along with all of their secondary
		 * requests.  Further primary requests are blocked by negative
		 * writectr.  Hence there cannot be any in-flight requests and
		 * no invocations of fuse_writepage_end() while we're in
		 * fuse_set_nowrite..fuse_release_nowrite section.
		 */
		fuse_send_writepage(fc, next, inarg->offset + inarg->size);
	}
	fi->writectr--;
	fuse_writepage_finish(fc, req);
	spin_unlock(&fi->lock);
	fuse_writepage_free(fc, req);
}

static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
					       struct fuse_inode *fi)
{
	struct fuse_file *ff = NULL;

	spin_lock(&fi->lock);
	if (!list_empty(&fi->write_files)) {
		ff = list_entry(fi->write_files.next, struct fuse_file,
				write_entry);
		fuse_file_get(ff);
	}
	spin_unlock(&fi->lock);

	return ff;
}

static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
					     struct fuse_inode *fi)
{
	struct fuse_file *ff = __fuse_write_file_get(fc, fi);
	WARN_ON(!ff);
	return ff;
}

int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_file *ff;
	int err;

	ff = __fuse_write_file_get(fc, fi);
	err = fuse_flush_times(inode, ff);
	if (ff)
		fuse_file_put(ff, false, false);

	return err;
}

static int fuse_writepage_locked(struct page *page)
{
	struct address_space *mapping = page->mapping;
	struct inode *inode = mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_req *req;
	struct page *tmp_page;
	int error = -ENOMEM;

	set_page_writeback(page);

	req = fuse_request_alloc_nofs(1);
	if (!req)
		goto err;

	/* writeback always goes to bg_queue */
	__set_bit(FR_BACKGROUND, &req->flags);
	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
	if (!tmp_page)
		goto err_free;

	error = -EIO;
	req->ff = fuse_write_file_get(fc, fi);
	if (!req->ff)
		goto err_nofile;

	fuse_write_fill(req, req->ff, page_offset(page), 0);

	copy_highpage(tmp_page, page);
	req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
	req->misc.write.next = NULL;
	req->in.argpages = 1;
	req->num_pages = 1;
	req->pages[0] = tmp_page;
	req->page_descs[0].offset = 0;
	req->page_descs[0].length = PAGE_SIZE;
	req->end = fuse_writepage_end;
	req->inode = inode;

	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);

	spin_lock(&fi->lock);
	list_add(&req->writepages_entry, &fi->writepages);
	list_add_tail(&req->list, &fi->queued_writes);
	fuse_flush_writepages(inode);
	spin_unlock(&fi->lock);

	end_page_writeback(page);

	return 0;

err_nofile:
	__free_page(tmp_page);
err_free:
	fuse_request_free(req);
err:
	mapping_set_error(page->mapping, error);
	end_page_writeback(page);
	return error;
}

static int fuse_writepage(struct page *page, struct writeback_control *wbc)
{
	int err;

	if (fuse_page_is_writeback(page->mapping->host, page->index)) {
		/*
		 * ->writepages() should be called for sync() and friends.  We
		 * should only get here on direct reclaim and then we are
		 * allowed to skip a page which is already in flight
		 */
		WARN_ON(wbc->sync_mode == WB_SYNC_ALL);

		redirty_page_for_writepage(wbc, page);
		return 0;
	}

	err = fuse_writepage_locked(page);
	unlock_page(page);

	return err;
}

struct fuse_fill_wb_data {
	struct fuse_req *req;
	struct fuse_file *ff;
	struct inode *inode;
	struct page **orig_pages;
};

static void fuse_writepages_send(struct fuse_fill_wb_data *data)
{
	struct fuse_req *req = data->req;
	struct inode *inode = data->inode;
	struct fuse_inode *fi = get_fuse_inode(inode);
	int num_pages = req->num_pages;
	int i;

	req->ff = fuse_file_get(data->ff);
	spin_lock(&fi->lock);
	list_add_tail(&req->list, &fi->queued_writes);
	fuse_flush_writepages(inode);
	spin_unlock(&fi->lock);

	for (i = 0; i < num_pages; i++)
		end_page_writeback(data->orig_pages[i]);
}

/*
 * First recheck under fi->lock if the offending offset is still under
 * writeback.  If yes, then iterate auxiliary write requests, to see if there's
 * one already added for a page at this offset.  If there's none, then insert
 * this new request onto the auxiliary list, otherwise reuse the existing one by
 * copying the new page contents over to the old temporary page.
 */
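
/*
 * Illustrative walkthrough (not part of the original comment): suppose
 * page 5 is under writeback via primary request A when it is dirtied
 * and written again.  The new request B is not sent immediately; it is
 * hooked onto A->misc.write.next instead.  If page 5 is written yet
 * again while A is still in flight, the newest contents are simply
 * swapped into the temporary page of the already-queued request.  Once
 * A completes, fuse_writepage_end() sends B cropped to the primary
 * request's size.
 */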
static bool fuse_writepage_in_flight(struct fuse_req *new_req,
				     struct page *page)
{
	struct fuse_conn *fc = get_fuse_conn(new_req->inode);
	struct fuse_inode *fi = get_fuse_inode(new_req->inode);
	struct fuse_req *tmp;
	struct fuse_req *old_req;

	WARN_ON(new_req->num_pages != 0);

	spin_lock(&fi->lock);
	list_del(&new_req->writepages_entry);
	old_req = fuse_find_writeback(fi, page->index, page->index);
	if (!old_req) {
		list_add(&new_req->writepages_entry, &fi->writepages);
		spin_unlock(&fi->lock);
		return false;
	}

	new_req->num_pages = 1;
	for (tmp = old_req->misc.write.next; tmp; tmp = tmp->misc.write.next) {
		pgoff_t curr_index;

		WARN_ON(tmp->inode != new_req->inode);
		curr_index = tmp->misc.write.in.offset >> PAGE_SHIFT;
		if (curr_index == page->index) {
			WARN_ON(tmp->num_pages != 1);
			WARN_ON(!test_bit(FR_PENDING, &tmp->flags));
			swap(tmp->pages[0], new_req->pages[0]);
			break;
		}
	}

	if (!tmp) {
		new_req->misc.write.next = old_req->misc.write.next;
		old_req->misc.write.next = new_req;
	}

	spin_unlock(&fi->lock);

	if (tmp) {
		struct backing_dev_info *bdi = inode_to_bdi(new_req->inode);

		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
		dec_node_page_state(new_req->pages[0], NR_WRITEBACK_TEMP);
		wb_writeout_inc(&bdi->wb);
		fuse_writepage_free(fc, new_req);
		fuse_request_free(new_req);
	}

	return true;
}

static int fuse_writepages_fill(struct page *page,
		struct writeback_control *wbc, void *_data)
{
	struct fuse_fill_wb_data *data = _data;
	struct fuse_req *req = data->req;
	struct inode *inode = data->inode;
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct page *tmp_page;
	bool is_writeback;
	int err;

	if (!data->ff) {
		err = -EIO;
		data->ff = fuse_write_file_get(fc, get_fuse_inode(inode));
		if (!data->ff)
			goto out_unlock;
	}

	/*
	 * Being under writeback is unlikely but possible.  For example direct
	 * read to an mmaped fuse file will set the page dirty twice; once when
	 * the pages are faulted with get_user_pages(), and then after the read
	 * completed.
	 */
	is_writeback = fuse_page_is_writeback(inode, page->index);

	if (req && req->num_pages &&
	    (is_writeback || req->num_pages == fc->max_pages ||
	     (req->num_pages + 1) * PAGE_SIZE > fc->max_write ||
	     data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) {
		fuse_writepages_send(data);
		data->req = NULL;
	} else if (req && req->num_pages == req->max_pages) {
		if (!fuse_req_realloc_pages(fc, req, GFP_NOFS)) {
			fuse_writepages_send(data);
			req = data->req = NULL;
		}
	}

	err = -ENOMEM;
	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
	if (!tmp_page)
		goto out_unlock;

	/*
	 * The page must not be redirtied until the writeout is completed
	 * (i.e. userspace has sent a reply to the write request).  Otherwise
	 * there could be more than one temporary page instance for each real
	 * page.
	 *
	 * This is ensured by holding the page lock in page_mkwrite() while
	 * checking fuse_page_is_writeback().  We already hold the page lock
	 * since clear_page_dirty_for_io() and keep it held until we add the
	 * request to the fi->writepages list and increment req->num_pages.
	 * After this fuse_page_is_writeback() will indicate that the page is
	 * under writeback, so we can release the page lock.
	 */
	if (data->req == NULL) {
		struct fuse_inode *fi = get_fuse_inode(inode);

		err = -ENOMEM;
		req = fuse_request_alloc_nofs(FUSE_REQ_INLINE_PAGES);
		if (!req) {
			__free_page(tmp_page);
			goto out_unlock;
		}

		fuse_write_fill(req, data->ff, page_offset(page), 0);
		req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
		req->misc.write.next = NULL;
		req->in.argpages = 1;
		__set_bit(FR_BACKGROUND, &req->flags);
		req->num_pages = 0;
		req->end = fuse_writepage_end;
		req->inode = inode;

		spin_lock(&fi->lock);
		list_add(&req->writepages_entry, &fi->writepages);
		spin_unlock(&fi->lock);

		data->req = req;
	}
	set_page_writeback(page);

	copy_highpage(tmp_page, page);
	req->pages[req->num_pages] = tmp_page;
	req->page_descs[req->num_pages].offset = 0;
	req->page_descs[req->num_pages].length = PAGE_SIZE;

	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);

	err = 0;
	if (is_writeback && fuse_writepage_in_flight(req, page)) {
		end_page_writeback(page);
		data->req = NULL;
		goto out_unlock;
	}
	data->orig_pages[req->num_pages] = page;

	/*
	 * Protected by fi->lock against concurrent access by
	 * fuse_page_is_writeback().
	 */
	spin_lock(&fi->lock);
	req->num_pages++;
	spin_unlock(&fi->lock);

out_unlock:
	unlock_page(page);

	return err;
}

static int fuse_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_fill_wb_data data;
	int err;

	err = -EIO;
	if (is_bad_inode(inode))
		goto out;

	data.inode = inode;
	data.req = NULL;
	data.ff = NULL;

	err = -ENOMEM;
	data.orig_pages = kcalloc(fc->max_pages,
				  sizeof(struct page *),
				  GFP_NOFS);
	if (!data.orig_pages)
		goto out;

	err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
	if (data.req) {
		/* Ignore errors if we can write at least one page */
		BUG_ON(!data.req->num_pages);
		fuse_writepages_send(&data);
		err = 0;
	}
	if (data.ff)
		fuse_file_put(data.ff, false, false);

	kfree(data.orig_pages);
out:
	return err;
}

/*
 * It would be worthwhile to make sure that space is reserved on disk for
 * the write, but how to implement that without hurting performance needs
 * more thought.
 */
static int fuse_write_begin(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned flags,
		struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_SHIFT;
	struct fuse_conn *fc = get_fuse_conn(file_inode(file));
	struct page *page;
	loff_t fsize;
	int err = -ENOMEM;

	WARN_ON(!fc->writeback_cache);

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		goto error;

	fuse_wait_on_page_writeback(mapping->host, page->index);

	if (PageUptodate(page) || len == PAGE_SIZE)
		goto success;
	/*
	 * Check if the start of this page comes after the end of file, in which
	 * case the readpage can be optimized away.
	 */
	fsize = i_size_read(mapping->host);
	if (fsize <= (pos & PAGE_MASK)) {
		size_t off = pos & ~PAGE_MASK;
		if (off)
			zero_user_segment(page, 0, off);
		goto success;
	}
	err = fuse_do_readpage(file, page);
	if (err)
		goto cleanup;
success:
	*pagep = page;
	return 0;

cleanup:
	unlock_page(page);
	put_page(page);
error:
	return err;
}

static int fuse_write_end(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned copied,
		struct page *page, void *fsdata)
{
	struct inode *inode = page->mapping->host;

	/* Haven't copied anything?  Skip zeroing, size extending, dirtying. */
	if (!copied)
		goto unlock;

	if (!PageUptodate(page)) {
		/* Zero any unwritten bytes at the end of the page */
		size_t endoff = (pos + copied) & ~PAGE_MASK;
		if (endoff)
			zero_user_segment(page, endoff, PAGE_SIZE);
		SetPageUptodate(page);
	}

	fuse_write_update_size(inode, pos + copied);
	set_page_dirty(page);

unlock:
	unlock_page(page);
	put_page(page);

	return copied;
}

static int fuse_launder_page(struct page *page)
{
	int err = 0;
	if (clear_page_dirty_for_io(page)) {
		struct inode *inode = page->mapping->host;
		err = fuse_writepage_locked(page);
		if (!err)
			fuse_wait_on_page_writeback(inode, page->index);
	}
	return err;
}

/*
 * Write back dirty pages now, because there may not be any suitable
 * open files later
 */
static void fuse_vma_close(struct vm_area_struct *vma)
{
	filemap_write_and_wait(vma->vm_file->f_mapping);
}

/*
 * Wait for writeback against this page to complete before allowing it
 * to be marked dirty again, and hence written back again, possibly
 * before the previous writepage completed.
 *
 * Block here, instead of in ->writepage(), so that the userspace fs
 * can only block processes actually operating on the filesystem.
 *
 * Otherwise unprivileged userspace fs would be able to block
 * unrelated:
 *
 * - page migration
 * - sync(2)
 * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
 */
static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vmf->vma->vm_file);

	file_update_time(vmf->vma->vm_file);
	lock_page(page);
	if (page->mapping != inode->i_mapping) {
		unlock_page(page);
		return VM_FAULT_NOPAGE;
	}

	fuse_wait_on_page_writeback(inode, page->index);
	return VM_FAULT_LOCKED;
}

static const struct vm_operations_struct fuse_file_vm_ops = {
	.close		= fuse_vma_close,
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= fuse_page_mkwrite,
};

static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct fuse_file *ff = file->private_data;

	if (ff->open_flags & FOPEN_DIRECT_IO) {
		/* Can't provide the coherency needed for MAP_SHARED */
		if (vma->vm_flags & VM_MAYSHARE)
			return -ENODEV;

		invalidate_inode_pages2(file->f_mapping);

		return generic_file_mmap(file, vma);
	}

	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
		fuse_link_write_file(file);

	file_accessed(file);
	vma->vm_ops = &fuse_file_vm_ops;
	return 0;
}

static int convert_fuse_file_lock(struct fuse_conn *fc,
				  const struct fuse_file_lock *ffl,
				  struct file_lock *fl)
{
	switch (ffl->type) {
	case F_UNLCK:
		break;

	case F_RDLCK:
	case F_WRLCK:
		if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
		    ffl->end < ffl->start)
			return -EIO;

		fl->fl_start = ffl->start;
		fl->fl_end = ffl->end;

		/*
		 * Convert pid into init's pid namespace.  The locks API will
		 * translate it into the caller's pid namespace.
		 */
		rcu_read_lock();
		fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
		rcu_read_unlock();
		break;

	default:
		return -EIO;
	}
	fl->fl_type = ffl->type;
	return 0;
}

static void fuse_lk_fill(struct fuse_args *args, struct file *file,
			 const struct file_lock *fl, int opcode, pid_t pid,
			 int flock, struct fuse_lk_in *inarg)
{
	struct inode *inode = file_inode(file);
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_file *ff = file->private_data;

	memset(inarg, 0, sizeof(*inarg));
	inarg->fh = ff->fh;
	inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
	inarg->lk.start = fl->fl_start;
	inarg->lk.end = fl->fl_end;
	inarg->lk.type = fl->fl_type;
	inarg->lk.pid = pid;
	if (flock)
		inarg->lk_flags |= FUSE_LK_FLOCK;
	args->opcode = opcode;
	args->nodeid = get_node_id(inode);
	args->in_numargs = 1;
	args->in_args[0].size = sizeof(*inarg);
	args->in_args[0].value = inarg;
}

static int fuse_getlk(struct file *file, struct file_lock *fl)
{
	struct inode *inode = file_inode(file);
	struct fuse_conn *fc = get_fuse_conn(inode);
	FUSE_ARGS(args);
	struct fuse_lk_in inarg;
	struct fuse_lk_out outarg;
	int err;

	fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg);
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(outarg);
	args.out_args[0].value = &outarg;
	err = fuse_simple_request(fc, &args);
	if (!err)
		err = convert_fuse_file_lock(fc, &outarg.lk, fl);

	return err;
}

static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
{
	struct inode *inode = file_inode(file);
	struct fuse_conn *fc = get_fuse_conn(inode);
	FUSE_ARGS(args);
	struct fuse_lk_in inarg;
	int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
	struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL;
	pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns);
	int err;

	if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
		/* NLM needs asynchronous locks, which we don't support yet */
		return -ENOLCK;
	}

	/* Unlock on close is handled by the flush method */
	if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
		return 0;

	fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
	err = fuse_simple_request(fc, &args);

	/* locking is restartable */
	if (err == -EINTR)
		err = -ERESTARTSYS;

	return err;
}

static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
{
	struct inode *inode = file_inode(file);
	struct fuse_conn *fc = get_fuse_conn(inode);
	int err;

	if (cmd == F_CANCELLK) {
		err = 0;
	} else if (cmd == F_GETLK) {
		if (fc->no_lock) {
			posix_test_lock(file, fl);
			err = 0;
		} else
			err = fuse_getlk(file, fl);
	} else {
		if (fc->no_lock)
			err = posix_lock_file(file, fl, NULL);
		else
			err = fuse_setlk(file, fl, 0);
	}
	return err;
}

static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
{
	struct inode *inode = file_inode(file);
	struct fuse_conn *fc = get_fuse_conn(inode);
	int err;

	if (fc->no_flock) {
		err = locks_lock_file_wait(file, fl);
	} else {
		struct fuse_file *ff = file->private_data;

		/* emulate flock with POSIX locks */
		ff->flock = true;
		err = fuse_setlk(file, fl, 1);
	}

	return err;
}

static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
{
	struct inode *inode = mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	FUSE_ARGS(args);
	struct fuse_bmap_in inarg;
	struct fuse_bmap_out outarg;
	int err;

	if (!inode->i_sb->s_bdev || fc->no_bmap)
		return 0;

	memset(&inarg, 0, sizeof(inarg));
	inarg.block = block;
	inarg.blocksize = inode->i_sb->s_blocksize;
	args.opcode = FUSE_BMAP;
	args.nodeid = get_node_id(inode);
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(outarg);
	args.out_args[0].value = &outarg;
	err = fuse_simple_request(fc, &args);
	if (err == -ENOSYS)
		fc->no_bmap = 1;

	return err ? 0 : outarg.block;
}

static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_file *ff = file->private_data;
	FUSE_ARGS(args);
	struct fuse_lseek_in inarg = {
		.fh = ff->fh,
		.offset = offset,
		.whence = whence
	};
	struct fuse_lseek_out outarg;
	int err;

	if (fc->no_lseek)
		goto fallback;

	args.opcode = FUSE_LSEEK;
	args.nodeid = ff->nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(outarg);
	args.out_args[0].value = &outarg;
	err = fuse_simple_request(fc, &args);
	if (err) {
		if (err == -ENOSYS) {
			fc->no_lseek = 1;
			goto fallback;
		}
		return err;
	}

	return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes);

fallback:
	err = fuse_update_attributes(inode, file);
	if (!err)
		return generic_file_llseek(file, offset, whence);
	else
		return err;
}

static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
{
	loff_t retval;
	struct inode *inode = file_inode(file);

	switch (whence) {
	case SEEK_SET:
	case SEEK_CUR:
		 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
		retval = generic_file_llseek(file, offset, whence);
		break;
	case SEEK_END:
		inode_lock(inode);
		retval = fuse_update_attributes(inode, file);
		if (!retval)
			retval = generic_file_llseek(file, offset, whence);
		inode_unlock(inode);
		break;
	case SEEK_HOLE:
	case SEEK_DATA:
		inode_lock(inode);
		retval = fuse_lseek(file, offset, whence);
		inode_unlock(inode);
		break;
	default:
		retval = -EINVAL;
	}

	return retval;
}

/*
 * CUSE servers compiled on 32bit broke on 64bit kernels because the
 * ABI was defined to be 'struct iovec' which is different on 32bit
 * and 64bit.  Fortunately we can determine which structure the server
 * used from the size of the reply.
 */
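
/*
 * Worked example (sizes assume the usual ABIs): with count == 2, a
 * 16-byte reply (2 * sizeof(struct compat_iovec)) can only have come
 * from a 32-bit server, whereas a 32-byte reply matches
 * 2 * sizeof(struct iovec) on a 64-bit kernel, i.e. the native layout.
 */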
static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
				     size_t transferred, unsigned count,
				     bool is_compat)
{
#ifdef CONFIG_COMPAT
	if (count * sizeof(struct compat_iovec) == transferred) {
		struct compat_iovec *ciov = src;
		unsigned i;

		/*
		 * With this interface a 32bit server cannot support
		 * non-compat (i.e. ones coming from 64bit apps) ioctl
		 * requests
		 */
		if (!is_compat)
			return -EINVAL;

		for (i = 0; i < count; i++) {
			dst[i].iov_base = compat_ptr(ciov[i].iov_base);
			dst[i].iov_len = ciov[i].iov_len;
		}
		return 0;
	}
#endif

	if (count * sizeof(struct iovec) != transferred)
		return -EIO;

	memcpy(dst, src, transferred);
	return 0;
}

/* Make sure iov_length() won't overflow */
static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov,
				 size_t count)
{
	size_t n;
	u32 max = fc->max_pages << PAGE_SHIFT;

	for (n = 0; n < count; n++, iov++) {
		if (iov->iov_len > (size_t) max)
			return -ENOMEM;
		max -= iov->iov_len;
	}
	return 0;
}

static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
				 void *src, size_t transferred, unsigned count,
				 bool is_compat)
{
	unsigned i;
	struct fuse_ioctl_iovec *fiov = src;

	if (fc->minor < 16) {
		return fuse_copy_ioctl_iovec_old(dst, src, transferred,
						 count, is_compat);
	}

	if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
		return -EIO;

	for (i = 0; i < count; i++) {
		/* Did the server supply an inappropriate value? */
		if (fiov[i].base != (unsigned long) fiov[i].base ||
		    fiov[i].len != (unsigned long) fiov[i].len)
			return -EIO;

		dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
		dst[i].iov_len = (size_t) fiov[i].len;

#ifdef CONFIG_COMPAT
		if (is_compat &&
		    (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
		     (compat_size_t) dst[i].iov_len != fiov[i].len))
			return -EIO;
#endif
	}

	return 0;
}


/*
 * For ioctls, there is no generic way to determine how much memory
 * needs to be read and/or written.  Furthermore, ioctls are allowed
 * to dereference the passed pointer, so the parameter requires deep
 * copying but FUSE has no idea whatsoever about what to copy in or
 * out.
 *
 * This is solved by allowing FUSE server to retry ioctl with
 * necessary in/out iovecs.  Let's assume the ioctl implementation
 * needs to read in the following structure.
 *
 * struct a {
 *	char	*buf;
 *	size_t	buflen;
 * }
 *
 * On the first callout to FUSE server, inarg->in_size and
 * inarg->out_size will be NULL; then, the server completes the ioctl
 * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
 * the actual iov array to
 *
 * { { .iov_base = inarg.arg,	.iov_len = sizeof(struct a) } }
 *
 * which tells FUSE to copy in the requested area and retry the ioctl.
 * On the second round, the server has access to the structure and
 * from that it can tell what to look for next, so on the invocation,
 * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
 *
 * { { .iov_base = inarg.arg,	.iov_len = sizeof(struct a)	},
 *   { .iov_base = a.buf,	.iov_len = a.buflen		} }
 *
 * FUSE will copy both struct a and the pointed buffer from the
 * process doing the ioctl and retry ioctl with both struct a and the
 * buffer.
 *
 * This time, FUSE server has everything it needs and completes ioctl
 * without FUSE_IOCTL_RETRY which finishes the ioctl call.
 *
 * Copying data out works the same way.
 *
 * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
 * automatically initializes in and out iovs by decoding @cmd with
 * _IOC_* macros and the server is not allowed to request RETRY.  This
 * limits ioctl data transfers to well-formed ioctls and is the forced
 * behavior for all FUSE servers.
 */
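
/*
 * Sketch of the server side of the first retry round for the example
 * above -- illustrative only, not kernel code: "struct a" is the
 * hypothetical structure from the comment, and the reply uses the wire
 * structures from <linux/fuse.h> (protocol minor >= 16):
 *
 *	struct fuse_ioctl_out out = {
 *		.flags = FUSE_IOCTL_RETRY,
 *		.in_iovs = 1,
 *	};
 *	struct fuse_ioctl_iovec iov = {
 *		.base = inarg.arg,
 *		.len = sizeof(struct a),
 *	};
 *
 * The server replies with "out" followed by "iov"; the kernel then
 * copies in sizeof(struct a) bytes from the caller and reissues
 * FUSE_IOCTL with that data attached.
 */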
long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
		   unsigned int flags)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;
	struct fuse_ioctl_in inarg = {
		.fh = ff->fh,
		.cmd = cmd,
		.arg = arg,
		.flags = flags
	};
	struct fuse_ioctl_out outarg;
	struct iovec *iov_page = NULL;
	struct iovec *in_iov = NULL, *out_iov = NULL;
	unsigned int in_iovs = 0, out_iovs = 0, max_pages;
	size_t in_size, out_size, c;
	ssize_t transferred;
	int err, i;
	struct iov_iter ii;
	struct fuse_args_pages ap = {};

#if BITS_PER_LONG == 32
	inarg.flags |= FUSE_IOCTL_32BIT;
#else
	if (flags & FUSE_IOCTL_COMPAT) {
		inarg.flags |= FUSE_IOCTL_32BIT;
#ifdef CONFIG_X86_X32
		if (in_x32_syscall())
			inarg.flags |= FUSE_IOCTL_COMPAT_X32;
#endif
	}
#endif

	/* assume all the iovs returned by client always fit in a page */
	BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);

	err = -ENOMEM;
	ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs);
	iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
	if (!ap.pages || !iov_page)
		goto out;

	fuse_page_descs_length_init(ap.descs, 0, fc->max_pages);

	/*
	 * If restricted, initialize IO parameters as encoded in @cmd.
	 * RETRY from server is not allowed.
	 */
	if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
		struct iovec *iov = iov_page;

		iov->iov_base = (void __user *)arg;
		iov->iov_len = _IOC_SIZE(cmd);

		if (_IOC_DIR(cmd) & _IOC_WRITE) {
			in_iov = iov;
			in_iovs = 1;
		}

		if (_IOC_DIR(cmd) & _IOC_READ) {
			out_iov = iov;
			out_iovs = 1;
		}
	}

 retry:
	inarg.in_size = in_size = iov_length(in_iov, in_iovs);
	inarg.out_size = out_size = iov_length(out_iov, out_iovs);

	/*
	 * Out data can be used either for actual out data or iovs,
	 * make sure there always is at least one page.
	 */
	out_size = max_t(size_t, out_size, PAGE_SIZE);
	max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);

	/* make sure there are enough buffer pages and init request with them */
	err = -ENOMEM;
	if (max_pages > fc->max_pages)
		goto out;
	while (ap.num_pages < max_pages) {
		ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
		if (!ap.pages[ap.num_pages])
			goto out;
		ap.num_pages++;
	}


	/* okay, let's send it to the client */
	ap.args.opcode = FUSE_IOCTL;
	ap.args.nodeid = ff->nodeid;
	ap.args.in_numargs = 1;
	ap.args.in_args[0].size = sizeof(inarg);
	ap.args.in_args[0].value = &inarg;
	if (in_size) {
		ap.args.in_numargs++;
		ap.args.in_args[1].size = in_size;
		ap.args.in_pages = true;

		err = -EFAULT;
		iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size);
		for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
			c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
			if (c != PAGE_SIZE && iov_iter_count(&ii))
				goto out;
		}
	}

	ap.args.out_numargs = 2;
	ap.args.out_args[0].size = sizeof(outarg);
	ap.args.out_args[0].value = &outarg;
	ap.args.out_args[1].size = out_size;
	ap.args.out_pages = true;
	ap.args.out_argvar = true;

	transferred = fuse_simple_request(fc, &ap.args);
	err = transferred;
	if (transferred < 0)
		goto out;

	/* did it ask for retry? */
	if (outarg.flags & FUSE_IOCTL_RETRY) {
		void *vaddr;

		/* no retry if in restricted mode */
		err = -EIO;
		if (!(flags & FUSE_IOCTL_UNRESTRICTED))
			goto out;

		in_iovs = outarg.in_iovs;
		out_iovs = outarg.out_iovs;

		/*
		 * Make sure things are within bounds; the separate checks
		 * protect against overflow.
		 */
		err = -ENOMEM;
		if (in_iovs > FUSE_IOCTL_MAX_IOV ||
		    out_iovs > FUSE_IOCTL_MAX_IOV ||
		    in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
			goto out;

		vaddr = kmap_atomic(ap.pages[0]);
		err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
					    transferred, in_iovs + out_iovs,
					    (flags & FUSE_IOCTL_COMPAT) != 0);
		kunmap_atomic(vaddr);
		if (err)
			goto out;

		in_iov = iov_page;
		out_iov = in_iov + in_iovs;

		err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs);
		if (err)
			goto out;

		err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs);
		if (err)
			goto out;

		goto retry;
	}

	err = -EIO;
	if (transferred > inarg.out_size)
		goto out;

	err = -EFAULT;
	iov_iter_init(&ii, READ, out_iov, out_iovs, transferred);
	for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
		c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
		if (c != PAGE_SIZE && iov_iter_count(&ii))
			goto out;
	}
	err = 0;
 out:
	free_page((unsigned long) iov_page);
	while (ap.num_pages)
		__free_page(ap.pages[--ap.num_pages]);
	kfree(ap.pages);

	return err ? err : outarg.result;
}
EXPORT_SYMBOL_GPL(fuse_do_ioctl);

long fuse_ioctl_common(struct file *file, unsigned int cmd,
		       unsigned long arg, unsigned int flags)
{
	struct inode *inode = file_inode(file);
	struct fuse_conn *fc = get_fuse_conn(inode);

	if (!fuse_allow_current_process(fc))
		return -EACCES;

	if (is_bad_inode(inode))
		return -EIO;

	return fuse_do_ioctl(file, cmd, arg, flags);
}

static long fuse_file_ioctl(struct file *file, unsigned int cmd,
			    unsigned long arg)
{
	return fuse_ioctl_common(file, cmd, arg, 0);
}

static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
				   unsigned long arg)
{
	return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
}

/*
 * All files which have been polled are linked to RB tree
 * fuse_conn->polled_files which is indexed by kh.  Walk the tree and
 * find the matching one.
 */
static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
					      struct rb_node **parent_out)
{
	struct rb_node **link = &fc->polled_files.rb_node;
	struct rb_node *last = NULL;

	while (*link) {
		struct fuse_file *ff;

		last = *link;
		ff = rb_entry(last, struct fuse_file, polled_node);

		if (kh < ff->kh)
			link = &last->rb_left;
		else if (kh > ff->kh)
			link = &last->rb_right;
		else
			return link;
	}

	if (parent_out)
		*parent_out = last;
	return link;
}

/*
 * The file is about to be polled.  Make sure it's on the polled_files
 * RB tree.  Note that files once added to the polled_files tree are
 * not removed before the file is released.  This is because a file
 * polled once is likely to be polled again.
 */
static void fuse_register_polled_file(struct fuse_conn *fc,
				      struct fuse_file *ff)
{
	spin_lock(&fc->lock);
	if (RB_EMPTY_NODE(&ff->polled_node)) {
		struct rb_node **link, *uninitialized_var(parent);

		link = fuse_find_polled_node(fc, ff->kh, &parent);
		BUG_ON(*link);
		rb_link_node(&ff->polled_node, parent, link);
		rb_insert_color(&ff->polled_node, &fc->polled_files);
	}
	spin_unlock(&fc->lock);
}

__poll_t fuse_file_poll(struct file *file, poll_table *wait)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;
	struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
	struct fuse_poll_out outarg;
	FUSE_ARGS(args);
	int err;

	if (fc->no_poll)
		return DEFAULT_POLLMASK;

	poll_wait(file, &ff->poll_wait, wait);
	inarg.events = mangle_poll(poll_requested_events(wait));

	/*
	 * Ask for notification iff there's someone waiting for it.
	 * The client may ignore the flag and always notify.
	 */
	if (waitqueue_active(&ff->poll_wait)) {
		inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
		fuse_register_polled_file(fc, ff);
	}

	args.opcode = FUSE_POLL;
	args.nodeid = ff->nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(outarg);
	args.out_args[0].value = &outarg;
	err = fuse_simple_request(fc, &args);

	if (!err)
		return demangle_poll(outarg.revents);
	if (err == -ENOSYS) {
		fc->no_poll = 1;
		return DEFAULT_POLLMASK;
	}
	return EPOLLERR;
}
EXPORT_SYMBOL_GPL(fuse_file_poll);

/*
 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
 * wakes up the poll waiters.
 */
int fuse_notify_poll_wakeup(struct fuse_conn *fc,
			    struct fuse_notify_poll_wakeup_out *outarg)
{
	u64 kh = outarg->kh;
	struct rb_node **link;

	spin_lock(&fc->lock);

	link = fuse_find_polled_node(fc, kh, NULL);
	if (*link) {
		struct fuse_file *ff;

		ff = rb_entry(*link, struct fuse_file, polled_node);
		wake_up_interruptible_sync(&ff->poll_wait);
	}

	spin_unlock(&fc->lock);
	return 0;
}

static void fuse_do_truncate(struct file *file)
{
	struct inode *inode = file->f_mapping->host;
	struct iattr attr;

	attr.ia_valid = ATTR_SIZE;
	attr.ia_size = i_size_read(inode);

	attr.ia_file = file;
	attr.ia_valid |= ATTR_FILE;

	fuse_do_setattr(file_dentry(file), &attr, file);
}

static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off)
{
	return round_up(off, fc->max_pages << PAGE_SHIFT);
}

static ssize_t
fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
	DECLARE_COMPLETION_ONSTACK(wait);
	ssize_t ret = 0;
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	bool async_dio = ff->fc->async_dio;
	loff_t pos = 0;
	struct inode *inode;
	loff_t i_size;
	size_t count = iov_iter_count(iter);
	loff_t offset = iocb->ki_pos;
	struct fuse_io_priv *io;

	pos = offset;
	inode = file->f_mapping->host;
	i_size = i_size_read(inode);

	if ((iov_iter_rw(iter) == READ) && (offset > i_size))
		return 0;

	/* optimization for short read */
	if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) {
		if (offset >= i_size)
			return 0;
		iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset));
		count = iov_iter_count(iter);
	}

	io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
	if (!io)
		return -ENOMEM;
	spin_lock_init(&io->lock);
	kref_init(&io->refcnt);
	io->reqs = 1;
	io->bytes = -1;
	io->size = 0;
	io->offset = offset;
	io->write = (iov_iter_rw(iter) == WRITE);
	io->err = 0;
	/*
	 * By default, we want to optimize all I/Os with async request
	 * submission to the client filesystem if supported.
	 */
	io->async = async_dio;
	io->iocb = iocb;
	io->blocking = is_sync_kiocb(iocb);

	/*
	 * We cannot asynchronously extend the size of a file.
	 * In such case the aio will behave exactly like sync io.
	 */
	if ((offset + count > i_size) && iov_iter_rw(iter) == WRITE)
		io->blocking = true;

	if (io->async && io->blocking) {
		/*
		 * Additional reference to keep io around after
		 * calling fuse_aio_complete()
		 */
		kref_get(&io->refcnt);
		io->done = &wait;
	}

	if (iov_iter_rw(iter) == WRITE) {
		ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
		fuse_invalidate_attr(inode);
	} else {
		ret = __fuse_direct_read(io, iter, &pos);
	}

	if (io->async) {
		bool blocking = io->blocking;

		fuse_aio_complete(io, ret < 0 ? ret : 0, -1);

		/* we have a non-extending, async request, so return */
		if (!blocking)
			return -EIOCBQUEUED;

		wait_for_completion(&wait);
		ret = fuse_get_res_by_io(io);
	}

	kref_put(&io->refcnt, fuse_io_release);

	if (iov_iter_rw(iter) == WRITE) {
		if (ret > 0)
			fuse_write_update_size(inode, pos);
		else if (ret < 0 && offset + count > i_size)
			fuse_do_truncate(file);
	}

	return ret;
}

static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
{
	int err = filemap_write_and_wait_range(inode->i_mapping, start, end);

	if (!err)
		fuse_sync_writes(inode);

	return err;
}

static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
				loff_t length)
{
	struct fuse_file *ff = file->private_data;
	struct inode *inode = file_inode(file);
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_conn *fc = ff->fc;
	FUSE_ARGS(args);
	struct fuse_fallocate_in inarg = {
		.fh = ff->fh,
		.offset = offset,
		.length = length,
		.mode = mode
	};
	int err;
	bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
			   (mode & FALLOC_FL_PUNCH_HOLE);

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (fc->no_fallocate)
		return -EOPNOTSUPP;

	if (lock_inode) {
		inode_lock(inode);
		if (mode & FALLOC_FL_PUNCH_HOLE) {
			loff_t endbyte = offset + length - 1;

			err = fuse_writeback_range(inode, offset, endbyte);
			if (err)
				goto out;
		}
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
	    offset + length > i_size_read(inode)) {
		err = inode_newsize_ok(inode, offset + length);
		if (err)
			goto out;
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE))
		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	args.opcode = FUSE_FALLOCATE;
	args.nodeid = ff->nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	err = fuse_simple_request(fc, &args);
	if (err == -ENOSYS) {
		fc->no_fallocate = 1;
		err = -EOPNOTSUPP;
	}
	if (err)
		goto out;

	/* we could have extended the file */
	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
		bool changed = fuse_write_update_size(inode, offset + length);

		if (changed && fc->writeback_cache)
			file_update_time(file);
	}

	if (mode & FALLOC_FL_PUNCH_HOLE)
		truncate_pagecache_range(inode, offset, offset + length - 1);

	fuse_invalidate_attr(inode);

out:
	if (!(mode & FALLOC_FL_KEEP_SIZE))
		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	if (lock_inode)
		inode_unlock(inode);

	return err;
}

static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
				      struct file *file_out, loff_t pos_out,
				      size_t len, unsigned int flags)
{
	struct fuse_file *ff_in = file_in->private_data;
	struct fuse_file *ff_out = file_out->private_data;
	struct inode *inode_in = file_inode(file_in);
	struct inode *inode_out = file_inode(file_out);
	struct fuse_inode *fi_out = get_fuse_inode(inode_out);
	struct fuse_conn *fc = ff_in->fc;
	FUSE_ARGS(args);
	struct fuse_copy_file_range_in inarg = {
		.fh_in = ff_in->fh,
		.off_in = pos_in,
		.nodeid_out = ff_out->nodeid,
		.fh_out = ff_out->fh,
		.off_out = pos_out,
		.len = len,
		.flags = flags
	};
	struct fuse_write_out outarg;
	ssize_t err;
	/* mark unstable when write-back is not used, and file_out gets
	 * extended */
	bool is_unstable = (!fc->writeback_cache) &&
			   ((pos_out + len) > inode_out->i_size);

	if (fc->no_copy_file_range)
		return -EOPNOTSUPP;

	if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
		return -EXDEV;

	if (fc->writeback_cache) {
		inode_lock(inode_in);
		err = fuse_writeback_range(inode_in, pos_in, pos_in + len);
		inode_unlock(inode_in);
		if (err)
			return err;
	}

	inode_lock(inode_out);

	err = file_modified(file_out);
	if (err)
		goto out;

	if (fc->writeback_cache) {
		err = fuse_writeback_range(inode_out, pos_out, pos_out + len);
		if (err)
			goto out;
	}

	if (is_unstable)
		set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);

	args.opcode = FUSE_COPY_FILE_RANGE;
	args.nodeid = ff_in->nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(outarg);
	args.out_args[0].value = &outarg;
	err = fuse_simple_request(fc, &args);
	if (err == -ENOSYS) {
		fc->no_copy_file_range = 1;
		err = -EOPNOTSUPP;
	}
	if (err)
		goto out;

	if (fc->writeback_cache) {
		fuse_write_update_size(inode_out, pos_out + outarg.size);
		file_update_time(file_out);
	}

	fuse_invalidate_attr(inode_out);

	err = outarg.size;
out:
	if (is_unstable)
		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);

	inode_unlock(inode_out);
	file_accessed(file_in);

	return err;
}

static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
				    struct file *dst_file, loff_t dst_off,
				    size_t len, unsigned int flags)
{
	ssize_t ret;

	ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off,
				     len, flags);

	if (ret == -EOPNOTSUPP || ret == -EXDEV)
		ret = generic_copy_file_range(src_file, src_off, dst_file,
					      dst_off, len, flags);
	return ret;
}

static const struct file_operations fuse_file_operations = {
	.llseek		= fuse_file_llseek,
	.read_iter	= fuse_file_read_iter,
	.write_iter	= fuse_file_write_iter,
	.mmap		= fuse_file_mmap,
	.open		= fuse_open,
	.flush		= fuse_flush,
	.release	= fuse_release,
	.fsync		= fuse_fsync,
	.lock		= fuse_file_lock,
	.flock		= fuse_file_flock,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.unlocked_ioctl	= fuse_file_ioctl,
	.compat_ioctl	= fuse_file_compat_ioctl,
	.poll		= fuse_file_poll,
	.fallocate	= fuse_file_fallocate,
	.copy_file_range = fuse_copy_file_range,
};

static const struct address_space_operations fuse_file_aops  = {
	.readpage	= fuse_readpage,
	.writepage	= fuse_writepage,
	.writepages	= fuse_writepages,
	.launder_page	= fuse_launder_page,
	.readpages	= fuse_readpages,
	.set_page_dirty	= __set_page_dirty_nobuffers,
	.bmap		= fuse_bmap,
	.direct_IO	= fuse_direct_IO,
	.write_begin	= fuse_write_begin,
	.write_end	= fuse_write_end,
};

void fuse_init_file_inode(struct inode *inode)
{
	struct fuse_inode *fi = get_fuse_inode(inode);

	inode->i_fop = &fuse_file_operations;
	inode->i_data.a_ops = &fuse_file_aops;

	INIT_LIST_HEAD(&fi->write_files);
	INIT_LIST_HEAD(&fi->queued_writes);
	fi->writectr = 0;
	init_waitqueue_head(&fi->page_waitq);
	INIT_LIST_HEAD(&fi->writepages);
}