/*
  FUSE: Filesystem in Userspace
  Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>

  This program can be distributed under the terms of the GNU GPL.
  See the file COPYING.
*/

#include "fuse_i.h"

#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/module.h>
#include <linux/compat.h>
#include <linux/swap.h>
#include <linux/falloc.h>
#include <linux/uio.h>

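/*
 * Allocate the page pointer array and the matching page descriptors in
 * a single allocation; the descriptors are placed directly after the
 * page pointers, so freeing the returned pointer frees both.
 */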
static struct page **fuse_pages_alloc(unsigned int npages, gfp_t flags,
				      struct fuse_page_desc **desc)
{
	struct page **pages;

	pages = kzalloc(npages * (sizeof(struct page *) +
				  sizeof(struct fuse_page_desc)), flags);
	*desc = (void *) (pages + npages);

	return pages;
}

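/*
 * Send FUSE_OPEN or FUSE_OPENDIR.  O_CREAT, O_EXCL and O_NOCTTY were
 * already handled by the VFS and are masked out; O_TRUNC is passed on
 * only if the server handles truncation atomically at open time.
 */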
static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
			  int opcode, struct fuse_open_out *outargp)
{
	struct fuse_open_in inarg;
	FUSE_ARGS(args);

	memset(&inarg, 0, sizeof(inarg));
	inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
	if (!fc->atomic_o_trunc)
		inarg.flags &= ~O_TRUNC;
	args.opcode = opcode;
	args.nodeid = nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(*outargp);
	args.out_args[0].value = outargp;

	return fuse_simple_request(fc, &args);
}

struct fuse_release_args {
	struct fuse_args args;
	struct fuse_release_in inarg;
	struct inode *inode;
};

struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
{
	struct fuse_file *ff;

	ff = kzalloc(sizeof(struct fuse_file), GFP_KERNEL_ACCOUNT);
	if (unlikely(!ff))
		return NULL;

	ff->fc = fc;
	ff->release_args = kzalloc(sizeof(*ff->release_args),
				   GFP_KERNEL_ACCOUNT);
	if (!ff->release_args) {
		kfree(ff);
		return NULL;
	}

	INIT_LIST_HEAD(&ff->write_entry);
	mutex_init(&ff->readdir.lock);
	refcount_set(&ff->count, 1);
	RB_CLEAR_NODE(&ff->polled_node);
	init_waitqueue_head(&ff->poll_wait);

	ff->kh = atomic64_inc_return(&fc->khctr);

	return ff;
}

void fuse_file_free(struct fuse_file *ff)
{
	kfree(ff->release_args);
	mutex_destroy(&ff->readdir.lock);
	kfree(ff);
}

static struct fuse_file *fuse_file_get(struct fuse_file *ff)
{
	refcount_inc(&ff->count);
	return ff;
}

static void fuse_release_end(struct fuse_conn *fc, struct fuse_args *args,
			     int error)
{
	struct fuse_release_args *ra = container_of(args, typeof(*ra), args);

	iput(ra->inode);
	kfree(ra);
}

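/*
 * Drop a reference to the fuse_file.  On the final put the RELEASE
 * request prepared in ff->release_args is sent: synchronously if
 * requested, in the background otherwise, or not at all if the server
 * never implemented open in the first place.
 */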
static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir)
{
	if (refcount_dec_and_test(&ff->count)) {
		struct fuse_args *args = &ff->release_args->args;

		if (isdir ? ff->fc->no_opendir : ff->fc->no_open) {
			/* Do nothing when client does not implement 'open' */
			fuse_release_end(ff->fc, args, 0);
		} else if (sync) {
			fuse_simple_request(ff->fc, args);
			fuse_release_end(ff->fc, args, 0);
		} else {
			args->end = fuse_release_end;
			if (fuse_simple_background(ff->fc, args,
						   GFP_KERNEL | __GFP_NOFAIL))
				fuse_release_end(ff->fc, args, -ENOTCONN);
		}
		kfree(ff);
	}
}

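/*
 * Open a file or directory: allocate a fuse_file, send the open request
 * unless the server is known not to implement it, and install the
 * result in file->private_data.
 */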
int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
		 bool isdir)
{
	struct fuse_file *ff;
	int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;

	ff = fuse_file_alloc(fc);
	if (!ff)
		return -ENOMEM;

	ff->fh = 0;
	/* Default for no-open */
	ff->open_flags = FOPEN_KEEP_CACHE | (isdir ? FOPEN_CACHE_DIR : 0);
	if (isdir ? !fc->no_opendir : !fc->no_open) {
		struct fuse_open_out outarg;
		int err;

		err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
		if (!err) {
			ff->fh = outarg.fh;
			ff->open_flags = outarg.open_flags;
		} else if (err != -ENOSYS) {
			fuse_file_free(ff);
			return err;
		} else {
			if (isdir)
				fc->no_opendir = 1;
			else
				fc->no_open = 1;
		}
	}

	if (isdir)
		ff->open_flags &= ~FOPEN_DIRECT_IO;

	ff->nodeid = nodeid;
	file->private_data = ff;

	return 0;
}
EXPORT_SYMBOL_GPL(fuse_do_open);

static void fuse_link_write_file(struct file *file)
{
	struct inode *inode = file_inode(file);
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_file *ff = file->private_data;
	/*
	 * file may be written through mmap, so chain it onto the
	 * inode's write_files list
	 */
	spin_lock(&fi->lock);
	if (list_empty(&ff->write_entry))
		list_add(&ff->write_entry, &fi->write_files);
	spin_unlock(&fi->lock);
}

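/*
 * Apply the open flags returned by the server: drop the page cache
 * unless FOPEN_KEEP_CACHE is set, mark the file stream-like or
 * non-seekable if requested, and zero i_size for atomic O_TRUNC opens.
 */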
void fuse_finish_open(struct inode *inode, struct file *file)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = get_fuse_conn(inode);

	if (!(ff->open_flags & FOPEN_KEEP_CACHE))
		invalidate_inode_pages2(inode->i_mapping);
	if (ff->open_flags & FOPEN_STREAM)
		stream_open(inode, file);
	else if (ff->open_flags & FOPEN_NONSEEKABLE)
		nonseekable_open(inode, file);
	if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
		struct fuse_inode *fi = get_fuse_inode(inode);

		spin_lock(&fi->lock);
		fi->attr_version = atomic64_inc_return(&fc->attr_version);
		i_size_write(inode, 0);
		spin_unlock(&fi->lock);
		fuse_invalidate_attr(inode);
		if (fc->writeback_cache)
			file_update_time(file);
	}
	if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
		fuse_link_write_file(file);
}

int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	int err;
	bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
			  fc->atomic_o_trunc &&
			  fc->writeback_cache;

	err = generic_file_open(inode, file);
	if (err)
		return err;

	if (is_wb_truncate) {
		inode_lock(inode);
		fuse_set_nowrite(inode);
	}

	err = fuse_do_open(fc, get_node_id(inode), file, isdir);

	if (!err)
		fuse_finish_open(inode, file);

	if (is_wb_truncate) {
		fuse_release_nowrite(inode);
		inode_unlock(inode);
	}

	return err;
}

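/*
 * Fill in the RELEASE request and unhook the file from the inode's
 * write list and the connection's polled-files tree.  The request
 * itself is sent later, from fuse_file_put().
 */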
static void fuse_prepare_release(struct fuse_inode *fi, struct fuse_file *ff,
				 int flags, int opcode)
{
	struct fuse_conn *fc = ff->fc;
	struct fuse_release_args *ra = ff->release_args;

	/* Inode is NULL on error path of fuse_create_open() */
	if (likely(fi)) {
		spin_lock(&fi->lock);
		list_del(&ff->write_entry);
		spin_unlock(&fi->lock);
	}
	spin_lock(&fc->lock);
	if (!RB_EMPTY_NODE(&ff->polled_node))
		rb_erase(&ff->polled_node, &fc->polled_files);
	spin_unlock(&fc->lock);

	wake_up_interruptible_all(&ff->poll_wait);

	ra->inarg.fh = ff->fh;
	ra->inarg.flags = flags;
	ra->args.in_numargs = 1;
	ra->args.in_args[0].size = sizeof(struct fuse_release_in);
	ra->args.in_args[0].value = &ra->inarg;
	ra->args.opcode = opcode;
	ra->args.nodeid = ff->nodeid;
	ra->args.force = true;
	ra->args.nocreds = true;
}

void fuse_release_common(struct file *file, bool isdir)
{
	struct fuse_inode *fi = get_fuse_inode(file_inode(file));
	struct fuse_file *ff = file->private_data;
	struct fuse_release_args *ra = ff->release_args;
	int opcode = isdir ? FUSE_RELEASEDIR : FUSE_RELEASE;

	fuse_prepare_release(fi, ff, file->f_flags, opcode);

	if (ff->flock) {
		ra->inarg.release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
		ra->inarg.lock_owner = fuse_lock_owner_id(ff->fc,
							  (fl_owner_t) file);
	}
	/* Hold inode until release is finished */
	ra->inode = igrab(file_inode(file));

	/*
	 * Normally this will send the RELEASE request, however if
	 * some asynchronous READ or WRITE requests are outstanding,
	 * the sending will be delayed.
	 *
	 * Make the release synchronous if this is a fuseblk mount,
	 * synchronous RELEASE is allowed (and desirable) in this case
	 * because the server can be trusted not to screw up.
	 */
	fuse_file_put(ff, ff->fc->destroy, isdir);
}

static int fuse_open(struct inode *inode, struct file *file)
{
	return fuse_open_common(inode, file, false);
}

static int fuse_release(struct inode *inode, struct file *file)
{
	struct fuse_conn *fc = get_fuse_conn(inode);

	/* see fuse_vma_close() for !writeback_cache case */
	if (fc->writeback_cache)
		write_inode_now(inode, 1);

	fuse_release_common(file, false);

	/* return value is ignored by VFS */
	return 0;
}

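/*
 * Release a file whose only reference is held by the caller, waiting
 * for the RELEASE request to finish before returning.
 */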
void fuse_sync_release(struct fuse_inode *fi, struct fuse_file *ff, int flags)
{
	WARN_ON(refcount_read(&ff->count) > 1);
	fuse_prepare_release(fi, ff, flags, FUSE_RELEASE);
	/*
	 * iput(NULL) is a no-op and since the refcount is 1 and everything's
	 * synchronous, we are fine with not doing igrab() here
	 */
	fuse_file_put(ff, true, false);
}
EXPORT_SYMBOL_GPL(fuse_sync_release);

/*
 * Scramble the ID space with XTEA, so that the value of the files_struct
 * pointer is not exposed to userspace.
 */
u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
{
	u32 *k = fc->scramble_key;
	u64 v = (unsigned long) id;
	u32 v0 = v;
	u32 v1 = v >> 32;
	u32 sum = 0;
	int i;

	for (i = 0; i < 32; i++) {
		v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
		sum += 0x9E3779B9;
		v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
	}

	return (u64) v0 + ((u64) v1 << 32);
}

struct fuse_writepage_args {
	struct fuse_io_args ia;
	struct list_head writepages_entry;
	struct list_head queue_entry;
	struct fuse_writepage_args *next;
	struct inode *inode;
};

static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi,
					    pgoff_t idx_from, pgoff_t idx_to)
{
	struct fuse_writepage_args *wpa;

	list_for_each_entry(wpa, &fi->writepages, writepages_entry) {
		pgoff_t curr_index;

		WARN_ON(get_fuse_inode(wpa->inode) != fi);
		curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT;
		if (idx_from < curr_index + wpa->ia.ap.num_pages &&
		    curr_index <= idx_to) {
			return wpa;
		}
	}
	return NULL;
}

/*
 * Check if any page in a range is under writeback
 *
 * This is currently done by walking the list of writepage requests
 * for the inode, which can be pretty inefficient.
 */
static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
				   pgoff_t idx_to)
{
	struct fuse_inode *fi = get_fuse_inode(inode);
	bool found;

	spin_lock(&fi->lock);
	found = fuse_find_writeback(fi, idx_from, idx_to);
	spin_unlock(&fi->lock);

	return found;
}

static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
{
	return fuse_range_is_writeback(inode, index, index);
}

/*
 * Wait for page writeback to be completed.
 *
 * Since fuse doesn't rely on the VM writeback tracking, this has to
 * use some other means.
 */
static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
{
	struct fuse_inode *fi = get_fuse_inode(inode);

	wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
}

/*
 * Wait for all pending writepages on the inode to finish.
 *
 * This is currently done by blocking further writes with FUSE_NOWRITE
 * and waiting for all sent writes to complete.
 *
 * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
 * could conflict with truncation.
 */
static void fuse_sync_writes(struct inode *inode)
{
	fuse_set_nowrite(inode);
	fuse_release_nowrite(inode);
}

static int fuse_flush(struct file *file, fl_owner_t id)
{
	struct inode *inode = file_inode(file);
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_file *ff = file->private_data;
	struct fuse_flush_in inarg;
	FUSE_ARGS(args);
	int err;

	if (is_bad_inode(inode))
		return -EIO;

	if (fc->no_flush)
		return 0;

	err = write_inode_now(inode, 1);
	if (err)
		return err;

	inode_lock(inode);
	fuse_sync_writes(inode);
	inode_unlock(inode);

	err = filemap_check_errors(file->f_mapping);
	if (err)
		return err;

	memset(&inarg, 0, sizeof(inarg));
	inarg.fh = ff->fh;
	inarg.lock_owner = fuse_lock_owner_id(fc, id);
	args.opcode = FUSE_FLUSH;
	args.nodeid = get_node_id(inode);
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.force = true;

	err = fuse_simple_request(fc, &args);
	if (err == -ENOSYS) {
		fc->no_flush = 1;
		err = 0;
	}
	return err;
}

int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
		      int datasync, int opcode)
{
	struct inode *inode = file->f_mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_file *ff = file->private_data;
	FUSE_ARGS(args);
	struct fuse_fsync_in inarg;

	memset(&inarg, 0, sizeof(inarg));
	inarg.fh = ff->fh;
	inarg.fsync_flags = datasync ? FUSE_FSYNC_FDATASYNC : 0;
	args.opcode = opcode;
	args.nodeid = get_node_id(inode);
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	return fuse_simple_request(fc, &args);
}

static int fuse_fsync(struct file *file, loff_t start, loff_t end,
		      int datasync)
{
	struct inode *inode = file->f_mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	int err;

	if (is_bad_inode(inode))
		return -EIO;

	inode_lock(inode);

	/*
	 * Start writeback against all dirty pages of the inode, then
	 * wait for all outstanding writes, before sending the FSYNC
	 * request.
	 */
	err = file_write_and_wait_range(file, start, end);
	if (err)
		goto out;

	fuse_sync_writes(inode);

	/*
	 * Due to the implementation of fuse writeback,
	 * file_write_and_wait_range() does not catch errors.
	 * We have to do this directly after fuse_sync_writes()
	 */
	err = file_check_and_advance_wb_err(file);
	if (err)
		goto out;

	err = sync_inode_metadata(inode, 1);
	if (err)
		goto out;

	if (fc->no_fsync)
		goto out;

	err = fuse_fsync_common(file, start, end, datasync, FUSE_FSYNC);
	if (err == -ENOSYS) {
		fc->no_fsync = 1;
		err = 0;
	}
out:
	inode_unlock(inode);

	return err;
}

void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
			 size_t count, int opcode)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_args *args = &ia->ap.args;

	ia->read.in.fh = ff->fh;
	ia->read.in.offset = pos;
	ia->read.in.size = count;
	ia->read.in.flags = file->f_flags;
	args->opcode = opcode;
	args->nodeid = ff->nodeid;
	args->in_numargs = 1;
	args->in_args[0].size = sizeof(ia->read.in);
	args->in_args[0].value = &ia->read.in;
	args->out_argvar = true;
	args->out_numargs = 1;
	args->out_args[0].size = count;
}

static void fuse_release_user_pages(struct fuse_args_pages *ap,
				    bool should_dirty)
{
	unsigned int i;

	for (i = 0; i < ap->num_pages; i++) {
		if (should_dirty)
			set_page_dirty_lock(ap->pages[i]);
		put_page(ap->pages[i]);
	}
}

static void fuse_io_release(struct kref *kref)
{
	kfree(container_of(kref, struct fuse_io_priv, refcnt));
}

static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
{
	if (io->err)
		return io->err;

	if (io->bytes >= 0 && io->write)
		return -EIO;

	return io->bytes < 0 ? io->size : io->bytes;
}

/**
 * In case of short read, the caller sets 'pos' to the position of
 * actual end of fuse request in IO request. Otherwise, if bytes_requested
 * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
 *
 * An example:
 * User requested DIO read of 64K. It was split into two 32K fuse requests,
 * both submitted asynchronously. The first of them was ACKed by userspace as
 * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
 * second request was ACKed as short, e.g. only 1K was read, resulting in
 * pos == 33K.
 *
 * Thus, when all fuse requests are completed, the minimal non-negative 'pos'
 * will be equal to the length of the longest contiguous fragment of
 * transferred data starting from the beginning of IO request.
 */
static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
{
	int left;

	spin_lock(&io->lock);
	if (err)
		io->err = io->err ? : err;
	else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
		io->bytes = pos;

	left = --io->reqs;
	if (!left && io->blocking)
		complete(io->done);
	spin_unlock(&io->lock);

	if (!left && !io->blocking) {
		ssize_t res = fuse_get_res_by_io(io);

		if (res >= 0) {
			struct inode *inode = file_inode(io->iocb->ki_filp);
			struct fuse_conn *fc = get_fuse_conn(inode);
			struct fuse_inode *fi = get_fuse_inode(inode);

			spin_lock(&fi->lock);
			fi->attr_version = atomic64_inc_return(&fc->attr_version);
			spin_unlock(&fi->lock);
		}

		io->iocb->ki_complete(io->iocb, res, 0);
	}

	kref_put(&io->refcnt, fuse_io_release);
}

static struct fuse_io_args *fuse_io_alloc(struct fuse_io_priv *io,
					  unsigned int npages)
{
	struct fuse_io_args *ia;

	ia = kzalloc(sizeof(*ia), GFP_KERNEL);
	if (ia) {
		ia->io = io;
		ia->ap.pages = fuse_pages_alloc(npages, GFP_KERNEL,
						&ia->ap.descs);
		if (!ia->ap.pages) {
			kfree(ia);
			ia = NULL;
		}
	}
	return ia;
}

static void fuse_io_free(struct fuse_io_args *ia)
{
	kfree(ia->ap.pages);
	kfree(ia);
}

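/*
 * Completion handler for a single asynchronous read/write fragment.
 * Works out where a short transfer ended relative to the whole I/O and
 * folds the result into the parent fuse_io_priv.
 */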
static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_args *args,
				  int err)
{
	struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
	struct fuse_io_priv *io = ia->io;
	ssize_t pos = -1;

	fuse_release_user_pages(&ia->ap, io->should_dirty);

	if (err) {
		/* Nothing */
	} else if (io->write) {
		if (ia->write.out.size > ia->write.in.size) {
			err = -EIO;
		} else if (ia->write.in.size != ia->write.out.size) {
			pos = ia->write.in.offset - io->offset +
				ia->write.out.size;
		}
	} else {
		u32 outsize = args->out_args[0].size;

		if (ia->read.in.size != outsize)
			pos = ia->read.in.offset - io->offset + outsize;
	}

	fuse_aio_complete(io, err, pos);
	fuse_io_free(ia);
}

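/*
 * Submit one fragment of an asynchronous I/O: account it in the parent
 * fuse_io_priv (extra reference, size and request count) and send it in
 * the background.  If submission fails, the completion callback is
 * invoked by hand so the accounting stays balanced.
 */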
static ssize_t fuse_async_req_send(struct fuse_conn *fc,
				   struct fuse_io_args *ia, size_t num_bytes)
{
	ssize_t err;
	struct fuse_io_priv *io = ia->io;

	spin_lock(&io->lock);
	kref_get(&io->refcnt);
	io->size += num_bytes;
	io->reqs++;
	spin_unlock(&io->lock);

	ia->ap.args.end = fuse_aio_complete_req;
	err = fuse_simple_background(fc, &ia->ap.args, GFP_KERNEL);
	if (err)
		fuse_aio_complete_req(fc, &ia->ap.args, err);

	return num_bytes;
}

static ssize_t fuse_send_read(struct fuse_io_args *ia, loff_t pos, size_t count,
			      fl_owner_t owner)
{
	struct file *file = ia->io->iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;

	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
	if (owner != NULL) {
		ia->read.in.read_flags |= FUSE_READ_LOCKOWNER;
		ia->read.in.lock_owner = fuse_lock_owner_id(fc, owner);
	}

	if (ia->io->async)
		return fuse_async_req_send(fc, ia, count);

	return fuse_simple_request(fc, &ia->ap.args);
}

static void fuse_read_update_size(struct inode *inode, loff_t size,
				  u64 attr_ver)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);

	spin_lock(&fi->lock);
	if (attr_ver == fi->attr_version && size < inode->i_size &&
	    !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
		fi->attr_version = atomic64_inc_return(&fc->attr_version);
		i_size_write(inode, size);
	}
	spin_unlock(&fi->lock);
}

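/*
 * Handle a short read.  With writeback cache the unread tail is a hole
 * that must read back as zeroes, so the remaining pages are zero-filled;
 * otherwise a short read means EOF and the cached i_size is shrunk.
 */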
static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read,
			    struct fuse_args_pages *ap)
{
	struct fuse_conn *fc = get_fuse_conn(inode);

	if (fc->writeback_cache) {
		/*
		 * A hole in a file. Some data after the hole are in page cache,
		 * but have not reached the client fs yet. So, the hole is not
		 * present there.
		 */
		int i;
		int start_idx = num_read >> PAGE_SHIFT;
		size_t off = num_read & (PAGE_SIZE - 1);

		for (i = start_idx; i < ap->num_pages; i++) {
			zero_user_segment(ap->pages[i], off, PAGE_SIZE);
			off = 0;
		}
	} else {
		loff_t pos = page_offset(ap->pages[0]) + num_read;
		fuse_read_update_size(inode, pos, attr_ver);
	}
}

static int fuse_do_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	loff_t pos = page_offset(page);
	struct fuse_page_desc desc = { .length = PAGE_SIZE };
	struct fuse_io_args ia = {
		.ap.args.page_zeroing = true,
		.ap.args.out_pages = true,
		.ap.num_pages = 1,
		.ap.pages = &page,
		.ap.descs = &desc,
	};
	ssize_t res;
	u64 attr_ver;

	/*
	 * Page writeback can extend beyond the lifetime of the
	 * page-cache page, so make sure we read a properly synced
	 * page.
	 */
	fuse_wait_on_page_writeback(inode, page->index);

	attr_ver = fuse_get_attr_version(fc);

	fuse_read_args_fill(&ia, file, pos, desc.length, FUSE_READ);
	res = fuse_simple_request(fc, &ia.ap.args);
	if (res < 0)
		return res;
	/*
	 * Short read means EOF.  If file size is larger, truncate it
	 */
	if (res < desc.length)
		fuse_short_read(inode, attr_ver, res, &ia.ap);

	SetPageUptodate(page);

	return 0;
}

static int fuse_readpage(struct file *file, struct page *page)
{
	struct inode *inode = page->mapping->host;
	int err;

	err = -EIO;
	if (is_bad_inode(inode))
		goto out;

	err = fuse_do_readpage(file, page);
	fuse_invalidate_atime(inode);
 out:
	unlock_page(page);
	return err;
}

static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_args *args,
			       int err)
{
	int i;
	struct fuse_io_args *ia = container_of(args, typeof(*ia), ap.args);
	struct fuse_args_pages *ap = &ia->ap;
	size_t count = ia->read.in.size;
	size_t num_read = args->out_args[0].size;
	struct address_space *mapping = NULL;

	for (i = 0; mapping == NULL && i < ap->num_pages; i++)
		mapping = ap->pages[i]->mapping;

	if (mapping) {
		struct inode *inode = mapping->host;

		/*
		 * Short read means EOF. If file size is larger, truncate it
		 */
		if (!err && num_read < count)
			fuse_short_read(inode, ia->read.attr_ver, num_read, ap);

		fuse_invalidate_atime(inode);
	}

	for (i = 0; i < ap->num_pages; i++) {
		struct page *page = ap->pages[i];

		if (!err)
			SetPageUptodate(page);
		else
			SetPageError(page);
		unlock_page(page);
		put_page(page);
	}
	if (ia->ff)
		fuse_file_put(ia->ff, false, false);

	fuse_io_free(ia);
}

static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;
	struct fuse_args_pages *ap = &ia->ap;
	loff_t pos = page_offset(ap->pages[0]);
	size_t count = ap->num_pages << PAGE_SHIFT;
	int err;

	ap->args.out_pages = true;
	ap->args.page_zeroing = true;
	ap->args.page_replace = true;
	fuse_read_args_fill(ia, file, pos, count, FUSE_READ);
	ia->read.attr_ver = fuse_get_attr_version(fc);
	if (fc->async_read) {
		ia->ff = fuse_file_get(ff);
		ap->args.end = fuse_readpages_end;
		err = fuse_simple_background(fc, &ap->args, GFP_KERNEL);
		if (!err)
			return;
	} else {
		err = fuse_simple_request(fc, &ap->args);
	}
	fuse_readpages_end(fc, &ap->args, err);
}

struct fuse_fill_data {
	struct fuse_io_args *ia;
	struct file *file;
	struct inode *inode;
	unsigned int nr_pages;
	unsigned int max_pages;
};

static int fuse_readpages_fill(void *_data, struct page *page)
{
	struct fuse_fill_data *data = _data;
	struct fuse_io_args *ia = data->ia;
	struct fuse_args_pages *ap = &ia->ap;
	struct inode *inode = data->inode;
	struct fuse_conn *fc = get_fuse_conn(inode);

	fuse_wait_on_page_writeback(inode, page->index);

	if (ap->num_pages &&
	    (ap->num_pages == fc->max_pages ||
	     (ap->num_pages + 1) * PAGE_SIZE > fc->max_read ||
	     ap->pages[ap->num_pages - 1]->index + 1 != page->index)) {
		data->max_pages = min_t(unsigned int, data->nr_pages,
					fc->max_pages);
		fuse_send_readpages(ia, data->file);
		data->ia = ia = fuse_io_alloc(NULL, data->max_pages);
		if (!ia) {
			unlock_page(page);
			return -ENOMEM;
		}
		ap = &ia->ap;
	}

	if (WARN_ON(ap->num_pages >= data->max_pages)) {
		unlock_page(page);
		fuse_io_free(ia);
		return -EIO;
	}

	get_page(page);
	ap->pages[ap->num_pages] = page;
	ap->descs[ap->num_pages].length = PAGE_SIZE;
	ap->num_pages++;
	data->nr_pages--;
	return 0;
}

static int fuse_readpages(struct file *file, struct address_space *mapping,
			  struct list_head *pages, unsigned nr_pages)
{
	struct inode *inode = mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_fill_data data;
	int err;

	err = -EIO;
	if (is_bad_inode(inode))
		goto out;

	data.file = file;
	data.inode = inode;
	data.nr_pages = nr_pages;
	data.max_pages = min_t(unsigned int, nr_pages, fc->max_pages);
	data.ia = fuse_io_alloc(NULL, data.max_pages);
	err = -ENOMEM;
	if (!data.ia)
		goto out;

	err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
	if (!err) {
		if (data.ia->ap.num_pages)
			fuse_send_readpages(data.ia, file);
		else
			fuse_io_free(data.ia);
	}
out:
	return err;
}

static ssize_t fuse_cache_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = iocb->ki_filp->f_mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);

	/*
	 * In auto invalidate mode, always update attributes on read.
	 * Otherwise, only update if we attempt to read past EOF (to ensure
	 * i_size is up to date).
	 */
	if (fc->auto_inval_data ||
	    (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
		int err;

		err = fuse_update_attributes(inode, iocb->ki_filp);
		if (err)
			return err;
	}

	return generic_file_read_iter(iocb, to);
}

static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff,
				 loff_t pos, size_t count)
{
	struct fuse_args *args = &ia->ap.args;

	ia->write.in.fh = ff->fh;
	ia->write.in.offset = pos;
	ia->write.in.size = count;
	args->opcode = FUSE_WRITE;
	args->nodeid = ff->nodeid;
	args->in_numargs = 2;
	if (ff->fc->minor < 9)
		args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
	else
		args->in_args[0].size = sizeof(ia->write.in);
	args->in_args[0].value = &ia->write.in;
	args->in_args[1].size = count;
	args->out_numargs = 1;
	args->out_args[0].size = sizeof(ia->write.out);
	args->out_args[0].value = &ia->write.out;
}

static unsigned int fuse_write_flags(struct kiocb *iocb)
{
	unsigned int flags = iocb->ki_filp->f_flags;

	if (iocb->ki_flags & IOCB_DSYNC)
		flags |= O_DSYNC;
	if (iocb->ki_flags & IOCB_SYNC)
		flags |= O_SYNC;

	return flags;
}

static ssize_t fuse_send_write(struct fuse_io_args *ia, loff_t pos,
			       size_t count, fl_owner_t owner)
{
	struct kiocb *iocb = ia->io->iocb;
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;
	struct fuse_write_in *inarg = &ia->write.in;
	ssize_t err;

	fuse_write_args_fill(ia, ff, pos, count);
	inarg->flags = fuse_write_flags(iocb);
	if (owner != NULL) {
		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
		inarg->lock_owner = fuse_lock_owner_id(fc, owner);
	}

	if (ia->io->async)
		return fuse_async_req_send(fc, ia, count);

	err = fuse_simple_request(fc, &ia->ap.args);
	if (!err && ia->write.out.size > count)
		err = -EIO;

	return err ?: ia->write.out.size;
}

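/*
 * Extend the cached i_size after a successful write.  Returns true if
 * the size was actually updated.
 */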
bool fuse_write_update_size(struct inode *inode, loff_t pos)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	bool ret = false;

	spin_lock(&fi->lock);
	fi->attr_version = atomic64_inc_return(&fc->attr_version);
	if (pos > inode->i_size) {
		i_size_write(inode, pos);
		ret = true;
	}
	spin_unlock(&fi->lock);

	return ret;
}

static ssize_t fuse_send_write_pages(struct fuse_io_args *ia,
				     struct kiocb *iocb, struct inode *inode,
				     loff_t pos, size_t count)
{
	struct fuse_args_pages *ap = &ia->ap;
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;
	unsigned int offset, i;
	int err;

	for (i = 0; i < ap->num_pages; i++)
		fuse_wait_on_page_writeback(inode, ap->pages[i]->index);

	fuse_write_args_fill(ia, ff, pos, count);
	ia->write.in.flags = fuse_write_flags(iocb);

	err = fuse_simple_request(fc, &ap->args);
	if (!err && ia->write.out.size > count)
		err = -EIO;

	offset = ap->descs[0].offset;
	count = ia->write.out.size;
	for (i = 0; i < ap->num_pages; i++) {
		struct page *page = ap->pages[i];

		if (!err && !offset && count >= PAGE_SIZE)
			SetPageUptodate(page);

		if (count > PAGE_SIZE - offset)
			count -= PAGE_SIZE - offset;
		else
			count = 0;
		offset = 0;

		unlock_page(page);
		put_page(page);
	}

	return err;
}

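/*
 * Copy data from the iterator into freshly grabbed page-cache pages,
 * stopping at the max_write limit, the page limit, or after a single
 * page if the server doesn't support big writes.
 */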
static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap,
				     struct address_space *mapping,
				     struct iov_iter *ii, loff_t pos,
				     unsigned int max_pages)
{
	struct fuse_conn *fc = get_fuse_conn(mapping->host);
	unsigned offset = pos & (PAGE_SIZE - 1);
	size_t count = 0;
	int err;

	ap->args.in_pages = true;
	ap->descs[0].offset = offset;

	do {
		size_t tmp;
		struct page *page;
		pgoff_t index = pos >> PAGE_SHIFT;
		size_t bytes = min_t(size_t, PAGE_SIZE - offset,
				     iov_iter_count(ii));

		bytes = min_t(size_t, bytes, fc->max_write - count);

 again:
		err = -EFAULT;
		if (iov_iter_fault_in_readable(ii, bytes))
			break;

		err = -ENOMEM;
		page = grab_cache_page_write_begin(mapping, index, 0);
		if (!page)
			break;

		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
		flush_dcache_page(page);

		iov_iter_advance(ii, tmp);
		if (!tmp) {
			unlock_page(page);
			put_page(page);
			bytes = min(bytes, iov_iter_single_seg_count(ii));
			goto again;
		}

		err = 0;
		ap->pages[ap->num_pages] = page;
		ap->descs[ap->num_pages].length = tmp;
		ap->num_pages++;

		count += tmp;
		pos += tmp;
		offset += tmp;
		if (offset == PAGE_SIZE)
			offset = 0;

		if (!fc->big_writes)
			break;
	} while (iov_iter_count(ii) && count < fc->max_write &&
		 ap->num_pages < max_pages && offset == 0);

	return count > 0 ? count : err;
}

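/*
 * Number of pages spanned by a write of 'len' bytes at position 'pos',
 * clamped to the per-request page limit.
 */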
static inline unsigned int fuse_wr_pages(loff_t pos, size_t len,
				     unsigned int max_pages)
{
	return min_t(unsigned int,
		     ((pos + len - 1) >> PAGE_SHIFT) -
		     (pos >> PAGE_SHIFT) + 1,
		     max_pages);
}

static ssize_t fuse_perform_write(struct kiocb *iocb,
				  struct address_space *mapping,
				  struct iov_iter *ii, loff_t pos)
{
	struct inode *inode = mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	int err = 0;
	ssize_t res = 0;

	if (inode->i_size < pos + iov_iter_count(ii))
		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	do {
		ssize_t count;
		struct fuse_io_args ia = {};
		struct fuse_args_pages *ap = &ia.ap;
		unsigned int nr_pages = fuse_wr_pages(pos, iov_iter_count(ii),
						      fc->max_pages);

		ap->pages = fuse_pages_alloc(nr_pages, GFP_KERNEL, &ap->descs);
		if (!ap->pages) {
			err = -ENOMEM;
			break;
		}

		count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages);
		if (count <= 0) {
			err = count;
		} else {
			err = fuse_send_write_pages(&ia, iocb, inode,
						    pos, count);
			if (!err) {
				size_t num_written = ia.write.out.size;

				res += num_written;
				pos += num_written;

				/* break out of the loop on short write */
				if (num_written != count)
					err = -EIO;
			}
		}
		kfree(ap->pages);
	} while (!err && iov_iter_count(ii));

	if (res > 0)
		fuse_write_update_size(inode, pos);

	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
	fuse_invalidate_attr(inode);

	return res > 0 ? res : err;
}

static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	ssize_t written = 0;
	ssize_t written_buffered = 0;
	struct inode *inode = mapping->host;
	ssize_t err;
	loff_t endbyte = 0;

	if (get_fuse_conn(inode)->writeback_cache) {
		/* Update size (EOF optimization) and mode (SUID clearing) */
		err = fuse_update_attributes(mapping->host, file);
		if (err)
			return err;

		return generic_file_write_iter(iocb, from);
	}

	inode_lock(inode);

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);

	err = generic_write_checks(iocb, from);
	if (err <= 0)
		goto out;

	err = file_remove_privs(file);
	if (err)
		goto out;

	err = file_update_time(file);
	if (err)
		goto out;

	if (iocb->ki_flags & IOCB_DIRECT) {
		loff_t pos = iocb->ki_pos;
		written = generic_file_direct_write(iocb, from);
		if (written < 0 || !iov_iter_count(from))
			goto out;

		pos += written;

		written_buffered = fuse_perform_write(iocb, mapping, from, pos);
		if (written_buffered < 0) {
			err = written_buffered;
			goto out;
		}
		endbyte = pos + written_buffered - 1;

		err = filemap_write_and_wait_range(file->f_mapping, pos,
						   endbyte);
		if (err)
			goto out;

		invalidate_mapping_pages(file->f_mapping,
					 pos >> PAGE_SHIFT,
					 endbyte >> PAGE_SHIFT);

		written += written_buffered;
		iocb->ki_pos = pos + written_buffered;
	} else {
		written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos);
		if (written >= 0)
			iocb->ki_pos += written;
	}
out:
	current->backing_dev_info = NULL;
	inode_unlock(inode);
	if (written > 0)
		written = generic_write_sync(iocb, written);

	return written ? written : err;
}

static inline void fuse_page_descs_length_init(struct fuse_page_desc *descs,
					       unsigned int index,
					       unsigned int nr_pages)
{
	int i;

	for (i = index; i < index + nr_pages; i++)
		descs[i].length = PAGE_SIZE - descs[i].offset;
}

static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
{
	return (unsigned long)ii->iov->iov_base + ii->iov_offset;
}

static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
					size_t max_size)
{
	return min(iov_iter_single_seg_count(ii), max_size);
}

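/*
 * Pin the user pages backing the iterator into the request.  Kernel
 * (kvec) buffers are passed by address instead of being pinned.  On
 * return *nbytesp holds the number of bytes actually packed.
 */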
static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
			       size_t *nbytesp, int write,
			       unsigned int max_pages)
{
	size_t nbytes = 0;  /* # bytes already packed in req */
	ssize_t ret = 0;

	/* Special case for kernel I/O: can copy directly into the buffer */
	if (iov_iter_is_kvec(ii)) {
		unsigned long user_addr = fuse_get_user_addr(ii);
		size_t frag_size = fuse_get_frag_size(ii, *nbytesp);

		if (write)
			ap->args.in_args[1].value = (void *) user_addr;
		else
			ap->args.out_args[0].value = (void *) user_addr;

		iov_iter_advance(ii, frag_size);
		*nbytesp = frag_size;
		return 0;
	}

	while (nbytes < *nbytesp && ap->num_pages < max_pages) {
		unsigned npages;
		size_t start;
		ret = iov_iter_get_pages(ii, &ap->pages[ap->num_pages],
					*nbytesp - nbytes,
					max_pages - ap->num_pages,
					&start);
		if (ret < 0)
			break;

		iov_iter_advance(ii, ret);
		nbytes += ret;

		ret += start;
		npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;

		ap->descs[ap->num_pages].offset = start;
		fuse_page_descs_length_init(ap->descs, ap->num_pages, npages);

		ap->num_pages += npages;
		ap->descs[ap->num_pages - 1].length -=
			(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
	}

	if (write)
		ap->args.in_pages = 1;
	else
		ap->args.out_pages = 1;

	*nbytesp = nbytes;

	return ret < 0 ? ret : 0;
}

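/*
 * Core of direct I/O: split the iterator into max_read/max_write sized
 * requests and send them one after the other (or concurrently when
 * io->async is set), stopping early on a short transfer or an error.
 */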
ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
		       loff_t *ppos, int flags)
{
	int write = flags & FUSE_DIO_WRITE;
	int cuse = flags & FUSE_DIO_CUSE;
	struct file *file = io->iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;
	size_t nmax = write ? fc->max_write : fc->max_read;
	loff_t pos = *ppos;
	size_t count = iov_iter_count(iter);
	pgoff_t idx_from = pos >> PAGE_SHIFT;
	pgoff_t idx_to = (pos + count - 1) >> PAGE_SHIFT;
	ssize_t res = 0;
	int err = 0;
	struct fuse_io_args *ia;
	unsigned int max_pages;

	max_pages = iov_iter_npages(iter, fc->max_pages);
	ia = fuse_io_alloc(io, max_pages);
	if (!ia)
		return -ENOMEM;

	ia->io = io;
	if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
		if (!write)
			inode_lock(inode);
		fuse_sync_writes(inode);
		if (!write)
			inode_unlock(inode);
	}

	io->should_dirty = !write && iter_is_iovec(iter);
	while (count) {
		ssize_t nres;
		fl_owner_t owner = current->files;
		size_t nbytes = min(count, nmax);

		err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write,
					  max_pages);
		if (err && !nbytes)
			break;

		if (write) {
			if (!capable(CAP_FSETID))
				ia->write.in.write_flags |= FUSE_WRITE_KILL_PRIV;

			nres = fuse_send_write(ia, pos, nbytes, owner);
		} else {
			nres = fuse_send_read(ia, pos, nbytes, owner);
		}

		if (!io->async || nres < 0) {
			fuse_release_user_pages(&ia->ap, io->should_dirty);
			fuse_io_free(ia);
		}
		ia = NULL;
		if (nres < 0) {
			err = nres;
			break;
		}
		WARN_ON(nres > nbytes);

		count -= nres;
		res += nres;
		pos += nres;
		if (nres != nbytes)
			break;
		if (count) {
			max_pages = iov_iter_npages(iter, fc->max_pages);
			ia = fuse_io_alloc(io, max_pages);
			if (!ia)
				break;
		}
	}
	if (ia)
		fuse_io_free(ia);
	if (res > 0)
		*ppos = pos;

	return res > 0 ? res : err;
}
EXPORT_SYMBOL_GPL(fuse_direct_io);

static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
				  struct iov_iter *iter,
				  loff_t *ppos)
{
	ssize_t res;
	struct inode *inode = file_inode(io->iocb->ki_filp);

	res = fuse_direct_io(io, iter, ppos, 0);

	fuse_invalidate_atime(inode);

	return res;
}

static ssize_t fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter);

static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	ssize_t res;

	if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
		res = fuse_direct_IO(iocb, to);
	} else {
		struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);

		res = __fuse_direct_read(&io, to, &iocb->ki_pos);
	}

	return res;
}

static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
	ssize_t res;

	/* Don't allow parallel writes to the same file */
	inode_lock(inode);
	res = generic_write_checks(iocb, from);
	if (res > 0) {
		if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
			res = fuse_direct_IO(iocb, from);
		} else {
			res = fuse_direct_io(&io, from, &iocb->ki_pos,
					     FUSE_DIO_WRITE);
		}
	}
	fuse_invalidate_attr(inode);
	if (res > 0)
		fuse_write_update_size(inode, iocb->ki_pos);
	inode_unlock(inode);

	return res;
}

static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;

	if (is_bad_inode(file_inode(file)))
		return -EIO;

	if (!(ff->open_flags & FOPEN_DIRECT_IO))
		return fuse_cache_read_iter(iocb, to);
	else
		return fuse_direct_read_iter(iocb, to);
}

static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;

	if (is_bad_inode(file_inode(file)))
		return -EIO;

	if (!(ff->open_flags & FOPEN_DIRECT_IO))
		return fuse_cache_write_iter(iocb, from);
	else
		return fuse_direct_write_iter(iocb, from);
}

static void fuse_writepage_free(struct fuse_writepage_args *wpa)
{
	struct fuse_args_pages *ap = &wpa->ia.ap;
	int i;

	for (i = 0; i < ap->num_pages; i++)
		__free_page(ap->pages[i]);

	if (wpa->ia.ff)
		fuse_file_put(wpa->ia.ff, false, false);

	kfree(ap->pages);
	kfree(wpa);
}

static void fuse_writepage_finish(struct fuse_conn *fc,
				  struct fuse_writepage_args *wpa)
{
	struct fuse_args_pages *ap = &wpa->ia.ap;
	struct inode *inode = wpa->inode;
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct backing_dev_info *bdi = inode_to_bdi(inode);
	int i;

	list_del(&wpa->writepages_entry);
	for (i = 0; i < ap->num_pages; i++) {
		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
		dec_node_page_state(ap->pages[i], NR_WRITEBACK_TEMP);
		wb_writeout_inc(&bdi->wb);
	}
	wake_up(&fi->page_waitq);
}

/* Called under fi->lock, may release and reacquire it */
static void fuse_send_writepage(struct fuse_conn *fc,
				struct fuse_writepage_args *wpa, loff_t size)
__releases(fi->lock)
__acquires(fi->lock)
{
	struct fuse_writepage_args *aux, *next;
	struct fuse_inode *fi = get_fuse_inode(wpa->inode);
	struct fuse_write_in *inarg = &wpa->ia.write.in;
	struct fuse_args *args = &wpa->ia.ap.args;
	__u64 data_size = wpa->ia.ap.num_pages * PAGE_SIZE;
	int err;

	fi->writectr++;
	if (inarg->offset + data_size <= size) {
		inarg->size = data_size;
	} else if (inarg->offset < size) {
		inarg->size = size - inarg->offset;
	} else {
		/* Got truncated off completely */
		goto out_free;
	}

	args->in_args[1].size = inarg->size;
	args->force = true;
	args->nocreds = true;

	err = fuse_simple_background(fc, args, GFP_ATOMIC);
	if (err == -ENOMEM) {
		spin_unlock(&fi->lock);
		err = fuse_simple_background(fc, args, GFP_NOFS | __GFP_NOFAIL);
		spin_lock(&fi->lock);
	}

	/* Fails on broken connection only */
	if (unlikely(err))
		goto out_free;

	return;

 out_free:
	fi->writectr--;
	fuse_writepage_finish(fc, wpa);
	spin_unlock(&fi->lock);

	/* After fuse_writepage_finish() aux request list is private */
	for (aux = wpa->next; aux; aux = next) {
		next = aux->next;
		aux->next = NULL;
		fuse_writepage_free(aux);
	}

	fuse_writepage_free(wpa);
	spin_lock(&fi->lock);
}

/*
 * If fi->writectr is positive (no truncate or fsync going on) send
 * all queued writepage requests.
 *
 * Called with fi->lock
 */
void fuse_flush_writepages(struct inode *inode)
__releases(fi->lock)
__acquires(fi->lock)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	loff_t crop = i_size_read(inode);
	struct fuse_writepage_args *wpa;

	while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
		wpa = list_entry(fi->queued_writes.next,
				 struct fuse_writepage_args, queue_entry);
		list_del_init(&wpa->queue_entry);
		fuse_send_writepage(fc, wpa, crop);
	}
}

static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_args *args,
			       int error)
{
	struct fuse_writepage_args *wpa =
		container_of(args, typeof(*wpa), ia.ap.args);
	struct inode *inode = wpa->inode;
	struct fuse_inode *fi = get_fuse_inode(inode);

	mapping_set_error(inode->i_mapping, error);
	spin_lock(&fi->lock);
	while (wpa->next) {
		struct fuse_conn *fc = get_fuse_conn(inode);
		struct fuse_write_in *inarg = &wpa->ia.write.in;
		struct fuse_writepage_args *next = wpa->next;

		wpa->next = next->next;
		next->next = NULL;
		next->ia.ff = fuse_file_get(wpa->ia.ff);
		list_add(&next->writepages_entry, &fi->writepages);

		/*
		 * Skip fuse_flush_writepages() to make it easy to crop requests
		 * based on primary request size.
		 *
		 * 1st case (trivial): there are no concurrent activities using
		 * fuse_set/release_nowrite.  Then we're on safe side because
		 * fuse_flush_writepages() would call fuse_send_writepage()
		 * anyway.
		 *
		 * 2nd case: someone called fuse_set_nowrite and it is waiting
		 * now for completion of all in-flight requests.  This happens
		 * rarely and no more than once per page, so this should be
		 * okay.
		 *
		 * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
		 * of fuse_set_nowrite..fuse_release_nowrite section.  The fact
		 * that fuse_set_nowrite returned implies that all in-flight
		 * requests were completed along with all of their secondary
		 * requests.  Further primary requests are blocked by negative
		 * writectr.  Hence there cannot be any in-flight requests and
		 * no invocations of fuse_writepage_end() while we're in
		 * fuse_set_nowrite..fuse_release_nowrite section.
		 */
		fuse_send_writepage(fc, next, inarg->offset + inarg->size);
	}
	fi->writectr--;
	fuse_writepage_finish(fc, wpa);
	spin_unlock(&fi->lock);
	fuse_writepage_free(wpa);
}

static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
					       struct fuse_inode *fi)
{
	struct fuse_file *ff = NULL;

	spin_lock(&fi->lock);
	if (!list_empty(&fi->write_files)) {
		ff = list_entry(fi->write_files.next, struct fuse_file,
				write_entry);
		fuse_file_get(ff);
	}
	spin_unlock(&fi->lock);

	return ff;
}

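/*
 * Like __fuse_write_file_get(), but warn when no writable file is found:
 * writeback callers expect that at least one file was linked onto
 * fi->write_files via fuse_link_write_file() while dirty pages exist.
 */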
static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
					     struct fuse_inode *fi)
{
	struct fuse_file *ff = __fuse_write_file_get(fc, fi);
	WARN_ON(!ff);
	return ff;
}

int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_file *ff;
	int err;

	ff = __fuse_write_file_get(fc, fi);
	err = fuse_flush_times(inode, ff);
	if (ff)
		fuse_file_put(ff, false, false);

	return err;
}

static struct fuse_writepage_args *fuse_writepage_args_alloc(void)
{
	struct fuse_writepage_args *wpa;
	struct fuse_args_pages *ap;

	wpa = kzalloc(sizeof(*wpa), GFP_NOFS);
	if (wpa) {
		ap = &wpa->ia.ap;
		ap->num_pages = 0;
		ap->pages = fuse_pages_alloc(1, GFP_NOFS, &ap->descs);
		if (!ap->pages) {
			kfree(wpa);
			wpa = NULL;
		}
	}
	return wpa;
}

static int fuse_writepage_locked(struct page *page)
{
	struct address_space *mapping = page->mapping;
	struct inode *inode = mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_writepage_args *wpa;
	struct fuse_args_pages *ap;
	struct page *tmp_page;
	int error = -ENOMEM;

	set_page_writeback(page);

	wpa = fuse_writepage_args_alloc();
	if (!wpa)
		goto err;
	ap = &wpa->ia.ap;

	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
	if (!tmp_page)
		goto err_free;

	error = -EIO;
	wpa->ia.ff = fuse_write_file_get(fc, fi);
	if (!wpa->ia.ff)
		goto err_nofile;

	fuse_write_args_fill(&wpa->ia, wpa->ia.ff, page_offset(page), 0);

	copy_highpage(tmp_page, page);
	wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
	wpa->next = NULL;
	ap->args.in_pages = true;
	ap->num_pages = 1;
	ap->pages[0] = tmp_page;
	ap->descs[0].offset = 0;
	ap->descs[0].length = PAGE_SIZE;
	ap->args.end = fuse_writepage_end;
	wpa->inode = inode;

	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);

	spin_lock(&fi->lock);
	list_add(&wpa->writepages_entry, &fi->writepages);
	list_add_tail(&wpa->queue_entry, &fi->queued_writes);
	fuse_flush_writepages(inode);
	spin_unlock(&fi->lock);

	end_page_writeback(page);

	return 0;

err_nofile:
	__free_page(tmp_page);
err_free:
	kfree(wpa);
err:
	mapping_set_error(page->mapping, error);
	end_page_writeback(page);
	return error;
}

static int fuse_writepage(struct page *page, struct writeback_control *wbc)
{
	int err;

	if (fuse_page_is_writeback(page->mapping->host, page->index)) {
		/*
		 * ->writepages() should be called for sync() and friends.  We
		 * should only get here on direct reclaim and then we are
		 * allowed to skip a page which is already in flight
		 */
		WARN_ON(wbc->sync_mode == WB_SYNC_ALL);

		redirty_page_for_writepage(wbc, page);
		unlock_page(page);
		return 0;
	}

	err = fuse_writepage_locked(page);
	unlock_page(page);

	return err;
}

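/*
 * Context carried through fuse_writepages_fill(): wpa is the write
 * request currently being filled, ff the file used for the writes,
 * orig_pages the original pages belonging to wpa (the request itself
 * holds temporary copies), and max_pages the current capacity of the
 * request's page array.
 */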
struct fuse_fill_wb_data {
	struct fuse_writepage_args *wpa;
	struct fuse_file *ff;
	struct inode *inode;
	struct page **orig_pages;
	unsigned int max_pages;
};

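/*
 * Grow the page array of the request under construction: roughly double
 * it, starting from at least FUSE_DEFAULT_MAX_PAGES_PER_REQ and never
 * exceeding fc->max_pages.  Returns false if the bigger array cannot be
 * allocated, in which case the caller sends the request as it is.
 */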
static bool fuse_pages_realloc(struct fuse_fill_wb_data *data)
{
	struct fuse_args_pages *ap = &data->wpa->ia.ap;
	struct fuse_conn *fc = get_fuse_conn(data->inode);
	struct page **pages;
	struct fuse_page_desc *descs;
	unsigned int npages = min_t(unsigned int,
				    max_t(unsigned int, data->max_pages * 2,
					  FUSE_DEFAULT_MAX_PAGES_PER_REQ),
				    fc->max_pages);
	WARN_ON(npages <= data->max_pages);

	pages = fuse_pages_alloc(npages, GFP_NOFS, &descs);
	if (!pages)
		return false;

	memcpy(pages, ap->pages, sizeof(struct page *) * ap->num_pages);
	memcpy(descs, ap->descs, sizeof(struct fuse_page_desc) * ap->num_pages);
	kfree(ap->pages);
	ap->pages = pages;
	ap->descs = descs;
	data->max_pages = npages;

	return true;
}

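/*
 * Queue the request built up so far and kick fuse_flush_writepages();
 * the original pages can finish writeback immediately, since progress
 * is tracked via the temporary pages from here on.
 */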
static void fuse_writepages_send(struct fuse_fill_wb_data *data)
{
	struct fuse_writepage_args *wpa = data->wpa;
	struct inode *inode = data->inode;
	struct fuse_inode *fi = get_fuse_inode(inode);
	int num_pages = wpa->ia.ap.num_pages;
	int i;

	wpa->ia.ff = fuse_file_get(data->ff);
	spin_lock(&fi->lock);
	list_add_tail(&wpa->queue_entry, &fi->queued_writes);
	fuse_flush_writepages(inode);
	spin_unlock(&fi->lock);

	for (i = 0; i < num_pages; i++)
		end_page_writeback(data->orig_pages[i]);
}

/*
 * First recheck under fi->lock if the offending offset is still under
 * writeback.  If yes, then iterate auxiliary write requests, to see if there's
 * one already added for a page at this offset.  If there's none, then insert
 * this new request onto the auxiliary list, otherwise reuse the existing one by
 * copying the new page contents over to the old temporary page.
 */
static bool fuse_writepage_in_flight(struct fuse_writepage_args *new_wpa,
				     struct page *page)
{
	struct fuse_inode *fi = get_fuse_inode(new_wpa->inode);
	struct fuse_writepage_args *tmp;
	struct fuse_writepage_args *old_wpa;
	struct fuse_args_pages *new_ap = &new_wpa->ia.ap;

	WARN_ON(new_ap->num_pages != 0);

	spin_lock(&fi->lock);
	list_del(&new_wpa->writepages_entry);
	old_wpa = fuse_find_writeback(fi, page->index, page->index);
	if (!old_wpa) {
		list_add(&new_wpa->writepages_entry, &fi->writepages);
		spin_unlock(&fi->lock);
		return false;
	}

	new_ap->num_pages = 1;
	for (tmp = old_wpa->next; tmp; tmp = tmp->next) {
		pgoff_t curr_index;

		WARN_ON(tmp->inode != new_wpa->inode);
		curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT;
		if (curr_index == page->index) {
			WARN_ON(tmp->ia.ap.num_pages != 1);
			swap(tmp->ia.ap.pages[0], new_ap->pages[0]);
			break;
		}
	}

	if (!tmp) {
		new_wpa->next = old_wpa->next;
		old_wpa->next = new_wpa;
	}

	spin_unlock(&fi->lock);

	if (tmp) {
		struct backing_dev_info *bdi = inode_to_bdi(new_wpa->inode);

		dec_wb_stat(&bdi->wb, WB_WRITEBACK);
		dec_node_page_state(new_ap->pages[0], NR_WRITEBACK_TEMP);
		wb_writeout_inc(&bdi->wb);
		fuse_writepage_free(new_wpa);
	}

	return true;
}

static int fuse_writepages_fill(struct page *page,
		struct writeback_control *wbc, void *_data)
{
	struct fuse_fill_wb_data *data = _data;
	struct fuse_writepage_args *wpa = data->wpa;
	struct fuse_args_pages *ap = &wpa->ia.ap;
	struct inode *inode = data->inode;
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct page *tmp_page;
	bool is_writeback;
	int err;

	if (!data->ff) {
		err = -EIO;
		data->ff = fuse_write_file_get(fc, fi);
		if (!data->ff)
			goto out_unlock;
	}

	/*
	 * Being under writeback is unlikely but possible.  For example direct
	 * read to an mmaped fuse file will set the page dirty twice; once when
	 * the pages are faulted with get_user_pages(), and then after the read
	 * completed.
	 */
	is_writeback = fuse_page_is_writeback(inode, page->index);

	if (wpa && ap->num_pages &&
	    (is_writeback || ap->num_pages == fc->max_pages ||
	     (ap->num_pages + 1) * PAGE_SIZE > fc->max_write ||
	     data->orig_pages[ap->num_pages - 1]->index + 1 != page->index)) {
		fuse_writepages_send(data);
		data->wpa = NULL;
	} else if (wpa && ap->num_pages == data->max_pages) {
		if (!fuse_pages_realloc(data)) {
			fuse_writepages_send(data);
			data->wpa = NULL;
		}
	}

	err = -ENOMEM;
	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
	if (!tmp_page)
		goto out_unlock;

	/*
	 * The page must not be redirtied until the writeout is completed
	 * (i.e. userspace has sent a reply to the write request).  Otherwise
	 * there could be more than one temporary page instance for each real
	 * page.
	 *
	 * This is ensured by holding the page lock in page_mkwrite() while
	 * checking fuse_page_is_writeback().  We already hold the page lock
	 * since clear_page_dirty_for_io() and keep it held until we add the
	 * request to the fi->writepages list and increment ap->num_pages.
	 * After this fuse_page_is_writeback() will indicate that the page is
	 * under writeback, so we can release the page lock.
	 */
	if (data->wpa == NULL) {
		err = -ENOMEM;
		wpa = fuse_writepage_args_alloc();
		if (!wpa) {
			__free_page(tmp_page);
			goto out_unlock;
		}
		data->max_pages = 1;

		ap = &wpa->ia.ap;
		fuse_write_args_fill(&wpa->ia, data->ff, page_offset(page), 0);
		wpa->ia.write.in.write_flags |= FUSE_WRITE_CACHE;
		wpa->next = NULL;
		ap->args.in_pages = true;
		ap->args.end = fuse_writepage_end;
		ap->num_pages = 0;
		wpa->inode = inode;

		spin_lock(&fi->lock);
		list_add(&wpa->writepages_entry, &fi->writepages);
		spin_unlock(&fi->lock);

		data->wpa = wpa;
	}
	set_page_writeback(page);

	copy_highpage(tmp_page, page);
	ap->pages[ap->num_pages] = tmp_page;
	ap->descs[ap->num_pages].offset = 0;
	ap->descs[ap->num_pages].length = PAGE_SIZE;

	inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
	inc_node_page_state(tmp_page, NR_WRITEBACK_TEMP);

	err = 0;
	if (is_writeback && fuse_writepage_in_flight(wpa, page)) {
		end_page_writeback(page);
		data->wpa = NULL;
		goto out_unlock;
	}
	data->orig_pages[ap->num_pages] = page;

	/*
	 * Protected by fi->lock against concurrent access by
	 * fuse_page_is_writeback().
	 */
	spin_lock(&fi->lock);
	ap->num_pages++;
	spin_unlock(&fi->lock);

out_unlock:
	unlock_page(page);

	return err;
}

static int fuse_writepages(struct address_space *mapping,
			   struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_fill_wb_data data;
	int err;

	err = -EIO;
	if (is_bad_inode(inode))
		goto out;

	data.inode = inode;
	data.wpa = NULL;
	data.ff = NULL;

	err = -ENOMEM;
	data.orig_pages = kcalloc(fc->max_pages,
				  sizeof(struct page *),
				  GFP_NOFS);
	if (!data.orig_pages)
		goto out;

	err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
	if (data.wpa) {
		/* Ignore errors if we can write at least one page */
		WARN_ON(!data.wpa->ia.ap.num_pages);
		fuse_writepages_send(&data);
		err = 0;
	}
	if (data.ff)
		fuse_file_put(data.ff, false, false);

	kfree(data.orig_pages);
out:
	return err;
}

/*
 * It would be worthwhile to make sure that space is reserved on disk for
 * the write, but how to implement that without killing performance needs
 * more thought.
 */
static int fuse_write_begin(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned flags,
		struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_SHIFT;
	struct fuse_conn *fc = get_fuse_conn(file_inode(file));
	struct page *page;
	loff_t fsize;
	int err = -ENOMEM;

	WARN_ON(!fc->writeback_cache);

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		goto error;

	fuse_wait_on_page_writeback(mapping->host, page->index);

	if (PageUptodate(page) || len == PAGE_SIZE)
		goto success;
	/*
	 * Check if the start of this page comes after the end of file, in
	 * which case the readpage can be optimized away.
	 */
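	/*
	 * Example: appending at pos 5000 to a 4096-byte file lands in the
	 * page at index 1, which starts at byte 4096 == i_size; nothing
	 * backs it on the server, so the read is skipped and only the 904
	 * bytes preceding the write are zeroed.
	 */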
	fsize = i_size_read(mapping->host);
	if (fsize <= (pos & PAGE_MASK)) {
		size_t off = pos & ~PAGE_MASK;
		if (off)
			zero_user_segment(page, 0, off);
		goto success;
	}
	err = fuse_do_readpage(file, page);
	if (err)
		goto cleanup;
success:
	*pagep = page;
	return 0;

cleanup:
	unlock_page(page);
	put_page(page);
error:
	return err;
}

static int fuse_write_end(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned copied,
		struct page *page, void *fsdata)
{
	struct inode *inode = page->mapping->host;

	/* Haven't copied anything?  Skip zeroing, size extending, dirtying. */
	if (!copied)
		goto unlock;

	if (!PageUptodate(page)) {
		/* Zero any unwritten bytes at the end of the page */
		size_t endoff = (pos + copied) & ~PAGE_MASK;
		if (endoff)
			zero_user_segment(page, endoff, PAGE_SIZE);
		SetPageUptodate(page);
	}

	fuse_write_update_size(inode, pos + copied);
	set_page_dirty(page);

unlock:
	unlock_page(page);
	put_page(page);

	return copied;
}

static int fuse_launder_page(struct page *page)
{
	int err = 0;
	if (clear_page_dirty_for_io(page)) {
		struct inode *inode = page->mapping->host;
		err = fuse_writepage_locked(page);
		if (!err)
			fuse_wait_on_page_writeback(inode, page->index);
	}
	return err;
}

/*
 * Write back dirty pages now, because there may not be any suitable
 * open files later
 */
static void fuse_vma_close(struct vm_area_struct *vma)
{
	filemap_write_and_wait(vma->vm_file->f_mapping);
}

/*
 * Wait for writeback against this page to complete before allowing it
 * to be marked dirty again, and hence written back again, possibly
 * before the previous writepage completed.
 *
 * Block here, instead of in ->writepage(), so that the userspace fs
 * can only block processes actually operating on the filesystem.
 *
 * Otherwise unprivileged userspace fs would be able to block
 * unrelated:
 *
 * - page migration
 * - sync(2)
 * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
 */
static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vmf->vma->vm_file);

	file_update_time(vmf->vma->vm_file);
	lock_page(page);
	if (page->mapping != inode->i_mapping) {
		unlock_page(page);
		return VM_FAULT_NOPAGE;
	}

	fuse_wait_on_page_writeback(inode, page->index);
	return VM_FAULT_LOCKED;
}

static const struct vm_operations_struct fuse_file_vm_ops = {
	.close		= fuse_vma_close,
	.fault		= filemap_fault,
	.map_pages	= filemap_map_pages,
	.page_mkwrite	= fuse_page_mkwrite,
};

static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct fuse_file *ff = file->private_data;

	if (ff->open_flags & FOPEN_DIRECT_IO) {
		/* Can't provide the coherency needed for MAP_SHARED */
		if (vma->vm_flags & VM_MAYSHARE)
			return -ENODEV;

		invalidate_inode_pages2(file->f_mapping);

		return generic_file_mmap(file, vma);
	}

	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
		fuse_link_write_file(file);

	file_accessed(file);
	vma->vm_ops = &fuse_file_vm_ops;
	return 0;
}

static int convert_fuse_file_lock(struct fuse_conn *fc,
				  const struct fuse_file_lock *ffl,
				  struct file_lock *fl)
{
	switch (ffl->type) {
	case F_UNLCK:
		break;

	case F_RDLCK:
	case F_WRLCK:
		if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
		    ffl->end < ffl->start)
			return -EIO;

		fl->fl_start = ffl->start;
		fl->fl_end = ffl->end;

		/*
		 * Convert pid into init's pid namespace.  The locks API will
		 * translate it into the caller's pid namespace.
		 */
		rcu_read_lock();
		fl->fl_pid = pid_nr_ns(find_pid_ns(ffl->pid, fc->pid_ns), &init_pid_ns);
		rcu_read_unlock();
		break;

	default:
		return -EIO;
	}
	fl->fl_type = ffl->type;
	return 0;
}

static void fuse_lk_fill(struct fuse_args *args, struct file *file,
			 const struct file_lock *fl, int opcode, pid_t pid,
			 int flock, struct fuse_lk_in *inarg)
{
	struct inode *inode = file_inode(file);
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_file *ff = file->private_data;

	memset(inarg, 0, sizeof(*inarg));
	inarg->fh = ff->fh;
	inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
	inarg->lk.start = fl->fl_start;
	inarg->lk.end = fl->fl_end;
	inarg->lk.type = fl->fl_type;
	inarg->lk.pid = pid;
	if (flock)
		inarg->lk_flags |= FUSE_LK_FLOCK;
	args->opcode = opcode;
	args->nodeid = get_node_id(inode);
	args->in_numargs = 1;
	args->in_args[0].size = sizeof(*inarg);
	args->in_args[0].value = inarg;
}

static int fuse_getlk(struct file *file, struct file_lock *fl)
{
	struct inode *inode = file_inode(file);
	struct fuse_conn *fc = get_fuse_conn(inode);
	FUSE_ARGS(args);
	struct fuse_lk_in inarg;
	struct fuse_lk_out outarg;
	int err;

	fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg);
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(outarg);
	args.out_args[0].value = &outarg;
	err = fuse_simple_request(fc, &args);
	if (!err)
		err = convert_fuse_file_lock(fc, &outarg.lk, fl);

	return err;
}

static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
{
	struct inode *inode = file_inode(file);
	struct fuse_conn *fc = get_fuse_conn(inode);
	FUSE_ARGS(args);
	struct fuse_lk_in inarg;
	int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
	struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL;
	pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns);
	int err;

	if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
		/* NLM needs asynchronous locks, which we don't support yet */
		return -ENOLCK;
	}

	/* Unlock on close is handled by the flush method */
	if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX)
		return 0;

	fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg);
	err = fuse_simple_request(fc, &args);

	/* locking is restartable */
	if (err == -EINTR)
		err = -ERESTARTSYS;

	return err;
}

static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
{
	struct inode *inode = file_inode(file);
	struct fuse_conn *fc = get_fuse_conn(inode);
	int err;

	if (cmd == F_CANCELLK) {
		err = 0;
	} else if (cmd == F_GETLK) {
		if (fc->no_lock) {
			posix_test_lock(file, fl);
			err = 0;
		} else
			err = fuse_getlk(file, fl);
	} else {
		if (fc->no_lock)
			err = posix_lock_file(file, fl, NULL);
		else
			err = fuse_setlk(file, fl, 0);
	}
	return err;
}

static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
{
	struct inode *inode = file_inode(file);
	struct fuse_conn *fc = get_fuse_conn(inode);
	int err;

	if (fc->no_flock) {
		err = locks_lock_file_wait(file, fl);
	} else {
		struct fuse_file *ff = file->private_data;

		/* emulate flock with POSIX locks */
		ff->flock = true;
		err = fuse_setlk(file, fl, 1);
	}

	return err;
}

static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
{
	struct inode *inode = mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	FUSE_ARGS(args);
	struct fuse_bmap_in inarg;
	struct fuse_bmap_out outarg;
	int err;

	if (!inode->i_sb->s_bdev || fc->no_bmap)
		return 0;

	memset(&inarg, 0, sizeof(inarg));
	inarg.block = block;
	inarg.blocksize = inode->i_sb->s_blocksize;
	args.opcode = FUSE_BMAP;
	args.nodeid = get_node_id(inode);
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(outarg);
	args.out_args[0].value = &outarg;
	err = fuse_simple_request(fc, &args);
	if (err == -ENOSYS)
		fc->no_bmap = 1;

	return err ? 0 : outarg.block;
}

static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_mapping->host;
	struct fuse_conn *fc = get_fuse_conn(inode);
	struct fuse_file *ff = file->private_data;
	FUSE_ARGS(args);
	struct fuse_lseek_in inarg = {
		.fh = ff->fh,
		.offset = offset,
		.whence = whence
	};
	struct fuse_lseek_out outarg;
	int err;

	if (fc->no_lseek)
		goto fallback;

	args.opcode = FUSE_LSEEK;
	args.nodeid = ff->nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(outarg);
	args.out_args[0].value = &outarg;
	err = fuse_simple_request(fc, &args);
	if (err) {
		if (err == -ENOSYS) {
			fc->no_lseek = 1;
			goto fallback;
		}
		return err;
	}

	return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes);

fallback:
	err = fuse_update_attributes(inode, file);
	if (!err)
		return generic_file_llseek(file, offset, whence);
	else
		return err;
}

static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
{
	loff_t retval;
	struct inode *inode = file_inode(file);

	switch (whence) {
	case SEEK_SET:
	case SEEK_CUR:
		 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
		retval = generic_file_llseek(file, offset, whence);
		break;
	case SEEK_END:
		inode_lock(inode);
		retval = fuse_update_attributes(inode, file);
		if (!retval)
			retval = generic_file_llseek(file, offset, whence);
		inode_unlock(inode);
		break;
	case SEEK_HOLE:
	case SEEK_DATA:
		inode_lock(inode);
		retval = fuse_lseek(file, offset, whence);
		inode_unlock(inode);
		break;
	default:
		retval = -EINVAL;
	}

	return retval;
}

/*
 * CUSE servers compiled on 32bit broke on 64bit kernels because the
 * ABI was defined to be 'struct iovec' which is different on 32bit
 * and 64bit.  Fortunately we can determine which structure the server
 * used from the size of the reply.
 */
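/*
 * For example, a reply describing 4 iovecs transfers 4 * 8 = 32 bytes
 * if the server used the 32-bit 'struct compat_iovec', but 4 * 16 = 64
 * bytes if it used the native 64-bit 'struct iovec', so the two layouts
 * can be told apart from the transferred size alone.
 */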
static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
				     size_t transferred, unsigned count,
				     bool is_compat)
{
#ifdef CONFIG_COMPAT
	if (count * sizeof(struct compat_iovec) == transferred) {
		struct compat_iovec *ciov = src;
		unsigned i;

		/*
		 * With this interface a 32bit server cannot support
		 * non-compat (i.e. ones coming from 64bit apps) ioctl
		 * requests
		 */
		if (!is_compat)
			return -EINVAL;

		for (i = 0; i < count; i++) {
			dst[i].iov_base = compat_ptr(ciov[i].iov_base);
			dst[i].iov_len = ciov[i].iov_len;
		}
		return 0;
	}
#endif

	if (count * sizeof(struct iovec) != transferred)
		return -EIO;

	memcpy(dst, src, transferred);
	return 0;
}

/* Make sure iov_length() won't overflow */
static int fuse_verify_ioctl_iov(struct fuse_conn *fc, struct iovec *iov,
				 size_t count)
{
	size_t n;
	u32 max = fc->max_pages << PAGE_SHIFT;

	for (n = 0; n < count; n++, iov++) {
		if (iov->iov_len > (size_t) max)
			return -ENOMEM;
		max -= iov->iov_len;
	}
	return 0;
}

static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
				 void *src, size_t transferred, unsigned count,
				 bool is_compat)
{
	unsigned i;
	struct fuse_ioctl_iovec *fiov = src;

	if (fc->minor < 16) {
		return fuse_copy_ioctl_iovec_old(dst, src, transferred,
						 count, is_compat);
	}

	if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
		return -EIO;

	for (i = 0; i < count; i++) {
		/* Did the server supply an inappropriate value? */
		if (fiov[i].base != (unsigned long) fiov[i].base ||
		    fiov[i].len != (unsigned long) fiov[i].len)
			return -EIO;

		dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
		dst[i].iov_len = (size_t) fiov[i].len;

#ifdef CONFIG_COMPAT
		if (is_compat &&
		    (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
		     (compat_size_t) dst[i].iov_len != fiov[i].len))
			return -EIO;
#endif
	}

	return 0;
}


/*
 * For ioctls, there is no generic way to determine how much memory
 * needs to be read and/or written.  Furthermore, ioctls are allowed
 * to dereference the passed pointer, so the parameter requires deep
 * copying but FUSE has no idea whatsoever about what to copy in or
 * out.
 *
 * This is solved by allowing FUSE server to retry ioctl with
 * necessary in/out iovecs.  Let's assume the ioctl implementation
 * needs to read in the following structure.
 *
 * struct a {
 *	char	*buf;
 *	size_t	buflen;
 * }
 *
 * On the first callout to FUSE server, inarg->in_size and
 * inarg->out_size will be zero; then, the server completes the ioctl
 * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
 * the actual iov array to
 *
 * { { .iov_base = inarg.arg,	.iov_len = sizeof(struct a) } }
 *
 * which tells FUSE to copy in the requested area and retry the ioctl.
 * On the second round, the server has access to the structure and
 * from that it can tell what to look for next, so on this invocation,
 * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
 *
 * { { .iov_base = inarg.arg,	.iov_len = sizeof(struct a)	},
 *   { .iov_base = a.buf,	.iov_len = a.buflen		} }
 *
 * FUSE will copy both struct a and the pointed buffer from the
 * process doing the ioctl and retry ioctl with both struct a and the
 * buffer.
 *
 * This time, FUSE server has everything it needs and completes ioctl
 * without FUSE_IOCTL_RETRY which finishes the ioctl call.
 *
 * Copying data out works the same way.
 *
 * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
 * automatically initializes in and out iovs by decoding @cmd with
 * _IOC_* macros and the server is not allowed to request RETRY.  This
 * limits ioctl data transfers to well-formed ioctls and is the forced
 * behavior for all FUSE servers.
 */
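/*
 * Illustrative sketch only (not part of this file): a userspace server
 * using the libfuse lowlevel API could implement the retry protocol for
 * 'struct a' above roughly as follows.  The handler signature and the
 * fuse_reply_ioctl_retry()/fuse_reply_ioctl() helpers follow libfuse
 * conventions; treat the exact names and signatures as assumptions.
 *
 *	static void a_ioctl(fuse_req_t req, fuse_ino_t ino, unsigned int cmd,
 *			    void *arg, struct fuse_file_info *fi,
 *			    unsigned flags, const void *in_buf,
 *			    size_t in_bufsz, size_t out_bufsz)
 *	{
 *		if (in_bufsz == 0) {
 *			// 1st round: ask the kernel to copy in struct a
 *			struct iovec iov = { arg, sizeof(struct a) };
 *
 *			fuse_reply_ioctl_retry(req, &iov, 1, NULL, 0);
 *		} else if (in_bufsz == sizeof(struct a)) {
 *			// 2nd round: struct a arrived, now ask for a.buf
 *			const struct a *a = in_buf;
 *			struct iovec iov[2] = {
 *				{ arg, sizeof(struct a) },
 *				{ a->buf, a->buflen },
 *			};
 *
 *			fuse_reply_ioctl_retry(req, iov, 2, NULL, 0);
 *		} else {
 *			// 3rd round: everything is here, finish the call
 *			fuse_reply_ioctl(req, 0, NULL, 0);
 *		}
 *	}
 */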
long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
		   unsigned int flags)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;
	struct fuse_ioctl_in inarg = {
		.fh = ff->fh,
		.cmd = cmd,
		.arg = arg,
		.flags = flags
	};
	struct fuse_ioctl_out outarg;
	struct iovec *iov_page = NULL;
	struct iovec *in_iov = NULL, *out_iov = NULL;
	unsigned int in_iovs = 0, out_iovs = 0, max_pages;
	size_t in_size, out_size, c;
	ssize_t transferred;
	int err, i;
	struct iov_iter ii;
	struct fuse_args_pages ap = {};

#if BITS_PER_LONG == 32
	inarg.flags |= FUSE_IOCTL_32BIT;
#else
	if (flags & FUSE_IOCTL_COMPAT) {
		inarg.flags |= FUSE_IOCTL_32BIT;
#ifdef CONFIG_X86_X32
		if (in_x32_syscall())
			inarg.flags |= FUSE_IOCTL_COMPAT_X32;
#endif
	}
#endif

	/* assume all the iovs returned by client always fit in a page */
	BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);

	err = -ENOMEM;
	ap.pages = fuse_pages_alloc(fc->max_pages, GFP_KERNEL, &ap.descs);
	iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
	if (!ap.pages || !iov_page)
		goto out;

	fuse_page_descs_length_init(ap.descs, 0, fc->max_pages);

	/*
	 * If restricted, initialize IO parameters as encoded in @cmd.
	 * RETRY from server is not allowed.
	 */
	if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
		struct iovec *iov = iov_page;

		iov->iov_base = (void __user *)arg;
		iov->iov_len = _IOC_SIZE(cmd);

		if (_IOC_DIR(cmd) & _IOC_WRITE) {
			in_iov = iov;
			in_iovs = 1;
		}

		if (_IOC_DIR(cmd) & _IOC_READ) {
			out_iov = iov;
			out_iovs = 1;
		}
	}

 retry:
	inarg.in_size = in_size = iov_length(in_iov, in_iovs);
	inarg.out_size = out_size = iov_length(out_iov, out_iovs);

	/*
	 * Out data can be used either for actual out data or iovs,
	 * make sure there always is at least one page.
	 */
	out_size = max_t(size_t, out_size, PAGE_SIZE);
	max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);

	/* make sure there are enough buffer pages and init request with them */
	err = -ENOMEM;
	if (max_pages > fc->max_pages)
		goto out;
	while (ap.num_pages < max_pages) {
		ap.pages[ap.num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
		if (!ap.pages[ap.num_pages])
			goto out;
		ap.num_pages++;
	}


	/* okay, let's send it to the client */
	ap.args.opcode = FUSE_IOCTL;
	ap.args.nodeid = ff->nodeid;
	ap.args.in_numargs = 1;
	ap.args.in_args[0].size = sizeof(inarg);
	ap.args.in_args[0].value = &inarg;
	if (in_size) {
		ap.args.in_numargs++;
		ap.args.in_args[1].size = in_size;
		ap.args.in_pages = true;

		err = -EFAULT;
		iov_iter_init(&ii, WRITE, in_iov, in_iovs, in_size);
		for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
			c = copy_page_from_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
			if (c != PAGE_SIZE && iov_iter_count(&ii))
				goto out;
		}
	}

	ap.args.out_numargs = 2;
	ap.args.out_args[0].size = sizeof(outarg);
	ap.args.out_args[0].value = &outarg;
	ap.args.out_args[1].size = out_size;
	ap.args.out_pages = true;
	ap.args.out_argvar = true;

	transferred = fuse_simple_request(fc, &ap.args);
	err = transferred;
	if (transferred < 0)
		goto out;

	/* did it ask for retry? */
	if (outarg.flags & FUSE_IOCTL_RETRY) {
		void *vaddr;

		/* no retry if in restricted mode */
		err = -EIO;
		if (!(flags & FUSE_IOCTL_UNRESTRICTED))
			goto out;

		in_iovs = outarg.in_iovs;
		out_iovs = outarg.out_iovs;

		/*
		 * Make sure things are in boundary, separate checks
		 * are to protect against overflow.
		 */
		err = -ENOMEM;
		if (in_iovs > FUSE_IOCTL_MAX_IOV ||
		    out_iovs > FUSE_IOCTL_MAX_IOV ||
		    in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
			goto out;

		vaddr = kmap_atomic(ap.pages[0]);
		err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
					    transferred, in_iovs + out_iovs,
					    (flags & FUSE_IOCTL_COMPAT) != 0);
		kunmap_atomic(vaddr);
		if (err)
			goto out;

		in_iov = iov_page;
		out_iov = in_iov + in_iovs;

		err = fuse_verify_ioctl_iov(fc, in_iov, in_iovs);
		if (err)
			goto out;

		err = fuse_verify_ioctl_iov(fc, out_iov, out_iovs);
		if (err)
			goto out;

		goto retry;
	}

	err = -EIO;
	if (transferred > inarg.out_size)
		goto out;

	err = -EFAULT;
	iov_iter_init(&ii, READ, out_iov, out_iovs, transferred);
	for (i = 0; iov_iter_count(&ii) && !WARN_ON(i >= ap.num_pages); i++) {
		c = copy_page_to_iter(ap.pages[i], 0, PAGE_SIZE, &ii);
		if (c != PAGE_SIZE && iov_iter_count(&ii))
			goto out;
	}
	err = 0;
 out:
	free_page((unsigned long) iov_page);
	while (ap.num_pages)
		__free_page(ap.pages[--ap.num_pages]);
	kfree(ap.pages);

	return err ? err : outarg.result;
}
EXPORT_SYMBOL_GPL(fuse_do_ioctl);

long fuse_ioctl_common(struct file *file, unsigned int cmd,
		       unsigned long arg, unsigned int flags)
{
	struct inode *inode = file_inode(file);
	struct fuse_conn *fc = get_fuse_conn(inode);

	if (!fuse_allow_current_process(fc))
		return -EACCES;

	if (is_bad_inode(inode))
		return -EIO;

	return fuse_do_ioctl(file, cmd, arg, flags);
}

static long fuse_file_ioctl(struct file *file, unsigned int cmd,
			    unsigned long arg)
{
	return fuse_ioctl_common(file, cmd, arg, 0);
}

static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
				   unsigned long arg)
{
	return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
}

/*
 * All files which have been polled are linked to RB tree
 * fuse_conn->polled_files which is indexed by kh.  Walk the tree and
 * find the matching one.
 */
static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
					      struct rb_node **parent_out)
{
	struct rb_node **link = &fc->polled_files.rb_node;
	struct rb_node *last = NULL;

	while (*link) {
		struct fuse_file *ff;

		last = *link;
		ff = rb_entry(last, struct fuse_file, polled_node);

		if (kh < ff->kh)
			link = &last->rb_left;
		else if (kh > ff->kh)
			link = &last->rb_right;
		else
			return link;
	}

	if (parent_out)
		*parent_out = last;
	return link;
}

/*
 * The file is about to be polled.  Make sure it's on the polled_files
 * RB tree.  Note that files once added to the polled_files tree are
 * not removed before the file is released.  This is because a file
 * polled once is likely to be polled again.
 */
static void fuse_register_polled_file(struct fuse_conn *fc,
				      struct fuse_file *ff)
{
	spin_lock(&fc->lock);
	if (RB_EMPTY_NODE(&ff->polled_node)) {
		struct rb_node **link, *uninitialized_var(parent);

		link = fuse_find_polled_node(fc, ff->kh, &parent);
		BUG_ON(*link);
		rb_link_node(&ff->polled_node, parent, link);
		rb_insert_color(&ff->polled_node, &fc->polled_files);
	}
	spin_unlock(&fc->lock);
}

__poll_t fuse_file_poll(struct file *file, poll_table *wait)
{
	struct fuse_file *ff = file->private_data;
	struct fuse_conn *fc = ff->fc;
	struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
	struct fuse_poll_out outarg;
	FUSE_ARGS(args);
	int err;

	if (fc->no_poll)
		return DEFAULT_POLLMASK;

	poll_wait(file, &ff->poll_wait, wait);
	inarg.events = mangle_poll(poll_requested_events(wait));

	/*
	 * Ask for notification iff there's someone waiting for it.
	 * The client may ignore the flag and always notify.
	 */
	if (waitqueue_active(&ff->poll_wait)) {
		inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
		fuse_register_polled_file(fc, ff);
	}

	args.opcode = FUSE_POLL;
	args.nodeid = ff->nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(outarg);
	args.out_args[0].value = &outarg;
	err = fuse_simple_request(fc, &args);

	if (!err)
		return demangle_poll(outarg.revents);
	if (err == -ENOSYS) {
		fc->no_poll = 1;
		return DEFAULT_POLLMASK;
	}
	return EPOLLERR;
}
EXPORT_SYMBOL_GPL(fuse_file_poll);

/*
 * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
 * wakes up the poll waiters.
 */
int fuse_notify_poll_wakeup(struct fuse_conn *fc,
			    struct fuse_notify_poll_wakeup_out *outarg)
{
	u64 kh = outarg->kh;
	struct rb_node **link;

	spin_lock(&fc->lock);

	link = fuse_find_polled_node(fc, kh, NULL);
	if (*link) {
		struct fuse_file *ff;

		ff = rb_entry(*link, struct fuse_file, polled_node);
		wake_up_interruptible_sync(&ff->poll_wait);
	}

	spin_unlock(&fc->lock);
	return 0;
}

static void fuse_do_truncate(struct file *file)
{
	struct inode *inode = file->f_mapping->host;
	struct iattr attr;

	attr.ia_valid = ATTR_SIZE;
	attr.ia_size = i_size_read(inode);

	attr.ia_file = file;
	attr.ia_valid |= ATTR_FILE;

	fuse_do_setattr(file_dentry(file), &attr, file);
}

static inline loff_t fuse_round_up(struct fuse_conn *fc, loff_t off)
{
	return round_up(off, fc->max_pages << PAGE_SHIFT);
}

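/*
 * Entry point for O_DIRECT reads and writes.  Reads starting beyond
 * i_size return 0 and short reads past EOF are trimmed up front; the
 * request only runs asynchronously when the server supports async dio
 * and the write does not extend the file, otherwise completion is
 * awaited right here.
 */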
static ssize_t
fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
	DECLARE_COMPLETION_ONSTACK(wait);
	ssize_t ret = 0;
	struct file *file = iocb->ki_filp;
	struct fuse_file *ff = file->private_data;
	bool async_dio = ff->fc->async_dio;
	loff_t pos = 0;
	struct inode *inode;
	loff_t i_size;
	size_t count = iov_iter_count(iter);
	loff_t offset = iocb->ki_pos;
	struct fuse_io_priv *io;

	pos = offset;
	inode = file->f_mapping->host;
	i_size = i_size_read(inode);

	if ((iov_iter_rw(iter) == READ) && (offset > i_size))
		return 0;

	/* optimization for short read */
	if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) {
		if (offset >= i_size)
			return 0;
		iov_iter_truncate(iter, fuse_round_up(ff->fc, i_size - offset));
		count = iov_iter_count(iter);
	}

	io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
	if (!io)
		return -ENOMEM;
	spin_lock_init(&io->lock);
	kref_init(&io->refcnt);
	io->reqs = 1;
	io->bytes = -1;
	io->size = 0;
	io->offset = offset;
	io->write = (iov_iter_rw(iter) == WRITE);
	io->err = 0;
	/*
	 * By default, we want to optimize all I/Os with async request
	 * submission to the client filesystem if supported.
	 */
	io->async = async_dio;
	io->iocb = iocb;
	io->blocking = is_sync_kiocb(iocb);

	/*
	 * We cannot asynchronously extend the size of a file.
	 * In such case the aio will behave exactly like sync io.
	 */
	if ((offset + count > i_size) && iov_iter_rw(iter) == WRITE)
		io->blocking = true;

	if (io->async && io->blocking) {
		/*
		 * Additional reference to keep io around after
		 * calling fuse_aio_complete()
		 */
		kref_get(&io->refcnt);
		io->done = &wait;
	}

	if (iov_iter_rw(iter) == WRITE) {
		ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
		fuse_invalidate_attr(inode);
	} else {
		ret = __fuse_direct_read(io, iter, &pos);
	}

	if (io->async) {
		bool blocking = io->blocking;

		fuse_aio_complete(io, ret < 0 ? ret : 0, -1);

		/* we have a non-extending, async request, so return */
		if (!blocking)
			return -EIOCBQUEUED;

		wait_for_completion(&wait);
		ret = fuse_get_res_by_io(io);
	}

	kref_put(&io->refcnt, fuse_io_release);

	if (iov_iter_rw(iter) == WRITE) {
		if (ret > 0)
			fuse_write_update_size(inode, pos);
		else if (ret < 0 && offset + count > i_size)
			fuse_do_truncate(file);
	}

	return ret;
}

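/*
 * Flush and wait for dirty pagecache in the given byte range, then also
 * wait for any fuse writeback still in flight against the inode.
 */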
static int fuse_writeback_range(struct inode *inode, loff_t start, loff_t end)
{
	int err = filemap_write_and_wait_range(inode->i_mapping, start, end);

	if (!err)
		fuse_sync_writes(inode);

	return err;
}

static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
				loff_t length)
{
	struct fuse_file *ff = file->private_data;
	struct inode *inode = file_inode(file);
	struct fuse_inode *fi = get_fuse_inode(inode);
	struct fuse_conn *fc = ff->fc;
	FUSE_ARGS(args);
	struct fuse_fallocate_in inarg = {
		.fh = ff->fh,
		.offset = offset,
		.length = length,
		.mode = mode
	};
	int err;
	bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
			   (mode & FALLOC_FL_PUNCH_HOLE);

	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
		return -EOPNOTSUPP;

	if (fc->no_fallocate)
		return -EOPNOTSUPP;

	if (lock_inode) {
		inode_lock(inode);
		if (mode & FALLOC_FL_PUNCH_HOLE) {
			loff_t endbyte = offset + length - 1;

			err = fuse_writeback_range(inode, offset, endbyte);
			if (err)
				goto out;
		}
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE) &&
	    offset + length > i_size_read(inode)) {
		err = inode_newsize_ok(inode, offset + length);
		if (err)
			goto out;
	}

	if (!(mode & FALLOC_FL_KEEP_SIZE))
		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	args.opcode = FUSE_FALLOCATE;
	args.nodeid = ff->nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	err = fuse_simple_request(fc, &args);
	if (err == -ENOSYS) {
		fc->no_fallocate = 1;
		err = -EOPNOTSUPP;
	}
	if (err)
		goto out;

	/* we could have extended the file */
	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
		bool changed = fuse_write_update_size(inode, offset + length);

		if (changed && fc->writeback_cache)
			file_update_time(file);
	}

	if (mode & FALLOC_FL_PUNCH_HOLE)
		truncate_pagecache_range(inode, offset, offset + length - 1);

	fuse_invalidate_attr(inode);

out:
	if (!(mode & FALLOC_FL_KEEP_SIZE))
		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);

	if (lock_inode)
		inode_unlock(inode);

	return err;
}

static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in,
				      struct file *file_out, loff_t pos_out,
				      size_t len, unsigned int flags)
{
	struct fuse_file *ff_in = file_in->private_data;
	struct fuse_file *ff_out = file_out->private_data;
	struct inode *inode_in = file_inode(file_in);
	struct inode *inode_out = file_inode(file_out);
	struct fuse_inode *fi_out = get_fuse_inode(inode_out);
	struct fuse_conn *fc = ff_in->fc;
	FUSE_ARGS(args);
	struct fuse_copy_file_range_in inarg = {
		.fh_in = ff_in->fh,
		.off_in = pos_in,
		.nodeid_out = ff_out->nodeid,
		.fh_out = ff_out->fh,
		.off_out = pos_out,
		.len = len,
		.flags = flags
	};
	struct fuse_write_out outarg;
	ssize_t err;
	/* mark unstable when write-back is not used, and file_out gets
	 * extended */
	bool is_unstable = (!fc->writeback_cache) &&
			   ((pos_out + len) > inode_out->i_size);

	if (fc->no_copy_file_range)
		return -EOPNOTSUPP;

	if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
		return -EXDEV;

	if (fc->writeback_cache) {
		inode_lock(inode_in);
		err = fuse_writeback_range(inode_in, pos_in, pos_in + len);
		inode_unlock(inode_in);
		if (err)
			return err;
	}

	inode_lock(inode_out);

	err = file_modified(file_out);
	if (err)
		goto out;

	if (fc->writeback_cache) {
		err = fuse_writeback_range(inode_out, pos_out, pos_out + len);
		if (err)
			goto out;
	}

	if (is_unstable)
		set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);

	args.opcode = FUSE_COPY_FILE_RANGE;
	args.nodeid = ff_in->nodeid;
	args.in_numargs = 1;
	args.in_args[0].size = sizeof(inarg);
	args.in_args[0].value = &inarg;
	args.out_numargs = 1;
	args.out_args[0].size = sizeof(outarg);
	args.out_args[0].value = &outarg;
	err = fuse_simple_request(fc, &args);
	if (err == -ENOSYS) {
		fc->no_copy_file_range = 1;
		err = -EOPNOTSUPP;
	}
	if (err)
		goto out;

	if (fc->writeback_cache) {
		fuse_write_update_size(inode_out, pos_out + outarg.size);
		file_update_time(file_out);
	}

	fuse_invalidate_attr(inode_out);

	err = outarg.size;
out:
	if (is_unstable)
		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);

	inode_unlock(inode_out);
	file_accessed(file_in);

	return err;
}

static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
				    struct file *dst_file, loff_t dst_off,
				    size_t len, unsigned int flags)
{
	ssize_t ret;

	ret = __fuse_copy_file_range(src_file, src_off, dst_file, dst_off,
				     len, flags);

	if (ret == -EOPNOTSUPP || ret == -EXDEV)
		ret = generic_copy_file_range(src_file, src_off, dst_file,
					      dst_off, len, flags);
	return ret;
}

static const struct file_operations fuse_file_operations = {
	.llseek		= fuse_file_llseek,
	.read_iter	= fuse_file_read_iter,
	.write_iter	= fuse_file_write_iter,
	.mmap		= fuse_file_mmap,
	.open		= fuse_open,
	.flush		= fuse_flush,
	.release	= fuse_release,
	.fsync		= fuse_fsync,
	.lock		= fuse_file_lock,
	.flock		= fuse_file_flock,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.unlocked_ioctl	= fuse_file_ioctl,
	.compat_ioctl	= fuse_file_compat_ioctl,
	.poll		= fuse_file_poll,
	.fallocate	= fuse_file_fallocate,
	.copy_file_range = fuse_copy_file_range,
};

static const struct address_space_operations fuse_file_aops  = {
	.readpage	= fuse_readpage,
	.writepage	= fuse_writepage,
	.writepages	= fuse_writepages,
	.launder_page	= fuse_launder_page,
	.readpages	= fuse_readpages,
	.set_page_dirty	= __set_page_dirty_nobuffers,
	.bmap		= fuse_bmap,
	.direct_IO	= fuse_direct_IO,
	.write_begin	= fuse_write_begin,
	.write_end	= fuse_write_end,
};

void fuse_init_file_inode(struct inode *inode)
{
	struct fuse_inode *fi = get_fuse_inode(inode);

	inode->i_fop = &fuse_file_operations;
	inode->i_data.a_ops = &fuse_file_aops;

	INIT_LIST_HEAD(&fi->write_files);
	INIT_LIST_HEAD(&fi->queued_writes);
	fi->writectr = 0;
	init_waitqueue_head(&fi->page_waitq);
	INIT_LIST_HEAD(&fi->writepages);
}