ordered-data.c 30.9 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
2 3 4 5 6
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/slab.h>
7
#include <linux/blkdev.h>
8
#include <linux/writeback.h>
9
#include <linux/sched/mm.h>
10
#include "misc.h"
11 12 13
#include "ctree.h"
#include "transaction.h"
#include "btrfs_inode.h"
14
#include "extent_io.h"
15
#include "disk-io.h"
16
#include "compression.h"
17
#include "delalloc-space.h"
18
#include "qgroup.h"
19
#include "subpage.h"
20

21 22
/* Slab cache used to allocate and free struct btrfs_ordered_extent objects. */
static struct kmem_cache *btrfs_ordered_extent_cache;

23
static u64 entry_end(struct btrfs_ordered_extent *entry)
24
{
25
	if (entry->file_offset + entry->num_bytes < entry->file_offset)
26
		return (u64)-1;
27
	return entry->file_offset + entry->num_bytes;
28 29
}

30 31 32
/* returns NULL if the insertion worked, or it returns the node it did find
 * in the tree
 */
33 34
static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
				   struct rb_node *node)
35
{
36 37
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
38
	struct btrfs_ordered_extent *entry;
39

40
	while (*p) {
41
		parent = *p;
42
		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
43

44
		if (file_offset < entry->file_offset)
45
			p = &(*p)->rb_left;
46
		else if (file_offset >= entry_end(entry))
47 48 49 50 51 52 53 54 55 56
			p = &(*p)->rb_right;
		else
			return parent;
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
	return NULL;
}

57 58 59 60
/*
 * look for a given offset in the tree, and if it can't be found return the
 * first lesser offset
 */
61 62
static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
				     struct rb_node **prev_ret)
63
{
64
	struct rb_node *n = root->rb_node;
65
	struct rb_node *prev = NULL;
66 67 68
	struct rb_node *test;
	struct btrfs_ordered_extent *entry;
	struct btrfs_ordered_extent *prev_entry = NULL;
69

70
	while (n) {
71
		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
72 73 74
		prev = n;
		prev_entry = entry;

75
		if (file_offset < entry->file_offset)
76
			n = n->rb_left;
77
		else if (file_offset >= entry_end(entry))
78 79 80 81 82 83 84
			n = n->rb_right;
		else
			return n;
	}
	if (!prev_ret)
		return NULL;

85
	while (prev && file_offset >= entry_end(prev_entry)) {
86 87 88 89 90 91 92 93 94 95 96 97 98
		test = rb_next(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		if (file_offset < entry_end(prev_entry))
			break;

		prev = test;
	}
	if (prev)
		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
				      rb_node);
99
	while (prev && file_offset < entry_end(prev_entry)) {
100 101 102 103 104 105
		test = rb_prev(prev);
		if (!test)
			break;
		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
				      rb_node);
		prev = test;
106 107 108 109 110
	}
	*prev_ret = prev;
	return NULL;
}

111 112 113 114
/*
 * Return 1 if [@file_offset, @file_offset + @len) intersects the file range
 * covered by @entry, 0 otherwise.
 */
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
			  u64 len)
{
	return file_offset + len > entry->file_offset &&
	       entry->file_offset + entry->num_bytes > file_offset;
}

120 121 122 123
/*
 * look find the first ordered struct that has this offset, otherwise
 * the first one less than this offset
 */
124 125
static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
					  u64 file_offset)
126
{
127
	struct rb_root *root = &tree->tree;
128
	struct rb_node *prev = NULL;
129
	struct rb_node *ret;
130 131 132 133 134
	struct btrfs_ordered_extent *entry;

	if (tree->last) {
		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
				 rb_node);
135
		if (in_range(file_offset, entry->file_offset, entry->num_bytes))
136 137 138
			return tree->last;
	}
	ret = __tree_search(root, file_offset, &prev);
139
	if (!ret)
140 141 142
		ret = prev;
	if (ret)
		tree->last = ret;
143 144 145
	return ret;
}

146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
/**
 * Add an ordered extent to the per-inode tree.
 *
 * @inode:           Inode that this extent is for.
 * @file_offset:     Logical offset in file where the extent starts.
 * @num_bytes:       Logical length of extent in file.
 * @ram_bytes:       Full length of unencoded data.
 * @disk_bytenr:     Offset of extent on disk.
 * @disk_num_bytes:  Size of extent on disk.
 * @offset:          Offset into unencoded data where file data starts.
 * @flags:           Flags specifying type of extent (1 << BTRFS_ORDERED_*).
 * @compress_type:   Compression algorithm used for data.
 *
 * Most of these parameters correspond to &struct btrfs_file_extent_item. The
 * tree is given a single reference on the ordered extent that was inserted.
161
 *
162
 * Return: 0 or -ENOMEM.
163
 */
164 165 166 167
int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset,
			     u64 num_bytes, u64 ram_bytes, u64 disk_bytenr,
			     u64 disk_num_bytes, u64 offset, unsigned flags,
			     int compress_type)
168
{
169 170 171
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
172 173
	struct rb_node *node;
	struct btrfs_ordered_extent *entry;
174 175
	int ret;

176 177
	if (flags &
	    ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) {
178
		/* For nocow write, we can release the qgroup rsv right now */
179
		ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes);
180 181 182 183 184 185 186 187
		if (ret < 0)
			return ret;
		ret = 0;
	} else {
		/*
		 * The ordered extent has reserved qgroup space, release now
		 * and pass the reserved number for qgroup_record to free.
		 */
188
		ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes);
189 190 191
		if (ret < 0)
			return ret;
	}
192
	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
193 194 195
	if (!entry)
		return -ENOMEM;

196
	entry->file_offset = file_offset;
197
	entry->num_bytes = num_bytes;
198 199
	entry->ram_bytes = ram_bytes;
	entry->disk_bytenr = disk_bytenr;
200
	entry->disk_num_bytes = disk_num_bytes;
201
	entry->offset = offset;
202
	entry->bytes_left = num_bytes;
203
	entry->inode = igrab(&inode->vfs_inode);
204
	entry->compress_type = compress_type;
205
	entry->truncated_len = (u64)-1;
206
	entry->qgroup_rsv = ret;
207
	entry->physical = (u64)-1;
208

209 210
	ASSERT((flags & ~BTRFS_ORDERED_TYPE_FLAGS) == 0);
	entry->flags = flags;
211

212 213 214
	percpu_counter_add_batch(&fs_info->ordered_bytes, num_bytes,
				 fs_info->delalloc_batch);

215
	/* one ref for the tree */
216
	refcount_set(&entry->refs, 1);
217 218
	init_waitqueue_head(&entry->wait);
	INIT_LIST_HEAD(&entry->list);
219
	INIT_LIST_HEAD(&entry->log_list);
220
	INIT_LIST_HEAD(&entry->root_extent_list);
221 222
	INIT_LIST_HEAD(&entry->work_list);
	init_completion(&entry->completion);
223

224
	trace_btrfs_ordered_extent_add(inode, entry);
225

226
	spin_lock_irq(&tree->lock);
227 228
	node = tree_insert(&tree->tree, file_offset,
			   &entry->rb_node);
229
	if (node)
230 231 232
		btrfs_panic(fs_info, -EEXIST,
				"inconsistency in ordered tree at offset %llu",
				file_offset);
233
	spin_unlock_irq(&tree->lock);
234

235
	spin_lock(&root->ordered_extent_lock);
236
	list_add_tail(&entry->root_extent_list,
237 238 239
		      &root->ordered_extents);
	root->nr_ordered_extents++;
	if (root->nr_ordered_extents == 1) {
240
		spin_lock(&fs_info->ordered_root_lock);
241
		BUG_ON(!list_empty(&root->ordered_root));
242 243
		list_add_tail(&root->ordered_root, &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);
244 245
	}
	spin_unlock(&root->ordered_extent_lock);
246

247 248 249 250 251
	/*
	 * We don't need the count_max_extents here, we can assume that all of
	 * that work has been done at higher layers, so this is truly the
	 * smallest the extent is going to get.
	 */
252 253 254
	spin_lock(&inode->lock);
	btrfs_mod_outstanding_extents(inode, 1);
	spin_unlock(&inode->lock);
255

256 257 258
	return 0;
}

259 260
/*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
261 262
 * when an ordered extent is finished.  If the list covers more than one
 * ordered extent, it is split across multiples.
263
 */
264
void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry,
265
			   struct btrfs_ordered_sum *sum)
266
{
267
	struct btrfs_ordered_inode_tree *tree;
268

269
	tree = &BTRFS_I(entry->inode)->ordered_tree;
270
	spin_lock_irq(&tree->lock);
271
	list_add_tail(&sum->list, &entry->list);
272
	spin_unlock_irq(&tree->lock);
273 274
}

275
/*
276
 * Mark all ordered extents io inside the specified range finished.
277
 *
278
 * @page:	 The involved page for the operation.
279 280 281 282 283 284 285
 *		 For uncompressed buffered IO, the page status also needs to be
 *		 updated to indicate whether the pending ordered io is finished.
 *		 Can be NULL for direct IO and compressed write.
 *		 For these cases, callers are ensured they won't execute the
 *		 endio function twice.
 * @finish_func: The function to be executed when all the IO of an ordered
 *		 extent are finished.
286
 *
287
 * This function is called for endio, thus the range must have ordered
288
 * extent(s) covering it.
289
 */
290 291 292 293
void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode,
				struct page *page, u64 file_offset,
				u64 num_bytes, btrfs_func_t finish_func,
				bool uptodate)
294
{
295
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
296 297
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_workqueue *wq;
298 299
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
300
	unsigned long flags;
301 302 303 304 305 306 307 308 309 310
	u64 cur = file_offset;

	if (btrfs_is_free_space_inode(inode))
		wq = fs_info->endio_freespace_worker;
	else
		wq = fs_info->endio_write_workers;

	if (page)
		ASSERT(page->mapping && page_offset(page) <= file_offset &&
		       file_offset + num_bytes <= page_offset(page) + PAGE_SIZE);
311

312
	spin_lock_irqsave(&tree->lock, flags);
313 314 315 316 317 318 319 320 321
	while (cur < file_offset + num_bytes) {
		u64 entry_end;
		u64 end;
		u32 len;

		node = tree_search(tree, cur);
		/* No ordered extents at all */
		if (!node)
			break;
322

323 324
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		entry_end = entry->file_offset + entry->num_bytes;
325
		/*
326 327 328
		 * |<-- OE --->|  |
		 *		  cur
		 * Go to next OE.
329
		 */
330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365
		if (cur >= entry_end) {
			node = rb_next(node);
			/* No more ordered extents, exit */
			if (!node)
				break;
			entry = rb_entry(node, struct btrfs_ordered_extent,
					 rb_node);

			/* Go to next ordered extent and continue */
			cur = entry->file_offset;
			continue;
		}
		/*
		 * |	|<--- OE --->|
		 * cur
		 * Go to the start of OE.
		 */
		if (cur < entry->file_offset) {
			cur = entry->file_offset;
			continue;
		}

		/*
		 * Now we are definitely inside one ordered extent.
		 *
		 * |<--- OE --->|
		 *	|
		 *	cur
		 */
		end = min(entry->file_offset + entry->num_bytes,
			  file_offset + num_bytes) - 1;
		ASSERT(end + 1 - cur < U32_MAX);
		len = end + 1 - cur;

		if (page) {
			/*
366 367
			 * Ordered (Private2) bit indicates whether we still
			 * have pending io unfinished for the ordered extent.
368 369 370
			 *
			 * If there's no such bit, we need to skip to next range.
			 */
371
			if (!btrfs_page_test_ordered(fs_info, page, cur, len)) {
372 373 374
				cur += len;
				continue;
			}
375
			btrfs_page_clear_ordered(fs_info, page, cur, len);
376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403
		}

		/* Now we're fine to update the accounting */
		if (unlikely(len > entry->bytes_left)) {
			WARN_ON(1);
			btrfs_crit(fs_info,
"bad ordered extent accounting, root=%llu ino=%llu OE offset=%llu OE len=%llu to_dec=%u left=%llu",
				   inode->root->root_key.objectid,
				   btrfs_ino(inode),
				   entry->file_offset,
				   entry->num_bytes,
				   len, entry->bytes_left);
			entry->bytes_left = 0;
		} else {
			entry->bytes_left -= len;
		}

		if (!uptodate)
			set_bit(BTRFS_ORDERED_IOERR, &entry->flags);

		/*
		 * All the IO of the ordered extent is finished, we need to queue
		 * the finish_func to be executed.
		 */
		if (entry->bytes_left == 0) {
			set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
			cond_wake_up(&entry->wait);
			refcount_inc(&entry->refs);
404
			trace_btrfs_ordered_extent_mark_finished(inode, entry);
405 406 407 408 409 410
			spin_unlock_irqrestore(&tree->lock, flags);
			btrfs_init_work(&entry->work, finish_func, NULL, NULL);
			btrfs_queue_work(wq, &entry->work);
			spin_lock_irqsave(&tree->lock, flags);
		}
		cur += len;
411
	}
412
	spin_unlock_irqrestore(&tree->lock, flags);
413 414
}

415
/*
416 417 418 419 420 421 422 423
 * Finish IO for one ordered extent across a given range.  The range can only
 * contain one ordered extent.
 *
 * @cached:	 The cached ordered extent. If not NULL, we can skip the tree
 *               search and use the ordered extent directly.
 * 		 Will be also used to store the finished ordered extent.
 * @file_offset: File offset for the finished IO
 * @io_size:	 Length of the finish IO range
424
 *
425 426 427 428 429 430
 * Return true if the ordered extent is finished in the range, and update
 * @cached.
 * Return false otherwise.
 *
 * NOTE: The range can NOT cross multiple ordered extents.
 * Thus caller should ensure the range doesn't cross ordered extents.
431
 */
432 433
bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
				    struct btrfs_ordered_extent **cached,
434
				    u64 file_offset, u64 io_size)
435
{
436
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
437
	struct rb_node *node;
438
	struct btrfs_ordered_extent *entry = NULL;
439
	unsigned long flags;
440
	bool finished = false;
441

442 443 444 445 446 447
	spin_lock_irqsave(&tree->lock, flags);
	if (cached && *cached) {
		entry = *cached;
		goto have_entry;
	}

448
	node = tree_search(tree, file_offset);
449
	if (!node)
450
		goto out;
451

452
	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
453
have_entry:
454
	if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
455 456
		goto out;

457
	if (io_size > entry->bytes_left)
458
		btrfs_crit(inode->root->fs_info,
459
			   "bad ordered accounting left %llu size %llu",
460
		       entry->bytes_left, io_size);
461

462
	entry->bytes_left -= io_size;
463

464
	if (entry->bytes_left == 0) {
465 466 467 468 469
		/*
		 * Ensure only one caller can set the flag and finished_ret
		 * accordingly
		 */
		finished = !test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
470 471
		/* test_and_set_bit implies a barrier */
		cond_wake_up_nomb(&entry->wait);
472
	}
473
out:
474
	if (finished && cached && entry) {
475
		*cached = entry;
476
		refcount_inc(&entry->refs);
477
		trace_btrfs_ordered_extent_dec_test_pending(inode, entry);
478
	}
479
	spin_unlock_irqrestore(&tree->lock, flags);
480
	return finished;
481
}
482

483 484 485 486
/*
 * used to drop a reference on an ordered extent.  This will free
 * the extent if the last reference is dropped
 */
487
void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
488
{
489 490 491
	struct list_head *cur;
	struct btrfs_ordered_sum *sum;

492
	trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry);
493

494
	if (refcount_dec_and_test(&entry->refs)) {
495
		ASSERT(list_empty(&entry->root_extent_list));
496
		ASSERT(list_empty(&entry->log_list));
497
		ASSERT(RB_EMPTY_NODE(&entry->rb_node));
498 499
		if (entry->inode)
			btrfs_add_delayed_iput(entry->inode);
500
		while (!list_empty(&entry->list)) {
501 502 503
			cur = entry->list.next;
			sum = list_entry(cur, struct btrfs_ordered_sum, list);
			list_del(&sum->list);
504
			kvfree(sum);
505
		}
506
		kmem_cache_free(btrfs_ordered_extent_cache, entry);
507
	}
508
}
509

510 511
/*
 * remove an ordered extent from the tree.  No references are dropped
512
 * and waiters are woken up.
513
 */
514
void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode,
515
				 struct btrfs_ordered_extent *entry)
516
{
517
	struct btrfs_ordered_inode_tree *tree;
518
	struct btrfs_root *root = btrfs_inode->root;
519
	struct btrfs_fs_info *fs_info = root->fs_info;
520
	struct rb_node *node;
521
	bool pending;
522

523 524 525 526
	/* This is paired with btrfs_add_ordered_extent. */
	spin_lock(&btrfs_inode->lock);
	btrfs_mod_outstanding_extents(btrfs_inode, -1);
	spin_unlock(&btrfs_inode->lock);
527 528 529 530 531 532 533 534 535
	if (root != fs_info->tree_root) {
		u64 release;

		if (test_bit(BTRFS_ORDERED_ENCODED, &entry->flags))
			release = entry->disk_num_bytes;
		else
			release = entry->num_bytes;
		btrfs_delalloc_release_metadata(btrfs_inode, release, false);
	}
536

537 538
	percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes,
				 fs_info->delalloc_batch);
539

540
	tree = &btrfs_inode->ordered_tree;
541
	spin_lock_irq(&tree->lock);
542
	node = &entry->rb_node;
543
	rb_erase(node, &tree->tree);
544
	RB_CLEAR_NODE(node);
545 546
	if (tree->last == node)
		tree->last = NULL;
547
	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
548
	pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags);
549
	spin_unlock_irq(&tree->lock);
550

551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577
	/*
	 * The current running transaction is waiting on us, we need to let it
	 * know that we're complete and wake it up.
	 */
	if (pending) {
		struct btrfs_transaction *trans;

		/*
		 * The checks for trans are just a formality, it should be set,
		 * but if it isn't we don't want to deref/assert under the spin
		 * lock, so be nice and check if trans is set, but ASSERT() so
		 * if it isn't set a developer will notice.
		 */
		spin_lock(&fs_info->trans_lock);
		trans = fs_info->running_transaction;
		if (trans)
			refcount_inc(&trans->use_count);
		spin_unlock(&fs_info->trans_lock);

		ASSERT(trans);
		if (trans) {
			if (atomic_dec_and_test(&trans->pending_ordered))
				wake_up(&trans->pending_wait);
			btrfs_put_transaction(trans);
		}
	}

578
	spin_lock(&root->ordered_extent_lock);
579
	list_del_init(&entry->root_extent_list);
580
	root->nr_ordered_extents--;
581

582
	trace_btrfs_ordered_extent_remove(btrfs_inode, entry);
583

584
	if (!root->nr_ordered_extents) {
585
		spin_lock(&fs_info->ordered_root_lock);
586 587
		BUG_ON(list_empty(&root->ordered_root));
		list_del_init(&root->ordered_root);
588
		spin_unlock(&fs_info->ordered_root_lock);
589 590
	}
	spin_unlock(&root->ordered_extent_lock);
591
	wake_up(&entry->wait);
592 593
}

594
static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
595 596 597 598
{
	struct btrfs_ordered_extent *ordered;

	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
599
	btrfs_start_ordered_extent(ordered, 1);
600 601 602
	complete(&ordered->completion);
}

603 604 605 606
/*
 * wait for all the ordered extents in a root.  This is done when balancing
 * space between drives.
 */
607
u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
608
			       const u64 range_start, const u64 range_len)
609
{
610
	struct btrfs_fs_info *fs_info = root->fs_info;
611 612 613
	LIST_HEAD(splice);
	LIST_HEAD(skipped);
	LIST_HEAD(works);
614
	struct btrfs_ordered_extent *ordered, *next;
615
	u64 count = 0;
616
	const u64 range_end = range_start + range_len;
617

618
	mutex_lock(&root->ordered_extent_mutex);
619 620
	spin_lock(&root->ordered_extent_lock);
	list_splice_init(&root->ordered_extents, &splice);
621
	while (!list_empty(&splice) && nr) {
622 623
		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
					   root_extent_list);
624

625 626
		if (range_end <= ordered->disk_bytenr ||
		    ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) {
627 628 629 630 631
			list_move_tail(&ordered->root_extent_list, &skipped);
			cond_resched_lock(&root->ordered_extent_lock);
			continue;
		}

632 633
		list_move_tail(&ordered->root_extent_list,
			       &root->ordered_extents);
634
		refcount_inc(&ordered->refs);
635
		spin_unlock(&root->ordered_extent_lock);
636

637 638
		btrfs_init_work(&ordered->flush_work,
				btrfs_run_ordered_extent_work, NULL, NULL);
639
		list_add_tail(&ordered->work_list, &works);
640
		btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
641

642
		cond_resched();
643
		spin_lock(&root->ordered_extent_lock);
644
		if (nr != U64_MAX)
645 646
			nr--;
		count++;
647
	}
648
	list_splice_tail(&skipped, &root->ordered_extents);
649
	list_splice_tail(&splice, &root->ordered_extents);
650
	spin_unlock(&root->ordered_extent_lock);
651 652 653 654 655 656 657

	list_for_each_entry_safe(ordered, next, &works, work_list) {
		list_del_init(&ordered->work_list);
		wait_for_completion(&ordered->completion);
		btrfs_put_ordered_extent(ordered);
		cond_resched();
	}
658
	mutex_unlock(&root->ordered_extent_mutex);
659 660

	return count;
661 662
}

663
/*
 * Run btrfs_wait_ordered_extents() on each root linked on
 * fs_info->ordered_roots, passing along @range_start and @range_len.
 *
 * @nr is the overall budget of ordered extents: it is reduced by what each
 * root reports as done, unless it is U64_MAX, and the walk stops once it
 * reaches zero.
 */
void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
			     const u64 range_start, const u64 range_len)
{
	LIST_HEAD(todo);

	mutex_lock(&fs_info->ordered_operations_mutex);
	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &todo);

	while (!list_empty(&todo) && nr) {
		struct btrfs_root *root;
		u64 done;

		root = list_first_entry(&todo, struct btrfs_root, ordered_root);
		root = btrfs_grab_root(root);
		BUG_ON(!root);
		list_move_tail(&root->ordered_root, &fs_info->ordered_roots);
		spin_unlock(&fs_info->ordered_root_lock);

		done = btrfs_wait_ordered_extents(root, nr,
						  range_start, range_len);
		btrfs_put_root(root);

		spin_lock(&fs_info->ordered_root_lock);
		if (nr != U64_MAX)
			nr -= done;
	}

	/* Roots that were not reached go back onto the global list. */
	list_splice_tail(&todo, &fs_info->ordered_roots);
	spin_unlock(&fs_info->ordered_root_lock);
	mutex_unlock(&fs_info->ordered_operations_mutex);
}

698 699 700 701 702 703 704
/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
 * in the extent, and it waits on the io completion code to insert
 * metadata into the btree corresponding to the extent
 */
705
void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait)
706 707
{
	u64 start = entry->file_offset;
708
	u64 end = start + entry->num_bytes - 1;
709
	struct btrfs_inode *inode = BTRFS_I(entry->inode);
710

711
	trace_btrfs_ordered_extent_start(inode, entry);
712

713 714 715
	/*
	 * pages in the range can be dirty, clean or writeback.  We
	 * start IO on any dirty ones so the wait doesn't stall waiting
716
	 * for the flusher thread to find them
717
	 */
718
	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
719
		filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end);
720
	if (wait) {
721 722
		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
						 &entry->flags));
723
	}
724
}
725

726 727 728
/*
 * Used to wait on ordered extents across a large range of bytes.
 */
729
int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
730
{
731
	int ret = 0;
732
	int ret_wb = 0;
733
	u64 end;
734
	u64 orig_end;
735
	struct btrfs_ordered_extent *ordered;
736 737

	if (start + len < start) {
738
		orig_end = INT_LIMIT(loff_t);
739 740
	} else {
		orig_end = start + len - 1;
741 742
		if (orig_end > INT_LIMIT(loff_t))
			orig_end = INT_LIMIT(loff_t);
743
	}
744

745 746 747
	/* start IO across the range first to instantiate any delalloc
	 * extents
	 */
748
	ret = btrfs_fdatawrite_range(inode, start, orig_end);
749 750
	if (ret)
		return ret;
751

752 753 754 755 756 757 758 759
	/*
	 * If we have a writeback error don't return immediately. Wait first
	 * for any ordered extents that haven't completed yet. This is to make
	 * sure no one can dirty the same page ranges and call writepages()
	 * before the ordered extents complete - to avoid failures (-EEXIST)
	 * when adding the new ordered extents to the ordered tree.
	 */
	ret_wb = filemap_fdatawait_range(inode->i_mapping, start, orig_end);
760

761
	end = orig_end;
762
	while (1) {
763
		ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end);
764
		if (!ordered)
765
			break;
766
		if (ordered->file_offset > orig_end) {
767 768 769
			btrfs_put_ordered_extent(ordered);
			break;
		}
770
		if (ordered->file_offset + ordered->num_bytes <= start) {
771 772 773
			btrfs_put_ordered_extent(ordered);
			break;
		}
774
		btrfs_start_ordered_extent(ordered, 1);
775
		end = ordered->file_offset;
776 777 778 779 780
		/*
		 * If the ordered extent had an error save the error but don't
		 * exit without waiting first for all other ordered extents in
		 * the range to complete.
		 */
781 782
		if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
			ret = -EIO;
783
		btrfs_put_ordered_extent(ordered);
784
		if (end == 0 || end == start)
785 786 787
			break;
		end--;
	}
788
	return ret_wb ? ret_wb : ret;
789 790
}

791 792 793 794
/*
 * find an ordered extent corresponding to file_offset.  return NULL if
 * nothing is found, otherwise take a reference on the extent and return it
 */
795
struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode,
796 797 798 799 800
							 u64 file_offset)
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;
801
	unsigned long flags;
802

803
	tree = &inode->ordered_tree;
804
	spin_lock_irqsave(&tree->lock, flags);
805 806 807 808 809
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
810
	if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
811
		entry = NULL;
812
	if (entry) {
813
		refcount_inc(&entry->refs);
814 815
		trace_btrfs_ordered_extent_lookup(inode, entry);
	}
816
out:
817
	spin_unlock_irqrestore(&tree->lock, flags);
818 819 820
	return entry;
}

821 822 823
/* Since the DIO code tries to lock a wide area we need to look for any ordered
 * extents that exist in the range, rather than just the start of the range.
 */
824 825
struct btrfs_ordered_extent *btrfs_lookup_ordered_range(
		struct btrfs_inode *inode, u64 file_offset, u64 len)
826 827 828 829 830
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

831
	tree = &inode->ordered_tree;
832
	spin_lock_irq(&tree->lock);
833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854
	node = tree_search(tree, file_offset);
	if (!node) {
		node = tree_search(tree, file_offset + len);
		if (!node)
			goto out;
	}

	while (1) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			break;

		if (entry->file_offset >= file_offset + len) {
			entry = NULL;
			break;
		}
		entry = NULL;
		node = rb_next(node);
		if (!node)
			break;
	}
out:
855
	if (entry) {
856
		refcount_inc(&entry->refs);
857 858
		trace_btrfs_ordered_extent_lookup_range(inode, entry);
	}
859
	spin_unlock_irq(&tree->lock);
860 861 862
	return entry;
}

863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886
/*
 * Append to @list every ordered extent of @inode that is not yet flagged
 * BTRFS_ORDERED_LOGGED, taking a reference on each.  The tree is walked in
 * order, so the list ends up sorted by the file_offset of the ordered
 * extents.  The inode must be locked by the caller.
 */
void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode,
					   struct list_head *list)
{
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
	struct rb_node *node;

	ASSERT(inode_is_locked(&inode->vfs_inode));

	spin_lock_irq(&tree->lock);
	node = rb_first(&tree->tree);
	while (node) {
		struct btrfs_ordered_extent *ordered =
			rb_entry(node, struct btrfs_ordered_extent, rb_node);

		if (!test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) {
			ASSERT(list_empty(&ordered->log_list));
			list_add_tail(&ordered->log_list, list);
			refcount_inc(&ordered->refs);
			trace_btrfs_ordered_extent_lookup_for_logging(inode,
								      ordered);
		}
		node = rb_next(node);
	}
	spin_unlock_irq(&tree->lock);
}

892 893 894 895
/*
 * lookup and return any extent before 'file_offset'.  NULL is returned
 * if none is found
 */
896
struct btrfs_ordered_extent *
897
btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset)
898 899 900 901 902
{
	struct btrfs_ordered_inode_tree *tree;
	struct rb_node *node;
	struct btrfs_ordered_extent *entry = NULL;

903
	tree = &inode->ordered_tree;
904
	spin_lock_irq(&tree->lock);
905 906 907 908 909
	node = tree_search(tree, file_offset);
	if (!node)
		goto out;

	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
910
	refcount_inc(&entry->refs);
911
	trace_btrfs_ordered_extent_lookup_first(inode, entry);
912
out:
913
	spin_unlock_irq(&tree->lock);
914
	return entry;
915
}
916

917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985
/*
 * Lookup the first ordered extent that overlaps the range
 * [@file_offset, @file_offset + @len).
 *
 * The difference between this and btrfs_lookup_first_ordered_extent() is
 * that this one won't return any ordered extent that does not overlap the range.
 * And the difference against btrfs_lookup_ordered_extent() is, this function
 * ensures the first ordered extent gets returned.
 *
 * The returned extent has an extra reference taken; NULL means nothing in
 * the tree overlaps the range.
 */
struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range(
			struct btrfs_inode *inode, u64 file_offset, u64 len)
{
	struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree;
	struct btrfs_ordered_extent *entry = NULL;
	struct rb_node *node;
	struct rb_node *before;
	struct rb_node *after;

	spin_lock_irq(&tree->lock);
	/*
	 * tree_search() is avoided on purpose: it consults tree->last, which
	 * would disturb the search order.  __tree_search() cannot report the
	 * adjacent ordered extents either, so the descent is done by hand.
	 */
	for (node = tree->tree.rb_node; node; ) {
		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);

		if (file_offset < entry->file_offset)
			node = node->rb_left;
		else if (file_offset >= entry_end(entry))
			node = node->rb_right;
		else
			/* Direct hit: this extent contains @file_offset. */
			goto out;
	}

	/* Empty tree. */
	if (!entry)
		goto out;

	/*
	 * entry is the last extent visited near @file_offset.  Choose the
	 * two neighbours that bracket @file_offset and test each of them.
	 */
	if (entry->file_offset < file_offset) {
		before = &entry->rb_node;
		after = rb_next(before);
	} else {
		after = &entry->rb_node;
		before = rb_prev(after);
	}

	if (before) {
		entry = rb_entry(before, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			goto out;
	}
	if (after) {
		entry = rb_entry(after, struct btrfs_ordered_extent, rb_node);
		if (range_overlaps(entry, file_offset, len))
			goto out;
	}

	/* No ordered extent in the range. */
	entry = NULL;
out:
	if (entry) {
		refcount_inc(&entry->refs);
		trace_btrfs_ordered_extent_lookup_first_range(inode, entry);
	}
	spin_unlock_irq(&tree->lock);

	return entry;
}

995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007
/*
 * btrfs_flush_ordered_range - Lock the passed range and ensures all pending
 * ordered extents in it are run to completion.
 *
 * @inode:        Inode whose ordered tree is to be searched
 * @start:        Beginning of range to flush
 * @end:          Last byte of range to lock
 * @cached_state: If passed, will return the extent state responsible for the
 * locked range. It's the caller's responsibility to free the cached state.
 *
 * This function always returns with the given range locked, ensuring after it's
 * called no order extent can be pending.
 */
1008
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
1009 1010 1011 1012
					u64 end,
					struct extent_state **cached_state)
{
	struct btrfs_ordered_extent *ordered;
1013 1014
	struct extent_state *cache = NULL;
	struct extent_state **cachedp = &cache;
1015 1016

	if (cached_state)
1017
		cachedp = cached_state;
1018 1019

	while (1) {
1020
		lock_extent_bits(&inode->io_tree, start, end, cachedp);
1021 1022
		ordered = btrfs_lookup_ordered_range(inode, start,
						     end - start + 1);
1023 1024 1025 1026 1027 1028 1029
		if (!ordered) {
			/*
			 * If no external cached_state has been passed then
			 * decrement the extra ref taken for cachedp since we
			 * aren't exposing it outside of this function
			 */
			if (!cached_state)
1030
				refcount_dec(&cache->refs);
1031
			break;
1032
		}
1033
		unlock_extent_cached(&inode->io_tree, start, end, cachedp);
1034
		btrfs_start_ordered_extent(ordered, 1);
1035 1036 1037 1038
		btrfs_put_ordered_extent(ordered);
	}
}

1039 1040 1041 1042
/*
 * Add a new ordered extent for a piece that was cut off @ordered.
 *
 * @ordered:     Ordered extent the piece was cut from; its inode, type flags
 *               and compress_type are reused for the new ordered extent
 * @file_offset: File offset where the piece starts
 * @disk_bytenr: Disk bytenr where the piece starts
 * @len:         Length of the piece in bytes
 *
 * The offsets are passed in explicitly instead of being derived from
 * @ordered: by the time this is called @ordered has already been trimmed, so
 * its own file_offset/disk_bytenr no longer describe the start of the
 * original range.
 *
 * Returns the value returned by btrfs_add_ordered_extent().
 */
static int clone_ordered_extent(struct btrfs_ordered_extent *ordered,
				u64 file_offset, u64 disk_bytenr, u64 len)
{
	struct inode *inode = ordered->inode;
	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
	unsigned long flags = ordered->flags & BTRFS_ORDERED_TYPE_FLAGS;

	/*
	 * The splitting extent is already counted and will be added again in
	 * btrfs_add_ordered_extent_*(). Subtract len to avoid double counting.
	 */
	percpu_counter_add_batch(&fs_info->ordered_bytes, -len,
				 fs_info->delalloc_batch);
	/* A compressed ordered extent is not expected to be split. */
	WARN_ON_ONCE(flags & (1 << BTRFS_ORDERED_COMPRESSED));
	return btrfs_add_ordered_extent(BTRFS_I(inode), file_offset, len, len,
					disk_bytenr, len, 0, flags,
					ordered->compress_type);
}

/*
 * Shrink @ordered by cutting @pre bytes off its beginning and @post bytes off
 * its end, then add new ordered extents covering the two removed pieces.
 *
 * @ordered: Ordered extent to split; it keeps the middle part of the range
 * @pre:     Number of bytes to cut off the beginning (0 for none)
 * @post:    Number of bytes to cut off the end (0 for none)
 *
 * Returns 0 if every needed piece was added, otherwise the value returned by
 * clone_ordered_extent() for the first piece that failed. The tail piece is
 * not attempted when adding the front piece failed.
 */
int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
				u64 post)
{
	struct inode *inode = ordered->inode;
	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
	struct rb_node *node;
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	u64 pre_file_offset;
	u64 pre_disk_bytenr;
	u64 post_file_offset;
	u64 post_disk_bytenr;
	int ret = 0;

	trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered);

	spin_lock_irq(&tree->lock);
	/* Remove from tree once */
	node = &ordered->rb_node;
	rb_erase(node, &tree->tree);
	RB_CLEAR_NODE(node);
	if (tree->last == node)
		tree->last = NULL;

	/*
	 * The front piece starts where @ordered starts right now. Remember
	 * that before the start is moved forward by @pre below, otherwise the
	 * front piece would be placed on top of the trimmed @ordered.
	 */
	pre_file_offset = ordered->file_offset;
	pre_disk_bytenr = ordered->disk_bytenr;

	ordered->file_offset += pre;
	ordered->disk_bytenr += pre;
	ordered->num_bytes -= (pre + post);
	ordered->disk_num_bytes -= (pre + post);
	ordered->bytes_left -= (pre + post);

	/* The tail piece starts right behind the trimmed @ordered. */
	post_file_offset = ordered->file_offset + ordered->disk_num_bytes;
	post_disk_bytenr = ordered->disk_bytenr + ordered->disk_num_bytes;

	/* Re-insert the node */
	node = tree_insert(&tree->tree, ordered->file_offset, &ordered->rb_node);
	if (node)
		btrfs_panic(fs_info, -EEXIST,
			"zoned: inconsistency in ordered tree at offset %llu",
			    ordered->file_offset);

	spin_unlock_irq(&tree->lock);

	if (pre)
		ret = clone_ordered_extent(ordered, pre_file_offset,
					   pre_disk_bytenr, pre);
	if (ret == 0 && post)
		ret = clone_ordered_extent(ordered, post_file_offset,
					   post_disk_bytenr, post);

	return ret;
}

1103 1104 1105 1106
/*
 * Create the slab cache that struct btrfs_ordered_extent objects are
 * allocated from.
 *
 * Returns 0 on success, -ENOMEM if the cache could not be created.
 */
int __init ordered_data_init(void)
{
	struct kmem_cache *cache;

	cache = kmem_cache_create("btrfs_ordered_extent",
				  sizeof(struct btrfs_ordered_extent), 0,
				  SLAB_MEM_SPREAD, NULL);
	btrfs_ordered_extent_cache = cache;

	return cache ? 0 : -ENOMEM;
}

1115
/* Destroy the slab cache held in btrfs_ordered_extent_cache. */
void __cold ordered_data_exit(void)
{
	kmem_cache_destroy(btrfs_ordered_extent_cache);
}