disk-io.c 139 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
Chris Mason's avatar
Chris Mason committed
2 3 4 5
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

Chris Mason's avatar
Chris Mason committed
6
#include <linux/fs.h>
7
#include <linux/blkdev.h>
8
#include <linux/radix-tree.h>
9
#include <linux/writeback.h>
10
#include <linux/workqueue.h>
11
#include <linux/kthread.h>
12
#include <linux/slab.h>
13
#include <linux/migrate.h>
14
#include <linux/ratelimit.h>
15
#include <linux/uuid.h>
16
#include <linux/semaphore.h>
17
#include <linux/error-injection.h>
18
#include <linux/crc32c.h>
19
#include <linux/sched/mm.h>
20
#include <asm/unaligned.h>
21
#include <crypto/hash.h>
22 23
#include "ctree.h"
#include "disk-io.h"
24
#include "transaction.h"
25
#include "btrfs_inode.h"
26
#include "bio.h"
27
#include "print-tree.h"
28
#include "locking.h"
29
#include "tree-log.h"
30
#include "free-space-cache.h"
31
#include "free-space-tree.h"
32
#include "dev-replace.h"
David Woodhouse's avatar
David Woodhouse committed
33
#include "raid56.h"
34
#include "sysfs.h"
35
#include "qgroup.h"
36
#include "compression.h"
37
#include "tree-checker.h"
38
#include "ref-verify.h"
39
#include "block-group.h"
40
#include "discard.h"
41
#include "space-info.h"
42
#include "zoned.h"
43
#include "subpage.h"
44
#include "fs.h"
45
#include "accessors.h"
46
#include "extent-tree.h"
47
#include "root-tree.h"
48
#include "defrag.h"
49
#include "uuid-tree.h"
50
#include "relocation.h"
51
#include "scrub.h"
52
#include "super.h"
53

54 55 56 57
#define BTRFS_SUPER_FLAG_SUPP	(BTRFS_HEADER_FLAG_WRITTEN |\
				 BTRFS_HEADER_FLAG_RELOC |\
				 BTRFS_SUPER_FLAG_ERROR |\
				 BTRFS_SUPER_FLAG_SEEDING |\
58 59
				 BTRFS_SUPER_FLAG_METADUMP |\
				 BTRFS_SUPER_FLAG_METADUMP_V2)
60

61 62
static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info);
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info);
63

64 65 66 67 68 69
static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
{
	if (fs_info->csum_shash)
		crypto_free_shash(fs_info->csum_shash);
}

70
/*
71
 * Compute the csum of a btree block and store the result to provided buffer.
72
 */
73
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
74
{
75
	struct btrfs_fs_info *fs_info = buf->fs_info;
76 77
	int num_pages;
	u32 first_page_part;
78
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
79
	char *kaddr;
80
	int i;
81 82 83

	shash->tfm = fs_info->csum_shash;
	crypto_shash_init(shash);
84 85 86 87 88 89 90

	if (buf->addr) {
		/* Pages are contiguous, handle them as a big one. */
		kaddr = buf->addr;
		first_page_part = fs_info->nodesize;
		num_pages = 1;
	} else {
91
		kaddr = folio_address(buf->folios[0]);
92 93 94 95
		first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
		num_pages = num_extent_pages(buf);
	}

96
	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
97
			    first_page_part - BTRFS_CSUM_SIZE);
98

99 100 101 102 103 104
	/*
	 * Multiple single-page folios case would reach here.
	 *
	 * nodesize <= PAGE_SIZE and large folio all handled by above
	 * crypto_shash_update() already.
	 */
105
	for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) {
106
		kaddr = folio_address(buf->folios[i]);
107
		crypto_shash_update(shash, kaddr, PAGE_SIZE);
108
	}
109
	memset(result, 0, BTRFS_CSUM_SIZE);
110
	crypto_shash_final(shash, result);
111 112
}

113 114 115 116 117 118
/*
 * we can't consider a given block up to date unless the transid of the
 * block matches the transid in the parent node's pointer.  This is how we
 * detect blocks that either didn't get written at all or got written
 * in the wrong place.
 */
119
int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, int atomic)
120
{
121
	if (!extent_buffer_uptodate(eb))
122 123
		return 0;

124 125 126
	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
		return 1;

127 128 129
	if (atomic)
		return -EAGAIN;

130 131 132
	if (!extent_buffer_uptodate(eb) ||
	    btrfs_header_generation(eb) != parent_transid) {
		btrfs_err_rl(eb->fs_info,
133 134
"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
			eb->start, eb->read_mirror,
135
			parent_transid, btrfs_header_generation(eb));
136
		clear_extent_buffer_uptodate(eb);
137
		return 0;
138
	}
139
	return 1;
140 141
}

142 143 144 145
static bool btrfs_supported_super_csum(u16 csum_type)
{
	switch (csum_type) {
	case BTRFS_CSUM_TYPE_CRC32:
146
	case BTRFS_CSUM_TYPE_XXHASH:
147
	case BTRFS_CSUM_TYPE_SHA256:
148
	case BTRFS_CSUM_TYPE_BLAKE2:
149 150 151 152 153 154
		return true;
	default:
		return false;
	}
}

155 156 157 158
/*
 * Return 0 if the superblock checksum type matches the checksum value of that
 * algorithm. Pass the raw disk superblock data.
 */
159 160
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
			   const struct btrfs_super_block *disk_sb)
161
{
162
	char result[BTRFS_CSUM_SIZE];
163 164 165
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);

	shash->tfm = fs_info->csum_shash;
166

167 168 169 170 171
	/*
	 * The super_block structure does not span the whole
	 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space is
	 * filled with zeros and is included in the checksum.
	 */
172
	crypto_shash_digest(shash, (const u8 *)disk_sb + BTRFS_CSUM_SIZE,
173
			    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, result);
174

175
	if (memcmp(disk_sb->csum, result, fs_info->csum_size))
176
		return 1;
177

178
	return 0;
179 180
}

181 182 183 184
static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
				      int mirror_num)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
185
	int num_folios = num_extent_folios(eb);
186 187 188 189 190
	int ret = 0;

	if (sb_rdonly(fs_info->sb))
		return -EROFS;

191 192 193
	for (int i = 0; i < num_folios; i++) {
		struct folio *folio = eb->folios[i];
		u64 start = max_t(u64, eb->start, folio_pos(folio));
194
		u64 end = min_t(u64, eb->start + eb->len,
195
				folio_pos(folio) + eb->folio_size);
196
		u32 len = end - start;
197

198
		ret = btrfs_repair_io_failure(fs_info, 0, start, len,
199 200
					      start, folio, offset_in_folio(folio, start),
					      mirror_num);
201 202 203 204 205 206 207
		if (ret)
			break;
	}

	return ret;
}

208 209 210
/*
 * helper to read a given tree block, doing retries as required when
 * the checksums don't match and we have alternate mirrors to try.
211
 *
212 213
 * @check:		expected tree parentness check, see the comments of the
 *			structure for details.
214
 */
215
int btrfs_read_extent_buffer(struct extent_buffer *eb,
216
			     struct btrfs_tree_parent_check *check)
217
{
218
	struct btrfs_fs_info *fs_info = eb->fs_info;
219
	int failed = 0;
220 221 222
	int ret;
	int num_copies = 0;
	int mirror_num = 0;
223
	int failed_mirror = 0;
224

225 226
	ASSERT(check);

227
	while (1) {
228
		clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
229 230 231
		ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check);
		if (!ret)
			break;
232

233
		num_copies = btrfs_num_copies(fs_info,
234
					      eb->start, eb->len);
235
		if (num_copies == 1)
236
			break;
237

238 239 240 241 242
		if (!failed_mirror) {
			failed = 1;
			failed_mirror = eb->read_mirror;
		}

243
		mirror_num++;
244 245 246
		if (mirror_num == failed_mirror)
			mirror_num++;

247
		if (mirror_num > num_copies)
248
			break;
249
	}
250

251
	if (failed && !ret && failed_mirror)
252
		btrfs_repair_eb_io_failure(eb, failed_mirror);
253 254

	return ret;
255
}
256

257 258 259 260
/*
 * Checksum a dirty tree block before IO.
 */
blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
261
{
262
	struct extent_buffer *eb = bbio->private;
263
	struct btrfs_fs_info *fs_info = eb->fs_info;
264
	u64 found_start = btrfs_header_bytenr(eb);
265
	u64 last_trans;
266 267 268
	u8 result[BTRFS_CSUM_SIZE];
	int ret;

269 270 271 272 273 274
	/* Btree blocks are always contiguous on disk. */
	if (WARN_ON_ONCE(bbio->file_offset != eb->start))
		return BLK_STS_IOERR;
	if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len))
		return BLK_STS_IOERR;

275 276 277 278 279
	/*
	 * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't
	 * checksum it but zero-out its content. This is done to preserve
	 * ordering of I/O without unnecessarily writing out data.
	 */
280
	if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) {
281
		memzero_extent_buffer(eb, 0, eb->len);
282 283 284 285 286
		return BLK_STS_OK;
	}

	if (WARN_ON_ONCE(found_start != eb->start))
		return BLK_STS_IOERR;
287 288
	if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0],
					       eb->start, eb->len)))
289 290
		return BLK_STS_IOERR;

291 292 293 294 295 296 297 298
	ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
				    offsetof(struct btrfs_header, fsid),
				    BTRFS_FSID_SIZE) == 0);
	csum_tree_block(eb, result);

	if (btrfs_header_level(eb))
		ret = btrfs_check_node(eb);
	else
299
		ret = btrfs_check_leaf(eb);
300

301 302 303 304 305 306 307
	if (ret < 0)
		goto error;

	/*
	 * Also check the generation, the eb reached here must be newer than
	 * last committed. Or something seriously wrong happened.
	 */
308 309
	last_trans = btrfs_get_last_trans_committed(fs_info);
	if (unlikely(btrfs_header_generation(eb) <= last_trans)) {
310
		ret = -EUCLEAN;
311
		btrfs_err(fs_info,
312
			"block=%llu bad generation, have %llu expect > %llu",
313
			  eb->start, btrfs_header_generation(eb), last_trans);
314
		goto error;
315 316
	}
	write_extent_buffer(eb, result, 0, fs_info->csum_size);
317
	return BLK_STS_OK;
318 319 320 321 322

error:
	btrfs_print_tree(eb, 0);
	btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
		  eb->start);
323 324 325 326 327 328 329 330
	/*
	 * Be noisy if this is an extent buffer from a log tree. We don't abort
	 * a transaction in case there's a bad log tree extent buffer, we just
	 * fallback to a transaction commit. Still we want to know when there is
	 * a bad log tree extent buffer, as that may signal a bug somewhere.
	 */
	WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
		btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID);
331 332 333
	return errno_to_blk_status(ret);
}

334
static bool check_tree_block_fsid(struct extent_buffer *eb)
Yan Zheng's avatar
Yan Zheng committed
335
{
336
	struct btrfs_fs_info *fs_info = eb->fs_info;
337
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
338
	u8 fsid[BTRFS_FSID_SIZE];
Yan Zheng's avatar
Yan Zheng committed
339

340 341
	read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid),
			   BTRFS_FSID_SIZE);
342

343
	/*
344 345 346 347
	 * alloc_fsid_devices() copies the fsid into fs_devices::metadata_uuid.
	 * This is then overwritten by metadata_uuid if it is present in the
	 * device_list_add(). The same true for a seed device as well. So use of
	 * fs_devices::metadata_uuid is appropriate here.
348
	 */
349
	if (memcmp(fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) == 0)
350
		return false;
351 352 353

	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list)
		if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE))
354
			return false;
355

356
	return true;
Yan Zheng's avatar
Yan Zheng committed
357 358
}

359
/* Do basic extent buffer checks at read time */
360 361
int btrfs_validate_extent_buffer(struct extent_buffer *eb,
				 struct btrfs_tree_parent_check *check)
362
{
363
	struct btrfs_fs_info *fs_info = eb->fs_info;
364
	u64 found_start;
365 366
	const u32 csum_size = fs_info->csum_size;
	u8 found_level;
367
	u8 result[BTRFS_CSUM_SIZE];
368
	const u8 *header_csum;
369
	int ret = 0;
370

371 372
	ASSERT(check);

373
	found_start = btrfs_header_bytenr(eb);
374
	if (found_start != eb->start) {
375 376 377
		btrfs_err_rl(fs_info,
			"bad tree block start, mirror %u want %llu have %llu",
			     eb->read_mirror, eb->start, found_start);
378
		ret = -EIO;
379
		goto out;
380
	}
381
	if (check_tree_block_fsid(eb)) {
382 383
		btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u",
			     eb->start, eb->read_mirror);
384
		ret = -EIO;
385
		goto out;
386
	}
387
	found_level = btrfs_header_level(eb);
388
	if (found_level >= BTRFS_MAX_LEVEL) {
389 390 391
		btrfs_err(fs_info,
			"bad tree block level, mirror %u level %d on logical %llu",
			eb->read_mirror, btrfs_header_level(eb), eb->start);
392
		ret = -EIO;
393
		goto out;
394
	}
395

396
	csum_tree_block(eb, result);
397
	header_csum = folio_address(eb->folios[0]) +
398
		get_eb_offset_in_folio(eb, offsetof(struct btrfs_header, csum));
399

400
	if (memcmp(result, header_csum, csum_size) != 0) {
401
		btrfs_warn_rl(fs_info,
402 403
"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d",
			      eb->start, eb->read_mirror,
404
			      CSUM_FMT_VALUE(csum_size, header_csum),
405 406
			      CSUM_FMT_VALUE(csum_size, result),
			      btrfs_header_level(eb));
407
		ret = -EUCLEAN;
408
		goto out;
409 410
	}

411
	if (found_level != check->level) {
412 413 414
		btrfs_err(fs_info,
		"level verify failed on logical %llu mirror %u wanted %u found %u",
			  eb->start, eb->read_mirror, check->level, found_level);
415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452
		ret = -EIO;
		goto out;
	}
	if (unlikely(check->transid &&
		     btrfs_header_generation(eb) != check->transid)) {
		btrfs_err_rl(eb->fs_info,
"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
				eb->start, eb->read_mirror, check->transid,
				btrfs_header_generation(eb));
		ret = -EIO;
		goto out;
	}
	if (check->has_first_key) {
		struct btrfs_key *expect_key = &check->first_key;
		struct btrfs_key found_key;

		if (found_level)
			btrfs_node_key_to_cpu(eb, &found_key, 0);
		else
			btrfs_item_key_to_cpu(eb, &found_key, 0);
		if (unlikely(btrfs_comp_cpu_keys(expect_key, &found_key))) {
			btrfs_err(fs_info,
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
				  eb->start, check->transid,
				  expect_key->objectid,
				  expect_key->type, expect_key->offset,
				  found_key.objectid, found_key.type,
				  found_key.offset);
			ret = -EUCLEAN;
			goto out;
		}
	}
	if (check->owner_root) {
		ret = btrfs_check_eb_owner(eb, check->owner_root);
		if (ret < 0)
			goto out;
	}

453 454 455 456 457
	/*
	 * If this is a leaf block and it is corrupt, set the corrupt bit so
	 * that we don't try and read the other copies of this block, just
	 * return -EIO.
	 */
458
	if (found_level == 0 && btrfs_check_leaf(eb)) {
459 460 461
		set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
		ret = -EIO;
	}
462

463
	if (found_level > 0 && btrfs_check_node(eb))
Liu Bo's avatar
Liu Bo committed
464 465
		ret = -EIO;

466
	if (ret)
467
		btrfs_err(fs_info,
468 469
		"read time tree block corruption detected on logical %llu mirror %u",
			  eb->start, eb->read_mirror);
470 471 472 473
out:
	return ret;
}

Jan Beulich's avatar
Jan Beulich committed
474
#ifdef CONFIG_MIGRATION
475 476
static int btree_migrate_folio(struct address_space *mapping,
		struct folio *dst, struct folio *src, enum migrate_mode mode)
477 478 479 480 481
{
	/*
	 * we can't safely write a btree page from here,
	 * we haven't done the locking hook
	 */
482
	if (folio_test_dirty(src))
483 484 485 486 487
		return -EAGAIN;
	/*
	 * Buffers may be managed in a filesystem specific way.
	 * We must have no buffers or drop them.
	 */
488 489
	if (folio_get_private(src) &&
	    !filemap_release_folio(src, GFP_KERNEL))
490
		return -EAGAIN;
491
	return migrate_folio(mapping, dst, src, mode);
492
}
493 494
#else
#define btree_migrate_folio NULL
Jan Beulich's avatar
Jan Beulich committed
495
#endif
496

497 498 499
static int btree_writepages(struct address_space *mapping,
			    struct writeback_control *wbc)
{
500 501
	int ret;

502
	if (wbc->sync_mode == WB_SYNC_NONE) {
503
		struct btrfs_fs_info *fs_info;
504 505 506 507

		if (wbc->for_kupdate)
			return 0;

508
		fs_info = inode_to_fs_info(mapping->host);
509
		/* this is a bit racy, but that's ok */
510 511 512
		ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
					     BTRFS_DIRTY_METADATA_THRESH,
					     fs_info->dirty_metadata_batch);
513
		if (ret < 0)
514 515
			return 0;
	}
516
	return btree_write_cache_pages(mapping, wbc);
517 518
}

519
static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags)
520
{
521 522
	if (folio_test_writeback(folio) || folio_test_dirty(folio))
		return false;
523

524
	return try_release_extent_buffer(&folio->page);
525 526
}

527 528
static void btree_invalidate_folio(struct folio *folio, size_t offset,
				 size_t length)
529
{
530
	struct extent_io_tree *tree;
531 532

	tree = &folio_to_inode(folio)->io_tree;
533
	extent_invalidate_folio(tree, folio, offset);
534
	btree_release_folio(folio, GFP_NOFS);
535
	if (folio_get_private(folio)) {
536
		btrfs_warn(folio_to_fs_info(folio),
537 538 539
			   "folio private not zero on folio %llu",
			   (unsigned long long)folio_pos(folio));
		folio_detach_private(folio);
540
	}
541 542
}

543
#ifdef DEBUG
544 545 546
static bool btree_dirty_folio(struct address_space *mapping,
		struct folio *folio)
{
547
	struct btrfs_fs_info *fs_info = inode_to_fs_info(mapping->host);
548
	struct btrfs_subpage_info *spi = fs_info->subpage_info;
549
	struct btrfs_subpage *subpage;
550
	struct extent_buffer *eb;
551
	int cur_bit = 0;
552
	u64 page_start = folio_pos(folio);
553 554

	if (fs_info->sectorsize == PAGE_SIZE) {
555
		eb = folio_get_private(folio);
556 557 558
		BUG_ON(!eb);
		BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
		BUG_ON(!atomic_read(&eb->refs));
559
		btrfs_assert_tree_write_locked(eb);
560
		return filemap_dirty_folio(mapping, folio);
561
	}
562 563

	ASSERT(spi);
564
	subpage = folio_get_private(folio);
565

566 567 568
	for (cur_bit = spi->dirty_offset;
	     cur_bit < spi->dirty_offset + spi->bitmap_nr_bits;
	     cur_bit++) {
569 570 571 572
		unsigned long flags;
		u64 cur;

		spin_lock_irqsave(&subpage->lock, flags);
573
		if (!test_bit(cur_bit, subpage->bitmaps)) {
574 575 576 577 578
			spin_unlock_irqrestore(&subpage->lock, flags);
			continue;
		}
		spin_unlock_irqrestore(&subpage->lock, flags);
		cur = page_start + cur_bit * fs_info->sectorsize;
579

580 581 582 583
		eb = find_extent_buffer(fs_info, cur);
		ASSERT(eb);
		ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
		ASSERT(atomic_read(&eb->refs));
584
		btrfs_assert_tree_write_locked(eb);
585 586
		free_extent_buffer(eb);

587
		cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits) - 1;
588
	}
589
	return filemap_dirty_folio(mapping, folio);
590
}
591 592 593
#else
#define btree_dirty_folio filemap_dirty_folio
#endif
594

595
static const struct address_space_operations btree_aops = {
596
	.writepages	= btree_writepages,
597
	.release_folio	= btree_release_folio,
598
	.invalidate_folio = btree_invalidate_folio,
599 600
	.migrate_folio	= btree_migrate_folio,
	.dirty_folio	= btree_dirty_folio,
601 602
};

603 604
struct extent_buffer *btrfs_find_create_tree_block(
						struct btrfs_fs_info *fs_info,
605 606
						u64 bytenr, u64 owner_root,
						int level)
607
{
608 609
	if (btrfs_is_testing(fs_info))
		return alloc_test_extent_buffer(fs_info, bytenr);
610
	return alloc_extent_buffer(fs_info, bytenr, owner_root, level);
611 612
}

613 614 615 616
/*
 * Read tree block at logical address @bytenr and do variant basic but critical
 * verification.
 *
617 618
 * @check:		expected tree parentness check, see comments of the
 *			structure for details.
619
 */
620
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
621
				      struct btrfs_tree_parent_check *check)
622 623 624 625
{
	struct extent_buffer *buf = NULL;
	int ret;

626 627 628 629
	ASSERT(check);

	buf = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root,
					   check->level);
630 631
	if (IS_ERR(buf))
		return buf;
632

633
	ret = btrfs_read_extent_buffer(buf, check);
634
	if (ret) {
635
		free_extent_buffer_stale(buf);
636
		return ERR_PTR(ret);
637
	}
638
	if (btrfs_check_eb_owner(buf, check->owner_root)) {
639 640 641
		free_extent_buffer_stale(buf);
		return ERR_PTR(-EUCLEAN);
	}
642
	return buf;
643

644 645
}

646
static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
647
			 u64 objectid)
648
{
649
	bool dummy = btrfs_is_testing(fs_info);
650 651 652 653

	memset(&root->root_key, 0, sizeof(root->root_key));
	memset(&root->root_item, 0, sizeof(root->root_item));
	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
654
	root->fs_info = fs_info;
655
	root->root_key.objectid = objectid;
Chris Mason's avatar
Chris Mason committed
656
	root->node = NULL;
657
	root->commit_root = NULL;
658
	root->state = 0;
659
	RB_CLEAR_NODE(&root->rb_node);
660

661
	root->last_trans = 0;
662
	root->free_objectid = 0;
663
	root->nr_delalloc_inodes = 0;
664
	root->nr_ordered_extents = 0;
665
	root->inode_tree = RB_ROOT;
666
	xa_init(&root->delayed_nodes);
667 668

	btrfs_init_root_block_rsv(root);
669 670

	INIT_LIST_HEAD(&root->dirty_list);
671
	INIT_LIST_HEAD(&root->root_list);
672 673
	INIT_LIST_HEAD(&root->delalloc_inodes);
	INIT_LIST_HEAD(&root->delalloc_root);
674 675
	INIT_LIST_HEAD(&root->ordered_extents);
	INIT_LIST_HEAD(&root->ordered_root);
676
	INIT_LIST_HEAD(&root->reloc_dirty_list);
677
	spin_lock_init(&root->inode_lock);
678
	spin_lock_init(&root->delalloc_lock);
679
	spin_lock_init(&root->ordered_extent_lock);
680
	spin_lock_init(&root->accounting_lock);
681
	spin_lock_init(&root->qgroup_meta_rsv_lock);
682
	mutex_init(&root->objectid_mutex);
683
	mutex_init(&root->log_mutex);
684
	mutex_init(&root->ordered_extent_mutex);
685
	mutex_init(&root->delalloc_mutex);
686
	init_waitqueue_head(&root->qgroup_flush_wait);
687 688 689
	init_waitqueue_head(&root->log_writer_wait);
	init_waitqueue_head(&root->log_commit_wait[0]);
	init_waitqueue_head(&root->log_commit_wait[1]);
690 691
	INIT_LIST_HEAD(&root->log_ctxs[0]);
	INIT_LIST_HEAD(&root->log_ctxs[1]);
692 693 694
	atomic_set(&root->log_commit[0], 0);
	atomic_set(&root->log_commit[1], 0);
	atomic_set(&root->log_writers, 0);
695
	atomic_set(&root->log_batch, 0);
696
	refcount_set(&root->refs, 1);
697
	atomic_set(&root->snapshot_force_cow, 0);
698
	atomic_set(&root->nr_swapfiles, 0);
699
	btrfs_set_root_log_transid(root, 0);
700
	root->log_transid_committed = -1;
701
	btrfs_set_root_last_log_commit(root, 0);
702
	root->anon_dev = 0;
703
	if (!dummy) {
704
		extent_io_tree_init(fs_info, &root->dirty_log_pages,
705
				    IO_TREE_ROOT_DIRTY_LOG_PAGES);
706
		extent_io_tree_init(fs_info, &root->log_csum_range,
707
				    IO_TREE_LOG_CSUM_RANGE);
708
	}
709

710
	spin_lock_init(&root->root_item_lock);
711
	btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
712 713
#ifdef CONFIG_BTRFS_DEBUG
	INIT_LIST_HEAD(&root->leak_list);
714
	spin_lock(&fs_info->fs_roots_radix_lock);
715
	list_add_tail(&root->leak_list, &fs_info->allocated_roots);
716
	spin_unlock(&fs_info->fs_roots_radix_lock);
717
#endif
718 719
}

720
static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info,
721
					   u64 objectid, gfp_t flags)
722
{
723
	struct btrfs_root *root = kzalloc(sizeof(*root), flags);
724
	if (root)
725
		__setup_root(root, fs_info, objectid);
726 727 728
	return root;
}

729 730
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/* Should only be used by the testing infrastructure */
731
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info)
732 733 734
{
	struct btrfs_root *root;

735 736 737
	if (!fs_info)
		return ERR_PTR(-EINVAL);

738
	root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID, GFP_KERNEL);
739 740
	if (!root)
		return ERR_PTR(-ENOMEM);
741

742
	/* We don't use the stripesize in selftest, set it as sectorsize */
743
	root->alloc_bytenr = 0;
744 745 746 747 748

	return root;
}
#endif

749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768
static int global_root_cmp(struct rb_node *a_node, const struct rb_node *b_node)
{
	const struct btrfs_root *a = rb_entry(a_node, struct btrfs_root, rb_node);
	const struct btrfs_root *b = rb_entry(b_node, struct btrfs_root, rb_node);

	return btrfs_comp_cpu_keys(&a->root_key, &b->root_key);
}

static int global_root_key_cmp(const void *k, const struct rb_node *node)
{
	const struct btrfs_key *key = k;
	const struct btrfs_root *root = rb_entry(node, struct btrfs_root, rb_node);

	return btrfs_comp_cpu_keys(key, &root->root_key);
}

int btrfs_global_root_insert(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct rb_node *tmp;
769
	int ret = 0;
770 771 772 773 774

	write_lock(&fs_info->global_root_lock);
	tmp = rb_find_add(&root->rb_node, &fs_info->global_root_tree, global_root_cmp);
	write_unlock(&fs_info->global_root_lock);

775 776 777
	if (tmp) {
		ret = -EEXIST;
		btrfs_warn(fs_info, "global root %llu %llu already exists",
778
			   btrfs_root_id(root), root->root_key.offset);
779 780
	}
	return ret;
781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806
}

void btrfs_global_root_delete(struct btrfs_root *root)
{
	struct btrfs_fs_info *fs_info = root->fs_info;

	write_lock(&fs_info->global_root_lock);
	rb_erase(&root->rb_node, &fs_info->global_root_tree);
	write_unlock(&fs_info->global_root_lock);
}

struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
				     struct btrfs_key *key)
{
	struct rb_node *node;
	struct btrfs_root *root = NULL;

	read_lock(&fs_info->global_root_lock);
	node = rb_find(key, &fs_info->global_root_tree, global_root_key_cmp);
	if (node)
		root = container_of(node, struct btrfs_root, rb_node);
	read_unlock(&fs_info->global_root_lock);

	return root;
}

807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827
static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_block_group *block_group;
	u64 ret;

	if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
		return 0;

	if (bytenr)
		block_group = btrfs_lookup_block_group(fs_info, bytenr);
	else
		block_group = btrfs_lookup_first_block_group(fs_info, bytenr);
	ASSERT(block_group);
	if (!block_group)
		return 0;
	ret = block_group->global_root_id;
	btrfs_put_block_group(block_group);

	return ret;
}

828 829 830 831 832
struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_key key = {
		.objectid = BTRFS_CSUM_TREE_OBJECTID,
		.type = BTRFS_ROOT_ITEM_KEY,
833
		.offset = btrfs_global_root_id(fs_info, bytenr),
834 835 836 837 838 839 840 841 842 843
	};

	return btrfs_global_root(fs_info, &key);
}

struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
{
	struct btrfs_key key = {
		.objectid = BTRFS_EXTENT_TREE_OBJECTID,
		.type = BTRFS_ROOT_ITEM_KEY,
844
		.offset = btrfs_global_root_id(fs_info, bytenr),
845 846 847 848 849
	};

	return btrfs_global_root(fs_info, &key);
}

850 851 852 853 854 855 856
struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
{
	if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
		return fs_info->block_group_root;
	return btrfs_extent_root(fs_info, 0);
}

857 858 859
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
				     u64 objectid)
{
860
	struct btrfs_fs_info *fs_info = trans->fs_info;
861 862 863 864
	struct extent_buffer *leaf;
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root;
	struct btrfs_key key;
865
	unsigned int nofs_flag;
866 867
	int ret = 0;

868 869 870 871 872
	/*
	 * We're holding a transaction handle, so use a NOFS memory allocation
	 * context to avoid deadlock if reclaim happens.
	 */
	nofs_flag = memalloc_nofs_save();
873
	root = btrfs_alloc_root(fs_info, objectid, GFP_KERNEL);
874
	memalloc_nofs_restore(nofs_flag);
875 876 877 878 879 880 881
	if (!root)
		return ERR_PTR(-ENOMEM);

	root->root_key.objectid = objectid;
	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
	root->root_key.offset = 0;

882
	leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
883
				      0, BTRFS_NESTING_NORMAL);
884 885
	if (IS_ERR(leaf)) {
		ret = PTR_ERR(leaf);
886
		leaf = NULL;
887
		goto fail;
888 889 890
	}

	root->node = leaf;
891
	btrfs_mark_buffer_dirty(trans, leaf);
892 893

	root->commit_root = btrfs_root_node(root);
894
	set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
895

896 897
	btrfs_set_root_flags(&root->root_item, 0);
	btrfs_set_root_limit(&root->root_item, 0);
898 899 900 901 902 903 904
	btrfs_set_root_bytenr(&root->root_item, leaf->start);
	btrfs_set_root_generation(&root->root_item, trans->transid);
	btrfs_set_root_level(&root->root_item, 0);
	btrfs_set_root_refs(&root->root_item, 1);
	btrfs_set_root_used(&root->root_item, leaf->len);
	btrfs_set_root_last_snapshot(&root->root_item, 0);
	btrfs_set_root_dirid(&root->root_item, 0);
905
	if (is_fstree(objectid))
906 907 908
		generate_random_guid(root->root_item.uuid);
	else
		export_guid(root->root_item.uuid, &guid_null);
909
	btrfs_set_root_drop_level(&root->root_item, 0);
910

911 912
	btrfs_tree_unlock(leaf);

913 914 915 916 917 918 919
	key.objectid = objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = 0;
	ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
	if (ret)
		goto fail;

920 921
	return root;

922
fail:
923
	btrfs_put_root(root);
924

925
	return ERR_PTR(ret);
926 927
}

928 929
static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
					 struct btrfs_fs_info *fs_info)
930 931
{
	struct btrfs_root *root;
932

933
	root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID, GFP_NOFS);
934
	if (!root)
935
		return ERR_PTR(-ENOMEM);
936 937 938 939

	root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID;
	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
	root->root_key.offset = BTRFS_TREE_LOG_OBJECTID;
940

941 942 943 944 945 946 947 948
	return root;
}

int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root)
{
	struct extent_buffer *leaf;

949
	/*
950
	 * DON'T set SHAREABLE bit for log trees.
951
	 *
952 953 954 955 956
	 * Log trees are not exposed to user space thus can't be snapshotted,
	 * and they go away before a real commit is actually done.
	 *
	 * They do store pointers to file data extents, and those reference
	 * counts still get updated (along with back refs to the log tree).
957
	 */
958

959
	leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID,
960
			NULL, 0, 0, 0, 0, BTRFS_NESTING_NORMAL);
961 962
	if (IS_ERR(leaf))
		return PTR_ERR(leaf);
963

964
	root->node = leaf;
965

966
	btrfs_mark_buffer_dirty(trans, root->node);
967
	btrfs_tree_unlock(root->node);
968 969

	return 0;
970 971 972 973 974 975 976 977 978 979
}

int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
			     struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *log_root;

	log_root = alloc_log_tree(trans, fs_info);
	if (IS_ERR(log_root))
		return PTR_ERR(log_root);
980

981 982 983 984 985 986 987
	if (!btrfs_is_zoned(fs_info)) {
		int ret = btrfs_alloc_log_tree_node(trans, log_root);

		if (ret) {
			btrfs_put_root(log_root);
			return ret;
		}
988 989
	}

990 991 992 993 994 995 996 997
	WARN_ON(fs_info->log_root_tree);
	fs_info->log_root_tree = log_root;
	return 0;
}

int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root)
{
998
	struct btrfs_fs_info *fs_info = root->fs_info;
999 1000
	struct btrfs_root *log_root;
	struct btrfs_inode_item *inode_item;
1001
	int ret;
1002

1003
	log_root = alloc_log_tree(trans, fs_info);
1004 1005 1006
	if (IS_ERR(log_root))
		return PTR_ERR(log_root);

1007 1008 1009 1010 1011 1012
	ret = btrfs_alloc_log_tree_node(trans, log_root);
	if (ret) {
		btrfs_put_root(log_root);
		return ret;
	}

1013
	log_root->last_trans = trans->transid;
1014
	log_root->root_key.offset = btrfs_root_id(root);
1015 1016

	inode_item = &log_root->root_item.inode;
1017 1018 1019
	btrfs_set_stack_inode_generation(inode_item, 1);
	btrfs_set_stack_inode_size(inode_item, 3);
	btrfs_set_stack_inode_nlink(inode_item, 1);
1020
	btrfs_set_stack_inode_nbytes(inode_item,
1021
				     fs_info->nodesize);
1022
	btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
1023

1024
	btrfs_set_root_node(&log_root->root_item, log_root->node);
1025 1026 1027

	WARN_ON(root->log_root);
	root->log_root = log_root;
1028
	btrfs_set_root_log_transid(root, 0);
1029
	root->log_transid_committed = -1;
1030
	btrfs_set_root_last_log_commit(root, 0);
1031 1032 1033
	return 0;
}

1034 1035 1036
static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
					      struct btrfs_path *path,
					      struct btrfs_key *key)
1037 1038
{
	struct btrfs_root *root;
1039
	struct btrfs_tree_parent_check check = { 0 };
1040
	struct btrfs_fs_info *fs_info = tree_root->fs_info;
1041
	u64 generation;
1042
	int ret;
1043
	int level;
1044

1045
	root = btrfs_alloc_root(fs_info, key->objectid, GFP_NOFS);
1046 1047
	if (!root)
		return ERR_PTR(-ENOMEM);
1048

1049 1050
	ret = btrfs_find_root(tree_root, key, path,
			      &root->root_item, &root->root_key);
1051
	if (ret) {
1052 1053
		if (ret > 0)
			ret = -ENOENT;
1054
		goto fail;
1055
	}
1056

1057
	generation = btrfs_root_generation(&root->root_item);
1058
	level = btrfs_root_level(&root->root_item);
1059 1060 1061 1062 1063
	check.level = level;
	check.transid = generation;
	check.owner_root = key->objectid;
	root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item),
				     &check);
1064 1065
	if (IS_ERR(root->node)) {
		ret = PTR_ERR(root->node);
1066
		root->node = NULL;
1067
		goto fail;
1068 1069
	}
	if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
1070
		ret = -EIO;
1071
		goto fail;
1072
	}
1073 1074 1075 1076 1077

	/*
	 * For real fs, and not log/reloc trees, root owner must
	 * match its root node owner
	 */
1078
	if (!btrfs_is_testing(fs_info) &&
1079 1080 1081
	    btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
	    btrfs_root_id(root) != BTRFS_TREE_RELOC_OBJECTID &&
	    btrfs_root_id(root) != btrfs_header_owner(root->node)) {
1082 1083
		btrfs_crit(fs_info,
"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
1084
			   btrfs_root_id(root), root->node->start,
1085
			   btrfs_header_owner(root->node),
1086
			   btrfs_root_id(root));
1087 1088 1089
		ret = -EUCLEAN;
		goto fail;
	}
1090
	root->commit_root = btrfs_root_node(root);
1091
	return root;
1092
fail:
1093
	btrfs_put_root(root);
1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109
	return ERR_PTR(ret);
}

struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
					struct btrfs_key *key)
{
	struct btrfs_root *root;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return ERR_PTR(-ENOMEM);
	root = read_tree_root_path(tree_root, path, key);
	btrfs_free_path(path);

	return root;
1110 1111
}

1112 1113 1114 1115 1116 1117
/*
 * Initialize subvolume root in-memory structure
 *
 * @anon_dev:	anonymous device to attach to the root, if zero, allocate new
 */
static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev)
1118 1119 1120
{
	int ret;

1121
	btrfs_drew_lock_init(&root->snapshot_lock);
1122

1123
	if (btrfs_root_id(root) != BTRFS_TREE_LOG_OBJECTID &&
1124
	    !btrfs_is_data_reloc_root(root) &&
1125
	    is_fstree(btrfs_root_id(root))) {
1126
		set_bit(BTRFS_ROOT_SHAREABLE, &root->state);
1127 1128 1129
		btrfs_check_and_init_root_item(&root->root_item);
	}

1130 1131 1132 1133
	/*
	 * Don't assign anonymous block device to roots that are not exposed to
	 * userspace, the id pool is limited to 1M
	 */
1134
	if (is_fstree(btrfs_root_id(root)) &&
1135
	    btrfs_root_refs(&root->root_item) > 0) {
1136 1137 1138 1139 1140 1141 1142
		if (!anon_dev) {
			ret = get_anon_bdev(&root->anon_dev);
			if (ret)
				goto fail;
		} else {
			root->anon_dev = anon_dev;
		}
1143
	}
1144 1145

	mutex_lock(&root->objectid_mutex);
1146
	ret = btrfs_init_root_free_objectid(root);
1147 1148
	if (ret) {
		mutex_unlock(&root->objectid_mutex);
Liu Bo's avatar
Liu Bo committed
1149
		goto fail;
1150 1151
	}

1152
	ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);
1153 1154 1155

	mutex_unlock(&root->objectid_mutex);

1156 1157
	return 0;
fail:
1158
	/* The caller is responsible to call btrfs_free_fs_root */
1159 1160 1161
	return ret;
}

1162 1163
static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
					       u64 root_id)
1164 1165 1166
{
	struct btrfs_root *root;

1167 1168 1169
	spin_lock(&fs_info->fs_roots_radix_lock);
	root = radix_tree_lookup(&fs_info->fs_roots_radix,
				 (unsigned long)root_id);
1170
	root = btrfs_grab_root(root);
1171
	spin_unlock(&fs_info->fs_roots_radix_lock);
1172 1173 1174
	return root;
}

1175 1176 1177
static struct btrfs_root *btrfs_get_global_root(struct btrfs_fs_info *fs_info,
						u64 objectid)
{
1178 1179 1180 1181 1182 1183
	struct btrfs_key key = {
		.objectid = objectid,
		.type = BTRFS_ROOT_ITEM_KEY,
		.offset = 0,
	};

1184 1185
	switch (objectid) {
	case BTRFS_ROOT_TREE_OBJECTID:
1186
		return btrfs_grab_root(fs_info->tree_root);
1187
	case BTRFS_EXTENT_TREE_OBJECTID:
1188
		return btrfs_grab_root(btrfs_global_root(fs_info, &key));
1189
	case BTRFS_CHUNK_TREE_OBJECTID:
1190
		return btrfs_grab_root(fs_info->chunk_root);
1191
	case BTRFS_DEV_TREE_OBJECTID:
1192
		return btrfs_grab_root(fs_info->dev_root);
1193
	case BTRFS_CSUM_TREE_OBJECTID:
1194
		return btrfs_grab_root(btrfs_global_root(fs_info, &key));
1195
	case BTRFS_QUOTA_TREE_OBJECTID:
1196
		return btrfs_grab_root(fs_info->quota_root);
1197
	case BTRFS_UUID_TREE_OBJECTID:
1198
		return btrfs_grab_root(fs_info->uuid_root);
1199
	case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
1200
		return btrfs_grab_root(fs_info->block_group_root);
1201
	case BTRFS_FREE_SPACE_TREE_OBJECTID:
1202
		return btrfs_grab_root(btrfs_global_root(fs_info, &key));
1203 1204
	case BTRFS_RAID_STRIPE_TREE_OBJECTID:
		return btrfs_grab_root(fs_info->stripe_root);
1205 1206 1207
	default:
		return NULL;
	}
1208 1209
}

1210 1211 1212 1213 1214
int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
			 struct btrfs_root *root)
{
	int ret;

1215 1216 1217 1218 1219 1220
	ret = radix_tree_preload(GFP_NOFS);
	if (ret)
		return ret;

	spin_lock(&fs_info->fs_roots_radix_lock);
	ret = radix_tree_insert(&fs_info->fs_roots_radix,
1221
				(unsigned long)btrfs_root_id(root),
1222
				root);
1223
	if (ret == 0) {
1224
		btrfs_grab_root(root);
1225
		set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
1226
	}
1227 1228
	spin_unlock(&fs_info->fs_roots_radix_lock);
	radix_tree_preload_end();
1229 1230 1231 1232

	return ret;
}

1233 1234 1235 1236 1237 1238
void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info)
{
#ifdef CONFIG_BTRFS_DEBUG
	struct btrfs_root *root;

	while (!list_empty(&fs_info->allocated_roots)) {
1239 1240
		char buf[BTRFS_ROOT_NAME_BUF_LEN];

1241 1242
		root = list_first_entry(&fs_info->allocated_roots,
					struct btrfs_root, leak_list);
1243
		btrfs_err(fs_info, "leaked root %s refcount %d",
1244
			  btrfs_root_name(&root->root_key, buf),
1245
			  refcount_read(&root->refs));
1246
		WARN_ON_ONCE(1);
1247
		while (refcount_read(&root->refs) > 1)
1248 1249
			btrfs_put_root(root);
		btrfs_put_root(root);
1250 1251 1252 1253
	}
#endif
}

1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265
static void free_global_roots(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root;
	struct rb_node *node;

	while ((node = rb_first_postorder(&fs_info->global_root_tree)) != NULL) {
		root = rb_entry(node, struct btrfs_root, rb_node);
		rb_erase(&root->rb_node, &fs_info->global_root_tree);
		btrfs_put_root(root);
	}
}

1266 1267
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
{
1268 1269
	struct percpu_counter *em_counter = &fs_info->evictable_extent_maps;

1270 1271
	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
	percpu_counter_destroy(&fs_info->delalloc_bytes);
1272
	percpu_counter_destroy(&fs_info->ordered_bytes);
1273 1274 1275
	if (percpu_counter_initialized(em_counter))
		ASSERT(percpu_counter_sum_positive(em_counter) == 0);
	percpu_counter_destroy(em_counter);
1276 1277 1278 1279
	percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
	btrfs_free_csum_hash(fs_info);
	btrfs_free_stripe_hash_table(fs_info);
	btrfs_free_ref_cache(fs_info);
1280 1281
	kfree(fs_info->balance_ctl);
	kfree(fs_info->delayed_root);
1282
	free_global_roots(fs_info);
1283 1284 1285 1286 1287 1288
	btrfs_put_root(fs_info->tree_root);
	btrfs_put_root(fs_info->chunk_root);
	btrfs_put_root(fs_info->dev_root);
	btrfs_put_root(fs_info->quota_root);
	btrfs_put_root(fs_info->uuid_root);
	btrfs_put_root(fs_info->fs_root);
1289
	btrfs_put_root(fs_info->data_reloc_root);
1290
	btrfs_put_root(fs_info->block_group_root);
1291
	btrfs_put_root(fs_info->stripe_root);
1292
	btrfs_check_leaked_roots(fs_info);
1293
	btrfs_extent_buffer_leak_debug_check(fs_info);
1294 1295
	kfree(fs_info->super_copy);
	kfree(fs_info->super_for_commit);
1296
	kfree(fs_info->subpage_info);
1297 1298 1299 1300
	kvfree(fs_info);
}


1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314
/*
 * Get an in-memory reference of a root structure.
 *
 * For essential trees like root/extent tree, we grab it from fs_info directly.
 * For subvolume trees, we check the cached filesystem roots first. If not
 * found, then read it from disk and add it to cached fs roots.
 *
 * Caller should release the root by calling btrfs_put_root() after the usage.
 *
 * NOTE: Reloc and log trees can't be read by this function as they share the
 *	 same root objectid.
 *
 * @objectid:	root id
 * @anon_dev:	preallocated anonymous block device number for new roots,
1315
 *		pass NULL for a new allocation.
1316 1317 1318 1319
 * @check_ref:	whether to check root item references, If true, return -ENOENT
 *		for orphan roots
 */
static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
1320
					     u64 objectid, dev_t *anon_dev,
1321
					     bool check_ref)
1322 1323
{
	struct btrfs_root *root;
1324
	struct btrfs_path *path;
1325
	struct btrfs_key key;
1326 1327
	int ret;

1328 1329 1330
	root = btrfs_get_global_root(fs_info, objectid);
	if (root)
		return root;
1331 1332 1333 1334 1335 1336 1337 1338 1339 1340

	/*
	 * If we're called for non-subvolume trees, and above function didn't
	 * find one, do not try to read it from disk.
	 *
	 * This is namely for free-space-tree and quota tree, which can change
	 * at runtime and should only be grabbed from fs_info.
	 */
	if (!is_fstree(objectid) && objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
		return ERR_PTR(-ENOENT);
1341
again:
1342
	root = btrfs_lookup_fs_root(fs_info, objectid);
1343
	if (root) {
1344 1345 1346 1347 1348 1349
		/*
		 * Some other caller may have read out the newly inserted
		 * subvolume already (for things like backref walk etc).  Not
		 * that common but still possible.  In that case, we just need
		 * to free the anon_dev.
		 */
1350 1351 1352
		if (unlikely(anon_dev && *anon_dev)) {
			free_anon_bdev(*anon_dev);
			*anon_dev = 0;
1353 1354
		}

1355
		if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1356
			btrfs_put_root(root);
1357
			return ERR_PTR(-ENOENT);
1358
		}
1359
		return root;
1360
	}
1361

1362 1363 1364 1365
	key.objectid = objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;
	root = btrfs_read_tree_root(fs_info->tree_root, &key);
1366 1367
	if (IS_ERR(root))
		return root;
1368

1369
	if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
1370
		ret = -ENOENT;
1371
		goto fail;
1372
	}
1373

1374
	ret = btrfs_init_fs_root(root, anon_dev ? *anon_dev : 0);
1375 1376
	if (ret)
		goto fail;
1377

1378 1379 1380 1381 1382
	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto fail;
	}
1383 1384
	key.objectid = BTRFS_ORPHAN_OBJECTID;
	key.type = BTRFS_ORPHAN_ITEM_KEY;
1385
	key.offset = objectid;
1386 1387

	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
1388
	btrfs_free_path(path);
1389 1390 1391
	if (ret < 0)
		goto fail;
	if (ret == 0)
1392
		set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state);
1393

1394
	ret = btrfs_insert_fs_root(fs_info, root);
1395
	if (ret) {
1396 1397
		if (ret == -EEXIST) {
			btrfs_put_root(root);
1398
			goto again;
1399
		}
1400
		goto fail;
1401
	}
1402
	return root;
1403
fail:
1404 1405
	/*
	 * If our caller provided us an anonymous device, then it's his
1406
	 * responsibility to free it in case we fail. So we have to set our
1407 1408 1409
	 * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
	 * and once again by our caller.
	 */
1410
	if (anon_dev && *anon_dev)
1411
		root->anon_dev = 0;
1412
	btrfs_put_root(root);
1413
	return ERR_PTR(ret);
1414 1415
}

1416 1417 1418 1419 1420 1421 1422 1423 1424 1425
/*
 * Get in-memory reference of a root structure
 *
 * @objectid:	tree objectid
 * @check_ref:	if set, verify that the tree exists and the item has at least
 *		one reference
 */
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
				     u64 objectid, bool check_ref)
{
1426
	return btrfs_get_root_ref(fs_info, objectid, NULL, check_ref);
1427 1428 1429 1430 1431 1432 1433
}

/*
 * Get in-memory reference of a root structure, created as new, optionally pass
 * the anonymous block device id
 *
 * @objectid:	tree objectid
1434 1435
 * @anon_dev:	if NULL, allocate a new anonymous block device or use the
 *		parameter value if not NULL
1436 1437
 */
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
1438
					 u64 objectid, dev_t *anon_dev)
1439 1440 1441 1442
{
	return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
}

1443
/*
1444 1445
 * Return a root for the given objectid.
 *
1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489
 * @fs_info:	the fs_info
 * @objectid:	the objectid we need to lookup
 *
 * This is exclusively used for backref walking, and exists specifically because
 * of how qgroups does lookups.  Qgroups will do a backref lookup at delayed ref
 * creation time, which means we may have to read the tree_root in order to look
 * up a fs root that is not in memory.  If the root is not in memory we will
 * read the tree root commit root and look up the fs root from there.  This is a
 * temporary root, it will not be inserted into the radix tree as it doesn't
 * have the most uptodate information, it'll simply be discarded once the
 * backref code is finished using the root.
 */
struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
						 struct btrfs_path *path,
						 u64 objectid)
{
	struct btrfs_root *root;
	struct btrfs_key key;

	ASSERT(path->search_commit_root && path->skip_locking);

	/*
	 * This can return -ENOENT if we ask for a root that doesn't exist, but
	 * since this is called via the backref walking code we won't be looking
	 * up a root that doesn't exist, unless there's corruption.  So if root
	 * != NULL just return it.
	 */
	root = btrfs_get_global_root(fs_info, objectid);
	if (root)
		return root;

	root = btrfs_lookup_fs_root(fs_info, objectid);
	if (root)
		return root;

	key.objectid = objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;
	root = read_tree_root_path(fs_info->tree_root, path, &key);
	btrfs_release_path(path);

	return root;
}

1490 1491
static int cleaner_kthread(void *arg)
{
1492
	struct btrfs_fs_info *fs_info = arg;
1493
	int again;
1494

1495
	while (1) {
1496
		again = 0;
1497

1498 1499
		set_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);

1500
		/* Make the cleaner go to sleep early. */
1501
		if (btrfs_need_cleaner_sleep(fs_info))
1502 1503
			goto sleep;

1504 1505 1506 1507
		/*
		 * Do not do anything if we might cause open_ctree() to block
		 * before we have finished mounting the filesystem.
		 */
1508
		if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
1509 1510
			goto sleep;

1511
		if (!mutex_trylock(&fs_info->cleaner_mutex))
1512 1513
			goto sleep;

1514 1515 1516 1517
		/*
		 * Avoid the problem that we change the status of the fs
		 * during the above check and trylock.
		 */
1518
		if (btrfs_need_cleaner_sleep(fs_info)) {
1519
			mutex_unlock(&fs_info->cleaner_mutex);
1520
			goto sleep;
1521
		}
1522

1523 1524 1525
		if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags))
			btrfs_sysfs_feature_update(fs_info);

1526
		btrfs_run_delayed_iputs(fs_info);
1527

1528
		again = btrfs_clean_one_deleted_snapshot(fs_info);
1529
		mutex_unlock(&fs_info->cleaner_mutex);
1530 1531

		/*
1532 1533
		 * The defragger has dealt with the R/O remount and umount,
		 * needn't do anything special here.
1534
		 */
1535
		btrfs_run_defrag_inodes(fs_info);
1536 1537

		/*
1538
		 * Acquires fs_info->reclaim_bgs_lock to avoid racing
1539 1540
		 * with relocation (btrfs_relocate_chunk) and relocation
		 * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
1541
		 * after acquiring fs_info->reclaim_bgs_lock. So we
1542 1543 1544
		 * can't hold, nor need to, fs_info->cleaner_mutex when deleting
		 * unused block groups.
		 */
1545
		btrfs_delete_unused_bgs(fs_info);
1546 1547 1548 1549 1550 1551 1552

		/*
		 * Reclaim block groups in the reclaim_bgs list after we deleted
		 * all unused block_groups. This possibly gives us some more free
		 * space.
		 */
		btrfs_reclaim_bgs(fs_info);
1553
sleep:
1554
		clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
1555 1556 1557 1558
		if (kthread_should_park())
			kthread_parkme();
		if (kthread_should_stop())
			return 0;
1559
		if (!again) {
1560
			set_current_state(TASK_INTERRUPTIBLE);
1561
			schedule();
1562 1563
			__set_current_state(TASK_RUNNING);
		}
1564
	}
1565 1566 1567 1568 1569
}

static int transaction_kthread(void *arg)
{
	struct btrfs_root *root = arg;
1570
	struct btrfs_fs_info *fs_info = root->fs_info;
1571 1572
	struct btrfs_trans_handle *trans;
	struct btrfs_transaction *cur;
1573
	u64 transid;
1574
	time64_t delta;
1575
	unsigned long delay;
1576
	bool cannot_commit;
1577 1578

	do {
1579
		cannot_commit = false;
1580
		delay = msecs_to_jiffies(fs_info->commit_interval * 1000);
1581
		mutex_lock(&fs_info->transaction_kthread_mutex);
1582

1583 1584
		spin_lock(&fs_info->trans_lock);
		cur = fs_info->running_transaction;
1585
		if (!cur) {
1586
			spin_unlock(&fs_info->trans_lock);
1587 1588
			goto sleep;
		}
1589

1590
		delta = ktime_get_seconds() - cur->start_time;
1591
		if (!test_and_clear_bit(BTRFS_FS_COMMIT_TRANS, &fs_info->flags) &&
1592
		    cur->state < TRANS_STATE_COMMIT_PREP &&
1593
		    delta < fs_info->commit_interval) {
1594
			spin_unlock(&fs_info->trans_lock);
1595 1596 1597
			delay -= msecs_to_jiffies((delta - 1) * 1000);
			delay = min(delay,
				    msecs_to_jiffies(fs_info->commit_interval * 1000));
1598 1599
			goto sleep;
		}
1600
		transid = cur->transid;
1601
		spin_unlock(&fs_info->trans_lock);
1602

1603
		/* If the file system is aborted, this will always fail. */
1604
		trans = btrfs_attach_transaction(root);
1605
		if (IS_ERR(trans)) {
1606 1607
			if (PTR_ERR(trans) != -ENOENT)
				cannot_commit = true;
1608
			goto sleep;
1609
		}
1610
		if (transid == trans->transid) {
1611
			btrfs_commit_transaction(trans);
1612
		} else {
1613
			btrfs_end_transaction(trans);
1614
		}
1615
sleep:
1616 1617
		wake_up_process(fs_info->cleaner_kthread);
		mutex_unlock(&fs_info->transaction_kthread_mutex);
1618

1619
		if (BTRFS_FS_ERROR(fs_info))
1620
			btrfs_cleanup_transaction(fs_info);
1621
		if (!kthread_should_stop() &&
1622
				(!btrfs_transaction_blocked(fs_info) ||
1623
				 cannot_commit))
1624
			schedule_timeout_interruptible(delay);
1625 1626 1627 1628
	} while (!kthread_should_stop());
	return 0;
}

1629
/*
1630 1631 1632
 * This will find the highest generation in the array of root backups.  The
 * index of the highest array is returned, or -EINVAL if we can't find
 * anything.
1633 1634 1635 1636 1637
 *
 * We check to make sure the array is valid by comparing the
 * generation of the latest  root in the array with the generation
 * in the super block.  If they don't match we pitch it.
 */
1638
static int find_newest_super_backup(struct btrfs_fs_info *info)
1639
{
1640
	const u64 newest_gen = btrfs_super_generation(info->super_copy);
1641 1642 1643 1644 1645 1646 1647 1648
	u64 cur;
	struct btrfs_root_backup *root_backup;
	int i;

	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
		root_backup = info->super_copy->super_roots + i;
		cur = btrfs_backup_tree_root_gen(root_backup);
		if (cur == newest_gen)
1649
			return i;
1650 1651
	}

1652
	return -EINVAL;
1653 1654 1655 1656 1657 1658 1659 1660 1661
}

/*
 * copy all the root pointers into the super backup array.
 * this will bump the backup pointer by one when it is
 * done
 */
static void backup_super_roots(struct btrfs_fs_info *info)
{
1662
	const int next_backup = info->backup_root_index;
1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687
	struct btrfs_root_backup *root_backup;

	root_backup = info->super_for_commit->super_roots + next_backup;

	/*
	 * make sure all of our padding and empty slots get zero filled
	 * regardless of which ones we use today
	 */
	memset(root_backup, 0, sizeof(*root_backup));

	info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;

	btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
	btrfs_set_backup_tree_root_gen(root_backup,
			       btrfs_header_generation(info->tree_root->node));

	btrfs_set_backup_tree_root_level(root_backup,
			       btrfs_header_level(info->tree_root->node));

	btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
	btrfs_set_backup_chunk_root_gen(root_backup,
			       btrfs_header_generation(info->chunk_root->node));
	btrfs_set_backup_chunk_root_level(root_backup,
			       btrfs_header_level(info->chunk_root->node));

1688
	if (!btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE)) {
1689
		struct btrfs_root *extent_root = btrfs_extent_root(info, 0);
1690
		struct btrfs_root *csum_root = btrfs_csum_root(info, 0);
1691 1692 1693 1694 1695 1696 1697

		btrfs_set_backup_extent_root(root_backup,
					     extent_root->node->start);
		btrfs_set_backup_extent_root_gen(root_backup,
				btrfs_header_generation(extent_root->node));
		btrfs_set_backup_extent_root_level(root_backup,
					btrfs_header_level(extent_root->node));
1698 1699 1700 1701 1702 1703

		btrfs_set_backup_csum_root(root_backup, csum_root->node->start);
		btrfs_set_backup_csum_root_gen(root_backup,
					       btrfs_header_generation(csum_root->node));
		btrfs_set_backup_csum_root_level(root_backup,
						 btrfs_header_level(csum_root->node));
1704
	}
1705

1706 1707 1708 1709 1710 1711 1712 1713
	/*
	 * we might commit during log recovery, which happens before we set
	 * the fs_root.  Make sure it is valid before we fill it in.
	 */
	if (info->fs_root && info->fs_root->node) {
		btrfs_set_backup_fs_root(root_backup,
					 info->fs_root->node->start);
		btrfs_set_backup_fs_root_gen(root_backup,
1714
			       btrfs_header_generation(info->fs_root->node));
1715
		btrfs_set_backup_fs_root_level(root_backup,
1716
			       btrfs_header_level(info->fs_root->node));
1717
	}
1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740

	btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
	btrfs_set_backup_dev_root_gen(root_backup,
			       btrfs_header_generation(info->dev_root->node));
	btrfs_set_backup_dev_root_level(root_backup,
				       btrfs_header_level(info->dev_root->node));

	btrfs_set_backup_total_bytes(root_backup,
			     btrfs_super_total_bytes(info->super_copy));
	btrfs_set_backup_bytes_used(root_backup,
			     btrfs_super_bytes_used(info->super_copy));
	btrfs_set_backup_num_devices(root_backup,
			     btrfs_super_num_devices(info->super_copy));

	/*
	 * if we don't copy this out to the super_copy, it won't get remembered
	 * for the next commit
	 */
	memcpy(&info->super_copy->super_roots,
	       &info->super_for_commit->super_roots,
	       sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
}

/*
 * Reads a backup root based on the passed priority. Prio 0 is the newest, prio
 * 1/2/3 are 2nd newest/3rd newest/4th (oldest) backup roots
 *
 * @fs_info:  filesystem whose backup roots need to be read
 * @priority: priority of backup root required
 *
 * Returns backup root index on success and -EINVAL otherwise.
 */
static int read_backup_root(struct btrfs_fs_info *fs_info, u8 priority)
{
	int backup_index = find_newest_super_backup(fs_info);
	struct btrfs_super_block *super = fs_info->super_copy;
	struct btrfs_root_backup *root_backup;

	if (priority < BTRFS_NUM_BACKUP_ROOTS && backup_index >= 0) {
		if (priority == 0)
			return backup_index;

		backup_index = backup_index + BTRFS_NUM_BACKUP_ROOTS - priority;
		backup_index %= BTRFS_NUM_BACKUP_ROOTS;
	} else {
		return -EINVAL;
	}

	root_backup = super->super_roots + backup_index;

	btrfs_set_super_generation(super,
				   btrfs_backup_tree_root_gen(root_backup));
	btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
	btrfs_set_super_root_level(super,
				   btrfs_backup_tree_root_level(root_backup));
	btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));

	/*
	 * FIXME: the total bytes and num_devices need to match, otherwise a
	 * fsck is required
	 */
	btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
	btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));

	return backup_index;
}

/* helper to cleanup workers */
static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
{
	btrfs_destroy_workqueue(fs_info->fixup_workers);
	btrfs_destroy_workqueue(fs_info->delalloc_workers);
	btrfs_destroy_workqueue(fs_info->workers);
	if (fs_info->endio_workers)
		destroy_workqueue(fs_info->endio_workers);
	if (fs_info->rmw_workers)
		destroy_workqueue(fs_info->rmw_workers);
	if (fs_info->compressed_write_workers)
		destroy_workqueue(fs_info->compressed_write_workers);
	btrfs_destroy_workqueue(fs_info->endio_write_workers);
	btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
	btrfs_destroy_workqueue(fs_info->delayed_workers);
	btrfs_destroy_workqueue(fs_info->caching_workers);
	btrfs_destroy_workqueue(fs_info->flush_workers);
	btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
	if (fs_info->discard_ctl.discard_workers)
		destroy_workqueue(fs_info->discard_ctl.discard_workers);
	/*
	 * Now that all other work queues are destroyed, we can safely destroy
	 * the queues used for metadata I/O, since tasks from those other work
	 * queues can do metadata I/O operations.
	 */
	if (fs_info->endio_meta_workers)
		destroy_workqueue(fs_info->endio_meta_workers);
}

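/* Release a root's cached tree node and commit root extent buffers. */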
static void free_root_extent_buffers(struct btrfs_root *root)
{
	if (root) {
		free_extent_buffer(root->node);
		free_extent_buffer(root->commit_root);
		root->node = NULL;
		root->commit_root = NULL;
	}
}

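/*
 * Release the extent buffers of every root tracked in the global root rbtree
 * (the extent, csum and free space tree roots).
 */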
static void free_global_root_pointers(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root, *tmp;

	rbtree_postorder_for_each_entry_safe(root, tmp,
					     &fs_info->global_root_tree,
					     rb_node)
		free_root_extent_buffers(root);
}

/* helper to cleanup tree roots */
static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
{
	free_root_extent_buffers(info->tree_root);

	free_global_root_pointers(info);
	free_root_extent_buffers(info->dev_root);
	free_root_extent_buffers(info->quota_root);
	free_root_extent_buffers(info->uuid_root);
	free_root_extent_buffers(info->fs_root);
	free_root_extent_buffers(info->data_reloc_root);
	free_root_extent_buffers(info->block_group_root);
	free_root_extent_buffers(info->stripe_root);
	if (free_chunk_root)
		free_root_extent_buffers(info->chunk_root);
}

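/*
 * Drop one reference on a root and free the in-memory structure once the last
 * reference is gone.
 */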
void btrfs_put_root(struct btrfs_root *root)
{
	if (!root)
		return;

	if (refcount_dec_and_test(&root->refs)) {
		WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
		WARN_ON(test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state));
		if (root->anon_dev)
			free_anon_bdev(root->anon_dev);
		free_root_extent_buffers(root);
#ifdef CONFIG_BTRFS_DEBUG
		spin_lock(&root->fs_info->fs_roots_radix_lock);
		list_del_init(&root->leak_list);
		spin_unlock(&root->fs_info->fs_roots_radix_lock);
#endif
		kfree(root);
	}
}

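/*
 * Free all subvolume roots still tracked at unmount time: first everything on
 * the dead roots list, then whatever is left in the fs_roots radix tree.
 */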
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
{
	int ret;
	struct btrfs_root *gang[8];
	int i;

	while (!list_empty(&fs_info->dead_roots)) {
		gang[0] = list_entry(fs_info->dead_roots.next,
				     struct btrfs_root, root_list);
		list_del(&gang[0]->root_list);

		if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
			btrfs_drop_and_free_fs_root(fs_info, gang[0]);
		btrfs_put_root(gang[0]);
	}

	while (1) {
		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
					     (void **)gang, 0,
					     ARRAY_SIZE(gang));
		if (!ret)
			break;
		for (i = 0; i < ret; i++)
			btrfs_drop_and_free_fs_root(fs_info, gang[i]);
	}
}

static void btrfs_init_scrub(struct btrfs_fs_info *fs_info)
{
	mutex_init(&fs_info->scrub_lock);
	atomic_set(&fs_info->scrubs_running, 0);
	atomic_set(&fs_info->scrub_pause_req, 0);
	atomic_set(&fs_info->scrubs_paused, 0);
	atomic_set(&fs_info->scrub_cancel_req, 0);
	init_waitqueue_head(&fs_info->scrub_pause_wait);
	refcount_set(&fs_info->scrub_workers_refcnt, 0);
}

static void btrfs_init_balance(struct btrfs_fs_info *fs_info)
{
	spin_lock_init(&fs_info->balance_lock);
	mutex_init(&fs_info->balance_mutex);
	atomic_set(&fs_info->balance_pause_req, 0);
	atomic_set(&fs_info->balance_cancel_req, 0);
	fs_info->balance_ctl = NULL;
	init_waitqueue_head(&fs_info->balance_wait_q);
	atomic_set(&fs_info->reloc_cancel_req, 0);
}

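/*
 * Set up the dummy in-memory inode that backs the btree address space used
 * for metadata pages.
 */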
static int btrfs_init_btree_inode(struct super_block *sb)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
	unsigned long hash = btrfs_inode_hash(BTRFS_BTREE_INODE_OBJECTID,
					      fs_info->tree_root);
	struct inode *inode;

	inode = new_inode(sb);
	if (!inode)
		return -ENOMEM;

	inode->i_ino = BTRFS_BTREE_INODE_OBJECTID;
	set_nlink(inode, 1);
	/*
	 * we set the i_size on the btree inode to the max possible int.
	 * the real end of the address space is determined by all of
	 * the devices in the system
	 */
	inode->i_size = OFFSET_MAX;
	inode->i_mapping->a_ops = &btree_aops;
	mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);

	RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
	extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
			    IO_TREE_BTREE_INODE_IO);
	extent_map_tree_init(&BTRFS_I(inode)->extent_tree);

	BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
	BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID;
	BTRFS_I(inode)->location.type = 0;
	BTRFS_I(inode)->location.offset = 0;
	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
	__insert_inode_hash(inode, hash);
	fs_info->btree_inode = inode;

	return 0;
}

static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
{
	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
	init_rwsem(&fs_info->dev_replace.rwsem);
	init_waitqueue_head(&fs_info->dev_replace.replace_wait);
}

static void btrfs_init_qgroup(struct btrfs_fs_info *fs_info)
{
	spin_lock_init(&fs_info->qgroup_lock);
	mutex_init(&fs_info->qgroup_ioctl_lock);
	fs_info->qgroup_tree = RB_ROOT;
	INIT_LIST_HEAD(&fs_info->dirty_qgroups);
	fs_info->qgroup_seq = 1;
	fs_info->qgroup_ulist = NULL;
	fs_info->qgroup_rescan_running = false;
	fs_info->qgroup_drop_subtree_thres = BTRFS_MAX_LEVEL;
	mutex_init(&fs_info->qgroup_rescan_lock);
}

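/*
 * Allocate all workqueues used for background work (delalloc, endio, caching,
 * qgroup rescan, discard, ...).  Returns -ENOMEM if any allocation fails.
 */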
static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
{
	u32 max_active = fs_info->thread_pool_size;
	unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
	unsigned int ordered_flags = WQ_MEM_RECLAIM | WQ_FREEZABLE;

	fs_info->workers =
		btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);

	fs_info->delalloc_workers =
		btrfs_alloc_workqueue(fs_info, "delalloc",
				      flags, max_active, 2);

	fs_info->flush_workers =
		btrfs_alloc_workqueue(fs_info, "flush_delalloc",
				      flags, max_active, 0);

	fs_info->caching_workers =
		btrfs_alloc_workqueue(fs_info, "cache", flags, max_active, 0);

	fs_info->fixup_workers =
		btrfs_alloc_ordered_workqueue(fs_info, "fixup", ordered_flags);

	fs_info->endio_workers =
		alloc_workqueue("btrfs-endio", flags, max_active);
	fs_info->endio_meta_workers =
		alloc_workqueue("btrfs-endio-meta", flags, max_active);
	fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
	fs_info->endio_write_workers =
		btrfs_alloc_workqueue(fs_info, "endio-write", flags,
				      max_active, 2);
	fs_info->compressed_write_workers =
		alloc_workqueue("btrfs-compressed-write", flags, max_active);
	fs_info->endio_freespace_worker =
		btrfs_alloc_workqueue(fs_info, "freespace-write", flags,
				      max_active, 0);
	fs_info->delayed_workers =
		btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
				      max_active, 0);
	fs_info->qgroup_rescan_workers =
		btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan",
					      ordered_flags);
	fs_info->discard_ctl.discard_workers =
		alloc_ordered_workqueue("btrfs_discard", WQ_FREEZABLE);

	if (!(fs_info->workers &&
	      fs_info->delalloc_workers && fs_info->flush_workers &&
	      fs_info->endio_workers && fs_info->endio_meta_workers &&
	      fs_info->compressed_write_workers &&
	      fs_info->endio_write_workers &&
	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
	      fs_info->caching_workers && fs_info->fixup_workers &&
	      fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
	      fs_info->discard_ctl.discard_workers)) {
		return -ENOMEM;
	}

	return 0;
}

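/*
 * Allocate the crypto shash for the checksum type found in the super block and
 * record whether the implementation counts as fast (hardware accelerated).
 */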
static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
{
	struct crypto_shash *csum_shash;
	const char *csum_driver = btrfs_super_csum_driver(csum_type);

	csum_shash = crypto_alloc_shash(csum_driver, 0, 0);

	if (IS_ERR(csum_shash)) {
		btrfs_err(fs_info, "error allocating %s hash for checksum",
			  csum_driver);
		return PTR_ERR(csum_shash);
	}

	fs_info->csum_shash = csum_shash;

	/*
	 * Check if the checksum implementation is a fast accelerated one.
	 * As-is this is a bit of a hack and should be replaced once the csum
	 * implementations provide that information themselves.
	 */
	switch (csum_type) {
	case BTRFS_CSUM_TYPE_CRC32:
		if (!strstr(crypto_shash_driver_name(csum_shash), "generic"))
			set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
		break;
	case BTRFS_CSUM_TYPE_XXHASH:
		set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
		break;
	default:
		break;
	}

	btrfs_info(fs_info, "using %s (%s) checksum algorithm",
			btrfs_super_csum_name(csum_type),
			crypto_shash_driver_name(csum_shash));
	return 0;
}

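/*
 * Read the log tree root referenced by the super block and replay it.  On a
 * read-only mount the super block is committed right after the replay.
 */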
static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
			    struct btrfs_fs_devices *fs_devices)
{
	int ret;
	struct btrfs_tree_parent_check check = { 0 };
	struct btrfs_root *log_tree_root;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	u64 bytenr = btrfs_super_log_root(disk_super);
	int level = btrfs_super_log_root_level(disk_super);

	if (fs_devices->rw_devices == 0) {
		btrfs_warn(fs_info, "log replay required on RO media");
		return -EIO;
	}

	log_tree_root = btrfs_alloc_root(fs_info, BTRFS_TREE_LOG_OBJECTID,
					 GFP_KERNEL);
	if (!log_tree_root)
		return -ENOMEM;

	check.level = level;
	check.transid = fs_info->generation + 1;
	check.owner_root = BTRFS_TREE_LOG_OBJECTID;
	log_tree_root->node = read_tree_block(fs_info, bytenr, &check);
	if (IS_ERR(log_tree_root->node)) {
		btrfs_warn(fs_info, "failed to read log tree");
		ret = PTR_ERR(log_tree_root->node);
		log_tree_root->node = NULL;
		btrfs_put_root(log_tree_root);
		return ret;
	}
	if (!extent_buffer_uptodate(log_tree_root->node)) {
		btrfs_err(fs_info, "failed to read log tree");
		btrfs_put_root(log_tree_root);
		return -EIO;
	}

	/* returns with log_tree_root freed on success */
	ret = btrfs_recover_log_trees(log_tree_root);
	if (ret) {
		btrfs_handle_fs_error(fs_info, ret,
				      "Failed to recover log tree");
		btrfs_put_root(log_tree_root);
		return ret;
	}

	if (sb_rdonly(fs_info->sb)) {
		ret = btrfs_commit_super(fs_info);
		if (ret)
			return ret;
	}

	return 0;
}

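/*
 * Load every root item with the given objectid from the tree root (one per
 * global root id) and insert the resulting roots into the global root rbtree.
 */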
static int load_global_roots_objectid(struct btrfs_root *tree_root,
				      struct btrfs_path *path, u64 objectid,
				      const char *name)
{
	struct btrfs_fs_info *fs_info = tree_root->fs_info;
	struct btrfs_root *root;
	u64 max_global_id = 0;
	int ret;
	struct btrfs_key key = {
		.objectid = objectid,
		.type = BTRFS_ROOT_ITEM_KEY,
		.offset = 0,
	};
	bool found = false;

	/* If we have IGNOREDATACSUMS skip loading these roots. */
	if (objectid == BTRFS_CSUM_TREE_OBJECTID &&
	    btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
		set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
		return 0;
	}

	while (1) {
		ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
		if (ret < 0)
			break;

		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
			ret = btrfs_next_leaf(tree_root, path);
			if (ret) {
				if (ret > 0)
					ret = 0;
				break;
			}
		}
		ret = 0;

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != objectid)
			break;
		btrfs_release_path(path);

		/*
		 * Just worry about this for extent tree, it'll be the same for
		 * everybody.
		 */
		if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
			max_global_id = max(max_global_id, key.offset);

		found = true;
		root = read_tree_root_path(tree_root, path, &key);
		if (IS_ERR(root)) {
			if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
				ret = PTR_ERR(root);
			break;
		}
		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
		ret = btrfs_global_root_insert(root);
		if (ret) {
			btrfs_put_root(root);
			break;
		}
		key.offset++;
	}
	btrfs_release_path(path);

	if (objectid == BTRFS_EXTENT_TREE_OBJECTID)
		fs_info->nr_global_roots = max_global_id + 1;

	if (!found || ret) {
		if (objectid == BTRFS_CSUM_TREE_OBJECTID)
			set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);

		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS))
			ret = ret ? ret : -ENOENT;
		else
			ret = 0;
		btrfs_err(fs_info, "failed to load root %s", name);
	}
	return ret;
}

static int load_global_roots(struct btrfs_root *tree_root)
{
	struct btrfs_path *path;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = load_global_roots_objectid(tree_root, path,
					 BTRFS_EXTENT_TREE_OBJECTID, "extent");
	if (ret)
		goto out;
	ret = load_global_roots_objectid(tree_root, path,
					 BTRFS_CSUM_TREE_OBJECTID, "csum");
	if (ret)
		goto out;
	if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE))
		goto out;
	ret = load_global_roots_objectid(tree_root, path,
					 BTRFS_FREE_SPACE_TREE_OBJECTID,
					 "free space");
out:
	btrfs_free_path(path);
	return ret;
}

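/*
 * Read the remaining trees referenced by the tree root (block group, dev,
 * data reloc, quota, uuid and raid stripe trees) after the global roots.
 */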
static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *tree_root = fs_info->tree_root;
	struct btrfs_root *root;
	struct btrfs_key location;
	int ret;

	ASSERT(fs_info->tree_root);

	ret = load_global_roots(tree_root);
	if (ret)
		return ret;

	location.type = BTRFS_ROOT_ITEM_KEY;
	location.offset = 0;

	if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) {
		location.objectid = BTRFS_BLOCK_GROUP_TREE_OBJECTID;
		root = btrfs_read_tree_root(tree_root, &location);
		if (IS_ERR(root)) {
			if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
				ret = PTR_ERR(root);
				goto out;
			}
		} else {
			set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
			fs_info->block_group_root = root;
		}
	}

	location.objectid = BTRFS_DEV_TREE_OBJECTID;
	root = btrfs_read_tree_root(tree_root, &location);
	if (IS_ERR(root)) {
		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
			ret = PTR_ERR(root);
			goto out;
		}
	} else {
		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
		fs_info->dev_root = root;
	}
	/* Initialize fs_info for all devices in any case */
	ret = btrfs_init_devices_late(fs_info);
	if (ret)
		goto out;

	/*
	 * This tree can share blocks with some other fs tree during relocation
	 * and we need a proper setup by btrfs_get_fs_root
	 */
	root = btrfs_get_fs_root(tree_root->fs_info,
				 BTRFS_DATA_RELOC_TREE_OBJECTID, true);
	if (IS_ERR(root)) {
		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
			ret = PTR_ERR(root);
			goto out;
		}
	} else {
		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
		fs_info->data_reloc_root = root;
	}

	location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
	root = btrfs_read_tree_root(tree_root, &location);
	if (!IS_ERR(root)) {
		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
		fs_info->quota_root = root;
	}

	location.objectid = BTRFS_UUID_TREE_OBJECTID;
	root = btrfs_read_tree_root(tree_root, &location);
	if (IS_ERR(root)) {
		if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
			ret = PTR_ERR(root);
			if (ret != -ENOENT)
				goto out;
		}
	} else {
		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
		fs_info->uuid_root = root;
	}

	if (btrfs_fs_incompat(fs_info, RAID_STRIPE_TREE)) {
		location.objectid = BTRFS_RAID_STRIPE_TREE_OBJECTID;
		root = btrfs_read_tree_root(tree_root, &location);
		if (IS_ERR(root)) {
			if (!btrfs_test_opt(fs_info, IGNOREBADROOTS)) {
				ret = PTR_ERR(root);
				goto out;
			}
		} else {
			set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
			fs_info->stripe_root = root;
		}
	}

	return 0;
out:
	btrfs_warn(fs_info, "failed to read root (objectid=%llu): %d",
		   location.objectid, ret);
	return ret;
}

/*
 * Real super block validation
 * NOTE: super csum type and incompat features will not be checked here.
 *
 * @sb:		super block to check
 * @mirror_num:	the super block number to check its bytenr:
 * 		0	the primary (1st) sb
 * 		1, 2	2nd and 3rd backup copy
 * 	       -1	skip bytenr check
 */
int btrfs_validate_super(struct btrfs_fs_info *fs_info,
			 struct btrfs_super_block *sb, int mirror_num)
{
	u64 nodesize = btrfs_super_nodesize(sb);
	u64 sectorsize = btrfs_super_sectorsize(sb);
	int ret = 0;

	if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
		btrfs_err(fs_info, "no valid FS found");
		ret = -EINVAL;
	}
	if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) {
		btrfs_err(fs_info, "unrecognized or unsupported super flag: %llu",
				btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
		ret = -EINVAL;
	}
	if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
		btrfs_err(fs_info, "tree_root level too big: %d >= %d",
				btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
		ret = -EINVAL;
	}
	if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) {
		btrfs_err(fs_info, "chunk_root level too big: %d >= %d",
				btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL);
		ret = -EINVAL;
	}
	if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) {
		btrfs_err(fs_info, "log_root level too big: %d >= %d",
				btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL);
		ret = -EINVAL;
	}

	/*
	 * Check sectorsize and nodesize first, other checks will need them.
	 * Check all possible sectorsizes (4K, 8K, 16K, 32K, 64K) here.
	 */
	if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
	    sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
		btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize);
		ret = -EINVAL;
	}

	/*
	 * We only support at most two sectorsizes: 4K and PAGE_SIZE.
	 *
	 * We can support 16K sectorsize with 64K page size without problem,
	 * but such sectorsize/pagesize combination doesn't make much sense.
	 * 4K will be our future standard, PAGE_SIZE is supported from the very
	 * beginning.
	 */
	if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
		btrfs_err(fs_info,
			"sectorsize %llu not yet supported for page size %lu",
			sectorsize, PAGE_SIZE);
		ret = -EINVAL;
	}

	if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
	    nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
		btrfs_err(fs_info, "invalid nodesize %llu", nodesize);
		ret = -EINVAL;
	}
	if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
		btrfs_err(fs_info, "invalid leafsize %u, should be %llu",
			  le32_to_cpu(sb->__unused_leafsize), nodesize);
		ret = -EINVAL;
	}

	/* Root alignment check */
	if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
		btrfs_warn(fs_info, "tree_root block unaligned: %llu",
			   btrfs_super_root(sb));
		ret = -EINVAL;
	}
	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
		btrfs_warn(fs_info, "chunk_root block unaligned: %llu",
			   btrfs_super_chunk_root(sb));
		ret = -EINVAL;
	}
	if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
		btrfs_warn(fs_info, "log_root block unaligned: %llu",
			   btrfs_super_log_root(sb));
		ret = -EINVAL;
	}

	if (!fs_info->fs_devices->temp_fsid &&
	    memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) {
		btrfs_err(fs_info,
		"superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
			  sb->fsid, fs_info->fs_devices->fsid);
		ret = -EINVAL;
	}

	if (memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb),
		   BTRFS_FSID_SIZE) != 0) {
		btrfs_err(fs_info,
"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
			  btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid);
		ret = -EINVAL;
	}

	if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid,
		   BTRFS_FSID_SIZE) != 0) {
		btrfs_err(fs_info,
			"dev_item UUID does not match metadata fsid: %pU != %pU",
			fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid);
		ret = -EINVAL;
	}

	/*
	 * Artificial requirement for block-group-tree to force newer features
	 * (free-space-tree, no-holes) so the test matrix is smaller.
	 */
	if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
	    (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) ||
	     !btrfs_fs_incompat(fs_info, NO_HOLES))) {
		btrfs_err(fs_info,
		"block-group-tree feature requires fres-space-tree and no-holes");
		ret = -EINVAL;
	}

	/*
	 * Hint to catch really bogus numbers, bitflips or so, more exact checks are
	 * done later
	 */
	if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) {
		btrfs_err(fs_info, "bytes_used is too small %llu",
			  btrfs_super_bytes_used(sb));
		ret = -EINVAL;
	}
	if (!is_power_of_2(btrfs_super_stripesize(sb))) {
		btrfs_err(fs_info, "invalid stripesize %u",
			  btrfs_super_stripesize(sb));
		ret = -EINVAL;
	}
	if (btrfs_super_num_devices(sb) > (1UL << 31))
		btrfs_warn(fs_info, "suspicious number of devices: %llu",
			   btrfs_super_num_devices(sb));
	if (btrfs_super_num_devices(sb) == 0) {
		btrfs_err(fs_info, "number of devices is 0");
		ret = -EINVAL;
	}

	if (mirror_num >= 0 &&
	    btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) {
		btrfs_err(fs_info, "super offset mismatch %llu != %u",
			  btrfs_super_bytenr(sb), BTRFS_SUPER_INFO_OFFSET);
		ret = -EINVAL;
	}

	/*
	 * Obvious sys_chunk_array corruptions, it must hold at least one key
	 * and one chunk
	 */
	if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
		btrfs_err(fs_info, "system chunk array too big %u > %u",
			  btrfs_super_sys_array_size(sb),
			  BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
		ret = -EINVAL;
	}
	if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
			+ sizeof(struct btrfs_chunk)) {
		btrfs_err(fs_info, "system chunk array too small %u < %zu",
			  btrfs_super_sys_array_size(sb),
			  sizeof(struct btrfs_disk_key)
			  + sizeof(struct btrfs_chunk));
		ret = -EINVAL;
	}

	/*
	 * The generation is a global counter, we'll trust it more than the others
	 * but it's still possible that it's the one that's wrong.
	 */
	if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))
		btrfs_warn(fs_info,
			"suspicious: generation < chunk_root_generation: %llu < %llu",
			btrfs_super_generation(sb),
			btrfs_super_chunk_root_generation(sb));
	if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb)
	    && btrfs_super_cache_generation(sb) != (u64)-1)
		btrfs_warn(fs_info,
			"suspicious: generation < cache_generation: %llu < %llu",
			btrfs_super_generation(sb),
			btrfs_super_cache_generation(sb));

	return ret;
}

/*
 * Validation of super block at mount time.
 * Some checks already done early at mount time, like csum type and incompat
 * flags will be skipped.
 */
static int btrfs_validate_mount_super(struct btrfs_fs_info *fs_info)
{
	return btrfs_validate_super(fs_info, fs_info->super_copy, 0);
}

/*
 * Validation of super block at write time.
 * Some checks like bytenr check will be skipped as their values will be
 * overwritten soon.
 * Extra checks like csum type and incompat flags will be done here.
 */
static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info,
				      struct btrfs_super_block *sb)
{
	int ret;

	ret = btrfs_validate_super(fs_info, sb, -1);
	if (ret < 0)
		goto out;
	if (!btrfs_supported_super_csum(btrfs_super_csum_type(sb))) {
		ret = -EUCLEAN;
		btrfs_err(fs_info, "invalid csum type, has %u want %u",
			  btrfs_super_csum_type(sb), BTRFS_CSUM_TYPE_CRC32);
		goto out;
	}
	if (btrfs_super_incompat_flags(sb) & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
		ret = -EUCLEAN;
		btrfs_err(fs_info,
		"invalid incompat flags, has 0x%llx valid mask 0x%llx",
			  btrfs_super_incompat_flags(sb),
			  (unsigned long long)BTRFS_FEATURE_INCOMPAT_SUPP);
		goto out;
	}
out:
	if (ret < 0)
		btrfs_err(fs_info,
		"super block corruption detected before writing it to disk");
	return ret;
}

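/* Read the tree block at @bytenr/@gen/@level and make it the node of @root. */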
static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
{
	struct btrfs_tree_parent_check check = {
		.level = level,
		.transid = gen,
		.owner_root = btrfs_root_id(root)
	};
	int ret = 0;

	root->node = read_tree_block(root->fs_info, bytenr, &check);
	if (IS_ERR(root->node)) {
		ret = PTR_ERR(root->node);
		root->node = NULL;
		return ret;
	}
	if (!extent_buffer_uptodate(root->node)) {
		free_extent_buffer(root->node);
		root->node = NULL;
		return -EIO;
	}

	btrfs_set_root_node(&root->root_item, root->node);
	root->commit_root = btrfs_root_node(root);
	btrfs_set_root_refs(&root->root_item, 1);
	return ret;
}

static int load_important_roots(struct btrfs_fs_info *fs_info)
{
	struct btrfs_super_block *sb = fs_info->super_copy;
	u64 gen, bytenr;
	int level, ret;

	bytenr = btrfs_super_root(sb);
	gen = btrfs_super_generation(sb);
	level = btrfs_super_root_level(sb);
	ret = load_super_root(fs_info->tree_root, bytenr, gen, level);
	if (ret) {
		btrfs_warn(fs_info, "couldn't read tree root");
		return ret;
	}
	return 0;
}

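/*
 * Load the tree root and everything hanging off it, falling back to the
 * backup roots from the super block when 'usebackuproot' is in effect.
 */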
static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
{
	int backup_index = find_newest_super_backup(fs_info);
	struct btrfs_super_block *sb = fs_info->super_copy;
	struct btrfs_root *tree_root = fs_info->tree_root;
	bool handle_error = false;
	int ret = 0;
	int i;

	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
		if (handle_error) {
			if (!IS_ERR(tree_root->node))
				free_extent_buffer(tree_root->node);
			tree_root->node = NULL;

			if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
				break;

			free_root_pointers(fs_info, 0);

			/*
			 * Don't use the log in recovery mode, it won't be
			 * valid
			 */
			btrfs_set_super_log_root(sb, 0);

			btrfs_warn(fs_info, "try to load backup roots slot %d", i);
			ret = read_backup_root(fs_info, i);
			backup_index = ret;
			if (ret < 0)
				return ret;
		}

		ret = load_important_roots(fs_info);
		if (ret) {
			handle_error = true;
			continue;
		}

		/*
		 * No need to hold btrfs_root::objectid_mutex since the fs
		 * hasn't been fully initialised and we are the only user
		 */
		ret = btrfs_init_root_free_objectid(tree_root);
		if (ret < 0) {
			handle_error = true;
			continue;
		}

		ASSERT(tree_root->free_objectid <= BTRFS_LAST_FREE_OBJECTID);

		ret = btrfs_read_roots(fs_info);
		if (ret < 0) {
			handle_error = true;
			continue;
		}

		/* All successful */
		fs_info->generation = btrfs_header_generation(tree_root->node);
		btrfs_set_last_trans_committed(fs_info, fs_info->generation);
		fs_info->last_reloc_trans = 0;

		/* Always begin writing backup roots after the one being used */
		if (backup_index < 0) {
			fs_info->backup_root_index = 0;
		} else {
			fs_info->backup_root_index = backup_index + 1;
			fs_info->backup_root_index %= BTRFS_NUM_BACKUP_ROOTS;
		}
		break;
	}

	return ret;
}

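/* Initialize the parts of fs_info that do not depend on the on-disk super block. */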
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
{
	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
	INIT_LIST_HEAD(&fs_info->trans_list);
	INIT_LIST_HEAD(&fs_info->dead_roots);
	INIT_LIST_HEAD(&fs_info->delayed_iputs);
	INIT_LIST_HEAD(&fs_info->delalloc_roots);
	INIT_LIST_HEAD(&fs_info->caching_block_groups);
	spin_lock_init(&fs_info->delalloc_root_lock);
	spin_lock_init(&fs_info->trans_lock);
	spin_lock_init(&fs_info->fs_roots_radix_lock);
	spin_lock_init(&fs_info->delayed_iput_lock);
	spin_lock_init(&fs_info->defrag_inodes_lock);
	spin_lock_init(&fs_info->super_lock);
	spin_lock_init(&fs_info->buffer_lock);
	spin_lock_init(&fs_info->unused_bgs_lock);
	spin_lock_init(&fs_info->treelog_bg_lock);
	spin_lock_init(&fs_info->zone_active_bgs_lock);
	spin_lock_init(&fs_info->relocation_bg_lock);
	rwlock_init(&fs_info->tree_mod_log_lock);
	rwlock_init(&fs_info->global_root_lock);
	mutex_init(&fs_info->unused_bg_unpin_mutex);
	mutex_init(&fs_info->reclaim_bgs_lock);
	mutex_init(&fs_info->reloc_mutex);
	mutex_init(&fs_info->delalloc_root_mutex);
	mutex_init(&fs_info->zoned_meta_io_lock);
	mutex_init(&fs_info->zoned_data_reloc_io_lock);
	seqlock_init(&fs_info->profiles_lock);

	btrfs_lockdep_init_map(fs_info, btrfs_trans_num_writers);
	btrfs_lockdep_init_map(fs_info, btrfs_trans_num_extwriters);
	btrfs_lockdep_init_map(fs_info, btrfs_trans_pending_ordered);
	btrfs_lockdep_init_map(fs_info, btrfs_ordered_extent);
	btrfs_state_lockdep_init_map(fs_info, btrfs_trans_commit_prep,
				     BTRFS_LOCKDEP_TRANS_COMMIT_PREP);
	btrfs_state_lockdep_init_map(fs_info, btrfs_trans_unblocked,
				     BTRFS_LOCKDEP_TRANS_UNBLOCKED);
	btrfs_state_lockdep_init_map(fs_info, btrfs_trans_super_committed,
				     BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED);
	btrfs_state_lockdep_init_map(fs_info, btrfs_trans_completed,
				     BTRFS_LOCKDEP_TRANS_COMPLETED);

	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
	INIT_LIST_HEAD(&fs_info->space_info);
	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
	INIT_LIST_HEAD(&fs_info->unused_bgs);
	INIT_LIST_HEAD(&fs_info->reclaim_bgs);
	INIT_LIST_HEAD(&fs_info->zone_active_bgs);
#ifdef CONFIG_BTRFS_DEBUG
	INIT_LIST_HEAD(&fs_info->allocated_roots);
	INIT_LIST_HEAD(&fs_info->allocated_ebs);
	spin_lock_init(&fs_info->eb_leak_lock);
#endif
	fs_info->mapping_tree = RB_ROOT_CACHED;
	rwlock_init(&fs_info->mapping_tree_lock);
	btrfs_init_block_rsv(&fs_info->global_block_rsv,
			     BTRFS_BLOCK_RSV_GLOBAL);
	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
			     BTRFS_BLOCK_RSV_DELOPS);
	btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
			     BTRFS_BLOCK_RSV_DELREFS);

	atomic_set(&fs_info->async_delalloc_pages, 0);
	atomic_set(&fs_info->defrag_running, 0);
	atomic_set(&fs_info->nr_delayed_iputs, 0);
	atomic64_set(&fs_info->tree_mod_seq, 0);
	fs_info->global_root_tree = RB_ROOT;
	fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE;
	fs_info->metadata_ratio = 0;
	fs_info->defrag_inodes = RB_ROOT;
	atomic64_set(&fs_info->free_chunk_space, 0);
	fs_info->tree_mod_log = RB_ROOT;
	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
	btrfs_init_ref_verify(fs_info);

	fs_info->thread_pool_size = min_t(unsigned long,
					  num_online_cpus() + 2, 8);

	INIT_LIST_HEAD(&fs_info->ordered_roots);
	spin_lock_init(&fs_info->ordered_root_lock);

	btrfs_init_scrub(fs_info);
	btrfs_init_balance(fs_info);
	btrfs_init_async_reclaim_work(fs_info);

	rwlock_init(&fs_info->block_group_cache_lock);
	fs_info->block_group_cache_tree = RB_ROOT_CACHED;

	extent_io_tree_init(fs_info, &fs_info->excluded_extents,
			    IO_TREE_FS_EXCLUDED_EXTENTS);

	mutex_init(&fs_info->ordered_operations_mutex);
	mutex_init(&fs_info->tree_log_mutex);
	mutex_init(&fs_info->chunk_mutex);
	mutex_init(&fs_info->transaction_kthread_mutex);
	mutex_init(&fs_info->cleaner_mutex);
	mutex_init(&fs_info->ro_block_group_mutex);
	init_rwsem(&fs_info->commit_root_sem);
	init_rwsem(&fs_info->cleanup_work_sem);
	init_rwsem(&fs_info->subvol_sem);
	sema_init(&fs_info->uuid_tree_rescan_sem, 1);

	btrfs_init_dev_replace_locks(fs_info);
	btrfs_init_qgroup(fs_info);
	btrfs_discard_init(fs_info);

	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);

	init_waitqueue_head(&fs_info->transaction_throttle);
	init_waitqueue_head(&fs_info->transaction_wait);
	init_waitqueue_head(&fs_info->transaction_blocked_wait);
	init_waitqueue_head(&fs_info->async_submit_wait);
	init_waitqueue_head(&fs_info->delayed_iputs_wait);

	/* Usable values until the real ones are cached from the superblock */
	fs_info->nodesize = 4096;
	fs_info->sectorsize = 4096;
	fs_info->sectorsize_bits = ilog2(4096);
	fs_info->stripesize = 4096;

	/* Default compress algorithm when user does -o compress */
	fs_info->compress_type = BTRFS_COMPRESS_ZLIB;

	fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;

	spin_lock_init(&fs_info->swapfile_pins_lock);
	fs_info->swapfile_pins = RB_ROOT;

	fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
	INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
}

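/*
 * Per-mount initialization that can fail: percpu counters, the delayed root
 * and the RAID56 stripe hash table.
 */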
static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
{
	int ret;

	fs_info->sb = sb;
	/* Temporary fixed values for block size until we read the superblock. */
	sb->s_blocksize = BTRFS_BDEV_BLOCKSIZE;
	sb->s_blocksize_bits = blksize_bits(BTRFS_BDEV_BLOCKSIZE);

	ret = percpu_counter_init(&fs_info->ordered_bytes, 0, GFP_KERNEL);
	if (ret)
		return ret;

	ret = percpu_counter_init(&fs_info->evictable_extent_maps, 0, GFP_KERNEL);
	if (ret)
		return ret;

	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
	if (ret)
		return ret;

	fs_info->dirty_metadata_batch = PAGE_SIZE *
					(1 + ilog2(nr_cpu_ids));

	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
	if (ret)
		return ret;

	ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
			GFP_KERNEL);
	if (ret)
		return ret;

	fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
					GFP_KERNEL);
	if (!fs_info->delayed_root)
		return -ENOMEM;
	btrfs_init_delayed_root(fs_info->delayed_root);

	if (sb_rdonly(sb))
		set_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);

	return btrfs_alloc_stripe_hash_table(fs_info);
}

static int btrfs_uuid_rescan_kthread(void *data)
{
	struct btrfs_fs_info *fs_info = data;
	int ret;

	/*
	 * 1st step is to iterate through the existing UUID tree and
	 * to delete all entries that contain outdated data.
	 * 2nd step is to add all missing entries to the UUID tree.
	 */
	ret = btrfs_uuid_tree_iterate(fs_info);
	if (ret < 0) {
		if (ret != -EINTR)
			btrfs_warn(fs_info, "iterating uuid_tree failed %d",
				   ret);
		up(&fs_info->uuid_tree_rescan_sem);
		return ret;
	}
	return btrfs_uuid_scan_kthread(data);
}

static int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
{
	struct task_struct *task;

	down(&fs_info->uuid_tree_rescan_sem);
	task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
	if (IS_ERR(task)) {
		/* fs_info->update_uuid_tree_gen remains 0 in all error case */
		btrfs_warn(fs_info, "failed to start uuid_rescan task");
		up(&fs_info->uuid_tree_rescan_sem);
		return PTR_ERR(task);
	}

	return 0;
}

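/* Run orphan cleanup on every subvolume root currently in the radix tree. */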
static int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
{
	u64 root_objectid = 0;
	struct btrfs_root *gang[8];
	int i = 0;
	int err = 0;
	unsigned int ret = 0;

	while (1) {
		spin_lock(&fs_info->fs_roots_radix_lock);
		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
					     (void **)gang, root_objectid,
					     ARRAY_SIZE(gang));
		if (!ret) {
			spin_unlock(&fs_info->fs_roots_radix_lock);
			break;
		}
		root_objectid = btrfs_root_id(gang[ret - 1]) + 1;

		for (i = 0; i < ret; i++) {
			/* Avoid to grab roots in dead_roots. */
			if (btrfs_root_refs(&gang[i]->root_item) == 0) {
				gang[i] = NULL;
				continue;
			}
			/* Grab all the search result for later use. */
			gang[i] = btrfs_grab_root(gang[i]);
		}
		spin_unlock(&fs_info->fs_roots_radix_lock);

		for (i = 0; i < ret; i++) {
			if (!gang[i])
				continue;
			root_objectid = btrfs_root_id(gang[i]);
			err = btrfs_orphan_cleanup(gang[i]);
			if (err)
				goto out;
			btrfs_put_root(gang[i]);
		}
		root_objectid++;
	}
out:
	/* Release the uncleaned roots due to error. */
	for (; i < ret; i++) {
		if (gang[i])
			btrfs_put_root(gang[i]);
	}
	return err;
}

/*
 * Mounting logic specific to read-write file systems. Shared by open_ctree
 * and btrfs_remount when remounting from read-only to read-write.
 */
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
{
	int ret;
	const bool cache_opt = btrfs_test_opt(fs_info, SPACE_CACHE);
	bool rebuild_free_space_tree = false;

	if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
	    btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
		if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
			btrfs_warn(fs_info,
				   "'clear_cache' option is ignored with extent tree v2");
		else
			rebuild_free_space_tree = true;
	} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
		   !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
		btrfs_warn(fs_info, "free space tree is invalid");
		rebuild_free_space_tree = true;
	}

	if (rebuild_free_space_tree) {
		btrfs_info(fs_info, "rebuilding free space tree");
		ret = btrfs_rebuild_free_space_tree(fs_info);
		if (ret) {
			btrfs_warn(fs_info,
				   "failed to rebuild free space tree: %d", ret);
			goto out;
		}
	}

	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
	    !btrfs_test_opt(fs_info, FREE_SPACE_TREE)) {
		btrfs_info(fs_info, "disabling free space tree");
		ret = btrfs_delete_free_space_tree(fs_info);
		if (ret) {
			btrfs_warn(fs_info,
				   "failed to disable free space tree: %d", ret);
			goto out;
		}
	}

	/*
	 * btrfs_find_orphan_roots() is responsible for finding all the dead
	 * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
	 * them into the fs_info->fs_roots_radix tree. This must be done before
	 * calling btrfs_orphan_cleanup() on the tree root. If we don't do it
	 * first, then btrfs_orphan_cleanup() will delete a dead root's orphan
	 * item before the root's tree is deleted - this means that if we unmount
	 * or crash before the deletion completes, on the next mount we will not
	 * delete what remains of the tree because the orphan item does not
	 * exist anymore, which is what tells us we have a pending deletion.
	 */
	ret = btrfs_find_orphan_roots(fs_info);
	if (ret)
		goto out;

	ret = btrfs_cleanup_fs_roots(fs_info);
	if (ret)
		goto out;

	down_read(&fs_info->cleanup_work_sem);
	if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
	    (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
		up_read(&fs_info->cleanup_work_sem);
		goto out;
	}
	up_read(&fs_info->cleanup_work_sem);

	mutex_lock(&fs_info->cleaner_mutex);
	ret = btrfs_recover_relocation(fs_info);
	mutex_unlock(&fs_info->cleaner_mutex);
	if (ret < 0) {
		btrfs_warn(fs_info, "failed to recover relocation: %d", ret);
		goto out;
	}

	if (btrfs_test_opt(fs_info, FREE_SPACE_TREE) &&
	    !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
		btrfs_info(fs_info, "creating free space tree");
		ret = btrfs_create_free_space_tree(fs_info);
		if (ret) {
			btrfs_warn(fs_info,
				"failed to create free space tree: %d", ret);
			goto out;
		}
	}

	if (cache_opt != btrfs_free_space_cache_v1_active(fs_info)) {
		ret = btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
		if (ret)
			goto out;
	}

	ret = btrfs_resume_balance_async(fs_info);
	if (ret)
		goto out;

	ret = btrfs_resume_dev_replace_async(fs_info);
	if (ret) {
		btrfs_warn(fs_info, "failed to resume dev_replace");
		goto out;
	}

	btrfs_qgroup_rescan_resume(fs_info);

	if (!fs_info->uuid_root) {
		btrfs_info(fs_info, "creating UUID tree");
		ret = btrfs_create_uuid_tree(fs_info);
		if (ret) {
			btrfs_warn(fs_info,
				   "failed to create the UUID tree %d", ret);
			goto out;
		}
	}

out:
	return ret;
}

/*
 * Do various sanity and dependency checks of different features.
 *
 * @is_rw_mount:	If the mount is read-write.
 *
 * This is the place for less strict checks (like for subpage or artificial
 * feature dependencies).
 *
 * For strict checks or possible corruption detection, see
 * btrfs_validate_super().
 *
 * This should be called after btrfs_parse_options(), as some mount options
 * (space cache related) can modify on-disk format like free space tree and
 * screw up certain feature dependencies.
 */
int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount)
{
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	u64 incompat = btrfs_super_incompat_flags(disk_super);
	const u64 compat_ro = btrfs_super_compat_ro_flags(disk_super);
	const u64 compat_ro_unsupp = (compat_ro & ~BTRFS_FEATURE_COMPAT_RO_SUPP);

	if (incompat & ~BTRFS_FEATURE_INCOMPAT_SUPP) {
		btrfs_err(fs_info,
		"cannot mount because of unknown incompat features (0x%llx)",
		    incompat);
		return -EINVAL;
	}

	/* Runtime limitation for mixed block groups. */
	if ((incompat & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
	    (fs_info->sectorsize != fs_info->nodesize)) {
		btrfs_err(fs_info,
"unequal nodesize/sectorsize (%u != %u) are not allowed for mixed block groups",
			fs_info->nodesize, fs_info->sectorsize);
		return -EINVAL;
	}

	/* Mixed backref is an always-enabled feature. */
	incompat |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;

	/* Set compression related flags just in case. */
	if (fs_info->compress_type == BTRFS_COMPRESS_LZO)
		incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
	else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD)
		incompat |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD;

	/*
	 * An ancient flag, which should really be marked deprecated.
	 * Such a runtime limitation doesn't really need an incompat flag.
	 */
	if (btrfs_super_nodesize(disk_super) > PAGE_SIZE)
		incompat |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;

	if (compat_ro_unsupp && is_rw_mount) {
		btrfs_err(fs_info,
	"cannot mount read-write because of unknown compat_ro features (0x%llx)",
		       compat_ro);
		return -EINVAL;
	}

	/*
	 * We have unsupported RO compat features, although RO mounted, we
	 * should not cause any metadata writes, including log replay.
	 * Or we could screw up whatever the new feature requires.
	 */
	if (compat_ro_unsupp && btrfs_super_log_root(disk_super) &&
	    !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
		btrfs_err(fs_info,
"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
			  compat_ro);
		return -EINVAL;
	}

	/*
	 * Artificial limitations for block group tree, to force
	 * block-group-tree to rely on no-holes and free-space-tree.
	 */
	if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) &&
	    (!btrfs_fs_incompat(fs_info, NO_HOLES) ||
	     !btrfs_test_opt(fs_info, FREE_SPACE_TREE))) {
		btrfs_err(fs_info,
"block-group-tree feature requires no-holes and free-space-tree features");
		return -EINVAL;
	}

	/*
	 * Subpage runtime limitation on v1 cache.
	 *
	 * V1 space cache still has some hard coded PAGE_SIZE usage, while
	 * we're already defaulting to v2 cache, no need to bother v1 as it's
	 * going to be deprecated anyway.
	 */
	if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) {
		btrfs_warn(fs_info,
	"v1 space cache is not supported for page size %lu with sectorsize %u",
			   PAGE_SIZE, fs_info->sectorsize);
		return -EINVAL;
	}

	/* This can be called by remount, we need to protect the super block. */
	spin_lock(&fs_info->super_lock);
	btrfs_set_super_incompat_flags(disk_super, incompat);
	spin_unlock(&fs_info->super_lock);

	return 0;
}

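/*
 * Main mount entry point: read and validate the super block, load the chunk
 * and tree roots and bring up everything needed for a mounted filesystem.
 */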
int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices,
		      char *options)
{
	u32 sectorsize;
	u32 nodesize;
	u32 stripesize;
	u64 generation;
	u16 csum_type;
	struct btrfs_super_block *disk_super;
	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
	struct btrfs_root *tree_root;
	struct btrfs_root *chunk_root;
	int ret;
	int level;

	ret = init_mount_fs_info(fs_info, sb);
	if (ret)
		goto fail;

	/* These need to be init'ed before we start creating inodes and such. */
	tree_root = btrfs_alloc_root(fs_info, BTRFS_ROOT_TREE_OBJECTID,
				     GFP_KERNEL);
	fs_info->tree_root = tree_root;
	chunk_root = btrfs_alloc_root(fs_info, BTRFS_CHUNK_TREE_OBJECTID,
				      GFP_KERNEL);
	fs_info->chunk_root = chunk_root;
	if (!tree_root || !chunk_root) {
		ret = -ENOMEM;
		goto fail;
	}

	ret = btrfs_init_btree_inode(sb);
	if (ret)
		goto fail;

	invalidate_bdev(fs_devices->latest_dev->bdev);

	/*
	 * Read super block and check the signature bytes only
	 */
	disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev);
	if (IS_ERR(disk_super)) {
		ret = PTR_ERR(disk_super);
		goto fail_alloc;
	}

	btrfs_info(fs_info, "first mount of filesystem %pU", disk_super->fsid);
	/*
	 * Verify the type first, if that or the checksum value are
	 * corrupted, we'll find out
	 */
	csum_type = btrfs_super_csum_type(disk_super);
	if (!btrfs_supported_super_csum(csum_type)) {
		btrfs_err(fs_info, "unsupported checksum algorithm: %u",
			  csum_type);
		ret = -EINVAL;
		btrfs_release_disk_super(disk_super);
		goto fail_alloc;
	}

	fs_info->csum_size = btrfs_super_csum_size(disk_super);

	ret = btrfs_init_csum_hash(fs_info, csum_type);
	if (ret) {
		btrfs_release_disk_super(disk_super);
		goto fail_alloc;
	}

	/*
	 * We want to check superblock checksum, the type is stored inside.
	 * Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
	 */
	if (btrfs_check_super_csum(fs_info, disk_super)) {
		btrfs_err(fs_info, "superblock checksum mismatch");
		ret = -EINVAL;
		btrfs_release_disk_super(disk_super);
		goto fail_alloc;
	}

	/*
	 * super_copy is zeroed at allocation time and we never touch the
	 * following bytes up to INFO_SIZE, the checksum is calculated from
	 * the whole block of INFO_SIZE
	 */
	memcpy(fs_info->super_copy, disk_super, sizeof(*fs_info->super_copy));
	btrfs_release_disk_super(disk_super);

	disk_super = fs_info->super_copy;

	memcpy(fs_info->super_for_commit, fs_info->super_copy,
	       sizeof(*fs_info->super_for_commit));

	ret = btrfs_validate_mount_super(fs_info);
	if (ret) {
		btrfs_err(fs_info, "superblock contains fatal errors");
		ret = -EINVAL;
		goto fail_alloc;
	}

	if (!btrfs_super_root(disk_super)) {
		btrfs_err(fs_info, "invalid superblock tree root bytenr");
		ret = -EINVAL;
		goto fail_alloc;
	}

	/* check FS state, whether FS is broken. */
	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
		WRITE_ONCE(fs_info->fs_error, -EUCLEAN);

	/* Set up fs_info before parsing mount options */
	nodesize = btrfs_super_nodesize(disk_super);
	sectorsize = btrfs_super_sectorsize(disk_super);
	stripesize = sectorsize;
	fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));

	fs_info->nodesize = nodesize;
	fs_info->sectorsize = sectorsize;
	fs_info->sectorsize_bits = ilog2(sectorsize);
	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
	fs_info->stripesize = stripesize;

	/*
	 * Handle the space caching options appropriately now that we have the
	 * super block loaded and validated.
	 */
	btrfs_set_free_space_cache_settings(fs_info);

	if (!btrfs_check_options(fs_info, &fs_info->mount_opt, sb->s_flags)) {
		ret = -EINVAL;
		goto fail_alloc;
	}

	ret = btrfs_check_features(fs_info, !sb_rdonly(sb));
	if (ret < 0)
		goto fail_alloc;

	/*
	 * At this point our mount options are validated, if we set ->max_inline
	 * to something non-standard make sure we truncate it to sectorsize.
	 */
	fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);

	if (sectorsize < PAGE_SIZE) {
		struct btrfs_subpage_info *subpage_info;

		btrfs_warn(fs_info,
		"read-write for sector size %u with page size %lu is experimental",
			   sectorsize, PAGE_SIZE);
		subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
		if (!subpage_info) {
			ret = -ENOMEM;
			goto fail_alloc;
		}
		btrfs_init_subpage_info(subpage_info, sectorsize);
		fs_info->subpage_info = subpage_info;
	}

3362
	ret = btrfs_init_workqueues(fs_info);
3363
	if (ret)
3364
		goto fail_sb_buffer;
3365

3366 3367
	sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super);
	sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE);
3368

3369
	/* Update the values for the current filesystem. */
3370 3371
	sb->s_blocksize = sectorsize;
	sb->s_blocksize_bits = blksize_bits(sectorsize);
3372
	memcpy(&sb->s_uuid, fs_info->fs_devices->fsid, BTRFS_FSID_SIZE);
3373

3374
	mutex_lock(&fs_info->chunk_mutex);
	ret = btrfs_read_sys_array(fs_info);
	mutex_unlock(&fs_info->chunk_mutex);
	if (ret) {
		btrfs_err(fs_info, "failed to read the system array: %d", ret);
		goto fail_sb_buffer;
	}

	generation = btrfs_super_chunk_root_generation(disk_super);
	level = btrfs_super_chunk_root_level(disk_super);
	ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super),
			      generation, level);
	if (ret) {
		btrfs_err(fs_info, "failed to read chunk root");
		goto fail_tree_roots;
	}

	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
			   offsetof(struct btrfs_header, chunk_tree_uuid),
			   BTRFS_UUID_SIZE);

	ret = btrfs_read_chunk_tree(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to read chunk tree: %d", ret);
		goto fail_tree_roots;
	}

	/*
	 * At this point we know all the devices that make this filesystem,
	 * including the seed devices but we don't know yet if the replace
	 * target is required. So free devices that are not part of this
	 * filesystem but skip the replace target device which is checked
	 * below in btrfs_init_dev_replace().
	 */
	btrfs_free_extra_devids(fs_devices);
	if (!fs_devices->latest_dev->bdev) {
		btrfs_err(fs_info, "failed to read devices");
		ret = -EIO;
		goto fail_tree_roots;
	}

	ret = init_tree_roots(fs_info);
	if (ret)
		goto fail_tree_roots;

	/*
	 * Get zone type information of zoned block devices. This will also
	 * handle emulation of a zoned filesystem if a regular device has the
	 * zoned incompat feature flag set.
	 */
	ret = btrfs_get_dev_zone_info_all_devices(fs_info);
	if (ret) {
		btrfs_err(fs_info,
			  "zoned: failed to read device zone info: %d", ret);
		goto fail_block_groups;
	}

	/*
	 * If we have a uuid root and we're not being told to rescan we need to
	 * check the generation here so we can set the
	 * BTRFS_FS_UPDATE_UUID_TREE_GEN bit.  Otherwise we could commit the
	 * transaction during a balance or the log replay without updating the
	 * uuid generation, and then if we crash we would rescan the uuid tree,
	 * even though it was perfectly fine.
	 */
	if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) &&
	    fs_info->generation == btrfs_super_uuid_tree_generation(disk_super))
		set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);

	ret = btrfs_verify_dev_extents(fs_info);
	if (ret) {
		btrfs_err(fs_info,
			  "failed to verify dev extents against chunks: %d",
			  ret);
		goto fail_block_groups;
	}
	ret = btrfs_recover_balance(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to recover balance: %d", ret);
		goto fail_block_groups;
	}

	ret = btrfs_init_dev_stats(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to init dev_stats: %d", ret);
		goto fail_block_groups;
	}

	ret = btrfs_init_dev_replace(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to init dev_replace: %d", ret);
		goto fail_block_groups;
	}

	ret = btrfs_check_zoned_mode(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to initialize zoned mode: %d",
			  ret);
		goto fail_block_groups;
	}

	ret = btrfs_sysfs_add_fsid(fs_devices);
	if (ret) {
		btrfs_err(fs_info, "failed to init sysfs fsid interface: %d",
				ret);
		goto fail_block_groups;
	}

	ret = btrfs_sysfs_add_mounted(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to init sysfs interface: %d", ret);
		goto fail_fsdev_sysfs;
	}

	ret = btrfs_init_space_info(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to initialize space info: %d", ret);
		goto fail_sysfs;
	}

	ret = btrfs_read_block_groups(fs_info);
	if (ret) {
		btrfs_err(fs_info, "failed to read block groups: %d", ret);
		goto fail_sysfs;
	}

	btrfs_free_zone_cache(fs_info);

	btrfs_check_active_zone_reservation(fs_info);

	if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
	    !btrfs_check_rw_degradable(fs_info, NULL)) {
		btrfs_warn(fs_info,
		"writable mount is not allowed due to too many missing devices");
		ret = -EINVAL;
		goto fail_sysfs;
	}

	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info,
					       "btrfs-cleaner");
	if (IS_ERR(fs_info->cleaner_kthread)) {
		ret = PTR_ERR(fs_info->cleaner_kthread);
		goto fail_sysfs;
	}

	fs_info->transaction_kthread = kthread_run(transaction_kthread,
						   tree_root,
						   "btrfs-transaction");
	if (IS_ERR(fs_info->transaction_kthread)) {
		ret = PTR_ERR(fs_info->transaction_kthread);
		goto fail_cleaner;
	}

	ret = btrfs_read_qgroup_config(fs_info);
	if (ret)
		goto fail_trans_kthread;

	if (btrfs_build_ref_tree(fs_info))
		btrfs_err(fs_info, "couldn't build ref tree");

	/* Do not make disk changes in a broken FS or when nologreplay is given. */
	if (btrfs_super_log_root(disk_super) != 0 &&
	    !btrfs_test_opt(fs_info, NOLOGREPLAY)) {
		btrfs_info(fs_info, "start tree-log replay");
		ret = btrfs_replay_log(fs_info, fs_devices);
		if (ret)
			goto fail_qgroup;
	}

	fs_info->fs_root = btrfs_get_fs_root(fs_info, BTRFS_FS_TREE_OBJECTID, true);
	if (IS_ERR(fs_info->fs_root)) {
		ret = PTR_ERR(fs_info->fs_root);
		btrfs_warn(fs_info, "failed to read fs tree: %d", ret);
		fs_info->fs_root = NULL;
		goto fail_qgroup;
	}

	if (sb_rdonly(sb))
		return 0;

	ret = btrfs_start_pre_rw_mount(fs_info);
	if (ret) {
		close_ctree(fs_info);
		return ret;
	}
	btrfs_discard_resume(fs_info);

	if (fs_info->uuid_root &&
	    (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) ||
	     fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) {
		btrfs_info(fs_info, "checking UUID tree");
		ret = btrfs_check_uuid_tree(fs_info);
		if (ret) {
			btrfs_warn(fs_info,
				"failed to check the UUID tree: %d", ret);
			close_ctree(fs_info);
			return ret;
		}
	}

	set_bit(BTRFS_FS_OPEN, &fs_info->flags);

	/* Kick the cleaner thread so it'll start deleting snapshots. */
	if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
		wake_up_process(fs_info->cleaner_kthread);

	return 0;

fail_qgroup:
	btrfs_free_qgroup_config(fs_info);
fail_trans_kthread:
	kthread_stop(fs_info->transaction_kthread);
	btrfs_cleanup_transaction(fs_info);
	btrfs_free_fs_roots(fs_info);
fail_cleaner:
	kthread_stop(fs_info->cleaner_kthread);

	/*
	 * Make sure we're done with the btree inode before we stop our
	 * kthreads.
	 */
	filemap_write_and_wait(fs_info->btree_inode->i_mapping);

fail_sysfs:
	btrfs_sysfs_remove_mounted(fs_info);

fail_fsdev_sysfs:
	btrfs_sysfs_remove_fsid(fs_info->fs_devices);

fail_block_groups:
	btrfs_put_block_group_cache(fs_info);

fail_tree_roots:
	if (fs_info->data_reloc_root)
		btrfs_drop_and_free_fs_root(fs_info, fs_info->data_reloc_root);
	free_root_pointers(fs_info, true);
	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);

fail_sb_buffer:
	btrfs_stop_all_workers(fs_info);
	btrfs_free_block_groups(fs_info);
fail_alloc:
	btrfs_mapping_tree_free(fs_info);

	iput(fs_info->btree_inode);
fail:
	btrfs_close_devices(fs_info->fs_devices);
	ASSERT(ret < 0);
	return ret;
}
ALLOW_ERROR_INJECTION(open_ctree, ERRNO);

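/*
 * Endio for the super block writes submitted by write_dev_supers().
 *
 * On an I/O error the per-device sb_write_errors counter is bumped; a failed
 * primary copy (the REQ_FUA write) is weighted with
 * BTRFS_SUPER_PRIMARY_WRITE_ERROR so that wait_dev_supers() can tell it
 * apart from failures of the secondary copies.
 */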
static void btrfs_end_super_write(struct bio *bio)
{
	struct btrfs_device *device = bio->bi_private;
	struct folio_iter fi;

	bio_for_each_folio_all(fi, bio) {
		if (bio->bi_status) {
			btrfs_warn_rl_in_rcu(device->fs_info,
				"lost super block write due to IO error on %s (%d)",
				btrfs_dev_name(device),
				blk_status_to_errno(bio->bi_status));
			btrfs_dev_stat_inc_and_print(device,
						     BTRFS_DEV_STAT_WRITE_ERRS);
			/* Ensure failure if the primary sb fails. */
			if (bio->bi_opf & REQ_FUA)
				atomic_add(BTRFS_SUPER_PRIMARY_WRITE_ERROR,
					   &device->sb_write_errors);
			else
				atomic_inc(&device->sb_write_errors);
		}
		folio_unlock(fi.folio);
		folio_put(fi.folio);
	}

	bio_put(bio);
}

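/*
 * Read one super block copy of @bdev through the page cache.
 *
 * @copy_num:   which mirror to read, 0 is the primary copy
 * @drop_cache: invalidate the cached page first so the copy is re-read from
 *              the device (only valid for the primary copy)
 *
 * Returns the super block or an ERR_PTR() when the copy does not exist, is
 * out of the device range, has a bad magic or an unexpected bytenr.
 */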
struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
						   int copy_num, bool drop_cache)
{
	struct btrfs_super_block *super;
	struct page *page;
	u64 bytenr, bytenr_orig;
	struct address_space *mapping = bdev->bd_inode->i_mapping;
	int ret;

	bytenr_orig = btrfs_sb_offset(copy_num);
	ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr);
	if (ret == -ENOENT)
		return ERR_PTR(-EINVAL);
	else if (ret)
		return ERR_PTR(ret);

	if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev))
		return ERR_PTR(-EINVAL);

	if (drop_cache) {
		/* This should only be called with the primary sb. */
		ASSERT(copy_num == 0);

		/*
		 * Drop the page of the primary superblock, so later read will
		 * always read from the device.
		 */
		invalidate_inode_pages2_range(mapping,
				bytenr >> PAGE_SHIFT,
				(bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT);
	}

	page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS);
	if (IS_ERR(page))
		return ERR_CAST(page);

	super = page_address(page);
	if (btrfs_super_magic(super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(super);
		return ERR_PTR(-ENODATA);
	}

	if (btrfs_super_bytenr(super) != bytenr_orig) {
		btrfs_release_disk_super(super);
		return ERR_PTR(-EINVAL);
	}

	return super;
}

struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev)
{
	struct btrfs_super_block *super, *latest = NULL;
	int i;
	u64 transid = 0;

	/* we would like to check all the supers, but that would make
	 * a btrfs mount succeed after a mkfs from a different FS.
	 * So, we need to add a special mount option to scan for
	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
	 */
	for (i = 0; i < 1; i++) {
		super = btrfs_read_dev_one_super(bdev, i, false);
		if (IS_ERR(super))
			continue;

		if (!latest || btrfs_super_generation(super) > transid) {
			if (latest)
				btrfs_release_disk_super(super);

			latest = super;
			transid = btrfs_super_generation(super);
		}
	}

	return super;
}

/*
 * Write superblock @sb to the @device. Do not wait for completion, all the
 * folios we use for writing are locked.
 *
 * Write @max_mirrors copies of the superblock, where 0 means default that fit
 * the expected device size at commit time. Note that max_mirrors must be
 * same for write and wait phases.
 *
 * Return number of errors when folio is not found or submission fails.
 */
static int write_dev_supers(struct btrfs_device *device,
			    struct btrfs_super_block *sb, int max_mirrors)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct address_space *mapping = device->bdev->bd_inode->i_mapping;
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	int i;
	int ret;
	u64 bytenr, bytenr_orig;

	atomic_set(&device->sb_write_errors, 0);

	if (max_mirrors == 0)
		max_mirrors = BTRFS_SUPER_MIRROR_MAX;

	shash->tfm = fs_info->csum_shash;

	for (i = 0; i < max_mirrors; i++) {
		struct folio *folio;
		struct bio *bio;
		struct btrfs_super_block *disk_super;
		size_t offset;

		bytenr_orig = btrfs_sb_offset(i);
		ret = btrfs_sb_log_location(device, i, WRITE, &bytenr);
		if (ret == -ENOENT) {
			continue;
		} else if (ret < 0) {
			btrfs_err(device->fs_info,
				"couldn't get super block location for mirror %d",
				i);
			atomic_inc(&device->sb_write_errors);
			continue;
		}
		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
		    device->commit_total_bytes)
			break;

		btrfs_set_super_bytenr(sb, bytenr_orig);

		crypto_shash_digest(shash, (const char *)sb + BTRFS_CSUM_SIZE,
				    BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE,
				    sb->csum);

		folio = __filemap_get_folio(mapping, bytenr >> PAGE_SHIFT,
					    FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
					    GFP_NOFS);
		if (IS_ERR(folio)) {
			btrfs_err(device->fs_info,
			    "couldn't get super block page for bytenr %llu",
			    bytenr);
			atomic_inc(&device->sb_write_errors);
			continue;
		}
		ASSERT(folio_order(folio) == 0);

		offset = offset_in_folio(folio, bytenr);
		disk_super = folio_address(folio) + offset;
		memcpy(disk_super, sb, BTRFS_SUPER_INFO_SIZE);

		/*
		 * Directly use bios here instead of relying on the page cache
		 * to do I/O, so we don't lose the ability to do integrity
		 * checking.
		 */
		bio = bio_alloc(device->bdev, 1,
				REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO,
				GFP_NOFS);
		bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT;
		bio->bi_private = device;
		bio->bi_end_io = btrfs_end_super_write;
		bio_add_folio_nofail(bio, folio, BTRFS_SUPER_INFO_SIZE, offset);

		/*
		 * We FUA only the first super block.  The others we allow to
		 * go down lazy and there's a short window where the on-disk
		 * copies might still contain the older version.
		 */
		if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
			bio->bi_opf |= REQ_FUA;
		submit_bio(bio);

		if (btrfs_advance_sb_log(device, i))
			atomic_inc(&device->sb_write_errors);
	}
	return atomic_read(&device->sb_write_errors) < i ? 0 : -1;
}

/*
 * Wait for write completion of superblocks done by write_dev_supers,
 * @max_mirrors same for write and wait phases.
 *
 * Return -1 if primary super block write failed or when there were no super
 * block copies written. Otherwise 0.
 */
static int wait_dev_supers(struct btrfs_device *device, int max_mirrors)
{
	int i;
	int errors = 0;
	bool primary_failed = false;
	int ret;
	u64 bytenr;

	if (max_mirrors == 0)
		max_mirrors = BTRFS_SUPER_MIRROR_MAX;

	for (i = 0; i < max_mirrors; i++) {
		struct folio *folio;

		ret = btrfs_sb_log_location(device, i, READ, &bytenr);
		if (ret == -ENOENT) {
			break;
		} else if (ret < 0) {
			errors++;
			if (i == 0)
				primary_failed = true;
			continue;
		}
		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
		    device->commit_total_bytes)
			break;

		folio = filemap_get_folio(device->bdev->bd_inode->i_mapping,
					  bytenr >> PAGE_SHIFT);
		/* If the folio has been removed, then we know it completed. */
		if (IS_ERR(folio))
			continue;
		ASSERT(folio_order(folio) == 0);

		/* Folio will be unlocked once the write completes. */
		folio_wait_locked(folio);
		folio_put(folio);
	}

	errors += atomic_read(&device->sb_write_errors);
	if (errors >= BTRFS_SUPER_PRIMARY_WRITE_ERROR)
		primary_failed = true;
	if (primary_failed) {
		btrfs_err(device->fs_info, "error writing primary super block to device %llu",
			  device->devid);
		return -1;
	}

	return errors < i ? 0 : -1;
}

/*
 * Endio for write_dev_flush; this wakes anyone waiting for the barrier when
 * it is done.
 */
static void btrfs_end_empty_barrier(struct bio *bio)
{
	bio_uninit(bio);
	complete(bio->bi_private);
}

/*
 * Submit a flush request to the device if it supports it. Error handling is
 * done in the waiting counterpart.
 */
static void write_dev_flush(struct btrfs_device *device)
{
	struct bio *bio = &device->flush_bio;

	device->last_flush_error = BLK_STS_OK;

	bio_init(bio, device->bdev, NULL, 0,
		 REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH);
	bio->bi_end_io = btrfs_end_empty_barrier;
	init_completion(&device->flush_wait);
	bio->bi_private = &device->flush_wait;
	submit_bio(bio);
	set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
}

/*
 * If the flush bio has been submitted by write_dev_flush, wait for it.
 * Return true for any error, and false otherwise.
 */
static bool wait_dev_flush(struct btrfs_device *device)
{
	struct bio *bio = &device->flush_bio;

	if (!test_and_clear_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state))
		return false;

	wait_for_completion_io(&device->flush_wait);

	if (bio->bi_status) {
		device->last_flush_error = bio->bi_status;
		btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS);
		return true;
	}

	return false;
}

/*
 * Send an empty flush down to each device in parallel, then wait for them.
 */
static int barrier_all_devices(struct btrfs_fs_info *info)
{
	struct list_head *head;
	struct btrfs_device *dev;
	int errors_wait = 0;

	lockdep_assert_held(&info->fs_devices->device_list_mutex);
	/* send down all the barriers */
	head = &info->fs_devices->devices;
	list_for_each_entry(dev, head, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
			continue;
		if (!dev->bdev)
			continue;
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
			continue;

		write_dev_flush(dev);
	}

	/* wait for all the barriers */
	list_for_each_entry(dev, head, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
			continue;
		if (!dev->bdev) {
			errors_wait++;
			continue;
		}
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
			continue;

		if (wait_dev_flush(dev))
			errors_wait++;
	}

	/*
	 * Checks last_flush_error of disks in order to determine the device
	 * state.
	 */
	if (errors_wait && !btrfs_check_rw_degradable(info, NULL))
		return -EIO;

	return 0;
}

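/*
 * Return how many device barrier/flush failures can be tolerated for the
 * block group profiles in @flags, i.e. the minimum of the tolerated_failures
 * values of all profiles that are present.
 */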
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags)
{
	int raid_type;
	int min_tolerated = INT_MAX;

	if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 ||
	    (flags & BTRFS_AVAIL_ALLOC_BIT_SINGLE))
		min_tolerated = min_t(int, min_tolerated,
				    btrfs_raid_array[BTRFS_RAID_SINGLE].
				    tolerated_failures);

	for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
		if (raid_type == BTRFS_RAID_SINGLE)
			continue;
		if (!(flags & btrfs_raid_array[raid_type].bg_flag))
			continue;
		min_tolerated = min_t(int, min_tolerated,
				    btrfs_raid_array[raid_type].
				    tolerated_failures);
	}

	if (min_tolerated == INT_MAX) {
		pr_warn("BTRFS: unknown raid flag: %llu", flags);
		min_tolerated = 0;
	}

	return min_tolerated;
}

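/*
 * Write the committed super block to all writeable devices.
 *
 * Device barriers are sent first (unless nobarrier is used), the per-device
 * dev_item is copied into the super block, the result is validated and then
 * written and waited for on each device. Up to num_devices - 1 failing
 * devices are tolerated; past that the filesystem is forced into an error
 * state via btrfs_handle_fs_error().
 */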
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors)
{
	struct list_head *head;
	struct btrfs_device *dev;
	struct btrfs_super_block *sb;
	struct btrfs_dev_item *dev_item;
	int ret;
	int do_barriers;
	int max_errors;
	int total_errors = 0;
	u64 flags;

	do_barriers = !btrfs_test_opt(fs_info, NOBARRIER);

	/*
	 * max_mirrors == 0 indicates we're from commit_transaction,
	 * not from fsync where the tree roots in fs_info have not
	 * been consistent on disk.
	 */
	if (max_mirrors == 0)
		backup_super_roots(fs_info);

	sb = fs_info->super_for_commit;
	dev_item = &sb->dev_item;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	head = &fs_info->fs_devices->devices;
	max_errors = btrfs_super_num_devices(fs_info->super_copy) - 1;

	if (do_barriers) {
		ret = barrier_all_devices(fs_info);
		if (ret) {
			mutex_unlock(
				&fs_info->fs_devices->device_list_mutex);
			btrfs_handle_fs_error(fs_info, ret,
					      "errors while submitting device barriers.");
			return ret;
		}
	}

	list_for_each_entry(dev, head, dev_list) {
		if (!dev->bdev) {
			total_errors++;
			continue;
		}
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
			continue;

		btrfs_set_stack_device_generation(dev_item, 0);
		btrfs_set_stack_device_type(dev_item, dev->type);
		btrfs_set_stack_device_id(dev_item, dev->devid);
		btrfs_set_stack_device_total_bytes(dev_item,
						   dev->commit_total_bytes);
		btrfs_set_stack_device_bytes_used(dev_item,
						  dev->commit_bytes_used);
		btrfs_set_stack_device_io_align(dev_item, dev->io_align);
		btrfs_set_stack_device_io_width(dev_item, dev->io_width);
		btrfs_set_stack_device_sector_size(dev_item, dev->sector_size);
		memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE);
		memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid,
		       BTRFS_FSID_SIZE);

		flags = btrfs_super_flags(sb);
		btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN);

		ret = btrfs_validate_write_super(fs_info, sb);
		if (ret < 0) {
			mutex_unlock(&fs_info->fs_devices->device_list_mutex);
			btrfs_handle_fs_error(fs_info, -EUCLEAN,
				"unexpected superblock corruption detected");
			return -EUCLEAN;
		}

		ret = write_dev_supers(dev, sb, max_mirrors);
		if (ret)
			total_errors++;
	}
	if (total_errors > max_errors) {
		btrfs_err(fs_info, "%d errors while writing supers",
			  total_errors);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);

		/* FUA is masked off if unsupported and can't be the reason */
		btrfs_handle_fs_error(fs_info, -EIO,
				      "%d errors while writing supers",
				      total_errors);
		return -EIO;
	}

	total_errors = 0;
	list_for_each_entry(dev, head, dev_list) {
		if (!dev->bdev)
			continue;
		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
		    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))
			continue;

		ret = wait_dev_supers(dev, max_mirrors);
		if (ret)
			total_errors++;
	}
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
	if (total_errors > max_errors) {
		btrfs_handle_fs_error(fs_info, -EIO,
				      "%d errors while writing supers",
				      total_errors);
		return -EIO;
	}
	return 0;
}

/* Drop a fs root from the radix tree and free it. */
void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
				  struct btrfs_root *root)
{
	bool drop_ref = false;

	spin_lock(&fs_info->fs_roots_radix_lock);
	radix_tree_delete(&fs_info->fs_roots_radix,
			  (unsigned long)btrfs_root_id(root));
	if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
		drop_ref = true;
	spin_unlock(&fs_info->fs_roots_radix_lock);

	if (BTRFS_FS_ERROR(fs_info)) {
		ASSERT(root->log_root == NULL);
		if (root->reloc_root) {
			btrfs_put_root(root->reloc_root);
			root->reloc_root = NULL;
		}
	}

	if (drop_ref)
		btrfs_put_root(root);
}

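/*
 * Run delayed iputs, wait for any ongoing cleanup work and commit the
 * current transaction. Used by close_ctree() to leave the filesystem in a
 * clean state on unmount.
 */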
int btrfs_commit_super(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root = fs_info->tree_root;
	struct btrfs_trans_handle *trans;

	mutex_lock(&fs_info->cleaner_mutex);
	btrfs_run_delayed_iputs(fs_info);
	mutex_unlock(&fs_info->cleaner_mutex);
	wake_up_process(fs_info->cleaner_kthread);

	/* Wait until ongoing cleanup work is done. */
	down_write(&fs_info->cleanup_work_sem);
	up_write(&fs_info->cleanup_work_sem);

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);
	return btrfs_commit_transaction(trans);
}

static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info)
{
	struct btrfs_transaction *trans;
	struct btrfs_transaction *tmp;
	bool found = false;

	/*
	 * This function is only called at the very end of close_ctree(),
	 * thus no other running transaction, no need to take trans_lock.
	 */
	ASSERT(test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags));
	list_for_each_entry_safe(trans, tmp, &fs_info->trans_list, list) {
		struct extent_state *cached = NULL;
		u64 dirty_bytes = 0;
		u64 cur = 0;
		u64 found_start;
		u64 found_end;

		found = true;
		while (find_first_extent_bit(&trans->dirty_pages, cur,
			&found_start, &found_end, EXTENT_DIRTY, &cached)) {
			dirty_bytes += found_end + 1 - found_start;
			cur = found_end + 1;
		}
		btrfs_warn(fs_info,
	"transaction %llu (with %llu dirty metadata bytes) is not committed",
			   trans->transid, dirty_bytes);
		btrfs_cleanup_one_transaction(trans, fs_info);

		if (trans == fs_info->running_transaction)
			fs_info->running_transaction = NULL;
		list_del_init(&trans->list);

		btrfs_put_transaction(trans);
		trace_btrfs_transaction_commit(fs_info);
	}
	ASSERT(!found);
}

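/*
 * Tear down the filesystem: stop background work (block group reclaim, the
 * cleaner and transaction kthreads, scrub, balance, dev-replace, discard),
 * commit the last transaction on a read-write mount and then release roots,
 * block groups and devices. The ordering below matters, see the individual
 * comments.
 */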
void __cold close_ctree(struct btrfs_fs_info *fs_info)
{
	int ret;

	set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags);

	/*
	 * If we had UNFINISHED_DROPS we could still be processing them, so
	 * clear that bit and wake up relocation so it can stop.
	 * We must do this before stopping the block group reclaim task, because
	 * at btrfs_relocate_block_group() we wait for this bit, and after the
	 * wait we stop with -EINTR if btrfs_fs_closing() returns non-zero - we
	 * have just set BTRFS_FS_CLOSING_START, so btrfs_fs_closing() will
	 * return 1.
	 */
	btrfs_wake_unfinished_drop(fs_info);

	/*
	 * We may have the reclaim task running and relocating a data block group,
	 * in which case it may create delayed iputs. So stop it before we park
	 * the cleaner kthread otherwise we can get new delayed iputs after
	 * parking the cleaner, and that can make the async reclaim task to hang
	 * if it's waiting for delayed iputs to complete, since the cleaner is
	 * parked and can not run delayed iputs - this will make us hang when
	 * trying to stop the async reclaim task.
	 */
	cancel_work_sync(&fs_info->reclaim_bgs_work);
	/*
	 * We don't want the cleaner to start new transactions, add more delayed
	 * iputs, etc. while we're closing. We can't use kthread_stop() yet
	 * because that frees the task_struct, and the transaction kthread might
	 * still try to wake up the cleaner.
	 */
	kthread_park(fs_info->cleaner_kthread);

	/* wait for the qgroup rescan worker to stop */
	btrfs_qgroup_wait_for_completion(fs_info, false);

	/* wait for the uuid_scan task to finish */
	down(&fs_info->uuid_tree_rescan_sem);
	/* avoid complaints from lockdep et al., set sem back to initial state */
	up(&fs_info->uuid_tree_rescan_sem);

	/* pause restriper - we want to resume on mount */
	btrfs_pause_balance(fs_info);

	btrfs_dev_replace_suspend_for_unmount(fs_info);

	btrfs_scrub_cancel(fs_info);

	/* wait for any defraggers to finish */
	wait_event(fs_info->transaction_wait,
		   (atomic_read(&fs_info->defrag_running) == 0));

	/* clear out the rbtree of defraggable inodes */
	btrfs_cleanup_defrag_inodes(fs_info);

	/*
	 * After we parked the cleaner kthread, ordered extents may have
	 * completed and created new delayed iputs. If one of the async reclaim
	 * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
	 * can hang forever trying to stop it, because if a delayed iput is
	 * added after it ran btrfs_run_delayed_iputs() and before it called
	 * btrfs_wait_on_delayed_iputs(), it will hang forever since there is
	 * no one else to run iputs.
	 *
	 * So wait for all ongoing ordered extents to complete and then run
	 * delayed iputs. This works because once we reach this point no one
	 * can either create new ordered extents nor create delayed iputs
	 * through some other means.
	 *
	 * Also note that btrfs_wait_ordered_roots() is not safe here, because
	 * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
	 * but the delayed iput for the respective inode is made only when doing
	 * the final btrfs_put_ordered_extent() (which must happen at
	 * btrfs_finish_ordered_io() when we are unmounting).
	 */
	btrfs_flush_workqueue(fs_info->endio_write_workers);
	/* Ordered extents for free space inodes. */
	btrfs_flush_workqueue(fs_info->endio_freespace_worker);
	btrfs_run_delayed_iputs(fs_info);

	cancel_work_sync(&fs_info->async_reclaim_work);
	cancel_work_sync(&fs_info->async_data_reclaim_work);
	cancel_work_sync(&fs_info->preempt_reclaim_work);

	/* Cancel or finish ongoing discard work */
	btrfs_discard_cleanup(fs_info);

	if (!sb_rdonly(fs_info->sb)) {
		/*
		 * The cleaner kthread is stopped, so do one final pass over
		 * unused block groups.
		 */
		btrfs_delete_unused_bgs(fs_info);

		/*
		 * There might be existing delayed inode workers still running
		 * and holding an empty delayed inode item. We must wait for
		 * them to complete first because they can create a transaction.
		 * This happens when someone calls btrfs_balance_delayed_items()
		 * and then a transaction commit runs the same delayed nodes
		 * before any delayed worker has done something with the nodes.
		 * We must wait for any worker here and not at transaction
		 * commit time since that could cause a deadlock.
		 * This is a very rare case.
		 */
		btrfs_flush_workqueue(fs_info->delayed_workers);

		ret = btrfs_commit_super(fs_info);
		if (ret)
			btrfs_err(fs_info, "commit super ret %d", ret);
	}

	if (BTRFS_FS_ERROR(fs_info))
		btrfs_error_commit_super(fs_info);

	kthread_stop(fs_info->transaction_kthread);
	kthread_stop(fs_info->cleaner_kthread);

	ASSERT(list_empty(&fs_info->delayed_iputs));
	set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);

	if (btrfs_check_quota_leak(fs_info)) {
		WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
		btrfs_err(fs_info, "qgroup reserved space leaked");
	}

	btrfs_free_qgroup_config(fs_info);
	ASSERT(list_empty(&fs_info->delalloc_roots));

	if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
		btrfs_info(fs_info, "at unmount delalloc count %lld",
		       percpu_counter_sum(&fs_info->delalloc_bytes));
	}

	if (percpu_counter_sum(&fs_info->ordered_bytes))
		btrfs_info(fs_info, "at unmount dio bytes count %lld",
			   percpu_counter_sum(&fs_info->ordered_bytes));

	btrfs_sysfs_remove_mounted(fs_info);
	btrfs_sysfs_remove_fsid(fs_info->fs_devices);

	btrfs_put_block_group_cache(fs_info);

	/*
	 * We must make sure there is no read request left to submit after we
	 * stop all the workers.
	 */
	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
	btrfs_stop_all_workers(fs_info);

	/* We shouldn't have any transaction open at this point */
	warn_about_uncommitted_trans(fs_info);

	clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
	free_root_pointers(fs_info, true);
	btrfs_free_fs_roots(fs_info);

	/*
	 * We must free the block groups after dropping the fs_roots as we could
	 * have had an IO error and have left over tree log blocks that aren't
	 * cleaned up until the fs roots are freed.  This makes the block group
	 * accounting appear to be wrong because there's pending reserved bytes,
	 * so make sure we do the block group cleanup afterwards.
	 */
	btrfs_free_block_groups(fs_info);

	iput(fs_info->btree_inode);

	btrfs_mapping_tree_free(fs_info);
	btrfs_close_devices(fs_info->fs_devices);
}

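/*
 * Mark an extent buffer dirty within the running transaction.
 *
 * The buffer must be write locked and belong to the current generation,
 * otherwise the transaction is aborted because the tree block would be
 * modified outside of the transaction that allocated or CoWed it.
 */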
void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans,
			     struct extent_buffer *buf)
{
	struct btrfs_fs_info *fs_info = buf->fs_info;
	u64 transid = btrfs_header_generation(buf);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
	/*
	 * This is a fast path so only do this check if we have sanity tests
	 * enabled.  Normal people shouldn't be using unmapped buffers as dirty
	 * outside of the sanity tests.
	 */
	if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &buf->bflags)))
		return;
#endif
	/* This is an active transaction (its state < TRANS_STATE_UNBLOCKED). */
	ASSERT(trans->transid == fs_info->generation);
	btrfs_assert_tree_write_locked(buf);
	if (unlikely(transid != fs_info->generation)) {
		btrfs_abort_transaction(trans, -EUCLEAN);
		btrfs_crit(fs_info,
"dirty buffer transid mismatch, logical %llu found transid %llu running transid %llu",
			   buf->start, transid, fs_info->generation);
	}
	set_extent_buffer_dirty(buf);
}

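/*
 * Throttle the caller when too much dirty btree metadata has accumulated,
 * optionally balancing delayed items first. This is a no-op in memory
 * allocation context (PF_MEMALLOC).
 */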
static void __btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info,
					int flush_delayed)
{
	/*
	 * Looks as though older kernels can get into trouble with this code;
	 * they end up stuck in balance_dirty_pages() forever.
	 */
	int ret;

	if (current->flags & PF_MEMALLOC)
		return;

	if (flush_delayed)
		btrfs_balance_delayed_items(fs_info);

	ret = __percpu_counter_compare(&fs_info->dirty_metadata_bytes,
				     BTRFS_DIRTY_METADATA_THRESH,
				     fs_info->dirty_metadata_batch);
	if (ret > 0) {
		balance_dirty_pages_ratelimited(fs_info->btree_inode->i_mapping);
	}
}

void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info)
{
	__btrfs_btree_balance_dirty(fs_info, 1);
}

void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
{
	__btrfs_btree_balance_dirty(fs_info, 0);
}

static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
{
	/* cleanup FS via transaction */
	btrfs_cleanup_transaction(fs_info);

	mutex_lock(&fs_info->cleaner_mutex);
	btrfs_run_delayed_iputs(fs_info);
	mutex_unlock(&fs_info->cleaner_mutex);

	down_write(&fs_info->cleanup_work_sem);
	up_write(&fs_info->cleanup_work_sem);
}

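/*
 * Walk all fs roots and free their log trees as well as the log root tree.
 * Called from btrfs_cleanup_transaction(), where the logs can no longer be
 * committed.
 */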
static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *gang[8];
	u64 root_objectid = 0;
	int ret;

	spin_lock(&fs_info->fs_roots_radix_lock);
	while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
					     (void **)gang, root_objectid,
					     ARRAY_SIZE(gang))) != 0) {
		int i;

		for (i = 0; i < ret; i++)
			gang[i] = btrfs_grab_root(gang[i]);
		spin_unlock(&fs_info->fs_roots_radix_lock);

		for (i = 0; i < ret; i++) {
			if (!gang[i])
				continue;
			root_objectid = btrfs_root_id(gang[i]);
			btrfs_free_log(NULL, gang[i]);
			btrfs_put_root(gang[i]);
		}
		root_objectid++;
		spin_lock(&fs_info->fs_roots_radix_lock);
	}
	spin_unlock(&fs_info->fs_roots_radix_lock);
	btrfs_free_log_root_tree(NULL, fs_info);
}

static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
{
	struct btrfs_ordered_extent *ordered;

	spin_lock(&root->ordered_extent_lock);
	/*
	 * This will just short circuit the ordered completion stuff which will
	 * make sure the ordered extent gets properly cleaned up.
	 */
	list_for_each_entry(ordered, &root->ordered_extents,
			    root_extent_list)
		set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
	spin_unlock(&root->ordered_extent_lock);
}

static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root;
	LIST_HEAD(splice);

	spin_lock(&fs_info->ordered_root_lock);
	list_splice_init(&fs_info->ordered_roots, &splice);
	while (!list_empty(&splice)) {
		root = list_first_entry(&splice, struct btrfs_root,
					ordered_root);
		list_move_tail(&root->ordered_root,
			       &fs_info->ordered_roots);

		spin_unlock(&fs_info->ordered_root_lock);
		btrfs_destroy_ordered_extents(root);

		cond_resched();
		spin_lock(&fs_info->ordered_root_lock);
	}
	spin_unlock(&fs_info->ordered_root_lock);

	/*
	 * We need this here because if we've been flipped read-only we won't
	 * get sync() from the umount, so we need to make sure any ordered
	 * extents that haven't had their dirty pages IO start writeout yet
	 * actually get run and error out properly.
	 */
	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
}

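/*
 * Drop all delayed references of an aborted transaction. Every ref head is
 * deleted and its reserved space released; heads that still have
 * must_insert_reserved set get their range accounted as pinned and then
 * unpinned so the space accounting stays consistent.
 */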
static void btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
				       struct btrfs_fs_info *fs_info)
{
	struct rb_node *node;
	struct btrfs_delayed_ref_root *delayed_refs;
	struct btrfs_delayed_ref_node *ref;

	delayed_refs = &trans->delayed_refs;

	spin_lock(&delayed_refs->lock);
	if (atomic_read(&delayed_refs->num_entries) == 0) {
		spin_unlock(&delayed_refs->lock);
		btrfs_debug(fs_info, "delayed_refs has NO entry");
		return;
	}

	while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
		struct btrfs_delayed_ref_head *head;
		struct rb_node *n;
		bool pin_bytes = false;

		head = rb_entry(node, struct btrfs_delayed_ref_head,
				href_node);
		if (btrfs_delayed_ref_lock(delayed_refs, head))
			continue;

		spin_lock(&head->lock);
		while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
			ref = rb_entry(n, struct btrfs_delayed_ref_node,
				       ref_node);
			rb_erase_cached(&ref->ref_node, &head->ref_tree);
			RB_CLEAR_NODE(&ref->ref_node);
			if (!list_empty(&ref->add_list))
				list_del(&ref->add_list);
			atomic_dec(&delayed_refs->num_entries);
			btrfs_put_delayed_ref(ref);
			btrfs_delayed_refs_rsv_release(fs_info, 1, 0);
		}
		if (head->must_insert_reserved)
			pin_bytes = true;
		btrfs_free_delayed_extent_op(head->extent_op);
		btrfs_delete_ref_head(delayed_refs, head);
		spin_unlock(&head->lock);
		spin_unlock(&delayed_refs->lock);
		mutex_unlock(&head->mutex);

		if (pin_bytes) {
			struct btrfs_block_group *cache;

			cache = btrfs_lookup_block_group(fs_info, head->bytenr);
			BUG_ON(!cache);

			spin_lock(&cache->space_info->lock);
			spin_lock(&cache->lock);
			cache->pinned += head->num_bytes;
			btrfs_space_info_update_bytes_pinned(fs_info,
				cache->space_info, head->num_bytes);
			cache->reserved -= head->num_bytes;
			cache->space_info->bytes_reserved -= head->num_bytes;
			spin_unlock(&cache->lock);
			spin_unlock(&cache->space_info->lock);

			btrfs_put_block_group(cache);

			btrfs_error_unpin_extent_range(fs_info, head->bytenr,
				head->bytenr + head->num_bytes - 1);
		}
		btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
		btrfs_put_delayed_ref_head(head);
		cond_resched();
		spin_lock(&delayed_refs->lock);
	}
	btrfs_qgroup_destroy_extent_records(trans);

	spin_unlock(&delayed_refs->lock);
}

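/*
 * Remove all inodes with pending delalloc from the root's list and
 * invalidate their pages, so no new writeback is started for them while an
 * aborted transaction is being torn down.
 */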
static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
{
	struct btrfs_inode *btrfs_inode;
	LIST_HEAD(splice);

	spin_lock(&root->delalloc_lock);
	list_splice_init(&root->delalloc_inodes, &splice);

	while (!list_empty(&splice)) {
		struct inode *inode = NULL;

		btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
					       delalloc_inodes);
		btrfs_del_delalloc_inode(btrfs_inode);
		spin_unlock(&root->delalloc_lock);

		/*
		 * Make sure we get a live inode and that it'll not disappear
		 * meanwhile.
		 */
		inode = igrab(&btrfs_inode->vfs_inode);
		if (inode) {
			unsigned int nofs_flag;

			nofs_flag = memalloc_nofs_save();
			invalidate_inode_pages2(inode->i_mapping);
			memalloc_nofs_restore(nofs_flag);
			iput(inode);
		}
		spin_lock(&root->delalloc_lock);
	}
	spin_unlock(&root->delalloc_lock);
}

static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *root;
	LIST_HEAD(splice);

	spin_lock(&fs_info->delalloc_root_lock);
	list_splice_init(&fs_info->delalloc_roots, &splice);
	while (!list_empty(&splice)) {
		root = list_first_entry(&splice, struct btrfs_root,
					 delalloc_root);
		root = btrfs_grab_root(root);
		BUG_ON(!root);
		spin_unlock(&fs_info->delalloc_root_lock);

		btrfs_destroy_delalloc_inodes(root);
		btrfs_put_root(root);

		spin_lock(&fs_info->delalloc_root_lock);
	}
	spin_unlock(&fs_info->delalloc_root_lock);
}

static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info,
					 struct extent_io_tree *dirty_pages,
					 int mark)
{
	struct extent_buffer *eb;
	u64 start = 0;
	u64 end;

	while (find_first_extent_bit(dirty_pages, start, &start, &end,
				     mark, NULL)) {
		clear_extent_bits(dirty_pages, start, end, mark);
		while (start <= end) {
			eb = find_extent_buffer(fs_info, start);
			start += fs_info->nodesize;
			if (!eb)
				continue;

			btrfs_tree_lock(eb);
			wait_on_extent_buffer_writeback(eb);
			btrfs_clear_buffer_dirty(NULL, eb);
			btrfs_tree_unlock(eb);

			free_extent_buffer_stale(eb);
		}
	}
}

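/*
 * Unpin all extents left in the pinned tree of an aborted transaction.
 * The unused_bg_unpin_mutex serializes this against
 * btrfs_finish_extent_commit() unpinning the same range.
 */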
static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info,
					struct extent_io_tree *unpin)
{
	u64 start;
	u64 end;

	while (1) {
		struct extent_state *cached_state = NULL;

		/*
		 * The btrfs_finish_extent_commit() may get the same range as
		 * ours between find_first_extent_bit and clear_extent_dirty.
		 * Hence, hold the unused_bg_unpin_mutex to avoid double
		 * unpinning of the same extent range.
		 */
		mutex_lock(&fs_info->unused_bg_unpin_mutex);
		if (!find_first_extent_bit(unpin, 0, &start, &end,
					   EXTENT_DIRTY, &cached_state)) {
			mutex_unlock(&fs_info->unused_bg_unpin_mutex);
			break;
		}

		clear_extent_dirty(unpin, start, end, &cached_state);
		free_extent_state(cached_state);
		btrfs_error_unpin_extent_range(fs_info, start, end);
		mutex_unlock(&fs_info->unused_bg_unpin_mutex);
		cond_resched();
	}
}

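/*
 * Drop the free space cache io_ctl inode of a block group and the block
 * group reference that the pending IO held.
 */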
static void btrfs_cleanup_bg_io(struct btrfs_block_group *cache)
{
	struct inode *inode;

	inode = cache->io_ctl.inode;
	if (inode) {
		unsigned int nofs_flag;

		nofs_flag = memalloc_nofs_save();
		invalidate_inode_pages2(inode->i_mapping);
		memalloc_nofs_restore(nofs_flag);

		BTRFS_I(inode)->generation = 0;
		cache->io_ctl.inode = NULL;
		iput(inode);
	}
	ASSERT(cache->io_ctl.pages == NULL);
	btrfs_put_block_group(cache);
}

void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
			     struct btrfs_fs_info *fs_info)
{
	struct btrfs_block_group *cache;

	spin_lock(&cur_trans->dirty_bgs_lock);
	while (!list_empty(&cur_trans->dirty_bgs)) {
		cache = list_first_entry(&cur_trans->dirty_bgs,
					 struct btrfs_block_group,
					 dirty_list);

		if (!list_empty(&cache->io_list)) {
			spin_unlock(&cur_trans->dirty_bgs_lock);
			list_del_init(&cache->io_list);
			btrfs_cleanup_bg_io(cache);
			spin_lock(&cur_trans->dirty_bgs_lock);
		}

		list_del_init(&cache->dirty_list);
		spin_lock(&cache->lock);
		cache->disk_cache_state = BTRFS_DC_ERROR;
		spin_unlock(&cache->lock);

		spin_unlock(&cur_trans->dirty_bgs_lock);
		btrfs_put_block_group(cache);
		btrfs_dec_delayed_refs_rsv_bg_updates(fs_info);
		spin_lock(&cur_trans->dirty_bgs_lock);
	}
	spin_unlock(&cur_trans->dirty_bgs_lock);

	/*
	 * Refer to the definition of io_bgs member for details why it's safe
	 * to use it without any locking
	 */
	while (!list_empty(&cur_trans->io_bgs)) {
		cache = list_first_entry(&cur_trans->io_bgs,
					 struct btrfs_block_group,
					 io_list);

		list_del_init(&cache->io_list);
		spin_lock(&cache->lock);
		cache->disk_cache_state = BTRFS_DC_ERROR;
		spin_unlock(&cache->lock);
		btrfs_cleanup_bg_io(cache);
	}
}

static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info)
{
	struct btrfs_root *gang[8];
	int i;
	int ret;

	spin_lock(&fs_info->fs_roots_radix_lock);
	while (1) {
		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
						 (void **)gang, 0,
						 ARRAY_SIZE(gang),
						 BTRFS_ROOT_TRANS_TAG);
		if (ret == 0)
			break;
		for (i = 0; i < ret; i++) {
			struct btrfs_root *root = gang[i];

			btrfs_qgroup_free_meta_all_pertrans(root);
			radix_tree_tag_clear(&fs_info->fs_roots_radix,
					(unsigned long)btrfs_root_id(root),
					BTRFS_ROOT_TRANS_TAG);
		}
	}
	spin_unlock(&fs_info->fs_roots_radix_lock);
}

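/*
 * Clean up a single aborted transaction: dirty block groups, delayed refs,
 * marked and pinned extents, and wake any waiters so they can observe the
 * aborted state.
 */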
void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
				   struct btrfs_fs_info *fs_info)
{
	struct btrfs_device *dev, *tmp;

	btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
	ASSERT(list_empty(&cur_trans->dirty_bgs));
	ASSERT(list_empty(&cur_trans->io_bgs));

	list_for_each_entry_safe(dev, tmp, &cur_trans->dev_update_list,
				 post_commit_list) {
		list_del_init(&dev->post_commit_list);
	}

	btrfs_destroy_delayed_refs(cur_trans, fs_info);

	cur_trans->state = TRANS_STATE_COMMIT_START;
	wake_up(&fs_info->transaction_blocked_wait);

	cur_trans->state = TRANS_STATE_UNBLOCKED;
	wake_up(&fs_info->transaction_wait);

	btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
				     EXTENT_DIRTY);
	btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);

	cur_trans->state = TRANS_STATE_COMPLETED;
	wake_up(&cur_trans->commit_wait);
}

4850 4851 4852
{
	struct btrfs_transaction *t;

4853
	mutex_lock(&fs_info->transaction_kthread_mutex);
4854

4855 4856 4857
	spin_lock(&fs_info->trans_lock);
	while (!list_empty(&fs_info->trans_list)) {
		t = list_first_entry(&fs_info->trans_list,
4858
				     struct btrfs_transaction, list);
4859
		if (t->state >= TRANS_STATE_COMMIT_PREP) {
4860
			refcount_inc(&t->use_count);
4861
			spin_unlock(&fs_info->trans_lock);
4862
			btrfs_wait_for_commit(fs_info, t->transid);
4863
			btrfs_put_transaction(t);
4864
			spin_lock(&fs_info->trans_lock);
4865 4866
			continue;
		}
4867
		if (t == fs_info->running_transaction) {
4868
			t->state = TRANS_STATE_COMMIT_DOING;
4869
			spin_unlock(&fs_info->trans_lock);
4870 4871 4872 4873 4874 4875 4876
			/*
			 * We wait for 0 num_writers since we don't hold a trans
			 * handle open currently for this transaction.
			 */
			wait_event(t->writer_wait,
				   atomic_read(&t->num_writers) == 0);
		} else {
4877
			spin_unlock(&fs_info->trans_lock);
4878
		}
4879
		btrfs_cleanup_one_transaction(t, fs_info);
4880

4881 4882 4883
		spin_lock(&fs_info->trans_lock);
		if (t == fs_info->running_transaction)
			fs_info->running_transaction = NULL;
4884
		list_del_init(&t->list);
4885
		spin_unlock(&fs_info->trans_lock);
4886

4887
		btrfs_put_transaction(t);
4888
		trace_btrfs_transaction_commit(fs_info);
4889
		spin_lock(&fs_info->trans_lock);
4890
	}
4891 4892
	spin_unlock(&fs_info->trans_lock);
	btrfs_destroy_all_ordered_extents(fs_info);
4893 4894
	btrfs_destroy_delayed_inodes(fs_info);
	btrfs_assert_delayed_root_empty(fs_info);
4895
	btrfs_destroy_all_delalloc_inodes(fs_info);
4896
	btrfs_drop_all_logs(fs_info);
4897
	btrfs_free_all_qgroup_pertrans(fs_info);
4898
	mutex_unlock(&fs_info->transaction_kthread_mutex);
4899 4900 4901

	return 0;
}
4902

4903
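/*
 * Initialize root->free_objectid from the highest objectid that already
 * exists in the tree, so newly allocated ids are above everything present.
 */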
int btrfs_init_root_free_objectid(struct btrfs_root *root)
{
	struct btrfs_path *path;
	int ret;
	struct extent_buffer *l;
	struct btrfs_key search_key;
	struct btrfs_key found_key;
	int slot;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
	search_key.type = -1;
	search_key.offset = (u64)-1;
	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
	if (ret < 0)
		goto error;
	if (ret == 0) {
		/*
		 * Key with offset -1 found, there would have to exist a root
		 * with such id, but this is out of valid range.
		 */
		ret = -EUCLEAN;
		goto error;
	}
	if (path->slots[0] > 0) {
		slot = path->slots[0] - 1;
		l = path->nodes[0];
		btrfs_item_key_to_cpu(l, &found_key, slot);
		root->free_objectid = max_t(u64, found_key.objectid + 1,
					    BTRFS_FIRST_FREE_OBJECTID);
	} else {
		root->free_objectid = BTRFS_FIRST_FREE_OBJECTID;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

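/*
 * Hand out the next free objectid of a root. Returns -ENOSPC once the id
 * space (up to BTRFS_LAST_FREE_OBJECTID) is exhausted.
 */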
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid)
{
	int ret;
	mutex_lock(&root->objectid_mutex);

	if (unlikely(root->free_objectid >= BTRFS_LAST_FREE_OBJECTID)) {
		btrfs_warn(root->fs_info,
			   "the objectid of root %llu reaches its highest value",
			   btrfs_root_id(root));
		ret = -ENOSPC;
		goto out;
	}

	*objectid = root->free_objectid++;
	ret = 0;
out:
	mutex_unlock(&root->objectid_mutex);
	return ret;
}