// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include <linux/namei.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "raid56.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"
#include "block-group.h"
#include "discard.h"
#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "uuid-tree.h"
#include "ioctl.h"
#include "relocation.h"
#include "scrub.h"
#include "super.h"
#include "raid-stripe-tree.h"

#define BTRFS_BLOCK_GROUP_STRIPE_MASK	(BTRFS_BLOCK_GROUP_RAID0 | \
					 BTRFS_BLOCK_GROUP_RAID10 | \
					 BTRFS_BLOCK_GROUP_RAID56_MASK)

struct btrfs_io_geometry {
	u32 stripe_index;
	u32 stripe_nr;
	int mirror_num;
	int num_stripes;
	u64 stripe_offset;
	u64 raid56_full_stripe_start;
	int max_errors;
	enum btrfs_map_op op;
};

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
	[BTRFS_RAID_RAID10] = {
		.sub_stripes	= 2,
		.dev_stripes	= 1,
		.devs_max	= 0,	/* 0 == as many as possible */
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "raid10",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID10,
		.mindev_error	= BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 2,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 2,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "raid1",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C3] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 3,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 3,
		.ncopies	= 3,
		.nparity        = 0,
		.raid_name	= "raid1c3",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C3,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID1C4] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 4,
		.devs_min	= 4,
		.tolerated_failures = 3,
		.devs_increment	= 4,
		.ncopies	= 4,
		.nparity        = 0,
		.raid_name	= "raid1c4",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID1C4,
		.mindev_error	= BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
	},
	[BTRFS_RAID_DUP] = {
		.sub_stripes	= 1,
		.dev_stripes	= 2,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 2,
		.nparity        = 0,
		.raid_name	= "dup",
		.bg_flag	= BTRFS_BLOCK_GROUP_DUP,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID0] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 0,
		.raid_name	= "raid0",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_SINGLE] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 1,
		.devs_min	= 1,
		.tolerated_failures = 0,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 0,
		.raid_name	= "single",
		.bg_flag	= 0,
		.mindev_error	= 0,
	},
	[BTRFS_RAID_RAID5] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 2,
		.tolerated_failures = 1,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 1,
		.raid_name	= "raid5",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID5,
		.mindev_error	= BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
	},
	[BTRFS_RAID_RAID6] = {
		.sub_stripes	= 1,
		.dev_stripes	= 1,
		.devs_max	= 0,
		.devs_min	= 3,
		.tolerated_failures = 2,
		.devs_increment	= 1,
		.ncopies	= 1,
		.nparity        = 2,
		.raid_name	= "raid6",
		.bg_flag	= BTRFS_BLOCK_GROUP_RAID6,
		.mindev_error	= BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
	},
};

/*
 * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
 * can be used as index to access btrfs_raid_array[].
 */
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
{
	const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);

	if (!profile)
		return BTRFS_RAID_SINGLE;

	return BTRFS_BG_FLAG_TO_INDEX(profile);
}

const char *btrfs_bg_type_to_raid_name(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	if (index >= BTRFS_NR_RAID_TYPES)
		return NULL;

	return btrfs_raid_array[index].raid_name;
}

int btrfs_nr_parity_stripes(u64 type)
{
	enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type);

	return btrfs_raid_array[index].nparity;
}

/*
 * Fill @buf with textual description of @bg_flags, no more than @size_buf
 * bytes including terminating null byte.
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
	int i;
	int ret;
	char *bp = buf;
	u64 flags = bg_flags;
	u32 size_bp = size_buf;

	if (!flags) {
		strcpy(bp, "NONE");
		return;
	}

#define DESCRIBE_FLAG(flag, desc)						\
	do {								\
		if (flags & (flag)) {					\
			ret = snprintf(bp, size_bp, "%s|", (desc));	\
			if (ret < 0 || ret >= size_bp)			\
				goto out_overflow;			\
			size_bp -= ret;					\
			bp += ret;					\
			flags &= ~(flag);				\
		}							\
	} while (0)

	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
	DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

	DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
		DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
			      btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

	if (flags) {
		ret = snprintf(bp, size_bp, "0x%llx|", flags);
		size_bp -= ret;
	}

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last | */

	/*
	 * The text is trimmed, it's up to the caller to provide sufficiently
	 * large buffer
	 */
out_overflow:;
}
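
/*
 * Example (illustrative only): a bg_flags value of BTRFS_BLOCK_GROUP_DATA |
 * BTRFS_BLOCK_GROUP_RAID1 is rendered by the helper above as "data|raid1";
 * the trailing '|' left by the last DESCRIBE_FLAG expansion is overwritten
 * with the terminating NUL.
 */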

static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list in general
 * but in mount context it could be used to exclude list modifications by eg.
 * scan ioctl
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * Is not required at mount and close times, because our device list is
 * protected by the uuid_mutex at that point.
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   device_list_mutex
 *     chunk_mutex
 *   balance_mutex
 *
 *
 * Exclusive operations
 * ====================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * The status of exclusive operation is set and cleared atomically.
 * During the course of Paused state, fs_info::exclusive_operation remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * The exclusive status is cleared when the device operation is canceled or
 * completed.
 */

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
{
	return &fs_uuids;
}

/*
 * Allocate new btrfs_fs_devices structure identified by a fsid.
 *
 * @fsid:    if not NULL, copy the UUID to fs_devices::fsid and to
 *           fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
	struct btrfs_fs_devices *fs_devs;

	fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
	if (!fs_devs)
		return ERR_PTR(-ENOMEM);

	mutex_init(&fs_devs->device_list_mutex);

	INIT_LIST_HEAD(&fs_devs->devices);
	INIT_LIST_HEAD(&fs_devs->alloc_list);
	INIT_LIST_HEAD(&fs_devs->fs_list);
	INIT_LIST_HEAD(&fs_devs->seed_list);

	if (fsid) {
		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
		memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
	}

	return fs_devs;
}

static void btrfs_free_device(struct btrfs_device *device)
{
	WARN_ON(!list_empty(&device->post_commit_list));
	rcu_string_free(device->name);
	extent_io_tree_release(&device->alloc_state);
	btrfs_destroy_dev_zone_info(device);
	kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device;

	WARN_ON(fs_devices->opened);
	while (!list_empty(&fs_devices->devices)) {
		device = list_entry(fs_devices->devices.next,
				    struct btrfs_device, dev_list);
		list_del(&device->dev_list);
		btrfs_free_device(device);
	}
	kfree(fs_devices);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
	struct btrfs_fs_devices *fs_devices;

	while (!list_empty(&fs_uuids)) {
		fs_devices = list_entry(fs_uuids.next,
					struct btrfs_fs_devices, fs_list);
		list_del(&fs_devices->fs_list);
		free_fs_devices(fs_devices);
	}
}

static bool match_fsid_fs_devices(const struct btrfs_fs_devices *fs_devices,
				  const u8 *fsid, const u8 *metadata_fsid)
{
	if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) != 0)
		return false;

	if (!metadata_fsid)
		return true;

	if (memcmp(metadata_fsid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0)
		return false;

	return true;
}

static noinline struct btrfs_fs_devices *find_fsid(
		const u8 *fsid, const u8 *metadata_fsid)
{
	struct btrfs_fs_devices *fs_devices;

	ASSERT(fsid);

	/* Handle non-split brain cases */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		if (match_fsid_fs_devices(fs_devices, fsid, metadata_fsid))
			return fs_devices;
	}
	return NULL;
}

static int
btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder,
		      int flush, struct file **bdev_file,
		      struct btrfs_super_block **disk_super)
{
	struct block_device *bdev;
	int ret;

	*bdev_file = bdev_file_open_by_path(device_path, flags, holder, NULL);

	if (IS_ERR(*bdev_file)) {
		ret = PTR_ERR(*bdev_file);
		btrfs_err(NULL, "failed to open device for path %s with flags 0x%x: %d",
			  device_path, flags, ret);
		goto error;
	}
	bdev = file_bdev(*bdev_file);

	if (flush)
		sync_blockdev(bdev);
	if (holder) {
		ret = set_blocksize(*bdev_file, BTRFS_BDEV_BLOCKSIZE);
		if (ret) {
			fput(*bdev_file);
			goto error;
		}
	}
	invalidate_bdev(bdev);
	*disk_super = btrfs_read_dev_super(bdev);
	if (IS_ERR(*disk_super)) {
		ret = PTR_ERR(*disk_super);
		fput(*bdev_file);
		goto error;
	}

	return 0;

error:
	*disk_super = NULL;
	*bdev_file = NULL;
	return ret;
}

/*
 *  Search and remove all stale devices (which are not mounted).  When both
 *  inputs are NULL, it will search and release all stale devices.
 *
 *  @devt:         Optional. When provided, it will release all unmounted devices
 *                 matching this devt only.
 *  @skip_device:  Optional. Will skip this device when searching for the stale
 *                 devices.
 *
 *  Return:	0 for success or if @devt is 0.
 *		-EBUSY if @devt is a mounted device.
 *		-ENOENT if @devt does not match any device in the list.
 */
static int btrfs_free_stale_devices(dev_t devt, struct btrfs_device *skip_device)
{
	struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
	struct btrfs_device *device, *tmp_device;
	int ret;
	bool freed = false;

	lockdep_assert_held(&uuid_mutex);

	/* Return good status if there is no instance of devt. */
	ret = 0;
	list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {

		mutex_lock(&fs_devices->device_list_mutex);
		list_for_each_entry_safe(device, tmp_device,
					 &fs_devices->devices, dev_list) {
			if (skip_device && skip_device == device)
				continue;
			if (devt && devt != device->devt)
				continue;
			if (fs_devices->opened) {
				if (devt)
					ret = -EBUSY;
				break;
			}

			/* delete the stale device */
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);

			freed = true;
		}
		mutex_unlock(&fs_devices->device_list_mutex);

		if (fs_devices->num_devices == 0) {
			btrfs_sysfs_remove_fsid(fs_devices);
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}

	/* If there is at least one freed device return 0. */
	if (freed)
		return 0;

	return ret;
}

static struct btrfs_fs_devices *find_fsid_by_device(
					struct btrfs_super_block *disk_super,
					dev_t devt, bool *same_fsid_diff_dev)
{
	struct btrfs_fs_devices *fsid_fs_devices;
	struct btrfs_fs_devices *devt_fs_devices;
	const bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
					BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
	bool found_by_devt = false;

	/* Find the fs_device by the usual method, if found use it. */
	fsid_fs_devices = find_fsid(disk_super->fsid,
		    has_metadata_uuid ? disk_super->metadata_uuid : NULL);

	/* The temp_fsid feature is supported only with single device filesystem. */
	if (btrfs_super_num_devices(disk_super) != 1)
		return fsid_fs_devices;

	/*
	 * A seed device is an integral component of the sprout device, which
	 * functions as a multi-device filesystem. So, temp-fsid feature is
	 * not supported.
	 */
	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING)
		return fsid_fs_devices;

	/* Try to find a fs_devices by matching devt. */
	list_for_each_entry(devt_fs_devices, &fs_uuids, fs_list) {
		struct btrfs_device *device;

		list_for_each_entry(device, &devt_fs_devices->devices, dev_list) {
			if (device->devt == devt) {
				found_by_devt = true;
				break;
			}
		}
		if (found_by_devt)
			break;
	}

	if (found_by_devt) {
		/* Existing device. */
		if (fsid_fs_devices == NULL) {
			if (devt_fs_devices->opened == 0) {
				/* Stale device. */
				return NULL;
			} else {
				/* temp_fsid is mounting a subvol. */
				return devt_fs_devices;
			}
		} else {
			/* Regular or temp_fsid device mounting a subvol. */
			return devt_fs_devices;
		}
	} else {
		/* New device. */
		if (fsid_fs_devices == NULL) {
			return NULL;
		} else {
			/* sb::fsid is already used create a new temp_fsid. */
			*same_fsid_diff_dev = true;
			return NULL;
		}
	}

	/* Not reached. */
}
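
/*
 * Informal summary of the branches above: a devt that is already tracked
 * reuses its existing fs_devices (or is treated as stale when that
 * fs_devices is not opened and no fsid match exists); a new devt whose fsid
 * matches an already known filesystem is reported through *same_fsid_diff_dev
 * so that the caller can generate a temp-fsid.
 */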

/*
 * This is only used on mount, and we are protected from competing things
 * messing with our fs_devices by the uuid_mutex, thus we do not need the
 * fs_devices->device_list_mutex here.
 */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
			struct btrfs_device *device, blk_mode_t flags,
			void *holder)
{
	struct file *bdev_file;
	struct btrfs_super_block *disk_super;
	u64 devid;
	int ret;

	if (device->bdev)
		return -EINVAL;
	if (!device->name)
		return -EINVAL;

	ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
				    &bdev_file, &disk_super);
	if (ret)
		return ret;

	devid = btrfs_stack_device_id(&disk_super->dev_item);
	if (devid != device->devid)
		goto error_free_page;

	if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
		goto error_free_page;

	device->generation = btrfs_super_generation(disk_super);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
		if (btrfs_super_incompat_flags(disk_super) &
		    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
			pr_err(
		"BTRFS: Invalid seeding and uuid-changed device detected\n");
			goto error_free_page;
		}

		clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		fs_devices->seeding = true;
	} else {
		if (bdev_read_only(file_bdev(bdev_file)))
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
		else
			set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	}

	if (!bdev_nonrot(file_bdev(bdev_file)))
		fs_devices->rotating = true;

	if (bdev_max_discard_sectors(file_bdev(bdev_file)))
		fs_devices->discardable = true;

	device->bdev_file = bdev_file;
	device->bdev = file_bdev(bdev_file);
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);

	if (device->devt != device->bdev->bd_dev) {
		btrfs_warn(NULL,
			   "device %s maj:min changed from %d:%d to %d:%d",
			   device->name->str, MAJOR(device->devt),
			   MINOR(device->devt), MAJOR(device->bdev->bd_dev),
			   MINOR(device->bdev->bd_dev));

		device->devt = device->bdev->bd_dev;
	}

	fs_devices->open_devices++;
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		fs_devices->rw_devices++;
		list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
	}
	btrfs_release_disk_super(disk_super);

	return 0;

error_free_page:
	btrfs_release_disk_super(disk_super);
	fput(bdev_file);

	return -EINVAL;
}

const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb)
{
	bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) &
				  BTRFS_FEATURE_INCOMPAT_METADATA_UUID);

	return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
			   struct btrfs_super_block *disk_super,
			   bool *new_device_added)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *fs_devices = NULL;
	struct rcu_string *name;
	u64 found_transid = btrfs_super_generation(disk_super);
	u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
	dev_t path_devt;
	int error;
	bool same_fsid_diff_dev = false;
	bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
		BTRFS_FEATURE_INCOMPAT_METADATA_UUID);

	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_CHANGING_FSID_V2) {
		btrfs_err(NULL,
"device %s has incomplete metadata_uuid change, please use btrfstune to complete",
			  path);
		return ERR_PTR(-EAGAIN);
	}

	error = lookup_bdev(path, &path_devt);
	if (error) {
		btrfs_err(NULL, "failed to lookup block device for path %s: %d",
			  path, error);
		return ERR_PTR(error);
	}

	fs_devices = find_fsid_by_device(disk_super, path_devt, &same_fsid_diff_dev);

	if (!fs_devices) {
		fs_devices = alloc_fs_devices(disk_super->fsid);
		if (IS_ERR(fs_devices))
			return ERR_CAST(fs_devices);

		if (has_metadata_uuid)
			memcpy(fs_devices->metadata_uuid,
			       disk_super->metadata_uuid, BTRFS_FSID_SIZE);

		if (same_fsid_diff_dev) {
			generate_random_uuid(fs_devices->fsid);
			fs_devices->temp_fsid = true;
		pr_info("BTRFS: device %s (%d:%d) using temp-fsid %pU\n",
				path, MAJOR(path_devt), MINOR(path_devt),
				fs_devices->fsid);
		}

		mutex_lock(&fs_devices->device_list_mutex);
		list_add(&fs_devices->fs_list, &fs_uuids);

		device = NULL;
	} else {
		struct btrfs_dev_lookup_args args = {
			.devid = devid,
			.uuid = disk_super->dev_item.uuid,
		};

		mutex_lock(&fs_devices->device_list_mutex);
		device = btrfs_find_device(fs_devices, &args);

		if (found_transid > fs_devices->latest_generation) {
			memcpy(fs_devices->fsid, disk_super->fsid,
					BTRFS_FSID_SIZE);
			memcpy(fs_devices->metadata_uuid,
			       btrfs_sb_fsid_ptr(disk_super), BTRFS_FSID_SIZE);
		}
	}

	if (!device) {
		unsigned int nofs_flag;

		if (fs_devices->opened) {
			btrfs_err(NULL,
"device %s (%d:%d) belongs to fsid %pU, and the fs is already mounted, scanned by %s (%d)",
				  path, MAJOR(path_devt), MINOR(path_devt),
				  fs_devices->fsid, current->comm,
				  task_pid_nr(current));
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-EBUSY);
		}

		nofs_flag = memalloc_nofs_save();
		device = btrfs_alloc_device(NULL, &devid,
					    disk_super->dev_item.uuid, path);
		memalloc_nofs_restore(nofs_flag);
		if (IS_ERR(device)) {
			mutex_unlock(&fs_devices->device_list_mutex);
			/* we can safely leave the fs_devices entry around */
			return device;
		}

		device->devt = path_devt;

		list_add_rcu(&device->dev_list, &fs_devices->devices);
		fs_devices->num_devices++;

		device->fs_devices = fs_devices;
		*new_device_added = true;

		if (disk_super->label[0])
			pr_info(
"BTRFS: device label %s devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n",
				disk_super->label, devid, found_transid, path,
				MAJOR(path_devt), MINOR(path_devt),
				current->comm, task_pid_nr(current));
		else
			pr_info(
"BTRFS: device fsid %pU devid %llu transid %llu %s (%d:%d) scanned by %s (%d)\n",
				disk_super->fsid, devid, found_transid, path,
				MAJOR(path_devt), MINOR(path_devt),
				current->comm, task_pid_nr(current));

	} else if (!device->name || strcmp(device->name->str, path)) {
		/*
		 * When FS is already mounted.
		 * 1. If you are here and if the device->name is NULL that
		 *    means this device was missing at time of FS mount.
		 * 2. If you are here and if the device->name is different
		 *    from 'path' that means either
		 *      a. The same device disappeared and reappeared with
		 *         different name. or
		 *      b. The missing-disk-which-was-replaced, has
		 *         reappeared now.
		 *
		 * We must allow 1 and 2a above. But 2b would be spurious
		 * and unintentional.
		 *
		 * Further in case of 1 and 2a above, the disk at 'path'
		 * would have missed some transaction when it was away and
		 * in case of 2a the stale bdev has to be updated as well.
		 * 2b must not be allowed at all time.
		 */

		/*
877 878 879 880
		 * For now, we do allow update to btrfs_fs_device through the
		 * btrfs dev scan cli after FS has been mounted.  We're still
		 * tracking a problem where systems fail mount by subvolume id
		 * when we reject replacement on a mounted FS.
		 */
		if (!fs_devices->opened && found_transid < device->generation) {
			/*
			 * That is if the FS is _not_ mounted and if you
			 * are here, that means there is more than one
			 * disk with the same uuid and devid. We keep the one
			 * with the larger generation number or the last-in if
			 * the generations are equal.
			 */
			mutex_unlock(&fs_devices->device_list_mutex);
			btrfs_err(NULL,
"device %s already registered with a higher generation, found %llu expect %llu",
				  path, found_transid, device->generation);
			return ERR_PTR(-EEXIST);
		}

		/*
		 * We are going to replace the device path for a given devid,
		 * make sure it's the same device if the device is mounted
		 *
		 * NOTE: the device->fs_info may not be reliable here so pass
		 * in a NULL to message helpers instead. This avoids a possible
		 * use-after-free when the fs_info and fs_info->sb are already
		 * torn down.
		 */
		if (device->bdev) {
			if (device->devt != path_devt) {
				mutex_unlock(&fs_devices->device_list_mutex);
				btrfs_warn_in_rcu(NULL,
	"duplicate device %s devid %llu generation %llu scanned by %s (%d)",
						  path, devid, found_transid,
						  current->comm,
						  task_pid_nr(current));
				return ERR_PTR(-EEXIST);
			}
			btrfs_info_in_rcu(NULL,
	"devid %llu device path %s changed to %s scanned by %s (%d)",
					  devid, btrfs_dev_name(device),
					  path, current->comm,
					  task_pid_nr(current));
		}

		name = rcu_string_strdup(path, GFP_NOFS);
		if (!name) {
			mutex_unlock(&fs_devices->device_list_mutex);
			return ERR_PTR(-ENOMEM);
		}
		rcu_string_free(device->name);
		rcu_assign_pointer(device->name, name);
		if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			fs_devices->missing_devices--;
			clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
		device->devt = path_devt;
	}

	/*
	 * Unmount does not free the btrfs_device struct but would zero
	 * generation along with most of the other members. So just update
	 * it back. We need it to pick the disk with largest generation
	 * (as above).
	 */
	if (!fs_devices->opened) {
		device->generation = found_transid;
		fs_devices->latest_generation = max_t(u64, found_transid,
						fs_devices->latest_generation);
	}

	fs_devices->total_devices = btrfs_super_num_devices(disk_super);

	mutex_unlock(&fs_devices->device_list_mutex);
	return device;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
	struct btrfs_fs_devices *fs_devices;
	struct btrfs_device *device;
	struct btrfs_device *orig_dev;
	int ret = 0;

	lockdep_assert_held(&uuid_mutex);

	fs_devices = alloc_fs_devices(orig->fsid);
	if (IS_ERR(fs_devices))
		return fs_devices;

	fs_devices->total_devices = orig->total_devices;

	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
		const char *dev_path = NULL;

		/*
		 * This is ok to do without RCU read locked because we hold the
		 * uuid mutex so nothing we touch in here is going to disappear.
		 */
		if (orig_dev->name)
			dev_path = orig_dev->name->str;

		device = btrfs_alloc_device(NULL, &orig_dev->devid,
					    orig_dev->uuid, dev_path);
		if (IS_ERR(device)) {
			ret = PTR_ERR(device);
			goto error;
		}

		if (orig_dev->zone_info) {
			struct btrfs_zoned_device_info *zone_info;

			zone_info = btrfs_clone_dev_zone_info(orig_dev);
			if (!zone_info) {
				btrfs_free_device(device);
				ret = -ENOMEM;
				goto error;
			}
			device->zone_info = zone_info;
		}

		list_add(&device->dev_list, &fs_devices->devices);
		device->fs_devices = fs_devices;
		fs_devices->num_devices++;
	}
	return fs_devices;
error:
	free_fs_devices(fs_devices);
	return ERR_PTR(ret);
}

static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
				      struct btrfs_device **latest_dev)
{
	struct btrfs_device *device, *next;

	/* This is the initialized path, it is safe to release the devices. */
	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
			if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
				      &device->dev_state) &&
			    !test_bit(BTRFS_DEV_STATE_MISSING,
				      &device->dev_state) &&
			    (!*latest_dev ||
			     device->generation > (*latest_dev)->generation)) {
				*latest_dev = device;
			}
			continue;
		}

		/*
		 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
		 * in btrfs_init_dev_replace() so just continue.
		 */
		if (device->devid == BTRFS_DEV_REPLACE_DEVID)
			continue;

		if (device->bdev_file) {
			fput(device->bdev_file);
			device->bdev = NULL;
			device->bdev_file = NULL;
			fs_devices->open_devices--;
		}
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			list_del_init(&device->dev_alloc_list);
			clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
			fs_devices->rw_devices--;
		}
		list_del_init(&device->dev_list);
		fs_devices->num_devices--;
		btrfs_free_device(device);
	}

}

/*
 * After we have read the system tree and know devids belonging to this
 * filesystem, remove the device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_fs_devices *seed_dev;

	mutex_lock(&uuid_mutex);
	__btrfs_free_extra_devids(fs_devices, &latest_dev);

	list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
		__btrfs_free_extra_devids(seed_dev, &latest_dev);

	fs_devices->latest_dev = latest_dev;

	mutex_unlock(&uuid_mutex);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
	if (!device->bdev)
		return;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
		sync_blockdev(device->bdev);
		invalidate_bdev(device->bdev);
	}

	fput(device->bdev_file);
}

static void btrfs_close_one_device(struct btrfs_device *device)
{
	struct btrfs_fs_devices *fs_devices = device->fs_devices;

	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
	    device->devid != BTRFS_DEV_REPLACE_DEVID) {
		list_del_init(&device->dev_alloc_list);
		fs_devices->rw_devices--;
	}

	if (device->devid == BTRFS_DEV_REPLACE_DEVID)
		clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);

	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
		clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		fs_devices->missing_devices--;
	}

	btrfs_close_bdev(device);
	if (device->bdev) {
		fs_devices->open_devices--;
		device->bdev = NULL;
	}
	clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
	btrfs_destroy_dev_zone_info(device);

	device->fs_info = NULL;
	atomic_set(&device->dev_stats_ccnt, 0);
	extent_io_tree_release(&device->alloc_state);

	/*
	 * Reset the flush error record. We might have a transient flush error
	 * in this mount, and if so we aborted the current transaction and set
	 * the fs to an error state, guaranteeing no super blocks can be further
	 * committed. However that error might be transient and if we unmount the
	 * filesystem and mount it again, we should allow the mount to succeed
	 * (btrfs_check_rw_degradable() should not fail) - if after mounting the
	 * filesystem again we still get flush errors, then we will again abort
	 * any transaction and set the error state, guaranteeing no commits of
	 * unsafe super blocks.
	 */
	device->last_flush_error = 0;

	/* Verify the device is back in a pristine state  */
	WARN_ON(test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
	WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
	WARN_ON(!list_empty(&device->dev_alloc_list));
	WARN_ON(!list_empty(&device->post_commit_list));
}

static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
	struct btrfs_device *device, *tmp;

	lockdep_assert_held(&uuid_mutex);

	if (--fs_devices->opened > 0)
		return;

	list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
		btrfs_close_one_device(device);

	WARN_ON(fs_devices->open_devices);
	WARN_ON(fs_devices->rw_devices);
	fs_devices->opened = 0;
	fs_devices->seeding = false;
	fs_devices->fs_info = NULL;
}

void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
	LIST_HEAD(list);
	struct btrfs_fs_devices *tmp;

	mutex_lock(&uuid_mutex);
	close_fs_devices(fs_devices);
	if (!fs_devices->opened) {
		list_splice_init(&fs_devices->seed_list, &list);

		/*
		 * If the struct btrfs_fs_devices is not assembled with any
		 * other device, it can be re-initialized during the next mount
		 * without needing the device-scan step. Therefore, it can be
		 * fully freed.
		 */
		if (fs_devices->num_devices == 1) {
			list_del(&fs_devices->fs_list);
			free_fs_devices(fs_devices);
		}
	}


	list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
		close_fs_devices(fs_devices);
		list_del(&fs_devices->seed_list);
		free_fs_devices(fs_devices);
	}
	mutex_unlock(&uuid_mutex);
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
				blk_mode_t flags, void *holder)
{
	struct btrfs_device *device;
	struct btrfs_device *latest_dev = NULL;
	struct btrfs_device *tmp_device;
	int ret = 0;

	list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
				 dev_list) {
		int ret2;

		ret2 = btrfs_open_one_device(fs_devices, device, flags, holder);
		if (ret2 == 0 &&
		    (!latest_dev || device->generation > latest_dev->generation)) {
			latest_dev = device;
		} else if (ret2 == -ENODATA) {
			fs_devices->num_devices--;
			list_del(&device->dev_list);
			btrfs_free_device(device);
		}
		if (ret == 0 && ret2 != 0)
			ret = ret2;
	}

	if (fs_devices->open_devices == 0) {
		if (ret)
			return ret;
		return -EINVAL;
	}

	fs_devices->opened = 1;
	fs_devices->latest_dev = latest_dev;
	fs_devices->total_rw_bytes = 0;
	fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
	fs_devices->read_policy = BTRFS_READ_POLICY_PID;

	return 0;
}

static int devid_cmp(void *priv, const struct list_head *a,
		     const struct list_head *b)
{
	const struct btrfs_device *dev1, *dev2;

	dev1 = list_entry(a, struct btrfs_device, dev_list);
	dev2 = list_entry(b, struct btrfs_device, dev_list);

	if (dev1->devid < dev2->devid)
		return -1;
	else if (dev1->devid > dev2->devid)
		return 1;
	return 0;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
		       blk_mode_t flags, void *holder)
{
	int ret;

	lockdep_assert_held(&uuid_mutex);
	/*
	 * The device_list_mutex cannot be taken here in case opening the
	 * underlying device takes further locks like open_mutex.
	 *
	 * We also don't need the lock here as this is called during mount and
	 * exclusion is provided by uuid_mutex
	 */

	if (fs_devices->opened) {
		fs_devices->opened++;
		ret = 0;
	} else {
		list_sort(NULL, &fs_devices->devices, devid_cmp);
		ret = open_fs_devices(fs_devices, flags, holder);
	}

	return ret;
}

void btrfs_release_disk_super(struct btrfs_super_block *super)
{
	struct page *page = virt_to_page(super);

	put_page(page);
}

static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
						       u64 bytenr, u64 bytenr_orig)
{
	struct btrfs_super_block *disk_super;
	struct page *page;
	void *p;
	pgoff_t index;

	/* make sure our super fits in the device */
	if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev))
		return ERR_PTR(-EINVAL);

	/* make sure our super fits in the page */
	if (sizeof(*disk_super) > PAGE_SIZE)
		return ERR_PTR(-EINVAL);

	/* make sure our super doesn't straddle pages on disk */
	index = bytenr >> PAGE_SHIFT;
	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
		return ERR_PTR(-EINVAL);

	/* pull in the page with our super */
	page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL);

	if (IS_ERR(page))
		return ERR_CAST(page);

	p = page_address(page);

	/* align our pointer to the offset of the super block */
	disk_super = p + offset_in_page(bytenr);

	if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
	    btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
		btrfs_release_disk_super(p);
		return ERR_PTR(-EINVAL);
	}

	if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
		disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;

	return disk_super;
}

int btrfs_forget_devices(dev_t devt)
{
	int ret;

	mutex_lock(&uuid_mutex);
	ret = btrfs_free_stale_devices(devt, NULL);
	mutex_unlock(&uuid_mutex);

	return ret;
}

static bool btrfs_skip_registration(struct btrfs_super_block *disk_super,
				    const char *path, dev_t devt,
				    bool mount_arg_dev)
{
	struct btrfs_fs_devices *fs_devices;

	/*
	 * Do not skip device registration for mounted devices with matching
	 * maj:min but different paths. Booting without initrd relies on
	 * /dev/root initially, later replaced with the actual root device.
	 * A successful scan ensures grub2-probe selects the correct device.
	 */
	list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
		struct btrfs_device *device;

		mutex_lock(&fs_devices->device_list_mutex);

		if (!fs_devices->opened) {
			mutex_unlock(&fs_devices->device_list_mutex);
			continue;
		}

		list_for_each_entry(device, &fs_devices->devices, dev_list) {
			if (device->bdev && (device->bdev->bd_dev == devt) &&
			    strcmp(device->name->str, path) != 0) {
				mutex_unlock(&fs_devices->device_list_mutex);

				/* Do not skip registration. */
				return false;
			}
		}
		mutex_unlock(&fs_devices->device_list_mutex);
	}

	if (!mount_arg_dev && btrfs_super_num_devices(disk_super) == 1 &&
	    !(btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING))
		return true;

	return false;
}

/*
 * Look for a btrfs signature on a device. This may be called out of the mount path
 * and we are not allowed to call set_blocksize during the scan. The superblock
 * is read via pagecache.
 *
 * With @mount_arg_dev it's a scan during mount time that will always register
 * the device or return an error. Multi-device and seeding devices are registered
 * in both cases.
 */
struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
					   bool mount_arg_dev)
{
	struct btrfs_super_block *disk_super;
	bool new_device_added = false;
	struct btrfs_device *device = NULL;
	struct file *bdev_file;
	u64 bytenr;
	dev_t devt;
	int ret;

	lockdep_assert_held(&uuid_mutex);

	/*
	 * Avoid an exclusive open here, as the systemd-udev may initiate the
	 * device scan which may race with the user's mount or mkfs command,
	 * resulting in failure.
	 * Since the device scan is solely for reading purposes, there is no
	 * need for an exclusive open. Additionally, the devices are read again
	 * during the mount process. It is ok to get some inconsistent
	 * values temporarily, as the device paths of the fsid are the only
	 * required information for assembling the volume.
	 */
	bdev_file = bdev_file_open_by_path(path, flags, NULL, NULL);
	if (IS_ERR(bdev_file))
		return ERR_CAST(bdev_file);

	/*
	 * We would like to check all the super blocks, but doing so would
	 * allow a mount to succeed after a mkfs from a different filesystem.
	 * Currently, recovery from a bad primary btrfs superblock is done
	 * using the userspace command 'btrfs check --super'.
	 */
	ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr);
	if (ret) {
		device = ERR_PTR(ret);
		goto error_bdev_put;
	}

	disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr,
					   btrfs_sb_offset(0));
	if (IS_ERR(disk_super)) {
		device = ERR_CAST(disk_super);
		goto error_bdev_put;
	}

	devt = file_bdev(bdev_file)->bd_dev;
	if (btrfs_skip_registration(disk_super, path, devt, mount_arg_dev)) {
		pr_debug("BTRFS: skip registering single non-seed device %s (%d:%d)\n",
			  path, MAJOR(devt), MINOR(devt));

		btrfs_free_stale_devices(devt, NULL);

		device = NULL;
		goto free_disk_super;
	}

	device = device_list_add(path, disk_super, &new_device_added);
	if (!IS_ERR(device) && new_device_added)
		btrfs_free_stale_devices(device->devt, device);

free_disk_super:
	btrfs_release_disk_super(disk_super);

error_bdev_put:
	fput(bdev_file);

	return device;
}

/*
 * Try to find a chunk that intersects [start, start + len] range and when one
 * such is found, record the end of it in *start
 */
static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
				    u64 len)
{
	u64 physical_start, physical_end;

	lockdep_assert_held(&device->fs_info->chunk_mutex);

	if (find_first_extent_bit(&device->alloc_state, *start,
				  &physical_start, &physical_end,
				  CHUNK_ALLOCATED, NULL)) {

		if (in_range(physical_start, *start, len) ||
		    in_range(*start, physical_start,
			     physical_end + 1 - physical_start)) {
			*start = physical_end + 1;
			return true;
		}
	}
	return false;
}

static u64 dev_extent_search_start(struct btrfs_device *device)
{
	switch (device->fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		return BTRFS_DEVICE_RANGE_RESERVED;
	case BTRFS_CHUNK_ALLOC_ZONED:
		/*
		 * We don't care about the starting region like regular
		 * allocator, because we anyway use/reserve the first two zones
		 * for superblock logging.
		 */
		return 0;
	default:
		BUG();
	}
}

static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
					u64 *hole_start, u64 *hole_size,
					u64 num_bytes)
{
	u64 zone_size = device->zone_info->zone_size;
	u64 pos;
	int ret;
	bool changed = false;

	ASSERT(IS_ALIGNED(*hole_start, zone_size));

	while (*hole_size > 0) {
		pos = btrfs_find_allocatable_zones(device, *hole_start,
						   *hole_start + *hole_size,
						   num_bytes);
		if (pos != *hole_start) {
			*hole_size = *hole_start + *hole_size - pos;
			*hole_start = pos;
			changed = true;
			if (*hole_size < num_bytes)
				break;
		}

		ret = btrfs_ensure_empty_zones(device, pos, num_bytes);

		/* Range is ensured to be empty */
		if (!ret)
			return changed;

		/* Given hole range was invalid (outside of device) */
		if (ret == -ERANGE) {
			*hole_start += *hole_size;
			*hole_size = 0;
			return true;
		}

		*hole_start += zone_size;
		*hole_size -= zone_size;
		changed = true;
	}

	return changed;
}

/*
 * Check if specified hole is suitable for allocation.
 *
 * @device:	the device which we have the hole
 * @hole_start: starting position of the hole
 * @hole_size:	the size of the hole
 * @num_bytes:	the size of the free space that we need
 *
 * This function may modify @hole_start and @hole_size to reflect the suitable
 * position for allocation. Returns true if the hole position was updated,
 * false otherwise.
 */
static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
				  u64 *hole_size, u64 num_bytes)
{
	bool changed = false;
	u64 hole_end = *hole_start + *hole_size;

	for (;;) {
		/*
		 * Check before we set max_hole_start, otherwise we could end up
		 * sending back this offset anyway.
		 */
		if (contains_pending_extent(device, hole_start, *hole_size)) {
			if (hole_end >= *hole_start)
				*hole_size = hole_end - *hole_start;
			else
				*hole_size = 0;
			changed = true;
		}

		switch (device->fs_devices->chunk_alloc_policy) {
		case BTRFS_CHUNK_ALLOC_REGULAR:
			/* No extra check */
			break;
		case BTRFS_CHUNK_ALLOC_ZONED:
			if (dev_extent_hole_check_zoned(device, hole_start,
							hole_size, num_bytes)) {
				changed = true;
				/*
				 * The changed hole can contain pending extent.
				 * Loop again to check that.
				 */
				continue;
			}
			break;
		default:
			BUG();
		}

		break;
	}

	return changed;
}

/*
 * Find free space in the specified device.
 *
 * @device:	  the device which we search the free space in
 * @num_bytes:	  the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:	  store the start of the free space.
 * @len:	  the size of the free space. that we find, or the size
 *		  of the max free space if we don't find suitable free space
 *
 * This does a pretty simple search, the expectation is that it is called very
 * infrequently and that a given device has a small number of extents.
 *
 * @start is used to store the start of the free space if we find it. But if we
 * don't find suitable free space, it will be used to store the start position
 * of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 *
 * NOTE: This function will search *commit* root of device tree, and does extra
 * check to ensure dev extents are not double allocated.
 * This makes the function safe to allocate dev extents but may not report
 * correct usable device space, as device extent freed in current transaction
 * is not reported as available.
 */
static int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
				u64 *start, u64 *len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
	struct btrfs_dev_extent *dev_extent;
	struct btrfs_path *path;
	u64 search_start;
	u64 hole_size;
	u64 max_hole_start;
	u64 max_hole_size = 0;
	u64 extent_end;
	u64 search_end = device->total_bytes;
	int ret;
	int slot;
	struct extent_buffer *l;

	search_start = dev_extent_search_start(device);
	max_hole_start = search_start;

	WARN_ON(device->zone_info &&
		!IS_ALIGNED(num_bytes, device->zone_info->zone_size));

	path = btrfs_alloc_path();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}
again:
	if (search_start >= search_end ||
		test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		ret = -ENOSPC;
		goto out;
	}

	path->reada = READA_FORWARD;
	path->search_commit_root = 1;
	path->skip_locking = 1;

	key.objectid = device->devid;
	key.offset = search_start;
	key.type = BTRFS_DEV_EXTENT_KEY;

	ret = btrfs_search_backwards(root, &key, path);
	if (ret < 0)
		goto out;

	while (search_start < search_end) {
		l = path->nodes[0];
		slot = path->slots[0];
		if (slot >= btrfs_header_nritems(l)) {
			ret = btrfs_next_leaf(root, path);
			if (ret == 0)
				continue;
			if (ret < 0)
				goto out;

			break;
		}
		btrfs_item_key_to_cpu(l, &key, slot);

		if (key.objectid < device->devid)
			goto next;

		if (key.objectid > device->devid)
			break;

		if (key.type != BTRFS_DEV_EXTENT_KEY)
			goto next;

		if (key.offset > search_end)
			break;

		if (key.offset > search_start) {
			hole_size = key.offset - search_start;
			dev_extent_hole_check(device, &search_start, &hole_size,
					      num_bytes);

			if (hole_size > max_hole_size) {
				max_hole_start = search_start;
				max_hole_size = hole_size;
			}

			/*
			 * If this free space is greater than what we need,
			 * it must be the max free space that we have found
			 * until now, so max_hole_start must point to the start
			 * of this free space and the length of this free space
			 * is stored in max_hole_size. Thus, we return
			 * max_hole_start and max_hole_size and go back to the
			 * caller.
			 */
			if (hole_size >= num_bytes) {
				ret = 0;
				goto out;
			}
		}

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		extent_end = key.offset + btrfs_dev_extent_length(l,
								  dev_extent);
		if (extent_end > search_start)
			search_start = extent_end;
next:
		path->slots[0]++;
		cond_resched();
	}

	/*
	 * At this point, search_start should be the end of
	 * allocated dev extents, and when shrinking the device,
	 * search_end may be smaller than search_start.
	 */
	if (search_end > search_start) {
		hole_size = search_end - search_start;
		if (dev_extent_hole_check(device, &search_start, &hole_size,
					  num_bytes)) {
1734 1735 1736
			btrfs_release_path(path);
			goto again;
		}
1737

1738 1739 1740 1741
		if (hole_size > max_hole_size) {
			max_hole_start = search_start;
			max_hole_size = hole_size;
		}
1742 1743
	}

1744
	/* See above. */
1745
	if (max_hole_size < num_bytes)
1746 1747 1748 1749
		ret = -ENOSPC;
	else
		ret = 0;

1750
	ASSERT(max_hole_start + max_hole_size <= search_end);
1751
out:
Yan Zheng's avatar
Yan Zheng committed
1752
	btrfs_free_path(path);
1753
	*start = max_hole_start;
1754
	if (len)
1755
		*len = max_hole_size;
1756 1757 1758
	return ret;
}

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
			  struct btrfs_device *device,
			  u64 start, u64 *dev_extent_len)
{
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct extent_buffer *leaf = NULL;
	struct btrfs_dev_extent *extent = NULL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = device->devid;
	key.offset = start;
	key.type = BTRFS_DEV_EXTENT_KEY;
again:
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0) {
		ret = btrfs_previous_item(root, path, key.objectid,
					  BTRFS_DEV_EXTENT_KEY);
		if (ret)
			goto out;
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
		BUG_ON(found_key.offset > start || found_key.offset +
		       btrfs_dev_extent_length(leaf, extent) < start);
		key = found_key;
		btrfs_release_path(path);
		goto again;
	} else if (ret == 0) {
		leaf = path->nodes[0];
		extent = btrfs_item_ptr(leaf, path->slots[0],
					struct btrfs_dev_extent);
	} else {
		goto out;
	}

	*dev_extent_len = btrfs_dev_extent_length(leaf, extent);

	ret = btrfs_del_item(trans, root, path);
	if (ret == 0)
		set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
out:
	btrfs_free_path(path);
	return ret;
}

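/* Return the logical offset right after the last chunk in the mapping tree. */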
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
	struct rb_node *n;
	u64 ret = 0;

	read_lock(&fs_info->mapping_tree_lock);
	n = rb_last(&fs_info->mapping_tree.rb_root);
	if (n) {
		struct btrfs_chunk_map *map;

		map = rb_entry(n, struct btrfs_chunk_map, rb_node);
		ret = map->start + map->chunk_len;
	}
	read_unlock(&fs_info->mapping_tree_lock);

	return ret;
}

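/*
 * Find the next available devid: look up the highest devid in the chunk tree
 * and return one past it, or 1 if no device item exists.
 */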
static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
				    u64 *devid_ret)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
	if (ret < 0)
		goto error;

	if (ret == 0) {
		/* Corruption */
		btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
		ret = -EUCLEAN;
		goto error;
	}

	ret = btrfs_previous_item(fs_info->chunk_root, path,
				  BTRFS_DEV_ITEMS_OBJECTID,
				  BTRFS_DEV_ITEM_KEY);
	if (ret) {
		*devid_ret = 1;
	} else {
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				      path->slots[0]);
		*devid_ret = found_key.offset + 1;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

/*
 * The device information is stored in the chunk root.
 * The btrfs_device struct should be fully filled in.
 */
static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1879
			    struct btrfs_device *device)
1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	unsigned long ptr;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
Yan Zheng's avatar
Yan Zheng committed
1894
	key.offset = device->devid;
1895

1896
	btrfs_reserve_chunk_metadata(trans, true);
1897 1898
	ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
				      &key, sizeof(*dev_item));
1899
	btrfs_trans_release_chunk_metadata(trans);
1900 1901 1902 1903 1904 1905 1906
	if (ret)
		goto out;

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
Yan Zheng's avatar
Yan Zheng committed
1907
	btrfs_set_device_generation(leaf, dev_item, 0);
1908 1909 1910 1911
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1912 1913 1914 1915
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
1916 1917 1918
	btrfs_set_device_group(leaf, dev_item, 0);
	btrfs_set_device_seek_speed(leaf, dev_item, 0);
	btrfs_set_device_bandwidth(leaf, dev_item, 0);
1919
	btrfs_set_device_start_offset(leaf, dev_item, 0);
1920

1921
	ptr = btrfs_device_uuid(dev_item);
1922
	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1923
	ptr = btrfs_device_fsid(dev_item);
1924 1925
	write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
			    ptr, BTRFS_FSID_SIZE);
1926
	btrfs_mark_buffer_dirty(trans, leaf);
1927

Yan Zheng's avatar
Yan Zheng committed
1928
	ret = 0;
1929 1930 1931 1932
out:
	btrfs_free_path(path);
	return ret;
}
1933

1934 1935 1936
/*
 * Function to update ctime/mtime for a given device path.
 * Mainly used for ctime/mtime based probe like libblkid.
1937 1938
 *
 * We don't care about errors here, this is just to be kind to userspace.
1939
 */
1940
static void update_dev_time(const char *device_path)
1941
{
1942 1943
	struct path path;
	int ret;
1944

1945 1946
	ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
	if (ret)
1947
		return;
1948

1949
	inode_update_time(d_inode(path.dentry), S_MTIME | S_CTIME | S_VERSION);
1950
	path_put(&path);
1951 1952
}

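/* Remove the device item for @device from the chunk tree. */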
static int btrfs_rm_dev_item(struct btrfs_trans_handle *trans,
			     struct btrfs_device *device)
1955
{
1956
	struct btrfs_root *root = device->fs_info->chunk_root;
1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

1969
	btrfs_reserve_chunk_metadata(trans, false);
1970
	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1971
	btrfs_trans_release_chunk_metadata(trans);
1972 1973 1974
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
1975 1976 1977 1978 1979 1980 1981 1982 1983
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
	return ret;
}

1984 1985 1986 1987 1988 1989 1990
/*
 * Verify that @num_devices satisfies the RAID profile constraints in the whole
 * filesystem. It's up to the caller to adjust that number regarding eg. device
 * replace.
 */
static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
		u64 num_devices)
1991 1992
{
	u64 all_avail;
1993
	unsigned seq;
1994
	int i;
1995

1996
	do {
1997
		seq = read_seqbegin(&fs_info->profiles_lock);
1998

1999 2000 2001 2002
		all_avail = fs_info->avail_data_alloc_bits |
			    fs_info->avail_system_alloc_bits |
			    fs_info->avail_metadata_alloc_bits;
	} while (read_seqretry(&fs_info->profiles_lock, seq));
2003

2004
	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
2005
		if (!(all_avail & btrfs_raid_array[i].bg_flag))
2006
			continue;
2007

2008 2009
		if (num_devices < btrfs_raid_array[i].devs_min)
			return btrfs_raid_array[i].mindev_error;
David Woodhouse's avatar
David Woodhouse committed
2010 2011
	}

2012
	return 0;
2013 2014
}

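/*
 * Return the first device in @fs_devs other than @device that is not missing
 * and has an open bdev, or NULL if there is none.
 */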
static struct btrfs_device * btrfs_find_next_active_device(
		struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
2017
{
Yan Zheng's avatar
Yan Zheng committed
2018
	struct btrfs_device *next_device;
2019 2020 2021

	list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
		if (next_device != device &&
2022 2023
		    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
		    && next_device->bdev)
2024 2025 2026 2027 2028 2029 2030
			return next_device;
	}

	return NULL;
}

/*
 * Helper function to check if the given device is part of s_bdev / latest_dev
 * and replace it with the provided or the next active device. In the context
 * where this function is called, there should always be another device (or
 * this_dev) which is active.
 */
2036
void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
2037
					    struct btrfs_device *next_device)
2038
{
2039
	struct btrfs_fs_info *fs_info = device->fs_info;
2040

2041
	if (!next_device)
2042
		next_device = btrfs_find_next_active_device(fs_info->fs_devices,
2043
							    device);
2044 2045 2046 2047 2048 2049
	ASSERT(next_device);

	if (fs_info->sb->s_bdev &&
			(fs_info->sb->s_bdev == device->bdev))
		fs_info->sb->s_bdev = next_device->bdev;

2050 2051
	if (fs_info->fs_devices->latest_dev->bdev == device->bdev)
		fs_info->fs_devices->latest_dev = next_device;
2052 2053
}

2054 2055 2056 2057 2058 2059 2060 2061
/*
 * Return btrfs_fs_devices::num_devices excluding the device that's being
 * currently replaced.
 */
static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
{
	u64 num_devices = fs_info->fs_devices->num_devices;

2062
	down_read(&fs_info->dev_replace.rwsem);
2063 2064 2065 2066
	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
		ASSERT(num_devices > 1);
		num_devices--;
	}
2067
	up_read(&fs_info->dev_replace.rwsem);
2068 2069 2070 2071

	return num_devices;
}

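/*
 * Clear the magic of one on-disk super block copy so the device is no longer
 * recognized as a btrfs device, and sync the change to disk.
 */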
static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info,
				     struct block_device *bdev, int copy_num)
{
	struct btrfs_super_block *disk_super;
2076 2077
	const size_t len = sizeof(disk_super->magic);
	const u64 bytenr = btrfs_sb_offset(copy_num);
2078 2079
	int ret;

2080
	disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr);
2081 2082 2083
	if (IS_ERR(disk_super))
		return;

2084 2085 2086 2087 2088
	memset(&disk_super->magic, 0, len);
	folio_mark_dirty(virt_to_folio(disk_super));
	btrfs_release_disk_super(disk_super);

	ret = sync_blockdev_range(bdev, bytenr, bytenr + len - 1);
2089 2090 2091 2092 2093
	if (ret)
		btrfs_warn(fs_info, "error clearing superblock number %d (%d)",
			copy_num, ret);
}

2094
void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, struct btrfs_device *device)
2095 2096
{
	int copy_num;
2097
	struct block_device *bdev = device->bdev;
2098 2099 2100 2101 2102

	if (!bdev)
		return;

	for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
2103
		if (bdev_is_zoned(bdev))
2104
			btrfs_reset_sb_log_zones(bdev, copy_num);
2105 2106
		else
			btrfs_scratch_superblock(fs_info, bdev, copy_num);
2107 2108 2109 2110 2111 2112
	}

	/* Notify udev that device has changed */
	btrfs_kobject_uevent(bdev, KOBJ_CHANGE);

	/* Update ctime/mtime for device path for libblkid */
2113
	update_dev_time(device->name->str);
2114 2115
}

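/*
 * Remove a device from a mounted filesystem: check that the RAID profile
 * constraints still hold, shrink the device to zero, delete its dev item and
 * list entries, and scratch its super blocks. The bdev file is returned via
 * @bdev_file so the caller can do the final release outside the sb write lock.
 */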
int btrfs_rm_device(struct btrfs_fs_info *fs_info,
		    struct btrfs_dev_lookup_args *args,
2118
		    struct file **bdev_file)
2119
{
2120
	struct btrfs_trans_handle *trans;
2121
	struct btrfs_device *device;
2122
	struct btrfs_fs_devices *cur_devices;
2123
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
Yan Zheng's avatar
Yan Zheng committed
2124
	u64 num_devices;
2125 2126
	int ret = 0;

2127 2128 2129 2130 2131
	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
		btrfs_err(fs_info, "device remove not supported on extent tree v2 yet");
		return -EINVAL;
	}

2132 2133 2134 2135 2136
	/*
	 * The device list in fs_devices is accessed without locks (neither
	 * uuid_mutex nor device_list_mutex) as it won't change on a mounted
	 * filesystem and another device rm cannot run.
	 */
2137
	num_devices = btrfs_num_devices(fs_info);
2138

2139
	ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
2140
	if (ret)
2141
		return ret;
2142

2143 2144 2145
	device = btrfs_find_device(fs_info->fs_devices, args);
	if (!device) {
		if (args->missing)
2146 2147
			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
		else
2148
			ret = -ENOENT;
2149
		return ret;
2150
	}
2151

2152 2153 2154
	if (btrfs_pinned_by_swapfile(fs_info, device)) {
		btrfs_warn_in_rcu(fs_info,
		  "cannot remove device %s (devid %llu) due to active swapfile",
2155
				  btrfs_dev_name(device), device->devid);
2156
		return -ETXTBSY;
2157 2158
	}

2159 2160
	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
		return BTRFS_ERROR_DEV_TGT_REPLACE;
2161

2162
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
2163 2164
	    fs_info->fs_devices->rw_devices == 1)
		return BTRFS_ERROR_DEV_ONLY_WRITABLE;
Yan Zheng's avatar
Yan Zheng committed
2165

2166
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2167
		mutex_lock(&fs_info->chunk_mutex);
Yan Zheng's avatar
Yan Zheng committed
2168
		list_del_init(&device->dev_alloc_list);
2169
		device->fs_devices->rw_devices--;
2170
		mutex_unlock(&fs_info->chunk_mutex);
2171
	}
2172 2173 2174

	ret = btrfs_shrink_device(device, 0);
	if (ret)
2175
		goto error_undo;
2176

2177 2178 2179
	trans = btrfs_start_transaction(fs_info->chunk_root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
2180
		goto error_undo;
2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192
	}

	ret = btrfs_rm_dev_item(trans, device);
	if (ret) {
		/* Any error in dev item removal is critical */
		btrfs_crit(fs_info,
			   "failed to remove device item for devid %llu: %d",
			   device->devid, ret);
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		return ret;
	}
2193

2194
	clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2195
	btrfs_scrub_cancel_dev(device);
2196 2197 2198 2199

	/*
	 * the device list mutex makes sure that we don't change
	 * the device list while someone else is writing out all
2200 2201 2202 2203 2204
	 * the device supers. Whoever is writing all supers, should
	 * lock the device list mutex before getting the number of
	 * devices in the super block (super_copy). Conversely,
	 * whoever updates the number of devices in the super block
	 * (super_copy) should hold the device list mutex.
2205
	 */
2206

2207 2208 2209
	/*
	 * In normal cases the cur_devices == fs_devices. But in case
	 * of deleting a seed device, the cur_devices should point to
2210
	 * its own fs_devices listed under the fs_devices->seed_list.
2211
	 */
2212
	cur_devices = device->fs_devices;
2213
	mutex_lock(&fs_devices->device_list_mutex);
2214
	list_del_rcu(&device->dev_list);
2215

2216 2217
	cur_devices->num_devices--;
	cur_devices->total_devices--;
2218 2219 2220
	/* Update total_devices of the parent fs_devices if it's seed */
	if (cur_devices != fs_devices)
		fs_devices->total_devices--;
Yan Zheng's avatar
Yan Zheng committed
2221

2222
	if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2223
		cur_devices->missing_devices--;
2224

2225
	btrfs_assign_next_active_device(device, NULL);
Yan Zheng's avatar
Yan Zheng committed
2226

2227
	if (device->bdev_file) {
2228
		cur_devices->open_devices--;
2229
		/* remove sysfs entry */
2230
		btrfs_sysfs_remove_device(device);
2231
	}
2232

2233 2234
	num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
	btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2235
	mutex_unlock(&fs_devices->device_list_mutex);
Yan Zheng's avatar
Yan Zheng committed
2236

2237
	/*
2238 2239 2240 2241 2242
	 * At this point, the device is zero sized and detached from the
	 * devices list.  All that's left is to zero out the old supers and
	 * free the device.
	 *
	 * We cannot call btrfs_close_bdev() here because we're holding the sb
2243 2244 2245
	 * write lock, and fput() on the block device will pull in the
	 * ->open_mutex on the block device and it's dependencies.  Instead
	 *  just flush the device and let the caller do the final bdev_release.
2246
	 */
2247
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2248
		btrfs_scratch_superblocks(fs_info, device);
2249 2250 2251 2252 2253
		if (device->bdev) {
			sync_blockdev(device->bdev);
			invalidate_bdev(device->bdev);
		}
	}
2254

2255
	*bdev_file = device->bdev_file;
2256 2257
	synchronize_rcu();
	btrfs_free_device(device);
2258

2259 2260 2261 2262 2263 2264 2265
	/*
	 * This can happen if cur_devices is the private seed devices list.  We
	 * cannot call close_fs_devices() here because it expects the uuid_mutex
	 * to be held, but in fact we don't need that for the private
	 * seed_devices, we can simply decrement cur_devices->opened and then
	 * remove it from our list and free the fs_devices.
	 */
2266
	if (cur_devices->num_devices == 0) {
2267
		list_del_init(&cur_devices->seed_list);
2268 2269
		ASSERT(cur_devices->opened == 1);
		cur_devices->opened--;
2270
		free_fs_devices(cur_devices);
Yan Zheng's avatar
Yan Zheng committed
2271 2272
	}

2273 2274
	ret = btrfs_commit_transaction(trans);

2275
	return ret;
2276

2277
error_undo:
2278
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2279
		mutex_lock(&fs_info->chunk_mutex);
2280
		list_add(&device->dev_alloc_list,
2281
			 &fs_devices->alloc_list);
2282
		device->fs_devices->rw_devices++;
2283
		mutex_unlock(&fs_info->chunk_mutex);
2284
	}
2285
	return ret;
2286 2287
}

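/*
 * Unlink the replace source device from its fs_devices and fix up the device
 * counters. Caller must hold the device_list_mutex.
 */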
void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2289
{
2290 2291
	struct btrfs_fs_devices *fs_devices;

2292
	lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2293

2294 2295 2296 2297 2298 2299 2300
	/*
	 * In case of an fs with no seed, srcdev->fs_devices will point
	 * to the fs_devices of fs_info. However, when the device being
	 * replaced is a seed device, it will point to the seed's local
	 * fs_devices. In short, srcdev will have its correct fs_devices
	 * in either case.
	 */
	fs_devices = srcdev->fs_devices;
2301

2302
	list_del_rcu(&srcdev->dev_list);
2303
	list_del(&srcdev->dev_alloc_list);
2304
	fs_devices->num_devices--;
2305
	if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2306
		fs_devices->missing_devices--;
2307

2308
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2309
		fs_devices->rw_devices--;
2310

2311
	if (srcdev->bdev)
2312
		fs_devices->open_devices--;
2313 2314
}

2315
void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2316 2317
{
	struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2318

2319 2320
	mutex_lock(&uuid_mutex);

2321
	btrfs_close_bdev(srcdev);
2322 2323
	synchronize_rcu();
	btrfs_free_device(srcdev);

	/* If there are no devices left, delete the fs_devices. */
	if (!fs_devices->num_devices) {
2327 2328 2329 2330 2331 2332 2333 2334
		/*
		 * On a mounted FS, num_devices can't be zero unless it's a
		 * seed. In case of a seed device being replaced, the replace
		 * target added to the sprout FS, so there will be no more
		 * device left under the seed FS.
		 */
		ASSERT(fs_devices->seeding);

2335
		list_del_init(&fs_devices->seed_list);
2336
		close_fs_devices(fs_devices);
2337
		free_fs_devices(fs_devices);
2338
	}
2339
	mutex_unlock(&uuid_mutex);
2340 2341
}

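/*
 * Tear down the replace target device: remove it from sysfs and the device
 * list, scratch its super blocks, then close and free it.
 */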
void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2343
{
2344
	struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2345 2346

	mutex_lock(&fs_devices->device_list_mutex);
2347

2348
	btrfs_sysfs_remove_device(tgtdev);
2349

2350
	if (tgtdev->bdev)
2351
		fs_devices->open_devices--;
2352

2353
	fs_devices->num_devices--;
2354

2355
	btrfs_assign_next_active_device(tgtdev, NULL);
2356 2357 2358

	list_del_rcu(&tgtdev->dev_list);

2359
	mutex_unlock(&fs_devices->device_list_mutex);
2360

2361
	btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev);
2362 2363

	btrfs_close_bdev(tgtdev);
2364 2365
	synchronize_rcu();
	btrfs_free_device(tgtdev);
2366 2367
}

2368 2369
/*
 * Populate args from device at path.
2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388
 *
 * @fs_info:	the filesystem
 * @args:	the args to populate
 * @path:	the path to the device
 *
 * This will read the super block of the device at @path and populate @args with
 * the devid, fsid, and uuid.  This is meant to be used for ioctls that need to
 * lookup a device to operate on, but need to do it before we take any locks.
 * This properly handles the special case of "missing" that a user may pass in,
 * and does some basic sanity checks.  The caller must make sure that @path is
 * properly NUL terminated before calling in, and must call
 * btrfs_put_dev_args_from_path() in order to free up the temporary fsid and
 * uuid buffers.
 *
 * Return: 0 for success, -errno for failure
 */
int btrfs_get_dev_args_from_path(struct btrfs_fs_info *fs_info,
				 struct btrfs_dev_lookup_args *args,
				 const char *path)
2389 2390
{
	struct btrfs_super_block *disk_super;
2391
	struct file *bdev_file;
2392
	int ret;
2393

2394 2395 2396 2397 2398 2399
	if (!path || !path[0])
		return -EINVAL;
	if (!strcmp(path, "missing")) {
		args->missing = true;
		return 0;
	}
2400

2401 2402 2403 2404 2405 2406
	args->uuid = kzalloc(BTRFS_UUID_SIZE, GFP_KERNEL);
	args->fsid = kzalloc(BTRFS_FSID_SIZE, GFP_KERNEL);
	if (!args->uuid || !args->fsid) {
		btrfs_put_dev_args_from_path(args);
		return -ENOMEM;
	}
2407

2408
	ret = btrfs_get_bdev_and_sb(path, BLK_OPEN_READ, NULL, 0,
2409
				    &bdev_file, &disk_super);
2410 2411
	if (ret) {
		btrfs_put_dev_args_from_path(args);
2412
		return ret;
2413 2414
	}

2415 2416
	args->devid = btrfs_stack_device_id(&disk_super->dev_item);
	memcpy(args->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE);
2417
	if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2418
		memcpy(args->fsid, disk_super->metadata_uuid, BTRFS_FSID_SIZE);
2419
	else
2420
		memcpy(args->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
2421
	btrfs_release_disk_super(disk_super);
2422
	fput(bdev_file);
2423
	return 0;
2424 2425
}

2426
/*
2427 2428 2429
 * Only use this jointly with btrfs_get_dev_args_from_path() because we will
 * allocate our ->uuid and ->fsid pointers, everybody else uses local variables
 * that don't need to be freed.
2430
 */
2431 2432 2433 2434 2435 2436 2437 2438
void btrfs_put_dev_args_from_path(struct btrfs_dev_lookup_args *args)
{
	kfree(args->uuid);
	kfree(args->fsid);
	args->uuid = NULL;
	args->fsid = NULL;
}

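/*
 * Find a device either by devid (when non-zero) or by reading the super block
 * at @device_path and matching its devid, uuid and fsid.
 */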
struct btrfs_device *btrfs_find_device_by_devspec(
2440 2441
		struct btrfs_fs_info *fs_info, u64 devid,
		const char *device_path)
2442
{
2443
	BTRFS_DEV_LOOKUP_ARGS(args);
2444
	struct btrfs_device *device;
2445
	int ret;
2446

2447
	if (devid) {
2448 2449
		args.devid = devid;
		device = btrfs_find_device(fs_info->fs_devices, &args);
2450 2451
		if (!device)
			return ERR_PTR(-ENOENT);
2452 2453 2454
		return device;
	}

2455 2456 2457 2458 2459 2460
	ret = btrfs_get_dev_args_from_path(fs_info, &args, device_path);
	if (ret)
		return ERR_PTR(ret);
	device = btrfs_find_device(fs_info->fs_devices, &args);
	btrfs_put_dev_args_from_path(&args);
	if (!device)
2461
		return ERR_PTR(-ENOENT);
2462
	return device;
2463 2464
}

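/*
 * Set up the fs_devices structures needed to sprout a seed filesystem: create
 * the private copy of the seed devices that will be anchored at
 * fs_info->fs_devices->seed_list and keep a clone in fs_uuids.
 */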
static struct btrfs_fs_devices *btrfs_init_sprout(struct btrfs_fs_info *fs_info)
Yan Zheng's avatar
Yan Zheng committed
2466
{
2467
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
Yan Zheng's avatar
Yan Zheng committed
2468
	struct btrfs_fs_devices *old_devices;
Yan Zheng's avatar
Yan Zheng committed
2469
	struct btrfs_fs_devices *seed_devices;
Yan Zheng's avatar
Yan Zheng committed
2470

2471
	lockdep_assert_held(&uuid_mutex);
Yan Zheng's avatar
Yan Zheng committed
2472
	if (!fs_devices->seeding)
2473
		return ERR_PTR(-EINVAL);
Yan Zheng's avatar
Yan Zheng committed
2474

2475 2476 2477 2478
	/*
	 * Private copy of the seed devices, anchored at
	 * fs_info->fs_devices->seed_list
	 */
2479
	seed_devices = alloc_fs_devices(NULL);
2480
	if (IS_ERR(seed_devices))
2481
		return seed_devices;
Yan Zheng's avatar
Yan Zheng committed
2482

2483 2484 2485 2486 2487 2488
	/*
	 * It's necessary to retain a copy of the original seed fs_devices in
	 * fs_uuids so that filesystems which have been seeded can successfully
	 * reference the seed device from open_seed_devices. This also supports
	 * multiple fs seed.
	 */
Yan Zheng's avatar
Yan Zheng committed
2489 2490 2491
	old_devices = clone_fs_devices(fs_devices);
	if (IS_ERR(old_devices)) {
		kfree(seed_devices);
2492
		return old_devices;
Yan Zheng's avatar
Yan Zheng committed
2493
	}
Yan Zheng's avatar
Yan Zheng committed
2494

2495
	list_add(&old_devices->fs_list, &fs_uuids);
Yan Zheng's avatar
Yan Zheng committed
2496

Yan Zheng's avatar
Yan Zheng committed
2497 2498 2499 2500
	memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
	seed_devices->opened = 1;
	INIT_LIST_HEAD(&seed_devices->devices);
	INIT_LIST_HEAD(&seed_devices->alloc_list);
2501
	mutex_init(&seed_devices->device_list_mutex);
2502

2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537
	return seed_devices;
}

/*
 * Splice seed devices into the sprout fs_devices.
 * Generate a new fsid for the sprouted read-write filesystem.
 */
static void btrfs_setup_sprout(struct btrfs_fs_info *fs_info,
			       struct btrfs_fs_devices *seed_devices)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_super_block *disk_super = fs_info->super_copy;
	struct btrfs_device *device;
	u64 super_flags;

	/*
	 * We are updating the fsid, the thread leading to device_list_add()
	 * could race, so uuid_mutex is needed.
	 */
	lockdep_assert_held(&uuid_mutex);

	/*
	 * The threads listed below may traverse dev_list but can do that without
	 * device_list_mutex:
	 * - All device ops and balance - as we are in btrfs_exclop_start.
	 * - Various dev_list readers - are using RCU.
	 * - btrfs_ioctl_fitrim() - is using RCU.
	 *
	 * For-read threads as below are using device_list_mutex:
	 * - Readonly scrub btrfs_scrub_dev()
	 * - Readonly scrub btrfs_scrub_progress()
	 * - btrfs_get_dev_stats()
	 */
	lockdep_assert_held(&fs_devices->device_list_mutex);

2538 2539
	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
			      synchronize_rcu);
Miao Xie's avatar
Miao Xie committed
2540 2541
	list_for_each_entry(device, &seed_devices->devices, dev_list)
		device->fs_devices = seed_devices;
2542

2543
	fs_devices->seeding = false;
Yan Zheng's avatar
Yan Zheng committed
2544 2545
	fs_devices->num_devices = 0;
	fs_devices->open_devices = 0;
2546
	fs_devices->missing_devices = 0;
2547
	fs_devices->rotating = false;
2548
	list_add(&seed_devices->seed_list, &fs_devices->seed_list);
Yan Zheng's avatar
Yan Zheng committed
2549 2550

	generate_random_uuid(fs_devices->fsid);
2551
	memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
Yan Zheng's avatar
Yan Zheng committed
2552
	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2553

Yan Zheng's avatar
Yan Zheng committed
2554 2555 2556 2557 2558 2559
	super_flags = btrfs_super_flags(disk_super) &
		      ~BTRFS_SUPER_FLAG_SEEDING;
	btrfs_set_super_flags(disk_super, super_flags);
}

/*
2560
 * Store the expected generation for seed devices in device items.
Yan Zheng's avatar
Yan Zheng committed
2561
 */
2562
static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
Yan Zheng's avatar
Yan Zheng committed
2563
{
2564
	BTRFS_DEV_LOOKUP_ARGS(args);
2565
	struct btrfs_fs_info *fs_info = trans->fs_info;
2566
	struct btrfs_root *root = fs_info->chunk_root;
Yan Zheng's avatar
Yan Zheng committed
2567 2568 2569 2570 2571
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_dev_item *dev_item;
	struct btrfs_device *device;
	struct btrfs_key key;
2572
	u8 fs_uuid[BTRFS_FSID_SIZE];
Yan Zheng's avatar
Yan Zheng committed
2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584
	u8 dev_uuid[BTRFS_UUID_SIZE];
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = BTRFS_DEV_ITEM_KEY;

	while (1) {
2585
		btrfs_reserve_chunk_metadata(trans, false);
Yan Zheng's avatar
Yan Zheng committed
2586
		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2587
		btrfs_trans_release_chunk_metadata(trans);
Yan Zheng's avatar
Yan Zheng committed
2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600
		if (ret < 0)
			goto error;

		leaf = path->nodes[0];
next_slot:
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret > 0)
				break;
			if (ret < 0)
				goto error;
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2601
			btrfs_release_path(path);
Yan Zheng's avatar
Yan Zheng committed
2602 2603 2604 2605 2606 2607 2608 2609 2610 2611
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
		    key.type != BTRFS_DEV_ITEM_KEY)
			break;

		dev_item = btrfs_item_ptr(leaf, path->slots[0],
					  struct btrfs_dev_item);
2612
		args.devid = btrfs_device_id(leaf, dev_item);
2613
		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
Yan Zheng's avatar
Yan Zheng committed
2614
				   BTRFS_UUID_SIZE);
2615
		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2616
				   BTRFS_FSID_SIZE);
2617 2618 2619
		args.uuid = dev_uuid;
		args.fsid = fs_uuid;
		device = btrfs_find_device(fs_info->fs_devices, &args);
2620
		BUG_ON(!device); /* Logic error */
Yan Zheng's avatar
Yan Zheng committed
2621 2622 2623 2624

		if (device->fs_devices->seeding) {
			btrfs_set_device_generation(leaf, dev_item,
						    device->generation);
2625
			btrfs_mark_buffer_dirty(trans, leaf);
Yan Zheng's avatar
Yan Zheng committed
2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636
		}

		path->slots[0]++;
		goto next_slot;
	}
	ret = 0;
error:
	btrfs_free_path(path);
	return ret;
}

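/*
 * Add the device at @device_path to the mounted filesystem: open it, create
 * its dev item, and, when adding to a seed filesystem, sprout a new writable
 * filesystem on top of it.
 */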
int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2638
{
2639
	struct btrfs_root *root = fs_info->dev_root;
2640 2641
	struct btrfs_trans_handle *trans;
	struct btrfs_device *device;
2642
	struct file *bdev_file;
2643
	struct super_block *sb = fs_info->sb;
2644
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2645
	struct btrfs_fs_devices *seed_devices = NULL;
2646 2647
	u64 orig_super_total_bytes;
	u64 orig_super_num_devices;
2648
	int ret = 0;
2649
	bool seeding_dev = false;
2650
	bool locked = false;
2651

2652
	if (sb_rdonly(sb) && !fs_devices->seeding)
2653
		return -EROFS;
2654

2655
	bdev_file = bdev_file_open_by_path(device_path, BLK_OPEN_WRITE,
2656
					fs_info->bdev_holder, NULL);
2657 2658
	if (IS_ERR(bdev_file))
		return PTR_ERR(bdev_file);
2659

2660
	if (!btrfs_check_device_zone_type(fs_info, file_bdev(bdev_file))) {
2661 2662 2663 2664
		ret = -EINVAL;
		goto error;
	}

2665
	if (fs_devices->seeding) {
2666
		seeding_dev = true;
Yan Zheng's avatar
Yan Zheng committed
2667 2668
		down_write(&sb->s_umount);
		mutex_lock(&uuid_mutex);
2669
		locked = true;
Yan Zheng's avatar
Yan Zheng committed
2670 2671
	}

2672
	sync_blockdev(file_bdev(bdev_file));
2673

2674 2675
	rcu_read_lock();
	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
2676
		if (device->bdev == file_bdev(bdev_file)) {
2677
			ret = -EEXIST;
2678
			rcu_read_unlock();
Yan Zheng's avatar
Yan Zheng committed
2679
			goto error;
2680 2681
		}
	}
2682
	rcu_read_unlock();
2683

2684
	device = btrfs_alloc_device(fs_info, NULL, NULL, device_path);
2685
	if (IS_ERR(device)) {
2686
		/* we can safely leave the fs_devices entry around */
2687
		ret = PTR_ERR(device);
Yan Zheng's avatar
Yan Zheng committed
2688
		goto error;
2689 2690
	}

2691
	device->fs_info = fs_info;
2692 2693
	device->bdev_file = bdev_file;
	device->bdev = file_bdev(bdev_file);
2694 2695 2696
	ret = lookup_bdev(device_path, &device->devt);
	if (ret)
		goto error_free_device;
2697

2698
	ret = btrfs_get_dev_zone_info(device, false);
2699 2700 2701
	if (ret)
		goto error_free_device;

2702
	trans = btrfs_start_transaction(root, 0);
2703 2704
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
2705
		goto error_free_zone;
2706 2707
	}

2708
	set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
Yan Zheng's avatar
Yan Zheng committed
2709
	device->generation = trans->transid;
2710 2711 2712
	device->io_width = fs_info->sectorsize;
	device->io_align = fs_info->sectorsize;
	device->sector_size = fs_info->sectorsize;
2713
	device->total_bytes =
2714
		round_down(bdev_nr_bytes(device->bdev), fs_info->sectorsize);
2715
	device->disk_total_bytes = device->total_bytes;
2716
	device->commit_total_bytes = device->total_bytes;
2717
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2718
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2719
	device->dev_stats_valid = 1;
2720
	set_blocksize(device->bdev_file, BTRFS_BDEV_BLOCKSIZE);
2721

Yan Zheng's avatar
Yan Zheng committed
2722
	if (seeding_dev) {
2723
		btrfs_clear_sb_rdonly(sb);
2724 2725 2726 2727 2728

		/* GFP_KERNEL allocation must not be under device_list_mutex */
		seed_devices = btrfs_init_sprout(fs_info);
		if (IS_ERR(seed_devices)) {
			ret = PTR_ERR(seed_devices);
2729 2730 2731
			btrfs_abort_transaction(trans, ret);
			goto error_trans;
		}
2732 2733 2734 2735 2736
	}

	mutex_lock(&fs_devices->device_list_mutex);
	if (seeding_dev) {
		btrfs_setup_sprout(fs_info, seed_devices);
2737 2738
		btrfs_assign_next_active_device(fs_info->fs_devices->latest_dev,
						device);
Yan Zheng's avatar
Yan Zheng committed
2739
	}
2740

2741
	device->fs_devices = fs_devices;
2742

2743
	mutex_lock(&fs_info->chunk_mutex);
2744 2745 2746 2747 2748 2749 2750
	list_add_rcu(&device->dev_list, &fs_devices->devices);
	list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
	fs_devices->num_devices++;
	fs_devices->open_devices++;
	fs_devices->rw_devices++;
	fs_devices->total_devices++;
	fs_devices->total_rw_bytes += device->total_bytes;
2751

2752
	atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2753

2754
	if (!bdev_nonrot(device->bdev))
2755
		fs_devices->rotating = true;
Chris Mason's avatar
Chris Mason committed
2756

2757
	orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
2758
	btrfs_set_super_total_bytes(fs_info->super_copy,
2759 2760
		round_down(orig_super_total_bytes + device->total_bytes,
			   fs_info->sectorsize));
2761

2762 2763 2764
	orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices + 1);
2765

Miao Xie's avatar
Miao Xie committed
2766 2767 2768 2769
	/*
	 * we've got more storage, clear any full flags on the space
	 * infos
	 */
2770
	btrfs_clear_space_info_full(fs_info);
Miao Xie's avatar
Miao Xie committed
2771

2772
	mutex_unlock(&fs_info->chunk_mutex);
2773 2774

	/* Add sysfs device entry */
2775
	btrfs_sysfs_add_device(device);
2776

2777
	mutex_unlock(&fs_devices->device_list_mutex);
2778

Yan Zheng's avatar
Yan Zheng committed
2779
	if (seeding_dev) {
2780
		mutex_lock(&fs_info->chunk_mutex);
2781
		ret = init_first_rw_device(trans);
2782
		mutex_unlock(&fs_info->chunk_mutex);
2783
		if (ret) {
2784
			btrfs_abort_transaction(trans, ret);
2785
			goto error_sysfs;
2786
		}
Miao Xie's avatar
Miao Xie committed
2787 2788
	}

2789
	ret = btrfs_add_dev_item(trans, device);
Miao Xie's avatar
Miao Xie committed
2790
	if (ret) {
2791
		btrfs_abort_transaction(trans, ret);
2792
		goto error_sysfs;
Miao Xie's avatar
Miao Xie committed
2793 2794 2795
	}

	if (seeding_dev) {
2796
		ret = btrfs_finish_sprout(trans);
2797
		if (ret) {
2798
			btrfs_abort_transaction(trans, ret);
2799
			goto error_sysfs;
2800
		}
2801

2802 2803
		/*
		 * fs_devices now represents the newly sprouted filesystem and
2804
		 * its fsid has been changed by btrfs_sprout_splice().
2805 2806
		 */
		btrfs_sysfs_update_sprout_fsid(fs_devices);
Yan Zheng's avatar
Yan Zheng committed
2807 2808
	}

2809
	ret = btrfs_commit_transaction(trans);
2810

Yan Zheng's avatar
Yan Zheng committed
2811 2812 2813
	if (seeding_dev) {
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
2814
		locked = false;
2815

2816 2817 2818
		if (ret) /* transaction commit */
			return ret;

2819
		ret = btrfs_relocate_sys_chunks(fs_info);
2820
		if (ret < 0)
2821
			btrfs_handle_fs_error(fs_info, ret,
2822
				    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2823 2824 2825 2826
		trans = btrfs_attach_transaction(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) == -ENOENT)
				return 0;
2827 2828 2829
			ret = PTR_ERR(trans);
			trans = NULL;
			goto error_sysfs;
2830
		}
2831
		ret = btrfs_commit_transaction(trans);
Yan Zheng's avatar
Yan Zheng committed
2832
	}
2833

2834 2835 2836 2837 2838 2839 2840
	/*
	 * Now that we have written a new super block to this device, check all
	 * other fs_devices list if device_path alienates any other scanned
	 * device.
	 * We can ignore the return value as it typically returns -EINVAL and
	 * only succeeds if the device was an alien.
	 */
2841
	btrfs_forget_devices(device->devt);
2842 2843

	/* Update ctime/mtime for blkid or udev */
2844
	update_dev_time(device_path);
2845

Yan Zheng's avatar
Yan Zheng committed
2846
	return ret;
2847

2848
error_sysfs:
2849
	btrfs_sysfs_remove_device(device);
2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865
	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	mutex_lock(&fs_info->chunk_mutex);
	list_del_rcu(&device->dev_list);
	list_del(&device->dev_alloc_list);
	fs_info->fs_devices->num_devices--;
	fs_info->fs_devices->open_devices--;
	fs_info->fs_devices->rw_devices--;
	fs_info->fs_devices->total_devices--;
	fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
	atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
	btrfs_set_super_total_bytes(fs_info->super_copy,
				    orig_super_total_bytes);
	btrfs_set_super_num_devices(fs_info->super_copy,
				    orig_super_num_devices);
	mutex_unlock(&fs_info->chunk_mutex);
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2866
error_trans:
2867
	if (seeding_dev)
2868
		btrfs_set_sb_rdonly(sb);
2869 2870
	if (trans)
		btrfs_end_transaction(trans);
2871 2872
error_free_zone:
	btrfs_destroy_dev_zone_info(device);
2873
error_free_device:
2874
	btrfs_free_device(device);
Yan Zheng's avatar
Yan Zheng committed
2875
error:
2876
	fput(bdev_file);
2877
	if (locked) {
Yan Zheng's avatar
Yan Zheng committed
2878 2879 2880
		mutex_unlock(&uuid_mutex);
		up_write(&sb->s_umount);
	}
2881
	return ret;
2882 2883
}

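/*
 * Write the in-memory device attributes and sizes back to the device item in
 * the chunk tree.
 */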
static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
					struct btrfs_device *device)
2886 2887 2888
{
	int ret;
	struct btrfs_path *path;
2889
	struct btrfs_root *root = device->fs_info->chunk_root;
2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918
	struct btrfs_dev_item *dev_item;
	struct extent_buffer *leaf;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = device->devid;

	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	if (ret < 0)
		goto out;

	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

	btrfs_set_device_id(leaf, dev_item, device->devid);
	btrfs_set_device_type(leaf, dev_item, device->type);
	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2919 2920 2921 2922
	btrfs_set_device_total_bytes(leaf, dev_item,
				     btrfs_device_get_disk_total_bytes(device));
	btrfs_set_device_bytes_used(leaf, dev_item,
				    btrfs_device_get_bytes_used(device));
2923
	btrfs_mark_buffer_dirty(trans, leaf);
2924 2925 2926 2927 2928 2929

out:
	btrfs_free_path(path);
	return ret;
}

Miao Xie's avatar
Miao Xie committed
2930
int btrfs_grow_device(struct btrfs_trans_handle *trans,
2931 2932
		      struct btrfs_device *device, u64 new_size)
{
2933 2934
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_super_block *super_copy = fs_info->super_copy;
Miao Xie's avatar
Miao Xie committed
2935 2936
	u64 old_total;
	u64 diff;
2937
	int ret;
2938

2939
	if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
Yan Zheng's avatar
Yan Zheng committed
2940
		return -EACCES;
Miao Xie's avatar
Miao Xie committed
2941

2942 2943
	new_size = round_down(new_size, fs_info->sectorsize);

2944
	mutex_lock(&fs_info->chunk_mutex);
Miao Xie's avatar
Miao Xie committed
2945
	old_total = btrfs_super_total_bytes(super_copy);
2946
	diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
Miao Xie's avatar
Miao Xie committed
2947

2948
	if (new_size <= device->total_bytes ||
2949
	    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2950
		mutex_unlock(&fs_info->chunk_mutex);
Yan Zheng's avatar
Yan Zheng committed
2951
		return -EINVAL;
Miao Xie's avatar
Miao Xie committed
2952
	}
Yan Zheng's avatar
Yan Zheng committed
2953

2954 2955
	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total + diff, fs_info->sectorsize));
Yan Zheng's avatar
Yan Zheng committed
2956
	device->fs_devices->total_rw_bytes += diff;
2957
	atomic64_add(diff, &fs_info->free_chunk_space);
Yan Zheng's avatar
Yan Zheng committed
2958

2959 2960
	btrfs_device_set_total_bytes(device, new_size);
	btrfs_device_set_disk_total_bytes(device, new_size);
2961
	btrfs_clear_space_info_full(device->fs_info);
2962 2963 2964
	if (list_empty(&device->post_commit_list))
		list_add_tail(&device->post_commit_list,
			      &trans->transaction->dev_update_list);
2965
	mutex_unlock(&fs_info->chunk_mutex);
2966

2967 2968 2969 2970 2971
	btrfs_reserve_chunk_metadata(trans, false);
	ret = btrfs_update_device(trans, device);
	btrfs_trans_release_chunk_metadata(trans);

	return ret;
2972 2973
}

2974
static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2975
{
2976
	struct btrfs_fs_info *fs_info = trans->fs_info;
2977
	struct btrfs_root *root = fs_info->chunk_root;
2978 2979 2980 2981 2982 2983 2984 2985
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

2986
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2987 2988 2989 2990
	key.offset = chunk_offset;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2991 2992 2993
	if (ret < 0)
		goto out;
	else if (ret > 0) { /* Logic error or corruption */
2994 2995 2996 2997
		btrfs_err(fs_info, "failed to lookup chunk %llu when freeing",
			  chunk_offset);
		btrfs_abort_transaction(trans, -ENOENT);
		ret = -EUCLEAN;
2998 2999
		goto out;
	}
3000 3001

	ret = btrfs_del_item(trans, root, path);
3002 3003 3004 3005 3006
	if (ret < 0) {
		btrfs_err(fs_info, "failed to delete chunk %llu item", chunk_offset);
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
3007
out:
3008
	btrfs_free_path(path);
3009
	return ret;
3010 3011
}

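/*
 * Remove the entry for the chunk at @chunk_offset from the superblock's
 * sys_chunk_array.
 */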
static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
3013
{
3014
	struct btrfs_super_block *super_copy = fs_info->super_copy;
3015 3016 3017 3018 3019 3020 3021 3022 3023 3024
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
	u8 *ptr;
	int ret = 0;
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
	u32 cur;
	struct btrfs_key key;

3025
	lockdep_assert_held(&fs_info->chunk_mutex);
3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044
	array_size = btrfs_super_sys_array_size(super_copy);

	ptr = super_copy->sys_chunk_array;
	cur = 0;

	while (cur < array_size) {
		disk_key = (struct btrfs_disk_key *)ptr;
		btrfs_disk_key_to_cpu(&key, disk_key);

		len = sizeof(*disk_key);

		if (key.type == BTRFS_CHUNK_ITEM_KEY) {
			chunk = (struct btrfs_chunk *)(ptr + len);
			num_stripes = btrfs_stack_chunk_num_stripes(chunk);
			len += btrfs_chunk_item_size(num_stripes);
		} else {
			ret = -EIO;
			break;
		}
3045
		if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057
		    key.offset == chunk_offset) {
			memmove(ptr, ptr + len, array_size - (cur + len));
			array_size -= len;
			btrfs_set_super_sys_array_size(super_copy, array_size);
		} else {
			ptr += len;
			cur += len;
		}
	}
	return ret;
}

3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132
struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info,
						    u64 logical, u64 length)
{
	struct rb_node *node = fs_info->mapping_tree.rb_root.rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev;
	struct btrfs_chunk_map *map;
	struct btrfs_chunk_map *prev_map = NULL;

	while (node) {
		map = rb_entry(node, struct btrfs_chunk_map, rb_node);
		prev = node;
		prev_map = map;

		if (logical < map->start) {
			node = node->rb_left;
		} else if (logical >= map->start + map->chunk_len) {
			node = node->rb_right;
		} else {
			refcount_inc(&map->refs);
			return map;
		}
	}

	if (!prev)
		return NULL;

	orig_prev = prev;
	while (prev && logical >= prev_map->start + prev_map->chunk_len) {
		prev = rb_next(prev);
		prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
	}

	if (!prev) {
		prev = orig_prev;
		prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
		while (prev && logical < prev_map->start) {
			prev = rb_prev(prev);
			prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
		}
	}

	if (prev) {
		u64 end = logical + length;

		/*
		 * Caller can pass a U64_MAX length when it wants to get any
		 * chunk starting at an offset of 'logical' or higher, so deal
		 * with underflow by resetting the end offset to U64_MAX.
		 */
		if (end < logical)
			end = U64_MAX;

		if (end > prev_map->start &&
		    logical < prev_map->start + prev_map->chunk_len) {
			refcount_inc(&prev_map->refs);
			return prev_map;
		}
	}

	return NULL;
}

struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info,
					     u64 logical, u64 length)
{
	struct btrfs_chunk_map *map;

	read_lock(&fs_info->mapping_tree_lock);
	map = btrfs_find_chunk_map_nolock(fs_info, logical, length);
	read_unlock(&fs_info->mapping_tree_lock);

	return map;
}

3133
/*
3134 3135
 * Find the mapping containing the given logical extent.
 *
3136 3137 3138 3139 3140
 * @logical: Logical block offset in bytes.
 * @length: Length of extent in bytes.
 *
 * Return: Chunk mapping or ERR_PTR.
 */
3141 3142
struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
					    u64 logical, u64 length)
3143
{
3144
	struct btrfs_chunk_map *map;
3145

3146
	map = btrfs_find_chunk_map(fs_info, logical, length);
3147

3148
	if (unlikely(!map)) {
3149 3150
		btrfs_crit(fs_info,
			   "unable to find chunk map for logical %llu length %llu",
3151 3152 3153 3154
			   logical, length);
		return ERR_PTR(-EINVAL);
	}

3155
	if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) {
3156
		btrfs_crit(fs_info,
3157
			   "found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
3158 3159 3160
			   logical, logical + length, map->start,
			   map->start + map->chunk_len);
		btrfs_free_chunk_map(map);
3161 3162 3163
		return ERR_PTR(-EINVAL);
	}

3164 3165
	/* Callers are responsible for dropping the reference. */
	return map;
3166 3167
}

3168
static int remove_chunk_item(struct btrfs_trans_handle *trans,
3169
			     struct btrfs_chunk_map *map, u64 chunk_offset)
3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190
{
	int i;

	/*
	 * Removing chunk items and updating the device items in the chunks btree
	 * requires holding the chunk_mutex.
	 * See the comment at btrfs_chunk_alloc() for the details.
	 */
	lockdep_assert_held(&trans->fs_info->chunk_mutex);

	for (i = 0; i < map->num_stripes; i++) {
		int ret;

		ret = btrfs_update_device(trans, map->stripes[i].dev);
		if (ret)
			return ret;
	}

	return btrfs_free_chunk(trans, chunk_offset);
}

3191
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
3192
{
3193
	struct btrfs_fs_info *fs_info = trans->fs_info;
3194
	struct btrfs_chunk_map *map;
Miao Xie's avatar
Miao Xie committed
3195
	u64 dev_extent_len = 0;
3196
	int i, ret = 0;
3197
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
3198

3199 3200
	map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(map)) {
3201 3202
		/*
		 * This is a logic error, but we don't want to just rely on the
3203
		 * user having built with ASSERT enabled, so if ASSERT doesn't
3204 3205 3206
		 * do anything we still error out.
		 */
		ASSERT(0);
3207
		return PTR_ERR(map);
3208
	}
3209

3210
	/*
3211 3212 3213 3214 3215 3216 3217 3218
	 * First delete the device extent items from the devices btree.
	 * We take the device_list_mutex to avoid racing with the finishing phase
	 * of a device replace operation. See the comment below before acquiring
	 * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
	 * because that can result in a deadlock when deleting the device extent
	 * items from the devices btree - COWing an extent buffer from the btree
	 * may result in allocating a new metadata chunk, which would attempt to
	 * lock again fs_info->chunk_mutex.
3219 3220
	 */
	mutex_lock(&fs_devices->device_list_mutex);
3221
	for (i = 0; i < map->num_stripes; i++) {
3222
		struct btrfs_device *device = map->stripes[i].dev;
Miao Xie's avatar
Miao Xie committed
3223 3224 3225
		ret = btrfs_free_dev_extent(trans, device,
					    map->stripes[i].physical,
					    &dev_extent_len);
3226
		if (ret) {
3227
			mutex_unlock(&fs_devices->device_list_mutex);
3228
			btrfs_abort_transaction(trans, ret);
3229 3230
			goto out;
		}
3231

Miao Xie's avatar
Miao Xie committed
3232
		if (device->bytes_used > 0) {
3233
			mutex_lock(&fs_info->chunk_mutex);
Miao Xie's avatar
Miao Xie committed
3234 3235
			btrfs_device_set_bytes_used(device,
					device->bytes_used - dev_extent_len);
3236
			atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
3237
			btrfs_clear_space_info_full(fs_info);
3238
			mutex_unlock(&fs_info->chunk_mutex);
Miao Xie's avatar
Miao Xie committed
3239
		}
3240 3241
	}
	mutex_unlock(&fs_devices->device_list_mutex);
3242

3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287
	/*
	 * We acquire fs_info->chunk_mutex for 2 reasons:
	 *
	 * 1) Just like with the first phase of the chunk allocation, we must
	 *    reserve system space, do all chunk btree updates and deletions, and
	 *    update the system chunk array in the superblock while holding this
	 *    mutex. This is for similar reasons as explained on the comment at
	 *    the top of btrfs_chunk_alloc();
	 *
	 * 2) Prevent races with the final phase of a device replace operation
	 *    that replaces the device object associated with the map's stripes,
	 *    because the device object's id can change at any time during that
	 *    final phase of the device replace operation
	 *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
	 *    replaced device and then see it with an ID of
	 *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
	 *    the device item, which does not exists on the chunk btree.
	 *    The finishing phase of device replace acquires both the
	 *    device_list_mutex and the chunk_mutex, in that order, so we are
	 *    safe by just acquiring the chunk_mutex.
	 */
	trans->removing_chunk = true;
	mutex_lock(&fs_info->chunk_mutex);

	check_system_chunk(trans, map->type);

	ret = remove_chunk_item(trans, map, chunk_offset);
	/*
	 * Normally we should not get -ENOSPC since we reserved space before
	 * through the call to check_system_chunk().
	 *
	 * Despite our system space_info having enough free space, we may not
	 * be able to allocate extents from its block groups, because all have
	 * an incompatible profile, which will force us to allocate a new system
	 * block group with the right profile, or right after we called
	 * check_system_chunk() above, a scrub turned the only system block group
	 * with enough free space into RO mode.
	 * This is explained with more detail at do_chunk_alloc().
	 *
	 * So if we get -ENOSPC, allocate a new system chunk and retry once.
	 */
	if (ret == -ENOSPC) {
		const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
		struct btrfs_block_group *sys_bg;

3288
		sys_bg = btrfs_create_chunk(trans, sys_flags);
3289 3290 3291 3292 3293 3294 3295
		if (IS_ERR(sys_bg)) {
			ret = PTR_ERR(sys_bg);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
3296 3297 3298
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
3299
		}
3300

3301 3302 3303 3304 3305 3306
		ret = remove_chunk_item(trans, map, chunk_offset);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	} else if (ret) {
3307
		btrfs_abort_transaction(trans, ret);
3308 3309
		goto out;
	}
3310

3311
	trace_btrfs_chunk_free(fs_info, map, chunk_offset, map->chunk_len);
3312

3313
	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3314
		ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
3315
		if (ret) {
3316
			btrfs_abort_transaction(trans, ret);
3317 3318
			goto out;
		}
3319 3320
	}

3321 3322 3323 3324 3325 3326 3327 3328 3329
	mutex_unlock(&fs_info->chunk_mutex);
	trans->removing_chunk = false;

	/*
	 * We are done with chunk btree updates and deletions, so release the
	 * system space we previously reserved (with check_system_chunk()).
	 */
	btrfs_trans_release_chunk_metadata(trans);

3330
	ret = btrfs_remove_block_group(trans, map);
3331
	if (ret) {
3332
		btrfs_abort_transaction(trans, ret);
3333 3334
		goto out;
	}
Yan Zheng's avatar
Yan Zheng committed
3335

3336
out:
3337 3338 3339 3340
	if (trans->removing_chunk) {
		mutex_unlock(&fs_info->chunk_mutex);
		trans->removing_chunk = false;
	}
Yan Zheng's avatar
Yan Zheng committed
3341
	/* once for us */
3342
	btrfs_free_chunk_map(map);
3343 3344
	return ret;
}
Yan Zheng's avatar
Yan Zheng committed
3345

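/*
 * Relocate all extents out of the chunk at @chunk_offset and then remove the
 * chunk: its device extents, chunk item and block group. Caller must hold
 * fs_info->reclaim_bgs_lock.
 */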
int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
3347
{
3348
	struct btrfs_root *root = fs_info->chunk_root;
3349
	struct btrfs_trans_handle *trans;
3350
	struct btrfs_block_group *block_group;
3351
	u64 length;
3352
	int ret;
Yan Zheng's avatar
Yan Zheng committed
3353

3354 3355 3356 3357 3358 3359
	if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
		btrfs_err(fs_info,
			  "relocate: not supported on extent tree v2 yet");
		return -EINVAL;
	}

3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371
	/*
	 * Prevent races with automatic removal of unused block groups.
	 * After we relocate and before we remove the chunk with offset
	 * chunk_offset, automatic removal of the block group can kick in,
	 * resulting in a failure when calling btrfs_remove_chunk() below.
	 *
	 * Make sure to acquire this mutex before doing a tree search (dev
	 * or chunk trees) to find chunks. Otherwise the cleaner kthread might
	 * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
	 * we release the path used to search the chunk/dev tree and before
	 * the current task acquires this mutex and calls us.
	 */
3372
	lockdep_assert_held(&fs_info->reclaim_bgs_lock);
3373

3374
	/* step one, relocate all the extents inside this chunk */
3375
	btrfs_scrub_pause(fs_info);
3376
	ret = btrfs_relocate_block_group(fs_info, chunk_offset);
3377
	btrfs_scrub_continue(fs_info);
3378 3379 3380 3381 3382 3383 3384
	if (ret) {
		/*
		 * If we had a transaction abort, stop all running scrubs.
		 * See transaction.c:cleanup_transaction() why we do it here.
		 */
		if (BTRFS_FS_ERROR(fs_info))
			btrfs_scrub_cancel(fs_info);
3385
		return ret;
3386
	}
3387

3388 3389 3390 3391
	block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
	if (!block_group)
		return -ENOENT;
	btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
3392
	length = block_group->length;
3393 3394
	btrfs_put_block_group(block_group);

3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408
	/*
	 * On a zoned file system, discard the whole block group, this will
	 * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
	 * resetting the zone fails, don't treat it as a fatal problem from the
	 * filesystem's point of view.
	 */
	if (btrfs_is_zoned(fs_info)) {
		ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
		if (ret)
			btrfs_info(fs_info,
				"failed to reset zone %llu after relocation",
				chunk_offset);
	}

3409 3410 3411 3412 3413 3414 3415 3416
	trans = btrfs_start_trans_remove_block_group(root->fs_info,
						     chunk_offset);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		btrfs_handle_fs_error(root->fs_info, ret, NULL);
		return ret;
	}

3417
	/*
3418 3419
	 * step two, delete the device extents and the
	 * chunk tree entries
3420
	 */
3421
	ret = btrfs_remove_chunk(trans, chunk_offset);
3422
	btrfs_end_transaction(trans);
3423
	return ret;
Yan Zheng's avatar
Yan Zheng committed
3424 3425
}

3426
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
Yan Zheng's avatar
Yan Zheng committed
3427
{
3428
	struct btrfs_root *chunk_root = fs_info->chunk_root;
Yan Zheng's avatar
Yan Zheng committed
3429 3430 3431 3432 3433 3434
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_chunk *chunk;
	struct btrfs_key key;
	struct btrfs_key found_key;
	u64 chunk_type;
3435 3436
	bool retried = false;
	int failed = 0;
Yan Zheng's avatar
Yan Zheng committed
3437 3438 3439 3440 3441 3442
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

3443
again:
Yan Zheng's avatar
Yan Zheng committed
3444 3445 3446 3447 3448
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

	while (1) {
3449
		mutex_lock(&fs_info->reclaim_bgs_lock);
Yan Zheng's avatar
Yan Zheng committed
3450
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3451
		if (ret < 0) {
3452
			mutex_unlock(&fs_info->reclaim_bgs_lock);
Yan Zheng's avatar
Yan Zheng committed
3453
			goto error;
3454
		}
3455 3456 3457 3458 3459 3460 3461 3462 3463
		if (ret == 0) {
			/*
			 * On the first search we would find chunk tree with
			 * offset -1, which is not possible. On subsequent
			 * loops this would find an existing item on an invalid
			 * offset (one less than the previous one, wrong
			 * alignment and size).
			 */
			ret = -EUCLEAN;
3464
			mutex_unlock(&fs_info->reclaim_bgs_lock);
3465 3466
			goto error;
		}
Yan Zheng's avatar
Yan Zheng committed
3467 3468 3469

		ret = btrfs_previous_item(chunk_root, path, key.objectid,
					  key.type);
3470
		if (ret)
3471
			mutex_unlock(&fs_info->reclaim_bgs_lock);
Yan Zheng's avatar
Yan Zheng committed
3472 3473 3474 3475
		if (ret < 0)
			goto error;
		if (ret > 0)
			break;
3476

Yan Zheng's avatar
Yan Zheng committed
3477 3478
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3479

Yan Zheng's avatar
Yan Zheng committed
3480 3481 3482
		chunk = btrfs_item_ptr(leaf, path->slots[0],
				       struct btrfs_chunk);
		chunk_type = btrfs_chunk_type(leaf, chunk);
3483
		btrfs_release_path(path);
3484

Yan Zheng's avatar
Yan Zheng committed
3485
		if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3486
			ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3487 3488
			if (ret == -ENOSPC)
				failed++;
HIMANGI SARAOGI's avatar
HIMANGI SARAOGI committed
3489 3490
			else
				BUG_ON(ret);
Yan Zheng's avatar
Yan Zheng committed
3491
		}
3492
		mutex_unlock(&fs_info->reclaim_bgs_lock);
3493

Yan Zheng's avatar
Yan Zheng committed
3494 3495 3496 3497 3498
		if (found_key.offset == 0)
			break;
		key.offset = found_key.offset - 1;
	}
	ret = 0;
3499 3500 3501 3502
	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
3503
	} else if (WARN_ON(failed && retried)) {
3504 3505
		ret = -ENOSPC;
	}
Yan Zheng's avatar
Yan Zheng committed
3506 3507 3508
error:
	btrfs_free_path(path);
	return ret;
3509 3510
}

3511 3512 3513 3514 3515 3516 3517 3518
/*
 * return 1 : allocate a data chunk successfully,
 * return <0: errors during allocating a data chunk,
 * return 0 : no need to allocate a data chunk.
 */
static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
				      u64 chunk_offset)
{
3519
	struct btrfs_block_group *cache;
3520 3521 3522 3523 3524 3525 3526 3527
	u64 bytes_used;
	u64 chunk_type;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
	ASSERT(cache);
	chunk_type = cache->flags;
	btrfs_put_block_group(cache);

3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547
	if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
		return 0;

	spin_lock(&fs_info->data_sinfo->lock);
	bytes_used = fs_info->data_sinfo->bytes_used;
	spin_unlock(&fs_info->data_sinfo->lock);

	if (!bytes_used) {
		struct btrfs_trans_handle *trans;
		int ret;

		trans =	btrfs_join_transaction(fs_info->tree_root);
		if (IS_ERR(trans))
			return PTR_ERR(trans);

		ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
		btrfs_end_transaction(trans);
		if (ret < 0)
			return ret;
		return 1;
3548
	}
3549

3550 3551 3552
	return 0;
}

3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590
static void btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
					   const struct btrfs_disk_balance_args *disk)
{
	memset(cpu, 0, sizeof(*cpu));

	cpu->profiles = le64_to_cpu(disk->profiles);
	cpu->usage = le64_to_cpu(disk->usage);
	cpu->devid = le64_to_cpu(disk->devid);
	cpu->pstart = le64_to_cpu(disk->pstart);
	cpu->pend = le64_to_cpu(disk->pend);
	cpu->vstart = le64_to_cpu(disk->vstart);
	cpu->vend = le64_to_cpu(disk->vend);
	cpu->target = le64_to_cpu(disk->target);
	cpu->flags = le64_to_cpu(disk->flags);
	cpu->limit = le64_to_cpu(disk->limit);
	cpu->stripes_min = le32_to_cpu(disk->stripes_min);
	cpu->stripes_max = le32_to_cpu(disk->stripes_max);
}

static void btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
					   const struct btrfs_balance_args *cpu)
{
	memset(disk, 0, sizeof(*disk));

	disk->profiles = cpu_to_le64(cpu->profiles);
	disk->usage = cpu_to_le64(cpu->usage);
	disk->devid = cpu_to_le64(cpu->devid);
	disk->pstart = cpu_to_le64(cpu->pstart);
	disk->pend = cpu_to_le64(cpu->pend);
	disk->vstart = cpu_to_le64(cpu->vstart);
	disk->vend = cpu_to_le64(cpu->vend);
	disk->target = cpu_to_le64(cpu->target);
	disk->flags = cpu_to_le64(cpu->flags);
	disk->limit = cpu_to_le64(cpu->limit);
	disk->stripes_min = cpu_to_le32(cpu->stripes_min);
	disk->stripes_max = cpu_to_le32(cpu->stripes_max);
}

3591
static int insert_balance_item(struct btrfs_fs_info *fs_info,
3592 3593
			       struct btrfs_balance_control *bctl)
{
3594
	struct btrfs_root *root = fs_info->tree_root;
3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613
	struct btrfs_trans_handle *trans;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
3614
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
3615 3616 3617 3618 3619 3620 3621 3622 3623 3624
	key.offset = 0;

	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      sizeof(*item));
	if (ret)
		goto out;

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

3625
	memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3626 3627 3628 3629 3630 3631 3632 3633 3634 3635

	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
	btrfs_set_balance_data(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
	btrfs_set_balance_meta(leaf, item, &disk_bargs);
	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
	btrfs_set_balance_sys(leaf, item, &disk_bargs);

	btrfs_set_balance_flags(leaf, item, bctl->flags);

3636
	btrfs_mark_buffer_dirty(trans, leaf);
3637 3638
out:
	btrfs_free_path(path);
3639
	err = btrfs_commit_transaction(trans);
3640 3641 3642 3643 3644
	if (err && !ret)
		ret = err;
	return ret;
}

3645
static int del_balance_item(struct btrfs_fs_info *fs_info)
3646
{
3647
	struct btrfs_root *root = fs_info->tree_root;
3648 3649 3650 3651 3652 3653 3654 3655 3656
	struct btrfs_trans_handle *trans;
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret, err;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

3657
	trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
3658 3659 3660 3661 3662 3663
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	key.objectid = BTRFS_BALANCE_OBJECTID;
3664
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677
	key.offset = 0;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret < 0)
		goto out;
	if (ret > 0) {
		ret = -ENOENT;
		goto out;
	}

	ret = btrfs_del_item(trans, root, path);
out:
	btrfs_free_path(path);
3678
	err = btrfs_commit_transaction(trans);
3679 3680 3681 3682 3683
	if (err && !ret)
		ret = err;
	return ret;
}

3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707
/*
 * This is a heuristic used to reduce the number of chunks balanced on
 * resume after balance was interrupted.
 */
static void update_balance_args(struct btrfs_balance_control *bctl)
{
	/*
	 * Turn on soft mode for chunk types that were being converted.
	 */
	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;

	/*
	 * Turn on usage filter if is not already used.  The idea is
	 * that chunks that we have already balanced should be
	 * reasonably full.  Don't do it for chunks that are being
	 * converted - that will keep us from relocating unconverted
	 * (albeit full) chunks.
	 */
	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3708
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3709 3710 3711 3712 3713
	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->data.usage = 90;
	}
	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3714
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3715 3716 3717 3718 3719
	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->sys.usage = 90;
	}
	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3720
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3721 3722 3723 3724 3725 3726
	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
		bctl->meta.usage = 90;
	}
}

3727 3728 3729 3730
/*
 * Clear the balance status in fs_info and delete the balance item from disk.
 */
static void reset_balance_state(struct btrfs_fs_info *fs_info)
3731 3732
{
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3733
	int ret;
3734

3735
	ASSERT(fs_info->balance_ctl);
3736 3737 3738 3739 3740 3741

	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = NULL;
	spin_unlock(&fs_info->balance_lock);

	kfree(bctl);
3742 3743 3744
	ret = del_balance_item(fs_info);
	if (ret)
		btrfs_handle_fs_error(fs_info, ret, NULL);
3745 3746
}

Ilya Dryomov's avatar
Ilya Dryomov committed
3747 3748 3749 3750
/*
 * Balance filters.  Return 1 if chunk should be filtered out
 * (should not be balanced).
 */
3751
static int chunk_profiles_filter(u64 chunk_type,
Ilya Dryomov's avatar
Ilya Dryomov committed
3752 3753
				 struct btrfs_balance_args *bargs)
{
3754 3755
	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;
Ilya Dryomov's avatar
Ilya Dryomov committed
3756

3757
	if (bargs->profiles & chunk_type)
Ilya Dryomov's avatar
Ilya Dryomov committed
3758 3759 3760 3761 3762
		return 0;

	return 1;
}

3763
static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
Ilya Dryomov's avatar
Ilya Dryomov committed
3764
			      struct btrfs_balance_args *bargs)
3765
{
3766
	struct btrfs_block_group *cache;
3767 3768 3769 3770 3771 3772
	u64 chunk_used;
	u64 user_thresh_min;
	u64 user_thresh_max;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3773
	chunk_used = cache->used;
3774 3775 3776 3777

	if (bargs->usage_min == 0)
		user_thresh_min = 0;
	else
3778
		user_thresh_min = mult_perc(cache->length, bargs->usage_min);
3779 3780 3781 3782

	if (bargs->usage_max == 0)
		user_thresh_max = 1;
	else if (bargs->usage_max > 100)
3783
		user_thresh_max = cache->length;
3784
	else
3785
		user_thresh_max = mult_perc(cache->length, bargs->usage_max);
3786 3787 3788 3789 3790 3791 3792 3793

	if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}

3794
static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3795
		u64 chunk_offset, struct btrfs_balance_args *bargs)
Ilya Dryomov's avatar
Ilya Dryomov committed
3796
{
3797
	struct btrfs_block_group *cache;
Ilya Dryomov's avatar
Ilya Dryomov committed
3798 3799 3800 3801
	u64 chunk_used, user_thresh;
	int ret = 1;

	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3802
	chunk_used = cache->used;
Ilya Dryomov's avatar
Ilya Dryomov committed
3803

3804
	if (bargs->usage_min == 0)
3805
		user_thresh = 1;
3806
	else if (bargs->usage > 100)
3807
		user_thresh = cache->length;
3808
	else
3809
		user_thresh = mult_perc(cache->length, bargs->usage);
3810

Ilya Dryomov's avatar
Ilya Dryomov committed
3811 3812 3813 3814 3815 3816 3817
	if (chunk_used < user_thresh)
		ret = 0;

	btrfs_put_block_group(cache);
	return ret;
}

Ilya Dryomov's avatar
Ilya Dryomov committed
3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834
static int chunk_devid_filter(struct extent_buffer *leaf,
			      struct btrfs_chunk *chunk,
			      struct btrfs_balance_args *bargs)
{
	struct btrfs_stripe *stripe;
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	int i;

	for (i = 0; i < num_stripes; i++) {
		stripe = btrfs_stripe_nr(chunk, i);
		if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
			return 0;
	}

	return 1;
}

3835 3836 3837 3838 3839 3840
static u64 calc_data_stripes(u64 type, int num_stripes)
{
	const int index = btrfs_bg_flags_to_raid_index(type);
	const int ncopies = btrfs_raid_array[index].ncopies;
	const int nparity = btrfs_raid_array[index].nparity;

3841
	return (num_stripes - nparity) / ncopies;
3842 3843
}

Ilya Dryomov's avatar
Ilya Dryomov committed
3844 3845 3846 3847 3848 3849 3850 3851 3852
/* [pstart, pend) */
static int chunk_drange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       struct btrfs_balance_args *bargs)
{
	struct btrfs_stripe *stripe;
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	u64 stripe_offset;
	u64 stripe_length;
3853
	u64 type;
Ilya Dryomov's avatar
Ilya Dryomov committed
3854 3855 3856 3857 3858 3859
	int factor;
	int i;

	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
		return 0;

3860 3861
	type = btrfs_chunk_type(leaf, chunk);
	factor = calc_data_stripes(type, num_stripes);
Ilya Dryomov's avatar
Ilya Dryomov committed
3862 3863 3864 3865 3866 3867 3868 3869

	for (i = 0; i < num_stripes; i++) {
		stripe = btrfs_stripe_nr(chunk, i);
		if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
			continue;

		stripe_offset = btrfs_stripe_offset(leaf, stripe);
		stripe_length = btrfs_chunk_length(leaf, chunk);
3870
		stripe_length = div_u64(stripe_length, factor);
Ilya Dryomov's avatar
Ilya Dryomov committed
3871 3872 3873 3874 3875 3876 3877 3878 3879

		if (stripe_offset < bargs->pend &&
		    stripe_offset + stripe_length > bargs->pstart)
			return 0;
	}

	return 1;
}

3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893
/* [vstart, vend) */
static int chunk_vrange_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       u64 chunk_offset,
			       struct btrfs_balance_args *bargs)
{
	if (chunk_offset < bargs->vend &&
	    chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
		/* at least part of the chunk is inside this vrange */
		return 0;

	return 1;
}

3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906
static int chunk_stripes_range_filter(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk,
			       struct btrfs_balance_args *bargs)
{
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

	if (bargs->stripes_min <= num_stripes
			&& num_stripes <= bargs->stripes_max)
		return 0;

	return 1;
}

3907
static int chunk_soft_convert_filter(u64 chunk_type,
3908 3909 3910 3911 3912
				     struct btrfs_balance_args *bargs)
{
	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
		return 0;

3913 3914
	chunk_type = chunk_to_extended(chunk_type) &
				BTRFS_EXTENDED_PROFILE_MASK;
3915

3916
	if (bargs->target == chunk_type)
3917 3918 3919 3920 3921
		return 1;

	return 0;
}

3922
static int should_balance_chunk(struct extent_buffer *leaf,
3923 3924
				struct btrfs_chunk *chunk, u64 chunk_offset)
{
3925
	struct btrfs_fs_info *fs_info = leaf->fs_info;
3926
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942
	struct btrfs_balance_args *bargs = NULL;
	u64 chunk_type = btrfs_chunk_type(leaf, chunk);

	/* type filter */
	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
		return 0;
	}

	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
		bargs = &bctl->data;
	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
		bargs = &bctl->sys;
	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
		bargs = &bctl->meta;

Ilya Dryomov's avatar
Ilya Dryomov committed
3943 3944 3945 3946
	/* profiles filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
	    chunk_profiles_filter(chunk_type, bargs)) {
		return 0;
Ilya Dryomov's avatar
Ilya Dryomov committed
3947 3948 3949 3950
	}

	/* usage filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3951
	    chunk_usage_filter(fs_info, chunk_offset, bargs)) {
Ilya Dryomov's avatar
Ilya Dryomov committed
3952
		return 0;
3953
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3954
	    chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3955
		return 0;
Ilya Dryomov's avatar
Ilya Dryomov committed
3956 3957 3958 3959 3960 3961
	}

	/* devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
	    chunk_devid_filter(leaf, chunk, bargs)) {
		return 0;
Ilya Dryomov's avatar
Ilya Dryomov committed
3962 3963 3964 3965
	}

	/* drange filter, makes sense only with devid filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3966
	    chunk_drange_filter(leaf, chunk, bargs)) {
Ilya Dryomov's avatar
Ilya Dryomov committed
3967
		return 0;
3968 3969 3970 3971 3972 3973
	}

	/* vrange filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
		return 0;
Ilya Dryomov's avatar
Ilya Dryomov committed
3974 3975
	}

3976 3977 3978 3979 3980 3981
	/* stripes filter */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
	    chunk_stripes_range_filter(leaf, chunk, bargs)) {
		return 0;
	}

3982 3983 3984 3985 3986 3987
	/* soft profile changing mode */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
	    chunk_soft_convert_filter(chunk_type, bargs)) {
		return 0;
	}

3988 3989 3990 3991 3992 3993 3994 3995
	/*
	 * limited by count, must be the last filter
	 */
	if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
		if (bargs->limit == 0)
			return 0;
		else
			bargs->limit--;
3996 3997 3998
	} else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
		/*
		 * Same logic as the 'limit' filter; the minimum cannot be
3999
		 * determined here because we do not have the global information
4000 4001 4002 4003 4004 4005
		 * about the count of all chunks that satisfy the filters.
		 */
		if (bargs->limit_max == 0)
			return 0;
		else
			bargs->limit_max--;
4006 4007
	}

4008 4009 4010
	return 1;
}

4011
static int __btrfs_balance(struct btrfs_fs_info *fs_info)
4012
{
4013
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4014
	struct btrfs_root *chunk_root = fs_info->chunk_root;
4015
	u64 chunk_type;
4016
	struct btrfs_chunk *chunk;
4017
	struct btrfs_path *path = NULL;
4018 4019
	struct btrfs_key key;
	struct btrfs_key found_key;
4020 4021
	struct extent_buffer *leaf;
	int slot;
4022 4023
	int ret;
	int enospc_errors = 0;
4024
	bool counting = true;
4025
	/* The single value limit and min/max limits use the same bytes in the */
4026 4027 4028
	u64 limit_data = bctl->data.limit;
	u64 limit_meta = bctl->meta.limit;
	u64 limit_sys = bctl->sys.limit;
4029 4030 4031
	u32 count_data = 0;
	u32 count_meta = 0;
	u32 count_sys = 0;
4032
	int chunk_reserved = 0;
4033 4034

	path = btrfs_alloc_path();
4035 4036 4037 4038
	if (!path) {
		ret = -ENOMEM;
		goto error;
	}
4039 4040 4041 4042 4043 4044

	/* zero out stat counters */
	spin_lock(&fs_info->balance_lock);
	memset(&bctl->stat, 0, sizeof(bctl->stat));
	spin_unlock(&fs_info->balance_lock);
again:
4045
	if (!counting) {
4046 4047 4048 4049
		/*
		 * The single value limit and min/max limits use the same bytes
		 * in the
		 */
4050 4051 4052 4053
		bctl->data.limit = limit_data;
		bctl->meta.limit = limit_meta;
		bctl->sys.limit = limit_sys;
	}
4054 4055 4056 4057
	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.offset = (u64)-1;
	key.type = BTRFS_CHUNK_ITEM_KEY;

4058
	while (1) {
4059
		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
4060
		    atomic_read(&fs_info->balance_cancel_req)) {
4061 4062 4063 4064
			ret = -ECANCELED;
			goto error;
		}

4065
		mutex_lock(&fs_info->reclaim_bgs_lock);
4066
		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
4067
		if (ret < 0) {
4068
			mutex_unlock(&fs_info->reclaim_bgs_lock);
4069
			goto error;
4070
		}
4071 4072 4073 4074 4075 4076

		/*
		 * this shouldn't happen, it means the last relocate
		 * failed
		 */
		if (ret == 0)
4077
			BUG(); /* FIXME break ? */
4078 4079 4080

		ret = btrfs_previous_item(chunk_root, path, 0,
					  BTRFS_CHUNK_ITEM_KEY);
4081
		if (ret) {
4082
			mutex_unlock(&fs_info->reclaim_bgs_lock);
4083
			ret = 0;
4084
			break;
4085
		}
4086

4087 4088 4089
		leaf = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
4090

4091
		if (found_key.objectid != key.objectid) {
4092
			mutex_unlock(&fs_info->reclaim_bgs_lock);
4093
			break;
4094
		}
4095

4096
		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
4097
		chunk_type = btrfs_chunk_type(leaf, chunk);
4098

4099 4100 4101 4102 4103 4104
		if (!counting) {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.considered++;
			spin_unlock(&fs_info->balance_lock);
		}

4105
		ret = should_balance_chunk(leaf, chunk, found_key.offset);
4106

4107
		btrfs_release_path(path);
4108
		if (!ret) {
4109
			mutex_unlock(&fs_info->reclaim_bgs_lock);
4110
			goto loop;
4111
		}
4112

4113
		if (counting) {
4114
			mutex_unlock(&fs_info->reclaim_bgs_lock);
4115 4116 4117
			spin_lock(&fs_info->balance_lock);
			bctl->stat.expected++;
			spin_unlock(&fs_info->balance_lock);
4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138

			if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
				count_data++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
				count_sys++;
			else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
				count_meta++;

			goto loop;
		}

		/*
		 * Apply limit_min filter, no need to check if the LIMITS
		 * filter is used, limit_min is 0 by default
		 */
		if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
					count_data < bctl->data.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
					count_meta < bctl->meta.limit_min)
				|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
					count_sys < bctl->sys.limit_min)) {
4139
			mutex_unlock(&fs_info->reclaim_bgs_lock);
4140 4141 4142
			goto loop;
		}

4143 4144 4145 4146 4147 4148 4149 4150 4151
		if (!chunk_reserved) {
			/*
			 * We may be relocating the only data chunk we have,
			 * which could potentially end up with losing data's
			 * raid profile, so lets allocate an empty one in
			 * advance.
			 */
			ret = btrfs_may_alloc_data_chunk(fs_info,
							 found_key.offset);
4152
			if (ret < 0) {
4153
				mutex_unlock(&fs_info->reclaim_bgs_lock);
4154
				goto error;
4155 4156
			} else if (ret == 1) {
				chunk_reserved = 1;
4157 4158 4159
			}
		}

4160
		ret = btrfs_relocate_chunk(fs_info, found_key.offset);
4161
		mutex_unlock(&fs_info->reclaim_bgs_lock);
4162
		if (ret == -ENOSPC) {
4163
			enospc_errors++;
4164 4165 4166 4167 4168 4169 4170
		} else if (ret == -ETXTBSY) {
			btrfs_info(fs_info,
	   "skipping relocation of block group %llu due to active swapfile",
				   found_key.offset);
			ret = 0;
		} else if (ret) {
			goto error;
4171 4172 4173 4174 4175
		} else {
			spin_lock(&fs_info->balance_lock);
			bctl->stat.completed++;
			spin_unlock(&fs_info->balance_lock);
		}
4176
loop:
4177 4178
		if (found_key.offset == 0)
			break;
4179
		key.offset = found_key.offset - 1;
4180
	}
4181

4182 4183 4184 4185 4186
	if (counting) {
		btrfs_release_path(path);
		counting = false;
		goto again;
	}
4187 4188
error:
	btrfs_free_path(path);
4189
	if (enospc_errors) {
4190
		btrfs_info(fs_info, "%d enospc errors during balance",
4191
			   enospc_errors);
4192 4193 4194 4195
		if (!ret)
			ret = -ENOSPC;
	}

4196 4197 4198
	return ret;
}

4199 4200 4201 4202 4203
/*
 * See if a given profile is valid and reduced.
 *
 * @flags:     profile to validate
 * @extended:  if true @flags is treated as an extended profile
4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219
 */
static int alloc_profile_is_valid(u64 flags, int extended)
{
	u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
			       BTRFS_BLOCK_GROUP_PROFILE_MASK);

	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;

	/* 1) check that all other bits are zeroed */
	if (flags & ~mask)
		return 0;

	/* 2) see if profile is reduced */
	if (flags == 0)
		return !extended; /* "0" is valid for usual profiles */

4220
	return has_single_bit_set(flags);
4221 4222
}

4223 4224 4225 4226 4227 4228 4229
/*
 * Validate target profile against allowed profiles and return true if it's OK.
 * Otherwise print the error message and return false.
 */
static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
		const struct btrfs_balance_args *bargs,
		u64 allowed, const char *type)
4230
{
4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241
	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
		return true;

	/* Profile is valid and does not have bits outside of the allowed set */
	if (alloc_profile_is_valid(bargs->target, 1) &&
	    (bargs->target & ~allowed) == 0)
		return true;

	btrfs_err(fs_info, "balance: invalid convert %s profile %s",
			type, btrfs_bg_type_to_raid_name(bargs->target));
	return false;
4242 4243
}

4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287
/*
 * Fill @buf with textual description of balance filter flags @bargs, up to
 * @size_buf including the terminating null. The output may be trimmed if it
 * does not fit into the provided buffer.
 */
static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
				 u32 size_buf)
{
	int ret;
	u32 size_bp = size_buf;
	char *bp = buf;
	u64 flags = bargs->flags;
	char tmp_buf[128] = {'\0'};

	if (!flags)
		return;

#define CHECK_APPEND_NOARG(a)						\
	do {								\
		ret = snprintf(bp, size_bp, (a));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

#define CHECK_APPEND_1ARG(a, v1)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

#define CHECK_APPEND_2ARG(a, v1, v2)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1), (v2));		\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

4288 4289 4290
	if (flags & BTRFS_BALANCE_ARGS_CONVERT)
		CHECK_APPEND_1ARG("convert=%s,",
				  btrfs_bg_type_to_raid_name(bargs->target));
4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397

	if (flags & BTRFS_BALANCE_ARGS_SOFT)
		CHECK_APPEND_NOARG("soft,");

	if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
		btrfs_describe_block_groups(bargs->profiles, tmp_buf,
					    sizeof(tmp_buf));
		CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
	}

	if (flags & BTRFS_BALANCE_ARGS_USAGE)
		CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);

	if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
		CHECK_APPEND_2ARG("usage=%u..%u,",
				  bargs->usage_min, bargs->usage_max);

	if (flags & BTRFS_BALANCE_ARGS_DEVID)
		CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);

	if (flags & BTRFS_BALANCE_ARGS_DRANGE)
		CHECK_APPEND_2ARG("drange=%llu..%llu,",
				  bargs->pstart, bargs->pend);

	if (flags & BTRFS_BALANCE_ARGS_VRANGE)
		CHECK_APPEND_2ARG("vrange=%llu..%llu,",
				  bargs->vstart, bargs->vend);

	if (flags & BTRFS_BALANCE_ARGS_LIMIT)
		CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);

	if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
		CHECK_APPEND_2ARG("limit=%u..%u,",
				bargs->limit_min, bargs->limit_max);

	if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
		CHECK_APPEND_2ARG("stripes=%u..%u,",
				  bargs->stripes_min, bargs->stripes_max);

#undef CHECK_APPEND_2ARG
#undef CHECK_APPEND_1ARG
#undef CHECK_APPEND_NOARG

out_overflow:

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
	else
		buf[0] = '\0';
}

static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
{
	u32 size_buf = 1024;
	char tmp_buf[192] = {'\0'};
	char *buf;
	char *bp;
	u32 size_bp = size_buf;
	int ret;
	struct btrfs_balance_control *bctl = fs_info->balance_ctl;

	buf = kzalloc(size_buf, GFP_KERNEL);
	if (!buf)
		return;

	bp = buf;

#define CHECK_APPEND_1ARG(a, v1)					\
	do {								\
		ret = snprintf(bp, size_bp, (a), (v1));			\
		if (ret < 0 || ret >= size_bp)				\
			goto out_overflow;				\
		size_bp -= ret;						\
		bp += ret;						\
	} while (0)

	if (bctl->flags & BTRFS_BALANCE_FORCE)
		CHECK_APPEND_1ARG("%s", "-f ");

	if (bctl->flags & BTRFS_BALANCE_DATA) {
		describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-d%s ", tmp_buf);
	}

	if (bctl->flags & BTRFS_BALANCE_METADATA) {
		describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-m%s ", tmp_buf);
	}

	if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
		describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
		CHECK_APPEND_1ARG("-s%s ", tmp_buf);
	}

#undef CHECK_APPEND_1ARG

out_overflow:

	if (size_bp < size_buf)
		buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
	btrfs_info(fs_info, "balance: %s %s",
		   (bctl->flags & BTRFS_BALANCE_RESUME) ?
		   "resume" : "start", buf);

	kfree(buf);
}

4398
/*
4399
 * Should be called with balance mutexe held
4400
 */
4401 4402
int btrfs_balance(struct btrfs_fs_info *fs_info,
		  struct btrfs_balance_control *bctl,
4403 4404
		  struct btrfs_ioctl_balance_args *bargs)
{
4405
	u64 meta_target, data_target;
4406
	u64 allowed;
4407
	int mixed = 0;
4408
	int ret;
4409
	u64 num_devices;
4410
	unsigned seq;
4411
	bool reducing_redundancy;
4412
	bool paused = false;
4413
	int i;
4414

4415
	if (btrfs_fs_closing(fs_info) ||
4416
	    atomic_read(&fs_info->balance_pause_req) ||
4417
	    btrfs_should_cancel_balance(fs_info)) {
4418 4419 4420 4421
		ret = -EINVAL;
		goto out;
	}

4422 4423 4424 4425
	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
		mixed = 1;

4426 4427 4428 4429
	/*
	 * In case of mixed groups both data and meta should be picked,
	 * and identical options should be given for both of them.
	 */
4430 4431
	allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
	if (mixed && (bctl->flags & allowed)) {
4432 4433 4434
		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
4435
			btrfs_err(fs_info,
4436
	  "balance: mixed groups data and metadata options must be the same");
4437 4438 4439 4440 4441
			ret = -EINVAL;
			goto out;
		}
	}

4442 4443
	/*
	 * rw_devices will not change at the moment, device add/delete/replace
4444
	 * are exclusive
4445 4446
	 */
	num_devices = fs_info->fs_devices->rw_devices;
4447 4448 4449 4450 4451 4452 4453

	/*
	 * SINGLE profile on-disk has no profile bit, but in-memory we have a
	 * special bit for it, to make it easier to distinguish.  Thus we need
	 * to set it manually, or balance would refuse the profile.
	 */
	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
4454 4455 4456
	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
		if (num_devices >= btrfs_raid_array[i].devs_min)
			allowed |= btrfs_raid_array[i].bg_flag;
4457

4458 4459 4460
	if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
	    !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
	    !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
4461 4462 4463 4464
		ret = -EINVAL;
		goto out;
	}

4465 4466 4467 4468 4469 4470 4471 4472 4473 4474
	/*
	 * Allow to reduce metadata or system integrity only if force set for
	 * profiles with redundancy (copies, parity)
	 */
	allowed = 0;
	for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
		if (btrfs_raid_array[i].ncopies >= 2 ||
		    btrfs_raid_array[i].tolerated_failures >= 1)
			allowed |= btrfs_raid_array[i].bg_flag;
	}
4475 4476 4477 4478 4479 4480 4481 4482
	do {
		seq = read_seqbegin(&fs_info->profiles_lock);

		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_system_alloc_bits & allowed) &&
		     !(bctl->sys.target & allowed)) ||
		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
		     (fs_info->avail_metadata_alloc_bits & allowed) &&
4483
		     !(bctl->meta.target & allowed)))
4484
			reducing_redundancy = true;
4485
		else
4486
			reducing_redundancy = false;
4487 4488 4489 4490 4491 4492

		/* if we're not converting, the target field is uninitialized */
		meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
			bctl->meta.target : fs_info->avail_metadata_alloc_bits;
		data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
			bctl->data.target : fs_info->avail_data_alloc_bits;
4493
	} while (read_seqretry(&fs_info->profiles_lock, seq));
4494

4495
	if (reducing_redundancy) {
4496 4497
		if (bctl->flags & BTRFS_BALANCE_FORCE) {
			btrfs_info(fs_info,
4498
			   "balance: force reducing metadata redundancy");
4499 4500
		} else {
			btrfs_err(fs_info,
4501
	"balance: reduces metadata redundancy, use --force if you want this");
4502 4503 4504 4505 4506
			ret = -EINVAL;
			goto out;
		}
	}

4507 4508
	if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
		btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
4509
		btrfs_warn(fs_info,
4510
	"balance: metadata profile %s has lower redundancy than data profile %s",
4511 4512
				btrfs_bg_type_to_raid_name(meta_target),
				btrfs_bg_type_to_raid_name(data_target));
4513 4514
	}

4515
	ret = insert_balance_item(fs_info, bctl);
4516
	if (ret && ret != -EEXIST)
4517 4518
		goto out;

4519 4520
	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
		BUG_ON(ret == -EEXIST);
4521 4522 4523 4524
		BUG_ON(fs_info->balance_ctl);
		spin_lock(&fs_info->balance_lock);
		fs_info->balance_ctl = bctl;
		spin_unlock(&fs_info->balance_lock);
4525 4526 4527 4528 4529 4530
	} else {
		BUG_ON(ret != -EEXIST);
		spin_lock(&fs_info->balance_lock);
		update_balance_args(bctl);
		spin_unlock(&fs_info->balance_lock);
	}
4531

4532 4533
	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
	set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4534
	describe_balance_start_or_resume(fs_info);
4535 4536 4537 4538 4539
	mutex_unlock(&fs_info->balance_mutex);

	ret = __btrfs_balance(fs_info);

	mutex_lock(&fs_info->balance_mutex);
4540
	if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) {
4541
		btrfs_info(fs_info, "balance: paused");
4542
		btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
4543
		paused = true;
4544
	}
4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560
	/*
	 * Balance can be canceled by:
	 *
	 * - Regular cancel request
	 *   Then ret == -ECANCELED and balance_cancel_req > 0
	 *
	 * - Fatal signal to "btrfs" process
	 *   Either the signal caught by wait_reserve_ticket() and callers
	 *   got -EINTR, or caught by btrfs_should_cancel_balance() and
	 *   got -ECANCELED.
	 *   Either way, in this case balance_cancel_req = 0, and
	 *   ret == -EINTR or ret == -ECANCELED.
	 *
	 * So here we only check the return value to catch canceled balance.
	 */
	else if (ret == -ECANCELED || ret == -EINTR)
4561 4562 4563 4564
		btrfs_info(fs_info, "balance: canceled");
	else
		btrfs_info(fs_info, "balance: ended with status: %d", ret);

4565
	clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4566 4567 4568

	if (bargs) {
		memset(bargs, 0, sizeof(*bargs));
4569
		btrfs_update_ioctl_balance_args(fs_info, bargs);
4570 4571
	}

4572 4573
	/* We didn't pause, we can clean everything up. */
	if (!paused) {
4574
		reset_balance_state(fs_info);
4575
		btrfs_exclop_finish(fs_info);
4576 4577
	}

4578
	wake_up(&fs_info->balance_wait_q);
4579 4580 4581

	return ret;
out:
4582
	if (bctl->flags & BTRFS_BALANCE_RESUME)
4583
		reset_balance_state(fs_info);
4584
	else
4585
		kfree(bctl);
4586
	btrfs_exclop_finish(fs_info);
4587

4588 4589 4590 4591 4592
	return ret;
}

static int balance_kthread(void *data)
{
4593
	struct btrfs_fs_info *fs_info = data;
4594
	int ret = 0;
4595

4596
	sb_start_write(fs_info->sb);
4597
	mutex_lock(&fs_info->balance_mutex);
4598
	if (fs_info->balance_ctl)
4599
		ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
4600
	mutex_unlock(&fs_info->balance_mutex);
4601
	sb_end_write(fs_info->sb);
4602

4603 4604 4605
	return ret;
}

4606 4607 4608 4609
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
{
	struct task_struct *tsk;

4610
	mutex_lock(&fs_info->balance_mutex);
4611
	if (!fs_info->balance_ctl) {
4612
		mutex_unlock(&fs_info->balance_mutex);
4613 4614
		return 0;
	}
4615
	mutex_unlock(&fs_info->balance_mutex);
4616

4617
	if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
4618
		btrfs_info(fs_info, "balance: resume skipped");
4619 4620 4621
		return 0;
	}

4622 4623 4624 4625
	spin_lock(&fs_info->super_lock);
	ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
	fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
	spin_unlock(&fs_info->super_lock);
4626 4627 4628 4629 4630 4631 4632 4633 4634
	/*
	 * A ro->rw remount sequence should continue with the paused balance
	 * regardless of who pauses it, system or the user as of now, so set
	 * the resume flag.
	 */
	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
	spin_unlock(&fs_info->balance_lock);

4635
	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
4636
	return PTR_ERR_OR_ZERO(tsk);
4637 4638
}

4639
int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653
{
	struct btrfs_balance_control *bctl;
	struct btrfs_balance_item *item;
	struct btrfs_disk_balance_args disk_bargs;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = BTRFS_BALANCE_OBJECTID;
4654
	key.type = BTRFS_TEMPORARY_ITEM_KEY;
4655 4656
	key.offset = 0;

4657
	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4658
	if (ret < 0)
4659
		goto out;
4660 4661
	if (ret > 0) { /* ret = -ENOENT; */
		ret = 0;
4662 4663 4664 4665 4666 4667 4668
		goto out;
	}

	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
	if (!bctl) {
		ret = -ENOMEM;
		goto out;
4669 4670 4671 4672 4673
	}

	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);

4674 4675
	bctl->flags = btrfs_balance_flags(leaf, item);
	bctl->flags |= BTRFS_BALANCE_RESUME;
4676 4677 4678 4679 4680 4681 4682 4683

	btrfs_balance_data(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
	btrfs_balance_meta(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
	btrfs_balance_sys(leaf, item, &disk_bargs);
	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);

4684 4685 4686 4687 4688 4689 4690 4691 4692 4693
	/*
	 * This should never happen, as the paused balance state is recovered
	 * during mount without any chance of other exclusive ops to collide.
	 *
	 * This gives the exclusive op status to balance and keeps in paused
	 * state until user intervention (cancel or umount). If the ownership
	 * cannot be assigned, show a message but do not fail. The balance
	 * is in a paused state and must have fs_info::balance_ctl properly
	 * set up.
	 */
4694
	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED))
4695
		btrfs_warn(fs_info,
4696
	"balance: cannot set exclusive op status, resume manually");
4697

4698 4699
	btrfs_release_path(path);

4700
	mutex_lock(&fs_info->balance_mutex);
4701 4702 4703 4704
	BUG_ON(fs_info->balance_ctl);
	spin_lock(&fs_info->balance_lock);
	fs_info->balance_ctl = bctl;
	spin_unlock(&fs_info->balance_lock);
4705
	mutex_unlock(&fs_info->balance_mutex);
4706 4707
out:
	btrfs_free_path(path);
4708 4709 4710
	return ret;
}

4711 4712 4713 4714 4715 4716 4717 4718 4719 4720
int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
{
	int ret = 0;

	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

4721
	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4722 4723 4724 4725
		atomic_inc(&fs_info->balance_pause_req);
		mutex_unlock(&fs_info->balance_mutex);

		wait_event(fs_info->balance_wait_q,
4726
			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4727 4728 4729

		mutex_lock(&fs_info->balance_mutex);
		/* we are good with balance_ctl ripped off from under us */
4730
		BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4731 4732 4733 4734 4735 4736 4737 4738 4739
		atomic_dec(&fs_info->balance_pause_req);
	} else {
		ret = -ENOTCONN;
	}

	mutex_unlock(&fs_info->balance_mutex);
	return ret;
}

4740 4741 4742 4743 4744 4745 4746 4747
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
{
	mutex_lock(&fs_info->balance_mutex);
	if (!fs_info->balance_ctl) {
		mutex_unlock(&fs_info->balance_mutex);
		return -ENOTCONN;
	}

4748 4749 4750 4751 4752 4753 4754 4755 4756 4757
	/*
	 * A paused balance with the item stored on disk can be resumed at
	 * mount time if the mount is read-write. Otherwise it's still paused
	 * and we must not allow cancelling as it deletes the item.
	 */
	if (sb_rdonly(fs_info->sb)) {
		mutex_unlock(&fs_info->balance_mutex);
		return -EROFS;
	}

4758 4759 4760 4761 4762
	atomic_inc(&fs_info->balance_cancel_req);
	/*
	 * if we are running just wait and return, balance item is
	 * deleted in btrfs_balance in this case
	 */
4763
	if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4764 4765
		mutex_unlock(&fs_info->balance_mutex);
		wait_event(fs_info->balance_wait_q,
4766
			   !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4767 4768 4769
		mutex_lock(&fs_info->balance_mutex);
	} else {
		mutex_unlock(&fs_info->balance_mutex);
4770 4771 4772 4773
		/*
		 * Lock released to allow other waiters to continue, we'll
		 * reexamine the status again.
		 */
4774 4775
		mutex_lock(&fs_info->balance_mutex);

4776
		if (fs_info->balance_ctl) {
4777
			reset_balance_state(fs_info);
4778
			btrfs_exclop_finish(fs_info);
4779
			btrfs_info(fs_info, "balance: canceled");
4780
		}
4781 4782
	}

4783
	ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4784 4785 4786 4787 4788
	atomic_dec(&fs_info->balance_cancel_req);
	mutex_unlock(&fs_info->balance_mutex);
	return 0;
}

4789 4790 4791 4792 4793 4794 4795
/*
 * shrinking a device means finding all of the device extents past
 * the new size, and then following the back refs to the chunks.
 * The chunk relocation code actually frees the device extent
 */
int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
{
4796 4797
	struct btrfs_fs_info *fs_info = device->fs_info;
	struct btrfs_root *root = fs_info->dev_root;
4798 4799 4800 4801 4802 4803 4804
	struct btrfs_trans_handle *trans;
	struct btrfs_dev_extent *dev_extent = NULL;
	struct btrfs_path *path;
	u64 length;
	u64 chunk_offset;
	int ret;
	int slot;
4805 4806
	int failed = 0;
	bool retried = false;
4807 4808
	struct extent_buffer *l;
	struct btrfs_key key;
4809
	struct btrfs_super_block *super_copy = fs_info->super_copy;
4810
	u64 old_total = btrfs_super_total_bytes(super_copy);
4811
	u64 old_size = btrfs_device_get_total_bytes(device);
4812
	u64 diff;
4813
	u64 start;
4814
	u64 free_diff = 0;
4815 4816

	new_size = round_down(new_size, fs_info->sectorsize);
4817
	start = new_size;
4818
	diff = round_down(old_size - new_size, fs_info->sectorsize);
4819

4820
	if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4821 4822
		return -EINVAL;

4823 4824 4825 4826
	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

4827
	path->reada = READA_BACK;
4828

4829 4830 4831 4832 4833 4834
	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

4835
	mutex_lock(&fs_info->chunk_mutex);
4836

4837
	btrfs_device_set_total_bytes(device, new_size);
4838
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
Yan Zheng's avatar
Yan Zheng committed
4839
		device->fs_devices->total_rw_bytes -= diff;
4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852

		/*
		 * The new free_chunk_space is new_size - used, so we have to
		 * subtract the delta of the old free_chunk_space which included
		 * old_size - used.  If used > new_size then just subtract this
		 * entire device's free space.
		 */
		if (device->bytes_used < new_size)
			free_diff = (old_size - device->bytes_used) -
				    (new_size - device->bytes_used);
		else
			free_diff = old_size - device->bytes_used;
		atomic64_sub(free_diff, &fs_info->free_chunk_space);
4853
	}
4854 4855 4856 4857 4858 4859

	/*
	 * Once the device's size has been set to the new size, ensure all
	 * in-memory chunks are synced to disk so that the loop below sees them
	 * and relocates them accordingly.
	 */
4860
	if (contains_pending_extent(device, &start, diff)) {
4861 4862 4863 4864 4865 4866 4867 4868
		mutex_unlock(&fs_info->chunk_mutex);
		ret = btrfs_commit_transaction(trans);
		if (ret)
			goto done;
	} else {
		mutex_unlock(&fs_info->chunk_mutex);
		btrfs_end_transaction(trans);
	}
4869

4870
again:
4871 4872 4873 4874
	key.objectid = device->devid;
	key.offset = (u64)-1;
	key.type = BTRFS_DEV_EXTENT_KEY;

4875
	do {
4876
		mutex_lock(&fs_info->reclaim_bgs_lock);
4877
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4878
		if (ret < 0) {
4879
			mutex_unlock(&fs_info->reclaim_bgs_lock);
4880
			goto done;
4881
		}
4882 4883 4884

		ret = btrfs_previous_item(root, path, 0, key.type);
		if (ret) {
4885
			mutex_unlock(&fs_info->reclaim_bgs_lock);
4886 4887
			if (ret < 0)
				goto done;
4888
			ret = 0;
4889
			btrfs_release_path(path);
4890
			break;
4891 4892 4893 4894 4895 4896
		}

		l = path->nodes[0];
		slot = path->slots[0];
		btrfs_item_key_to_cpu(l, &key, path->slots[0]);

4897
		if (key.objectid != device->devid) {
4898
			mutex_unlock(&fs_info->reclaim_bgs_lock);
4899
			btrfs_release_path(path);
4900
			break;
4901
		}
4902 4903 4904 4905

		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
		length = btrfs_dev_extent_length(l, dev_extent);

4906
		if (key.offset + length <= new_size) {
4907
			mutex_unlock(&fs_info->reclaim_bgs_lock);
4908
			btrfs_release_path(path);
4909
			break;
4910
		}
4911 4912

		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4913
		btrfs_release_path(path);
4914

4915 4916 4917 4918 4919 4920 4921 4922
		/*
		 * We may be relocating the only data chunk we have,
		 * which could potentially end up with losing data's
		 * raid profile, so lets allocate an empty one in
		 * advance.
		 */
		ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
		if (ret < 0) {
4923
			mutex_unlock(&fs_info->reclaim_bgs_lock);
4924 4925 4926
			goto done;
		}

4927
		ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4928
		mutex_unlock(&fs_info->reclaim_bgs_lock);
4929
		if (ret == -ENOSPC) {
4930
			failed++;
4931 4932 4933 4934 4935 4936 4937 4938
		} else if (ret) {
			if (ret == -ETXTBSY) {
				btrfs_warn(fs_info,
		   "could not shrink block group %llu due to active swapfile",
					   chunk_offset);
			}
			goto done;
		}
4939
	} while (key.offset-- > 0);
4940 4941 4942 4943 4944 4945 4946 4947

	if (failed && !retried) {
		failed = 0;
		retried = true;
		goto again;
	} else if (failed && retried) {
		ret = -ENOSPC;
		goto done;
4948 4949
	}

4950
	/* Shrinking succeeded, else we would be at "done". */
4951
	trans = btrfs_start_transaction(root, 0);
4952 4953 4954 4955 4956
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto done;
	}

4957
	mutex_lock(&fs_info->chunk_mutex);
4958 4959 4960 4961
	/* Clear all state bits beyond the shrunk device size */
	clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
			  CHUNK_STATE_MASK);

4962
	btrfs_device_set_disk_total_bytes(device, new_size);
4963 4964 4965
	if (list_empty(&device->post_commit_list))
		list_add_tail(&device->post_commit_list,
			      &trans->transaction->dev_update_list);
4966 4967

	WARN_ON(diff > old_total);
4968 4969
	btrfs_set_super_total_bytes(super_copy,
			round_down(old_total - diff, fs_info->sectorsize));
4970
	mutex_unlock(&fs_info->chunk_mutex);
Miao Xie's avatar
Miao Xie committed
4971

4972
	btrfs_reserve_chunk_metadata(trans, false);
Miao Xie's avatar
Miao Xie committed
4973 4974
	/* Now btrfs_update_device() will change the on-disk size. */
	ret = btrfs_update_device(trans, device);
4975
	btrfs_trans_release_chunk_metadata(trans);
4976 4977 4978 4979 4980 4981
	if (ret < 0) {
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
	} else {
		ret = btrfs_commit_transaction(trans);
	}
4982 4983
done:
	btrfs_free_path(path);
4984
	if (ret) {
4985
		mutex_lock(&fs_info->chunk_mutex);
4986
		btrfs_device_set_total_bytes(device, old_size);
4987
		if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4988
			device->fs_devices->total_rw_bytes += diff;
4989 4990
			atomic64_add(free_diff, &fs_info->free_chunk_space);
		}
4991
		mutex_unlock(&fs_info->chunk_mutex);
4992
	}
4993 4994 4995
	return ret;
}

4996
static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
4997 4998 4999
			   struct btrfs_key *key,
			   struct btrfs_chunk *chunk, int item_size)
{
5000
	struct btrfs_super_block *super_copy = fs_info->super_copy;
5001 5002 5003 5004
	struct btrfs_disk_key disk_key;
	u32 array_size;
	u8 *ptr;

5005 5006
	lockdep_assert_held(&fs_info->chunk_mutex);

5007
	array_size = btrfs_super_sys_array_size(super_copy);
5008
	if (array_size + item_size + sizeof(disk_key)
5009
			> BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
5010 5011 5012 5013 5014 5015 5016 5017 5018
		return -EFBIG;

	ptr = super_copy->sys_chunk_array + array_size;
	btrfs_cpu_key_to_disk(&disk_key, key);
	memcpy(ptr, &disk_key, sizeof(disk_key));
	ptr += sizeof(disk_key);
	memcpy(ptr, chunk, item_size);
	item_size += sizeof(disk_key);
	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
5019

5020 5021 5022
	return 0;
}

5023 5024 5025 5026
/*
 * sort the devices in descending order by max_avail, total_avail
 */
static int btrfs_cmp_device_info(const void *a, const void *b)
5027
{
5028 5029
	const struct btrfs_device_info *di_a = a;
	const struct btrfs_device_info *di_b = b;
5030

5031
	if (di_a->max_avail > di_b->max_avail)
5032
		return -1;
5033
	if (di_a->max_avail < di_b->max_avail)
5034
		return 1;
5035 5036 5037 5038 5039
	if (di_a->total_avail > di_b->total_avail)
		return -1;
	if (di_a->total_avail < di_b->total_avail)
		return 1;
	return 0;
5040
}
5041

David Woodhouse's avatar
David Woodhouse committed
5042 5043
static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
5044
	if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
David Woodhouse's avatar
David Woodhouse committed
5045 5046
		return;

5047
	btrfs_set_fs_incompat(info, RAID56);
David Woodhouse's avatar
David Woodhouse committed
5048 5049
}

5050 5051 5052 5053 5054 5055 5056 5057
static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
{
	if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
		return;

	btrfs_set_fs_incompat(info, RAID1C34);
}

5058
/*
5059
 * Structure used internally for btrfs_create_chunk() function.
5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082
 * Wraps needed parameters.
 */
struct alloc_chunk_ctl {
	u64 start;
	u64 type;
	/* Total number of stripes to allocate */
	int num_stripes;
	/* sub_stripes info for map */
	int sub_stripes;
	/* Stripes per device */
	int dev_stripes;
	/* Maximum number of devices to use */
	int devs_max;
	/* Minimum number of devices to use */
	int devs_min;
	/* ndevs has to be a multiple of this */
	int devs_increment;
	/* Number of copies */
	int ncopies;
	/* Number of stripes worth of bytes to store parity information */
	int nparity;
	u64 max_stripe_size;
	u64 max_chunk_size;
5083
	u64 dev_extent_min;
5084 5085 5086 5087 5088
	u64 stripe_size;
	u64 chunk_size;
	int ndevs;
};

static void init_alloc_chunk_ctl_policy_regular(
				struct btrfs_fs_devices *fs_devices,
				struct alloc_chunk_ctl *ctl)
{
	struct btrfs_space_info *space_info;

	space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type);
	ASSERT(space_info);

	ctl->max_chunk_size = READ_ONCE(space_info->chunk_size);
	ctl->max_stripe_size = min_t(u64, ctl->max_chunk_size, SZ_1G);

	if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM)
		ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK);

	/* We don't want a chunk larger than 10% of writable space */
	ctl->max_chunk_size = min(mult_perc(fs_devices->total_rw_bytes, 10),
				  ctl->max_chunk_size);
	ctl->dev_extent_min = btrfs_stripe_nr_to_offset(ctl->dev_stripes);
}

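/*
 * Zoned policy counterpart: stripes are pinned to the zone size, so only the
 * chunk size and minimum device extent limits need to be derived here.
 */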
static void init_alloc_chunk_ctl_policy_zoned(
				      struct btrfs_fs_devices *fs_devices,
				      struct alloc_chunk_ctl *ctl)
{
	u64 zone_size = fs_devices->fs_info->zone_size;
	u64 limit;
	int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
	int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
	u64 min_chunk_size = min_data_stripes * zone_size;
	u64 type = ctl->type;

	ctl->max_stripe_size = zone_size;
	if (type & BTRFS_BLOCK_GROUP_DATA) {
		ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
						 zone_size);
	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
		ctl->max_chunk_size = ctl->max_stripe_size;
	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ctl->max_chunk_size = 2 * ctl->max_stripe_size;
		ctl->devs_max = min_t(int, ctl->devs_max,
				      BTRFS_MAX_DEVS_SYS_CHUNK);
	} else {
		BUG();
	}

	/* We don't want a chunk larger than 10% of writable space */
	limit = max(round_down(mult_perc(fs_devices->total_rw_bytes, 10),
			       zone_size),
		    min_chunk_size);
	ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
	ctl->dev_extent_min = zone_size * ctl->dev_stripes;
}

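/*
 * Fill the allocation control structure from the raid attributes of the
 * requested profile and apply the policy specific (regular or zoned) limits.
 */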
static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
				 struct alloc_chunk_ctl *ctl)
{
	int index = btrfs_bg_flags_to_raid_index(ctl->type);

	ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
	ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
	ctl->devs_max = btrfs_raid_array[index].devs_max;
	if (!ctl->devs_max)
		ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
	ctl->devs_min = btrfs_raid_array[index].devs_min;
	ctl->devs_increment = btrfs_raid_array[index].devs_increment;
	ctl->ncopies = btrfs_raid_array[index].ncopies;
	ctl->nparity = btrfs_raid_array[index].nparity;
	ctl->ndevs = 0;

	switch (fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
		break;
	case BTRFS_CHUNK_ALLOC_ZONED:
		init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
		break;
	default:
		BUG();
	}
}

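/*
 * Collect, for each writable device, the largest free extent and the total
 * unallocated space, then sort the devices by those values.
 */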
static int gather_device_info(struct btrfs_fs_devices *fs_devices,
			      struct alloc_chunk_ctl *ctl,
			      struct btrfs_device_info *devices_info)
{
	struct btrfs_fs_info *info = fs_devices->fs_info;
	struct btrfs_device *device;
	u64 total_avail;
	u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
	int ret;
	int ndevs = 0;
	u64 max_avail;
	u64 dev_offset;

	/*
	 * in the first pass through the devices list, we gather information
	 * about the available holes on each device.
	 */
	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
			WARN(1, KERN_ERR
			       "BTRFS: read-only device in alloc_list\n");
			continue;
		}

		if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
					&device->dev_state) ||
		    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
			continue;

		if (device->total_bytes > device->bytes_used)
			total_avail = device->total_bytes - device->bytes_used;
		else
			total_avail = 0;

		/* If there is no space on this device, skip it. */
		if (total_avail < ctl->dev_extent_min)
			continue;

		ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
					   &max_avail);
		if (ret && ret != -ENOSPC)
			return ret;

		if (ret == 0)
			max_avail = dev_extent_want;

		if (max_avail < ctl->dev_extent_min) {
			if (btrfs_test_opt(info, ENOSPC_DEBUG))
				btrfs_debug(info,
			"%s: devid %llu has no free space, have=%llu want=%llu",
					    __func__, device->devid, max_avail,
					    ctl->dev_extent_min);
			continue;
		}

		if (ndevs == fs_devices->rw_devices) {
			WARN(1, "%s: found more than %llu devices\n",
			     __func__, fs_devices->rw_devices);
			break;
		}
		devices_info[ndevs].dev_offset = dev_offset;
		devices_info[ndevs].max_avail = max_avail;
		devices_info[ndevs].total_avail = total_avail;
		devices_info[ndevs].dev = device;
		++ndevs;
	}
	ctl->ndevs = ndevs;

	/*
	 * now sort the devices by hole size / available space
	 */
	sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
	     btrfs_cmp_device_info, NULL);

	return 0;
}

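/*
 * Choose the per-device stripe size and stripe count for the regular policy,
 * shrinking the stripe size if the chunk would exceed max_chunk_size.
 */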
static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
				      struct btrfs_device_info *devices_info)
{
	/* Number of stripes that count for block group size */
	int data_stripes;

	/*
	 * The primary goal is to maximize the number of stripes, so use as
	 * many devices as possible, even if the stripes are not maximum sized.
	 *
	 * The DUP profile stores more than one stripe per device, the
	 * max_avail is the total size so we have to adjust.
	 */
	ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
				   ctl->dev_stripes);
	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;

	/* This will have to be fixed for RAID1 and RAID10 over more drives */
	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;

	/*
	 * Use the number of data stripes to figure out how big this chunk is
	 * really going to be in terms of logical address space, and compare
	 * that answer with the max chunk size. If it's higher, we try to
	 * reduce stripe_size.
	 */
	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
		/*
		 * Reduce stripe_size, round it up to a 16MB boundary again and
		 * then use it, unless it ends up being even bigger than the
		 * previous value we had already.
		 */
		ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
							data_stripes), SZ_16M),
				       ctl->stripe_size);
	}

	/* Stripe size should not go beyond 1G. */
	ctl->stripe_size = min_t(u64, ctl->stripe_size, SZ_1G);

	/* Align to BTRFS_STRIPE_LEN */
	ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
	ctl->chunk_size = ctl->stripe_size * data_stripes;

	return 0;
}

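/*
 * Zoned policy counterpart: the stripe size is fixed to the zone size, so the
 * number of devices is reduced instead when the chunk would be too large.
 */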
static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
				    struct btrfs_device_info *devices_info)
{
	u64 zone_size = devices_info[0].dev->zone_info->zone_size;
	/* Number of stripes that count for block group size */
	int data_stripes;

	/*
	 * It should hold because:
	 *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
	 */
	ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);

	ctl->stripe_size = zone_size;
	ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
	data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;

	/* stripe_size is fixed in zoned filesystem. Reduce ndevs instead. */
	if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
		ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
					     ctl->stripe_size) + ctl->nparity,
				     ctl->dev_stripes);
		ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
		data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
		ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
	}

	ctl->chunk_size = ctl->stripe_size * data_stripes;

	return 0;
}

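/*
 * Trim the gathered device count to the profile's limits and dispatch to the
 * policy specific stripe sizing helper.
 */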
static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
			      struct alloc_chunk_ctl *ctl,
			      struct btrfs_device_info *devices_info)
{
	struct btrfs_fs_info *info = fs_devices->fs_info;

	/*
	 * Round down to number of usable stripes, devs_increment can be any
	 * number so we can't use round_down() that requires power of 2, while
	 * rounddown is safe.
	 */
	ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);

	if (ctl->ndevs < ctl->devs_min) {
		if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
			btrfs_debug(info,
	"%s: not enough devices with free space: have=%d minimum required=%d",
				    __func__, ctl->ndevs, ctl->devs_min);
		}
		return -ENOSPC;
	}

	ctl->ndevs = min(ctl->ndevs, ctl->devs_max);

	switch (fs_devices->chunk_alloc_policy) {
	case BTRFS_CHUNK_ALLOC_REGULAR:
		return decide_stripe_size_regular(ctl, devices_info);
	case BTRFS_CHUNK_ALLOC_ZONED:
		return decide_stripe_size_zoned(ctl, devices_info);
	default:
		BUG();
	}
}

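/*
 * Set the given extent bits on the physical range of every stripe of the
 * chunk map in the owning device's alloc_state tree.
 */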
static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int bits)
{
	for (int i = 0; i < map->num_stripes; i++) {
		struct btrfs_io_stripe *stripe = &map->stripes[i];
		struct btrfs_device *device = stripe->dev;

		set_extent_bit(&device->alloc_state, stripe->physical,
			       stripe->physical + map->stripe_size - 1,
			       bits | EXTENT_NOWAIT, NULL);
	}
}

static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
{
	for (int i = 0; i < map->num_stripes; i++) {
		struct btrfs_io_stripe *stripe = &map->stripes[i];
		struct btrfs_device *device = stripe->dev;

		__clear_extent_bit(&device->alloc_state, stripe->physical,
				   stripe->physical + map->stripe_size - 1,
				   bits | EXTENT_NOWAIT,
				   NULL, NULL);
	}
}

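/*
 * Unlink a chunk map from the mapping tree, clear CHUNK_ALLOCATED on its
 * device ranges and drop the tree's reference to the map.
 */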
void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
{
	write_lock(&fs_info->mapping_tree_lock);
	rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
	RB_CLEAR_NODE(&map->rb_node);
	chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
	write_unlock(&fs_info->mapping_tree_lock);

	/* Once for the tree reference. */
	btrfs_free_chunk_map(map);
}

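/*
 * Insert a chunk map into the mapping tree, keyed by its logical start.
 * Returns -EEXIST if a map with the same start address already exists.
 */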
EXPORT_FOR_TESTS
int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
{
	struct rb_node **p;
	struct rb_node *parent = NULL;
	bool leftmost = true;

	write_lock(&fs_info->mapping_tree_lock);
	p = &fs_info->mapping_tree.rb_root.rb_node;
	while (*p) {
		struct btrfs_chunk_map *entry;

		parent = *p;
		entry = rb_entry(parent, struct btrfs_chunk_map, rb_node);

		if (map->start < entry->start) {
			p = &(*p)->rb_left;
		} else if (map->start > entry->start) {
			p = &(*p)->rb_right;
			leftmost = false;
		} else {
			write_unlock(&fs_info->mapping_tree_lock);
			return -EEXIST;
		}
	}
	rb_link_node(&map->rb_node, parent, p);
	rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost);
	chunk_map_device_set_bits(map, CHUNK_ALLOCATED);
	chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
	write_unlock(&fs_info->mapping_tree_lock);

	return 0;
}

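/* Allocate a chunk map with room for @num_stripes stripes, refcount set to 1. */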
EXPORT_FOR_TESTS
struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp)
{
	struct btrfs_chunk_map *map;

	map = kmalloc(btrfs_chunk_map_size(num_stripes), gfp);
	if (!map)
		return NULL;

	refcount_set(&map->refs, 1);
	RB_CLEAR_NODE(&map->rb_node);

	return map;
}

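/*
 * Build the in-memory chunk map from the selected devices, insert it into the
 * mapping tree and create the corresponding block group.
 */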
static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
			struct alloc_chunk_ctl *ctl,
			struct btrfs_device_info *devices_info)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_chunk_map *map;
	struct btrfs_block_group *block_group;
	u64 start = ctl->start;
	u64 type = ctl->type;
	int ret;

	map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS);
	if (!map)
		return ERR_PTR(-ENOMEM);

	map->start = start;
	map->chunk_len = ctl->chunk_size;
	map->stripe_size = ctl->stripe_size;
	map->type = type;
	map->io_align = BTRFS_STRIPE_LEN;
	map->io_width = BTRFS_STRIPE_LEN;
	map->sub_stripes = ctl->sub_stripes;
	map->num_stripes = ctl->num_stripes;

	for (int i = 0; i < ctl->ndevs; i++) {
		for (int j = 0; j < ctl->dev_stripes; j++) {
			int s = i * ctl->dev_stripes + j;
			map->stripes[s].dev = devices_info[i].dev;
			map->stripes[s].physical = devices_info[i].dev_offset +
						   j * ctl->stripe_size;
		}
	}

	trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);

	ret = btrfs_add_chunk_map(info, map);
	if (ret) {
		btrfs_free_chunk_map(map);
		return ERR_PTR(ret);
	}

	block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size);
	if (IS_ERR(block_group)) {
		btrfs_remove_chunk_map(info, map);
		return block_group;
	}

	for (int i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *dev = map->stripes[i].dev;

		btrfs_device_set_bytes_used(dev,
					    dev->bytes_used + ctl->stripe_size);
		if (list_empty(&dev->post_commit_list))
			list_add_tail(&dev->post_commit_list,
				      &trans->transaction->dev_update_list);
	}

	atomic64_sub(ctl->stripe_size * map->num_stripes,
		     &info->free_chunk_space);

	check_raid56_incompat_flag(info, type);
	check_raid1c34_incompat_flag(info, type);

	return block_group;
}

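/*
 * Allocate a new chunk with the given profile: gather candidate devices,
 * decide the stripe layout and create the chunk map and block group. The
 * chunk item itself is added to the chunk btree separately, by
 * btrfs_chunk_alloc_add_chunk_item().
 */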
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
					    u64 type)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_fs_devices *fs_devices = info->fs_devices;
	struct btrfs_device_info *devices_info = NULL;
	struct alloc_chunk_ctl ctl;
	struct btrfs_block_group *block_group;
	int ret;

	lockdep_assert_held(&info->chunk_mutex);

	if (!alloc_profile_is_valid(type, 0)) {
		ASSERT(0);
		return ERR_PTR(-EINVAL);
	}

	if (list_empty(&fs_devices->alloc_list)) {
		if (btrfs_test_opt(info, ENOSPC_DEBUG))
			btrfs_debug(info, "%s: no writable device", __func__);
		return ERR_PTR(-ENOSPC);
	}

	if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
		btrfs_err(info, "invalid chunk type 0x%llx requested", type);
		ASSERT(0);
		return ERR_PTR(-EINVAL);
	}

	ctl.start = find_next_chunk(info);
	ctl.type = type;
	init_alloc_chunk_ctl(fs_devices, &ctl);

	devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
			       GFP_NOFS);
	if (!devices_info)
		return ERR_PTR(-ENOMEM);

	ret = gather_device_info(fs_devices, &ctl, devices_info);
	if (ret < 0) {
		block_group = ERR_PTR(ret);
		goto out;
	}

	ret = decide_stripe_size(fs_devices, &ctl, devices_info);
	if (ret < 0) {
		block_group = ERR_PTR(ret);
		goto out;
	}

	block_group = create_chunk(trans, &ctl, devices_info);

out:
	kfree(devices_info);
	return block_group;
}

/*
 * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to the
 * phase 1 of chunk allocation. It belongs to phase 2 only when allocating system
 * chunks.
 *
 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
 * phases.
 */
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
				     struct btrfs_block_group *bg)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_root *chunk_root = fs_info->chunk_root;
	struct btrfs_key key;
	struct btrfs_chunk *chunk;
	struct btrfs_stripe *stripe;
	struct btrfs_chunk_map *map;
	size_t item_size;
	int i;
	int ret;

	/*
	 * We take the chunk_mutex for 2 reasons:
	 *
	 * 1) Updates and insertions in the chunk btree must be done while holding
	 *    the chunk_mutex, as well as updating the system chunk array in the
	 *    superblock. See the comment on top of btrfs_chunk_alloc() for the
	 *    details;
	 *
	 * 2) To prevent races with the final phase of a device replace operation
	 *    that replaces the device object associated with the map's stripes,
	 *    because the device object's id can change at any time during that
	 *    final phase of the device replace operation
	 *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
	 *    replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
	 *    which would cause a failure when updating the device item, which does
	 *    not exist, or persisting a stripe of the chunk item with such ID.
	 *    Here we can't use the device_list_mutex because our caller already
	 *    has locked the chunk_mutex, and the final phase of device replace
	 *    acquires both mutexes - first the device_list_mutex and then the
	 *    chunk_mutex. Using any of those two mutexes protects us from a
	 *    concurrent device replace.
	 */
	lockdep_assert_held(&fs_info->chunk_mutex);

	map = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
	if (IS_ERR(map)) {
		ret = PTR_ERR(map);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	item_size = btrfs_chunk_item_size(map->num_stripes);

	chunk = kzalloc(item_size, GFP_NOFS);
	if (!chunk) {
		ret = -ENOMEM;
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;

		ret = btrfs_update_device(trans, device);
		if (ret)
			goto out;
	}

	stripe = &chunk->stripe;
	for (i = 0; i < map->num_stripes; i++) {
		struct btrfs_device *device = map->stripes[i].dev;
		const u64 dev_offset = map->stripes[i].physical;

		btrfs_set_stack_stripe_devid(stripe, device->devid);
		btrfs_set_stack_stripe_offset(stripe, dev_offset);
		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
		stripe++;
	}

	btrfs_set_stack_chunk_length(chunk, bg->length);
	btrfs_set_stack_chunk_owner(chunk, BTRFS_EXTENT_TREE_OBJECTID);
	btrfs_set_stack_chunk_stripe_len(chunk, BTRFS_STRIPE_LEN);
	btrfs_set_stack_chunk_type(chunk, map->type);
	btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
	btrfs_set_stack_chunk_io_align(chunk, BTRFS_STRIPE_LEN);
	btrfs_set_stack_chunk_io_width(chunk, BTRFS_STRIPE_LEN);
	btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
	btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);

	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
	key.type = BTRFS_CHUNK_ITEM_KEY;
	key.offset = bg->start;

	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
	if (ret)
		goto out;

	set_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED, &bg->runtime_flags);

	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
		ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
		if (ret)
			goto out;
	}

out:
	kfree(chunk);
	btrfs_free_chunk_map(map);
	return ret;
}

static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	u64 alloc_profile;
	struct btrfs_block_group *meta_bg;
	struct btrfs_block_group *sys_bg;

	/*
	 * When adding a new device for sprouting, the seed device is read-only
	 * so we must first allocate a metadata and a system chunk. But before
	 * adding the block group items to the extent, device and chunk btrees,
	 * we must first:
	 *
	 * 1) Create both chunks without doing any changes to the btrees, as
	 *    otherwise we would get -ENOSPC since the block groups from the
	 *    seed device are read-only;
	 *
	 * 2) Add the device item for the new sprout device - finishing the setup
	 *    of a new block group requires updating the device item in the chunk
	 *    btree, so it must exist when we attempt to do it. The previous step
	 *    ensures this does not fail with -ENOSPC.
	 *
	 * After that we can add the block group items to their btrees:
	 * update existing device item in the chunk btree, add a new block group
	 * item to the extent btree, add a new chunk item to the chunk btree and
	 * finally add the new device extent items to the devices btree.
	 */

	alloc_profile = btrfs_metadata_alloc_profile(fs_info);
	meta_bg = btrfs_create_chunk(trans, alloc_profile);
	if (IS_ERR(meta_bg))
		return PTR_ERR(meta_bg);

	alloc_profile = btrfs_system_alloc_profile(fs_info);
	sys_bg = btrfs_create_chunk(trans, alloc_profile);
	if (IS_ERR(sys_bg))
		return PTR_ERR(sys_bg);

	return 0;
}

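/* Number of failed stripes the chunk's raid profile can tolerate. */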
static inline int btrfs_chunk_max_errors(struct btrfs_chunk_map *map)
{
	const int index = btrfs_bg_flags_to_raid_index(map->type);

	return btrfs_raid_array[index].tolerated_failures;
}

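/*
 * Check whether the chunk at @chunk_offset still has enough present and
 * writable devices for writes to succeed.
 */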
bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
	struct btrfs_chunk_map *map;
	int miss_ndevs = 0;
	int i;
	bool ret = true;

	map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
	if (IS_ERR(map))
		return false;

	for (i = 0; i < map->num_stripes; i++) {
		if (test_bit(BTRFS_DEV_STATE_MISSING,
					&map->stripes[i].dev->dev_state)) {
			miss_ndevs++;
			continue;
		}
		if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
					&map->stripes[i].dev->dev_state)) {
			ret = false;
			goto end;
		}
	}

	/*
	 * If the number of missing devices is larger than max errors, we can
	 * not write the data into that chunk successfully.
	 */
	if (miss_ndevs > btrfs_chunk_max_errors(map))
		ret = false;
end:
	btrfs_free_chunk_map(map);
	return ret;
}

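/* Drop every chunk map still present in the mapping tree. */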
void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info)
{
	write_lock(&fs_info->mapping_tree_lock);
	while (!RB_EMPTY_ROOT(&fs_info->mapping_tree.rb_root)) {
		struct btrfs_chunk_map *map;
		struct rb_node *node;

		node = rb_first_cached(&fs_info->mapping_tree);
		map = rb_entry(node, struct btrfs_chunk_map, rb_node);
		rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
		RB_CLEAR_NODE(&map->rb_node);
		chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
		/* Once for the tree ref. */
		btrfs_free_chunk_map(map);
		cond_resched_rwlock_write(&fs_info->mapping_tree_lock);
	}
	write_unlock(&fs_info->mapping_tree_lock);
}

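/*
 * Number of distinct ways a block in this chunk can be read: the profile's
 * copy count, or the number of recovery attempts for RAID5/6.
 */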
static int btrfs_chunk_map_num_copies(const struct btrfs_chunk_map *map)
{
	enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(map->type);

	if (map->type & BTRFS_BLOCK_GROUP_RAID5)
		return 2;

	/*
	 * There could be two corrupted data stripes, we need to loop retry in
	 * order to rebuild the correct data.
	 *
	 * Fail a stripe at a time on every retry except the stripe under
	 * reconstruction.
	 */
	if (map->type & BTRFS_BLOCK_GROUP_RAID6)
		return map->num_stripes;

	/* Non-RAID56, use their ncopies from btrfs_raid_array. */
	return btrfs_raid_array[index].ncopies;
}

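/* Number of copies for the chunk containing the range starting at @logical. */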
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct btrfs_chunk_map *map;
	int ret;

	map = btrfs_get_chunk_map(fs_info, logical, len);
	if (IS_ERR(map))
		/*
		 * We could return errors for these cases, but that could get
		 * ugly and we'd probably do the same thing which is just not do
		 * anything else and exit, so return 1 so the callers don't try
		 * to use other copies.
		 */
		return 1;

	ret = btrfs_chunk_map_num_copies(map);
	btrfs_free_chunk_map(map);
	return ret;
}

unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
				    u64 logical)
{
	struct btrfs_chunk_map *map;
	unsigned long len = fs_info->sectorsize;

	if (!btrfs_fs_incompat(fs_info, RAID56))
		return len;

	map = btrfs_get_chunk_map(fs_info, logical, len);

	if (!WARN_ON(IS_ERR(map))) {
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			len = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
		btrfs_free_chunk_map(map);
	}
	return len;
}

int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
	struct btrfs_chunk_map *map;
	int ret = 0;

	if (!btrfs_fs_incompat(fs_info, RAID56))
		return 0;

	map = btrfs_get_chunk_map(fs_info, logical, len);

	if (!WARN_ON(IS_ERR(map))) {
		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			ret = 1;
		btrfs_free_chunk_map(map);
	}
	return ret;
}

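/*
 * Pick a readable mirror from a RAID1/RAID10 stripe set according to the
 * configured read policy, avoiding the dev-replace source device if possible.
 */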
static int find_live_mirror(struct btrfs_fs_info *fs_info,
			    struct btrfs_chunk_map *map, int first,
			    int dev_replace_is_ongoing)
{
	const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy);
	int i;
	int num_stripes;
	int preferred_mirror;
	int tolerance;
	struct btrfs_device *srcdev;

	ASSERT((map->type &
		 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));

	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
		num_stripes = map->sub_stripes;
	else
		num_stripes = map->num_stripes;

	switch (policy) {
	default:
		/* Shouldn't happen, just warn and use pid instead of failing */
		btrfs_warn_rl(fs_info, "unknown read_policy type %u, reset to pid",
			      policy);
		WRITE_ONCE(fs_info->fs_devices->read_policy, BTRFS_READ_POLICY_PID);
		fallthrough;
	case BTRFS_READ_POLICY_PID:
		preferred_mirror = first + (current->pid % num_stripes);
		break;
	}

	if (dev_replace_is_ongoing &&
	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
		srcdev = fs_info->dev_replace.srcdev;
	else
		srcdev = NULL;

	/*
	 * try to avoid the drive that is the source drive for a
	 * dev-replace procedure, only choose it if no other non-missing
	 * mirror is available
	 */
	for (tolerance = 0; tolerance < 2; tolerance++) {
		if (map->stripes[preferred_mirror].dev->bdev &&
		    (tolerance || map->stripes[preferred_mirror].dev != srcdev))
			return preferred_mirror;
		for (i = first; i < first + num_stripes; i++) {
			if (map->stripes[i].dev->bdev &&
			    (tolerance || map->stripes[i].dev != srcdev))
				return i;
		}
	}

	/* we couldn't find one that doesn't fail.  Just return something
	 * and the io error handling code will clean up eventually
	 */
	return preferred_mirror;
}

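/* Allocate a btrfs_io_context with an embedded array of @total_stripes stripes. */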
static struct btrfs_io_context *alloc_btrfs_io_context(struct btrfs_fs_info *fs_info,
						       u64 logical,
						       u16 total_stripes)
{
	struct btrfs_io_context *bioc;

	bioc = kzalloc(
		 /* The size of btrfs_io_context */
		sizeof(struct btrfs_io_context) +
		/* Plus the variable array for the stripes */
		sizeof(struct btrfs_io_stripe) * (total_stripes),
		GFP_NOFS);

	if (!bioc)
		return NULL;

	refcount_set(&bioc->refs, 1);

	bioc->fs_info = fs_info;
	bioc->replace_stripe_src = -1;
	bioc->full_stripe_logical = (u64)-1;
	bioc->logical = logical;

	return bioc;
}

void btrfs_get_bioc(struct btrfs_io_context *bioc)
{
	WARN_ON(!refcount_read(&bioc->refs));
	refcount_inc(&bioc->refs);
}

void btrfs_put_bioc(struct btrfs_io_context *bioc)
{
	if (!bioc)
		return;
	if (refcount_dec_and_test(&bioc->refs))
		kfree(bioc);
}

/*
 * Please note that, discard won't be sent to target device of device
 * replace.
 */
struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
					       u64 logical, u64 *length_ret,
					       u32 *num_stripes)
{
	struct btrfs_chunk_map *map;
	struct btrfs_discard_stripe *stripes;
	u64 length = *length_ret;
	u64 offset;
	u32 stripe_nr;
	u32 stripe_nr_end;
	u32 stripe_cnt;
	u64 stripe_end_offset;
	u64 stripe_offset;
	u32 stripe_index;
	u32 factor = 0;
	u32 sub_stripes = 0;
	u32 stripes_per_dev = 0;
	u32 remaining_stripes = 0;
	u32 last_stripe = 0;
	int ret;
	int i;

	map = btrfs_get_chunk_map(fs_info, logical, length);
	if (IS_ERR(map))
		return ERR_CAST(map);

	/* we don't discard raid56 yet */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		ret = -EOPNOTSUPP;
		goto out_free_map;
	}

	offset = logical - map->start;
	length = min_t(u64, map->start + map->chunk_len - logical, length);
	*length_ret = length;

	/*
	 * stripe_nr counts the total number of stripes we have to stride
	 * to get to this block
	 */
	stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;

	/* stripe_offset is the offset of this block in its stripe */
	stripe_offset = offset - btrfs_stripe_nr_to_offset(stripe_nr);

	stripe_nr_end = round_up(offset + length, BTRFS_STRIPE_LEN) >>
			BTRFS_STRIPE_LEN_SHIFT;
	stripe_cnt = stripe_nr_end - stripe_nr;
	stripe_end_offset = btrfs_stripe_nr_to_offset(stripe_nr_end) -
			    (offset + length);
	/*
	 * after this, stripe_nr is the number of stripes on this
	 * device we have to walk to find the data, and stripe_index is
	 * the number of our device in the stripe array
	 */
	*num_stripes = 1;
	stripe_index = 0;
	if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
			 BTRFS_BLOCK_GROUP_RAID10)) {
		if (map->type & BTRFS_BLOCK_GROUP_RAID0)
			sub_stripes = 1;
		else
			sub_stripes = map->sub_stripes;

		factor = map->num_stripes / sub_stripes;
		*num_stripes = min_t(u64, map->num_stripes,
				    sub_stripes * stripe_cnt);
		stripe_index = stripe_nr % factor;
		stripe_nr /= factor;
		stripe_index *= sub_stripes;

		remaining_stripes = stripe_cnt % factor;
		stripes_per_dev = stripe_cnt / factor;
		last_stripe = ((stripe_nr_end - 1) % factor) * sub_stripes;
	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
				BTRFS_BLOCK_GROUP_DUP)) {
		*num_stripes = map->num_stripes;
	} else {
		stripe_index = stripe_nr % map->num_stripes;
		stripe_nr /= map->num_stripes;
	}

	stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS);
	if (!stripes) {
		ret = -ENOMEM;
		goto out_free_map;
	}

	for (i = 0; i < *num_stripes; i++) {
		stripes[i].physical =
			map->stripes[stripe_index].physical +
			stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
		stripes[i].dev = map->stripes[stripe_index].dev;

		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
				 BTRFS_BLOCK_GROUP_RAID10)) {
			stripes[i].length = btrfs_stripe_nr_to_offset(stripes_per_dev);

			if (i / sub_stripes < remaining_stripes)
				stripes[i].length += BTRFS_STRIPE_LEN;

			/*
			 * Special for the first stripe and
			 * the last stripe:
			 *
			 * |-------|...|-------|
			 *     |----------|
			 *    off     end_off
			 */
			if (i < sub_stripes)
				stripes[i].length -= stripe_offset;

			if (stripe_index >= last_stripe &&
			    stripe_index <= (last_stripe +
					     sub_stripes - 1))
				stripes[i].length -= stripe_end_offset;

			if (i == sub_stripes - 1)
				stripe_offset = 0;
		} else {
			stripes[i].length = length;
		}

		stripe_index++;
		if (stripe_index == map->num_stripes) {
			stripe_index = 0;
			stripe_nr++;
		}
	}

	btrfs_free_chunk_map(map);
	return stripes;
out_free_map:
	btrfs_free_chunk_map(map);
	return ERR_PTR(ret);
}

static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
{
	struct btrfs_block_group *cache;
	bool ret;

	/* Non zoned filesystem does not use "to_copy" flag */
	if (!btrfs_is_zoned(fs_info))
		return false;

	cache = btrfs_lookup_block_group(fs_info, logical);

	ret = test_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);

	btrfs_put_block_group(cache);
	return ret;
}

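/*
 * Duplicate the write stripes that target the dev-replace source device so
 * they also go to the replace target, adjusting the stripe count and
 * max_errors accordingly.
 */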
static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc,
				      struct btrfs_dev_replace *dev_replace,
				      u64 logical,
				      struct btrfs_io_geometry *io_geom)
{
	u64 srcdev_devid = dev_replace->srcdev->devid;
	/*
	 * At this stage, num_stripes is still the real number of stripes,
	 * excluding the duplicated stripes.
	 */
	int num_stripes = io_geom->num_stripes;
	int max_errors = io_geom->max_errors;
	int nr_extra_stripes = 0;
	int i;

	/*
	 * A block group which has "to_copy" set will eventually be copied by
	 * the dev-replace process. We can avoid cloning IO here.
	 */
	if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
		return;

	/*
	 * Duplicate the write operations while the dev-replace procedure is
	 * running. Since the copying of the old disk to the new disk takes
	 * place at run time while the filesystem is mounted writable, the
	 * regular write operations to the old disk have to be duplicated to go
	 * to the new disk as well.
	 *
	 * Note that device->missing is handled by the caller, and that the
	 * write to the old disk is already set up in the stripes array.
	 */
	for (i = 0; i < num_stripes; i++) {
		struct btrfs_io_stripe *old = &bioc->stripes[i];
		struct btrfs_io_stripe *new = &bioc->stripes[num_stripes + nr_extra_stripes];

		if (old->dev->devid != srcdev_devid)
			continue;

		new->physical = old->physical;
		new->dev = dev_replace->tgtdev;
		if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK)
			bioc->replace_stripe_src = i;
		nr_extra_stripes++;
	}

	/* We can only have at most 2 extra nr_stripes (for DUP). */
	ASSERT(nr_extra_stripes <= 2);
	/*
	 * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for
	 * replace.
	 * If we have 2 extra stripes, only choose the one with smaller physical.
	 */
	if (io_geom->op == BTRFS_MAP_GET_READ_MIRRORS && nr_extra_stripes == 2) {
		struct btrfs_io_stripe *first = &bioc->stripes[num_stripes];
		struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1];

		/* Only DUP can have two extra stripes. */
		ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP);

		/*
		 * Swap the last two stripes and reduce @nr_extra_stripes.
		 * The extra stripe would still be there, but won't be accessed.
		 */
		if (first->physical > second->physical) {
			swap(second->physical, first->physical);
			swap(second->dev, first->dev);
			nr_extra_stripes--;
		}
	}

	io_geom->num_stripes = num_stripes + nr_extra_stripes;
	io_geom->max_errors = max_errors + nr_extra_stripes;
	bioc->replace_nr_stripes = nr_extra_stripes;
}

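/*
 * Compute the stripe number and offset of @offset inside the chunk and return
 * how much I/O can be issued before crossing a stripe (or, for RAID56 writes,
 * a full stripe set) boundary.
 */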
static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset,
			    struct btrfs_io_geometry *io_geom)
{
	/*
	 * Stripe_nr is the stripe where this block falls.  stripe_offset is
	 * the offset of this block in its stripe.
	 */
	io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
	io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
	ASSERT(io_geom->stripe_offset < U32_MAX);

	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		unsigned long full_stripe_len =
			btrfs_stripe_nr_to_offset(nr_data_stripes(map));

		/*
		 * For full stripe start, we use previously calculated
		 * @stripe_nr. Align it to nr_data_stripes, then multiply with
		 * STRIPE_LEN.
		 *
		 * By this we can avoid u64 division completely.  And we have
		 * to go rounddown(), not round_down(), as nr_data_stripes is
		 * not ensured to be power of 2.
		 */
		io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset(
			rounddown(io_geom->stripe_nr, nr_data_stripes(map)));

		ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset);
		ASSERT(io_geom->raid56_full_stripe_start <= offset);
		/*
		 * For writes to RAID56, allow to write a full stripe set, but
		 * no straddling of stripe sets.
		 */
		if (io_geom->op == BTRFS_MAP_WRITE)
			return full_stripe_len - (offset - io_geom->raid56_full_stripe_start);
	}

	/*
	 * For other RAID types and for RAID56 reads, allow a single stripe (on
	 * a single disk).
	 */
	if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK)
		return BTRFS_STRIPE_LEN - io_geom->stripe_offset;
	return U64_MAX;
}

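/*
 * Fill a single btrfs_io_stripe with the device and physical location for the
 * current geometry, consulting the RAID stripe tree for reads when needed.
 */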
static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical,
			 u64 *length, struct btrfs_io_stripe *dst,
			 struct btrfs_chunk_map *map,
			 struct btrfs_io_geometry *io_geom)
{
	dst->dev = map->stripes[io_geom->stripe_index].dev;

	if (io_geom->op == BTRFS_MAP_READ &&
	    btrfs_need_stripe_tree_update(fs_info, map->type))
		return btrfs_get_raid_extent_offset(fs_info, logical, length,
						    map->type,
						    io_geom->stripe_index, dst);

	dst->physical = map->stripes[io_geom->stripe_index].physical +
			io_geom->stripe_offset +
			btrfs_stripe_nr_to_offset(io_geom->stripe_nr);
	return 0;
}

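/*
 * Whether the mapping can be returned in the caller provided on-stack stripe
 * (@smap) instead of allocating a full btrfs_io_context.
 */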
static bool is_single_device_io(struct btrfs_fs_info *fs_info,
				const struct btrfs_io_stripe *smap,
				const struct btrfs_chunk_map *map,
				int num_alloc_stripes,
				enum btrfs_map_op op, int mirror_num)
{
	if (!smap)
		return false;

	if (num_alloc_stripes != 1)
		return false;

	if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ)
		return false;

	if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)
		return false;

	return true;
}

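/*
 * The map_blocks_*() helpers below turn io_geom->stripe_nr into a concrete
 * stripe index and mirror number for each block group profile.
 */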
static void map_blocks_raid0(const struct btrfs_chunk_map *map,
			     struct btrfs_io_geometry *io_geom)
{
	io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes;
	io_geom->stripe_nr /= map->num_stripes;
	if (io_geom->op == BTRFS_MAP_READ)
		io_geom->mirror_num = 1;
}

static void map_blocks_raid1(struct btrfs_fs_info *fs_info,
			     struct btrfs_chunk_map *map,
			     struct btrfs_io_geometry *io_geom,
			     bool dev_replace_is_ongoing)
{
	if (io_geom->op != BTRFS_MAP_READ) {
		io_geom->num_stripes = map->num_stripes;
		return;
	}

	if (io_geom->mirror_num) {
		io_geom->stripe_index = io_geom->mirror_num - 1;
		return;
	}

	io_geom->stripe_index = find_live_mirror(fs_info, map, 0,
						 dev_replace_is_ongoing);
	io_geom->mirror_num = io_geom->stripe_index + 1;
}

static void map_blocks_dup(const struct btrfs_chunk_map *map,
			   struct btrfs_io_geometry *io_geom)
{
	if (io_geom->op != BTRFS_MAP_READ) {
		io_geom->num_stripes = map->num_stripes;
		return;
	}

	if (io_geom->mirror_num) {
		io_geom->stripe_index = io_geom->mirror_num - 1;
		return;
	}

	io_geom->mirror_num = 1;
}

static void map_blocks_raid10(struct btrfs_fs_info *fs_info,
			      struct btrfs_chunk_map *map,
			      struct btrfs_io_geometry *io_geom,
			      bool dev_replace_is_ongoing)
{
	u32 factor = map->num_stripes / map->sub_stripes;
	int old_stripe_index;

	io_geom->stripe_index = (io_geom->stripe_nr % factor) * map->sub_stripes;
	io_geom->stripe_nr /= factor;

	if (io_geom->op != BTRFS_MAP_READ) {
		io_geom->num_stripes = map->sub_stripes;
		return;
	}

	if (io_geom->mirror_num) {
		io_geom->stripe_index += io_geom->mirror_num - 1;
		return;
	}

	old_stripe_index = io_geom->stripe_index;
	io_geom->stripe_index = find_live_mirror(fs_info, map,
						 io_geom->stripe_index,
						 dev_replace_is_ongoing);
	io_geom->mirror_num = io_geom->stripe_index - old_stripe_index + 1;
}

static void map_blocks_raid56_write(struct btrfs_chunk_map *map,
				    struct btrfs_io_geometry *io_geom,
				    u64 logical, u64 *length)
{
	int data_stripes = nr_data_stripes(map);

	/*
	 * Needs full stripe mapping.
	 *
	 * Push stripe_nr back to the start of the full stripe. For those cases
	 * needing a full stripe, @stripe_nr is the full stripe number.
	 *
	 * Originally we go raid56_full_stripe_start / full_stripe_len, but
	 * that can be expensive.  Here we just divide @stripe_nr with
	 * @data_stripes.
	 */
	io_geom->stripe_nr /= data_stripes;

	/* RAID[56] write or recovery. Return all stripes */
	io_geom->num_stripes = map->num_stripes;
	io_geom->max_errors = btrfs_chunk_max_errors(map);

	/* Return the length to the full stripe end. */
	*length = min(logical + *length,
		      io_geom->raid56_full_stripe_start + map->start +
		      btrfs_stripe_nr_to_offset(data_stripes)) -
		logical;
	io_geom->stripe_index = 0;
	io_geom->stripe_offset = 0;
}

static void map_blocks_raid56_read(struct btrfs_chunk_map *map,
				   struct btrfs_io_geometry *io_geom)
{
	int data_stripes = nr_data_stripes(map);

	ASSERT(io_geom->mirror_num <= 1);
	/* Just grab the data stripe directly. */
	io_geom->stripe_index = io_geom->stripe_nr % data_stripes;
	io_geom->stripe_nr /= data_stripes;

	/* We distribute the parity blocks across stripes. */
	io_geom->stripe_index =
		(io_geom->stripe_nr + io_geom->stripe_index) % map->num_stripes;

	if (io_geom->op == BTRFS_MAP_READ && io_geom->mirror_num < 1)
		io_geom->mirror_num = 1;
}

static void map_blocks_single(const struct btrfs_chunk_map *map,
			      struct btrfs_io_geometry *io_geom)
{
	io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes;
	io_geom->stripe_nr /= map->num_stripes;
	io_geom->mirror_num = io_geom->stripe_index + 1;
}

/*
 * Map one logical range to one or more physical ranges.
 *
 * @length:		(Mandatory) mapped length of this run.
 *			One logical range can be split into different segments
 *			due to factors like zones and RAID0/5/6/10 stripe
 *			boundaries.
 *
 * @bioc_ret:		(Mandatory) returned btrfs_io_context structure.
 *			which has one or more physical ranges (btrfs_io_stripe)
 *			recorded inside.
 *			Caller should call btrfs_put_bioc() to free it after use.
 *
 * @smap:		(Optional) single physical range optimization.
 *			If the map request can be fulfilled by one single
 *			physical range, and this parameter is not NULL,
 *			then @bioc_ret would be NULL, and @smap would be
 *			updated.
 *
 * @mirror_num_ret:	(Mandatory) returned mirror number if the original
 *			value is 0.
 *
 *			Mirror number 0 means to choose any live mirrors.
 *
 *			For non-RAID56 profiles, non-zero mirror_num means
 *			the Nth mirror. (e.g. mirror_num 1 means the first
 *			copy).
 *
 *			For RAID56 profile, mirror 1 means rebuild from P and
 *			the remaining data stripes.
 *
 *			For RAID6 profile, mirror > 2 means mark another
 *			data/P stripe error and rebuild from the remaining
 *			stripes.
 */
int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
		    u64 logical, u64 *length,
		    struct btrfs_io_context **bioc_ret,
		    struct btrfs_io_stripe *smap, int *mirror_num_ret)
{
	struct btrfs_chunk_map *map;
	struct btrfs_io_geometry io_geom = { 0 };
	u64 map_offset;
	int ret = 0;
	int num_copies;
	struct btrfs_io_context *bioc = NULL;
	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
	int dev_replace_is_ongoing = 0;
	u16 num_alloc_stripes;
	u64 max_len;

	ASSERT(bioc_ret);

	io_geom.mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
	io_geom.num_stripes = 1;
	io_geom.stripe_index = 0;
	io_geom.op = op;

	map = btrfs_get_chunk_map(fs_info, logical, *length);
	if (IS_ERR(map))
		return PTR_ERR(map);

	num_copies = btrfs_chunk_map_num_copies(map);
	if (io_geom.mirror_num > num_copies)
		return -EINVAL;

	map_offset = logical - map->start;
	io_geom.raid56_full_stripe_start = (u64)-1;
	max_len = btrfs_max_io_len(map, map_offset, &io_geom);
	*length = min_t(u64, map->chunk_len - map_offset, max_len);

	down_read(&dev_replace->rwsem);
	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
	/*
	 * Hold the semaphore for read during the whole operation, write is
	 * requested at commit time but must wait.
	 */
	if (!dev_replace_is_ongoing)
		up_read(&dev_replace->rwsem);

	switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
	case BTRFS_BLOCK_GROUP_RAID0:
		map_blocks_raid0(map, &io_geom);
		break;
	case BTRFS_BLOCK_GROUP_RAID1:
	case BTRFS_BLOCK_GROUP_RAID1C3:
	case BTRFS_BLOCK_GROUP_RAID1C4:
		map_blocks_raid1(fs_info, map, &io_geom, dev_replace_is_ongoing);
		break;
	case BTRFS_BLOCK_GROUP_DUP:
		map_blocks_dup(map, &io_geom);
		break;
	case BTRFS_BLOCK_GROUP_RAID10:
		map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing);
		break;
	case BTRFS_BLOCK_GROUP_RAID5:
	case BTRFS_BLOCK_GROUP_RAID6:
		if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)
			map_blocks_raid56_write(map, &io_geom, logical, length);
		else
			map_blocks_raid56_read(map, &io_geom);
		break;
	default:
		/*
		 * After this, stripe_nr is the number of stripes on this
		 * device we have to walk to find the data, and stripe_index is
		 * the number of our device in the stripe array
		 */
		map_blocks_single(map, &io_geom);
		break;
	}
	if (io_geom.stripe_index >= map->num_stripes) {
		btrfs_crit(fs_info,
			   "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
			   io_geom.stripe_index, map->num_stripes);
		ret = -EINVAL;
		goto out;
	}

	num_alloc_stripes = io_geom.num_stripes;
	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
	    op != BTRFS_MAP_READ)
		/*
		 * For replace case, we need to add extra stripes for extra
		 * duplicated stripes.
		 *
		 * For both WRITE and GET_READ_MIRRORS, we may have at most
		 * 2 more stripes (DUP types, otherwise 1).
		 */
		num_alloc_stripes += 2;

	/*
	 * If this I/O maps to a single device, try to return the device and
	 * physical block information on the stack instead of allocating an
	 * I/O context structure.
	 */
	if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op,
				io_geom.mirror_num)) {
		ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom);
		if (mirror_num_ret)
			*mirror_num_ret = io_geom.mirror_num;
		*bioc_ret = NULL;
		goto out;
	}

	bioc = alloc_btrfs_io_context(fs_info, logical, num_alloc_stripes);
	if (!bioc) {
		ret = -ENOMEM;
		goto out;
	}
	bioc->map_type = map->type;

	/*
	 * For RAID56 full map, we need to make sure the stripes[] follows the
	 * rule that data stripes are all ordered, then followed with P and Q
	 * (if we have).
	 *
	 * It's still mostly the same as other profiles, just with extra rotation.
	 */
	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
	    (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)) {
		/*
		 * For RAID56 @stripe_nr is already the number of full stripes
		 * before us, which is also the rotation value (needs to modulo
		 * with num_stripes).
		 *
		 * In this case, we just add @stripe_nr with @i, then do the
		 * modulo, to reduce one modulo call.
		 */
		bioc->full_stripe_logical = map->start +
			btrfs_stripe_nr_to_offset(io_geom.stripe_nr *
						  nr_data_stripes(map));
		for (int i = 0; i < io_geom.num_stripes; i++) {
			struct btrfs_io_stripe *dst = &bioc->stripes[i];
			u32 stripe_index;

			stripe_index = (i + io_geom.stripe_nr) % io_geom.num_stripes;
			dst->dev = map->stripes[stripe_index].dev;
			dst->physical =
				map->stripes[stripe_index].physical +
				io_geom.stripe_offset +
				btrfs_stripe_nr_to_offset(io_geom.stripe_nr);
		}
	} else {
		/*
		 * For all other non-RAID56 profiles, just copy the target
		 * stripe into the bioc.
		 */
		for (int i = 0; i < io_geom.num_stripes; i++) {
			ret = set_io_stripe(fs_info, logical, length,
					    &bioc->stripes[i], map, &io_geom);
			if (ret < 0)
				break;
			io_geom.stripe_index++;
		}
	}

	if (ret) {
		*bioc_ret = NULL;
		btrfs_put_bioc(bioc);
		goto out;
	}

	if (op != BTRFS_MAP_READ)
		io_geom.max_errors = btrfs_chunk_max_errors(map);

	if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
	    op != BTRFS_MAP_READ) {
		handle_ops_on_dev_replace(bioc, dev_replace, logical, &io_geom);
	}

	*bioc_ret = bioc;
	bioc->num_stripes = io_geom.num_stripes;
	bioc->max_errors = io_geom.max_errors;
	bioc->mirror_num = io_geom.mirror_num;

out:
	if (dev_replace_is_ongoing) {
		lockdep_assert_held(&dev_replace->rwsem);
		/* Unlock and let waiting writers proceed */
		up_read(&dev_replace->rwsem);
	}
	btrfs_free_chunk_map(map);
	return ret;
}

static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args,
				      const struct btrfs_fs_devices *fs_devices)
{
	if (args->fsid == NULL)
		return true;
	if (memcmp(fs_devices->metadata_uuid, args->fsid, BTRFS_FSID_SIZE) == 0)
		return true;
	return false;
}

static bool dev_args_match_device(const struct btrfs_dev_lookup_args *args,
				  const struct btrfs_device *device)
{
	if (args->missing) {
		if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state) &&
		    !device->bdev)
			return true;
		return false;
	}

	if (device->devid != args->devid)
		return false;
	if (args->uuid && memcmp(device->uuid, args->uuid, BTRFS_UUID_SIZE) != 0)
		return false;
	return true;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
struct btrfs_device *btrfs_find_device(const struct btrfs_fs_devices *fs_devices,
				       const struct btrfs_dev_lookup_args *args)
{
	struct btrfs_device *device;
	struct btrfs_fs_devices *seed_devs;

	if (dev_args_match_fs_devices(args, fs_devices)) {
		list_for_each_entry(device, &fs_devices->devices, dev_list) {
			if (dev_args_match_device(args, device))
				return device;
		}
	}

	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		if (!dev_args_match_fs_devices(args, seed_devs))
			continue;
		list_for_each_entry(device, &seed_devs->devices, dev_list) {
			if (dev_args_match_device(args, device))
				return device;
		}
	}

	return NULL;
}

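/*
 * Create a stub btrfs_device for a device that is referenced by the metadata
 * but not currently present, and account it as missing in @fs_devices.
 */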
static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
					    u64 devid, u8 *dev_uuid)
{
	struct btrfs_device *device;
	unsigned int nofs_flag;

	/*
	 * We call this under the chunk_mutex, so we want to use NOFS for this
	 * allocation, however we don't want to change btrfs_alloc_device() to
	 * always do NOFS because we use it in a lot of other GFP_KERNEL safe
	 * places.
	 */

	nofs_flag = memalloc_nofs_save();
	device = btrfs_alloc_device(NULL, &devid, dev_uuid, NULL);
	memalloc_nofs_restore(nofs_flag);
	if (IS_ERR(device))
		return device;

	list_add(&device->dev_list, &fs_devices->devices);
	device->fs_devices = fs_devices;
	fs_devices->num_devices++;

	set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
	fs_devices->missing_devices++;

	return device;
}

/*
 * Allocate new device struct, set up devid and UUID.
 *
 * @fs_info:	used only for generating a new devid, can be NULL if
 *		devid is provided (i.e. @devid != NULL).
 * @devid:	a pointer to devid for this device.  If NULL a new devid
 *		is generated.
 * @uuid:	a pointer to UUID for this device.  If NULL a new UUID
 *		is generated.
 * @path:	a pointer to device path if available, NULL otherwise.
 *
 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
 * on error.  Returned struct is not linked onto any lists and must be
 * destroyed with btrfs_free_device.
 */
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
					const u64 *devid, const u8 *uuid,
					const char *path)
{
	struct btrfs_device *dev;
	u64 tmp;

	if (WARN_ON(!devid && !fs_info))
		return ERR_PTR(-EINVAL);

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&dev->dev_list);
	INIT_LIST_HEAD(&dev->dev_alloc_list);
	INIT_LIST_HEAD(&dev->post_commit_list);

	atomic_set(&dev->dev_stats_ccnt, 0);
	btrfs_device_data_ordered_init(dev);
	extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE);

	if (devid)
		tmp = *devid;
	else {
		int ret;

		ret = find_next_devid(fs_info, &tmp);
		if (ret) {
			btrfs_free_device(dev);
			return ERR_PTR(ret);
		}
	}
	dev->devid = tmp;

	if (uuid)
		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
	else
		generate_random_uuid(dev->uuid);

	if (path) {
		struct rcu_string *name;

		name = rcu_string_strdup(path, GFP_KERNEL);
		if (!name) {
			btrfs_free_device(dev);
			return ERR_PTR(-ENOMEM);
		}
		rcu_assign_pointer(dev->name, name);
	}

	return dev;
}
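
/*
 * Illustrative sketch (comment only, not compiled): the usual call pattern
 * for btrfs_alloc_device().  Error handling mirrors what add_missing_dev()
 * above does; the path argument here is hypothetical.
 *
 *	struct btrfs_device *device;
 *
 *	device = btrfs_alloc_device(fs_info, NULL, NULL, "/dev/sdX");
 *	if (IS_ERR(device))
 *		return PTR_ERR(device);
 *	...
 *	btrfs_free_device(device);
 */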

static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
					u64 devid, u8 *uuid, bool error)
{
	if (error)
		btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
			      devid, uuid);
	else
		btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
			      devid, uuid);
}

u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map)
{
	const int data_stripes = calc_data_stripes(map->type, map->num_stripes);

	return div_u64(map->chunk_len, data_stripes);
}
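
/*
 * Worked example (illustrative only): for a RAID5 chunk spanning 4 devices,
 * num_stripes is 4 and one stripe holds parity, so calc_data_stripes()
 * yields 3.  A chunk_len of 3GiB then gives a stripe length of 1GiB on each
 * device.  For RAID1 (2 stripes, one full data copy per device) the stripe
 * length equals chunk_len.
 */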

#if BITS_PER_LONG == 32
/*
 * Due to the page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
 * can't be accessed on 32bit systems.
 *
 * This function does a mount time check to reject the fs if it already has
 * a metadata chunk beyond that limit.
 */
static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
				  u64 logical, u64 length, u64 type)
{
	if (!(type & BTRFS_BLOCK_GROUP_METADATA))
		return 0;

	if (logical + length < MAX_LFS_FILESIZE)
		return 0;

	btrfs_err_32bit_limit(fs_info);
	return -EOVERFLOW;
}

/*
 * This is to give early warning for any metadata chunk reaching
 * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
 * Although we can still access the metadata, it's not going to be possible
 * once the limit is reached.
 */
static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
				  u64 logical, u64 length, u64 type)
{
	if (!(type & BTRFS_BLOCK_GROUP_METADATA))
		return;

	if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
		return;

	btrfs_warn_32bit_limit(fs_info);
}
#endif

static struct btrfs_device *handle_missing_device(struct btrfs_fs_info *fs_info,
						  u64 devid, u8 *uuid)
{
	struct btrfs_device *dev;

	if (!btrfs_test_opt(fs_info, DEGRADED)) {
		btrfs_report_missing_device(fs_info, devid, uuid, true);
		return ERR_PTR(-ENOENT);
	}

	dev = add_missing_dev(fs_info->fs_devices, devid, uuid);
	if (IS_ERR(dev)) {
		btrfs_err(fs_info, "failed to init missing device %llu: %ld",
			  devid, PTR_ERR(dev));
		return dev;
	}
	btrfs_report_missing_device(fs_info, devid, uuid, false);

	return dev;
}

static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
			  struct btrfs_chunk *chunk)
{
	BTRFS_DEV_LOOKUP_ARGS(args);
	struct btrfs_fs_info *fs_info = leaf->fs_info;
	struct btrfs_chunk_map *map;
	u64 logical;
	u64 length;
	u64 devid;
	u64 type;
	u8 uuid[BTRFS_UUID_SIZE];
	int index;
	int num_stripes;
	int ret;
	int i;

	logical = key->offset;
	length = btrfs_chunk_length(leaf, chunk);
	type = btrfs_chunk_type(leaf, chunk);
	index = btrfs_bg_flags_to_raid_index(type);
	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);

#if BITS_PER_LONG == 32
	ret = check_32bit_meta_chunk(fs_info, logical, length, type);
	if (ret < 0)
		return ret;
	warn_32bit_meta_chunk(fs_info, logical, length, type);
#endif

	/*
	 * Only need to verify chunk item if we're reading from sys chunk array,
	 * as chunk item in tree block is already verified by tree-checker.
	 */
	if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
6908
		ret = btrfs_check_chunk_valid(leaf, chunk, logical);
6909 6910 6911
		if (ret)
			return ret;
	}
6912

6913
	map = btrfs_find_chunk_map(fs_info, logical, 1);
6914 6915

	/* already mapped? */
6916 6917
	if (map && map->start <= logical && map->start + map->chunk_len > logical) {
		btrfs_free_chunk_map(map);
6918
		return 0;
6919 6920
	} else if (map) {
		btrfs_free_chunk_map(map);
6921 6922
	}

6923 6924
	map = btrfs_alloc_chunk_map(num_stripes, GFP_NOFS);
	if (!map)
6925 6926
		return -ENOMEM;

6927 6928
	map->start = logical;
	map->chunk_len = length;
6929 6930 6931
	map->num_stripes = num_stripes;
	map->io_width = btrfs_chunk_io_width(leaf, chunk);
	map->io_align = btrfs_chunk_io_align(leaf, chunk);
6932
	map->type = type;
	/*
	 * We can't use the sub_stripes value, as for profiles other than
	 * RAID10, they may have 0 as sub_stripes for filesystems created by
	 * older mkfs (<v5.4).
	 * In that case, it can cause divide-by-zero errors later.
	 * Since currently sub_stripes is fixed for each profile, let's
	 * use the trusted value instead.
	 */
	map->sub_stripes = btrfs_raid_array[index].sub_stripes;
6942
	map->verified_stripes = 0;
6943
	map->stripe_size = btrfs_calc_stripe_length(map);
6944 6945 6946 6947
	for (i = 0; i < num_stripes; i++) {
		map->stripes[i].physical =
			btrfs_stripe_offset_nr(leaf, chunk, i);
		devid = btrfs_stripe_devid_nr(leaf, chunk, i);
6948
		args.devid = devid;
6949 6950 6951
		read_extent_buffer(leaf, uuid, (unsigned long)
				   btrfs_stripe_dev_uuid_nr(chunk, i),
				   BTRFS_UUID_SIZE);
6952 6953
		args.uuid = uuid;
		map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices, &args);
6954
		if (!map->stripes[i].dev) {
6955 6956
			map->stripes[i].dev = handle_missing_device(fs_info,
								    devid, uuid);
6957
			if (IS_ERR(map->stripes[i].dev)) {
6958
				ret = PTR_ERR(map->stripes[i].dev);
6959
				btrfs_free_chunk_map(map);
6960
				return ret;
6961 6962
			}
		}
6963

6964 6965
		set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
				&(map->stripes[i].dev->dev_state));
6966 6967
	}

6968
	ret = btrfs_add_chunk_map(fs_info, map);
6969 6970 6971
	if (ret < 0) {
		btrfs_err(fs_info,
			  "failed to add chunk map, start=%llu len=%llu: %d",
6972
			  map->start, map->chunk_len, ret);
6973
	}
6974

6975
	return ret;
6976 6977
}

6978
static void fill_device_from_item(struct extent_buffer *leaf,
				 struct btrfs_dev_item *dev_item,
				 struct btrfs_device *device)
{
	unsigned long ptr;

	device->devid = btrfs_device_id(leaf, dev_item);
6985 6986
	device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
	device->total_bytes = device->disk_total_bytes;
6987
	device->commit_total_bytes = device->disk_total_bytes;
6988
	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6989
	device->commit_bytes_used = device->bytes_used;
6990 6991 6992 6993
	device->type = btrfs_device_type(leaf, dev_item);
	device->io_align = btrfs_device_io_align(leaf, dev_item);
	device->io_width = btrfs_device_io_width(leaf, dev_item);
	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
6994
	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
6995
	clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
6996

6997
	ptr = btrfs_device_uuid(dev_item);
6998
	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
6999 7000
}

7001
static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
7002
						  u8 *fsid)
{
	struct btrfs_fs_devices *fs_devices;
	int ret;

7007
	lockdep_assert_held(&uuid_mutex);
7008
	ASSERT(fsid);

7010
	/* This will match only for multi-device seed fs */
7011
	list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
7012
		if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
7013 7014
			return fs_devices;


7016
	fs_devices = find_fsid(fsid, NULL);
	if (!fs_devices) {
7018
		if (!btrfs_test_opt(fs_info, DEGRADED))
7019 7020
			return ERR_PTR(-ENOENT);

7021
		fs_devices = alloc_fs_devices(fsid);
7022 7023 7024
		if (IS_ERR(fs_devices))
			return fs_devices;

7025
		fs_devices->seeding = true;
7026 7027
		fs_devices->opened = 1;
		return fs_devices;
	}

7030 7031 7032 7033
	/*
	 * Upon first call for a seed fs fsid, just create a private copy of the
	 * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
	 */
	fs_devices = clone_fs_devices(fs_devices);
7035 7036
	if (IS_ERR(fs_devices))
		return fs_devices;

7038
	ret = open_fs_devices(fs_devices, BLK_OPEN_READ, fs_info->bdev_holder);
7039 7040
	if (ret) {
		free_fs_devices(fs_devices);
7041
		return ERR_PTR(ret);
7042
	}

	if (!fs_devices->seeding) {
7045
		close_fs_devices(fs_devices);
		free_fs_devices(fs_devices);
7047
		return ERR_PTR(-EINVAL);
	}

7050
	list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
7051

7052
	return fs_devices;
}

7055
static int read_one_dev(struct extent_buffer *leaf,
7056 7057
			struct btrfs_dev_item *dev_item)
{
7058
	BTRFS_DEV_LOOKUP_ARGS(args);
7059
	struct btrfs_fs_info *fs_info = leaf->fs_info;
7060
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7061 7062 7063
	struct btrfs_device *device;
	u64 devid;
	int ret;
7064
	u8 fs_uuid[BTRFS_FSID_SIZE];
7065 7066
	u8 dev_uuid[BTRFS_UUID_SIZE];

7067 7068
	devid = btrfs_device_id(leaf, dev_item);
	args.devid = devid;
7069
	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
7070
			   BTRFS_UUID_SIZE);
7071
	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
7072
			   BTRFS_FSID_SIZE);
7073 7074
	args.uuid = dev_uuid;
	args.fsid = fs_uuid;

7076
	if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
7077
		fs_devices = open_seed_devices(fs_info, fs_uuid);
7078 7079
		if (IS_ERR(fs_devices))
			return PTR_ERR(fs_devices);
	}

7082
	device = btrfs_find_device(fs_info->fs_devices, &args);
7083
	if (!device) {
7084
		if (!btrfs_test_opt(fs_info, DEGRADED)) {
7085 7086
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, true);
7087
			return -ENOENT;
7088
		}

7090
		device = add_missing_dev(fs_devices, devid, dev_uuid);
		if (IS_ERR(device)) {
			btrfs_err(fs_info,
				"failed to add missing dev %llu: %ld",
				devid, PTR_ERR(device));
			return PTR_ERR(device);
		}
7097
		btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
7098
	} else {
7099
		if (!device->bdev) {
7100 7101 7102
			if (!btrfs_test_opt(fs_info, DEGRADED)) {
				btrfs_report_missing_device(fs_info,
						devid, dev_uuid, true);
7103
				return -ENOENT;
7104 7105 7106
			}
			btrfs_report_missing_device(fs_info, devid,
							dev_uuid, false);
7107
		}
7108

7109 7110
		if (!device->bdev &&
		    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
			/*
			 * This happens when a device that was properly set up
			 * in the device info lists suddenly goes bad.
			 * device->bdev is NULL, so we have to set the
			 * device's BTRFS_DEV_STATE_MISSING bit here.
			 */
7117
			device->fs_devices->missing_devices++;
7118
			set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
		}
7120 7121 7122

		/* Move the device to its own fs_devices */
		if (device->fs_devices != fs_devices) {
7123 7124
			ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
							&device->dev_state));

			list_move(&device->dev_list, &fs_devices->devices);
			device->fs_devices->num_devices--;
			fs_devices->num_devices++;

			device->fs_devices->missing_devices--;
			fs_devices->missing_devices++;

			device->fs_devices = fs_devices;
		}
	}

7137
	if (device->fs_devices != fs_info->fs_devices) {
7138
		BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
		if (device->generation !=
		    btrfs_device_generation(leaf, dev_item))
			return -EINVAL;
7142
	}
7143 7144

	fill_device_from_item(leaf, dev_item, device);
7145
	if (device->bdev) {
7146
		u64 max_total_bytes = bdev_nr_bytes(device->bdev);

		if (device->total_bytes > max_total_bytes) {
			btrfs_err(fs_info,
			"device total_bytes should be at most %llu but found %llu",
				  max_total_bytes, device->total_bytes);
			return -EINVAL;
		}
	}
7155
	set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
7156
	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
7157
	   !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
		device->fs_devices->total_rw_bytes += device->total_bytes;
7159 7160
		atomic64_add(device->total_bytes - device->bytes_used,
				&fs_info->free_chunk_space);
7161
	}
	return 0;
}

7166
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
7167
{
7168
	struct btrfs_super_block *super_copy = fs_info->super_copy;
7169
	struct extent_buffer *sb;
7170 7171
	struct btrfs_disk_key *disk_key;
	struct btrfs_chunk *chunk;
7172 7173
	u8 *array_ptr;
	unsigned long sb_array_offset;
7174
	int ret = 0;
7175 7176 7177
	u32 num_stripes;
	u32 array_size;
	u32 len = 0;
7178
	u32 cur_offset;
7179
	u64 type;
7180
	struct btrfs_key key;
7181

7182
	ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
7183

7184
	/*
	 * We allocated a dummy extent buffer, just to use the extent buffer
	 * accessors.  There will be unused space after BTRFS_SUPER_INFO_SIZE,
	 * but that's fine, we will not go beyond the system chunk array anyway.
	 */
7189 7190 7191
	sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
	if (!sb)
		return -ENOMEM;
7192
	set_extent_buffer_uptodate(sb);
7193

7194
	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
7195 7196
	array_size = btrfs_super_sys_array_size(super_copy);

7197 7198 7199
	array_ptr = super_copy->sys_chunk_array;
	sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
	cur_offset = 0;
7200

7201 7202
	while (cur_offset < array_size) {
		disk_key = (struct btrfs_disk_key *)array_ptr;
7203 7204 7205 7206
		len = sizeof(*disk_key);
		if (cur_offset + len > array_size)
			goto out_short_read;

7207 7208
		btrfs_disk_key_to_cpu(&key, disk_key);

7209 7210 7211
		array_ptr += len;
		sb_array_offset += len;
		cur_offset += len;
7212

		if (key.type != BTRFS_CHUNK_ITEM_KEY) {
			btrfs_err(fs_info,
			    "unexpected item type %u in sys_array at offset %u",
				  (u32)key.type, cur_offset);
			ret = -EIO;
			break;
		}
7220

		chunk = (struct btrfs_chunk *)sb_array_offset;
		/*
		 * At least one btrfs_chunk with one stripe must be present,
		 * exact stripe count check comes afterwards
		 */
		len = btrfs_chunk_item_size(1);
		if (cur_offset + len > array_size)
			goto out_short_read;
7229

		num_stripes = btrfs_chunk_num_stripes(sb, chunk);
		if (!num_stripes) {
			btrfs_err(fs_info,
			"invalid number of stripes %u in sys_array at offset %u",
				  num_stripes, cur_offset);
			ret = -EIO;
			break;
		}
7238

7239 7240
		type = btrfs_chunk_type(sb, chunk);
		if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
7241
			btrfs_err(fs_info,
7242 7243
			"invalid chunk type %llu in sys_array at offset %u",
				  type, cur_offset);
7244 7245
			ret = -EIO;
			break;
7246
		}

		len = btrfs_chunk_item_size(num_stripes);
		if (cur_offset + len > array_size)
			goto out_short_read;

		ret = read_one_chunk(&key, sb, chunk);
		if (ret)
			break;

7256 7257 7258
		array_ptr += len;
		sb_array_offset += len;
		cur_offset += len;
7259
	}
7260
	clear_extent_buffer_uptodate(sb);
7261
	free_extent_buffer_stale(sb);
7262
	return ret;
7263 7264

out_short_read:
7265
	btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
7266
			len, cur_offset);
7267
	clear_extent_buffer_uptodate(sb);
7268
	free_extent_buffer_stale(sb);
7269
	return -EIO;
7270 7271
}

/*
 * Check if all chunks in the fs are OK for read-write degraded mount
 *
 * If the @failing_dev is specified, it's accounted as missing.
 *
 * Return true if all chunks meet the minimal RW mount requirements.
 * Return false if any chunk doesn't meet the minimal RW mount requirements.
 */
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
					struct btrfs_device *failing_dev)
{
7283 7284
	struct btrfs_chunk_map *map;
	u64 next_start;
7285 7286
	bool ret = true;

7287
	map = btrfs_find_chunk_map(fs_info, 0, U64_MAX);
7288
	/* No chunk at all? Return false anyway */
7289
	if (!map) {
7290 7291 7292
		ret = false;
		goto out;
	}
7293
	while (map) {
		int missing = 0;
		int max_tolerated;
		int i;

		max_tolerated =
			btrfs_get_num_tolerated_disk_barrier_failures(
					map->type);
		for (i = 0; i < map->num_stripes; i++) {
			struct btrfs_device *dev = map->stripes[i].dev;

7304 7305
			if (!dev || !dev->bdev ||
			    test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
7306 7307
			    dev->last_flush_error)
				missing++;
7308 7309
			else if (failing_dev && failing_dev == dev)
				missing++;
7310 7311
		}
		if (missing > max_tolerated) {
7312 7313
			if (!failing_dev)
				btrfs_warn(fs_info,
7314
	"chunk %llu missing %d devices, max tolerance is %d for writable mount",
7315 7316
				   map->start, missing, max_tolerated);
			btrfs_free_chunk_map(map);
7317 7318 7319
			ret = false;
			goto out;
		}
7320 7321
		next_start = map->start + map->chunk_len;
		btrfs_free_chunk_map(map);
7322

7323
		map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start);
	}
out:
	return ret;
}
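
/*
 * Illustrative example of the check above: with the tolerated_failures
 * values from btrfs_raid_array, a RAID1 chunk survives one missing device
 * and a RAID6 chunk survives two, while RAID0/SINGLE/DUP chunks tolerate
 * none.  So a filesystem containing a RAID1 chunk that lost both of its
 * devices makes this function return false and the degraded RW mount is
 * refused.
 */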

static void readahead_tree_node_children(struct extent_buffer *node)
{
	int i;
	const int nr_items = btrfs_header_nritems(node);

7334 7335
	for (i = 0; i < nr_items; i++)
		btrfs_readahead_node_child(node, i);
7336 7337
}

7338
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
7339
{
7340
	struct btrfs_root *root = fs_info->chunk_root;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;
	int slot;
7347
	int iter_ret = 0;
7348
	u64 total_dev = 0;
7349
	u64 last_ra_node = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

7355 7356 7357 7358
	/*
	 * The uuid_mutex is needed only if we are mounting a sprout FS;
	 * otherwise we don't need it.
	 */
	mutex_lock(&uuid_mutex);

	/*
	 * It is possible for mount and umount to race in such a way that
	 * we execute this code path, but open_fs_devices failed to clear
	 * total_rw_bytes. We certainly want it cleared before reading the
	 * device items, so clear it here.
	 */
	fs_info->fs_devices->total_rw_bytes = 0;

	/*
	 * Lockdep complains about possible circular locking dependency between
	 * a disk's open_mutex (struct gendisk.open_mutex), the rw semaphores
	 * used for freeze protection of a fs (struct super_block.s_writers),
	 * which we take when starting a transaction, and extent buffers of the
	 * chunk tree if we call read_one_dev() while holding a lock on an
	 * extent buffer of the chunk tree. Since we are mounting the filesystem
	 * and at this point there can't be any concurrent task modifying the
	 * chunk tree, to keep it simple, just skip locking on the chunk tree.
	 */
	ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
	path->skip_locking = 1;

	/*
	 * Read all device items, and then all the chunk items. All
	 * device items are found before any chunk item (their object id
	 * is smaller than the lowest possible object id for a chunk
	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
7387 7388 7389 7390
	 */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.offset = 0;
	key.type = 0;
7391 7392
	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
		struct extent_buffer *node = path->nodes[1];
7393

7394 7395
		leaf = path->nodes[0];
		slot = path->slots[0];
7396

		if (node) {
			if (last_ra_node != node->start) {
				readahead_tree_node_children(node);
				last_ra_node = node->start;
			}
		}
7403 7404 7405
		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
			struct btrfs_dev_item *dev_item;
			dev_item = btrfs_item_ptr(leaf, slot,
7406
						  struct btrfs_dev_item);
7407
			ret = read_one_dev(leaf, dev_item);
7408 7409
			if (ret)
				goto error;
7410
			total_dev++;
7411 7412
		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
			struct btrfs_chunk *chunk;

			/*
			 * We are only called at mount time, so no need to take
			 * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
			 * we always lock first fs_info->chunk_mutex before
			 * acquiring any locks on the chunk tree. This is a
			 * requirement for chunk allocation, see the comment on
			 * top of btrfs_chunk_alloc() for details.
			 */
7422
			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
7423
			ret = read_one_chunk(&found_key, leaf, chunk);
			if (ret)
				goto error;
7426
		}
	}
	/* Catch error found during iteration */
	if (iter_ret < 0) {
		ret = iter_ret;
		goto error;
7432
	}

	/*
	 * After loading chunk tree, we've got all device information,
	 * do another round of validation checks.
	 */
7438
	if (total_dev != fs_info->fs_devices->total_devices) {
7439 7440
		btrfs_warn(fs_info,
"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
7441
			  btrfs_super_num_devices(fs_info->super_copy),
7442
			  total_dev);
7443 7444
		fs_info->fs_devices->total_devices = total_dev;
		btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
7445
	}
7446 7447 7448
	if (btrfs_super_total_bytes(fs_info->super_copy) <
	    fs_info->fs_devices->total_rw_bytes) {
		btrfs_err(fs_info,
7449
	"super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
7450 7451
			  btrfs_super_total_bytes(fs_info->super_copy),
			  fs_info->fs_devices->total_rw_bytes);
7452 7453 7454
		ret = -EINVAL;
		goto error;
	}
7455 7456
	ret = 0;
error:
7457 7458
	mutex_unlock(&uuid_mutex);

	btrfs_free_path(path);
7460 7461
	return ret;
}
7462

7463
int btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
7464
{
7465
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7466
	struct btrfs_device *device;
7467
	int ret = 0;
7468

	fs_devices->fs_info = fs_info;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list)
		device->fs_info = fs_info;

	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7476
		list_for_each_entry(device, &seed_devs->devices, dev_list) {
7477
			device->fs_info = fs_info;
7478 7479 7480 7481
			ret = btrfs_get_dev_zone_info(device, false);
			if (ret)
				break;
		}
7482

7483
		seed_devs->fs_info = fs_info;
7484
	}
7485
	mutex_unlock(&fs_devices->device_list_mutex);
7486 7487

	return ret;
7488 7489
}

static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
				 const struct btrfs_dev_stats_item *ptr,
				 int index)
{
	u64 val;

	read_extent_buffer(eb, &val,
			   offsetof(struct btrfs_dev_stats_item, values) +
			    ((unsigned long)ptr) + (index * sizeof(u64)),
			   sizeof(val));
	return val;
}

static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
				      struct btrfs_dev_stats_item *ptr,
				      int index, u64 val)
{
	write_extent_buffer(eb, &val,
			    offsetof(struct btrfs_dev_stats_item, values) +
			     ((unsigned long)ptr) + (index * sizeof(u64)),
			    sizeof(val));
}

7513 7514
static int btrfs_device_init_dev_stats(struct btrfs_device *device,
				       struct btrfs_path *path)
7515
{
7516
	struct btrfs_dev_stats_item *ptr;
7517
	struct extent_buffer *eb;
7518 7519 7520 7521
	struct btrfs_key key;
	int item_size;
	int i, ret, slot;

7522 7523 7524
	if (!device->fs_info->dev_root)
		return 0;

	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
	key.offset = device->devid;
	ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
	if (ret) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			btrfs_dev_stat_set(device, i, 0);
		device->dev_stats_valid = 1;
		btrfs_release_path(path);
7534
		return ret < 0 ? ret : 0;
7535 7536 7537
	}
	slot = path->slots[0];
	eb = path->nodes[0];
7538
	item_size = btrfs_item_size(eb, slot);

	ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
		if (item_size >= (1 + i) * sizeof(__le64))
			btrfs_dev_stat_set(device, i,
					   btrfs_dev_stats_value(eb, ptr, i));
		else
			btrfs_dev_stat_set(device, i, 0);
	}

	device->dev_stats_valid = 1;
	btrfs_dev_stat_print_on_load(device);
	btrfs_release_path(path);
7553 7554

	return 0;
}

int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
{
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7560 7561
	struct btrfs_device *device;
	struct btrfs_path *path = NULL;
7562
	int ret = 0;
7563 7564

	path = btrfs_alloc_path();
7565 7566
	if (!path)
		return -ENOMEM;
7567 7568

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
		ret = btrfs_device_init_dev_stats(device, path);
		if (ret)
			goto out;
	}
7574
	list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
		list_for_each_entry(device, &seed_devs->devices, dev_list) {
			ret = btrfs_device_init_dev_stats(device, path);
			if (ret)
				goto out;
		}
7580
	}
7581
out:
7582 7583 7584
	mutex_unlock(&fs_devices->device_list_mutex);

	btrfs_free_path(path);
7585
	return ret;
}

static int update_dev_stat_item(struct btrfs_trans_handle *trans,
				struct btrfs_device *device)
{
7591
	struct btrfs_fs_info *fs_info = trans->fs_info;
7592
	struct btrfs_root *dev_root = fs_info->dev_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct extent_buffer *eb;
	struct btrfs_dev_stats_item *ptr;
	int ret;
	int i;

7600 7601
	key.objectid = BTRFS_DEV_STATS_OBJECTID;
	key.type = BTRFS_PERSISTENT_ITEM_KEY;
7602 7603 7604
	key.offset = device->devid;

	path = btrfs_alloc_path();
7605 7606
	if (!path)
		return -ENOMEM;
7607 7608
	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
	if (ret < 0) {
7609
		btrfs_warn_in_rcu(fs_info,
7610
			"error %d while searching for dev_stats item for device %s",
7611
				  ret, btrfs_dev_name(device));
7612 7613 7614 7615
		goto out;
	}

	if (ret == 0 &&
7616
	    btrfs_item_size(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
7617 7618 7619
		/* need to delete old one and insert a new one */
		ret = btrfs_del_item(trans, dev_root, path);
		if (ret != 0) {
7620
			btrfs_warn_in_rcu(fs_info,
7621
				"delete too small dev_stats item for device %s failed %d",
7622
					  btrfs_dev_name(device), ret);
			goto out;
		}
		ret = 1;
	}

	if (ret == 1) {
		/* need to insert a new item */
		btrfs_release_path(path);
		ret = btrfs_insert_empty_item(trans, dev_root, path,
					      &key, sizeof(*ptr));
		if (ret < 0) {
7634
			btrfs_warn_in_rcu(fs_info,
7635
				"insert dev_stats item for device %s failed %d",
7636
				btrfs_dev_name(device), ret);
			goto out;
		}
	}

	eb = path->nodes[0];
	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		btrfs_set_dev_stats_value(eb, ptr, i,
					  btrfs_dev_stat_read(device, i));
7646
	btrfs_mark_buffer_dirty(trans, eb);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * called from commit_transaction. Writes all changed device stats to disk.
 */
7656
int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
7657
{
7658
	struct btrfs_fs_info *fs_info = trans->fs_info;
7659 7660
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
	struct btrfs_device *device;
7661
	int stats_cnt;
7662 7663 7664 7665
	int ret = 0;

	mutex_lock(&fs_devices->device_list_mutex);
	list_for_each_entry(device, &fs_devices->devices, dev_list) {
7666 7667
		stats_cnt = atomic_read(&device->dev_stats_ccnt);
		if (!device->dev_stats_valid || stats_cnt == 0)
7668 7669
			continue;


		/*
		 * There is a LOAD-LOAD control dependency between the value of
		 * dev_stats_ccnt and updating the on-disk values which requires
		 * reading the in-memory counters. Such control dependencies
		 * require explicit read memory barriers.
		 *
		 * This memory barrier pairs with smp_mb__before_atomic() in
		 * btrfs_dev_stat_inc()/btrfs_dev_stat_set() and with the full
		 * barrier implied by atomic_xchg() in
		 * btrfs_dev_stats_read_and_reset().
		 */
		smp_rmb();

7684
		ret = update_dev_stat_item(trans, device);
7685
		if (!ret)
7686
			atomic_sub(stats_cnt, &device->dev_stats_ccnt);
	}
	mutex_unlock(&fs_devices->device_list_mutex);

	return ret;
}

7693 7694 7695 7696
void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
{
	btrfs_dev_stat_inc(dev, index);

7697 7698
	if (!dev->dev_stats_valid)
		return;
7699
	btrfs_err_rl_in_rcu(dev->fs_info,
7700
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7701
			   btrfs_dev_name(dev),
7702 7703 7704
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7705 7706
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7707
}
7708

7709 7710
static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
{
	int i;

	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
		if (btrfs_dev_stat_read(dev, i) != 0)
			break;
	if (i == BTRFS_DEV_STAT_VALUES_MAX)
		return; /* all values == 0, suppress message */

7719
	btrfs_info_in_rcu(dev->fs_info,
7720
		"bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7721
	       btrfs_dev_name(dev),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
}

7729
int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7730
			struct btrfs_ioctl_get_dev_stats *stats)
7731
{
7732
	BTRFS_DEV_LOOKUP_ARGS(args);
7733
	struct btrfs_device *dev;
7734
	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7735 7736 7737
	int i;

	mutex_lock(&fs_devices->device_list_mutex);
7738 7739
	args.devid = stats->devid;
	dev = btrfs_find_device(fs_info->fs_devices, &args);
7740 7741 7742
	mutex_unlock(&fs_devices->device_list_mutex);

	if (!dev) {
7743
		btrfs_warn(fs_info, "get dev_stats failed, device not found");
7744
		return -ENODEV;
7745
	} else if (!dev->dev_stats_valid) {
7746
		btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7747
		return -ENODEV;
7748
	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
			if (stats->nr_items > i)
				stats->values[i] =
					btrfs_dev_stat_read_and_reset(dev, i);
			else
7754
				btrfs_dev_stat_set(dev, i, 0);
7755
		}
7756 7757
		btrfs_info(fs_info, "device stats zeroed by %s (%d)",
			   current->comm, task_pid_nr(current));
	} else {
		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
			if (stats->nr_items > i)
				stats->values[i] = btrfs_dev_stat_read(dev, i);
	}
	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
	return 0;
}
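
/*
 * Illustrative sketch (not compiled): how user space typically reaches this
 * function through the BTRFS_IOC_GET_DEV_STATS ioctl.  The fd is an open
 * descriptor of any file on the filesystem; devid 1 is just an example.
 *
 *	struct btrfs_ioctl_get_dev_stats args = { 0 };
 *
 *	args.devid = 1;
 *	args.nr_items = BTRFS_DEV_STAT_VALUES_MAX;
 *	if (ioctl(fd, BTRFS_IOC_GET_DEV_STATS, &args) == 0)
 *		printf("write errors: %llu\n",
 *		       (unsigned long long)args.values[BTRFS_DEV_STAT_WRITE_ERRS]);
 */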

7768
/*
 * Update the size and bytes used for each device where it changed.  This is
 * delayed since we would otherwise get errors while writing out the
 * superblocks.
 *
 * Must be invoked during transaction commit.
 */
7775
void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
7776 7777 7778
{
	struct btrfs_device *curr, *next;

7779
	ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7780

7781
	if (list_empty(&trans->dev_update_list))
7782 7783
		return;

	/*
	 * We don't need the device_list_mutex here.  This list is owned by the
	 * transaction and the transaction must complete before the device is
	 * released.
	 */
	mutex_lock(&trans->fs_info->chunk_mutex);
	list_for_each_entry_safe(curr, next, &trans->dev_update_list,
				 post_commit_list) {
		list_del_init(&curr->post_commit_list);
		curr->commit_total_bytes = curr->disk_total_bytes;
		curr->commit_bytes_used = curr->bytes_used;
7795
	}
7796
	mutex_unlock(&trans->fs_info->chunk_mutex);
7797
}
7798

/*
 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
 */
int btrfs_bg_type_to_factor(u64 flags)
{
	const int index = btrfs_bg_flags_to_raid_index(flags);

	return btrfs_raid_array[index].ncopies;
}
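
/*
 * Worked example (illustrative): ncopies gives the raw-to-logical
 * multiplier, so btrfs_bg_type_to_factor() returns 2 for RAID1, RAID10 and
 * DUP, 3 for RAID1C3, 4 for RAID1C4 and 1 for RAID0/SINGLE.  A 1GiB RAID1
 * block group therefore consumes 2GiB of raw device space.
 */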



static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
				 u64 chunk_offset, u64 devid,
				 u64 physical_offset, u64 physical_len)
{
7815
	struct btrfs_dev_lookup_args args = { .devid = devid };
7816
	struct btrfs_chunk_map *map;
7817
	struct btrfs_device *dev;
	u64 stripe_len;
	bool found = false;
	int ret = 0;
	int i;

7823 7824
	map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
	if (!map) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
			  physical_offset, devid);
		ret = -EUCLEAN;
		goto out;
	}

7832
	stripe_len = btrfs_calc_stripe_length(map);
7833 7834 7835
	if (physical_len != stripe_len) {
		btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
7836
			  physical_offset, devid, map->start, physical_len,
			  stripe_len);
		ret = -EUCLEAN;
		goto out;
	}

	/*
	 * Very old mkfs.btrfs (before v4.1) will not respect the reserved
	 * space. Although the kernel can handle it without problems, it is
	 * better to warn the users.
	 */
	if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED)
		btrfs_warn(fs_info,
		"devid %llu physical %llu len %llu inside the reserved space",
			   devid, physical_offset, physical_len);

	for (i = 0; i < map->num_stripes; i++) {
		if (map->stripes[i].dev->devid == devid &&
		    map->stripes[i].physical == physical_offset) {
			found = true;
			if (map->verified_stripes >= map->num_stripes) {
				btrfs_err(fs_info,
				"too many dev extents for chunk %llu found",
7859
					  map->start);
				ret = -EUCLEAN;
				goto out;
			}
			map->verified_stripes++;
			break;
		}
	}
	if (!found) {
		btrfs_err(fs_info,
	"dev extent physical offset %llu devid %llu has no corresponding chunk",
			physical_offset, devid);
		ret = -EUCLEAN;
	}
7873

7874
	/* Make sure no dev extent is beyond device boundary */
7875
	dev = btrfs_find_device(fs_info->fs_devices, &args);
	if (!dev) {
		btrfs_err(fs_info, "failed to find devid %llu", devid);
		ret = -EUCLEAN;
		goto out;
	}
7881

	if (physical_offset + physical_len > dev->disk_total_bytes) {
		btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
			  devid, physical_offset, physical_len,
			  dev->disk_total_bytes);
		ret = -EUCLEAN;
		goto out;
	}

	if (dev->zone_info) {
		u64 zone_size = dev->zone_info->zone_size;

		if (!IS_ALIGNED(physical_offset, zone_size) ||
		    !IS_ALIGNED(physical_len, zone_size)) {
			btrfs_err(fs_info,
"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
				  devid, physical_offset, physical_len);
			ret = -EUCLEAN;
			goto out;
		}
	}

7904
out:
7905
	btrfs_free_chunk_map(map);
	return ret;
}

static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
	struct rb_node *node;
	int ret = 0;

	read_lock(&fs_info->mapping_tree_lock);
	for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
		struct btrfs_chunk_map *map;

		map = rb_entry(node, struct btrfs_chunk_map, rb_node);
		if (map->num_stripes != map->verified_stripes) {
7920 7921
			btrfs_err(fs_info,
			"chunk %llu has missing dev extent, have %d expect %d",
7922
				  map->start, map->verified_stripes, map->num_stripes);
			ret = -EUCLEAN;
			goto out;
		}
	}
out:
7928
	read_unlock(&fs_info->mapping_tree_lock);
	return ret;
}

/*
 * Ensure that all dev extents are mapped to the correct chunk, otherwise
 * later chunk allocation/free would cause unexpected behavior.
 *
 * NOTE: This will iterate through the whole device tree, which should be
 * roughly the same size as the chunk tree.  This slightly increases mount time.
 */
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
{
	struct btrfs_path *path;
	struct btrfs_root *root = fs_info->dev_root;
	struct btrfs_key key;
7944 7945
	u64 prev_devid = 0;
	u64 prev_dev_ext_end = 0;
7946 7947
	int ret = 0;

	/*
	 * We don't have a dev_root because we mounted with ignorebadroots and
	 * failed to load the root, so we want to skip the verification in this
	 * case for sure.
	 *
	 * However if the dev root is fine, but the tree itself is corrupted
	 * we'd still fail to mount.  This verification is only to make sure
	 * writes can happen safely, so instead just bypass this check
	 * completely in the case of IGNOREBADROOTS.
	 */
	if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
		return 0;

	key.objectid = 1;
	key.type = BTRFS_DEV_EXTENT_KEY;
	key.offset = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->reada = READA_FORWARD;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;

	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
7975
		ret = btrfs_next_leaf(root, path);
		if (ret < 0)
			goto out;
		/* No dev extents at all? Not good */
		if (ret > 0) {
			ret = -EUCLEAN;
			goto out;
		}
	}
	while (1) {
		struct extent_buffer *leaf = path->nodes[0];
		struct btrfs_dev_extent *dext;
		int slot = path->slots[0];
		u64 chunk_offset;
		u64 physical_offset;
		u64 physical_len;
		u64 devid;

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.type != BTRFS_DEV_EXTENT_KEY)
			break;
		devid = key.objectid;
		physical_offset = key.offset;

		dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
		chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
		physical_len = btrfs_dev_extent_length(leaf, dext);

		/* Check if this dev extent overlaps with the previous one */
		if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
			btrfs_err(fs_info,
"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
				  devid, physical_offset, prev_dev_ext_end);
			ret = -EUCLEAN;
			goto out;
		}

8012 8013 8014 8015
		ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
					    physical_offset, physical_len);
		if (ret < 0)
			goto out;
8016 8017 8018
		prev_devid = devid;
		prev_dev_ext_end = physical_offset + physical_len;

		ret = btrfs_next_item(root, path);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			ret = 0;
			break;
		}
	}

	/* Ensure all chunks have corresponding dev extents */
	ret = verify_chunk_dev_extent_mapping(fs_info);
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Check whether the given block group or device is pinned by any inode being
 * used as a swapfile.
 */
bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
{
	struct btrfs_swapfile_pin *sp;
	struct rb_node *node;

	spin_lock(&fs_info->swapfile_pins_lock);
	node = fs_info->swapfile_pins.rb_node;
	while (node) {
		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
		if (ptr < sp->ptr)
			node = node->rb_left;
		else if (ptr > sp->ptr)
			node = node->rb_right;
		else
			break;
	}
	spin_unlock(&fs_info->swapfile_pins_lock);
	return node != NULL;
}
8058 8059 8060

static int relocating_repair_kthread(void *data)
{
8061
	struct btrfs_block_group *cache = data;
	struct btrfs_fs_info *fs_info = cache->fs_info;
	u64 target;
	int ret = 0;

	target = cache->start;
	btrfs_put_block_group(cache);

8069
	sb_start_write(fs_info->sb);
8070 8071 8072 8073
	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
		btrfs_info(fs_info,
			   "zoned: skip relocating block group %llu to repair: EBUSY",
			   target);
8074
		sb_end_write(fs_info->sb);
8075 8076 8077
		return -EBUSY;
	}

8078
	mutex_lock(&fs_info->reclaim_bgs_lock);

	/* Ensure block group still exists */
	cache = btrfs_lookup_block_group(fs_info, target);
	if (!cache)
		goto out;

8085
	if (!test_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags))
		goto out;

	ret = btrfs_may_alloc_data_chunk(fs_info, target);
	if (ret < 0)
		goto out;

	btrfs_info(fs_info,
		   "zoned: relocating block group %llu to repair IO failure",
		   target);
	ret = btrfs_relocate_chunk(fs_info, target);

out:
	if (cache)
		btrfs_put_block_group(cache);
8100
	mutex_unlock(&fs_info->reclaim_bgs_lock);
8101
	btrfs_exclop_finish(fs_info);
8102
	sb_end_write(fs_info->sb);
8103 8104 8105 8106

	return ret;
}

8107
bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
8108 8109 8110
{
	struct btrfs_block_group *cache;

8111 8112 8113
	if (!btrfs_is_zoned(fs_info))
		return false;

8114 8115
	/* Do not attempt to repair in degraded state */
	if (btrfs_test_opt(fs_info, DEGRADED))
8116
		return true;
8117 8118 8119

	cache = btrfs_lookup_block_group(fs_info, logical);
	if (!cache)
8120
		return true;
8121

8122
	if (test_and_set_bit(BLOCK_GROUP_FLAG_RELOCATING_REPAIR, &cache->runtime_flags)) {
8123
		btrfs_put_block_group(cache);
8124
		return true;
	}

	kthread_run(relocating_repair_kthread, cache,
		    "btrfs-relocating-repair");

8130
	return true;
8131
}

static void map_raid56_repair_block(struct btrfs_io_context *bioc,
				    struct btrfs_io_stripe *smap,
				    u64 logical)
{
	int data_stripes = nr_bioc_data_stripes(bioc);
	int i;

	for (i = 0; i < data_stripes; i++) {
		u64 stripe_start = bioc->full_stripe_logical +
8142
				   btrfs_stripe_nr_to_offset(i);
8143 8144 8145 8146 8147 8148 8149 8150 8151 8152 8153 8154 8155 8156 8157 8158 8159 8160 8161 8162 8163 8164 8165 8166 8167 8168 8169 8170 8171 8172 8173 8174 8175 8176 8177 8178

		if (logical >= stripe_start &&
		    logical < stripe_start + BTRFS_STRIPE_LEN)
			break;
	}
	ASSERT(i < data_stripes);
	smap->dev = bioc->stripes[i].dev;
	smap->physical = bioc->stripes[i].physical +
			((logical - bioc->full_stripe_logical) &
			 BTRFS_STRIPE_LEN_MASK);
}
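
/*
 * Worked example (illustrative, assuming the default 64K BTRFS_STRIPE_LEN):
 * with full_stripe_logical = X and logical = X + 80K, the loop above picks
 * data stripe i = 1 because X + 64K <= logical < X + 128K.  The physical
 * location is then stripes[1].physical + (80K & BTRFS_STRIPE_LEN_MASK),
 * i.e. 16K into that device's stripe.
 */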

/*
 * Map a repair write into a single device.
 *
 * A repair write is triggered by read time repair or scrub, which would only
 * update the contents of a single device.
 * Not update any other mirrors nor go through RMW path.
 *
 * Callers should ensure:
 *
 * - Call btrfs_bio_counter_inc_blocked() first
 * - The range does not cross stripe boundary
 * - Has a valid @mirror_num passed in.
 */
int btrfs_map_repair_block(struct btrfs_fs_info *fs_info,
			   struct btrfs_io_stripe *smap, u64 logical,
			   u32 length, int mirror_num)
{
	struct btrfs_io_context *bioc = NULL;
	u64 map_length = length;
	int mirror_ret = mirror_num;
	int ret;

	ASSERT(mirror_num > 0);

8179
	ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length,
8180
			      &bioc, smap, &mirror_ret);
	if (ret < 0)
		return ret;

	/* The map range should not cross stripe boundary. */
	ASSERT(map_length >= length);

	/* Already mapped to single stripe. */
	if (!bioc)
		goto out;

	/* Map the RAID56 multi-stripe writes to a single one. */
	if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
		map_raid56_repair_block(bioc, smap, logical);
		goto out;
	}

	ASSERT(mirror_num <= bioc->num_stripes);
	smap->dev = bioc->stripes[mirror_num - 1].dev;
	smap->physical = bioc->stripes[mirror_num - 1].physical;
out:
	btrfs_put_bioc(bioc);
	ASSERT(smap->dev);
	return 0;
}
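
/*
 * Illustrative sketch (comment only): a repair path is expected to wrap the
 * call roughly like this, with logical/length describing the bad sector
 * range and mirror_num naming the copy to be rewritten:
 *
 *	struct btrfs_io_stripe smap = { 0 };
 *	int ret;
 *
 *	btrfs_bio_counter_inc_blocked(fs_info);
 *	ret = btrfs_map_repair_block(fs_info, &smap, logical, length, mirror_num);
 *	if (!ret)
 *		... write the corrected data to smap.dev at smap.physical ...
 *	btrfs_bio_counter_dec(fs_info);
 */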