// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2012 Alexander Block.  All rights reserved.
 */

#include <linux/bsearch.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/sort.h>
#include <linux/mount.h>
#include <linux/xattr.h>
#include <linux/posix_acl_xattr.h>
#include <linux/radix-tree.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/compat.h>
#include <linux/crc32c.h>
#include <linux/fsverity.h>

#include "send.h"
#include "ctree.h"
#include "backref.h"
#include "locking.h"
#include "disk-io.h"
#include "btrfs_inode.h"
#include "transaction.h"
#include "compression.h"
#include "print-tree.h"
#include "accessors.h"
#include "dir-item.h"
#include "file-item.h"
#include "ioctl.h"
#include "verity.h"
#include "lru_cache.h"

/*
 * Maximum number of references an extent can have in order for us to attempt to
 * issue clone operations instead of write operations. This currently exists to
 * avoid hitting limitations of the backreference walking code (taking a lot of
 * time and using too much memory for extents with a large number of references).
 */
#define SEND_MAX_EXTENT_REFS	1024

/*
 * A fs_path is a helper to dynamically build path names with unknown size.
 * It reallocates the internal buffer on demand.
 * It allows fast adding of path elements on the right side (normal path) and
 * fast adding to the left side (reversed path). A reversed path can also be
 * unreversed if needed.
 */
struct fs_path {
	union {
		struct {
			char *start;
			char *end;

			char *buf;
			unsigned short buf_len:15;
			unsigned short reversed:1;
			char inline_buf[];
		};
		/*
		 * Average path length does not exceed 200 bytes, so we'll have
		 * better packing in the slab and a higher chance to satisfy
		 * an allocation later during send.
		 */
		char pad[256];
	};
};
#define FS_PATH_INLINE_SIZE \
	(sizeof(struct fs_path) - offsetof(struct fs_path, inline_buf))
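
/*
 * Illustrative fs_path usage sketch (not part of the send code itself, error
 * handling omitted):
 *
 *	struct fs_path *p = fs_path_alloc();
 *
 *	fs_path_add(p, "dir", 3);	p->start is now "dir"
 *	fs_path_add(p, "file", 4);	p->start is now "dir/file"
 *	fs_path_free(p);
 *
 * A path from fs_path_alloc_reversed() grows to the left instead, so adding
 * "file" first and then "dir" also ends up as "dir/file", which is convenient
 * when building a path while walking INODE_REF items from an inode towards
 * the subvolume root (see iterate_inode_ref() below), and fs_path_unreverse()
 * can turn such a path back into a normal left-to-right buffer.
 */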


/* reused for each extent */
struct clone_root {
	struct btrfs_root *root;
	u64 ino;
	u64 offset;
	u64 num_bytes;
	bool found_ref;
};

#define SEND_MAX_NAME_CACHE_SIZE			256

/*
 * Limit the root_ids array of struct backref_cache_entry to 17 elements.
 * This makes the size of a cache entry exactly 192 bytes on x86_64, which
 * can be satisfied from the kmalloc-192 slab, without wasting any space.
 * The most common case is to have a single root for cloning, which corresponds
 * to the send root. Having the user specify more than 16 clone roots is not
 * common, and in such rare cases we simply don't use caching if the number of
 * cloning roots that lead down to a leaf is more than 17.
 */
#define SEND_MAX_BACKREF_CACHE_ROOTS			17

/*
 * Max number of entries in the cache.
 * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, the size in bytes, excluding
 * maple tree's internal nodes, is 24K.
 */
#define SEND_MAX_BACKREF_CACHE_SIZE 128

/*
 * A backref cache entry maps a leaf to a list of IDs of roots from which the
 * leaf is accessible and we can use for clone operations.
 * With SEND_MAX_BACKREF_CACHE_ROOTS as 17, each cache entry is 192 bytes (on
 * x86_64).
 */
struct backref_cache_entry {
	struct btrfs_lru_cache_entry entry;
	u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS];
	/* Number of valid elements in the root_ids array. */
	int num_roots;
};

/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */
static_assert(offsetof(struct backref_cache_entry, entry) == 0);
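
/*
 * Rough size math for the constants above (illustrative, using the sizes
 * quoted in the comments of this file): sizeof(struct btrfs_lru_cache_entry)
 * is 48 bytes, the root_ids array is 17 * 8 = 136 bytes and num_roots is 4
 * bytes, i.e. 188 bytes, padded to 192 on x86_64 - exactly the kmalloc-192
 * slab. With up to SEND_MAX_BACKREF_CACHE_SIZE (128) entries that amounts to
 * 128 * 192 = 24576 bytes (24K) of entries, as noted above.
 */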

/*
 * Max number of entries in the cache that stores directories that were already
 * created. The cache uses raw struct btrfs_lru_cache_entry entries, so it uses
 * at most 4096 bytes - sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but
 * the kmalloc-64 slab is used, so we get 4096 bytes (64 bytes * 64).
 */
#define SEND_MAX_DIR_CREATED_CACHE_SIZE			64

/*
 * Max number of entries in the cache that stores directories for which we
 * still have to send a utimes update command. The cache uses raw struct
 * btrfs_lru_cache_entry entries, so it uses at most 4096 bytes -
 * sizeof(struct btrfs_lru_cache_entry) is 48 bytes, but the kmalloc-64 slab
 * is used, so we get 4096 bytes (64 bytes * 64).
 */
#define SEND_MAX_DIR_UTIMES_CACHE_SIZE			64

struct send_ctx {
	struct file *send_filp;
	loff_t send_off;
	char *send_buf;
	u32 send_size;
	u32 send_max_size;
	/*
	 * Whether BTRFS_SEND_A_DATA attribute was already added to current
	 * command (since protocol v2, data must be the last attribute).
	 */
	bool put_data;
	struct page **send_buf_pages;
	u64 flags;	/* 'flags' member of btrfs_ioctl_send_args is u64 */
	/* Protocol version compatibility requested */
	u32 proto;

	struct btrfs_root *send_root;
	struct btrfs_root *parent_root;
	struct clone_root *clone_roots;
	int clone_roots_cnt;

	/* current state of the compare_tree call */
	struct btrfs_path *left_path;
	struct btrfs_path *right_path;
	struct btrfs_key *cmp_key;

	/*
	 * Keep track of the generation of the last transaction that was used
	 * for relocating a block group. This is periodically checked in order
	 * to detect if a relocation happened since the last check, so that we
	 * don't operate on stale extent buffers for nodes (level >= 1) or on
	 * stale disk_bytenr values of file extent items.
	 */
	u64 last_reloc_trans;

	/*
	 * Info about the currently processed inode. In case of deleted inodes,
	 * these are the values from the deleted inode.
	 */
	u64 cur_ino;
	u64 cur_inode_gen;
	u64 cur_inode_size;
	u64 cur_inode_mode;
	u64 cur_inode_rdev;
	u64 cur_inode_last_extent;
	u64 cur_inode_next_write_offset;
	bool cur_inode_new;
	bool cur_inode_new_gen;
	bool cur_inode_deleted;
	bool ignore_cur_inode;
	bool cur_inode_needs_verity;
	void *verity_descriptor;

	u64 send_progress;

	struct list_head new_refs;
	struct list_head deleted_refs;

	struct btrfs_lru_cache name_cache;

	/*
	 * The inode we are currently processing. It's not NULL only when we
	 * need to issue write commands for data extents from this inode.
	 */
	struct inode *cur_inode;
	struct file_ra_state ra;
	u64 page_cache_clear_start;
	bool clean_page_cache;

	/*
	 * We process inodes by their increasing order, so if before an
	 * incremental send we reverse the parent/child relationship of
	 * directories such that a directory with a lower inode number was
	 * the parent of a directory with a higher inode number, and the one
	 * becoming the new parent got renamed too, we can't rename/move the
	 * directory with lower inode number when we finish processing it - we
	 * must process the directory with higher inode number first, then
	 * rename/move it and then rename/move the directory with lower inode
	 * number. Example follows.
	 *
	 * Tree state when the first send was performed:
	 *
	 * .
	 * |-- a                   (ino 257)
	 *     |-- b               (ino 258)
	 *         |
	 *         |
	 *         |-- c           (ino 259)
	 *         |   |-- d       (ino 260)
	 *         |
	 *         |-- c2          (ino 261)
	 *
	 * Tree state when the second (incremental) send is performed:
	 *
	 * .
	 * |-- a                   (ino 257)
	 *     |-- b               (ino 258)
	 *         |-- c2          (ino 261)
	 *             |-- d2      (ino 260)
	 *                 |-- cc  (ino 259)
	 *
	 * The sequence of steps that led to the second state was:
	 *
	 * mv /a/b/c/d /a/b/c2/d2
	 * mv /a/b/c /a/b/c2/d2/cc
	 *
	 * "c" has lower inode number, but we can't move it (2nd mv operation)
	 * before we move "d", which has higher inode number.
	 *
	 * So we just memorize which move/rename operations must be performed
	 * later when their respective parent is processed and moved/renamed.
	 */

	/* Indexed by parent directory inode number. */
	struct rb_root pending_dir_moves;

	/*
	 * Reverse index, indexed by the inode number of a directory that
	 * is waiting for the move/rename of its immediate parent before its
	 * own move/rename can be performed.
	 */
	struct rb_root waiting_dir_moves;

	/*
	 * A directory that is going to be rm'ed might have a child directory
	 * which is in the pending directory moves index above. In this case,
	 * the directory can only be removed after the move/rename of its child
	 * is performed. Example:
	 *
	 * Parent snapshot:
	 *
	 * .                        (ino 256)
	 * |-- a/                   (ino 257)
	 *     |-- b/               (ino 258)
	 *         |-- c/           (ino 259)
	 *         |   |-- x/       (ino 260)
	 *         |
	 *         |-- y/           (ino 261)
	 *
	 * Send snapshot:
	 *
	 * .                        (ino 256)
	 * |-- a/                   (ino 257)
	 *     |-- b/               (ino 258)
	 *         |-- YY/          (ino 261)
	 *              |-- x/      (ino 260)
	 *
	 * Sequence of steps that led to the send snapshot:
	 * rm -f /a/b/c/foo.txt
	 * mv /a/b/y /a/b/YY
	 * mv /a/b/c/x /a/b/YY
	 * rmdir /a/b/c
	 *
	 * When the child is processed, its move/rename is delayed until its
	 * parent is processed (as explained above), but all other operations
	 * like update utimes, chown, chgrp, etc, are performed and the paths
	 * that it uses for those operations must use the orphanized name of
	 * its parent (the directory we're going to rm later), so we need to
	 * memorize that name.
	 *
	 * Indexed by the inode number of the directory to be deleted.
	 */
	struct rb_root orphan_dirs;

	struct rb_root rbtree_new_refs;
	struct rb_root rbtree_deleted_refs;

	struct btrfs_lru_cache backref_cache;
	u64 backref_cache_last_reloc_trans;

	struct btrfs_lru_cache dir_created_cache;
	struct btrfs_lru_cache dir_utimes_cache;
};

struct pending_dir_move {
	struct rb_node node;
	struct list_head list;
	u64 parent_ino;
	u64 ino;
	u64 gen;
	struct list_head update_refs;
};

struct waiting_dir_move {
	struct rb_node node;
	u64 ino;
	/*
	 * There might be some directory that could not be removed because it
	 * was waiting for this directory inode to be moved first. Therefore
	 * after this directory is moved, we can try to rmdir the ino rmdir_ino.
	 */
	u64 rmdir_ino;
	u64 rmdir_gen;
	bool orphanized;
};
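
/*
 * Illustrative walk-through (assuming this is how the helpers later in this
 * file wire the two trees together), using the example from the struct
 * send_ctx comment above: when inode 259 is processed but its new parent
 * (inode 260) was not moved/renamed yet, a pending_dir_move with
 * parent_ino == 260 and ino == 259 is queued in sctx->pending_dir_moves and a
 * waiting_dir_move for ino 259 is added to sctx->waiting_dir_moves. Once
 * inode 260 is finally moved, the pending entries indexed under 260 are
 * looked up and applied, performing the delayed rename of inode 259.
 */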

struct orphan_dir_info {
	struct rb_node node;
	u64 ino;
	u64 gen;
	u64 last_dir_index_offset;
	u64 dir_high_seq_ino;
};

struct name_cache_entry {
	/*
	 * The key in the entry is an inode number, and the generation matches
	 * the inode's generation.
	 */
	struct btrfs_lru_cache_entry entry;
	u64 parent_ino;
	u64 parent_gen;
	int ret;
	int need_later_update;
	int name_len;
	char name[] __counted_by(name_len);
};

/* See the comment at lru_cache.h about struct btrfs_lru_cache_entry. */
static_assert(offsetof(struct name_cache_entry, entry) == 0);

#define ADVANCE							1
#define ADVANCE_ONLY_NEXT					-1

enum btrfs_compare_tree_result {
	BTRFS_COMPARE_TREE_NEW,
	BTRFS_COMPARE_TREE_DELETED,
	BTRFS_COMPARE_TREE_CHANGED,
	BTRFS_COMPARE_TREE_SAME,
};

__cold
static void inconsistent_snapshot_error(struct send_ctx *sctx,
					enum btrfs_compare_tree_result result,
					const char *what)
{
	const char *result_string;

	switch (result) {
	case BTRFS_COMPARE_TREE_NEW:
		result_string = "new";
		break;
	case BTRFS_COMPARE_TREE_DELETED:
		result_string = "deleted";
		break;
	case BTRFS_COMPARE_TREE_CHANGED:
		result_string = "updated";
		break;
	case BTRFS_COMPARE_TREE_SAME:
		ASSERT(0);
		result_string = "unchanged";
		break;
	default:
		ASSERT(0);
		result_string = "unexpected";
	}

	btrfs_err(sctx->send_root->fs_info,
		  "Send: inconsistent snapshot, found %s %s for inode %llu without updated inode item, send root is %llu, parent root is %llu",
		  result_string, what, sctx->cmp_key->objectid,
		  btrfs_root_id(sctx->send_root),
		  (sctx->parent_root ?  btrfs_root_id(sctx->parent_root) : 0));
}

__maybe_unused
static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd)
{
	switch (sctx->proto) {
	case 1:	 return cmd <= BTRFS_SEND_C_MAX_V1;
	case 2:	 return cmd <= BTRFS_SEND_C_MAX_V2;
	case 3:	 return cmd <= BTRFS_SEND_C_MAX_V3;
	default: return false;
	}
}
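
/*
 * Usage sketch (illustrative only): before emitting a command that only
 * exists in a newer protocol version, callers can check whether the version
 * requested by user space allows it, for example (assuming
 * BTRFS_SEND_C_ENCODED_WRITE is a protocol v2 command):
 *
 *	if (!proto_cmd_ok(sctx, BTRFS_SEND_C_ENCODED_WRITE))
 *		fall back to regular BTRFS_SEND_C_WRITE commands instead
 */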

static int is_waiting_for_move(struct send_ctx *sctx, u64 ino);

static struct waiting_dir_move *
get_waiting_dir_move(struct send_ctx *sctx, u64 ino);

static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen);

static int need_send_hole(struct send_ctx *sctx)
{
	return (sctx->parent_root && !sctx->cur_inode_new &&
		!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted &&
		S_ISREG(sctx->cur_inode_mode));
}

static void fs_path_reset(struct fs_path *p)
{
	if (p->reversed) {
		p->start = p->buf + p->buf_len - 1;
		p->end = p->start;
		*p->start = 0;
	} else {
		p->start = p->buf;
		p->end = p->start;
		*p->start = 0;
	}
}

static struct fs_path *fs_path_alloc(void)
{
	struct fs_path *p;

	p = kmalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return NULL;
	p->reversed = 0;
	p->buf = p->inline_buf;
	p->buf_len = FS_PATH_INLINE_SIZE;
	fs_path_reset(p);
	return p;
}

static struct fs_path *fs_path_alloc_reversed(void)
{
	struct fs_path *p;

	p = fs_path_alloc();
	if (!p)
		return NULL;
	p->reversed = 1;
	fs_path_reset(p);
	return p;
}

static void fs_path_free(struct fs_path *p)
{
	if (!p)
		return;
	if (p->buf != p->inline_buf)
		kfree(p->buf);
	kfree(p);
}

static int fs_path_len(struct fs_path *p)
{
	return p->end - p->start;
}

static int fs_path_ensure_buf(struct fs_path *p, int len)
{
	char *tmp_buf;
	int path_len;
	int old_buf_len;

	len++;

	if (p->buf_len >= len)
		return 0;

	if (len > PATH_MAX) {
		WARN_ON(1);
		return -ENOMEM;
	}

	path_len = p->end - p->start;
	old_buf_len = p->buf_len;

	/*
	 * Allocate to the next largest kmalloc bucket size, to let
	 * the fast path happen most of the time.
	 */
	len = kmalloc_size_roundup(len);
	/*
	 * First time the inline_buf does not suffice
	 */
	if (p->buf == p->inline_buf) {
		tmp_buf = kmalloc(len, GFP_KERNEL);
		if (tmp_buf)
			memcpy(tmp_buf, p->buf, old_buf_len);
	} else {
		tmp_buf = krealloc(p->buf, len, GFP_KERNEL);
	}
	if (!tmp_buf)
		return -ENOMEM;
	p->buf = tmp_buf;
	p->buf_len = len;

	if (p->reversed) {
		tmp_buf = p->buf + old_buf_len - path_len - 1;
		p->end = p->buf + p->buf_len - 1;
		p->start = p->end - path_len;
		memmove(p->start, tmp_buf, path_len + 1);
	} else {
		p->start = p->buf;
		p->end = p->start + path_len;
	}
	return 0;
}
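
/*
 * Illustrative sizing example for fs_path_ensure_buf() (assuming the default
 * kmalloc size classes): a request for room for a 300 byte path first becomes
 * 301 bytes for the NUL terminator and is then rounded up by
 * kmalloc_size_roundup() to the 512 byte bucket, so subsequent slightly
 * larger requests hit the "p->buf_len >= len" fast path and avoid another
 * reallocation.
 */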

static int fs_path_prepare_for_add(struct fs_path *p, int name_len,
				   char **prepared)
{
	int ret;
	int new_len;

	new_len = p->end - p->start + name_len;
	if (p->start != p->end)
		new_len++;
	ret = fs_path_ensure_buf(p, new_len);
	if (ret < 0)
		goto out;

	if (p->reversed) {
		if (p->start != p->end)
			*--p->start = '/';
		p->start -= name_len;
		*prepared = p->start;
	} else {
		if (p->start != p->end)
			*p->end++ = '/';
		*prepared = p->end;
		p->end += name_len;
		*p->end = 0;
	}

out:
	return ret;
}

static int fs_path_add(struct fs_path *p, const char *name, int name_len)
{
	int ret;
	char *prepared;

	ret = fs_path_prepare_for_add(p, name_len, &prepared);
	if (ret < 0)
		goto out;
	memcpy(prepared, name, name_len);

out:
	return ret;
}

static int fs_path_add_path(struct fs_path *p, struct fs_path *p2)
{
	int ret;
	char *prepared;

	ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared);
	if (ret < 0)
		goto out;
	memcpy(prepared, p2->start, p2->end - p2->start);

out:
	return ret;
}

static int fs_path_add_from_extent_buffer(struct fs_path *p,
					  struct extent_buffer *eb,
					  unsigned long off, int len)
{
	int ret;
	char *prepared;

	ret = fs_path_prepare_for_add(p, len, &prepared);
	if (ret < 0)
		goto out;

	read_extent_buffer(eb, prepared, off, len);

out:
	return ret;
}

static int fs_path_copy(struct fs_path *p, struct fs_path *from)
{
	p->reversed = from->reversed;
	fs_path_reset(p);

	return fs_path_add_path(p, from);
}

static void fs_path_unreverse(struct fs_path *p)
{
	char *tmp;
	int len;

	if (!p->reversed)
		return;

	tmp = p->start;
	len = p->end - p->start;
	p->start = p->buf;
	p->end = p->start + len;
	memmove(p->start, tmp, len + 1);
	p->reversed = 0;
}

static struct btrfs_path *alloc_path_for_send(void)
{
	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return NULL;
	path->search_commit_root = 1;
	path->skip_locking = 1;
	path->need_commit_sem = 1;
	return path;
}

static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off)
{
	int ret;
	u32 pos = 0;

	while (pos < len) {
		ret = kernel_write(filp, buf + pos, len - pos, off);
		if (ret < 0)
			return ret;
		if (ret == 0)
			return -EIO;
		pos += ret;
	}

	return 0;
}

static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len)
{
	struct btrfs_tlv_header *hdr;
	int total_len = sizeof(*hdr) + len;
	int left = sctx->send_max_size - sctx->send_size;

	if (WARN_ON_ONCE(sctx->put_data))
		return -EINVAL;

	if (unlikely(left < total_len))
		return -EOVERFLOW;

	hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size);
	put_unaligned_le16(attr, &hdr->tlv_type);
	put_unaligned_le16(len, &hdr->tlv_len);
	memcpy(hdr + 1, data, len);
	sctx->send_size += total_len;

	return 0;
}

#define TLV_PUT_DEFINE_INT(bits) \
	static int tlv_put_u##bits(struct send_ctx *sctx,	 	\
			u##bits attr, u##bits value)			\
	{								\
		__le##bits __tmp = cpu_to_le##bits(value);		\
		return tlv_put(sctx, attr, &__tmp, sizeof(__tmp));	\
	}

TLV_PUT_DEFINE_INT(8)
TLV_PUT_DEFINE_INT(32)
TLV_PUT_DEFINE_INT(64)

static int tlv_put_string(struct send_ctx *sctx, u16 attr,
			  const char *str, int len)
{
	if (len == -1)
		len = strlen(str);
	return tlv_put(sctx, attr, str, len);
}

static int tlv_put_uuid(struct send_ctx *sctx, u16 attr,
			const u8 *uuid)
{
	return tlv_put(sctx, attr, uuid, BTRFS_UUID_SIZE);
}

static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr,
				  struct extent_buffer *eb,
				  struct btrfs_timespec *ts)
{
	struct btrfs_timespec bts;
	read_extent_buffer(eb, &bts, (unsigned long)ts, sizeof(bts));
	return tlv_put(sctx, attr, &bts, sizeof(bts));
}


#define TLV_PUT(sctx, attrtype, data, attrlen) \
	do { \
		ret = tlv_put(sctx, attrtype, data, attrlen); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

#define TLV_PUT_INT(sctx, attrtype, bits, value) \
	do { \
		ret = tlv_put_u##bits(sctx, attrtype, value); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

#define TLV_PUT_U8(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 8, data)
#define TLV_PUT_U16(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 16, data)
#define TLV_PUT_U32(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 32, data)
#define TLV_PUT_U64(sctx, attrtype, data) TLV_PUT_INT(sctx, attrtype, 64, data)
#define TLV_PUT_STRING(sctx, attrtype, str, len) \
	do { \
		ret = tlv_put_string(sctx, attrtype, str, len); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_PATH(sctx, attrtype, p) \
	do { \
		ret = tlv_put_string(sctx, attrtype, p->start, \
			p->end - p->start); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while(0)
#define TLV_PUT_UUID(sctx, attrtype, uuid) \
	do { \
		ret = tlv_put_uuid(sctx, attrtype, uuid); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)
#define TLV_PUT_BTRFS_TIMESPEC(sctx, attrtype, eb, ts) \
	do { \
		ret = tlv_put_btrfs_timespec(sctx, attrtype, eb, ts); \
		if (ret < 0) \
			goto tlv_put_failure; \
	} while (0)

static int send_header(struct send_ctx *sctx)
{
	struct btrfs_stream_header hdr;

	strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC);
	hdr.version = cpu_to_le32(sctx->proto);
	return write_buf(sctx->send_filp, &hdr, sizeof(hdr),
					&sctx->send_off);
}

/*
 * For each command/item we want to send to userspace, we call this function.
 */
static int begin_cmd(struct send_ctx *sctx, int cmd)
{
	struct btrfs_cmd_header *hdr;

	if (WARN_ON(!sctx->send_buf))
		return -EINVAL;

	if (unlikely(sctx->send_size != 0)) {
		btrfs_err(sctx->send_root->fs_info,
			  "send: command header buffer not empty cmd %d offset %llu",
			  cmd, sctx->send_off);
		return -EINVAL;
	}

	sctx->send_size += sizeof(*hdr);
	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
	put_unaligned_le16(cmd, &hdr->cmd);

	return 0;
}

static int send_cmd(struct send_ctx *sctx)
{
	int ret;
	struct btrfs_cmd_header *hdr;
	u32 crc;

	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
	put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len);
	put_unaligned_le32(0, &hdr->crc);

	crc = crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size);
	put_unaligned_le32(crc, &hdr->crc);

	ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
					&sctx->send_off);

	sctx->send_size = 0;
	sctx->put_data = false;

	return ret;
}
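
/*
 * Rough sketch of the resulting stream layout (illustrative, derived from
 * send_header(), begin_cmd(), tlv_put() and send_cmd() above):
 *
 *	struct btrfs_stream_header (magic + version)
 *	for each command:
 *		struct btrfs_cmd_header (len, cmd, crc)
 *		one or more TLVs: struct btrfs_tlv_header (tlv_type, tlv_len)
 *		followed by tlv_len bytes of data
 *
 * The crc is a crc32c over the whole command (header plus TLVs) computed with
 * the crc field itself set to zero, which is why send_cmd() stores 0 first
 * and fills in the checksum afterwards.
 */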

/*
 * Sends a move instruction to user space
 */
static int send_rename(struct send_ctx *sctx,
		     struct fs_path *from, struct fs_path *to)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	int ret;

	btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start);

	ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from);
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	return ret;
}

/*
 * Sends a link instruction to user space
 */
static int send_link(struct send_ctx *sctx,
		     struct fs_path *path, struct fs_path *lnk)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	int ret;

	btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start);

	ret = begin_cmd(sctx, BTRFS_SEND_C_LINK);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	return ret;
}

/*
 * Sends an unlink instruction to user space
 */
static int send_unlink(struct send_ctx *sctx, struct fs_path *path)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	int ret;

	btrfs_debug(fs_info, "send_unlink %s", path->start);

	ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	return ret;
}

/*
 * Sends a rmdir instruction to user space
 */
static int send_rmdir(struct send_ctx *sctx, struct fs_path *path)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	int ret;

	btrfs_debug(fs_info, "send_rmdir %s", path->start);

	ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	return ret;
}

struct btrfs_inode_info {
	u64 size;
	u64 gen;
	u64 mode;
	u64 uid;
	u64 gid;
	u64 rdev;
	u64 fileattr;
	u64 nlink;
};

/*
 * Helper function to retrieve some fields from an inode item.
 */
static int get_inode_info(struct btrfs_root *root, u64 ino,
			  struct btrfs_inode_info *info)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_inode_item *ii;
	struct btrfs_key key;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	key.objectid = ino;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret) {
		if (ret > 0)
			ret = -ENOENT;
		goto out;
	}

	if (!info)
		goto out;

	ii = btrfs_item_ptr(path->nodes[0], path->slots[0],
			struct btrfs_inode_item);
	info->size = btrfs_inode_size(path->nodes[0], ii);
	info->gen = btrfs_inode_generation(path->nodes[0], ii);
	info->mode = btrfs_inode_mode(path->nodes[0], ii);
	info->uid = btrfs_inode_uid(path->nodes[0], ii);
	info->gid = btrfs_inode_gid(path->nodes[0], ii);
	info->rdev = btrfs_inode_rdev(path->nodes[0], ii);
	info->nlink = btrfs_inode_nlink(path->nodes[0], ii);
	/*
	 * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's
	 * otherwise logically split to 32/32 parts.
	 */
	info->fileattr = btrfs_inode_flags(path->nodes[0], ii);

out:
	btrfs_free_path(path);
	return ret;
}

static int get_inode_gen(struct btrfs_root *root, u64 ino, u64 *gen)
{
	int ret;
	struct btrfs_inode_info info = { 0 };

	ASSERT(gen);

	ret = get_inode_info(root, ino, &info);
	*gen = info.gen;
	return ret;
}

typedef int (*iterate_inode_ref_t)(int num, u64 dir, int index,
				   struct fs_path *p,
				   void *ctx);

/*
 * Helper function to iterate the entries in ONE btrfs_inode_ref or
 * btrfs_inode_extref.
 * The iterate callback may return a non zero value to stop iteration. This can
 * be a negative value for error codes or 1 to simply stop it.
 *
 * path must point to the INODE_REF or INODE_EXTREF when called.
 */
static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
			     struct btrfs_key *found_key, int resolve,
			     iterate_inode_ref_t iterate, void *ctx)
{
	struct extent_buffer *eb = path->nodes[0];
	struct btrfs_inode_ref *iref;
	struct btrfs_inode_extref *extref;
	struct btrfs_path *tmp_path;
	struct fs_path *p;
	u32 cur = 0;
	u32 total;
	int slot = path->slots[0];
	u32 name_len;
	char *start;
	int ret = 0;
	int num = 0;
	int index;
	u64 dir;
	unsigned long name_off;
	unsigned long elem_size;
	unsigned long ptr;

	p = fs_path_alloc_reversed();
	if (!p)
		return -ENOMEM;

	tmp_path = alloc_path_for_send();
	if (!tmp_path) {
		fs_path_free(p);
		return -ENOMEM;
	}

	if (found_key->type == BTRFS_INODE_REF_KEY) {
		ptr = (unsigned long)btrfs_item_ptr(eb, slot,
						    struct btrfs_inode_ref);
		total = btrfs_item_size(eb, slot);
		elem_size = sizeof(*iref);
	} else {
		ptr = btrfs_item_ptr_offset(eb, slot);
		total = btrfs_item_size(eb, slot);
		elem_size = sizeof(*extref);
	}

	while (cur < total) {
		fs_path_reset(p);

		if (found_key->type == BTRFS_INODE_REF_KEY) {
			iref = (struct btrfs_inode_ref *)(ptr + cur);
			name_len = btrfs_inode_ref_name_len(eb, iref);
			name_off = (unsigned long)(iref + 1);
			index = btrfs_inode_ref_index(eb, iref);
			dir = found_key->offset;
		} else {
			extref = (struct btrfs_inode_extref *)(ptr + cur);
			name_len = btrfs_inode_extref_name_len(eb, extref);
			name_off = (unsigned long)&extref->name;
			index = btrfs_inode_extref_index(eb, extref);
			dir = btrfs_inode_extref_parent(eb, extref);
		}

		if (resolve) {
			start = btrfs_ref_to_path(root, tmp_path, name_len,
						  name_off, eb, dir,
						  p->buf, p->buf_len);
			if (IS_ERR(start)) {
				ret = PTR_ERR(start);
				goto out;
			}
			if (start < p->buf) {
				/* overflow, try again with larger buffer */
				ret = fs_path_ensure_buf(p,
						p->buf_len + p->buf - start);
				if (ret < 0)
					goto out;
				start = btrfs_ref_to_path(root, tmp_path,
							  name_len, name_off,
							  eb, dir,
							  p->buf, p->buf_len);
				if (IS_ERR(start)) {
					ret = PTR_ERR(start);
					goto out;
				}
				if (unlikely(start < p->buf)) {
					btrfs_err(root->fs_info,
			"send: path ref buffer underflow for key (%llu %u %llu)",
						  found_key->objectid,
						  found_key->type,
						  found_key->offset);
					ret = -EINVAL;
					goto out;
				}
			}
			p->start = start;
		} else {
			ret = fs_path_add_from_extent_buffer(p, eb, name_off,
							     name_len);
			if (ret < 0)
				goto out;
		}

		cur += elem_size + name_len;
		ret = iterate(num, dir, index, p, ctx);
		if (ret)
			goto out;
		num++;
	}

out:
	btrfs_free_path(tmp_path);
	fs_path_free(p);
	return ret;
}

typedef int (*iterate_dir_item_t)(int num, struct btrfs_key *di_key,
				  const char *name, int name_len,
				  const char *data, int data_len,
				  void *ctx);

/*
 * Helper function to iterate the entries in ONE btrfs_dir_item.
 * The iterate callback may return a non zero value to stop iteration. This can
 * be a negative value for error codes or 1 to simply stop it.
 *
 * path must point to the dir item when called.
 */
static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
			    iterate_dir_item_t iterate, void *ctx)
{
	int ret = 0;
	struct extent_buffer *eb;
	struct btrfs_dir_item *di;
	struct btrfs_key di_key;
	char *buf = NULL;
	int buf_len;
	u32 name_len;
	u32 data_len;
	u32 cur;
	u32 len;
	u32 total;
	int slot;
	int num;

	/*
	 * Start with a small buffer (1 page). If later we end up needing more
	 * space, which can happen for xattrs on a fs with a leaf size greater
	 * than the page size, attempt to increase the buffer. Typically xattr
	 * values are small.
	 */
	buf_len = PATH_MAX;
	buf = kmalloc(buf_len, GFP_KERNEL);
	if (!buf) {
		ret = -ENOMEM;
		goto out;
	}

	eb = path->nodes[0];
	slot = path->slots[0];
	di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
	cur = 0;
	len = 0;
	total = btrfs_item_size(eb, slot);

	num = 0;
	while (cur < total) {
		name_len = btrfs_dir_name_len(eb, di);
		data_len = btrfs_dir_data_len(eb, di);
		btrfs_dir_item_key_to_cpu(eb, di, &di_key);

		if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) {
			if (name_len > XATTR_NAME_MAX) {
				ret = -ENAMETOOLONG;
				goto out;
			}
			if (name_len + data_len >
					BTRFS_MAX_XATTR_SIZE(root->fs_info)) {
				ret = -E2BIG;
				goto out;
			}
		} else {
			/*
			 * Path too long
			 */
			if (name_len + data_len > PATH_MAX) {
				ret = -ENAMETOOLONG;
				goto out;
			}
		}

		if (name_len + data_len > buf_len) {
			buf_len = name_len + data_len;
			if (is_vmalloc_addr(buf)) {
				vfree(buf);
				buf = NULL;
			} else {
				char *tmp = krealloc(buf, buf_len,
						GFP_KERNEL | __GFP_NOWARN);

				if (!tmp)
					kfree(buf);
				buf = tmp;
			}
			if (!buf) {
				buf = kvmalloc(buf_len, GFP_KERNEL);
				if (!buf) {
					ret = -ENOMEM;
					goto out;
				}
			}
		}

		read_extent_buffer(eb, buf, (unsigned long)(di + 1),
				name_len + data_len);

		len = sizeof(*di) + name_len + data_len;
		di = (struct btrfs_dir_item *)((char *)di + len);
		cur += len;

		ret = iterate(num, &di_key, buf, name_len, buf + name_len,
			      data_len, ctx);
		if (ret < 0)
			goto out;
		if (ret) {
			ret = 0;
			goto out;
		}

		num++;
	}

out:
	kvfree(buf);
	return ret;
}

static int __copy_first_ref(int num, u64 dir, int index,
			    struct fs_path *p, void *ctx)
{
	int ret;
	struct fs_path *pt = ctx;

	ret = fs_path_copy(pt, p);
	if (ret < 0)
		return ret;

	/* we want the first only */
	return 1;
}

/*
 * Retrieve the first path of an inode. If an inode has more than one
 * ref/hardlink, this is ignored.
 */
static int get_inode_path(struct btrfs_root *root,
			  u64 ino, struct fs_path *path)
{
	int ret;
	struct btrfs_key key, found_key;
	struct btrfs_path *p;

	p = alloc_path_for_send();
	if (!p)
		return -ENOMEM;

	fs_path_reset(path);

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = 0;

	ret = btrfs_search_slot_for_read(root, &key, p, 1, 0);
	if (ret < 0)
		goto out;
	if (ret) {
		ret = 1;
		goto out;
	}
	btrfs_item_key_to_cpu(p->nodes[0], &found_key, p->slots[0]);
	if (found_key.objectid != ino ||
	    (found_key.type != BTRFS_INODE_REF_KEY &&
	     found_key.type != BTRFS_INODE_EXTREF_KEY)) {
		ret = -ENOENT;
		goto out;
	}

	ret = iterate_inode_ref(root, p, &found_key, 1,
				__copy_first_ref, path);
	if (ret < 0)
		goto out;
	ret = 0;

out:
	btrfs_free_path(p);
	return ret;
}

struct backref_ctx {
	struct send_ctx *sctx;

	/* number of total found references */
	u64 found;

	/*
	 * used for clones found in send_root. clones found behind cur_objectid
	 * and cur_offset are not considered as allowed clones.
	 */
	u64 cur_objectid;
	u64 cur_offset;

	/* may be truncated in case it's the last extent in a file */
	u64 extent_len;

	/* The bytenr the file extent item we are processing refers to. */
	u64 bytenr;
	/* The owner (root id) of the data backref for the current extent. */
	u64 backref_owner;
	/* The offset of the data backref for the current extent. */
	u64 backref_offset;
};

static int __clone_root_cmp_bsearch(const void *key, const void *elt)
{
	u64 root = (u64)(uintptr_t)key;
	const struct clone_root *cr = elt;

	if (root < btrfs_root_id(cr->root))
		return -1;
	if (root > btrfs_root_id(cr->root))
		return 1;
	return 0;
}

static int __clone_root_cmp_sort(const void *e1, const void *e2)
{
	const struct clone_root *cr1 = e1;
	const struct clone_root *cr2 = e2;

	if (btrfs_root_id(cr1->root) < btrfs_root_id(cr2->root))
		return -1;
	if (btrfs_root_id(cr1->root) > btrfs_root_id(cr2->root))
		return 1;
	return 0;
}

/*
 * Called for every backref that is found for the current extent.
 * Results are collected in sctx->clone_roots->ino/offset.
 */
static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id,
			    void *ctx_)
{
	struct backref_ctx *bctx = ctx_;
	struct clone_root *clone_root;

	/* First check if the root is in the list of accepted clone sources */
	clone_root = bsearch((void *)(uintptr_t)root_id, bctx->sctx->clone_roots,
			     bctx->sctx->clone_roots_cnt,
			     sizeof(struct clone_root),
			     __clone_root_cmp_bsearch);
	if (!clone_root)
		return 0;

	/* This is our own reference, bail out as we can't clone from it. */
	if (clone_root->root == bctx->sctx->send_root &&
	    ino == bctx->cur_objectid &&
	    offset == bctx->cur_offset)
		return 0;

	/*
	 * Make sure we don't consider clones from send_root that are
	 * behind the current inode/offset.
	 */
	if (clone_root->root == bctx->sctx->send_root) {
		/*
		 * If the source inode was not yet processed we can't issue a
		 * clone operation, as the source extent does not exist yet at
		 * the destination of the stream.
		 */
		if (ino > bctx->cur_objectid)
			return 0;
		/*
		 * We clone from the inode currently being sent as long as the
		 * source extent is already processed, otherwise we could try
		 * to clone from an extent that does not exist yet at the
		 * destination of the stream.
		 */
		if (ino == bctx->cur_objectid &&
		    offset + bctx->extent_len >
		    bctx->sctx->cur_inode_next_write_offset)
			return 0;
	}

	bctx->found++;
	clone_root->found_ref = true;

	/*
	 * If the given backref refers to a file extent item with a larger
	 * number of bytes than what we found before, use the new one so that
	 * we clone more optimally and end up doing less writes and getting
	 * less exclusive, non-shared extents at the destination.
	 */
	if (num_bytes > clone_root->num_bytes) {
		clone_root->ino = ino;
		clone_root->offset = offset;
		clone_root->num_bytes = num_bytes;

		/*
		 * Found a perfect candidate, so there's no need to continue
		 * backref walking.
		 */
		if (num_bytes >= bctx->extent_len)
			return BTRFS_ITERATE_EXTENT_INODES_STOP;
	}

	return 0;
}

static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
				 const u64 **root_ids_ret, int *root_count_ret)
{
	struct backref_ctx *bctx = ctx;
	struct send_ctx *sctx = bctx->sctx;
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	const u64 key = leaf_bytenr >> fs_info->sectorsize_bits;
	struct btrfs_lru_cache_entry *raw_entry;
	struct backref_cache_entry *entry;

	if (sctx->backref_cache.size == 0)
		return false;

	/*
	 * If relocation happened since we first filled the cache, then we must
	 * empty the cache and can not use it, because even though we operate on
	 * read-only roots, their leaves and nodes may have been reallocated and
	 * now be used for different nodes/leaves of the same tree or some other
	 * tree.
	 *
	 * We are called from iterate_extent_inodes() while either holding a
	 * transaction handle or holding fs_info->commit_root_sem, so no need
	 * to take any lock here.
	 */
	if (fs_info->last_reloc_trans > sctx->backref_cache_last_reloc_trans) {
		btrfs_lru_cache_clear(&sctx->backref_cache);
		return false;
	}

	raw_entry = btrfs_lru_cache_lookup(&sctx->backref_cache, key, 0);
	if (!raw_entry)
		return false;

	entry = container_of(raw_entry, struct backref_cache_entry, entry);
	*root_ids_ret = entry->root_ids;
	*root_count_ret = entry->num_roots;

	return true;
}

static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
				void *ctx)
{
	struct backref_ctx *bctx = ctx;
	struct send_ctx *sctx = bctx->sctx;
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	struct backref_cache_entry *new_entry;
	struct ulist_iterator uiter;
	struct ulist_node *node;
	int ret;

	/*
	 * We're called while holding a transaction handle or while holding
	 * fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a
	 * NOFS allocation.
	 */
	new_entry = kmalloc(sizeof(struct backref_cache_entry), GFP_NOFS);
	/* No worries, cache is optional. */
	if (!new_entry)
		return;

	new_entry->entry.key = leaf_bytenr >> fs_info->sectorsize_bits;
	new_entry->entry.gen = 0;
	new_entry->num_roots = 0;
	ULIST_ITER_INIT(&uiter);
	while ((node = ulist_next(root_ids, &uiter)) != NULL) {
		const u64 root_id = node->val;
		struct clone_root *root;

		root = bsearch((void *)(uintptr_t)root_id, sctx->clone_roots,
			       sctx->clone_roots_cnt, sizeof(struct clone_root),
			       __clone_root_cmp_bsearch);
		if (!root)
			continue;

		/* Too many roots, just exit, no worries as caching is optional. */
		if (new_entry->num_roots >= SEND_MAX_BACKREF_CACHE_ROOTS) {
			kfree(new_entry);
			return;
		}

		new_entry->root_ids[new_entry->num_roots] = root_id;
		new_entry->num_roots++;
	}

	/*
	 * We may have not added any roots to the new cache entry, which means
	 * none of the roots is part of the list of roots from which we are
	 * allowed to clone. Cache the new entry as it's still useful to avoid
	 * backref walking to determine which roots have a path to the leaf.
	 *
	 * Also use GFP_NOFS because we're called while holding a transaction
	 * handle or while holding fs_info->commit_root_sem.
	 */
	ret = btrfs_lru_cache_store(&sctx->backref_cache, &new_entry->entry,
				    GFP_NOFS);
	ASSERT(ret == 0 || ret == -ENOMEM);
	if (ret) {
		/* Caching is optional, no worries. */
		kfree(new_entry);
		return;
	}

	/*
	 * We are called from iterate_extent_inodes() while either holding a
	 * transaction handle or holding fs_info->commit_root_sem, so no need
	 * to take any lock here.
	 */
	if (sctx->backref_cache.size == 1)
		sctx->backref_cache_last_reloc_trans = fs_info->last_reloc_trans;
}
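
/*
 * Illustrative example of the cache key computation used above (assuming a
 * 4K sector size, i.e. sectorsize_bits == 12): a leaf at disk bytenr
 * 30408704 (29 MiB) gets the key 30408704 >> 12 == 7424. Since extent
 * buffers are sector size aligned, no two leaves can map to the same key,
 * while the shifted values keep the key space of the LRU cache compact.
 */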

static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei,
			     const struct extent_buffer *leaf, void *ctx)
{
	const u64 refs = btrfs_extent_refs(leaf, ei);
	const struct backref_ctx *bctx = ctx;
	const struct send_ctx *sctx = bctx->sctx;

	if (bytenr == bctx->bytenr) {
		const u64 flags = btrfs_extent_flags(leaf, ei);

		if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
			return -EUCLEAN;

		/*
		 * If we have only one reference and only the send root as a
		 * clone source - meaning no clone roots were given in the
		 * struct btrfs_ioctl_send_args passed to the send ioctl - then
		 * it's our reference and there's no point in doing backref
		 * walking which is expensive, so exit early.
		 */
		if (refs == 1 && sctx->clone_roots_cnt == 1)
			return -ENOENT;
	}

	/*
	 * Backreference walking (iterate_extent_inodes() below) is currently
	 * too expensive when an extent has a large number of references, both
	 * in time spent and used memory. So for now just fallback to write
	 * operations instead of clone operations when an extent has more than
	 * a certain amount of references.
	 */
	if (refs > SEND_MAX_EXTENT_REFS)
		return -ENOENT;

	return 0;
}

static bool skip_self_data_ref(u64 root, u64 ino, u64 offset, void *ctx)
{
	const struct backref_ctx *bctx = ctx;

	if (ino == bctx->cur_objectid &&
	    root == bctx->backref_owner &&
	    offset == bctx->backref_offset)
		return true;

	return false;
}

/*
 * Given an inode, offset and extent item, it finds a good clone for a clone
 * instruction. Returns -ENOENT when none could be found. The function makes
 * sure that the returned clone is usable at the point where sending is at the
 * moment. This means, that no clones are accepted which lie behind the current
 * inode+offset.
 *
 * path must point to the extent item when called.
 */
static int find_extent_clone(struct send_ctx *sctx,
			     struct btrfs_path *path,
			     u64 ino, u64 data_offset,
			     u64 ino_size,
			     struct clone_root **found)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	int ret;
	int extent_type;
	u64 logical;
	u64 disk_byte;
	u64 num_bytes;
	struct btrfs_file_extent_item *fi;
	struct extent_buffer *eb = path->nodes[0];
	struct backref_ctx backref_ctx = { 0 };
	struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 };
	struct clone_root *cur_clone_root;
	int compressed;
	u32 i;

	/*
	 * With fallocate we can get prealloc extents beyond the inode's i_size,
	 * so we don't do anything here because clone operations can not clone
	 * to a range beyond i_size without increasing the i_size of the
	 * destination inode.
	 */
	if (data_offset >= ino_size)
		return 0;

	fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item);
	extent_type = btrfs_file_extent_type(eb, fi);
	if (extent_type == BTRFS_FILE_EXTENT_INLINE)
		return -ENOENT;

	disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
	if (disk_byte == 0)
		return -ENOENT;

	compressed = btrfs_file_extent_compression(eb, fi);
	num_bytes = btrfs_file_extent_num_bytes(eb, fi);
	logical = disk_byte + btrfs_file_extent_offset(eb, fi);

	/*
	 * Setup the clone roots.
	 */
	for (i = 0; i < sctx->clone_roots_cnt; i++) {
		cur_clone_root = sctx->clone_roots + i;
		cur_clone_root->ino = (u64)-1;
		cur_clone_root->offset = 0;
		cur_clone_root->num_bytes = 0;
		cur_clone_root->found_ref = false;
	}

	backref_ctx.sctx = sctx;
	backref_ctx.cur_objectid = ino;
	backref_ctx.cur_offset = data_offset;
	backref_ctx.bytenr = disk_byte;
	/*
	 * Use the header owner and not the send root's id, because in case of a
	 * snapshot we can have shared subtrees.
	 */
	backref_ctx.backref_owner = btrfs_header_owner(eb);
	backref_ctx.backref_offset = data_offset - btrfs_file_extent_offset(eb, fi);

	/*
	 * The last extent of a file may be too large due to page alignment.
	 * We need to adjust extent_len in this case so that the checks in
	 * iterate_backrefs() work.
	 */
	if (data_offset + num_bytes >= ino_size)
		backref_ctx.extent_len = ino_size - data_offset;
	else
		backref_ctx.extent_len = num_bytes;

	/*
	 * Now collect all backrefs.
	 */
	backref_walk_ctx.bytenr = disk_byte;
	if (compressed == BTRFS_COMPRESS_NONE)
		backref_walk_ctx.extent_item_pos = btrfs_file_extent_offset(eb, fi);
	backref_walk_ctx.fs_info = fs_info;
	backref_walk_ctx.cache_lookup = lookup_backref_cache;
	backref_walk_ctx.cache_store = store_backref_cache;
	backref_walk_ctx.indirect_ref_iterator = iterate_backrefs;
	backref_walk_ctx.check_extent_item = check_extent_item;
	backref_walk_ctx.user_ctx = &backref_ctx;

	/*
	 * If have a single clone root, then it's the send root and we can tell
	 * the backref walking code to skip our own backref and not resolve it,
	 * since we can not use it for cloning - the source and destination
	 * ranges can't overlap and in case the leaf is shared through a subtree
	 * due to snapshots, we can't use those other roots since they are not
	 * in the list of clone roots.
	 */
	if (sctx->clone_roots_cnt == 1)
		backref_walk_ctx.skip_data_ref = skip_self_data_ref;

	ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs,
				    &backref_ctx);
	if (ret < 0)
		return ret;

	down_read(&fs_info->commit_root_sem);
	if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
		/*
		 * A transaction commit for a transaction in which block group
		 * relocation was done just happened.
		 * The disk_bytenr of the file extent item we processed is
		 * possibly stale, referring to the extent's location before
		 * relocation. So act as if we haven't found any clone sources
		 * and fallback to write commands, which will read the correct
		 * data from the new extent location. Otherwise we will fail
		 * below because we haven't found our own back reference or we
		 * could be getting incorrect sources in case the old extent
		 * was already reallocated after the relocation.
		 */
		up_read(&fs_info->commit_root_sem);
		return -ENOENT;
	}
	up_read(&fs_info->commit_root_sem);

	btrfs_debug(fs_info,
		    "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
		    data_offset, ino, num_bytes, logical);

	if (!backref_ctx.found) {
		btrfs_debug(fs_info, "no clones found");
		return -ENOENT;
	}

	cur_clone_root = NULL;
	for (i = 0; i < sctx->clone_roots_cnt; i++) {
		struct clone_root *clone_root = &sctx->clone_roots[i];

		if (!clone_root->found_ref)
			continue;

		/*
		 * Choose the root from which we can clone more bytes, to
		 * minimize write operations and therefore have more extent
		 * sharing at the destination (the same as in the source).
		 */
		if (!cur_clone_root ||
		    clone_root->num_bytes > cur_clone_root->num_bytes) {
			cur_clone_root = clone_root;

			/*
			 * We found an optimal clone candidate (any inode from
			 * any root is fine), so we're done.
			 */
			if (clone_root->num_bytes >= backref_ctx.extent_len)
				break;
		}
	}

	if (cur_clone_root) {
		*found = cur_clone_root;
		ret = 0;
	} else {
		ret = -ENOENT;
	}

	return ret;
}

static int read_symlink(struct btrfs_root *root,
			u64 ino,
			struct fs_path *dest)
{
	int ret;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_file_extent_item *ei;
	u8 type;
	u8 compression;
	unsigned long off;
	int len;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	key.objectid = ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret) {
		/*
		 * An empty symlink inode. Can happen in rare error paths when
		 * creating a symlink (transaction committed before the inode
		 * eviction handler removed the symlink inode items and a crash
		 * happened in between or the subvol was snapshotted in between).
		 * Print an informative message to dmesg/syslog so that the user
		 * can delete the symlink.
		 */
		btrfs_err(root->fs_info,
			  "Found empty symlink inode %llu at root %llu",
			  ino, btrfs_root_id(root));
		ret = -EIO;
		goto out;
	}

	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
			struct btrfs_file_extent_item);
	type = btrfs_file_extent_type(path->nodes[0], ei);
	if (unlikely(type != BTRFS_FILE_EXTENT_INLINE)) {
		ret = -EUCLEAN;
		btrfs_crit(root->fs_info,
"send: found symlink extent that is not inline, ino %llu root %llu extent type %d",
			   ino, btrfs_root_id(root), type);
		goto out;
	}
	compression = btrfs_file_extent_compression(path->nodes[0], ei);
	if (unlikely(compression != BTRFS_COMPRESS_NONE)) {
		ret = -EUCLEAN;
		btrfs_crit(root->fs_info,
"send: found symlink extent with compression, ino %llu root %llu compression type %d",
			   ino, btrfs_root_id(root), compression);
		goto out;
	}

	off = btrfs_file_extent_inline_start(ei);
	len = btrfs_file_extent_ram_bytes(path->nodes[0], ei);

	ret = fs_path_add_from_extent_buffer(dest, path->nodes[0], off, len);

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Helper function to generate a file name that is unique in the root of
 * send_root and parent_root. This is used to generate names for orphan inodes.
 */
static int gen_unique_name(struct send_ctx *sctx,
			   u64 ino, u64 gen,
			   struct fs_path *dest)
{
	int ret = 0;
	struct btrfs_path *path;
	struct btrfs_dir_item *di;
	char tmp[64];
	int len;
	u64 idx = 0;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	while (1) {
		struct fscrypt_str tmp_name;

		len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
				ino, gen, idx);
		ASSERT(len < sizeof(tmp));
		tmp_name.name = tmp;
		tmp_name.len = strlen(tmp);

		di = btrfs_lookup_dir_item(NULL, sctx->send_root,
				path, BTRFS_FIRST_FREE_OBJECTID,
				&tmp_name, 0);
		btrfs_release_path(path);
		if (IS_ERR(di)) {
			ret = PTR_ERR(di);
			goto out;
		}
		if (di) {
			/* not unique, try again */
			idx++;
			continue;
		}

		if (!sctx->parent_root) {
			/* unique */
			ret = 0;
			break;
		}

		di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
				path, BTRFS_FIRST_FREE_OBJECTID,
				&tmp_name, 0);
		btrfs_release_path(path);
		if (IS_ERR(di)) {
			ret = PTR_ERR(di);
			goto out;
		}
		if (di) {
			/* not unique, try again */
			idx++;
			continue;
		}
		/* unique */
		break;
	}

	ret = fs_path_add(dest, tmp, strlen(tmp));

out:
	btrfs_free_path(path);
	return ret;
}
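
/*
 * Example of the names generated above: for inode 261 with generation 7 the
 * candidates are "o261-7-0", "o261-7-1", and so on, and the first name that
 * exists in neither the send root nor the parent root is used as the
 * temporary (orphan) name.
 */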

enum inode_state {
	inode_state_no_change,
	inode_state_will_create,
	inode_state_did_create,
	inode_state_will_delete,
	inode_state_did_delete,
};

static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen,
			       u64 *send_gen, u64 *parent_gen)
{
	int ret;
	int left_ret;
	int right_ret;
	u64 left_gen;
1901
	u64 right_gen = 0;
1902
	struct btrfs_inode_info info;

	ret = get_inode_info(sctx->send_root, ino, &info);
	if (ret < 0 && ret != -ENOENT)
		goto out;
	left_ret = (info.nlink == 0) ? -ENOENT : ret;
	left_gen = info.gen;
	if (send_gen)
		*send_gen = ((left_ret == -ENOENT) ? 0 : info.gen);

	if (!sctx->parent_root) {
		right_ret = -ENOENT;
	} else {
		ret = get_inode_info(sctx->parent_root, ino, &info);
		if (ret < 0 && ret != -ENOENT)
			goto out;
		right_ret = (info.nlink == 0) ? -ENOENT : ret;
		right_gen = info.gen;
		if (parent_gen)
			*parent_gen = ((right_ret == -ENOENT) ? 0 : info.gen);
	}

	if (!left_ret && !right_ret) {
		if (left_gen == gen && right_gen == gen) {
			ret = inode_state_no_change;
		} else if (left_gen == gen) {
			if (ino < sctx->send_progress)
				ret = inode_state_did_create;
			else
				ret = inode_state_will_create;
		} else if (right_gen == gen) {
			if (ino < sctx->send_progress)
				ret = inode_state_did_delete;
			else
				ret = inode_state_will_delete;
		} else  {
			ret = -ENOENT;
		}
	} else if (!left_ret) {
		if (left_gen == gen) {
			if (ino < sctx->send_progress)
				ret = inode_state_did_create;
			else
				ret = inode_state_will_create;
		} else {
			ret = -ENOENT;
		}
	} else if (!right_ret) {
		if (right_gen == gen) {
			if (ino < sctx->send_progress)
				ret = inode_state_did_delete;
			else
				ret = inode_state_will_delete;
		} else {
			ret = -ENOENT;
		}
	} else {
		ret = -ENOENT;
	}

out:
	return ret;
}

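/*
 * Check if an inode (ino, gen) exists from the receiver's point of view at
 * the current send progress, i.e. in the send root or in the parent root.
 * Optionally returns the generation found in the send root and/or the parent
 * root (0 if not present there). Returns 1 if existent, 0 if not and < 0 on
 * error.
 */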
static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen,
			     u64 *send_gen, u64 *parent_gen)
{
	int ret;

	if (ino == BTRFS_FIRST_FREE_OBJECTID)
		return 1;

	ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen);
	if (ret < 0)
		goto out;

	if (ret == inode_state_no_change ||
	    ret == inode_state_did_create ||
	    ret == inode_state_will_delete)
		ret = 1;
	else
		ret = 0;

out:
	return ret;
}

/*
 * Helper function to look up a dir item in a dir.
 */
static int lookup_dir_item_inode(struct btrfs_root *root,
				 u64 dir, const char *name, int name_len,
				 u64 *found_inode)
{
	int ret = 0;
	struct btrfs_dir_item *di;
	struct btrfs_key key;
	struct btrfs_path *path;
	struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len);

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0);
	if (IS_ERR_OR_NULL(di)) {
		ret = di ? PTR_ERR(di) : -ENOENT;
		goto out;
	}
	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
	if (key.type == BTRFS_ROOT_ITEM_KEY) {
		ret = -ENOENT;
		goto out;
	}
	*found_inode = key.objectid;

out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Looks up the first btrfs_inode_ref of a given ino. It returns the parent dir,
 * generation of the parent dir and the name of the dir entry.
 */
static int get_first_ref(struct btrfs_root *root, u64 ino,
			 u64 *dir, u64 *dir_gen, struct fs_path *name)
{
	int ret;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_path *path;
	int len;
	u64 parent_dir;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	key.objectid = ino;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = 0;

	ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
	if (ret < 0)
		goto out;
	if (!ret)
		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				path->slots[0]);
	if (ret || found_key.objectid != ino ||
	    (found_key.type != BTRFS_INODE_REF_KEY &&
	     found_key.type != BTRFS_INODE_EXTREF_KEY)) {
		ret = -ENOENT;
		goto out;
	}

	if (found_key.type == BTRFS_INODE_REF_KEY) {
		struct btrfs_inode_ref *iref;
		iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
				      struct btrfs_inode_ref);
		len = btrfs_inode_ref_name_len(path->nodes[0], iref);
		ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
						     (unsigned long)(iref + 1),
						     len);
		parent_dir = found_key.offset;
	} else {
		struct btrfs_inode_extref *extref;
		extref = btrfs_item_ptr(path->nodes[0], path->slots[0],
					struct btrfs_inode_extref);
		len = btrfs_inode_extref_name_len(path->nodes[0], extref);
		ret = fs_path_add_from_extent_buffer(name, path->nodes[0],
					(unsigned long)&extref->name, len);
		parent_dir = btrfs_inode_extref_parent(path->nodes[0], extref);
	}
	if (ret < 0)
		goto out;
	btrfs_release_path(path);

	if (dir_gen) {
		ret = get_inode_gen(root, parent_dir, dir_gen);
		if (ret < 0)
			goto out;
	}

	*dir = parent_dir;

out:
	btrfs_free_path(path);
	return ret;
}

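/*
 * Check if the given (dir, name) pair is the first reference of inode 'ino',
 * i.e. the one that get_first_ref() would return. Returns 1 if it is, 0 if
 * not and < 0 on error.
 */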
static int is_first_ref(struct btrfs_root *root,
			u64 ino, u64 dir,
			const char *name, int name_len)
{
	int ret;
	struct fs_path *tmp_name;
	u64 tmp_dir;

	tmp_name = fs_path_alloc();
	if (!tmp_name)
		return -ENOMEM;

	ret = get_first_ref(root, ino, &tmp_dir, NULL, tmp_name);
	if (ret < 0)
		goto out;

	if (dir != tmp_dir || name_len != fs_path_len(tmp_name)) {
		ret = 0;
		goto out;
	}

	ret = !memcmp(tmp_name->start, name, name_len);

out:
	fs_path_free(tmp_name);
	return ret;
}

/*
 * Used by process_recorded_refs to determine if a new ref would overwrite an
 * already existing ref. In case it detects an overwrite, it returns the
 * inode/gen in who_ino/who_gen.
 * When an overwrite is detected, process_recorded_refs does proper orphanizing
 * to make sure later references to the overwritten inode are possible.
 * Orphanizing is however only required for the first ref of an inode.
 * process_recorded_refs does an additional is_first_ref check to see if
 * orphanizing is really required.
 */
static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
			      const char *name, int name_len,
			      u64 *who_ino, u64 *who_gen, u64 *who_mode)
{
	int ret;
	u64 parent_root_dir_gen;
	u64 other_inode = 0;
	struct btrfs_inode_info info;

	if (!sctx->parent_root)
		return 0;

	ret = is_inode_existent(sctx, dir, dir_gen, NULL, &parent_root_dir_gen);
	if (ret <= 0)
		return 0;

	/*
	 * If we have a parent root we need to verify that the parent dir was
	 * not deleted and then re-created, if it was then we have no overwrite
	 * and we can just unlink this entry.
	 *
	 * @parent_root_dir_gen was set to 0 if the inode does not exist in the
	 * parent root.
	 */
	if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID &&
	    parent_root_dir_gen != dir_gen)
		return 0;

	ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
				    &other_inode);
	if (ret == -ENOENT)
		return 0;
	else if (ret < 0)
		return ret;

	/*
	 * Check if the overwritten ref was already processed. If yes, the ref
	 * was already unlinked/moved, so we can safely assume that we will not
	 * overwrite anything at this point in time.
	 */
	if (other_inode > sctx->send_progress ||
	    is_waiting_for_move(sctx, other_inode)) {
		ret = get_inode_info(sctx->parent_root, other_inode, &info);
		if (ret < 0)
			return ret;

		*who_ino = other_inode;
		*who_gen = info.gen;
		*who_mode = info.mode;
		return 1;
	}

	return 0;
}

/*
 * Checks if the ref was overwritten by an already processed inode. This is
 * used by __get_cur_name_and_parent to find out if the ref was orphanized and
 * thus the orphan name needs to be used.
 * process_recorded_refs also uses it to avoid unlinking of refs that were
 * overwritten.
 */
static int did_overwrite_ref(struct send_ctx *sctx,
			    u64 dir, u64 dir_gen,
			    u64 ino, u64 ino_gen,
			    const char *name, int name_len)
{
	int ret;
	u64 ow_inode;
	u64 ow_gen = 0;
	u64 send_root_dir_gen;

	if (!sctx->parent_root)
		return 0;

	ret = is_inode_existent(sctx, dir, dir_gen, &send_root_dir_gen, NULL);
	if (ret <= 0)
		return ret;

	/*
	 * @send_root_dir_gen was set to 0 if the inode does not exist in the
	 * send root.
	 */
	if (dir != BTRFS_FIRST_FREE_OBJECTID && send_root_dir_gen != dir_gen)
		return 0;

	/* check if the ref was overwritten by another ref */
	ret = lookup_dir_item_inode(sctx->send_root, dir, name, name_len,
				    &ow_inode);
	if (ret == -ENOENT) {
		/* was never and will never be overwritten */
		return 0;
	} else if (ret < 0) {
		return ret;
	}

	if (ow_inode == ino) {
		ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen);
		if (ret < 0)
			return ret;

		/* It's the same inode, so no overwrite happened. */
		if (ow_gen == ino_gen)
			return 0;
	}

	/*
	 * We know that it is or will be overwritten. Check this now.
	 * The current inode being processed might have been the one that caused
	 * inode 'ino' to be orphanized, therefore check if ow_inode matches
	 * the current inode being processed.
	 */
	if (ow_inode < sctx->send_progress)
		return 1;

	if (ino != sctx->cur_ino && ow_inode == sctx->cur_ino) {
		if (ow_gen == 0) {
			ret = get_inode_gen(sctx->send_root, ow_inode, &ow_gen);
			if (ret < 0)
				return ret;
		}
		if (ow_gen == sctx->cur_inode_gen)
			return 1;
	}

	return 0;
}

/*
 * Same as did_overwrite_ref, but also checks if it is the first ref of an inode
 * that got overwritten. This is used by process_recorded_refs to determine
 * if it has to use the path as returned by get_cur_path or the orphan name.
 */
static int did_overwrite_first_ref(struct send_ctx *sctx, u64 ino, u64 gen)
{
	int ret = 0;
	struct fs_path *name = NULL;
	u64 dir;
	u64 dir_gen;

	if (!sctx->parent_root)
		goto out;

	name = fs_path_alloc();
	if (!name)
		return -ENOMEM;

	ret = get_first_ref(sctx->parent_root, ino, &dir, &dir_gen, name);
	if (ret < 0)
		goto out;

	ret = did_overwrite_ref(sctx, dir, dir_gen, ino, gen,
			name->start, fs_path_len(name));

out:
	fs_path_free(name);
	return ret;
}

static inline struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
							 u64 ino, u64 gen)
{
	struct btrfs_lru_cache_entry *entry;

	entry = btrfs_lru_cache_lookup(&sctx->name_cache, ino, gen);
	if (!entry)
		return NULL;

	return container_of(entry, struct name_cache_entry, entry);
}

/*
 * Used by get_cur_path for each ref up to the root.
 * Returns 0 if it succeeded.
 * Returns 1 if the inode does not exist or got overwritten. In that case, the
 * name is an orphan name. This instructs get_cur_path to stop iterating. If 1
 * is returned, parent_ino/parent_gen are not guaranteed to be valid.
 * Returns <0 in case of error.
 */
static int __get_cur_name_and_parent(struct send_ctx *sctx,
				     u64 ino, u64 gen,
				     u64 *parent_ino,
				     u64 *parent_gen,
				     struct fs_path *dest)
{
	int ret;
	int nce_ret;
	struct name_cache_entry *nce;

	/*
	 * First check if we already did a call to this function with the same
	 * ino/gen. If yes, check if the cache entry is still up-to-date. If yes
	 * return the cached result.
	 */
	nce = name_cache_search(sctx, ino, gen);
	if (nce) {
		if (ino < sctx->send_progress && nce->need_later_update) {
			btrfs_lru_cache_remove(&sctx->name_cache, &nce->entry);
			nce = NULL;
		} else {
			*parent_ino = nce->parent_ino;
			*parent_gen = nce->parent_gen;
			ret = fs_path_add(dest, nce->name, nce->name_len);
			if (ret < 0)
				goto out;
			ret = nce->ret;
			goto out;
		}
	}

	/*
	 * If the inode does not exist yet, add the orphan name and return 1.
	 * This should only happen for the parent dir that we determine in
	 * record_new_ref_if_needed().
	 */
	ret = is_inode_existent(sctx, ino, gen, NULL, NULL);
	if (ret < 0)
		goto out;

	if (!ret) {
		ret = gen_unique_name(sctx, ino, gen, dest);
		if (ret < 0)
			goto out;
		ret = 1;
		goto out_cache;
	}

	/*
	 * Depending on whether the inode was already processed or not, use
	 * send_root or parent_root for ref lookup.
	 */
	if (ino < sctx->send_progress)
		ret = get_first_ref(sctx->send_root, ino,
				    parent_ino, parent_gen, dest);
	else
		ret = get_first_ref(sctx->parent_root, ino,
				    parent_ino, parent_gen, dest);
	if (ret < 0)
		goto out;

	/*
	 * Check if the ref was overwritten by an inode's ref that was processed
	 * earlier. If yes, treat as orphan and return 1.
	 */
	ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen,
			dest->start, dest->end - dest->start);
	if (ret < 0)
		goto out;
	if (ret) {
		fs_path_reset(dest);
		ret = gen_unique_name(sctx, ino, gen, dest);
		if (ret < 0)
			goto out;
		ret = 1;
	}

out_cache:
	/*
	 * Store the result of the lookup in the name cache.
	 */
	nce = kmalloc(sizeof(*nce) + fs_path_len(dest) + 1, GFP_KERNEL);
	if (!nce) {
		ret = -ENOMEM;
		goto out;
	}

	nce->entry.key = ino;
	nce->entry.gen = gen;
	nce->parent_ino = *parent_ino;
	nce->parent_gen = *parent_gen;
	nce->name_len = fs_path_len(dest);
	nce->ret = ret;
	strcpy(nce->name, dest->start);

	if (ino < sctx->send_progress)
		nce->need_later_update = 0;
	else
		nce->need_later_update = 1;

	nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL);
	if (nce_ret < 0) {
		kfree(nce);
		ret = nce_ret;
	}

out:
	return ret;
}

/*
 * Magic happens here. This function returns the first ref to an inode as it
 * would look like while receiving the stream at this point in time.
 * We walk the path up to the root. For every inode in between, we check if it
 * was already processed/sent. If yes, we continue with the parent as found
 * in send_root. If not, we continue with the parent as found in parent_root.
 * If we encounter an inode that was deleted at this point in time, we use the
 * inode's "orphan" name instead of the real name and stop. Same with new inodes
 * that were not created yet and overwritten inodes/refs.
 *
 * When do we have orphan inodes:
 * 1. When an inode is freshly created and thus no valid refs are available yet
 * 2. When a directory lost all its refs (deleted) but still has dir items
 *    inside which were not processed yet (pending for move/delete). If anyone
 *    tried to get the path to the dir items, it would get a path inside that
 *    orphan directory.
 * 3. When an inode is moved around or gets new links, it may overwrite the ref
 *    of an unprocessed inode. If in that case the first ref would be
 *    overwritten, the overwritten inode gets "orphanized". Later when we
 *    process this overwritten inode, it is restored at a new place by moving
 *    the orphan inode.
 *
 * sctx->send_progress tells this function at which point in time receiving
 * would be.
 */
static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen,
			struct fs_path *dest)
{
	int ret = 0;
	struct fs_path *name = NULL;
	u64 parent_inode = 0;
	u64 parent_gen = 0;
	int stop = 0;

	name = fs_path_alloc();
	if (!name) {
		ret = -ENOMEM;
		goto out;
	}

	dest->reversed = 1;
	fs_path_reset(dest);

	while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) {
		struct waiting_dir_move *wdm;

		fs_path_reset(name);

		if (is_waiting_for_rm(sctx, ino, gen)) {
			ret = gen_unique_name(sctx, ino, gen, name);
			if (ret < 0)
				goto out;
			ret = fs_path_add_path(dest, name);
			break;
		}

		wdm = get_waiting_dir_move(sctx, ino);
		if (wdm && wdm->orphanized) {
			ret = gen_unique_name(sctx, ino, gen, name);
			stop = 1;
		} else if (wdm) {
			ret = get_first_ref(sctx->parent_root, ino,
					    &parent_inode, &parent_gen, name);
		} else {
			ret = __get_cur_name_and_parent(sctx, ino, gen,
							&parent_inode,
							&parent_gen, name);
			if (ret)
				stop = 1;
		}

		if (ret < 0)
			goto out;

		ret = fs_path_add_path(dest, name);
		if (ret < 0)
			goto out;

		ino = parent_inode;
		gen = parent_gen;
	}

out:
	fs_path_free(name);
	if (!ret)
		fs_path_unreverse(dest);
	return ret;
}

/*
 * Sends a BTRFS_SEND_C_SUBVOL command/item to userspace
 */
static int send_subvol_begin(struct send_ctx *sctx)
{
	int ret;
	struct btrfs_root *send_root = sctx->send_root;
	struct btrfs_root *parent_root = sctx->parent_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_root_ref *ref;
	struct extent_buffer *leaf;
	char *name = NULL;
	int namelen;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	name = kmalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
	if (!name) {
		btrfs_free_path(path);
		return -ENOMEM;
	}

	key.objectid = btrfs_root_id(send_root);
	key.type = BTRFS_ROOT_BACKREF_KEY;
	key.offset = 0;

	ret = btrfs_search_slot_for_read(send_root->fs_info->tree_root,
				&key, path, 1, 0);
	if (ret < 0)
		goto out;
	if (ret) {
		ret = -ENOENT;
		goto out;
	}

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	if (key.type != BTRFS_ROOT_BACKREF_KEY ||
	    key.objectid != btrfs_root_id(send_root)) {
		ret = -ENOENT;
		goto out;
	}
	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
	namelen = btrfs_root_ref_name_len(leaf, ref);
	read_extent_buffer(leaf, name, (unsigned long)(ref + 1), namelen);
	btrfs_release_path(path);

	if (parent_root) {
		ret = begin_cmd(sctx, BTRFS_SEND_C_SNAPSHOT);
		if (ret < 0)
			goto out;
	} else {
		ret = begin_cmd(sctx, BTRFS_SEND_C_SUBVOL);
		if (ret < 0)
			goto out;
	}

	TLV_PUT_STRING(sctx, BTRFS_SEND_A_PATH, name, namelen);

	if (!btrfs_is_empty_uuid(sctx->send_root->root_item.received_uuid))
		TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
			    sctx->send_root->root_item.received_uuid);
	else
		TLV_PUT_UUID(sctx, BTRFS_SEND_A_UUID,
			    sctx->send_root->root_item.uuid);

	TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID,
		    btrfs_root_ctransid(&sctx->send_root->root_item));
	if (parent_root) {
		if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid))
			TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
				     parent_root->root_item.received_uuid);
		else
			TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
				     parent_root->root_item.uuid);
		TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
			    btrfs_root_ctransid(&sctx->parent_root->root_item));
	}

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	btrfs_free_path(path);
	kfree(name);
	return ret;
}

static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	int ret = 0;
	struct fs_path *p;

	btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size);

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;

	ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, ino, gen, p);
	if (ret < 0)
		goto out;
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(p);
	return ret;
}

static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	int ret = 0;
	struct fs_path *p;

	btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode);

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;

	ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, ino, gen, p);
	if (ret < 0)
		goto out;
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(p);
	return ret;
}

static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	int ret = 0;
	struct fs_path *p;

	if (sctx->proto < 2)
		return 0;

	btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr);

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;

	ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, ino, gen, p);
	if (ret < 0)
		goto out;
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(p);
	return ret;
}

static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	int ret = 0;
	struct fs_path *p;

	btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu",
		    ino, uid, gid);

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;

	ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, ino, gen, p);
	if (ret < 0)
		goto out;
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(p);
	return ret;
}

static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	int ret = 0;
	struct fs_path *p = NULL;
	struct btrfs_inode_item *ii;
	struct btrfs_path *path = NULL;
	struct extent_buffer *eb;
	struct btrfs_key key;
	int slot;

	btrfs_debug(fs_info, "send_utimes %llu", ino);

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;

	path = alloc_path_for_send();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	key.objectid = ino;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
	if (ret > 0)
		ret = -ENOENT;
	if (ret < 0)
		goto out;

	eb = path->nodes[0];
	slot = path->slots[0];
	ii = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);

	ret = begin_cmd(sctx, BTRFS_SEND_C_UTIMES);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, ino, gen, p);
	if (ret < 0)
		goto out;
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
	TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
	if (sctx->proto >= 2)
		TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(p);
	btrfs_free_path(path);
	return ret;
}

/*
 * If the cache is full, we can't remove entries from it and do a call to
 * send_utimes() for each respective inode, because we might be finishing
 * processing an inode that is a directory and it just got renamed, and existing
 * entries in the cache may refer to inodes that have the directory in their
 * full path - in which case we would generate outdated paths (pre-rename)
 * for the inodes that the cache entries point to. Instead of pruning the
 * cache when inserting, do it after we finish processing each inode at
 * finish_inode_if_needed().
 */
static int cache_dir_utimes(struct send_ctx *sctx, u64 dir, u64 gen)
{
	struct btrfs_lru_cache_entry *entry;
	int ret;

	entry = btrfs_lru_cache_lookup(&sctx->dir_utimes_cache, dir, gen);
	if (entry != NULL)
		return 0;

	/* Caching is optional, don't fail if we can't allocate memory. */
	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return send_utimes(sctx, dir, gen);

	entry->key = dir;
	entry->gen = gen;

	ret = btrfs_lru_cache_store(&sctx->dir_utimes_cache, entry, GFP_KERNEL);
	ASSERT(ret != -EEXIST);
	if (ret) {
		kfree(entry);
		return send_utimes(sctx, dir, gen);
	}

	return 0;
}

static int trim_dir_utimes_cache(struct send_ctx *sctx)
{
	while (sctx->dir_utimes_cache.size > SEND_MAX_DIR_UTIMES_CACHE_SIZE) {
		struct btrfs_lru_cache_entry *lru;
		int ret;

		lru = btrfs_lru_cache_lru_entry(&sctx->dir_utimes_cache);
		ASSERT(lru != NULL);

		ret = send_utimes(sctx, lru->key, lru->gen);
		if (ret)
			return ret;

		btrfs_lru_cache_remove(&sctx->dir_utimes_cache, lru);
	}

	return 0;
}

/*
 * Sends a BTRFS_SEND_C_MKXXX or SYMLINK command to user space. We don't have
 * a valid path yet because we did not process the refs yet. So, the inode
 * is created as an orphan.
 */
static int send_create_inode(struct send_ctx *sctx, u64 ino)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	int ret = 0;
	struct fs_path *p;
	int cmd;
	struct btrfs_inode_info info;
	u64 gen;
	u64 mode;
	u64 rdev;

	btrfs_debug(fs_info, "send_create_inode %llu", ino);

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;

	if (ino != sctx->cur_ino) {
		ret = get_inode_info(sctx->send_root, ino, &info);
		if (ret < 0)
			goto out;
		gen = info.gen;
		mode = info.mode;
		rdev = info.rdev;
	} else {
		gen = sctx->cur_inode_gen;
		mode = sctx->cur_inode_mode;
		rdev = sctx->cur_inode_rdev;
	}

	if (S_ISREG(mode)) {
		cmd = BTRFS_SEND_C_MKFILE;
	} else if (S_ISDIR(mode)) {
		cmd = BTRFS_SEND_C_MKDIR;
	} else if (S_ISLNK(mode)) {
		cmd = BTRFS_SEND_C_SYMLINK;
	} else if (S_ISCHR(mode) || S_ISBLK(mode)) {
		cmd = BTRFS_SEND_C_MKNOD;
	} else if (S_ISFIFO(mode)) {
		cmd = BTRFS_SEND_C_MKFIFO;
	} else if (S_ISSOCK(mode)) {
		cmd = BTRFS_SEND_C_MKSOCK;
	} else {
		btrfs_warn(sctx->send_root->fs_info, "unexpected inode type %o",
				(int)(mode & S_IFMT));
		ret = -EOPNOTSUPP;
		goto out;
	}

	ret = begin_cmd(sctx, cmd);
	if (ret < 0)
		goto out;

	ret = gen_unique_name(sctx, ino, gen, p);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_INO, ino);

	if (S_ISLNK(mode)) {
		fs_path_reset(p);
		ret = read_symlink(sctx->send_root, ino, p);
		if (ret < 0)
			goto out;
		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, p);
	} else if (S_ISCHR(mode) || S_ISBLK(mode) ||
		   S_ISFIFO(mode) || S_ISSOCK(mode)) {
		TLV_PUT_U64(sctx, BTRFS_SEND_A_RDEV, new_encode_dev(rdev));
		TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode);
	}

	ret = send_cmd(sctx);
	if (ret < 0)
		goto out;


tlv_put_failure:
out:
	fs_path_free(p);
	return ret;
}

static void cache_dir_created(struct send_ctx *sctx, u64 dir)
{
	struct btrfs_lru_cache_entry *entry;
	int ret;

	/* Caching is optional, ignore any failures. */
	entry = kmalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return;

	entry->key = dir;
	entry->gen = 0;
	ret = btrfs_lru_cache_store(&sctx->dir_created_cache, entry, GFP_KERNEL);
	if (ret < 0)
		kfree(entry);
}

/*
 * We need some special handling for inodes that get processed before the parent
 * directory got created. See process_recorded_refs for details.
 * This function does the check if we already created the dir out of order.
 */
static int did_create_dir(struct send_ctx *sctx, u64 dir)
{
	int ret = 0;
	int iter_ret = 0;
	struct btrfs_path *path = NULL;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_key di_key;
	struct btrfs_dir_item *di;

	if (btrfs_lru_cache_lookup(&sctx->dir_created_cache, dir, 0))
		return 1;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	key.objectid = dir;
	key.type = BTRFS_DIR_INDEX_KEY;
	key.offset = 0;

	btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) {
		struct extent_buffer *eb = path->nodes[0];

		if (found_key.objectid != key.objectid ||
		    found_key.type != key.type) {
			ret = 0;
			break;
		}

		di = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item);
		btrfs_dir_item_key_to_cpu(eb, di, &di_key);

		if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
		    di_key.objectid < sctx->send_progress) {
			ret = 1;
			cache_dir_created(sctx, dir);
			break;
		}
	}
	/* Catch error found during iteration */
	if (iter_ret < 0)
		ret = iter_ret;

	btrfs_free_path(path);
	return ret;
}

/*
 * Only creates the inode if it is:
 * 1. Not a directory
 * 2. Or a directory which was not created already due to out of order
 *    directories. See did_create_dir and process_recorded_refs for details.
 */
static int send_create_inode_if_needed(struct send_ctx *sctx)
{
	int ret;

	if (S_ISDIR(sctx->cur_inode_mode)) {
		ret = did_create_dir(sctx, sctx->cur_ino);
		if (ret < 0)
			return ret;
		else if (ret > 0)
			return 0;
	}

	ret = send_create_inode(sctx, sctx->cur_ino);

	if (ret == 0 && S_ISDIR(sctx->cur_inode_mode))
		cache_dir_created(sctx, sctx->cur_ino);

	return ret;
}

struct recorded_ref {
	struct list_head list;
	char *name;
	struct fs_path *full_path;
	u64 dir;
	u64 dir_gen;
	int name_len;
	struct rb_node node;
	struct rb_root *root;
};

static struct recorded_ref *recorded_ref_alloc(void)
{
	struct recorded_ref *ref;

	ref = kzalloc(sizeof(*ref), GFP_KERNEL);
	if (!ref)
		return NULL;
	RB_CLEAR_NODE(&ref->node);
	INIT_LIST_HEAD(&ref->list);
	return ref;
}

static void recorded_ref_free(struct recorded_ref *ref)
{
	if (!ref)
		return;
	if (!RB_EMPTY_NODE(&ref->node))
		rb_erase(&ref->node, ref->root);
	list_del(&ref->list);
	fs_path_free(ref->full_path);
	kfree(ref);
}

static void set_ref_path(struct recorded_ref *ref, struct fs_path *path)
{
	ref->full_path = path;
	ref->name = (char *)kbasename(ref->full_path->start);
	ref->name_len = ref->full_path->end - ref->name;
}

static int dup_ref(struct recorded_ref *ref, struct list_head *list)
{
	struct recorded_ref *new;

	new = recorded_ref_alloc();
	if (!new)
		return -ENOMEM;

	new->dir = ref->dir;
	new->dir_gen = ref->dir_gen;
	list_add_tail(&new->list, list);
	return 0;
}

static void __free_recorded_refs(struct list_head *head)
{
	struct recorded_ref *cur;

	while (!list_empty(head)) {
		cur = list_entry(head->next, struct recorded_ref, list);
		recorded_ref_free(cur);
	}
}

static void free_recorded_refs(struct send_ctx *sctx)
{
	__free_recorded_refs(&sctx->new_refs);
	__free_recorded_refs(&sctx->deleted_refs);
}

/*
 * Renames/moves a file/dir to its orphan name. Used when the first
 * ref of an unprocessed inode gets overwritten and for all non empty
 * directories.
 */
static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen,
			  struct fs_path *path)
{
	int ret;
	struct fs_path *orphan;

	orphan = fs_path_alloc();
	if (!orphan)
		return -ENOMEM;

	ret = gen_unique_name(sctx, ino, gen, orphan);
	if (ret < 0)
		goto out;

	ret = send_rename(sctx, path, orphan);

out:
	fs_path_free(orphan);
	return ret;
}

static struct orphan_dir_info *add_orphan_dir_info(struct send_ctx *sctx,
						   u64 dir_ino, u64 dir_gen)
{
	struct rb_node **p = &sctx->orphan_dirs.rb_node;
	struct rb_node *parent = NULL;
	struct orphan_dir_info *entry, *odi;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct orphan_dir_info, node);
		if (dir_ino < entry->ino)
			p = &(*p)->rb_left;
		else if (dir_ino > entry->ino)
			p = &(*p)->rb_right;
		else if (dir_gen < entry->gen)
			p = &(*p)->rb_left;
		else if (dir_gen > entry->gen)
			p = &(*p)->rb_right;
		else
			return entry;
	}

	odi = kmalloc(sizeof(*odi), GFP_KERNEL);
	if (!odi)
		return ERR_PTR(-ENOMEM);
	odi->ino = dir_ino;
	odi->gen = dir_gen;
	odi->last_dir_index_offset = 0;
	odi->dir_high_seq_ino = 0;

	rb_link_node(&odi->node, parent, p);
	rb_insert_color(&odi->node, &sctx->orphan_dirs);
	return odi;
}

static struct orphan_dir_info *get_orphan_dir_info(struct send_ctx *sctx,
						   u64 dir_ino, u64 gen)
{
	struct rb_node *n = sctx->orphan_dirs.rb_node;
	struct orphan_dir_info *entry;

	while (n) {
		entry = rb_entry(n, struct orphan_dir_info, node);
		if (dir_ino < entry->ino)
			n = n->rb_left;
		else if (dir_ino > entry->ino)
			n = n->rb_right;
		else if (gen < entry->gen)
			n = n->rb_left;
		else if (gen > entry->gen)
			n = n->rb_right;
		else
			return entry;
	}
	return NULL;
}

static int is_waiting_for_rm(struct send_ctx *sctx, u64 dir_ino, u64 gen)
{
	struct orphan_dir_info *odi = get_orphan_dir_info(sctx, dir_ino, gen);

	return odi != NULL;
}

static void free_orphan_dir_info(struct send_ctx *sctx,
				 struct orphan_dir_info *odi)
{
	if (!odi)
		return;
	rb_erase(&odi->node, &sctx->orphan_dirs);
	kfree(odi);
}

/*
 * Returns 1 if a directory can be removed at this point in time.
 * We check this by iterating all dir items and checking if the inode behind
 * the dir item was already processed.
 */
static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen)
{
	int ret = 0;
	int iter_ret = 0;
	struct btrfs_root *root = sctx->parent_root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	struct btrfs_key loc;
	struct btrfs_dir_item *di;
	struct orphan_dir_info *odi = NULL;
	u64 dir_high_seq_ino = 0;
	u64 last_dir_index_offset = 0;

	/*
	 * Don't try to rmdir the top/root subvolume dir.
	 */
	if (dir == BTRFS_FIRST_FREE_OBJECTID)
		return 0;

	odi = get_orphan_dir_info(sctx, dir, dir_gen);
	if (odi && sctx->cur_ino < odi->dir_high_seq_ino)
		return 0;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	if (!odi) {
		/*
		 * Find the inode number associated with the last dir index
		 * entry. This is very likely the inode with the highest number
		 * of all inodes that have an entry in the directory. We can
		 * then use it to avoid future calls to can_rmdir(), when
		 * processing inodes with a lower number, from having to search
		 * the parent root b+tree for dir index keys.
		 */
		key.objectid = dir;
		key.type = BTRFS_DIR_INDEX_KEY;
		key.offset = (u64)-1;

		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
		if (ret < 0) {
			goto out;
		} else if (ret > 0) {
			/* Can't happen, the root is never empty. */
			ASSERT(path->slots[0] > 0);
			if (WARN_ON(path->slots[0] == 0)) {
				ret = -EUCLEAN;
				goto out;
			}
			path->slots[0]--;
		}

		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid != dir || key.type != BTRFS_DIR_INDEX_KEY) {
			/* No index keys, dir can be removed. */
			ret = 1;
			goto out;
		}

		di = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_dir_item);
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);
		dir_high_seq_ino = loc.objectid;
		if (sctx->cur_ino < dir_high_seq_ino) {
			ret = 0;
			goto out;
		}

		btrfs_release_path(path);
	}

	key.objectid = dir;
	key.type = BTRFS_DIR_INDEX_KEY;
	key.offset = (odi ? odi->last_dir_index_offset : 0);

	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
		struct waiting_dir_move *dm;

		if (found_key.objectid != key.objectid ||
		    found_key.type != key.type)
			break;

		di = btrfs_item_ptr(path->nodes[0], path->slots[0],
				struct btrfs_dir_item);
		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &loc);

		dir_high_seq_ino = max(dir_high_seq_ino, loc.objectid);
		last_dir_index_offset = found_key.offset;

		dm = get_waiting_dir_move(sctx, loc.objectid);
		if (dm) {
			dm->rmdir_ino = dir;
			dm->rmdir_gen = dir_gen;
			ret = 0;
			goto out;
		}

		if (loc.objectid > sctx->cur_ino) {
			ret = 0;
			goto out;
		}
	}
	if (iter_ret < 0) {
		ret = iter_ret;
		goto out;
	}
	free_orphan_dir_info(sctx, odi);

	ret = 1;

out:
	btrfs_free_path(path);

	if (ret)
		return ret;

	if (!odi) {
		odi = add_orphan_dir_info(sctx, dir, dir_gen);
		if (IS_ERR(odi))
			return PTR_ERR(odi);

		odi->gen = dir_gen;
	}

	odi->last_dir_index_offset = last_dir_index_offset;
	odi->dir_high_seq_ino = max(odi->dir_high_seq_ino, dir_high_seq_ino);

	return 0;
}

static int is_waiting_for_move(struct send_ctx *sctx, u64 ino)
{
	struct waiting_dir_move *entry = get_waiting_dir_move(sctx, ino);

	return entry != NULL;
}

static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized)
{
	struct rb_node **p = &sctx->waiting_dir_moves.rb_node;
	struct rb_node *parent = NULL;
	struct waiting_dir_move *entry, *dm;

	dm = kmalloc(sizeof(*dm), GFP_KERNEL);
	if (!dm)
		return -ENOMEM;
	dm->ino = ino;
	dm->rmdir_ino = 0;
	dm->rmdir_gen = 0;
	dm->orphanized = orphanized;

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct waiting_dir_move, node);
		if (ino < entry->ino) {
			p = &(*p)->rb_left;
		} else if (ino > entry->ino) {
			p = &(*p)->rb_right;
		} else {
			kfree(dm);
			return -EEXIST;
		}
	}

	rb_link_node(&dm->node, parent, p);
	rb_insert_color(&dm->node, &sctx->waiting_dir_moves);
	return 0;
}

static struct waiting_dir_move *
get_waiting_dir_move(struct send_ctx *sctx, u64 ino)
{
	struct rb_node *n = sctx->waiting_dir_moves.rb_node;
	struct waiting_dir_move *entry;

	while (n) {
		entry = rb_entry(n, struct waiting_dir_move, node);
		if (ino < entry->ino)
			n = n->rb_left;
		else if (ino > entry->ino)
			n = n->rb_right;
		else
			return entry;
	}
	return NULL;
}

static void free_waiting_dir_move(struct send_ctx *sctx,
				  struct waiting_dir_move *dm)
{
	if (!dm)
		return;
	rb_erase(&dm->node, &sctx->waiting_dir_moves);
	kfree(dm);
}

static int add_pending_dir_move(struct send_ctx *sctx,
				u64 ino,
				u64 ino_gen,
				u64 parent_ino,
				struct list_head *new_refs,
				struct list_head *deleted_refs,
				const bool is_orphan)
{
	struct rb_node **p = &sctx->pending_dir_moves.rb_node;
	struct rb_node *parent = NULL;
	struct pending_dir_move *entry = NULL, *pm;
	struct recorded_ref *cur;
	int exists = 0;
	int ret;

	pm = kmalloc(sizeof(*pm), GFP_KERNEL);
	if (!pm)
		return -ENOMEM;
	pm->parent_ino = parent_ino;
	pm->ino = ino;
	pm->gen = ino_gen;
	INIT_LIST_HEAD(&pm->list);
	INIT_LIST_HEAD(&pm->update_refs);
	RB_CLEAR_NODE(&pm->node);

	while (*p) {
		parent = *p;
		entry = rb_entry(parent, struct pending_dir_move, node);
		if (parent_ino < entry->parent_ino) {
			p = &(*p)->rb_left;
		} else if (parent_ino > entry->parent_ino) {
			p = &(*p)->rb_right;
		} else {
			exists = 1;
			break;
		}
	}

	list_for_each_entry(cur, deleted_refs, list) {
		ret = dup_ref(cur, &pm->update_refs);
		if (ret < 0)
			goto out;
	}
	list_for_each_entry(cur, new_refs, list) {
		ret = dup_ref(cur, &pm->update_refs);
		if (ret < 0)
			goto out;
	}

	ret = add_waiting_dir_move(sctx, pm->ino, is_orphan);
	if (ret)
		goto out;

	if (exists) {
		list_add_tail(&pm->list, &entry->list);
	} else {
		rb_link_node(&pm->node, parent, p);
		rb_insert_color(&pm->node, &sctx->pending_dir_moves);
	}
	ret = 0;
out:
	if (ret) {
		__free_recorded_refs(&pm->update_refs);
		kfree(pm);
	}
	return ret;
}

static struct pending_dir_move *get_pending_dir_moves(struct send_ctx *sctx,
						      u64 parent_ino)
{
	struct rb_node *n = sctx->pending_dir_moves.rb_node;
	struct pending_dir_move *entry;

	while (n) {
		entry = rb_entry(n, struct pending_dir_move, node);
		if (parent_ino < entry->parent_ino)
			n = n->rb_left;
		else if (parent_ino > entry->parent_ino)
			n = n->rb_right;
		else
			return entry;
	}
	return NULL;
}

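/*
 * Check if building the current path for inode 'ino' would run into a loop,
 * that is, if some ancestor found while walking up the path resolves back to
 * the inode we started from. Returns 1 if a loop was found, 0 if not and < 0
 * on error.
 */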
static int path_loop(struct send_ctx *sctx, struct fs_path *name,
		     u64 ino, u64 gen, u64 *ancestor_ino)
{
	int ret = 0;
	u64 parent_inode = 0;
	u64 parent_gen = 0;
	u64 start_ino = ino;

	*ancestor_ino = 0;
	while (ino != BTRFS_FIRST_FREE_OBJECTID) {
		fs_path_reset(name);

		if (is_waiting_for_rm(sctx, ino, gen))
			break;
		if (is_waiting_for_move(sctx, ino)) {
			if (*ancestor_ino == 0)
				*ancestor_ino = ino;
			ret = get_first_ref(sctx->parent_root, ino,
					    &parent_inode, &parent_gen, name);
		} else {
			ret = __get_cur_name_and_parent(sctx, ino, gen,
							&parent_inode,
							&parent_gen, name);
			if (ret > 0) {
				ret = 0;
				break;
			}
		}
		if (ret < 0)
			break;
		if (parent_inode == start_ino) {
			ret = 1;
			if (*ancestor_ino == 0)
				*ancestor_ino = ino;
			break;
		}
		ino = parent_inode;
		gen = parent_gen;
	}
	return ret;
}

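/*
 * Execute a previously delayed rename/move of directory inode pm->ino: build
 * the source path (its orphan name or its path in the parent root), build the
 * destination path, issue the rename, perform any rmdir that was waiting on
 * this move and update the utimes of the affected parent directories. If a
 * path loop is detected, the move is re-queued behind the offending ancestor.
 */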
static int apply_dir_move(struct send_ctx *sctx, struct pending_dir_move *pm)
{
	struct fs_path *from_path = NULL;
	struct fs_path *to_path = NULL;
	struct fs_path *name = NULL;
	u64 orig_progress = sctx->send_progress;
	struct recorded_ref *cur;
	u64 parent_ino, parent_gen;
	struct waiting_dir_move *dm = NULL;
	u64 rmdir_ino = 0;
	u64 rmdir_gen;
	u64 ancestor;
	bool is_orphan;
	int ret;

	name = fs_path_alloc();
	from_path = fs_path_alloc();
	if (!name || !from_path) {
		ret = -ENOMEM;
		goto out;
	}

	dm = get_waiting_dir_move(sctx, pm->ino);
	ASSERT(dm);
	rmdir_ino = dm->rmdir_ino;
	rmdir_gen = dm->rmdir_gen;
	is_orphan = dm->orphanized;
	free_waiting_dir_move(sctx, dm);

	if (is_orphan) {
		ret = gen_unique_name(sctx, pm->ino,
				      pm->gen, from_path);
	} else {
		ret = get_first_ref(sctx->parent_root, pm->ino,
				    &parent_ino, &parent_gen, name);
		if (ret < 0)
			goto out;
		ret = get_cur_path(sctx, parent_ino, parent_gen,
				   from_path);
		if (ret < 0)
			goto out;
		ret = fs_path_add_path(from_path, name);
	}
	if (ret < 0)
		goto out;

	sctx->send_progress = sctx->cur_ino + 1;
	ret = path_loop(sctx, name, pm->ino, pm->gen, &ancestor);
	if (ret < 0)
		goto out;
	if (ret) {
		LIST_HEAD(deleted_refs);
		ASSERT(ancestor > BTRFS_FIRST_FREE_OBJECTID);
		ret = add_pending_dir_move(sctx, pm->ino, pm->gen, ancestor,
					   &pm->update_refs, &deleted_refs,
					   is_orphan);
		if (ret < 0)
			goto out;
		if (rmdir_ino) {
			dm = get_waiting_dir_move(sctx, pm->ino);
			ASSERT(dm);
			dm->rmdir_ino = rmdir_ino;
			dm->rmdir_gen = rmdir_gen;
		}
		goto out;
	}
	fs_path_reset(name);
	to_path = name;
	name = NULL;
	ret = get_cur_path(sctx, pm->ino, pm->gen, to_path);
	if (ret < 0)
		goto out;

	ret = send_rename(sctx, from_path, to_path);
	if (ret < 0)
		goto out;

	if (rmdir_ino) {
		struct orphan_dir_info *odi;
		u64 gen;

		odi = get_orphan_dir_info(sctx, rmdir_ino, rmdir_gen);
		if (!odi) {
			/* already deleted */
			goto finish;
		}
		gen = odi->gen;

		ret = can_rmdir(sctx, rmdir_ino, gen);
		if (ret < 0)
			goto out;
		if (!ret)
			goto finish;

		name = fs_path_alloc();
		if (!name) {
			ret = -ENOMEM;
			goto out;
		}
		ret = get_cur_path(sctx, rmdir_ino, gen, name);
		if (ret < 0)
			goto out;
		ret = send_rmdir(sctx, name);
		if (ret < 0)
			goto out;
	}

finish:
	ret = cache_dir_utimes(sctx, pm->ino, pm->gen);
	if (ret < 0)
		goto out;

	/*
	 * After rename/move, need to update the utimes of both new parent(s)
	 * and old parent(s).
	 */
	list_for_each_entry(cur, &pm->update_refs, list) {
		/*
		 * The parent inode might have been deleted in the send snapshot
		 */
		ret = get_inode_info(sctx->send_root, cur->dir, NULL);
		if (ret == -ENOENT) {
			ret = 0;
			continue;
		}
		if (ret < 0)
			goto out;

		ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen);
		if (ret < 0)
			goto out;
	}

out:
	fs_path_free(name);
	fs_path_free(from_path);
	fs_path_free(to_path);
	sctx->send_progress = orig_progress;

	return ret;
}

static void free_pending_move(struct send_ctx *sctx, struct pending_dir_move *m)
{
	if (!list_empty(&m->list))
		list_del(&m->list);
	if (!RB_EMPTY_NODE(&m->node))
		rb_erase(&m->node, &sctx->pending_dir_moves);
	__free_recorded_refs(&m->update_refs);
	kfree(m);
}

static void tail_append_pending_moves(struct send_ctx *sctx,
				      struct pending_dir_move *moves,
				      struct list_head *stack)
{
	if (list_empty(&moves->list)) {
		list_add_tail(&moves->list, stack);
	} else {
		LIST_HEAD(list);
		list_splice_init(&moves->list, &list);
		list_add_tail(&moves->list, stack);
		list_splice_tail(&list, stack);
	}
	if (!RB_EMPTY_NODE(&moves->node)) {
		rb_erase(&moves->node, &sctx->pending_dir_moves);
		RB_CLEAR_NODE(&moves->node);
	}
}

static int apply_children_dir_moves(struct send_ctx *sctx)
{
	struct pending_dir_move *pm;
	LIST_HEAD(stack);
	u64 parent_ino = sctx->cur_ino;
	int ret = 0;

	pm = get_pending_dir_moves(sctx, parent_ino);
	if (!pm)
		return 0;

	tail_append_pending_moves(sctx, pm, &stack);

	while (!list_empty(&stack)) {
		pm = list_first_entry(&stack, struct pending_dir_move, list);
		parent_ino = pm->ino;
		ret = apply_dir_move(sctx, pm);
		free_pending_move(sctx, pm);
		if (ret)
			goto out;
		pm = get_pending_dir_moves(sctx, parent_ino);
		if (pm)
			tail_append_pending_moves(sctx, pm, &stack);
	}
	return 0;

out:
	while (!list_empty(&stack)) {
		pm = list_first_entry(&stack, struct pending_dir_move, list);
		free_pending_move(sctx, pm);
	}
	return ret;
}

/*
 * We might need to delay a directory rename even when no ancestor directory
 * (in the send root) with a higher inode number than ours (sctx->cur_ino) was
 * renamed. This happens when we rename a directory to the old name (the name
 * in the parent root) of some other unrelated directory that got its rename
 * delayed due to some ancestor with higher number that got renamed.
 *
 * Example:
 *
 * Parent snapshot:
 * .                                       (ino 256)
 * |---- a/                                (ino 257)
 * |     |---- file                        (ino 260)
 * |
 * |---- b/                                (ino 258)
 * |---- c/                                (ino 259)
 *
 * Send snapshot:
 * .                                       (ino 256)
 * |---- a/                                (ino 258)
 * |---- x/                                (ino 259)
 *       |---- y/                          (ino 257)
 *             |----- file                 (ino 260)
 *
 * Here we cannot rename 258 from 'b' to 'a' without the rename of inode 257
 * from 'a' to 'x/y' happening first, which in turn depends on the rename of
 * inode 259 from 'c' to 'x'. So the order of rename commands the send stream
 * must issue is:
 *
 * 1 - rename 259 from 'c' to 'x'
 * 2 - rename 257 from 'a' to 'x/y'
 * 3 - rename 258 from 'b' to 'a'
 *
 * Returns 1 if the rename of sctx->cur_ino needs to be delayed, 0 if it can
 * be done right away and < 0 on error.
 */
static int wait_for_dest_dir_move(struct send_ctx *sctx,
				  struct recorded_ref *parent_ref,
				  const bool is_orphan)
{
	struct btrfs_fs_info *fs_info = sctx->parent_root->fs_info;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key di_key;
	struct btrfs_dir_item *di;
	u64 left_gen;
	u64 right_gen;
	int ret = 0;
	struct waiting_dir_move *wdm;

	if (RB_EMPTY_ROOT(&sctx->waiting_dir_moves))
		return 0;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	key.objectid = parent_ref->dir;
	key.type = BTRFS_DIR_ITEM_KEY;
	key.offset = btrfs_name_hash(parent_ref->name, parent_ref->name_len);

	ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
	if (ret < 0) {
		goto out;
	} else if (ret > 0) {
		ret = 0;
		goto out;
	}

	di = btrfs_match_dir_item_name(fs_info, path, parent_ref->name,
				       parent_ref->name_len);
	if (!di) {
		ret = 0;
		goto out;
	}
	/*
	 * di_key.objectid has the number of the inode that has a dentry in the
	 * parent directory with the same name that sctx->cur_ino is being
	 * renamed to. We need to check if that inode is in the send root as
	 * well and if it is currently marked as an inode with a pending rename,
	 * if it is, we need to delay the rename of sctx->cur_ino as well, so
	 * that it happens after that other inode is renamed.
	 */
	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &di_key);
	if (di_key.type != BTRFS_INODE_ITEM_KEY) {
		ret = 0;
		goto out;
	}

	ret = get_inode_gen(sctx->parent_root, di_key.objectid, &left_gen);
	if (ret < 0)
		goto out;
	ret = get_inode_gen(sctx->send_root, di_key.objectid, &right_gen);
	if (ret < 0) {
		if (ret == -ENOENT)
			ret = 0;
		goto out;
	}

	/* Different inode, no need to delay the rename of sctx->cur_ino */
	if (right_gen != left_gen) {
		ret = 0;
		goto out;
	}

	wdm = get_waiting_dir_move(sctx, di_key.objectid);
	if (wdm && !wdm->orphanized) {
		ret = add_pending_dir_move(sctx,
					   sctx->cur_ino,
					   sctx->cur_inode_gen,
					   di_key.objectid,
					   &sctx->new_refs,
					   &sctx->deleted_refs,
					   is_orphan);
		if (!ret)
			ret = 1;
	}
out:
	btrfs_free_path(path);
	return ret;
}

/*
 * Check if inode ino2, or any of its ancestors, is inode ino1.
 * Return 1 if true, 0 if false and < 0 on error.
 */
static int check_ino_in_path(struct btrfs_root *root,
			     const u64 ino1,
			     const u64 ino1_gen,
			     const u64 ino2,
			     const u64 ino2_gen,
			     struct fs_path *fs_path)
{
	u64 ino = ino2;

	if (ino1 == ino2)
		return ino1_gen == ino2_gen;

	while (ino > BTRFS_FIRST_FREE_OBJECTID) {
		u64 parent;
		u64 parent_gen;
		int ret;

		fs_path_reset(fs_path);
		ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path);
		if (ret < 0)
			return ret;
		if (parent == ino1)
			return parent_gen == ino1_gen;
		ino = parent;
	}
	return 0;
}

/*
 * Check if inode ino1 is an ancestor of inode ino2 in the given root for any
 * possible path (in case ino2 is not a directory and has multiple hard links).
 * Return 1 if true, 0 if false and < 0 on error.
 */
static int is_ancestor(struct btrfs_root *root,
		       const u64 ino1,
		       const u64 ino1_gen,
		       const u64 ino2,
		       struct fs_path *fs_path)
{
	bool free_fs_path = false;
	int ret = 0;
	int iter_ret = 0;
	struct btrfs_path *path = NULL;
	struct btrfs_key key;

	if (!fs_path) {
		fs_path = fs_path_alloc();
		if (!fs_path)
			return -ENOMEM;
		free_fs_path = true;
	}

	path = alloc_path_for_send();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	key.objectid = ino2;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = 0;

	btrfs_for_each_slot(root, &key, &key, path, iter_ret) {
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		u32 cur_offset = 0;
		u32 item_size;

		if (key.objectid != ino2)
			break;
		if (key.type != BTRFS_INODE_REF_KEY &&
		    key.type != BTRFS_INODE_EXTREF_KEY)
			break;

		item_size = btrfs_item_size(leaf, slot);
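		/*
		 * Walk all references packed in this item. An extref item
		 * stores the parent directory of each reference, while for a
		 * regular ref item all references share the parent held in the
		 * key offset, so a single pass covers the whole item.
		 */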
		while (cur_offset < item_size) {
			u64 parent;
			u64 parent_gen;

			if (key.type == BTRFS_INODE_EXTREF_KEY) {
				unsigned long ptr;
				struct btrfs_inode_extref *extref;

				ptr = btrfs_item_ptr_offset(leaf, slot);
				extref = (struct btrfs_inode_extref *)
					(ptr + cur_offset);
				parent = btrfs_inode_extref_parent(leaf,
								   extref);
				cur_offset += sizeof(*extref);
				cur_offset += btrfs_inode_extref_name_len(leaf,
								  extref);
			} else {
				parent = key.offset;
				cur_offset = item_size;
			}

			ret = get_inode_gen(root, parent, &parent_gen);
			if (ret < 0)
				goto out;
			ret = check_ino_in_path(root, ino1, ino1_gen,
						parent, parent_gen, fs_path);
			if (ret)
				goto out;
		}
	}
	ret = 0;
	if (iter_ret < 0)
		ret = iter_ret;

out:
	btrfs_free_path(path);
	if (free_fs_path)
		fs_path_free(fs_path);
	return ret;
}

static int wait_for_parent_move(struct send_ctx *sctx,
				struct recorded_ref *parent_ref,
				const bool is_orphan)
{
	int ret = 0;
	u64 ino = parent_ref->dir;
	u64 ino_gen = parent_ref->dir_gen;
	u64 parent_ino_before, parent_ino_after;
	struct fs_path *path_before = NULL;
	struct fs_path *path_after = NULL;
	int len1, len2;

	path_after = fs_path_alloc();
	path_before = fs_path_alloc();
	if (!path_after || !path_before) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * Our current directory inode may not yet be renamed/moved because some
	 * ancestor (immediate or not) has to be renamed/moved first. So find out
	 * if such an ancestor exists and make sure our own rename/move happens after
	 * that ancestor is processed to avoid path build infinite loops (done
	 * at get_cur_path()).
	 */
	while (ino > BTRFS_FIRST_FREE_OBJECTID) {
		u64 parent_ino_after_gen;

		if (is_waiting_for_move(sctx, ino)) {
			/*
			 * If the current inode is an ancestor of ino in the
			 * parent root, we need to delay the rename of the
			 * current inode, otherwise don't delay the rename
			 * because we can end up with a circular dependency
			 * of renames, resulting in some directories never
			 * getting the respective rename operations issued in
			 * the send stream or getting into infinite path build
			 * loops.
			 */
			ret = is_ancestor(sctx->parent_root,
					  sctx->cur_ino, sctx->cur_inode_gen,
					  ino, path_before);
			if (ret)
				break;
		}

		fs_path_reset(path_before);
		fs_path_reset(path_after);

		ret = get_first_ref(sctx->send_root, ino, &parent_ino_after,
				    &parent_ino_after_gen, path_after);
		if (ret < 0)
			goto out;
		ret = get_first_ref(sctx->parent_root, ino, &parent_ino_before,
				    NULL, path_before);
		if (ret < 0 && ret != -ENOENT) {
			goto out;
		} else if (ret == -ENOENT) {
			ret = 0;
			break;
		}

		len1 = fs_path_len(path_before);
		len2 = fs_path_len(path_after);
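		/*
		 * If this ancestor was not processed yet (its inode number is
		 * higher than the current inode's) and its first reference
		 * changed name or location between the two snapshots, then we
		 * must delay our rename/move until that ancestor is processed,
		 * as long as it is the same inode in both snapshots (the
		 * generations match).
		 */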
		if (ino > sctx->cur_ino &&
		    (parent_ino_before != parent_ino_after || len1 != len2 ||
		     memcmp(path_before->start, path_after->start, len1))) {
			u64 parent_ino_gen;

			ret = get_inode_gen(sctx->parent_root, ino, &parent_ino_gen);
			if (ret < 0)
				goto out;
			if (ino_gen == parent_ino_gen) {
				ret = 1;
				break;
			}
		}
		ino = parent_ino_after;
		ino_gen = parent_ino_after_gen;
	}

out:
	fs_path_free(path_before);
	fs_path_free(path_after);

	if (ret == 1) {
		ret = add_pending_dir_move(sctx,
					   sctx->cur_ino,
					   sctx->cur_inode_gen,
					   ino,
					   &sctx->new_refs,
					   &sctx->deleted_refs,
					   is_orphan);
		if (!ret)
			ret = 1;
	}

	return ret;
}

static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
{
	int ret;
	struct fs_path *new_path;

	/*
	 * Our reference's name member points to its full_path member string, so
	 * we use a new path here.
	 */
	new_path = fs_path_alloc();
	if (!new_path)
		return -ENOMEM;

	ret = get_cur_path(sctx, ref->dir, ref->dir_gen, new_path);
	if (ret < 0) {
		fs_path_free(new_path);
		return ret;
	}
	ret = fs_path_add(new_path, ref->name, ref->name_len);
	if (ret < 0) {
		fs_path_free(new_path);
		return ret;
	}

	fs_path_free(ref->full_path);
	set_ref_path(ref, new_path);

	return 0;
}

/*
 * When processing the new references for an inode we may orphanize an existing
 * directory inode because its old name conflicts with one of the new references
 * of the current inode. Later, when processing another new reference of our
 * inode, we might need to orphanize another inode, but the path we have in the
 * reference reflects the pre-orphanization name of the directory we previously
 * orphanized. For example:
 *
 * parent snapshot looks like:
 *
 * .                                     (ino 256)
 * |----- f1                             (ino 257)
 * |----- f2                             (ino 258)
 * |----- d1/                            (ino 259)
 *        |----- d2/                     (ino 260)
 *
 * send snapshot looks like:
 *
 * .                                     (ino 256)
 * |----- d1                             (ino 258)
 * |----- f2/                            (ino 259)
 *        |----- f2_link/                (ino 260)
 *        |       |----- f1              (ino 257)
 *        |
 *        |----- d2                      (ino 258)
 *
 * When processing inode 257 we compute the name for inode 259 as "d1", and we
 * cache it in the name cache. Later when we start processing inode 258, when
 * collecting all its new references we set a full path of "d1/d2" for its new
 * reference with name "d2". When we start processing the new references we
 * start by processing the new reference with name "d1", and this results in
 * orphanizing inode 259, since its old reference causes a conflict. Then we
 * move on to the next new reference, with name "d2", and we find out we must
 * orphanize inode 260, as its old reference conflicts with ours - but for the
 * orphanization we use a source path corresponding to the path we stored in the
 * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the
 * receiver fail since the path component "d1/" no longer exists, it was renamed
 * to "o259-6-0/" when processing the previous new reference. So in this case we
 * must recompute the path in the new reference and use it for the new
 * orphanization operation.
 */
static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref)
{
	char *name;
	int ret;

	name = kmemdup(ref->name, ref->name_len, GFP_KERNEL);
	if (!name)
		return -ENOMEM;

	fs_path_reset(ref->full_path);
	ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path);
	if (ret < 0)
		goto out;

	ret = fs_path_add(ref->full_path, name, ref->name_len);
	if (ret < 0)
		goto out;

	/* Update the reference's base name pointer. */
	set_ref_path(ref, ref->full_path);
out:
	kfree(name);
	return ret;
}

/*
 * This does all the move/link/unlink/rmdir magic.
 */
static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	int ret = 0;
	struct recorded_ref *cur;
	struct recorded_ref *cur2;
	LIST_HEAD(check_dirs);
	struct fs_path *valid_path = NULL;
	u64 ow_inode = 0;
	u64 ow_gen;
	u64 ow_mode;
	int did_overwrite = 0;
	int is_orphan = 0;
	u64 last_dir_ino_rm = 0;
	bool can_rename = true;
	bool orphanized_dir = false;
	bool orphanized_ancestor = false;

	btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino);

	/*
	 * This should never happen as the root dir always has the same ref
	 * which is always '..'
	 */
	if (unlikely(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID)) {
		btrfs_err(fs_info,
			  "send: unexpected inode %llu in process_recorded_refs()",
			  sctx->cur_ino);
		ret = -EINVAL;
		goto out;
	}

	valid_path = fs_path_alloc();
	if (!valid_path) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * First, check if the first ref of the current inode was overwritten
	 * before. If yes, we know that the current inode was already orphanized
	 * and thus use the orphan name. If not, we can use get_cur_path to
	 * get the path of the first ref as it would look like while receiving at
	 * this point in time.
	 * New inodes are always orphans at the beginning, so force to use the
	 * orphan name in this case.
	 * The first ref is stored in valid_path and will be updated if it
	 * gets moved around.
	 */
	if (!sctx->cur_inode_new) {
		ret = did_overwrite_first_ref(sctx, sctx->cur_ino,
				sctx->cur_inode_gen);
		if (ret < 0)
			goto out;
		if (ret)
			did_overwrite = 1;
	}
	if (sctx->cur_inode_new || did_overwrite) {
		ret = gen_unique_name(sctx, sctx->cur_ino,
				sctx->cur_inode_gen, valid_path);
		if (ret < 0)
			goto out;
		is_orphan = 1;
	} else {
		ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen,
				valid_path);
		if (ret < 0)
			goto out;
	}

	/*
	 * Before doing any rename and link operations, do a first pass on the
	 * new references to orphanize any unprocessed inodes that may have a
	 * reference that conflicts with one of the new references of the current
	 * inode. This needs to happen first because a new reference may conflict
	 * with the old reference of a parent directory, so we must make sure
	 * that the paths used for link and rename commands don't use an
	 * orphanized name when an ancestor was not yet orphanized.
	 *
	 * Example:
	 *
	 * Parent snapshot:
	 *
	 * .                                                      (ino 256)
	 * |----- testdir/                                        (ino 259)
	 * |          |----- a                                    (ino 257)
	 * |
	 * |----- b                                               (ino 258)
	 *
	 * Send snapshot:
	 *
	 * .                                                      (ino 256)
	 * |----- testdir_2/                                      (ino 259)
	 * |          |----- a                                    (ino 260)
	 * |
	 * |----- testdir                                         (ino 257)
	 * |----- b                                               (ino 257)
	 * |----- b2                                              (ino 258)
	 *
	 * Processing the new reference for inode 257 with name "b" may happen
	 * before processing the new reference with name "testdir". If so, we
	 * must make sure that by the time we send a link command to create the
	 * hard link "b", inode 259 was already orphanized, since the generated
	 * path in "valid_path" already contains the orphanized name for 259.
	 * We are processing inode 257, so only later when processing 259 we do
	 * the rename operation to change its temporary (orphanized) name to
	 * "testdir_2".
	 */
	list_for_each_entry(cur, &sctx->new_refs, list) {
		ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL);
		if (ret < 0)
			goto out;
		if (ret == inode_state_will_create)
			continue;

		/*
		 * Check if this new ref would overwrite the first ref of another
		 * unprocessed inode. If yes, orphanize the overwritten inode.
		 * If we find an overwritten ref that is not the first ref,
		 * simply unlink it.
		 */
		ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen,
				cur->name, cur->name_len,
				&ow_inode, &ow_gen, &ow_mode);
		if (ret < 0)
			goto out;
		if (ret) {
			ret = is_first_ref(sctx->parent_root,
					   ow_inode, cur->dir, cur->name,
					   cur->name_len);
			if (ret < 0)
				goto out;
			if (ret) {
				struct name_cache_entry *nce;
				struct waiting_dir_move *wdm;

				if (orphanized_dir) {
					ret = refresh_ref_path(sctx, cur);
					if (ret < 0)
						goto out;
				}

				ret = orphanize_inode(sctx, ow_inode, ow_gen,
						cur->full_path);
				if (ret < 0)
					goto out;
				if (S_ISDIR(ow_mode))
					orphanized_dir = true;

				/*
				 * If ow_inode has its rename operation delayed
				 * make sure that its orphanized name is used in
				 * the source path when performing its rename
				 * operation.
				 */
				wdm = get_waiting_dir_move(sctx, ow_inode);
				if (wdm)
					wdm->orphanized = true;

				/*
				 * Make sure we clear our orphanized inode's
				 * name from the name cache. This is because the
				 * inode ow_inode might be an ancestor of some
				 * other inode that will be orphanized as well
				 * later and has an inode number greater than
				 * sctx->send_progress. We need to prevent
				 * future name lookups from using the old name
				 * and get instead the orphan name.
				 */
				nce = name_cache_search(sctx, ow_inode, ow_gen);
				if (nce)
					btrfs_lru_cache_remove(&sctx->name_cache,
							       &nce->entry);

				/*
				 * ow_inode might currently be an ancestor of
				 * cur_ino, therefore compute valid_path (the
				 * current path of cur_ino) again because it
				 * might contain the pre-orphanization name of
				 * ow_inode, which is no longer valid.
				 */
				ret = is_ancestor(sctx->parent_root,
						  ow_inode, ow_gen,
						  sctx->cur_ino, NULL);
				if (ret > 0) {
					orphanized_ancestor = true;
					fs_path_reset(valid_path);
					ret = get_cur_path(sctx, sctx->cur_ino,
							   sctx->cur_inode_gen,
							   valid_path);
				}
				if (ret < 0)
					goto out;
			} else {
				/*
				 * If we previously orphanized a directory that
				 * collided with a new reference that we already
				 * processed, recompute the current path because
				 * that directory may be part of the path.
				 */
				if (orphanized_dir) {
					ret = refresh_ref_path(sctx, cur);
					if (ret < 0)
						goto out;
				}
				ret = send_unlink(sctx, cur->full_path);
				if (ret < 0)
					goto out;
			}
		}

	}

	list_for_each_entry(cur, &sctx->new_refs, list) {
		/*
		 * We may have refs where the parent directory does not exist
		 * yet. This happens if the parent directory's inum is higher
		 * than the current inum. To handle this case, we create the
		 * parent directory out of order. But we need to check if this
		 * did already happen before due to other refs in the same dir.
		 */
		ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL);
		if (ret < 0)
			goto out;
		if (ret == inode_state_will_create) {
			ret = 0;
			/*
			 * First check if any of the current inode's refs did
			 * already create the dir.
			 */
			list_for_each_entry(cur2, &sctx->new_refs, list) {
				if (cur == cur2)
					break;
				if (cur2->dir == cur->dir) {
					ret = 1;
					break;
				}
			}

			/*
			 * If that did not happen, check if a previous inode
			 * did already create the dir.
			 */
			if (!ret)
				ret = did_create_dir(sctx, cur->dir);
			if (ret < 0)
				goto out;
			if (!ret) {
				ret = send_create_inode(sctx, cur->dir);
				if (ret < 0)
					goto out;
				cache_dir_created(sctx, cur->dir);
			}
		}

		if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) {
			ret = wait_for_dest_dir_move(sctx, cur, is_orphan);
			if (ret < 0)
				goto out;
			if (ret == 1) {
				can_rename = false;
				*pending_move = 1;
			}
		}

		if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root &&
		    can_rename) {
			ret = wait_for_parent_move(sctx, cur, is_orphan);
			if (ret < 0)
				goto out;
			if (ret == 1) {
				can_rename = false;
				*pending_move = 1;
			}
		}

		/*
		 * link/move the ref to the new place. If we have an orphan
		 * inode, move it and update valid_path. If not, link or move
		 * it depending on the inode mode.
		 */
		if (is_orphan && can_rename) {
			ret = send_rename(sctx, valid_path, cur->full_path);
			if (ret < 0)
				goto out;
			is_orphan = 0;
			ret = fs_path_copy(valid_path, cur->full_path);
			if (ret < 0)
				goto out;
		} else if (can_rename) {
			if (S_ISDIR(sctx->cur_inode_mode)) {
				/*
				 * Dirs can't be linked, so move it. For moved
				 * dirs, we always have one new and one deleted
				 * ref. The deleted ref is ignored later.
				 */
				ret = send_rename(sctx, valid_path,
						  cur->full_path);
				if (!ret)
					ret = fs_path_copy(valid_path,
							   cur->full_path);
				if (ret < 0)
					goto out;
			} else {
				/*
				 * We might have previously orphanized an inode
				 * which is an ancestor of our current inode,
				 * so our reference's full path, which was
				 * computed before any such orphanizations, must
				 * be updated.
				 */
				if (orphanized_dir) {
					ret = update_ref_path(sctx, cur);
					if (ret < 0)
						goto out;
				}
				ret = send_link(sctx, cur->full_path,
						valid_path);
				if (ret < 0)
					goto out;
			}
		}
		ret = dup_ref(cur, &check_dirs);
		if (ret < 0)
			goto out;
	}

	if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_deleted) {
		/*
		 * Check if we can already rmdir the directory. If not,
		 * orphanize it. For every dir item inside that gets deleted
		 * later, we do this check again and rmdir it then if possible.
		 * See the use of check_dirs for more details.
		 */
		ret = can_rmdir(sctx, sctx->cur_ino, sctx->cur_inode_gen);
		if (ret < 0)
			goto out;
		if (ret) {
			ret = send_rmdir(sctx, valid_path);
			if (ret < 0)
				goto out;
		} else if (!is_orphan) {
			ret = orphanize_inode(sctx, sctx->cur_ino,
					sctx->cur_inode_gen, valid_path);
			if (ret < 0)
				goto out;
			is_orphan = 1;
		}

		list_for_each_entry(cur, &sctx->deleted_refs, list) {
			ret = dup_ref(cur, &check_dirs);
			if (ret < 0)
				goto out;
		}
	} else if (S_ISDIR(sctx->cur_inode_mode) &&
		   !list_empty(&sctx->deleted_refs)) {
		/*
		 * We have a moved dir. Add the old parent to check_dirs
		 */
		cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
				list);
		ret = dup_ref(cur, &check_dirs);
		if (ret < 0)
			goto out;
	} else if (!S_ISDIR(sctx->cur_inode_mode)) {
		/*
		 * We have a non dir inode. Go through all deleted refs and
		 * unlink them if they were not already overwritten by other
		 * inodes.
		 */
		list_for_each_entry(cur, &sctx->deleted_refs, list) {
			ret = did_overwrite_ref(sctx, cur->dir, cur->dir_gen,
					sctx->cur_ino, sctx->cur_inode_gen,
					cur->name, cur->name_len);
			if (ret < 0)
				goto out;
			if (!ret) {
				/*
				 * If we orphanized any ancestor before, we need
				 * to recompute the full path for deleted names,
				 * since any such path was computed before we
				 * processed any references and orphanized any
				 * ancestor inode.
				 */
				if (orphanized_ancestor) {
					ret = update_ref_path(sctx, cur);
					if (ret < 0)
						goto out;
				}
				ret = send_unlink(sctx, cur->full_path);
				if (ret < 0)
					goto out;
			}
			ret = dup_ref(cur, &check_dirs);
			if (ret < 0)
				goto out;
		}
		/*
		 * If the inode is still an orphan, unlink the orphan. This may
		 * happen when a previous inode did overwrite the first ref
		 * of this inode and no new refs were added for the current
		 * inode. Unlinking does not mean that the inode is deleted in
		 * all cases. There may still be links to this inode in other
		 * places.
		 */
		if (is_orphan) {
			ret = send_unlink(sctx, valid_path);
			if (ret < 0)
				goto out;
		}
	}

	/*
	 * We did collect all parent dirs where cur_inode was once located. We
	 * now go through all these dirs and check if they are pending for
	 * deletion and if it's finally possible to perform the rmdir now.
	 * We also update the inode stats of the parent dirs here.
	 */
	list_for_each_entry(cur, &check_dirs, list) {
		/*
		 * In case we had refs into dirs that were not processed yet,
		 * we don't need to do the utime and rmdir logic for these dirs.
		 * The dir will be processed later.
		 */
		if (cur->dir > sctx->cur_ino)
			continue;

		ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen, NULL, NULL);
		if (ret < 0)
			goto out;

		if (ret == inode_state_did_create ||
		    ret == inode_state_no_change) {
			ret = cache_dir_utimes(sctx, cur->dir, cur->dir_gen);
			if (ret < 0)
				goto out;
		} else if (ret == inode_state_did_delete &&
			   cur->dir != last_dir_ino_rm) {
			ret = can_rmdir(sctx, cur->dir, cur->dir_gen);
			if (ret < 0)
				goto out;
			if (ret) {
				ret = get_cur_path(sctx, cur->dir,
						   cur->dir_gen, valid_path);
				if (ret < 0)
					goto out;
				ret = send_rmdir(sctx, valid_path);
				if (ret < 0)
					goto out;
				last_dir_ino_rm = cur->dir;
			}
		}
	}

	ret = 0;

out:
	__free_recorded_refs(&check_dirs);
	free_recorded_refs(sctx);
	fs_path_free(valid_path);
	return ret;
}

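/*
 * Compare recorded references first by parent directory inode and generation,
 * then by name length and finally by the name itself, so the rbtrees of refs
 * can be searched with rb_find() using these same keys.
 */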
static int rbtree_ref_comp(const void *k, const struct rb_node *node)
{
	const struct recorded_ref *data = k;
	const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node);
	int result;

	if (data->dir > ref->dir)
		return 1;
	if (data->dir < ref->dir)
		return -1;
	if (data->dir_gen > ref->dir_gen)
		return 1;
	if (data->dir_gen < ref->dir_gen)
		return -1;
	if (data->name_len > ref->name_len)
		return 1;
	if (data->name_len < ref->name_len)
		return -1;
	result = strcmp(data->name, ref->name);
	if (result > 0)
		return 1;
	if (result < 0)
		return -1;
	return 0;
}

static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent)
{
	const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node);

	return rbtree_ref_comp(entry, parent) < 0;
}

static int record_ref_in_tree(struct rb_root *root, struct list_head *refs,
			      struct fs_path *name, u64 dir, u64 dir_gen,
			      struct send_ctx *sctx)
{
	int ret = 0;
	struct fs_path *path = NULL;
	struct recorded_ref *ref = NULL;

	path = fs_path_alloc();
	if (!path) {
		ret = -ENOMEM;
		goto out;
	}

	ref = recorded_ref_alloc();
	if (!ref) {
		ret = -ENOMEM;
		goto out;
	}

	ret = get_cur_path(sctx, dir, dir_gen, path);
	if (ret < 0)
		goto out;
	ret = fs_path_add_path(path, name);
	if (ret < 0)
		goto out;

	ref->dir = dir;
	ref->dir_gen = dir_gen;
	set_ref_path(ref, path);
	list_add_tail(&ref->list, refs);
	rb_add(&ref->node, root, rbtree_ref_less);
	ref->root = root;
out:
	if (ret) {
		if (path && (!ref || !ref->full_path))
			fs_path_free(path);
		recorded_ref_free(ref);
	}
	return ret;
}

static int record_new_ref_if_needed(int num, u64 dir, int index,
				    struct fs_path *name, void *ctx)
{
	int ret = 0;
	struct send_ctx *sctx = ctx;
	struct rb_node *node = NULL;
	struct recorded_ref data;
	struct recorded_ref *ref;
	u64 dir_gen;

	ret = get_inode_gen(sctx->send_root, dir, &dir_gen);
	if (ret < 0)
		goto out;

	data.dir = dir;
	data.dir_gen = dir_gen;
	set_ref_path(&data, name);
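	/*
	 * If the same name was also recorded as a deleted reference, the
	 * reference did not really change, so drop the deleted entry instead
	 * of recording a new one.
	 */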
	node = rb_find(&data, &sctx->rbtree_deleted_refs, rbtree_ref_comp);
	if (node) {
		ref = rb_entry(node, struct recorded_ref, node);
		recorded_ref_free(ref);
	} else {
		ret = record_ref_in_tree(&sctx->rbtree_new_refs,
					 &sctx->new_refs, name, dir, dir_gen,
					 sctx);
	}
out:
	return ret;
}

static int record_deleted_ref_if_needed(int num, u64 dir, int index,
					struct fs_path *name, void *ctx)
{
	int ret = 0;
	struct send_ctx *sctx = ctx;
	struct rb_node *node = NULL;
	struct recorded_ref data;
	struct recorded_ref *ref;
	u64 dir_gen;

	ret = get_inode_gen(sctx->parent_root, dir, &dir_gen);
	if (ret < 0)
		goto out;

	data.dir = dir;
	data.dir_gen = dir_gen;
	set_ref_path(&data, name);
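	/*
	 * A matching new reference means the name did not really change, so
	 * drop the new entry instead of recording a deletion.
	 */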
	node = rb_find(&data, &sctx->rbtree_new_refs, rbtree_ref_comp);
	if (node) {
		ref = rb_entry(node, struct recorded_ref, node);
		recorded_ref_free(ref);
	} else {
		ret = record_ref_in_tree(&sctx->rbtree_deleted_refs,
					 &sctx->deleted_refs, name, dir,
					 dir_gen, sctx);
	}
out:
	return ret;
}

static int record_new_ref(struct send_ctx *sctx)
{
	int ret;

	ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
				sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
	if (ret < 0)
		goto out;
	ret = 0;

out:
	return ret;
}

static int record_deleted_ref(struct send_ctx *sctx)
{
	int ret;

	ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
				sctx->cmp_key, 0, record_deleted_ref_if_needed,
				sctx);
	if (ret < 0)
		goto out;
	ret = 0;

out:
	return ret;
}

static int record_changed_ref(struct send_ctx *sctx)
{
	int ret = 0;

	ret = iterate_inode_ref(sctx->send_root, sctx->left_path,
			sctx->cmp_key, 0, record_new_ref_if_needed, sctx);
	if (ret < 0)
		goto out;
	ret = iterate_inode_ref(sctx->parent_root, sctx->right_path,
			sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx);
	if (ret < 0)
		goto out;
	ret = 0;

out:
	return ret;
}

/*
 * Record and process all refs at once. Needed when an inode changes the
 * generation number, which means that it was deleted and recreated.
 */
static int process_all_refs(struct send_ctx *sctx,
			    enum btrfs_compare_tree_result cmd)
{
	int ret = 0;
	int iter_ret = 0;
	struct btrfs_root *root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;
	iterate_inode_ref_t cb;
	int pending_move = 0;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	if (cmd == BTRFS_COMPARE_TREE_NEW) {
		root = sctx->send_root;
		cb = record_new_ref_if_needed;
	} else if (cmd == BTRFS_COMPARE_TREE_DELETED) {
		root = sctx->parent_root;
		cb = record_deleted_ref_if_needed;
	} else {
		btrfs_err(sctx->send_root->fs_info,
				"Wrong command %d in process_all_refs", cmd);
		ret = -EINVAL;
		goto out;
	}

	key.objectid = sctx->cmp_key->objectid;
	key.type = BTRFS_INODE_REF_KEY;
	key.offset = 0;
	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
		if (found_key.objectid != key.objectid ||
		    (found_key.type != BTRFS_INODE_REF_KEY &&
		     found_key.type != BTRFS_INODE_EXTREF_KEY))
			break;

		ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
		if (ret < 0)
			goto out;
	}
	/* Catch error found during iteration */
	if (iter_ret < 0) {
		ret = iter_ret;
		goto out;
	}
	btrfs_release_path(path);

	/*
	 * We don't actually care about pending_move as we are simply
	 * re-creating this inode and will be renaming it into place once we
	 * rename the parent directory.
	 */
	ret = process_recorded_refs(sctx, &pending_move);
out:
	btrfs_free_path(path);
	return ret;
}

static int send_set_xattr(struct send_ctx *sctx,
			  struct fs_path *path,
			  const char *name, int name_len,
			  const char *data, int data_len)
{
	int ret = 0;

	ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);
	TLV_PUT(sctx, BTRFS_SEND_A_XATTR_DATA, data, data_len);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	return ret;
}

static int send_remove_xattr(struct send_ctx *sctx,
			  struct fs_path *path,
			  const char *name, int name_len)
{
	int ret = 0;

	ret = begin_cmd(sctx, BTRFS_SEND_C_REMOVE_XATTR);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
	TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	return ret;
}

static int __process_new_xattr(int num, struct btrfs_key *di_key,
			       const char *name, int name_len, const char *data,
			       int data_len, void *ctx)
{
	int ret;
	struct send_ctx *sctx = ctx;
	struct fs_path *p;
	struct posix_acl_xattr_header dummy_acl;

	/* Capabilities are emitted by finish_inode_if_needed */
	if (!strncmp(name, XATTR_NAME_CAPS, name_len))
		return 0;

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;

	/*
	 * This hack is needed because empty acls are stored as zero byte
	 * data in xattrs. The problem with that is that receiving these zero byte
	 * acls will fail later. To fix this, we send a dummy acl list that
	 * only contains the version number and no entries.
	 */
	if (!strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, name_len) ||
	    !strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, name_len)) {
		if (data_len == 0) {
			dummy_acl.a_version =
					cpu_to_le32(POSIX_ACL_XATTR_VERSION);
			data = (char *)&dummy_acl;
			data_len = sizeof(dummy_acl);
		}
	}

	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
	if (ret < 0)
		goto out;

	ret = send_set_xattr(sctx, p, name, name_len, data, data_len);

out:
	fs_path_free(p);
	return ret;
}

static int __process_deleted_xattr(int num, struct btrfs_key *di_key,
				   const char *name, int name_len,
				   const char *data, int data_len, void *ctx)
{
	int ret;
	struct send_ctx *sctx = ctx;
	struct fs_path *p;

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;

	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
	if (ret < 0)
		goto out;

	ret = send_remove_xattr(sctx, p, name, name_len);

out:
	fs_path_free(p);
	return ret;
}

static int process_new_xattr(struct send_ctx *sctx)
{
	int ret = 0;

	ret = iterate_dir_item(sctx->send_root, sctx->left_path,
			       __process_new_xattr, sctx);

	return ret;
}

static int process_deleted_xattr(struct send_ctx *sctx)
{
	return iterate_dir_item(sctx->parent_root, sctx->right_path,
				__process_deleted_xattr, sctx);
}

struct find_xattr_ctx {
	const char *name;
	int name_len;
	int found_idx;
	char *found_data;
	int found_data_len;
};

static int __find_xattr(int num, struct btrfs_key *di_key, const char *name,
			int name_len, const char *data, int data_len, void *vctx)
{
	struct find_xattr_ctx *ctx = vctx;

	if (name_len == ctx->name_len &&
	    strncmp(name, ctx->name, name_len) == 0) {
		ctx->found_idx = num;
		ctx->found_data_len = data_len;
		ctx->found_data = kmemdup(data, data_len, GFP_KERNEL);
		if (!ctx->found_data)
			return -ENOMEM;
		return 1;
	}
	return 0;
}

static int find_xattr(struct btrfs_root *root,
		      struct btrfs_path *path,
		      struct btrfs_key *key,
		      const char *name, int name_len,
		      char **data, int *data_len)
{
	int ret;
	struct find_xattr_ctx ctx;

	ctx.name = name;
	ctx.name_len = name_len;
	ctx.found_idx = -1;
	ctx.found_data = NULL;
	ctx.found_data_len = 0;

	ret = iterate_dir_item(root, path, __find_xattr, &ctx);
	if (ret < 0)
		return ret;

	if (ctx.found_idx == -1)
		return -ENOENT;
	if (data) {
		*data = ctx.found_data;
		*data_len = ctx.found_data_len;
	} else {
		kfree(ctx.found_data);
	}
	return ctx.found_idx;
}


static int __process_changed_new_xattr(int num, struct btrfs_key *di_key,
				       const char *name, int name_len,
				       const char *data, int data_len,
				       void *ctx)
{
	int ret;
	struct send_ctx *sctx = ctx;
	char *found_data = NULL;
	int found_data_len  = 0;

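	/*
	 * Only send the xattr if it does not exist in the parent snapshot or
	 * if its data has changed.
	 */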
	ret = find_xattr(sctx->parent_root, sctx->right_path,
			 sctx->cmp_key, name, name_len, &found_data,
			 &found_data_len);
	if (ret == -ENOENT) {
		ret = __process_new_xattr(num, di_key, name, name_len, data,
					  data_len, ctx);
	} else if (ret >= 0) {
		if (data_len != found_data_len ||
		    memcmp(data, found_data, data_len)) {
			ret = __process_new_xattr(num, di_key, name, name_len,
						  data, data_len, ctx);
		} else {
			ret = 0;
		}
	}

	kfree(found_data);
	return ret;
}

static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key,
					   const char *name, int name_len,
					   const char *data, int data_len,
					   void *ctx)
{
	int ret;
	struct send_ctx *sctx = ctx;

	ret = find_xattr(sctx->send_root, sctx->left_path, sctx->cmp_key,
			 name, name_len, NULL, NULL);
	if (ret == -ENOENT)
		ret = __process_deleted_xattr(num, di_key, name, name_len, data,
					      data_len, ctx);
	else if (ret >= 0)
		ret = 0;

	return ret;
}

static int process_changed_xattr(struct send_ctx *sctx)
{
	int ret = 0;

	ret = iterate_dir_item(sctx->send_root, sctx->left_path,
			__process_changed_new_xattr, sctx);
	if (ret < 0)
		goto out;
	ret = iterate_dir_item(sctx->parent_root, sctx->right_path,
			__process_changed_deleted_xattr, sctx);

out:
	return ret;
}

static int process_all_new_xattrs(struct send_ctx *sctx)
{
	int ret = 0;
	int iter_ret = 0;
	struct btrfs_root *root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	root = sctx->send_root;

	key.objectid = sctx->cmp_key->objectid;
	key.type = BTRFS_XATTR_ITEM_KEY;
	key.offset = 0;
	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
		if (found_key.objectid != key.objectid ||
		    found_key.type != key.type) {
			ret = 0;
			break;
		}

		ret = iterate_dir_item(root, path, __process_new_xattr, sctx);
		if (ret < 0)
			break;
	}
	/* Catch error found during iteration */
	if (iter_ret < 0)
		ret = iter_ret;

	btrfs_free_path(path);
	return ret;
}

static int send_verity(struct send_ctx *sctx, struct fs_path *path,
		       struct fsverity_descriptor *desc)
{
	int ret;

	ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path);
	TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM,
			le8_to_cpu(desc->hash_algorithm));
	TLV_PUT_U32(sctx, BTRFS_SEND_A_VERITY_BLOCK_SIZE,
			1U << le8_to_cpu(desc->log_blocksize));
	TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SALT_DATA, desc->salt,
			le8_to_cpu(desc->salt_size));
	TLV_PUT(sctx, BTRFS_SEND_A_VERITY_SIG_DATA, desc->signature,
			le32_to_cpu(desc->sig_size));

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	return ret;
}

static int process_verity(struct send_ctx *sctx)
{
	int ret = 0;
	struct inode *inode;
	struct fs_path *p;

	inode = btrfs_iget(sctx->cur_ino, sctx->send_root);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

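	/*
	 * With a NULL buffer this returns the size of the verity descriptor,
	 * which is then used to size the allocation below.
	 */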
	ret = btrfs_get_verity_descriptor(inode, NULL, 0);
	if (ret < 0)
		goto iput;

	if (ret > FS_VERITY_MAX_DESCRIPTOR_SIZE) {
		ret = -EMSGSIZE;
		goto iput;
	}
	if (!sctx->verity_descriptor) {
		sctx->verity_descriptor = kvmalloc(FS_VERITY_MAX_DESCRIPTOR_SIZE,
						   GFP_KERNEL);
		if (!sctx->verity_descriptor) {
			ret = -ENOMEM;
			goto iput;
		}
	}

	ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret);
	if (ret < 0)
		goto iput;

	p = fs_path_alloc();
	if (!p) {
		ret = -ENOMEM;
		goto iput;
	}
	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
	if (ret < 0)
		goto free_path;

	ret = send_verity(sctx, p, sctx->verity_descriptor);
	if (ret < 0)
		goto free_path;

free_path:
	fs_path_free(p);
iput:
	iput(inode);
	return ret;
}

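/*
 * Maximum amount of file data that can be put into a single write command,
 * leaving headroom in the send buffer for the command header and the other
 * attributes that precede the data.
 */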
static inline u64 max_send_read_size(const struct send_ctx *sctx)
{
	return sctx->send_max_size - SZ_16K;
}

static int put_data_header(struct send_ctx *sctx, u32 len)
{
	if (WARN_ON_ONCE(sctx->put_data))
		return -EINVAL;
	sctx->put_data = true;
	if (sctx->proto >= 2) {
		/*
		 * Since v2, the data attribute header doesn't include a length,
		 * the data implicitly extends to the end of the command.
		 */
		if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len)
			return -EOVERFLOW;
		put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size);
		sctx->send_size += sizeof(__le16);
	} else {
		struct btrfs_tlv_header *hdr;

		if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len)
			return -EOVERFLOW;
		hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size);
		put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type);
		put_unaligned_le16(len, &hdr->tlv_len);
		sctx->send_size += sizeof(*hdr);
	}
	return 0;
}

static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
	struct btrfs_root *root = sctx->send_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct folio *folio;
	pgoff_t index = offset >> PAGE_SHIFT;
	pgoff_t last_index;
	unsigned pg_offset = offset_in_page(offset);
	struct address_space *mapping = sctx->cur_inode->i_mapping;
	int ret;

	ret = put_data_header(sctx, len);
	if (ret)
		return ret;

	last_index = (offset + len - 1) >> PAGE_SHIFT;

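	/*
	 * Copy the requested range into the send buffer one page at a time,
	 * reading pages through the page cache and triggering readahead when
	 * a page is not resident yet.
	 */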
	while (index <= last_index) {
		unsigned cur_len = min_t(unsigned, len,
					 PAGE_SIZE - pg_offset);

		folio = filemap_lock_folio(mapping, index);
		if (IS_ERR(folio)) {
			page_cache_sync_readahead(mapping,
						  &sctx->ra, NULL, index,
						  last_index + 1 - index);

			folio = filemap_grab_folio(mapping, index);
			if (IS_ERR(folio)) {
				ret = PTR_ERR(folio);
				break;
			}
		}

		WARN_ON(folio_order(folio));

		if (folio_test_readahead(folio))
			page_cache_async_readahead(mapping, &sctx->ra, NULL, folio,
						   last_index + 1 - index);

		if (!folio_test_uptodate(folio)) {
			btrfs_read_folio(NULL, folio);
			folio_lock(folio);
			if (!folio_test_uptodate(folio)) {
				folio_unlock(folio);
				btrfs_err(fs_info,
			"send: IO error at offset %llu for inode %llu root %llu",
					folio_pos(folio), sctx->cur_ino,
					btrfs_root_id(sctx->send_root));
				folio_put(folio);
				ret = -EIO;
				break;
			}
		}

		memcpy_from_folio(sctx->send_buf + sctx->send_size, folio,
				  pg_offset, cur_len);
		folio_unlock(folio);
		folio_put(folio);
		index++;
		pg_offset = 0;
		len -= cur_len;
		sctx->send_size += cur_len;
	}

	return ret;
}

/*
 * Read some bytes from the current inode/file and send a write command to
 * user space.
 */
static int send_write(struct send_ctx *sctx, u64 offset, u32 len)
{
	struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
	int ret = 0;
	struct fs_path *p;

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;

	btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len);

	ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
	ret = put_file_data(sctx, offset, len);
	if (ret < 0)
		goto out;

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(p);
	return ret;
}

/*
 * Send a clone command to user space.
 */
static int send_clone(struct send_ctx *sctx,
		      u64 offset, u32 len,
		      struct clone_root *clone_root)
{
	int ret = 0;
	struct fs_path *p;
	u64 gen;

	btrfs_debug(sctx->send_root->fs_info,
		    "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
		    offset, len, btrfs_root_id(clone_root->root),
		    clone_root->ino, clone_root->offset);

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;

	ret = begin_cmd(sctx, BTRFS_SEND_C_CLONE);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
	if (ret < 0)
		goto out;

	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len);
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);

	if (clone_root->root == sctx->send_root) {
		ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen);
		if (ret < 0)
			goto out;
		ret = get_cur_path(sctx, clone_root->ino, gen, p);
	} else {
		ret = get_inode_path(clone_root->root, clone_root->ino, p);
	}
	if (ret < 0)
		goto out;

	/*
	 * If the parent we're using has a received_uuid set then use that as
	 * our clone source as that is what we will look for when doing a
	 * receive.
	 *
	 * This covers the case that we create a snapshot off of a received
	 * subvolume and then use that as the parent and try to receive on a
	 * different host.
	 */
	if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid))
		TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
			     clone_root->root->root_item.received_uuid);
	else
		TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID,
			     clone_root->root->root_item.uuid);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID,
		    btrfs_root_ctransid(&clone_root->root->root_item));
	TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_OFFSET,
			clone_root->offset);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(p);
	return ret;
}

/*
 * Send an update extent command to user space.
 */
static int send_update_extent(struct send_ctx *sctx,
			      u64 offset, u32 len)
{
	int ret = 0;
	struct fs_path *p;

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;

	ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
	if (ret < 0)
		goto out;

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len);

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(p);
	return ret;
}

static int send_hole(struct send_ctx *sctx, u64 end)
{
	struct fs_path *p = NULL;
	u64 read_size = max_send_read_size(sctx);
	u64 offset = sctx->cur_inode_last_extent;
	int ret = 0;

	/*
	 * A hole that starts at EOF or beyond it. Since we do not yet support
	 * fallocate (for extent preallocation and hole punching), sending a
	 * write of zeroes starting at EOF or beyond would later require issuing
	 * a truncate operation which would undo the write and achieve nothing.
	 */
	if (offset >= sctx->cur_inode_size)
		return 0;

	/*
	 * Don't go beyond the inode's i_size due to prealloc extents that start
	 * after the i_size.
	 */
	end = min_t(u64, end, sctx->cur_inode_size);

	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
		return send_update_extent(sctx, offset, end - offset);

	p = fs_path_alloc();
	if (!p)
		return -ENOMEM;
	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p);
	if (ret < 0)
		goto tlv_put_failure;
	while (offset < end) {
		u64 len = min(end - offset, read_size);

		ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE);
		if (ret < 0)
			break;
		TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
		TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
		ret = put_data_header(sctx, len);
		if (ret < 0)
			break;
		memset(sctx->send_buf + sctx->send_size, 0, len);
		sctx->send_size += len;
		ret = send_cmd(sctx);
		if (ret < 0)
			break;
		offset += len;
	}
	sctx->cur_inode_next_write_offset = offset;
tlv_put_failure:
	fs_path_free(p);
	return ret;
}

static int send_encoded_inline_extent(struct send_ctx *sctx,
				      struct btrfs_path *path, u64 offset,
				      u64 len)
{
	struct btrfs_root *root = sctx->send_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode *inode;
	struct fs_path *fspath;
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_key key;
	struct btrfs_file_extent_item *ei;
	u64 ram_bytes;
	size_t inline_size;
	int ret;

	inode = btrfs_iget(sctx->cur_ino, root);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	fspath = fs_path_alloc();
	if (!fspath) {
		ret = -ENOMEM;
		goto out;
	}

	ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
	if (ret < 0)
		goto out;

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
	ram_bytes = btrfs_file_extent_ram_bytes(leaf, ei);
	inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
		    min(key.offset + ram_bytes - offset, len));
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset);
	ret = btrfs_encoded_io_compression_from_extent(fs_info,
				btrfs_file_extent_compression(leaf, ei));
	if (ret < 0)
		goto out;
	TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);

	ret = put_data_header(sctx, inline_size);
	if (ret < 0)
		goto out;
	read_extent_buffer(leaf, sctx->send_buf + sctx->send_size,
			   btrfs_file_extent_inline_start(ei), inline_size);
	sctx->send_size += inline_size;

	ret = send_cmd(sctx);

tlv_put_failure:
out:
	fs_path_free(fspath);
	iput(inode);
	return ret;
}

static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path,
			       u64 offset, u64 len)
{
	struct btrfs_root *root = sctx->send_root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct inode *inode;
	struct fs_path *fspath;
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_key key;
	struct btrfs_file_extent_item *ei;
	u64 disk_bytenr, disk_num_bytes;
	u32 data_offset;
	struct btrfs_cmd_header *hdr;
	u32 crc;
	int ret;

	inode = btrfs_iget(sctx->cur_ino, root);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	fspath = fs_path_alloc();
	if (!fspath) {
		ret = -ENOMEM;
		goto out;
	}

	ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE);
	if (ret < 0)
		goto out;

	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
	if (ret < 0)
		goto out;

	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
	disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, ei);

	TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset);
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN,
		    min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset,
			len));
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN,
		    btrfs_file_extent_ram_bytes(leaf, ei));
	TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET,
		    offset - key.offset + btrfs_file_extent_offset(leaf, ei));
	ret = btrfs_encoded_io_compression_from_extent(fs_info,
				btrfs_file_extent_compression(leaf, ei));
	if (ret < 0)
		goto out;
	TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret);
	TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, 0);

	ret = put_data_header(sctx, disk_num_bytes);
	if (ret < 0)
		goto out;

	/*
	 * We want to do I/O directly into the send buffer, so get the next page
	 * boundary in the send buffer. This means that there may be a gap
	 * between the beginning of the command and the file data.
	 */
	data_offset = PAGE_ALIGN(sctx->send_size);
	if (data_offset > sctx->send_max_size ||
	    sctx->send_max_size - data_offset < disk_num_bytes) {
		ret = -EOVERFLOW;
		goto out;
	}

	/*
	 * Note that send_buf is a mapping of send_buf_pages, so this is really
	 * reading into send_buf.
	 */
	ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset,
						    disk_bytenr, disk_num_bytes,
						    sctx->send_buf_pages +
						    (data_offset >> PAGE_SHIFT));
	if (ret)
		goto out;

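	/*
	 * The encoded data is not contiguous with the rest of the command, so
	 * send_cmd() can't be used here. Fill in the command header, checksum
	 * both parts and write them out separately.
	 */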
	hdr = (struct btrfs_cmd_header *)sctx->send_buf;
	hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr));
	hdr->crc = 0;
	crc = crc32c(0, sctx->send_buf, sctx->send_size);
	crc = crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes);
	hdr->crc = cpu_to_le32(crc);

	ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size,
			&sctx->send_off);
	if (!ret) {
		ret = write_buf(sctx->send_filp, sctx->send_buf + data_offset,
				disk_num_bytes, &sctx->send_off);
	}
	sctx->send_size = 0;
	sctx->put_data = false;

tlv_put_failure:
out:
	fs_path_free(fspath);
	iput(inode);
	return ret;
}

static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path,
			    const u64 offset, const u64 len)
{
	const u64 end = offset + len;
	struct extent_buffer *leaf = path->nodes[0];
	struct btrfs_file_extent_item *ei;
	u64 read_size = max_send_read_size(sctx);
	u64 sent = 0;

	if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
		return send_update_extent(sctx, offset, len);

	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) &&
	    btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
		bool is_inline = (btrfs_file_extent_type(leaf, ei) ==
				  BTRFS_FILE_EXTENT_INLINE);

		/*
		 * Send the compressed extent unless the compressed data is
		 * larger than the decompressed data. This can happen if we're
		 * not sending the entire extent, either because it has been
		 * partially overwritten/truncated or because this is a part of
		 * the extent that we couldn't clone in clone_range().
		 */
		if (is_inline &&
		    btrfs_file_extent_inline_item_len(leaf,
						      path->slots[0]) <= len) {
			return send_encoded_inline_extent(sctx, path, offset,
							  len);
		} else if (!is_inline &&
			   btrfs_file_extent_disk_num_bytes(leaf, ei) <= len) {
			return send_encoded_extent(sctx, path, offset, len);
		}
	}

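	/*
	 * Open the inode the first time we need to send its data, so we can
	 * use readahead while reading the extents and later release its pages
	 * from the page cache.
	 */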
	if (sctx->cur_inode == NULL) {
		struct btrfs_root *root = sctx->send_root;

		sctx->cur_inode = btrfs_iget(sctx->cur_ino, root);
		if (IS_ERR(sctx->cur_inode)) {
			int err = PTR_ERR(sctx->cur_inode);

			sctx->cur_inode = NULL;
			return err;
		}
		memset(&sctx->ra, 0, sizeof(struct file_ra_state));
		file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);

		/*
		 * It's very likely there are no pages from this inode in the page
		 * cache, so after reading extents and sending their data, we clean
		 * the page cache to avoid trashing it (adding pressure to the
		 * page cache and forcing eviction of other data more useful
		 * for applications).
		 *
		 * We decide if we should clean the page cache simply by checking
		 * if the inode's mapping nrpages is 0 when we first open it, and
		 * not by using something like filemap_range_has_page() before
		 * reading an extent because when we ask the readahead code to
		 * read a given file range, it may (and almost always does) read
		 * pages from beyond that range (see the documentation for
		 * page_cache_sync_readahead()), so it would not be reliable,
		 * because after reading the first extent future calls to
		 * filemap_range_has_page() would return true because the readahead
		 * on the previous extent resulted in reading pages of the current
		 * extent as well.
		 */
		sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0);
		sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE);
	}

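	/*
	 * Emit the file data as a sequence of regular write commands, each
	 * covering at most the maximum send read size.
	 */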
	while (sent < len) {
		u64 size = min(len - sent, read_size);
		int ret;

		ret = send_write(sctx, offset + sent, size);
		if (ret < 0)
			return ret;
		sent += size;
	}

	if (sctx->clean_page_cache && PAGE_ALIGNED(end)) {
		/*
		 * Always operate only on ranges that are a multiple of the page
		 * size. This is not only to prevent zeroing parts of a page in
		 * the case of subpage sector size, but also to guarantee we evict
		 * pages, as passing a range that is smaller than page size does
		 * not evict the respective page (only zeroes part of its content).
		 *
		 * Always start from the end offset of the last range cleared.
		 * This is because the readahead code may (and very often does)
		 * read pages beyond the range we request for readahead. So if
		 * we have an extent layout like this:
		 *
		 *            [ extent A ] [ extent B ] [ extent C ]
		 *
		 * When we ask page_cache_sync_readahead() to read extent A, it
		 * may also trigger reads for pages of extent B. If we are doing
		 * an incremental send and extent B has not changed between the
		 * parent and send snapshots, some or all of its pages may end
		 * up being read and placed in the page cache. So when truncating
		 * the page cache we always start from the end offset of the
		 * previously processed extent up to the end of the current
		 * extent.
		 */
		truncate_inode_pages_range(&sctx->cur_inode->i_data,
					   sctx->page_cache_clear_start,
					   end - 1);
		sctx->page_cache_clear_start = end;
	}

	return 0;
}

/*
 * Search for a capability xattr related to sctx->cur_ino. If the capability is
 * found, call send_set_xattr() to emit it.
 *
 * Return 0 if there isn't a capability, or when the capability was emitted
 * successfully, or < 0 if an error occurred.
 */
static int send_capabilities(struct send_ctx *sctx)
{
	struct fs_path *fspath = NULL;
	struct btrfs_path *path;
	struct btrfs_dir_item *di;
	struct extent_buffer *leaf;
	unsigned long data_ptr;
	char *buf = NULL;
	int buf_len;
	int ret = 0;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino,
				XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0);
	if (!di) {
		/* There is no xattr for this inode */
		goto out;
	} else if (IS_ERR(di)) {
		ret = PTR_ERR(di);
		goto out;
	}

	leaf = path->nodes[0];
	buf_len = btrfs_dir_data_len(leaf, di);

	fspath = fs_path_alloc();
	buf = kmalloc(buf_len, GFP_KERNEL);
	if (!fspath || !buf) {
		ret = -ENOMEM;
		goto out;
	}

	ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
	if (ret < 0)
		goto out;

	data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di);
	read_extent_buffer(leaf, buf, data_ptr, buf_len);

	ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS,
			strlen(XATTR_NAME_CAPS), buf, buf_len);
out:
	kfree(buf);
	fs_path_free(fspath);
	btrfs_free_path(path);
	return ret;
}

static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path,
		       struct clone_root *clone_root, const u64 disk_byte,
		       u64 data_offset, u64 offset, u64 len)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;
	struct btrfs_inode_info info;
	u64 clone_src_i_size = 0;

	/*
	 * Prevent cloning from a zero offset with a length matching the sector
	 * size because in some scenarios this will make the receiver fail.
	 *
	 * For example, if in the source filesystem the extent at offset 0
	 * has a length of sectorsize and it was written using direct IO, then
	 * it can never be an inline extent (even if compression is enabled).
	 * Then this extent can be cloned in the original filesystem to a non
	 * zero file offset, but it may not be possible to clone in the
	 * destination filesystem because it can be inlined due to compression
	 * on the destination filesystem (as the receiver's write operations are
	 * always done using buffered IO). The same happens when the original
	 * filesystem does not have compression enabled but the destination
	 * filesystem has.
	 */
	if (clone_root->offset == 0 &&
	    len == sctx->send_root->fs_info->sectorsize)
		return send_extent_data(sctx, dst_path, offset, len);

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	/*
	 * There are inodes that have extents that lie behind their i_size. Don't
	 * accept clones from these extents.
	 */
	ret = get_inode_info(clone_root->root, clone_root->ino, &info);
	btrfs_release_path(path);
	if (ret < 0)
		goto out;
	clone_src_i_size = info.size;

	/*
	 * We can't send a clone operation for the entire range if we find
	 * extent items in the respective range in the source file that
	 * refer to different extents or if we find holes.
	 * So check for that and do a mix of clone and regular write/copy
	 * operations if needed.
	 *
	 * Example:
	 *
	 * mkfs.btrfs -f /dev/sda
	 * mount /dev/sda /mnt
	 * xfs_io -f -c "pwrite -S 0xaa 0K 100K" /mnt/foo
	 * cp --reflink=always /mnt/foo /mnt/bar
	 * xfs_io -c "pwrite -S 0xbb 50K 50K" /mnt/foo
	 * btrfs subvolume snapshot -r /mnt /mnt/snap
	 *
	 * If when we send the snapshot and we are processing file bar (which
	 * has a higher inode number than foo) we blindly send a clone operation
	 * for the [0, 100K[ range from foo to bar, the receiver ends up getting
	 * a file bar that matches the content of file foo - iow, doesn't match
	 * the content from bar in the original filesystem.
	 */
	key.objectid = clone_root->ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = clone_root->offset;
	ret = btrfs_search_slot(NULL, clone_root->root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0 && path->slots[0] > 0) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
		if (key.objectid == clone_root->ino &&
		    key.type == BTRFS_EXTENT_DATA_KEY)
			path->slots[0]--;
	}

	while (true) {
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		struct btrfs_file_extent_item *ei;
		u8 type;
		u64 ext_len;
		u64 clone_len;
		u64 clone_data_offset;
		bool crossed_src_i_size = false;

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(clone_root->root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, slot);

		/*
		 * We might have an implicit trailing hole (NO_HOLES feature
		 * enabled). We deal with it after leaving this loop.
		 */
		if (key.objectid != clone_root->ino ||
		    key.type != BTRFS_EXTENT_DATA_KEY)
			break;

		ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
		type = btrfs_file_extent_type(leaf, ei);
		if (type == BTRFS_FILE_EXTENT_INLINE) {
			ext_len = btrfs_file_extent_ram_bytes(leaf, ei);
			ext_len = PAGE_ALIGN(ext_len);
		} else {
			ext_len = btrfs_file_extent_num_bytes(leaf, ei);
		}

		if (key.offset + ext_len <= clone_root->offset)
			goto next;

		if (key.offset > clone_root->offset) {
			/* Implicit hole, NO_HOLES feature enabled. */
			u64 hole_len = key.offset - clone_root->offset;

			if (hole_len > len)
				hole_len = len;
			ret = send_extent_data(sctx, dst_path, offset,
					       hole_len);
			if (ret < 0)
				goto out;

			len -= hole_len;
			if (len == 0)
				break;
			offset += hole_len;
			clone_root->offset += hole_len;
			data_offset += hole_len;
		}

		if (key.offset >= clone_root->offset + len)
			break;

		if (key.offset >= clone_src_i_size)
			break;

		if (key.offset + ext_len > clone_src_i_size) {
			ext_len = clone_src_i_size - key.offset;
			crossed_src_i_size = true;
		}

		clone_data_offset = btrfs_file_extent_offset(leaf, ei);
		if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
			clone_root->offset = key.offset;
			if (clone_data_offset < data_offset &&
				clone_data_offset + ext_len > data_offset) {
				u64 extent_offset;

				extent_offset = data_offset - clone_data_offset;
				ext_len -= extent_offset;
				clone_data_offset += extent_offset;
				clone_root->offset += extent_offset;
			}
		}

		clone_len = min_t(u64, ext_len, len);

		if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte &&
		    clone_data_offset == data_offset) {
			const u64 src_end = clone_root->offset + clone_len;
			const u64 sectorsize = SZ_64K;

			/*
			 * We can't clone the last block, when its size is not
			 * sector size aligned, into the middle of a file. If we
			 * do so, the receiver will get a failure (-EINVAL) when
			 * trying to clone or will silently corrupt the data in
			 * the destination file if it's on a kernel without the
			 * fix introduced by commit ac765f83f1397646
			 * ("Btrfs: fix data corruption due to cloning of eof
			 * block").
			 *
			 * So issue a clone of the aligned down range plus a
			 * regular write for the eof block, if we hit that case.
			 *
			 * Also, we use the maximum possible sector size, 64K,
			 * because we don't know what's the sector size of the
			 * filesystem that receives the stream, so we have to
			 * assume the largest possible sector size.
			 */
			if (src_end == clone_src_i_size &&
			    !IS_ALIGNED(src_end, sectorsize) &&
			    offset + clone_len < sctx->cur_inode_size) {
				u64 slen;

				slen = ALIGN_DOWN(src_end - clone_root->offset,
						  sectorsize);
				if (slen > 0) {
					ret = send_clone(sctx, offset, slen,
							 clone_root);
					if (ret < 0)
						goto out;
				}
				ret = send_extent_data(sctx, dst_path,
						       offset + slen,
						       clone_len - slen);
			} else {
				ret = send_clone(sctx, offset, clone_len,
						 clone_root);
			}
		} else if (crossed_src_i_size && clone_len < len) {
			/*
			 * If we are at i_size of the clone source inode and we
			 * can not clone from it, terminate the loop. This is
			 * to avoid sending two write operations, one with a
			 * length matching clone_len and the final one after
			 * this loop with a length of len - clone_len.
			 *
			 * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
			 * was passed to the send ioctl), this helps avoid
			 * sending an encoded write for an offset that is not
			 * sector size aligned, in case the i_size of the source
			 * inode is not sector size aligned. That will make the
			 * receiver fallback to decompression of the data and
			 * writing it using regular buffered IO, therefore while
			 * not incorrect, it's not optimal due to decompression and
			 * possible re-compression at the receiver.
			 */
			break;
		} else {
			ret = send_extent_data(sctx, dst_path, offset,
					       clone_len);
		}

		if (ret < 0)
			goto out;

		len -= clone_len;
		if (len == 0)
			break;
		offset += clone_len;
		clone_root->offset += clone_len;

		/*
		 * If we are cloning from the file we are currently processing,
		 * and using the send root as the clone root, we must stop once
		 * the current clone offset reaches the current eof of the file
		 * at the receiver, otherwise we would issue an invalid clone
		 * operation (source range going beyond eof) and cause the
		 * receiver to fail. So if we reach the current eof, bail out
		 * and fallback to a regular write.
		 */
		if (clone_root->root == sctx->send_root &&
		    clone_root->ino == sctx->cur_ino &&
		    clone_root->offset >= sctx->cur_inode_next_write_offset)
			break;

		data_offset += clone_len;
next:
		path->slots[0]++;
	}

	if (len > 0)
		ret = send_extent_data(sctx, dst_path, offset, len);
	else
		ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}

static int send_write_or_clone(struct send_ctx *sctx,
			       struct btrfs_path *path,
			       struct btrfs_key *key,
			       struct clone_root *clone_root)
{
	int ret = 0;
	u64 offset = key->offset;
	u64 end;
	u64 bs = sctx->send_root->fs_info->sectorsize;
	struct btrfs_file_extent_item *ei;
	u64 disk_byte;
	u64 data_offset;
	u64 num_bytes;
	struct btrfs_inode_info info = { 0 };

	end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size);
	if (offset >= end)
		return 0;

	num_bytes = end - offset;

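	/* Without a clone source we can only fall back to writing the data. */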
	if (!clone_root)
		goto write_data;

	if (IS_ALIGNED(end, bs))
		goto clone_data;

	/*
	 * If the extent end is not aligned, we can clone if the extent ends at
	 * the i_size of the inode and the clone range ends at the i_size of the
	 * source inode, otherwise the clone operation fails with -EINVAL.
	 */
	if (end != sctx->cur_inode_size)
		goto write_data;

	ret = get_inode_info(clone_root->root, clone_root->ino, &info);
	if (ret < 0)
		return ret;

	if (clone_root->offset + num_bytes == info.size)
		goto clone_data;

write_data:
	ret = send_extent_data(sctx, path, offset, num_bytes);
	sctx->cur_inode_next_write_offset = end;
	return ret;

clone_data:
	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
			    struct btrfs_file_extent_item);
	disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei);
	data_offset = btrfs_file_extent_offset(path->nodes[0], ei);
	ret = clone_range(sctx, path, clone_root, disk_byte, data_offset, offset,
			  num_bytes);
	sctx->cur_inode_next_write_offset = end;
	return ret;
}

static int is_extent_unchanged(struct send_ctx *sctx,
			       struct btrfs_path *left_path,
			       struct btrfs_key *ekey)
{
	int ret = 0;
	struct btrfs_key key;
	struct btrfs_path *path = NULL;
	struct extent_buffer *eb;
	int slot;
	struct btrfs_key found_key;
	struct btrfs_file_extent_item *ei;
	u64 left_disknr;
	u64 right_disknr;
	u64 left_offset;
	u64 right_offset;
	u64 left_offset_fixed;
	u64 left_len;
	u64 right_len;
	u64 left_gen;
	u64 right_gen;
	u8 left_type;
	u8 right_type;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	eb = left_path->nodes[0];
	slot = left_path->slots[0];
	ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
	left_type = btrfs_file_extent_type(eb, ei);

	if (left_type != BTRFS_FILE_EXTENT_REG) {
		ret = 0;
		goto out;
	}
	left_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
	left_len = btrfs_file_extent_num_bytes(eb, ei);
	left_offset = btrfs_file_extent_offset(eb, ei);
	left_gen = btrfs_file_extent_generation(eb, ei);

	/*
	 * Following comments will refer to these graphics. L is the left
	 * extents which we are checking at the moment. 1-8 are the right
	 * extents that we iterate.
	 *
	 *       |-----L-----|
	 * |-1-|-2a-|-3-|-4-|-5-|-6-|
	 *
	 *       |-----L-----|
	 * |--1--|-2b-|...(same as above)
	 *
	 * Alternative situation. Happens on files where extents got split.
	 *       |-----L-----|
	 * |-----------7-----------|-6-|
	 *
	 * Alternative situation. Happens on files which got larger.
	 *       |-----L-----|
	 * |-8-|
	 * Nothing follows after 8.
	 */

	key.objectid = ekey->objectid;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = ekey->offset;
	ret = btrfs_search_slot_for_read(sctx->parent_root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret) {
		ret = 0;
		goto out;
	}

	/*
	 * Handle special case where the right side has no extents at all.
	 */
	eb = path->nodes[0];
	slot = path->slots[0];
	btrfs_item_key_to_cpu(eb, &found_key, slot);
	if (found_key.objectid != key.objectid ||
	    found_key.type != key.type) {
		/* If we're a hole then just pretend nothing changed */
		ret = (left_disknr) ? 0 : 1;
		goto out;
	}

	/*
	 * We're now on 2a, 2b or 7.
	 */
	key = found_key;
	while (key.offset < ekey->offset + left_len) {
		ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
		right_type = btrfs_file_extent_type(eb, ei);
		if (right_type != BTRFS_FILE_EXTENT_REG &&
		    right_type != BTRFS_FILE_EXTENT_INLINE) {
			ret = 0;
			goto out;
		}

		if (right_type == BTRFS_FILE_EXTENT_INLINE) {
			right_len = btrfs_file_extent_ram_bytes(eb, ei);
			right_len = PAGE_ALIGN(right_len);
		} else {
			right_len = btrfs_file_extent_num_bytes(eb, ei);
		}

		/*
		 * Are we at extent 8? If yes, we know the extent is changed.
		 * This may only happen on the first iteration.
		 */
		if (found_key.offset + right_len <= ekey->offset) {
			/* If we're a hole just pretend nothing changed */
			ret = (left_disknr) ? 0 : 1;
			goto out;
		}

		/*
		 * We just wanted to see if when we have an inline extent, what
		 * follows it is a regular extent (wanted to check the above
		 * condition for inline extents too). This should normally not
		 * happen but it's possible for example when we have an inline
		 * compressed extent representing data with a size matching
		 * the page size (currently the same as sector size).
		 */
		if (right_type == BTRFS_FILE_EXTENT_INLINE) {
			ret = 0;
			goto out;
		}

		right_disknr = btrfs_file_extent_disk_bytenr(eb, ei);
		right_offset = btrfs_file_extent_offset(eb, ei);
		right_gen = btrfs_file_extent_generation(eb, ei);

		left_offset_fixed = left_offset;
		if (key.offset < ekey->offset) {
			/* Fix the right offset for 2a and 7. */
			right_offset += ekey->offset - key.offset;
		} else {
			/* Fix the left offset for all behind 2a and 2b */
			left_offset_fixed += key.offset - ekey->offset;
		}

		/*
		 * Check if we have the same extent.
		 */
		if (left_disknr != right_disknr ||
		    left_offset_fixed != right_offset ||
		    left_gen != right_gen) {
			ret = 0;
			goto out;
		}

		/*
		 * Go to the next extent.
		 */
		ret = btrfs_next_item(sctx->parent_root, path);
		if (ret < 0)
			goto out;
		if (!ret) {
			eb = path->nodes[0];
			slot = path->slots[0];
			btrfs_item_key_to_cpu(eb, &found_key, slot);
		}
		if (ret || found_key.objectid != key.objectid ||
		    found_key.type != key.type) {
			key.offset += right_len;
			break;
		}
		if (found_key.offset != key.offset + right_len) {
			ret = 0;
			goto out;
		}
		key = found_key;
	}

	/*
	 * We're now behind the left extent (treat as unchanged) or at the end
	 * of the right side (treat as changed).
	 */
	if (key.offset >= ekey->offset + left_len)
		ret = 1;
	else
		ret = 0;


out:
	btrfs_free_path(path);
	return ret;
}

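/*
 * Set cur_inode_last_extent to the end offset (exclusive) of the file extent
 * item found at or before the given offset in the send root.
 */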
static int get_last_extent(struct send_ctx *sctx, u64 offset)
{
	struct btrfs_path *path;
	struct btrfs_root *root = sctx->send_root;
	struct btrfs_key key;
	int ret;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	sctx->cur_inode_last_extent = 0;

	key.objectid = sctx->cur_ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = offset;
	ret = btrfs_search_slot_for_read(root, &key, path, 0, 1);
	if (ret < 0)
		goto out;
	ret = 0;
	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	if (key.objectid != sctx->cur_ino || key.type != BTRFS_EXTENT_DATA_KEY)
		goto out;

	sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
out:
	btrfs_free_path(path);
	return ret;
}

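/*
 * Check if the parent snapshot has no data in the given range. Returns 1 if
 * the whole range [start, end) is a hole (or lies beyond the last extent) in
 * the parent snapshot, 0 if any part of it is covered by an extent with data,
 * and < 0 on error.
 */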
static int range_is_hole_in_parent(struct send_ctx *sctx,
				   const u64 start,
				   const u64 end)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_root *root = sctx->parent_root;
	u64 search_start = start;
	int ret;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	key.objectid = sctx->cur_ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = search_start;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		goto out;
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	while (search_start < end) {
		struct extent_buffer *leaf = path->nodes[0];
		int slot = path->slots[0];
		struct btrfs_file_extent_item *fi;
		u64 extent_end;

		if (slot >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto out;
			else if (ret > 0)
				break;
			continue;
		}

		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (key.objectid < sctx->cur_ino ||
		    key.type < BTRFS_EXTENT_DATA_KEY)
			goto next;
		if (key.objectid > sctx->cur_ino ||
		    key.type > BTRFS_EXTENT_DATA_KEY ||
		    key.offset >= end)
			break;

		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
		extent_end = btrfs_file_extent_end(path);
		if (extent_end <= start)
			goto next;
		if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) {
			search_start = extent_end;
			goto next;
		}
		ret = 0;
		goto out;
next:
		path->slots[0]++;
	}
	ret = 1;
out:
	btrfs_free_path(path);
	return ret;
}

static int maybe_send_hole(struct send_ctx *sctx, struct btrfs_path *path,
			   struct btrfs_key *key)
{
	int ret = 0;

	if (sctx->cur_ino != key->objectid || !need_send_hole(sctx))
		return 0;

	/*
	 * Get last extent's end offset (exclusive) if we haven't determined it
	 * yet (we're processing the first file extent item that is new), or if
	 * we're at the first slot of a leaf and the last extent's end is less
	 * than the current extent's offset, because we might have skipped
	 * entire leaves that contained only file extent items for our current
	 * inode. These leaves have a generation number smaller (older) than the
	 * one in the current leaf and the leaf our last extent came from, and
	 * are located between these 2 leaves.
	 */
	if ((sctx->cur_inode_last_extent == (u64)-1) ||
	    (path->slots[0] == 0 && sctx->cur_inode_last_extent < key->offset)) {
		ret = get_last_extent(sctx, key->offset - 1);
		if (ret)
			return ret;
	}

	if (sctx->cur_inode_last_extent < key->offset) {
		ret = range_is_hole_in_parent(sctx,
					      sctx->cur_inode_last_extent,
					      key->offset);
		if (ret < 0)
			return ret;
		else if (ret == 0)
			ret = send_hole(sctx, key->offset);
		else
			ret = 0;
	}
	sctx->cur_inode_last_extent = btrfs_file_extent_end(path);
	return ret;
}

static int process_extent(struct send_ctx *sctx,
			  struct btrfs_path *path,
			  struct btrfs_key *key)
{
	struct clone_root *found_clone = NULL;
	int ret = 0;

	if (S_ISLNK(sctx->cur_inode_mode))
		return 0;

	if (sctx->parent_root && !sctx->cur_inode_new) {
		ret = is_extent_unchanged(sctx, path, key);
		if (ret < 0)
			goto out;
		if (ret) {
			ret = 0;
			goto out_hole;
		}
	} else {
		struct btrfs_file_extent_item *ei;
		u8 type;

		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_file_extent_item);
		type = btrfs_file_extent_type(path->nodes[0], ei);
		if (type == BTRFS_FILE_EXTENT_PREALLOC ||
		    type == BTRFS_FILE_EXTENT_REG) {
			/*
			 * The send spec does not have a prealloc command yet,
			 * so just leave a hole for prealloc'ed extents until
			 * we have enough commands queued up to justify rev'ing
			 * the send spec.
			 */
			if (type == BTRFS_FILE_EXTENT_PREALLOC) {
				ret = 0;
				goto out;
			}

			/* Have a hole, just skip it. */
			if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) {
				ret = 0;
				goto out;
			}
		}
	}

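	/*
	 * A return of -ENOENT means no suitable clone source was found, in
	 * which case we fall back to sending the data with write commands.
	 */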
	ret = find_extent_clone(sctx, path, key->objectid, key->offset,
			sctx->cur_inode_size, &found_clone);
	if (ret != -ENOENT && ret < 0)
		goto out;

	ret = send_write_or_clone(sctx, path, key, found_clone);
	if (ret)
		goto out;
out_hole:
	ret = maybe_send_hole(sctx, path, key);
out:
	return ret;
}

static int process_all_extents(struct send_ctx *sctx)
{
	int ret = 0;
	int iter_ret = 0;
	struct btrfs_root *root;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key found_key;

	root = sctx->send_root;
	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;

	key.objectid = sctx->cmp_key->objectid;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = 0;
	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
		if (found_key.objectid != key.objectid ||
		    found_key.type != key.type) {
			ret = 0;
			break;
		}

		ret = process_extent(sctx, path, &found_key);
		if (ret < 0)
			break;
	}
	/* Catch error found during iteration */
	if (iter_ret < 0)
		ret = iter_ret;

	btrfs_free_path(path);
	return ret;
}

static int process_recorded_refs_if_needed(struct send_ctx *sctx, int at_end,
					   int *pending_move,
					   int *refs_processed)
{
	int ret = 0;

	if (sctx->cur_ino == 0)
		goto out;
	if (!at_end && sctx->cur_ino == sctx->cmp_key->objectid &&
	    sctx->cmp_key->type <= BTRFS_INODE_EXTREF_KEY)
		goto out;
	if (list_empty(&sctx->new_refs) && list_empty(&sctx->deleted_refs))
		goto out;

	ret = process_recorded_refs(sctx, pending_move);
	if (ret < 0)
		goto out;

	*refs_processed = 1;
out:
	return ret;
}

static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
{
	int ret = 0;
	struct btrfs_inode_info info;
	u64 left_mode;
	u64 left_uid;
	u64 left_gid;
	u64 left_fileattr;
	u64 right_mode;
	u64 right_uid;
	u64 right_gid;
	u64 right_fileattr;
	int need_chmod = 0;
	int need_chown = 0;
	bool need_fileattr = false;
	int need_truncate = 1;
	int pending_move = 0;
	int refs_processed = 0;

	if (sctx->ignore_cur_inode)
		return 0;

	ret = process_recorded_refs_if_needed(sctx, at_end, &pending_move,
					      &refs_processed);
	if (ret < 0)
		goto out;

	/*
	 * We have processed the refs and thus need to advance send_progress.
	 * Now, calls to get_cur_xxx will take the updated refs of the current
	 * inode into account.
	 *
	 * On the other hand, if our current inode is a directory and couldn't
	 * be moved/renamed because its parent was renamed/moved too and it has
	 * a higher inode number, we can only move/rename our current inode
	 * after we moved/renamed its parent. Therefore in this case operate on
	 * the old path (pre move/rename) of our current inode, and the
	 * move/rename will be performed later.
	 */
	if (refs_processed && !pending_move)
		sctx->send_progress = sctx->cur_ino + 1;

	if (sctx->cur_ino == 0 || sctx->cur_inode_deleted)
		goto out;
	if (!at_end && sctx->cmp_key->objectid == sctx->cur_ino)
		goto out;
	ret = get_inode_info(sctx->send_root, sctx->cur_ino, &info);
	if (ret < 0)
		goto out;
	left_mode = info.mode;
	left_uid = info.uid;
	left_gid = info.gid;
	left_fileattr = info.fileattr;

	if (!sctx->parent_root || sctx->cur_inode_new) {
		need_chown = 1;
		if (!S_ISLNK(sctx->cur_inode_mode))
			need_chmod = 1;
		if (sctx->cur_inode_next_write_offset == sctx->cur_inode_size)
			need_truncate = 0;
	} else {
		u64 old_size;

		ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &info);
		if (ret < 0)
			goto out;
		old_size = info.size;
		right_mode = info.mode;
		right_uid = info.uid;
		right_gid = info.gid;
		right_fileattr = info.fileattr;

		if (left_uid != right_uid || left_gid != right_gid)
			need_chown = 1;
		if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode)
			need_chmod = 1;
		if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr)
			need_fileattr = true;
		if ((old_size == sctx->cur_inode_size) ||
		    (sctx->cur_inode_size > old_size &&
		     sctx->cur_inode_next_write_offset == sctx->cur_inode_size))
			need_truncate = 0;
	}

	if (S_ISREG(sctx->cur_inode_mode)) {
		if (need_send_hole(sctx)) {
			if (sctx->cur_inode_last_extent == (u64)-1 ||
			    sctx->cur_inode_last_extent <
			    sctx->cur_inode_size) {
				ret = get_last_extent(sctx, (u64)-1);
				if (ret)
					goto out;
			}
			if (sctx->cur_inode_last_extent < sctx->cur_inode_size) {
				ret = range_is_hole_in_parent(sctx,
						      sctx->cur_inode_last_extent,
						      sctx->cur_inode_size);
				if (ret < 0) {
					goto out;
				} else if (ret == 0) {
					ret = send_hole(sctx, sctx->cur_inode_size);
					if (ret < 0)
						goto out;
				} else {
					/* Range is already a hole, skip. */
					ret = 0;
				}
			}
		}
		if (need_truncate) {
			ret = send_truncate(sctx, sctx->cur_ino,
					    sctx->cur_inode_gen,
					    sctx->cur_inode_size);
			if (ret < 0)
				goto out;
		}
	}

	if (need_chown) {
		ret = send_chown(sctx, sctx->cur_ino, sctx->cur_inode_gen,
				left_uid, left_gid);
		if (ret < 0)
			goto out;
	}
	if (need_chmod) {
		ret = send_chmod(sctx, sctx->cur_ino, sctx->cur_inode_gen,
				left_mode);
		if (ret < 0)
			goto out;
	}
	if (need_fileattr) {
		ret = send_fileattr(sctx, sctx->cur_ino, sctx->cur_inode_gen,
				    left_fileattr);
		if (ret < 0)
			goto out;
	}

	if (proto_cmd_ok(sctx, BTRFS_SEND_C_ENABLE_VERITY)
	    && sctx->cur_inode_needs_verity) {
		ret = process_verity(sctx);
		if (ret < 0)
			goto out;
	}

	ret = send_capabilities(sctx);
	if (ret < 0)
		goto out;

	/*
	 * If other directory inodes depended on our current directory
	 * inode's move/rename, now do their move/rename operations.
	 */
	if (!is_waiting_for_move(sctx, sctx->cur_ino)) {
		ret = apply_children_dir_moves(sctx);
		if (ret)
			goto out;
		/*
		 * Need to send that every time, no matter if it actually
		 * changed between the two trees as we have done changes to
		 * the inode before. If our inode is a directory and it's
		 * waiting to be moved/renamed, we will send its utimes when
		 * it's moved/renamed, therefore we don't need to do it here.
		 */
		sctx->send_progress = sctx->cur_ino + 1;

		/*
		 * If the current inode is a non-empty directory, delay issuing
		 * the utimes command for it, as it's very likely we have inodes
		 * with a higher number inside it. We want to issue the utimes
		 * command only after adding all dentries to it.
		 */
		if (S_ISDIR(sctx->cur_inode_mode) && sctx->cur_inode_size > 0)
			ret = cache_dir_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);
		else
			ret = send_utimes(sctx, sctx->cur_ino, sctx->cur_inode_gen);

		if (ret < 0)
			goto out;
	}

out:
	if (!ret)
		ret = trim_dir_utimes_cache(sctx);

	return ret;
}

static void close_current_inode(struct send_ctx *sctx)
{
	u64 i_size;

	if (sctx->cur_inode == NULL)
		return;

	i_size = i_size_read(sctx->cur_inode);

	/*
	 * If we are doing an incremental send, we may have extents between the
	 * last processed extent and the i_size that have not been processed
	 * because they haven't changed but we may have read some of their pages
	 * through readahead, see the comments at send_extent_data().
	 */
	if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size)
		truncate_inode_pages_range(&sctx->cur_inode->i_data,
					   sctx->page_cache_clear_start,
					   round_up(i_size, PAGE_SIZE) - 1);

	iput(sctx->cur_inode);
	sctx->cur_inode = NULL;
}

static int changed_inode(struct send_ctx *sctx,
			 enum btrfs_compare_tree_result result)
{
	int ret = 0;
	struct btrfs_key *key = sctx->cmp_key;
	struct btrfs_inode_item *left_ii = NULL;
	struct btrfs_inode_item *right_ii = NULL;
	u64 left_gen = 0;
	u64 right_gen = 0;

	close_current_inode(sctx);

	sctx->cur_ino = key->objectid;
	sctx->cur_inode_new_gen = false;
	sctx->cur_inode_last_extent = (u64)-1;
	sctx->cur_inode_next_write_offset = 0;
	sctx->ignore_cur_inode = false;

	/*
	 * Set send_progress to current inode. This will tell all get_cur_xxx
	 * functions that the current inode's refs are not updated yet. Later,
	 * when process_recorded_refs is finished, it is set to cur_ino + 1.
	 */
	sctx->send_progress = sctx->cur_ino;

	if (result == BTRFS_COMPARE_TREE_NEW ||
	    result == BTRFS_COMPARE_TREE_CHANGED) {
		left_ii = btrfs_item_ptr(sctx->left_path->nodes[0],
				sctx->left_path->slots[0],
				struct btrfs_inode_item);
		left_gen = btrfs_inode_generation(sctx->left_path->nodes[0],
				left_ii);
	} else {
		right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
				sctx->right_path->slots[0],
				struct btrfs_inode_item);
		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
				right_ii);
	}
	if (result == BTRFS_COMPARE_TREE_CHANGED) {
		right_ii = btrfs_item_ptr(sctx->right_path->nodes[0],
				sctx->right_path->slots[0],
				struct btrfs_inode_item);

		right_gen = btrfs_inode_generation(sctx->right_path->nodes[0],
				right_ii);

		/*
		 * The cur_ino = root dir case is special here. We can't treat
		 * the inode as deleted+reused because it would generate a
		 * stream that tries to delete/mkdir the root dir.
		 */
		if (left_gen != right_gen &&
		    sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
			sctx->cur_inode_new_gen = true;
	}

	/*
	 * Normally we do not find inodes with a link count of zero (orphans)
	 * because the most common case is to create a snapshot and use it
	 * for a send operation. However other less common use cases involve
	 * using a subvolume and send it after turning it to RO mode just
	 * after deleting all hard links of a file while holding an open
	 * file descriptor against it or turning a RO snapshot into RW mode,
	 * keep an open file descriptor against a file, delete it and then
	 * turn the snapshot back to RO mode before using it for a send
	 * operation. The former is what the receiver operation does.
	 * Therefore, if we want to send these snapshots soon after they're
	 * received, we need to handle orphan inodes as well. Moreover, orphans
	 * can appear not only in the send snapshot but also in the parent
	 * snapshot. Here are several cases:
	 *
	 * Case 1: BTRFS_COMPARE_TREE_NEW
	 *       |  send snapshot  | action
	 * --------------------------------
	 * nlink |        0        | ignore
	 *
	 * Case 2: BTRFS_COMPARE_TREE_DELETED
	 *       | parent snapshot | action
	 * ----------------------------------
	 * nlink |        0        | as usual
	 * Note: No unlinks will be sent because there are no paths for it.
	 *
	 * Case 3: BTRFS_COMPARE_TREE_CHANGED
	 *           |       | parent snapshot | send snapshot | action
	 * -----------------------------------------------------------------------
	 * subcase 1 | nlink |        0        |       0       | ignore
	 * subcase 2 | nlink |       >0        |       0       | new_gen(deletion)
	 * subcase 3 | nlink |        0        |      >0       | new_gen(creation)
	 *
	 */
	if (result == BTRFS_COMPARE_TREE_NEW) {
		if (btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii) == 0) {
			sctx->ignore_cur_inode = true;
			goto out;
		}
		sctx->cur_inode_gen = left_gen;
		sctx->cur_inode_new = true;
		sctx->cur_inode_deleted = false;
		sctx->cur_inode_size = btrfs_inode_size(
				sctx->left_path->nodes[0], left_ii);
		sctx->cur_inode_mode = btrfs_inode_mode(
				sctx->left_path->nodes[0], left_ii);
		sctx->cur_inode_rdev = btrfs_inode_rdev(
				sctx->left_path->nodes[0], left_ii);
		if (sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID)
			ret = send_create_inode_if_needed(sctx);
	} else if (result == BTRFS_COMPARE_TREE_DELETED) {
		sctx->cur_inode_gen = right_gen;
		sctx->cur_inode_new = false;
		sctx->cur_inode_deleted = true;
		sctx->cur_inode_size = btrfs_inode_size(
				sctx->right_path->nodes[0], right_ii);
		sctx->cur_inode_mode = btrfs_inode_mode(
				sctx->right_path->nodes[0], right_ii);
	} else if (result == BTRFS_COMPARE_TREE_CHANGED) {
		u32 new_nlinks, old_nlinks;

		new_nlinks = btrfs_inode_nlink(sctx->left_path->nodes[0], left_ii);
		old_nlinks = btrfs_inode_nlink(sctx->right_path->nodes[0], right_ii);
		if (new_nlinks == 0 && old_nlinks == 0) {
			sctx->ignore_cur_inode = true;
			goto out;
		} else if (new_nlinks == 0 || old_nlinks == 0) {
			sctx->cur_inode_new_gen = true;
		}
		/*
		 * We need to do some special handling in case the inode was
		 * reported as changed with a changed generation number. This
		 * means that the original inode was deleted and new inode
		 * reused the same inum. So we have to treat the old inode as
		 * deleted and the new one as new.
		 */
		if (sctx->cur_inode_new_gen) {
			/*
			 * First, process the inode as if it was deleted.
			 */
			if (old_nlinks > 0) {
				sctx->cur_inode_gen = right_gen;
				sctx->cur_inode_new = false;
				sctx->cur_inode_deleted = true;
				sctx->cur_inode_size = btrfs_inode_size(
						sctx->right_path->nodes[0], right_ii);
				sctx->cur_inode_mode = btrfs_inode_mode(
						sctx->right_path->nodes[0], right_ii);
				ret = process_all_refs(sctx,
						BTRFS_COMPARE_TREE_DELETED);
				if (ret < 0)
					goto out;
			}

			/*
			 * Now process the inode as if it was new.
			 */
			if (new_nlinks > 0) {
				sctx->cur_inode_gen = left_gen;
				sctx->cur_inode_new = true;
				sctx->cur_inode_deleted = false;
				sctx->cur_inode_size = btrfs_inode_size(
						sctx->left_path->nodes[0],
						left_ii);
				sctx->cur_inode_mode = btrfs_inode_mode(
						sctx->left_path->nodes[0],
						left_ii);
				sctx->cur_inode_rdev = btrfs_inode_rdev(
						sctx->left_path->nodes[0],
						left_ii);
				ret = send_create_inode_if_needed(sctx);
				if (ret < 0)
					goto out;

				ret = process_all_refs(sctx, BTRFS_COMPARE_TREE_NEW);
				if (ret < 0)
					goto out;
				/*
				 * Advance send_progress now as we did not get
				 * into process_recorded_refs_if_needed in the
				 * new_gen case.
				 */
				sctx->send_progress = sctx->cur_ino + 1;

				/*
				 * Now process all extents and xattrs of the
				 * inode as if they were all new.
				 */
				ret = process_all_extents(sctx);
				if (ret < 0)
					goto out;
				ret = process_all_new_xattrs(sctx);
				if (ret < 0)
					goto out;
			}
		} else {
			sctx->cur_inode_gen = left_gen;
			sctx->cur_inode_new = false;
			sctx->cur_inode_new_gen = false;
			sctx->cur_inode_deleted = false;
			sctx->cur_inode_size = btrfs_inode_size(
					sctx->left_path->nodes[0], left_ii);
			sctx->cur_inode_mode = btrfs_inode_mode(
					sctx->left_path->nodes[0], left_ii);
		}
	}

out:
	return ret;
}

/*
 * We have to process new refs before deleted refs, but compare_trees gives us
 * the new and deleted refs mixed. To fix this, we record the new/deleted refs
 * first and later process them in process_recorded_refs.
 * For the cur_inode_new_gen case, we skip recording completely because
 * changed_inode already initiated processing of refs. The reason for this is
 * that in this case, compare_tree actually compares the refs of 2 different
 * inodes. To fix this, process_all_refs is used in changed_inode to handle all
 * refs of the right tree as deleted and all refs of the left tree as new.
 */
static int changed_ref(struct send_ctx *sctx,
		       enum btrfs_compare_tree_result result)
{
	int ret = 0;

	if (sctx->cur_ino != sctx->cmp_key->objectid) {
		inconsistent_snapshot_error(sctx, result, "reference");
		return -EIO;
	}

	if (!sctx->cur_inode_new_gen &&
	    sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) {
		if (result == BTRFS_COMPARE_TREE_NEW)
			ret = record_new_ref(sctx);
		else if (result == BTRFS_COMPARE_TREE_DELETED)
			ret = record_deleted_ref(sctx);
		else if (result == BTRFS_COMPARE_TREE_CHANGED)
			ret = record_changed_ref(sctx);
	}

	return ret;
}

/*
 * Process new/deleted/changed xattrs. We skip processing in the
 * cur_inode_new_gen case because changed_inode already initiated processing
 * of xattrs. The reason is the same as in changed_ref().
 */
static int changed_xattr(struct send_ctx *sctx,
			 enum btrfs_compare_tree_result result)
{
	int ret = 0;

	if (sctx->cur_ino != sctx->cmp_key->objectid) {
		inconsistent_snapshot_error(sctx, result, "xattr");
		return -EIO;
	}

	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
		if (result == BTRFS_COMPARE_TREE_NEW)
			ret = process_new_xattr(sctx);
		else if (result == BTRFS_COMPARE_TREE_DELETED)
			ret = process_deleted_xattr(sctx);
		else if (result == BTRFS_COMPARE_TREE_CHANGED)
			ret = process_changed_xattr(sctx);
	}

	return ret;
}

/*
 * Process new/deleted/changed extents. We skip processing in the
 * cur_inode_new_gen case because changed_inode already initiated processing
 * of extents. The reason is the same as in changed_ref().
 */
static int changed_extent(struct send_ctx *sctx,
			  enum btrfs_compare_tree_result result)
{
	int ret = 0;

	/*
	 * We have found an extent item that changed without the inode item
	 * having changed. This can happen either after relocation (where the
	 * disk_bytenr of an extent item is replaced at
	 * relocation.c:replace_file_extents()) or after deduplication into a
	 * file in both the parent and send snapshots (where an extent item can
	 * get modified or replaced with a new one). Note that deduplication
	 * updates the inode item, but it only changes the iversion (sequence
	 * field in the inode item) of the inode, so if a file is deduplicated
	 * the same amount of times in both the parent and send snapshots, its
	 * iversion becomes the same in both snapshots, hence the inode item is
	 * the same in both snapshots.
	 */
	if (sctx->cur_ino != sctx->cmp_key->objectid)
		return 0;

	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
		if (result != BTRFS_COMPARE_TREE_DELETED)
			ret = process_extent(sctx, sctx->left_path,
					sctx->cmp_key);
	}

	return ret;
}

static int changed_verity(struct send_ctx *sctx, enum btrfs_compare_tree_result result)
{
	int ret = 0;

	if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
		if (result == BTRFS_COMPARE_TREE_NEW)
			sctx->cur_inode_needs_verity = true;
	}
	return ret;
}

static int dir_changed(struct send_ctx *sctx, u64 dir)
{
	u64 orig_gen, new_gen;
	int ret;

	ret = get_inode_gen(sctx->send_root, dir, &new_gen);
	if (ret)
		return ret;

	ret = get_inode_gen(sctx->parent_root, dir, &orig_gen);
	if (ret)
		return ret;

	return (orig_gen != new_gen) ? 1 : 0;
}

static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
			struct btrfs_key *key)
{
	struct btrfs_inode_extref *extref;
	struct extent_buffer *leaf;
	u64 dirid = 0, last_dirid = 0;
	unsigned long ptr;
	u32 item_size;
	u32 cur_offset = 0;
	int ref_name_len;
	int ret = 0;

	/* Easy case, just check this one dirid */
	if (key->type == BTRFS_INODE_REF_KEY) {
		dirid = key->offset;

		ret = dir_changed(sctx, dirid);
		goto out;
	}

	leaf = path->nodes[0];
	item_size = btrfs_item_size(leaf, path->slots[0]);
	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
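	/*
	 * Walk all extrefs in the item and check each referenced parent
	 * directory, skipping consecutive references to the same directory.
	 */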
	while (cur_offset < item_size) {
		extref = (struct btrfs_inode_extref *)(ptr +
						       cur_offset);
		dirid = btrfs_inode_extref_parent(leaf, extref);
		ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
		cur_offset += ref_name_len + sizeof(*extref);
		if (dirid == last_dirid)
			continue;
		ret = dir_changed(sctx, dirid);
		if (ret)
			break;
		last_dirid = dirid;
	}
out:
	return ret;
}

/*
 * Updates compare related fields in sctx and simply forwards to the actual
 * changed_xxx functions.
 */
static int changed_cb(struct btrfs_path *left_path,
		      struct btrfs_path *right_path,
		      struct btrfs_key *key,
		      enum btrfs_compare_tree_result result,
		      struct send_ctx *sctx)
{
	int ret = 0;

	/*
	 * We can not hold the commit root semaphore here. This is because in
	 * the case of sending and receiving to the same filesystem, using a
	 * pipe, could result in a deadlock:
	 *
	 * 1) The task running send blocks on the pipe because it's full;
	 *
	 * 2) The task running receive, which is the only consumer of the pipe,
	 *    is waiting for a transaction commit (for example due to a space
	 *    reservation when doing a write or triggering a transaction commit
	 *    when creating a subvolume);
	 *
	 * 3) The transaction is waiting to write lock the commit root semaphore,
	 *    but can not acquire it since it's being held at 1).
	 *
	 * Down this call chain we write to the pipe through kernel_write().
	 * The same type of problem can also happen when sending to a file that
	 * is stored in the same filesystem - when reserving space for a write
	 * into the file, we can trigger a transaction commit.
	 *
	 * Our caller has supplied us with clones of leaves from the send and
	 * parent roots, so we're safe here from a concurrent relocation and
	 * further reallocation of metadata extents while we are here. Below we
	 * also assert that the leaves are clones.
	 */
	lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem);

	/*
	 * We always have a send root, so left_path is never NULL. We will not
	 * have a leaf when we have reached the end of the send root but have
	 * not yet reached the end of the parent root.
	 */
	if (left_path->nodes[0])
		ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
				&left_path->nodes[0]->bflags));
	/*
	 * When doing a full send we don't have a parent root, so right_path is
	 * NULL. When doing an incremental send, we may have reached the end of
	 * the parent root already, so we don't have a leaf at right_path.
	 */
	if (right_path && right_path->nodes[0])
		ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
				&right_path->nodes[0]->bflags));

	if (result == BTRFS_COMPARE_TREE_SAME) {
		if (key->type == BTRFS_INODE_REF_KEY ||
		    key->type == BTRFS_INODE_EXTREF_KEY) {
			ret = compare_refs(sctx, left_path, key);
			if (!ret)
				return 0;
			if (ret < 0)
				return ret;
		} else if (key->type == BTRFS_EXTENT_DATA_KEY) {
			return maybe_send_hole(sctx, left_path, key);
		} else {
			return 0;
		}
		result = BTRFS_COMPARE_TREE_CHANGED;
		ret = 0;
	}

	sctx->left_path = left_path;
	sctx->right_path = right_path;
	sctx->cmp_key = key;

	ret = finish_inode_if_needed(sctx, 0);
	if (ret < 0)
		goto out;

	/* Ignore non-FS objects */
	if (key->objectid == BTRFS_FREE_INO_OBJECTID ||
	    key->objectid == BTRFS_FREE_SPACE_OBJECTID)
		goto out;

	if (key->type == BTRFS_INODE_ITEM_KEY) {
		ret = changed_inode(sctx, result);
	} else if (!sctx->ignore_cur_inode) {
		if (key->type == BTRFS_INODE_REF_KEY ||
		    key->type == BTRFS_INODE_EXTREF_KEY)
			ret = changed_ref(sctx, result);
		else if (key->type == BTRFS_XATTR_ITEM_KEY)
			ret = changed_xattr(sctx, result);
		else if (key->type == BTRFS_EXTENT_DATA_KEY)
			ret = changed_extent(sctx, result);
		else if (key->type == BTRFS_VERITY_DESC_ITEM_KEY &&
			 key->offset == 0)
			ret = changed_verity(sctx, result);
	}

out:
	return ret;
}

static int search_key_again(const struct send_ctx *sctx,
			    struct btrfs_root *root,
			    struct btrfs_path *path,
			    const struct btrfs_key *key)
{
	int ret;

	if (!path->need_commit_sem)
		lockdep_assert_held_read(&root->fs_info->commit_root_sem);

	/*
	 * Roots used for send operations are readonly and no one can add,
	 * update or remove keys from them, so we should be able to find our
	 * key again. The only exception is deduplication, which can operate on
	 * readonly roots and add, update or remove keys to/from them - but at
	 * the moment we don't allow it to run in parallel with send.
	 */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	ASSERT(ret <= 0);
	if (ret > 0) {
		btrfs_print_tree(path->nodes[path->lowest_level], false);
		btrfs_err(root->fs_info,
"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
			  key->objectid, key->type, key->offset,
			  (root == sctx->parent_root ? "parent" : "send"),
			  btrfs_root_id(root), path->lowest_level,
			  path->slots[path->lowest_level]);
		return -EUCLEAN;
	}

	return ret;
}

static int full_send_tree(struct send_ctx *sctx)
{
	int ret;
	struct btrfs_root *send_root = sctx->send_root;
	struct btrfs_key key;
	struct btrfs_fs_info *fs_info = send_root->fs_info;
	struct btrfs_path *path;

	path = alloc_path_for_send();
	if (!path)
		return -ENOMEM;
	path->reada = READA_FORWARD_ALWAYS;

	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	down_read(&fs_info->commit_root_sem);
	sctx->last_reloc_trans = fs_info->last_reloc_trans;
	up_read(&fs_info->commit_root_sem);

	ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
	if (ret < 0)
		goto out;
	if (ret)
		goto out_finish;

	while (1) {
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);

		ret = changed_cb(path, NULL, &key,
				 BTRFS_COMPARE_TREE_NEW, sctx);
		if (ret < 0)
			goto out;

		down_read(&fs_info->commit_root_sem);
		if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
			sctx->last_reloc_trans = fs_info->last_reloc_trans;
			up_read(&fs_info->commit_root_sem);
			/*
			 * A transaction used for relocating a block group was
			 * committed or is about to finish its commit. Release
			 * our path (leaf) and restart the search, so that we
			 * avoid operating on any file extent items that are
			 * stale, with a disk_bytenr that reflects a pre
			 * relocation value. This way we avoid, as much as
			 * possible, falling back to regular writes when
			 * checking if we can clone file ranges.
			 */
			btrfs_release_path(path);
			ret = search_key_again(sctx, send_root, path, &key);
			if (ret < 0)
				goto out;
		} else {
			up_read(&fs_info->commit_root_sem);
		}

		ret = btrfs_next_item(send_root, path);
		if (ret < 0)
			goto out;
		if (ret) {
			ret = 0;
			break;
		}
	}

out_finish:
	ret = finish_inode_if_needed(sctx, 1);

out:
	btrfs_free_path(path);
	return ret;
}

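/*
 * Replace the extent buffer at the given level of the path with a private
 * clone, so it remains safe to use after the commit root semaphore is
 * released.
 */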
static int replace_node_with_clone(struct btrfs_path *path, int level)
{
	struct extent_buffer *clone;

	clone = btrfs_clone_extent_buffer(path->nodes[level]);
	if (!clone)
		return -ENOMEM;

	free_extent_buffer(path->nodes[level]);
	path->nodes[level] = clone;

	return 0;
}

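/*
 * Move one level down in the tree, reading the child node the path currently
 * points to and triggering readahead for following children whose generation
 * is newer than reada_min_gen. When reaching level 0 the leaf is replaced with
 * a clone.
 */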
static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
{
	struct extent_buffer *eb;
	struct extent_buffer *parent = path->nodes[*level];
	int slot = path->slots[*level];
	const int nritems = btrfs_header_nritems(parent);
	u64 reada_max;
	u64 reada_done = 0;

	lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
	ASSERT(*level != 0);

	eb = btrfs_read_node_slot(parent, slot);
	if (IS_ERR(eb))
		return PTR_ERR(eb);

	/*
	 * Trigger readahead for the next leaves we will process, so that it is
	 * very likely that when we need them they are already in memory and we
	 * will not block on disk IO. For nodes we only do readahead for one,
	 * since the time window between processing nodes is typically larger.
	 */
	reada_max = (*level == 1 ? SZ_128K : eb->fs_info->nodesize);

	for (slot++; slot < nritems && reada_done < reada_max; slot++) {
		if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) {
			btrfs_readahead_node_child(parent, slot);
			reada_done += eb->fs_info->nodesize;
		}
	}

	path->nodes[*level - 1] = eb;
	path->slots[*level - 1] = 0;
	(*level)--;

	if (*level == 0)
		return replace_node_with_clone(path, 0);

	return 0;
}

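/*
 * Advance to the next slot at the current level, walking up as many levels as
 * needed when the end of a node is reached. Returns 1 if it had to move up,
 * 0 if it only moved to the next slot, and -1 if the end of the root node was
 * reached.
 */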
static int tree_move_next_or_upnext(struct btrfs_path *path,
				    int *level, int root_level)
{
	int ret = 0;
	int nritems;
	nritems = btrfs_header_nritems(path->nodes[*level]);

	path->slots[*level]++;

	while (path->slots[*level] >= nritems) {
		if (*level == root_level) {
			path->slots[*level] = nritems - 1;
			return -1;
		}

		/* move upnext */
		path->slots[*level] = 0;
		free_extent_buffer(path->nodes[*level]);
		path->nodes[*level] = NULL;
		(*level)++;
		path->slots[*level]++;

		nritems = btrfs_header_nritems(path->nodes[*level]);
		ret = 1;
	}
	return ret;
}

/*
 * Returns 1 if it had to move up and next. 0 is returned if it moved only next
 * or down.
 */
static int tree_advance(struct btrfs_path *path,
			int *level, int root_level,
			int allow_down,
			struct btrfs_key *key,
			u64 reada_min_gen)
{
	int ret;

	if (*level == 0 || !allow_down) {
		ret = tree_move_next_or_upnext(path, level, root_level);
	} else {
		ret = tree_move_down(path, level, reada_min_gen);
	}

	/*
	 * Even if we have reached the end of a tree, ret is -1, update the key
	 * anyway, so that in case we need to restart due to a block group
	 * relocation, we can assert that the last key of the root node still
	 * exists in the tree.
	 */
	if (*level == 0)
		btrfs_item_key_to_cpu(path->nodes[*level], key,
				      path->slots[*level]);
	else
		btrfs_node_key_to_cpu(path->nodes[*level], key,
				      path->slots[*level]);

	return ret;
}

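/*
 * Compare the items at the current slots of both paths. Returns 0 if they have
 * the same size and content, 1 otherwise. tmp_buf must be large enough to hold
 * a full item (the caller allocates nodesize bytes).
 */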
static int tree_compare_item(struct btrfs_path *left_path,
			     struct btrfs_path *right_path,
			     char *tmp_buf)
{
	int cmp;
	int len1, len2;
	unsigned long off1, off2;

	len1 = btrfs_item_size(left_path->nodes[0], left_path->slots[0]);
	len2 = btrfs_item_size(right_path->nodes[0], right_path->slots[0]);
	if (len1 != len2)
		return 1;

	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
				right_path->slots[0]);

	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);

	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
	if (cmp)
		return 1;
	return 0;
}

/*
 * A transaction used for relocating a block group was committed or is about to
 * finish its commit. Release our paths and restart the search, so that we are
 * not using stale extent buffers:
 *
 * 1) For levels > 0, we are only holding references of extent buffers, without
 *    any locks on them, which does not prevent them from having been relocated
 *    and reallocated after the last time we released the commit root semaphore.
 *    The exception are the root nodes, for which we always have a clone, see
 *    the comment at btrfs_compare_trees();
 *
 * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so
 *    we are safe from the concurrent relocation and reallocation. However they
 *    can have file extent items with a pre relocation disk_bytenr value, so we
 *    restart the search from the current commit roots and clone the new leaves so
 *    that we get the post relocation disk_bytenr values. Not doing so, could
 *    make us clone the wrong data in case there are new extents using the old
 *    disk_bytenr that happen to be shared.
 */
static int restart_after_relocation(struct btrfs_path *left_path,
				    struct btrfs_path *right_path,
				    const struct btrfs_key *left_key,
				    const struct btrfs_key *right_key,
				    int left_level,
				    int right_level,
				    const struct send_ctx *sctx)
{
	int root_level;
	int ret;

	lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem);

	btrfs_release_path(left_path);
	btrfs_release_path(right_path);

	/*
	 * Since keys cannot be added to or removed from our roots because they
	 * are readonly and we do not allow deduplication to run in parallel
	 * (which can add, remove or change keys), the layout of the trees should
	 * not change.
	 */
	left_path->lowest_level = left_level;
	ret = search_key_again(sctx, sctx->send_root, left_path, left_key);
	if (ret < 0)
		return ret;

	right_path->lowest_level = right_level;
	ret = search_key_again(sctx, sctx->parent_root, right_path, right_key);
	if (ret < 0)
		return ret;

	/*
	 * If the lowest level nodes are leaves, clone them so that they can be
	 * safely used by changed_cb() while not under the protection of the
	 * commit root semaphore, even if relocation and reallocation happens in
	 * parallel.
	 */
	if (left_level == 0) {
		ret = replace_node_with_clone(left_path, 0);
		if (ret < 0)
			return ret;
	}

	if (right_level == 0) {
		ret = replace_node_with_clone(right_path, 0);
		if (ret < 0)
			return ret;
	}

	/*
	 * Now clone the root nodes (unless they happen to be the leaves we have
	 * already cloned). This is to protect against concurrent snapshotting of
	 * the send and parent roots (see the comment at btrfs_compare_trees()).
	 */
	root_level = btrfs_header_level(sctx->send_root->commit_root);
	if (root_level > 0) {
		ret = replace_node_with_clone(left_path, root_level);
		if (ret < 0)
			return ret;
	}

	root_level = btrfs_header_level(sctx->parent_root->commit_root);
	if (root_level > 0) {
		ret = replace_node_with_clone(right_path, root_level);
		if (ret < 0)
			return ret;
	}

	return 0;
}

/*
 * This function compares two trees and calls the provided callback for
 * every changed/new/deleted item it finds.
 * If shared tree blocks are encountered, whole subtrees are skipped, making
 * the compare pretty fast on snapshotted subvolumes.
 *
 * This currently works on commit roots only. As commit roots are read only,
 * we don't do any locking. The commit roots are protected with transactions.
 * Transactions are ended and rejoined when a commit is tried in between.
 *
 * This function checks for modifications done to the trees while comparing.
 * If it detects a change, it aborts immediately.
 */
static int btrfs_compare_trees(struct btrfs_root *left_root,
			struct btrfs_root *right_root, struct send_ctx *sctx)
{
	struct btrfs_fs_info *fs_info = left_root->fs_info;
	int ret;
	int cmp;
	struct btrfs_path *left_path = NULL;
	struct btrfs_path *right_path = NULL;
	struct btrfs_key left_key;
	struct btrfs_key right_key;
	char *tmp_buf = NULL;
	int left_root_level;
	int right_root_level;
	int left_level;
	int right_level;
	int left_end_reached = 0;
	int right_end_reached = 0;
	int advance_left = 0;
	int advance_right = 0;
	u64 left_blockptr;
	u64 right_blockptr;
	u64 left_gen;
	u64 right_gen;
	u64 reada_min_gen;

	left_path = btrfs_alloc_path();
	if (!left_path) {
		ret = -ENOMEM;
		goto out;
	}
	right_path = btrfs_alloc_path();
	if (!right_path) {
		ret = -ENOMEM;
		goto out;
	}

	tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
	if (!tmp_buf) {
		ret = -ENOMEM;
		goto out;
	}

	left_path->search_commit_root = 1;
	left_path->skip_locking = 1;
	right_path->search_commit_root = 1;
	right_path->skip_locking = 1;

	/*
	 * Strategy: Go to the first items of both trees. Then do
	 *
	 * If both trees are at level 0
	 *   Compare keys of current items
	 *     If left < right treat left item as new, advance left tree
	 *       and repeat
	 *     If left > right treat right item as deleted, advance right tree
	 *       and repeat
	 *     If left == right do deep compare of items, treat as changed if
	 *       needed, advance both trees and repeat
	 * If both trees are at the same level but not at level 0
	 *   Compare keys of current nodes/leaves
	 *     If left < right advance left tree and repeat
	 *     If left > right advance right tree and repeat
	 *     If left == right compare blockptrs of the next nodes/leaves
	 *       If they match advance both trees but stay at the same level
	 *         and repeat
	 *       If they don't match advance both trees while allowing to go
	 *         deeper and repeat
	 * If tree levels are different
	 *   Advance the tree that needs it and repeat
	 *
	 * Advancing a tree means:
	 *   If we are at level 0, try to go to the next slot. If that's not
	 *   possible, go one level up and repeat. Stop when we find a level
	 *   where we can go to the next slot. We may at this point be on a
	 *   node or a leaf.
	 *
	 *   If we are not at level 0 and not on shared tree blocks, go one
	 *   level deeper.
	 *
	 *   If we are not at level 0 and on shared tree blocks, go one slot to
	 *   the right if possible or go up and right.
	 */

	down_read(&fs_info->commit_root_sem);
	left_level = btrfs_header_level(left_root->commit_root);
	left_root_level = left_level;
	/*
	 * We clone the root node of the send and parent roots to prevent races
	 * with snapshot creation of these roots. Snapshot creation COWs the
	 * root node of a tree, so after the transaction is committed the old
	 * extent can be reallocated while this send operation is still ongoing.
	 * So we clone them, under the commit root semaphore, to be race free.
	 */
	left_path->nodes[left_level] =
			btrfs_clone_extent_buffer(left_root->commit_root);
	if (!left_path->nodes[left_level]) {
		ret = -ENOMEM;
		goto out_unlock;
	}

	right_level = btrfs_header_level(right_root->commit_root);
	right_root_level = right_level;
	right_path->nodes[right_level] =
			btrfs_clone_extent_buffer(right_root->commit_root);
	if (!right_path->nodes[right_level]) {
		ret = -ENOMEM;
		goto out_unlock;
	}
	/*
	 * Our right root is the parent root, while the left root is the "send"
	 * root. We know that all new nodes/leaves in the left root must have
	 * a generation greater than the right root's generation, so we trigger
	 * readahead for those nodes and leaves of the left root, as we know we
	 * will need to read them at some point.
	 */
	reada_min_gen = btrfs_header_generation(right_root->commit_root);

	if (left_level == 0)
		btrfs_item_key_to_cpu(left_path->nodes[left_level],
				&left_key, left_path->slots[left_level]);
	else
		btrfs_node_key_to_cpu(left_path->nodes[left_level],
				&left_key, left_path->slots[left_level]);
	if (right_level == 0)
		btrfs_item_key_to_cpu(right_path->nodes[right_level],
				&right_key, right_path->slots[right_level]);
	else
		btrfs_node_key_to_cpu(right_path->nodes[right_level],
				&right_key, right_path->slots[right_level]);

	sctx->last_reloc_trans = fs_info->last_reloc_trans;

	while (1) {
		if (need_resched() ||
		    rwsem_is_contended(&fs_info->commit_root_sem)) {
			up_read(&fs_info->commit_root_sem);
			cond_resched();
			down_read(&fs_info->commit_root_sem);
		}

		if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
			ret = restart_after_relocation(left_path, right_path,
						       &left_key, &right_key,
						       left_level, right_level,
						       sctx);
			if (ret < 0)
				goto out_unlock;
			sctx->last_reloc_trans = fs_info->last_reloc_trans;
		}

		if (advance_left && !left_end_reached) {
			ret = tree_advance(left_path, &left_level,
					left_root_level,
					advance_left != ADVANCE_ONLY_NEXT,
					&left_key, reada_min_gen);
			if (ret == -1)
				left_end_reached = ADVANCE;
			else if (ret < 0)
				goto out_unlock;
			advance_left = 0;
		}
		if (advance_right && !right_end_reached) {
			ret = tree_advance(right_path, &right_level,
					right_root_level,
					advance_right != ADVANCE_ONLY_NEXT,
					&right_key, reada_min_gen);
			if (ret == -1)
				right_end_reached = ADVANCE;
			else if (ret < 0)
				goto out_unlock;
			advance_right = 0;
		}

		if (left_end_reached && right_end_reached) {
			ret = 0;
			goto out_unlock;
		} else if (left_end_reached) {
			if (right_level == 0) {
				up_read(&fs_info->commit_root_sem);
				ret = changed_cb(left_path, right_path,
						&right_key,
						BTRFS_COMPARE_TREE_DELETED,
						sctx);
				if (ret < 0)
					goto out;
				down_read(&fs_info->commit_root_sem);
			}
			advance_right = ADVANCE;
			continue;
		} else if (right_end_reached) {
			if (left_level == 0) {
				up_read(&fs_info->commit_root_sem);
				ret = changed_cb(left_path, right_path,
						&left_key,
						BTRFS_COMPARE_TREE_NEW,
						sctx);
				if (ret < 0)
					goto out;
				down_read(&fs_info->commit_root_sem);
			}
			advance_left = ADVANCE;
			continue;
		}

		if (left_level == 0 && right_level == 0) {
			up_read(&fs_info->commit_root_sem);
			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
			if (cmp < 0) {
				ret = changed_cb(left_path, right_path,
						&left_key,
						BTRFS_COMPARE_TREE_NEW,
						sctx);
				advance_left = ADVANCE;
			} else if (cmp > 0) {
				ret = changed_cb(left_path, right_path,
						&right_key,
						BTRFS_COMPARE_TREE_DELETED,
						sctx);
				advance_right = ADVANCE;
			} else {
				enum btrfs_compare_tree_result result;

				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
				ret = tree_compare_item(left_path, right_path,
							tmp_buf);
				if (ret)
					result = BTRFS_COMPARE_TREE_CHANGED;
				else
					result = BTRFS_COMPARE_TREE_SAME;
				ret = changed_cb(left_path, right_path,
						 &left_key, result, sctx);
				advance_left = ADVANCE;
				advance_right = ADVANCE;
			}

			if (ret < 0)
				goto out;
			down_read(&fs_info->commit_root_sem);
		} else if (left_level == right_level) {
			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
			if (cmp < 0) {
				advance_left = ADVANCE;
			} else if (cmp > 0) {
				advance_right = ADVANCE;
			} else {
				left_blockptr = btrfs_node_blockptr(
						left_path->nodes[left_level],
						left_path->slots[left_level]);
				right_blockptr = btrfs_node_blockptr(
						right_path->nodes[right_level],
						right_path->slots[right_level]);
				left_gen = btrfs_node_ptr_generation(
						left_path->nodes[left_level],
						left_path->slots[left_level]);
				right_gen = btrfs_node_ptr_generation(
						right_path->nodes[right_level],
						right_path->slots[right_level]);
				if (left_blockptr == right_blockptr &&
				    left_gen == right_gen) {
					/*
					 * As we're on a shared block, don't
					 * allow to go deeper.
					 */
					advance_left = ADVANCE_ONLY_NEXT;
					advance_right = ADVANCE_ONLY_NEXT;
				} else {
					advance_left = ADVANCE;
					advance_right = ADVANCE;
				}
			}
		} else if (left_level < right_level) {
			advance_right = ADVANCE;
		} else {
			advance_left = ADVANCE;
		}
	}

out_unlock:
	up_read(&fs_info->commit_root_sem);
out:
	btrfs_free_path(left_path);
	btrfs_free_path(right_path);
	kvfree(tmp_buf);
	return ret;
}

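/*
 * Emit the send stream for the subvolume: the stream header (unless omitted),
 * the subvolume/snapshot begin command and then either an incremental diff
 * against the parent root or a full send of the whole tree.
 */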
static int send_subvol(struct send_ctx *sctx)
{
	int ret;

	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_STREAM_HEADER)) {
		ret = send_header(sctx);
		if (ret < 0)
			goto out;
	}

	ret = send_subvol_begin(sctx);
	if (ret < 0)
		goto out;

	if (sctx->parent_root) {
		ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx);
		if (ret < 0)
			goto out;
		ret = finish_inode_if_needed(sctx, 1);
		if (ret < 0)
			goto out;
	} else {
		ret = full_send_tree(sctx);
		if (ret < 0)
			goto out;
	}

out:
	free_recorded_refs(sctx);
	return ret;
}

/*
 * If orphan cleanup did remove any orphans from a root, it means the tree
 * was modified and therefore the commit root is not the same as the current
 * root anymore. This is a problem, because send uses the commit root and
 * therefore can see inode items that don't exist in the current root anymore,
 * and for example make calls to btrfs_iget, which will do tree lookups based
 * on the current root and not on the commit root. Those lookups will fail,
 * returning a -ESTALE error, and making send fail with that error. So make
 * sure a send does not see any orphans we have just removed, and that it will
 * see the same inodes regardless of whether a transaction commit happened
 * before it started (meaning that the commit root will be the same as the
 * current root) or not.
 */
static int ensure_commit_roots_uptodate(struct send_ctx *sctx)
{
	struct btrfs_root *root = sctx->parent_root;

	if (root && root->node != root->commit_root)
		return btrfs_commit_current_transaction(root);

	for (int i = 0; i < sctx->clone_roots_cnt; i++) {
		root = sctx->clone_roots[i].root;
		if (root->node != root->commit_root)
			return btrfs_commit_current_transaction(root);
	}

	return 0;
}

/*
 * Make sure any existing delalloc is flushed for any root used by a send
 * operation so that we do not miss any data and we do not race with writeback
 * finishing and changing a tree while send is using the tree. This could
 * happen if a subvolume is in RW mode, has delalloc, is turned to RO mode and
 * a send operation then uses the subvolume.
 * After flushing delalloc ensure_commit_roots_uptodate() must be called.
 */
static int flush_delalloc_roots(struct send_ctx *sctx)
{
	struct btrfs_root *root = sctx->parent_root;
	int ret;
	int i;

	if (root) {
		ret = btrfs_start_delalloc_snapshot(root, false);
		if (ret)
			return ret;
		btrfs_wait_ordered_extents(root, U64_MAX, NULL);
	}

	for (i = 0; i < sctx->clone_roots_cnt; i++) {
		root = sctx->clone_roots[i].root;
		ret = btrfs_start_delalloc_snapshot(root, false);
		if (ret)
			return ret;
		btrfs_wait_ordered_extents(root, U64_MAX, NULL);
	}

	return 0;
}

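/* Drop one reference from the root's send_in_progress counter. */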
static void btrfs_root_dec_send_in_progress(struct btrfs_root* root)
{
	spin_lock(&root->root_item_lock);
	root->send_in_progress--;
	/*
	 * Not much left to do, we don't know why it's unbalanced and
	 * can't blindly reset it to 0.
	 */
	if (root->send_in_progress < 0)
		btrfs_err(root->fs_info,
			  "send_in_progress unbalanced %d root %llu",
			  root->send_in_progress, btrfs_root_id(root));
	spin_unlock(&root->root_item_lock);
}

static void dedupe_in_progress_warn(const struct btrfs_root *root)
{
	btrfs_warn_rl(root->fs_info,
"cannot use root %llu for send while deduplications on it are in progress (%d in progress)",
		      btrfs_root_id(root), root->dedupe_in_progress);
}

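/*
 * Entry point for the send ioctl: validates the arguments, sets up the send
 * context and buffers, takes references on the send, parent and clone roots,
 * generates the stream and then releases everything again.
 */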
long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg)
{
	int ret = 0;
	struct btrfs_root *send_root = inode->root;
	struct btrfs_fs_info *fs_info = send_root->fs_info;
	struct btrfs_root *clone_root;
	struct send_ctx *sctx = NULL;
	u32 i;
	u64 *clone_sources_tmp = NULL;
	int clone_sources_to_rollback = 0;
	size_t alloc_size;
	int sort_clone_roots = 0;
	struct btrfs_lru_cache_entry *entry;
	struct btrfs_lru_cache_entry *tmp;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	/*
	 * The subvolume must remain read-only during send, protect against
	 * making it RW. This also protects against deletion.
	 */
	spin_lock(&send_root->root_item_lock);
	if (btrfs_root_readonly(send_root) && send_root->dedupe_in_progress) {
		dedupe_in_progress_warn(send_root);
		spin_unlock(&send_root->root_item_lock);
		return -EAGAIN;
	}
	send_root->send_in_progress++;
	spin_unlock(&send_root->root_item_lock);

	/*
	 * Userspace tools do the checks and warn the user if it's
	 * not RO.
	 */
	if (!btrfs_root_readonly(send_root)) {
		ret = -EPERM;
		goto out;
	}

	/*
	 * Check that we don't overflow at later allocations, we request
	 * clone_sources_count + 1 items, and compare to unsigned long inside
	 * access_ok. Also set an upper limit for allocation size so this can't
	 * easily exhaust memory. Max number of clone sources is about 200K.
	 */
	if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) {
		ret = -EINVAL;
		goto out;
	}

	if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
		ret = -EOPNOTSUPP;
		goto out;
	}

	sctx = kzalloc(sizeof(struct send_ctx), GFP_KERNEL);
	if (!sctx) {
		ret = -ENOMEM;
		goto out;
	}

	INIT_LIST_HEAD(&sctx->new_refs);
	INIT_LIST_HEAD(&sctx->deleted_refs);

	btrfs_lru_cache_init(&sctx->name_cache, SEND_MAX_NAME_CACHE_SIZE);
	btrfs_lru_cache_init(&sctx->backref_cache, SEND_MAX_BACKREF_CACHE_SIZE);
	btrfs_lru_cache_init(&sctx->dir_created_cache,
			     SEND_MAX_DIR_CREATED_CACHE_SIZE);
	/*
	 * This cache is periodically trimmed to a fixed size elsewhere, see
	 * cache_dir_utimes() and trim_dir_utimes_cache().
	 */
	btrfs_lru_cache_init(&sctx->dir_utimes_cache, 0);

	sctx->pending_dir_moves = RB_ROOT;
	sctx->waiting_dir_moves = RB_ROOT;
	sctx->orphan_dirs = RB_ROOT;
	sctx->rbtree_new_refs = RB_ROOT;
	sctx->rbtree_deleted_refs = RB_ROOT;

	sctx->flags = arg->flags;

	if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
		if (arg->version > BTRFS_SEND_STREAM_VERSION) {
			ret = -EPROTO;
			goto out;
		}
		/* Zero means "use the highest version" */
		sctx->proto = arg->version ?: BTRFS_SEND_STREAM_VERSION;
	} else {
		sctx->proto = 1;
	}
	if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) {
		ret = -EINVAL;
		goto out;
	}

	sctx->send_filp = fget(arg->send_fd);
	if (!sctx->send_filp || !(sctx->send_filp->f_mode & FMODE_WRITE)) {
		ret = -EBADF;
		goto out;
	}

	sctx->send_root = send_root;
	/*
	 * Unlikely but possible: if the subvolume is marked for deletion but
	 * is slow to remove the directory entry, send can still be started.
	 */
	if (btrfs_root_dead(sctx->send_root)) {
		ret = -EPERM;
		goto out;
	}

	sctx->clone_roots_cnt = arg->clone_sources_count;

	if (sctx->proto >= 2) {
		u32 send_buf_num_pages;

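		/*
		 * For stream v2 the send buffer is larger, so allocate it with
		 * vmalloc and remember the backing pages for later use.
		 */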
		sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2;
		sctx->send_buf = vmalloc(sctx->send_max_size);
		if (!sctx->send_buf) {
			ret = -ENOMEM;
			goto out;
		}
		send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT;
		sctx->send_buf_pages = kcalloc(send_buf_num_pages,
					       sizeof(*sctx->send_buf_pages),
					       GFP_KERNEL);
		if (!sctx->send_buf_pages) {
			ret = -ENOMEM;
			goto out;
		}
		for (i = 0; i < send_buf_num_pages; i++) {
			sctx->send_buf_pages[i] =
				vmalloc_to_page(sctx->send_buf + (i << PAGE_SHIFT));
		}
	} else {
		sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1;
		sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL);
	}
	if (!sctx->send_buf) {
		ret = -ENOMEM;
		goto out;
	}

	sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1,
				     sizeof(*sctx->clone_roots),
				     GFP_KERNEL);
	if (!sctx->clone_roots) {
		ret = -ENOMEM;
		goto out;
	}

	alloc_size = array_size(sizeof(*arg->clone_sources),
				arg->clone_sources_count);

	if (arg->clone_sources_count) {
		clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL);
		if (!clone_sources_tmp) {
			ret = -ENOMEM;
			goto out;
		}

		ret = copy_from_user(clone_sources_tmp, arg->clone_sources,
				alloc_size);
		if (ret) {
			ret = -EFAULT;
			goto out;
		}

		for (i = 0; i < arg->clone_sources_count; i++) {
			clone_root = btrfs_get_fs_root(fs_info,
						clone_sources_tmp[i], true);
			if (IS_ERR(clone_root)) {
				ret = PTR_ERR(clone_root);
				goto out;
			}
			spin_lock(&clone_root->root_item_lock);
			if (!btrfs_root_readonly(clone_root) ||
			    btrfs_root_dead(clone_root)) {
				spin_unlock(&clone_root->root_item_lock);
				btrfs_put_root(clone_root);
				ret = -EPERM;
				goto out;
			}
			if (clone_root->dedupe_in_progress) {
				dedupe_in_progress_warn(clone_root);
				spin_unlock(&clone_root->root_item_lock);
				btrfs_put_root(clone_root);
				ret = -EAGAIN;
				goto out;
			}
			clone_root->send_in_progress++;
			spin_unlock(&clone_root->root_item_lock);

			sctx->clone_roots[i].root = clone_root;
			clone_sources_to_rollback = i + 1;
		}
		kvfree(clone_sources_tmp);
		clone_sources_tmp = NULL;
	}

	if (arg->parent_root) {
		sctx->parent_root = btrfs_get_fs_root(fs_info, arg->parent_root,
						      true);
		if (IS_ERR(sctx->parent_root)) {
			ret = PTR_ERR(sctx->parent_root);
			goto out;
		}

		spin_lock(&sctx->parent_root->root_item_lock);
		sctx->parent_root->send_in_progress++;
		if (!btrfs_root_readonly(sctx->parent_root) ||
				btrfs_root_dead(sctx->parent_root)) {
			spin_unlock(&sctx->parent_root->root_item_lock);
			ret = -EPERM;
			goto out;
		}
		if (sctx->parent_root->dedupe_in_progress) {
			dedupe_in_progress_warn(sctx->parent_root);
			spin_unlock(&sctx->parent_root->root_item_lock);
			ret = -EAGAIN;
			goto out;
		}
		spin_unlock(&sctx->parent_root->root_item_lock);
	}

	/*
	 * Clones from send_root are allowed, but only if the clone source
	 * is behind the current send position. This is checked while searching
	 * for possible clone sources.
	 */
	sctx->clone_roots[sctx->clone_roots_cnt++].root =
		btrfs_grab_root(sctx->send_root);

	/* We do a bsearch later */
	sort(sctx->clone_roots, sctx->clone_roots_cnt,
			sizeof(*sctx->clone_roots), __clone_root_cmp_sort,
			NULL);
	sort_clone_roots = 1;

	ret = flush_delalloc_roots(sctx);
	if (ret)
		goto out;

	ret = ensure_commit_roots_uptodate(sctx);
	if (ret)
		goto out;

	ret = send_subvol(sctx);
	if (ret < 0)
		goto out;

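	/* Issue utimes commands for any directories left in the utimes cache. */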
	btrfs_lru_cache_for_each_entry_safe(&sctx->dir_utimes_cache, entry, tmp) {
		ret = send_utimes(sctx, entry->key, entry->gen);
		if (ret < 0)
			goto out;
		btrfs_lru_cache_remove(&sctx->dir_utimes_cache, entry);
	}

	if (!(sctx->flags & BTRFS_SEND_FLAG_OMIT_END_CMD)) {
		ret = begin_cmd(sctx, BTRFS_SEND_C_END);
		if (ret < 0)
			goto out;
		ret = send_cmd(sctx);
		if (ret < 0)
			goto out;
	}

out:
	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->pending_dir_moves));
	while (sctx && !RB_EMPTY_ROOT(&sctx->pending_dir_moves)) {
		struct rb_node *n;
		struct pending_dir_move *pm;

		n = rb_first(&sctx->pending_dir_moves);
		pm = rb_entry(n, struct pending_dir_move, node);
		while (!list_empty(&pm->list)) {
			struct pending_dir_move *pm2;

			pm2 = list_first_entry(&pm->list,
					       struct pending_dir_move, list);
			free_pending_move(sctx, pm2);
		}
		free_pending_move(sctx, pm);
	}

	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves));
	while (sctx && !RB_EMPTY_ROOT(&sctx->waiting_dir_moves)) {
		struct rb_node *n;
		struct waiting_dir_move *dm;

		n = rb_first(&sctx->waiting_dir_moves);
		dm = rb_entry(n, struct waiting_dir_move, node);
		rb_erase(&dm->node, &sctx->waiting_dir_moves);
		kfree(dm);
	}

	WARN_ON(sctx && !ret && !RB_EMPTY_ROOT(&sctx->orphan_dirs));
	while (sctx && !RB_EMPTY_ROOT(&sctx->orphan_dirs)) {
		struct rb_node *n;
		struct orphan_dir_info *odi;

		n = rb_first(&sctx->orphan_dirs);
		odi = rb_entry(n, struct orphan_dir_info, node);
		free_orphan_dir_info(sctx, odi);
	}

	if (sort_clone_roots) {
		for (i = 0; i < sctx->clone_roots_cnt; i++) {
			btrfs_root_dec_send_in_progress(
					sctx->clone_roots[i].root);
			btrfs_put_root(sctx->clone_roots[i].root);
		}
	} else {
		for (i = 0; sctx && i < clone_sources_to_rollback; i++) {
			btrfs_root_dec_send_in_progress(
					sctx->clone_roots[i].root);
			btrfs_put_root(sctx->clone_roots[i].root);
		}

		btrfs_root_dec_send_in_progress(send_root);
	}
	if (sctx && !IS_ERR_OR_NULL(sctx->parent_root)) {
		btrfs_root_dec_send_in_progress(sctx->parent_root);
		btrfs_put_root(sctx->parent_root);
	}

	kvfree(clone_sources_tmp);

	if (sctx) {
		if (sctx->send_filp)
			fput(sctx->send_filp);

		kvfree(sctx->clone_roots);
		kfree(sctx->send_buf_pages);
		kvfree(sctx->send_buf);
		kvfree(sctx->verity_descriptor);

		close_current_inode(sctx);

		btrfs_lru_cache_clear(&sctx->name_cache);
		btrfs_lru_cache_clear(&sctx->backref_cache);
		btrfs_lru_cache_clear(&sctx->dir_created_cache);
		btrfs_lru_cache_clear(&sctx->dir_utimes_cache);

		kfree(sctx);
	}

	return ret;
}