// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "debug.h"
#include "extents.h"
#include "journal.h"
#include "trace.h"

#include <linux/prefetch.h>

static inline bool is_btree_node(struct btree_iter *iter, unsigned l)
{
	return l < BTREE_MAX_DEPTH &&
		(unsigned long) iter->l[l].b >= 128;
}

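/*
 * For extents iterators the search key is the successor of iter->pos:
 * extent keys index the end of the extent, so this skips keys that end
 * exactly at pos and finds the first key overlapping it.
 */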
static inline struct bpos btree_iter_search_key(struct btree_iter *iter)
{
	struct bpos pos = iter->pos;

	if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
	    bkey_cmp(pos, POS_MAX))
		pos = bkey_successor(pos);
	return pos;
}

static inline bool btree_iter_pos_before_node(struct btree_iter *iter,
					      struct btree *b)
{
	return bkey_cmp(btree_iter_search_key(iter), b->data->min_key) < 0;
}

static inline bool btree_iter_pos_after_node(struct btree_iter *iter,
					     struct btree *b)
{
	return bkey_cmp(b->key.k.p, btree_iter_search_key(iter)) < 0;
}

static inline bool btree_iter_pos_in_node(struct btree_iter *iter,
					  struct btree *b)
{
	return iter->btree_id == b->c.btree_id &&
		!btree_iter_pos_before_node(iter, b) &&
		!btree_iter_pos_after_node(iter, b);
}

/* Btree node locking: */

void bch2_btree_node_unlock_write(struct btree *b, struct btree_iter *iter)
{
	bch2_btree_node_unlock_write_inlined(b, iter);
}

void __bch2_btree_node_lock_write(struct btree *b, struct btree_iter *iter)
{
	struct btree_iter *linked;
	unsigned readers = 0;

	EBUG_ON(!btree_node_intent_locked(iter, b->c.level));

	trans_for_each_iter(iter->trans, linked)
		if (linked->l[b->c.level].b == b &&
		    btree_node_read_locked(linked, b->c.level))
			readers++;

	/*
	 * Must drop our read locks before calling six_lock_write() -
	 * six_unlock() won't do wakeups until the reader count
	 * goes to 0, and it's safe because we have the node intent
	 * locked:
	 */
	atomic64_sub(__SIX_VAL(read_lock, readers),
		     &b->c.lock.state.counter);
	btree_node_lock_type(iter->trans->c, b, SIX_LOCK_write);
	atomic64_add(__SIX_VAL(read_lock, readers),
		     &b->c.lock.state.counter);
}

bool __bch2_btree_node_relock(struct btree_iter *iter, unsigned level)
{
	struct btree *b = btree_iter_node(iter, level);
	int want = __btree_lock_want(iter, level);

	if (!is_btree_node(iter, level))
		return false;

	if (race_fault())
		return false;

	if (six_relock_type(&b->c.lock, want, iter->l[level].lock_seq) ||
	    (btree_node_lock_seq_matches(iter, b, level) &&
	     btree_node_lock_increment(iter->trans, b, level, want))) {
		mark_btree_node_locked(iter, level, want);
		return true;
	} else {
		return false;
	}
}

static bool bch2_btree_node_upgrade(struct btree_iter *iter, unsigned level)
{
	struct btree *b = iter->l[level].b;

	EBUG_ON(btree_lock_want(iter, level) != BTREE_NODE_INTENT_LOCKED);

	if (!is_btree_node(iter, level))
		return false;

	if (btree_node_intent_locked(iter, level))
		return true;

	if (race_fault())
		return false;

	if (btree_node_locked(iter, level)
	    ? six_lock_tryupgrade(&b->c.lock)
	    : six_relock_type(&b->c.lock, SIX_LOCK_intent, iter->l[level].lock_seq))
		goto success;

	if (btree_node_lock_seq_matches(iter, b, level) &&
	    btree_node_lock_increment(iter->trans, b, level, BTREE_NODE_INTENT_LOCKED)) {
		btree_node_unlock(iter, level);
		goto success;
	}

	return false;
success:
	mark_btree_node_intent_locked(iter, level);
	return true;
}

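/*
 * Relock (or upgrade to intent) every node from iter->level up to
 * iter->locks_want; returns true if the iterator no longer needs to be
 * relocked or re-traversed.
 */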
static inline bool btree_iter_get_locks(struct btree_iter *iter,
					bool upgrade, bool trace)
{
	unsigned l = iter->level;
	int fail_idx = -1;

	do {
		if (!btree_iter_node(iter, l))
			break;

		if (!(upgrade
		      ? bch2_btree_node_upgrade(iter, l)
		      : bch2_btree_node_relock(iter, l))) {
			if (trace)
				(upgrade
				 ? trace_node_upgrade_fail
				 : trace_node_relock_fail)(l, iter->l[l].lock_seq,
						is_btree_node(iter, l)
						? 0
						: (unsigned long) iter->l[l].b,
						is_btree_node(iter, l)
						? iter->l[l].b->c.lock.state.seq
						: 0);

			fail_idx = l;
			btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
		}

		l++;
	} while (l < iter->locks_want);

	/*
	 * When we fail to get a lock, we have to ensure that any child nodes
	 * can't be relocked so bch2_btree_iter_traverse has to walk back up to
	 * the node that we failed to relock:
	 */
	while (fail_idx >= 0) {
		btree_node_unlock(iter, fail_idx);
		iter->l[fail_idx].b = BTREE_ITER_NO_NODE_GET_LOCKS;
		--fail_idx;
	}

	if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
		iter->uptodate = BTREE_ITER_NEED_PEEK;

	bch2_btree_trans_verify_locks(iter->trans);

	return iter->uptodate < BTREE_ITER_NEED_RELOCK;
}

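/* Position a btree node (or key cache entry) is indexed at: */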
static struct bpos btree_node_pos(struct btree_bkey_cached_common *_b,
				  enum btree_iter_type type)
{
	return  type != BTREE_ITER_CACHED
		? container_of(_b, struct btree, c)->key.k.p
		: container_of(_b, struct bkey_cached, c)->key.pos;
}

/* Slowpath: */
bool __bch2_btree_node_lock(struct btree *b, struct bpos pos,
			    unsigned level, struct btree_iter *iter,
			    enum six_lock_type type,
			    six_lock_should_sleep_fn should_sleep_fn, void *p,
			    unsigned long ip)
{
	struct btree_trans *trans = iter->trans;
	struct btree_iter *linked, *deadlock_iter = NULL;
	u64 start_time = local_clock();
	unsigned reason = 9;

	/* Check if it's safe to block: */
	trans_for_each_iter(trans, linked) {
		if (!linked->nodes_locked)
			continue;

		/*
		 * Can't block taking an intent lock if we have _any_ nodes read
		 * locked:
		 *
		 * - Our read lock blocks another thread with an intent lock on
		 *   the same node from getting a write lock, and thus from
		 *   dropping its intent lock
		 *
		 * - And the other thread may have multiple nodes intent locked:
		 *   both the node we want to intent lock, and the node we
		 *   already have read locked - deadlock:
		 */
		if (type == SIX_LOCK_intent &&
		    linked->nodes_locked != linked->nodes_intent_locked) {
			if (!(trans->nounlock)) {
				linked->locks_want = max_t(unsigned,
						linked->locks_want,
						__fls(linked->nodes_locked) + 1);
				if (!btree_iter_get_locks(linked, true, false)) {
					deadlock_iter = linked;
					reason = 1;
				}
			} else {
				deadlock_iter = linked;
				reason = 2;
			}
		}

		/*
		 * Interior nodes must be locked before their descendants: if
		 * another iterator might have locked a descendant of the node
		 * we're about to lock, it must have the ancestors locked too:
		 */
		if (linked->btree_id == iter->btree_id &&
		    btree_iter_is_cached(linked) == btree_iter_is_cached(iter) &&
		    level > __fls(linked->nodes_locked)) {
			if (!(trans->nounlock)) {
				linked->locks_want =
					max(level + 1, max_t(unsigned,
					    linked->locks_want,
					    iter->locks_want));
				if (!btree_iter_get_locks(linked, true, false)) {
					deadlock_iter = linked;
					reason = 3;
				}
			} else {
				deadlock_iter = linked;
				reason = 4;
			}
		}

		/* Must lock btree nodes in key order: */
		if ((cmp_int(iter->btree_id, linked->btree_id) ?:
		     -cmp_int(btree_iter_type(iter), btree_iter_type(linked))) < 0) {
			deadlock_iter = linked;
			reason = 5;
		}

		if (iter->btree_id == linked->btree_id &&
		    btree_node_locked(linked, level) &&
		    bkey_cmp(pos, btree_node_pos((void *) linked->l[level].b,
						 btree_iter_type(linked))) <= 0) {
			deadlock_iter = linked;
			reason = 6;
		}

		/*
		 * Recheck if this is a node we already have locked - since one
		 * of the get_locks() calls might've successfully
		 * upgraded/relocked it:
		 */
		if (linked->l[level].b == b &&
		    btree_node_locked_type(linked, level) >= type) {
			six_lock_increment(&b->c.lock, type);
			return true;
		}
	}

	if (unlikely(deadlock_iter)) {
		trace_trans_restart_would_deadlock(iter->trans->ip, ip,
				reason,
				deadlock_iter->btree_id,
				btree_iter_type(deadlock_iter),
				iter->btree_id,
				btree_iter_type(iter));
		return false;
	}

	if (six_trylock_type(&b->c.lock, type))
		return true;

	if (six_lock_type(&b->c.lock, type, should_sleep_fn, p))
		return false;

	bch2_time_stats_update(&trans->c->times[lock_to_time_stat(type)],
			       start_time);
	return true;
}

/* Btree iterator locking: */

#ifdef CONFIG_BCACHEFS_DEBUG
static void bch2_btree_iter_verify_locks(struct btree_iter *iter)
{
	unsigned l;

	if (!(iter->trans->iters_linked & (1ULL << iter->idx))) {
		BUG_ON(iter->nodes_locked);
		return;
	}

	for (l = 0; is_btree_node(iter, l); l++) {
		if (iter->uptodate >= BTREE_ITER_NEED_RELOCK &&
		    !btree_node_locked(iter, l))
			continue;

		BUG_ON(btree_lock_want(iter, l) !=
		       btree_node_locked_type(iter, l));
	}
}

void bch2_btree_trans_verify_locks(struct btree_trans *trans)
{
	struct btree_iter *iter;

	trans_for_each_iter_all(trans, iter)
		bch2_btree_iter_verify_locks(iter);
}
#else
static inline void bch2_btree_iter_verify_locks(struct btree_iter *iter) {}
#endif

__flatten
bool bch2_btree_iter_relock(struct btree_iter *iter, bool trace)
{
	return btree_iter_get_locks(iter, false, trace);
}

bool __bch2_btree_iter_upgrade(struct btree_iter *iter,
			       unsigned new_locks_want)
{
	struct btree_iter *linked;

	EBUG_ON(iter->locks_want >= new_locks_want);

	iter->locks_want = new_locks_want;

	if (btree_iter_get_locks(iter, true, true))
		return true;

	/*
	 * Ancestor nodes must be locked before child nodes, so set locks_want
	 * on iterators that might lock ancestors before us to avoid getting
	 * -EINTR later:
	 */
	trans_for_each_iter(iter->trans, linked)
		if (linked != iter &&
		    linked->btree_id == iter->btree_id &&
		    linked->locks_want < new_locks_want) {
			linked->locks_want = new_locks_want;
			btree_iter_get_locks(linked, true, false);
		}

	return false;
}

bool __bch2_btree_iter_upgrade_nounlock(struct btree_iter *iter,
					unsigned new_locks_want)
{
	unsigned l = iter->level;

	EBUG_ON(iter->locks_want >= new_locks_want);

	iter->locks_want = new_locks_want;

	do {
		if (!btree_iter_node(iter, l))
			break;

		if (!bch2_btree_node_upgrade(iter, l)) {
			iter->locks_want = l;
			return false;
		}

		l++;
	} while (l < iter->locks_want);

	return true;
}

void __bch2_btree_iter_downgrade(struct btree_iter *iter,
				 unsigned downgrade_to)
{
	unsigned l, new_locks_want = downgrade_to ?:
		(iter->flags & BTREE_ITER_INTENT ? 1 : 0);

	if (iter->locks_want < downgrade_to) {
		iter->locks_want = new_locks_want;

		while (iter->nodes_locked &&
		       (l = __fls(iter->nodes_locked)) >= iter->locks_want) {
			if (l > iter->level) {
				btree_node_unlock(iter, l);
			} else {
				if (btree_node_intent_locked(iter, l)) {
					six_lock_downgrade(&iter->l[l].b->c.lock);
					iter->nodes_intent_locked ^= 1 << l;
				}
				break;
			}
		}
	}

	bch2_btree_trans_verify_locks(iter->trans);
}

void bch2_trans_downgrade(struct btree_trans *trans)
{
	struct btree_iter *iter;

	trans_for_each_iter(trans, iter)
		bch2_btree_iter_downgrade(iter);
}

/* Btree transaction locking: */

bool bch2_trans_relock(struct btree_trans *trans)
{
	struct btree_iter *iter;
	bool ret = true;

	trans_for_each_iter(trans, iter)
		if (iter->uptodate == BTREE_ITER_NEED_RELOCK)
			ret &= bch2_btree_iter_relock(iter, true);

	return ret;
}

void bch2_trans_unlock(struct btree_trans *trans)
{
	struct btree_iter *iter;

	trans_for_each_iter(trans, iter)
		__bch2_btree_iter_unlock(iter);
}

/* Btree iterator: */

#ifdef CONFIG_BCACHEFS_DEBUG

static void bch2_btree_iter_verify_cached(struct btree_iter *iter)
{
	struct bkey_cached *ck;
	bool locked = btree_node_locked(iter, 0);

	if (!bch2_btree_node_relock(iter, 0))
		return;

	ck = (void *) iter->l[0].b;
	BUG_ON(ck->key.btree_id != iter->btree_id ||
	       bkey_cmp(ck->key.pos, iter->pos));

	if (!locked)
		btree_node_unlock(iter, 0);
}

static void bch2_btree_iter_verify_level(struct btree_iter *iter,
					 unsigned level)
{
	struct bpos pos = btree_iter_search_key(iter);
	struct btree_iter_level *l = &iter->l[level];
	struct btree_node_iter tmp = l->iter;
	bool locked = btree_node_locked(iter, level);
	struct bkey_packed *p, *k;
	char buf1[100], buf2[100];
	const char *msg;

	if (!bch2_debug_check_iterators)
		return;

	if (btree_iter_type(iter) == BTREE_ITER_CACHED) {
		if (!level)
			bch2_btree_iter_verify_cached(iter);
		return;
	}

	BUG_ON(iter->level < iter->min_depth);

	if (!btree_iter_node(iter, level))
		return;

	if (!bch2_btree_node_relock(iter, level))
		return;

	/*
	 * Ideally this invariant would always be true, and hopefully in the
	 * future it will be, but for now set_pos_same_leaf() breaks it:
	 */
	BUG_ON(iter->uptodate < BTREE_ITER_NEED_TRAVERSE &&
	       !btree_iter_pos_in_node(iter, l->b));

	/*
	 * node iterators don't use leaf node iterator:
	 */
	if (btree_iter_type(iter) == BTREE_ITER_NODES &&
	    level <= iter->min_depth)
		goto unlock;

	bch2_btree_node_iter_verify(&l->iter, l->b);

	/*
	 * For interior nodes, the iterator will have skipped past
	 * deleted keys:
	 *
	 * For extents, the iterator may have skipped past deleted keys (but not
	 * whiteouts)
	 */
	p = level || btree_node_type_is_extents(iter->btree_id)
		? bch2_btree_node_iter_prev_filter(&tmp, l->b, KEY_TYPE_discard)
		: bch2_btree_node_iter_prev_all(&tmp, l->b);
	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);

	if (p && bkey_iter_pos_cmp(l->b, p, &pos) >= 0) {
		msg = "before";
		goto err;
	}

	if (k && bkey_iter_pos_cmp(l->b, k, &pos) < 0) {
		msg = "after";
		goto err;
	}
unlock:
	if (!locked)
		btree_node_unlock(iter, level);
	return;
err:
	strcpy(buf1, "(none)");
	strcpy(buf2, "(none)");

	if (p) {
		struct bkey uk = bkey_unpack_key(l->b, p);
		bch2_bkey_to_text(&PBUF(buf1), &uk);
	}

	if (k) {
		struct bkey uk = bkey_unpack_key(l->b, k);
		bch2_bkey_to_text(&PBUF(buf2), &uk);
	}

	panic("iterator should be %s key at level %u:\n"
	      "iter pos %s %llu:%llu\n"
	      "prev key %s\n"
	      "cur  key %s\n",
	      msg, level,
	      iter->flags & BTREE_ITER_IS_EXTENTS ? ">" : "=>",
	      iter->pos.inode, iter->pos.offset,
	      buf1, buf2);
}

static void bch2_btree_iter_verify(struct btree_iter *iter)
{
	unsigned i;

	bch2_btree_trans_verify_locks(iter->trans);

	for (i = 0; i < BTREE_MAX_DEPTH; i++)
		bch2_btree_iter_verify_level(iter, i);
}

void bch2_btree_trans_verify_iters(struct btree_trans *trans, struct btree *b)
{
	struct btree_iter *iter;

	if (!bch2_debug_check_iterators)
		return;

	trans_for_each_iter_with_node(trans, b, iter)
		bch2_btree_iter_verify_level(iter, b->c.level);
}

#else

static inline void bch2_btree_iter_verify_level(struct btree_iter *iter, unsigned l) {}
static inline void bch2_btree_iter_verify(struct btree_iter *iter) {}

#endif

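/*
 * Repoint the node iterator's entry for bset @t at key @k, adding an entry
 * if the iterator didn't have one for this bset:
 */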
static void btree_node_iter_set_set_pos(struct btree_node_iter *iter,
					struct btree *b,
					struct bset_tree *t,
					struct bkey_packed *k)
{
	struct btree_node_iter_set *set;

	btree_node_iter_for_each(iter, set)
		if (set->end == t->end_offset) {
			set->k = __btree_node_key_to_offset(b, k);
			bch2_btree_node_iter_sort(iter, b);
			return;
		}

	bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t));
}

static void __bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
					       struct btree *b,
					       struct bkey_packed *where)
{
	struct btree_iter_level *l = &iter->l[b->c.level];
	struct bpos pos = btree_iter_search_key(iter);

	if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b))
		return;

	if (bkey_iter_pos_cmp(l->b, where, &pos) < 0)
		bch2_btree_node_iter_advance(&l->iter, l->b);

	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}

void bch2_btree_iter_fix_key_modified(struct btree_iter *iter,
				      struct btree *b,
				      struct bkey_packed *where)
{
	struct btree_iter *linked;

	trans_for_each_iter_with_node(iter->trans, b, linked) {
		__bch2_btree_iter_fix_key_modified(linked, b, where);
		bch2_btree_iter_verify_level(linked, b->c.level);
	}
}

static void __bch2_btree_node_iter_fix(struct btree_iter *iter,
				      struct btree *b,
				      struct btree_node_iter *node_iter,
				      struct bset_tree *t,
				      struct bkey_packed *where,
				      unsigned clobber_u64s,
				      unsigned new_u64s)
{
	const struct bkey_packed *end = btree_bkey_last(b, t);
	struct btree_node_iter_set *set;
	unsigned offset = __btree_node_key_to_offset(b, where);
	int shift = new_u64s - clobber_u64s;
	unsigned old_end = t->end_offset - shift;
	unsigned orig_iter_pos = node_iter->data[0].k;
	bool iter_current_key_modified =
		orig_iter_pos >= offset &&
		orig_iter_pos <= offset + clobber_u64s;
	struct bpos iter_pos = btree_iter_search_key(iter);

	btree_node_iter_for_each(node_iter, set)
		if (set->end == old_end)
			goto found;

	/* didn't find the bset in the iterator - might have to readd it: */
	if (new_u64s &&
	    bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) {
		bch2_btree_node_iter_push(node_iter, b, where, end);
		goto fixup_done;
	} else {
		/* Iterator is after key that changed */
		return;
	}
found:
	set->end = t->end_offset;

	/* Iterator hasn't gotten to the key that changed yet: */
	if (set->k < offset)
		return;

	if (new_u64s &&
	    bkey_iter_pos_cmp(b, where, &iter_pos) >= 0) {
		set->k = offset;
	} else if (set->k < offset + clobber_u64s) {
		set->k = offset + new_u64s;
		if (set->k == set->end)
			bch2_btree_node_iter_set_drop(node_iter, set);
	} else {
		/* Iterator is after key that changed */
		set->k = (int) set->k + shift;
		return;
	}

	bch2_btree_node_iter_sort(node_iter, b);
fixup_done:
	if (node_iter->data[0].k != orig_iter_pos)
		iter_current_key_modified = true;

	/*
	 * When a new key is added, and the node iterator now points to that
	 * key, the iterator might have skipped past deleted keys that should
	 * come after the key the iterator now points to. We have to rewind to
	 * before those deleted keys - otherwise
	 * bch2_btree_node_iter_prev_all() breaks:
	 */
	if (!bch2_btree_node_iter_end(node_iter) &&
	    iter_current_key_modified &&
	    (b->c.level ||
	     btree_node_type_is_extents(iter->btree_id))) {
		struct bset_tree *t;
		struct bkey_packed *k, *k2, *p;

		k = bch2_btree_node_iter_peek_all(node_iter, b);

		for_each_bset(b, t) {
			bool set_pos = false;

			if (node_iter->data[0].end == t->end_offset)
				continue;

			k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t);

			while ((p = bch2_bkey_prev_all(b, t, k2)) &&
			       bkey_iter_cmp(b, k, p) < 0) {
				k2 = p;
				set_pos = true;
			}

			if (set_pos)
				btree_node_iter_set_set_pos(node_iter,
							    b, t, k2);
		}
	}

	if (!b->c.level &&
	    node_iter == &iter->l[0].iter &&
	    iter_current_key_modified)
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}

void bch2_btree_node_iter_fix(struct btree_iter *iter,
			      struct btree *b,
			      struct btree_node_iter *node_iter,
			      struct bkey_packed *where,
			      unsigned clobber_u64s,
			      unsigned new_u64s)
{
	struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
	struct btree_iter *linked;

	if (node_iter != &iter->l[b->c.level].iter) {
		__bch2_btree_node_iter_fix(iter, b, node_iter, t,
					   where, clobber_u64s, new_u64s);

		if (bch2_debug_check_iterators)
			bch2_btree_node_iter_verify(node_iter, b);
	}

	trans_for_each_iter_with_node(iter->trans, b, linked) {
		__bch2_btree_node_iter_fix(linked, b,
					   &linked->l[b->c.level].iter, t,
					   where, clobber_u64s, new_u64s);
		bch2_btree_iter_verify_level(linked, b->c.level);
	}
}

static inline struct bkey_s_c __btree_iter_unpack(struct btree_iter *iter,
						  struct btree_iter_level *l,
						  struct bkey *u,
						  struct bkey_packed *k)
{
	struct bkey_s_c ret;

	if (unlikely(!k)) {
		/*
		 * signal to bch2_btree_iter_peek_slot() that we're currently at
		 * a hole
		 */
		u->type = KEY_TYPE_deleted;
		return bkey_s_c_null;
	}

	ret = bkey_disassemble(l->b, k, u);

	if (bch2_debug_check_bkeys)
		bch2_bkey_debugcheck(iter->trans->c, l->b, ret);

	return ret;
}

/* peek_all() doesn't skip deleted keys */
static inline struct bkey_s_c __btree_iter_peek_all(struct btree_iter *iter,
						    struct btree_iter_level *l,
						    struct bkey *u)
{
	return __btree_iter_unpack(iter, l, u,
			bch2_btree_node_iter_peek_all(&l->iter, l->b));
}

static inline struct bkey_s_c __btree_iter_peek(struct btree_iter *iter,
						struct btree_iter_level *l)
{
	return __btree_iter_unpack(iter, l, &iter->k,
			bch2_btree_node_iter_peek(&l->iter, l->b));
}

static inline struct bkey_s_c __btree_iter_prev(struct btree_iter *iter,
						struct btree_iter_level *l)
{
	return __btree_iter_unpack(iter, l, &iter->k,
			bch2_btree_node_iter_prev(&l->iter, l->b));
}

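/*
 * Advance the node iterator past keys that sort before the search position;
 * returns false if that would take more than @max_advance steps
 * (@max_advance <= 0 means no limit):
 */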
static inline bool btree_iter_advance_to_pos(struct btree_iter *iter,
					     struct btree_iter_level *l,
					     int max_advance)
{
	struct bpos pos = btree_iter_search_key(iter);
	struct bkey_packed *k;
	int nr_advanced = 0;

	while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) &&
	       bkey_iter_pos_cmp(l->b, k, &pos) < 0) {
		if (max_advance > 0 && nr_advanced >= max_advance)
			return false;

		bch2_btree_node_iter_advance(&l->iter, l->b);
		nr_advanced++;
	}

	return true;
}

/*
 * Verify that iterator for parent node points to child node:
 */
static void btree_iter_verify_new_node(struct btree_iter *iter, struct btree *b)
{
	struct btree_iter_level *l;
	unsigned plevel;
	bool parent_locked;
	struct bkey_packed *k;

	if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
		return;

	plevel = b->c.level + 1;
	if (!btree_iter_node(iter, plevel))
		return;

	parent_locked = btree_node_locked(iter, plevel);

	if (!bch2_btree_node_relock(iter, plevel))
		return;

	l = &iter->l[plevel];
	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
	if (!k ||
	    bkey_deleted(k) ||
	    bkey_cmp_left_packed(l->b, k, &b->key.k.p)) {
		char buf[100];
		struct bkey uk = bkey_unpack_key(b, k);

		bch2_bkey_to_text(&PBUF(buf), &uk);
		panic("parent iter doesn't point to new node:\n%s\n%llu:%llu\n",
		      buf, b->key.k.p.inode, b->key.k.p.offset);
	}

	if (!parent_locked)
		btree_node_unlock(iter, b->c.level + 1);
}

static inline void __btree_iter_init(struct btree_iter *iter,
				     unsigned level)
{
	struct bpos pos = btree_iter_search_key(iter);
	struct btree_iter_level *l = &iter->l[level];

	bch2_btree_node_iter_init(&l->iter, l->b, &pos);

	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}

static inline void btree_iter_node_set(struct btree_iter *iter,
				       struct btree *b)
{
	BUG_ON(btree_iter_type(iter) == BTREE_ITER_CACHED);

	btree_iter_verify_new_node(iter, b);

	EBUG_ON(!btree_iter_pos_in_node(iter, b));
	EBUG_ON(b->c.lock.state.seq & 1);

	iter->l[b->c.level].lock_seq = b->c.lock.state.seq;
	iter->l[b->c.level].b = b;
	__btree_iter_init(iter, b->c.level);
}

/*
 * A btree node is being replaced - update the iterator to point to the new
 * node:
 */
void bch2_btree_iter_node_replace(struct btree_iter *iter, struct btree *b)
{
	enum btree_node_locked_type t;
	struct btree_iter *linked;

	trans_for_each_iter(iter->trans, linked)
		if (btree_iter_type(linked) != BTREE_ITER_CACHED &&
		    btree_iter_pos_in_node(linked, b)) {
			/*
			 * bch2_btree_iter_node_drop() has already been called -
			 * the old node we're replacing has already been
			 * unlocked and the pointer invalidated
			 */
			BUG_ON(btree_node_locked(linked, b->c.level));

			t = btree_lock_want(linked, b->c.level);
			if (t != BTREE_NODE_UNLOCKED) {
				six_lock_increment(&b->c.lock, (enum six_lock_type) t);
				mark_btree_node_locked(linked, b->c.level, (enum six_lock_type) t);
			}

			btree_iter_node_set(linked, b);
		}
}

void bch2_btree_iter_node_drop(struct btree_iter *iter, struct btree *b)
{
	struct btree_iter *linked;
	unsigned level = b->c.level;

	trans_for_each_iter(iter->trans, linked)
		if (linked->l[level].b == b) {
			__btree_node_unlock(linked, level);
			linked->l[level].b = BTREE_ITER_NO_NODE_DROP;
		}
}

/*
 * A btree node has been modified in such a way as to invalidate iterators - fix
 * them:
 */
void bch2_btree_iter_reinit_node(struct btree_iter *iter, struct btree *b)
{
	struct btree_iter *linked;

	trans_for_each_iter_with_node(iter->trans, b, linked)
		__btree_iter_init(linked, b->c.level);
}

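/*
 * six_lock should_sleep_fn used when taking the root lock: abort the lock
 * attempt if the root was replaced while we were waiting for it:
 */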
static int lock_root_check_fn(struct six_lock *lock, void *p)
{
	struct btree *b = container_of(lock, struct btree, c.lock);
	struct btree **rootp = p;

	return b == *rootp ? 0 : -1;
}

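/*
 * Lock the btree root: the root can be freed and replaced while we're
 * waiting for the lock, so loop until we've locked the node that is still
 * the current root:
 */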
static inline int btree_iter_lock_root(struct btree_iter *iter,
				       unsigned depth_want,
				       unsigned long trace_ip)
{
	struct bch_fs *c = iter->trans->c;
	struct btree *b, **rootp = &c->btree_roots[iter->btree_id].b;
	enum six_lock_type lock_type;
	unsigned i;

	EBUG_ON(iter->nodes_locked);

	while (1) {
		b = READ_ONCE(*rootp);
		iter->level = READ_ONCE(b->c.level);

		if (unlikely(iter->level < depth_want)) {
			/*
			 * the root is at a lower depth than the depth we want:
			 * got to the end of the btree, or we're walking nodes
			 * greater than some depth and there are no nodes >=
			 * that depth
			 */
			iter->level = depth_want;
			for (i = iter->level; i < BTREE_MAX_DEPTH; i++)
				iter->l[i].b = NULL;
			return 1;
		}

		lock_type = __btree_lock_want(iter, iter->level);
		if (unlikely(!btree_node_lock(b, POS_MAX, iter->level,
					      iter, lock_type,
					      lock_root_check_fn, rootp,
					      trace_ip)))
			return -EINTR;

		if (likely(b == READ_ONCE(*rootp) &&
			   b->c.level == iter->level &&
			   !race_fault())) {
			for (i = 0; i < iter->level; i++)
				iter->l[i].b = BTREE_ITER_NO_NODE_LOCK_ROOT;
			iter->l[iter->level].b = b;
			for (i = iter->level + 1; i < BTREE_MAX_DEPTH; i++)
				iter->l[i].b = NULL;

			mark_btree_node_locked(iter, iter->level, lock_type);
			btree_iter_node_set(iter, b);
			return 0;
		}

		six_unlock_type(&b->c.lock, lock_type);
	}
}

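/*
 * Prefetch the next few child nodes we're likely to descend into
 * (BTREE_ITER_PREFETCH); how many depends on the level and on whether the
 * filesystem has finished starting up:
 */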
noinline
static void btree_iter_prefetch(struct btree_iter *iter)
{
	struct bch_fs *c = iter->trans->c;
	struct btree_iter_level *l = &iter->l[iter->level];
	struct btree_node_iter node_iter = l->iter;
	struct bkey_packed *k;
	BKEY_PADDED(k) tmp;
	unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
		? (iter->level > 1 ? 0 :  2)
		: (iter->level > 1 ? 1 : 16);
	bool was_locked = btree_node_locked(iter, iter->level);

	while (nr) {
		if (!bch2_btree_node_relock(iter, iter->level))
			return;

		bch2_btree_node_iter_advance(&node_iter, l->b);
		k = bch2_btree_node_iter_peek(&node_iter, l->b);
		if (!k)
			break;

		bch2_bkey_unpack(l->b, &tmp.k, k);
		bch2_btree_node_prefetch(c, iter, &tmp.k, iter->level - 1);
	}

	if (!was_locked)
		btree_node_unlock(iter, iter->level);
}

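/*
 * Stash the in-memory address of the child node we just read in the
 * parent's btree_ptr_v2 key (mem_ptr), as a hint for future lookups:
 */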
static noinline void btree_node_mem_ptr_set(struct btree_iter *iter,
					    unsigned plevel, struct btree *b)
{
	struct btree_iter_level *l = &iter->l[plevel];
	bool locked = btree_node_locked(iter, plevel);
	struct bkey_packed *k;
	struct bch_btree_ptr_v2 *bp;

	if (!bch2_btree_node_relock(iter, plevel))
		return;

	k = bch2_btree_node_iter_peek_all(&l->iter, l->b);
	BUG_ON(k->type != KEY_TYPE_btree_ptr_v2);

	bp = (void *) bkeyp_val(&l->b->format, k);
	bp->mem_ptr = (unsigned long)b;

	if (!locked)
		btree_node_unlock(iter, plevel);
}

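/*
 * Descend one level: unpack the key for the child node at the current
 * position, lock and fetch that node, and make it the iterator's current
 * node:
 */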
static __always_inline int btree_iter_down(struct btree_iter *iter,
					   unsigned long trace_ip)
{
	struct bch_fs *c = iter->trans->c;
	struct btree_iter_level *l = &iter->l[iter->level];
	struct btree *b;
	unsigned level = iter->level - 1;
	enum six_lock_type lock_type = __btree_lock_want(iter, level);
	BKEY_PADDED(k) tmp;

	EBUG_ON(!btree_node_locked(iter, iter->level));

	bch2_bkey_unpack(l->b, &tmp.k,
			 bch2_btree_node_iter_peek(&l->iter, l->b));

	b = bch2_btree_node_get(c, iter, &tmp.k, level, lock_type, trace_ip);
	if (unlikely(IS_ERR(b)))
		return PTR_ERR(b);

	mark_btree_node_locked(iter, level, lock_type);
	btree_iter_node_set(iter, b);

	if (tmp.k.k.type == KEY_TYPE_btree_ptr_v2 &&
	    unlikely(b != btree_node_mem_ptr(&tmp.k)))
		btree_node_mem_ptr_set(iter, level + 1, b);

	if (iter->flags & BTREE_ITER_PREFETCH)
		btree_iter_prefetch(iter);

	iter->level = level;

	return 0;
}

static void btree_iter_up(struct btree_iter *iter)
{
	btree_node_unlock(iter, iter->level++);
}

static int btree_iter_traverse_one(struct btree_iter *, unsigned long);

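/*
 * Slowpath for when a traversal fails with a lock restart: sort all of the
 * transaction's iterators into lock order, drop every lock, then traverse
 * them again in that order:
 */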
static int __btree_iter_traverse_all(struct btree_trans *trans, int ret)
{
	struct bch_fs *c = trans->c;
	struct btree_iter *iter;
	u8 sorted[BTREE_ITER_MAX];
	unsigned i, nr_sorted = 0;

	if (trans->in_traverse_all)
		return -EINTR;

	trans->in_traverse_all = true;
retry_all:
	nr_sorted = 0;

	trans_for_each_iter(trans, iter)
		sorted[nr_sorted++] = iter->idx;

#define btree_iter_cmp_by_idx(_l, _r)				\
		btree_iter_lock_cmp(&trans->iters[_l], &trans->iters[_r])

	bubble_sort(sorted, nr_sorted, btree_iter_cmp_by_idx);
#undef btree_iter_cmp_by_idx
	bch2_trans_unlock(trans);
	cond_resched();

	if (unlikely(ret == -ENOMEM)) {
		struct closure cl;

		closure_init_stack(&cl);

		do {
			ret = bch2_btree_cache_cannibalize_lock(c, &cl);
			closure_sync(&cl);
		} while (ret);
	}

	if (unlikely(ret == -EIO)) {
		trans->error = true;
		goto out;
	}

	BUG_ON(ret && ret != -EINTR);

	/* Now, redo traversals in correct order: */
	for (i = 0; i < nr_sorted; i++) {
		unsigned idx = sorted[i];

		/*
		 * successfully traversing one iterator can cause another to be
		 * unlinked, in btree_key_cache_fill()
		 */
		if (!(trans->iters_linked & (1ULL << idx)))
			continue;

		ret = btree_iter_traverse_one(&trans->iters[idx], _THIS_IP_);
		if (ret)
			goto retry_all;
	}

	if (hweight64(trans->iters_live) > 1)
		ret = -EINTR;
	else
		trans_for_each_iter(trans, iter)
			if (iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT) {
				ret = -EINTR;
				break;
			}
out:
	bch2_btree_cache_cannibalize_unlock(c);

	trans->in_traverse_all = false;
	return ret;
}

int bch2_btree_iter_traverse_all(struct btree_trans *trans)
{
	return __btree_iter_traverse_all(trans, 0);
}

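/*
 * A "good" node is one we can relock and whose range covers the iterator's
 * search position (in the direction indicated by @check_pos):
 */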
static inline bool btree_iter_good_node(struct btree_iter *iter,
					unsigned l, int check_pos)
{
	if (!is_btree_node(iter, l) ||
	    !bch2_btree_node_relock(iter, l))
		return false;

	if (check_pos <= 0 && btree_iter_pos_before_node(iter, iter->l[l].b))
		return false;
	if (check_pos >= 0 && btree_iter_pos_after_node(iter, iter->l[l].b))
		return false;
	return true;
}

static inline unsigned btree_iter_up_until_good_node(struct btree_iter *iter,
						     int check_pos)
{
	unsigned l = iter->level;

	while (btree_iter_node(iter, l) &&
	       !btree_iter_good_node(iter, l, check_pos)) {
		btree_node_unlock(iter, l);
		iter->l[l].b = BTREE_ITER_NO_NODE_UP;
		l++;
	}

	return l;
}

/*
 * This is the main state machine for walking down the btree - walks down to a
 * specified depth
 *
 * Returns 0 on success, -EIO on error (error reading in a btree node).
 *
 * On error, caller (peek_node()/peek_key()) must return NULL; the error is
 * stashed in the iterator and returned from bch2_trans_exit().
 */
static int btree_iter_traverse_one(struct btree_iter *iter,
				   unsigned long trace_ip)
{
	unsigned depth_want = iter->level;

	/*
	 * if we need interior nodes locked, call btree_iter_relock() to make
	 * sure we walk back up enough that we lock them:
	 */
	if (iter->uptodate == BTREE_ITER_NEED_RELOCK ||
	    iter->locks_want > 1)
		bch2_btree_iter_relock(iter, false);

	if (btree_iter_type(iter) == BTREE_ITER_CACHED)
		return bch2_btree_iter_traverse_cached(iter);

	if (iter->uptodate < BTREE_ITER_NEED_RELOCK)
		return 0;

	if (unlikely(iter->level >= BTREE_MAX_DEPTH))
		return 0;

	/*
	 * XXX: correctly using BTREE_ITER_UPTODATE should make using check_pos
	 * here unnecessary
	 */
	iter->level = btree_iter_up_until_good_node(iter, 0);

	/*
	 * If we've got a btree node locked (i.e. we aren't about to relock the
	 * root) - advance its node iterator if necessary:
	 *
	 * XXX correctly using BTREE_ITER_UPTODATE should make this unnecessary
	 */
	if (is_btree_node(iter, iter->level)) {
		BUG_ON(!btree_iter_pos_in_node(iter, iter->l[iter->level].b));

		btree_iter_advance_to_pos(iter, &iter->l[iter->level], -1);
	}

	/*
	 * Note: iter->nodes[iter->level] may be temporarily NULL here - that
	 * would indicate to other code that we got to the end of the btree,
	 * here it indicates that relocking the root failed - it's critical that
	 * btree_iter_lock_root() comes next and that it can't fail
	 */
	while (iter->level > depth_want) {
		int ret = btree_iter_node(iter, iter->level)
			? btree_iter_down(iter, trace_ip)
			: btree_iter_lock_root(iter, depth_want, trace_ip);
		if (unlikely(ret)) {
			if (ret == 1)
				return 0;

			iter->level = depth_want;

			if (ret == -EIO) {
				iter->flags |= BTREE_ITER_ERROR;
				iter->l[iter->level].b =
					BTREE_ITER_NO_NODE_ERROR;
			} else {
				iter->l[iter->level].b =
					BTREE_ITER_NO_NODE_DOWN;
			}
			return ret;
		}
	}

	iter->uptodate = BTREE_ITER_NEED_PEEK;

	bch2_btree_iter_verify(iter);
	return 0;
}

int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter)
{
	struct btree_trans *trans = iter->trans;
	int ret;

	ret =   bch2_trans_cond_resched(trans) ?:
		btree_iter_traverse_one(iter, _RET_IP_);
	if (unlikely(ret))
		ret = __btree_iter_traverse_all(trans, ret);

	return ret;
}

static inline void bch2_btree_iter_checks(struct btree_iter *iter)
{
	enum btree_iter_type type = btree_iter_type(iter);

	EBUG_ON(iter->btree_id >= BTREE_ID_NR);

	BUG_ON((type == BTREE_ITER_KEYS ||
		type == BTREE_ITER_CACHED) &&
	       (bkey_cmp(iter->pos, bkey_start_pos(&iter->k)) < 0 ||
		bkey_cmp(iter->pos, iter->k.p) > 0));

	bch2_btree_iter_verify_locks(iter);
	bch2_btree_iter_verify_level(iter, iter->level);
}

/* Iterate across nodes (leaf and interior nodes) */

struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
{
	struct btree *b;
	int ret;

	EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
	bch2_btree_iter_checks(iter);

	if (iter->uptodate == BTREE_ITER_UPTODATE)
		return iter->l[iter->level].b;

	ret = bch2_btree_iter_traverse(iter);
	if (ret)
		return NULL;

	b = btree_iter_node(iter, iter->level);
	if (!b)
		return NULL;

	BUG_ON(bkey_cmp(b->key.k.p, iter->pos) < 0);

	iter->pos = b->key.k.p;
	iter->uptodate = BTREE_ITER_UPTODATE;

	bch2_btree_iter_verify(iter);

	return b;
}

struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
{
	struct btree *b;
	int ret;

	EBUG_ON(btree_iter_type(iter) != BTREE_ITER_NODES);
	bch2_btree_iter_checks(iter);

	/* already got to end? */
	if (!btree_iter_node(iter, iter->level))
		return NULL;

	bch2_trans_cond_resched(iter->trans);

	btree_iter_up(iter);

	if (!bch2_btree_node_relock(iter, iter->level))
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_RELOCK);

	ret = bch2_btree_iter_traverse(iter);
	if (ret)
		return NULL;

	/* got to end? */
	b = btree_iter_node(iter, iter->level);
	if (!b)
		return NULL;

	if (bkey_cmp(iter->pos, b->key.k.p) < 0) {
		/*
		 * Haven't gotten to the end of the parent node: go back down to
		 * the next child node
		 */

		/*
		 * We don't really want to be unlocking here except we can't
		 * directly tell btree_iter_traverse() "traverse to this level"
		 * except by setting iter->level, so we have to unlock so we
		 * don't screw up our lock invariants:
		 */
		if (btree_node_read_locked(iter, iter->level))
			btree_node_unlock(iter, iter->level);

		iter->pos	= bkey_successor(iter->pos);
		iter->level	= iter->min_depth;

		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
		ret = bch2_btree_iter_traverse(iter);
		if (ret)
			return NULL;

		b = iter->l[iter->level].b;
	}

	iter->pos = b->key.k.p;
	iter->uptodate = BTREE_ITER_UPTODATE;

	bch2_btree_iter_verify(iter);

	return b;
}

/* Iterate across keys (in leaf nodes only) */

void bch2_btree_iter_set_pos_same_leaf(struct btree_iter *iter, struct bpos new_pos)
{
	struct btree_iter_level *l = &iter->l[0];

	EBUG_ON(iter->level != 0);
	EBUG_ON(bkey_cmp(new_pos, iter->pos) < 0);
	EBUG_ON(!btree_node_locked(iter, 0));
	EBUG_ON(bkey_cmp(new_pos, l->b->key.k.p) > 0);

	bkey_init(&iter->k);
	iter->k.p = iter->pos = new_pos;
	btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);

	btree_iter_advance_to_pos(iter, l, -1);

	/*
	 * XXX:
	 * keeping a node locked that's outside (even just outside) iter->pos
	 * breaks __bch2_btree_node_lock(). This seems to only affect
	 * bch2_btree_node_get_sibling so for now it's fixed there, but we
	 * should try to get rid of this corner case.
	 *
	 * (this behaviour is currently needed for BTREE_INSERT_NOUNLOCK)
	 */

	if (bch2_btree_node_iter_end(&l->iter) &&
	    btree_iter_pos_after_node(iter, l->b))
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
}

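/*
 * Common helper for the set_pos() paths: @cmp is how the new position
 * compares to the old one. Walk up to a node covering the new position,
 * then advance or reinitialize the node iterator:
 */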
static void btree_iter_pos_changed(struct btree_iter *iter, int cmp)
{
	unsigned l = iter->level;

	if (!cmp)
		goto out;

	if (unlikely(btree_iter_type(iter) == BTREE_ITER_CACHED)) {
		btree_node_unlock(iter, 0);
		iter->l[0].b = BTREE_ITER_NO_NODE_UP;
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
		return;
	}

	l = btree_iter_up_until_good_node(iter, cmp);

	if (btree_iter_node(iter, l)) {
		/*
		 * We might have to skip over many keys, or just a few: try
		 * advancing the node iterator, and if we have to skip over too
		 * many keys just reinit it (or if we're rewinding, since that
		 * is expensive).
		 */
		if (cmp < 0 ||
		    !btree_iter_advance_to_pos(iter, &iter->l[l], 8))
			__btree_iter_init(iter, l);

		/* Don't leave it locked if we're not supposed to: */
		if (btree_lock_want(iter, l) == BTREE_NODE_UNLOCKED)
			btree_node_unlock(iter, l);
	}
out:
	if (l != iter->level)
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_TRAVERSE);
	else
		btree_iter_set_dirty(iter, BTREE_ITER_NEED_PEEK);
}

void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos,
			       bool strictly_greater)
{
	struct bpos old = btree_iter_search_key(iter);
	int cmp;

	iter->flags &= ~BTREE_ITER_IS_EXTENTS;
	iter->flags |= strictly_greater ? BTREE_ITER_IS_EXTENTS : 0;

	bkey_init(&iter->k);
	iter->k.p = iter->pos = new_pos;

	cmp = bkey_cmp(btree_iter_search_key(iter), old);

	btree_iter_pos_changed(iter, cmp);
}

void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
	int cmp = bkey_cmp(new_pos, iter->pos);

	bkey_init(&iter->k);
	iter->k.p = iter->pos = new_pos;

	btree_iter_pos_changed(iter, cmp);
}

static inline bool btree_iter_set_pos_to_next_leaf(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	bool ret;

	bkey_init(&iter->k);
	iter->k.p = iter->pos = l->b->key.k.p;

	ret = bkey_cmp(iter->pos, POS_MAX) != 0;
	if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
		iter->k.p = iter->pos = bkey_successor(iter->pos);

	btree_iter_pos_changed(iter, 1);
	return ret;
}

static inline bool btree_iter_set_pos_to_prev_leaf(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	bool ret;

	bkey_init(&iter->k);
	iter->k.p = iter->pos = l->b->data->min_key;
	iter->uptodate	= BTREE_ITER_NEED_TRAVERSE;

	ret = bkey_cmp(iter->pos, POS_MIN) != 0;
	if (ret) {
		iter->k.p = iter->pos = bkey_predecessor(iter->pos);

		if (iter->flags & BTREE_ITER_IS_EXTENTS)
			iter->k.p = iter->pos = bkey_predecessor(iter->pos);
	}

	btree_iter_pos_changed(iter, -1);
	return ret;
}

/**
 * btree_iter_peek_uptodate - given an iterator that is uptodate, return the key
 * it currently points to
 */
static inline struct bkey_s_c btree_iter_peek_uptodate(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_s_c ret = { .k = &iter->k };

	if (!bkey_deleted(&iter->k)) {
		struct bkey_packed *_k =
			__bch2_btree_node_iter_peek_all(&l->iter, l->b);

		ret.v = bkeyp_val(&l->b->format, _k);

		if (bch2_debug_check_iterators) {
			struct bkey k = bkey_unpack_key(l->b, _k);

			BUG_ON(memcmp(&k, &iter->k, sizeof(k)));
		}

		if (bch2_debug_check_bkeys)
			bch2_bkey_debugcheck(iter->trans->c, l->b, ret);
	}

	return ret;
}

/**
 * bch2_btree_iter_peek: returns first key greater than or equal to iterator's
 * current position
 */
struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_s_c k;
	int ret;

	EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
	bch2_btree_iter_checks(iter);

	if (iter->uptodate == BTREE_ITER_UPTODATE &&
	    !bkey_deleted(&iter->k))
		return btree_iter_peek_uptodate(iter);

	while (1) {
		ret = bch2_btree_iter_traverse(iter);
		if (unlikely(ret))
			return bkey_s_c_err(ret);

		k = __btree_iter_peek(iter, l);
		if (likely(k.k))
			break;

		if (!btree_iter_set_pos_to_next_leaf(iter))
			return bkey_s_c_null;
	}

	/*
	 * iter->pos should always be equal to the key we just
	 * returned - except extents can straddle iter->pos:
	 */
	if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
	    bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
		iter->pos = bkey_start_pos(k.k);

	iter->uptodate = BTREE_ITER_UPTODATE;

	bch2_btree_iter_verify_level(iter, 0);
	return k;
}

/**
 * bch2_btree_iter_next: returns first key greater than iterator's current
 * position
 */
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter)
{
	if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
		return bkey_s_c_null;

	bch2_btree_iter_set_pos(iter,
		(iter->flags & BTREE_ITER_IS_EXTENTS)
		? iter->k.p
		: bkey_successor(iter->k.p));

	return bch2_btree_iter_peek(iter);
}

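/*
 * Return the first update queued in this transaction that is at or after
 * the iterator's search position and in the same btree, or bkey_s_c_null
 * if there is none:
 */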
static struct bkey_s_c __btree_trans_updates_peek(struct btree_iter *iter)
{
	struct bpos pos = btree_iter_search_key(iter);
	struct btree_trans *trans = iter->trans;
	struct btree_insert_entry *i;

	trans_for_each_update2(trans, i)
		if ((cmp_int(iter->btree_id,	i->iter->btree_id) ?:
		     bkey_cmp(pos,		i->k->k.p)) <= 0)
			break;

	return i < trans->updates2 + trans->nr_updates2 &&
		iter->btree_id == i->iter->btree_id
		? bkey_i_to_s_c(i->k)
		: bkey_s_c_null;
}

static struct bkey_s_c __bch2_btree_iter_peek_with_updates(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_s_c k = __btree_iter_peek(iter, l);
	struct bkey_s_c u = __btree_trans_updates_peek(iter);

	if (k.k && (!u.k || bkey_cmp(k.k->p, u.k->p) < 0))
		return k;
	if (u.k && bkey_cmp(u.k->p, l->b->key.k.p) <= 0) {
		iter->k = *u.k;
		return u;
	}
	return bkey_s_c_null;
}

struct bkey_s_c bch2_btree_iter_peek_with_updates(struct btree_iter *iter)
{
	struct bkey_s_c k;
	int ret;

	EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
	bch2_btree_iter_checks(iter);

	while (1) {
		ret = bch2_btree_iter_traverse(iter);
		if (unlikely(ret))
			return bkey_s_c_err(ret);

		k = __bch2_btree_iter_peek_with_updates(iter);

		if (k.k && bkey_deleted(k.k)) {
			bch2_btree_iter_set_pos(iter,
				(iter->flags & BTREE_ITER_IS_EXTENTS)
				? iter->k.p
				: bkey_successor(iter->k.p));
			continue;
		}

		if (likely(k.k))
			break;

		if (!btree_iter_set_pos_to_next_leaf(iter))
			return bkey_s_c_null;
	}

	/*
	 * iter->pos should always be equal to the key we just
	 * returned - except extents can straddle iter->pos:
	 */
	if (!(iter->flags & BTREE_ITER_IS_EXTENTS) ||
	    bkey_cmp(bkey_start_pos(k.k), iter->pos) > 0)
		iter->pos = bkey_start_pos(k.k);

	iter->uptodate = BTREE_ITER_UPTODATE;
	return k;
}

struct bkey_s_c bch2_btree_iter_next_with_updates(struct btree_iter *iter)
{
	if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
		return bkey_s_c_null;

	bch2_btree_iter_set_pos(iter,
		(iter->flags & BTREE_ITER_IS_EXTENTS)
		? iter->k.p
		: bkey_successor(iter->k.p));

	return bch2_btree_iter_peek_with_updates(iter);
}

/**
 * bch2_btree_iter_peek_prev: returns first key less than or equal to
 * iterator's current position
 */
struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
{
	struct bpos pos = iter->pos;
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_s_c k;
	int ret;

	EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
	bch2_btree_iter_checks(iter);

	if (iter->uptodate == BTREE_ITER_UPTODATE &&
	    !bkey_deleted(&iter->k))
		return btree_iter_peek_uptodate(iter);

	while (1) {
		ret = bch2_btree_iter_traverse(iter);
		if (unlikely(ret))
			return bkey_s_c_err(ret);

		k = __btree_iter_peek(iter, l);
		if (!k.k || bkey_cmp(bkey_start_pos(k.k), pos) > 0)
			k = __btree_iter_prev(iter, l);

		if (likely(k.k))
			break;

		if (!btree_iter_set_pos_to_prev_leaf(iter))
			return bkey_s_c_null;
	}

	EBUG_ON(bkey_cmp(bkey_start_pos(k.k), pos) > 0);
	iter->pos	= bkey_start_pos(k.k);
	iter->uptodate	= BTREE_ITER_UPTODATE;
	return k;
}

/**
 * bch2_btree_iter_prev: returns first key less than iterator's current
 * position
 */
struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter)
{
	struct bpos pos = bkey_start_pos(&iter->k);

	EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
	bch2_btree_iter_checks(iter);

	if (unlikely(!bkey_cmp(pos, POS_MIN)))
		return bkey_s_c_null;

	bch2_btree_iter_set_pos(iter, bkey_predecessor(pos));

	return bch2_btree_iter_peek_prev(iter);
}

static inline struct bkey_s_c
__bch2_btree_iter_peek_slot_extents(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct btree_node_iter node_iter;
	struct bkey_s_c k;
	struct bkey n;
	int ret;

	/* keys & holes can't span inode numbers: */
	if (iter->pos.offset == KEY_OFFSET_MAX) {
		if (iter->pos.inode == KEY_INODE_MAX)
			return bkey_s_c_null;

		bch2_btree_iter_set_pos(iter, bkey_successor(iter->pos));

		ret = bch2_btree_iter_traverse(iter);
		if (unlikely(ret))
			return bkey_s_c_err(ret);
	}

	/*
	 * iterator is now at the correct position for inserting at iter->pos,
	 * but we need to keep iterating until we find the first non-whiteout so
	 * we know how big a hole we have, if any:
	 */

	node_iter = l->iter;
	k = __btree_iter_unpack(iter, l, &iter->k,
		bch2_btree_node_iter_peek(&node_iter, l->b));

	if (k.k && bkey_cmp(bkey_start_pos(k.k), iter->pos) <= 0) {
		/*
		 * We're not setting iter->uptodate because the node iterator
		 * doesn't necessarily point at the key we're returning:
		 */

		EBUG_ON(bkey_cmp(k.k->p, iter->pos) <= 0);
		bch2_btree_iter_verify_level(iter, 0);
		return k;
	}

	/* hole */

	if (!k.k)
		k.k = &l->b->key.k;

	bkey_init(&n);
	n.p = iter->pos;
	bch2_key_resize(&n,
			min_t(u64, KEY_SIZE_MAX,
			      (k.k->p.inode == n.p.inode
			       ? bkey_start_offset(k.k)
			       : KEY_OFFSET_MAX) -
			      n.p.offset));

	EBUG_ON(!n.size);

	iter->k	= n;
	iter->uptodate = BTREE_ITER_UPTODATE;

	bch2_btree_iter_verify_level(iter, 0);
	return (struct bkey_s_c) { &iter->k, NULL };
}

struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
{
	struct btree_iter_level *l = &iter->l[0];
	struct bkey_s_c k;
	int ret;

	EBUG_ON(btree_iter_type(iter) != BTREE_ITER_KEYS);
	bch2_btree_iter_checks(iter);

	if (iter->uptodate == BTREE_ITER_UPTODATE)
		return btree_iter_peek_uptodate(iter);

	ret = bch2_btree_iter_traverse(iter);
	if (unlikely(ret))
		return bkey_s_c_err(ret);

	if (iter->flags & BTREE_ITER_IS_EXTENTS)
		return __bch2_btree_iter_peek_slot_extents(iter);

	k = __btree_iter_peek_all(iter, l, &iter->k);

	EBUG_ON(k.k && bkey_deleted(k.k) && bkey_cmp(k.k->p, iter->pos) == 0);

	if (!k.k || bkey_cmp(iter->pos, k.k->p)) {
		/* hole */
		bkey_init(&iter->k);
		iter->k.p = iter->pos;
		k = (struct bkey_s_c) { &iter->k, NULL };
	}

	iter->uptodate = BTREE_ITER_UPTODATE;
	bch2_btree_iter_verify_level(iter, 0);
	return k;
}

struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter)
{
	if (unlikely(!bkey_cmp(iter->k.p, POS_MAX)))
		return bkey_s_c_null;

	bch2_btree_iter_set_pos(iter,
		(iter->flags & BTREE_ITER_IS_EXTENTS)
		? iter->k.p
		: bkey_successor(iter->k.p));

	return bch2_btree_iter_peek_slot(iter);
}

struct bkey_s_c bch2_btree_iter_peek_cached(struct btree_iter *iter)
{
	struct bkey_cached *ck;
	int ret;

	EBUG_ON(btree_iter_type(iter) != BTREE_ITER_CACHED);
	bch2_btree_iter_checks(iter);

	ret = bch2_btree_iter_traverse(iter);
	if (unlikely(ret))
		return bkey_s_c_err(ret);

	ck = (void *) iter->l[0].b;

	EBUG_ON(iter->btree_id != ck->key.btree_id ||
		bkey_cmp(iter->pos, ck->key.pos));
	BUG_ON(!ck->valid);

	return bkey_i_to_s_c(ck->k);
}

static inline void bch2_btree_iter_init(struct btree_trans *trans,
			struct btree_iter *iter, enum btree_id btree_id,
			struct bpos pos, unsigned flags)
{
	struct bch_fs *c = trans->c;
	unsigned i;

	if (btree_node_type_is_extents(btree_id) &&
	    !(flags & BTREE_ITER_NODES))
		flags |= BTREE_ITER_IS_EXTENTS;

	iter->trans			= trans;
	iter->pos			= pos;
	bkey_init(&iter->k);
	iter->k.p			= pos;
	iter->flags			= flags;
	iter->uptodate			= BTREE_ITER_NEED_TRAVERSE;
	iter->btree_id			= btree_id;
1941
	iter->level			= 0;
1942
	iter->min_depth			= 0;
1943
	iter->locks_want		= flags & BTREE_ITER_INTENT ? 1 : 0;
1944 1945 1946
	iter->nodes_locked		= 0;
	iter->nodes_intent_locked	= 0;
	for (i = 0; i < ARRAY_SIZE(iter->l); i++)
1947
		iter->l[i].b		= BTREE_ITER_NO_NODE_INIT;
1948 1949 1950 1951 1952 1953

	prefetch(c->btree_roots[btree_id].b);
}

/* new transactional stuff: */

static inline void __bch2_trans_iter_free(struct btree_trans *trans,
					  unsigned idx)
{
	__bch2_btree_iter_unlock(&trans->iters[idx]);
	trans->iters_linked		&= ~(1ULL << idx);
	trans->iters_live		&= ~(1ULL << idx);
	trans->iters_touched		&= ~(1ULL << idx);
}

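/*
 * bch2_trans_iter_put() - release a reference to a transaction iterator and
 * return any error pending on it; the slot is freed immediately unless the
 * iterator was touched this transaction or flagged KEEP_UNTIL_COMMIT.
 *
 * Illustrative usage (a sketch, not lifted from a real caller):
 *
 *	iter = bch2_trans_get_iter(trans, BTREE_ID_EXTENTS, pos,
 *				   BTREE_ITER_SLOTS);
 *	k = bch2_btree_iter_peek_slot(iter);
 *	...
 *	ret = bch2_trans_iter_put(trans, iter);
 */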
int bch2_trans_iter_put(struct btree_trans *trans,
			struct btree_iter *iter)
{
	int ret;

	if (IS_ERR_OR_NULL(iter))
		return 0;

	BUG_ON(trans->iters + iter->idx != iter);

	ret = btree_iter_err(iter);

	if (!(trans->iters_touched & (1ULL << iter->idx)) &&
	    !(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT))
		__bch2_trans_iter_free(trans, iter->idx);

	trans->iters_live	&= ~(1ULL << iter->idx);
	return ret;
}

int bch2_trans_iter_free(struct btree_trans *trans,
			 struct btree_iter *iter)
{
	if (IS_ERR_OR_NULL(iter))
		return 0;

	trans->iters_touched &= ~(1ULL << iter->idx);

	return bch2_trans_iter_put(trans, iter);
}

#if 0
static int bch2_trans_realloc_iters(struct btree_trans *trans,
				    unsigned new_size)
{
	void *p, *new_iters, *new_updates, *new_updates2;
	size_t iters_bytes;
	size_t updates_bytes;

	new_size = roundup_pow_of_two(new_size);

	BUG_ON(new_size > BTREE_ITER_MAX);

	if (new_size <= trans->size)
		return 0;

	BUG_ON(trans->used_mempool);

	bch2_trans_unlock(trans);

	iters_bytes	= sizeof(struct btree_iter) * new_size;
	updates_bytes	= sizeof(struct btree_insert_entry) * new_size;

	p = kmalloc(iters_bytes +
		    updates_bytes +
		    updates_bytes, GFP_NOFS);
	if (p)
		goto success;

	p = mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);
	new_size = BTREE_ITER_MAX;

	trans->used_mempool = true;
success:
	new_iters	= p; p += iters_bytes;
	new_updates	= p; p += updates_bytes;
	new_updates2	= p; p += updates_bytes;

	memcpy(new_iters, trans->iters,
	       sizeof(struct btree_iter) * trans->nr_iters);
	memcpy(new_updates, trans->updates,
	       sizeof(struct btree_insert_entry) * trans->nr_updates);
	memcpy(new_updates2, trans->updates2,
	       sizeof(struct btree_insert_entry) * trans->nr_updates2);

	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
		memset(trans->iters, POISON_FREE,
		       sizeof(struct btree_iter) * trans->nr_iters +
		       sizeof(struct btree_insert_entry) * trans->nr_iters);

	kfree(trans->iters);

	trans->iters		= new_iters;
	trans->updates		= new_updates;
	trans->updates2		= new_updates2;
	trans->size		= new_size;

	if (trans->iters_live) {
		trace_trans_restart_iters_realloced(trans->ip, trans->size);
		return -EINTR;
	}

	return 0;
}
#endif

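/*
 * btree_trans_iter_alloc() - claim a free slot in the transaction's iterator
 * array; if all BTREE_ITER_MAX slots are in use, every iterator is dumped
 * and we panic.
 */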
static struct btree_iter *btree_trans_iter_alloc(struct btree_trans *trans)
{
	unsigned idx = __ffs64(~trans->iters_linked);

	if (idx < trans->nr_iters)
		goto got_slot;

	if (trans->nr_iters == trans->size) {
		struct btree_iter *iter;

		BUG_ON(trans->size < BTREE_ITER_MAX);

		trans_for_each_iter(trans, iter) {
			pr_err("iter: btree %s pos %llu:%llu%s%s%s %ps",
			       bch2_btree_ids[iter->btree_id],
			       iter->pos.inode,
			       iter->pos.offset,
			       (trans->iters_live & (1ULL << iter->idx)) ? " live" : "",
			       (trans->iters_touched & (1ULL << iter->idx)) ? " touched" : "",
			       iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT ? " keep" : "",
			       (void *) iter->ip_allocated);
		}

		panic("trans iter overflow\n");
#if 0
		ret = bch2_trans_realloc_iters(trans, trans->size * 2);
		if (ret)
			return ERR_PTR(ret);
#endif
	}

	idx = trans->nr_iters++;
	BUG_ON(trans->nr_iters > trans->size);

	trans->iters[idx].idx = idx;
got_slot:
	BUG_ON(trans->iters_linked & (1ULL << idx));
	trans->iters_linked |= 1ULL << idx;
	trans->iters[idx].flags = 0;
	return &trans->iters[idx];
}

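/*
 * btree_iter_copy() - copy an iterator into another slot, keeping the
 * destination's own index, bumping the six lock counts for every node the
 * source has locked, and clearing the per-commit flags on the copy.
 */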
static inline void btree_iter_copy(struct btree_iter *dst,
				   struct btree_iter *src)
{
	unsigned i, idx = dst->idx;

	*dst = *src;
	dst->idx = idx;

	for (i = 0; i < BTREE_MAX_DEPTH; i++)
		if (btree_node_locked(dst, i))
			six_lock_increment(&dst->l[i].b->c.lock,
					   __btree_lock_want(dst, i));

	dst->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
	dst->flags &= ~BTREE_ITER_SET_POS_AFTER_COMMIT;
}

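/*
 * bpos_diff() - component-wise distance between two positions; used below to
 * pick the existing iterator closest to the position being asked for.
 */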
static inline struct bpos bpos_diff(struct bpos l, struct bpos r)
{
	if (bkey_cmp(l, r) > 0)
		swap(l, r);

	return POS(r.inode - l.inode, r.offset - l.offset);
}

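/*
 * __btree_trans_get_iter() - get an iterator for (btree_id, pos), reusing the
 * closest idle iterator on that btree when possible; otherwise a fresh slot
 * is allocated (or a live/kept iterator is copied).  Locks are then upgraded
 * or downgraded to match BTREE_ITER_INTENT.
 */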
static struct btree_iter *__btree_trans_get_iter(struct btree_trans *trans,
						 unsigned btree_id, struct bpos pos,
						 unsigned flags)
{
	struct btree_iter *iter, *best = NULL;

	BUG_ON(trans->nr_iters > BTREE_ITER_MAX);

	trans_for_each_iter(trans, iter) {
		if (btree_iter_type(iter) != (flags & BTREE_ITER_TYPE))
			continue;

		if (iter->btree_id != btree_id)
			continue;

		if (best &&
		    bkey_cmp(bpos_diff(best->pos, pos),
			     bpos_diff(iter->pos, pos)) < 0)
			continue;

		best = iter;
	}

	if (!best) {
		iter = btree_trans_iter_alloc(trans);
		if (IS_ERR(iter))
			return iter;

		bch2_btree_iter_init(trans, iter, btree_id, pos, flags);
	} else if ((trans->iters_live & (1ULL << best->idx)) ||
		   (best->flags & BTREE_ITER_KEEP_UNTIL_COMMIT)) {
		iter = btree_trans_iter_alloc(trans);
		if (IS_ERR(iter))
			return iter;

		btree_iter_copy(iter, best);
	} else {
		iter = best;
	}

	iter->flags &= ~BTREE_ITER_KEEP_UNTIL_COMMIT;
	iter->flags &= ~BTREE_ITER_USER_FLAGS;
	iter->flags |= flags & BTREE_ITER_USER_FLAGS;

	if (iter->flags & BTREE_ITER_INTENT)
		bch2_btree_iter_upgrade(iter, 1);
	else
		bch2_btree_iter_downgrade(iter);

	BUG_ON(iter->btree_id != btree_id);
	BUG_ON((iter->flags ^ flags) & BTREE_ITER_TYPE);
	BUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
	BUG_ON(iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT);
	BUG_ON(trans->iters_live & (1ULL << iter->idx));

	trans->iters_live	|= 1ULL << iter->idx;
	trans->iters_touched	|= 1ULL << iter->idx;

	return iter;
}

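/*
 * __bch2_trans_get_iter() - like __btree_trans_get_iter(), but also (re)sets
 * the iterator's position to @pos, telling __bch2_btree_iter_set_pos()
 * whether this is an extents btree.
 */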
struct btree_iter *__bch2_trans_get_iter(struct btree_trans *trans,
					 enum btree_id btree_id,
					 struct bpos pos, unsigned flags)
{
	struct btree_iter *iter =
		__btree_trans_get_iter(trans, btree_id, pos, flags);

	if (!IS_ERR(iter))
		__bch2_btree_iter_set_pos(iter, pos,
			btree_node_type_is_extents(btree_id));
	return iter;
}

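/*
 * bch2_trans_get_node_iter() - get an iterator over btree nodes rather than
 * keys (BTREE_ITER_NODES), positioned at @pos, starting at level @depth and
 * wanting @locks_want locks.
 */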
struct btree_iter *bch2_trans_get_node_iter(struct btree_trans *trans,
					    enum btree_id btree_id,
					    struct bpos pos,
					    unsigned locks_want,
					    unsigned depth,
					    unsigned flags)
{
	struct btree_iter *iter =
		__btree_trans_get_iter(trans, btree_id, pos,
				       flags|BTREE_ITER_NODES);
	unsigned i;

	BUG_ON(IS_ERR(iter));
	BUG_ON(bkey_cmp(iter->pos, pos));

	iter->locks_want = locks_want;
	iter->level	= depth;
	iter->min_depth	= depth;

	for (i = 0; i < ARRAY_SIZE(iter->l); i++)
		iter->l[i].b		= NULL;
	iter->l[iter->level].b		= BTREE_ITER_NO_NODE_INIT;

	return iter;
}

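/*
 * __bch2_trans_copy_iter() - duplicate an existing iterator into a fresh
 * slot; the copy starts out live but not touched, so putting it frees it
 * right away.
 */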
struct btree_iter *__bch2_trans_copy_iter(struct btree_trans *trans,
					struct btree_iter *src)
{
	struct btree_iter *iter;

	iter = btree_trans_iter_alloc(trans);
	if (IS_ERR(iter))
		return iter;

	btree_iter_copy(iter, src);

	trans->iters_live |= 1ULL << iter->idx;
	/*
	 * We don't need to preserve this iter since it's cheap to copy it
	 * again - this will cause trans_iter_put() to free it right away:
	 */
	trans->iters_touched &= ~(1ULL << iter->idx);

	return iter;
}

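/*
 * bch2_trans_preload_mem()/bch2_trans_kmalloc() - bump allocator for memory
 * that lives as long as the current transaction.  Growing a buffer that has
 * already handed out allocations forces a transaction restart (-EINTR),
 * since krealloc() may have invalidated earlier pointers.
 *
 * Illustrative use (a sketch, not taken from a real caller):
 *
 *	struct bkey_i *new_key = bch2_trans_kmalloc(trans, sizeof(*new_key));
 *	if (IS_ERR(new_key))
 *		return PTR_ERR(new_key);
 */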
static int bch2_trans_preload_mem(struct btree_trans *trans, size_t size)
{
	if (size > trans->mem_bytes) {
		size_t old_bytes = trans->mem_bytes;
		size_t new_bytes = roundup_pow_of_two(size);
		void *new_mem = krealloc(trans->mem, new_bytes, GFP_NOFS);

		if (!new_mem)
			return -ENOMEM;

		trans->mem = new_mem;
		trans->mem_bytes = new_bytes;

		if (old_bytes) {
			trace_trans_restart_mem_realloced(trans->ip, new_bytes);
			return -EINTR;
		}
	}

	return 0;
}

void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
	void *p;
	int ret;

	ret = bch2_trans_preload_mem(trans, trans->mem_top + size);
	if (ret)
		return ERR_PTR(ret);

	p = trans->mem + trans->mem_top;
	trans->mem_top += size;
	return p;
}

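/*
 * bch2_trans_unlink_iters() - free every iterator slot that is neither live
 * nor touched, i.e. slots nothing in the current transaction still refers to.
 */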
inline void bch2_trans_unlink_iters(struct btree_trans *trans)
{
	u64 iters = trans->iters_linked &
		~trans->iters_touched &
		~trans->iters_live;

	while (iters) {
		unsigned idx = __ffs64(iters);

		iters &= ~(1ULL << idx);
		__bch2_trans_iter_free(trans, idx);
	}
}

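/*
 * bch2_trans_reset() - reset a transaction after a commit or restart: clear
 * the per-commit iterator flags, free iterators that are no longer in use,
 * and zero the pending updates, preallocated memory and accumulated disk
 * usage deltas.  Unless TRANS_RESET_NOTRAVERSE is given, the remaining
 * iterators are then retraversed.
 */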
void bch2_trans_reset(struct btree_trans *trans, unsigned flags)
{
	struct btree_iter *iter;

	trans_for_each_iter(trans, iter)
		iter->flags &= ~(BTREE_ITER_KEEP_UNTIL_COMMIT|
				 BTREE_ITER_SET_POS_AFTER_COMMIT);

	bch2_trans_unlink_iters(trans);

	trans->iters_touched &= trans->iters_live;

	trans->need_reset		= 0;
	trans->nr_updates		= 0;
	trans->nr_updates2		= 0;
	trans->mem_top			= 0;

	trans->extra_journal_entries	= NULL;
	trans->extra_journal_entry_u64s	= 0;

	if (trans->fs_usage_deltas) {
		trans->fs_usage_deltas->used = 0;
		memset((void *) trans->fs_usage_deltas +
		       offsetof(struct replicas_delta_list, memset_start), 0,
		       (void *) &trans->fs_usage_deltas->memset_end -
		       (void *) &trans->fs_usage_deltas->memset_start);
	}

	if (!(flags & TRANS_RESET_NOTRAVERSE))
		bch2_btree_iter_traverse_all(trans);
}

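/*
 * bch2_trans_alloc_iters() - allocate the fixed size iterator/update arrays
 * for a transaction, preferring the per-cpu cached buffer and falling back
 * to the btree_iters mempool.
 */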
static void bch2_trans_alloc_iters(struct btree_trans *trans, struct bch_fs *c)
{
	unsigned new_size = BTREE_ITER_MAX;
	size_t iters_bytes	= sizeof(struct btree_iter) * new_size;
	size_t updates_bytes	= sizeof(struct btree_insert_entry) * new_size;
	void *p;

	BUG_ON(trans->used_mempool);

	p =     this_cpu_xchg(c->btree_iters_bufs->iter, NULL) ?:
		mempool_alloc(&trans->c->btree_iters_pool, GFP_NOFS);

	trans->iters		= p; p += iters_bytes;
	trans->updates		= p; p += updates_bytes;
	trans->updates2		= p; p += updates_bytes;
	trans->size		= new_size;
}

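/*
 * bch2_trans_init() - initialize a transaction.  Iterators are always
 * allocated at their maximum count up front (see the comment in the body);
 * @expected_mem_bytes preallocates the transaction's bump allocator.
 *
 * Typical lifecycle (a sketch, error handling elided):
 *
 *	struct btree_trans trans;
 *
 *	bch2_trans_init(&trans, c, 0, 0);
 *	...
 *	bch2_trans_exit(&trans);
 */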
void bch2_trans_init(struct btree_trans *trans, struct bch_fs *c,
		     unsigned expected_nr_iters,
		     size_t expected_mem_bytes)
{
	memset(trans, 0, sizeof(*trans));
	trans->c		= c;
	trans->ip		= _RET_IP_;

	/*
	 * reallocating iterators currently completely breaks
	 * bch2_trans_iter_put(), we always allocate the max:
	 */
	bch2_trans_alloc_iters(trans, c);

	if (expected_mem_bytes)
		bch2_trans_preload_mem(trans, expected_mem_bytes);

#ifdef CONFIG_BCACHEFS_DEBUG
	trans->pid = current->pid;
	mutex_lock(&c->btree_trans_lock);
	list_add(&trans->list, &c->btree_trans_list);
	mutex_unlock(&c->btree_trans_lock);
#endif
}

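/*
 * bch2_trans_exit() - tear down a transaction: drop its locks, release its
 * journal prereservation, free its memory, and return the iterator buffer to
 * the per-cpu cache (or the mempool).  Returns -EIO if the transaction saw
 * an error.
 */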
int bch2_trans_exit(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;

	bch2_trans_unlock(trans);

#ifdef CONFIG_BCACHEFS_DEBUG
	mutex_lock(&trans->c->btree_trans_lock);
	list_del(&trans->list);
	mutex_unlock(&trans->c->btree_trans_lock);
#endif

	bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);

	kfree(trans->fs_usage_deltas);
	kfree(trans->mem);

	trans->iters = this_cpu_xchg(c->btree_iters_bufs->iter, trans->iters);
	if (trans->iters)
		mempool_free(trans->iters, &trans->c->btree_iters_pool);

	trans->mem	= (void *) 0x1;
	trans->iters	= (void *) 0x1;

	return trans->error ? -EIO : 0;
}

static void bch2_btree_iter_node_to_text(struct printbuf *out,
				 struct btree_bkey_cached_common *_b,
				 enum btree_iter_type type)
{
	pr_buf(out, "    %px l=%u %s:",
	       _b, _b->level, bch2_btree_ids[_b->btree_id]);
	bch2_bpos_to_text(out, btree_node_pos(_b, type));
}

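/*
 * bch2_btree_trans_to_text() - debugging aid (does nothing unless
 * CONFIG_BCACHEFS_DEBUG is enabled): print every active btree_trans, the
 * nodes each of its iterators has locked, and the node it is currently
 * waiting to lock, if any.
 */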
void bch2_btree_trans_to_text(struct printbuf *out, struct bch_fs *c)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	struct btree_trans *trans;
	struct btree_iter *iter;
	struct btree *b;
	unsigned l;

	mutex_lock(&c->btree_trans_lock);
	list_for_each_entry(trans, &c->btree_trans_list, list) {
		pr_buf(out, "%i %px %ps\n", trans->pid, trans, (void *) trans->ip);

		trans_for_each_iter(trans, iter) {
			if (!iter->nodes_locked)
				continue;

			pr_buf(out, "  iter %u %s:",
			       iter->idx,
			       bch2_btree_ids[iter->btree_id]);
			bch2_bpos_to_text(out, iter->pos);
			pr_buf(out, "\n");

			for (l = 0; l < BTREE_MAX_DEPTH; l++) {
				if (btree_node_locked(iter, l)) {
					pr_buf(out, "    %s l=%u ",
					       btree_node_intent_locked(iter, l) ? "i" : "r", l);
					bch2_btree_iter_node_to_text(out,
							(void *) iter->l[l].b,
							btree_iter_type(iter));
					pr_buf(out, "\n");
				}
			}
		}

		b = READ_ONCE(trans->locking);
		if (b) {
			pr_buf(out, "  locking iter %u l=%u %s:",
			       trans->locking_iter_idx,
			       trans->locking_level,
			       bch2_btree_ids[trans->locking_btree_id]);
			bch2_bpos_to_text(out, trans->locking_pos);

			pr_buf(out, " node ");
			bch2_btree_iter_node_to_text(out,
					(void *) b,
					btree_iter_type(&trans->iters[trans->locking_iter_idx]));
			pr_buf(out, "\n");
		}
	}
	mutex_unlock(&c->btree_trans_lock);
#endif
}

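/*
 * bch2_fs_btree_iter_init()/bch2_fs_btree_iter_exit() - set up and tear down
 * the filesystem-wide state used above: the list of active transactions and
 * the mempool backing each transaction's iterator and update arrays (a
 * single allocation holds all three arrays, hence iters + 2 * updates in the
 * element size below).
 */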
void bch2_fs_btree_iter_exit(struct bch_fs *c)
{
	mempool_exit(&c->btree_iters_pool);
}

int bch2_fs_btree_iter_init(struct bch_fs *c)
{
	unsigned nr = BTREE_ITER_MAX;

	INIT_LIST_HEAD(&c->btree_trans_list);
	mutex_init(&c->btree_trans_lock);

	return mempool_init_kmalloc_pool(&c->btree_iters_pool, 1,
			sizeof(struct btree_iter) * nr +
			sizeof(struct btree_insert_entry) * nr +
			sizeof(struct btree_insert_entry) * nr);
}