btree_update_interior.c 56.2 KB
Newer Older
1 2 3
// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
4
#include "alloc_foreground.h"
5 6 7 8 9 10 11 12 13
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "buckets.h"
14
#include "error.h"
15 16 17 18 19 20 21 22 23 24
#include "extents.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"

#include <linux/random.h>

25
static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
Kent Overstreet's avatar
Kent Overstreet committed
26
				   struct btree_path *, struct btree *,
27
				   struct keylist *, unsigned);
28
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
29

30 31
/* Debug code: */

32 33 34
/*
 * Verify that child nodes correctly span parent node's range:
 */
35
static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
36
{
37 38
#ifdef CONFIG_BCACHEFS_DEBUG
	struct bpos next_node = b->data->min_key;
39
	struct btree_node_iter iter;
40 41 42
	struct bkey_s_c k;
	struct bkey_s_c_btree_ptr_v2 bp;
	struct bkey unpacked;
43
	char buf1[100], buf2[100];
44

45
	BUG_ON(!b->c.level);
46

47 48 49
	if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags))
		return;

50 51 52 53
	bch2_btree_node_iter_init_from_start(&iter, b);

	while (1) {
		k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked);
54 55
		if (k.k->type != KEY_TYPE_btree_ptr_v2)
			break;
56 57
		bp = bkey_s_c_to_btree_ptr_v2(k);

58
		if (bpos_cmp(next_node, bp.v->min_key)) {
59
			bch2_dump_btree_node(c, b);
60 61 62
			panic("expected next min_key %s got %s\n",
			      (bch2_bpos_to_text(&PBUF(buf1), next_node), buf1),
			      (bch2_bpos_to_text(&PBUF(buf2), bp.v->min_key), buf2));
63
		}
64 65 66 67

		bch2_btree_node_iter_advance(&iter, b);

		if (bch2_btree_node_iter_end(&iter)) {
68
			if (bpos_cmp(k.k->p, b->key.k.p)) {
69
				bch2_dump_btree_node(c, b);
70 71 72
				panic("expected end %s got %s\n",
				      (bch2_bpos_to_text(&PBUF(buf1), b->key.k.p), buf1),
				      (bch2_bpos_to_text(&PBUF(buf2), k.k->p), buf2));
73
			}
74 75 76
			break;
		}

77
		next_node = bpos_successor(k.k->p);
78
	}
79 80 81 82 83 84 85 86 87 88 89 90
#endif
}

/* Calculate ideal packed bkey format for new btree nodes: */

void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
{
	struct bkey_packed *k;
	struct bset_tree *t;
	struct bkey uk;

	for_each_bset(b, t)
Kent Overstreet's avatar
Kent Overstreet committed
91
		bset_tree_for_each_key(b, t, k)
92
			if (!bkey_deleted(k)) {
93 94 95 96 97 98 99 100 101 102
				uk = bkey_unpack_key(b, k);
				bch2_bkey_format_add_key(s, &uk);
			}
}

static struct bkey_format bch2_btree_calc_format(struct btree *b)
{
	struct bkey_format_state s;

	bch2_bkey_format_init(&s);
103 104
	bch2_bkey_format_add_pos(&s, b->data->min_key);
	bch2_bkey_format_add_pos(&s, b->data->max_key);
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
	__bch2_btree_calc_format(&s, b);

	return bch2_bkey_format_done(&s);
}

static size_t btree_node_u64s_with_format(struct btree *b,
					  struct bkey_format *new_f)
{
	struct bkey_format *old_f = &b->format;

	/* stupid integer promotion rules */
	ssize_t delta =
	    (((int) new_f->key_u64s - old_f->key_u64s) *
	     (int) b->nr.packed_keys) +
	    (((int) new_f->key_u64s - BKEY_U64s) *
	     (int) b->nr.unpacked_keys);

	BUG_ON(delta + b->nr.live_u64s < 0);

	return b->nr.live_u64s + delta;
}

/**
 * btree_node_format_fits - check if we could rewrite node with a new format
 *
 * This assumes all keys can pack with the new format -- it just checks if
 * the re-packed keys would fit inside the node itself.
 */
bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
				 struct bkey_format *new_f)
{
	size_t u64s = btree_node_u64s_with_format(b, new_f);

	return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
}

/* Btree node freeing/allocation: */

static void __btree_node_free(struct bch_fs *c, struct btree *b)
{
	trace_btree_node_free(c, b);

	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_need_write(b));
	BUG_ON(b == btree_node_root(c, b));
	BUG_ON(b->ob.nr);
	BUG_ON(!list_empty(&b->write_blocked));
	BUG_ON(b->will_make_reachable);

	clear_btree_node_noevict(b);

	mutex_lock(&c->btree_cache.lock);
	list_move(&b->list, &c->btree_cache.freeable);
	mutex_unlock(&c->btree_cache.lock);
}

161 162
static void bch2_btree_node_free_inmem(struct btree_trans *trans,
				       struct btree *b)
163
{
164
	struct bch_fs *c = trans->c;
Kent Overstreet's avatar
Kent Overstreet committed
165
	struct btree_path *path;
166

Kent Overstreet's avatar
Kent Overstreet committed
167 168
	trans_for_each_path(trans, path)
		BUG_ON(path->l[b->c.level].b == b);
169

170
	six_lock_write(&b->c.lock, NULL, NULL);
171 172

	bch2_btree_node_hash_remove(&c->btree_cache, b);
173
	__btree_node_free(c, b);
174

175 176
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
177 178 179 180 181 182 183 184 185
}

static struct btree *__bch2_btree_node_alloc(struct bch_fs *c,
					     struct disk_reservation *res,
					     struct closure *cl,
					     unsigned flags)
{
	struct write_point *wp;
	struct btree *b;
186
	__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
187
	struct open_buckets ob = { .nr = 0 };
188 189 190 191
	struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
	unsigned nr_reserve;
	enum alloc_reserve alloc_reserve;

192
	if (flags & BTREE_INSERT_USE_RESERVE) {
193
		nr_reserve	= 0;
194
		alloc_reserve	= RESERVE_BTREE_MOVINGGC;
195 196
	} else {
		nr_reserve	= BTREE_NODE_RESERVE;
197
		alloc_reserve	= RESERVE_BTREE;
198 199 200 201 202 203 204 205 206 207 208 209 210 211 212
	}

	mutex_lock(&c->btree_reserve_cache_lock);
	if (c->btree_reserve_cache_nr > nr_reserve) {
		struct btree_alloc *a =
			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];

		ob = a->ob;
		bkey_copy(&tmp.k, &a->k);
		mutex_unlock(&c->btree_reserve_cache_lock);
		goto mem_alloc;
	}
	mutex_unlock(&c->btree_reserve_cache_lock);

retry:
213 214 215 216
	wp = bch2_alloc_sectors_start(c,
				      c->opts.metadata_target ?:
				      c->opts.foreground_target,
				      0,
217 218 219 220 221 222 223 224 225 226 227 228
				      writepoint_ptr(&c->btree_write_point),
				      &devs_have,
				      res->nr_replicas,
				      c->opts.metadata_replicas_required,
				      alloc_reserve, 0, cl);
	if (IS_ERR(wp))
		return ERR_CAST(wp);

	if (wp->sectors_free < c->opts.btree_node_size) {
		struct open_bucket *ob;
		unsigned i;

229
		open_bucket_for_each(c, &wp->ptrs, ob, i)
230 231 232 233 234 235 236
			if (ob->sectors_free < c->opts.btree_node_size)
				ob->sectors_free = 0;

		bch2_alloc_sectors_done(c, wp);
		goto retry;
	}

237
	bkey_btree_ptr_v2_init(&tmp.k);
238
	bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, c->opts.btree_node_size);
239

240
	bch2_open_bucket_get(c, wp, &ob);
241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258
	bch2_alloc_sectors_done(c, wp);
mem_alloc:
	b = bch2_btree_node_mem_alloc(c);

	/* we hold cannibalize_lock: */
	BUG_ON(IS_ERR(b));
	BUG_ON(b->ob.nr);

	bkey_copy(&b->key, &tmp.k);
	b->ob = ob;

	return b;
}

static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level)
{
	struct bch_fs *c = as->c;
	struct btree *b;
Kent Overstreet's avatar
Kent Overstreet committed
259
	int ret;
260 261

	BUG_ON(level >= BTREE_MAX_DEPTH);
262
	BUG_ON(!as->nr_prealloc_nodes);
263

264
	b = as->prealloc_nodes[--as->nr_prealloc_nodes];
265 266

	set_btree_node_accessed(b);
267
	set_btree_node_dirty(c, b);
268
	set_btree_node_need_write(b);
269 270

	bch2_bset_init_first(b, &b->data->keys);
271 272
	b->c.level	= level;
	b->c.btree_id	= as->btree_id;
273
	b->version_ondisk = c->sb.version;
274

275 276
	memset(&b->nr, 0, sizeof(b->nr));
	b->data->magic = cpu_to_le64(bset_magic(c));
277
	memset(&b->data->_ptr, 0, sizeof(b->data->_ptr));
278 279 280
	b->data->flags = 0;
	SET_BTREE_NODE_ID(b->data, as->btree_id);
	SET_BTREE_NODE_LEVEL(b->data, level);
Kent Overstreet's avatar
Kent Overstreet committed
281 282 283 284 285 286 287 288

	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
		struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);

		bp->v.mem_ptr		= 0;
		bp->v.seq		= b->data->keys.seq;
		bp->v.sectors_written	= 0;
	}
289

290
	SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);
291

292 293
	bch2_btree_build_aux_trees(b);

Kent Overstreet's avatar
Kent Overstreet committed
294 295 296
	ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
	BUG_ON(ret);

297 298 299 300
	trace_btree_node_alloc(c, b);
	return b;
}

Kent Overstreet's avatar
Kent Overstreet committed
301 302 303 304 305 306 307 308 309 310 311 312 313
static void btree_set_min(struct btree *b, struct bpos pos)
{
	if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
		bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
	b->data->min_key = pos;
}

static void btree_set_max(struct btree *b, struct bpos pos)
{
	b->key.k.p = pos;
	b->data->max_key = pos;
}

314 315 316 317 318 319
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
						  struct btree *b,
						  struct bkey_format format)
{
	struct btree *n;

320
	n = bch2_btree_node_alloc(as, b->c.level);
321 322 323

	SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);

Kent Overstreet's avatar
Kent Overstreet committed
324 325 326 327
	btree_set_min(n, b->data->min_key);
	btree_set_max(n, b->data->max_key);

	n->data->format		= format;
328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356
	btree_node_set_format(n, format);

	bch2_btree_sort_into(as->c, n, b);

	btree_node_reset_sib_u64s(n);

	n->key.k.p = b->key.k.p;
	return n;
}

static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
						       struct btree *b)
{
	struct bkey_format new_f = bch2_btree_calc_format(b);

	/*
	 * The keys might expand with the new format - if they wouldn't fit in
	 * the btree node anymore, use the old format for now:
	 */
	if (!bch2_btree_node_format_fits(as->c, b, &new_f))
		new_f = b->format;

	return __bch2_btree_node_alloc_replacement(as, b, new_f);
}

static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
{
	struct btree *b = bch2_btree_node_alloc(as, level);

Kent Overstreet's avatar
Kent Overstreet committed
357
	btree_set_min(b, POS_MIN);
358
	btree_set_max(b, SPOS_MAX);
359 360 361 362 363
	b->data->format = bch2_btree_calc_format(b);

	btree_node_set_format(b, b->data->format);
	bch2_btree_build_aux_trees(b);

364
	bch2_btree_update_add_new_node(as, b);
365
	six_unlock_write(&b->c.lock);
366 367 368 369

	return b;
}

370
static void bch2_btree_reserve_put(struct btree_update *as)
371
{
372
	struct bch_fs *c = as->c;
373 374 375

	mutex_lock(&c->btree_reserve_cache_lock);

376 377
	while (as->nr_prealloc_nodes) {
		struct btree *b = as->prealloc_nodes[--as->nr_prealloc_nodes];
378

379
		six_unlock_write(&b->c.lock);
380 381 382 383 384 385 386 387 388 389

		if (c->btree_reserve_cache_nr <
		    ARRAY_SIZE(c->btree_reserve_cache)) {
			struct btree_alloc *a =
				&c->btree_reserve_cache[c->btree_reserve_cache_nr++];

			a->ob = b->ob;
			b->ob.nr = 0;
			bkey_copy(&a->k, &b->key);
		} else {
390
			bch2_open_buckets_put(c, &b->ob);
391 392 393 394
		}

		btree_node_lock_type(c, b, SIX_LOCK_write);
		__btree_node_free(c, b);
395
		six_unlock_write(&b->c.lock);
396

397
		six_unlock_intent(&b->c.lock);
398 399 400 401 402
	}

	mutex_unlock(&c->btree_reserve_cache_lock);
}

403 404
static int bch2_btree_reserve_get(struct btree_update *as, unsigned nr_nodes,
				  unsigned flags, struct closure *cl)
405
{
406
	struct bch_fs *c = as->c;
407
	struct btree *b;
408
	int ret;
409 410 411 412 413 414 415 416

	BUG_ON(nr_nodes > BTREE_RESERVE_MAX);

	/*
	 * Protects reaping from the btree node cache and using the btree node
	 * open bucket reserve:
	 */
	ret = bch2_btree_cache_cannibalize_lock(c, cl);
417 418
	if (ret)
		return ret;
419

420 421
	while (as->nr_prealloc_nodes < nr_nodes) {
		b = __bch2_btree_node_alloc(c, &as->disk_res,
422 423 424 425 426 427 428
					    flags & BTREE_INSERT_NOWAIT
					    ? NULL : cl, flags);
		if (IS_ERR(b)) {
			ret = PTR_ERR(b);
			goto err_free;
		}

429
		as->prealloc_nodes[as->nr_prealloc_nodes++] = b;
430 431 432
	}

	bch2_btree_cache_cannibalize_unlock(c);
433
	return 0;
434 435 436
err_free:
	bch2_btree_cache_cannibalize_unlock(c);
	trace_btree_reserve_get_fail(c, nr_nodes, cl);
437
	return ret;
438 439 440 441
}

/* Asynchronous interior node update machinery */

442
static void bch2_btree_update_free(struct btree_update *as)
443 444 445
{
	struct bch_fs *c = as->c;

446 447 448 449
	if (as->took_gc_lock)
		up_read(&c->gc_lock);
	as->took_gc_lock = false;

450 451 452
	bch2_journal_preres_put(&c->journal, &as->journal_preres);

	bch2_journal_pin_drop(&c->journal, &as->journal);
453
	bch2_journal_pin_flush(&c->journal, &as->journal);
454 455
	bch2_disk_reservation_put(c, &as->disk_res);
	bch2_btree_reserve_put(as);
456

457
	mutex_lock(&c->btree_interior_update_lock);
458
	list_del(&as->unwritten_list);
459
	list_del(&as->list);
460
	mutex_unlock(&c->btree_interior_update_lock);
461 462 463 464 465 466 467

	closure_debug_destroy(&as->cl);
	mempool_free(as, &c->btree_interior_update_pool);

	closure_wake_up(&c->btree_interior_update_wait);
}

468 469
static void btree_update_will_delete_key(struct btree_update *as,
					 struct bkey_i *k)
470
{
471 472 473 474
	BUG_ON(bch2_keylist_u64s(&as->old_keys) + k->k.u64s >
	       ARRAY_SIZE(as->_old_keys));
	bch2_keylist_add(&as->old_keys, k);
}
475

476 477 478 479 480 481
static void btree_update_will_add_key(struct btree_update *as,
				      struct bkey_i *k)
{
	BUG_ON(bch2_keylist_u64s(&as->new_keys) + k->k.u64s >
	       ARRAY_SIZE(as->_new_keys));
	bch2_keylist_add(&as->new_keys, k);
482 483
}

484 485 486 487 488 489
/*
 * The transactional part of an interior btree node update, where we journal the
 * update we did to the interior node and update alloc info:
 */
static int btree_update_nodes_written_trans(struct btree_trans *trans,
					    struct btree_update *as)
490
{
491 492 493 494 495 496
	struct bkey_i *k;
	int ret;

	trans->extra_journal_entries = (void *) &as->journal_entries[0];
	trans->extra_journal_entry_u64s = as->journal_u64s;
	trans->journal_pin = &as->journal;
497

498
	for_each_keylist_key(&as->new_keys, k) {
499 500 501
		ret = bch2_trans_mark_key(trans,
					  bkey_s_c_null,
					  bkey_i_to_s_c(k),
502
					  BTREE_TRIGGER_INSERT);
503 504
		if (ret)
			return ret;
505 506
	}

507
	for_each_keylist_key(&as->old_keys, k) {
508 509 510
		ret = bch2_trans_mark_key(trans,
					  bkey_i_to_s_c(k),
					  bkey_s_c_null,
511
					  BTREE_TRIGGER_OVERWRITE);
512 513 514 515 516
		if (ret)
			return ret;
	}

	return 0;
517 518
}

519
static void btree_update_nodes_written(struct btree_update *as)
520 521
{
	struct bch_fs *c = as->c;
522
	struct btree *b = as->b;
523
	struct btree_trans trans;
524 525
	u64 journal_seq = 0;
	unsigned i;
526
	int ret;
527

528 529 530 531 532 533 534 535 536 537 538
	/*
	 * If we're already in an error state, it might be because a btree node
	 * was never written, and we might be trying to free that same btree
	 * node here, but it won't have been marked as allocated and we'll see
	 * spurious disk usage inconsistencies in the transactional part below
	 * if we don't skip it:
	 */
	ret = bch2_journal_error(&c->journal);
	if (ret)
		goto err;

539 540
	BUG_ON(!journal_pin_active(&as->journal));

541 542 543 544 545 546 547 548 549 550 551 552 553
	/*
	 * Wait for any in flight writes to finish before we free the old nodes
	 * on disk:
	 */
	for (i = 0; i < as->nr_old_nodes; i++) {
		struct btree *old = as->old_nodes[i];
		__le64 seq;

		six_lock_read(&old->c.lock, NULL, NULL);
		seq = old->data ? old->data->keys.seq : 0;
		six_unlock_read(&old->c.lock);

		if (seq == as->old_nodes_seq[i])
554 555
			wait_on_bit_io(&old->flags, BTREE_NODE_write_in_flight_inner,
				       TASK_UNINTERRUPTIBLE);
556 557
	}

558 559 560 561 562
	/*
	 * We did an update to a parent node where the pointers we added pointed
	 * to child nodes that weren't written yet: now, the child nodes have
	 * been written so we can write out the update to the interior node.
	 */
563 564 565 566 567 568 569 570

	/*
	 * We can't call into journal reclaim here: we'd block on the journal
	 * reclaim lock, but we may need to release the open buckets we have
	 * pinned in order for other btree updates to make forward progress, and
	 * journal reclaim does btree updates when flushing bkey_cached entries,
	 * which may require allocations as well.
	 */
571 572 573 574 575 576 577 578
	bch2_trans_init(&trans, c, 0, 512);
	ret = __bch2_trans_do(&trans, &as->disk_res, &journal_seq,
			      BTREE_INSERT_NOFAIL|
			      BTREE_INSERT_NOCHECK_RW|
			      BTREE_INSERT_JOURNAL_RECLAIM|
			      BTREE_INSERT_JOURNAL_RESERVED,
			      btree_update_nodes_written_trans(&trans, as));
	bch2_trans_exit(&trans);
579

580 581 582
	bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
			     "error %i in btree_update_nodes_written()", ret);
err:
583
	if (b) {
584
		/*
585 586
		 * @b is the node we did the final insert into:
		 *
587 588 589 590 591 592 593 594
		 * On failure to get a journal reservation, we still have to
		 * unblock the write and allow most of the write path to happen
		 * so that shutdown works, but the i->journal_seq mechanism
		 * won't work to prevent the btree write from being visible (we
		 * didn't get a journal sequence number) - instead
		 * __bch2_btree_node_write() doesn't do the actual write if
		 * we're in journal error state:
		 */
595

596 597 598 599
		btree_node_lock_type(c, b, SIX_LOCK_intent);
		btree_node_lock_type(c, b, SIX_LOCK_write);
		mutex_lock(&c->btree_interior_update_lock);

600
		list_del(&as->write_blocked_list);
601

602 603 604 605 606
		/*
		 * Node might have been freed, recheck under
		 * btree_interior_update_lock:
		 */
		if (as->b == b) {
607 608
			struct bset *i = btree_bset_last(b);

609 610 611
			BUG_ON(!b->c.level);
			BUG_ON(!btree_node_dirty(b));

612 613 614 615 616 617 618 619 620 621 622 623 624 625
			if (!ret) {
				i->journal_seq = cpu_to_le64(
					max(journal_seq,
					    le64_to_cpu(i->journal_seq)));

				bch2_btree_add_journal_pin(c, b, journal_seq);
			} else {
				/*
				 * If we didn't get a journal sequence number we
				 * can't write this btree node, because recovery
				 * won't know to ignore this write:
				 */
				set_btree_node_never_write(b);
			}
626 627
		}

628
		mutex_unlock(&c->btree_interior_update_lock);
629
		six_unlock_write(&b->c.lock);
630

631 632
		btree_node_write_if_need(c, b, SIX_LOCK_intent);
		six_unlock_intent(&b->c.lock);
633 634
	}

635 636 637
	bch2_journal_pin_drop(&c->journal, &as->journal);

	bch2_journal_preres_put(&c->journal, &as->journal_preres);
638

639 640 641
	mutex_lock(&c->btree_interior_update_lock);
	for (i = 0; i < as->nr_new_nodes; i++) {
		b = as->new_nodes[i];
642

643 644
		BUG_ON(b->will_make_reachable != (unsigned long) as);
		b->will_make_reachable = 0;
645 646 647 648 649
	}
	mutex_unlock(&c->btree_interior_update_lock);

	for (i = 0; i < as->nr_new_nodes; i++) {
		b = as->new_nodes[i];
650

651 652 653
		btree_node_lock_type(c, b, SIX_LOCK_read);
		btree_node_write_if_need(c, b, SIX_LOCK_read);
		six_unlock_read(&b->c.lock);
654
	}
655

656 657
	for (i = 0; i < as->nr_open_buckets; i++)
		bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);
658

659 660
	bch2_btree_update_free(as);
}
661

662 663 664 665 666
static void btree_interior_update_work(struct work_struct *work)
{
	struct bch_fs *c =
		container_of(work, struct bch_fs, btree_interior_update_work);
	struct btree_update *as;
667

668 669 670 671 672 673 674
	while (1) {
		mutex_lock(&c->btree_interior_update_lock);
		as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
					      struct btree_update, unwritten_list);
		if (as && !as->nodes_written)
			as = NULL;
		mutex_unlock(&c->btree_interior_update_lock);
675

676 677 678 679
		if (!as)
			break;

		btree_update_nodes_written(as);
680
	}
681 682 683 684 685 686
}

static void btree_update_set_nodes_written(struct closure *cl)
{
	struct btree_update *as = container_of(cl, struct btree_update, cl);
	struct bch_fs *c = as->c;
687 688

	mutex_lock(&c->btree_interior_update_lock);
689 690 691 692
	as->nodes_written = true;
	mutex_unlock(&c->btree_interior_update_lock);

	queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
693 694 695 696 697 698 699 700 701 702 703
}

/*
 * We're updating @b with pointers to nodes that haven't finished writing yet:
 * block @b from being written until @as completes
 */
static void btree_update_updated_node(struct btree_update *as, struct btree *b)
{
	struct bch_fs *c = as->c;

	mutex_lock(&c->btree_interior_update_lock);
704
	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
705 706 707 708

	BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
	BUG_ON(!btree_node_dirty(b));

709 710
	as->mode	= BTREE_INTERIOR_UPDATING_NODE;
	as->b		= b;
711 712 713 714 715 716 717 718 719 720
	list_add(&as->write_blocked_list, &b->write_blocked);

	mutex_unlock(&c->btree_interior_update_lock);
}

static void btree_update_reparent(struct btree_update *as,
				  struct btree_update *child)
{
	struct bch_fs *c = as->c;

721 722
	lockdep_assert_held(&c->btree_interior_update_lock);

723 724 725
	child->b = NULL;
	child->mode = BTREE_INTERIOR_UPDATING_AS;

726
	bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
727 728
}

729
static void btree_update_updated_root(struct btree_update *as, struct btree *b)
730
{
731
	struct bkey_i *insert = &b->key;
732 733 734
	struct bch_fs *c = as->c;

	BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
735 736 737 738 739 740 741 742 743

	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
	       ARRAY_SIZE(as->journal_entries));

	as->journal_u64s +=
		journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
				  BCH_JSET_ENTRY_btree_root,
				  b->c.btree_id, b->c.level,
				  insert, insert->k.u64s);
744

745 746
	mutex_lock(&c->btree_interior_update_lock);
	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
747

748
	as->mode	= BTREE_INTERIOR_UPDATING_ROOT;
749 750 751
	mutex_unlock(&c->btree_interior_update_lock);
}

752 753 754 755 756 757 758 759 760 761 762 763
/*
 * bch2_btree_update_add_new_node:
 *
 * This causes @as to wait on @b to be written, before it gets to
 * bch2_btree_update_nodes_written
 *
 * Additionally, it sets b->will_make_reachable to prevent any additional writes
 * to @b from happening besides the first until @b is reachable on disk
 *
 * And it adds @b to the list of @as's new nodes, so that we can update sector
 * counts in bch2_btree_update_nodes_written:
 */
764
static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
765 766 767
{
	struct bch_fs *c = as->c;

768 769
	closure_get(&as->cl);

770 771 772 773 774 775 776 777
	mutex_lock(&c->btree_interior_update_lock);
	BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
	BUG_ON(b->will_make_reachable);

	as->new_nodes[as->nr_new_nodes++] = b;
	b->will_make_reachable = 1UL|(unsigned long) as;

	mutex_unlock(&c->btree_interior_update_lock);
778 779

	btree_update_will_add_key(as, &b->key);
780 781
}

782 783 784
/*
 * returns true if @b was a new node
 */
785 786 787 788 789 790 791
static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
{
	struct btree_update *as;
	unsigned long v;
	unsigned i;

	mutex_lock(&c->btree_interior_update_lock);
792 793 794 795 796
	/*
	 * When b->will_make_reachable != 0, it owns a ref on as->cl that's
	 * dropped when it gets written by bch2_btree_complete_write - the
	 * xchg() is for synchronization with bch2_btree_complete_write:
	 */
797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817
	v = xchg(&b->will_make_reachable, 0);
	as = (struct btree_update *) (v & ~1UL);

	if (!as) {
		mutex_unlock(&c->btree_interior_update_lock);
		return;
	}

	for (i = 0; i < as->nr_new_nodes; i++)
		if (as->new_nodes[i] == b)
			goto found;

	BUG();
found:
	array_remove_item(as->new_nodes, as->nr_new_nodes, i);
	mutex_unlock(&c->btree_interior_update_lock);

	if (v & 1)
		closure_put(&as->cl);
}

818
static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
819
{
820 821 822
	while (b->ob.nr)
		as->open_buckets[as->nr_open_buckets++] =
			b->ob.v[--b->ob.nr];
823 824 825 826 827 828 829
}

/*
 * @b is being split/rewritten: it may have pointers to not-yet-written btree
 * nodes and thus outstanding btree_updates - redirect @b's
 * btree_updates to point to this btree_update:
 */
830
static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852
					       struct btree *b)
{
	struct bch_fs *c = as->c;
	struct btree_update *p, *n;
	struct btree_write *w;

	set_btree_node_dying(b);

	if (btree_node_fake(b))
		return;

	mutex_lock(&c->btree_interior_update_lock);

	/*
	 * Does this node have any btree_update operations preventing
	 * it from being written?
	 *
	 * If so, redirect them to point to this btree_update: we can
	 * write out our new nodes, but we won't make them visible until those
	 * operations complete
	 */
	list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
853
		list_del_init(&p->write_blocked_list);
854
		btree_update_reparent(as, p);
855 856 857 858 859 860

		/*
		 * for flush_held_btree_writes() waiting on updates to flush or
		 * nodes to be writeable:
		 */
		closure_wake_up(&c->btree_interior_update_wait);
861 862
	}

863
	clear_btree_node_dirty(c, b);
864 865 866 867 868 869 870 871 872 873
	clear_btree_node_need_write(b);

	/*
	 * Does this node have unwritten data that has a pin on the journal?
	 *
	 * If so, transfer that pin to the btree_update operation -
	 * note that if we're freeing multiple nodes, we only need to keep the
	 * oldest pin of any of the nodes we're freeing. We'll release the pin
	 * when the new nodes are persistent and reachable on disk:
	 */
874 875
	w = btree_current_write(b);
	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
876 877 878
	bch2_journal_pin_drop(&c->journal, &w->journal);

	w = btree_prev_write(b);
879
	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
880 881 882
	bch2_journal_pin_drop(&c->journal, &w->journal);

	mutex_unlock(&c->btree_interior_update_lock);
883 884 885 886 887 888 889 890 891 892 893 894 895

	/*
	 * Is this a node that isn't reachable on disk yet?
	 *
	 * Nodes that aren't reachable yet have writes blocked until they're
	 * reachable - now that we've cancelled any pending writes and moved
	 * things waiting on that write to wait on this update, we can drop this
	 * node from the list of nodes that the other update is making
	 * reachable, prior to freeing it:
	 */
	btree_update_drop_new_node(c, b);

	btree_update_will_delete_key(as, &b->key);
896

897 898 899
	as->old_nodes[as->nr_old_nodes] = b;
	as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
	as->nr_old_nodes++;
900 901
}

902
static void bch2_btree_update_done(struct btree_update *as)
903 904 905
{
	BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);

906 907 908 909
	if (as->took_gc_lock)
		up_read(&as->c->gc_lock);
	as->took_gc_lock = false;

910
	bch2_btree_reserve_put(as);
911

912 913
	continue_at(&as->cl, btree_update_set_nodes_written,
		    as->c->btree_interior_update_worker);
914 915
}

916
static struct btree_update *
Kent Overstreet's avatar
Kent Overstreet committed
917
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
918
			unsigned level, unsigned nr_nodes, unsigned flags)
919
{
920
	struct bch_fs *c = trans->c;
921
	struct btree_update *as;
922
	struct closure cl;
923
	int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
924
		? BCH_DISK_RESERVATION_NOFAIL : 0;
925
	int journal_flags = 0;
926
	int ret = 0;
927

Kent Overstreet's avatar
Kent Overstreet committed
928
	BUG_ON(!path->should_be_locked);
929

930 931 932
	if (flags & BTREE_INSERT_JOURNAL_RESERVED)
		journal_flags |= JOURNAL_RES_GET_RESERVED;

933 934
	closure_init_stack(&cl);
retry:
935

936 937 938 939
	/*
	 * XXX: figure out how far we might need to split,
	 * instead of locking/reserving all the way to the root:
	 */
Kent Overstreet's avatar
Kent Overstreet committed
940
	if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) {
941
		trace_trans_restart_iter_upgrade(trans->ip, _RET_IP_,
Kent Overstreet's avatar
Kent Overstreet committed
942 943 944
						 path->btree_id, &path->pos);
		ret = btree_trans_restart(trans);
		return ERR_PTR(ret);
945 946 947 948 949 950 951 952 953 954 955 956 957
	}

	if (flags & BTREE_INSERT_GC_LOCK_HELD)
		lockdep_assert_held(&c->gc_lock);
	else if (!down_read_trylock(&c->gc_lock)) {
		bch2_trans_unlock(trans);
		down_read(&c->gc_lock);
		if (!bch2_trans_relock(trans)) {
			up_read(&c->gc_lock);
			return ERR_PTR(-EINTR);
		}
	}

958 959 960 961 962
	as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
	memset(as, 0, sizeof(*as));
	closure_init(&as->cl, NULL);
	as->c		= c;
	as->mode	= BTREE_INTERIOR_NO_UPDATE;
963
	as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
Kent Overstreet's avatar
Kent Overstreet committed
964
	as->btree_id	= path->btree_id;
965 966 967 968 969 970
	INIT_LIST_HEAD(&as->list);
	INIT_LIST_HEAD(&as->unwritten_list);
	INIT_LIST_HEAD(&as->write_blocked_list);
	bch2_keylist_init(&as->old_keys, as->_old_keys);
	bch2_keylist_init(&as->new_keys, as->_new_keys);
	bch2_keylist_init(&as->parent_keys, as->inline_keys);
971

972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987
	mutex_lock(&c->btree_interior_update_lock);
	list_add_tail(&as->list, &c->btree_interior_update_list);
	mutex_unlock(&c->btree_interior_update_lock);

	/*
	 * We don't want to allocate if we're in an error state, that can cause
	 * deadlock on emergency shutdown due to open buckets getting stuck in
	 * the btree_reserve_cache after allocator shutdown has cleared it out.
	 * This check needs to come after adding us to the btree_interior_update
	 * list but before calling bch2_btree_reserve_get, to synchronize with
	 * __bch2_fs_read_only().
	 */
	ret = bch2_journal_error(&c->journal);
	if (ret)
		goto err;

988 989 990
	ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
				      BTREE_UPDATE_JOURNAL_RES,
				      journal_flags|JOURNAL_RES_GET_NONBLOCK);
991 992 993
	if (ret == -EAGAIN) {
		bch2_trans_unlock(trans);

994 995
		if (flags & BTREE_INSERT_JOURNAL_RECLAIM) {
			bch2_btree_update_free(as);
996
			btree_trans_restart(trans);
997 998
			return ERR_PTR(ret);
		}
999

1000
		ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
1001 1002
				BTREE_UPDATE_JOURNAL_RES,
				journal_flags);
1003
		if (ret) {
1004
			trace_trans_restart_journal_preres_get(trans->ip, _RET_IP_);
1005
			goto err;
1006
		}
1007 1008

		if (!bch2_trans_relock(trans)) {
1009 1010
			ret = -EINTR;
			goto err;
1011 1012 1013
		}
	}

1014 1015 1016 1017 1018 1019
	ret = bch2_disk_reservation_get(c, &as->disk_res,
			nr_nodes * c->opts.btree_node_size,
			c->opts.metadata_replicas,
			disk_res_flags);
	if (ret)
		goto err;
1020

1021
	ret = bch2_btree_reserve_get(as, nr_nodes, flags, &cl);
1022 1023
	if (ret)
		goto err;
1024

1025 1026 1027 1028
	bch2_journal_pin_add(&c->journal,
			     atomic64_read(&c->journal.seq),
			     &as->journal, NULL);

1029
	return as;
1030 1031
err:
	bch2_btree_update_free(as);
1032 1033 1034 1035 1036 1037 1038 1039 1040 1041

	if (ret == -EAGAIN) {
		bch2_trans_unlock(trans);
		closure_sync(&cl);
		ret = -EINTR;
	}

	if (ret == -EINTR && bch2_trans_relock(trans))
		goto retry;

1042
	return ERR_PTR(ret);
1043 1044 1045 1046
}

/* Btree root updates: */

1047
static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
1048 1049 1050 1051 1052 1053
{
	/* Root nodes cannot be reaped */
	mutex_lock(&c->btree_cache.lock);
	list_del_init(&b->list);
	mutex_unlock(&c->btree_cache.lock);

1054 1055 1056 1057 1058
	if (b->c.level)
		six_lock_pcpu_alloc(&b->c.lock);
	else
		six_lock_pcpu_free(&b->c.lock);

1059 1060
	mutex_lock(&c->btree_root_lock);
	BUG_ON(btree_node_root(c, b) &&
1061
	       (b->c.level < btree_node_root(c, b)->c.level ||
1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081
		!btree_node_dying(btree_node_root(c, b))));

	btree_node_root(c, b) = b;
	mutex_unlock(&c->btree_root_lock);

	bch2_recalc_btree_reserve(c);
}

/**
 * bch_btree_set_root - update the root in memory and on disk
 *
 * To ensure forward progress, the current task must not be holding any
 * btree node write locks. However, you must hold an intent lock on the
 * old root.
 *
 * Note: This allocates a journal entry but doesn't add any keys to
 * it.  All the btree roots are part of every journal write, so there
 * is nothing new to be done.  This just guarantees that there is a
 * journal write.
 */
1082 1083
static void bch2_btree_set_root(struct btree_update *as,
				struct btree_trans *trans,
Kent Overstreet's avatar
Kent Overstreet committed
1084
				struct btree_path *path,
1085
				struct btree *b)
1086 1087 1088 1089 1090
{
	struct bch_fs *c = as->c;
	struct btree *old;

	trace_btree_set_root(c, b);
1091 1092
	BUG_ON(!b->written &&
	       !test_bit(BCH_FS_HOLD_BTREE_WRITES, &c->flags));
1093 1094 1095 1096 1097 1098 1099

	old = btree_node_root(c, b);

	/*
	 * Ensure no one is using the old root while we switch to the
	 * new root:
	 */
Kent Overstreet's avatar
Kent Overstreet committed
1100
	bch2_btree_node_lock_write(trans, path, old);
1101

1102
	bch2_btree_set_root_inmem(c, b);
1103

1104
	btree_update_updated_root(as, b);
1105 1106 1107 1108 1109 1110 1111 1112

	/*
	 * Unlock old root after new root is visible:
	 *
	 * The new root isn't persistent, but that's ok: we still have
	 * an intent lock on the new root, and any updates that would
	 * depend on the new root would have to update the new root.
	 */
Kent Overstreet's avatar
Kent Overstreet committed
1113
	bch2_btree_node_unlock_write(trans, path, old);
1114 1115 1116 1117
}

/* Interior node updates: */

1118 1119
static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
					struct btree_trans *trans,
Kent Overstreet's avatar
Kent Overstreet committed
1120
					struct btree_path *path,
1121 1122 1123
					struct btree *b,
					struct btree_node_iter *node_iter,
					struct bkey_i *insert)
1124
{
1125
	struct bch_fs *c = as->c;
1126
	struct bkey_packed *k;
1127 1128
	const char *invalid;

1129 1130 1131
	BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
	       !btree_ptr_sectors_written(insert));

1132 1133 1134 1135 1136 1137 1138 1139 1140
	invalid = bch2_bkey_invalid(c, bkey_i_to_s_c(insert), btree_node_type(b)) ?:
		bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert));
	if (invalid) {
		char buf[160];

		bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert));
		bch2_fs_inconsistent(c, "inserting invalid bkey %s: %s", buf, invalid);
		dump_stack();
	}
1141

1142 1143 1144
	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
	       ARRAY_SIZE(as->journal_entries));

1145 1146 1147 1148 1149
	as->journal_u64s +=
		journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
				  BCH_JSET_ENTRY_btree_keys,
				  b->c.btree_id, b->c.level,
				  insert, insert->k.u64s);
1150 1151

	while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
1152
	       bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
1153 1154
		bch2_btree_node_iter_advance(node_iter, b);

Kent Overstreet's avatar
Kent Overstreet committed
1155
	bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
1156
	set_btree_node_dirty(c, b);
1157 1158 1159
	set_btree_node_need_write(b);
}

1160
static void
1161 1162
__bch2_btree_insert_keys_interior(struct btree_update *as,
				  struct btree_trans *trans,
Kent Overstreet's avatar
Kent Overstreet committed
1163
				  struct btree_path *path,
1164 1165 1166
				  struct btree *b,
				  struct btree_node_iter node_iter,
				  struct keylist *keys)
1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177
{
	struct bkey_i *insert = bch2_keylist_front(keys);
	struct bkey_packed *k;

	BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);

	while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
	       (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
		;

	while (!bch2_keylist_empty(keys)) {
Kent Overstreet's avatar
Kent Overstreet committed
1178
		bch2_insert_fixup_btree_ptr(as, trans, path, b,
1179
				&node_iter, bch2_keylist_front(keys));
1180 1181 1182 1183
		bch2_keylist_pop_front(keys);
	}
}

1184 1185 1186 1187 1188
/*
 * Move keys from n1 (original replacement node, now lower node) to n2 (higher
 * node)
 */
static struct btree *__btree_split_node(struct btree_update *as,
Kent Overstreet's avatar
Kent Overstreet committed
1189
					struct btree *n1)
1190
{
1191
	struct bkey_format_state s;
1192 1193 1194
	size_t nr_packed = 0, nr_unpacked = 0;
	struct btree *n2;
	struct bset *set1, *set2;
1195
	struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL;
1196
	struct bpos n1_pos;
1197

1198
	n2 = bch2_btree_node_alloc(as, n1->c.level);
1199
	bch2_btree_update_add_new_node(as, n2);
1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214

	n2->data->max_key	= n1->data->max_key;
	n2->data->format	= n1->format;
	SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
	n2->key.k.p = n1->key.k.p;

	set1 = btree_bset_first(n1);
	set2 = btree_bset_first(n2);

	/*
	 * Has to be a linear search because we don't have an auxiliary
	 * search tree yet
	 */
	k = set1->start;
	while (1) {
1215
		struct bkey_packed *n = bkey_next(k);
Kent Overstreet's avatar
Kent Overstreet committed
1216 1217

		if (n == vstruct_last(set1))
1218 1219 1220 1221 1222 1223 1224 1225 1226 1227
			break;
		if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
			break;

		if (bkey_packed(k))
			nr_packed++;
		else
			nr_unpacked++;

		prev = k;
Kent Overstreet's avatar
Kent Overstreet committed
1228
		k = n;
1229 1230 1231
	}

	BUG_ON(!prev);
1232 1233
	set2_start	= k;
	set2_end	= vstruct_last(set1);
1234

1235
	set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data);
1236 1237 1238 1239 1240 1241 1242
	set_btree_bset_end(n1, n1->set);

	n1->nr.live_u64s	= le16_to_cpu(set1->u64s);
	n1->nr.bset_u64s[0]	= le16_to_cpu(set1->u64s);
	n1->nr.packed_keys	= nr_packed;
	n1->nr.unpacked_keys	= nr_unpacked;

1243 1244 1245 1246 1247 1248
	n1_pos = bkey_unpack_pos(n1, prev);
	if (as->c->sb.version < bcachefs_metadata_version_snapshot)
		n1_pos.snapshot = U32_MAX;

	btree_set_max(n1, n1_pos);
	btree_set_min(n2, bpos_successor(n1->key.k.p));
1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275

	bch2_bkey_format_init(&s);
	bch2_bkey_format_add_pos(&s, n2->data->min_key);
	bch2_bkey_format_add_pos(&s, n2->data->max_key);

	for (k = set2_start; k != set2_end; k = bkey_next(k)) {
		struct bkey uk = bkey_unpack_key(n1, k);
		bch2_bkey_format_add_key(&s, &uk);
	}

	n2->data->format = bch2_bkey_format_done(&s);
	btree_node_set_format(n2, n2->data->format);

	out = set2->start;
	memset(&n2->nr, 0, sizeof(n2->nr));

	for (k = set2_start; k != set2_end; k = bkey_next(k)) {
		BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k)
				       ? &n1->format : &bch2_bkey_format_current, k));
		out->format = KEY_FORMAT_LOCAL_BTREE;
		btree_keys_account_key_add(&n2->nr, 0, out);
		out = bkey_next(out);
	}

	set2->u64s = cpu_to_le16((u64 *) out - set2->_data);
	set_btree_bset_end(n2, n2->set);

1276 1277 1278 1279 1280 1281 1282 1283 1284
	BUG_ON(!set1->u64s);
	BUG_ON(!set2->u64s);

	btree_node_reset_sib_u64s(n1);
	btree_node_reset_sib_u64s(n2);

	bch2_verify_btree_nr_keys(n1);
	bch2_verify_btree_nr_keys(n2);

1285
	if (n1->c.level) {
1286 1287
		btree_node_interior_verify(as->c, n1);
		btree_node_interior_verify(as->c, n2);
1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303
	}

	return n2;
}

/*
 * For updates to interior nodes, we've got to do the insert before we split
 * because the stuff we're inserting has to be inserted atomically. Post split,
 * the keys might have to go in different nodes and the split would no longer be
 * atomic.
 *
 * Worse, if the insert is from btree node coalescing, if we do the insert after
 * we do the split (and pick the pivot) - the pivot we pick might be between
 * nodes that were coalesced, and thus in the middle of a child node post
 * coalescing:
 */
1304 1305
static void btree_split_insert_keys(struct btree_update *as,
				    struct btree_trans *trans,
Kent Overstreet's avatar
Kent Overstreet committed
1306
				    struct btree_path *path,
1307
				    struct btree *b,
1308 1309 1310 1311
				    struct keylist *keys)
{
	struct btree_node_iter node_iter;
	struct bkey_i *k = bch2_keylist_front(keys);
Kent Overstreet's avatar
Kent Overstreet committed
1312
	struct bkey_packed *src, *dst, *n;
1313 1314
	struct bset *i;

1315
	bch2_btree_node_iter_init(&node_iter, b, &k->k.p);
1316

Kent Overstreet's avatar
Kent Overstreet committed
1317
	__bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
1318 1319 1320 1321 1322 1323 1324

	/*
	 * We can't tolerate whiteouts here - with whiteouts there can be
	 * duplicate keys, and it would be rather bad if we picked a duplicate
	 * for the pivot:
	 */
	i = btree_bset_first(b);
Kent Overstreet's avatar
Kent Overstreet committed
1325 1326
	src = dst = i->start;
	while (src != vstruct_last(i)) {
1327
		n = bkey_next(src);
Kent Overstreet's avatar
Kent Overstreet committed
1328 1329 1330 1331 1332 1333 1334
		if (!bkey_deleted(src)) {
			memmove_u64s_down(dst, src, src->u64s);
			dst = bkey_next(dst);
		}
		src = n;
	}

1335 1336 1337
	/* Also clear out the unwritten whiteouts area: */
	b->whiteout_u64s = 0;

Kent Overstreet's avatar
Kent Overstreet committed
1338 1339
	i->u64s = cpu_to_le16((u64 *) dst - i->_data);
	set_btree_bset_end(b, b->set);
1340 1341 1342 1343

	BUG_ON(b->nsets != 1 ||
	       b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));

1344
	btree_node_interior_verify(as->c, b);
1345 1346
}

Kent Overstreet's avatar
Kent Overstreet committed
1347 1348 1349
static void btree_split(struct btree_update *as, struct btree_trans *trans,
			struct btree_path *path, struct btree *b,
			struct keylist *keys, unsigned flags)
1350 1351
{
	struct bch_fs *c = as->c;
Kent Overstreet's avatar
Kent Overstreet committed
1352
	struct btree *parent = btree_node_parent(path, b);
1353 1354 1355 1356
	struct btree *n1, *n2 = NULL, *n3 = NULL;
	u64 start_time = local_clock();

	BUG_ON(!parent && (b != btree_node_root(c, b)));
Kent Overstreet's avatar
Kent Overstreet committed
1357
	BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
1358 1359 1360 1361

	bch2_btree_interior_update_will_free_node(as, b);

	n1 = bch2_btree_node_alloc_replacement(as, b);
1362
	bch2_btree_update_add_new_node(as, n1);
1363 1364

	if (keys)
Kent Overstreet's avatar
Kent Overstreet committed
1365
		btree_split_insert_keys(as, trans, path, n1, keys);
1366

1367
	if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
1368 1369
		trace_btree_split(c, b);

Kent Overstreet's avatar
Kent Overstreet committed
1370
		n2 = __btree_split_node(as, n1);
1371 1372 1373

		bch2_btree_build_aux_trees(n2);
		bch2_btree_build_aux_trees(n1);
1374 1375
		six_unlock_write(&n2->c.lock);
		six_unlock_write(&n1->c.lock);
1376

1377
		bch2_btree_node_write(c, n1, SIX_LOCK_intent);
1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389
		bch2_btree_node_write(c, n2, SIX_LOCK_intent);

		/*
		 * Note that on recursive parent_keys == keys, so we
		 * can't start adding new keys to parent_keys before emptying it
		 * out (which we did with btree_split_insert_keys() above)
		 */
		bch2_keylist_add(&as->parent_keys, &n1->key);
		bch2_keylist_add(&as->parent_keys, &n2->key);

		if (!parent) {
			/* Depth increases, make a new root */
1390
			n3 = __btree_root_alloc(as, b->c.level + 1);
1391 1392 1393 1394

			n3->sib_u64s[0] = U16_MAX;
			n3->sib_u64s[1] = U16_MAX;

Kent Overstreet's avatar
Kent Overstreet committed
1395
			btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
1396 1397 1398 1399 1400 1401 1402

			bch2_btree_node_write(c, n3, SIX_LOCK_intent);
		}
	} else {
		trace_btree_compact(c, b);

		bch2_btree_build_aux_trees(n1);
1403
		six_unlock_write(&n1->c.lock);
1404

1405 1406
		bch2_btree_node_write(c, n1, SIX_LOCK_intent);

1407 1408
		if (parent)
			bch2_keylist_add(&as->parent_keys, &n1->key);
1409 1410 1411 1412 1413 1414
	}

	/* New nodes all written, now make them visible: */

	if (parent) {
		/* Split a non root node */
Kent Overstreet's avatar
Kent Overstreet committed
1415
		bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
1416
	} else if (n3) {
Kent Overstreet's avatar
Kent Overstreet committed
1417
		bch2_btree_set_root(as, trans, path, n3);
1418 1419
	} else {
		/* Root filled up but didn't need to be split */
Kent Overstreet's avatar
Kent Overstreet committed
1420
		bch2_btree_set_root(as, trans, path, n1);
1421 1422
	}

1423
	bch2_btree_update_get_open_buckets(as, n1);
1424
	if (n2)
1425
		bch2_btree_update_get_open_buckets(as, n2);
1426
	if (n3)
1427
		bch2_btree_update_get_open_buckets(as, n3);
1428

Kent Overstreet's avatar
Kent Overstreet committed
1429
	/* Successful split, update the path to point to the new nodes: */
1430

1431
	six_lock_increment(&b->c.lock, SIX_LOCK_intent);
1432
	if (n3)
1433
		bch2_trans_node_add(trans, n3);
1434
	if (n2)
1435 1436
		bch2_trans_node_add(trans, n2);
	bch2_trans_node_add(trans, n1);
1437

1438 1439 1440 1441 1442 1443
	/*
	 * The old node must be freed (in memory) _before_ unlocking the new
	 * nodes - else another thread could re-acquire a read lock on the old
	 * node after another thread has locked and updated the new node, thus
	 * seeing stale data:
	 */
1444
	bch2_btree_node_free_inmem(trans, b);
1445

1446 1447 1448 1449 1450 1451
	if (n3)
		six_unlock_intent(&n3->c.lock);
	if (n2)
		six_unlock_intent(&n2->c.lock);
	six_unlock_intent(&n1->c.lock);

Kent Overstreet's avatar
Kent Overstreet committed
1452
	bch2_trans_verify_locks(trans);
1453

1454 1455
	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_split],
			       start_time);
1456 1457 1458
}

static void
1459 1460
bch2_btree_insert_keys_interior(struct btree_update *as,
				struct btree_trans *trans,
Kent Overstreet's avatar
Kent Overstreet committed
1461
				struct btree_path *path,
1462 1463
				struct btree *b,
				struct keylist *keys)
1464
{
Kent Overstreet's avatar
Kent Overstreet committed
1465
	struct btree_path *linked;
1466

Kent Overstreet's avatar
Kent Overstreet committed
1467 1468
	__bch2_btree_insert_keys_interior(as, trans, path, b,
					  path->l[b->c.level].iter, keys);
1469 1470 1471

	btree_update_updated_node(as, b);

Kent Overstreet's avatar
Kent Overstreet committed
1472
	trans_for_each_path_with_node(trans, b, linked)
1473
		bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
1474

Kent Overstreet's avatar
Kent Overstreet committed
1475
	bch2_trans_verify_paths(trans);
1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489
}

/**
 * bch_btree_insert_node - insert bkeys into a given btree node
 *
 * @iter:		btree iterator
 * @keys:		list of keys to insert
 * @hook:		insert callback
 * @persistent:		if not null, @persistent will wait on journal write
 *
 * Inserts as many keys as it can into a given btree node, splitting it if full.
 * If a split occurred, this function will return early. This can only happen
 * for leaf nodes -- inserts into interior nodes have to be atomic.
 */
Kent Overstreet's avatar
Kent Overstreet committed
1490 1491 1492
static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
				   struct btree_path *path, struct btree *b,
				   struct keylist *keys, unsigned flags)
1493 1494 1495 1496 1497 1498
{
	struct bch_fs *c = as->c;
	int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
	int old_live_u64s = b->nr.live_u64s;
	int live_u64s_added, u64s_added;

1499
	lockdep_assert_held(&c->gc_lock);
Kent Overstreet's avatar
Kent Overstreet committed
1500
	BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
1501
	BUG_ON(!b->c.level);
1502 1503 1504
	BUG_ON(!as || as->b);
	bch2_verify_keylist_sorted(keys);

Kent Overstreet's avatar
Kent Overstreet committed
1505
	bch2_btree_node_lock_for_insert(trans, path, b);
1506

1507
	if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
Kent Overstreet's avatar
Kent Overstreet committed
1508
		bch2_btree_node_unlock_write(trans, path, b);
1509 1510 1511
		goto split;
	}

1512 1513
	btree_node_interior_verify(c, b);

Kent Overstreet's avatar
Kent Overstreet committed
1514
	bch2_btree_insert_keys_interior(as, trans, path, b, keys);
1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525

	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
	u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;

	if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
	if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);

	if (u64s_added > live_u64s_added &&
	    bch2_maybe_compact_whiteouts(c, b))
1526
		bch2_trans_node_reinit_iter(trans, b);
1527

Kent Overstreet's avatar
Kent Overstreet committed
1528
	bch2_btree_node_unlock_write(trans, path, b);
1529

1530
	btree_node_interior_verify(c, b);
1531 1532
	return;
split:
Kent Overstreet's avatar
Kent Overstreet committed
1533
	btree_split(as, trans, path, b, keys, flags);
1534 1535
}

1536
int bch2_btree_split_leaf(struct btree_trans *trans,
Kent Overstreet's avatar
Kent Overstreet committed
1537
			  struct btree_path *path,
1538 1539
			  unsigned flags)
{
1540
	struct bch_fs *c = trans->c;
Kent Overstreet's avatar
Kent Overstreet committed
1541
	struct btree *b = path_l(path)->b;
1542
	struct btree_update *as;
1543 1544
	unsigned l;
	int ret = 0;
1545

Kent Overstreet's avatar
Kent Overstreet committed
1546
	as = bch2_btree_update_start(trans, path, path->level,
1547 1548 1549
		btree_update_reserve_required(c, b), flags);
	if (IS_ERR(as))
		return PTR_ERR(as);
1550

Kent Overstreet's avatar
Kent Overstreet committed
1551
	btree_split(as, trans, path, b, NULL, flags);
1552
	bch2_btree_update_done(as);
1553

Kent Overstreet's avatar
Kent Overstreet committed
1554 1555
	for (l = path->level + 1; btree_path_node(path, l) && !ret; l++)
		ret = bch2_foreground_maybe_merge(trans, path, l, flags);
1556 1557

	return ret;
1558 1559
}

1560
int __bch2_foreground_maybe_merge(struct btree_trans *trans,
Kent Overstreet's avatar
Kent Overstreet committed
1561
				  struct btree_path *path,
1562 1563 1564
				  unsigned level,
				  unsigned flags,
				  enum btree_node_sibling sib)
1565
{
1566
	struct bch_fs *c = trans->c;
Kent Overstreet's avatar
Kent Overstreet committed
1567
	struct btree_path *sib_path = NULL;
1568 1569 1570 1571 1572
	struct btree_update *as;
	struct bkey_format_state new_s;
	struct bkey_format new_f;
	struct bkey_i delete;
	struct btree *b, *m, *n, *prev, *next, *parent;
1573
	struct bpos sib_pos;
1574
	size_t sib_u64s;
1575
	int ret = 0, ret2 = 0;
1576

1577
retry:
Kent Overstreet's avatar
Kent Overstreet committed
1578
	ret = bch2_btree_path_traverse(trans, path, false);
1579
	if (ret)
1580
		return ret;
1581

Kent Overstreet's avatar
Kent Overstreet committed
1582 1583
	BUG_ON(!path->should_be_locked);
	BUG_ON(!btree_node_locked(path, level));
1584

Kent Overstreet's avatar
Kent Overstreet committed
1585
	b = path->l[level].b;
1586

1587
	if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) ||
1588
	    (sib == btree_next_sib && !bpos_cmp(b->data->max_key, SPOS_MAX))) {
1589
		b->sib_u64s[sib] = U16_MAX;
1590
		goto out;
1591
	}
1592

1593 1594 1595
	sib_pos = sib == btree_prev_sib
		? bpos_predecessor(b->data->min_key)
		: bpos_successor(b->data->max_key);
1596

Kent Overstreet's avatar
Kent Overstreet committed
1597 1598 1599
	sib_path = bch2_path_get(trans, false, path->btree_id,
				 sib_pos, U8_MAX, level, true);
	ret = bch2_btree_path_traverse(trans, sib_path, false);
1600
	if (ret)
1601 1602
		goto err;

Kent Overstreet's avatar
Kent Overstreet committed
1603 1604 1605
	sib_path->should_be_locked = true;

	m = sib_path->l[level].b;
1606

Kent Overstreet's avatar
Kent Overstreet committed
1607 1608
	if (btree_node_parent(path, b) !=
	    btree_node_parent(sib_path, m)) {
1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620
		b->sib_u64s[sib] = U16_MAX;
		goto out;
	}

	if (sib == btree_prev_sib) {
		prev = m;
		next = b;
	} else {
		prev = b;
		next = m;
	}

1621 1622 1623 1624 1625
	if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) {
		char buf1[100], buf2[100];

		bch2_bpos_to_text(&PBUF(buf1), prev->data->max_key);
		bch2_bpos_to_text(&PBUF(buf2), next->data->min_key);
1626 1627 1628 1629 1630 1631
		bch_err(c,
			"btree topology error in btree merge:\n"
			"  prev ends at   %s\n"
			"  next starts at %s",
			buf1, buf2);
		bch2_topology_error(c);
1632 1633 1634
		ret = -EIO;
		goto err;
	}
1635

1636
	bch2_bkey_format_init(&new_s);
1637 1638 1639 1640
	bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
	__bch2_btree_calc_format(&new_s, prev);
	__bch2_btree_calc_format(&new_s, next);
	bch2_bkey_format_add_pos(&new_s, next->data->max_key);
1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652
	new_f = bch2_bkey_format_done(&new_s);

	sib_u64s = btree_node_u64s_with_format(b, &new_f) +
		btree_node_u64s_with_format(m, &new_f);

	if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
		sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
		sib_u64s /= 2;
		sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
	}

	sib_u64s = min(sib_u64s, btree_max_u64s(c));
1653
	sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
1654 1655
	b->sib_u64s[sib] = sib_u64s;

1656
	if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
1657 1658
		goto out;

Kent Overstreet's avatar
Kent Overstreet committed
1659 1660
	parent = btree_node_parent(path, b);
	as = bch2_btree_update_start(trans, path, level,
1661
			 btree_update_reserve_required(c, parent) + 1,
1662
			 flags|
1663
			 BTREE_INSERT_NOFAIL|
1664 1665
			 BTREE_INSERT_USE_RESERVE);
	ret = PTR_ERR_OR_ZERO(as);
1666
	if (ret)
1667
		goto err;
1668 1669 1670 1671 1672 1673

	trace_btree_merge(c, b);

	bch2_btree_interior_update_will_free_node(as, b);
	bch2_btree_interior_update_will_free_node(as, m);

1674
	n = bch2_btree_node_alloc(as, b->c.level);
1675
	bch2_btree_update_add_new_node(as, n);
1676

Kent Overstreet's avatar
Kent Overstreet committed
1677 1678
	btree_set_min(n, prev->data->min_key);
	btree_set_max(n, next->data->max_key);
1679 1680 1681 1682 1683 1684 1685 1686
	n->data->format		= new_f;

	btree_node_set_format(n, new_f);

	bch2_btree_sort_into(c, n, prev);
	bch2_btree_sort_into(c, n, next);

	bch2_btree_build_aux_trees(n);
1687
	six_unlock_write(&n->c.lock);
1688

1689 1690
	bch2_btree_node_write(c, n, SIX_LOCK_intent);

1691 1692 1693 1694 1695
	bkey_init(&delete.k);
	delete.k.p = prev->key.k.p;
	bch2_keylist_add(&as->parent_keys, &delete);
	bch2_keylist_add(&as->parent_keys, &n->key);

1696 1697
	bch2_trans_verify_paths(trans);

Kent Overstreet's avatar
Kent Overstreet committed
1698
	bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
1699

1700 1701
	bch2_trans_verify_paths(trans);

1702
	bch2_btree_update_get_open_buckets(as, n);
1703

1704
	six_lock_increment(&b->c.lock, SIX_LOCK_intent);
1705
	six_lock_increment(&m->c.lock, SIX_LOCK_intent);
1706

1707
	bch2_trans_node_add(trans, n);
1708

Kent Overstreet's avatar
Kent Overstreet committed
1709
	bch2_trans_verify_paths(trans);
1710

1711 1712
	bch2_btree_node_free_inmem(trans, b);
	bch2_btree_node_free_inmem(trans, m);

	six_unlock_intent(&n->c.lock);

	bch2_btree_update_done(as);
out:
	bch2_trans_verify_locks(trans);
	if (sib_path)
		bch2_path_put(trans, sib_path, true);

	/*
	 * Don't downgrade locks here: we're called after successful insert,
	 * and the caller will downgrade locks after a successful insert
	 * anyways (in case e.g. a split was required first)
	 *
	 * And we're also called when inserting into interior nodes in the
	 * split path, and downgrading to read locks in there is potentially
	 * confusing:
	 */
	return ret ?: ret2;
err:
	if (sib_path)
		bch2_path_put(trans, sib_path, true);
	sib_path = NULL;

	if (ret == -EINTR && bch2_trans_relock(trans))
		goto retry;

	goto out;
}

/**
 * bch2_btree_node_rewrite - Rewrite/move a btree node
 */
int bch2_btree_node_rewrite(struct btree_trans *trans,
			    struct btree_iter *iter,
			    __le64 seq, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree *b, *n, *parent;
	struct btree_update *as;
	int ret;

	flags |= BTREE_INSERT_NOFAIL;
retry:
	ret = bch2_btree_iter_traverse(iter);
	if (ret)
		goto out;

	b = bch2_btree_iter_peek_node(iter);
	if (!b || b->data->keys.seq != seq)
		goto out;

	parent = btree_node_parent(iter->path, b);
	as = bch2_btree_update_start(trans, iter->path, b->c.level,
		(parent
		 ? btree_update_reserve_required(c, parent)
		 : 0) + 1,
		flags);
	ret = PTR_ERR_OR_ZERO(as);
	if (ret == -EINTR)
		goto retry;
	if (ret) {
		trace_btree_gc_rewrite_node_fail(c, b);
		goto out;
	}

	bch2_btree_interior_update_will_free_node(as, b);

	n = bch2_btree_node_alloc_replacement(as, b);
	bch2_btree_update_add_new_node(as, n);

	bch2_btree_build_aux_trees(n);
	six_unlock_write(&n->c.lock);

	trace_btree_gc_rewrite_node(c, b);

	bch2_btree_node_write(c, n, SIX_LOCK_intent);

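	/*
	 * Make the new node visible: if @b has a parent, insert the new key
	 * there; otherwise @b was the root, so make @n the new root:
	 */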
	if (parent) {
		bch2_keylist_add(&as->parent_keys, &n->key);
		bch2_btree_insert_node(as, trans, iter->path, parent,
				       &as->parent_keys, flags);
	} else {
		bch2_btree_set_root(as, trans, iter->path, n);
	}

	bch2_btree_update_get_open_buckets(as, n);

	six_lock_increment(&b->c.lock, SIX_LOCK_intent);
	bch2_trans_node_add(trans, n);
	bch2_btree_node_free_inmem(trans, b);
	six_unlock_intent(&n->c.lock);

	bch2_btree_update_done(as);
out:
	bch2_btree_path_downgrade(iter->path);
	return ret;
}

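/*
 * Deferred btree node rewrite: enough state to find the node again from a
 * workqueue context. The sequence number lets the worker detect that the
 * node has since been freed or replaced:
 */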
struct async_btree_rewrite {
	struct bch_fs		*c;
	struct work_struct	work;
	enum btree_id		btree_id;
	unsigned		level;
	struct bpos		pos;
	__le64			seq;
};

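/*
 * Workqueue callback: walk back down to the node's position and rewrite it;
 * bch2_btree_node_rewrite() compares the sequence number and does nothing if
 * the node is gone:
 */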
void async_btree_node_rewrite_work(struct work_struct *work)
{
	struct async_btree_rewrite *a =
		container_of(work, struct async_btree_rewrite, work);
	struct bch_fs *c = a->c;
	struct btree_trans trans;
	struct btree_iter iter;

	bch2_trans_init(&trans, c, 0, 0);
	bch2_trans_node_iter_init(&trans, &iter, a->btree_id, a->pos,
					BTREE_MAX_DEPTH, a->level, 0);
	bch2_btree_node_rewrite(&trans, &iter, a->seq, 0);
	bch2_trans_iter_exit(&trans, &iter);
	bch2_trans_exit(&trans);
	percpu_ref_put(&c->writes);
	kfree(a);
}

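/*
 * Queue up a node rewrite to run asynchronously from the interior update
 * workqueue, taking a reference on c->writes that the work item drops when
 * it's done:
 */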
void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
{
	struct async_btree_rewrite *a;

	if (!test_bit(BCH_FS_BTREE_INTERIOR_REPLAY_DONE, &c->flags))
		return;

	if (!percpu_ref_tryget(&c->writes))
		return;

	a = kmalloc(sizeof(*a), GFP_NOFS);
	if (!a) {
		percpu_ref_put(&c->writes);
		return;
	}

	a->c		= c;
	a->btree_id	= b->c.btree_id;
	a->level	= b->c.level;
	a->pos		= b->key.k.p;
	a->seq		= b->data->keys.seq;

	INIT_WORK(&a->work, async_btree_node_rewrite_work);
	queue_work(c->btree_interior_update_worker, &a->work);
}

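/*
 * Update a btree node's key (i.e. the pointer to it) without rewriting the
 * node itself. If @b has a parent, the new key is inserted into the parent
 * node; otherwise a new btree root journal entry is written. @new_hash, when
 * non-NULL, is a preallocated node used so @b can stay in the btree node hash
 * table while its key - and therefore its hash - changes:
 */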
static int __bch2_btree_node_update_key(struct btree_trans *trans,
					struct btree_iter *iter,
					struct btree *b, struct btree *new_hash,
					struct bkey_i *new_key,
					bool skip_triggers)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter2 = { NULL };
	struct btree *parent;
	u64 journal_entries[BKEY_BTREE_PTR_U64s_MAX];
	int ret;

	if (!skip_triggers) {
		ret = bch2_trans_mark_key(trans,
					  bkey_s_c_null,
					  bkey_i_to_s_c(new_key),
					  BTREE_TRIGGER_INSERT);
		if (ret)
			return ret;

		ret = bch2_trans_mark_key(trans,
					  bkey_i_to_s_c(&b->key),
					  bkey_s_c_null,
					  BTREE_TRIGGER_OVERWRITE);
		if (ret)
			return ret;
	}

	if (new_hash) {
		bkey_copy(&new_hash->key, new_key);
		ret = bch2_btree_node_hash_insert(&c->btree_cache,
				new_hash, b->c.level, b->c.btree_id);
		BUG_ON(ret);
	}

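	/*
	 * If @b has a parent, the new key goes in via a normal btree update
	 * against a copy of @iter pointed one level up; if @b is the root,
	 * emit a btree_root journal entry alongside the transaction commit:
	 */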
	parent = btree_node_parent(iter->path, b);
	if (parent) {
		bch2_trans_copy_iter(&iter2, iter);

		iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
				iter2.flags & BTREE_ITER_INTENT);

		BUG_ON(iter2.path->level != b->c.level);
		BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p));

		btree_node_unlock(iter2.path, iter2.path->level);
		path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP;
		iter2.path->level++;

		ret   = bch2_btree_iter_traverse(&iter2) ?:
			bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
		if (ret)
			goto err;
	} else {
		BUG_ON(btree_node_root(c, b) != b);

		trans->extra_journal_entries = (void *) &journal_entries[0];
		trans->extra_journal_entry_u64s =
			journal_entry_set((void *) &journal_entries[0],
					  BCH_JSET_ENTRY_btree_root,
					  b->c.btree_id, b->c.level,
					  new_key, new_key->k.u64s);
	}

	ret = bch2_trans_commit(trans, NULL, NULL,
				BTREE_INSERT_NOFAIL|
				BTREE_INSERT_NOCHECK_RW|
				BTREE_INSERT_JOURNAL_RECLAIM|
				BTREE_INSERT_JOURNAL_RESERVED);
	if (ret)
		goto err;

	bch2_btree_node_lock_write(trans, iter->path, b);

	if (new_hash) {
		mutex_lock(&c->btree_cache.lock);
		bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
		bch2_btree_node_hash_remove(&c->btree_cache, b);

		bkey_copy(&b->key, new_key);
		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
		BUG_ON(ret);
		mutex_unlock(&c->btree_cache.lock);
	} else {
		bkey_copy(&b->key, new_key);
	}

	bch2_btree_node_unlock_write(trans, iter->path, b);
out:
	bch2_trans_iter_exit(trans, &iter2);
	return ret;
err:
	if (new_hash) {
		mutex_lock(&c->btree_cache.lock);
		bch2_btree_node_hash_remove(&c->btree_cache, b);
		mutex_unlock(&c->btree_cache.lock);
	}
	goto out;
}

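/*
 * Wrapper around __bch2_btree_node_update_key() that allocates the
 * placeholder hash table node when the new key hashes differently from the
 * current one:
 */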
int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
			       struct btree *b, struct bkey_i *new_key,
			       bool skip_triggers)
{
	struct bch_fs *c = trans->c;
	struct btree *new_hash = NULL;
	struct closure cl;
	int ret = 0;

	closure_init_stack(&cl);

	/*
	 * check btree_ptr_hash_val() after @b is locked by
	 * btree_iter_traverse():
	 */
	if (btree_ptr_hash_val(new_key) != b->hash_val) {
		ret = bch2_btree_cache_cannibalize_lock(c, &cl);
		if (ret) {
			bch2_trans_unlock(trans);
			closure_sync(&cl);
			if (!bch2_trans_relock(trans))
				return -EINTR;
		}

		new_hash = bch2_btree_node_mem_alloc(c);
	}

	ret = __bch2_btree_node_update_key(trans, iter, b, new_hash,
					   new_key, skip_triggers);

	if (new_hash) {
		mutex_lock(&c->btree_cache.lock);
		list_move(&new_hash->list, &c->btree_cache.freeable);
		mutex_unlock(&c->btree_cache.lock);

		six_unlock_write(&new_hash->c.lock);
		six_unlock_intent(&new_hash->c.lock);
	}
	closure_sync(&cl);
	bch2_btree_cache_cannibalize_unlock(c);
	return ret;
}

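/*
 * Same as bch2_btree_node_update_key(), for callers that don't already have
 * an iterator pointing at @b - looks one up and checks that the node is
 * still live:
 */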
int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
					struct btree *b, struct bkey_i *new_key,
					bool skip_triggers)
{
	struct btree_iter iter;
	int ret;

	bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
				  BTREE_MAX_DEPTH, b->c.level,
				  BTREE_ITER_INTENT);
	ret = bch2_btree_iter_traverse(&iter);
	if (ret)
		goto out;

	/* has node been freed? */
	if (iter.path->l[b->c.level].b != b) {
		/* node has been freed: */
		BUG_ON(!btree_node_dying(b));
		goto out;
	}

	BUG_ON(!btree_node_hashed(b));

	ret = bch2_btree_node_update_key(trans, &iter, b, new_key, skip_triggers);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* Init code: */

/*
 * Only for filesystem bringup, when first reading the btree roots or allocating
 * btree roots when initializing a new filesystem:
 */
void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
{
	BUG_ON(btree_node_root(c, b));

	bch2_btree_set_root_inmem(c, b);
}

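/*
 * Allocate an empty, in-memory-only root for btree @id - used when
 * initializing a btree that doesn't have a root yet. The node is marked fake
 * and flagged for rewrite:
 */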
void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
{
	struct closure cl;
	struct btree *b;
	int ret;

	closure_init_stack(&cl);

	do {
		ret = bch2_btree_cache_cannibalize_lock(c, &cl);
		closure_sync(&cl);
	} while (ret);

	b = bch2_btree_node_mem_alloc(c);
	bch2_btree_cache_cannibalize_unlock(c);

	set_btree_node_fake(b);
	set_btree_node_need_rewrite(b);
	b->c.level	= 0;
	b->c.btree_id	= id;

	bkey_btree_ptr_init(&b->key);
	b->key.k.p = SPOS_MAX;
	*((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;

	bch2_bset_init_first(b, &b->data->keys);
	bch2_btree_build_aux_trees(b);

	b->data->flags = 0;
	btree_set_min(b, POS_MIN);
	btree_set_max(b, SPOS_MAX);
	b->data->format = bch2_btree_calc_format(b);
	btree_node_set_format(b, b->data->format);

	ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
					  b->c.level, b->c.btree_id);
	BUG_ON(ret);

	bch2_btree_set_root_inmem(c, b);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

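/* Print the list of interior btree updates currently in flight: */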
void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct btree_update *as;

	mutex_lock(&c->btree_interior_update_lock);
	list_for_each_entry(as, &c->btree_interior_update_list, list)
		pr_buf(out, "%p m %u w %u r %u j %llu\n",
		       as,
		       as->mode,
		       as->nodes_written,
		       atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK,
		       as->journal.seq);
	mutex_unlock(&c->btree_interior_update_lock);
}

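/* Number of interior btree updates currently in flight: */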
size_t bch2_btree_interior_updates_nr_pending(struct bch_fs *c)
{
	size_t ret = 0;
	struct list_head *i;

	mutex_lock(&c->btree_interior_update_lock);
	list_for_each(i, &c->btree_interior_update_list)
		ret++;
	mutex_unlock(&c->btree_interior_update_lock);

	return ret;
}

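/*
 * Journal replay: pull btree roots out of the journal entries in @jset and
 * stash them in c->btree_roots:
 */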
void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset)
{
	struct btree_root *r;
	struct jset_entry *entry;

	mutex_lock(&c->btree_root_lock);

	vstruct_for_each(jset, entry)
		if (entry->type == BCH_JSET_ENTRY_btree_root) {
			r = &c->btree_roots[entry->btree_id];
			r->level = entry->level;
			r->alive = true;
			bkey_copy(&r->key, &entry->start[0]);
		}

	mutex_unlock(&c->btree_root_lock);
}

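/*
 * When writing out the journal: append btree_root entries for any live roots
 * that aren't already present in [@start, @end), returning the new end of the
 * entry list:
 */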
struct jset_entry *
bch2_btree_roots_to_journal_entries(struct bch_fs *c,
				    struct jset_entry *start,
				    struct jset_entry *end)
{
	struct jset_entry *entry;
	unsigned long have = 0;
	unsigned i;

	for (entry = start; entry < end; entry = vstruct_next(entry))
		if (entry->type == BCH_JSET_ENTRY_btree_root)
			__set_bit(entry->btree_id, &have);

	mutex_lock(&c->btree_root_lock);

	for (i = 0; i < BTREE_ID_NR; i++)
		if (c->btree_roots[i].alive && !test_bit(i, &have)) {
			journal_entry_set(end,
					  BCH_JSET_ENTRY_btree_root,
					  i, c->btree_roots[i].level,
					  &c->btree_roots[i].key,
					  c->btree_roots[i].key.u64s);
			end = vstruct_next(end);
		}

	mutex_unlock(&c->btree_root_lock);

	return end;
}

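/* Tear down what bch2_fs_btree_interior_update_init() set up: */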
void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
{
	if (c->btree_interior_update_worker)
		destroy_workqueue(c->btree_interior_update_worker);
	mempool_exit(&c->btree_interior_update_pool);
}

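/* Allocate the interior update workqueue and mempool: */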
int bch2_fs_btree_interior_update_init(struct bch_fs *c)
{
	mutex_init(&c->btree_reserve_cache_lock);
	INIT_LIST_HEAD(&c->btree_interior_update_list);
	INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
	mutex_init(&c->btree_interior_update_lock);
	INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);

	c->btree_interior_update_worker =
		alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
	if (!c->btree_interior_update_worker)
		return -ENOMEM;

	return mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
					 sizeof(struct btree_update));
}