// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "alloc_foreground.h"
#include "bkey_methods.h"
#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "buckets.h"
#include "error.h"
#include "extents.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "recovery.h"
#include "replicas.h"
#include "super-io.h"
#include "trace.h"

#include <linux/random.h>

static void bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
				   struct btree_path *, struct btree *,
				   struct keylist *, unsigned);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);

/* Debug code: */

/*
 * Verify that child nodes correctly span parent node's range:
 */
static void btree_node_interior_verify(struct bch_fs *c, struct btree *b)
{
#ifdef CONFIG_BCACHEFS_DEBUG
	struct bpos next_node = b->data->min_key;
	struct btree_node_iter iter;
	struct bkey_s_c k;
	struct bkey_s_c_btree_ptr_v2 bp;
	struct bkey unpacked;
	struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;

	BUG_ON(!b->c.level);

	if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))
		return;

	bch2_btree_node_iter_init_from_start(&iter, b);

	while (1) {
		k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked);
		if (k.k->type != KEY_TYPE_btree_ptr_v2)
			break;
		bp = bkey_s_c_to_btree_ptr_v2(k);

		if (bpos_cmp(next_node, bp.v->min_key)) {
			bch2_dump_btree_node(c, b);
			bch2_bpos_to_text(&buf1, next_node);
			bch2_bpos_to_text(&buf2, bp.v->min_key);
			panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf);
		}

		bch2_btree_node_iter_advance(&iter, b);

		if (bch2_btree_node_iter_end(&iter)) {
			if (bpos_cmp(k.k->p, b->key.k.p)) {
				bch2_dump_btree_node(c, b);
				bch2_bpos_to_text(&buf1, b->key.k.p);
				bch2_bpos_to_text(&buf2, k.k->p);
				panic("expected end %s got %s\n", buf1.buf, buf2.buf);
			}
			break;
		}

		next_node = bpos_successor(k.k->p);
	}
#endif
}

/* Calculate ideal packed bkey format for new btree nodes: */

void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b)
{
	struct bkey_packed *k;
	struct bset_tree *t;
	struct bkey uk;

	for_each_bset(b, t)
		bset_tree_for_each_key(b, t, k)
			if (!bkey_deleted(k)) {
				uk = bkey_unpack_key(b, k);
				bch2_bkey_format_add_key(s, &uk);
			}
}

static struct bkey_format bch2_btree_calc_format(struct btree *b)
{
	struct bkey_format_state s;

	bch2_bkey_format_init(&s);
	bch2_bkey_format_add_pos(&s, b->data->min_key);
	bch2_bkey_format_add_pos(&s, b->data->max_key);
	__bch2_btree_calc_format(&s, b);

	return bch2_bkey_format_done(&s);
}

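/*
 * Estimate how many u64s of key data @b would take up if its keys were
 * repacked with @new_f, by scaling the packed/unpacked key counts by the
 * difference in key size:
 */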
static size_t btree_node_u64s_with_format(struct btree *b,
					  struct bkey_format *new_f)
{
	struct bkey_format *old_f = &b->format;

	/* stupid integer promotion rules */
	ssize_t delta =
	    (((int) new_f->key_u64s - old_f->key_u64s) *
	     (int) b->nr.packed_keys) +
	    (((int) new_f->key_u64s - BKEY_U64s) *
	     (int) b->nr.unpacked_keys);

	BUG_ON(delta + b->nr.live_u64s < 0);

	return b->nr.live_u64s + delta;
}

/**
 * bch2_btree_node_format_fits - check if we could rewrite node with a new format
 *
 * This assumes all keys can pack with the new format -- it just checks if
 * the re-packed keys would fit inside the node itself.
 */
bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
				 struct bkey_format *new_f)
{
	size_t u64s = btree_node_u64s_with_format(b, new_f);

	return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
}

/* Btree node freeing/allocation: */

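/*
 * Give an in-memory node back to the btree node cache. The caller must
 * already have dealt with anything that would make freeing unsafe (dirty
 * state, pending writes, open buckets) - the BUG_ON()s below assert this.
 */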
static void __btree_node_free(struct bch_fs *c, struct btree *b)
{
	trace_btree_node_free(c, b);

	BUG_ON(btree_node_dirty(b));
	BUG_ON(btree_node_need_write(b));
	BUG_ON(b == btree_node_root(c, b));
	BUG_ON(b->ob.nr);
	BUG_ON(!list_empty(&b->write_blocked));
	BUG_ON(b->will_make_reachable);

	clear_btree_node_noevict(b);

	mutex_lock(&c->btree_cache.lock);
	list_move(&b->list, &c->btree_cache.freeable);
	mutex_unlock(&c->btree_cache.lock);
}

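/*
 * Free a node that has just been replaced: unhash it and hand it back to the
 * btree node cache. No btree_path may still hold @b locked at this level
 * (asserted below); the write lock is taken only to exclude concurrent
 * readers.
 */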
static void bch2_btree_node_free_inmem(struct btree_trans *trans,
				       struct btree *b)
{
	struct bch_fs *c = trans->c;
	struct btree_path *path;

	trans_for_each_path(trans, path)
		BUG_ON(path->l[b->c.level].b == b &&
		       path->l[b->c.level].lock_seq == b->c.lock.state.seq);

	six_lock_write(&b->c.lock, NULL, NULL);

	bch2_btree_node_hash_remove(&c->btree_cache, b);
	__btree_node_free(c, b);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

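/*
 * Allocate disk space for a new btree node: use a bucket from the btree
 * reserve cache if one is available, otherwise allocate sectors via the write
 * point machinery, then allocate an in-memory node to go with it.
 */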
static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans,
					     struct disk_reservation *res,
					     struct closure *cl,
					     bool interior_node,
					     unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct write_point *wp;
	struct btree *b;
	__BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
	struct open_buckets ob = { .nr = 0 };
	struct bch_devs_list devs_have = (struct bch_devs_list) { 0 };
	unsigned nr_reserve;
	enum alloc_reserve alloc_reserve;
	int ret;

	if (flags & BTREE_INSERT_USE_RESERVE) {
		nr_reserve	= 0;
		alloc_reserve	= RESERVE_btree_movinggc;
	} else {
		nr_reserve	= BTREE_NODE_RESERVE;
		alloc_reserve	= RESERVE_btree;
	}

	mutex_lock(&c->btree_reserve_cache_lock);
	if (c->btree_reserve_cache_nr > nr_reserve) {
		struct btree_alloc *a =
			&c->btree_reserve_cache[--c->btree_reserve_cache_nr];

		ob = a->ob;
		bkey_copy(&tmp.k, &a->k);
		mutex_unlock(&c->btree_reserve_cache_lock);
		goto mem_alloc;
	}
	mutex_unlock(&c->btree_reserve_cache_lock);

retry:
	ret = bch2_alloc_sectors_start_trans(trans,
				      c->opts.metadata_target ?:
				      c->opts.foreground_target,
				      0,
				      writepoint_ptr(&c->btree_write_point),
				      &devs_have,
				      res->nr_replicas,
				      c->opts.metadata_replicas_required,
				      alloc_reserve, 0, cl, &wp);
	if (unlikely(ret))
		return ERR_PTR(ret);

	if (wp->sectors_free < btree_sectors(c)) {
		struct open_bucket *ob;
		unsigned i;

		open_bucket_for_each(c, &wp->ptrs, ob, i)
			if (ob->sectors_free < btree_sectors(c))
				ob->sectors_free = 0;

		bch2_alloc_sectors_done(c, wp);
		goto retry;
	}

	bkey_btree_ptr_v2_init(&tmp.k);
	bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false);

	bch2_open_bucket_get(c, wp, &ob);
	bch2_alloc_sectors_done(c, wp);
mem_alloc:
	b = bch2_btree_node_mem_alloc(c, interior_node);
	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);

	/* we hold cannibalize_lock: */
	BUG_ON(IS_ERR(b));
	BUG_ON(b->ob.nr);

	bkey_copy(&b->key, &tmp.k);
	b->ob = ob;

	return b;
}

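/*
 * Take one of the nodes preallocated for this btree_update and initialize it
 * as an empty node at @level:
 */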
static struct btree *bch2_btree_node_alloc(struct btree_update *as, unsigned level)
{
	struct bch_fs *c = as->c;
	struct btree *b;
	struct prealloc_nodes *p = &as->prealloc_nodes[!!level];
	int ret;

	BUG_ON(level >= BTREE_MAX_DEPTH);
	BUG_ON(!p->nr);

	b = p->b[--p->nr];

	six_lock_intent(&b->c.lock, NULL, NULL);
	six_lock_write(&b->c.lock, NULL, NULL);

	set_btree_node_accessed(b);
	set_btree_node_dirty_acct(c, b);
	set_btree_node_need_write(b);

	bch2_bset_init_first(b, &b->data->keys);
	b->c.level	= level;
	b->c.btree_id	= as->btree_id;
	b->version_ondisk = c->sb.version;

	memset(&b->nr, 0, sizeof(b->nr));
	b->data->magic = cpu_to_le64(bset_magic(c));
	memset(&b->data->_ptr, 0, sizeof(b->data->_ptr));
	b->data->flags = 0;
	SET_BTREE_NODE_ID(b->data, as->btree_id);
	SET_BTREE_NODE_LEVEL(b->data, level);

	if (b->key.k.type == KEY_TYPE_btree_ptr_v2) {
		struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key);

		bp->v.mem_ptr		= 0;
		bp->v.seq		= b->data->keys.seq;
		bp->v.sectors_written	= 0;
	}

	SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true);

	bch2_btree_build_aux_trees(b);

	ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
	BUG_ON(ret);

	trace_btree_node_alloc(c, b);
	return b;
}

static void btree_set_min(struct btree *b, struct bpos pos)
{
	if (b->key.k.type == KEY_TYPE_btree_ptr_v2)
		bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos;
	b->data->min_key = pos;
}

static void btree_set_max(struct btree *b, struct bpos pos)
{
	b->key.k.p = pos;
	b->data->max_key = pos;
}

325 326 327 328 329 330
struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *as,
						  struct btree *b,
						  struct bkey_format format)
{
	struct btree *n;

331
	n = bch2_btree_node_alloc(as, b->c.level);
332 333 334

	SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1);

Kent Overstreet's avatar
Kent Overstreet committed
335 336 337 338
	btree_set_min(n, b->data->min_key);
	btree_set_max(n, b->data->max_key);

	n->data->format		= format;
339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367
	btree_node_set_format(n, format);

	bch2_btree_sort_into(as->c, n, b);

	btree_node_reset_sib_u64s(n);

	n->key.k.p = b->key.k.p;
	return n;
}

static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as,
						       struct btree *b)
{
	struct bkey_format new_f = bch2_btree_calc_format(b);

	/*
	 * The keys might expand with the new format - if they wouldn't fit in
	 * the btree node anymore, use the old format for now:
	 */
	if (!bch2_btree_node_format_fits(as->c, b, &new_f))
		new_f = b->format;

	return __bch2_btree_node_alloc_replacement(as, b, new_f);
}

static struct btree *__btree_root_alloc(struct btree_update *as, unsigned level)
{
	struct btree *b = bch2_btree_node_alloc(as, level);

	btree_set_min(b, POS_MIN);
	btree_set_max(b, SPOS_MAX);
	b->data->format = bch2_btree_calc_format(b);

	btree_node_set_format(b, b->data->format);
	bch2_btree_build_aux_trees(b);

	bch2_btree_update_add_new_node(as, b);
	six_unlock_write(&b->c.lock);

	return b;
}

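/*
 * Return this update's unused preallocated nodes: stash their open buckets in
 * the btree reserve cache if there's room, and free the in-memory nodes.
 */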
static void bch2_btree_reserve_put(struct btree_update *as)
{
	struct bch_fs *c = as->c;
	struct prealloc_nodes *p;

	for (p = as->prealloc_nodes;
	     p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes);
	     p++) {
		while (p->nr) {
			struct btree *b = p->b[--p->nr];

			mutex_lock(&c->btree_reserve_cache_lock);

			if (c->btree_reserve_cache_nr <
			    ARRAY_SIZE(c->btree_reserve_cache)) {
				struct btree_alloc *a =
					&c->btree_reserve_cache[c->btree_reserve_cache_nr++];

				a->ob = b->ob;
				b->ob.nr = 0;
				bkey_copy(&a->k, &b->key);
			} else {
				bch2_open_buckets_put(c, &b->ob);
			}

			mutex_unlock(&c->btree_reserve_cache_lock);

			six_lock_intent(&b->c.lock, NULL, NULL);
			six_lock_write(&b->c.lock, NULL, NULL);
			__btree_node_free(c, b);
			six_unlock_write(&b->c.lock);
			six_unlock_intent(&b->c.lock);
		}
	}
}

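/*
 * Preallocate the leaf and interior nodes (counts in @nr_nodes) that this
 * update might need, holding the cannibalize lock so that allocating from the
 * btree node cache can't deadlock:
 */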
static int bch2_btree_reserve_get(struct btree_trans *trans,
				  struct btree_update *as,
				  unsigned nr_nodes[2],
				  unsigned flags,
				  struct closure *cl)
{
	struct bch_fs *c = as->c;
	struct btree *b;
	unsigned interior;
	int ret = 0;

	BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX);

	/*
	 * Protects reaping from the btree node cache and using the btree node
	 * open bucket reserve:
	 *
	 * BTREE_INSERT_NOWAIT only applies to btree node allocation, not
	 * blocking on this lock:
	 */
	ret = bch2_btree_cache_cannibalize_lock(c, cl);
	if (ret)
		return ret;

	for (interior = 0; interior < 2; interior++) {
		struct prealloc_nodes *p = as->prealloc_nodes + interior;

		while (p->nr < nr_nodes[interior]) {
			b = __bch2_btree_node_alloc(trans, &as->disk_res,
					flags & BTREE_INSERT_NOWAIT ? NULL : cl,
					interior, flags);
			if (IS_ERR(b)) {
				ret = PTR_ERR(b);
				goto err;
			}

			p->b[p->nr++] = b;
		}
	}
err:
	bch2_btree_cache_cannibalize_unlock(c);
	return ret;
}

/* Asynchronous interior node update machinery */

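/*
 * Drop everything a btree_update holds: the gc lock, journal preres and pins,
 * disk reservation, preallocated nodes, and finally the update itself.
 */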
static void bch2_btree_update_free(struct btree_update *as)
{
	struct bch_fs *c = as->c;

	if (as->took_gc_lock)
		up_read(&c->gc_lock);
	as->took_gc_lock = false;

	bch2_journal_preres_put(&c->journal, &as->journal_preres);

	bch2_journal_pin_drop(&c->journal, &as->journal);
	bch2_journal_pin_flush(&c->journal, &as->journal);
	bch2_disk_reservation_put(c, &as->disk_res);
	bch2_btree_reserve_put(as);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total],
			       as->start_time);

	mutex_lock(&c->btree_interior_update_lock);
	list_del(&as->unwritten_list);
	list_del(&as->list);

	closure_debug_destroy(&as->cl);
	mempool_free(as, &c->btree_interior_update_pool);

	/*
	 * Have to do the wakeup with btree_interior_update_lock still held,
	 * since being on btree_interior_update_list is our ref on @c:
	 */
	closure_wake_up(&c->btree_interior_update_wait);

	mutex_unlock(&c->btree_interior_update_lock);
}

static void btree_update_add_key(struct btree_update *as,
				 struct keylist *keys, struct btree *b)
{
	struct bkey_i *k = &b->key;

	BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s >
	       ARRAY_SIZE(as->_old_keys));

	bkey_copy(keys->top, k);
	bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1;

	bch2_keylist_push(keys);
}

/*
 * The transactional part of an interior btree node update, where we journal the
 * update we did to the interior node and update alloc info:
 */
static int btree_update_nodes_written_trans(struct btree_trans *trans,
					    struct btree_update *as)
{
	struct bkey_i *k;
	int ret;

	ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s);
	if (ret)
		return ret;

	memcpy(&darray_top(trans->extra_journal_entries),
	       as->journal_entries,
	       as->journal_u64s * sizeof(u64));
	trans->extra_journal_entries.nr += as->journal_u64s;

	trans->journal_pin = &as->journal;

	for_each_keylist_key(&as->old_keys, k) {
		unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;

		ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0);
		if (ret)
			return ret;
	}

	for_each_keylist_key(&as->new_keys, k) {
		unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;

		ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0);
		if (ret)
			return ret;
	}

	return 0;
}

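/*
 * Completion path, run from the interior update worker once all the new
 * nodes are written: journal the update transactionally, unblock and write
 * out the interior node that was updated (if any), and release the old nodes
 * and open buckets.
 */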
static void btree_update_nodes_written(struct btree_update *as)
{
	struct bch_fs *c = as->c;
	struct btree *b = as->b;
	struct btree_trans trans;
	u64 journal_seq = 0;
	unsigned i;
	int ret;

	/*
	 * If we're already in an error state, it might be because a btree node
	 * was never written, and we might be trying to free that same btree
	 * node here, but it won't have been marked as allocated and we'll see
	 * spurious disk usage inconsistencies in the transactional part below
	 * if we don't skip it:
	 */
	ret = bch2_journal_error(&c->journal);
	if (ret)
		goto err;

	/*
	 * Wait for any in flight writes to finish before we free the old nodes
	 * on disk:
	 */
	for (i = 0; i < as->nr_old_nodes; i++) {
		struct btree *old = as->old_nodes[i];
		__le64 seq;

		six_lock_read(&old->c.lock, NULL, NULL);
		seq = old->data ? old->data->keys.seq : 0;
		six_unlock_read(&old->c.lock);

		if (seq == as->old_nodes_seq[i])
			wait_on_bit_io(&old->flags, BTREE_NODE_write_in_flight_inner,
				       TASK_UNINTERRUPTIBLE);
	}

	/*
	 * We did an update to a parent node where the pointers we added pointed
	 * to child nodes that weren't written yet: now, the child nodes have
	 * been written so we can write out the update to the interior node.
	 */

	/*
	 * We can't call into journal reclaim here: we'd block on the journal
	 * reclaim lock, but we may need to release the open buckets we have
	 * pinned in order for other btree updates to make forward progress, and
	 * journal reclaim does btree updates when flushing bkey_cached entries,
	 * which may require allocations as well.
	 */
	bch2_trans_init(&trans, c, 0, 512);
	ret = commit_do(&trans, &as->disk_res, &journal_seq,
			      BTREE_INSERT_NOFAIL|
			      BTREE_INSERT_NOCHECK_RW|
			      BTREE_INSERT_JOURNAL_RECLAIM|
			      JOURNAL_WATERMARK_reserved,
			      btree_update_nodes_written_trans(&trans, as));
	bch2_trans_exit(&trans);

	bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
			     "error %i in btree_update_nodes_written()", ret);
err:
	if (b) {
		/*
		 * @b is the node we did the final insert into:
		 *
		 * On failure to get a journal reservation, we still have to
		 * unblock the write and allow most of the write path to happen
		 * so that shutdown works, but the i->journal_seq mechanism
		 * won't work to prevent the btree write from being visible (we
		 * didn't get a journal sequence number) - instead
		 * __bch2_btree_node_write() doesn't do the actual write if
		 * we're in journal error state:
		 */

		six_lock_intent(&b->c.lock, NULL, NULL);
		six_lock_write(&b->c.lock, NULL, NULL);
		mutex_lock(&c->btree_interior_update_lock);

		list_del(&as->write_blocked_list);
		if (list_empty(&b->write_blocked))
			clear_btree_node_write_blocked(b);

		/*
		 * Node might have been freed, recheck under
		 * btree_interior_update_lock:
		 */
		if (as->b == b) {
			struct bset *i = btree_bset_last(b);

			BUG_ON(!b->c.level);
			BUG_ON(!btree_node_dirty(b));

			if (!ret) {
				i->journal_seq = cpu_to_le64(
							     max(journal_seq,
								 le64_to_cpu(i->journal_seq)));

				bch2_btree_add_journal_pin(c, b, journal_seq);
			} else {
				/*
				 * If we didn't get a journal sequence number we
				 * can't write this btree node, because recovery
				 * won't know to ignore this write:
				 */
				set_btree_node_never_write(b);
			}
		}

		mutex_unlock(&c->btree_interior_update_lock);
		six_unlock_write(&b->c.lock);

		btree_node_write_if_need(c, b, SIX_LOCK_intent);
		six_unlock_intent(&b->c.lock);
	}

	bch2_journal_pin_drop(&c->journal, &as->journal);

	bch2_journal_preres_put(&c->journal, &as->journal_preres);

	mutex_lock(&c->btree_interior_update_lock);
	for (i = 0; i < as->nr_new_nodes; i++) {
		b = as->new_nodes[i];

		BUG_ON(b->will_make_reachable != (unsigned long) as);
		b->will_make_reachable = 0;
		clear_btree_node_will_make_reachable(b);
	}
	mutex_unlock(&c->btree_interior_update_lock);

	for (i = 0; i < as->nr_new_nodes; i++) {
		b = as->new_nodes[i];

		six_lock_read(&b->c.lock, NULL, NULL);
		btree_node_write_if_need(c, b, SIX_LOCK_read);
		six_unlock_read(&b->c.lock);
	}

	for (i = 0; i < as->nr_open_buckets; i++)
		bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]);

	bch2_btree_update_free(as);
}

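/*
 * Work item that runs btree_update_nodes_written() for each interior update
 * whose new nodes have all finished writing:
 */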
static void btree_interior_update_work(struct work_struct *work)
{
	struct bch_fs *c =
		container_of(work, struct bch_fs, btree_interior_update_work);
	struct btree_update *as;

	while (1) {
		mutex_lock(&c->btree_interior_update_lock);
		as = list_first_entry_or_null(&c->btree_interior_updates_unwritten,
					      struct btree_update, unwritten_list);
		if (as && !as->nodes_written)
			as = NULL;
		mutex_unlock(&c->btree_interior_update_lock);

		if (!as)
			break;

		btree_update_nodes_written(as);
	}
}

static void btree_update_set_nodes_written(struct closure *cl)
{
	struct btree_update *as = container_of(cl, struct btree_update, cl);
	struct bch_fs *c = as->c;

	mutex_lock(&c->btree_interior_update_lock);
	as->nodes_written = true;
	mutex_unlock(&c->btree_interior_update_lock);

	queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work);
}

/*
 * We're updating @b with pointers to nodes that haven't finished writing yet:
 * block @b from being written until @as completes
 */
static void btree_update_updated_node(struct btree_update *as, struct btree *b)
{
	struct bch_fs *c = as->c;

	mutex_lock(&c->btree_interior_update_lock);
	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);

	BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);
	BUG_ON(!btree_node_dirty(b));

	as->mode	= BTREE_INTERIOR_UPDATING_NODE;
	as->b		= b;

	set_btree_node_write_blocked(b);
	list_add(&as->write_blocked_list, &b->write_blocked);

	mutex_unlock(&c->btree_interior_update_lock);
}

static void btree_update_reparent(struct btree_update *as,
				  struct btree_update *child)
{
	struct bch_fs *c = as->c;

	lockdep_assert_held(&c->btree_interior_update_lock);

	child->b = NULL;
	child->mode = BTREE_INTERIOR_UPDATING_AS;

	bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
}

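/*
 * Like btree_update_updated_node(), but for an update that is replacing the
 * btree root: stash a btree_root journal entry in the update instead of
 * blocking a node write.
 */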
static void btree_update_updated_root(struct btree_update *as, struct btree *b)
{
	struct bkey_i *insert = &b->key;
	struct bch_fs *c = as->c;

	BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE);

	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
	       ARRAY_SIZE(as->journal_entries));

	as->journal_u64s +=
		journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
				  BCH_JSET_ENTRY_btree_root,
				  b->c.btree_id, b->c.level,
				  insert, insert->k.u64s);

	mutex_lock(&c->btree_interior_update_lock);
	list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);

	as->mode	= BTREE_INTERIOR_UPDATING_ROOT;
	mutex_unlock(&c->btree_interior_update_lock);
}

/*
 * bch2_btree_update_add_new_node:
 *
 * This causes @as to wait on @b to be written, before it gets to
 * bch2_btree_update_nodes_written
 *
 * Additionally, it sets b->will_make_reachable to prevent any additional writes
 * to @b from happening besides the first until @b is reachable on disk
 *
 * And it adds @b to the list of @as's new nodes, so that we can update sector
 * counts in bch2_btree_update_nodes_written:
 */
static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b)
{
	struct bch_fs *c = as->c;

	closure_get(&as->cl);

	mutex_lock(&c->btree_interior_update_lock);
	BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes));
	BUG_ON(b->will_make_reachable);

	as->new_nodes[as->nr_new_nodes++] = b;
	b->will_make_reachable = 1UL|(unsigned long) as;
	set_btree_node_will_make_reachable(b);

	mutex_unlock(&c->btree_interior_update_lock);

	btree_update_add_key(as, &as->new_keys, b);
}

/*
 * If @b was going to be made reachable by a btree_update (i.e.
 * b->will_make_reachable is set), drop it from that update's list of new
 * nodes:
 */
static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b)
{
	struct btree_update *as;
	unsigned long v;
	unsigned i;

	mutex_lock(&c->btree_interior_update_lock);
	/*
	 * When b->will_make_reachable != 0, it owns a ref on as->cl that's
	 * dropped when it gets written by bch2_btree_complete_write - the
	 * xchg() is for synchronization with bch2_btree_complete_write:
	 */
	v = xchg(&b->will_make_reachable, 0);
	clear_btree_node_will_make_reachable(b);
	as = (struct btree_update *) (v & ~1UL);

	if (!as) {
		mutex_unlock(&c->btree_interior_update_lock);
		return;
	}

	for (i = 0; i < as->nr_new_nodes; i++)
		if (as->new_nodes[i] == b)
			goto found;

	BUG();
found:
	array_remove_item(as->new_nodes, as->nr_new_nodes, i);
	mutex_unlock(&c->btree_interior_update_lock);

	if (v & 1)
		closure_put(&as->cl);
}

static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b)
{
	while (b->ob.nr)
		as->open_buckets[as->nr_open_buckets++] =
			b->ob.v[--b->ob.nr];
}

/*
 * @b is being split/rewritten: it may have pointers to not-yet-written btree
 * nodes and thus outstanding btree_updates - redirect @b's
 * btree_updates to point to this btree_update:
 */
static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
						      struct btree *b)
{
	struct bch_fs *c = as->c;
	struct btree_update *p, *n;
	struct btree_write *w;

	set_btree_node_dying(b);

	if (btree_node_fake(b))
		return;

	mutex_lock(&c->btree_interior_update_lock);

	/*
	 * Does this node have any btree_update operations preventing
	 * it from being written?
	 *
	 * If so, redirect them to point to this btree_update: we can
	 * write out our new nodes, but we won't make them visible until those
	 * operations complete
	 */
	list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) {
		list_del_init(&p->write_blocked_list);
		btree_update_reparent(as, p);

		/*
		 * for flush_held_btree_writes() waiting on updates to flush or
		 * nodes to be writeable:
		 */
		closure_wake_up(&c->btree_interior_update_wait);
	}

	clear_btree_node_dirty_acct(c, b);
	clear_btree_node_need_write(b);

	/*
	 * Does this node have unwritten data that has a pin on the journal?
	 *
	 * If so, transfer that pin to the btree_update operation -
	 * note that if we're freeing multiple nodes, we only need to keep the
	 * oldest pin of any of the nodes we're freeing. We'll release the pin
	 * when the new nodes are persistent and reachable on disk:
	 */
	w = btree_current_write(b);
	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
	bch2_journal_pin_drop(&c->journal, &w->journal);

	w = btree_prev_write(b);
	bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
	bch2_journal_pin_drop(&c->journal, &w->journal);

	mutex_unlock(&c->btree_interior_update_lock);

	/*
	 * Is this a node that isn't reachable on disk yet?
	 *
	 * Nodes that aren't reachable yet have writes blocked until they're
	 * reachable - now that we've cancelled any pending writes and moved
	 * things waiting on that write to wait on this update, we can drop this
	 * node from the list of nodes that the other update is making
	 * reachable, prior to freeing it:
	 */
	btree_update_drop_new_node(c, b);

	btree_update_add_key(as, &as->old_keys, b);

	as->old_nodes[as->nr_old_nodes] = b;
	as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq;
	as->nr_old_nodes++;
}

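/*
 * The in-memory part of the update is done: drop the gc lock, return unused
 * preallocated nodes, and arrange for btree_update_set_nodes_written() to run
 * once all the new nodes' writes have completed.
 */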
static void bch2_btree_update_done(struct btree_update *as)
{
	struct bch_fs *c = as->c;
	u64 start_time = as->start_time;

	BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE);

	if (as->took_gc_lock)
		up_read(&as->c->gc_lock);
	as->took_gc_lock = false;

	bch2_btree_reserve_put(as);

	continue_at(&as->cl, btree_update_set_nodes_written,
		    as->c->btree_interior_update_worker);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground],
			       start_time);
}

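/*
 * Start an interior node update: work out how many new nodes a split at
 * @level could cascade into, upgrade locks on @path, take the gc lock, a
 * journal pre-reservation and a disk reservation, and preallocate the btree
 * nodes we might need.
 */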
static struct btree_update *
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
			unsigned level, bool split, unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree_update *as;
	u64 start_time = local_clock();
	int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
		? BCH_DISK_RESERVATION_NOFAIL : 0;
	unsigned nr_nodes[2] = { 0, 0 };
	unsigned update_level = level;
	int journal_flags = flags & JOURNAL_WATERMARK_MASK;
	int ret = 0;

	BUG_ON(!path->should_be_locked);

	if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
		journal_flags |= JOURNAL_RES_GET_NONBLOCK;

	while (1) {
		nr_nodes[!!update_level] += 1 + split;
		update_level++;

		if (!btree_path_node(path, update_level))
			break;

		/*
		 * XXX: figure out how far we might need to split,
		 * instead of locking/reserving all the way to the root:
		 */
		split = update_level + 1 < BTREE_MAX_DEPTH;
	}

	/* Might have to allocate a new root: */
	if (update_level < BTREE_MAX_DEPTH)
		nr_nodes[1] += 1;

	if (!bch2_btree_path_upgrade(trans, path, U8_MAX)) {
		trace_trans_restart_iter_upgrade(trans->fn, _RET_IP_,
						 path->btree_id, &path->pos);
		ret = btree_trans_restart(trans);
		return ERR_PTR(ret);
	}

	if (flags & BTREE_INSERT_GC_LOCK_HELD)
		lockdep_assert_held(&c->gc_lock);
	else if (!down_read_trylock(&c->gc_lock)) {
		bch2_trans_unlock(trans);
		down_read(&c->gc_lock);
		if (!bch2_trans_relock(trans)) {
			up_read(&c->gc_lock);
			return ERR_PTR(-EINTR);
		}
	}

	as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOIO);
	memset(as, 0, sizeof(*as));
	closure_init(&as->cl, NULL);
	as->c		= c;
	as->start_time	= start_time;
	as->mode	= BTREE_INTERIOR_NO_UPDATE;
	as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
	as->btree_id	= path->btree_id;
	INIT_LIST_HEAD(&as->list);
	INIT_LIST_HEAD(&as->unwritten_list);
	INIT_LIST_HEAD(&as->write_blocked_list);
	bch2_keylist_init(&as->old_keys, as->_old_keys);
	bch2_keylist_init(&as->new_keys, as->_new_keys);
	bch2_keylist_init(&as->parent_keys, as->inline_keys);

	mutex_lock(&c->btree_interior_update_lock);
	list_add_tail(&as->list, &c->btree_interior_update_list);
	mutex_unlock(&c->btree_interior_update_lock);

	/*
	 * We don't want to allocate if we're in an error state, that can cause
	 * deadlock on emergency shutdown due to open buckets getting stuck in
	 * the btree_reserve_cache after allocator shutdown has cleared it out.
	 * This check needs to come after adding us to the btree_interior_update
	 * list but before calling bch2_btree_reserve_get, to synchronize with
	 * __bch2_fs_read_only().
	 */
	ret = bch2_journal_error(&c->journal);
	if (ret)
		goto err;

	ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
				      BTREE_UPDATE_JOURNAL_RES,
				      journal_flags|JOURNAL_RES_GET_NONBLOCK);
	if (ret) {
		bch2_trans_unlock(trans);

		ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
					      BTREE_UPDATE_JOURNAL_RES,
					      journal_flags);
		if (ret) {
			trace_trans_restart_journal_preres_get(trans->fn, _RET_IP_);
			btree_trans_restart(trans);
			goto err;
		}

		ret = bch2_trans_relock(trans);
		if (ret)
			goto err;
	}

	ret = bch2_disk_reservation_get(c, &as->disk_res,
			(nr_nodes[0] + nr_nodes[1]) * btree_sectors(c),
			c->opts.metadata_replicas,
			disk_res_flags);
	if (ret)
		goto err;

	ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL);
	if (ret == -EAGAIN ||
	    ret == -ENOMEM) {
		struct closure cl;

		closure_init_stack(&cl);

		bch2_trans_unlock(trans);

		do {
			ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl);
			closure_sync(&cl);
		} while (ret == -EAGAIN);
	}

	if (ret) {
		trace_btree_reserve_get_fail(trans->fn, _RET_IP_,
					     nr_nodes[0] + nr_nodes[1]);
		goto err;
	}

	if (!bch2_trans_relock(trans)) {
		ret = -EINTR;
		goto err;
	}

	return as;
err:
	bch2_btree_update_free(as);
	return ERR_PTR(ret);
}

/* Btree root updates: */

static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b)
{
	/* Root nodes cannot be reaped */
	mutex_lock(&c->btree_cache.lock);
	list_del_init(&b->list);
	mutex_unlock(&c->btree_cache.lock);

	mutex_lock(&c->btree_root_lock);
	BUG_ON(btree_node_root(c, b) &&
	       (b->c.level < btree_node_root(c, b)->c.level ||
		!btree_node_dying(btree_node_root(c, b))));

	btree_node_root(c, b) = b;
	mutex_unlock(&c->btree_root_lock);

	bch2_recalc_btree_reserve(c);
}

/**
 * bch2_btree_set_root - update the root in memory and on disk
 *
 * To ensure forward progress, the current task must not be holding any
 * btree node write locks. However, you must hold an intent lock on the
 * old root.
 *
 * Note: This allocates a journal entry but doesn't add any keys to
 * it.  All the btree roots are part of every journal write, so there
 * is nothing new to be done.  This just guarantees that there is a
 * journal write.
 */
static void bch2_btree_set_root(struct btree_update *as,
				struct btree_trans *trans,
				struct btree_path *path,
				struct btree *b)
{
	struct bch_fs *c = as->c;
	struct btree *old;

	trace_btree_set_root(c, b);
	BUG_ON(!b->written);

	old = btree_node_root(c, b);

	/*
	 * Ensure no one is using the old root while we switch to the
	 * new root:
	 */
	bch2_btree_node_lock_write(trans, path, old);

	bch2_btree_set_root_inmem(c, b);

	btree_update_updated_root(as, b);

	/*
	 * Unlock old root after new root is visible:
	 *
	 * The new root isn't persistent, but that's ok: we still have
	 * an intent lock on the new root, and any updates that would
	 * depend on the new root would have to update the new root.
	 */
	bch2_btree_node_unlock_write(trans, path, old);
}

/* Interior node updates: */

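/*
 * Insert a single btree pointer into interior node @b at the position given
 * by @node_iter, after validity checks; the key is also appended to the
 * update's journal entries.
 */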
static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
					struct btree_trans *trans,
					struct btree_path *path,
					struct btree *b,
					struct btree_node_iter *node_iter,
					struct bkey_i *insert)
{
	struct bch_fs *c = as->c;
	struct bkey_packed *k;
	struct printbuf buf = PRINTBUF;

	BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 &&
	       !btree_ptr_sectors_written(insert));

	if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)))
		bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p);

	if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
			      btree_node_type(b), WRITE, &buf) ?:
	    bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) {
		printbuf_reset(&buf);
		prt_printf(&buf, "inserting invalid bkey\n  ");
		bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
		prt_printf(&buf, "\n  ");
		bch2_bkey_invalid(c, bkey_i_to_s_c(insert),
				  btree_node_type(b), WRITE, &buf);
		bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf);

		bch2_fs_inconsistent(c, "%s", buf.buf);
		dump_stack();
	}

	BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) >
	       ARRAY_SIZE(as->journal_entries));

	as->journal_u64s +=
		journal_entry_set((void *) &as->journal_entries[as->journal_u64s],
				  BCH_JSET_ENTRY_btree_keys,
				  b->c.btree_id, b->c.level,
				  insert, insert->k.u64s);

	while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
	       bkey_iter_pos_cmp(b, k, &insert->k.p) < 0)
		bch2_btree_node_iter_advance(node_iter, b);

	bch2_btree_bset_insert_key(trans, path, b, node_iter, insert);
	set_btree_node_dirty_acct(c, b);
	set_btree_node_need_write(b);

	printbuf_exit(&buf);
}

static void
__bch2_btree_insert_keys_interior(struct btree_update *as,
				  struct btree_trans *trans,
				  struct btree_path *path,
				  struct btree *b,
				  struct btree_node_iter node_iter,
				  struct keylist *keys)
{
	struct bkey_i *insert = bch2_keylist_front(keys);
	struct bkey_packed *k;

	BUG_ON(btree_node_type(b) != BKEY_TYPE_btree);

	while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) &&
	       (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0))
		;

	while (!bch2_keylist_empty(keys)) {
		bch2_insert_fixup_btree_ptr(as, trans, path, b,
				&node_iter, bch2_keylist_front(keys));
		bch2_keylist_pop_front(keys);
	}
}

/*
 * Move keys from n1 (original replacement node, now lower node) to n2 (higher
 * node)
 */
static struct btree *__btree_split_node(struct btree_update *as,
					struct btree *n1)
{
	struct bkey_format_state s;
	size_t nr_packed = 0, nr_unpacked = 0;
	struct btree *n2;
	struct bset *set1, *set2;
	struct bkey_packed *k, *set2_start, *set2_end, *out, *prev = NULL;
	struct bpos n1_pos;

	n2 = bch2_btree_node_alloc(as, n1->c.level);

	n2->data->max_key	= n1->data->max_key;
	n2->data->format	= n1->format;
	SET_BTREE_NODE_SEQ(n2->data, BTREE_NODE_SEQ(n1->data));
	n2->key.k.p = n1->key.k.p;

	bch2_btree_update_add_new_node(as, n2);

	set1 = btree_bset_first(n1);
	set2 = btree_bset_first(n2);

	/*
	 * Has to be a linear search because we don't have an auxiliary
	 * search tree yet
	 */
	k = set1->start;
	while (1) {
		struct bkey_packed *n = bkey_next(k);

		if (n == vstruct_last(set1))
			break;
		if (k->_data - set1->_data >= (le16_to_cpu(set1->u64s) * 3) / 5)
			break;

		if (bkey_packed(k))
			nr_packed++;
		else
			nr_unpacked++;

		prev = k;
		k = n;
	}

	BUG_ON(!prev);
	set2_start	= k;
	set2_end	= vstruct_last(set1);

	set1->u64s = cpu_to_le16((u64 *) set2_start - set1->_data);
	set_btree_bset_end(n1, n1->set);

	n1->nr.live_u64s	= le16_to_cpu(set1->u64s);
	n1->nr.bset_u64s[0]	= le16_to_cpu(set1->u64s);
	n1->nr.packed_keys	= nr_packed;
	n1->nr.unpacked_keys	= nr_unpacked;

	n1_pos = bkey_unpack_pos(n1, prev);
	if (as->c->sb.version < bcachefs_metadata_version_snapshot)
		n1_pos.snapshot = U32_MAX;

	btree_set_max(n1, n1_pos);
	btree_set_min(n2, bpos_successor(n1->key.k.p));

	bch2_bkey_format_init(&s);
	bch2_bkey_format_add_pos(&s, n2->data->min_key);
	bch2_bkey_format_add_pos(&s, n2->data->max_key);

	for (k = set2_start; k != set2_end; k = bkey_next(k)) {
		struct bkey uk = bkey_unpack_key(n1, k);
		bch2_bkey_format_add_key(&s, &uk);
	}

	n2->data->format = bch2_bkey_format_done(&s);
	btree_node_set_format(n2, n2->data->format);

	out = set2->start;
	memset(&n2->nr, 0, sizeof(n2->nr));

	for (k = set2_start; k != set2_end; k = bkey_next(k)) {
		BUG_ON(!bch2_bkey_transform(&n2->format, out, bkey_packed(k)
				       ? &n1->format : &bch2_bkey_format_current, k));
		out->format = KEY_FORMAT_LOCAL_BTREE;
		btree_keys_account_key_add(&n2->nr, 0, out);
		out = bkey_next(out);
	}

	set2->u64s = cpu_to_le16((u64 *) out - set2->_data);
	set_btree_bset_end(n2, n2->set);

	BUG_ON(!set1->u64s);
	BUG_ON(!set2->u64s);

	btree_node_reset_sib_u64s(n1);
	btree_node_reset_sib_u64s(n2);

	bch2_verify_btree_nr_keys(n1);
	bch2_verify_btree_nr_keys(n2);

	if (n1->c.level) {
		btree_node_interior_verify(as->c, n1);
		btree_node_interior_verify(as->c, n2);
	}

	return n2;
}

/*
 * For updates to interior nodes, we've got to do the insert before we split
 * because the stuff we're inserting has to be inserted atomically. Post split,
 * the keys might have to go in different nodes and the split would no longer be
 * atomic.
 *
 * Worse, if the insert is from btree node coalescing, if we do the insert after
 * we do the split (and pick the pivot) - the pivot we pick might be between
 * nodes that were coalesced, and thus in the middle of a child node post
 * coalescing:
 */
static void btree_split_insert_keys(struct btree_update *as,
				    struct btree_trans *trans,
				    struct btree_path *path,
				    struct btree *b,
				    struct keylist *keys)
{
	struct btree_node_iter node_iter;
	struct bkey_i *k = bch2_keylist_front(keys);
	struct bkey_packed *src, *dst, *n;
	struct bset *i;

	bch2_btree_node_iter_init(&node_iter, b, &k->k.p);

	__bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);

	/*
	 * We can't tolerate whiteouts here - with whiteouts there can be
	 * duplicate keys, and it would be rather bad if we picked a duplicate
	 * for the pivot:
	 */
	i = btree_bset_first(b);
	src = dst = i->start;
	while (src != vstruct_last(i)) {
		n = bkey_next(src);
		if (!bkey_deleted(src)) {
			memmove_u64s_down(dst, src, src->u64s);
			dst = bkey_next(dst);
		}
		src = n;
	}

	/* Also clear out the unwritten whiteouts area: */
	b->whiteout_u64s = 0;

	i->u64s = cpu_to_le16((u64 *) dst - i->_data);
	set_btree_bset_end(b, b->set);

	BUG_ON(b->nsets != 1 ||
	       b->nr.live_u64s != le16_to_cpu(btree_bset_first(b)->u64s));

	btree_node_interior_verify(as->c, b);
}

static void btree_split(struct btree_update *as, struct btree_trans *trans,
			struct btree_path *path, struct btree *b,
			struct keylist *keys, unsigned flags)
{
	struct bch_fs *c = as->c;
	struct btree *parent = btree_node_parent(path, b);
	struct btree *n1, *n2 = NULL, *n3 = NULL;
	u64 start_time = local_clock();

	BUG_ON(!parent && (b != btree_node_root(c, b)));
	BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));

	bch2_btree_interior_update_will_free_node(as, b);

	n1 = bch2_btree_node_alloc_replacement(as, b);

	if (keys)
		btree_split_insert_keys(as, trans, path, n1, keys);

	if (bset_u64s(&n1->set[0]) > BTREE_SPLIT_THRESHOLD(c)) {
		trace_btree_split(c, b);

		n2 = __btree_split_node(as, n1);

		bch2_btree_build_aux_trees(n2);
		bch2_btree_build_aux_trees(n1);
		six_unlock_write(&n2->c.lock);
		six_unlock_write(&n1->c.lock);

		bch2_btree_update_add_new_node(as, n1);

		bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);
		bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0);

		/*
		 * Note that on recursive parent_keys == keys, so we
		 * can't start adding new keys to parent_keys before emptying it
		 * out (which we did with btree_split_insert_keys() above)
		 */
		bch2_keylist_add(&as->parent_keys, &n1->key);
		bch2_keylist_add(&as->parent_keys, &n2->key);

		if (!parent) {
			/* Depth increases, make a new root */
1455
			n3 = __btree_root_alloc(as, b->c.level + 1);
1456 1457 1458 1459

			n3->sib_u64s[0] = U16_MAX;
			n3->sib_u64s[1] = U16_MAX;

			btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);

			bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0);
		}
	} else {
		trace_btree_compact(c, b);

		bch2_btree_build_aux_trees(n1);
		six_unlock_write(&n1->c.lock);

		bch2_btree_update_add_new_node(as, n1);

		bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0);

		if (parent)
			bch2_keylist_add(&as->parent_keys, &n1->key);
	}

	/* New nodes all written, now make them visible: */

	if (parent) {
		/* Split a non root node */
		bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);
	} else if (n3) {
		bch2_btree_set_root(as, trans, path, n3);
	} else {
		/* Root filled up but didn't need to be split */
		bch2_btree_set_root(as, trans, path, n1);
	}

	bch2_btree_update_get_open_buckets(as, n1);
	if (n2)
		bch2_btree_update_get_open_buckets(as, n2);
	if (n3)
		bch2_btree_update_get_open_buckets(as, n3);

	/* Successful split, update the path to point to the new nodes: */

	six_lock_increment(&b->c.lock, SIX_LOCK_intent);
	if (n3)
		bch2_trans_node_add(trans, n3);
	if (n2)
		bch2_trans_node_add(trans, n2);
	bch2_trans_node_add(trans, n1);

	/*
	 * The old node must be freed (in memory) _before_ unlocking the new
	 * nodes - else another thread could re-acquire a read lock on the old
	 * node after another thread has locked and updated the new node, thus
	 * seeing stale data:
	 */
	bch2_btree_node_free_inmem(trans, b);

	if (n3)
		six_unlock_intent(&n3->c.lock);
	if (n2)
		six_unlock_intent(&n2->c.lock);
	six_unlock_intent(&n1->c.lock);

	bch2_trans_verify_locks(trans);

	bch2_time_stats_update(&c->times[n2
			       ? BCH_TIME_btree_node_split
			       : BCH_TIME_btree_node_compact],
			       start_time);
}

static void
bch2_btree_insert_keys_interior(struct btree_update *as,
				struct btree_trans *trans,
				struct btree_path *path,
				struct btree *b,
				struct keylist *keys)
{
	struct btree_path *linked;

	__bch2_btree_insert_keys_interior(as, trans, path, b,
					  path->l[b->c.level].iter, keys);

	btree_update_updated_node(as, b);

	trans_for_each_path_with_node(trans, b, linked)
		bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);

	bch2_trans_verify_paths(trans);
}

/**
 * bch2_btree_insert_node - insert bkeys into a given btree node
 *
 * @as:			btree_update object
 * @trans:		btree_trans object
 * @path:		path pointing to @b
 * @b:			node to insert keys into
 * @keys:		list of keys to insert
 * @flags:		transaction commit flags
 *
 * Inserts as many keys as it can into a given btree node, splitting it if full.
 * If a split occurred, this function will return early. This can only happen
 * for leaf nodes -- inserts into interior nodes have to be atomic.
 */
static void bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
				   struct btree_path *path, struct btree *b,
				   struct keylist *keys, unsigned flags)
{
	struct bch_fs *c = as->c;
	int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
	int old_live_u64s = b->nr.live_u64s;
	int live_u64s_added, u64s_added;

	lockdep_assert_held(&c->gc_lock);
	BUG_ON(!btree_node_intent_locked(path, btree_node_root(c, b)->c.level));
	BUG_ON(!b->c.level);
	BUG_ON(!as || as->b);
	bch2_verify_keylist_sorted(keys);

	bch2_btree_node_lock_for_insert(trans, path, b);

	if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
		bch2_btree_node_unlock_write(trans, path, b);
		goto split;
	}

	btree_node_interior_verify(c, b);

	bch2_btree_insert_keys_interior(as, trans, path, b, keys);

	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
	u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;

	if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
	if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);

	if (u64s_added > live_u64s_added &&
	    bch2_maybe_compact_whiteouts(c, b))
		bch2_trans_node_reinit_iter(trans, b);

	bch2_btree_node_unlock_write(trans, path, b);

	btree_node_interior_verify(c, b);
	return;
split:
	btree_split(as, trans, path, b, keys, flags);
}

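/*
 * Split the node @path points to, then give each ancestor a chance to be
 * merged with a sibling:
 */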
int bch2_btree_split_leaf(struct btree_trans *trans,
			  struct btree_path *path,
			  unsigned flags)
{
	struct btree *b = path_l(path)->b;
	struct btree_update *as;
	unsigned l;
	int ret = 0;

	as = bch2_btree_update_start(trans, path, path->level,
				     true, flags);
	if (IS_ERR(as))
		return PTR_ERR(as);

	btree_split(as, trans, path, b, NULL, flags);
	bch2_btree_update_done(as);

	for (l = path->level + 1; btree_path_node(path, l) && !ret; l++)
		ret = bch2_foreground_maybe_merge(trans, path, l, flags);

	return ret;
}

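/*
 * Try to merge the node at @level with its sibling in direction @sib: if the
 * two nodes' keys (with some hysteresis) would still not fit in one node,
 * just record the sibling's size in b->sib_u64s; otherwise allocate a single
 * replacement node covering both and update the parent.
 */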
int __bch2_foreground_maybe_merge(struct btree_trans *trans,
				  struct btree_path *path,
				  unsigned level,
				  unsigned flags,
				  enum btree_node_sibling sib)
{
	struct bch_fs *c = trans->c;
	struct btree_path *sib_path = NULL;
	struct btree_update *as;
	struct bkey_format_state new_s;
	struct bkey_format new_f;
	struct bkey_i delete;
	struct btree *b, *m, *n, *prev, *next, *parent;
	struct bpos sib_pos;
	size_t sib_u64s;
	u64 start_time = local_clock();
	int ret = 0;

	BUG_ON(!path->should_be_locked);
	BUG_ON(!btree_node_locked(path, level));

	b = path->l[level].b;

	if ((sib == btree_prev_sib && !bpos_cmp(b->data->min_key, POS_MIN)) ||
	    (sib == btree_next_sib && !bpos_cmp(b->data->max_key, SPOS_MAX))) {
		b->sib_u64s[sib] = U16_MAX;
		return 0;
	}

	sib_pos = sib == btree_prev_sib
		? bpos_predecessor(b->data->min_key)
		: bpos_successor(b->data->max_key);

	sib_path = bch2_path_get(trans, path->btree_id, sib_pos,
				 U8_MAX, level, BTREE_ITER_INTENT);
	ret = bch2_btree_path_traverse(trans, sib_path, false);
	if (ret)
		goto err;

Kent Overstreet's avatar
Kent Overstreet committed
1667 1668 1669
	sib_path->should_be_locked = true;

	m = sib_path->l[level].b;
1670

Kent Overstreet's avatar
Kent Overstreet committed
1671 1672
	if (btree_node_parent(path, b) !=
	    btree_node_parent(sib_path, m)) {
1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684
		b->sib_u64s[sib] = U16_MAX;
		goto out;
	}

	if (sib == btree_prev_sib) {
		prev = m;
		next = b;
	} else {
		prev = b;
		next = m;
	}

	if (bkey_cmp(bpos_successor(prev->data->max_key), next->data->min_key)) {
		struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF;

		bch2_bpos_to_text(&buf1, prev->data->max_key);
		bch2_bpos_to_text(&buf2, next->data->min_key);
		bch_err(c,
			"btree topology error in btree merge:\n"
			"  prev ends at   %s\n"
			"  next starts at %s",
			buf1.buf, buf2.buf);
		printbuf_exit(&buf1);
		printbuf_exit(&buf2);
		bch2_topology_error(c);
		ret = -EIO;
		goto err;
	}

	bch2_bkey_format_init(&new_s);
	bch2_bkey_format_add_pos(&new_s, prev->data->min_key);
	__bch2_btree_calc_format(&new_s, prev);
	__bch2_btree_calc_format(&new_s, next);
	bch2_bkey_format_add_pos(&new_s, next->data->max_key);
	new_f = bch2_bkey_format_done(&new_s);

	sib_u64s = btree_node_u64s_with_format(b, &new_f) +
		btree_node_u64s_with_format(m, &new_f);

	if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) {
		sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
		sib_u64s /= 2;
		sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c);
	}

	sib_u64s = min(sib_u64s, btree_max_u64s(c));
1719
	sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1);
1720 1721
	b->sib_u64s[sib] = sib_u64s;

1722
	if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
1723 1724
		goto out;

	parent = btree_node_parent(path, b);
	as = bch2_btree_update_start(trans, path, level, false,
			 BTREE_INSERT_NOFAIL|
			 BTREE_INSERT_USE_RESERVE|
			 flags);
	ret = PTR_ERR_OR_ZERO(as);
	if (ret)
		goto err;

	trace_btree_merge(c, b);

	bch2_btree_interior_update_will_free_node(as, b);
	bch2_btree_interior_update_will_free_node(as, m);

	n = bch2_btree_node_alloc(as, b->c.level);

	SET_BTREE_NODE_SEQ(n->data,
			   max(BTREE_NODE_SEQ(b->data),
			       BTREE_NODE_SEQ(m->data)) + 1);

	btree_set_min(n, prev->data->min_key);
	btree_set_max(n, next->data->max_key);

	bch2_btree_update_add_new_node(as, n);

	n->data->format	 = new_f;
	btree_node_set_format(n, new_f);

	bch2_btree_sort_into(c, n, prev);
	bch2_btree_sort_into(c, n, next);

	bch2_btree_build_aux_trees(n);
	six_unlock_write(&n->c.lock);

	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);

	bkey_init(&delete.k);
	delete.k.p = prev->key.k.p;
	bch2_keylist_add(&as->parent_keys, &delete);
	bch2_keylist_add(&as->parent_keys, &n->key);

	bch2_trans_verify_paths(trans);

	bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags);

	bch2_trans_verify_paths(trans);

	bch2_btree_update_get_open_buckets(as, n);

	six_lock_increment(&b->c.lock, SIX_LOCK_intent);
	six_lock_increment(&m->c.lock, SIX_LOCK_intent);

	bch2_trans_node_add(trans, n);

	bch2_trans_verify_paths(trans);

	bch2_btree_node_free_inmem(trans, b);
	bch2_btree_node_free_inmem(trans, m);

	six_unlock_intent(&n->c.lock);

	bch2_btree_update_done(as);

	bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time);
out:
err:
	bch2_path_put(trans, sib_path, true);
	bch2_trans_verify_locks(trans);
	return ret;
}

/**
 * bch2_btree_node_rewrite - Rewrite/move a btree node
 */
int bch2_btree_node_rewrite(struct btree_trans *trans,
			    struct btree_iter *iter,
			    struct btree *b,
			    unsigned flags)
{
	struct bch_fs *c = trans->c;
	struct btree *n, *parent;
	struct btree_update *as;
	int ret;

	flags |= BTREE_INSERT_NOFAIL;

	parent = btree_node_parent(iter->path, b);
	as = bch2_btree_update_start(trans, iter->path, b->c.level,
				     false, flags);
	ret = PTR_ERR_OR_ZERO(as);
	if (ret)
		goto out;

	bch2_btree_interior_update_will_free_node(as, b);

	n = bch2_btree_node_alloc_replacement(as, b);
	bch2_btree_update_add_new_node(as, n);

	bch2_btree_build_aux_trees(n);
	six_unlock_write(&n->c.lock);

	trace_btree_rewrite(c, b);

	bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);

	if (parent) {
		bch2_keylist_add(&as->parent_keys, &n->key);
		bch2_btree_insert_node(as, trans, iter->path, parent,
				       &as->parent_keys, flags);
	} else {
		bch2_btree_set_root(as, trans, iter->path, n);
	}

	bch2_btree_update_get_open_buckets(as, n);

	six_lock_increment(&b->c.lock, SIX_LOCK_intent);
	bch2_trans_node_add(trans, n);
	bch2_btree_node_free_inmem(trans, b);
	six_unlock_intent(&n->c.lock);

	bch2_btree_update_done(as);
out:
	bch2_btree_path_downgrade(trans, iter->path);
	return ret;
}

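/*
 * Async rewrite machinery: bch2_btree_node_rewrite_async() snapshots just
 * enough state to find a node again (btree id, level, position, and the
 * node's sequence number) and queues a work item that performs the rewrite
 * later, outside the caller's context.
 */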
struct async_btree_rewrite {
	struct bch_fs		*c;
	struct work_struct	work;
	enum btree_id		btree_id;
	unsigned		level;
	struct bpos		pos;
	__le64			seq;
};

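/*
 * Work item helper: walk back to the recorded position and rewrite the node
 * there, but only if it's still the same node - the sequence number check
 * catches the case where the node was freed or rewritten in the meantime.
 */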
static int async_btree_node_rewrite_trans(struct btree_trans *trans,
					  struct async_btree_rewrite *a)
{
	struct btree_iter iter;
	struct btree *b;
	int ret;

	bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos,
				  BTREE_MAX_DEPTH, a->level, 0);
	b = bch2_btree_iter_peek_node(&iter);
	ret = PTR_ERR_OR_ZERO(b);
	if (ret)
		goto out;

	if (!b || b->data->keys.seq != a->seq)
		goto out;

	ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
out:
	bch2_trans_iter_exit(trans, &iter);

	return ret;
}

void async_btree_node_rewrite_work(struct work_struct *work)
{
	struct async_btree_rewrite *a =
		container_of(work, struct async_btree_rewrite, work);
	struct bch_fs *c = a->c;

	bch2_trans_do(c, NULL, NULL, 0,
		      async_btree_node_rewrite_trans(&trans, a));
	percpu_ref_put(&c->writes);
	kfree(a);
}

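/*
 * Queue an async rewrite of @b.  A reference on c->writes is held until the
 * work item completes; if we can't take that reference (or can't allocate
 * the work item) the rewrite is silently dropped.
 */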
void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
{
	struct async_btree_rewrite *a;

	if (!percpu_ref_tryget_live(&c->writes))
		return;

	a = kmalloc(sizeof(*a), GFP_NOFS);
	if (!a) {
		percpu_ref_put(&c->writes);
		return;
	}

	a->c		= c;
	a->btree_id	= b->c.btree_id;
	a->level	= b->c.level;
	a->pos		= b->key.k.p;
	a->seq		= b->data->keys.seq;

	INIT_WORK(&a->work, async_btree_node_rewrite_work);
	queue_work(c->btree_interior_update_worker, &a->work);
}

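/*
 * Update the key that points to btree node @b (e.g. when a pointer has to
 * change): run the transactional triggers for the old and new key unless
 * @skip_triggers, update the pointer to @b in its parent - or, if @b is the
 * root, emit a btree_root journal entry - and then update @b's key in
 * memory.  @new_hash is a pre-allocated btree node, needed when the new key
 * hashes differently so the hash table entry can be swapped without
 * allocating here.
 */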
static int __bch2_btree_node_update_key(struct btree_trans *trans,
					struct btree_iter *iter,
					struct btree *b, struct btree *new_hash,
					struct bkey_i *new_key,
					bool skip_triggers)
{
	struct bch_fs *c = trans->c;
	struct btree_iter iter2 = { NULL };
	struct btree *parent;
	int ret;

	if (!skip_triggers) {
		ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1,
					  bkey_i_to_s_c(&b->key), 0);
		if (ret)
			return ret;

		ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1,
					  new_key, 0);
		if (ret)
			return ret;
	}

	if (new_hash) {
		bkey_copy(&new_hash->key, new_key);
		ret = bch2_btree_node_hash_insert(&c->btree_cache,
				new_hash, b->c.level, b->c.btree_id);
		BUG_ON(ret);
	}

	parent = btree_node_parent(iter->path, b);
	if (parent) {
		bch2_trans_copy_iter(&iter2, iter);

		iter2.path = bch2_btree_path_make_mut(trans, iter2.path,
				iter2.flags & BTREE_ITER_INTENT);

		BUG_ON(iter2.path->level != b->c.level);
		BUG_ON(bpos_cmp(iter2.path->pos, new_key->k.p));

		btree_node_unlock(trans, iter2.path, iter2.path->level);
		path_l(iter2.path)->b = BTREE_ITER_NO_NODE_UP;
		iter2.path->level++;
		btree_path_set_dirty(iter2.path, BTREE_ITER_NEED_TRAVERSE);

		trans->paths_sorted = false;

		ret   = bch2_btree_iter_traverse(&iter2) ?:
			bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN);
		if (ret)
			goto err;
	} else {
		BUG_ON(btree_node_root(c, b) != b);

		ret = darray_make_room(&trans->extra_journal_entries,
				       jset_u64s(new_key->k.u64s));
		if (ret)
			return ret;

		journal_entry_set((void *) &darray_top(trans->extra_journal_entries),
				  BCH_JSET_ENTRY_btree_root,
				  b->c.btree_id, b->c.level,
				  new_key, new_key->k.u64s);
		trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s);
	}

	ret = bch2_trans_commit(trans, NULL, NULL,
				BTREE_INSERT_NOFAIL|
				BTREE_INSERT_NOCHECK_RW|
				BTREE_INSERT_USE_RESERVE|
				BTREE_INSERT_JOURNAL_RECLAIM|
				JOURNAL_WATERMARK_reserved);
	if (ret)
		goto err;

	bch2_btree_node_lock_write(trans, iter->path, b);

	if (new_hash) {
		mutex_lock(&c->btree_cache.lock);
		bch2_btree_node_hash_remove(&c->btree_cache, new_hash);
		bch2_btree_node_hash_remove(&c->btree_cache, b);

		bkey_copy(&b->key, new_key);
		ret = __bch2_btree_node_hash_insert(&c->btree_cache, b);
		BUG_ON(ret);
		mutex_unlock(&c->btree_cache.lock);
	} else {
		bkey_copy(&b->key, new_key);
	}

	bch2_btree_node_unlock_write(trans, iter->path, b);
out:
	bch2_trans_iter_exit(trans, &iter2);
	return ret;
err:
	if (new_hash) {
		mutex_lock(&c->btree_cache.lock);
		bch2_btree_node_hash_remove(&c->btree_cache, b);
		mutex_unlock(&c->btree_cache.lock);
	}
	goto out;
}

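/*
 * Wrapper around __bch2_btree_node_update_key() that takes the required
 * intent locks and, if the new key changes the node's hash value,
 * pre-allocates a btree node for the hash table swap - potentially
 * cannibalizing the btree node cache to do so.
 */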
int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter,
			       struct btree *b, struct bkey_i *new_key,
			       bool skip_triggers)
{
	struct bch_fs *c = trans->c;
	struct btree *new_hash = NULL;
	struct btree_path *path = iter->path;
	struct closure cl;
	int ret = 0;

	if (!btree_node_intent_locked(path, b->c.level) &&
	    !bch2_btree_path_upgrade(trans, path, b->c.level + 1)) {
		btree_trans_restart(trans);
		return -EINTR;
	}

	closure_init_stack(&cl);

	/*
	 * check btree_ptr_hash_val() after @b is locked by
	 * btree_iter_traverse():
	 */
	if (btree_ptr_hash_val(new_key) != b->hash_val) {
		ret = bch2_btree_cache_cannibalize_lock(c, &cl);
		if (ret) {
			bch2_trans_unlock(trans);
			closure_sync(&cl);
			if (!bch2_trans_relock(trans))
				return -EINTR;
		}

		new_hash = bch2_btree_node_mem_alloc(c, false);
	}

	path->intent_ref++;
	ret = __bch2_btree_node_update_key(trans, iter, b, new_hash,
					   new_key, skip_triggers);
	--path->intent_ref;

	if (new_hash) {
		mutex_lock(&c->btree_cache.lock);
		list_move(&new_hash->list, &c->btree_cache.freeable);
		mutex_unlock(&c->btree_cache.lock);

		six_unlock_write(&new_hash->c.lock);
		six_unlock_intent(&new_hash->c.lock);
	}
	closure_sync(&cl);
	bch2_btree_cache_cannibalize_unlock(c);
	return ret;
}

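/*
 * Variant of bch2_btree_node_update_key() for callers that don't already
 * hold an iterator: creates one pointing at @b, and bails out harmlessly if
 * the node has been freed in the meantime.
 */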
int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
					struct btree *b, struct bkey_i *new_key,
					bool skip_triggers)
{
	struct btree_iter iter;
	int ret;

	bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p,
				  BTREE_MAX_DEPTH, b->c.level,
				  BTREE_ITER_INTENT);
	ret = bch2_btree_iter_traverse(&iter);
	if (ret)
		goto out;

	/* has node been freed? */
	if (iter.path->l[b->c.level].b != b) {
		/* node has been freed: */
		BUG_ON(!btree_node_dying(b));
		goto out;
	}

	BUG_ON(!btree_node_hashed(b));

	ret = bch2_btree_node_update_key(trans, &iter, b, new_key, skip_triggers);
out:
	bch2_trans_iter_exit(trans, &iter);
	return ret;
}

/* Init code: */

/*
 * Only for filesystem bringup, when first reading the btree roots or allocating
 * btree roots when initializing a new filesystem:
 */
void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b)
{
	BUG_ON(btree_node_root(c, b));

	bch2_btree_set_root_inmem(c, b);
}

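/*
 * Allocate an empty in-memory root for btree @id when initializing a new
 * filesystem: the node is marked fake (its key doesn't point at a real
 * on-disk location) and flagged as needing a rewrite.
 */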
void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id)
{
	struct closure cl;
	struct btree *b;
	int ret;

	closure_init_stack(&cl);

	do {
		ret = bch2_btree_cache_cannibalize_lock(c, &cl);
		closure_sync(&cl);
	} while (ret);

	b = bch2_btree_node_mem_alloc(c, false);
	bch2_btree_cache_cannibalize_unlock(c);

	set_btree_node_fake(b);
	set_btree_node_need_rewrite(b);
	b->c.level	= 0;
	b->c.btree_id	= id;

	bkey_btree_ptr_init(&b->key);
	b->key.k.p = SPOS_MAX;
	*((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id;

	bch2_bset_init_first(b, &b->data->keys);
	bch2_btree_build_aux_trees(b);

	b->data->flags = 0;
	btree_set_min(b, POS_MIN);
	btree_set_max(b, SPOS_MAX);
	b->data->format = bch2_btree_calc_format(b);
	btree_node_set_format(b, b->data->format);

	ret = bch2_btree_node_hash_insert(&c->btree_cache, b,
					  b->c.level, b->c.btree_id);
	BUG_ON(ret);

	bch2_btree_set_root_inmem(c, b);

	six_unlock_write(&b->c.lock);
	six_unlock_intent(&b->c.lock);
}

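/* Print the list of in-flight interior btree updates, for debugging: */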
void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c)
{
	struct btree_update *as;

	mutex_lock(&c->btree_interior_update_lock);
	list_for_each_entry(as, &c->btree_interior_update_list, list)
		prt_printf(out, "%p m %u w %u r %u j %llu\n",
		       as,
		       as->mode,
		       as->nodes_written,
		       atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK,
		       as->journal.seq);
	mutex_unlock(&c->btree_interior_update_lock);
}

static bool bch2_btree_interior_updates_pending(struct bch_fs *c)
{
	bool ret;

	mutex_lock(&c->btree_interior_update_lock);
	ret = !list_empty(&c->btree_interior_update_list);
	mutex_unlock(&c->btree_interior_update_lock);

	return ret;
}

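/*
 * Wait for all in-flight interior updates to complete; returns true if
 * there were any to wait for.
 */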
bool bch2_btree_interior_updates_flush(struct bch_fs *c)
{
	bool ret = bch2_btree_interior_updates_pending(c);

	if (ret)
		closure_wait_event(&c->btree_interior_update_wait,
				   !bch2_btree_interior_updates_pending(c));
	return ret;
}

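/*
 * Btree roots are persisted in the journal: on startup they're picked out
 * of the journal entries read during recovery, and when writing a journal
 * entry any live roots that aren't already present get appended.
 */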
void bch2_journal_entries_to_btree_roots(struct bch_fs *c, struct jset *jset)
{
	struct btree_root *r;
	struct jset_entry *entry;

	mutex_lock(&c->btree_root_lock);

	vstruct_for_each(jset, entry)
		if (entry->type == BCH_JSET_ENTRY_btree_root) {
			r = &c->btree_roots[entry->btree_id];
			r->level = entry->level;
			r->alive = true;
			bkey_copy(&r->key, &entry->start[0]);
		}

	mutex_unlock(&c->btree_root_lock);
}

struct jset_entry *
bch2_btree_roots_to_journal_entries(struct bch_fs *c,
				    struct jset_entry *start,
				    struct jset_entry *end)
{
	struct jset_entry *entry;
	unsigned long have = 0;
	unsigned i;

	for (entry = start; entry < end; entry = vstruct_next(entry))
		if (entry->type == BCH_JSET_ENTRY_btree_root)
			__set_bit(entry->btree_id, &have);

	mutex_lock(&c->btree_root_lock);

	for (i = 0; i < BTREE_ID_NR; i++)
		if (c->btree_roots[i].alive && !test_bit(i, &have)) {
			journal_entry_set(end,
					  BCH_JSET_ENTRY_btree_root,
					  i, c->btree_roots[i].level,
					  &c->btree_roots[i].key,
					  c->btree_roots[i].key.u64s);
			end = vstruct_next(end);
		}

	mutex_unlock(&c->btree_root_lock);

	return end;
}

void bch2_fs_btree_interior_update_exit(struct bch_fs *c)
{
	if (c->btree_interior_update_worker)
		destroy_workqueue(c->btree_interior_update_worker);
	mempool_exit(&c->btree_interior_update_pool);
}

int bch2_fs_btree_interior_update_init(struct bch_fs *c)
{
	mutex_init(&c->btree_reserve_cache_lock);
	INIT_LIST_HEAD(&c->btree_interior_update_list);
	INIT_LIST_HEAD(&c->btree_interior_updates_unwritten);
	mutex_init(&c->btree_interior_update_lock);
	INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work);

	c->btree_interior_update_worker =
		alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1);
	if (!c->btree_interior_update_worker)
		return -ENOMEM;

	return mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1,
					 sizeof(struct btree_update));
}