// SPDX-License-Identifier: GPL-2.0

#include "bcachefs.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_iter.h"
#include "btree_locking.h"
#include "buckets.h"
#include "debug.h"
#include "error.h"
#include "extent_update.h"
#include "journal.h"
#include "journal_reclaim.h"
#include "keylist.h"
#include "replicas.h"
#include "trace.h"

#include <linux/prefetch.h>
#include <linux/sort.h>

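/*
 * Returns true if this update is going to the same leaf node as the previous
 * update in the transaction:
 */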
static inline bool same_leaf_as_prev(struct btree_trans *trans,
				     struct btree_insert_entry *i)
{
	return i != trans->updates &&
		i[0].iter->l[0].b == i[-1].iter->l[0].b;
}

inline void bch2_btree_node_lock_for_insert(struct bch_fs *c, struct btree *b,
					    struct btree_iter *iter)
{
	bch2_btree_node_lock_write(b, iter);

	if (unlikely(btree_node_just_written(b)) &&
	    bch2_btree_post_write_cleanup(c, b))
		bch2_btree_iter_reinit_node(iter, b);

	/*
	 * If the last bset has been written, or if it's gotten too big - start
	 * a new bset to insert into:
	 */
	if (want_new_bset(c, b))
		bch2_btree_init_next(c, b, iter);
}

/* Inserting into a given leaf node (last stage of insert): */

/* Handle overwrites and do insert, for non extents: */
bool bch2_btree_bset_insert_key(struct btree_iter *iter,
				struct btree *b,
				struct btree_node_iter *node_iter,
				struct bkey_i *insert)
{
	const struct bkey_format *f = &b->format;
	struct bkey_packed *k;
	unsigned clobber_u64s;

	EBUG_ON(btree_node_just_written(b));
	EBUG_ON(bset_written(b, btree_bset_last(b)));
	EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
	EBUG_ON(bkey_cmp(bkey_start_pos(&insert->k), b->data->min_key) < 0 ||
		bkey_cmp(insert->k.p, b->data->max_key) > 0);

	k = bch2_btree_node_iter_peek_all(node_iter, b);
	if (k && bkey_cmp_packed(b, k, &insert->k))
		k = NULL;

	/* @k is the key being overwritten/deleted, if any: */

	EBUG_ON(k && bkey_whiteout(k));

	if (bkey_whiteout(&insert->k)) {
		/* Deleting: */

		/* Not found? Nothing to do: */
		if (!k)
			return false;

		btree_account_key_drop(b, k);
		k->type = KEY_TYPE_deleted;

		if (k->needs_whiteout) {
			push_whiteout(iter->trans->c, b, k);
			k->needs_whiteout = false;
		}

		if (k >= btree_bset_last(b)->start) {
			clobber_u64s = k->u64s;

			bch2_bset_delete(b, k, clobber_u64s);
			bch2_btree_node_iter_fix(iter, b, node_iter, k,
						 clobber_u64s, 0);
		} else {
			bch2_btree_iter_fix_key_modified(iter, b, k);
		}

		return true;
	}

	if (k) {
		/* Overwriting: */
		if (!bkey_written(b, k) &&
		    bkey_val_u64s(&insert->k) == bkeyp_val_u64s(f, k)) {
			k->type = insert->k.type;
			memcpy_u64s(bkeyp_val(f, k), &insert->v,
				    bkey_val_u64s(&insert->k));
			return true;
		}

		btree_account_key_drop(b, k);
		k->type = KEY_TYPE_deleted;

		insert->k.needs_whiteout = k->needs_whiteout;
		k->needs_whiteout = false;

		if (k >= btree_bset_last(b)->start) {
			clobber_u64s = k->u64s;
			goto overwrite;
		} else {
			bch2_btree_iter_fix_key_modified(iter, b, k);
		}
	}

	k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b));
	clobber_u64s = 0;
overwrite:
	bch2_bset_insert(b, node_iter, k, insert, clobber_u64s);
	bch2_btree_node_iter_fix(iter, b, node_iter, k,
				 clobber_u64s, k->u64s);
	return true;
}

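/*
 * Journal pin flush callback: write out the btree node so the journal can
 * reclaim the space pinned by its unwritten updates:
 */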
static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
			       unsigned i, u64 seq)
{
	struct bch_fs *c = container_of(j, struct bch_fs, journal);
	struct btree_write *w = container_of(pin, struct btree_write, journal);
	struct btree *b = container_of(w, struct btree, writes[i]);

	btree_node_lock_type(c, b, SIX_LOCK_read);
	bch2_btree_node_write_cond(c, b,
		(btree_current_write(b) == w && w->journal.seq == seq));
	six_unlock_read(&b->c.lock);
}

static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
	return __btree_node_flush(j, pin, 0, seq);
}

static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
{
	return __btree_node_flush(j, pin, 1, seq);
}

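/* Add the key to the journal entry for this transaction's reservation: */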
static inline void __btree_journal_key(struct btree_trans *trans,
				       enum btree_id btree_id,
				       struct bkey_i *insert)
{
	struct journal *j = &trans->c->journal;
	u64 seq = trans->journal_res.seq;
	bool needs_whiteout = insert->k.needs_whiteout;

	/* ick */
	insert->k.needs_whiteout = false;
	bch2_journal_add_keys(j, &trans->journal_res,
			      btree_id, insert);
	insert->k.needs_whiteout = needs_whiteout;

	bch2_journal_set_has_inode(j, &trans->journal_res,
				   insert->k.p.inode);

	if (trans->journal_seq)
		*trans->journal_seq = seq;
}

void bch2_btree_journal_key(struct btree_trans *trans,
			   struct btree_iter *iter,
			   struct bkey_i *insert)
{
	struct bch_fs *c = trans->c;
	struct journal *j = &c->journal;
	struct btree *b = iter->l[0].b;
	struct btree_write *w = btree_current_write(b);

	EBUG_ON(iter->level || b->c.level);
	EBUG_ON(trans->journal_res.ref !=
		!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY));

	if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
		__btree_journal_key(trans, iter->btree_id, insert);
		btree_bset_last(b)->journal_seq =
			cpu_to_le64(trans->journal_res.seq);
	}

	if (unlikely(!journal_pin_active(&w->journal))) {
		u64 seq = likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
			? trans->journal_res.seq
			: j->replay_journal_seq;

		bch2_journal_pin_add(j, seq, &w->journal,
				     btree_node_write_idx(b) == 0
				     ? btree_node_flush0
				     : btree_node_flush1);
	}

	if (unlikely(!btree_node_dirty(b)))
		set_btree_node_dirty(b);
}

static void bch2_insert_fixup_key(struct btree_trans *trans,
				  struct btree_iter *iter,
				  struct bkey_i *insert)
{
	struct btree_iter_level *l = &iter->l[0];

	EBUG_ON(iter->level);
	EBUG_ON(insert->k.u64s >
		bch_btree_keys_u64s_remaining(trans->c, l->b));

	if (likely(bch2_btree_bset_insert_key(iter, l->b, &l->iter, insert)))
		bch2_btree_journal_key(trans, iter, insert);
}

/**
 * btree_insert_key_leaf - insert one key into a leaf node
 */
static void btree_insert_key_leaf(struct btree_trans *trans,
				  struct btree_iter *iter,
				  struct bkey_i *insert)
{
	struct bch_fs *c = trans->c;
	struct btree *b = iter->l[0].b;
	struct bset_tree *t = bset_tree_last(b);
	int old_u64s = bset_u64s(t);
	int old_live_u64s = b->nr.live_u64s;
	int live_u64s_added, u64s_added;

	insert->k.needs_whiteout = false;

	if (!btree_node_is_extents(b))
		bch2_insert_fixup_key(trans, iter, insert);
	else
		bch2_insert_fixup_extent(trans, iter, insert);

	live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
	u64s_added = (int) bset_u64s(t) - old_u64s;

	if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added);
	if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0)
		b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added);

	if (u64s_added > live_u64s_added &&
	    bch2_maybe_compact_whiteouts(c, b))
		bch2_btree_iter_reinit_node(iter, b);

	trace_btree_insert_key(c, b, insert);
}

/* Normal update interface: */

static inline void btree_insert_entry_checks(struct btree_trans *trans,
					     struct btree_iter *iter,
					     struct bkey_i *insert)
{
	struct bch_fs *c = trans->c;

	BUG_ON(iter->level);
	BUG_ON(bkey_cmp(bkey_start_pos(&insert->k), iter->pos));
	EBUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
		bkey_cmp(insert->k.p, iter->l[0].b->key.k.p) > 0);

	BUG_ON(debug_check_bkeys(c) &&
	       !bkey_deleted(&insert->k) &&
	       bch2_bkey_invalid(c, bkey_i_to_s_c(insert), iter->btree_id));
}

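/*
 * Slow path for journal pre-reservation: drop btree locks so we can block,
 * then relock (restarting the transaction if that fails):
 */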
static noinline int
bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned u64s)
{
	struct bch_fs *c = trans->c;
	int ret;

	bch2_trans_unlock(trans);

	ret = bch2_journal_preres_get(&c->journal,
			&trans->journal_preres, u64s, 0);
	if (ret)
		return ret;

	if (!bch2_trans_relock(trans)) {
		trace_trans_restart_journal_preres_get(trans->ip);
		return -EINTR;
	}

	return 0;
}

static inline int bch2_trans_journal_res_get(struct btree_trans *trans,
					     unsigned flags)
{
	struct bch_fs *c = trans->c;
	int ret;

	if (trans->flags & BTREE_INSERT_JOURNAL_RESERVED)
		flags |= JOURNAL_RES_GET_RESERVED;

	ret = bch2_journal_res_get(&c->journal, &trans->journal_res,
				   trans->journal_u64s, flags);

	return ret == -EAGAIN ? BTREE_INSERT_NEED_JOURNAL_RES : ret;
}

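/*
 * Check whether the insert fits in the leaf node; for extents, check with
 * bch2_extent_can_insert() first:
 */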
static enum btree_insert_ret
btree_key_can_insert(struct btree_trans *trans,
		     struct btree_iter *iter,
		     struct bkey_i *insert,
		     unsigned *u64s)
{
	struct bch_fs *c = trans->c;
	struct btree *b = iter->l[0].b;
	enum btree_insert_ret ret;

	if (unlikely(btree_node_fake(b)))
		return BTREE_INSERT_BTREE_NODE_FULL;

	ret = !btree_node_is_extents(b)
		? BTREE_INSERT_OK
		: bch2_extent_can_insert(trans, iter, insert, u64s);
	if (ret)
		return ret;

	if (*u64s > bch_btree_keys_u64s_remaining(c, b))
		return BTREE_INSERT_BTREE_NODE_FULL;

	return BTREE_INSERT_OK;
}

static inline void do_btree_insert_one(struct btree_trans *trans,
				       struct btree_iter *iter,
				       struct bkey_i *insert)
{
	btree_insert_key_leaf(trans, iter, insert);
}

static inline bool iter_has_trans_triggers(struct btree_iter *iter)
{
	return BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << iter->btree_id);
}

static inline bool iter_has_nontrans_triggers(struct btree_iter *iter)
{
	return (BTREE_NODE_TYPE_HAS_TRIGGERS &
		~BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS) &
		(1U << iter->btree_id);
}

static noinline void bch2_btree_iter_unlock_noinline(struct btree_iter *iter)
{
	__bch2_btree_iter_unlock(iter);
}

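/*
 * If gc is currently running, also update the gc copies of bucket/usage info
 * for nodes gc has already visited:
 */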
static noinline void bch2_trans_mark_gc(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	struct btree_insert_entry *i;

	trans_for_each_update(trans, i)
		if (gc_visited(c, gc_pos_btree_node(i->iter->l[0].b)))
			bch2_mark_update(trans, i->iter, i->k, NULL,
					 i->trigger_flags|BTREE_TRIGGER_GC);
}

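/*
 * Apply the transaction's updates with the relevant leaf nodes write locked:
 * check that the inserts fit, get the journal reservation, run trigger/mark
 * code, then do the actual inserts:
 */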
static inline int
bch2_trans_commit_write_locked(struct btree_trans *trans,
			       struct btree_insert_entry **stopped_at)
{
	struct bch_fs *c = trans->c;
	struct bch_fs_usage_online *fs_usage = NULL;
	struct btree_insert_entry *i;
	unsigned u64s = 0;
	bool marking = false;
	int ret;

	if (race_fault()) {
		trace_trans_restart_fault_inject(trans->ip);
		return -EINTR;
	}

	/*
	 * Check if the insert will fit in the leaf node with the write lock
	 * held, otherwise another thread could write the node changing the
	 * amount of space available:
	 */

	prefetch(&trans->c->journal.flags);

	trans_for_each_update(trans, i) {
		/* Multiple inserts might go to same leaf: */
		if (!same_leaf_as_prev(trans, i))
			u64s = 0;

		u64s += i->k->k.u64s;
		ret = btree_key_can_insert(trans, i->iter, i->k, &u64s);
		if (ret) {
			*stopped_at = i;
			return ret;
		}

		if (btree_node_type_needs_gc(i->iter->btree_id))
			marking = true;
	}

	if (marking) {
		percpu_down_read(&c->mark_lock);
		fs_usage = bch2_fs_usage_scratch_get(c);
	}

	/*
	 * Don't get journal reservation until after we know insert will
	 * succeed:
	 */
	if (likely(!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))) {
		ret = bch2_trans_journal_res_get(trans,
				JOURNAL_RES_GET_NONBLOCK);
		if (ret)
			goto err;
	}

	/*
	 * Not allowed to fail after we've gotten our journal reservation - we
	 * have to use it:
	 */

	if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY)) {
		if (journal_seq_verify(c))
			trans_for_each_update(trans, i)
				i->k->k.version.lo = trans->journal_res.seq;
		else if (inject_invalid_keys(c))
			trans_for_each_update(trans, i)
				i->k->k.version = MAX_VERSION;
	}

	/* Must be called under mark_lock: */
	if (marking && trans->fs_usage_deltas &&
	    bch2_replicas_delta_list_apply(c, &fs_usage->u,
					   trans->fs_usage_deltas)) {
		ret = BTREE_INSERT_NEED_MARK_REPLICAS;
		goto err;
	}

	trans_for_each_update(trans, i)
		if (iter_has_nontrans_triggers(i->iter))
			bch2_mark_update(trans, i->iter, i->k,
					 &fs_usage->u, i->trigger_flags);

	if (marking)
		bch2_trans_fs_usage_apply(trans, fs_usage);

	if (unlikely(c->gc_pos.phase))
		bch2_trans_mark_gc(trans);

	trans_for_each_update(trans, i)
		do_btree_insert_one(trans, i->iter, i->k);
err:
	if (marking) {
		bch2_fs_usage_scratch_put(c, fs_usage);
		percpu_up_read(&c->mark_lock);
	}

	return ret;
}

/*
 * Get journal reservation, take write locks, and attempt to do btree update(s):
 */
static inline int do_bch2_trans_commit(struct btree_trans *trans,
				       struct btree_insert_entry **stopped_at)
{
	struct btree_insert_entry *i;
	struct btree_iter *iter;
	int ret;

	trans_for_each_update(trans, i)
		BUG_ON(!btree_node_intent_locked(i->iter, 0));

	ret = bch2_journal_preres_get(&trans->c->journal,
			&trans->journal_preres, trans->journal_preres_u64s,
			JOURNAL_RES_GET_NONBLOCK);
	if (unlikely(ret == -EAGAIN))
		ret = bch2_trans_journal_preres_get_cold(trans,
						trans->journal_preres_u64s);
	if (unlikely(ret))
		return ret;

	/*
	 * Can't be holding any read locks when we go to take write locks:
	 *
	 * note - this must be done after bch2_trans_journal_preres_get_cold()
	 * or anything else that might call bch2_trans_relock(), since that
	 * would just retake the read locks:
	 */
	trans_for_each_iter_all(trans, iter) {
		if (iter->nodes_locked != iter->nodes_intent_locked) {
			EBUG_ON(iter->flags & BTREE_ITER_KEEP_UNTIL_COMMIT);
			EBUG_ON(trans->iters_live & (1ULL << iter->idx));
			bch2_btree_iter_unlock_noinline(iter);
		}
	}

	if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
		trans_for_each_update(trans, i)
			btree_insert_entry_checks(trans, i->iter, i->k);
	bch2_btree_trans_verify_locks(trans);

	trans_for_each_update(trans, i)
		if (!same_leaf_as_prev(trans, i))
			bch2_btree_node_lock_for_insert(trans->c,
						i->iter->l[0].b, i->iter);

	ret = bch2_trans_commit_write_locked(trans, stopped_at);

	trans_for_each_update(trans, i)
		if (!same_leaf_as_prev(trans, i))
			bch2_btree_node_unlock_write_inlined(i->iter->l[0].b,
							     i->iter);

	/*
	 * Drop journal reservation after dropping write locks, since dropping
	 * the journal reservation may kick off a journal write:
	 */
	bch2_journal_res_put(&trans->c->journal, &trans->journal_res);

	if (unlikely(ret))
		return ret;

	if (trans->flags & BTREE_INSERT_NOUNLOCK)
		trans->nounlock = true;

	trans_for_each_update(trans, i)
		if (!same_leaf_as_prev(trans, i))
			bch2_foreground_maybe_merge(trans->c, i->iter,
						    0, trans->flags);

	trans->nounlock = false;

	trans_for_each_update(trans, i)
		bch2_btree_iter_downgrade(i->iter);

	return 0;
}

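/*
 * Handle an error from the locked commit path: returns 0 if the commit can
 * simply be retried, otherwise an error (-EINTR if the transaction needs to
 * be restarted):
 */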
static noinline
int bch2_trans_commit_error(struct btree_trans *trans,
			    struct btree_insert_entry *i,
			    int ret)
{
	struct bch_fs *c = trans->c;
	unsigned flags = trans->flags;

	/*
	 * BTREE_INSERT_NOUNLOCK means don't unlock _after_ successful btree
	 * update; if we haven't done anything yet it doesn't apply
	 */
	flags &= ~BTREE_INSERT_NOUNLOCK;

	switch (ret) {
	case BTREE_INSERT_BTREE_NODE_FULL:
		ret = bch2_btree_split_leaf(c, i->iter, flags);

		/*
		 * if the split succeeded without dropping locks the insert will
		 * still be atomic (what the caller peeked() and is overwriting
		 * won't have changed)
		 */
#if 0
		/*
		 * XXX:
		 * split -> btree node merging (of parent node) might still drop
		 * locks when we're not passing it BTREE_INSERT_NOUNLOCK
		 *
		 * we don't want to pass BTREE_INSERT_NOUNLOCK to split as that
		 * will inhibit merging - but we don't have a reliable way yet
		 * (do we?) of checking if we dropped locks in this path
		 */
		if (!ret)
			goto retry;
#endif

		/*
		 * don't care if we got ENOSPC because we told split it
		 * couldn't block:
		 */
		if (!ret ||
		    ret == -EINTR ||
		    (flags & BTREE_INSERT_NOUNLOCK)) {
			trace_trans_restart_btree_node_split(trans->ip);
			ret = -EINTR;
		}
		break;
	case BTREE_INSERT_ENOSPC:
		ret = -ENOSPC;
		break;
	case BTREE_INSERT_NEED_MARK_REPLICAS:
		bch2_trans_unlock(trans);

		trans_for_each_update(trans, i) {
			ret = bch2_mark_bkey_replicas(c, bkey_i_to_s_c(i->k));
			if (ret)
				return ret;
		}

		if (bch2_trans_relock(trans))
			return 0;

		trace_trans_restart_mark_replicas(trans->ip);
		ret = -EINTR;
		break;
	case BTREE_INSERT_NEED_JOURNAL_RES:
		bch2_trans_unlock(trans);

		ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK);
		if (ret)
			return ret;

		if (bch2_trans_relock(trans))
			return 0;

		trace_trans_restart_journal_res_get(trans->ip);
		ret = -EINTR;
		break;
	default:
		BUG_ON(ret >= 0);
		break;
	}

	if (ret == -EINTR) {
		int ret2 = bch2_btree_iter_traverse_all(trans);

		if (ret2) {
			trace_trans_restart_traverse(trans->ip);
			return ret2;
		}

		trace_trans_restart_atomic(trans->ip);
	}

	return ret;
}

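/*
 * Slow path when the filesystem isn't read-write yet: go read-write if the
 * caller passed BTREE_INSERT_LAZY_RW, otherwise fail with -EROFS:
 */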
static noinline int
bch2_trans_commit_get_rw_cold(struct btree_trans *trans)
{
	struct bch_fs *c = trans->c;
	int ret;

	if (likely(!(trans->flags & BTREE_INSERT_LAZY_RW)))
		return -EROFS;

	bch2_trans_unlock(trans);

	ret = bch2_fs_read_write_early(c);
	if (ret)
		return ret;

	percpu_ref_get(&c->writes);
	return 0;
}

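/*
 * Run transactional triggers on the queued updates, then journal and insert
 * them:
 */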
int __bch2_trans_commit(struct btree_trans *trans)
{
	struct btree_insert_entry *i = NULL;
	struct btree_iter *iter;
	bool trans_trigger_run;
	unsigned u64s;
	int ret = 0;

	BUG_ON(trans->need_reset);

	if (!trans->nr_updates)
		goto out_noupdates;

	if (trans->flags & BTREE_INSERT_GC_LOCK_HELD)
		lockdep_assert_held(&trans->c->gc_lock);

	memset(&trans->journal_preres, 0, sizeof(trans->journal_preres));

	trans->journal_u64s		= 0;
	trans->journal_preres_u64s	= 0;

	if (!(trans->flags & BTREE_INSERT_NOCHECK_RW) &&
	    unlikely(!percpu_ref_tryget(&trans->c->writes))) {
		ret = bch2_trans_commit_get_rw_cold(trans);
		if (ret)
			return ret;
	}

	/*
	 * Running triggers will append more updates to the list of updates as
	 * we're walking it:
	 */
	do {
		trans_trigger_run = false;

		trans_for_each_update(trans, i) {
			if (unlikely(i->iter->uptodate > BTREE_ITER_NEED_PEEK)) {
				trace_trans_restart_traverse(trans->ip);
				ret = -EINTR;
				goto out;
			}

			/*
			 * We're not using bch2_btree_iter_upgrade here because
			 * we know trans->nounlock can't be set:
			 */
			if (unlikely(i->iter->locks_want < 1 &&
				     !__bch2_btree_iter_upgrade(i->iter, 1))) {
				trace_trans_restart_upgrade(trans->ip);
				ret = -EINTR;
				goto out;
			}

			if (iter_has_trans_triggers(i->iter) &&
			    !i->trans_triggers_run) {
				i->trans_triggers_run = true;
				trans_trigger_run = true;

				ret = bch2_trans_mark_update(trans, i->iter, i->k,
							     i->trigger_flags);
				if (unlikely(ret)) {
					if (ret == -EINTR)
						trace_trans_restart_mark(trans->ip);
					goto out;
				}
			}
		}
	} while (trans_trigger_run);

	trans_for_each_update(trans, i) {
		u64s = jset_u64s(i->k->k.u64s);
		if (0)
			trans->journal_preres_u64s += u64s;
		trans->journal_u64s += u64s;
	}
retry:
	memset(&trans->journal_res, 0, sizeof(trans->journal_res));

	ret = do_bch2_trans_commit(trans, &i);

	/* make sure we didn't drop or screw up locks: */
	bch2_btree_trans_verify_locks(trans);

	if (ret)
		goto err;

	trans_for_each_iter(trans, iter)
		if ((trans->iters_live & (1ULL << iter->idx)) &&
		    (iter->flags & BTREE_ITER_SET_POS_AFTER_COMMIT)) {
			if (trans->flags & BTREE_INSERT_NOUNLOCK)
				bch2_btree_iter_set_pos_same_leaf(iter, iter->pos_after_commit);
			else
				bch2_btree_iter_set_pos(iter, iter->pos_after_commit);
		}
out:
	bch2_journal_preres_put(&trans->c->journal, &trans->journal_preres);

	if (likely(!(trans->flags & BTREE_INSERT_NOCHECK_RW)))
		percpu_ref_put(&trans->c->writes);
out_noupdates:
	bch2_trans_reset(trans, TRANS_RESET_MEM|TRANS_RESET_NOTRAVERSE);

	return ret;
err:
	ret = bch2_trans_commit_error(trans, i, ret);
	if (ret)
		goto out;

	goto retry;
}

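/*
 * Add an update to the transaction, keeping the list of updates sorted by
 * iterator position and trimming/dropping any existing updates that the new
 * update overwrites:
 */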
int bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
		      struct bkey_i *k, enum btree_trigger_flags flags)
{
	struct btree_insert_entry *i, n = (struct btree_insert_entry) {
		.trigger_flags = flags, .iter = iter, .k = k
	};

	EBUG_ON(bkey_cmp(iter->pos, bkey_start_pos(&k->k)));

	iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;

	if (iter->flags & BTREE_ITER_IS_EXTENTS) {
		iter->pos_after_commit = k->k.p;
		iter->flags |= BTREE_ITER_SET_POS_AFTER_COMMIT;
	}

	/*
	 * Pending updates are kept sorted: first, find position of new update:
	 */
	trans_for_each_update(trans, i)
		if (btree_iter_cmp(iter, i->iter) <= 0)
			break;

	/*
	 * Now delete/trim any updates the new update overwrites:
	 */
	if (i > trans->updates &&
	    i[-1].iter->btree_id == iter->btree_id &&
	    bkey_cmp(iter->pos, i[-1].k->k.p) < 0)
		bch2_cut_back(n.iter->pos, i[-1].k);

	while (i < trans->updates + trans->nr_updates &&
	       iter->btree_id == i->iter->btree_id &&
	       bkey_cmp(n.k->k.p, i->k->k.p) >= 0)
		array_remove_item(trans->updates, trans->nr_updates,
				  i - trans->updates);

	if (i < trans->updates + trans->nr_updates &&
	    iter->btree_id == i->iter->btree_id &&
	    bkey_cmp(n.k->k.p, i->iter->pos) > 0) {
		/*
		 * When we have an extent that overwrites the start of another
		 * update, trimming that extent will mean the iterator's
		 * position has to change since the iterator position has to
		 * match the extent's start pos - but we don't want to change
		 * the iterator pos if some other code is using it, so we may
		 * need to clone it:
		 */
		if (trans->iters_live & (1ULL << i->iter->idx)) {
			i->iter = bch2_trans_copy_iter(trans, i->iter);
			if (IS_ERR(i->iter)) {
				trans->need_reset = true;
				return PTR_ERR(i->iter);
			}

			i->iter->flags |= BTREE_ITER_KEEP_UNTIL_COMMIT;
			bch2_trans_iter_put(trans, i->iter);
		}

		bch2_cut_front(n.k->k.p, i->k);
		bch2_btree_iter_set_pos(i->iter, n.k->k.p);
	}

	EBUG_ON(trans->nr_updates >= trans->nr_iters);

	array_insert_item(trans->updates, trans->nr_updates,
			  i - trans->updates, n);
	return 0;
}

static int __bch2_btree_insert(struct btree_trans *trans,
			       enum btree_id id, struct bkey_i *k)
{
	struct btree_iter *iter;

	iter = bch2_trans_get_iter(trans, id, bkey_start_pos(&k->k),
				   BTREE_ITER_INTENT);
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	bch2_trans_update(trans, iter, k, 0);
	return 0;
}

/**
 * bch2_btree_insert - insert a key into the given btree
 * @c:			pointer to struct bch_fs
 * @id:			btree to insert into
 * @k:			key to insert
 * @disk_res:		disk reservation
 * @journal_seq:	returns the journal sequence number of the commit, if non-NULL
 * @flags:		transaction commit flags
 */
int bch2_btree_insert(struct bch_fs *c, enum btree_id id,
		      struct bkey_i *k,
		      struct disk_reservation *disk_res,
		      u64 *journal_seq, int flags)
{
	return bch2_trans_do(c, disk_res, journal_seq, flags,
			     __bch2_btree_insert(&trans, id, k));
}

int bch2_btree_delete_at_range(struct btree_trans *trans,
			       struct btree_iter *iter,
			       struct bpos end,
			       u64 *journal_seq)
{
	struct bkey_s_c k;
	int ret = 0;
retry:
	while ((k = bch2_btree_iter_peek(iter)).k &&
	       !(ret = bkey_err(k)) &&
	       bkey_cmp(iter->pos, end) < 0) {
		struct bkey_i delete;

		bch2_trans_reset(trans, TRANS_RESET_MEM);

		bkey_init(&delete.k);

		/*
		 * For extents, iter.pos won't necessarily be the same as
		 * bkey_start_pos(k.k) (for non extents they always will be the
		 * same). It's important that we delete starting from iter.pos
		 * because the range we want to delete could start in the middle
		 * of k.
		 *
		 * (bch2_btree_iter_peek() does guarantee that iter.pos >=
		 * bkey_start_pos(k.k)).
		 */
		delete.k.p = iter->pos;

		if (iter->flags & BTREE_ITER_IS_EXTENTS) {
			unsigned max_sectors =
				KEY_SIZE_MAX & (~0 << trans->c->block_bits);

			/* create the biggest key we can */
			bch2_key_resize(&delete.k, max_sectors);
			bch2_cut_back(end, &delete);

			ret = bch2_extent_trim_atomic(&delete, iter);
			if (ret)
				break;
		}

		bch2_trans_update(trans, iter, &delete, 0);
		ret = bch2_trans_commit(trans, NULL, journal_seq,
					BTREE_INSERT_NOFAIL);
		if (ret)
			break;

		bch2_trans_cond_resched(trans);
	}

	if (ret == -EINTR) {
		ret = 0;
		goto retry;
	}

	return ret;

}

int bch2_btree_delete_at(struct btree_trans *trans,
			 struct btree_iter *iter, unsigned flags)
{
	struct bkey_i k;

	bkey_init(&k.k);
	k.k.p = iter->pos;

952
	bch2_trans_update(trans, iter, &k, 0);
953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970
	return bch2_trans_commit(trans, NULL, NULL,
				 BTREE_INSERT_NOFAIL|
				 BTREE_INSERT_USE_RESERVE|flags);
}

/*
 * bch2_btree_delete_range - delete everything within a given range
 *
 * Range is a half open interval - [start, end)
 */
int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
			    struct bpos start, struct bpos end,
			    u64 *journal_seq)
{
	struct btree_trans trans;
	struct btree_iter *iter;
	int ret = 0;

	/*
	 * XXX: whether we need mem/more iters depends on whether this btree id
	 * has triggers
	 */
	bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512);

	iter = bch2_trans_get_iter(&trans, id, start, BTREE_ITER_INTENT);

	ret = bch2_btree_delete_at_range(&trans, iter, end, journal_seq);
	ret = bch2_trans_exit(&trans) ?: ret;

	BUG_ON(ret == -EINTR);
	return ret;
}