From 7ef2a73a5881323d53453cc3be7261fe1a49af1d Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@gmail.com>
Date: Mon, 21 Jan 2019 15:32:13 -0500
Subject: [PATCH] bcachefs: Fix check for if extent update is allocating

sum_sector_overwrites() previously only checked whether the extent
being overwritten was fully allocated; it now compares the number of
allocated pointers in the existing extent against the number of dirty
pointers in the new one. Correspondingly, bch2_check_range_allocated()
takes the number of replicas to check for, and bch2_direct_write()
verifies that a range is allocated with the required number of replicas
before proceeding without a disk reservation.

This rides on a restructuring of filesystem usage accounting: struct
bch_fs_usage now ends in a flexible array with one counter per entry in
the in-memory replicas table, the usage percpu arrays are reallocated
and reindexed via replicas_table_update() when a new entry is added,
bch2_fs_usage_read() returns a heap allocated summary, and
bch2_fs_usage_apply() reports when disk usage increased without a
reservation so that bch2_mark_update() can log the offending insert.
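
The new in-memory layout, abridged from the buckets_types.h hunk:

	struct bch_fs_usage {
		/* summarized counters: */
		struct {
			u64	hidden;
			u64	data;
			u64	cached;
			u64	reserved;
			u64	nr_inodes;
			...
		} s;

		u64	persistent_reserved[BCH_REPLICAS_MAX];
		u64	data[];	/* one slot per c->replicas entry */
	};
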
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
---
 fs/bcachefs/bcachefs.h              |   5 +-
 fs/bcachefs/btree_gc.c              |  82 ++++----
 fs/bcachefs/btree_update_interior.c |  29 +--
 fs/bcachefs/buckets.c               | 228 ++++++++++++++--------
 fs/bcachefs/buckets.h               |  17 +-
 fs/bcachefs/buckets_types.h         |  14 +-
 fs/bcachefs/chardev.c               |  20 +-
 fs/bcachefs/ec_types.h              |   7 +
 fs/bcachefs/extents.c               |  32 +++-
 fs/bcachefs/extents.h               |   3 +-
 fs/bcachefs/eytzinger.h             |  26 +--
 fs/bcachefs/fs-io.c                 |  19 +-
 fs/bcachefs/journal_io.c            |  20 +-
 fs/bcachefs/journal_reclaim.c       |   8 +-
 fs/bcachefs/migrate.c               |  11 ++
 fs/bcachefs/move.c                  |  11 ++
 fs/bcachefs/replicas.c              | 280 +++++++++++++++++++---------
 fs/bcachefs/replicas.h              |  35 +++-
 fs/bcachefs/super.c                 |   9 +-
 fs/bcachefs/sysfs.c                 |  38 ++--
 fs/bcachefs/util.c                  |  25 +++
 fs/bcachefs/util.h                  |   2 +
 22 files changed, 622 insertions(+), 299 deletions(-)
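
A sketch (commentary, not part of the commit) of how marking paths
account sectors against a replicas entry under the new scheme - the
device indices and sector count here are made up:

	struct bch_replicas_padded r;
	int idx;

	r.e.data_type	= BCH_DATA_USER;
	r.e.nr_required	= 1;
	r.e.nr_devs	= 2;
	r.e.devs[0]	= 0;	/* hypothetical devices */
	r.e.devs[1]	= 1;

	/* eytzinger search of the in-memory table, c->replicas: */
	idx = bch2_replicas_entry_idx(c, &r.e);
	if (idx >= 0)
		fs_usage->data[idx] += sectors;

update_replicas() is the real version of the last two lines; it also
bumps fs_usage->s.data or s.cached, and BUG()s if the entry is missing.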

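bch2_fs_usage_read() now returns with c->mark_lock held for read on
success (NULL is returned, without the lock, on allocation failure), so
that c->replicas.nr and hence the size of the data[] array stay stable
while the caller reads it. The calling convention, per the chardev.c
and sysfs.c hunks below:

	struct bch_fs_usage *u = bch2_fs_usage_read(c);

	if (!u)
		return -ENOMEM;

	/* ... read u->s.*, u->persistent_reserved[], u->data[] ... */

	percpu_up_read(&c->mark_lock);
	kfree(u);
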
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index a5203fbc089e..17eb0dd657a8 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -624,10 +624,11 @@ struct bch_fs {
 
 	struct bch_fs_pcpu __percpu	*pcpu;
 
-	struct bch_fs_usage __percpu	*usage[2];
-
 	struct percpu_rw_semaphore	mark_lock;
 
+	struct bch_fs_usage __percpu	*usage[2];
+	struct bch_fs_usage __percpu	*usage_scratch;
+
 	/*
 	 * When we invalidate buckets, we use both the priority and the amount
 	 * of good data to determine which buckets to reuse first - to weight
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 466469a0d852..a725a106f6dc 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -478,33 +478,12 @@ static void bch2_gc_free(struct bch_fs *c)
 		ca->usage[1] = NULL;
 	}
 
+	percpu_down_write(&c->mark_lock);
+
 	free_percpu(c->usage[1]);
 	c->usage[1] = NULL;
-}
-
-/*
- * Accumulate percpu counters onto one cpu's copy - only valid when access
- * against any percpu counter is guarded against
- */
-static u64 *acc_percpu_u64s(u64 __percpu *p, unsigned nr)
-{
-	u64 *ret;
-	int cpu;
-
-	preempt_disable();
-	ret = this_cpu_ptr(p);
-	preempt_enable();
-
-	for_each_possible_cpu(cpu) {
-		u64 *i = per_cpu_ptr(p, cpu);
 
-		if (i != ret) {
-			acc_u64s(ret, i, nr);
-			memset(i, 0, nr * sizeof(u64));
-		}
-	}
-
-	return ret;
+	percpu_up_write(&c->mark_lock);
 }
 
 static void bch2_gc_done_nocheck(struct bch_fs *c)
@@ -542,24 +521,25 @@ static void bch2_gc_done_nocheck(struct bch_fs *c)
 	for_each_member_device(ca, c, i) {
 		unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
 		struct bch_dev_usage *dst = (void *)
-			acc_percpu_u64s((void *) ca->usage[0], nr);
+			bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
 		struct bch_dev_usage *src = (void *)
-			acc_percpu_u64s((void *) ca->usage[1], nr);
+			bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
 
 		*dst = *src;
 	}
 
 	{
-		unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64);
+		unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
+			c->replicas.nr;
 		struct bch_fs_usage *dst = (void *)
-			acc_percpu_u64s((void *) c->usage[0], nr);
+			bch2_acc_percpu_u64s((void *) c->usage[0], nr);
 		struct bch_fs_usage *src = (void *)
-			acc_percpu_u64s((void *) c->usage[1], nr);
+			bch2_acc_percpu_u64s((void *) c->usage[1], nr);
 		unsigned offset = offsetof(typeof(*dst), s.gc_start);
 
 		memcpy((void *) dst + offset,
 		       (void *) src + offset,
-		       sizeof(*dst) - offset);
+		       nr * sizeof(u64) - offset);
 	}
 }
 
@@ -655,9 +635,9 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
 	for_each_member_device(ca, c, i) {
 		unsigned nr = sizeof(struct bch_dev_usage) / sizeof(u64);
 		struct bch_dev_usage *dst = (void *)
-			acc_percpu_u64s((void *) ca->usage[0], nr);
+			bch2_acc_percpu_u64s((void *) ca->usage[0], nr);
 		struct bch_dev_usage *src = (void *)
-			acc_percpu_u64s((void *) ca->usage[1], nr);
+			bch2_acc_percpu_u64s((void *) ca->usage[1], nr);
 		unsigned b;
 
 		for (b = 0; b < BCH_DATA_NR; b++)
@@ -674,12 +654,12 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
 	}
 
 	{
-		unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64);
+		unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
+			c->replicas.nr;
 		struct bch_fs_usage *dst = (void *)
-			acc_percpu_u64s((void *) c->usage[0], nr);
+			bch2_acc_percpu_u64s((void *) c->usage[0], nr);
 		struct bch_fs_usage *src = (void *)
-			acc_percpu_u64s((void *) c->usage[1], nr);
-		unsigned r, b;
+			bch2_acc_percpu_u64s((void *) c->usage[1], nr);
 
 		copy_fs_field(s.hidden,		"hidden");
 		copy_fs_field(s.data,		"data");
@@ -687,20 +667,16 @@ static void bch2_gc_done(struct bch_fs *c, bool initial)
 		copy_fs_field(s.reserved,	"reserved");
 		copy_fs_field(s.nr_inodes,	"nr_inodes");
 
-		for (r = 0; r < BCH_REPLICAS_MAX; r++) {
-			for (b = 0; b < BCH_DATA_NR; b++)
-				copy_fs_field(replicas[r].data[b],
-					      "replicas[%i].data[%s]",
-					      r, bch2_data_types[b]);
-			copy_fs_field(replicas[r].ec_data,
-				      "replicas[%i].ec_data", r);
-			copy_fs_field(replicas[r].persistent_reserved,
-				      "replicas[%i].persistent_reserved", r);
-		}
+		for (i = 0; i < BCH_REPLICAS_MAX; i++)
+			copy_fs_field(persistent_reserved[i],
+				      "persistent_reserved[%i]", i);
 
-		for (b = 0; b < BCH_DATA_NR; b++)
-			copy_fs_field(buckets[b],
-				      "buckets[%s]", bch2_data_types[b]);
+		for (i = 0; i < c->replicas.nr; i++) {
+			/*
+			 * XXX: print out replicas entry
+			 */
+			copy_fs_field(data[i], "data[%i]", i);
+		}
 	}
 out:
 	percpu_up_write(&c->mark_lock);
@@ -723,9 +699,15 @@ static int bch2_gc_start(struct bch_fs *c)
 	 */
 	gc_pos_set(c, gc_phase(GC_PHASE_START));
 
+	percpu_down_write(&c->mark_lock);
 	BUG_ON(c->usage[1]);
 
-	c->usage[1] = alloc_percpu(struct bch_fs_usage);
+	c->usage[1] = __alloc_percpu_gfp(sizeof(struct bch_fs_usage) +
+					 sizeof(u64) * c->replicas.nr,
+					 sizeof(u64),
+					 GFP_KERNEL);
+	percpu_up_write(&c->mark_lock);
+
 	if (!c->usage[1])
 		return -ENOMEM;
 
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 2efe191cdc30..d55778696bcd 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -1070,25 +1070,28 @@ static void bch2_btree_set_root_inmem(struct btree_update *as, struct btree *b)
 {
 	struct bch_fs *c = as->c;
 	struct btree *old = btree_node_root(c, b);
-	struct bch_fs_usage stats = { 0 };
+	struct bch_fs_usage *fs_usage;
 
 	__bch2_btree_set_root_inmem(c, b);
 
 	mutex_lock(&c->btree_interior_update_lock);
 	percpu_down_read(&c->mark_lock);
+	preempt_disable();
+	fs_usage = bch2_fs_usage_get_scratch(c);
 
 	bch2_mark_key_locked(c, bkey_i_to_s_c(&b->key),
 		      true, 0,
 		      gc_pos_btree_root(b->btree_id),
-		      &stats, 0, 0);
+		      fs_usage, 0, 0);
 
 	if (old && !btree_node_fake(old))
 		bch2_btree_node_free_index(as, NULL,
 					   bkey_i_to_s_c(&old->key),
-					   &stats);
-	bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
+					   fs_usage);
+	bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
 			    gc_pos_btree_root(b->btree_id));
 
+	preempt_enable();
 	percpu_up_read(&c->mark_lock);
 	mutex_unlock(&c->btree_interior_update_lock);
 }
@@ -1161,7 +1164,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
 					struct btree_node_iter *node_iter)
 {
 	struct bch_fs *c = as->c;
-	struct bch_fs_usage stats = { 0 };
+	struct bch_fs_usage *fs_usage;
 	struct bkey_packed *k;
 	struct bkey tmp;
 
@@ -1169,10 +1172,11 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
 
 	mutex_lock(&c->btree_interior_update_lock);
 	percpu_down_read(&c->mark_lock);
+	fs_usage = bch2_fs_usage_get_scratch(c);
 
 	bch2_mark_key_locked(c, bkey_i_to_s_c(insert),
 			     true, 0,
-			     gc_pos_btree_node(b), &stats, 0, 0);
+			     gc_pos_btree_node(b), fs_usage, 0, 0);
 
 	while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) &&
 	       bkey_iter_pos_cmp(b, &insert->k.p, k) > 0)
@@ -1185,9 +1189,9 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, struct btree *b
 	if (k && !bkey_cmp_packed(b, k, &insert->k))
 		bch2_btree_node_free_index(as, b,
 					   bkey_disassemble(b, k, &tmp),
-					   &stats);
+					   fs_usage);
 
-	bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
+	bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
 			    gc_pos_btree_node(b));
 
 	percpu_up_read(&c->mark_lock);
@@ -1971,7 +1975,7 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
 			bkey_copy(&b->key, &new_key->k_i);
 		}
 	} else {
-		struct bch_fs_usage stats = { 0 };
+		struct bch_fs_usage *fs_usage;
 
 		BUG_ON(btree_node_root(c, b) != b);
 
@@ -1979,15 +1983,16 @@ static void __bch2_btree_node_update_key(struct bch_fs *c,
 
 		mutex_lock(&c->btree_interior_update_lock);
 		percpu_down_read(&c->mark_lock);
+		fs_usage = bch2_fs_usage_get_scratch(c);
 
 		bch2_mark_key_locked(c, bkey_i_to_s_c(&new_key->k_i),
 			      true, 0,
 			      gc_pos_btree_root(b->btree_id),
-			      &stats, 0, 0);
+			      fs_usage, 0, 0);
 		bch2_btree_node_free_index(as, NULL,
 					   bkey_i_to_s_c(&b->key),
-					   &stats);
-		bch2_fs_usage_apply(c, &stats, &as->reserve->disk_res,
+					   fs_usage);
+		bch2_fs_usage_apply(c, fs_usage, &as->reserve->disk_res,
 				    gc_pos_btree_root(b->btree_id));
 
 		percpu_up_read(&c->mark_lock);
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 34e5f81b2b5e..cbebc712a1da 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -72,12 +72,11 @@
 #include "ec.h"
 #include "error.h"
 #include "movinggc.h"
+#include "replicas.h"
 #include "trace.h"
 
 #include <linux/preempt.h>
 
-static inline u64 __bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
-
 /*
  * Clear journal_seq_valid for buckets for which it's not needed, to prevent
  * wraparound:
@@ -132,9 +131,29 @@ struct bch_dev_usage bch2_dev_usage_read(struct bch_fs *c, struct bch_dev *ca)
 	return bch2_usage_read_raw(ca->usage[0]);
 }
 
-struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *c)
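+/*
+ * Returns a kzalloc'd summary with c->mark_lock held for read on success;
+ * NULL, without the lock, on allocation failure. The caller drops the lock
+ * and kfrees the result.
+ */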
+struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *c)
 {
-	return bch2_usage_read_raw(c->usage[0]);
+	struct bch_fs_usage *ret;
+	unsigned nr = READ_ONCE(c->replicas.nr);
+retry:
+	ret = kzalloc(sizeof(*ret) + nr * sizeof(u64), GFP_NOFS);
+	if (unlikely(!ret))
+		return NULL;
+
+	percpu_down_read(&c->mark_lock);
+
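+	/* the replicas table grew since nr was read unlocked: realloc */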
+	if (unlikely(nr < c->replicas.nr)) {
+		nr = c->replicas.nr;
+		percpu_up_read(&c->mark_lock);
+		kfree(ret);
+		goto retry;
+	}
+
+	acc_u64s_percpu((u64 *) ret,
+			(u64 __percpu *) c->usage[0],
+			sizeof(*ret) / sizeof(u64) + nr);
+
+	return ret;
 }
 
 #define RESERVE_FACTOR	6
@@ -149,17 +168,13 @@ static u64 avail_factor(u64 r)
 	return (r << RESERVE_FACTOR) / ((1 << RESERVE_FACTOR) + 1);
 }
 
-static inline u64 __bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage)
-{
-	return fs_usage.s.hidden +
-		fs_usage.s.data +
-		reserve_factor(fs_usage.s.reserved +
-			       fs_usage.s.online_reserved);
-}
-
 u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage fs_usage)
 {
-	return min(c->capacity, __bch2_fs_sectors_used(c, fs_usage));
+	return min(fs_usage.s.hidden +
+		   fs_usage.s.data +
+		   reserve_factor(fs_usage.s.reserved +
+				  fs_usage.s.online_reserved),
+		   c->capacity);
 }
 
 struct bch_fs_usage_short
@@ -208,13 +223,14 @@ static bool bucket_became_unavailable(struct bucket_mark old,
 	       !is_available_bucket(new);
 }
 
-void bch2_fs_usage_apply(struct bch_fs *c,
-			 struct bch_fs_usage *fs_usage,
-			 struct disk_reservation *disk_res,
-			 struct gc_pos gc_pos)
+int bch2_fs_usage_apply(struct bch_fs *c,
+			struct bch_fs_usage *fs_usage,
+			struct disk_reservation *disk_res,
+			struct gc_pos gc_pos)
 {
 	s64 added = fs_usage->s.data + fs_usage->s.reserved;
 	s64 should_not_have_added;
+	int ret = 0;
 
 	percpu_rwsem_assert_held(&c->mark_lock);
 
@@ -227,6 +243,7 @@ void bch2_fs_usage_apply(struct bch_fs *c,
 		      "disk usage increased without a reservation")) {
 		atomic64_sub(should_not_have_added, &c->sectors_available);
 		added -= should_not_have_added;
+		ret = -1;
 	}
 
 	if (added > 0) {
@@ -237,17 +254,17 @@ void bch2_fs_usage_apply(struct bch_fs *c,
 	preempt_disable();
 	acc_u64s((u64 *) this_cpu_ptr(c->usage[0]),
 		 (u64 *) fs_usage,
-		 sizeof(*fs_usage) / sizeof(u64));
+		 sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
 
 	if (gc_visited(c, gc_pos)) {
 		BUG_ON(!c->usage[1]);
 		acc_u64s((u64 *) this_cpu_ptr(c->usage[1]),
 			 (u64 *) fs_usage,
-			 sizeof(*fs_usage) / sizeof(u64));
+			 sizeof(*fs_usage) / sizeof(u64) + c->replicas.nr);
 	}
 	preempt_enable();
 
-	memset(fs_usage, 0, sizeof(*fs_usage));
+	return ret;
 }
 
 static inline void account_bucket(struct bch_fs_usage *fs_usage,
@@ -258,7 +275,6 @@ static inline void account_bucket(struct bch_fs_usage *fs_usage,
 	if (type == BCH_DATA_SB || type == BCH_DATA_JOURNAL)
 		fs_usage->s.hidden	+= size;
 
-	fs_usage->buckets[type]		+= size;
 	dev_usage->buckets[type]	+= nr;
 }
 
@@ -332,6 +348,34 @@ void bch2_dev_usage_from_buckets(struct bch_fs *c, struct bch_dev *ca)
 	_old;							\
 })
 
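+/*
+ * Account sectors against the usage slot for a replicas entry that must
+ * already be in the in-memory table:
+ */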
+static inline void update_replicas(struct bch_fs *c,
+				   struct bch_fs_usage *fs_usage,
+				   struct bch_replicas_entry *r,
+				   s64 sectors)
+{
+	int idx = bch2_replicas_entry_idx(c, r);
+
+	BUG_ON(idx < 0);
+	BUG_ON(!sectors);
+
+	if (r->data_type == BCH_DATA_CACHED)
+		fs_usage->s.cached	+= sectors;
+	else
+		fs_usage->s.data	+= sectors;
+	fs_usage->data[idx]		+= sectors;
+}
+
+static inline void update_cached_sectors(struct bch_fs *c,
+					 struct bch_fs_usage *fs_usage,
+					 unsigned dev, s64 sectors)
+{
+	struct bch_replicas_padded r;
+
+	bch2_replicas_entry_cached(&r.e, dev);
+
+	update_replicas(c, fs_usage, &r.e, sectors);
+}
+
 static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
 				     size_t b, struct bucket_mark *old,
 				     bool gc)
@@ -350,8 +394,9 @@ static void __bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
 		new.gen++;
 	}));
 
-	fs_usage->replicas[0].data[BCH_DATA_CACHED]	-= old->cached_sectors;
-	fs_usage->s.cached				-= old->cached_sectors;
+	if (old->cached_sectors)
+		update_cached_sectors(c, fs_usage, ca->dev_idx,
+				      -old->cached_sectors);
 }
 
 void bch2_invalidate_bucket(struct bch_fs *c, struct bch_dev *ca,
@@ -418,11 +463,6 @@ static void __bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
 		new.data_type	= type;
 		checked_add(new.dirty_sectors, sectors);
 	}));
-
-	if (type == BCH_DATA_BTREE ||
-	    type == BCH_DATA_USER)
-		fs_usage->s.data		+= sectors;
-	fs_usage->replicas[0].data[type]	+= sectors;
 }
 
 void bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
@@ -550,9 +590,9 @@ static void bch2_mark_pointer(struct bch_fs *c,
 
 static int bch2_mark_stripe_ptr(struct bch_fs *c,
 				struct bch_extent_stripe_ptr p,
+				enum bch_data_type data_type,
+				struct bch_fs_usage *fs_usage,
 				s64 sectors, unsigned flags,
-				s64 *adjusted_disk_sectors,
-				unsigned *redundancy,
 				bool gc)
 {
 	struct stripe *m;
@@ -568,16 +608,15 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
 		return -1;
 	}
 
+	BUG_ON(m->r.e.data_type != data_type);
+
 	nr_data = m->nr_blocks - m->nr_redundant;
 
 	parity_sectors = DIV_ROUND_UP(abs(sectors) * m->nr_redundant, nr_data);
 
 	if (sectors < 0)
 		parity_sectors = -parity_sectors;
-
-	*adjusted_disk_sectors += parity_sectors;
-
-	*redundancy = max_t(unsigned, *redundancy, m->nr_redundant + 1);
+	sectors += parity_sectors;
 
 	new = atomic_add_return(sectors, &m->block_sectors[p.block]);
 	old = new - sectors;
@@ -593,6 +632,8 @@ static int bch2_mark_stripe_ptr(struct bch_fs *c,
 	if (!gc)
 		bch2_stripes_heap_update(c, m, p.idx);
 
+	update_replicas(c, fs_usage, &m->r.e, sectors);
+
 	return 0;
 }
 
@@ -605,58 +646,46 @@ static int bch2_mark_extent(struct bch_fs *c, struct bkey_s_c k,
 	struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
 	const union bch_extent_entry *entry;
 	struct extent_ptr_decoded p;
-	s64 cached_sectors	= 0;
-	s64 dirty_sectors	= 0;
-	s64 ec_sectors		= 0;
-	unsigned replicas	= 0;
-	unsigned ec_redundancy	= 0;
+	struct bch_replicas_padded r;
+	s64 dirty_sectors = 0;
 	unsigned i;
 	int ret;
 
+	r.e.data_type	= data_type;
+	r.e.nr_devs	= 0;
+	r.e.nr_required	= 1;
+
 	BUG_ON(!sectors);
 
 	bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
 		s64 disk_sectors = data_type == BCH_DATA_BTREE
 			? sectors
 			: ptr_disk_sectors_delta(p, sectors);
-		s64 adjusted_disk_sectors = disk_sectors;
 
 		bch2_mark_pointer(c, p, disk_sectors, data_type,
 				  fs_usage, journal_seq, flags, gc);
 
-		if (!p.ptr.cached)
+		if (p.ptr.cached) {
+			update_cached_sectors(c, fs_usage, p.ptr.dev,
+					      disk_sectors);
+		} else if (!p.ec_nr) {
+			dirty_sectors	       += disk_sectors;
+			r.e.devs[r.e.nr_devs++]	= p.ptr.dev;
+		} else {
 			for (i = 0; i < p.ec_nr; i++) {
 				ret = bch2_mark_stripe_ptr(c, p.ec[i],
-						disk_sectors, flags,
-						&adjusted_disk_sectors,
-						&ec_redundancy, gc);
+						data_type, fs_usage,
+						disk_sectors, flags, gc);
 				if (ret)
 					return ret;
 			}
-		if (!p.ptr.cached)
-			replicas++;
 
-		if (p.ptr.cached)
-			cached_sectors	+= adjusted_disk_sectors;
-		else if (!p.ec_nr)
-			dirty_sectors	+= adjusted_disk_sectors;
-		else
-			ec_sectors	+= adjusted_disk_sectors;
+			r.e.nr_required = 0;
+		}
 	}
 
-	replicas	= clamp_t(unsigned,	replicas,
-				  1, ARRAY_SIZE(fs_usage->replicas));
-	ec_redundancy	= clamp_t(unsigned,	ec_redundancy,
-				  1, ARRAY_SIZE(fs_usage->replicas));
-
-	fs_usage->s.cached					+= cached_sectors;
-	fs_usage->replicas[0].data[BCH_DATA_CACHED]		+= cached_sectors;
-
-	fs_usage->s.data					+= dirty_sectors;
-	fs_usage->replicas[replicas - 1].data[data_type]	+= dirty_sectors;
-
-	fs_usage->s.data					+= ec_sectors;
-	fs_usage->replicas[ec_redundancy - 1].ec_data		+= ec_sectors;
+	if (dirty_sectors)
+		update_replicas(c, fs_usage, &r.e, dirty_sectors);
 
 	return 0;
 }
@@ -724,8 +753,24 @@ static int bch2_mark_stripe(struct bch_fs *c, struct bkey_s_c k,
 		m->algorithm	= s.v->algorithm;
 		m->nr_blocks	= s.v->nr_blocks;
 		m->nr_redundant	= s.v->nr_redundant;
+
+		memset(&m->r, 0, sizeof(m->r));
+
+		m->r.e.data_type	= BCH_DATA_USER;
+		m->r.e.nr_devs		= s.v->nr_blocks;
+		m->r.e.nr_required	= s.v->nr_blocks - s.v->nr_redundant;
+
+		for (i = 0; i < s.v->nr_blocks; i++)
+			m->r.e.devs[i] = s.v->ptrs[i].dev;
 	}
 
+	/*
+	 * XXX: account for stripes somehow here
+	 */
+#if 0
+	update_replicas(c, fs_usage, &m->r.e, stripe_sectors);
+#endif
+
 	if (!gc) {
 		if (inserting)
 			bch2_stripes_heap_insert(c, m, idx);
@@ -773,11 +818,11 @@ static int __bch2_mark_key(struct bch_fs *c, struct bkey_s_c k,
 		unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
 
 		sectors *= replicas;
-		replicas = clamp_t(unsigned, replicas,
-				   1, ARRAY_SIZE(fs_usage->replicas));
+		replicas = clamp_t(unsigned, replicas, 1,
+				   ARRAY_SIZE(fs_usage->persistent_reserved));
 
-		fs_usage->s.reserved					+= sectors;
-		fs_usage->replicas[replicas - 1].persistent_reserved	+= sectors;
+		fs_usage->s.reserved				+= sectors;
+		fs_usage->persistent_reserved[replicas - 1]	+= sectors;
 		break;
 	}
 	default:
@@ -839,20 +884,24 @@ void bch2_mark_update(struct btree_insert *trans,
 	struct btree_iter	*iter = insert->iter;
 	struct btree		*b = iter->l[0].b;
 	struct btree_node_iter	node_iter = iter->l[0].iter;
-	struct bch_fs_usage	fs_usage = { 0 };
+	struct bch_fs_usage	*fs_usage;
 	struct gc_pos		pos = gc_pos_btree_node(b);
 	struct bkey_packed	*_k;
+	u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
+	static int warned_disk_usage = 0;
 
 	if (!btree_node_type_needs_gc(iter->btree_id))
 		return;
 
 	percpu_down_read(&c->mark_lock);
+	preempt_disable();
+	fs_usage = bch2_fs_usage_get_scratch(c);
 
 	if (!(trans->flags & BTREE_INSERT_JOURNAL_REPLAY))
 		bch2_mark_key_locked(c, bkey_i_to_s_c(insert->k), true,
 			bpos_min(insert->k->k.p, b->key.k.p).offset -
 			bkey_start_offset(&insert->k->k),
-			pos, &fs_usage, trans->journal_res.seq, 0);
+			pos, fs_usage, trans->journal_res.seq, 0);
 
 	while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
 						      KEY_TYPE_discard))) {
@@ -885,7 +934,7 @@ void bch2_mark_update(struct btree_insert *trans,
 				BUG_ON(sectors <= 0);
 
 				bch2_mark_key_locked(c, k, true, sectors,
-					pos, &fs_usage, trans->journal_res.seq, 0);
+					pos, fs_usage, trans->journal_res.seq, 0);
 
 				sectors = bkey_start_offset(&insert->k->k) -
 					k.k->p.offset;
@@ -896,13 +945,44 @@ void bch2_mark_update(struct btree_insert *trans,
 		}
 
 		bch2_mark_key_locked(c, k, false, sectors,
-			pos, &fs_usage, trans->journal_res.seq, 0);
+			pos, fs_usage, trans->journal_res.seq, 0);
 
 		bch2_btree_node_iter_advance(&node_iter, b);
 	}
 
-	bch2_fs_usage_apply(c, &fs_usage, trans->disk_res, pos);
+	if (bch2_fs_usage_apply(c, fs_usage, trans->disk_res, pos) &&
+	    !warned_disk_usage &&
+	    !xchg(&warned_disk_usage, 1)) {
+		char buf[200];
+
+		pr_err("disk usage increased more than %llu sectors reserved", disk_res_sectors);
+
+		pr_err("while inserting");
+		bch2_bkey_val_to_text(&PBUF(buf), c, bkey_i_to_s_c(insert->k));
+		pr_err("%s", buf);
+		pr_err("overlapping with");
+
+		node_iter = iter->l[0].iter;
+		while ((_k = bch2_btree_node_iter_peek_filter(&node_iter, b,
+							      KEY_TYPE_discard))) {
+			struct bkey		unpacked;
+			struct bkey_s_c		k;
+
+			k = bkey_disassemble(b, _k, &unpacked);
 
+			if (btree_node_is_extents(b)
+			    ? bkey_cmp(insert->k->k.p, bkey_start_pos(k.k)) <= 0
+			    : bkey_cmp(insert->k->k.p, k.k->p))
+				break;
+
+			bch2_bkey_val_to_text(&PBUF(buf), c, k);
+			pr_err("%s", buf);
+
+			bch2_btree_node_iter_advance(&node_iter, b);
+		}
+	}
+
+	preempt_enable();
 	percpu_up_read(&c->mark_lock);
 }
 
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 88e083325232..107cb48e3929 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -218,7 +218,18 @@ static inline u64 dev_buckets_free(struct bch_fs *c, struct bch_dev *ca)
 
 /* Filesystem usage: */
 
-struct bch_fs_usage bch2_fs_usage_read(struct bch_fs *);
+static inline struct bch_fs_usage *bch2_fs_usage_get_scratch(struct bch_fs *c)
+{
+	struct bch_fs_usage *ret;
+
+	ret = this_cpu_ptr(c->usage_scratch);
+
+	memset(ret, 0, sizeof(*ret) + c->replicas.nr * sizeof(u64));
+
+	return ret;
+}
+
+struct bch_fs_usage *bch2_fs_usage_read(struct bch_fs *);
 
 u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage);
 
@@ -254,8 +265,8 @@ int bch2_mark_key(struct bch_fs *, struct bkey_s_c,
 		  bool, s64, struct gc_pos,
 		  struct bch_fs_usage *, u64, unsigned);
 void bch2_mark_update(struct btree_insert *, struct btree_insert_entry *);
-void bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
-			 struct disk_reservation *, struct gc_pos);
+int bch2_fs_usage_apply(struct bch_fs *, struct bch_fs_usage *,
+			struct disk_reservation *, struct gc_pos);
 
 /* disk reservations: */
 
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 196f07f41728..65b4bb39f88e 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -75,16 +75,18 @@ struct bch_fs_usage {
 		u64		cached;
 		u64		reserved;
 		u64		nr_inodes;
+
+		/* XXX: add stats for compression ratio */
+#if 0
+		u64		uncompressed;
+		u64		compressed;
+#endif
 	} s;
 
 	/* broken out: */
-	struct {
-		u64		data[BCH_DATA_NR];
-		u64		ec_data;
-		u64		persistent_reserved;
-	}			replicas[BCH_REPLICAS_MAX];
 
-	u64			buckets[BCH_DATA_NR];
+	u64			persistent_reserved[BCH_REPLICAS_MAX];
+	u64			data[];
 };
 
 struct bch_fs_usage_short {
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 7f79f020d904..f090b61f23f1 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -394,21 +394,31 @@ static long bch2_ioctl_usage(struct bch_fs *c,
 	}
 
 	{
-		struct bch_fs_usage src = bch2_fs_usage_read(c);
+		struct bch_fs_usage *src;
 		struct bch_ioctl_fs_usage dst = {
 			.capacity		= c->capacity,
-			.used			= bch2_fs_sectors_used(c, src),
-			.online_reserved	= src.s.online_reserved,
 		};
 
+		src = bch2_fs_usage_read(c);
+		if (!src)
+			return -ENOMEM;
+
+		percpu_up_read(&c->mark_lock);
+
+		dst.used		= bch2_fs_sectors_used(c, *src);
+		dst.online_reserved	= src->s.online_reserved;
+
 		for (i = 0; i < BCH_REPLICAS_MAX; i++) {
 			dst.persistent_reserved[i] =
-				src.replicas[i].persistent_reserved;
-
+				src->persistent_reserved[i];
+#if 0
 			for (j = 0; j < BCH_DATA_NR; j++)
 				dst.sectors[j][i] = src.replicas[i].data[j];
+#endif
 		}
 
+		kfree(src);
+
 		ret = copy_to_user(&user_arg->fs, &dst, sizeof(dst));
 		if (ret)
 			return ret;
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
index a3216ca01913..e416dac7ee19 100644
--- a/fs/bcachefs/ec_types.h
+++ b/fs/bcachefs/ec_types.h
@@ -6,6 +6,11 @@
 
 #define EC_STRIPE_MAX	16
 
+struct bch_replicas_padded {
+	struct bch_replicas_entry	e;
+	u8				pad[EC_STRIPE_MAX];
+};
+
 struct stripe {
 	size_t			heap_idx;
 
@@ -18,6 +23,8 @@ struct stripe {
 	u8			alive;
 	atomic_t		blocks_nonempty;
 	atomic_t		block_sectors[EC_STRIPE_MAX];
+
+	struct bch_replicas_padded r;
 };
 
 struct ec_stripe_heap_entry {
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 67f6250ef91a..1d96a1773f74 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -1669,12 +1669,13 @@ static bool bch2_extent_merge_inline(struct bch_fs *c,
 	return ret == BCH_MERGE_MERGE;
 }
 
-int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
+bool bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size,
+			       unsigned nr_replicas)
 {
 	struct btree_iter iter;
 	struct bpos end = pos;
 	struct bkey_s_c k;
-	int ret = 0;
+	bool ret = true;
 
 	end.offset += size;
 
@@ -1683,8 +1684,8 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
 		if (bkey_cmp(bkey_start_pos(k.k), end) >= 0)
 			break;
 
-		if (!bch2_extent_is_fully_allocated(k)) {
-			ret = -ENOSPC;
+		if (nr_replicas > bch2_bkey_nr_ptrs_allocated(k)) {
+			ret = false;
 			break;
 		}
 	}
@@ -1693,6 +1694,29 @@ int bch2_check_range_allocated(struct bch_fs *c, struct bpos pos, u64 size)
 	return ret;
 }
 
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k)
+{
+	unsigned ret = 0;
+
+	switch (k.k->type) {
+	case KEY_TYPE_extent: {
+		struct bkey_s_c_extent e = bkey_s_c_to_extent(k);
+		const union bch_extent_entry *entry;
+		struct extent_ptr_decoded p;
+
+		extent_for_each_ptr_decode(e, p, entry)
+			ret += !p.ptr.cached &&
+				p.crc.compression_type == BCH_COMPRESSION_NONE;
+		break;
+	}
+	case KEY_TYPE_reservation:
+		ret = bkey_s_c_to_reservation(k).v->nr_replicas;
+		break;
+	}
+
+	return ret;
+}
+
 /* KEY_TYPE_reservation: */
 
 const char *bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k)
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index 57eb35699545..17cae891bccb 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -572,6 +572,7 @@ static inline void extent_save(struct btree *b, struct bkey_packed *dst,
 		BUG_ON(!bch2_bkey_pack_key(dst, src, f));
 }
 
-int bch2_check_range_allocated(struct bch_fs *, struct bpos, u64);
+bool bch2_check_range_allocated(struct bch_fs *, struct bpos, u64, unsigned);
+unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c);
 
 #endif /* _BCACHEFS_EXTENTS_H */
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index 7cb4942cacf7..26d5cad7e6a5 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -263,18 +263,20 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
 	}
 }
 
-static inline size_t eytzinger0_find(void *base, size_t nr, size_t size,
-				     eytzinger_cmp_fn cmp, const void *search)
-{
-	size_t i = 0;
-	int res;
-
-	while (i < nr &&
-	       (res = cmp(search, base + i * size, size)))
-		i = eytzinger0_child(i, res > 0);
-
-	return i;
-}
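+/*
+ * a macro rather than a function so the comparison argument can itself be a
+ * macro capturing locals (see entry_cmp in replicas.c), which a function
+ * pointer couldn't express
+ */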
+#define eytzinger0_find(base, nr, size, _cmp, search)			\
+({									\
+	void *_base	= (base);					\
+	void *_search	= (search);					\
+	size_t _nr	= (nr);						\
+	size_t _size	= (size);					\
+	size_t _i	= 0;						\
+	int _res;							\
+									\
+	while (_i < _nr &&						\
+	       (_res = _cmp(_search, _base + _i * _size, _size)))	\
+		_i = eytzinger0_child(_i, _res > 0);			\
+	_i;								\
+})
 
 void eytzinger0_sort(void *, size_t, size_t,
 		    int (*cmp_func)(const void *, const void *, size_t),
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index a59fedcaed07..7681cfbc6bed 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -253,7 +253,9 @@ static s64 sum_sector_overwrites(struct bkey_i *new, struct btree_iter *_iter,
 		BUG_ON(btree_iter_err(old));
 
 		if (allocating &&
-		    !bch2_extent_is_fully_allocated(old))
+		    !*allocating &&
+		    bch2_bkey_nr_ptrs_allocated(old) <
+		    bch2_bkey_nr_dirty_ptrs(bkey_i_to_s_c(new)))
 			*allocating = true;
 
 		delta += (min(new->k.p.offset,
@@ -812,9 +814,7 @@ static void bch2_add_page_sectors(struct bio *bio, struct bkey_s_c k)
 {
 	struct bvec_iter iter;
 	struct bio_vec bv;
-	unsigned nr_ptrs = !bch2_extent_is_compressed(k)
-		? bch2_bkey_nr_dirty_ptrs(k)
-		: 0;
+	unsigned nr_ptrs = bch2_bkey_nr_ptrs_allocated(k);
 
 	bio_for_each_segment(bv, bio, iter) {
 		/* brand new pages, don't need to be locked: */
@@ -1930,19 +1930,20 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
 	if (unlikely(ret))
 		goto err;
 
+	dio->iop.op.nr_replicas	= dio->iop.op.opts.data_replicas;
+
 	ret = bch2_disk_reservation_get(c, &dio->iop.op.res, iter->count >> 9,
 					dio->iop.op.opts.data_replicas, 0);
 	if (unlikely(ret)) {
-		if (bch2_check_range_allocated(c, POS(inode->v.i_ino,
-						      req->ki_pos >> 9),
-					       iter->count >> 9))
+		if (!bch2_check_range_allocated(c, POS(inode->v.i_ino,
+						       req->ki_pos >> 9),
+						iter->count >> 9,
+						dio->iop.op.opts.data_replicas))
 			goto err;
 
 		dio->iop.unalloc = true;
 	}
 
-	dio->iop.op.nr_replicas	= dio->iop.op.res.nr_replicas;
-
 	return bch2_dio_write_loop(dio);
 err:
 	bch2_disk_reservation_put(c, &dio->iop.op.res);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 67ff2633ba16..9c794c9a1924 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -694,6 +694,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 	}
 
 	list_for_each_entry(i, list, list) {
+		struct bch_replicas_padded replicas;
+		char buf[80];
+
+		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, i->devs);
+
 		ret = jset_validate_entries(c, &i->j, READ);
 		if (ret)
 			goto fsck_err;
@@ -705,11 +710,11 @@ int bch2_journal_read(struct bch_fs *c, struct list_head *list)
 
 		if (!degraded &&
 		    (test_bit(BCH_FS_REBUILD_REPLICAS, &c->flags) ||
-		     fsck_err_on(!bch2_replicas_marked(c, BCH_DATA_JOURNAL,
-						       i->devs, false), c,
-				 "superblock not marked as containing replicas (type %u)",
-				 BCH_DATA_JOURNAL))) {
-			ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, i->devs);
+		     fsck_err_on(!bch2_replicas_marked(c, &replicas.e, false), c,
+				 "superblock not marked as containing replicas %s",
+				 (bch2_replicas_entry_to_text(&PBUF(buf),
+							      &replicas.e), buf)))) {
+			ret = bch2_mark_replicas(c, &replicas.e);
 			if (ret)
 				return ret;
 		}
@@ -1108,6 +1113,7 @@ static void journal_write_done(struct closure *cl)
 	struct journal_buf *w = journal_prev_buf(j);
 	struct bch_devs_list devs =
 		bch2_bkey_devs(bkey_i_to_s_c(&w->key));
+	struct bch_replicas_padded replicas;
 	u64 seq = le64_to_cpu(w->data->seq);
 	u64 last_seq = le64_to_cpu(w->data->last_seq);
 
@@ -1118,7 +1124,9 @@ static void journal_write_done(struct closure *cl)
 		goto err;
 	}
 
-	if (bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs))
+	bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL, devs);
+
+	if (bch2_mark_replicas(c, &replicas.e))
 		goto err;
 
 	spin_lock(&j->lock);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index f24546dbf3ed..98345dcd1e67 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -388,7 +388,6 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
 {
 	struct bch_fs *c = container_of(j, struct bch_fs, journal);
 	struct journal_entry_pin_list *p;
-	struct bch_devs_list devs;
 	u64 iter, seq = 0;
 	int ret = 0;
 
@@ -413,12 +412,15 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
 
 	spin_lock(&j->lock);
 	while (!ret && seq < j->pin.back) {
+		struct bch_replicas_padded replicas;
+
 		seq = max(seq, journal_last_seq(j));
-		devs = journal_seq_pin(j, seq)->devs;
+		bch2_devlist_to_replicas(&replicas.e, BCH_DATA_JOURNAL,
+					 journal_seq_pin(j, seq)->devs);
 		seq++;
 
 		spin_unlock(&j->lock);
-		ret = bch2_mark_replicas(c, BCH_DATA_JOURNAL, devs);
+		ret = bch2_mark_replicas(c, &replicas.e);
 		spin_lock(&j->lock);
 	}
 	spin_unlock(&j->lock);
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index 63fe8cbb0564..b97a5a8f3910 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -5,6 +5,7 @@
 
 #include "bcachefs.h"
 #include "btree_update.h"
+#include "btree_update_interior.h"
 #include "buckets.h"
 #include "extents.h"
 #include "io.h"
@@ -153,6 +154,16 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
 		bch2_btree_iter_unlock(&iter);
 	}
 
+	/* flush relevant btree updates */
+	while (1) {
+		closure_wait_event(&c->btree_interior_update_wait,
+				   !bch2_btree_interior_updates_nr_pending(c) ||
+				   c->btree_roots_dirty);
+		if (!bch2_btree_interior_updates_nr_pending(c))
+			break;
+		bch2_journal_meta(&c->journal);
+	}
+
 	ret = 0;
 out:
 	ret = bch2_replicas_gc_end(c, ret);
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 9081952316b0..5a35f76006cf 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -4,6 +4,7 @@
 #include "alloc_foreground.h"
 #include "btree_gc.h"
 #include "btree_update.h"
+#include "btree_update_interior.h"
 #include "buckets.h"
 #include "disk_groups.h"
 #include "inode.h"
@@ -762,6 +763,16 @@ int bch2_data_job(struct bch_fs *c,
 		ret = bch2_journal_flush_device_pins(&c->journal, -1);
 
 		ret = bch2_move_btree(c, rereplicate_pred, c, stats) ?: ret;
+
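+		/* flush relevant btree updates */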
+		while (1) {
+			closure_wait_event(&c->btree_interior_update_wait,
+					   !bch2_btree_interior_updates_nr_pending(c) ||
+					   c->btree_roots_dirty);
+			if (!bch2_btree_interior_updates_nr_pending(c))
+				break;
+			bch2_journal_meta(&c->journal);
+		}
+
 		ret = bch2_gc_btree_replicas(c) ?: ret;
 
 		ret = bch2_move_data(c, NULL,
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index b63da1bef760..34a5475cfaba 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -4,11 +4,6 @@
 #include "replicas.h"
 #include "super-io.h"
 
-struct bch_replicas_padded {
-	struct bch_replicas_entry	e;
-	u8				pad[BCH_SB_MEMBERS_MAX];
-};
-
 static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
 					    struct bch_replicas_cpu *);
 
@@ -19,6 +14,16 @@ static inline int u8_cmp(u8 l, u8 r)
 	return (l > r) - (l < r);
 }
 
+static void verify_replicas_entry_sorted(struct bch_replicas_entry *e)
+{
+#ifdef CONFIG_BCACHEFS_DEBUG
+	unsigned i;
+
+	for (i = 0; i + 1 < e->nr_devs; i++)
+		BUG_ON(e->devs[i] >= e->devs[i + 1]);
+#endif
+}
+
 static void replicas_entry_sort(struct bch_replicas_entry *e)
 {
 	bubble_sort(e->devs, e->nr_devs, u8_cmp);
@@ -29,19 +34,13 @@ static void replicas_entry_sort(struct bch_replicas_entry *e)
 	     (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\
 	     _i = (void *) (_i) + (_r)->entry_size)
 
-static inline struct bch_replicas_entry *
-cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
-{
-	return (void *) r->entries + r->entry_size * i;
-}
-
 static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
 {
 	eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
 }
 
-static void replicas_entry_to_text(struct printbuf *out,
-				  struct bch_replicas_entry *e)
+void bch2_replicas_entry_to_text(struct printbuf *out,
+				 struct bch_replicas_entry *e)
 {
 	unsigned i;
 
@@ -66,7 +65,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out,
 			pr_buf(out, " ");
 		first = false;
 
-		replicas_entry_to_text(out, e);
+		bch2_replicas_entry_to_text(out, e);
 	}
 }
 
@@ -106,8 +105,8 @@ static void stripe_to_replicas(struct bkey_s_c k,
 		r->devs[r->nr_devs++] = ptr->dev;
 }
 
-static void bkey_to_replicas(struct bkey_s_c k,
-			     struct bch_replicas_entry *e)
+static void bkey_to_replicas(struct bch_replicas_entry *e,
+			     struct bkey_s_c k)
 {
 	e->nr_devs = 0;
 
@@ -129,9 +128,9 @@ static void bkey_to_replicas(struct bkey_s_c k,
 	replicas_entry_sort(e);
 }
 
-static inline void devlist_to_replicas(struct bch_devs_list devs,
-				       enum bch_data_type data_type,
-				       struct bch_replicas_entry *e)
+void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
+			      enum bch_data_type data_type,
+			      struct bch_devs_list devs)
 {
 	unsigned i;
 
@@ -160,6 +159,9 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old,
 					replicas_entry_bytes(new_entry)),
 	};
 
+	BUG_ON(!new_entry->data_type);
+	verify_replicas_entry_sorted(new_entry);
+
 	new.entries = kcalloc(new.nr, new.entry_size, GFP_NOIO);
 	if (!new.entries)
 		return new;
@@ -177,21 +179,49 @@ cpu_replicas_add_entry(struct bch_replicas_cpu *old,
 	return new;
 }
 
+static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
+				       struct bch_replicas_entry *search)
+{
+	int idx, entry_size = replicas_entry_bytes(search);
+
+	if (unlikely(entry_size > r->entry_size))
+		return -1;
+
+	verify_replicas_entry_sorted(search);
+
+#define entry_cmp(_l, _r, size)	memcmp(_l, _r, entry_size)
+	idx = eytzinger0_find(r->entries, r->nr, r->entry_size,
+			      entry_cmp, search);
+#undef entry_cmp
+
+	return idx < r->nr ? idx : -1;
+}
+
+int bch2_replicas_entry_idx(struct bch_fs *c,
+			    struct bch_replicas_entry *search)
+{
+	replicas_entry_sort(search);
+
+	return __replicas_entry_idx(&c->replicas, search);
+}
+
 static bool __replicas_has_entry(struct bch_replicas_cpu *r,
 				 struct bch_replicas_entry *search)
 {
-	return replicas_entry_bytes(search) <= r->entry_size &&
-		eytzinger0_find(r->entries, r->nr,
-				r->entry_size,
-				memcmp, search) < r->nr;
+	return __replicas_entry_idx(r, search) >= 0;
 }
 
-static bool replicas_has_entry(struct bch_fs *c,
-			       struct bch_replicas_entry *search,
-			       bool check_gc_replicas)
+bool bch2_replicas_marked(struct bch_fs *c,
+			  struct bch_replicas_entry *search,
+			  bool check_gc_replicas)
 {
 	bool marked;
 
+	if (!search->nr_devs)
+		return true;
+
+	verify_replicas_entry_sorted(search);
+
 	percpu_down_read(&c->mark_lock);
 	marked = __replicas_has_entry(&c->replicas, search) &&
 		(!check_gc_replicas ||
@@ -202,6 +232,76 @@ static bool replicas_has_entry(struct bch_fs *c,
 	return marked;
 }
 
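+/*
+ * Fold src_p's percpu counters onto one cpu, then copy them to dst_p with
+ * each data[] slot remapped to its index in the new replicas table:
+ */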
+static void __replicas_table_update(struct bch_fs_usage __percpu *dst_p,
+				    struct bch_replicas_cpu *dst_r,
+				    struct bch_fs_usage __percpu *src_p,
+				    struct bch_replicas_cpu *src_r)
+{
+	unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr;
+	struct bch_fs_usage *dst, *src = (void *)
+		bch2_acc_percpu_u64s((void *) src_p, src_nr);
+	int src_idx, dst_idx;
+
+	preempt_disable();
+	dst = this_cpu_ptr(dst_p);
+	preempt_enable();
+
+	*dst = *src;
+
+	for (src_idx = 0; src_idx < src_r->nr; src_idx++) {
+		if (!src->data[src_idx])
+			continue;
+
+		dst_idx = __replicas_entry_idx(dst_r,
+				cpu_replicas_entry(src_r, src_idx));
+		BUG_ON(dst_idx < 0);
+
+		dst->data[dst_idx] = src->data[src_idx];
+	}
+}
+
+/*
+ * Resize filesystem accounting:
+ */
+static int replicas_table_update(struct bch_fs *c,
+				 struct bch_replicas_cpu *new_r)
+{
+	struct bch_fs_usage __percpu *new_usage[3] = { NULL, NULL, NULL };
+	unsigned bytes = sizeof(struct bch_fs_usage) +
+		sizeof(u64) * new_r->nr;
+	unsigned i;
+	int ret = -ENOMEM;
+
+	for (i = 0; i < 3; i++) {
+		if (i < 2 && !c->usage[i])
+			continue;
+
+		new_usage[i] = __alloc_percpu_gfp(bytes, sizeof(u64),
+						  GFP_NOIO);
+		if (!new_usage[i])
+			goto err;
+	}
+
+	for (i = 0; i < 2; i++) {
+		if (!c->usage[i])
+			continue;
+
+		__replicas_table_update(new_usage[i],	new_r,
+					c->usage[i],	&c->replicas);
+
+		swap(c->usage[i], new_usage[i]);
+	}
+
+	swap(c->usage_scratch, new_usage[2]);
+
+	swap(c->replicas, *new_r);
+	ret = 0;
+err:
+	for (i = 0; i < 3; i++)
+		free_percpu(new_usage[i]);
+	return ret;
+}
+
 noinline
 static int bch2_mark_replicas_slowpath(struct bch_fs *c,
 				struct bch_replicas_entry *new_entry)
@@ -243,7 +343,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
 	/* don't update in memory replicas until changes are persistent */
 	percpu_down_write(&c->mark_lock);
 	if (new_r.entries)
-		swap(new_r, c->replicas);
+		ret = replicas_table_update(c, &new_r);
 	if (new_gc.entries)
 		swap(new_gc, c->replicas_gc);
 	percpu_up_write(&c->mark_lock);
@@ -258,30 +358,32 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c,
 	return ret;
 }
 
-static int __bch2_mark_replicas(struct bch_fs *c,
-				struct bch_replicas_entry *devs)
+int bch2_mark_replicas(struct bch_fs *c,
+		       struct bch_replicas_entry *r)
 {
-	return likely(replicas_has_entry(c, devs, true))
+	return likely(bch2_replicas_marked(c, r, true))
 		? 0
-		: bch2_mark_replicas_slowpath(c, devs);
+		: bch2_mark_replicas_slowpath(c, r);
 }
 
-int bch2_mark_replicas(struct bch_fs *c,
-		       enum bch_data_type data_type,
-		       struct bch_devs_list devs)
+bool bch2_bkey_replicas_marked(struct bch_fs *c,
+			       struct bkey_s_c k,
+			       bool check_gc_replicas)
 {
 	struct bch_replicas_padded search;
+	struct bch_devs_list cached = bch2_bkey_cached_devs(k);
+	unsigned i;
 
-	if (!devs.nr)
-		return 0;
-
-	memset(&search, 0, sizeof(search));
+	for (i = 0; i < cached.nr; i++) {
+		bch2_replicas_entry_cached(&search.e, cached.devs[i]);
 
-	BUG_ON(devs.nr >= BCH_REPLICAS_MAX);
+		if (!bch2_replicas_marked(c, &search.e, check_gc_replicas))
+			return false;
+	}
 
-	devlist_to_replicas(devs, data_type, &search.e);
+	bkey_to_replicas(&search.e, k);
 
-	return __bch2_mark_replicas(c, &search.e);
+	return bch2_replicas_marked(c, &search.e, check_gc_replicas);
 }
 
 int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
@@ -291,22 +393,23 @@ int bch2_mark_bkey_replicas(struct bch_fs *c, struct bkey_s_c k)
 	unsigned i;
 	int ret;
 
-	memset(&search, 0, sizeof(search));
+	for (i = 0; i < cached.nr; i++) {
+		bch2_replicas_entry_cached(&search.e, cached.devs[i]);
 
-	for (i = 0; i < cached.nr; i++)
-		if ((ret = bch2_mark_replicas(c, BCH_DATA_CACHED,
-					      bch2_dev_list_single(cached.devs[i]))))
+		ret = bch2_mark_replicas(c, &search.e);
+		if (ret)
 			return ret;
+	}
 
-	bkey_to_replicas(k, &search.e);
+	bkey_to_replicas(&search.e, k);
 
-	return search.e.nr_devs
-		? __bch2_mark_replicas(c, &search.e)
-		: 0;
+	return bch2_mark_replicas(c, &search.e);
 }
 
 int bch2_replicas_gc_end(struct bch_fs *c, int ret)
 {
+	unsigned i;
+
 	lockdep_assert_held(&c->replicas_gc_lock);
 
 	mutex_lock(&c->sb_lock);
@@ -314,6 +417,39 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
 	if (ret)
 		goto err;
 
+	/*
+	 * this is kind of crappy; the replicas gc mechanism needs to be ripped
+	 * out
+	 */
+
+	for (i = 0; i < c->replicas.nr; i++) {
+		struct bch_replicas_entry *e =
+			cpu_replicas_entry(&c->replicas, i);
+		struct bch_replicas_cpu n;
+		u64 v = 0;
+		int cpu;
+
+		if (__replicas_has_entry(&c->replicas_gc, e))
+			continue;
+
+		for_each_possible_cpu(cpu)
+			v += *per_cpu_ptr(&c->usage[0]->data[i], cpu);
+		if (!v)
+			continue;
+
+		n = cpu_replicas_add_entry(&c->replicas_gc, e);
+		if (!n.entries) {
+			ret = -ENOSPC;
+			goto err;
+		}
+
+		percpu_down_write(&c->mark_lock);
+		swap(n, c->replicas_gc);
+		percpu_up_write(&c->mark_lock);
+
+		kfree(n.entries);
+	}
+
 	if (bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc)) {
 		ret = -ENOSPC;
 		goto err;
@@ -325,7 +461,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
 err:
 	percpu_down_write(&c->mark_lock);
 	if (!ret)
-		swap(c->replicas, c->replicas_gc);
+		ret = replicas_table_update(c, &c->replicas_gc);
 
 	kfree(c->replicas_gc.entries);
 	c->replicas_gc.entries = NULL;
@@ -461,7 +597,7 @@ int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c)
 	bch2_cpu_replicas_sort(&new_r);
 
 	percpu_down_write(&c->mark_lock);
-	swap(c->replicas, new_r);
+	ret = replicas_table_update(c, &new_r);
 	percpu_up_write(&c->mark_lock);
 
 	kfree(new_r.entries);
@@ -628,7 +764,7 @@ static void bch2_sb_replicas_to_text(struct printbuf *out,
 			pr_buf(out, " ");
 		first = false;
 
-		replicas_entry_to_text(out, e);
+		bch2_replicas_entry_to_text(out, e);
 	}
 }
 
@@ -677,46 +813,6 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
 
 /* Query replicas: */
 
-bool bch2_replicas_marked(struct bch_fs *c,
-			  enum bch_data_type data_type,
-			  struct bch_devs_list devs,
-			  bool check_gc_replicas)
-{
-	struct bch_replicas_padded search;
-
-	if (!devs.nr)
-		return true;
-
-	memset(&search, 0, sizeof(search));
-
-	devlist_to_replicas(devs, data_type, &search.e);
-
-	return replicas_has_entry(c, &search.e, check_gc_replicas);
-}
-
-bool bch2_bkey_replicas_marked(struct bch_fs *c,
-			       struct bkey_s_c k,
-			       bool check_gc_replicas)
-{
-	struct bch_replicas_padded search;
-	struct bch_devs_list cached = bch2_bkey_cached_devs(k);
-	unsigned i;
-
-	memset(&search, 0, sizeof(search));
-
-	for (i = 0; i < cached.nr; i++)
-		if (!bch2_replicas_marked(c, BCH_DATA_CACHED,
-					  bch2_dev_list_single(cached.devs[i]),
-					  check_gc_replicas))
-			return false;
-
-	bkey_to_replicas(k, &search.e);
-
-	return search.e.nr_devs
-		? replicas_has_entry(c, &search.e, check_gc_replicas)
-		: true;
-}
-
 struct replicas_status __bch2_replicas_status(struct bch_fs *c,
 					      struct bch_devs_mask online_devs)
 {
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
index 03aaafdc7c17..923bddb21ec3 100644
--- a/fs/bcachefs/replicas.h
+++ b/fs/bcachefs/replicas.h
@@ -2,17 +2,42 @@
 #ifndef _BCACHEFS_REPLICAS_H
 #define _BCACHEFS_REPLICAS_H
 
+#include "eytzinger.h"
 #include "replicas_types.h"
 
-bool bch2_replicas_marked(struct bch_fs *, enum bch_data_type,
-			  struct bch_devs_list, bool);
+void bch2_replicas_entry_to_text(struct printbuf *,
+				 struct bch_replicas_entry *);
+void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
+
+static inline struct bch_replicas_entry *
+cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
+{
+	return (void *) r->entries + r->entry_size * i;
+}
+
+int bch2_replicas_entry_idx(struct bch_fs *,
+			    struct bch_replicas_entry *);
+
+void bch2_devlist_to_replicas(struct bch_replicas_entry *,
+			      enum bch_data_type,
+			      struct bch_devs_list);
+bool bch2_replicas_marked(struct bch_fs *,
+			  struct bch_replicas_entry *, bool);
+int bch2_mark_replicas(struct bch_fs *,
+		       struct bch_replicas_entry *);
+
 bool bch2_bkey_replicas_marked(struct bch_fs *,
 			       struct bkey_s_c, bool);
-int bch2_mark_replicas(struct bch_fs *, enum bch_data_type,
-		       struct bch_devs_list);
 int bch2_mark_bkey_replicas(struct bch_fs *, struct bkey_s_c);
 
-void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
+static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
+					      unsigned dev)
+{
+	e->data_type	= BCH_DATA_CACHED;
+	e->nr_devs	= 1;
+	e->nr_required	= 1;
+	e->devs[0]	= dev;
+}
 
 struct replicas_status {
 	struct {
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 55069f40d04b..9a862b19ce22 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -375,6 +375,7 @@ static void bch2_fs_free(struct bch_fs *c)
 	bch2_io_clock_exit(&c->io_clock[READ]);
 	bch2_fs_compress_exit(c);
 	percpu_free_rwsem(&c->mark_lock);
+	free_percpu(c->usage_scratch);
 	free_percpu(c->usage[0]);
 	free_percpu(c->pcpu);
 	mempool_exit(&c->btree_iters_pool);
@@ -506,7 +507,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 {
 	struct bch_sb_field_members *mi;
 	struct bch_fs *c;
-	unsigned i, iter_size;
+	unsigned i, iter_size, fs_usage_size;
 	const char *err;
 
 	pr_verbose_init(opts, "");
@@ -600,6 +601,9 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 		(btree_blocks(c) + 1) * 2 *
 		sizeof(struct btree_node_iter_set);
 
+	fs_usage_size = sizeof(struct bch_fs_usage) +
+		sizeof(u64) * c->replicas.nr;
+
 	if (!(c->wq = alloc_workqueue("bcachefs",
 				WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_HIGHPRI, 1)) ||
 	    !(c->copygc_wq = alloc_workqueue("bcache_copygc",
@@ -616,7 +620,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
 			max(offsetof(struct btree_read_bio, bio),
 			    offsetof(struct btree_write_bio, wbio.bio)),
 			BIOSET_NEED_BVECS) ||
-	    !(c->usage[0] = alloc_percpu(struct bch_fs_usage)) ||
+	    !(c->usage[0] = __alloc_percpu(fs_usage_size, sizeof(u64))) ||
+	    !(c->usage_scratch = __alloc_percpu(fs_usage_size, sizeof(u64))) ||
 	    !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
 	    mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
 					btree_bytes(c)) ||
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index 27fd6dfe83f5..424636310bbf 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -234,33 +234,45 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
 static ssize_t show_fs_alloc_debug(struct bch_fs *c, char *buf)
 {
 	struct printbuf out = _PBUF(buf, PAGE_SIZE);
-	struct bch_fs_usage stats = bch2_fs_usage_read(c);
-	unsigned replicas, type;
+	struct bch_fs_usage *fs_usage = bch2_fs_usage_read(c);
+	unsigned i;
 
-	pr_buf(&out, "capacity:\t\t%llu\n", c->capacity);
+	if (!fs_usage)
+		return -ENOMEM;
 
-	for (replicas = 0; replicas < ARRAY_SIZE(stats.replicas); replicas++) {
-		pr_buf(&out, "%u replicas:\n", replicas + 1);
+	pr_buf(&out, "capacity:\t\t%llu\n", c->capacity);
 
+	for (i = 0;
+	     i < ARRAY_SIZE(fs_usage->persistent_reserved);
+	     i++) {
+		pr_buf(&out, "%u replicas:\n", i + 1);
+#if 0
 		for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++)
 			pr_buf(&out, "\t%s:\t\t%llu\n",
 			       bch2_data_types[type],
 			       stats.replicas[replicas].data[type]);
 		pr_buf(&out, "\terasure coded:\t%llu\n",
 		       stats.replicas[replicas].ec_data);
+#endif
 		pr_buf(&out, "\treserved:\t%llu\n",
-		       stats.replicas[replicas].persistent_reserved);
+		       fs_usage->persistent_reserved[i]);
 	}
 
-	pr_buf(&out, "bucket usage\n");
+	pr_buf(&out, "online reserved:\t%llu\n",
+	       fs_usage->s.online_reserved);
 
-	for (type = BCH_DATA_SB; type < BCH_DATA_NR; type++)
-		pr_buf(&out, "\t%s:\t\t%llu\n",
-		       bch2_data_types[type],
-		       stats.buckets[type]);
+	for (i = 0; i < c->replicas.nr; i++) {
+		struct bch_replicas_entry *e =
+			cpu_replicas_entry(&c->replicas, i);
 
-	pr_buf(&out, "online reserved:\t%llu\n",
-	       stats.s.online_reserved);
+		pr_buf(&out, "\t");
+		bch2_replicas_entry_to_text(&out, e);
+		pr_buf(&out, ":\t%llu\n", fs_usage->data[i]);
+	}
+
+	percpu_up_read(&c->mark_lock);
+
+	kfree(fs_usage);
 
 	return out.pos - buf;
 }
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 9f3eafb3e0d4..295f4577e9c1 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -904,3 +904,28 @@ void eytzinger0_find_test(void)
 	kfree(test_array);
 }
 #endif
+
+/*
+ * Accumulate percpu counters onto one cpu's copy - only valid when
+ * concurrent access to the counters is otherwise excluded
+ */
+u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
+{
+	u64 *ret;
+	int cpu;
+
+	preempt_disable();
+	ret = this_cpu_ptr(p);
+	preempt_enable();
+
+	for_each_possible_cpu(cpu) {
+		u64 *i = per_cpu_ptr(p, cpu);
+
+		if (i != ret) {
+			acc_u64s(ret, i, nr);
+			memset(i, 0, nr * sizeof(u64));
+		}
+	}
+
+	return ret;
+}
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index 8bbb0e30d07f..fa1a3adc87df 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -718,4 +718,6 @@ static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src,
 		acc_u64s(acc, per_cpu_ptr(src, cpu), nr);
 }
 
+u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned);
+
 #endif /* _BCACHEFS_UTIL_H */
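
One more usage sketch (commentary): bch2_acc_percpu_u64s(), moved here
from btree_gc.c, folds every cpu's copy of an array of u64s onto one
cpu's copy and zeroes the rest, so it is only safe while writers are
excluded - e.g. under percpu_down_write(&c->mark_lock), as in
bch2_gc_done():

	unsigned nr = sizeof(struct bch_fs_usage) / sizeof(u64) +
		c->replicas.nr;
	u64 *sum;

	percpu_down_write(&c->mark_lock);
	sum = bch2_acc_percpu_u64s((u64 __percpu *) c->usage[0], nr);
	/* sum points into c->usage[0]; other cpus' copies are now zero */
	percpu_up_write(&c->mark_lock);
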
-- 
2.30.9