Commit e1dc191d authored by Linus Torvalds

Merge tag 'bcachefs-2024-04-10' of https://evilpiepirate.org/git/bcachefs

Pull more bcachefs fixes from Kent Overstreet:
 "Notable user impacting bugs

   - On multi device filesystems, recovery was looping in
     btree_trans_too_many_iters(). This checks if a transaction has
     touched too many btree paths (because of iteration over many keys),
     and issues a restart to drop unneeded paths.

     But it's now possible for the path count to exceed the previous
     limit without any iteration, in the interior btree update path: the
     transaction commit will do alloc updates for every old and new
     btree node, and during journal replay we don't use the btree write
     buffer for locking reasons and thus those updates use btree paths
     when they wouldn't normally.

   - Fix a corner case in rebalance when moving extents on a
     durability=0 device. This wouldn't be hit when a device was
     formatted with durability=0 since in that case we'll only use it as
     a write through cache (only cached extents will live on it), but
     durability can now be changed on an existing device.

   - bch2_get_acl() could rarely forget to handle a transaction restart;
     this manifested as the occasional missing acl that came back after
     dropping caches (see the retry sketch after this message).

   - Fix a major performance regression on high iops multithreaded write
     workloads (only since 6.9-rc1); a previous fix for a deadlock in
     the interior btree update path to check the journal watermark
     introduced a dependency on the state of btree write buffer flushing
     that we didn't want.

   - Assorted other repair paths and recovery fixes"
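
A minimal sketch of the transaction-restart retry idiom the acl fix restores
(the hypothetical do_xattr_lookup() stands in for the hash lookup, peek_slot
and decode steps; the real bch2_get_acl() body appears in the diff below):

	static struct posix_acl *get_acl_sketch(struct bch_fs *c,
						struct bch_inode_info *inode)
	{
		struct btree_trans *trans = bch2_trans_get(c);
		struct posix_acl *acl = NULL;
		int ret;
	retry:
		bch2_trans_begin(trans);

		/* hypothetical helper: lookup + decode, may return a restart error */
		ret = do_xattr_lookup(trans, inode, &acl);

		/*
		 * The bug was a path that skipped this check; a restart must
		 * always loop back before the result is used or cached:
		 */
		if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
			goto retry;

		if (ret)
			acl = bch2_err_matches(ret, ENOENT) ? NULL : ERR_PTR(ret);

		bch2_trans_put(trans);
		return acl;
	}
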

* tag 'bcachefs-2024-04-10' of https://evilpiepirate.org/git/bcachefs: (25 commits)
  bcachefs: Fix __bch2_btree_and_journal_iter_init_node_iter()
  bcachefs: Kill read lock dropping in bch2_btree_node_lock_write_nofail()
  bcachefs: Fix a race in btree_update_nodes_written()
  bcachefs: btree_node_scan: Respect member.data_allowed
  bcachefs: Don't scan for btree nodes when we can reconstruct
  bcachefs: Fix check_topology() when using node scan
  bcachefs: fix eytzinger0_find_gt()
  bcachefs: fix bch2_get_acl() transaction restart handling
  bcachefs: fix the count of nr_freed_pcpu after changing bc->freed_nonpcpu list
  bcachefs: Fix gap buffer bug in bch2_journal_key_insert_take()
  bcachefs: Rename struct field swap to prevent macro naming collision
  MAINTAINERS: Add entry for bcachefs documentation
  Documentation: filesystems: Add bcachefs toctree
  bcachefs: JOURNAL_SPACE_LOW
  bcachefs: Disable errors=panic for BCH_IOCTL_FSCK_OFFLINE
  bcachefs: Fix BCH_IOCTL_FSCK_OFFLINE for encrypted filesystems
  bcachefs: fix rand_delete unit test
  bcachefs: fix ! vs ~ typo in __clear_bit_le64()
  bcachefs: Fix rebalance from durability=0 device
  bcachefs: Print shutdown journal sequence number
  ...
parents 346668f0 1189bdda
.. SPDX-License-Identifier: GPL-2.0

======================
bcachefs Documentation
======================

.. toctree::
   :maxdepth: 2
   :numbered:

   errorcodes
......@@ -69,6 +69,7 @@ Documentation for filesystem implementations.
afs
autofs
autofs-mount-control
bcachefs/index
befs
bfs
btrfs
......
......@@ -3573,6 +3573,7 @@ S: Supported
C: irc://irc.oftc.net/bcache
T: git https://evilpiepirate.org/git/bcachefs.git
F: fs/bcachefs/
F: Documentation/filesystems/bcachefs/
BDISP ST MEDIA DRIVER
M: Fabien Dessenne <fabien.dessenne@foss.st.com>
......
......@@ -281,7 +281,6 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0);
struct btree_trans *trans = bch2_trans_get(c);
struct btree_iter iter = { NULL };
struct bkey_s_c_xattr xattr;
struct posix_acl *acl = NULL;
struct bkey_s_c k;
int ret;
......@@ -290,28 +289,27 @@ struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc,
&hash, inode_inum(inode), &search, 0);
if (ret) {
if (!bch2_err_matches(ret, ENOENT))
acl = ERR_PTR(ret);
goto out;
}
if (ret)
goto err;
k = bch2_btree_iter_peek_slot(&iter);
ret = bkey_err(k);
if (ret) {
acl = ERR_PTR(ret);
goto out;
}
if (ret)
goto err;
xattr = bkey_s_c_to_xattr(k);
struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k);
acl = bch2_acl_from_disk(trans, xattr_val(xattr.v),
le16_to_cpu(xattr.v->x_val_len));
le16_to_cpu(xattr.v->x_val_len));
ret = PTR_ERR_OR_ZERO(acl);
err:
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto retry;
if (!IS_ERR(acl))
if (ret)
acl = !bch2_err_matches(ret, ENOENT) ? ERR_PTR(ret) : NULL;
if (!IS_ERR_OR_NULL(acl))
set_cached_acl(&inode->v, type, acl);
out:
if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart))
goto retry;
bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
......
......@@ -1535,6 +1535,20 @@ enum btree_id {
BTREE_ID_NR
};
static inline bool btree_id_is_alloc(enum btree_id id)
{
switch (id) {
case BTREE_ID_alloc:
case BTREE_ID_backpointers:
case BTREE_ID_need_discard:
case BTREE_ID_freespace:
case BTREE_ID_bucket_gens:
return true;
default:
return false;
}
}
#define BTREE_MAX_DEPTH 4U
/* Btree nodes */
......
......@@ -368,11 +368,16 @@ static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct
buf.buf)) {
bch2_btree_node_evict(trans, cur_k.k);
cur = NULL;
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?:
bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
ret = bch2_journal_key_delete(c, b->c.btree_id,
b->c.level, cur_k.k->k.p);
if (ret)
break;
if (!btree_id_is_alloc(b->c.btree_id)) {
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
if (ret)
break;
}
continue;
}
......@@ -544,12 +549,12 @@ int bch2_check_topology(struct bch_fs *c)
bch2_btree_root_alloc_fake(c, i, 0);
} else {
bch2_btree_root_alloc_fake(c, i, 1);
bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
ret = bch2_get_scanned_nodes(c, i, 0, POS_MIN, SPOS_MAX);
if (ret)
break;
}
bch2_shoot_down_journal_keys(c, i, 1, BTREE_MAX_DEPTH, POS_MIN, SPOS_MAX);
reconstructed_root = true;
}
......
......@@ -642,7 +642,7 @@ int __bch2_btree_trans_too_many_iters(struct btree_trans *);
static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8)
if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_NORMAL_LIMIT - 8)
return __bch2_btree_trans_too_many_iters(trans);
return 0;
......
......@@ -130,12 +130,30 @@ struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree
return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx);
}
static void journal_iter_verify(struct journal_iter *iter)
{
struct journal_keys *keys = iter->keys;
size_t gap_size = keys->size - keys->nr;
BUG_ON(iter->idx >= keys->gap &&
iter->idx < keys->gap + gap_size);
if (iter->idx < keys->size) {
struct journal_key *k = keys->data + iter->idx;
int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
cmp_int(k->level, iter->level);
BUG_ON(cmp < 0);
}
}
static void journal_iters_fix(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
/* The key we just inserted is immediately before the gap: */
size_t gap_end = keys->gap + (keys->size - keys->nr);
struct btree_and_journal_iter *iter;
struct journal_key *new_key = &keys->data[keys->gap - 1];
struct journal_iter *iter;
/*
* If an iterator points one after the key we just inserted, decrement
......@@ -143,9 +161,14 @@ static void journal_iters_fix(struct bch_fs *c)
* decrement was unnecessary, bch2_btree_and_journal_iter_peek() will
* handle that:
*/
list_for_each_entry(iter, &c->journal_iters, journal.list)
if (iter->journal.idx == gap_end)
iter->journal.idx = keys->gap - 1;
list_for_each_entry(iter, &c->journal_iters, list) {
journal_iter_verify(iter);
if (iter->idx == gap_end &&
new_key->btree_id == iter->btree_id &&
new_key->level == iter->level)
iter->idx = keys->gap - 1;
journal_iter_verify(iter);
}
}
static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap)
......@@ -192,7 +215,12 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
if (idx > keys->gap)
idx -= keys->size - keys->nr;
size_t old_gap = keys->gap;
if (keys->nr == keys->size) {
journal_iters_move_gap(c, old_gap, keys->size);
old_gap = keys->size;
struct journal_keys new_keys = {
.nr = keys->nr,
.size = max_t(size_t, keys->size, 8) * 2,
......@@ -216,7 +244,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
keys->gap = keys->nr;
}
journal_iters_move_gap(c, keys->gap, idx);
journal_iters_move_gap(c, old_gap, idx);
move_gap(keys, idx);
......@@ -301,16 +329,21 @@ static void bch2_journal_iter_advance(struct journal_iter *iter)
static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter)
{
struct journal_key *k = iter->keys->data + iter->idx;
journal_iter_verify(iter);
while (iter->idx < iter->keys->size) {
struct journal_key *k = iter->keys->data + iter->idx;
int cmp = cmp_int(k->btree_id, iter->btree_id) ?:
cmp_int(k->level, iter->level);
if (cmp > 0)
break;
BUG_ON(cmp);
while (k < iter->keys->data + iter->keys->size &&
k->btree_id == iter->btree_id &&
k->level == iter->level) {
if (!k->overwritten)
return bkey_i_to_s_c(k->k);
bch2_journal_iter_advance(iter);
k = iter->keys->data + iter->idx;
}
return bkey_s_c_null;
......@@ -330,6 +363,8 @@ static void bch2_journal_iter_init(struct bch_fs *c,
iter->level = level;
iter->keys = &c->journal_keys;
iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos);
journal_iter_verify(iter);
}
static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter)
......@@ -434,10 +469,15 @@ void __bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
iter->trans = trans;
iter->b = b;
iter->node_iter = node_iter;
bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
INIT_LIST_HEAD(&iter->journal.list);
iter->pos = b->data->min_key;
iter->at_end = false;
INIT_LIST_HEAD(&iter->journal.list);
if (trans->journal_replay_not_finished) {
bch2_journal_iter_init(trans->c, &iter->journal, b->c.btree_id, b->c.level, pos);
if (!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
list_add(&iter->journal.list, &trans->c->journal_iters);
}
}
/*
......@@ -452,9 +492,6 @@ void bch2_btree_and_journal_iter_init_node_iter(struct btree_trans *trans,
bch2_btree_node_iter_init_from_start(&node_iter, b);
__bch2_btree_and_journal_iter_init_node_iter(trans, iter, b, node_iter, b->data->min_key);
if (trans->journal_replay_not_finished &&
!test_bit(BCH_FS_may_go_rw, &trans->c->flags))
list_add(&iter->journal.list, &trans->c->journal_iters);
}
/* sort and dedup all keys in the journal: */
......
......@@ -169,6 +169,7 @@ static void bkey_cached_move_to_freelist(struct btree_key_cache *bc,
} else {
mutex_lock(&bc->lock);
list_move_tail(&ck->list, &bc->freed_pcpu);
bc->nr_freed_pcpu++;
mutex_unlock(&bc->lock);
}
}
......@@ -245,6 +246,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path,
if (!list_empty(&bc->freed_pcpu)) {
ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list);
list_del_init(&ck->list);
bc->nr_freed_pcpu--;
}
mutex_unlock(&bc->lock);
}
......@@ -659,7 +661,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
commit_flags |= BCH_WATERMARK_reclaim;
if (ck->journal.seq != journal_last_seq(j) ||
j->watermark == BCH_WATERMARK_stripe)
!test_bit(JOURNAL_SPACE_LOW, &c->journal.flags))
commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
ret = bch2_btree_iter_traverse(&b_iter) ?:
......
......@@ -440,33 +440,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
struct btree_path *path,
struct btree_bkey_cached_common *b)
{
struct btree_path *linked;
unsigned i, iter;
int ret;
/*
* XXX BIG FAT NOTICE
*
* Drop all read locks before taking a write lock:
*
* This is a hack, because bch2_btree_node_lock_write_nofail() is a
* hack - but by dropping read locks first, this should never fail, and
* we only use this in code paths where whatever read locks we've
* already taken are no longer needed:
*/
trans_for_each_path(trans, linked, iter) {
if (!linked->nodes_locked)
continue;
for (i = 0; i < BTREE_MAX_DEPTH; i++)
if (btree_node_read_locked(linked, i)) {
btree_node_unlock(trans, linked, i);
btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK);
}
}
ret = __btree_node_lock_write(trans, path, b, true);
int ret = __btree_node_lock_write(trans, path, b, true);
BUG_ON(ret);
}
......
......@@ -133,6 +133,9 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca,
if (le64_to_cpu(bn->magic) != bset_magic(c))
return;
if (btree_id_is_alloc(BTREE_NODE_ID(bn)))
return;
rcu_read_lock();
struct found_btree_node n = {
.btree_id = BTREE_NODE_ID(bn),
......@@ -213,6 +216,9 @@ static int read_btree_nodes(struct find_btree_nodes *f)
closure_init_stack(&cl);
for_each_online_member(c, ca) {
if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree)))
continue;
struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL);
struct task_struct *t;
......@@ -290,7 +296,7 @@ static int handle_overwrites(struct bch_fs *c,
found_btree_node_to_text(&buf, c, n);
bch_err(c, "%s", buf.buf);
printbuf_exit(&buf);
return -1;
return -BCH_ERR_fsck_repair_unimplemented;
}
}
......@@ -436,6 +442,9 @@ bool bch2_btree_has_scanned_nodes(struct bch_fs *c, enum btree_id btree)
int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree,
unsigned level, struct bpos node_min, struct bpos node_max)
{
if (btree_id_is_alloc(btree))
return 0;
struct find_btree_nodes *f = &c->found_btree_nodes;
int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes);
......
......@@ -364,7 +364,21 @@ struct btree_insert_entry {
unsigned long ip_allocated;
};
/* Number of btree paths we preallocate, usually enough */
#define BTREE_ITER_INITIAL 64
/*
* Limit for btree_trans_too_many_iters(); this is enough that almost all code
* paths should run inside this limit, and if they don't it usually indicates a
* bug (leaking/duplicated btree paths).
*
* exception: some fsck paths
*
* bugs with excessive path usage seem to have possibly been eliminated now, so
* we might consider eliminating this (and btree_trans_too_many_iters()) at some
* point.
*/
#define BTREE_ITER_NORMAL_LIMIT 256
/* never exceed limit */
#define BTREE_ITER_MAX (1U << 10)
struct btree_trans_commit_hook;
......
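
A quick sanity check of the numbers these constants imply (illustrative only;
these asserts are not part of the commit). The recovery loop described in the
pull message was the old threshold being crossed without any key iteration:
during journal replay the btree write buffer isn't used, so the alloc updates
done for every old and new btree node each take a btree path, and an interior
update could exceed 56 paths on its own:

	static_assert(BTREE_ITER_INITIAL - 8      == 56);   /* old restart threshold */
	static_assert(BTREE_ITER_NORMAL_LIMIT - 8 == 248);  /* new restart threshold */
	static_assert(BTREE_ITER_MAX              == 1024); /* hard cap, unchanged   */
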
......@@ -26,9 +26,9 @@
#include <linux/random.h>
const char * const bch2_btree_update_modes[] = {
static const char * const bch2_btree_update_modes[] = {
#define x(t) #t,
BCH_WATERMARKS()
BTREE_UPDATE_MODES()
#undef x
NULL
};
......@@ -704,9 +704,13 @@ static void btree_update_nodes_written(struct btree_update *as)
bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c,
"%s", bch2_err_str(ret));
err:
if (as->b) {
b = as->b;
/*
* We have to be careful because another thread might be getting ready
* to free as->b and calling btree_update_reparent() on us - we'll
* recheck under btree_update_lock below:
*/
b = READ_ONCE(as->b);
if (b) {
btree_path_idx_t path_idx = get_unlocked_mut_path(trans,
as->btree_id, b->c.level, b->key.k.p);
struct btree_path *path = trans->paths + path_idx;
......@@ -850,15 +854,17 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
{
struct bch_fs *c = as->c;
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
BUG_ON(as->mode != BTREE_UPDATE_none);
BUG_ON(as->update_level_end < b->c.level);
BUG_ON(!btree_node_dirty(b));
BUG_ON(!b->c.level);
mutex_lock(&c->btree_interior_update_lock);
list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten);
as->mode = BTREE_UPDATE_node;
as->b = b;
as->update_level_end = b->c.level;
set_btree_node_write_blocked(b);
list_add(&as->write_blocked_list, &b->write_blocked);
......@@ -1100,7 +1106,7 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *
static struct btree_update *
bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
unsigned level, bool split, unsigned flags)
unsigned level_start, bool split, unsigned flags)
{
struct bch_fs *c = trans->c;
struct btree_update *as;
......@@ -1108,7 +1114,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
? BCH_DISK_RESERVATION_NOFAIL : 0;
unsigned nr_nodes[2] = { 0, 0 };
unsigned update_level = level;
unsigned level_end = level_start;
enum bch_watermark watermark = flags & BCH_WATERMARK_MASK;
int ret = 0;
u32 restart_count = trans->restart_count;
......@@ -1123,34 +1129,30 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
flags &= ~BCH_WATERMARK_MASK;
flags |= watermark;
if (watermark < c->journal.watermark) {
struct journal_res res = { 0 };
unsigned journal_flags = watermark|JOURNAL_RES_GET_CHECK;
if (watermark < BCH_WATERMARK_reclaim &&
test_bit(JOURNAL_SPACE_LOW, &c->journal.flags)) {
if (flags & BCH_TRANS_COMMIT_journal_reclaim)
return ERR_PTR(-BCH_ERR_journal_reclaim_would_deadlock);
if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark < BCH_WATERMARK_reclaim)
journal_flags |= JOURNAL_RES_GET_NONBLOCK;
ret = drop_locks_do(trans,
bch2_journal_res_get(&c->journal, &res, 1, journal_flags));
if (bch2_err_matches(ret, BCH_ERR_operation_blocked))
ret = -BCH_ERR_journal_reclaim_would_deadlock;
bch2_trans_unlock(trans);
wait_event(c->journal.wait, !test_bit(JOURNAL_SPACE_LOW, &c->journal.flags));
ret = bch2_trans_relock(trans);
if (ret)
return ERR_PTR(ret);
}
while (1) {
nr_nodes[!!update_level] += 1 + split;
update_level++;
nr_nodes[!!level_end] += 1 + split;
level_end++;
ret = bch2_btree_path_upgrade(trans, path, update_level + 1);
ret = bch2_btree_path_upgrade(trans, path, level_end + 1);
if (ret)
return ERR_PTR(ret);
if (!btree_path_node(path, update_level)) {
if (!btree_path_node(path, level_end)) {
/* Allocating new root? */
nr_nodes[1] += split;
update_level = BTREE_MAX_DEPTH;
level_end = BTREE_MAX_DEPTH;
break;
}
......@@ -1158,11 +1160,11 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
* Always check for space for two keys, even if we won't have to
* split at prior level - it might have been a merge instead:
*/
if (bch2_btree_node_insert_fits(path->l[update_level].b,
if (bch2_btree_node_insert_fits(path->l[level_end].b,
BKEY_BTREE_PTR_U64s_MAX * 2))
break;
split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
split = path->l[level_end].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
}
if (!down_read_trylock(&c->gc_lock)) {
......@@ -1176,14 +1178,15 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS);
memset(as, 0, sizeof(*as));
closure_init(&as->cl, NULL);
as->c = c;
as->start_time = start_time;
as->ip_started = _RET_IP_;
as->mode = BTREE_UPDATE_none;
as->watermark = watermark;
as->took_gc_lock = true;
as->btree_id = path->btree_id;
as->update_level = update_level;
as->c = c;
as->start_time = start_time;
as->ip_started = _RET_IP_;
as->mode = BTREE_UPDATE_none;
as->watermark = watermark;
as->took_gc_lock = true;
as->btree_id = path->btree_id;
as->update_level_start = level_start;
as->update_level_end = level_end;
INIT_LIST_HEAD(&as->list);
INIT_LIST_HEAD(&as->unwritten_list);
INIT_LIST_HEAD(&as->write_blocked_list);
......@@ -1373,12 +1376,12 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as,
}
static void
__bch2_btree_insert_keys_interior(struct btree_update *as,
struct btree_trans *trans,
struct btree_path *path,
struct btree *b,
struct btree_node_iter node_iter,
struct keylist *keys)
bch2_btree_insert_keys_interior(struct btree_update *as,
struct btree_trans *trans,
struct btree_path *path,
struct btree *b,
struct btree_node_iter node_iter,
struct keylist *keys)
{
struct bkey_i *insert = bch2_keylist_front(keys);
struct bkey_packed *k;
......@@ -1534,7 +1537,7 @@ static void btree_split_insert_keys(struct btree_update *as,
bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p);
__bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys);
BUG_ON(bch2_btree_node_check_topology(trans, b));
}
......@@ -1714,27 +1717,6 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
goto out;
}
static void
bch2_btree_insert_keys_interior(struct btree_update *as,
struct btree_trans *trans,
struct btree_path *path,
struct btree *b,
struct keylist *keys)
{
struct btree_path *linked;
unsigned i;
__bch2_btree_insert_keys_interior(as, trans, path, b,
path->l[b->c.level].iter, keys);
btree_update_updated_node(as, b);
trans_for_each_path_with_node(trans, b, linked, i)
bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
bch2_trans_verify_paths(trans);
}
/**
* bch2_btree_insert_node - insert bkeys into a given btree node
*
......@@ -1755,7 +1737,8 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
struct keylist *keys)
{
struct bch_fs *c = as->c;
struct btree_path *path = trans->paths + path_idx;
struct btree_path *path = trans->paths + path_idx, *linked;
unsigned i;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
......@@ -1784,7 +1767,13 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
return ret;
}
bch2_btree_insert_keys_interior(as, trans, path, b, keys);
bch2_btree_insert_keys_interior(as, trans, path, b,
path->l[b->c.level].iter, keys);
trans_for_each_path_with_node(trans, b, linked, i)
bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
bch2_trans_verify_paths(trans);
live_u64s_added = (int) b->nr.live_u64s - old_live_u64s;
u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s;
......@@ -1798,6 +1787,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
bch2_maybe_compact_whiteouts(c, b))
bch2_trans_node_reinit_iter(trans, b);
btree_update_updated_node(as, b);
bch2_btree_node_unlock_write(trans, path, b);
BUG_ON(bch2_btree_node_check_topology(trans, b));
......@@ -1807,7 +1797,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
* We could attempt to avoid the transaction restart, by calling
* bch2_btree_path_upgrade() and allocating more nodes:
*/
if (b->c.level >= as->update_level) {
if (b->c.level >= as->update_level_end) {
trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
}
......@@ -2519,9 +2509,11 @@ void bch2_btree_root_alloc_fake(struct bch_fs *c, enum btree_id id, unsigned lev
static void bch2_btree_update_to_text(struct printbuf *out, struct btree_update *as)
{
prt_printf(out, "%ps: btree=%s watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
prt_printf(out, "%ps: btree=%s l=%u-%u watermark=%s mode=%s nodes_written=%u cl.remaining=%u journal_seq=%llu\n",
(void *) as->ip_started,
bch2_btree_id_str(as->btree_id),
as->update_level_start,
as->update_level_end,
bch2_watermarks[as->watermark],
bch2_btree_update_modes[as->mode],
as->nodes_written,
......
......@@ -57,7 +57,8 @@ struct btree_update {
unsigned took_gc_lock:1;
enum btree_id btree_id;
unsigned update_level;
unsigned update_level_start;
unsigned update_level_end;
struct disk_reservation disk_res;
......
......@@ -134,42 +134,38 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
struct fsck_thread {
struct thread_with_stdio thr;
struct bch_fs *c;
char **devs;
size_t nr_devs;
struct bch_opts opts;
};
static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
{
struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
if (thr->devs)
for (size_t i = 0; i < thr->nr_devs; i++)
kfree(thr->devs[i]);
kfree(thr->devs);
kfree(thr);
}
static int bch2_fsck_offline_thread_fn(struct thread_with_stdio *stdio)
{
struct fsck_thread *thr = container_of(stdio, struct fsck_thread, thr);
struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts);
if (IS_ERR(c))
return PTR_ERR(c);
struct bch_fs *c = thr->c;
int ret = 0;
if (test_bit(BCH_FS_errors_fixed, &c->flags))
ret |= 1;
if (test_bit(BCH_FS_error, &c->flags))
ret |= 4;
int ret = PTR_ERR_OR_ZERO(c);
if (ret)
return ret;
bch2_fs_stop(c);
ret = bch2_fs_start(thr->c);
if (ret)
goto err;
if (ret & 1)
if (test_bit(BCH_FS_errors_fixed, &c->flags)) {
bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: errors fixed\n", c->name);
if (ret & 4)
ret |= 1;
}
if (test_bit(BCH_FS_error, &c->flags)) {
bch2_stdio_redirect_printf(&stdio->stdio, false, "%s: still has errors\n", c->name);
ret |= 4;
}
err:
bch2_fs_stop(c);
return ret;
}
......@@ -182,7 +178,7 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
{
struct bch_ioctl_fsck_offline arg;
struct fsck_thread *thr = NULL;
u64 *devs = NULL;
darray_str(devs) = {};
long ret = 0;
if (copy_from_user(&arg, user_arg, sizeof(arg)))
......@@ -194,29 +190,32 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) ||
!(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) ||
!(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) {
ret = -ENOMEM;
goto err;
}
for (size_t i = 0; i < arg.nr_devs; i++) {
u64 dev_u64;
ret = copy_from_user_errcode(&dev_u64, &user_arg->devs[i], sizeof(u64));
if (ret)
goto err;
thr->opts = bch2_opts_empty();
thr->nr_devs = arg.nr_devs;
char *dev_str = strndup_user((char __user *)(unsigned long) dev_u64, PATH_MAX);
ret = PTR_ERR_OR_ZERO(dev_str);
if (ret)
goto err;
if (copy_from_user(devs, &user_arg->devs[0],
array_size(sizeof(user_arg->devs[0]), arg.nr_devs))) {
ret = -EINVAL;
goto err;
ret = darray_push(&devs, dev_str);
if (ret) {
kfree(dev_str);
goto err;
}
}
for (size_t i = 0; i < arg.nr_devs; i++) {
thr->devs[i] = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX);
ret = PTR_ERR_OR_ZERO(thr->devs[i]);
if (ret)
goto err;
thr = kzalloc(sizeof(*thr), GFP_KERNEL);
if (!thr) {
ret = -ENOMEM;
goto err;
}
thr->opts = bch2_opts_empty();
if (arg.opts) {
char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
......@@ -230,15 +229,26 @@ static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_a
opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
/* We need request_key() to be called before we punt to kthread: */
opt_set(thr->opts, nostart, true);
thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts);
if (!IS_ERR(thr->c) &&
thr->c->opts.errors == BCH_ON_ERROR_panic)
thr->c->opts.errors = BCH_ON_ERROR_ro;
ret = bch2_run_thread_with_stdio(&thr->thr, &bch2_offline_fsck_ops);
err:
if (ret < 0) {
if (thr)
bch2_fsck_thread_exit(&thr->thr);
pr_err("ret %s", bch2_err_str(ret));
}
kfree(devs);
out:
darray_for_each(devs, i)
kfree(*i);
darray_exit(&devs);
return ret;
err:
if (thr)
bch2_fsck_thread_exit(&thr->thr);
pr_err("ret %s", bch2_err_str(ret));
goto out;
}
static long bch2_global_ioctl(unsigned cmd, void __user *arg)
......
......@@ -598,6 +598,8 @@ int bch2_data_update_init(struct btree_trans *trans,
i++;
}
unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have));
/*
* If current extent durability is less than io_opts.data_replicas,
* we're not trying to rereplicate the extent up to data_replicas here -
......@@ -607,7 +609,7 @@ int bch2_data_update_init(struct btree_trans *trans,
* rereplicate, currently, so that users don't get an unexpected -ENOSPC
*/
if (!(m->data_opts.write_flags & BCH_WRITE_CACHED) &&
durability_have >= io_opts.data_replicas) {
!durability_required) {
m->data_opts.kill_ptrs |= m->data_opts.rewrite_ptrs;
m->data_opts.rewrite_ptrs = 0;
/* if iter == NULL, it's just a promote */
......@@ -616,11 +618,18 @@ int bch2_data_update_init(struct btree_trans *trans,
goto done;
}
m->op.nr_replicas = min(durability_removing, io_opts.data_replicas - durability_have) +
m->op.nr_replicas = min(durability_removing, durability_required) +
m->data_opts.extra_replicas;
m->op.nr_replicas_required = m->op.nr_replicas;
BUG_ON(!m->op.nr_replicas);
/*
* If device(s) were set to durability=0 after data was written to them
* we can end up with a durability=0 extent, and the normal algorithm
* that tries not to increase durability doesn't work:
*/
if (!(durability_have + durability_removing))
m->op.nr_replicas = max((unsigned) m->op.nr_replicas, 1);
m->op.nr_replicas_required = m->op.nr_replicas;
if (reserve_sectors) {
ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors,
......
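
The comment block above explains the policy change; to make the durability=0
corner case concrete, here is the arithmetic for an extent whose only copy
sits on a device later reconfigured to durability=0, with
io_opts.data_replicas = 2 (values are illustrative, using the variables from
this hunk):

	/*
	 * durability_have     = 0  (the existing pointer contributes nothing)
	 * durability_removing = 0  (dropping it removes no durability either)
	 * durability_required = max(0, 2 - 0) = 2
	 *
	 * m->op.nr_replicas = min(durability_removing, durability_required)
	 *                   + m->data_opts.extra_replicas
	 *                   = min(0, 2) + 0 = 0   -> the move would write nothing
	 *
	 * The new clamp notices durability_have + durability_removing == 0 and
	 * bumps nr_replicas to 1, so rebalance actually writes a copy to a
	 * durable device.
	 */
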
......@@ -13,6 +13,7 @@
#include "btree_iter.h"
#include "btree_locking.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "buckets.h"
#include "debug.h"
#include "error.h"
......@@ -668,7 +669,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
i->size = size;
i->ret = 0;
do {
while (1) {
err = flush_buf(i);
if (err)
return err;
......@@ -676,9 +677,12 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
if (!i->size)
break;
if (done)
break;
done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter);
i->iter++;
} while (!done);
}
if (i->buf.allocation_failure)
return -ENOMEM;
......@@ -693,13 +697,45 @@ static const struct file_operations journal_pins_ops = {
.read = bch2_journal_pins_read,
};
static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
struct bch_fs *c = i->c;
int err;
i->ubuf = buf;
i->size = size;
i->ret = 0;
if (!i->iter) {
bch2_btree_updates_to_text(&i->buf, c);
i->iter++;
}
err = flush_buf(i);
if (err)
return err;
if (i->buf.allocation_failure)
return -ENOMEM;
return i->ret;
}
static const struct file_operations btree_updates_ops = {
.owner = THIS_MODULE,
.open = bch2_dump_open,
.release = bch2_dump_release,
.read = bch2_btree_updates_read,
};
static int btree_transaction_stats_open(struct inode *inode, struct file *file)
{
struct bch_fs *c = inode->i_private;
struct dump_iter *i;
i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL);
if (!i)
return -ENOMEM;
......@@ -866,6 +902,20 @@ void bch2_fs_debug_exit(struct bch_fs *c)
debugfs_remove_recursive(c->fs_debug_dir);
}
static void bch2_fs_debug_btree_init(struct bch_fs *c, struct btree_debug *bd)
{
struct dentry *d;
d = debugfs_create_dir(bch2_btree_id_str(bd->id), c->btree_debug_dir);
debugfs_create_file("keys", 0400, d, bd, &btree_debug_ops);
debugfs_create_file("formats", 0400, d, bd, &btree_format_debug_ops);
debugfs_create_file("bfloat-failed", 0400, d, bd,
&bfloat_failed_debug_ops);
}
void bch2_fs_debug_init(struct bch_fs *c)
{
struct btree_debug *bd;
......@@ -888,6 +938,9 @@ void bch2_fs_debug_init(struct bch_fs *c)
debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
c->btree_debug, &journal_pins_ops);
debugfs_create_file("btree_updates", 0400, c->fs_debug_dir,
c->btree_debug, &btree_updates_ops);
debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
c, &btree_transaction_stats_op);
......@@ -902,21 +955,7 @@ void bch2_fs_debug_init(struct bch_fs *c)
bd < c->btree_debug + ARRAY_SIZE(c->btree_debug);
bd++) {
bd->id = bd - c->btree_debug;
debugfs_create_file(bch2_btree_id_str(bd->id),
0400, c->btree_debug_dir, bd,
&btree_debug_ops);
snprintf(name, sizeof(name), "%s-formats",
bch2_btree_id_str(bd->id));
debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
&btree_format_debug_ops);
snprintf(name, sizeof(name), "%s-bfloat-failed",
bch2_btree_id_str(bd->id));
debugfs_create_file(name, 0400, c->btree_debug_dir, bd,
&bfloat_failed_debug_ops);
bch2_fs_debug_btree_init(c, bd);
}
}
......
......@@ -115,7 +115,7 @@ static void swap_bytes(void *a, void *b, size_t n)
struct wrapper {
cmp_func_t cmp;
swap_func_t swap;
swap_func_t swap_func;
};
/*
......@@ -125,7 +125,7 @@ struct wrapper {
static void do_swap(void *a, void *b, size_t size, swap_r_func_t swap_func, const void *priv)
{
if (swap_func == SWAP_WRAPPER) {
((const struct wrapper *)priv)->swap(a, b, (int)size);
((const struct wrapper *)priv)->swap_func(a, b, (int)size);
return;
}
......@@ -174,7 +174,7 @@ void eytzinger0_sort_r(void *base, size_t n, size_t size,
int i, c, r;
/* called from 'sort' without swap function, let's pick the default */
if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap)
if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func)
swap_func = NULL;
if (!swap_func) {
......@@ -227,7 +227,7 @@ void eytzinger0_sort(void *base, size_t n, size_t size,
{
struct wrapper w = {
.cmp = cmp_func,
.swap = swap_func,
.swap_func = swap_func,
};
return eytzinger0_sort_r(base, n, size, _CMP_WRAPPER, SWAP_WRAPPER, &w);
......
......@@ -242,8 +242,8 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
(_i) = eytzinger0_next((_i), (_size)))
/* return greatest node <= @search, or -1 if not found */
static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
cmp_func_t cmp, const void *search)
static inline int eytzinger0_find_le(void *base, size_t nr, size_t size,
cmp_func_t cmp, const void *search)
{
unsigned i, n = 0;
......@@ -256,18 +256,32 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
} while (n < nr);
if (n & 1) {
/* @i was greater than @search, return previous node: */
/*
* @i was greater than @search, return previous node:
*
* if @i was leftmost/smallest element,
* eytzinger0_prev(eytzinger0_first())) returns -1, as expected
*/
return eytzinger0_prev(i, nr);
} else {
return i;
}
}
static inline ssize_t eytzinger0_find_gt(void *base, size_t nr, size_t size,
cmp_func_t cmp, const void *search)
static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size,
cmp_func_t cmp, const void *search)
{
ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search);
return eytzinger0_next(idx, size);
/*
* if eytzinger0_find_le() returned -1 - no element was <= search - we
* want to return the first element; next/prev identities mean this works
* as expected
*
* similarly if find_le() returns last element, we should return -1;
* identities mean this all works out:
*/
return eytzinger0_next(idx, nr);
}
#define eytzinger0_find(base, nr, size, _cmp, search) \
......
......@@ -67,6 +67,8 @@ void bch2_journal_set_watermark(struct journal *j)
track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full], low_on_wb))
trace_and_count(c, journal_full, c);
mod_bit(JOURNAL_SPACE_LOW, &j->flags, low_on_space || low_on_pin);
swap(watermark, j->watermark);
if (watermark > j->watermark)
journal_wake(j);
......
......@@ -134,6 +134,7 @@ enum journal_flags {
JOURNAL_STARTED,
JOURNAL_MAY_SKIP_FLUSH,
JOURNAL_NEED_FLUSH_WRITE,
JOURNAL_SPACE_LOW,
};
/* Reasons we may fail to get a journal reservation: */
......
......@@ -47,20 +47,6 @@ void bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree)
}
}
static bool btree_id_is_alloc(enum btree_id id)
{
switch (id) {
case BTREE_ID_alloc:
case BTREE_ID_backpointers:
case BTREE_ID_need_discard:
case BTREE_ID_freespace:
case BTREE_ID_bucket_gens:
return true;
default:
return false;
}
}
/* for -o reconstruct_alloc: */
static void bch2_reconstruct_alloc(struct bch_fs *c)
{
......
......@@ -125,6 +125,15 @@ static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ances
return s->parent;
}
static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
{
const struct snapshot_t *s = __snapshot_t(t, id);
if (!s)
return false;
return test_bit(ancestor - id - 1, s->is_ancestor);
}
bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
{
bool ret;
......@@ -140,13 +149,11 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
while (id && id < ancestor - IS_ANCESTOR_BITMAP)
id = get_ancestor_below(t, id, ancestor);
if (id && id < ancestor) {
ret = test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor);
ret = id && id < ancestor
? test_ancestor_bitmap(t, id, ancestor)
: id == ancestor;
EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
} else {
ret = id == ancestor;
}
EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
out:
rcu_read_unlock();
......
......@@ -288,8 +288,13 @@ static void __bch2_fs_read_only(struct bch_fs *c)
if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
!test_bit(BCH_FS_emergency_ro, &c->flags))
set_bit(BCH_FS_clean_shutdown, &c->flags);
bch2_fs_journal_stop(&c->journal);
bch_info(c, "%sshutdown complete, journal seq %llu",
test_bit(BCH_FS_clean_shutdown, &c->flags) ? "" : "un",
c->journal.seq_ondisk);
/*
* After stopping journal:
*/
......
......@@ -17,7 +17,6 @@
#include "btree_iter.h"
#include "btree_key_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
......@@ -166,7 +165,6 @@ read_attribute(btree_write_stats);
read_attribute(btree_cache_size);
read_attribute(compression_stats);
read_attribute(journal_debug);
read_attribute(btree_updates);
read_attribute(btree_cache);
read_attribute(btree_key_cache);
read_attribute(stripes_heap);
......@@ -415,9 +413,6 @@ SHOW(bch2_fs)
if (attr == &sysfs_journal_debug)
bch2_journal_debug_to_text(out, &c->journal);
if (attr == &sysfs_btree_updates)
bch2_btree_updates_to_text(out, c);
if (attr == &sysfs_btree_cache)
bch2_btree_cache_to_text(out, c);
......@@ -639,7 +634,6 @@ SYSFS_OPS(bch2_fs_internal);
struct attribute *bch2_fs_internal_files[] = {
&sysfs_flags,
&sysfs_journal_debug,
&sysfs_btree_updates,
&sysfs_btree_cache,
&sysfs_btree_key_cache,
&sysfs_new_stripes,
......
......@@ -672,7 +672,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos)
bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos,
BTREE_ITER_INTENT);
k = bch2_btree_iter_peek(&iter);
k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX));
ret = bkey_err(k);
if (ret)
goto err;
......
......@@ -788,6 +788,14 @@ static inline int copy_from_user_errcode(void *to, const void __user *from, unsi
#endif
static inline void mod_bit(long nr, volatile unsigned long *addr, bool v)
{
if (v)
set_bit(nr, addr);
else
clear_bit(nr, addr);
}
static inline void __set_bit_le64(size_t bit, __le64 *addr)
{
addr[bit / 64] |= cpu_to_le64(BIT_ULL(bit % 64));
......@@ -795,7 +803,7 @@ static inline void __set_bit_le64(size_t bit, __le64 *addr)
static inline void __clear_bit_le64(size_t bit, __le64 *addr)
{
addr[bit / 64] &= !cpu_to_le64(BIT_ULL(bit % 64));
addr[bit / 64] &= ~cpu_to_le64(BIT_ULL(bit % 64));
}
static inline bool test_bit_le64(size_t bit, __le64 *addr)
......
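
The __clear_bit_le64() one-character fix above is easy to gloss over; a tiny
illustration of why logical NOT was wrong (plain u64 values, purely
illustrative - not part of the commit):

	u64 w = 0xff;

	w &= !BIT_ULL(3);	/* old bug: !0x08 == 0, so w becomes 0 - whole word cleared */

	w = 0xff;
	w &= ~BIT_ULL(3);	/* fix: mask is ~0x08, so w becomes 0xf7 - only bit 3 cleared */
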