Commit ff9bce3d authored by Linus Torvalds

Merge tag 'bcachefs-2024-05-30' of https://evilpiepirate.org/git/bcachefs

Pull bcachefs fixes from Kent Overstreet:
 "Assorted odds and ends...

   - two downgrade fixes

   - a couple snapshot deletion and repair fixes, thanks to noradtux for
     finding these and providing the image to debug them

   - a couple assert fixes

   - convert to folio helper, from Matthew

   - some improved error messages

   - bit of code reorganization (just moving things around); doing this
     while things are quiet so I'm not rebasing fixes past reorgs

   - don't return -EROFS on inconsistency error in recovery, this
     confuses util-linux and has it retry the mount

   - fix failure to return error on misaligned dio write; reported as an
     issue with coreutils shred"

* tag 'bcachefs-2024-05-30' of https://evilpiepirate.org/git/bcachefs: (21 commits)
  bcachefs: Fix failure to return error on misaligned dio write
  bcachefs: Don't return -EROFS from mount on inconsistency error
  bcachefs: Fix uninitialized var warning
  bcachefs: Split out sb-errors_format.h
  bcachefs: Split out journal_seq_blacklist_format.h
  bcachefs: Split out replicas_format.h
  bcachefs: Split out disk_groups_format.h
  bcachefs: split out sb-downgrade_format.h
  bcachefs: split out sb-members_format.h
  bcachefs: Better fsck error message for key version
  bcachefs: btree_gc can now handle unknown btrees
  bcachefs: add missing MODULE_DESCRIPTION()
  bcachefs: Fix setting of downgrade recovery passes/errors
  bcachefs: Run check_key_has_snapshot in snapshot_delete_keys()
  bcachefs: Refactor delete_dead_snapshots()
  bcachefs: Fix locking assert
  bcachefs: Fix lookup_first_inode() when inode_generations are present
  bcachefs: Plumb bkey into __btree_err()
  bcachefs: Use copy_folio_from_iter_atomic()
  bcachefs: Fix sb-downgrade validation
  ...
parents d8ec1985 7b038b56
......@@ -690,7 +690,7 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
struct bpos bucket_pos;
struct bpos bucket_pos = POS_MIN;
struct bch_backpointer bp;
if (p.ptr.cached)
......
......@@ -457,6 +457,7 @@ enum bch_time_stats {
};
#include "alloc_types.h"
#include "btree_gc_types.h"
#include "btree_types.h"
#include "btree_node_scan_types.h"
#include "btree_write_buffer_types.h"
......@@ -488,49 +489,6 @@ enum bch_time_stats {
struct btree;
enum gc_phase {
GC_PHASE_NOT_RUNNING,
GC_PHASE_START,
GC_PHASE_SB,
GC_PHASE_BTREE_stripes,
GC_PHASE_BTREE_extents,
GC_PHASE_BTREE_inodes,
GC_PHASE_BTREE_dirents,
GC_PHASE_BTREE_xattrs,
GC_PHASE_BTREE_alloc,
GC_PHASE_BTREE_quotas,
GC_PHASE_BTREE_reflink,
GC_PHASE_BTREE_subvolumes,
GC_PHASE_BTREE_snapshots,
GC_PHASE_BTREE_lru,
GC_PHASE_BTREE_freespace,
GC_PHASE_BTREE_need_discard,
GC_PHASE_BTREE_backpointers,
GC_PHASE_BTREE_bucket_gens,
GC_PHASE_BTREE_snapshot_trees,
GC_PHASE_BTREE_deleted_inodes,
GC_PHASE_BTREE_logged_ops,
GC_PHASE_BTREE_rebalance_work,
GC_PHASE_BTREE_subvolume_children,
GC_PHASE_PENDING_DELETE,
};
struct gc_pos {
enum gc_phase phase;
u16 level;
struct bpos pos;
};
struct reflink_gc {
u64 offset;
u32 size;
u32 refcount;
};
typedef GENRADIX(struct reflink_gc) reflink_gc_table;
struct io_count {
u64 sectors[2][BCH_DATA_NR];
};
......
......@@ -503,16 +503,22 @@ struct bch_sb_field {
#include "alloc_background_format.h"
#include "extents_format.h"
#include "reflink_format.h"
#include "ec_format.h"
#include "inode_format.h"
#include "dirent_format.h"
#include "xattr_format.h"
#include "quota_format.h"
#include "disk_groups_format.h"
#include "inode_format.h"
#include "journal_seq_blacklist_format.h"
#include "logged_ops_format.h"
#include "quota_format.h"
#include "reflink_format.h"
#include "replicas_format.h"
#include "snapshot_format.h"
#include "subvolume_format.h"
#include "sb-counters_format.h"
#include "sb-downgrade_format.h"
#include "sb-errors_format.h"
#include "sb-members_format.h"
#include "xattr_format.h"
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
......@@ -545,107 +551,6 @@ struct bch_sb_field_journal_v2 {
} d[];
};
/* BCH_SB_FIELD_members_v1: */
#define BCH_MIN_NR_NBUCKETS (1 << 6)
#define BCH_IOPS_MEASUREMENTS() \
x(seqread, 0) \
x(seqwrite, 1) \
x(randread, 2) \
x(randwrite, 3)
enum bch_iops_measurement {
#define x(t, n) BCH_IOPS_##t = n,
BCH_IOPS_MEASUREMENTS()
#undef x
BCH_IOPS_NR
};
#define BCH_MEMBER_ERROR_TYPES() \
x(read, 0) \
x(write, 1) \
x(checksum, 2)
enum bch_member_error_type {
#define x(t, n) BCH_MEMBER_ERROR_##t = n,
BCH_MEMBER_ERROR_TYPES()
#undef x
BCH_MEMBER_ERROR_NR
};
struct bch_member {
__uuid_t uuid;
__le64 nbuckets; /* device size */
__le16 first_bucket; /* index of first bucket used */
__le16 bucket_size; /* sectors */
__u8 btree_bitmap_shift;
__u8 pad[3];
__le64 last_mount; /* time_t */
__le64 flags;
__le32 iops[4];
__le64 errors[BCH_MEMBER_ERROR_NR];
__le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
__le64 errors_reset_time;
__le64 seq;
__le64 btree_allocated_bitmap;
/*
* On recovery from a clean shutdown we don't normally read the journal,
* but we still want to resume writing from where we left off so we
* don't overwrite more than is necessary, for list journal debugging:
*/
__le32 last_journal_bucket;
__le32 last_journal_bucket_offset;
};
/*
* This limit comes from the bucket_gens array - it's a single allocation, and
* kernel allocation are limited to INT_MAX
*/
#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64)
#define BCH_MEMBER_V1_BYTES 56
LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15)
LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20)
LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28)
LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30)
LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
struct bch_member, flags, 30, 31)
#if 0
LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
#endif
#define BCH_MEMBER_STATES() \
x(rw, 0) \
x(ro, 1) \
x(failed, 2) \
x(spare, 3)
enum bch_member_state {
#define x(t, n) BCH_MEMBER_STATE_##t = n,
BCH_MEMBER_STATES()
#undef x
BCH_MEMBER_STATE_NR
};
struct bch_sb_field_members_v1 {
struct bch_sb_field field;
struct bch_member _members[]; //Members are now variable size
};
struct bch_sb_field_members_v2 {
struct bch_sb_field field;
__le16 member_bytes; //size of single member entry
u8 pad[6];
struct bch_member _members[];
};
/* BCH_SB_FIELD_crypt: */
struct nonce {
......@@ -694,8 +599,6 @@ LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16);
LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32);
LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48);
/* BCH_SB_FIELD_replicas: */
#define BCH_DATA_TYPES() \
x(free, 0) \
x(sb, 1) \
......@@ -738,50 +641,6 @@ static inline bool data_type_is_hidden(enum bch_data_type type)
}
}
struct bch_replicas_entry_v0 {
__u8 data_type;
__u8 nr_devs;
__u8 devs[];
} __packed;
struct bch_sb_field_replicas_v0 {
struct bch_sb_field field;
struct bch_replicas_entry_v0 entries[];
} __packed __aligned(8);
struct bch_replicas_entry_v1 {
__u8 data_type;
__u8 nr_devs;
__u8 nr_required;
__u8 devs[];
} __packed;
#define replicas_entry_bytes(_i) \
(offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
struct bch_sb_field_replicas {
struct bch_sb_field field;
struct bch_replicas_entry_v1 entries[];
} __packed __aligned(8);
/* BCH_SB_FIELD_disk_groups: */
#define BCH_SB_LABEL_SIZE 32
struct bch_disk_group {
__u8 label[BCH_SB_LABEL_SIZE];
__le64 flags[2];
} __packed __aligned(8);
LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
struct bch_sb_field_disk_groups {
struct bch_sb_field field;
struct bch_disk_group entries[];
} __packed __aligned(8);
/*
* On clean shutdown, store btree roots and current journal sequence number in
* the superblock:
......@@ -809,27 +668,6 @@ struct bch_sb_field_clean {
__u64 _data[];
};
struct journal_seq_blacklist_entry {
__le64 start;
__le64 end;
};
struct bch_sb_field_journal_seq_blacklist {
struct bch_sb_field field;
struct journal_seq_blacklist_entry start[];
};
struct bch_sb_field_errors {
struct bch_sb_field field;
struct bch_sb_field_error_entry {
__le64 v;
__le64 last_error_time;
} entries[];
};
LE64_BITMASK(BCH_SB_ERROR_ENTRY_ID, struct bch_sb_field_error_entry, v, 0, 16);
LE64_BITMASK(BCH_SB_ERROR_ENTRY_NR, struct bch_sb_field_error_entry, v, 16, 64);
struct bch_sb_field_ext {
struct bch_sb_field field;
__le64 recovery_passes_required[2];
......@@ -837,18 +675,6 @@ struct bch_sb_field_ext {
__le64 btrees_lost_data;
};
struct bch_sb_field_downgrade_entry {
__le16 version;
__le64 recovery_passes[2];
__le16 nr_errors;
__le16 errors[] __counted_by(nr_errors);
} __packed __aligned(2);
struct bch_sb_field_downgrade {
struct bch_sb_field field;
struct bch_sb_field_downgrade_entry entries[];
};
/* Superblock: */
/*
......@@ -909,7 +735,6 @@ unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_re
#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1)
#define BCH_SB_SECTOR 8
#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */
#define BCH_SB_LAYOUT_SIZE_BITS_MAX 16 /* 32 MB */
......
......@@ -585,16 +585,17 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
if (fsck_err_on(k.k->version.lo > atomic64_read(&c->key_version), c,
bkey_version_in_future,
"key version number higher than recorded: %llu > %llu",
k.k->version.lo,
atomic64_read(&c->key_version)))
"key version number higher than recorded %llu\n %s",
atomic64_read(&c->key_version),
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
atomic64_set(&c->key_version, k.k->version.lo);
}
if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k),
c, btree_bitmap_not_marked,
"btree ptr not marked in member info btree allocated bitmap\n %s",
(bch2_bkey_val_to_text(&buf, c, k),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k),
buf.buf))) {
mutex_lock(&c->sb_lock);
bch2_dev_btree_bitmap_mark(c, k);
......@@ -673,8 +674,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in
static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r)
{
return (int) btree_id_to_gc_phase(l) -
(int) btree_id_to_gc_phase(r);
return cmp_int(gc_btree_order(l), gc_btree_order(r));
}
static int bch2_gc_btrees(struct bch_fs *c)
......@@ -711,7 +711,7 @@ static int bch2_gc_btrees(struct bch_fs *c)
static int bch2_mark_superblocks(struct bch_fs *c)
{
mutex_lock(&c->sb_lock);
gc_pos_set(c, gc_phase(GC_PHASE_SB));
gc_pos_set(c, gc_phase(GC_PHASE_sb));
int ret = bch2_trans_mark_dev_sbs_flags(c, BTREE_TRIGGER_gc);
mutex_unlock(&c->sb_lock);
......@@ -1209,7 +1209,7 @@ int bch2_check_allocations(struct bch_fs *c)
if (ret)
goto out;
gc_pos_set(c, gc_phase(GC_PHASE_START));
gc_pos_set(c, gc_phase(GC_PHASE_start));
ret = bch2_mark_superblocks(c);
BUG_ON(ret);
......@@ -1231,7 +1231,7 @@ int bch2_check_allocations(struct bch_fs *c)
percpu_down_write(&c->mark_lock);
/* Indicates that gc is no longer in progress: */
__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
__gc_pos_set(c, gc_phase(GC_PHASE_not_running));
bch2_gc_free(c);
percpu_up_write(&c->mark_lock);
......
......@@ -3,6 +3,7 @@
#define _BCACHEFS_BTREE_GC_H
#include "bkey.h"
#include "btree_gc_types.h"
#include "btree_types.h"
int bch2_check_topology(struct bch_fs *);
......@@ -32,36 +33,15 @@ int bch2_check_allocations(struct bch_fs *);
/* Position of (the start of) a gc phase: */
static inline struct gc_pos gc_phase(enum gc_phase phase)
{
return (struct gc_pos) {
.phase = phase,
.level = 0,
.pos = POS_MIN,
};
}
static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
{
return cmp_int(l.phase, r.phase) ?:
-cmp_int(l.level, r.level) ?:
bpos_cmp(l.pos, r.pos);
}
static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id)
{
switch (id) {
#define x(name, v, ...) case BTREE_ID_##name: return GC_PHASE_BTREE_##name;
BCH_BTREE_IDS()
#undef x
default:
BUG();
}
return (struct gc_pos) { .phase = phase, };
}
static inline struct gc_pos gc_pos_btree(enum btree_id btree, unsigned level,
struct bpos pos)
{
return (struct gc_pos) {
.phase = btree_id_to_gc_phase(btree),
.phase = GC_PHASE_btree,
.btree = btree,
.level = level,
.pos = pos,
};
......@@ -76,6 +56,22 @@ static inline struct gc_pos gc_pos_btree_node(struct btree *b)
return gc_pos_btree(b->c.btree_id, b->c.level, b->key.k.p);
}
static inline int gc_btree_order(enum btree_id btree)
{
if (btree == BTREE_ID_stripes)
return -1;
return btree;
}
static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r)
{
return cmp_int(l.phase, r.phase) ?:
cmp_int(gc_btree_order(l.btree),
gc_btree_order(r.btree)) ?:
-cmp_int(l.level, r.level) ?:
bpos_cmp(l.pos, r.pos);
}
static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos)
{
unsigned seq;
......
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_BTREE_GC_TYPES_H
#define _BCACHEFS_BTREE_GC_TYPES_H
#include <linux/generic-radix-tree.h>
enum gc_phase {
GC_PHASE_not_running,
GC_PHASE_start,
GC_PHASE_sb,
GC_PHASE_btree,
};
struct gc_pos {
enum gc_phase phase:8;
enum btree_id btree:8;
u16 level;
struct bpos pos;
};
struct reflink_gc {
u64 offset;
u32 size;
u32 refcount;
};
typedef GENRADIX(struct reflink_gc) reflink_gc_table;
#endif /* _BCACHEFS_BTREE_GC_TYPES_H */
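The reworked gc position drops the per-btree GC_PHASE_BTREE_* values in favour of a single GC_PHASE_btree plus a btree field, with gc_btree_order() forcing the stripes btree to sort first. A minimal userspace sketch of that ordering, assuming illustrative btree ids and a simplified gc_pos (the real ids come from BCH_BTREE_IDS(), and the real comparison also folds in bpos_cmp()); it builds with gcc since it reuses the kernel's a ?: b idiom:

#include <stdio.h>

#define cmp_int(l, r)	(((l) > (r)) - ((l) < (r)))

/* Illustrative ids only; bcachefs generates these from BCH_BTREE_IDS(). */
enum { BTREE_ID_extents = 0, BTREE_ID_inodes = 1, BTREE_ID_stripes = 6 };

struct gc_pos_lite { int phase, btree, level; };

static int gc_btree_order(int btree)
{
	return btree == BTREE_ID_stripes ? -1 : btree;	/* stripes is walked first */
}

static int gc_pos_cmp_lite(struct gc_pos_lite l, struct gc_pos_lite r)
{
	return cmp_int(l.phase, r.phase) ?:
		cmp_int(gc_btree_order(l.btree), gc_btree_order(r.btree)) ?:
		-cmp_int(l.level, r.level);		/* higher levels sort first */
}

int main(void)
{
	struct gc_pos_lite stripes = { 3 /* GC_PHASE_btree */, BTREE_ID_stripes, 0 };
	struct gc_pos_lite extents = { 3, BTREE_ID_extents, 0 };

	/* stripes sorts before extents even though its id is numerically larger */
	printf("%d\n", gc_pos_cmp_lite(stripes, extents));	/* prints -1 */
	return 0;
}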
......@@ -424,16 +424,16 @@ static int btree_key_cache_fill(struct btree_trans *trans,
goto err;
}
if (!bch2_btree_node_relock(trans, ck_path, 0)) {
ret = bch2_trans_relock(trans);
if (ret) {
kfree(new_k);
trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
goto err;
}
ret = bch2_trans_relock(trans);
if (ret) {
if (!bch2_btree_node_relock(trans, ck_path, 0)) {
kfree(new_k);
trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path);
ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill);
goto err;
}
}
......
......@@ -1134,7 +1134,7 @@ static int __trigger_extent(struct btree_trans *trans,
r.e.nr_required = 1;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
s64 disk_sectors;
s64 disk_sectors = 0;
ret = bch2_trigger_pointer(trans, btree_id, level, k, p, entry, &disk_sectors, flags);
if (ret < 0)
return ret;
......
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_DISK_GROUPS_FORMAT_H
#define _BCACHEFS_DISK_GROUPS_FORMAT_H
#define BCH_SB_LABEL_SIZE 32
struct bch_disk_group {
__u8 label[BCH_SB_LABEL_SIZE];
__le64 flags[2];
} __packed __aligned(8);
LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1)
LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6)
LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24)
struct bch_sb_field_disk_groups {
struct bch_sb_field field;
struct bch_disk_group entries[];
} __packed __aligned(8);
#endif /* _BCACHEFS_DISK_GROUPS_FORMAT_H */
......@@ -908,7 +908,7 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp)
if (!genradix_ptr_alloc(&c->stripes, idx, gfp))
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING &&
if (c->gc_pos.phase != GC_PHASE_not_running &&
!genradix_ptr_alloc(&c->gc_stripes, idx, gfp))
return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc;
......
......@@ -437,8 +437,8 @@ static void bch2_writepage_io_done(struct bch_write_op *op)
*/
/*
* PageWriteback is effectively our ref on the inode - fixup i_blocks
* before calling end_page_writeback:
* The writeback flag is effectively our ref on the inode -
* fixup i_blocks before calling folio_end_writeback:
*/
bch2_i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta);
......@@ -898,7 +898,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
darray_for_each(fs, fi) {
f = *fi;
f_len = min(end, folio_end_pos(f)) - f_pos;
f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter);
f_copied = copy_folio_from_iter_atomic(f, f_offset, f_len, iter);
if (!f_copied) {
folios_trunc(&fs, fi);
break;
......
......@@ -609,8 +609,10 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter)
if (unlikely(ret))
goto err_put_write_ref;
if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1)))
if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) {
ret = -EINVAL;
goto err_put_write_ref;
}
inode_dio_begin(&inode->v);
bch2_pagecache_block_get(inode);
......
......@@ -1939,8 +1939,7 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
ret = bch2_err_class(ret);
return ERR_PTR(ret);
goto err;
}
c = sb->s_fs_info;
......@@ -2016,6 +2015,15 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
err_put_super:
__bch2_fs_stop(c);
deactivate_locked_super(sb);
err:
/*
* On an inconsistency error in recovery we might see an -EROFS derived
* errorcode (from the journal), but we don't want to return that to
* userspace as that causes util-linux to retry the mount RO - which is
* confusing:
*/
if (bch2_err_matches(ret, EROFS) && ret != -EROFS)
ret = -EIO;
return ERR_PTR(bch2_err_class(ret));
}
......
......@@ -77,21 +77,17 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
struct bkey_s_c k;
int ret;
bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes,
POS(0, inode_nr),
BTREE_ITER_all_snapshots);
k = bch2_btree_iter_peek(&iter);
ret = bkey_err(k);
if (ret)
goto err;
if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) {
ret = -BCH_ERR_ENOENT_inode;
goto err;
for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr),
BTREE_ITER_all_snapshots, k, ret) {
if (k.k->p.offset != inode_nr)
break;
if (!bkey_is_inode(k.k))
continue;
ret = bch2_inode_unpack(k, inode);
goto found;
}
ret = bch2_inode_unpack(k, inode);
err:
ret = -BCH_ERR_ENOENT_inode;
found:
bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr);
bch2_trans_iter_exit(trans, &iter);
return ret;
......@@ -770,25 +766,6 @@ static int get_visible_inodes(struct btree_trans *trans,
return ret;
}
static int check_key_has_snapshot(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
int ret = 0;
if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
bkey_in_missing_snapshot,
"key in missing snapshot: %s",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_internal_snapshot_node) ?: 1;
fsck_err:
printbuf_exit(&buf);
return ret;
}
static int hash_redo_key(struct btree_trans *trans,
const struct bch_hash_desc desc,
struct bch_hash_info *hash_info,
......@@ -983,7 +960,7 @@ static int check_inode(struct btree_trans *trans,
bool do_update = false;
int ret;
ret = check_key_has_snapshot(trans, iter, k);
ret = bch2_check_key_has_snapshot(trans, iter, k);
if (ret < 0)
goto err;
if (ret)
......@@ -1487,7 +1464,7 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter,
struct printbuf buf = PRINTBUF;
int ret = 0;
ret = check_key_has_snapshot(trans, iter, k);
ret = bch2_check_key_has_snapshot(trans, iter, k);
if (ret) {
ret = ret < 0 ? ret : 0;
goto out;
......@@ -2010,7 +1987,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
struct printbuf buf = PRINTBUF;
int ret = 0;
ret = check_key_has_snapshot(trans, iter, k);
ret = bch2_check_key_has_snapshot(trans, iter, k);
if (ret) {
ret = ret < 0 ? ret : 0;
goto out;
......@@ -2165,7 +2142,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter,
struct inode_walker_entry *i;
int ret;
ret = check_key_has_snapshot(trans, iter, k);
ret = bch2_check_key_has_snapshot(trans, iter, k);
if (ret < 0)
return ret;
if (ret)
......
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H
#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H
struct journal_seq_blacklist_entry {
__le64 start;
__le64 end;
};
struct bch_sb_field_journal_seq_blacklist {
struct bch_sb_field field;
struct journal_seq_blacklist_entry start[];
};
#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_FORMAT_H */
......@@ -217,4 +217,5 @@ static struct kunit_suite mean_and_variance_test_suite = {
kunit_test_suite(mean_and_variance_test_suite);
MODULE_AUTHOR("Daniel B. Hill");
MODULE_DESCRIPTION("bcachefs filesystem mean and variance unit tests");
MODULE_LICENSE("GPL");
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_REPLICAS_FORMAT_H
#define _BCACHEFS_REPLICAS_FORMAT_H
struct bch_replicas_entry_v0 {
__u8 data_type;
__u8 nr_devs;
__u8 devs[];
} __packed;
struct bch_sb_field_replicas_v0 {
struct bch_sb_field field;
struct bch_replicas_entry_v0 entries[];
} __packed __aligned(8);
struct bch_replicas_entry_v1 {
__u8 data_type;
__u8 nr_devs;
__u8 nr_required;
__u8 devs[];
} __packed;
struct bch_sb_field_replicas {
struct bch_sb_field field;
struct bch_replicas_entry_v1 entries[];
} __packed __aligned(8);
#define replicas_entry_bytes(_i) \
(offsetof(typeof(*(_i)), devs) + (_i)->nr_devs)
#endif /* _BCACHEFS_REPLICAS_FORMAT_H */
......@@ -146,10 +146,17 @@ static int bch2_sb_downgrade_validate(struct bch_sb *sb, struct bch_sb_field *f,
for (const struct bch_sb_field_downgrade_entry *i = e->entries;
(void *) i < vstruct_end(&e->field);
i = downgrade_entry_next_c(i)) {
/*
* Careful: sb_field_downgrade_entry is only 2 byte aligned, but
* section sizes are 8 byte aligned - an empty entry spanning
* the end of the section is allowed (and ignored):
*/
if ((void *) &i->errors[0] > vstruct_end(&e->field))
break;
if (flags & BCH_VALIDATE_write &&
((void *) &i->errors[0] > vstruct_end(&e->field) ||
(void *) downgrade_entry_next_c(i) > vstruct_end(&e->field))) {
prt_printf(err, "downgrade entry overruns end of superblock section)");
(void *) downgrade_entry_next_c(i) > vstruct_end(&e->field)) {
prt_printf(err, "downgrade entry overruns end of superblock section");
return -BCH_ERR_invalid_sb_downgrade;
}
......
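The alignment subtlety called out in that comment is easier to see with the numbers written out: the fixed part of a downgrade entry is 20 bytes and only 2-byte aligned, while superblock section sizes are rounded up to 8 bytes, so up to 6 bytes of padding can trail the last real entry and must not be mistaken for an overrunning entry. A standalone sketch of the arithmetic, using host-endian userspace types in place of __le16/__le64:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Same field layout as bch_sb_field_downgrade_entry, host-endian types. */
struct downgrade_entry {
	uint16_t version;
	uint64_t recovery_passes[2];
	uint16_t nr_errors;
	uint16_t errors[];
} __attribute__((packed, aligned(2)));

int main(void)
{
	size_t hdr     = offsetof(struct downgrade_entry, errors); /* 20 bytes */
	size_t entry   = hdr + 3 * sizeof(uint16_t);               /* nr_errors == 3 */
	size_t section = (entry + 7) & ~(size_t)7;                 /* rounded up to 8 */

	printf("header %zu, entry %zu, section %zu, trailing pad %zu\n",
	       hdr, entry, section, section - entry);

	/*
	 * A would-be entry starting in those trailing pad bytes cannot fit
	 * even its fixed header, so the validation loop above skips it
	 * rather than reporting an overrun.
	 */
	return 0;
}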
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_SB_DOWNGRADE_FORMAT_H
#define _BCACHEFS_SB_DOWNGRADE_FORMAT_H
struct bch_sb_field_downgrade_entry {
__le16 version;
__le64 recovery_passes[2];
__le16 nr_errors;
__le16 errors[] __counted_by(nr_errors);
} __packed __aligned(2);
struct bch_sb_field_downgrade {
struct bch_sb_field field;
struct bch_sb_field_downgrade_entry entries[];
};
#endif /* _BCACHEFS_SB_DOWNGRADE_FORMAT_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_SB_MEMBERS_FORMAT_H
#define _BCACHEFS_SB_MEMBERS_FORMAT_H
/*
* We refer to members with bitmasks in various places - but we need to get rid
* of this limit:
*/
#define BCH_SB_MEMBERS_MAX 64
#define BCH_MIN_NR_NBUCKETS (1 << 6)
#define BCH_IOPS_MEASUREMENTS() \
x(seqread, 0) \
x(seqwrite, 1) \
x(randread, 2) \
x(randwrite, 3)
enum bch_iops_measurement {
#define x(t, n) BCH_IOPS_##t = n,
BCH_IOPS_MEASUREMENTS()
#undef x
BCH_IOPS_NR
};
#define BCH_MEMBER_ERROR_TYPES() \
x(read, 0) \
x(write, 1) \
x(checksum, 2)
enum bch_member_error_type {
#define x(t, n) BCH_MEMBER_ERROR_##t = n,
BCH_MEMBER_ERROR_TYPES()
#undef x
BCH_MEMBER_ERROR_NR
};
struct bch_member {
__uuid_t uuid;
__le64 nbuckets; /* device size */
__le16 first_bucket; /* index of first bucket used */
__le16 bucket_size; /* sectors */
__u8 btree_bitmap_shift;
__u8 pad[3];
__le64 last_mount; /* time_t */
__le64 flags;
__le32 iops[4];
__le64 errors[BCH_MEMBER_ERROR_NR];
__le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
__le64 errors_reset_time;
__le64 seq;
__le64 btree_allocated_bitmap;
/*
* On recovery from a clean shutdown we don't normally read the journal,
* but we still want to resume writing from where we left off so we
* don't overwrite more than is necessary, for list journal debugging:
*/
__le32 last_journal_bucket;
__le32 last_journal_bucket_offset;
};
/*
* This limit comes from the bucket_gens array - it's a single allocation, and
* kernel allocation are limited to INT_MAX
*/
#define BCH_MEMBER_NBUCKETS_MAX (INT_MAX - 64)
#define BCH_MEMBER_V1_BYTES 56
LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4)
/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */
LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15)
LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags, 15, 20)
LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28)
LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30)
LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED,
struct bch_member, flags, 30, 31)
#if 0
LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20);
LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40);
#endif
#define BCH_MEMBER_STATES() \
x(rw, 0) \
x(ro, 1) \
x(failed, 2) \
x(spare, 3)
enum bch_member_state {
#define x(t, n) BCH_MEMBER_STATE_##t = n,
BCH_MEMBER_STATES()
#undef x
BCH_MEMBER_STATE_NR
};
struct bch_sb_field_members_v1 {
struct bch_sb_field field;
struct bch_member _members[]; //Members are now variable size
};
struct bch_sb_field_members_v2 {
struct bch_sb_field field;
__le16 member_bytes; //size of single member entry
u8 pad[6];
struct bch_member _members[];
};
#endif /* _BCACHEFS_SB_MEMBERS_FORMAT_H */
......@@ -1042,6 +1042,25 @@ int bch2_reconstruct_snapshots(struct bch_fs *c)
return ret;
}
int bch2_check_key_has_snapshot(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct printbuf buf = PRINTBUF;
int ret = 0;
if (fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c,
bkey_in_missing_snapshot,
"key in missing snapshot %s, delete?",
(bch2_bkey_val_to_text(&buf, c, k), buf.buf)))
ret = bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_internal_snapshot_node) ?: 1;
fsck_err:
printbuf_exit(&buf);
return ret;
}
/*
* Mark a snapshot as deleted, for future cleanup:
*/
......@@ -1351,35 +1370,39 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
* that key to snapshot leaf nodes, where we can mutate it
*/
static int snapshot_delete_key(struct btree_trans *trans,
static int delete_dead_snapshots_process_key(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
snapshot_id_list *deleted,
snapshot_id_list *equiv_seen,
struct bpos *last_pos)
{
int ret = bch2_check_key_has_snapshot(trans, iter, k);
if (ret)
return ret < 0 ? ret : 0;
struct bch_fs *c = trans->c;
u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
if (!equiv) /* key for invalid snapshot node, but we chose not to delete */
return 0;
if (!bkey_eq(k.k->p, *last_pos))
equiv_seen->nr = 0;
*last_pos = k.k->p;
if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
snapshot_list_has_id(equiv_seen, equiv)) {
if (snapshot_list_has_id(deleted, k.k->p.snapshot))
return bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_internal_snapshot_node);
} else {
return snapshot_list_add(c, equiv_seen, equiv);
}
}
static int move_key_to_correct_snapshot(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
if (!bpos_eq(*last_pos, k.k->p) &&
snapshot_list_has_id(equiv_seen, equiv))
return bch2_btree_delete_at(trans, iter,
BTREE_UPDATE_internal_snapshot_node);
*last_pos = k.k->p;
ret = snapshot_list_add_nodup(c, equiv_seen, equiv);
if (ret)
return ret;
/*
* When we have a linear chain of snapshot nodes, we consider
......@@ -1389,21 +1412,20 @@ static int move_key_to_correct_snapshot(struct btree_trans *trans,
*
* If there are multiple keys in different snapshots at the same
* position, we're only going to keep the one in the newest
* snapshot - the rest have been overwritten and are redundant,
* and for the key we're going to keep we need to move it to the
* equivalance class ID if it's not there already.
* snapshot (we delete the others above) - the rest have been
* overwritten and are redundant, and for the key we're going to keep we
* need to move it to the equivalance class ID if it's not there
* already.
*/
if (equiv != k.k->p.snapshot) {
struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
struct btree_iter new_iter;
int ret;
ret = PTR_ERR_OR_ZERO(new);
int ret = PTR_ERR_OR_ZERO(new);
if (ret)
return ret;
new->k.p.snapshot = equiv;
struct btree_iter new_iter;
bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
BTREE_ITER_all_snapshots|
BTREE_ITER_cached|
......@@ -1538,7 +1560,6 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
struct btree_trans *trans;
snapshot_id_list deleted = { 0 };
snapshot_id_list deleted_interior = { 0 };
u32 id;
int ret = 0;
if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
......@@ -1585,33 +1606,20 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
if (ret)
goto err;
for (id = 0; id < BTREE_ID_NR; id++) {
for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
struct bpos last_pos = POS_MIN;
snapshot_id_list equiv_seen = { 0 };
struct disk_reservation res = { 0 };
if (!btree_type_has_snapshots(id))
continue;
/*
* deleted inodes btree is maintained by a trigger on the inodes
* btree - no work for us to do here, and it's not safe to scan
* it because we'll see out of date keys due to the btree write
* buffer:
*/
if (id == BTREE_ID_deleted_inodes)
if (!btree_type_has_snapshots(btree))
continue;
ret = for_each_btree_key_commit(trans, iter,
id, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
&res, NULL, BCH_TRANS_COMMIT_no_enospc,
snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
for_each_btree_key_commit(trans, iter,
id, POS_MIN,
btree, POS_MIN,
BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k,
&res, NULL, BCH_TRANS_COMMIT_no_enospc,
move_key_to_correct_snapshot(trans, &iter, k));
delete_dead_snapshots_process_key(trans, &iter, k, &deleted,
&equiv_seen, &last_pos));
bch2_disk_reservation_put(c, &res);
darray_exit(&equiv_seen);
......
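The equivalence-class walk that delete_dead_snapshots_process_key() now performs can be reduced to a small model: within one key position, only the first key seen from a given snapshot equivalence class is kept and every later one is deleted, with the seen set resetting whenever the position changes. A simplified, self-contained sketch (key_lite and the hard-coded keys are illustrative; the real code also drops keys whose snapshots are on the deleted list and rewrites kept keys to the equivalence class id):

#include <stdbool.h>
#include <stdio.h>

struct key_lite { unsigned pos, snapshot, equiv; };

#define MAX_SEEN 16

int main(void)
{
	struct key_lite keys[] = {
		{ .pos = 10, .snapshot = 7, .equiv = 4 },
		{ .pos = 10, .snapshot = 4, .equiv = 4 },  /* same class at same pos: redundant */
		{ .pos = 11, .snapshot = 4, .equiv = 4 },  /* new pos: seen set resets */
	};
	unsigned seen[MAX_SEEN], nr_seen = 0, last_pos = (unsigned)-1;

	for (unsigned i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
		struct key_lite k = keys[i];
		bool drop = false;

		if (k.pos != last_pos)
			nr_seen = 0;
		last_pos = k.pos;

		for (unsigned j = 0; j < nr_seen; j++)
			if (seen[j] == k.equiv)
				drop = true;

		if (!drop && nr_seen < MAX_SEEN)
			seen[nr_seen++] = k.equiv;

		printf("pos %u snapshot %u: %s\n", k.pos, k.snapshot,
		       drop ? "delete" : "keep (move to equiv id if needed)");
	}
	return 0;
}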
......@@ -242,6 +242,7 @@ int bch2_snapshot_node_create(struct btree_trans *, u32,
int bch2_check_snapshot_trees(struct bch_fs *);
int bch2_check_snapshots(struct bch_fs *);
int bch2_reconstruct_snapshots(struct bch_fs *);
int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c);
int bch2_snapshot_node_set_deleted(struct btree_trans *, u32);
void bch2_delete_dead_snapshots_work(struct work_struct *);
......
......@@ -1132,18 +1132,12 @@ bool bch2_check_version_downgrade(struct bch_fs *c)
* c->sb will be checked before we write the superblock, so update it as
* well:
*/
if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) {
if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current)
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
c->sb.version_upgrade_complete = bcachefs_metadata_version_current;
}
if (c->sb.version > bcachefs_metadata_version_current) {
if (c->sb.version > bcachefs_metadata_version_current)
c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
c->sb.version = bcachefs_metadata_version_current;
}
if (c->sb.version_min > bcachefs_metadata_version_current) {
if (c->sb.version_min > bcachefs_metadata_version_current)
c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current);
c->sb.version_min = bcachefs_metadata_version_current;
}
c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1);
return ret;
}
......
......@@ -564,7 +564,7 @@ static void __bch2_fs_free(struct bch_fs *c)
BUG_ON(atomic_read(&c->journal_keys.ref));
bch2_fs_btree_write_buffer_exit(c);
percpu_free_rwsem(&c->mark_lock);
EBUG_ON(percpu_u64_get(c->online_reserved));
EBUG_ON(c->online_reserved && percpu_u64_get(c->online_reserved));
free_percpu(c->online_reserved);
darray_exit(&c->btree_roots_extra);
......