Commit 1c6fdbd8 authored by Kent Overstreet's avatar Kent Overstreet Committed by Kent Overstreet

bcachefs: Initial commit

Initially forked from drivers/md/bcache, bcachefs is a new copy-on-write
filesystem with every feature you could possibly want.

Website: https://bcachefs.org

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
parent 0d29a833
...@@ -48,6 +48,7 @@ source "fs/ocfs2/Kconfig" ...@@ -48,6 +48,7 @@ source "fs/ocfs2/Kconfig"
source "fs/btrfs/Kconfig" source "fs/btrfs/Kconfig"
source "fs/nilfs2/Kconfig" source "fs/nilfs2/Kconfig"
source "fs/f2fs/Kconfig" source "fs/f2fs/Kconfig"
source "fs/bcachefs/Kconfig"
source "fs/zonefs/Kconfig" source "fs/zonefs/Kconfig"
endif # BLOCK endif # BLOCK
......
...@@ -123,6 +123,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ ...@@ -123,6 +123,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/ obj-$(CONFIG_GFS2_FS) += gfs2/
obj-$(CONFIG_F2FS_FS) += f2fs/ obj-$(CONFIG_F2FS_FS) += f2fs/
obj-$(CONFIG_BCACHEFS_FS) += bcachefs/
obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/ obj-$(CONFIG_PSTORE) += pstore/
obj-$(CONFIG_EFIVAR_FS) += efivarfs/ obj-$(CONFIG_EFIVAR_FS) += efivarfs/
......
config BCACHEFS_FS
tristate "bcachefs filesystem support"
depends on BLOCK
select EXPORTFS
select CLOSURES
select LIBCRC32C
select FS_POSIX_ACL
select LZ4_COMPRESS
select LZ4_DECOMPRESS
select ZLIB_DEFLATE
select ZLIB_INFLATE
select ZSTD_COMPRESS
select ZSTD_DECOMPRESS
select CRYPTO_SHA256
select CRYPTO_CHACHA20
select CRYPTO_POLY1305
select KEYS
help
The bcachefs filesystem - a modern, copy on write filesystem, with
support for multiple devices, compression, checksumming, etc.
config BCACHEFS_QUOTA
bool "bcachefs quota support"
depends on BCACHEFS_FS
select QUOTACTL
config BCACHEFS_POSIX_ACL
bool "bcachefs POSIX ACL support"
depends on BCACHEFS_FS
select FS_POSIX_ACL
config BCACHEFS_DEBUG
bool "bcachefs debugging"
depends on BCACHEFS_FS
help
Enables many extra debugging checks and assertions.
The resulting code will be significantly slower than normal; you
probably shouldn't select this option unless you're a developer.
config BCACHEFS_TESTS
bool "bcachefs unit and performance tests"
depends on BCACHEFS_FS
help
Include some unit and performance tests for the core btree code
config BCACHEFS_NO_LATENCY_ACCT
bool "disable latency accounting and time stats"
depends on BCACHEFS_FS
help
This disables device latency tracking and time stats, only for performance testing
obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o
bcachefs-y := \
acl.o \
alloc.o \
bkey.o \
bkey_methods.o \
bset.o \
btree_cache.o \
btree_gc.o \
btree_io.o \
btree_iter.o \
btree_update_interior.o \
btree_update_leaf.o \
buckets.o \
chardev.o \
checksum.o \
clock.o \
compress.o \
debug.o \
dirent.o \
disk_groups.o \
error.o \
extents.o \
fs.o \
fs-ioctl.o \
fs-io.o \
fsck.o \
inode.o \
io.o \
journal.o \
journal_io.o \
journal_reclaim.o \
journal_seq_blacklist.o \
keylist.o \
migrate.o \
move.o \
movinggc.o \
opts.o \
quota.o \
rebalance.o \
recovery.o \
replicas.o \
siphash.o \
six.o \
super.o \
super-io.o \
sysfs.o \
tests.o \
trace.o \
util.o \
xattr.o
// SPDX-License-Identifier: GPL-2.0
#ifdef CONFIG_BCACHEFS_POSIX_ACL
#include "bcachefs.h"
#include <linux/fs.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include "acl.h"
#include "fs.h"
#include "xattr.h"
/*
 * Number of bytes an on-disk ACL occupies: one header followed by
 * @nr_short short entries and @nr_long full-size entries.
 */
static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
{
	size_t bytes = sizeof(bch_acl_header);

	bytes += sizeof(bch_acl_entry_short) * nr_short;
	bytes += sizeof(bch_acl_entry) * nr_long;

	return bytes;
}
/*
 * Map a POSIX ACL type (ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT) to the xattr
 * index under which that ACL is stored. Any other value hits BUG().
 */
static inline int acl_to_xattr_type(int type)
{
	if (type == ACL_TYPE_ACCESS)
		return BCH_XATTR_INDEX_POSIX_ACL_ACCESS;

	if (type == ACL_TYPE_DEFAULT)
		return BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;

	BUG();
}
/*
* Convert from filesystem to in-memory representation.
*/
static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size)
{
const void *p, *end = value + size;
struct posix_acl *acl;
struct posix_acl_entry *out;
unsigned count = 0;
if (!value)
return NULL;
if (size < sizeof(bch_acl_header))
goto invalid;
if (((bch_acl_header *)value)->a_version !=
cpu_to_le32(BCH_ACL_VERSION))
goto invalid;
p = value + sizeof(bch_acl_header);
while (p < end) {
const bch_acl_entry *entry = p;
if (p + sizeof(bch_acl_entry_short) > end)
goto invalid;
switch (le16_to_cpu(entry->e_tag)) {
case ACL_USER_OBJ:
case ACL_GROUP_OBJ:
case ACL_MASK:
case ACL_OTHER:
p += sizeof(bch_acl_entry_short);
break;
case ACL_USER:
case ACL_GROUP:
p += sizeof(bch_acl_entry);
break;
default:
goto invalid;
}
count++;
}
if (p > end)
goto invalid;
if (!count)
return NULL;
acl = posix_acl_alloc(count, GFP_KERNEL);
if (!acl)
return ERR_PTR(-ENOMEM);
out = acl->a_entries;
p = value + sizeof(bch_acl_header);
while (p < end) {
const bch_acl_entry *in = p;
out->e_tag = le16_to_cpu(in->e_tag);
out->e_perm = le16_to_cpu(in->e_perm);
switch (out->e_tag) {
case ACL_USER_OBJ:
case ACL_GROUP_OBJ:
case ACL_MASK:
case ACL_OTHER:
p += sizeof(bch_acl_entry_short);
break;
case ACL_USER:
out->e_uid = make_kuid(&init_user_ns,
le32_to_cpu(in->e_id));
p += sizeof(bch_acl_entry);
break;
case ACL_GROUP:
out->e_gid = make_kgid(&init_user_ns,
le32_to_cpu(in->e_id));
p += sizeof(bch_acl_entry);
break;
}
out++;
}
BUG_ON(out != acl->a_entries + acl->a_count);
return acl;
invalid:
pr_err("invalid acl entry");
return ERR_PTR(-EINVAL);
}
/*
 * Iterate @acl_e over every entry of the posix ACL pointed to by @acl,
 * i.e. over acl->a_entries[0 .. acl->a_count - 1].
 *
 * Both arguments are parenthesized in the expansion so that arbitrary
 * pointer expressions (e.g. "acls + 1") can be passed safely. Note that
 * @acl is evaluated more than once.
 */
#define acl_for_each_entry(acl, acl_e)					\
	for ((acl_e) = (acl)->a_entries;				\
	     (acl_e) < (acl)->a_entries + (acl)->a_count;		\
	     (acl_e)++)
/*
 * Convert from in-memory to filesystem representation.
 *
 * Builds an xattr key holding the on-disk encoding of @acl. The key is
 * allocated with bch2_trans_kmalloc() on @trans; it has an empty name and its
 * type is derived from @type (ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT).
 *
 * Returns the new key, ERR_PTR(-EINVAL) if @acl contains an entry with an
 * unknown tag, ERR_PTR(-E2BIG) if the encoded key would exceed U8_MAX u64s,
 * or the error returned by bch2_trans_kmalloc().
 */
static struct bkey_i_xattr *
bch2_acl_to_xattr(struct btree_trans *trans,
		  const struct posix_acl *acl,
		  int type)
{
	struct bkey_i_xattr *xattr;
	bch_acl_header *acl_header;
	const struct posix_acl_entry *acl_e;
	void *outptr;
	unsigned nr_short = 0, nr_long = 0, acl_len, u64s;

	/* First pass: validate tags and work out how much space is needed. */
	acl_for_each_entry(acl, acl_e) {
		switch (acl_e->e_tag) {
		case ACL_USER:
		case ACL_GROUP:
			nr_long++;
			break;
		case ACL_USER_OBJ:
		case ACL_GROUP_OBJ:
		case ACL_MASK:
		case ACL_OTHER:
			nr_short++;
			break;
		default:
			return ERR_PTR(-EINVAL);
		}
	}

	acl_len = bch2_acl_size(nr_short, nr_long);
	u64s = BKEY_U64s + xattr_val_u64s(0, acl_len);

	if (u64s > U8_MAX)
		return ERR_PTR(-E2BIG);

	xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
	if (IS_ERR(xattr))
		return xattr;

	bkey_xattr_init(&xattr->k_i);
	xattr->k.u64s		= u64s;
	xattr->v.x_type		= acl_to_xattr_type(type);
	xattr->v.x_name_len	= 0;
	xattr->v.x_val_len	= cpu_to_le16(acl_len);

	acl_header = xattr_val(&xattr->v);
	acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION);

	outptr = (void *) acl_header + sizeof(*acl_header);

	/*
	 * Second pass: emit the entries. Only ACL_USER/ACL_GROUP carry an id
	 * and use the full-size entry; every other tag uses the short form.
	 */
	acl_for_each_entry(acl, acl_e) {
		bch_acl_entry *entry = outptr;

		entry->e_tag = cpu_to_le16(acl_e->e_tag);
		entry->e_perm = cpu_to_le16(acl_e->e_perm);
		switch (acl_e->e_tag) {
		case ACL_USER:
			entry->e_id = cpu_to_le32(
				from_kuid(&init_user_ns, acl_e->e_uid));
			outptr += sizeof(bch_acl_entry);
			break;
		case ACL_GROUP:
			entry->e_id = cpu_to_le32(
				from_kgid(&init_user_ns, acl_e->e_gid));
			outptr += sizeof(bch_acl_entry);
			break;
		case ACL_USER_OBJ:
		case ACL_GROUP_OBJ:
		case ACL_MASK:
		case ACL_OTHER:
			outptr += sizeof(bch_acl_entry_short);
			break;
		}
	}

	/* The two passes must agree on the encoded length. */
	BUG_ON(outptr != xattr_val(&xattr->v) + acl_len);

	return xattr;
}
/*
 * Look up the ACL of the given @type on @dentry's inode.
 *
 * The xattr lookup is repeated for as long as it fails with -EINTR. A missing
 * xattr (-ENOENT) yields NULL; any other lookup failure is returned as an
 * error pointer. A successfully decoded ACL (including NULL) is stored with
 * set_cached_acl() before being returned.
 */
struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
			       struct dentry *dentry, int type)
{
	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_trans trans;
	struct btree_iter *iter;
	struct bkey_s_c_xattr xattr;
	struct posix_acl *acl = NULL;

	bch2_trans_init(&trans, c);

	do {
		bch2_trans_begin(&trans);

		iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
				&inode->ei_str_hash, inode->v.i_ino,
				&X_SEARCH(acl_to_xattr_type(type), "", 0),
				0);
	} while (IS_ERR(iter) && PTR_ERR(iter) == -EINTR);

	if (IS_ERR(iter)) {
		if (PTR_ERR(iter) != -ENOENT)
			acl = ERR_CAST(iter);
	} else {
		xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));

		acl = bch2_acl_from_disk(xattr_val(xattr.v),
					 le16_to_cpu(xattr.v->x_val_len));
		if (!IS_ERR(acl))
			set_cached_acl(&inode->v, type, acl);
	}

	bch2_trans_exit(&trans);
	return acl;
}
/*
 * Within @trans, store @acl as the ACL of the given @type for the inode
 * described by @inode_u, or delete the stored ACL when @acl is NULL.
 *
 * A default ACL on a non-directory is rejected with -EACCES (a NULL @acl is
 * silently accepted in that case). -ENOENT from the hash operation is
 * reported as success.
 */
int bch2_set_acl_trans(struct btree_trans *trans,
		       struct bch_inode_unpacked *inode_u,
		       const struct bch_hash_info *hash_info,
		       struct posix_acl *acl, int type)
{
	int ret;

	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode_u->bi_mode))
		return acl ? -EACCES : 0;

	if (!acl) {
		struct xattr_search_key search =
			X_SEARCH(acl_to_xattr_type(type), "", 0);

		ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info,
				       inode_u->bi_inum, &search);
	} else {
		struct bkey_i_xattr *xattr =
			bch2_acl_to_xattr(trans, acl, type);

		if (IS_ERR(xattr))
			return PTR_ERR(xattr);

		ret = __bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
				      inode_u->bi_inum, &xattr->k_i, 0);
	}

	if (ret == -ENOENT)
		return 0;

	return ret;
}
static int inode_update_for_set_acl_fn(struct bch_inode_info *inode,
struct bch_inode_unpacked *bi,
void *p)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct timespec64 now = current_time(&inode->v);
umode_t mode = (unsigned long) p;
bi->bi_ctime = timespec_to_bch2_time(c, now);
bi->bi_mode = mode;
return 0;
}
/*
 * Set (or, with a NULL @acl, remove) the ACL of the given @type on @dentry's
 * inode.
 *
 * For a non-NULL access ACL, posix_acl_update_mode() is called first and may
 * adjust both the mode and @acl. The xattr update, inode write and commit are
 * then attempted in that order, stopping at the first failure, and the whole
 * sequence is restarted for as long as it fails with -EINTR. On success the
 * in-memory inode is refreshed and the ACL is cached.
 *
 * Returns 0 on success or a negative error code.
 */
int bch2_set_acl(struct mnt_idmap *idmap,
		 struct dentry *dentry,
		 struct posix_acl *acl, int type)
{
	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
	struct bch_fs *c = inode->v.i_sb->s_fs_info;
	struct btree_trans trans;
	struct bch_inode_unpacked inode_u;
	umode_t mode = inode->v.i_mode;
	int ret;

	if (type == ACL_TYPE_ACCESS && acl) {
		ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl);
		if (ret)
			return ret;
	}

	bch2_trans_init(&trans, c);

	do {
		bch2_trans_begin(&trans);

		ret = bch2_set_acl_trans(&trans,
					 &inode->ei_inode,
					 &inode->ei_str_hash,
					 acl, type);
		if (!ret)
			ret = bch2_write_inode_trans(&trans, inode, &inode_u,
					inode_update_for_set_acl_fn,
					(void *)(unsigned long) mode);
		if (!ret)
			ret = bch2_trans_commit(&trans, NULL, NULL,
						&inode->ei_journal_seq,
						BTREE_INSERT_ATOMIC|
						BTREE_INSERT_NOUNLOCK);
	} while (ret == -EINTR);

	if (likely(!ret)) {
		bch2_inode_update_after_write(c, inode, &inode_u,
					      ATTR_CTIME|ATTR_MODE);

		set_cached_acl(&inode->v, type, acl);
	}

	bch2_trans_exit(&trans);
	return ret;
}
/*
 * Within @trans, rewrite the inode's access ACL so that it matches @mode.
 *
 * If the inode has no access ACL xattr, or the stored ACL decodes to NULL,
 * nothing is done and 0 is returned. Otherwise the ACL is passed through
 * __posix_acl_chmod(), re-encoded and queued as an update on the transaction;
 * on success the updated in-memory ACL is handed to the caller via @new_acl.
 *
 * Returns 0 on success or a negative error code.
 */
int bch2_acl_chmod(struct btree_trans *trans,
		   struct bch_inode_info *inode,
		   umode_t mode,
		   struct posix_acl **new_acl)
{
	struct btree_iter *iter;
	struct bkey_s_c_xattr xattr;
	struct bkey_i_xattr *new;
	struct posix_acl *acl;
	int ret;

	iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc,
			&inode->ei_str_hash, inode->v.i_ino,
			&X_SEARCH(BCH_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
			BTREE_ITER_INTENT);
	if (IS_ERR(iter)) {
		ret = PTR_ERR(iter);
		return ret == -ENOENT ? 0 : ret;
	}

	xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));

	acl = bch2_acl_from_disk(xattr_val(xattr.v),
				 le16_to_cpu(xattr.v->x_val_len));
	/* PTR_ERR(NULL) is 0, so an absent ACL is reported as success. */
	if (IS_ERR_OR_NULL(acl))
		return PTR_ERR(acl);

	ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
	if (!ret) {
		new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
		if (IS_ERR(new)) {
			ret = PTR_ERR(new);
		} else {
			bch2_trans_update(trans, iter, &new->k_i, 0);

			/* Ownership moves to the caller; skip the free below. */
			*new_acl = acl;
			acl = NULL;
		}
	}

	kfree(acl);
	return ret;
}
#endif /* CONFIG_BCACHEFS_POSIX_ACL */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_ACL_H
#define _BCACHEFS_ACL_H
struct bch_inode_unpacked;
struct bch_hash_info;
struct bch_inode_info;
struct posix_acl;
#ifdef CONFIG_BCACHEFS_POSIX_ACL
#define BCH_ACL_VERSION 0x0001
/*
 * Full-size on-disk ACL entry. acl.c uses this layout for ACL_USER and
 * ACL_GROUP entries, which carry a uid/gid in e_id. All fields are
 * little-endian.
 */
typedef struct {
__le16 e_tag;
__le16 e_perm;
__le32 e_id;
} bch_acl_entry;
/*
 * Short on-disk ACL entry without an id. acl.c uses this layout for
 * ACL_USER_OBJ, ACL_GROUP_OBJ, ACL_MASK and ACL_OTHER entries.
 */
typedef struct {
__le16 e_tag;
__le16 e_perm;
} bch_acl_entry_short;
/*
 * Header that precedes the entries of an on-disk ACL; a_version is compared
 * against BCH_ACL_VERSION when the ACL is read back.
 */
typedef struct {
__le32 a_version;
} bch_acl_header;
struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int);
int bch2_set_acl_trans(struct btree_trans *,
struct bch_inode_unpacked *,
const struct bch_hash_info *,
struct posix_acl *, int);
int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int);
int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *,
umode_t, struct posix_acl **);
#else
/*
 * Stubs used when CONFIG_BCACHEFS_POSIX_ACL is not set: both report success
 * without doing anything.
 */
static inline int bch2_set_acl_trans(struct btree_trans *trans,
struct bch_inode_unpacked *inode_u,
const struct bch_hash_info *hash_info,
struct posix_acl *acl, int type)
{
return 0;
}
/* No-op counterpart of bch2_acl_chmod(); *new_acl is left untouched. */
static inline int bch2_acl_chmod(struct btree_trans *trans,
struct bch_inode_info *inode,
umode_t mode,
struct posix_acl **new_acl)
{
return 0;
}
#endif /* CONFIG_BCACHEFS_POSIX_ACL */
#endif /* _BCACHEFS_ACL_H */
This diff is collapsed.
#ifndef _BCACHEFS_ALLOC_H
#define _BCACHEFS_ALLOC_H
#include "bcachefs.h"
#include "alloc_types.h"
/* Forward declarations. */
struct bkey;
struct bch_dev;
struct bch_fs;
struct bch_devs_list;
const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
/*
 * struct bkey_ops initializer for alloc keys, wiring up the validation and
 * text-formatting hooks declared above.
 */
#define bch2_bkey_alloc_ops (struct bkey_ops) { \
.key_invalid = bch2_alloc_invalid, \
.val_to_text = bch2_alloc_to_text, \
}
struct dev_alloc_list {
unsigned nr;
u8 devs[BCH_SB_MEMBERS_MAX];
};
struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *,
struct write_point *,
struct bch_devs_mask *);
void bch2_wp_rescale(struct bch_fs *, struct bch_dev *,
struct write_point *);
int bch2_alloc_read(struct bch_fs *, struct list_head *);
int bch2_alloc_replay_key(struct bch_fs *, struct bpos);
enum bucket_alloc_ret {
ALLOC_SUCCESS = 0,
OPEN_BUCKETS_EMPTY = -1,
FREELIST_EMPTY = -2, /* Allocator thread not keeping up */
NO_DEVICES = -3, /* -EROFS */
};
long bch2_bucket_alloc_new_fs(struct bch_dev *);
int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool,
struct closure *);
/*
 * Iterate over the open buckets of write point @_wp, starting at index
 * @_start. On every iteration @_i is the current index and @_ob is set to
 * (_wp)->ptrs[_i]. The ", true" keeps the assignment from ending the loop.
 */
#define __writepoint_for_each_ptr(_wp, _ob, _i, _start)			\
	for ((_i) = (_start);						\
	     (_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true);	\
	     (_i)++)

/* Iterate over every open bucket of @_wp, starting from index 0. */
#define writepoint_for_each_ptr_all(_wp, _ob, _i)			\
	__writepoint_for_each_ptr(_wp, _ob, _i, 0)

/*
 * Iterate over the open buckets of @_wp starting from (_wp)->first_ptr.
 * The start index is taken from the macro argument, so this works regardless
 * of what the caller's write point variable is called.
 */
#define writepoint_for_each_ptr(_wp, _ob, _i)				\
	__writepoint_for_each_ptr(_wp, _ob, _i, (_wp)->first_ptr)
void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
/*
 * Drop one pin on @ob; when the pin count reaches zero the bucket is handed
 * to __bch2_open_bucket_put().
 */
static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
{
	if (!atomic_dec_and_test(&ob->pin))
		return;

	__bch2_open_bucket_put(c, ob);
}
/*
 * Drop the pins recorded in @refs (indices into c->open_buckets, *@nr of
 * them) and reset *@nr to zero.
 */
static inline void bch2_open_bucket_put_refs(struct bch_fs *c, u8 *nr, u8 *refs)
{
	unsigned idx = 0;

	while (idx < *nr) {
		bch2_open_bucket_put(c, &c->open_buckets[refs[idx]]);
		idx++;
	}

	*nr = 0;
}
/*
 * Take a pin on each open bucket visited by writepoint_for_each_ptr() on @wp
 * and append its index within c->open_buckets to @refs, bumping *@nr for
 * every bucket recorded.
 */
static inline void bch2_open_bucket_get(struct bch_fs *c,
					struct write_point *wp,
					u8 *nr, u8 *refs)
{
	struct open_bucket *bucket;
	unsigned idx;

	writepoint_for_each_ptr(wp, bucket, idx) {
		atomic_inc(&bucket->pin);

		refs[*nr] = bucket - c->open_buckets;
		(*nr)++;
	}
}
struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
unsigned,
struct write_point_specifier,
struct bch_devs_list *,
unsigned, unsigned,
enum alloc_reserve,
unsigned,
struct closure *);
void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
struct bkey_i_extent *, unsigned);
void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
/*
 * Wake the task stored in ca->alloc_thread, if there is one. The pointer is
 * read and used entirely inside an RCU read-side critical section.
 */
static inline void bch2_wake_allocator(struct bch_dev *ca)
{
	struct task_struct *thread;

	rcu_read_lock();

	thread = rcu_dereference(ca->alloc_thread);
	if (thread != NULL)
		wake_up_process(thread);

	rcu_read_unlock();
}
static inline struct write_point_specifier writepoint_hashed(unsigned long v)
{
return (struct write_point_specifier) { .v = v | 1 };
}
static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
{
return (struct write_point_specifier) { .v = (unsigned long) wp };
}
void bch2_recalc_capacity(struct bch_fs *);
void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
void bch2_dev_allocator_stop(struct bch_dev *);
int bch2_dev_allocator_start(struct bch_dev *);
/* Initialise @wp's lock and record the data type it is used for. */
static inline void writepoint_init(struct write_point *wp,
				   enum bch_data_type type)
{
	wp->type = type;
	mutex_init(&wp->lock);
}
int bch2_alloc_write(struct bch_fs *);
int bch2_fs_allocator_start(struct bch_fs *);
void bch2_fs_allocator_init(struct bch_fs *);
#endif /* _BCACHEFS_ALLOC_H */
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_ALLOC_TYPES_H
#define _BCACHEFS_ALLOC_TYPES_H
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include "clock_types.h"
#include "fifo.h"
/* There's two of these clocks, one for reads and one for writes: */
struct bucket_clock {
/*
* "now" in (read/write) IO time - incremented whenever we do X amount
* of reads or writes.
*
* Goes with the bucket read/write prios: when we read or write to a
* bucket we reset the bucket's prio to the current hand; thus hand -
* prio = time since bucket was last read/written.
*
* The units are some amount (bytes/sectors) of data read/written, and
* the units can change on the fly if we need to rescale to fit
* everything in a u16 - your only guarantee is that the units are
* consistent.
*/
u16 hand;
u16 max_last_io;
int rw;
struct io_timer rescale;
struct mutex lock;
};
/* There is one reserve for each type of btree, one for prios and gens
* and one for moving GC */
enum alloc_reserve {
RESERVE_ALLOC = -1,
RESERVE_BTREE = 0,
RESERVE_MOVINGGC = 1,
RESERVE_NONE = 2,
RESERVE_NR = 3,
};
typedef FIFO(long) alloc_fifo;
/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
#define OPEN_BUCKETS_COUNT 256
#define WRITE_POINT_COUNT 32
struct open_bucket {
spinlock_t lock;
atomic_t pin;
u8 freelist;
bool valid;
bool on_partial_list;
unsigned sectors_free;
struct bch_extent_ptr ptr;
};
struct write_point {
struct hlist_node node;
struct mutex lock;
u64 last_used;
unsigned long write_point;
enum bch_data_type type;
u8 nr_ptrs;
u8 first_ptr;
/* calculated based on how many pointers we're actually going to use: */
unsigned sectors_free;
struct open_bucket *ptrs[BCH_REPLICAS_MAX * 2];
u64 next_alloc[BCH_SB_MEMBERS_MAX];
};
/*
 * Handle naming a write point. The constructors in alloc.h build it either
 * from a caller-supplied value with the low bit forced to 1
 * (writepoint_hashed()) or from a struct write_point pointer cast to
 * unsigned long (writepoint_ptr()).
 */
struct write_point_specifier {
unsigned long v;
};
struct alloc_heap_entry {
size_t bucket;
size_t nr;
unsigned long key;
};
typedef HEAP(struct alloc_heap_entry) alloc_heap;
#endif /* _BCACHEFS_ALLOC_TYPES_H */
This diff is collapsed.
This diff is collapsed.
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_IOCTL_H
#define _BCACHEFS_IOCTL_H
#include <linux/uuid.h>
#include <asm/ioctl.h>
#include "bcachefs_format.h"
/*
* Flags common to multiple ioctls:
*/
#define BCH_FORCE_IF_DATA_LOST (1 << 0)
#define BCH_FORCE_IF_METADATA_LOST (1 << 1)
#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2)
#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3)
#define BCH_FORCE_IF_DEGRADED \
(BCH_FORCE_IF_DATA_DEGRADED| \
BCH_FORCE_IF_METADATA_DEGRADED)
/*
* If cleared, ioctl that refer to a device pass it as a pointer to a pathname
* (e.g. /dev/sda1); if set, the dev field is the device's index within the
* filesystem:
*/
#define BCH_BY_INDEX (1 << 4)
/*
* For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
* wide superblock:
*/
#define BCH_READ_DEV (1 << 5)
/* global control dev: */
/* These are currently broken, and probably unnecessary: */
#if 0
#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble)
#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental)
struct bch_ioctl_assemble {
__u32 flags;
__u32 nr_devs;
__u64 pad;
__u64 devs[];
};
struct bch_ioctl_incremental {
__u32 flags;
__u64 pad;
__u64 dev;
};
#endif
/* filesystem ioctls: */
#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid)
/* These only make sense when we also have incremental assembly */
#if 0
#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start)
#define BCH_IOCTL_STOP _IO(0xbc, 3)
#endif
#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk)
#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk)
#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk)
#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk)
#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state)
#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data)
#define BCH_IOCTL_USAGE _IOWR(0xbc, 11, struct bch_ioctl_usage)
#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super)
/*
 * NOTE(review): the two commands below share command number 13. They
 * presumably still encode to distinct ioctl values because _IOW() folds the
 * size of the argument struct into the command and the two structs differ in
 * size - confirm this is intentional before adding further commands.
 */
#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx)
#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 13, struct bch_ioctl_disk_resize)
/*
* BCH_IOCTL_QUERY_UUID: get filesystem UUID
*
* Returns user visible UUID, not internal UUID (which may not ever be changed);
* the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
* this UUID.
*/
struct bch_ioctl_query_uuid {
__uuid_t uuid;
};
#if 0
struct bch_ioctl_start {
__u32 flags;
__u32 pad;
};
#endif
/*
* BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
*
* The specified device must not be open or in use. On success, the new device
* will be an online member of the filesystem just like any other member.
*
* The device must first be prepared by userspace by formatting with a bcachefs
* superblock, which is only used for passing in superblock options/parameters
* for that device (in struct bch_member). The new device's superblock should
* not claim to be a member of any existing filesystem - UUIDs on it will be
* ignored.
*/
/*
* BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
*
* Any data present on @dev will be permanently deleted, and @dev will be
* removed from its slot in the filesystem's list of member devices. The device
* may be either online or offline.
*
* Will fail if removing @dev would leave us with insufficient read write devices
* or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_* flags are
* set.
*/
/*
* BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
* but is not open (e.g. because we started in degraded mode), bring it online
*
* all existing data on @dev will be available once the device is online,
* exactly as if @dev was present when the filesystem was first mounted
*/
/*
* BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
* block device, without removing it from the filesystem (so it can be brought
* back online later)
*
* Data present on @dev will be unavailable while @dev is offline (unless
* replicated), but will still be intact and untouched if @dev is brought back
* online
*
* Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
* leave us with insufficient read write devices or degraded/unavailable data,
* unless the appropriate BCH_FORCE_IF_* flags are set.
*/
struct bch_ioctl_disk {
__u32 flags;
__u32 pad;
__u64 dev;
};
/*
* BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
*
* @new_state - one of the bch_member_state states (rw, ro, failed,
* spare)
*
* Will refuse to change member state if we would then have insufficient devices
* to write to, or if it would result in degraded data (when @new_state is
* failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
*/
struct bch_ioctl_disk_set_state {
__u32 flags;
__u8 new_state;
__u8 pad[3];
__u64 dev;
};
enum bch_data_ops {
BCH_DATA_OP_SCRUB = 0,
BCH_DATA_OP_REREPLICATE = 1,
BCH_DATA_OP_MIGRATE = 2,
BCH_DATA_OP_NR = 3,
};
/*
* BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
* scrub, rereplicate, migrate).
*
* This ioctl kicks off a job in the background, and returns a file descriptor.
* Reading from the file descriptor returns a struct bch_ioctl_data_event,
* indicating current progress, and closing the file descriptor will stop the
* job. The file descriptor is O_CLOEXEC.
*/
struct bch_ioctl_data {
__u32 op;
__u32 flags;
struct bpos start;
struct bpos end;
union {
struct {
__u32 dev;
__u32 pad;
} migrate;
struct {
__u64 pad[8];
};
};
} __attribute__((packed, aligned(8)));
enum bch_data_event {
BCH_DATA_EVENT_PROGRESS = 0,
/* XXX: add an event for reporting errors */
BCH_DATA_EVENT_NR = 1,
};
struct bch_ioctl_data_progress {
__u8 data_type;
__u8 btree_id;
__u8 pad[2];
struct bpos pos;
__u64 sectors_done;
__u64 sectors_total;
} __attribute__((packed, aligned(8)));
struct bch_ioctl_data_event {
__u8 type;
__u8 pad[7];
union {
struct bch_ioctl_data_progress p;
__u64 pad2[15];
};
} __attribute__((packed, aligned(8)));
struct bch_ioctl_dev_usage {
__u8 state;
__u8 alive;
__u8 pad[6];
__u32 dev;
__u32 bucket_size;
__u64 nr_buckets;
__u64 buckets[BCH_DATA_NR];
__u64 sectors[BCH_DATA_NR];
};
struct bch_ioctl_fs_usage {
__u64 capacity;
__u64 used;
__u64 online_reserved;
__u64 persistent_reserved[BCH_REPLICAS_MAX];
__u64 sectors[BCH_DATA_NR][BCH_REPLICAS_MAX];
};
/*
* BCH_IOCTL_USAGE: query filesystem disk space usage
*
* Returns disk space usage broken out by data type, number of replicas, and
* by component device
*
* @nr_devices - number of devices userspace allocated space for in @devs
*
* On success, @fs and @devs will be filled out appropriately and devs[i].alive
* will indicate if a device was present in that slot
*
* Returns -ERANGE if @nr_devices was too small
*/
/*
 * Argument of BCH_IOCTL_USAGE.
 *
 * @nr_devices is the number of elements userspace allocated in @devs; @devs
 * is a C99 flexible array member, so the structure's size covers only the
 * fixed part and the caller sizes the trailing array itself.
 */
struct bch_ioctl_usage {
	__u16			nr_devices;
	__u16			pad[3];

	struct bch_ioctl_fs_usage fs;
	struct bch_ioctl_dev_usage devs[];
};
/*
* BCH_IOCTL_READ_SUPER: read filesystem superblock
*
* Equivalent to reading the superblock directly from the block device, except
* avoids racing with the kernel writing the superblock or having to figure out
* which block device to read
*
* @sb - buffer to read into
* @size - size of userspace allocated buffer
* @dev - device to read superblock for, if BCH_READ_DEV flag is
* specified
*
* Returns -ERANGE if buffer provided is too small
*/
struct bch_ioctl_read_super {
__u32 flags;
__u32 pad;
__u64 dev;
__u64 size;
__u64 sb;
};
/*
* BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to
* determine if disk is a (online) member - if so, returns device's index
*
* Returns -ENOENT if not found
*/
struct bch_ioctl_disk_get_idx {
__u64 dev;
};
/*
* BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
*
* @dev - member to resize
* @nbuckets - new number of buckets
*/
struct bch_ioctl_disk_resize {
__u32 flags;
__u32 pad;
__u64 dev;
__u64 nbuckets;
};
#endif /* _BCACHEFS_IOCTL_H */
This diff is collapsed.
This diff is collapsed.
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "bkey_methods.h"
#include "btree_types.h"
#include "alloc.h"
#include "dirent.h"
#include "error.h"
#include "extents.h"
#include "inode.h"
#include "quota.h"
#include "xattr.h"
/*
 * Table of key operations, indexed by enum bkey_type. The functions in this
 * file look up the entry for a key's btree type to validate, print and
 * byte-swap keys.
 */
const struct bkey_ops bch2_bkey_ops[] = {
[BKEY_TYPE_EXTENTS] = bch2_bkey_extent_ops,
[BKEY_TYPE_INODES] = bch2_bkey_inode_ops,
[BKEY_TYPE_DIRENTS] = bch2_bkey_dirent_ops,
[BKEY_TYPE_XATTRS] = bch2_bkey_xattr_ops,
[BKEY_TYPE_ALLOC] = bch2_bkey_alloc_ops,
[BKEY_TYPE_QUOTAS] = bch2_bkey_quota_ops,
[BKEY_TYPE_BTREE] = bch2_bkey_btree_ops,
};
const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
struct bkey_s_c k)
{
const struct bkey_ops *ops = &bch2_bkey_ops[type];
switch (k.k->type) {
case KEY_TYPE_DELETED:
case KEY_TYPE_DISCARD:
return NULL;
case KEY_TYPE_ERROR:
return bkey_val_bytes(k.k) != 0
? "value size should be zero"
: NULL;
case KEY_TYPE_COOKIE:
return bkey_val_bytes(k.k) != sizeof(struct bch_cookie)
? "incorrect value size"
: NULL;
default:
if (k.k->type < KEY_TYPE_GENERIC_NR)
return "invalid type";
return ops->key_invalid(c, k);
}
}
/*
 * Validate the type-independent fields of key @k for btree type @type.
 *
 * Checks, in order: the key is at least BKEY_U64s long; non-extent keys have
 * a zero size field while extent keys have a zero size exactly when they are
 * deleted; the snapshot field is zero; and, outside the btree-node btree, the
 * position is not POS_MAX.
 *
 * Returns NULL when the key passes, otherwise a string describing the first
 * failed check.
 */
const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
				struct bkey_s_c k)
{
	const struct bkey_ops *ops = &bch2_bkey_ops[type];

	if (k.k->u64s < BKEY_U64s)
		return "u64s too small";

	if (!ops->is_extents) {
		if (k.k->size)
			return "nonzero size field";
	} else {
		/*
		 * This also rejects a live extent with zero size, so no
		 * separate "zero size" check is needed afterwards.
		 */
		if ((k.k->size == 0) != bkey_deleted(k.k))
			return "bad size field";
	}

	if (k.k->p.snapshot)
		return "nonzero snapshot";

	if (type != BKEY_TYPE_BTREE &&
	    !bkey_cmp(k.k->p, POS_MAX))
		return "POS_MAX key";

	return NULL;
}
/*
 * Full validation of key @k: the generic key checks first, then the
 * value checks. Returns NULL if both pass, otherwise the first error string.
 */
const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
			      struct bkey_s_c k)
{
	const char *err = __bch2_bkey_invalid(c, type, k);

	if (err)
		return err;

	return bch2_bkey_val_invalid(c, type, k);
}
/*
 * Check that key @k lies within the key range of btree node @b: its start
 * position must not precede the node's min_key and its position must not
 * exceed the node's max_key. Returns NULL if it fits, otherwise an error
 * string.
 */
const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
{
	const char *err = NULL;

	if (bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0)
		err = "key before start of btree node";
	else if (bkey_cmp(k.k->p, b->data->max_key) > 0)
		err = "key past end of btree node";

	return err;
}
/*
 * Debug check of key @k found in btree node @b.
 *
 * The key is validated and checked against the node's range; a failure is
 * reported through bch2_fs_bug() together with a textual dump of the key.
 * Keys that pass and have a non-generic type are additionally handed to the
 * btree type's optional ->key_debugcheck hook.
 */
void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
{
	enum bkey_type type = btree_node_type(b);
	const struct bkey_ops *ops = &bch2_bkey_ops[type];
	const char *invalid;

	BUG_ON(!k.k->u64s);

	invalid = bch2_bkey_invalid(c, type, k);
	if (!invalid)
		invalid = bch2_bkey_in_btree_node(b, k);

	if (invalid) {
		char buf[160];

		bch2_bkey_val_to_text(c, type, buf, sizeof(buf), k);
		bch2_fs_bug(c, "invalid bkey %s: %s", buf, invalid);
		return;
	}

	if (k.k->type < KEY_TYPE_GENERIC_NR)
		return;

	if (ops->key_debugcheck)
		ops->key_debugcheck(c, b, k);
}
/*
 * Append formatted text at `out`, bounded by `end`, and advance `out` by the
 * length scnprintf() reports. Relies on the enclosing function declaring
 * locals named `out` and `end`.
 */
#define p(...) (out += scnprintf(out, end - out, __VA_ARGS__))
/*
 * Format the header fields of key @k (u64s, type, position, snapshot, size
 * and version) into @buf, which holds @size bytes. A position equal to
 * POS_MAX is printed symbolically. Returns the number of bytes written.
 */
int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
{
	char *out = buf, *end = buf + size;

	p("u64s %u type %u ", k->u64s, k->type);

	if (!bkey_cmp(k->p, POS_MAX))
		p("POS_MAX");
	else
		p("%llu:%llu", k->p.inode, k->p.offset);

	p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo);

	return out - buf;
}
/*
 * Format the value of key @k into @buf (@size bytes).
 *
 * The generic key types are printed as a fixed word; other types are handed
 * to the btree type's optional ->val_to_text hook. Returns the number of
 * bytes this function itself appended.
 *
 * NOTE(review): the hook writes through @buf directly and `out` is not
 * advanced afterwards, so for non-generic key types the returned length is 0
 * regardless of what the hook printed - confirm whether any caller depends on
 * the return value in that case.
 */
int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
		     char *buf, size_t size, struct bkey_s_c k)
{
	const struct bkey_ops *ops = &bch2_bkey_ops[type];
	char *out = buf, *end = buf + size;

	switch (k.k->type) {
	case KEY_TYPE_DELETED:
		p(" deleted");
		break;
	case KEY_TYPE_DISCARD:
		p(" discard");
		break;
	case KEY_TYPE_ERROR:
		p(" error");
		break;
	case KEY_TYPE_COOKIE:
		p(" cookie");
		break;
	default:
		if (k.k->type < KEY_TYPE_GENERIC_NR)
			break;
		if (ops->val_to_text)
			ops->val_to_text(c, buf, size, k);
		break;
	}

	return out - buf;
}
/*
 * Format key @k into @buf (@size bytes) as its header text, the separator
 * ": ", then its value text. Returns the sum of the lengths reported by the
 * three steps.
 */
int bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
			  char *buf, size_t size, struct bkey_s_c k)
{
	size_t used = 0;

	used += bch2_bkey_to_text(buf + used, size - used, k.k);
	used += scnprintf(buf + used, size - used, ": ");
	used += bch2_val_to_text(c, type, buf + used, size - used, k);

	return used;
}
/*
 * Byte-swap packed key @k: the key itself is always swapped via
 * bch2_bkey_swab_key(), then the btree type's optional ->swab hook is called
 * for the value.
 */
void bch2_bkey_swab(enum bkey_type type,
		    const struct bkey_format *f,
		    struct bkey_packed *k)
{
	bch2_bkey_swab_key(f, k);

	if (!bch2_bkey_ops[type].swab)
		return;

	bch2_bkey_ops[type].swab(f, k);
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_CHARDEV_H
#define _BCACHEFS_CHARDEV_H
#ifndef NO_BCACHEFS_FS
long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *);
void bch2_fs_chardev_exit(struct bch_fs *);
int bch2_fs_chardev_init(struct bch_fs *);
void bch2_chardev_exit(void);
int __init bch2_chardev_init(void);
#else
/*
 * Stubs used when NO_BCACHEFS_FS is defined: the ioctl entry point fails with
 * -ENOSYS and the init/exit hooks do nothing (init reports success).
 */
static inline long bch2_fs_ioctl(struct bch_fs *c,
unsigned cmd, void __user * arg)
{
return -ENOSYS;
}
static inline void bch2_fs_chardev_exit(struct bch_fs *c) {}
static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; }
static inline void bch2_chardev_exit(void) {}
static inline int __init bch2_chardev_init(void) { return 0; }
#endif /* NO_BCACHEFS_FS */
#endif /* _BCACHEFS_CHARDEV_H */
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_CLOCK_H
#define _BCACHEFS_CLOCK_H
void bch2_io_timer_add(struct io_clock *, struct io_timer *);
void bch2_io_timer_del(struct io_clock *, struct io_timer *);
void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
unsigned long);
void bch2_increment_clock(struct bch_fs *, unsigned, int);
void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
/*
 * Statement-expression wrapper around __wait_event_timeout(): evaluates to
 * @timeout unchanged when ___wait_cond_timeout(condition) is already true,
 * otherwise to the result of __wait_event_timeout().
 *
 * NOTE(review): the expansion refers to a wait queue named `wq` that is not a
 * macro parameter, and the @clock argument is never used - any caller would
 * need a `wq` in scope. Confirm whether this macro is still used or should
 * take the wait queue explicitly.
 */
#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
({ \
long __ret = timeout; \
might_sleep(); \
if (!___wait_cond_timeout(condition)) \
__ret = __wait_event_timeout(wq, condition, timeout); \
__ret; \
})
void bch2_io_clock_exit(struct io_clock *);
int bch2_io_clock_init(struct io_clock *);
#endif /* _BCACHEFS_CLOCK_H */
This diff is collapsed.
This diff is collapsed.
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_COMPRESS_H
#define _BCACHEFS_COMPRESS_H
#include "extents_types.h"
int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
struct bch_extent_crc_unpacked *);
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
struct bvec_iter, struct bch_extent_crc_unpacked);
unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
struct bio *, size_t *, unsigned);
int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
void bch2_fs_compress_exit(struct bch_fs *);
int bch2_fs_compress_init(struct bch_fs *);
#endif /* _BCACHEFS_COMPRESS_H */
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHEFS_FS_IOCTL_H
#define _BCACHEFS_FS_IOCTL_H
void bch2_inode_flags_to_vfs(struct bch_inode_info *);
long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);
#endif /* _BCACHEFS_FS_IOCTL_H */
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment