bcachefs: Initial commit

Initially forked from drivers/md/bcache, bcachefs is a new copy-on-write filesystem with every feature you could possibly want.

Website: https://bcachefs.org

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>

bcachefs: Initial commit
Initially forked from drivers/md/bcache, bcachefs is a new copy-on-write filesystem with every feature you could possibly want.

Website: https://bcachefs.org

Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
1c6fdbd8 · Kent Overstreet · Kent Overstreet · 0d29a833 · 1c6fdbd8 · 1c6fdbd8
Commit 1c6fdbd8 authored Mar 16, 2017 by Kent Overstreet Committed by Kent Overstreet Oct 22, 2023
122 changed files
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -48,6 +48,7 @@ source "fs/ocfs2/Kconfig"
 source "fs/btrfs/Kconfig"
 source "fs/nilfs2/Kconfig"
 source "fs/f2fs/Kconfig"
+source "fs/bcachefs/Kconfig"
 source "fs/zonefs/Kconfig"
 endif # BLOCK

--- a/fs/Makefile
+++ b/fs/Makefile
@@ -123,6 +123,7 @@ obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
 obj-$(CONFIG_F2FS_FS)		+= f2fs/
+obj-$(CONFIG_BCACHEFS_FS)	+= bcachefs/
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
 obj-$(CONFIG_EFIVAR_FS)		+= efivarfs/

--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
+config BCACHEFS_FS
+	tristate "bcachefs filesystem support"
+	depends on BLOCK
+	select EXPORTFS
+	select CLOSURES
+	select LIBCRC32C
+	select FS_POSIX_ACL
+	select LZ4_COMPRESS
+	select LZ4_DECOMPRESS
+	select ZLIB_DEFLATE
+	select ZLIB_INFLATE
+	select ZSTD_COMPRESS
+	select ZSTD_DECOMPRESS
+	select CRYPTO_SHA256
+	select CRYPTO_CHACHA20
+	select CRYPTO_POLY1305
+	select KEYS
+	help
+	The bcachefs filesystem - a modern, copy on write filesystem, with
+	support for multiple devices, compression, checksumming, etc.
+config BCACHEFS_QUOTA
+	bool "bcachefs quota support"
+	depends on BCACHEFS_FS
+	select QUOTACTL
+config BCACHEFS_POSIX_ACL
+	bool "bcachefs POSIX ACL support"
+	depends on BCACHEFS_FS
+	select FS_POSIX_ACL
+config BCACHEFS_DEBUG
+	bool "bcachefs debugging"
+	depends on BCACHEFS_FS
+	help
+	Enables many extra debugging checks and assertions.
+	The resulting code will be significantly slower than normal; you
+	probably shouldn't select this option unless you're a developer.
+config BCACHEFS_TESTS
+	bool "bcachefs unit and performance tests"
+	depends on BCACHEFS_FS
+	help
+	Include some unit and performance tests for the core btree code
+config BCACHEFS_NO_LATENCY_ACCT
+	bool "disable latency accounting and time stats"
+	depends on BCACHEFS_FS
+	help
+	This disables device latency tracking and time stats, only for performance testing
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
+obj-$(CONFIG_BCACHEFS_FS)	+= bcachefs.o
+bcachefs-y		:=	\
+	acl.o			\
+	alloc.o			\
+	bkey.o			\
+	bkey_methods.o		\
+	bset.o			\
+	btree_cache.o		\
+	btree_gc.o		\
+	btree_io.o		\
+	btree_iter.o		\
+	btree_update_interior.o	\
+	btree_update_leaf.o	\
+	buckets.o		\
+	chardev.o		\
+	checksum.o		\
+	clock.o			\
+	compress.o		\
+	debug.o			\
+	dirent.o		\
+	disk_groups.o		\
+	error.o			\
+	extents.o		\
+	fs.o			\
+	fs-ioctl.o		\
+	fs-io.o			\
+	fsck.o			\
+	inode.o			\
+	io.o			\
+	journal.o		\
+	journal_io.o		\
+	journal_reclaim.o	\
+	journal_seq_blacklist.o	\
+	keylist.o		\
+	migrate.o		\
+	move.o			\
+	movinggc.o		\
+	opts.o			\
+	quota.o			\
+	rebalance.o		\
+	recovery.o		\
+	replicas.o		\
+	siphash.o		\
+	six.o			\
+	super.o			\
+	super-io.o		\
+	sysfs.o			\
+	tests.o			\
+	trace.o			\
+	util.o			\
+	xattr.o
--- a/fs/bcachefs/acl.c
+++ b/fs/bcachefs/acl.c
+// SPDX-License-Identifier: GPL-2.0
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+#include "bcachefs.h"
+#include <linux/fs.h>
+#include <linux/posix_acl.h>
+#include <linux/posix_acl_xattr.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include "acl.h"
+#include "fs.h"
+#include "xattr.h"
+/*
+ * Size in bytes of an on-disk ACL: one header followed by @nr_short short
+ * entries and @nr_long full-size entries.
+ */
+static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long)
+{
+	size_t bytes = sizeof(bch_acl_header);
+	bytes += sizeof(bch_acl_entry_short) * nr_short;
+	bytes += sizeof(bch_acl_entry) * nr_long;
+	return bytes;
+}
+/* Map a VFS ACL type (ACL_TYPE_*) to the xattr index it is stored under. */
+static inline int acl_to_xattr_type(int type)
+{
+	if (type == ACL_TYPE_ACCESS)
+		return BCH_XATTR_INDEX_POSIX_ACL_ACCESS;
+	if (type == ACL_TYPE_DEFAULT)
+		return BCH_XATTR_INDEX_POSIX_ACL_DEFAULT;
+	/* Any other ACL type is treated as a fatal caller error. */
+	BUG();
+}
+/*
+ * Convert from filesystem to in-memory representation.
+ *
+ * @value: on-disk ACL (a bch_acl_header followed by packed entries), or NULL
+ * @size:  length of @value in bytes
+ *
+ * Returns NULL if @value is NULL or contains no entries, a posix_acl obtained
+ * from posix_acl_alloc() on success, ERR_PTR(-ENOMEM) if that allocation
+ * fails, or ERR_PTR(-EINVAL) if the buffer is malformed (too short, wrong
+ * version, unknown tag, or an entry running past the end).
+ */
+static struct posix_acl *bch2_acl_from_disk(const void *value, size_t size)
+{
+	const void *p, *end = value + size;
+	struct posix_acl *acl;
+	struct posix_acl_entry *out;
+	unsigned count = 0;
+	if (!value)
+		return NULL;
+	if (size < sizeof(bch_acl_header))
+		goto invalid;
+	if (((bch_acl_header *)value)->a_version !=
+	    cpu_to_le32(BCH_ACL_VERSION))
+		goto invalid;
+	/* Pass 1: validate every entry and count them. */
+	p = value + sizeof(bch_acl_header);
+	while (p < end) {
+		const bch_acl_entry *entry = p;
+		/* At least a short entry must fit before e_tag is read. */
+		if (p + sizeof(bch_acl_entry_short) > end)
+			goto invalid;
+		switch (le16_to_cpu(entry->e_tag)) {
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			p += sizeof(bch_acl_entry_short);
+			break;
+		case ACL_USER:
+		case ACL_GROUP:
+			p += sizeof(bch_acl_entry);
+			break;
+		default:
+			goto invalid;
+		}
+		count++;
+	}
+	/* A trailing full-size entry may have stepped past the buffer. */
+	if (p > end)
+		goto invalid;
+	if (!count)
+		return NULL;
+	acl = posix_acl_alloc(count, GFP_KERNEL);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+	/* Pass 2: decode; tags and lengths were already validated above. */
+	out = acl->a_entries;
+	p = value + sizeof(bch_acl_header);
+	while (p < end) {
+		const bch_acl_entry *in = p;
+		out->e_tag  = le16_to_cpu(in->e_tag);
+		out->e_perm = le16_to_cpu(in->e_perm);
+		switch (out->e_tag) {
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			p += sizeof(bch_acl_entry_short);
+			break;
+		case ACL_USER:
+			out->e_uid = make_kuid(&init_user_ns,
+					       le32_to_cpu(in->e_id));
+			p += sizeof(bch_acl_entry);
+			break;
+		case ACL_GROUP:
+			out->e_gid = make_kgid(&init_user_ns,
+					       le32_to_cpu(in->e_id));
+			p += sizeof(bch_acl_entry);
+			break;
+		}
+		out++;
+	}
+	/* Both passes must agree on the number of entries. */
+	BUG_ON(out != acl->a_entries + acl->a_count);
+	return acl;
+invalid:
+	pr_err("invalid acl entry");
+	return ERR_PTR(-EINVAL);
+}
+/*
+ * Iterate @acl_e over every entry of the in-memory ACL @acl.  The arguments
+ * are parenthesized so that expressions, not just plain identifiers, expand
+ * correctly.
+ */
+#define acl_for_each_entry(acl, acl_e)				\
+	for ((acl_e) = (acl)->a_entries;			\
+	     (acl_e) < (acl)->a_entries + (acl)->a_count;	\
+	     (acl_e)++)
+/*
+ * Convert from in-memory to filesystem representation.
+ *
+ * @trans: btree transaction the new key is allocated from (bch2_trans_kmalloc)
+ * @acl:   in-memory ACL to encode
+ * @type:  ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT
+ *
+ * Returns the new xattr key, ERR_PTR(-EINVAL) if @acl contains an unknown
+ * tag, ERR_PTR(-E2BIG) if the key would exceed U8_MAX u64s, or whatever
+ * error pointer bch2_trans_kmalloc() returned.
+ */
+static struct bkey_i_xattr *
+bch2_acl_to_xattr(struct btree_trans *trans,
+		  const struct posix_acl *acl,
+		  int type)
+{
+	struct bkey_i_xattr *xattr;
+	bch_acl_header *acl_header;
+	const struct posix_acl_entry *acl_e;
+	void *outptr;
+	unsigned nr_short = 0, nr_long = 0, acl_len, u64s;
+	/* Pass 1: classify entries so the encoded size can be computed. */
+	acl_for_each_entry(acl, acl_e) {
+		switch (acl_e->e_tag) {
+		case ACL_USER:
+		case ACL_GROUP:
+			nr_long++;
+			break;
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			nr_short++;
+			break;
+		default:
+			return ERR_PTR(-EINVAL);
+		}
+	}
+	acl_len = bch2_acl_size(nr_short, nr_long);
+	u64s = BKEY_U64s + xattr_val_u64s(0, acl_len);
+	/* The key's u64s field is checked against U8_MAX before use. */
+	if (u64s > U8_MAX)
+		return ERR_PTR(-E2BIG);
+	xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64));
+	if (IS_ERR(xattr))
+		return xattr;
+	bkey_xattr_init(&xattr->k_i);
+	xattr->k.u64s		= u64s;
+	xattr->v.x_type		= acl_to_xattr_type(type);
+	/* ACL xattrs have an empty name; the type alone identifies them. */
+	xattr->v.x_name_len	= 0;
+	xattr->v.x_val_len	= cpu_to_le16(acl_len);
+	acl_header = xattr_val(&xattr->v);
+	acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION);
+	/* Pass 2: emit the entries directly after the header. */
+	outptr = (void *) acl_header + sizeof(*acl_header);
+	acl_for_each_entry(acl, acl_e) {
+		bch_acl_entry *entry = outptr;
+		entry->e_tag = cpu_to_le16(acl_e->e_tag);
+		entry->e_perm = cpu_to_le16(acl_e->e_perm);
+		switch (acl_e->e_tag) {
+		case ACL_USER:
+			entry->e_id = cpu_to_le32(
+				from_kuid(&init_user_ns, acl_e->e_uid));
+			outptr += sizeof(bch_acl_entry);
+			break;
+		case ACL_GROUP:
+			entry->e_id = cpu_to_le32(
+				from_kgid(&init_user_ns, acl_e->e_gid));
+			outptr += sizeof(bch_acl_entry);
+			break;
+		case ACL_USER_OBJ:
+		case ACL_GROUP_OBJ:
+		case ACL_MASK:
+		case ACL_OTHER:
+			/* Short entries carry no e_id. */
+			outptr += sizeof(bch_acl_entry_short);
+			break;
+		}
+	}
+	/* The bytes written must match the size computed in pass 1. */
+	BUG_ON(outptr != xattr_val(&xattr->v) + acl_len);
+	return xattr;
+}
+/*
+ * Look up the ACL of @type stored as an xattr on @dentry's inode.
+ *
+ * @idmap is not used by this function.
+ *
+ * Returns NULL if no such xattr exists (-ENOENT from the lookup) or the
+ * stored ACL is empty, an ERR_PTR on lookup or decode failure, otherwise the
+ * decoded ACL.  Any non-error result (including NULL) is stored via
+ * set_cached_acl().
+ */
+struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap,
+			       struct dentry *dentry, int type)
+{
+	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct btree_trans trans;
+	struct btree_iter *iter;
+	struct bkey_s_c_xattr xattr;
+	struct posix_acl *acl = NULL;
+	bch2_trans_init(&trans, c);
+retry:
+	bch2_trans_begin(&trans);
+	/* ACL xattrs are keyed by type with an empty name. */
+	iter = bch2_hash_lookup(&trans, bch2_xattr_hash_desc,
+			&inode->ei_str_hash, inode->v.i_ino,
+			&X_SEARCH(acl_to_xattr_type(type), "", 0),
+			0);
+	if (IS_ERR(iter)) {
+		/* -EINTR restarts the transaction from bch2_trans_begin(). */
+		if (PTR_ERR(iter) == -EINTR)
+			goto retry;
+		/* -ENOENT is not an error: leave acl NULL. */
+		if (PTR_ERR(iter) != -ENOENT)
+			acl = ERR_CAST(iter);
+		goto out;
+	}
+	xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
+	acl = bch2_acl_from_disk(xattr_val(xattr.v),
+			le16_to_cpu(xattr.v->x_val_len));
+	if (!IS_ERR(acl))
+		set_cached_acl(&inode->v, type, acl);
+out:
+	bch2_trans_exit(&trans);
+	return acl;
+}
+/*
+ * Within @trans, store @acl as the ACL of @type for the inode described by
+ * @inode_u, or delete the stored ACL when @acl is NULL.  A missing entry
+ * (-ENOENT) is reported as success.
+ */
+int bch2_set_acl_trans(struct btree_trans *trans,
+		       struct bch_inode_unpacked *inode_u,
+		       const struct bch_hash_info *hash_info,
+		       struct posix_acl *acl, int type)
+{
+	int ret;
+	/* A default ACL on a non-directory: reject if set, ignore if cleared. */
+	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode_u->bi_mode)) {
+		if (acl)
+			return -EACCES;
+		return 0;
+	}
+	if (!acl) {
+		struct xattr_search_key search =
+			X_SEARCH(acl_to_xattr_type(type), "", 0);
+		ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, hash_info,
+				       inode_u->bi_inum, &search);
+	} else {
+		struct bkey_i_xattr *xattr =
+			bch2_acl_to_xattr(trans, acl, type);
+		if (IS_ERR(xattr))
+			return PTR_ERR(xattr);
+		ret = __bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info,
+				      inode_u->bi_inum, &xattr->k_i, 0);
+	}
+	if (ret == -ENOENT)
+		return 0;
+	return ret;
+}
+/*
+ * Inode-update callback used by bch2_set_acl(): stamps a new ctime and
+ * installs the mode that was smuggled through @p as an integer.
+ */
+static int inode_update_for_set_acl_fn(struct bch_inode_info *inode,
+				       struct bch_inode_unpacked *bi,
+				       void *p)
+{
+	struct bch_fs *fs = inode->v.i_sb->s_fs_info;
+	struct timespec64 stamp = current_time(&inode->v);
+	bi->bi_mode	= (umode_t) (unsigned long) p;
+	bi->bi_ctime	= timespec_to_bch2_time(fs, stamp);
+	return 0;
+}
+/*
+ * Set (or, with @acl == NULL, remove) the ACL of @type on @dentry's inode.
+ *
+ * For a non-NULL access ACL, posix_acl_update_mode() is called first and may
+ * change both the mode and @acl.  The xattr update and the inode write are
+ * then committed in one transaction.  Returns 0 or a negative error code.
+ */
+int bch2_set_acl(struct mnt_idmap *idmap,
+		 struct dentry *dentry,
+		 struct posix_acl *acl, int type)
+{
+	struct bch_inode_info *inode = to_bch_ei(dentry->d_inode);
+	struct bch_fs *c = inode->v.i_sb->s_fs_info;
+	struct btree_trans trans;
+	struct bch_inode_unpacked inode_u;
+	umode_t mode = inode->v.i_mode;
+	int ret;
+	if (type == ACL_TYPE_ACCESS && acl) {
+		ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl);
+		if (ret)
+			return ret;
+	}
+	bch2_trans_init(&trans, c);
+retry:
+	bch2_trans_begin(&trans);
+	/* Each step runs only if the previous one returned 0. */
+	ret   = bch2_set_acl_trans(&trans,
+				   &inode->ei_inode,
+				   &inode->ei_str_hash,
+				   acl, type) ?:
+		bch2_write_inode_trans(&trans, inode, &inode_u,
+				       inode_update_for_set_acl_fn,
+				       (void *)(unsigned long) mode) ?:
+		bch2_trans_commit(&trans, NULL, NULL,
+				  &inode->ei_journal_seq,
+				  BTREE_INSERT_ATOMIC|
+				  BTREE_INSERT_NOUNLOCK);
+	/* -EINTR restarts the whole transaction. */
+	if (ret == -EINTR)
+		goto retry;
+	if (unlikely(ret))
+		goto err;
+	/* Only after a successful commit: refresh the VFS inode and ACL cache. */
+	bch2_inode_update_after_write(c, inode, &inode_u,
+				      ATTR_CTIME|ATTR_MODE);
+	set_cached_acl(&inode->v, type, acl);
+err:
+	bch2_trans_exit(&trans);
+	return ret;
+}
+/*
+ * Within @trans, rewrite the inode's stored access ACL to match @mode.
+ *
+ * Returns 0 without touching *@new_acl when no access ACL is stored
+ * (-ENOENT from the lookup) or the stored ACL decodes to NULL.  On success
+ * the updated ACL is queued with bch2_trans_update() and handed to the
+ * caller through *@new_acl.  Otherwise a negative error code is returned.
+ */
+int bch2_acl_chmod(struct btree_trans *trans,
+		   struct bch_inode_info *inode,
+		   umode_t mode,
+		   struct posix_acl **new_acl)
+{
+	struct btree_iter *iter;
+	struct bkey_s_c_xattr xattr;
+	struct bkey_i_xattr *new;
+	struct posix_acl *acl;
+	int ret = 0;
+	iter = bch2_hash_lookup(trans, bch2_xattr_hash_desc,
+			&inode->ei_str_hash, inode->v.i_ino,
+			&X_SEARCH(BCH_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0),
+			BTREE_ITER_INTENT);
+	if (IS_ERR(iter))
+		return PTR_ERR(iter) != -ENOENT ? PTR_ERR(iter) : 0;
+	xattr = bkey_s_c_to_xattr(bch2_btree_iter_peek_slot(iter));
+	acl = bch2_acl_from_disk(xattr_val(xattr.v),
+			le16_to_cpu(xattr.v->x_val_len));
+	/* For a NULL acl, PTR_ERR() evaluates to 0, i.e. success. */
+	if (IS_ERR_OR_NULL(acl))
+		return PTR_ERR(acl);
+	ret = __posix_acl_chmod(&acl, GFP_KERNEL, mode);
+	if (ret)
+		goto err;
+	new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS);
+	if (IS_ERR(new)) {
+		ret = PTR_ERR(new);
+		goto err;
+	}
+	bch2_trans_update(trans, iter, &new->k_i, 0);
+	/* Ownership moves to the caller; clear acl so it is not freed below. */
+	*new_acl = acl;
+	acl = NULL;
+err:
+	/*
+	 * NOTE(review): acl came from posix_acl_alloc() but is released with
+	 * kfree() rather than posix_acl_release() - confirm this is intended,
+	 * and confirm what __posix_acl_chmod() leaves in acl on failure.
+	 */
+	kfree(acl);
+	return ret;
+}
+#endif /* CONFIG_BCACHEFS_POSIX_ACL */
--- a/fs/bcachefs/acl.h
+++ b/fs/bcachefs/acl.h
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ACL_H
+#define _BCACHEFS_ACL_H
+struct bch_inode_unpacked;
+struct bch_hash_info;
+struct bch_inode_info;
+struct posix_acl;
+#ifdef CONFIG_BCACHEFS_POSIX_ACL
+#define BCH_ACL_VERSION	0x0001
+/*
+ * Full-size on-disk ACL entry (little endian).  acl.c uses this layout for
+ * ACL_USER and ACL_GROUP tags, which carry an id.
+ */
+typedef struct {
+	__le16		e_tag;
+	__le16		e_perm;
+	__le32		e_id;
+} bch_acl_entry;
+/*
+ * Short on-disk ACL entry without an id.  acl.c uses this layout for
+ * ACL_USER_OBJ, ACL_GROUP_OBJ, ACL_MASK and ACL_OTHER tags.
+ */
+typedef struct {
+	__le16		e_tag;
+	__le16		e_perm;
+} bch_acl_entry_short;
+/* Header preceding the entries; a_version is compared with BCH_ACL_VERSION. */
+typedef struct {
+	__le32		a_version;
+} bch_acl_header;
+struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int);
+int bch2_set_acl_trans(struct btree_trans *,
+		       struct bch_inode_unpacked *,
+		       const struct bch_hash_info *,
+		       struct posix_acl *, int);
+int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int);
+int bch2_acl_chmod(struct btree_trans *, struct bch_inode_info *,
+		   umode_t, struct posix_acl **);
+#else
+/* Stub used when CONFIG_BCACHEFS_POSIX_ACL is disabled: does nothing, returns 0. */
+static inline int bch2_set_acl_trans(struct btree_trans *trans,
+				     struct bch_inode_unpacked *inode_u,
+				     const struct bch_hash_info *hash_info,
+				     struct posix_acl *acl, int type)
+{
+	return 0;
+}
+/*
+ * Stub used when CONFIG_BCACHEFS_POSIX_ACL is disabled: returns 0 and leaves
+ * *new_acl untouched.
+ */
+static inline int bch2_acl_chmod(struct btree_trans *trans,
+				 struct bch_inode_info *inode,
+				 umode_t mode,
+				 struct posix_acl **new_acl)
+{
+	return 0;
+}
+#endif /* CONFIG_BCACHEFS_POSIX_ACL */
+#endif /* _BCACHEFS_ACL_H */
--- a/fs/bcachefs/alloc.c
+++ b/fs/bcachefs/alloc.c
--- a/fs/bcachefs/alloc.h
+++ b/fs/bcachefs/alloc.h
+#ifndef _BCACHEFS_ALLOC_H
+#define _BCACHEFS_ALLOC_H
+#include "bcachefs.h"
+#include "alloc_types.h"
+/* Forward declarations for types only used through pointers in this header. */
+struct bkey;
+struct bch_dev;
+struct bch_fs;
+struct bch_devs_list;
+const char *bch2_alloc_invalid(const struct bch_fs *, struct bkey_s_c);
+void bch2_alloc_to_text(struct bch_fs *, char *, size_t, struct bkey_s_c);
+/*
+ * bkey_ops initializer for alloc keys: wires up the validation and
+ * text-formatting callbacks declared above.
+ */
+#define bch2_bkey_alloc_ops (struct bkey_ops) {		\
+	.key_invalid	= bch2_alloc_invalid,		\
+	.val_to_text	= bch2_alloc_to_text,		\
+}
+/*
+ * Fixed-capacity list of devices, returned by value from
+ * bch2_wp_alloc_list().
+ */
+struct dev_alloc_list {
+	unsigned	nr;	/* presumably the number of valid entries in devs[] - confirm */
+	u8		devs[BCH_SB_MEMBERS_MAX];
+};
+struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *,
+					 struct write_point *,
+					 struct bch_devs_mask *);
+void bch2_wp_rescale(struct bch_fs *, struct bch_dev *,
+		     struct write_point *);
+int bch2_alloc_read(struct bch_fs *, struct list_head *);
+int bch2_alloc_replay_key(struct bch_fs *, struct bpos);
+/* Bucket allocation result codes: zero is success, failures are negative. */
+enum bucket_alloc_ret {
+	ALLOC_SUCCESS		= 0,
+	OPEN_BUCKETS_EMPTY	= -1,
+	FREELIST_EMPTY		= -2,	/* Allocator thread not keeping up */
+	NO_DEVICES		= -3,	/* -EROFS */
+};
+long bch2_bucket_alloc_new_fs(struct bch_dev *);
+int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum alloc_reserve, bool,
+		      struct closure *);
+/*
+ * Iterate @_ob over the open buckets in (_wp)->ptrs[], with @_i as the index,
+ * starting at index @_start and stopping at (_wp)->nr_ptrs.
+ */
+#define __writepoint_for_each_ptr(_wp, _ob, _i, _start)			\
+	for ((_i) = (_start);						\
+	     (_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true);	\
+	     (_i)++)
+/* Visit every pointer of the write point, starting from index 0. */
+#define writepoint_for_each_ptr_all(_wp, _ob, _i)			\
+	__writepoint_for_each_ptr(_wp, _ob, _i, 0)
+/*
+ * Visit the pointers from (_wp)->first_ptr onwards.  The start index is taken
+ * from the macro argument, so callers need not name their variable "wp".
+ */
+#define writepoint_for_each_ptr(_wp, _ob, _i)				\
+	__writepoint_for_each_ptr(_wp, _ob, _i, (_wp)->first_ptr)
+void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *);
+/*
+ * Drop one pin on @ob; when the pin count reaches zero, hand the bucket to
+ * __bch2_open_bucket_put().
+ */
+static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob)
+{
+	if (!atomic_dec_and_test(&ob->pin))
+		return;
+	__bch2_open_bucket_put(c, ob);
+}
+/*
+ * Drop the pins recorded in @refs (indices into c->open_buckets) and reset
+ * the count *@nr to zero.
+ */
+static inline void bch2_open_bucket_put_refs(struct bch_fs *c, u8 *nr, u8 *refs)
+{
+	unsigned idx = 0;
+	while (idx < *nr) {
+		bch2_open_bucket_put(c, &c->open_buckets[refs[idx]]);
+		idx++;
+	}
+	*nr = 0;
+}
+/*
+ * Take a pin on each open bucket visited by writepoint_for_each_ptr() on @wp
+ * and append its index within c->open_buckets to @refs, incrementing *@nr.
+ * No bounds check on @refs is done here; the caller must size it.
+ */
+static inline void bch2_open_bucket_get(struct bch_fs *c,
+					struct write_point *wp,
+					u8 *nr, u8 *refs)
+{
+	struct open_bucket *ob;
+	unsigned i;
+	writepoint_for_each_ptr(wp, ob, i) {
+		atomic_inc(&ob->pin);
+		/* Store the bucket as an index, not a pointer. */
+		refs[(*nr)++] = ob - c->open_buckets;
+	}
+}
+struct write_point *bch2_alloc_sectors_start(struct bch_fs *,
+					     unsigned,
+					     struct write_point_specifier,
+					     struct bch_devs_list *,
+					     unsigned, unsigned,
+					     enum alloc_reserve,
+					     unsigned,
+					     struct closure *);
+void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *,
+				    struct bkey_i_extent *, unsigned);
+void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *);
+/*
+ * Wake @ca's allocator thread if one is currently set.  ca->alloc_thread is
+ * read under rcu_read_lock(), and the wakeup happens inside that same
+ * read-side section.
+ */
+static inline void bch2_wake_allocator(struct bch_dev *ca)
+{
+	struct task_struct *p;
+	rcu_read_lock();
+	p = rcu_dereference(ca->alloc_thread);
+	if (p)
+		wake_up_process(p);
+	rcu_read_unlock();
+}
+/* Build a write point specifier from the value @v, with the low bit forced on. */
+static inline struct write_point_specifier writepoint_hashed(unsigned long v)
+{
+	struct write_point_specifier spec = { .v = v | 1 };
+	return spec;
+}
+/* Build a write point specifier holding the address of @wp. */
+static inline struct write_point_specifier writepoint_ptr(struct write_point *wp)
+{
+	struct write_point_specifier spec = { .v = (unsigned long) wp };
+	return spec;
+}
+void bch2_recalc_capacity(struct bch_fs *);
+void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *);
+void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *);
+void bch2_dev_allocator_stop(struct bch_dev *);
+int bch2_dev_allocator_start(struct bch_dev *);
+/* Initialize @wp's data type and lock; all other fields are left untouched. */
+static inline void writepoint_init(struct write_point *wp,
+				   enum bch_data_type type)
+{
+	wp->type = type;
+	mutex_init(&wp->lock);
+}
+int bch2_alloc_write(struct bch_fs *);
+int bch2_fs_allocator_start(struct bch_fs *);
+void bch2_fs_allocator_init(struct bch_fs *);
+#endif /* _BCACHEFS_ALLOC_H */
--- a/fs/bcachefs/alloc_types.h
+++ b/fs/bcachefs/alloc_types.h
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_TYPES_H
+#define _BCACHEFS_ALLOC_TYPES_H
+#include <linux/mutex.h>
+#include <linux/spinlock.h>
+#include "clock_types.h"
+#include "fifo.h"
+/* There's two of these clocks, one for reads and one for writes: */
+struct bucket_clock {
+	/*
+	 * "now" in (read/write) IO time - incremented whenever we do X amount
+	 * of reads or writes.
+	 *
+	 * Goes with the bucket read/write prios: when we read or write to a
+	 * bucket we reset the bucket's prio to the current hand; thus hand -
+	 * prio = time since bucket was last read/written.
+	 *
+	 * The units are some amount (bytes/sectors) of data read/written, and
+	 * the units can change on the fly if we need to rescale to fit
+	 * everything in a u16 - your only guarantee is that the units are
+	 * consistent.
+	 */
+	u16			hand;
+	u16			max_last_io;
+	int			rw;
+	struct io_timer		rescale;
+	struct mutex		lock;
+};
+/* There is one reserve for each type of btree, one for prios and gens
+ * and one for moving GC */
+/*
+ * NOTE(review): the wording above does not map one-to-one onto the
+ * enumerators below - confirm it is still current.  RESERVE_NR is the count
+ * of the non-negative values; RESERVE_ALLOC is deliberately outside that
+ * range (-1).
+ */
+enum alloc_reserve {
+	RESERVE_ALLOC		= -1,
+	RESERVE_BTREE		= 0,
+	RESERVE_MOVINGGC	= 1,
+	RESERVE_NONE		= 2,
+	RESERVE_NR		= 3,
+};
+typedef FIFO(long)	alloc_fifo;
+/* Enough for 16 cache devices, 2 tiers and some left over for pipelining */
+#define OPEN_BUCKETS_COUNT	256
+#define WRITE_POINT_COUNT	32
+/*
+ * A bucket currently open for writing.  Instances live in the
+ * c->open_buckets array and are referred to elsewhere by index (see
+ * bch2_open_bucket_get() in alloc.h).
+ */
+struct open_bucket {
+	spinlock_t		lock;
+	/* Pin count: incremented in bch2_open_bucket_get(), dropped in bch2_open_bucket_put(). */
+	atomic_t		pin;
+	u8			freelist;
+	bool			valid;
+	bool			on_partial_list;
+	unsigned		sectors_free;
+	struct bch_extent_ptr	ptr;
+};
+/*
+ * A write point: a set of open buckets (ptrs[0..nr_ptrs)) that writes are
+ * directed at.  writepoint_for_each_ptr() in alloc.h iterates from first_ptr;
+ * writepoint_for_each_ptr_all() iterates from 0.
+ */
+struct write_point {
+	struct hlist_node	node;
+	struct mutex		lock;
+	u64			last_used;
+	unsigned long		write_point;
+	enum bch_data_type	type;
+	/* Number of valid entries in ptrs[]. */
+	u8			nr_ptrs;
+	/* Index in ptrs[] where writepoint_for_each_ptr() starts. */
+	u8			first_ptr;
+	/* calculated based on how many pointers we're actually going to use: */
+	unsigned		sectors_free;
+	struct open_bucket	*ptrs[BCH_REPLICAS_MAX * 2];
+	u64			next_alloc[BCH_SB_MEMBERS_MAX];
+};
+/*
+ * Opaque handle naming a write point.  alloc.h builds it either from a
+ * struct write_point pointer (writepoint_ptr()) or from an arbitrary value
+ * with the low bit set (writepoint_hashed()).
+ */
+struct write_point_specifier {
+	unsigned long		v;
+};
+struct alloc_heap_entry {
+	size_t			bucket;
+	size_t			nr;
+	unsigned long		key;
+};
+typedef HEAP(struct alloc_heap_entry) alloc_heap;
+#endif /* _BCACHEFS_ALLOC_TYPES_H */
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ b/fs/bcachefs/bcachefs_ioctl.h
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_IOCTL_H
+#define _BCACHEFS_IOCTL_H
+#include <linux/uuid.h>
+#include <asm/ioctl.h>
+#include "bcachefs_format.h"
+/*
+ * Flags common to multiple ioctls:
+ */
+#define BCH_FORCE_IF_DATA_LOST		(1 << 0)
+#define BCH_FORCE_IF_METADATA_LOST	(1 << 1)
+#define BCH_FORCE_IF_DATA_DEGRADED	(1 << 2)
+#define BCH_FORCE_IF_METADATA_DEGRADED	(1 << 3)
+#define BCH_FORCE_IF_DEGRADED			\
+	(BCH_FORCE_IF_DATA_DEGRADED|		\
+	 BCH_FORCE_IF_METADATA_DEGRADED)
+/*
+ * If cleared, ioctls that refer to a device pass it as a pointer to a pathname
+ * (e.g. /dev/sda1); if set, the dev field is the device's index within the
+ * filesystem:
+ */
+#define BCH_BY_INDEX			(1 << 4)
+/*
+ * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem
+ * wide superblock:
+ */
+#define BCH_READ_DEV			(1 << 5)
+/* global control dev: */
+/* These are currently broken, and probably unnecessary: */
+#if 0
+#define BCH_IOCTL_ASSEMBLE	_IOW(0xbc, 1, struct bch_ioctl_assemble)
+#define BCH_IOCTL_INCREMENTAL	_IOW(0xbc, 2, struct bch_ioctl_incremental)
+struct bch_ioctl_assemble {
+	__u32			flags;
+	__u32			nr_devs;
+	__u64			pad;
+	__u64			devs[];
+};
+struct bch_ioctl_incremental {
+	__u32			flags;
+	__u64			pad;
+	__u64			dev;
+};
+#endif
+/* filesystem ioctls: */
+#define BCH_IOCTL_QUERY_UUID	_IOR(0xbc,	1,  struct bch_ioctl_query_uuid)
+/* These only make sense when we also have incremental assembly */
+#if 0
+#define BCH_IOCTL_START		_IOW(0xbc,	2,  struct bch_ioctl_start)
+#define BCH_IOCTL_STOP		_IO(0xbc,	3)
+#endif
+/*
+ * Filesystem ioctl command numbers (ioctl type 0xbc).  Every command gets a
+ * distinct number; number 9 is not assigned here.  DISK_RESIZE previously
+ * reused 13 (DISK_GET_IDX) and is given 14 so the two can never be confused.
+ */
+#define BCH_IOCTL_DISK_ADD	_IOW(0xbc,	4,  struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_REMOVE	_IOW(0xbc,	5,  struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_ONLINE	_IOW(0xbc,	6,  struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_OFFLINE	_IOW(0xbc,	7,  struct bch_ioctl_disk)
+#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc,	8,  struct bch_ioctl_disk_set_state)
+#define BCH_IOCTL_DATA		_IOW(0xbc,	10, struct bch_ioctl_data)
+#define BCH_IOCTL_USAGE		_IOWR(0xbc,	11, struct bch_ioctl_usage)
+#define BCH_IOCTL_READ_SUPER	_IOW(0xbc,	12, struct bch_ioctl_read_super)
+#define BCH_IOCTL_DISK_GET_IDX	_IOW(0xbc,	13,  struct bch_ioctl_disk_get_idx)
+#define BCH_IOCTL_DISK_RESIZE	_IOW(0xbc,	14,  struct bch_ioctl_disk_resize)
+/*
+ * BCH_IOCTL_QUERY_UUID: get filesystem UUID
+ *
+ * Returns user visible UUID, not internal UUID (which may not ever be changed);
+ * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with
+ * this UUID.
+ */
+struct bch_ioctl_query_uuid {
+	__uuid_t		uuid;
+};
+#if 0
+struct bch_ioctl_start {
+	__u32			flags;
+	__u32			pad;
+};
+#endif
+/*
+ * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem
+ *
+ * The specified device must not be open or in use. On success, the new device
+ * will be an online member of the filesystem just like any other member.
+ *
+ * The device must first be prepared by userspace by formatting with a bcachefs
+ * superblock, which is only used for passing in superblock options/parameters
+ * for that device (in struct bch_member). The new device's superblock should
+ * not claim to be a member of any existing filesystem - UUIDs on it will be
+ * ignored.
+ */
+/*
+ * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem
+ *
+ * Any data present on @dev will be permanently deleted, and @dev will be
+ * removed from its slot in the filesystem's list of member devices. The device
+ * may be either online or offline.
+ *
+ * Will fail if removing @dev would leave us with insufficient read write
+ * devices or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_*
+ * flags are set.
+ */
+/*
+ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
+ * but is not open (e.g. because we started in degraded mode), bring it online
+ *
+ * all existing data on @dev will be available once the device is online,
+ * exactly as if @dev was present when the filesystem was first mounted
+ */
+/*
+ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
+ * block device, without removing it from the filesystem (so it can be brought
+ * back online later)
+ *
+ * Data present on @dev will be unavailable while @dev is offline (unless
+ * replicated), but will still be intact and untouched if @dev is brought back
+ * online
+ *
+ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
+ * leave us with insufficient read write devices or degraded/unavailable data,
+ * unless the appropriate BCH_FORCE_IF_* flags are set.
+ */
+/*
+ * Argument for BCH_IOCTL_DISK_ADD, _REMOVE, _ONLINE and _OFFLINE.
+ *
+ * @flags	- BCH_FORCE_IF_* and BCH_BY_INDEX flags defined above
+ * @dev		- pointer to a pathname, or a device index if BCH_BY_INDEX is
+ *		  set (see the BCH_BY_INDEX comment)
+ */
+struct bch_ioctl_disk {
+	__u32			flags;
+	__u32			pad;
+	__u64			dev;
+};
+/*
+ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
+ *
+ * @new_state		- one of the bch_member_state states (rw, ro, failed,
+ *			  spare)
+ *
+ * Will refuse to change member state if we would then have insufficient devices
+ * to write to, or if it would result in degraded data (when @new_state is
+ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
+ */
+struct bch_ioctl_disk_set_state {
+	__u32			flags;
+	__u8			new_state;
+	__u8			pad[3];
+	__u64			dev;
+};
+/*
+ * Job types for BCH_IOCTL_DATA; presumably passed in bch_ioctl_data.op -
+ * confirm against the ioctl handler.  BCH_DATA_OP_NR is the number of ops.
+ */
+enum bch_data_ops {
+	BCH_DATA_OP_SCRUB	= 0,
+	BCH_DATA_OP_REREPLICATE	= 1,
+	BCH_DATA_OP_MIGRATE	= 2,
+	BCH_DATA_OP_NR		= 3,
+};
+/*
+ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
+ * scrub, rereplicate, migrate).
+ *
+ * This ioctl kicks off a job in the background, and returns a file descriptor.
+ * Reading from the file descriptor returns a struct bch_ioctl_data_event,
+ * indicating current progress, and closing the file descriptor will stop the
+ * job. The file descriptor is O_CLOEXEC.
+ */
+struct bch_ioctl_data {
+	__u32			op;
+	__u32			flags;
+	struct bpos		start;
+	struct bpos		end;
+	union {
+	struct {
+		__u32		dev;
+		__u32		pad;
+	}			migrate;
+	struct {
+		__u64		pad[8];
+	};
+	};
+} __attribute__((packed, aligned(8)));
+enum bch_data_event {
+	BCH_DATA_EVENT_PROGRESS	= 0,
+	/* XXX: add an event for reporting errors */
+	BCH_DATA_EVENT_NR	= 1,
+};
+/*
+ * Progress payload carried inside struct bch_ioctl_data_event (member p).
+ * The struct is packed and 8-byte aligned.
+ */
+struct bch_ioctl_data_progress {
+	__u8			data_type;
+	__u8			btree_id;
+	__u8			pad[2];
+	struct bpos		pos;
+	__u64			sectors_done;
+	__u64			sectors_total;
+} __attribute__((packed, aligned(8)));
+/*
+ * Event record read from the file descriptor returned by BCH_IOCTL_DATA (see
+ * the BCH_IOCTL_DATA comment).  @type is presumably an enum bch_data_event
+ * value - confirm.  pad2 fixes the union at 15 * 8 bytes regardless of the
+ * payload.
+ */
+struct bch_ioctl_data_event {
+	__u8			type;
+	__u8			pad[7];
+	union {
+	struct bch_ioctl_data_progress p;
+	__u64			pad2[15];
+	};
+} __attribute__((packed, aligned(8)));
+struct bch_ioctl_dev_usage {
+	__u8			state;
+	__u8			alive;
+	__u8			pad[6];
+	__u32			dev;
+	__u32			bucket_size;
+	__u64			nr_buckets;
+	__u64			buckets[BCH_DATA_NR];
+	__u64			sectors[BCH_DATA_NR];
+};
+struct bch_ioctl_fs_usage {
+	__u64			capacity;
+	__u64			used;
+	__u64			online_reserved;
+	__u64			persistent_reserved[BCH_REPLICAS_MAX];
+	__u64			sectors[BCH_DATA_NR][BCH_REPLICAS_MAX];
+};
+/*
+ * BCH_IOCTL_USAGE: query filesystem disk space usage
+ *
+ * Returns disk space usage broken out by data type, number of replicas, and
+ * by component device
+ *
+ * @nr_devices	- number of devices userspace allocated space for in @devs
+ *
+ * On success, @fs and @devs will be filled out appropriately and devs[i].alive
+ * will indicate if a device was present in that slot
+ *
+ * Returns -ERANGE if @nr_devices was too small
+ */
+/*
+ * Argument for BCH_IOCTL_USAGE.  @devs is a trailing variable-length array
+ * with room for @nr_devices entries; it is declared as a C99 flexible array
+ * member, which leaves sizeof(struct bch_ioctl_usage) unchanged.
+ */
+struct bch_ioctl_usage {
+	__u16			nr_devices;
+	__u16			pad[3];
+	struct bch_ioctl_fs_usage fs;
+	struct bch_ioctl_dev_usage devs[];
+};
+/*
+ * BCH_IOCTL_READ_SUPER: read filesystem superblock
+ *
+ * Equivalent to reading the superblock directly from the block device, except
+ * avoids racing with the kernel writing the superblock or having to figure out
+ * which block device to read
+ *
+ * @sb		- buffer to read into
+ * @size	- size of userspace allocated buffer
+ * @dev		- device to read superblock for, if BCH_READ_DEV flag is
+ *		  specified
+ *
+ * Returns -ERANGE if buffer provided is too small
+ */
+struct bch_ioctl_read_super {
+	__u32			flags;
+	__u32			pad;
+	__u64			dev;
+	__u64			size;
+	__u64			sb;
+};
+/*
+ * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to
+ * determine if disk is a (online) member - if so, returns device's index
+ *
+ * Returns -ENOENT if not found
+ */
+struct bch_ioctl_disk_get_idx {
+	__u64			dev;
+};
+/*
+ * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device
+ *
+ * @dev		- member to resize
+ * @nbuckets	- new number of buckets
+ */
+struct bch_ioctl_disk_resize {
+	__u32			flags;
+	__u32			pad;
+	__u64			dev;
+	__u64			nbuckets;
+};
+#endif /* _BCACHEFS_IOCTL_H */
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
--- a/fs/bcachefs/bkey.h
+++ b/fs/bcachefs/bkey.h
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
+// SPDX-License-Identifier: GPL-2.0
+#include "bcachefs.h"
+#include "bkey_methods.h"
+#include "btree_types.h"
+#include "alloc.h"
+#include "dirent.h"
+#include "error.h"
+#include "extents.h"
+#include "inode.h"
+#include "quota.h"
+#include "xattr.h"
+/*
+ * Per-btree-type key method table, indexed by enum bkey_type.  Each entry is
+ * the bkey_ops initializer macro exported by that type's header.
+ */
+const struct bkey_ops bch2_bkey_ops[] = {
+	[BKEY_TYPE_EXTENTS]	= bch2_bkey_extent_ops,
+	[BKEY_TYPE_INODES]	= bch2_bkey_inode_ops,
+	[BKEY_TYPE_DIRENTS]	= bch2_bkey_dirent_ops,
+	[BKEY_TYPE_XATTRS]	= bch2_bkey_xattr_ops,
+	[BKEY_TYPE_ALLOC]	= bch2_bkey_alloc_ops,
+	[BKEY_TYPE_QUOTAS]	= bch2_bkey_quota_ops,
+	[BKEY_TYPE_BTREE]	= bch2_bkey_btree_ops,
+};
+/*
+ * Validate the value of @k.  Generic key types are checked here; anything at
+ * or above KEY_TYPE_GENERIC_NR is delegated to the btree type's key_invalid
+ * method.  Returns NULL if valid, otherwise a static description.
+ */
+const char *bch2_bkey_val_invalid(struct bch_fs *c, enum bkey_type type,
+				  struct bkey_s_c k)
+{
+	const char *err = NULL;
+	switch (k.k->type) {
+	case KEY_TYPE_DELETED:
+	case KEY_TYPE_DISCARD:
+		/* No value to validate. */
+		break;
+	case KEY_TYPE_ERROR:
+		if (bkey_val_bytes(k.k) != 0)
+			err = "value size should be zero";
+		break;
+	case KEY_TYPE_COOKIE:
+		if (bkey_val_bytes(k.k) != sizeof(struct bch_cookie))
+			err = "incorrect value size";
+		break;
+	default:
+		err = k.k->type < KEY_TYPE_GENERIC_NR
+			? "invalid type"
+			: bch2_bkey_ops[type].key_invalid(c, k);
+		break;
+	}
+	return err;
+}
+/*
+ * Validate the type-independent header fields of @k for a btree of @type.
+ * Returns NULL if they are acceptable, otherwise a static string describing
+ * the first problem found.  The value is not inspected here; see
+ * bch2_bkey_val_invalid().
+ */
+const char *__bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
+			      struct bkey_s_c k)
+{
+	const struct bkey_ops *ops = &bch2_bkey_ops[type];
+	if (k.k->u64s < BKEY_U64s)
+		return "u64s too small";
+	if (!ops->is_extents) {
+		if (k.k->size)
+			return "nonzero size field";
+	} else {
+		/*
+		 * Extents must have zero size if and only if they are deleted.
+		 * This already rejects "zero size but not deleted", so no
+		 * separate check for that case is needed afterwards.
+		 */
+		if ((k.k->size == 0) != bkey_deleted(k.k))
+			return "bad size field";
+	}
+	if (k.k->p.snapshot)
+		return "nonzero snapshot";
+	/* Only BKEY_TYPE_BTREE keys may sit at POS_MAX. */
+	if (type != BKEY_TYPE_BTREE &&
+	    !bkey_cmp(k.k->p, POS_MAX))
+		return "POS_MAX key";
+	return NULL;
+}
+/*
+ * Full key validation: header checks first, then the value checks.  Returns
+ * the first error string found, or NULL if the key is valid.
+ */
+const char *bch2_bkey_invalid(struct bch_fs *c, enum bkey_type type,
+			      struct bkey_s_c k)
+{
+	const char *err = __bch2_bkey_invalid(c, type, k);
+	if (err)
+		return err;
+	return bch2_bkey_val_invalid(c, type, k);
+}
+/*
+ * Check that @k lies within node @b: its start position must not precede the
+ * node's min_key and its position must not exceed max_key.  Returns NULL if
+ * in range, otherwise a static description.
+ */
+const char *bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k)
+{
+	return bkey_cmp(bkey_start_pos(k.k), b->data->min_key) < 0
+		? "key before start of btree node"
+		: bkey_cmp(k.k->p, b->data->max_key) > 0
+		? "key past end of btree node"
+		: NULL;
+}
+/*
+ * Debug-time sanity check of key @k found in btree node @b.  A key that fails
+ * validation or lies outside the node is reported through bch2_fs_bug();
+ * otherwise the btree type's optional key_debugcheck method runs for
+ * non-generic key types.
+ */
+void bch2_bkey_debugcheck(struct bch_fs *c, struct btree *b, struct bkey_s_c k)
+{
+	enum bkey_type type = btree_node_type(b);
+	const struct bkey_ops *ops = &bch2_bkey_ops[type];
+	const char *invalid;
+	BUG_ON(!k.k->u64s);
+	invalid = bch2_bkey_invalid(c, type, k) ?:
+		bch2_bkey_in_btree_node(b, k);
+	if (invalid) {
+		/* Fixed-size scratch buffer; the formatter is given its size. */
+		char buf[160];
+		bch2_bkey_val_to_text(c, type, buf, sizeof(buf), k);
+		bch2_fs_bug(c, "invalid bkey %s: %s", buf, invalid);
+		return;
+	}
+	if (k.k->type >= KEY_TYPE_GENERIC_NR &&
+	    ops->key_debugcheck)
+		ops->key_debugcheck(c, b, k);
+}
+/*
+ * Append formatted text at 'out' (bounded by 'end') and advance 'out'.  Both
+ * names must be local variables of the function using this macro.
+ */
+#define p(...)	(out += scnprintf(out, end - out, __VA_ARGS__))
+/*
+ * Format the header of @k (u64s, type, position, snapshot, size, version)
+ * into @buf, writing at most @size bytes.  Returns the number of characters
+ * written.
+ */
+int bch2_bkey_to_text(char *buf, size_t size, const struct bkey *k)
+{
+	char *out = buf, *end = buf + size;
+	p("u64s %u type %u ", k->u64s, k->type);
+	/* bkey_cmp() == 0 means the position equals POS_MAX. */
+	if (!bkey_cmp(k->p, POS_MAX))
+		p("POS_MAX");
+	else
+		p("%llu:%llu", k->p.inode, k->p.offset);
+	p(" snap %u len %u ver %llu", k->p.snapshot, k->size, k->version.lo);
+	return out - buf;
+}
+/*
+ * Format the value portion of @k into @buf, writing at most @size bytes.
+ *
+ * The four generic key types print a fixed label. Any other type at or above
+ * KEY_TYPE_GENERIC_NR is handed to the btree type's val_to_text hook, if one
+ * is set. Returns the number of bytes the local output cursor advanced.
+ *
+ * NOTE(review): the hook is called with the original @buf/@size and whatever
+ * it writes is not added to the cursor, so the return value is 0 on that
+ * path - confirm whether callers rely on the returned length there.
+ */
+int bch2_val_to_text(struct bch_fs *c, enum bkey_type type,
+		     char *buf, size_t size, struct bkey_s_c k)
+{
+	const struct bkey_ops *ops = &bch2_bkey_ops[type];
+	char *end = buf + size;
+	char *out = buf;
+	const char *label = NULL;
+	switch (k.k->type) {
+	case KEY_TYPE_DELETED:
+		label = " deleted";
+		break;
+	case KEY_TYPE_DISCARD:
+		label = " discard";
+		break;
+	case KEY_TYPE_ERROR:
+		label = " error";
+		break;
+	case KEY_TYPE_COOKIE:
+		label = " cookie";
+		break;
+	default:
+		if (ops->val_to_text && k.k->type >= KEY_TYPE_GENERIC_NR)
+			ops->val_to_text(c, buf, size, k);
+		break;
+	}
+	/* Labels contain no conversion specifiers, so "%s" emits them verbatim: */
+	if (label)
+		p("%s", label);
+	return out - buf;
+}
+/*
+ * Format @k as "<header>: <value>" into @buf, writing at most @size bytes:
+ * the header via bch2_bkey_to_text(), then the separator, then the value via
+ * bch2_val_to_text(). Returns the sum of the lengths those steps reported.
+ */
+int bch2_bkey_val_to_text(struct bch_fs *c, enum bkey_type type,
+			  char *buf, size_t size, struct bkey_s_c k)
+{
+	char *end = buf + size;
+	char *out = buf + bch2_bkey_to_text(buf, size, k.k);
+	p(": ");
+	out += bch2_val_to_text(c, type, out, end - out, k);
+	return out - buf;
+}
+/*
+ * Byte-swap packed key @k with format @f: first the key itself via
+ * bch2_bkey_swab_key(), then the btree type's own swab hook, if one is set.
+ */
+void bch2_bkey_swab(enum bkey_type type,
+		   const struct bkey_format *f,
+		   struct bkey_packed *k)
+{
+	bch2_bkey_swab_key(f, k);
+	if (!bch2_bkey_ops[type].swab)
+		return;
+	bch2_bkey_ops[type].swab(f, k);
+}
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
--- a/fs/bcachefs/btree_gc.h
+++ b/fs/bcachefs/btree_gc.h
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
--- a/fs/bcachefs/btree_update_leaf.c
+++ b/fs/bcachefs/btree_update_leaf.c
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
--- a/fs/bcachefs/chardev.h
+++ b/fs/bcachefs/chardev.h
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CHARDEV_H
+#define _BCACHEFS_CHARDEV_H
+/*
+ * Character-device interface. The real functions are declared unless
+ * NO_BCACHEFS_FS is defined; in that case the inline stubs below stand in
+ * for them.
+ */
+#ifndef NO_BCACHEFS_FS
+long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *);
+void bch2_fs_chardev_exit(struct bch_fs *);
+int bch2_fs_chardev_init(struct bch_fs *);
+void bch2_chardev_exit(void);
+int __init bch2_chardev_init(void);
+#else
+/* Stub: every ioctl is rejected with -ENOSYS. */
+static inline long bch2_fs_ioctl(struct bch_fs *c,
+				unsigned cmd, void __user * arg)
+{
+	return -ENOSYS;
+}
+/* Stubs: the init functions report success (0), the exit functions do nothing. */
+static inline void bch2_fs_chardev_exit(struct bch_fs *c) {}
+static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; }
+static inline void bch2_chardev_exit(void) {}
+static inline int __init bch2_chardev_init(void) { return 0; }
+#endif /* NO_BCACHEFS_FS */
+#endif /* _BCACHEFS_CHARDEV_H */
--- a/fs/bcachefs/checksum.c
+++ b/fs/bcachefs/checksum.c
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
--- a/fs/bcachefs/clock.h
+++ b/fs/bcachefs/clock.h
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_CLOCK_H
+#define _BCACHEFS_CLOCK_H
+void bch2_io_timer_add(struct io_clock *, struct io_timer *);
+void bch2_io_timer_del(struct io_clock *, struct io_timer *);
+void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long,
+				unsigned long);
+void bch2_increment_clock(struct bch_fs *, unsigned, int);
+void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long);
+/*
+ * Statement-expression macro whose value is __ret. __ret starts out as
+ * @timeout and, when ___wait_cond_timeout(condition) is false, is replaced by
+ * the result of __wait_event_timeout(). @timeout and @condition each appear
+ * more than once in the expansion.
+ *
+ * NOTE(review): the expansion passes `wq` to __wait_event_timeout(), but `wq`
+ * is not a macro parameter, and the @clock argument is never used - confirm
+ * which wait queue was intended and whether this macro has any callers.
+ */
+#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\
+({									\
+	long __ret = timeout;						\
+	might_sleep();							\
+	if (!___wait_cond_timeout(condition))				\
+		__ret = __wait_event_timeout(wq, condition, timeout);	\
+	__ret;								\
+})
+void bch2_io_clock_exit(struct io_clock *);
+int bch2_io_clock_init(struct io_clock *);
+#endif /* _BCACHEFS_CLOCK_H */
--- a/fs/bcachefs/clock_types.h
+++ b/fs/bcachefs/clock_types.h
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
--- a/fs/bcachefs/compress.h
+++ b/fs/bcachefs/compress.h
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_COMPRESS_H
+#define _BCACHEFS_COMPRESS_H
+#include "extents_types.h"
+int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
+				struct bch_extent_crc_unpacked *);
+int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
+		       struct bvec_iter, struct bch_extent_crc_unpacked);
+unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *,
+			   struct bio *, size_t *, unsigned);
+int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned);
+void bch2_fs_compress_exit(struct bch_fs *);
+int bch2_fs_compress_init(struct bch_fs *);
+#endif /* _BCACHEFS_COMPRESS_H */
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
--- a/fs/bcachefs/debug.h
+++ b/fs/bcachefs/debug.h
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
--- a/fs/bcachefs/disk_groups.h
+++ b/fs/bcachefs/disk_groups.h
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
--- a/fs/bcachefs/error.h
+++ b/fs/bcachefs/error.h
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
--- a/fs/bcachefs/extents_types.h
+++ b/fs/bcachefs/extents_types.h
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
--- a/fs/bcachefs/fifo.h
+++ b/fs/bcachefs/fifo.h
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
--- a/fs/bcachefs/fs-io.h
+++ b/fs/bcachefs/fs-io.h
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
--- a/fs/bcachefs/fs-ioctl.h
+++ b/fs/bcachefs/fs-ioctl.h
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_FS_IOCTL_H
+#define _BCACHEFS_FS_IOCTL_H
+void bch2_inode_flags_to_vfs(struct bch_inode_info *);
+long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long);
+long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long);
+#endif /* _BCACHEFS_FS_IOCTL_H */
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
--- a/fs/bcachefs/fsck.h
+++ b/fs/bcachefs/fsck.h
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
--- a/fs/bcachefs/io.c
+++ b/fs/bcachefs/io.c
--- a/fs/bcachefs/io.h
+++ b/fs/bcachefs/io.h
--- a/fs/bcachefs/io_types.h
+++ b/fs/bcachefs/io_types.h
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
--- a/fs/bcachefs/journal_io.h
+++ b/fs/bcachefs/journal_io.h
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
--- a/fs/bcachefs/journal_reclaim.h
+++ b/fs/bcachefs/journal_reclaim.h
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
--- a/fs/bcachefs/journal_seq_blacklist.h
+++ b/fs/bcachefs/journal_seq_blacklist.h
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
--- a/fs/bcachefs/keylist.c
+++ b/fs/bcachefs/keylist.c
--- a/fs/bcachefs/keylist.h
+++ b/fs/bcachefs/keylist.h
--- a/fs/bcachefs/keylist_types.h
+++ b/fs/bcachefs/keylist_types.h
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
--- a/fs/bcachefs/migrate.h
+++ b/fs/bcachefs/migrate.h
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
--- a/fs/bcachefs/move_types.h
+++ b/fs/bcachefs/move_types.h
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
--- a/fs/bcachefs/movinggc.h
+++ b/fs/bcachefs/movinggc.h
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
--- a/fs/bcachefs/quota.h
+++ b/fs/bcachefs/quota.h
--- a/fs/bcachefs/quota_types.h
+++ b/fs/bcachefs/quota_types.h
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
--- a/fs/bcachefs/rebalance.h
+++ b/fs/bcachefs/rebalance.h
--- a/fs/bcachefs/rebalance_types.h
+++ b/fs/bcachefs/rebalance_types.h
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
--- a/fs/bcachefs/replicas.h
+++ b/fs/bcachefs/replicas.h
--- a/fs/bcachefs/siphash.c
+++ b/fs/bcachefs/siphash.c
--- a/fs/bcachefs/siphash.h
+++ b/fs/bcachefs/siphash.h
--- a/fs/bcachefs/six.c
+++ b/fs/bcachefs/six.c
--- a/fs/bcachefs/six.h
+++ b/fs/bcachefs/six.h
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
--- a/fs/bcachefs/sysfs.h
+++ b/fs/bcachefs/sysfs.h
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
--- a/fs/bcachefs/tests.h
+++ b/fs/bcachefs/tests.h
--- a/fs/bcachefs/trace.c
+++ b/fs/bcachefs/trace.c
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
--- a/fs/bcachefs/vstructs.h
+++ b/fs/bcachefs/vstructs.h
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
--- a/fs/bcachefs/xattr.h
+++ b/fs/bcachefs/xattr.h