Commit 76e45035 authored by Linus Torvalds

Merge tag 'for-6.1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
 "There's a bunch of performance improvements, most notably the FIEMAP
  speedup, the new block group tree to speed up mount on large
  filesystems, more io_uring integration, some sysfs exports and the
  usual fixes and core updates.

  Summary:

  Performance:

   - outstanding FIEMAP speed improvement (see the FIEMAP sketch after
     this list)
      - algorithmic change in how extents are enumerated leads to an
        orders-of-magnitude speed boost (uncached and cached)
      - extent sharing check speedup (2.2x uncached, 3x cached)
      - add more cancellation points, allowing seeking in files with a
        large number of extents to be interrupted
      - more efficient hole and data seeking (4x uncached, 1.3x cached)
      - sample results:
	    256M, 32K extents:   4s ->  29ms  (~150x)
	    512M, 64K extents:  30s ->  59ms  (~550x)
	    1G,  128K extents: 225s -> 120ms (~1800x)

   - improved inode logging, especially for directories (on dbench
     workload throughput +25%, max latency -21%)

   - improved buffered IO: redundant extent state tracking removed,
     lowering memory consumption and avoiding rb tree traversals

   - add a sysfs tunable to let qgroups temporarily skip exact
     accounting when deleting a snapshot, leading to a speedup but
     requiring a rescan afterwards; this will be used by snapper

   - support io_uring with buffered writes (until now this worked only
     for direct IO); with no-wait semantics implemented in the buffered
     write path it now works and improves IOPS (2x), throughput (2.2x)
     and latency (workload dependent, 2x to 150x); see the liburing
     sketch after this list

   - small performance improvements when dropping and searching for
     extent maps as well as when flushing delalloc in COW mode
     (throughput +5MB/s)
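
   For illustration, the following is a minimal userspace sketch of the
   interface whose enumeration got faster: it walks a file's extents via
   the generic FS_IOC_FIEMAP ioctl (a standard Linux interface, not a
   btrfs-specific call). The 512-extent batch size and the use of
   FIEMAP_FLAG_SYNC are arbitrary choices for the example; timing a loop
   like this on files with many extents is roughly how figures like the
   sample results above are produced.

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>
	#include <linux/fiemap.h>

	int main(int argc, char **argv)
	{
		struct fiemap *fm;
		__u64 start = 0;
		unsigned long total = 0;
		int fd, last = 0;

		if (argc != 2) {
			fprintf(stderr, "usage: %s <file>\n", argv[0]);
			return 1;
		}
		fd = open(argv[1], O_RDONLY);
		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* Header plus room for a batch of 512 extent records. */
		fm = calloc(1, sizeof(*fm) + 512 * sizeof(struct fiemap_extent));
		if (!fm)
			return 1;

		while (!last) {
			memset(fm, 0, sizeof(*fm));
			fm->fm_start = start;
			fm->fm_length = FIEMAP_MAX_OFFSET - start;
			fm->fm_flags = FIEMAP_FLAG_SYNC;	/* flush delalloc first */
			fm->fm_extent_count = 512;

			if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0) {
				perror("fiemap");
				break;
			}
			if (fm->fm_mapped_extents == 0)
				break;

			for (unsigned int i = 0; i < fm->fm_mapped_extents; i++) {
				struct fiemap_extent *fe = &fm->fm_extents[i];

				total++;
				if (fe->fe_flags & FIEMAP_EXTENT_LAST)
					last = 1;
				start = fe->fe_logical + fe->fe_length;
			}
		}
		printf("%lu extents\n", total);
		free(fm);
		close(fd);
		return 0;
	}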
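
   To make the io_uring bullet above concrete, here is a minimal
   userspace sketch of a plain buffered write submitted through io_uring
   (assuming liburing is installed; the file path and the 4K buffer size
   are placeholders). It only demonstrates the submission API; the
   kernel-side change is that such a write can now be completed through
   the no-wait buffered path instead of always being punted to a worker
   thread.

	#include <liburing.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		struct io_uring ring;
		struct io_uring_sqe *sqe;
		struct io_uring_cqe *cqe;
		char buf[4096];
		int fd, ret;

		/* No O_DIRECT: this is a buffered (page cache) write. */
		fd = open("/mnt/btrfs/testfile", O_WRONLY | O_CREAT, 0644);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		memset(buf, 'a', sizeof(buf));

		ret = io_uring_queue_init(8, &ring, 0);
		if (ret) {
			fprintf(stderr, "queue_init: %d\n", ret);
			return 1;
		}

		sqe = io_uring_get_sqe(&ring);
		io_uring_prep_write(sqe, fd, buf, sizeof(buf), 0);
		io_uring_submit(&ring);

		ret = io_uring_wait_cqe(&ring, &cqe);
		if (ret == 0) {
			printf("write returned %d\n", cqe->res);
			io_uring_cqe_seen(&ring, cqe);
		}

		io_uring_queue_exit(&ring);
		close(fd);
		return 0;
	}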

  User visible changes:

   - new incompatible feature block-group-tree adding a dedicated tree
     for tracking block groups; this allows a much faster load during
     mount and avoids the seeking incurred when the items are scattered
     in the extent tree
      - this reduces mount time for many-terabyte sized filesystems
      - a conversion tool will be provided so existing filesystems can
        also be updated in place
      - to reduce the test matrix and feature combinations, it requires
        no-holes and free-space-tree (mkfs defaults since 5.15)

   - improved reporting of super block corruption detected by scrub

   - scrub now also tries to repair the super block and does not wait
     until the next commit

   - discard stats and tunables are exported in sysfs
     (/sys/fs/btrfs/FSID/discard)

   - qgroup status is exported in sysfs
     (/sys/fs/btrfs/FSID/qgroups/); see the sysfs dump sketch after
     this list

   - verify that super block was not modified when thawing filesystem
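
   The exact attribute names under the discard and qgroups directories
   are not listed here and can differ between kernel versions, so the
   sketch below simply dumps every readable attribute in whatever sysfs
   directory it is pointed at; the directory paths come from the
   bullets above.

	#include <stdio.h>
	#include <dirent.h>

	int main(int argc, char **argv)
	{
		char path[4096], buf[256];
		struct dirent *de;
		DIR *dir;
		FILE *f;

		if (argc != 2) {
			fprintf(stderr, "usage: %s <sysfs-directory>\n", argv[0]);
			return 1;
		}
		dir = opendir(argv[1]);
		if (!dir) {
			perror("opendir");
			return 1;
		}
		while ((de = readdir(dir)) != NULL) {
			if (de->d_name[0] == '.')
				continue;
			snprintf(path, sizeof(path), "%s/%s", argv[1], de->d_name);
			f = fopen(path, "r");
			if (!f)
				continue;	/* subdirectory or write-only attribute */
			if (fgets(buf, sizeof(buf), f))
				printf("%-32s %s", de->d_name, buf);
			fclose(f);
		}
		closedir(dir);
		return 0;
	}

   Running it against /sys/fs/btrfs/FSID/discard or
   /sys/fs/btrfs/FSID/qgroups/ (with a real filesystem UUID substituted
   for FSID) prints the exported statistics and tunables one per line.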

  Fixes:

   - FIEMAP fixes
      - fix extent sharing status so it does not depend on the cached
        status where merged
      - flush delalloc so compressed extents are reported correctly

   - fix alignment of VMA for memory mapped files on THP

   - send: fix failures when processing inodes with no links (orphan
     files and directories)

   - fix race between quota enable and quota rescan ioctl

   - handle more corner cases for read-only compat feature verification

   - fix missed extent on fsync after dropping extent maps

  Core:

   - lockdep annotations to validate various transaction states and
     state transitions

   - preliminary support for fs-verity in send

   - more effective memory use in scrub for subpage where sector is
     smaller than page

   - block group caching progress logic has been removed, load is now
     synchronous

   - simplify end IO callbacks and bio handling, use chained bios
     instead of own tracking

   - add no-wait semantics to several functions (tree search, nocow,
     flushing, buffered write); see the nowait sketch after this list

   - cleanups and refactoring
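
   The nowait plumbing adds a "nowait" flag to struct btrfs_path and
   teaches several helpers to return -EAGAIN instead of blocking (see
   the ctree.c hunks further down). A minimal sketch of the calling
   pattern, assuming the usual "try nowait first, then retry blocking"
   fallback; the helper name lookup_item_maybe_nowait is made up for
   illustration, and only read-only searches (no cow) are allowed in
   nowait mode:

	/* Sketch only: would live somewhere under fs/btrfs/. */
	#include "ctree.h"

	static int lookup_item_maybe_nowait(struct btrfs_root *root,
					    const struct btrfs_key *key,
					    bool nowait)
	{
		struct btrfs_path *path;
		int ret;

		path = btrfs_alloc_path();
		if (!path)
			return -ENOMEM;

		path->nowait = nowait;
		ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
		if (ret == -EAGAIN && nowait) {
			/*
			 * Locks or IO would have blocked. In the real code the
			 * error is returned to io_uring, which retries from a
			 * context that may block; here we just retry inline.
			 */
			btrfs_release_path(path);
			path->nowait = 0;
			ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
		}

		btrfs_free_path(path);
		return ret;
	}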

  MM changes:

   - export balance_dirty_pages_ratelimited_flags"
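
   A hedged sketch of why that export matters: a filesystem's nowait
   buffered write path wants to throttle page dirtying without sleeping,
   which the _flags variant allows via BDP_ASYNC (signature as of this
   release; the caller name btrfs_example_throttle is made up):

	#include <linux/fs.h>
	#include <linux/writeback.h>

	static int btrfs_example_throttle(struct address_space *mapping, bool nowait)
	{
		unsigned int flags = nowait ? BDP_ASYNC : 0;

		/*
		 * With BDP_ASYNC set this does not sleep; it returns -EAGAIN
		 * when throttling would have been needed, so a nowait write
		 * can bail out and be retried from a context that may block.
		 */
		return balance_dirty_pages_ratelimited_flags(mapping, flags);
	}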

* tag 'for-6.1-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (177 commits)
  btrfs: set generation before calling btrfs_clean_tree_block in btrfs_init_new_buffer
  btrfs: drop extent map range more efficiently
  btrfs: avoid pointless extent map tree search when flushing delalloc
  btrfs: remove unnecessary next extent map search
  btrfs: remove unnecessary NULL pointer checks when searching extent maps
  btrfs: assert tree is locked when clearing extent map from logging
  btrfs: remove unnecessary extent map initializations
  btrfs: remove the refcount warning/check at free_extent_map()
  btrfs: add helper to replace extent map range with a new extent map
  btrfs: move open coded extent map tree deletion out of inode eviction
  btrfs: use cond_resched_rwlock_write() during inode eviction
  btrfs: use extent_map_end() at btrfs_drop_extent_map_range()
  btrfs: move btrfs_drop_extent_cache() to extent_map.c
  btrfs: fix missed extent on fsync after dropping extent maps
  btrfs: remove stale prototype of btrfs_write_inode
  btrfs: enable nowait async buffered writes
  btrfs: assert nowait mode is not used for some btree search functions
  btrfs: make btrfs_buffered_write nowait compatible
  btrfs: plumb NOWAIT through the write path
  btrfs: make lock_and_cleanup_extent_if_need nowait compatible
  ...
parents 4c0ed7d8 cbddcc4f
......@@ -31,7 +31,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
subpage.o tree-mod-log.o
subpage.o tree-mod-log.o extent-io-tree.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
......
......@@ -1511,16 +1511,118 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
return ret;
}
/**
* Check if an extent is shared or not
/*
* The caller has joined a transaction or is holding a read lock on the
* fs_info->commit_root_sem semaphore, so no need to worry about the root's last
* snapshot field changing while updating or checking the cache.
*/
static bool lookup_backref_shared_cache(struct btrfs_backref_shared_cache *cache,
struct btrfs_root *root,
u64 bytenr, int level, bool *is_shared)
{
struct btrfs_backref_shared_cache_entry *entry;
if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
return false;
/*
* Level -1 is used for the data extent, which is not reliable to cache
* because its reference count can increase or decrease without us
* realizing. We cache results only for extent buffers that lead from
* the root node down to the leaf with the file extent item.
*/
ASSERT(level >= 0);
entry = &cache->entries[level];
/* Unused cache entry or being used for some other extent buffer. */
if (entry->bytenr != bytenr)
return false;
/*
* We cached a false result, but the last snapshot generation of the
* root changed, so we now have a snapshot. Don't trust the result.
*/
if (!entry->is_shared &&
entry->gen != btrfs_root_last_snapshot(&root->root_item))
return false;
/*
* If we cached a true result and the last generation used for dropping
* a root changed, we can not trust the result, because the dropped root
* could be a snapshot sharing this extent buffer.
*/
if (entry->is_shared &&
entry->gen != btrfs_get_last_root_drop_gen(root->fs_info))
return false;
*is_shared = entry->is_shared;
return true;
}
/*
* The caller has joined a transaction or is holding a read lock on the
* fs_info->commit_root_sem semaphore, so no need to worry about the root's last
* snapshot field changing while updating or checking the cache.
*/
static void store_backref_shared_cache(struct btrfs_backref_shared_cache *cache,
struct btrfs_root *root,
u64 bytenr, int level, bool is_shared)
{
struct btrfs_backref_shared_cache_entry *entry;
u64 gen;
if (WARN_ON_ONCE(level >= BTRFS_MAX_LEVEL))
return;
/*
* Level -1 is used for the data extent, which is not reliable to cache
* because its reference count can increase or decrease without us
* realizing. We cache results only for extent buffers that lead from
* the root node down to the leaf with the file extent item.
*/
ASSERT(level >= 0);
if (is_shared)
gen = btrfs_get_last_root_drop_gen(root->fs_info);
else
gen = btrfs_root_last_snapshot(&root->root_item);
entry = &cache->entries[level];
entry->bytenr = bytenr;
entry->is_shared = is_shared;
entry->gen = gen;
/*
* If we found an extent buffer is shared, set the cache result for all
* extent buffers below it to true. As nodes in the path are COWed,
* their sharedness is moved to their children, and if a leaf is COWed,
* then the sharedness of a data extent becomes direct, the refcount of
* data extent is increased in the extent item at the extent tree.
*/
if (is_shared) {
for (int i = 0; i < level; i++) {
entry = &cache->entries[i];
entry->is_shared = is_shared;
entry->gen = gen;
}
}
}
/*
* Check if a data extent is shared or not.
*
* @root: root inode belongs to
* @inum: inode number of the inode whose extent we are checking
* @bytenr: logical bytenr of the extent we are checking
* @roots: list of roots this extent is shared among
* @tmp: temporary list used for iteration
* @root: The root the inode belongs to.
* @inum: Number of the inode whose extent we are checking.
* @bytenr: Logical bytenr of the extent we are checking.
* @extent_gen: Generation of the extent (file extent item) or 0 if it is
* not known.
* @roots: List of roots this extent is shared among.
* @tmp: Temporary list used for iteration.
* @cache: A backref lookup result cache.
*
* btrfs_check_shared uses the backref walking code but will short
* btrfs_is_data_extent_shared uses the backref walking code but will short
* circuit as soon as it finds a root or inode that doesn't match the
* one passed in. This provides a significant performance benefit for
* callers (such as fiemap) which want to know whether the extent is
......@@ -1531,8 +1633,10 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
*
* Return: 0 if extent is not shared, 1 if it is shared, < 0 on error.
*/
int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
struct ulist *roots, struct ulist *tmp)
int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
u64 extent_gen,
struct ulist *roots, struct ulist *tmp,
struct btrfs_backref_shared_cache *cache)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_trans_handle *trans;
......@@ -1545,6 +1649,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
.inum = inum,
.share_count = 0,
};
int level;
ulist_init(roots);
ulist_init(tmp);
......@@ -1561,22 +1666,52 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
btrfs_get_tree_mod_seq(fs_info, &elem);
}
/* -1 means we are in the bytenr of the data extent. */
level = -1;
ULIST_ITER_INIT(&uiter);
while (1) {
bool is_shared;
bool cached;
ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp,
roots, NULL, &shared, false);
if (ret == BACKREF_FOUND_SHARED) {
/* this is the only condition under which we return 1 */
ret = 1;
if (level >= 0)
store_backref_shared_cache(cache, root, bytenr,
level, true);
break;
}
if (ret < 0 && ret != -ENOENT)
break;
ret = 0;
/*
* If our data extent is not shared through reflinks and it was
* created in a generation after the last one used to create a
* snapshot of the inode's root, then it can not be shared
* indirectly through subtrees, as that can only happen with
* snapshots. In this case bail out, no need to check for the
* sharedness of extent buffers.
*/
if (level == -1 &&
extent_gen > btrfs_root_last_snapshot(&root->root_item))
break;
if (level >= 0)
store_backref_shared_cache(cache, root, bytenr,
level, false);
node = ulist_next(tmp, &uiter);
if (!node)
break;
bytenr = node->val;
level++;
cached = lookup_backref_shared_cache(cache, root, bytenr, level,
&is_shared);
if (cached) {
ret = (is_shared ? 1 : 0);
break;
}
shared.share_count = 0;
cond_resched();
}
......
......@@ -17,6 +17,20 @@ struct inode_fs_paths {
struct btrfs_data_container *fspath;
};
struct btrfs_backref_shared_cache_entry {
u64 bytenr;
u64 gen;
bool is_shared;
};
struct btrfs_backref_shared_cache {
/*
* A path from a root to a leaf that has a file extent item pointing to
* a given data extent should never exceed the maximum b+tree height.
*/
struct btrfs_backref_shared_cache_entry entries[BTRFS_MAX_LEVEL];
};
typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
void *ctx);
......@@ -62,8 +76,10 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
u64 start_off, struct btrfs_path *path,
struct btrfs_inode_extref **ret_extref,
u64 *found_off);
int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
struct ulist *roots, struct ulist *tmp_ulist);
int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
u64 extent_gen,
struct ulist *roots, struct ulist *tmp,
struct btrfs_backref_shared_cache *cache);
int __init btrfs_prelim_ref_init(void);
void __cold btrfs_prelim_ref_exit(void);
......
This diff is collapsed.
......@@ -46,19 +46,44 @@ enum btrfs_chunk_alloc_enum {
CHUNK_ALLOC_FORCE_FOR_EXTENT,
};
/* Block group flags set at runtime */
enum btrfs_block_group_flags {
BLOCK_GROUP_FLAG_IREF,
BLOCK_GROUP_FLAG_REMOVED,
BLOCK_GROUP_FLAG_TO_COPY,
BLOCK_GROUP_FLAG_RELOCATING_REPAIR,
BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
};
enum btrfs_caching_type {
BTRFS_CACHE_NO,
BTRFS_CACHE_STARTED,
BTRFS_CACHE_FINISHED,
BTRFS_CACHE_ERROR,
};
struct btrfs_caching_control {
struct list_head list;
struct mutex mutex;
wait_queue_head_t wait;
struct btrfs_work work;
struct btrfs_block_group *block_group;
u64 progress;
refcount_t count;
};
/* Once caching_thread() finds this much free space, it will wake up waiters. */
#define CACHING_CTL_WAKE_UP SZ_2M
/*
* Tree to record all locked full stripes of a RAID5/6 block group
*/
struct btrfs_full_stripe_locks_tree {
struct rb_root root;
struct mutex lock;
};
struct btrfs_block_group {
struct btrfs_fs_info *fs_info;
struct inode *inode;
......@@ -95,23 +120,15 @@ struct btrfs_block_group {
/* For raid56, this is a full stripe, without parity */
unsigned long full_stripe_len;
unsigned long runtime_flags;
unsigned int ro;
unsigned int iref:1;
unsigned int has_caching_ctl:1;
unsigned int removed:1;
unsigned int to_copy:1;
unsigned int relocating_repair:1;
unsigned int chunk_item_inserted:1;
unsigned int zone_is_active:1;
unsigned int zoned_data_reloc_ongoing:1;
int disk_cache_state;
/* Cache tracking stuff */
int cached;
struct btrfs_caching_control *caching_ctl;
u64 last_byte_to_unpin;
struct btrfs_space_info *space_info;
......@@ -305,8 +322,6 @@ void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags);
void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
int btrfs_free_block_groups(struct btrfs_fs_info *info);
void btrfs_wait_space_cache_v1_finished(struct btrfs_block_group *cache,
struct btrfs_caching_control *caching_ctl);
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
struct block_device *bdev, u64 physical, u64 **logical,
int *naddrs, int *stripe_len);
......
......@@ -286,7 +286,7 @@ u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
*/
if (block_rsv == delayed_rsv)
target = global_rsv;
else if (block_rsv != global_rsv && !delayed_rsv->full)
else if (block_rsv != global_rsv && !btrfs_block_rsv_full(delayed_rsv))
target = delayed_rsv;
if (target && block_rsv->space_info != target->space_info)
......@@ -424,6 +424,7 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root)
case BTRFS_CSUM_TREE_OBJECTID:
case BTRFS_EXTENT_TREE_OBJECTID:
case BTRFS_FREE_SPACE_TREE_OBJECTID:
case BTRFS_BLOCK_GROUP_TREE_OBJECTID:
root->block_rsv = &fs_info->delayed_refs_rsv;
break;
case BTRFS_ROOT_TREE_OBJECTID:
......
......@@ -92,4 +92,13 @@ static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
btrfs_block_rsv_release(fs_info, block_rsv, 0, NULL);
}
/*
* Fast path to check if the reserve is full, may be carefully used outside of
* locks.
*/
static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv)
{
return data_race(rsv->full);
}
#endif /* BTRFS_BLOCK_RSV_H */
......@@ -65,6 +65,8 @@ enum {
* on the same file.
*/
BTRFS_INODE_VERITY_IN_PROGRESS,
/* Set when this inode is a free space inode. */
BTRFS_INODE_FREE_SPACE_INODE,
};
/* in memory btrfs inode */
......@@ -94,7 +96,8 @@ struct btrfs_inode {
/* special utility tree used to record which mirrors have already been
* tried when checksums fail for a given block
*/
struct extent_io_tree io_failure_tree;
struct rb_root io_failure_tree;
spinlock_t io_failure_lock;
/*
* Keep track of where the inode has extent items mapped in order to
......@@ -250,11 +253,6 @@ struct btrfs_inode {
struct inode vfs_inode;
};
static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode)
{
return inode->root->fs_info->sectorsize;
}
static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
{
return container_of(inode, struct btrfs_inode, vfs_inode);
......@@ -272,13 +270,6 @@ static inline unsigned long btrfs_inode_hash(u64 objectid,
return (unsigned long)h;
}
static inline void btrfs_insert_inode_hash(struct inode *inode)
{
unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root);
__insert_inode_hash(inode, h);
}
#if BITS_PER_LONG == 32
/*
......@@ -312,13 +303,7 @@ static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
{
struct btrfs_root *root = inode->root;
if (root == root->fs_info->tree_root &&
btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
return true;
return false;
return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags);
}
static inline bool is_data_inode(struct inode *inode)
......
......@@ -152,9 +152,7 @@ static void finish_compressed_bio_read(struct compressed_bio *cb)
}
/* Do io completion on the original bio */
if (cb->status != BLK_STS_OK)
cb->orig_bio->bi_status = cb->status;
bio_endio(cb->orig_bio);
btrfs_bio_end_io(btrfs_bio(cb->orig_bio), cb->status);
/* Finally free the cb struct */
kfree(cb->compressed_pages);
......@@ -166,16 +164,15 @@ static void finish_compressed_bio_read(struct compressed_bio *cb)
* before decompressing it into the original bio and freeing the uncompressed
* pages.
*/
static void end_compressed_bio_read(struct bio *bio)
static void end_compressed_bio_read(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = bio->bi_private;
struct compressed_bio *cb = bbio->private;
struct inode *inode = cb->inode;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_inode *bi = BTRFS_I(inode);
bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) &&
!test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
blk_status_t status = bio->bi_status;
struct btrfs_bio *bbio = btrfs_bio(bio);
blk_status_t status = bbio->bio.bi_status;
struct bvec_iter iter;
struct bio_vec bv;
u32 offset;
......@@ -186,9 +183,8 @@ static void end_compressed_bio_read(struct bio *bio)
if (!status &&
(!csum || !btrfs_check_data_csum(inode, bbio, offset,
bv.bv_page, bv.bv_offset))) {
clean_io_failure(fs_info, &bi->io_failure_tree,
&bi->io_tree, start, bv.bv_page,
btrfs_ino(bi), bv.bv_offset);
btrfs_clean_io_failure(bi, start, bv.bv_page,
bv.bv_offset);
} else {
int ret;
......@@ -209,7 +205,7 @@ static void end_compressed_bio_read(struct bio *bio)
if (refcount_dec_and_test(&cb->pending_ios))
finish_compressed_bio_read(cb);
btrfs_bio_free_csum(bbio);
bio_put(bio);
bio_put(&bbio->bio);
}
/*
......@@ -301,20 +297,20 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
* This also calls the writeback end hooks for the file pages so that metadata
* and checksums can be updated in the file.
*/
static void end_compressed_bio_write(struct bio *bio)
static void end_compressed_bio_write(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = bio->bi_private;
struct compressed_bio *cb = bbio->private;
if (bio->bi_status)
cb->status = bio->bi_status;
if (bbio->bio.bi_status)
cb->status = bbio->bio.bi_status;
if (refcount_dec_and_test(&cb->pending_ios)) {
struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
btrfs_record_physical_zoned(cb->inode, cb->start, bio);
btrfs_record_physical_zoned(cb->inode, cb->start, &bbio->bio);
queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
}
bio_put(bio);
bio_put(&bbio->bio);
}
/*
......@@ -335,7 +331,8 @@ static void end_compressed_bio_write(struct bio *bio)
static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
blk_opf_t opf, bio_end_io_t endio_func,
blk_opf_t opf,
btrfs_bio_end_io_t endio_func,
u64 *next_stripe_start)
{
struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
......@@ -344,12 +341,8 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte
struct bio *bio;
int ret;
bio = btrfs_bio_alloc(BIO_MAX_VECS);
bio = btrfs_bio_alloc(BIO_MAX_VECS, opf, endio_func, cb);
bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
bio->bi_opf = opf;
bio->bi_private = cb;
bio->bi_end_io = endio_func;
em = btrfs_get_chunk_map(fs_info, disk_bytenr, fs_info->sectorsize);
if (IS_ERR(em)) {
......@@ -478,8 +471,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
if (!skip_sum) {
ret = btrfs_csum_one_bio(inode, bio, start, true);
if (ret) {
bio->bi_status = ret;
bio_endio(bio);
btrfs_bio_end_io(btrfs_bio(bio), ret);
break;
}
}
......@@ -596,7 +588,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
}
page_end = (pg_index << PAGE_SHIFT) + PAGE_SIZE - 1;
lock_extent(tree, cur, page_end);
lock_extent(tree, cur, page_end, NULL);
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur);
read_unlock(&em_tree->lock);
......@@ -610,7 +602,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
(cur + fs_info->sectorsize > extent_map_end(em)) ||
(em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
free_extent_map(em);
unlock_extent(tree, cur, page_end);
unlock_extent(tree, cur, page_end, NULL);
unlock_page(page);
put_page(page);
break;
......@@ -630,7 +622,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
add_size = min(em->start + em->len, page_end + 1) - cur;
ret = bio_add_page(cb->orig_bio, page, add_size, offset_in_page(cur));
if (ret != add_size) {
unlock_extent(tree, cur, page_end);
unlock_extent(tree, cur, page_end, NULL);
unlock_page(page);
put_page(page);
break;
......@@ -799,8 +791,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL);
if (ret) {
comp_bio->bi_status = ret;
bio_endio(comp_bio);
btrfs_bio_end_io(btrfs_bio(comp_bio), ret);
break;
}
......@@ -826,8 +817,7 @@ void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
kfree(cb);
out:
free_extent_map(em);
bio->bi_status = ret;
bio_endio(bio);
btrfs_bio_end_io(btrfs_bio(bio), ret);
return;
}
......
......@@ -1447,6 +1447,11 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
return 0;
}
if (p->nowait) {
free_extent_buffer(tmp);
return -EAGAIN;
}
if (unlock_up)
btrfs_unlock_up_safe(p, level + 1);
......@@ -1467,6 +1472,8 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
ret = -EAGAIN;
goto out;
} else if (p->nowait) {
return -EAGAIN;
}
if (unlock_up) {
......@@ -1634,7 +1641,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
* We don't know the level of the root node until we actually
* have it read locked
*/
if (p->nowait) {
b = btrfs_try_read_lock_root_node(root);
if (IS_ERR(b))
return b;
} else {
b = btrfs_read_lock_root_node(root);
}
level = btrfs_header_level(b);
if (level > write_lock_level)
goto out;
......@@ -1910,6 +1923,13 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
WARN_ON(p->nodes[0] != NULL);
BUG_ON(!cow && ins_len);
/*
* For now only allow nowait for read only operations. There's no
* strict reason why we can't, we just only need it for reads so it's
* only implemented for reads.
*/
ASSERT(!p->nowait || !cow);
if (ins_len < 0) {
lowest_unlock = 2;
......@@ -1936,8 +1956,13 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (p->need_commit_sem) {
ASSERT(p->search_commit_root);
if (p->nowait) {
if (!down_read_trylock(&fs_info->commit_root_sem))
return -EAGAIN;
} else {
down_read(&fs_info->commit_root_sem);
}
}
again:
prev_cmp = -1;
......@@ -2081,8 +2106,16 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (level <= write_lock_level) {
btrfs_tree_lock(b);
p->locks[level] = BTRFS_WRITE_LOCK;
} else {
if (p->nowait) {
if (!btrfs_try_tree_read_lock(b)) {
free_extent_buffer(b);
ret = -EAGAIN;
goto done;
}
} else {
btrfs_tree_read_lock(b);
}
p->locks[level] = BTRFS_READ_LOCK;
}
p->nodes[level] = b;
......@@ -2131,6 +2164,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
lowest_level = p->lowest_level;
WARN_ON(p->nodes[0] != NULL);
ASSERT(!p->nowait);
if (p->search_commit_root) {
BUG_ON(time_seq);
......@@ -4432,6 +4466,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
int ret = 1;
int keep_locks = path->keep_locks;
ASSERT(!path->nowait);
path->keep_locks = 1;
again:
cur = btrfs_read_lock_root_node(root);
......@@ -4612,6 +4647,8 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
int ret;
int i;
ASSERT(!path->nowait);
nritems = btrfs_header_nritems(path->nodes[0]);
if (nritems == 0)
return 1;
......
This diff is collapsed.
......@@ -127,9 +127,11 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
}
int btrfs_check_data_free_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len)
struct extent_changeset **reserved, u64 start,
u64 len, bool noflush)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;
int ret;
/* align the range */
......@@ -137,7 +139,12 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode,
round_down(start, fs_info->sectorsize);
start = round_down(start, fs_info->sectorsize);
ret = btrfs_alloc_data_chunk_ondemand(inode, len);
if (noflush)
flush = BTRFS_RESERVE_NO_FLUSH;
else if (btrfs_is_free_space_inode(inode))
flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;
ret = btrfs_reserve_data_bytes(fs_info, len, flush);
if (ret < 0)
return ret;
......@@ -454,7 +461,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
{
int ret;
ret = btrfs_check_data_free_space(inode, reserved, start, len);
ret = btrfs_check_data_free_space(inode, reserved, start, len, false);
if (ret < 0)
return ret;
ret = btrfs_delalloc_reserve_metadata(inode, len, len, false);
......
......@@ -7,7 +7,8 @@ struct extent_changeset;
int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
int btrfs_check_data_free_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
struct extent_changeset **reserved, u64 start, u64 len,
bool noflush);
void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
void btrfs_delalloc_release_space(struct btrfs_inode *inode,
......
This diff is collapsed.
......@@ -16,9 +16,10 @@
#include <linux/refcount.h>
#include "ctree.h"
/* types of the delayed item */
#define BTRFS_DELAYED_INSERTION_ITEM 1
#define BTRFS_DELAYED_DELETION_ITEM 2
enum btrfs_delayed_item_type {
BTRFS_DELAYED_INSERTION_ITEM,
BTRFS_DELAYED_DELETION_ITEM
};
struct btrfs_delayed_root {
spinlock_t lock;
......@@ -73,14 +74,27 @@ struct btrfs_delayed_node {
struct btrfs_delayed_item {
struct rb_node rb_node;
struct btrfs_key key;
/* Offset value of the corresponding dir index key. */
u64 index;
struct list_head tree_list; /* used for batch insert/delete items */
struct list_head readdir_list; /* used for readdir items */
/*
* Used when logging a directory.
* Insertions and deletions to this list are protected by the parent
* delayed node's mutex.
*/
struct list_head log_list;
u64 bytes_reserved;
struct btrfs_delayed_node *delayed_node;
refcount_t refs;
int ins_or_del;
u32 data_len;
enum btrfs_delayed_item_type type:8;
/*
* Track if this delayed item was already logged.
* Protected by the mutex of the parent delayed inode.
*/
bool logged;
/* The maximum leaf size is 64K, so u16 is more than enough. */
u16 data_len;
char data[];
};
......@@ -144,6 +158,14 @@ int btrfs_should_delete_dir_index(struct list_head *del_list,
int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
struct list_head *ins_list);
/* Used during directory logging. */
void btrfs_log_get_delayed_items(struct btrfs_inode *inode,
struct list_head *ins_list,
struct list_head *del_list);
void btrfs_log_put_delayed_items(struct btrfs_inode *inode,
struct list_head *ins_list,
struct list_head *del_list);
/* for init */
int __init btrfs_delayed_inode_init(void);
void __cold btrfs_delayed_inode_exit(void);
......
......@@ -545,10 +545,7 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
if (!cache)
continue;
spin_lock(&cache->lock);
cache->to_copy = 1;
spin_unlock(&cache->lock);
set_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
btrfs_put_block_group(cache);
}
if (iter_ret < 0)
......@@ -577,7 +574,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
return true;
spin_lock(&cache->lock);
if (cache->removed) {
if (test_bit(BLOCK_GROUP_FLAG_REMOVED, &cache->runtime_flags)) {
spin_unlock(&cache->lock);
return true;
}
......@@ -610,9 +607,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
}
/* Last stripe on this device */
spin_lock(&cache->lock);
cache->to_copy = 0;
spin_unlock(&cache->lock);
clear_bit(BLOCK_GROUP_FLAG_TO_COPY, &cache->runtime_flags);
return true;
}
......@@ -1288,11 +1283,6 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
return 1;
}
void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
{
percpu_counter_inc(&fs_info->dev_replace.bio_counter);
}
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
{
percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
......
......@@ -7,6 +7,10 @@
#define BTRFS_DEV_REPLACE_H
struct btrfs_ioctl_dev_replace_args;
struct btrfs_fs_info;
struct btrfs_trans_handle;
struct btrfs_dev_replace;
struct btrfs_block_group;
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans);
......
This diff is collapsed.
......@@ -46,10 +46,13 @@ int __cold open_ctree(struct super_block *sb,
struct btrfs_fs_devices *fs_devices,
char *options);
void __cold close_ctree(struct btrfs_fs_info *fs_info);
int btrfs_validate_super(struct btrfs_fs_info *fs_info,
struct btrfs_super_block *sb, int mirror_num);
int btrfs_check_features(struct btrfs_fs_info *fs_info, struct super_block *sb);
int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors);
struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev);
struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev,
int copy_num);
int copy_num, bool drop_cache);
int btrfs_commit_super(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
struct btrfs_key *key);
......@@ -103,7 +106,7 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
{
if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
return fs_info->block_group_root;
return btrfs_extent_root(fs_info, 0);
}
......
This diff is collapsed.
......@@ -17,7 +17,6 @@ struct io_failure_record;
#define EXTENT_NODATASUM (1U << 7)
#define EXTENT_CLEAR_META_RESV (1U << 8)
#define EXTENT_NEED_WAIT (1U << 9)
#define EXTENT_DAMAGED (1U << 10)
#define EXTENT_NORESERVE (1U << 11)
#define EXTENT_QGROUP_RESERVED (1U << 12)
#define EXTENT_CLEAR_DATA_RESV (1U << 13)
......@@ -35,10 +34,18 @@ struct io_failure_record;
* delalloc bytes decremented, in an atomic way to prevent races with stat(2).
*/
#define EXTENT_ADD_INODE_BYTES (1U << 15)
/*
* Set during truncate when we're clearing an entire range and we just want the
* extent states to go away.
*/
#define EXTENT_CLEAR_ALL_BITS (1U << 16)
#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
EXTENT_CLEAR_DATA_RESV)
#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | \
EXTENT_ADD_INODE_BYTES)
EXTENT_ADD_INODE_BYTES | \
EXTENT_CLEAR_ALL_BITS)
/*
* Redefined bits above which are used only in the device allocation tree,
......@@ -56,7 +63,6 @@ enum {
IO_TREE_FS_EXCLUDED_EXTENTS,
IO_TREE_BTREE_INODE_IO,
IO_TREE_INODE_IO,
IO_TREE_INODE_IO_FAILURE,
IO_TREE_RELOC_BLOCKS,
IO_TREE_TRANS_DIRTY_PAGES,
IO_TREE_ROOT_DIRTY_LOG_PAGES,
......@@ -70,8 +76,6 @@ struct extent_io_tree {
struct rb_root state;
struct btrfs_fs_info *fs_info;
void *private_data;
u64 dirty_bytes;
bool track_uptodate;
/* Who owns this io tree, should be one of IO_TREE_* */
u8 owner;
......@@ -89,33 +93,23 @@ struct extent_state {
refcount_t refs;
u32 state;
struct io_failure_record *failrec;
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
#endif
};
int __init extent_state_cache_init(void);
void __cold extent_state_cache_exit(void);
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner,
void *private_data);
void extent_io_tree_release(struct extent_io_tree *tree);
int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached);
static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
{
return lock_extent_bits(tree, start, end, NULL);
}
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int __init extent_io_init(void);
void __cold extent_io_exit(void);
int __init extent_state_init_cachep(void);
void __cold extent_state_free_cachep(void);
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
......@@ -126,72 +120,66 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, int filled, struct extent_state *cached_state);
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset);
int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, int wake, int delete,
struct extent_state **cached);
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, int wake, int delete,
struct extent_state **cached, gfp_t mask,
u32 bits, struct extent_state **cached, gfp_t mask,
struct extent_changeset *changeset);
static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start,
u64 end, u32 bits,
struct extent_state **cached)
{
return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL);
return __clear_extent_bit(tree, start, end, bits, cached,
GFP_NOFS, NULL);
}
static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached)
static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached)
{
return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached,
GFP_NOFS, NULL);
}
static inline int unlock_extent_cached_atomic(struct extent_io_tree *tree,
u64 start, u64 end, struct extent_state **cached)
static inline int unlock_extent_atomic(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached)
{
return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached,
GFP_ATOMIC, NULL);
}
static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
u64 end, u32 bits)
{
int wake = 0;
if (bits & EXTENT_LOCKED)
wake = 1;
return clear_extent_bit(tree, start, end, bits, wake, 0, NULL);
return clear_extent_bit(tree, start, end, bits, NULL);
}
int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_changeset *changeset);
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, unsigned exclusive_bits, u64 *failed_start,
struct extent_state **cached_state, gfp_t mask,
struct extent_changeset *changeset);
int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits);
u32 bits, struct extent_state **cached_state, gfp_t mask);
static inline int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start,
u64 end, u32 bits)
{
return set_extent_bit(tree, start, end, bits, NULL, GFP_NOWAIT);
}
static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
u64 end, u32 bits)
{
return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
NULL);
return set_extent_bit(tree, start, end, bits, NULL, GFP_NOFS);
}
static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state)
{
return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
cached_state, GFP_NOFS, NULL);
}
static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
u64 end, gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, NULL,
mask, NULL);
return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, mask);
}
static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
......@@ -199,7 +187,7 @@ static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
{
return clear_extent_bit(tree, start, end,
EXTENT_DIRTY | EXTENT_DELALLOC |
EXTENT_DO_ACCOUNTING, 0, 0, cached);
EXTENT_DO_ACCOUNTING, cached);
}
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
......@@ -211,30 +199,29 @@ static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
EXTENT_DELALLOC | EXTENT_UPTODATE | extra_bits,
0, NULL, cached_state, GFP_NOFS, NULL);
EXTENT_DELALLOC | extra_bits,
cached_state, GFP_NOFS);
}
static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state)
{
return set_extent_bit(tree, start, end,
EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
0, NULL, cached_state, GFP_NOFS, NULL);
EXTENT_DELALLOC | EXTENT_DEFRAG,
cached_state, GFP_NOFS);
}
static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
u64 end)
{
return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, NULL,
GFP_NOFS, NULL);
return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS);
}
static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state, gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
cached_state, mask, NULL);
return set_extent_bit(tree, start, end, EXTENT_UPTODATE,
cached_state, mask);
}
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
......@@ -244,24 +231,9 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits);
int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits);
int extent_invalidate_folio(struct extent_io_tree *tree,
struct folio *folio, size_t offset);
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
/* This should be reworked in the future and put elsewhere. */
struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start);
int set_state_failrec(struct extent_io_tree *tree, u64 start,
struct io_failure_record *failrec);
void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start,
u64 end);
int free_io_failure(struct extent_io_tree *failure_tree,
struct extent_io_tree *io_tree,
struct io_failure_record *rec);
int clean_io_failure(struct btrfs_fs_info *fs_info,
struct extent_io_tree *failure_tree,
struct extent_io_tree *io_tree, u64 start,
struct page *page, u64 ino, unsigned int pg_offset);
void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits);
#endif /* BTRFS_EXTENT_IO_TREE_H */
......@@ -2220,6 +2220,12 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
}
if (!mutex_trylock(&head->mutex)) {
if (path->nowait) {
spin_unlock(&delayed_refs->lock);
btrfs_put_transaction(cur_trans);
return -EAGAIN;
}
refcount_inc(&head->refs);
spin_unlock(&delayed_refs->lock);
......@@ -2686,13 +2692,8 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info,
len = cache->start + cache->length - start;
len = min(len, end + 1 - start);
down_read(&fs_info->commit_root_sem);
if (start < cache->last_byte_to_unpin && return_free_space) {
u64 add_len = min(len, cache->last_byte_to_unpin - start);
btrfs_add_free_space(cache, start, add_len);
}
up_read(&fs_info->commit_root_sem);
if (return_free_space)
btrfs_add_free_space(cache, start, len);
start += len;
total_unpinned += len;
......@@ -3804,7 +3805,8 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
block_group->start == fs_info->data_reloc_bg ||
fs_info->data_reloc_bg == 0);
if (block_group->ro || block_group->zoned_data_reloc_ongoing) {
if (block_group->ro ||
test_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags)) {
ret = 1;
goto out;
}
......@@ -3881,7 +3883,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
* regular extents) at the same time to the same zone, which
* easily break the write pointer.
*/
block_group->zoned_data_reloc_ongoing = 1;
set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &block_group->runtime_flags);
fs_info->data_reloc_bg = 0;
}
spin_unlock(&fs_info->relocation_bg_lock);
......@@ -4888,6 +4890,9 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
!test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
lockdep_owner = BTRFS_FS_TREE_OBJECTID;
/* btrfs_clean_tree_block() accesses generation field. */
btrfs_set_header_generation(buf, trans->transid);
/*
* This needs to stay, because we could allocate a freed block from an
* old tree into a new tree, so we need to make sure this new block is
......@@ -5639,6 +5644,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
*/
int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
{
const bool is_reloc_root = (root->root_key.objectid ==
BTRFS_TREE_RELOC_OBJECTID);
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct btrfs_trans_handle *trans;
......@@ -5798,6 +5805,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
goto out_end_trans;
}
if (!is_reloc_root)
btrfs_set_last_root_drop_gen(fs_info, trans->transid);
btrfs_end_transaction_throttle(trans);
if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
btrfs_debug(fs_info,
......@@ -5832,7 +5842,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
goto out_end_trans;
}
if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
if (!is_reloc_root) {
ret = btrfs_find_root(tree_root, &root->root_key, path,
NULL, NULL);
if (ret < 0) {
......@@ -5864,6 +5874,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
btrfs_put_root(root);
root_dropped = true;
out_end_trans:
if (!is_reloc_root)
btrfs_set_last_root_drop_gen(fs_info, trans->transid);
btrfs_end_transaction_throttle(trans);
out_free:
kfree(wc);
......
This diff is collapsed.
......@@ -60,11 +60,13 @@ enum {
struct btrfs_bio;
struct btrfs_root;
struct btrfs_inode;
struct btrfs_io_bio;
struct btrfs_fs_info;
struct io_failure_record;
struct extent_io_tree;
int __init extent_buffer_init_cachep(void);
void __cold extent_buffer_free_cachep(void);
typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
int mirror_num,
enum btrfs_compression_type compress_type);
......@@ -240,10 +242,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
u32 bits_to_clear, unsigned long page_ops);
int extent_invalidate_folio(struct extent_io_tree *tree,
struct folio *folio, size_t offset);
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
......@@ -257,8 +259,12 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
* bio end_io callback is called to indicate things have failed.
*/
struct io_failure_record {
/* Use rb_simple_node for search/insert */
struct {
struct rb_node rb_node;
u64 bytenr;
};
struct page *page;
u64 start;
u64 len;
u64 logical;
int this_mirror;
......@@ -269,6 +275,9 @@ struct io_failure_record {
int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
u32 bio_offset, struct page *page, unsigned int pgoff,
submit_bio_hook_t *submit_bio_hook);
void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end);
int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
struct page *page, unsigned int pg_offset);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
......
This diff is collapsed.
......@@ -63,6 +63,8 @@ struct extent_map_tree {
rwlock_t lock;
};
struct btrfs_inode;
static inline int extent_map_in_tree(const struct extent_map *em)
{
return !RB_EMPTY_NODE(&em->rb_node);
......@@ -104,5 +106,11 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
struct extent_map_tree *em_tree,
struct extent_map **em_in, u64 start, u64 len);
void btrfs_drop_extent_map_range(struct btrfs_inode *inode,
u64 start, u64 end,
bool skip_pinned);
int btrfs_replace_extent_map_range(struct btrfs_inode *inode,
struct extent_map *new_em,
bool modified);
#endif
......@@ -118,7 +118,7 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
return 0;
return clear_extent_bit(&inode->file_extent_tree, start,
start + len - 1, EXTENT_DIRTY, 0, 0, NULL);
start + len - 1, EXTENT_DIRTY, NULL);
}
static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
......@@ -129,12 +129,20 @@ static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
return ncsums * fs_info->sectorsize;
}
int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
/*
* Calculate the total size needed to allocate for an ordered sum structure
* spanning @bytes in the file.
*/
static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes)
{
int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize);
return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size;
}
int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 objectid, u64 pos,
u64 disk_offset, u64 disk_num_bytes,
u64 num_bytes, u64 offset, u64 ram_bytes,
u8 compression, u8 encryption, u16 other_encoding)
u64 objectid, u64 pos, u64 num_bytes)
{
int ret = 0;
struct btrfs_file_extent_item *item;
......@@ -157,16 +165,16 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
item = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset);
btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes);
btrfs_set_file_extent_offset(leaf, item, offset);
btrfs_set_file_extent_disk_bytenr(leaf, item, 0);
btrfs_set_file_extent_disk_num_bytes(leaf, item, 0);
btrfs_set_file_extent_offset(leaf, item, 0);
btrfs_set_file_extent_num_bytes(leaf, item, num_bytes);
btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes);
btrfs_set_file_extent_ram_bytes(leaf, item, num_bytes);
btrfs_set_file_extent_generation(leaf, item, trans->transid);
btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
btrfs_set_file_extent_compression(leaf, item, compression);
btrfs_set_file_extent_encryption(leaf, item, encryption);
btrfs_set_file_extent_other_encoding(leaf, item, other_encoding);
btrfs_set_file_extent_compression(leaf, item, 0);
btrfs_set_file_extent_encryption(leaf, item, 0);
btrfs_set_file_extent_other_encoding(leaf, item, 0);
btrfs_mark_buffer_dirty(leaf);
out:
......@@ -503,7 +511,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit)
struct list_head *list, int search_commit,
bool nowait)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
......@@ -525,6 +534,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
if (!path)
return -ENOMEM;
path->nowait = nowait;
if (search_commit) {
path->skip_locking = 1;
path->reada = READA_FORWARD;
......
This diff is collapsed.
This diff is collapsed.
......@@ -113,7 +113,6 @@ int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
int btrfs_remove_free_space(struct btrfs_block_group *block_group,
u64 bytenr, u64 size);
void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group);
bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group);
u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group,
......
......@@ -1453,8 +1453,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
ASSERT(key.objectid < end && key.objectid + key.offset <= end);
caching_ctl->progress = key.objectid;
offset = key.objectid;
while (offset < key.objectid + key.offset) {
bit = free_space_test_bit(block_group, path, offset);
......@@ -1490,8 +1488,6 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
goto out;
}
caching_ctl->progress = (u64)-1;
ret = 0;
out:
return ret;
......@@ -1531,8 +1527,6 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
ASSERT(key.objectid < end && key.objectid + key.offset <= end);
caching_ctl->progress = key.objectid;
total_found += add_new_free_space(block_group, key.objectid,
key.objectid + key.offset);
if (total_found > CACHING_CTL_WAKE_UP) {
......@@ -1552,8 +1546,6 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
goto out;
}
caching_ctl->progress = (u64)-1;
ret = 0;
out:
return ret;
......
This diff is collapsed.
......@@ -1218,10 +1218,10 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
/* get the big lock and read metadata off disk */
if (!locked)
lock_extent_bits(io_tree, start, end, &cached);
lock_extent(io_tree, start, end, &cached);
em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
if (!locked)
unlock_extent_cached(io_tree, start, end, &cached);
unlock_extent(io_tree, start, end, &cached);
if (IS_ERR(em))
return NULL;
......@@ -1333,9 +1333,9 @@ static struct page *defrag_prepare_one_page(struct btrfs_inode *inode,
while (1) {
struct btrfs_ordered_extent *ordered;
lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
unlock_extent_cached(&inode->io_tree, page_start, page_end,
unlock_extent(&inode->io_tree, page_start, page_end,
&cached_state);
if (!ordered)
break;
......@@ -1616,7 +1616,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
return ret;
clear_extent_bit(&inode->io_tree, start, start + len - 1,
EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
EXTENT_DEFRAG, 0, 0, cached_state);
EXTENT_DEFRAG, cached_state);
set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
/* Update the page status */
......@@ -1666,7 +1666,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
wait_on_page_writeback(pages[i]);
/* Lock the pages range */
lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT,
lock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
(last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
&cached_state);
/*
......@@ -1694,7 +1694,7 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
kfree(entry);
}
unlock_extent:
unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT,
unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT,
(last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
&cached_state);
free_pages:
......
......@@ -285,6 +285,31 @@ struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
return eb;
}
/*
* Loop around taking references on and locking the root node of the tree in
* nowait mode until we end up with a lock on the root node or returning to
* avoid blocking.
*
* Return: root extent buffer with read lock held or -EAGAIN.
*/
struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root)
{
struct extent_buffer *eb;
while (1) {
eb = btrfs_root_node(root);
if (!btrfs_try_tree_read_lock(eb)) {
free_extent_buffer(eb);
return ERR_PTR(-EAGAIN);
}
if (eb == root->node)
break;
btrfs_tree_read_unlock(eb);
free_extent_buffer(eb);
}
return eb;
}
/*
* DREW locks
* ==========
......
......@@ -94,6 +94,7 @@ int btrfs_try_tree_read_lock(struct extent_buffer *eb);
int btrfs_try_tree_write_lock(struct extent_buffer *eb);
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root);
struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root);
#ifdef CONFIG_BTRFS_DEBUG
static inline void btrfs_assert_tree_write_locked(struct extent_buffer *eb)
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -270,11 +270,8 @@ int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
u64 ino = btrfs_ino(BTRFS_I(inode));
int ret;
ret = iterate_object_props(root, path, ino, inode_prop_iterator, inode);
return ret;
return iterate_object_props(root, path, ino, inode_prop_iterator, inode);
}
static int prop_compression_validate(const struct btrfs_inode *inode,
......
This diff is collapsed.
......@@ -100,6 +100,9 @@
* subtree rescan for them.
*/
#define BTRFS_QGROUP_RUNTIME_FLAG_CANCEL_RESCAN (1UL << 3)
#define BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING (1UL << 4)
/*
* Record a dirty extent, and info qgroup to update quota on it
* TODO: Use kmem cache to alloc it.
......