Commit 6ca1765b authored by Liu Bo, committed by David Sterba

Btrfs: scrub: batch rebuild for raid56

In the case of raid56, writes and rebuilds always use BTRFS_STRIPE_LEN (64K)
as their unit, while scrub_extent() uses blocksize as its unit, so the
rebuild process may be triggered once for every block of a stripe.

A typical example: when we are replacing a disk that has disappeared, every
read of it returns -EIO, so each block (4K when blocksize is 4K) goes
through the following path,

scrub_handle_errored_block
  scrub_recheck_block # re-read pages one by one
  scrub_recheck_block # rebuild by calling raid56_parity_recover()
                        page by page

Although the raid56 stripe cache lets most reads during rebuild be avoided,
the parity recovery calculation (xor or the raid6 algorithms) still has to
be done (BTRFS_STRIPE_LEN / blocksize) times.
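To put numbers on it, here is a back-of-the-envelope model as standalone
userspace C (not kernel code; the 4K blocksize is just the common default):

#include <stdio.h>

#define BTRFS_STRIPE_LEN (64 * 1024)	/* raid56 full-stripe unit: 64K */

int main(void)
{
	unsigned int blocksize = 4096;	/* typical sectorsize; illustrative */

	/* Per-block rebuild: one parity recover per block of the stripe. */
	printf("recover calls per stripe, per-block: %u\n",
	       BTRFS_STRIPE_LEN / blocksize);

	/* Batched rebuild: the whole stripe is recovered in one go. */
	printf("recover calls per stripe, batched:   1\n");
	return 0;
}

With 4K blocks this prints 16 versus 1, which is exactly the redundant
xor/raid6 work this patch removes.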

This patch makes scrub smarter by doing raid56 scrub/replace at stripe-length
granularity.
Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
Signed-off-by: David Sterba <dsterba@suse.com>
parent 416a7202
fs/btrfs/scrub.c
@@ -1727,6 +1727,45 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
 	return blk_status_to_errno(bio->bi_status);
 }
 
+static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
+					  struct scrub_block *sblock)
+{
+	struct scrub_page *first_page = sblock->pagev[0];
+	struct bio *bio;
+	int page_num;
+
+	/* All pages in sblock belong to the same stripe on the same device. */
+	ASSERT(first_page->dev);
+	if (!first_page->dev->bdev)
+		goto out;
+
+	bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
+	bio_set_dev(bio, first_page->dev->bdev);
+
+	for (page_num = 0; page_num < sblock->page_count; page_num++) {
+		struct scrub_page *page = sblock->pagev[page_num];
+
+		WARN_ON(!page->page);
+		bio_add_page(bio, page->page, PAGE_SIZE, 0);
+	}
+
+	if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
+		bio_put(bio);
+		goto out;
+	}
+
+	bio_put(bio);
+
+	scrub_recheck_block_checksum(sblock);
+
+	return;
+out:
+	for (page_num = 0; page_num < sblock->page_count; page_num++)
+		sblock->pagev[page_num]->io_error = 1;
+
+	sblock->no_io_error_seen = 0;
+}
+
 /*
  * this function will check the on disk data for checksum errors, header
  * errors and read I/O errors. If any I/O errors happen, the exact pages
@@ -1742,6 +1781,10 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 
 	sblock->no_io_error_seen = 1;
 
+	/* short cut for raid56 */
+	if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
+		return scrub_recheck_block_on_raid56(fs_info, sblock);
+
 	for (page_num = 0; page_num < sblock->page_count; page_num++) {
 		struct bio *bio;
 		struct scrub_page *page = sblock->pagev[page_num];
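The control flow these two hunks introduce can be summarized in a small
userspace sketch (the model_* types and the submit callback are hypothetical
stand-ins for scrub_block/scrub_page and scrub_submit_raid56_bio_wait(); this
is an illustration, not the kernel API):

/* Minimal model of the batched recheck: one submission covers the
 * whole stripe, so parity recovery runs once rather than per page. */
struct model_page { int io_error; };

struct model_block {
	struct model_page *pagev;
	int page_count;
	int no_io_error_seen;
};

/* submit() returns 0 on success, non-zero on I/O failure. */
static void model_recheck_on_raid56(struct model_block *sblock,
				    int (*submit)(struct model_page *, int))
{
	if (submit(sblock->pagev, sblock->page_count)) {
		/* On failure, mark every page of the stripe as errored,
		 * mirroring the out: path of the new helper. */
		for (int i = 0; i < sblock->page_count; i++)
			sblock->pagev[i].io_error = 1;
		sblock->no_io_error_seen = 0;
	}
}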
@@ -1757,19 +1800,12 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 		bio_set_dev(bio, page->dev->bdev);
 		bio_add_page(bio, page->page, PAGE_SIZE, 0);
+		bio->bi_iter.bi_sector = page->physical >> 9;
+		bio->bi_opf = REQ_OP_READ;
 
-		if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
-			if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) {
-				page->io_error = 1;
-				sblock->no_io_error_seen = 0;
-			}
-		} else {
-			bio->bi_iter.bi_sector = page->physical >> 9;
-			bio_set_op_attrs(bio, REQ_OP_READ, 0);
-
-			if (btrfsic_submit_bio_wait(bio)) {
-				page->io_error = 1;
-				sblock->no_io_error_seen = 0;
-			}
+		if (btrfsic_submit_bio_wait(bio)) {
+			page->io_error = 1;
+			sblock->no_io_error_seen = 0;
 		}
 
 		bio_put(bio);
@@ -2737,7 +2773,8 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
 }
 
 /* scrub extent tries to collect up to 64 kB for each bio */
-static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
+static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
+			u64 logical, u64 len,
 			u64 physical, struct btrfs_device *dev, u64 flags,
 			u64 gen, int mirror_num, u64 physical_for_dev_replace)
 {
@@ -2746,13 +2783,19 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
 	u32 blocksize;
 
 	if (flags & BTRFS_EXTENT_FLAG_DATA) {
-		blocksize = sctx->fs_info->sectorsize;
+		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+			blocksize = map->stripe_len;
+		else
+			blocksize = sctx->fs_info->sectorsize;
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.data_extents_scrubbed++;
 		sctx->stat.data_bytes_scrubbed += len;
 		spin_unlock(&sctx->stat_lock);
 	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-		blocksize = sctx->fs_info->nodesize;
+		if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
+			blocksize = map->stripe_len;
+		else
+			blocksize = sctx->fs_info->nodesize;
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.tree_extents_scrubbed++;
 		sctx->stat.tree_bytes_scrubbed += len;
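The repeated if/else above amounts to a single unit choice per chunk type;
restated as standalone C (the parameters are stand-ins for the map->type test
and the fs_info fields, purely for illustration):

/* Illustrative restatement of scrub_extent()'s unit selection: raid56
 * chunks are scrubbed a full stripe (64K) at a time, everything else
 * per sectorsize (data) or nodesize (metadata). */
static unsigned int scrub_unit(int is_raid56, unsigned int stripe_len,
			       unsigned int per_block_unit)
{
	return is_raid56 ? stripe_len : per_block_unit;
}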
@@ -2892,9 +2935,9 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
 	}
 
 	if (flags & BTRFS_EXTENT_FLAG_DATA) {
-		blocksize = sctx->fs_info->sectorsize;
+		blocksize = sparity->stripe_len;
 	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-		blocksize = sctx->fs_info->nodesize;
+		blocksize = sparity->stripe_len;
 	} else {
 		blocksize = sctx->fs_info->sectorsize;
 		WARN_ON(1);
@@ -3604,7 +3647,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 		if (ret)
 			goto out;
 
-		ret = scrub_extent(sctx, extent_logical, extent_len,
+		ret = scrub_extent(sctx, map, extent_logical, extent_len,
 				   extent_physical, extent_dev, flags,
 				   generation, extent_mirror_num,
 				   extent_logical - logical + physical);