Commit d276bb29 authored by Jens Axboe

Merge tag 'md-next-20230729' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md into for-6.6/block

Pull MD updates from Song:

"1. Deprecate bitmap file support, by Christoph Hellwig;
 2. Fix deadlock with md sync thread, by Yu Kuai;
 3. Refactor md io accounting, by Yu Kuai;
 4. Various non-urgent fixes by Li Nan, Yu Kuai, and Jack Wang."

* tag 'md-next-20230729' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md: (36 commits)
  md/md-bitmap: hold 'reconfig_mutex' in backlog_store()
  md/md-bitmap: remove unnecessary local variable in backlog_store()
  md/raid10: use dereference_rdev_and_rrdev() to get devices
  md/raid10: factor out dereference_rdev_and_rrdev()
  md/raid10: check replacement and rdev to prevent submit the same io twice
  md/raid1: Avoid lock contention from wake_up()
  md: restore 'noio_flag' for the last mddev_resume()
  md: don't quiesce in mddev_suspend()
  md: remove redundant check in fix_read_error()
  md/raid10: optimize fix_read_error
  md/raid1: prioritize adding disk to 'removed' mirror
  md/md-faulty: enable io accounting
  md/md-linear: enable io accounting
  md/md-multipath: enable io accounting
  md/raid10: switch to use md_account_bio() for io accounting
  md/raid1: switch to use md_account_bio() for io accounting
  raid5: fix missing io accounting in raid5_align_endio()
  md: also clone new io if io accounting is disabled
  md: move initialization and destruction of 'io_acct_set' to md.c
  md: deprecate bitmap file support
  ...
parents 51d74ec9 44abfa6a
......@@ -50,6 +50,16 @@ config MD_AUTODETECT
If unsure, say Y.
config MD_BITMAP_FILE
bool "MD bitmap file support (deprecated)"
default y
help
If you say Y here, support for write intent bitmaps in files on an
external file system is enabled. This is an alternative to the internal
bitmaps near the MD superblock, and is very problematic code that abuses
various kernel APIs and can only work with files on a file system not
actually sitting on the MD device.
config MD_LINEAR
tristate "Linear (append) mode (deprecated)"
depends on BLK_DEV_MD
......
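For context: the deprecation is enforced at runtime rather than by deleting the code outright. set_bitmap_file() in md.c (hunk further below) refuses bitmap files when the option is compiled out, using the usual IS_ENABLED() idiom; a minimal sketch:

        /* IS_ENABLED() folds to a compile-time constant, so the branch
         * and the warning string are dropped when CONFIG_MD_BITMAP_FILE=n. */
        if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) {
                pr_warn("%s: bitmap files not supported by this kernel\n",
                        mdname(mddev));
                return -EINVAL;
        }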
......@@ -3725,7 +3725,6 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv,
if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) {
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
}
} else if (decipher_sync_action(mddev, mddev->recovery) != st_idle)
......
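Callers no longer pair md_unregister_thread() with md_reap_sync_thread(): the reaper unregisters the thread itself (see the md_reap_sync_thread() hunk in md.c below), so call sites such as this one, __md_stop_writes() and md_check_recovery() reduce to:

        if (mddev->sync_thread) {
                set_bit(MD_RECOVERY_INTR, &mddev->recovery);
                /* md_reap_sync_thread() now does md_unregister_thread() */
                md_reap_sync_thread(mddev);
        }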
......@@ -139,29 +139,26 @@ static void md_bitmap_checkfree(struct bitmap_counts *bitmap, unsigned long page
*/
/* IO operations when bitmap is stored near all superblocks */
/* choose a good rdev and read the page from there */
static int read_sb_page(struct mddev *mddev, loff_t offset,
struct page *page,
unsigned long index, int size)
struct page *page, unsigned long index, int size)
{
/* choose a good rdev and read the page from there */
sector_t sector = mddev->bitmap_info.offset + offset +
index * (PAGE_SIZE / SECTOR_SIZE);
struct md_rdev *rdev;
sector_t target;
rdev_for_each(rdev, mddev) {
if (! test_bit(In_sync, &rdev->flags)
|| test_bit(Faulty, &rdev->flags)
|| test_bit(Bitmap_sync, &rdev->flags))
continue;
u32 iosize = roundup(size, bdev_logical_block_size(rdev->bdev));
target = offset + index * (PAGE_SIZE/512);
if (!test_bit(In_sync, &rdev->flags) ||
test_bit(Faulty, &rdev->flags) ||
test_bit(Bitmap_sync, &rdev->flags))
continue;
if (sync_page_io(rdev, target,
roundup(size, bdev_logical_block_size(rdev->bdev)),
page, REQ_OP_READ, true)) {
page->index = index;
if (sync_page_io(rdev, sector, iosize, page, REQ_OP_READ, true))
return 0;
}
}
return -EIO;
}
......@@ -225,18 +222,19 @@ static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size,
}
static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
struct page *page)
unsigned long pg_index, struct page *page)
{
struct block_device *bdev;
struct mddev *mddev = bitmap->mddev;
struct bitmap_storage *store = &bitmap->storage;
loff_t sboff, offset = mddev->bitmap_info.offset;
sector_t ps, doff;
sector_t ps = pg_index * PAGE_SIZE / SECTOR_SIZE;
unsigned int size = PAGE_SIZE;
unsigned int opt_size = PAGE_SIZE;
sector_t doff;
bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
if (page->index == store->file_pages - 1) {
if (pg_index == store->file_pages - 1) {
unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1);
if (last_page_size == 0)
......@@ -245,7 +243,6 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
opt_size = optimal_io_size(bdev, last_page_size, size);
}
ps = page->index * PAGE_SIZE / SECTOR_SIZE;
sboff = rdev->sb_start + offset;
doff = rdev->data_offset;
......@@ -279,55 +276,41 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
return 0;
}
static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
static void write_sb_page(struct bitmap *bitmap, unsigned long pg_index,
struct page *page, bool wait)
{
struct md_rdev *rdev;
struct mddev *mddev = bitmap->mddev;
int ret;
do {
rdev = NULL;
struct md_rdev *rdev = NULL;
while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
ret = __write_sb_page(rdev, bitmap, page);
if (ret)
return ret;
if (__write_sb_page(rdev, bitmap, pg_index, page) < 0) {
set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
return;
}
}
} while (wait && md_super_wait(mddev) < 0);
return 0;
}
static void md_bitmap_file_kick(struct bitmap *bitmap);
/*
* write out a page to a file
*/
static void write_page(struct bitmap *bitmap, struct page *page, int wait)
{
struct buffer_head *bh;
if (bitmap->storage.file == NULL) {
switch (write_sb_page(bitmap, page, wait)) {
case -EINVAL:
set_bit(BITMAP_WRITE_ERROR, &bitmap->flags);
}
} else {
bh = page_buffers(page);
while (bh && bh->b_blocknr) {
atomic_inc(&bitmap->pending_writes);
set_buffer_locked(bh);
set_buffer_mapped(bh);
submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
bh = bh->b_this_page;
}
#ifdef CONFIG_MD_BITMAP_FILE
static void write_file_page(struct bitmap *bitmap, struct page *page, int wait)
{
struct buffer_head *bh = page_buffers(page);
if (wait)
wait_event(bitmap->write_wait,
atomic_read(&bitmap->pending_writes)==0);
while (bh && bh->b_blocknr) {
atomic_inc(&bitmap->pending_writes);
set_buffer_locked(bh);
set_buffer_mapped(bh);
submit_bh(REQ_OP_WRITE | REQ_SYNC, bh);
bh = bh->b_this_page;
}
if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags))
md_bitmap_file_kick(bitmap);
if (wait)
wait_event(bitmap->write_wait,
atomic_read(&bitmap->pending_writes) == 0);
}
static void end_bitmap_write(struct buffer_head *bh, int uptodate)
......@@ -364,10 +347,8 @@ static void free_buffers(struct page *page)
* This usage is similar to how swap files are handled, and allows us
* to write to a file with no concerns of memory allocation failing.
*/
static int read_page(struct file *file, unsigned long index,
struct bitmap *bitmap,
unsigned long count,
struct page *page)
static int read_file_page(struct file *file, unsigned long index,
struct bitmap *bitmap, unsigned long count, struct page *page)
{
int ret = 0;
struct inode *inode = file_inode(file);
......@@ -415,7 +396,6 @@ static int read_page(struct file *file, unsigned long index,
blk_cur++;
bh = bh->b_this_page;
}
page->index = index;
wait_event(bitmap->write_wait,
atomic_read(&bitmap->pending_writes)==0);
......@@ -429,11 +409,45 @@ static int read_page(struct file *file, unsigned long index,
ret);
return ret;
}
#else /* CONFIG_MD_BITMAP_FILE */
static void write_file_page(struct bitmap *bitmap, struct page *page, int wait)
{
}
static int read_file_page(struct file *file, unsigned long index,
struct bitmap *bitmap, unsigned long count, struct page *page)
{
return -EIO;
}
static void free_buffers(struct page *page)
{
put_page(page);
}
#endif /* CONFIG_MD_BITMAP_FILE */
/*
* bitmap file superblock operations
*/
/*
* write out a page to a file
*/
static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index,
bool wait)
{
struct bitmap_storage *store = &bitmap->storage;
struct page *page = store->filemap[pg_index];
if (mddev_is_clustered(bitmap->mddev)) {
pg_index += bitmap->cluster_slot *
DIV_ROUND_UP(store->bytes, PAGE_SIZE);
}
if (store->file)
write_file_page(bitmap, page, wait);
else
write_sb_page(bitmap, pg_index, page, wait);
}
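filemap_write_page() is the single entry point that replaces write_page(): the page index travels as an explicit pg_index argument instead of being smuggled through page->index, which is what lets the later hunks drop every page->index assignment and track the superblock page via the new storage.sb_index field. The dispatch, restated with comments:

        /* hedged restatement of the dispatch above */
        if (store->file)
                write_file_page(bitmap, page, wait);          /* buffer_head I/O */
        else
                write_sb_page(bitmap, pg_index, page, wait);  /* near-superblock I/O */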
/*
* md_bitmap_wait_writes() should be called before writing any bitmap
* blocks, to ensure previous writes, particularly from
......@@ -488,7 +502,12 @@ void md_bitmap_update_sb(struct bitmap *bitmap)
sb->sectors_reserved = cpu_to_le32(bitmap->mddev->
bitmap_info.space);
kunmap_atomic(sb);
write_page(bitmap, bitmap->storage.sb_page, 1);
if (bitmap->storage.file)
write_file_page(bitmap, bitmap->storage.sb_page, 1);
else
write_sb_page(bitmap, bitmap->storage.sb_index,
bitmap->storage.sb_page, 1);
}
EXPORT_SYMBOL(md_bitmap_update_sb);
......@@ -540,7 +559,7 @@ static int md_bitmap_new_disk_sb(struct bitmap *bitmap)
bitmap->storage.sb_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (bitmap->storage.sb_page == NULL)
return -ENOMEM;
bitmap->storage.sb_page->index = 0;
bitmap->storage.sb_index = 0;
sb = kmap_atomic(bitmap->storage.sb_page);
......@@ -601,7 +620,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap)
unsigned long sectors_reserved = 0;
int err = -EINVAL;
struct page *sb_page;
loff_t offset = bitmap->mddev->bitmap_info.offset;
loff_t offset = 0;
if (!bitmap->storage.file && !bitmap->mddev->bitmap_info.offset) {
chunksize = 128 * 1024 * 1024;
......@@ -628,7 +647,7 @@ static int md_bitmap_read_sb(struct bitmap *bitmap)
bm_blocks = ((bm_blocks+7) >> 3) + sizeof(bitmap_super_t);
/* to 4k blocks */
bm_blocks = DIV_ROUND_UP_SECTOR_T(bm_blocks, 4096);
offset = bitmap->mddev->bitmap_info.offset + (bitmap->cluster_slot * (bm_blocks << 3));
offset = bitmap->cluster_slot * (bm_blocks << 3);
pr_debug("%s:%d bm slot: %d offset: %llu\n", __func__, __LINE__,
bitmap->cluster_slot, offset);
}
......@@ -637,13 +656,11 @@ static int md_bitmap_read_sb(struct bitmap *bitmap)
loff_t isize = i_size_read(bitmap->storage.file->f_mapping->host);
int bytes = isize > PAGE_SIZE ? PAGE_SIZE : isize;
err = read_page(bitmap->storage.file, 0,
err = read_file_page(bitmap->storage.file, 0,
bitmap, bytes, sb_page);
} else {
err = read_sb_page(bitmap->mddev,
offset,
sb_page,
0, sizeof(bitmap_super_t));
err = read_sb_page(bitmap->mddev, offset, sb_page, 0,
sizeof(bitmap_super_t));
}
if (err)
return err;
......@@ -819,7 +836,7 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store,
if (store->sb_page) {
store->filemap[0] = store->sb_page;
pnum = 1;
store->sb_page->index = offset;
store->sb_index = offset;
}
for ( ; pnum < num_pages; pnum++) {
......@@ -828,7 +845,6 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store,
store->file_pages = pnum;
return -ENOMEM;
}
store->filemap[pnum]->index = pnum + offset;
}
store->file_pages = pnum;
......@@ -847,14 +863,10 @@ static int md_bitmap_storage_alloc(struct bitmap_storage *store,
static void md_bitmap_file_unmap(struct bitmap_storage *store)
{
struct page **map, *sb_page;
int pages;
struct file *file;
file = store->file;
map = store->filemap;
pages = store->file_pages;
sb_page = store->sb_page;
struct file *file = store->file;
struct page *sb_page = store->sb_page;
struct page **map = store->filemap;
int pages = store->file_pages;
while (pages--)
if (map[pages] != sb_page) /* 0 is sb_page, release it below */
......@@ -879,21 +891,13 @@ static void md_bitmap_file_unmap(struct bitmap_storage *store)
*/
static void md_bitmap_file_kick(struct bitmap *bitmap)
{
char *path, *ptr = NULL;
if (!test_and_set_bit(BITMAP_STALE, &bitmap->flags)) {
md_bitmap_update_sb(bitmap);
if (bitmap->storage.file) {
path = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (path)
ptr = file_path(bitmap->storage.file,
path, PAGE_SIZE);
pr_warn("%s: kicking failed bitmap file %pD4 from array!\n",
bmname(bitmap), bitmap->storage.file);
pr_warn("%s: kicking failed bitmap file %s from array!\n",
bmname(bitmap), IS_ERR(ptr) ? "" : ptr);
kfree(path);
} else
pr_warn("%s: disabling internal bitmap due to errors\n",
bmname(bitmap));
......@@ -945,6 +949,7 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
void *kaddr;
unsigned long chunk = block >> bitmap->counts.chunkshift;
struct bitmap_storage *store = &bitmap->storage;
unsigned long index = file_page_index(store, chunk);
unsigned long node_offset = 0;
if (mddev_is_clustered(bitmap->mddev))
......@@ -962,9 +967,9 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
else
set_bit_le(bit, kaddr);
kunmap_atomic(kaddr);
pr_debug("set file bit %lu page %lu\n", bit, page->index);
pr_debug("set file bit %lu page %lu\n", bit, index);
/* record page number so it gets flushed to disk when unplug occurs */
set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_DIRTY);
set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_DIRTY);
}
static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
......@@ -974,6 +979,7 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
void *paddr;
unsigned long chunk = block >> bitmap->counts.chunkshift;
struct bitmap_storage *store = &bitmap->storage;
unsigned long index = file_page_index(store, chunk);
unsigned long node_offset = 0;
if (mddev_is_clustered(bitmap->mddev))
......@@ -989,8 +995,8 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
else
clear_bit_le(bit, paddr);
kunmap_atomic(paddr);
if (!test_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_NEEDWRITE)) {
set_page_attr(bitmap, page->index - node_offset, BITMAP_PAGE_PENDING);
if (!test_page_attr(bitmap, index - node_offset, BITMAP_PAGE_NEEDWRITE)) {
set_page_attr(bitmap, index - node_offset, BITMAP_PAGE_PENDING);
bitmap->allclean = 0;
}
}
......@@ -1042,7 +1048,7 @@ void md_bitmap_unplug(struct bitmap *bitmap)
"md bitmap_unplug");
}
clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
write_page(bitmap, bitmap->storage.filemap[i], 0);
filemap_write_page(bitmap, i, false);
writing = 1;
}
}
......@@ -1084,33 +1090,31 @@ void md_bitmap_unplug_async(struct bitmap *bitmap)
EXPORT_SYMBOL(md_bitmap_unplug_async);
static void md_bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int needed);
/*
 * bitmap_init_from_disk -- called at bitmap_create time to initialize
* the in-memory bitmap from the on-disk bitmap -- also, sets up the
* memory mapping of the bitmap file
* Special cases:
* if there's no bitmap file, or if the bitmap file had been
* previously kicked from the array, we mark all the bits as
* 1's in order to cause a full resync.
/*
* Initialize the in-memory bitmap from the on-disk bitmap and set up the memory
* mapping of the bitmap file.
*
* Special case: If there's no bitmap file, or if the bitmap file had been
* previously kicked from the array, we mark all the bits as 1's in order to
* cause a full resync.
*
* We ignore all bits for sectors that end earlier than 'start'.
* This is used when reading an out-of-date bitmap...
* This is used when reading an out-of-date bitmap.
*/
static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
{
unsigned long i, chunks, index, oldindex, bit, node_offset = 0;
struct page *page = NULL;
unsigned long bit_cnt = 0;
struct file *file;
unsigned long offset;
int outofdate;
int ret = -ENOSPC;
void *paddr;
bool outofdate = test_bit(BITMAP_STALE, &bitmap->flags);
struct mddev *mddev = bitmap->mddev;
unsigned long chunks = bitmap->counts.chunks;
struct bitmap_storage *store = &bitmap->storage;
struct file *file = store->file;
unsigned long node_offset = 0;
unsigned long bit_cnt = 0;
unsigned long i;
int ret;
chunks = bitmap->counts.chunks;
file = store->file;
if (!file && !bitmap->mddev->bitmap_info.offset) {
if (!file && !mddev->bitmap_info.offset) {
/* No permanent bitmap - fill with '1s'. */
store->filemap = NULL;
store->file_pages = 0;
......@@ -1125,77 +1129,79 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
return 0;
}
outofdate = test_bit(BITMAP_STALE, &bitmap->flags);
if (outofdate)
pr_warn("%s: bitmap file is out of date, doing full recovery\n", bmname(bitmap));
if (file && i_size_read(file->f_mapping->host) < store->bytes) {
pr_warn("%s: bitmap file too short %lu < %lu\n",
bmname(bitmap),
(unsigned long) i_size_read(file->f_mapping->host),
store->bytes);
ret = -ENOSPC;
goto err;
}
oldindex = ~0L;
offset = 0;
if (!bitmap->mddev->bitmap_info.external)
offset = sizeof(bitmap_super_t);
if (mddev_is_clustered(bitmap->mddev))
if (mddev_is_clustered(mddev))
node_offset = bitmap->cluster_slot * (DIV_ROUND_UP(store->bytes, PAGE_SIZE));
for (i = 0; i < chunks; i++) {
int b;
index = file_page_index(&bitmap->storage, i);
bit = file_page_offset(&bitmap->storage, i);
if (index != oldindex) { /* this is a new page, read it in */
int count;
/* unmap the old page, we're done with it */
if (index == store->file_pages-1)
count = store->bytes - index * PAGE_SIZE;
else
count = PAGE_SIZE;
page = store->filemap[index];
if (file)
ret = read_page(file, index, bitmap,
count, page);
else
ret = read_sb_page(
bitmap->mddev,
bitmap->mddev->bitmap_info.offset,
page,
index + node_offset, count);
for (i = 0; i < store->file_pages; i++) {
struct page *page = store->filemap[i];
int count;
if (ret)
goto err;
/* unmap the old page, we're done with it */
if (i == store->file_pages - 1)
count = store->bytes - i * PAGE_SIZE;
else
count = PAGE_SIZE;
oldindex = index;
if (file)
ret = read_file_page(file, i, bitmap, count, page);
else
ret = read_sb_page(mddev, 0, page, i + node_offset,
count);
if (ret)
goto err;
}
if (outofdate) {
/*
* if bitmap is out of date, dirty the
* whole page and write it out
*/
paddr = kmap_atomic(page);
memset(paddr + offset, 0xff,
PAGE_SIZE - offset);
kunmap_atomic(paddr);
write_page(bitmap, page, 1);
if (outofdate) {
pr_warn("%s: bitmap file is out of date, doing full recovery\n",
bmname(bitmap));
for (i = 0; i < store->file_pages; i++) {
struct page *page = store->filemap[i];
unsigned long offset = 0;
void *paddr;
if (i == 0 && !mddev->bitmap_info.external)
offset = sizeof(bitmap_super_t);
/*
* If the bitmap is out of date, dirty the whole page
* and write it out
*/
paddr = kmap_atomic(page);
memset(paddr + offset, 0xff, PAGE_SIZE - offset);
kunmap_atomic(paddr);
filemap_write_page(bitmap, i, true);
if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) {
ret = -EIO;
if (test_bit(BITMAP_WRITE_ERROR,
&bitmap->flags))
goto err;
goto err;
}
}
}
for (i = 0; i < chunks; i++) {
struct page *page = filemap_get_page(&bitmap->storage, i);
unsigned long bit = file_page_offset(&bitmap->storage, i);
void *paddr;
bool was_set;
paddr = kmap_atomic(page);
if (test_bit(BITMAP_HOSTENDIAN, &bitmap->flags))
b = test_bit(bit, paddr);
was_set = test_bit(bit, paddr);
else
b = test_bit_le(bit, paddr);
was_set = test_bit_le(bit, paddr);
kunmap_atomic(paddr);
if (b) {
if (was_set) {
/* if the disk bit is set, set the memory bit */
int needed = ((sector_t)(i+1) << bitmap->counts.chunkshift
>= start);
......@@ -1204,7 +1210,6 @@ static int md_bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
needed);
bit_cnt++;
}
offset = 0;
}
pr_debug("%s: bitmap initialized from disk: read %lu pages, set %lu of %lu bits\n",
......@@ -1396,9 +1401,8 @@ void md_bitmap_daemon_work(struct mddev *mddev)
break;
if (bitmap->storage.filemap &&
test_and_clear_page_attr(bitmap, j,
BITMAP_PAGE_NEEDWRITE)) {
write_page(bitmap, bitmap->storage.filemap[j], 0);
}
BITMAP_PAGE_NEEDWRITE))
filemap_write_page(bitmap, j, false);
}
done:
......@@ -2542,6 +2546,10 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
if (backlog > COUNTER_MAX)
return -EINVAL;
rv = mddev_lock(mddev);
if (rv)
return rv;
/*
* Without write mostly device, it doesn't make sense to set
* backlog for max_write_behind.
......@@ -2555,6 +2563,7 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
if (!has_write_mostly) {
pr_warn_ratelimited("%s: can't set backlog, no write mostly device available\n",
mdname(mddev));
mddev_unlock(mddev);
return -EINVAL;
}
......@@ -2565,13 +2574,13 @@ backlog_store(struct mddev *mddev, const char *buf, size_t len)
mddev_destroy_serial_pool(mddev, NULL, false);
} else if (backlog && !mddev->serial_info_pool) {
/* serial_info_pool is needed since backlog is not zero */
struct md_rdev *rdev;
rdev_for_each(rdev, mddev)
mddev_create_serial_pool(mddev, rdev, false);
}
if (old_mwb != backlog)
md_bitmap_update_sb(mddev->bitmap);
mddev_unlock(mddev);
return len;
}
......
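backlog_store() now runs under 'reconfig_mutex', so every return path has to drop the lock; note the mddev_unlock() added to the no-write-mostly error path above. The resulting shape, sketched with hypothetical helpers standing in for the elided body:

        static ssize_t backlog_store_shape(struct mddev *mddev,
                                           const char *buf, size_t len)
        {
                int rv = mddev_lock(mddev);     /* interruptible */

                if (rv)
                        return rv;
                if (!backlog_args_ok(mddev, buf)) {     /* hypothetical check */
                        mddev_unlock(mddev);            /* drop on early return */
                        return -EINVAL;
                }
                apply_backlog(mddev, buf);              /* hypothetical */
                mddev_unlock(mddev);
                return len;
        }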
......@@ -201,6 +201,7 @@ struct bitmap {
struct file *file; /* backing disk file */
struct page *sb_page; /* cached copy of the bitmap
* file superblock */
unsigned long sb_index;
struct page **filemap; /* list of cache pages for
* the file */
unsigned long *filemap_attr; /* attributes associated
......
......@@ -204,6 +204,8 @@ static bool faulty_make_request(struct mddev *mddev, struct bio *bio)
failit = 1;
}
}
md_account_bio(mddev, &bio);
if (failit) {
struct bio *b = bio_alloc_clone(conf->rdev->bdev, bio, GFP_NOIO,
&mddev->bio_set);
......
......@@ -238,6 +238,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
bio = split;
}
md_account_bio(mddev, &bio);
bio_set_dev(bio, tmp_dev->rdev->bdev);
bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
start_sector + data_offset;
......
......@@ -107,6 +107,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
&& md_flush_request(mddev, bio))
return true;
md_account_bio(mddev, &bio);
mp_bh = mempool_alloc(&conf->pool, GFP_NOIO);
mp_bh->master_bio = bio;
......
......@@ -453,7 +453,6 @@ void mddev_suspend(struct mddev *mddev)
mddev->pers->prepare_suspend(mddev);
wait_event(mddev->sb_wait, percpu_ref_is_zero(&mddev->active_io));
mddev->pers->quiesce(mddev, 1);
clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
......@@ -465,14 +464,15 @@ EXPORT_SYMBOL_GPL(mddev_suspend);
void mddev_resume(struct mddev *mddev)
{
/* entered the memalloc scope from mddev_suspend() */
memalloc_noio_restore(mddev->noio_flag);
lockdep_assert_held(&mddev->reconfig_mutex);
if (--mddev->suspended)
return;
/* entered the memalloc scope from mddev_suspend() */
memalloc_noio_restore(mddev->noio_flag);
percpu_ref_resurrect(&mddev->active_io);
wake_up(&mddev->sb_wait);
mddev->pers->quiesce(mddev, 0);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
......@@ -643,6 +643,7 @@ void mddev_init(struct mddev *mddev)
{
mutex_init(&mddev->open_mutex);
mutex_init(&mddev->reconfig_mutex);
mutex_init(&mddev->sync_mutex);
mutex_init(&mddev->bitmap_info.mutex);
INIT_LIST_HEAD(&mddev->disks);
INIT_LIST_HEAD(&mddev->all_mddevs);
......@@ -650,6 +651,7 @@ void mddev_init(struct mddev *mddev)
timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
atomic_set(&mddev->active, 1);
atomic_set(&mddev->openers, 0);
atomic_set(&mddev->sync_seq, 0);
spin_lock_init(&mddev->lock);
atomic_set(&mddev->flush_pending, 0);
init_waitqueue_head(&mddev->sb_wait);
......@@ -2304,7 +2306,7 @@ int md_integrity_register(struct mddev *mddev)
pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) ||
(mddev->level != 1 && mddev->level != 10 &&
bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) {
bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) {
/*
* No need to handle the failure of bioset_integrity_create,
* because the function is called by md_run() -> pers->run(),
......@@ -4747,6 +4749,62 @@ action_show(struct mddev *mddev, char *page)
return sprintf(page, "%s\n", type);
}
static void stop_sync_thread(struct mddev *mddev)
{
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return;
if (mddev_lock(mddev))
return;
/*
* Check again in case MD_RECOVERY_RUNNING is cleared before lock is
* held.
*/
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
mddev_unlock(mddev);
return;
}
if (work_pending(&mddev->del_work))
flush_workqueue(md_misc_wq);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
/*
* Thread might be blocked waiting for metadata update which will now
* never happen
*/
md_wakeup_thread_directly(mddev->sync_thread);
mddev_unlock(mddev);
}
static void idle_sync_thread(struct mddev *mddev)
{
int sync_seq = atomic_read(&mddev->sync_seq);
mutex_lock(&mddev->sync_mutex);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
stop_sync_thread(mddev);
wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) ||
!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
mutex_unlock(&mddev->sync_mutex);
}
static void frozen_sync_thread(struct mddev *mddev)
{
mutex_lock(&mddev->sync_mutex);
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
stop_sync_thread(mddev);
wait_event(resync_wait, mddev->sync_thread == NULL &&
!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
mutex_unlock(&mddev->sync_mutex);
}
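The idle/frozen handshake leans on a monotonic sequence counter instead of waiting for sync_thread == NULL: md_reap_sync_thread() bumps mddev->sync_seq and then wakes resync_wait (see its hunk below), so a waiter that samples the counter before requesting the stop cannot miss the completion even if a new sync thread starts right away. The two sides, condensed:

        /* reaper side (md_reap_sync_thread) */
        md_unregister_thread(&mddev->sync_thread);
        atomic_inc(&mddev->sync_seq);           /* publish completion */
        wake_up(&resync_wait);                  /* moved after all cleanup */

        /* waiter side (idle_sync_thread) */
        int sync_seq = atomic_read(&mddev->sync_seq);   /* sample first */
        stop_sync_thread(mddev);
        wait_event(resync_wait, sync_seq != atomic_read(&mddev->sync_seq) ||
                   !test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));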
static ssize_t
action_store(struct mddev *mddev, const char *page, size_t len)
{
......@@ -4754,35 +4812,11 @@ action_store(struct mddev *mddev, const char *page, size_t len)
return -EINVAL;
if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
if (cmd_match(page, "frozen"))
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
else
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
mddev_lock(mddev) == 0) {
if (work_pending(&mddev->del_work))
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
sector_t save_rp = mddev->reshape_position;
mddev_unlock(mddev);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_unregister_thread(&mddev->sync_thread);
mddev_lock_nointr(mddev);
/*
* set RECOVERY_INTR again and restore reshape
* position in case others changed them after
* got lock, eg, reshape_position_store and
* md_check_recovery.
*/
mddev->reshape_position = save_rp;
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_reap_sync_thread(mddev);
}
mddev_unlock(mddev);
}
} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
if (cmd_match(page, "idle"))
idle_sync_thread(mddev);
else if (cmd_match(page, "frozen"))
frozen_sync_thread(mddev);
else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
else if (cmd_match(page, "resync"))
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
......@@ -5842,6 +5876,13 @@ int md_run(struct mddev *mddev)
goto exit_bio_set;
}
if (!bioset_initialized(&mddev->io_clone_set)) {
err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
offsetof(struct md_io_clone, bio_clone), 0);
if (err)
goto exit_sync_set;
}
spin_lock(&pers_lock);
pers = find_pers(mddev->level, mddev->clevel);
if (!pers || !try_module_get(pers->owner)) {
......@@ -6019,6 +6060,8 @@ int md_run(struct mddev *mddev)
module_put(pers->owner);
md_bitmap_destroy(mddev);
abort:
bioset_exit(&mddev->io_clone_set);
exit_sync_set:
bioset_exit(&mddev->sync_set);
exit_bio_set:
bioset_exit(&mddev->bio_set);
......@@ -6176,7 +6219,6 @@ static void __md_stop_writes(struct mddev *mddev)
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
}
......@@ -6243,6 +6285,7 @@ static void __md_stop(struct mddev *mddev)
percpu_ref_exit(&mddev->active_io);
bioset_exit(&mddev->bio_set);
bioset_exit(&mddev->sync_set);
bioset_exit(&mddev->io_clone_set);
}
void md_stop(struct mddev *mddev)
......@@ -7010,6 +7053,15 @@ static int set_bitmap_file(struct mddev *mddev, int fd)
if (mddev->bitmap || mddev->bitmap_info.file)
return -EEXIST; /* cannot add when bitmap is present */
if (!IS_ENABLED(CONFIG_MD_BITMAP_FILE)) {
pr_warn("%s: bitmap files not supported by this kernel\n",
mdname(mddev));
return -EINVAL;
}
pr_warn("%s: using deprecated bitmap file support\n",
mdname(mddev));
f = fget(fd);
if (f == NULL) {
......@@ -8599,62 +8651,44 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
}
EXPORT_SYMBOL_GPL(md_submit_discard_bio);
int acct_bioset_init(struct mddev *mddev)
static void md_end_clone_io(struct bio *bio)
{
int err = 0;
if (!bioset_initialized(&mddev->io_acct_set))
err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
offsetof(struct md_io_acct, bio_clone), 0);
return err;
}
EXPORT_SYMBOL_GPL(acct_bioset_init);
void acct_bioset_exit(struct mddev *mddev)
{
bioset_exit(&mddev->io_acct_set);
}
EXPORT_SYMBOL_GPL(acct_bioset_exit);
static void md_end_io_acct(struct bio *bio)
{
struct md_io_acct *md_io_acct = bio->bi_private;
struct bio *orig_bio = md_io_acct->orig_bio;
struct mddev *mddev = md_io_acct->mddev;
struct md_io_clone *md_io_clone = bio->bi_private;
struct bio *orig_bio = md_io_clone->orig_bio;
struct mddev *mddev = md_io_clone->mddev;
orig_bio->bi_status = bio->bi_status;
bio_end_io_acct(orig_bio, md_io_acct->start_time);
if (md_io_clone->start_time)
bio_end_io_acct(orig_bio, md_io_clone->start_time);
bio_put(bio);
bio_endio(orig_bio);
percpu_ref_put(&mddev->active_io);
}
/*
* Used by personalities that don't already clone the bio and thus can't
* easily add the timestamp to their extended bio structure.
*/
void md_account_bio(struct mddev *mddev, struct bio **bio)
static void md_clone_bio(struct mddev *mddev, struct bio **bio)
{
struct block_device *bdev = (*bio)->bi_bdev;
struct md_io_acct *md_io_acct;
struct bio *clone;
if (!blk_queue_io_stat(bdev->bd_disk->queue))
return;
struct md_io_clone *md_io_clone;
struct bio *clone =
bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_clone_set);
md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
md_io_clone->orig_bio = *bio;
md_io_clone->mddev = mddev;
if (blk_queue_io_stat(bdev->bd_disk->queue))
md_io_clone->start_time = bio_start_io_acct(*bio);
clone->bi_end_io = md_end_clone_io;
clone->bi_private = md_io_clone;
*bio = clone;
}
void md_account_bio(struct mddev *mddev, struct bio **bio)
{
percpu_ref_get(&mddev->active_io);
clone = bio_alloc_clone(bdev, *bio, GFP_NOIO, &mddev->io_acct_set);
md_io_acct = container_of(clone, struct md_io_acct, bio_clone);
md_io_acct->orig_bio = *bio;
md_io_acct->start_time = bio_start_io_acct(*bio);
md_io_acct->mddev = mddev;
clone->bi_end_io = md_end_io_acct;
clone->bi_private = md_io_acct;
*bio = clone;
md_clone_bio(mddev, bio);
}
EXPORT_SYMBOL_GPL(md_account_bio);
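With the clone logic centralized, a personality opts into io accounting with one call before issuing per-device bios: md_account_bio() takes an active_io reference and may swap *bio for a tracked clone whose md_end_clone_io() completes the original. A minimal sketch of the call-site pattern the md-faulty/md-linear/md-multipath hunks above follow:

        static bool example_make_request(struct mddev *mddev, struct bio *bio)
        {
                md_account_bio(mddev, &bio);    /* may replace bio with a clone */
                /*
                 * From here on 'bio' is the master bio; the clone's end_io
                 * handles bio_endio() and drops the active_io reference.
                 */
                submit_to_members(mddev, bio);  /* hypothetical helper */
                return true;
        }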
......@@ -9327,7 +9361,6 @@ void md_check_recovery(struct mddev *mddev)
* ->spare_active and clear saved_raid_disk
*/
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
md_unregister_thread(&mddev->sync_thread);
md_reap_sync_thread(mddev);
clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
......@@ -9356,17 +9389,24 @@ void md_check_recovery(struct mddev *mddev)
if (mddev->sb_flags)
md_update_sb(mddev, 0);
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
/* resync/recovery still happening */
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
goto unlock;
}
if (mddev->sync_thread) {
md_unregister_thread(&mddev->sync_thread);
/*
* Never start a new sync thread if MD_RECOVERY_RUNNING is
* still set.
*/
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
if (!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
/* resync/recovery still happening */
clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
goto unlock;
}
if (WARN_ON_ONCE(!mddev->sync_thread))
goto unlock;
md_reap_sync_thread(mddev);
goto unlock;
}
/* Set RUNNING before clearing NEEDED to avoid
* any transients in the value of "sync_action".
*/
......@@ -9443,7 +9483,10 @@ void md_reap_sync_thread(struct mddev *mddev)
sector_t old_dev_sectors = mddev->dev_sectors;
bool is_reshaped = false;
/* sync_thread should be unregistered, collect result */
/* resync has finished, collect result */
md_unregister_thread(&mddev->sync_thread);
atomic_inc(&mddev->sync_seq);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
mddev->degraded != mddev->raid_disks) {
......@@ -9488,7 +9531,6 @@ void md_reap_sync_thread(struct mddev *mddev)
if (mddev_is_clustered(mddev) && is_reshaped
&& !test_bit(MD_CLOSING, &mddev->flags))
md_cluster_ops->update_size(mddev, old_dev_sectors);
wake_up(&resync_wait);
/* flag recovery needed just to double check */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
sysfs_notify_dirent_safe(mddev->sysfs_completed);
......@@ -9496,6 +9538,7 @@ void md_reap_sync_thread(struct mddev *mddev)
md_new_event();
if (mddev->event_work.func)
queue_work(md_misc_wq, &mddev->event_work);
wake_up(&resync_wait);
}
EXPORT_SYMBOL(md_reap_sync_thread);
......
......@@ -510,7 +510,7 @@ struct mddev {
struct bio_set sync_set; /* for sync operations like
* metadata and bitmap writes
*/
struct bio_set io_acct_set; /* for raid0 and raid5 io accounting */
struct bio_set io_clone_set;
/* Generic flush handling.
* The last to finish preflush schedules a worker to submit
......@@ -535,6 +535,11 @@ struct mddev {
*/
struct list_head deleting;
/* Used to synchronize idle and frozen for action_store() */
struct mutex sync_mutex;
/* The sequence number for sync thread */
atomic_t sync_seq;
bool has_superblocks:1;
bool fail_last_dev:1;
bool serialize_policy:1;
......@@ -731,7 +736,7 @@ struct md_thread {
void *private;
};
struct md_io_acct {
struct md_io_clone {
struct mddev *mddev;
struct bio *orig_bio;
unsigned long start_time;
......@@ -769,8 +774,6 @@ extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
extern void md_finish_reshape(struct mddev *mddev);
void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
struct bio *bio, sector_t start, sector_t size);
int acct_bioset_init(struct mddev *mddev);
void acct_bioset_exit(struct mddev *mddev);
void md_account_bio(struct mddev *mddev, struct bio **bio);
extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
......
......@@ -377,7 +377,6 @@ static void raid0_free(struct mddev *mddev, void *priv)
struct r0conf *conf = priv;
free_conf(mddev, conf);
acct_bioset_exit(mddev);
}
static int raid0_run(struct mddev *mddev)
......@@ -392,16 +391,11 @@ static int raid0_run(struct mddev *mddev)
if (md_check_no_bitmap(mddev))
return -EINVAL;
if (acct_bioset_init(mddev)) {
pr_err("md/raid0:%s: alloc acct bioset failed.\n", mdname(mddev));
return -ENOMEM;
}
/* if private is not null, we are here after takeover */
if (mddev->private == NULL) {
ret = create_strip_zones(mddev, &conf);
if (ret < 0)
goto exit_acct_set;
return ret;
mddev->private = conf;
}
conf = mddev->private;
......@@ -432,15 +426,9 @@ static int raid0_run(struct mddev *mddev)
ret = md_integrity_register(mddev);
if (ret)
goto free;
free_conf(mddev, conf);
return ret;
free:
free_conf(mddev, conf);
exit_acct_set:
acct_bioset_exit(mddev);
return ret;
}
/*
......
......@@ -304,8 +304,6 @@ static void call_bio_endio(struct r1bio *r1_bio)
if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
bio->bi_status = BLK_STS_IOERR;
if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
bio_end_io_acct(bio, r1_bio->start_time);
bio_endio(bio);
}
......@@ -791,11 +789,17 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
return best_disk;
}
static void wake_up_barrier(struct r1conf *conf)
{
if (wq_has_sleeper(&conf->wait_barrier))
wake_up(&conf->wait_barrier);
}
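wake_up() always takes the waitqueue spinlock, even with no sleepers, which is measurable on the hot paths converted above. wq_has_sleeper() issues a full memory barrier before inspecting the wait list, pairing with the barrier in the waiter's prepare_to_wait(), so the lockless check cannot lose a wakeup:

        static void wake_up_barrier(struct r1conf *conf)
        {
                /* the barrier in wq_has_sleeper() orders our prior state
                 * update against the sleeper's list insertion; skip the
                 * lock only when the list is observed empty */
                if (wq_has_sleeper(&conf->wait_barrier))
                        wake_up(&conf->wait_barrier);
        }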
static void flush_bio_list(struct r1conf *conf, struct bio *bio)
{
/* flush any pending bitmap writes to disk before proceeding w/ I/O */
raid1_prepare_flush_writes(conf->mddev->bitmap);
wake_up(&conf->wait_barrier);
wake_up_barrier(conf);
while (bio) { /* submit pending writes */
struct bio *next = bio->bi_next;
......@@ -972,7 +976,7 @@ static bool _wait_barrier(struct r1conf *conf, int idx, bool nowait)
* In case freeze_array() is waiting for
* get_unqueued_pending() == extra
*/
wake_up(&conf->wait_barrier);
wake_up_barrier(conf);
/* Wait for the barrier in same barrier unit bucket to drop. */
/* Return false when nowait flag is set */
......@@ -1015,7 +1019,7 @@ static bool wait_read_barrier(struct r1conf *conf, sector_t sector_nr, bool nowa
* In case freeze_array() is waiting for
* get_unqueued_pending() == extra
*/
wake_up(&conf->wait_barrier);
wake_up_barrier(conf);
/* Wait for array to be unfrozen */
/* Return false when nowait flag is set */
......@@ -1044,7 +1048,7 @@ static bool wait_barrier(struct r1conf *conf, sector_t sector_nr, bool nowait)
static void _allow_barrier(struct r1conf *conf, int idx)
{
atomic_dec(&conf->nr_pending[idx]);
wake_up(&conf->wait_barrier);
wake_up_barrier(conf);
}
static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
......@@ -1173,7 +1177,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
spin_lock_irq(&conf->device_lock);
bio_list_merge(&conf->pending_bio_list, &plug->pending);
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_barrier);
wake_up_barrier(conf);
md_wakeup_thread(mddev->thread);
kfree(plug);
return;
......@@ -1303,10 +1307,10 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
}
r1_bio->read_disk = rdisk;
if (!r1bio_existed && blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
r1_bio->start_time = bio_start_io_acct(bio);
if (!r1bio_existed) {
md_account_bio(mddev, &bio);
r1_bio->master_bio = bio;
}
read_bio = bio_alloc_clone(mirror->rdev->bdev, bio, gfp,
&mddev->bio_set);
......@@ -1500,8 +1504,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
r1_bio->sectors = max_sectors;
}
if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
r1_bio->start_time = bio_start_io_acct(bio);
md_account_bio(mddev, &bio);
r1_bio->master_bio = bio;
atomic_set(&r1_bio->remaining, 1);
atomic_set(&r1_bio->behind_remaining, 0);
......@@ -1576,7 +1580,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
r1_bio_write_done(r1_bio);
/* In case raid1d snuck in to freeze_array */
wake_up(&conf->wait_barrier);
wake_up_barrier(conf);
}
static bool raid1_make_request(struct mddev *mddev, struct bio *bio)
......@@ -1766,7 +1770,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct r1conf *conf = mddev->private;
int err = -EEXIST;
int mirror = 0;
int mirror = 0, repl_slot = -1;
struct raid1_info *p;
int first = 0;
int last = conf->raid_disks - 1;
......@@ -1809,17 +1813,21 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
break;
}
if (test_bit(WantReplacement, &p->rdev->flags) &&
p[conf->raid_disks].rdev == NULL) {
/* Add this device as a replacement */
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
rdev->raid_disk = mirror;
err = 0;
conf->fullsync = 1;
rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
break;
}
p[conf->raid_disks].rdev == NULL && repl_slot < 0)
repl_slot = mirror;
}
if (err && repl_slot >= 0) {
/* Add this device as a replacement */
p = conf->mirrors + repl_slot;
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
rdev->raid_disk = repl_slot;
err = 0;
conf->fullsync = 1;
rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
}
print_conf(conf);
return err;
}
......@@ -2299,7 +2307,7 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
d++;
if (d == conf->raid_disks * 2)
d = 0;
} while (!success && d != read_disk);
} while (d != read_disk);
if (!success) {
/* Cannot read from anywhere - mark it bad */
......
......@@ -157,7 +157,6 @@ struct r1bio {
sector_t sector;
int sectors;
unsigned long state;
unsigned long start_time;
struct mddev *mddev;
/*
* original bio going to /dev/mdx
......
......@@ -325,8 +325,6 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
bio->bi_status = BLK_STS_IOERR;
if (r10_bio->start_time)
bio_end_io_acct(bio, r10_bio->start_time);
bio_endio(bio);
/*
* Wake up any possible resync thread that waits for the device
......@@ -1172,7 +1170,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
}
static void raid10_read_request(struct mddev *mddev, struct bio *bio,
struct r10bio *r10_bio)
struct r10bio *r10_bio, bool io_accounting)
{
struct r10conf *conf = mddev->private;
struct bio *read_bio;
......@@ -1243,9 +1241,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
}
slot = r10_bio->read_slot;
if (!r10_bio->start_time &&
blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
r10_bio->start_time = bio_start_io_acct(bio);
if (io_accounting) {
md_account_bio(mddev, &bio);
r10_bio->master_bio = bio;
}
read_bio = bio_alloc_clone(rdev->bdev, bio, gfp, &mddev->bio_set);
r10_bio->devs[slot].bio = read_bio;
......@@ -1322,6 +1321,25 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
}
}
static struct md_rdev *dereference_rdev_and_rrdev(struct raid10_info *mirror,
struct md_rdev **prrdev)
{
struct md_rdev *rdev, *rrdev;
rrdev = rcu_dereference(mirror->replacement);
/*
* Read replacement first to prevent reading both rdev and
* replacement as NULL during replacement replace rdev.
*/
smp_mb();
rdev = rcu_dereference(mirror->rdev);
if (rdev == rrdev)
rrdev = NULL;
*prrdev = rrdev;
return rdev;
}
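The read order matters during the window where a replacement is promoted: the writer publishes the new 'rdev' before clearing 'replacement' (hedged sketch of the assumed transition; see raid10_remove_disk for the real code), so a reader that loads 'replacement' first can never observe both pointers as NULL:

        /*
         * writer (promotion)               reader (helper above)
         * rcu_assign_pointer(rdev, repl);  rrdev = rcu_dereference(replacement);
         * smp_mb();                        smp_mb();
         * rcu_assign_pointer(replacement,  rdev  = rcu_dereference(rdev);
         *                    NULL);
         *
         * If the reader sees replacement == NULL, the writer's earlier
         * store to rdev is guaranteed visible, so rdev != NULL.
         */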
static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
{
int i;
......@@ -1332,11 +1350,9 @@ static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
blocked_rdev = NULL;
rcu_read_lock();
for (i = 0; i < conf->copies; i++) {
struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
struct md_rdev *rrdev = rcu_dereference(
conf->mirrors[i].replacement);
if (rdev == rrdev)
rrdev = NULL;
struct md_rdev *rdev, *rrdev;
rdev = dereference_rdev_and_rrdev(&conf->mirrors[i], &rrdev);
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
atomic_inc(&rdev->nr_pending);
blocked_rdev = rdev;
......@@ -1465,15 +1481,7 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
int d = r10_bio->devs[i].devnum;
struct md_rdev *rdev, *rrdev;
rrdev = rcu_dereference(conf->mirrors[d].replacement);
/*
* Read replacement first to prevent reading both rdev and
* replacement as NULL during replacement replace rdev.
*/
smp_mb();
rdev = rcu_dereference(conf->mirrors[d].rdev);
if (rdev == rrdev)
rrdev = NULL;
rdev = dereference_rdev_and_rrdev(&conf->mirrors[d], &rrdev);
if (rdev && (test_bit(Faulty, &rdev->flags)))
rdev = NULL;
if (rrdev && (test_bit(Faulty, &rrdev->flags)))
......@@ -1543,8 +1551,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
r10_bio->master_bio = bio;
}
if (blk_queue_io_stat(bio->bi_bdev->bd_disk->queue))
r10_bio->start_time = bio_start_io_acct(bio);
md_account_bio(mddev, &bio);
r10_bio->master_bio = bio;
atomic_set(&r10_bio->remaining, 1);
md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
......@@ -1571,12 +1579,11 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
r10_bio->sector = bio->bi_iter.bi_sector;
r10_bio->state = 0;
r10_bio->read_slot = -1;
r10_bio->start_time = 0;
memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) *
conf->geo.raid_disks);
if (bio_data_dir(bio) == READ)
raid10_read_request(mddev, bio, r10_bio);
raid10_read_request(mddev, bio, r10_bio, true);
else
raid10_write_request(mddev, bio, r10_bio);
}
......@@ -1780,10 +1787,9 @@ static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
*/
rcu_read_lock();
for (disk = 0; disk < geo->raid_disks; disk++) {
struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
struct md_rdev *rrdev = rcu_dereference(
conf->mirrors[disk].replacement);
struct md_rdev *rdev, *rrdev;
rdev = dereference_rdev_and_rrdev(&conf->mirrors[disk], &rrdev);
r10_bio->devs[disk].bio = NULL;
r10_bio->devs[disk].repl_bio = NULL;
......@@ -2720,10 +2726,10 @@ static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
{
int sect = 0; /* Offset from r10_bio->sector */
int sectors = r10_bio->sectors;
int sectors = r10_bio->sectors, slot = r10_bio->read_slot;
struct md_rdev *rdev;
int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
int d = r10_bio->devs[r10_bio->read_slot].devnum;
int d = r10_bio->devs[slot].devnum;
/* still own a reference to this rdev, so it cannot
* have been cleared recently.
......@@ -2744,13 +2750,13 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
pr_notice("md/raid10:%s: %pg: Failing raid device\n",
mdname(mddev), rdev->bdev);
md_error(mddev, rdev);
r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
r10_bio->devs[slot].bio = IO_BLOCKED;
return;
}
while(sectors) {
int s = sectors;
int sl = r10_bio->read_slot;
int sl = slot;
int success = 0;
int start;
......@@ -2785,7 +2791,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
sl++;
if (sl == conf->copies)
sl = 0;
} while (!success && sl != r10_bio->read_slot);
} while (sl != slot);
rcu_read_unlock();
if (!success) {
......@@ -2793,16 +2799,16 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
* as bad on the first device to discourage future
* reads.
*/
int dn = r10_bio->devs[r10_bio->read_slot].devnum;
int dn = r10_bio->devs[slot].devnum;
rdev = conf->mirrors[dn].rdev;
if (!rdev_set_badblocks(
rdev,
r10_bio->devs[r10_bio->read_slot].addr
r10_bio->devs[slot].addr
+ sect,
s, 0)) {
md_error(mddev, rdev);
r10_bio->devs[r10_bio->read_slot].bio
r10_bio->devs[slot].bio
= IO_BLOCKED;
}
break;
......@@ -2811,7 +2817,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
start = sl;
/* write it back and re-read */
rcu_read_lock();
while (sl != r10_bio->read_slot) {
while (sl != slot) {
if (sl==0)
sl = conf->copies;
sl--;
......@@ -2845,7 +2851,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
rcu_read_lock();
}
sl = start;
while (sl != r10_bio->read_slot) {
while (sl != slot) {
if (sl==0)
sl = conf->copies;
sl--;
......@@ -2985,7 +2991,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
rdev_dec_pending(rdev, mddev);
r10_bio->state = 0;
raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false);
/*
* allow_barrier after re-submit to ensure no sync io
* can be issued while regular io pending.
......
......@@ -123,7 +123,6 @@ struct r10bio {
sector_t sector; /* virtual sector number */
int sectors;
unsigned long state;
unsigned long start_time;
struct mddev *mddev;
/*
* original bio going to /dev/mdx
......
......@@ -5468,26 +5468,17 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
*/
static void raid5_align_endio(struct bio *bi)
{
struct md_io_acct *md_io_acct = bi->bi_private;
struct bio *raid_bi = md_io_acct->orig_bio;
struct mddev *mddev;
struct r5conf *conf;
struct md_rdev *rdev;
struct bio *raid_bi = bi->bi_private;
struct md_rdev *rdev = (void *)raid_bi->bi_next;
struct mddev *mddev = rdev->mddev;
struct r5conf *conf = mddev->private;
blk_status_t error = bi->bi_status;
unsigned long start_time = md_io_acct->start_time;
bio_put(bi);
rdev = (void*)raid_bi->bi_next;
raid_bi->bi_next = NULL;
mddev = rdev->mddev;
conf = mddev->private;
rdev_dec_pending(rdev, conf->mddev);
if (!error) {
if (blk_queue_io_stat(raid_bi->bi_bdev->bd_disk->queue))
bio_end_io_acct(raid_bi, start_time);
bio_endio(raid_bi);
if (atomic_dec_and_test(&conf->active_aligned_reads))
wake_up(&conf->wait_for_quiescent);
......@@ -5506,7 +5497,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
struct md_rdev *rdev;
sector_t sector, end_sector, first_bad;
int bad_sectors, dd_idx;
struct md_io_acct *md_io_acct;
bool did_inc;
if (!in_chunk_boundary(mddev, raid_bio)) {
......@@ -5543,16 +5533,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
return 0;
}
align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
&mddev->io_acct_set);
md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone);
md_account_bio(mddev, &raid_bio);
raid_bio->bi_next = (void *)rdev;
if (blk_queue_io_stat(raid_bio->bi_bdev->bd_disk->queue))
md_io_acct->start_time = bio_start_io_acct(raid_bio);
md_io_acct->orig_bio = raid_bio;
align_bio = bio_alloc_clone(rdev->bdev, raid_bio, GFP_NOIO,
&mddev->bio_set);
align_bio->bi_end_io = raid5_align_endio;
align_bio->bi_private = md_io_acct;
align_bio->bi_private = raid_bio;
align_bio->bi_iter.bi_sector = sector;
/* No reshape active, so we can trust rdev->data_offset */
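raid5's aligned-read path no longer needs struct md_io_acct: md_account_bio() clones the master bio for accounting, and the rdev pointer rides in the master bio's otherwise unused ->bi_next while the aligned read is in flight. raid5_align_endio() has to undo that stash before completing; condensed from the hunk above:

        struct bio *raid_bi = bi->bi_private;
        struct md_rdev *rdev = (void *)raid_bi->bi_next;
        struct mddev *mddev = rdev->mddev;

        raid_bi->bi_next = NULL;        /* restore before bio_endio() */
        bio_put(bi);
        rdev_dec_pending(rdev, mddev);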
......@@ -7787,19 +7774,12 @@ static int raid5_run(struct mddev *mddev)
struct md_rdev *rdev;
struct md_rdev *journal_dev = NULL;
sector_t reshape_offset = 0;
int i, ret = 0;
int i;
long long min_offset_diff = 0;
int first = 1;
if (acct_bioset_init(mddev)) {
pr_err("md/raid456:%s: alloc acct bioset failed.\n", mdname(mddev));
if (mddev_init_writes_pending(mddev) < 0)
return -ENOMEM;
}
if (mddev_init_writes_pending(mddev) < 0) {
ret = -ENOMEM;
goto exit_acct_set;
}
if (mddev->recovery_cp != MaxSector)
pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
......@@ -7830,8 +7810,7 @@ static int raid5_run(struct mddev *mddev)
(mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
mdname(mddev));
ret = -EINVAL;
goto exit_acct_set;
return -EINVAL;
}
if (mddev->reshape_position != MaxSector) {
......@@ -7856,15 +7835,13 @@ static int raid5_run(struct mddev *mddev)
if (journal_dev) {
pr_warn("md/raid:%s: don't support reshape with journal - aborting.\n",
mdname(mddev));
ret = -EINVAL;
goto exit_acct_set;
return -EINVAL;
}
if (mddev->new_level != mddev->level) {
pr_warn("md/raid:%s: unsupported reshape required - aborting.\n",
mdname(mddev));
ret = -EINVAL;
goto exit_acct_set;
return -EINVAL;
}
old_disks = mddev->raid_disks - mddev->delta_disks;
/* reshape_position must be on a new-stripe boundary, and one
......@@ -7880,8 +7857,7 @@ static int raid5_run(struct mddev *mddev)
if (sector_div(here_new, chunk_sectors * new_data_disks)) {
pr_warn("md/raid:%s: reshape_position not on a stripe boundary\n",
mdname(mddev));
ret = -EINVAL;
goto exit_acct_set;
return -EINVAL;
}
reshape_offset = here_new * chunk_sectors;
/* here_new is the stripe we will write to */
......@@ -7903,8 +7879,7 @@ static int raid5_run(struct mddev *mddev)
else if (mddev->ro == 0) {
pr_warn("md/raid:%s: in-place reshape must be started in read-only mode - aborting\n",
mdname(mddev));
ret = -EINVAL;
goto exit_acct_set;
return -EINVAL;
}
} else if (mddev->reshape_backwards
? (here_new * chunk_sectors + min_offset_diff <=
......@@ -7914,8 +7889,7 @@ static int raid5_run(struct mddev *mddev)
/* Reading from the same stripe as writing to - bad */
pr_warn("md/raid:%s: reshape_position too early for auto-recovery - aborting.\n",
mdname(mddev));
ret = -EINVAL;
goto exit_acct_set;
return -EINVAL;
}
pr_debug("md/raid:%s: reshape will continue\n", mdname(mddev));
/* OK, we should be able to continue; */
......@@ -7939,10 +7913,8 @@ static int raid5_run(struct mddev *mddev)
else
conf = mddev->private;
if (IS_ERR(conf)) {
ret = PTR_ERR(conf);
goto exit_acct_set;
}
if (IS_ERR(conf))
return PTR_ERR(conf);
if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
if (!journal_dev) {
......@@ -8140,10 +8112,7 @@ static int raid5_run(struct mddev *mddev)
free_conf(conf);
mddev->private = NULL;
pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
ret = -EIO;
exit_acct_set:
acct_bioset_exit(mddev);
return ret;
return -EIO;
}
static void raid5_free(struct mddev *mddev, void *priv)
......@@ -8151,7 +8120,6 @@ static void raid5_free(struct mddev *mddev, void *priv)
struct r5conf *conf = priv;
free_conf(conf);
acct_bioset_exit(mddev);
mddev->to_remove = &raid5_attrs_group;
}
......