Commit 7e303099 authored by Linus Torvalds

Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

Pull MD bugfixes from Shaohua Li:

 - fix raid5-ppl flush request handling hang from Artur

 - fix a potential deadlock in raid5/10 reshape from BingJing

 - fix a deadlock for dm-raid from Heinz

 - fix two md-cluster issues with raid10 from Lidong and Guoqing

 - fix a NULL dereference problem in device removal from Neil

 - fix a NULL dereference problem in raid1/raid10 under a specific condition
   from Yufen

 - other cleanup and fixes

* 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
  md/raid1: fix NULL pointer dereference
  md: fix a potential deadlock of raid5/raid10 reshape
  md-cluster: choose correct label when clustered layout is not supported
  md: raid5: avoid string overflow warning
  raid5-ppl: fix handling flush requests
  md raid10: fix NULL deference in handle_write_completed()
  md: only allow remove_and_add_spares when no sync_thread running.
  md: document lifetime of internal rdev pointer.
  md: fix md_write_start() deadlock w/o metadata devices
  MD: Free bioset when md_run fails
  raid10: change the size of resync window for clustered raid
  md-multipath: Use seq_putc() in multipath_status()
  md/raid1: Fix trailing semicolon
  md/raid5: simplify uninitialization of shrinker
parents 7bec4a96 3de59bb9
...@@ -157,7 +157,7 @@ static void multipath_status(struct seq_file *seq, struct mddev *mddev) ...@@ -157,7 +157,7 @@ static void multipath_status(struct seq_file *seq, struct mddev *mddev)
seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_"); seq_printf (seq, "%s", rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
} }
rcu_read_unlock(); rcu_read_unlock();
seq_printf (seq, "]"); seq_putc(seq, ']');
} }
static int multipath_congested(struct mddev *mddev, int bits) static int multipath_congested(struct mddev *mddev, int bits)
......
...@@ -801,6 +801,9 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev, ...@@ -801,6 +801,9 @@ void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
struct bio *bio; struct bio *bio;
int ff = 0; int ff = 0;
if (!page)
return;
if (test_bit(Faulty, &rdev->flags)) if (test_bit(Faulty, &rdev->flags))
return; return;
...@@ -5452,6 +5455,7 @@ int md_run(struct mddev *mddev) ...@@ -5452,6 +5455,7 @@ int md_run(struct mddev *mddev)
* the only valid external interface is through the md * the only valid external interface is through the md
* device. * device.
*/ */
mddev->has_superblocks = false;
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
if (test_bit(Faulty, &rdev->flags)) if (test_bit(Faulty, &rdev->flags))
continue; continue;
...@@ -5465,6 +5469,9 @@ int md_run(struct mddev *mddev) ...@@ -5465,6 +5469,9 @@ int md_run(struct mddev *mddev)
set_disk_ro(mddev->gendisk, 1); set_disk_ro(mddev->gendisk, 1);
} }
if (rdev->sb_page)
mddev->has_superblocks = true;
/* perform some consistency tests on the device. /* perform some consistency tests on the device.
* We don't want the data to overlap the metadata, * We don't want the data to overlap the metadata,
* Internal Bitmap issues have been handled elsewhere. * Internal Bitmap issues have been handled elsewhere.
...@@ -5497,8 +5504,10 @@ int md_run(struct mddev *mddev) ...@@ -5497,8 +5504,10 @@ int md_run(struct mddev *mddev)
} }
if (mddev->sync_set == NULL) { if (mddev->sync_set == NULL) {
mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
if (!mddev->sync_set) if (!mddev->sync_set) {
return -ENOMEM; err = -ENOMEM;
goto abort;
}
} }
spin_lock(&pers_lock); spin_lock(&pers_lock);
...@@ -5511,7 +5520,8 @@ int md_run(struct mddev *mddev) ...@@ -5511,7 +5520,8 @@ int md_run(struct mddev *mddev)
else else
pr_warn("md: personality for level %s is not loaded!\n", pr_warn("md: personality for level %s is not loaded!\n",
mddev->clevel); mddev->clevel);
return -EINVAL; err = -EINVAL;
goto abort;
} }
spin_unlock(&pers_lock); spin_unlock(&pers_lock);
if (mddev->level != pers->level) { if (mddev->level != pers->level) {
...@@ -5524,7 +5534,8 @@ int md_run(struct mddev *mddev) ...@@ -5524,7 +5534,8 @@ int md_run(struct mddev *mddev)
pers->start_reshape == NULL) { pers->start_reshape == NULL) {
/* This personality cannot handle reshaping... */ /* This personality cannot handle reshaping... */
module_put(pers->owner); module_put(pers->owner);
return -EINVAL; err = -EINVAL;
goto abort;
} }
if (pers->sync_request) { if (pers->sync_request) {
...@@ -5593,7 +5604,7 @@ int md_run(struct mddev *mddev) ...@@ -5593,7 +5604,7 @@ int md_run(struct mddev *mddev)
mddev->private = NULL; mddev->private = NULL;
module_put(pers->owner); module_put(pers->owner);
bitmap_destroy(mddev); bitmap_destroy(mddev);
return err; goto abort;
} }
if (mddev->queue) { if (mddev->queue) {
bool nonrot = true; bool nonrot = true;
...@@ -5655,6 +5666,18 @@ int md_run(struct mddev *mddev) ...@@ -5655,6 +5666,18 @@ int md_run(struct mddev *mddev)
sysfs_notify_dirent_safe(mddev->sysfs_action); sysfs_notify_dirent_safe(mddev->sysfs_action);
sysfs_notify(&mddev->kobj, NULL, "degraded"); sysfs_notify(&mddev->kobj, NULL, "degraded");
return 0; return 0;
abort:
if (mddev->bio_set) {
bioset_free(mddev->bio_set);
mddev->bio_set = NULL;
}
if (mddev->sync_set) {
bioset_free(mddev->sync_set);
mddev->sync_set = NULL;
}
return err;
} }
EXPORT_SYMBOL_GPL(md_run); EXPORT_SYMBOL_GPL(md_run);
...@@ -8049,6 +8072,7 @@ EXPORT_SYMBOL(md_done_sync); ...@@ -8049,6 +8072,7 @@ EXPORT_SYMBOL(md_done_sync);
bool md_write_start(struct mddev *mddev, struct bio *bi) bool md_write_start(struct mddev *mddev, struct bio *bi)
{ {
int did_change = 0; int did_change = 0;
if (bio_data_dir(bi) != WRITE) if (bio_data_dir(bi) != WRITE)
return true; return true;
...@@ -8081,6 +8105,8 @@ bool md_write_start(struct mddev *mddev, struct bio *bi) ...@@ -8081,6 +8105,8 @@ bool md_write_start(struct mddev *mddev, struct bio *bi)
rcu_read_unlock(); rcu_read_unlock();
if (did_change) if (did_change)
sysfs_notify_dirent_safe(mddev->sysfs_state); sysfs_notify_dirent_safe(mddev->sysfs_state);
if (!mddev->has_superblocks)
return true;
wait_event(mddev->sb_wait, wait_event(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) || !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
mddev->suspended); mddev->suspended);
...@@ -8543,6 +8569,19 @@ void md_do_sync(struct md_thread *thread) ...@@ -8543,6 +8569,19 @@ void md_do_sync(struct md_thread *thread)
set_mask_bits(&mddev->sb_flags, 0, set_mask_bits(&mddev->sb_flags, 0,
BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS)); BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
mddev->delta_disks > 0 &&
mddev->pers->finish_reshape &&
mddev->pers->size &&
mddev->queue) {
mddev_lock_nointr(mddev);
md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
mddev_unlock(mddev);
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
}
spin_lock(&mddev->lock); spin_lock(&mddev->lock);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* We completed so min/max setting can be forgotten if used. */ /* We completed so min/max setting can be forgotten if used. */
...@@ -8569,6 +8608,10 @@ static int remove_and_add_spares(struct mddev *mddev, ...@@ -8569,6 +8608,10 @@ static int remove_and_add_spares(struct mddev *mddev,
int removed = 0; int removed = 0;
bool remove_some = false; bool remove_some = false;
if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
/* Mustn't remove devices when resync thread is running */
return 0;
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
if ((this == NULL || rdev == this) && if ((this == NULL || rdev == this) &&
rdev->raid_disk >= 0 && rdev->raid_disk >= 0 &&
......
...@@ -468,6 +468,8 @@ struct mddev { ...@@ -468,6 +468,8 @@ struct mddev {
void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
struct md_cluster_info *cluster_info; struct md_cluster_info *cluster_info;
unsigned int good_device_nr; /* good device num within cluster raid */ unsigned int good_device_nr; /* good device num within cluster raid */
bool has_superblocks:1;
}; };
enum recovery_flags { enum recovery_flags {
......
...@@ -1809,6 +1809,17 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1809,6 +1809,17 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct md_rdev *repl = struct md_rdev *repl =
conf->mirrors[conf->raid_disks + number].rdev; conf->mirrors[conf->raid_disks + number].rdev;
freeze_array(conf, 0); freeze_array(conf, 0);
if (atomic_read(&repl->nr_pending)) {
/* It means that some queued IO of retry_list
* hold repl. Thus, we cannot set replacement
* as NULL, avoiding rdev NULL pointer
* dereference in sync_request_write and
* handle_write_finished.
*/
err = -EBUSY;
unfreeze_array(conf);
goto abort;
}
clear_bit(Replacement, &repl->flags); clear_bit(Replacement, &repl->flags);
p->rdev = repl; p->rdev = repl;
conf->mirrors[conf->raid_disks + number].rdev = NULL; conf->mirrors[conf->raid_disks + number].rdev = NULL;
......
...@@ -26,6 +26,18 @@ ...@@ -26,6 +26,18 @@
#define BARRIER_BUCKETS_NR_BITS (PAGE_SHIFT - ilog2(sizeof(atomic_t))) #define BARRIER_BUCKETS_NR_BITS (PAGE_SHIFT - ilog2(sizeof(atomic_t)))
#define BARRIER_BUCKETS_NR (1<<BARRIER_BUCKETS_NR_BITS) #define BARRIER_BUCKETS_NR (1<<BARRIER_BUCKETS_NR_BITS)
/* Note: raid1_info.rdev can be set to NULL asynchronously by raid1_remove_disk.
* There are three safe ways to access raid1_info.rdev.
* 1/ when holding mddev->reconfig_mutex
* 2/ when resync/recovery is known to be happening - i.e. in code that is
* called as part of performing resync/recovery.
* 3/ while holding rcu_read_lock(), use rcu_dereference to get the pointer
* and if it is non-NULL, increment rdev->nr_pending before dropping the
* RCU lock.
* When .rdev is set to NULL, the nr_pending count is checked again and if it
* has been incremented, the pointer is put back in .rdev.
*/
struct raid1_info { struct raid1_info {
struct md_rdev *rdev; struct md_rdev *rdev;
sector_t head_position; sector_t head_position;
......
...@@ -141,7 +141,7 @@ static void r10bio_pool_free(void *r10_bio, void *data) ...@@ -141,7 +141,7 @@ static void r10bio_pool_free(void *r10_bio, void *data)
#define RESYNC_WINDOW (1024*1024) #define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */ /* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE) #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) #define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) #define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
/* /*
...@@ -2655,7 +2655,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) ...@@ -2655,7 +2655,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
for (m = 0; m < conf->copies; m++) { for (m = 0; m < conf->copies; m++) {
int dev = r10_bio->devs[m].devnum; int dev = r10_bio->devs[m].devnum;
rdev = conf->mirrors[dev].rdev; rdev = conf->mirrors[dev].rdev;
if (r10_bio->devs[m].bio == NULL) if (r10_bio->devs[m].bio == NULL ||
r10_bio->devs[m].bio->bi_end_io == NULL)
continue; continue;
if (!r10_bio->devs[m].bio->bi_status) { if (!r10_bio->devs[m].bio->bi_status) {
rdev_clear_badblocks( rdev_clear_badblocks(
...@@ -2670,7 +2671,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) ...@@ -2670,7 +2671,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
md_error(conf->mddev, rdev); md_error(conf->mddev, rdev);
} }
rdev = conf->mirrors[dev].replacement; rdev = conf->mirrors[dev].replacement;
if (r10_bio->devs[m].repl_bio == NULL) if (r10_bio->devs[m].repl_bio == NULL ||
r10_bio->devs[m].repl_bio->bi_end_io == NULL)
continue; continue;
if (!r10_bio->devs[m].repl_bio->bi_status) { if (!r10_bio->devs[m].repl_bio->bi_status) {
...@@ -3782,7 +3784,7 @@ static int raid10_run(struct mddev *mddev) ...@@ -3782,7 +3784,7 @@ static int raid10_run(struct mddev *mddev)
if (fc > 1 || fo > 0) { if (fc > 1 || fo > 0) {
pr_err("only near layout is supported by clustered" pr_err("only near layout is supported by clustered"
" raid10\n"); " raid10\n");
goto out; goto out_free_conf;
} }
} }
...@@ -4830,17 +4832,11 @@ static void raid10_finish_reshape(struct mddev *mddev) ...@@ -4830,17 +4832,11 @@ static void raid10_finish_reshape(struct mddev *mddev)
return; return;
if (mddev->delta_disks > 0) { if (mddev->delta_disks > 0) {
sector_t size = raid10_size(mddev, 0, 0);
md_set_array_sectors(mddev, size);
if (mddev->recovery_cp > mddev->resync_max_sectors) { if (mddev->recovery_cp > mddev->resync_max_sectors) {
mddev->recovery_cp = mddev->resync_max_sectors; mddev->recovery_cp = mddev->resync_max_sectors;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
} }
mddev->resync_max_sectors = size; mddev->resync_max_sectors = mddev->array_sectors;
if (mddev->queue) {
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
}
} else { } else {
int d; int d;
rcu_read_lock(); rcu_read_lock();
......
...@@ -2,6 +2,19 @@ ...@@ -2,6 +2,19 @@
#ifndef _RAID10_H #ifndef _RAID10_H
#define _RAID10_H #define _RAID10_H
/* Note: raid10_info.rdev can be set to NULL asynchronously by
* raid10_remove_disk.
* There are three safe ways to access raid10_info.rdev.
* 1/ when holding mddev->reconfig_mutex
* 2/ when resync/recovery/reshape is known to be happening - i.e. in code
* that is called as part of performing resync/recovery/reshape.
* 3/ while holding rcu_read_lock(), use rcu_dereference to get the pointer
* and if it is non-NULL, increment rdev->nr_pending before dropping the
* RCU lock.
* When .rdev is set to NULL, the nr_pending count is checked again and if it
* has been incremented, the pointer is put back in .rdev.
*/
struct raid10_info { struct raid10_info {
struct md_rdev *rdev, *replacement; struct md_rdev *rdev, *replacement;
sector_t head_position; sector_t head_position;
......
...@@ -44,6 +44,7 @@ extern void ppl_write_stripe_run(struct r5conf *conf); ...@@ -44,6 +44,7 @@ extern void ppl_write_stripe_run(struct r5conf *conf);
extern void ppl_stripe_write_finished(struct stripe_head *sh); extern void ppl_stripe_write_finished(struct stripe_head *sh);
extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add); extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
extern void ppl_quiesce(struct r5conf *conf, int quiesce); extern void ppl_quiesce(struct r5conf *conf, int quiesce);
extern int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio);
static inline bool raid5_has_ppl(struct r5conf *conf) static inline bool raid5_has_ppl(struct r5conf *conf)
{ {
...@@ -104,7 +105,7 @@ static inline int log_handle_flush_request(struct r5conf *conf, struct bio *bio) ...@@ -104,7 +105,7 @@ static inline int log_handle_flush_request(struct r5conf *conf, struct bio *bio)
if (conf->log) if (conf->log)
ret = r5l_handle_flush_request(conf->log, bio); ret = r5l_handle_flush_request(conf->log, bio);
else if (raid5_has_ppl(conf)) else if (raid5_has_ppl(conf))
ret = 0; ret = ppl_handle_flush_request(conf->log, bio);
return ret; return ret;
} }
......
...@@ -693,6 +693,16 @@ void ppl_quiesce(struct r5conf *conf, int quiesce) ...@@ -693,6 +693,16 @@ void ppl_quiesce(struct r5conf *conf, int quiesce)
} }
} }
/*
 * Handle a flush request for an array that uses PPL.
 *
 * A bio that carries data (bi_size != 0) has its REQ_PREFLUSH flag cleared
 * and -EAGAIN is returned; presumably the caller then goes on to process the
 * bio as an ordinary request (TODO confirm against the caller).
 *
 * A bio without any data is completed here via bio_endio() and 0 is returned.
 *
 * @log is not used by this function.
 */
int ppl_handle_flush_request(struct r5l_log *log, struct bio *bio)
{
	if (bio->bi_iter.bi_size != 0) {
		/* Strip only the preflush bit; all other op flags are kept. */
		bio->bi_opf &= ~REQ_PREFLUSH;
		return -EAGAIN;
	}

	/* Nothing but the flush itself: finish the bio immediately. */
	bio_endio(bio);
	return 0;
}
void ppl_stripe_write_finished(struct stripe_head *sh) void ppl_stripe_write_finished(struct stripe_head *sh)
{ {
struct ppl_io_unit *io; struct ppl_io_unit *io;
......
...@@ -2196,15 +2196,16 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) ...@@ -2196,15 +2196,16 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
static int grow_stripes(struct r5conf *conf, int num) static int grow_stripes(struct r5conf *conf, int num)
{ {
struct kmem_cache *sc; struct kmem_cache *sc;
size_t namelen = sizeof(conf->cache_name[0]);
int devs = max(conf->raid_disks, conf->previous_raid_disks); int devs = max(conf->raid_disks, conf->previous_raid_disks);
if (conf->mddev->gendisk) if (conf->mddev->gendisk)
sprintf(conf->cache_name[0], snprintf(conf->cache_name[0], namelen,
"raid%d-%s", conf->level, mdname(conf->mddev)); "raid%d-%s", conf->level, mdname(conf->mddev));
else else
sprintf(conf->cache_name[0], snprintf(conf->cache_name[0], namelen,
"raid%d-%p", conf->level, conf->mddev); "raid%d-%p", conf->level, conf->mddev);
sprintf(conf->cache_name[1], "%s-alt", conf->cache_name[0]); snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
conf->active_name = 0; conf->active_name = 0;
sc = kmem_cache_create(conf->cache_name[conf->active_name], sc = kmem_cache_create(conf->cache_name[conf->active_name],
...@@ -6764,9 +6765,7 @@ static void free_conf(struct r5conf *conf) ...@@ -6764,9 +6765,7 @@ static void free_conf(struct r5conf *conf)
log_exit(conf); log_exit(conf);
if (conf->shrinker.nr_deferred) unregister_shrinker(&conf->shrinker);
unregister_shrinker(&conf->shrinker);
free_thread_groups(conf); free_thread_groups(conf);
shrink_stripes(conf); shrink_stripes(conf);
raid5_free_percpu(conf); raid5_free_percpu(conf);
...@@ -8001,13 +8000,7 @@ static void raid5_finish_reshape(struct mddev *mddev) ...@@ -8001,13 +8000,7 @@ static void raid5_finish_reshape(struct mddev *mddev)
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
if (mddev->delta_disks > 0) { if (mddev->delta_disks <= 0) {
md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
if (mddev->queue) {
set_capacity(mddev->gendisk, mddev->array_sectors);
revalidate_disk(mddev->gendisk);
}
} else {
int d; int d;
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
mddev->degraded = raid5_calc_degraded(conf); mddev->degraded = raid5_calc_degraded(conf);
......
...@@ -450,6 +450,18 @@ enum { ...@@ -450,6 +450,18 @@ enum {
* HANDLE gets cleared if stripe_handle leaves nothing locked. * HANDLE gets cleared if stripe_handle leaves nothing locked.
*/ */
/* Note: disk_info.rdev can be set to NULL asynchronously by raid5_remove_disk.
* There are three safe ways to access disk_info.rdev.
* 1/ when holding mddev->reconfig_mutex
* 2/ when resync/recovery/reshape is known to be happening - i.e. in code that
* is called as part of performing resync/recovery/reshape.
* 3/ while holding rcu_read_lock(), use rcu_dereference to get the pointer
* and if it is non-NULL, increment rdev->nr_pending before dropping the RCU
* lock.
* When .rdev is set to NULL, the nr_pending count is checked again and if
* it has been incremented, the pointer is put back in .rdev.
*/
struct disk_info { struct disk_info {
struct md_rdev *rdev, *replacement; struct md_rdev *rdev, *replacement;
struct page *extra_page; /* extra page to use in prexor */ struct page *extra_page; /* extra page to use in prexor */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment