Commit 040639b7 authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

Pull MD updates from Shaohua Li:
 "Some small fixes for MD:

   - fix raid5-cache potential problems if raid5 cache isn't fully
     recovered

   - fix a wait-within-wait warning in raid1/10

   - make raid5-PPL support disks with writeback cache enabled"

* 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
  raid5-ppl: PPL support for disks with write-back cache enabled
  md/r5cache: print more info of log recovery
  md/raid1,raid10: silence warning about wait-within-wait
  md: introduce new personality funciton start()
parents 20c59c71 1532d9e8
...@@ -39,6 +39,7 @@ case the behavior is the same as in plain raid5. ...@@ -39,6 +39,7 @@ case the behavior is the same as in plain raid5.
PPL is available for md version-1 metadata and external (specifically IMSM) PPL is available for md version-1 metadata and external (specifically IMSM)
metadata arrays. It can be enabled using mdadm option --consistency-policy=ppl. metadata arrays. It can be enabled using mdadm option --consistency-policy=ppl.
Currently, volatile write-back cache should be disabled on all member drives There is a limitation of maximum 64 disks in the array for PPL. It allows to
when using PPL. Otherwise it cannot guarantee consistency in case of power keep data structures and implementation simple. RAID5 arrays with so many disks
failure. are not likely due to high risk of multiple disks failure. Such restriction
should not be a real life limitation.
...@@ -3151,6 +3151,14 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) ...@@ -3151,6 +3151,14 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
goto bad; goto bad;
} }
r = md_start(&rs->md);
if (r) {
ti->error = "Failed to start raid array";
mddev_unlock(&rs->md);
goto bad_md_start;
}
rs->callbacks.congested_fn = raid_is_congested; rs->callbacks.congested_fn = raid_is_congested;
dm_table_add_target_callbacks(ti->table, &rs->callbacks); dm_table_add_target_callbacks(ti->table, &rs->callbacks);
...@@ -3198,6 +3206,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv) ...@@ -3198,6 +3206,7 @@ static int raid_ctr(struct dm_target *ti, unsigned int argc, char **argv)
mddev_unlock(&rs->md); mddev_unlock(&rs->md);
return 0; return 0;
bad_md_start:
bad_journal_mode_set: bad_journal_mode_set:
bad_stripe_cache: bad_stripe_cache:
bad_check_reshape: bad_check_reshape:
......
...@@ -711,7 +711,7 @@ static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) ...@@ -711,7 +711,7 @@ static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
return NULL; return NULL;
} }
static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev) struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
{ {
struct md_rdev *rdev; struct md_rdev *rdev;
...@@ -721,6 +721,7 @@ static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev) ...@@ -721,6 +721,7 @@ static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
return NULL; return NULL;
} }
EXPORT_SYMBOL_GPL(md_find_rdev_rcu);
static struct md_personality *find_pers(int level, char *clevel) static struct md_personality *find_pers(int level, char *clevel)
{ {
...@@ -5560,11 +5561,6 @@ int md_run(struct mddev *mddev) ...@@ -5560,11 +5561,6 @@ int md_run(struct mddev *mddev)
if (start_readonly && mddev->ro == 0) if (start_readonly && mddev->ro == 0)
mddev->ro = 2; /* read-only, but switch on first write */ mddev->ro = 2; /* read-only, but switch on first write */
/*
* NOTE: some pers->run(), for example r5l_recovery_log(), wakes
* up mddev->thread. It is important to initialize critical
* resources for mddev->thread BEFORE calling pers->run().
*/
err = pers->run(mddev); err = pers->run(mddev);
if (err) if (err)
pr_warn("md: pers->run() failed ...\n"); pr_warn("md: pers->run() failed ...\n");
...@@ -5678,6 +5674,9 @@ static int do_md_run(struct mddev *mddev) ...@@ -5678,6 +5674,9 @@ static int do_md_run(struct mddev *mddev)
if (mddev_is_clustered(mddev)) if (mddev_is_clustered(mddev))
md_allow_write(mddev); md_allow_write(mddev);
/* run start up tasks that require md_thread */
md_start(mddev);
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
...@@ -5689,6 +5688,21 @@ static int do_md_run(struct mddev *mddev) ...@@ -5689,6 +5688,21 @@ static int do_md_run(struct mddev *mddev)
return err; return err;
} }
/*
 * md_start() - run the personality's optional start() hook.
 * @mddev: array being started.
 *
 * MD_RECOVERY_WAIT is held in mddev->recovery for the duration of the
 * hook and cleared again afterwards; mddev->thread is woken before the
 * hook runs and mddev->sync_thread is woken once it has returned.
 *
 * Return: 0 if the personality provides no start() hook, otherwise the
 * value returned by pers->start().
 */
int md_start(struct mddev *mddev)
{
	int err;

	/* Nothing to do for personalities without a start() hook. */
	if (!mddev->pers->start)
		return 0;

	set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
	md_wakeup_thread(mddev->thread);

	err = mddev->pers->start(mddev);

	clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
	md_wakeup_thread(mddev->sync_thread);

	return err;
}
EXPORT_SYMBOL_GPL(md_start);
static int restart_array(struct mddev *mddev) static int restart_array(struct mddev *mddev)
{ {
struct gendisk *disk = mddev->gendisk; struct gendisk *disk = mddev->gendisk;
...@@ -6997,7 +7011,7 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev) ...@@ -6997,7 +7011,7 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev)
return -ENODEV; return -ENODEV;
rcu_read_lock(); rcu_read_lock();
rdev = find_rdev_rcu(mddev, dev); rdev = md_find_rdev_rcu(mddev, dev);
if (!rdev) if (!rdev)
err = -ENODEV; err = -ENODEV;
else { else {
...@@ -8169,7 +8183,8 @@ void md_do_sync(struct md_thread *thread) ...@@ -8169,7 +8183,8 @@ void md_do_sync(struct md_thread *thread)
int ret; int ret;
/* just incase thread restarts... */ /* just incase thread restarts... */
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
test_bit(MD_RECOVERY_WAIT, &mddev->recovery))
return; return;
if (mddev->ro) {/* never try to sync a read-only array */ if (mddev->ro) {/* never try to sync a read-only array */
set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_RECOVERY_INTR, &mddev->recovery);
......
...@@ -485,6 +485,7 @@ enum recovery_flags { ...@@ -485,6 +485,7 @@ enum recovery_flags {
MD_RECOVERY_RESHAPE, /* A reshape is happening */ MD_RECOVERY_RESHAPE, /* A reshape is happening */
MD_RECOVERY_FROZEN, /* User request to abort, and not restart, any action */ MD_RECOVERY_FROZEN, /* User request to abort, and not restart, any action */
MD_RECOVERY_ERROR, /* sync-action interrupted because io-error */ MD_RECOVERY_ERROR, /* sync-action interrupted because io-error */
MD_RECOVERY_WAIT, /* waiting for pers->start() to finish */
}; };
static inline int __must_check mddev_lock(struct mddev *mddev) static inline int __must_check mddev_lock(struct mddev *mddev)
...@@ -523,7 +524,13 @@ struct md_personality ...@@ -523,7 +524,13 @@ struct md_personality
struct list_head list; struct list_head list;
struct module *owner; struct module *owner;
bool (*make_request)(struct mddev *mddev, struct bio *bio); bool (*make_request)(struct mddev *mddev, struct bio *bio);
/*
 * Start-up work that does NOT require md_thread. Tasks that
 * require md_thread should go into start().
 */
int (*run)(struct mddev *mddev); int (*run)(struct mddev *mddev);
/* start-up work that requires md threads */
int (*start)(struct mddev *mddev);
void (*free)(struct mddev *mddev, void *priv); void (*free)(struct mddev *mddev, void *priv);
void (*status)(struct seq_file *seq, struct mddev *mddev); void (*status)(struct seq_file *seq, struct mddev *mddev);
/* error_handler must set ->faulty and clear ->in_sync /* error_handler must set ->faulty and clear ->in_sync
...@@ -687,6 +694,7 @@ extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); ...@@ -687,6 +694,7 @@ extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
extern void mddev_init(struct mddev *mddev); extern void mddev_init(struct mddev *mddev);
extern int md_run(struct mddev *mddev); extern int md_run(struct mddev *mddev);
extern int md_start(struct mddev *mddev);
extern void md_stop(struct mddev *mddev); extern void md_stop(struct mddev *mddev);
extern void md_stop_writes(struct mddev *mddev); extern void md_stop_writes(struct mddev *mddev);
extern int md_rdev_init(struct md_rdev *rdev); extern int md_rdev_init(struct md_rdev *rdev);
...@@ -702,6 +710,7 @@ extern void md_reload_sb(struct mddev *mddev, int raid_disk); ...@@ -702,6 +710,7 @@ extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force); extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev); extern void md_kick_rdev_from_array(struct md_rdev * rdev);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev);
static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
{ {
......
...@@ -815,6 +815,17 @@ static void flush_pending_writes(struct r1conf *conf) ...@@ -815,6 +815,17 @@ static void flush_pending_writes(struct r1conf *conf)
bio = bio_list_get(&conf->pending_bio_list); bio = bio_list_get(&conf->pending_bio_list);
conf->pending_count = 0; conf->pending_count = 0;
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
/*
* As this is called in a wait_event() loop (see freeze_array),
* current->state might be TASK_UNINTERRUPTIBLE which will
* cause a warning when we prepare to wait again. As it is
* rare that this path is taken, it is perfectly safe to force
* us to go around the wait_event() loop again, so the warning
* is a false-positive. Silence the warning by resetting
* thread state
*/
__set_current_state(TASK_RUNNING);
blk_start_plug(&plug); blk_start_plug(&plug);
flush_bio_list(conf, bio); flush_bio_list(conf, bio);
blk_finish_plug(&plug); blk_finish_plug(&plug);
......
...@@ -900,6 +900,18 @@ static void flush_pending_writes(struct r10conf *conf) ...@@ -900,6 +900,18 @@ static void flush_pending_writes(struct r10conf *conf)
bio = bio_list_get(&conf->pending_bio_list); bio = bio_list_get(&conf->pending_bio_list);
conf->pending_count = 0; conf->pending_count = 0;
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
/*
* As this is called in a wait_event() loop (see freeze_array),
* current->state might be TASK_UNINTERRUPTIBLE which will
* cause a warning when we prepare to wait again. As it is
* rare that this path is taken, it is perfectly safe to force
* us to go around the wait_event() loop again, so the warning
* is a false-positive. Silence the warning by resetting
* thread state
*/
__set_current_state(TASK_RUNNING);
blk_start_plug(&plug); blk_start_plug(&plug);
/* flush any pending bitmap writes to disk /* flush any pending bitmap writes to disk
* before proceeding w/ I/O */ * before proceeding w/ I/O */
......
...@@ -1111,9 +1111,6 @@ void r5l_write_stripe_run(struct r5l_log *log) ...@@ -1111,9 +1111,6 @@ void r5l_write_stripe_run(struct r5l_log *log)
int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio) int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
{ {
if (!log)
return -ENODEV;
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) { if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) {
/* /*
* in write through (journal only) * in write through (journal only)
...@@ -1592,8 +1589,6 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space) ...@@ -1592,8 +1589,6 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
void r5l_quiesce(struct r5l_log *log, int quiesce) void r5l_quiesce(struct r5l_log *log, int quiesce)
{ {
struct mddev *mddev; struct mddev *mddev;
if (!log)
return;
if (quiesce) { if (quiesce) {
/* make sure r5l_write_super_and_discard_space exits */ /* make sure r5l_write_super_and_discard_space exits */
...@@ -2448,7 +2443,6 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log, ...@@ -2448,7 +2443,6 @@ static void r5c_recovery_flush_data_only_stripes(struct r5l_log *log,
raid5_release_stripe(sh); raid5_release_stripe(sh);
} }
md_wakeup_thread(conf->mddev->thread);
/* reuse conf->wait_for_quiescent in recovery */ /* reuse conf->wait_for_quiescent in recovery */
wait_event(conf->wait_for_quiescent, wait_event(conf->wait_for_quiescent,
atomic_read(&conf->active_stripes) == 0); atomic_read(&conf->active_stripes) == 0);
...@@ -2491,10 +2485,10 @@ static int r5l_recovery_log(struct r5l_log *log) ...@@ -2491,10 +2485,10 @@ static int r5l_recovery_log(struct r5l_log *log)
ctx->seq += 10000; ctx->seq += 10000;
if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0)) if ((ctx->data_only_stripes == 0) && (ctx->data_parity_stripes == 0))
pr_debug("md/raid:%s: starting from clean shutdown\n", pr_info("md/raid:%s: starting from clean shutdown\n",
mdname(mddev)); mdname(mddev));
else else
pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n", pr_info("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
mdname(mddev), ctx->data_only_stripes, mdname(mddev), ctx->data_only_stripes,
ctx->data_parity_stripes); ctx->data_parity_stripes);
...@@ -3036,6 +3030,23 @@ static int r5l_load_log(struct r5l_log *log) ...@@ -3036,6 +3030,23 @@ static int r5l_load_log(struct r5l_log *log)
return ret; return ret;
} }
/*
 * r5l_start() - load the journal for an array, if it has one.
 * @log: journal to load; may be NULL.
 *
 * Return: 0 when @log is NULL or r5l_load_log() succeeds. On a load
 * failure the log is torn down with r5l_exit_log() and the error from
 * r5l_load_log() is returned.
 */
int r5l_start(struct r5l_log *log)
{
	struct r5conf *conf;
	int err;

	if (!log)
		return 0;

	err = r5l_load_log(log);
	if (!err)
		return 0;

	/* Loading failed: release the log through its owning conf. */
	conf = log->rdev->mddev->private;
	r5l_exit_log(conf);

	return err;
}
void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev) void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
{ {
struct r5conf *conf = mddev->private; struct r5conf *conf = mddev->private;
...@@ -3138,13 +3149,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) ...@@ -3138,13 +3149,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
rcu_assign_pointer(conf->log, log); rcu_assign_pointer(conf->log, log);
if (r5l_load_log(log))
goto error;
set_bit(MD_HAS_JOURNAL, &conf->mddev->flags); set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
return 0; return 0;
error:
rcu_assign_pointer(conf->log, NULL); rcu_assign_pointer(conf->log, NULL);
md_unregister_thread(&log->reclaim_thread); md_unregister_thread(&log->reclaim_thread);
reclaim_thread: reclaim_thread:
......
...@@ -32,6 +32,7 @@ extern struct md_sysfs_entry r5c_journal_mode; ...@@ -32,6 +32,7 @@ extern struct md_sysfs_entry r5c_journal_mode;
extern void r5c_update_on_rdev_error(struct mddev *mddev, extern void r5c_update_on_rdev_error(struct mddev *mddev,
struct md_rdev *rdev); struct md_rdev *rdev);
extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
extern int r5l_start(struct r5l_log *log);
extern struct dma_async_tx_descriptor * extern struct dma_async_tx_descriptor *
ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu, ops_run_partial_parity(struct stripe_head *sh, struct raid5_percpu *percpu,
...@@ -42,6 +43,7 @@ extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh); ...@@ -42,6 +43,7 @@ extern int ppl_write_stripe(struct r5conf *conf, struct stripe_head *sh);
extern void ppl_write_stripe_run(struct r5conf *conf); extern void ppl_write_stripe_run(struct r5conf *conf);
extern void ppl_stripe_write_finished(struct stripe_head *sh); extern void ppl_stripe_write_finished(struct stripe_head *sh);
extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add); extern int ppl_modify_log(struct r5conf *conf, struct md_rdev *rdev, bool add);
extern void ppl_quiesce(struct r5conf *conf, int quiesce);
static inline bool raid5_has_ppl(struct r5conf *conf) static inline bool raid5_has_ppl(struct r5conf *conf)
{ {
...@@ -87,6 +89,34 @@ static inline void log_write_stripe_run(struct r5conf *conf) ...@@ -87,6 +89,34 @@ static inline void log_write_stripe_run(struct r5conf *conf)
ppl_write_stripe_run(conf); ppl_write_stripe_run(conf);
} }
/*
 * Dispatch the "flush stripes to raid" step to whichever log is in use:
 * the journal (conf->log) takes precedence, otherwise PPL if enabled.
 * Does nothing when neither is configured.
 */
static inline void log_flush_stripe_to_raid(struct r5conf *conf)
{
	if (conf->log) {
		r5l_flush_stripe_to_raid(conf->log);
		return;
	}

	if (raid5_has_ppl(conf))
		ppl_write_stripe_run(conf);
}
/*
 * Route a flush request to the active log implementation.
 *
 * Return: the result of r5l_handle_flush_request() when a journal is
 * present, 0 when only PPL is enabled, and -ENODEV when the array has
 * neither.
 */
static inline int log_handle_flush_request(struct r5conf *conf, struct bio *bio)
{
	if (conf->log)
		return r5l_handle_flush_request(conf->log, bio);

	return raid5_has_ppl(conf) ? 0 : -ENODEV;
}
/*
 * Forward a quiesce request to the journal if one is attached,
 * otherwise to PPL if it is enabled. No-op when neither is in use.
 */
static inline void log_quiesce(struct r5conf *conf, int quiesce)
{
	if (conf->log) {
		r5l_quiesce(conf->log, quiesce);
		return;
	}

	if (raid5_has_ppl(conf))
		ppl_quiesce(conf, quiesce);
}
static inline void log_exit(struct r5conf *conf) static inline void log_exit(struct r5conf *conf)
{ {
if (conf->log) if (conf->log)
......
...@@ -85,6 +85,9 @@ ...@@ -85,6 +85,9 @@
* (for a single member disk). New io_units are added to the end of the list * (for a single member disk). New io_units are added to the end of the list
* and the first io_unit is submitted, if it is not submitted already. * and the first io_unit is submitted, if it is not submitted already.
* The current io_unit accepting new stripes is always at the end of the list. * The current io_unit accepting new stripes is always at the end of the list.
*
* If write-back cache is enabled for any of the disks in the array, its data
* must be flushed before next io_unit is submitted.
*/ */
#define PPL_SPACE_SIZE (128 * 1024) #define PPL_SPACE_SIZE (128 * 1024)
...@@ -104,6 +107,7 @@ struct ppl_conf { ...@@ -104,6 +107,7 @@ struct ppl_conf {
struct kmem_cache *io_kc; struct kmem_cache *io_kc;
mempool_t *io_pool; mempool_t *io_pool;
struct bio_set *bs; struct bio_set *bs;
struct bio_set *flush_bs;
/* used only for recovery */ /* used only for recovery */
int recovered_entries; int recovered_entries;
...@@ -128,6 +132,8 @@ struct ppl_log { ...@@ -128,6 +132,8 @@ struct ppl_log {
sector_t next_io_sector; sector_t next_io_sector;
unsigned int entry_space; unsigned int entry_space;
bool use_multippl; bool use_multippl;
bool wb_cache_on;
unsigned long disk_flush_bitmap;
}; };
#define PPL_IO_INLINE_BVECS 32 #define PPL_IO_INLINE_BVECS 32
...@@ -145,6 +151,7 @@ struct ppl_io_unit { ...@@ -145,6 +151,7 @@ struct ppl_io_unit {
struct list_head stripe_list; /* stripes added to the io_unit */ struct list_head stripe_list; /* stripes added to the io_unit */
atomic_t pending_stripes; /* how many stripes not written to raid */ atomic_t pending_stripes; /* how many stripes not written to raid */
atomic_t pending_flushes; /* how many disk flushes are in progress */
bool submitted; /* true if write to log started */ bool submitted; /* true if write to log started */
...@@ -249,6 +256,7 @@ static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log, ...@@ -249,6 +256,7 @@ static struct ppl_io_unit *ppl_new_iounit(struct ppl_log *log,
INIT_LIST_HEAD(&io->log_sibling); INIT_LIST_HEAD(&io->log_sibling);
INIT_LIST_HEAD(&io->stripe_list); INIT_LIST_HEAD(&io->stripe_list);
atomic_set(&io->pending_stripes, 0); atomic_set(&io->pending_stripes, 0);
atomic_set(&io->pending_flushes, 0);
bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS); bio_init(&io->bio, io->biovec, PPL_IO_INLINE_BVECS);
pplhdr = page_address(io->header_page); pplhdr = page_address(io->header_page);
...@@ -475,7 +483,18 @@ static void ppl_submit_iounit(struct ppl_io_unit *io) ...@@ -475,7 +483,18 @@ static void ppl_submit_iounit(struct ppl_io_unit *io)
if (log->use_multippl) if (log->use_multippl)
log->next_io_sector += (PPL_HEADER_SIZE + io->pp_size) >> 9; log->next_io_sector += (PPL_HEADER_SIZE + io->pp_size) >> 9;
WARN_ON(log->disk_flush_bitmap != 0);
list_for_each_entry(sh, &io->stripe_list, log_list) { list_for_each_entry(sh, &io->stripe_list, log_list) {
for (i = 0; i < sh->disks; i++) {
struct r5dev *dev = &sh->dev[i];
if ((ppl_conf->child_logs[i].wb_cache_on) &&
(test_bit(R5_Wantwrite, &dev->flags))) {
set_bit(i, &log->disk_flush_bitmap);
}
}
/* entries for full stripe writes have no partial parity */ /* entries for full stripe writes have no partial parity */
if (test_bit(STRIPE_FULL_WRITE, &sh->state)) if (test_bit(STRIPE_FULL_WRITE, &sh->state))
continue; continue;
...@@ -540,6 +559,7 @@ static void ppl_io_unit_finished(struct ppl_io_unit *io) ...@@ -540,6 +559,7 @@ static void ppl_io_unit_finished(struct ppl_io_unit *io)
{ {
struct ppl_log *log = io->log; struct ppl_log *log = io->log;
struct ppl_conf *ppl_conf = log->ppl_conf; struct ppl_conf *ppl_conf = log->ppl_conf;
struct r5conf *conf = ppl_conf->mddev->private;
unsigned long flags; unsigned long flags;
pr_debug("%s: seq: %llu\n", __func__, io->seq); pr_debug("%s: seq: %llu\n", __func__, io->seq);
...@@ -565,6 +585,112 @@ static void ppl_io_unit_finished(struct ppl_io_unit *io) ...@@ -565,6 +585,112 @@ static void ppl_io_unit_finished(struct ppl_io_unit *io)
spin_unlock(&ppl_conf->no_mem_stripes_lock); spin_unlock(&ppl_conf->no_mem_stripes_lock);
local_irq_restore(flags); local_irq_restore(flags);
wake_up(&conf->wait_for_quiescent);
}
/*
 * Completion handler for the per-disk flush bios issued by ppl_do_flush().
 *
 * On error, the member device the bio was sent to is looked up and
 * reported through md_error(). The bio is then released, and the
 * completion that drops io->pending_flushes to zero finishes the
 * io_unit and wakes the array's md thread.
 */
static void ppl_flush_endio(struct bio *bio)
{
	struct ppl_io_unit *io = bio->bi_private;
	/* Resolved up front so it is not read from io after it is finished. */
	struct r5conf *conf = io->log->ppl_conf->mddev->private;
	char name[BDEVNAME_SIZE];

	pr_debug("%s: dev: %s\n", __func__, bio_devname(bio, name));

	if (bio->bi_status) {
		struct md_rdev *failed;

		rcu_read_lock();
		failed = md_find_rdev_rcu(conf->mddev, bio_dev(bio));
		if (failed)
			md_error(failed->mddev, failed);
		rcu_read_unlock();
	}

	bio_put(bio);

	/* Other flushes for this io_unit are still outstanding. */
	if (!atomic_dec_and_test(&io->pending_flushes))
		return;

	ppl_io_unit_finished(io);
	md_wakeup_thread(conf->mddev->thread);
}
/*
 * Send a flush request to every member disk whose bit is set in
 * log->disk_flush_bitmap, then clear the bitmap.
 *
 * io->pending_flushes tracks completion: it starts at raid_disks, each
 * submitted flush drops it once from ppl_flush_endio(), and the loop at
 * the end drops it once for every disk that did not get a flush. Whoever
 * brings it to zero calls ppl_io_unit_finished().
 */
static void ppl_do_flush(struct ppl_io_unit *io)
{
struct ppl_log *log = io->log;
struct ppl_conf *ppl_conf = log->ppl_conf;
struct r5conf *conf = ppl_conf->mddev->private;
int raid_disks = conf->raid_disks;
int flushed_disks = 0;
int i;
/*
 * Start from raid_disks rather than from the (not yet known) number of
 * flushes. NOTE(review): presumably so that early completions cannot
 * reach zero while bios are still being submitted - confirm.
 */
atomic_set(&io->pending_flushes, raid_disks);
for_each_set_bit(i, &log->disk_flush_bitmap, raid_disks) {
struct md_rdev *rdev;
struct block_device *bdev = NULL;
/* Pick up the block device only if the slot is populated and not Faulty. */
rcu_read_lock();
rdev = rcu_dereference(conf->disks[i].rdev);
if (rdev && !test_bit(Faulty, &rdev->flags))
bdev = rdev->bdev;
rcu_read_unlock();
if (bdev) {
struct bio *bio;
char b[BDEVNAME_SIZE];
/* Allocated with 0 iovecs: REQ_PREFLUSH write carrying no data. */
bio = bio_alloc_bioset(GFP_NOIO, 0, ppl_conf->flush_bs);
bio_set_dev(bio, bdev);
bio->bi_private = io;
bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
bio->bi_end_io = ppl_flush_endio;
pr_debug("%s: dev: %s\n", __func__,
bio_devname(bio, b));
submit_bio(bio);
flushed_disks++;
}
}
log->disk_flush_bitmap = 0;
/* Account for the (raid_disks - flushed_disks) disks that got no flush. */
for (i = flushed_disks ; i < raid_disks; i++) {
if (atomic_dec_and_test(&io->pending_flushes))
ppl_io_unit_finished(io);
}
}
static inline bool ppl_no_io_unit_submitted(struct r5conf *conf,
struct ppl_log *log)
{
struct ppl_io_unit *io;
io = list_first_entry_or_null(&log->io_list, struct ppl_io_unit,
log_sibling);
return !io || !io->submitted;
}
/*
 * Quiesce hook for PPL.
 *
 * When @quiesce is non-zero, visit each child log in turn and wait on
 * conf->wait_for_quiescent until ppl_no_io_unit_submitted() holds for
 * it. Nothing is done when @quiesce is zero.
 */
void ppl_quiesce(struct r5conf *conf, int quiesce)
{
struct ppl_conf *ppl_conf = conf->log_private;
int i;
if (quiesce) {
for (i = 0; i < ppl_conf->count; i++) {
struct ppl_log *log = &ppl_conf->child_logs[i];
/* The condition inspects log->io_list, so check it under io_list_lock. */
spin_lock_irq(&log->io_list_lock);
wait_event_lock_irq(conf->wait_for_quiescent,
ppl_no_io_unit_submitted(conf, log),
log->io_list_lock);
spin_unlock_irq(&log->io_list_lock);
}
}
} }
void ppl_stripe_write_finished(struct stripe_head *sh) void ppl_stripe_write_finished(struct stripe_head *sh)
...@@ -574,8 +700,12 @@ void ppl_stripe_write_finished(struct stripe_head *sh) ...@@ -574,8 +700,12 @@ void ppl_stripe_write_finished(struct stripe_head *sh)
io = sh->ppl_io; io = sh->ppl_io;
sh->ppl_io = NULL; sh->ppl_io = NULL;
if (io && atomic_dec_and_test(&io->pending_stripes)) if (io && atomic_dec_and_test(&io->pending_stripes)) {
ppl_io_unit_finished(io); if (io->log->disk_flush_bitmap)
ppl_do_flush(io);
else
ppl_io_unit_finished(io);
}
} }
static void ppl_xor(int size, struct page *page1, struct page *page2) static void ppl_xor(int size, struct page *page1, struct page *page2)
...@@ -1108,6 +1238,8 @@ static void __ppl_exit_log(struct ppl_conf *ppl_conf) ...@@ -1108,6 +1238,8 @@ static void __ppl_exit_log(struct ppl_conf *ppl_conf)
if (ppl_conf->bs) if (ppl_conf->bs)
bioset_free(ppl_conf->bs); bioset_free(ppl_conf->bs);
if (ppl_conf->flush_bs)
bioset_free(ppl_conf->flush_bs);
mempool_destroy(ppl_conf->io_pool); mempool_destroy(ppl_conf->io_pool);
kmem_cache_destroy(ppl_conf->io_kc); kmem_cache_destroy(ppl_conf->io_kc);
...@@ -1173,6 +1305,8 @@ static int ppl_validate_rdev(struct md_rdev *rdev) ...@@ -1173,6 +1305,8 @@ static int ppl_validate_rdev(struct md_rdev *rdev)
static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev) static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev)
{ {
struct request_queue *q;
if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE + if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE +
PPL_HEADER_SIZE) * 2) { PPL_HEADER_SIZE) * 2) {
log->use_multippl = true; log->use_multippl = true;
...@@ -1185,6 +1319,10 @@ static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev) ...@@ -1185,6 +1319,10 @@ static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev)
PPL_HEADER_SIZE; PPL_HEADER_SIZE;
} }
log->next_io_sector = rdev->ppl.sector; log->next_io_sector = rdev->ppl.sector;
q = bdev_get_queue(rdev->bdev);
if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
log->wb_cache_on = true;
} }
int ppl_init_log(struct r5conf *conf) int ppl_init_log(struct r5conf *conf)
...@@ -1192,8 +1330,8 @@ int ppl_init_log(struct r5conf *conf) ...@@ -1192,8 +1330,8 @@ int ppl_init_log(struct r5conf *conf)
struct ppl_conf *ppl_conf; struct ppl_conf *ppl_conf;
struct mddev *mddev = conf->mddev; struct mddev *mddev = conf->mddev;
int ret = 0; int ret = 0;
int max_disks;
int i; int i;
bool need_cache_flush = false;
pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n", pr_debug("md/raid:%s: enabling distributed Partial Parity Log\n",
mdname(conf->mddev)); mdname(conf->mddev));
...@@ -1219,6 +1357,14 @@ int ppl_init_log(struct r5conf *conf) ...@@ -1219,6 +1357,14 @@ int ppl_init_log(struct r5conf *conf)
return -EINVAL; return -EINVAL;
} }
max_disks = FIELD_SIZEOF(struct ppl_log, disk_flush_bitmap) *
BITS_PER_BYTE;
if (conf->raid_disks > max_disks) {
pr_warn("md/raid:%s PPL doesn't support over %d disks in the array\n",
mdname(mddev), max_disks);
return -EINVAL;
}
ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL); ppl_conf = kzalloc(sizeof(struct ppl_conf), GFP_KERNEL);
if (!ppl_conf) if (!ppl_conf)
return -ENOMEM; return -ENOMEM;
...@@ -1244,6 +1390,12 @@ int ppl_init_log(struct r5conf *conf) ...@@ -1244,6 +1390,12 @@ int ppl_init_log(struct r5conf *conf)
goto err; goto err;
} }
ppl_conf->flush_bs = bioset_create(conf->raid_disks, 0, 0);
if (!ppl_conf->flush_bs) {
ret = -ENOMEM;
goto err;
}
ppl_conf->count = conf->raid_disks; ppl_conf->count = conf->raid_disks;
ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log), ppl_conf->child_logs = kcalloc(ppl_conf->count, sizeof(struct ppl_log),
GFP_KERNEL); GFP_KERNEL);
...@@ -1275,23 +1427,14 @@ int ppl_init_log(struct r5conf *conf) ...@@ -1275,23 +1427,14 @@ int ppl_init_log(struct r5conf *conf)
log->rdev = rdev; log->rdev = rdev;
if (rdev) { if (rdev) {
struct request_queue *q;
ret = ppl_validate_rdev(rdev); ret = ppl_validate_rdev(rdev);
if (ret) if (ret)
goto err; goto err;
q = bdev_get_queue(rdev->bdev);
if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
need_cache_flush = true;
ppl_init_child_log(log, rdev); ppl_init_child_log(log, rdev);
} }
} }
if (need_cache_flush)
pr_warn("md/raid:%s: Volatile write-back cache should be disabled on all member drives when using PPL!\n",
mdname(mddev));
/* load and possibly recover the logs from the member disks */ /* load and possibly recover the logs from the member disks */
ret = ppl_load(ppl_conf); ret = ppl_load(ppl_conf);
......
...@@ -5563,7 +5563,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) ...@@ -5563,7 +5563,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
bool do_flush = false; bool do_flush = false;
if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
int ret = r5l_handle_flush_request(conf->log, bi); int ret = log_handle_flush_request(conf, bi);
if (ret == 0) if (ret == 0)
return true; return true;
...@@ -6168,7 +6168,7 @@ static int handle_active_stripes(struct r5conf *conf, int group, ...@@ -6168,7 +6168,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
break; break;
if (i == NR_STRIPE_HASH_LOCKS) { if (i == NR_STRIPE_HASH_LOCKS) {
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
r5l_flush_stripe_to_raid(conf->log); log_flush_stripe_to_raid(conf);
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
return batch_size; return batch_size;
} }
...@@ -8060,7 +8060,7 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce) ...@@ -8060,7 +8060,7 @@ static void raid5_quiesce(struct mddev *mddev, int quiesce)
wake_up(&conf->wait_for_overlap); wake_up(&conf->wait_for_overlap);
unlock_all_device_hash_locks_irq(conf); unlock_all_device_hash_locks_irq(conf);
} }
r5l_quiesce(conf->log, quiesce); log_quiesce(conf, quiesce);
} }
static void *raid45_takeover_raid0(struct mddev *mddev, int level) static void *raid45_takeover_raid0(struct mddev *mddev, int level)
...@@ -8364,6 +8364,13 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf) ...@@ -8364,6 +8364,13 @@ static int raid5_change_consistency_policy(struct mddev *mddev, const char *buf)
return err; return err;
} }
/*
 * Personality start() hook for raid4/5/6: hand the array's journal
 * (conf->log) to r5l_start() and return its result.
 */
static int raid5_start(struct mddev *mddev)
{
	struct r5conf *r5 = mddev->private;

	return r5l_start(r5->log);
}
static struct md_personality raid6_personality = static struct md_personality raid6_personality =
{ {
.name = "raid6", .name = "raid6",
...@@ -8371,6 +8378,7 @@ static struct md_personality raid6_personality = ...@@ -8371,6 +8378,7 @@ static struct md_personality raid6_personality =
.owner = THIS_MODULE, .owner = THIS_MODULE,
.make_request = raid5_make_request, .make_request = raid5_make_request,
.run = raid5_run, .run = raid5_run,
.start = raid5_start,
.free = raid5_free, .free = raid5_free,
.status = raid5_status, .status = raid5_status,
.error_handler = raid5_error, .error_handler = raid5_error,
...@@ -8395,6 +8403,7 @@ static struct md_personality raid5_personality = ...@@ -8395,6 +8403,7 @@ static struct md_personality raid5_personality =
.owner = THIS_MODULE, .owner = THIS_MODULE,
.make_request = raid5_make_request, .make_request = raid5_make_request,
.run = raid5_run, .run = raid5_run,
.start = raid5_start,
.free = raid5_free, .free = raid5_free,
.status = raid5_status, .status = raid5_status,
.error_handler = raid5_error, .error_handler = raid5_error,
...@@ -8420,6 +8429,7 @@ static struct md_personality raid4_personality = ...@@ -8420,6 +8429,7 @@ static struct md_personality raid4_personality =
.owner = THIS_MODULE, .owner = THIS_MODULE,
.make_request = raid5_make_request, .make_request = raid5_make_request,
.run = raid5_run, .run = raid5_run,
.start = raid5_start,
.free = raid5_free, .free = raid5_free,
.status = raid5_status, .status = raid5_status,
.error_handler = raid5_error, .error_handler = raid5_error,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment