Commit 8b4822de authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'md/4.12-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

Pull MD fixes from Shaohua Li:

 - Several bug fixes for raid5-cache from Song Liu, mainly handle
   journal disk error

 - Fix bad block handling in choosing raid1 disk from Tomasz Majchrzak

 - Simplify external metadata array sysfs handling from Artur
   Paszkiewicz

 - Optimize raid0 discard handling from me, now raid0 will dispatch
   large discard IO directly to underlayer disks.

* tag 'md/4.12-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
  raid1: prefer disk without bad blocks
  md/r5cache: handle sync with data in write back cache
  md/r5cache: gracefully handle journal device errors for writeback mode
  md/raid1/10: avoid unnecessary locking
  md/raid5-cache: in r5l_do_submit_io(), submit io->split_bio first
  md/md0: optimize raid0 discard handling
  md: don't return -EAGAIN in md_allow_write for external metadata arrays
  md/raid5: make use of spin_lock_irq over local_irq_disable + spin_lock
parents 667f867c d82dd0e3
......@@ -8022,18 +8022,15 @@ EXPORT_SYMBOL(md_write_end);
* may proceed without blocking. It is important to call this before
* attempting a GFP_KERNEL allocation while holding the mddev lock.
* Must be called with mddev_lock held.
*
* In the ->external case MD_SB_CHANGE_PENDING can not be cleared until mddev->lock
* is dropped, so return -EAGAIN after notifying userspace.
*/
int md_allow_write(struct mddev *mddev)
void md_allow_write(struct mddev *mddev)
{
if (!mddev->pers)
return 0;
return;
if (mddev->ro)
return 0;
return;
if (!mddev->pers->sync_request)
return 0;
return;
spin_lock(&mddev->lock);
if (mddev->in_sync) {
......@@ -8046,13 +8043,12 @@ int md_allow_write(struct mddev *mddev)
spin_unlock(&mddev->lock);
md_update_sb(mddev, 0);
sysfs_notify_dirent_safe(mddev->sysfs_state);
/* wait for the dirty state to be recorded in the metadata */
wait_event(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags) &&
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
} else
spin_unlock(&mddev->lock);
if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
return -EAGAIN;
else
return 0;
}
EXPORT_SYMBOL_GPL(md_allow_write);
......
......@@ -665,7 +665,7 @@ extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
bool metadata_op);
extern void md_do_sync(struct md_thread *thread);
extern void md_new_event(struct mddev *mddev);
extern int md_allow_write(struct mddev *mddev);
extern void md_allow_write(struct mddev *mddev);
extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
extern int md_check_no_bitmap(struct mddev *mddev);
......
......@@ -385,7 +385,7 @@ static int raid0_run(struct mddev *mddev)
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_discard_sectors(mddev->queue, UINT_MAX);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
blk_queue_io_opt(mddev->queue,
......@@ -459,6 +459,95 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
}
}
static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
{
struct r0conf *conf = mddev->private;
struct strip_zone *zone;
sector_t start = bio->bi_iter.bi_sector;
sector_t end;
unsigned int stripe_size;
sector_t first_stripe_index, last_stripe_index;
sector_t start_disk_offset;
unsigned int start_disk_index;
sector_t end_disk_offset;
unsigned int end_disk_index;
unsigned int disk;
zone = find_zone(conf, &start);
if (bio_end_sector(bio) > zone->zone_end) {
struct bio *split = bio_split(bio,
zone->zone_end - bio->bi_iter.bi_sector, GFP_NOIO,
mddev->bio_set);
bio_chain(split, bio);
generic_make_request(bio);
bio = split;
end = zone->zone_end;
} else
end = bio_end_sector(bio);
if (zone != conf->strip_zone)
end = end - zone[-1].zone_end;
/* Now start and end is the offset in zone */
stripe_size = zone->nb_dev * mddev->chunk_sectors;
first_stripe_index = start;
sector_div(first_stripe_index, stripe_size);
last_stripe_index = end;
sector_div(last_stripe_index, stripe_size);
start_disk_index = (int)(start - first_stripe_index * stripe_size) /
mddev->chunk_sectors;
start_disk_offset = ((int)(start - first_stripe_index * stripe_size) %
mddev->chunk_sectors) +
first_stripe_index * mddev->chunk_sectors;
end_disk_index = (int)(end - last_stripe_index * stripe_size) /
mddev->chunk_sectors;
end_disk_offset = ((int)(end - last_stripe_index * stripe_size) %
mddev->chunk_sectors) +
last_stripe_index * mddev->chunk_sectors;
for (disk = 0; disk < zone->nb_dev; disk++) {
sector_t dev_start, dev_end;
struct bio *discard_bio = NULL;
struct md_rdev *rdev;
if (disk < start_disk_index)
dev_start = (first_stripe_index + 1) *
mddev->chunk_sectors;
else if (disk > start_disk_index)
dev_start = first_stripe_index * mddev->chunk_sectors;
else
dev_start = start_disk_offset;
if (disk < end_disk_index)
dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
else if (disk > end_disk_index)
dev_end = last_stripe_index * mddev->chunk_sectors;
else
dev_end = end_disk_offset;
if (dev_end <= dev_start)
continue;
rdev = conf->devlist[(zone - conf->strip_zone) *
conf->strip_zone[0].nb_dev + disk];
if (__blkdev_issue_discard(rdev->bdev,
dev_start + zone->dev_start + rdev->data_offset,
dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
!discard_bio)
continue;
bio_chain(discard_bio, bio);
if (mddev->gendisk)
trace_block_bio_remap(bdev_get_queue(rdev->bdev),
discard_bio, disk_devt(mddev->gendisk),
bio->bi_iter.bi_sector);
generic_make_request(discard_bio);
}
bio_endio(bio);
}
static void raid0_make_request(struct mddev *mddev, struct bio *bio)
{
struct strip_zone *zone;
......@@ -473,6 +562,11 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
return;
}
if (unlikely((bio_op(bio) == REQ_OP_DISCARD))) {
raid0_handle_discard(mddev, bio);
return;
}
bio_sector = bio->bi_iter.bi_sector;
sector = bio_sector;
chunk_sects = mddev->chunk_sectors;
......@@ -498,11 +592,6 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
bio->bi_iter.bi_sector = sector + zone->dev_start +
tmp_dev->data_offset;
if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
!blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) {
/* Just ignore it */
bio_endio(bio);
} else {
if (mddev->gendisk)
trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
bio, disk_devt(mddev->gendisk),
......@@ -510,7 +599,6 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
mddev_check_writesame(mddev, bio);
mddev_check_write_zeroes(mddev, bio);
generic_make_request(bio);
}
}
static void raid0_status(struct seq_file *seq, struct mddev *mddev)
......
......@@ -666,8 +666,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
break;
}
continue;
} else
} else {
if ((sectors > best_good_sectors) && (best_disk >= 0))
best_disk = -1;
best_good_sectors = sectors;
}
if (best_disk >= 0)
/* At least two disks to choose from so failfast is OK */
......@@ -1529,18 +1532,17 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
plug = container_of(cb, struct raid1_plug_cb, cb);
else
plug = NULL;
spin_lock_irqsave(&conf->device_lock, flags);
if (plug) {
bio_list_add(&plug->pending, mbio);
plug->pending_cnt++;
} else {
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
conf->pending_count++;
}
spin_unlock_irqrestore(&conf->device_lock, flags);
if (!plug)
md_wakeup_thread(mddev->thread);
}
}
r1_bio_write_done(r1_bio);
......@@ -3197,7 +3199,7 @@ static int raid1_reshape(struct mddev *mddev)
struct r1conf *conf = mddev->private;
int cnt, raid_disks;
unsigned long flags;
int d, d2, err;
int d, d2;
/* Cannot change chunk_size, layout, or level */
if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
......@@ -3209,11 +3211,8 @@ static int raid1_reshape(struct mddev *mddev)
return -EINVAL;
}
if (!mddev_is_clustered(mddev)) {
err = md_allow_write(mddev);
if (err)
return err;
}
if (!mddev_is_clustered(mddev))
md_allow_write(mddev);
raid_disks = mddev->raid_disks + mddev->delta_disks;
......
......@@ -1282,17 +1282,16 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
plug = container_of(cb, struct raid10_plug_cb, cb);
else
plug = NULL;
spin_lock_irqsave(&conf->device_lock, flags);
if (plug) {
bio_list_add(&plug->pending, mbio);
plug->pending_cnt++;
} else {
spin_lock_irqsave(&conf->device_lock, flags);
bio_list_add(&conf->pending_bio_list, mbio);
conf->pending_count++;
}
spin_unlock_irqrestore(&conf->device_lock, flags);
if (!plug)
md_wakeup_thread(mddev->thread);
}
}
static void raid10_write_request(struct mddev *mddev, struct bio *bio,
......
......@@ -24,6 +24,7 @@
#include "md.h"
#include "raid5.h"
#include "bitmap.h"
#include "raid5-log.h"
/*
* metadata/data stored in disk with 4k size unit (a block) regardless
......@@ -622,20 +623,30 @@ static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
spin_unlock_irqrestore(&log->io_list_lock, flags);
if (io->has_flush)
io->current_bio->bi_opf |= REQ_PREFLUSH;
if (io->has_fua)
io->current_bio->bi_opf |= REQ_FUA;
submit_bio(io->current_bio);
if (!io->split_bio)
return;
/*
* In case of journal device failures, submit_bio will get error
* and calls endio, then active stripes will continue write
* process. Therefore, it is not necessary to check Faulty bit
* of journal device here.
*
* We can't check split_bio after current_bio is submitted. If
* io->split_bio is null, after current_bio is submitted, current_bio
* might already be completed and the io_unit is freed. We submit
* split_bio first to avoid the issue.
*/
if (io->split_bio) {
if (io->has_flush)
io->split_bio->bi_opf |= REQ_PREFLUSH;
if (io->has_fua)
io->split_bio->bi_opf |= REQ_FUA;
submit_bio(io->split_bio);
}
if (io->has_flush)
io->current_bio->bi_opf |= REQ_PREFLUSH;
if (io->has_fua)
io->current_bio->bi_opf |= REQ_FUA;
submit_bio(io->current_bio);
}
/* deferred io_unit will be dispatched here */
......@@ -670,6 +681,11 @@ static void r5c_disable_writeback_async(struct work_struct *work)
return;
pr_info("md/raid:%s: Disabling writeback cache for degraded array.\n",
mdname(mddev));
/* wait superblock change before suspend */
wait_event(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
mddev_suspend(mddev);
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
mddev_resume(mddev);
......@@ -2621,8 +2637,11 @@ int r5c_try_caching_write(struct r5conf *conf,
* When run in degraded mode, array is set to write-through mode.
* This check helps drain pending write safely in the transition to
* write-through mode.
*
* When a stripe is syncing, the write is also handled in write
* through mode.
*/
if (s->failed) {
if (s->failed || test_bit(STRIPE_SYNCING, &sh->state)) {
r5c_make_stripe_write_out(sh);
return -EAGAIN;
}
......@@ -2825,6 +2844,9 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
}
r5l_append_flush_payload(log, sh->sector);
/* stripe is flused to raid disks, we can do resync now */
if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state))
set_bit(STRIPE_HANDLE, &sh->state);
}
int r5c_cache_data(struct r5l_log *log, struct stripe_head *sh)
......@@ -2973,7 +2995,7 @@ static int r5l_load_log(struct r5l_log *log)
return ret;
}
void r5c_update_on_rdev_error(struct mddev *mddev)
void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
{
struct r5conf *conf = mddev->private;
struct r5l_log *log = conf->log;
......@@ -2981,7 +3003,8 @@ void r5c_update_on_rdev_error(struct mddev *mddev)
if (!log)
return;
if (raid5_calc_degraded(conf) > 0 &&
if ((raid5_calc_degraded(conf) > 0 ||
test_bit(Journal, &rdev->flags)) &&
conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_BACK)
schedule_work(&log->disable_writeback_work);
}
......
......@@ -28,7 +28,8 @@ extern void r5c_flush_cache(struct r5conf *conf, int num);
extern void r5c_check_stripe_cache_usage(struct r5conf *conf);
extern void r5c_check_cached_full_stripe(struct r5conf *conf);
extern struct md_sysfs_entry r5c_journal_mode;
extern void r5c_update_on_rdev_error(struct mddev *mddev);
extern void r5c_update_on_rdev_error(struct mddev *mddev,
struct md_rdev *rdev);
extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect);
extern struct dma_async_tx_descriptor *
......
......@@ -103,8 +103,7 @@ static inline void unlock_device_hash_lock(struct r5conf *conf, int hash)
static inline void lock_all_device_hash_locks_irq(struct r5conf *conf)
{
int i;
local_irq_disable();
spin_lock(conf->hash_locks);
spin_lock_irq(conf->hash_locks);
for (i = 1; i < NR_STRIPE_HASH_LOCKS; i++)
spin_lock_nest_lock(conf->hash_locks + i, conf->hash_locks);
spin_lock(&conf->device_lock);
......@@ -114,9 +113,9 @@ static inline void unlock_all_device_hash_locks_irq(struct r5conf *conf)
{
int i;
spin_unlock(&conf->device_lock);
for (i = NR_STRIPE_HASH_LOCKS; i; i--)
spin_unlock(conf->hash_locks + i - 1);
local_irq_enable();
for (i = NR_STRIPE_HASH_LOCKS - 1; i; i--)
spin_unlock(conf->hash_locks + i);
spin_unlock_irq(conf->hash_locks);
}
/* Find first data disk in a raid6 stripe */
......@@ -234,11 +233,15 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
if (test_bit(R5_InJournal, &sh->dev[i].flags))
injournal++;
/*
* When quiesce in r5c write back, set STRIPE_HANDLE for stripes with
* data in journal, so they are not released to cached lists
*/
if (conf->quiesce && r5c_is_writeback(conf->log) &&
!test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0) {
* In the following cases, the stripe cannot be released to cached
* lists. Therefore, we make the stripe write out and set
* STRIPE_HANDLE:
* 1. when quiesce in r5c write back;
* 2. when resync is requested fot the stripe.
*/
if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) ||
(conf->quiesce && r5c_is_writeback(conf->log) &&
!test_bit(STRIPE_HANDLE, &sh->state) && injournal != 0)) {
if (test_bit(STRIPE_R5C_CACHING, &sh->state))
r5c_make_stripe_write_out(sh);
set_bit(STRIPE_HANDLE, &sh->state);
......@@ -714,12 +717,11 @@ static bool is_full_stripe_write(struct stripe_head *sh)
static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
{
local_irq_disable();
if (sh1 > sh2) {
spin_lock(&sh2->stripe_lock);
spin_lock_irq(&sh2->stripe_lock);
spin_lock_nested(&sh1->stripe_lock, 1);
} else {
spin_lock(&sh1->stripe_lock);
spin_lock_irq(&sh1->stripe_lock);
spin_lock_nested(&sh2->stripe_lock, 1);
}
}
......@@ -727,8 +729,7 @@ static void lock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
{
spin_unlock(&sh1->stripe_lock);
spin_unlock(&sh2->stripe_lock);
local_irq_enable();
spin_unlock_irq(&sh2->stripe_lock);
}
/* Only freshly new full stripe normal write stripe can be added to a batch list */
......@@ -2312,14 +2313,12 @@ static int resize_stripes(struct r5conf *conf, int newsize)
struct stripe_head *osh, *nsh;
LIST_HEAD(newstripes);
struct disk_info *ndisks;
int err;
int err = 0;
struct kmem_cache *sc;
int i;
int hash, cnt;
err = md_allow_write(conf->mddev);
if (err)
return err;
md_allow_write(conf->mddev);
/* Step 1 */
sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
......@@ -2694,7 +2693,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev)
bdevname(rdev->bdev, b),
mdname(mddev),
conf->raid_disks - mddev->degraded);
r5c_update_on_rdev_error(mddev);
r5c_update_on_rdev_error(mddev, rdev);
}
/*
......@@ -3055,6 +3054,11 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
* When LOG_CRITICAL, stripes with injournal == 0 will be sent to
* no_space_stripes list.
*
* 3. during journal failure
* In journal failure, we try to flush all cached data to raid disks
* based on data in stripe cache. The array is read-only to upper
* layers, so we would skip all pending writes.
*
*/
static inline bool delay_towrite(struct r5conf *conf,
struct r5dev *dev,
......@@ -3068,6 +3072,9 @@ static inline bool delay_towrite(struct r5conf *conf,
if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) &&
s->injournal > 0)
return true;
/* case 3 above */
if (s->log_failed && s->injournal)
return true;
return false;
}
......@@ -4653,8 +4660,13 @@ static void handle_stripe(struct stripe_head *sh)
if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state) && !sh->batch_head) {
spin_lock(&sh->stripe_lock);
/* Cannot process 'sync' concurrently with 'discard' */
if (!test_bit(STRIPE_DISCARD, &sh->state) &&
/*
* Cannot process 'sync' concurrently with 'discard'.
* Flush data in r5cache before 'sync'.
*/
if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) &&
!test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) &&
!test_bit(STRIPE_DISCARD, &sh->state) &&
test_and_clear_bit(STRIPE_SYNC_REQUESTED, &sh->state)) {
set_bit(STRIPE_SYNCING, &sh->state);
clear_bit(STRIPE_INSYNC, &sh->state);
......@@ -4701,10 +4713,15 @@ static void handle_stripe(struct stripe_head *sh)
" to_write=%d failed=%d failed_num=%d,%d\n",
s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
s.failed_num[0], s.failed_num[1]);
/* check if the array has lost more than max_degraded devices and,
/*
* check if the array has lost more than max_degraded devices and,
* if so, some requests might need to be failed.
*
* When journal device failed (log_failed), we will only process
* the stripe if there is data need write to raid disks
*/
if (s.failed > conf->max_degraded || s.log_failed) {
if (s.failed > conf->max_degraded ||
(s.log_failed && s.injournal == 0)) {
sh->check_state = 0;
sh->reconstruct_state = 0;
break_stripe_batch_list(sh, 0);
......@@ -5277,8 +5294,10 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
struct stripe_head *sh, *tmp;
struct list_head *handle_list = NULL;
struct r5worker_group *wg;
bool second_try = !r5c_is_writeback(conf->log);
bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state);
bool second_try = !r5c_is_writeback(conf->log) &&
!r5l_log_disk_error(conf);
bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state) ||
r5l_log_disk_error(conf);
again:
wg = NULL;
......@@ -6313,7 +6332,6 @@ int
raid5_set_cache_size(struct mddev *mddev, int size)
{
struct r5conf *conf = mddev->private;
int err;
if (size <= 16 || size > 32768)
return -EINVAL;
......@@ -6325,10 +6343,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
;
mutex_unlock(&conf->cache_size_mutex);
err = md_allow_write(mddev);
if (err)
return err;
md_allow_write(mddev);
mutex_lock(&conf->cache_size_mutex);
while (size > conf->max_nr_stripes)
......@@ -7530,7 +7545,9 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
* neilb: there is no locking about new writes here,
* so this cannot be safe.
*/
if (atomic_read(&conf->active_stripes)) {
if (atomic_read(&conf->active_stripes) ||
atomic_read(&conf->r5c_cached_full_stripes) ||
atomic_read(&conf->r5c_cached_partial_stripes)) {
return -EBUSY;
}
log_exit(conf);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment