Commit 4324796e authored by Jens Axboe


Merge branch 'md-next' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md into for-6.1/block

Pull MD updates and fixes from Song:

"1. Various raid5 fix and clean up, by Logan Gunthorpe and David Sloan.
 2. Raid10 performance optimization, by Yu Kuai."

* 'md-next' of https://git.kernel.org/pub/scm/linux/kernel/git/song/md:
  md: Fix spelling mistake in comments of r5l_log
  md/raid5: Wait for MD_SB_CHANGE_PENDING in raid5d
  md/raid10: convert resync_lock to use seqlock
  md/raid10: fix improper BUG_ON() in raise_barrier()
  md/raid10: prevent unnecessary calls to wake_up() in fast path
  md/raid10: don't modify 'nr_waiting' in wait_barrier() for the case nowait
  md/raid10: factor out code from wait_barrier() to stop_waiting_barrier()
  md: Remove extra mddev_get() in md_seq_start()
  md/raid5: Remove unnecessary bio_put() in raid5_read_one_chunk()
  md/raid5: Ensure stripe_fill happens on non-read IO with journal
  md/raid5: Don't read ->active_stripes if it's not needed
  md/raid5: Cleanup prototype of raid5_get_active_stripe()
  md/raid5: Drop extern on function declarations in raid5.h
  md/raid5: Refactor raid5_get_active_stripe()
  md: Replace snprintf with scnprintf
  md/raid10: fix compile warning
  md/raid5: Fix spelling mistakes in comments
parents 9713a670 65b94b52
@@ -8154,7 +8154,6 @@ static void *md_seq_start(struct seq_file *seq, loff_t *pos)
 	list_for_each(tmp,&all_mddevs)
 		if (!l--) {
 			mddev = list_entry(tmp, struct mddev, all_mddevs);
-			mddev_get(mddev);
 			if (!mddev_get(mddev))
 				continue;
 			spin_unlock(&all_mddevs_lock);
...
@@ -47,7 +47,7 @@ static void dump_zones(struct mddev *mddev)
 		int len = 0;

 		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
-			len += snprintf(line+len, 200-len, "%s%pg", k?"/":"",
+			len += scnprintf(line+len, 200-len, "%s%pg", k?"/":"",
				conf->devlist[j * raid_disks + k]->bdev);
 		pr_debug("md: zone%d=[%s]\n", j, line);
...

@@ -79,6 +79,21 @@ static void end_reshape(struct r10conf *conf);

 #include "raid1-10.c"

+#define NULL_CMD
+#define cmd_before(conf, cmd) \
+	do { \
+		write_sequnlock_irq(&(conf)->resync_lock); \
+		cmd; \
+	} while (0)
+#define cmd_after(conf) write_seqlock_irq(&(conf)->resync_lock)
+
+#define wait_event_barrier_cmd(conf, cond, cmd) \
+	wait_event_cmd((conf)->wait_barrier, cond, cmd_before(conf, cmd), \
+		       cmd_after(conf))
+
+#define wait_event_barrier(conf, cond) \
+	wait_event_barrier_cmd(conf, cond, NULL_CMD)
+
 /*
  * for resync bio, r10bio pointer can be retrieved from the per-bio
  * 'struct resync_pages'.
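These macros exist because a seqlock, unlike the old spinlock paired with wait_event_lock_irq(), has no helper that drops it across a sleep: cmd_before() releases the write side before blocking and cmd_after() retakes it on wakeup. A rough userspace analogue of the same unlock/sleep/relock discipline, using a pthread mutex and condvar; the names are illustrative, not kernel API:

    #include <pthread.h>

    static pthread_mutex_t resync_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t wait_barrier = PTHREAD_COND_INITIALIZER;
    static int barrier; /* protected by resync_lock */

    static void wait_for_barrier_to_drop(void)
    {
    	pthread_mutex_lock(&resync_lock);
    	while (barrier) {
    		/* pthread_cond_wait() releases resync_lock while sleeping
    		 * and reacquires it before returning: the same dance the
    		 * cmd_before()/cmd_after() macros open-code, because the
    		 * sleeper must not hold the lock writers need to wake it. */
    		pthread_cond_wait(&wait_barrier, &resync_lock);
    	}
    	pthread_mutex_unlock(&resync_lock);
    }

    static void lower_barrier_once(void)
    {
    	pthread_mutex_lock(&resync_lock);
    	barrier--;
    	pthread_mutex_unlock(&resync_lock);
    	pthread_cond_broadcast(&wait_barrier);
    }

    static void *waiter(void *arg)
    {
    	(void)arg;
    	wait_for_barrier_to_drop();
    	return NULL;
    }

    int main(void)
    {
    	pthread_t t;

    	barrier = 1;
    	pthread_create(&t, NULL, waiter, NULL);
    	lower_barrier_once();
    	pthread_join(&t, NULL);
    	return 0;
    }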
@@ -274,6 +289,12 @@ static void put_buf(struct r10bio *r10_bio)
 	lower_barrier(conf);
 }

+static void wake_up_barrier(struct r10conf *conf)
+{
+	if (wq_has_sleeper(&conf->wait_barrier))
+		wake_up(&conf->wait_barrier);
+}
+
 static void reschedule_retry(struct r10bio *r10_bio)
 {
 	unsigned long flags;
@@ -930,78 +951,101 @@ static void flush_pending_writes(struct r10conf *conf)
 static void raise_barrier(struct r10conf *conf, int force)
 {
+	write_seqlock_irq(&conf->resync_lock);
 	BUG_ON(force && !conf->barrier);
-	spin_lock_irq(&conf->resync_lock);

 	/* Wait until no block IO is waiting (unless 'force') */
-	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
-			    conf->resync_lock);
+	wait_event_barrier(conf, force || !conf->nr_waiting);

 	/* block any new IO from starting */
-	conf->barrier++;
+	WRITE_ONCE(conf->barrier, conf->barrier + 1);

 	/* Now wait for all pending IO to complete */
-	wait_event_lock_irq(conf->wait_barrier,
-			    !atomic_read(&conf->nr_pending) && conf->barrier < RESYNC_DEPTH,
-			    conf->resync_lock);
+	wait_event_barrier(conf, !atomic_read(&conf->nr_pending) &&
+				 conf->barrier < RESYNC_DEPTH);

-	spin_unlock_irq(&conf->resync_lock);
+	write_sequnlock_irq(&conf->resync_lock);
 }

 static void lower_barrier(struct r10conf *conf)
 {
 	unsigned long flags;

-	spin_lock_irqsave(&conf->resync_lock, flags);
-	conf->barrier--;
-	spin_unlock_irqrestore(&conf->resync_lock, flags);
+	write_seqlock_irqsave(&conf->resync_lock, flags);
+	WRITE_ONCE(conf->barrier, conf->barrier - 1);
+	write_sequnlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }

+static bool stop_waiting_barrier(struct r10conf *conf)
+{
+	struct bio_list *bio_list = current->bio_list;
+
+	/* barrier is dropped */
+	if (!conf->barrier)
+		return true;
+
+	/*
+	 * If there are already pending requests (preventing the barrier from
+	 * rising completely), and the pre-process bio queue isn't empty, then
+	 * don't wait, as we need to empty that queue to get the nr_pending
+	 * count down.
+	 */
+	if (atomic_read(&conf->nr_pending) && bio_list &&
+	    (!bio_list_empty(&bio_list[0]) || !bio_list_empty(&bio_list[1])))
+		return true;
+
+	/* move on if recovery thread is blocked by us */
+	if (conf->mddev->thread->tsk == current &&
+	    test_bit(MD_RECOVERY_RUNNING, &conf->mddev->recovery) &&
+	    conf->nr_queued > 0)
+		return true;
+
+	return false;
+}
+
+static bool wait_barrier_nolock(struct r10conf *conf)
+{
+	unsigned int seq = read_seqbegin(&conf->resync_lock);
+
+	if (READ_ONCE(conf->barrier))
+		return false;
+
+	atomic_inc(&conf->nr_pending);
+	if (!read_seqretry(&conf->resync_lock, seq))
+		return true;
+
+	if (atomic_dec_and_test(&conf->nr_pending))
+		wake_up_barrier(conf);
+
+	return false;
+}
+
 static bool wait_barrier(struct r10conf *conf, bool nowait)
 {
 	bool ret = true;

-	spin_lock_irq(&conf->resync_lock);
+	if (wait_barrier_nolock(conf))
+		return true;
+
+	write_seqlock_irq(&conf->resync_lock);
 	if (conf->barrier) {
-		struct bio_list *bio_list = current->bio_list;
-		conf->nr_waiting++;
-		/* Wait for the barrier to drop.
-		 * However if there are already pending
-		 * requests (preventing the barrier from
-		 * rising completely), and the
-		 * pre-process bio queue isn't empty,
-		 * then don't wait, as we need to empty
-		 * that queue to get the nr_pending
-		 * count down.
-		 */
 		/* Return false when nowait flag is set */
 		if (nowait) {
 			ret = false;
 		} else {
+			conf->nr_waiting++;
 			raid10_log(conf->mddev, "wait barrier");
-			wait_event_lock_irq(conf->wait_barrier,
-					    !conf->barrier ||
-					    (atomic_read(&conf->nr_pending) &&
-					     bio_list &&
-					     (!bio_list_empty(&bio_list[0]) ||
-					      !bio_list_empty(&bio_list[1]))) ||
-					    /* move on if recovery thread is
-					     * blocked by us
-					     */
-					    (conf->mddev->thread->tsk == current &&
-					     test_bit(MD_RECOVERY_RUNNING,
						      &conf->mddev->recovery) &&
-					     conf->nr_queued > 0),
-					    conf->resync_lock);
+			wait_event_barrier(conf, stop_waiting_barrier(conf));
+			conf->nr_waiting--;
 		}
-		conf->nr_waiting--;
 		if (!conf->nr_waiting)
 			wake_up(&conf->wait_barrier);
 	}
 	/* Only increment nr_pending when we wait */
 	if (ret)
 		atomic_inc(&conf->nr_pending);
-	spin_unlock_irq(&conf->resync_lock);
+	write_sequnlock_irq(&conf->resync_lock);
 	return ret;
 }
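wait_barrier_nolock() is the heart of the optimization: a reader samples the seqlock, takes its nr_pending reference, and only falls back to the locked slow path if raise_barrier() or lower_barrier() raced with it. A simplified userspace model with C11 atomics; it deliberately ignores the memory-ordering details that the kernel's read_seqbegin()/read_seqretry() primitives handle, so treat it as a sketch of the control flow only:

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_uint resync_seq;	/* odd while a writer holds the lock */
    static atomic_int barrier;
    static atomic_int nr_pending;

    static bool try_enter_io_fast_path(void)
    {
    	unsigned int seq = atomic_load(&resync_seq);

    	if (seq & 1)			/* writer active: fall back */
    		return false;
    	if (atomic_load(&barrier))	/* barrier raised: fall back */
    		return false;

    	atomic_fetch_add(&nr_pending, 1);

    	/* Mirrors read_seqretry(): if a writer slipped in between the
    	 * barrier check and the nr_pending increment, undo the reference
    	 * and let the caller retry on the locked slow path. */
    	if (atomic_load(&resync_seq) == seq)
    		return true;

    	atomic_fetch_sub(&nr_pending, 1);
    	return false;
    }

    int main(void)
    {
    	return try_enter_io_fast_path() ? 0 : 1;
    }

The payoff is that the common case (no resync in progress) never bounces the lock's cache line between CPUs; only barrier transitions pay for the write lock.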
@@ -1009,7 +1053,7 @@ static void allow_barrier(struct r10conf *conf)
 {
 	if ((atomic_dec_and_test(&conf->nr_pending)) ||
 	    (conf->array_freeze_pending))
-		wake_up(&conf->wait_barrier);
+		wake_up_barrier(conf);
 }

 static void freeze_array(struct r10conf *conf, int extra)
@@ -1026,27 +1070,24 @@ static void freeze_array(struct r10conf *conf, int extra)
 	 * must match the number of pending IOs (nr_pending) before
 	 * we continue.
 	 */
-	spin_lock_irq(&conf->resync_lock);
+	write_seqlock_irq(&conf->resync_lock);
 	conf->array_freeze_pending++;
-	conf->barrier++;
+	WRITE_ONCE(conf->barrier, conf->barrier + 1);
 	conf->nr_waiting++;
-	wait_event_lock_irq_cmd(conf->wait_barrier,
-				atomic_read(&conf->nr_pending) == conf->nr_queued+extra,
-				conf->resync_lock,
-				flush_pending_writes(conf));
+	wait_event_barrier_cmd(conf, atomic_read(&conf->nr_pending) ==
+			conf->nr_queued + extra, flush_pending_writes(conf));
 	conf->array_freeze_pending--;
-	spin_unlock_irq(&conf->resync_lock);
+	write_sequnlock_irq(&conf->resync_lock);
 }

 static void unfreeze_array(struct r10conf *conf)
 {
 	/* reverse the effect of the freeze */
-	spin_lock_irq(&conf->resync_lock);
-	conf->barrier--;
+	write_seqlock_irq(&conf->resync_lock);
+	WRITE_ONCE(conf->barrier, conf->barrier - 1);
 	conf->nr_waiting--;
 	wake_up(&conf->wait_barrier);
-	spin_unlock_irq(&conf->resync_lock);
+	write_sequnlock_irq(&conf->resync_lock);
 }

 static sector_t choose_data_offset(struct r10bio *r10_bio,
@@ -1885,7 +1926,7 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
 	__make_request(mddev, bio, sectors);

 	/* In case raid10d snuck in to freeze_array */
-	wake_up(&conf->wait_barrier);
+	wake_up_barrier(conf);
 	return true;
 }
@@ -1980,7 +2021,7 @@ static int enough(struct r10conf *conf, int ignore)
 	 * Otherwise, it must be degraded:
 	 * - recovery is interrupted.
 	 * - &mddev->degraded is bumped.
+	 *
 	 * @rdev is marked as &Faulty excluding case when array is failed and
 	 * &mddev->fail_last_dev is off.
 	 */
@@ -4033,7 +4074,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 	INIT_LIST_HEAD(&conf->retry_list);
 	INIT_LIST_HEAD(&conf->bio_end_io_list);

-	spin_lock_init(&conf->resync_lock);
+	seqlock_init(&conf->resync_lock);
 	init_waitqueue_head(&conf->wait_barrier);
 	atomic_set(&conf->nr_pending, 0);
@@ -4352,7 +4393,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev, sector_t size, int devs)
 			rdev->new_raid_disk = rdev->raid_disk * 2;
 			rdev->sectors = size;
 		}
-		conf->barrier = 1;
+		WRITE_ONCE(conf->barrier, 1);
 	}

 	return conf;
...
@@ -76,7 +76,7 @@ struct r10conf {
 	/* queue pending writes and submit them on unplug */
 	struct bio_list		pending_bio_list;

-	spinlock_t		resync_lock;
+	seqlock_t		resync_lock;
 	atomic_t		nr_pending;
 	int			nr_waiting;
 	int			nr_queued;
...
@@ -125,7 +125,7 @@ struct r5l_log {
 					 * reclaimed. if it's 0, reclaim spaces
 					 * used by io_units which are in
 					 * IO_UNIT_STRIPE_END state (eg, reclaim
-					 * dones't wait for specific io_unit
+					 * doesn't wait for specific io_unit
 					 * switching to IO_UNIT_STRIPE_END
 					 * state) */
 	wait_queue_head_t	iounit_wait;
@@ -1327,9 +1327,9 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
 	 * superblock is updated to new log tail. Updating superblock (either
 	 * directly call md_update_sb() or depend on md thread) must hold
 	 * reconfig mutex. On the other hand, raid5_quiesce is called with
-	 * reconfig_mutex hold. The first step of raid5_quiesce() is waitting
-	 * for all IO finish, hence waitting for reclaim thread, while reclaim
-	 * thread is calling this function and waitting for reconfig mutex. So
+	 * reconfig_mutex hold. The first step of raid5_quiesce() is waiting
+	 * for all IO finish, hence waiting for reclaim thread, while reclaim
+	 * thread is calling this function and waiting for reconfig mutex. So
 	 * there is a deadlock. We workaround this issue with a trylock.
 	 * FIXME: we could miss discard if we can't take reconfig mutex
 	 */
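The trylock workaround this comment describes is a standard deadlock-avoidance pattern: the reclaim thread refuses to block on a mutex whose holder may itself be waiting on the reclaim thread. A hedged userspace sketch of the idea; reconfig_mutex and the function names here are stand-ins, not the kernel objects:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t reconfig_mutex = PTHREAD_MUTEX_INITIALIZER;

    static void reclaim_side_update(void)
    {
    	if (pthread_mutex_trylock(&reconfig_mutex) != 0) {
    		/* The holder (e.g. a quiesce path) may be waiting for us:
    		 * back off instead of blocking. The cost is occasionally
    		 * skipping the work, which is why the kernel comment notes
    		 * a discard can be missed. */
    		printf("busy, skipping superblock update this round\n");
    		return;
    	}
    	printf("updating superblock\n");
    	pthread_mutex_unlock(&reconfig_mutex);
    }

    int main(void)
    {
    	reclaim_side_update();
    	return 0;
    }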
@@ -1923,7 +1923,8 @@ r5c_recovery_alloc_stripe(
 {
 	struct stripe_head *sh;

-	sh = raid5_get_active_stripe(conf, stripe_sect, 0, noblock, 0);
+	sh = raid5_get_active_stripe(conf, NULL, stripe_sect,
+				     noblock ? R5_GAS_NOBLOCK : 0);
 	if (!sh)
 		return NULL;  /* no more stripe available */
...
@@ -36,6 +36,7 @@
  */

 #include <linux/blkdev.h>
+#include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/raid/pq.h>
 #include <linux/async_tx.h>
@@ -789,87 +790,80 @@ struct stripe_request_ctx {
  */
 static bool is_inactive_blocked(struct r5conf *conf, int hash)
 {
-	int active = atomic_read(&conf->active_stripes);
-
 	if (list_empty(conf->inactive_list + hash))
 		return false;

 	if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
 		return true;

-	return active < (conf->max_nr_stripes * 3 / 4);
+	return (atomic_read(&conf->active_stripes) <
+		(conf->max_nr_stripes * 3 / 4));
 }

-static struct stripe_head *__raid5_get_active_stripe(struct r5conf *conf,
+struct stripe_head *raid5_get_active_stripe(struct r5conf *conf,
 		struct stripe_request_ctx *ctx, sector_t sector,
-		bool previous, bool noblock, bool noquiesce)
+		unsigned int flags)
 {
 	struct stripe_head *sh;
 	int hash = stripe_hash_locks_hash(conf, sector);
+	int previous = !!(flags & R5_GAS_PREVIOUS);

 	pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);

 	spin_lock_irq(conf->hash_locks + hash);

-retry:
-	if (!noquiesce && conf->quiesce) {
-		/*
-		 * Must release the reference to batch_last before waiting,
-		 * on quiesce, otherwise the batch_last will hold a reference
-		 * to a stripe and raid5_quiesce() will deadlock waiting for
-		 * active_stripes to go to zero.
-		 */
-		if (ctx && ctx->batch_last) {
-			raid5_release_stripe(ctx->batch_last);
-			ctx->batch_last = NULL;
-		}
-
-		wait_event_lock_irq(conf->wait_for_quiescent, !conf->quiesce,
-				    *(conf->hash_locks + hash));
-	}
-
-	sh = find_get_stripe(conf, sector, conf->generation - previous, hash);
-	if (sh)
-		goto out;
-
-	if (test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state))
-		goto wait_for_stripe;
-
-	sh = get_free_stripe(conf, hash);
-	if (sh) {
-		r5c_check_stripe_cache_usage(conf);
-		init_stripe(sh, sector, previous);
-		atomic_inc(&sh->count);
-		goto out;
-	}
-
-	if (!test_bit(R5_DID_ALLOC, &conf->cache_state))
-		set_bit(R5_ALLOC_MORE, &conf->cache_state);
-
-wait_for_stripe:
-	if (noblock)
-		goto out;
-
-	set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
-	r5l_wake_reclaim(conf->log, 0);
-	wait_event_lock_irq(conf->wait_for_stripe,
-			    is_inactive_blocked(conf, hash),
-			    *(conf->hash_locks + hash));
-	clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
-	goto retry;
+	for (;;) {
+		if (!(flags & R5_GAS_NOQUIESCE) && conf->quiesce) {
+			/*
+			 * Must release the reference to batch_last before
+			 * waiting, on quiesce, otherwise the batch_last will
+			 * hold a reference to a stripe and raid5_quiesce()
+			 * will deadlock waiting for active_stripes to go to
+			 * zero.
+			 */
+			if (ctx && ctx->batch_last) {
+				raid5_release_stripe(ctx->batch_last);
+				ctx->batch_last = NULL;
+			}
+
+			wait_event_lock_irq(conf->wait_for_quiescent,
+					    !conf->quiesce,
+					    *(conf->hash_locks + hash));
+		}
+
+		sh = find_get_stripe(conf, sector, conf->generation - previous,
+				     hash);
+		if (sh)
+			break;
+
+		if (!test_bit(R5_INACTIVE_BLOCKED, &conf->cache_state)) {
+			sh = get_free_stripe(conf, hash);
+			if (sh) {
+				r5c_check_stripe_cache_usage(conf);
+				init_stripe(sh, sector, previous);
+				atomic_inc(&sh->count);
+				break;
+			}
+
+			if (!test_bit(R5_DID_ALLOC, &conf->cache_state))
+				set_bit(R5_ALLOC_MORE, &conf->cache_state);
+		}
+
+		if (flags & R5_GAS_NOBLOCK)
+			break;
+
+		set_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
+		r5l_wake_reclaim(conf->log, 0);
+		wait_event_lock_irq(conf->wait_for_stripe,
+				    is_inactive_blocked(conf, hash),
+				    *(conf->hash_locks + hash));
+		clear_bit(R5_INACTIVE_BLOCKED, &conf->cache_state);
+	}

-out:
 	spin_unlock_irq(conf->hash_locks + hash);
 	return sh;
 }

-struct stripe_head *raid5_get_active_stripe(struct r5conf *conf,
-		sector_t sector, bool previous, bool noblock, bool noquiesce)
-{
-	return __raid5_get_active_stripe(conf, NULL, sector, previous, noblock,
-					 noquiesce);
-}
-
 static bool is_full_stripe_write(struct stripe_head *sh)
 {
 	BUG_ON(sh->overwrite_disks > (sh->disks - sh->raid_conf->max_degraded));
@@ -4047,7 +4041,7 @@ static void handle_stripe_fill(struct stripe_head *sh,
 		 * back cache (prexor with orig_page, and then xor with
 		 * page) in the read path
 		 */
-		if (s->injournal && s->failed) {
+		if (s->to_read && s->injournal && s->failed) {
 			if (test_bit(STRIPE_R5C_CACHING, &sh->state))
 				r5c_make_stripe_write_out(sh);
 			goto out;
@@ -4636,7 +4630,8 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
 			sector_t bn = raid5_compute_blocknr(sh, i, 1);
 			sector_t s = raid5_compute_sector(conf, bn, 0,
 							  &dd_idx, NULL);
-			sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
+			sh2 = raid5_get_active_stripe(conf, NULL, s,
+				R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE);
 			if (sh2 == NULL)
 				/* so far only the early blocks of this stripe
 				 * have been requested.  When later blocks
@@ -5273,7 +5268,9 @@ static void handle_stripe(struct stripe_head *sh)
 	/* Finish reconstruct operations initiated by the expansion process */
 	if (sh->reconstruct_state == reconstruct_state_result) {
 		struct stripe_head *sh_src
-			= raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
+			= raid5_get_active_stripe(conf, NULL, sh->sector,
+					R5_GAS_PREVIOUS | R5_GAS_NOBLOCK |
+					R5_GAS_NOQUIESCE);
 		if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
 			/* sh cannot be written until sh_src has been read.
 			 * so arrange for sh to be delayed a little
@@ -5542,7 +5539,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 		if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad,
 				&bad_sectors)) {
-			bio_put(raid_bio);
 			rdev_dec_pending(rdev, mddev);
 			return 0;
 		}
@@ -5823,7 +5819,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 		DEFINE_WAIT(w);
 		int d;
 	again:
-		sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
+		sh = raid5_get_active_stripe(conf, NULL, logical_sector, 0);
 		prepare_to_wait(&conf->wait_for_overlap, &w,
 				TASK_UNINTERRUPTIBLE);
 		set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
@@ -5978,7 +5974,7 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
 	enum stripe_result ret;
 	struct stripe_head *sh;
 	sector_t new_sector;
-	int previous = 0;
+	int previous = 0, flags = 0;
 	int seq, dd_idx;

 	seq = read_seqcount_begin(&conf->gen_lock);
@@ -6012,8 +6008,11 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
 	pr_debug("raid456: %s, sector %llu logical %llu\n", __func__,
 		 new_sector, logical_sector);

-	sh = __raid5_get_active_stripe(conf, ctx, new_sector, previous,
-				       (bi->bi_opf & REQ_RAHEAD), 0);
+	if (previous)
+		flags |= R5_GAS_PREVIOUS;
+	if (bi->bi_opf & REQ_RAHEAD)
+		flags |= R5_GAS_NOBLOCK;
+	sh = raid5_get_active_stripe(conf, ctx, new_sector, flags);
 	if (unlikely(!sh)) {
 		/* cannot get stripe, just give-up */
 		bi->bi_status = BLK_STS_IOERR;
@@ -6362,7 +6361,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 	for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) {
 		int j;
 		int skipped_disk = 0;
-		sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
+		sh = raid5_get_active_stripe(conf, NULL, stripe_addr+i,
+					     R5_GAS_NOQUIESCE);
 		set_bit(STRIPE_EXPANDING, &sh->state);
 		atomic_inc(&conf->reshape_stripes);
 		/* If any of this stripe is beyond the end of the old
@@ -6411,7 +6411,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 	if (last_sector >= mddev->dev_sectors)
 		last_sector = mddev->dev_sectors - 1;
 	while (first_sector <= last_sector) {
-		sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
+		sh = raid5_get_active_stripe(conf, NULL, first_sector,
+				R5_GAS_PREVIOUS | R5_GAS_NOQUIESCE);
 		set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
 		set_bit(STRIPE_HANDLE, &sh->state);
 		raid5_release_stripe(sh);
@@ -6531,9 +6532,10 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n
 	md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);

-	sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
+	sh = raid5_get_active_stripe(conf, NULL, sector_nr,
+				     R5_GAS_NOBLOCK);
 	if (sh == NULL) {
-		sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
+		sh = raid5_get_active_stripe(conf, NULL, sector_nr, 0);
 		/* make sure we don't swamp the stripe cache if someone else
 		 * is trying to get access
 		 */
@@ -6596,8 +6598,8 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio,
 			/* already done this stripe */
 			continue;

-		sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
+		sh = raid5_get_active_stripe(conf, NULL, sector,
+				R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE);
 		if (!sh) {
 			/* failed to get a stripe - must wait */
 			conf->retry_read_aligned = raid_bio;
@@ -6781,7 +6783,18 @@ static void raid5d(struct md_thread *thread)
 			spin_unlock_irq(&conf->device_lock);
 			md_check_recovery(mddev);
 			spin_lock_irq(&conf->device_lock);
+
+			/*
+			 * Waiting on MD_SB_CHANGE_PENDING below may deadlock
+			 * seeing md_check_recovery() is needed to clear
+			 * the flag when using mdmon.
+			 */
+			continue;
 		}
+
+		wait_event_lock_irq(mddev->sb_wait,
+			!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
+			conf->device_lock);
 	}
 	pr_debug("%d stripes handled\n", handled);
...
@@ -803,16 +803,24 @@ raid5_get_dev_page(struct stripe_head *sh, int disk_idx)
 }
 #endif

-extern void md_raid5_kick_device(struct r5conf *conf);
-extern int raid5_set_cache_size(struct mddev *mddev, int size);
-extern sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous);
-extern void raid5_release_stripe(struct stripe_head *sh);
-extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
-				     int previous, int *dd_idx,
-				     struct stripe_head *sh);
-extern struct stripe_head *
-raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
-			bool previous, bool noblock, bool noquiesce);
-extern int raid5_calc_degraded(struct r5conf *conf);
-extern int r5c_journal_mode_set(struct mddev *mddev, int journal_mode);
+void md_raid5_kick_device(struct r5conf *conf);
+int raid5_set_cache_size(struct mddev *mddev, int size);
+sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous);
+void raid5_release_stripe(struct stripe_head *sh);
+sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
+			      int previous, int *dd_idx, struct stripe_head *sh);
+
+struct stripe_request_ctx;
+/* get stripe from previous generation (when reshaping) */
+#define R5_GAS_PREVIOUS		(1 << 0)
+/* do not block waiting for a free stripe */
+#define R5_GAS_NOBLOCK		(1 << 1)
+/* do not block waiting for quiesce to be released */
+#define R5_GAS_NOQUIESCE	(1 << 2)
+struct stripe_head *raid5_get_active_stripe(struct r5conf *conf,
+		struct stripe_request_ctx *ctx, sector_t sector,
+		unsigned int flags);
+
+int raid5_calc_degraded(struct r5conf *conf);
+int r5c_journal_mode_set(struct mddev *mddev, int journal_mode);
 #endif
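The R5_GAS_* flags replace the old positional previous/noblock/noquiesce bools, which were easy to transpose at call sites: compare the old raid5_get_active_stripe(conf, s, 0, 1, 1) with the new R5_GAS_NOBLOCK | R5_GAS_NOQUIESCE. A small self-contained sketch of the same bool-to-flags pattern; the demo function and its output are stand-ins, not kernel code:

    #include <stdio.h>

    #define GAS_PREVIOUS	(1 << 0)
    #define GAS_NOBLOCK	(1 << 1)
    #define GAS_NOQUIESCE	(1 << 2)

    static void get_stripe(unsigned int flags)
    {
    	/* One self-documenting parameter instead of three positional
    	 * bools: the caller's intent is visible at the call site. */
    	printf("previous=%d noblock=%d noquiesce=%d\n",
    	       !!(flags & GAS_PREVIOUS),
    	       !!(flags & GAS_NOBLOCK),
    	       !!(flags & GAS_NOQUIESCE));
    }

    int main(void)
    {
    	get_stripe(GAS_NOBLOCK | GAS_NOQUIESCE); /* was (..., 0, 1, 1) */
    	return 0;
    }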