Commit 2a4c32ed authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

Pull MD updates from Shaohua Li:

 - a raid5 writeback cache feature.

   The goal is to aggregate writes to make full-stripe writes and
   reduce read-modify-write cycles. It's helpful for workloads that do
   sequential writes followed by fsync, for example. This feature is
   experimental and off by default right now.

 - FAILFAST support.

   This fails I/Os to broken raid disks quickly, which can improve
   latency. It's mainly for DASD storage, but some patches help normal
   raid arrays too (see the sketch after this list).

 - bad block support for raid arrays with external metadata

 - AVX2 instruction support for raid6 parity calculation

 - normalize MD info output

 - add missing blktrace tracepoints

 - other bug fixes
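
   A minimal sketch of the FAILFAST mechanics (illustration only, not
   code from this merge; have_other_copy is a hypothetical stand-in
   for the personality-specific redundancy check):

	/* MD_FAILFAST is REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT (md.h).
	 * Only fail reads fast while another copy can still serve a
	 * retry; otherwise fall back to the normal, fully retried path.
	 */
	if (test_bit(FailFast, &rdev->flags) && have_other_copy)
		read_bio->bi_opf |= MD_FAILFAST;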

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md: (66 commits)
  md: separate flags for superblock changes
  md: MD_RECOVERY_NEEDED is set for mddev->recovery
  md: takeover should clear unrelated bits
  md/r5cache: after recovery, increase journal seq by 10000
  md/raid5-cache: fix crc in rewrite_data_only_stripes()
  md/raid5-cache: no recovery is required when create super-block
  md: fix refcount problem on mddev when stopping array.
  md/r5cache: do r5c_update_log_state after log recovery
  md/raid5-cache: adjust the write position of the empty block if no data blocks
  md/r5cache: run_no_space_stripes() when R5C_LOG_CRITICAL == 0
  md/raid5: limit request size according to implementation limits
  md/raid5-cache: do not need to set STRIPE_PREREAD_ACTIVE repeatedly
  md/raid5-cache: remove the unnecessary next_cp_seq field from the r5l_log
  md/raid5-cache: release the stripe_head at the appropriate location
  md/raid5-cache: use ring add to prevent overflow
  md/raid5-cache: remove unnecessary function parameters
  raid5-cache: don't set STRIPE_R5C_PARTIAL_STRIPE flag while load stripe into cache
  raid5-cache: add another check conditon before replaying one stripe
  md/r5cache: enable IRQs on error path
  md/r5cache: handle alloc_page failure
  ...
parents b9f98bd4 20737738
@@ -2011,7 +2011,7 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
 	sb->compat_features = cpu_to_le32(FEATURE_FLAG_SUPPORTS_V190);
 	/* Force writing of superblocks to disk */
-	set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
+	set_bit(MD_SB_CHANGE_DEVS, &rdev->mddev->sb_flags);
 	/* Any superblock is better than none, choose that if given */
 	return refdev ? 0 : 1;
@@ -3497,7 +3497,7 @@ static void rs_update_sbs(struct raid_set *rs)
 	struct mddev *mddev = &rs->md;
 	int ro = mddev->ro;
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
 	mddev->ro = 0;
 	md_update_sb(mddev, 1);
 	mddev->ro = ro;
...
@@ -21,6 +21,7 @@
 #include <linux/seq_file.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <trace/events/block.h>
 #include "md.h"
 #include "linear.h"
@@ -101,7 +102,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
 		sector_t sectors;
 		if (j < 0 || j >= raid_disks || disk->rdev) {
-			printk(KERN_ERR "md/linear:%s: disk numbering problem. Aborting!\n",
+			pr_warn("md/linear:%s: disk numbering problem. Aborting!\n",
 				mdname(mddev));
 			goto out;
 		}
@@ -123,7 +124,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
 			discard_supported = true;
 	}
 	if (cnt != raid_disks) {
-		printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
+		pr_warn("md/linear:%s: not enough drives present. Aborting!\n",
 			mdname(mddev));
 		goto out;
 	}
@@ -227,22 +228,22 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
 	}
 	do {
-		tmp_dev = which_dev(mddev, bio->bi_iter.bi_sector);
+		sector_t bio_sector = bio->bi_iter.bi_sector;
+		tmp_dev = which_dev(mddev, bio_sector);
 		start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
 		end_sector = tmp_dev->end_sector;
 		data_offset = tmp_dev->rdev->data_offset;
 		bio->bi_bdev = tmp_dev->rdev->bdev;
-		if (unlikely(bio->bi_iter.bi_sector >= end_sector ||
-			     bio->bi_iter.bi_sector < start_sector))
+		if (unlikely(bio_sector >= end_sector ||
+			     bio_sector < start_sector))
 			goto out_of_bounds;
 		if (unlikely(bio_end_sector(bio) > end_sector)) {
 			/* This bio crosses a device boundary, so we have to
 			 * split it.
 			 */
-			split = bio_split(bio, end_sector -
-					  bio->bi_iter.bi_sector,
+			split = bio_split(bio, end_sector - bio_sector,
 					  GFP_NOIO, fs_bio_set);
 			bio_chain(split, bio);
 		} else {
@@ -256,15 +257,18 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
 			     !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
 			/* Just ignore it */
 			bio_endio(split);
-		} else
+		} else {
+			if (mddev->gendisk)
+				trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
+						      split, disk_devt(mddev->gendisk),
+						      bio_sector);
 			generic_make_request(split);
+		}
 	} while (split != bio);
 	return;
 out_of_bounds:
-	printk(KERN_ERR
-	       "md/linear:%s: make_request: Sector %llu out of bounds on "
-	       "dev %s: %llu sectors, offset %llu\n",
+	pr_err("md/linear:%s: make_request: Sector %llu out of bounds on dev %s: %llu sectors, offset %llu\n",
 		mdname(mddev),
 		(unsigned long long)bio->bi_iter.bi_sector,
 		bdevname(tmp_dev->rdev->bdev, b),
@@ -275,7 +279,6 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
 static void linear_status (struct seq_file *seq, struct mddev *mddev)
 {
 	seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
 }
...
@@ -29,6 +29,16 @@
 #define MaxSector (~(sector_t)0)
+/*
+ * These flags should really be called "NO_RETRY" rather than
+ * "FAILFAST" because they don't make any promise about time lapse,
+ * only about the number of retries, which will be zero.
+ * REQ_FAILFAST_DRIVER is not included because
+ * Commit: 4a27446f3e39 ("[SCSI] modify scsi to handle new fail fast flags.")
+ * seems to suggest that the errors it avoids retrying should usually
+ * be retried.
+ */
+#define	MD_FAILFAST	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT)
 /*
  * MD's 'extended' device
  */
@@ -168,6 +178,19 @@ enum flag_bits {
				 * so it is safe to remove without
				 * another synchronize_rcu() call.
				 */
+	ExternalBbl,		/* External metadata provides bad
+				 * block management for a disk
+				 */
+	FailFast,		/* Minimal retries should be attempted on
+				 * this device, so use REQ_FAILFAST_DEV.
+				 * Also don't try to repair failed reads.
+				 * It is expected that no bad block log
+				 * is present.
+				 */
+	LastDev,		/* Seems to be the last working dev as
+				 * it didn't fail, so don't use FailFast
+				 * any more for metadata
+				 */
 };
 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -189,6 +212,31 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			       int is_new);
 struct md_cluster_info;
+enum mddev_flags {
+	MD_ARRAY_FIRST_USE,	/* First use of array, needs initialization */
+	MD_CLOSING,		/* If set, we are closing the array, do not open
+				 * it then */
+	MD_JOURNAL_CLEAN,	/* A raid with journal is already clean */
+	MD_HAS_JOURNAL,		/* The raid array has journal feature set */
+	MD_RELOAD_SB,		/* Reload the superblock because another node
+				 * updated it.
+				 */
+	MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node
+				   * already took resync lock, need to
+				   * release the lock */
+	MD_FAILFAST_SUPPORTED,	/* Using MD_FAILFAST on metadata writes is
+				 * supported as calls to md_error() will
+				 * never cause the array to become failed.
+				 */
+};
+
+enum mddev_sb_flags {
+	MD_SB_CHANGE_DEVS,	/* Some device status has changed */
+	MD_SB_CHANGE_CLEAN,	/* transition to or from 'clean' */
+	MD_SB_CHANGE_PENDING,	/* switch from 'clean' to 'active' in progress */
+	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
+};
 struct mddev {
	void *private;
	struct md_personality *pers;
@@ -196,21 +244,7 @@ struct mddev {
	int md_minor;
	struct list_head disks;
	unsigned long flags;
-#define MD_CHANGE_DEVS	0	/* Some device status has changed */
-#define MD_CHANGE_CLEAN 1	/* transition to or from 'clean' */
-#define MD_CHANGE_PENDING 2	/* switch from 'clean' to 'active' in progress */
-#define MD_UPDATE_SB_FLAGS (1 | 2 | 4)	/* If these are set, md_update_sb needed */
-#define MD_ARRAY_FIRST_USE 3	/* First use of array, needs initialization */
-#define MD_CLOSING	4	/* If set, we are closing the array, do not open
-				 * it then */
-#define MD_JOURNAL_CLEAN 5	/* A raid with journal is already clean */
-#define MD_HAS_JOURNAL	6	/* The raid array has journal feature set */
-#define MD_RELOAD_SB	7	/* Reload the superblock because another node
-				 * updated it.
-				 */
-#define MD_CLUSTER_RESYNC_LOCKED 8 /* cluster raid only, which means node
-				    * already took resync lock, need to
-				    * release the lock */
+	unsigned long sb_flags;
	int suspended;
	atomic_t active_io;
@@ -304,31 +338,6 @@ struct mddev {
	int parallel_resync;
	int ok_start_degraded;
-	/* recovery/resync flags
-	 * NEEDED:   we might need to start a resync/recover
-	 * RUNNING:  a thread is running, or about to be started
-	 * SYNC:     actually doing a resync, not a recovery
-	 * RECOVER:  doing recovery, or need to try it.
-	 * INTR:     resync needs to be aborted for some reason
-	 * DONE:     thread is done and is waiting to be reaped
-	 * REQUEST:  user-space has requested a sync (used with SYNC)
-	 * CHECK:    user-space request for check-only, no repair
-	 * RESHAPE:  A reshape is happening
-	 * ERROR:    sync-action interrupted because io-error
-	 *
-	 * If neither SYNC or RESHAPE are set, then it is a recovery.
-	 */
-#define	MD_RECOVERY_RUNNING	0
-#define	MD_RECOVERY_SYNC	1
-#define	MD_RECOVERY_RECOVER	2
-#define	MD_RECOVERY_INTR	3
-#define	MD_RECOVERY_DONE	4
-#define	MD_RECOVERY_NEEDED	5
-#define	MD_RECOVERY_REQUESTED	6
-#define	MD_RECOVERY_CHECK	7
-#define	MD_RECOVERY_RESHAPE	8
-#define	MD_RECOVERY_FROZEN	9
-#define	MD_RECOVERY_ERROR	10
	unsigned long recovery;
	/* If a RAID personality determines that recovery (of a particular
@@ -442,6 +451,23 @@ struct mddev {
	unsigned int good_device_nr;	/* good device num within cluster raid */
 };
+enum recovery_flags {
+	/*
+	 * If neither SYNC or RESHAPE are set, then it is a recovery.
+	 */
+	MD_RECOVERY_RUNNING,	/* a thread is running, or about to be started */
+	MD_RECOVERY_SYNC,	/* actually doing a resync, not a recovery */
+	MD_RECOVERY_RECOVER,	/* doing recovery, or need to try it. */
+	MD_RECOVERY_INTR,	/* resync needs to be aborted for some reason */
+	MD_RECOVERY_DONE,	/* thread is done and is waiting to be reaped */
+	MD_RECOVERY_NEEDED,	/* we might need to start a resync/recover */
+	MD_RECOVERY_REQUESTED,	/* user-space has requested a sync (used with SYNC) */
+	MD_RECOVERY_CHECK,	/* user-space request for check-only, no repair */
+	MD_RECOVERY_RESHAPE,	/* A reshape is happening */
+	MD_RECOVERY_FROZEN,	/* User request to abort, and not restart, any action */
+	MD_RECOVERY_ERROR,	/* sync-action interrupted because io-error */
+};
 static inline int __must_check mddev_lock(struct mddev *mddev)
 {
	return mutex_lock_interruptible(&mddev->reconfig_mutex);
@@ -623,7 +649,7 @@ extern int mddev_congested(struct mddev *mddev, int bits);
 extern void md_flush_request(struct mddev *mddev, struct bio *bio);
 extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
			    sector_t sector, int size, struct page *page);
-extern void md_super_wait(struct mddev *mddev);
+extern int md_super_wait(struct mddev *mddev);
 extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
			 struct page *page, int op, int op_flags,
			 bool metadata_op);
...
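
With the split above, code that previously set MD_CHANGE_* bits in
mddev->flags now targets mddev->sb_flags. A minimal sketch of the usual
dirtying sequence (surrounding context assumed, not taken from this
diff):

	/* Record that device state changed and ask the md thread to
	 * write the superblocks out.
	 */
	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	md_wakeup_thread(mddev->thread);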
@@ -52,7 +52,7 @@ static int multipath_map (struct mpconf *conf)
	}
	rcu_read_unlock();
-	printk(KERN_ERR "multipath_map(): no more operational IO paths?\n");
+	pr_crit_ratelimited("multipath_map(): no more operational IO paths?\n");
	return (-1);
 }
@@ -97,7 +97,7 @@ static void multipath_end_request(struct bio *bio)
		 */
		char b[BDEVNAME_SIZE];
		md_error (mp_bh->mddev, rdev);
-		printk(KERN_ERR "multipath: %s: rescheduling sector %llu\n",
+		pr_info("multipath: %s: rescheduling sector %llu\n",
			bdevname(rdev->bdev,b),
			(unsigned long long)bio->bi_iter.bi_sector);
		multipath_reschedule_retry(mp_bh);
@@ -194,8 +194,7 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
		 * first check if this is a queued request for a device
		 * which has just failed.
		 */
-		printk(KERN_ALERT
-		       "multipath: only one IO path left and IO error.\n");
+		pr_warn("multipath: only one IO path left and IO error.\n");
		/* leave it active... it's all we have */
		return;
	}
@@ -209,11 +208,9 @@ static void multipath_error (struct mddev *mddev, struct md_rdev *rdev)
		spin_unlock_irqrestore(&conf->device_lock, flags);
	}
	set_bit(Faulty, &rdev->flags);
-	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
-	printk(KERN_ALERT "multipath: IO failure on %s,"
-	       " disabling IO path.\n"
-	       "multipath: Operation continuing"
-	       " on %d IO paths.\n",
+	pr_err("multipath: IO failure on %s, disabling IO path.\n"
+	       "multipath: Operation continuing on %d IO paths.\n",
		bdevname(rdev->bdev, b),
		conf->raid_disks - mddev->degraded);
 }
@@ -223,19 +220,19 @@ static void print_multipath_conf (struct mpconf *conf)
	int i;
	struct multipath_info *tmp;
-	printk("MULTIPATH conf printout:\n");
+	pr_debug("MULTIPATH conf printout:\n");
	if (!conf) {
-		printk("(conf==NULL)\n");
+		pr_debug("(conf==NULL)\n");
		return;
	}
-	printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
+	pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
		conf->raid_disks);
	for (i = 0; i < conf->raid_disks; i++) {
		char b[BDEVNAME_SIZE];
		tmp = conf->multipaths + i;
		if (tmp->rdev)
-			printk(" disk%d, o:%d, dev:%s\n",
+			pr_debug(" disk%d, o:%d, dev:%s\n",
				i,!test_bit(Faulty, &tmp->rdev->flags),
				bdevname(tmp->rdev->bdev,b));
	}
@@ -292,8 +289,7 @@ static int multipath_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
	if (rdev == p->rdev) {
		if (test_bit(In_sync, &rdev->flags) ||
		    atomic_read(&rdev->nr_pending)) {
-			printk(KERN_ERR "hot-remove-disk, slot %d is identified"
-			       " but is still operational!\n", number);
+			pr_warn("hot-remove-disk, slot %d is identified but is still operational!\n", number);
			err = -EBUSY;
			goto abort;
		}
@@ -346,14 +342,12 @@ static void multipathd(struct md_thread *thread)
		bio->bi_iter.bi_sector = mp_bh->master_bio->bi_iter.bi_sector;
		if ((mp_bh->path = multipath_map (conf))<0) {
-			printk(KERN_ALERT "multipath: %s: unrecoverable IO read"
-			       " error for block %llu\n",
+			pr_err("multipath: %s: unrecoverable IO read error for block %llu\n",
				bdevname(bio->bi_bdev,b),
				(unsigned long long)bio->bi_iter.bi_sector);
			multipath_end_bh_io(mp_bh, -EIO);
		} else {
-			printk(KERN_ERR "multipath: %s: redirecting sector %llu"
-			       " to another IO path\n",
+			pr_err("multipath: %s: redirecting sector %llu to another IO path\n",
				bdevname(bio->bi_bdev,b),
				(unsigned long long)bio->bi_iter.bi_sector);
			*bio = *(mp_bh->master_bio);
@@ -389,7 +383,7 @@ static int multipath_run (struct mddev *mddev)
		return -EINVAL;
	if (mddev->level != LEVEL_MULTIPATH) {
-		printk("multipath: %s: raid level not set to multipath IO (%d)\n",
+		pr_warn("multipath: %s: raid level not set to multipath IO (%d)\n",
			mdname(mddev), mddev->level);
		goto out;
	}
@@ -401,21 +395,13 @@ static int multipath_run (struct mddev *mddev)
	conf = kzalloc(sizeof(struct mpconf), GFP_KERNEL);
	mddev->private = conf;
-	if (!conf) {
-		printk(KERN_ERR
-			"multipath: couldn't allocate memory for %s\n",
-			mdname(mddev));
+	if (!conf)
		goto out;
-	}
	conf->multipaths = kzalloc(sizeof(struct multipath_info)*mddev->raid_disks,
				   GFP_KERNEL);
-	if (!conf->multipaths) {
-		printk(KERN_ERR
-			"multipath: couldn't allocate memory for %s\n",
-			mdname(mddev));
+	if (!conf->multipaths)
		goto out_free_conf;
-	}
	working_disks = 0;
	rdev_for_each(rdev, mddev) {
@@ -439,7 +425,7 @@ static int multipath_run (struct mddev *mddev)
	INIT_LIST_HEAD(&conf->retry_list);
	if (!working_disks) {
-		printk(KERN_ERR "multipath: no operational IO paths for %s\n",
+		pr_warn("multipath: no operational IO paths for %s\n",
			mdname(mddev));
		goto out_free_conf;
	}
@@ -447,25 +433,15 @@ static int multipath_run (struct mddev *mddev)
	conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
						 sizeof(struct multipath_bh));
-	if (conf->pool == NULL) {
-		printk(KERN_ERR
-			"multipath: couldn't allocate memory for %s\n",
-			mdname(mddev));
+	if (conf->pool == NULL)
		goto out_free_conf;
-	}
-	{
-		mddev->thread = md_register_thread(multipathd, mddev,
-						   "multipath");
-		if (!mddev->thread) {
-			printk(KERN_ERR "multipath: couldn't allocate thread"
-				" for %s\n", mdname(mddev));
-			goto out_free_conf;
-		}
-	}
+	mddev->thread = md_register_thread(multipathd, mddev,
+					   "multipath");
+	if (!mddev->thread)
+		goto out_free_conf;
-	printk(KERN_INFO
-		"multipath: array %s active with %d out of %d IO paths\n",
+	pr_info("multipath: array %s active with %d out of %d IO paths\n",
		mdname(mddev), conf->raid_disks - mddev->degraded,
		mddev->raid_disks);
	/*
...
@@ -21,6 +21,7 @@
 #include <linux/seq_file.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <trace/events/block.h>
 #include "md.h"
 #include "raid0.h"
 #include "raid5.h"
@@ -51,20 +52,21 @@ static void dump_zones(struct mddev *mddev)
	char b[BDEVNAME_SIZE];
	struct r0conf *conf = mddev->private;
	int raid_disks = conf->strip_zone[0].nb_dev;
-	printk(KERN_INFO "md: RAID0 configuration for %s - %d zone%s\n",
+	pr_debug("md: RAID0 configuration for %s - %d zone%s\n",
		mdname(mddev),
		conf->nr_strip_zones, conf->nr_strip_zones==1?"":"s");
	for (j = 0; j < conf->nr_strip_zones; j++) {
-		printk(KERN_INFO "md: zone%d=[", j);
+		char line[200];
+		int len = 0;
		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
-			printk(KERN_CONT "%s%s", k?"/":"",
+			len += snprintf(line+len, 200-len, "%s%s", k?"/":"",
				bdevname(conf->devlist[j*raid_disks
						       + k]->bdev, b));
-		printk(KERN_CONT "]\n");
+		pr_debug("md: zone%d=[%s]\n", j, line);
		zone_size = conf->strip_zone[j].zone_end - zone_start;
-		printk(KERN_INFO "      zone-offset=%10lluKB, "
-		       "device-offset=%10lluKB, size=%10lluKB\n",
+		pr_debug("      zone-offset=%10lluKB, device-offset=%10lluKB, size=%10lluKB\n",
			(unsigned long long)zone_start>>1,
			(unsigned long long)conf->strip_zone[j].dev_start>>1,
			(unsigned long long)zone_size>>1);
@@ -142,7 +144,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
	 * chunk size is a multiple of that sector size
	 */
	if ((mddev->chunk_sectors << 9) % blksize) {
-		printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n",
+		pr_warn("md/raid0:%s: chunk_size of %d not multiple of block size %d\n",
			mdname(mddev),
			mddev->chunk_sectors << 9, blksize);
		err = -EINVAL;
@@ -186,19 +188,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
		}
		if (j < 0) {
-			printk(KERN_ERR
-			       "md/raid0:%s: remove inactive devices before converting to RAID0\n",
+			pr_warn("md/raid0:%s: remove inactive devices before converting to RAID0\n",
				mdname(mddev));
			goto abort;
		}
		if (j >= mddev->raid_disks) {
-			printk(KERN_ERR "md/raid0:%s: bad disk number %d - "
-			       "aborting!\n", mdname(mddev), j);
+			pr_warn("md/raid0:%s: bad disk number %d - aborting!\n",
				mdname(mddev), j);
			goto abort;
		}
		if (dev[j]) {
-			printk(KERN_ERR "md/raid0:%s: multiple devices for %d - "
-			       "aborting!\n", mdname(mddev), j);
+			pr_warn("md/raid0:%s: multiple devices for %d - aborting!\n",
				mdname(mddev), j);
			goto abort;
		}
		dev[j] = rdev1;
@@ -208,8 +209,8 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
		cnt++;
	}
	if (cnt != mddev->raid_disks) {
-		printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
-		       "aborting!\n", mdname(mddev), cnt, mddev->raid_disks);
+		pr_warn("md/raid0:%s: too few disks (%d of %d) - aborting!\n",
			mdname(mddev), cnt, mddev->raid_disks);
		goto abort;
	}
	zone->nb_dev = cnt;
@@ -357,8 +358,7 @@ static int raid0_run(struct mddev *mddev)
	int ret;
	if (mddev->chunk_sectors == 0) {
-		printk(KERN_ERR "md/raid0:%s: chunk size must be set.\n",
-		       mdname(mddev));
+		pr_warn("md/raid0:%s: chunk size must be set.\n", mdname(mddev));
		return -EINVAL;
	}
	if (md_check_no_bitmap(mddev))
@@ -399,7 +399,7 @@ static int raid0_run(struct mddev *mddev)
	/* calculate array device size */
	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
-	printk(KERN_INFO "md/raid0:%s: md_size is %llu sectors.\n",
+	pr_debug("md/raid0:%s: md_size is %llu sectors.\n",
		mdname(mddev),
		(unsigned long long)mddev->array_sectors);
@@ -464,7 +464,8 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
	}
	do {
-		sector_t sector = bio->bi_iter.bi_sector;
+		sector_t bio_sector = bio->bi_iter.bi_sector;
+		sector_t sector = bio_sector;
		unsigned chunk_sects = mddev->chunk_sectors;
		unsigned sectors = chunk_sects -
@@ -473,7 +474,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
			: sector_div(sector, chunk_sects));
		/* Restore due to sector_div */
-		sector = bio->bi_iter.bi_sector;
+		sector = bio_sector;
		if (sectors < bio_sectors(bio)) {
			split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
@@ -492,8 +493,13 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
			     !blk_queue_discard(bdev_get_queue(split->bi_bdev)))) {
			/* Just ignore it */
			bio_endio(split);
-		} else
+		} else {
+			if (mddev->gendisk)
+				trace_block_bio_remap(bdev_get_queue(split->bi_bdev),
+						      split, disk_devt(mddev->gendisk),
+						      bio_sector);
			generic_make_request(split);
+		}
	} while (split != bio);
 }
@@ -509,7 +515,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
	struct r0conf *priv_conf;
	if (mddev->degraded != 1) {
-		printk(KERN_ERR "md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n",
+		pr_warn("md/raid0:%s: raid5 must be degraded! Degraded disks: %d\n",
			mdname(mddev),
			mddev->degraded);
		return ERR_PTR(-EINVAL);
@@ -518,7 +524,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
	rdev_for_each(rdev, mddev) {
		/* check slot number for a disk */
		if (rdev->raid_disk == mddev->raid_disks-1) {
-			printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n",
+			pr_warn("md/raid0:%s: raid5 must have missing parity disk!\n",
				mdname(mddev));
			return ERR_PTR(-EINVAL);
		}
@@ -533,8 +539,11 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
	mddev->delta_disks = -1;
	/* make sure it will be not marked as dirty */
	mddev->recovery_cp = MaxSector;
+	clear_bit(MD_HAS_JOURNAL, &mddev->flags);
+	clear_bit(MD_JOURNAL_CLEAN, &mddev->flags);
	create_strip_zones(mddev, &priv_conf);
	return priv_conf;
 }
@@ -549,18 +558,18 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
	 * - all mirrors must be already degraded
	 */
	if (mddev->layout != ((1 << 8) + 2)) {
-		printk(KERN_ERR "md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n",
+		pr_warn("md/raid0:%s:: Raid0 cannot takeover layout: 0x%x\n",
			mdname(mddev),
			mddev->layout);
		return ERR_PTR(-EINVAL);
	}
	if (mddev->raid_disks & 1) {
-		printk(KERN_ERR "md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n",
+		pr_warn("md/raid0:%s: Raid0 cannot takeover Raid10 with odd disk number.\n",
			mdname(mddev));
		return ERR_PTR(-EINVAL);
	}
	if (mddev->degraded != (mddev->raid_disks>>1)) {
-		printk(KERN_ERR "md/raid0:%s: All mirrors must be already degraded!\n",
+		pr_warn("md/raid0:%s: All mirrors must be already degraded!\n",
			mdname(mddev));
		return ERR_PTR(-EINVAL);
	}
@@ -574,6 +583,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
	mddev->degraded = 0;
	/* make sure it will be not marked as dirty */
	mddev->recovery_cp = MaxSector;
+	clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
	create_strip_zones(mddev, &priv_conf);
	return priv_conf;
@@ -588,7 +598,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
	 * - (N - 1) mirror drives must be already faulty
	 */
	if ((mddev->raid_disks - 1) != mddev->degraded) {
-		printk(KERN_ERR "md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n",
+		pr_err("md/raid0:%s: (N - 1) mirrors drives must be already faulty!\n",
			mdname(mddev));
		return ERR_PTR(-EINVAL);
	}
@@ -616,6 +626,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
	mddev->raid_disks = 1;
	/* make sure it will be not marked as dirty */
	mddev->recovery_cp = MaxSector;
+	clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
	create_strip_zones(mddev, &priv_conf);
	return priv_conf;
@@ -631,7 +642,7 @@ static void *raid0_takeover(struct mddev *mddev)
	 */
	if (mddev->bitmap) {
-		printk(KERN_ERR "md/raid0: %s: cannot takeover array with bitmap\n",
+		pr_warn("md/raid0: %s: cannot takeover array with bitmap\n",
			mdname(mddev));
		return ERR_PTR(-EBUSY);
	}
@@ -642,7 +653,7 @@ static void *raid0_takeover(struct mddev *mddev)
		if (mddev->layout == ALGORITHM_PARITY_N)
			return raid0_takeover_raid45(mddev);
-		printk(KERN_ERR "md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n",
+		pr_warn("md/raid0:%s: Raid can only takeover Raid5 with layout: %d\n",
			mdname(mddev), ALGORITHM_PARITY_N);
	}
@@ -652,7 +663,7 @@ static void *raid0_takeover(struct mddev *mddev)
	if (mddev->level == 1)
		return raid0_takeover_raid1(mddev);
-	printk(KERN_ERR "Takeover from raid%i to raid0 not supported\n",
+	pr_warn("Takeover from raid%i to raid0 not supported\n",
		mddev->level);
	return ERR_PTR(-EINVAL);
...
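
The takeover paths above clear MD_FAILFAST_SUPPORTED because raid0 has
no redundancy to fall back on. A sketch of how the rdev-level flags are
meant to gate failfast metadata writes (assumed shape; the real check
lives in md.c's superblock write path):

	/* Never use MD_FAILFAST on what appears to be the last working
	 * device, or a failed metadata write could take the whole
	 * array down.
	 */
	if (test_bit(FailFast, &rdev->flags) &&
	    !test_bit(LastDev, &rdev->flags))
		ff = MD_FAILFAST;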
@@ -161,14 +161,15 @@ struct r1bio {
 };
 /* bits for r1bio.state */
-#define	R1BIO_Uptodate	0
-#define	R1BIO_IsSync	1
-#define	R1BIO_Degraded	2
-#define	R1BIO_BehindIO	3
+enum r1bio_state {
+	R1BIO_Uptodate,
+	R1BIO_IsSync,
+	R1BIO_Degraded,
+	R1BIO_BehindIO,
 /* Set ReadError on bios that experience a readerror so that
  * raid1d knows what to do with them.
  */
-#define R1BIO_ReadError 4
+	R1BIO_ReadError,
 /* For write-behind requests, we call bi_end_io when
  * the last non-write-behind device completes, providing
  * any write was successful. Otherwise we call when
@@ -176,10 +177,12 @@ struct r1bio {
  * with failure when last write completes (and all failed).
  * Record that bi_end_io was called with this flag...
  */
-#define	R1BIO_Returned 6
+	R1BIO_Returned,
 /* If a write for this request means we can clear some
  * known-bad-block records, we set this flag
  */
-#define R1BIO_MadeGood 7
-#define R1BIO_WriteError 8
+	R1BIO_MadeGood,
+	R1BIO_WriteError,
+	R1BIO_FailFast,
+};
 #endif
@@ -156,5 +156,7 @@ enum r10bio_state {
				 * flag is set
				 */
	R10BIO_Previous,
+/* failfast devices did receive failfast requests. */
+	R10BIO_FailFast,
 };
 #endif
@@ -84,6 +84,10 @@
 #define MD_DISK_CANDIDATE	5 /* disk is added as spare (local) until confirmed
				   * For clustered environments only.
				   */
+#define	MD_DISK_FAILFAST	10 /* Send REQ_FAILFAST if there are multiple
+				    * devices available - and don't try to
+				    * correct read errors.
+				    */
 #define	MD_DISK_WRITEMOSTLY	9 /* disk is "write-mostly" in RAID1 config.
				   * read requests will only be sent here in
@@ -265,8 +269,9 @@ struct mdp_superblock_1 {
	__le32	dev_number;	/* permanent identifier of this device - not role in raid */
	__le32	cnt_corrected_read; /* number of read errors that were corrected by re-writing */
	__u8	device_uuid[16]; /* user-space setable, ignored by kernel */
-	__u8	devflags;	/* per-device flags.  Only one defined...*/
+	__u8	devflags;	/* per-device flags.  Only two defined...*/
 #define	WriteMostly1	1	/* mask for writemostly flag in above */
+#define	FailFast1	2	/* Should avoid retries and fixups and just fail */
 /* Bad block log. If there are any bad blocks the feature flag is set.
  * If offset and size are non-zero, that space is reserved and available
  */
...
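
For illustration, the new on-disk bit would be propagated to the
in-kernel rdev flag during v1.x superblock parsing along these lines (a
sketch, assuming a super_1_load()-style context):

	/* Mirror the persistent failfast setting into rdev->flags. */
	if (sb->devflags & FailFast1)
		set_bit(FailFast, &rdev->flags);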