Commit e9c7469b authored by Tejun Heo, committed by Jens Axboe

md: implement REQ_FLUSH/FUA support

This patch converts md to support REQ_FLUSH/FUA instead of the now
deprecated REQ_HARDBARRIER.  In the core part (md.c), the following
changes are notable.

* Unlike REQ_HARDBARRIER, REQ_FLUSH/FUA don't interfere with
  processing of other requests and thus there is no reason to mark the
  queue congested while FLUSH/FUA is in progress.

* REQ_FLUSH/FUA failures are final and their users don't need retry
  logic.  Retry logic is removed.

* Preflush needs to be issued to all member devices but FUA writes can
  be handled the same way as other writes - their processing can be
  deferred to request_queue of member devices.  md_barrier_request()
  is renamed to md_flush_request() and simplified accordingly.

For linear, raid0 and multipath, the core changes are enough.  raid1,
5 and 10 need the following conversions.

* raid1: Handling of FLUSH/FUA bio's can simply be deferred to
  request_queues of member devices.  Barrier related logic removed.

* raid5: Queue draining logic dropped.  FUA bit is propagated through
  biodrain and stripe reconstruction such that all the updated parts
  of the stripe are written out with FUA writes if any of the dirtying
  writes was FUA.  preread_active_stripes handling in make_request()
  is updated as suggested by Neil Brown.

* raid10: FUA bit needs to be propagated to write clones.

linear, raid0, 1, 5 and 10 tested.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Neil Brown <neilb@suse.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
parent 7bc9fdda
...@@ -294,8 +294,8 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio) ...@@ -294,8 +294,8 @@ static int linear_make_request (mddev_t *mddev, struct bio *bio)
dev_info_t *tmp_dev; dev_info_t *tmp_dev;
sector_t start_sector; sector_t start_sector;
if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { if (unlikely(bio->bi_rw & REQ_FLUSH)) {
md_barrier_request(mddev, bio); md_flush_request(mddev, bio);
return 0; return 0;
} }
......
...@@ -226,12 +226,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio) ...@@ -226,12 +226,12 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
return 0; return 0;
} }
rcu_read_lock(); rcu_read_lock();
if (mddev->suspended || mddev->barrier) { if (mddev->suspended) {
DEFINE_WAIT(__wait); DEFINE_WAIT(__wait);
for (;;) { for (;;) {
prepare_to_wait(&mddev->sb_wait, &__wait, prepare_to_wait(&mddev->sb_wait, &__wait,
TASK_UNINTERRUPTIBLE); TASK_UNINTERRUPTIBLE);
if (!mddev->suspended && !mddev->barrier) if (!mddev->suspended)
break; break;
rcu_read_unlock(); rcu_read_unlock();
schedule(); schedule();
...@@ -282,40 +282,29 @@ EXPORT_SYMBOL_GPL(mddev_resume); ...@@ -282,40 +282,29 @@ EXPORT_SYMBOL_GPL(mddev_resume);
int mddev_congested(mddev_t *mddev, int bits) int mddev_congested(mddev_t *mddev, int bits)
{ {
if (mddev->barrier)
return 1;
return mddev->suspended; return mddev->suspended;
} }
EXPORT_SYMBOL(mddev_congested); EXPORT_SYMBOL(mddev_congested);
/* /*
* Generic barrier handling for md * Generic flush handling for md
*/ */
#define POST_REQUEST_BARRIER ((void*)1) static void md_end_flush(struct bio *bio, int err)
static void md_end_barrier(struct bio *bio, int err)
{ {
mdk_rdev_t *rdev = bio->bi_private; mdk_rdev_t *rdev = bio->bi_private;
mddev_t *mddev = rdev->mddev; mddev_t *mddev = rdev->mddev;
if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
rdev_dec_pending(rdev, mddev); rdev_dec_pending(rdev, mddev);
if (atomic_dec_and_test(&mddev->flush_pending)) { if (atomic_dec_and_test(&mddev->flush_pending)) {
if (mddev->barrier == POST_REQUEST_BARRIER) { /* The pre-request flush has finished */
/* This was a post-request barrier */ schedule_work(&mddev->flush_work);
mddev->barrier = NULL;
wake_up(&mddev->sb_wait);
} else
/* The pre-request barrier has finished */
schedule_work(&mddev->barrier_work);
} }
bio_put(bio); bio_put(bio);
} }
static void submit_barriers(mddev_t *mddev) static void submit_flushes(mddev_t *mddev)
{ {
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
...@@ -332,60 +321,56 @@ static void submit_barriers(mddev_t *mddev) ...@@ -332,60 +321,56 @@ static void submit_barriers(mddev_t *mddev)
atomic_inc(&rdev->nr_pending); atomic_inc(&rdev->nr_pending);
rcu_read_unlock(); rcu_read_unlock();
bi = bio_alloc(GFP_KERNEL, 0); bi = bio_alloc(GFP_KERNEL, 0);
bi->bi_end_io = md_end_barrier; bi->bi_end_io = md_end_flush;
bi->bi_private = rdev; bi->bi_private = rdev;
bi->bi_bdev = rdev->bdev; bi->bi_bdev = rdev->bdev;
atomic_inc(&mddev->flush_pending); atomic_inc(&mddev->flush_pending);
submit_bio(WRITE_BARRIER, bi); submit_bio(WRITE_FLUSH, bi);
rcu_read_lock(); rcu_read_lock();
rdev_dec_pending(rdev, mddev); rdev_dec_pending(rdev, mddev);
} }
rcu_read_unlock(); rcu_read_unlock();
} }
static void md_submit_barrier(struct work_struct *ws) static void md_submit_flush_data(struct work_struct *ws)
{ {
mddev_t *mddev = container_of(ws, mddev_t, barrier_work); mddev_t *mddev = container_of(ws, mddev_t, flush_work);
struct bio *bio = mddev->barrier; struct bio *bio = mddev->flush_bio;
atomic_set(&mddev->flush_pending, 1); atomic_set(&mddev->flush_pending, 1);
if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) if (bio->bi_size == 0)
bio_endio(bio, -EOPNOTSUPP);
else if (bio->bi_size == 0)
/* an empty barrier - all done */ /* an empty barrier - all done */
bio_endio(bio, 0); bio_endio(bio, 0);
else { else {
bio->bi_rw &= ~REQ_HARDBARRIER; bio->bi_rw &= ~REQ_FLUSH;
if (mddev->pers->make_request(mddev, bio)) if (mddev->pers->make_request(mddev, bio))
generic_make_request(bio); generic_make_request(bio);
mddev->barrier = POST_REQUEST_BARRIER;
submit_barriers(mddev);
} }
if (atomic_dec_and_test(&mddev->flush_pending)) { if (atomic_dec_and_test(&mddev->flush_pending)) {
mddev->barrier = NULL; mddev->flush_bio = NULL;
wake_up(&mddev->sb_wait); wake_up(&mddev->sb_wait);
} }
} }
void md_barrier_request(mddev_t *mddev, struct bio *bio) void md_flush_request(mddev_t *mddev, struct bio *bio)
{ {
spin_lock_irq(&mddev->write_lock); spin_lock_irq(&mddev->write_lock);
wait_event_lock_irq(mddev->sb_wait, wait_event_lock_irq(mddev->sb_wait,
!mddev->barrier, !mddev->flush_bio,
mddev->write_lock, /*nothing*/); mddev->write_lock, /*nothing*/);
mddev->barrier = bio; mddev->flush_bio = bio;
spin_unlock_irq(&mddev->write_lock); spin_unlock_irq(&mddev->write_lock);
atomic_set(&mddev->flush_pending, 1); atomic_set(&mddev->flush_pending, 1);
INIT_WORK(&mddev->barrier_work, md_submit_barrier); INIT_WORK(&mddev->flush_work, md_submit_flush_data);
submit_barriers(mddev); submit_flushes(mddev);
if (atomic_dec_and_test(&mddev->flush_pending)) if (atomic_dec_and_test(&mddev->flush_pending))
schedule_work(&mddev->barrier_work); schedule_work(&mddev->flush_work);
} }
EXPORT_SYMBOL(md_barrier_request); EXPORT_SYMBOL(md_flush_request);
/* Support for plugging. /* Support for plugging.
* This mirrors the plugging support in request_queue, but does not * This mirrors the plugging support in request_queue, but does not
...@@ -696,31 +681,6 @@ static void super_written(struct bio *bio, int error) ...@@ -696,31 +681,6 @@ static void super_written(struct bio *bio, int error)
bio_put(bio); bio_put(bio);
} }
static void super_written_barrier(struct bio *bio, int error)
{
struct bio *bio2 = bio->bi_private;
mdk_rdev_t *rdev = bio2->bi_private;
mddev_t *mddev = rdev->mddev;
if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
error == -EOPNOTSUPP) {
unsigned long flags;
/* barriers don't appear to be supported :-( */
set_bit(BarriersNotsupp, &rdev->flags);
mddev->barriers_work = 0;
spin_lock_irqsave(&mddev->write_lock, flags);
bio2->bi_next = mddev->biolist;
mddev->biolist = bio2;
spin_unlock_irqrestore(&mddev->write_lock, flags);
wake_up(&mddev->sb_wait);
bio_put(bio);
} else {
bio_put(bio2);
bio->bi_private = rdev;
super_written(bio, error);
}
}
void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
sector_t sector, int size, struct page *page) sector_t sector, int size, struct page *page)
{ {
...@@ -729,51 +689,28 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, ...@@ -729,51 +689,28 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
* and decrement it on completion, waking up sb_wait * and decrement it on completion, waking up sb_wait
* if zero is reached. * if zero is reached.
* If an error occurred, call md_error * If an error occurred, call md_error
*
* As we might need to resubmit the request if REQ_HARDBARRIER
* causes ENOTSUPP, we allocate a spare bio...
*/ */
struct bio *bio = bio_alloc(GFP_NOIO, 1); struct bio *bio = bio_alloc(GFP_NOIO, 1);
int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
bio->bi_bdev = rdev->bdev; bio->bi_bdev = rdev->bdev;
bio->bi_sector = sector; bio->bi_sector = sector;
bio_add_page(bio, page, size, 0); bio_add_page(bio, page, size, 0);
bio->bi_private = rdev; bio->bi_private = rdev;
bio->bi_end_io = super_written; bio->bi_end_io = super_written;
bio->bi_rw = rw;
atomic_inc(&mddev->pending_writes); atomic_inc(&mddev->pending_writes);
if (!test_bit(BarriersNotsupp, &rdev->flags)) { submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
struct bio *rbio; bio);
rw |= REQ_HARDBARRIER;
rbio = bio_clone(bio, GFP_NOIO);
rbio->bi_private = bio;
rbio->bi_end_io = super_written_barrier;
submit_bio(rw, rbio);
} else
submit_bio(rw, bio);
} }
void md_super_wait(mddev_t *mddev) void md_super_wait(mddev_t *mddev)
{ {
/* wait for all superblock writes that were scheduled to complete. /* wait for all superblock writes that were scheduled to complete */
* if any had to be retried (due to BARRIER problems), retry them
*/
DEFINE_WAIT(wq); DEFINE_WAIT(wq);
for(;;) { for(;;) {
prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
if (atomic_read(&mddev->pending_writes)==0) if (atomic_read(&mddev->pending_writes)==0)
break; break;
while (mddev->biolist) {
struct bio *bio;
spin_lock_irq(&mddev->write_lock);
bio = mddev->biolist;
mddev->biolist = bio->bi_next ;
bio->bi_next = NULL;
spin_unlock_irq(&mddev->write_lock);
submit_bio(bio->bi_rw, bio);
}
schedule(); schedule();
} }
finish_wait(&mddev->sb_wait, &wq); finish_wait(&mddev->sb_wait, &wq);
...@@ -1070,7 +1007,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -1070,7 +1007,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
clear_bit(Faulty, &rdev->flags); clear_bit(Faulty, &rdev->flags);
clear_bit(In_sync, &rdev->flags); clear_bit(In_sync, &rdev->flags);
clear_bit(WriteMostly, &rdev->flags); clear_bit(WriteMostly, &rdev->flags);
clear_bit(BarriersNotsupp, &rdev->flags);
if (mddev->raid_disks == 0) { if (mddev->raid_disks == 0) {
mddev->major_version = 0; mddev->major_version = 0;
...@@ -1485,7 +1421,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) ...@@ -1485,7 +1421,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
clear_bit(Faulty, &rdev->flags); clear_bit(Faulty, &rdev->flags);
clear_bit(In_sync, &rdev->flags); clear_bit(In_sync, &rdev->flags);
clear_bit(WriteMostly, &rdev->flags); clear_bit(WriteMostly, &rdev->flags);
clear_bit(BarriersNotsupp, &rdev->flags);
if (mddev->raid_disks == 0) { if (mddev->raid_disks == 0) {
mddev->major_version = 1; mddev->major_version = 1;
...@@ -4506,7 +4441,6 @@ int md_run(mddev_t *mddev) ...@@ -4506,7 +4441,6 @@ int md_run(mddev_t *mddev)
/* may be over-ridden by personality */ /* may be over-ridden by personality */
mddev->resync_max_sectors = mddev->dev_sectors; mddev->resync_max_sectors = mddev->dev_sectors;
mddev->barriers_work = 1;
mddev->ok_start_degraded = start_dirty_degraded; mddev->ok_start_degraded = start_dirty_degraded;
if (start_readonly && mddev->ro == 0) if (start_readonly && mddev->ro == 0)
...@@ -4685,7 +4619,6 @@ static void md_clean(mddev_t *mddev) ...@@ -4685,7 +4619,6 @@ static void md_clean(mddev_t *mddev)
mddev->recovery = 0; mddev->recovery = 0;
mddev->in_sync = 0; mddev->in_sync = 0;
mddev->degraded = 0; mddev->degraded = 0;
mddev->barriers_work = 0;
mddev->safemode = 0; mddev->safemode = 0;
mddev->bitmap_info.offset = 0; mddev->bitmap_info.offset = 0;
mddev->bitmap_info.default_offset = 0; mddev->bitmap_info.default_offset = 0;
......
...@@ -87,7 +87,6 @@ struct mdk_rdev_s ...@@ -87,7 +87,6 @@ struct mdk_rdev_s
#define Faulty 1 /* device is known to have a fault */ #define Faulty 1 /* device is known to have a fault */
#define In_sync 2 /* device is in_sync with rest of array */ #define In_sync 2 /* device is in_sync with rest of array */
#define WriteMostly 4 /* Avoid reading if at all possible */ #define WriteMostly 4 /* Avoid reading if at all possible */
#define BarriersNotsupp 5 /* REQ_HARDBARRIER is not supported */
#define AllReserved 6 /* If whole device is reserved for #define AllReserved 6 /* If whole device is reserved for
* one array */ * one array */
#define AutoDetected 7 /* added by auto-detect */ #define AutoDetected 7 /* added by auto-detect */
...@@ -273,13 +272,6 @@ struct mddev_s ...@@ -273,13 +272,6 @@ struct mddev_s
int degraded; /* whether md should consider int degraded; /* whether md should consider
* adding a spare * adding a spare
*/ */
int barriers_work; /* initialised to true, cleared as soon
* as a barrier request to slave
* fails. Only supported
*/
struct bio *biolist; /* bios that need to be retried
* because REQ_HARDBARRIER is not supported
*/
atomic_t recovery_active; /* blocks scheduled, but not written */ atomic_t recovery_active; /* blocks scheduled, but not written */
wait_queue_head_t recovery_wait; wait_queue_head_t recovery_wait;
...@@ -339,16 +331,13 @@ struct mddev_s ...@@ -339,16 +331,13 @@ struct mddev_s
struct attribute_group *to_remove; struct attribute_group *to_remove;
struct plug_handle *plug; /* if used by personality */ struct plug_handle *plug; /* if used by personality */
/* Generic barrier handling. /* Generic flush handling.
* If there is a pending barrier request, all other * The last to finish preflush schedules a worker to submit
* writes are blocked while the devices are flushed. * the rest of the request (without the REQ_FLUSH flag).
* The last to finish a flush schedules a worker to
* submit the barrier request (without the barrier flag),
* then submit more flush requests.
*/ */
struct bio *barrier; struct bio *flush_bio;
atomic_t flush_pending; atomic_t flush_pending;
struct work_struct barrier_work; struct work_struct flush_work;
struct work_struct event_work; /* used by dm to report failure event */ struct work_struct event_work; /* used by dm to report failure event */
}; };
...@@ -502,7 +491,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok); ...@@ -502,7 +491,7 @@ extern void md_done_sync(mddev_t *mddev, int blocks, int ok);
extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev); extern void md_error(mddev_t *mddev, mdk_rdev_t *rdev);
extern int mddev_congested(mddev_t *mddev, int bits); extern int mddev_congested(mddev_t *mddev, int bits);
extern void md_barrier_request(mddev_t *mddev, struct bio *bio); extern void md_flush_request(mddev_t *mddev, struct bio *bio);
extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, extern void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
sector_t sector, int size, struct page *page); sector_t sector, int size, struct page *page);
extern void md_super_wait(mddev_t *mddev); extern void md_super_wait(mddev_t *mddev);
......
...@@ -142,8 +142,8 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio) ...@@ -142,8 +142,8 @@ static int multipath_make_request(mddev_t *mddev, struct bio * bio)
struct multipath_bh * mp_bh; struct multipath_bh * mp_bh;
struct multipath_info *multipath; struct multipath_info *multipath;
if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { if (unlikely(bio->bi_rw & REQ_FLUSH)) {
md_barrier_request(mddev, bio); md_flush_request(mddev, bio);
return 0; return 0;
} }
......
...@@ -483,8 +483,8 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio) ...@@ -483,8 +483,8 @@ static int raid0_make_request(mddev_t *mddev, struct bio *bio)
struct strip_zone *zone; struct strip_zone *zone;
mdk_rdev_t *tmp_dev; mdk_rdev_t *tmp_dev;
if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { if (unlikely(bio->bi_rw & REQ_FLUSH)) {
md_barrier_request(mddev, bio); md_flush_request(mddev, bio);
return 0; return 0;
} }
......
...@@ -319,83 +319,74 @@ static void raid1_end_write_request(struct bio *bio, int error) ...@@ -319,83 +319,74 @@ static void raid1_end_write_request(struct bio *bio, int error)
if (r1_bio->bios[mirror] == bio) if (r1_bio->bios[mirror] == bio)
break; break;
if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { /*
set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); * 'one mirror IO has finished' event handler:
set_bit(R1BIO_BarrierRetry, &r1_bio->state); */
r1_bio->mddev->barriers_work = 0; r1_bio->bios[mirror] = NULL;
/* Don't rdev_dec_pending in this branch - keep it for the retry */ to_put = bio;
} else { if (!uptodate) {
md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
/* an I/O failed, we can't clear the bitmap */
set_bit(R1BIO_Degraded, &r1_bio->state);
} else
/* /*
* this branch is our 'one mirror IO has finished' event handler: * Set R1BIO_Uptodate in our master bio, so that we
* will return a good error code for to the higher
* levels even if IO on some other mirrored buffer
* fails.
*
* The 'master' represents the composite IO operation
* to user-side. So if something waits for IO, then it
* will wait for the 'master' bio.
*/ */
r1_bio->bios[mirror] = NULL; set_bit(R1BIO_Uptodate, &r1_bio->state);
to_put = bio;
if (!uptodate) { update_head_pos(mirror, r1_bio);
md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
/* an I/O failed, we can't clear the bitmap */ if (behind) {
set_bit(R1BIO_Degraded, &r1_bio->state); if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
} else atomic_dec(&r1_bio->behind_remaining);
/*
* Set R1BIO_Uptodate in our master bio, so that /*
* we will return a good error code for to the higher * In behind mode, we ACK the master bio once the I/O
* levels even if IO on some other mirrored buffer fails. * has safely reached all non-writemostly
* * disks. Setting the Returned bit ensures that this
* The 'master' represents the composite IO operation to * gets done only once -- we don't ever want to return
* user-side. So if something waits for IO, then it will * -EIO here, instead we'll wait
* wait for the 'master' bio. */
*/ if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
set_bit(R1BIO_Uptodate, &r1_bio->state); test_bit(R1BIO_Uptodate, &r1_bio->state)) {
/* Maybe we can return now */
update_head_pos(mirror, r1_bio); if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
struct bio *mbio = r1_bio->master_bio;
if (behind) { PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) (unsigned long long) mbio->bi_sector,
atomic_dec(&r1_bio->behind_remaining); (unsigned long long) mbio->bi_sector +
(mbio->bi_size >> 9) - 1);
/* In behind mode, we ACK the master bio once the I/O has safely bio_endio(mbio, 0);
* reached all non-writemostly disks. Setting the Returned bit
* ensures that this gets done only once -- we don't ever want to
* return -EIO here, instead we'll wait */
if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
test_bit(R1BIO_Uptodate, &r1_bio->state)) {
/* Maybe we can return now */
if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
struct bio *mbio = r1_bio->master_bio;
PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
(unsigned long long) mbio->bi_sector,
(unsigned long long) mbio->bi_sector +
(mbio->bi_size >> 9) - 1);
bio_endio(mbio, 0);
}
} }
} }
rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
} }
rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
/* /*
*
* Let's see if all mirrored write operations have finished * Let's see if all mirrored write operations have finished
* already. * already.
*/ */
if (atomic_dec_and_test(&r1_bio->remaining)) { if (atomic_dec_and_test(&r1_bio->remaining)) {
if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
reschedule_retry(r1_bio); /* free extra copy of the data pages */
else { int i = bio->bi_vcnt;
/* it really is the end of this request */ while (i--)
if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { safe_put_page(bio->bi_io_vec[i].bv_page);
/* free extra copy of the data pages */
int i = bio->bi_vcnt;
while (i--)
safe_put_page(bio->bi_io_vec[i].bv_page);
}
/* clear the bitmap if all writes complete successfully */
bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
r1_bio->sectors,
!test_bit(R1BIO_Degraded, &r1_bio->state),
behind);
md_write_end(r1_bio->mddev);
raid_end_bio_io(r1_bio);
} }
/* clear the bitmap if all writes complete successfully */
bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
r1_bio->sectors,
!test_bit(R1BIO_Degraded, &r1_bio->state),
behind);
md_write_end(r1_bio->mddev);
raid_end_bio_io(r1_bio);
} }
if (to_put) if (to_put)
...@@ -788,16 +779,13 @@ static int make_request(mddev_t *mddev, struct bio * bio) ...@@ -788,16 +779,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
struct page **behind_pages = NULL; struct page **behind_pages = NULL;
const int rw = bio_data_dir(bio); const int rw = bio_data_dir(bio);
const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
unsigned long do_barriers; const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
mdk_rdev_t *blocked_rdev; mdk_rdev_t *blocked_rdev;
/* /*
* Register the new request and wait if the reconstruction * Register the new request and wait if the reconstruction
* thread has put up a bar for new requests. * thread has put up a bar for new requests.
* Continue immediately if no resync is active currently. * Continue immediately if no resync is active currently.
* We test barriers_work *after* md_write_start as md_write_start
* may cause the first superblock write, and that will check out
* if barriers work.
*/ */
md_write_start(mddev, bio); /* wait on superblock update early */ md_write_start(mddev, bio); /* wait on superblock update early */
...@@ -821,13 +809,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) ...@@ -821,13 +809,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
} }
finish_wait(&conf->wait_barrier, &w); finish_wait(&conf->wait_barrier, &w);
} }
if (unlikely(!mddev->barriers_work &&
(bio->bi_rw & REQ_HARDBARRIER))) {
if (rw == WRITE)
md_write_end(mddev);
bio_endio(bio, -EOPNOTSUPP);
return 0;
}
wait_barrier(conf); wait_barrier(conf);
...@@ -959,10 +940,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) ...@@ -959,10 +940,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
atomic_set(&r1_bio->remaining, 0); atomic_set(&r1_bio->remaining, 0);
atomic_set(&r1_bio->behind_remaining, 0); atomic_set(&r1_bio->behind_remaining, 0);
do_barriers = bio->bi_rw & REQ_HARDBARRIER;
if (do_barriers)
set_bit(R1BIO_Barrier, &r1_bio->state);
bio_list_init(&bl); bio_list_init(&bl);
for (i = 0; i < disks; i++) { for (i = 0; i < disks; i++) {
struct bio *mbio; struct bio *mbio;
...@@ -975,7 +952,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) ...@@ -975,7 +952,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
mbio->bi_end_io = raid1_end_write_request; mbio->bi_end_io = raid1_end_write_request;
mbio->bi_rw = WRITE | do_barriers | do_sync; mbio->bi_rw = WRITE | do_flush_fua | do_sync;
mbio->bi_private = r1_bio; mbio->bi_private = r1_bio;
if (behind_pages) { if (behind_pages) {
...@@ -1634,41 +1611,6 @@ static void raid1d(mddev_t *mddev) ...@@ -1634,41 +1611,6 @@ static void raid1d(mddev_t *mddev)
if (test_bit(R1BIO_IsSync, &r1_bio->state)) { if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
sync_request_write(mddev, r1_bio); sync_request_write(mddev, r1_bio);
unplug = 1; unplug = 1;
} else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
/* some requests in the r1bio were REQ_HARDBARRIER
* requests which failed with -EOPNOTSUPP. Hohumm..
* Better resubmit without the barrier.
* We know which devices to resubmit for, because
* all others have had their bios[] entry cleared.
* We already have a nr_pending reference on these rdevs.
*/
int i;
const unsigned long do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC);
clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
clear_bit(R1BIO_Barrier, &r1_bio->state);
for (i=0; i < conf->raid_disks; i++)
if (r1_bio->bios[i])
atomic_inc(&r1_bio->remaining);
for (i=0; i < conf->raid_disks; i++)
if (r1_bio->bios[i]) {
struct bio_vec *bvec;
int j;
bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
/* copy pages from the failed bio, as
* this might be a write-behind device */
__bio_for_each_segment(bvec, bio, j, 0)
bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
bio_put(r1_bio->bios[i]);
bio->bi_sector = r1_bio->sector +
conf->mirrors[i].rdev->data_offset;
bio->bi_bdev = conf->mirrors[i].rdev->bdev;
bio->bi_end_io = raid1_end_write_request;
bio->bi_rw = WRITE | do_sync;
bio->bi_private = r1_bio;
r1_bio->bios[i] = bio;
generic_make_request(bio);
}
} else { } else {
int disk; int disk;
......
...@@ -117,8 +117,6 @@ struct r1bio_s { ...@@ -117,8 +117,6 @@ struct r1bio_s {
#define R1BIO_IsSync 1 #define R1BIO_IsSync 1
#define R1BIO_Degraded 2 #define R1BIO_Degraded 2
#define R1BIO_BehindIO 3 #define R1BIO_BehindIO 3
#define R1BIO_Barrier 4
#define R1BIO_BarrierRetry 5
/* For write-behind requests, we call bi_end_io when /* For write-behind requests, we call bi_end_io when
* the last non-write-behind device completes, providing * the last non-write-behind device completes, providing
* any write was successful. Otherwise we call when * any write was successful. Otherwise we call when
......
...@@ -800,12 +800,13 @@ static int make_request(mddev_t *mddev, struct bio * bio) ...@@ -800,12 +800,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
int chunk_sects = conf->chunk_mask + 1; int chunk_sects = conf->chunk_mask + 1;
const int rw = bio_data_dir(bio); const int rw = bio_data_dir(bio);
const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
struct bio_list bl; struct bio_list bl;
unsigned long flags; unsigned long flags;
mdk_rdev_t *blocked_rdev; mdk_rdev_t *blocked_rdev;
if (unlikely(bio->bi_rw & REQ_HARDBARRIER)) { if (unlikely(bio->bi_rw & REQ_FLUSH)) {
md_barrier_request(mddev, bio); md_flush_request(mddev, bio);
return 0; return 0;
} }
...@@ -965,7 +966,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) ...@@ -965,7 +966,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
conf->mirrors[d].rdev->data_offset; conf->mirrors[d].rdev->data_offset;
mbio->bi_bdev = conf->mirrors[d].rdev->bdev; mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
mbio->bi_end_io = raid10_end_write_request; mbio->bi_end_io = raid10_end_write_request;
mbio->bi_rw = WRITE | do_sync; mbio->bi_rw = WRITE | do_sync | do_fua;
mbio->bi_private = r10_bio; mbio->bi_private = r10_bio;
atomic_inc(&r10_bio->remaining); atomic_inc(&r10_bio->remaining);
......
...@@ -506,9 +506,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -506,9 +506,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
int rw; int rw;
struct bio *bi; struct bio *bi;
mdk_rdev_t *rdev; mdk_rdev_t *rdev;
if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) {
rw = WRITE; if (test_and_clear_bit(R5_WantFUA, &sh->dev[i].flags))
else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) rw = WRITE_FUA;
else
rw = WRITE;
} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
rw = READ; rw = READ;
else else
continue; continue;
...@@ -1031,6 +1034,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) ...@@ -1031,6 +1034,8 @@ ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
while (wbi && wbi->bi_sector < while (wbi && wbi->bi_sector <
dev->sector + STRIPE_SECTORS) { dev->sector + STRIPE_SECTORS) {
if (wbi->bi_rw & REQ_FUA)
set_bit(R5_WantFUA, &dev->flags);
tx = async_copy_data(1, wbi, dev->page, tx = async_copy_data(1, wbi, dev->page,
dev->sector, tx); dev->sector, tx);
wbi = r5_next_bio(wbi, dev->sector); wbi = r5_next_bio(wbi, dev->sector);
...@@ -1048,15 +1053,22 @@ static void ops_complete_reconstruct(void *stripe_head_ref) ...@@ -1048,15 +1053,22 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
int pd_idx = sh->pd_idx; int pd_idx = sh->pd_idx;
int qd_idx = sh->qd_idx; int qd_idx = sh->qd_idx;
int i; int i;
bool fua = false;
pr_debug("%s: stripe %llu\n", __func__, pr_debug("%s: stripe %llu\n", __func__,
(unsigned long long)sh->sector); (unsigned long long)sh->sector);
for (i = disks; i--; )
fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
for (i = disks; i--; ) { for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i]; struct r5dev *dev = &sh->dev[i];
if (dev->written || i == pd_idx || i == qd_idx) if (dev->written || i == pd_idx || i == qd_idx) {
set_bit(R5_UPTODATE, &dev->flags); set_bit(R5_UPTODATE, &dev->flags);
if (fua)
set_bit(R5_WantFUA, &dev->flags);
}
} }
if (sh->reconstruct_state == reconstruct_state_drain_run) if (sh->reconstruct_state == reconstruct_state_drain_run)
...@@ -3281,7 +3293,7 @@ static void handle_stripe5(struct stripe_head *sh) ...@@ -3281,7 +3293,7 @@ static void handle_stripe5(struct stripe_head *sh)
if (dec_preread_active) { if (dec_preread_active) {
/* We delay this until after ops_run_io so that if make_request /* We delay this until after ops_run_io so that if make_request
* is waiting on a barrier, it won't continue until the writes * is waiting on a flush, it won't continue until the writes
* have actually been submitted. * have actually been submitted.
*/ */
atomic_dec(&conf->preread_active_stripes); atomic_dec(&conf->preread_active_stripes);
...@@ -3583,7 +3595,7 @@ static void handle_stripe6(struct stripe_head *sh) ...@@ -3583,7 +3595,7 @@ static void handle_stripe6(struct stripe_head *sh)
if (dec_preread_active) { if (dec_preread_active) {
/* We delay this until after ops_run_io so that if make_request /* We delay this until after ops_run_io so that if make_request
* is waiting on a barrier, it won't continue until the writes * is waiting on a flush, it won't continue until the writes
* have actually been submitted. * have actually been submitted.
*/ */
atomic_dec(&conf->preread_active_stripes); atomic_dec(&conf->preread_active_stripes);
...@@ -3978,14 +3990,8 @@ static int make_request(mddev_t *mddev, struct bio * bi) ...@@ -3978,14 +3990,8 @@ static int make_request(mddev_t *mddev, struct bio * bi)
const int rw = bio_data_dir(bi); const int rw = bio_data_dir(bi);
int remaining; int remaining;
if (unlikely(bi->bi_rw & REQ_HARDBARRIER)) { if (unlikely(bi->bi_rw & REQ_FLUSH)) {
/* Drain all pending writes. We only really need md_flush_request(mddev, bi);
* to ensure they have been submitted, but this is
* easier.
*/
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
md_barrier_request(mddev, bi);
return 0; return 0;
} }
...@@ -4103,7 +4109,7 @@ static int make_request(mddev_t *mddev, struct bio * bi) ...@@ -4103,7 +4109,7 @@ static int make_request(mddev_t *mddev, struct bio * bi)
finish_wait(&conf->wait_for_overlap, &w); finish_wait(&conf->wait_for_overlap, &w);
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
clear_bit(STRIPE_DELAYED, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state);
if (mddev->barrier && if ((bi->bi_rw & REQ_SYNC) &&
!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) !test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
atomic_inc(&conf->preread_active_stripes); atomic_inc(&conf->preread_active_stripes);
release_stripe(sh); release_stripe(sh);
...@@ -4126,13 +4132,6 @@ static int make_request(mddev_t *mddev, struct bio * bi) ...@@ -4126,13 +4132,6 @@ static int make_request(mddev_t *mddev, struct bio * bi)
bio_endio(bi, 0); bio_endio(bi, 0);
} }
if (mddev->barrier) {
/* We need to wait for the stripes to all be handled.
* So: wait for preread_active_stripes to drop to 0.
*/
wait_event(mddev->thread->wqueue,
atomic_read(&conf->preread_active_stripes) == 0);
}
return 0; return 0;
} }
......
...@@ -275,6 +275,7 @@ struct r6_state { ...@@ -275,6 +275,7 @@ struct r6_state {
* filling * filling
*/ */
#define R5_Wantdrain 13 /* dev->towrite needs to be drained */ #define R5_Wantdrain 13 /* dev->towrite needs to be drained */
#define R5_WantFUA 14 /* Write should be FUA */
/* /*
* Write method * Write method
*/ */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment