Commit 47f521ba authored by Linus Torvalds

Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md

Pull MD update from Shaohua Li:
 "This update mostly includes bug fixes:

   - md-cluster now supports raid10 from Guoqing

   - raid5 PPL fixes from Artur

   - badblock regression fix from Bo

   - suspend hang related fixes from Neil

   - raid5 reshape fixes from Neil

   - raid1 freeze deadlock fix from Nate

   - memleak fixes from Zdenek

   - bitmap related fixes from Me and Tao

   - other fixes and cleanups"

* 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md: (33 commits)
  md: free unused memory after bitmap resize
  md: release allocated bitset sync_set
  md/bitmap: clear BITMAP_WRITE_ERROR bit before writing it to sb
  md: be cautious about using ->curr_resync_completed for ->recovery_offset
  badblocks: fix wrong return value in badblocks_set if badblocks are disabled
  md: don't check MD_SB_CHANGE_CLEAN in md_allow_write
  md-cluster: update document for raid10
  md: remove redundant variable q
  raid1: remove obsolete code in raid1_write_request
  md-cluster: Use a small window for raid10 resync
  md-cluster: Suspend writes in RAID10 if within range
  md-cluster/raid10: set "do_balance = 0" if area is resyncing
  md: use lockdep_assert_held
  raid1: prevent freeze_array/wait_all_barriers deadlock
  md: use TASK_IDLE instead of blocking signals
  md: remove special meaning of ->quiesce(.., 2)
  md: allow metadata update while suspending.
  md: use mddev_suspend/resume instead of ->quiesce()
  md: move suspend_hi/lo handling into core md code
  md: don't call bitmap_create() while array is quiesced.
  ...
parents b91593fa 0868b99c
The cluster MD is a shared-device RAID for a cluster.
The cluster MD is a shared-device RAID for a cluster; it supports
two levels: raid1 and raid10 (limited support).
1. On-disk format
......
......@@ -4103,6 +4103,8 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm.git
T: quilt http://people.redhat.com/agk/patches/linux/editing/
S: Maintained
F: Documentation/device-mapper/
F: drivers/md/Makefile
F: drivers/md/Kconfig
F: drivers/md/dm*
F: drivers/md/persistent-data/
F: include/linux/device-mapper.h
......@@ -12487,7 +12489,10 @@ M: Shaohua Li <shli@kernel.org>
L: linux-raid@vger.kernel.org
T: git git://git.kernel.org/pub/scm/linux/kernel/git/shli/md.git
S: Supported
F: drivers/md/
F: drivers/md/Makefile
F: drivers/md/Kconfig
F: drivers/md/md*
F: drivers/md/raid*
F: include/linux/raid/
F: include/uapi/linux/raid/
......
......@@ -178,7 +178,7 @@ int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
if (bb->shift < 0)
/* badblocks are disabled */
return 0;
return 1;
if (bb->shift) {
/* round the start down, and the end up */
......
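For context on the badblocks_set() hunk above: the function's contract is to return 0 when the bad range was recorded and 1 when it could not be, so returning 0 while badblocks are disabled (bb->shift < 0) made callers believe the range had been stored. A minimal, hypothetical caller sketch; the wrapper name and error code below are illustrative, not from the patch:

#include <linux/badblocks.h>
#include <linux/errno.h>

/* Hypothetical wrapper, not from the patch: record a bad range and let the
 * caller decide whether the device must be failed. */
static int record_bad_range(struct badblocks *bb, sector_t s, int sectors)
{
	/* badblocks_set() returns 0 on success and non-zero on failure;
	 * with badblocks disabled it must report failure, otherwise this
	 * wrapper would wrongly claim the range is being tracked. */
	if (badblocks_set(bb, s, sectors, 0 /* not acknowledged yet */))
		return -EIO;
	return 0;
}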
......@@ -178,7 +178,7 @@ config MD_FAULTY
config MD_CLUSTER
tristate "Cluster Support for MD (EXPERIMENTAL)"
tristate "Cluster Support for MD"
depends on BLK_DEV_MD
depends on DLM
default n
......@@ -188,7 +188,8 @@ config MD_CLUSTER
nodes in the cluster can access the MD devices simultaneously.
This brings the redundancy (and uptime) of RAID levels across the
nodes of the cluster.
nodes of the cluster. Currently, it can work with raid1 and raid10
(limited support).
If unsure, say N.
......
......@@ -19,9 +19,12 @@ dm-cache-y += dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \
dm-cache-smq-y += dm-cache-policy-smq.o
dm-era-y += dm-era-target.o
dm-verity-y += dm-verity-target.o
md-mod-y += md.o bitmap.o
md-mod-y += md.o md-bitmap.o
raid456-y += raid5.o raid5-cache.o raid5-ppl.o
dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o
linear-y += md-linear.o
multipath-y += md-multipath.o
faulty-y += md-faulty.o
# Note: link order is important. All raid personalities
# must come before md.o, as they each initialise
......
......@@ -12,7 +12,7 @@
#include "raid1.h"
#include "raid5.h"
#include "raid10.h"
#include "bitmap.h"
#include "md-bitmap.h"
#include <linux/device-mapper.h>
......@@ -3630,8 +3630,11 @@ static void raid_postsuspend(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags))
if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
mddev_lock_nointr(&rs->md);
mddev_suspend(&rs->md);
mddev_unlock(&rs->md);
}
rs->md.ro = 1;
}
......@@ -3888,8 +3891,11 @@ static void raid_resume(struct dm_target *ti)
if (!(rs->ctr_flags & RESUME_STAY_FROZEN_FLAGS))
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags))
if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
mddev_lock_nointr(mddev);
mddev_resume(mddev);
mddev_unlock(mddev);
}
}
static struct target_type raid_target = {
......
......@@ -29,7 +29,7 @@
#include <linux/seq_file.h>
#include <trace/events/block.h>
#include "md.h"
#include "bitmap.h"
#include "md-bitmap.h"
static inline char *bmname(struct bitmap *bitmap)
{
......@@ -459,7 +459,11 @@ void bitmap_update_sb(struct bitmap *bitmap)
/* rocking back to read-only */
bitmap->events_cleared = bitmap->mddev->events;
sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
sb->state = cpu_to_le32(bitmap->flags);
/*
* clear BITMAP_WRITE_ERROR bit to protect against the case that
* a bitmap write error occurred but the later writes succeeded.
*/
sb->state = cpu_to_le32(bitmap->flags & ~BIT(BITMAP_WRITE_ERROR));
/* Just in case these have been changed via sysfs: */
sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
......@@ -625,7 +629,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
err = read_sb_page(bitmap->mddev,
offset,
sb_page,
0, PAGE_SIZE);
0, sizeof(bitmap_super_t));
}
if (err)
return err;
......@@ -1816,6 +1820,12 @@ struct bitmap *bitmap_create(struct mddev *mddev, int slot)
BUG_ON(file && mddev->bitmap_info.offset);
if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
pr_notice("md/raid:%s: array with journal cannot have bitmap\n",
mdname(mddev));
return ERR_PTR(-EBUSY);
}
bitmap = kzalloc(sizeof(*bitmap), GFP_KERNEL);
if (!bitmap)
return ERR_PTR(-ENOMEM);
......@@ -2123,7 +2133,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
if (store.sb_page && bitmap->storage.sb_page)
memcpy(page_address(store.sb_page),
page_address(bitmap->storage.sb_page),
PAGE_SIZE);
sizeof(bitmap_super_t));
bitmap_file_unmap(&bitmap->storage);
bitmap->storage = store;
......@@ -2152,6 +2162,7 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
for (k = 0; k < page; k++) {
kfree(new_bp[k].map);
}
kfree(new_bp);
/* restore some fields from old_counts */
bitmap->counts.bp = old_counts.bp;
......@@ -2202,6 +2213,14 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks,
block += old_blocks;
}
if (bitmap->counts.bp != old_counts.bp) {
unsigned long k;
for (k = 0; k < old_counts.pages; k++)
if (!old_counts.bp[k].hijacked)
kfree(old_counts.bp[k].map);
kfree(old_counts.bp);
}
if (!init) {
int i;
while (block < (chunks << chunkshift)) {
......
......@@ -15,7 +15,7 @@
#include <linux/sched.h>
#include <linux/raid/md_p.h>
#include "md.h"
#include "bitmap.h"
#include "md-bitmap.h"
#include "md-cluster.h"
#define LVB_SIZE 64
......@@ -442,10 +442,11 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
static void remove_suspend_info(struct mddev *mddev, int slot)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
mddev->pers->quiesce(mddev, 1);
spin_lock_irq(&cinfo->suspend_lock);
__remove_suspend_info(cinfo, slot);
spin_unlock_irq(&cinfo->suspend_lock);
mddev->pers->quiesce(mddev, 2);
mddev->pers->quiesce(mddev, 0);
}
......@@ -492,13 +493,12 @@ static void process_suspend_info(struct mddev *mddev,
s->lo = lo;
s->hi = hi;
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
spin_lock_irq(&cinfo->suspend_lock);
/* Remove existing entry (if exists) before adding */
__remove_suspend_info(cinfo, slot);
list_add(&s->list, &cinfo->suspend_list);
spin_unlock_irq(&cinfo->suspend_lock);
mddev->pers->quiesce(mddev, 2);
mddev->pers->quiesce(mddev, 0);
}
static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
......@@ -1094,7 +1094,7 @@ static void metadata_update_cancel(struct mddev *mddev)
/*
* return 0 if all the bitmaps have the same sync_size
*/
int cluster_check_sync_size(struct mddev *mddev)
static int cluster_check_sync_size(struct mddev *mddev)
{
int i, rv;
bitmap_super_t *sb;
......@@ -1478,7 +1478,7 @@ static struct md_cluster_operations cluster_ops = {
static int __init cluster_init(void)
{
pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n");
pr_warn("md-cluster: support raid1 and raid10 (limited support)\n");
pr_info("Registering Cluster MD functions\n");
register_md_cluster_operations(&cluster_ops, THIS_MODULE);
return 0;
......
......@@ -23,7 +23,7 @@
#include <linux/slab.h>
#include <trace/events/block.h>
#include "md.h"
#include "linear.h"
#include "md-linear.h"
/*
* find which device holds a particular offset
......
......@@ -25,7 +25,7 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include "md.h"
#include "multipath.h"
#include "md-multipath.h"
#define MAX_WORK_PER_DISK 128
......@@ -243,7 +243,6 @@ static void print_multipath_conf (struct mpconf *conf)
static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct mpconf *conf = mddev->private;
struct request_queue *q;
int err = -EEXIST;
int path;
struct multipath_info *p;
......@@ -257,7 +256,6 @@ static int multipath_add_disk(struct mddev *mddev, struct md_rdev *rdev)
for (path = first; path <= last; path++)
if ((p=conf->multipaths+path)->rdev == NULL) {
q = rdev->bdev->bd_disk->queue;
disk_stack_limits(mddev->gendisk, rdev->bdev,
rdev->data_offset << 9);
......
......@@ -237,6 +237,12 @@ enum mddev_flags {
*/
MD_HAS_PPL, /* The raid array has PPL feature set */
MD_HAS_MULTIPLE_PPLS, /* The raid array has multiple PPLs feature set */
MD_ALLOW_SB_UPDATE, /* md_check_recovery is allowed to update
* the metadata without taking reconfig_mutex.
*/
MD_UPDATING_SB, /* md_check_recovery is updating the metadata
* without explicitly holding reconfig_mutex.
*/
};
enum mddev_sb_flags {
......@@ -494,11 +500,6 @@ static inline void mddev_lock_nointr(struct mddev *mddev)
mutex_lock(&mddev->reconfig_mutex);
}
static inline int mddev_is_locked(struct mddev *mddev)
{
return mutex_is_locked(&mddev->reconfig_mutex);
}
static inline int mddev_trylock(struct mddev *mddev)
{
return mutex_trylock(&mddev->reconfig_mutex);
......@@ -538,12 +539,11 @@ struct md_personality
int (*check_reshape) (struct mddev *mddev);
int (*start_reshape) (struct mddev *mddev);
void (*finish_reshape) (struct mddev *mddev);
/* quiesce moves between quiescence states
* 0 - fully active
* 1 - no new requests allowed
* others - reserved
/* quiesce suspends or resumes internal processing.
* 1 - stop new actions and wait for action io to complete
* 0 - return to normal behaviour
*/
void (*quiesce) (struct mddev *mddev, int state);
void (*quiesce) (struct mddev *mddev, int quiesce);
/* takeover is used to transition an array from one
* personality to another. The new personality must be able
* to handle the data in the current layout.
......
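The md.h hunk above narrows ->quiesce() to a plain on/off contract: 1 drains and blocks internal activity, 0 resumes it, and the old special state 2 disappears because suspend_lo/suspend_hi handling now lives in core md code. A sketch of how a hypothetical personality would look under the new contract; every name below is a placeholder, not from the patch:

#include "md.h"

/* Placeholder per-array state for an imaginary personality. */
struct example_conf {
	bool frozen;
};

static void example_freeze(struct example_conf *conf)
{
	/* A real personality would also wait for in-flight IO here. */
	conf->frozen = true;
}

static void example_unfreeze(struct example_conf *conf)
{
	conf->frozen = false;
}

/* quiesce == 1: stop new actions and wait; quiesce == 0: resume. */
static void example_quiesce(struct mddev *mddev, int quiesce)
{
	struct example_conf *conf = mddev->private;

	if (quiesce)
		example_freeze(conf);
	else
		example_unfreeze(conf);
}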
......@@ -768,7 +768,7 @@ static void *raid0_takeover(struct mddev *mddev)
return ERR_PTR(-EINVAL);
}
static void raid0_quiesce(struct mddev *mddev, int state)
static void raid0_quiesce(struct mddev *mddev, int quiesce)
{
}
......
......@@ -37,13 +37,12 @@
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
#include <linux/sched/signal.h>
#include <trace/events/block.h>
#include "md.h"
#include "raid1.h"
#include "bitmap.h"
#include "md-bitmap.h"
#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
......@@ -990,14 +989,6 @@ static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
_wait_barrier(conf, idx);
}
static void wait_all_barriers(struct r1conf *conf)
{
int idx;
for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
_wait_barrier(conf, idx);
}
static void _allow_barrier(struct r1conf *conf, int idx)
{
atomic_dec(&conf->nr_pending[idx]);
......@@ -1011,14 +1002,6 @@ static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
_allow_barrier(conf, idx);
}
static void allow_all_barriers(struct r1conf *conf)
{
int idx;
for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
_allow_barrier(conf, idx);
}
/* conf->resync_lock should be held */
static int get_unqueued_pending(struct r1conf *conf)
{
......@@ -1303,42 +1286,28 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
int first_clone;
int max_sectors;
/*
* Register the new request and wait if the reconstruction
* thread has put up a bar for new requests.
* Continue immediately if no resync is active currently.
*/
if ((bio_end_sector(bio) > mddev->suspend_lo &&
bio->bi_iter.bi_sector < mddev->suspend_hi) ||
(mddev_is_clustered(mddev) &&
if (mddev_is_clustered(mddev) &&
md_cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector, bio_end_sector(bio)))) {
bio->bi_iter.bi_sector, bio_end_sector(bio))) {
/*
* As the suspend_* range is controlled by userspace, we want
* an interruptible wait.
*/
DEFINE_WAIT(w);
for (;;) {
sigset_t full, old;
prepare_to_wait(&conf->wait_barrier,
&w, TASK_INTERRUPTIBLE);
if (bio_end_sector(bio) <= mddev->suspend_lo ||
bio->bi_iter.bi_sector >= mddev->suspend_hi ||
(mddev_is_clustered(mddev) &&
!md_cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector,
bio_end_sector(bio))))
&w, TASK_IDLE);
if (!md_cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector,
bio_end_sector(bio)))
break;
sigfillset(&full);
sigprocmask(SIG_BLOCK, &full, &old);
schedule();
sigprocmask(SIG_SETMASK, &old, NULL);
}
finish_wait(&conf->wait_barrier, &w);
}
/*
* Register the new request and wait if the reconstruction
* thread has put up a bar for new requests.
* Continue immediately if no resync is active currently.
*/
wait_barrier(conf, bio->bi_iter.bi_sector);
r1_bio = alloc_r1bio(mddev, bio);
......@@ -1654,8 +1623,12 @@ static void print_conf(struct r1conf *conf)
static void close_sync(struct r1conf *conf)
{
wait_all_barriers(conf);
allow_all_barriers(conf);
int idx;
for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++) {
_wait_barrier(conf, idx);
_allow_barrier(conf, idx);
}
mempool_destroy(conf->r1buf_pool);
conf->r1buf_pool = NULL;
......@@ -3277,21 +3250,14 @@ static int raid1_reshape(struct mddev *mddev)
return 0;
}
static void raid1_quiesce(struct mddev *mddev, int state)
static void raid1_quiesce(struct mddev *mddev, int quiesce)
{
struct r1conf *conf = mddev->private;
switch(state) {
case 2: /* wake for suspend */
wake_up(&conf->wait_barrier);
break;
case 1:
if (quiesce)
freeze_array(conf, 0);
break;
case 0:
else
unfreeze_array(conf);
break;
}
}
static void *raid1_takeover(struct mddev *mddev)
......
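Two patterns recur in the raid1.c hunks above: the write path now sleeps in TASK_IDLE instead of blocking all signals around a TASK_INTERRUPTIBLE wait, and close_sync() now takes and releases one barrier bucket at a time instead of holding every bucket at once, which is what could deadlock against freeze_array(). A generic sketch of the TASK_IDLE wait loop; the wait queue and condition callback are placeholders, not md code:

#include <linux/sched.h>
#include <linux/types.h>
#include <linux/wait.h>

/* TASK_IDLE is an uninterruptible sleep that is not counted in the load
 * average, so no signal blocking is needed around the wait. */
static void wait_until(wait_queue_head_t *wq, bool (*cond)(void *), void *arg)
{
	DEFINE_WAIT(w);

	for (;;) {
		prepare_to_wait(wq, &w, TASK_IDLE);
		if (cond(arg))
			break;
		schedule();
	}
	finish_wait(wq, &w);
}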
......@@ -29,7 +29,7 @@
#include "md.h"
#include "raid10.h"
#include "raid0.h"
#include "bitmap.h"
#include "md-bitmap.h"
/*
* RAID10 provides a combination of RAID0 and RAID1 functionality.
......@@ -136,10 +136,13 @@ static void r10bio_pool_free(void *r10_bio, void *data)
kfree(r10_bio);
}
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
/* amount of memory to reserve for resync requests */
#define RESYNC_WINDOW (1024*1024)
/* maximum number of concurrent requests, memory permitting */
#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
#define CLUSTER_RESYNC_WINDOW (32 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
/*
* When performing a resync, we need to read and compare, so
......@@ -383,12 +386,11 @@ static void raid10_end_read_request(struct bio *bio)
{
int uptodate = !bio->bi_status;
struct r10bio *r10_bio = bio->bi_private;
int slot, dev;
int slot;
struct md_rdev *rdev;
struct r10conf *conf = r10_bio->mddev->private;
slot = r10_bio->read_slot;
dev = r10_bio->devs[slot].devnum;
rdev = r10_bio->devs[slot].rdev;
/*
* this branch is our 'one mirror IO has finished' event handler:
......@@ -748,7 +750,6 @@ static struct md_rdev *read_balance(struct r10conf *conf,
raid10_find_phys(conf, r10_bio);
rcu_read_lock();
sectors = r10_bio->sectors;
best_slot = -1;
best_rdev = NULL;
best_dist = MaxSector;
......@@ -761,8 +762,11 @@ static struct md_rdev *read_balance(struct r10conf *conf,
* the resync window. We take the first readable disk when
* above the resync window.
*/
if (conf->mddev->recovery_cp < MaxSector
&& (this_sector + sectors >= conf->next_resync))
if ((conf->mddev->recovery_cp < MaxSector
&& (this_sector + sectors >= conf->next_resync)) ||
(mddev_is_clustered(conf->mddev) &&
md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
this_sector + sectors)))
do_balance = 0;
for (slot = 0; slot < conf->copies ; slot++) {
......@@ -1293,6 +1297,22 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
sector_t sectors;
int max_sectors;
if ((mddev_is_clustered(mddev) &&
md_cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector,
bio_end_sector(bio)))) {
DEFINE_WAIT(w);
for (;;) {
prepare_to_wait(&conf->wait_barrier,
&w, TASK_IDLE);
if (!md_cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector, bio_end_sector(bio)))
break;
schedule();
}
finish_wait(&conf->wait_barrier, &w);
}
/*
* Register the new request and wait if the reconstruction
* thread has put up a bar for new requests.
......@@ -2575,7 +2595,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
struct bio *bio;
struct r10conf *conf = mddev->private;
struct md_rdev *rdev = r10_bio->devs[slot].rdev;
sector_t bio_last_sector;
/* we got a read error. Maybe the drive is bad. Maybe just
* the block and we can fix it.
......@@ -2586,7 +2605,6 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
* frozen.
*/
bio = r10_bio->devs[slot].bio;
bio_last_sector = r10_bio->devs[slot].addr + rdev->data_offset + r10_bio->sectors;
bio_put(bio);
r10_bio->devs[slot].bio = NULL;
......@@ -2825,6 +2843,43 @@ static struct r10bio *raid10_alloc_init_r10buf(struct r10conf *conf)
return r10bio;
}
/*
* Set cluster_sync_high since we need other nodes to add the
* range [cluster_sync_low, cluster_sync_high] to suspend list.
*/
static void raid10_set_cluster_sync_high(struct r10conf *conf)
{
sector_t window_size;
int extra_chunk, chunks;
/*
* Define a "stripe" as a unit that spans all member devices once, so
* the number of chunks per stripe is raid_disks / near_copies.
* Otherwise, if near_copies were close to raid_disks, the resync
* window would grow linearly with raid_disks and we would suspend a
* much larger IO range than necessary. If raid_disks is not divisible
* by near_copies, an extra chunk is needed so the whole "stripe" is
* covered.
*/
chunks = conf->geo.raid_disks / conf->geo.near_copies;
if (conf->geo.raid_disks % conf->geo.near_copies == 0)
extra_chunk = 0;
else
extra_chunk = 1;
window_size = (chunks + extra_chunk) * conf->mddev->chunk_sectors;
/*
* At least use a 32M window to align with raid1's resync window
*/
window_size = (CLUSTER_RESYNC_WINDOW_SECTORS > window_size) ?
CLUSTER_RESYNC_WINDOW_SECTORS : window_size;
conf->cluster_sync_high = conf->cluster_sync_low + window_size;
}
/*
* perform a "sync" on one "block"
*
......@@ -2897,6 +2952,9 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
max_sector = mddev->resync_max_sectors;
if (sector_nr >= max_sector) {
conf->cluster_sync_low = 0;
conf->cluster_sync_high = 0;
/* If we aborted, we need to abort the
* sync on the 'current' bitmap chucks (there can
* be several when recovering multiple devices).
......@@ -3251,7 +3309,17 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
/* resync. Schedule a read for every block at this virt offset */
int count = 0;
bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0);
/*
* curr_resync_completed may lag, and cluster_sync_low is set from it,
* so check against "sector_nr + 2 * RESYNC_SECTORS" to be safe; this
* ensures curr_resync_completed gets updated in bitmap_cond_end_sync.
*/
bitmap_cond_end_sync(mddev->bitmap, sector_nr,
mddev_is_clustered(mddev) &&
(sector_nr + 2 * RESYNC_SECTORS >
conf->cluster_sync_high));
if (!bitmap_start_sync(mddev->bitmap, sector_nr,
&sync_blocks, mddev->degraded) &&
......@@ -3385,6 +3453,52 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
} while (++page_idx < RESYNC_PAGES);
r10_bio->sectors = nr_sectors;
if (mddev_is_clustered(mddev) &&
test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
/* It is resync not recovery */
if (conf->cluster_sync_high < sector_nr + nr_sectors) {
conf->cluster_sync_low = mddev->curr_resync_completed;
raid10_set_cluster_sync_high(conf);
/* Send resync message */
md_cluster_ops->resync_info_update(mddev,
conf->cluster_sync_low,
conf->cluster_sync_high);
}
} else if (mddev_is_clustered(mddev)) {
/* This is recovery not resync */
sector_t sect_va1, sect_va2;
bool broadcast_msg = false;
for (i = 0; i < conf->geo.raid_disks; i++) {
/*
* sector_nr is a device address for recovery, so we
* need translate it to array address before compare
* with cluster_sync_high.
*/
sect_va1 = raid10_find_virt(conf, sector_nr, i);
if (conf->cluster_sync_high < sect_va1 + nr_sectors) {
broadcast_msg = true;
/*
* curr_resync_completed is similar as
* sector_nr, so make the translation too.
*/
sect_va2 = raid10_find_virt(conf,
mddev->curr_resync_completed, i);
if (conf->cluster_sync_low == 0 ||
conf->cluster_sync_low > sect_va2)
conf->cluster_sync_low = sect_va2;
}
}
if (broadcast_msg) {
raid10_set_cluster_sync_high(conf);
md_cluster_ops->resync_info_update(mddev,
conf->cluster_sync_low,
conf->cluster_sync_high);
}
}
while (biolist) {
bio = biolist;
biolist = biolist->bi_next;
......@@ -3644,6 +3758,18 @@ static int raid10_run(struct mddev *mddev)
if (!conf)
goto out;
if (mddev_is_clustered(conf->mddev)) {
int fc, fo;
fc = (mddev->layout >> 8) & 255;
fo = mddev->layout & (1<<16);
if (fc > 1 || fo > 0) {
pr_err("only near layout is supported by clustered"
" raid10\n");
goto out;
}
}
mddev->thread = conf->thread;
conf->thread = NULL;
......@@ -3832,18 +3958,14 @@ static void raid10_free(struct mddev *mddev, void *priv)
kfree(conf);
}
static void raid10_quiesce(struct mddev *mddev, int state)
static void raid10_quiesce(struct mddev *mddev, int quiesce)
{
struct r10conf *conf = mddev->private;
switch(state) {
case 1:
if (quiesce)
raise_barrier(conf, 0);
break;
case 0:
else
lower_barrier(conf);
break;
}
}
static int raid10_resize(struct mddev *mddev, sector_t sectors)
......@@ -4578,15 +4700,18 @@ static int handle_reshape_read_error(struct mddev *mddev,
/* Use sync reads to get the blocks from somewhere else */
int sectors = r10_bio->sectors;
struct r10conf *conf = mddev->private;
struct {
struct r10bio r10_bio;
struct r10dev devs[conf->copies];
} on_stack;
struct r10bio *r10b = &on_stack.r10_bio;
struct r10bio *r10b;
int slot = 0;
int idx = 0;
struct page **pages;
r10b = kmalloc(sizeof(*r10b) +
sizeof(struct r10dev) * conf->copies, GFP_NOIO);
if (!r10b) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
return -ENOMEM;
}
/* reshape IOs share pages from .devs[0].bio */
pages = get_resync_pages(r10_bio->devs[0].bio)->pages;
......@@ -4635,11 +4760,13 @@ static int handle_reshape_read_error(struct mddev *mddev,
/* couldn't read this block, must give up */
set_bit(MD_RECOVERY_INTR,
&mddev->recovery);
kfree(r10b);
return -EIO;
}
sectors -= s;
idx++;
}
kfree(r10b);
return 0;
}
......
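To make the window arithmetic in raid10_set_cluster_sync_high() above concrete: with 5 member disks in a near-2 layout and 512 KiB chunks (1024 sectors), chunks = 5 / 2 = 2, an extra chunk is added because 5 % 2 != 0, so window_size = 3 * 1024 = 3072 sectors; that is below the 32 MiB floor mentioned in the comment (65536 sectors), so cluster_sync_high ends up at cluster_sync_low + 65536. A standalone re-derivation of the same calculation; the geometry numbers are made up for illustration:

#include <stdio.h>

#define CLUSTER_RESYNC_WINDOW_SECTORS	((32 * 1024 * 1024) >> 9)	/* 32 MiB in 512-byte sectors */

static unsigned long cluster_window_sectors(int raid_disks, int near_copies,
					    unsigned long chunk_sectors)
{
	int chunks = raid_disks / near_copies;
	int extra_chunk = (raid_disks % near_copies) ? 1 : 0;
	unsigned long window = (chunks + extra_chunk) * chunk_sectors;

	/* Never suspend less than the 32 MiB cluster resync window. */
	return window > CLUSTER_RESYNC_WINDOW_SECTORS ?
		window : CLUSTER_RESYNC_WINDOW_SECTORS;
}

int main(void)
{
	/* 5 disks, near-2 copies, 1024-sector chunks -> 3072 sectors,
	 * so the 32 MiB floor (65536 sectors) is used instead. */
	printf("window = %lu sectors\n", cluster_window_sectors(5, 2, 1024));
	return 0;
}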
......@@ -89,6 +89,12 @@ struct r10conf {
* the new thread here until we fully activate the array.
*/
struct md_thread *thread;
/*
* Keep track of cluster resync window to send to other nodes.
*/
sector_t cluster_sync_low;
sector_t cluster_sync_high;
};
/*
......
......@@ -23,7 +23,7 @@
#include <linux/types.h>
#include "md.h"
#include "raid5.h"
#include "bitmap.h"
#include "md-bitmap.h"
#include "raid5-log.h"
/*
......@@ -539,7 +539,7 @@ static void r5l_log_run_stripes(struct r5l_log *log)
{
struct r5l_io_unit *io, *next;
assert_spin_locked(&log->io_list_lock);
lockdep_assert_held(&log->io_list_lock);
list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
/* don't change list order */
......@@ -555,7 +555,7 @@ static void r5l_move_to_end_ios(struct r5l_log *log)
{
struct r5l_io_unit *io, *next;
assert_spin_locked(&log->io_list_lock);
lockdep_assert_held(&log->io_list_lock);
list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
/* don't change list order */
......@@ -693,6 +693,8 @@ static void r5c_disable_writeback_async(struct work_struct *work)
struct r5l_log *log = container_of(work, struct r5l_log,
disable_writeback_work);
struct mddev *mddev = log->rdev->mddev;
struct r5conf *conf = mddev->private;
int locked = 0;
if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
return;
......@@ -701,11 +703,15 @@ static void r5c_disable_writeback_async(struct work_struct *work)
/* wait superblock change before suspend */
wait_event(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
mddev_suspend(mddev);
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
mddev_resume(mddev);
conf->log == NULL ||
(!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) &&
(locked = mddev_trylock(mddev))));
if (locked) {
mddev_suspend(mddev);
log->r5c_journal_mode = R5C_JOURNAL_MODE_WRITE_THROUGH;
mddev_resume(mddev);
mddev_unlock(mddev);
}
}
static void r5l_submit_current_io(struct r5l_log *log)
......@@ -1194,7 +1200,7 @@ static void r5l_run_no_mem_stripe(struct r5l_log *log)
{
struct stripe_head *sh;
assert_spin_locked(&log->io_list_lock);
lockdep_assert_held(&log->io_list_lock);
if (!list_empty(&log->no_mem_stripes)) {
sh = list_first_entry(&log->no_mem_stripes,
......@@ -1210,7 +1216,7 @@ static bool r5l_complete_finished_ios(struct r5l_log *log)
struct r5l_io_unit *io, *next;
bool found = false;
assert_spin_locked(&log->io_list_lock);
lockdep_assert_held(&log->io_list_lock);
list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
/* don't change list order */
......@@ -1382,7 +1388,7 @@ static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh)
* raid5_release_stripe() while holding conf->device_lock
*/
BUG_ON(test_bit(STRIPE_ON_RELEASE_LIST, &sh->state));
assert_spin_locked(&conf->device_lock);
lockdep_assert_held(&conf->device_lock);
list_del_init(&sh->lru);
atomic_inc(&sh->count);
......@@ -1409,7 +1415,7 @@ void r5c_flush_cache(struct r5conf *conf, int num)
int count;
struct stripe_head *sh, *next;
assert_spin_locked(&conf->device_lock);
lockdep_assert_held(&conf->device_lock);
if (!conf->log)
return;
......@@ -1583,21 +1589,21 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
md_wakeup_thread(log->reclaim_thread);
}
void r5l_quiesce(struct r5l_log *log, int state)
void r5l_quiesce(struct r5l_log *log, int quiesce)
{
struct mddev *mddev;
if (!log || state == 2)
if (!log)
return;
if (state == 0)
kthread_unpark(log->reclaim_thread->tsk);
else if (state == 1) {
if (quiesce) {
/* make sure r5l_write_super_and_discard_space exits */
mddev = log->rdev->mddev;
wake_up(&mddev->sb_wait);
kthread_park(log->reclaim_thread->tsk);
r5l_wake_reclaim(log, MaxSector);
r5l_do_reclaim(log);
}
} else
kthread_unpark(log->reclaim_thread->tsk);
}
bool r5l_log_disk_error(struct r5conf *conf)
......@@ -3165,6 +3171,8 @@ void r5l_exit_log(struct r5conf *conf)
conf->log = NULL;
synchronize_rcu();
/* Ensure disable_writeback_work wakes up and exits */
wake_up(&conf->mddev->sb_wait);
flush_work(&log->disable_writeback_work);
md_unregister_thread(&log->reclaim_thread);
mempool_destroy(log->meta_pool);
......
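The assert_spin_locked() to lockdep_assert_held() conversions above change what is asserted: assert_spin_locked() only verifies that the lock is currently held by someone, whereas lockdep_assert_held() verifies that the current context holds it and compiles to nothing when lockdep is not configured. A minimal sketch; the lock and function are illustrative only:

#include <linux/lockdep.h>
#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);	/* placeholder lock, not from the patch */

/* Caller must hold example_lock. */
static void example_locked_work(void)
{
	/* Old style: only checks that the lock is locked by someone. */
	assert_spin_locked(&example_lock);

	/* New style: checks that *this* context holds the lock, and is a
	 * no-op when lockdep (CONFIG_LOCKDEP) is disabled. */
	lockdep_assert_held(&example_lock);
}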
......@@ -9,7 +9,7 @@ extern void r5l_write_stripe_run(struct r5l_log *log);
extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
extern void r5l_stripe_write_finished(struct stripe_head *sh);
extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
extern void r5l_quiesce(struct r5l_log *log, int state);
extern void r5l_quiesce(struct r5l_log *log, int quiesce);
extern bool r5l_log_disk_error(struct r5conf *conf);
extern bool r5c_is_writeback(struct r5l_log *log);
extern int
......
......@@ -758,7 +758,8 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e,
(unsigned long long)sector);
rdev = conf->disks[dd_idx].rdev;
if (!rdev) {
if (!rdev || (!test_bit(In_sync, &rdev->flags) &&
sector >= rdev->recovery_offset)) {
pr_debug("%s:%*s data member disk %d missing\n",
__func__, indent, "", dd_idx);
update_parity = false;
......@@ -1296,8 +1297,7 @@ int ppl_init_log(struct r5conf *conf)
if (ret) {
goto err;
} else if (!mddev->pers &&
mddev->recovery_cp == 0 && !mddev->degraded &&
} else if (!mddev->pers && mddev->recovery_cp == 0 &&
ppl_conf->recovered_entries > 0 &&
ppl_conf->mismatch_count == 0) {
/*
......
......@@ -55,7 +55,6 @@
#include <linux/ratelimit.h>
#include <linux/nodemask.h>
#include <linux/flex_array.h>
#include <linux/sched/signal.h>
#include <trace/events/block.h>
#include <linux/list_sort.h>
......@@ -63,7 +62,7 @@
#include "md.h"
#include "raid5.h"
#include "raid0.h"
#include "bitmap.h"
#include "md-bitmap.h"
#include "raid5-log.h"
#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)
......@@ -1818,8 +1817,11 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
struct r5dev *dev = &sh->dev[i];
if (dev->written || i == pd_idx || i == qd_idx) {
if (!discard && !test_bit(R5_SkipCopy, &dev->flags))
if (!discard && !test_bit(R5_SkipCopy, &dev->flags)) {
set_bit(R5_UPTODATE, &dev->flags);
if (test_bit(STRIPE_EXPAND_READY, &sh->state))
set_bit(R5_Expanded, &dev->flags);
}
if (fua)
set_bit(R5_WantFUA, &dev->flags);
if (sync)
......@@ -5682,28 +5684,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
goto retry;
}
if (rw == WRITE &&
logical_sector >= mddev->suspend_lo &&
logical_sector < mddev->suspend_hi) {
raid5_release_stripe(sh);
/* As the suspend_* range is controlled by
* userspace, we want an interruptible
* wait.
*/
prepare_to_wait(&conf->wait_for_overlap,
&w, TASK_INTERRUPTIBLE);
if (logical_sector >= mddev->suspend_lo &&
logical_sector < mddev->suspend_hi) {
sigset_t full, old;
sigfillset(&full);
sigprocmask(SIG_BLOCK, &full, &old);
schedule();
sigprocmask(SIG_SETMASK, &old, NULL);
do_prepare = true;
}
goto retry;
}
if (test_bit(STRIPE_EXPANDING, &sh->state) ||
!add_stripe_bio(sh, bi, dd_idx, rw, previous)) {
/* Stripe is busy expanding or
......@@ -5758,6 +5738,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
*/
struct r5conf *conf = mddev->private;
struct stripe_head *sh;
struct md_rdev *rdev;
sector_t first_sector, last_sector;
int raid_disks = conf->previous_raid_disks;
int data_disks = raid_disks - conf->max_degraded;
......@@ -5880,6 +5861,15 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
return 0;
mddev->reshape_position = conf->reshape_progress;
mddev->curr_resync_completed = sector_nr;
if (!mddev->reshape_backwards)
/* Can update recovery_offset */
rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < sector_nr)
rdev->recovery_offset = sector_nr;
conf->reshape_checkpoint = jiffies;
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
md_wakeup_thread(mddev->thread);
......@@ -5978,6 +5968,14 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
goto ret;
mddev->reshape_position = conf->reshape_progress;
mddev->curr_resync_completed = sector_nr;
if (!mddev->reshape_backwards)
/* Can update recovery_offset */
rdev_for_each(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < sector_nr)
rdev->recovery_offset = sector_nr;
conf->reshape_checkpoint = jiffies;
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
md_wakeup_thread(mddev->thread);
......@@ -7156,6 +7154,13 @@ static int raid5_run(struct mddev *mddev)
min_offset_diff = diff;
}
if ((test_bit(MD_HAS_JOURNAL, &mddev->flags) || journal_dev) &&
(mddev->bitmap_info.offset || mddev->bitmap_info.file)) {
pr_notice("md/raid:%s: array cannot have both journal and bitmap\n",
mdname(mddev));
return -EINVAL;
}
if (mddev->reshape_position != MaxSector) {
/* Check that we can continue the reshape.
* Difficulties arise if the stripe we would write to
......@@ -7958,6 +7963,7 @@ static void end_reshape(struct r5conf *conf)
{
if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
struct md_rdev *rdev;
spin_lock_irq(&conf->device_lock);
conf->previous_raid_disks = conf->raid_disks;
......@@ -7965,6 +7971,11 @@ static void end_reshape(struct r5conf *conf)
smp_wmb();
conf->reshape_progress = MaxSector;
conf->mddev->reshape_position = MaxSector;
rdev_for_each(rdev, conf->mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags))
rdev->recovery_offset = MaxSector;
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
......@@ -8020,16 +8031,12 @@ static void raid5_finish_reshape(struct mddev *mddev)
}
}
static void raid5_quiesce(struct mddev *mddev, int state)
static void raid5_quiesce(struct mddev *mddev, int quiesce)
{
struct r5conf *conf = mddev->private;
switch(state) {
case 2: /* resume for a suspend */
wake_up(&conf->wait_for_overlap);
break;
case 1: /* stop all writes */
if (quiesce) {
/* stop all writes */
lock_all_device_hash_locks_irq(conf);
/* '2' tells resync/reshape to pause so that all
* active stripes can drain
......@@ -8045,17 +8052,15 @@ static void raid5_quiesce(struct mddev *mddev, int state)
unlock_all_device_hash_locks_irq(conf);
/* allow reshape to continue */
wake_up(&conf->wait_for_overlap);
break;
case 0: /* re-enable writes */
} else {
/* re-enable writes */
lock_all_device_hash_locks_irq(conf);
conf->quiesce = 0;
wake_up(&conf->wait_for_quiescent);
wake_up(&conf->wait_for_overlap);
unlock_all_device_hash_locks_irq(conf);
break;
}
r5l_quiesce(conf->log, state);
r5l_quiesce(conf->log, quiesce);
}
static void *raid45_takeover_raid0(struct mddev *mddev, int level)
......