Commit ac322de6 authored by Linus Torvalds

Merge tag 'md/4.4' of git://neil.brown.name/md

Pull md updates from Neil Brown:
 "Two major components to this update.

   1) The clustered-raid1 support from SUSE is nearly complete.  There
      are a few outstanding issues being worked on.  Maybe half a dozen
      patches will bring this to a usable state.

   2) The first stage of journalled-raid5 support from Facebook makes an
      appearance.  With a journal device configured (typically NVRAM or
      SSD), the "RAID5 write hole" should be closed - a crash during
      degraded operations cannot result in data corruption.

      The next stage will be to use the journal as a write-behind cache
      so that latency can be reduced and in some cases throughput
       increased by performing more full-stripe writes."
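
For orientation before the diff: the journal support below gives a device an explicit journal role in the v1.x superblock, alongside the long-standing spare and faulty encodings, and gates it on a new MD_FEATURE_JOURNAL feature bit. The stand-alone sketch that follows decodes a dev_roles[] entry the way the new super_1_validate() hunk does. The numeric values follow the MD_DISK_ROLE_* constants this series introduces (check include/uapi/linux/raid/md_p.h if in doubt); everything else is illustrative, not kernel code.

/* Illustrative only: decode an md v1.x dev_roles[] entry the way the new
 * super_1_validate() logic does.  Role values follow the MD_DISK_ROLE_*
 * constants added by this series; verify against md_p.h before relying
 * on the exact numbers.
 */
#include <stdint.h>
#include <stdio.h>

#define MD_DISK_ROLE_SPARE   0xffff
#define MD_DISK_ROLE_FAULTY  0xfffe
#define MD_DISK_ROLE_JOURNAL 0xfffd
#define MD_DISK_ROLE_MAX     0xff00  /* roles at or above this are special */

static const char *role_name(uint16_t role)
{
	switch (role) {
	case MD_DISK_ROLE_SPARE:   return "spare";
	case MD_DISK_ROLE_FAULTY:  return "faulty";
	case MD_DISK_ROLE_JOURNAL: return "journal";  /* new with MD_FEATURE_JOURNAL */
	default:                   return "active data/parity slot";
	}
}

int main(void)
{
	uint16_t samples[] = { 0, 3, MD_DISK_ROLE_JOURNAL,
			       MD_DISK_ROLE_FAULTY, MD_DISK_ROLE_SPARE };
	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("role 0x%04x -> %s\n", samples[i], role_name(samples[i]));
	return 0;
}

User space gained a corresponding way to create such arrays (mdadm's --write-journal option) around the same time; that work is outside this pull.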

* tag 'md/4.4' of git://neil.brown.name/md: (66 commits)
  MD: when RAID journal is missing/faulty, block RESTART_ARRAY_RW
  MD: set journal disk ->raid_disk
  MD: kick out journal disk if it's not fresh
  raid5-cache: start raid5 readonly if journal is missing
  MD: add new bit to indicate raid array with journal
  raid5-cache: IO error handling
  raid5: journal disk can't be removed
  raid5-cache: add trim support for log
  MD: fix info output for journal disk
  raid5-cache: use bio chaining
  raid5-cache: small log->seq cleanup
  raid5-cache: new helper: r5_reserve_log_entry
  raid5-cache: inline r5l_alloc_io_unit into r5l_new_meta
  raid5-cache: take rdev->data_offset into account early on
  raid5-cache: refactor bio allocation
  raid5-cache: clean up r5l_get_meta
  raid5-cache: simplify state machine when caches flushes are not needed
  raid5-cache: factor out a helper to run all stripes for an I/O unit
  raid5-cache: rename flushed_ios to finished_ios
  raid5-cache: free I/O units earlier
  ...
parents ccf21b69 339421de
...@@ -17,7 +17,7 @@ dm-cache-smq-y += dm-cache-policy-smq.o ...@@ -17,7 +17,7 @@ dm-cache-smq-y += dm-cache-policy-smq.o
dm-cache-cleaner-y += dm-cache-policy-cleaner.o dm-cache-cleaner-y += dm-cache-policy-cleaner.o
dm-era-y += dm-era-target.o dm-era-y += dm-era-target.o
md-mod-y += md.o bitmap.o md-mod-y += md.o bitmap.o
raid456-y += raid5.o raid456-y += raid5.o raid5-cache.o
# Note: link order is important. All raid personalities # Note: link order is important. All raid personalities
# and must come before md.o, as they each initialise # and must come before md.o, as they each initialise
......
...@@ -613,12 +613,10 @@ static int bitmap_read_sb(struct bitmap *bitmap) ...@@ -613,12 +613,10 @@ static int bitmap_read_sb(struct bitmap *bitmap)
daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
write_behind = le32_to_cpu(sb->write_behind); write_behind = le32_to_cpu(sb->write_behind);
sectors_reserved = le32_to_cpu(sb->sectors_reserved); sectors_reserved = le32_to_cpu(sb->sectors_reserved);
/* XXX: This is a hack to ensure that we don't use clustering /* Setup nodes/clustername only if bitmap version is
* in case: * cluster-compatible
* - dm-raid is in use and
* - the nodes written in bitmap_sb is erroneous.
*/ */
if (!bitmap->mddev->sync_super) { if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) {
nodes = le32_to_cpu(sb->nodes); nodes = le32_to_cpu(sb->nodes);
strlcpy(bitmap->mddev->bitmap_info.cluster_name, strlcpy(bitmap->mddev->bitmap_info.cluster_name,
sb->cluster_name, 64); sb->cluster_name, 64);
...@@ -628,7 +626,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) ...@@ -628,7 +626,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
reason = "bad magic"; reason = "bad magic";
else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO || else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
le32_to_cpu(sb->version) > BITMAP_MAJOR_HI) le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED)
reason = "unrecognized superblock version"; reason = "unrecognized superblock version";
else if (chunksize < 512) else if (chunksize < 512)
reason = "bitmap chunksize too small"; reason = "bitmap chunksize too small";
...@@ -1572,7 +1570,7 @@ void bitmap_close_sync(struct bitmap *bitmap) ...@@ -1572,7 +1570,7 @@ void bitmap_close_sync(struct bitmap *bitmap)
} }
EXPORT_SYMBOL(bitmap_close_sync); EXPORT_SYMBOL(bitmap_close_sync);
void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
{ {
sector_t s = 0; sector_t s = 0;
sector_t blocks; sector_t blocks;
...@@ -1583,7 +1581,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) ...@@ -1583,7 +1581,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
bitmap->last_end_sync = jiffies; bitmap->last_end_sync = jiffies;
return; return;
} }
if (time_before(jiffies, (bitmap->last_end_sync if (!force && time_before(jiffies, (bitmap->last_end_sync
+ bitmap->mddev->bitmap_info.daemon_sleep))) + bitmap->mddev->bitmap_info.daemon_sleep)))
return; return;
wait_event(bitmap->mddev->recovery_wait, wait_event(bitmap->mddev->recovery_wait,
......
...@@ -9,8 +9,10 @@ ...@@ -9,8 +9,10 @@
#define BITMAP_MAJOR_LO 3 #define BITMAP_MAJOR_LO 3
/* version 4 insists the bitmap is in little-endian order /* version 4 insists the bitmap is in little-endian order
* with version 3, it is host-endian which is non-portable * with version 3, it is host-endian which is non-portable
* Version 5 is currently set only for clustered devices
*/ */
#define BITMAP_MAJOR_HI 4 #define BITMAP_MAJOR_HI 4
#define BITMAP_MAJOR_CLUSTERED 5
#define BITMAP_MAJOR_HOSTENDIAN 3 #define BITMAP_MAJOR_HOSTENDIAN 3
/* /*
...@@ -255,7 +257,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, ...@@ -255,7 +257,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded); int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded);
void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted); void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted);
void bitmap_close_sync(struct bitmap *bitmap); void bitmap_close_sync(struct bitmap *bitmap);
void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector); void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force);
void bitmap_unplug(struct bitmap *bitmap); void bitmap_unplug(struct bitmap *bitmap);
void bitmap_daemon_work(struct mddev *mddev); void bitmap_daemon_work(struct mddev *mddev);
......
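A side note on the bitmap change above: bitmap_cond_end_sync() now takes a force flag, so callers at the end of a resync can flush immediately instead of waiting out the daemon_sleep rate limit. Below is a toy user-space model of that calling convention; nothing here is kernel code, and daemon_sleep plus the printouts are made up for illustration.

/* Toy model of the new "force" parameter to bitmap_cond_end_sync() above:
 * the periodic path is rate-limited by daemon_sleep, while force=true
 * bypasses the rate limit.  Purely illustrative.
 */
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static time_t last_end_sync;
static const int daemon_sleep = 5;   /* seconds, stand-in for bitmap_info.daemon_sleep */

static void cond_end_sync(bool force)
{
	time_t now = time(NULL);

	if (!force && now < last_end_sync + daemon_sleep)
		return;               /* too soon, skip the expensive flush */

	printf("flushing sync state (%s)\n", force ? "forced" : "periodic");
	last_end_sync = now;
}

int main(void)
{
	cond_end_sync(false);   /* runs: first call */
	cond_end_sync(false);   /* skipped: rate-limited */
	cond_end_sync(true);    /* runs: forced, e.g. at the end of a resync */
	return 0;
}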
...@@ -28,6 +28,7 @@ struct dlm_lock_resource { ...@@ -28,6 +28,7 @@ struct dlm_lock_resource {
struct completion completion; /* completion for synchronized locking */ struct completion completion; /* completion for synchronized locking */
void (*bast)(void *arg, int mode); /* blocking AST function pointer*/ void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
struct mddev *mddev; /* pointing back to mddev. */ struct mddev *mddev; /* pointing back to mddev. */
int mode;
}; };
struct suspend_info { struct suspend_info {
...@@ -53,8 +54,8 @@ struct md_cluster_info { ...@@ -53,8 +54,8 @@ struct md_cluster_info {
dlm_lockspace_t *lockspace; dlm_lockspace_t *lockspace;
int slot_number; int slot_number;
struct completion completion; struct completion completion;
struct mutex sb_mutex;
struct dlm_lock_resource *bitmap_lockres; struct dlm_lock_resource *bitmap_lockres;
struct dlm_lock_resource *resync_lockres;
struct list_head suspend_list; struct list_head suspend_list;
spinlock_t suspend_lock; spinlock_t suspend_lock;
struct md_thread *recovery_thread; struct md_thread *recovery_thread;
...@@ -79,20 +80,20 @@ enum msg_type { ...@@ -79,20 +80,20 @@ enum msg_type {
}; };
struct cluster_msg { struct cluster_msg {
int type; __le32 type;
int slot; __le32 slot;
/* TODO: Unionize this for smaller footprint */ /* TODO: Unionize this for smaller footprint */
sector_t low; __le64 low;
sector_t high; __le64 high;
char uuid[16]; char uuid[16];
int raid_slot; __le32 raid_slot;
}; };
static void sync_ast(void *arg) static void sync_ast(void *arg)
{ {
struct dlm_lock_resource *res; struct dlm_lock_resource *res;
res = (struct dlm_lock_resource *) arg; res = arg;
complete(&res->completion); complete(&res->completion);
} }
...@@ -106,6 +107,8 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode) ...@@ -106,6 +107,8 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
if (ret) if (ret)
return ret; return ret;
wait_for_completion(&res->completion); wait_for_completion(&res->completion);
if (res->lksb.sb_status == 0)
res->mode = mode;
return res->lksb.sb_status; return res->lksb.sb_status;
} }
...@@ -127,6 +130,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev, ...@@ -127,6 +130,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
init_completion(&res->completion); init_completion(&res->completion);
res->ls = cinfo->lockspace; res->ls = cinfo->lockspace;
res->mddev = mddev; res->mddev = mddev;
res->mode = DLM_LOCK_IV;
namelen = strlen(name); namelen = strlen(name);
res->name = kzalloc(namelen + 1, GFP_KERNEL); res->name = kzalloc(namelen + 1, GFP_KERNEL);
if (!res->name) { if (!res->name) {
...@@ -191,7 +195,7 @@ static void lockres_free(struct dlm_lock_resource *res) ...@@ -191,7 +195,7 @@ static void lockres_free(struct dlm_lock_resource *res)
kfree(res); kfree(res);
} }
static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres, static void add_resync_info(struct dlm_lock_resource *lockres,
sector_t lo, sector_t hi) sector_t lo, sector_t hi)
{ {
struct resync_info *ri; struct resync_info *ri;
...@@ -210,7 +214,7 @@ static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_loc ...@@ -210,7 +214,7 @@ static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_loc
dlm_lock_sync(lockres, DLM_LOCK_CR); dlm_lock_sync(lockres, DLM_LOCK_CR);
memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info)); memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
hi = le64_to_cpu(ri.hi); hi = le64_to_cpu(ri.hi);
if (ri.hi > 0) { if (hi > 0) {
s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
if (!s) if (!s)
goto out; goto out;
...@@ -345,7 +349,7 @@ static const struct dlm_lockspace_ops md_ls_ops = { ...@@ -345,7 +349,7 @@ static const struct dlm_lockspace_ops md_ls_ops = {
*/ */
static void ack_bast(void *arg, int mode) static void ack_bast(void *arg, int mode)
{ {
struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg; struct dlm_lock_resource *res = arg;
struct md_cluster_info *cinfo = res->mddev->cluster_info; struct md_cluster_info *cinfo = res->mddev->cluster_info;
if (mode == DLM_LOCK_EX) if (mode == DLM_LOCK_EX)
...@@ -358,29 +362,32 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot) ...@@ -358,29 +362,32 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
if (slot == s->slot) { if (slot == s->slot) {
pr_info("%s:%d Deleting suspend_info: %d\n",
__func__, __LINE__, slot);
list_del(&s->list); list_del(&s->list);
kfree(s); kfree(s);
break; break;
} }
} }
static void remove_suspend_info(struct md_cluster_info *cinfo, int slot) static void remove_suspend_info(struct mddev *mddev, int slot)
{ {
struct md_cluster_info *cinfo = mddev->cluster_info;
spin_lock_irq(&cinfo->suspend_lock); spin_lock_irq(&cinfo->suspend_lock);
__remove_suspend_info(cinfo, slot); __remove_suspend_info(cinfo, slot);
spin_unlock_irq(&cinfo->suspend_lock); spin_unlock_irq(&cinfo->suspend_lock);
mddev->pers->quiesce(mddev, 2);
} }
static void process_suspend_info(struct md_cluster_info *cinfo, static void process_suspend_info(struct mddev *mddev,
int slot, sector_t lo, sector_t hi) int slot, sector_t lo, sector_t hi)
{ {
struct md_cluster_info *cinfo = mddev->cluster_info;
struct suspend_info *s; struct suspend_info *s;
if (!hi) { if (!hi) {
remove_suspend_info(cinfo, slot); remove_suspend_info(mddev, slot);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
return; return;
} }
s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
...@@ -389,11 +396,14 @@ static void process_suspend_info(struct md_cluster_info *cinfo, ...@@ -389,11 +396,14 @@ static void process_suspend_info(struct md_cluster_info *cinfo,
s->slot = slot; s->slot = slot;
s->lo = lo; s->lo = lo;
s->hi = hi; s->hi = hi;
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
spin_lock_irq(&cinfo->suspend_lock); spin_lock_irq(&cinfo->suspend_lock);
/* Remove existing entry (if exists) before adding */ /* Remove existing entry (if exists) before adding */
__remove_suspend_info(cinfo, slot); __remove_suspend_info(cinfo, slot);
list_add(&s->list, &cinfo->suspend_list); list_add(&s->list, &cinfo->suspend_list);
spin_unlock_irq(&cinfo->suspend_lock); spin_unlock_irq(&cinfo->suspend_lock);
mddev->pers->quiesce(mddev, 2);
} }
static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
...@@ -407,7 +417,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) ...@@ -407,7 +417,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
len = snprintf(disk_uuid, 64, "DEVICE_UUID="); len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
sprintf(disk_uuid + len, "%pU", cmsg->uuid); sprintf(disk_uuid + len, "%pU", cmsg->uuid);
snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot); snprintf(raid_slot, 16, "RAID_DISK=%d", le32_to_cpu(cmsg->raid_slot));
pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot); pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
init_completion(&cinfo->newdisk_completion); init_completion(&cinfo->newdisk_completion);
set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state); set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
...@@ -421,64 +431,59 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg) ...@@ -421,64 +431,59 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg) static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
{ {
struct md_cluster_info *cinfo = mddev->cluster_info; struct md_cluster_info *cinfo = mddev->cluster_info;
md_reload_sb(mddev, le32_to_cpu(msg->raid_slot));
md_reload_sb(mddev);
dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
} }
static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg) static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
{ {
struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot); struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
le32_to_cpu(msg->raid_slot));
if (rdev) if (rdev)
md_kick_rdev_from_array(rdev); md_kick_rdev_from_array(rdev);
else else
pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot); pr_warn("%s: %d Could not find disk(%d) to REMOVE\n",
__func__, __LINE__, le32_to_cpu(msg->raid_slot));
} }
static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg) static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
{ {
struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot); struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
le32_to_cpu(msg->raid_slot));
if (rdev && test_bit(Faulty, &rdev->flags)) if (rdev && test_bit(Faulty, &rdev->flags))
clear_bit(Faulty, &rdev->flags); clear_bit(Faulty, &rdev->flags);
else else
pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot); pr_warn("%s: %d Could not find disk(%d) which is faulty",
__func__, __LINE__, le32_to_cpu(msg->raid_slot));
} }
static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg) static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
{ {
switch (msg->type) { if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot),
"node %d received it's own msg\n", le32_to_cpu(msg->slot)))
return;
switch (le32_to_cpu(msg->type)) {
case METADATA_UPDATED: case METADATA_UPDATED:
pr_info("%s: %d Received message: METADATA_UPDATE from %d\n",
__func__, __LINE__, msg->slot);
process_metadata_update(mddev, msg); process_metadata_update(mddev, msg);
break; break;
case RESYNCING: case RESYNCING:
pr_info("%s: %d Received message: RESYNCING from %d\n", process_suspend_info(mddev, le32_to_cpu(msg->slot),
__func__, __LINE__, msg->slot); le64_to_cpu(msg->low),
process_suspend_info(mddev->cluster_info, msg->slot, le64_to_cpu(msg->high));
msg->low, msg->high);
break; break;
case NEWDISK: case NEWDISK:
pr_info("%s: %d Received message: NEWDISK from %d\n",
__func__, __LINE__, msg->slot);
process_add_new_disk(mddev, msg); process_add_new_disk(mddev, msg);
break; break;
case REMOVE: case REMOVE:
pr_info("%s: %d Received REMOVE from %d\n",
__func__, __LINE__, msg->slot);
process_remove_disk(mddev, msg); process_remove_disk(mddev, msg);
break; break;
case RE_ADD: case RE_ADD:
pr_info("%s: %d Received RE_ADD from %d\n",
__func__, __LINE__, msg->slot);
process_readd_disk(mddev, msg); process_readd_disk(mddev, msg);
break; break;
case BITMAP_NEEDS_SYNC: case BITMAP_NEEDS_SYNC:
pr_info("%s: %d Received BITMAP_NEEDS_SYNC from %d\n", __recover_slot(mddev, le32_to_cpu(msg->slot));
__func__, __LINE__, msg->slot);
__recover_slot(mddev, msg->slot);
break; break;
default: default:
pr_warn("%s:%d Received unknown message from %d\n", pr_warn("%s:%d Received unknown message from %d\n",
...@@ -528,11 +533,17 @@ static void recv_daemon(struct md_thread *thread) ...@@ -528,11 +533,17 @@ static void recv_daemon(struct md_thread *thread)
/* lock_comm() /* lock_comm()
* Takes the lock on the TOKEN lock resource so no other * Takes the lock on the TOKEN lock resource so no other
* node can communicate while the operation is underway. * node can communicate while the operation is underway.
* If called again, and the TOKEN lock is alread in EX mode
* return success. However, care must be taken that unlock_comm()
* is called only once.
*/ */
static int lock_comm(struct md_cluster_info *cinfo) static int lock_comm(struct md_cluster_info *cinfo)
{ {
int error; int error;
if (cinfo->token_lockres->mode == DLM_LOCK_EX)
return 0;
error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX); error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
if (error) if (error)
pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n", pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
...@@ -542,6 +553,7 @@ static int lock_comm(struct md_cluster_info *cinfo) ...@@ -542,6 +553,7 @@ static int lock_comm(struct md_cluster_info *cinfo)
static void unlock_comm(struct md_cluster_info *cinfo) static void unlock_comm(struct md_cluster_info *cinfo)
{ {
WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
dlm_unlock_sync(cinfo->token_lockres); dlm_unlock_sync(cinfo->token_lockres);
} }
...@@ -696,7 +708,6 @@ static int join(struct mddev *mddev, int nodes) ...@@ -696,7 +708,6 @@ static int join(struct mddev *mddev, int nodes)
init_completion(&cinfo->completion); init_completion(&cinfo->completion);
set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state); set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
mutex_init(&cinfo->sb_mutex);
mddev->cluster_info = cinfo; mddev->cluster_info = cinfo;
memset(str, 0, 64); memset(str, 0, 64);
...@@ -753,6 +764,10 @@ static int join(struct mddev *mddev, int nodes) ...@@ -753,6 +764,10 @@ static int join(struct mddev *mddev, int nodes)
goto err; goto err;
} }
cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0);
if (!cinfo->resync_lockres)
goto err;
ret = gather_all_resync_info(mddev, nodes); ret = gather_all_resync_info(mddev, nodes);
if (ret) if (ret)
goto err; goto err;
...@@ -763,6 +778,7 @@ static int join(struct mddev *mddev, int nodes) ...@@ -763,6 +778,7 @@ static int join(struct mddev *mddev, int nodes)
lockres_free(cinfo->token_lockres); lockres_free(cinfo->token_lockres);
lockres_free(cinfo->ack_lockres); lockres_free(cinfo->ack_lockres);
lockres_free(cinfo->no_new_dev_lockres); lockres_free(cinfo->no_new_dev_lockres);
lockres_free(cinfo->resync_lockres);
lockres_free(cinfo->bitmap_lockres); lockres_free(cinfo->bitmap_lockres);
if (cinfo->lockspace) if (cinfo->lockspace)
dlm_release_lockspace(cinfo->lockspace, 2); dlm_release_lockspace(cinfo->lockspace, 2);
...@@ -771,12 +787,32 @@ static int join(struct mddev *mddev, int nodes) ...@@ -771,12 +787,32 @@ static int join(struct mddev *mddev, int nodes)
return ret; return ret;
} }
static void resync_bitmap(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg = {0};
int err;
cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
err = sendmsg(cinfo, &cmsg);
if (err)
pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n",
__func__, __LINE__, err);
}
static int leave(struct mddev *mddev) static int leave(struct mddev *mddev)
{ {
struct md_cluster_info *cinfo = mddev->cluster_info; struct md_cluster_info *cinfo = mddev->cluster_info;
if (!cinfo) if (!cinfo)
return 0; return 0;
/* BITMAP_NEEDS_SYNC message should be sent when node
* is leaving the cluster with dirty bitmap, also we
* can only deliver it when dlm connection is available */
if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector)
resync_bitmap(mddev);
md_unregister_thread(&cinfo->recovery_thread); md_unregister_thread(&cinfo->recovery_thread);
md_unregister_thread(&cinfo->recv_thread); md_unregister_thread(&cinfo->recv_thread);
lockres_free(cinfo->message_lockres); lockres_free(cinfo->message_lockres);
...@@ -799,15 +835,6 @@ static int slot_number(struct mddev *mddev) ...@@ -799,15 +835,6 @@ static int slot_number(struct mddev *mddev)
return cinfo->slot_number - 1; return cinfo->slot_number - 1;
} }
static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
/* Re-acquire the lock to refresh LVB */
dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
}
static int metadata_update_start(struct mddev *mddev) static int metadata_update_start(struct mddev *mddev)
{ {
return lock_comm(mddev->cluster_info); return lock_comm(mddev->cluster_info);
...@@ -817,59 +844,62 @@ static int metadata_update_finish(struct mddev *mddev) ...@@ -817,59 +844,62 @@ static int metadata_update_finish(struct mddev *mddev)
{ {
struct md_cluster_info *cinfo = mddev->cluster_info; struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg; struct cluster_msg cmsg;
int ret; struct md_rdev *rdev;
int ret = 0;
int raid_slot = -1;
memset(&cmsg, 0, sizeof(cmsg)); memset(&cmsg, 0, sizeof(cmsg));
cmsg.type = cpu_to_le32(METADATA_UPDATED); cmsg.type = cpu_to_le32(METADATA_UPDATED);
/* Pick up a good active device number to send.
*/
rdev_for_each(rdev, mddev)
if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) {
raid_slot = rdev->desc_nr;
break;
}
if (raid_slot >= 0) {
cmsg.raid_slot = cpu_to_le32(raid_slot);
ret = __sendmsg(cinfo, &cmsg); ret = __sendmsg(cinfo, &cmsg);
} else
pr_warn("md-cluster: No good device id found to send\n");
unlock_comm(cinfo); unlock_comm(cinfo);
return ret; return ret;
} }
static int metadata_update_cancel(struct mddev *mddev) static void metadata_update_cancel(struct mddev *mddev)
{ {
struct md_cluster_info *cinfo = mddev->cluster_info; struct md_cluster_info *cinfo = mddev->cluster_info;
unlock_comm(cinfo);
}
return dlm_unlock_sync(cinfo->token_lockres); static int resync_start(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
cinfo->resync_lockres->flags |= DLM_LKF_NOQUEUE;
return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX);
} }
static int resync_send(struct mddev *mddev, enum msg_type type, static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
sector_t lo, sector_t hi)
{ {
struct md_cluster_info *cinfo = mddev->cluster_info; struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg; struct cluster_msg cmsg = {0};
int slot = cinfo->slot_number - 1;
pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__, add_resync_info(cinfo->bitmap_lockres, lo, hi);
(unsigned long long)lo, /* Re-acquire the lock to refresh LVB */
(unsigned long long)hi); dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
resync_info_update(mddev, lo, hi); cmsg.type = cpu_to_le32(RESYNCING);
cmsg.type = cpu_to_le32(type);
cmsg.slot = cpu_to_le32(slot);
cmsg.low = cpu_to_le64(lo); cmsg.low = cpu_to_le64(lo);
cmsg.high = cpu_to_le64(hi); cmsg.high = cpu_to_le64(hi);
return sendmsg(cinfo, &cmsg);
}
static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi) return sendmsg(cinfo, &cmsg);
{
pr_info("%s:%d\n", __func__, __LINE__);
return resync_send(mddev, RESYNCING, lo, hi);
} }
static void resync_finish(struct mddev *mddev) static int resync_finish(struct mddev *mddev)
{ {
struct md_cluster_info *cinfo = mddev->cluster_info; struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg; cinfo->resync_lockres->flags &= ~DLM_LKF_NOQUEUE;
int slot = cinfo->slot_number - 1; dlm_unlock_sync(cinfo->resync_lockres);
return resync_info_update(mddev, 0, 0);
pr_info("%s:%d\n", __func__, __LINE__);
resync_send(mddev, RESYNCING, 0, 0);
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
cmsg.slot = cpu_to_le32(slot);
sendmsg(cinfo, &cmsg);
}
} }
static int area_resyncing(struct mddev *mddev, int direction, static int area_resyncing(struct mddev *mddev, int direction,
...@@ -896,7 +926,11 @@ static int area_resyncing(struct mddev *mddev, int direction, ...@@ -896,7 +926,11 @@ static int area_resyncing(struct mddev *mddev, int direction,
return ret; return ret;
} }
static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) /* add_new_disk() - initiates a disk add
* However, if this fails before writing md_update_sb(),
* add_new_disk_cancel() must be called to release token lock
*/
static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
{ {
struct md_cluster_info *cinfo = mddev->cluster_info; struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg; struct cluster_msg cmsg;
...@@ -907,7 +941,7 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) ...@@ -907,7 +941,7 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
memset(&cmsg, 0, sizeof(cmsg)); memset(&cmsg, 0, sizeof(cmsg));
cmsg.type = cpu_to_le32(NEWDISK); cmsg.type = cpu_to_le32(NEWDISK);
memcpy(cmsg.uuid, uuid, 16); memcpy(cmsg.uuid, uuid, 16);
cmsg.raid_slot = rdev->desc_nr; cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
lock_comm(cinfo); lock_comm(cinfo);
ret = __sendmsg(cinfo, &cmsg); ret = __sendmsg(cinfo, &cmsg);
if (ret) if (ret)
...@@ -918,22 +952,17 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev) ...@@ -918,22 +952,17 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
/* Some node does not "see" the device */ /* Some node does not "see" the device */
if (ret == -EAGAIN) if (ret == -EAGAIN)
ret = -ENOENT; ret = -ENOENT;
if (ret)
unlock_comm(cinfo);
else else
dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR); dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
return ret; return ret;
} }
static int add_new_disk_finish(struct mddev *mddev) static void add_new_disk_cancel(struct mddev *mddev)
{ {
struct cluster_msg cmsg;
struct md_cluster_info *cinfo = mddev->cluster_info; struct md_cluster_info *cinfo = mddev->cluster_info;
int ret;
/* Write sb and inform others */
md_update_sb(mddev, 1);
cmsg.type = METADATA_UPDATED;
ret = __sendmsg(cinfo, &cmsg);
unlock_comm(cinfo); unlock_comm(cinfo);
return ret;
} }
static int new_disk_ack(struct mddev *mddev, bool ack) static int new_disk_ack(struct mddev *mddev, bool ack)
...@@ -953,10 +982,10 @@ static int new_disk_ack(struct mddev *mddev, bool ack) ...@@ -953,10 +982,10 @@ static int new_disk_ack(struct mddev *mddev, bool ack)
static int remove_disk(struct mddev *mddev, struct md_rdev *rdev) static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{ {
struct cluster_msg cmsg; struct cluster_msg cmsg = {0};
struct md_cluster_info *cinfo = mddev->cluster_info; struct md_cluster_info *cinfo = mddev->cluster_info;
cmsg.type = REMOVE; cmsg.type = cpu_to_le32(REMOVE);
cmsg.raid_slot = rdev->desc_nr; cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
return __sendmsg(cinfo, &cmsg); return __sendmsg(cinfo, &cmsg);
} }
...@@ -964,12 +993,12 @@ static int gather_bitmaps(struct md_rdev *rdev) ...@@ -964,12 +993,12 @@ static int gather_bitmaps(struct md_rdev *rdev)
{ {
int sn, err; int sn, err;
sector_t lo, hi; sector_t lo, hi;
struct cluster_msg cmsg; struct cluster_msg cmsg = {0};
struct mddev *mddev = rdev->mddev; struct mddev *mddev = rdev->mddev;
struct md_cluster_info *cinfo = mddev->cluster_info; struct md_cluster_info *cinfo = mddev->cluster_info;
cmsg.type = RE_ADD; cmsg.type = cpu_to_le32(RE_ADD);
cmsg.raid_slot = rdev->desc_nr; cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
err = sendmsg(cinfo, &cmsg); err = sendmsg(cinfo, &cmsg);
if (err) if (err)
goto out; goto out;
...@@ -993,15 +1022,15 @@ static struct md_cluster_operations cluster_ops = { ...@@ -993,15 +1022,15 @@ static struct md_cluster_operations cluster_ops = {
.join = join, .join = join,
.leave = leave, .leave = leave,
.slot_number = slot_number, .slot_number = slot_number,
.resync_info_update = resync_info_update,
.resync_start = resync_start, .resync_start = resync_start,
.resync_finish = resync_finish, .resync_finish = resync_finish,
.resync_info_update = resync_info_update,
.metadata_update_start = metadata_update_start, .metadata_update_start = metadata_update_start,
.metadata_update_finish = metadata_update_finish, .metadata_update_finish = metadata_update_finish,
.metadata_update_cancel = metadata_update_cancel, .metadata_update_cancel = metadata_update_cancel,
.area_resyncing = area_resyncing, .area_resyncing = area_resyncing,
.add_new_disk_start = add_new_disk_start, .add_new_disk = add_new_disk,
.add_new_disk_finish = add_new_disk_finish, .add_new_disk_cancel = add_new_disk_cancel,
.new_disk_ack = new_disk_ack, .new_disk_ack = new_disk_ack,
.remove_disk = remove_disk, .remove_disk = remove_disk,
.gather_bitmaps = gather_bitmaps, .gather_bitmaps = gather_bitmaps,
...@@ -1022,5 +1051,6 @@ static void cluster_exit(void) ...@@ -1022,5 +1051,6 @@ static void cluster_exit(void)
module_init(cluster_init); module_init(cluster_init);
module_exit(cluster_exit); module_exit(cluster_exit);
MODULE_AUTHOR("SUSE");
MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Clustering support for MD"); MODULE_DESCRIPTION("Clustering support for MD");
...@@ -12,15 +12,15 @@ struct md_cluster_operations { ...@@ -12,15 +12,15 @@ struct md_cluster_operations {
int (*join)(struct mddev *mddev, int nodes); int (*join)(struct mddev *mddev, int nodes);
int (*leave)(struct mddev *mddev); int (*leave)(struct mddev *mddev);
int (*slot_number)(struct mddev *mddev); int (*slot_number)(struct mddev *mddev);
void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi); int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi);
void (*resync_finish)(struct mddev *mddev);
int (*metadata_update_start)(struct mddev *mddev); int (*metadata_update_start)(struct mddev *mddev);
int (*metadata_update_finish)(struct mddev *mddev); int (*metadata_update_finish)(struct mddev *mddev);
int (*metadata_update_cancel)(struct mddev *mddev); void (*metadata_update_cancel)(struct mddev *mddev);
int (*resync_start)(struct mddev *mddev);
int (*resync_finish)(struct mddev *mddev);
int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi); int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi);
int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev); int (*add_new_disk)(struct mddev *mddev, struct md_rdev *rdev);
int (*add_new_disk_finish)(struct mddev *mddev); void (*add_new_disk_cancel)(struct mddev *mddev);
int (*new_disk_ack)(struct mddev *mddev, bool ack); int (*new_disk_ack)(struct mddev *mddev, bool ack);
int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev); int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
int (*gather_bitmaps)(struct md_rdev *rdev); int (*gather_bitmaps)(struct md_rdev *rdev);
......
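Another theme in the md-cluster changes above: struct cluster_msg is now declared with fixed-endian __le32/__le64 fields, so the messages nodes exchange have a defined byte order regardless of host architecture, and process_recvd_msg() converts with le32_to_cpu()/le64_to_cpu() on receipt. Here is a stand-alone user-space sketch of the same pattern, using the <endian.h> helpers available on glibc in place of the kernel macros; the field layout mirrors the struct above, while the message value and the memcpy "transport" are placeholders.

/* Stand-alone illustration of the fixed-endian message format adopted for
 * struct cluster_msg above.  htole32()/le64toh() stand in for the kernel's
 * cpu_to_le32()/le64_to_cpu(); the field layout mirrors the diff, the rest
 * is made up for the example.
 */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct wire_cluster_msg {
	uint32_t type;       /* __le32 in the kernel struct */
	uint32_t slot;
	uint64_t low;        /* __le64 */
	uint64_t high;
	char     uuid[16];
	uint32_t raid_slot;
};

enum { RESYNCING = 1 };  /* placeholder, mirrors the kernel's enum msg_type only loosely */

int main(void)
{
	struct wire_cluster_msg tx, rx;

	/* Sender: store every field in little-endian byte order. */
	memset(&tx, 0, sizeof(tx));
	tx.type = htole32(RESYNCING);
	tx.slot = htole32(2);
	tx.low  = htole64(0);
	tx.high = htole64(1048576);

	/* "Transmit": in the kernel the message is carried via DLM lock resources. */
	memcpy(&rx, &tx, sizeof(rx));

	/* Receiver: convert back to host order before use, as the new
	 * process_recvd_msg() does with le32_to_cpu()/le64_to_cpu(). */
	printf("type=%u slot=%u low=%llu high=%llu\n",
	       le32toh(rx.type), le32toh(rx.slot),
	       (unsigned long long)le64toh(rx.low),
	       (unsigned long long)le64toh(rx.high));
	return 0;
}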
...@@ -1608,7 +1608,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1608,7 +1608,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
++ev1; ++ev1;
if (rdev->desc_nr >= 0 && if (rdev->desc_nr >= 0 &&
rdev->desc_nr < le32_to_cpu(sb->max_dev) && rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
if (ev1 < mddev->events) if (ev1 < mddev->events)
return -EINVAL; return -EINVAL;
} else if (mddev->bitmap) { } else if (mddev->bitmap) {
...@@ -1628,16 +1629,29 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1628,16 +1629,29 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
int role; int role;
if (rdev->desc_nr < 0 || if (rdev->desc_nr < 0 ||
rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
role = 0xffff; role = MD_DISK_ROLE_SPARE;
rdev->desc_nr = -1; rdev->desc_nr = -1;
} else } else
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
switch(role) { switch(role) {
case 0xffff: /* spare */ case MD_DISK_ROLE_SPARE: /* spare */
break; break;
case 0xfffe: /* faulty */ case MD_DISK_ROLE_FAULTY: /* faulty */
set_bit(Faulty, &rdev->flags); set_bit(Faulty, &rdev->flags);
break; break;
case MD_DISK_ROLE_JOURNAL: /* journal device */
if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
/* journal device without journal feature */
printk(KERN_WARNING
"md: journal device provided without journal feature, ignoring the device\n");
return -EINVAL;
}
set_bit(Journal, &rdev->flags);
rdev->journal_tail = le64_to_cpu(sb->journal_tail);
if (mddev->recovery_cp == MaxSector)
set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
rdev->raid_disk = mddev->raid_disks;
break;
default: default:
rdev->saved_raid_disk = role; rdev->saved_raid_disk = role;
if ((le32_to_cpu(sb->feature_map) & if ((le32_to_cpu(sb->feature_map) &
...@@ -1655,6 +1669,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1655,6 +1669,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
set_bit(WriteMostly, &rdev->flags); set_bit(WriteMostly, &rdev->flags);
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
set_bit(Replacement, &rdev->flags); set_bit(Replacement, &rdev->flags);
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
set_bit(MD_HAS_JOURNAL, &mddev->flags);
} else /* MULTIPATH are always insync */ } else /* MULTIPATH are always insync */
set_bit(In_sync, &rdev->flags); set_bit(In_sync, &rdev->flags);
...@@ -1679,6 +1695,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1679,6 +1695,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->events = cpu_to_le64(mddev->events); sb->events = cpu_to_le64(mddev->events);
if (mddev->in_sync) if (mddev->in_sync)
sb->resync_offset = cpu_to_le64(mddev->recovery_cp); sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
sb->resync_offset = cpu_to_le64(MaxSector);
else else
sb->resync_offset = cpu_to_le64(0); sb->resync_offset = cpu_to_le64(0);
...@@ -1702,7 +1720,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1702,7 +1720,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
} }
if (rdev->raid_disk >= 0 && if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags)) { !test_bit(In_sync, &rdev->flags)) {
sb->feature_map |= sb->feature_map |=
cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
...@@ -1712,6 +1730,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1712,6 +1730,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->feature_map |= sb->feature_map |=
cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
} }
/* Note: recovery_offset and journal_tail share space */
if (test_bit(Journal, &rdev->flags))
sb->journal_tail = cpu_to_le64(rdev->journal_tail);
if (test_bit(Replacement, &rdev->flags)) if (test_bit(Replacement, &rdev->flags))
sb->feature_map |= sb->feature_map |=
cpu_to_le32(MD_FEATURE_REPLACEMENT); cpu_to_le32(MD_FEATURE_REPLACEMENT);
...@@ -1735,6 +1756,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1735,6 +1756,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
} }
} }
if (mddev_is_clustered(mddev))
sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
if (rdev->badblocks.count == 0) if (rdev->badblocks.count == 0)
/* Nothing to do for bad blocks*/ ; /* Nothing to do for bad blocks*/ ;
else if (sb->bblog_offset == 0) else if (sb->bblog_offset == 0)
...@@ -1785,18 +1809,23 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1785,18 +1809,23 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
max_dev = le32_to_cpu(sb->max_dev); max_dev = le32_to_cpu(sb->max_dev);
for (i=0; i<max_dev;i++) for (i=0; i<max_dev;i++)
sb->dev_roles[i] = cpu_to_le16(0xfffe); sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
rdev_for_each(rdev2, mddev) { rdev_for_each(rdev2, mddev) {
i = rdev2->desc_nr; i = rdev2->desc_nr;
if (test_bit(Faulty, &rdev2->flags)) if (test_bit(Faulty, &rdev2->flags))
sb->dev_roles[i] = cpu_to_le16(0xfffe); sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
else if (test_bit(In_sync, &rdev2->flags)) else if (test_bit(In_sync, &rdev2->flags))
sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
else if (test_bit(Journal, &rdev2->flags))
sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
else if (rdev2->raid_disk >= 0) else if (rdev2->raid_disk >= 0)
sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
else else
sb->dev_roles[i] = cpu_to_le16(0xffff); sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
} }
sb->sb_csum = calc_sb_1_csum(sb); sb->sb_csum = calc_sb_1_csum(sb);
...@@ -1912,13 +1941,23 @@ static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) ...@@ -1912,13 +1941,23 @@ static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
struct md_rdev *rdev, *rdev2; struct md_rdev *rdev, *rdev2;
rcu_read_lock(); rcu_read_lock();
rdev_for_each_rcu(rdev, mddev1) rdev_for_each_rcu(rdev, mddev1) {
rdev_for_each_rcu(rdev2, mddev2) if (test_bit(Faulty, &rdev->flags) ||
test_bit(Journal, &rdev->flags) ||
rdev->raid_disk == -1)
continue;
rdev_for_each_rcu(rdev2, mddev2) {
if (test_bit(Faulty, &rdev2->flags) ||
test_bit(Journal, &rdev2->flags) ||
rdev2->raid_disk == -1)
continue;
if (rdev->bdev->bd_contains == if (rdev->bdev->bd_contains ==
rdev2->bdev->bd_contains) { rdev2->bdev->bd_contains) {
rcu_read_unlock(); rcu_read_unlock();
return 1; return 1;
} }
}
}
rcu_read_unlock(); rcu_read_unlock();
return 0; return 0;
} }
...@@ -2194,23 +2233,77 @@ static void sync_sbs(struct mddev *mddev, int nospares) ...@@ -2194,23 +2233,77 @@ static void sync_sbs(struct mddev *mddev, int nospares)
} }
} }
static bool does_sb_need_changing(struct mddev *mddev)
{
struct md_rdev *rdev;
struct mdp_superblock_1 *sb;
int role;
/* Find a good rdev */
rdev_for_each(rdev, mddev)
if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
break;
/* No good device found. */
if (!rdev)
return false;
sb = page_address(rdev->sb_page);
/* Check if a device has become faulty or a spare become active */
rdev_for_each(rdev, mddev) {
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
/* Device activated? */
if (role == 0xffff && rdev->raid_disk >=0 &&
!test_bit(Faulty, &rdev->flags))
return true;
/* Device turned faulty? */
if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
return true;
}
/* Check if any mddev parameters have changed */
if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
(mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
(mddev->layout != le64_to_cpu(sb->layout)) ||
(mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
(mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
return true;
return false;
}
void md_update_sb(struct mddev *mddev, int force_change) void md_update_sb(struct mddev *mddev, int force_change)
{ {
struct md_rdev *rdev; struct md_rdev *rdev;
int sync_req; int sync_req;
int nospares = 0; int nospares = 0;
int any_badblocks_changed = 0; int any_badblocks_changed = 0;
int ret = -1;
if (mddev->ro) { if (mddev->ro) {
if (force_change) if (force_change)
set_bit(MD_CHANGE_DEVS, &mddev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags);
return; return;
} }
if (mddev_is_clustered(mddev)) {
if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
force_change = 1;
ret = md_cluster_ops->metadata_update_start(mddev);
/* Has someone else has updated the sb */
if (!does_sb_need_changing(mddev)) {
if (ret == 0)
md_cluster_ops->metadata_update_cancel(mddev);
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
return;
}
}
repeat: repeat:
/* First make sure individual recovery_offsets are correct */ /* First make sure individual recovery_offsets are correct */
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
if (rdev->raid_disk >= 0 && if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 && mddev->delta_disks >= 0 &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) && !test_bit(In_sync, &rdev->flags) &&
mddev->curr_resync_completed > rdev->recovery_offset) mddev->curr_resync_completed > rdev->recovery_offset)
rdev->recovery_offset = mddev->curr_resync_completed; rdev->recovery_offset = mddev->curr_resync_completed;
...@@ -2354,6 +2447,9 @@ void md_update_sb(struct mddev *mddev, int force_change) ...@@ -2354,6 +2447,9 @@ void md_update_sb(struct mddev *mddev, int force_change)
clear_bit(BlockedBadBlocks, &rdev->flags); clear_bit(BlockedBadBlocks, &rdev->flags);
wake_up(&rdev->blocked_wait); wake_up(&rdev->blocked_wait);
} }
if (mddev_is_clustered(mddev) && ret == 0)
md_cluster_ops->metadata_update_finish(mddev);
} }
EXPORT_SYMBOL(md_update_sb); EXPORT_SYMBOL(md_update_sb);
...@@ -2429,6 +2525,10 @@ state_show(struct md_rdev *rdev, char *page) ...@@ -2429,6 +2525,10 @@ state_show(struct md_rdev *rdev, char *page)
len += sprintf(page+len, "%sin_sync",sep); len += sprintf(page+len, "%sin_sync",sep);
sep = ","; sep = ",";
} }
if (test_bit(Journal, &flags)) {
len += sprintf(page+len, "%sjournal",sep);
sep = ",";
}
if (test_bit(WriteMostly, &flags)) { if (test_bit(WriteMostly, &flags)) {
len += sprintf(page+len, "%swrite_mostly",sep); len += sprintf(page+len, "%swrite_mostly",sep);
sep = ","; sep = ",";
...@@ -2440,6 +2540,7 @@ state_show(struct md_rdev *rdev, char *page) ...@@ -2440,6 +2540,7 @@ state_show(struct md_rdev *rdev, char *page)
sep = ","; sep = ",";
} }
if (!test_bit(Faulty, &flags) && if (!test_bit(Faulty, &flags) &&
!test_bit(Journal, &flags) &&
!test_bit(In_sync, &flags)) { !test_bit(In_sync, &flags)) {
len += sprintf(page+len, "%sspare", sep); len += sprintf(page+len, "%sspare", sep);
sep = ","; sep = ",";
...@@ -2488,17 +2589,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2488,17 +2589,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
err = -EBUSY; err = -EBUSY;
else { else {
struct mddev *mddev = rdev->mddev; struct mddev *mddev = rdev->mddev;
err = 0;
if (mddev_is_clustered(mddev)) if (mddev_is_clustered(mddev))
md_cluster_ops->remove_disk(mddev, rdev); err = md_cluster_ops->remove_disk(mddev, rdev);
if (err == 0) {
md_kick_rdev_from_array(rdev); md_kick_rdev_from_array(rdev);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
if (mddev->pers) if (mddev->pers)
md_update_sb(mddev, 1); md_update_sb(mddev, 1);
md_new_event(mddev); md_new_event(mddev);
if (mddev_is_clustered(mddev)) }
md_cluster_ops->metadata_update_finish(mddev);
err = 0;
} }
} else if (cmd_match(buf, "writemostly")) { } else if (cmd_match(buf, "writemostly")) {
set_bit(WriteMostly, &rdev->flags); set_bit(WriteMostly, &rdev->flags);
...@@ -2527,7 +2627,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2527,7 +2627,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
set_bit(In_sync, &rdev->flags); set_bit(In_sync, &rdev->flags);
err = 0; err = 0;
} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) { } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
!test_bit(Journal, &rdev->flags)) {
if (rdev->mddev->pers == NULL) { if (rdev->mddev->pers == NULL) {
clear_bit(In_sync, &rdev->flags); clear_bit(In_sync, &rdev->flags);
rdev->saved_raid_disk = rdev->raid_disk; rdev->saved_raid_disk = rdev->raid_disk;
...@@ -2546,6 +2647,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2546,6 +2647,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
* check if recovery is needed. * check if recovery is needed.
*/ */
if (rdev->raid_disk >= 0 && if (rdev->raid_disk >= 0 &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(Replacement, &rdev->flags)) !test_bit(Replacement, &rdev->flags))
set_bit(WantReplacement, &rdev->flags); set_bit(WantReplacement, &rdev->flags);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
...@@ -2623,7 +2725,9 @@ __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); ...@@ -2623,7 +2725,9 @@ __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
static ssize_t static ssize_t
slot_show(struct md_rdev *rdev, char *page) slot_show(struct md_rdev *rdev, char *page)
{ {
if (rdev->raid_disk < 0) if (test_bit(Journal, &rdev->flags))
return sprintf(page, "journal\n");
else if (rdev->raid_disk < 0)
return sprintf(page, "none\n"); return sprintf(page, "none\n");
else else
return sprintf(page, "%d\n", rdev->raid_disk); return sprintf(page, "%d\n", rdev->raid_disk);
...@@ -2635,6 +2739,8 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2635,6 +2739,8 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
int slot; int slot;
int err; int err;
if (test_bit(Journal, &rdev->flags))
return -EBUSY;
if (strncmp(buf, "none", 4)==0) if (strncmp(buf, "none", 4)==0)
slot = -1; slot = -1;
else { else {
...@@ -2686,15 +2792,9 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2686,15 +2792,9 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
rdev->saved_raid_disk = -1; rdev->saved_raid_disk = -1;
clear_bit(In_sync, &rdev->flags); clear_bit(In_sync, &rdev->flags);
clear_bit(Bitmap_sync, &rdev->flags); clear_bit(Bitmap_sync, &rdev->flags);
err = rdev->mddev->pers-> remove_and_add_spares(rdev->mddev, rdev);
hot_add_disk(rdev->mddev, rdev); if (rdev->raid_disk == -1)
if (err) { return -EBUSY;
rdev->raid_disk = -1;
return err;
} else
sysfs_notify_dirent_safe(rdev->sysfs_state);
if (sysfs_link_rdev(rdev->mddev, rdev))
/* failure here is OK */;
/* don't wakeup anyone, leave that to userspace. */ /* don't wakeup anyone, leave that to userspace. */
} else { } else {
if (slot >= rdev->mddev->raid_disks && if (slot >= rdev->mddev->raid_disks &&
...@@ -2839,6 +2939,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) ...@@ -2839,6 +2939,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
sector_t oldsectors = rdev->sectors; sector_t oldsectors = rdev->sectors;
sector_t sectors; sector_t sectors;
if (test_bit(Journal, &rdev->flags))
return -EBUSY;
if (strict_blocks_to_sectors(buf, &sectors) < 0) if (strict_blocks_to_sectors(buf, &sectors) < 0)
return -EINVAL; return -EINVAL;
if (rdev->data_offset != rdev->new_data_offset) if (rdev->data_offset != rdev->new_data_offset)
...@@ -3196,20 +3298,14 @@ static void analyze_sbs(struct mddev *mddev) ...@@ -3196,20 +3298,14 @@ static void analyze_sbs(struct mddev *mddev)
md_kick_rdev_from_array(rdev); md_kick_rdev_from_array(rdev);
continue; continue;
} }
/* No device should have a Candidate flag
* when reading devices
*/
if (test_bit(Candidate, &rdev->flags)) {
pr_info("md: kicking Cluster Candidate %s from array!\n",
bdevname(rdev->bdev, b));
md_kick_rdev_from_array(rdev);
}
} }
if (mddev->level == LEVEL_MULTIPATH) { if (mddev->level == LEVEL_MULTIPATH) {
rdev->desc_nr = i++; rdev->desc_nr = i++;
rdev->raid_disk = rdev->desc_nr; rdev->raid_disk = rdev->desc_nr;
set_bit(In_sync, &rdev->flags); set_bit(In_sync, &rdev->flags);
} else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) { } else if (rdev->raid_disk >=
(mddev->raid_disks - min(0, mddev->delta_disks)) &&
!test_bit(Journal, &rdev->flags)) {
rdev->raid_disk = -1; rdev->raid_disk = -1;
clear_bit(In_sync, &rdev->flags); clear_bit(In_sync, &rdev->flags);
} }
...@@ -3267,6 +3363,11 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) ...@@ -3267,6 +3363,11 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
{ {
unsigned long msec; unsigned long msec;
if (mddev_is_clustered(mddev)) {
pr_info("md: Safemode is disabled for clustered mode\n");
return -EINVAL;
}
if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
return -EINVAL; return -EINVAL;
if (msec == 0) if (msec == 0)
...@@ -3867,7 +3968,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -3867,7 +3968,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
break; break;
case clean: case clean:
if (mddev->pers) { if (mddev->pers) {
restart_array(mddev); err = restart_array(mddev);
if (err)
break;
spin_lock(&mddev->lock); spin_lock(&mddev->lock);
if (atomic_read(&mddev->writes_pending) == 0) { if (atomic_read(&mddev->writes_pending) == 0) {
if (mddev->in_sync == 0) { if (mddev->in_sync == 0) {
...@@ -3885,7 +3988,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -3885,7 +3988,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
break; break;
case active: case active:
if (mddev->pers) { if (mddev->pers) {
restart_array(mddev); err = restart_array(mddev);
if (err)
break;
clear_bit(MD_CHANGE_PENDING, &mddev->flags); clear_bit(MD_CHANGE_PENDING, &mddev->flags);
wake_up(&mddev->sb_wait); wake_up(&mddev->sb_wait);
err = 0; err = 0;
...@@ -4064,12 +4169,8 @@ size_store(struct mddev *mddev, const char *buf, size_t len) ...@@ -4064,12 +4169,8 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
if (err) if (err)
return err; return err;
if (mddev->pers) { if (mddev->pers) {
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
err = update_size(mddev, sectors); err = update_size(mddev, sectors);
md_update_sb(mddev, 1); md_update_sb(mddev, 1);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
} else { } else {
if (mddev->dev_sectors == 0 || if (mddev->dev_sectors == 0 ||
mddev->dev_sectors > sectors) mddev->dev_sectors > sectors)
...@@ -5181,6 +5282,9 @@ int md_run(struct mddev *mddev) ...@@ -5181,6 +5282,9 @@ int md_run(struct mddev *mddev)
atomic_set(&mddev->max_corr_read_errors, atomic_set(&mddev->max_corr_read_errors,
MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
mddev->safemode = 0; mddev->safemode = 0;
if (mddev_is_clustered(mddev))
mddev->safemode_delay = 0;
else
mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
mddev->in_sync = 1; mddev->in_sync = 1;
smp_wmb(); smp_wmb();
...@@ -5224,6 +5328,9 @@ static int do_md_run(struct mddev *mddev) ...@@ -5224,6 +5328,9 @@ static int do_md_run(struct mddev *mddev)
goto out; goto out;
} }
if (mddev_is_clustered(mddev))
md_allow_write(mddev);
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
...@@ -5246,6 +5353,25 @@ static int restart_array(struct mddev *mddev) ...@@ -5246,6 +5353,25 @@ static int restart_array(struct mddev *mddev)
return -EINVAL; return -EINVAL;
if (!mddev->ro) if (!mddev->ro)
return -EBUSY; return -EBUSY;
if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
struct md_rdev *rdev;
bool has_journal = false;
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) {
if (test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags)) {
has_journal = true;
break;
}
}
rcu_read_unlock();
/* Don't restart rw with journal missing/faulty */
if (!has_journal)
return -EINVAL;
}
mddev->safemode = 0; mddev->safemode = 0;
mddev->ro = 0; mddev->ro = 0;
set_disk_ro(disk, 0); set_disk_ro(disk, 0);
...@@ -5307,8 +5433,6 @@ static void md_clean(struct mddev *mddev) ...@@ -5307,8 +5433,6 @@ static void md_clean(struct mddev *mddev)
static void __md_stop_writes(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev)
{ {
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
flush_workqueue(md_misc_wq); flush_workqueue(md_misc_wq);
if (mddev->sync_thread) { if (mddev->sync_thread) {
...@@ -5322,13 +5446,13 @@ static void __md_stop_writes(struct mddev *mddev) ...@@ -5322,13 +5446,13 @@ static void __md_stop_writes(struct mddev *mddev)
md_super_wait(mddev); md_super_wait(mddev);
if (mddev->ro == 0 && if (mddev->ro == 0 &&
(!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) { ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
(mddev->flags & MD_UPDATE_SB_FLAGS))) {
/* mark array as shutdown cleanly */ /* mark array as shutdown cleanly */
if (!mddev_is_clustered(mddev))
mddev->in_sync = 1; mddev->in_sync = 1;
md_update_sb(mddev, 1); md_update_sb(mddev, 1);
} }
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
} }
void md_stop_writes(struct mddev *mddev) void md_stop_writes(struct mddev *mddev)
...@@ -5789,6 +5913,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg) ...@@ -5789,6 +5913,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg)
info.state |= (1<<MD_DISK_ACTIVE); info.state |= (1<<MD_DISK_ACTIVE);
info.state |= (1<<MD_DISK_SYNC); info.state |= (1<<MD_DISK_SYNC);
} }
if (test_bit(Journal, &rdev->flags))
info.state |= (1<<MD_DISK_JOURNAL);
if (test_bit(WriteMostly, &rdev->flags)) if (test_bit(WriteMostly, &rdev->flags))
info.state |= (1<<MD_DISK_WRITEMOSTLY); info.state |= (1<<MD_DISK_WRITEMOSTLY);
} else { } else {
...@@ -5903,23 +6029,18 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) ...@@ -5903,23 +6029,18 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
else else
clear_bit(WriteMostly, &rdev->flags); clear_bit(WriteMostly, &rdev->flags);
if (info->state & (1<<MD_DISK_JOURNAL))
set_bit(Journal, &rdev->flags);
/* /*
* check whether the device shows up in other nodes * check whether the device shows up in other nodes
*/ */
if (mddev_is_clustered(mddev)) { if (mddev_is_clustered(mddev)) {
if (info->state & (1 << MD_DISK_CANDIDATE)) { if (info->state & (1 << MD_DISK_CANDIDATE))
/* Through --cluster-confirm */
set_bit(Candidate, &rdev->flags); set_bit(Candidate, &rdev->flags);
err = md_cluster_ops->new_disk_ack(mddev, true); else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
if (err) {
export_rdev(rdev);
return err;
}
} else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
/* --add initiated by this node */ /* --add initiated by this node */
err = md_cluster_ops->add_new_disk_start(mddev, rdev); err = md_cluster_ops->add_new_disk(mddev, rdev);
if (err) { if (err) {
md_cluster_ops->add_new_disk_finish(mddev);
export_rdev(rdev); export_rdev(rdev);
return err; return err;
} }
...@@ -5928,13 +6049,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) ...@@ -5928,13 +6049,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
rdev->raid_disk = -1; rdev->raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev); err = bind_rdev_to_array(rdev, mddev);
if (err) if (err)
export_rdev(rdev); export_rdev(rdev);
if (mddev_is_clustered(mddev)) {
if (info->state & (1 << MD_DISK_CANDIDATE))
md_cluster_ops->new_disk_ack(mddev, (err == 0));
else {
if (err)
md_cluster_ops->add_new_disk_cancel(mddev);
else else
err = add_bound_rdev(rdev); err = add_bound_rdev(rdev);
if (mddev_is_clustered(mddev) && }
(info->state & (1 << MD_DISK_CLUSTER_ADD)))
md_cluster_ops->add_new_disk_finish(mddev); } else if (!err)
err = add_bound_rdev(rdev);
return err; return err;
} }
...@@ -5990,13 +6121,17 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) ...@@ -5990,13 +6121,17 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
{ {
char b[BDEVNAME_SIZE]; char b[BDEVNAME_SIZE];
struct md_rdev *rdev; struct md_rdev *rdev;
int ret = -1;
rdev = find_rdev(mddev, dev); rdev = find_rdev(mddev, dev);
if (!rdev) if (!rdev)
return -ENXIO; return -ENXIO;
if (mddev_is_clustered(mddev)) if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev); ret = md_cluster_ops->metadata_update_start(mddev);
if (rdev->raid_disk < 0)
goto kick_rdev;
clear_bit(Blocked, &rdev->flags); clear_bit(Blocked, &rdev->flags);
remove_and_add_spares(mddev, rdev); remove_and_add_spares(mddev, rdev);
...@@ -6004,20 +6139,19 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev) ...@@ -6004,20 +6139,19 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
if (rdev->raid_disk >= 0) if (rdev->raid_disk >= 0)
goto busy; goto busy;
if (mddev_is_clustered(mddev)) kick_rdev:
if (mddev_is_clustered(mddev) && ret == 0)
md_cluster_ops->remove_disk(mddev, rdev); md_cluster_ops->remove_disk(mddev, rdev);
md_kick_rdev_from_array(rdev); md_kick_rdev_from_array(rdev);
md_update_sb(mddev, 1); md_update_sb(mddev, 1);
md_new_event(mddev); md_new_event(mddev);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
return 0; return 0;
busy: busy:
if (mddev_is_clustered(mddev)) if (mddev_is_clustered(mddev) && ret == 0)
md_cluster_ops->metadata_update_cancel(mddev); md_cluster_ops->metadata_update_cancel(mddev);
printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
bdevname(rdev->bdev,b), mdname(mddev)); bdevname(rdev->bdev,b), mdname(mddev));
return -EBUSY; return -EBUSY;
...@@ -6068,14 +6202,12 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) ...@@ -6068,14 +6202,12 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
goto abort_export; goto abort_export;
} }
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
clear_bit(In_sync, &rdev->flags); clear_bit(In_sync, &rdev->flags);
rdev->desc_nr = -1; rdev->desc_nr = -1;
rdev->saved_raid_disk = -1; rdev->saved_raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev); err = bind_rdev_to_array(rdev, mddev);
if (err) if (err)
goto abort_clustered; goto abort_export;
/* /*
* The rest should better be atomic, we can have disk failures * The rest should better be atomic, we can have disk failures
...@@ -6085,9 +6217,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) ...@@ -6085,9 +6217,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
rdev->raid_disk = -1; rdev->raid_disk = -1;
md_update_sb(mddev, 1); md_update_sb(mddev, 1);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
/* /*
* Kick recovery, maybe this spare has to be added to the * Kick recovery, maybe this spare has to be added to the
* array immediately. * array immediately.
...@@ -6097,9 +6226,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) ...@@ -6097,9 +6226,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
md_new_event(mddev); md_new_event(mddev);
return 0; return 0;
abort_clustered:
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_cancel(mddev);
abort_export: abort_export:
export_rdev(rdev); export_rdev(rdev);
return err; return err;
...@@ -6417,8 +6543,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) ...@@ -6417,8 +6543,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
return rv; return rv;
} }
} }
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
rv = update_size(mddev, (sector_t)info->size * 2); rv = update_size(mddev, (sector_t)info->size * 2);
...@@ -6476,12 +6600,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) ...@@ -6476,12 +6600,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
} }
} }
md_update_sb(mddev, 1); md_update_sb(mddev, 1);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
return rv; return rv;
err: err:
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_cancel(mddev);
return rv; return rv;
} }
...@@ -7282,6 +7402,8 @@ static int md_seq_show(struct seq_file *seq, void *v) ...@@ -7282,6 +7402,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
bdevname(rdev->bdev,b), rdev->desc_nr); bdevname(rdev->bdev,b), rdev->desc_nr);
if (test_bit(WriteMostly, &rdev->flags)) if (test_bit(WriteMostly, &rdev->flags))
seq_printf(seq, "(W)"); seq_printf(seq, "(W)");
if (test_bit(Journal, &rdev->flags))
seq_printf(seq, "(J)");
if (test_bit(Faulty, &rdev->flags)) { if (test_bit(Faulty, &rdev->flags)) {
seq_printf(seq, "(F)"); seq_printf(seq, "(F)");
continue; continue;
...@@ -7594,11 +7716,7 @@ int md_allow_write(struct mddev *mddev) ...@@ -7594,11 +7716,7 @@ int md_allow_write(struct mddev *mddev)
mddev->safemode == 0) mddev->safemode == 0)
mddev->safemode = 1; mddev->safemode = 1;
spin_unlock(&mddev->lock); spin_unlock(&mddev->lock);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
md_update_sb(mddev, 0); md_update_sb(mddev, 0);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
sysfs_notify_dirent_safe(mddev->sysfs_state); sysfs_notify_dirent_safe(mddev->sysfs_state);
} else } else
spin_unlock(&mddev->lock); spin_unlock(&mddev->lock);
...@@ -7630,6 +7748,7 @@ void md_do_sync(struct md_thread *thread) ...@@ -7630,6 +7748,7 @@ void md_do_sync(struct md_thread *thread)
struct md_rdev *rdev; struct md_rdev *rdev;
char *desc, *action = NULL; char *desc, *action = NULL;
struct blk_plug plug; struct blk_plug plug;
bool cluster_resync_finished = false;
/* just in case thread restarts... */ /* just in case thread restarts... */
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
...@@ -7739,6 +7858,7 @@ void md_do_sync(struct md_thread *thread) ...@@ -7739,6 +7858,7 @@ void md_do_sync(struct md_thread *thread)
rcu_read_lock(); rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 && if (rdev->raid_disk >= 0 &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) && !test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) && !test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < j) rdev->recovery_offset < j)
...@@ -7799,9 +7919,6 @@ void md_do_sync(struct md_thread *thread) ...@@ -7799,9 +7919,6 @@ void md_do_sync(struct md_thread *thread)
md_new_event(mddev); md_new_event(mddev);
update_time = jiffies; update_time = jiffies;
if (mddev_is_clustered(mddev))
md_cluster_ops->resync_start(mddev, j, max_sectors);
blk_start_plug(&plug); blk_start_plug(&plug);
while (j < max_sectors) { while (j < max_sectors) {
sector_t sectors; sector_t sectors;
...@@ -7865,8 +7982,6 @@ void md_do_sync(struct md_thread *thread) ...@@ -7865,8 +7982,6 @@ void md_do_sync(struct md_thread *thread)
j = max_sectors; j = max_sectors;
if (j > 2) if (j > 2)
mddev->curr_resync = j; mddev->curr_resync = j;
if (mddev_is_clustered(mddev))
md_cluster_ops->resync_info_update(mddev, j, max_sectors);
mddev->curr_mark_cnt = io_sectors; mddev->curr_mark_cnt = io_sectors;
if (last_check == 0) if (last_check == 0)
/* this is the earliest that rebuild will be /* this is the earliest that rebuild will be
...@@ -7937,7 +8052,11 @@ void md_do_sync(struct md_thread *thread) ...@@ -7937,7 +8052,11 @@ void md_do_sync(struct md_thread *thread)
mddev->curr_resync_completed = mddev->curr_resync; mddev->curr_resync_completed = mddev->curr_resync;
sysfs_notify(&mddev->kobj, NULL, "sync_completed"); sysfs_notify(&mddev->kobj, NULL, "sync_completed");
} }
/* tell personality that we are finished */ /* tell personality and other nodes that we are finished */
if (mddev_is_clustered(mddev)) {
md_cluster_ops->resync_finish(mddev);
cluster_resync_finished = true;
}
mddev->pers->sync_request(mddev, max_sectors, &skipped); mddev->pers->sync_request(mddev, max_sectors, &skipped);
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
...@@ -7965,6 +8084,7 @@ void md_do_sync(struct md_thread *thread) ...@@ -7965,6 +8084,7 @@ void md_do_sync(struct md_thread *thread)
rdev_for_each_rcu(rdev, mddev) rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 && if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 && mddev->delta_disks >= 0 &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) && !test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) && !test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < mddev->curr_resync) rdev->recovery_offset < mddev->curr_resync)
...@@ -7973,11 +8093,13 @@ void md_do_sync(struct md_thread *thread) ...@@ -7973,11 +8093,13 @@ void md_do_sync(struct md_thread *thread)
} }
} }
skip: skip:
if (mddev_is_clustered(mddev))
md_cluster_ops->resync_finish(mddev);
set_bit(MD_CHANGE_DEVS, &mddev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags);
if (mddev_is_clustered(mddev) &&
test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
!cluster_resync_finished)
md_cluster_ops->resync_finish(mddev);
spin_lock(&mddev->lock); spin_lock(&mddev->lock);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* We completed so min/max setting can be forgotten if used. */ /* We completed so min/max setting can be forgotten if used. */
...@@ -8008,7 +8130,8 @@ static int remove_and_add_spares(struct mddev *mddev, ...@@ -8008,7 +8130,8 @@ static int remove_and_add_spares(struct mddev *mddev,
rdev->raid_disk >= 0 && rdev->raid_disk >= 0 &&
!test_bit(Blocked, &rdev->flags) && !test_bit(Blocked, &rdev->flags) &&
(test_bit(Faulty, &rdev->flags) || (test_bit(Faulty, &rdev->flags) ||
! test_bit(In_sync, &rdev->flags)) && (!test_bit(In_sync, &rdev->flags) &&
!test_bit(Journal, &rdev->flags))) &&
atomic_read(&rdev->nr_pending)==0) { atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk( if (mddev->pers->hot_remove_disk(
mddev, rdev) == 0) { mddev, rdev) == 0) {
...@@ -8020,18 +8143,25 @@ static int remove_and_add_spares(struct mddev *mddev, ...@@ -8020,18 +8143,25 @@ static int remove_and_add_spares(struct mddev *mddev,
if (removed && mddev->kobj.sd) if (removed && mddev->kobj.sd)
sysfs_notify(&mddev->kobj, NULL, "degraded"); sysfs_notify(&mddev->kobj, NULL, "degraded");
if (this) if (this && removed)
goto no_add; goto no_add;
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
if (this && this != rdev)
continue;
if (test_bit(Candidate, &rdev->flags))
continue;
if (rdev->raid_disk >= 0 && if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) && !test_bit(In_sync, &rdev->flags) &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags)) !test_bit(Faulty, &rdev->flags))
spares++; spares++;
if (rdev->raid_disk >= 0) if (rdev->raid_disk >= 0)
continue; continue;
if (test_bit(Faulty, &rdev->flags)) if (test_bit(Faulty, &rdev->flags))
continue; continue;
if (test_bit(Journal, &rdev->flags))
continue;
if (mddev->ro && if (mddev->ro &&
! (rdev->saved_raid_disk >= 0 && ! (rdev->saved_raid_disk >= 0 &&
!test_bit(Bitmap_sync, &rdev->flags))) !test_bit(Bitmap_sync, &rdev->flags)))
...@@ -8056,11 +8186,22 @@ static int remove_and_add_spares(struct mddev *mddev, ...@@ -8056,11 +8186,22 @@ static int remove_and_add_spares(struct mddev *mddev,
static void md_start_sync(struct work_struct *ws) static void md_start_sync(struct work_struct *ws)
{ {
struct mddev *mddev = container_of(ws, struct mddev, del_work); struct mddev *mddev = container_of(ws, struct mddev, del_work);
int ret = 0;
if (mddev_is_clustered(mddev)) {
ret = md_cluster_ops->resync_start(mddev);
if (ret) {
mddev->sync_thread = NULL;
goto out;
}
}
mddev->sync_thread = md_register_thread(md_do_sync, mddev->sync_thread = md_register_thread(md_do_sync,
mddev, mddev,
"resync"); "resync");
out:
if (!mddev->sync_thread) { if (!mddev->sync_thread) {
if (!(mddev_is_clustered(mddev) && ret == -EAGAIN))
printk(KERN_ERR "%s: could not start resync" printk(KERN_ERR "%s: could not start resync"
" thread...\n", " thread...\n",
mdname(mddev)); mdname(mddev));
...@@ -8182,13 +8323,8 @@ void md_check_recovery(struct mddev *mddev) ...@@ -8182,13 +8323,8 @@ void md_check_recovery(struct mddev *mddev)
sysfs_notify_dirent_safe(mddev->sysfs_state); sysfs_notify_dirent_safe(mddev->sysfs_state);
} }
if (mddev->flags & MD_UPDATE_SB_FLAGS) { if (mddev->flags & MD_UPDATE_SB_FLAGS)
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
md_update_sb(mddev, 0); md_update_sb(mddev, 0);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
}
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
...@@ -8286,8 +8422,6 @@ void md_reap_sync_thread(struct mddev *mddev) ...@@ -8286,8 +8422,6 @@ void md_reap_sync_thread(struct mddev *mddev)
set_bit(MD_CHANGE_DEVS, &mddev->flags); set_bit(MD_CHANGE_DEVS, &mddev->flags);
} }
} }
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
mddev->pers->finish_reshape) mddev->pers->finish_reshape)
mddev->pers->finish_reshape(mddev); mddev->pers->finish_reshape(mddev);
...@@ -8300,8 +8434,6 @@ void md_reap_sync_thread(struct mddev *mddev) ...@@ -8300,8 +8434,6 @@ void md_reap_sync_thread(struct mddev *mddev)
rdev->saved_raid_disk = -1; rdev->saved_raid_disk = -1;
md_update_sb(mddev, 1); md_update_sb(mddev, 1);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery); clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
...@@ -8924,25 +9056,128 @@ static int __init md_init(void) ...@@ -8924,25 +9056,128 @@ static int __init md_init(void)
return ret; return ret;
} }
void md_reload_sb(struct mddev *mddev) static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{ {
struct md_rdev *rdev, *tmp; struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
struct md_rdev *rdev2;
int role, ret;
char b[BDEVNAME_SIZE];
rdev_for_each_safe(rdev, tmp, mddev) { /* Check for change of roles in the active devices */
rdev->sb_loaded = 0; rdev_for_each(rdev2, mddev) {
if (test_bit(Faulty, &rdev2->flags))
continue;
/* Check if the roles changed */
role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
if (test_bit(Candidate, &rdev2->flags)) {
if (role == 0xfffe) {
pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
md_kick_rdev_from_array(rdev2);
continue;
}
else
clear_bit(Candidate, &rdev2->flags);
}
if (role != rdev2->raid_disk) {
/* got activated */
if (rdev2->raid_disk == -1 && role != 0xffff) {
rdev2->saved_raid_disk = role;
ret = remove_and_add_spares(mddev, rdev2);
pr_info("Activated spare: %s\n",
bdevname(rdev2->bdev,b));
continue;
}
/* device faulty
* We just want to do the minimum to mark the disk
* as faulty. The recovery is performed by the
* node that initiated the error.
*/
if ((role == 0xfffe) || (role == 0xfffd)) {
md_error(mddev, rdev2);
clear_bit(Blocked, &rdev2->flags);
}
}
}
if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
/* Finally set the event to be up to date */
mddev->events = le64_to_cpu(sb->events);
}
static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
int err;
struct page *swapout = rdev->sb_page;
struct mdp_superblock_1 *sb;
/* Store the sb page of the rdev in the swapout temporary
* variable in case loading the new superblock fails
*/
rdev->sb_page = NULL;
alloc_disk_sb(rdev);
ClearPageUptodate(rdev->sb_page); ClearPageUptodate(rdev->sb_page);
rdev->sb_loaded = 0;
err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version);
if (err < 0) {
pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
__func__, __LINE__, rdev->desc_nr, err);
put_page(rdev->sb_page);
rdev->sb_page = swapout;
rdev->sb_loaded = 1;
return err;
} }
mddev->raid_disks = 0;
analyze_sbs(mddev); sb = page_address(rdev->sb_page);
rdev_for_each_safe(rdev, tmp, mddev) { /* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
struct mdp_superblock_1 *sb = page_address(rdev->sb_page); * is not set
/* since we don't write to faulty devices, we figure out if the
* disk is faulty by comparing events
*/ */
if (mddev->events > sb->events)
set_bit(Faulty, &rdev->flags); if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
/* The other node finished recovery, call spare_active to set
* device In_sync and mddev->degraded
*/
if (rdev->recovery_offset == MaxSector &&
!test_bit(In_sync, &rdev->flags) &&
mddev->pers->spare_active(mddev))
sysfs_notify(&mddev->kobj, NULL, "degraded");
put_page(swapout);
return 0;
}
void md_reload_sb(struct mddev *mddev, int nr)
{
struct md_rdev *rdev;
int err;
/* Find the rdev */
rdev_for_each_rcu(rdev, mddev) {
if (rdev->desc_nr == nr)
break;
} }
if (!rdev || rdev->desc_nr != nr) {
pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
return;
}
err = read_rdev(mddev, rdev);
if (err < 0)
return;
check_sb_changes(mddev, rdev);
/* Read all rdev's to update recovery_offset */
rdev_for_each_rcu(rdev, mddev)
read_rdev(mddev, rdev);
} }
EXPORT_SYMBOL(md_reload_sb); EXPORT_SYMBOL(md_reload_sb);
......
...@@ -87,10 +87,16 @@ struct md_rdev { ...@@ -87,10 +87,16 @@ struct md_rdev {
* array and could again if we did a partial * array and could again if we did a partial
* resync from the bitmap * resync from the bitmap
*/ */
union {
sector_t recovery_offset;/* If this device has been partially sector_t recovery_offset;/* If this device has been partially
* recovered, this is where we were * recovered, this is where we were
* up to. * up to.
*/ */
sector_t journal_tail; /* If this device is a journal device,
* this is the journal tail (journal
* recovery start point)
*/
};
atomic_t nr_pending; /* number of pending requests. atomic_t nr_pending; /* number of pending requests.
* only maintained for arrays that * only maintained for arrays that
...@@ -172,6 +178,11 @@ enum flag_bits { ...@@ -172,6 +178,11 @@ enum flag_bits {
* This device is seen locally but not * This device is seen locally but not
* by the whole cluster * by the whole cluster
*/ */
Journal, /* This device is used as journal for
* raid-5/6.
* Usually, this device should be faster
* than other devices in the array
*/
}; };
#define BB_LEN_MASK (0x00000000000001FFULL) #define BB_LEN_MASK (0x00000000000001FFULL)
...@@ -221,6 +232,8 @@ struct mddev { ...@@ -221,6 +232,8 @@ struct mddev {
#define MD_STILL_CLOSED 4 /* If set, then array has not been opened since #define MD_STILL_CLOSED 4 /* If set, then array has not been opened since
* md_ioctl checked on it. * md_ioctl checked on it.
*/ */
#define MD_JOURNAL_CLEAN 5 /* A raid with journal is already clean */
#define MD_HAS_JOURNAL 6 /* The raid array has journal feature set */
int suspended; int suspended;
atomic_t active_io; atomic_t active_io;
...@@ -658,7 +671,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, ...@@ -658,7 +671,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
struct mddev *mddev); struct mddev *mddev);
extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
extern void md_reload_sb(struct mddev *mddev); extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force); extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev); extern void md_kick_rdev_from_array(struct md_rdev * rdev);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr); struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
......
...@@ -90,6 +90,8 @@ static void r1bio_pool_free(void *r1_bio, void *data) ...@@ -90,6 +90,8 @@ static void r1bio_pool_free(void *r1_bio, void *data)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) #define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) #define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) #define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
...@@ -1590,6 +1592,15 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -1590,6 +1592,15 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (rdev->raid_disk >= 0) if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk; first = last = rdev->raid_disk;
/*
* find the disk ... but prefer rdev->saved_raid_disk
* if possible.
*/
if (rdev->saved_raid_disk >= 0 &&
rdev->saved_raid_disk >= first &&
conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
first = last = rdev->saved_raid_disk;
for (mirror = first; mirror <= last; mirror++) { for (mirror = first; mirror <= last; mirror++) {
p = conf->mirrors+mirror; p = conf->mirrors+mirror;
if (!p->rdev) { if (!p->rdev) {
...@@ -2495,6 +2506,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp ...@@ -2495,6 +2506,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
bitmap_close_sync(mddev->bitmap); bitmap_close_sync(mddev->bitmap);
close_sync(conf); close_sync(conf);
if (mddev_is_clustered(mddev)) {
conf->cluster_sync_low = 0;
conf->cluster_sync_high = 0;
}
return 0; return 0;
} }
...@@ -2515,7 +2531,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp ...@@ -2515,7 +2531,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
return sync_blocks; return sync_blocks;
} }
bitmap_cond_end_sync(mddev->bitmap, sector_nr); /* we are incrementing sector_nr below. To be safe, we check against
* sector_nr + two times RESYNC_SECTORS
*/
bitmap_cond_end_sync(mddev->bitmap, sector_nr,
mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
raise_barrier(conf, sector_nr); raise_barrier(conf, sector_nr);
...@@ -2706,6 +2727,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp ...@@ -2706,6 +2727,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
bio_full: bio_full:
r1_bio->sectors = nr_sectors; r1_bio->sectors = nr_sectors;
if (mddev_is_clustered(mddev) &&
conf->cluster_sync_high < sector_nr + nr_sectors) {
conf->cluster_sync_low = mddev->curr_resync_completed;
conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS;
/* Send resync message */
md_cluster_ops->resync_info_update(mddev,
conf->cluster_sync_low,
conf->cluster_sync_high);
}
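/*
* Illustration (not part of the patch), assuming the existing 64KiB
* RESYNC_BLOCK_SIZE and RESYNC_DEPTH of 32, which make
* CLUSTER_RESYNC_WINDOW_SECTORS 65536 (a 32MiB window): with
* curr_resync_completed at sector 1000000 this node broadcasts
* cluster_sync_low = 1000000 and cluster_sync_high = 1065536, and the
* other nodes suspend normal writes that fall inside that range until
* the next resync_info_update moves the window forward.
*/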
/* For a user-requested sync, we read all readable devices and do a /* For a user-requested sync, we read all readable devices and do a
* compare * compare
*/ */
...@@ -3020,9 +3051,11 @@ static int raid1_reshape(struct mddev *mddev) ...@@ -3020,9 +3051,11 @@ static int raid1_reshape(struct mddev *mddev)
return -EINVAL; return -EINVAL;
} }
if (!mddev_is_clustered(mddev)) {
err = md_allow_write(mddev); err = md_allow_write(mddev);
if (err) if (err)
return err; return err;
}
raid_disks = mddev->raid_disks + mddev->delta_disks; raid_disks = mddev->raid_disks + mddev->delta_disks;
......
...@@ -111,6 +111,13 @@ struct r1conf { ...@@ -111,6 +111,13 @@ struct r1conf {
* the new thread here until we fully activate the array. * the new thread here until we fully activate the array.
*/ */
struct md_thread *thread; struct md_thread *thread;
/* Keep track of cluster resync window to send to other
* nodes.
*/
sector_t cluster_sync_low;
sector_t cluster_sync_high;
}; };
/* /*
......
...@@ -3149,7 +3149,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, ...@@ -3149,7 +3149,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
/* resync. Schedule a read for every block at this virt offset */ /* resync. Schedule a read for every block at this virt offset */
int count = 0; int count = 0;
bitmap_cond_end_sync(mddev->bitmap, sector_nr); bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0);
if (!bitmap_start_sync(mddev->bitmap, sector_nr, if (!bitmap_start_sync(mddev->bitmap, sector_nr,
&sync_blocks, mddev->degraded) && &sync_blocks, mddev->degraded) &&
......
/*
* Copyright (C) 2015 Shaohua Li <shli@fb.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
*/
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include "md.h"
#include "raid5.h"
/*
* metadata/data are stored on disk in 4k units (blocks) regardless of the
* underlying hardware sector size. Only works with PAGE_SIZE == 4096
*/
#define BLOCK_SECTORS (8)
/*
* reclaim runs when reclaimable space reaches 1/4 of the device size or
* 10G, whichever is smaller. This prevents recovery from having to scan
* a very long log
*/
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
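/*
* Illustration only (not part of the patch): how the two constants above
* combine into the reclaim threshold; the same min() logic appears in
* r5l_load_log(). E.g. a 100GiB journal gives 25GiB, capped to 10GiB,
* while a 16GiB journal gives 4GiB.
*/
static inline sector_t r5l_example_reclaim_threshold(sector_t device_size)
{
return min_t(sector_t, device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT,
     RECLAIM_MAX_FREE_SPACE);
}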
struct r5l_log {
struct md_rdev *rdev;
u32 uuid_checksum;
sector_t device_size; /* log device size, rounded to
* BLOCK_SECTORS */
sector_t max_free_space; /* reclaim runs once reclaimable space
* reaches this size */
sector_t last_checkpoint; /* log tail. where recovery scan
* starts from */
u64 last_cp_seq; /* log tail sequence */
sector_t log_start; /* log head. where new data appends */
u64 seq; /* log head sequence */
sector_t next_checkpoint;
u64 next_cp_seq;
struct mutex io_mutex;
struct r5l_io_unit *current_io; /* current io_unit accepting new data */
spinlock_t io_list_lock;
struct list_head running_ios; /* io_units which are still running,
* and have not yet been completely
* written to the log */
struct list_head io_end_ios; /* io_units which have been completely
* written to the log but not yet written
* to the RAID */
struct list_head flushing_ios; /* io_units which are waiting for log
* cache flush */
struct list_head finished_ios; /* io_units which settle down in log disk */
struct bio flush_bio;
struct kmem_cache *io_kc;
struct md_thread *reclaim_thread;
unsigned long reclaim_target; /* amount of space that needs to be
* reclaimed. if it's 0, reclaim spaces
* used by io_units which are in
* IO_UNIT_STRIPE_END state (i.e. reclaim
* doesn't wait for a specific io_unit
* to switch to IO_UNIT_STRIPE_END
* state) */
wait_queue_head_t iounit_wait;
struct list_head no_space_stripes; /* pending stripes, log has no space */
spinlock_t no_space_stripes_lock;
bool need_cache_flush;
bool in_teardown;
};
/*
* an IO range starts at a meta data block and ends at the next meta data
* block. The io_unit's meta data block tracks the data/parity that follows
* it. An io_unit is written to the log disk with a normal write; since we
* always flush the log disk first and only then start moving data to the
* raid disks, there is no need to write the io_unit with FLUSH/FUA
*/
struct r5l_io_unit {
struct r5l_log *log;
struct page *meta_page; /* store meta block */
int meta_offset; /* current offset in meta_page */
struct bio *current_bio;/* current_bio accepting new data */
atomic_t pending_stripe;/* how many stripes not flushed to raid */
u64 seq; /* seq number of the metablock */
sector_t log_start; /* where the io_unit starts */
sector_t log_end; /* where the io_unit ends */
struct list_head log_sibling; /* log->running_ios */
struct list_head stripe_list; /* stripes added to the io_unit */
int state;
bool need_split_bio;
};
/* r5l_io_unit state */
enum r5l_io_unit_state {
IO_UNIT_RUNNING = 0, /* accepting new IO */
IO_UNIT_IO_START = 1, /* io_unit bio has started writing to the log,
* not accepting new bios any more */
IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */
IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */
};
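/*
* Illustration (not part of the patch): the states above only ever advance
* in order; __r5l_set_io_unit_state() below warns on any attempt to go
* backwards. A typical io_unit lifetime:
*
*   r5l_new_meta()                           -> IO_UNIT_RUNNING
*   r5l_submit_current_io()                  -> IO_UNIT_IO_START
*   r5l_log_endio()                          -> IO_UNIT_IO_END
*   r5l_stripe_write_finished(), last stripe -> IO_UNIT_STRIPE_END
*/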
static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
start += inc;
if (start >= log->device_size)
start = start - log->device_size;
return start;
}
static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
sector_t end)
{
if (end >= start)
return end - start;
else
return end + log->device_size - start;
}
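/*
* Worked example (not part of the patch), with log->device_size == 1000:
*   r5l_ring_add(log, 990, 20)       == 10   (wraps past the end)
*   r5l_ring_distance(log, 990, 10)  == 20   (end already wrapped)
*   r5l_ring_distance(log, 100, 300) == 200
*/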
static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
{
sector_t used_size;
used_size = r5l_ring_distance(log, log->last_checkpoint,
log->log_start);
return log->device_size > used_size + size;
}
static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
{
__free_page(io->meta_page);
kmem_cache_free(log->io_kc, io);
}
static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
enum r5l_io_unit_state state)
{
struct r5l_io_unit *io;
while (!list_empty(from)) {
io = list_first_entry(from, struct r5l_io_unit, log_sibling);
/* don't change list order */
if (io->state >= state)
list_move_tail(&io->log_sibling, to);
else
break;
}
}
static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
enum r5l_io_unit_state state)
{
if (WARN_ON(io->state >= state))
return;
io->state = state;
}
static void r5l_io_run_stripes(struct r5l_io_unit *io)
{
struct stripe_head *sh, *next;
list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
list_del_init(&sh->log_list);
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
}
}
static void r5l_log_run_stripes(struct r5l_log *log)
{
struct r5l_io_unit *io, *next;
assert_spin_locked(&log->io_list_lock);
list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
/* don't change list order */
if (io->state < IO_UNIT_IO_END)
break;
list_move_tail(&io->log_sibling, &log->finished_ios);
r5l_io_run_stripes(io);
}
}
static void r5l_log_endio(struct bio *bio)
{
struct r5l_io_unit *io = bio->bi_private;
struct r5l_log *log = io->log;
unsigned long flags;
if (bio->bi_error)
md_error(log->rdev->mddev, log->rdev);
bio_put(bio);
spin_lock_irqsave(&log->io_list_lock, flags);
__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
if (log->need_cache_flush)
r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
IO_UNIT_IO_END);
else
r5l_log_run_stripes(log);
spin_unlock_irqrestore(&log->io_list_lock, flags);
if (log->need_cache_flush)
md_wakeup_thread(log->rdev->mddev->thread);
}
static void r5l_submit_current_io(struct r5l_log *log)
{
struct r5l_io_unit *io = log->current_io;
struct r5l_meta_block *block;
unsigned long flags;
u32 crc;
if (!io)
return;
block = page_address(io->meta_page);
block->meta_size = cpu_to_le32(io->meta_offset);
crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
block->checksum = cpu_to_le32(crc);
log->current_io = NULL;
spin_lock_irqsave(&log->io_list_lock, flags);
__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
spin_unlock_irqrestore(&log->io_list_lock, flags);
submit_bio(WRITE, io->current_bio);
}
static struct bio *r5l_bio_alloc(struct r5l_log *log)
{
struct bio *bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
bio->bi_rw = WRITE;
bio->bi_bdev = log->rdev->bdev;
bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
return bio;
}
static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
{
log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
/*
* If we filled up the log device, start from the beginning again,
* which will require a new bio.
*
* Note: for this to work properly the log size needs to be a multiple
* of BLOCK_SECTORS.
*/
if (log->log_start == 0)
io->need_split_bio = true;
io->log_end = log->log_start;
}
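/*
* Example (not part of the patch): with device_size == 1000 and
* log_start == 992, reserving one more block wraps log_start to 0, so
* need_split_bio is set and the next payload page goes into a fresh bio
* chained to the current one (see r5l_append_payload_page()).
*/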
static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
{
struct r5l_io_unit *io;
struct r5l_meta_block *block;
/* We can't handle memory allocation failure yet */
io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL);
io->log = log;
INIT_LIST_HEAD(&io->log_sibling);
INIT_LIST_HEAD(&io->stripe_list);
io->state = IO_UNIT_RUNNING;
io->meta_page = alloc_page(GFP_NOIO | __GFP_NOFAIL | __GFP_ZERO);
block = page_address(io->meta_page);
block->magic = cpu_to_le32(R5LOG_MAGIC);
block->version = R5LOG_VERSION;
block->seq = cpu_to_le64(log->seq);
block->position = cpu_to_le64(log->log_start);
io->log_start = log->log_start;
io->meta_offset = sizeof(struct r5l_meta_block);
io->seq = log->seq++;
io->current_bio = r5l_bio_alloc(log);
io->current_bio->bi_end_io = r5l_log_endio;
io->current_bio->bi_private = io;
bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
r5_reserve_log_entry(log, io);
spin_lock_irq(&log->io_list_lock);
list_add_tail(&io->log_sibling, &log->running_ios);
spin_unlock_irq(&log->io_list_lock);
return io;
}
static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
{
if (log->current_io &&
log->current_io->meta_offset + payload_size > PAGE_SIZE)
r5l_submit_current_io(log);
if (!log->current_io)
log->current_io = r5l_new_meta(log);
return 0;
}
static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
sector_t location,
u32 checksum1, u32 checksum2,
bool checksum2_valid)
{
struct r5l_io_unit *io = log->current_io;
struct r5l_payload_data_parity *payload;
payload = page_address(io->meta_page) + io->meta_offset;
payload->header.type = cpu_to_le16(type);
payload->header.flags = cpu_to_le16(0);
payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
(PAGE_SHIFT - 9));
payload->location = cpu_to_le64(location);
payload->checksum[0] = cpu_to_le32(checksum1);
if (checksum2_valid)
payload->checksum[1] = cpu_to_le32(checksum2);
io->meta_offset += sizeof(struct r5l_payload_data_parity) +
sizeof(__le32) * (1 + !!checksum2_valid);
}
static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
{
struct r5l_io_unit *io = log->current_io;
if (io->need_split_bio) {
struct bio *prev = io->current_bio;
io->current_bio = r5l_bio_alloc(log);
bio_chain(io->current_bio, prev);
submit_bio(WRITE, prev);
}
if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
BUG();
r5_reserve_log_entry(log, io);
}
static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
int data_pages, int parity_pages)
{
int i;
int meta_size;
struct r5l_io_unit *io;
meta_size =
((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
* data_pages) +
sizeof(struct r5l_payload_data_parity) +
sizeof(__le32) * parity_pages;
r5l_get_meta(log, meta_size);
io = log->current_io;
for (i = 0; i < sh->disks; i++) {
if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
continue;
if (i == sh->pd_idx || i == sh->qd_idx)
continue;
r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
raid5_compute_blocknr(sh, i, 0),
sh->dev[i].log_checksum, 0, false);
r5l_append_payload_page(log, sh->dev[i].page);
}
if (sh->qd_idx >= 0) {
r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
sh->sector, sh->dev[sh->pd_idx].log_checksum,
sh->dev[sh->qd_idx].log_checksum, true);
r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
} else {
r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
sh->sector, sh->dev[sh->pd_idx].log_checksum,
0, false);
r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
}
list_add_tail(&sh->log_list, &io->stripe_list);
atomic_inc(&io->pending_stripe);
sh->log_io = io;
}
static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
/*
* running in raid5d, where reclaim could wait for raid5d too (when it flushes
* data from log to raid disks), so we shouldn't wait for reclaim here
*/
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
int write_disks = 0;
int data_pages, parity_pages;
int meta_size;
int reserve;
int i;
if (!log)
return -EAGAIN;
/* Don't support stripe batch */
if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
test_bit(STRIPE_SYNCING, &sh->state)) {
/* the stripe is written to log, we start writing it to raid */
clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
return -EAGAIN;
}
for (i = 0; i < sh->disks; i++) {
void *addr;
if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
continue;
write_disks++;
/* checksum is already calculated in last run */
if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
continue;
addr = kmap_atomic(sh->dev[i].page);
sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
addr, PAGE_SIZE);
kunmap_atomic(addr);
}
parity_pages = 1 + !!(sh->qd_idx >= 0);
data_pages = write_disks - parity_pages;
meta_size =
((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
* data_pages) +
sizeof(struct r5l_payload_data_parity) +
sizeof(__le32) * parity_pages;
/* Doesn't work with very big raid array */
if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
return -EINVAL;
set_bit(STRIPE_LOG_TRAPPED, &sh->state);
/*
* The stripe must enter the state machine again to finish the write, so
* don't delay.
*/
clear_bit(STRIPE_DELAYED, &sh->state);
atomic_inc(&sh->count);
mutex_lock(&log->io_mutex);
/* meta + data */
reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
if (r5l_has_free_space(log, reserve))
r5l_log_stripe(log, sh, data_pages, parity_pages);
else {
spin_lock(&log->no_space_stripes_lock);
list_add_tail(&sh->log_list, &log->no_space_stripes);
spin_unlock(&log->no_space_stripes_lock);
r5l_wake_reclaim(log, reserve);
}
mutex_unlock(&log->io_mutex);
return 0;
}
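/*
* Example (not part of the patch) of the reservation above: with 4KiB
* pages, PAGE_SHIFT - 9 == 3, i.e. 8 sectors per page. A full-stripe
* write on a 5-disk RAID5 has write_disks == 5 (4 data + 1 parity), so
*   reserve = (1 + 5) << 3 == 48 sectors
* i.e. one meta block plus five data/parity blocks.
*/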
void r5l_write_stripe_run(struct r5l_log *log)
{
if (!log)
return;
mutex_lock(&log->io_mutex);
r5l_submit_current_io(log);
mutex_unlock(&log->io_mutex);
}
int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
{
if (!log)
return -ENODEV;
/*
* we flush the log disk cache first, then write stripe data to the raid
* disks. So by the time a bio finishes, the log disk cache has already
* been flushed, and recovery guarantees we can recover the bio from the
* log disk, so we don't need to flush again
*/
if (bio->bi_iter.bi_size == 0) {
bio_endio(bio);
return 0;
}
bio->bi_rw &= ~REQ_FLUSH;
return -EAGAIN;
}
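/*
* Hypothetical caller sketch (not part of the patch; the actual raid5
* hook is not shown in this hunk, and handle_flush_without_journal() is
* a placeholder name):
*
*	switch (r5l_handle_flush_request(conf->log, bio)) {
*	case 0:       return;                 -- empty flush, already completed
*	case -ENODEV: handle_flush_without_journal(bio); return;
*	case -EAGAIN: break;                  -- REQ_FLUSH stripped, submit as data
*	}
*/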
/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
struct stripe_head *sh;
spin_lock(&log->no_space_stripes_lock);
while (!list_empty(&log->no_space_stripes)) {
sh = list_first_entry(&log->no_space_stripes,
struct stripe_head, log_list);
list_del_init(&sh->log_list);
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
}
spin_unlock(&log->no_space_stripes_lock);
}
static sector_t r5l_reclaimable_space(struct r5l_log *log)
{
return r5l_ring_distance(log, log->last_checkpoint,
log->next_checkpoint);
}
static bool r5l_complete_finished_ios(struct r5l_log *log)
{
struct r5l_io_unit *io, *next;
bool found = false;
assert_spin_locked(&log->io_list_lock);
list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
/* don't change list order */
if (io->state < IO_UNIT_STRIPE_END)
break;
log->next_checkpoint = io->log_start;
log->next_cp_seq = io->seq;
list_del(&io->log_sibling);
r5l_free_io_unit(log, io);
found = true;
}
return found;
}
static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
{
struct r5l_log *log = io->log;
unsigned long flags;
spin_lock_irqsave(&log->io_list_lock, flags);
__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
if (!r5l_complete_finished_ios(log)) {
spin_unlock_irqrestore(&log->io_list_lock, flags);
return;
}
if (r5l_reclaimable_space(log) > log->max_free_space)
r5l_wake_reclaim(log, 0);
spin_unlock_irqrestore(&log->io_list_lock, flags);
wake_up(&log->iounit_wait);
}
void r5l_stripe_write_finished(struct stripe_head *sh)
{
struct r5l_io_unit *io;
io = sh->log_io;
sh->log_io = NULL;
if (io && atomic_dec_and_test(&io->pending_stripe))
__r5l_stripe_write_finished(io);
}
static void r5l_log_flush_endio(struct bio *bio)
{
struct r5l_log *log = container_of(bio, struct r5l_log,
flush_bio);
unsigned long flags;
struct r5l_io_unit *io;
if (bio->bi_error)
md_error(log->rdev->mddev, log->rdev);
spin_lock_irqsave(&log->io_list_lock, flags);
list_for_each_entry(io, &log->flushing_ios, log_sibling)
r5l_io_run_stripes(io);
list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
spin_unlock_irqrestore(&log->io_list_lock, flags);
}
/*
* Starting dispatch IO to raid.
* The log consists of a sequence of io_units (meta blocks plus the data
* they describe). There is one situation we want to avoid: a broken meta
* block in the middle of the log prevents recovery from reaching the meta
* blocks at the head of the log. So if an operation requires a meta block
* at the head to be persistent in the log, we must make sure the meta
* blocks before it are persistent in the log too. One such case:
*
* stripe data/parity is in the log and we start writing the stripe to the
* raid disks; the stripe data/parity must be persistent in the log before
* we do the write to the raid disks.
*
* The solution is to strictly maintain the io_unit list order: we only
* write the stripes of an io_unit to the raid disks once it and every
* earlier io_unit have their data/parity in the log.
*/
void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
bool do_flush;
if (!log || !log->need_cache_flush)
return;
spin_lock_irq(&log->io_list_lock);
/* flush bio is running */
if (!list_empty(&log->flushing_ios)) {
spin_unlock_irq(&log->io_list_lock);
return;
}
list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
do_flush = !list_empty(&log->flushing_ios);
spin_unlock_irq(&log->io_list_lock);
if (!do_flush)
return;
bio_reset(&log->flush_bio);
log->flush_bio.bi_bdev = log->rdev->bdev;
log->flush_bio.bi_end_io = r5l_log_flush_endio;
submit_bio(WRITE_FLUSH, &log->flush_bio);
}
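/*
* Ordering example (not part of the patch): with io_units A, B and C
* written to the log in that order, if A and C have reached
* IO_UNIT_IO_END but B has not, only A's stripes may be dispatched to
* the raid disks; r5l_log_run_stripes() stops at B, so C waits even
* though its own data/parity is already in the log.
*/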
static void r5l_write_super(struct r5l_log *log, sector_t cp);
static void r5l_write_super_and_discard_space(struct r5l_log *log,
sector_t end)
{
struct block_device *bdev = log->rdev->bdev;
struct mddev *mddev;
r5l_write_super(log, end);
if (!blk_queue_discard(bdev_get_queue(bdev)))
return;
mddev = log->rdev->mddev;
/*
* This is to avoid a deadlock. r5l_quiesce holds reconfig_mutex and
* waits for this thread to finish. This thread waits for
* MD_CHANGE_PENDING to clear, which is supposed to be done in
* md_check_recovery(). md_check_recovery() tries to get
* reconfig_mutex. Since r5l_quiesce already holds the mutex,
* md_check_recovery() fails, so PENDING never gets cleared. The
* in_teardown check works around this issue.
*/
if (!log->in_teardown) {
set_bit(MD_CHANGE_DEVS, &mddev->flags);
set_bit(MD_CHANGE_PENDING, &mddev->flags);
md_wakeup_thread(mddev->thread);
wait_event(mddev->sb_wait,
!test_bit(MD_CHANGE_PENDING, &mddev->flags) ||
log->in_teardown);
/*
* r5l_quiesce could run after the in_teardown check and grab the
* mutex first, in which case the superblock might get updated twice.
*/
if (log->in_teardown)
md_update_sb(mddev, 1);
} else {
WARN_ON(!mddev_is_locked(mddev));
md_update_sb(mddev, 1);
}
/* discard IO error really doesn't matter, ignore it */
if (log->last_checkpoint < end) {
blkdev_issue_discard(bdev,
log->last_checkpoint + log->rdev->data_offset,
end - log->last_checkpoint, GFP_NOIO, 0);
} else {
blkdev_issue_discard(bdev,
log->last_checkpoint + log->rdev->data_offset,
log->device_size - log->last_checkpoint,
GFP_NOIO, 0);
blkdev_issue_discard(bdev, log->rdev->data_offset, end,
GFP_NOIO, 0);
}
}
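/*
* Discard example (not part of the patch), offsets relative to
* data_offset: device_size == 1000, last_checkpoint == 900, end == 100.
* The log has wrapped (last_checkpoint >= end), so two ranges are
* discarded: [900, 1000) and [0, 100). Without the wrap, say end == 950,
* a single range [900, 950) is discarded.
*/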
static void r5l_do_reclaim(struct r5l_log *log)
{
sector_t reclaim_target = xchg(&log->reclaim_target, 0);
sector_t reclaimable;
sector_t next_checkpoint;
u64 next_cp_seq;
spin_lock_irq(&log->io_list_lock);
/*
* move the proper io_units to the reclaim list; we must not change the
* order. Reclaimable and unreclaimable io_units can be mixed in the
* list, and we shouldn't reuse the space of an unreclaimable io_unit
*/
while (1) {
reclaimable = r5l_reclaimable_space(log);
if (reclaimable >= reclaim_target ||
(list_empty(&log->running_ios) &&
list_empty(&log->io_end_ios) &&
list_empty(&log->flushing_ios) &&
list_empty(&log->finished_ios)))
break;
md_wakeup_thread(log->rdev->mddev->thread);
wait_event_lock_irq(log->iounit_wait,
r5l_reclaimable_space(log) > reclaimable,
log->io_list_lock);
}
next_checkpoint = log->next_checkpoint;
next_cp_seq = log->next_cp_seq;
spin_unlock_irq(&log->io_list_lock);
BUG_ON(reclaimable < 0);
if (reclaimable == 0)
return;
/*
* write_super will flush the cache of each raid disk. We must write the super
* here, because the log area might be reused soon and we don't want to
* confuse recovery
*/
r5l_write_super_and_discard_space(log, next_checkpoint);
mutex_lock(&log->io_mutex);
log->last_checkpoint = next_checkpoint;
log->last_cp_seq = next_cp_seq;
mutex_unlock(&log->io_mutex);
r5l_run_no_space_stripes(log);
}
static void r5l_reclaim_thread(struct md_thread *thread)
{
struct mddev *mddev = thread->mddev;
struct r5conf *conf = mddev->private;
struct r5l_log *log = conf->log;
if (!log)
return;
r5l_do_reclaim(log);
}
static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
{
unsigned long target;
unsigned long new = (unsigned long)space; /* overflow in theory */
do {
target = log->reclaim_target;
if (new < target)
return;
} while (cmpxchg(&log->reclaim_target, target, new) != target);
md_wakeup_thread(log->reclaim_thread);
}
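/*
* Example (not part of the patch): if reclaim_target is 100 and two
* callers race in with space == 50 and space == 200, the 50 request
* returns early (a larger target is already pending) while the 200
* request wins the cmpxchg, so the reclaim thread always sees the
* largest outstanding request.
*/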
void r5l_quiesce(struct r5l_log *log, int state)
{
struct mddev *mddev;
if (!log || state == 2)
return;
if (state == 0) {
log->in_teardown = 0;
log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
log->rdev->mddev, "reclaim");
} else if (state == 1) {
/*
* at this point all stripes are finished, so io_unit is at
* least in STRIPE_END state
*/
log->in_teardown = 1;
/* make sure r5l_write_super_and_discard_space exits */
mddev = log->rdev->mddev;
wake_up(&mddev->sb_wait);
r5l_wake_reclaim(log, -1L);
md_unregister_thread(&log->reclaim_thread);
r5l_do_reclaim(log);
}
}
bool r5l_log_disk_error(struct r5conf *conf)
{
/* don't allow write if journal disk is missing */
if (!conf->log)
return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
return test_bit(Faulty, &conf->log->rdev->flags);
}
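/*
* Example (not part of the patch): an array created with a journal
* (MD_HAS_JOURNAL set) whose journal device is missing has
* conf->log == NULL, so this reports a log disk error; a plain raid5/6
* array without a journal also has conf->log == NULL but MD_HAS_JOURNAL
* clear, so it does not.
*/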
struct r5l_recovery_ctx {
struct page *meta_page; /* current meta */
sector_t meta_total_blocks; /* total size of current meta and data */
sector_t pos; /* recovery position */
u64 seq; /* recovery position seq */
};
static int r5l_read_meta_block(struct r5l_log *log,
struct r5l_recovery_ctx *ctx)
{
struct page *page = ctx->meta_page;
struct r5l_meta_block *mb;
u32 crc, stored_crc;
if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
return -EIO;
mb = page_address(page);
stored_crc = le32_to_cpu(mb->checksum);
mb->checksum = 0;
if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
le64_to_cpu(mb->seq) != ctx->seq ||
mb->version != R5LOG_VERSION ||
le64_to_cpu(mb->position) != ctx->pos)
return -EINVAL;
crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
if (stored_crc != crc)
return -EINVAL;
if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
return -EINVAL;
ctx->meta_total_blocks = BLOCK_SECTORS;
return 0;
}
static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
struct r5l_recovery_ctx *ctx,
sector_t stripe_sect,
int *offset, sector_t *log_offset)
{
struct r5conf *conf = log->rdev->mddev->private;
struct stripe_head *sh;
struct r5l_payload_data_parity *payload;
int disk_index;
sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
while (1) {
payload = page_address(ctx->meta_page) + *offset;
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
raid5_compute_sector(conf,
le64_to_cpu(payload->location), 0,
&disk_index, sh);
sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
sh->dev[disk_index].page, READ, false);
sh->dev[disk_index].log_checksum =
le32_to_cpu(payload->checksum[0]);
set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
ctx->meta_total_blocks += BLOCK_SECTORS;
} else {
disk_index = sh->pd_idx;
sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
sh->dev[disk_index].page, READ, false);
sh->dev[disk_index].log_checksum =
le32_to_cpu(payload->checksum[0]);
set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
if (sh->qd_idx >= 0) {
disk_index = sh->qd_idx;
sync_page_io(log->rdev,
r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
PAGE_SIZE, sh->dev[disk_index].page,
READ, false);
sh->dev[disk_index].log_checksum =
le32_to_cpu(payload->checksum[1]);
set_bit(R5_Wantwrite,
&sh->dev[disk_index].flags);
}
ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
}
*log_offset = r5l_ring_add(log, *log_offset,
le32_to_cpu(payload->size));
*offset += sizeof(struct r5l_payload_data_parity) +
sizeof(__le32) *
(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
break;
}
for (disk_index = 0; disk_index < sh->disks; disk_index++) {
void *addr;
u32 checksum;
if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
continue;
addr = kmap_atomic(sh->dev[disk_index].page);
checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
kunmap_atomic(addr);
if (checksum != sh->dev[disk_index].log_checksum)
goto error;
}
for (disk_index = 0; disk_index < sh->disks; disk_index++) {
struct md_rdev *rdev, *rrdev;
if (!test_and_clear_bit(R5_Wantwrite,
&sh->dev[disk_index].flags))
continue;
/* in case device is broken */
rdev = rcu_dereference(conf->disks[disk_index].rdev);
if (rdev)
sync_page_io(rdev, stripe_sect, PAGE_SIZE,
sh->dev[disk_index].page, WRITE, false);
rrdev = rcu_dereference(conf->disks[disk_index].replacement);
if (rrdev)
sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
sh->dev[disk_index].page, WRITE, false);
}
raid5_release_stripe(sh);
return 0;
error:
for (disk_index = 0; disk_index < sh->disks; disk_index++)
sh->dev[disk_index].flags = 0;
raid5_release_stripe(sh);
return -EINVAL;
}
static int r5l_recovery_flush_one_meta(struct r5l_log *log,
struct r5l_recovery_ctx *ctx)
{
struct r5conf *conf = log->rdev->mddev->private;
struct r5l_payload_data_parity *payload;
struct r5l_meta_block *mb;
int offset;
sector_t log_offset;
sector_t stripe_sector;
mb = page_address(ctx->meta_page);
offset = sizeof(struct r5l_meta_block);
log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
while (offset < le32_to_cpu(mb->meta_size)) {
int dd;
payload = (void *)mb + offset;
stripe_sector = raid5_compute_sector(conf,
le64_to_cpu(payload->location), 0, &dd, NULL);
if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
&offset, &log_offset))
return -EINVAL;
}
return 0;
}
/* copy data/parity from log to raid disks */
static void r5l_recovery_flush_log(struct r5l_log *log,
struct r5l_recovery_ctx *ctx)
{
while (1) {
if (r5l_read_meta_block(log, ctx))
return;
if (r5l_recovery_flush_one_meta(log, ctx))
return;
ctx->seq++;
ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
}
}
static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
u64 seq)
{
struct page *page;
struct r5l_meta_block *mb;
u32 crc;
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page)
return -ENOMEM;
mb = page_address(page);
mb->magic = cpu_to_le32(R5LOG_MAGIC);
mb->version = R5LOG_VERSION;
mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
mb->seq = cpu_to_le64(seq);
mb->position = cpu_to_le64(pos);
crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
mb->checksum = cpu_to_le32(crc);
if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
__free_page(page);
return -EIO;
}
__free_page(page);
return 0;
}
static int r5l_recovery_log(struct r5l_log *log)
{
struct r5l_recovery_ctx ctx;
ctx.pos = log->last_checkpoint;
ctx.seq = log->last_cp_seq;
ctx.meta_page = alloc_page(GFP_KERNEL);
if (!ctx.meta_page)
return -ENOMEM;
r5l_recovery_flush_log(log, &ctx);
__free_page(ctx.meta_page);
/*
* we did a recovery. Now ctx.pos points to an invalid meta block. The
* new log will start here, but we can't let the superblock point to the
* last valid meta block. The log might look like:
* | meta 1| meta 2| meta 3|
* meta 1 is valid, meta 2 is invalid, meta 3 could be valid. If the
* superblock pointed to meta 1 and we wrote a new valid meta 2n, then
* after another crash the new recovery would again start from meta 1;
* since meta 2n is valid now, recovery would think meta 3 is valid too,
* which is wrong. The solution is to create the new meta in meta 2's slot
* with seq == meta 1's seq + 10 and let the superblock point to it. That
* recovery will not treat meta 3 as a valid meta, because its seq doesn't
* match
*/
if (ctx.seq > log->last_cp_seq + 1) {
int ret;
ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
if (ret)
return ret;
log->seq = ctx.seq + 11;
log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
r5l_write_super(log, ctx.pos);
} else {
log->log_start = ctx.pos;
log->seq = ctx.seq;
}
return 0;
}
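/*
* Worked example of the sequence bump (not part of the patch):
* last_cp_seq == 100 and recovery replays meta blocks with seq 100..104,
* so ctx.seq ends at 105 (> 101). An empty meta block with seq 115
* (ctx.seq + 10) is written at ctx.pos and log->seq becomes 116; a stale
* meta block sitting after ctx.pos still carries a seq near 105, so a
* later recovery cannot mistake it for part of the new log.
*/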
static void r5l_write_super(struct r5l_log *log, sector_t cp)
{
struct mddev *mddev = log->rdev->mddev;
log->rdev->journal_tail = cp;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
static int r5l_load_log(struct r5l_log *log)
{
struct md_rdev *rdev = log->rdev;
struct page *page;
struct r5l_meta_block *mb;
sector_t cp = log->rdev->journal_tail;
u32 stored_crc, expected_crc;
bool create_super = false;
int ret;
/* Make sure it's valid */
if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
cp = 0;
page = alloc_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
ret = -EIO;
goto ioerr;
}
mb = page_address(page);
if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
mb->version != R5LOG_VERSION) {
create_super = true;
goto create;
}
stored_crc = le32_to_cpu(mb->checksum);
mb->checksum = 0;
expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
if (stored_crc != expected_crc) {
create_super = true;
goto create;
}
if (le64_to_cpu(mb->position) != cp) {
create_super = true;
goto create;
}
create:
if (create_super) {
log->last_cp_seq = prandom_u32();
cp = 0;
/*
* Make sure the super points to the correct address. The log might
* get data very soon. If the super doesn't have the correct log tail
* address, recovery can't find the log
*/
r5l_write_super(log, cp);
} else
log->last_cp_seq = le64_to_cpu(mb->seq);
log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
log->max_free_space = RECLAIM_MAX_FREE_SPACE;
log->last_checkpoint = cp;
__free_page(page);
return r5l_recovery_log(log);
ioerr:
__free_page(page);
return ret;
}
int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
struct r5l_log *log;
if (PAGE_SIZE != 4096)
return -EINVAL;
log = kzalloc(sizeof(*log), GFP_KERNEL);
if (!log)
return -ENOMEM;
log->rdev = rdev;
log->need_cache_flush = (rdev->bdev->bd_disk->queue->flush_flags != 0);
log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
sizeof(rdev->mddev->uuid));
mutex_init(&log->io_mutex);
spin_lock_init(&log->io_list_lock);
INIT_LIST_HEAD(&log->running_ios);
INIT_LIST_HEAD(&log->io_end_ios);
INIT_LIST_HEAD(&log->flushing_ios);
INIT_LIST_HEAD(&log->finished_ios);
bio_init(&log->flush_bio);
log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
if (!log->io_kc)
goto io_kc;
log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
log->rdev->mddev, "reclaim");
if (!log->reclaim_thread)
goto reclaim_thread;
init_waitqueue_head(&log->iounit_wait);
INIT_LIST_HEAD(&log->no_space_stripes);
spin_lock_init(&log->no_space_stripes_lock);
if (r5l_load_log(log))
goto error;
conf->log = log;
return 0;
error:
md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
kmem_cache_destroy(log->io_kc);
io_kc:
kfree(log);
return -EINVAL;
}
void r5l_exit_log(struct r5l_log *log)
{
md_unregister_thread(&log->reclaim_thread);
kmem_cache_destroy(log->io_kc);
kfree(log);
}
...@@ -353,7 +353,7 @@ static void release_inactive_stripe_list(struct r5conf *conf, ...@@ -353,7 +353,7 @@ static void release_inactive_stripe_list(struct r5conf *conf,
struct list_head *list = &temp_inactive_list[size - 1]; struct list_head *list = &temp_inactive_list[size - 1];
/* /*
* We don't hold any lock here yet, get_active_stripe() might * We don't hold any lock here yet, raid5_get_active_stripe() might
* remove stripes from the list * remove stripes from the list
*/ */
if (!list_empty_careful(list)) { if (!list_empty_careful(list)) {
...@@ -413,7 +413,7 @@ static int release_stripe_list(struct r5conf *conf, ...@@ -413,7 +413,7 @@ static int release_stripe_list(struct r5conf *conf,
return count; return count;
} }
static void release_stripe(struct stripe_head *sh) void raid5_release_stripe(struct stripe_head *sh)
{ {
struct r5conf *conf = sh->raid_conf; struct r5conf *conf = sh->raid_conf;
unsigned long flags; unsigned long flags;
...@@ -658,8 +658,8 @@ static int has_failed(struct r5conf *conf) ...@@ -658,8 +658,8 @@ static int has_failed(struct r5conf *conf)
return 0; return 0;
} }
static struct stripe_head * struct stripe_head *
get_active_stripe(struct r5conf *conf, sector_t sector, raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce) int previous, int noblock, int noquiesce)
{ {
struct stripe_head *sh; struct stripe_head *sh;
...@@ -755,6 +755,10 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2) ...@@ -755,6 +755,10 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
/* Only freshly new full stripe normal write stripe can be added to a batch list */ /* Only freshly new full stripe normal write stripe can be added to a batch list */
static bool stripe_can_batch(struct stripe_head *sh) static bool stripe_can_batch(struct stripe_head *sh)
{ {
struct r5conf *conf = sh->raid_conf;
if (conf->log)
return false;
return test_bit(STRIPE_BATCH_READY, &sh->state) && return test_bit(STRIPE_BATCH_READY, &sh->state) &&
!test_bit(STRIPE_BITMAP_PENDING, &sh->state) && !test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
is_full_stripe_write(sh); is_full_stripe_write(sh);
...@@ -858,7 +862,7 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh ...@@ -858,7 +862,7 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
unlock_out: unlock_out:
unlock_two_stripes(head, sh); unlock_two_stripes(head, sh);
out: out:
release_stripe(head); raid5_release_stripe(head);
} }
/* Determine if 'data_offset' or 'new_data_offset' should be used /* Determine if 'data_offset' or 'new_data_offset' should be used
...@@ -895,6 +899,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -895,6 +899,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
might_sleep(); might_sleep();
if (r5l_write_stripe(conf->log, sh) == 0)
return;
for (i = disks; i--; ) { for (i = disks; i--; ) {
int rw; int rw;
int replace_only = 0; int replace_only = 0;
...@@ -1208,7 +1214,7 @@ static void ops_complete_biofill(void *stripe_head_ref) ...@@ -1208,7 +1214,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
return_io(&return_bi); return_io(&return_bi);
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh); raid5_release_stripe(sh);
} }
static void ops_run_biofill(struct stripe_head *sh) static void ops_run_biofill(struct stripe_head *sh)
...@@ -1271,7 +1277,7 @@ static void ops_complete_compute(void *stripe_head_ref) ...@@ -1271,7 +1277,7 @@ static void ops_complete_compute(void *stripe_head_ref)
if (sh->check_state == check_state_compute_run) if (sh->check_state == check_state_compute_run)
sh->check_state = check_state_compute_result; sh->check_state = check_state_compute_result;
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh); raid5_release_stripe(sh);
} }
/* return a pointer to the address conversion region of the scribble buffer */ /* return a pointer to the address conversion region of the scribble buffer */
...@@ -1697,7 +1703,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref) ...@@ -1697,7 +1703,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
} }
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh); raid5_release_stripe(sh);
} }
static void static void
...@@ -1855,7 +1861,7 @@ static void ops_complete_check(void *stripe_head_ref) ...@@ -1855,7 +1861,7 @@ static void ops_complete_check(void *stripe_head_ref)
sh->check_state = check_state_check_result; sh->check_state = check_state_check_result;
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh); raid5_release_stripe(sh);
} }
static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
...@@ -2017,7 +2023,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) ...@@ -2017,7 +2023,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
/* we just created an active stripe so... */ /* we just created an active stripe so... */
atomic_inc(&conf->active_stripes); atomic_inc(&conf->active_stripes);
release_stripe(sh); raid5_release_stripe(sh);
conf->max_nr_stripes++; conf->max_nr_stripes++;
return 1; return 1;
} }
...@@ -2236,7 +2242,7 @@ static int resize_stripes(struct r5conf *conf, int newsize) ...@@ -2236,7 +2242,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
if (!p) if (!p)
err = -ENOMEM; err = -ENOMEM;
} }
release_stripe(nsh); raid5_release_stripe(nsh);
} }
/* critical section pass, GFP_NOIO no longer needed */ /* critical section pass, GFP_NOIO no longer needed */
...@@ -2394,7 +2400,7 @@ static void raid5_end_read_request(struct bio * bi) ...@@ -2394,7 +2400,7 @@ static void raid5_end_read_request(struct bio * bi)
rdev_dec_pending(rdev, conf->mddev); rdev_dec_pending(rdev, conf->mddev);
clear_bit(R5_LOCKED, &sh->dev[i].flags); clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh); raid5_release_stripe(sh);
} }
static void raid5_end_write_request(struct bio *bi) static void raid5_end_write_request(struct bio *bi)
...@@ -2468,14 +2474,12 @@ static void raid5_end_write_request(struct bio *bi) ...@@ -2468,14 +2474,12 @@ static void raid5_end_write_request(struct bio *bi)
if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags)) if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
clear_bit(R5_LOCKED, &sh->dev[i].flags); clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh); raid5_release_stripe(sh);
if (sh->batch_head && sh != sh->batch_head) if (sh->batch_head && sh != sh->batch_head)
release_stripe(sh->batch_head); raid5_release_stripe(sh->batch_head);
} }
static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
static void raid5_build_block(struct stripe_head *sh, int i, int previous) static void raid5_build_block(struct stripe_head *sh, int i, int previous)
{ {
struct r5dev *dev = &sh->dev[i]; struct r5dev *dev = &sh->dev[i];
...@@ -2491,7 +2495,7 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous) ...@@ -2491,7 +2495,7 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
dev->rreq.bi_private = sh; dev->rreq.bi_private = sh;
dev->flags = 0; dev->flags = 0;
dev->sector = compute_blocknr(sh, i, previous); dev->sector = raid5_compute_blocknr(sh, i, previous);
} }
static void error(struct mddev *mddev, struct md_rdev *rdev) static void error(struct mddev *mddev, struct md_rdev *rdev)
...@@ -2524,7 +2528,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev) ...@@ -2524,7 +2528,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
* Input: a 'big' sector number, * Input: a 'big' sector number,
* Output: index of the data and parity disk, and the sector # in them. * Output: index of the data and parity disk, and the sector # in them.
*/ */
static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
int previous, int *dd_idx, int previous, int *dd_idx,
struct stripe_head *sh) struct stripe_head *sh)
{ {
...@@ -2726,7 +2730,7 @@ static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector, ...@@ -2726,7 +2730,7 @@ static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
return new_sector; return new_sector;
} }
static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
{ {
struct r5conf *conf = sh->raid_conf; struct r5conf *conf = sh->raid_conf;
int raid_disks = sh->disks; int raid_disks = sh->disks;
...@@ -3098,6 +3102,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, ...@@ -3098,6 +3102,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bi) if (bi)
bitmap_end = 1; bitmap_end = 1;
r5l_stripe_write_finished(sh);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap); wake_up(&conf->wait_for_overlap);
...@@ -3141,6 +3147,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, ...@@ -3141,6 +3147,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
* the data has not reached the cache yet. * the data has not reached the cache yet.
*/ */
if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
s->failed > conf->max_degraded &&
(!test_bit(R5_Insync, &sh->dev[i].flags) || (!test_bit(R5_Insync, &sh->dev[i].flags) ||
test_bit(R5_ReadError, &sh->dev[i].flags))) { test_bit(R5_ReadError, &sh->dev[i].flags))) {
spin_lock_irq(&sh->stripe_lock); spin_lock_irq(&sh->stripe_lock);
...@@ -3497,6 +3504,9 @@ static void handle_stripe_clean_event(struct r5conf *conf, ...@@ -3497,6 +3504,9 @@ static void handle_stripe_clean_event(struct r5conf *conf,
WARN_ON(test_bit(R5_SkipCopy, &dev->flags)); WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
WARN_ON(dev->page != dev->orig_page); WARN_ON(dev->page != dev->orig_page);
} }
r5l_stripe_write_finished(sh);
if (!discard_pending && if (!discard_pending &&
test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) { test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
int hash; int hash;
...@@ -3939,10 +3949,10 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) ...@@ -3939,10 +3949,10 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
struct stripe_head *sh2; struct stripe_head *sh2;
struct async_submit_ctl submit; struct async_submit_ctl submit;
sector_t bn = compute_blocknr(sh, i, 1); sector_t bn = raid5_compute_blocknr(sh, i, 1);
sector_t s = raid5_compute_sector(conf, bn, 0, sector_t s = raid5_compute_sector(conf, bn, 0,
&dd_idx, NULL); &dd_idx, NULL);
sh2 = get_active_stripe(conf, s, 0, 1, 1); sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
if (sh2 == NULL) if (sh2 == NULL)
/* so far only the early blocks of this stripe /* so far only the early blocks of this stripe
* have been requested. When later blocks * have been requested. When later blocks
...@@ -3952,7 +3962,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) ...@@ -3952,7 +3962,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
if (!test_bit(STRIPE_EXPANDING, &sh2->state) || if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
/* must have already done this block */ /* must have already done this block */
release_stripe(sh2); raid5_release_stripe(sh2);
continue; continue;
} }
...@@ -3973,7 +3983,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) ...@@ -3973,7 +3983,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
set_bit(STRIPE_EXPAND_READY, &sh2->state); set_bit(STRIPE_EXPAND_READY, &sh2->state);
set_bit(STRIPE_HANDLE, &sh2->state); set_bit(STRIPE_HANDLE, &sh2->state);
} }
release_stripe(sh2); raid5_release_stripe(sh2);
} }
/* done submitting copies, wait for them to complete */ /* done submitting copies, wait for them to complete */
...@@ -4008,6 +4018,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) ...@@ -4008,6 +4018,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head; s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
s->failed_num[0] = -1; s->failed_num[0] = -1;
s->failed_num[1] = -1; s->failed_num[1] = -1;
s->log_failed = r5l_log_disk_error(conf);
/* Now to look around and see what can be done */ /* Now to look around and see what can be done */
rcu_read_lock(); rcu_read_lock();
...@@ -4259,7 +4270,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, ...@@ -4259,7 +4270,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
if (handle_flags == 0 || if (handle_flags == 0 ||
sh->state & handle_flags) sh->state & handle_flags)
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh); raid5_release_stripe(sh);
} }
spin_lock_irq(&head_sh->stripe_lock); spin_lock_irq(&head_sh->stripe_lock);
head_sh->batch_head = NULL; head_sh->batch_head = NULL;
...@@ -4320,6 +4331,9 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -4320,6 +4331,9 @@ static void handle_stripe(struct stripe_head *sh)
analyse_stripe(sh, &s); analyse_stripe(sh, &s);
if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
goto finish;
if (s.handle_bad_blocks) { if (s.handle_bad_blocks) {
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
goto finish; goto finish;
...@@ -4348,7 +4362,7 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -4348,7 +4362,7 @@ static void handle_stripe(struct stripe_head *sh)
/* check if the array has lost more than max_degraded devices and, /* check if the array has lost more than max_degraded devices and,
* if so, some requests might need to be failed. * if so, some requests might need to be failed.
*/ */
if (s.failed > conf->max_degraded) { if (s.failed > conf->max_degraded || s.log_failed) {
sh->check_state = 0; sh->check_state = 0;
sh->reconstruct_state = 0; sh->reconstruct_state = 0;
break_stripe_batch_list(sh, 0); break_stripe_batch_list(sh, 0);
...@@ -4506,7 +4520,7 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -4506,7 +4520,7 @@ static void handle_stripe(struct stripe_head *sh)
/* Finish reconstruct operations initiated by the expansion process */ /* Finish reconstruct operations initiated by the expansion process */
if (sh->reconstruct_state == reconstruct_state_result) { if (sh->reconstruct_state == reconstruct_state_result) {
struct stripe_head *sh_src struct stripe_head *sh_src
= get_active_stripe(conf, sh->sector, 1, 1, 1); = raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) { if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
/* sh cannot be written until sh_src has been read. /* sh cannot be written until sh_src has been read.
* so arrange for sh to be delayed a little * so arrange for sh to be delayed a little
...@@ -4516,11 +4530,11 @@ static void handle_stripe(struct stripe_head *sh) ...@@ -4516,11 +4530,11 @@ static void handle_stripe(struct stripe_head *sh)
if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
&sh_src->state)) &sh_src->state))
atomic_inc(&conf->preread_active_stripes); atomic_inc(&conf->preread_active_stripes);
release_stripe(sh_src); raid5_release_stripe(sh_src);
goto finish; goto finish;
} }
if (sh_src) if (sh_src)
release_stripe(sh_src); raid5_release_stripe(sh_src);
sh->reconstruct_state = reconstruct_state_idle; sh->reconstruct_state = reconstruct_state_idle;
clear_bit(STRIPE_EXPANDING, &sh->state); clear_bit(STRIPE_EXPANDING, &sh->state);
...@@ -5012,7 +5026,7 @@ static void release_stripe_plug(struct mddev *mddev, ...@@ -5012,7 +5026,7 @@ static void release_stripe_plug(struct mddev *mddev,
struct raid5_plug_cb *cb; struct raid5_plug_cb *cb;
if (!blk_cb) { if (!blk_cb) {
release_stripe(sh); raid5_release_stripe(sh);
return; return;
} }
...@@ -5028,7 +5042,7 @@ static void release_stripe_plug(struct mddev *mddev, ...@@ -5028,7 +5042,7 @@ static void release_stripe_plug(struct mddev *mddev,
if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)) if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
list_add_tail(&sh->lru, &cb->list); list_add_tail(&sh->lru, &cb->list);
else else
release_stripe(sh); raid5_release_stripe(sh);
} }
static void make_discard_request(struct mddev *mddev, struct bio *bi) static void make_discard_request(struct mddev *mddev, struct bio *bi)
...@@ -5063,12 +5077,12 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) ...@@ -5063,12 +5077,12 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
DEFINE_WAIT(w); DEFINE_WAIT(w);
int d; int d;
again: again:
sh = get_active_stripe(conf, logical_sector, 0, 0, 0); sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
prepare_to_wait(&conf->wait_for_overlap, &w, prepare_to_wait(&conf->wait_for_overlap, &w,
TASK_UNINTERRUPTIBLE); TASK_UNINTERRUPTIBLE);
set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags); set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
if (test_bit(STRIPE_SYNCING, &sh->state)) { if (test_bit(STRIPE_SYNCING, &sh->state)) {
release_stripe(sh); raid5_release_stripe(sh);
schedule(); schedule();
goto again; goto again;
} }
...@@ -5080,7 +5094,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) ...@@ -5080,7 +5094,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
if (sh->dev[d].towrite || sh->dev[d].toread) { if (sh->dev[d].towrite || sh->dev[d].toread) {
set_bit(R5_Overlap, &sh->dev[d].flags); set_bit(R5_Overlap, &sh->dev[d].flags);
spin_unlock_irq(&sh->stripe_lock); spin_unlock_irq(&sh->stripe_lock);
release_stripe(sh); raid5_release_stripe(sh);
schedule(); schedule();
goto again; goto again;
} }
...@@ -5136,9 +5150,16 @@ static void make_request(struct mddev *mddev, struct bio * bi) ...@@ -5136,9 +5150,16 @@ static void make_request(struct mddev *mddev, struct bio * bi)
bool do_prepare; bool do_prepare;
if (unlikely(bi->bi_rw & REQ_FLUSH)) { if (unlikely(bi->bi_rw & REQ_FLUSH)) {
int ret = r5l_handle_flush_request(conf->log, bi);
if (ret == 0)
return;
if (ret == -ENODEV) {
md_flush_request(mddev, bi); md_flush_request(mddev, bi);
return; return;
} }
/* ret == -EAGAIN, fallback */
}
md_write_start(mddev, bi); md_write_start(mddev, bi);
...@@ -5210,7 +5231,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) ...@@ -5210,7 +5231,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
(unsigned long long)new_sector, (unsigned long long)new_sector,
(unsigned long long)logical_sector); (unsigned long long)logical_sector);
sh = get_active_stripe(conf, new_sector, previous, sh = raid5_get_active_stripe(conf, new_sector, previous,
(bi->bi_rw&RWA_MASK), 0); (bi->bi_rw&RWA_MASK), 0);
if (sh) { if (sh) {
if (unlikely(previous)) { if (unlikely(previous)) {
...@@ -5231,7 +5252,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) ...@@ -5231,7 +5252,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
must_retry = 1; must_retry = 1;
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
if (must_retry) { if (must_retry) {
release_stripe(sh); raid5_release_stripe(sh);
schedule(); schedule();
do_prepare = true; do_prepare = true;
goto retry; goto retry;
...@@ -5241,14 +5262,14 @@ static void make_request(struct mddev *mddev, struct bio * bi) ...@@ -5241,14 +5262,14 @@ static void make_request(struct mddev *mddev, struct bio * bi)
/* Might have got the wrong stripe_head /* Might have got the wrong stripe_head
* by accident * by accident
*/ */
release_stripe(sh); raid5_release_stripe(sh);
goto retry; goto retry;
} }
if (rw == WRITE && if (rw == WRITE &&
logical_sector >= mddev->suspend_lo && logical_sector >= mddev->suspend_lo &&
logical_sector < mddev->suspend_hi) { logical_sector < mddev->suspend_hi) {
release_stripe(sh); raid5_release_stripe(sh);
/* As the suspend_* range is controlled by /* As the suspend_* range is controlled by
* userspace, we want an interruptible * userspace, we want an interruptible
* wait. * wait.
...@@ -5271,7 +5292,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) ...@@ -5271,7 +5292,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
* and wait a while * and wait a while
*/ */
md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->thread);
release_stripe(sh); raid5_release_stripe(sh);
schedule(); schedule();
do_prepare = true; do_prepare = true;
goto retry; goto retry;
...@@ -5458,7 +5479,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk ...@@ -5458,7 +5479,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
int j; int j;
int skipped_disk = 0; int skipped_disk = 0;
sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
set_bit(STRIPE_EXPANDING, &sh->state); set_bit(STRIPE_EXPANDING, &sh->state);
atomic_inc(&conf->reshape_stripes); atomic_inc(&conf->reshape_stripes);
/* If any of this stripe is beyond the end of the old /* If any of this stripe is beyond the end of the old
...@@ -5471,7 +5492,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk ...@@ -5471,7 +5492,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
if (conf->level == 6 && if (conf->level == 6 &&
j == sh->qd_idx) j == sh->qd_idx)
continue; continue;
s = compute_blocknr(sh, j, 0); s = raid5_compute_blocknr(sh, j, 0);
if (s < raid5_size(mddev, 0, 0)) { if (s < raid5_size(mddev, 0, 0)) {
skipped_disk = 1; skipped_disk = 1;
continue; continue;
...@@ -5507,10 +5528,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk ...@@ -5507,10 +5528,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
if (last_sector >= mddev->dev_sectors) if (last_sector >= mddev->dev_sectors)
last_sector = mddev->dev_sectors - 1; last_sector = mddev->dev_sectors - 1;
while (first_sector <= last_sector) { while (first_sector <= last_sector) {
sh = get_active_stripe(conf, first_sector, 1, 0, 1); sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
set_bit(STRIPE_EXPAND_SOURCE, &sh->state); set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh); raid5_release_stripe(sh);
first_sector += STRIPE_SECTORS; first_sector += STRIPE_SECTORS;
} }
/* Now that the sources are clearly marked, we can release /* Now that the sources are clearly marked, we can release
...@@ -5519,7 +5540,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk ...@@ -5519,7 +5540,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
while (!list_empty(&stripes)) { while (!list_empty(&stripes)) {
sh = list_entry(stripes.next, struct stripe_head, lru); sh = list_entry(stripes.next, struct stripe_head, lru);
list_del_init(&sh->lru); list_del_init(&sh->lru);
release_stripe(sh); raid5_release_stripe(sh);
} }
/* If this takes us to the resync_max point where we have to pause, /* If this takes us to the resync_max point where we have to pause,
* then we need to write out the superblock. * then we need to write out the superblock.
...@@ -5615,11 +5636,11 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int ...@@ -5615,11 +5636,11 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
} }
bitmap_cond_end_sync(mddev->bitmap, sector_nr); bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
sh = get_active_stripe(conf, sector_nr, 0, 1, 0); sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
if (sh == NULL) { if (sh == NULL) {
sh = get_active_stripe(conf, sector_nr, 0, 0, 0); sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
/* make sure we don't swamp the stripe cache if someone else /* make sure we don't swamp the stripe cache if someone else
* is trying to get access * is trying to get access
*/ */
...@@ -5643,7 +5664,7 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int ...@@ -5643,7 +5664,7 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
set_bit(STRIPE_SYNC_REQUESTED, &sh->state); set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh); raid5_release_stripe(sh);
return STRIPE_SECTORS; return STRIPE_SECTORS;
} }
...@@ -5682,7 +5703,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) ...@@ -5682,7 +5703,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
/* already done this stripe */ /* already done this stripe */
continue; continue;
sh = get_active_stripe(conf, sector, 0, 1, 1); sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
if (!sh) { if (!sh) {
/* failed to get a stripe - must wait */ /* failed to get a stripe - must wait */
...@@ -5692,7 +5713,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) ...@@ -5692,7 +5713,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
} }
if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) { if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
release_stripe(sh); raid5_release_stripe(sh);
raid5_set_bi_processed_stripes(raid_bio, scnt); raid5_set_bi_processed_stripes(raid_bio, scnt);
conf->retry_read_aligned = raid_bio; conf->retry_read_aligned = raid_bio;
return handled; return handled;
...@@ -5700,7 +5721,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) ...@@ -5700,7 +5721,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags); set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
handle_stripe(sh); handle_stripe(sh);
release_stripe(sh); raid5_release_stripe(sh);
handled++; handled++;
} }
remaining = raid5_dec_bi_active_stripes(raid_bio); remaining = raid5_dec_bi_active_stripes(raid_bio);
...@@ -5730,8 +5751,12 @@ static int handle_active_stripes(struct r5conf *conf, int group, ...@@ -5730,8 +5751,12 @@ static int handle_active_stripes(struct r5conf *conf, int group,
for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
if (!list_empty(temp_inactive_list + i)) if (!list_empty(temp_inactive_list + i))
break; break;
if (i == NR_STRIPE_HASH_LOCKS) if (i == NR_STRIPE_HASH_LOCKS) {
spin_unlock_irq(&conf->device_lock);
r5l_flush_stripe_to_raid(conf->log);
spin_lock_irq(&conf->device_lock);
return batch_size; return batch_size;
}
release_inactive = true; release_inactive = true;
} }
spin_unlock_irq(&conf->device_lock); spin_unlock_irq(&conf->device_lock);
...@@ -5739,6 +5764,7 @@ static int handle_active_stripes(struct r5conf *conf, int group, ...@@ -5739,6 +5764,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
release_inactive_stripe_list(conf, temp_inactive_list, release_inactive_stripe_list(conf, temp_inactive_list,
NR_STRIPE_HASH_LOCKS); NR_STRIPE_HASH_LOCKS);
r5l_flush_stripe_to_raid(conf->log);
if (release_inactive) { if (release_inactive) {
spin_lock_irq(&conf->device_lock); spin_lock_irq(&conf->device_lock);
return 0; return 0;
...@@ -5746,6 +5772,7 @@ static int handle_active_stripes(struct r5conf *conf, int group, ...@@ -5746,6 +5772,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
for (i = 0; i < batch_size; i++) for (i = 0; i < batch_size; i++)
handle_stripe(batch[i]); handle_stripe(batch[i]);
r5l_write_stripe_run(conf->log);
cond_resched(); cond_resched();
...@@ -5879,6 +5906,8 @@ static void raid5d(struct md_thread *thread) ...@@ -5879,6 +5906,8 @@ static void raid5d(struct md_thread *thread)
mutex_unlock(&conf->cache_size_mutex); mutex_unlock(&conf->cache_size_mutex);
} }
r5l_flush_stripe_to_raid(conf->log);
async_tx_issue_pending_all(); async_tx_issue_pending_all();
blk_finish_plug(&plug); blk_finish_plug(&plug);
...@@ -6316,8 +6345,11 @@ static void raid5_free_percpu(struct r5conf *conf) ...@@ -6316,8 +6345,11 @@ static void raid5_free_percpu(struct r5conf *conf)
static void free_conf(struct r5conf *conf) static void free_conf(struct r5conf *conf)
{ {
if (conf->log)
r5l_exit_log(conf->log);
if (conf->shrinker.seeks) if (conf->shrinker.seeks)
unregister_shrinker(&conf->shrinker); unregister_shrinker(&conf->shrinker);
free_thread_groups(conf); free_thread_groups(conf);
shrink_stripes(conf); shrink_stripes(conf);
raid5_free_percpu(conf); raid5_free_percpu(conf);
...@@ -6530,7 +6562,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) ...@@ -6530,7 +6562,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
raid_disk = rdev->raid_disk; raid_disk = rdev->raid_disk;
if (raid_disk >= max_disks if (raid_disk >= max_disks
|| raid_disk < 0) || raid_disk < 0 || test_bit(Journal, &rdev->flags))
continue; continue;
disk = conf->disks + raid_disk; disk = conf->disks + raid_disk;
...@@ -6650,6 +6682,7 @@ static int run(struct mddev *mddev) ...@@ -6650,6 +6682,7 @@ static int run(struct mddev *mddev)
int working_disks = 0; int working_disks = 0;
int dirty_parity_disks = 0; int dirty_parity_disks = 0;
struct md_rdev *rdev; struct md_rdev *rdev;
struct md_rdev *journal_dev = NULL;
sector_t reshape_offset = 0; sector_t reshape_offset = 0;
int i; int i;
long long min_offset_diff = 0; long long min_offset_diff = 0;
...@@ -6662,6 +6695,11 @@ static int run(struct mddev *mddev) ...@@ -6662,6 +6695,11 @@ static int run(struct mddev *mddev)
rdev_for_each(rdev, mddev) { rdev_for_each(rdev, mddev) {
long long diff; long long diff;
if (test_bit(Journal, &rdev->flags)) {
journal_dev = rdev;
continue;
}
if (rdev->raid_disk < 0) if (rdev->raid_disk < 0)
continue; continue;
diff = (rdev->new_data_offset - rdev->data_offset); diff = (rdev->new_data_offset - rdev->data_offset);
...@@ -6695,6 +6733,12 @@ static int run(struct mddev *mddev) ...@@ -6695,6 +6733,12 @@ static int run(struct mddev *mddev)
int chunk_sectors; int chunk_sectors;
int new_data_disks; int new_data_disks;
if (journal_dev) {
printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n",
mdname(mddev));
return -EINVAL;
}
if (mddev->new_level != mddev->level) { if (mddev->new_level != mddev->level) {
printk(KERN_ERR "md/raid:%s: unsupported reshape " printk(KERN_ERR "md/raid:%s: unsupported reshape "
"required - aborting.\n", "required - aborting.\n",
...@@ -6770,6 +6814,13 @@ static int run(struct mddev *mddev) ...@@ -6770,6 +6814,13 @@ static int run(struct mddev *mddev)
if (IS_ERR(conf)) if (IS_ERR(conf))
return PTR_ERR(conf); return PTR_ERR(conf);
if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !journal_dev) {
printk(KERN_ERR "md/raid:%s: journal disk is missing, force array readonly\n",
mdname(mddev));
mddev->ro = 1;
set_disk_ro(mddev->gendisk, 1);
}
conf->min_offset_diff = min_offset_diff; conf->min_offset_diff = min_offset_diff;
mddev->thread = conf->thread; mddev->thread = conf->thread;
conf->thread = NULL; conf->thread = NULL;
...@@ -6973,6 +7024,14 @@ static int run(struct mddev *mddev) ...@@ -6973,6 +7024,14 @@ static int run(struct mddev *mddev)
mddev->queue); mddev->queue);
} }
if (journal_dev) {
char b[BDEVNAME_SIZE];
printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
mdname(mddev), bdevname(journal_dev->bdev, b));
r5l_init_log(conf, journal_dev);
}
return 0; return 0;
abort: abort:
md_unregister_thread(&mddev->thread); md_unregister_thread(&mddev->thread);
...@@ -7082,6 +7141,15 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -7082,6 +7141,15 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct disk_info *p = conf->disks + number; struct disk_info *p = conf->disks + number;
print_raid5_conf(conf); print_raid5_conf(conf);
if (test_bit(Journal, &rdev->flags)) {
/*
* The journal disk is not removable, but we need to give the other
* disks a chance to update their superblocks. Otherwise the journal
* disk would be considered 'fresh'.
*/
set_bit(MD_CHANGE_DEVS, &mddev->flags);
return -EINVAL;
}
if (rdev == p->rdev) if (rdev == p->rdev)
rdevp = &p->rdev; rdevp = &p->rdev;
else if (rdev == p->replacement) else if (rdev == p->replacement)
...@@ -7144,6 +7212,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) ...@@ -7144,6 +7212,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int first = 0; int first = 0;
int last = conf->raid_disks - 1; int last = conf->raid_disks - 1;
if (test_bit(Journal, &rdev->flags))
return -EINVAL;
if (mddev->recovery_disabled == conf->recovery_disabled) if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY; return -EBUSY;
...@@ -7205,6 +7275,8 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors) ...@@ -7205,6 +7275,8 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
sector_t newsize; sector_t newsize;
struct r5conf *conf = mddev->private; struct r5conf *conf = mddev->private;
if (conf->log)
return -EINVAL;
sectors &= ~((sector_t)conf->chunk_sectors - 1); sectors &= ~((sector_t)conf->chunk_sectors - 1);
newsize = raid5_size(mddev, sectors, mddev->raid_disks); newsize = raid5_size(mddev, sectors, mddev->raid_disks);
if (mddev->external_size && if (mddev->external_size &&
...@@ -7256,6 +7328,8 @@ static int check_reshape(struct mddev *mddev) ...@@ -7256,6 +7328,8 @@ static int check_reshape(struct mddev *mddev)
{ {
struct r5conf *conf = mddev->private; struct r5conf *conf = mddev->private;
if (conf->log)
return -EINVAL;
if (mddev->delta_disks == 0 && if (mddev->delta_disks == 0 &&
mddev->new_layout == mddev->layout && mddev->new_layout == mddev->layout &&
mddev->new_chunk_sectors == mddev->chunk_sectors) mddev->new_chunk_sectors == mddev->chunk_sectors)
...@@ -7532,6 +7606,7 @@ static void raid5_quiesce(struct mddev *mddev, int state) ...@@ -7532,6 +7606,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
unlock_all_device_hash_locks_irq(conf); unlock_all_device_hash_locks_irq(conf);
break; break;
} }
r5l_quiesce(conf->log, state);
} }
static void *raid45_takeover_raid0(struct mddev *mddev, int level) static void *raid45_takeover_raid0(struct mddev *mddev, int level)
......
...@@ -223,6 +223,9 @@ struct stripe_head { ...@@ -223,6 +223,9 @@ struct stripe_head {
struct stripe_head *batch_head; /* protected by stripe lock */ struct stripe_head *batch_head; /* protected by stripe lock */
spinlock_t batch_lock; /* only header's lock is useful */ spinlock_t batch_lock; /* only header's lock is useful */
struct list_head batch_list; /* protected by head's batch lock*/ struct list_head batch_list; /* protected by head's batch lock*/
struct r5l_io_unit *log_io;
struct list_head log_list;
/** /**
* struct stripe_operations * struct stripe_operations
* @target - STRIPE_OP_COMPUTE_BLK target * @target - STRIPE_OP_COMPUTE_BLK target
...@@ -244,6 +247,7 @@ struct stripe_head { ...@@ -244,6 +247,7 @@ struct stripe_head {
struct bio *toread, *read, *towrite, *written; struct bio *toread, *read, *towrite, *written;
sector_t sector; /* sector of this page */ sector_t sector; /* sector of this page */
unsigned long flags; unsigned long flags;
u32 log_checksum;
} dev[1]; /* allocated with extra space depending of RAID geometry */ } dev[1]; /* allocated with extra space depending of RAID geometry */
}; };
...@@ -268,6 +272,7 @@ struct stripe_head_state { ...@@ -268,6 +272,7 @@ struct stripe_head_state {
struct bio_list return_bi; struct bio_list return_bi;
struct md_rdev *blocked_rdev; struct md_rdev *blocked_rdev;
int handle_bad_blocks; int handle_bad_blocks;
int log_failed;
}; };
/* Flags for struct r5dev.flags */ /* Flags for struct r5dev.flags */
...@@ -340,6 +345,7 @@ enum { ...@@ -340,6 +345,7 @@ enum {
STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add
* to batch yet. * to batch yet.
*/ */
STRIPE_LOG_TRAPPED, /* trapped into log */
}; };
#define STRIPE_EXPAND_SYNC_FLAGS \ #define STRIPE_EXPAND_SYNC_FLAGS \
...@@ -543,6 +549,7 @@ struct r5conf { ...@@ -543,6 +549,7 @@ struct r5conf {
struct r5worker_group *worker_groups; struct r5worker_group *worker_groups;
int group_cnt; int group_cnt;
int worker_cnt_per_group; int worker_cnt_per_group;
struct r5l_log *log;
}; };
...@@ -609,4 +616,21 @@ static inline int algorithm_is_DDF(int layout) ...@@ -609,4 +616,21 @@ static inline int algorithm_is_DDF(int layout)
extern void md_raid5_kick_device(struct r5conf *conf); extern void md_raid5_kick_device(struct r5conf *conf);
extern int raid5_set_cache_size(struct mddev *mddev, int size); extern int raid5_set_cache_size(struct mddev *mddev, int size);
extern sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous);
extern void raid5_release_stripe(struct stripe_head *sh);
extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
int previous, int *dd_idx,
struct stripe_head *sh);
extern struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce);
extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
extern void r5l_exit_log(struct r5l_log *log);
extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
extern void r5l_write_stripe_run(struct r5l_log *log);
extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
extern void r5l_stripe_write_finished(struct stripe_head *sh);
extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
extern void r5l_quiesce(struct r5l_log *log, int state);
extern bool r5l_log_disk_error(struct r5conf *conf);
#endif #endif
...@@ -89,6 +89,12 @@ ...@@ -89,6 +89,12 @@
* read requests will only be sent here in * read requests will only be sent here in
* dire need * dire need
*/ */
#define MD_DISK_JOURNAL 18 /* disk is used as the write journal in RAID-5/6 */
#define MD_DISK_ROLE_SPARE 0xffff
#define MD_DISK_ROLE_FAULTY 0xfffe
#define MD_DISK_ROLE_JOURNAL 0xfffd
#define MD_DISK_ROLE_MAX 0xff00 /* max value of regular disk role */
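A minimal sketch of how a dev_roles[] entry in mdp_superblock_1 could be classified against the special values defined above; classify_role and the enum are illustrative names, not part of md.

/* Illustrative only: classify an mdp_superblock_1 dev_roles[] entry. */
#include <stdint.h>
#include <stdio.h>

enum disk_kind { DISK_DATA, DISK_SPARE, DISK_FAULTY, DISK_JOURNAL };

enum disk_kind classify_role(uint16_t role)
{
	switch (role) {
	case 0xffff: return DISK_SPARE;		/* MD_DISK_ROLE_SPARE */
	case 0xfffe: return DISK_FAULTY;	/* MD_DISK_ROLE_FAULTY */
	case 0xfffd: return DISK_JOURNAL;	/* MD_DISK_ROLE_JOURNAL */
	default:     return DISK_DATA;		/* regular slot, bounded by MD_DISK_ROLE_MAX */
	}
}

int main(void)
{
	printf("0xfffd -> journal? %d\n", classify_role(0xfffd) == DISK_JOURNAL);
	return 0;
}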
typedef struct mdp_device_descriptor_s { typedef struct mdp_device_descriptor_s {
__u32 number; /* 0 Device number in the entire set */ __u32 number; /* 0 Device number in the entire set */
...@@ -252,7 +258,10 @@ struct mdp_superblock_1 { ...@@ -252,7 +258,10 @@ struct mdp_superblock_1 {
__le64 data_offset; /* sector start of data, often 0 */ __le64 data_offset; /* sector start of data, often 0 */
__le64 data_size; /* sectors in this device that can be used for data */ __le64 data_size; /* sectors in this device that can be used for data */
__le64 super_offset; /* sector start of this superblock */ __le64 super_offset; /* sector start of this superblock */
union {
__le64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */ __le64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */
__le64 journal_tail;/* journal tail of journal device (from data_offset) */
};
__le32 dev_number; /* permanent identifier of this device - not role in raid */ __le32 dev_number; /* permanent identifier of this device - not role in raid */
__le32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ __le32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */
__u8 device_uuid[16]; /* user-space setable, ignored by kernel */ __u8 device_uuid[16]; /* user-space setable, ignored by kernel */
...@@ -302,6 +311,8 @@ struct mdp_superblock_1 { ...@@ -302,6 +311,8 @@ struct mdp_superblock_1 {
#define MD_FEATURE_RECOVERY_BITMAP 128 /* recovery that is happening #define MD_FEATURE_RECOVERY_BITMAP 128 /* recovery that is happening
* is guided by bitmap. * is guided by bitmap.
*/ */
#define MD_FEATURE_CLUSTERED 256 /* clustered MD */
#define MD_FEATURE_JOURNAL 512 /* support write cache */
#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
|MD_FEATURE_RECOVERY_OFFSET \ |MD_FEATURE_RECOVERY_OFFSET \
|MD_FEATURE_RESHAPE_ACTIVE \ |MD_FEATURE_RESHAPE_ACTIVE \
...@@ -310,6 +321,66 @@ struct mdp_superblock_1 { ...@@ -310,6 +321,66 @@ struct mdp_superblock_1 {
|MD_FEATURE_RESHAPE_BACKWARDS \ |MD_FEATURE_RESHAPE_BACKWARDS \
|MD_FEATURE_NEW_OFFSET \ |MD_FEATURE_NEW_OFFSET \
|MD_FEATURE_RECOVERY_BITMAP \ |MD_FEATURE_RECOVERY_BITMAP \
|MD_FEATURE_CLUSTERED \
|MD_FEATURE_JOURNAL \
) )
struct r5l_payload_header {
__le16 type;
__le16 flags;
} __attribute__ ((__packed__));
enum r5l_payload_type {
R5LOG_PAYLOAD_DATA = 0,
R5LOG_PAYLOAD_PARITY = 1,
R5LOG_PAYLOAD_FLUSH = 2,
};
struct r5l_payload_data_parity {
struct r5l_payload_header header;
__le32 size; /* sector. data/parity size. each 4k
* has a checksum */
__le64 location; /* sector. For data, it's raid sector. For
* parity, it's stripe sector */
__le32 checksum[];
} __attribute__ ((__packed__));
enum r5l_payload_data_parity_flag {
R5LOG_PAYLOAD_FLAG_DISCARD = 1, /* payload is discard */
/*
* RESHAPED/RESHAPING is only set when there is reshape activity. Note,
* both data/parity of a stripe should have the same flag set
*
* RESHAPED: reshape is running, and this stripe finished reshape
* RESHAPING: reshape is running, and this stripe isn't reshaped
*/
R5LOG_PAYLOAD_FLAG_RESHAPED = 2,
R5LOG_PAYLOAD_FLAG_RESHAPING = 3,
};
struct r5l_payload_flush {
struct r5l_payload_header header;
__le32 size; /* flush_stripes size, bytes */
__le64 flush_stripes[];
} __attribute__ ((__packed__));
enum r5l_payload_flush_flag {
R5LOG_PAYLOAD_FLAG_FLUSH_STRIPE = 1, /* data represents whole stripe */
};
struct r5l_meta_block {
__le32 magic;
__le32 checksum;
__u8 version;
__u8 __zero_pading_1;
__le16 __zero_pading_2;
__le32 meta_size; /* whole size of the block */
__le64 seq;
__le64 position; /* sector, start from rdev->data_offset, current position */
struct r5l_payload_header payloads[];
} __attribute__ ((__packed__));
#define R5LOG_VERSION 0x1
#define R5LOG_MAGIC 0x6433c509
#endif #endif
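To make the on-disk format above concrete, here is a hedged sketch that fills a 4 KiB buffer with an empty r5l_meta_block, roughly as the journal code does: magic, version, seq, position, meta_size equal to the bare header size, and a checksum over the whole block seeded with the array's UUID checksum. The mirrored struct assumes a little-endian host; crc32c() is the helper from the earlier sketch, and build_empty_meta is an illustrative name rather than a kernel function.

/* Sketch: lay out an empty meta block in a 4096-byte buffer. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define LOG_BLOCK_SIZE	4096
#define R5LOG_VERSION	0x1
#define R5LOG_MAGIC	0x6433c509

uint32_t crc32c(uint32_t crc, const void *buf, size_t len);	/* from the earlier sketch */

struct meta_block_hdr {			/* mirrors struct r5l_meta_block */
	uint32_t magic;
	uint32_t checksum;
	uint8_t  version;
	uint8_t  __zero_pading_1;
	uint16_t __zero_pading_2;
	uint32_t meta_size;
	uint64_t seq;
	uint64_t position;		/* sector, relative to rdev->data_offset */
} __attribute__((__packed__));

void build_empty_meta(uint8_t block[LOG_BLOCK_SIZE], uint64_t seq,
		      uint64_t pos, uint32_t uuid_checksum)
{
	struct meta_block_hdr *mb = (struct meta_block_hdr *)block;

	memset(block, 0, LOG_BLOCK_SIZE);
	mb->magic = R5LOG_MAGIC;
	mb->version = R5LOG_VERSION;
	mb->meta_size = sizeof(*mb);	/* an empty block carries no payloads */
	mb->seq = seq;
	mb->position = pos;
	mb->checksum = crc32c(uuid_checksum, block, LOG_BLOCK_SIZE);
}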