Commit ac322de6 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'md/4.4' of git://neil.brown.name/md

Pull md updates from Neil Brown:
 "Two major components to this update.

   1) The clustered-raid1 support from SUSE is nearly complete.  There
      are a few outstanding issues being worked on.  Maybe half a dozen
      patches will bring this to a usable state.

   2) The first stage of journalled-raid5 support from Facebook makes an
      appearance.  With a journal device configured (typically NVRAM or
      SSD), the "RAID5 write hole" should be closed - a crash during
      degraded operations cannot result in data corruption.

      The next stage will be to use the journal as a write-behind cache
      so that latency can be reduced and in some cases throughput
      increased by performing more full-stripe writes.

* tag 'md/4.4' of git://neil.brown.name/md: (66 commits)
  MD: when RAID journal is missing/faulty, block RESTART_ARRAY_RW
  MD: set journal disk ->raid_disk
  MD: kick out journal disk if it's not fresh
  raid5-cache: start raid5 readonly if journal is missing
  MD: add new bit to indicate raid array with journal
  raid5-cache: IO error handling
  raid5: journal disk can't be removed
  raid5-cache: add trim support for log
  MD: fix info output for journal disk
  raid5-cache: use bio chaining
  raid5-cache: small log->seq cleanup
  raid5-cache: new helper: r5_reserve_log_entry
  raid5-cache: inline r5l_alloc_io_unit into r5l_new_meta
  raid5-cache: take rdev->data_offset into account early on
  raid5-cache: refactor bio allocation
  raid5-cache: clean up r5l_get_meta
  raid5-cache: simplify state machine when caches flushes are not needed
  raid5-cache: factor out a helper to run all stripes for an I/O unit
  raid5-cache: rename flushed_ios to finished_ios
  raid5-cache: free I/O units earlier
  ...
parents ccf21b69 339421de
......@@ -17,7 +17,7 @@ dm-cache-smq-y += dm-cache-policy-smq.o
dm-cache-cleaner-y += dm-cache-policy-cleaner.o
dm-era-y += dm-era-target.o
md-mod-y += md.o bitmap.o
raid456-y += raid5.o
raid456-y += raid5.o raid5-cache.o
# Note: link order is important. All raid personalities
# and must come before md.o, as they each initialise
......
......@@ -613,12 +613,10 @@ static int bitmap_read_sb(struct bitmap *bitmap)
daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
write_behind = le32_to_cpu(sb->write_behind);
sectors_reserved = le32_to_cpu(sb->sectors_reserved);
/* XXX: This is a hack to ensure that we don't use clustering
* in case:
* - dm-raid is in use and
* - the nodes written in bitmap_sb is erroneous.
/* Setup nodes/clustername only if bitmap version is
* cluster-compatible
*/
if (!bitmap->mddev->sync_super) {
if (sb->version == cpu_to_le32(BITMAP_MAJOR_CLUSTERED)) {
nodes = le32_to_cpu(sb->nodes);
strlcpy(bitmap->mddev->bitmap_info.cluster_name,
sb->cluster_name, 64);
......@@ -628,7 +626,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
if (sb->magic != cpu_to_le32(BITMAP_MAGIC))
reason = "bad magic";
else if (le32_to_cpu(sb->version) < BITMAP_MAJOR_LO ||
le32_to_cpu(sb->version) > BITMAP_MAJOR_HI)
le32_to_cpu(sb->version) > BITMAP_MAJOR_CLUSTERED)
reason = "unrecognized superblock version";
else if (chunksize < 512)
reason = "bitmap chunksize too small";
......@@ -1572,7 +1570,7 @@ void bitmap_close_sync(struct bitmap *bitmap)
}
EXPORT_SYMBOL(bitmap_close_sync);
void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force)
{
sector_t s = 0;
sector_t blocks;
......@@ -1583,7 +1581,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
bitmap->last_end_sync = jiffies;
return;
}
if (time_before(jiffies, (bitmap->last_end_sync
if (!force && time_before(jiffies, (bitmap->last_end_sync
+ bitmap->mddev->bitmap_info.daemon_sleep)))
return;
wait_event(bitmap->mddev->recovery_wait,
......
......@@ -9,8 +9,10 @@
#define BITMAP_MAJOR_LO 3
/* version 4 insists the bitmap is in little-endian order
* with version 3, it is host-endian which is non-portable
* Version 5 is currently set only for clustered devices
*/
#define BITMAP_MAJOR_HI 4
#define BITMAP_MAJOR_CLUSTERED 5
#define BITMAP_MAJOR_HOSTENDIAN 3
/*
......@@ -255,7 +257,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded);
void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted);
void bitmap_close_sync(struct bitmap *bitmap);
void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector);
void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force);
void bitmap_unplug(struct bitmap *bitmap);
void bitmap_daemon_work(struct mddev *mddev);
......
......@@ -28,6 +28,7 @@ struct dlm_lock_resource {
struct completion completion; /* completion for synchronized locking */
void (*bast)(void *arg, int mode); /* blocking AST function pointer*/
struct mddev *mddev; /* pointing back to mddev. */
int mode;
};
struct suspend_info {
......@@ -53,8 +54,8 @@ struct md_cluster_info {
dlm_lockspace_t *lockspace;
int slot_number;
struct completion completion;
struct mutex sb_mutex;
struct dlm_lock_resource *bitmap_lockres;
struct dlm_lock_resource *resync_lockres;
struct list_head suspend_list;
spinlock_t suspend_lock;
struct md_thread *recovery_thread;
......@@ -79,20 +80,20 @@ enum msg_type {
};
struct cluster_msg {
int type;
int slot;
__le32 type;
__le32 slot;
/* TODO: Unionize this for smaller footprint */
sector_t low;
sector_t high;
__le64 low;
__le64 high;
char uuid[16];
int raid_slot;
__le32 raid_slot;
};
static void sync_ast(void *arg)
{
struct dlm_lock_resource *res;
res = (struct dlm_lock_resource *) arg;
res = arg;
complete(&res->completion);
}
......@@ -106,6 +107,8 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
if (ret)
return ret;
wait_for_completion(&res->completion);
if (res->lksb.sb_status == 0)
res->mode = mode;
return res->lksb.sb_status;
}
......@@ -127,6 +130,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
init_completion(&res->completion);
res->ls = cinfo->lockspace;
res->mddev = mddev;
res->mode = DLM_LOCK_IV;
namelen = strlen(name);
res->name = kzalloc(namelen + 1, GFP_KERNEL);
if (!res->name) {
......@@ -191,7 +195,7 @@ static void lockres_free(struct dlm_lock_resource *res)
kfree(res);
}
static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
static void add_resync_info(struct dlm_lock_resource *lockres,
sector_t lo, sector_t hi)
{
struct resync_info *ri;
......@@ -210,7 +214,7 @@ static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_loc
dlm_lock_sync(lockres, DLM_LOCK_CR);
memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info));
hi = le64_to_cpu(ri.hi);
if (ri.hi > 0) {
if (hi > 0) {
s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
if (!s)
goto out;
......@@ -345,7 +349,7 @@ static const struct dlm_lockspace_ops md_ls_ops = {
*/
static void ack_bast(void *arg, int mode)
{
struct dlm_lock_resource *res = (struct dlm_lock_resource *)arg;
struct dlm_lock_resource *res = arg;
struct md_cluster_info *cinfo = res->mddev->cluster_info;
if (mode == DLM_LOCK_EX)
......@@ -358,29 +362,32 @@ static void __remove_suspend_info(struct md_cluster_info *cinfo, int slot)
list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list)
if (slot == s->slot) {
pr_info("%s:%d Deleting suspend_info: %d\n",
__func__, __LINE__, slot);
list_del(&s->list);
kfree(s);
break;
}
}
static void remove_suspend_info(struct md_cluster_info *cinfo, int slot)
static void remove_suspend_info(struct mddev *mddev, int slot)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
spin_lock_irq(&cinfo->suspend_lock);
__remove_suspend_info(cinfo, slot);
spin_unlock_irq(&cinfo->suspend_lock);
mddev->pers->quiesce(mddev, 2);
}
static void process_suspend_info(struct md_cluster_info *cinfo,
static void process_suspend_info(struct mddev *mddev,
int slot, sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct suspend_info *s;
if (!hi) {
remove_suspend_info(cinfo, slot);
remove_suspend_info(mddev, slot);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
return;
}
s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL);
......@@ -389,11 +396,14 @@ static void process_suspend_info(struct md_cluster_info *cinfo,
s->slot = slot;
s->lo = lo;
s->hi = hi;
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
spin_lock_irq(&cinfo->suspend_lock);
/* Remove existing entry (if exists) before adding */
__remove_suspend_info(cinfo, slot);
list_add(&s->list, &cinfo->suspend_list);
spin_unlock_irq(&cinfo->suspend_lock);
mddev->pers->quiesce(mddev, 2);
}
static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
......@@ -407,7 +417,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
sprintf(disk_uuid + len, "%pU", cmsg->uuid);
snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
snprintf(raid_slot, 16, "RAID_DISK=%d", le32_to_cpu(cmsg->raid_slot));
pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
init_completion(&cinfo->newdisk_completion);
set_bit(MD_CLUSTER_WAITING_FOR_NEWDISK, &cinfo->state);
......@@ -421,64 +431,59 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
static void process_metadata_update(struct mddev *mddev, struct cluster_msg *msg)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
md_reload_sb(mddev);
md_reload_sb(mddev, le32_to_cpu(msg->raid_slot));
dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
}
static void process_remove_disk(struct mddev *mddev, struct cluster_msg *msg)
{
struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
le32_to_cpu(msg->raid_slot));
if (rdev)
md_kick_rdev_from_array(rdev);
else
pr_warn("%s: %d Could not find disk(%d) to REMOVE\n", __func__, __LINE__, msg->raid_slot);
pr_warn("%s: %d Could not find disk(%d) to REMOVE\n",
__func__, __LINE__, le32_to_cpu(msg->raid_slot));
}
static void process_readd_disk(struct mddev *mddev, struct cluster_msg *msg)
{
struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev, msg->raid_slot);
struct md_rdev *rdev = md_find_rdev_nr_rcu(mddev,
le32_to_cpu(msg->raid_slot));
if (rdev && test_bit(Faulty, &rdev->flags))
clear_bit(Faulty, &rdev->flags);
else
pr_warn("%s: %d Could not find disk(%d) which is faulty", __func__, __LINE__, msg->raid_slot);
pr_warn("%s: %d Could not find disk(%d) which is faulty",
__func__, __LINE__, le32_to_cpu(msg->raid_slot));
}
static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
{
switch (msg->type) {
if (WARN(mddev->cluster_info->slot_number - 1 == le32_to_cpu(msg->slot),
"node %d received it's own msg\n", le32_to_cpu(msg->slot)))
return;
switch (le32_to_cpu(msg->type)) {
case METADATA_UPDATED:
pr_info("%s: %d Received message: METADATA_UPDATE from %d\n",
__func__, __LINE__, msg->slot);
process_metadata_update(mddev, msg);
break;
case RESYNCING:
pr_info("%s: %d Received message: RESYNCING from %d\n",
__func__, __LINE__, msg->slot);
process_suspend_info(mddev->cluster_info, msg->slot,
msg->low, msg->high);
process_suspend_info(mddev, le32_to_cpu(msg->slot),
le64_to_cpu(msg->low),
le64_to_cpu(msg->high));
break;
case NEWDISK:
pr_info("%s: %d Received message: NEWDISK from %d\n",
__func__, __LINE__, msg->slot);
process_add_new_disk(mddev, msg);
break;
case REMOVE:
pr_info("%s: %d Received REMOVE from %d\n",
__func__, __LINE__, msg->slot);
process_remove_disk(mddev, msg);
break;
case RE_ADD:
pr_info("%s: %d Received RE_ADD from %d\n",
__func__, __LINE__, msg->slot);
process_readd_disk(mddev, msg);
break;
case BITMAP_NEEDS_SYNC:
pr_info("%s: %d Received BITMAP_NEEDS_SYNC from %d\n",
__func__, __LINE__, msg->slot);
__recover_slot(mddev, msg->slot);
__recover_slot(mddev, le32_to_cpu(msg->slot));
break;
default:
pr_warn("%s:%d Received unknown message from %d\n",
......@@ -528,11 +533,17 @@ static void recv_daemon(struct md_thread *thread)
/* lock_comm()
* Takes the lock on the TOKEN lock resource so no other
* node can communicate while the operation is underway.
* If called again, and the TOKEN lock is alread in EX mode
* return success. However, care must be taken that unlock_comm()
* is called only once.
*/
static int lock_comm(struct md_cluster_info *cinfo)
{
int error;
if (cinfo->token_lockres->mode == DLM_LOCK_EX)
return 0;
error = dlm_lock_sync(cinfo->token_lockres, DLM_LOCK_EX);
if (error)
pr_err("md-cluster(%s:%d): failed to get EX on TOKEN (%d)\n",
......@@ -542,6 +553,7 @@ static int lock_comm(struct md_cluster_info *cinfo)
static void unlock_comm(struct md_cluster_info *cinfo)
{
WARN_ON(cinfo->token_lockres->mode != DLM_LOCK_EX);
dlm_unlock_sync(cinfo->token_lockres);
}
......@@ -696,7 +708,6 @@ static int join(struct mddev *mddev, int nodes)
init_completion(&cinfo->completion);
set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
mutex_init(&cinfo->sb_mutex);
mddev->cluster_info = cinfo;
memset(str, 0, 64);
......@@ -753,6 +764,10 @@ static int join(struct mddev *mddev, int nodes)
goto err;
}
cinfo->resync_lockres = lockres_init(mddev, "resync", NULL, 0);
if (!cinfo->resync_lockres)
goto err;
ret = gather_all_resync_info(mddev, nodes);
if (ret)
goto err;
......@@ -763,6 +778,7 @@ static int join(struct mddev *mddev, int nodes)
lockres_free(cinfo->token_lockres);
lockres_free(cinfo->ack_lockres);
lockres_free(cinfo->no_new_dev_lockres);
lockres_free(cinfo->resync_lockres);
lockres_free(cinfo->bitmap_lockres);
if (cinfo->lockspace)
dlm_release_lockspace(cinfo->lockspace, 2);
......@@ -771,12 +787,32 @@ static int join(struct mddev *mddev, int nodes)
return ret;
}
static void resync_bitmap(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg = {0};
int err;
cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
err = sendmsg(cinfo, &cmsg);
if (err)
pr_err("%s:%d: failed to send BITMAP_NEEDS_SYNC message (%d)\n",
__func__, __LINE__, err);
}
static int leave(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
if (!cinfo)
return 0;
/* BITMAP_NEEDS_SYNC message should be sent when node
* is leaving the cluster with dirty bitmap, also we
* can only deliver it when dlm connection is available */
if (cinfo->slot_number > 0 && mddev->recovery_cp != MaxSector)
resync_bitmap(mddev);
md_unregister_thread(&cinfo->recovery_thread);
md_unregister_thread(&cinfo->recv_thread);
lockres_free(cinfo->message_lockres);
......@@ -799,15 +835,6 @@ static int slot_number(struct mddev *mddev)
return cinfo->slot_number - 1;
}
static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi);
/* Re-acquire the lock to refresh LVB */
dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
}
static int metadata_update_start(struct mddev *mddev)
{
return lock_comm(mddev->cluster_info);
......@@ -817,59 +844,62 @@ static int metadata_update_finish(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg;
int ret;
struct md_rdev *rdev;
int ret = 0;
int raid_slot = -1;
memset(&cmsg, 0, sizeof(cmsg));
cmsg.type = cpu_to_le32(METADATA_UPDATED);
/* Pick up a good active device number to send.
*/
rdev_for_each(rdev, mddev)
if (rdev->raid_disk > -1 && !test_bit(Faulty, &rdev->flags)) {
raid_slot = rdev->desc_nr;
break;
}
if (raid_slot >= 0) {
cmsg.raid_slot = cpu_to_le32(raid_slot);
ret = __sendmsg(cinfo, &cmsg);
} else
pr_warn("md-cluster: No good device id found to send\n");
unlock_comm(cinfo);
return ret;
}
static int metadata_update_cancel(struct mddev *mddev)
static void metadata_update_cancel(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
unlock_comm(cinfo);
}
return dlm_unlock_sync(cinfo->token_lockres);
static int resync_start(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
cinfo->resync_lockres->flags |= DLM_LKF_NOQUEUE;
return dlm_lock_sync(cinfo->resync_lockres, DLM_LOCK_EX);
}
static int resync_send(struct mddev *mddev, enum msg_type type,
sector_t lo, sector_t hi)
static int resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg;
int slot = cinfo->slot_number - 1;
struct cluster_msg cmsg = {0};
pr_info("%s:%d lo: %llu hi: %llu\n", __func__, __LINE__,
(unsigned long long)lo,
(unsigned long long)hi);
resync_info_update(mddev, lo, hi);
cmsg.type = cpu_to_le32(type);
cmsg.slot = cpu_to_le32(slot);
add_resync_info(cinfo->bitmap_lockres, lo, hi);
/* Re-acquire the lock to refresh LVB */
dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW);
cmsg.type = cpu_to_le32(RESYNCING);
cmsg.low = cpu_to_le64(lo);
cmsg.high = cpu_to_le64(hi);
return sendmsg(cinfo, &cmsg);
}
static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi)
{
pr_info("%s:%d\n", __func__, __LINE__);
return resync_send(mddev, RESYNCING, lo, hi);
return sendmsg(cinfo, &cmsg);
}
static void resync_finish(struct mddev *mddev)
static int resync_finish(struct mddev *mddev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg;
int slot = cinfo->slot_number - 1;
pr_info("%s:%d\n", __func__, __LINE__);
resync_send(mddev, RESYNCING, 0, 0);
if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
cmsg.slot = cpu_to_le32(slot);
sendmsg(cinfo, &cmsg);
}
cinfo->resync_lockres->flags &= ~DLM_LKF_NOQUEUE;
dlm_unlock_sync(cinfo->resync_lockres);
return resync_info_update(mddev, 0, 0);
}
static int area_resyncing(struct mddev *mddev, int direction,
......@@ -896,7 +926,11 @@ static int area_resyncing(struct mddev *mddev, int direction,
return ret;
}
static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
/* add_new_disk() - initiates a disk add
* However, if this fails before writing md_update_sb(),
* add_new_disk_cancel() must be called to release token lock
*/
static int add_new_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct md_cluster_info *cinfo = mddev->cluster_info;
struct cluster_msg cmsg;
......@@ -907,7 +941,7 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
memset(&cmsg, 0, sizeof(cmsg));
cmsg.type = cpu_to_le32(NEWDISK);
memcpy(cmsg.uuid, uuid, 16);
cmsg.raid_slot = rdev->desc_nr;
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
lock_comm(cinfo);
ret = __sendmsg(cinfo, &cmsg);
if (ret)
......@@ -918,22 +952,17 @@ static int add_new_disk_start(struct mddev *mddev, struct md_rdev *rdev)
/* Some node does not "see" the device */
if (ret == -EAGAIN)
ret = -ENOENT;
if (ret)
unlock_comm(cinfo);
else
dlm_lock_sync(cinfo->no_new_dev_lockres, DLM_LOCK_CR);
return ret;
}
static int add_new_disk_finish(struct mddev *mddev)
static void add_new_disk_cancel(struct mddev *mddev)
{
struct cluster_msg cmsg;
struct md_cluster_info *cinfo = mddev->cluster_info;
int ret;
/* Write sb and inform others */
md_update_sb(mddev, 1);
cmsg.type = METADATA_UPDATED;
ret = __sendmsg(cinfo, &cmsg);
unlock_comm(cinfo);
return ret;
}
static int new_disk_ack(struct mddev *mddev, bool ack)
......@@ -953,10 +982,10 @@ static int new_disk_ack(struct mddev *mddev, bool ack)
static int remove_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct cluster_msg cmsg;
struct cluster_msg cmsg = {0};
struct md_cluster_info *cinfo = mddev->cluster_info;
cmsg.type = REMOVE;
cmsg.raid_slot = rdev->desc_nr;
cmsg.type = cpu_to_le32(REMOVE);
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
return __sendmsg(cinfo, &cmsg);
}
......@@ -964,12 +993,12 @@ static int gather_bitmaps(struct md_rdev *rdev)
{
int sn, err;
sector_t lo, hi;
struct cluster_msg cmsg;
struct cluster_msg cmsg = {0};
struct mddev *mddev = rdev->mddev;
struct md_cluster_info *cinfo = mddev->cluster_info;
cmsg.type = RE_ADD;
cmsg.raid_slot = rdev->desc_nr;
cmsg.type = cpu_to_le32(RE_ADD);
cmsg.raid_slot = cpu_to_le32(rdev->desc_nr);
err = sendmsg(cinfo, &cmsg);
if (err)
goto out;
......@@ -993,15 +1022,15 @@ static struct md_cluster_operations cluster_ops = {
.join = join,
.leave = leave,
.slot_number = slot_number,
.resync_info_update = resync_info_update,
.resync_start = resync_start,
.resync_finish = resync_finish,
.resync_info_update = resync_info_update,
.metadata_update_start = metadata_update_start,
.metadata_update_finish = metadata_update_finish,
.metadata_update_cancel = metadata_update_cancel,
.area_resyncing = area_resyncing,
.add_new_disk_start = add_new_disk_start,
.add_new_disk_finish = add_new_disk_finish,
.add_new_disk = add_new_disk,
.add_new_disk_cancel = add_new_disk_cancel,
.new_disk_ack = new_disk_ack,
.remove_disk = remove_disk,
.gather_bitmaps = gather_bitmaps,
......@@ -1022,5 +1051,6 @@ static void cluster_exit(void)
module_init(cluster_init);
module_exit(cluster_exit);
MODULE_AUTHOR("SUSE");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Clustering support for MD");
......@@ -12,15 +12,15 @@ struct md_cluster_operations {
int (*join)(struct mddev *mddev, int nodes);
int (*leave)(struct mddev *mddev);
int (*slot_number)(struct mddev *mddev);
void (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
int (*resync_start)(struct mddev *mddev, sector_t lo, sector_t hi);
void (*resync_finish)(struct mddev *mddev);
int (*resync_info_update)(struct mddev *mddev, sector_t lo, sector_t hi);
int (*metadata_update_start)(struct mddev *mddev);
int (*metadata_update_finish)(struct mddev *mddev);
int (*metadata_update_cancel)(struct mddev *mddev);
void (*metadata_update_cancel)(struct mddev *mddev);
int (*resync_start)(struct mddev *mddev);
int (*resync_finish)(struct mddev *mddev);
int (*area_resyncing)(struct mddev *mddev, int direction, sector_t lo, sector_t hi);
int (*add_new_disk_start)(struct mddev *mddev, struct md_rdev *rdev);
int (*add_new_disk_finish)(struct mddev *mddev);
int (*add_new_disk)(struct mddev *mddev, struct md_rdev *rdev);
void (*add_new_disk_cancel)(struct mddev *mddev);
int (*new_disk_ack)(struct mddev *mddev, bool ack);
int (*remove_disk)(struct mddev *mddev, struct md_rdev *rdev);
int (*gather_bitmaps)(struct md_rdev *rdev);
......
......@@ -1608,7 +1608,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
++ev1;
if (rdev->desc_nr >= 0 &&
rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
(le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
if (ev1 < mddev->events)
return -EINVAL;
} else if (mddev->bitmap) {
......@@ -1628,16 +1629,29 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
int role;
if (rdev->desc_nr < 0 ||
rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
role = 0xffff;
role = MD_DISK_ROLE_SPARE;
rdev->desc_nr = -1;
} else
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
switch(role) {
case 0xffff: /* spare */
case MD_DISK_ROLE_SPARE: /* spare */
break;
case 0xfffe: /* faulty */
case MD_DISK_ROLE_FAULTY: /* faulty */
set_bit(Faulty, &rdev->flags);
break;
case MD_DISK_ROLE_JOURNAL: /* journal device */
if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
/* journal device without journal feature */
printk(KERN_WARNING
"md: journal device provided without journal feature, ignoring the device\n");
return -EINVAL;
}
set_bit(Journal, &rdev->flags);
rdev->journal_tail = le64_to_cpu(sb->journal_tail);
if (mddev->recovery_cp == MaxSector)
set_bit(MD_JOURNAL_CLEAN, &mddev->flags);
rdev->raid_disk = mddev->raid_disks;
break;
default:
rdev->saved_raid_disk = role;
if ((le32_to_cpu(sb->feature_map) &
......@@ -1655,6 +1669,8 @@ static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
set_bit(WriteMostly, &rdev->flags);
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
set_bit(Replacement, &rdev->flags);
if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
set_bit(MD_HAS_JOURNAL, &mddev->flags);
} else /* MULTIPATH are always insync */
set_bit(In_sync, &rdev->flags);
......@@ -1679,6 +1695,8 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->events = cpu_to_le64(mddev->events);
if (mddev->in_sync)
sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
sb->resync_offset = cpu_to_le64(MaxSector);
else
sb->resync_offset = cpu_to_le64(0);
......@@ -1702,7 +1720,7 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
}
if (rdev->raid_disk >= 0 &&
if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags)) {
sb->feature_map |=
cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
......@@ -1712,6 +1730,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
sb->feature_map |=
cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
}
/* Note: recovery_offset and journal_tail share space */
if (test_bit(Journal, &rdev->flags))
sb->journal_tail = cpu_to_le64(rdev->journal_tail);
if (test_bit(Replacement, &rdev->flags))
sb->feature_map |=
cpu_to_le32(MD_FEATURE_REPLACEMENT);
......@@ -1735,6 +1756,9 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
}
}
if (mddev_is_clustered(mddev))
sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
if (rdev->badblocks.count == 0)
/* Nothing to do for bad blocks*/ ;
else if (sb->bblog_offset == 0)
......@@ -1785,18 +1809,23 @@ static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
max_dev = le32_to_cpu(sb->max_dev);
for (i=0; i<max_dev;i++)
sb->dev_roles[i] = cpu_to_le16(0xfffe);
sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
rdev_for_each(rdev2, mddev) {
i = rdev2->desc_nr;
if (test_bit(Faulty, &rdev2->flags))
sb->dev_roles[i] = cpu_to_le16(0xfffe);
sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
else if (test_bit(In_sync, &rdev2->flags))
sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
else if (test_bit(Journal, &rdev2->flags))
sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
else if (rdev2->raid_disk >= 0)
sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
else
sb->dev_roles[i] = cpu_to_le16(0xffff);
sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
}
sb->sb_csum = calc_sb_1_csum(sb);
......@@ -1912,13 +1941,23 @@ static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
struct md_rdev *rdev, *rdev2;
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev1)
rdev_for_each_rcu(rdev2, mddev2)
rdev_for_each_rcu(rdev, mddev1) {
if (test_bit(Faulty, &rdev->flags) ||
test_bit(Journal, &rdev->flags) ||
rdev->raid_disk == -1)
continue;
rdev_for_each_rcu(rdev2, mddev2) {
if (test_bit(Faulty, &rdev2->flags) ||
test_bit(Journal, &rdev2->flags) ||
rdev2->raid_disk == -1)
continue;
if (rdev->bdev->bd_contains ==
rdev2->bdev->bd_contains) {
rcu_read_unlock();
return 1;
}
}
}
rcu_read_unlock();
return 0;
}
......@@ -2194,23 +2233,77 @@ static void sync_sbs(struct mddev *mddev, int nospares)
}
}
static bool does_sb_need_changing(struct mddev *mddev)
{
struct md_rdev *rdev;
struct mdp_superblock_1 *sb;
int role;
/* Find a good rdev */
rdev_for_each(rdev, mddev)
if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
break;
/* No good device found. */
if (!rdev)
return false;
sb = page_address(rdev->sb_page);
/* Check if a device has become faulty or a spare become active */
rdev_for_each(rdev, mddev) {
role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
/* Device activated? */
if (role == 0xffff && rdev->raid_disk >=0 &&
!test_bit(Faulty, &rdev->flags))
return true;
/* Device turned faulty? */
if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
return true;
}
/* Check if any mddev parameters have changed */
if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
(mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
(mddev->layout != le64_to_cpu(sb->layout)) ||
(mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
(mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
return true;
return false;
}
void md_update_sb(struct mddev *mddev, int force_change)
{
struct md_rdev *rdev;
int sync_req;
int nospares = 0;
int any_badblocks_changed = 0;
int ret = -1;
if (mddev->ro) {
if (force_change)
set_bit(MD_CHANGE_DEVS, &mddev->flags);
return;
}
if (mddev_is_clustered(mddev)) {
if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
force_change = 1;
ret = md_cluster_ops->metadata_update_start(mddev);
/* Has someone else has updated the sb */
if (!does_sb_need_changing(mddev)) {
if (ret == 0)
md_cluster_ops->metadata_update_cancel(mddev);
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
return;
}
}
repeat:
/* First make sure individual recovery_offsets are correct */
rdev_for_each(rdev, mddev) {
if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
mddev->curr_resync_completed > rdev->recovery_offset)
rdev->recovery_offset = mddev->curr_resync_completed;
......@@ -2354,6 +2447,9 @@ void md_update_sb(struct mddev *mddev, int force_change)
clear_bit(BlockedBadBlocks, &rdev->flags);
wake_up(&rdev->blocked_wait);
}
if (mddev_is_clustered(mddev) && ret == 0)
md_cluster_ops->metadata_update_finish(mddev);
}
EXPORT_SYMBOL(md_update_sb);
......@@ -2429,6 +2525,10 @@ state_show(struct md_rdev *rdev, char *page)
len += sprintf(page+len, "%sin_sync",sep);
sep = ",";
}
if (test_bit(Journal, &flags)) {
len += sprintf(page+len, "%sjournal",sep);
sep = ",";
}
if (test_bit(WriteMostly, &flags)) {
len += sprintf(page+len, "%swrite_mostly",sep);
sep = ",";
......@@ -2440,6 +2540,7 @@ state_show(struct md_rdev *rdev, char *page)
sep = ",";
}
if (!test_bit(Faulty, &flags) &&
!test_bit(Journal, &flags) &&
!test_bit(In_sync, &flags)) {
len += sprintf(page+len, "%sspare", sep);
sep = ",";
......@@ -2488,17 +2589,16 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
err = -EBUSY;
else {
struct mddev *mddev = rdev->mddev;
err = 0;
if (mddev_is_clustered(mddev))
md_cluster_ops->remove_disk(mddev, rdev);
err = md_cluster_ops->remove_disk(mddev, rdev);
if (err == 0) {
md_kick_rdev_from_array(rdev);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
if (mddev->pers)
md_update_sb(mddev, 1);
md_new_event(mddev);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
err = 0;
}
}
} else if (cmd_match(buf, "writemostly")) {
set_bit(WriteMostly, &rdev->flags);
......@@ -2527,7 +2627,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
set_bit(In_sync, &rdev->flags);
err = 0;
} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) {
} else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
!test_bit(Journal, &rdev->flags)) {
if (rdev->mddev->pers == NULL) {
clear_bit(In_sync, &rdev->flags);
rdev->saved_raid_disk = rdev->raid_disk;
......@@ -2546,6 +2647,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
* check if recovery is needed.
*/
if (rdev->raid_disk >= 0 &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(Replacement, &rdev->flags))
set_bit(WantReplacement, &rdev->flags);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
......@@ -2623,7 +2725,9 @@ __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
static ssize_t
slot_show(struct md_rdev *rdev, char *page)
{
if (rdev->raid_disk < 0)
if (test_bit(Journal, &rdev->flags))
return sprintf(page, "journal\n");
else if (rdev->raid_disk < 0)
return sprintf(page, "none\n");
else
return sprintf(page, "%d\n", rdev->raid_disk);
......@@ -2635,6 +2739,8 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
int slot;
int err;
if (test_bit(Journal, &rdev->flags))
return -EBUSY;
if (strncmp(buf, "none", 4)==0)
slot = -1;
else {
......@@ -2686,15 +2792,9 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
rdev->saved_raid_disk = -1;
clear_bit(In_sync, &rdev->flags);
clear_bit(Bitmap_sync, &rdev->flags);
err = rdev->mddev->pers->
hot_add_disk(rdev->mddev, rdev);
if (err) {
rdev->raid_disk = -1;
return err;
} else
sysfs_notify_dirent_safe(rdev->sysfs_state);
if (sysfs_link_rdev(rdev->mddev, rdev))
/* failure here is OK */;
remove_and_add_spares(rdev->mddev, rdev);
if (rdev->raid_disk == -1)
return -EBUSY;
/* don't wakeup anyone, leave that to userspace. */
} else {
if (slot >= rdev->mddev->raid_disks &&
......@@ -2839,6 +2939,8 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
sector_t oldsectors = rdev->sectors;
sector_t sectors;
if (test_bit(Journal, &rdev->flags))
return -EBUSY;
if (strict_blocks_to_sectors(buf, &sectors) < 0)
return -EINVAL;
if (rdev->data_offset != rdev->new_data_offset)
......@@ -3196,20 +3298,14 @@ static void analyze_sbs(struct mddev *mddev)
md_kick_rdev_from_array(rdev);
continue;
}
/* No device should have a Candidate flag
* when reading devices
*/
if (test_bit(Candidate, &rdev->flags)) {
pr_info("md: kicking Cluster Candidate %s from array!\n",
bdevname(rdev->bdev, b));
md_kick_rdev_from_array(rdev);
}
}
if (mddev->level == LEVEL_MULTIPATH) {
rdev->desc_nr = i++;
rdev->raid_disk = rdev->desc_nr;
set_bit(In_sync, &rdev->flags);
} else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
} else if (rdev->raid_disk >=
(mddev->raid_disks - min(0, mddev->delta_disks)) &&
!test_bit(Journal, &rdev->flags)) {
rdev->raid_disk = -1;
clear_bit(In_sync, &rdev->flags);
}
......@@ -3267,6 +3363,11 @@ safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
{
unsigned long msec;
if (mddev_is_clustered(mddev)) {
pr_info("md: Safemode is disabled for clustered mode\n");
return -EINVAL;
}
if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
return -EINVAL;
if (msec == 0)
......@@ -3867,7 +3968,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
break;
case clean:
if (mddev->pers) {
restart_array(mddev);
err = restart_array(mddev);
if (err)
break;
spin_lock(&mddev->lock);
if (atomic_read(&mddev->writes_pending) == 0) {
if (mddev->in_sync == 0) {
......@@ -3885,7 +3988,9 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
break;
case active:
if (mddev->pers) {
restart_array(mddev);
err = restart_array(mddev);
if (err)
break;
clear_bit(MD_CHANGE_PENDING, &mddev->flags);
wake_up(&mddev->sb_wait);
err = 0;
......@@ -4064,12 +4169,8 @@ size_store(struct mddev *mddev, const char *buf, size_t len)
if (err)
return err;
if (mddev->pers) {
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
err = update_size(mddev, sectors);
md_update_sb(mddev, 1);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
} else {
if (mddev->dev_sectors == 0 ||
mddev->dev_sectors > sectors)
......@@ -5181,6 +5282,9 @@ int md_run(struct mddev *mddev)
atomic_set(&mddev->max_corr_read_errors,
MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
mddev->safemode = 0;
if (mddev_is_clustered(mddev))
mddev->safemode_delay = 0;
else
mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
mddev->in_sync = 1;
smp_wmb();
......@@ -5224,6 +5328,9 @@ static int do_md_run(struct mddev *mddev)
goto out;
}
if (mddev_is_clustered(mddev))
md_allow_write(mddev);
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
......@@ -5246,6 +5353,25 @@ static int restart_array(struct mddev *mddev)
return -EINVAL;
if (!mddev->ro)
return -EBUSY;
if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
struct md_rdev *rdev;
bool has_journal = false;
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev) {
if (test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags)) {
has_journal = true;
break;
}
}
rcu_read_unlock();
/* Don't restart rw with journal missing/faulty */
if (!has_journal)
return -EINVAL;
}
mddev->safemode = 0;
mddev->ro = 0;
set_disk_ro(disk, 0);
......@@ -5307,8 +5433,6 @@ static void md_clean(struct mddev *mddev)
static void __md_stop_writes(struct mddev *mddev)
{
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
flush_workqueue(md_misc_wq);
if (mddev->sync_thread) {
......@@ -5322,13 +5446,13 @@ static void __md_stop_writes(struct mddev *mddev)
md_super_wait(mddev);
if (mddev->ro == 0 &&
(!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) {
((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
(mddev->flags & MD_UPDATE_SB_FLAGS))) {
/* mark array as shutdown cleanly */
if (!mddev_is_clustered(mddev))
mddev->in_sync = 1;
md_update_sb(mddev, 1);
}
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
}
void md_stop_writes(struct mddev *mddev)
......@@ -5789,6 +5913,8 @@ static int get_disk_info(struct mddev *mddev, void __user * arg)
info.state |= (1<<MD_DISK_ACTIVE);
info.state |= (1<<MD_DISK_SYNC);
}
if (test_bit(Journal, &rdev->flags))
info.state |= (1<<MD_DISK_JOURNAL);
if (test_bit(WriteMostly, &rdev->flags))
info.state |= (1<<MD_DISK_WRITEMOSTLY);
} else {
......@@ -5903,23 +6029,18 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
else
clear_bit(WriteMostly, &rdev->flags);
if (info->state & (1<<MD_DISK_JOURNAL))
set_bit(Journal, &rdev->flags);
/*
* check whether the device shows up in other nodes
*/
if (mddev_is_clustered(mddev)) {
if (info->state & (1 << MD_DISK_CANDIDATE)) {
/* Through --cluster-confirm */
if (info->state & (1 << MD_DISK_CANDIDATE))
set_bit(Candidate, &rdev->flags);
err = md_cluster_ops->new_disk_ack(mddev, true);
if (err) {
export_rdev(rdev);
return err;
}
} else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
/* --add initiated by this node */
err = md_cluster_ops->add_new_disk_start(mddev, rdev);
err = md_cluster_ops->add_new_disk(mddev, rdev);
if (err) {
md_cluster_ops->add_new_disk_finish(mddev);
export_rdev(rdev);
return err;
}
......@@ -5928,13 +6049,23 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
rdev->raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev);
if (err)
export_rdev(rdev);
if (mddev_is_clustered(mddev)) {
if (info->state & (1 << MD_DISK_CANDIDATE))
md_cluster_ops->new_disk_ack(mddev, (err == 0));
else {
if (err)
md_cluster_ops->add_new_disk_cancel(mddev);
else
err = add_bound_rdev(rdev);
if (mddev_is_clustered(mddev) &&
(info->state & (1 << MD_DISK_CLUSTER_ADD)))
md_cluster_ops->add_new_disk_finish(mddev);
}
} else if (!err)
err = add_bound_rdev(rdev);
return err;
}
......@@ -5990,13 +6121,17 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
{
char b[BDEVNAME_SIZE];
struct md_rdev *rdev;
int ret = -1;
rdev = find_rdev(mddev, dev);
if (!rdev)
return -ENXIO;
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
ret = md_cluster_ops->metadata_update_start(mddev);
if (rdev->raid_disk < 0)
goto kick_rdev;
clear_bit(Blocked, &rdev->flags);
remove_and_add_spares(mddev, rdev);
......@@ -6004,20 +6139,19 @@ static int hot_remove_disk(struct mddev *mddev, dev_t dev)
if (rdev->raid_disk >= 0)
goto busy;
if (mddev_is_clustered(mddev))
kick_rdev:
if (mddev_is_clustered(mddev) && ret == 0)
md_cluster_ops->remove_disk(mddev, rdev);
md_kick_rdev_from_array(rdev);
md_update_sb(mddev, 1);
md_new_event(mddev);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
return 0;
busy:
if (mddev_is_clustered(mddev))
if (mddev_is_clustered(mddev) && ret == 0)
md_cluster_ops->metadata_update_cancel(mddev);
printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
bdevname(rdev->bdev,b), mdname(mddev));
return -EBUSY;
......@@ -6068,14 +6202,12 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
goto abort_export;
}
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
clear_bit(In_sync, &rdev->flags);
rdev->desc_nr = -1;
rdev->saved_raid_disk = -1;
err = bind_rdev_to_array(rdev, mddev);
if (err)
goto abort_clustered;
goto abort_export;
/*
* The rest should better be atomic, we can have disk failures
......@@ -6085,9 +6217,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
rdev->raid_disk = -1;
md_update_sb(mddev, 1);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
/*
* Kick recovery, maybe this spare has to be added to the
* array immediately.
......@@ -6097,9 +6226,6 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
md_new_event(mddev);
return 0;
abort_clustered:
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_cancel(mddev);
abort_export:
export_rdev(rdev);
return err;
......@@ -6417,8 +6543,6 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
return rv;
}
}
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
rv = update_size(mddev, (sector_t)info->size * 2);
......@@ -6476,12 +6600,8 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
}
}
md_update_sb(mddev, 1);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
return rv;
err:
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_cancel(mddev);
return rv;
}
......@@ -7282,6 +7402,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
bdevname(rdev->bdev,b), rdev->desc_nr);
if (test_bit(WriteMostly, &rdev->flags))
seq_printf(seq, "(W)");
if (test_bit(Journal, &rdev->flags))
seq_printf(seq, "(J)");
if (test_bit(Faulty, &rdev->flags)) {
seq_printf(seq, "(F)");
continue;
......@@ -7594,11 +7716,7 @@ int md_allow_write(struct mddev *mddev)
mddev->safemode == 0)
mddev->safemode = 1;
spin_unlock(&mddev->lock);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
md_update_sb(mddev, 0);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
sysfs_notify_dirent_safe(mddev->sysfs_state);
} else
spin_unlock(&mddev->lock);
......@@ -7630,6 +7748,7 @@ void md_do_sync(struct md_thread *thread)
struct md_rdev *rdev;
char *desc, *action = NULL;
struct blk_plug plug;
bool cluster_resync_finished = false;
/* just incase thread restarts... */
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
......@@ -7739,6 +7858,7 @@ void md_do_sync(struct md_thread *thread)
rcu_read_lock();
rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < j)
......@@ -7799,9 +7919,6 @@ void md_do_sync(struct md_thread *thread)
md_new_event(mddev);
update_time = jiffies;
if (mddev_is_clustered(mddev))
md_cluster_ops->resync_start(mddev, j, max_sectors);
blk_start_plug(&plug);
while (j < max_sectors) {
sector_t sectors;
......@@ -7865,8 +7982,6 @@ void md_do_sync(struct md_thread *thread)
j = max_sectors;
if (j > 2)
mddev->curr_resync = j;
if (mddev_is_clustered(mddev))
md_cluster_ops->resync_info_update(mddev, j, max_sectors);
mddev->curr_mark_cnt = io_sectors;
if (last_check == 0)
/* this is the earliest that rebuild will be
......@@ -7937,7 +8052,11 @@ void md_do_sync(struct md_thread *thread)
mddev->curr_resync_completed = mddev->curr_resync;
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
/* tell personality that we are finished */
/* tell personality and other nodes that we are finished */
if (mddev_is_clustered(mddev)) {
md_cluster_ops->resync_finish(mddev);
cluster_resync_finished = true;
}
mddev->pers->sync_request(mddev, max_sectors, &skipped);
if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
......@@ -7965,6 +8084,7 @@ void md_do_sync(struct md_thread *thread)
rdev_for_each_rcu(rdev, mddev)
if (rdev->raid_disk >= 0 &&
mddev->delta_disks >= 0 &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
!test_bit(In_sync, &rdev->flags) &&
rdev->recovery_offset < mddev->curr_resync)
......@@ -7973,11 +8093,13 @@ void md_do_sync(struct md_thread *thread)
}
}
skip:
if (mddev_is_clustered(mddev))
md_cluster_ops->resync_finish(mddev);
set_bit(MD_CHANGE_DEVS, &mddev->flags);
if (mddev_is_clustered(mddev) &&
test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
!cluster_resync_finished)
md_cluster_ops->resync_finish(mddev);
spin_lock(&mddev->lock);
if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
/* We completed so min/max setting can be forgotten if used. */
......@@ -8008,7 +8130,8 @@ static int remove_and_add_spares(struct mddev *mddev,
rdev->raid_disk >= 0 &&
!test_bit(Blocked, &rdev->flags) &&
(test_bit(Faulty, &rdev->flags) ||
! test_bit(In_sync, &rdev->flags)) &&
(!test_bit(In_sync, &rdev->flags) &&
!test_bit(Journal, &rdev->flags))) &&
atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk(
mddev, rdev) == 0) {
......@@ -8020,18 +8143,25 @@ static int remove_and_add_spares(struct mddev *mddev,
if (removed && mddev->kobj.sd)
sysfs_notify(&mddev->kobj, NULL, "degraded");
if (this)
if (this && removed)
goto no_add;
rdev_for_each(rdev, mddev) {
if (this && this != rdev)
continue;
if (test_bit(Candidate, &rdev->flags))
continue;
if (rdev->raid_disk >= 0 &&
!test_bit(In_sync, &rdev->flags) &&
!test_bit(Journal, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags))
spares++;
if (rdev->raid_disk >= 0)
continue;
if (test_bit(Faulty, &rdev->flags))
continue;
if (test_bit(Journal, &rdev->flags))
continue;
if (mddev->ro &&
! (rdev->saved_raid_disk >= 0 &&
!test_bit(Bitmap_sync, &rdev->flags)))
......@@ -8056,11 +8186,22 @@ static int remove_and_add_spares(struct mddev *mddev,
static void md_start_sync(struct work_struct *ws)
{
struct mddev *mddev = container_of(ws, struct mddev, del_work);
int ret = 0;
if (mddev_is_clustered(mddev)) {
ret = md_cluster_ops->resync_start(mddev);
if (ret) {
mddev->sync_thread = NULL;
goto out;
}
}
mddev->sync_thread = md_register_thread(md_do_sync,
mddev,
"resync");
out:
if (!mddev->sync_thread) {
if (!(mddev_is_clustered(mddev) && ret == -EAGAIN))
printk(KERN_ERR "%s: could not start resync"
" thread...\n",
mdname(mddev));
......@@ -8182,13 +8323,8 @@ void md_check_recovery(struct mddev *mddev)
sysfs_notify_dirent_safe(mddev->sysfs_state);
}
if (mddev->flags & MD_UPDATE_SB_FLAGS) {
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
if (mddev->flags & MD_UPDATE_SB_FLAGS)
md_update_sb(mddev, 0);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
}
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
!test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
......@@ -8286,8 +8422,6 @@ void md_reap_sync_thread(struct mddev *mddev)
set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
}
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_start(mddev);
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
mddev->pers->finish_reshape)
mddev->pers->finish_reshape(mddev);
......@@ -8300,8 +8434,6 @@ void md_reap_sync_thread(struct mddev *mddev)
rdev->saved_raid_disk = -1;
md_update_sb(mddev, 1);
if (mddev_is_clustered(mddev))
md_cluster_ops->metadata_update_finish(mddev);
clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
......@@ -8924,25 +9056,128 @@ static int __init md_init(void)
return ret;
}
void md_reload_sb(struct mddev *mddev)
static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
struct md_rdev *rdev, *tmp;
struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
struct md_rdev *rdev2;
int role, ret;
char b[BDEVNAME_SIZE];
rdev_for_each_safe(rdev, tmp, mddev) {
rdev->sb_loaded = 0;
/* Check for change of roles in the active devices */
rdev_for_each(rdev2, mddev) {
if (test_bit(Faulty, &rdev2->flags))
continue;
/* Check if the roles changed */
role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
if (test_bit(Candidate, &rdev2->flags)) {
if (role == 0xfffe) {
pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
md_kick_rdev_from_array(rdev2);
continue;
}
else
clear_bit(Candidate, &rdev2->flags);
}
if (role != rdev2->raid_disk) {
/* got activated */
if (rdev2->raid_disk == -1 && role != 0xffff) {
rdev2->saved_raid_disk = role;
ret = remove_and_add_spares(mddev, rdev2);
pr_info("Activated spare: %s\n",
bdevname(rdev2->bdev,b));
continue;
}
/* device faulty
* We just want to do the minimum to mark the disk
* as faulty. The recovery is performed by the
* one who initiated the error.
*/
if ((role == 0xfffe) || (role == 0xfffd)) {
md_error(mddev, rdev2);
clear_bit(Blocked, &rdev2->flags);
}
}
}
if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
/* Finally set the event to be up to date */
mddev->events = le64_to_cpu(sb->events);
}
static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
int err;
struct page *swapout = rdev->sb_page;
struct mdp_superblock_1 *sb;
/* Store the sb page of the rdev in the swapout temporary
* variable in case we err in the future
*/
rdev->sb_page = NULL;
alloc_disk_sb(rdev);
ClearPageUptodate(rdev->sb_page);
rdev->sb_loaded = 0;
err = super_types[mddev->major_version].load_super(rdev, NULL, mddev->minor_version);
if (err < 0) {
pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
__func__, __LINE__, rdev->desc_nr, err);
put_page(rdev->sb_page);
rdev->sb_page = swapout;
rdev->sb_loaded = 1;
return err;
}
mddev->raid_disks = 0;
analyze_sbs(mddev);
rdev_for_each_safe(rdev, tmp, mddev) {
struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
/* since we don't write to faulty devices, we figure out if the
* disk is faulty by comparing events
sb = page_address(rdev->sb_page);
/* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
* is not set
*/
if (mddev->events > sb->events)
set_bit(Faulty, &rdev->flags);
if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
/* The other node finished recovery, call spare_active to set
* device In_sync and mddev->degraded
*/
if (rdev->recovery_offset == MaxSector &&
!test_bit(In_sync, &rdev->flags) &&
mddev->pers->spare_active(mddev))
sysfs_notify(&mddev->kobj, NULL, "degraded");
put_page(swapout);
return 0;
}
void md_reload_sb(struct mddev *mddev, int nr)
{
struct md_rdev *rdev;
int err;
/* Find the rdev */
rdev_for_each_rcu(rdev, mddev) {
if (rdev->desc_nr == nr)
break;
}
if (!rdev || rdev->desc_nr != nr) {
pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
return;
}
err = read_rdev(mddev, rdev);
if (err < 0)
return;
check_sb_changes(mddev, rdev);
/* Read all rdev's to update recovery_offset */
rdev_for_each_rcu(rdev, mddev)
read_rdev(mddev, rdev);
}
EXPORT_SYMBOL(md_reload_sb);
......
......@@ -87,10 +87,16 @@ struct md_rdev {
* array and could again if we did a partial
* resync from the bitmap
*/
union {
sector_t recovery_offset;/* If this device has been partially
* recovered, this is where we were
* up to.
*/
sector_t journal_tail; /* If this device is a journal device,
* this is the journal tail (journal
* recovery start point)
*/
};
atomic_t nr_pending; /* number of pending requests.
* only maintained for arrays that
......@@ -172,6 +178,11 @@ enum flag_bits {
* This device is seen locally but not
* by the whole cluster
*/
Journal, /* This device is used as journal for
* raid-5/6.
* Usually, this device should be faster
* than other devices in the array
*/
};
#define BB_LEN_MASK (0x00000000000001FFULL)
......@@ -221,6 +232,8 @@ struct mddev {
#define MD_STILL_CLOSED 4 /* If set, then array has not been opened since
* md_ioctl checked on it.
*/
#define MD_JOURNAL_CLEAN 5 /* A raid with journal is already clean */
#define MD_HAS_JOURNAL 6 /* The raid array has journal feature set */
int suspended;
atomic_t active_io;
......@@ -658,7 +671,7 @@ extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
struct mddev *mddev);
extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
extern void md_reload_sb(struct mddev *mddev);
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
extern void md_kick_rdev_from_array(struct md_rdev * rdev);
struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr);
......
......@@ -90,6 +90,8 @@ static void r1bio_pool_free(void *r1_bio, void *data)
#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS)
static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
......@@ -1590,6 +1592,15 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (rdev->raid_disk >= 0)
first = last = rdev->raid_disk;
/*
* find the disk ... but prefer rdev->saved_raid_disk
* if possible.
*/
if (rdev->saved_raid_disk >= 0 &&
rdev->saved_raid_disk >= first &&
conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
first = last = rdev->saved_raid_disk;
for (mirror = first; mirror <= last; mirror++) {
p = conf->mirrors+mirror;
if (!p->rdev) {
......@@ -2495,6 +2506,11 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
bitmap_close_sync(mddev->bitmap);
close_sync(conf);
if (mddev_is_clustered(mddev)) {
conf->cluster_sync_low = 0;
conf->cluster_sync_high = 0;
}
return 0;
}
......@@ -2515,7 +2531,12 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
return sync_blocks;
}
bitmap_cond_end_sync(mddev->bitmap, sector_nr);
/* we are incrementing sector_nr below. To be safe, we check against
* sector_nr + two times RESYNC_SECTORS
*/
bitmap_cond_end_sync(mddev->bitmap, sector_nr,
mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
raise_barrier(conf, sector_nr);
......@@ -2706,6 +2727,16 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
bio_full:
r1_bio->sectors = nr_sectors;
if (mddev_is_clustered(mddev) &&
conf->cluster_sync_high < sector_nr + nr_sectors) {
conf->cluster_sync_low = mddev->curr_resync_completed;
conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS;
/* Send resync message */
md_cluster_ops->resync_info_update(mddev,
conf->cluster_sync_low,
conf->cluster_sync_high);
}
/* For a user-requested sync, we read all readable devices and do a
* compare
*/
......@@ -3020,9 +3051,11 @@ static int raid1_reshape(struct mddev *mddev)
return -EINVAL;
}
if (!mddev_is_clustered(mddev)) {
err = md_allow_write(mddev);
if (err)
return err;
}
raid_disks = mddev->raid_disks + mddev->delta_disks;
......
......@@ -111,6 +111,13 @@ struct r1conf {
* the new thread here until we fully activate the array.
*/
struct md_thread *thread;
/* Keep track of cluster resync window to send to other
* nodes.
*/
sector_t cluster_sync_low;
sector_t cluster_sync_high;
};
/*
......
......@@ -3149,7 +3149,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
/* resync. Schedule a read for every block at this virt offset */
int count = 0;
bitmap_cond_end_sync(mddev->bitmap, sector_nr);
bitmap_cond_end_sync(mddev->bitmap, sector_nr, 0);
if (!bitmap_start_sync(mddev->bitmap, sector_nr,
&sync_blocks, mddev->degraded) &&
......
/*
* Copyright (C) 2015 Shaohua Li <shli@fb.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
*/
#include <linux/kernel.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/raid/md_p.h>
#include <linux/crc32c.h>
#include <linux/random.h>
#include "md.h"
#include "raid5.h"
/*
* metadata/data stored in disk with 4k size unit (a block) regardless
* underneath hardware sector size. only works with PAGE_SIZE == 4096
*/
#define BLOCK_SECTORS (8)
/*
* reclaim runs every 1/4 disk size or 10G reclaimable space. This can prevent
* recovery scans a very long log
*/
#define RECLAIM_MAX_FREE_SPACE (10 * 1024 * 1024 * 2) /* sector */
#define RECLAIM_MAX_FREE_SPACE_SHIFT (2)
struct r5l_log {
struct md_rdev *rdev;
u32 uuid_checksum;
sector_t device_size; /* log device size, round to
* BLOCK_SECTORS */
sector_t max_free_space; /* reclaim run if free space is at
* this size */
sector_t last_checkpoint; /* log tail. where recovery scan
* starts from */
u64 last_cp_seq; /* log tail sequence */
sector_t log_start; /* log head. where new data appends */
u64 seq; /* log head sequence */
sector_t next_checkpoint;
u64 next_cp_seq;
struct mutex io_mutex;
struct r5l_io_unit *current_io; /* current io_unit accepting new data */
spinlock_t io_list_lock;
struct list_head running_ios; /* io_units which are still running,
* and have not yet been completely
* written to the log */
struct list_head io_end_ios; /* io_units which have been completely
* written to the log but not yet written
* to the RAID */
struct list_head flushing_ios; /* io_units which are waiting for log
* cache flush */
struct list_head finished_ios; /* io_units which settle down in log disk */
struct bio flush_bio;
struct kmem_cache *io_kc;
struct md_thread *reclaim_thread;
unsigned long reclaim_target; /* number of space that need to be
* reclaimed. if it's 0, reclaim spaces
* used by io_units which are in
* IO_UNIT_STRIPE_END state (eg, reclaim
* dones't wait for specific io_unit
* switching to IO_UNIT_STRIPE_END
* state) */
wait_queue_head_t iounit_wait;
struct list_head no_space_stripes; /* pending stripes, log has no space */
spinlock_t no_space_stripes_lock;
bool need_cache_flush;
bool in_teardown;
};
/*
* an IO range starts from a meta data block and end at the next meta data
* block. The io unit's the meta data block tracks data/parity followed it. io
* unit is written to log disk with normal write, as we always flush log disk
* first and then start move data to raid disks, there is no requirement to
* write io unit with FLUSH/FUA
*/
struct r5l_io_unit {
struct r5l_log *log;
struct page *meta_page; /* store meta block */
int meta_offset; /* current offset in meta_page */
struct bio *current_bio;/* current_bio accepting new data */
atomic_t pending_stripe;/* how many stripes not flushed to raid */
u64 seq; /* seq number of the metablock */
sector_t log_start; /* where the io_unit starts */
sector_t log_end; /* where the io_unit ends */
struct list_head log_sibling; /* log->running_ios */
struct list_head stripe_list; /* stripes added to the io_unit */
int state;
bool need_split_bio;
};
/* r5l_io_unit state */
enum r5l_io_unit_state {
IO_UNIT_RUNNING = 0, /* accepting new IO */
IO_UNIT_IO_START = 1, /* io_unit bio start writing to log,
* don't accepting new bio */
IO_UNIT_IO_END = 2, /* io_unit bio finish writing to log */
IO_UNIT_STRIPE_END = 3, /* stripes data finished writing to raid */
};
static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
{
start += inc;
if (start >= log->device_size)
start = start - log->device_size;
return start;
}
static sector_t r5l_ring_distance(struct r5l_log *log, sector_t start,
sector_t end)
{
if (end >= start)
return end - start;
else
return end + log->device_size - start;
}
static bool r5l_has_free_space(struct r5l_log *log, sector_t size)
{
sector_t used_size;
used_size = r5l_ring_distance(log, log->last_checkpoint,
log->log_start);
return log->device_size > used_size + size;
}
static void r5l_free_io_unit(struct r5l_log *log, struct r5l_io_unit *io)
{
__free_page(io->meta_page);
kmem_cache_free(log->io_kc, io);
}
static void r5l_move_io_unit_list(struct list_head *from, struct list_head *to,
enum r5l_io_unit_state state)
{
struct r5l_io_unit *io;
while (!list_empty(from)) {
io = list_first_entry(from, struct r5l_io_unit, log_sibling);
/* don't change list order */
if (io->state >= state)
list_move_tail(&io->log_sibling, to);
else
break;
}
}
static void __r5l_set_io_unit_state(struct r5l_io_unit *io,
enum r5l_io_unit_state state)
{
if (WARN_ON(io->state >= state))
return;
io->state = state;
}
static void r5l_io_run_stripes(struct r5l_io_unit *io)
{
struct stripe_head *sh, *next;
list_for_each_entry_safe(sh, next, &io->stripe_list, log_list) {
list_del_init(&sh->log_list);
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
}
}
static void r5l_log_run_stripes(struct r5l_log *log)
{
struct r5l_io_unit *io, *next;
assert_spin_locked(&log->io_list_lock);
list_for_each_entry_safe(io, next, &log->running_ios, log_sibling) {
/* don't change list order */
if (io->state < IO_UNIT_IO_END)
break;
list_move_tail(&io->log_sibling, &log->finished_ios);
r5l_io_run_stripes(io);
}
}
static void r5l_log_endio(struct bio *bio)
{
struct r5l_io_unit *io = bio->bi_private;
struct r5l_log *log = io->log;
unsigned long flags;
if (bio->bi_error)
md_error(log->rdev->mddev, log->rdev);
bio_put(bio);
spin_lock_irqsave(&log->io_list_lock, flags);
__r5l_set_io_unit_state(io, IO_UNIT_IO_END);
if (log->need_cache_flush)
r5l_move_io_unit_list(&log->running_ios, &log->io_end_ios,
IO_UNIT_IO_END);
else
r5l_log_run_stripes(log);
spin_unlock_irqrestore(&log->io_list_lock, flags);
if (log->need_cache_flush)
md_wakeup_thread(log->rdev->mddev->thread);
}
static void r5l_submit_current_io(struct r5l_log *log)
{
struct r5l_io_unit *io = log->current_io;
struct r5l_meta_block *block;
unsigned long flags;
u32 crc;
if (!io)
return;
block = page_address(io->meta_page);
block->meta_size = cpu_to_le32(io->meta_offset);
crc = crc32c_le(log->uuid_checksum, block, PAGE_SIZE);
block->checksum = cpu_to_le32(crc);
log->current_io = NULL;
spin_lock_irqsave(&log->io_list_lock, flags);
__r5l_set_io_unit_state(io, IO_UNIT_IO_START);
spin_unlock_irqrestore(&log->io_list_lock, flags);
submit_bio(WRITE, io->current_bio);
}
static struct bio *r5l_bio_alloc(struct r5l_log *log)
{
struct bio *bio = bio_kmalloc(GFP_NOIO | __GFP_NOFAIL, BIO_MAX_PAGES);
bio->bi_rw = WRITE;
bio->bi_bdev = log->rdev->bdev;
bio->bi_iter.bi_sector = log->rdev->data_offset + log->log_start;
return bio;
}
static void r5_reserve_log_entry(struct r5l_log *log, struct r5l_io_unit *io)
{
log->log_start = r5l_ring_add(log, log->log_start, BLOCK_SECTORS);
/*
* If we filled up the log device start from the beginning again,
* which will require a new bio.
*
* Note: for this to work properly the log size needs to me a multiple
* of BLOCK_SECTORS.
*/
if (log->log_start == 0)
io->need_split_bio = true;
io->log_end = log->log_start;
}
static struct r5l_io_unit *r5l_new_meta(struct r5l_log *log)
{
struct r5l_io_unit *io;
struct r5l_meta_block *block;
/* We can't handle memory allocate failure so far */
io = kmem_cache_zalloc(log->io_kc, GFP_NOIO | __GFP_NOFAIL);
io->log = log;
INIT_LIST_HEAD(&io->log_sibling);
INIT_LIST_HEAD(&io->stripe_list);
io->state = IO_UNIT_RUNNING;
io->meta_page = alloc_page(GFP_NOIO | __GFP_NOFAIL | __GFP_ZERO);
block = page_address(io->meta_page);
block->magic = cpu_to_le32(R5LOG_MAGIC);
block->version = R5LOG_VERSION;
block->seq = cpu_to_le64(log->seq);
block->position = cpu_to_le64(log->log_start);
io->log_start = log->log_start;
io->meta_offset = sizeof(struct r5l_meta_block);
io->seq = log->seq++;
io->current_bio = r5l_bio_alloc(log);
io->current_bio->bi_end_io = r5l_log_endio;
io->current_bio->bi_private = io;
bio_add_page(io->current_bio, io->meta_page, PAGE_SIZE, 0);
r5_reserve_log_entry(log, io);
spin_lock_irq(&log->io_list_lock);
list_add_tail(&io->log_sibling, &log->running_ios);
spin_unlock_irq(&log->io_list_lock);
return io;
}
static int r5l_get_meta(struct r5l_log *log, unsigned int payload_size)
{
if (log->current_io &&
log->current_io->meta_offset + payload_size > PAGE_SIZE)
r5l_submit_current_io(log);
if (!log->current_io)
log->current_io = r5l_new_meta(log);
return 0;
}
static void r5l_append_payload_meta(struct r5l_log *log, u16 type,
sector_t location,
u32 checksum1, u32 checksum2,
bool checksum2_valid)
{
struct r5l_io_unit *io = log->current_io;
struct r5l_payload_data_parity *payload;
payload = page_address(io->meta_page) + io->meta_offset;
payload->header.type = cpu_to_le16(type);
payload->header.flags = cpu_to_le16(0);
payload->size = cpu_to_le32((1 + !!checksum2_valid) <<
(PAGE_SHIFT - 9));
payload->location = cpu_to_le64(location);
payload->checksum[0] = cpu_to_le32(checksum1);
if (checksum2_valid)
payload->checksum[1] = cpu_to_le32(checksum2);
io->meta_offset += sizeof(struct r5l_payload_data_parity) +
sizeof(__le32) * (1 + !!checksum2_valid);
}
static void r5l_append_payload_page(struct r5l_log *log, struct page *page)
{
struct r5l_io_unit *io = log->current_io;
if (io->need_split_bio) {
struct bio *prev = io->current_bio;
io->current_bio = r5l_bio_alloc(log);
bio_chain(io->current_bio, prev);
submit_bio(WRITE, prev);
}
if (!bio_add_page(io->current_bio, page, PAGE_SIZE, 0))
BUG();
r5_reserve_log_entry(log, io);
}
static void r5l_log_stripe(struct r5l_log *log, struct stripe_head *sh,
int data_pages, int parity_pages)
{
int i;
int meta_size;
struct r5l_io_unit *io;
meta_size =
((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
* data_pages) +
sizeof(struct r5l_payload_data_parity) +
sizeof(__le32) * parity_pages;
r5l_get_meta(log, meta_size);
io = log->current_io;
for (i = 0; i < sh->disks; i++) {
if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
continue;
if (i == sh->pd_idx || i == sh->qd_idx)
continue;
r5l_append_payload_meta(log, R5LOG_PAYLOAD_DATA,
raid5_compute_blocknr(sh, i, 0),
sh->dev[i].log_checksum, 0, false);
r5l_append_payload_page(log, sh->dev[i].page);
}
if (sh->qd_idx >= 0) {
r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
sh->sector, sh->dev[sh->pd_idx].log_checksum,
sh->dev[sh->qd_idx].log_checksum, true);
r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
r5l_append_payload_page(log, sh->dev[sh->qd_idx].page);
} else {
r5l_append_payload_meta(log, R5LOG_PAYLOAD_PARITY,
sh->sector, sh->dev[sh->pd_idx].log_checksum,
0, false);
r5l_append_payload_page(log, sh->dev[sh->pd_idx].page);
}
list_add_tail(&sh->log_list, &io->stripe_list);
atomic_inc(&io->pending_stripe);
sh->log_io = io;
}
static void r5l_wake_reclaim(struct r5l_log *log, sector_t space);
/*
* running in raid5d, where reclaim could wait for raid5d too (when it flushes
* data from log to raid disks), so we shouldn't wait for reclaim here
*/
int r5l_write_stripe(struct r5l_log *log, struct stripe_head *sh)
{
int write_disks = 0;
int data_pages, parity_pages;
int meta_size;
int reserve;
int i;
if (!log)
return -EAGAIN;
/* Don't support stripe batch */
if (sh->log_io || !test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags) ||
test_bit(STRIPE_SYNCING, &sh->state)) {
/* the stripe is written to log, we start writing it to raid */
clear_bit(STRIPE_LOG_TRAPPED, &sh->state);
return -EAGAIN;
}
for (i = 0; i < sh->disks; i++) {
void *addr;
if (!test_bit(R5_Wantwrite, &sh->dev[i].flags))
continue;
write_disks++;
/* checksum is already calculated in last run */
if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
continue;
addr = kmap_atomic(sh->dev[i].page);
sh->dev[i].log_checksum = crc32c_le(log->uuid_checksum,
addr, PAGE_SIZE);
kunmap_atomic(addr);
}
parity_pages = 1 + !!(sh->qd_idx >= 0);
data_pages = write_disks - parity_pages;
meta_size =
((sizeof(struct r5l_payload_data_parity) + sizeof(__le32))
* data_pages) +
sizeof(struct r5l_payload_data_parity) +
sizeof(__le32) * parity_pages;
/* Doesn't work with very big raid array */
if (meta_size + sizeof(struct r5l_meta_block) > PAGE_SIZE)
return -EINVAL;
set_bit(STRIPE_LOG_TRAPPED, &sh->state);
/*
* The stripe must enter state machine again to finish the write, so
* don't delay.
*/
clear_bit(STRIPE_DELAYED, &sh->state);
atomic_inc(&sh->count);
mutex_lock(&log->io_mutex);
/* meta + data */
reserve = (1 + write_disks) << (PAGE_SHIFT - 9);
if (r5l_has_free_space(log, reserve))
r5l_log_stripe(log, sh, data_pages, parity_pages);
else {
spin_lock(&log->no_space_stripes_lock);
list_add_tail(&sh->log_list, &log->no_space_stripes);
spin_unlock(&log->no_space_stripes_lock);
r5l_wake_reclaim(log, reserve);
}
mutex_unlock(&log->io_mutex);
return 0;
}
void r5l_write_stripe_run(struct r5l_log *log)
{
if (!log)
return;
mutex_lock(&log->io_mutex);
r5l_submit_current_io(log);
mutex_unlock(&log->io_mutex);
}
int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio)
{
if (!log)
return -ENODEV;
/*
* we flush log disk cache first, then write stripe data to raid disks.
* So if bio is finished, the log disk cache is flushed already. The
* recovery guarantees we can recovery the bio from log disk, so we
* don't need to flush again
*/
if (bio->bi_iter.bi_size == 0) {
bio_endio(bio);
return 0;
}
bio->bi_rw &= ~REQ_FLUSH;
return -EAGAIN;
}
/* This will run after log space is reclaimed */
static void r5l_run_no_space_stripes(struct r5l_log *log)
{
struct stripe_head *sh;
spin_lock(&log->no_space_stripes_lock);
while (!list_empty(&log->no_space_stripes)) {
sh = list_first_entry(&log->no_space_stripes,
struct stripe_head, log_list);
list_del_init(&sh->log_list);
set_bit(STRIPE_HANDLE, &sh->state);
raid5_release_stripe(sh);
}
spin_unlock(&log->no_space_stripes_lock);
}
static sector_t r5l_reclaimable_space(struct r5l_log *log)
{
return r5l_ring_distance(log, log->last_checkpoint,
log->next_checkpoint);
}
static bool r5l_complete_finished_ios(struct r5l_log *log)
{
struct r5l_io_unit *io, *next;
bool found = false;
assert_spin_locked(&log->io_list_lock);
list_for_each_entry_safe(io, next, &log->finished_ios, log_sibling) {
/* don't change list order */
if (io->state < IO_UNIT_STRIPE_END)
break;
log->next_checkpoint = io->log_start;
log->next_cp_seq = io->seq;
list_del(&io->log_sibling);
r5l_free_io_unit(log, io);
found = true;
}
return found;
}
static void __r5l_stripe_write_finished(struct r5l_io_unit *io)
{
struct r5l_log *log = io->log;
unsigned long flags;
spin_lock_irqsave(&log->io_list_lock, flags);
__r5l_set_io_unit_state(io, IO_UNIT_STRIPE_END);
if (!r5l_complete_finished_ios(log)) {
spin_unlock_irqrestore(&log->io_list_lock, flags);
return;
}
if (r5l_reclaimable_space(log) > log->max_free_space)
r5l_wake_reclaim(log, 0);
spin_unlock_irqrestore(&log->io_list_lock, flags);
wake_up(&log->iounit_wait);
}
void r5l_stripe_write_finished(struct stripe_head *sh)
{
struct r5l_io_unit *io;
io = sh->log_io;
sh->log_io = NULL;
if (io && atomic_dec_and_test(&io->pending_stripe))
__r5l_stripe_write_finished(io);
}
static void r5l_log_flush_endio(struct bio *bio)
{
struct r5l_log *log = container_of(bio, struct r5l_log,
flush_bio);
unsigned long flags;
struct r5l_io_unit *io;
if (bio->bi_error)
md_error(log->rdev->mddev, log->rdev);
spin_lock_irqsave(&log->io_list_lock, flags);
list_for_each_entry(io, &log->flushing_ios, log_sibling)
r5l_io_run_stripes(io);
list_splice_tail_init(&log->flushing_ios, &log->finished_ios);
spin_unlock_irqrestore(&log->io_list_lock, flags);
}
/*
* Starting dispatch IO to raid.
* io_unit(meta) consists of a log. There is one situation we want to avoid. A
* broken meta in the middle of a log causes recovery can't find meta at the
* head of log. If operations require meta at the head persistent in log, we
* must make sure meta before it persistent in log too. A case is:
*
* stripe data/parity is in log, we start write stripe to raid disks. stripe
* data/parity must be persistent in log before we do the write to raid disks.
*
* The solution is we restrictly maintain io_unit list order. In this case, we
* only write stripes of an io_unit to raid disks till the io_unit is the first
* one whose data/parity is in log.
*/
void r5l_flush_stripe_to_raid(struct r5l_log *log)
{
bool do_flush;
if (!log || !log->need_cache_flush)
return;
spin_lock_irq(&log->io_list_lock);
/* flush bio is running */
if (!list_empty(&log->flushing_ios)) {
spin_unlock_irq(&log->io_list_lock);
return;
}
list_splice_tail_init(&log->io_end_ios, &log->flushing_ios);
do_flush = !list_empty(&log->flushing_ios);
spin_unlock_irq(&log->io_list_lock);
if (!do_flush)
return;
bio_reset(&log->flush_bio);
log->flush_bio.bi_bdev = log->rdev->bdev;
log->flush_bio.bi_end_io = r5l_log_flush_endio;
submit_bio(WRITE_FLUSH, &log->flush_bio);
}
static void r5l_write_super(struct r5l_log *log, sector_t cp);
static void r5l_write_super_and_discard_space(struct r5l_log *log,
sector_t end)
{
struct block_device *bdev = log->rdev->bdev;
struct mddev *mddev;
r5l_write_super(log, end);
if (!blk_queue_discard(bdev_get_queue(bdev)))
return;
mddev = log->rdev->mddev;
/*
* This is to avoid a deadlock. r5l_quiesce holds reconfig_mutex and
* wait for this thread to finish. This thread waits for
* MD_CHANGE_PENDING clear, which is supposed to be done in
* md_check_recovery(). md_check_recovery() tries to get
* reconfig_mutex. Since r5l_quiesce already holds the mutex,
* md_check_recovery() fails, so the PENDING never get cleared. The
* in_teardown check workaround this issue.
*/
if (!log->in_teardown) {
set_bit(MD_CHANGE_DEVS, &mddev->flags);
set_bit(MD_CHANGE_PENDING, &mddev->flags);
md_wakeup_thread(mddev->thread);
wait_event(mddev->sb_wait,
!test_bit(MD_CHANGE_PENDING, &mddev->flags) ||
log->in_teardown);
/*
* r5l_quiesce could run after in_teardown check and hold
* mutex first. Superblock might get updated twice.
*/
if (log->in_teardown)
md_update_sb(mddev, 1);
} else {
WARN_ON(!mddev_is_locked(mddev));
md_update_sb(mddev, 1);
}
/* discard IO error really doesn't matter, ignore it */
if (log->last_checkpoint < end) {
blkdev_issue_discard(bdev,
log->last_checkpoint + log->rdev->data_offset,
end - log->last_checkpoint, GFP_NOIO, 0);
} else {
blkdev_issue_discard(bdev,
log->last_checkpoint + log->rdev->data_offset,
log->device_size - log->last_checkpoint,
GFP_NOIO, 0);
blkdev_issue_discard(bdev, log->rdev->data_offset, end,
GFP_NOIO, 0);
}
}
static void r5l_do_reclaim(struct r5l_log *log)
{
sector_t reclaim_target = xchg(&log->reclaim_target, 0);
sector_t reclaimable;
sector_t next_checkpoint;
u64 next_cp_seq;
spin_lock_irq(&log->io_list_lock);
/*
* move proper io_unit to reclaim list. We should not change the order.
* reclaimable/unreclaimable io_unit can be mixed in the list, we
* shouldn't reuse space of an unreclaimable io_unit
*/
while (1) {
reclaimable = r5l_reclaimable_space(log);
if (reclaimable >= reclaim_target ||
(list_empty(&log->running_ios) &&
list_empty(&log->io_end_ios) &&
list_empty(&log->flushing_ios) &&
list_empty(&log->finished_ios)))
break;
md_wakeup_thread(log->rdev->mddev->thread);
wait_event_lock_irq(log->iounit_wait,
r5l_reclaimable_space(log) > reclaimable,
log->io_list_lock);
}
next_checkpoint = log->next_checkpoint;
next_cp_seq = log->next_cp_seq;
spin_unlock_irq(&log->io_list_lock);
BUG_ON(reclaimable < 0);
if (reclaimable == 0)
return;
/*
* write_super will flush cache of each raid disk. We must write super
* here, because the log area might be reused soon and we don't want to
* confuse recovery
*/
r5l_write_super_and_discard_space(log, next_checkpoint);
mutex_lock(&log->io_mutex);
log->last_checkpoint = next_checkpoint;
log->last_cp_seq = next_cp_seq;
mutex_unlock(&log->io_mutex);
r5l_run_no_space_stripes(log);
}
static void r5l_reclaim_thread(struct md_thread *thread)
{
struct mddev *mddev = thread->mddev;
struct r5conf *conf = mddev->private;
struct r5l_log *log = conf->log;
if (!log)
return;
r5l_do_reclaim(log);
}
static void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
{
unsigned long target;
unsigned long new = (unsigned long)space; /* overflow in theory */
do {
target = log->reclaim_target;
if (new < target)
return;
} while (cmpxchg(&log->reclaim_target, target, new) != target);
md_wakeup_thread(log->reclaim_thread);
}
void r5l_quiesce(struct r5l_log *log, int state)
{
struct mddev *mddev;
if (!log || state == 2)
return;
if (state == 0) {
log->in_teardown = 0;
log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
log->rdev->mddev, "reclaim");
} else if (state == 1) {
/*
* at this point all stripes are finished, so io_unit is at
* least in STRIPE_END state
*/
log->in_teardown = 1;
/* make sure r5l_write_super_and_discard_space exits */
mddev = log->rdev->mddev;
wake_up(&mddev->sb_wait);
r5l_wake_reclaim(log, -1L);
md_unregister_thread(&log->reclaim_thread);
r5l_do_reclaim(log);
}
}
bool r5l_log_disk_error(struct r5conf *conf)
{
/* don't allow write if journal disk is missing */
if (!conf->log)
return test_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
return test_bit(Faulty, &conf->log->rdev->flags);
}
struct r5l_recovery_ctx {
struct page *meta_page; /* current meta */
sector_t meta_total_blocks; /* total size of current meta and data */
sector_t pos; /* recovery position */
u64 seq; /* recovery position seq */
};
static int r5l_read_meta_block(struct r5l_log *log,
struct r5l_recovery_ctx *ctx)
{
struct page *page = ctx->meta_page;
struct r5l_meta_block *mb;
u32 crc, stored_crc;
if (!sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, READ, false))
return -EIO;
mb = page_address(page);
stored_crc = le32_to_cpu(mb->checksum);
mb->checksum = 0;
if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
le64_to_cpu(mb->seq) != ctx->seq ||
mb->version != R5LOG_VERSION ||
le64_to_cpu(mb->position) != ctx->pos)
return -EINVAL;
crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
if (stored_crc != crc)
return -EINVAL;
if (le32_to_cpu(mb->meta_size) > PAGE_SIZE)
return -EINVAL;
ctx->meta_total_blocks = BLOCK_SECTORS;
return 0;
}
static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
struct r5l_recovery_ctx *ctx,
sector_t stripe_sect,
int *offset, sector_t *log_offset)
{
struct r5conf *conf = log->rdev->mddev->private;
struct stripe_head *sh;
struct r5l_payload_data_parity *payload;
int disk_index;
sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
while (1) {
payload = page_address(ctx->meta_page) + *offset;
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
raid5_compute_sector(conf,
le64_to_cpu(payload->location), 0,
&disk_index, sh);
sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
sh->dev[disk_index].page, READ, false);
sh->dev[disk_index].log_checksum =
le32_to_cpu(payload->checksum[0]);
set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
ctx->meta_total_blocks += BLOCK_SECTORS;
} else {
disk_index = sh->pd_idx;
sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
sh->dev[disk_index].page, READ, false);
sh->dev[disk_index].log_checksum =
le32_to_cpu(payload->checksum[0]);
set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
if (sh->qd_idx >= 0) {
disk_index = sh->qd_idx;
sync_page_io(log->rdev,
r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
PAGE_SIZE, sh->dev[disk_index].page,
READ, false);
sh->dev[disk_index].log_checksum =
le32_to_cpu(payload->checksum[1]);
set_bit(R5_Wantwrite,
&sh->dev[disk_index].flags);
}
ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
}
*log_offset = r5l_ring_add(log, *log_offset,
le32_to_cpu(payload->size));
*offset += sizeof(struct r5l_payload_data_parity) +
sizeof(__le32) *
(le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
break;
}
for (disk_index = 0; disk_index < sh->disks; disk_index++) {
void *addr;
u32 checksum;
if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
continue;
addr = kmap_atomic(sh->dev[disk_index].page);
checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
kunmap_atomic(addr);
if (checksum != sh->dev[disk_index].log_checksum)
goto error;
}
for (disk_index = 0; disk_index < sh->disks; disk_index++) {
struct md_rdev *rdev, *rrdev;
if (!test_and_clear_bit(R5_Wantwrite,
&sh->dev[disk_index].flags))
continue;
/* in case device is broken */
rdev = rcu_dereference(conf->disks[disk_index].rdev);
if (rdev)
sync_page_io(rdev, stripe_sect, PAGE_SIZE,
sh->dev[disk_index].page, WRITE, false);
rrdev = rcu_dereference(conf->disks[disk_index].replacement);
if (rrdev)
sync_page_io(rrdev, stripe_sect, PAGE_SIZE,
sh->dev[disk_index].page, WRITE, false);
}
raid5_release_stripe(sh);
return 0;
error:
for (disk_index = 0; disk_index < sh->disks; disk_index++)
sh->dev[disk_index].flags = 0;
raid5_release_stripe(sh);
return -EINVAL;
}
static int r5l_recovery_flush_one_meta(struct r5l_log *log,
struct r5l_recovery_ctx *ctx)
{
struct r5conf *conf = log->rdev->mddev->private;
struct r5l_payload_data_parity *payload;
struct r5l_meta_block *mb;
int offset;
sector_t log_offset;
sector_t stripe_sector;
mb = page_address(ctx->meta_page);
offset = sizeof(struct r5l_meta_block);
log_offset = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
while (offset < le32_to_cpu(mb->meta_size)) {
int dd;
payload = (void *)mb + offset;
stripe_sector = raid5_compute_sector(conf,
le64_to_cpu(payload->location), 0, &dd, NULL);
if (r5l_recovery_flush_one_stripe(log, ctx, stripe_sector,
&offset, &log_offset))
return -EINVAL;
}
return 0;
}
/* copy data/parity from log to raid disks */
static void r5l_recovery_flush_log(struct r5l_log *log,
struct r5l_recovery_ctx *ctx)
{
while (1) {
if (r5l_read_meta_block(log, ctx))
return;
if (r5l_recovery_flush_one_meta(log, ctx))
return;
ctx->seq++;
ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
}
}
static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
u64 seq)
{
struct page *page;
struct r5l_meta_block *mb;
u32 crc;
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page)
return -ENOMEM;
mb = page_address(page);
mb->magic = cpu_to_le32(R5LOG_MAGIC);
mb->version = R5LOG_VERSION;
mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
mb->seq = cpu_to_le64(seq);
mb->position = cpu_to_le64(pos);
crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
mb->checksum = cpu_to_le32(crc);
if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, WRITE_FUA, false)) {
__free_page(page);
return -EIO;
}
__free_page(page);
return 0;
}
static int r5l_recovery_log(struct r5l_log *log)
{
struct r5l_recovery_ctx ctx;
ctx.pos = log->last_checkpoint;
ctx.seq = log->last_cp_seq;
ctx.meta_page = alloc_page(GFP_KERNEL);
if (!ctx.meta_page)
return -ENOMEM;
r5l_recovery_flush_log(log, &ctx);
__free_page(ctx.meta_page);
/*
* we did a recovery. Now ctx.pos points to an invalid meta block. New
* log will start here. but we can't let superblock point to last valid
* meta block. The log might looks like:
* | meta 1| meta 2| meta 3|
* meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If
* superblock points to meta 1, we write a new valid meta 2n. if crash
* happens again, new recovery will start from meta 1. Since meta 2n is
* valid now, recovery will think meta 3 is valid, which is wrong.
* The solution is we create a new meta in meta2 with its seq == meta
* 1's seq + 10 and let superblock points to meta2. The same recovery will
* not think meta 3 is a valid meta, because its seq doesn't match
*/
if (ctx.seq > log->last_cp_seq + 1) {
int ret;
ret = r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq + 10);
if (ret)
return ret;
log->seq = ctx.seq + 11;
log->log_start = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
r5l_write_super(log, ctx.pos);
} else {
log->log_start = ctx.pos;
log->seq = ctx.seq;
}
return 0;
}
static void r5l_write_super(struct r5l_log *log, sector_t cp)
{
struct mddev *mddev = log->rdev->mddev;
log->rdev->journal_tail = cp;
set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
static int r5l_load_log(struct r5l_log *log)
{
struct md_rdev *rdev = log->rdev;
struct page *page;
struct r5l_meta_block *mb;
sector_t cp = log->rdev->journal_tail;
u32 stored_crc, expected_crc;
bool create_super = false;
int ret;
/* Make sure it's valid */
if (cp >= rdev->sectors || round_down(cp, BLOCK_SECTORS) != cp)
cp = 0;
page = alloc_page(GFP_KERNEL);
if (!page)
return -ENOMEM;
if (!sync_page_io(rdev, cp, PAGE_SIZE, page, READ, false)) {
ret = -EIO;
goto ioerr;
}
mb = page_address(page);
if (le32_to_cpu(mb->magic) != R5LOG_MAGIC ||
mb->version != R5LOG_VERSION) {
create_super = true;
goto create;
}
stored_crc = le32_to_cpu(mb->checksum);
mb->checksum = 0;
expected_crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
if (stored_crc != expected_crc) {
create_super = true;
goto create;
}
if (le64_to_cpu(mb->position) != cp) {
create_super = true;
goto create;
}
create:
if (create_super) {
log->last_cp_seq = prandom_u32();
cp = 0;
/*
* Make sure super points to correct address. Log might have
* data very soon. If super hasn't correct log tail address,
* recovery can't find the log
*/
r5l_write_super(log, cp);
} else
log->last_cp_seq = le64_to_cpu(mb->seq);
log->device_size = round_down(rdev->sectors, BLOCK_SECTORS);
log->max_free_space = log->device_size >> RECLAIM_MAX_FREE_SPACE_SHIFT;
if (log->max_free_space > RECLAIM_MAX_FREE_SPACE)
log->max_free_space = RECLAIM_MAX_FREE_SPACE;
log->last_checkpoint = cp;
__free_page(page);
return r5l_recovery_log(log);
ioerr:
__free_page(page);
return ret;
}
int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
struct r5l_log *log;
if (PAGE_SIZE != 4096)
return -EINVAL;
log = kzalloc(sizeof(*log), GFP_KERNEL);
if (!log)
return -ENOMEM;
log->rdev = rdev;
log->need_cache_flush = (rdev->bdev->bd_disk->queue->flush_flags != 0);
log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
sizeof(rdev->mddev->uuid));
mutex_init(&log->io_mutex);
spin_lock_init(&log->io_list_lock);
INIT_LIST_HEAD(&log->running_ios);
INIT_LIST_HEAD(&log->io_end_ios);
INIT_LIST_HEAD(&log->flushing_ios);
INIT_LIST_HEAD(&log->finished_ios);
bio_init(&log->flush_bio);
log->io_kc = KMEM_CACHE(r5l_io_unit, 0);
if (!log->io_kc)
goto io_kc;
log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
log->rdev->mddev, "reclaim");
if (!log->reclaim_thread)
goto reclaim_thread;
init_waitqueue_head(&log->iounit_wait);
INIT_LIST_HEAD(&log->no_space_stripes);
spin_lock_init(&log->no_space_stripes_lock);
if (r5l_load_log(log))
goto error;
conf->log = log;
return 0;
error:
md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
kmem_cache_destroy(log->io_kc);
io_kc:
kfree(log);
return -EINVAL;
}
void r5l_exit_log(struct r5l_log *log)
{
md_unregister_thread(&log->reclaim_thread);
kmem_cache_destroy(log->io_kc);
kfree(log);
}
......@@ -353,7 +353,7 @@ static void release_inactive_stripe_list(struct r5conf *conf,
struct list_head *list = &temp_inactive_list[size - 1];
/*
* We don't hold any lock here yet, get_active_stripe() might
* We don't hold any lock here yet, raid5_get_active_stripe() might
* remove stripes from the list
*/
if (!list_empty_careful(list)) {
......@@ -413,7 +413,7 @@ static int release_stripe_list(struct r5conf *conf,
return count;
}
static void release_stripe(struct stripe_head *sh)
void raid5_release_stripe(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
unsigned long flags;
......@@ -658,8 +658,8 @@ static int has_failed(struct r5conf *conf)
return 0;
}
static struct stripe_head *
get_active_stripe(struct r5conf *conf, sector_t sector,
struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce)
{
struct stripe_head *sh;
......@@ -755,6 +755,10 @@ static void unlock_two_stripes(struct stripe_head *sh1, struct stripe_head *sh2)
/* Only freshly new full stripe normal write stripe can be added to a batch list */
static bool stripe_can_batch(struct stripe_head *sh)
{
struct r5conf *conf = sh->raid_conf;
if (conf->log)
return false;
return test_bit(STRIPE_BATCH_READY, &sh->state) &&
!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
is_full_stripe_write(sh);
......@@ -858,7 +862,7 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh
unlock_out:
unlock_two_stripes(head, sh);
out:
release_stripe(head);
raid5_release_stripe(head);
}
/* Determine if 'data_offset' or 'new_data_offset' should be used
......@@ -895,6 +899,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
might_sleep();
if (r5l_write_stripe(conf->log, sh) == 0)
return;
for (i = disks; i--; ) {
int rw;
int replace_only = 0;
......@@ -1208,7 +1214,7 @@ static void ops_complete_biofill(void *stripe_head_ref)
return_io(&return_bi);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
raid5_release_stripe(sh);
}
static void ops_run_biofill(struct stripe_head *sh)
......@@ -1271,7 +1277,7 @@ static void ops_complete_compute(void *stripe_head_ref)
if (sh->check_state == check_state_compute_run)
sh->check_state = check_state_compute_result;
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
raid5_release_stripe(sh);
}
/* return a pointer to the address conversion region of the scribble buffer */
......@@ -1697,7 +1703,7 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
}
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
raid5_release_stripe(sh);
}
static void
......@@ -1855,7 +1861,7 @@ static void ops_complete_check(void *stripe_head_ref)
sh->check_state = check_state_check_result;
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
raid5_release_stripe(sh);
}
static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
......@@ -2017,7 +2023,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
/* we just created an active stripe so... */
atomic_inc(&conf->active_stripes);
release_stripe(sh);
raid5_release_stripe(sh);
conf->max_nr_stripes++;
return 1;
}
......@@ -2236,7 +2242,7 @@ static int resize_stripes(struct r5conf *conf, int newsize)
if (!p)
err = -ENOMEM;
}
release_stripe(nsh);
raid5_release_stripe(nsh);
}
/* critical section pass, GFP_NOIO no longer needed */
......@@ -2394,7 +2400,7 @@ static void raid5_end_read_request(struct bio * bi)
rdev_dec_pending(rdev, conf->mddev);
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
raid5_release_stripe(sh);
}
static void raid5_end_write_request(struct bio *bi)
......@@ -2468,14 +2474,12 @@ static void raid5_end_write_request(struct bio *bi)
if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
clear_bit(R5_LOCKED, &sh->dev[i].flags);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
raid5_release_stripe(sh);
if (sh->batch_head && sh != sh->batch_head)
release_stripe(sh->batch_head);
raid5_release_stripe(sh->batch_head);
}
static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
static void raid5_build_block(struct stripe_head *sh, int i, int previous)
{
struct r5dev *dev = &sh->dev[i];
......@@ -2491,7 +2495,7 @@ static void raid5_build_block(struct stripe_head *sh, int i, int previous)
dev->rreq.bi_private = sh;
dev->flags = 0;
dev->sector = compute_blocknr(sh, i, previous);
dev->sector = raid5_compute_blocknr(sh, i, previous);
}
static void error(struct mddev *mddev, struct md_rdev *rdev)
......@@ -2524,7 +2528,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
* Input: a 'big' sector number,
* Output: index of the data and parity disk, and the sector # in them.
*/
static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
int previous, int *dd_idx,
struct stripe_head *sh)
{
......@@ -2726,7 +2730,7 @@ static sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
return new_sector;
}
static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous)
{
struct r5conf *conf = sh->raid_conf;
int raid_disks = sh->disks;
......@@ -3098,6 +3102,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
if (bi)
bitmap_end = 1;
r5l_stripe_write_finished(sh);
if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
wake_up(&conf->wait_for_overlap);
......@@ -3141,6 +3147,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
* the data has not reached the cache yet.
*/
if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
s->failed > conf->max_degraded &&
(!test_bit(R5_Insync, &sh->dev[i].flags) ||
test_bit(R5_ReadError, &sh->dev[i].flags))) {
spin_lock_irq(&sh->stripe_lock);
......@@ -3497,6 +3504,9 @@ static void handle_stripe_clean_event(struct r5conf *conf,
WARN_ON(test_bit(R5_SkipCopy, &dev->flags));
WARN_ON(dev->page != dev->orig_page);
}
r5l_stripe_write_finished(sh);
if (!discard_pending &&
test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
int hash;
......@@ -3939,10 +3949,10 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
struct stripe_head *sh2;
struct async_submit_ctl submit;
sector_t bn = compute_blocknr(sh, i, 1);
sector_t bn = raid5_compute_blocknr(sh, i, 1);
sector_t s = raid5_compute_sector(conf, bn, 0,
&dd_idx, NULL);
sh2 = get_active_stripe(conf, s, 0, 1, 1);
sh2 = raid5_get_active_stripe(conf, s, 0, 1, 1);
if (sh2 == NULL)
/* so far only the early blocks of this stripe
* have been requested. When later blocks
......@@ -3952,7 +3962,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
/* must have already done this block */
release_stripe(sh2);
raid5_release_stripe(sh2);
continue;
}
......@@ -3973,7 +3983,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh)
set_bit(STRIPE_EXPAND_READY, &sh2->state);
set_bit(STRIPE_HANDLE, &sh2->state);
}
release_stripe(sh2);
raid5_release_stripe(sh2);
}
/* done submitting copies, wait for them to complete */
......@@ -4008,6 +4018,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
s->expanded = test_bit(STRIPE_EXPAND_READY, &sh->state) && !sh->batch_head;
s->failed_num[0] = -1;
s->failed_num[1] = -1;
s->log_failed = r5l_log_disk_error(conf);
/* Now to look around and see what can be done */
rcu_read_lock();
......@@ -4259,7 +4270,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
if (handle_flags == 0 ||
sh->state & handle_flags)
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
raid5_release_stripe(sh);
}
spin_lock_irq(&head_sh->stripe_lock);
head_sh->batch_head = NULL;
......@@ -4320,6 +4331,9 @@ static void handle_stripe(struct stripe_head *sh)
analyse_stripe(sh, &s);
if (test_bit(STRIPE_LOG_TRAPPED, &sh->state))
goto finish;
if (s.handle_bad_blocks) {
set_bit(STRIPE_HANDLE, &sh->state);
goto finish;
......@@ -4348,7 +4362,7 @@ static void handle_stripe(struct stripe_head *sh)
/* check if the array has lost more than max_degraded devices and,
* if so, some requests might need to be failed.
*/
if (s.failed > conf->max_degraded) {
if (s.failed > conf->max_degraded || s.log_failed) {
sh->check_state = 0;
sh->reconstruct_state = 0;
break_stripe_batch_list(sh, 0);
......@@ -4506,7 +4520,7 @@ static void handle_stripe(struct stripe_head *sh)
/* Finish reconstruct operations initiated by the expansion process */
if (sh->reconstruct_state == reconstruct_state_result) {
struct stripe_head *sh_src
= get_active_stripe(conf, sh->sector, 1, 1, 1);
= raid5_get_active_stripe(conf, sh->sector, 1, 1, 1);
if (sh_src && test_bit(STRIPE_EXPAND_SOURCE, &sh_src->state)) {
/* sh cannot be written until sh_src has been read.
* so arrange for sh to be delayed a little
......@@ -4516,11 +4530,11 @@ static void handle_stripe(struct stripe_head *sh)
if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
&sh_src->state))
atomic_inc(&conf->preread_active_stripes);
release_stripe(sh_src);
raid5_release_stripe(sh_src);
goto finish;
}
if (sh_src)
release_stripe(sh_src);
raid5_release_stripe(sh_src);
sh->reconstruct_state = reconstruct_state_idle;
clear_bit(STRIPE_EXPANDING, &sh->state);
......@@ -5012,7 +5026,7 @@ static void release_stripe_plug(struct mddev *mddev,
struct raid5_plug_cb *cb;
if (!blk_cb) {
release_stripe(sh);
raid5_release_stripe(sh);
return;
}
......@@ -5028,7 +5042,7 @@ static void release_stripe_plug(struct mddev *mddev,
if (!test_and_set_bit(STRIPE_ON_UNPLUG_LIST, &sh->state))
list_add_tail(&sh->lru, &cb->list);
else
release_stripe(sh);
raid5_release_stripe(sh);
}
static void make_discard_request(struct mddev *mddev, struct bio *bi)
......@@ -5063,12 +5077,12 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
DEFINE_WAIT(w);
int d;
again:
sh = get_active_stripe(conf, logical_sector, 0, 0, 0);
sh = raid5_get_active_stripe(conf, logical_sector, 0, 0, 0);
prepare_to_wait(&conf->wait_for_overlap, &w,
TASK_UNINTERRUPTIBLE);
set_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags);
if (test_bit(STRIPE_SYNCING, &sh->state)) {
release_stripe(sh);
raid5_release_stripe(sh);
schedule();
goto again;
}
......@@ -5080,7 +5094,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
if (sh->dev[d].towrite || sh->dev[d].toread) {
set_bit(R5_Overlap, &sh->dev[d].flags);
spin_unlock_irq(&sh->stripe_lock);
release_stripe(sh);
raid5_release_stripe(sh);
schedule();
goto again;
}
......@@ -5136,9 +5150,16 @@ static void make_request(struct mddev *mddev, struct bio * bi)
bool do_prepare;
if (unlikely(bi->bi_rw & REQ_FLUSH)) {
int ret = r5l_handle_flush_request(conf->log, bi);
if (ret == 0)
return;
if (ret == -ENODEV) {
md_flush_request(mddev, bi);
return;
}
/* ret == -EAGAIN, fallback */
}
md_write_start(mddev, bi);
......@@ -5210,7 +5231,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
(unsigned long long)new_sector,
(unsigned long long)logical_sector);
sh = get_active_stripe(conf, new_sector, previous,
sh = raid5_get_active_stripe(conf, new_sector, previous,
(bi->bi_rw&RWA_MASK), 0);
if (sh) {
if (unlikely(previous)) {
......@@ -5231,7 +5252,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
must_retry = 1;
spin_unlock_irq(&conf->device_lock);
if (must_retry) {
release_stripe(sh);
raid5_release_stripe(sh);
schedule();
do_prepare = true;
goto retry;
......@@ -5241,14 +5262,14 @@ static void make_request(struct mddev *mddev, struct bio * bi)
/* Might have got the wrong stripe_head
* by accident
*/
release_stripe(sh);
raid5_release_stripe(sh);
goto retry;
}
if (rw == WRITE &&
logical_sector >= mddev->suspend_lo &&
logical_sector < mddev->suspend_hi) {
release_stripe(sh);
raid5_release_stripe(sh);
/* As the suspend_* range is controlled by
* userspace, we want an interruptible
* wait.
......@@ -5271,7 +5292,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
* and wait a while
*/
md_wakeup_thread(mddev->thread);
release_stripe(sh);
raid5_release_stripe(sh);
schedule();
do_prepare = true;
goto retry;
......@@ -5458,7 +5479,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
int j;
int skipped_disk = 0;
sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
set_bit(STRIPE_EXPANDING, &sh->state);
atomic_inc(&conf->reshape_stripes);
/* If any of this stripe is beyond the end of the old
......@@ -5471,7 +5492,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
if (conf->level == 6 &&
j == sh->qd_idx)
continue;
s = compute_blocknr(sh, j, 0);
s = raid5_compute_blocknr(sh, j, 0);
if (s < raid5_size(mddev, 0, 0)) {
skipped_disk = 1;
continue;
......@@ -5507,10 +5528,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
if (last_sector >= mddev->dev_sectors)
last_sector = mddev->dev_sectors - 1;
while (first_sector <= last_sector) {
sh = get_active_stripe(conf, first_sector, 1, 0, 1);
sh = raid5_get_active_stripe(conf, first_sector, 1, 0, 1);
set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
raid5_release_stripe(sh);
first_sector += STRIPE_SECTORS;
}
/* Now that the sources are clearly marked, we can release
......@@ -5519,7 +5540,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
while (!list_empty(&stripes)) {
sh = list_entry(stripes.next, struct stripe_head, lru);
list_del_init(&sh->lru);
release_stripe(sh);
raid5_release_stripe(sh);
}
/* If this takes us to the resync_max point where we have to pause,
* then we need to write out the superblock.
......@@ -5615,11 +5636,11 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
}
bitmap_cond_end_sync(mddev->bitmap, sector_nr);
bitmap_cond_end_sync(mddev->bitmap, sector_nr, false);
sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
sh = raid5_get_active_stripe(conf, sector_nr, 0, 1, 0);
if (sh == NULL) {
sh = get_active_stripe(conf, sector_nr, 0, 0, 0);
sh = raid5_get_active_stripe(conf, sector_nr, 0, 0, 0);
/* make sure we don't swamp the stripe cache if someone else
* is trying to get access
*/
......@@ -5643,7 +5664,7 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int
set_bit(STRIPE_SYNC_REQUESTED, &sh->state);
set_bit(STRIPE_HANDLE, &sh->state);
release_stripe(sh);
raid5_release_stripe(sh);
return STRIPE_SECTORS;
}
......@@ -5682,7 +5703,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
/* already done this stripe */
continue;
sh = get_active_stripe(conf, sector, 0, 1, 1);
sh = raid5_get_active_stripe(conf, sector, 0, 1, 1);
if (!sh) {
/* failed to get a stripe - must wait */
......@@ -5692,7 +5713,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
}
if (!add_stripe_bio(sh, raid_bio, dd_idx, 0, 0)) {
release_stripe(sh);
raid5_release_stripe(sh);
raid5_set_bi_processed_stripes(raid_bio, scnt);
conf->retry_read_aligned = raid_bio;
return handled;
......@@ -5700,7 +5721,7 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
set_bit(R5_ReadNoMerge, &sh->dev[dd_idx].flags);
handle_stripe(sh);
release_stripe(sh);
raid5_release_stripe(sh);
handled++;
}
remaining = raid5_dec_bi_active_stripes(raid_bio);
......@@ -5730,8 +5751,12 @@ static int handle_active_stripes(struct r5conf *conf, int group,
for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++)
if (!list_empty(temp_inactive_list + i))
break;
if (i == NR_STRIPE_HASH_LOCKS)
if (i == NR_STRIPE_HASH_LOCKS) {
spin_unlock_irq(&conf->device_lock);
r5l_flush_stripe_to_raid(conf->log);
spin_lock_irq(&conf->device_lock);
return batch_size;
}
release_inactive = true;
}
spin_unlock_irq(&conf->device_lock);
......@@ -5739,6 +5764,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
release_inactive_stripe_list(conf, temp_inactive_list,
NR_STRIPE_HASH_LOCKS);
r5l_flush_stripe_to_raid(conf->log);
if (release_inactive) {
spin_lock_irq(&conf->device_lock);
return 0;
......@@ -5746,6 +5772,7 @@ static int handle_active_stripes(struct r5conf *conf, int group,
for (i = 0; i < batch_size; i++)
handle_stripe(batch[i]);
r5l_write_stripe_run(conf->log);
cond_resched();
......@@ -5879,6 +5906,8 @@ static void raid5d(struct md_thread *thread)
mutex_unlock(&conf->cache_size_mutex);
}
r5l_flush_stripe_to_raid(conf->log);
async_tx_issue_pending_all();
blk_finish_plug(&plug);
......@@ -6316,8 +6345,11 @@ static void raid5_free_percpu(struct r5conf *conf)
static void free_conf(struct r5conf *conf)
{
if (conf->log)
r5l_exit_log(conf->log);
if (conf->shrinker.seeks)
unregister_shrinker(&conf->shrinker);
free_thread_groups(conf);
shrink_stripes(conf);
raid5_free_percpu(conf);
......@@ -6530,7 +6562,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
rdev_for_each(rdev, mddev) {
raid_disk = rdev->raid_disk;
if (raid_disk >= max_disks
|| raid_disk < 0)
|| raid_disk < 0 || test_bit(Journal, &rdev->flags))
continue;
disk = conf->disks + raid_disk;
......@@ -6650,6 +6682,7 @@ static int run(struct mddev *mddev)
int working_disks = 0;
int dirty_parity_disks = 0;
struct md_rdev *rdev;
struct md_rdev *journal_dev = NULL;
sector_t reshape_offset = 0;
int i;
long long min_offset_diff = 0;
......@@ -6662,6 +6695,11 @@ static int run(struct mddev *mddev)
rdev_for_each(rdev, mddev) {
long long diff;
if (test_bit(Journal, &rdev->flags)) {
journal_dev = rdev;
continue;
}
if (rdev->raid_disk < 0)
continue;
diff = (rdev->new_data_offset - rdev->data_offset);
......@@ -6695,6 +6733,12 @@ static int run(struct mddev *mddev)
int chunk_sectors;
int new_data_disks;
if (journal_dev) {
printk(KERN_ERR "md/raid:%s: don't support reshape with journal - aborting.\n",
mdname(mddev));
return -EINVAL;
}
if (mddev->new_level != mddev->level) {
printk(KERN_ERR "md/raid:%s: unsupported reshape "
"required - aborting.\n",
......@@ -6770,6 +6814,13 @@ static int run(struct mddev *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !journal_dev) {
printk(KERN_ERR "md/raid:%s: journal disk is missing, force array readonly\n",
mdname(mddev));
mddev->ro = 1;
set_disk_ro(mddev->gendisk, 1);
}
conf->min_offset_diff = min_offset_diff;
mddev->thread = conf->thread;
conf->thread = NULL;
......@@ -6973,6 +7024,14 @@ static int run(struct mddev *mddev)
mddev->queue);
}
if (journal_dev) {
char b[BDEVNAME_SIZE];
printk(KERN_INFO"md/raid:%s: using device %s as journal\n",
mdname(mddev), bdevname(journal_dev->bdev, b));
r5l_init_log(conf, journal_dev);
}
return 0;
abort:
md_unregister_thread(&mddev->thread);
......@@ -7082,6 +7141,15 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
struct disk_info *p = conf->disks + number;
print_raid5_conf(conf);
if (test_bit(Journal, &rdev->flags)) {
/*
* journal disk is not removable, but we need give a chance to
* update superblock of other disks. Otherwise journal disk
* will be considered as 'fresh'
*/
set_bit(MD_CHANGE_DEVS, &mddev->flags);
return -EINVAL;
}
if (rdev == p->rdev)
rdevp = &p->rdev;
else if (rdev == p->replacement)
......@@ -7144,6 +7212,8 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
int first = 0;
int last = conf->raid_disks - 1;
if (test_bit(Journal, &rdev->flags))
return -EINVAL;
if (mddev->recovery_disabled == conf->recovery_disabled)
return -EBUSY;
......@@ -7205,6 +7275,8 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
sector_t newsize;
struct r5conf *conf = mddev->private;
if (conf->log)
return -EINVAL;
sectors &= ~((sector_t)conf->chunk_sectors - 1);
newsize = raid5_size(mddev, sectors, mddev->raid_disks);
if (mddev->external_size &&
......@@ -7256,6 +7328,8 @@ static int check_reshape(struct mddev *mddev)
{
struct r5conf *conf = mddev->private;
if (conf->log)
return -EINVAL;
if (mddev->delta_disks == 0 &&
mddev->new_layout == mddev->layout &&
mddev->new_chunk_sectors == mddev->chunk_sectors)
......@@ -7532,6 +7606,7 @@ static void raid5_quiesce(struct mddev *mddev, int state)
unlock_all_device_hash_locks_irq(conf);
break;
}
r5l_quiesce(conf->log, state);
}
static void *raid45_takeover_raid0(struct mddev *mddev, int level)
......
......@@ -223,6 +223,9 @@ struct stripe_head {
struct stripe_head *batch_head; /* protected by stripe lock */
spinlock_t batch_lock; /* only header's lock is useful */
struct list_head batch_list; /* protected by head's batch lock*/
struct r5l_io_unit *log_io;
struct list_head log_list;
/**
* struct stripe_operations
* @target - STRIPE_OP_COMPUTE_BLK target
......@@ -244,6 +247,7 @@ struct stripe_head {
struct bio *toread, *read, *towrite, *written;
sector_t sector; /* sector of this page */
unsigned long flags;
u32 log_checksum;
} dev[1]; /* allocated with extra space depending of RAID geometry */
};
......@@ -268,6 +272,7 @@ struct stripe_head_state {
struct bio_list return_bi;
struct md_rdev *blocked_rdev;
int handle_bad_blocks;
int log_failed;
};
/* Flags for struct r5dev.flags */
......@@ -340,6 +345,7 @@ enum {
STRIPE_BITMAP_PENDING, /* Being added to bitmap, don't add
* to batch yet.
*/
STRIPE_LOG_TRAPPED, /* trapped into log */
};
#define STRIPE_EXPAND_SYNC_FLAGS \
......@@ -543,6 +549,7 @@ struct r5conf {
struct r5worker_group *worker_groups;
int group_cnt;
int worker_cnt_per_group;
struct r5l_log *log;
};
......@@ -609,4 +616,21 @@ static inline int algorithm_is_DDF(int layout)
extern void md_raid5_kick_device(struct r5conf *conf);
extern int raid5_set_cache_size(struct mddev *mddev, int size);
extern sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous);
extern void raid5_release_stripe(struct stripe_head *sh);
extern sector_t raid5_compute_sector(struct r5conf *conf, sector_t r_sector,
int previous, int *dd_idx,
struct stripe_head *sh);
extern struct stripe_head *
raid5_get_active_stripe(struct r5conf *conf, sector_t sector,
int previous, int noblock, int noquiesce);
extern int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev);
extern void r5l_exit_log(struct r5l_log *log);
extern int r5l_write_stripe(struct r5l_log *log, struct stripe_head *head_sh);
extern void r5l_write_stripe_run(struct r5l_log *log);
extern void r5l_flush_stripe_to_raid(struct r5l_log *log);
extern void r5l_stripe_write_finished(struct stripe_head *sh);
extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
extern void r5l_quiesce(struct r5l_log *log, int state);
extern bool r5l_log_disk_error(struct r5conf *conf);
#endif
......@@ -89,6 +89,12 @@
* read requests will only be sent here in
* dire need
*/
#define MD_DISK_JOURNAL 18 /* disk is used as the write journal in RAID-5/6 */
#define MD_DISK_ROLE_SPARE 0xffff
#define MD_DISK_ROLE_FAULTY 0xfffe
#define MD_DISK_ROLE_JOURNAL 0xfffd
#define MD_DISK_ROLE_MAX 0xff00 /* max value of regular disk role */
typedef struct mdp_device_descriptor_s {
__u32 number; /* 0 Device number in the entire set */
......@@ -252,7 +258,10 @@ struct mdp_superblock_1 {
__le64 data_offset; /* sector start of data, often 0 */
__le64 data_size; /* sectors in this device that can be used for data */
__le64 super_offset; /* sector start of this superblock */
union {
__le64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */
__le64 journal_tail;/* journal tail of journal device (from data_offset) */
};
__le32 dev_number; /* permanent identifier of this device - not role in raid */
__le32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */
__u8 device_uuid[16]; /* user-space setable, ignored by kernel */
......@@ -302,6 +311,8 @@ struct mdp_superblock_1 {
#define MD_FEATURE_RECOVERY_BITMAP 128 /* recovery that is happening
* is guided by bitmap.
*/
#define MD_FEATURE_CLUSTERED 256 /* clustered MD */
#define MD_FEATURE_JOURNAL 512 /* support write cache */
#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
|MD_FEATURE_RECOVERY_OFFSET \
|MD_FEATURE_RESHAPE_ACTIVE \
......@@ -310,6 +321,66 @@ struct mdp_superblock_1 {
|MD_FEATURE_RESHAPE_BACKWARDS \
|MD_FEATURE_NEW_OFFSET \
|MD_FEATURE_RECOVERY_BITMAP \
|MD_FEATURE_CLUSTERED \
|MD_FEATURE_JOURNAL \
)
struct r5l_payload_header {
__le16 type;
__le16 flags;
} __attribute__ ((__packed__));
enum r5l_payload_type {
R5LOG_PAYLOAD_DATA = 0,
R5LOG_PAYLOAD_PARITY = 1,
R5LOG_PAYLOAD_FLUSH = 2,
};
struct r5l_payload_data_parity {
struct r5l_payload_header header;
__le32 size; /* sector. data/parity size. each 4k
* has a checksum */
__le64 location; /* sector. For data, it's raid sector. For
* parity, it's stripe sector */
__le32 checksum[];
} __attribute__ ((__packed__));
enum r5l_payload_data_parity_flag {
R5LOG_PAYLOAD_FLAG_DISCARD = 1, /* payload is discard */
/*
* RESHAPED/RESHAPING is only set when there is reshape activity. Note,
* both data/parity of a stripe should have the same flag set
*
* RESHAPED: reshape is running, and this stripe finished reshape
* RESHAPING: reshape is running, and this stripe isn't reshaped
*/
R5LOG_PAYLOAD_FLAG_RESHAPED = 2,
R5LOG_PAYLOAD_FLAG_RESHAPING = 3,
};
struct r5l_payload_flush {
struct r5l_payload_header header;
__le32 size; /* flush_stripes size, bytes */
__le64 flush_stripes[];
} __attribute__ ((__packed__));
enum r5l_payload_flush_flag {
R5LOG_PAYLOAD_FLAG_FLUSH_STRIPE = 1, /* data represents whole stripe */
};
struct r5l_meta_block {
__le32 magic;
__le32 checksum;
__u8 version;
__u8 __zero_pading_1;
__le16 __zero_pading_2;
__le32 meta_size; /* whole size of the block */
__le64 seq;
__le64 position; /* sector, start from rdev->data_offset, current position */
struct r5l_payload_header payloads[];
} __attribute__ ((__packed__));
#define R5LOG_VERSION 0x1
#define R5LOG_MAGIC 0x6433c509
#endif
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment